ring-native 0.0.0 → 0.1.0

Files changed (267)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/CHANGES.md +7 -0
  4. data/Makefile +5 -0
  5. data/README.md +12 -5
  6. data/Rakefile +4 -0
  7. data/ext/ring/extconf.rb +4 -5
  8. data/lib/ring/native.rb +3 -1
  9. data/lib/ring/native/version.rb +5 -1
  10. data/ring-native.gemspec +6 -6
  11. data/vendor/ring-ffi/Cargo.lock +26 -0
  12. data/vendor/ring-ffi/Cargo.toml +45 -0
  13. data/vendor/ring-ffi/LICENSE +16 -0
  14. data/vendor/ring-ffi/README.md +59 -0
  15. data/vendor/ring-ffi/src/lib.rs +79 -0
  16. metadata +10 -255
  17. data/vendor/ring/BUILDING.md +0 -40
  18. data/vendor/ring/Cargo.toml +0 -43
  19. data/vendor/ring/LICENSE +0 -185
  20. data/vendor/ring/Makefile +0 -35
  21. data/vendor/ring/PORTING.md +0 -163
  22. data/vendor/ring/README.md +0 -113
  23. data/vendor/ring/STYLE.md +0 -197
  24. data/vendor/ring/appveyor.yml +0 -27
  25. data/vendor/ring/build.rs +0 -108
  26. data/vendor/ring/crypto/aes/aes.c +0 -1142
  27. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +0 -25
  28. data/vendor/ring/crypto/aes/aes_test.cc +0 -93
  29. data/vendor/ring/crypto/aes/asm/aes-586.pl +0 -2368
  30. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +0 -1249
  31. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +0 -2246
  32. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +0 -1318
  33. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +0 -2084
  34. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +0 -675
  35. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +0 -1364
  36. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +0 -1565
  37. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +0 -841
  38. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +0 -1116
  39. data/vendor/ring/crypto/aes/internal.h +0 -87
  40. data/vendor/ring/crypto/aes/mode_wrappers.c +0 -61
  41. data/vendor/ring/crypto/bn/add.c +0 -394
  42. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +0 -694
  43. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +0 -1503
  44. data/vendor/ring/crypto/bn/asm/bn-586.pl +0 -774
  45. data/vendor/ring/crypto/bn/asm/co-586.pl +0 -287
  46. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +0 -1882
  47. data/vendor/ring/crypto/bn/asm/x86-mont.pl +0 -592
  48. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +0 -599
  49. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +0 -1393
  50. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +0 -3507
  51. data/vendor/ring/crypto/bn/bn.c +0 -352
  52. data/vendor/ring/crypto/bn/bn_asn1.c +0 -74
  53. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +0 -25
  54. data/vendor/ring/crypto/bn/bn_test.cc +0 -1696
  55. data/vendor/ring/crypto/bn/cmp.c +0 -200
  56. data/vendor/ring/crypto/bn/convert.c +0 -433
  57. data/vendor/ring/crypto/bn/ctx.c +0 -311
  58. data/vendor/ring/crypto/bn/div.c +0 -594
  59. data/vendor/ring/crypto/bn/exponentiation.c +0 -1335
  60. data/vendor/ring/crypto/bn/gcd.c +0 -711
  61. data/vendor/ring/crypto/bn/generic.c +0 -1019
  62. data/vendor/ring/crypto/bn/internal.h +0 -316
  63. data/vendor/ring/crypto/bn/montgomery.c +0 -516
  64. data/vendor/ring/crypto/bn/mul.c +0 -888
  65. data/vendor/ring/crypto/bn/prime.c +0 -829
  66. data/vendor/ring/crypto/bn/random.c +0 -334
  67. data/vendor/ring/crypto/bn/rsaz_exp.c +0 -262
  68. data/vendor/ring/crypto/bn/rsaz_exp.h +0 -53
  69. data/vendor/ring/crypto/bn/shift.c +0 -276
  70. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +0 -25
  71. data/vendor/ring/crypto/bytestring/bytestring_test.cc +0 -421
  72. data/vendor/ring/crypto/bytestring/cbb.c +0 -399
  73. data/vendor/ring/crypto/bytestring/cbs.c +0 -227
  74. data/vendor/ring/crypto/bytestring/internal.h +0 -46
  75. data/vendor/ring/crypto/chacha/chacha_generic.c +0 -140
  76. data/vendor/ring/crypto/chacha/chacha_vec.c +0 -323
  77. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +0 -1447
  78. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +0 -153
  79. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +0 -25
  80. data/vendor/ring/crypto/cipher/e_aes.c +0 -390
  81. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +0 -208
  82. data/vendor/ring/crypto/cipher/internal.h +0 -173
  83. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +0 -543
  84. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +0 -9
  85. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +0 -475
  86. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +0 -23
  87. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +0 -422
  88. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +0 -484
  89. data/vendor/ring/crypto/cipher/test/cipher_test.txt +0 -100
  90. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +0 -25
  91. data/vendor/ring/crypto/constant_time_test.c +0 -304
  92. data/vendor/ring/crypto/cpu-arm-asm.S +0 -32
  93. data/vendor/ring/crypto/cpu-arm.c +0 -199
  94. data/vendor/ring/crypto/cpu-intel.c +0 -261
  95. data/vendor/ring/crypto/crypto.c +0 -151
  96. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +0 -2118
  97. data/vendor/ring/crypto/curve25519/curve25519.c +0 -4888
  98. data/vendor/ring/crypto/curve25519/x25519_test.cc +0 -128
  99. data/vendor/ring/crypto/digest/md32_common.h +0 -181
  100. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +0 -2725
  101. data/vendor/ring/crypto/ec/ec.c +0 -193
  102. data/vendor/ring/crypto/ec/ec_curves.c +0 -61
  103. data/vendor/ring/crypto/ec/ec_key.c +0 -228
  104. data/vendor/ring/crypto/ec/ec_montgomery.c +0 -114
  105. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +0 -25
  106. data/vendor/ring/crypto/ec/internal.h +0 -243
  107. data/vendor/ring/crypto/ec/oct.c +0 -253
  108. data/vendor/ring/crypto/ec/p256-64.c +0 -1794
  109. data/vendor/ring/crypto/ec/p256-x86_64-table.h +0 -9548
  110. data/vendor/ring/crypto/ec/p256-x86_64.c +0 -509
  111. data/vendor/ring/crypto/ec/simple.c +0 -1007
  112. data/vendor/ring/crypto/ec/util-64.c +0 -183
  113. data/vendor/ring/crypto/ec/wnaf.c +0 -508
  114. data/vendor/ring/crypto/ecdh/ecdh.c +0 -155
  115. data/vendor/ring/crypto/ecdsa/ecdsa.c +0 -304
  116. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +0 -193
  117. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +0 -25
  118. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +0 -327
  119. data/vendor/ring/crypto/header_removed.h +0 -17
  120. data/vendor/ring/crypto/internal.h +0 -495
  121. data/vendor/ring/crypto/libring.Windows.vcxproj +0 -101
  122. data/vendor/ring/crypto/mem.c +0 -98
  123. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +0 -1045
  124. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +0 -517
  125. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +0 -1393
  126. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +0 -1741
  127. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +0 -422
  128. data/vendor/ring/crypto/modes/ctr.c +0 -226
  129. data/vendor/ring/crypto/modes/gcm.c +0 -1206
  130. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +0 -25
  131. data/vendor/ring/crypto/modes/gcm_test.c +0 -348
  132. data/vendor/ring/crypto/modes/internal.h +0 -299
  133. data/vendor/ring/crypto/perlasm/arm-xlate.pl +0 -170
  134. data/vendor/ring/crypto/perlasm/readme +0 -100
  135. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +0 -1164
  136. data/vendor/ring/crypto/perlasm/x86asm.pl +0 -292
  137. data/vendor/ring/crypto/perlasm/x86gas.pl +0 -263
  138. data/vendor/ring/crypto/perlasm/x86masm.pl +0 -200
  139. data/vendor/ring/crypto/perlasm/x86nasm.pl +0 -187
  140. data/vendor/ring/crypto/poly1305/poly1305.c +0 -331
  141. data/vendor/ring/crypto/poly1305/poly1305_arm.c +0 -301
  142. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +0 -2015
  143. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +0 -25
  144. data/vendor/ring/crypto/poly1305/poly1305_test.cc +0 -80
  145. data/vendor/ring/crypto/poly1305/poly1305_test.txt +0 -52
  146. data/vendor/ring/crypto/poly1305/poly1305_vec.c +0 -892
  147. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +0 -75
  148. data/vendor/ring/crypto/rand/internal.h +0 -32
  149. data/vendor/ring/crypto/rand/rand.c +0 -189
  150. data/vendor/ring/crypto/rand/urandom.c +0 -219
  151. data/vendor/ring/crypto/rand/windows.c +0 -56
  152. data/vendor/ring/crypto/refcount_c11.c +0 -66
  153. data/vendor/ring/crypto/refcount_lock.c +0 -53
  154. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +0 -25
  155. data/vendor/ring/crypto/refcount_test.c +0 -58
  156. data/vendor/ring/crypto/rsa/blinding.c +0 -462
  157. data/vendor/ring/crypto/rsa/internal.h +0 -108
  158. data/vendor/ring/crypto/rsa/padding.c +0 -300
  159. data/vendor/ring/crypto/rsa/rsa.c +0 -450
  160. data/vendor/ring/crypto/rsa/rsa_asn1.c +0 -261
  161. data/vendor/ring/crypto/rsa/rsa_impl.c +0 -944
  162. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +0 -25
  163. data/vendor/ring/crypto/rsa/rsa_test.cc +0 -437
  164. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +0 -436
  165. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +0 -2390
  166. data/vendor/ring/crypto/sha/asm/sha256-586.pl +0 -1275
  167. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +0 -735
  168. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +0 -14
  169. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +0 -14
  170. data/vendor/ring/crypto/sha/asm/sha512-586.pl +0 -911
  171. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +0 -666
  172. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +0 -14
  173. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +0 -14
  174. data/vendor/ring/crypto/sha/sha1.c +0 -271
  175. data/vendor/ring/crypto/sha/sha256.c +0 -204
  176. data/vendor/ring/crypto/sha/sha512.c +0 -355
  177. data/vendor/ring/crypto/test/file_test.cc +0 -326
  178. data/vendor/ring/crypto/test/file_test.h +0 -181
  179. data/vendor/ring/crypto/test/malloc.cc +0 -150
  180. data/vendor/ring/crypto/test/scoped_types.h +0 -95
  181. data/vendor/ring/crypto/test/test.Windows.vcxproj +0 -35
  182. data/vendor/ring/crypto/test/test_util.cc +0 -46
  183. data/vendor/ring/crypto/test/test_util.h +0 -41
  184. data/vendor/ring/crypto/thread_none.c +0 -55
  185. data/vendor/ring/crypto/thread_pthread.c +0 -165
  186. data/vendor/ring/crypto/thread_test.Windows.vcxproj +0 -25
  187. data/vendor/ring/crypto/thread_test.c +0 -200
  188. data/vendor/ring/crypto/thread_win.c +0 -282
  189. data/vendor/ring/examples/checkdigest.rs +0 -103
  190. data/vendor/ring/include/openssl/aes.h +0 -121
  191. data/vendor/ring/include/openssl/arm_arch.h +0 -129
  192. data/vendor/ring/include/openssl/base.h +0 -156
  193. data/vendor/ring/include/openssl/bn.h +0 -794
  194. data/vendor/ring/include/openssl/buffer.h +0 -18
  195. data/vendor/ring/include/openssl/bytestring.h +0 -235
  196. data/vendor/ring/include/openssl/chacha.h +0 -37
  197. data/vendor/ring/include/openssl/cmac.h +0 -76
  198. data/vendor/ring/include/openssl/cpu.h +0 -184
  199. data/vendor/ring/include/openssl/crypto.h +0 -43
  200. data/vendor/ring/include/openssl/curve25519.h +0 -88
  201. data/vendor/ring/include/openssl/ec.h +0 -225
  202. data/vendor/ring/include/openssl/ec_key.h +0 -129
  203. data/vendor/ring/include/openssl/ecdh.h +0 -110
  204. data/vendor/ring/include/openssl/ecdsa.h +0 -156
  205. data/vendor/ring/include/openssl/err.h +0 -201
  206. data/vendor/ring/include/openssl/mem.h +0 -101
  207. data/vendor/ring/include/openssl/obj_mac.h +0 -71
  208. data/vendor/ring/include/openssl/opensslfeatures.h +0 -68
  209. data/vendor/ring/include/openssl/opensslv.h +0 -18
  210. data/vendor/ring/include/openssl/ossl_typ.h +0 -18
  211. data/vendor/ring/include/openssl/poly1305.h +0 -51
  212. data/vendor/ring/include/openssl/rand.h +0 -70
  213. data/vendor/ring/include/openssl/rsa.h +0 -399
  214. data/vendor/ring/include/openssl/thread.h +0 -133
  215. data/vendor/ring/include/openssl/type_check.h +0 -71
  216. data/vendor/ring/mk/Common.props +0 -63
  217. data/vendor/ring/mk/Windows.props +0 -42
  218. data/vendor/ring/mk/WindowsTest.props +0 -18
  219. data/vendor/ring/mk/appveyor.bat +0 -62
  220. data/vendor/ring/mk/bottom_of_makefile.mk +0 -54
  221. data/vendor/ring/mk/ring.mk +0 -266
  222. data/vendor/ring/mk/top_of_makefile.mk +0 -214
  223. data/vendor/ring/mk/travis.sh +0 -40
  224. data/vendor/ring/mk/update-travis-yml.py +0 -229
  225. data/vendor/ring/ring.sln +0 -153
  226. data/vendor/ring/src/aead.rs +0 -682
  227. data/vendor/ring/src/agreement.rs +0 -248
  228. data/vendor/ring/src/c.rs +0 -129
  229. data/vendor/ring/src/constant_time.rs +0 -37
  230. data/vendor/ring/src/der.rs +0 -96
  231. data/vendor/ring/src/digest.rs +0 -690
  232. data/vendor/ring/src/digest_tests.txt +0 -57
  233. data/vendor/ring/src/ecc.rs +0 -28
  234. data/vendor/ring/src/ecc_build.rs +0 -279
  235. data/vendor/ring/src/ecc_curves.rs +0 -117
  236. data/vendor/ring/src/ed25519_tests.txt +0 -2579
  237. data/vendor/ring/src/exe_tests.rs +0 -46
  238. data/vendor/ring/src/ffi.rs +0 -29
  239. data/vendor/ring/src/file_test.rs +0 -187
  240. data/vendor/ring/src/hkdf.rs +0 -153
  241. data/vendor/ring/src/hkdf_tests.txt +0 -59
  242. data/vendor/ring/src/hmac.rs +0 -414
  243. data/vendor/ring/src/hmac_tests.txt +0 -97
  244. data/vendor/ring/src/input.rs +0 -312
  245. data/vendor/ring/src/lib.rs +0 -41
  246. data/vendor/ring/src/pbkdf2.rs +0 -265
  247. data/vendor/ring/src/pbkdf2_tests.txt +0 -113
  248. data/vendor/ring/src/polyfill.rs +0 -57
  249. data/vendor/ring/src/rand.rs +0 -28
  250. data/vendor/ring/src/signature.rs +0 -314
  251. data/vendor/ring/third-party/NIST/README.md +0 -9
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +0 -263
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +0 -309
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +0 -267
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +0 -263
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +0 -309
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +0 -267
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +0 -263
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +0 -309
  260. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +0 -267
  261. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +0 -519
  262. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +0 -309
  263. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +0 -523
  264. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +0 -519
  265. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +0 -309
  266. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +0 -523
  267. data/vendor/ring/third-party/NIST/sha256sums.txt +0 -1
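The bulk of this release is the removal of the vendored ring source tree (everything under data/vendor/ring/), replaced by the small ring-ffi crate added in items 11–15. As a rough orientation only, the sketch below shows the kind of C-ABI shim such a crate typically exposes; the function name `ring_ffi_sha256` and its signature are hypothetical illustrations, not the actual contents of data/vendor/ring-ffi/src/lib.rs (only `ring::digest::digest` is a real ring API).

```rust
// Hypothetical sketch of a C-ABI shim crate over ring, in the spirit
// of vendor/ring-ffi; the real src/lib.rs may differ.
use ring::digest;

/// Write the SHA-256 digest of `input[..len]` into `out` (32 bytes).
/// Name and signature are illustrative, not taken from the gem.
#[no_mangle]
pub extern "C" fn ring_ffi_sha256(input: *const u8, len: usize, out: *mut u8) {
    // Safety assumption: the caller passes a valid `len`-byte input
    // buffer and a 32-byte output buffer, as usual for C-ABI shims.
    let data = unsafe { std::slice::from_raw_parts(input, len) };
    let d = digest::digest(&digest::SHA256, data);
    let out = unsafe { std::slice::from_raw_parts_mut(out, d.as_ref().len()) };
    out.copy_from_slice(d.as_ref());
}
```

The largest single deletion, the PCLMULQDQ/AVX GHASH generator, is shown below.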
data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl
@@ -1,1741 +0,0 @@
- #!/usr/bin/env perl
- #
- # ====================================================================
- # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
- # project. The module is, however, dual licensed under OpenSSL and
- # CRYPTOGAMS licenses depending on where you obtain it. For further
- # details see http://www.openssl.org/~appro/cryptogams/.
- # ====================================================================
- #
- # March, June 2010
- #
- # The module implements "4-bit" GCM GHASH function and underlying
- # single multiplication operation in GF(2^128). "4-bit" means that
- # it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
- # function features so called "528B" variant utilizing additional
- # 256+16 bytes of per-key storage [+512 bytes shared table].
- # Performance results are for this streamed GHASH subroutine and are
- # expressed in cycles per processed byte, less is better:
- #
- # gcc 3.4.x(*) assembler
- #
- # P4 28.6 14.0 +100%
- # Opteron 19.3 7.7 +150%
- # Core2 17.8 8.1(**) +120%
- # Atom 31.6 16.8 +88%
- # VIA Nano 21.8 10.1 +115%
- #
- # (*) comparison is not completely fair, because C results are
- # for vanilla "256B" implementation, while assembler results
- # are for "528B";-)
- # (**) it's mystery [to me] why Core2 result is not same as for
- # Opteron;
-
- # May 2010
- #
- # Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
- # See ghash-x86.pl for background information and details about coding
- # techniques.
- #
- # Special thanks to David Woodhouse <dwmw2@infradead.org> for
- # providing access to a Westmere-based system on behalf of Intel
- # Open Source Technology Centre.
-
- # December 2012
- #
- # Overhaul: aggregate Karatsuba post-processing, improve ILP in
- # reduction_alg9, increase reduction aggregate factor to 4x. As for
- # the latter. ghash-x86.pl discusses that it makes lesser sense to
- # increase aggregate factor. Then why increase here? Critical path
- # consists of 3 independent pclmulqdq instructions, Karatsuba post-
- # processing and reduction. "On top" of this we lay down aggregated
- # multiplication operations, triplets of independent pclmulqdq's. As
- # issue rate for pclmulqdq is limited, it makes lesser sense to
- # aggregate more multiplications than it takes to perform remaining
- # non-multiplication operations. 2x is near-optimal coefficient for
- # contemporary Intel CPUs (therefore modest improvement coefficient),
- # but not for Bulldozer. Latter is because logical SIMD operations
- # are twice as slow in comparison to Intel, so that critical path is
- # longer. A CPU with higher pclmulqdq issue rate would also benefit
- # from higher aggregate factor...
- #
- # Westmere 1.78(+13%)
- # Sandy Bridge 1.80(+8%)
- # Ivy Bridge 1.80(+7%)
- # Haswell 0.55(+93%) (if system doesn't support AVX)
- # Broadwell 0.45(+110%)(if system doesn't support AVX)
- # Bulldozer 1.49(+27%)
- # Silvermont 2.88(+13%)
-
- # March 2013
- #
- # ... 8x aggregate factor AVX code path is using reduction algorithm
- # suggested by Shay Gueron[1]. Even though contemporary AVX-capable
- # CPUs such as Sandy and Ivy Bridge can execute it, the code performs
- # sub-optimally in comparison to above mentioned version. But thanks
- # to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
- # it performs in 0.41 cycles per byte on Haswell processor, and in
- # 0.29 on Broadwell.
- #
- # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
-
- $flavour = shift;
- $output = shift;
- if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-
- $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
-
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
- die "can't locate x86_64-xlate.pl";
-
- # In upstream, this is controlled by shelling out to the compiler to check
- # versions, but BoringSSL is intended to be used with pre-generated perlasm
- # output, so this isn't useful anyway.
- #
- # TODO(davidben): Enable this after testing. $avx goes up to 2.
- $avx = 0;
-
- open OUT,"| \"$^X\" $xlate $flavour $output";
- *STDOUT=*OUT;
-
- $do4xaggr=1;
-
- # common register layout
- $nlo="%rax";
- $nhi="%rbx";
- $Zlo="%r8";
- $Zhi="%r9";
- $tmp="%r10";
- $rem_4bit = "%r11";
-
- $Xi="%rdi";
- $Htbl="%rsi";
-
- # per-function register layout
- $cnt="%rcx";
- $rem="%rdx";
-
- sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or
- $r =~ s/%[er]([sd]i)/%\1l/ or
- $r =~ s/%[er](bp)/%\1l/ or
- $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
-
- sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
- { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
- my $arg = pop;
- $arg = "\$$arg" if ($arg*1 eq $arg);
- $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
- }
-
- { my $N;
- sub loop() {
- my $inp = shift;
-
- $N++;
- $code.=<<___;
- xor $nlo,$nlo
- xor $nhi,$nhi
- mov `&LB("$Zlo")`,`&LB("$nlo")`
- mov `&LB("$Zlo")`,`&LB("$nhi")`
- shl \$4,`&LB("$nlo")`
- mov \$14,$cnt
- mov 8($Htbl,$nlo),$Zlo
- mov ($Htbl,$nlo),$Zhi
- and \$0xf0,`&LB("$nhi")`
- mov $Zlo,$rem
- jmp .Loop$N
-
- .align 16
- .Loop$N:
- shr \$4,$Zlo
- and \$0xf,$rem
- mov $Zhi,$tmp
- mov ($inp,$cnt),`&LB("$nlo")`
- shr \$4,$Zhi
- xor 8($Htbl,$nhi),$Zlo
- shl \$60,$tmp
- xor ($Htbl,$nhi),$Zhi
- mov `&LB("$nlo")`,`&LB("$nhi")`
- xor ($rem_4bit,$rem,8),$Zhi
- mov $Zlo,$rem
- shl \$4,`&LB("$nlo")`
- xor $tmp,$Zlo
- dec $cnt
- js .Lbreak$N
-
- shr \$4,$Zlo
- and \$0xf,$rem
- mov $Zhi,$tmp
- shr \$4,$Zhi
- xor 8($Htbl,$nlo),$Zlo
- shl \$60,$tmp
- xor ($Htbl,$nlo),$Zhi
- and \$0xf0,`&LB("$nhi")`
- xor ($rem_4bit,$rem,8),$Zhi
- mov $Zlo,$rem
- xor $tmp,$Zlo
- jmp .Loop$N
-
- .align 16
- .Lbreak$N:
- shr \$4,$Zlo
- and \$0xf,$rem
- mov $Zhi,$tmp
- shr \$4,$Zhi
- xor 8($Htbl,$nlo),$Zlo
- shl \$60,$tmp
- xor ($Htbl,$nlo),$Zhi
- and \$0xf0,`&LB("$nhi")`
- xor ($rem_4bit,$rem,8),$Zhi
- mov $Zlo,$rem
- xor $tmp,$Zlo
-
- shr \$4,$Zlo
- and \$0xf,$rem
- mov $Zhi,$tmp
- shr \$4,$Zhi
- xor 8($Htbl,$nhi),$Zlo
- shl \$60,$tmp
- xor ($Htbl,$nhi),$Zhi
- xor $tmp,$Zlo
- xor ($rem_4bit,$rem,8),$Zhi
-
- bswap $Zlo
- bswap $Zhi
- ___
- }}
-
- $code=<<___;
- .text
- .extern OPENSSL_ia32cap_P
-
- .globl gcm_gmult_4bit
- .type gcm_gmult_4bit,\@function,2
- .align 16
- gcm_gmult_4bit:
- push %rbx
- push %rbp # %rbp and %r12 are pushed exclusively in
- push %r12 # order to reuse Win64 exception handler...
- .Lgmult_prologue:
-
- movzb 15($Xi),$Zlo
- lea .Lrem_4bit(%rip),$rem_4bit
- ___
- &loop ($Xi);
- $code.=<<___;
- mov $Zlo,8($Xi)
- mov $Zhi,($Xi)
-
- mov 16(%rsp),%rbx
- lea 24(%rsp),%rsp
- .Lgmult_epilogue:
- ret
- .size gcm_gmult_4bit,.-gcm_gmult_4bit
- ___
-
- # per-function register layout
- $inp="%rdx";
- $len="%rcx";
- $rem_8bit=$rem_4bit;
-
- $code.=<<___;
- .globl gcm_ghash_4bit
- .type gcm_ghash_4bit,\@function,4
- .align 16
- gcm_ghash_4bit:
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- sub \$280,%rsp
- .Lghash_prologue:
- mov $inp,%r14 # reassign couple of args
- mov $len,%r15
- ___
- { my $inp="%r14";
- my $dat="%edx";
- my $len="%r15";
- my @nhi=("%ebx","%ecx");
- my @rem=("%r12","%r13");
- my $Hshr4="%rbp";
-
- &sub ($Htbl,-128); # size optimization
- &lea ($Hshr4,"16+128(%rsp)");
- { my @lo =($nlo,$nhi);
- my @hi =($Zlo,$Zhi);
-
- &xor ($dat,$dat);
- for ($i=0,$j=-2;$i<18;$i++,$j++) {
- &mov ("$j(%rsp)",&LB($dat)) if ($i>1);
- &or ($lo[0],$tmp) if ($i>1);
- &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17);
- &shr ($lo[1],4) if ($i>0 && $i<17);
- &mov ($tmp,$hi[1]) if ($i>0 && $i<17);
- &shr ($hi[1],4) if ($i>0 && $i<17);
- &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1);
- &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16);
- &shl (&LB($dat),4) if ($i>0 && $i<17);
- &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1);
- &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16);
- &shl ($tmp,60) if ($i>0 && $i<17);
-
- push (@lo,shift(@lo));
- push (@hi,shift(@hi));
- }
- }
- &add ($Htbl,-128);
- &mov ($Zlo,"8($Xi)");
- &mov ($Zhi,"0($Xi)");
- &add ($len,$inp); # pointer to the end of data
- &lea ($rem_8bit,".Lrem_8bit(%rip)");
- &jmp (".Louter_loop");
-
- $code.=".align 16\n.Louter_loop:\n";
- &xor ($Zhi,"($inp)");
- &mov ("%rdx","8($inp)");
- &lea ($inp,"16($inp)");
- &xor ("%rdx",$Zlo);
- &mov ("($Xi)",$Zhi);
- &mov ("8($Xi)","%rdx");
- &shr ("%rdx",32);
-
- &xor ($nlo,$nlo);
- &rol ($dat,8);
- &mov (&LB($nlo),&LB($dat));
- &movz ($nhi[0],&LB($dat));
- &shl (&LB($nlo),4);
- &shr ($nhi[0],4);
-
- for ($j=11,$i=0;$i<15;$i++) {
- &rol ($dat,8);
- &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0);
- &xor ($Zhi,"($Htbl,$nlo)") if ($i>0);
- &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0);
- &mov ($Zhi,"($Htbl,$nlo)") if ($i==0);
-
- &mov (&LB($nlo),&LB($dat));
- &xor ($Zlo,$tmp) if ($i>0);
- &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0);
-
- &movz ($nhi[1],&LB($dat));
- &shl (&LB($nlo),4);
- &movzb ($rem[0],"(%rsp,$nhi[0])");
-
- &shr ($nhi[1],4) if ($i<14);
- &and ($nhi[1],0xf0) if ($i==14);
- &shl ($rem[1],48) if ($i>0);
- &xor ($rem[0],$Zlo);
-
- &mov ($tmp,$Zhi);
- &xor ($Zhi,$rem[1]) if ($i>0);
- &shr ($Zlo,8);
-
- &movz ($rem[0],&LB($rem[0]));
- &mov ($dat,"$j($Xi)") if (--$j%4==0);
- &shr ($Zhi,8);
-
- &xor ($Zlo,"-128($Hshr4,$nhi[0],8)");
- &shl ($tmp,56);
- &xor ($Zhi,"($Hshr4,$nhi[0],8)");
-
- unshift (@nhi,pop(@nhi)); # "rotate" registers
- unshift (@rem,pop(@rem));
- }
- &movzw ($rem[1],"($rem_8bit,$rem[1],2)");
- &xor ($Zlo,"8($Htbl,$nlo)");
- &xor ($Zhi,"($Htbl,$nlo)");
-
- &shl ($rem[1],48);
- &xor ($Zlo,$tmp);
-
- &xor ($Zhi,$rem[1]);
- &movz ($rem[0],&LB($Zlo));
- &shr ($Zlo,4);
-
- &mov ($tmp,$Zhi);
- &shl (&LB($rem[0]),4);
- &shr ($Zhi,4);
-
- &xor ($Zlo,"8($Htbl,$nhi[0])");
- &movzw ($rem[0],"($rem_8bit,$rem[0],2)");
- &shl ($tmp,60);
-
- &xor ($Zhi,"($Htbl,$nhi[0])");
- &xor ($Zlo,$tmp);
- &shl ($rem[0],48);
-
- &bswap ($Zlo);
- &xor ($Zhi,$rem[0]);
-
- &bswap ($Zhi);
- &cmp ($inp,$len);
- &jb (".Louter_loop");
- }
- $code.=<<___;
- mov $Zlo,8($Xi)
- mov $Zhi,($Xi)
-
- lea 280(%rsp),%rsi
- mov 0(%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
- .Lghash_epilogue:
- ret
- .size gcm_ghash_4bit,.-gcm_ghash_4bit
- ___
-
- ######################################################################
- # PCLMULQDQ version.
-
- @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
- ("%rdi","%rsi","%rdx","%rcx"); # Unix order
-
- ($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
- ($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
-
- sub clmul64x64_T2 { # minimal register pressure
- my ($Xhi,$Xi,$Hkey,$HK)=@_;
-
- if (!defined($HK)) { $HK = $T2;
- $code.=<<___;
- movdqa $Xi,$Xhi #
- pshufd \$0b01001110,$Xi,$T1
- pshufd \$0b01001110,$Hkey,$T2
- pxor $Xi,$T1 #
- pxor $Hkey,$T2
- ___
- } else {
- $code.=<<___;
- movdqa $Xi,$Xhi #
- pshufd \$0b01001110,$Xi,$T1
- pxor $Xi,$T1 #
- ___
- }
- $code.=<<___;
- pclmulqdq \$0x00,$Hkey,$Xi #######
- pclmulqdq \$0x11,$Hkey,$Xhi #######
- pclmulqdq \$0x00,$HK,$T1 #######
- pxor $Xi,$T1 #
- pxor $Xhi,$T1 #
-
- movdqa $T1,$T2 #
- psrldq \$8,$T1
- pslldq \$8,$T2 #
- pxor $T1,$Xhi
- pxor $T2,$Xi #
- ___
- }
-
- sub reduction_alg9 { # 17/11 times faster than Intel version
- my ($Xhi,$Xi) = @_;
-
- $code.=<<___;
- # 1st phase
- movdqa $Xi,$T2 #
- movdqa $Xi,$T1
- psllq \$5,$Xi
- pxor $Xi,$T1 #
- psllq \$1,$Xi
- pxor $T1,$Xi #
- psllq \$57,$Xi #
- movdqa $Xi,$T1 #
- pslldq \$8,$Xi
- psrldq \$8,$T1 #
- pxor $T2,$Xi
- pxor $T1,$Xhi #
-
- # 2nd phase
- movdqa $Xi,$T2
- psrlq \$1,$Xi
- pxor $T2,$Xhi #
- pxor $Xi,$T2
- psrlq \$5,$Xi
- pxor $T2,$Xi #
- psrlq \$1,$Xi #
- pxor $Xhi,$Xi #
- ___
- }
-
- { my ($Htbl,$Xip)=@_4args;
- my $HK="%xmm6";
-
- $code.=<<___;
- .globl gcm_init_clmul
- .type gcm_init_clmul,\@abi-omnipotent
- .align 16
- gcm_init_clmul:
- .L_init_clmul:
- ___
- $code.=<<___ if ($win64);
- .LSEH_begin_gcm_init_clmul:
- # I can't trust assembler to use specific encoding:-(
- .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
- .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
- ___
- $code.=<<___;
- movdqu ($Xip),$Hkey
- pshufd \$0b01001110,$Hkey,$Hkey # dword swap
-
- # <<1 twist
- pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
- movdqa $Hkey,$T1
- psllq \$1,$Hkey
- pxor $T3,$T3 #
- psrlq \$63,$T1
- pcmpgtd $T2,$T3 # broadcast carry bit
- pslldq \$8,$T1
- por $T1,$Hkey # H<<=1
-
- # magic reduction
- pand .L0x1c2_polynomial(%rip),$T3
- pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
-
- # calculate H^2
- pshufd \$0b01001110,$Hkey,$HK
- movdqa $Hkey,$Xi
- pxor $Hkey,$HK
- ___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK);
- &reduction_alg9 ($Xhi,$Xi);
- $code.=<<___;
- pshufd \$0b01001110,$Hkey,$T1
- pshufd \$0b01001110,$Xi,$T2
- pxor $Hkey,$T1 # Karatsuba pre-processing
- movdqu $Hkey,0x00($Htbl) # save H
- pxor $Xi,$T2 # Karatsuba pre-processing
- movdqu $Xi,0x10($Htbl) # save H^2
- palignr \$8,$T1,$T2 # low part is H.lo^H.hi...
- movdqu $T2,0x20($Htbl) # save Karatsuba "salt"
- ___
- if ($do4xaggr) {
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3
- &reduction_alg9 ($Xhi,$Xi);
- $code.=<<___;
- movdqa $Xi,$T3
- ___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4
- &reduction_alg9 ($Xhi,$Xi);
- $code.=<<___;
- pshufd \$0b01001110,$T3,$T1
- pshufd \$0b01001110,$Xi,$T2
- pxor $T3,$T1 # Karatsuba pre-processing
- movdqu $T3,0x30($Htbl) # save H^3
- pxor $Xi,$T2 # Karatsuba pre-processing
- movdqu $Xi,0x40($Htbl) # save H^4
- palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi...
- movdqu $T2,0x50($Htbl) # save Karatsuba "salt"
- ___
- }
- $code.=<<___ if ($win64);
- movaps (%rsp),%xmm6
- lea 0x18(%rsp),%rsp
- .LSEH_end_gcm_init_clmul:
- ___
- $code.=<<___;
- ret
- .size gcm_init_clmul,.-gcm_init_clmul
- ___
- }
-
- { my ($Xip,$Htbl)=@_4args;
-
- $code.=<<___;
- .globl gcm_gmult_clmul
- .type gcm_gmult_clmul,\@abi-omnipotent
- .align 16
- gcm_gmult_clmul:
- .L_gmult_clmul:
- movdqu ($Xip),$Xi
- movdqa .Lbswap_mask(%rip),$T3
- movdqu ($Htbl),$Hkey
- movdqu 0x20($Htbl),$T2
- pshufb $T3,$Xi
- ___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
- $code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
- # experimental alternative. special thing about is that there
- # no dependency between the two multiplications...
- mov \$`0xE1<<1`,%eax
- mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
- mov \$0x07,%r11d
- movq %rax,$T1
- movq %r10,$T2
- movq %r11,$T3 # borrow $T3
- pand $Xi,$T3
- pshufb $T3,$T2 # ($Xi&7)·0xE0
- movq %rax,$T3
- pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1)
- pxor $Xi,$T2
- pslldq \$15,$T2
- paddd $T2,$T2 # <<(64+56+1)
- pxor $T2,$Xi
- pclmulqdq \$0x01,$T3,$Xi
- movdqa .Lbswap_mask(%rip),$T3 # reload $T3
- psrldq \$1,$T1
- pxor $T1,$Xhi
- pslldq \$7,$Xi
- pxor $Xhi,$Xi
- ___
- $code.=<<___;
- pshufb $T3,$Xi
- movdqu $Xi,($Xip)
- ret
- .size gcm_gmult_clmul,.-gcm_gmult_clmul
- ___
- }
-
- { my ($Xip,$Htbl,$inp,$len)=@_4args;
- my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
- my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
-
- $code.=<<___;
- .globl gcm_ghash_clmul
- .type gcm_ghash_clmul,\@abi-omnipotent
- .align 32
- gcm_ghash_clmul:
- .L_ghash_clmul:
- ___
- $code.=<<___ if ($win64);
- lea -0x88(%rsp),%rax
- .LSEH_begin_gcm_ghash_clmul:
- # I can't trust assembler to use specific encoding:-(
- .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
- .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
- .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
- .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
- .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
- .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
- .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
- .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
- .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
- .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
- .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
- ___
- $code.=<<___;
- movdqa .Lbswap_mask(%rip),$T3
-
- movdqu ($Xip),$Xi
- movdqu ($Htbl),$Hkey
- movdqu 0x20($Htbl),$HK
- pshufb $T3,$Xi
-
- sub \$0x10,$len
- jz .Lodd_tail
-
- movdqu 0x10($Htbl),$Hkey2
- ___
- if ($do4xaggr) {
- my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
-
- $code.=<<___;
- mov OPENSSL_ia32cap_P+4(%rip),%eax
- cmp \$0x30,$len
- jb .Lskip4x
-
- and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE
- cmp \$`1<<22`,%eax # check for MOVBE without XSAVE
- je .Lskip4x
-
- sub \$0x30,$len
- mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
- movdqu 0x30($Htbl),$Hkey3
- movdqu 0x40($Htbl),$Hkey4
-
- #######
- # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
- #
- movdqu 0x30($inp),$Xln
- movdqu 0x20($inp),$Xl
- pshufb $T3,$Xln
- pshufb $T3,$Xl
- movdqa $Xln,$Xhn
- pshufd \$0b01001110,$Xln,$Xmn
- pxor $Xln,$Xmn
- pclmulqdq \$0x00,$Hkey,$Xln
- pclmulqdq \$0x11,$Hkey,$Xhn
- pclmulqdq \$0x00,$HK,$Xmn
-
- movdqa $Xl,$Xh
- pshufd \$0b01001110,$Xl,$Xm
- pxor $Xl,$Xm
- pclmulqdq \$0x00,$Hkey2,$Xl
- pclmulqdq \$0x11,$Hkey2,$Xh
- pclmulqdq \$0x10,$HK,$Xm
- xorps $Xl,$Xln
- xorps $Xh,$Xhn
- movups 0x50($Htbl),$HK
- xorps $Xm,$Xmn
-
- movdqu 0x10($inp),$Xl
- movdqu 0($inp),$T1
- pshufb $T3,$Xl
- pshufb $T3,$T1
- movdqa $Xl,$Xh
- pshufd \$0b01001110,$Xl,$Xm
- pxor $T1,$Xi
- pxor $Xl,$Xm
- pclmulqdq \$0x00,$Hkey3,$Xl
- movdqa $Xi,$Xhi
- pshufd \$0b01001110,$Xi,$T1
- pxor $Xi,$T1
- pclmulqdq \$0x11,$Hkey3,$Xh
- pclmulqdq \$0x00,$HK,$Xm
- xorps $Xl,$Xln
- xorps $Xh,$Xhn
-
- lea 0x40($inp),$inp
- sub \$0x40,$len
- jc .Ltail4x
-
- jmp .Lmod4_loop
- .align 32
- .Lmod4_loop:
- pclmulqdq \$0x00,$Hkey4,$Xi
- xorps $Xm,$Xmn
- movdqu 0x30($inp),$Xl
- pshufb $T3,$Xl
- pclmulqdq \$0x11,$Hkey4,$Xhi
- xorps $Xln,$Xi
- movdqu 0x20($inp),$Xln
- movdqa $Xl,$Xh
- pclmulqdq \$0x10,$HK,$T1
- pshufd \$0b01001110,$Xl,$Xm
- xorps $Xhn,$Xhi
- pxor $Xl,$Xm
- pshufb $T3,$Xln
- movups 0x20($Htbl),$HK
- xorps $Xmn,$T1
- pclmulqdq \$0x00,$Hkey,$Xl
- pshufd \$0b01001110,$Xln,$Xmn
-
- pxor $Xi,$T1 # aggregated Karatsuba post-processing
- movdqa $Xln,$Xhn
- pxor $Xhi,$T1 #
- pxor $Xln,$Xmn
- movdqa $T1,$T2 #
- pclmulqdq \$0x11,$Hkey,$Xh
- pslldq \$8,$T1
- psrldq \$8,$T2 #
- pxor $T1,$Xi
- movdqa .L7_mask(%rip),$T1
- pxor $T2,$Xhi #
- movq %rax,$T2
-
- pand $Xi,$T1 # 1st phase
- pshufb $T1,$T2 #
- pxor $Xi,$T2 #
- pclmulqdq \$0x00,$HK,$Xm
- psllq \$57,$T2 #
- movdqa $T2,$T1 #
- pslldq \$8,$T2
- pclmulqdq \$0x00,$Hkey2,$Xln
- psrldq \$8,$T1 #
- pxor $T2,$Xi
- pxor $T1,$Xhi #
- movdqu 0($inp),$T1
-
- movdqa $Xi,$T2 # 2nd phase
- psrlq \$1,$Xi
- pclmulqdq \$0x11,$Hkey2,$Xhn
- xorps $Xl,$Xln
- movdqu 0x10($inp),$Xl
- pshufb $T3,$Xl
- pclmulqdq \$0x10,$HK,$Xmn
- xorps $Xh,$Xhn
- movups 0x50($Htbl),$HK
- pshufb $T3,$T1
- pxor $T2,$Xhi #
- pxor $Xi,$T2
- psrlq \$5,$Xi
-
- movdqa $Xl,$Xh
- pxor $Xm,$Xmn
- pshufd \$0b01001110,$Xl,$Xm
- pxor $T2,$Xi #
- pxor $T1,$Xhi
- pxor $Xl,$Xm
- pclmulqdq \$0x00,$Hkey3,$Xl
- psrlq \$1,$Xi #
- pxor $Xhi,$Xi #
- movdqa $Xi,$Xhi
- pclmulqdq \$0x11,$Hkey3,$Xh
- xorps $Xl,$Xln
- pshufd \$0b01001110,$Xi,$T1
- pxor $Xi,$T1
-
- pclmulqdq \$0x00,$HK,$Xm
- xorps $Xh,$Xhn
-
- lea 0x40($inp),$inp
- sub \$0x40,$len
- jnc .Lmod4_loop
-
- .Ltail4x:
- pclmulqdq \$0x00,$Hkey4,$Xi
- pclmulqdq \$0x11,$Hkey4,$Xhi
- pclmulqdq \$0x10,$HK,$T1
- xorps $Xm,$Xmn
- xorps $Xln,$Xi
- xorps $Xhn,$Xhi
- pxor $Xi,$Xhi # aggregated Karatsuba post-processing
- pxor $Xmn,$T1
-
- pxor $Xhi,$T1 #
- pxor $Xi,$Xhi
-
- movdqa $T1,$T2 #
- psrldq \$8,$T1
- pslldq \$8,$T2 #
- pxor $T1,$Xhi
- pxor $T2,$Xi #
- ___
- &reduction_alg9($Xhi,$Xi);
- $code.=<<___;
- add \$0x40,$len
- jz .Ldone
- movdqu 0x20($Htbl),$HK
- sub \$0x10,$len
- jz .Lodd_tail
- .Lskip4x:
- ___
- }
- $code.=<<___;
- #######
- # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
- # [(H*Ii+1) + (H*Xi+1)] mod P =
- # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
- #
- movdqu ($inp),$T1 # Ii
- movdqu 16($inp),$Xln # Ii+1
- pshufb $T3,$T1
- pshufb $T3,$Xln
- pxor $T1,$Xi # Ii+Xi
-
- movdqa $Xln,$Xhn
- pshufd \$0b01001110,$Xln,$Xmn
- pxor $Xln,$Xmn
- pclmulqdq \$0x00,$Hkey,$Xln
- pclmulqdq \$0x11,$Hkey,$Xhn
- pclmulqdq \$0x00,$HK,$Xmn
-
- lea 32($inp),$inp # i+=2
- nop
- sub \$0x20,$len
- jbe .Leven_tail
- nop
- jmp .Lmod_loop
-
- .align 32
- .Lmod_loop:
- movdqa $Xi,$Xhi
- movdqa $Xmn,$T1
- pshufd \$0b01001110,$Xi,$Xmn #
- pxor $Xi,$Xmn #
-
- pclmulqdq \$0x00,$Hkey2,$Xi
- pclmulqdq \$0x11,$Hkey2,$Xhi
- pclmulqdq \$0x10,$HK,$Xmn
-
- pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
- pxor $Xhn,$Xhi
- movdqu ($inp),$T2 # Ii
- pxor $Xi,$T1 # aggregated Karatsuba post-processing
- pshufb $T3,$T2
- movdqu 16($inp),$Xln # Ii+1
-
- pxor $Xhi,$T1
- pxor $T2,$Xhi # "Ii+Xi", consume early
- pxor $T1,$Xmn
- pshufb $T3,$Xln
- movdqa $Xmn,$T1 #
- psrldq \$8,$T1
- pslldq \$8,$Xmn #
- pxor $T1,$Xhi
- pxor $Xmn,$Xi #
-
- movdqa $Xln,$Xhn #
-
- movdqa $Xi,$T2 # 1st phase
- movdqa $Xi,$T1
- psllq \$5,$Xi
- pxor $Xi,$T1 #
- pclmulqdq \$0x00,$Hkey,$Xln #######
- psllq \$1,$Xi
- pxor $T1,$Xi #
- psllq \$57,$Xi #
- movdqa $Xi,$T1 #
- pslldq \$8,$Xi
- psrldq \$8,$T1 #
- pxor $T2,$Xi
- pshufd \$0b01001110,$Xhn,$Xmn
- pxor $T1,$Xhi #
- pxor $Xhn,$Xmn #
-
- movdqa $Xi,$T2 # 2nd phase
- psrlq \$1,$Xi
- pclmulqdq \$0x11,$Hkey,$Xhn #######
- pxor $T2,$Xhi #
- pxor $Xi,$T2
- psrlq \$5,$Xi
- pxor $T2,$Xi #
- lea 32($inp),$inp
- psrlq \$1,$Xi #
- pclmulqdq \$0x00,$HK,$Xmn #######
- pxor $Xhi,$Xi #
-
- sub \$0x20,$len
- ja .Lmod_loop
-
- .Leven_tail:
- movdqa $Xi,$Xhi
- movdqa $Xmn,$T1
- pshufd \$0b01001110,$Xi,$Xmn #
- pxor $Xi,$Xmn #
-
- pclmulqdq \$0x00,$Hkey2,$Xi
- pclmulqdq \$0x11,$Hkey2,$Xhi
- pclmulqdq \$0x10,$HK,$Xmn
-
- pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
- pxor $Xhn,$Xhi
- pxor $Xi,$T1
- pxor $Xhi,$T1
- pxor $T1,$Xmn
- movdqa $Xmn,$T1 #
- psrldq \$8,$T1
- pslldq \$8,$Xmn #
- pxor $T1,$Xhi
- pxor $Xmn,$Xi #
- ___
- &reduction_alg9 ($Xhi,$Xi);
- $code.=<<___;
- test $len,$len
- jnz .Ldone
-
- .Lodd_tail:
- movdqu ($inp),$T1 # Ii
- pshufb $T3,$T1
- pxor $T1,$Xi # Ii+Xi
- ___
- &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi)
- &reduction_alg9 ($Xhi,$Xi);
- $code.=<<___;
- .Ldone:
- pshufb $T3,$Xi
- movdqu $Xi,($Xip)
- ___
- $code.=<<___ if ($win64);
- movaps (%rsp),%xmm6
- movaps 0x10(%rsp),%xmm7
- movaps 0x20(%rsp),%xmm8
- movaps 0x30(%rsp),%xmm9
- movaps 0x40(%rsp),%xmm10
- movaps 0x50(%rsp),%xmm11
- movaps 0x60(%rsp),%xmm12
- movaps 0x70(%rsp),%xmm13
- movaps 0x80(%rsp),%xmm14
- movaps 0x90(%rsp),%xmm15
- lea 0xa8(%rsp),%rsp
- .LSEH_end_gcm_ghash_clmul:
- ___
- $code.=<<___;
- ret
- .size gcm_ghash_clmul,.-gcm_ghash_clmul
- ___
- }
-
- $code.=<<___;
- .globl gcm_init_avx
- .type gcm_init_avx,\@abi-omnipotent
- .align 32
- gcm_init_avx:
- ___
- if ($avx) {
- my ($Htbl,$Xip)=@_4args;
- my $HK="%xmm6";
-
- $code.=<<___ if ($win64);
- .LSEH_begin_gcm_init_avx:
- # I can't trust assembler to use specific encoding:-(
- .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
- .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
- ___
- $code.=<<___;
- vzeroupper
-
- vmovdqu ($Xip),$Hkey
- vpshufd \$0b01001110,$Hkey,$Hkey # dword swap
-
- # <<1 twist
- vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
- vpsrlq \$63,$Hkey,$T1
- vpsllq \$1,$Hkey,$Hkey
- vpxor $T3,$T3,$T3 #
- vpcmpgtd $T2,$T3,$T3 # broadcast carry bit
- vpslldq \$8,$T1,$T1
- vpor $T1,$Hkey,$Hkey # H<<=1
-
- # magic reduction
- vpand .L0x1c2_polynomial(%rip),$T3,$T3
- vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial
-
- vpunpckhqdq $Hkey,$Hkey,$HK
- vmovdqa $Hkey,$Xi
- vpxor $Hkey,$HK,$HK
- mov \$4,%r10 # up to H^8
- jmp .Linit_start_avx
- ___
-
- sub clmul64x64_avx {
- my ($Xhi,$Xi,$Hkey,$HK)=@_;
-
- if (!defined($HK)) { $HK = $T2;
- $code.=<<___;
- vpunpckhqdq $Xi,$Xi,$T1
- vpunpckhqdq $Hkey,$Hkey,$T2
- vpxor $Xi,$T1,$T1 #
- vpxor $Hkey,$T2,$T2
- ___
- } else {
- $code.=<<___;
- vpunpckhqdq $Xi,$Xi,$T1
- vpxor $Xi,$T1,$T1 #
- ___
- }
- $code.=<<___;
- vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi #######
- vpclmulqdq \$0x00,$Hkey,$Xi,$Xi #######
- vpclmulqdq \$0x00,$HK,$T1,$T1 #######
- vpxor $Xi,$Xhi,$T2 #
- vpxor $T2,$T1,$T1 #
-
- vpslldq \$8,$T1,$T2 #
- vpsrldq \$8,$T1,$T1
- vpxor $T2,$Xi,$Xi #
- vpxor $T1,$Xhi,$Xhi
- ___
- }
-
- sub reduction_avx {
- my ($Xhi,$Xi) = @_;
-
- $code.=<<___;
- vpsllq \$57,$Xi,$T1 # 1st phase
- vpsllq \$62,$Xi,$T2
- vpxor $T1,$T2,$T2 #
- vpsllq \$63,$Xi,$T1
- vpxor $T1,$T2,$T2 #
- vpslldq \$8,$T2,$T1 #
- vpsrldq \$8,$T2,$T2
- vpxor $T1,$Xi,$Xi #
- vpxor $T2,$Xhi,$Xhi
-
- vpsrlq \$1,$Xi,$T2 # 2nd phase
- vpxor $Xi,$Xhi,$Xhi
- vpxor $T2,$Xi,$Xi #
- vpsrlq \$5,$T2,$T2
- vpxor $T2,$Xi,$Xi #
- vpsrlq \$1,$Xi,$Xi #
- vpxor $Xhi,$Xi,$Xi #
- ___
- }
-
- $code.=<<___;
- .align 32
- .Linit_loop_avx:
- vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi...
- vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt"
- ___
- &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7
- &reduction_avx ($Xhi,$Xi);
- $code.=<<___;
- .Linit_start_avx:
- vmovdqa $Xi,$T3
- ___
- &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8
- &reduction_avx ($Xhi,$Xi);
- $code.=<<___;
- vpshufd \$0b01001110,$T3,$T1
- vpshufd \$0b01001110,$Xi,$T2
- vpxor $T3,$T1,$T1 # Karatsuba pre-processing
- vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7
- vpxor $Xi,$T2,$T2 # Karatsuba pre-processing
- vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8
- lea 0x30($Htbl),$Htbl
- sub \$1,%r10
- jnz .Linit_loop_avx
-
- vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped
- vmovdqu $T3,-0x10($Htbl)
-
- vzeroupper
- ___
- $code.=<<___ if ($win64);
- movaps (%rsp),%xmm6
- lea 0x18(%rsp),%rsp
- .LSEH_end_gcm_init_avx:
- ___
- $code.=<<___;
- ret
- .size gcm_init_avx,.-gcm_init_avx
- ___
- } else {
- $code.=<<___;
- jmp .L_init_clmul
- .size gcm_init_avx,.-gcm_init_avx
- ___
- }
-
- $code.=<<___;
- .globl gcm_gmult_avx
- .type gcm_gmult_avx,\@abi-omnipotent
- .align 32
- gcm_gmult_avx:
- jmp .L_gmult_clmul
- .size gcm_gmult_avx,.-gcm_gmult_avx
- ___
-
- $code.=<<___;
- .globl gcm_ghash_avx
- .type gcm_ghash_avx,\@abi-omnipotent
- .align 32
- gcm_ghash_avx:
- ___
- if ($avx) {
- my ($Xip,$Htbl,$inp,$len)=@_4args;
- my ($Xlo,$Xhi,$Xmi,
- $Zlo,$Zhi,$Zmi,
- $Hkey,$HK,$T1,$T2,
- $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));
-
- $code.=<<___ if ($win64);
- lea -0x88(%rsp),%rax
- .LSEH_begin_gcm_ghash_avx:
- # I can't trust assembler to use specific encoding:-(
- .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
- .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
- .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
- .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
- .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
- .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
- .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
- .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
- .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
- .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
- .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
- ___
- $code.=<<___;
- vzeroupper
-
- vmovdqu ($Xip),$Xi # load $Xi
- lea .L0x1c2_polynomial(%rip),%r10
- lea 0x40($Htbl),$Htbl # size optimization
- vmovdqu .Lbswap_mask(%rip),$bswap
- vpshufb $bswap,$Xi,$Xi
- cmp \$0x80,$len
- jb .Lshort_avx
- sub \$0x80,$len
-
- vmovdqu 0x70($inp),$Ii # I[7]
- vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
- vpshufb $bswap,$Ii,$Ii
- vmovdqu 0x20-0x40($Htbl),$HK
-
- vpunpckhqdq $Ii,$Ii,$T2
- vmovdqu 0x60($inp),$Ij # I[6]
- vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
- vpxor $Ii,$T2,$T2
- vpshufb $bswap,$Ij,$Ij
- vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
- vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
- vpunpckhqdq $Ij,$Ij,$T1
- vmovdqu 0x50($inp),$Ii # I[5]
- vpclmulqdq \$0x00,$HK,$T2,$Xmi
- vpxor $Ij,$T1,$T1
-
- vpshufb $bswap,$Ii,$Ii
- vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
- vpunpckhqdq $Ii,$Ii,$T2
- vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
- vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
- vpxor $Ii,$T2,$T2
- vmovdqu 0x40($inp),$Ij # I[4]
- vpclmulqdq \$0x10,$HK,$T1,$Zmi
- vmovdqu 0x50-0x40($Htbl),$HK
-
- vpshufb $bswap,$Ij,$Ij
- vpxor $Xlo,$Zlo,$Zlo
- vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
- vpxor $Xhi,$Zhi,$Zhi
- vpunpckhqdq $Ij,$Ij,$T1
- vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
- vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
- vpxor $Xmi,$Zmi,$Zmi
- vpclmulqdq \$0x00,$HK,$T2,$Xmi
- vpxor $Ij,$T1,$T1
-
- vmovdqu 0x30($inp),$Ii # I[3]
- vpxor $Zlo,$Xlo,$Xlo
- vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
- vpxor $Zhi,$Xhi,$Xhi
- vpshufb $bswap,$Ii,$Ii
- vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
- vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
- vpxor $Zmi,$Xmi,$Xmi
- vpunpckhqdq $Ii,$Ii,$T2
- vpclmulqdq \$0x10,$HK,$T1,$Zmi
- vmovdqu 0x80-0x40($Htbl),$HK
- vpxor $Ii,$T2,$T2
-
- vmovdqu 0x20($inp),$Ij # I[2]
- vpxor $Xlo,$Zlo,$Zlo
- vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
- vpxor $Xhi,$Zhi,$Zhi
- vpshufb $bswap,$Ij,$Ij
- vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
- vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
- vpxor $Xmi,$Zmi,$Zmi
- vpunpckhqdq $Ij,$Ij,$T1
- vpclmulqdq \$0x00,$HK,$T2,$Xmi
- vpxor $Ij,$T1,$T1
-
- vmovdqu 0x10($inp),$Ii # I[1]
- vpxor $Zlo,$Xlo,$Xlo
- vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
- vpxor $Zhi,$Xhi,$Xhi
- vpshufb $bswap,$Ii,$Ii
- vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
- vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
- vpxor $Zmi,$Xmi,$Xmi
- vpunpckhqdq $Ii,$Ii,$T2
- vpclmulqdq \$0x10,$HK,$T1,$Zmi
- vmovdqu 0xb0-0x40($Htbl),$HK
- vpxor $Ii,$T2,$T2
-
- vmovdqu ($inp),$Ij # I[0]
- vpxor $Xlo,$Zlo,$Zlo
- vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
- vpxor $Xhi,$Zhi,$Zhi
- vpshufb $bswap,$Ij,$Ij
- vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
- vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
- vpxor $Xmi,$Zmi,$Zmi
- vpclmulqdq \$0x10,$HK,$T2,$Xmi
-
- lea 0x80($inp),$inp
- cmp \$0x80,$len
- jb .Ltail_avx
-
- vpxor $Xi,$Ij,$Ij # accumulate $Xi
- sub \$0x80,$len
- jmp .Loop8x_avx
-
- .align 32
- .Loop8x_avx:
- vpunpckhqdq $Ij,$Ij,$T1
- vmovdqu 0x70($inp),$Ii # I[7]
- vpxor $Xlo,$Zlo,$Zlo
- vpxor $Ij,$T1,$T1
- vpclmulqdq \$0x00,$Hkey,$Ij,$Xi
- vpshufb $bswap,$Ii,$Ii
- vpxor $Xhi,$Zhi,$Zhi
- vpclmulqdq \$0x11,$Hkey,$Ij,$Xo
- vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
- vpunpckhqdq $Ii,$Ii,$T2
- vpxor $Xmi,$Zmi,$Zmi
- vpclmulqdq \$0x00,$HK,$T1,$Tred
- vmovdqu 0x20-0x40($Htbl),$HK
- vpxor $Ii,$T2,$T2
-
- vmovdqu 0x60($inp),$Ij # I[6]
- vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
- vpxor $Zlo,$Xi,$Xi # collect result
- vpshufb $bswap,$Ij,$Ij
- vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
- vxorps $Zhi,$Xo,$Xo
- vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
- vpunpckhqdq $Ij,$Ij,$T1
- vpclmulqdq \$0x00,$HK, $T2,$Xmi
- vpxor $Zmi,$Tred,$Tred
- vxorps $Ij,$T1,$T1
-
- vmovdqu 0x50($inp),$Ii # I[5]
- vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing
- vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
- vpxor $Xo,$Tred,$Tred
- vpslldq \$8,$Tred,$T2
- vpxor $Xlo,$Zlo,$Zlo
- vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
- vpsrldq \$8,$Tred,$Tred
- vpxor $T2, $Xi, $Xi
- vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
- vpshufb $bswap,$Ii,$Ii
- vxorps $Tred,$Xo, $Xo
- vpxor $Xhi,$Zhi,$Zhi
- vpunpckhqdq $Ii,$Ii,$T2
- vpclmulqdq \$0x10,$HK, $T1,$Zmi
- vmovdqu 0x50-0x40($Htbl),$HK
- vpxor $Ii,$T2,$T2
- vpxor $Xmi,$Zmi,$Zmi
-
- vmovdqu 0x40($inp),$Ij # I[4]
- vpalignr \$8,$Xi,$Xi,$Tred # 1st phase
- vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
- vpshufb $bswap,$Ij,$Ij
- vpxor $Zlo,$Xlo,$Xlo
- vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
- vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
- vpunpckhqdq $Ij,$Ij,$T1
- vpxor $Zhi,$Xhi,$Xhi
- vpclmulqdq \$0x00,$HK, $T2,$Xmi
- vxorps $Ij,$T1,$T1
- vpxor $Zmi,$Xmi,$Xmi
-
- vmovdqu 0x30($inp),$Ii # I[3]
- vpclmulqdq \$0x10,(%r10),$Xi,$Xi
- vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
- vpshufb $bswap,$Ii,$Ii
- vpxor $Xlo,$Zlo,$Zlo
- vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
- vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
- vpunpckhqdq $Ii,$Ii,$T2
- vpxor $Xhi,$Zhi,$Zhi
- vpclmulqdq \$0x10,$HK, $T1,$Zmi
- vmovdqu 0x80-0x40($Htbl),$HK
- vpxor $Ii,$T2,$T2
- vpxor $Xmi,$Zmi,$Zmi
-
- vmovdqu 0x20($inp),$Ij # I[2]
- vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
- vpshufb $bswap,$Ij,$Ij
- vpxor $Zlo,$Xlo,$Xlo
- vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
- vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
- vpunpckhqdq $Ij,$Ij,$T1
- vpxor $Zhi,$Xhi,$Xhi
- vpclmulqdq \$0x00,$HK, $T2,$Xmi
- vpxor $Ij,$T1,$T1
- vpxor $Zmi,$Xmi,$Xmi
- vxorps $Tred,$Xi,$Xi
-
- vmovdqu 0x10($inp),$Ii # I[1]
- vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase
- vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
- vpshufb $bswap,$Ii,$Ii
- vpxor $Xlo,$Zlo,$Zlo
- vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
- vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
- vpclmulqdq \$0x10,(%r10),$Xi,$Xi
- vxorps $Xo,$Tred,$Tred
- vpunpckhqdq $Ii,$Ii,$T2
- vpxor $Xhi,$Zhi,$Zhi
- vpclmulqdq \$0x10,$HK, $T1,$Zmi
- vmovdqu 0xb0-0x40($Htbl),$HK
- vpxor $Ii,$T2,$T2
- vpxor $Xmi,$Zmi,$Zmi
-
- vmovdqu ($inp),$Ij # I[0]
- vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
- vpshufb $bswap,$Ij,$Ij
- vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
- vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
- vpxor $Tred,$Ij,$Ij
- vpclmulqdq \$0x10,$HK, $T2,$Xmi
- vpxor $Xi,$Ij,$Ij # accumulate $Xi
-
- lea 0x80($inp),$inp
- sub \$0x80,$len
- jnc .Loop8x_avx
-
- add \$0x80,$len
- jmp .Ltail_no_xor_avx
-
- .align 32
- .Lshort_avx:
- vmovdqu -0x10($inp,$len),$Ii # very last word
- lea ($inp,$len),$inp
- vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
- vmovdqu 0x20-0x40($Htbl),$HK
- vpshufb $bswap,$Ii,$Ij
-
- vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo,
- vmovdqa $Xhi,$Zhi # $Zhi and
- vmovdqa $Xmi,$Zmi # $Zmi
- sub \$0x10,$len
- jz .Ltail_avx
-
- vpunpckhqdq $Ij,$Ij,$T1
- vpxor $Xlo,$Zlo,$Zlo
- vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
- vpxor $Ij,$T1,$T1
- vmovdqu -0x20($inp),$Ii
- vpxor $Xhi,$Zhi,$Zhi
- vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
- vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
- vpshufb $bswap,$Ii,$Ij
- vpxor $Xmi,$Zmi,$Zmi
- vpclmulqdq \$0x00,$HK,$T1,$Xmi
- vpsrldq \$8,$HK,$HK
- sub \$0x10,$len
- jz .Ltail_avx
-
- vpunpckhqdq $Ij,$Ij,$T1
- vpxor $Xlo,$Zlo,$Zlo
- vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
- vpxor $Ij,$T1,$T1
- vmovdqu -0x30($inp),$Ii
- vpxor $Xhi,$Zhi,$Zhi
- vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
- vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
- vpshufb $bswap,$Ii,$Ij
- vpxor $Xmi,$Zmi,$Zmi
- vpclmulqdq \$0x00,$HK,$T1,$Xmi
- vmovdqu 0x50-0x40($Htbl),$HK
- sub \$0x10,$len
- jz .Ltail_avx
-
- vpunpckhqdq $Ij,$Ij,$T1
- vpxor $Xlo,$Zlo,$Zlo
- vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
- vpxor $Ij,$T1,$T1
- vmovdqu -0x40($inp),$Ii
- vpxor $Xhi,$Zhi,$Zhi
- vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
- vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
- vpshufb $bswap,$Ii,$Ij
- vpxor $Xmi,$Zmi,$Zmi
- vpclmulqdq \$0x00,$HK,$T1,$Xmi
- vpsrldq \$8,$HK,$HK
- sub \$0x10,$len
- jz .Ltail_avx
-
- vpunpckhqdq $Ij,$Ij,$T1
- vpxor $Xlo,$Zlo,$Zlo
- vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
- vpxor $Ij,$T1,$T1
- vmovdqu -0x50($inp),$Ii
- vpxor $Xhi,$Zhi,$Zhi
- vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
- vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
- vpshufb $bswap,$Ii,$Ij
- vpxor $Xmi,$Zmi,$Zmi
- vpclmulqdq \$0x00,$HK,$T1,$Xmi
- vmovdqu 0x80-0x40($Htbl),$HK
- sub \$0x10,$len
- jz .Ltail_avx
-
- vpunpckhqdq $Ij,$Ij,$T1
- vpxor $Xlo,$Zlo,$Zlo
- vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
- vpxor $Ij,$T1,$T1
- vmovdqu -0x60($inp),$Ii
- vpxor $Xhi,$Zhi,$Zhi
- vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
- vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
- vpshufb $bswap,$Ii,$Ij
- vpxor $Xmi,$Zmi,$Zmi
- vpclmulqdq \$0x00,$HK,$T1,$Xmi
- vpsrldq \$8,$HK,$HK
- sub \$0x10,$len
- jz .Ltail_avx
-
- vpunpckhqdq $Ij,$Ij,$T1
- vpxor $Xlo,$Zlo,$Zlo
- vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
- vpxor $Ij,$T1,$T1
- vmovdqu -0x70($inp),$Ii
- vpxor $Xhi,$Zhi,$Zhi
- vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
- vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
- vpshufb $bswap,$Ii,$Ij
- vpxor $Xmi,$Zmi,$Zmi
- vpclmulqdq \$0x00,$HK,$T1,$Xmi
- vmovq 0xb8-0x40($Htbl),$HK
- sub \$0x10,$len
- jmp .Ltail_avx
-
- .align 32
- .Ltail_avx:
- vpxor $Xi,$Ij,$Ij # accumulate $Xi
- .Ltail_no_xor_avx:
- vpunpckhqdq $Ij,$Ij,$T1
- vpxor $Xlo,$Zlo,$Zlo
- vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
- vpxor $Ij,$T1,$T1
- vpxor $Xhi,$Zhi,$Zhi
- vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
- vpxor $Xmi,$Zmi,$Zmi
- vpclmulqdq \$0x00,$HK,$T1,$Xmi
-
- vmovdqu (%r10),$Tred
-
- vpxor $Xlo,$Zlo,$Xi
- vpxor $Xhi,$Zhi,$Xo
- vpxor $Xmi,$Zmi,$Zmi
-
- vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing
- vpxor $Xo, $Zmi,$Zmi
- vpslldq \$8, $Zmi,$T2
- vpsrldq \$8, $Zmi,$Zmi
- vpxor $T2, $Xi, $Xi
- vpxor $Zmi,$Xo, $Xo
-
- vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase
- vpalignr \$8,$Xi,$Xi,$Xi
- vpxor $T2,$Xi,$Xi
-
- vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase
- vpalignr \$8,$Xi,$Xi,$Xi
- vpxor $Xo,$Xi,$Xi
- vpxor $T2,$Xi,$Xi
-
- cmp \$0,$len
- jne .Lshort_avx
-
- vpshufb $bswap,$Xi,$Xi
- vmovdqu $Xi,($Xip)
- vzeroupper
- ___
- $code.=<<___ if ($win64);
- movaps (%rsp),%xmm6
- movaps 0x10(%rsp),%xmm7
- movaps 0x20(%rsp),%xmm8
- movaps 0x30(%rsp),%xmm9
- movaps 0x40(%rsp),%xmm10
- movaps 0x50(%rsp),%xmm11
- movaps 0x60(%rsp),%xmm12
- movaps 0x70(%rsp),%xmm13
- movaps 0x80(%rsp),%xmm14
- movaps 0x90(%rsp),%xmm15
- lea 0xa8(%rsp),%rsp
- .LSEH_end_gcm_ghash_avx:
- ___
- $code.=<<___;
- ret
- .size gcm_ghash_avx,.-gcm_ghash_avx
- ___
- } else {
- $code.=<<___;
- jmp .L_ghash_clmul
- .size gcm_ghash_avx,.-gcm_ghash_avx
- ___
- }
-
- $code.=<<___;
- .align 64
- .Lbswap_mask:
- .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
- .L0x1c2_polynomial:
- .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
- .L7_mask:
- .long 7,0,7,0
- .L7_mask_poly:
- .long 7,0,`0xE1<<1`,0
- .align 64
- .type .Lrem_4bit,\@object
- .Lrem_4bit:
- .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
- .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
- .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
- .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
- .type .Lrem_8bit,\@object
- .Lrem_8bit:
- .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
- .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
- .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
- .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
- .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
- .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
- .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
- .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
- .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
- .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
- .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
- .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
- .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
- .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
- .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
- .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
- .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
- .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
- .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
- .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
- .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
- .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
- .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
- .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
1575
- .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
1576
- .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
1577
- .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
1578
- .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
1579
- .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
1580
- .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
1581
- .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
1582
- .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
1583
-
1584
- .asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1585
- .align 64
1586
- ___
1587
-
1588
- # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1589
- # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1590
- if ($win64) {
1591
- $rec="%rcx";
1592
- $frame="%rdx";
1593
- $context="%r8";
1594
- $disp="%r9";
1595
-
1596
- $code.=<<___;
1597
- .extern __imp_RtlVirtualUnwind
1598
- .type se_handler,\@abi-omnipotent
1599
- .align 16
1600
- se_handler:
1601
- push %rsi
1602
- push %rdi
1603
- push %rbx
1604
- push %rbp
1605
- push %r12
1606
- push %r13
1607
- push %r14
1608
- push %r15
1609
- pushfq
1610
- sub \$64,%rsp
1611
-
1612
- mov 120($context),%rax # pull context->Rax
1613
- mov 248($context),%rbx # pull context->Rip
1614
-
1615
- mov 8($disp),%rsi # disp->ImageBase
1616
- mov 56($disp),%r11 # disp->HandlerData
1617
-
1618
- mov 0(%r11),%r10d # HandlerData[0]
1619
- lea (%rsi,%r10),%r10 # prologue label
1620
- cmp %r10,%rbx # context->Rip<prologue label
1621
- jb .Lin_prologue
1622
-
1623
- mov 152($context),%rax # pull context->Rsp
1624
-
1625
- mov 4(%r11),%r10d # HandlerData[1]
1626
- lea (%rsi,%r10),%r10 # epilogue label
1627
- cmp %r10,%rbx # context->Rip>=epilogue label
1628
- jae .Lin_prologue
1629
-
1630
- lea 24(%rax),%rax # adjust "rsp"
1631
-
1632
- mov -8(%rax),%rbx
1633
- mov -16(%rax),%rbp
1634
- mov -24(%rax),%r12
1635
- mov %rbx,144($context) # restore context->Rbx
1636
- mov %rbp,160($context) # restore context->Rbp
1637
- mov %r12,216($context) # restore context->R12
1638
-
1639
- .Lin_prologue:
1640
- mov 8(%rax),%rdi
1641
- mov 16(%rax),%rsi
1642
- mov %rax,152($context) # restore context->Rsp
1643
- mov %rsi,168($context) # restore context->Rsi
1644
- mov %rdi,176($context) # restore context->Rdi
1645
-
1646
- mov 40($disp),%rdi # disp->ContextRecord
1647
- mov $context,%rsi # context
1648
- mov \$`1232/8`,%ecx # sizeof(CONTEXT)
1649
- .long 0xa548f3fc # cld; rep movsq
1650
-
1651
- mov $disp,%rsi
1652
- xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1653
- mov 8(%rsi),%rdx # arg2, disp->ImageBase
1654
- mov 0(%rsi),%r8 # arg3, disp->ControlPc
1655
- mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1656
- mov 40(%rsi),%r10 # disp->ContextRecord
1657
- lea 56(%rsi),%r11 # &disp->HandlerData
1658
- lea 24(%rsi),%r12 # &disp->EstablisherFrame
1659
- mov %r10,32(%rsp) # arg5
1660
- mov %r11,40(%rsp) # arg6
1661
- mov %r12,48(%rsp) # arg7
1662
- mov %rcx,56(%rsp) # arg8, (NULL)
1663
- call *__imp_RtlVirtualUnwind(%rip)
1664
-
1665
- mov \$1,%eax # ExceptionContinueSearch
1666
- add \$64,%rsp
1667
- popfq
1668
- pop %r15
1669
- pop %r14
1670
- pop %r13
1671
- pop %r12
1672
- pop %rbp
1673
- pop %rbx
1674
- pop %rdi
1675
- pop %rsi
1676
- ret
1677
- .size se_handler,.-se_handler
1678
-
1679
- .section .pdata
1680
- .align 4
1681
- .rva .LSEH_begin_gcm_gmult_4bit
1682
- .rva .LSEH_end_gcm_gmult_4bit
1683
- .rva .LSEH_info_gcm_gmult_4bit
1684
-
1685
- .rva .LSEH_begin_gcm_ghash_4bit
1686
- .rva .LSEH_end_gcm_ghash_4bit
1687
- .rva .LSEH_info_gcm_ghash_4bit
1688
-
1689
- .rva .LSEH_begin_gcm_init_clmul
1690
- .rva .LSEH_end_gcm_init_clmul
1691
- .rva .LSEH_info_gcm_init_clmul
1692
-
1693
- .rva .LSEH_begin_gcm_ghash_clmul
1694
- .rva .LSEH_end_gcm_ghash_clmul
1695
- .rva .LSEH_info_gcm_ghash_clmul
1696
- ___
1697
- $code.=<<___ if ($avx);
1698
- .rva .LSEH_begin_gcm_init_avx
1699
- .rva .LSEH_end_gcm_init_avx
1700
- .rva .LSEH_info_gcm_init_clmul
1701
-
1702
- .rva .LSEH_begin_gcm_ghash_avx
1703
- .rva .LSEH_end_gcm_ghash_avx
1704
- .rva .LSEH_info_gcm_ghash_clmul
1705
- ___
1706
- $code.=<<___;
1707
- .section .xdata
1708
- .align 8
1709
- .LSEH_info_gcm_gmult_4bit:
1710
- .byte 9,0,0,0
1711
- .rva se_handler
1712
- .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData
1713
- .LSEH_info_gcm_ghash_4bit:
1714
- .byte 9,0,0,0
1715
- .rva se_handler
1716
- .rva .Lghash_prologue,.Lghash_epilogue # HandlerData
1717
- .LSEH_info_gcm_init_clmul:
1718
- .byte 0x01,0x08,0x03,0x00
1719
- .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
1720
- .byte 0x04,0x22,0x00,0x00 #sub rsp,0x18
1721
- .LSEH_info_gcm_ghash_clmul:
1722
- .byte 0x01,0x33,0x16,0x00
1723
- .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15
1724
- .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14
1725
- .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13
1726
- .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12
1727
- .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11
1728
- .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
1729
- .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
1730
- .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
1731
- .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
1732
- .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
1733
- .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
1734
- ___
1735
- }
1736
-
1737
- $code =~ s/\`([^\`]*)\`/eval($1)/gem;
1738
-
1739
- print $code;
1740
-
1741
- close STDOUT;