ring-native 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (261)
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/Gemfile +3 -0
  4. data/README.md +22 -0
  5. data/Rakefile +1 -0
  6. data/ext/ring/extconf.rb +29 -0
  7. data/lib/ring/native.rb +8 -0
  8. data/lib/ring/native/version.rb +5 -0
  9. data/ring-native.gemspec +25 -0
  10. data/vendor/ring/BUILDING.md +40 -0
  11. data/vendor/ring/Cargo.toml +43 -0
  12. data/vendor/ring/LICENSE +185 -0
  13. data/vendor/ring/Makefile +35 -0
  14. data/vendor/ring/PORTING.md +163 -0
  15. data/vendor/ring/README.md +113 -0
  16. data/vendor/ring/STYLE.md +197 -0
  17. data/vendor/ring/appveyor.yml +27 -0
  18. data/vendor/ring/build.rs +108 -0
  19. data/vendor/ring/crypto/aes/aes.c +1142 -0
  20. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +25 -0
  21. data/vendor/ring/crypto/aes/aes_test.cc +93 -0
  22. data/vendor/ring/crypto/aes/asm/aes-586.pl +2368 -0
  23. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +1249 -0
  24. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +2246 -0
  25. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +1318 -0
  26. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +2084 -0
  27. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +675 -0
  28. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +1364 -0
  29. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +1565 -0
  30. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +841 -0
  31. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +1116 -0
  32. data/vendor/ring/crypto/aes/internal.h +87 -0
  33. data/vendor/ring/crypto/aes/mode_wrappers.c +61 -0
  34. data/vendor/ring/crypto/bn/add.c +394 -0
  35. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +694 -0
  36. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +1503 -0
  37. data/vendor/ring/crypto/bn/asm/bn-586.pl +774 -0
  38. data/vendor/ring/crypto/bn/asm/co-586.pl +287 -0
  39. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +1882 -0
  40. data/vendor/ring/crypto/bn/asm/x86-mont.pl +592 -0
  41. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +599 -0
  42. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +1393 -0
  43. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +3507 -0
  44. data/vendor/ring/crypto/bn/bn.c +352 -0
  45. data/vendor/ring/crypto/bn/bn_asn1.c +74 -0
  46. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +25 -0
  47. data/vendor/ring/crypto/bn/bn_test.cc +1696 -0
  48. data/vendor/ring/crypto/bn/cmp.c +200 -0
  49. data/vendor/ring/crypto/bn/convert.c +433 -0
  50. data/vendor/ring/crypto/bn/ctx.c +311 -0
  51. data/vendor/ring/crypto/bn/div.c +594 -0
  52. data/vendor/ring/crypto/bn/exponentiation.c +1335 -0
  53. data/vendor/ring/crypto/bn/gcd.c +711 -0
  54. data/vendor/ring/crypto/bn/generic.c +1019 -0
  55. data/vendor/ring/crypto/bn/internal.h +316 -0
  56. data/vendor/ring/crypto/bn/montgomery.c +516 -0
  57. data/vendor/ring/crypto/bn/mul.c +888 -0
  58. data/vendor/ring/crypto/bn/prime.c +829 -0
  59. data/vendor/ring/crypto/bn/random.c +334 -0
  60. data/vendor/ring/crypto/bn/rsaz_exp.c +262 -0
  61. data/vendor/ring/crypto/bn/rsaz_exp.h +53 -0
  62. data/vendor/ring/crypto/bn/shift.c +276 -0
  63. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +25 -0
  64. data/vendor/ring/crypto/bytestring/bytestring_test.cc +421 -0
  65. data/vendor/ring/crypto/bytestring/cbb.c +399 -0
  66. data/vendor/ring/crypto/bytestring/cbs.c +227 -0
  67. data/vendor/ring/crypto/bytestring/internal.h +46 -0
  68. data/vendor/ring/crypto/chacha/chacha_generic.c +140 -0
  69. data/vendor/ring/crypto/chacha/chacha_vec.c +323 -0
  70. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +1447 -0
  71. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +153 -0
  72. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +25 -0
  73. data/vendor/ring/crypto/cipher/e_aes.c +390 -0
  74. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +208 -0
  75. data/vendor/ring/crypto/cipher/internal.h +173 -0
  76. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +543 -0
  77. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +9 -0
  78. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +475 -0
  79. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +23 -0
  80. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +422 -0
  81. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +484 -0
  82. data/vendor/ring/crypto/cipher/test/cipher_test.txt +100 -0
  83. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +25 -0
  84. data/vendor/ring/crypto/constant_time_test.c +304 -0
  85. data/vendor/ring/crypto/cpu-arm-asm.S +32 -0
  86. data/vendor/ring/crypto/cpu-arm.c +199 -0
  87. data/vendor/ring/crypto/cpu-intel.c +261 -0
  88. data/vendor/ring/crypto/crypto.c +151 -0
  89. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +2118 -0
  90. data/vendor/ring/crypto/curve25519/curve25519.c +4888 -0
  91. data/vendor/ring/crypto/curve25519/x25519_test.cc +128 -0
  92. data/vendor/ring/crypto/digest/md32_common.h +181 -0
  93. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +2725 -0
  94. data/vendor/ring/crypto/ec/ec.c +193 -0
  95. data/vendor/ring/crypto/ec/ec_curves.c +61 -0
  96. data/vendor/ring/crypto/ec/ec_key.c +228 -0
  97. data/vendor/ring/crypto/ec/ec_montgomery.c +114 -0
  98. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +25 -0
  99. data/vendor/ring/crypto/ec/internal.h +243 -0
  100. data/vendor/ring/crypto/ec/oct.c +253 -0
  101. data/vendor/ring/crypto/ec/p256-64.c +1794 -0
  102. data/vendor/ring/crypto/ec/p256-x86_64-table.h +9548 -0
  103. data/vendor/ring/crypto/ec/p256-x86_64.c +509 -0
  104. data/vendor/ring/crypto/ec/simple.c +1007 -0
  105. data/vendor/ring/crypto/ec/util-64.c +183 -0
  106. data/vendor/ring/crypto/ec/wnaf.c +508 -0
  107. data/vendor/ring/crypto/ecdh/ecdh.c +155 -0
  108. data/vendor/ring/crypto/ecdsa/ecdsa.c +304 -0
  109. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +193 -0
  110. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +25 -0
  111. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +327 -0
  112. data/vendor/ring/crypto/header_removed.h +17 -0
  113. data/vendor/ring/crypto/internal.h +495 -0
  114. data/vendor/ring/crypto/libring.Windows.vcxproj +101 -0
  115. data/vendor/ring/crypto/mem.c +98 -0
  116. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +1045 -0
  117. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +517 -0
  118. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +1393 -0
  119. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +1741 -0
  120. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +422 -0
  121. data/vendor/ring/crypto/modes/ctr.c +226 -0
  122. data/vendor/ring/crypto/modes/gcm.c +1206 -0
  123. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +25 -0
  124. data/vendor/ring/crypto/modes/gcm_test.c +348 -0
  125. data/vendor/ring/crypto/modes/internal.h +299 -0
  126. data/vendor/ring/crypto/perlasm/arm-xlate.pl +170 -0
  127. data/vendor/ring/crypto/perlasm/readme +100 -0
  128. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +1164 -0
  129. data/vendor/ring/crypto/perlasm/x86asm.pl +292 -0
  130. data/vendor/ring/crypto/perlasm/x86gas.pl +263 -0
  131. data/vendor/ring/crypto/perlasm/x86masm.pl +200 -0
  132. data/vendor/ring/crypto/perlasm/x86nasm.pl +187 -0
  133. data/vendor/ring/crypto/poly1305/poly1305.c +331 -0
  134. data/vendor/ring/crypto/poly1305/poly1305_arm.c +301 -0
  135. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +2015 -0
  136. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +25 -0
  137. data/vendor/ring/crypto/poly1305/poly1305_test.cc +80 -0
  138. data/vendor/ring/crypto/poly1305/poly1305_test.txt +52 -0
  139. data/vendor/ring/crypto/poly1305/poly1305_vec.c +892 -0
  140. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +75 -0
  141. data/vendor/ring/crypto/rand/internal.h +32 -0
  142. data/vendor/ring/crypto/rand/rand.c +189 -0
  143. data/vendor/ring/crypto/rand/urandom.c +219 -0
  144. data/vendor/ring/crypto/rand/windows.c +56 -0
  145. data/vendor/ring/crypto/refcount_c11.c +66 -0
  146. data/vendor/ring/crypto/refcount_lock.c +53 -0
  147. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +25 -0
  148. data/vendor/ring/crypto/refcount_test.c +58 -0
  149. data/vendor/ring/crypto/rsa/blinding.c +462 -0
  150. data/vendor/ring/crypto/rsa/internal.h +108 -0
  151. data/vendor/ring/crypto/rsa/padding.c +300 -0
  152. data/vendor/ring/crypto/rsa/rsa.c +450 -0
  153. data/vendor/ring/crypto/rsa/rsa_asn1.c +261 -0
  154. data/vendor/ring/crypto/rsa/rsa_impl.c +944 -0
  155. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +25 -0
  156. data/vendor/ring/crypto/rsa/rsa_test.cc +437 -0
  157. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +436 -0
  158. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +2390 -0
  159. data/vendor/ring/crypto/sha/asm/sha256-586.pl +1275 -0
  160. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +735 -0
  161. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +14 -0
  162. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +14 -0
  163. data/vendor/ring/crypto/sha/asm/sha512-586.pl +911 -0
  164. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +666 -0
  165. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +14 -0
  166. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +14 -0
  167. data/vendor/ring/crypto/sha/sha1.c +271 -0
  168. data/vendor/ring/crypto/sha/sha256.c +204 -0
  169. data/vendor/ring/crypto/sha/sha512.c +355 -0
  170. data/vendor/ring/crypto/test/file_test.cc +326 -0
  171. data/vendor/ring/crypto/test/file_test.h +181 -0
  172. data/vendor/ring/crypto/test/malloc.cc +150 -0
  173. data/vendor/ring/crypto/test/scoped_types.h +95 -0
  174. data/vendor/ring/crypto/test/test.Windows.vcxproj +35 -0
  175. data/vendor/ring/crypto/test/test_util.cc +46 -0
  176. data/vendor/ring/crypto/test/test_util.h +41 -0
  177. data/vendor/ring/crypto/thread_none.c +55 -0
  178. data/vendor/ring/crypto/thread_pthread.c +165 -0
  179. data/vendor/ring/crypto/thread_test.Windows.vcxproj +25 -0
  180. data/vendor/ring/crypto/thread_test.c +200 -0
  181. data/vendor/ring/crypto/thread_win.c +282 -0
  182. data/vendor/ring/examples/checkdigest.rs +103 -0
  183. data/vendor/ring/include/openssl/aes.h +121 -0
  184. data/vendor/ring/include/openssl/arm_arch.h +129 -0
  185. data/vendor/ring/include/openssl/base.h +156 -0
  186. data/vendor/ring/include/openssl/bn.h +794 -0
  187. data/vendor/ring/include/openssl/buffer.h +18 -0
  188. data/vendor/ring/include/openssl/bytestring.h +235 -0
  189. data/vendor/ring/include/openssl/chacha.h +37 -0
  190. data/vendor/ring/include/openssl/cmac.h +76 -0
  191. data/vendor/ring/include/openssl/cpu.h +184 -0
  192. data/vendor/ring/include/openssl/crypto.h +43 -0
  193. data/vendor/ring/include/openssl/curve25519.h +88 -0
  194. data/vendor/ring/include/openssl/ec.h +225 -0
  195. data/vendor/ring/include/openssl/ec_key.h +129 -0
  196. data/vendor/ring/include/openssl/ecdh.h +110 -0
  197. data/vendor/ring/include/openssl/ecdsa.h +156 -0
  198. data/vendor/ring/include/openssl/err.h +201 -0
  199. data/vendor/ring/include/openssl/mem.h +101 -0
  200. data/vendor/ring/include/openssl/obj_mac.h +71 -0
  201. data/vendor/ring/include/openssl/opensslfeatures.h +68 -0
  202. data/vendor/ring/include/openssl/opensslv.h +18 -0
  203. data/vendor/ring/include/openssl/ossl_typ.h +18 -0
  204. data/vendor/ring/include/openssl/poly1305.h +51 -0
  205. data/vendor/ring/include/openssl/rand.h +70 -0
  206. data/vendor/ring/include/openssl/rsa.h +399 -0
  207. data/vendor/ring/include/openssl/thread.h +133 -0
  208. data/vendor/ring/include/openssl/type_check.h +71 -0
  209. data/vendor/ring/mk/Common.props +63 -0
  210. data/vendor/ring/mk/Windows.props +42 -0
  211. data/vendor/ring/mk/WindowsTest.props +18 -0
  212. data/vendor/ring/mk/appveyor.bat +62 -0
  213. data/vendor/ring/mk/bottom_of_makefile.mk +54 -0
  214. data/vendor/ring/mk/ring.mk +266 -0
  215. data/vendor/ring/mk/top_of_makefile.mk +214 -0
  216. data/vendor/ring/mk/travis.sh +40 -0
  217. data/vendor/ring/mk/update-travis-yml.py +229 -0
  218. data/vendor/ring/ring.sln +153 -0
  219. data/vendor/ring/src/aead.rs +682 -0
  220. data/vendor/ring/src/agreement.rs +248 -0
  221. data/vendor/ring/src/c.rs +129 -0
  222. data/vendor/ring/src/constant_time.rs +37 -0
  223. data/vendor/ring/src/der.rs +96 -0
  224. data/vendor/ring/src/digest.rs +690 -0
  225. data/vendor/ring/src/digest_tests.txt +57 -0
  226. data/vendor/ring/src/ecc.rs +28 -0
  227. data/vendor/ring/src/ecc_build.rs +279 -0
  228. data/vendor/ring/src/ecc_curves.rs +117 -0
  229. data/vendor/ring/src/ed25519_tests.txt +2579 -0
  230. data/vendor/ring/src/exe_tests.rs +46 -0
  231. data/vendor/ring/src/ffi.rs +29 -0
  232. data/vendor/ring/src/file_test.rs +187 -0
  233. data/vendor/ring/src/hkdf.rs +153 -0
  234. data/vendor/ring/src/hkdf_tests.txt +59 -0
  235. data/vendor/ring/src/hmac.rs +414 -0
  236. data/vendor/ring/src/hmac_tests.txt +97 -0
  237. data/vendor/ring/src/input.rs +312 -0
  238. data/vendor/ring/src/lib.rs +41 -0
  239. data/vendor/ring/src/pbkdf2.rs +265 -0
  240. data/vendor/ring/src/pbkdf2_tests.txt +113 -0
  241. data/vendor/ring/src/polyfill.rs +57 -0
  242. data/vendor/ring/src/rand.rs +28 -0
  243. data/vendor/ring/src/signature.rs +314 -0
  244. data/vendor/ring/third-party/NIST/README.md +9 -0
  245. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +263 -0
  246. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +309 -0
  247. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +267 -0
  248. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +263 -0
  249. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +309 -0
  250. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +267 -0
  251. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +263 -0
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +309 -0
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +267 -0
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +519 -0
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +309 -0
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +523 -0
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +519 -0
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +309 -0
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +523 -0
  260. data/vendor/ring/third-party/NIST/sha256sums.txt +1 -0
  261. metadata +333 -0
data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl
@@ -0,0 +1,1741 @@
+ #!/usr/bin/env perl
+ #
+ # ====================================================================
+ # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+ # project. The module is, however, dual licensed under OpenSSL and
+ # CRYPTOGAMS licenses depending on where you obtain it. For further
+ # details see http://www.openssl.org/~appro/cryptogams/.
+ # ====================================================================
+ #
+ # March, June 2010
+ #
+ # The module implements "4-bit" GCM GHASH function and underlying
+ # single multiplication operation in GF(2^128). "4-bit" means that
+ # it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
+ # function features so called "528B" variant utilizing additional
+ # 256+16 bytes of per-key storage [+512 bytes shared table].
+ # Performance results are for this streamed GHASH subroutine and are
+ # expressed in cycles per processed byte, less is better:
+ #
+ # gcc 3.4.x(*) assembler
+ #
+ # P4 28.6 14.0 +100%
+ # Opteron 19.3 7.7 +150%
+ # Core2 17.8 8.1(**) +120%
+ # Atom 31.6 16.8 +88%
+ # VIA Nano 21.8 10.1 +115%
+ #
+ # (*) comparison is not completely fair, because C results are
+ # for vanilla "256B" implementation, while assembler results
+ # are for "528B";-)
+ # (**) it's mystery [to me] why Core2 result is not same as for
+ # Opteron;
+
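For orientation before the code: the underlying operation is multiplication in GF(2^128) as specified for GCM. A minimal bitwise reference in Perl (a sketch using Math::BigInt; gf128_mul is a hypothetical helper, not part of this package, and the module below replaces this 128-iteration loop with 4-bit table lookups):

    use strict;
    use warnings;
    use Math::BigInt;

    # GCM reduction constant: 0xE1 followed by 120 zero bits
    # (the GHASH polynomial in its bit-reflected representation).
    my $R = Math::BigInt->from_hex("e1000000000000000000000000000000");

    sub gf128_mul {            # Z = X*Y in GF(2^128), per the GCM spec
        my ($x, $y) = @_;      # 128-bit Math::BigInt values
        my $z = Math::BigInt->new(0);
        my $v = $x->copy;
        for my $i (0 .. 127) {
            # accumulate V whenever bit i of Y (MSB first) is set
            $z->bxor($v) if $y->copy->brsft(127 - $i)->band(1)->is_one;
            my $lsb = $v->copy->band(1)->is_one;
            $v->brsft(1);          # V >>= 1 ...
            $v->bxor($R) if $lsb;  # ... folding the ejected bit via R
        }
        return $z;
    }

The "4-bit" strategy amortizes this loop: precompute i*H for every nibble value i in a 256-byte per-key table, then consume the input four bits at a time, which is what the Htbl/rem_4bit machinery below implements.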
+ # May 2010
+ #
+ # Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
+ # See ghash-x86.pl for background information and details about coding
+ # techniques.
+ #
+ # Special thanks to David Woodhouse <dwmw2@infradead.org> for
+ # providing access to a Westmere-based system on behalf of Intel
+ # Open Source Technology Centre.
+
+ # December 2012
+ #
+ # Overhaul: aggregate Karatsuba post-processing, improve ILP in
+ # reduction_alg9, increase reduction aggregate factor to 4x. As for
+ # the latter. ghash-x86.pl discusses that it makes lesser sense to
+ # increase aggregate factor. Then why increase here? Critical path
+ # consists of 3 independent pclmulqdq instructions, Karatsuba post-
+ # processing and reduction. "On top" of this we lay down aggregated
+ # multiplication operations, triplets of independent pclmulqdq's. As
+ # issue rate for pclmulqdq is limited, it makes lesser sense to
+ # aggregate more multiplications than it takes to perform remaining
+ # non-multiplication operations. 2x is near-optimal coefficient for
+ # contemporary Intel CPUs (therefore modest improvement coefficient),
+ # but not for Bulldozer. Latter is because logical SIMD operations
+ # are twice as slow in comparison to Intel, so that critical path is
+ # longer. A CPU with higher pclmulqdq issue rate would also benefit
+ # from higher aggregate factor...
+ #
+ # Westmere 1.78(+13%)
+ # Sandy Bridge 1.80(+8%)
+ # Ivy Bridge 1.80(+7%)
+ # Haswell 0.55(+93%) (if system doesn't support AVX)
+ # Broadwell 0.45(+110%)(if system doesn't support AVX)
+ # Bulldozer 1.49(+27%)
+ # Silvermont 2.88(+13%)
+
+ # March 2013
+ #
+ # ... 8x aggregate factor AVX code path is using reduction algorithm
+ # suggested by Shay Gueron[1]. Even though contemporary AVX-capable
+ # CPUs such as Sandy and Ivy Bridge can execute it, the code performs
+ # sub-optimally in comparison to above mentioned version. But thanks
+ # to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
+ # it performs in 0.41 cycles per byte on Haswell processor, and in
+ # 0.29 on Broadwell.
+ #
+ # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
+
+ $flavour = shift;
+ $output = shift;
+ if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+ $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+ die "can't locate x86_64-xlate.pl";
+
+ # In upstream, this is controlled by shelling out to the compiler to check
+ # versions, but BoringSSL is intended to be used with pre-generated perlasm
+ # output, so this isn't useful anyway.
+ #
+ # TODO(davidben): Enable this after testing. $avx goes up to 2.
+ $avx = 0;
+
+ open OUT,"| \"$^X\" $xlate $flavour $output";
+ *STDOUT=*OUT;
+
+ $do4xaggr=1;
+
+ # common register layout
+ $nlo="%rax";
+ $nhi="%rbx";
+ $Zlo="%r8";
+ $Zhi="%r9";
+ $tmp="%r10";
+ $rem_4bit = "%r11";
+
+ $Xi="%rdi";
+ $Htbl="%rsi";
+
+ # per-function register layout
+ $cnt="%rcx";
+ $rem="%rdx";
+
+ sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or
+ $r =~ s/%[er]([sd]i)/%\1l/ or
+ $r =~ s/%[er](bp)/%\1l/ or
+ $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
+
+ sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
+ { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
+ my $arg = pop;
+ $arg = "\$$arg" if ($arg*1 eq $arg);
+ $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
+ }
+
+ { my $N;
+ sub loop() {
+ my $inp = shift;
+
+ $N++;
+ $code.=<<___;
+ xor $nlo,$nlo
+ xor $nhi,$nhi
+ mov `&LB("$Zlo")`,`&LB("$nlo")`
+ mov `&LB("$Zlo")`,`&LB("$nhi")`
+ shl \$4,`&LB("$nlo")`
+ mov \$14,$cnt
+ mov 8($Htbl,$nlo),$Zlo
+ mov ($Htbl,$nlo),$Zhi
+ and \$0xf0,`&LB("$nhi")`
+ mov $Zlo,$rem
+ jmp .Loop$N
+
+ .align 16
+ .Loop$N:
+ shr \$4,$Zlo
+ and \$0xf,$rem
+ mov $Zhi,$tmp
+ mov ($inp,$cnt),`&LB("$nlo")`
+ shr \$4,$Zhi
+ xor 8($Htbl,$nhi),$Zlo
+ shl \$60,$tmp
+ xor ($Htbl,$nhi),$Zhi
+ mov `&LB("$nlo")`,`&LB("$nhi")`
+ xor ($rem_4bit,$rem,8),$Zhi
+ mov $Zlo,$rem
+ shl \$4,`&LB("$nlo")`
+ xor $tmp,$Zlo
+ dec $cnt
+ js .Lbreak$N
+
+ shr \$4,$Zlo
+ and \$0xf,$rem
+ mov $Zhi,$tmp
+ shr \$4,$Zhi
+ xor 8($Htbl,$nlo),$Zlo
+ shl \$60,$tmp
+ xor ($Htbl,$nlo),$Zhi
+ and \$0xf0,`&LB("$nhi")`
+ xor ($rem_4bit,$rem,8),$Zhi
+ mov $Zlo,$rem
+ xor $tmp,$Zlo
+ jmp .Loop$N
+
+ .align 16
+ .Lbreak$N:
+ shr \$4,$Zlo
+ and \$0xf,$rem
+ mov $Zhi,$tmp
+ shr \$4,$Zhi
+ xor 8($Htbl,$nlo),$Zlo
+ shl \$60,$tmp
+ xor ($Htbl,$nlo),$Zhi
+ and \$0xf0,`&LB("$nhi")`
+ xor ($rem_4bit,$rem,8),$Zhi
+ mov $Zlo,$rem
+ xor $tmp,$Zlo
+
+ shr \$4,$Zlo
+ and \$0xf,$rem
+ mov $Zhi,$tmp
+ shr \$4,$Zhi
+ xor 8($Htbl,$nhi),$Zlo
+ shl \$60,$tmp
+ xor ($Htbl,$nhi),$Zhi
+ xor $tmp,$Zlo
+ xor ($rem_4bit,$rem,8),$Zhi
+
+ bswap $Zlo
+ bswap $Zhi
+ ___
+ }}
+
+ $code=<<___;
+ .text
+ .extern OPENSSL_ia32cap_P
+
+ .globl gcm_gmult_4bit
+ .type gcm_gmult_4bit,\@function,2
+ .align 16
+ gcm_gmult_4bit:
+ push %rbx
+ push %rbp # %rbp and %r12 are pushed exclusively in
+ push %r12 # order to reuse Win64 exception handler...
+ .Lgmult_prologue:
+
+ movzb 15($Xi),$Zlo
+ lea .Lrem_4bit(%rip),$rem_4bit
+ ___
+ &loop ($Xi);
+ $code.=<<___;
+ mov $Zlo,8($Xi)
+ mov $Zhi,($Xi)
+
+ mov 16(%rsp),%rbx
+ lea 24(%rsp),%rsp
+ .Lgmult_epilogue:
+ ret
+ .size gcm_gmult_4bit,.-gcm_gmult_4bit
+ ___
+
+ # per-function register layout
+ $inp="%rdx";
+ $len="%rcx";
+ $rem_8bit=$rem_4bit;
+
+ $code.=<<___;
+ .globl gcm_ghash_4bit
+ .type gcm_ghash_4bit,\@function,4
+ .align 16
+ gcm_ghash_4bit:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ sub \$280,%rsp
+ .Lghash_prologue:
+ mov $inp,%r14 # reassign couple of args
+ mov $len,%r15
+ ___
+ { my $inp="%r14";
+ my $dat="%edx";
+ my $len="%r15";
+ my @nhi=("%ebx","%ecx");
+ my @rem=("%r12","%r13");
+ my $Hshr4="%rbp";
+
+ &sub ($Htbl,-128); # size optimization
+ &lea ($Hshr4,"16+128(%rsp)");
+ { my @lo =($nlo,$nhi);
+ my @hi =($Zlo,$Zhi);
+
+ &xor ($dat,$dat);
+ for ($i=0,$j=-2;$i<18;$i++,$j++) {
+ &mov ("$j(%rsp)",&LB($dat)) if ($i>1);
+ &or ($lo[0],$tmp) if ($i>1);
+ &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17);
+ &shr ($lo[1],4) if ($i>0 && $i<17);
+ &mov ($tmp,$hi[1]) if ($i>0 && $i<17);
+ &shr ($hi[1],4) if ($i>0 && $i<17);
+ &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1);
+ &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16);
+ &shl (&LB($dat),4) if ($i>0 && $i<17);
+ &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1);
+ &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16);
+ &shl ($tmp,60) if ($i>0 && $i<17);
+
+ push (@lo,shift(@lo));
+ push (@hi,shift(@hi));
+ }
+ }
+ &add ($Htbl,-128);
+ &mov ($Zlo,"8($Xi)");
+ &mov ($Zhi,"0($Xi)");
+ &add ($len,$inp); # pointer to the end of data
+ &lea ($rem_8bit,".Lrem_8bit(%rip)");
+ &jmp (".Louter_loop");
+
+ $code.=".align 16\n.Louter_loop:\n";
+ &xor ($Zhi,"($inp)");
+ &mov ("%rdx","8($inp)");
+ &lea ($inp,"16($inp)");
+ &xor ("%rdx",$Zlo);
+ &mov ("($Xi)",$Zhi);
+ &mov ("8($Xi)","%rdx");
+ &shr ("%rdx",32);
+
+ &xor ($nlo,$nlo);
+ &rol ($dat,8);
+ &mov (&LB($nlo),&LB($dat));
+ &movz ($nhi[0],&LB($dat));
+ &shl (&LB($nlo),4);
+ &shr ($nhi[0],4);
+
+ for ($j=11,$i=0;$i<15;$i++) {
+ &rol ($dat,8);
+ &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0);
+ &xor ($Zhi,"($Htbl,$nlo)") if ($i>0);
+ &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0);
+ &mov ($Zhi,"($Htbl,$nlo)") if ($i==0);
+
+ &mov (&LB($nlo),&LB($dat));
+ &xor ($Zlo,$tmp) if ($i>0);
+ &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0);
+
+ &movz ($nhi[1],&LB($dat));
+ &shl (&LB($nlo),4);
+ &movzb ($rem[0],"(%rsp,$nhi[0])");
+
+ &shr ($nhi[1],4) if ($i<14);
+ &and ($nhi[1],0xf0) if ($i==14);
+ &shl ($rem[1],48) if ($i>0);
+ &xor ($rem[0],$Zlo);
+
+ &mov ($tmp,$Zhi);
+ &xor ($Zhi,$rem[1]) if ($i>0);
+ &shr ($Zlo,8);
+
+ &movz ($rem[0],&LB($rem[0]));
+ &mov ($dat,"$j($Xi)") if (--$j%4==0);
+ &shr ($Zhi,8);
+
+ &xor ($Zlo,"-128($Hshr4,$nhi[0],8)");
+ &shl ($tmp,56);
+ &xor ($Zhi,"($Hshr4,$nhi[0],8)");
+
+ unshift (@nhi,pop(@nhi)); # "rotate" registers
+ unshift (@rem,pop(@rem));
+ }
+ &movzw ($rem[1],"($rem_8bit,$rem[1],2)");
+ &xor ($Zlo,"8($Htbl,$nlo)");
+ &xor ($Zhi,"($Htbl,$nlo)");
+
+ &shl ($rem[1],48);
+ &xor ($Zlo,$tmp);
+
+ &xor ($Zhi,$rem[1]);
+ &movz ($rem[0],&LB($Zlo));
+ &shr ($Zlo,4);
+
+ &mov ($tmp,$Zhi);
+ &shl (&LB($rem[0]),4);
+ &shr ($Zhi,4);
+
+ &xor ($Zlo,"8($Htbl,$nhi[0])");
+ &movzw ($rem[0],"($rem_8bit,$rem[0],2)");
+ &shl ($tmp,60);
+
+ &xor ($Zhi,"($Htbl,$nhi[0])");
+ &xor ($Zlo,$tmp);
+ &shl ($rem[0],48);
+
+ &bswap ($Zlo);
+ &xor ($Zhi,$rem[0]);
+
+ &bswap ($Zhi);
+ &cmp ($inp,$len);
+ &jb (".Louter_loop");
+ }
+ $code.=<<___;
+ mov $Zlo,8($Xi)
+ mov $Zhi,($Xi)
+
+ lea 280(%rsp),%rsi
+ mov 0(%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+ .Lghash_epilogue:
+ ret
+ .size gcm_ghash_4bit,.-gcm_ghash_4bit
+ ___
+
+ ######################################################################
+ # PCLMULQDQ version.
+
+ @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+
+ ($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
+ ($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
+
+ sub clmul64x64_T2 { # minimal register pressure
+ my ($Xhi,$Xi,$Hkey,$HK)=@_;
+
+ if (!defined($HK)) { $HK = $T2;
+ $code.=<<___;
+ movdqa $Xi,$Xhi #
+ pshufd \$0b01001110,$Xi,$T1
+ pshufd \$0b01001110,$Hkey,$T2
+ pxor $Xi,$T1 #
+ pxor $Hkey,$T2
+ ___
+ } else {
+ $code.=<<___;
+ movdqa $Xi,$Xhi #
+ pshufd \$0b01001110,$Xi,$T1
+ pxor $Xi,$T1 #
+ ___
+ }
+ $code.=<<___;
+ pclmulqdq \$0x00,$Hkey,$Xi #######
+ pclmulqdq \$0x11,$Hkey,$Xhi #######
+ pclmulqdq \$0x00,$HK,$T1 #######
+ pxor $Xi,$T1 #
+ pxor $Xhi,$T1 #
+
+ movdqa $T1,$T2 #
+ psrldq \$8,$T1
+ pslldq \$8,$T2 #
+ pxor $T1,$Xhi
+ pxor $T2,$Xi #
+ ___
+ }
+
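clmul64x64_T2 above is Karatsuba in miniature: a 128x128-bit carry-less product from three pclmulqdq instructions instead of four, followed by the pslldq/psrldq recombination. The same arithmetic as a Math::BigInt sketch (clmul64 and clmul128 are hypothetical reference helpers, with clmul64 standing in for one pclmulqdq):

    sub clmul64 {                  # carry-less 64x64 -> 128-bit multiply
        my ($a, $b) = @_;
        my $r = Math::BigInt->new(0);
        for my $i (0 .. 63) {
            $r->bxor($b->copy->blsft($i))
                if $a->copy->brsft($i)->band(1)->is_one;
        }
        return $r;
    }

    sub clmul128 {                 # Karatsuba: three multiplies, not four
        my ($a, $b) = @_;
        my $mask = Math::BigInt->new(1)->blsft(64)->bsub(1);
        my ($a0, $a1) = ($a->copy->band($mask), $a->copy->brsft(64));
        my ($b0, $b1) = ($b->copy->band($mask), $b->copy->brsft(64));
        my $lo  = clmul64($a0, $b0);         # pclmulqdq 0x00
        my $hi  = clmul64($a1, $b1);         # pclmulqdq 0x11
        my $mid = clmul64($a0->bxor($a1),    # pclmulqdq 0x00 on the
                          $b0->bxor($b1));   #  pre-xored halves ($T1,$HK)
        $mid->bxor($lo)->bxor($hi);          # Karatsuba post-processing
        return $hi->blsft(64)->bxor($mid)->blsft(64)->bxor($lo);
    }

The final shift-and-xor line is the scalar analogue of the pslldq \$8 / psrldq \$8 pair that splits the middle term across $Xi and $Xhi.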
+ sub reduction_alg9 { # 17/11 times faster than Intel version
+ my ($Xhi,$Xi) = @_;
+
+ $code.=<<___;
+ # 1st phase
+ movdqa $Xi,$T2 #
+ movdqa $Xi,$T1
+ psllq \$5,$Xi
+ pxor $Xi,$T1 #
+ psllq \$1,$Xi
+ pxor $T1,$Xi #
+ psllq \$57,$Xi #
+ movdqa $Xi,$T1 #
+ pslldq \$8,$Xi
+ psrldq \$8,$T1 #
+ pxor $T2,$Xi
+ pxor $T1,$Xhi #
+
+ # 2nd phase
+ movdqa $Xi,$T2
+ psrlq \$1,$Xi
+ pxor $T2,$Xhi #
+ pxor $Xi,$T2
+ psrlq \$5,$Xi
+ pxor $T2,$Xi #
+ psrlq \$1,$Xi #
+ pxor $Xhi,$Xi #
+ ___
+ }
+
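reduction_alg9 folds the 256-bit carry-less product back into 128 bits using only shifts and xors. In the plain (non-reflected) polynomial convention the same step is the sketch below; the assembly works in GCM's bit-reflected domain, which is why its shift constants look different (gf128_reduce is a hypothetical reference helper):

    sub gf128_reduce {       # reduce a <=255-bit carry-less product
        my ($z) = @_;        # modulo p(x) = x^128 + x^7 + x^2 + x + 1
        for (my $i = 254; $i >= 128; $i--) {
            next unless $z->copy->brsft($i)->band(1)->is_one;
            $z->bxor(Math::BigInt->new(1)->blsft($i));          # clear x^i
            $z->bxor(Math::BigInt->new(0x87)->blsft($i - 128)); # x^128 == 0x87
        }
        return $z;
    }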
+ { my ($Htbl,$Xip)=@_4args;
+ my $HK="%xmm6";
+
+ $code.=<<___;
+ .globl gcm_init_clmul
+ .type gcm_init_clmul,\@abi-omnipotent
+ .align 16
+ gcm_init_clmul:
+ .L_init_clmul:
+ ___
+ $code.=<<___ if ($win64);
+ .LSEH_begin_gcm_init_clmul:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
+ .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
+ ___
+ $code.=<<___;
+ movdqu ($Xip),$Hkey
+ pshufd \$0b01001110,$Hkey,$Hkey # dword swap
+
+ # <<1 twist
+ pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
+ movdqa $Hkey,$T1
+ psllq \$1,$Hkey
+ pxor $T3,$T3 #
+ psrlq \$63,$T1
+ pcmpgtd $T2,$T3 # broadcast carry bit
+ pslldq \$8,$T1
+ por $T1,$Hkey # H<<=1
+
+ # magic reduction
+ pand .L0x1c2_polynomial(%rip),$T3
+ pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
+
+ # calculate H^2
+ pshufd \$0b01001110,$Hkey,$HK
+ movdqa $Hkey,$Xi
+ pxor $Hkey,$HK
+ ___
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK);
+ &reduction_alg9 ($Xhi,$Xi);
+ $code.=<<___;
+ pshufd \$0b01001110,$Hkey,$T1
+ pshufd \$0b01001110,$Xi,$T2
+ pxor $Hkey,$T1 # Karatsuba pre-processing
+ movdqu $Hkey,0x00($Htbl) # save H
+ pxor $Xi,$T2 # Karatsuba pre-processing
+ movdqu $Xi,0x10($Htbl) # save H^2
+ palignr \$8,$T1,$T2 # low part is H.lo^H.hi...
+ movdqu $T2,0x20($Htbl) # save Karatsuba "salt"
+ ___
+ if ($do4xaggr) {
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3
+ &reduction_alg9 ($Xhi,$Xi);
+ $code.=<<___;
+ movdqa $Xi,$T3
+ ___
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4
+ &reduction_alg9 ($Xhi,$Xi);
+ $code.=<<___;
+ pshufd \$0b01001110,$T3,$T1
+ pshufd \$0b01001110,$Xi,$T2
+ pxor $T3,$T1 # Karatsuba pre-processing
+ movdqu $T3,0x30($Htbl) # save H^3
+ pxor $Xi,$T2 # Karatsuba pre-processing
+ movdqu $Xi,0x40($Htbl) # save H^4
+ palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi...
+ movdqu $T2,0x50($Htbl) # save Karatsuba "salt"
+ ___
+ }
+ $code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ lea 0x18(%rsp),%rsp
+ .LSEH_end_gcm_init_clmul:
+ ___
+ $code.=<<___;
+ ret
+ .size gcm_init_clmul,.-gcm_init_clmul
+ ___
+ }
+
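The block above lays out $Htbl as H, H^2 and their Karatsuba "salt" (the pre-xored halves) at offsets 0x00/0x10/0x20, then H^3, H^4 and a second salt at 0x30/0x40/0x50 when $do4xaggr is set. The powers themselves are plain GF(2^128) products; in terms of the hypothetical gf128_mul sketched earlier:

    my @Hpow = ($H);    # assumes $H holds the hash key as a Math::BigInt
    push @Hpow, gf128_mul($Hpow[-1], $H) for 2 .. 4;   # H^2, H^3, H^4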
+ { my ($Xip,$Htbl)=@_4args;
+
+ $code.=<<___;
+ .globl gcm_gmult_clmul
+ .type gcm_gmult_clmul,\@abi-omnipotent
+ .align 16
+ gcm_gmult_clmul:
+ .L_gmult_clmul:
+ movdqu ($Xip),$Xi
+ movdqa .Lbswap_mask(%rip),$T3
+ movdqu ($Htbl),$Hkey
+ movdqu 0x20($Htbl),$T2
+ pshufb $T3,$Xi
+ ___
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
+ $code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
+ # experimental alternative. special thing about is that there
+ # no dependency between the two multiplications...
+ mov \$`0xE1<<1`,%eax
+ mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
+ mov \$0x07,%r11d
+ movq %rax,$T1
+ movq %r10,$T2
+ movq %r11,$T3 # borrow $T3
+ pand $Xi,$T3
+ pshufb $T3,$T2 # ($Xi&7)·0xE0
+ movq %rax,$T3
+ pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1)
+ pxor $Xi,$T2
+ pslldq \$15,$T2
+ paddd $T2,$T2 # <<(64+56+1)
+ pxor $T2,$Xi
+ pclmulqdq \$0x01,$T3,$Xi
+ movdqa .Lbswap_mask(%rip),$T3 # reload $T3
+ psrldq \$1,$T1
+ pxor $T1,$Xhi
+ pslldq \$7,$Xi
+ pxor $Xhi,$Xi
+ ___
+ $code.=<<___;
+ pshufb $T3,$Xi
+ movdqu $Xi,($Xip)
+ ret
+ .size gcm_gmult_clmul,.-gcm_gmult_clmul
+ ___
+ }
+
+ { my ($Xip,$Htbl,$inp,$len)=@_4args;
+ my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
+ my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
+
+ $code.=<<___;
+ .globl gcm_ghash_clmul
+ .type gcm_ghash_clmul,\@abi-omnipotent
+ .align 32
+ gcm_ghash_clmul:
+ .L_ghash_clmul:
+ ___
+ $code.=<<___ if ($win64);
+ lea -0x88(%rsp),%rax
+ .LSEH_begin_gcm_ghash_clmul:
+ # I can't trust assembler to use specific encoding:-(
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
+ .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
+ .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
+ .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
+ .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
+ .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
+ .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
+ .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
+ .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
+ ___
+ $code.=<<___;
+ movdqa .Lbswap_mask(%rip),$T3
+
+ movdqu ($Xip),$Xi
+ movdqu ($Htbl),$Hkey
+ movdqu 0x20($Htbl),$HK
+ pshufb $T3,$Xi
+
+ sub \$0x10,$len
+ jz .Lodd_tail
+
+ movdqu 0x10($Htbl),$Hkey2
+ ___
+ if ($do4xaggr) {
+ my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
+
+ $code.=<<___;
+ mov OPENSSL_ia32cap_P+4(%rip),%eax
+ cmp \$0x30,$len
+ jb .Lskip4x
+
+ and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE
+ cmp \$`1<<22`,%eax # check for MOVBE without XSAVE
+ je .Lskip4x
+
+ sub \$0x30,$len
+ mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
+ movdqu 0x30($Htbl),$Hkey3
+ movdqu 0x40($Htbl),$Hkey4
+
+ #######
+ # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
+ #
+ movdqu 0x30($inp),$Xln
+ movdqu 0x20($inp),$Xl
+ pshufb $T3,$Xln
+ pshufb $T3,$Xl
+ movdqa $Xln,$Xhn
+ pshufd \$0b01001110,$Xln,$Xmn
+ pxor $Xln,$Xmn
+ pclmulqdq \$0x00,$Hkey,$Xln
+ pclmulqdq \$0x11,$Hkey,$Xhn
+ pclmulqdq \$0x00,$HK,$Xmn
+
+ movdqa $Xl,$Xh
+ pshufd \$0b01001110,$Xl,$Xm
+ pxor $Xl,$Xm
+ pclmulqdq \$0x00,$Hkey2,$Xl
+ pclmulqdq \$0x11,$Hkey2,$Xh
+ pclmulqdq \$0x10,$HK,$Xm
+ xorps $Xl,$Xln
+ xorps $Xh,$Xhn
+ movups 0x50($Htbl),$HK
+ xorps $Xm,$Xmn
+
+ movdqu 0x10($inp),$Xl
+ movdqu 0($inp),$T1
+ pshufb $T3,$Xl
+ pshufb $T3,$T1
+ movdqa $Xl,$Xh
+ pshufd \$0b01001110,$Xl,$Xm
+ pxor $T1,$Xi
+ pxor $Xl,$Xm
+ pclmulqdq \$0x00,$Hkey3,$Xl
+ movdqa $Xi,$Xhi
+ pshufd \$0b01001110,$Xi,$T1
+ pxor $Xi,$T1
+ pclmulqdq \$0x11,$Hkey3,$Xh
+ pclmulqdq \$0x00,$HK,$Xm
+ xorps $Xl,$Xln
+ xorps $Xh,$Xhn
+
+ lea 0x40($inp),$inp
+ sub \$0x40,$len
+ jc .Ltail4x
+
+ jmp .Lmod4_loop
+ .align 32
+ .Lmod4_loop:
+ pclmulqdq \$0x00,$Hkey4,$Xi
+ xorps $Xm,$Xmn
+ movdqu 0x30($inp),$Xl
+ pshufb $T3,$Xl
+ pclmulqdq \$0x11,$Hkey4,$Xhi
+ xorps $Xln,$Xi
+ movdqu 0x20($inp),$Xln
+ movdqa $Xl,$Xh
+ pclmulqdq \$0x10,$HK,$T1
+ pshufd \$0b01001110,$Xl,$Xm
+ xorps $Xhn,$Xhi
+ pxor $Xl,$Xm
+ pshufb $T3,$Xln
+ movups 0x20($Htbl),$HK
+ xorps $Xmn,$T1
+ pclmulqdq \$0x00,$Hkey,$Xl
+ pshufd \$0b01001110,$Xln,$Xmn
+
+ pxor $Xi,$T1 # aggregated Karatsuba post-processing
+ movdqa $Xln,$Xhn
+ pxor $Xhi,$T1 #
+ pxor $Xln,$Xmn
+ movdqa $T1,$T2 #
+ pclmulqdq \$0x11,$Hkey,$Xh
+ pslldq \$8,$T1
+ psrldq \$8,$T2 #
+ pxor $T1,$Xi
+ movdqa .L7_mask(%rip),$T1
+ pxor $T2,$Xhi #
+ movq %rax,$T2
+
+ pand $Xi,$T1 # 1st phase
+ pshufb $T1,$T2 #
+ pxor $Xi,$T2 #
+ pclmulqdq \$0x00,$HK,$Xm
+ psllq \$57,$T2 #
+ movdqa $T2,$T1 #
+ pslldq \$8,$T2
+ pclmulqdq \$0x00,$Hkey2,$Xln
+ psrldq \$8,$T1 #
+ pxor $T2,$Xi
+ pxor $T1,$Xhi #
+ movdqu 0($inp),$T1
+
+ movdqa $Xi,$T2 # 2nd phase
+ psrlq \$1,$Xi
+ pclmulqdq \$0x11,$Hkey2,$Xhn
+ xorps $Xl,$Xln
+ movdqu 0x10($inp),$Xl
+ pshufb $T3,$Xl
+ pclmulqdq \$0x10,$HK,$Xmn
+ xorps $Xh,$Xhn
+ movups 0x50($Htbl),$HK
+ pshufb $T3,$T1
+ pxor $T2,$Xhi #
+ pxor $Xi,$T2
+ psrlq \$5,$Xi
+
+ movdqa $Xl,$Xh
+ pxor $Xm,$Xmn
+ pshufd \$0b01001110,$Xl,$Xm
+ pxor $T2,$Xi #
+ pxor $T1,$Xhi
+ pxor $Xl,$Xm
+ pclmulqdq \$0x00,$Hkey3,$Xl
+ psrlq \$1,$Xi #
+ pxor $Xhi,$Xi #
+ movdqa $Xi,$Xhi
+ pclmulqdq \$0x11,$Hkey3,$Xh
+ xorps $Xl,$Xln
+ pshufd \$0b01001110,$Xi,$T1
+ pxor $Xi,$T1
+
+ pclmulqdq \$0x00,$HK,$Xm
+ xorps $Xh,$Xhn
+
+ lea 0x40($inp),$inp
+ sub \$0x40,$len
+ jnc .Lmod4_loop
+
+ .Ltail4x:
+ pclmulqdq \$0x00,$Hkey4,$Xi
+ pclmulqdq \$0x11,$Hkey4,$Xhi
+ pclmulqdq \$0x10,$HK,$T1
+ xorps $Xm,$Xmn
+ xorps $Xln,$Xi
+ xorps $Xhn,$Xhi
+ pxor $Xi,$Xhi # aggregated Karatsuba post-processing
+ pxor $Xmn,$T1
+
+ pxor $Xhi,$T1 #
+ pxor $Xi,$Xhi
+
+ movdqa $T1,$T2 #
+ psrldq \$8,$T1
+ pslldq \$8,$T2 #
+ pxor $T1,$Xhi
+ pxor $T2,$Xi #
+ ___
+ &reduction_alg9($Xhi,$Xi);
+ $code.=<<___;
+ add \$0x40,$len
+ jz .Ldone
+ movdqu 0x20($Htbl),$HK
+ sub \$0x10,$len
+ jz .Lodd_tail
+ .Lskip4x:
+ ___
+ }
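The 4x path above implements the Xi+4 recurrence quoted in the comment before it: four multiplications against H^4..H, one shared reduction. In terms of the hypothetical gf128_mul reference from earlier (a sketch, not package code):

    sub ghash4 {    # Xi+4 = H*I3 + H^2*I2 + H^3*I1 + H^4*(I0+Xi), mod P
        my ($Xi, $H, $H2, $H3, $H4, @I) = @_;  # four 128-bit blocks in @I
        my $z = gf128_mul($Xi->copy->bxor($I[0]), $H4);
        $z->bxor(gf128_mul($I[1], $H3));
        $z->bxor(gf128_mul($I[2], $H2));
        $z->bxor(gf128_mul($I[3], $H));
        return $z;
    }

The code interleaves the four products and defers both the Karatsuba recombination and the reduction, which is the "aggregated Karatsuba post-processing" the December 2012 note describes.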
810
+ $code.=<<___;
811
+ #######
812
+ # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
813
+ # [(H*Ii+1) + (H*Xi+1)] mod P =
814
+ # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
815
+ #
816
+ movdqu ($inp),$T1 # Ii
817
+ movdqu 16($inp),$Xln # Ii+1
818
+ pshufb $T3,$T1
819
+ pshufb $T3,$Xln
820
+ pxor $T1,$Xi # Ii+Xi
821
+
822
+ movdqa $Xln,$Xhn
823
+ pshufd \$0b01001110,$Xln,$Xmn
824
+ pxor $Xln,$Xmn
825
+ pclmulqdq \$0x00,$Hkey,$Xln
826
+ pclmulqdq \$0x11,$Hkey,$Xhn
827
+ pclmulqdq \$0x00,$HK,$Xmn
828
+
829
+ lea 32($inp),$inp # i+=2
830
+ nop
831
+ sub \$0x20,$len
832
+ jbe .Leven_tail
833
+ nop
834
+ jmp .Lmod_loop
835
+
836
+ .align 32
837
+ .Lmod_loop:
838
+ movdqa $Xi,$Xhi
839
+ movdqa $Xmn,$T1
840
+ pshufd \$0b01001110,$Xi,$Xmn #
841
+ pxor $Xi,$Xmn #
842
+
843
+ pclmulqdq \$0x00,$Hkey2,$Xi
844
+ pclmulqdq \$0x11,$Hkey2,$Xhi
845
+ pclmulqdq \$0x10,$HK,$Xmn
846
+
847
+ pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
848
+ pxor $Xhn,$Xhi
849
+ movdqu ($inp),$T2 # Ii
850
+ pxor $Xi,$T1 # aggregated Karatsuba post-processing
851
+ pshufb $T3,$T2
852
+ movdqu 16($inp),$Xln # Ii+1
853
+
854
+ pxor $Xhi,$T1
855
+ pxor $T2,$Xhi # "Ii+Xi", consume early
856
+ pxor $T1,$Xmn
857
+ pshufb $T3,$Xln
858
+ movdqa $Xmn,$T1 #
859
+ psrldq \$8,$T1
860
+ pslldq \$8,$Xmn #
861
+ pxor $T1,$Xhi
862
+ pxor $Xmn,$Xi #
863
+
864
+ movdqa $Xln,$Xhn #
865
+
866
+ movdqa $Xi,$T2 # 1st phase
867
+ movdqa $Xi,$T1
868
+ psllq \$5,$Xi
869
+ pxor $Xi,$T1 #
870
+ pclmulqdq \$0x00,$Hkey,$Xln #######
871
+ psllq \$1,$Xi
872
+ pxor $T1,$Xi #
873
+ psllq \$57,$Xi #
874
+ movdqa $Xi,$T1 #
875
+ pslldq \$8,$Xi
876
+ psrldq \$8,$T1 #
877
+ pxor $T2,$Xi
878
+ pshufd \$0b01001110,$Xhn,$Xmn
879
+ pxor $T1,$Xhi #
880
+ pxor $Xhn,$Xmn #
881
+
882
+ movdqa $Xi,$T2 # 2nd phase
883
+ psrlq \$1,$Xi
884
+ pclmulqdq \$0x11,$Hkey,$Xhn #######
885
+ pxor $T2,$Xhi #
886
+ pxor $Xi,$T2
887
+ psrlq \$5,$Xi
888
+ pxor $T2,$Xi #
889
+ lea 32($inp),$inp
890
+ psrlq \$1,$Xi #
891
+ pclmulqdq \$0x00,$HK,$Xmn #######
892
+ pxor $Xhi,$Xi #
893
+
894
+ sub \$0x20,$len
895
+ ja .Lmod_loop
896
+
897
+ .Leven_tail:
898
+ movdqa $Xi,$Xhi
899
+ movdqa $Xmn,$T1
900
+ pshufd \$0b01001110,$Xi,$Xmn #
901
+ pxor $Xi,$Xmn #
902
+
903
+ pclmulqdq \$0x00,$Hkey2,$Xi
904
+ pclmulqdq \$0x11,$Hkey2,$Xhi
905
+ pclmulqdq \$0x10,$HK,$Xmn
906
+
907
+ pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
908
+ pxor $Xhn,$Xhi
909
+ pxor $Xi,$T1
910
+ pxor $Xhi,$T1
911
+ pxor $T1,$Xmn
912
+ movdqa $Xmn,$T1 #
913
+ psrldq \$8,$T1
914
+ pslldq \$8,$Xmn #
915
+ pxor $T1,$Xhi
916
+ pxor $Xmn,$Xi #
917
+ ___
918
+ &reduction_alg9 ($Xhi,$Xi);
919
+ $code.=<<___;
920
+ test $len,$len
921
+ jnz .Ldone
922
+
923
+ .Lodd_tail:
924
+ movdqu ($inp),$T1 # Ii
925
+ pshufb $T3,$T1
926
+ pxor $T1,$Xi # Ii+Xi
927
+ ___
928
+ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi)
929
+ &reduction_alg9 ($Xhi,$Xi);
930
+ $code.=<<___;
931
+ .Ldone:
932
+ pshufb $T3,$Xi
933
+ movdqu $Xi,($Xip)
934
+ ___
935
+ $code.=<<___ if ($win64);
936
+ movaps (%rsp),%xmm6
937
+ movaps 0x10(%rsp),%xmm7
938
+ movaps 0x20(%rsp),%xmm8
939
+ movaps 0x30(%rsp),%xmm9
940
+ movaps 0x40(%rsp),%xmm10
941
+ movaps 0x50(%rsp),%xmm11
942
+ movaps 0x60(%rsp),%xmm12
943
+ movaps 0x70(%rsp),%xmm13
944
+ movaps 0x80(%rsp),%xmm14
945
+ movaps 0x90(%rsp),%xmm15
946
+ lea 0xa8(%rsp),%rsp
947
+ .LSEH_end_gcm_ghash_clmul:
948
+ ___
949
+ $code.=<<___;
950
+ ret
951
+ .size gcm_ghash_clmul,.-gcm_ghash_clmul
952
+ ___
953
+ }
954
+
955
+ $code.=<<___;
956
+ .globl gcm_init_avx
957
+ .type gcm_init_avx,\@abi-omnipotent
958
+ .align 32
959
+ gcm_init_avx:
960
+ ___
961
+ if ($avx) {
962
+ my ($Htbl,$Xip)=@_4args;
963
+ my $HK="%xmm6";
964
+
965
+ $code.=<<___ if ($win64);
966
+ .LSEH_begin_gcm_init_avx:
967
+ # I can't trust assembler to use specific encoding:-(
968
+ .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
969
+ .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
970
+ ___
971
+ $code.=<<___;
972
+ vzeroupper
973
+
974
+ vmovdqu ($Xip),$Hkey
975
+ vpshufd \$0b01001110,$Hkey,$Hkey # dword swap
976
+
977
+ # <<1 twist
978
+ vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
979
+ vpsrlq \$63,$Hkey,$T1
980
+ vpsllq \$1,$Hkey,$Hkey
981
+ vpxor $T3,$T3,$T3 #
982
+ vpcmpgtd $T2,$T3,$T3 # broadcast carry bit
983
+ vpslldq \$8,$T1,$T1
984
+ vpor $T1,$Hkey,$Hkey # H<<=1
985
+
986
+ # magic reduction
987
+ vpand .L0x1c2_polynomial(%rip),$T3,$T3
988
+ vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial
989
+
990
+ vpunpckhqdq $Hkey,$Hkey,$HK
991
+ vmovdqa $Hkey,$Xi
992
+ vpxor $Hkey,$HK,$HK
993
+ mov \$4,%r10 # up to H^8
994
+ jmp .Linit_start_avx
995
+ ___
996
+
997
+ sub clmul64x64_avx {
998
+ my ($Xhi,$Xi,$Hkey,$HK)=@_;
999
+
1000
+ if (!defined($HK)) { $HK = $T2;
1001
+ $code.=<<___;
1002
+ vpunpckhqdq $Xi,$Xi,$T1
1003
+ vpunpckhqdq $Hkey,$Hkey,$T2
1004
+ vpxor $Xi,$T1,$T1 #
1005
+ vpxor $Hkey,$T2,$T2
1006
+ ___
1007
+ } else {
1008
+ $code.=<<___;
1009
+ vpunpckhqdq $Xi,$Xi,$T1
1010
+ vpxor $Xi,$T1,$T1 #
1011
+ ___
1012
+ }
1013
+ $code.=<<___;
1014
+ vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi #######
1015
+ vpclmulqdq \$0x00,$Hkey,$Xi,$Xi #######
1016
+ vpclmulqdq \$0x00,$HK,$T1,$T1 #######
1017
+ vpxor $Xi,$Xhi,$T2 #
1018
+ vpxor $T2,$T1,$T1 #
1019
+
1020
+ vpslldq \$8,$T1,$T2 #
1021
+ vpsrldq \$8,$T1,$T1
1022
+ vpxor $T2,$Xi,$Xi #
1023
+ vpxor $T1,$Xhi,$Xhi
1024
+ ___
1025
+ }
1026
+
1027
+ sub reduction_avx {
1028
+ my ($Xhi,$Xi) = @_;
1029
+
1030
+ $code.=<<___;
1031
+ vpsllq \$57,$Xi,$T1 # 1st phase
1032
+ vpsllq \$62,$Xi,$T2
1033
+ vpxor $T1,$T2,$T2 #
1034
+ vpsllq \$63,$Xi,$T1
1035
+ vpxor $T1,$T2,$T2 #
1036
+ vpslldq \$8,$T2,$T1 #
1037
+ vpsrldq \$8,$T2,$T2
1038
+ vpxor $T1,$Xi,$Xi #
1039
+ vpxor $T2,$Xhi,$Xhi
1040
+
1041
+ vpsrlq \$1,$Xi,$T2 # 2nd phase
1042
+ vpxor $Xi,$Xhi,$Xhi
1043
+ vpxor $T2,$Xi,$Xi #
1044
+ vpsrlq \$5,$T2,$T2
1045
+ vpxor $T2,$Xi,$Xi #
1046
+ vpsrlq \$1,$Xi,$Xi #
1047
+ vpxor $Xhi,$Xi,$Xi #
1048
+ ___
1049
+ }
1050
+
1051
+ $code.=<<___;
1052
+ .align 32
1053
+ .Linit_loop_avx:
1054
+ vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi...
1055
+ vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt"
1056
+ ___
1057
+ &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7
1058
+ &reduction_avx ($Xhi,$Xi);
1059
+ $code.=<<___;
1060
+ .Linit_start_avx:
1061
+ vmovdqa $Xi,$T3
1062
+ ___
1063
+ &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8
1064
+ &reduction_avx ($Xhi,$Xi);
1065
+ $code.=<<___;
1066
+ vpshufd \$0b01001110,$T3,$T1
1067
+ vpshufd \$0b01001110,$Xi,$T2
1068
+ vpxor $T3,$T1,$T1 # Karatsuba pre-processing
1069
+ vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7
1070
+ vpxor $Xi,$T2,$T2 # Karatsuba pre-processing
1071
+ vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8
1072
+ lea 0x30($Htbl),$Htbl
1073
+ sub \$1,%r10
1074
+ jnz .Linit_loop_avx
1075
+
1076
+ vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped
1077
+ vmovdqu $T3,-0x10($Htbl)
1078
+
1079
+ vzeroupper
1080
+ ___
1081
+ $code.=<<___ if ($win64);
1082
+ movaps (%rsp),%xmm6
1083
+ lea 0x18(%rsp),%rsp
1084
+ .LSEH_end_gcm_init_avx:
1085
+ ___
1086
+ $code.=<<___;
1087
+ ret
1088
+ .size gcm_init_avx,.-gcm_init_avx
1089
+ ___
1090
+ } else {
1091
+ $code.=<<___;
1092
+ jmp .L_init_clmul
1093
+ .size gcm_init_avx,.-gcm_init_avx
1094
+ ___
1095
+ }
1096
+
1097
+ $code.=<<___;
1098
+ .globl gcm_gmult_avx
1099
+ .type gcm_gmult_avx,\@abi-omnipotent
1100
+ .align 32
1101
+ gcm_gmult_avx:
1102
+ jmp .L_gmult_clmul
1103
+ .size gcm_gmult_avx,.-gcm_gmult_avx
1104
+ ___
1105
+
1106
+ $code.=<<___;
1107
+ .globl gcm_ghash_avx
1108
+ .type gcm_ghash_avx,\@abi-omnipotent
1109
+ .align 32
1110
+ gcm_ghash_avx:
1111
+ ___
1112
+ if ($avx) {
1113
+ my ($Xip,$Htbl,$inp,$len)=@_4args;
1114
+ my ($Xlo,$Xhi,$Xmi,
1115
+ $Zlo,$Zhi,$Zmi,
1116
+ $Hkey,$HK,$T1,$T2,
1117
+ $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));
1118
+
1119
+ $code.=<<___ if ($win64);
1120
+ lea -0x88(%rsp),%rax
1121
+ .LSEH_begin_gcm_ghash_avx:
1122
+ # I can't trust assembler to use specific encoding:-(
1123
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
1124
+ .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
1125
+ .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
1126
+ .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
1127
+ .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
1128
+ .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
1129
+ .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
1130
+ .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
1131
+ .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
1132
+ .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
1133
+ .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
1134
+ ___
1135
+ $code.=<<___;
1136
+ vzeroupper
1137
+
1138
+ vmovdqu ($Xip),$Xi # load $Xi
1139
+ lea .L0x1c2_polynomial(%rip),%r10
1140
+ lea 0x40($Htbl),$Htbl # size optimization
1141
+ vmovdqu .Lbswap_mask(%rip),$bswap
1142
+ vpshufb $bswap,$Xi,$Xi
1143
+ cmp \$0x80,$len
1144
+ jb .Lshort_avx
1145
+ sub \$0x80,$len
1146
+
1147
+ vmovdqu 0x70($inp),$Ii # I[7]
1148
+ vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
1149
+ vpshufb $bswap,$Ii,$Ii
1150
+ vmovdqu 0x20-0x40($Htbl),$HK
1151
+
1152
+ vpunpckhqdq $Ii,$Ii,$T2
1153
+ vmovdqu 0x60($inp),$Ij # I[6]
1154
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1155
+ vpxor $Ii,$T2,$T2
1156
+ vpshufb $bswap,$Ij,$Ij
1157
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1158
+ vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
1159
+ vpunpckhqdq $Ij,$Ij,$T1
1160
+ vmovdqu 0x50($inp),$Ii # I[5]
1161
+ vpclmulqdq \$0x00,$HK,$T2,$Xmi
1162
+ vpxor $Ij,$T1,$T1
1163
+
1164
+ vpshufb $bswap,$Ii,$Ii
1165
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
1166
+ vpunpckhqdq $Ii,$Ii,$T2
1167
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
1168
+ vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
1169
+ vpxor $Ii,$T2,$T2
1170
+ vmovdqu 0x40($inp),$Ij # I[4]
1171
+ vpclmulqdq \$0x10,$HK,$T1,$Zmi
1172
+ vmovdqu 0x50-0x40($Htbl),$HK
1173
+
1174
+ vpshufb $bswap,$Ij,$Ij
1175
+ vpxor $Xlo,$Zlo,$Zlo
1176
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1177
+ vpxor $Xhi,$Zhi,$Zhi
1178
+ vpunpckhqdq $Ij,$Ij,$T1
1179
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1180
+ vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
1181
+ vpxor $Xmi,$Zmi,$Zmi
1182
+ vpclmulqdq \$0x00,$HK,$T2,$Xmi
1183
+ vpxor $Ij,$T1,$T1
1184
+
1185
+ vmovdqu 0x30($inp),$Ii # I[3]
1186
+ vpxor $Zlo,$Xlo,$Xlo
1187
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
1188
+ vpxor $Zhi,$Xhi,$Xhi
1189
+ vpshufb $bswap,$Ii,$Ii
1190
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
1191
+ vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
1192
+ vpxor $Zmi,$Xmi,$Xmi
1193
+ vpunpckhqdq $Ii,$Ii,$T2
1194
+ vpclmulqdq \$0x10,$HK,$T1,$Zmi
1195
+ vmovdqu 0x80-0x40($Htbl),$HK
1196
+ vpxor $Ii,$T2,$T2
1197
+
1198
+ vmovdqu 0x20($inp),$Ij # I[2]
1199
+ vpxor $Xlo,$Zlo,$Zlo
1200
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1201
+ vpxor $Xhi,$Zhi,$Zhi
1202
+ vpshufb $bswap,$Ij,$Ij
1203
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1204
+ vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
1205
+ vpxor $Xmi,$Zmi,$Zmi
1206
+ vpunpckhqdq $Ij,$Ij,$T1
1207
+ vpclmulqdq \$0x00,$HK,$T2,$Xmi
1208
+ vpxor $Ij,$T1,$T1
1209
+
1210
+ vmovdqu 0x10($inp),$Ii # I[1]
1211
+ vpxor $Zlo,$Xlo,$Xlo
1212
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
1213
+ vpxor $Zhi,$Xhi,$Xhi
1214
+ vpshufb $bswap,$Ii,$Ii
1215
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
1216
+ vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
1217
+ vpxor $Zmi,$Xmi,$Xmi
1218
+ vpunpckhqdq $Ii,$Ii,$T2
1219
+ vpclmulqdq \$0x10,$HK,$T1,$Zmi
1220
+ vmovdqu 0xb0-0x40($Htbl),$HK
1221
+ vpxor $Ii,$T2,$T2
1222
+
1223
+ vmovdqu ($inp),$Ij # I[0]
1224
+ vpxor $Xlo,$Zlo,$Zlo
1225
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1226
+ vpxor $Xhi,$Zhi,$Zhi
1227
+ vpshufb $bswap,$Ij,$Ij
1228
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1229
+ vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
1230
+ vpxor $Xmi,$Zmi,$Zmi
1231
+ vpclmulqdq \$0x10,$HK,$T2,$Xmi
1232
+
1233
+ lea 0x80($inp),$inp
1234
+ cmp \$0x80,$len
1235
+ jb .Ltail_avx
1236
+
1237
+ vpxor $Xi,$Ij,$Ij # accumulate $Xi
1238
+ sub \$0x80,$len
1239
+ jmp .Loop8x_avx
1240
+
1241
+ .align 32
1242
+ .Loop8x_avx:
1243
+ vpunpckhqdq $Ij,$Ij,$T1
1244
+ vmovdqu 0x70($inp),$Ii # I[7]
1245
+ vpxor $Xlo,$Zlo,$Zlo
1246
+ vpxor $Ij,$T1,$T1
1247
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xi
1248
+ vpshufb $bswap,$Ii,$Ii
1249
+ vpxor $Xhi,$Zhi,$Zhi
1250
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xo
1251
+ vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
1252
+ vpunpckhqdq $Ii,$Ii,$T2
1253
+ vpxor $Xmi,$Zmi,$Zmi
1254
+ vpclmulqdq \$0x00,$HK,$T1,$Tred
1255
+ vmovdqu 0x20-0x40($Htbl),$HK
1256
+ vpxor $Ii,$T2,$T2
1257
+
1258
+ vmovdqu 0x60($inp),$Ij # I[6]
1259
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1260
+ vpxor $Zlo,$Xi,$Xi # collect result
1261
+ vpshufb $bswap,$Ij,$Ij
1262
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1263
+ vxorps $Zhi,$Xo,$Xo
1264
+ vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
1265
+ vpunpckhqdq $Ij,$Ij,$T1
1266
+ vpclmulqdq \$0x00,$HK, $T2,$Xmi
1267
+ vpxor $Zmi,$Tred,$Tred
1268
+ vxorps $Ij,$T1,$T1
1269
+
1270
+ vmovdqu 0x50($inp),$Ii # I[5]
1271
+ vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing
1272
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
1273
+ vpxor $Xo,$Tred,$Tred
1274
+ vpslldq \$8,$Tred,$T2
1275
+ vpxor $Xlo,$Zlo,$Zlo
1276
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
1277
+ vpsrldq \$8,$Tred,$Tred
1278
+ vpxor $T2, $Xi, $Xi
1279
+ vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
1280
+ vpshufb $bswap,$Ii,$Ii
1281
+ vxorps $Tred,$Xo, $Xo
1282
+ vpxor $Xhi,$Zhi,$Zhi
1283
+ vpunpckhqdq $Ii,$Ii,$T2
1284
+ vpclmulqdq \$0x10,$HK, $T1,$Zmi
1285
+ vmovdqu 0x50-0x40($Htbl),$HK
1286
+ vpxor $Ii,$T2,$T2
1287
+ vpxor $Xmi,$Zmi,$Zmi
1288
+
1289
+ vmovdqu 0x40($inp),$Ij # I[4]
1290
+ vpalignr \$8,$Xi,$Xi,$Tred # 1st phase
1291
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1292
+ vpshufb $bswap,$Ij,$Ij
1293
+ vpxor $Zlo,$Xlo,$Xlo
1294
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1295
+ vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
1296
+ vpunpckhqdq $Ij,$Ij,$T1
1297
+ vpxor $Zhi,$Xhi,$Xhi
1298
+ vpclmulqdq \$0x00,$HK, $T2,$Xmi
1299
+ vxorps $Ij,$T1,$T1
1300
+ vpxor $Zmi,$Xmi,$Xmi
1301
+
1302
+ vmovdqu 0x30($inp),$Ii # I[3]
1303
+ vpclmulqdq \$0x10,(%r10),$Xi,$Xi
1304
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
1305
+ vpshufb $bswap,$Ii,$Ii
1306
+ vpxor $Xlo,$Zlo,$Zlo
1307
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
1308
+ vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
1309
+ vpunpckhqdq $Ii,$Ii,$T2
1310
+ vpxor $Xhi,$Zhi,$Zhi
1311
+ vpclmulqdq \$0x10,$HK, $T1,$Zmi
1312
+ vmovdqu 0x80-0x40($Htbl),$HK
1313
+ vpxor $Ii,$T2,$T2
1314
+ vpxor $Xmi,$Zmi,$Zmi
1315
+
1316
+ vmovdqu 0x20($inp),$Ij # I[2]
1317
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1318
+ vpshufb $bswap,$Ij,$Ij
1319
+ vpxor $Zlo,$Xlo,$Xlo
1320
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1321
+ vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
1322
+ vpunpckhqdq $Ij,$Ij,$T1
1323
+ vpxor $Zhi,$Xhi,$Xhi
1324
+ vpclmulqdq \$0x00,$HK, $T2,$Xmi
1325
+ vpxor $Ij,$T1,$T1
1326
+ vpxor $Zmi,$Xmi,$Xmi
1327
+ vxorps $Tred,$Xi,$Xi
1328
+
1329
+ vmovdqu 0x10($inp),$Ii # I[1]
1330
+ vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase
1331
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
1332
+ vpshufb $bswap,$Ii,$Ii
1333
+ vpxor $Xlo,$Zlo,$Zlo
1334
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
1335
+ vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
1336
+ vpclmulqdq \$0x10,(%r10),$Xi,$Xi
1337
+ vxorps $Xo,$Tred,$Tred
1338
+ vpunpckhqdq $Ii,$Ii,$T2
1339
+ vpxor $Xhi,$Zhi,$Zhi
1340
+ vpclmulqdq \$0x10,$HK, $T1,$Zmi
1341
+ vmovdqu 0xb0-0x40($Htbl),$HK
1342
+ vpxor $Ii,$T2,$T2
1343
+ vpxor $Xmi,$Zmi,$Zmi
1344
+
1345
+ vmovdqu ($inp),$Ij # I[0]
1346
+ vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1347
+ vpshufb $bswap,$Ij,$Ij
1348
+ vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1349
+ vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
1350
+ vpxor $Tred,$Ij,$Ij
1351
+ vpclmulqdq \$0x10,$HK, $T2,$Xmi
1352
+ vpxor $Xi,$Ij,$Ij # accumulate $Xi
1353
+
1354
+ lea 0x80($inp),$inp
1355
+ sub \$0x80,$len
1356
+ jnc .Loop8x_avx
1357
+
1358
+ add \$0x80,$len
1359
+ jmp .Ltail_no_xor_avx
1360
+
1361
+ .align 32
1362
+ .Lshort_avx:
1363
+ vmovdqu -0x10($inp,$len),$Ii # very last word
1364
+ lea ($inp,$len),$inp
1365
+ vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
1366
+ vmovdqu 0x20-0x40($Htbl),$HK
1367
+ vpshufb $bswap,$Ii,$Ij
1368
+
1369
+ vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo,
1370
+ vmovdqa $Xhi,$Zhi # $Zhi and
1371
+ vmovdqa $Xmi,$Zmi # $Zmi
1372
+ sub \$0x10,$len
1373
+ jz .Ltail_avx
1374
+
1375
+ vpunpckhqdq $Ij,$Ij,$T1
1376
+ vpxor $Xlo,$Zlo,$Zlo
1377
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
1378
+ vpxor $Ij,$T1,$T1
1379
+ vmovdqu -0x20($inp),$Ii
1380
+ vpxor $Xhi,$Zhi,$Zhi
1381
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
1382
+ vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
1383
+ vpshufb $bswap,$Ii,$Ij
1384
+ vpxor $Xmi,$Zmi,$Zmi
1385
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
1386
+ vpsrldq \$8,$HK,$HK
1387
+ sub \$0x10,$len
1388
+ jz .Ltail_avx
1389
+
1390
+ vpunpckhqdq $Ij,$Ij,$T1
1391
+ vpxor $Xlo,$Zlo,$Zlo
1392
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
1393
+ vpxor $Ij,$T1,$T1
1394
+ vmovdqu -0x30($inp),$Ii
1395
+ vpxor $Xhi,$Zhi,$Zhi
1396
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
1397
+ vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
1398
+ vpshufb $bswap,$Ii,$Ij
1399
+ vpxor $Xmi,$Zmi,$Zmi
1400
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
1401
+ vmovdqu 0x50-0x40($Htbl),$HK
1402
+ sub \$0x10,$len
1403
+ jz .Ltail_avx
1404
+
1405
+ vpunpckhqdq $Ij,$Ij,$T1
1406
+ vpxor $Xlo,$Zlo,$Zlo
1407
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
1408
+ vpxor $Ij,$T1,$T1
1409
+ vmovdqu -0x40($inp),$Ii
1410
+ vpxor $Xhi,$Zhi,$Zhi
1411
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
1412
+ vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
1413
+ vpshufb $bswap,$Ii,$Ij
1414
+ vpxor $Xmi,$Zmi,$Zmi
1415
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
1416
+ vpsrldq \$8,$HK,$HK
1417
+ sub \$0x10,$len
1418
+ jz .Ltail_avx
1419
+
1420
+ vpunpckhqdq $Ij,$Ij,$T1
1421
+ vpxor $Xlo,$Zlo,$Zlo
1422
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
1423
+ vpxor $Ij,$T1,$T1
1424
+ vmovdqu -0x50($inp),$Ii
1425
+ vpxor $Xhi,$Zhi,$Zhi
1426
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
1427
+ vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
1428
+ vpshufb $bswap,$Ii,$Ij
1429
+ vpxor $Xmi,$Zmi,$Zmi
1430
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
1431
+ vmovdqu 0x80-0x40($Htbl),$HK
1432
+ sub \$0x10,$len
1433
+ jz .Ltail_avx
1434
+
1435
+ vpunpckhqdq $Ij,$Ij,$T1
1436
+ vpxor $Xlo,$Zlo,$Zlo
1437
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
1438
+ vpxor $Ij,$T1,$T1
1439
+ vmovdqu -0x60($inp),$Ii
1440
+ vpxor $Xhi,$Zhi,$Zhi
1441
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
1442
+ vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
1443
+ vpshufb $bswap,$Ii,$Ij
1444
+ vpxor $Xmi,$Zmi,$Zmi
1445
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
1446
+ vpsrldq \$8,$HK,$HK
1447
+ sub \$0x10,$len
1448
+ jz .Ltail_avx
1449
+
1450
+ vpunpckhqdq $Ij,$Ij,$T1
1451
+ vpxor $Xlo,$Zlo,$Zlo
1452
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
1453
+ vpxor $Ij,$T1,$T1
1454
+ vmovdqu -0x70($inp),$Ii
1455
+ vpxor $Xhi,$Zhi,$Zhi
1456
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
1457
+ vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
1458
+ vpshufb $bswap,$Ii,$Ij
1459
+ vpxor $Xmi,$Zmi,$Zmi
1460
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
1461
+ vmovq 0xb8-0x40($Htbl),$HK
1462
+ sub \$0x10,$len
1463
+ jmp .Ltail_avx
1464
+
1465
+ .align 32
+ .Ltail_avx:
+ vpxor $Xi,$Ij,$Ij # accumulate $Xi
+ .Ltail_no_xor_avx:
+ vpunpckhqdq $Ij,$Ij,$T1
+ vpxor $Xlo,$Zlo,$Zlo
+ vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
+ vpxor $Ij,$T1,$T1
+ vpxor $Xhi,$Zhi,$Zhi
+ vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
+ vpxor $Xmi,$Zmi,$Zmi
+ vpclmulqdq \$0x00,$HK,$T1,$Xmi
+
+ vmovdqu (%r10),$Tred
+
+ vpxor $Xlo,$Zlo,$Xi
+ vpxor $Xhi,$Zhi,$Xo
+ vpxor $Xmi,$Zmi,$Zmi
+
+ vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing
+ vpxor $Xo, $Zmi,$Zmi
+ vpslldq \$8, $Zmi,$T2
+ vpsrldq \$8, $Zmi,$Zmi
+ vpxor $T2, $Xi, $Xi
+ vpxor $Zmi,$Xo, $Xo
+
+ vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase
+ vpalignr \$8,$Xi,$Xi,$Xi
+ vpxor $T2,$Xi,$Xi
+
+ vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase
+ vpalignr \$8,$Xi,$Xi,$Xi
+ vpxor $Xo,$Xi,$Xi
+ vpxor $T2,$Xi,$Xi
+
+ cmp \$0,$len
+ jne .Lshort_avx
+
+ vpshufb $bswap,$Xi,$Xi
+ vmovdqu $Xi,($Xip)
+ vzeroupper
+ ___
+ $code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps 0x10(%rsp),%xmm7
+ movaps 0x20(%rsp),%xmm8
+ movaps 0x30(%rsp),%xmm9
+ movaps 0x40(%rsp),%xmm10
+ movaps 0x50(%rsp),%xmm11
+ movaps 0x60(%rsp),%xmm12
+ movaps 0x70(%rsp),%xmm13
+ movaps 0x80(%rsp),%xmm14
+ movaps 0x90(%rsp),%xmm15
+ lea 0xa8(%rsp),%rsp
+ .LSEH_end_gcm_ghash_avx:
+ ___
+ $code.=<<___;
+ ret
+ .size gcm_ghash_avx,.-gcm_ghash_avx
+ ___
+ } else {
+ $code.=<<___;
+ jmp .L_ghash_clmul
+ .size gcm_ghash_avx,.-gcm_ghash_avx
+ ___
+ }
+
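+ # Editorial note: read-only data shared by the code paths above: the
+ # byte-order mask, the reduction polynomial constants, and the
+ # .Lrem_4bit/.Lrem_8bit lookup tables used by the table-driven
+ # (non-CLMUL) 4-bit routines.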
+ $code.=<<___;
+ .align 64
+ .Lbswap_mask:
+ .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+ .L0x1c2_polynomial:
+ .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
+ .L7_mask:
+ .long 7,0,7,0
+ .L7_mask_poly:
+ .long 7,0,`0xE1<<1`,0
+ .align 64
+ .type .Lrem_4bit,\@object
+ .Lrem_4bit:
+ .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
+ .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
+ .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
+ .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
+ .type .Lrem_8bit,\@object
+ .Lrem_8bit:
+ .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
+ .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
+ .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
+ .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
+ .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
+ .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
+ .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
+ .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
+ .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
+ .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
+ .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
+ .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
+ .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
+ .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
+ .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
+ .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
+ .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
+ .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
+ .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
+ .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
+ .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
+ .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
+ .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
+ .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
+ .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
+ .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
+ .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
+ .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
+ .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
+ .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
+ .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
+ .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
+
+ .asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+ .align 64
+ ___
+
+ # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+ # CONTEXT *context,DISPATCHER_CONTEXT *disp)
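+ # Editorial note: se_handler reads the prologue/epilogue labels from
+ # HandlerData; when the faulting RIP lies inside the function body it
+ # recovers the caller's %rbx/%rbp/%r12 from the stack before handing the
+ # CONTEXT to RtlVirtualUnwind, so unwinding through the 4-bit routines
+ # stays sound.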
+ if ($win64) {
+ $rec="%rcx";
+ $frame="%rdx";
+ $context="%r8";
+ $disp="%r9";
+
+ $code.=<<___;
+ .extern __imp_RtlVirtualUnwind
+ .type se_handler,\@abi-omnipotent
+ .align 16
+ se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lin_prologue
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lin_prologue
+
+ lea 24(%rax),%rax # adjust "rsp"
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+
+ .Lin_prologue:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$`1232/8`,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+ .size se_handler,.-se_handler
+
+ .section .pdata
+ .align 4
+ .rva .LSEH_begin_gcm_gmult_4bit
+ .rva .LSEH_end_gcm_gmult_4bit
+ .rva .LSEH_info_gcm_gmult_4bit
+
+ .rva .LSEH_begin_gcm_ghash_4bit
+ .rva .LSEH_end_gcm_ghash_4bit
+ .rva .LSEH_info_gcm_ghash_4bit
+
+ .rva .LSEH_begin_gcm_init_clmul
+ .rva .LSEH_end_gcm_init_clmul
+ .rva .LSEH_info_gcm_init_clmul
+
+ .rva .LSEH_begin_gcm_ghash_clmul
+ .rva .LSEH_end_gcm_ghash_clmul
+ .rva .LSEH_info_gcm_ghash_clmul
+ ___
+ $code.=<<___ if ($avx);
+ .rva .LSEH_begin_gcm_init_avx
+ .rva .LSEH_end_gcm_init_avx
+ .rva .LSEH_info_gcm_init_clmul
+
+ .rva .LSEH_begin_gcm_ghash_avx
+ .rva .LSEH_end_gcm_ghash_avx
+ .rva .LSEH_info_gcm_ghash_clmul
+ ___
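+ # Editorial note: .xdata holds the UNWIND_INFO records referenced above.
+ # The 4-bit entries name se_handler plus its HandlerData labels, while
+ # the CLMUL/AVX entries are plain unwind-code arrays describing the XMM
+ # saves and stack allocation annotated beside each byte, so they need no
+ # language handler.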
+ $code.=<<___;
+ .section .xdata
+ .align 8
+ .LSEH_info_gcm_gmult_4bit:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData
+ .LSEH_info_gcm_ghash_4bit:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lghash_prologue,.Lghash_epilogue # HandlerData
+ .LSEH_info_gcm_init_clmul:
+ .byte 0x01,0x08,0x03,0x00
+ .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
+ .byte 0x04,0x22,0x00,0x00 #sub rsp,0x18
+ .LSEH_info_gcm_ghash_clmul:
+ .byte 0x01,0x33,0x16,0x00
+ .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15
+ .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14
+ .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13
+ .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12
+ .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11
+ .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
+ .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
+ .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
+ .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
+ .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
+ .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
+ ___
+ }
+
+ $code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+ print $code;
+
+ close STDOUT;