ring-native 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (261)
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/Gemfile +3 -0
  4. data/README.md +22 -0
  5. data/Rakefile +1 -0
  6. data/ext/ring/extconf.rb +29 -0
  7. data/lib/ring/native.rb +8 -0
  8. data/lib/ring/native/version.rb +5 -0
  9. data/ring-native.gemspec +25 -0
  10. data/vendor/ring/BUILDING.md +40 -0
  11. data/vendor/ring/Cargo.toml +43 -0
  12. data/vendor/ring/LICENSE +185 -0
  13. data/vendor/ring/Makefile +35 -0
  14. data/vendor/ring/PORTING.md +163 -0
  15. data/vendor/ring/README.md +113 -0
  16. data/vendor/ring/STYLE.md +197 -0
  17. data/vendor/ring/appveyor.yml +27 -0
  18. data/vendor/ring/build.rs +108 -0
  19. data/vendor/ring/crypto/aes/aes.c +1142 -0
  20. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +25 -0
  21. data/vendor/ring/crypto/aes/aes_test.cc +93 -0
  22. data/vendor/ring/crypto/aes/asm/aes-586.pl +2368 -0
  23. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +1249 -0
  24. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +2246 -0
  25. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +1318 -0
  26. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +2084 -0
  27. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +675 -0
  28. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +1364 -0
  29. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +1565 -0
  30. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +841 -0
  31. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +1116 -0
  32. data/vendor/ring/crypto/aes/internal.h +87 -0
  33. data/vendor/ring/crypto/aes/mode_wrappers.c +61 -0
  34. data/vendor/ring/crypto/bn/add.c +394 -0
  35. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +694 -0
  36. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +1503 -0
  37. data/vendor/ring/crypto/bn/asm/bn-586.pl +774 -0
  38. data/vendor/ring/crypto/bn/asm/co-586.pl +287 -0
  39. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +1882 -0
  40. data/vendor/ring/crypto/bn/asm/x86-mont.pl +592 -0
  41. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +599 -0
  42. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +1393 -0
  43. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +3507 -0
  44. data/vendor/ring/crypto/bn/bn.c +352 -0
  45. data/vendor/ring/crypto/bn/bn_asn1.c +74 -0
  46. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +25 -0
  47. data/vendor/ring/crypto/bn/bn_test.cc +1696 -0
  48. data/vendor/ring/crypto/bn/cmp.c +200 -0
  49. data/vendor/ring/crypto/bn/convert.c +433 -0
  50. data/vendor/ring/crypto/bn/ctx.c +311 -0
  51. data/vendor/ring/crypto/bn/div.c +594 -0
  52. data/vendor/ring/crypto/bn/exponentiation.c +1335 -0
  53. data/vendor/ring/crypto/bn/gcd.c +711 -0
  54. data/vendor/ring/crypto/bn/generic.c +1019 -0
  55. data/vendor/ring/crypto/bn/internal.h +316 -0
  56. data/vendor/ring/crypto/bn/montgomery.c +516 -0
  57. data/vendor/ring/crypto/bn/mul.c +888 -0
  58. data/vendor/ring/crypto/bn/prime.c +829 -0
  59. data/vendor/ring/crypto/bn/random.c +334 -0
  60. data/vendor/ring/crypto/bn/rsaz_exp.c +262 -0
  61. data/vendor/ring/crypto/bn/rsaz_exp.h +53 -0
  62. data/vendor/ring/crypto/bn/shift.c +276 -0
  63. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +25 -0
  64. data/vendor/ring/crypto/bytestring/bytestring_test.cc +421 -0
  65. data/vendor/ring/crypto/bytestring/cbb.c +399 -0
  66. data/vendor/ring/crypto/bytestring/cbs.c +227 -0
  67. data/vendor/ring/crypto/bytestring/internal.h +46 -0
  68. data/vendor/ring/crypto/chacha/chacha_generic.c +140 -0
  69. data/vendor/ring/crypto/chacha/chacha_vec.c +323 -0
  70. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +1447 -0
  71. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +153 -0
  72. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +25 -0
  73. data/vendor/ring/crypto/cipher/e_aes.c +390 -0
  74. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +208 -0
  75. data/vendor/ring/crypto/cipher/internal.h +173 -0
  76. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +543 -0
  77. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +9 -0
  78. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +475 -0
  79. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +23 -0
  80. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +422 -0
  81. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +484 -0
  82. data/vendor/ring/crypto/cipher/test/cipher_test.txt +100 -0
  83. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +25 -0
  84. data/vendor/ring/crypto/constant_time_test.c +304 -0
  85. data/vendor/ring/crypto/cpu-arm-asm.S +32 -0
  86. data/vendor/ring/crypto/cpu-arm.c +199 -0
  87. data/vendor/ring/crypto/cpu-intel.c +261 -0
  88. data/vendor/ring/crypto/crypto.c +151 -0
  89. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +2118 -0
  90. data/vendor/ring/crypto/curve25519/curve25519.c +4888 -0
  91. data/vendor/ring/crypto/curve25519/x25519_test.cc +128 -0
  92. data/vendor/ring/crypto/digest/md32_common.h +181 -0
  93. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +2725 -0
  94. data/vendor/ring/crypto/ec/ec.c +193 -0
  95. data/vendor/ring/crypto/ec/ec_curves.c +61 -0
  96. data/vendor/ring/crypto/ec/ec_key.c +228 -0
  97. data/vendor/ring/crypto/ec/ec_montgomery.c +114 -0
  98. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +25 -0
  99. data/vendor/ring/crypto/ec/internal.h +243 -0
  100. data/vendor/ring/crypto/ec/oct.c +253 -0
  101. data/vendor/ring/crypto/ec/p256-64.c +1794 -0
  102. data/vendor/ring/crypto/ec/p256-x86_64-table.h +9548 -0
  103. data/vendor/ring/crypto/ec/p256-x86_64.c +509 -0
  104. data/vendor/ring/crypto/ec/simple.c +1007 -0
  105. data/vendor/ring/crypto/ec/util-64.c +183 -0
  106. data/vendor/ring/crypto/ec/wnaf.c +508 -0
  107. data/vendor/ring/crypto/ecdh/ecdh.c +155 -0
  108. data/vendor/ring/crypto/ecdsa/ecdsa.c +304 -0
  109. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +193 -0
  110. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +25 -0
  111. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +327 -0
  112. data/vendor/ring/crypto/header_removed.h +17 -0
  113. data/vendor/ring/crypto/internal.h +495 -0
  114. data/vendor/ring/crypto/libring.Windows.vcxproj +101 -0
  115. data/vendor/ring/crypto/mem.c +98 -0
  116. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +1045 -0
  117. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +517 -0
  118. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +1393 -0
  119. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +1741 -0
  120. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +422 -0
  121. data/vendor/ring/crypto/modes/ctr.c +226 -0
  122. data/vendor/ring/crypto/modes/gcm.c +1206 -0
  123. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +25 -0
  124. data/vendor/ring/crypto/modes/gcm_test.c +348 -0
  125. data/vendor/ring/crypto/modes/internal.h +299 -0
  126. data/vendor/ring/crypto/perlasm/arm-xlate.pl +170 -0
  127. data/vendor/ring/crypto/perlasm/readme +100 -0
  128. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +1164 -0
  129. data/vendor/ring/crypto/perlasm/x86asm.pl +292 -0
  130. data/vendor/ring/crypto/perlasm/x86gas.pl +263 -0
  131. data/vendor/ring/crypto/perlasm/x86masm.pl +200 -0
  132. data/vendor/ring/crypto/perlasm/x86nasm.pl +187 -0
  133. data/vendor/ring/crypto/poly1305/poly1305.c +331 -0
  134. data/vendor/ring/crypto/poly1305/poly1305_arm.c +301 -0
  135. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +2015 -0
  136. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +25 -0
  137. data/vendor/ring/crypto/poly1305/poly1305_test.cc +80 -0
  138. data/vendor/ring/crypto/poly1305/poly1305_test.txt +52 -0
  139. data/vendor/ring/crypto/poly1305/poly1305_vec.c +892 -0
  140. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +75 -0
  141. data/vendor/ring/crypto/rand/internal.h +32 -0
  142. data/vendor/ring/crypto/rand/rand.c +189 -0
  143. data/vendor/ring/crypto/rand/urandom.c +219 -0
  144. data/vendor/ring/crypto/rand/windows.c +56 -0
  145. data/vendor/ring/crypto/refcount_c11.c +66 -0
  146. data/vendor/ring/crypto/refcount_lock.c +53 -0
  147. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +25 -0
  148. data/vendor/ring/crypto/refcount_test.c +58 -0
  149. data/vendor/ring/crypto/rsa/blinding.c +462 -0
  150. data/vendor/ring/crypto/rsa/internal.h +108 -0
  151. data/vendor/ring/crypto/rsa/padding.c +300 -0
  152. data/vendor/ring/crypto/rsa/rsa.c +450 -0
  153. data/vendor/ring/crypto/rsa/rsa_asn1.c +261 -0
  154. data/vendor/ring/crypto/rsa/rsa_impl.c +944 -0
  155. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +25 -0
  156. data/vendor/ring/crypto/rsa/rsa_test.cc +437 -0
  157. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +436 -0
  158. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +2390 -0
  159. data/vendor/ring/crypto/sha/asm/sha256-586.pl +1275 -0
  160. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +735 -0
  161. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +14 -0
  162. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +14 -0
  163. data/vendor/ring/crypto/sha/asm/sha512-586.pl +911 -0
  164. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +666 -0
  165. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +14 -0
  166. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +14 -0
  167. data/vendor/ring/crypto/sha/sha1.c +271 -0
  168. data/vendor/ring/crypto/sha/sha256.c +204 -0
  169. data/vendor/ring/crypto/sha/sha512.c +355 -0
  170. data/vendor/ring/crypto/test/file_test.cc +326 -0
  171. data/vendor/ring/crypto/test/file_test.h +181 -0
  172. data/vendor/ring/crypto/test/malloc.cc +150 -0
  173. data/vendor/ring/crypto/test/scoped_types.h +95 -0
  174. data/vendor/ring/crypto/test/test.Windows.vcxproj +35 -0
  175. data/vendor/ring/crypto/test/test_util.cc +46 -0
  176. data/vendor/ring/crypto/test/test_util.h +41 -0
  177. data/vendor/ring/crypto/thread_none.c +55 -0
  178. data/vendor/ring/crypto/thread_pthread.c +165 -0
  179. data/vendor/ring/crypto/thread_test.Windows.vcxproj +25 -0
  180. data/vendor/ring/crypto/thread_test.c +200 -0
  181. data/vendor/ring/crypto/thread_win.c +282 -0
  182. data/vendor/ring/examples/checkdigest.rs +103 -0
  183. data/vendor/ring/include/openssl/aes.h +121 -0
  184. data/vendor/ring/include/openssl/arm_arch.h +129 -0
  185. data/vendor/ring/include/openssl/base.h +156 -0
  186. data/vendor/ring/include/openssl/bn.h +794 -0
  187. data/vendor/ring/include/openssl/buffer.h +18 -0
  188. data/vendor/ring/include/openssl/bytestring.h +235 -0
  189. data/vendor/ring/include/openssl/chacha.h +37 -0
  190. data/vendor/ring/include/openssl/cmac.h +76 -0
  191. data/vendor/ring/include/openssl/cpu.h +184 -0
  192. data/vendor/ring/include/openssl/crypto.h +43 -0
  193. data/vendor/ring/include/openssl/curve25519.h +88 -0
  194. data/vendor/ring/include/openssl/ec.h +225 -0
  195. data/vendor/ring/include/openssl/ec_key.h +129 -0
  196. data/vendor/ring/include/openssl/ecdh.h +110 -0
  197. data/vendor/ring/include/openssl/ecdsa.h +156 -0
  198. data/vendor/ring/include/openssl/err.h +201 -0
  199. data/vendor/ring/include/openssl/mem.h +101 -0
  200. data/vendor/ring/include/openssl/obj_mac.h +71 -0
  201. data/vendor/ring/include/openssl/opensslfeatures.h +68 -0
  202. data/vendor/ring/include/openssl/opensslv.h +18 -0
  203. data/vendor/ring/include/openssl/ossl_typ.h +18 -0
  204. data/vendor/ring/include/openssl/poly1305.h +51 -0
  205. data/vendor/ring/include/openssl/rand.h +70 -0
  206. data/vendor/ring/include/openssl/rsa.h +399 -0
  207. data/vendor/ring/include/openssl/thread.h +133 -0
  208. data/vendor/ring/include/openssl/type_check.h +71 -0
  209. data/vendor/ring/mk/Common.props +63 -0
  210. data/vendor/ring/mk/Windows.props +42 -0
  211. data/vendor/ring/mk/WindowsTest.props +18 -0
  212. data/vendor/ring/mk/appveyor.bat +62 -0
  213. data/vendor/ring/mk/bottom_of_makefile.mk +54 -0
  214. data/vendor/ring/mk/ring.mk +266 -0
  215. data/vendor/ring/mk/top_of_makefile.mk +214 -0
  216. data/vendor/ring/mk/travis.sh +40 -0
  217. data/vendor/ring/mk/update-travis-yml.py +229 -0
  218. data/vendor/ring/ring.sln +153 -0
  219. data/vendor/ring/src/aead.rs +682 -0
  220. data/vendor/ring/src/agreement.rs +248 -0
  221. data/vendor/ring/src/c.rs +129 -0
  222. data/vendor/ring/src/constant_time.rs +37 -0
  223. data/vendor/ring/src/der.rs +96 -0
  224. data/vendor/ring/src/digest.rs +690 -0
  225. data/vendor/ring/src/digest_tests.txt +57 -0
  226. data/vendor/ring/src/ecc.rs +28 -0
  227. data/vendor/ring/src/ecc_build.rs +279 -0
  228. data/vendor/ring/src/ecc_curves.rs +117 -0
  229. data/vendor/ring/src/ed25519_tests.txt +2579 -0
  230. data/vendor/ring/src/exe_tests.rs +46 -0
  231. data/vendor/ring/src/ffi.rs +29 -0
  232. data/vendor/ring/src/file_test.rs +187 -0
  233. data/vendor/ring/src/hkdf.rs +153 -0
  234. data/vendor/ring/src/hkdf_tests.txt +59 -0
  235. data/vendor/ring/src/hmac.rs +414 -0
  236. data/vendor/ring/src/hmac_tests.txt +97 -0
  237. data/vendor/ring/src/input.rs +312 -0
  238. data/vendor/ring/src/lib.rs +41 -0
  239. data/vendor/ring/src/pbkdf2.rs +265 -0
  240. data/vendor/ring/src/pbkdf2_tests.txt +113 -0
  241. data/vendor/ring/src/polyfill.rs +57 -0
  242. data/vendor/ring/src/rand.rs +28 -0
  243. data/vendor/ring/src/signature.rs +314 -0
  244. data/vendor/ring/third-party/NIST/README.md +9 -0
  245. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +263 -0
  246. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +309 -0
  247. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +267 -0
  248. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +263 -0
  249. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +309 -0
  250. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +267 -0
  251. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +263 -0
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +309 -0
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +267 -0
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +519 -0
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +309 -0
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +523 -0
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +519 -0
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +309 -0
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +523 -0
  260. data/vendor/ring/third-party/NIST/sha256sums.txt +1 -0
  261. metadata +333 -0
data/vendor/ring/crypto/bn/asm/x86_64-mont.pl
@@ -0,0 +1,1393 @@
+ #!/usr/bin/env perl
+
+ # ====================================================================
+ # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+ # project. The module is, however, dual licensed under OpenSSL and
+ # CRYPTOGAMS licenses depending on where you obtain it. For further
+ # details see http://www.openssl.org/~appro/cryptogams/.
+ # ====================================================================
+
+ # October 2005.
+ #
+ # Montgomery multiplication routine for x86_64. While it gives a modest
+ # 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs more
+ # than twice (>2x) as fast. The most common case, rsa1024 sign, is
+ # improved by a respectable 50%. It remains to be seen whether loop
+ # unrolling and a dedicated squaring routine can provide further
+ # improvement...
+
+ # July 2011.
+ #
+ # Add dedicated squaring procedure. Performance improvement varies
+ # from platform to platform, but on average it's ~5%/15%/25%/33%
+ # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
+
+ # August 2011.
+ #
+ # Unroll and modulo-schedule inner loops in such a manner that they
+ # are "fallen through" for input lengths of 8, which is critical for
+ # 1024-bit RSA *sign*. Average performance improvement in comparison
+ # to the *initial* version of this module from 2005 is ~0%/30%/40%/45%
+ # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
+
+ # June 2013.
+ #
+ # Optimize reduction in squaring procedure and improve 1024+-bit RSA
+ # sign performance by 10-16% on Intel Sandy Bridge and later
+ # (virtually the same on non-Intel processors).
+
+ # August 2013.
+ #
+ # Add MULX/ADOX/ADCX code path.
+
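For orientation, the routines below implement word-serial Montgomery multiplication: each outer iteration multiplies in one word of bp, then adds a multiple m of np chosen so the low word cancels, allowing the running total to shift down one word. Here is a minimal C sketch of that computation, assuming GCC/Clang's unsigned __int128 and inputs already reduced mod np; the names are illustrative and this code is not generated from this file:

    #include <stdint.h>
    #include <stddef.h>

    /* Illustrative only: rp = ap*bp*R^-1 mod np, R = 2^(64*num),
       n0 = -np[0]^-1 mod 2^64.  tp plays the role of the asm's
       tp = alloca(8*(num+2)) scratch.  Assumes num >= 1, np odd,
       and ap, bp < np (as values). */
    static void mont_mul_sketch(uint64_t *rp, const uint64_t *ap,
                                const uint64_t *bp, const uint64_t *np,
                                uint64_t n0, size_t num) {
        uint64_t tp[num + 1];
        for (size_t j = 0; j <= num; j++) tp[j] = 0;

        for (size_t i = 0; i < num; i++) {          /* .Louter */
            uint64_t carry = 0, top, m;
            unsigned __int128 t;
            for (size_t j = 0; j < num; j++) {      /* tp += ap*bp[i] */
                t = (unsigned __int128)ap[j] * bp[i] + tp[j] + carry;
                tp[j] = (uint64_t)t;
                carry = (uint64_t)(t >> 64);
            }
            t = (unsigned __int128)tp[num] + carry;
            tp[num] = (uint64_t)t;
            top = (uint64_t)(t >> 64);              /* "upmost overflow bit" */

            m = tp[0] * n0;                         /* "tp[0]"*n0 */
            carry = 0;
            for (size_t j = 0; j < num; j++) {      /* tp += np*m, then shift */
                t = (unsigned __int128)np[j] * m + tp[j] + carry;
                if (j > 0) tp[j - 1] = (uint64_t)t; /* low word is 0, dropped */
                carry = (uint64_t)(t >> 64);
            }
            t = (unsigned __int128)tp[num] + carry;
            tp[num - 1] = (uint64_t)t;
            tp[num] = top + (uint64_t)(t >> 64);
        }

        /* Final conditional subtraction (done branchlessly in .Lcopy). */
        uint64_t borrow = 0;
        for (size_t j = 0; j < num; j++) {
            unsigned __int128 d = (unsigned __int128)tp[j] - np[j] - borrow;
            rp[j] = (uint64_t)d;
            borrow = (uint64_t)(d >> 64) & 1;
        }
        if (borrow > tp[num])                       /* tp < np: keep tp */
            for (size_t j = 0; j < num; j++) rp[j] = tp[j];
    }
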
+ $flavour = shift;
+ $output = shift;
+ if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+ $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+ die "can't locate x86_64-xlate.pl";
+
+ open OUT,"| \"$^X\" $xlate $flavour $output";
+ *STDOUT=*OUT;
+
+ # In upstream, this is controlled by shelling out to the compiler to check
+ # versions, but BoringSSL is intended to be used with pre-generated perlasm
+ # output, so this isn't useful anyway.
+ #
+ # TODO(davidben): Enable this option after testing. $addx goes up to 1.
+ $addx = 0;
+
+ # int bn_mul_mont(
+ $rp="%rdi"; # BN_ULONG *rp,
+ $ap="%rsi"; # const BN_ULONG *ap,
+ $bp="%rdx"; # const BN_ULONG *bp,
+ $np="%rcx"; # const BN_ULONG *np,
+ $n0="%r8"; # const BN_ULONG *n0,
+ $num="%r9"; # int num);
+ $lo0="%r10";
+ $hi0="%r11";
+ $hi1="%r13";
+ $i="%r14";
+ $j="%r15";
+ $m0="%rbx";
+ $m1="%rbp";
+
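The n0 argument above points at the word -np[0]^-1 mod 2^64 (np must be odd). As a hedged sketch of how callers commonly derive that word (this helper is hypothetical, not part of ring): for odd n, n is its own inverse mod 8, and each Newton/Hensel step x = x*(2 - n*x) doubles the number of correct low bits:

    #include <stdint.h>

    /* Hypothetical helper: returns -n^-1 mod 2^64 for odd n, suitable as
       the word that bn_mul_mont's n0 parameter points to. */
    static uint64_t mont_n0(uint64_t n) {
        uint64_t x = n;               /* x == n^-1 mod 2^3 for odd n */
        for (int i = 0; i < 5; i++)   /* 3 -> 6 -> 12 -> 24 -> 48 -> 96 bits */
            x *= 2 - n * x;           /* Newton step, implicitly mod 2^64 */
        return (uint64_t)0 - x;       /* negate to get -n^-1 mod 2^64 */
    }
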
+ $code=<<___;
+ .text
+
+ .extern OPENSSL_ia32cap_P
+
+ .globl bn_mul_mont
+ .type bn_mul_mont,\@function,6
+ .align 16
+ bn_mul_mont:
+ test \$3,${num}d
+ jnz .Lmul_enter
+ cmp \$8,${num}d
+ jb .Lmul_enter
+ ___
+ $code.=<<___ if ($addx);
+ mov OPENSSL_ia32cap_P+8(%rip),%r11d
+ ___
+ $code.=<<___;
+ cmp $ap,$bp
+ jne .Lmul4x_enter
+ test \$7,${num}d
+ jz .Lsqr8x_enter
+ jmp .Lmul4x_enter
+
+ .align 16
+ .Lmul_enter:
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov ${num}d,${num}d
+ lea 2($num),%r10
+ mov %rsp,%r11
+ neg %r10
+ lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+2))
+ and \$-1024,%rsp # minimize TLB usage
+
+ mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
+ .Lmul_body:
+ mov $bp,%r12 # reassign $bp
+ ___
+ $bp="%r12";
+ $code.=<<___;
+ mov ($n0),$n0 # pull n0[0] value
+ mov ($bp),$m0 # m0=bp[0]
+ mov ($ap),%rax
+
+ xor $i,$i # i=0
+ xor $j,$j # j=0
+
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[0]
+ mov %rax,$lo0
+ mov ($np),%rax
+
+ imulq $lo0,$m1 # "tp[0]"*n0
+ mov %rdx,$hi0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$lo0 # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$hi1
+
+ lea 1($j),$j # j++
+ jmp .L1st_enter
+
+ .align 16
+ .L1st:
+ add %rax,$hi1
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
+ mov $lo0,$hi0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+ .L1st_enter:
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$hi0
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ lea 1($j),$j # j++
+ mov %rdx,$lo0
+
+ mulq $m1 # np[j]*m1
+ cmp $num,$j
+ jne .L1st
+
+ add %rax,$hi1
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+ mov $lo0,$hi0
+
+ xor %rdx,%rdx
+ add $hi0,$hi1
+ adc \$0,%rdx
+ mov $hi1,-8(%rsp,$num,8)
+ mov %rdx,(%rsp,$num,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+ jmp .Louter
+ .align 16
+ .Louter:
+ mov ($bp,$i,8),$m0 # m0=bp[i]
+ xor $j,$j # j=0
+ mov $n0,$m1
+ mov (%rsp),$lo0
+ mulq $m0 # ap[0]*bp[i]
+ add %rax,$lo0 # ap[0]*bp[i]+tp[0]
+ mov ($np),%rax
+ adc \$0,%rdx
+
+ imulq $lo0,$m1 # tp[0]*n0
+ mov %rdx,$hi0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$lo0 # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov 8(%rsp),$lo0 # tp[1]
+ mov %rdx,$hi1
+
+ lea 1($j),$j # j++
+ jmp .Linner_enter
+
+ .align 16
+ .Linner:
+ add %rax,$hi1
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
+ mov (%rsp,$j,8),$lo0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+ .Linner_enter:
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$hi0
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
+ mov %rdx,$hi0
+ adc \$0,$hi0
+ lea 1($j),$j # j++
+
+ mulq $m1 # np[j]*m1
+ cmp $num,$j
+ jne .Linner
+
+ add %rax,$hi1
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
+ mov (%rsp,$j,8),$lo0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+ xor %rdx,%rdx
+ add $hi0,$hi1
+ adc \$0,%rdx
+ add $lo0,$hi1 # pull upmost overflow bit
+ adc \$0,%rdx
+ mov $hi1,-8(%rsp,$num,8)
+ mov %rdx,(%rsp,$num,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+ cmp $num,$i
+ jb .Louter
+
+ xor $i,$i # i=0 and clear CF!
+ mov (%rsp),%rax # tp[0]
+ lea (%rsp),$ap # borrow ap for tp
+ mov $num,$j # j=num
+ jmp .Lsub
+ .align 16
+ .Lsub: sbb ($np,$i,8),%rax
+ mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov 8($ap,$i,8),%rax # tp[i+1]
+ lea 1($i),$i # i++
+ dec $j # doesn't affect CF!
+ jnz .Lsub
+
+ sbb \$0,%rax # handle upmost overflow bit
+ xor $i,$i
+ mov $num,$j # j=num
+ .align 16
+ .Lcopy: # copy or in-place refresh
+ mov (%rsp,$i,8),$ap
+ mov ($rp,$i,8),$np
+ xor $np,$ap # conditional select:
+ and %rax,$ap # ((ap ^ np) & %rax) ^ np
+ xor $np,$ap # ap = borrow?tp:rp
+ mov $i,(%rsp,$i,8) # zap temporary vector
+ mov $ap,($rp,$i,8) # rp[i]=tp[i]
+ lea 1($i),$i
+ sub \$1,$j
+ jnz .Lcopy
+
+ mov 8(%rsp,$num,8),%rsi # restore %rsp
+ mov \$1,%rax
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+ .Lmul_epilogue:
+ ret
+ .size bn_mul_mont,.-bn_mul_mont
+ ___
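The .Lcopy loop above writes the result without branching on the borrow: after the final sbb, %rax holds an all-ones or all-zeros mask, and ((tp ^ rp) & mask) ^ rp picks between the unreduced and reduced values. The same idiom in C, as a sketch:

    #include <stdint.h>

    /* Branchless select, as in .Lcopy: mask must be all-ones (pick a)
       or all-zeros (pick b); no secret-dependent branch is executed. */
    static inline uint64_t ct_select(uint64_t mask, uint64_t a, uint64_t b) {
        return ((a ^ b) & mask) ^ b;
    }
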
+ {{{
+ my @A=("%r10","%r11");
+ my @N=("%r13","%rdi");
+ $code.=<<___;
+ .type bn_mul4x_mont,\@function,6
+ .align 16
+ bn_mul4x_mont:
+ .Lmul4x_enter:
+ ___
+ $code.=<<___ if ($addx);
+ and \$0x80100,%r11d
+ cmp \$0x80100,%r11d
+ je .Lmulx4x_enter
+ ___
+ $code.=<<___;
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov ${num}d,${num}d
+ lea 4($num),%r10
+ mov %rsp,%r11
+ neg %r10
+ lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+4))
+ and \$-1024,%rsp # minimize TLB usage
+
+ mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
+ .Lmul4x_body:
+ mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
+ mov %rdx,%r12 # reassign $bp
+ ___
+ $bp="%r12";
+ $code.=<<___;
+ mov ($n0),$n0 # pull n0[0] value
+ mov ($bp),$m0 # m0=bp[0]
+ mov ($ap),%rax
+
+ xor $i,$i # i=0
+ xor $j,$j # j=0
+
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[0]
+ mov %rax,$A[0]
+ mov ($np),%rax
+
+ imulq $A[0],$m1 # "tp[0]"*n0
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[0]*m1
+ add %rax,$A[0] # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$N[1]
+
+ mulq $m0
+ add %rax,$A[1]
+ mov 8($np),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1
+ add %rax,$N[1]
+ mov 16($ap),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ lea 4($j),$j # j++
+ adc \$0,%rdx
+ mov $N[1],(%rsp)
+ mov %rdx,$N[0]
+ jmp .L1st4x
+ .align 16
+ .L1st4x:
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov 8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-8(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov 8($np,$j,8),%rax
+ adc \$0,%rdx
+ lea 4($j),$j # j++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov -16($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+ cmp $num,$j
+ jb .L1st4x
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ xor $N[1],$N[1]
+ add $A[0],$N[0]
+ adc \$0,$N[1]
+ mov $N[0],-8(%rsp,$j,8)
+ mov $N[1],(%rsp,$j,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+ .align 4
+ .Louter4x:
+ mov ($bp,$i,8),$m0 # m0=bp[i]
+ xor $j,$j # j=0
+ mov (%rsp),$A[0]
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[i]
+ add %rax,$A[0] # ap[0]*bp[i]+tp[0]
+ mov ($np),%rax
+ adc \$0,%rdx
+
+ imulq $A[0],$m1 # tp[0]*n0
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[0]*m1
+ add %rax,$A[0] # "$N[0]", discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov 8($np),%rax
+ adc \$0,%rdx
+ add 8(%rsp),$A[1] # +tp[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov 16($ap),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
+ lea 4($j),$j # j++
+ adc \$0,%rdx
+ mov $N[1],(%rsp) # tp[j-1]
+ mov %rdx,$N[0]
+ jmp .Linner4x
+ .align 16
+ .Linner4x:
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ add -8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov 8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[0],-8(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov 8($np,$j,8),%rax
+ adc \$0,%rdx
+ add 8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ lea 4($j),$j # j++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov -16($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[1],-32(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+ cmp $num,$j
+ jb .Linner4x
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov -16($np,$j,8),%rax
+ adc \$0,%rdx
+ add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[0],-24(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov -8($np,$j,8),%rax
+ adc \$0,%rdx
+ add -8(%rsp,$j,8),$A[1]
+ adc \$0,%rdx
+ lea 1($i),$i # i++
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[1],-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$N[0]
+
+ xor $N[1],$N[1]
+ add $A[0],$N[0]
+ adc \$0,$N[1]
+ add (%rsp,$num,8),$N[0] # pull upmost overflow bit
+ adc \$0,$N[1]
+ mov $N[0],-8(%rsp,$j,8)
+ mov $N[1],(%rsp,$j,8) # store upmost overflow bit
+
+ cmp $num,$i
+ jb .Louter4x
+ ___
+ {
+ my @ri=("%rax","%rdx",$m0,$m1);
+ $code.=<<___;
+ mov 16(%rsp,$num,8),$rp # restore $rp
+ mov 0(%rsp),@ri[0] # tp[0]
+ mov 8(%rsp),@ri[1] # tp[1]
+ shr \$2,$num # num/=4
+ lea (%rsp),$ap # borrow ap for tp
+ xor $i,$i # i=0 and clear CF!
+
+ sub 0($np),@ri[0]
+ mov 16($ap),@ri[2] # tp[2]
+ mov 24($ap),@ri[3] # tp[3]
+ sbb 8($np),@ri[1]
+ lea -1($num),$j # j=num/4-1
+ jmp .Lsub4x
+ .align 16
+ .Lsub4x:
+ mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 16($np,$i,8),@ri[2]
+ mov 32($ap,$i,8),@ri[0] # tp[i+1]
+ mov 40($ap,$i,8),@ri[1]
+ sbb 24($np,$i,8),@ri[3]
+ mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 32($np,$i,8),@ri[0]
+ mov 48($ap,$i,8),@ri[2]
+ mov 56($ap,$i,8),@ri[3]
+ sbb 40($np,$i,8),@ri[1]
+ lea 4($i),$i # i++
+ dec $j # doesn't affect CF!
+ jnz .Lsub4x
+
+ mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov 32($ap,$i,8),@ri[0] # load overflow bit
+ sbb 16($np,$i,8),@ri[2]
+ mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
+ sbb 24($np,$i,8),@ri[3]
+ mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
+
+ sbb \$0,@ri[0] # handle upmost overflow bit
+ mov @ri[0],%xmm0
+ punpcklqdq %xmm0,%xmm0 # extend mask to 128 bits
+ mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
+ xor $i,$i # i=0
+
+ mov $num,$j
+ pxor %xmm5,%xmm5
+ jmp .Lcopy4x
+ .align 16
+ .Lcopy4x: # copy or in-place refresh
+ movdqu (%rsp,$i),%xmm2
+ movdqu 16(%rsp,$i),%xmm4
+ movdqu ($rp,$i),%xmm1
+ movdqu 16($rp,$i),%xmm3
+ pxor %xmm1,%xmm2 # conditional select
+ pxor %xmm3,%xmm4
+ pand %xmm0,%xmm2
+ pand %xmm0,%xmm4
+ pxor %xmm1,%xmm2
+ pxor %xmm3,%xmm4
+ movdqu %xmm2,($rp,$i)
+ movdqu %xmm4,16($rp,$i)
+ movdqa %xmm5,(%rsp,$i) # zap temporary vectors
+ movdqa %xmm5,16(%rsp,$i)
+
+ lea 32($i),$i
+ dec $j
+ jnz .Lcopy4x
+
+ shl \$2,$num
+ ___
+ }
+ $code.=<<___;
+ mov 8(%rsp,$num,8),%rsi # restore %rsp
+ mov \$1,%rax
+ mov (%rsi),%r15
+ mov 8(%rsi),%r14
+ mov 16(%rsi),%r13
+ mov 24(%rsi),%r12
+ mov 32(%rsi),%rbp
+ mov 40(%rsi),%rbx
+ lea 48(%rsi),%rsp
+ .Lmul4x_epilogue:
+ ret
+ .size bn_mul4x_mont,.-bn_mul4x_mont
+ ___
+ }}}
+ {{{
+ ######################################################################
+ # void bn_sqr8x_mont(
+ my $rptr="%rdi"; # const BN_ULONG *rptr,
+ my $aptr="%rsi"; # const BN_ULONG *aptr,
+ my $bptr="%rdx"; # not used
+ my $nptr="%rcx"; # const BN_ULONG *nptr,
+ my $n0 ="%r8"; # const BN_ULONG *n0);
+ my $num ="%r9"; # int num, has to be divisible by 8
+
+ my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
+ my @A0=("%r10","%r11");
+ my @A1=("%r12","%r13");
+ my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
+
+ $code.=<<___ if ($addx);
+ .extern bn_sqrx8x_internal # see x86_64-mont5 module
+ ___
+ $code.=<<___;
+ .extern bn_sqr8x_internal # see x86_64-mont5 module
+
+ .type bn_sqr8x_mont,\@function,6
+ .align 32
+ bn_sqr8x_mont:
+ .Lsqr8x_enter:
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov ${num}d,%r10d
+ shl \$3,${num}d # convert $num to bytes
+ shl \$3+2,%r10 # 4*$num
+ neg $num
+
+ ##############################################################
+ # Ensure that the stack frame doesn't alias with $aptr modulo
+ # 4096. This is done to allow the memory disambiguation logic
+ # to do its job.
+ #
+ lea -64(%rsp,$num,4),%r11
+ mov ($n0),$n0 # *n0
+ sub $aptr,%r11
+ and \$4095,%r11
+ cmp %r11,%r10
+ jb .Lsqr8x_sp_alt
+ sub %r11,%rsp # align with $aptr
+ lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num)
+ jmp .Lsqr8x_sp_done
+
+ .align 32
+ .Lsqr8x_sp_alt:
+ lea 4096-64(,$num,4),%r10 # 4096-frame-4*$num
+ lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num)
+ sub %r10,%r11
+ mov \$0,%r10
+ cmovc %r10,%r11
+ sub %r11,%rsp
+ .Lsqr8x_sp_done:
+ and \$-64,%rsp
+ mov $num,%r10
+ neg $num
+
+ lea 64(%rsp,$num,2),%r11 # copy of modulus
+ mov $n0, 32(%rsp)
+ mov %rax, 40(%rsp) # save original %rsp
+ .Lsqr8x_body:
+
+ mov $num,$i
+ movq %r11, %xmm2 # save pointer to modulus copy
+ shr \$3+2,$i
+ mov OPENSSL_ia32cap_P+8(%rip),%eax
+ jmp .Lsqr8x_copy_n
+
+ .align 32
+ .Lsqr8x_copy_n:
+ movq 8*0($nptr),%xmm0
+ movq 8*1($nptr),%xmm1
+ movq 8*2($nptr),%xmm3
+ movq 8*3($nptr),%xmm4
+ lea 8*4($nptr),$nptr
+ movdqa %xmm0,16*0(%r11)
+ movdqa %xmm1,16*1(%r11)
+ movdqa %xmm3,16*2(%r11)
+ movdqa %xmm4,16*3(%r11)
+ lea 16*4(%r11),%r11
+ dec $i
+ jnz .Lsqr8x_copy_n
+
+ pxor %xmm0,%xmm0
+ movq $rptr,%xmm1 # save $rptr
+ movq %r10, %xmm3 # -$num
+ ___
+ $code.=<<___ if ($addx);
+ and \$0x80100,%eax
+ cmp \$0x80100,%eax
+ jne .Lsqr8x_nox
+
+ call bn_sqrx8x_internal # see x86_64-mont5 module
+
+ pxor %xmm0,%xmm0
+ lea 48(%rsp),%rax
+ lea 64(%rsp,$num,2),%rdx
+ shr \$3+2,$num
+ mov 40(%rsp),%rsi # restore %rsp
+ jmp .Lsqr8x_zero
+
+ .align 32
+ .Lsqr8x_nox:
+ ___
+ $code.=<<___;
+ call bn_sqr8x_internal # see x86_64-mont5 module
+
+ pxor %xmm0,%xmm0
+ lea 48(%rsp),%rax
+ lea 64(%rsp,$num,2),%rdx
+ shr \$3+2,$num
+ mov 40(%rsp),%rsi # restore %rsp
+ jmp .Lsqr8x_zero
+
+ .align 32
+ .Lsqr8x_zero:
+ movdqa %xmm0,16*0(%rax) # wipe t
+ movdqa %xmm0,16*1(%rax)
+ movdqa %xmm0,16*2(%rax)
+ movdqa %xmm0,16*3(%rax)
+ lea 16*4(%rax),%rax
+ movdqa %xmm0,16*0(%rdx) # wipe n
+ movdqa %xmm0,16*1(%rdx)
+ movdqa %xmm0,16*2(%rdx)
+ movdqa %xmm0,16*3(%rdx)
+ lea 16*4(%rdx),%rdx
+ dec $num
+ jnz .Lsqr8x_zero
+
+ mov \$1,%rax
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
+ .Lsqr8x_epilogue:
+ ret
+ .size bn_sqr8x_mont,.-bn_sqr8x_mont
+ ___
+ }}}
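bn_sqr8x_mont defers the heavy lifting to bn_sqr8x_internal in the x86_64-mont5 module. The reason a dedicated squaring path pays off (see the July 2011 note above) is that each cross product a_i*a_j with i < j occurs twice in a square, so it can be computed once and doubled. A generic C sketch of that identity, assuming unsigned __int128 and bearing no relation to the actual unrolling:

    #include <stdint.h>
    #include <stddef.h>

    /* Illustrative only: r[0..2n-1] = a^2.  Cross products are computed
       once, then doubled, then the diagonal terms a_i^2 are added. */
    static void sqr_sketch(uint64_t *r, const uint64_t *a, size_t n) {
        for (size_t k = 0; k < 2 * n; k++) r[k] = 0;
        for (size_t i = 0; i < n; i++) {        /* cross products, once each */
            uint64_t carry = 0;
            for (size_t j = i + 1; j < n; j++) {
                unsigned __int128 t =
                    (unsigned __int128)a[i] * a[j] + r[i + j] + carry;
                r[i + j] = (uint64_t)t;
                carry = (uint64_t)(t >> 64);
            }
            r[i + n] = carry;
        }
        uint64_t carry = 0;                     /* double: r = 2*r */
        for (size_t k = 0; k < 2 * n; k++) {
            uint64_t hi = r[k] >> 63;
            r[k] = (r[k] << 1) | carry;
            carry = hi;
        }
        carry = 0;                              /* add diagonal a_i^2 */
        for (size_t i = 0; i < n; i++) {
            unsigned __int128 sq = (unsigned __int128)a[i] * a[i];
            unsigned __int128 s =
                (unsigned __int128)r[2 * i] + (uint64_t)sq + carry;
            r[2 * i] = (uint64_t)s;
            s = (unsigned __int128)r[2 * i + 1] + (uint64_t)(sq >> 64)
                + (uint64_t)(s >> 64);
            r[2 * i + 1] = (uint64_t)s;
            carry = (uint64_t)(s >> 64);
        }
    }
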
+
+ if ($addx) {{{
+ my $bp="%rdx"; # original value
+
+ $code.=<<___;
+ .type bn_mulx4x_mont,\@function,6
+ .align 32
+ bn_mulx4x_mont:
+ .Lmulx4x_enter:
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ shl \$3,${num}d # convert $num to bytes
+ .byte 0x67
+ xor %r10,%r10
+ sub $num,%r10 # -$num
+ mov ($n0),$n0 # *n0
+ lea -72(%rsp,%r10),%rsp # alloca(frame+$num+8)
+ lea ($bp,$num),%r10
+ and \$-128,%rsp
+ ##############################################################
+ # Stack layout
+ # +0 num
+ # +8 off-loaded &b[i]
+ # +16 end of b[num]
+ # +24 saved n0
+ # +32 saved rp
+ # +40 saved %rsp
+ # +48 inner counter
+ # +56
+ # +64 tmp[num+1]
+ #
+ mov $num,0(%rsp) # save $num
+ shr \$5,$num
+ mov %r10,16(%rsp) # end of b[num]
+ sub \$1,$num
+ mov $n0, 24(%rsp) # save *n0
+ mov $rp, 32(%rsp) # save $rp
+ mov %rax,40(%rsp) # save original %rsp
+ mov $num,48(%rsp) # inner counter
+ jmp .Lmulx4x_body
+
+ .align 32
+ .Lmulx4x_body:
+ ___
+ my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)=
+ ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
+ my $rptr=$bptr;
+ $code.=<<___;
+ lea 8($bp),$bptr
+ mov ($bp),%rdx # b[0], $bp==%rdx actually
+ lea 64+32(%rsp),$tptr
+ mov %rdx,$bi
+
+ mulx 0*8($aptr),$mi,%rax # a[0]*b[0]
+ mulx 1*8($aptr),%r11,%r14 # a[1]*b[0]
+ add %rax,%r11
+ mov $bptr,8(%rsp) # off-load &b[i]
+ mulx 2*8($aptr),%r12,%r13 # ...
+ adc %r14,%r12
+ adc \$0,%r13
+
+ mov $mi,$bptr # borrow $bptr
+ imulq 24(%rsp),$mi # "t[0]"*n0
+ xor $zero,$zero # cf=0, of=0
+
+ mulx 3*8($aptr),%rax,%r14
+ mov $mi,%rdx
+ lea 4*8($aptr),$aptr
+ adcx %rax,%r13
+ adcx $zero,%r14 # cf=0
+
+ mulx 0*8($nptr),%rax,%r10
+ adcx %rax,$bptr # discarded
+ adox %r11,%r10
+ mulx 1*8($nptr),%rax,%r11
+ adcx %rax,%r10
+ adox %r12,%r11
+ .byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 # mulx 2*8($nptr),%rax,%r12
+ mov 48(%rsp),$bptr # counter value
+ mov %r10,-4*8($tptr)
+ adcx %rax,%r11
+ adox %r13,%r12
+ mulx 3*8($nptr),%rax,%r15
+ mov $bi,%rdx
+ mov %r11,-3*8($tptr)
+ adcx %rax,%r12
+ adox $zero,%r15 # of=0
+ lea 4*8($nptr),$nptr
+ mov %r12,-2*8($tptr)
+
+ jmp .Lmulx4x_1st
+
+ .align 32
+ .Lmulx4x_1st:
+ adcx $zero,%r15 # cf=0, modulo-scheduled
+ mulx 0*8($aptr),%r10,%rax # a[4]*b[0]
+ adcx %r14,%r10
+ mulx 1*8($aptr),%r11,%r14 # a[5]*b[0]
+ adcx %rax,%r11
+ mulx 2*8($aptr),%r12,%rax # ...
+ adcx %r14,%r12
+ mulx 3*8($aptr),%r13,%r14
+ .byte 0x67,0x67
+ mov $mi,%rdx
+ adcx %rax,%r13
+ adcx $zero,%r14 # cf=0
+ lea 4*8($aptr),$aptr
+ lea 4*8($tptr),$tptr
+
+ adox %r15,%r10
+ mulx 0*8($nptr),%rax,%r15
+ adcx %rax,%r10
+ adox %r15,%r11
+ mulx 1*8($nptr),%rax,%r15
+ adcx %rax,%r11
+ adox %r15,%r12
+ mulx 2*8($nptr),%rax,%r15
+ mov %r10,-5*8($tptr)
+ adcx %rax,%r12
+ mov %r11,-4*8($tptr)
+ adox %r15,%r13
+ mulx 3*8($nptr),%rax,%r15
+ mov $bi,%rdx
+ mov %r12,-3*8($tptr)
+ adcx %rax,%r13
+ adox $zero,%r15
+ lea 4*8($nptr),$nptr
+ mov %r13,-2*8($tptr)
+
+ dec $bptr # of=0, pass cf
+ jnz .Lmulx4x_1st
+
+ mov 0(%rsp),$num # load num
+ mov 8(%rsp),$bptr # re-load &b[i]
+ adc $zero,%r15 # modulo-scheduled
+ add %r15,%r14
+ sbb %r15,%r15 # top-most carry
+ mov %r14,-1*8($tptr)
+ jmp .Lmulx4x_outer
+
+ .align 32
+ .Lmulx4x_outer:
+ mov ($bptr),%rdx # b[i]
+ lea 8($bptr),$bptr # b++
+ sub $num,$aptr # rewind $aptr
+ mov %r15,($tptr) # save top-most carry
+ lea 64+4*8(%rsp),$tptr
+ sub $num,$nptr # rewind $nptr
+
+ mulx 0*8($aptr),$mi,%r11 # a[0]*b[i]
+ xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0
+ mov %rdx,$bi
+ mulx 1*8($aptr),%r14,%r12 # a[1]*b[i]
+ adox -4*8($tptr),$mi
+ adcx %r14,%r11
+ mulx 2*8($aptr),%r15,%r13 # ...
+ adox -3*8($tptr),%r11
+ adcx %r15,%r12
+ adox $zero,%r12
+ adcx $zero,%r13
+
+ mov $bptr,8(%rsp) # off-load &b[i]
+ .byte 0x67
+ mov $mi,%r15
+ imulq 24(%rsp),$mi # "t[0]"*n0
+ xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0
+
+ mulx 3*8($aptr),%rax,%r14
+ mov $mi,%rdx
+ adox -2*8($tptr),%r12
+ adcx %rax,%r13
+ adox -1*8($tptr),%r13
+ adcx $zero,%r14
+ lea 4*8($aptr),$aptr
+ adox $zero,%r14
+
+ mulx 0*8($nptr),%rax,%r10
+ adcx %rax,%r15 # discarded
+ adox %r11,%r10
+ mulx 1*8($nptr),%rax,%r11
+ adcx %rax,%r10
+ adox %r12,%r11
+ mulx 2*8($nptr),%rax,%r12
+ mov %r10,-4*8($tptr)
+ adcx %rax,%r11
+ adox %r13,%r12
+ mulx 3*8($nptr),%rax,%r15
+ mov $bi,%rdx
+ mov %r11,-3*8($tptr)
+ lea 4*8($nptr),$nptr
+ adcx %rax,%r12
+ adox $zero,%r15 # of=0
+ mov 48(%rsp),$bptr # counter value
+ mov %r12,-2*8($tptr)
+
+ jmp .Lmulx4x_inner
+
+ .align 32
+ .Lmulx4x_inner:
+ mulx 0*8($aptr),%r10,%rax # a[4]*b[i]
+ adcx $zero,%r15 # cf=0, modulo-scheduled
+ adox %r14,%r10
+ mulx 1*8($aptr),%r11,%r14 # a[5]*b[i]
+ adcx 0*8($tptr),%r10
+ adox %rax,%r11
+ mulx 2*8($aptr),%r12,%rax # ...
+ adcx 1*8($tptr),%r11
+ adox %r14,%r12
+ mulx 3*8($aptr),%r13,%r14
+ mov $mi,%rdx
+ adcx 2*8($tptr),%r12
+ adox %rax,%r13
+ adcx 3*8($tptr),%r13
+ adox $zero,%r14 # of=0
+ lea 4*8($aptr),$aptr
+ lea 4*8($tptr),$tptr
+ adcx $zero,%r14 # cf=0
+
+ adox %r15,%r10
+ mulx 0*8($nptr),%rax,%r15
+ adcx %rax,%r10
+ adox %r15,%r11
+ mulx 1*8($nptr),%rax,%r15
+ adcx %rax,%r11
+ adox %r15,%r12
+ mulx 2*8($nptr),%rax,%r15
+ mov %r10,-5*8($tptr)
+ adcx %rax,%r12
+ adox %r15,%r13
+ mulx 3*8($nptr),%rax,%r15
+ mov $bi,%rdx
+ mov %r11,-4*8($tptr)
+ mov %r12,-3*8($tptr)
+ adcx %rax,%r13
+ adox $zero,%r15
+ lea 4*8($nptr),$nptr
+ mov %r13,-2*8($tptr)
+
+ dec $bptr # of=0, pass cf
+ jnz .Lmulx4x_inner
+
+ mov 0(%rsp),$num # load num
+ mov 8(%rsp),$bptr # re-load &b[i]
+ adc $zero,%r15 # modulo-scheduled
+ sub 0*8($tptr),$zero # pull top-most carry
+ adc %r15,%r14
+ mov -8($nptr),$mi
+ sbb %r15,%r15 # top-most carry
+ mov %r14,-1*8($tptr)
+
+ cmp 16(%rsp),$bptr
+ jne .Lmulx4x_outer
+
+ sub %r14,$mi # compare top-most words
+ sbb $mi,$mi
+ or $mi,%r15
+
+ neg $num
+ xor %rdx,%rdx
+ mov 32(%rsp),$rptr # restore rp
+ lea 64(%rsp),$tptr
+
+ pxor %xmm0,%xmm0
+ mov 0*8($nptr,$num),%r8
+ mov 1*8($nptr,$num),%r9
+ neg %r8
+ jmp .Lmulx4x_sub_entry
+
+ .align 32
+ .Lmulx4x_sub:
+ mov 0*8($nptr,$num),%r8
+ mov 1*8($nptr,$num),%r9
+ not %r8
+ .Lmulx4x_sub_entry:
+ mov 2*8($nptr,$num),%r10
+ not %r9
+ and %r15,%r8
+ mov 3*8($nptr,$num),%r11
+ not %r10
+ and %r15,%r9
+ not %r11
+ and %r15,%r10
+ and %r15,%r11
+
+ neg %rdx # mov %rdx,%cf
+ adc 0*8($tptr),%r8
+ adc 1*8($tptr),%r9
+ movdqa %xmm0,($tptr)
+ adc 2*8($tptr),%r10
+ adc 3*8($tptr),%r11
+ movdqa %xmm0,16($tptr)
+ lea 4*8($tptr),$tptr
+ sbb %rdx,%rdx # mov %cf,%rdx
+
+ mov %r8,0*8($rptr)
+ mov %r9,1*8($rptr)
+ mov %r10,2*8($rptr)
+ mov %r11,3*8($rptr)
+ lea 4*8($rptr),$rptr
+
+ add \$32,$num
+ jnz .Lmulx4x_sub
+
+ mov 40(%rsp),%rsi # restore %rsp
+ mov \$1,%rax
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
+ .Lmulx4x_epilogue:
+ ret
+ .size bn_mulx4x_mont,.-bn_mulx4x_mont
+ ___
+ }}}
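bn_mulx4x_mont above leans on a BMI2/ADX property: MULX leaves the flags untouched, while ADCX and ADOX each propagate a separate carry flag (CF and OF), so the product chain and the reduction chain can be interleaved without saving flags. A rough C rendering of one multiply-accumulate row using the ADX intrinsics; this is illustrative only, the real code is modulo-scheduled and fused with the reduction:

    #include <immintrin.h>

    /* Illustrative: acc[0..4] += a[0..3]*b using two independent carry
       chains. A compiler targeting BMI2+ADX may map the two
       _addcarryx_u64 chains onto ADCX (CF) and ADOX (OF) and interleave
       them, as the asm does by hand in .Lmulx4x_1st. */
    static unsigned char mac4(unsigned long long acc[5],
                              const unsigned long long a[4],
                              unsigned long long b) {
        unsigned long long lo[4], hi[4];
        for (int i = 0; i < 4; i++)
            lo[i] = _mulx_u64(a[i], b, &hi[i]);  /* MULX: flags untouched */
        unsigned char c1 = 0, c2 = 0;
        for (int i = 0; i < 4; i++)              /* chain 1: low words */
            c1 = _addcarryx_u64(c1, acc[i], lo[i], &acc[i]);
        c1 = _addcarryx_u64(c1, acc[4], 0, &acc[4]);
        for (int i = 0; i < 4; i++)              /* chain 2: high words */
            c2 = _addcarryx_u64(c2, acc[i + 1], hi[i], &acc[i + 1]);
        return (unsigned char)(c1 + c2);         /* carry out of acc[4] */
    }
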
+ $code.=<<___;
+ .asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+ .align 16
+ ___
+
+ # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+ # CONTEXT *context,DISPATCHER_CONTEXT *disp)
+ if ($win64) {
+ $rec="%rcx";
+ $frame="%rdx";
+ $context="%r8";
+ $disp="%r9";
+
+ $code.=<<___;
+ .extern __imp_RtlVirtualUnwind
+ .type mul_handler,\@abi-omnipotent
+ .align 16
+ mul_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # end of prologue label
+ cmp %r10,%rbx # context->Rip<end of prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ mov 192($context),%r10 # pull $num
+ mov 8(%rax,%r10,8),%rax # pull saved stack pointer
+ lea 48(%rax),%rax
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+ jmp .Lcommon_seh_tail
+ .size mul_handler,.-mul_handler
+
+ .type sqr_handler,\@abi-omnipotent
+ .align 16
+ sqr_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # end of prologue label
+ cmp %r10,%rbx # context->Rip<.Lsqr_body
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue
+ jae .Lcommon_seh_tail
+
+ mov 40(%rax),%rax # pull saved stack pointer
+
+ mov -8(%rax),%rbx
+ mov -16(%rax),%rbp
+ mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
+ mov %rbx,144($context) # restore context->Rbx
+ mov %rbp,160($context) # restore context->Rbp
+ mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
+
+ .Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+ .size sqr_handler,.-sqr_handler
+
+ .section .pdata
+ .align 4
+ .rva .LSEH_begin_bn_mul_mont
+ .rva .LSEH_end_bn_mul_mont
+ .rva .LSEH_info_bn_mul_mont
+
+ .rva .LSEH_begin_bn_mul4x_mont
+ .rva .LSEH_end_bn_mul4x_mont
+ .rva .LSEH_info_bn_mul4x_mont
+
+ .rva .LSEH_begin_bn_sqr8x_mont
+ .rva .LSEH_end_bn_sqr8x_mont
+ .rva .LSEH_info_bn_sqr8x_mont
+ ___
+ $code.=<<___ if ($addx);
+ .rva .LSEH_begin_bn_mulx4x_mont
+ .rva .LSEH_end_bn_mulx4x_mont
+ .rva .LSEH_info_bn_mulx4x_mont
+ ___
+ $code.=<<___;
+ .section .xdata
+ .align 8
+ .LSEH_info_bn_mul_mont:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
+ .LSEH_info_bn_mul4x_mont:
+ .byte 9,0,0,0
+ .rva mul_handler
+ .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
+ .LSEH_info_bn_sqr8x_mont:
+ .byte 9,0,0,0
+ .rva sqr_handler
+ .rva .Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[]
+ ___
+ $code.=<<___ if ($addx);
+ .LSEH_info_bn_mulx4x_mont:
+ .byte 9,0,0,0
+ .rva sqr_handler
+ .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
+ ___
+ }
+
+ print $code;
+ close STDOUT;