ring-native 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (261)
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/Gemfile +3 -0
  4. data/README.md +22 -0
  5. data/Rakefile +1 -0
  6. data/ext/ring/extconf.rb +29 -0
  7. data/lib/ring/native.rb +8 -0
  8. data/lib/ring/native/version.rb +5 -0
  9. data/ring-native.gemspec +25 -0
  10. data/vendor/ring/BUILDING.md +40 -0
  11. data/vendor/ring/Cargo.toml +43 -0
  12. data/vendor/ring/LICENSE +185 -0
  13. data/vendor/ring/Makefile +35 -0
  14. data/vendor/ring/PORTING.md +163 -0
  15. data/vendor/ring/README.md +113 -0
  16. data/vendor/ring/STYLE.md +197 -0
  17. data/vendor/ring/appveyor.yml +27 -0
  18. data/vendor/ring/build.rs +108 -0
  19. data/vendor/ring/crypto/aes/aes.c +1142 -0
  20. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +25 -0
  21. data/vendor/ring/crypto/aes/aes_test.cc +93 -0
  22. data/vendor/ring/crypto/aes/asm/aes-586.pl +2368 -0
  23. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +1249 -0
  24. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +2246 -0
  25. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +1318 -0
  26. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +2084 -0
  27. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +675 -0
  28. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +1364 -0
  29. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +1565 -0
  30. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +841 -0
  31. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +1116 -0
  32. data/vendor/ring/crypto/aes/internal.h +87 -0
  33. data/vendor/ring/crypto/aes/mode_wrappers.c +61 -0
  34. data/vendor/ring/crypto/bn/add.c +394 -0
  35. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +694 -0
  36. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +1503 -0
  37. data/vendor/ring/crypto/bn/asm/bn-586.pl +774 -0
  38. data/vendor/ring/crypto/bn/asm/co-586.pl +287 -0
  39. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +1882 -0
  40. data/vendor/ring/crypto/bn/asm/x86-mont.pl +592 -0
  41. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +599 -0
  42. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +1393 -0
  43. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +3507 -0
  44. data/vendor/ring/crypto/bn/bn.c +352 -0
  45. data/vendor/ring/crypto/bn/bn_asn1.c +74 -0
  46. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +25 -0
  47. data/vendor/ring/crypto/bn/bn_test.cc +1696 -0
  48. data/vendor/ring/crypto/bn/cmp.c +200 -0
  49. data/vendor/ring/crypto/bn/convert.c +433 -0
  50. data/vendor/ring/crypto/bn/ctx.c +311 -0
  51. data/vendor/ring/crypto/bn/div.c +594 -0
  52. data/vendor/ring/crypto/bn/exponentiation.c +1335 -0
  53. data/vendor/ring/crypto/bn/gcd.c +711 -0
  54. data/vendor/ring/crypto/bn/generic.c +1019 -0
  55. data/vendor/ring/crypto/bn/internal.h +316 -0
  56. data/vendor/ring/crypto/bn/montgomery.c +516 -0
  57. data/vendor/ring/crypto/bn/mul.c +888 -0
  58. data/vendor/ring/crypto/bn/prime.c +829 -0
  59. data/vendor/ring/crypto/bn/random.c +334 -0
  60. data/vendor/ring/crypto/bn/rsaz_exp.c +262 -0
  61. data/vendor/ring/crypto/bn/rsaz_exp.h +53 -0
  62. data/vendor/ring/crypto/bn/shift.c +276 -0
  63. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +25 -0
  64. data/vendor/ring/crypto/bytestring/bytestring_test.cc +421 -0
  65. data/vendor/ring/crypto/bytestring/cbb.c +399 -0
  66. data/vendor/ring/crypto/bytestring/cbs.c +227 -0
  67. data/vendor/ring/crypto/bytestring/internal.h +46 -0
  68. data/vendor/ring/crypto/chacha/chacha_generic.c +140 -0
  69. data/vendor/ring/crypto/chacha/chacha_vec.c +323 -0
  70. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +1447 -0
  71. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +153 -0
  72. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +25 -0
  73. data/vendor/ring/crypto/cipher/e_aes.c +390 -0
  74. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +208 -0
  75. data/vendor/ring/crypto/cipher/internal.h +173 -0
  76. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +543 -0
  77. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +9 -0
  78. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +475 -0
  79. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +23 -0
  80. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +422 -0
  81. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +484 -0
  82. data/vendor/ring/crypto/cipher/test/cipher_test.txt +100 -0
  83. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +25 -0
  84. data/vendor/ring/crypto/constant_time_test.c +304 -0
  85. data/vendor/ring/crypto/cpu-arm-asm.S +32 -0
  86. data/vendor/ring/crypto/cpu-arm.c +199 -0
  87. data/vendor/ring/crypto/cpu-intel.c +261 -0
  88. data/vendor/ring/crypto/crypto.c +151 -0
  89. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +2118 -0
  90. data/vendor/ring/crypto/curve25519/curve25519.c +4888 -0
  91. data/vendor/ring/crypto/curve25519/x25519_test.cc +128 -0
  92. data/vendor/ring/crypto/digest/md32_common.h +181 -0
  93. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +2725 -0
  94. data/vendor/ring/crypto/ec/ec.c +193 -0
  95. data/vendor/ring/crypto/ec/ec_curves.c +61 -0
  96. data/vendor/ring/crypto/ec/ec_key.c +228 -0
  97. data/vendor/ring/crypto/ec/ec_montgomery.c +114 -0
  98. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +25 -0
  99. data/vendor/ring/crypto/ec/internal.h +243 -0
  100. data/vendor/ring/crypto/ec/oct.c +253 -0
  101. data/vendor/ring/crypto/ec/p256-64.c +1794 -0
  102. data/vendor/ring/crypto/ec/p256-x86_64-table.h +9548 -0
  103. data/vendor/ring/crypto/ec/p256-x86_64.c +509 -0
  104. data/vendor/ring/crypto/ec/simple.c +1007 -0
  105. data/vendor/ring/crypto/ec/util-64.c +183 -0
  106. data/vendor/ring/crypto/ec/wnaf.c +508 -0
  107. data/vendor/ring/crypto/ecdh/ecdh.c +155 -0
  108. data/vendor/ring/crypto/ecdsa/ecdsa.c +304 -0
  109. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +193 -0
  110. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +25 -0
  111. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +327 -0
  112. data/vendor/ring/crypto/header_removed.h +17 -0
  113. data/vendor/ring/crypto/internal.h +495 -0
  114. data/vendor/ring/crypto/libring.Windows.vcxproj +101 -0
  115. data/vendor/ring/crypto/mem.c +98 -0
  116. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +1045 -0
  117. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +517 -0
  118. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +1393 -0
  119. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +1741 -0
  120. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +422 -0
  121. data/vendor/ring/crypto/modes/ctr.c +226 -0
  122. data/vendor/ring/crypto/modes/gcm.c +1206 -0
  123. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +25 -0
  124. data/vendor/ring/crypto/modes/gcm_test.c +348 -0
  125. data/vendor/ring/crypto/modes/internal.h +299 -0
  126. data/vendor/ring/crypto/perlasm/arm-xlate.pl +170 -0
  127. data/vendor/ring/crypto/perlasm/readme +100 -0
  128. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +1164 -0
  129. data/vendor/ring/crypto/perlasm/x86asm.pl +292 -0
  130. data/vendor/ring/crypto/perlasm/x86gas.pl +263 -0
  131. data/vendor/ring/crypto/perlasm/x86masm.pl +200 -0
  132. data/vendor/ring/crypto/perlasm/x86nasm.pl +187 -0
  133. data/vendor/ring/crypto/poly1305/poly1305.c +331 -0
  134. data/vendor/ring/crypto/poly1305/poly1305_arm.c +301 -0
  135. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +2015 -0
  136. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +25 -0
  137. data/vendor/ring/crypto/poly1305/poly1305_test.cc +80 -0
  138. data/vendor/ring/crypto/poly1305/poly1305_test.txt +52 -0
  139. data/vendor/ring/crypto/poly1305/poly1305_vec.c +892 -0
  140. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +75 -0
  141. data/vendor/ring/crypto/rand/internal.h +32 -0
  142. data/vendor/ring/crypto/rand/rand.c +189 -0
  143. data/vendor/ring/crypto/rand/urandom.c +219 -0
  144. data/vendor/ring/crypto/rand/windows.c +56 -0
  145. data/vendor/ring/crypto/refcount_c11.c +66 -0
  146. data/vendor/ring/crypto/refcount_lock.c +53 -0
  147. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +25 -0
  148. data/vendor/ring/crypto/refcount_test.c +58 -0
  149. data/vendor/ring/crypto/rsa/blinding.c +462 -0
  150. data/vendor/ring/crypto/rsa/internal.h +108 -0
  151. data/vendor/ring/crypto/rsa/padding.c +300 -0
  152. data/vendor/ring/crypto/rsa/rsa.c +450 -0
  153. data/vendor/ring/crypto/rsa/rsa_asn1.c +261 -0
  154. data/vendor/ring/crypto/rsa/rsa_impl.c +944 -0
  155. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +25 -0
  156. data/vendor/ring/crypto/rsa/rsa_test.cc +437 -0
  157. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +436 -0
  158. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +2390 -0
  159. data/vendor/ring/crypto/sha/asm/sha256-586.pl +1275 -0
  160. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +735 -0
  161. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +14 -0
  162. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +14 -0
  163. data/vendor/ring/crypto/sha/asm/sha512-586.pl +911 -0
  164. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +666 -0
  165. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +14 -0
  166. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +14 -0
  167. data/vendor/ring/crypto/sha/sha1.c +271 -0
  168. data/vendor/ring/crypto/sha/sha256.c +204 -0
  169. data/vendor/ring/crypto/sha/sha512.c +355 -0
  170. data/vendor/ring/crypto/test/file_test.cc +326 -0
  171. data/vendor/ring/crypto/test/file_test.h +181 -0
  172. data/vendor/ring/crypto/test/malloc.cc +150 -0
  173. data/vendor/ring/crypto/test/scoped_types.h +95 -0
  174. data/vendor/ring/crypto/test/test.Windows.vcxproj +35 -0
  175. data/vendor/ring/crypto/test/test_util.cc +46 -0
  176. data/vendor/ring/crypto/test/test_util.h +41 -0
  177. data/vendor/ring/crypto/thread_none.c +55 -0
  178. data/vendor/ring/crypto/thread_pthread.c +165 -0
  179. data/vendor/ring/crypto/thread_test.Windows.vcxproj +25 -0
  180. data/vendor/ring/crypto/thread_test.c +200 -0
  181. data/vendor/ring/crypto/thread_win.c +282 -0
  182. data/vendor/ring/examples/checkdigest.rs +103 -0
  183. data/vendor/ring/include/openssl/aes.h +121 -0
  184. data/vendor/ring/include/openssl/arm_arch.h +129 -0
  185. data/vendor/ring/include/openssl/base.h +156 -0
  186. data/vendor/ring/include/openssl/bn.h +794 -0
  187. data/vendor/ring/include/openssl/buffer.h +18 -0
  188. data/vendor/ring/include/openssl/bytestring.h +235 -0
  189. data/vendor/ring/include/openssl/chacha.h +37 -0
  190. data/vendor/ring/include/openssl/cmac.h +76 -0
  191. data/vendor/ring/include/openssl/cpu.h +184 -0
  192. data/vendor/ring/include/openssl/crypto.h +43 -0
  193. data/vendor/ring/include/openssl/curve25519.h +88 -0
  194. data/vendor/ring/include/openssl/ec.h +225 -0
  195. data/vendor/ring/include/openssl/ec_key.h +129 -0
  196. data/vendor/ring/include/openssl/ecdh.h +110 -0
  197. data/vendor/ring/include/openssl/ecdsa.h +156 -0
  198. data/vendor/ring/include/openssl/err.h +201 -0
  199. data/vendor/ring/include/openssl/mem.h +101 -0
  200. data/vendor/ring/include/openssl/obj_mac.h +71 -0
  201. data/vendor/ring/include/openssl/opensslfeatures.h +68 -0
  202. data/vendor/ring/include/openssl/opensslv.h +18 -0
  203. data/vendor/ring/include/openssl/ossl_typ.h +18 -0
  204. data/vendor/ring/include/openssl/poly1305.h +51 -0
  205. data/vendor/ring/include/openssl/rand.h +70 -0
  206. data/vendor/ring/include/openssl/rsa.h +399 -0
  207. data/vendor/ring/include/openssl/thread.h +133 -0
  208. data/vendor/ring/include/openssl/type_check.h +71 -0
  209. data/vendor/ring/mk/Common.props +63 -0
  210. data/vendor/ring/mk/Windows.props +42 -0
  211. data/vendor/ring/mk/WindowsTest.props +18 -0
  212. data/vendor/ring/mk/appveyor.bat +62 -0
  213. data/vendor/ring/mk/bottom_of_makefile.mk +54 -0
  214. data/vendor/ring/mk/ring.mk +266 -0
  215. data/vendor/ring/mk/top_of_makefile.mk +214 -0
  216. data/vendor/ring/mk/travis.sh +40 -0
  217. data/vendor/ring/mk/update-travis-yml.py +229 -0
  218. data/vendor/ring/ring.sln +153 -0
  219. data/vendor/ring/src/aead.rs +682 -0
  220. data/vendor/ring/src/agreement.rs +248 -0
  221. data/vendor/ring/src/c.rs +129 -0
  222. data/vendor/ring/src/constant_time.rs +37 -0
  223. data/vendor/ring/src/der.rs +96 -0
  224. data/vendor/ring/src/digest.rs +690 -0
  225. data/vendor/ring/src/digest_tests.txt +57 -0
  226. data/vendor/ring/src/ecc.rs +28 -0
  227. data/vendor/ring/src/ecc_build.rs +279 -0
  228. data/vendor/ring/src/ecc_curves.rs +117 -0
  229. data/vendor/ring/src/ed25519_tests.txt +2579 -0
  230. data/vendor/ring/src/exe_tests.rs +46 -0
  231. data/vendor/ring/src/ffi.rs +29 -0
  232. data/vendor/ring/src/file_test.rs +187 -0
  233. data/vendor/ring/src/hkdf.rs +153 -0
  234. data/vendor/ring/src/hkdf_tests.txt +59 -0
  235. data/vendor/ring/src/hmac.rs +414 -0
  236. data/vendor/ring/src/hmac_tests.txt +97 -0
  237. data/vendor/ring/src/input.rs +312 -0
  238. data/vendor/ring/src/lib.rs +41 -0
  239. data/vendor/ring/src/pbkdf2.rs +265 -0
  240. data/vendor/ring/src/pbkdf2_tests.txt +113 -0
  241. data/vendor/ring/src/polyfill.rs +57 -0
  242. data/vendor/ring/src/rand.rs +28 -0
  243. data/vendor/ring/src/signature.rs +314 -0
  244. data/vendor/ring/third-party/NIST/README.md +9 -0
  245. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +263 -0
  246. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +309 -0
  247. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +267 -0
  248. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +263 -0
  249. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +309 -0
  250. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +267 -0
  251. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +263 -0
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +309 -0
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +267 -0
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +519 -0
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +309 -0
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +523 -0
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +519 -0
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +309 -0
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +523 -0
  260. data/vendor/ring/third-party/NIST/sha256sums.txt +1 -0
  261. metadata +333 -0
data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl
@@ -0,0 +1,3507 @@
+ #!/usr/bin/env perl
+
+ # ====================================================================
+ # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+ # project. The module is, however, dual licensed under OpenSSL and
+ # CRYPTOGAMS licenses depending on where you obtain it. For further
+ # details see http://www.openssl.org/~appro/cryptogams/.
+ # ====================================================================
+
+ # August 2011.
+ #
+ # Companion to x86_64-mont.pl that optimizes cache-timing attack
+ # countermeasures. The subroutines are produced by replacing bp[i]
+ # references in their x86_64-mont.pl counterparts with cache-neutral
+ # references to powers table computed in BN_mod_exp_mont_consttime.
+ # In addition subroutine that scatters elements of the powers table
+ # is implemented, so that scatter-/gathering can be tuned without
+ # bn_exp.c modifications.
+
+ # August 2013.
+ #
+ # Add MULX/AD*X code paths and additional interfaces to optimize for
+ # branch prediction unit. For input lengths that are multiples of 8
+ # the np argument is not just modulus value, but one interleaved
+ # with 0. This is to optimize post-condition...
+
+ $flavour = shift;
+ $output = shift;
+ if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+ $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+ die "can't locate x86_64-xlate.pl";
+
+ open OUT,"| \"$^X\" $xlate $flavour $output";
+ *STDOUT=*OUT;
+
+ # In upstream, this is controlled by shelling out to the compiler to check
+ # versions, but BoringSSL is intended to be used with pre-generated perlasm
+ # output, so this isn't useful anyway.
+ #
+ # TODO(davidben): Enable this after testing. $addx goes up to 1.
+ $addx = 0;
+
+ # int bn_mul_mont_gather5(
+ $rp="%rdi"; # BN_ULONG *rp,
+ $ap="%rsi"; # const BN_ULONG *ap,
+ $bp="%rdx"; # const BN_ULONG *bp,
+ $np="%rcx"; # const BN_ULONG *np,
+ $n0="%r8"; # const BN_ULONG *n0,
+ $num="%r9"; # int num,
+ # int idx); # 0 to 2^5-1, "index" in $bp holding
+ # pre-computed powers of a', interlaced
+ # in such manner that b[0] is $bp[idx],
+ # b[1] is [2^5+idx], etc.
+ $lo0="%r10";
+ $hi0="%r11";
+ $hi1="%r13";
+ $i="%r14";
+ $j="%r15";
+ $m0="%rbx";
+ $m1="%rbp";
+
+ $code=<<___;
+ .text
+
+ .extern OPENSSL_ia32cap_P
+
+ .globl bn_mul_mont_gather5
+ .type bn_mul_mont_gather5,\@function,6
+ .align 64
+ bn_mul_mont_gather5:
+ test \$7,${num}d
+ jnz .Lmul_enter
+ ___
+ $code.=<<___ if ($addx);
+ mov OPENSSL_ia32cap_P+8(%rip),%r11d
+ ___
+ $code.=<<___;
+ jmp .Lmul4x_enter
+
+ .align 16
+ .Lmul_enter:
+ mov ${num}d,${num}d
+ mov %rsp,%rax
+ mov `($win64?56:8)`(%rsp),%r10d # load 7th argument
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ ___
+ $code.=<<___ if ($win64);
+ lea -0x28(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ ___
+ $code.=<<___;
+ lea 2($num),%r11
+ neg %r11
+ lea (%rsp,%r11,8),%rsp # tp=alloca(8*(num+2))
+ and \$-1024,%rsp # minimize TLB usage
+
+ mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp
+ .Lmul_body:
+ mov $bp,%r12 # reassign $bp
+ ___
+ $bp="%r12";
+ $STRIDE=2**5*8; # 5 is "window size"
+ $N=$STRIDE/4; # should match cache line size
+ $code.=<<___;
+ mov %r10,%r11
+ shr \$`log($N/8)/log(2)`,%r10
+ and \$`$N/8-1`,%r11
+ not %r10
+ lea .Lmagic_masks(%rip),%rax
+ and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
+ lea 96($bp,%r11,8),$bp # pointer within 1st cache line
+ movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
+ movq 8(%rax,%r10,8),%xmm5 # cache line contains element
+ movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
+ movq 24(%rax,%r10,8),%xmm7
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ movq %xmm0,$m0 # m0=bp[0]
+
+ mov ($n0),$n0 # pull n0[0] value
+ mov ($ap),%rax
+
+ xor $i,$i # i=0
+ xor $j,$j # j=0
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[0]
+ mov %rax,$lo0
+ mov ($np),%rax
+
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq $lo0,$m1 # "tp[0]"*n0
+ mov %rdx,$hi0
+
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$lo0 # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov %rdx,$hi1
+
+ lea 1($j),$j # j++
+ jmp .L1st_enter
+
+ .align 16
+ .L1st:
+ add %rax,$hi1
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
+ mov $lo0,$hi0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+ .L1st_enter:
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$hi0
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ lea 1($j),$j # j++
+ mov %rdx,$lo0
+
+ mulq $m1 # np[j]*m1
+ cmp $num,$j
+ jne .L1st
+
+ movq %xmm0,$m0 # bp[1]
+
+ add %rax,$hi1
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+ mov $lo0,$hi0
+
+ xor %rdx,%rdx
+ add $hi0,$hi1
+ adc \$0,%rdx
+ mov $hi1,-8(%rsp,$num,8)
+ mov %rdx,(%rsp,$num,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+ jmp .Louter
+ .align 16
+ .Louter:
+ xor $j,$j # j=0
+ mov $n0,$m1
+ mov (%rsp),$lo0
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+
+ mulq $m0 # ap[0]*bp[i]
+ add %rax,$lo0 # ap[0]*bp[i]+tp[0]
+ mov ($np),%rax
+ adc \$0,%rdx
+
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq $lo0,$m1 # tp[0]*n0
+ mov %rdx,$hi0
+
+ por %xmm2,%xmm0
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$lo0 # discarded
+ mov 8($ap),%rax
+ adc \$0,%rdx
+ mov 8(%rsp),$lo0 # tp[1]
+ mov %rdx,$hi1
+
+ lea 1($j),$j # j++
+ jmp .Linner_enter
+
+ .align 16
+ .Linner:
+ add %rax,$hi1
+ mov ($ap,$j,8),%rax
+ adc \$0,%rdx
+ add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
+ mov (%rsp,$j,8),$lo0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+ .Linner_enter:
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$hi0
+ mov ($np,$j,8),%rax
+ adc \$0,%rdx
+ add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
+ mov %rdx,$hi0
+ adc \$0,$hi0
+ lea 1($j),$j # j++
+
+ mulq $m1 # np[j]*m1
+ cmp $num,$j
+ jne .Linner
+
+ movq %xmm0,$m0 # bp[i+1]
+
+ add %rax,$hi1
+ mov ($ap),%rax # ap[0]
+ adc \$0,%rdx
+ add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
+ mov (%rsp,$j,8),$lo0
+ adc \$0,%rdx
+ mov $hi1,-16(%rsp,$j,8) # tp[j-1]
+ mov %rdx,$hi1
+
+ xor %rdx,%rdx
+ add $hi0,$hi1
+ adc \$0,%rdx
+ add $lo0,$hi1 # pull upmost overflow bit
+ adc \$0,%rdx
+ mov $hi1,-8(%rsp,$num,8)
+ mov %rdx,(%rsp,$num,8) # store upmost overflow bit
+
+ lea 1($i),$i # i++
+ cmp $num,$i
+ jb .Louter
+
+ xor $i,$i # i=0 and clear CF!
+ mov (%rsp),%rax # tp[0]
+ lea (%rsp),$ap # borrow ap for tp
+ mov $num,$j # j=num
+ jmp .Lsub
+ .align 16
+ .Lsub: sbb ($np,$i,8),%rax
+ mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
+ mov 8($ap,$i,8),%rax # tp[i+1]
+ lea 1($i),$i # i++
+ dec $j # doesn't affect CF!
+ jnz .Lsub
+
+ sbb \$0,%rax # handle upmost overflow bit
+ xor $i,$i
+ mov $num,$j # j=num
+ .align 16
+ .Lcopy: # copy or in-place refresh
+ mov (%rsp,$i,8),$ap
+ mov ($rp,$i,8),$np
+ xor $np,$ap # conditional select:
+ and %rax,$ap # ((ap ^ np) & %rax) ^ np
+ xor $np,$ap # ap = borrow?tp:rp
+ mov $i,(%rsp,$i,8) # zap temporary vector
+ mov $ap,($rp,$i,8) # rp[i]=tp[i]
+ lea 1($i),$i
+ sub \$1,$j
+ jnz .Lcopy
+
+ mov 8(%rsp,$num,8),%rsi # restore %rsp
+ mov \$1,%rax
+ ___
+ $code.=<<___ if ($win64);
+ movaps -88(%rsi),%xmm6
+ movaps -72(%rsi),%xmm7
+ ___
+ $code.=<<___;
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
+ .Lmul_epilogue:
+ ret
+ .size bn_mul_mont_gather5,.-bn_mul_mont_gather5
+ ___
+ {{{
+ my @A=("%r10","%r11");
+ my @N=("%r13","%rdi");
+ $code.=<<___;
+ .type bn_mul4x_mont_gather5,\@function,6
+ .align 32
+ bn_mul4x_mont_gather5:
+ .Lmul4x_enter:
+ ___
+ $code.=<<___ if ($addx);
+ and \$0x80100,%r11d
+ cmp \$0x80100,%r11d
+ je .Lmulx4x_enter
+ ___
+ $code.=<<___;
+ .byte 0x67
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ ___
+ $code.=<<___ if ($win64);
+ lea -0x28(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ ___
+ $code.=<<___;
+ .byte 0x67
+ mov ${num}d,%r10d
+ shl \$3,${num}d
+ shl \$3+2,%r10d # 4*$num
+ neg $num # -$num
+
+ ##############################################################
+ # ensure that stack frame doesn't alias with $aptr+4*$num
+ # modulo 4096, which covers ret[num], am[num] and n[2*num]
+ # (see bn_exp.c). this is done to allow memory disambiguation
+ # logic do its magic. [excessive frame is allocated in order
+ # to allow bn_from_mont8x to clear it.]
+ #
+ lea -64(%rsp,$num,2),%r11
+ sub $ap,%r11
+ and \$4095,%r11
+ cmp %r11,%r10
+ jb .Lmul4xsp_alt
+ sub %r11,%rsp # align with $ap
+ lea -64(%rsp,$num,2),%rsp # alloca(128+num*8)
+ jmp .Lmul4xsp_done
+
+ .align 32
+ .Lmul4xsp_alt:
+ lea 4096-64(,$num,2),%r10
+ lea -64(%rsp,$num,2),%rsp # alloca(128+num*8)
+ sub %r10,%r11
+ mov \$0,%r10
+ cmovc %r10,%r11
+ sub %r11,%rsp
+ .Lmul4xsp_done:
+ and \$-64,%rsp
+ neg $num
+
+ mov %rax,40(%rsp)
+ .Lmul4x_body:
+
+ call mul4x_internal
+
+ mov 40(%rsp),%rsi # restore %rsp
+ mov \$1,%rax
+ ___
+ $code.=<<___ if ($win64);
+ movaps -88(%rsi),%xmm6
+ movaps -72(%rsi),%xmm7
+ ___
+ $code.=<<___;
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
+ .Lmul4x_epilogue:
+ ret
+ .size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+
+ .type mul4x_internal,\@abi-omnipotent
+ .align 32
+ mul4x_internal:
+ shl \$5,$num
+ mov `($win64?56:8)`(%rax),%r10d # load 7th argument
+ lea 256(%rdx,$num),%r13
+ shr \$5,$num # restore $num
+ ___
+ $bp="%r12";
+ $STRIDE=2**5*8; # 5 is "window size"
+ $N=$STRIDE/4; # should match cache line size
+ $tp=$i;
+ $code.=<<___;
+ mov %r10,%r11
+ shr \$`log($N/8)/log(2)`,%r10
+ and \$`$N/8-1`,%r11
+ not %r10
+ lea .Lmagic_masks(%rip),%rax
+ and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
+ lea 96(%rdx,%r11,8),$bp # pointer within 1st cache line
+ movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
+ movq 8(%rax,%r10,8),%xmm5 # cache line contains element
+ add \$7,%r11
+ movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
+ movq 24(%rax,%r10,8),%xmm7
+ and \$7,%r11
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ lea $STRIDE($bp),$tp # borrow $tp
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+ pand %xmm6,%xmm2
+ .byte 0x67
+ por %xmm1,%xmm0
+ movq `0*$STRIDE/4-96`($tp),%xmm1
+ .byte 0x67
+ pand %xmm7,%xmm3
+ .byte 0x67
+ por %xmm2,%xmm0
+ movq `1*$STRIDE/4-96`($tp),%xmm2
+ .byte 0x67
+ pand %xmm4,%xmm1
+ .byte 0x67
+ por %xmm3,%xmm0
+ movq `2*$STRIDE/4-96`($tp),%xmm3
+
+ movq %xmm0,$m0 # m0=bp[0]
+ movq `3*$STRIDE/4-96`($tp),%xmm0
+ mov %r13,16+8(%rsp) # save end of b[num]
+ mov $rp, 56+8(%rsp) # save $rp
+
+ mov ($n0),$n0 # pull n0[0] value
+ mov ($ap),%rax
+ lea ($ap,$num),$ap # end of a[num]
+ neg $num
+
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[0]
+ mov %rax,$A[0]
+ mov ($np),%rax
+
+ pand %xmm5,%xmm2
+ pand %xmm6,%xmm3
+ por %xmm2,%xmm1
+
+ imulq $A[0],$m1 # "tp[0]"*n0
+ ##############################################################
+ # $tp is chosen so that writing to top-most element of the
+ # vector occurs just "above" references to powers table,
+ # "above" modulo cache-line size, which effectively precludes
+ # possibility of memory disambiguation logic failure when
+ # accessing the table.
+ #
+ lea 64+8(%rsp,%r11,8),$tp
+ mov %rdx,$A[1]
+
+ pand %xmm7,%xmm0
+ por %xmm3,%xmm1
+ lea 2*$STRIDE($bp),$bp
+ por %xmm1,%xmm0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$A[0] # discarded
+ mov 8($ap,$num),%rax
+ adc \$0,%rdx
+ mov %rdx,$N[1]
+
+ mulq $m0
+ add %rax,$A[1]
+ mov 16*1($np),%rax # interleaved with 0, therefore 16*n
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1
+ add %rax,$N[1]
+ mov 16($ap,$num),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ lea 4*8($num),$j # j=4
+ lea 16*4($np),$np
+ adc \$0,%rdx
+ mov $N[1],($tp)
+ mov %rdx,$N[0]
+ jmp .L1st4x
+
+ .align 32
+ .L1st4x:
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov -16*2($np),%rax
+ lea 32($tp),$tp
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-24($tp) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov -16*1($np),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap,$j),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-16($tp) # tp[j-1]
+ mov %rdx,$N[0]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov 16*0($np),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov 8($ap,$j),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-8($tp) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov 16*1($np),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov 16($ap,$j),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ lea 16*4($np),$np
+ adc \$0,%rdx
+ mov $N[1],($tp) # tp[j-1]
+ mov %rdx,$N[0]
+
+ add \$32,$j # j+=4
+ jnz .L1st4x
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[0]
+ mov -16*2($np),%rax
+ lea 32($tp),$tp
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[0],-24($tp) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[0]
+ add %rax,$A[1]
+ mov -16*1($np),%rax
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap,$num),%rax # ap[0]
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
+ adc \$0,%rdx
+ mov $N[1],-16($tp) # tp[j-1]
+ mov %rdx,$N[0]
+
+ movq %xmm0,$m0 # bp[1]
+ lea ($np,$num,2),$np # rewind $np
+
+ xor $N[1],$N[1]
+ add $A[0],$N[0]
+ adc \$0,$N[1]
+ mov $N[0],-8($tp)
+
+ jmp .Louter4x
+
+ .align 32
+ .Louter4x:
+ mov ($tp,$num),$A[0]
+ mov $n0,$m1
+ mulq $m0 # ap[0]*bp[i]
+ add %rax,$A[0] # ap[0]*bp[i]+tp[0]
+ mov ($np),%rax
+ adc \$0,%rdx
+
+ movq `0*$STRIDE/4-96`($bp),%xmm0
+ movq `1*$STRIDE/4-96`($bp),%xmm1
+ pand %xmm4,%xmm0
+ movq `2*$STRIDE/4-96`($bp),%xmm2
+ pand %xmm5,%xmm1
+ movq `3*$STRIDE/4-96`($bp),%xmm3
+
+ imulq $A[0],$m1 # tp[0]*n0
+ .byte 0x67
+ mov %rdx,$A[1]
+ mov $N[1],($tp) # store upmost overflow bit
+
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ lea ($tp,$num),$tp # rewind $tp
+ lea $STRIDE($bp),$bp
+ por %xmm3,%xmm0
+
+ mulq $m1 # np[0]*m1
+ add %rax,$A[0] # "$N[0]", discarded
+ mov 8($ap,$num),%rax
+ adc \$0,%rdx
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov 16*1($np),%rax # interleaved with 0, therefore 16*n
+ adc \$0,%rdx
+ add 8($tp),$A[1] # +tp[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov 16($ap,$num),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
+ lea 4*8($num),$j # j=4
+ lea 16*4($np),$np
+ adc \$0,%rdx
+ mov %rdx,$N[0]
+ jmp .Linner4x
+
+ .align 32
+ .Linner4x:
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov -16*2($np),%rax
+ adc \$0,%rdx
+ add 16($tp),$A[0] # ap[j]*bp[i]+tp[j]
+ lea 32($tp),$tp
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap,$j),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[1],-32($tp) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov -16*1($np),%rax
+ adc \$0,%rdx
+ add -8($tp),$A[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap,$j),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[0],-24($tp) # tp[j-1]
+ mov %rdx,$N[0]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov 16*0($np),%rax
+ adc \$0,%rdx
+ add ($tp),$A[0] # ap[j]*bp[i]+tp[j]
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov 8($ap,$j),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[1],-16($tp) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov 16*1($np),%rax
+ adc \$0,%rdx
+ add 8($tp),$A[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov 16($ap,$j),%rax
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ lea 16*4($np),$np
+ adc \$0,%rdx
+ mov $N[0],-8($tp) # tp[j-1]
+ mov %rdx,$N[0]
+
+ add \$32,$j # j+=4
+ jnz .Linner4x
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[0]
+ mov -16*2($np),%rax
+ adc \$0,%rdx
+ add 16($tp),$A[0] # ap[j]*bp[i]+tp[j]
+ lea 32($tp),$tp
+ adc \$0,%rdx
+ mov %rdx,$A[1]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[0]
+ mov -8($ap),%rax
+ adc \$0,%rdx
+ add $A[0],$N[0]
+ adc \$0,%rdx
+ mov $N[1],-32($tp) # tp[j-1]
+ mov %rdx,$N[1]
+
+ mulq $m0 # ap[j]*bp[i]
+ add %rax,$A[1]
+ mov $m1,%rax
+ mov -16*1($np),$m1
+ adc \$0,%rdx
+ add -8($tp),$A[1]
+ adc \$0,%rdx
+ mov %rdx,$A[0]
+
+ mulq $m1 # np[j]*m1
+ add %rax,$N[1]
+ mov ($ap,$num),%rax # ap[0]
+ adc \$0,%rdx
+ add $A[1],$N[1]
+ adc \$0,%rdx
+ mov $N[0],-24($tp) # tp[j-1]
+ mov %rdx,$N[0]
+
+ movq %xmm0,$m0 # bp[i+1]
+ mov $N[1],-16($tp) # tp[j-1]
+ lea ($np,$num,2),$np # rewind $np
+
+ xor $N[1],$N[1]
+ add $A[0],$N[0]
+ adc \$0,$N[1]
+ add ($tp),$N[0] # pull upmost overflow bit
+ adc \$0,$N[1] # upmost overflow bit
+ mov $N[0],-8($tp)
+
+ cmp 16+8(%rsp),$bp
+ jb .Louter4x
+ ___
+ if (1) {
+ $code.=<<___;
+ sub $N[0],$m1 # compare top-most words
+ adc $j,$j # $j is zero
+ or $j,$N[1]
+ xor \$1,$N[1]
+ lea ($tp,$num),%rbx # tptr in .sqr4x_sub
+ lea ($np,$N[1],8),%rbp # nptr in .sqr4x_sub
+ mov %r9,%rcx
+ sar \$3+2,%rcx # cf=0
+ mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub
+ jmp .Lsqr4x_sub
+ ___
+ } else {
+ my @ri=("%rax",$bp,$m0,$m1);
+ my $rp="%rdx";
+ $code.=<<___
+ xor \$1,$N[1]
+ lea ($tp,$num),$tp # rewind $tp
+ sar \$5,$num # cf=0
+ lea ($np,$N[1],8),$np
+ mov 56+8(%rsp),$rp # restore $rp
+ jmp .Lsub4x
+
+ .align 32
+ .Lsub4x:
+ .byte 0x66
+ mov 8*0($tp),@ri[0]
+ mov 8*1($tp),@ri[1]
+ .byte 0x66
+ sbb 16*0($np),@ri[0]
+ mov 8*2($tp),@ri[2]
+ sbb 16*1($np),@ri[1]
+ mov 3*8($tp),@ri[3]
+ lea 4*8($tp),$tp
+ sbb 16*2($np),@ri[2]
+ mov @ri[0],8*0($rp)
+ sbb 16*3($np),@ri[3]
+ lea 16*4($np),$np
+ mov @ri[1],8*1($rp)
+ mov @ri[2],8*2($rp)
+ mov @ri[3],8*3($rp)
+ lea 8*4($rp),$rp
+
+ inc $num
+ jnz .Lsub4x
+
+ ret
+ ___
+ }
+ $code.=<<___;
+ .size mul4x_internal,.-mul4x_internal
+ ___
+ }}}
+ {{{
+ ######################################################################
+ # void bn_power5(
+ my $rptr="%rdi"; # BN_ULONG *rptr,
+ my $aptr="%rsi"; # const BN_ULONG *aptr,
+ my $bptr="%rdx"; # const void *table,
+ my $nptr="%rcx"; # const BN_ULONG *nptr,
+ my $n0 ="%r8"; # const BN_ULONG *n0);
+ my $num ="%r9"; # int num, has to be divisible by 8
+ # int pwr
+
+ my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
+ my @A0=("%r10","%r11");
+ my @A1=("%r12","%r13");
+ my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
+
+ $code.=<<___;
+ .globl bn_power5
+ .type bn_power5,\@function,6
+ .align 32
+ bn_power5:
+ ___
+ $code.=<<___ if ($addx);
+ mov OPENSSL_ia32cap_P+8(%rip),%r11d
+ and \$0x80100,%r11d
+ cmp \$0x80100,%r11d
+ je .Lpowerx5_enter
+ ___
+ $code.=<<___;
+ mov %rsp,%rax
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ ___
+ $code.=<<___ if ($win64);
+ lea -0x28(%rsp),%rsp
+ movaps %xmm6,(%rsp)
+ movaps %xmm7,0x10(%rsp)
+ ___
+ $code.=<<___;
+ mov ${num}d,%r10d
+ shl \$3,${num}d # convert $num to bytes
+ shl \$3+2,%r10d # 4*$num
+ neg $num
+ mov ($n0),$n0 # *n0
+
+ ##############################################################
+ # ensure that stack frame doesn't alias with $aptr+4*$num
+ # modulo 4096, which covers ret[num], am[num] and n[2*num]
+ # (see bn_exp.c). this is done to allow memory disambiguation
+ # logic do its magic.
+ #
+ lea -64(%rsp,$num,2),%r11
+ sub $aptr,%r11
+ and \$4095,%r11
+ cmp %r11,%r10
+ jb .Lpwr_sp_alt
+ sub %r11,%rsp # align with $aptr
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ jmp .Lpwr_sp_done
+
+ .align 32
+ .Lpwr_sp_alt:
+ lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
+ sub %r10,%r11
+ mov \$0,%r10
+ cmovc %r10,%r11
+ sub %r11,%rsp
+ .Lpwr_sp_done:
+ and \$-64,%rsp
+ mov $num,%r10
+ neg $num
+
+ ##############################################################
+ # Stack layout
+ #
+ # +0 saved $num, used in reduction section
+ # +8 &t[2*$num], used in reduction section
+ # +32 saved *n0
+ # +40 saved %rsp
+ # +48 t[2*$num]
+ #
+ mov $n0, 32(%rsp)
+ mov %rax, 40(%rsp) # save original %rsp
+ .Lpower5_body:
+ movq $rptr,%xmm1 # save $rptr
+ movq $nptr,%xmm2 # save $nptr
+ movq %r10, %xmm3 # -$num
+ movq $bptr,%xmm4
+
+ call __bn_sqr8x_internal
+ call __bn_sqr8x_internal
+ call __bn_sqr8x_internal
+ call __bn_sqr8x_internal
+ call __bn_sqr8x_internal
+
+ movq %xmm2,$nptr
+ movq %xmm4,$bptr
+ mov $aptr,$rptr
+ mov 40(%rsp),%rax
+ lea 32(%rsp),$n0
+
+ call mul4x_internal
+
+ mov 40(%rsp),%rsi # restore %rsp
+ mov \$1,%rax
+ mov -48(%rsi),%r15
+ mov -40(%rsi),%r14
+ mov -32(%rsi),%r13
+ mov -24(%rsi),%r12
+ mov -16(%rsi),%rbp
+ mov -8(%rsi),%rbx
+ lea (%rsi),%rsp
+ .Lpower5_epilogue:
+ ret
+ .size bn_power5,.-bn_power5
+
+ .globl bn_sqr8x_internal
+ .hidden bn_sqr8x_internal
+ .type bn_sqr8x_internal,\@abi-omnipotent
+ .align 32
+ bn_sqr8x_internal:
+ __bn_sqr8x_internal:
+ ##############################################################
+ # Squaring part:
+ #
+ # a) multiply-n-add everything but a[i]*a[i];
+ # b) shift result of a) by 1 to the left and accumulate
+ # a[i]*a[i] products;
+ #
+ ##############################################################
+ # a[1]a[0]
+ # a[2]a[0]
+ # a[3]a[0]
+ # a[2]a[1]
+ # a[4]a[0]
+ # a[3]a[1]
+ # a[5]a[0]
+ # a[4]a[1]
+ # a[3]a[2]
+ # a[6]a[0]
+ # a[5]a[1]
+ # a[4]a[2]
+ # a[7]a[0]
+ # a[6]a[1]
+ # a[5]a[2]
+ # a[4]a[3]
+ # a[7]a[1]
+ # a[6]a[2]
+ # a[5]a[3]
+ # a[7]a[2]
+ # a[6]a[3]
+ # a[5]a[4]
+ # a[7]a[3]
+ # a[6]a[4]
+ # a[7]a[4]
+ # a[6]a[5]
+ # a[7]a[5]
+ # a[7]a[6]
+ # a[1]a[0]
+ # a[2]a[0]
+ # a[3]a[0]
+ # a[4]a[0]
+ # a[5]a[0]
+ # a[6]a[0]
+ # a[7]a[0]
+ # a[2]a[1]
+ # a[3]a[1]
+ # a[4]a[1]
+ # a[5]a[1]
+ # a[6]a[1]
+ # a[7]a[1]
+ # a[3]a[2]
+ # a[4]a[2]
+ # a[5]a[2]
+ # a[6]a[2]
+ # a[7]a[2]
+ # a[4]a[3]
+ # a[5]a[3]
+ # a[6]a[3]
+ # a[7]a[3]
+ # a[5]a[4]
+ # a[6]a[4]
+ # a[7]a[4]
+ # a[6]a[5]
+ # a[7]a[5]
+ # a[7]a[6]
+ # a[0]a[0]
+ # a[1]a[1]
+ # a[2]a[2]
+ # a[3]a[3]
+ # a[4]a[4]
+ # a[5]a[5]
+ # a[6]a[6]
+ # a[7]a[7]
+
+ lea 32(%r10),$i # $i=-($num-32)
+ lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2]
+
+ mov $num,$j # $j=$num
+
+ # comments apply to $num==8 case
+ mov -32($aptr,$i),$a0 # a[0]
+ lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
+ mov -24($aptr,$i),%rax # a[1]
+ lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
+ mov -16($aptr,$i),$ai # a[2]
+ mov %rax,$a1
+
+ mul $a0 # a[1]*a[0]
+ mov %rax,$A0[0] # a[1]*a[0]
+ mov $ai,%rax # a[2]
+ mov %rdx,$A0[1]
+ mov $A0[0],-24($tptr,$i) # t[1]
+
+ mul $a0 # a[2]*a[0]
+ add %rax,$A0[1]
+ mov $ai,%rax
+ adc \$0,%rdx
+ mov $A0[1],-16($tptr,$i) # t[2]
+ mov %rdx,$A0[0]
+
+
+ mov -8($aptr,$i),$ai # a[3]
+ mul $a1 # a[2]*a[1]
+ mov %rax,$A1[0] # a[2]*a[1]+t[3]
+ mov $ai,%rax
+ mov %rdx,$A1[1]
+
+ lea ($i),$j
+ mul $a0 # a[3]*a[0]
+ add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
+ mov $ai,%rax
+ mov %rdx,$A0[1]
+ adc \$0,$A0[1]
+ add $A1[0],$A0[0]
+ adc \$0,$A0[1]
+ mov $A0[0],-8($tptr,$j) # t[3]
+ jmp .Lsqr4x_1st
+
+ .align 32
+ .Lsqr4x_1st:
+ mov ($aptr,$j),$ai # a[4]
+ mul $a1 # a[3]*a[1]
+ add %rax,$A1[1] # a[3]*a[1]+t[4]
+ mov $ai,%rax
+ mov %rdx,$A1[0]
+ adc \$0,$A1[0]
+
+ mul $a0 # a[4]*a[0]
+ add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
+ mov $ai,%rax # a[3]
+ mov 8($aptr,$j),$ai # a[5]
+ mov %rdx,$A0[0]
+ adc \$0,$A0[0]
+ add $A1[1],$A0[1]
+ adc \$0,$A0[0]
+
+
+ mul $a1 # a[4]*a[3]
+ add %rax,$A1[0] # a[4]*a[3]+t[5]
+ mov $ai,%rax
+ mov $A0[1],($tptr,$j) # t[4]
+ mov %rdx,$A1[1]
+ adc \$0,$A1[1]
+
+ mul $a0 # a[5]*a[2]
+ add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
+ mov $ai,%rax
+ mov 16($aptr,$j),$ai # a[6]
+ mov %rdx,$A0[1]
+ adc \$0,$A0[1]
+ add $A1[0],$A0[0]
+ adc \$0,$A0[1]
+
+ mul $a1 # a[5]*a[3]
+ add %rax,$A1[1] # a[5]*a[3]+t[6]
+ mov $ai,%rax
+ mov $A0[0],8($tptr,$j) # t[5]
+ mov %rdx,$A1[0]
+ adc \$0,$A1[0]
+
+ mul $a0 # a[6]*a[2]
+ add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6]
+ mov $ai,%rax # a[3]
+ mov 24($aptr,$j),$ai # a[7]
+ mov %rdx,$A0[0]
+ adc \$0,$A0[0]
+ add $A1[1],$A0[1]
+ adc \$0,$A0[0]
+
+
+ mul $a1 # a[6]*a[5]
+ add %rax,$A1[0] # a[6]*a[5]+t[7]
+ mov $ai,%rax
+ mov $A0[1],16($tptr,$j) # t[6]
+ mov %rdx,$A1[1]
+ adc \$0,$A1[1]
+ lea 32($j),$j
+
+ mul $a0 # a[7]*a[4]
+ add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6]
+ mov $ai,%rax
+ mov %rdx,$A0[1]
+ adc \$0,$A0[1]
+ add $A1[0],$A0[0]
+ adc \$0,$A0[1]
+ mov $A0[0],-8($tptr,$j) # t[7]
+
+ cmp \$0,$j
+ jne .Lsqr4x_1st
+
+ mul $a1 # a[7]*a[5]
+ add %rax,$A1[1]
+ lea 16($i),$i
+ adc \$0,%rdx
+ add $A0[1],$A1[1]
+ adc \$0,%rdx
+
+ mov $A1[1],($tptr) # t[8]
+ mov %rdx,$A1[0]
+ mov %rdx,8($tptr) # t[9]
+ jmp .Lsqr4x_outer
+
+ .align 32
+ .Lsqr4x_outer: # comments apply to $num==6 case
+ mov -32($aptr,$i),$a0 # a[0]
+ lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
+ mov -24($aptr,$i),%rax # a[1]
+ lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
+ mov -16($aptr,$i),$ai # a[2]
+ mov %rax,$a1
+
+ mul $a0 # a[1]*a[0]
+ mov -24($tptr,$i),$A0[0] # t[1]
+ add %rax,$A0[0] # a[1]*a[0]+t[1]
+ mov $ai,%rax # a[2]
+ adc \$0,%rdx
+ mov $A0[0],-24($tptr,$i) # t[1]
+ mov %rdx,$A0[1]
+
+ mul $a0 # a[2]*a[0]
+ add %rax,$A0[1]
+ mov $ai,%rax
+ adc \$0,%rdx
+ add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2]
+ mov %rdx,$A0[0]
+ adc \$0,$A0[0]
+ mov $A0[1],-16($tptr,$i) # t[2]
+
+ xor $A1[0],$A1[0]
+
+ mov -8($aptr,$i),$ai # a[3]
+ mul $a1 # a[2]*a[1]
+ add %rax,$A1[0] # a[2]*a[1]+t[3]
+ mov $ai,%rax
+ adc \$0,%rdx
+ add -8($tptr,$i),$A1[0]
+ mov %rdx,$A1[1]
+ adc \$0,$A1[1]
+
+ mul $a0 # a[3]*a[0]
+ add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
+ mov $ai,%rax
+ adc \$0,%rdx
+ add $A1[0],$A0[0]
+ mov %rdx,$A0[1]
+ adc \$0,$A0[1]
+ mov $A0[0],-8($tptr,$i) # t[3]
+
+ lea ($i),$j
+ jmp .Lsqr4x_inner
+
+ .align 32
+ .Lsqr4x_inner:
+ mov ($aptr,$j),$ai # a[4]
+ mul $a1 # a[3]*a[1]
+ add %rax,$A1[1] # a[3]*a[1]+t[4]
+ mov $ai,%rax
+ mov %rdx,$A1[0]
+ adc \$0,$A1[0]
+ add ($tptr,$j),$A1[1]
+ adc \$0,$A1[0]
+
+ .byte 0x67
+ mul $a0 # a[4]*a[0]
+ add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4]
+ mov $ai,%rax # a[3]
+ mov 8($aptr,$j),$ai # a[5]
+ mov %rdx,$A0[0]
+ adc \$0,$A0[0]
+ add $A1[1],$A0[1]
+ adc \$0,$A0[0]
+
+ mul $a1 # a[4]*a[3]
+ add %rax,$A1[0] # a[4]*a[3]+t[5]
+ mov $A0[1],($tptr,$j) # t[4]
+ mov $ai,%rax
+ mov %rdx,$A1[1]
+ adc \$0,$A1[1]
+ add 8($tptr,$j),$A1[0]
+ lea 16($j),$j # j++
+ adc \$0,$A1[1]
+
+ mul $a0 # a[5]*a[2]
+ add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5]
+ mov $ai,%rax
+ adc \$0,%rdx
+ add $A1[0],$A0[0]
+ mov %rdx,$A0[1]
+ adc \$0,$A0[1]
+ mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below
+
+ cmp \$0,$j
+ jne .Lsqr4x_inner
+
+ .byte 0x67
+ mul $a1 # a[5]*a[3]
+ add %rax,$A1[1]
+ adc \$0,%rdx
+ add $A0[1],$A1[1]
+ adc \$0,%rdx
+
+ mov $A1[1],($tptr) # t[6], "preloaded t[2]" below
+ mov %rdx,$A1[0]
+ mov %rdx,8($tptr) # t[7], "preloaded t[3]" below
+
+ add \$16,$i
+ jnz .Lsqr4x_outer
+
+ # comments apply to $num==4 case
+ mov -32($aptr),$a0 # a[0]
+ lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num]
+ mov -24($aptr),%rax # a[1]
+ lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"]
+ mov -16($aptr),$ai # a[2]
+ mov %rax,$a1
+
+ mul $a0 # a[1]*a[0]
+ add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1]
+ mov $ai,%rax # a[2]
+ mov %rdx,$A0[1]
+ adc \$0,$A0[1]
+
+ mul $a0 # a[2]*a[0]
+ add %rax,$A0[1]
+ mov $ai,%rax
+ mov $A0[0],-24($tptr) # t[1]
+ mov %rdx,$A0[0]
+ adc \$0,$A0[0]
+ add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2]
+ mov -8($aptr),$ai # a[3]
+ adc \$0,$A0[0]
+
+ mul $a1 # a[2]*a[1]
+ add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3]
+ mov $ai,%rax
+ mov $A0[1],-16($tptr) # t[2]
+ mov %rdx,$A1[1]
+ adc \$0,$A1[1]
+
+ mul $a0 # a[3]*a[0]
+ add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3]
+ mov $ai,%rax
+ mov %rdx,$A0[1]
+ adc \$0,$A0[1]
+ add $A1[0],$A0[0]
+ adc \$0,$A0[1]
+ mov $A0[0],-8($tptr) # t[3]
+
+ mul $a1 # a[3]*a[1]
+ add %rax,$A1[1]
+ mov -16($aptr),%rax # a[2]
+ adc \$0,%rdx
+ add $A0[1],$A1[1]
+ adc \$0,%rdx
+
+ mov $A1[1],($tptr) # t[4]
+ mov %rdx,$A1[0]
+ mov %rdx,8($tptr) # t[5]
+
+ mul $ai # a[2]*a[3]
+ ___
+ {
+ my ($shift,$carry)=($a0,$a1);
+ my @S=(@A1,$ai,$n0);
+ $code.=<<___;
+ add \$16,$i
+ xor $shift,$shift
+ sub $num,$i # $i=16-$num
+ xor $carry,$carry
+
+ add $A1[0],%rax # t[5]
+ adc \$0,%rdx
+ mov %rax,8($tptr) # t[5]
+ mov %rdx,16($tptr) # t[6]
+ mov $carry,24($tptr) # t[7]
+
+ mov -16($aptr,$i),%rax # a[0]
+ lea 48+8(%rsp),$tptr
+ xor $A0[0],$A0[0] # t[0]
+ mov 8($tptr),$A0[1] # t[1]
+
1402
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1403
+ shr \$63,$A0[0]
1404
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
1405
+ shr \$63,$A0[1]
1406
+ or $A0[0],$S[1] # | t[2*i]>>63
1407
+ mov 16($tptr),$A0[0] # t[2*i+2] # prefetch
1408
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
1409
+ mul %rax # a[i]*a[i]
1410
+ neg $carry # mov $carry,cf
1411
+ mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch
1412
+ adc %rax,$S[0]
1413
+ mov -8($aptr,$i),%rax # a[i+1] # prefetch
1414
+ mov $S[0],($tptr)
1415
+ adc %rdx,$S[1]
1416
+
1417
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1418
+ mov $S[1],8($tptr)
1419
+ sbb $carry,$carry # mov cf,$carry
1420
+ shr \$63,$A0[0]
1421
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
1422
+ shr \$63,$A0[1]
1423
+ or $A0[0],$S[3] # | t[2*i]>>63
1424
+ mov 32($tptr),$A0[0] # t[2*i+2] # prefetch
1425
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
1426
+ mul %rax # a[i]*a[i]
1427
+ neg $carry # mov $carry,cf
1428
+ mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch
1429
+ adc %rax,$S[2]
1430
+ mov 0($aptr,$i),%rax # a[i+1] # prefetch
1431
+ mov $S[2],16($tptr)
1432
+ adc %rdx,$S[3]
1433
+ lea 16($i),$i
1434
+ mov $S[3],24($tptr)
1435
+ sbb $carry,$carry # mov cf,$carry
1436
+ lea 64($tptr),$tptr
1437
+ jmp .Lsqr4x_shift_n_add
1438
+
1439
+ .align 32
1440
+ .Lsqr4x_shift_n_add:
1441
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1442
+ shr \$63,$A0[0]
1443
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
1444
+ shr \$63,$A0[1]
1445
+ or $A0[0],$S[1] # | t[2*i]>>63
1446
+ mov -16($tptr),$A0[0] # t[2*i+2] # prefetch
1447
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
1448
+ mul %rax # a[i]*a[i]
1449
+ neg $carry # mov $carry,cf
1450
+ mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch
1451
+ adc %rax,$S[0]
1452
+ mov -8($aptr,$i),%rax # a[i+1] # prefetch
1453
+ mov $S[0],-32($tptr)
1454
+ adc %rdx,$S[1]
1455
+
1456
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1457
+ mov $S[1],-24($tptr)
1458
+ sbb $carry,$carry # mov cf,$carry
1459
+ shr \$63,$A0[0]
1460
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
1461
+ shr \$63,$A0[1]
1462
+ or $A0[0],$S[3] # | t[2*i]>>63
1463
+ mov 0($tptr),$A0[0] # t[2*i+2] # prefetch
1464
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
1465
+ mul %rax # a[i]*a[i]
1466
+ neg $carry # mov $carry,cf
1467
+ mov 8($tptr),$A0[1] # t[2*i+2+1] # prefetch
1468
+ adc %rax,$S[2]
1469
+ mov 0($aptr,$i),%rax # a[i+1] # prefetch
1470
+ mov $S[2],-16($tptr)
1471
+ adc %rdx,$S[3]
1472
+
1473
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1474
+ mov $S[3],-8($tptr)
1475
+ sbb $carry,$carry # mov cf,$carry
1476
+ shr \$63,$A0[0]
1477
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
1478
+ shr \$63,$A0[1]
1479
+ or $A0[0],$S[1] # | t[2*i]>>63
1480
+ mov 16($tptr),$A0[0] # t[2*i+2] # prefetch
1481
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
1482
+ mul %rax # a[i]*a[i]
1483
+ neg $carry # mov $carry,cf
1484
+ mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch
1485
+ adc %rax,$S[0]
1486
+ mov 8($aptr,$i),%rax # a[i+1] # prefetch
1487
+ mov $S[0],0($tptr)
1488
+ adc %rdx,$S[1]
1489
+
1490
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
1491
+ mov $S[1],8($tptr)
1492
+ sbb $carry,$carry # mov cf,$carry
1493
+ shr \$63,$A0[0]
1494
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
1495
+ shr \$63,$A0[1]
1496
+ or $A0[0],$S[3] # | t[2*i]>>63
1497
+ mov 32($tptr),$A0[0] # t[2*i+2] # prefetch
1498
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
1499
+ mul %rax # a[i]*a[i]
1500
+ neg $carry # mov $carry,cf
1501
+ mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch
1502
+ adc %rax,$S[2]
1503
+ mov 16($aptr,$i),%rax # a[i+1] # prefetch
1504
+ mov $S[2],16($tptr)
1505
+ adc %rdx,$S[3]
1506
+ mov $S[3],24($tptr)
1507
+ sbb $carry,$carry # mov cf,$carry
1508
+ lea 64($tptr),$tptr
1509
+ add \$32,$i
1510
+ jnz .Lsqr4x_shift_n_add
1511
+
1512
+ lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
1513
+ .byte 0x67
1514
+ shr \$63,$A0[0]
1515
+ lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
1516
+ shr \$63,$A0[1]
1517
+ or $A0[0],$S[1] # | t[2*i]>>63
1518
+ mov -16($tptr),$A0[0] # t[2*i+2] # prefetch
1519
+ mov $A0[1],$shift # shift=t[2*i+1]>>63
1520
+ mul %rax # a[i]*a[i]
1521
+ neg $carry # mov $carry,cf
1522
+ mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch
1523
+ adc %rax,$S[0]
1524
+ mov -8($aptr),%rax # a[i+1] # prefetch
1525
+ mov $S[0],-32($tptr)
1526
+ adc %rdx,$S[1]
1527
+
1528
+ lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift
1529
+ mov $S[1],-24($tptr)
1530
+ sbb $carry,$carry # mov cf,$carry
1531
+ shr \$63,$A0[0]
1532
+ lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
1533
+ shr \$63,$A0[1]
1534
+ or $A0[0],$S[3] # | t[2*i]>>63
1535
+ mul %rax # a[i]*a[i]
1536
+ neg $carry # mov $carry,cf
1537
+ adc %rax,$S[2]
1538
+ adc %rdx,$S[3]
1539
+ mov $S[2],-16($tptr)
1540
+ mov $S[3],-8($tptr)
1541
+ ___
1542
+ }
1543
+ ######################################################################
1544
+ # Montgomery reduction part, "word-by-word" algorithm.
1545
+ #
1546
+ # This new path is inspired by multiple submissions from Intel, by
1547
+ # Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
1548
+ # Vinodh Gopal...
1549
+ {
1550
+ my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");
1551
+
1552
+ $code.=<<___;
1553
+ movq %xmm2,$nptr
1554
+ sqr8x_reduction:
1555
+ xor %rax,%rax
1556
+ lea ($nptr,$num,2),%rcx # end of n[]
1557
+ lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer
1558
+ mov %rcx,0+8(%rsp)
1559
+ lea 48+8(%rsp,$num),$tptr # end of initial t[] window
1560
+ mov %rdx,8+8(%rsp)
1561
+ neg $num
1562
+ jmp .L8x_reduction_loop
1563
+
1564
+ .align 32
1565
+ .L8x_reduction_loop:
1566
+ lea ($tptr,$num),$tptr # start of current t[] window
1567
+ .byte 0x66
1568
+ mov 8*0($tptr),$m0
1569
+ mov 8*1($tptr),%r9
1570
+ mov 8*2($tptr),%r10
1571
+ mov 8*3($tptr),%r11
1572
+ mov 8*4($tptr),%r12
1573
+ mov 8*5($tptr),%r13
1574
+ mov 8*6($tptr),%r14
1575
+ mov 8*7($tptr),%r15
1576
+ mov %rax,(%rdx) # store top-most carry bit
1577
+ lea 8*8($tptr),$tptr
1578
+
1579
+ .byte 0x67
1580
+ mov $m0,%r8
1581
+ imulq 32+8(%rsp),$m0 # n0*a[0]
1582
+ mov 16*0($nptr),%rax # n[0]
1583
+ mov \$8,%ecx
1584
+ jmp .L8x_reduce
1585
+
1586
+ .align 32
1587
+ .L8x_reduce:
1588
+ mulq $m0
1589
+ mov 16*1($nptr),%rax # n[1]
1590
+ neg %r8
1591
+ mov %rdx,%r8
1592
+ adc \$0,%r8
1593
+
1594
+ mulq $m0
1595
+ add %rax,%r9
1596
+ mov 16*2($nptr),%rax
1597
+ adc \$0,%rdx
1598
+ add %r9,%r8
1599
+ mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i]
1600
+ mov %rdx,%r9
1601
+ adc \$0,%r9
1602
+
1603
+ mulq $m0
1604
+ add %rax,%r10
1605
+ mov 16*3($nptr),%rax
1606
+ adc \$0,%rdx
1607
+ add %r10,%r9
1608
+ mov 32+8(%rsp),$carry # pull n0, borrow $carry
1609
+ mov %rdx,%r10
1610
+ adc \$0,%r10
1611
+
1612
+ mulq $m0
1613
+ add %rax,%r11
1614
+ mov 16*4($nptr),%rax
1615
+ adc \$0,%rdx
1616
+ imulq %r8,$carry # modulo-scheduled
1617
+ add %r11,%r10
1618
+ mov %rdx,%r11
1619
+ adc \$0,%r11
1620
+
1621
+ mulq $m0
1622
+ add %rax,%r12
1623
+ mov 16*5($nptr),%rax
1624
+ adc \$0,%rdx
1625
+ add %r12,%r11
1626
+ mov %rdx,%r12
1627
+ adc \$0,%r12
1628
+
1629
+ mulq $m0
1630
+ add %rax,%r13
1631
+ mov 16*6($nptr),%rax
1632
+ adc \$0,%rdx
1633
+ add %r13,%r12
1634
+ mov %rdx,%r13
1635
+ adc \$0,%r13
1636
+
1637
+ mulq $m0
1638
+ add %rax,%r14
1639
+ mov 16*7($nptr),%rax
1640
+ adc \$0,%rdx
1641
+ add %r14,%r13
1642
+ mov %rdx,%r14
1643
+ adc \$0,%r14
1644
+
1645
+ mulq $m0
1646
+ mov $carry,$m0 # n0*a[i]
1647
+ add %rax,%r15
1648
+ mov 16*0($nptr),%rax # n[0]
1649
+ adc \$0,%rdx
1650
+ add %r15,%r14
1651
+ mov %rdx,%r15
1652
+ adc \$0,%r15
1653
+
1654
+ dec %ecx
1655
+ jnz .L8x_reduce
1656
+
1657
+ lea 16*8($nptr),$nptr
1658
+ xor %rax,%rax
1659
+ mov 8+8(%rsp),%rdx # pull end of t[]
1660
+ cmp 0+8(%rsp),$nptr # end of n[]?
1661
+ jae .L8x_no_tail
1662
+
1663
+ .byte 0x66
1664
+ add 8*0($tptr),%r8
1665
+ adc 8*1($tptr),%r9
1666
+ adc 8*2($tptr),%r10
1667
+ adc 8*3($tptr),%r11
1668
+ adc 8*4($tptr),%r12
1669
+ adc 8*5($tptr),%r13
1670
+ adc 8*6($tptr),%r14
1671
+ adc 8*7($tptr),%r15
1672
+ sbb $carry,$carry # top carry
1673
+
1674
+ mov 48+56+8(%rsp),$m0 # pull n0*a[0]
1675
+ mov \$8,%ecx
1676
+ mov 16*0($nptr),%rax
1677
+ jmp .L8x_tail
1678
+
1679
+ .align 32
1680
+ .L8x_tail:
1681
+ mulq $m0
1682
+ add %rax,%r8
1683
+ mov 16*1($nptr),%rax
1684
+ mov %r8,($tptr) # save result
1685
+ mov %rdx,%r8
1686
+ adc \$0,%r8
1687
+
1688
+ mulq $m0
1689
+ add %rax,%r9
1690
+ mov 16*2($nptr),%rax
1691
+ adc \$0,%rdx
1692
+ add %r9,%r8
1693
+ lea 8($tptr),$tptr # $tptr++
1694
+ mov %rdx,%r9
1695
+ adc \$0,%r9
1696
+
1697
+ mulq $m0
1698
+ add %rax,%r10
1699
+ mov 16*3($nptr),%rax
1700
+ adc \$0,%rdx
1701
+ add %r10,%r9
1702
+ mov %rdx,%r10
1703
+ adc \$0,%r10
1704
+
1705
+ mulq $m0
1706
+ add %rax,%r11
1707
+ mov 16*4($nptr),%rax
1708
+ adc \$0,%rdx
1709
+ add %r11,%r10
1710
+ mov %rdx,%r11
1711
+ adc \$0,%r11
1712
+
1713
+ mulq $m0
1714
+ add %rax,%r12
1715
+ mov 16*5($nptr),%rax
1716
+ adc \$0,%rdx
1717
+ add %r12,%r11
1718
+ mov %rdx,%r12
1719
+ adc \$0,%r12
1720
+
1721
+ mulq $m0
1722
+ add %rax,%r13
1723
+ mov 16*6($nptr),%rax
1724
+ adc \$0,%rdx
1725
+ add %r13,%r12
1726
+ mov %rdx,%r13
1727
+ adc \$0,%r13
1728
+
1729
+ mulq $m0
1730
+ add %rax,%r14
1731
+ mov 16*7($nptr),%rax
1732
+ adc \$0,%rdx
1733
+ add %r14,%r13
1734
+ mov %rdx,%r14
1735
+ adc \$0,%r14
1736
+
1737
+ mulq $m0
1738
+ mov 48-16+8(%rsp,%rcx,8),$m0 # pull n0*a[i]
1739
+ add %rax,%r15
1740
+ adc \$0,%rdx
1741
+ add %r15,%r14
1742
+ mov 16*0($nptr),%rax # pull n[0]
1743
+ mov %rdx,%r15
1744
+ adc \$0,%r15
1745
+
1746
+ dec %ecx
1747
+ jnz .L8x_tail
1748
+
1749
+ lea 16*8($nptr),$nptr
1750
+ mov 8+8(%rsp),%rdx # pull end of t[]
1751
+ cmp 0+8(%rsp),$nptr # end of n[]?
1752
+ jae .L8x_tail_done # break out of loop
1753
+
1754
+ mov 48+56+8(%rsp),$m0 # pull n0*a[0]
1755
+ neg $carry
1756
+ mov 8*0($nptr),%rax # pull n[0]
1757
+ adc 8*0($tptr),%r8
1758
+ adc 8*1($tptr),%r9
1759
+ adc 8*2($tptr),%r10
1760
+ adc 8*3($tptr),%r11
1761
+ adc 8*4($tptr),%r12
1762
+ adc 8*5($tptr),%r13
1763
+ adc 8*6($tptr),%r14
1764
+ adc 8*7($tptr),%r15
1765
+ sbb $carry,$carry # top carry
1766
+
1767
+ mov \$8,%ecx
1768
+ jmp .L8x_tail
1769
+
1770
+ .align 32
1771
+ .L8x_tail_done:
1772
+ add (%rdx),%r8 # can this overflow?
1773
+ adc \$0,%r9
1774
+ adc \$0,%r10
1775
+ adc \$0,%r11
1776
+ adc \$0,%r12
1777
+ adc \$0,%r13
1778
+ adc \$0,%r14
1779
+ adc \$0,%r15 # can't overflow, because we
1780
+ # started with "overhung" part
1781
+ # of multiplication
1782
+ xor %rax,%rax
1783
+
1784
+ neg $carry
1785
+ .L8x_no_tail:
1786
+ adc 8*0($tptr),%r8
1787
+ adc 8*1($tptr),%r9
1788
+ adc 8*2($tptr),%r10
1789
+ adc 8*3($tptr),%r11
1790
+ adc 8*4($tptr),%r12
1791
+ adc 8*5($tptr),%r13
1792
+ adc 8*6($tptr),%r14
1793
+ adc 8*7($tptr),%r15
1794
+ adc \$0,%rax # top-most carry
1795
+ mov -16($nptr),%rcx # np[num-1]
1796
+ xor $carry,$carry
1797
+
1798
+ movq %xmm2,$nptr # restore $nptr
1799
+
1800
+ mov %r8,8*0($tptr) # store top 512 bits
1801
+ mov %r9,8*1($tptr)
1802
+ movq %xmm3,$num # $num is %r9, can't be moved upwards
1803
+ mov %r10,8*2($tptr)
1804
+ mov %r11,8*3($tptr)
1805
+ mov %r12,8*4($tptr)
1806
+ mov %r13,8*5($tptr)
1807
+ mov %r14,8*6($tptr)
1808
+ mov %r15,8*7($tptr)
1809
+ lea 8*8($tptr),$tptr
1810
+
1811
+ cmp %rdx,$tptr # end of t[]?
1812
+ jb .L8x_reduction_loop
1813
+ ___
1814
+ }
1815
+ ##############################################################
1816
+ # Post-condition, 4x unrolled
1817
+ #
1818
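+
+ # The post-condition subtracts the modulus when the result is >= n.
+ # One standard way to express that, sketched in Perl with 16-bit
+ # limbs (ours, not a transcription): subtract unconditionally, then
+ # keep t or t-n based on the final borrow, with no branch on the
+ # values themselves. The code below instead decides from the
+ # top-most words and biases $nptr by the comparison result before a
+ # single subtraction loop.
+ sub cond_sub_sketch {
+     my ($t, $n) = @_;                # equal-length little-endian limb arrays
+     my @d;
+     my $borrow = 0;
+     for my $i (0 .. $#$t) {
+         my $v = $t->[$i] - $n->[$i] - $borrow;
+         $borrow = $v < 0 ? 1 : 0;
+         push @d, $v & 0xffff;
+     }
+     my $mask = $borrow ? 0xffff : 0; # all-ones when t < n, i.e. keep t
+     return [ map { ($t->[$_] & $mask) | ($d[$_] & (0xffff & ~$mask)) } 0 .. $#$t ];
+ }
+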
+ {
1819
+ my ($tptr,$nptr)=("%rbx","%rbp");
1820
+ $code.=<<___;
1821
+ #xor %rsi,%rsi # %rsi was $carry above
1822
+ sub %r15,%rcx # compare top-most words
1823
+ lea (%rdi,$num),$tptr # %rdi was $tptr above
1824
+ adc %rsi,%rsi
1825
+ mov $num,%rcx
1826
+ or %rsi,%rax
1827
+ movq %xmm1,$rptr # restore $rptr
1828
+ xor \$1,%rax
1829
+ movq %xmm1,$aptr # prepare for back-to-back call
1830
+ lea ($nptr,%rax,8),$nptr
1831
+ sar \$3+2,%rcx # cf=0
1832
+ jmp .Lsqr4x_sub
1833
+
1834
+ .align 32
1835
+ .Lsqr4x_sub:
1836
+ .byte 0x66
1837
+ mov 8*0($tptr),%r12
1838
+ mov 8*1($tptr),%r13
1839
+ sbb 16*0($nptr),%r12
1840
+ mov 8*2($tptr),%r14
1841
+ sbb 16*1($nptr),%r13
1842
+ mov 8*3($tptr),%r15
1843
+ lea 8*4($tptr),$tptr
1844
+ sbb 16*2($nptr),%r14
1845
+ mov %r12,8*0($rptr)
1846
+ sbb 16*3($nptr),%r15
1847
+ lea 16*4($nptr),$nptr
1848
+ mov %r13,8*1($rptr)
1849
+ mov %r14,8*2($rptr)
1850
+ mov %r15,8*3($rptr)
1851
+ lea 8*4($rptr),$rptr
1852
+
1853
+ inc %rcx # pass %cf
1854
+ jnz .Lsqr4x_sub
1855
+ ___
1856
+ }
1857
+ $code.=<<___;
1858
+ mov $num,%r10 # prepare for back-to-back call
1859
+ neg $num # restore $num
1860
+ ret
1861
+ .size bn_sqr8x_internal,.-bn_sqr8x_internal
1862
+ ___
1863
+ {
1864
+ $code.=<<___;
1865
+ .globl bn_from_montgomery
1866
+ .type bn_from_montgomery,\@abi-omnipotent
1867
+ .align 32
1868
+ bn_from_montgomery:
1869
+ testl \$7,`($win64?"48(%rsp)":"%r9d")`
1870
+ jz bn_from_mont8x
1871
+ xor %eax,%eax
1872
+ ret
1873
+ .size bn_from_montgomery,.-bn_from_montgomery
1874
+
1875
+ .type bn_from_mont8x,\@function,6
1876
+ .align 32
1877
+ bn_from_mont8x:
1878
+ .byte 0x67
1879
+ mov %rsp,%rax
1880
+ push %rbx
1881
+ push %rbp
1882
+ push %r12
1883
+ push %r13
1884
+ push %r14
1885
+ push %r15
1886
+ ___
1887
+ $code.=<<___ if ($win64);
1888
+ lea -0x28(%rsp),%rsp
1889
+ movaps %xmm6,(%rsp)
1890
+ movaps %xmm7,0x10(%rsp)
1891
+ ___
1892
+ $code.=<<___;
1893
+ .byte 0x67
1894
+ mov ${num}d,%r10d
1895
+ shl \$3,${num}d # convert $num to bytes
1896
+ shl \$3+2,%r10d # 4*$num
1897
+ neg $num
1898
+ mov ($n0),$n0 # *n0
1899
+
1900
+ ##############################################################
1901
+ # ensure that stack frame doesn't alias with $aptr+4*$num
1902
+ # modulo 4096, which covers ret[num], am[num] and n[2*num]
1903
+ # (see bn_exp.c). this is done to allow the memory disambiguation
1904
+ # logic to do its magic.
1905
+ #
1906
+ lea -64(%rsp,$num,2),%r11
1907
+ sub $aptr,%r11
1908
+ and \$4095,%r11
1909
+ cmp %r11,%r10
1910
+ jb .Lfrom_sp_alt
1911
+ sub %r11,%rsp # align with $aptr
1912
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
1913
+ jmp .Lfrom_sp_done
1914
+
1915
+ .align 32
1916
+ .Lfrom_sp_alt:
1917
+ lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
1918
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
1919
+ sub %r10,%r11
1920
+ mov \$0,%r10
1921
+ cmovc %r10,%r11
1922
+ sub %r11,%rsp
1923
+ .Lfrom_sp_done:
1924
+ and \$-64,%rsp
1925
+ mov $num,%r10
1926
+ neg $num
1927
+
1928
+ ##############################################################
1929
+ # Stack layout
1930
+ #
1931
+ # +0 saved $num, used in reduction section
1932
+ # +8 &t[2*$num], used in reduction section
1933
+ # +32 saved *n0
1934
+ # +40 saved %rsp
1935
+ # +48 t[2*$num]
1936
+ #
1937
+ mov $n0, 32(%rsp)
1938
+ mov %rax, 40(%rsp) # save original %rsp
1939
+ .Lfrom_body:
1940
+ mov $num,%r11
1941
+ lea 48(%rsp),%rax
1942
+ pxor %xmm0,%xmm0
1943
+ jmp .Lmul_by_1
1944
+
1945
+ .align 32
1946
+ .Lmul_by_1:
1947
+ movdqu ($aptr),%xmm1
1948
+ movdqu 16($aptr),%xmm2
1949
+ movdqu 32($aptr),%xmm3
1950
+ movdqa %xmm0,(%rax,$num)
1951
+ movdqu 48($aptr),%xmm4
1952
+ movdqa %xmm0,16(%rax,$num)
1953
+ .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 # lea 64($aptr),$aptr
1954
+ movdqa %xmm1,(%rax)
1955
+ movdqa %xmm0,32(%rax,$num)
1956
+ movdqa %xmm2,16(%rax)
1957
+ movdqa %xmm0,48(%rax,$num)
1958
+ movdqa %xmm3,32(%rax)
1959
+ movdqa %xmm4,48(%rax)
1960
+ lea 64(%rax),%rax
1961
+ sub \$64,%r11
1962
+ jnz .Lmul_by_1
1963
+
1964
+ movq $rptr,%xmm1
1965
+ movq $nptr,%xmm2
1966
+ .byte 0x67
1967
+ mov $nptr,%rbp
1968
+ movq %r10, %xmm3 # -num
1969
+ ___
1970
+ $code.=<<___ if ($addx);
1971
+ mov OPENSSL_ia32cap_P+8(%rip),%r11d
1972
+ and \$0x80100,%r11d
1973
+ cmp \$0x80100,%r11d
1974
+ jne .Lfrom_mont_nox
1975
+
1976
+ lea (%rax,$num),$rptr
1977
+ call sqrx8x_reduction
1978
+
1979
+ pxor %xmm0,%xmm0
1980
+ lea 48(%rsp),%rax
1981
+ mov 40(%rsp),%rsi # restore %rsp
1982
+ jmp .Lfrom_mont_zero
1983
+
1984
+ .align 32
1985
+ .Lfrom_mont_nox:
1986
+ ___
1987
+ $code.=<<___;
1988
+ call sqr8x_reduction
1989
+
1990
+ pxor %xmm0,%xmm0
1991
+ lea 48(%rsp),%rax
1992
+ mov 40(%rsp),%rsi # restore %rsp
1993
+ jmp .Lfrom_mont_zero
1994
+
1995
+ .align 32
1996
+ .Lfrom_mont_zero:
1997
+ movdqa %xmm0,16*0(%rax)
1998
+ movdqa %xmm0,16*1(%rax)
1999
+ movdqa %xmm0,16*2(%rax)
2000
+ movdqa %xmm0,16*3(%rax)
2001
+ lea 16*4(%rax),%rax
2002
+ sub \$32,$num
2003
+ jnz .Lfrom_mont_zero
2004
+
2005
+ mov \$1,%rax
2006
+ mov -48(%rsi),%r15
2007
+ mov -40(%rsi),%r14
2008
+ mov -32(%rsi),%r13
2009
+ mov -24(%rsi),%r12
2010
+ mov -16(%rsi),%rbp
2011
+ mov -8(%rsi),%rbx
2012
+ lea (%rsi),%rsp
2013
+ .Lfrom_epilogue:
2014
+ ret
2015
+ .size bn_from_mont8x,.-bn_from_mont8x
2016
+ ___
2017
+ }
2018
+ }}}
2019
+
2020
+ if ($addx) {{{
2021
+ my $bp="%rdx"; # restore original value
2022
+
2023
+ $code.=<<___;
2024
+ .type bn_mulx4x_mont_gather5,\@function,6
2025
+ .align 32
2026
+ bn_mulx4x_mont_gather5:
2027
+ .Lmulx4x_enter:
2028
+ .byte 0x67
2029
+ mov %rsp,%rax
2030
+ push %rbx
2031
+ push %rbp
2032
+ push %r12
2033
+ push %r13
2034
+ push %r14
2035
+ push %r15
2036
+ ___
2037
+ $code.=<<___ if ($win64);
2038
+ lea -0x28(%rsp),%rsp
2039
+ movaps %xmm6,(%rsp)
2040
+ movaps %xmm7,0x10(%rsp)
2041
+ ___
2042
+ $code.=<<___;
2043
+ .byte 0x67
2044
+ mov ${num}d,%r10d
2045
+ shl \$3,${num}d # convert $num to bytes
2046
+ shl \$3+2,%r10d # 4*$num
2047
+ neg $num # -$num
2048
+ mov ($n0),$n0 # *n0
2049
+
2050
+ ##############################################################
2051
+ # ensure that stack frame doesn't alias with $aptr+4*$num
2052
+ # modulo 4096, which covers a[num], ret[num] and n[2*num]
2053
+ # (see bn_exp.c). this is done to allow the memory disambiguation
2054
+ # logic to do its magic. [excessive frame is allocated in order
2055
+ # to allow bn_from_mont8x to clear it.]
2056
+ #
2057
+ lea -64(%rsp,$num,2),%r11
2058
+ sub $ap,%r11
2059
+ and \$4095,%r11
2060
+ cmp %r11,%r10
2061
+ jb .Lmulx4xsp_alt
2062
+ sub %r11,%rsp # align with $aptr
2063
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+$num)
2064
+ jmp .Lmulx4xsp_done
2065
+
2066
+ .align 32
2067
+ .Lmulx4xsp_alt:
2068
+ lea 4096-64(,$num,2),%r10 # 4096-frame-$num
2069
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+$num)
2070
+ sub %r10,%r11
2071
+ mov \$0,%r10
2072
+ cmovc %r10,%r11
2073
+ sub %r11,%rsp
2074
+ .Lmulx4xsp_done:
2075
+ and \$-64,%rsp # ensure alignment
2076
+ ##############################################################
2077
+ # Stack layout
2078
+ # +0 -num
2079
+ # +8 off-loaded &b[i]
2080
+ # +16 end of b[num]
2081
+ # +24 inner counter
2082
+ # +32 saved n0
2083
+ # +40 saved %rsp
2084
+ # +48
2085
+ # +56 saved rp
2086
+ # +64 tmp[num+1]
2087
+ #
2088
+ mov $n0, 32(%rsp) # save *n0
2089
+ mov %rax,40(%rsp) # save original %rsp
2090
+ .Lmulx4x_body:
2091
+ call mulx4x_internal
2092
+
2093
+ mov 40(%rsp),%rsi # restore %rsp
2094
+ mov \$1,%rax
2095
+ ___
2096
+ $code.=<<___ if ($win64);
2097
+ movaps -88(%rsi),%xmm6
2098
+ movaps -72(%rsi),%xmm7
2099
+ ___
2100
+ $code.=<<___;
2101
+ mov -48(%rsi),%r15
2102
+ mov -40(%rsi),%r14
2103
+ mov -32(%rsi),%r13
2104
+ mov -24(%rsi),%r12
2105
+ mov -16(%rsi),%rbp
2106
+ mov -8(%rsi),%rbx
2107
+ lea (%rsi),%rsp
2108
+ .Lmulx4x_epilogue:
2109
+ ret
2110
+ .size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2111
+
2112
+ .type mulx4x_internal,\@abi-omnipotent
2113
+ .align 32
2114
+ mulx4x_internal:
2115
+ .byte 0x4c,0x89,0x8c,0x24,0x08,0x00,0x00,0x00 # mov $num,8(%rsp) # save -$num
2116
+ .byte 0x67
2117
+ neg $num # restore $num
2118
+ shl \$5,$num
2119
+ lea 256($bp,$num),%r13
2120
+ shr \$5+5,$num
2121
+ mov `($win64?56:8)`(%rax),%r10d # load 7th argument
2122
+ sub \$1,$num
2123
+ mov %r13,16+8(%rsp) # end of b[num]
2124
+ mov $num,24+8(%rsp) # inner counter
2125
+ mov $rp, 56+8(%rsp) # save $rp
2126
+ ___
2127
+ my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)=
2128
+ ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
2129
+ my $rptr=$bptr;
2130
+ my $STRIDE=2**5*8; # 5 is "window size"
2131
+ my $N=$STRIDE/4; # should match cache line size
2132
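+
+ # Layout of the power table walked below (and produced by bn_scatter5
+ # near the end of the file): with a 32-entry window and 8-byte words,
+ # word $j of power $i sits at 8-byte slot $j*32 + $i, i.e. the powers
+ # are interleaved with a $STRIDE-byte stride between consecutive
+ # words of one power. Index arithmetic only; the generated code
+ # selects through the .Lmagic_masks masks so the access pattern does
+ # not depend on $i.
+ sub scatter_sketch {
+     my ($tbl, $pow, $i) = @_;          # $pow: limbs of the $i-th power
+     $tbl->[$_ * 32 + $i] = $pow->[$_] for 0 .. $#$pow;
+ }
+ sub gather_sketch {
+     my ($tbl, $num, $i) = @_;
+     return [ map { $tbl->[$_ * 32 + $i] } 0 .. $num - 1 ];
+ }
+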
+ $code.=<<___;
2133
+ mov %r10,%r11
2134
+ shr \$`log($N/8)/log(2)`,%r10
2135
+ and \$`$N/8-1`,%r11
2136
+ not %r10
2137
+ lea .Lmagic_masks(%rip),%rax
2138
+ and \$`2**5/($N/8)-1`,%r10 # 5 is "window size"
2139
+ lea 96($bp,%r11,8),$bptr # pointer within 1st cache line
2140
+ movq 0(%rax,%r10,8),%xmm4 # set of masks denoting which
2141
+ movq 8(%rax,%r10,8),%xmm5 # cache line contains element
2142
+ add \$7,%r11
2143
+ movq 16(%rax,%r10,8),%xmm6 # denoted by 7th argument
2144
+ movq 24(%rax,%r10,8),%xmm7
2145
+ and \$7,%r11
2146
+
2147
+ movq `0*$STRIDE/4-96`($bptr),%xmm0
2148
+ lea $STRIDE($bptr),$tptr # borrow $tptr
2149
+ movq `1*$STRIDE/4-96`($bptr),%xmm1
2150
+ pand %xmm4,%xmm0
2151
+ movq `2*$STRIDE/4-96`($bptr),%xmm2
2152
+ pand %xmm5,%xmm1
2153
+ movq `3*$STRIDE/4-96`($bptr),%xmm3
2154
+ pand %xmm6,%xmm2
2155
+ por %xmm1,%xmm0
2156
+ movq `0*$STRIDE/4-96`($tptr),%xmm1
2157
+ pand %xmm7,%xmm3
2158
+ por %xmm2,%xmm0
2159
+ movq `1*$STRIDE/4-96`($tptr),%xmm2
2160
+ por %xmm3,%xmm0
2161
+ .byte 0x67,0x67
2162
+ pand %xmm4,%xmm1
2163
+ movq `2*$STRIDE/4-96`($tptr),%xmm3
2164
+
2165
+ movq %xmm0,%rdx # bp[0]
2166
+ movq `3*$STRIDE/4-96`($tptr),%xmm0
2167
+ lea 2*$STRIDE($bptr),$bptr # next &b[i]
2168
+ pand %xmm5,%xmm2
2169
+ .byte 0x67,0x67
2170
+ pand %xmm6,%xmm3
2171
+ ##############################################################
2172
+ # $tptr is chosen so that writing to top-most element of the
2173
+ # vector occurs just "above" references to powers table,
2174
+ # "above" modulo cache-line size, which effectively precludes
2175
+ # possibility of memory disambiguation logic failure when
2176
+ # accessing the table.
2177
+ #
2178
+ lea 64+8*4+8(%rsp,%r11,8),$tptr
2179
+
2180
+ mov %rdx,$bi
2181
+ mulx 0*8($aptr),$mi,%rax # a[0]*b[0]
2182
+ mulx 1*8($aptr),%r11,%r12 # a[1]*b[0]
2183
+ add %rax,%r11
2184
+ mulx 2*8($aptr),%rax,%r13 # ...
2185
+ adc %rax,%r12
2186
+ adc \$0,%r13
2187
+ mulx 3*8($aptr),%rax,%r14
2188
+
2189
+ mov $mi,%r15
2190
+ imulq 32+8(%rsp),$mi # "t[0]"*n0
2191
+ xor $zero,$zero # cf=0, of=0
2192
+ mov $mi,%rdx
2193
+
2194
+ por %xmm2,%xmm1
2195
+ pand %xmm7,%xmm0
2196
+ por %xmm3,%xmm1
2197
+ mov $bptr,8+8(%rsp) # off-load &b[i]
2198
+ por %xmm1,%xmm0
2199
+
2200
+ .byte 0x48,0x8d,0xb6,0x20,0x00,0x00,0x00 # lea 4*8($aptr),$aptr
2201
+ adcx %rax,%r13
2202
+ adcx $zero,%r14 # cf=0
2203
+
2204
+ mulx 0*16($nptr),%rax,%r10
2205
+ adcx %rax,%r15 # discarded
2206
+ adox %r11,%r10
2207
+ mulx 1*16($nptr),%rax,%r11
2208
+ adcx %rax,%r10
2209
+ adox %r12,%r11
2210
+ mulx 2*16($nptr),%rax,%r12
2211
+ mov 24+8(%rsp),$bptr # counter value
2212
+ .byte 0x66
2213
+ mov %r10,-8*4($tptr)
2214
+ adcx %rax,%r11
2215
+ adox %r13,%r12
2216
+ mulx 3*16($nptr),%rax,%r15
2217
+ .byte 0x67,0x67
2218
+ mov $bi,%rdx
2219
+ mov %r11,-8*3($tptr)
2220
+ adcx %rax,%r12
2221
+ adox $zero,%r15 # of=0
2222
+ .byte 0x48,0x8d,0x89,0x40,0x00,0x00,0x00 # lea 4*16($nptr),$nptr
2223
+ mov %r12,-8*2($tptr)
2224
+ #jmp .Lmulx4x_1st
2225
+
2226
+ .align 32
2227
+ .Lmulx4x_1st:
2228
+ adcx $zero,%r15 # cf=0, modulo-scheduled
2229
+ mulx 0*8($aptr),%r10,%rax # a[4]*b[0]
2230
+ adcx %r14,%r10
2231
+ mulx 1*8($aptr),%r11,%r14 # a[5]*b[0]
2232
+ adcx %rax,%r11
2233
+ mulx 2*8($aptr),%r12,%rax # ...
2234
+ adcx %r14,%r12
2235
+ mulx 3*8($aptr),%r13,%r14
2236
+ .byte 0x67,0x67
2237
+ mov $mi,%rdx
2238
+ adcx %rax,%r13
2239
+ adcx $zero,%r14 # cf=0
2240
+ lea 4*8($aptr),$aptr
2241
+ lea 4*8($tptr),$tptr
2242
+
2243
+ adox %r15,%r10
2244
+ mulx 0*16($nptr),%rax,%r15
2245
+ adcx %rax,%r10
2246
+ adox %r15,%r11
2247
+ mulx 1*16($nptr),%rax,%r15
2248
+ adcx %rax,%r11
2249
+ adox %r15,%r12
2250
+ mulx 2*16($nptr),%rax,%r15
2251
+ mov %r10,-5*8($tptr)
2252
+ adcx %rax,%r12
2253
+ mov %r11,-4*8($tptr)
2254
+ adox %r15,%r13
2255
+ mulx 3*16($nptr),%rax,%r15
2256
+ mov $bi,%rdx
2257
+ mov %r12,-3*8($tptr)
2258
+ adcx %rax,%r13
2259
+ adox $zero,%r15
2260
+ lea 4*16($nptr),$nptr
2261
+ mov %r13,-2*8($tptr)
2262
+
2263
+ dec $bptr # of=0, pass cf
2264
+ jnz .Lmulx4x_1st
2265
+
2266
+ mov 8(%rsp),$num # load -num
2267
+ movq %xmm0,%rdx # bp[1]
2268
+ adc $zero,%r15 # modulo-scheduled
2269
+ lea ($aptr,$num),$aptr # rewind $aptr
2270
+ add %r15,%r14
2271
+ mov 8+8(%rsp),$bptr # re-load &b[i]
2272
+ adc $zero,$zero # top-most carry
2273
+ mov %r14,-1*8($tptr)
2274
+ jmp .Lmulx4x_outer
2275
+
2276
+ .align 32
2277
+ .Lmulx4x_outer:
2278
+ mov $zero,($tptr) # save top-most carry
2279
+ lea 4*8($tptr,$num),$tptr # rewind $tptr
2280
+ mulx 0*8($aptr),$mi,%r11 # a[0]*b[i]
2281
+ xor $zero,$zero # cf=0, of=0
2282
+ mov %rdx,$bi
2283
+ mulx 1*8($aptr),%r14,%r12 # a[1]*b[i]
2284
+ adox -4*8($tptr),$mi # +t[0]
2285
+ adcx %r14,%r11
2286
+ mulx 2*8($aptr),%r15,%r13 # ...
2287
+ adox -3*8($tptr),%r11
2288
+ adcx %r15,%r12
2289
+ mulx 3*8($aptr),%rdx,%r14
2290
+ adox -2*8($tptr),%r12
2291
+ adcx %rdx,%r13
2292
+ lea ($nptr,$num,2),$nptr # rewind $nptr
2293
+ lea 4*8($aptr),$aptr
2294
+ adox -1*8($tptr),%r13
2295
+ adcx $zero,%r14
2296
+ adox $zero,%r14
2297
+
2298
+ .byte 0x67
2299
+ mov $mi,%r15
2300
+ imulq 32+8(%rsp),$mi # "t[0]"*n0
2301
+
2302
+ movq `0*$STRIDE/4-96`($bptr),%xmm0
2303
+ .byte 0x67,0x67
2304
+ mov $mi,%rdx
2305
+ movq `1*$STRIDE/4-96`($bptr),%xmm1
2306
+ .byte 0x67
2307
+ pand %xmm4,%xmm0
2308
+ movq `2*$STRIDE/4-96`($bptr),%xmm2
2309
+ .byte 0x67
2310
+ pand %xmm5,%xmm1
2311
+ movq `3*$STRIDE/4-96`($bptr),%xmm3
2312
+ add \$$STRIDE,$bptr # next &b[i]
2313
+ .byte 0x67
2314
+ pand %xmm6,%xmm2
2315
+ por %xmm1,%xmm0
2316
+ pand %xmm7,%xmm3
2317
+ xor $zero,$zero # cf=0, of=0
2318
+ mov $bptr,8+8(%rsp) # off-load &b[i]
2319
+
2320
+ mulx 0*16($nptr),%rax,%r10
2321
+ adcx %rax,%r15 # discarded
2322
+ adox %r11,%r10
2323
+ mulx 1*16($nptr),%rax,%r11
2324
+ adcx %rax,%r10
2325
+ adox %r12,%r11
2326
+ mulx 2*16($nptr),%rax,%r12
2327
+ adcx %rax,%r11
2328
+ adox %r13,%r12
2329
+ mulx 3*16($nptr),%rax,%r15
2330
+ mov $bi,%rdx
2331
+ por %xmm2,%xmm0
2332
+ mov 24+8(%rsp),$bptr # counter value
2333
+ mov %r10,-8*4($tptr)
2334
+ por %xmm3,%xmm0
2335
+ adcx %rax,%r12
2336
+ mov %r11,-8*3($tptr)
2337
+ adox $zero,%r15 # of=0
2338
+ mov %r12,-8*2($tptr)
2339
+ lea 4*16($nptr),$nptr
2340
+ jmp .Lmulx4x_inner
2341
+
2342
+ .align 32
2343
+ .Lmulx4x_inner:
2344
+ mulx 0*8($aptr),%r10,%rax # a[4]*b[i]
2345
+ adcx $zero,%r15 # cf=0, modulo-scheduled
2346
+ adox %r14,%r10
2347
+ mulx 1*8($aptr),%r11,%r14 # a[5]*b[i]
2348
+ adcx 0*8($tptr),%r10
2349
+ adox %rax,%r11
2350
+ mulx 2*8($aptr),%r12,%rax # ...
2351
+ adcx 1*8($tptr),%r11
2352
+ adox %r14,%r12
2353
+ mulx 3*8($aptr),%r13,%r14
2354
+ mov $mi,%rdx
2355
+ adcx 2*8($tptr),%r12
2356
+ adox %rax,%r13
2357
+ adcx 3*8($tptr),%r13
2358
+ adox $zero,%r14 # of=0
2359
+ lea 4*8($aptr),$aptr
2360
+ lea 4*8($tptr),$tptr
2361
+ adcx $zero,%r14 # cf=0
2362
+
2363
+ adox %r15,%r10
2364
+ mulx 0*16($nptr),%rax,%r15
2365
+ adcx %rax,%r10
2366
+ adox %r15,%r11
2367
+ mulx 1*16($nptr),%rax,%r15
2368
+ adcx %rax,%r11
2369
+ adox %r15,%r12
2370
+ mulx 2*16($nptr),%rax,%r15
2371
+ mov %r10,-5*8($tptr)
2372
+ adcx %rax,%r12
2373
+ adox %r15,%r13
2374
+ mov %r11,-4*8($tptr)
2375
+ mulx 3*16($nptr),%rax,%r15
2376
+ mov $bi,%rdx
2377
+ lea 4*16($nptr),$nptr
2378
+ mov %r12,-3*8($tptr)
2379
+ adcx %rax,%r13
2380
+ adox $zero,%r15
2381
+ mov %r13,-2*8($tptr)
2382
+
2383
+ dec $bptr # of=0, pass cf
2384
+ jnz .Lmulx4x_inner
2385
+
2386
+ mov 0+8(%rsp),$num # load -num
2387
+ movq %xmm0,%rdx # bp[i+1]
2388
+ adc $zero,%r15 # modulo-scheduled
2389
+ sub 0*8($tptr),$bptr # pull top-most carry to %cf
2390
+ mov 8+8(%rsp),$bptr # re-load &b[i]
2391
+ mov 16+8(%rsp),%r10
2392
+ adc %r15,%r14
2393
+ lea ($aptr,$num),$aptr # rewind $aptr
2394
+ adc $zero,$zero # top-most carry
2395
+ mov %r14,-1*8($tptr)
2396
+
2397
+ cmp %r10,$bptr
2398
+ jb .Lmulx4x_outer
2399
+
2400
+ mov -16($nptr),%r10
2401
+ xor %r15,%r15
2402
+ sub %r14,%r10 # compare top-most words
2403
+ adc %r15,%r15
2404
+ or %r15,$zero
2405
+ xor \$1,$zero
2406
+ lea ($tptr,$num),%rdi # rewind $tptr
2407
+ lea ($nptr,$num,2),$nptr # rewind $nptr
2408
+ .byte 0x67,0x67
2409
+ sar \$3+2,$num # cf=0
2410
+ lea ($nptr,$zero,8),%rbp
2411
+ mov 56+8(%rsp),%rdx # restore rp
2412
+ mov $num,%rcx
2413
+ jmp .Lsqrx4x_sub # common post-condition
2414
+ .size mulx4x_internal,.-mulx4x_internal
2415
+ ___
2416
+ } {
2417
+ ######################################################################
2418
+ # void bn_power5(
2419
+ my $rptr="%rdi"; # BN_ULONG *rptr,
2420
+ my $aptr="%rsi"; # const BN_ULONG *aptr,
2421
+ my $bptr="%rdx"; # const void *table,
2422
+ my $nptr="%rcx"; # const BN_ULONG *nptr,
2423
+ my $n0 ="%r8"; # const BN_ULONG *n0);
2424
+ my $num ="%r9"; # int num, has to be divisible by 8
2425
+ # int pwr);
2426
+
2427
+ my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
2428
+ my @A0=("%r10","%r11");
2429
+ my @A1=("%r12","%r13");
2430
+ my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
2431
+
2432
+ $code.=<<___;
2433
+ .type bn_powerx5,\@function,6
2434
+ .align 32
2435
+ bn_powerx5:
2436
+ .Lpowerx5_enter:
2437
+ .byte 0x67
2438
+ mov %rsp,%rax
2439
+ push %rbx
2440
+ push %rbp
2441
+ push %r12
2442
+ push %r13
2443
+ push %r14
2444
+ push %r15
2445
+ ___
2446
+ $code.=<<___ if ($win64);
2447
+ lea -0x28(%rsp),%rsp
2448
+ movaps %xmm6,(%rsp)
2449
+ movaps %xmm7,0x10(%rsp)
2450
+ ___
2451
+ $code.=<<___;
2452
+ .byte 0x67
2453
+ mov ${num}d,%r10d
2454
+ shl \$3,${num}d # convert $num to bytes
2455
+ shl \$3+2,%r10d # 4*$num
2456
+ neg $num
2457
+ mov ($n0),$n0 # *n0
2458
+
2459
+ ##############################################################
2460
+ # ensure that stack frame doesn't alias with $aptr+4*$num
2461
+ # modulo 4096, which covers ret[num], am[num] and n[2*num]
2462
+ # (see bn_exp.c). this is done to allow the memory disambiguation
2463
+ # logic to do its magic.
2464
+ #
2465
+ lea -64(%rsp,$num,2),%r11
2466
+ sub $aptr,%r11
2467
+ and \$4095,%r11
2468
+ cmp %r11,%r10
2469
+ jb .Lpwrx_sp_alt
2470
+ sub %r11,%rsp # align with $aptr
2471
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
2472
+ jmp .Lpwrx_sp_done
2473
+
2474
+ .align 32
2475
+ .Lpwrx_sp_alt:
2476
+ lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num
2477
+ lea -64(%rsp,$num,2),%rsp # alloca(frame+2*$num)
2478
+ sub %r10,%r11
2479
+ mov \$0,%r10
2480
+ cmovc %r10,%r11
2481
+ sub %r11,%rsp
2482
+ .Lpwrx_sp_done:
2483
+ and \$-64,%rsp
2484
+ mov $num,%r10
2485
+ neg $num
2486
+
2487
+ ##############################################################
2488
+ # Stack layout
2489
+ #
2490
+ # +0 saved $num, used in reduction section
2491
+ # +8 &t[2*$num], used in reduction section
2492
+ # +16 intermediate carry bit
2493
+ # +24 top-most carry bit, used in reduction section
2494
+ # +32 saved *n0
2495
+ # +40 saved %rsp
2496
+ # +48 t[2*$num]
2497
+ #
2498
+ pxor %xmm0,%xmm0
2499
+ movq $rptr,%xmm1 # save $rptr
2500
+ movq $nptr,%xmm2 # save $nptr
2501
+ movq %r10, %xmm3 # -$num
2502
+ movq $bptr,%xmm4
2503
+ mov $n0, 32(%rsp)
2504
+ mov %rax, 40(%rsp) # save original %rsp
2505
+ .Lpowerx5_body:
2506
+
2507
+ call __bn_sqrx8x_internal
2508
+ call __bn_sqrx8x_internal
2509
+ call __bn_sqrx8x_internal
2510
+ call __bn_sqrx8x_internal
2511
+ call __bn_sqrx8x_internal
2512
+
2513
+ mov %r10,$num # -num
2514
+ mov $aptr,$rptr
2515
+ movq %xmm2,$nptr
2516
+ movq %xmm4,$bptr
2517
+ mov 40(%rsp),%rax
2518
+
2519
+ call mulx4x_internal
2520
+
2521
+ mov 40(%rsp),%rsi # restore %rsp
2522
+ mov \$1,%rax
2523
+ ___
2524
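+
+ # What bn_powerx5 just generated, as a schedule sketch in Perl over
+ # plain integers (small moduli only, native precision, no Montgomery
+ # form): one step of fixed 5-bit-window exponentiation is five
+ # back-to-back squarings followed by one multiplication by a gathered
+ # power of the base.
+ sub power5_step_sketch {
+     my ($acc, $b_pow, $m) = @_;          # $b_pow = base**digit % $m
+     $acc = $acc * $acc % $m for 1 .. 5;  # the five __bn_sqrx8x_internal calls
+     return $acc * $b_pow % $m;           # the mulx4x_internal call
+ }
+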
+ $code.=<<___ if ($win64);
2525
+ movaps -88(%rsi),%xmm6
2526
+ movaps -72(%rsi),%xmm7
2527
+ ___
2528
+ $code.=<<___;
2529
+ mov -48(%rsi),%r15
2530
+ mov -40(%rsi),%r14
2531
+ mov -32(%rsi),%r13
2532
+ mov -24(%rsi),%r12
2533
+ mov -16(%rsi),%rbp
2534
+ mov -8(%rsi),%rbx
2535
+ lea (%rsi),%rsp
2536
+ .Lpowerx5_epilogue:
2537
+ ret
2538
+ .size bn_powerx5,.-bn_powerx5
2539
+
2540
+ .globl bn_sqrx8x_internal
2541
+ .hidden bn_sqrx8x_internal
2542
+ .type bn_sqrx8x_internal,\@abi-omnipotent
2543
+ .align 32
2544
+ bn_sqrx8x_internal:
2545
+ __bn_sqrx8x_internal:
2546
+ ##################################################################
2547
+ # Squaring part:
2548
+ #
2549
+ # a) multiply-n-add everything but a[i]*a[i];
2550
+ # b) shift result of a) by 1 to the left and accumulate
2551
+ # a[i]*a[i] products;
2552
+ #
2553
+ ##################################################################
2554
+ # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
2555
+ # a[1]a[0]
2556
+ # a[2]a[0]
2557
+ # a[3]a[0]
2558
+ # a[2]a[1]
2559
+ # a[3]a[1]
2560
+ # a[3]a[2]
2561
+ #
2562
+ # a[4]a[0]
2563
+ # a[5]a[0]
2564
+ # a[6]a[0]
2565
+ # a[7]a[0]
2566
+ # a[4]a[1]
2567
+ # a[5]a[1]
2568
+ # a[6]a[1]
2569
+ # a[7]a[1]
2570
+ # a[4]a[2]
2571
+ # a[5]a[2]
2572
+ # a[6]a[2]
2573
+ # a[7]a[2]
2574
+ # a[4]a[3]
2575
+ # a[5]a[3]
2576
+ # a[6]a[3]
2577
+ # a[7]a[3]
2578
+ #
2579
+ # a[5]a[4]
2580
+ # a[6]a[4]
2581
+ # a[7]a[4]
2582
+ # a[6]a[5]
2583
+ # a[7]a[5]
2584
+ # a[7]a[6]
2585
+ # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
2586
+ ___
2587
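+
+ # The two-pass strategy described above, restated in scalar Perl with
+ # 16-bit limbs (a sketch, not the generated code): accumulate each
+ # cross product a[i]*a[j] once, double everything with the shift,
+ # then add the diagonal squares a[i]*a[i].
+ sub sqr_sketch {
+     my @a = @_;                          # little-endian limbs
+     my $k = @a;
+     my @t = (0) x (2 * $k);
+     for my $i (0 .. $k - 1) {            # a) everything but a[i]*a[i]
+         $t[$i + $_] += $a[$i] * $a[$_] for $i + 1 .. $k - 1;
+     }
+     $_ *= 2 for @t;                      # b) shift result left by 1 ...
+     $t[2 * $_] += $a[$_] * $a[$_] for 0 .. $k - 1;   # ... and add squares
+     my $c = 0;                           # normalize the piled-up carries
+     for my $i (0 .. 2 * $k - 1) {
+         my $v = $t[$i] + $c;
+         ($t[$i], $c) = ($v & 0xffff, $v >> 16);
+     }
+     return @t;
+ }
+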
+ {
2588
+ my ($zero,$carry)=("%rbp","%rcx");
2589
+ my $aaptr=$zero;
2590
+ $code.=<<___;
2591
+ lea 48+8(%rsp),$tptr
2592
+ lea ($aptr,$num),$aaptr
2593
+ mov $num,0+8(%rsp) # save $num
2594
+ mov $aaptr,8+8(%rsp) # save end of $aptr
2595
+ jmp .Lsqr8x_zero_start
2596
+
2597
+ .align 32
2598
+ .byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
2599
+ .Lsqrx8x_zero:
2600
+ .byte 0x3e
2601
+ movdqa %xmm0,0*8($tptr)
2602
+ movdqa %xmm0,2*8($tptr)
2603
+ movdqa %xmm0,4*8($tptr)
2604
+ movdqa %xmm0,6*8($tptr)
2605
+ .Lsqr8x_zero_start: # aligned at 32
2606
+ movdqa %xmm0,8*8($tptr)
2607
+ movdqa %xmm0,10*8($tptr)
2608
+ movdqa %xmm0,12*8($tptr)
2609
+ movdqa %xmm0,14*8($tptr)
2610
+ lea 16*8($tptr),$tptr
2611
+ sub \$64,$num
2612
+ jnz .Lsqrx8x_zero
2613
+
2614
+ mov 0*8($aptr),%rdx # a[0], modulo-scheduled
2615
+ #xor %r9,%r9 # t[1], ex-$num, zero already
2616
+ xor %r10,%r10
2617
+ xor %r11,%r11
2618
+ xor %r12,%r12
2619
+ xor %r13,%r13
2620
+ xor %r14,%r14
2621
+ xor %r15,%r15
2622
+ lea 48+8(%rsp),$tptr
2623
+ xor $zero,$zero # cf=0, of=0
2624
+ jmp .Lsqrx8x_outer_loop
2625
+
2626
+ .align 32
2627
+ .Lsqrx8x_outer_loop:
2628
+ mulx 1*8($aptr),%r8,%rax # a[1]*a[0]
2629
+ adcx %r9,%r8 # a[1]*a[0]+=t[1]
2630
+ adox %rax,%r10
2631
+ mulx 2*8($aptr),%r9,%rax # a[2]*a[0]
2632
+ adcx %r10,%r9
2633
+ adox %rax,%r11
2634
+ .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%r10,%rax # ...
2635
+ adcx %r11,%r10
2636
+ adox %rax,%r12
2637
+ .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%r11,%rax
2638
+ adcx %r12,%r11
2639
+ adox %rax,%r13
2640
+ mulx 5*8($aptr),%r12,%rax
2641
+ adcx %r13,%r12
2642
+ adox %rax,%r14
2643
+ mulx 6*8($aptr),%r13,%rax
2644
+ adcx %r14,%r13
2645
+ adox %r15,%rax
2646
+ mulx 7*8($aptr),%r14,%r15
2647
+ mov 1*8($aptr),%rdx # a[1]
2648
+ adcx %rax,%r14
2649
+ adox $zero,%r15
2650
+ adc 8*8($tptr),%r15
2651
+ mov %r8,1*8($tptr) # t[1]
2652
+ mov %r9,2*8($tptr) # t[2]
2653
+ sbb $carry,$carry # mov %cf,$carry
2654
+ xor $zero,$zero # cf=0, of=0
2655
+
2656
+
2657
+ mulx 2*8($aptr),%r8,%rbx # a[2]*a[1]
2658
+ mulx 3*8($aptr),%r9,%rax # a[3]*a[1]
2659
+ adcx %r10,%r8
2660
+ adox %rbx,%r9
2661
+ mulx 4*8($aptr),%r10,%rbx # ...
2662
+ adcx %r11,%r9
2663
+ adox %rax,%r10
2664
+ .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 # mulx 5*8($aptr),%r11,%rax
2665
+ adcx %r12,%r10
2666
+ adox %rbx,%r11
2667
+ .byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r12,%rbx
2668
+ adcx %r13,%r11
2669
+ adox %r14,%r12
2670
+ .byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r13,%r14
2671
+ mov 2*8($aptr),%rdx # a[2]
2672
+ adcx %rax,%r12
2673
+ adox %rbx,%r13
2674
+ adcx %r15,%r13
2675
+ adox $zero,%r14 # of=0
2676
+ adcx $zero,%r14 # cf=0
2677
+
2678
+ mov %r8,3*8($tptr) # t[3]
2679
+ mov %r9,4*8($tptr) # t[4]
2680
+
2681
+ mulx 3*8($aptr),%r8,%rbx # a[3]*a[2]
2682
+ mulx 4*8($aptr),%r9,%rax # a[4]*a[2]
2683
+ adcx %r10,%r8
2684
+ adox %rbx,%r9
2685
+ mulx 5*8($aptr),%r10,%rbx # ...
2686
+ adcx %r11,%r9
2687
+ adox %rax,%r10
2688
+ .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r11,%rax
2689
+ adcx %r12,%r10
2690
+ adox %r13,%r11
2691
+ .byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r12,%r13
2692
+ .byte 0x3e
2693
+ mov 3*8($aptr),%rdx # a[3]
2694
+ adcx %rbx,%r11
2695
+ adox %rax,%r12
2696
+ adcx %r14,%r12
2697
+ mov %r8,5*8($tptr) # t[5]
2698
+ mov %r9,6*8($tptr) # t[6]
2699
+ mulx 4*8($aptr),%r8,%rax # a[4]*a[3]
2700
+ adox $zero,%r13 # of=0
2701
+ adcx $zero,%r13 # cf=0
2702
+
2703
+ mulx 5*8($aptr),%r9,%rbx # a[5]*a[3]
2704
+ adcx %r10,%r8
2705
+ adox %rax,%r9
2706
+ mulx 6*8($aptr),%r10,%rax # ...
2707
+ adcx %r11,%r9
2708
+ adox %r12,%r10
2709
+ mulx 7*8($aptr),%r11,%r12
2710
+ mov 4*8($aptr),%rdx # a[4]
2711
+ mov 5*8($aptr),%r14 # a[5]
2712
+ adcx %rbx,%r10
2713
+ adox %rax,%r11
2714
+ mov 6*8($aptr),%r15 # a[6]
2715
+ adcx %r13,%r11
2716
+ adox $zero,%r12 # of=0
2717
+ adcx $zero,%r12 # cf=0
2718
+
2719
+ mov %r8,7*8($tptr) # t[7]
2720
+ mov %r9,8*8($tptr) # t[8]
2721
+
2722
+ mulx %r14,%r9,%rax # a[5]*a[4]
2723
+ mov 7*8($aptr),%r8 # a[7]
2724
+ adcx %r10,%r9
2725
+ mulx %r15,%r10,%rbx # a[6]*a[4]
2726
+ adox %rax,%r10
2727
+ adcx %r11,%r10
2728
+ mulx %r8,%r11,%rax # a[7]*a[4]
2729
+ mov %r14,%rdx # a[5]
2730
+ adox %rbx,%r11
2731
+ adcx %r12,%r11
2732
+ #adox $zero,%rax # of=0
2733
+ adcx $zero,%rax # cf=0
2734
+
2735
+ mulx %r15,%r14,%rbx # a[6]*a[5]
2736
+ mulx %r8,%r12,%r13 # a[7]*a[5]
2737
+ mov %r15,%rdx # a[6]
2738
+ lea 8*8($aptr),$aptr
2739
+ adcx %r14,%r11
2740
+ adox %rbx,%r12
2741
+ adcx %rax,%r12
2742
+ adox $zero,%r13
2743
+
2744
+ .byte 0x67,0x67
2745
+ mulx %r8,%r8,%r14 # a[7]*a[6]
2746
+ adcx %r8,%r13
2747
+ adcx $zero,%r14
2748
+
2749
+ cmp 8+8(%rsp),$aptr
2750
+ je .Lsqrx8x_outer_break
2751
+
2752
+ neg $carry # mov $carry,%cf
2753
+ mov \$-8,%rcx
2754
+ mov $zero,%r15
2755
+ mov 8*8($tptr),%r8
2756
+ adcx 9*8($tptr),%r9 # +=t[9]
2757
+ adcx 10*8($tptr),%r10 # ...
2758
+ adcx 11*8($tptr),%r11
2759
+ adc 12*8($tptr),%r12
2760
+ adc 13*8($tptr),%r13
2761
+ adc 14*8($tptr),%r14
2762
+ adc 15*8($tptr),%r15
2763
+ lea ($aptr),$aaptr
2764
+ lea 2*64($tptr),$tptr
2765
+ sbb %rax,%rax # mov %cf,$carry
2766
+
2767
+ mov -64($aptr),%rdx # a[0]
2768
+ mov %rax,16+8(%rsp) # offload $carry
2769
+ mov $tptr,24+8(%rsp)
2770
+
2771
+ #lea 8*8($tptr),$tptr # see 2*8*8($tptr) above
2772
+ xor %eax,%eax # cf=0, of=0
2773
+ jmp .Lsqrx8x_loop
2774
+
2775
+ .align 32
2776
+ .Lsqrx8x_loop:
2777
+ mov %r8,%rbx
2778
+ mulx 0*8($aaptr),%rax,%r8 # a[8]*a[i]
2779
+ adcx %rax,%rbx # +=t[8]
2780
+ adox %r9,%r8
2781
+
2782
+ mulx 1*8($aaptr),%rax,%r9 # ...
2783
+ adcx %rax,%r8
2784
+ adox %r10,%r9
2785
+
2786
+ mulx 2*8($aaptr),%rax,%r10
2787
+ adcx %rax,%r9
2788
+ adox %r11,%r10
2789
+
2790
+ mulx 3*8($aaptr),%rax,%r11
2791
+ adcx %rax,%r10
2792
+ adox %r12,%r11
2793
+
2794
+ .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 4*8($aaptr),%rax,%r12
2795
+ adcx %rax,%r11
2796
+ adox %r13,%r12
2797
+
2798
+ mulx 5*8($aaptr),%rax,%r13
2799
+ adcx %rax,%r12
2800
+ adox %r14,%r13
2801
+
2802
+ mulx 6*8($aaptr),%rax,%r14
2803
+ mov %rbx,($tptr,%rcx,8) # store t[8+i]
2804
+ mov \$0,%ebx
2805
+ adcx %rax,%r13
2806
+ adox %r15,%r14
2807
+
2808
+ .byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 # mulx 7*8($aaptr),%rax,%r15
2809
+ mov 8($aptr,%rcx,8),%rdx # a[i]
2810
+ adcx %rax,%r14
2811
+ adox %rbx,%r15 # %rbx is 0, of=0
2812
+ adcx %rbx,%r15 # cf=0
2813
+
2814
+ .byte 0x67
2815
+ inc %rcx # of=0
2816
+ jnz .Lsqrx8x_loop
2817
+
2818
+ lea 8*8($aaptr),$aaptr
2819
+ mov \$-8,%rcx
2820
+ cmp 8+8(%rsp),$aaptr # done?
2821
+ je .Lsqrx8x_break
2822
+
2823
+ sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf
2824
+ .byte 0x66
2825
+ mov -64($aptr),%rdx
2826
+ adcx 0*8($tptr),%r8
2827
+ adcx 1*8($tptr),%r9
2828
+ adc 2*8($tptr),%r10
2829
+ adc 3*8($tptr),%r11
2830
+ adc 4*8($tptr),%r12
2831
+ adc 5*8($tptr),%r13
2832
+ adc 6*8($tptr),%r14
2833
+ adc 7*8($tptr),%r15
2834
+ lea 8*8($tptr),$tptr
2835
+ .byte 0x67
2836
+ sbb %rax,%rax # mov %cf,%rax
2837
+ xor %ebx,%ebx # cf=0, of=0
2838
+ mov %rax,16+8(%rsp) # offload carry
2839
+ jmp .Lsqrx8x_loop
2840
+
2841
+ .align 32
2842
+ .Lsqrx8x_break:
2843
+ sub 16+8(%rsp),%r8 # consume last carry
2844
+ mov 24+8(%rsp),$carry # initial $tptr, borrow $carry
2845
+ mov 0*8($aptr),%rdx # a[8], modulo-scheduled
2846
+ xor %ebp,%ebp # xor $zero,$zero
2847
+ mov %r8,0*8($tptr)
2848
+ cmp $carry,$tptr # cf=0, of=0
2849
+ je .Lsqrx8x_outer_loop
2850
+
2851
+ mov %r9,1*8($tptr)
2852
+ mov 1*8($carry),%r9
2853
+ mov %r10,2*8($tptr)
2854
+ mov 2*8($carry),%r10
2855
+ mov %r11,3*8($tptr)
2856
+ mov 3*8($carry),%r11
2857
+ mov %r12,4*8($tptr)
2858
+ mov 4*8($carry),%r12
2859
+ mov %r13,5*8($tptr)
2860
+ mov 5*8($carry),%r13
2861
+ mov %r14,6*8($tptr)
2862
+ mov 6*8($carry),%r14
2863
+ mov %r15,7*8($tptr)
2864
+ mov 7*8($carry),%r15
2865
+ mov $carry,$tptr
2866
+ jmp .Lsqrx8x_outer_loop
2867
+
2868
+ .align 32
2869
+ .Lsqrx8x_outer_break:
2870
+ mov %r9,9*8($tptr) # t[9]
2871
+ movq %xmm3,%rcx # -$num
2872
+ mov %r10,10*8($tptr) # ...
2873
+ mov %r11,11*8($tptr)
2874
+ mov %r12,12*8($tptr)
2875
+ mov %r13,13*8($tptr)
2876
+ mov %r14,14*8($tptr)
2877
+ ___
2878
+ } {
2879
+ my $i="%rcx";
2880
+ $code.=<<___;
2881
+ lea 48+8(%rsp),$tptr
2882
+ mov ($aptr,$i),%rdx # a[0]
2883
+
2884
+ mov 8($tptr),$A0[1] # t[1]
2885
+ xor $A0[0],$A0[0] # t[0], of=0, cf=0
2886
+ mov 0+8(%rsp),$num # restore $num
2887
+ adox $A0[1],$A0[1]
2888
+ mov 16($tptr),$A1[0] # t[2] # prefetch
2889
+ mov 24($tptr),$A1[1] # t[3] # prefetch
2890
+ #jmp .Lsqrx4x_shift_n_add # happens to be aligned
2891
+
2892
+ .align 32
2893
+ .Lsqrx4x_shift_n_add:
2894
+ mulx %rdx,%rax,%rbx
2895
+ adox $A1[0],$A1[0]
2896
+ adcx $A0[0],%rax
2897
+ .byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 # mov 8($aptr,$i),%rdx # a[i+1] # prefetch
2898
+ .byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 # mov 32($tptr),$A0[0] # t[2*i+4] # prefetch
2899
+ adox $A1[1],$A1[1]
2900
+ adcx $A0[1],%rbx
2901
+ mov 40($tptr),$A0[1] # t[2*i+4+1] # prefetch
2902
+ mov %rax,0($tptr)
2903
+ mov %rbx,8($tptr)
2904
+
2905
+ mulx %rdx,%rax,%rbx
2906
+ adox $A0[0],$A0[0]
2907
+ adcx $A1[0],%rax
2908
+ mov 16($aptr,$i),%rdx # a[i+2] # prefetch
2909
+ mov 48($tptr),$A1[0] # t[2*i+6] # prefetch
2910
+ adox $A0[1],$A0[1]
2911
+ adcx $A1[1],%rbx
2912
+ mov 56($tptr),$A1[1] # t[2*i+6+1] # prefetch
2913
+ mov %rax,16($tptr)
2914
+ mov %rbx,24($tptr)
2915
+
2916
+ mulx %rdx,%rax,%rbx
2917
+ adox $A1[0],$A1[0]
2918
+ adcx $A0[0],%rax
2919
+ mov 24($aptr,$i),%rdx # a[i+3] # prefetch
2920
+ lea 32($i),$i
2921
+ mov 64($tptr),$A0[0] # t[2*i+8] # prefetch
2922
+ adox $A1[1],$A1[1]
2923
+ adcx $A0[1],%rbx
2924
+ mov 72($tptr),$A0[1] # t[2*i+8+1] # prefetch
2925
+ mov %rax,32($tptr)
2926
+ mov %rbx,40($tptr)
2927
+
2928
+ mulx %rdx,%rax,%rbx
2929
+ adox $A0[0],$A0[0]
2930
+ adcx $A1[0],%rax
2931
+ jrcxz .Lsqrx4x_shift_n_add_break
2932
+ .byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 # mov 0($aptr,$i),%rdx # a[i+4] # prefetch
2933
+ adox $A0[1],$A0[1]
2934
+ adcx $A1[1],%rbx
2935
+ mov 80($tptr),$A1[0] # t[2*i+10] # prefetch
2936
+ mov 88($tptr),$A1[1] # t[2*i+10+1] # prefetch
2937
+ mov %rax,48($tptr)
2938
+ mov %rbx,56($tptr)
2939
+ lea 64($tptr),$tptr
2940
+ nop
2941
+ jmp .Lsqrx4x_shift_n_add
2942
+
2943
+ .align 32
2944
+ .Lsqrx4x_shift_n_add_break:
2945
+ adcx $A1[1],%rbx
2946
+ mov %rax,48($tptr)
2947
+ mov %rbx,56($tptr)
2948
+ lea 64($tptr),$tptr # end of t[] buffer
2949
+ ___
2950
+ }
2951
+ ######################################################################
2952
+ # Montgomery reduction part, "word-by-word" algorithm.
2953
+ #
2954
+ # This new path is inspired by multiple submissions from Intel, by
2955
+ # Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
2956
+ # Vinodh Gopal...
2957
+ {
2958
+ my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
2959
+
2960
+ $code.=<<___;
2961
+ movq %xmm2,$nptr
2962
+ sqrx8x_reduction:
2963
+ xor %eax,%eax # initial top-most carry bit
2964
+ mov 32+8(%rsp),%rbx # n0
2965
+ mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr)
2966
+ lea -128($nptr,$num,2),%rcx # end of n[]
2967
+ #lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer
2968
+ mov %rcx, 0+8(%rsp) # save end of n[]
2969
+ mov $tptr,8+8(%rsp) # save end of t[]
2970
+
2971
+ lea 48+8(%rsp),$tptr # initial t[] window
2972
+ jmp .Lsqrx8x_reduction_loop
2973
+
2974
+ .align 32
2975
+ .Lsqrx8x_reduction_loop:
2976
+ mov 8*1($tptr),%r9
2977
+ mov 8*2($tptr),%r10
2978
+ mov 8*3($tptr),%r11
2979
+ mov 8*4($tptr),%r12
2980
+ mov %rdx,%r8
2981
+ imulq %rbx,%rdx # n0*a[i]
2982
+ mov 8*5($tptr),%r13
2983
+ mov 8*6($tptr),%r14
2984
+ mov 8*7($tptr),%r15
2985
+ mov %rax,24+8(%rsp) # store top-most carry bit
2986
+
2987
+ lea 8*8($tptr),$tptr
2988
+ xor $carry,$carry # cf=0,of=0
2989
+ mov \$-8,%rcx
2990
+ jmp .Lsqrx8x_reduce
2991
+
2992
+ .align 32
2993
+ .Lsqrx8x_reduce:
2994
+ mov %r8, %rbx
2995
+ mulx 16*0($nptr),%rax,%r8 # n[0]
2996
+ adcx %rbx,%rax # discarded
2997
+ adox %r9,%r8
2998
+
2999
+ mulx 16*1($nptr),%rbx,%r9 # n[1]
3000
+ adcx %rbx,%r8
3001
+ adox %r10,%r9
3002
+
3003
+ mulx 16*2($nptr),%rbx,%r10
3004
+ adcx %rbx,%r9
3005
+ adox %r11,%r10
3006
+
3007
+ mulx 16*3($nptr),%rbx,%r11
3008
+ adcx %rbx,%r10
3009
+ adox %r12,%r11
3010
+
3011
+ .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rbx,%r12
3012
+ mov %rdx,%rax
3013
+ mov %r8,%rdx
3014
+ adcx %rbx,%r11
3015
+ adox %r13,%r12
3016
+
3017
+ mulx 32+8(%rsp),%rbx,%rdx # %rdx discarded
3018
+ mov %rax,%rdx
3019
+ mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i]
3020
+
3021
+ mulx 16*5($nptr),%rax,%r13
3022
+ adcx %rax,%r12
3023
+ adox %r14,%r13
3024
+
3025
+ mulx 16*6($nptr),%rax,%r14
3026
+ adcx %rax,%r13
3027
+ adox %r15,%r14
3028
+
3029
+ mulx 16*7($nptr),%rax,%r15
3030
+ mov %rbx,%rdx
3031
+ adcx %rax,%r14
3032
+ adox $carry,%r15 # $carry is 0
3033
+ adcx $carry,%r15 # cf=0
3034
+
3035
+ .byte 0x67,0x67,0x67
3036
+ inc %rcx # of=0
3037
+ jnz .Lsqrx8x_reduce
3038
+
3039
+ mov $carry,%rax # xor %rax,%rax
3040
+ cmp 0+8(%rsp),$nptr # end of n[]?
3041
+ jae .Lsqrx8x_no_tail
3042
+
3043
+ mov 48+8(%rsp),%rdx # pull n0*a[0]
3044
+ add 8*0($tptr),%r8
3045
+ lea 16*8($nptr),$nptr
3046
+ mov \$-8,%rcx
3047
+ adcx 8*1($tptr),%r9
3048
+ adcx 8*2($tptr),%r10
3049
+ adc 8*3($tptr),%r11
3050
+ adc 8*4($tptr),%r12
3051
+ adc 8*5($tptr),%r13
3052
+ adc 8*6($tptr),%r14
3053
+ adc 8*7($tptr),%r15
3054
+ lea 8*8($tptr),$tptr
3055
+ sbb %rax,%rax # top carry
3056
+
3057
+ xor $carry,$carry # of=0, cf=0
3058
+ mov %rax,16+8(%rsp)
3059
+ jmp .Lsqrx8x_tail
3060
+
3061
+ .align 32
3062
+ .Lsqrx8x_tail:
3063
+ mov %r8,%rbx
3064
+ mulx 16*0($nptr),%rax,%r8
3065
+ adcx %rax,%rbx
3066
+ adox %r9,%r8
3067
+
3068
+ mulx 16*1($nptr),%rax,%r9
3069
+ adcx %rax,%r8
3070
+ adox %r10,%r9
3071
+
3072
+ mulx 16*2($nptr),%rax,%r10
3073
+ adcx %rax,%r9
3074
+ adox %r11,%r10
3075
+
3076
+ mulx 16*3($nptr),%rax,%r11
3077
+ adcx %rax,%r10
3078
+ adox %r12,%r11
3079
+
3080
+ .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x40,0x00,0x00,0x00 # mulx 16*4($nptr),%rax,%r12
3081
+ adcx %rax,%r11
3082
+ adox %r13,%r12
3083
+
3084
+ mulx 16*5($nptr),%rax,%r13
3085
+ adcx %rax,%r12
3086
+ adox %r14,%r13
3087
+
3088
+ mulx 16*6($nptr),%rax,%r14
3089
+ adcx %rax,%r13
3090
+ adox %r15,%r14
3091
+
3092
+ mulx 16*7($nptr),%rax,%r15
3093
+ mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i]
3094
+ adcx %rax,%r14
3095
+ adox $carry,%r15
3096
+ mov %rbx,($tptr,%rcx,8) # save result
3097
+ mov %r8,%rbx
3098
+ adcx $carry,%r15 # cf=0
3099
+
3100
+ inc %rcx # of=0
3101
+ jnz .Lsqrx8x_tail
3102
+
3103
+ cmp 0+8(%rsp),$nptr # end of n[]?
3104
+ jae .Lsqrx8x_tail_done # break out of loop
3105
+
3106
+ sub 16+8(%rsp),$carry # mov 16(%rsp),%cf
3107
+ mov 48+8(%rsp),%rdx # pull n0*a[0]
3108
+ lea 16*8($nptr),$nptr
3109
+ adc 8*0($tptr),%r8
3110
+ adc 8*1($tptr),%r9
3111
+ adc 8*2($tptr),%r10
3112
+ adc 8*3($tptr),%r11
3113
+ adc 8*4($tptr),%r12
3114
+ adc 8*5($tptr),%r13
3115
+ adc 8*6($tptr),%r14
3116
+ adc 8*7($tptr),%r15
3117
+ lea 8*8($tptr),$tptr
3118
+ sbb %rax,%rax
3119
+ sub \$8,%rcx # mov \$-8,%rcx
3120
+
3121
+ xor $carry,$carry # of=0, cf=0
3122
+ mov %rax,16+8(%rsp)
3123
+ jmp .Lsqrx8x_tail
3124
+
3125
+ .align 32
3126
+ .Lsqrx8x_tail_done:
3127
+ add 24+8(%rsp),%r8 # can this overflow?
3128
+ adc \$0,%r9
3129
+ adc \$0,%r10
3130
+ adc \$0,%r11
3131
+ adc \$0,%r12
3132
+ adc \$0,%r13
3133
+ adc \$0,%r14
3134
+ adc \$0,%r15 # can't overflow, because we
3135
+ # started with "overhung" part
3136
+ # of multiplication
3137
+ mov $carry,%rax # xor %rax,%rax
3138
+
3139
+ sub 16+8(%rsp),$carry # mov 16(%rsp),%cf
3140
+ .Lsqrx8x_no_tail: # %cf is 0 if jumped here
3141
+ adc 8*0($tptr),%r8
3142
+ movq %xmm3,%rcx
3143
+ adc 8*1($tptr),%r9
3144
+ mov 16*7($nptr),$carry
3145
+ movq %xmm2,$nptr # restore $nptr
3146
+ adc 8*2($tptr),%r10
3147
+ adc 8*3($tptr),%r11
3148
+ adc 8*4($tptr),%r12
3149
+ adc 8*5($tptr),%r13
3150
+ adc 8*6($tptr),%r14
3151
+ adc 8*7($tptr),%r15
3152
+ adc %rax,%rax # top-most carry
3153
+
3154
+ mov 32+8(%rsp),%rbx # n0
3155
+ mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8"
3156
+
3157
+ mov %r8,8*0($tptr) # store top 512 bits
3158
+ lea 8*8($tptr),%r8 # borrow %r8
3159
+ mov %r9,8*1($tptr)
3160
+ mov %r10,8*2($tptr)
3161
+ mov %r11,8*3($tptr)
3162
+ mov %r12,8*4($tptr)
3163
+ mov %r13,8*5($tptr)
3164
+ mov %r14,8*6($tptr)
3165
+ mov %r15,8*7($tptr)
3166
+
3167
+ lea 8*8($tptr,%rcx),$tptr # start of current t[] window
3168
+ cmp 8+8(%rsp),%r8 # end of t[]?
3169
+ jb .Lsqrx8x_reduction_loop
3170
+ ___
3171
+ }
3172
+ ##############################################################
3173
+ # Post-condition, 4x unrolled
3174
+ #
3175
+ {
3176
+ my ($rptr,$nptr)=("%rdx","%rbp");
3177
+ my @ri=map("%r$_",(10..13));
3178
+ my @ni=map("%r$_",(14..15));
3179
+ $code.=<<___;
3180
+ xor %ebx,%ebx
3181
+ sub %r15,%rsi # compare top-most words
3182
+ adc %rbx,%rbx
3183
+ mov %rcx,%r10 # -$num
3184
+ or %rbx,%rax
3185
+ mov %rcx,%r9 # -$num
3186
+ xor \$1,%rax
3187
+ sar \$3+2,%rcx # cf=0
3188
+ #lea 48+8(%rsp,%r9),$tptr
3189
+ lea ($nptr,%rax,8),$nptr
3190
+ movq %xmm1,$rptr # restore $rptr
3191
+ movq %xmm1,$aptr # prepare for back-to-back call
3192
+ jmp .Lsqrx4x_sub
3193
+
3194
+ .align 32
3195
+ .Lsqrx4x_sub:
3196
+ .byte 0x66
3197
+ mov 8*0($tptr),%r12
3198
+ mov 8*1($tptr),%r13
3199
+ sbb 16*0($nptr),%r12
3200
+ mov 8*2($tptr),%r14
3201
+ sbb 16*1($nptr),%r13
3202
+ mov 8*3($tptr),%r15
3203
+ lea 8*4($tptr),$tptr
3204
+ sbb 16*2($nptr),%r14
3205
+ mov %r12,8*0($rptr)
3206
+ sbb 16*3($nptr),%r15
3207
+ lea 16*4($nptr),$nptr
3208
+ mov %r13,8*1($rptr)
3209
+ mov %r14,8*2($rptr)
3210
+ mov %r15,8*3($rptr)
3211
+ lea 8*4($rptr),$rptr
3212
+
3213
+ inc %rcx
3214
+ jnz .Lsqrx4x_sub
3215
+ ___
3216
+ }
3217
+ $code.=<<___;
3218
+ neg %r9 # restore $num
3219
+
3220
+ ret
3221
+ .size bn_sqrx8x_internal,.-bn_sqrx8x_internal
3222
+ ___
3223
+ }}}
3224
+ {
3225
+ my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order
3226
+ ("%rdi","%esi","%rdx","%ecx"); # Unix order
3227
+ my $out=$inp;
3228
+ my $STRIDE=2**5*8;
3229
+ my $N=$STRIDE/4;
3230
+
3231
+ $code.=<<___;
3232
+ .globl bn_scatter5
3233
+ .type bn_scatter5,\@abi-omnipotent
3234
+ .align 16
3235
+ bn_scatter5:
3236
+ cmp \$0, $num
3237
+ jz .Lscatter_epilogue
3238
+ lea ($tbl,$idx,8),$tbl
3239
+ .Lscatter:
3240
+ mov ($inp),%rax
3241
+ lea 8($inp),$inp
3242
+ mov %rax,($tbl)
3243
+ lea 32*8($tbl),$tbl
3244
+ sub \$1,$num
3245
+ jnz .Lscatter
3246
+ .Lscatter_epilogue:
3247
+ ret
3248
+ .size bn_scatter5,.-bn_scatter5
3249
+
3250
+ .globl bn_gather5
3251
+ .type bn_gather5,\@abi-omnipotent
3252
+ .align 16
3253
+ bn_gather5:
3254
+ ___
3255
+ $code.=<<___ if ($win64);
3256
+ .LSEH_begin_bn_gather5:
3257
+ # I can't trust assembler to use specific encoding:-(
3258
+ .byte 0x48,0x83,0xec,0x28 #sub \$0x28,%rsp
3259
+ .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
3260
+ .byte 0x0f,0x29,0x7c,0x24,0x10 #movaps %xmm7,0x10(%rsp)
3261
+ ___
3262
+ $code.=<<___;
3263
+ mov $idx,%r11d
3264
+ shr \$`log($N/8)/log(2)`,$idx
3265
+ and \$`$N/8-1`,%r11
3266
+ not $idx
3267
+ lea .Lmagic_masks(%rip),%rax
3268
+ and \$`2**5/($N/8)-1`,$idx # 5 is "window size"
3269
+ lea 128($tbl,%r11,8),$tbl # pointer within 1st cache line
3270
+ movq 0(%rax,$idx,8),%xmm4 # set of masks denoting which
3271
+ movq 8(%rax,$idx,8),%xmm5 # cache line contains element
3272
+ movq 16(%rax,$idx,8),%xmm6 # denoted by 4th argument ($idx)
3273
+ movq 24(%rax,$idx,8),%xmm7
3274
+ jmp .Lgather
3275
+ .align 16
3276
+ .Lgather:
3277
+ movq `0*$STRIDE/4-128`($tbl),%xmm0
3278
+ movq `1*$STRIDE/4-128`($tbl),%xmm1
3279
+ pand %xmm4,%xmm0
3280
+ movq `2*$STRIDE/4-128`($tbl),%xmm2
3281
+ pand %xmm5,%xmm1
3282
+ movq `3*$STRIDE/4-128`($tbl),%xmm3
3283
+ pand %xmm6,%xmm2
3284
+ por %xmm1,%xmm0
3285
+ pand %xmm7,%xmm3
3286
+ .byte 0x67,0x67
3287
+ por %xmm2,%xmm0
3288
+ lea $STRIDE($tbl),$tbl
3289
+ por %xmm3,%xmm0
3290
+
3291
+ movq %xmm0,($out) # m0=bp[0]
3292
+ lea 8($out),$out
3293
+ sub \$1,$num
3294
+ jnz .Lgather
3295
+ ___
3296
+ $code.=<<___ if ($win64);
3297
+ movaps (%rsp),%xmm6
3298
+ movaps 0x10(%rsp),%xmm7
3299
+ lea 0x28(%rsp),%rsp
3300
+ ___
3301
+ $code.=<<___;
3302
+ ret
3303
+ .LSEH_end_bn_gather5:
3304
+ .size bn_gather5,.-bn_gather5
3305
+ ___
3306
+ }
3307
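+
+ # How the masked gather above stays cache-timing safe, sketched at
+ # cache-line granularity in Perl: every cache line spanning the table
+ # row is touched and the wanted word is kept with a mask (derived
+ # here with a comparison for brevity; the assembly precomputes it via
+ # the .Lmagic_masks table that follows). Within a line, indexing is
+ # considered safe at cache-line granularity.
+ sub ct_select_sketch {
+     my ($lines, $idx) = @_;              # one candidate word per cache line
+     my $out = 0;
+     for my $i (0 .. $#$lines) {          # touch EVERY cache line ...
+         my $mask = $i == $idx ? ~0 : 0;  # ... keep only the wanted one
+         $out |= $lines->[$i] & $mask;
+     }
+     return $out;
+ }
+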
+ $code.=<<___;
3308
+ .align 64
3309
+ .Lmagic_masks:
3310
+ .long 0,0, 0,0, 0,0, -1,-1
3311
+ .long 0,0, 0,0, 0,0, 0,0
3312
+ .asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
3313
+ ___
3314
+
3315
+ # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3316
+ # CONTEXT *context,DISPATCHER_CONTEXT *disp)
3317
+ if ($win64) {
3318
+ $rec="%rcx";
3319
+ $frame="%rdx";
3320
+ $context="%r8";
3321
+ $disp="%r9";
3322
+
3323
+ $code.=<<___;
3324
+ .extern __imp_RtlVirtualUnwind
3325
+ .type mul_handler,\@abi-omnipotent
3326
+ .align 16
3327
+ mul_handler:
3328
+ push %rsi
3329
+ push %rdi
3330
+ push %rbx
3331
+ push %rbp
3332
+ push %r12
3333
+ push %r13
3334
+ push %r14
3335
+ push %r15
3336
+ pushfq
3337
+ sub \$64,%rsp
3338
+
3339
+ mov 120($context),%rax # pull context->Rax
3340
+ mov 248($context),%rbx # pull context->Rip
3341
+
3342
+ mov 8($disp),%rsi # disp->ImageBase
3343
+ mov 56($disp),%r11 # disp->HandlerData
3344
+
3345
+ mov 0(%r11),%r10d # HandlerData[0]
3346
+ lea (%rsi,%r10),%r10 # end of prologue label
3347
+ cmp %r10,%rbx # context->Rip<end of prologue label
3348
+ jb .Lcommon_seh_tail
3349
+
3350
+ mov 152($context),%rax # pull context->Rsp
3351
+
3352
+ mov 4(%r11),%r10d # HandlerData[1]
3353
+ lea (%rsi,%r10),%r10 # epilogue label
3354
+ cmp %r10,%rbx # context->Rip>=epilogue label
3355
+ jae .Lcommon_seh_tail
3356
+
3357
+ lea .Lmul_epilogue(%rip),%r10
3358
+ cmp %r10,%rbx
3359
+ jb .Lbody_40
3360
+
3361
+ mov 192($context),%r10 # pull $num
3362
+ mov 8(%rax,%r10,8),%rax # pull saved stack pointer
3363
+ jmp .Lbody_proceed
3364
+
3365
+ .Lbody_40:
3366
+ mov 40(%rax),%rax # pull saved stack pointer
3367
+ .Lbody_proceed:
3368
+
3369
+ movaps -88(%rax),%xmm0
3370
+ movaps -72(%rax),%xmm1
3371
+
3372
+ mov -8(%rax),%rbx
3373
+ mov -16(%rax),%rbp
3374
+ mov -24(%rax),%r12
3375
+ mov -32(%rax),%r13
3376
+ mov -40(%rax),%r14
3377
+ mov -48(%rax),%r15
3378
+ mov %rbx,144($context) # restore context->Rbx
3379
+ mov %rbp,160($context) # restore context->Rbp
3380
+ mov %r12,216($context) # restore context->R12
3381
+ mov %r13,224($context) # restore context->R13
3382
+ mov %r14,232($context) # restore context->R14
3383
+ mov %r15,240($context) # restore context->R15
3384
+ movups %xmm0,512($context) # restore context->Xmm6
3385
+ movups %xmm1,528($context) # restore context->Xmm7
3386
+
3387
+ .Lcommon_seh_tail:
3388
+ mov 8(%rax),%rdi
3389
+ mov 16(%rax),%rsi
3390
+ mov %rax,152($context) # restore context->Rsp
3391
+ mov %rsi,168($context) # restore context->Rsi
3392
+ mov %rdi,176($context) # restore context->Rdi
3393
+
3394
+ mov 40($disp),%rdi # disp->ContextRecord
3395
+ mov $context,%rsi # context
3396
+ mov \$154,%ecx # sizeof(CONTEXT)
3397
+ .long 0xa548f3fc # cld; rep movsq
3398
+
3399
+ mov $disp,%rsi
3400
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
3401
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
3402
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
3403
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
3404
+ mov 40(%rsi),%r10 # disp->ContextRecord
3405
+ lea 56(%rsi),%r11 # &disp->HandlerData
3406
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
3407
+ mov %r10,32(%rsp) # arg5
3408
+ mov %r11,40(%rsp) # arg6
3409
+ mov %r12,48(%rsp) # arg7
3410
+ mov %rcx,56(%rsp) # arg8, (NULL)
3411
+ call *__imp_RtlVirtualUnwind(%rip)
3412
+
3413
+ mov \$1,%eax # ExceptionContinueSearch
3414
+ add \$64,%rsp
3415
+ popfq
3416
+ pop %r15
3417
+ pop %r14
3418
+ pop %r13
3419
+ pop %r12
3420
+ pop %rbp
3421
+ pop %rbx
3422
+ pop %rdi
3423
+ pop %rsi
3424
+ ret
3425
+ .size mul_handler,.-mul_handler
3426
+
3427
+ .section .pdata
3428
+ .align 4
3429
+ .rva .LSEH_begin_bn_mul_mont_gather5
3430
+ .rva .LSEH_end_bn_mul_mont_gather5
3431
+ .rva .LSEH_info_bn_mul_mont_gather5
3432
+
3433
+ .rva .LSEH_begin_bn_mul4x_mont_gather5
3434
+ .rva .LSEH_end_bn_mul4x_mont_gather5
3435
+ .rva .LSEH_info_bn_mul4x_mont_gather5
3436
+
3437
+ .rva .LSEH_begin_bn_power5
3438
+ .rva .LSEH_end_bn_power5
3439
+ .rva .LSEH_info_bn_power5
3440
+
3441
+ .rva .LSEH_begin_bn_from_mont8x
3442
+ .rva .LSEH_end_bn_from_mont8x
3443
+ .rva .LSEH_info_bn_from_mont8x
3444
+ ___
3445
+ $code.=<<___ if ($addx);
3446
+ .rva .LSEH_begin_bn_mulx4x_mont_gather5
3447
+ .rva .LSEH_end_bn_mulx4x_mont_gather5
3448
+ .rva .LSEH_info_bn_mulx4x_mont_gather5
3449
+
3450
+ .rva .LSEH_begin_bn_powerx5
3451
+ .rva .LSEH_end_bn_powerx5
3452
+ .rva .LSEH_info_bn_powerx5
3453
+ ___
3454
+ $code.=<<___;
3455
+ .rva .LSEH_begin_bn_gather5
3456
+ .rva .LSEH_end_bn_gather5
3457
+ .rva .LSEH_info_bn_gather5
3458
+
3459
+ .section .xdata
3460
+ .align 8
3461
+ .LSEH_info_bn_mul_mont_gather5:
3462
+ .byte 9,0,0,0
3463
+ .rva mul_handler
3464
+ .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
3465
+ .align 8
3466
+ .LSEH_info_bn_mul4x_mont_gather5:
3467
+ .byte 9,0,0,0
3468
+ .rva mul_handler
3469
+ .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[]
3470
+ .align 8
3471
+ .LSEH_info_bn_power5:
3472
+ .byte 9,0,0,0
3473
+ .rva mul_handler
3474
+ .rva .Lpower5_body,.Lpower5_epilogue # HandlerData[]
3475
+ .align 8
3476
+ .LSEH_info_bn_from_mont8x:
3477
+ .byte 9,0,0,0
3478
+ .rva mul_handler
3479
+ .rva .Lfrom_body,.Lfrom_epilogue # HandlerData[]
3480
+ ___
3481
+ $code.=<<___ if ($addx);
3482
+ .align 8
3483
+ .LSEH_info_bn_mulx4x_mont_gather5:
3484
+ .byte 9,0,0,0
3485
+ .rva mul_handler
3486
+ .rva .Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[]
3487
+ .align 8
3488
+ .LSEH_info_bn_powerx5:
3489
+ .byte 9,0,0,0
3490
+ .rva mul_handler
3491
+ .rva .Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[]
3492
+ ___
3493
+ $code.=<<___;
3494
+ .align 8
3495
+ .LSEH_info_bn_gather5:
3496
+ .byte 0x01,0x0d,0x05,0x00
3497
+ .byte 0x0d,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
3498
+ .byte 0x08,0x68,0x00,0x00 #movaps (rsp),xmm6
3499
+ .byte 0x04,0x42,0x00,0x00 #sub rsp,0x28
3500
+ .align 8
3501
+ ___
3502
+ }
3503
+
3504
+ $code =~ s/\`([^\`]*)\`/eval($1)/gem;
3505
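+ # The substitution above folds the backticked compile-time arithmetic
+ # into literal immediates; scalars such as $STRIDE were already
+ # interpolated when each heredoc was built, so e.g. `3*256/4-96`
+ # becomes 96 in the emitted assembly.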
+
3506
+ print $code;
3507
+ close STDOUT;