ring-native 0.0.0

Files changed (261)
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/Gemfile +3 -0
  4. data/README.md +22 -0
  5. data/Rakefile +1 -0
  6. data/ext/ring/extconf.rb +29 -0
  7. data/lib/ring/native.rb +8 -0
  8. data/lib/ring/native/version.rb +5 -0
  9. data/ring-native.gemspec +25 -0
  10. data/vendor/ring/BUILDING.md +40 -0
  11. data/vendor/ring/Cargo.toml +43 -0
  12. data/vendor/ring/LICENSE +185 -0
  13. data/vendor/ring/Makefile +35 -0
  14. data/vendor/ring/PORTING.md +163 -0
  15. data/vendor/ring/README.md +113 -0
  16. data/vendor/ring/STYLE.md +197 -0
  17. data/vendor/ring/appveyor.yml +27 -0
  18. data/vendor/ring/build.rs +108 -0
  19. data/vendor/ring/crypto/aes/aes.c +1142 -0
  20. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +25 -0
  21. data/vendor/ring/crypto/aes/aes_test.cc +93 -0
  22. data/vendor/ring/crypto/aes/asm/aes-586.pl +2368 -0
  23. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +1249 -0
  24. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +2246 -0
  25. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +1318 -0
  26. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +2084 -0
  27. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +675 -0
  28. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +1364 -0
  29. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +1565 -0
  30. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +841 -0
  31. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +1116 -0
  32. data/vendor/ring/crypto/aes/internal.h +87 -0
  33. data/vendor/ring/crypto/aes/mode_wrappers.c +61 -0
  34. data/vendor/ring/crypto/bn/add.c +394 -0
  35. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +694 -0
  36. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +1503 -0
  37. data/vendor/ring/crypto/bn/asm/bn-586.pl +774 -0
  38. data/vendor/ring/crypto/bn/asm/co-586.pl +287 -0
  39. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +1882 -0
  40. data/vendor/ring/crypto/bn/asm/x86-mont.pl +592 -0
  41. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +599 -0
  42. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +1393 -0
  43. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +3507 -0
  44. data/vendor/ring/crypto/bn/bn.c +352 -0
  45. data/vendor/ring/crypto/bn/bn_asn1.c +74 -0
  46. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +25 -0
  47. data/vendor/ring/crypto/bn/bn_test.cc +1696 -0
  48. data/vendor/ring/crypto/bn/cmp.c +200 -0
  49. data/vendor/ring/crypto/bn/convert.c +433 -0
  50. data/vendor/ring/crypto/bn/ctx.c +311 -0
  51. data/vendor/ring/crypto/bn/div.c +594 -0
  52. data/vendor/ring/crypto/bn/exponentiation.c +1335 -0
  53. data/vendor/ring/crypto/bn/gcd.c +711 -0
  54. data/vendor/ring/crypto/bn/generic.c +1019 -0
  55. data/vendor/ring/crypto/bn/internal.h +316 -0
  56. data/vendor/ring/crypto/bn/montgomery.c +516 -0
  57. data/vendor/ring/crypto/bn/mul.c +888 -0
  58. data/vendor/ring/crypto/bn/prime.c +829 -0
  59. data/vendor/ring/crypto/bn/random.c +334 -0
  60. data/vendor/ring/crypto/bn/rsaz_exp.c +262 -0
  61. data/vendor/ring/crypto/bn/rsaz_exp.h +53 -0
  62. data/vendor/ring/crypto/bn/shift.c +276 -0
  63. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +25 -0
  64. data/vendor/ring/crypto/bytestring/bytestring_test.cc +421 -0
  65. data/vendor/ring/crypto/bytestring/cbb.c +399 -0
  66. data/vendor/ring/crypto/bytestring/cbs.c +227 -0
  67. data/vendor/ring/crypto/bytestring/internal.h +46 -0
  68. data/vendor/ring/crypto/chacha/chacha_generic.c +140 -0
  69. data/vendor/ring/crypto/chacha/chacha_vec.c +323 -0
  70. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +1447 -0
  71. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +153 -0
  72. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +25 -0
  73. data/vendor/ring/crypto/cipher/e_aes.c +390 -0
  74. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +208 -0
  75. data/vendor/ring/crypto/cipher/internal.h +173 -0
  76. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +543 -0
  77. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +9 -0
  78. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +475 -0
  79. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +23 -0
  80. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +422 -0
  81. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +484 -0
  82. data/vendor/ring/crypto/cipher/test/cipher_test.txt +100 -0
  83. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +25 -0
  84. data/vendor/ring/crypto/constant_time_test.c +304 -0
  85. data/vendor/ring/crypto/cpu-arm-asm.S +32 -0
  86. data/vendor/ring/crypto/cpu-arm.c +199 -0
  87. data/vendor/ring/crypto/cpu-intel.c +261 -0
  88. data/vendor/ring/crypto/crypto.c +151 -0
  89. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +2118 -0
  90. data/vendor/ring/crypto/curve25519/curve25519.c +4888 -0
  91. data/vendor/ring/crypto/curve25519/x25519_test.cc +128 -0
  92. data/vendor/ring/crypto/digest/md32_common.h +181 -0
  93. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +2725 -0
  94. data/vendor/ring/crypto/ec/ec.c +193 -0
  95. data/vendor/ring/crypto/ec/ec_curves.c +61 -0
  96. data/vendor/ring/crypto/ec/ec_key.c +228 -0
  97. data/vendor/ring/crypto/ec/ec_montgomery.c +114 -0
  98. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +25 -0
  99. data/vendor/ring/crypto/ec/internal.h +243 -0
  100. data/vendor/ring/crypto/ec/oct.c +253 -0
  101. data/vendor/ring/crypto/ec/p256-64.c +1794 -0
  102. data/vendor/ring/crypto/ec/p256-x86_64-table.h +9548 -0
  103. data/vendor/ring/crypto/ec/p256-x86_64.c +509 -0
  104. data/vendor/ring/crypto/ec/simple.c +1007 -0
  105. data/vendor/ring/crypto/ec/util-64.c +183 -0
  106. data/vendor/ring/crypto/ec/wnaf.c +508 -0
  107. data/vendor/ring/crypto/ecdh/ecdh.c +155 -0
  108. data/vendor/ring/crypto/ecdsa/ecdsa.c +304 -0
  109. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +193 -0
  110. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +25 -0
  111. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +327 -0
  112. data/vendor/ring/crypto/header_removed.h +17 -0
  113. data/vendor/ring/crypto/internal.h +495 -0
  114. data/vendor/ring/crypto/libring.Windows.vcxproj +101 -0
  115. data/vendor/ring/crypto/mem.c +98 -0
  116. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +1045 -0
  117. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +517 -0
  118. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +1393 -0
  119. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +1741 -0
  120. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +422 -0
  121. data/vendor/ring/crypto/modes/ctr.c +226 -0
  122. data/vendor/ring/crypto/modes/gcm.c +1206 -0
  123. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +25 -0
  124. data/vendor/ring/crypto/modes/gcm_test.c +348 -0
  125. data/vendor/ring/crypto/modes/internal.h +299 -0
  126. data/vendor/ring/crypto/perlasm/arm-xlate.pl +170 -0
  127. data/vendor/ring/crypto/perlasm/readme +100 -0
  128. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +1164 -0
  129. data/vendor/ring/crypto/perlasm/x86asm.pl +292 -0
  130. data/vendor/ring/crypto/perlasm/x86gas.pl +263 -0
  131. data/vendor/ring/crypto/perlasm/x86masm.pl +200 -0
  132. data/vendor/ring/crypto/perlasm/x86nasm.pl +187 -0
  133. data/vendor/ring/crypto/poly1305/poly1305.c +331 -0
  134. data/vendor/ring/crypto/poly1305/poly1305_arm.c +301 -0
  135. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +2015 -0
  136. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +25 -0
  137. data/vendor/ring/crypto/poly1305/poly1305_test.cc +80 -0
  138. data/vendor/ring/crypto/poly1305/poly1305_test.txt +52 -0
  139. data/vendor/ring/crypto/poly1305/poly1305_vec.c +892 -0
  140. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +75 -0
  141. data/vendor/ring/crypto/rand/internal.h +32 -0
  142. data/vendor/ring/crypto/rand/rand.c +189 -0
  143. data/vendor/ring/crypto/rand/urandom.c +219 -0
  144. data/vendor/ring/crypto/rand/windows.c +56 -0
  145. data/vendor/ring/crypto/refcount_c11.c +66 -0
  146. data/vendor/ring/crypto/refcount_lock.c +53 -0
  147. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +25 -0
  148. data/vendor/ring/crypto/refcount_test.c +58 -0
  149. data/vendor/ring/crypto/rsa/blinding.c +462 -0
  150. data/vendor/ring/crypto/rsa/internal.h +108 -0
  151. data/vendor/ring/crypto/rsa/padding.c +300 -0
  152. data/vendor/ring/crypto/rsa/rsa.c +450 -0
  153. data/vendor/ring/crypto/rsa/rsa_asn1.c +261 -0
  154. data/vendor/ring/crypto/rsa/rsa_impl.c +944 -0
  155. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +25 -0
  156. data/vendor/ring/crypto/rsa/rsa_test.cc +437 -0
  157. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +436 -0
  158. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +2390 -0
  159. data/vendor/ring/crypto/sha/asm/sha256-586.pl +1275 -0
  160. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +735 -0
  161. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +14 -0
  162. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +14 -0
  163. data/vendor/ring/crypto/sha/asm/sha512-586.pl +911 -0
  164. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +666 -0
  165. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +14 -0
  166. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +14 -0
  167. data/vendor/ring/crypto/sha/sha1.c +271 -0
  168. data/vendor/ring/crypto/sha/sha256.c +204 -0
  169. data/vendor/ring/crypto/sha/sha512.c +355 -0
  170. data/vendor/ring/crypto/test/file_test.cc +326 -0
  171. data/vendor/ring/crypto/test/file_test.h +181 -0
  172. data/vendor/ring/crypto/test/malloc.cc +150 -0
  173. data/vendor/ring/crypto/test/scoped_types.h +95 -0
  174. data/vendor/ring/crypto/test/test.Windows.vcxproj +35 -0
  175. data/vendor/ring/crypto/test/test_util.cc +46 -0
  176. data/vendor/ring/crypto/test/test_util.h +41 -0
  177. data/vendor/ring/crypto/thread_none.c +55 -0
  178. data/vendor/ring/crypto/thread_pthread.c +165 -0
  179. data/vendor/ring/crypto/thread_test.Windows.vcxproj +25 -0
  180. data/vendor/ring/crypto/thread_test.c +200 -0
  181. data/vendor/ring/crypto/thread_win.c +282 -0
  182. data/vendor/ring/examples/checkdigest.rs +103 -0
  183. data/vendor/ring/include/openssl/aes.h +121 -0
  184. data/vendor/ring/include/openssl/arm_arch.h +129 -0
  185. data/vendor/ring/include/openssl/base.h +156 -0
  186. data/vendor/ring/include/openssl/bn.h +794 -0
  187. data/vendor/ring/include/openssl/buffer.h +18 -0
  188. data/vendor/ring/include/openssl/bytestring.h +235 -0
  189. data/vendor/ring/include/openssl/chacha.h +37 -0
  190. data/vendor/ring/include/openssl/cmac.h +76 -0
  191. data/vendor/ring/include/openssl/cpu.h +184 -0
  192. data/vendor/ring/include/openssl/crypto.h +43 -0
  193. data/vendor/ring/include/openssl/curve25519.h +88 -0
  194. data/vendor/ring/include/openssl/ec.h +225 -0
  195. data/vendor/ring/include/openssl/ec_key.h +129 -0
  196. data/vendor/ring/include/openssl/ecdh.h +110 -0
  197. data/vendor/ring/include/openssl/ecdsa.h +156 -0
  198. data/vendor/ring/include/openssl/err.h +201 -0
  199. data/vendor/ring/include/openssl/mem.h +101 -0
  200. data/vendor/ring/include/openssl/obj_mac.h +71 -0
  201. data/vendor/ring/include/openssl/opensslfeatures.h +68 -0
  202. data/vendor/ring/include/openssl/opensslv.h +18 -0
  203. data/vendor/ring/include/openssl/ossl_typ.h +18 -0
  204. data/vendor/ring/include/openssl/poly1305.h +51 -0
  205. data/vendor/ring/include/openssl/rand.h +70 -0
  206. data/vendor/ring/include/openssl/rsa.h +399 -0
  207. data/vendor/ring/include/openssl/thread.h +133 -0
  208. data/vendor/ring/include/openssl/type_check.h +71 -0
  209. data/vendor/ring/mk/Common.props +63 -0
  210. data/vendor/ring/mk/Windows.props +42 -0
  211. data/vendor/ring/mk/WindowsTest.props +18 -0
  212. data/vendor/ring/mk/appveyor.bat +62 -0
  213. data/vendor/ring/mk/bottom_of_makefile.mk +54 -0
  214. data/vendor/ring/mk/ring.mk +266 -0
  215. data/vendor/ring/mk/top_of_makefile.mk +214 -0
  216. data/vendor/ring/mk/travis.sh +40 -0
  217. data/vendor/ring/mk/update-travis-yml.py +229 -0
  218. data/vendor/ring/ring.sln +153 -0
  219. data/vendor/ring/src/aead.rs +682 -0
  220. data/vendor/ring/src/agreement.rs +248 -0
  221. data/vendor/ring/src/c.rs +129 -0
  222. data/vendor/ring/src/constant_time.rs +37 -0
  223. data/vendor/ring/src/der.rs +96 -0
  224. data/vendor/ring/src/digest.rs +690 -0
  225. data/vendor/ring/src/digest_tests.txt +57 -0
  226. data/vendor/ring/src/ecc.rs +28 -0
  227. data/vendor/ring/src/ecc_build.rs +279 -0
  228. data/vendor/ring/src/ecc_curves.rs +117 -0
  229. data/vendor/ring/src/ed25519_tests.txt +2579 -0
  230. data/vendor/ring/src/exe_tests.rs +46 -0
  231. data/vendor/ring/src/ffi.rs +29 -0
  232. data/vendor/ring/src/file_test.rs +187 -0
  233. data/vendor/ring/src/hkdf.rs +153 -0
  234. data/vendor/ring/src/hkdf_tests.txt +59 -0
  235. data/vendor/ring/src/hmac.rs +414 -0
  236. data/vendor/ring/src/hmac_tests.txt +97 -0
  237. data/vendor/ring/src/input.rs +312 -0
  238. data/vendor/ring/src/lib.rs +41 -0
  239. data/vendor/ring/src/pbkdf2.rs +265 -0
  240. data/vendor/ring/src/pbkdf2_tests.txt +113 -0
  241. data/vendor/ring/src/polyfill.rs +57 -0
  242. data/vendor/ring/src/rand.rs +28 -0
  243. data/vendor/ring/src/signature.rs +314 -0
  244. data/vendor/ring/third-party/NIST/README.md +9 -0
  245. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +263 -0
  246. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +309 -0
  247. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +267 -0
  248. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +263 -0
  249. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +309 -0
  250. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +267 -0
  251. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +263 -0
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +309 -0
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +267 -0
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +519 -0
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +309 -0
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +523 -0
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +519 -0
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +309 -0
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +523 -0
  260. data/vendor/ring/third-party/NIST/sha256sums.txt +1 -0
  261. metadata +333 -0
data/vendor/ring/crypto/bn/asm/co-586.pl
@@ -0,0 +1,287 @@
+ #!/usr/local/bin/perl
+
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ push(@INC,"${dir}","${dir}../../perlasm");
+ require "x86asm.pl";
+
+ &asm_init($ARGV[0],$0);
+
+ &bn_mul_comba("bn_mul_comba8",8);
+ &bn_mul_comba("bn_mul_comba4",4);
+ &bn_sqr_comba("bn_sqr_comba8",8);
+ &bn_sqr_comba("bn_sqr_comba4",4);
+
+ &asm_finish();
+
+ sub mul_add_c
+ {
+ local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+ # pos == -1 if eax and edx are pre-loaded, 0 to load from next
+ # words, and 1 if load return value
+
+ &comment("mul a[$ai]*b[$bi]");
+
+ # "eax" and "edx" will always be pre-loaded.
+ # &mov("eax",&DWP($ai*4,$a,"",0)) ;
+ # &mov("edx",&DWP($bi*4,$b,"",0));
+
+ &mul("edx");
+ &add($c0,"eax");
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
+ &mov("eax",&wparam(0)) if $pos > 0; # load r[]
+ ###
+ &adc($c1,"edx");
+ &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # load next b
+ &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # load next b
+ ###
+ &adc($c2,0);
+ # if pos > 1, it means it is the last loop
+ &mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next a
+ }
+
+ sub sqr_add_c
+ {
+ local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+ # pos == -1 if eax and edx are pre-loaded, 0 to load from next
+ # words, and 1 if load return value
+
+ &comment("sqr a[$ai]*a[$bi]");
+
+ # "eax" and "edx" will always be pre-loaded.
+ # &mov("eax",&DWP($ai*4,$a,"",0)) ;
+ # &mov("edx",&DWP($bi*4,$b,"",0));
+
+ if ($ai == $bi)
+ { &mul("eax");}
+ else
+ { &mul("edx");}
+ &add($c0,"eax");
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
+ ###
+ &adc($c1,"edx");
+ &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
+ ###
+ &adc($c2,0);
+ # if pos > 1, it means it is the last loop
+ &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
+ }
+
+ sub sqr_add_c2
+ {
+ local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+ # pos == -1 if eax and edx are pre-loaded, 0 to load from next
+ # words, and 1 if load return value
+
+ &comment("sqr a[$ai]*a[$bi]");
+
+ # "eax" and "edx" will always be pre-loaded.
+ # &mov("eax",&DWP($ai*4,$a,"",0)) ;
+ # &mov("edx",&DWP($bi*4,$a,"",0));
+
+ if ($ai == $bi)
+ { &mul("eax");}
+ else
+ { &mul("edx");}
+ &add("eax","eax");
+ ###
+ &adc("edx","edx");
+ ###
+ &adc($c2,0);
+ &add($c0,"eax");
+ &adc($c1,"edx");
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
+ &adc($c2,0);
+ &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
+ &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
+ ###
+ }
+
+ sub bn_mul_comba
+ {
+ local($name,$num)=@_;
+ local($a,$b,$c0,$c1,$c2);
+ local($i,$as,$ae,$bs,$be,$ai,$bi);
+ local($tot,$end);
+
+ &function_begin_B($name,"");
+
+ $c0="ebx";
+ $c1="ecx";
+ $c2="ebp";
+ $a="esi";
+ $b="edi";
+
+ $as=0;
+ $ae=0;
+ $bs=0;
+ $be=0;
+ $tot=$num+$num-1;
+
+ &push("esi");
+ &mov($a,&wparam(1));
+ &push("edi");
+ &mov($b,&wparam(2));
+ &push("ebp");
+ &push("ebx");
+
+ &xor($c0,$c0);
+ &mov("eax",&DWP(0,$a,"",0)); # load the first word of a
+ &xor($c1,$c1);
+ &mov("edx",&DWP(0,$b,"",0)); # load the first word of b
+
+ for ($i=0; $i<$tot; $i++)
+ {
+ $ai=$as;
+ $bi=$bs;
+ $end=$be+1;
+
+ &comment("################## Calculate word $i");
+
+ for ($j=$bs; $j<$end; $j++)
+ {
+ &xor($c2,$c2) if ($j == $bs);
+ if (($j+1) == $end)
+ {
+ $v=1;
+ $v=2 if (($i+1) == $tot);
+ }
+ else
+ { $v=0; }
+ if (($j+1) != $end)
+ {
+ $na=($ai-1);
+ $nb=($bi+1);
+ }
+ else
+ {
+ $na=$as+($i < ($num-1));
+ $nb=$bs+($i >= ($num-1));
+ }
+ #printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
+ &mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
+ if ($v)
+ {
+ &comment("saved r[$i]");
+ # &mov("eax",&wparam(0));
+ # &mov(&DWP($i*4,"eax","",0),$c0);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ }
+ $ai--;
+ $bi++;
+ }
+ $as++ if ($i < ($num-1));
+ $ae++ if ($i >= ($num-1));
+
+ $bs++ if ($i >= ($num-1));
+ $be++ if ($i < ($num-1));
+ }
+ &comment("save r[$i]");
+ # &mov("eax",&wparam(0));
+ &mov(&DWP($i*4,"eax","",0),$c0);
+
+ &pop("ebx");
+ &pop("ebp");
+ &pop("edi");
+ &pop("esi");
+ &ret();
+ &function_end_B($name);
+ }
+
+ sub bn_sqr_comba
+ {
+ local($name,$num)=@_;
+ local($r,$a,$c0,$c1,$c2)=@_;
+ local($i,$as,$ae,$bs,$be,$ai,$bi);
+ local($b,$tot,$end,$half);
+
+ &function_begin_B($name,"");
+
+ $c0="ebx";
+ $c1="ecx";
+ $c2="ebp";
+ $a="esi";
+ $r="edi";
+
+ &push("esi");
+ &push("edi");
+ &push("ebp");
+ &push("ebx");
+ &mov($r,&wparam(0));
+ &mov($a,&wparam(1));
+ &xor($c0,$c0);
+ &xor($c1,$c1);
+ &mov("eax",&DWP(0,$a,"",0)); # load the first word
+
+ $as=0;
+ $ae=0;
+ $bs=0;
+ $be=0;
+ $tot=$num+$num-1;
+
+ for ($i=0; $i<$tot; $i++)
+ {
+ $ai=$as;
+ $bi=$bs;
+ $end=$be+1;
+
+ &comment("############### Calculate word $i");
+ for ($j=$bs; $j<$end; $j++)
+ {
+ &xor($c2,$c2) if ($j == $bs);
+ if (($ai-1) < ($bi+1))
+ {
+ $v=1;
+ $v=2 if ($i+1) == $tot;
+ }
+ else
+ { $v=0; }
+ if (!$v)
+ {
+ $na=$ai-1;
+ $nb=$bi+1;
+ }
+ else
+ {
+ $na=$as+($i < ($num-1));
+ $nb=$bs+($i >= ($num-1));
+ }
+ if ($ai == $bi)
+ {
+ &sqr_add_c($r,$a,$ai,$bi,
+ $c0,$c1,$c2,$v,$i,$na,$nb);
+ }
+ else
+ {
+ &sqr_add_c2($r,$a,$ai,$bi,
+ $c0,$c1,$c2,$v,$i,$na,$nb);
+ }
+ if ($v)
+ {
+ &comment("saved r[$i]");
+ #&mov(&DWP($i*4,$r,"",0),$c0);
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
+ last;
+ }
+ $ai--;
+ $bi++;
+ }
+ $as++ if ($i < ($num-1));
+ $ae++ if ($i >= ($num-1));
+
+ $bs++ if ($i >= ($num-1));
+ $be++ if ($i < ($num-1));
+ }
+ &mov(&DWP($i*4,$r,"",0),$c0);
+ &pop("ebx");
+ &pop("ebp");
+ &pop("edi");
+ &pop("esi");
+ &ret();
+ &function_end_B($name);
+ }
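
The bn_mul_comba and bn_sqr_comba generators above emit column-wise (Comba) multiplication: all partial products of the same weight are accumulated into a rotating three-register carry chain (ebx/ecx/ebp above) before the finished word is stored and the registers rotate for the next column. The following Python sketch only illustrates that word-level schedule with 32-bit limbs; it is not part of the gem and its names are made up for clarity.

MASK32 = 0xFFFFFFFF

def mul_comba(a, b, num=8):
    # Column-wise (Comba) multiplication of two num-limb numbers with 32-bit limbs.
    # c0/c1/c2 play the roles of ebx/ecx/ebp in the generated assembly.
    r = [0] * (2 * num)
    c0 = c1 = c2 = 0
    for k in range(2 * num - 1):                       # one pass per result word
        for i in range(max(0, k - num + 1), min(k, num - 1) + 1):
            t = a[i] * b[k - i]                        # mul_add_c: one 32x32 -> 64-bit product
            c0 += t & MASK32
            c1 += (t >> 32) + (c0 >> 32)               # carry out of c0
            c0 &= MASK32
            c2 += c1 >> 32                             # carry out of c1
            c1 &= MASK32
        r[k] = c0                                      # "save r[k]"
        c0, c1, c2 = c1, c2, 0                         # rotate the accumulators
    r[2 * num - 1] = c0
    return r

# Quick self-check against Python's arbitrary-precision integers.
val = lambda limbs: sum(w << (32 * i) for i, w in enumerate(limbs))
a = [0x89ABCDEF, 0x01234567, 3, 4, 5, 6, 7, 0xFFFFFFFF]
b = [0xFFFFFFFF, 2, 3, 4, 5, 6, 7, 0x80000000]
assert val(mul_comba(a, b)) == val(a) * val(b)
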
data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl
@@ -0,0 +1,1882 @@
+ #!/usr/bin/env perl
+
+ ##############################################################################
+ # #
+ # Copyright (c) 2012, Intel Corporation #
+ # #
+ # All rights reserved. #
+ # #
+ # Redistribution and use in source and binary forms, with or without #
+ # modification, are permitted provided that the following conditions are #
+ # met: #
+ # #
+ # * Redistributions of source code must retain the above copyright #
+ # notice, this list of conditions and the following disclaimer. #
+ # #
+ # * Redistributions in binary form must reproduce the above copyright #
+ # notice, this list of conditions and the following disclaimer in the #
+ # documentation and/or other materials provided with the #
+ # distribution. #
+ # #
+ # * Neither the name of the Intel Corporation nor the names of its #
+ # contributors may be used to endorse or promote products derived from #
+ # this software without specific prior written permission. #
+ # #
+ # #
+ # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY #
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
+ # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
+ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
+ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
+ # #
+ ##############################################################################
+ # Developers and authors: #
+ # Shay Gueron (1, 2), and Vlad Krasnov (1) #
+ # (1) Intel Corporation, Israel Development Center, Haifa, Israel #
+ # (2) University of Haifa, Israel #
+ ##############################################################################
+ # Reference: #
+ # [1] S. Gueron, V. Krasnov: "Software Implementation of Modular #
+ # Exponentiation, Using Advanced Vector Instructions Architectures", #
+ # F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369, #
+ # pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012 #
+ # [2] S. Gueron: "Efficient Software Implementations of Modular #
+ # Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012). #
+ # [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring",IEEE #
+ # Proceedings of 9th International Conference on Information Technology: #
+ # New Generations (ITNG 2012), pp.821-823 (2012) #
+ # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
+ # resistant 1024-bit modular exponentiation, for optimizing RSA2048 #
+ # on AVX2 capable x86_64 platforms", #
+ # http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest#
+ ##############################################################################
+ #
+ # +13% improvement over original submission by <appro@openssl.org>
+ #
+ # rsa2048 sign/sec OpenSSL 1.0.1 scalar(*) this
+ # 2.3GHz Haswell 621 765/+23% 1113/+79%
+ # 2.3GHz Broadwell(**) 688 1200(***)/+74% 1120/+63%
+ #
+ # (*) if system doesn't support AVX2, for reference purposes;
+ # (**) scaled to 2.3GHz to simplify comparison;
+ # (***) scalar AD*X code is faster than AVX2 and is preferred code
+ # path for Broadwell;
+
+ $flavour = shift;
+ $output = shift;
+ if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+ $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+ die "can't locate x86_64-xlate.pl";
+
+ # In upstream, this is controlled by shelling out to the compiler to check
+ # versions, but BoringSSL is intended to be used with pre-generated perlasm
+ # output, so this isn't useful anyway.
+ #
+ # TODO(davidben): Enable these after testing. $avx goes up to 2 and $addx to 1.
+ $avx = 0;
+ $addx = 0;
+
+ open OUT,"| \"$^X\" $xlate $flavour $output";
+ *STDOUT = *OUT;
+
+ if ($avx>1) {{{
+ { # void AMS_WW(
+ my $rp="%rdi"; # BN_ULONG *rp,
+ my $ap="%rsi"; # const BN_ULONG *ap,
+ my $np="%rdx"; # const BN_ULONG *np,
+ my $n0="%ecx"; # const BN_ULONG n0,
+ my $rep="%r8d"; # int repeat);
+
+ # The registers that hold the accumulated redundant result
+ # The AMM works on 1024 bit operands, and redundant word size is 29
+ # Therefore: ceil(1024/29)/4 = 9
+ my $ACC0="%ymm0";
+ my $ACC1="%ymm1";
+ my $ACC2="%ymm2";
+ my $ACC3="%ymm3";
+ my $ACC4="%ymm4";
+ my $ACC5="%ymm5";
+ my $ACC6="%ymm6";
+ my $ACC7="%ymm7";
+ my $ACC8="%ymm8";
+ my $ACC9="%ymm9";
+ # Registers that hold the broadcasted words of bp, currently used
+ my $B1="%ymm10";
+ my $B2="%ymm11";
+ # Registers that hold the broadcasted words of Y, currently used
+ my $Y1="%ymm12";
+ my $Y2="%ymm13";
+ # Helper registers
+ my $TEMP1="%ymm14";
+ my $AND_MASK="%ymm15";
+ # alu registers that hold the first words of the ACC
+ my $r0="%r9";
+ my $r1="%r10";
+ my $r2="%r11";
+ my $r3="%r12";
+
+ my $i="%r14d"; # loop counter
+ my $tmp = "%r15";
+
+ my $FrameSize=32*18+32*8; # place for A^2 and 2*A
+
+ my $aap=$r0;
+ my $tp0="%rbx";
+ my $tp1=$r3;
+ my $tpa=$tmp;
+
+ $np="%r13"; # reassigned argument
+
141
+ $code.=<<___;
142
+ .text
143
+
144
+ .globl rsaz_1024_sqr_avx2
145
+ .type rsaz_1024_sqr_avx2,\@function,5
146
+ .align 64
147
+ rsaz_1024_sqr_avx2: # 702 cycles, 14% faster than rsaz_1024_mul_avx2
148
+ lea (%rsp), %rax
149
+ push %rbx
150
+ push %rbp
151
+ push %r12
152
+ push %r13
153
+ push %r14
154
+ push %r15
155
+ vzeroupper
156
+ ___
157
+ $code.=<<___ if ($win64);
158
+ lea -0xa8(%rsp),%rsp
159
+ vmovaps %xmm6,-0xd8(%rax)
160
+ vmovaps %xmm7,-0xc8(%rax)
161
+ vmovaps %xmm8,-0xb8(%rax)
162
+ vmovaps %xmm9,-0xa8(%rax)
163
+ vmovaps %xmm10,-0x98(%rax)
164
+ vmovaps %xmm11,-0x88(%rax)
165
+ vmovaps %xmm12,-0x78(%rax)
166
+ vmovaps %xmm13,-0x68(%rax)
167
+ vmovaps %xmm14,-0x58(%rax)
168
+ vmovaps %xmm15,-0x48(%rax)
169
+ .Lsqr_1024_body:
170
+ ___
171
+ $code.=<<___;
172
+ mov %rax,%rbp
173
+ mov %rdx, $np # reassigned argument
174
+ sub \$$FrameSize, %rsp
175
+ mov $np, $tmp
176
+ sub \$-128, $rp # size optimization
177
+ sub \$-128, $ap
178
+ sub \$-128, $np
179
+
180
+ and \$4095, $tmp # see if $np crosses page
181
+ add \$32*10, $tmp
182
+ shr \$12, $tmp
183
+ vpxor $ACC9,$ACC9,$ACC9
184
+ jz .Lsqr_1024_no_n_copy
185
+
186
+ # unaligned 256-bit load that crosses page boundary can
187
+ # cause >2x performance degradation here, so if $np does
188
+ # cross page boundary, copy it to stack and make sure stack
189
+ # frame doesn't...
190
+ sub \$32*10,%rsp
191
+ vmovdqu 32*0-128($np), $ACC0
192
+ and \$-2048, %rsp
193
+ vmovdqu 32*1-128($np), $ACC1
194
+ vmovdqu 32*2-128($np), $ACC2
195
+ vmovdqu 32*3-128($np), $ACC3
196
+ vmovdqu 32*4-128($np), $ACC4
197
+ vmovdqu 32*5-128($np), $ACC5
198
+ vmovdqu 32*6-128($np), $ACC6
199
+ vmovdqu 32*7-128($np), $ACC7
200
+ vmovdqu 32*8-128($np), $ACC8
201
+ lea $FrameSize+128(%rsp),$np
202
+ vmovdqu $ACC0, 32*0-128($np)
203
+ vmovdqu $ACC1, 32*1-128($np)
204
+ vmovdqu $ACC2, 32*2-128($np)
205
+ vmovdqu $ACC3, 32*3-128($np)
206
+ vmovdqu $ACC4, 32*4-128($np)
207
+ vmovdqu $ACC5, 32*5-128($np)
208
+ vmovdqu $ACC6, 32*6-128($np)
209
+ vmovdqu $ACC7, 32*7-128($np)
210
+ vmovdqu $ACC8, 32*8-128($np)
211
+ vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero
212
+
213
+ .Lsqr_1024_no_n_copy:
214
+ and \$-1024, %rsp
215
+
216
+ vmovdqu 32*1-128($ap), $ACC1
217
+ vmovdqu 32*2-128($ap), $ACC2
218
+ vmovdqu 32*3-128($ap), $ACC3
219
+ vmovdqu 32*4-128($ap), $ACC4
220
+ vmovdqu 32*5-128($ap), $ACC5
221
+ vmovdqu 32*6-128($ap), $ACC6
222
+ vmovdqu 32*7-128($ap), $ACC7
223
+ vmovdqu 32*8-128($ap), $ACC8
224
+
225
+ lea 192(%rsp), $tp0 # 64+128=192
226
+ vpbroadcastq .Land_mask(%rip), $AND_MASK
227
+ jmp .LOOP_GRANDE_SQR_1024
228
+
229
+ .align 32
230
+ .LOOP_GRANDE_SQR_1024:
231
+ lea 32*18+128(%rsp), $aap # size optimization
232
+ lea 448(%rsp), $tp1 # 64+128+256=448
233
+
234
+ # the squaring is performed as described in Variant B of
235
+ # "Speeding up Big-Number Squaring", so start by calculating
236
+ # the A*2=A+A vector
237
+ vpaddq $ACC1, $ACC1, $ACC1
238
+ vpbroadcastq 32*0-128($ap), $B1
239
+ vpaddq $ACC2, $ACC2, $ACC2
240
+ vmovdqa $ACC1, 32*0-128($aap)
241
+ vpaddq $ACC3, $ACC3, $ACC3
242
+ vmovdqa $ACC2, 32*1-128($aap)
243
+ vpaddq $ACC4, $ACC4, $ACC4
244
+ vmovdqa $ACC3, 32*2-128($aap)
245
+ vpaddq $ACC5, $ACC5, $ACC5
246
+ vmovdqa $ACC4, 32*3-128($aap)
247
+ vpaddq $ACC6, $ACC6, $ACC6
248
+ vmovdqa $ACC5, 32*4-128($aap)
249
+ vpaddq $ACC7, $ACC7, $ACC7
250
+ vmovdqa $ACC6, 32*5-128($aap)
251
+ vpaddq $ACC8, $ACC8, $ACC8
252
+ vmovdqa $ACC7, 32*6-128($aap)
253
+ vpxor $ACC9, $ACC9, $ACC9
254
+ vmovdqa $ACC8, 32*7-128($aap)
255
+
256
+ vpmuludq 32*0-128($ap), $B1, $ACC0
257
+ vpbroadcastq 32*1-128($ap), $B2
258
+ vmovdqu $ACC9, 32*9-192($tp0) # zero upper half
259
+ vpmuludq $B1, $ACC1, $ACC1
260
+ vmovdqu $ACC9, 32*10-448($tp1)
261
+ vpmuludq $B1, $ACC2, $ACC2
262
+ vmovdqu $ACC9, 32*11-448($tp1)
263
+ vpmuludq $B1, $ACC3, $ACC3
264
+ vmovdqu $ACC9, 32*12-448($tp1)
265
+ vpmuludq $B1, $ACC4, $ACC4
266
+ vmovdqu $ACC9, 32*13-448($tp1)
267
+ vpmuludq $B1, $ACC5, $ACC5
268
+ vmovdqu $ACC9, 32*14-448($tp1)
269
+ vpmuludq $B1, $ACC6, $ACC6
270
+ vmovdqu $ACC9, 32*15-448($tp1)
271
+ vpmuludq $B1, $ACC7, $ACC7
272
+ vmovdqu $ACC9, 32*16-448($tp1)
273
+ vpmuludq $B1, $ACC8, $ACC8
274
+ vpbroadcastq 32*2-128($ap), $B1
275
+ vmovdqu $ACC9, 32*17-448($tp1)
276
+
277
+ mov $ap, $tpa
278
+ mov \$4, $i
279
+ jmp .Lsqr_entry_1024
280
+ ___
281
+ $TEMP0=$Y1;
282
+ $TEMP2=$Y2;
283
+ $code.=<<___;
284
+ .align 32
285
+ .LOOP_SQR_1024:
286
+ vpbroadcastq 32*1-128($tpa), $B2
287
+ vpmuludq 32*0-128($ap), $B1, $ACC0
288
+ vpaddq 32*0-192($tp0), $ACC0, $ACC0
289
+ vpmuludq 32*0-128($aap), $B1, $ACC1
290
+ vpaddq 32*1-192($tp0), $ACC1, $ACC1
291
+ vpmuludq 32*1-128($aap), $B1, $ACC2
292
+ vpaddq 32*2-192($tp0), $ACC2, $ACC2
293
+ vpmuludq 32*2-128($aap), $B1, $ACC3
294
+ vpaddq 32*3-192($tp0), $ACC3, $ACC3
295
+ vpmuludq 32*3-128($aap), $B1, $ACC4
296
+ vpaddq 32*4-192($tp0), $ACC4, $ACC4
297
+ vpmuludq 32*4-128($aap), $B1, $ACC5
298
+ vpaddq 32*5-192($tp0), $ACC5, $ACC5
299
+ vpmuludq 32*5-128($aap), $B1, $ACC6
300
+ vpaddq 32*6-192($tp0), $ACC6, $ACC6
301
+ vpmuludq 32*6-128($aap), $B1, $ACC7
302
+ vpaddq 32*7-192($tp0), $ACC7, $ACC7
303
+ vpmuludq 32*7-128($aap), $B1, $ACC8
304
+ vpbroadcastq 32*2-128($tpa), $B1
305
+ vpaddq 32*8-192($tp0), $ACC8, $ACC8
306
+ .Lsqr_entry_1024:
307
+ vmovdqu $ACC0, 32*0-192($tp0)
308
+ vmovdqu $ACC1, 32*1-192($tp0)
309
+
310
+ vpmuludq 32*1-128($ap), $B2, $TEMP0
311
+ vpaddq $TEMP0, $ACC2, $ACC2
312
+ vpmuludq 32*1-128($aap), $B2, $TEMP1
313
+ vpaddq $TEMP1, $ACC3, $ACC3
314
+ vpmuludq 32*2-128($aap), $B2, $TEMP2
315
+ vpaddq $TEMP2, $ACC4, $ACC4
316
+ vpmuludq 32*3-128($aap), $B2, $TEMP0
317
+ vpaddq $TEMP0, $ACC5, $ACC5
318
+ vpmuludq 32*4-128($aap), $B2, $TEMP1
319
+ vpaddq $TEMP1, $ACC6, $ACC6
320
+ vpmuludq 32*5-128($aap), $B2, $TEMP2
321
+ vpaddq $TEMP2, $ACC7, $ACC7
322
+ vpmuludq 32*6-128($aap), $B2, $TEMP0
323
+ vpaddq $TEMP0, $ACC8, $ACC8
324
+ vpmuludq 32*7-128($aap), $B2, $ACC0
325
+ vpbroadcastq 32*3-128($tpa), $B2
326
+ vpaddq 32*9-192($tp0), $ACC0, $ACC0
327
+
328
+ vmovdqu $ACC2, 32*2-192($tp0)
329
+ vmovdqu $ACC3, 32*3-192($tp0)
330
+
331
+ vpmuludq 32*2-128($ap), $B1, $TEMP2
332
+ vpaddq $TEMP2, $ACC4, $ACC4
333
+ vpmuludq 32*2-128($aap), $B1, $TEMP0
334
+ vpaddq $TEMP0, $ACC5, $ACC5
335
+ vpmuludq 32*3-128($aap), $B1, $TEMP1
336
+ vpaddq $TEMP1, $ACC6, $ACC6
337
+ vpmuludq 32*4-128($aap), $B1, $TEMP2
338
+ vpaddq $TEMP2, $ACC7, $ACC7
339
+ vpmuludq 32*5-128($aap), $B1, $TEMP0
340
+ vpaddq $TEMP0, $ACC8, $ACC8
341
+ vpmuludq 32*6-128($aap), $B1, $TEMP1
342
+ vpaddq $TEMP1, $ACC0, $ACC0
343
+ vpmuludq 32*7-128($aap), $B1, $ACC1
344
+ vpbroadcastq 32*4-128($tpa), $B1
345
+ vpaddq 32*10-448($tp1), $ACC1, $ACC1
346
+
347
+ vmovdqu $ACC4, 32*4-192($tp0)
348
+ vmovdqu $ACC5, 32*5-192($tp0)
349
+
350
+ vpmuludq 32*3-128($ap), $B2, $TEMP0
351
+ vpaddq $TEMP0, $ACC6, $ACC6
352
+ vpmuludq 32*3-128($aap), $B2, $TEMP1
353
+ vpaddq $TEMP1, $ACC7, $ACC7
354
+ vpmuludq 32*4-128($aap), $B2, $TEMP2
355
+ vpaddq $TEMP2, $ACC8, $ACC8
356
+ vpmuludq 32*5-128($aap), $B2, $TEMP0
357
+ vpaddq $TEMP0, $ACC0, $ACC0
358
+ vpmuludq 32*6-128($aap), $B2, $TEMP1
359
+ vpaddq $TEMP1, $ACC1, $ACC1
360
+ vpmuludq 32*7-128($aap), $B2, $ACC2
361
+ vpbroadcastq 32*5-128($tpa), $B2
362
+ vpaddq 32*11-448($tp1), $ACC2, $ACC2
363
+
364
+ vmovdqu $ACC6, 32*6-192($tp0)
365
+ vmovdqu $ACC7, 32*7-192($tp0)
366
+
367
+ vpmuludq 32*4-128($ap), $B1, $TEMP0
368
+ vpaddq $TEMP0, $ACC8, $ACC8
369
+ vpmuludq 32*4-128($aap), $B1, $TEMP1
370
+ vpaddq $TEMP1, $ACC0, $ACC0
371
+ vpmuludq 32*5-128($aap), $B1, $TEMP2
372
+ vpaddq $TEMP2, $ACC1, $ACC1
373
+ vpmuludq 32*6-128($aap), $B1, $TEMP0
374
+ vpaddq $TEMP0, $ACC2, $ACC2
375
+ vpmuludq 32*7-128($aap), $B1, $ACC3
376
+ vpbroadcastq 32*6-128($tpa), $B1
377
+ vpaddq 32*12-448($tp1), $ACC3, $ACC3
378
+
379
+ vmovdqu $ACC8, 32*8-192($tp0)
380
+ vmovdqu $ACC0, 32*9-192($tp0)
381
+ lea 8($tp0), $tp0
382
+
383
+ vpmuludq 32*5-128($ap), $B2, $TEMP2
384
+ vpaddq $TEMP2, $ACC1, $ACC1
385
+ vpmuludq 32*5-128($aap), $B2, $TEMP0
386
+ vpaddq $TEMP0, $ACC2, $ACC2
387
+ vpmuludq 32*6-128($aap), $B2, $TEMP1
388
+ vpaddq $TEMP1, $ACC3, $ACC3
389
+ vpmuludq 32*7-128($aap), $B2, $ACC4
390
+ vpbroadcastq 32*7-128($tpa), $B2
391
+ vpaddq 32*13-448($tp1), $ACC4, $ACC4
392
+
393
+ vmovdqu $ACC1, 32*10-448($tp1)
394
+ vmovdqu $ACC2, 32*11-448($tp1)
395
+
396
+ vpmuludq 32*6-128($ap), $B1, $TEMP0
397
+ vpaddq $TEMP0, $ACC3, $ACC3
398
+ vpmuludq 32*6-128($aap), $B1, $TEMP1
399
+ vpbroadcastq 32*8-128($tpa), $ACC0 # borrow $ACC0 for $B1
400
+ vpaddq $TEMP1, $ACC4, $ACC4
401
+ vpmuludq 32*7-128($aap), $B1, $ACC5
402
+ vpbroadcastq 32*0+8-128($tpa), $B1 # for next iteration
403
+ vpaddq 32*14-448($tp1), $ACC5, $ACC5
404
+
405
+ vmovdqu $ACC3, 32*12-448($tp1)
406
+ vmovdqu $ACC4, 32*13-448($tp1)
407
+ lea 8($tpa), $tpa
408
+
409
+ vpmuludq 32*7-128($ap), $B2, $TEMP0
410
+ vpaddq $TEMP0, $ACC5, $ACC5
411
+ vpmuludq 32*7-128($aap), $B2, $ACC6
412
+ vpaddq 32*15-448($tp1), $ACC6, $ACC6
413
+
414
+ vpmuludq 32*8-128($ap), $ACC0, $ACC7
415
+ vmovdqu $ACC5, 32*14-448($tp1)
416
+ vpaddq 32*16-448($tp1), $ACC7, $ACC7
417
+ vmovdqu $ACC6, 32*15-448($tp1)
418
+ vmovdqu $ACC7, 32*16-448($tp1)
419
+ lea 8($tp1), $tp1
420
+
421
+ dec $i
422
+ jnz .LOOP_SQR_1024
423
+ ___
424
+ $ZERO = $ACC9;
425
+ $TEMP0 = $B1;
426
+ $TEMP2 = $B2;
427
+ $TEMP3 = $Y1;
428
+ $TEMP4 = $Y2;
429
+ $code.=<<___;
430
+ #we need to fix indexes 32-39 to avoid overflow
431
+ vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0),
432
+ vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0)
433
+ vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0)
434
+ lea 192(%rsp), $tp0 # 64+128=192
435
+
436
+ vpsrlq \$29, $ACC8, $TEMP1
437
+ vpand $AND_MASK, $ACC8, $ACC8
438
+ vpsrlq \$29, $ACC1, $TEMP2
439
+ vpand $AND_MASK, $ACC1, $ACC1
440
+
441
+ vpermq \$0x93, $TEMP1, $TEMP1
442
+ vpxor $ZERO, $ZERO, $ZERO
443
+ vpermq \$0x93, $TEMP2, $TEMP2
444
+
445
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
446
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
447
+ vpaddq $TEMP0, $ACC8, $ACC8
448
+ vpblendd \$3, $TEMP2, $ZERO, $TEMP2
449
+ vpaddq $TEMP1, $ACC1, $ACC1
450
+ vpaddq $TEMP2, $ACC2, $ACC2
451
+ vmovdqu $ACC1, 32*9-192($tp0)
452
+ vmovdqu $ACC2, 32*10-192($tp0)
453
+
454
+ mov (%rsp), %rax
455
+ mov 8(%rsp), $r1
456
+ mov 16(%rsp), $r2
457
+ mov 24(%rsp), $r3
458
+ vmovdqu 32*1(%rsp), $ACC1
459
+ vmovdqu 32*2-192($tp0), $ACC2
460
+ vmovdqu 32*3-192($tp0), $ACC3
461
+ vmovdqu 32*4-192($tp0), $ACC4
462
+ vmovdqu 32*5-192($tp0), $ACC5
463
+ vmovdqu 32*6-192($tp0), $ACC6
464
+ vmovdqu 32*7-192($tp0), $ACC7
465
+
466
+ mov %rax, $r0
467
+ imull $n0, %eax
468
+ and \$0x1fffffff, %eax
469
+ vmovd %eax, $Y1
470
+
471
+ mov %rax, %rdx
472
+ imulq -128($np), %rax
473
+ vpbroadcastq $Y1, $Y1
474
+ add %rax, $r0
475
+ mov %rdx, %rax
476
+ imulq 8-128($np), %rax
477
+ shr \$29, $r0
478
+ add %rax, $r1
479
+ mov %rdx, %rax
480
+ imulq 16-128($np), %rax
481
+ add $r0, $r1
482
+ add %rax, $r2
483
+ imulq 24-128($np), %rdx
484
+ add %rdx, $r3
485
+
486
+ mov $r1, %rax
487
+ imull $n0, %eax
488
+ and \$0x1fffffff, %eax
489
+
490
+ mov \$9, $i
491
+ jmp .LOOP_REDUCE_1024
492
+
493
+ .align 32
494
+ .LOOP_REDUCE_1024:
495
+ vmovd %eax, $Y2
496
+ vpbroadcastq $Y2, $Y2
497
+
498
+ vpmuludq 32*1-128($np), $Y1, $TEMP0
499
+ mov %rax, %rdx
500
+ imulq -128($np), %rax
501
+ vpaddq $TEMP0, $ACC1, $ACC1
502
+ add %rax, $r1
503
+ vpmuludq 32*2-128($np), $Y1, $TEMP1
504
+ mov %rdx, %rax
505
+ imulq 8-128($np), %rax
506
+ vpaddq $TEMP1, $ACC2, $ACC2
507
+ vpmuludq 32*3-128($np), $Y1, $TEMP2
508
+ .byte 0x67
509
+ add %rax, $r2
510
+ .byte 0x67
511
+ mov %rdx, %rax
512
+ imulq 16-128($np), %rax
513
+ shr \$29, $r1
514
+ vpaddq $TEMP2, $ACC3, $ACC3
515
+ vpmuludq 32*4-128($np), $Y1, $TEMP0
516
+ add %rax, $r3
517
+ add $r1, $r2
518
+ vpaddq $TEMP0, $ACC4, $ACC4
519
+ vpmuludq 32*5-128($np), $Y1, $TEMP1
520
+ mov $r2, %rax
521
+ imull $n0, %eax
522
+ vpaddq $TEMP1, $ACC5, $ACC5
523
+ vpmuludq 32*6-128($np), $Y1, $TEMP2
524
+ and \$0x1fffffff, %eax
525
+ vpaddq $TEMP2, $ACC6, $ACC6
526
+ vpmuludq 32*7-128($np), $Y1, $TEMP0
527
+ vpaddq $TEMP0, $ACC7, $ACC7
528
+ vpmuludq 32*8-128($np), $Y1, $TEMP1
529
+ vmovd %eax, $Y1
530
+ #vmovdqu 32*1-8-128($np), $TEMP2 # moved below
531
+ vpaddq $TEMP1, $ACC8, $ACC8
532
+ #vmovdqu 32*2-8-128($np), $TEMP0 # moved below
533
+ vpbroadcastq $Y1, $Y1
534
+
535
+ vpmuludq 32*1-8-128($np), $Y2, $TEMP2 # see above
536
+ vmovdqu 32*3-8-128($np), $TEMP1
537
+ mov %rax, %rdx
538
+ imulq -128($np), %rax
539
+ vpaddq $TEMP2, $ACC1, $ACC1
540
+ vpmuludq 32*2-8-128($np), $Y2, $TEMP0 # see above
541
+ vmovdqu 32*4-8-128($np), $TEMP2
542
+ add %rax, $r2
543
+ mov %rdx, %rax
544
+ imulq 8-128($np), %rax
545
+ vpaddq $TEMP0, $ACC2, $ACC2
546
+ add $r3, %rax
547
+ shr \$29, $r2
548
+ vpmuludq $Y2, $TEMP1, $TEMP1
549
+ vmovdqu 32*5-8-128($np), $TEMP0
550
+ add $r2, %rax
551
+ vpaddq $TEMP1, $ACC3, $ACC3
552
+ vpmuludq $Y2, $TEMP2, $TEMP2
553
+ vmovdqu 32*6-8-128($np), $TEMP1
554
+ .byte 0x67
555
+ mov %rax, $r3
556
+ imull $n0, %eax
557
+ vpaddq $TEMP2, $ACC4, $ACC4
558
+ vpmuludq $Y2, $TEMP0, $TEMP0
559
+ .byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 # vmovdqu 32*7-8-128($np), $TEMP2
560
+ and \$0x1fffffff, %eax
561
+ vpaddq $TEMP0, $ACC5, $ACC5
562
+ vpmuludq $Y2, $TEMP1, $TEMP1
563
+ vmovdqu 32*8-8-128($np), $TEMP0
564
+ vpaddq $TEMP1, $ACC6, $ACC6
565
+ vpmuludq $Y2, $TEMP2, $TEMP2
566
+ vmovdqu 32*9-8-128($np), $ACC9
567
+ vmovd %eax, $ACC0 # borrow ACC0 for Y2
568
+ imulq -128($np), %rax
569
+ vpaddq $TEMP2, $ACC7, $ACC7
570
+ vpmuludq $Y2, $TEMP0, $TEMP0
571
+ vmovdqu 32*1-16-128($np), $TEMP1
572
+ vpbroadcastq $ACC0, $ACC0
573
+ vpaddq $TEMP0, $ACC8, $ACC8
574
+ vpmuludq $Y2, $ACC9, $ACC9
575
+ vmovdqu 32*2-16-128($np), $TEMP2
576
+ add %rax, $r3
577
+
578
+ ___
579
+ ($ACC0,$Y2)=($Y2,$ACC0);
580
+ $code.=<<___;
581
+ vmovdqu 32*1-24-128($np), $ACC0
582
+ vpmuludq $Y1, $TEMP1, $TEMP1
583
+ vmovdqu 32*3-16-128($np), $TEMP0
584
+ vpaddq $TEMP1, $ACC1, $ACC1
585
+ vpmuludq $Y2, $ACC0, $ACC0
586
+ vpmuludq $Y1, $TEMP2, $TEMP2
587
+ .byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff # vmovdqu 32*4-16-128($np), $TEMP1
588
+ vpaddq $ACC1, $ACC0, $ACC0
589
+ vpaddq $TEMP2, $ACC2, $ACC2
590
+ vpmuludq $Y1, $TEMP0, $TEMP0
591
+ vmovdqu 32*5-16-128($np), $TEMP2
592
+ .byte 0x67
593
+ vmovq $ACC0, %rax
594
+ vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
595
+ vpaddq $TEMP0, $ACC3, $ACC3
596
+ vpmuludq $Y1, $TEMP1, $TEMP1
597
+ vmovdqu 32*6-16-128($np), $TEMP0
598
+ vpaddq $TEMP1, $ACC4, $ACC4
599
+ vpmuludq $Y1, $TEMP2, $TEMP2
600
+ vmovdqu 32*7-16-128($np), $TEMP1
601
+ vpaddq $TEMP2, $ACC5, $ACC5
602
+ vpmuludq $Y1, $TEMP0, $TEMP0
603
+ vmovdqu 32*8-16-128($np), $TEMP2
604
+ vpaddq $TEMP0, $ACC6, $ACC6
605
+ vpmuludq $Y1, $TEMP1, $TEMP1
606
+ shr \$29, $r3
607
+ vmovdqu 32*9-16-128($np), $TEMP0
608
+ add $r3, %rax
609
+ vpaddq $TEMP1, $ACC7, $ACC7
610
+ vpmuludq $Y1, $TEMP2, $TEMP2
611
+ #vmovdqu 32*2-24-128($np), $TEMP1 # moved below
612
+ mov %rax, $r0
613
+ imull $n0, %eax
614
+ vpaddq $TEMP2, $ACC8, $ACC8
615
+ vpmuludq $Y1, $TEMP0, $TEMP0
616
+ and \$0x1fffffff, %eax
617
+ vmovd %eax, $Y1
618
+ vmovdqu 32*3-24-128($np), $TEMP2
619
+ .byte 0x67
620
+ vpaddq $TEMP0, $ACC9, $ACC9
621
+ vpbroadcastq $Y1, $Y1
622
+
623
+ vpmuludq 32*2-24-128($np), $Y2, $TEMP1 # see above
624
+ vmovdqu 32*4-24-128($np), $TEMP0
625
+ mov %rax, %rdx
626
+ imulq -128($np), %rax
627
+ mov 8(%rsp), $r1
628
+ vpaddq $TEMP1, $ACC2, $ACC1
629
+ vpmuludq $Y2, $TEMP2, $TEMP2
630
+ vmovdqu 32*5-24-128($np), $TEMP1
631
+ add %rax, $r0
632
+ mov %rdx, %rax
633
+ imulq 8-128($np), %rax
634
+ .byte 0x67
635
+ shr \$29, $r0
636
+ mov 16(%rsp), $r2
637
+ vpaddq $TEMP2, $ACC3, $ACC2
638
+ vpmuludq $Y2, $TEMP0, $TEMP0
639
+ vmovdqu 32*6-24-128($np), $TEMP2
640
+ add %rax, $r1
641
+ mov %rdx, %rax
642
+ imulq 16-128($np), %rax
643
+ vpaddq $TEMP0, $ACC4, $ACC3
644
+ vpmuludq $Y2, $TEMP1, $TEMP1
645
+ vmovdqu 32*7-24-128($np), $TEMP0
646
+ imulq 24-128($np), %rdx # future $r3
647
+ add %rax, $r2
648
+ lea ($r0,$r1), %rax
649
+ vpaddq $TEMP1, $ACC5, $ACC4
650
+ vpmuludq $Y2, $TEMP2, $TEMP2
651
+ vmovdqu 32*8-24-128($np), $TEMP1
652
+ mov %rax, $r1
653
+ imull $n0, %eax
654
+ vpmuludq $Y2, $TEMP0, $TEMP0
655
+ vpaddq $TEMP2, $ACC6, $ACC5
656
+ vmovdqu 32*9-24-128($np), $TEMP2
657
+ and \$0x1fffffff, %eax
658
+ vpaddq $TEMP0, $ACC7, $ACC6
659
+ vpmuludq $Y2, $TEMP1, $TEMP1
660
+ add 24(%rsp), %rdx
661
+ vpaddq $TEMP1, $ACC8, $ACC7
662
+ vpmuludq $Y2, $TEMP2, $TEMP2
663
+ vpaddq $TEMP2, $ACC9, $ACC8
664
+ vmovq $r3, $ACC9
665
+ mov %rdx, $r3
666
+
667
+ dec $i
668
+ jnz .LOOP_REDUCE_1024
669
+ ___
670
+ ($ACC0,$Y2)=($Y2,$ACC0);
671
+ $code.=<<___;
672
+ lea 448(%rsp), $tp1 # size optimization
673
+ vpaddq $ACC9, $Y2, $ACC0
674
+ vpxor $ZERO, $ZERO, $ZERO
675
+
676
+ vpaddq 32*9-192($tp0), $ACC0, $ACC0
677
+ vpaddq 32*10-448($tp1), $ACC1, $ACC1
678
+ vpaddq 32*11-448($tp1), $ACC2, $ACC2
679
+ vpaddq 32*12-448($tp1), $ACC3, $ACC3
680
+ vpaddq 32*13-448($tp1), $ACC4, $ACC4
681
+ vpaddq 32*14-448($tp1), $ACC5, $ACC5
682
+ vpaddq 32*15-448($tp1), $ACC6, $ACC6
683
+ vpaddq 32*16-448($tp1), $ACC7, $ACC7
684
+ vpaddq 32*17-448($tp1), $ACC8, $ACC8
685
+
686
+ vpsrlq \$29, $ACC0, $TEMP1
687
+ vpand $AND_MASK, $ACC0, $ACC0
688
+ vpsrlq \$29, $ACC1, $TEMP2
689
+ vpand $AND_MASK, $ACC1, $ACC1
690
+ vpsrlq \$29, $ACC2, $TEMP3
691
+ vpermq \$0x93, $TEMP1, $TEMP1
692
+ vpand $AND_MASK, $ACC2, $ACC2
693
+ vpsrlq \$29, $ACC3, $TEMP4
694
+ vpermq \$0x93, $TEMP2, $TEMP2
695
+ vpand $AND_MASK, $ACC3, $ACC3
696
+ vpermq \$0x93, $TEMP3, $TEMP3
697
+
698
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
699
+ vpermq \$0x93, $TEMP4, $TEMP4
700
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
701
+ vpaddq $TEMP0, $ACC0, $ACC0
702
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
703
+ vpaddq $TEMP1, $ACC1, $ACC1
704
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
705
+ vpaddq $TEMP2, $ACC2, $ACC2
706
+ vpblendd \$3, $TEMP4, $ZERO, $TEMP4
707
+ vpaddq $TEMP3, $ACC3, $ACC3
708
+ vpaddq $TEMP4, $ACC4, $ACC4
709
+
710
+ vpsrlq \$29, $ACC0, $TEMP1
711
+ vpand $AND_MASK, $ACC0, $ACC0
712
+ vpsrlq \$29, $ACC1, $TEMP2
713
+ vpand $AND_MASK, $ACC1, $ACC1
714
+ vpsrlq \$29, $ACC2, $TEMP3
715
+ vpermq \$0x93, $TEMP1, $TEMP1
716
+ vpand $AND_MASK, $ACC2, $ACC2
717
+ vpsrlq \$29, $ACC3, $TEMP4
718
+ vpermq \$0x93, $TEMP2, $TEMP2
719
+ vpand $AND_MASK, $ACC3, $ACC3
720
+ vpermq \$0x93, $TEMP3, $TEMP3
721
+
722
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
723
+ vpermq \$0x93, $TEMP4, $TEMP4
724
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
725
+ vpaddq $TEMP0, $ACC0, $ACC0
726
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
727
+ vpaddq $TEMP1, $ACC1, $ACC1
728
+ vmovdqu $ACC0, 32*0-128($rp)
729
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
730
+ vpaddq $TEMP2, $ACC2, $ACC2
731
+ vmovdqu $ACC1, 32*1-128($rp)
732
+ vpblendd \$3, $TEMP4, $ZERO, $TEMP4
733
+ vpaddq $TEMP3, $ACC3, $ACC3
734
+ vmovdqu $ACC2, 32*2-128($rp)
735
+ vpaddq $TEMP4, $ACC4, $ACC4
736
+ vmovdqu $ACC3, 32*3-128($rp)
737
+ ___
738
+ $TEMP5=$ACC0;
739
+ $code.=<<___;
740
+ vpsrlq \$29, $ACC4, $TEMP1
741
+ vpand $AND_MASK, $ACC4, $ACC4
742
+ vpsrlq \$29, $ACC5, $TEMP2
743
+ vpand $AND_MASK, $ACC5, $ACC5
744
+ vpsrlq \$29, $ACC6, $TEMP3
745
+ vpermq \$0x93, $TEMP1, $TEMP1
746
+ vpand $AND_MASK, $ACC6, $ACC6
747
+ vpsrlq \$29, $ACC7, $TEMP4
748
+ vpermq \$0x93, $TEMP2, $TEMP2
749
+ vpand $AND_MASK, $ACC7, $ACC7
750
+ vpsrlq \$29, $ACC8, $TEMP5
751
+ vpermq \$0x93, $TEMP3, $TEMP3
752
+ vpand $AND_MASK, $ACC8, $ACC8
753
+ vpermq \$0x93, $TEMP4, $TEMP4
754
+
755
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
756
+ vpermq \$0x93, $TEMP5, $TEMP5
757
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
758
+ vpaddq $TEMP0, $ACC4, $ACC4
759
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
760
+ vpaddq $TEMP1, $ACC5, $ACC5
761
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
762
+ vpaddq $TEMP2, $ACC6, $ACC6
763
+ vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
764
+ vpaddq $TEMP3, $ACC7, $ACC7
765
+ vpaddq $TEMP4, $ACC8, $ACC8
766
+
767
+ vpsrlq \$29, $ACC4, $TEMP1
768
+ vpand $AND_MASK, $ACC4, $ACC4
769
+ vpsrlq \$29, $ACC5, $TEMP2
770
+ vpand $AND_MASK, $ACC5, $ACC5
771
+ vpsrlq \$29, $ACC6, $TEMP3
772
+ vpermq \$0x93, $TEMP1, $TEMP1
773
+ vpand $AND_MASK, $ACC6, $ACC6
774
+ vpsrlq \$29, $ACC7, $TEMP4
775
+ vpermq \$0x93, $TEMP2, $TEMP2
776
+ vpand $AND_MASK, $ACC7, $ACC7
777
+ vpsrlq \$29, $ACC8, $TEMP5
778
+ vpermq \$0x93, $TEMP3, $TEMP3
779
+ vpand $AND_MASK, $ACC8, $ACC8
780
+ vpermq \$0x93, $TEMP4, $TEMP4
781
+
782
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
783
+ vpermq \$0x93, $TEMP5, $TEMP5
784
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
785
+ vpaddq $TEMP0, $ACC4, $ACC4
786
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
787
+ vpaddq $TEMP1, $ACC5, $ACC5
788
+ vmovdqu $ACC4, 32*4-128($rp)
789
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
790
+ vpaddq $TEMP2, $ACC6, $ACC6
791
+ vmovdqu $ACC5, 32*5-128($rp)
792
+ vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
793
+ vpaddq $TEMP3, $ACC7, $ACC7
794
+ vmovdqu $ACC6, 32*6-128($rp)
795
+ vpaddq $TEMP4, $ACC8, $ACC8
796
+ vmovdqu $ACC7, 32*7-128($rp)
797
+ vmovdqu $ACC8, 32*8-128($rp)
798
+
799
+ mov $rp, $ap
800
+ dec $rep
801
+ jne .LOOP_GRANDE_SQR_1024
802
+
803
+ vzeroall
804
+ mov %rbp, %rax
805
+ ___
806
+ $code.=<<___ if ($win64);
807
+ movaps -0xd8(%rax),%xmm6
808
+ movaps -0xc8(%rax),%xmm7
809
+ movaps -0xb8(%rax),%xmm8
810
+ movaps -0xa8(%rax),%xmm9
811
+ movaps -0x98(%rax),%xmm10
812
+ movaps -0x88(%rax),%xmm11
813
+ movaps -0x78(%rax),%xmm12
814
+ movaps -0x68(%rax),%xmm13
815
+ movaps -0x58(%rax),%xmm14
816
+ movaps -0x48(%rax),%xmm15
817
+ ___
818
+ $code.=<<___;
819
+ mov -48(%rax),%r15
820
+ mov -40(%rax),%r14
821
+ mov -32(%rax),%r13
822
+ mov -24(%rax),%r12
823
+ mov -16(%rax),%rbp
824
+ mov -8(%rax),%rbx
825
+ lea (%rax),%rsp # restore %rsp
826
+ .Lsqr_1024_epilogue:
827
+ ret
828
+ .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
829
+ ___
830
+ }
831
+
832
+ { # void AMM_WW(
833
+ my $rp="%rdi"; # BN_ULONG *rp,
834
+ my $ap="%rsi"; # const BN_ULONG *ap,
835
+ my $bp="%rdx"; # const BN_ULONG *bp,
836
+ my $np="%rcx"; # const BN_ULONG *np,
837
+ my $n0="%r8d"; # unsigned int n0);
838
+
839
+ # The registers that hold the accumulated redundant result
840
+ # The AMM works on 1024 bit operands, and redundant word size is 29
841
+ # Therefore: ceil(1024/29)/4 = 9
842
+ my $ACC0="%ymm0";
843
+ my $ACC1="%ymm1";
844
+ my $ACC2="%ymm2";
845
+ my $ACC3="%ymm3";
846
+ my $ACC4="%ymm4";
847
+ my $ACC5="%ymm5";
848
+ my $ACC6="%ymm6";
849
+ my $ACC7="%ymm7";
850
+ my $ACC8="%ymm8";
851
+ my $ACC9="%ymm9";
852
+
853
+ # Registers that hold the broadcasted words of multiplier, currently used
854
+ my $Bi="%ymm10";
855
+ my $Yi="%ymm11";
856
+
857
+ # Helper registers
858
+ my $TEMP0=$ACC0;
859
+ my $TEMP1="%ymm12";
860
+ my $TEMP2="%ymm13";
861
+ my $ZERO="%ymm14";
862
+ my $AND_MASK="%ymm15";
863
+
864
+ # alu registers that hold the first words of the ACC
865
+ my $r0="%r9";
866
+ my $r1="%r10";
867
+ my $r2="%r11";
868
+ my $r3="%r12";
869
+
870
+ my $i="%r14d";
871
+ my $tmp="%r15";
872
+
873
+ $bp="%r13"; # reassigned argument
874
+
875
+ $code.=<<___;
876
+ .globl rsaz_1024_mul_avx2
877
+ .type rsaz_1024_mul_avx2,\@function,5
878
+ .align 64
879
+ rsaz_1024_mul_avx2:
880
+ lea (%rsp), %rax
881
+ push %rbx
882
+ push %rbp
883
+ push %r12
884
+ push %r13
885
+ push %r14
886
+ push %r15
887
+ ___
888
+ $code.=<<___ if ($win64);
889
+ vzeroupper
890
+ lea -0xa8(%rsp),%rsp
891
+ vmovaps %xmm6,-0xd8(%rax)
892
+ vmovaps %xmm7,-0xc8(%rax)
893
+ vmovaps %xmm8,-0xb8(%rax)
894
+ vmovaps %xmm9,-0xa8(%rax)
895
+ vmovaps %xmm10,-0x98(%rax)
896
+ vmovaps %xmm11,-0x88(%rax)
897
+ vmovaps %xmm12,-0x78(%rax)
898
+ vmovaps %xmm13,-0x68(%rax)
899
+ vmovaps %xmm14,-0x58(%rax)
900
+ vmovaps %xmm15,-0x48(%rax)
901
+ .Lmul_1024_body:
902
+ ___
903
+ $code.=<<___;
904
+ mov %rax,%rbp
905
+ vzeroall
906
+ mov %rdx, $bp # reassigned argument
907
+ sub \$64,%rsp
908
+
909
+ # unaligned 256-bit load that crosses page boundary can
910
+ # cause severe performance degradation here, so if $ap does
911
+ # cross page boundary, swap it with $bp [meaning that caller
912
+ # is advised to lay down $ap and $bp next to each other, so
913
+ # that only one can cross page boundary].
914
+ .byte 0x67,0x67
915
+ mov $ap, $tmp
916
+ and \$4095, $tmp
917
+ add \$32*10, $tmp
918
+ shr \$12, $tmp
919
+ mov $ap, $tmp
920
+ cmovnz $bp, $ap
921
+ cmovnz $tmp, $bp
922
+
923
+ mov $np, $tmp
924
+ sub \$-128,$ap # size optimization
925
+ sub \$-128,$np
926
+ sub \$-128,$rp
927
+
928
+ and \$4095, $tmp # see if $np crosses page
929
+ add \$32*10, $tmp
930
+ .byte 0x67,0x67
931
+ shr \$12, $tmp
932
+ jz .Lmul_1024_no_n_copy
933
+
934
+ # unaligned 256-bit load that crosses page boundary can
935
+ # cause severe performance degradation here, so if $np does
936
+ # cross page boundary, copy it to stack and make sure stack
937
+ # frame doesn't...
938
+ sub \$32*10,%rsp
939
+ vmovdqu 32*0-128($np), $ACC0
940
+ and \$-512, %rsp
941
+ vmovdqu 32*1-128($np), $ACC1
942
+ vmovdqu 32*2-128($np), $ACC2
943
+ vmovdqu 32*3-128($np), $ACC3
944
+ vmovdqu 32*4-128($np), $ACC4
945
+ vmovdqu 32*5-128($np), $ACC5
946
+ vmovdqu 32*6-128($np), $ACC6
947
+ vmovdqu 32*7-128($np), $ACC7
948
+ vmovdqu 32*8-128($np), $ACC8
949
+ lea 64+128(%rsp),$np
950
+ vmovdqu $ACC0, 32*0-128($np)
951
+ vpxor $ACC0, $ACC0, $ACC0
952
+ vmovdqu $ACC1, 32*1-128($np)
953
+ vpxor $ACC1, $ACC1, $ACC1
954
+ vmovdqu $ACC2, 32*2-128($np)
955
+ vpxor $ACC2, $ACC2, $ACC2
956
+ vmovdqu $ACC3, 32*3-128($np)
957
+ vpxor $ACC3, $ACC3, $ACC3
958
+ vmovdqu $ACC4, 32*4-128($np)
959
+ vpxor $ACC4, $ACC4, $ACC4
960
+ vmovdqu $ACC5, 32*5-128($np)
961
+ vpxor $ACC5, $ACC5, $ACC5
962
+ vmovdqu $ACC6, 32*6-128($np)
963
+ vpxor $ACC6, $ACC6, $ACC6
964
+ vmovdqu $ACC7, 32*7-128($np)
965
+ vpxor $ACC7, $ACC7, $ACC7
966
+ vmovdqu $ACC8, 32*8-128($np)
967
+ vmovdqa $ACC0, $ACC8
968
+ vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall
969
+ .Lmul_1024_no_n_copy:
970
+ and \$-64,%rsp
971
+
972
+ mov ($bp), %rbx
973
+ vpbroadcastq ($bp), $Bi
974
+ vmovdqu $ACC0, (%rsp) # clear top of stack
975
+ xor $r0, $r0
976
+ .byte 0x67
977
+ xor $r1, $r1
978
+ xor $r2, $r2
979
+ xor $r3, $r3
980
+
981
+ vmovdqu .Land_mask(%rip), $AND_MASK
982
+ mov \$9, $i
983
+ vmovdqu $ACC9, 32*9-128($rp) # $ACC9 is zero after vzeroall
984
+ jmp .Loop_mul_1024
985
+
986
+ .align 32
987
+ .Loop_mul_1024:
988
+ vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*)
989
+ mov %rbx, %rax
990
+ imulq -128($ap), %rax
991
+ add $r0, %rax
992
+ mov %rbx, $r1
993
+ imulq 8-128($ap), $r1
994
+ add 8(%rsp), $r1
995
+
996
+ mov %rax, $r0
997
+ imull $n0, %eax
998
+ and \$0x1fffffff, %eax
999
+
1000
+ mov %rbx, $r2
1001
+ imulq 16-128($ap), $r2
1002
+ add 16(%rsp), $r2
1003
+
1004
+ mov %rbx, $r3
1005
+ imulq 24-128($ap), $r3
1006
+ add 24(%rsp), $r3
1007
+ vpmuludq 32*1-128($ap),$Bi,$TEMP0
1008
+ vmovd %eax, $Yi
1009
+ vpaddq $TEMP0,$ACC1,$ACC1
1010
+ vpmuludq 32*2-128($ap),$Bi,$TEMP1
1011
+ vpbroadcastq $Yi, $Yi
1012
+ vpaddq $TEMP1,$ACC2,$ACC2
1013
+ vpmuludq 32*3-128($ap),$Bi,$TEMP2
1014
+ vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3
1015
+ vpaddq $TEMP2,$ACC3,$ACC3
1016
+ vpmuludq 32*4-128($ap),$Bi,$TEMP0
1017
+ vpaddq $TEMP0,$ACC4,$ACC4
1018
+ vpmuludq 32*5-128($ap),$Bi,$TEMP1
1019
+ vpaddq $TEMP1,$ACC5,$ACC5
1020
+ vpmuludq 32*6-128($ap),$Bi,$TEMP2
1021
+ vpaddq $TEMP2,$ACC6,$ACC6
1022
+ vpmuludq 32*7-128($ap),$Bi,$TEMP0
1023
+ vpermq \$0x93, $ACC9, $ACC9 # correct $ACC3
1024
+ vpaddq $TEMP0,$ACC7,$ACC7
1025
+ vpmuludq 32*8-128($ap),$Bi,$TEMP1
1026
+ vpbroadcastq 8($bp), $Bi
1027
+ vpaddq $TEMP1,$ACC8,$ACC8
1028
+
1029
+ mov %rax,%rdx
1030
+ imulq -128($np),%rax
1031
+ add %rax,$r0
1032
+ mov %rdx,%rax
1033
+ imulq 8-128($np),%rax
1034
+ add %rax,$r1
1035
+ mov %rdx,%rax
1036
+ imulq 16-128($np),%rax
1037
+ add %rax,$r2
1038
+ shr \$29, $r0
1039
+ imulq 24-128($np),%rdx
1040
+ add %rdx,$r3
1041
+ add $r0, $r1
1042
+
1043
+ vpmuludq 32*1-128($np),$Yi,$TEMP2
1044
+ vmovq $Bi, %rbx
1045
+ vpaddq $TEMP2,$ACC1,$ACC1
1046
+ vpmuludq 32*2-128($np),$Yi,$TEMP0
1047
+ vpaddq $TEMP0,$ACC2,$ACC2
1048
+ vpmuludq 32*3-128($np),$Yi,$TEMP1
1049
+ vpaddq $TEMP1,$ACC3,$ACC3
1050
+ vpmuludq 32*4-128($np),$Yi,$TEMP2
1051
+ vpaddq $TEMP2,$ACC4,$ACC4
1052
+ vpmuludq 32*5-128($np),$Yi,$TEMP0
1053
+ vpaddq $TEMP0,$ACC5,$ACC5
1054
+ vpmuludq 32*6-128($np),$Yi,$TEMP1
1055
+ vpaddq $TEMP1,$ACC6,$ACC6
1056
+ vpmuludq 32*7-128($np),$Yi,$TEMP2
1057
+ vpblendd \$3, $ZERO, $ACC9, $ACC9 # correct $ACC3
1058
+ vpaddq $TEMP2,$ACC7,$ACC7
1059
+ vpmuludq 32*8-128($np),$Yi,$TEMP0
1060
+ vpaddq $ACC9, $ACC3, $ACC3 # correct $ACC3
1061
+ vpaddq $TEMP0,$ACC8,$ACC8
1062
+
1063
+ mov %rbx, %rax
1064
+ imulq -128($ap),%rax
1065
+ add %rax,$r1
1066
+ vmovdqu -8+32*1-128($ap),$TEMP1
1067
+ mov %rbx, %rax
1068
+ imulq 8-128($ap),%rax
1069
+ add %rax,$r2
1070
+ vmovdqu -8+32*2-128($ap),$TEMP2
1071
+
1072
+ mov $r1, %rax
1073
+ imull $n0, %eax
1074
+ and \$0x1fffffff, %eax
1075
+
1076
+ imulq 16-128($ap),%rbx
1077
+ add %rbx,$r3
1078
+ vpmuludq $Bi,$TEMP1,$TEMP1
1079
+ vmovd %eax, $Yi
1080
+ vmovdqu -8+32*3-128($ap),$TEMP0
1081
+ vpaddq $TEMP1,$ACC1,$ACC1
1082
+ vpmuludq $Bi,$TEMP2,$TEMP2
1083
+ vpbroadcastq $Yi, $Yi
1084
+ vmovdqu -8+32*4-128($ap),$TEMP1
1085
+ vpaddq $TEMP2,$ACC2,$ACC2
1086
+ vpmuludq $Bi,$TEMP0,$TEMP0
1087
+ vmovdqu -8+32*5-128($ap),$TEMP2
1088
+ vpaddq $TEMP0,$ACC3,$ACC3
1089
+ vpmuludq $Bi,$TEMP1,$TEMP1
1090
+ vmovdqu -8+32*6-128($ap),$TEMP0
1091
+ vpaddq $TEMP1,$ACC4,$ACC4
1092
+ vpmuludq $Bi,$TEMP2,$TEMP2
1093
+ vmovdqu -8+32*7-128($ap),$TEMP1
1094
+ vpaddq $TEMP2,$ACC5,$ACC5
1095
+ vpmuludq $Bi,$TEMP0,$TEMP0
1096
+ vmovdqu -8+32*8-128($ap),$TEMP2
1097
+ vpaddq $TEMP0,$ACC6,$ACC6
1098
+ vpmuludq $Bi,$TEMP1,$TEMP1
1099
+ vmovdqu -8+32*9-128($ap),$ACC9
1100
+ vpaddq $TEMP1,$ACC7,$ACC7
1101
+ vpmuludq $Bi,$TEMP2,$TEMP2
1102
+ vpaddq $TEMP2,$ACC8,$ACC8
1103
+ vpmuludq $Bi,$ACC9,$ACC9
1104
+ vpbroadcastq 16($bp), $Bi
1105
+
1106
+ mov %rax,%rdx
1107
+ imulq -128($np),%rax
1108
+ add %rax,$r1
1109
+ vmovdqu -8+32*1-128($np),$TEMP0
1110
+ mov %rdx,%rax
1111
+ imulq 8-128($np),%rax
1112
+ add %rax,$r2
1113
+ vmovdqu -8+32*2-128($np),$TEMP1
1114
+ shr \$29, $r1
1115
+ imulq 16-128($np),%rdx
1116
+ add %rdx,$r3
1117
+ add $r1, $r2
1118
+
1119
+ vpmuludq $Yi,$TEMP0,$TEMP0
1120
+ vmovq $Bi, %rbx
1121
+ vmovdqu -8+32*3-128($np),$TEMP2
1122
+ vpaddq $TEMP0,$ACC1,$ACC1
1123
+ vpmuludq $Yi,$TEMP1,$TEMP1
1124
+ vmovdqu -8+32*4-128($np),$TEMP0
1125
+ vpaddq $TEMP1,$ACC2,$ACC2
1126
+ vpmuludq $Yi,$TEMP2,$TEMP2
1127
+ vmovdqu -8+32*5-128($np),$TEMP1
1128
+ vpaddq $TEMP2,$ACC3,$ACC3
1129
+ vpmuludq $Yi,$TEMP0,$TEMP0
1130
+ vmovdqu -8+32*6-128($np),$TEMP2
1131
+ vpaddq $TEMP0,$ACC4,$ACC4
1132
+ vpmuludq $Yi,$TEMP1,$TEMP1
1133
+ vmovdqu -8+32*7-128($np),$TEMP0
1134
+ vpaddq $TEMP1,$ACC5,$ACC5
1135
+ vpmuludq $Yi,$TEMP2,$TEMP2
1136
+ vmovdqu -8+32*8-128($np),$TEMP1
1137
+ vpaddq $TEMP2,$ACC6,$ACC6
1138
+ vpmuludq $Yi,$TEMP0,$TEMP0
1139
+ vmovdqu -8+32*9-128($np),$TEMP2
1140
+ vpaddq $TEMP0,$ACC7,$ACC7
1141
+ vpmuludq $Yi,$TEMP1,$TEMP1
1142
+ vpaddq $TEMP1,$ACC8,$ACC8
1143
+ vpmuludq $Yi,$TEMP2,$TEMP2
1144
+ vpaddq $TEMP2,$ACC9,$ACC9
1145
+
1146
+ vmovdqu -16+32*1-128($ap),$TEMP0
1147
+ mov %rbx,%rax
1148
+ imulq -128($ap),%rax
1149
+ add $r2,%rax
1150
+
1151
+ vmovdqu -16+32*2-128($ap),$TEMP1
1152
+ mov %rax,$r2
1153
+ imull $n0, %eax
1154
+ and \$0x1fffffff, %eax
1155
+
1156
+ imulq 8-128($ap),%rbx
1157
+ add %rbx,$r3
1158
+ vpmuludq $Bi,$TEMP0,$TEMP0
1159
+ vmovd %eax, $Yi
1160
+ vmovdqu -16+32*3-128($ap),$TEMP2
1161
+ vpaddq $TEMP0,$ACC1,$ACC1
1162
+ vpmuludq $Bi,$TEMP1,$TEMP1
1163
+ vpbroadcastq $Yi, $Yi
1164
+ vmovdqu -16+32*4-128($ap),$TEMP0
1165
+ vpaddq $TEMP1,$ACC2,$ACC2
1166
+ vpmuludq $Bi,$TEMP2,$TEMP2
1167
+ vmovdqu -16+32*5-128($ap),$TEMP1
1168
+ vpaddq $TEMP2,$ACC3,$ACC3
1169
+ vpmuludq $Bi,$TEMP0,$TEMP0
1170
+ vmovdqu -16+32*6-128($ap),$TEMP2
1171
+ vpaddq $TEMP0,$ACC4,$ACC4
1172
+ vpmuludq $Bi,$TEMP1,$TEMP1
1173
+ vmovdqu -16+32*7-128($ap),$TEMP0
1174
+ vpaddq $TEMP1,$ACC5,$ACC5
1175
+ vpmuludq $Bi,$TEMP2,$TEMP2
1176
+ vmovdqu -16+32*8-128($ap),$TEMP1
1177
+ vpaddq $TEMP2,$ACC6,$ACC6
1178
+ vpmuludq $Bi,$TEMP0,$TEMP0
1179
+ vmovdqu -16+32*9-128($ap),$TEMP2
1180
+ vpaddq $TEMP0,$ACC7,$ACC7
1181
+ vpmuludq $Bi,$TEMP1,$TEMP1
1182
+ vpaddq $TEMP1,$ACC8,$ACC8
1183
+ vpmuludq $Bi,$TEMP2,$TEMP2
1184
+ vpbroadcastq 24($bp), $Bi
1185
+ vpaddq $TEMP2,$ACC9,$ACC9
1186
+
1187
+ vmovdqu -16+32*1-128($np),$TEMP0
1188
+ mov %rax,%rdx
1189
+ imulq -128($np),%rax
1190
+ add %rax,$r2
1191
+ vmovdqu -16+32*2-128($np),$TEMP1
1192
+ imulq 8-128($np),%rdx
1193
+ add %rdx,$r3
1194
+ shr \$29, $r2
1195
+
1196
+ vpmuludq $Yi,$TEMP0,$TEMP0
1197
+ vmovq $Bi, %rbx
1198
+ vmovdqu -16+32*3-128($np),$TEMP2
1199
+ vpaddq $TEMP0,$ACC1,$ACC1
1200
+ vpmuludq $Yi,$TEMP1,$TEMP1
1201
+ vmovdqu -16+32*4-128($np),$TEMP0
1202
+ vpaddq $TEMP1,$ACC2,$ACC2
1203
+ vpmuludq $Yi,$TEMP2,$TEMP2
1204
+ vmovdqu -16+32*5-128($np),$TEMP1
1205
+ vpaddq $TEMP2,$ACC3,$ACC3
1206
+ vpmuludq $Yi,$TEMP0,$TEMP0
1207
+ vmovdqu -16+32*6-128($np),$TEMP2
1208
+ vpaddq $TEMP0,$ACC4,$ACC4
1209
+ vpmuludq $Yi,$TEMP1,$TEMP1
1210
+ vmovdqu -16+32*7-128($np),$TEMP0
1211
+ vpaddq $TEMP1,$ACC5,$ACC5
1212
+ vpmuludq $Yi,$TEMP2,$TEMP2
1213
+ vmovdqu -16+32*8-128($np),$TEMP1
1214
+ vpaddq $TEMP2,$ACC6,$ACC6
1215
+ vpmuludq $Yi,$TEMP0,$TEMP0
1216
+ vmovdqu -16+32*9-128($np),$TEMP2
1217
+ vpaddq $TEMP0,$ACC7,$ACC7
1218
+ vpmuludq $Yi,$TEMP1,$TEMP1
1219
+ vmovdqu -24+32*1-128($ap),$TEMP0
1220
+ vpaddq $TEMP1,$ACC8,$ACC8
1221
+ vpmuludq $Yi,$TEMP2,$TEMP2
1222
+ vmovdqu -24+32*2-128($ap),$TEMP1
1223
+ vpaddq $TEMP2,$ACC9,$ACC9
1224
+
1225
+ add $r2, $r3
1226
+ imulq -128($ap),%rbx
1227
+ add %rbx,$r3
1228
+
1229
+ mov $r3, %rax
1230
+ imull $n0, %eax
1231
+ and \$0x1fffffff, %eax
1232
+
1233
+ vpmuludq $Bi,$TEMP0,$TEMP0
1234
+ vmovd %eax, $Yi
1235
+ vmovdqu -24+32*3-128($ap),$TEMP2
1236
+ vpaddq $TEMP0,$ACC1,$ACC1
1237
+ vpmuludq $Bi,$TEMP1,$TEMP1
1238
+ vpbroadcastq $Yi, $Yi
1239
+ vmovdqu -24+32*4-128($ap),$TEMP0
1240
+ vpaddq $TEMP1,$ACC2,$ACC2
1241
+ vpmuludq $Bi,$TEMP2,$TEMP2
1242
+ vmovdqu -24+32*5-128($ap),$TEMP1
1243
+ vpaddq $TEMP2,$ACC3,$ACC3
1244
+ vpmuludq $Bi,$TEMP0,$TEMP0
1245
+ vmovdqu -24+32*6-128($ap),$TEMP2
1246
+ vpaddq $TEMP0,$ACC4,$ACC4
1247
+ vpmuludq $Bi,$TEMP1,$TEMP1
1248
+ vmovdqu -24+32*7-128($ap),$TEMP0
1249
+ vpaddq $TEMP1,$ACC5,$ACC5
1250
+ vpmuludq $Bi,$TEMP2,$TEMP2
1251
+ vmovdqu -24+32*8-128($ap),$TEMP1
1252
+ vpaddq $TEMP2,$ACC6,$ACC6
1253
+ vpmuludq $Bi,$TEMP0,$TEMP0
1254
+ vmovdqu -24+32*9-128($ap),$TEMP2
1255
+ vpaddq $TEMP0,$ACC7,$ACC7
1256
+ vpmuludq $Bi,$TEMP1,$TEMP1
1257
+ vpaddq $TEMP1,$ACC8,$ACC8
1258
+ vpmuludq $Bi,$TEMP2,$TEMP2
1259
+ vpbroadcastq 32($bp), $Bi
1260
+ vpaddq $TEMP2,$ACC9,$ACC9
1261
+ add \$32, $bp # $bp++
1262
+
1263
+ vmovdqu -24+32*1-128($np),$TEMP0
1264
+ imulq -128($np),%rax
1265
+ add %rax,$r3
1266
+ shr \$29, $r3
1267
+
1268
+ vmovdqu -24+32*2-128($np),$TEMP1
1269
+ vpmuludq $Yi,$TEMP0,$TEMP0
1270
+ vmovq $Bi, %rbx
1271
+ vmovdqu -24+32*3-128($np),$TEMP2
1272
+ vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0
1273
+ vpmuludq $Yi,$TEMP1,$TEMP1
1274
+ vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
1275
+ vpaddq $TEMP1,$ACC2,$ACC1
1276
+ vmovdqu -24+32*4-128($np),$TEMP0
1277
+ vpmuludq $Yi,$TEMP2,$TEMP2
1278
+ vmovdqu -24+32*5-128($np),$TEMP1
1279
+ vpaddq $TEMP2,$ACC3,$ACC2
1280
+ vpmuludq $Yi,$TEMP0,$TEMP0
1281
+ vmovdqu -24+32*6-128($np),$TEMP2
1282
+ vpaddq $TEMP0,$ACC4,$ACC3
1283
+ vpmuludq $Yi,$TEMP1,$TEMP1
1284
+ vmovdqu -24+32*7-128($np),$TEMP0
1285
+ vpaddq $TEMP1,$ACC5,$ACC4
1286
+ vpmuludq $Yi,$TEMP2,$TEMP2
1287
+ vmovdqu -24+32*8-128($np),$TEMP1
1288
+ vpaddq $TEMP2,$ACC6,$ACC5
1289
+ vpmuludq $Yi,$TEMP0,$TEMP0
1290
+ vmovdqu -24+32*9-128($np),$TEMP2
1291
+ mov $r3, $r0
1292
+ vpaddq $TEMP0,$ACC7,$ACC6
1293
+ vpmuludq $Yi,$TEMP1,$TEMP1
1294
+ add (%rsp), $r0
1295
+ vpaddq $TEMP1,$ACC8,$ACC7
1296
+ vpmuludq $Yi,$TEMP2,$TEMP2
1297
+ vmovq $r3, $TEMP1
1298
+ vpaddq $TEMP2,$ACC9,$ACC8
1299
+
1300
+ dec $i
1301
+ jnz .Loop_mul_1024
1302
+ ___
1303
+
1304
+ # (*) The original implementation corrected ACC1-ACC3 for overflow
+ # after 7 loop runs, i.e. after 28 iterations or 56 additions. But
+ # since we under-utilize resources, it is possible to correct in
+ # every iteration with only marginal performance loss. And because
+ # the correction then runs every iteration, fewer digits need to
+ # be corrected, which avoids the performance penalty entirely.
+ # Also note that only three digits out of four are corrected;
+ # this works because the most significant digit is subject to
+ # fewer additions.
1312
+
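To make the correction strategy in the comment above concrete: each redundant digit holds 29 bits plus accumulated slack, and the vpsrlq/vpand/vpaddq sequences push the overflow of one digit into the next, four digits per instruction. A minimal scalar C sketch of that normalization (not part of ring; names are illustrative):

    #include <stdint.h>

    /* Propagate carries through an array of redundant 29-bit digits:
     * keep the low 29 bits of each digit and push the overflow into
     * the next one -- the scalar analogue of vpsrlq/vpand/vpaddq. */
    static void normalize_digits(uint64_t *d, int n) {
        const uint64_t mask = 0x1fffffff;   /* 2^29 - 1, cf. .Land_mask */
        for (int i = 0; i + 1 < n; i++) {
            d[i + 1] += d[i] >> 29;
            d[i]     &= mask;
        }
    }

As the comment notes, the assembly only corrects three of the four digits it touches per iteration; the most significant digit can tolerate the extra slack until the final carry pass after the loop.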
1313
+ $TEMP0 = $ACC9;
1314
+ $TEMP3 = $Bi;
1315
+ $TEMP4 = $Yi;
1316
+ $code.=<<___;
1317
+ vpermq \$0, $AND_MASK, $AND_MASK
1318
+ vpaddq (%rsp), $TEMP1, $ACC0
1319
+
1320
+ vpsrlq \$29, $ACC0, $TEMP1
1321
+ vpand $AND_MASK, $ACC0, $ACC0
1322
+ vpsrlq \$29, $ACC1, $TEMP2
1323
+ vpand $AND_MASK, $ACC1, $ACC1
1324
+ vpsrlq \$29, $ACC2, $TEMP3
1325
+ vpermq \$0x93, $TEMP1, $TEMP1
1326
+ vpand $AND_MASK, $ACC2, $ACC2
1327
+ vpsrlq \$29, $ACC3, $TEMP4
1328
+ vpermq \$0x93, $TEMP2, $TEMP2
1329
+ vpand $AND_MASK, $ACC3, $ACC3
1330
+
1331
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
1332
+ vpermq \$0x93, $TEMP3, $TEMP3
1333
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
1334
+ vpermq \$0x93, $TEMP4, $TEMP4
1335
+ vpaddq $TEMP0, $ACC0, $ACC0
1336
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
1337
+ vpaddq $TEMP1, $ACC1, $ACC1
1338
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
1339
+ vpaddq $TEMP2, $ACC2, $ACC2
1340
+ vpblendd \$3, $TEMP4, $ZERO, $TEMP4
1341
+ vpaddq $TEMP3, $ACC3, $ACC3
1342
+ vpaddq $TEMP4, $ACC4, $ACC4
1343
+
1344
+ vpsrlq \$29, $ACC0, $TEMP1
1345
+ vpand $AND_MASK, $ACC0, $ACC0
1346
+ vpsrlq \$29, $ACC1, $TEMP2
1347
+ vpand $AND_MASK, $ACC1, $ACC1
1348
+ vpsrlq \$29, $ACC2, $TEMP3
1349
+ vpermq \$0x93, $TEMP1, $TEMP1
1350
+ vpand $AND_MASK, $ACC2, $ACC2
1351
+ vpsrlq \$29, $ACC3, $TEMP4
1352
+ vpermq \$0x93, $TEMP2, $TEMP2
1353
+ vpand $AND_MASK, $ACC3, $ACC3
1354
+ vpermq \$0x93, $TEMP3, $TEMP3
1355
+
1356
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
1357
+ vpermq \$0x93, $TEMP4, $TEMP4
1358
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
1359
+ vpaddq $TEMP0, $ACC0, $ACC0
1360
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
1361
+ vpaddq $TEMP1, $ACC1, $ACC1
1362
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
1363
+ vpaddq $TEMP2, $ACC2, $ACC2
1364
+ vpblendd \$3, $TEMP4, $ZERO, $TEMP4
1365
+ vpaddq $TEMP3, $ACC3, $ACC3
1366
+ vpaddq $TEMP4, $ACC4, $ACC4
1367
+
1368
+ vmovdqu $ACC0, 0-128($rp)
1369
+ vmovdqu $ACC1, 32-128($rp)
1370
+ vmovdqu $ACC2, 64-128($rp)
1371
+ vmovdqu $ACC3, 96-128($rp)
1372
+ ___
1373
+
1374
+ $TEMP5=$ACC0;
1375
+ $code.=<<___;
1376
+ vpsrlq \$29, $ACC4, $TEMP1
1377
+ vpand $AND_MASK, $ACC4, $ACC4
1378
+ vpsrlq \$29, $ACC5, $TEMP2
1379
+ vpand $AND_MASK, $ACC5, $ACC5
1380
+ vpsrlq \$29, $ACC6, $TEMP3
1381
+ vpermq \$0x93, $TEMP1, $TEMP1
1382
+ vpand $AND_MASK, $ACC6, $ACC6
1383
+ vpsrlq \$29, $ACC7, $TEMP4
1384
+ vpermq \$0x93, $TEMP2, $TEMP2
1385
+ vpand $AND_MASK, $ACC7, $ACC7
1386
+ vpsrlq \$29, $ACC8, $TEMP5
1387
+ vpermq \$0x93, $TEMP3, $TEMP3
1388
+ vpand $AND_MASK, $ACC8, $ACC8
1389
+ vpermq \$0x93, $TEMP4, $TEMP4
1390
+
1391
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
1392
+ vpermq \$0x93, $TEMP5, $TEMP5
1393
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
1394
+ vpaddq $TEMP0, $ACC4, $ACC4
1395
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
1396
+ vpaddq $TEMP1, $ACC5, $ACC5
1397
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
1398
+ vpaddq $TEMP2, $ACC6, $ACC6
1399
+ vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
1400
+ vpaddq $TEMP3, $ACC7, $ACC7
1401
+ vpaddq $TEMP4, $ACC8, $ACC8
1402
+
1403
+ vpsrlq \$29, $ACC4, $TEMP1
1404
+ vpand $AND_MASK, $ACC4, $ACC4
1405
+ vpsrlq \$29, $ACC5, $TEMP2
1406
+ vpand $AND_MASK, $ACC5, $ACC5
1407
+ vpsrlq \$29, $ACC6, $TEMP3
1408
+ vpermq \$0x93, $TEMP1, $TEMP1
1409
+ vpand $AND_MASK, $ACC6, $ACC6
1410
+ vpsrlq \$29, $ACC7, $TEMP4
1411
+ vpermq \$0x93, $TEMP2, $TEMP2
1412
+ vpand $AND_MASK, $ACC7, $ACC7
1413
+ vpsrlq \$29, $ACC8, $TEMP5
1414
+ vpermq \$0x93, $TEMP3, $TEMP3
1415
+ vpand $AND_MASK, $ACC8, $ACC8
1416
+ vpermq \$0x93, $TEMP4, $TEMP4
1417
+
1418
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
1419
+ vpermq \$0x93, $TEMP5, $TEMP5
1420
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
1421
+ vpaddq $TEMP0, $ACC4, $ACC4
1422
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
1423
+ vpaddq $TEMP1, $ACC5, $ACC5
1424
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
1425
+ vpaddq $TEMP2, $ACC6, $ACC6
1426
+ vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
1427
+ vpaddq $TEMP3, $ACC7, $ACC7
1428
+ vpaddq $TEMP4, $ACC8, $ACC8
1429
+
1430
+ vmovdqu $ACC4, 128-128($rp)
1431
+ vmovdqu $ACC5, 160-128($rp)
1432
+ vmovdqu $ACC6, 192-128($rp)
1433
+ vmovdqu $ACC7, 224-128($rp)
1434
+ vmovdqu $ACC8, 256-128($rp)
1435
+ vzeroupper
1436
+
1437
+ mov %rbp, %rax
1438
+ ___
1439
+ $code.=<<___ if ($win64);
1440
+ movaps -0xd8(%rax),%xmm6
1441
+ movaps -0xc8(%rax),%xmm7
1442
+ movaps -0xb8(%rax),%xmm8
1443
+ movaps -0xa8(%rax),%xmm9
1444
+ movaps -0x98(%rax),%xmm10
1445
+ movaps -0x88(%rax),%xmm11
1446
+ movaps -0x78(%rax),%xmm12
1447
+ movaps -0x68(%rax),%xmm13
1448
+ movaps -0x58(%rax),%xmm14
1449
+ movaps -0x48(%rax),%xmm15
1450
+ ___
1451
+ $code.=<<___;
1452
+ mov -48(%rax),%r15
1453
+ mov -40(%rax),%r14
1454
+ mov -32(%rax),%r13
1455
+ mov -24(%rax),%r12
1456
+ mov -16(%rax),%rbp
1457
+ mov -8(%rax),%rbx
1458
+ lea (%rax),%rsp # restore %rsp
1459
+ .Lmul_1024_epilogue:
1460
+ ret
1461
+ .size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
1462
+ ___
1463
+ }
1464
+ {
1465
+ my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi");
1466
+ my @T = map("%r$_",(8..11));
1467
+
1468
+ $code.=<<___;
1469
+ .globl rsaz_1024_red2norm_avx2
1470
+ .type rsaz_1024_red2norm_avx2,\@abi-omnipotent
1471
+ .align 32
1472
+ rsaz_1024_red2norm_avx2:
1473
+ sub \$-128,$inp # size optimization
1474
+ xor %rax,%rax
1475
+ ___
1476
+
1477
+ for ($j=0,$i=0; $i<16; $i++) {
1478
+ my $k=0;
1479
+ while (29*$j<64*($i+1)) { # load data till boundary
1480
+ $code.=" mov `8*$j-128`($inp), @T[0]\n";
1481
+ $j++; $k++; push(@T,shift(@T));
1482
+ }
1483
+ $l=$k;
1484
+ while ($k>1) { # shift all loaded values except the last
1485
+ $code.=" shl \$`29*($j-$k)`,@T[-$k]\n";
1486
+ $k--;
1487
+ }
1488
+ $code.=<<___; # shift last value
1489
+ mov @T[-1], @T[0]
1490
+ shl \$`29*($j-1)`, @T[-1]
1491
+ shr \$`-29*($j-1)`, @T[0]
1492
+ ___
1493
+ while ($l) { # accumulate all values
1494
+ $code.=" add @T[-$l], %rax\n";
1495
+ $l--;
1496
+ }
1497
+ $code.=<<___;
1498
+ adc \$0, @T[0] # consume any carry
1499
+ mov %rax, 8*$i($out)
1500
+ mov @T[0], %rax
1501
+ ___
1502
+ push(@T,shift(@T));
1503
+ }
1504
+ $code.=<<___;
1505
+ ret
1506
+ .size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
1507
+
1508
+ .globl rsaz_1024_norm2red_avx2
1509
+ .type rsaz_1024_norm2red_avx2,\@abi-omnipotent
1510
+ .align 32
1511
+ rsaz_1024_norm2red_avx2:
1512
+ sub \$-128,$out # size optimization
1513
+ mov ($inp),@T[0]
1514
+ mov \$0x1fffffff,%eax
1515
+ ___
1516
+ for ($j=0,$i=0; $i<16; $i++) {
1517
+ $code.=" mov `8*($i+1)`($inp),@T[1]\n" if ($i<15);
1518
+ $code.=" xor @T[1],@T[1]\n" if ($i==15);
1519
+ my $k=1;
1520
+ while (29*($j+1)<64*($i+1)) {
1521
+ $code.=<<___;
1522
+ mov @T[0],@T[-$k]
1523
+ shr \$`29*$j`,@T[-$k]
1524
+ and %rax,@T[-$k] # &0x1fffffff
1525
+ mov @T[-$k],`8*$j-128`($out)
1526
+ ___
1527
+ $j++; $k++;
1528
+ }
1529
+ $code.=<<___;
1530
+ shrd \$`29*$j`,@T[1],@T[0]
1531
+ and %rax,@T[0]
1532
+ mov @T[0],`8*$j-128`($out)
1533
+ ___
1534
+ $j++;
1535
+ push(@T,shift(@T));
1536
+ }
1537
+ $code.=<<___;
1538
+ mov @T[0],`8*$j-128`($out) # zero
1539
+ mov @T[0],`8*($j+1)-128`($out)
1540
+ mov @T[0],`8*($j+2)-128`($out)
1541
+ mov @T[0],`8*($j+3)-128`($out)
1542
+ ret
1543
+ .size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
1544
+ ___
1545
+ }
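rsaz_1024_red2norm_avx2 and rsaz_1024_norm2red_avx2 above convert between the normal base-2^64 form (16 quadwords) and the redundant base-2^29 form the AVX2 code computes with. A rough C sketch of the two conversions, assuming the digits handed to red2norm are already reduced below 2^29 (the assembly also folds any remaining carries with add/adc); the names and the unpadded digit count are illustrative only:

    #include <stdint.h>

    #define NORM_WORDS 16   /* 1024 bits as 64-bit words */
    #define RED_DIGITS 36   /* 1024 bits as 29-bit digits; the assembly
                             * pads the storage with a few zero digits */

    /* Normal -> redundant: digit j holds bits 29*j .. 29*j+28. */
    static void norm2red(uint64_t red[RED_DIGITS], const uint64_t norm[NORM_WORDS]) {
        for (int j = 0; j < RED_DIGITS; j++) {
            int bit = 29 * j, w = bit / 64, off = bit % 64;
            uint64_t v = norm[w] >> off;
            if (off > 35 && w + 1 < NORM_WORDS)   /* digit straddles two words */
                v |= norm[w + 1] << (64 - off);
            red[j] = v & 0x1fffffff;
        }
    }

    /* Redundant -> normal, assuming every digit is already < 2^29. */
    static void red2norm(uint64_t norm[NORM_WORDS], const uint64_t red[RED_DIGITS]) {
        for (int i = 0; i < NORM_WORDS; i++)
            norm[i] = 0;
        for (int j = 0; j < RED_DIGITS; j++) {
            int bit = 29 * j, w = bit / 64, off = bit % 64;
            norm[w] |= red[j] << off;
            if (off > 35 && w + 1 < NORM_WORDS)
                norm[w + 1] |= red[j] >> (64 - off);
        }
    }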
1546
+ {
1547
+ my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
1548
+
1549
+ $code.=<<___;
1550
+ .globl rsaz_1024_scatter5_avx2
1551
+ .type rsaz_1024_scatter5_avx2,\@abi-omnipotent
1552
+ .align 32
1553
+ rsaz_1024_scatter5_avx2:
1554
+ vzeroupper
1555
+ vmovdqu .Lscatter_permd(%rip),%ymm5
1556
+ shl \$4,$power
1557
+ lea ($out,$power),$out
1558
+ mov \$9,%eax
1559
+ jmp .Loop_scatter_1024
1560
+
1561
+ .align 32
1562
+ .Loop_scatter_1024:
1563
+ vmovdqu ($inp),%ymm0
1564
+ lea 32($inp),$inp
1565
+ vpermd %ymm0,%ymm5,%ymm0
1566
+ vmovdqu %xmm0,($out)
1567
+ lea 16*32($out),$out
1568
+ dec %eax
1569
+ jnz .Loop_scatter_1024
1570
+
1571
+ vzeroupper
1572
+ ret
1573
+ .size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
1574
+
1575
+ .globl rsaz_1024_gather5_avx2
1576
+ .type rsaz_1024_gather5_avx2,\@abi-omnipotent
1577
+ .align 32
1578
+ rsaz_1024_gather5_avx2:
1579
+ ___
1580
+ $code.=<<___ if ($win64);
1581
+ lea -0x88(%rsp),%rax
1582
+ vzeroupper
1583
+ .LSEH_begin_rsaz_1024_gather5:
1584
+ # can't trust the assembler to emit a specific encoding :-(
1585
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
1586
+ .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6,-0x20(%rax)
1587
+ .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7,-0x10(%rax)
1588
+ .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8,0(%rax)
1589
+ .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9,0x10(%rax)
1590
+ .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10,0x20(%rax)
1591
+ .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11,0x30(%rax)
1592
+ .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12,0x40(%rax)
1593
+ .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13,0x50(%rax)
1594
+ .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14,0x60(%rax)
1595
+ .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15,0x70(%rax)
1596
+ ___
1597
+ $code.=<<___;
1598
+ lea .Lgather_table(%rip),%r11
1599
+ mov $power,%eax
1600
+ and \$3,$power
1601
+ shr \$2,%eax # cache line number
1602
+ shl \$4,$power # offset within cache line
1603
+
1604
+ vmovdqu -32(%r11),%ymm7 # .Lgather_permd
1605
+ vpbroadcastb 8(%r11,%rax), %xmm8
1606
+ vpbroadcastb 7(%r11,%rax), %xmm9
1607
+ vpbroadcastb 6(%r11,%rax), %xmm10
1608
+ vpbroadcastb 5(%r11,%rax), %xmm11
1609
+ vpbroadcastb 4(%r11,%rax), %xmm12
1610
+ vpbroadcastb 3(%r11,%rax), %xmm13
1611
+ vpbroadcastb 2(%r11,%rax), %xmm14
1612
+ vpbroadcastb 1(%r11,%rax), %xmm15
1613
+
1614
+ lea 64($inp,$power),$inp
1615
+ mov \$64,%r11 # size optimization
1616
+ mov \$9,%eax
1617
+ jmp .Loop_gather_1024
1618
+
1619
+ .align 32
1620
+ .Loop_gather_1024:
1621
+ vpand -64($inp), %xmm8,%xmm0
1622
+ vpand ($inp), %xmm9,%xmm1
1623
+ vpand 64($inp), %xmm10,%xmm2
1624
+ vpand ($inp,%r11,2), %xmm11,%xmm3
1625
+ vpor %xmm0,%xmm1,%xmm1
1626
+ vpand 64($inp,%r11,2), %xmm12,%xmm4
1627
+ vpor %xmm2,%xmm3,%xmm3
1628
+ vpand ($inp,%r11,4), %xmm13,%xmm5
1629
+ vpor %xmm1,%xmm3,%xmm3
1630
+ vpand 64($inp,%r11,4), %xmm14,%xmm6
1631
+ vpor %xmm4,%xmm5,%xmm5
1632
+ vpand -128($inp,%r11,8), %xmm15,%xmm2
1633
+ lea ($inp,%r11,8),$inp
1634
+ vpor %xmm3,%xmm5,%xmm5
1635
+ vpor %xmm2,%xmm6,%xmm6
1636
+ vpor %xmm5,%xmm6,%xmm6
1637
+ vpermd %ymm6,%ymm7,%ymm6
1638
+ vmovdqu %ymm6,($out)
1639
+ lea 32($out),$out
1640
+ dec %eax
1641
+ jnz .Loop_gather_1024
1642
+
1643
+ vpxor %ymm0,%ymm0,%ymm0
1644
+ vmovdqu %ymm0,($out)
1645
+ vzeroupper
1646
+ ___
1647
+ $code.=<<___ if ($win64);
1648
+ movaps (%rsp),%xmm6
1649
+ movaps 0x10(%rsp),%xmm7
1650
+ movaps 0x20(%rsp),%xmm8
1651
+ movaps 0x30(%rsp),%xmm9
1652
+ movaps 0x40(%rsp),%xmm10
1653
+ movaps 0x50(%rsp),%xmm11
1654
+ movaps 0x60(%rsp),%xmm12
1655
+ movaps 0x70(%rsp),%xmm13
1656
+ movaps 0x80(%rsp),%xmm14
1657
+ movaps 0x90(%rsp),%xmm15
1658
+ lea 0xa8(%rsp),%rsp
1659
+ .LSEH_end_rsaz_1024_gather5:
1660
+ ___
1661
+ $code.=<<___;
1662
+ ret
1663
+ .size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
1664
+ ___
1665
+ }
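rsaz_1024_scatter5_avx2 stores the 32 precomputed powers interleaved (16-byte pieces at a 16*32-byte stride), and rsaz_1024_gather5_avx2 reads the selected power back by loading from every position and AND/OR-combining with the byte masks derived from .Lgather_table, so the addresses touched never depend on the secret index. A minimal C sketch of the same constant-time selection idea, with hypothetical names and a plain 2-D layout instead of the interleaved one:

    #include <stdint.h>
    #include <stddef.h>

    /* Select table[power] while reading every entry, so the memory access
     * pattern is independent of the secret index. Entry size is illustrative. */
    static void ct_select(uint64_t out[16], const uint64_t table[32][16],
                          uint32_t power) {
        for (size_t k = 0; k < 16; k++)
            out[k] = 0;
        for (uint32_t i = 0; i < 32; i++) {
            uint32_t diff = i ^ power;
            uint64_t ne   = (uint64_t)((diff | (0u - diff)) >> 31); /* 0 iff i == power */
            uint64_t mask = ne - 1;                                 /* all-ones iff i == power */
            for (size_t k = 0; k < 16; k++)
                out[k] |= table[i][k] & mask;
        }
    }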
1666
+
1667
+ $code.=<<___;
1668
+ .extern OPENSSL_ia32cap_P
1669
+ .globl rsaz_avx2_eligible
1670
+ .type rsaz_avx2_eligible,\@abi-omnipotent
1671
+ .align 32
1672
+ rsaz_avx2_eligible:
1673
+ mov OPENSSL_ia32cap_P+8(%rip),%eax
1674
+ ___
1675
+ $code.=<<___ if ($addx);
1676
+ mov \$`1<<8|1<<19`,%ecx
1677
+ mov \$0,%edx
1678
+ and %eax,%ecx
1679
+ cmp \$`1<<8|1<<19`,%ecx # check for BMI2+AD*X
1680
+ cmove %edx,%eax
1681
+ ___
1682
+ $code.=<<___;
1683
+ and \$`1<<5`,%eax
1684
+ shr \$5,%eax
1685
+ ret
1686
+ .size rsaz_avx2_eligible,.-rsaz_avx2_eligible
1687
+
1688
+ .align 64
1689
+ .Land_mask:
1690
+ .quad 0x1fffffff,0x1fffffff,0x1fffffff,-1
1691
+ .Lscatter_permd:
1692
+ .long 0,2,4,6,7,7,7,7
1693
+ .Lgather_permd:
1694
+ .long 0,7,1,7,2,7,3,7
1695
+ .Lgather_table:
1696
+ .byte 0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0
1697
+ .align 64
1698
+ ___
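rsaz_avx2_eligible simply reports bit 5 (AVX2) of the third OPENSSL_ia32cap_P word, and, when the toolchain supports ADX ($addx), it deliberately returns 0 on CPUs that have both BMI2 and ADX so that the MULX/ADX Montgomery code is preferred. A hypothetical C analogue (not ring's API) using the GCC/Clang builtin instead of the cached capability vector:

    #include <stdbool.h>

    /* Hypothetical analogue of rsaz_avx2_eligible; the assembly reads the
     * cached OPENSSL_ia32cap_P bits rather than probing the CPU here. */
    static bool rsaz_avx2_eligible_c(void) {
        /* The assembly additionally returns 0 when both BMI2 and ADX are
         * present, preferring the MULX/ADX multiplication path instead. */
        return __builtin_cpu_supports("avx2");
    }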
1699
+
1700
+ if ($win64) {
1701
+ $rec="%rcx";
1702
+ $frame="%rdx";
1703
+ $context="%r8";
1704
+ $disp="%r9";
1705
+
1706
+ $code.=<<___
1707
+ .extern __imp_RtlVirtualUnwind
1708
+ .type rsaz_se_handler,\@abi-omnipotent
1709
+ .align 16
1710
+ rsaz_se_handler:
1711
+ push %rsi
1712
+ push %rdi
1713
+ push %rbx
1714
+ push %rbp
1715
+ push %r12
1716
+ push %r13
1717
+ push %r14
1718
+ push %r15
1719
+ pushfq
1720
+ sub \$64,%rsp
1721
+
1722
+ mov 120($context),%rax # pull context->Rax
1723
+ mov 248($context),%rbx # pull context->Rip
1724
+
1725
+ mov 8($disp),%rsi # disp->ImageBase
1726
+ mov 56($disp),%r11 # disp->HandlerData
1727
+
1728
+ mov 0(%r11),%r10d # HandlerData[0]
1729
+ lea (%rsi,%r10),%r10 # prologue label
1730
+ cmp %r10,%rbx # context->Rip<prologue label
1731
+ jb .Lcommon_seh_tail
1732
+
1733
+ mov 152($context),%rax # pull context->Rsp
1734
+
1735
+ mov 4(%r11),%r10d # HandlerData[1]
1736
+ lea (%rsi,%r10),%r10 # epilogue label
1737
+ cmp %r10,%rbx # context->Rip>=epilogue label
1738
+ jae .Lcommon_seh_tail
1739
+
1740
+ mov 160($context),%rax # pull context->Rbp
1741
+
1742
+ mov -48(%rax),%r15
1743
+ mov -40(%rax),%r14
1744
+ mov -32(%rax),%r13
1745
+ mov -24(%rax),%r12
1746
+ mov -16(%rax),%rbp
1747
+ mov -8(%rax),%rbx
1748
+ mov %r15,240($context)
1749
+ mov %r14,232($context)
1750
+ mov %r13,224($context)
1751
+ mov %r12,216($context)
1752
+ mov %rbp,160($context)
1753
+ mov %rbx,144($context)
1754
+
1755
+ lea -0xd8(%rax),%rsi # %xmm save area
1756
+ lea 512($context),%rdi # & context.Xmm6
1757
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
1758
+ .long 0xa548f3fc # cld; rep movsq
1759
+
1760
+ .Lcommon_seh_tail:
1761
+ mov 8(%rax),%rdi
1762
+ mov 16(%rax),%rsi
1763
+ mov %rax,152($context) # restore context->Rsp
1764
+ mov %rsi,168($context) # restore context->Rsi
1765
+ mov %rdi,176($context) # restore context->Rdi
1766
+
1767
+ mov 40($disp),%rdi # disp->ContextRecord
1768
+ mov $context,%rsi # context
1769
+ mov \$154,%ecx # sizeof(CONTEXT)
1770
+ .long 0xa548f3fc # cld; rep movsq
1771
+
1772
+ mov $disp,%rsi
1773
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1774
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
1775
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
1776
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1777
+ mov 40(%rsi),%r10 # disp->ContextRecord
1778
+ lea 56(%rsi),%r11 # &disp->HandlerData
1779
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
1780
+ mov %r10,32(%rsp) # arg5
1781
+ mov %r11,40(%rsp) # arg6
1782
+ mov %r12,48(%rsp) # arg7
1783
+ mov %rcx,56(%rsp) # arg8, (NULL)
1784
+ call *__imp_RtlVirtualUnwind(%rip)
1785
+
1786
+ mov \$1,%eax # ExceptionContinueSearch
1787
+ add \$64,%rsp
1788
+ popfq
1789
+ pop %r15
1790
+ pop %r14
1791
+ pop %r13
1792
+ pop %r12
1793
+ pop %rbp
1794
+ pop %rbx
1795
+ pop %rdi
1796
+ pop %rsi
1797
+ ret
1798
+ .size rsaz_se_handler,.-rsaz_se_handler
1799
+
1800
+ .section .pdata
1801
+ .align 4
1802
+ .rva .LSEH_begin_rsaz_1024_sqr_avx2
1803
+ .rva .LSEH_end_rsaz_1024_sqr_avx2
1804
+ .rva .LSEH_info_rsaz_1024_sqr_avx2
1805
+
1806
+ .rva .LSEH_begin_rsaz_1024_mul_avx2
1807
+ .rva .LSEH_end_rsaz_1024_mul_avx2
1808
+ .rva .LSEH_info_rsaz_1024_mul_avx2
1809
+
1810
+ .rva .LSEH_begin_rsaz_1024_gather5
1811
+ .rva .LSEH_end_rsaz_1024_gather5
1812
+ .rva .LSEH_info_rsaz_1024_gather5
1813
+ .section .xdata
1814
+ .align 8
1815
+ .LSEH_info_rsaz_1024_sqr_avx2:
1816
+ .byte 9,0,0,0
1817
+ .rva rsaz_se_handler
1818
+ .rva .Lsqr_1024_body,.Lsqr_1024_epilogue
1819
+ .LSEH_info_rsaz_1024_mul_avx2:
1820
+ .byte 9,0,0,0
1821
+ .rva rsaz_se_handler
1822
+ .rva .Lmul_1024_body,.Lmul_1024_epilogue
1823
+ .LSEH_info_rsaz_1024_gather5:
1824
+ .byte 0x01,0x33,0x16,0x00
1825
+ .byte 0x36,0xf8,0x09,0x00 #vmovaps 0x90(rsp),xmm15
1826
+ .byte 0x31,0xe8,0x08,0x00 #vmovaps 0x80(rsp),xmm14
1827
+ .byte 0x2c,0xd8,0x07,0x00 #vmovaps 0x70(rsp),xmm13
1828
+ .byte 0x27,0xc8,0x06,0x00 #vmovaps 0x60(rsp),xmm12
1829
+ .byte 0x22,0xb8,0x05,0x00 #vmovaps 0x50(rsp),xmm11
1830
+ .byte 0x1d,0xa8,0x04,0x00 #vmovaps 0x40(rsp),xmm10
1831
+ .byte 0x18,0x98,0x03,0x00 #vmovaps 0x30(rsp),xmm9
1832
+ .byte 0x13,0x88,0x02,0x00 #vmovaps 0x20(rsp),xmm8
1833
+ .byte 0x0e,0x78,0x01,0x00 #vmovaps 0x10(rsp),xmm7
1834
+ .byte 0x09,0x68,0x00,0x00 #vmovaps 0x00(rsp),xmm6
1835
+ .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
1836
+ ___
1837
+ }
1838
+
1839
+ foreach (split("\n",$code)) {
1840
+ s/\`([^\`]*)\`/eval($1)/ge;
1841
+
1842
+ s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge or
1843
+
1844
+ s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
1845
+ s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or
1846
+ s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
1847
+ s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
1848
+ s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
1849
+ print $_,"\n";
1850
+ }
1851
+
1852
+ }}} else {{{
1853
+ print <<___; # assembler is too old
1854
+ .text
1855
+
1856
+ .globl rsaz_avx2_eligible
1857
+ .type rsaz_avx2_eligible,\@abi-omnipotent
1858
+ rsaz_avx2_eligible:
1859
+ xor %eax,%eax
1860
+ ret
1861
+ .size rsaz_avx2_eligible,.-rsaz_avx2_eligible
1862
+
1863
+ .globl rsaz_1024_sqr_avx2
1864
+ .globl rsaz_1024_mul_avx2
1865
+ .globl rsaz_1024_norm2red_avx2
1866
+ .globl rsaz_1024_red2norm_avx2
1867
+ .globl rsaz_1024_scatter5_avx2
1868
+ .globl rsaz_1024_gather5_avx2
1869
+ .type rsaz_1024_sqr_avx2,\@abi-omnipotent
1870
+ rsaz_1024_sqr_avx2:
1871
+ rsaz_1024_mul_avx2:
1872
+ rsaz_1024_norm2red_avx2:
1873
+ rsaz_1024_red2norm_avx2:
1874
+ rsaz_1024_scatter5_avx2:
1875
+ rsaz_1024_gather5_avx2:
1876
+ .byte 0x0f,0x0b # ud2
1877
+ ret
1878
+ .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
1879
+ ___
1880
+ }}}
1881
+
1882
+ close STDOUT;