ring-native 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (261)
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/Gemfile +3 -0
  4. data/README.md +22 -0
  5. data/Rakefile +1 -0
  6. data/ext/ring/extconf.rb +29 -0
  7. data/lib/ring/native.rb +8 -0
  8. data/lib/ring/native/version.rb +5 -0
  9. data/ring-native.gemspec +25 -0
  10. data/vendor/ring/BUILDING.md +40 -0
  11. data/vendor/ring/Cargo.toml +43 -0
  12. data/vendor/ring/LICENSE +185 -0
  13. data/vendor/ring/Makefile +35 -0
  14. data/vendor/ring/PORTING.md +163 -0
  15. data/vendor/ring/README.md +113 -0
  16. data/vendor/ring/STYLE.md +197 -0
  17. data/vendor/ring/appveyor.yml +27 -0
  18. data/vendor/ring/build.rs +108 -0
  19. data/vendor/ring/crypto/aes/aes.c +1142 -0
  20. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +25 -0
  21. data/vendor/ring/crypto/aes/aes_test.cc +93 -0
  22. data/vendor/ring/crypto/aes/asm/aes-586.pl +2368 -0
  23. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +1249 -0
  24. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +2246 -0
  25. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +1318 -0
  26. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +2084 -0
  27. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +675 -0
  28. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +1364 -0
  29. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +1565 -0
  30. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +841 -0
  31. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +1116 -0
  32. data/vendor/ring/crypto/aes/internal.h +87 -0
  33. data/vendor/ring/crypto/aes/mode_wrappers.c +61 -0
  34. data/vendor/ring/crypto/bn/add.c +394 -0
  35. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +694 -0
  36. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +1503 -0
  37. data/vendor/ring/crypto/bn/asm/bn-586.pl +774 -0
  38. data/vendor/ring/crypto/bn/asm/co-586.pl +287 -0
  39. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +1882 -0
  40. data/vendor/ring/crypto/bn/asm/x86-mont.pl +592 -0
  41. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +599 -0
  42. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +1393 -0
  43. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +3507 -0
  44. data/vendor/ring/crypto/bn/bn.c +352 -0
  45. data/vendor/ring/crypto/bn/bn_asn1.c +74 -0
  46. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +25 -0
  47. data/vendor/ring/crypto/bn/bn_test.cc +1696 -0
  48. data/vendor/ring/crypto/bn/cmp.c +200 -0
  49. data/vendor/ring/crypto/bn/convert.c +433 -0
  50. data/vendor/ring/crypto/bn/ctx.c +311 -0
  51. data/vendor/ring/crypto/bn/div.c +594 -0
  52. data/vendor/ring/crypto/bn/exponentiation.c +1335 -0
  53. data/vendor/ring/crypto/bn/gcd.c +711 -0
  54. data/vendor/ring/crypto/bn/generic.c +1019 -0
  55. data/vendor/ring/crypto/bn/internal.h +316 -0
  56. data/vendor/ring/crypto/bn/montgomery.c +516 -0
  57. data/vendor/ring/crypto/bn/mul.c +888 -0
  58. data/vendor/ring/crypto/bn/prime.c +829 -0
  59. data/vendor/ring/crypto/bn/random.c +334 -0
  60. data/vendor/ring/crypto/bn/rsaz_exp.c +262 -0
  61. data/vendor/ring/crypto/bn/rsaz_exp.h +53 -0
  62. data/vendor/ring/crypto/bn/shift.c +276 -0
  63. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +25 -0
  64. data/vendor/ring/crypto/bytestring/bytestring_test.cc +421 -0
  65. data/vendor/ring/crypto/bytestring/cbb.c +399 -0
  66. data/vendor/ring/crypto/bytestring/cbs.c +227 -0
  67. data/vendor/ring/crypto/bytestring/internal.h +46 -0
  68. data/vendor/ring/crypto/chacha/chacha_generic.c +140 -0
  69. data/vendor/ring/crypto/chacha/chacha_vec.c +323 -0
  70. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +1447 -0
  71. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +153 -0
  72. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +25 -0
  73. data/vendor/ring/crypto/cipher/e_aes.c +390 -0
  74. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +208 -0
  75. data/vendor/ring/crypto/cipher/internal.h +173 -0
  76. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +543 -0
  77. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +9 -0
  78. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +475 -0
  79. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +23 -0
  80. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +422 -0
  81. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +484 -0
  82. data/vendor/ring/crypto/cipher/test/cipher_test.txt +100 -0
  83. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +25 -0
  84. data/vendor/ring/crypto/constant_time_test.c +304 -0
  85. data/vendor/ring/crypto/cpu-arm-asm.S +32 -0
  86. data/vendor/ring/crypto/cpu-arm.c +199 -0
  87. data/vendor/ring/crypto/cpu-intel.c +261 -0
  88. data/vendor/ring/crypto/crypto.c +151 -0
  89. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +2118 -0
  90. data/vendor/ring/crypto/curve25519/curve25519.c +4888 -0
  91. data/vendor/ring/crypto/curve25519/x25519_test.cc +128 -0
  92. data/vendor/ring/crypto/digest/md32_common.h +181 -0
  93. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +2725 -0
  94. data/vendor/ring/crypto/ec/ec.c +193 -0
  95. data/vendor/ring/crypto/ec/ec_curves.c +61 -0
  96. data/vendor/ring/crypto/ec/ec_key.c +228 -0
  97. data/vendor/ring/crypto/ec/ec_montgomery.c +114 -0
  98. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +25 -0
  99. data/vendor/ring/crypto/ec/internal.h +243 -0
  100. data/vendor/ring/crypto/ec/oct.c +253 -0
  101. data/vendor/ring/crypto/ec/p256-64.c +1794 -0
  102. data/vendor/ring/crypto/ec/p256-x86_64-table.h +9548 -0
  103. data/vendor/ring/crypto/ec/p256-x86_64.c +509 -0
  104. data/vendor/ring/crypto/ec/simple.c +1007 -0
  105. data/vendor/ring/crypto/ec/util-64.c +183 -0
  106. data/vendor/ring/crypto/ec/wnaf.c +508 -0
  107. data/vendor/ring/crypto/ecdh/ecdh.c +155 -0
  108. data/vendor/ring/crypto/ecdsa/ecdsa.c +304 -0
  109. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +193 -0
  110. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +25 -0
  111. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +327 -0
  112. data/vendor/ring/crypto/header_removed.h +17 -0
  113. data/vendor/ring/crypto/internal.h +495 -0
  114. data/vendor/ring/crypto/libring.Windows.vcxproj +101 -0
  115. data/vendor/ring/crypto/mem.c +98 -0
  116. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +1045 -0
  117. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +517 -0
  118. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +1393 -0
  119. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +1741 -0
  120. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +422 -0
  121. data/vendor/ring/crypto/modes/ctr.c +226 -0
  122. data/vendor/ring/crypto/modes/gcm.c +1206 -0
  123. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +25 -0
  124. data/vendor/ring/crypto/modes/gcm_test.c +348 -0
  125. data/vendor/ring/crypto/modes/internal.h +299 -0
  126. data/vendor/ring/crypto/perlasm/arm-xlate.pl +170 -0
  127. data/vendor/ring/crypto/perlasm/readme +100 -0
  128. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +1164 -0
  129. data/vendor/ring/crypto/perlasm/x86asm.pl +292 -0
  130. data/vendor/ring/crypto/perlasm/x86gas.pl +263 -0
  131. data/vendor/ring/crypto/perlasm/x86masm.pl +200 -0
  132. data/vendor/ring/crypto/perlasm/x86nasm.pl +187 -0
  133. data/vendor/ring/crypto/poly1305/poly1305.c +331 -0
  134. data/vendor/ring/crypto/poly1305/poly1305_arm.c +301 -0
  135. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +2015 -0
  136. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +25 -0
  137. data/vendor/ring/crypto/poly1305/poly1305_test.cc +80 -0
  138. data/vendor/ring/crypto/poly1305/poly1305_test.txt +52 -0
  139. data/vendor/ring/crypto/poly1305/poly1305_vec.c +892 -0
  140. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +75 -0
  141. data/vendor/ring/crypto/rand/internal.h +32 -0
  142. data/vendor/ring/crypto/rand/rand.c +189 -0
  143. data/vendor/ring/crypto/rand/urandom.c +219 -0
  144. data/vendor/ring/crypto/rand/windows.c +56 -0
  145. data/vendor/ring/crypto/refcount_c11.c +66 -0
  146. data/vendor/ring/crypto/refcount_lock.c +53 -0
  147. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +25 -0
  148. data/vendor/ring/crypto/refcount_test.c +58 -0
  149. data/vendor/ring/crypto/rsa/blinding.c +462 -0
  150. data/vendor/ring/crypto/rsa/internal.h +108 -0
  151. data/vendor/ring/crypto/rsa/padding.c +300 -0
  152. data/vendor/ring/crypto/rsa/rsa.c +450 -0
  153. data/vendor/ring/crypto/rsa/rsa_asn1.c +261 -0
  154. data/vendor/ring/crypto/rsa/rsa_impl.c +944 -0
  155. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +25 -0
  156. data/vendor/ring/crypto/rsa/rsa_test.cc +437 -0
  157. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +436 -0
  158. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +2390 -0
  159. data/vendor/ring/crypto/sha/asm/sha256-586.pl +1275 -0
  160. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +735 -0
  161. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +14 -0
  162. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +14 -0
  163. data/vendor/ring/crypto/sha/asm/sha512-586.pl +911 -0
  164. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +666 -0
  165. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +14 -0
  166. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +14 -0
  167. data/vendor/ring/crypto/sha/sha1.c +271 -0
  168. data/vendor/ring/crypto/sha/sha256.c +204 -0
  169. data/vendor/ring/crypto/sha/sha512.c +355 -0
  170. data/vendor/ring/crypto/test/file_test.cc +326 -0
  171. data/vendor/ring/crypto/test/file_test.h +181 -0
  172. data/vendor/ring/crypto/test/malloc.cc +150 -0
  173. data/vendor/ring/crypto/test/scoped_types.h +95 -0
  174. data/vendor/ring/crypto/test/test.Windows.vcxproj +35 -0
  175. data/vendor/ring/crypto/test/test_util.cc +46 -0
  176. data/vendor/ring/crypto/test/test_util.h +41 -0
  177. data/vendor/ring/crypto/thread_none.c +55 -0
  178. data/vendor/ring/crypto/thread_pthread.c +165 -0
  179. data/vendor/ring/crypto/thread_test.Windows.vcxproj +25 -0
  180. data/vendor/ring/crypto/thread_test.c +200 -0
  181. data/vendor/ring/crypto/thread_win.c +282 -0
  182. data/vendor/ring/examples/checkdigest.rs +103 -0
  183. data/vendor/ring/include/openssl/aes.h +121 -0
  184. data/vendor/ring/include/openssl/arm_arch.h +129 -0
  185. data/vendor/ring/include/openssl/base.h +156 -0
  186. data/vendor/ring/include/openssl/bn.h +794 -0
  187. data/vendor/ring/include/openssl/buffer.h +18 -0
  188. data/vendor/ring/include/openssl/bytestring.h +235 -0
  189. data/vendor/ring/include/openssl/chacha.h +37 -0
  190. data/vendor/ring/include/openssl/cmac.h +76 -0
  191. data/vendor/ring/include/openssl/cpu.h +184 -0
  192. data/vendor/ring/include/openssl/crypto.h +43 -0
  193. data/vendor/ring/include/openssl/curve25519.h +88 -0
  194. data/vendor/ring/include/openssl/ec.h +225 -0
  195. data/vendor/ring/include/openssl/ec_key.h +129 -0
  196. data/vendor/ring/include/openssl/ecdh.h +110 -0
  197. data/vendor/ring/include/openssl/ecdsa.h +156 -0
  198. data/vendor/ring/include/openssl/err.h +201 -0
  199. data/vendor/ring/include/openssl/mem.h +101 -0
  200. data/vendor/ring/include/openssl/obj_mac.h +71 -0
  201. data/vendor/ring/include/openssl/opensslfeatures.h +68 -0
  202. data/vendor/ring/include/openssl/opensslv.h +18 -0
  203. data/vendor/ring/include/openssl/ossl_typ.h +18 -0
  204. data/vendor/ring/include/openssl/poly1305.h +51 -0
  205. data/vendor/ring/include/openssl/rand.h +70 -0
  206. data/vendor/ring/include/openssl/rsa.h +399 -0
  207. data/vendor/ring/include/openssl/thread.h +133 -0
  208. data/vendor/ring/include/openssl/type_check.h +71 -0
  209. data/vendor/ring/mk/Common.props +63 -0
  210. data/vendor/ring/mk/Windows.props +42 -0
  211. data/vendor/ring/mk/WindowsTest.props +18 -0
  212. data/vendor/ring/mk/appveyor.bat +62 -0
  213. data/vendor/ring/mk/bottom_of_makefile.mk +54 -0
  214. data/vendor/ring/mk/ring.mk +266 -0
  215. data/vendor/ring/mk/top_of_makefile.mk +214 -0
  216. data/vendor/ring/mk/travis.sh +40 -0
  217. data/vendor/ring/mk/update-travis-yml.py +229 -0
  218. data/vendor/ring/ring.sln +153 -0
  219. data/vendor/ring/src/aead.rs +682 -0
  220. data/vendor/ring/src/agreement.rs +248 -0
  221. data/vendor/ring/src/c.rs +129 -0
  222. data/vendor/ring/src/constant_time.rs +37 -0
  223. data/vendor/ring/src/der.rs +96 -0
  224. data/vendor/ring/src/digest.rs +690 -0
  225. data/vendor/ring/src/digest_tests.txt +57 -0
  226. data/vendor/ring/src/ecc.rs +28 -0
  227. data/vendor/ring/src/ecc_build.rs +279 -0
  228. data/vendor/ring/src/ecc_curves.rs +117 -0
  229. data/vendor/ring/src/ed25519_tests.txt +2579 -0
  230. data/vendor/ring/src/exe_tests.rs +46 -0
  231. data/vendor/ring/src/ffi.rs +29 -0
  232. data/vendor/ring/src/file_test.rs +187 -0
  233. data/vendor/ring/src/hkdf.rs +153 -0
  234. data/vendor/ring/src/hkdf_tests.txt +59 -0
  235. data/vendor/ring/src/hmac.rs +414 -0
  236. data/vendor/ring/src/hmac_tests.txt +97 -0
  237. data/vendor/ring/src/input.rs +312 -0
  238. data/vendor/ring/src/lib.rs +41 -0
  239. data/vendor/ring/src/pbkdf2.rs +265 -0
  240. data/vendor/ring/src/pbkdf2_tests.txt +113 -0
  241. data/vendor/ring/src/polyfill.rs +57 -0
  242. data/vendor/ring/src/rand.rs +28 -0
  243. data/vendor/ring/src/signature.rs +314 -0
  244. data/vendor/ring/third-party/NIST/README.md +9 -0
  245. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +263 -0
  246. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +309 -0
  247. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +267 -0
  248. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +263 -0
  249. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +309 -0
  250. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +267 -0
  251. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +263 -0
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +309 -0
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +267 -0
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +519 -0
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +309 -0
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +523 -0
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +519 -0
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +309 -0
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +523 -0
  260. data/vendor/ring/third-party/NIST/sha256sums.txt +1 -0
  261. metadata +333 -0
data/vendor/ring/crypto/bn/asm/co-586.pl
@@ -0,0 +1,287 @@
1
+ #!/usr/local/bin/perl
2
+
3
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
4
+ push(@INC,"${dir}","${dir}../../perlasm");
5
+ require "x86asm.pl";
6
+
7
+ &asm_init($ARGV[0],$0);
8
+
9
+ &bn_mul_comba("bn_mul_comba8",8);
10
+ &bn_mul_comba("bn_mul_comba4",4);
11
+ &bn_sqr_comba("bn_sqr_comba8",8);
12
+ &bn_sqr_comba("bn_sqr_comba4",4);
13
+
14
+ &asm_finish();
15
+
16
+ sub mul_add_c
17
+ {
18
+ local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
19
+
20
+ # pos == -1 if eax and edx are pre-loaded, 0 to load from next
21
+ # words, and 1 if load return value
22
+
23
+ &comment("mul a[$ai]*b[$bi]");
24
+
25
+ # "eax" and "edx" will always be pre-loaded.
26
+ # &mov("eax",&DWP($ai*4,$a,"",0)) ;
27
+ # &mov("edx",&DWP($bi*4,$b,"",0));
28
+
29
+ &mul("edx");
30
+ &add($c0,"eax");
31
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # laod next a
32
+ &mov("eax",&wparam(0)) if $pos > 0; # load r[]
33
+ ###
34
+ &adc($c1,"edx");
35
+ &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # laod next b
36
+ &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b
37
+ ###
38
+ &adc($c2,0);
39
+ # if pos > 1, it means it is the last loop
40
+ &mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
41
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a
42
+ }
43
+
44
+ sub sqr_add_c
45
+ {
46
+ local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
47
+
48
+ # pos == -1 if eax and edx are pre-loaded, 0 to load from next
49
+ # words, and 1 if load return value
50
+
51
+ &comment("sqr a[$ai]*a[$bi]");
52
+
53
+ # "eax" and "edx" will always be pre-loaded.
54
+ # &mov("eax",&DWP($ai*4,$a,"",0)) ;
55
+ # &mov("edx",&DWP($bi*4,$b,"",0));
56
+
57
+ if ($ai == $bi)
58
+ { &mul("eax");}
59
+ else
60
+ { &mul("edx");}
61
+ &add($c0,"eax");
62
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
63
+ ###
64
+ &adc($c1,"edx");
65
+ &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
66
+ ###
67
+ &adc($c2,0);
68
+ # if pos > 1, it means it is the last loop
69
+ &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
70
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
71
+ }
72
+
73
+ sub sqr_add_c2
74
+ {
75
+ local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
76
+
77
+ # pos == -1 if eax and edx are pre-loaded, 0 to load from next
78
+ # words, and 1 if load return value
79
+
80
+ &comment("sqr a[$ai]*a[$bi]");
81
+
82
+ # "eax" and "edx" will always be pre-loaded.
83
+ # &mov("eax",&DWP($ai*4,$a,"",0)) ;
84
+ # &mov("edx",&DWP($bi*4,$a,"",0));
85
+
86
+ if ($ai == $bi)
87
+ { &mul("eax");}
88
+ else
89
+ { &mul("edx");}
90
+ &add("eax","eax");
91
+ ###
92
+ &adc("edx","edx");
93
+ ###
94
+ &adc($c2,0);
95
+ &add($c0,"eax");
96
+ &adc($c1,"edx");
97
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
98
+ &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
99
+ &adc($c2,0);
100
+ &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
101
+ &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
102
+ ###
103
+ }
104
+
105
+ sub bn_mul_comba
106
+ {
107
+ local($name,$num)=@_;
108
+ local($a,$b,$c0,$c1,$c2);
109
+ local($i,$as,$ae,$bs,$be,$ai,$bi);
110
+ local($tot,$end);
111
+
112
+ &function_begin_B($name,"");
113
+
114
+ $c0="ebx";
115
+ $c1="ecx";
116
+ $c2="ebp";
117
+ $a="esi";
118
+ $b="edi";
119
+
120
+ $as=0;
121
+ $ae=0;
122
+ $bs=0;
123
+ $be=0;
124
+ $tot=$num+$num-1;
125
+
126
+ &push("esi");
127
+ &mov($a,&wparam(1));
128
+ &push("edi");
129
+ &mov($b,&wparam(2));
130
+ &push("ebp");
131
+ &push("ebx");
132
+
133
+ &xor($c0,$c0);
134
+ &mov("eax",&DWP(0,$a,"",0)); # load the first word
135
+ &xor($c1,$c1);
136
+ &mov("edx",&DWP(0,$b,"",0)); # load the first second
137
+
138
+ for ($i=0; $i<$tot; $i++)
139
+ {
140
+ $ai=$as;
141
+ $bi=$bs;
142
+ $end=$be+1;
143
+
144
+ &comment("################## Calculate word $i");
145
+
146
+ for ($j=$bs; $j<$end; $j++)
147
+ {
148
+ &xor($c2,$c2) if ($j == $bs);
149
+ if (($j+1) == $end)
150
+ {
151
+ $v=1;
152
+ $v=2 if (($i+1) == $tot);
153
+ }
154
+ else
155
+ { $v=0; }
156
+ if (($j+1) != $end)
157
+ {
158
+ $na=($ai-1);
159
+ $nb=($bi+1);
160
+ }
161
+ else
162
+ {
163
+ $na=$as+($i < ($num-1));
164
+ $nb=$bs+($i >= ($num-1));
165
+ }
166
+ #printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
167
+ &mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
168
+ if ($v)
169
+ {
170
+ &comment("saved r[$i]");
171
+ # &mov("eax",&wparam(0));
172
+ # &mov(&DWP($i*4,"eax","",0),$c0);
173
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
174
+ }
175
+ $ai--;
176
+ $bi++;
177
+ }
178
+ $as++ if ($i < ($num-1));
179
+ $ae++ if ($i >= ($num-1));
180
+
181
+ $bs++ if ($i >= ($num-1));
182
+ $be++ if ($i < ($num-1));
183
+ }
184
+ &comment("save r[$i]");
185
+ # &mov("eax",&wparam(0));
186
+ &mov(&DWP($i*4,"eax","",0),$c0);
187
+
188
+ &pop("ebx");
189
+ &pop("ebp");
190
+ &pop("edi");
191
+ &pop("esi");
192
+ &ret();
193
+ &function_end_B($name);
194
+ }
195
+
196
+ sub bn_sqr_comba
197
+ {
198
+ local($name,$num)=@_;
199
+ local($r,$a,$c0,$c1,$c2)=@_;
200
+ local($i,$as,$ae,$bs,$be,$ai,$bi);
201
+ local($b,$tot,$end,$half);
202
+
203
+ &function_begin_B($name,"");
204
+
205
+ $c0="ebx";
206
+ $c1="ecx";
207
+ $c2="ebp";
208
+ $a="esi";
209
+ $r="edi";
210
+
211
+ &push("esi");
212
+ &push("edi");
213
+ &push("ebp");
214
+ &push("ebx");
215
+ &mov($r,&wparam(0));
216
+ &mov($a,&wparam(1));
217
+ &xor($c0,$c0);
218
+ &xor($c1,$c1);
219
+ &mov("eax",&DWP(0,$a,"",0)); # load the first word
220
+
221
+ $as=0;
222
+ $ae=0;
223
+ $bs=0;
224
+ $be=0;
225
+ $tot=$num+$num-1;
226
+
227
+ for ($i=0; $i<$tot; $i++)
228
+ {
229
+ $ai=$as;
230
+ $bi=$bs;
231
+ $end=$be+1;
232
+
233
+ &comment("############### Calculate word $i");
234
+ for ($j=$bs; $j<$end; $j++)
235
+ {
236
+ &xor($c2,$c2) if ($j == $bs);
237
+ if (($ai-1) < ($bi+1))
238
+ {
239
+ $v=1;
240
+ $v=2 if ($i+1) == $tot;
241
+ }
242
+ else
243
+ { $v=0; }
244
+ if (!$v)
245
+ {
246
+ $na=$ai-1;
247
+ $nb=$bi+1;
248
+ }
249
+ else
250
+ {
251
+ $na=$as+($i < ($num-1));
252
+ $nb=$bs+($i >= ($num-1));
253
+ }
254
+ if ($ai == $bi)
255
+ {
256
+ &sqr_add_c($r,$a,$ai,$bi,
257
+ $c0,$c1,$c2,$v,$i,$na,$nb);
258
+ }
259
+ else
260
+ {
261
+ &sqr_add_c2($r,$a,$ai,$bi,
262
+ $c0,$c1,$c2,$v,$i,$na,$nb);
263
+ }
264
+ if ($v)
265
+ {
266
+ &comment("saved r[$i]");
267
+ #&mov(&DWP($i*4,$r,"",0),$c0);
268
+ ($c0,$c1,$c2)=($c1,$c2,$c0);
269
+ last;
270
+ }
271
+ $ai--;
272
+ $bi++;
273
+ }
274
+ $as++ if ($i < ($num-1));
275
+ $ae++ if ($i >= ($num-1));
276
+
277
+ $bs++ if ($i >= ($num-1));
278
+ $be++ if ($i < ($num-1));
279
+ }
280
+ &mov(&DWP($i*4,$r,"",0),$c0);
281
+ &pop("ebx");
282
+ &pop("ebp");
283
+ &pop("edi");
284
+ &pop("esi");
285
+ &ret();
286
+ &function_end_B($name);
287
+ }
data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl
@@ -0,0 +1,1882 @@
1
+ #!/usr/bin/env perl
2
+
3
+ ##############################################################################
4
+ # #
5
+ # Copyright (c) 2012, Intel Corporation #
6
+ # #
7
+ # All rights reserved. #
8
+ # #
9
+ # Redistribution and use in source and binary forms, with or without #
10
+ # modification, are permitted provided that the following conditions are #
11
+ # met: #
12
+ # #
13
+ # * Redistributions of source code must retain the above copyright #
14
+ # notice, this list of conditions and the following disclaimer. #
15
+ # #
16
+ # * Redistributions in binary form must reproduce the above copyright #
17
+ # notice, this list of conditions and the following disclaimer in the #
18
+ # documentation and/or other materials provided with the #
19
+ # distribution. #
20
+ # #
21
+ # * Neither the name of the Intel Corporation nor the names of its #
22
+ # contributors may be used to endorse or promote products derived from #
23
+ # this software without specific prior written permission. #
24
+ # #
25
+ # #
26
+ # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY #
27
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
28
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
29
+ # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
30
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
31
+ # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
32
+ # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
33
+ # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
34
+ # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
35
+ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
36
+ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
37
+ # #
38
+ ##############################################################################
39
+ # Developers and authors: #
40
+ # Shay Gueron (1, 2), and Vlad Krasnov (1) #
41
+ # (1) Intel Corporation, Israel Development Center, Haifa, Israel #
42
+ # (2) University of Haifa, Israel #
43
+ ##############################################################################
44
+ # Reference: #
45
+ # [1] S. Gueron, V. Krasnov: "Software Implementation of Modular #
46
+ # Exponentiation, Using Advanced Vector Instructions Architectures", #
47
+ # F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369, #
48
+ # pp. 119?135, 2012. Springer-Verlag Berlin Heidelberg 2012 #
49
+ # [2] S. Gueron: "Efficient Software Implementations of Modular #
50
+ # Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012). #
51
+ # [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring",IEEE #
52
+ # Proceedings of 9th International Conference on Information Technology: #
53
+ # New Generations (ITNG 2012), pp.821-823 (2012) #
54
+ # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
55
+ # resistant 1024-bit modular exponentiation, for optimizing RSA2048 #
56
+ # on AVX2 capable x86_64 platforms", #
57
+ # http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest#
58
+ ##############################################################################
59
+ #
60
+ # +13% improvement over original submission by <appro@openssl.org>
61
+ #
62
+ # rsa2048 sign/sec OpenSSL 1.0.1 scalar(*) this
63
+ # 2.3GHz Haswell 621 765/+23% 1113/+79%
64
+ # 2.3GHz Broadwell(**) 688 1200(***)/+74% 1120/+63%
65
+ #
66
+ # (*) if system doesn't support AVX2, for reference purposes;
67
+ # (**) scaled to 2.3GHz to simplify comparison;
68
+ # (***) scalar AD*X code is faster than AVX2 and is preferred code
69
+ # path for Broadwell;
70
+
71
+ $flavour = shift;
72
+ $output = shift;
73
+ if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
74
+
75
+ $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
76
+
77
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
78
+ ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
79
+ ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
80
+ die "can't locate x86_64-xlate.pl";
81
+
82
+ # In upstream, this is controlled by shelling out to the compiler to check
83
+ # versions, but BoringSSL is intended to be used with pre-generated perlasm
84
+ # output, so this isn't useful anyway.
85
+ #
86
+ # TODO(davidben): Enable these after testing. $avx goes up to 2 and $addx to 1.
87
+ $avx = 0;
88
+ $addx = 0;
89
+
90
+ open OUT,"| \"$^X\" $xlate $flavour $output";
91
+ *STDOUT = *OUT;
92
+
93
+ if ($avx>1) {{{
94
+ { # void AMS_WW(
95
+ my $rp="%rdi"; # BN_ULONG *rp,
96
+ my $ap="%rsi"; # const BN_ULONG *ap,
97
+ my $np="%rdx"; # const BN_ULONG *np,
98
+ my $n0="%ecx"; # const BN_ULONG n0,
99
+ my $rep="%r8d"; # int repeat);
100
+
101
+ # The registers that hold the accumulated redundant result
102
+ # The AMM works on 1024 bit operands, and redundant word size is 29
103
+ # Therefore: ceil(1024/29)/4 = 9
104
+ my $ACC0="%ymm0";
105
+ my $ACC1="%ymm1";
106
+ my $ACC2="%ymm2";
107
+ my $ACC3="%ymm3";
108
+ my $ACC4="%ymm4";
109
+ my $ACC5="%ymm5";
110
+ my $ACC6="%ymm6";
111
+ my $ACC7="%ymm7";
112
+ my $ACC8="%ymm8";
113
+ my $ACC9="%ymm9";
114
+ # Registers that hold the broadcasted words of bp, currently used
115
+ my $B1="%ymm10";
116
+ my $B2="%ymm11";
117
+ # Registers that hold the broadcasted words of Y, currently used
118
+ my $Y1="%ymm12";
119
+ my $Y2="%ymm13";
120
+ # Helper registers
121
+ my $TEMP1="%ymm14";
122
+ my $AND_MASK="%ymm15";
123
+ # alu registers that hold the first words of the ACC
124
+ my $r0="%r9";
125
+ my $r1="%r10";
126
+ my $r2="%r11";
127
+ my $r3="%r12";
128
+
129
+ my $i="%r14d"; # loop counter
130
+ my $tmp = "%r15";
131
+
132
+ my $FrameSize=32*18+32*8; # place for A^2 and 2*A
133
+
134
+ my $aap=$r0;
135
+ my $tp0="%rbx";
136
+ my $tp1=$r3;
137
+ my $tpa=$tmp;
138
+
139
+ $np="%r13"; # reassigned argument
140
+
141
+ $code.=<<___;
142
+ .text
143
+
144
+ .globl rsaz_1024_sqr_avx2
145
+ .type rsaz_1024_sqr_avx2,\@function,5
146
+ .align 64
147
+ rsaz_1024_sqr_avx2: # 702 cycles, 14% faster than rsaz_1024_mul_avx2
148
+ lea (%rsp), %rax
149
+ push %rbx
150
+ push %rbp
151
+ push %r12
152
+ push %r13
153
+ push %r14
154
+ push %r15
155
+ vzeroupper
156
+ ___
157
+ $code.=<<___ if ($win64);
158
+ lea -0xa8(%rsp),%rsp
159
+ vmovaps %xmm6,-0xd8(%rax)
160
+ vmovaps %xmm7,-0xc8(%rax)
161
+ vmovaps %xmm8,-0xb8(%rax)
162
+ vmovaps %xmm9,-0xa8(%rax)
163
+ vmovaps %xmm10,-0x98(%rax)
164
+ vmovaps %xmm11,-0x88(%rax)
165
+ vmovaps %xmm12,-0x78(%rax)
166
+ vmovaps %xmm13,-0x68(%rax)
167
+ vmovaps %xmm14,-0x58(%rax)
168
+ vmovaps %xmm15,-0x48(%rax)
169
+ .Lsqr_1024_body:
170
+ ___
171
+ $code.=<<___;
172
+ mov %rax,%rbp
173
+ mov %rdx, $np # reassigned argument
174
+ sub \$$FrameSize, %rsp
175
+ mov $np, $tmp
176
+ sub \$-128, $rp # size optimization
177
+ sub \$-128, $ap
178
+ sub \$-128, $np
179
+
180
+ and \$4095, $tmp # see if $np crosses page
181
+ add \$32*10, $tmp
182
+ shr \$12, $tmp
183
+ vpxor $ACC9,$ACC9,$ACC9
184
+ jz .Lsqr_1024_no_n_copy
185
+
186
+ # unaligned 256-bit load that crosses page boundary can
187
+ # cause >2x performance degradation here, so if $np does
188
+ # cross page boundary, copy it to stack and make sure stack
189
+ # frame doesn't...
190
+ sub \$32*10,%rsp
191
+ vmovdqu 32*0-128($np), $ACC0
192
+ and \$-2048, %rsp
193
+ vmovdqu 32*1-128($np), $ACC1
194
+ vmovdqu 32*2-128($np), $ACC2
195
+ vmovdqu 32*3-128($np), $ACC3
196
+ vmovdqu 32*4-128($np), $ACC4
197
+ vmovdqu 32*5-128($np), $ACC5
198
+ vmovdqu 32*6-128($np), $ACC6
199
+ vmovdqu 32*7-128($np), $ACC7
200
+ vmovdqu 32*8-128($np), $ACC8
201
+ lea $FrameSize+128(%rsp),$np
202
+ vmovdqu $ACC0, 32*0-128($np)
203
+ vmovdqu $ACC1, 32*1-128($np)
204
+ vmovdqu $ACC2, 32*2-128($np)
205
+ vmovdqu $ACC3, 32*3-128($np)
206
+ vmovdqu $ACC4, 32*4-128($np)
207
+ vmovdqu $ACC5, 32*5-128($np)
208
+ vmovdqu $ACC6, 32*6-128($np)
209
+ vmovdqu $ACC7, 32*7-128($np)
210
+ vmovdqu $ACC8, 32*8-128($np)
211
+ vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero
212
+
213
+ .Lsqr_1024_no_n_copy:
214
+ and \$-1024, %rsp
215
+
216
+ vmovdqu 32*1-128($ap), $ACC1
217
+ vmovdqu 32*2-128($ap), $ACC2
218
+ vmovdqu 32*3-128($ap), $ACC3
219
+ vmovdqu 32*4-128($ap), $ACC4
220
+ vmovdqu 32*5-128($ap), $ACC5
221
+ vmovdqu 32*6-128($ap), $ACC6
222
+ vmovdqu 32*7-128($ap), $ACC7
223
+ vmovdqu 32*8-128($ap), $ACC8
224
+
225
+ lea 192(%rsp), $tp0 # 64+128=192
226
+ vpbroadcastq .Land_mask(%rip), $AND_MASK
227
+ jmp .LOOP_GRANDE_SQR_1024
228
+
229
+ .align 32
230
+ .LOOP_GRANDE_SQR_1024:
231
+ lea 32*18+128(%rsp), $aap # size optimization
232
+ lea 448(%rsp), $tp1 # 64+128+256=448
233
+
234
+ # the squaring is performed as described in Variant B of
235
+ # "Speeding up Big-Number Squaring", so start by calculating
236
+ # the A*2=A+A vector
237
+ vpaddq $ACC1, $ACC1, $ACC1
238
+ vpbroadcastq 32*0-128($ap), $B1
239
+ vpaddq $ACC2, $ACC2, $ACC2
240
+ vmovdqa $ACC1, 32*0-128($aap)
241
+ vpaddq $ACC3, $ACC3, $ACC3
242
+ vmovdqa $ACC2, 32*1-128($aap)
243
+ vpaddq $ACC4, $ACC4, $ACC4
244
+ vmovdqa $ACC3, 32*2-128($aap)
245
+ vpaddq $ACC5, $ACC5, $ACC5
246
+ vmovdqa $ACC4, 32*3-128($aap)
247
+ vpaddq $ACC6, $ACC6, $ACC6
248
+ vmovdqa $ACC5, 32*4-128($aap)
249
+ vpaddq $ACC7, $ACC7, $ACC7
250
+ vmovdqa $ACC6, 32*5-128($aap)
251
+ vpaddq $ACC8, $ACC8, $ACC8
252
+ vmovdqa $ACC7, 32*6-128($aap)
253
+ vpxor $ACC9, $ACC9, $ACC9
254
+ vmovdqa $ACC8, 32*7-128($aap)
255
+
256
+ vpmuludq 32*0-128($ap), $B1, $ACC0
257
+ vpbroadcastq 32*1-128($ap), $B2
258
+ vmovdqu $ACC9, 32*9-192($tp0) # zero upper half
259
+ vpmuludq $B1, $ACC1, $ACC1
260
+ vmovdqu $ACC9, 32*10-448($tp1)
261
+ vpmuludq $B1, $ACC2, $ACC2
262
+ vmovdqu $ACC9, 32*11-448($tp1)
263
+ vpmuludq $B1, $ACC3, $ACC3
264
+ vmovdqu $ACC9, 32*12-448($tp1)
265
+ vpmuludq $B1, $ACC4, $ACC4
266
+ vmovdqu $ACC9, 32*13-448($tp1)
267
+ vpmuludq $B1, $ACC5, $ACC5
268
+ vmovdqu $ACC9, 32*14-448($tp1)
269
+ vpmuludq $B1, $ACC6, $ACC6
270
+ vmovdqu $ACC9, 32*15-448($tp1)
271
+ vpmuludq $B1, $ACC7, $ACC7
272
+ vmovdqu $ACC9, 32*16-448($tp1)
273
+ vpmuludq $B1, $ACC8, $ACC8
274
+ vpbroadcastq 32*2-128($ap), $B1
275
+ vmovdqu $ACC9, 32*17-448($tp1)
276
+
277
+ mov $ap, $tpa
278
+ mov \$4, $i
279
+ jmp .Lsqr_entry_1024
280
+ ___
281
+ $TEMP0=$Y1;
282
+ $TEMP2=$Y2;
283
+ $code.=<<___;
284
+ .align 32
285
+ .LOOP_SQR_1024:
286
+ vpbroadcastq 32*1-128($tpa), $B2
287
+ vpmuludq 32*0-128($ap), $B1, $ACC0
288
+ vpaddq 32*0-192($tp0), $ACC0, $ACC0
289
+ vpmuludq 32*0-128($aap), $B1, $ACC1
290
+ vpaddq 32*1-192($tp0), $ACC1, $ACC1
291
+ vpmuludq 32*1-128($aap), $B1, $ACC2
292
+ vpaddq 32*2-192($tp0), $ACC2, $ACC2
293
+ vpmuludq 32*2-128($aap), $B1, $ACC3
294
+ vpaddq 32*3-192($tp0), $ACC3, $ACC3
295
+ vpmuludq 32*3-128($aap), $B1, $ACC4
296
+ vpaddq 32*4-192($tp0), $ACC4, $ACC4
297
+ vpmuludq 32*4-128($aap), $B1, $ACC5
298
+ vpaddq 32*5-192($tp0), $ACC5, $ACC5
299
+ vpmuludq 32*5-128($aap), $B1, $ACC6
300
+ vpaddq 32*6-192($tp0), $ACC6, $ACC6
301
+ vpmuludq 32*6-128($aap), $B1, $ACC7
302
+ vpaddq 32*7-192($tp0), $ACC7, $ACC7
303
+ vpmuludq 32*7-128($aap), $B1, $ACC8
304
+ vpbroadcastq 32*2-128($tpa), $B1
305
+ vpaddq 32*8-192($tp0), $ACC8, $ACC8
306
+ .Lsqr_entry_1024:
307
+ vmovdqu $ACC0, 32*0-192($tp0)
308
+ vmovdqu $ACC1, 32*1-192($tp0)
309
+
310
+ vpmuludq 32*1-128($ap), $B2, $TEMP0
311
+ vpaddq $TEMP0, $ACC2, $ACC2
312
+ vpmuludq 32*1-128($aap), $B2, $TEMP1
313
+ vpaddq $TEMP1, $ACC3, $ACC3
314
+ vpmuludq 32*2-128($aap), $B2, $TEMP2
315
+ vpaddq $TEMP2, $ACC4, $ACC4
316
+ vpmuludq 32*3-128($aap), $B2, $TEMP0
317
+ vpaddq $TEMP0, $ACC5, $ACC5
318
+ vpmuludq 32*4-128($aap), $B2, $TEMP1
319
+ vpaddq $TEMP1, $ACC6, $ACC6
320
+ vpmuludq 32*5-128($aap), $B2, $TEMP2
321
+ vpaddq $TEMP2, $ACC7, $ACC7
322
+ vpmuludq 32*6-128($aap), $B2, $TEMP0
323
+ vpaddq $TEMP0, $ACC8, $ACC8
324
+ vpmuludq 32*7-128($aap), $B2, $ACC0
325
+ vpbroadcastq 32*3-128($tpa), $B2
326
+ vpaddq 32*9-192($tp0), $ACC0, $ACC0
327
+
328
+ vmovdqu $ACC2, 32*2-192($tp0)
329
+ vmovdqu $ACC3, 32*3-192($tp0)
330
+
331
+ vpmuludq 32*2-128($ap), $B1, $TEMP2
332
+ vpaddq $TEMP2, $ACC4, $ACC4
333
+ vpmuludq 32*2-128($aap), $B1, $TEMP0
334
+ vpaddq $TEMP0, $ACC5, $ACC5
335
+ vpmuludq 32*3-128($aap), $B1, $TEMP1
336
+ vpaddq $TEMP1, $ACC6, $ACC6
337
+ vpmuludq 32*4-128($aap), $B1, $TEMP2
338
+ vpaddq $TEMP2, $ACC7, $ACC7
339
+ vpmuludq 32*5-128($aap), $B1, $TEMP0
340
+ vpaddq $TEMP0, $ACC8, $ACC8
341
+ vpmuludq 32*6-128($aap), $B1, $TEMP1
342
+ vpaddq $TEMP1, $ACC0, $ACC0
343
+ vpmuludq 32*7-128($aap), $B1, $ACC1
344
+ vpbroadcastq 32*4-128($tpa), $B1
345
+ vpaddq 32*10-448($tp1), $ACC1, $ACC1
346
+
347
+ vmovdqu $ACC4, 32*4-192($tp0)
348
+ vmovdqu $ACC5, 32*5-192($tp0)
349
+
350
+ vpmuludq 32*3-128($ap), $B2, $TEMP0
351
+ vpaddq $TEMP0, $ACC6, $ACC6
352
+ vpmuludq 32*3-128($aap), $B2, $TEMP1
353
+ vpaddq $TEMP1, $ACC7, $ACC7
354
+ vpmuludq 32*4-128($aap), $B2, $TEMP2
355
+ vpaddq $TEMP2, $ACC8, $ACC8
356
+ vpmuludq 32*5-128($aap), $B2, $TEMP0
357
+ vpaddq $TEMP0, $ACC0, $ACC0
358
+ vpmuludq 32*6-128($aap), $B2, $TEMP1
359
+ vpaddq $TEMP1, $ACC1, $ACC1
360
+ vpmuludq 32*7-128($aap), $B2, $ACC2
361
+ vpbroadcastq 32*5-128($tpa), $B2
362
+ vpaddq 32*11-448($tp1), $ACC2, $ACC2
363
+
364
+ vmovdqu $ACC6, 32*6-192($tp0)
365
+ vmovdqu $ACC7, 32*7-192($tp0)
366
+
367
+ vpmuludq 32*4-128($ap), $B1, $TEMP0
368
+ vpaddq $TEMP0, $ACC8, $ACC8
369
+ vpmuludq 32*4-128($aap), $B1, $TEMP1
370
+ vpaddq $TEMP1, $ACC0, $ACC0
371
+ vpmuludq 32*5-128($aap), $B1, $TEMP2
372
+ vpaddq $TEMP2, $ACC1, $ACC1
373
+ vpmuludq 32*6-128($aap), $B1, $TEMP0
374
+ vpaddq $TEMP0, $ACC2, $ACC2
375
+ vpmuludq 32*7-128($aap), $B1, $ACC3
376
+ vpbroadcastq 32*6-128($tpa), $B1
377
+ vpaddq 32*12-448($tp1), $ACC3, $ACC3
378
+
379
+ vmovdqu $ACC8, 32*8-192($tp0)
380
+ vmovdqu $ACC0, 32*9-192($tp0)
381
+ lea 8($tp0), $tp0
382
+
383
+ vpmuludq 32*5-128($ap), $B2, $TEMP2
384
+ vpaddq $TEMP2, $ACC1, $ACC1
385
+ vpmuludq 32*5-128($aap), $B2, $TEMP0
386
+ vpaddq $TEMP0, $ACC2, $ACC2
387
+ vpmuludq 32*6-128($aap), $B2, $TEMP1
388
+ vpaddq $TEMP1, $ACC3, $ACC3
389
+ vpmuludq 32*7-128($aap), $B2, $ACC4
390
+ vpbroadcastq 32*7-128($tpa), $B2
391
+ vpaddq 32*13-448($tp1), $ACC4, $ACC4
392
+
393
+ vmovdqu $ACC1, 32*10-448($tp1)
394
+ vmovdqu $ACC2, 32*11-448($tp1)
395
+
396
+ vpmuludq 32*6-128($ap), $B1, $TEMP0
397
+ vpaddq $TEMP0, $ACC3, $ACC3
398
+ vpmuludq 32*6-128($aap), $B1, $TEMP1
399
+ vpbroadcastq 32*8-128($tpa), $ACC0 # borrow $ACC0 for $B1
400
+ vpaddq $TEMP1, $ACC4, $ACC4
401
+ vpmuludq 32*7-128($aap), $B1, $ACC5
402
+ vpbroadcastq 32*0+8-128($tpa), $B1 # for next iteration
403
+ vpaddq 32*14-448($tp1), $ACC5, $ACC5
404
+
405
+ vmovdqu $ACC3, 32*12-448($tp1)
406
+ vmovdqu $ACC4, 32*13-448($tp1)
407
+ lea 8($tpa), $tpa
408
+
409
+ vpmuludq 32*7-128($ap), $B2, $TEMP0
410
+ vpaddq $TEMP0, $ACC5, $ACC5
411
+ vpmuludq 32*7-128($aap), $B2, $ACC6
412
+ vpaddq 32*15-448($tp1), $ACC6, $ACC6
413
+
414
+ vpmuludq 32*8-128($ap), $ACC0, $ACC7
415
+ vmovdqu $ACC5, 32*14-448($tp1)
416
+ vpaddq 32*16-448($tp1), $ACC7, $ACC7
417
+ vmovdqu $ACC6, 32*15-448($tp1)
418
+ vmovdqu $ACC7, 32*16-448($tp1)
419
+ lea 8($tp1), $tp1
420
+
421
+ dec $i
422
+ jnz .LOOP_SQR_1024
423
+ ___
424
+ $ZERO = $ACC9;
425
+ $TEMP0 = $B1;
426
+ $TEMP2 = $B2;
427
+ $TEMP3 = $Y1;
428
+ $TEMP4 = $Y2;
429
+ $code.=<<___;
430
+ #we need to fix indexes 32-39 to avoid overflow
431
+ vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0),
432
+ vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0)
433
+ vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0)
434
+ lea 192(%rsp), $tp0 # 64+128=192
435
+
436
+ vpsrlq \$29, $ACC8, $TEMP1
437
+ vpand $AND_MASK, $ACC8, $ACC8
438
+ vpsrlq \$29, $ACC1, $TEMP2
439
+ vpand $AND_MASK, $ACC1, $ACC1
440
+
441
+ vpermq \$0x93, $TEMP1, $TEMP1
442
+ vpxor $ZERO, $ZERO, $ZERO
443
+ vpermq \$0x93, $TEMP2, $TEMP2
444
+
445
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
446
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
447
+ vpaddq $TEMP0, $ACC8, $ACC8
448
+ vpblendd \$3, $TEMP2, $ZERO, $TEMP2
449
+ vpaddq $TEMP1, $ACC1, $ACC1
450
+ vpaddq $TEMP2, $ACC2, $ACC2
451
+ vmovdqu $ACC1, 32*9-192($tp0)
452
+ vmovdqu $ACC2, 32*10-192($tp0)
453
+
454
+ mov (%rsp), %rax
455
+ mov 8(%rsp), $r1
456
+ mov 16(%rsp), $r2
457
+ mov 24(%rsp), $r3
458
+ vmovdqu 32*1(%rsp), $ACC1
459
+ vmovdqu 32*2-192($tp0), $ACC2
460
+ vmovdqu 32*3-192($tp0), $ACC3
461
+ vmovdqu 32*4-192($tp0), $ACC4
462
+ vmovdqu 32*5-192($tp0), $ACC5
463
+ vmovdqu 32*6-192($tp0), $ACC6
464
+ vmovdqu 32*7-192($tp0), $ACC7
465
+
466
+ mov %rax, $r0
467
+ imull $n0, %eax
468
+ and \$0x1fffffff, %eax
469
+ vmovd %eax, $Y1
470
+
471
+ mov %rax, %rdx
472
+ imulq -128($np), %rax
473
+ vpbroadcastq $Y1, $Y1
474
+ add %rax, $r0
475
+ mov %rdx, %rax
476
+ imulq 8-128($np), %rax
477
+ shr \$29, $r0
478
+ add %rax, $r1
479
+ mov %rdx, %rax
480
+ imulq 16-128($np), %rax
481
+ add $r0, $r1
482
+ add %rax, $r2
483
+ imulq 24-128($np), %rdx
484
+ add %rdx, $r3
485
+
486
+ mov $r1, %rax
487
+ imull $n0, %eax
488
+ and \$0x1fffffff, %eax
489
+
490
+ mov \$9, $i
491
+ jmp .LOOP_REDUCE_1024
492
+
493
+ .align 32
494
+ .LOOP_REDUCE_1024:
495
+ vmovd %eax, $Y2
496
+ vpbroadcastq $Y2, $Y2
497
+
498
+ vpmuludq 32*1-128($np), $Y1, $TEMP0
499
+ mov %rax, %rdx
500
+ imulq -128($np), %rax
501
+ vpaddq $TEMP0, $ACC1, $ACC1
502
+ add %rax, $r1
503
+ vpmuludq 32*2-128($np), $Y1, $TEMP1
504
+ mov %rdx, %rax
505
+ imulq 8-128($np), %rax
506
+ vpaddq $TEMP1, $ACC2, $ACC2
507
+ vpmuludq 32*3-128($np), $Y1, $TEMP2
508
+ .byte 0x67
509
+ add %rax, $r2
510
+ .byte 0x67
511
+ mov %rdx, %rax
512
+ imulq 16-128($np), %rax
513
+ shr \$29, $r1
514
+ vpaddq $TEMP2, $ACC3, $ACC3
515
+ vpmuludq 32*4-128($np), $Y1, $TEMP0
516
+ add %rax, $r3
517
+ add $r1, $r2
518
+ vpaddq $TEMP0, $ACC4, $ACC4
519
+ vpmuludq 32*5-128($np), $Y1, $TEMP1
520
+ mov $r2, %rax
521
+ imull $n0, %eax
522
+ vpaddq $TEMP1, $ACC5, $ACC5
523
+ vpmuludq 32*6-128($np), $Y1, $TEMP2
524
+ and \$0x1fffffff, %eax
525
+ vpaddq $TEMP2, $ACC6, $ACC6
526
+ vpmuludq 32*7-128($np), $Y1, $TEMP0
527
+ vpaddq $TEMP0, $ACC7, $ACC7
528
+ vpmuludq 32*8-128($np), $Y1, $TEMP1
529
+ vmovd %eax, $Y1
530
+ #vmovdqu 32*1-8-128($np), $TEMP2 # moved below
531
+ vpaddq $TEMP1, $ACC8, $ACC8
532
+ #vmovdqu 32*2-8-128($np), $TEMP0 # moved below
533
+ vpbroadcastq $Y1, $Y1
534
+
535
+ vpmuludq 32*1-8-128($np), $Y2, $TEMP2 # see above
536
+ vmovdqu 32*3-8-128($np), $TEMP1
537
+ mov %rax, %rdx
538
+ imulq -128($np), %rax
539
+ vpaddq $TEMP2, $ACC1, $ACC1
540
+ vpmuludq 32*2-8-128($np), $Y2, $TEMP0 # see above
541
+ vmovdqu 32*4-8-128($np), $TEMP2
542
+ add %rax, $r2
543
+ mov %rdx, %rax
544
+ imulq 8-128($np), %rax
545
+ vpaddq $TEMP0, $ACC2, $ACC2
546
+ add $r3, %rax
547
+ shr \$29, $r2
548
+ vpmuludq $Y2, $TEMP1, $TEMP1
549
+ vmovdqu 32*5-8-128($np), $TEMP0
550
+ add $r2, %rax
551
+ vpaddq $TEMP1, $ACC3, $ACC3
552
+ vpmuludq $Y2, $TEMP2, $TEMP2
553
+ vmovdqu 32*6-8-128($np), $TEMP1
554
+ .byte 0x67
555
+ mov %rax, $r3
556
+ imull $n0, %eax
557
+ vpaddq $TEMP2, $ACC4, $ACC4
558
+ vpmuludq $Y2, $TEMP0, $TEMP0
559
+ .byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 # vmovdqu 32*7-8-128($np), $TEMP2
560
+ and \$0x1fffffff, %eax
561
+ vpaddq $TEMP0, $ACC5, $ACC5
562
+ vpmuludq $Y2, $TEMP1, $TEMP1
563
+ vmovdqu 32*8-8-128($np), $TEMP0
564
+ vpaddq $TEMP1, $ACC6, $ACC6
565
+ vpmuludq $Y2, $TEMP2, $TEMP2
566
+ vmovdqu 32*9-8-128($np), $ACC9
567
+ vmovd %eax, $ACC0 # borrow ACC0 for Y2
568
+ imulq -128($np), %rax
569
+ vpaddq $TEMP2, $ACC7, $ACC7
570
+ vpmuludq $Y2, $TEMP0, $TEMP0
571
+ vmovdqu 32*1-16-128($np), $TEMP1
572
+ vpbroadcastq $ACC0, $ACC0
573
+ vpaddq $TEMP0, $ACC8, $ACC8
574
+ vpmuludq $Y2, $ACC9, $ACC9
575
+ vmovdqu 32*2-16-128($np), $TEMP2
576
+ add %rax, $r3
577
+
578
+ ___
579
+ ($ACC0,$Y2)=($Y2,$ACC0);
580
+ $code.=<<___;
581
+ vmovdqu 32*1-24-128($np), $ACC0
582
+ vpmuludq $Y1, $TEMP1, $TEMP1
583
+ vmovdqu 32*3-16-128($np), $TEMP0
584
+ vpaddq $TEMP1, $ACC1, $ACC1
585
+ vpmuludq $Y2, $ACC0, $ACC0
586
+ vpmuludq $Y1, $TEMP2, $TEMP2
587
+ .byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff # vmovdqu 32*4-16-128($np), $TEMP1
588
+ vpaddq $ACC1, $ACC0, $ACC0
589
+ vpaddq $TEMP2, $ACC2, $ACC2
590
+ vpmuludq $Y1, $TEMP0, $TEMP0
591
+ vmovdqu 32*5-16-128($np), $TEMP2
592
+ .byte 0x67
593
+ vmovq $ACC0, %rax
594
+ vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
595
+ vpaddq $TEMP0, $ACC3, $ACC3
596
+ vpmuludq $Y1, $TEMP1, $TEMP1
597
+ vmovdqu 32*6-16-128($np), $TEMP0
598
+ vpaddq $TEMP1, $ACC4, $ACC4
599
+ vpmuludq $Y1, $TEMP2, $TEMP2
600
+ vmovdqu 32*7-16-128($np), $TEMP1
601
+ vpaddq $TEMP2, $ACC5, $ACC5
602
+ vpmuludq $Y1, $TEMP0, $TEMP0
603
+ vmovdqu 32*8-16-128($np), $TEMP2
604
+ vpaddq $TEMP0, $ACC6, $ACC6
605
+ vpmuludq $Y1, $TEMP1, $TEMP1
606
+ shr \$29, $r3
607
+ vmovdqu 32*9-16-128($np), $TEMP0
608
+ add $r3, %rax
609
+ vpaddq $TEMP1, $ACC7, $ACC7
610
+ vpmuludq $Y1, $TEMP2, $TEMP2
611
+ #vmovdqu 32*2-24-128($np), $TEMP1 # moved below
612
+ mov %rax, $r0
613
+ imull $n0, %eax
614
+ vpaddq $TEMP2, $ACC8, $ACC8
615
+ vpmuludq $Y1, $TEMP0, $TEMP0
616
+ and \$0x1fffffff, %eax
617
+ vmovd %eax, $Y1
618
+ vmovdqu 32*3-24-128($np), $TEMP2
619
+ .byte 0x67
620
+ vpaddq $TEMP0, $ACC9, $ACC9
621
+ vpbroadcastq $Y1, $Y1
622
+
623
+ vpmuludq 32*2-24-128($np), $Y2, $TEMP1 # see above
624
+ vmovdqu 32*4-24-128($np), $TEMP0
625
+ mov %rax, %rdx
626
+ imulq -128($np), %rax
627
+ mov 8(%rsp), $r1
628
+ vpaddq $TEMP1, $ACC2, $ACC1
629
+ vpmuludq $Y2, $TEMP2, $TEMP2
630
+ vmovdqu 32*5-24-128($np), $TEMP1
631
+ add %rax, $r0
632
+ mov %rdx, %rax
633
+ imulq 8-128($np), %rax
634
+ .byte 0x67
635
+ shr \$29, $r0
636
+ mov 16(%rsp), $r2
637
+ vpaddq $TEMP2, $ACC3, $ACC2
638
+ vpmuludq $Y2, $TEMP0, $TEMP0
639
+ vmovdqu 32*6-24-128($np), $TEMP2
640
+ add %rax, $r1
641
+ mov %rdx, %rax
642
+ imulq 16-128($np), %rax
643
+ vpaddq $TEMP0, $ACC4, $ACC3
644
+ vpmuludq $Y2, $TEMP1, $TEMP1
645
+ vmovdqu 32*7-24-128($np), $TEMP0
646
+ imulq 24-128($np), %rdx # future $r3
647
+ add %rax, $r2
648
+ lea ($r0,$r1), %rax
649
+ vpaddq $TEMP1, $ACC5, $ACC4
650
+ vpmuludq $Y2, $TEMP2, $TEMP2
651
+ vmovdqu 32*8-24-128($np), $TEMP1
652
+ mov %rax, $r1
653
+ imull $n0, %eax
654
+ vpmuludq $Y2, $TEMP0, $TEMP0
655
+ vpaddq $TEMP2, $ACC6, $ACC5
656
+ vmovdqu 32*9-24-128($np), $TEMP2
657
+ and \$0x1fffffff, %eax
658
+ vpaddq $TEMP0, $ACC7, $ACC6
659
+ vpmuludq $Y2, $TEMP1, $TEMP1
660
+ add 24(%rsp), %rdx
661
+ vpaddq $TEMP1, $ACC8, $ACC7
662
+ vpmuludq $Y2, $TEMP2, $TEMP2
663
+ vpaddq $TEMP2, $ACC9, $ACC8
664
+ vmovq $r3, $ACC9
665
+ mov %rdx, $r3
666
+
667
+ dec $i
668
+ jnz .LOOP_REDUCE_1024
669
+ ___
670
+ ($ACC0,$Y2)=($Y2,$ACC0);
671
+ $code.=<<___;
672
+ lea 448(%rsp), $tp1 # size optimization
673
+ vpaddq $ACC9, $Y2, $ACC0
674
+ vpxor $ZERO, $ZERO, $ZERO
675
+
676
+ vpaddq 32*9-192($tp0), $ACC0, $ACC0
677
+ vpaddq 32*10-448($tp1), $ACC1, $ACC1
678
+ vpaddq 32*11-448($tp1), $ACC2, $ACC2
679
+ vpaddq 32*12-448($tp1), $ACC3, $ACC3
680
+ vpaddq 32*13-448($tp1), $ACC4, $ACC4
681
+ vpaddq 32*14-448($tp1), $ACC5, $ACC5
682
+ vpaddq 32*15-448($tp1), $ACC6, $ACC6
683
+ vpaddq 32*16-448($tp1), $ACC7, $ACC7
684
+ vpaddq 32*17-448($tp1), $ACC8, $ACC8
685
+
686
+ vpsrlq \$29, $ACC0, $TEMP1
687
+ vpand $AND_MASK, $ACC0, $ACC0
688
+ vpsrlq \$29, $ACC1, $TEMP2
689
+ vpand $AND_MASK, $ACC1, $ACC1
690
+ vpsrlq \$29, $ACC2, $TEMP3
691
+ vpermq \$0x93, $TEMP1, $TEMP1
692
+ vpand $AND_MASK, $ACC2, $ACC2
693
+ vpsrlq \$29, $ACC3, $TEMP4
694
+ vpermq \$0x93, $TEMP2, $TEMP2
695
+ vpand $AND_MASK, $ACC3, $ACC3
696
+ vpermq \$0x93, $TEMP3, $TEMP3
697
+
698
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
699
+ vpermq \$0x93, $TEMP4, $TEMP4
700
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
701
+ vpaddq $TEMP0, $ACC0, $ACC0
702
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
703
+ vpaddq $TEMP1, $ACC1, $ACC1
704
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
705
+ vpaddq $TEMP2, $ACC2, $ACC2
706
+ vpblendd \$3, $TEMP4, $ZERO, $TEMP4
707
+ vpaddq $TEMP3, $ACC3, $ACC3
708
+ vpaddq $TEMP4, $ACC4, $ACC4
709
+
710
+ vpsrlq \$29, $ACC0, $TEMP1
711
+ vpand $AND_MASK, $ACC0, $ACC0
712
+ vpsrlq \$29, $ACC1, $TEMP2
713
+ vpand $AND_MASK, $ACC1, $ACC1
714
+ vpsrlq \$29, $ACC2, $TEMP3
715
+ vpermq \$0x93, $TEMP1, $TEMP1
716
+ vpand $AND_MASK, $ACC2, $ACC2
717
+ vpsrlq \$29, $ACC3, $TEMP4
718
+ vpermq \$0x93, $TEMP2, $TEMP2
719
+ vpand $AND_MASK, $ACC3, $ACC3
720
+ vpermq \$0x93, $TEMP3, $TEMP3
721
+
722
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
723
+ vpermq \$0x93, $TEMP4, $TEMP4
724
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
725
+ vpaddq $TEMP0, $ACC0, $ACC0
726
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
727
+ vpaddq $TEMP1, $ACC1, $ACC1
728
+ vmovdqu $ACC0, 32*0-128($rp)
729
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
730
+ vpaddq $TEMP2, $ACC2, $ACC2
731
+ vmovdqu $ACC1, 32*1-128($rp)
732
+ vpblendd \$3, $TEMP4, $ZERO, $TEMP4
733
+ vpaddq $TEMP3, $ACC3, $ACC3
734
+ vmovdqu $ACC2, 32*2-128($rp)
735
+ vpaddq $TEMP4, $ACC4, $ACC4
736
+ vmovdqu $ACC3, 32*3-128($rp)
737
+ ___
738
+ $TEMP5=$ACC0;
739
+ $code.=<<___;
740
+ vpsrlq \$29, $ACC4, $TEMP1
741
+ vpand $AND_MASK, $ACC4, $ACC4
742
+ vpsrlq \$29, $ACC5, $TEMP2
743
+ vpand $AND_MASK, $ACC5, $ACC5
744
+ vpsrlq \$29, $ACC6, $TEMP3
745
+ vpermq \$0x93, $TEMP1, $TEMP1
746
+ vpand $AND_MASK, $ACC6, $ACC6
747
+ vpsrlq \$29, $ACC7, $TEMP4
748
+ vpermq \$0x93, $TEMP2, $TEMP2
749
+ vpand $AND_MASK, $ACC7, $ACC7
750
+ vpsrlq \$29, $ACC8, $TEMP5
751
+ vpermq \$0x93, $TEMP3, $TEMP3
752
+ vpand $AND_MASK, $ACC8, $ACC8
753
+ vpermq \$0x93, $TEMP4, $TEMP4
754
+
755
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
756
+ vpermq \$0x93, $TEMP5, $TEMP5
757
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
758
+ vpaddq $TEMP0, $ACC4, $ACC4
759
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
760
+ vpaddq $TEMP1, $ACC5, $ACC5
761
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
762
+ vpaddq $TEMP2, $ACC6, $ACC6
763
+ vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
764
+ vpaddq $TEMP3, $ACC7, $ACC7
765
+ vpaddq $TEMP4, $ACC8, $ACC8
766
+
767
+ vpsrlq \$29, $ACC4, $TEMP1
768
+ vpand $AND_MASK, $ACC4, $ACC4
769
+ vpsrlq \$29, $ACC5, $TEMP2
770
+ vpand $AND_MASK, $ACC5, $ACC5
771
+ vpsrlq \$29, $ACC6, $TEMP3
772
+ vpermq \$0x93, $TEMP1, $TEMP1
773
+ vpand $AND_MASK, $ACC6, $ACC6
774
+ vpsrlq \$29, $ACC7, $TEMP4
775
+ vpermq \$0x93, $TEMP2, $TEMP2
776
+ vpand $AND_MASK, $ACC7, $ACC7
777
+ vpsrlq \$29, $ACC8, $TEMP5
778
+ vpermq \$0x93, $TEMP3, $TEMP3
779
+ vpand $AND_MASK, $ACC8, $ACC8
780
+ vpermq \$0x93, $TEMP4, $TEMP4
781
+
782
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
783
+ vpermq \$0x93, $TEMP5, $TEMP5
784
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
785
+ vpaddq $TEMP0, $ACC4, $ACC4
786
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
787
+ vpaddq $TEMP1, $ACC5, $ACC5
788
+ vmovdqu $ACC4, 32*4-128($rp)
789
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
790
+ vpaddq $TEMP2, $ACC6, $ACC6
791
+ vmovdqu $ACC5, 32*5-128($rp)
792
+ vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
793
+ vpaddq $TEMP3, $ACC7, $ACC7
794
+ vmovdqu $ACC6, 32*6-128($rp)
795
+ vpaddq $TEMP4, $ACC8, $ACC8
796
+ vmovdqu $ACC7, 32*7-128($rp)
797
+ vmovdqu $ACC8, 32*8-128($rp)
798
+
799
+ mov $rp, $ap
800
+ dec $rep
801
+ jne .LOOP_GRANDE_SQR_1024
802
+
803
+ vzeroall
804
+ mov %rbp, %rax
805
+ ___
806
+ $code.=<<___ if ($win64);
807
+ movaps -0xd8(%rax),%xmm6
808
+ movaps -0xc8(%rax),%xmm7
809
+ movaps -0xb8(%rax),%xmm8
810
+ movaps -0xa8(%rax),%xmm9
811
+ movaps -0x98(%rax),%xmm10
812
+ movaps -0x88(%rax),%xmm11
813
+ movaps -0x78(%rax),%xmm12
814
+ movaps -0x68(%rax),%xmm13
815
+ movaps -0x58(%rax),%xmm14
816
+ movaps -0x48(%rax),%xmm15
817
+ ___
818
+ $code.=<<___;
819
+ mov -48(%rax),%r15
820
+ mov -40(%rax),%r14
821
+ mov -32(%rax),%r13
822
+ mov -24(%rax),%r12
823
+ mov -16(%rax),%rbp
824
+ mov -8(%rax),%rbx
825
+ lea (%rax),%rsp # restore %rsp
826
+ .Lsqr_1024_epilogue:
827
+ ret
828
+ .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
829
+ ___
830
+ }
831
+
832
+ { # void AMM_WW(
833
+ my $rp="%rdi"; # BN_ULONG *rp,
834
+ my $ap="%rsi"; # const BN_ULONG *ap,
835
+ my $bp="%rdx"; # const BN_ULONG *bp,
836
+ my $np="%rcx"; # const BN_ULONG *np,
837
+ my $n0="%r8d"; # unsigned int n0);
838
+
839
+ # The registers that hold the accumulated redundant result
840
+ # The AMM works on 1024 bit operands, and redundant word size is 29
841
+ # Therefore: ceil(1024/29)/4 = 9
842
+ my $ACC0="%ymm0";
843
+ my $ACC1="%ymm1";
844
+ my $ACC2="%ymm2";
845
+ my $ACC3="%ymm3";
846
+ my $ACC4="%ymm4";
847
+ my $ACC5="%ymm5";
848
+ my $ACC6="%ymm6";
849
+ my $ACC7="%ymm7";
850
+ my $ACC8="%ymm8";
851
+ my $ACC9="%ymm9";
852
+
853
+ # Registers that hold the broadcasted words of multiplier, currently used
854
+ my $Bi="%ymm10";
855
+ my $Yi="%ymm11";
856
+
857
+ # Helper registers
858
+ my $TEMP0=$ACC0;
859
+ my $TEMP1="%ymm12";
860
+ my $TEMP2="%ymm13";
861
+ my $ZERO="%ymm14";
862
+ my $AND_MASK="%ymm15";
863
+
864
+ # alu registers that hold the first words of the ACC
865
+ my $r0="%r9";
866
+ my $r1="%r10";
867
+ my $r2="%r11";
868
+ my $r3="%r12";
869
+
870
+ my $i="%r14d";
871
+ my $tmp="%r15";
872
+
873
+ $bp="%r13"; # reassigned argument
874
+
875
+ $code.=<<___;
876
+ .globl rsaz_1024_mul_avx2
877
+ .type rsaz_1024_mul_avx2,\@function,5
878
+ .align 64
879
+ rsaz_1024_mul_avx2:
880
+ lea (%rsp), %rax
881
+ push %rbx
882
+ push %rbp
883
+ push %r12
884
+ push %r13
885
+ push %r14
886
+ push %r15
887
+ ___
888
+ $code.=<<___ if ($win64);
889
+ vzeroupper
890
+ lea -0xa8(%rsp),%rsp
891
+ vmovaps %xmm6,-0xd8(%rax)
892
+ vmovaps %xmm7,-0xc8(%rax)
893
+ vmovaps %xmm8,-0xb8(%rax)
894
+ vmovaps %xmm9,-0xa8(%rax)
895
+ vmovaps %xmm10,-0x98(%rax)
896
+ vmovaps %xmm11,-0x88(%rax)
897
+ vmovaps %xmm12,-0x78(%rax)
898
+ vmovaps %xmm13,-0x68(%rax)
899
+ vmovaps %xmm14,-0x58(%rax)
900
+ vmovaps %xmm15,-0x48(%rax)
901
+ .Lmul_1024_body:
902
+ ___
903
+ $code.=<<___;
904
+ mov %rax,%rbp
905
+ vzeroall
906
+ mov %rdx, $bp # reassigned argument
907
+ sub \$64,%rsp
908
+
909
+ # unaligned 256-bit load that crosses page boundary can
910
+ # cause severe performance degradation here, so if $ap does
911
+ # cross page boundary, swap it with $bp [meaning that caller
912
+ # is advised to lay down $ap and $bp next to each other, so
913
+ # that only one can cross page boundary].
914
+ .byte 0x67,0x67
915
+ mov $ap, $tmp
916
+ and \$4095, $tmp
917
+ add \$32*10, $tmp
918
+ shr \$12, $tmp
919
+ mov $ap, $tmp
920
+ cmovnz $bp, $ap
921
+ cmovnz $tmp, $bp
922
+
923
+ mov $np, $tmp
924
+ sub \$-128,$ap # size optimization
925
+ sub \$-128,$np
926
+ sub \$-128,$rp
927
+
928
+ and \$4095, $tmp # see if $np crosses page
929
+ add \$32*10, $tmp
930
+ .byte 0x67,0x67
931
+ shr \$12, $tmp
932
+ jz .Lmul_1024_no_n_copy
933
+
934
+ # unaligned 256-bit load that crosses page boundary can
935
+ # cause severe performance degradation here, so if $np does
936
+ # cross page boundary, copy it to stack and make sure stack
937
+ # frame doesn't...
938
+ sub \$32*10,%rsp
939
+ vmovdqu 32*0-128($np), $ACC0
940
+ and \$-512, %rsp
941
+ vmovdqu 32*1-128($np), $ACC1
942
+ vmovdqu 32*2-128($np), $ACC2
943
+ vmovdqu 32*3-128($np), $ACC3
944
+ vmovdqu 32*4-128($np), $ACC4
945
+ vmovdqu 32*5-128($np), $ACC5
946
+ vmovdqu 32*6-128($np), $ACC6
947
+ vmovdqu 32*7-128($np), $ACC7
948
+ vmovdqu 32*8-128($np), $ACC8
949
+ lea 64+128(%rsp),$np
950
+ vmovdqu $ACC0, 32*0-128($np)
951
+ vpxor $ACC0, $ACC0, $ACC0
952
+ vmovdqu $ACC1, 32*1-128($np)
953
+ vpxor $ACC1, $ACC1, $ACC1
954
+ vmovdqu $ACC2, 32*2-128($np)
955
+ vpxor $ACC2, $ACC2, $ACC2
956
+ vmovdqu $ACC3, 32*3-128($np)
957
+ vpxor $ACC3, $ACC3, $ACC3
958
+ vmovdqu $ACC4, 32*4-128($np)
959
+ vpxor $ACC4, $ACC4, $ACC4
960
+ vmovdqu $ACC5, 32*5-128($np)
961
+ vpxor $ACC5, $ACC5, $ACC5
962
+ vmovdqu $ACC6, 32*6-128($np)
963
+ vpxor $ACC6, $ACC6, $ACC6
964
+ vmovdqu $ACC7, 32*7-128($np)
965
+ vpxor $ACC7, $ACC7, $ACC7
966
+ vmovdqu $ACC8, 32*8-128($np)
967
+ vmovdqa $ACC0, $ACC8
968
+ vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall
969
+ .Lmul_1024_no_n_copy:
970
+ and \$-64,%rsp
971
+
972
+ mov ($bp), %rbx
973
+ vpbroadcastq ($bp), $Bi
974
+ vmovdqu $ACC0, (%rsp) # clear top of stack
975
+ xor $r0, $r0
976
+ .byte 0x67
977
+ xor $r1, $r1
978
+ xor $r2, $r2
979
+ xor $r3, $r3
980
+
981
+ vmovdqu .Land_mask(%rip), $AND_MASK
982
+ mov \$9, $i
983
+ vmovdqu $ACC9, 32*9-128($rp) # $ACC9 is zero after vzeroall
984
+ jmp .Loop_mul_1024
985
+
986
+ .align 32
987
+ .Loop_mul_1024:
988
+ vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*)
989
+ mov %rbx, %rax
990
+ imulq -128($ap), %rax
991
+ add $r0, %rax
992
+ mov %rbx, $r1
993
+ imulq 8-128($ap), $r1
994
+ add 8(%rsp), $r1
995
+
996
+ mov %rax, $r0
997
+ imull $n0, %eax
998
+ and \$0x1fffffff, %eax
999
+
1000
+ mov %rbx, $r2
1001
+ imulq 16-128($ap), $r2
1002
+ add 16(%rsp), $r2
1003
+
1004
+ mov %rbx, $r3
1005
+ imulq 24-128($ap), $r3
1006
+ add 24(%rsp), $r3
1007
+ vpmuludq 32*1-128($ap),$Bi,$TEMP0
1008
+ vmovd %eax, $Yi
1009
+ vpaddq $TEMP0,$ACC1,$ACC1
1010
+ vpmuludq 32*2-128($ap),$Bi,$TEMP1
1011
+ vpbroadcastq $Yi, $Yi
1012
+ vpaddq $TEMP1,$ACC2,$ACC2
1013
+ vpmuludq 32*3-128($ap),$Bi,$TEMP2
1014
+ vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3
1015
+ vpaddq $TEMP2,$ACC3,$ACC3
1016
+ vpmuludq 32*4-128($ap),$Bi,$TEMP0
1017
+ vpaddq $TEMP0,$ACC4,$ACC4
1018
+ vpmuludq 32*5-128($ap),$Bi,$TEMP1
1019
+ vpaddq $TEMP1,$ACC5,$ACC5
1020
+ vpmuludq 32*6-128($ap),$Bi,$TEMP2
1021
+ vpaddq $TEMP2,$ACC6,$ACC6
1022
+ vpmuludq 32*7-128($ap),$Bi,$TEMP0
1023
+ vpermq \$0x93, $ACC9, $ACC9 # correct $ACC3
1024
+ vpaddq $TEMP0,$ACC7,$ACC7
1025
+ vpmuludq 32*8-128($ap),$Bi,$TEMP1
1026
+ vpbroadcastq 8($bp), $Bi
1027
+ vpaddq $TEMP1,$ACC8,$ACC8
1028
+
1029
+ mov %rax,%rdx
1030
+ imulq -128($np),%rax
1031
+ add %rax,$r0
1032
+ mov %rdx,%rax
1033
+ imulq 8-128($np),%rax
1034
+ add %rax,$r1
1035
+ mov %rdx,%rax
1036
+ imulq 16-128($np),%rax
1037
+ add %rax,$r2
1038
+ shr \$29, $r0
1039
+ imulq 24-128($np),%rdx
1040
+ add %rdx,$r3
1041
+ add $r0, $r1
1042
+
1043
+ vpmuludq 32*1-128($np),$Yi,$TEMP2
1044
+ vmovq $Bi, %rbx
1045
+ vpaddq $TEMP2,$ACC1,$ACC1
1046
+ vpmuludq 32*2-128($np),$Yi,$TEMP0
1047
+ vpaddq $TEMP0,$ACC2,$ACC2
1048
+ vpmuludq 32*3-128($np),$Yi,$TEMP1
1049
+ vpaddq $TEMP1,$ACC3,$ACC3
1050
+ vpmuludq 32*4-128($np),$Yi,$TEMP2
1051
+ vpaddq $TEMP2,$ACC4,$ACC4
1052
+ vpmuludq 32*5-128($np),$Yi,$TEMP0
1053
+ vpaddq $TEMP0,$ACC5,$ACC5
1054
+ vpmuludq 32*6-128($np),$Yi,$TEMP1
1055
+ vpaddq $TEMP1,$ACC6,$ACC6
1056
+ vpmuludq 32*7-128($np),$Yi,$TEMP2
1057
+ vpblendd \$3, $ZERO, $ACC9, $ACC9 # correct $ACC3
1058
+ vpaddq $TEMP2,$ACC7,$ACC7
1059
+ vpmuludq 32*8-128($np),$Yi,$TEMP0
1060
+ vpaddq $ACC9, $ACC3, $ACC3 # correct $ACC3
1061
+ vpaddq $TEMP0,$ACC8,$ACC8
1062
+
1063
+ mov %rbx, %rax
1064
+ imulq -128($ap),%rax
1065
+ add %rax,$r1
1066
+ vmovdqu -8+32*1-128($ap),$TEMP1
1067
+ mov %rbx, %rax
1068
+ imulq 8-128($ap),%rax
1069
+ add %rax,$r2
1070
+ vmovdqu -8+32*2-128($ap),$TEMP2
1071
+
1072
+ mov $r1, %rax
1073
+ imull $n0, %eax
1074
+ and \$0x1fffffff, %eax
1075
+
1076
+ imulq 16-128($ap),%rbx
1077
+ add %rbx,$r3
1078
+ vpmuludq $Bi,$TEMP1,$TEMP1
1079
+ vmovd %eax, $Yi
1080
+ vmovdqu -8+32*3-128($ap),$TEMP0
1081
+ vpaddq $TEMP1,$ACC1,$ACC1
1082
+ vpmuludq $Bi,$TEMP2,$TEMP2
1083
+ vpbroadcastq $Yi, $Yi
1084
+ vmovdqu -8+32*4-128($ap),$TEMP1
1085
+ vpaddq $TEMP2,$ACC2,$ACC2
1086
+ vpmuludq $Bi,$TEMP0,$TEMP0
1087
+ vmovdqu -8+32*5-128($ap),$TEMP2
1088
+ vpaddq $TEMP0,$ACC3,$ACC3
1089
+ vpmuludq $Bi,$TEMP1,$TEMP1
1090
+ vmovdqu -8+32*6-128($ap),$TEMP0
1091
+ vpaddq $TEMP1,$ACC4,$ACC4
1092
+ vpmuludq $Bi,$TEMP2,$TEMP2
1093
+ vmovdqu -8+32*7-128($ap),$TEMP1
1094
+ vpaddq $TEMP2,$ACC5,$ACC5
1095
+ vpmuludq $Bi,$TEMP0,$TEMP0
1096
+ vmovdqu -8+32*8-128($ap),$TEMP2
1097
+ vpaddq $TEMP0,$ACC6,$ACC6
1098
+ vpmuludq $Bi,$TEMP1,$TEMP1
1099
+ vmovdqu -8+32*9-128($ap),$ACC9
1100
+ vpaddq $TEMP1,$ACC7,$ACC7
1101
+ vpmuludq $Bi,$TEMP2,$TEMP2
1102
+ vpaddq $TEMP2,$ACC8,$ACC8
1103
+ vpmuludq $Bi,$ACC9,$ACC9
1104
+ vpbroadcastq 16($bp), $Bi
1105
+
1106
+ mov %rax,%rdx
1107
+ imulq -128($np),%rax
1108
+ add %rax,$r1
1109
+ vmovdqu -8+32*1-128($np),$TEMP0
1110
+ mov %rdx,%rax
1111
+ imulq 8-128($np),%rax
1112
+ add %rax,$r2
1113
+ vmovdqu -8+32*2-128($np),$TEMP1
1114
+ shr \$29, $r1
1115
+ imulq 16-128($np),%rdx
1116
+ add %rdx,$r3
1117
+ add $r1, $r2
1118
+
1119
+ vpmuludq $Yi,$TEMP0,$TEMP0
1120
+ vmovq $Bi, %rbx
1121
+ vmovdqu -8+32*3-128($np),$TEMP2
1122
+ vpaddq $TEMP0,$ACC1,$ACC1
1123
+ vpmuludq $Yi,$TEMP1,$TEMP1
1124
+ vmovdqu -8+32*4-128($np),$TEMP0
1125
+ vpaddq $TEMP1,$ACC2,$ACC2
1126
+ vpmuludq $Yi,$TEMP2,$TEMP2
1127
+ vmovdqu -8+32*5-128($np),$TEMP1
1128
+ vpaddq $TEMP2,$ACC3,$ACC3
1129
+ vpmuludq $Yi,$TEMP0,$TEMP0
1130
+ vmovdqu -8+32*6-128($np),$TEMP2
1131
+ vpaddq $TEMP0,$ACC4,$ACC4
1132
+ vpmuludq $Yi,$TEMP1,$TEMP1
1133
+ vmovdqu -8+32*7-128($np),$TEMP0
1134
+ vpaddq $TEMP1,$ACC5,$ACC5
1135
+ vpmuludq $Yi,$TEMP2,$TEMP2
1136
+ vmovdqu -8+32*8-128($np),$TEMP1
1137
+ vpaddq $TEMP2,$ACC6,$ACC6
1138
+ vpmuludq $Yi,$TEMP0,$TEMP0
1139
+ vmovdqu -8+32*9-128($np),$TEMP2
1140
+ vpaddq $TEMP0,$ACC7,$ACC7
1141
+ vpmuludq $Yi,$TEMP1,$TEMP1
1142
+ vpaddq $TEMP1,$ACC8,$ACC8
1143
+ vpmuludq $Yi,$TEMP2,$TEMP2
1144
+ vpaddq $TEMP2,$ACC9,$ACC9
1145
+
1146
+ vmovdqu -16+32*1-128($ap),$TEMP0
1147
+ mov %rbx,%rax
1148
+ imulq -128($ap),%rax
1149
+ add $r2,%rax
1150
+
1151
+ vmovdqu -16+32*2-128($ap),$TEMP1
1152
+ mov %rax,$r2
1153
+ imull $n0, %eax
1154
+ and \$0x1fffffff, %eax
1155
+
1156
+ imulq 8-128($ap),%rbx
1157
+ add %rbx,$r3
1158
+ vpmuludq $Bi,$TEMP0,$TEMP0
1159
+ vmovd %eax, $Yi
1160
+ vmovdqu -16+32*3-128($ap),$TEMP2
1161
+ vpaddq $TEMP0,$ACC1,$ACC1
1162
+ vpmuludq $Bi,$TEMP1,$TEMP1
1163
+ vpbroadcastq $Yi, $Yi
1164
+ vmovdqu -16+32*4-128($ap),$TEMP0
1165
+ vpaddq $TEMP1,$ACC2,$ACC2
1166
+ vpmuludq $Bi,$TEMP2,$TEMP2
1167
+ vmovdqu -16+32*5-128($ap),$TEMP1
1168
+ vpaddq $TEMP2,$ACC3,$ACC3
1169
+ vpmuludq $Bi,$TEMP0,$TEMP0
1170
+ vmovdqu -16+32*6-128($ap),$TEMP2
1171
+ vpaddq $TEMP0,$ACC4,$ACC4
1172
+ vpmuludq $Bi,$TEMP1,$TEMP1
1173
+ vmovdqu -16+32*7-128($ap),$TEMP0
1174
+ vpaddq $TEMP1,$ACC5,$ACC5
1175
+ vpmuludq $Bi,$TEMP2,$TEMP2
1176
+ vmovdqu -16+32*8-128($ap),$TEMP1
1177
+ vpaddq $TEMP2,$ACC6,$ACC6
1178
+ vpmuludq $Bi,$TEMP0,$TEMP0
1179
+ vmovdqu -16+32*9-128($ap),$TEMP2
1180
+ vpaddq $TEMP0,$ACC7,$ACC7
1181
+ vpmuludq $Bi,$TEMP1,$TEMP1
1182
+ vpaddq $TEMP1,$ACC8,$ACC8
1183
+ vpmuludq $Bi,$TEMP2,$TEMP2
1184
+ vpbroadcastq 24($bp), $Bi
1185
+ vpaddq $TEMP2,$ACC9,$ACC9
1186
+
1187
+ vmovdqu -16+32*1-128($np),$TEMP0
1188
+ mov %rax,%rdx
1189
+ imulq -128($np),%rax
1190
+ add %rax,$r2
1191
+ vmovdqu -16+32*2-128($np),$TEMP1
1192
+ imulq 8-128($np),%rdx
1193
+ add %rdx,$r3
1194
+ shr \$29, $r2
1195
+
1196
+ vpmuludq $Yi,$TEMP0,$TEMP0
1197
+ vmovq $Bi, %rbx
1198
+ vmovdqu -16+32*3-128($np),$TEMP2
1199
+ vpaddq $TEMP0,$ACC1,$ACC1
1200
+ vpmuludq $Yi,$TEMP1,$TEMP1
1201
+ vmovdqu -16+32*4-128($np),$TEMP0
1202
+ vpaddq $TEMP1,$ACC2,$ACC2
1203
+ vpmuludq $Yi,$TEMP2,$TEMP2
1204
+ vmovdqu -16+32*5-128($np),$TEMP1
1205
+ vpaddq $TEMP2,$ACC3,$ACC3
1206
+ vpmuludq $Yi,$TEMP0,$TEMP0
1207
+ vmovdqu -16+32*6-128($np),$TEMP2
1208
+ vpaddq $TEMP0,$ACC4,$ACC4
1209
+ vpmuludq $Yi,$TEMP1,$TEMP1
1210
+ vmovdqu -16+32*7-128($np),$TEMP0
1211
+ vpaddq $TEMP1,$ACC5,$ACC5
1212
+ vpmuludq $Yi,$TEMP2,$TEMP2
1213
+ vmovdqu -16+32*8-128($np),$TEMP1
1214
+ vpaddq $TEMP2,$ACC6,$ACC6
1215
+ vpmuludq $Yi,$TEMP0,$TEMP0
1216
+ vmovdqu -16+32*9-128($np),$TEMP2
1217
+ vpaddq $TEMP0,$ACC7,$ACC7
1218
+ vpmuludq $Yi,$TEMP1,$TEMP1
1219
+ vmovdqu -24+32*1-128($ap),$TEMP0
1220
+ vpaddq $TEMP1,$ACC8,$ACC8
1221
+ vpmuludq $Yi,$TEMP2,$TEMP2
1222
+ vmovdqu -24+32*2-128($ap),$TEMP1
1223
+ vpaddq $TEMP2,$ACC9,$ACC9
1224
+
1225
+ add $r2, $r3
1226
+ imulq -128($ap),%rbx
1227
+ add %rbx,$r3
1228
+
1229
+ mov $r3, %rax
1230
+ imull $n0, %eax
1231
+ and \$0x1fffffff, %eax
1232
+
1233
+ vpmuludq $Bi,$TEMP0,$TEMP0
1234
+ vmovd %eax, $Yi
1235
+ vmovdqu -24+32*3-128($ap),$TEMP2
1236
+ vpaddq $TEMP0,$ACC1,$ACC1
1237
+ vpmuludq $Bi,$TEMP1,$TEMP1
1238
+ vpbroadcastq $Yi, $Yi
1239
+ vmovdqu -24+32*4-128($ap),$TEMP0
1240
+ vpaddq $TEMP1,$ACC2,$ACC2
1241
+ vpmuludq $Bi,$TEMP2,$TEMP2
1242
+ vmovdqu -24+32*5-128($ap),$TEMP1
1243
+ vpaddq $TEMP2,$ACC3,$ACC3
1244
+ vpmuludq $Bi,$TEMP0,$TEMP0
1245
+ vmovdqu -24+32*6-128($ap),$TEMP2
1246
+ vpaddq $TEMP0,$ACC4,$ACC4
1247
+ vpmuludq $Bi,$TEMP1,$TEMP1
1248
+ vmovdqu -24+32*7-128($ap),$TEMP0
1249
+ vpaddq $TEMP1,$ACC5,$ACC5
1250
+ vpmuludq $Bi,$TEMP2,$TEMP2
1251
+ vmovdqu -24+32*8-128($ap),$TEMP1
1252
+ vpaddq $TEMP2,$ACC6,$ACC6
1253
+ vpmuludq $Bi,$TEMP0,$TEMP0
1254
+ vmovdqu -24+32*9-128($ap),$TEMP2
1255
+ vpaddq $TEMP0,$ACC7,$ACC7
1256
+ vpmuludq $Bi,$TEMP1,$TEMP1
1257
+ vpaddq $TEMP1,$ACC8,$ACC8
1258
+ vpmuludq $Bi,$TEMP2,$TEMP2
1259
+ vpbroadcastq 32($bp), $Bi
1260
+ vpaddq $TEMP2,$ACC9,$ACC9
1261
+ add \$32, $bp # $bp++
1262
+
1263
+ vmovdqu -24+32*1-128($np),$TEMP0
1264
+ imulq -128($np),%rax
1265
+ add %rax,$r3
1266
+ shr \$29, $r3
1267
+
1268
+ vmovdqu -24+32*2-128($np),$TEMP1
1269
+ vpmuludq $Yi,$TEMP0,$TEMP0
1270
+ vmovq $Bi, %rbx
1271
+ vmovdqu -24+32*3-128($np),$TEMP2
1272
+ vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0
1273
+ vpmuludq $Yi,$TEMP1,$TEMP1
1274
+ vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
1275
+ vpaddq $TEMP1,$ACC2,$ACC1
1276
+ vmovdqu -24+32*4-128($np),$TEMP0
1277
+ vpmuludq $Yi,$TEMP2,$TEMP2
1278
+ vmovdqu -24+32*5-128($np),$TEMP1
1279
+ vpaddq $TEMP2,$ACC3,$ACC2
1280
+ vpmuludq $Yi,$TEMP0,$TEMP0
1281
+ vmovdqu -24+32*6-128($np),$TEMP2
1282
+ vpaddq $TEMP0,$ACC4,$ACC3
1283
+ vpmuludq $Yi,$TEMP1,$TEMP1
1284
+ vmovdqu -24+32*7-128($np),$TEMP0
1285
+ vpaddq $TEMP1,$ACC5,$ACC4
1286
+ vpmuludq $Yi,$TEMP2,$TEMP2
1287
+ vmovdqu -24+32*8-128($np),$TEMP1
1288
+ vpaddq $TEMP2,$ACC6,$ACC5
1289
+ vpmuludq $Yi,$TEMP0,$TEMP0
1290
+ vmovdqu -24+32*9-128($np),$TEMP2
1291
+ mov $r3, $r0
1292
+ vpaddq $TEMP0,$ACC7,$ACC6
1293
+ vpmuludq $Yi,$TEMP1,$TEMP1
1294
+ add (%rsp), $r0
1295
+ vpaddq $TEMP1,$ACC8,$ACC7
1296
+ vpmuludq $Yi,$TEMP2,$TEMP2
1297
+ vmovq $r3, $TEMP1
1298
+ vpaddq $TEMP2,$ACC9,$ACC8
1299
+
1300
+ dec $i
1301
+ jnz .Loop_mul_1024
1302
+ ___
1303
+
1304
+ # (*) The original implementation corrected ACC1-ACC3 for overflow
+ # only after 7 loop runs, i.e. after 28 iterations, or 56 additions.
+ # But since we underutilize resources, it is possible to correct in
+ # each iteration with only marginal performance loss. And because the
+ # correction is done in each iteration, fewer digits need correcting,
+ # which avoids the performance penalty altogether. Also note that only
+ # three digits out of four are corrected; this works because the most
+ # significant digit is subject to fewer additions.
1312
+
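+ # A minimal scalar sketch of the digit correction described above, added only
+ # for illustration (it is not part of the generated code and is never called):
+ # each redundant base-2^29 digit keeps its low 29 bits and pushes the excess
+ # into the next digit.
+ sub __carry_sketch {
+ my @d = @_; # base-2^29 digits, least significant first
+ for my $k (0 .. $#d - 1) {
+ $d[$k+1] += $d[$k] >> 29; # propagate the excess upward
+ $d[$k] &= 0x1fffffff; # keep the low 29 bits
+ }
+ return @d;
+ }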
1313
+ $TEMP0 = $ACC9;
1314
+ $TEMP3 = $Bi;
1315
+ $TEMP4 = $Yi;
1316
+ $code.=<<___;
1317
+ vpermq \$0, $AND_MASK, $AND_MASK
1318
+ vpaddq (%rsp), $TEMP1, $ACC0
1319
+
1320
+ vpsrlq \$29, $ACC0, $TEMP1
1321
+ vpand $AND_MASK, $ACC0, $ACC0
1322
+ vpsrlq \$29, $ACC1, $TEMP2
1323
+ vpand $AND_MASK, $ACC1, $ACC1
1324
+ vpsrlq \$29, $ACC2, $TEMP3
1325
+ vpermq \$0x93, $TEMP1, $TEMP1
1326
+ vpand $AND_MASK, $ACC2, $ACC2
1327
+ vpsrlq \$29, $ACC3, $TEMP4
1328
+ vpermq \$0x93, $TEMP2, $TEMP2
1329
+ vpand $AND_MASK, $ACC3, $ACC3
1330
+
1331
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
1332
+ vpermq \$0x93, $TEMP3, $TEMP3
1333
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
1334
+ vpermq \$0x93, $TEMP4, $TEMP4
1335
+ vpaddq $TEMP0, $ACC0, $ACC0
1336
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
1337
+ vpaddq $TEMP1, $ACC1, $ACC1
1338
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
1339
+ vpaddq $TEMP2, $ACC2, $ACC2
1340
+ vpblendd \$3, $TEMP4, $ZERO, $TEMP4
1341
+ vpaddq $TEMP3, $ACC3, $ACC3
1342
+ vpaddq $TEMP4, $ACC4, $ACC4
1343
+
1344
+ vpsrlq \$29, $ACC0, $TEMP1
1345
+ vpand $AND_MASK, $ACC0, $ACC0
1346
+ vpsrlq \$29, $ACC1, $TEMP2
1347
+ vpand $AND_MASK, $ACC1, $ACC1
1348
+ vpsrlq \$29, $ACC2, $TEMP3
1349
+ vpermq \$0x93, $TEMP1, $TEMP1
1350
+ vpand $AND_MASK, $ACC2, $ACC2
1351
+ vpsrlq \$29, $ACC3, $TEMP4
1352
+ vpermq \$0x93, $TEMP2, $TEMP2
1353
+ vpand $AND_MASK, $ACC3, $ACC3
1354
+ vpermq \$0x93, $TEMP3, $TEMP3
1355
+
1356
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
1357
+ vpermq \$0x93, $TEMP4, $TEMP4
1358
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
1359
+ vpaddq $TEMP0, $ACC0, $ACC0
1360
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
1361
+ vpaddq $TEMP1, $ACC1, $ACC1
1362
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
1363
+ vpaddq $TEMP2, $ACC2, $ACC2
1364
+ vpblendd \$3, $TEMP4, $ZERO, $TEMP4
1365
+ vpaddq $TEMP3, $ACC3, $ACC3
1366
+ vpaddq $TEMP4, $ACC4, $ACC4
1367
+
1368
+ vmovdqu $ACC0, 0-128($rp)
1369
+ vmovdqu $ACC1, 32-128($rp)
1370
+ vmovdqu $ACC2, 64-128($rp)
1371
+ vmovdqu $ACC3, 96-128($rp)
1372
+ ___
1373
+
1374
+ $TEMP5=$ACC0;
1375
+ $code.=<<___;
1376
+ vpsrlq \$29, $ACC4, $TEMP1
1377
+ vpand $AND_MASK, $ACC4, $ACC4
1378
+ vpsrlq \$29, $ACC5, $TEMP2
1379
+ vpand $AND_MASK, $ACC5, $ACC5
1380
+ vpsrlq \$29, $ACC6, $TEMP3
1381
+ vpermq \$0x93, $TEMP1, $TEMP1
1382
+ vpand $AND_MASK, $ACC6, $ACC6
1383
+ vpsrlq \$29, $ACC7, $TEMP4
1384
+ vpermq \$0x93, $TEMP2, $TEMP2
1385
+ vpand $AND_MASK, $ACC7, $ACC7
1386
+ vpsrlq \$29, $ACC8, $TEMP5
1387
+ vpermq \$0x93, $TEMP3, $TEMP3
1388
+ vpand $AND_MASK, $ACC8, $ACC8
1389
+ vpermq \$0x93, $TEMP4, $TEMP4
1390
+
1391
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
1392
+ vpermq \$0x93, $TEMP5, $TEMP5
1393
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
1394
+ vpaddq $TEMP0, $ACC4, $ACC4
1395
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
1396
+ vpaddq $TEMP1, $ACC5, $ACC5
1397
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
1398
+ vpaddq $TEMP2, $ACC6, $ACC6
1399
+ vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
1400
+ vpaddq $TEMP3, $ACC7, $ACC7
1401
+ vpaddq $TEMP4, $ACC8, $ACC8
1402
+
1403
+ vpsrlq \$29, $ACC4, $TEMP1
1404
+ vpand $AND_MASK, $ACC4, $ACC4
1405
+ vpsrlq \$29, $ACC5, $TEMP2
1406
+ vpand $AND_MASK, $ACC5, $ACC5
1407
+ vpsrlq \$29, $ACC6, $TEMP3
1408
+ vpermq \$0x93, $TEMP1, $TEMP1
1409
+ vpand $AND_MASK, $ACC6, $ACC6
1410
+ vpsrlq \$29, $ACC7, $TEMP4
1411
+ vpermq \$0x93, $TEMP2, $TEMP2
1412
+ vpand $AND_MASK, $ACC7, $ACC7
1413
+ vpsrlq \$29, $ACC8, $TEMP5
1414
+ vpermq \$0x93, $TEMP3, $TEMP3
1415
+ vpand $AND_MASK, $ACC8, $ACC8
1416
+ vpermq \$0x93, $TEMP4, $TEMP4
1417
+
1418
+ vpblendd \$3, $ZERO, $TEMP1, $TEMP0
1419
+ vpermq \$0x93, $TEMP5, $TEMP5
1420
+ vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
1421
+ vpaddq $TEMP0, $ACC4, $ACC4
1422
+ vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
1423
+ vpaddq $TEMP1, $ACC5, $ACC5
1424
+ vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
1425
+ vpaddq $TEMP2, $ACC6, $ACC6
1426
+ vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
1427
+ vpaddq $TEMP3, $ACC7, $ACC7
1428
+ vpaddq $TEMP4, $ACC8, $ACC8
1429
+
1430
+ vmovdqu $ACC4, 128-128($rp)
1431
+ vmovdqu $ACC5, 160-128($rp)
1432
+ vmovdqu $ACC6, 192-128($rp)
1433
+ vmovdqu $ACC7, 224-128($rp)
1434
+ vmovdqu $ACC8, 256-128($rp)
1435
+ vzeroupper
1436
+
1437
+ mov %rbp, %rax
1438
+ ___
1439
+ $code.=<<___ if ($win64);
1440
+ movaps -0xd8(%rax),%xmm6
1441
+ movaps -0xc8(%rax),%xmm7
1442
+ movaps -0xb8(%rax),%xmm8
1443
+ movaps -0xa8(%rax),%xmm9
1444
+ movaps -0x98(%rax),%xmm10
1445
+ movaps -0x88(%rax),%xmm11
1446
+ movaps -0x78(%rax),%xmm12
1447
+ movaps -0x68(%rax),%xmm13
1448
+ movaps -0x58(%rax),%xmm14
1449
+ movaps -0x48(%rax),%xmm15
1450
+ ___
1451
+ $code.=<<___;
1452
+ mov -48(%rax),%r15
1453
+ mov -40(%rax),%r14
1454
+ mov -32(%rax),%r13
1455
+ mov -24(%rax),%r12
1456
+ mov -16(%rax),%rbp
1457
+ mov -8(%rax),%rbx
1458
+ lea (%rax),%rsp # restore %rsp
1459
+ .Lmul_1024_epilogue:
1460
+ ret
1461
+ .size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
1462
+ ___
1463
+ }
1464
+ {
1465
+ my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi");
1466
+ my @T = map("%r$_",(8..11));
1467
+
1468
+ $code.=<<___;
1469
+ .globl rsaz_1024_red2norm_avx2
1470
+ .type rsaz_1024_red2norm_avx2,\@abi-omnipotent
1471
+ .align 32
1472
+ rsaz_1024_red2norm_avx2:
1473
+ sub \$-128,$inp # size optimization
1474
+ xor %rax,%rax
1475
+ ___
1476
+
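+ # Note added for clarity (a reading of the loop below, not upstream text):
+ # the generated code repacks the redundant base-2^29 digits into 16 64-bit
+ # words. For word 0, for example, it accumulates d0 + (d1<<29) + (d2<<58)
+ # with add/adc and carries d2>>6, plus any carry bit, into word 1.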
1477
+ for ($j=0,$i=0; $i<16; $i++) {
1478
+ my $k=0;
1479
+ while (29*$j<64*($i+1)) { # load data till boundary
1480
+ $code.=" mov `8*$j-128`($inp), @T[0]\n";
1481
+ $j++; $k++; push(@T,shift(@T));
1482
+ }
1483
+ $l=$k;
1484
+ while ($k>1) { # shift all loaded values except the last
1485
+ $code.=" shl \$`29*($j-$k)`,@T[-$k]\n";
1486
+ $k--;
1487
+ }
1488
+ $code.=<<___; # shift last value
1489
+ mov @T[-1], @T[0]
1490
+ shl \$`29*($j-1)`, @T[-1]
1491
+ shr \$`-29*($j-1)`, @T[0]
1492
+ ___
1493
+ while ($l) { # accumulate all values
1494
+ $code.=" add @T[-$l], %rax\n";
1495
+ $l--;
1496
+ }
1497
+ $code.=<<___;
1498
+ adc \$0, @T[0] # absorb any carry
1499
+ mov %rax, 8*$i($out)
1500
+ mov @T[0], %rax
1501
+ ___
1502
+ push(@T,shift(@T));
1503
+ }
1504
+ $code.=<<___;
1505
+ ret
1506
+ .size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
1507
+
1508
+ .globl rsaz_1024_norm2red_avx2
1509
+ .type rsaz_1024_norm2red_avx2,\@abi-omnipotent
1510
+ .align 32
1511
+ rsaz_1024_norm2red_avx2:
1512
+ sub \$-128,$out # size optimization
1513
+ mov ($inp),@T[0]
1514
+ mov \$0x1fffffff,%eax
1515
+ ___
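+ # Note added for clarity (a reading of the loop below, not upstream text):
+ # the generated code splits 16 64-bit words into 36 base-2^29 digits by
+ # masking with 0x1fffffff and using shrd to pull in the bits that straddle
+ # a 64-bit word boundary; the final four stores clear the padding slots
+ # above the last digit.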
1516
+ for ($j=0,$i=0; $i<16; $i++) {
1517
+ $code.=" mov `8*($i+1)`($inp),@T[1]\n" if ($i<15);
1518
+ $code.=" xor @T[1],@T[1]\n" if ($i==15);
1519
+ my $k=1;
1520
+ while (29*($j+1)<64*($i+1)) {
1521
+ $code.=<<___;
1522
+ mov @T[0],@T[-$k]
1523
+ shr \$`29*$j`,@T[-$k]
1524
+ and %rax,@T[-$k] # &0x1fffffff
1525
+ mov @T[-$k],`8*$j-128`($out)
1526
+ ___
1527
+ $j++; $k++;
1528
+ }
1529
+ $code.=<<___;
1530
+ shrd \$`29*$j`,@T[1],@T[0]
1531
+ and %rax,@T[0]
1532
+ mov @T[0],`8*$j-128`($out)
1533
+ ___
1534
+ $j++;
1535
+ push(@T,shift(@T));
1536
+ }
1537
+ $code.=<<___;
1538
+ mov @T[0],`8*$j-128`($out) # zero
1539
+ mov @T[0],`8*($j+1)-128`($out)
1540
+ mov @T[0],`8*($j+2)-128`($out)
1541
+ mov @T[0],`8*($j+3)-128`($out)
1542
+ ret
1543
+ .size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
1544
+ ___
1545
+ }
1546
+ {
1547
+ my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
1548
+
1549
+ $code.=<<___;
1550
+ .globl rsaz_1024_scatter5_avx2
1551
+ .type rsaz_1024_scatter5_avx2,\@abi-omnipotent
1552
+ .align 32
1553
+ rsaz_1024_scatter5_avx2:
1554
+ vzeroupper
1555
+ vmovdqu .Lscatter_permd(%rip),%ymm5
1556
+ shl \$4,$power
1557
+ lea ($out,$power),$out
1558
+ mov \$9,%eax
1559
+ jmp .Loop_scatter_1024
1560
+
1561
+ .align 32
1562
+ .Loop_scatter_1024:
1563
+ vmovdqu ($inp),%ymm0
1564
+ lea 32($inp),$inp
1565
+ vpermd %ymm0,%ymm5,%ymm0
1566
+ vmovdqu %xmm0,($out)
1567
+ lea 16*32($out),$out
1568
+ dec %eax
1569
+ jnz .Loop_scatter_1024
1570
+
1571
+ vzeroupper
1572
+ ret
1573
+ .size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
1574
+
1575
+ .globl rsaz_1024_gather5_avx2
1576
+ .type rsaz_1024_gather5_avx2,\@abi-omnipotent
1577
+ .align 32
1578
+ rsaz_1024_gather5_avx2:
1579
+ ___
1580
+ $code.=<<___ if ($win64);
1581
+ lea -0x88(%rsp),%rax
1582
+ vzeroupper
1583
+ .LSEH_begin_rsaz_1024_gather5:
1584
+ # I can't trust assembler to use specific encoding:-(
1585
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
1586
+ .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6,-0x20(%rax)
1587
+ .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7,-0x10(%rax)
1588
+ .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8,0(%rax)
1589
+ .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9,0x10(%rax)
1590
+ .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10,0x20(%rax)
1591
+ .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11,0x30(%rax)
1592
+ .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12,0x40(%rax)
1593
+ .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13,0x50(%rax)
1594
+ .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14,0x60(%rax)
1595
+ .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15,0x70(%rax)
1596
+ ___
1597
+ $code.=<<___;
1598
+ lea .Lgather_table(%rip),%r11
1599
+ mov $power,%eax
1600
+ and \$3,$power
1601
+ shr \$2,%eax # cache line number
1602
+ shl \$4,$power # offset within cache line
1603
+
1604
+ vmovdqu -32(%r11),%ymm7 # .Lgather_permd
1605
+ vpbroadcastb 8(%r11,%rax), %xmm8
1606
+ vpbroadcastb 7(%r11,%rax), %xmm9
1607
+ vpbroadcastb 6(%r11,%rax), %xmm10
1608
+ vpbroadcastb 5(%r11,%rax), %xmm11
1609
+ vpbroadcastb 4(%r11,%rax), %xmm12
1610
+ vpbroadcastb 3(%r11,%rax), %xmm13
1611
+ vpbroadcastb 2(%r11,%rax), %xmm14
1612
+ vpbroadcastb 1(%r11,%rax), %xmm15
1613
+
1614
+ lea 64($inp,$power),$inp
1615
+ mov \$64,%r11 # size optimization
1616
+ mov \$9,%eax
1617
+ jmp .Loop_gather_1024
1618
+
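+ # Comment added for clarity (inferred from the code): the eight broadcast
+ # byte masks in %xmm8-%xmm15 select one of the eight cache-line-strided
+ # copies of each table word, and the vpand/vpor tree keeps only the selected
+ # copy, so all eight cache lines are read regardless of the requested index.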
1619
+ .align 32
1620
+ .Loop_gather_1024:
1621
+ vpand -64($inp), %xmm8,%xmm0
1622
+ vpand ($inp), %xmm9,%xmm1
1623
+ vpand 64($inp), %xmm10,%xmm2
1624
+ vpand ($inp,%r11,2), %xmm11,%xmm3
1625
+ vpor %xmm0,%xmm1,%xmm1
1626
+ vpand 64($inp,%r11,2), %xmm12,%xmm4
1627
+ vpor %xmm2,%xmm3,%xmm3
1628
+ vpand ($inp,%r11,4), %xmm13,%xmm5
1629
+ vpor %xmm1,%xmm3,%xmm3
1630
+ vpand 64($inp,%r11,4), %xmm14,%xmm6
1631
+ vpor %xmm4,%xmm5,%xmm5
1632
+ vpand -128($inp,%r11,8), %xmm15,%xmm2
1633
+ lea ($inp,%r11,8),$inp
1634
+ vpor %xmm3,%xmm5,%xmm5
1635
+ vpor %xmm2,%xmm6,%xmm6
1636
+ vpor %xmm5,%xmm6,%xmm6
1637
+ vpermd %ymm6,%ymm7,%ymm6
1638
+ vmovdqu %ymm6,($out)
1639
+ lea 32($out),$out
1640
+ dec %eax
1641
+ jnz .Loop_gather_1024
1642
+
1643
+ vpxor %ymm0,%ymm0,%ymm0
1644
+ vmovdqu %ymm0,($out)
1645
+ vzeroupper
1646
+ ___
1647
+ $code.=<<___ if ($win64);
1648
+ movaps (%rsp),%xmm6
1649
+ movaps 0x10(%rsp),%xmm7
1650
+ movaps 0x20(%rsp),%xmm8
1651
+ movaps 0x30(%rsp),%xmm9
1652
+ movaps 0x40(%rsp),%xmm10
1653
+ movaps 0x50(%rsp),%xmm11
1654
+ movaps 0x60(%rsp),%xmm12
1655
+ movaps 0x70(%rsp),%xmm13
1656
+ movaps 0x80(%rsp),%xmm14
1657
+ movaps 0x90(%rsp),%xmm15
1658
+ lea 0xa8(%rsp),%rsp
1659
+ .LSEH_end_rsaz_1024_gather5:
1660
+ ___
1661
+ $code.=<<___;
1662
+ ret
1663
+ .size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
1664
+ ___
1665
+ }
1666
+
1667
+ $code.=<<___;
1668
+ .extern OPENSSL_ia32cap_P
1669
+ .globl rsaz_avx2_eligible
1670
+ .type rsaz_avx2_eligible,\@abi-omnipotent
1671
+ .align 32
1672
+ rsaz_avx2_eligible:
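+ # Note added for clarity (inferred): returns 1 when the AVX2 capability bit
+ # is set; when the ADX-capable path is assembled in, CPUs with both BMI2 and
+ # ADX report 0 here so that the MULX/ADX code is used instead.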
1673
+ mov OPENSSL_ia32cap_P+8(%rip),%eax
1674
+ ___
1675
+ $code.=<<___ if ($addx);
1676
+ mov \$`1<<8|1<<19`,%ecx
1677
+ mov \$0,%edx
1678
+ and %eax,%ecx
1679
+ cmp \$`1<<8|1<<19`,%ecx # check for BMI2+AD*X
1680
+ cmove %edx,%eax
1681
+ ___
1682
+ $code.=<<___;
1683
+ and \$`1<<5`,%eax
1684
+ shr \$5,%eax
1685
+ ret
1686
+ .size rsaz_avx2_eligible,.-rsaz_avx2_eligible
1687
+
1688
+ .align 64
1689
+ .Land_mask:
1690
+ .quad 0x1fffffff,0x1fffffff,0x1fffffff,-1
1691
+ .Lscatter_permd:
1692
+ .long 0,2,4,6,7,7,7,7
1693
+ .Lgather_permd:
1694
+ .long 0,7,1,7,2,7,3,7
1695
+ .Lgather_table:
1696
+ .byte 0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0
1697
+ .align 64
1698
+ ___
1699
+
1700
+ if ($win64) {
1701
+ $rec="%rcx";
1702
+ $frame="%rdx";
1703
+ $context="%r8";
1704
+ $disp="%r9";
1705
+
1706
+ $code.=<<___
1707
+ .extern __imp_RtlVirtualUnwind
1708
+ .type rsaz_se_handler,\@abi-omnipotent
1709
+ .align 16
1710
+ rsaz_se_handler:
1711
+ push %rsi
1712
+ push %rdi
1713
+ push %rbx
1714
+ push %rbp
1715
+ push %r12
1716
+ push %r13
1717
+ push %r14
1718
+ push %r15
1719
+ pushfq
1720
+ sub \$64,%rsp
1721
+
1722
+ mov 120($context),%rax # pull context->Rax
1723
+ mov 248($context),%rbx # pull context->Rip
1724
+
1725
+ mov 8($disp),%rsi # disp->ImageBase
1726
+ mov 56($disp),%r11 # disp->HandlerData
1727
+
1728
+ mov 0(%r11),%r10d # HandlerData[0]
1729
+ lea (%rsi,%r10),%r10 # prologue label
1730
+ cmp %r10,%rbx # context->Rip<prologue label
1731
+ jb .Lcommon_seh_tail
1732
+
1733
+ mov 152($context),%rax # pull context->Rsp
1734
+
1735
+ mov 4(%r11),%r10d # HandlerData[1]
1736
+ lea (%rsi,%r10),%r10 # epilogue label
1737
+ cmp %r10,%rbx # context->Rip>=epilogue label
1738
+ jae .Lcommon_seh_tail
1739
+
1740
+ mov 160($context),%rax # pull context->Rbp
1741
+
1742
+ mov -48(%rax),%r15
1743
+ mov -40(%rax),%r14
1744
+ mov -32(%rax),%r13
1745
+ mov -24(%rax),%r12
1746
+ mov -16(%rax),%rbp
1747
+ mov -8(%rax),%rbx
1748
+ mov %r15,240($context)
1749
+ mov %r14,232($context)
1750
+ mov %r13,224($context)
1751
+ mov %r12,216($context)
1752
+ mov %rbp,160($context)
1753
+ mov %rbx,144($context)
1754
+
1755
+ lea -0xd8(%rax),%rsi # %xmm save area
1756
+ lea 512($context),%rdi # & context.Xmm6
1757
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
1758
+ .long 0xa548f3fc # cld; rep movsq
1759
+
1760
+ .Lcommon_seh_tail:
1761
+ mov 8(%rax),%rdi
1762
+ mov 16(%rax),%rsi
1763
+ mov %rax,152($context) # restore context->Rsp
1764
+ mov %rsi,168($context) # restore context->Rsi
1765
+ mov %rdi,176($context) # restore context->Rdi
1766
+
1767
+ mov 40($disp),%rdi # disp->ContextRecord
1768
+ mov $context,%rsi # context
1769
+ mov \$154,%ecx # sizeof(CONTEXT)
1770
+ .long 0xa548f3fc # cld; rep movsq
1771
+
1772
+ mov $disp,%rsi
1773
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1774
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
1775
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
1776
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1777
+ mov 40(%rsi),%r10 # disp->ContextRecord
1778
+ lea 56(%rsi),%r11 # &disp->HandlerData
1779
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
1780
+ mov %r10,32(%rsp) # arg5
1781
+ mov %r11,40(%rsp) # arg6
1782
+ mov %r12,48(%rsp) # arg7
1783
+ mov %rcx,56(%rsp) # arg8, (NULL)
1784
+ call *__imp_RtlVirtualUnwind(%rip)
1785
+
1786
+ mov \$1,%eax # ExceptionContinueSearch
1787
+ add \$64,%rsp
1788
+ popfq
1789
+ pop %r15
1790
+ pop %r14
1791
+ pop %r13
1792
+ pop %r12
1793
+ pop %rbp
1794
+ pop %rbx
1795
+ pop %rdi
1796
+ pop %rsi
1797
+ ret
1798
+ .size rsaz_se_handler,.-rsaz_se_handler
1799
+
1800
+ .section .pdata
1801
+ .align 4
1802
+ .rva .LSEH_begin_rsaz_1024_sqr_avx2
1803
+ .rva .LSEH_end_rsaz_1024_sqr_avx2
1804
+ .rva .LSEH_info_rsaz_1024_sqr_avx2
1805
+
1806
+ .rva .LSEH_begin_rsaz_1024_mul_avx2
1807
+ .rva .LSEH_end_rsaz_1024_mul_avx2
1808
+ .rva .LSEH_info_rsaz_1024_mul_avx2
1809
+
1810
+ .rva .LSEH_begin_rsaz_1024_gather5
1811
+ .rva .LSEH_end_rsaz_1024_gather5
1812
+ .rva .LSEH_info_rsaz_1024_gather5
1813
+ .section .xdata
1814
+ .align 8
1815
+ .LSEH_info_rsaz_1024_sqr_avx2:
1816
+ .byte 9,0,0,0
1817
+ .rva rsaz_se_handler
1818
+ .rva .Lsqr_1024_body,.Lsqr_1024_epilogue
1819
+ .LSEH_info_rsaz_1024_mul_avx2:
1820
+ .byte 9,0,0,0
1821
+ .rva rsaz_se_handler
1822
+ .rva .Lmul_1024_body,.Lmul_1024_epilogue
1823
+ .LSEH_info_rsaz_1024_gather5:
1824
+ .byte 0x01,0x33,0x16,0x00
1825
+ .byte 0x36,0xf8,0x09,0x00 #vmovaps 0x90(rsp),xmm15
1826
+ .byte 0x31,0xe8,0x08,0x00 #vmovaps 0x80(rsp),xmm14
1827
+ .byte 0x2c,0xd8,0x07,0x00 #vmovaps 0x70(rsp),xmm13
1828
+ .byte 0x27,0xc8,0x06,0x00 #vmovaps 0x60(rsp),xmm12
1829
+ .byte 0x22,0xb8,0x05,0x00 #vmovaps 0x50(rsp),xmm11
1830
+ .byte 0x1d,0xa8,0x04,0x00 #vmovaps 0x40(rsp),xmm10
1831
+ .byte 0x18,0x98,0x03,0x00 #vmovaps 0x30(rsp),xmm9
1832
+ .byte 0x13,0x88,0x02,0x00 #vmovaps 0x20(rsp),xmm8
1833
+ .byte 0x0e,0x78,0x01,0x00 #vmovaps 0x10(rsp),xmm7
1834
+ .byte 0x09,0x68,0x00,0x00 #vmovaps 0x00(rsp),xmm6
1835
+ .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
1836
+ ___
1837
+ }
1838
+
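+ # Note added for clarity: the loop below post-processes the generated text,
+ # evaluating the backtick-quoted expressions, reducing shift counts modulo
+ # 64, and rewriting instructions such as vmovd/vmovq, vpinsr/vpextr and
+ # vpbroadcast to take %xmm rather than %ymm register names.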
1839
+ foreach (split("\n",$code)) {
1840
+ s/\`([^\`]*)\`/eval($1)/ge;
1841
+
1842
+ s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge or
1843
+
1844
+ s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
1845
+ s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or
1846
+ s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
1847
+ s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
1848
+ s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
1849
+ print $_,"\n";
1850
+ }
1851
+
1852
+ }}} else {{{
1853
+ print <<___; # assembler is too old
1854
+ .text
1855
+
1856
+ .globl rsaz_avx2_eligible
1857
+ .type rsaz_avx2_eligible,\@abi-omnipotent
1858
+ rsaz_avx2_eligible:
1859
+ xor %eax,%eax
1860
+ ret
1861
+ .size rsaz_avx2_eligible,.-rsaz_avx2_eligible
1862
+
1863
+ .globl rsaz_1024_sqr_avx2
1864
+ .globl rsaz_1024_mul_avx2
1865
+ .globl rsaz_1024_norm2red_avx2
1866
+ .globl rsaz_1024_red2norm_avx2
1867
+ .globl rsaz_1024_scatter5_avx2
1868
+ .globl rsaz_1024_gather5_avx2
1869
+ .type rsaz_1024_sqr_avx2,\@abi-omnipotent
1870
+ rsaz_1024_sqr_avx2:
1871
+ rsaz_1024_mul_avx2:
1872
+ rsaz_1024_norm2red_avx2:
1873
+ rsaz_1024_red2norm_avx2:
1874
+ rsaz_1024_scatter5_avx2:
1875
+ rsaz_1024_gather5_avx2:
1876
+ .byte 0x0f,0x0b # ud2
1877
+ ret
1878
+ .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
1879
+ ___
1880
+ }}}
1881
+
1882
+ close STDOUT;