ring-native 0.0.0 → 0.1.0

Files changed (267)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/CHANGES.md +7 -0
  4. data/Makefile +5 -0
  5. data/README.md +12 -5
  6. data/Rakefile +4 -0
  7. data/ext/ring/extconf.rb +4 -5
  8. data/lib/ring/native.rb +3 -1
  9. data/lib/ring/native/version.rb +5 -1
  10. data/ring-native.gemspec +6 -6
  11. data/vendor/ring-ffi/Cargo.lock +26 -0
  12. data/vendor/ring-ffi/Cargo.toml +45 -0
  13. data/vendor/ring-ffi/LICENSE +16 -0
  14. data/vendor/ring-ffi/README.md +59 -0
  15. data/vendor/ring-ffi/src/lib.rs +79 -0
  16. metadata +10 -255
  17. data/vendor/ring/BUILDING.md +0 -40
  18. data/vendor/ring/Cargo.toml +0 -43
  19. data/vendor/ring/LICENSE +0 -185
  20. data/vendor/ring/Makefile +0 -35
  21. data/vendor/ring/PORTING.md +0 -163
  22. data/vendor/ring/README.md +0 -113
  23. data/vendor/ring/STYLE.md +0 -197
  24. data/vendor/ring/appveyor.yml +0 -27
  25. data/vendor/ring/build.rs +0 -108
  26. data/vendor/ring/crypto/aes/aes.c +0 -1142
  27. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +0 -25
  28. data/vendor/ring/crypto/aes/aes_test.cc +0 -93
  29. data/vendor/ring/crypto/aes/asm/aes-586.pl +0 -2368
  30. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +0 -1249
  31. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +0 -2246
  32. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +0 -1318
  33. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +0 -2084
  34. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +0 -675
  35. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +0 -1364
  36. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +0 -1565
  37. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +0 -841
  38. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +0 -1116
  39. data/vendor/ring/crypto/aes/internal.h +0 -87
  40. data/vendor/ring/crypto/aes/mode_wrappers.c +0 -61
  41. data/vendor/ring/crypto/bn/add.c +0 -394
  42. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +0 -694
  43. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +0 -1503
  44. data/vendor/ring/crypto/bn/asm/bn-586.pl +0 -774
  45. data/vendor/ring/crypto/bn/asm/co-586.pl +0 -287
  46. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +0 -1882
  47. data/vendor/ring/crypto/bn/asm/x86-mont.pl +0 -592
  48. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +0 -599
  49. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +0 -1393
  50. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +0 -3507
  51. data/vendor/ring/crypto/bn/bn.c +0 -352
  52. data/vendor/ring/crypto/bn/bn_asn1.c +0 -74
  53. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +0 -25
  54. data/vendor/ring/crypto/bn/bn_test.cc +0 -1696
  55. data/vendor/ring/crypto/bn/cmp.c +0 -200
  56. data/vendor/ring/crypto/bn/convert.c +0 -433
  57. data/vendor/ring/crypto/bn/ctx.c +0 -311
  58. data/vendor/ring/crypto/bn/div.c +0 -594
  59. data/vendor/ring/crypto/bn/exponentiation.c +0 -1335
  60. data/vendor/ring/crypto/bn/gcd.c +0 -711
  61. data/vendor/ring/crypto/bn/generic.c +0 -1019
  62. data/vendor/ring/crypto/bn/internal.h +0 -316
  63. data/vendor/ring/crypto/bn/montgomery.c +0 -516
  64. data/vendor/ring/crypto/bn/mul.c +0 -888
  65. data/vendor/ring/crypto/bn/prime.c +0 -829
  66. data/vendor/ring/crypto/bn/random.c +0 -334
  67. data/vendor/ring/crypto/bn/rsaz_exp.c +0 -262
  68. data/vendor/ring/crypto/bn/rsaz_exp.h +0 -53
  69. data/vendor/ring/crypto/bn/shift.c +0 -276
  70. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +0 -25
  71. data/vendor/ring/crypto/bytestring/bytestring_test.cc +0 -421
  72. data/vendor/ring/crypto/bytestring/cbb.c +0 -399
  73. data/vendor/ring/crypto/bytestring/cbs.c +0 -227
  74. data/vendor/ring/crypto/bytestring/internal.h +0 -46
  75. data/vendor/ring/crypto/chacha/chacha_generic.c +0 -140
  76. data/vendor/ring/crypto/chacha/chacha_vec.c +0 -323
  77. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +0 -1447
  78. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +0 -153
  79. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +0 -25
  80. data/vendor/ring/crypto/cipher/e_aes.c +0 -390
  81. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +0 -208
  82. data/vendor/ring/crypto/cipher/internal.h +0 -173
  83. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +0 -543
  84. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +0 -9
  85. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +0 -475
  86. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +0 -23
  87. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +0 -422
  88. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +0 -484
  89. data/vendor/ring/crypto/cipher/test/cipher_test.txt +0 -100
  90. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +0 -25
  91. data/vendor/ring/crypto/constant_time_test.c +0 -304
  92. data/vendor/ring/crypto/cpu-arm-asm.S +0 -32
  93. data/vendor/ring/crypto/cpu-arm.c +0 -199
  94. data/vendor/ring/crypto/cpu-intel.c +0 -261
  95. data/vendor/ring/crypto/crypto.c +0 -151
  96. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +0 -2118
  97. data/vendor/ring/crypto/curve25519/curve25519.c +0 -4888
  98. data/vendor/ring/crypto/curve25519/x25519_test.cc +0 -128
  99. data/vendor/ring/crypto/digest/md32_common.h +0 -181
  100. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +0 -2725
  101. data/vendor/ring/crypto/ec/ec.c +0 -193
  102. data/vendor/ring/crypto/ec/ec_curves.c +0 -61
  103. data/vendor/ring/crypto/ec/ec_key.c +0 -228
  104. data/vendor/ring/crypto/ec/ec_montgomery.c +0 -114
  105. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +0 -25
  106. data/vendor/ring/crypto/ec/internal.h +0 -243
  107. data/vendor/ring/crypto/ec/oct.c +0 -253
  108. data/vendor/ring/crypto/ec/p256-64.c +0 -1794
  109. data/vendor/ring/crypto/ec/p256-x86_64-table.h +0 -9548
  110. data/vendor/ring/crypto/ec/p256-x86_64.c +0 -509
  111. data/vendor/ring/crypto/ec/simple.c +0 -1007
  112. data/vendor/ring/crypto/ec/util-64.c +0 -183
  113. data/vendor/ring/crypto/ec/wnaf.c +0 -508
  114. data/vendor/ring/crypto/ecdh/ecdh.c +0 -155
  115. data/vendor/ring/crypto/ecdsa/ecdsa.c +0 -304
  116. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +0 -193
  117. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +0 -25
  118. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +0 -327
  119. data/vendor/ring/crypto/header_removed.h +0 -17
  120. data/vendor/ring/crypto/internal.h +0 -495
  121. data/vendor/ring/crypto/libring.Windows.vcxproj +0 -101
  122. data/vendor/ring/crypto/mem.c +0 -98
  123. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +0 -1045
  124. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +0 -517
  125. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +0 -1393
  126. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +0 -1741
  127. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +0 -422
  128. data/vendor/ring/crypto/modes/ctr.c +0 -226
  129. data/vendor/ring/crypto/modes/gcm.c +0 -1206
  130. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +0 -25
  131. data/vendor/ring/crypto/modes/gcm_test.c +0 -348
  132. data/vendor/ring/crypto/modes/internal.h +0 -299
  133. data/vendor/ring/crypto/perlasm/arm-xlate.pl +0 -170
  134. data/vendor/ring/crypto/perlasm/readme +0 -100
  135. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +0 -1164
  136. data/vendor/ring/crypto/perlasm/x86asm.pl +0 -292
  137. data/vendor/ring/crypto/perlasm/x86gas.pl +0 -263
  138. data/vendor/ring/crypto/perlasm/x86masm.pl +0 -200
  139. data/vendor/ring/crypto/perlasm/x86nasm.pl +0 -187
  140. data/vendor/ring/crypto/poly1305/poly1305.c +0 -331
  141. data/vendor/ring/crypto/poly1305/poly1305_arm.c +0 -301
  142. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +0 -2015
  143. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +0 -25
  144. data/vendor/ring/crypto/poly1305/poly1305_test.cc +0 -80
  145. data/vendor/ring/crypto/poly1305/poly1305_test.txt +0 -52
  146. data/vendor/ring/crypto/poly1305/poly1305_vec.c +0 -892
  147. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +0 -75
  148. data/vendor/ring/crypto/rand/internal.h +0 -32
  149. data/vendor/ring/crypto/rand/rand.c +0 -189
  150. data/vendor/ring/crypto/rand/urandom.c +0 -219
  151. data/vendor/ring/crypto/rand/windows.c +0 -56
  152. data/vendor/ring/crypto/refcount_c11.c +0 -66
  153. data/vendor/ring/crypto/refcount_lock.c +0 -53
  154. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +0 -25
  155. data/vendor/ring/crypto/refcount_test.c +0 -58
  156. data/vendor/ring/crypto/rsa/blinding.c +0 -462
  157. data/vendor/ring/crypto/rsa/internal.h +0 -108
  158. data/vendor/ring/crypto/rsa/padding.c +0 -300
  159. data/vendor/ring/crypto/rsa/rsa.c +0 -450
  160. data/vendor/ring/crypto/rsa/rsa_asn1.c +0 -261
  161. data/vendor/ring/crypto/rsa/rsa_impl.c +0 -944
  162. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +0 -25
  163. data/vendor/ring/crypto/rsa/rsa_test.cc +0 -437
  164. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +0 -436
  165. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +0 -2390
  166. data/vendor/ring/crypto/sha/asm/sha256-586.pl +0 -1275
  167. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +0 -735
  168. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +0 -14
  169. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +0 -14
  170. data/vendor/ring/crypto/sha/asm/sha512-586.pl +0 -911
  171. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +0 -666
  172. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +0 -14
  173. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +0 -14
  174. data/vendor/ring/crypto/sha/sha1.c +0 -271
  175. data/vendor/ring/crypto/sha/sha256.c +0 -204
  176. data/vendor/ring/crypto/sha/sha512.c +0 -355
  177. data/vendor/ring/crypto/test/file_test.cc +0 -326
  178. data/vendor/ring/crypto/test/file_test.h +0 -181
  179. data/vendor/ring/crypto/test/malloc.cc +0 -150
  180. data/vendor/ring/crypto/test/scoped_types.h +0 -95
  181. data/vendor/ring/crypto/test/test.Windows.vcxproj +0 -35
  182. data/vendor/ring/crypto/test/test_util.cc +0 -46
  183. data/vendor/ring/crypto/test/test_util.h +0 -41
  184. data/vendor/ring/crypto/thread_none.c +0 -55
  185. data/vendor/ring/crypto/thread_pthread.c +0 -165
  186. data/vendor/ring/crypto/thread_test.Windows.vcxproj +0 -25
  187. data/vendor/ring/crypto/thread_test.c +0 -200
  188. data/vendor/ring/crypto/thread_win.c +0 -282
  189. data/vendor/ring/examples/checkdigest.rs +0 -103
  190. data/vendor/ring/include/openssl/aes.h +0 -121
  191. data/vendor/ring/include/openssl/arm_arch.h +0 -129
  192. data/vendor/ring/include/openssl/base.h +0 -156
  193. data/vendor/ring/include/openssl/bn.h +0 -794
  194. data/vendor/ring/include/openssl/buffer.h +0 -18
  195. data/vendor/ring/include/openssl/bytestring.h +0 -235
  196. data/vendor/ring/include/openssl/chacha.h +0 -37
  197. data/vendor/ring/include/openssl/cmac.h +0 -76
  198. data/vendor/ring/include/openssl/cpu.h +0 -184
  199. data/vendor/ring/include/openssl/crypto.h +0 -43
  200. data/vendor/ring/include/openssl/curve25519.h +0 -88
  201. data/vendor/ring/include/openssl/ec.h +0 -225
  202. data/vendor/ring/include/openssl/ec_key.h +0 -129
  203. data/vendor/ring/include/openssl/ecdh.h +0 -110
  204. data/vendor/ring/include/openssl/ecdsa.h +0 -156
  205. data/vendor/ring/include/openssl/err.h +0 -201
  206. data/vendor/ring/include/openssl/mem.h +0 -101
  207. data/vendor/ring/include/openssl/obj_mac.h +0 -71
  208. data/vendor/ring/include/openssl/opensslfeatures.h +0 -68
  209. data/vendor/ring/include/openssl/opensslv.h +0 -18
  210. data/vendor/ring/include/openssl/ossl_typ.h +0 -18
  211. data/vendor/ring/include/openssl/poly1305.h +0 -51
  212. data/vendor/ring/include/openssl/rand.h +0 -70
  213. data/vendor/ring/include/openssl/rsa.h +0 -399
  214. data/vendor/ring/include/openssl/thread.h +0 -133
  215. data/vendor/ring/include/openssl/type_check.h +0 -71
  216. data/vendor/ring/mk/Common.props +0 -63
  217. data/vendor/ring/mk/Windows.props +0 -42
  218. data/vendor/ring/mk/WindowsTest.props +0 -18
  219. data/vendor/ring/mk/appveyor.bat +0 -62
  220. data/vendor/ring/mk/bottom_of_makefile.mk +0 -54
  221. data/vendor/ring/mk/ring.mk +0 -266
  222. data/vendor/ring/mk/top_of_makefile.mk +0 -214
  223. data/vendor/ring/mk/travis.sh +0 -40
  224. data/vendor/ring/mk/update-travis-yml.py +0 -229
  225. data/vendor/ring/ring.sln +0 -153
  226. data/vendor/ring/src/aead.rs +0 -682
  227. data/vendor/ring/src/agreement.rs +0 -248
  228. data/vendor/ring/src/c.rs +0 -129
  229. data/vendor/ring/src/constant_time.rs +0 -37
  230. data/vendor/ring/src/der.rs +0 -96
  231. data/vendor/ring/src/digest.rs +0 -690
  232. data/vendor/ring/src/digest_tests.txt +0 -57
  233. data/vendor/ring/src/ecc.rs +0 -28
  234. data/vendor/ring/src/ecc_build.rs +0 -279
  235. data/vendor/ring/src/ecc_curves.rs +0 -117
  236. data/vendor/ring/src/ed25519_tests.txt +0 -2579
  237. data/vendor/ring/src/exe_tests.rs +0 -46
  238. data/vendor/ring/src/ffi.rs +0 -29
  239. data/vendor/ring/src/file_test.rs +0 -187
  240. data/vendor/ring/src/hkdf.rs +0 -153
  241. data/vendor/ring/src/hkdf_tests.txt +0 -59
  242. data/vendor/ring/src/hmac.rs +0 -414
  243. data/vendor/ring/src/hmac_tests.txt +0 -97
  244. data/vendor/ring/src/input.rs +0 -312
  245. data/vendor/ring/src/lib.rs +0 -41
  246. data/vendor/ring/src/pbkdf2.rs +0 -265
  247. data/vendor/ring/src/pbkdf2_tests.txt +0 -113
  248. data/vendor/ring/src/polyfill.rs +0 -57
  249. data/vendor/ring/src/rand.rs +0 -28
  250. data/vendor/ring/src/signature.rs +0 -314
  251. data/vendor/ring/third-party/NIST/README.md +0 -9
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +0 -263
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +0 -309
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +0 -267
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +0 -263
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +0 -309
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +0 -267
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +0 -263
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +0 -309
  260. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +0 -267
  261. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +0 -519
  262. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +0 -309
  263. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +0 -523
  264. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +0 -519
  265. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +0 -309
  266. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +0 -523
  267. data/vendor/ring/third-party/NIST/sha256sums.txt +0 -1
data/vendor/ring/crypto/bn/asm/co-586.pl
@@ -1,287 +0,0 @@
1
- #!/usr/local/bin/perl
2
-
3
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
4
- push(@INC,"${dir}","${dir}../../perlasm");
5
- require "x86asm.pl";
6
-
7
- &asm_init($ARGV[0],$0);
8
-
9
- &bn_mul_comba("bn_mul_comba8",8);
10
- &bn_mul_comba("bn_mul_comba4",4);
11
- &bn_sqr_comba("bn_sqr_comba8",8);
12
- &bn_sqr_comba("bn_sqr_comba4",4);
13
-
14
- &asm_finish();
15
-
16
- sub mul_add_c
17
- {
18
- local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
19
-
20
- # pos == -1 if eax and edx are pre-loaded, 0 to load from next
21
- # words, and 1 if load return value
22
-
23
- &comment("mul a[$ai]*b[$bi]");
24
-
25
- # "eax" and "edx" will always be pre-loaded.
26
- # &mov("eax",&DWP($ai*4,$a,"",0)) ;
27
- # &mov("edx",&DWP($bi*4,$b,"",0));
28
-
29
- &mul("edx");
30
- &add($c0,"eax");
31
- &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # laod next a
32
- &mov("eax",&wparam(0)) if $pos > 0; # load r[]
33
- ###
34
- &adc($c1,"edx");
35
- &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0; # laod next b
36
- &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1; # laod next b
37
- ###
38
- &adc($c2,0);
39
- # if pos > 1, it means it is the last loop
40
- &mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0; # save r[];
41
- &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # laod next a
42
- }
43
-
44
- sub sqr_add_c
45
- {
46
- local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
47
-
48
- # pos == -1 if eax and edx are pre-loaded, 0 to load from next
49
- # words, and 1 if load return value
50
-
51
- &comment("sqr a[$ai]*a[$bi]");
52
-
53
- # "eax" and "edx" will always be pre-loaded.
54
- # &mov("eax",&DWP($ai*4,$a,"",0)) ;
55
- # &mov("edx",&DWP($bi*4,$b,"",0));
56
-
57
- if ($ai == $bi)
58
- { &mul("eax");}
59
- else
60
- { &mul("edx");}
61
- &add($c0,"eax");
62
- &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
63
- ###
64
- &adc($c1,"edx");
65
- &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
66
- ###
67
- &adc($c2,0);
68
- # if pos > 1, it means it is the last loop
69
- &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
70
- &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
71
- }
72
-
73
- sub sqr_add_c2
74
- {
75
- local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
76
-
77
- # pos == -1 if eax and edx are pre-loaded, 0 to load from next
78
- # words, and 1 if load return value
79
-
80
- &comment("sqr a[$ai]*a[$bi]");
81
-
82
- # "eax" and "edx" will always be pre-loaded.
83
- # &mov("eax",&DWP($ai*4,$a,"",0)) ;
84
- # &mov("edx",&DWP($bi*4,$a,"",0));
85
-
86
- if ($ai == $bi)
87
- { &mul("eax");}
88
- else
89
- { &mul("edx");}
90
- &add("eax","eax");
91
- ###
92
- &adc("edx","edx");
93
- ###
94
- &adc($c2,0);
95
- &add($c0,"eax");
96
- &adc($c1,"edx");
97
- &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0; # load next a
98
- &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1; # load next b
99
- &adc($c2,0);
100
- &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0; # save r[];
101
- &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
102
- ###
103
- }
104
-
105
- sub bn_mul_comba
106
- {
107
- local($name,$num)=@_;
108
- local($a,$b,$c0,$c1,$c2);
109
- local($i,$as,$ae,$bs,$be,$ai,$bi);
110
- local($tot,$end);
111
-
112
- &function_begin_B($name,"");
113
-
114
- $c0="ebx";
115
- $c1="ecx";
116
- $c2="ebp";
117
- $a="esi";
118
- $b="edi";
119
-
120
- $as=0;
121
- $ae=0;
122
- $bs=0;
123
- $be=0;
124
- $tot=$num+$num-1;
125
-
126
- &push("esi");
127
- &mov($a,&wparam(1));
128
- &push("edi");
129
- &mov($b,&wparam(2));
130
- &push("ebp");
131
- &push("ebx");
132
-
133
- &xor($c0,$c0);
134
- &mov("eax",&DWP(0,$a,"",0)); # load the first word
135
- &xor($c1,$c1);
136
- &mov("edx",&DWP(0,$b,"",0)); # load the first second
137
-
138
- for ($i=0; $i<$tot; $i++)
139
- {
140
- $ai=$as;
141
- $bi=$bs;
142
- $end=$be+1;
143
-
144
- &comment("################## Calculate word $i");
145
-
146
- for ($j=$bs; $j<$end; $j++)
147
- {
148
- &xor($c2,$c2) if ($j == $bs);
149
- if (($j+1) == $end)
150
- {
151
- $v=1;
152
- $v=2 if (($i+1) == $tot);
153
- }
154
- else
155
- { $v=0; }
156
- if (($j+1) != $end)
157
- {
158
- $na=($ai-1);
159
- $nb=($bi+1);
160
- }
161
- else
162
- {
163
- $na=$as+($i < ($num-1));
164
- $nb=$bs+($i >= ($num-1));
165
- }
166
- #printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
167
- &mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
168
- if ($v)
169
- {
170
- &comment("saved r[$i]");
171
- # &mov("eax",&wparam(0));
172
- # &mov(&DWP($i*4,"eax","",0),$c0);
173
- ($c0,$c1,$c2)=($c1,$c2,$c0);
174
- }
175
- $ai--;
176
- $bi++;
177
- }
178
- $as++ if ($i < ($num-1));
179
- $ae++ if ($i >= ($num-1));
180
-
181
- $bs++ if ($i >= ($num-1));
182
- $be++ if ($i < ($num-1));
183
- }
184
- &comment("save r[$i]");
185
- # &mov("eax",&wparam(0));
186
- &mov(&DWP($i*4,"eax","",0),$c0);
187
-
188
- &pop("ebx");
189
- &pop("ebp");
190
- &pop("edi");
191
- &pop("esi");
192
- &ret();
193
- &function_end_B($name);
194
- }
195
-
196
- sub bn_sqr_comba
197
- {
198
- local($name,$num)=@_;
199
- local($r,$a,$c0,$c1,$c2)=@_;
200
- local($i,$as,$ae,$bs,$be,$ai,$bi);
201
- local($b,$tot,$end,$half);
202
-
203
- &function_begin_B($name,"");
204
-
205
- $c0="ebx";
206
- $c1="ecx";
207
- $c2="ebp";
208
- $a="esi";
209
- $r="edi";
210
-
211
- &push("esi");
212
- &push("edi");
213
- &push("ebp");
214
- &push("ebx");
215
- &mov($r,&wparam(0));
216
- &mov($a,&wparam(1));
217
- &xor($c0,$c0);
218
- &xor($c1,$c1);
219
- &mov("eax",&DWP(0,$a,"",0)); # load the first word
220
-
221
- $as=0;
222
- $ae=0;
223
- $bs=0;
224
- $be=0;
225
- $tot=$num+$num-1;
226
-
227
- for ($i=0; $i<$tot; $i++)
228
- {
229
- $ai=$as;
230
- $bi=$bs;
231
- $end=$be+1;
232
-
233
- &comment("############### Calculate word $i");
234
- for ($j=$bs; $j<$end; $j++)
235
- {
236
- &xor($c2,$c2) if ($j == $bs);
237
- if (($ai-1) < ($bi+1))
238
- {
239
- $v=1;
240
- $v=2 if ($i+1) == $tot;
241
- }
242
- else
243
- { $v=0; }
244
- if (!$v)
245
- {
246
- $na=$ai-1;
247
- $nb=$bi+1;
248
- }
249
- else
250
- {
251
- $na=$as+($i < ($num-1));
252
- $nb=$bs+($i >= ($num-1));
253
- }
254
- if ($ai == $bi)
255
- {
256
- &sqr_add_c($r,$a,$ai,$bi,
257
- $c0,$c1,$c2,$v,$i,$na,$nb);
258
- }
259
- else
260
- {
261
- &sqr_add_c2($r,$a,$ai,$bi,
262
- $c0,$c1,$c2,$v,$i,$na,$nb);
263
- }
264
- if ($v)
265
- {
266
- &comment("saved r[$i]");
267
- #&mov(&DWP($i*4,$r,"",0),$c0);
268
- ($c0,$c1,$c2)=($c1,$c2,$c0);
269
- last;
270
- }
271
- $ai--;
272
- $bi++;
273
- }
274
- $as++ if ($i < ($num-1));
275
- $ae++ if ($i >= ($num-1));
276
-
277
- $bs++ if ($i >= ($num-1));
278
- $be++ if ($i < ($num-1));
279
- }
280
- &mov(&DWP($i*4,$r,"",0),$c0);
281
- &pop("ebx");
282
- &pop("ebp");
283
- &pop("edi");
284
- &pop("esi");
285
- &ret();
286
- &function_end_B($name);
287
- }
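The deleted co-586.pl above is the generator for bn_mul_comba8/4 and bn_sqr_comba8/4: it emits fully unrolled x86 code that builds the product column by column, so each result word is finished and stored exactly once. As a rough illustration of that column-wise ("comba") schoolbook multiplication, here is a minimal Rust sketch (illustrative only; `mul_comba` is a hypothetical name, not code from either gem version):

```rust
/// Hypothetical sketch of comba multiplication over 32-bit words in
/// little-endian limb order. Column k of the result collects every
/// a[i] * b[j] with i + j == k.
fn mul_comba(a: &[u32], b: &[u32]) -> Vec<u32> {
    let n = a.len();
    assert_eq!(n, b.len());
    assert!(n > 0);
    let mut r = vec![0u32; 2 * n];
    let mut carry: u128 = 0;
    for k in 0..(2 * n - 1) {
        let mut acc = carry;
        let i_lo = k.saturating_sub(n - 1);
        let i_hi = k.min(n - 1);
        for i in i_lo..=i_hi {
            acc += (a[i] as u128) * (b[k - i] as u128);
        }
        r[k] = acc as u32; // low 32 bits are result word k, written exactly once
        carry = acc >> 32; // everything above flows into the later columns
    }
    r[2 * n - 1] = carry as u32; // the leftover carry is the top word
    r
}
```

The generated assembly walks the same columns, but fully unrolled and with the three accumulator registers (ebx, ecx, ebp in the code above) rotated in place of a wide integer, storing each result word as soon as its column closes.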
data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl
@@ -1,1882 +0,0 @@
1
- #!/usr/bin/env perl
2
-
3
- ##############################################################################
4
- # #
5
- # Copyright (c) 2012, Intel Corporation #
6
- # #
7
- # All rights reserved. #
8
- # #
9
- # Redistribution and use in source and binary forms, with or without #
10
- # modification, are permitted provided that the following conditions are #
11
- # met: #
12
- # #
13
- # * Redistributions of source code must retain the above copyright #
14
- # notice, this list of conditions and the following disclaimer. #
15
- # #
16
- # * Redistributions in binary form must reproduce the above copyright #
17
- # notice, this list of conditions and the following disclaimer in the #
18
- # documentation and/or other materials provided with the #
19
- # distribution. #
20
- # #
21
- # * Neither the name of the Intel Corporation nor the names of its #
22
- # contributors may be used to endorse or promote products derived from #
23
- # this software without specific prior written permission. #
24
- # #
25
- # #
26
- # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY #
27
- # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
28
- # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
29
- # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
30
- # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
31
- # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
32
- # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
33
- # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
34
- # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
35
- # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
36
- # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
37
- # #
38
- ##############################################################################
39
- # Developers and authors: #
40
- # Shay Gueron (1, 2), and Vlad Krasnov (1) #
41
- # (1) Intel Corporation, Israel Development Center, Haifa, Israel #
42
- # (2) University of Haifa, Israel #
43
- ##############################################################################
44
- # Reference: #
45
- # [1] S. Gueron, V. Krasnov: "Software Implementation of Modular #
46
- # Exponentiation, Using Advanced Vector Instructions Architectures", #
47
- # F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369, #
48
- # pp. 119–135, 2012. Springer-Verlag Berlin Heidelberg 2012             #
49
- # [2] S. Gueron: "Efficient Software Implementations of Modular #
50
- # Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012). #
51
- # [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring",IEEE #
52
- # Proceedings of 9th International Conference on Information Technology: #
53
- # New Generations (ITNG 2012), pp.821-823 (2012) #
54
- # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
55
- # resistant 1024-bit modular exponentiation, for optimizing RSA2048 #
56
- # on AVX2 capable x86_64 platforms", #
57
- # http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest#
58
- ##############################################################################
59
- #
60
- # +13% improvement over original submission by <appro@openssl.org>
61
- #
62
- # rsa2048 sign/sec OpenSSL 1.0.1 scalar(*) this
63
- # 2.3GHz Haswell 621 765/+23% 1113/+79%
64
- # 2.3GHz Broadwell(**) 688 1200(***)/+74% 1120/+63%
65
- #
66
- # (*) if system doesn't support AVX2, for reference purposes;
67
- # (**) scaled to 2.3GHz to simplify comparison;
68
- # (***) scalar AD*X code is faster than AVX2 and is preferred code
69
- # path for Broadwell;
70
-
71
- $flavour = shift;
72
- $output = shift;
73
- if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
74
-
75
- $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
76
-
77
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
78
- ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
79
- ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
80
- die "can't locate x86_64-xlate.pl";
81
-
82
- # In upstream, this is controlled by shelling out to the compiler to check
83
- # versions, but BoringSSL is intended to be used with pre-generated perlasm
84
- # output, so this isn't useful anyway.
85
- #
86
- # TODO(davidben): Enable these after testing. $avx goes up to 2 and $addx to 1.
87
- $avx = 0;
88
- $addx = 0;
89
-
90
- open OUT,"| \"$^X\" $xlate $flavour $output";
91
- *STDOUT = *OUT;
92
-
93
- if ($avx>1) {{{
94
- { # void AMS_WW(
95
- my $rp="%rdi"; # BN_ULONG *rp,
96
- my $ap="%rsi"; # const BN_ULONG *ap,
97
- my $np="%rdx"; # const BN_ULONG *np,
98
- my $n0="%ecx"; # const BN_ULONG n0,
99
- my $rep="%r8d"; # int repeat);
100
-
101
- # The registers that hold the accumulated redundant result
102
- # The AMM works on 1024 bit operands, and redundant word size is 29
103
- # Therefore: ceil(1024/29)/4 = 9
104
- my $ACC0="%ymm0";
105
- my $ACC1="%ymm1";
106
- my $ACC2="%ymm2";
107
- my $ACC3="%ymm3";
108
- my $ACC4="%ymm4";
109
- my $ACC5="%ymm5";
110
- my $ACC6="%ymm6";
111
- my $ACC7="%ymm7";
112
- my $ACC8="%ymm8";
113
- my $ACC9="%ymm9";
114
- # Registers that hold the broadcasted words of bp, currently used
115
- my $B1="%ymm10";
116
- my $B2="%ymm11";
117
- # Registers that hold the broadcasted words of Y, currently used
118
- my $Y1="%ymm12";
119
- my $Y2="%ymm13";
120
- # Helper registers
121
- my $TEMP1="%ymm14";
122
- my $AND_MASK="%ymm15";
123
- # alu registers that hold the first words of the ACC
124
- my $r0="%r9";
125
- my $r1="%r10";
126
- my $r2="%r11";
127
- my $r3="%r12";
128
-
129
- my $i="%r14d"; # loop counter
130
- my $tmp = "%r15";
131
-
132
- my $FrameSize=32*18+32*8; # place for A^2 and 2*A
133
-
134
- my $aap=$r0;
135
- my $tp0="%rbx";
136
- my $tp1=$r3;
137
- my $tpa=$tmp;
138
-
139
- $np="%r13"; # reassigned argument
140
-
141
- $code.=<<___;
142
- .text
143
-
144
- .globl rsaz_1024_sqr_avx2
145
- .type rsaz_1024_sqr_avx2,\@function,5
146
- .align 64
147
- rsaz_1024_sqr_avx2: # 702 cycles, 14% faster than rsaz_1024_mul_avx2
148
- lea (%rsp), %rax
149
- push %rbx
150
- push %rbp
151
- push %r12
152
- push %r13
153
- push %r14
154
- push %r15
155
- vzeroupper
156
- ___
157
- $code.=<<___ if ($win64);
158
- lea -0xa8(%rsp),%rsp
159
- vmovaps %xmm6,-0xd8(%rax)
160
- vmovaps %xmm7,-0xc8(%rax)
161
- vmovaps %xmm8,-0xb8(%rax)
162
- vmovaps %xmm9,-0xa8(%rax)
163
- vmovaps %xmm10,-0x98(%rax)
164
- vmovaps %xmm11,-0x88(%rax)
165
- vmovaps %xmm12,-0x78(%rax)
166
- vmovaps %xmm13,-0x68(%rax)
167
- vmovaps %xmm14,-0x58(%rax)
168
- vmovaps %xmm15,-0x48(%rax)
169
- .Lsqr_1024_body:
170
- ___
171
- $code.=<<___;
172
- mov %rax,%rbp
173
- mov %rdx, $np # reassigned argument
174
- sub \$$FrameSize, %rsp
175
- mov $np, $tmp
176
- sub \$-128, $rp # size optimization
177
- sub \$-128, $ap
178
- sub \$-128, $np
179
-
180
- and \$4095, $tmp # see if $np crosses page
181
- add \$32*10, $tmp
182
- shr \$12, $tmp
183
- vpxor $ACC9,$ACC9,$ACC9
184
- jz .Lsqr_1024_no_n_copy
185
-
186
- # unaligned 256-bit load that crosses page boundary can
187
- # cause >2x performance degradation here, so if $np does
188
- # cross page boundary, copy it to stack and make sure stack
189
- # frame doesn't...
190
- sub \$32*10,%rsp
191
- vmovdqu 32*0-128($np), $ACC0
192
- and \$-2048, %rsp
193
- vmovdqu 32*1-128($np), $ACC1
194
- vmovdqu 32*2-128($np), $ACC2
195
- vmovdqu 32*3-128($np), $ACC3
196
- vmovdqu 32*4-128($np), $ACC4
197
- vmovdqu 32*5-128($np), $ACC5
198
- vmovdqu 32*6-128($np), $ACC6
199
- vmovdqu 32*7-128($np), $ACC7
200
- vmovdqu 32*8-128($np), $ACC8
201
- lea $FrameSize+128(%rsp),$np
202
- vmovdqu $ACC0, 32*0-128($np)
203
- vmovdqu $ACC1, 32*1-128($np)
204
- vmovdqu $ACC2, 32*2-128($np)
205
- vmovdqu $ACC3, 32*3-128($np)
206
- vmovdqu $ACC4, 32*4-128($np)
207
- vmovdqu $ACC5, 32*5-128($np)
208
- vmovdqu $ACC6, 32*6-128($np)
209
- vmovdqu $ACC7, 32*7-128($np)
210
- vmovdqu $ACC8, 32*8-128($np)
211
- vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero
212
-
213
- .Lsqr_1024_no_n_copy:
214
- and \$-1024, %rsp
215
-
216
- vmovdqu 32*1-128($ap), $ACC1
217
- vmovdqu 32*2-128($ap), $ACC2
218
- vmovdqu 32*3-128($ap), $ACC3
219
- vmovdqu 32*4-128($ap), $ACC4
220
- vmovdqu 32*5-128($ap), $ACC5
221
- vmovdqu 32*6-128($ap), $ACC6
222
- vmovdqu 32*7-128($ap), $ACC7
223
- vmovdqu 32*8-128($ap), $ACC8
224
-
225
- lea 192(%rsp), $tp0 # 64+128=192
226
- vpbroadcastq .Land_mask(%rip), $AND_MASK
227
- jmp .LOOP_GRANDE_SQR_1024
228
-
229
- .align 32
230
- .LOOP_GRANDE_SQR_1024:
231
- lea 32*18+128(%rsp), $aap # size optimization
232
- lea 448(%rsp), $tp1 # 64+128+256=448
233
-
234
- # the squaring is performed as described in Variant B of
235
- # "Speeding up Big-Number Squaring", so start by calculating
236
- # the A*2=A+A vector
237
- vpaddq $ACC1, $ACC1, $ACC1
238
- vpbroadcastq 32*0-128($ap), $B1
239
- vpaddq $ACC2, $ACC2, $ACC2
240
- vmovdqa $ACC1, 32*0-128($aap)
241
- vpaddq $ACC3, $ACC3, $ACC3
242
- vmovdqa $ACC2, 32*1-128($aap)
243
- vpaddq $ACC4, $ACC4, $ACC4
244
- vmovdqa $ACC3, 32*2-128($aap)
245
- vpaddq $ACC5, $ACC5, $ACC5
246
- vmovdqa $ACC4, 32*3-128($aap)
247
- vpaddq $ACC6, $ACC6, $ACC6
248
- vmovdqa $ACC5, 32*4-128($aap)
249
- vpaddq $ACC7, $ACC7, $ACC7
250
- vmovdqa $ACC6, 32*5-128($aap)
251
- vpaddq $ACC8, $ACC8, $ACC8
252
- vmovdqa $ACC7, 32*6-128($aap)
253
- vpxor $ACC9, $ACC9, $ACC9
254
- vmovdqa $ACC8, 32*7-128($aap)
255
-
256
- vpmuludq 32*0-128($ap), $B1, $ACC0
257
- vpbroadcastq 32*1-128($ap), $B2
258
- vmovdqu $ACC9, 32*9-192($tp0) # zero upper half
259
- vpmuludq $B1, $ACC1, $ACC1
260
- vmovdqu $ACC9, 32*10-448($tp1)
261
- vpmuludq $B1, $ACC2, $ACC2
262
- vmovdqu $ACC9, 32*11-448($tp1)
263
- vpmuludq $B1, $ACC3, $ACC3
264
- vmovdqu $ACC9, 32*12-448($tp1)
265
- vpmuludq $B1, $ACC4, $ACC4
266
- vmovdqu $ACC9, 32*13-448($tp1)
267
- vpmuludq $B1, $ACC5, $ACC5
268
- vmovdqu $ACC9, 32*14-448($tp1)
269
- vpmuludq $B1, $ACC6, $ACC6
270
- vmovdqu $ACC9, 32*15-448($tp1)
271
- vpmuludq $B1, $ACC7, $ACC7
272
- vmovdqu $ACC9, 32*16-448($tp1)
273
- vpmuludq $B1, $ACC8, $ACC8
274
- vpbroadcastq 32*2-128($ap), $B1
275
- vmovdqu $ACC9, 32*17-448($tp1)
276
-
277
- mov $ap, $tpa
278
- mov \$4, $i
279
- jmp .Lsqr_entry_1024
280
- ___
281
- $TEMP0=$Y1;
282
- $TEMP2=$Y2;
283
- $code.=<<___;
284
- .align 32
285
- .LOOP_SQR_1024:
286
- vpbroadcastq 32*1-128($tpa), $B2
287
- vpmuludq 32*0-128($ap), $B1, $ACC0
288
- vpaddq 32*0-192($tp0), $ACC0, $ACC0
289
- vpmuludq 32*0-128($aap), $B1, $ACC1
290
- vpaddq 32*1-192($tp0), $ACC1, $ACC1
291
- vpmuludq 32*1-128($aap), $B1, $ACC2
292
- vpaddq 32*2-192($tp0), $ACC2, $ACC2
293
- vpmuludq 32*2-128($aap), $B1, $ACC3
294
- vpaddq 32*3-192($tp0), $ACC3, $ACC3
295
- vpmuludq 32*3-128($aap), $B1, $ACC4
296
- vpaddq 32*4-192($tp0), $ACC4, $ACC4
297
- vpmuludq 32*4-128($aap), $B1, $ACC5
298
- vpaddq 32*5-192($tp0), $ACC5, $ACC5
299
- vpmuludq 32*5-128($aap), $B1, $ACC6
300
- vpaddq 32*6-192($tp0), $ACC6, $ACC6
301
- vpmuludq 32*6-128($aap), $B1, $ACC7
302
- vpaddq 32*7-192($tp0), $ACC7, $ACC7
303
- vpmuludq 32*7-128($aap), $B1, $ACC8
304
- vpbroadcastq 32*2-128($tpa), $B1
305
- vpaddq 32*8-192($tp0), $ACC8, $ACC8
306
- .Lsqr_entry_1024:
307
- vmovdqu $ACC0, 32*0-192($tp0)
308
- vmovdqu $ACC1, 32*1-192($tp0)
309
-
310
- vpmuludq 32*1-128($ap), $B2, $TEMP0
311
- vpaddq $TEMP0, $ACC2, $ACC2
312
- vpmuludq 32*1-128($aap), $B2, $TEMP1
313
- vpaddq $TEMP1, $ACC3, $ACC3
314
- vpmuludq 32*2-128($aap), $B2, $TEMP2
315
- vpaddq $TEMP2, $ACC4, $ACC4
316
- vpmuludq 32*3-128($aap), $B2, $TEMP0
317
- vpaddq $TEMP0, $ACC5, $ACC5
318
- vpmuludq 32*4-128($aap), $B2, $TEMP1
319
- vpaddq $TEMP1, $ACC6, $ACC6
320
- vpmuludq 32*5-128($aap), $B2, $TEMP2
321
- vpaddq $TEMP2, $ACC7, $ACC7
322
- vpmuludq 32*6-128($aap), $B2, $TEMP0
323
- vpaddq $TEMP0, $ACC8, $ACC8
324
- vpmuludq 32*7-128($aap), $B2, $ACC0
325
- vpbroadcastq 32*3-128($tpa), $B2
326
- vpaddq 32*9-192($tp0), $ACC0, $ACC0
327
-
328
- vmovdqu $ACC2, 32*2-192($tp0)
329
- vmovdqu $ACC3, 32*3-192($tp0)
330
-
331
- vpmuludq 32*2-128($ap), $B1, $TEMP2
332
- vpaddq $TEMP2, $ACC4, $ACC4
333
- vpmuludq 32*2-128($aap), $B1, $TEMP0
334
- vpaddq $TEMP0, $ACC5, $ACC5
335
- vpmuludq 32*3-128($aap), $B1, $TEMP1
336
- vpaddq $TEMP1, $ACC6, $ACC6
337
- vpmuludq 32*4-128($aap), $B1, $TEMP2
338
- vpaddq $TEMP2, $ACC7, $ACC7
339
- vpmuludq 32*5-128($aap), $B1, $TEMP0
340
- vpaddq $TEMP0, $ACC8, $ACC8
341
- vpmuludq 32*6-128($aap), $B1, $TEMP1
342
- vpaddq $TEMP1, $ACC0, $ACC0
343
- vpmuludq 32*7-128($aap), $B1, $ACC1
344
- vpbroadcastq 32*4-128($tpa), $B1
345
- vpaddq 32*10-448($tp1), $ACC1, $ACC1
346
-
347
- vmovdqu $ACC4, 32*4-192($tp0)
348
- vmovdqu $ACC5, 32*5-192($tp0)
349
-
350
- vpmuludq 32*3-128($ap), $B2, $TEMP0
351
- vpaddq $TEMP0, $ACC6, $ACC6
352
- vpmuludq 32*3-128($aap), $B2, $TEMP1
353
- vpaddq $TEMP1, $ACC7, $ACC7
354
- vpmuludq 32*4-128($aap), $B2, $TEMP2
355
- vpaddq $TEMP2, $ACC8, $ACC8
356
- vpmuludq 32*5-128($aap), $B2, $TEMP0
357
- vpaddq $TEMP0, $ACC0, $ACC0
358
- vpmuludq 32*6-128($aap), $B2, $TEMP1
359
- vpaddq $TEMP1, $ACC1, $ACC1
360
- vpmuludq 32*7-128($aap), $B2, $ACC2
361
- vpbroadcastq 32*5-128($tpa), $B2
362
- vpaddq 32*11-448($tp1), $ACC2, $ACC2
363
-
364
- vmovdqu $ACC6, 32*6-192($tp0)
365
- vmovdqu $ACC7, 32*7-192($tp0)
366
-
367
- vpmuludq 32*4-128($ap), $B1, $TEMP0
368
- vpaddq $TEMP0, $ACC8, $ACC8
369
- vpmuludq 32*4-128($aap), $B1, $TEMP1
370
- vpaddq $TEMP1, $ACC0, $ACC0
371
- vpmuludq 32*5-128($aap), $B1, $TEMP2
372
- vpaddq $TEMP2, $ACC1, $ACC1
373
- vpmuludq 32*6-128($aap), $B1, $TEMP0
374
- vpaddq $TEMP0, $ACC2, $ACC2
375
- vpmuludq 32*7-128($aap), $B1, $ACC3
376
- vpbroadcastq 32*6-128($tpa), $B1
377
- vpaddq 32*12-448($tp1), $ACC3, $ACC3
378
-
379
- vmovdqu $ACC8, 32*8-192($tp0)
380
- vmovdqu $ACC0, 32*9-192($tp0)
381
- lea 8($tp0), $tp0
382
-
383
- vpmuludq 32*5-128($ap), $B2, $TEMP2
384
- vpaddq $TEMP2, $ACC1, $ACC1
385
- vpmuludq 32*5-128($aap), $B2, $TEMP0
386
- vpaddq $TEMP0, $ACC2, $ACC2
387
- vpmuludq 32*6-128($aap), $B2, $TEMP1
388
- vpaddq $TEMP1, $ACC3, $ACC3
389
- vpmuludq 32*7-128($aap), $B2, $ACC4
390
- vpbroadcastq 32*7-128($tpa), $B2
391
- vpaddq 32*13-448($tp1), $ACC4, $ACC4
392
-
393
- vmovdqu $ACC1, 32*10-448($tp1)
394
- vmovdqu $ACC2, 32*11-448($tp1)
395
-
396
- vpmuludq 32*6-128($ap), $B1, $TEMP0
397
- vpaddq $TEMP0, $ACC3, $ACC3
398
- vpmuludq 32*6-128($aap), $B1, $TEMP1
399
- vpbroadcastq 32*8-128($tpa), $ACC0 # borrow $ACC0 for $B1
400
- vpaddq $TEMP1, $ACC4, $ACC4
401
- vpmuludq 32*7-128($aap), $B1, $ACC5
402
- vpbroadcastq 32*0+8-128($tpa), $B1 # for next iteration
403
- vpaddq 32*14-448($tp1), $ACC5, $ACC5
404
-
405
- vmovdqu $ACC3, 32*12-448($tp1)
406
- vmovdqu $ACC4, 32*13-448($tp1)
407
- lea 8($tpa), $tpa
408
-
409
- vpmuludq 32*7-128($ap), $B2, $TEMP0
410
- vpaddq $TEMP0, $ACC5, $ACC5
411
- vpmuludq 32*7-128($aap), $B2, $ACC6
412
- vpaddq 32*15-448($tp1), $ACC6, $ACC6
413
-
414
- vpmuludq 32*8-128($ap), $ACC0, $ACC7
415
- vmovdqu $ACC5, 32*14-448($tp1)
416
- vpaddq 32*16-448($tp1), $ACC7, $ACC7
417
- vmovdqu $ACC6, 32*15-448($tp1)
418
- vmovdqu $ACC7, 32*16-448($tp1)
419
- lea 8($tp1), $tp1
420
-
421
- dec $i
422
- jnz .LOOP_SQR_1024
423
- ___
424
- $ZERO = $ACC9;
425
- $TEMP0 = $B1;
426
- $TEMP2 = $B2;
427
- $TEMP3 = $Y1;
428
- $TEMP4 = $Y2;
429
- $code.=<<___;
430
- #we need to fix indexes 32-39 to avoid overflow
431
- vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0),
432
- vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0)
433
- vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0)
434
- lea 192(%rsp), $tp0 # 64+128=192
435
-
436
- vpsrlq \$29, $ACC8, $TEMP1
437
- vpand $AND_MASK, $ACC8, $ACC8
438
- vpsrlq \$29, $ACC1, $TEMP2
439
- vpand $AND_MASK, $ACC1, $ACC1
440
-
441
- vpermq \$0x93, $TEMP1, $TEMP1
442
- vpxor $ZERO, $ZERO, $ZERO
443
- vpermq \$0x93, $TEMP2, $TEMP2
444
-
445
- vpblendd \$3, $ZERO, $TEMP1, $TEMP0
446
- vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
447
- vpaddq $TEMP0, $ACC8, $ACC8
448
- vpblendd \$3, $TEMP2, $ZERO, $TEMP2
449
- vpaddq $TEMP1, $ACC1, $ACC1
450
- vpaddq $TEMP2, $ACC2, $ACC2
451
- vmovdqu $ACC1, 32*9-192($tp0)
452
- vmovdqu $ACC2, 32*10-192($tp0)
453
-
454
- mov (%rsp), %rax
455
- mov 8(%rsp), $r1
456
- mov 16(%rsp), $r2
457
- mov 24(%rsp), $r3
458
- vmovdqu 32*1(%rsp), $ACC1
459
- vmovdqu 32*2-192($tp0), $ACC2
460
- vmovdqu 32*3-192($tp0), $ACC3
461
- vmovdqu 32*4-192($tp0), $ACC4
462
- vmovdqu 32*5-192($tp0), $ACC5
463
- vmovdqu 32*6-192($tp0), $ACC6
464
- vmovdqu 32*7-192($tp0), $ACC7
465
-
466
- mov %rax, $r0
467
- imull $n0, %eax
468
- and \$0x1fffffff, %eax
469
- vmovd %eax, $Y1
470
-
471
- mov %rax, %rdx
472
- imulq -128($np), %rax
473
- vpbroadcastq $Y1, $Y1
474
- add %rax, $r0
475
- mov %rdx, %rax
476
- imulq 8-128($np), %rax
477
- shr \$29, $r0
478
- add %rax, $r1
479
- mov %rdx, %rax
480
- imulq 16-128($np), %rax
481
- add $r0, $r1
482
- add %rax, $r2
483
- imulq 24-128($np), %rdx
484
- add %rdx, $r3
485
-
486
- mov $r1, %rax
487
- imull $n0, %eax
488
- and \$0x1fffffff, %eax
489
-
490
- mov \$9, $i
491
- jmp .LOOP_REDUCE_1024
492
-
493
- .align 32
494
- .LOOP_REDUCE_1024:
495
- vmovd %eax, $Y2
496
- vpbroadcastq $Y2, $Y2
497
-
498
- vpmuludq 32*1-128($np), $Y1, $TEMP0
499
- mov %rax, %rdx
500
- imulq -128($np), %rax
501
- vpaddq $TEMP0, $ACC1, $ACC1
502
- add %rax, $r1
503
- vpmuludq 32*2-128($np), $Y1, $TEMP1
504
- mov %rdx, %rax
505
- imulq 8-128($np), %rax
506
- vpaddq $TEMP1, $ACC2, $ACC2
507
- vpmuludq 32*3-128($np), $Y1, $TEMP2
508
- .byte 0x67
509
- add %rax, $r2
510
- .byte 0x67
511
- mov %rdx, %rax
512
- imulq 16-128($np), %rax
513
- shr \$29, $r1
514
- vpaddq $TEMP2, $ACC3, $ACC3
515
- vpmuludq 32*4-128($np), $Y1, $TEMP0
516
- add %rax, $r3
517
- add $r1, $r2
518
- vpaddq $TEMP0, $ACC4, $ACC4
519
- vpmuludq 32*5-128($np), $Y1, $TEMP1
520
- mov $r2, %rax
521
- imull $n0, %eax
522
- vpaddq $TEMP1, $ACC5, $ACC5
523
- vpmuludq 32*6-128($np), $Y1, $TEMP2
524
- and \$0x1fffffff, %eax
525
- vpaddq $TEMP2, $ACC6, $ACC6
526
- vpmuludq 32*7-128($np), $Y1, $TEMP0
527
- vpaddq $TEMP0, $ACC7, $ACC7
528
- vpmuludq 32*8-128($np), $Y1, $TEMP1
529
- vmovd %eax, $Y1
530
- #vmovdqu 32*1-8-128($np), $TEMP2 # moved below
531
- vpaddq $TEMP1, $ACC8, $ACC8
532
- #vmovdqu 32*2-8-128($np), $TEMP0 # moved below
533
- vpbroadcastq $Y1, $Y1
534
-
535
- vpmuludq 32*1-8-128($np), $Y2, $TEMP2 # see above
536
- vmovdqu 32*3-8-128($np), $TEMP1
537
- mov %rax, %rdx
538
- imulq -128($np), %rax
539
- vpaddq $TEMP2, $ACC1, $ACC1
540
- vpmuludq 32*2-8-128($np), $Y2, $TEMP0 # see above
541
- vmovdqu 32*4-8-128($np), $TEMP2
542
- add %rax, $r2
543
- mov %rdx, %rax
544
- imulq 8-128($np), %rax
545
- vpaddq $TEMP0, $ACC2, $ACC2
546
- add $r3, %rax
547
- shr \$29, $r2
548
- vpmuludq $Y2, $TEMP1, $TEMP1
549
- vmovdqu 32*5-8-128($np), $TEMP0
550
- add $r2, %rax
551
- vpaddq $TEMP1, $ACC3, $ACC3
552
- vpmuludq $Y2, $TEMP2, $TEMP2
553
- vmovdqu 32*6-8-128($np), $TEMP1
554
- .byte 0x67
555
- mov %rax, $r3
556
- imull $n0, %eax
557
- vpaddq $TEMP2, $ACC4, $ACC4
558
- vpmuludq $Y2, $TEMP0, $TEMP0
559
- .byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 # vmovdqu 32*7-8-128($np), $TEMP2
560
- and \$0x1fffffff, %eax
561
- vpaddq $TEMP0, $ACC5, $ACC5
562
- vpmuludq $Y2, $TEMP1, $TEMP1
563
- vmovdqu 32*8-8-128($np), $TEMP0
564
- vpaddq $TEMP1, $ACC6, $ACC6
565
- vpmuludq $Y2, $TEMP2, $TEMP2
566
- vmovdqu 32*9-8-128($np), $ACC9
567
- vmovd %eax, $ACC0 # borrow ACC0 for Y2
568
- imulq -128($np), %rax
569
- vpaddq $TEMP2, $ACC7, $ACC7
570
- vpmuludq $Y2, $TEMP0, $TEMP0
571
- vmovdqu 32*1-16-128($np), $TEMP1
572
- vpbroadcastq $ACC0, $ACC0
573
- vpaddq $TEMP0, $ACC8, $ACC8
574
- vpmuludq $Y2, $ACC9, $ACC9
575
- vmovdqu 32*2-16-128($np), $TEMP2
576
- add %rax, $r3
577
-
578
- ___
579
- ($ACC0,$Y2)=($Y2,$ACC0);
580
- $code.=<<___;
581
- vmovdqu 32*1-24-128($np), $ACC0
582
- vpmuludq $Y1, $TEMP1, $TEMP1
583
- vmovdqu 32*3-16-128($np), $TEMP0
584
- vpaddq $TEMP1, $ACC1, $ACC1
585
- vpmuludq $Y2, $ACC0, $ACC0
586
- vpmuludq $Y1, $TEMP2, $TEMP2
587
- .byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff # vmovdqu 32*4-16-128($np), $TEMP1
588
- vpaddq $ACC1, $ACC0, $ACC0
589
- vpaddq $TEMP2, $ACC2, $ACC2
590
- vpmuludq $Y1, $TEMP0, $TEMP0
591
- vmovdqu 32*5-16-128($np), $TEMP2
592
- .byte 0x67
593
- vmovq $ACC0, %rax
594
- vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
595
- vpaddq $TEMP0, $ACC3, $ACC3
596
- vpmuludq $Y1, $TEMP1, $TEMP1
597
- vmovdqu 32*6-16-128($np), $TEMP0
598
- vpaddq $TEMP1, $ACC4, $ACC4
599
- vpmuludq $Y1, $TEMP2, $TEMP2
600
- vmovdqu 32*7-16-128($np), $TEMP1
601
- vpaddq $TEMP2, $ACC5, $ACC5
602
- vpmuludq $Y1, $TEMP0, $TEMP0
603
- vmovdqu 32*8-16-128($np), $TEMP2
604
- vpaddq $TEMP0, $ACC6, $ACC6
605
- vpmuludq $Y1, $TEMP1, $TEMP1
606
- shr \$29, $r3
607
- vmovdqu 32*9-16-128($np), $TEMP0
608
- add $r3, %rax
609
- vpaddq $TEMP1, $ACC7, $ACC7
610
- vpmuludq $Y1, $TEMP2, $TEMP2
611
- #vmovdqu 32*2-24-128($np), $TEMP1 # moved below
612
- mov %rax, $r0
613
- imull $n0, %eax
614
- vpaddq $TEMP2, $ACC8, $ACC8
615
- vpmuludq $Y1, $TEMP0, $TEMP0
616
- and \$0x1fffffff, %eax
617
- vmovd %eax, $Y1
618
- vmovdqu 32*3-24-128($np), $TEMP2
619
- .byte 0x67
620
- vpaddq $TEMP0, $ACC9, $ACC9
621
- vpbroadcastq $Y1, $Y1
622
-
623
- vpmuludq 32*2-24-128($np), $Y2, $TEMP1 # see above
624
- vmovdqu 32*4-24-128($np), $TEMP0
625
- mov %rax, %rdx
626
- imulq -128($np), %rax
627
- mov 8(%rsp), $r1
628
- vpaddq $TEMP1, $ACC2, $ACC1
629
- vpmuludq $Y2, $TEMP2, $TEMP2
630
- vmovdqu 32*5-24-128($np), $TEMP1
631
- add %rax, $r0
632
- mov %rdx, %rax
633
- imulq 8-128($np), %rax
634
- .byte 0x67
635
- shr \$29, $r0
636
- mov 16(%rsp), $r2
637
- vpaddq $TEMP2, $ACC3, $ACC2
638
- vpmuludq $Y2, $TEMP0, $TEMP0
639
- vmovdqu 32*6-24-128($np), $TEMP2
640
- add %rax, $r1
641
- mov %rdx, %rax
642
- imulq 16-128($np), %rax
643
- vpaddq $TEMP0, $ACC4, $ACC3
644
- vpmuludq $Y2, $TEMP1, $TEMP1
645
- vmovdqu 32*7-24-128($np), $TEMP0
646
- imulq 24-128($np), %rdx # future $r3
647
- add %rax, $r2
648
- lea ($r0,$r1), %rax
649
- vpaddq $TEMP1, $ACC5, $ACC4
650
- vpmuludq $Y2, $TEMP2, $TEMP2
651
- vmovdqu 32*8-24-128($np), $TEMP1
652
- mov %rax, $r1
653
- imull $n0, %eax
654
- vpmuludq $Y2, $TEMP0, $TEMP0
655
- vpaddq $TEMP2, $ACC6, $ACC5
656
- vmovdqu 32*9-24-128($np), $TEMP2
657
- and \$0x1fffffff, %eax
658
- vpaddq $TEMP0, $ACC7, $ACC6
659
- vpmuludq $Y2, $TEMP1, $TEMP1
660
- add 24(%rsp), %rdx
661
- vpaddq $TEMP1, $ACC8, $ACC7
662
- vpmuludq $Y2, $TEMP2, $TEMP2
663
- vpaddq $TEMP2, $ACC9, $ACC8
664
- vmovq $r3, $ACC9
665
- mov %rdx, $r3
666
-
667
- dec $i
668
- jnz .LOOP_REDUCE_1024
669
- ___
670
- ($ACC0,$Y2)=($Y2,$ACC0);
671
- $code.=<<___;
672
- lea 448(%rsp), $tp1 # size optimization
673
- vpaddq $ACC9, $Y2, $ACC0
674
- vpxor $ZERO, $ZERO, $ZERO
675
-
676
- vpaddq 32*9-192($tp0), $ACC0, $ACC0
677
- vpaddq 32*10-448($tp1), $ACC1, $ACC1
678
- vpaddq 32*11-448($tp1), $ACC2, $ACC2
679
- vpaddq 32*12-448($tp1), $ACC3, $ACC3
680
- vpaddq 32*13-448($tp1), $ACC4, $ACC4
681
- vpaddq 32*14-448($tp1), $ACC5, $ACC5
682
- vpaddq 32*15-448($tp1), $ACC6, $ACC6
683
- vpaddq 32*16-448($tp1), $ACC7, $ACC7
684
- vpaddq 32*17-448($tp1), $ACC8, $ACC8
685
-
686
- vpsrlq \$29, $ACC0, $TEMP1
687
- vpand $AND_MASK, $ACC0, $ACC0
688
- vpsrlq \$29, $ACC1, $TEMP2
689
- vpand $AND_MASK, $ACC1, $ACC1
690
- vpsrlq \$29, $ACC2, $TEMP3
691
- vpermq \$0x93, $TEMP1, $TEMP1
692
- vpand $AND_MASK, $ACC2, $ACC2
693
- vpsrlq \$29, $ACC3, $TEMP4
694
- vpermq \$0x93, $TEMP2, $TEMP2
695
- vpand $AND_MASK, $ACC3, $ACC3
696
- vpermq \$0x93, $TEMP3, $TEMP3
697
-
698
- vpblendd \$3, $ZERO, $TEMP1, $TEMP0
699
- vpermq \$0x93, $TEMP4, $TEMP4
700
- vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
701
- vpaddq $TEMP0, $ACC0, $ACC0
702
- vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
703
- vpaddq $TEMP1, $ACC1, $ACC1
704
- vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
705
- vpaddq $TEMP2, $ACC2, $ACC2
706
- vpblendd \$3, $TEMP4, $ZERO, $TEMP4
707
- vpaddq $TEMP3, $ACC3, $ACC3
708
- vpaddq $TEMP4, $ACC4, $ACC4
709
-
710
- vpsrlq \$29, $ACC0, $TEMP1
711
- vpand $AND_MASK, $ACC0, $ACC0
712
- vpsrlq \$29, $ACC1, $TEMP2
713
- vpand $AND_MASK, $ACC1, $ACC1
714
- vpsrlq \$29, $ACC2, $TEMP3
715
- vpermq \$0x93, $TEMP1, $TEMP1
716
- vpand $AND_MASK, $ACC2, $ACC2
717
- vpsrlq \$29, $ACC3, $TEMP4
718
- vpermq \$0x93, $TEMP2, $TEMP2
719
- vpand $AND_MASK, $ACC3, $ACC3
720
- vpermq \$0x93, $TEMP3, $TEMP3
721
-
722
- vpblendd \$3, $ZERO, $TEMP1, $TEMP0
723
- vpermq \$0x93, $TEMP4, $TEMP4
724
- vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
725
- vpaddq $TEMP0, $ACC0, $ACC0
726
- vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
727
- vpaddq $TEMP1, $ACC1, $ACC1
728
- vmovdqu $ACC0, 32*0-128($rp)
729
- vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
730
- vpaddq $TEMP2, $ACC2, $ACC2
731
- vmovdqu $ACC1, 32*1-128($rp)
732
- vpblendd \$3, $TEMP4, $ZERO, $TEMP4
733
- vpaddq $TEMP3, $ACC3, $ACC3
734
- vmovdqu $ACC2, 32*2-128($rp)
735
- vpaddq $TEMP4, $ACC4, $ACC4
736
- vmovdqu $ACC3, 32*3-128($rp)
737
- ___
738
- $TEMP5=$ACC0;
739
- $code.=<<___;
740
- vpsrlq \$29, $ACC4, $TEMP1
741
- vpand $AND_MASK, $ACC4, $ACC4
742
- vpsrlq \$29, $ACC5, $TEMP2
743
- vpand $AND_MASK, $ACC5, $ACC5
744
- vpsrlq \$29, $ACC6, $TEMP3
745
- vpermq \$0x93, $TEMP1, $TEMP1
746
- vpand $AND_MASK, $ACC6, $ACC6
747
- vpsrlq \$29, $ACC7, $TEMP4
748
- vpermq \$0x93, $TEMP2, $TEMP2
749
- vpand $AND_MASK, $ACC7, $ACC7
750
- vpsrlq \$29, $ACC8, $TEMP5
751
- vpermq \$0x93, $TEMP3, $TEMP3
752
- vpand $AND_MASK, $ACC8, $ACC8
753
- vpermq \$0x93, $TEMP4, $TEMP4
754
-
755
- vpblendd \$3, $ZERO, $TEMP1, $TEMP0
756
- vpermq \$0x93, $TEMP5, $TEMP5
757
- vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
758
- vpaddq $TEMP0, $ACC4, $ACC4
759
- vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
760
- vpaddq $TEMP1, $ACC5, $ACC5
761
- vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
762
- vpaddq $TEMP2, $ACC6, $ACC6
763
- vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
764
- vpaddq $TEMP3, $ACC7, $ACC7
765
- vpaddq $TEMP4, $ACC8, $ACC8
766
-
767
- vpsrlq \$29, $ACC4, $TEMP1
768
- vpand $AND_MASK, $ACC4, $ACC4
769
- vpsrlq \$29, $ACC5, $TEMP2
770
- vpand $AND_MASK, $ACC5, $ACC5
771
- vpsrlq \$29, $ACC6, $TEMP3
772
- vpermq \$0x93, $TEMP1, $TEMP1
773
- vpand $AND_MASK, $ACC6, $ACC6
774
- vpsrlq \$29, $ACC7, $TEMP4
775
- vpermq \$0x93, $TEMP2, $TEMP2
776
- vpand $AND_MASK, $ACC7, $ACC7
777
- vpsrlq \$29, $ACC8, $TEMP5
778
- vpermq \$0x93, $TEMP3, $TEMP3
779
- vpand $AND_MASK, $ACC8, $ACC8
780
- vpermq \$0x93, $TEMP4, $TEMP4
781
-
782
- vpblendd \$3, $ZERO, $TEMP1, $TEMP0
783
- vpermq \$0x93, $TEMP5, $TEMP5
784
- vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
785
- vpaddq $TEMP0, $ACC4, $ACC4
786
- vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
787
- vpaddq $TEMP1, $ACC5, $ACC5
788
- vmovdqu $ACC4, 32*4-128($rp)
789
- vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
790
- vpaddq $TEMP2, $ACC6, $ACC6
791
- vmovdqu $ACC5, 32*5-128($rp)
792
- vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
793
- vpaddq $TEMP3, $ACC7, $ACC7
794
- vmovdqu $ACC6, 32*6-128($rp)
795
- vpaddq $TEMP4, $ACC8, $ACC8
796
- vmovdqu $ACC7, 32*7-128($rp)
797
- vmovdqu $ACC8, 32*8-128($rp)
798
-
799
- mov $rp, $ap
800
- dec $rep
801
- jne .LOOP_GRANDE_SQR_1024
802
-
803
- vzeroall
804
- mov %rbp, %rax
805
- ___
806
- $code.=<<___ if ($win64);
807
- movaps -0xd8(%rax),%xmm6
808
- movaps -0xc8(%rax),%xmm7
809
- movaps -0xb8(%rax),%xmm8
810
- movaps -0xa8(%rax),%xmm9
811
- movaps -0x98(%rax),%xmm10
812
- movaps -0x88(%rax),%xmm11
813
- movaps -0x78(%rax),%xmm12
814
- movaps -0x68(%rax),%xmm13
815
- movaps -0x58(%rax),%xmm14
816
- movaps -0x48(%rax),%xmm15
817
- ___
818
- $code.=<<___;
819
- mov -48(%rax),%r15
820
- mov -40(%rax),%r14
821
- mov -32(%rax),%r13
822
- mov -24(%rax),%r12
823
- mov -16(%rax),%rbp
824
- mov -8(%rax),%rbx
825
- lea (%rax),%rsp # restore %rsp
826
- .Lsqr_1024_epilogue:
827
- ret
828
- .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
829
- ___
830
- }
831
-
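Both AVX2 routines in this file guard the modulus pointer the same way: the comments above note that an unaligned 256-bit load straddling a 4 KiB page boundary is dramatically slower, so when the ten-vector window at $np crosses a page the data is first copied onto the stack. The test the assembly performs (mask the pointer with 4095, add 32*10, shift right by 12) amounts to the following sketch (illustrative only; `window_crosses_page` is a hypothetical name, not code from the gem):

```rust
/// Hypothetical mirror of the assembly's page-crossing test: does the 320-byte
/// window of ten 32-byte vectors starting at `ptr` reach the next 4 KiB page?
/// Like the assembly, this also fires when the window ends exactly on the
/// page's last byte, which is harmless but conservative.
fn window_crosses_page(ptr: usize) -> bool {
    let offset = ptr & 4095;        // position of ptr inside its 4 KiB page
    (offset + 32 * 10) >> 12 != 0   // nonzero once offset + 320 reaches 4096
}
```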
832
- { # void AMM_WW(
833
- my $rp="%rdi"; # BN_ULONG *rp,
834
- my $ap="%rsi"; # const BN_ULONG *ap,
835
- my $bp="%rdx"; # const BN_ULONG *bp,
836
- my $np="%rcx"; # const BN_ULONG *np,
837
- my $n0="%r8d"; # unsigned int n0);
838
-
839
- # The registers that hold the accumulated redundant result
840
- # The AMM works on 1024 bit operands, and redundant word size is 29
841
- # Therefore: ceil(1024/29)/4 = 9
842
- my $ACC0="%ymm0";
843
- my $ACC1="%ymm1";
844
- my $ACC2="%ymm2";
845
- my $ACC3="%ymm3";
846
- my $ACC4="%ymm4";
847
- my $ACC5="%ymm5";
848
- my $ACC6="%ymm6";
849
- my $ACC7="%ymm7";
850
- my $ACC8="%ymm8";
851
- my $ACC9="%ymm9";
852
-
853
- # Registers that hold the broadcasted words of multiplier, currently used
854
- my $Bi="%ymm10";
855
- my $Yi="%ymm11";
856
-
857
- # Helper registers
858
- my $TEMP0=$ACC0;
859
- my $TEMP1="%ymm12";
860
- my $TEMP2="%ymm13";
861
- my $ZERO="%ymm14";
862
- my $AND_MASK="%ymm15";
863
-
864
- # alu registers that hold the first words of the ACC
865
- my $r0="%r9";
866
- my $r1="%r10";
867
- my $r2="%r11";
868
- my $r3="%r12";
869
-
870
- my $i="%r14d";
871
- my $tmp="%r15";
872
-
873
- $bp="%r13"; # reassigned argument
874
-
875
- $code.=<<___;
876
- .globl rsaz_1024_mul_avx2
877
- .type rsaz_1024_mul_avx2,\@function,5
878
- .align 64
879
- rsaz_1024_mul_avx2:
880
- lea (%rsp), %rax
881
- push %rbx
882
- push %rbp
883
- push %r12
884
- push %r13
885
- push %r14
886
- push %r15
887
- ___
888
- $code.=<<___ if ($win64);
889
- vzeroupper
890
- lea -0xa8(%rsp),%rsp
891
- vmovaps %xmm6,-0xd8(%rax)
892
- vmovaps %xmm7,-0xc8(%rax)
893
- vmovaps %xmm8,-0xb8(%rax)
894
- vmovaps %xmm9,-0xa8(%rax)
895
- vmovaps %xmm10,-0x98(%rax)
896
- vmovaps %xmm11,-0x88(%rax)
897
- vmovaps %xmm12,-0x78(%rax)
898
- vmovaps %xmm13,-0x68(%rax)
899
- vmovaps %xmm14,-0x58(%rax)
900
- vmovaps %xmm15,-0x48(%rax)
901
- .Lmul_1024_body:
902
- ___
903
- $code.=<<___;
904
- mov %rax,%rbp
905
- vzeroall
906
- mov %rdx, $bp # reassigned argument
907
- sub \$64,%rsp
908
-
909
- # unaligned 256-bit load that crosses page boundary can
910
- # cause severe performance degradation here, so if $ap does
911
- # cross page boundary, swap it with $bp [meaning that caller
912
- # is advised to lay down $ap and $bp next to each other, so
913
- # that only one can cross page boundary].
914
- .byte 0x67,0x67
915
- mov $ap, $tmp
916
- and \$4095, $tmp
917
- add \$32*10, $tmp
918
- shr \$12, $tmp
919
- mov $ap, $tmp
920
- cmovnz $bp, $ap
921
- cmovnz $tmp, $bp
922
-
923
- mov $np, $tmp
924
- sub \$-128,$ap # size optimization
925
- sub \$-128,$np
926
- sub \$-128,$rp
927
-
928
- and \$4095, $tmp # see if $np crosses page
929
- add \$32*10, $tmp
930
- .byte 0x67,0x67
931
- shr \$12, $tmp
932
- jz .Lmul_1024_no_n_copy
933
-
934
- # unaligned 256-bit load that crosses page boundary can
935
- # cause severe performance degradation here, so if $np does
936
- # cross page boundary, copy it to stack and make sure stack
937
- # frame doesn't...
938
- sub \$32*10,%rsp
939
- vmovdqu 32*0-128($np), $ACC0
940
- and \$-512, %rsp
941
- vmovdqu 32*1-128($np), $ACC1
942
- vmovdqu 32*2-128($np), $ACC2
943
- vmovdqu 32*3-128($np), $ACC3
944
- vmovdqu 32*4-128($np), $ACC4
945
- vmovdqu 32*5-128($np), $ACC5
946
- vmovdqu 32*6-128($np), $ACC6
947
- vmovdqu 32*7-128($np), $ACC7
948
- vmovdqu 32*8-128($np), $ACC8
949
- lea 64+128(%rsp),$np
950
- vmovdqu $ACC0, 32*0-128($np)
951
- vpxor $ACC0, $ACC0, $ACC0
952
- vmovdqu $ACC1, 32*1-128($np)
953
- vpxor $ACC1, $ACC1, $ACC1
954
- vmovdqu $ACC2, 32*2-128($np)
955
- vpxor $ACC2, $ACC2, $ACC2
956
- vmovdqu $ACC3, 32*3-128($np)
957
- vpxor $ACC3, $ACC3, $ACC3
958
- vmovdqu $ACC4, 32*4-128($np)
959
- vpxor $ACC4, $ACC4, $ACC4
960
- vmovdqu $ACC5, 32*5-128($np)
961
- vpxor $ACC5, $ACC5, $ACC5
962
- vmovdqu $ACC6, 32*6-128($np)
963
- vpxor $ACC6, $ACC6, $ACC6
964
- vmovdqu $ACC7, 32*7-128($np)
965
- vpxor $ACC7, $ACC7, $ACC7
- vmovdqu $ACC8, 32*8-128($np)
- vmovdqa $ACC0, $ACC8
- vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall
- .Lmul_1024_no_n_copy:
- and \$-64,%rsp
-
- mov ($bp), %rbx
- vpbroadcastq ($bp), $Bi
- vmovdqu $ACC0, (%rsp) # clear top of stack
- xor $r0, $r0
- .byte 0x67
- xor $r1, $r1
- xor $r2, $r2
- xor $r3, $r3
-
- vmovdqu .Land_mask(%rip), $AND_MASK
- mov \$9, $i
- vmovdqu $ACC9, 32*9-128($rp) # $ACC9 is zero after vzeroall
- jmp .Loop_mul_1024
-
- .align 32
- .Loop_mul_1024:
- vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*)
- mov %rbx, %rax
- imulq -128($ap), %rax
- add $r0, %rax
- mov %rbx, $r1
- imulq 8-128($ap), $r1
- add 8(%rsp), $r1
-
- mov %rax, $r0
- imull $n0, %eax
- and \$0x1fffffff, %eax
-
- mov %rbx, $r2
- imulq 16-128($ap), $r2
- add 16(%rsp), $r2
-
- mov %rbx, $r3
- imulq 24-128($ap), $r3
- add 24(%rsp), $r3
- vpmuludq 32*1-128($ap),$Bi,$TEMP0
- vmovd %eax, $Yi
- vpaddq $TEMP0,$ACC1,$ACC1
- vpmuludq 32*2-128($ap),$Bi,$TEMP1
- vpbroadcastq $Yi, $Yi
- vpaddq $TEMP1,$ACC2,$ACC2
- vpmuludq 32*3-128($ap),$Bi,$TEMP2
- vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3
- vpaddq $TEMP2,$ACC3,$ACC3
- vpmuludq 32*4-128($ap),$Bi,$TEMP0
- vpaddq $TEMP0,$ACC4,$ACC4
- vpmuludq 32*5-128($ap),$Bi,$TEMP1
- vpaddq $TEMP1,$ACC5,$ACC5
- vpmuludq 32*6-128($ap),$Bi,$TEMP2
- vpaddq $TEMP2,$ACC6,$ACC6
- vpmuludq 32*7-128($ap),$Bi,$TEMP0
- vpermq \$0x93, $ACC9, $ACC9 # correct $ACC3
- vpaddq $TEMP0,$ACC7,$ACC7
- vpmuludq 32*8-128($ap),$Bi,$TEMP1
- vpbroadcastq 8($bp), $Bi
- vpaddq $TEMP1,$ACC8,$ACC8
-
- mov %rax,%rdx
- imulq -128($np),%rax
- add %rax,$r0
- mov %rdx,%rax
- imulq 8-128($np),%rax
- add %rax,$r1
- mov %rdx,%rax
- imulq 16-128($np),%rax
- add %rax,$r2
- shr \$29, $r0
- imulq 24-128($np),%rdx
- add %rdx,$r3
- add $r0, $r1
-
- vpmuludq 32*1-128($np),$Yi,$TEMP2
- vmovq $Bi, %rbx
- vpaddq $TEMP2,$ACC1,$ACC1
- vpmuludq 32*2-128($np),$Yi,$TEMP0
- vpaddq $TEMP0,$ACC2,$ACC2
- vpmuludq 32*3-128($np),$Yi,$TEMP1
- vpaddq $TEMP1,$ACC3,$ACC3
- vpmuludq 32*4-128($np),$Yi,$TEMP2
- vpaddq $TEMP2,$ACC4,$ACC4
- vpmuludq 32*5-128($np),$Yi,$TEMP0
- vpaddq $TEMP0,$ACC5,$ACC5
- vpmuludq 32*6-128($np),$Yi,$TEMP1
- vpaddq $TEMP1,$ACC6,$ACC6
- vpmuludq 32*7-128($np),$Yi,$TEMP2
- vpblendd \$3, $ZERO, $ACC9, $ACC9 # correct $ACC3
- vpaddq $TEMP2,$ACC7,$ACC7
- vpmuludq 32*8-128($np),$Yi,$TEMP0
- vpaddq $ACC9, $ACC3, $ACC3 # correct $ACC3
- vpaddq $TEMP0,$ACC8,$ACC8
-
- mov %rbx, %rax
- imulq -128($ap),%rax
- add %rax,$r1
- vmovdqu -8+32*1-128($ap),$TEMP1
- mov %rbx, %rax
- imulq 8-128($ap),%rax
- add %rax,$r2
- vmovdqu -8+32*2-128($ap),$TEMP2
-
- mov $r1, %rax
- imull $n0, %eax
- and \$0x1fffffff, %eax
-
- imulq 16-128($ap),%rbx
- add %rbx,$r3
- vpmuludq $Bi,$TEMP1,$TEMP1
- vmovd %eax, $Yi
- vmovdqu -8+32*3-128($ap),$TEMP0
- vpaddq $TEMP1,$ACC1,$ACC1
- vpmuludq $Bi,$TEMP2,$TEMP2
- vpbroadcastq $Yi, $Yi
- vmovdqu -8+32*4-128($ap),$TEMP1
- vpaddq $TEMP2,$ACC2,$ACC2
- vpmuludq $Bi,$TEMP0,$TEMP0
- vmovdqu -8+32*5-128($ap),$TEMP2
- vpaddq $TEMP0,$ACC3,$ACC3
- vpmuludq $Bi,$TEMP1,$TEMP1
- vmovdqu -8+32*6-128($ap),$TEMP0
- vpaddq $TEMP1,$ACC4,$ACC4
- vpmuludq $Bi,$TEMP2,$TEMP2
- vmovdqu -8+32*7-128($ap),$TEMP1
- vpaddq $TEMP2,$ACC5,$ACC5
- vpmuludq $Bi,$TEMP0,$TEMP0
- vmovdqu -8+32*8-128($ap),$TEMP2
- vpaddq $TEMP0,$ACC6,$ACC6
- vpmuludq $Bi,$TEMP1,$TEMP1
- vmovdqu -8+32*9-128($ap),$ACC9
- vpaddq $TEMP1,$ACC7,$ACC7
- vpmuludq $Bi,$TEMP2,$TEMP2
- vpaddq $TEMP2,$ACC8,$ACC8
- vpmuludq $Bi,$ACC9,$ACC9
- vpbroadcastq 16($bp), $Bi
-
- mov %rax,%rdx
- imulq -128($np),%rax
- add %rax,$r1
- vmovdqu -8+32*1-128($np),$TEMP0
- mov %rdx,%rax
- imulq 8-128($np),%rax
- add %rax,$r2
- vmovdqu -8+32*2-128($np),$TEMP1
- shr \$29, $r1
- imulq 16-128($np),%rdx
- add %rdx,$r3
- add $r1, $r2
-
- vpmuludq $Yi,$TEMP0,$TEMP0
- vmovq $Bi, %rbx
- vmovdqu -8+32*3-128($np),$TEMP2
- vpaddq $TEMP0,$ACC1,$ACC1
- vpmuludq $Yi,$TEMP1,$TEMP1
- vmovdqu -8+32*4-128($np),$TEMP0
- vpaddq $TEMP1,$ACC2,$ACC2
- vpmuludq $Yi,$TEMP2,$TEMP2
- vmovdqu -8+32*5-128($np),$TEMP1
- vpaddq $TEMP2,$ACC3,$ACC3
- vpmuludq $Yi,$TEMP0,$TEMP0
- vmovdqu -8+32*6-128($np),$TEMP2
- vpaddq $TEMP0,$ACC4,$ACC4
- vpmuludq $Yi,$TEMP1,$TEMP1
- vmovdqu -8+32*7-128($np),$TEMP0
- vpaddq $TEMP1,$ACC5,$ACC5
- vpmuludq $Yi,$TEMP2,$TEMP2
- vmovdqu -8+32*8-128($np),$TEMP1
- vpaddq $TEMP2,$ACC6,$ACC6
- vpmuludq $Yi,$TEMP0,$TEMP0
- vmovdqu -8+32*9-128($np),$TEMP2
- vpaddq $TEMP0,$ACC7,$ACC7
- vpmuludq $Yi,$TEMP1,$TEMP1
- vpaddq $TEMP1,$ACC8,$ACC8
- vpmuludq $Yi,$TEMP2,$TEMP2
- vpaddq $TEMP2,$ACC9,$ACC9
-
- vmovdqu -16+32*1-128($ap),$TEMP0
- mov %rbx,%rax
- imulq -128($ap),%rax
- add $r2,%rax
-
- vmovdqu -16+32*2-128($ap),$TEMP1
- mov %rax,$r2
- imull $n0, %eax
- and \$0x1fffffff, %eax
-
- imulq 8-128($ap),%rbx
- add %rbx,$r3
- vpmuludq $Bi,$TEMP0,$TEMP0
- vmovd %eax, $Yi
- vmovdqu -16+32*3-128($ap),$TEMP2
- vpaddq $TEMP0,$ACC1,$ACC1
- vpmuludq $Bi,$TEMP1,$TEMP1
- vpbroadcastq $Yi, $Yi
- vmovdqu -16+32*4-128($ap),$TEMP0
- vpaddq $TEMP1,$ACC2,$ACC2
- vpmuludq $Bi,$TEMP2,$TEMP2
- vmovdqu -16+32*5-128($ap),$TEMP1
- vpaddq $TEMP2,$ACC3,$ACC3
- vpmuludq $Bi,$TEMP0,$TEMP0
- vmovdqu -16+32*6-128($ap),$TEMP2
- vpaddq $TEMP0,$ACC4,$ACC4
- vpmuludq $Bi,$TEMP1,$TEMP1
- vmovdqu -16+32*7-128($ap),$TEMP0
- vpaddq $TEMP1,$ACC5,$ACC5
- vpmuludq $Bi,$TEMP2,$TEMP2
- vmovdqu -16+32*8-128($ap),$TEMP1
- vpaddq $TEMP2,$ACC6,$ACC6
- vpmuludq $Bi,$TEMP0,$TEMP0
- vmovdqu -16+32*9-128($ap),$TEMP2
- vpaddq $TEMP0,$ACC7,$ACC7
- vpmuludq $Bi,$TEMP1,$TEMP1
- vpaddq $TEMP1,$ACC8,$ACC8
- vpmuludq $Bi,$TEMP2,$TEMP2
- vpbroadcastq 24($bp), $Bi
- vpaddq $TEMP2,$ACC9,$ACC9
-
- vmovdqu -16+32*1-128($np),$TEMP0
- mov %rax,%rdx
- imulq -128($np),%rax
- add %rax,$r2
- vmovdqu -16+32*2-128($np),$TEMP1
- imulq 8-128($np),%rdx
- add %rdx,$r3
- shr \$29, $r2
-
- vpmuludq $Yi,$TEMP0,$TEMP0
- vmovq $Bi, %rbx
- vmovdqu -16+32*3-128($np),$TEMP2
- vpaddq $TEMP0,$ACC1,$ACC1
- vpmuludq $Yi,$TEMP1,$TEMP1
- vmovdqu -16+32*4-128($np),$TEMP0
- vpaddq $TEMP1,$ACC2,$ACC2
- vpmuludq $Yi,$TEMP2,$TEMP2
- vmovdqu -16+32*5-128($np),$TEMP1
- vpaddq $TEMP2,$ACC3,$ACC3
- vpmuludq $Yi,$TEMP0,$TEMP0
- vmovdqu -16+32*6-128($np),$TEMP2
- vpaddq $TEMP0,$ACC4,$ACC4
- vpmuludq $Yi,$TEMP1,$TEMP1
- vmovdqu -16+32*7-128($np),$TEMP0
- vpaddq $TEMP1,$ACC5,$ACC5
- vpmuludq $Yi,$TEMP2,$TEMP2
- vmovdqu -16+32*8-128($np),$TEMP1
- vpaddq $TEMP2,$ACC6,$ACC6
- vpmuludq $Yi,$TEMP0,$TEMP0
- vmovdqu -16+32*9-128($np),$TEMP2
- vpaddq $TEMP0,$ACC7,$ACC7
- vpmuludq $Yi,$TEMP1,$TEMP1
- vmovdqu -24+32*1-128($ap),$TEMP0
- vpaddq $TEMP1,$ACC8,$ACC8
- vpmuludq $Yi,$TEMP2,$TEMP2
- vmovdqu -24+32*2-128($ap),$TEMP1
- vpaddq $TEMP2,$ACC9,$ACC9
-
- add $r2, $r3
- imulq -128($ap),%rbx
- add %rbx,$r3
-
- mov $r3, %rax
- imull $n0, %eax
- and \$0x1fffffff, %eax
-
- vpmuludq $Bi,$TEMP0,$TEMP0
- vmovd %eax, $Yi
- vmovdqu -24+32*3-128($ap),$TEMP2
- vpaddq $TEMP0,$ACC1,$ACC1
- vpmuludq $Bi,$TEMP1,$TEMP1
- vpbroadcastq $Yi, $Yi
- vmovdqu -24+32*4-128($ap),$TEMP0
- vpaddq $TEMP1,$ACC2,$ACC2
- vpmuludq $Bi,$TEMP2,$TEMP2
- vmovdqu -24+32*5-128($ap),$TEMP1
- vpaddq $TEMP2,$ACC3,$ACC3
- vpmuludq $Bi,$TEMP0,$TEMP0
- vmovdqu -24+32*6-128($ap),$TEMP2
- vpaddq $TEMP0,$ACC4,$ACC4
- vpmuludq $Bi,$TEMP1,$TEMP1
- vmovdqu -24+32*7-128($ap),$TEMP0
- vpaddq $TEMP1,$ACC5,$ACC5
- vpmuludq $Bi,$TEMP2,$TEMP2
- vmovdqu -24+32*8-128($ap),$TEMP1
- vpaddq $TEMP2,$ACC6,$ACC6
- vpmuludq $Bi,$TEMP0,$TEMP0
- vmovdqu -24+32*9-128($ap),$TEMP2
- vpaddq $TEMP0,$ACC7,$ACC7
- vpmuludq $Bi,$TEMP1,$TEMP1
- vpaddq $TEMP1,$ACC8,$ACC8
- vpmuludq $Bi,$TEMP2,$TEMP2
- vpbroadcastq 32($bp), $Bi
- vpaddq $TEMP2,$ACC9,$ACC9
- add \$32, $bp # $bp++
-
- vmovdqu -24+32*1-128($np),$TEMP0
- imulq -128($np),%rax
- add %rax,$r3
- shr \$29, $r3
-
- vmovdqu -24+32*2-128($np),$TEMP1
- vpmuludq $Yi,$TEMP0,$TEMP0
- vmovq $Bi, %rbx
- vmovdqu -24+32*3-128($np),$TEMP2
- vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0
- vpmuludq $Yi,$TEMP1,$TEMP1
- vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
- vpaddq $TEMP1,$ACC2,$ACC1
- vmovdqu -24+32*4-128($np),$TEMP0
- vpmuludq $Yi,$TEMP2,$TEMP2
- vmovdqu -24+32*5-128($np),$TEMP1
- vpaddq $TEMP2,$ACC3,$ACC2
- vpmuludq $Yi,$TEMP0,$TEMP0
- vmovdqu -24+32*6-128($np),$TEMP2
- vpaddq $TEMP0,$ACC4,$ACC3
- vpmuludq $Yi,$TEMP1,$TEMP1
- vmovdqu -24+32*7-128($np),$TEMP0
- vpaddq $TEMP1,$ACC5,$ACC4
- vpmuludq $Yi,$TEMP2,$TEMP2
- vmovdqu -24+32*8-128($np),$TEMP1
- vpaddq $TEMP2,$ACC6,$ACC5
- vpmuludq $Yi,$TEMP0,$TEMP0
- vmovdqu -24+32*9-128($np),$TEMP2
- mov $r3, $r0
- vpaddq $TEMP0,$ACC7,$ACC6
- vpmuludq $Yi,$TEMP1,$TEMP1
- add (%rsp), $r0
- vpaddq $TEMP1,$ACC8,$ACC7
- vpmuludq $Yi,$TEMP2,$TEMP2
- vmovq $r3, $TEMP1
- vpaddq $TEMP2,$ACC9,$ACC8
-
- dec $i
- jnz .Loop_mul_1024
- ___
-
- # (*) Original implementation was correcting ACC1-ACC3 for overflow
- # after 7 loop runs, or after 28 iterations, or 56 additions.
- # But as we underutilize resources, it's possible to correct in
- # each iteration with marginal performance loss. But then, as
- # we do it in each iteration, we can correct less digits, and
- # avoid performance penalties completely. Also note that we
- # correct only three digits out of four. This works because
- # most significant digit is subjected to less additions.
-
- $TEMP0 = $ACC9;
- $TEMP3 = $Bi;
- $TEMP4 = $Yi;
- $code.=<<___;
- vpermq \$0, $AND_MASK, $AND_MASK
- vpaddq (%rsp), $TEMP1, $ACC0
-
- vpsrlq \$29, $ACC0, $TEMP1
- vpand $AND_MASK, $ACC0, $ACC0
- vpsrlq \$29, $ACC1, $TEMP2
- vpand $AND_MASK, $ACC1, $ACC1
- vpsrlq \$29, $ACC2, $TEMP3
- vpermq \$0x93, $TEMP1, $TEMP1
- vpand $AND_MASK, $ACC2, $ACC2
- vpsrlq \$29, $ACC3, $TEMP4
- vpermq \$0x93, $TEMP2, $TEMP2
- vpand $AND_MASK, $ACC3, $ACC3
-
- vpblendd \$3, $ZERO, $TEMP1, $TEMP0
- vpermq \$0x93, $TEMP3, $TEMP3
- vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
- vpermq \$0x93, $TEMP4, $TEMP4
- vpaddq $TEMP0, $ACC0, $ACC0
- vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
- vpaddq $TEMP1, $ACC1, $ACC1
- vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
- vpaddq $TEMP2, $ACC2, $ACC2
- vpblendd \$3, $TEMP4, $ZERO, $TEMP4
- vpaddq $TEMP3, $ACC3, $ACC3
- vpaddq $TEMP4, $ACC4, $ACC4
-
- vpsrlq \$29, $ACC0, $TEMP1
- vpand $AND_MASK, $ACC0, $ACC0
- vpsrlq \$29, $ACC1, $TEMP2
- vpand $AND_MASK, $ACC1, $ACC1
- vpsrlq \$29, $ACC2, $TEMP3
- vpermq \$0x93, $TEMP1, $TEMP1
- vpand $AND_MASK, $ACC2, $ACC2
- vpsrlq \$29, $ACC3, $TEMP4
- vpermq \$0x93, $TEMP2, $TEMP2
- vpand $AND_MASK, $ACC3, $ACC3
- vpermq \$0x93, $TEMP3, $TEMP3
-
- vpblendd \$3, $ZERO, $TEMP1, $TEMP0
- vpermq \$0x93, $TEMP4, $TEMP4
- vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
- vpaddq $TEMP0, $ACC0, $ACC0
- vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
- vpaddq $TEMP1, $ACC1, $ACC1
- vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
- vpaddq $TEMP2, $ACC2, $ACC2
- vpblendd \$3, $TEMP4, $ZERO, $TEMP4
- vpaddq $TEMP3, $ACC3, $ACC3
- vpaddq $TEMP4, $ACC4, $ACC4
-
- vmovdqu $ACC0, 0-128($rp)
- vmovdqu $ACC1, 32-128($rp)
- vmovdqu $ACC2, 64-128($rp)
- vmovdqu $ACC3, 96-128($rp)
- ___
-
- $TEMP5=$ACC0;
- $code.=<<___;
- vpsrlq \$29, $ACC4, $TEMP1
- vpand $AND_MASK, $ACC4, $ACC4
- vpsrlq \$29, $ACC5, $TEMP2
- vpand $AND_MASK, $ACC5, $ACC5
- vpsrlq \$29, $ACC6, $TEMP3
- vpermq \$0x93, $TEMP1, $TEMP1
- vpand $AND_MASK, $ACC6, $ACC6
- vpsrlq \$29, $ACC7, $TEMP4
- vpermq \$0x93, $TEMP2, $TEMP2
- vpand $AND_MASK, $ACC7, $ACC7
- vpsrlq \$29, $ACC8, $TEMP5
- vpermq \$0x93, $TEMP3, $TEMP3
- vpand $AND_MASK, $ACC8, $ACC8
- vpermq \$0x93, $TEMP4, $TEMP4
-
- vpblendd \$3, $ZERO, $TEMP1, $TEMP0
- vpermq \$0x93, $TEMP5, $TEMP5
- vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
- vpaddq $TEMP0, $ACC4, $ACC4
- vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
- vpaddq $TEMP1, $ACC5, $ACC5
- vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
- vpaddq $TEMP2, $ACC6, $ACC6
- vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
- vpaddq $TEMP3, $ACC7, $ACC7
- vpaddq $TEMP4, $ACC8, $ACC8
-
- vpsrlq \$29, $ACC4, $TEMP1
- vpand $AND_MASK, $ACC4, $ACC4
- vpsrlq \$29, $ACC5, $TEMP2
- vpand $AND_MASK, $ACC5, $ACC5
- vpsrlq \$29, $ACC6, $TEMP3
- vpermq \$0x93, $TEMP1, $TEMP1
- vpand $AND_MASK, $ACC6, $ACC6
- vpsrlq \$29, $ACC7, $TEMP4
- vpermq \$0x93, $TEMP2, $TEMP2
- vpand $AND_MASK, $ACC7, $ACC7
- vpsrlq \$29, $ACC8, $TEMP5
- vpermq \$0x93, $TEMP3, $TEMP3
- vpand $AND_MASK, $ACC8, $ACC8
- vpermq \$0x93, $TEMP4, $TEMP4
-
- vpblendd \$3, $ZERO, $TEMP1, $TEMP0
- vpermq \$0x93, $TEMP5, $TEMP5
- vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
- vpaddq $TEMP0, $ACC4, $ACC4
- vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
- vpaddq $TEMP1, $ACC5, $ACC5
- vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
- vpaddq $TEMP2, $ACC6, $ACC6
- vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
- vpaddq $TEMP3, $ACC7, $ACC7
- vpaddq $TEMP4, $ACC8, $ACC8
-
- vmovdqu $ACC4, 128-128($rp)
- vmovdqu $ACC5, 160-128($rp)
- vmovdqu $ACC6, 192-128($rp)
- vmovdqu $ACC7, 224-128($rp)
- vmovdqu $ACC8, 256-128($rp)
- vzeroupper
-
- mov %rbp, %rax
- ___
- $code.=<<___ if ($win64);
- movaps -0xd8(%rax),%xmm6
- movaps -0xc8(%rax),%xmm7
- movaps -0xb8(%rax),%xmm8
- movaps -0xa8(%rax),%xmm9
- movaps -0x98(%rax),%xmm10
- movaps -0x88(%rax),%xmm11
- movaps -0x78(%rax),%xmm12
- movaps -0x68(%rax),%xmm13
- movaps -0x58(%rax),%xmm14
- movaps -0x48(%rax),%xmm15
- ___
- $code.=<<___;
- mov -48(%rax),%r15
- mov -40(%rax),%r14
- mov -32(%rax),%r13
- mov -24(%rax),%r12
- mov -16(%rax),%rbp
- mov -8(%rax),%rbx
- lea (%rax),%rsp # restore %rsp
- .Lmul_1024_epilogue:
- ret
- .size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
- ___
- }
- {
- my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi");
- my @T = map("%r$_",(8..11));
-
- $code.=<<___;
- .globl rsaz_1024_red2norm_avx2
- .type rsaz_1024_red2norm_avx2,\@abi-omnipotent
- .align 32
- rsaz_1024_red2norm_avx2:
- sub \$-128,$inp # size optimization
- xor %rax,%rax
- ___
-
- for ($j=0,$i=0; $i<16; $i++) {
- my $k=0;
- while (29*$j<64*($i+1)) { # load data till boundary
- $code.=" mov `8*$j-128`($inp), @T[0]\n";
- $j++; $k++; push(@T,shift(@T));
- }
- $l=$k;
- while ($k>1) { # shift loaded data but last value
- $code.=" shl \$`29*($j-$k)`,@T[-$k]\n";
- $k--;
- }
- $code.=<<___; # shift last value
- mov @T[-1], @T[0]
- shl \$`29*($j-1)`, @T[-1]
- shr \$`-29*($j-1)`, @T[0]
- ___
- while ($l) { # accumulate all values
- $code.=" add @T[-$l], %rax\n";
- $l--;
- }
- $code.=<<___;
- adc \$0, @T[0] # consume eventual carry
- mov %rax, 8*$i($out)
- mov @T[0], %rax
- ___
- push(@T,shift(@T));
- }
- $code.=<<___;
- ret
- .size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
-
- .globl rsaz_1024_norm2red_avx2
- .type rsaz_1024_norm2red_avx2,\@abi-omnipotent
- .align 32
- rsaz_1024_norm2red_avx2:
- sub \$-128,$out # size optimization
- mov ($inp),@T[0]
- mov \$0x1fffffff,%eax
- ___
- for ($j=0,$i=0; $i<16; $i++) {
- $code.=" mov `8*($i+1)`($inp),@T[1]\n" if ($i<15);
- $code.=" xor @T[1],@T[1]\n" if ($i==15);
- my $k=1;
- while (29*($j+1)<64*($i+1)) {
- $code.=<<___;
- mov @T[0],@T[-$k]
- shr \$`29*$j`,@T[-$k]
- and %rax,@T[-$k] # &0x1fffffff
- mov @T[-$k],`8*$j-128`($out)
- ___
- $j++; $k++;
- }
- $code.=<<___;
- shrd \$`29*$j`,@T[1],@T[0]
- and %rax,@T[0]
- mov @T[0],`8*$j-128`($out)
- ___
- $j++;
- push(@T,shift(@T));
- }
- $code.=<<___;
- mov @T[0],`8*$j-128`($out) # zero
- mov @T[0],`8*($j+1)-128`($out)
- mov @T[0],`8*($j+2)-128`($out)
- mov @T[0],`8*($j+3)-128`($out)
- ret
- .size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
- ___
- }
- {
- my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
-
- $code.=<<___;
- .globl rsaz_1024_scatter5_avx2
- .type rsaz_1024_scatter5_avx2,\@abi-omnipotent
- .align 32
- rsaz_1024_scatter5_avx2:
- vzeroupper
- vmovdqu .Lscatter_permd(%rip),%ymm5
- shl \$4,$power
- lea ($out,$power),$out
- mov \$9,%eax
- jmp .Loop_scatter_1024
-
- .align 32
- .Loop_scatter_1024:
- vmovdqu ($inp),%ymm0
- lea 32($inp),$inp
- vpermd %ymm0,%ymm5,%ymm0
- vmovdqu %xmm0,($out)
- lea 16*32($out),$out
- dec %eax
- jnz .Loop_scatter_1024
-
- vzeroupper
- ret
- .size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
-
- .globl rsaz_1024_gather5_avx2
- .type rsaz_1024_gather5_avx2,\@abi-omnipotent
- .align 32
- rsaz_1024_gather5_avx2:
- ___
- $code.=<<___ if ($win64);
- lea -0x88(%rsp),%rax
- vzeroupper
- .LSEH_begin_rsaz_1024_gather5:
- # I can't trust assembler to use specific encoding:-(
- .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
- .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6,-0x20(%rax)
- .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7,-0x10(%rax)
- .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8,0(%rax)
- .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9,0x10(%rax)
- .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10,0x20(%rax)
- .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11,0x30(%rax)
- .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12,0x40(%rax)
- .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13,0x50(%rax)
- .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14,0x60(%rax)
- .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15,0x70(%rax)
- ___
- $code.=<<___;
- lea .Lgather_table(%rip),%r11
- mov $power,%eax
- and \$3,$power
- shr \$2,%eax # cache line number
- shl \$4,$power # offset within cache line
-
- vmovdqu -32(%r11),%ymm7 # .Lgather_permd
- vpbroadcastb 8(%r11,%rax), %xmm8
- vpbroadcastb 7(%r11,%rax), %xmm9
- vpbroadcastb 6(%r11,%rax), %xmm10
- vpbroadcastb 5(%r11,%rax), %xmm11
- vpbroadcastb 4(%r11,%rax), %xmm12
- vpbroadcastb 3(%r11,%rax), %xmm13
- vpbroadcastb 2(%r11,%rax), %xmm14
- vpbroadcastb 1(%r11,%rax), %xmm15
-
- lea 64($inp,$power),$inp
- mov \$64,%r11 # size optimization
- mov \$9,%eax
- jmp .Loop_gather_1024
-
- .align 32
- .Loop_gather_1024:
- vpand -64($inp), %xmm8,%xmm0
- vpand ($inp), %xmm9,%xmm1
- vpand 64($inp), %xmm10,%xmm2
- vpand ($inp,%r11,2), %xmm11,%xmm3
- vpor %xmm0,%xmm1,%xmm1
- vpand 64($inp,%r11,2), %xmm12,%xmm4
- vpor %xmm2,%xmm3,%xmm3
- vpand ($inp,%r11,4), %xmm13,%xmm5
- vpor %xmm1,%xmm3,%xmm3
- vpand 64($inp,%r11,4), %xmm14,%xmm6
- vpor %xmm4,%xmm5,%xmm5
- vpand -128($inp,%r11,8), %xmm15,%xmm2
- lea ($inp,%r11,8),$inp
- vpor %xmm3,%xmm5,%xmm5
- vpor %xmm2,%xmm6,%xmm6
- vpor %xmm5,%xmm6,%xmm6
- vpermd %ymm6,%ymm7,%ymm6
- vmovdqu %ymm6,($out)
- lea 32($out),$out
- dec %eax
- jnz .Loop_gather_1024
-
- vpxor %ymm0,%ymm0,%ymm0
- vmovdqu %ymm0,($out)
- vzeroupper
- ___
- $code.=<<___ if ($win64);
- movaps (%rsp),%xmm6
- movaps 0x10(%rsp),%xmm7
- movaps 0x20(%rsp),%xmm8
- movaps 0x30(%rsp),%xmm9
- movaps 0x40(%rsp),%xmm10
- movaps 0x50(%rsp),%xmm11
- movaps 0x60(%rsp),%xmm12
- movaps 0x70(%rsp),%xmm13
- movaps 0x80(%rsp),%xmm14
- movaps 0x90(%rsp),%xmm15
- lea 0xa8(%rsp),%rsp
- .LSEH_end_rsaz_1024_gather5:
- ___
- $code.=<<___;
- ret
- .size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
- ___
- }
-
- $code.=<<___;
- .extern OPENSSL_ia32cap_P
- .globl rsaz_avx2_eligible
- .type rsaz_avx2_eligible,\@abi-omnipotent
- .align 32
- rsaz_avx2_eligible:
- mov OPENSSL_ia32cap_P+8(%rip),%eax
- ___
- $code.=<<___ if ($addx);
- mov \$`1<<8|1<<19`,%ecx
- mov \$0,%edx
- and %eax,%ecx
- cmp \$`1<<8|1<<19`,%ecx # check for BMI2+AD*X
- cmove %edx,%eax
- ___
- $code.=<<___;
- and \$`1<<5`,%eax
- shr \$5,%eax
- ret
- .size rsaz_avx2_eligible,.-rsaz_avx2_eligible
-
- .align 64
- .Land_mask:
- .quad 0x1fffffff,0x1fffffff,0x1fffffff,-1
- .Lscatter_permd:
- .long 0,2,4,6,7,7,7,7
- .Lgather_permd:
- .long 0,7,1,7,2,7,3,7
- .Lgather_table:
- .byte 0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0
- .align 64
- ___
-
- if ($win64) {
- $rec="%rcx";
- $frame="%rdx";
- $context="%r8";
- $disp="%r9";
-
- $code.=<<___
- .extern __imp_RtlVirtualUnwind
- .type rsaz_se_handler,\@abi-omnipotent
- .align 16
- rsaz_se_handler:
- push %rsi
- push %rdi
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- pushfq
- sub \$64,%rsp
-
- mov 120($context),%rax # pull context->Rax
- mov 248($context),%rbx # pull context->Rip
-
- mov 8($disp),%rsi # disp->ImageBase
- mov 56($disp),%r11 # disp->HandlerData
-
- mov 0(%r11),%r10d # HandlerData[0]
- lea (%rsi,%r10),%r10 # prologue label
- cmp %r10,%rbx # context->Rip<prologue label
- jb .Lcommon_seh_tail
-
- mov 152($context),%rax # pull context->Rsp
-
- mov 4(%r11),%r10d # HandlerData[1]
- lea (%rsi,%r10),%r10 # epilogue label
- cmp %r10,%rbx # context->Rip>=epilogue label
- jae .Lcommon_seh_tail
-
- mov 160($context),%rax # pull context->Rbp
-
- mov -48(%rax),%r15
- mov -40(%rax),%r14
- mov -32(%rax),%r13
- mov -24(%rax),%r12
- mov -16(%rax),%rbp
- mov -8(%rax),%rbx
- mov %r15,240($context)
- mov %r14,232($context)
- mov %r13,224($context)
- mov %r12,216($context)
- mov %rbp,160($context)
- mov %rbx,144($context)
-
- lea -0xd8(%rax),%rsi # %xmm save area
- lea 512($context),%rdi # & context.Xmm6
- mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
- .long 0xa548f3fc # cld; rep movsq
-
- .Lcommon_seh_tail:
- mov 8(%rax),%rdi
- mov 16(%rax),%rsi
- mov %rax,152($context) # restore context->Rsp
- mov %rsi,168($context) # restore context->Rsi
- mov %rdi,176($context) # restore context->Rdi
-
- mov 40($disp),%rdi # disp->ContextRecord
- mov $context,%rsi # context
- mov \$154,%ecx # sizeof(CONTEXT)
- .long 0xa548f3fc # cld; rep movsq
-
- mov $disp,%rsi
- xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
- mov 8(%rsi),%rdx # arg2, disp->ImageBase
- mov 0(%rsi),%r8 # arg3, disp->ControlPc
- mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
- mov 40(%rsi),%r10 # disp->ContextRecord
- lea 56(%rsi),%r11 # &disp->HandlerData
- lea 24(%rsi),%r12 # &disp->EstablisherFrame
- mov %r10,32(%rsp) # arg5
- mov %r11,40(%rsp) # arg6
- mov %r12,48(%rsp) # arg7
- mov %rcx,56(%rsp) # arg8, (NULL)
- call *__imp_RtlVirtualUnwind(%rip)
-
- mov \$1,%eax # ExceptionContinueSearch
- add \$64,%rsp
- popfq
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- pop %rdi
- pop %rsi
- ret
- .size rsaz_se_handler,.-rsaz_se_handler
-
- .section .pdata
- .align 4
- .rva .LSEH_begin_rsaz_1024_sqr_avx2
- .rva .LSEH_end_rsaz_1024_sqr_avx2
- .rva .LSEH_info_rsaz_1024_sqr_avx2
-
- .rva .LSEH_begin_rsaz_1024_mul_avx2
- .rva .LSEH_end_rsaz_1024_mul_avx2
- .rva .LSEH_info_rsaz_1024_mul_avx2
-
- .rva .LSEH_begin_rsaz_1024_gather5
- .rva .LSEH_end_rsaz_1024_gather5
- .rva .LSEH_info_rsaz_1024_gather5
- .section .xdata
- .align 8
- .LSEH_info_rsaz_1024_sqr_avx2:
- .byte 9,0,0,0
- .rva rsaz_se_handler
- .rva .Lsqr_1024_body,.Lsqr_1024_epilogue
- .LSEH_info_rsaz_1024_mul_avx2:
- .byte 9,0,0,0
- .rva rsaz_se_handler
- .rva .Lmul_1024_body,.Lmul_1024_epilogue
- .LSEH_info_rsaz_1024_gather5:
- .byte 0x01,0x33,0x16,0x00
- .byte 0x36,0xf8,0x09,0x00 #vmovaps 0x90(rsp),xmm15
- .byte 0x31,0xe8,0x08,0x00 #vmovaps 0x80(rsp),xmm14
- .byte 0x2c,0xd8,0x07,0x00 #vmovaps 0x70(rsp),xmm13
- .byte 0x27,0xc8,0x06,0x00 #vmovaps 0x60(rsp),xmm12
- .byte 0x22,0xb8,0x05,0x00 #vmovaps 0x50(rsp),xmm11
- .byte 0x1d,0xa8,0x04,0x00 #vmovaps 0x40(rsp),xmm10
- .byte 0x18,0x98,0x03,0x00 #vmovaps 0x30(rsp),xmm9
- .byte 0x13,0x88,0x02,0x00 #vmovaps 0x20(rsp),xmm8
- .byte 0x0e,0x78,0x01,0x00 #vmovaps 0x10(rsp),xmm7
- .byte 0x09,0x68,0x00,0x00 #vmovaps 0x00(rsp),xmm6
- .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
- ___
- }
-
- foreach (split("\n",$code)) {
- s/\`([^\`]*)\`/eval($1)/ge;
-
- s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge or
-
- s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
- s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or
- s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
- s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
- s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
- print $_,"\n";
- }
-
- }}} else {{{
- print <<___; # assembler is too old
- .text
-
- .globl rsaz_avx2_eligible
- .type rsaz_avx2_eligible,\@abi-omnipotent
- rsaz_avx2_eligible:
- xor %eax,%eax
- ret
- .size rsaz_avx2_eligible,.-rsaz_avx2_eligible
-
- .globl rsaz_1024_sqr_avx2
- .globl rsaz_1024_mul_avx2
- .globl rsaz_1024_norm2red_avx2
- .globl rsaz_1024_red2norm_avx2
- .globl rsaz_1024_scatter5_avx2
- .globl rsaz_1024_gather5_avx2
- .type rsaz_1024_sqr_avx2,\@abi-omnipotent
- rsaz_1024_sqr_avx2:
- rsaz_1024_mul_avx2:
- rsaz_1024_norm2red_avx2:
- rsaz_1024_red2norm_avx2:
- rsaz_1024_scatter5_avx2:
- rsaz_1024_gather5_avx2:
- .byte 0x0f,0x0b # ud2
- ret
- .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
- ___
- }}}
-
- close STDOUT;
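
Editor's note: the removed rsaz-avx2.pl above works in a redundant base-2^29 representation. `.Land_mask` holds the 0x1fffffff (2^29 - 1) mask, and the `rsaz_1024_norm2red_avx2` / `rsaz_1024_red2norm_avx2` routines convert a 1024-bit value between sixteen 64-bit words and 29-bit digits. The sketch below is only an illustration of that digit split under those assumptions; it is not code from this gem or from ring, the helper names `norm2red` and `red2norm` are hypothetical, and it ignores the AVX2 memory layout and carry handling performed by the assembly.

    # Hypothetical illustration of the base-2**29 digit split used by the
    # removed rsaz-avx2.pl (not part of ring-native or ring).
    MASK29 = (1 << 29) - 1  # same 0x1fffffff constant as .Land_mask above

    def norm2red(x, digits=36):
        """Split a non-negative integer into 29-bit digits, least significant
        first; 36 digits cover 1024 bits because 36 * 29 = 1044 >= 1024."""
        return [(x >> (29 * i)) & MASK29 for i in range(digits)]

    def red2norm(digits):
        """Recombine 29-bit digits; Python's big integers absorb digits that
        have temporarily grown past 29 bits in the redundant form."""
        x = 0
        for i, d in enumerate(digits):
            x += d << (29 * i)
        return x

    if __name__ == "__main__":
        import random
        n = random.getrandbits(1024)
        assert red2norm(norm2red(n)) == n  # round trip is exact

The point of the redundant form, as the in-loop comments above describe, is that each 29-bit digit leaves headroom in its 64-bit lane, so products and sums can accumulate for a while before the shift-and-mask normalization pass has to fold the excess bits into the next digit.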