ring-native 0.0.0

Files changed (261)
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/Gemfile +3 -0
  4. data/README.md +22 -0
  5. data/Rakefile +1 -0
  6. data/ext/ring/extconf.rb +29 -0
  7. data/lib/ring/native.rb +8 -0
  8. data/lib/ring/native/version.rb +5 -0
  9. data/ring-native.gemspec +25 -0
  10. data/vendor/ring/BUILDING.md +40 -0
  11. data/vendor/ring/Cargo.toml +43 -0
  12. data/vendor/ring/LICENSE +185 -0
  13. data/vendor/ring/Makefile +35 -0
  14. data/vendor/ring/PORTING.md +163 -0
  15. data/vendor/ring/README.md +113 -0
  16. data/vendor/ring/STYLE.md +197 -0
  17. data/vendor/ring/appveyor.yml +27 -0
  18. data/vendor/ring/build.rs +108 -0
  19. data/vendor/ring/crypto/aes/aes.c +1142 -0
  20. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +25 -0
  21. data/vendor/ring/crypto/aes/aes_test.cc +93 -0
  22. data/vendor/ring/crypto/aes/asm/aes-586.pl +2368 -0
  23. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +1249 -0
  24. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +2246 -0
  25. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +1318 -0
  26. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +2084 -0
  27. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +675 -0
  28. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +1364 -0
  29. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +1565 -0
  30. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +841 -0
  31. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +1116 -0
  32. data/vendor/ring/crypto/aes/internal.h +87 -0
  33. data/vendor/ring/crypto/aes/mode_wrappers.c +61 -0
  34. data/vendor/ring/crypto/bn/add.c +394 -0
  35. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +694 -0
  36. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +1503 -0
  37. data/vendor/ring/crypto/bn/asm/bn-586.pl +774 -0
  38. data/vendor/ring/crypto/bn/asm/co-586.pl +287 -0
  39. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +1882 -0
  40. data/vendor/ring/crypto/bn/asm/x86-mont.pl +592 -0
  41. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +599 -0
  42. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +1393 -0
  43. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +3507 -0
  44. data/vendor/ring/crypto/bn/bn.c +352 -0
  45. data/vendor/ring/crypto/bn/bn_asn1.c +74 -0
  46. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +25 -0
  47. data/vendor/ring/crypto/bn/bn_test.cc +1696 -0
  48. data/vendor/ring/crypto/bn/cmp.c +200 -0
  49. data/vendor/ring/crypto/bn/convert.c +433 -0
  50. data/vendor/ring/crypto/bn/ctx.c +311 -0
  51. data/vendor/ring/crypto/bn/div.c +594 -0
  52. data/vendor/ring/crypto/bn/exponentiation.c +1335 -0
  53. data/vendor/ring/crypto/bn/gcd.c +711 -0
  54. data/vendor/ring/crypto/bn/generic.c +1019 -0
  55. data/vendor/ring/crypto/bn/internal.h +316 -0
  56. data/vendor/ring/crypto/bn/montgomery.c +516 -0
  57. data/vendor/ring/crypto/bn/mul.c +888 -0
  58. data/vendor/ring/crypto/bn/prime.c +829 -0
  59. data/vendor/ring/crypto/bn/random.c +334 -0
  60. data/vendor/ring/crypto/bn/rsaz_exp.c +262 -0
  61. data/vendor/ring/crypto/bn/rsaz_exp.h +53 -0
  62. data/vendor/ring/crypto/bn/shift.c +276 -0
  63. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +25 -0
  64. data/vendor/ring/crypto/bytestring/bytestring_test.cc +421 -0
  65. data/vendor/ring/crypto/bytestring/cbb.c +399 -0
  66. data/vendor/ring/crypto/bytestring/cbs.c +227 -0
  67. data/vendor/ring/crypto/bytestring/internal.h +46 -0
  68. data/vendor/ring/crypto/chacha/chacha_generic.c +140 -0
  69. data/vendor/ring/crypto/chacha/chacha_vec.c +323 -0
  70. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +1447 -0
  71. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +153 -0
  72. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +25 -0
  73. data/vendor/ring/crypto/cipher/e_aes.c +390 -0
  74. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +208 -0
  75. data/vendor/ring/crypto/cipher/internal.h +173 -0
  76. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +543 -0
  77. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +9 -0
  78. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +475 -0
  79. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +23 -0
  80. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +422 -0
  81. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +484 -0
  82. data/vendor/ring/crypto/cipher/test/cipher_test.txt +100 -0
  83. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +25 -0
  84. data/vendor/ring/crypto/constant_time_test.c +304 -0
  85. data/vendor/ring/crypto/cpu-arm-asm.S +32 -0
  86. data/vendor/ring/crypto/cpu-arm.c +199 -0
  87. data/vendor/ring/crypto/cpu-intel.c +261 -0
  88. data/vendor/ring/crypto/crypto.c +151 -0
  89. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +2118 -0
  90. data/vendor/ring/crypto/curve25519/curve25519.c +4888 -0
  91. data/vendor/ring/crypto/curve25519/x25519_test.cc +128 -0
  92. data/vendor/ring/crypto/digest/md32_common.h +181 -0
  93. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +2725 -0
  94. data/vendor/ring/crypto/ec/ec.c +193 -0
  95. data/vendor/ring/crypto/ec/ec_curves.c +61 -0
  96. data/vendor/ring/crypto/ec/ec_key.c +228 -0
  97. data/vendor/ring/crypto/ec/ec_montgomery.c +114 -0
  98. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +25 -0
  99. data/vendor/ring/crypto/ec/internal.h +243 -0
  100. data/vendor/ring/crypto/ec/oct.c +253 -0
  101. data/vendor/ring/crypto/ec/p256-64.c +1794 -0
  102. data/vendor/ring/crypto/ec/p256-x86_64-table.h +9548 -0
  103. data/vendor/ring/crypto/ec/p256-x86_64.c +509 -0
  104. data/vendor/ring/crypto/ec/simple.c +1007 -0
  105. data/vendor/ring/crypto/ec/util-64.c +183 -0
  106. data/vendor/ring/crypto/ec/wnaf.c +508 -0
  107. data/vendor/ring/crypto/ecdh/ecdh.c +155 -0
  108. data/vendor/ring/crypto/ecdsa/ecdsa.c +304 -0
  109. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +193 -0
  110. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +25 -0
  111. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +327 -0
  112. data/vendor/ring/crypto/header_removed.h +17 -0
  113. data/vendor/ring/crypto/internal.h +495 -0
  114. data/vendor/ring/crypto/libring.Windows.vcxproj +101 -0
  115. data/vendor/ring/crypto/mem.c +98 -0
  116. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +1045 -0
  117. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +517 -0
  118. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +1393 -0
  119. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +1741 -0
  120. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +422 -0
  121. data/vendor/ring/crypto/modes/ctr.c +226 -0
  122. data/vendor/ring/crypto/modes/gcm.c +1206 -0
  123. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +25 -0
  124. data/vendor/ring/crypto/modes/gcm_test.c +348 -0
  125. data/vendor/ring/crypto/modes/internal.h +299 -0
  126. data/vendor/ring/crypto/perlasm/arm-xlate.pl +170 -0
  127. data/vendor/ring/crypto/perlasm/readme +100 -0
  128. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +1164 -0
  129. data/vendor/ring/crypto/perlasm/x86asm.pl +292 -0
  130. data/vendor/ring/crypto/perlasm/x86gas.pl +263 -0
  131. data/vendor/ring/crypto/perlasm/x86masm.pl +200 -0
  132. data/vendor/ring/crypto/perlasm/x86nasm.pl +187 -0
  133. data/vendor/ring/crypto/poly1305/poly1305.c +331 -0
  134. data/vendor/ring/crypto/poly1305/poly1305_arm.c +301 -0
  135. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +2015 -0
  136. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +25 -0
  137. data/vendor/ring/crypto/poly1305/poly1305_test.cc +80 -0
  138. data/vendor/ring/crypto/poly1305/poly1305_test.txt +52 -0
  139. data/vendor/ring/crypto/poly1305/poly1305_vec.c +892 -0
  140. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +75 -0
  141. data/vendor/ring/crypto/rand/internal.h +32 -0
  142. data/vendor/ring/crypto/rand/rand.c +189 -0
  143. data/vendor/ring/crypto/rand/urandom.c +219 -0
  144. data/vendor/ring/crypto/rand/windows.c +56 -0
  145. data/vendor/ring/crypto/refcount_c11.c +66 -0
  146. data/vendor/ring/crypto/refcount_lock.c +53 -0
  147. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +25 -0
  148. data/vendor/ring/crypto/refcount_test.c +58 -0
  149. data/vendor/ring/crypto/rsa/blinding.c +462 -0
  150. data/vendor/ring/crypto/rsa/internal.h +108 -0
  151. data/vendor/ring/crypto/rsa/padding.c +300 -0
  152. data/vendor/ring/crypto/rsa/rsa.c +450 -0
  153. data/vendor/ring/crypto/rsa/rsa_asn1.c +261 -0
  154. data/vendor/ring/crypto/rsa/rsa_impl.c +944 -0
  155. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +25 -0
  156. data/vendor/ring/crypto/rsa/rsa_test.cc +437 -0
  157. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +436 -0
  158. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +2390 -0
  159. data/vendor/ring/crypto/sha/asm/sha256-586.pl +1275 -0
  160. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +735 -0
  161. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +14 -0
  162. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +14 -0
  163. data/vendor/ring/crypto/sha/asm/sha512-586.pl +911 -0
  164. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +666 -0
  165. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +14 -0
  166. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +14 -0
  167. data/vendor/ring/crypto/sha/sha1.c +271 -0
  168. data/vendor/ring/crypto/sha/sha256.c +204 -0
  169. data/vendor/ring/crypto/sha/sha512.c +355 -0
  170. data/vendor/ring/crypto/test/file_test.cc +326 -0
  171. data/vendor/ring/crypto/test/file_test.h +181 -0
  172. data/vendor/ring/crypto/test/malloc.cc +150 -0
  173. data/vendor/ring/crypto/test/scoped_types.h +95 -0
  174. data/vendor/ring/crypto/test/test.Windows.vcxproj +35 -0
  175. data/vendor/ring/crypto/test/test_util.cc +46 -0
  176. data/vendor/ring/crypto/test/test_util.h +41 -0
  177. data/vendor/ring/crypto/thread_none.c +55 -0
  178. data/vendor/ring/crypto/thread_pthread.c +165 -0
  179. data/vendor/ring/crypto/thread_test.Windows.vcxproj +25 -0
  180. data/vendor/ring/crypto/thread_test.c +200 -0
  181. data/vendor/ring/crypto/thread_win.c +282 -0
  182. data/vendor/ring/examples/checkdigest.rs +103 -0
  183. data/vendor/ring/include/openssl/aes.h +121 -0
  184. data/vendor/ring/include/openssl/arm_arch.h +129 -0
  185. data/vendor/ring/include/openssl/base.h +156 -0
  186. data/vendor/ring/include/openssl/bn.h +794 -0
  187. data/vendor/ring/include/openssl/buffer.h +18 -0
  188. data/vendor/ring/include/openssl/bytestring.h +235 -0
  189. data/vendor/ring/include/openssl/chacha.h +37 -0
  190. data/vendor/ring/include/openssl/cmac.h +76 -0
  191. data/vendor/ring/include/openssl/cpu.h +184 -0
  192. data/vendor/ring/include/openssl/crypto.h +43 -0
  193. data/vendor/ring/include/openssl/curve25519.h +88 -0
  194. data/vendor/ring/include/openssl/ec.h +225 -0
  195. data/vendor/ring/include/openssl/ec_key.h +129 -0
  196. data/vendor/ring/include/openssl/ecdh.h +110 -0
  197. data/vendor/ring/include/openssl/ecdsa.h +156 -0
  198. data/vendor/ring/include/openssl/err.h +201 -0
  199. data/vendor/ring/include/openssl/mem.h +101 -0
  200. data/vendor/ring/include/openssl/obj_mac.h +71 -0
  201. data/vendor/ring/include/openssl/opensslfeatures.h +68 -0
  202. data/vendor/ring/include/openssl/opensslv.h +18 -0
  203. data/vendor/ring/include/openssl/ossl_typ.h +18 -0
  204. data/vendor/ring/include/openssl/poly1305.h +51 -0
  205. data/vendor/ring/include/openssl/rand.h +70 -0
  206. data/vendor/ring/include/openssl/rsa.h +399 -0
  207. data/vendor/ring/include/openssl/thread.h +133 -0
  208. data/vendor/ring/include/openssl/type_check.h +71 -0
  209. data/vendor/ring/mk/Common.props +63 -0
  210. data/vendor/ring/mk/Windows.props +42 -0
  211. data/vendor/ring/mk/WindowsTest.props +18 -0
  212. data/vendor/ring/mk/appveyor.bat +62 -0
  213. data/vendor/ring/mk/bottom_of_makefile.mk +54 -0
  214. data/vendor/ring/mk/ring.mk +266 -0
  215. data/vendor/ring/mk/top_of_makefile.mk +214 -0
  216. data/vendor/ring/mk/travis.sh +40 -0
  217. data/vendor/ring/mk/update-travis-yml.py +229 -0
  218. data/vendor/ring/ring.sln +153 -0
  219. data/vendor/ring/src/aead.rs +682 -0
  220. data/vendor/ring/src/agreement.rs +248 -0
  221. data/vendor/ring/src/c.rs +129 -0
  222. data/vendor/ring/src/constant_time.rs +37 -0
  223. data/vendor/ring/src/der.rs +96 -0
  224. data/vendor/ring/src/digest.rs +690 -0
  225. data/vendor/ring/src/digest_tests.txt +57 -0
  226. data/vendor/ring/src/ecc.rs +28 -0
  227. data/vendor/ring/src/ecc_build.rs +279 -0
  228. data/vendor/ring/src/ecc_curves.rs +117 -0
  229. data/vendor/ring/src/ed25519_tests.txt +2579 -0
  230. data/vendor/ring/src/exe_tests.rs +46 -0
  231. data/vendor/ring/src/ffi.rs +29 -0
  232. data/vendor/ring/src/file_test.rs +187 -0
  233. data/vendor/ring/src/hkdf.rs +153 -0
  234. data/vendor/ring/src/hkdf_tests.txt +59 -0
  235. data/vendor/ring/src/hmac.rs +414 -0
  236. data/vendor/ring/src/hmac_tests.txt +97 -0
  237. data/vendor/ring/src/input.rs +312 -0
  238. data/vendor/ring/src/lib.rs +41 -0
  239. data/vendor/ring/src/pbkdf2.rs +265 -0
  240. data/vendor/ring/src/pbkdf2_tests.txt +113 -0
  241. data/vendor/ring/src/polyfill.rs +57 -0
  242. data/vendor/ring/src/rand.rs +28 -0
  243. data/vendor/ring/src/signature.rs +314 -0
  244. data/vendor/ring/third-party/NIST/README.md +9 -0
  245. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +263 -0
  246. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +309 -0
  247. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +267 -0
  248. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +263 -0
  249. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +309 -0
  250. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +267 -0
  251. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +263 -0
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +309 -0
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +267 -0
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +519 -0
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +309 -0
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +523 -0
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +519 -0
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +309 -0
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +523 -0
  260. data/vendor/ring/third-party/NIST/sha256sums.txt +1 -0
  261. metadata +333 -0
data/vendor/ring/crypto/curve25519/x25519_test.cc
@@ -0,0 +1,128 @@
+ /* Copyright (c) 2015, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+ #include <stdint.h>
+ #include <stdio.h>
+ #include <string.h>
+
+ #include <openssl/curve25519.h>
+
+
+ static bool TestX25519() {
+ /* Taken from
+ * https://tools.ietf.org/html/draft-irtf-cfrg-curves-11#section-5.2 */
+ static const uint8_t kScalar1[32] = {
+ 0xa5, 0x46, 0xe3, 0x6b, 0xf0, 0x52, 0x7c, 0x9d, 0x3b, 0x16, 0x15,
+ 0x4b, 0x82, 0x46, 0x5e, 0xdd, 0x62, 0x14, 0x4c, 0x0a, 0xc1, 0xfc,
+ 0x5a, 0x18, 0x50, 0x6a, 0x22, 0x44, 0xba, 0x44, 0x9a, 0xc4,
+ };
+ static const uint8_t kPoint1[32] = {
+ 0xe6, 0xdb, 0x68, 0x67, 0x58, 0x30, 0x30, 0xdb, 0x35, 0x94, 0xc1,
+ 0xa4, 0x24, 0xb1, 0x5f, 0x7c, 0x72, 0x66, 0x24, 0xec, 0x26, 0xb3,
+ 0x35, 0x3b, 0x10, 0xa9, 0x03, 0xa6, 0xd0, 0xab, 0x1c, 0x4c,
+ };
+
+ uint8_t out[32];
+ X25519(out, kScalar1, kPoint1);
+
+ static const uint8_t kExpected1[32] = {
+ 0xc3, 0xda, 0x55, 0x37, 0x9d, 0xe9, 0xc6, 0x90, 0x8e, 0x94, 0xea,
+ 0x4d, 0xf2, 0x8d, 0x08, 0x4f, 0x32, 0xec, 0xcf, 0x03, 0x49, 0x1c,
+ 0x71, 0xf7, 0x54, 0xb4, 0x07, 0x55, 0x77, 0xa2, 0x85, 0x52,
+ };
+ if (memcmp(kExpected1, out, sizeof(out)) != 0) {
+ fprintf(stderr, "X25519 test one failed.\n");
+ return false;
+ }
+
+ static const uint8_t kScalar2[32] = {
+ 0x4b, 0x66, 0xe9, 0xd4, 0xd1, 0xb4, 0x67, 0x3c, 0x5a, 0xd2, 0x26,
+ 0x91, 0x95, 0x7d, 0x6a, 0xf5, 0xc1, 0x1b, 0x64, 0x21, 0xe0, 0xea,
+ 0x01, 0xd4, 0x2c, 0xa4, 0x16, 0x9e, 0x79, 0x18, 0xba, 0x0d,
+ };
+ static const uint8_t kPoint2[32] = {
+ 0xe5, 0x21, 0x0f, 0x12, 0x78, 0x68, 0x11, 0xd3, 0xf4, 0xb7, 0x95,
+ 0x9d, 0x05, 0x38, 0xae, 0x2c, 0x31, 0xdb, 0xe7, 0x10, 0x6f, 0xc0,
+ 0x3c, 0x3e, 0xfc, 0x4c, 0xd5, 0x49, 0xc7, 0x15, 0xa4, 0x93,
+ };
+
+ X25519(out, kScalar2, kPoint2);
+
+ static const uint8_t kExpected2[32] = {
+ 0x95, 0xcb, 0xde, 0x94, 0x76, 0xe8, 0x90, 0x7d, 0x7a, 0xad, 0xe4,
+ 0x5c, 0xb4, 0xb8, 0x73, 0xf8, 0x8b, 0x59, 0x5a, 0x68, 0x79, 0x9f,
+ 0xa1, 0x52, 0xe6, 0xf8, 0xf7, 0x64, 0x7a, 0xac, 0x79, 0x57,
+ };
+ if (memcmp(kExpected2, out, sizeof(out)) != 0) {
+ fprintf(stderr, "X25519 test two failed.\n");
+ return false;
+ }
+
+ return true;
+ }
+
+ static bool TestX25519SmallOrder() {
+ static const uint8_t kSmallOrderPoint[32] = {
+ 0xe0, 0xeb, 0x7a, 0x7c, 0x3b, 0x41, 0xb8, 0xae, 0x16, 0x56, 0xe3,
+ 0xfa, 0xf1, 0x9f, 0xc4, 0x6a, 0xda, 0x09, 0x8d, 0xeb, 0x9c, 0x32,
+ 0xb1, 0xfd, 0x86, 0x62, 0x05, 0x16, 0x5f, 0x49, 0xb8,
+ };
+
+ uint8_t out[32], private_key[32];
+ memset(private_key, 0x11, sizeof(private_key));
+
+ if (X25519(out, private_key, kSmallOrderPoint)) {
+ fprintf(stderr, "X25519 returned success with a small-order input.\n");
+ return false;
+ }
+
+ return true;
+ }
+
+ static bool TestX25519Iterated() {
+ /* Taken from
+ * https://tools.ietf.org/html/draft-irtf-cfrg-curves-11#section-5.2 */
+ uint8_t scalar[32] = {9}, point[32] = {9}, out[32];
+
+ unsigned i;
+ for (i = 0; i < 1000; i++) {
+ X25519(out, scalar, point);
+ memcpy(point, scalar, sizeof(point));
+ memcpy(scalar, out, sizeof(scalar));
+ }
+
+ static const uint8_t kExpected[32] = {
+ 0x68, 0x4c, 0xf5, 0x9b, 0xa8, 0x33, 0x09, 0x55, 0x28, 0x00, 0xef,
+ 0x56, 0x6f, 0x2f, 0x4d, 0x3c, 0x1c, 0x38, 0x87, 0xc4, 0x93, 0x60,
+ 0xe3, 0x87, 0x5f, 0x2e, 0xb9, 0x4d, 0x99, 0x53, 0x2c, 0x51,
+ };
+
+ if (memcmp(kExpected, scalar, sizeof(kExpected)) != 0) {
+ fprintf(stderr, "Iterated X25519 test failed\n");
+ return false;
+ }
+
+ return true;
+ }
+
+ int main(int argc, char **argv) {
+ if (!TestX25519() ||
+ !TestX25519Iterated() ||
+ !TestX25519SmallOrder()) {
+ return 1;
+ }
+
+ printf("PASS\n");
+ return 0;
+ }
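The test above exercises X25519 from the vendored include/openssl/curve25519.h with the draft-irtf-cfrg-curves-11 (later RFC 7748) vectors and checks that a small-order peer point is rejected. For orientation, here is a minimal sketch of the intended key-agreement usage of that same header; it assumes the X25519_keypair helper declared alongside X25519 in the vendored header, and it is an illustration, not part of the vendored sources.

```c
/* Hedged sketch: two-party X25519 agreement against the vendored
 * <openssl/curve25519.h>. X25519_keypair is assumed to be declared there;
 * X25519 returns 0 for small-order peer values, as the test above checks. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include <openssl/curve25519.h>

int main(void) {
  uint8_t alice_pub[32], alice_priv[32], bob_pub[32], bob_priv[32];
  uint8_t alice_shared[32], bob_shared[32];

  X25519_keypair(alice_pub, alice_priv);
  X25519_keypair(bob_pub, bob_priv);

  if (!X25519(alice_shared, alice_priv, bob_pub) ||
      !X25519(bob_shared, bob_priv, alice_pub)) {
    fprintf(stderr, "degenerate peer value\n");
    return 1;
  }
  /* Both sides now hold the same 32-byte shared secret. */
  return memcmp(alice_shared, bob_shared, sizeof(alice_shared)) != 0;
}
```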
data/vendor/ring/crypto/digest/md32_common.h
@@ -0,0 +1,181 @@
+ /* ====================================================================
+ * Copyright (c) 1999-2007 The OpenSSL Project. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ * software must display the following acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ * endorse or promote products derived from this software without
+ * prior written permission. For written permission, please contact
+ * licensing@OpenSSL.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ * nor may "OpenSSL" appear in their names without prior written
+ * permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ * acknowledgment:
+ * "This product includes software developed by the OpenSSL Project
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ==================================================================== */
+
+ #ifndef OPENSSL_HEADER_MD32_COMMON_H
+ #define OPENSSL_HEADER_MD32_COMMON_H
+
+ #include <openssl/base.h>
+
+
+ #if defined(__cplusplus)
+ extern "C" {
+ #endif
+
+ #define asm __asm__
+
+ /* One of |DATA_ORDER_IS_BIG_ENDIAN| or |DATA_ORDER_IS_LITTLE_ENDIAN| must be
+ * defined to specify the byte order of the input stream. */
+
+ #if !defined(DATA_ORDER_IS_BIG_ENDIAN) && !defined(DATA_ORDER_IS_LITTLE_ENDIAN)
+ #error "DATA_ORDER must be defined!"
+ #endif
+
+ /*
+ * Engage compiler specific rotate intrinsic function if available.
+ */
+ #undef ROTATE
+ # if defined(_MSC_VER)
+ # define ROTATE(a,n) _lrotl(a,n)
+ # elif defined(__ICC)
+ # define ROTATE(a,n) _rotl(a,n)
+ # elif defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM)
+ /*
+ * Some GNU C inline assembler templates. Note that these are
+ * rotates by *constant* number of bits! But that's exactly
+ * what we need here...
+ * <appro@fy.chalmers.se>
+ */
+ # if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
+ # define ROTATE(a,n) ({ register uint32_t ret; \
+ asm ( \
+ "roll %1,%0" \
+ : "=r"(ret) \
+ : "I"(n), "0"((uint32_t)(a)) \
+ : "cc"); \
+ ret; \
+ })
+ # endif /* OPENSSL_X86 || OPENSSL_X86_64 */
+ # endif /* COMPILER */
+
+ #ifndef ROTATE
+ #define ROTATE(a,n) (((a)<<(n))|(((a)&0xffffffff)>>(32-(n))))
+ #endif
+
+ #if defined(DATA_ORDER_IS_BIG_ENDIAN)
+
+ #ifndef PEDANTIC
+ # if defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM)
+ # if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
+ /*
+ * This gives ~30-40% performance improvement in SHA-256 compiled
+ * with gcc [on P4]. Well, first macro to be frank. We can pull
+ * this trick on x86* platforms only, because these CPUs can fetch
+ * unaligned data without raising an exception.
+ */
+ # define HOST_c2l(c,l) ({ uint32_t r=*((const uint32_t *)(c)); \
+ asm ("bswapl %0":"=r"(r):"0"(r)); \
+ (c)+=4; (l)=r; })
+ # define HOST_l2c(l,c) ({ uint32_t r=(l); \
+ asm ("bswapl %0":"=r"(r):"0"(r)); \
+ *((uint32_t *)(c))=r; (c)+=4; r; })
+ # elif defined(__aarch64__)
+ # if defined(__BYTE_ORDER__)
+ # if defined(__ORDER_LITTLE_ENDIAN__) && __BYTE_ORDER__==__ORDER_LITTLE_ENDIAN__
+ # define HOST_c2l(c,l) ({ uint32_t r; \
+ asm ("rev %w0,%w1" \
+ :"=r"(r) \
+ :"r"(*((const uint32_t *)(c))));\
+ (c)+=4; (l)=r; })
+ # define HOST_l2c(l,c) ({ uint32_t r; \
+ asm ("rev %w0,%w1" \
+ :"=r"(r) \
+ :"r"((uint32_t)(l))); \
+ *((uint32_t *)(c))=r; (c)+=4; r; })
+ # elif defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__==__ORDER_BIG_ENDIAN__
+ # define HOST_c2l(c,l) (void)((l)=*((const uint32_t *)(c)), (c)+=4)
+ # define HOST_l2c(l,c) (*((uint32_t *)(c))=(l), (c)+=4, (l))
+ # endif
+ # endif
+ # endif
+ # endif
+ #endif
+
+ #ifndef HOST_c2l
+ #define HOST_c2l(c,l) (void)(l =(((uint32_t)(*((c)++)))<<24), \
+ l|=(((uint32_t)(*((c)++)))<<16), \
+ l|=(((uint32_t)(*((c)++)))<< 8), \
+ l|=(((uint32_t)(*((c)++))) ))
+ #endif
+ #ifndef HOST_l2c
+ #define HOST_l2c(l,c) (*((c)++)=(uint8_t)(((l)>>24)&0xff), \
+ *((c)++)=(uint8_t)(((l)>>16)&0xff), \
+ *((c)++)=(uint8_t)(((l)>> 8)&0xff), \
+ *((c)++)=(uint8_t)(((l) )&0xff), \
+ l)
+ #endif
+
+ #elif defined(DATA_ORDER_IS_LITTLE_ENDIAN)
+
+ #if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
+ /* See comment in DATA_ORDER_IS_BIG_ENDIAN section. */
+ # define HOST_c2l(c,l) (void)((l)=*((const uint32_t *)(c)), (c)+=4)
+ # define HOST_l2c(l,c) (*((uint32_t *)(c))=(l), (c)+=4, l)
+ #endif
+
+ #ifndef HOST_c2l
+ #define HOST_c2l(c,l) (void)(l =(((uint32_t)(*((c)++))) ), \
+ l|=(((uint32_t)(*((c)++)))<< 8), \
+ l|=(((uint32_t)(*((c)++)))<<16), \
+ l|=(((uint32_t)(*((c)++)))<<24))
+ #endif
+ #ifndef HOST_l2c
+ #define HOST_l2c(l,c) (*((c)++)=(uint8_t)(((l) )&0xff), \
+ *((c)++)=(uint8_t)(((l)>> 8)&0xff), \
+ *((c)++)=(uint8_t)(((l)>>16)&0xff), \
+ *((c)++)=(uint8_t)(((l)>>24)&0xff), \
+ l)
+ #endif
+
+ #endif
+
+
+ #if defined(__cplusplus)
+ } /* extern C */
+ #endif
+
+ #endif /* OPENSSL_HEADER_MD32_COMMON_H */
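md32_common.h is the shared skeleton for the MD4/MD5/SHA-1/SHA-256 family: the including file chooses DATA_ORDER_IS_BIG_ENDIAN or DATA_ORDER_IS_LITTLE_ENDIAN and gets HOST_c2l/HOST_l2c macros that convert between the input byte stream and host 32-bit words. The standalone model below illustrates what the portable big-endian fallbacks compute; it does not include the internal header, and the helper names are mine.

```c
/* Standalone model of the portable DATA_ORDER_IS_BIG_ENDIAN fallbacks for
 * HOST_c2l/HOST_l2c in md32_common.h: read or write a uint32_t as four
 * big-endian bytes while advancing the byte cursor. Illustration only. */
#include <assert.h>
#include <stdint.h>

static uint32_t host_c2l(const uint8_t **c) {
  uint32_t l = ((uint32_t)(*c)[0] << 24) | ((uint32_t)(*c)[1] << 16) |
               ((uint32_t)(*c)[2] << 8) | (uint32_t)(*c)[3];
  *c += 4;
  return l;
}

static void host_l2c(uint32_t l, uint8_t **c) {
  (*c)[0] = (uint8_t)(l >> 24);
  (*c)[1] = (uint8_t)(l >> 16);
  (*c)[2] = (uint8_t)(l >> 8);
  (*c)[3] = (uint8_t)l;
  *c += 4;
}

int main(void) {
  const uint8_t msg[4] = {0x01, 0x23, 0x45, 0x67};
  uint8_t out[4];
  const uint8_t *in = msg;
  uint8_t *op = out;
  uint32_t w = host_c2l(&in);  /* 0x01234567 regardless of host endianness */
  host_l2c(w, &op);            /* round-trips to the original byte order */
  assert(w == 0x01234567 && out[0] == 0x01 && out[3] == 0x67);
  return 0;
}
```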
data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl
@@ -0,0 +1,2725 @@
+ #!/usr/bin/env perl
+
+ # Copyright (c) 2014, Intel Corporation.
+ #
+ # Permission to use, copy, modify, and/or distribute this software for any
+ # purpose with or without fee is hereby granted, provided that the above
+ # copyright notice and this permission notice appear in all copies.
+ #
+ # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ # SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ # OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ # Developers and authors:
+ # Shay Gueron (1, 2), and Vlad Krasnov (1)
+ # (1) Intel Corporation, Israel Development Center
+ # (2) University of Haifa
+
+ # Reference:
+ # S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
+ # 256 Bit Primes"
+
+ # Further optimization by <appro@openssl.org>:
+ #
+ # this/original
+ # Opteron +12-49%
+ # Bulldozer +14-45%
+ # P4 +18-46%
+ # Westmere +12-34%
+ # Sandy Bridge +9-35%
+ # Ivy Bridge +9-35%
+ # Haswell +8-37%
+ # Broadwell +18-58%
+ # Atom +15-50%
+ # VIA Nano +43-160%
+ #
+ # Ranges denote minimum and maximum improvement coefficients depending
+ # on benchmark.
+
+ $flavour = shift;
+ $output = shift;
+ if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+ $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+ die "can't locate x86_64-xlate.pl";
+
+ open OUT,"| \"$^X\" $xlate $flavour $output";
+ *STDOUT=*OUT;
+
+ # TODO: enable these after testing. $avx goes to two and $addx to one.
+ $avx=0;
+ $addx=0;
+
+ $code.=<<___;
+ .text
+ .extern OPENSSL_ia32cap_P
+
+ # The polynomial
+ .align 64
+ .Lpoly:
+ .quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
+
+ .LOne:
+ .long 1,1,1,1,1,1,1,1
+ .LTwo:
+ .long 2,2,2,2,2,2,2,2
+ .LThree:
+ .long 3,3,3,3,3,3,3,3
+ .LONE_mont:
+ .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
+ ___
+
+ {
+ ################################################################################
+ # void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]);
+
+ my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
+ my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
+ my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
+
+ $code.=<<___;
+
+ .type ecp_nistz256_mul_by_2,\@function,2
+ .align 64
+ ecp_nistz256_mul_by_2:
+ push %r12
+ push %r13
+
+ mov 8*0($a_ptr), $a0
+ mov 8*1($a_ptr), $a1
+ add $a0, $a0 # a0:a3+a0:a3
+ mov 8*2($a_ptr), $a2
+ adc $a1, $a1
+ mov 8*3($a_ptr), $a3
+ lea .Lpoly(%rip), $a_ptr
+ mov $a0, $t0
+ adc $a2, $a2
+ adc $a3, $a3
+ mov $a1, $t1
+ sbb $t4, $t4
+
+ sub 8*0($a_ptr), $a0
+ mov $a2, $t2
+ sbb 8*1($a_ptr), $a1
+ sbb 8*2($a_ptr), $a2
+ mov $a3, $t3
+ sbb 8*3($a_ptr), $a3
+ test $t4, $t4
+
+ cmovz $t0, $a0
+ cmovz $t1, $a1
+ mov $a0, 8*0($r_ptr)
+ cmovz $t2, $a2
+ mov $a1, 8*1($r_ptr)
+ cmovz $t3, $a3
+ mov $a2, 8*2($r_ptr)
+ mov $a3, 8*3($r_ptr)
+
+ pop %r13
+ pop %r12
+ ret
+ .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
+
+ ################################################################################
+ # void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
+ .globl ecp_nistz256_neg
+ .type ecp_nistz256_neg,\@function,2
+ .align 32
+ ecp_nistz256_neg:
+ push %r12
+ push %r13
+
+ xor $a0, $a0
+ xor $a1, $a1
+ xor $a2, $a2
+ xor $a3, $a3
+ xor $t4, $t4
+
+ sub 8*0($a_ptr), $a0
+ sbb 8*1($a_ptr), $a1
+ sbb 8*2($a_ptr), $a2
+ mov $a0, $t0
+ sbb 8*3($a_ptr), $a3
+ lea .Lpoly(%rip), $a_ptr
+ mov $a1, $t1
+ sbb \$0, $t4
+
+ add 8*0($a_ptr), $a0
+ mov $a2, $t2
+ adc 8*1($a_ptr), $a1
+ adc 8*2($a_ptr), $a2
+ mov $a3, $t3
+ adc 8*3($a_ptr), $a3
+ test $t4, $t4
+
+ cmovz $t0, $a0
+ cmovz $t1, $a1
+ mov $a0, 8*0($r_ptr)
+ cmovz $t2, $a2
+ mov $a1, 8*1($r_ptr)
+ cmovz $t3, $a3
+ mov $a2, 8*2($r_ptr)
+ mov $a3, 8*3($r_ptr)
+
+ pop %r13
+ pop %r12
+ ret
+ .size ecp_nistz256_neg,.-ecp_nistz256_neg
+ ___
+ }
+ {
+ my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
+ my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
+ my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
+ my ($poly1,$poly3)=($acc6,$acc7);
+
+ $code.=<<___;
+ ################################################################################
+ # void ecp_nistz256_mul_mont(
+ # uint64_t res[4],
+ # uint64_t a[4],
+ # uint64_t b[4]);
+
+ .globl ecp_nistz256_mul_mont
+ .type ecp_nistz256_mul_mont,\@function,3
+ .align 32
+ ecp_nistz256_mul_mont:
+ ___
+ $code.=<<___ if ($addx);
+ mov \$0x80100, %ecx
+ and OPENSSL_ia32cap_P+8(%rip), %ecx
+ ___
+ $code.=<<___;
+ .Lmul_mont:
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ ___
+ $code.=<<___ if ($addx);
+ cmp \$0x80100, %ecx
+ je .Lmul_montx
+ ___
+ $code.=<<___;
+ mov $b_org, $b_ptr
+ mov 8*0($b_org), %rax
+ mov 8*0($a_ptr), $acc1
+ mov 8*1($a_ptr), $acc2
+ mov 8*2($a_ptr), $acc3
+ mov 8*3($a_ptr), $acc4
+
+ call __ecp_nistz256_mul_montq
+ ___
+ $code.=<<___ if ($addx);
+ jmp .Lmul_mont_done
+
+ .align 32
+ .Lmul_montx:
+ mov $b_org, $b_ptr
+ mov 8*0($b_org), %rdx
+ mov 8*0($a_ptr), $acc1
+ mov 8*1($a_ptr), $acc2
+ mov 8*2($a_ptr), $acc3
+ mov 8*3($a_ptr), $acc4
+ lea -128($a_ptr), $a_ptr # control u-op density
+
+ call __ecp_nistz256_mul_montx
+ ___
+ $code.=<<___;
+ .Lmul_mont_done:
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ pop %rbp
+ ret
+ .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
+
+ .type __ecp_nistz256_mul_montq,\@abi-omnipotent
+ .align 32
+ __ecp_nistz256_mul_montq:
+ ########################################################################
+ # Multiply a by b[0]
+ mov %rax, $t1
+ mulq $acc1
+ mov .Lpoly+8*1(%rip),$poly1
+ mov %rax, $acc0
+ mov $t1, %rax
+ mov %rdx, $acc1
+
+ mulq $acc2
+ mov .Lpoly+8*3(%rip),$poly3
+ add %rax, $acc1
+ mov $t1, %rax
+ adc \$0, %rdx
+ mov %rdx, $acc2
+
+ mulq $acc3
+ add %rax, $acc2
+ mov $t1, %rax
+ adc \$0, %rdx
+ mov %rdx, $acc3
+
+ mulq $acc4
+ add %rax, $acc3
+ mov $acc0, %rax
+ adc \$0, %rdx
+ xor $acc5, $acc5
+ mov %rdx, $acc4
+
+ ########################################################################
+ # First reduction step
+ # Basically now we want to multiply acc[0] by p256,
+ # and add the result to the acc.
+ # Due to the special form of p256 we do some optimizations
+ #
+ # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
+ # then we add acc[0] and get acc[0] x 2^96
+
+ mov $acc0, $t1
+ shl \$32, $acc0
+ mulq $poly3
+ shr \$32, $t1
+ add $acc0, $acc1 # +=acc[0]<<96
+ adc $t1, $acc2
+ adc %rax, $acc3
+ mov 8*1($b_ptr), %rax
+ adc %rdx, $acc4
+ adc \$0, $acc5
+ xor $acc0, $acc0
+
+ ########################################################################
+ # Multiply by b[1]
+ mov %rax, $t1
+ mulq 8*0($a_ptr)
+ add %rax, $acc1
+ mov $t1, %rax
+ adc \$0, %rdx
+ mov %rdx, $t0
+
+ mulq 8*1($a_ptr)
+ add $t0, $acc2
+ adc \$0, %rdx
+ add %rax, $acc2
+ mov $t1, %rax
+ adc \$0, %rdx
+ mov %rdx, $t0
+
+ mulq 8*2($a_ptr)
+ add $t0, $acc3
+ adc \$0, %rdx
+ add %rax, $acc3
+ mov $t1, %rax
+ adc \$0, %rdx
+ mov %rdx, $t0
+
+ mulq 8*3($a_ptr)
+ add $t0, $acc4
+ adc \$0, %rdx
+ add %rax, $acc4
+ mov $acc1, %rax
+ adc %rdx, $acc5
+ adc \$0, $acc0
+
+ ########################################################################
+ # Second reduction step
+ mov $acc1, $t1
+ shl \$32, $acc1
+ mulq $poly3
+ shr \$32, $t1
+ add $acc1, $acc2
+ adc $t1, $acc3
+ adc %rax, $acc4
+ mov 8*2($b_ptr), %rax
+ adc %rdx, $acc5
+ adc \$0, $acc0
+ xor $acc1, $acc1
+
+ ########################################################################
+ # Multiply by b[2]
+ mov %rax, $t1
+ mulq 8*0($a_ptr)
+ add %rax, $acc2
+ mov $t1, %rax
+ adc \$0, %rdx
+ mov %rdx, $t0
+
+ mulq 8*1($a_ptr)
+ add $t0, $acc3
+ adc \$0, %rdx
+ add %rax, $acc3
+ mov $t1, %rax
+ adc \$0, %rdx
+ mov %rdx, $t0
+
+ mulq 8*2($a_ptr)
+ add $t0, $acc4
+ adc \$0, %rdx
+ add %rax, $acc4
+ mov $t1, %rax
+ adc \$0, %rdx
+ mov %rdx, $t0
+
+ mulq 8*3($a_ptr)
+ add $t0, $acc5
+ adc \$0, %rdx
+ add %rax, $acc5
+ mov $acc2, %rax
+ adc %rdx, $acc0
+ adc \$0, $acc1
+
+ ########################################################################
+ # Third reduction step
+ mov $acc2, $t1
+ shl \$32, $acc2
+ mulq $poly3
+ shr \$32, $t1
+ add $acc2, $acc3
+ adc $t1, $acc4
+ adc %rax, $acc5
+ mov 8*3($b_ptr), %rax
+ adc %rdx, $acc0
+ adc \$0, $acc1
+ xor $acc2, $acc2
+
+ ########################################################################
+ # Multiply by b[3]
+ mov %rax, $t1
+ mulq 8*0($a_ptr)
+ add %rax, $acc3
+ mov $t1, %rax
+ adc \$0, %rdx
+ mov %rdx, $t0
+
+ mulq 8*1($a_ptr)
+ add $t0, $acc4
+ adc \$0, %rdx
+ add %rax, $acc4
+ mov $t1, %rax
+ adc \$0, %rdx
+ mov %rdx, $t0
+
+ mulq 8*2($a_ptr)
+ add $t0, $acc5
+ adc \$0, %rdx
+ add %rax, $acc5
+ mov $t1, %rax
+ adc \$0, %rdx
+ mov %rdx, $t0
+
+ mulq 8*3($a_ptr)
+ add $t0, $acc0
+ adc \$0, %rdx
+ add %rax, $acc0
+ mov $acc3, %rax
+ adc %rdx, $acc1
+ adc \$0, $acc2
+
+ ########################################################################
+ # Final reduction step
+ mov $acc3, $t1
+ shl \$32, $acc3
+ mulq $poly3
+ shr \$32, $t1
+ add $acc3, $acc4
+ adc $t1, $acc5
+ mov $acc4, $t0
+ adc %rax, $acc0
+ adc %rdx, $acc1
+ mov $acc5, $t1
+ adc \$0, $acc2
+
+ ########################################################################
+ # Branch-less conditional subtraction of P
+ sub \$-1, $acc4 # .Lpoly[0]
+ mov $acc0, $t2
+ sbb $poly1, $acc5 # .Lpoly[1]
+ sbb \$0, $acc0 # .Lpoly[2]
+ mov $acc1, $t3
+ sbb $poly3, $acc1 # .Lpoly[3]
+ sbb \$0, $acc2
+
+ cmovc $t0, $acc4
+ cmovc $t1, $acc5
+ mov $acc4, 8*0($r_ptr)
+ cmovc $t2, $acc0
+ mov $acc5, 8*1($r_ptr)
+ cmovc $t3, $acc1
+ mov $acc0, 8*2($r_ptr)
+ mov $acc1, 8*3($r_ptr)
+
+ ret
+ .size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
+
+ ################################################################################
+ # void ecp_nistz256_sqr_mont(
+ # uint64_t res[4],
+ # uint64_t a[4]);
+
+ # we optimize the square according to S.Gueron and V.Krasnov,
+ # "Speeding up Big-Number Squaring"
+ .globl ecp_nistz256_sqr_mont
+ .type ecp_nistz256_sqr_mont,\@function,2
+ .align 32
+ ecp_nistz256_sqr_mont:
+ ___
+ $code.=<<___ if ($addx);
+ mov \$0x80100, %ecx
+ and OPENSSL_ia32cap_P+8(%rip), %ecx
+ ___
+ $code.=<<___;
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ ___
+ $code.=<<___ if ($addx);
+ cmp \$0x80100, %ecx
+ je .Lsqr_montx
+ ___
+ $code.=<<___;
+ mov 8*0($a_ptr), %rax
+ mov 8*1($a_ptr), $acc6
+ mov 8*2($a_ptr), $acc7
+ mov 8*3($a_ptr), $acc0
+
+ call __ecp_nistz256_sqr_montq
+ ___
+ $code.=<<___ if ($addx);
+ jmp .Lsqr_mont_done
+
+ .align 32
+ .Lsqr_montx:
+ mov 8*0($a_ptr), %rdx
+ mov 8*1($a_ptr), $acc6
+ mov 8*2($a_ptr), $acc7
+ mov 8*3($a_ptr), $acc0
+ lea -128($a_ptr), $a_ptr # control u-op density
+
+ call __ecp_nistz256_sqr_montx
+ ___
+ $code.=<<___;
+ .Lsqr_mont_done:
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ pop %rbp
+ ret
+ .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
+
+ .type __ecp_nistz256_sqr_montq,\@abi-omnipotent
+ .align 32
+ __ecp_nistz256_sqr_montq:
+ mov %rax, $acc5
+ mulq $acc6 # a[1]*a[0]
+ mov %rax, $acc1
+ mov $acc7, %rax
+ mov %rdx, $acc2
+
+ mulq $acc5 # a[0]*a[2]
+ add %rax, $acc2
+ mov $acc0, %rax
+ adc \$0, %rdx
+ mov %rdx, $acc3
+
+ mulq $acc5 # a[0]*a[3]
+ add %rax, $acc3
+ mov $acc7, %rax
+ adc \$0, %rdx
+ mov %rdx, $acc4
+
+ #################################
+ mulq $acc6 # a[1]*a[2]
+ add %rax, $acc3
+ mov $acc0, %rax
+ adc \$0, %rdx
+ mov %rdx, $t1
+
+ mulq $acc6 # a[1]*a[3]
+ add %rax, $acc4
+ mov $acc0, %rax
+ adc \$0, %rdx
+ add $t1, $acc4
+ mov %rdx, $acc5
+ adc \$0, $acc5
+
+ #################################
+ mulq $acc7 # a[2]*a[3]
+ xor $acc7, $acc7
+ add %rax, $acc5
+ mov 8*0($a_ptr), %rax
+ mov %rdx, $acc6
+ adc \$0, $acc6
+
+ add $acc1, $acc1 # acc1:6<<1
+ adc $acc2, $acc2
+ adc $acc3, $acc3
+ adc $acc4, $acc4
+ adc $acc5, $acc5
+ adc $acc6, $acc6
+ adc \$0, $acc7
+
+ mulq %rax
+ mov %rax, $acc0
+ mov 8*1($a_ptr), %rax
+ mov %rdx, $t0
+
+ mulq %rax
+ add $t0, $acc1
+ adc %rax, $acc2
+ mov 8*2($a_ptr), %rax
+ adc \$0, %rdx
+ mov %rdx, $t0
+
+ mulq %rax
+ add $t0, $acc3
+ adc %rax, $acc4
+ mov 8*3($a_ptr), %rax
+ adc \$0, %rdx
+ mov %rdx, $t0
+
+ mulq %rax
+ add $t0, $acc5
+ adc %rax, $acc6
+ mov $acc0, %rax
+ adc %rdx, $acc7
+
+ mov .Lpoly+8*1(%rip), $a_ptr
+ mov .Lpoly+8*3(%rip), $t1
+
+ ##########################################
+ # Now the reduction
+ # First iteration
+ mov $acc0, $t0
+ shl \$32, $acc0
+ mulq $t1
+ shr \$32, $t0
+ add $acc0, $acc1 # +=acc[0]<<96
+ adc $t0, $acc2
+ adc %rax, $acc3
+ mov $acc1, %rax
+ adc \$0, %rdx
+
+ ##########################################
+ # Second iteration
+ mov $acc1, $t0
+ shl \$32, $acc1
+ mov %rdx, $acc0
+ mulq $t1
+ shr \$32, $t0
+ add $acc1, $acc2
+ adc $t0, $acc3
+ adc %rax, $acc0
+ mov $acc2, %rax
+ adc \$0, %rdx
+
+ ##########################################
+ # Third iteration
+ mov $acc2, $t0
+ shl \$32, $acc2
+ mov %rdx, $acc1
+ mulq $t1
+ shr \$32, $t0
+ add $acc2, $acc3
+ adc $t0, $acc0
+ adc %rax, $acc1
+ mov $acc3, %rax
+ adc \$0, %rdx
+
+ ###########################################
+ # Last iteration
+ mov $acc3, $t0
+ shl \$32, $acc3
+ mov %rdx, $acc2
+ mulq $t1
+ shr \$32, $t0
+ add $acc3, $acc0
+ adc $t0, $acc1
+ adc %rax, $acc2
+ adc \$0, %rdx
+ xor $acc3, $acc3
+
+ ############################################
+ # Add the rest of the acc
+ add $acc0, $acc4
+ adc $acc1, $acc5
+ mov $acc4, $acc0
+ adc $acc2, $acc6
+ adc %rdx, $acc7
+ mov $acc5, $acc1
+ adc \$0, $acc3
+
+ sub \$-1, $acc4 # .Lpoly[0]
+ mov $acc6, $acc2
+ sbb $a_ptr, $acc5 # .Lpoly[1]
+ sbb \$0, $acc6 # .Lpoly[2]
+ mov $acc7, $t0
+ sbb $t1, $acc7 # .Lpoly[3]
+ sbb \$0, $acc3
+
+ cmovc $acc0, $acc4
+ cmovc $acc1, $acc5
+ mov $acc4, 8*0($r_ptr)
+ cmovc $acc2, $acc6
+ mov $acc5, 8*1($r_ptr)
+ cmovc $t0, $acc7
+ mov $acc6, 8*2($r_ptr)
+ mov $acc7, 8*3($r_ptr)
+
+ ret
+ .size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
+ ___
+
+ if ($addx) {
+ $code.=<<___;
+ .type __ecp_nistz256_mul_montx,\@abi-omnipotent
+ .align 32
+ __ecp_nistz256_mul_montx:
+ ########################################################################
+ # Multiply by b[0]
+ mulx $acc1, $acc0, $acc1
+ mulx $acc2, $t0, $acc2
+ mov \$32, $poly1
+ xor $acc5, $acc5 # cf=0
+ mulx $acc3, $t1, $acc3
+ mov .Lpoly+8*3(%rip), $poly3
+ adc $t0, $acc1
+ mulx $acc4, $t0, $acc4
+ mov $acc0, %rdx
+ adc $t1, $acc2
+ shlx $poly1,$acc0,$t1
+ adc $t0, $acc3
+ shrx $poly1,$acc0,$t0
+ adc \$0, $acc4
+
+ ########################################################################
+ # First reduction step
+ add $t1, $acc1
+ adc $t0, $acc2
+
+ mulx $poly3, $t0, $t1
+ mov 8*1($b_ptr), %rdx
+ adc $t0, $acc3
+ adc $t1, $acc4
+ adc \$0, $acc5
+ xor $acc0, $acc0 # $acc0=0,cf=0,of=0
+
+ ########################################################################
+ # Multiply by b[1]
+ mulx 8*0+128($a_ptr), $t0, $t1
+ adcx $t0, $acc1
+ adox $t1, $acc2
+
+ mulx 8*1+128($a_ptr), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+
+ mulx 8*2+128($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*3+128($a_ptr), $t0, $t1
+ mov $acc1, %rdx
+ adcx $t0, $acc4
+ shlx $poly1, $acc1, $t0
+ adox $t1, $acc5
+ shrx $poly1, $acc1, $t1
+
+ adcx $acc0, $acc5
+ adox $acc0, $acc0
+ adc \$0, $acc0
+
+ ########################################################################
+ # Second reduction step
+ add $t0, $acc2
+ adc $t1, $acc3
+
+ mulx $poly3, $t0, $t1
+ mov 8*2($b_ptr), %rdx
+ adc $t0, $acc4
+ adc $t1, $acc5
+ adc \$0, $acc0
+ xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0
+
+ ########################################################################
+ # Multiply by b[2]
+ mulx 8*0+128($a_ptr), $t0, $t1
+ adcx $t0, $acc2
+ adox $t1, $acc3
+
+ mulx 8*1+128($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*2+128($a_ptr), $t0, $t1
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ mulx 8*3+128($a_ptr), $t0, $t1
+ mov $acc2, %rdx
+ adcx $t0, $acc5
+ shlx $poly1, $acc2, $t0
+ adox $t1, $acc0
+ shrx $poly1, $acc2, $t1
+
+ adcx $acc1, $acc0
+ adox $acc1, $acc1
+ adc \$0, $acc1
+
+ ########################################################################
+ # Third reduction step
+ add $t0, $acc3
+ adc $t1, $acc4
+
+ mulx $poly3, $t0, $t1
+ mov 8*3($b_ptr), %rdx
+ adc $t0, $acc5
+ adc $t1, $acc0
+ adc \$0, $acc1
+ xor $acc2, $acc2 # $acc2=0,cf=0,of=0
+
+ ########################################################################
+ # Multiply by b[3]
+ mulx 8*0+128($a_ptr), $t0, $t1
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx 8*1+128($a_ptr), $t0, $t1
+ adcx $t0, $acc4
+ adox $t1, $acc5
+
+ mulx 8*2+128($a_ptr), $t0, $t1
+ adcx $t0, $acc5
+ adox $t1, $acc0
+
+ mulx 8*3+128($a_ptr), $t0, $t1
+ mov $acc3, %rdx
+ adcx $t0, $acc0
+ shlx $poly1, $acc3, $t0
+ adox $t1, $acc1
+ shrx $poly1, $acc3, $t1
+
+ adcx $acc2, $acc1
+ adox $acc2, $acc2
+ adc \$0, $acc2
+
+ ########################################################################
+ # Fourth reduction step
+ add $t0, $acc4
+ adc $t1, $acc5
+
+ mulx $poly3, $t0, $t1
+ mov $acc4, $t2
+ mov .Lpoly+8*1(%rip), $poly1
+ adc $t0, $acc0
+ mov $acc5, $t3
+ adc $t1, $acc1
+ adc \$0, $acc2
+
+ ########################################################################
+ # Branch-less conditional subtraction of P
+ xor %eax, %eax
+ mov $acc0, $t0
+ sbb \$-1, $acc4 # .Lpoly[0]
+ sbb $poly1, $acc5 # .Lpoly[1]
+ sbb \$0, $acc0 # .Lpoly[2]
+ mov $acc1, $t1
+ sbb $poly3, $acc1 # .Lpoly[3]
+ sbb \$0, $acc2
+
+ cmovc $t2, $acc4
+ cmovc $t3, $acc5
+ mov $acc4, 8*0($r_ptr)
+ cmovc $t0, $acc0
+ mov $acc5, 8*1($r_ptr)
+ cmovc $t1, $acc1
+ mov $acc0, 8*2($r_ptr)
+ mov $acc1, 8*3($r_ptr)
+
+ ret
+ .size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
+
+ .type __ecp_nistz256_sqr_montx,\@abi-omnipotent
+ .align 32
+ __ecp_nistz256_sqr_montx:
+ mulx $acc6, $acc1, $acc2 # a[0]*a[1]
+ mulx $acc7, $t0, $acc3 # a[0]*a[2]
+ xor %eax, %eax
+ adc $t0, $acc2
+ mulx $acc0, $t1, $acc4 # a[0]*a[3]
+ mov $acc6, %rdx
+ adc $t1, $acc3
+ adc \$0, $acc4
+ xor $acc5, $acc5 # $acc5=0,cf=0,of=0
+
+ #################################
+ mulx $acc7, $t0, $t1 # a[1]*a[2]
+ adcx $t0, $acc3
+ adox $t1, $acc4
+
+ mulx $acc0, $t0, $t1 # a[1]*a[3]
+ mov $acc7, %rdx
+ adcx $t0, $acc4
+ adox $t1, $acc5
+ adc \$0, $acc5
+
+ #################################
+ mulx $acc0, $t0, $acc6 # a[2]*a[3]
+ mov 8*0+128($a_ptr), %rdx
+ xor $acc7, $acc7 # $acc7=0,cf=0,of=0
+ adcx $acc1, $acc1 # acc1:6<<1
+ adox $t0, $acc5
+ adcx $acc2, $acc2
+ adox $acc7, $acc6 # of=0
+
+ mulx %rdx, $acc0, $t1
+ mov 8*1+128($a_ptr), %rdx
+ adcx $acc3, $acc3
+ adox $t1, $acc1
+ adcx $acc4, $acc4
+ mulx %rdx, $t0, $t4
+ mov 8*2+128($a_ptr), %rdx
+ adcx $acc5, $acc5
+ adox $t0, $acc2
+ adcx $acc6, $acc6
+ .byte 0x67
+ mulx %rdx, $t0, $t1
+ mov 8*3+128($a_ptr), %rdx
+ adox $t4, $acc3
+ adcx $acc7, $acc7
+ adox $t0, $acc4
+ mov \$32, $a_ptr
+ adox $t1, $acc5
+ .byte 0x67,0x67
+ mulx %rdx, $t0, $t4
+ mov $acc0, %rdx
+ adox $t0, $acc6
+ shlx $a_ptr, $acc0, $t0
+ adox $t4, $acc7
+ shrx $a_ptr, $acc0, $t4
+ mov .Lpoly+8*3(%rip), $t1
+
+ # reduction step 1
+ add $t0, $acc1
+ adc $t4, $acc2
+
+ mulx $t1, $t0, $acc0
+ mov $acc1, %rdx
+ adc $t0, $acc3
+ shlx $a_ptr, $acc1, $t0
+ adc \$0, $acc0
+ shrx $a_ptr, $acc1, $t4
+
+ # reduction step 2
+ add $t0, $acc2
+ adc $t4, $acc3
+
+ mulx $t1, $t0, $acc1
+ mov $acc2, %rdx
+ adc $t0, $acc0
+ shlx $a_ptr, $acc2, $t0
+ adc \$0, $acc1
+ shrx $a_ptr, $acc2, $t4
+
+ # reduction step 3
+ add $t0, $acc3
+ adc $t4, $acc0
+
+ mulx $t1, $t0, $acc2
+ mov $acc3, %rdx
+ adc $t0, $acc1
+ shlx $a_ptr, $acc3, $t0
+ adc \$0, $acc2
+ shrx $a_ptr, $acc3, $t4
+
+ # reduction step 4
+ add $t0, $acc0
+ adc $t4, $acc1
+
+ mulx $t1, $t0, $acc3
+ adc $t0, $acc2
+ adc \$0, $acc3
+
+ xor $t3, $t3 # cf=0
+ adc $acc0, $acc4 # accumulate upper half
+ mov .Lpoly+8*1(%rip), $a_ptr
+ adc $acc1, $acc5
+ mov $acc4, $acc0
+ adc $acc2, $acc6
+ adc $acc3, $acc7
+ mov $acc5, $acc1
+ adc \$0, $t3
+
+ xor %eax, %eax # cf=0
+ sbb \$-1, $acc4 # .Lpoly[0]
+ mov $acc6, $acc2
+ sbb $a_ptr, $acc5 # .Lpoly[1]
+ sbb \$0, $acc6 # .Lpoly[2]
+ mov $acc7, $acc3
+ sbb $t1, $acc7 # .Lpoly[3]
+ sbb \$0, $t3
+
+ cmovc $acc0, $acc4
+ cmovc $acc1, $acc5
+ mov $acc4, 8*0($r_ptr)
+ cmovc $acc2, $acc6
+ mov $acc5, 8*1($r_ptr)
+ cmovc $acc3, $acc7
+ mov $acc6, 8*2($r_ptr)
+ mov $acc7, 8*3($r_ptr)
+
+ ret
+ .size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
+ ___
+ }
+ }
+ {
+ my ($r_ptr,$in_ptr)=("%rdi","%rsi");
+ my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
+ my ($t0,$t1,$t2)=("%rcx","%r12","%r13");
+
+ $code.=<<___;
+ ################################################################################
+ # void ecp_nistz256_from_mont(
+ # uint64_t res[4],
+ # uint64_t in[4]);
+ # This one performs Montgomery multiplication by 1, so we only need the reduction
+
+ .globl ecp_nistz256_from_mont
+ .type ecp_nistz256_from_mont,\@function,2
+ .align 32
+ ecp_nistz256_from_mont:
+ push %r12
+ push %r13
+
+ mov 8*0($in_ptr), %rax
+ mov .Lpoly+8*3(%rip), $t2
+ mov 8*1($in_ptr), $acc1
+ mov 8*2($in_ptr), $acc2
+ mov 8*3($in_ptr), $acc3
+ mov %rax, $acc0
+ mov .Lpoly+8*1(%rip), $t1
+
+ #########################################
+ # First iteration
+ mov %rax, $t0
+ shl \$32, $acc0
+ mulq $t2
+ shr \$32, $t0
+ add $acc0, $acc1
+ adc $t0, $acc2
+ adc %rax, $acc3
+ mov $acc1, %rax
+ adc \$0, %rdx
+
+ #########################################
+ # Second iteration
+ mov $acc1, $t0
+ shl \$32, $acc1
+ mov %rdx, $acc0
+ mulq $t2
+ shr \$32, $t0
+ add $acc1, $acc2
+ adc $t0, $acc3
+ adc %rax, $acc0
+ mov $acc2, %rax
+ adc \$0, %rdx
+
+ ##########################################
+ # Third iteration
+ mov $acc2, $t0
+ shl \$32, $acc2
+ mov %rdx, $acc1
+ mulq $t2
+ shr \$32, $t0
+ add $acc2, $acc3
+ adc $t0, $acc0
+ adc %rax, $acc1
+ mov $acc3, %rax
+ adc \$0, %rdx
+
+ ###########################################
+ # Last iteration
+ mov $acc3, $t0
+ shl \$32, $acc3
+ mov %rdx, $acc2
+ mulq $t2
+ shr \$32, $t0
+ add $acc3, $acc0
+ adc $t0, $acc1
+ mov $acc0, $t0
+ adc %rax, $acc2
+ mov $acc1, $in_ptr
+ adc \$0, %rdx
+
+ sub \$-1, $acc0
+ mov $acc2, %rax
+ sbb $t1, $acc1
+ sbb \$0, $acc2
+ mov %rdx, $acc3
+ sbb $t2, %rdx
+ sbb $t2, $t2
+
+ cmovnz $t0, $acc0
+ cmovnz $in_ptr, $acc1
+ mov $acc0, 8*0($r_ptr)
+ cmovnz %rax, $acc2
+ mov $acc1, 8*1($r_ptr)
+ cmovz %rdx, $acc3
+ mov $acc2, 8*2($r_ptr)
+ mov $acc3, 8*3($r_ptr)
+
+ pop %r13
+ pop %r12
+ ret
+ .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
+ ___
+ }
+ {
+ my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
+ my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
+ my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
+ my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
+
+ $code.=<<___;
+ ################################################################################
+ # void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
+ .globl ecp_nistz256_select_w5
+ .type ecp_nistz256_select_w5,\@abi-omnipotent
+ .align 32
+ ecp_nistz256_select_w5:
+ ___
+ $code.=<<___ if ($avx>1);
+ mov OPENSSL_ia32cap_P+8(%rip), %eax
+ test \$`1<<5`, %eax
+ jnz .Lavx2_select_w5
+ ___
+ $code.=<<___ if ($win64);
+ lea -0x88(%rsp), %rax
+ .LSEH_begin_ecp_nistz256_select_w5:
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
+ .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
+ .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax)
+ .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax)
+ .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax)
+ .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax)
+ .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax)
+ .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax)
+ .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax)
+ ___
+ $code.=<<___;
+ movdqa .LOne(%rip), $ONE
+ movd $index, $INDEX
+
+ pxor $Ra, $Ra
+ pxor $Rb, $Rb
+ pxor $Rc, $Rc
+ pxor $Rd, $Rd
+ pxor $Re, $Re
+ pxor $Rf, $Rf
+
+ movdqa $ONE, $M0
+ pshufd \$0, $INDEX, $INDEX
+
+ mov \$16, %rax
+ .Lselect_loop_sse_w5:
+
+ movdqa $M0, $TMP0
+ paddd $ONE, $M0
+ pcmpeqd $INDEX, $TMP0
+
+ movdqa 16*0($in_t), $T0a
+ movdqa 16*1($in_t), $T0b
+ movdqa 16*2($in_t), $T0c
+ movdqa 16*3($in_t), $T0d
+ movdqa 16*4($in_t), $T0e
+ movdqa 16*5($in_t), $T0f
+ lea 16*6($in_t), $in_t
+
+ pand $TMP0, $T0a
+ pand $TMP0, $T0b
+ por $T0a, $Ra
+ pand $TMP0, $T0c
+ por $T0b, $Rb
+ pand $TMP0, $T0d
+ por $T0c, $Rc
+ pand $TMP0, $T0e
+ por $T0d, $Rd
+ pand $TMP0, $T0f
+ por $T0e, $Re
+ por $T0f, $Rf
+
+ dec %rax
+ jnz .Lselect_loop_sse_w5
+
+ movdqu $Ra, 16*0($val)
+ movdqu $Rb, 16*1($val)
+ movdqu $Rc, 16*2($val)
+ movdqu $Rd, 16*3($val)
+ movdqu $Re, 16*4($val)
+ movdqu $Rf, 16*5($val)
+ ___
+ $code.=<<___ if ($win64);
+ movaps (%rsp), %xmm6
+ movaps 0x10(%rsp), %xmm7
+ movaps 0x20(%rsp), %xmm8
+ movaps 0x30(%rsp), %xmm9
+ movaps 0x40(%rsp), %xmm10
+ movaps 0x50(%rsp), %xmm11
+ movaps 0x60(%rsp), %xmm12
+ movaps 0x70(%rsp), %xmm13
+ movaps 0x80(%rsp), %xmm14
+ movaps 0x90(%rsp), %xmm15
+ lea 0xa8(%rsp), %rsp
+ .LSEH_end_ecp_nistz256_select_w5:
+ ___
+ $code.=<<___;
+ ret
+ .size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
+
+ ################################################################################
+ # void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
+ .globl ecp_nistz256_select_w7
+ .type ecp_nistz256_select_w7,\@abi-omnipotent
+ .align 32
+ ecp_nistz256_select_w7:
+ ___
+ $code.=<<___ if ($avx>1);
+ mov OPENSSL_ia32cap_P+8(%rip), %eax
+ test \$`1<<5`, %eax
+ jnz .Lavx2_select_w7
+ ___
+ $code.=<<___ if ($win64);
+ lea -0x88(%rsp), %rax
+ .LSEH_begin_ecp_nistz256_select_w7:
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
+ .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
+ .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax)
+ .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax)
+ .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax)
+ .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax)
+ .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax)
+ .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax)
+ .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax)
+ .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax)
+ ___
+ $code.=<<___;
+ movdqa .LOne(%rip), $M0
+ movd $index, $INDEX
+
+ pxor $Ra, $Ra
+ pxor $Rb, $Rb
+ pxor $Rc, $Rc
+ pxor $Rd, $Rd
+
+ movdqa $M0, $ONE
+ pshufd \$0, $INDEX, $INDEX
+ mov \$64, %rax
+
+ .Lselect_loop_sse_w7:
+ movdqa $M0, $TMP0
+ paddd $ONE, $M0
+ movdqa 16*0($in_t), $T0a
+ movdqa 16*1($in_t), $T0b
+ pcmpeqd $INDEX, $TMP0
+ movdqa 16*2($in_t), $T0c
+ movdqa 16*3($in_t), $T0d
+ lea 16*4($in_t), $in_t
+
+ pand $TMP0, $T0a
+ pand $TMP0, $T0b
+ por $T0a, $Ra
+ pand $TMP0, $T0c
+ por $T0b, $Rb
+ pand $TMP0, $T0d
+ por $T0c, $Rc
+ prefetcht0 255($in_t)
+ por $T0d, $Rd
+
+ dec %rax
+ jnz .Lselect_loop_sse_w7
+
+ movdqu $Ra, 16*0($val)
+ movdqu $Rb, 16*1($val)
+ movdqu $Rc, 16*2($val)
+ movdqu $Rd, 16*3($val)
+ ___
+ $code.=<<___ if ($win64);
+ movaps (%rsp), %xmm6
+ movaps 0x10(%rsp), %xmm7
+ movaps 0x20(%rsp), %xmm8
+ movaps 0x30(%rsp), %xmm9
+ movaps 0x40(%rsp), %xmm10
+ movaps 0x50(%rsp), %xmm11
+ movaps 0x60(%rsp), %xmm12
+ movaps 0x70(%rsp), %xmm13
+ movaps 0x80(%rsp), %xmm14
+ movaps 0x90(%rsp), %xmm15
+ lea 0xa8(%rsp), %rsp
+ .LSEH_end_ecp_nistz256_select_w7:
+ ___
+ $code.=<<___;
+ ret
+ .size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
+ ___
+ }
+ if ($avx>1) {
+ my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
+ my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4));
+ my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9));
+ my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));
+
+ $code.=<<___;
+ ################################################################################
+ # void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index);
+ .type ecp_nistz256_avx2_select_w5,\@abi-omnipotent
+ .align 32
+ ecp_nistz256_avx2_select_w5:
+ .Lavx2_select_w5:
+ vzeroupper
+ ___
+ $code.=<<___ if ($win64);
+ lea -0x88(%rsp), %rax
+ .LSEH_begin_ecp_nistz256_avx2_select_w5:
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
1303
+ .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax)
1304
+ .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax)
1305
+ .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8, 0(%rax)
1306
+ .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9, 0x10(%rax)
1307
+ .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10, 0x20(%rax)
1308
+ .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11, 0x30(%rax)
1309
+ .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12, 0x40(%rax)
1310
+ .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13, 0x50(%rax)
1311
+ .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14, 0x60(%rax)
1312
+ .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15, 0x70(%rax)
1313
+ ___
1314
+ $code.=<<___;
1315
+ vmovdqa .LTwo(%rip), $TWO
1316
+
1317
+ vpxor $Ra, $Ra, $Ra
1318
+ vpxor $Rb, $Rb, $Rb
1319
+ vpxor $Rc, $Rc, $Rc
1320
+
1321
+ vmovdqa .LOne(%rip), $M0
1322
+ vmovdqa .LTwo(%rip), $M1
1323
+
1324
+ vmovd $index, %xmm1
1325
+ vpermd $INDEX, $Ra, $INDEX
1326
+
1327
+ mov \$8, %rax
1328
+ .Lselect_loop_avx2_w5:
1329
+
1330
+ vmovdqa 32*0($in_t), $T0a
1331
+ vmovdqa 32*1($in_t), $T0b
1332
+ vmovdqa 32*2($in_t), $T0c
1333
+
1334
+ vmovdqa 32*3($in_t), $T1a
1335
+ vmovdqa 32*4($in_t), $T1b
1336
+ vmovdqa 32*5($in_t), $T1c
1337
+
1338
+ vpcmpeqd $INDEX, $M0, $TMP0
1339
+ vpcmpeqd $INDEX, $M1, $TMP1
1340
+
1341
+ vpaddd $TWO, $M0, $M0
1342
+ vpaddd $TWO, $M1, $M1
1343
+ lea 32*6($in_t), $in_t
1344
+
1345
+ vpand $TMP0, $T0a, $T0a
1346
+ vpand $TMP0, $T0b, $T0b
1347
+ vpand $TMP0, $T0c, $T0c
1348
+ vpand $TMP1, $T1a, $T1a
1349
+ vpand $TMP1, $T1b, $T1b
1350
+ vpand $TMP1, $T1c, $T1c
1351
+
1352
+ vpxor $T0a, $Ra, $Ra
1353
+ vpxor $T0b, $Rb, $Rb
1354
+ vpxor $T0c, $Rc, $Rc
1355
+ vpxor $T1a, $Ra, $Ra
1356
+ vpxor $T1b, $Rb, $Rb
1357
+ vpxor $T1c, $Rc, $Rc
1358
+
1359
+ dec %rax
1360
+ jnz .Lselect_loop_avx2_w5
1361
+
1362
+ vmovdqu $Ra, 32*0($val)
1363
+ vmovdqu $Rb, 32*1($val)
1364
+ vmovdqu $Rc, 32*2($val)
1365
+ vzeroupper
1366
+ ___
1367
+ $code.=<<___ if ($win64);
1368
+ movaps (%rsp), %xmm6
1369
+ movaps 0x10(%rsp), %xmm7
1370
+ movaps 0x20(%rsp), %xmm8
1371
+ movaps 0x30(%rsp), %xmm9
1372
+ movaps 0x40(%rsp), %xmm10
1373
+ movaps 0x50(%rsp), %xmm11
1374
+ movaps 0x60(%rsp), %xmm12
1375
+ movaps 0x70(%rsp), %xmm13
1376
+ movaps 0x80(%rsp), %xmm14
1377
+ movaps 0x90(%rsp), %xmm15
1378
+ lea 0xa8(%rsp), %rsp
1379
+ .LSEH_end_ecp_nistz256_avx2_select_w5:
1380
+ ___
1381
+ $code.=<<___;
1382
+ ret
1383
+ .size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
1384
+ ___
1385
+ }
1386
+ if ($avx>1) {
1387
+ my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
1388
+ my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3));
1389
+ my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7));
1390
+ my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11));
1391
+ my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));
1392
+
1393
+ $code.=<<___;
1394
+
1395
+ ################################################################################
1396
+ # void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index);
1397
+ .globl ecp_nistz256_avx2_select_w7
1398
+ .type ecp_nistz256_avx2_select_w7,\@abi-omnipotent
1399
+ .align 32
1400
+ ecp_nistz256_avx2_select_w7:
1401
+ .Lavx2_select_w7:
1402
+ vzeroupper
1403
+ ___
1404
+ $code.=<<___ if ($win64);
1405
+ lea -0x88(%rsp), %rax
1406
+ .LSEH_begin_ecp_nistz256_avx2_select_w7:
1407
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
1408
+ .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax)
1409
+ .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax)
1410
+ .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8, 0(%rax)
1411
+ .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9, 0x10(%rax)
1412
+ .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10, 0x20(%rax)
1413
+ .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11, 0x30(%rax)
1414
+ .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12, 0x40(%rax)
1415
+ .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13, 0x50(%rax)
1416
+ .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14, 0x60(%rax)
1417
+ .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15, 0x70(%rax)
1418
+ ___
1419
+ $code.=<<___;
1420
+ vmovdqa .LThree(%rip), $THREE
1421
+
1422
+ vpxor $Ra, $Ra, $Ra
1423
+ vpxor $Rb, $Rb, $Rb
1424
+
1425
+ vmovdqa .LOne(%rip), $M0
1426
+ vmovdqa .LTwo(%rip), $M1
1427
+ vmovdqa .LThree(%rip), $M2
1428
+
1429
+ vmovd $index, %xmm1
1430
+ vpermd $INDEX, $Ra, $INDEX
1431
+ # Skip index = 0, because it is implicitly the point at infinity
1432
+
1433
+ mov \$21, %rax
1434
+ .Lselect_loop_avx2_w7:
1435
+
1436
+ vmovdqa 32*0($in_t), $T0a
1437
+ vmovdqa 32*1($in_t), $T0b
1438
+
1439
+ vmovdqa 32*2($in_t), $T1a
1440
+ vmovdqa 32*3($in_t), $T1b
1441
+
1442
+ vmovdqa 32*4($in_t), $T2a
1443
+ vmovdqa 32*5($in_t), $T2b
1444
+
1445
+ vpcmpeqd $INDEX, $M0, $TMP0
1446
+ vpcmpeqd $INDEX, $M1, $TMP1
1447
+ vpcmpeqd $INDEX, $M2, $TMP2
1448
+
1449
+ vpaddd $THREE, $M0, $M0
1450
+ vpaddd $THREE, $M1, $M1
1451
+ vpaddd $THREE, $M2, $M2
1452
+ lea 32*6($in_t), $in_t
1453
+
1454
+ vpand $TMP0, $T0a, $T0a
1455
+ vpand $TMP0, $T0b, $T0b
1456
+ vpand $TMP1, $T1a, $T1a
1457
+ vpand $TMP1, $T1b, $T1b
1458
+ vpand $TMP2, $T2a, $T2a
1459
+ vpand $TMP2, $T2b, $T2b
1460
+
1461
+ vpxor $T0a, $Ra, $Ra
1462
+ vpxor $T0b, $Rb, $Rb
1463
+ vpxor $T1a, $Ra, $Ra
1464
+ vpxor $T1b, $Rb, $Rb
1465
+ vpxor $T2a, $Ra, $Ra
1466
+ vpxor $T2b, $Rb, $Rb
1467
+
1468
+ dec %rax
1469
+ jnz .Lselect_loop_avx2_w7
1470
+
1471
+
1472
+ vmovdqa 32*0($in_t), $T0a
1473
+ vmovdqa 32*1($in_t), $T0b
1474
+
1475
+ vpcmpeqd $INDEX, $M0, $TMP0
1476
+
1477
+ vpand $TMP0, $T0a, $T0a
1478
+ vpand $TMP0, $T0b, $T0b
1479
+
1480
+ vpxor $T0a, $Ra, $Ra
1481
+ vpxor $T0b, $Rb, $Rb
1482
+
1483
+ vmovdqu $Ra, 32*0($val)
1484
+ vmovdqu $Rb, 32*1($val)
1485
+ vzeroupper
1486
+ ___
1487
+ $code.=<<___ if ($win64);
1488
+ movaps (%rsp), %xmm6
1489
+ movaps 0x10(%rsp), %xmm7
1490
+ movaps 0x20(%rsp), %xmm8
1491
+ movaps 0x30(%rsp), %xmm9
1492
+ movaps 0x40(%rsp), %xmm10
1493
+ movaps 0x50(%rsp), %xmm11
1494
+ movaps 0x60(%rsp), %xmm12
1495
+ movaps 0x70(%rsp), %xmm13
1496
+ movaps 0x80(%rsp), %xmm14
1497
+ movaps 0x90(%rsp), %xmm15
1498
+ lea 0xa8(%rsp), %rsp
1499
+ .LSEH_end_ecp_nistz256_avx2_select_w7:
1500
+ ___
1501
+ $code.=<<___;
1502
+ ret
1503
+ .size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
1504
+ ___
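The AVX2 paths widen the same scan to 256-bit registers. select_w5 keeps two running counters ($M0/$M1, stepped by .LTwo) and folds two 96-byte entries per pass, 8 passes for 16 entries; select_w7 keeps three counters stepped by .LThree and folds three 64-byte entries per pass, 21 passes plus one trailing entry (21*3 + 1 = 64, with index 0 again reserved for the point at infinity). The test of bit 5 of OPENSSL_ia32cap_P+8 in the SSE entry points is the CPUID leaf-7 EBX AVX2 bit, which is what routes execution here. A scalar sketch of the two-entries-per-pass accumulation, illustrative only:

```c
#include <stdint.h>
#include <string.h>

/* Two window entries folded per pass, mirroring the $M0/$M1 mask pair in
 * ecp_nistz256_avx2_select_w5 (a sketch; entry numbering starts at 1). */
static void select_w5_two_per_pass(uint64_t val[12],
                                   const uint64_t in_t[16 * 12],
                                   uint32_t index) {
  uint64_t out[12] = {0};
  for (uint32_t i = 0; i < 16; i += 2) {
    uint64_t m0 = (uint64_t)0 - (uint64_t)((i + 1) == index);
    uint64_t m1 = (uint64_t)0 - (uint64_t)((i + 2) == index);
    for (size_t j = 0; j < 12; j++) {
      out[j] |= (in_t[i * 12 + j] & m0) | (in_t[(i + 1) * 12 + j] & m1);
    }
  }
  memcpy(val, out, sizeof(out));
}
```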
1505
+ } else {
1506
+ $code.=<<___;
1507
+ .globl ecp_nistz256_avx2_select_w7
1508
+ .type ecp_nistz256_avx2_select_w7,\@function,3
1509
+ .align 32
1510
+ ecp_nistz256_avx2_select_w7:
1511
+ .byte 0x0f,0x0b # ud2
1512
+ ret
1513
+ .size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
1514
+ ___
1515
+ }
1516
+ {{{
1517
+ ########################################################################
1518
+ # This block implements higher level point_double, point_add and
1519
+ # point_add_affine. The key to performance in this case is to allow
1520
+ # out-of-order execution logic to overlap computations from next step
1521
+ # with tail processing from the current step. By using a tailored calling
1522
+ # sequence we minimize inter-step overhead to give the processor a better
1523
+ # shot at overlapping operations...
1524
+ #
1525
+ # You will notice that the input data is copied to the stack. The trouble
1526
+ # is that there are no registers to spare for holding the original
1527
+ # pointers, and reloading those pointers would create undesired
1528
+ # dependencies on the effective address calculation paths. In other
1529
+ # words, this is deliberately done to favour out-of-order execution logic.
1530
+ # <appro@openssl.org>
1531
+
1532
+ my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
1533
+ my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
1534
+ my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
1535
+ my ($poly1,$poly3)=($acc6,$acc7);
1536
+
1537
+ sub load_for_mul () {
1538
+ my ($a,$b,$src0) = @_;
1539
+ my $bias = $src0 eq "%rax" ? 0 : -128;
1540
+
1541
+ " mov $b, $src0
1542
+ lea $b, $b_ptr
1543
+ mov 8*0+$a, $acc1
1544
+ mov 8*1+$a, $acc2
1545
+ lea $bias+$a, $a_ptr
1546
+ mov 8*2+$a, $acc3
1547
+ mov 8*3+$a, $acc4"
1548
+ }
1549
+
1550
+ sub load_for_sqr () {
1551
+ my ($a,$src0) = @_;
1552
+ my $bias = $src0 eq "%rax" ? 0 : -128;
1553
+
1554
+ " mov 8*0+$a, $src0
1555
+ mov 8*1+$a, $acc6
1556
+ lea $bias+$a, $a_ptr
1557
+ mov 8*2+$a, $acc7
1558
+ mov 8*3+$a, $acc0"
1559
+ }
1560
+
1561
+ {
1562
+ ########################################################################
1563
+ # operate in 4-5-0-1 "name space" that matches multiplication output
1564
+ #
1565
+ my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
1566
+
1567
+ $code.=<<___;
1568
+ .type __ecp_nistz256_add_toq,\@abi-omnipotent
1569
+ .align 32
1570
+ __ecp_nistz256_add_toq:
1571
+ add 8*0($b_ptr), $a0
1572
+ adc 8*1($b_ptr), $a1
1573
+ mov $a0, $t0
1574
+ adc 8*2($b_ptr), $a2
1575
+ adc 8*3($b_ptr), $a3
1576
+ mov $a1, $t1
1577
+ sbb $t4, $t4
1578
+
1579
+ sub \$-1, $a0
1580
+ mov $a2, $t2
1581
+ sbb $poly1, $a1
1582
+ sbb \$0, $a2
1583
+ mov $a3, $t3
1584
+ sbb $poly3, $a3
1585
+ test $t4, $t4
1586
+
1587
+ cmovz $t0, $a0
1588
+ cmovz $t1, $a1
1589
+ mov $a0, 8*0($r_ptr)
1590
+ cmovz $t2, $a2
1591
+ mov $a1, 8*1($r_ptr)
1592
+ cmovz $t3, $a3
1593
+ mov $a2, 8*2($r_ptr)
1594
+ mov $a3, 8*3($r_ptr)
1595
+
1596
+ ret
1597
+ .size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
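__ecp_nistz256_add_toq adds two field elements and follows the sum with one trial subtraction of the P-256 prime, then picks the correct copy with cmov rather than a branch. A portable sketch of the operation being computed (it uses the GCC/Clang __int128 extension for the carries; the limb values are those of .Lpoly, and the helper name is illustrative):

```c
#include <stdint.h>

/* p = 2^256 - 2^224 + 2^192 + 2^96 - 1, little-endian 64-bit limbs. */
static const uint64_t P256[4] = {0xffffffffffffffffULL, 0x00000000ffffffffULL,
                                 0x0000000000000000ULL, 0xffffffff00000001ULL};

/* r = (a + b) mod p for a, b < p: full-width add, trial subtraction of p,
 * then a branch-free select, mirroring the add/adc ... sub/sbb ... cmov
 * structure of __ecp_nistz256_add_toq. */
static void p256_add_sketch(uint64_t r[4], const uint64_t a[4],
                            const uint64_t b[4]) {
  uint64_t sum[4], red[4];
  unsigned __int128 acc = 0;
  for (int i = 0; i < 4; i++) {
    acc += (unsigned __int128)a[i] + b[i];
    sum[i] = (uint64_t)acc;
    acc >>= 64;
  }
  uint64_t carry = (uint64_t)acc;              /* 1 iff a + b >= 2^256 */

  uint64_t borrow = 0;
  for (int i = 0; i < 4; i++) {
    unsigned __int128 t = (unsigned __int128)sum[i] - P256[i] - borrow;
    red[i] = (uint64_t)t;
    borrow = (uint64_t)(t >> 64) & 1;          /* 1 while sum - p underflows */
  }

  /* Keep the reduced copy when the add carried out or when sum >= p. */
  uint64_t mask = (uint64_t)0 - (carry | (borrow ^ 1));
  for (int i = 0; i < 4; i++) {
    r[i] = (red[i] & mask) | (sum[i] & ~mask);
  }
}
```

__ecp_nistz256_sub_fromq, __ecp_nistz256_subq and __ecp_nistz256_mul_by_2q below reuse the same two-chain pattern, with the conditional correction running in the opposite direction (adding p back after a borrow).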
1598
+
1599
+ .type __ecp_nistz256_sub_fromq,\@abi-omnipotent
1600
+ .align 32
1601
+ __ecp_nistz256_sub_fromq:
1602
+ sub 8*0($b_ptr), $a0
1603
+ sbb 8*1($b_ptr), $a1
1604
+ mov $a0, $t0
1605
+ sbb 8*2($b_ptr), $a2
1606
+ sbb 8*3($b_ptr), $a3
1607
+ mov $a1, $t1
1608
+ sbb $t4, $t4
1609
+
1610
+ add \$-1, $a0
1611
+ mov $a2, $t2
1612
+ adc $poly1, $a1
1613
+ adc \$0, $a2
1614
+ mov $a3, $t3
1615
+ adc $poly3, $a3
1616
+ test $t4, $t4
1617
+
1618
+ cmovz $t0, $a0
1619
+ cmovz $t1, $a1
1620
+ mov $a0, 8*0($r_ptr)
1621
+ cmovz $t2, $a2
1622
+ mov $a1, 8*1($r_ptr)
1623
+ cmovz $t3, $a3
1624
+ mov $a2, 8*2($r_ptr)
1625
+ mov $a3, 8*3($r_ptr)
1626
+
1627
+ ret
1628
+ .size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
1629
+
1630
+ .type __ecp_nistz256_subq,\@abi-omnipotent
1631
+ .align 32
1632
+ __ecp_nistz256_subq:
1633
+ sub $a0, $t0
1634
+ sbb $a1, $t1
1635
+ mov $t0, $a0
1636
+ sbb $a2, $t2
1637
+ sbb $a3, $t3
1638
+ mov $t1, $a1
1639
+ sbb $t4, $t4
1640
+
1641
+ add \$-1, $t0
1642
+ mov $t2, $a2
1643
+ adc $poly1, $t1
1644
+ adc \$0, $t2
1645
+ mov $t3, $a3
1646
+ adc $poly3, $t3
1647
+ test $t4, $t4
1648
+
1649
+ cmovnz $t0, $a0
1650
+ cmovnz $t1, $a1
1651
+ cmovnz $t2, $a2
1652
+ cmovnz $t3, $a3
1653
+
1654
+ ret
1655
+ .size __ecp_nistz256_subq,.-__ecp_nistz256_subq
1656
+
1657
+ .type __ecp_nistz256_mul_by_2q,\@abi-omnipotent
1658
+ .align 32
1659
+ __ecp_nistz256_mul_by_2q:
1660
+ add $a0, $a0 # a0:a3+a0:a3
1661
+ adc $a1, $a1
1662
+ mov $a0, $t0
1663
+ adc $a2, $a2
1664
+ adc $a3, $a3
1665
+ mov $a1, $t1
1666
+ sbb $t4, $t4
1667
+
1668
+ sub \$-1, $a0
1669
+ mov $a2, $t2
1670
+ sbb $poly1, $a1
1671
+ sbb \$0, $a2
1672
+ mov $a3, $t3
1673
+ sbb $poly3, $a3
1674
+ test $t4, $t4
1675
+
1676
+ cmovz $t0, $a0
1677
+ cmovz $t1, $a1
1678
+ mov $a0, 8*0($r_ptr)
1679
+ cmovz $t2, $a2
1680
+ mov $a1, 8*1($r_ptr)
1681
+ cmovz $t3, $a3
1682
+ mov $a2, 8*2($r_ptr)
1683
+ mov $a3, 8*3($r_ptr)
1684
+
1685
+ ret
1686
+ .size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
1687
+ ___
1688
+ }
1689
+ sub gen_double () {
1690
+ my $x = shift;
1691
+ my ($src0,$sfx,$bias);
1692
+ my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
1693
+
1694
+ if ($x ne "x") {
1695
+ $src0 = "%rax";
1696
+ $sfx = "";
1697
+ $bias = 0;
1698
+
1699
+ $code.=<<___;
1700
+ .globl ecp_nistz256_point_double
1701
+ .type ecp_nistz256_point_double,\@function,2
1702
+ .align 32
1703
+ ecp_nistz256_point_double:
1704
+ ___
1705
+ $code.=<<___ if ($addx);
1706
+ mov \$0x80100, %ecx
1707
+ and OPENSSL_ia32cap_P+8(%rip), %ecx
1708
+ cmp \$0x80100, %ecx
1709
+ je .Lpoint_doublex
1710
+ ___
1711
+ } else {
1712
+ $src0 = "%rdx";
1713
+ $sfx = "x";
1714
+ $bias = 128;
1715
+
1716
+ $code.=<<___;
1717
+ .type ecp_nistz256_point_doublex,\@function,2
1718
+ .align 32
1719
+ ecp_nistz256_point_doublex:
1720
+ .Lpoint_doublex:
1721
+ ___
1722
+ }
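The capability test emitted just above (and repeated for point_add and point_add_affine) routes to the x-suffixed bodies only when both BMI2 and ADX are available: 0x80100 is (1<<19)|(1<<8), the ADX and BMI2 bits of the CPUID leaf-7 EBX word that OPENSSL_ia32cap_P keeps at offset 8. A C rendering of the predicate; the extern declaration matches how the vendored C code declares the array, but treat that as an assumption here:

```c
#include <stdint.h>

/* Third 32-bit word of OPENSSL_ia32cap_P holds the CPUID leaf-7 EBX bits. */
extern uint32_t OPENSSL_ia32cap_P[4];

static int p256_use_adx_path(void) {
  const uint32_t kBMI2 = 1u << 8;   /* mulx and friends        */
  const uint32_t kADX  = 1u << 19;  /* adcx/adox carry chains  */
  const uint32_t mask  = kBMI2 | kADX;   /* == 0x80100 */
  return (OPENSSL_ia32cap_P[2] & mask) == mask;
}
```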
1723
+ $code.=<<___;
1724
+ push %rbp
1725
+ push %rbx
1726
+ push %r12
1727
+ push %r13
1728
+ push %r14
1729
+ push %r15
1730
+ sub \$32*5+8, %rsp
1731
+
1732
+ movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x
1733
+ mov $a_ptr, $b_ptr # backup copy
1734
+ movdqu 0x10($a_ptr), %xmm1
1735
+ mov 0x20+8*0($a_ptr), $acc4 # load in_y in "4-5-0-1" order
1736
+ mov 0x20+8*1($a_ptr), $acc5
1737
+ mov 0x20+8*2($a_ptr), $acc0
1738
+ mov 0x20+8*3($a_ptr), $acc1
1739
+ mov .Lpoly+8*1(%rip), $poly1
1740
+ mov .Lpoly+8*3(%rip), $poly3
1741
+ movdqa %xmm0, $in_x(%rsp)
1742
+ movdqa %xmm1, $in_x+0x10(%rsp)
1743
+ lea 0x20($r_ptr), $acc2
1744
+ lea 0x40($r_ptr), $acc3
1745
+ movq $r_ptr, %xmm0
1746
+ movq $acc2, %xmm1
1747
+ movq $acc3, %xmm2
1748
+
1749
+ lea $S(%rsp), $r_ptr
1750
+ call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(S, in_y);
1751
+
1752
+ mov 0x40+8*0($a_ptr), $src0
1753
+ mov 0x40+8*1($a_ptr), $acc6
1754
+ mov 0x40+8*2($a_ptr), $acc7
1755
+ mov 0x40+8*3($a_ptr), $acc0
1756
+ lea 0x40-$bias($a_ptr), $a_ptr
1757
+ lea $Zsqr(%rsp), $r_ptr
1758
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Zsqr, in_z);
1759
+
1760
+ `&load_for_sqr("$S(%rsp)", "$src0")`
1761
+ lea $S(%rsp), $r_ptr
1762
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(S, S);
1763
+
1764
+ mov 0x20($b_ptr), $src0 # $b_ptr is still valid
1765
+ mov 0x40+8*0($b_ptr), $acc1
1766
+ mov 0x40+8*1($b_ptr), $acc2
1767
+ mov 0x40+8*2($b_ptr), $acc3
1768
+ mov 0x40+8*3($b_ptr), $acc4
1769
+ lea 0x40-$bias($b_ptr), $a_ptr
1770
+ lea 0x20($b_ptr), $b_ptr
1771
+ movq %xmm2, $r_ptr
1772
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, in_z, in_y);
1773
+ call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(res_z, res_z);
1774
+
1775
+ mov $in_x+8*0(%rsp), $acc4 # "4-5-0-1" order
1776
+ mov $in_x+8*1(%rsp), $acc5
1777
+ lea $Zsqr(%rsp), $b_ptr
1778
+ mov $in_x+8*2(%rsp), $acc0
1779
+ mov $in_x+8*3(%rsp), $acc1
1780
+ lea $M(%rsp), $r_ptr
1781
+ call __ecp_nistz256_add_to$x # p256_add(M, in_x, Zsqr);
1782
+
1783
+ mov $in_x+8*0(%rsp), $acc4 # "4-5-0-1" order
1784
+ mov $in_x+8*1(%rsp), $acc5
1785
+ lea $Zsqr(%rsp), $b_ptr
1786
+ mov $in_x+8*2(%rsp), $acc0
1787
+ mov $in_x+8*3(%rsp), $acc1
1788
+ lea $Zsqr(%rsp), $r_ptr
1789
+ call __ecp_nistz256_sub_from$x # p256_sub(Zsqr, in_x, Zsqr);
1790
+
1791
+ `&load_for_sqr("$S(%rsp)", "$src0")`
1792
+ movq %xmm1, $r_ptr
1793
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S);
1794
+ ___
1795
+ {
1796
+ ######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
1797
+ # operate in 4-5-6-7 "name space" that matches squaring output
1798
+ #
1799
+ my ($poly1,$poly3)=($a_ptr,$t1);
1800
+ my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);
1801
+
1802
+ $code.=<<___;
1803
+ xor $t4, $t4
1804
+ mov $a0, $t0
1805
+ add \$-1, $a0
1806
+ mov $a1, $t1
1807
+ adc $poly1, $a1
1808
+ mov $a2, $t2
1809
+ adc \$0, $a2
1810
+ mov $a3, $t3
1811
+ adc $poly3, $a3
1812
+ adc \$0, $t4
1813
+ xor $a_ptr, $a_ptr # borrow $a_ptr
1814
+ test \$1, $t0
1815
+
1816
+ cmovz $t0, $a0
1817
+ cmovz $t1, $a1
1818
+ cmovz $t2, $a2
1819
+ cmovz $t3, $a3
1820
+ cmovz $a_ptr, $t4
1821
+
1822
+ mov $a1, $t0 # a0:a3>>1
1823
+ shr \$1, $a0
1824
+ shl \$63, $t0
1825
+ mov $a2, $t1
1826
+ shr \$1, $a1
1827
+ or $t0, $a0
1828
+ shl \$63, $t1
1829
+ mov $a3, $t2
1830
+ shr \$1, $a2
1831
+ or $t1, $a1
1832
+ shl \$63, $t2
1833
+ mov $a0, 8*0($r_ptr)
1834
+ shr \$1, $a3
1835
+ mov $a1, 8*1($r_ptr)
1836
+ shl \$63, $t4
1837
+ or $t2, $a2
1838
+ or $t4, $a3
1839
+ mov $a2, 8*2($r_ptr)
1840
+ mov $a3, 8*3($r_ptr)
1841
+ ___
1842
+ }
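The block just closed is the inlined ecp_nistz256_div_by_2(res_y, res_y): when the value is odd, add p (which is odd, so the sum becomes even), then shift the 257-bit result right by one. The assembly keeps or discards the added p with cmovz on bit 0 of the original low limb; a masked version of the same computation, reusing the P256 limbs from the earlier sketch, looks like this:

```c
/* r = a / 2 mod p: conditionally add p, then shift right across limbs,
 * feeding the carry out of the addition back in as the new top bit. */
static void p256_div_by_2_sketch(uint64_t r[4], const uint64_t a[4]) {
  uint64_t mask = (uint64_t)0 - (a[0] & 1);   /* all-ones iff a is odd */
  unsigned __int128 acc = 0;
  uint64_t t[4];
  for (int i = 0; i < 4; i++) {
    acc += (unsigned __int128)a[i] + (P256[i] & mask);
    t[i] = (uint64_t)acc;
    acc >>= 64;
  }
  uint64_t top = (uint64_t)acc;               /* bit 256 of a (+ p) */
  for (int i = 0; i < 3; i++) {
    r[i] = (t[i] >> 1) | (t[i + 1] << 63);
  }
  r[3] = (t[3] >> 1) | (top << 63);
}
```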
1843
+ $code.=<<___;
1844
+ `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
1845
+ lea $M(%rsp), $r_ptr
1846
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(M, M, Zsqr);
1847
+
1848
+ lea $tmp0(%rsp), $r_ptr
1849
+ call __ecp_nistz256_mul_by_2$x
1850
+
1851
+ lea $M(%rsp), $b_ptr
1852
+ lea $M(%rsp), $r_ptr
1853
+ call __ecp_nistz256_add_to$x # p256_mul_by_3(M, M);
1854
+
1855
+ `&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
1856
+ lea $S(%rsp), $r_ptr
1857
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, in_x);
1858
+
1859
+ lea $tmp0(%rsp), $r_ptr
1860
+ call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, S);
1861
+
1862
+ `&load_for_sqr("$M(%rsp)", "$src0")`
1863
+ movq %xmm0, $r_ptr
1864
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_x, M);
1865
+
1866
+ lea $tmp0(%rsp), $b_ptr
1867
+ mov $acc6, $acc0 # harmonize sqr output and sub input
1868
+ mov $acc7, $acc1
1869
+ mov $a_ptr, $poly1
1870
+ mov $t1, $poly3
1871
+ call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, tmp0);
1872
+
1873
+ mov $S+8*0(%rsp), $t0
1874
+ mov $S+8*1(%rsp), $t1
1875
+ mov $S+8*2(%rsp), $t2
1876
+ mov $S+8*3(%rsp), $acc2 # "4-5-0-1" order
1877
+ lea $S(%rsp), $r_ptr
1878
+ call __ecp_nistz256_sub$x # p256_sub(S, S, res_x);
1879
+
1880
+ mov $M(%rsp), $src0
1881
+ lea $M(%rsp), $b_ptr
1882
+ mov $acc4, $acc6 # harmonize sub output and mul input
1883
+ xor %ecx, %ecx
1884
+ mov $acc4, $S+8*0(%rsp) # have to save:-(
1885
+ mov $acc5, $acc2
1886
+ mov $acc5, $S+8*1(%rsp)
1887
+ cmovz $acc0, $acc3
1888
+ mov $acc0, $S+8*2(%rsp)
1889
+ lea $S-$bias(%rsp), $a_ptr
1890
+ cmovz $acc1, $acc4
1891
+ mov $acc1, $S+8*3(%rsp)
1892
+ mov $acc6, $acc1
1893
+ lea $S(%rsp), $r_ptr
1894
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, M);
1895
+
1896
+ movq %xmm1, $b_ptr
1897
+ movq %xmm1, $r_ptr
1898
+ call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y);
1899
+
1900
+ add \$32*5+8, %rsp
1901
+ pop %r15
1902
+ pop %r14
1903
+ pop %r13
1904
+ pop %r12
1905
+ pop %rbx
1906
+ pop %rbp
1907
+ ret
1908
+ .size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
1909
+ ___
1910
+ }
1911
+ &gen_double("q");
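Read in order, the p256_* comments inside gen_double trace the standard Jacobian doubling for a short-Weierstrass curve with a = -3, computed entirely in Montgomery form on the five 32-byte stack slots S, M, Zsqr, in_x and tmp0:

$$S = 4XY^2, \qquad M = 3\,(X - Z^2)(X + Z^2), \qquad Z_3 = 2YZ,$$
$$X_3 = M^2 - 2S, \qquad Y_3 = M\,(S - X_3) - 8Y^4.$$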
1912
+
1913
+ sub gen_add () {
1914
+ my $x = shift;
1915
+ my ($src0,$sfx,$bias);
1916
+ my ($H,$Hsqr,$R,$Rsqr,$Hcub,
1917
+ $U1,$U2,$S1,$S2,
1918
+ $res_x,$res_y,$res_z,
1919
+ $in1_x,$in1_y,$in1_z,
1920
+ $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
1921
+ my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
1922
+
1923
+ if ($x ne "x") {
1924
+ $src0 = "%rax";
1925
+ $sfx = "";
1926
+ $bias = 0;
1927
+
1928
+ $code.=<<___;
1929
+ .globl ecp_nistz256_point_add
1930
+ .type ecp_nistz256_point_add,\@function,3
1931
+ .align 32
1932
+ ecp_nistz256_point_add:
1933
+ ___
1934
+ $code.=<<___ if ($addx);
1935
+ mov \$0x80100, %ecx
1936
+ and OPENSSL_ia32cap_P+8(%rip), %ecx
1937
+ cmp \$0x80100, %ecx
1938
+ je .Lpoint_addx
1939
+ ___
1940
+ } else {
1941
+ $src0 = "%rdx";
1942
+ $sfx = "x";
1943
+ $bias = 128;
1944
+
1945
+ $code.=<<___;
1946
+ .type ecp_nistz256_point_addx,\@function,3
1947
+ .align 32
1948
+ ecp_nistz256_point_addx:
1949
+ .Lpoint_addx:
1950
+ ___
1951
+ }
1952
+ $code.=<<___;
1953
+ push %rbp
1954
+ push %rbx
1955
+ push %r12
1956
+ push %r13
1957
+ push %r14
1958
+ push %r15
1959
+ sub \$32*18+8, %rsp
1960
+
1961
+ movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr
1962
+ movdqu 0x10($a_ptr), %xmm1
1963
+ movdqu 0x20($a_ptr), %xmm2
1964
+ movdqu 0x30($a_ptr), %xmm3
1965
+ movdqu 0x40($a_ptr), %xmm4
1966
+ movdqu 0x50($a_ptr), %xmm5
1967
+ mov $a_ptr, $b_ptr # reassign
1968
+ mov $b_org, $a_ptr # reassign
1969
+ movdqa %xmm0, $in1_x(%rsp)
1970
+ movdqa %xmm1, $in1_x+0x10(%rsp)
1971
+ por %xmm0, %xmm1
1972
+ movdqa %xmm2, $in1_y(%rsp)
1973
+ movdqa %xmm3, $in1_y+0x10(%rsp)
1974
+ por %xmm2, %xmm3
1975
+ movdqa %xmm4, $in1_z(%rsp)
1976
+ movdqa %xmm5, $in1_z+0x10(%rsp)
1977
+ por %xmm1, %xmm3
1978
+
1979
+ movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr
1980
+ pshufd \$0xb1, %xmm3, %xmm5
1981
+ movdqu 0x10($a_ptr), %xmm1
1982
+ movdqu 0x20($a_ptr), %xmm2
1983
+ por %xmm3, %xmm5
1984
+ movdqu 0x30($a_ptr), %xmm3
1985
+ mov 0x40+8*0($a_ptr), $src0 # load original in2_z
1986
+ mov 0x40+8*1($a_ptr), $acc6
1987
+ mov 0x40+8*2($a_ptr), $acc7
1988
+ mov 0x40+8*3($a_ptr), $acc0
1989
+ movdqa %xmm0, $in2_x(%rsp)
1990
+ pshufd \$0x1e, %xmm5, %xmm4
1991
+ movdqa %xmm1, $in2_x+0x10(%rsp)
1992
+ por %xmm0, %xmm1
1993
+ movq $r_ptr, %xmm0 # save $r_ptr
1994
+ movdqa %xmm2, $in2_y(%rsp)
1995
+ movdqa %xmm3, $in2_y+0x10(%rsp)
1996
+ por %xmm2, %xmm3
1997
+ por %xmm4, %xmm5
1998
+ pxor %xmm4, %xmm4
1999
+ por %xmm1, %xmm3
2000
+
2001
+ lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid
2002
+ mov $src0, $in2_z+8*0(%rsp) # make in2_z copy
2003
+ mov $acc6, $in2_z+8*1(%rsp)
2004
+ mov $acc7, $in2_z+8*2(%rsp)
2005
+ mov $acc0, $in2_z+8*3(%rsp)
2006
+ lea $Z2sqr(%rsp), $r_ptr # Z2^2
2007
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z);
2008
+
2009
+ pcmpeqd %xmm4, %xmm5
2010
+ pshufd \$0xb1, %xmm3, %xmm4
2011
+ por %xmm3, %xmm4
2012
+ pshufd \$0, %xmm5, %xmm5 # in1infty
2013
+ pshufd \$0x1e, %xmm4, %xmm3
2014
+ por %xmm3, %xmm4
2015
+ pxor %xmm3, %xmm3
2016
+ pcmpeqd %xmm3, %xmm4
2017
+ pshufd \$0, %xmm4, %xmm4 # in2infty
2018
+ mov 0x40+8*0($b_ptr), $src0 # load original in1_z
2019
+ mov 0x40+8*1($b_ptr), $acc6
2020
+ mov 0x40+8*2($b_ptr), $acc7
2021
+ mov 0x40+8*3($b_ptr), $acc0
2022
+
2023
+ lea 0x40-$bias($b_ptr), $a_ptr
2024
+ lea $Z1sqr(%rsp), $r_ptr # Z1^2
2025
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z);
2026
+
2027
+ `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
2028
+ lea $S1(%rsp), $r_ptr # S1 = Z2^3
2029
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z);
2030
+
2031
+ `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
2032
+ lea $S2(%rsp), $r_ptr # S2 = Z1^3
2033
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z);
2034
+
2035
+ `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
2036
+ lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3
2037
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y);
2038
+
2039
+ `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
2040
+ lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3
2041
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y);
2042
+
2043
+ lea $S1(%rsp), $b_ptr
2044
+ lea $R(%rsp), $r_ptr # R = S2 - S1
2045
+ call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1);
2046
+
2047
+ or $acc5, $acc4 # see if result is zero
2048
+ movdqa %xmm4, %xmm2
2049
+ or $acc0, $acc4
2050
+ or $acc1, $acc4
2051
+ por %xmm5, %xmm2 # in1infty || in2infty
2052
+ movq $acc4, %xmm3
2053
+
2054
+ `&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
2055
+ lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2
2056
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr);
2057
+
2058
+ `&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
2059
+ lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2
2060
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr);
2061
+
2062
+ lea $U1(%rsp), $b_ptr
2063
+ lea $H(%rsp), $r_ptr # H = U2 - U1
2064
+ call __ecp_nistz256_sub_from$x # p256_sub(H, U2, U1);
2065
+
2066
+ or $acc5, $acc4 # see if result is zero
2067
+ or $acc0, $acc4
2068
+ or $acc1, $acc4
2069
+
2070
+ .byte 0x3e # predict taken
2071
+ jnz .Ladd_proceed$x # is_equal(U1,U2)?
2072
+ movq %xmm2, $acc0
2073
+ movq %xmm3, $acc1
2074
+ test $acc0, $acc0
2075
+ jnz .Ladd_proceed$x # (in1infty || in2infty)?
2076
+ test $acc1, $acc1
2077
+ jz .Ladd_proceed$x # is_equal(S1,S2)?
2078
+
2079
+ movq %xmm0, $r_ptr # restore $r_ptr
2080
+ pxor %xmm0, %xmm0
2081
+ movdqu %xmm0, 0x00($r_ptr)
2082
+ movdqu %xmm0, 0x10($r_ptr)
2083
+ movdqu %xmm0, 0x20($r_ptr)
2084
+ movdqu %xmm0, 0x30($r_ptr)
2085
+ movdqu %xmm0, 0x40($r_ptr)
2086
+ movdqu %xmm0, 0x50($r_ptr)
2087
+ jmp .Ladd_done$x
2088
+
2089
+ .align 32
2090
+ .Ladd_proceed$x:
2091
+ `&load_for_sqr("$R(%rsp)", "$src0")`
2092
+ lea $Rsqr(%rsp), $r_ptr # R^2
2093
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R);
2094
+
2095
+ `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
2096
+ lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
2097
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z);
2098
+
2099
+ `&load_for_sqr("$H(%rsp)", "$src0")`
2100
+ lea $Hsqr(%rsp), $r_ptr # H^2
2101
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H);
2102
+
2103
+ `&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
2104
+ lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
2105
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z);
2106
+
2107
+ `&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
2108
+ lea $Hcub(%rsp), $r_ptr # H^3
2109
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H);
2110
+
2111
+ `&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
2112
+ lea $U2(%rsp), $r_ptr # U1*H^2
2113
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr);
2114
+ ___
2115
+ {
2116
+ #######################################################################
2117
+ # operate in 4-5-0-1 "name space" that matches multiplication output
2118
+ #
2119
+ my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
2120
+ my ($poly1, $poly3)=($acc6,$acc7);
2121
+
2122
+ $code.=<<___;
2123
+ #lea $U2(%rsp), $a_ptr
2124
+ #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
2125
+ #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
2126
+
2127
+ add $acc0, $acc0 # a0:a3+a0:a3
2128
+ lea $Rsqr(%rsp), $a_ptr
2129
+ adc $acc1, $acc1
2130
+ mov $acc0, $t0
2131
+ adc $acc2, $acc2
2132
+ adc $acc3, $acc3
2133
+ mov $acc1, $t1
2134
+ sbb $t4, $t4
2135
+
2136
+ sub \$-1, $acc0
2137
+ mov $acc2, $t2
2138
+ sbb $poly1, $acc1
2139
+ sbb \$0, $acc2
2140
+ mov $acc3, $t3
2141
+ sbb $poly3, $acc3
2142
+ test $t4, $t4
2143
+
2144
+ cmovz $t0, $acc0
2145
+ mov 8*0($a_ptr), $t0
2146
+ cmovz $t1, $acc1
2147
+ mov 8*1($a_ptr), $t1
2148
+ cmovz $t2, $acc2
2149
+ mov 8*2($a_ptr), $t2
2150
+ cmovz $t3, $acc3
2151
+ mov 8*3($a_ptr), $t3
2152
+
2153
+ call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
2154
+
2155
+ lea $Hcub(%rsp), $b_ptr
2156
+ lea $res_x(%rsp), $r_ptr
2157
+ call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub);
2158
+
2159
+ mov $U2+8*0(%rsp), $t0
2160
+ mov $U2+8*1(%rsp), $t1
2161
+ mov $U2+8*2(%rsp), $t2
2162
+ mov $U2+8*3(%rsp), $t3
2163
+ lea $res_y(%rsp), $r_ptr
2164
+
2165
+ call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x);
2166
+
2167
+ mov $acc0, 8*0($r_ptr) # save the result, as
2168
+ mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't store it
2169
+ mov $acc2, 8*2($r_ptr)
2170
+ mov $acc3, 8*3($r_ptr)
2171
+ ___
2172
+ }
2173
+ $code.=<<___;
2174
+ `&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
2175
+ lea $S2(%rsp), $r_ptr
2176
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1, Hcub);
2177
+
2178
+ `&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
2179
+ lea $res_y(%rsp), $r_ptr
2180
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y);
2181
+
2182
+ lea $S2(%rsp), $b_ptr
2183
+ lea $res_y(%rsp), $r_ptr
2184
+ call __ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2);
2185
+
2186
+ movq %xmm0, $r_ptr # restore $r_ptr
2187
+
2188
+ movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty);
2189
+ movdqa %xmm5, %xmm1
2190
+ pandn $res_z(%rsp), %xmm0
2191
+ movdqa %xmm5, %xmm2
2192
+ pandn $res_z+0x10(%rsp), %xmm1
2193
+ movdqa %xmm5, %xmm3
2194
+ pand $in2_z(%rsp), %xmm2
2195
+ pand $in2_z+0x10(%rsp), %xmm3
2196
+ por %xmm0, %xmm2
2197
+ por %xmm1, %xmm3
2198
+
2199
+ movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty);
2200
+ movdqa %xmm4, %xmm1
2201
+ pandn %xmm2, %xmm0
2202
+ movdqa %xmm4, %xmm2
2203
+ pandn %xmm3, %xmm1
2204
+ movdqa %xmm4, %xmm3
2205
+ pand $in1_z(%rsp), %xmm2
2206
+ pand $in1_z+0x10(%rsp), %xmm3
2207
+ por %xmm0, %xmm2
2208
+ por %xmm1, %xmm3
2209
+ movdqu %xmm2, 0x40($r_ptr)
2210
+ movdqu %xmm3, 0x50($r_ptr)
2211
+
2212
+ movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty);
2213
+ movdqa %xmm5, %xmm1
2214
+ pandn $res_x(%rsp), %xmm0
2215
+ movdqa %xmm5, %xmm2
2216
+ pandn $res_x+0x10(%rsp), %xmm1
2217
+ movdqa %xmm5, %xmm3
2218
+ pand $in2_x(%rsp), %xmm2
2219
+ pand $in2_x+0x10(%rsp), %xmm3
2220
+ por %xmm0, %xmm2
2221
+ por %xmm1, %xmm3
2222
+
2223
+ movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty);
2224
+ movdqa %xmm4, %xmm1
2225
+ pandn %xmm2, %xmm0
2226
+ movdqa %xmm4, %xmm2
2227
+ pandn %xmm3, %xmm1
2228
+ movdqa %xmm4, %xmm3
2229
+ pand $in1_x(%rsp), %xmm2
2230
+ pand $in1_x+0x10(%rsp), %xmm3
2231
+ por %xmm0, %xmm2
2232
+ por %xmm1, %xmm3
2233
+ movdqu %xmm2, 0x00($r_ptr)
2234
+ movdqu %xmm3, 0x10($r_ptr)
2235
+
2236
+ movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty);
2237
+ movdqa %xmm5, %xmm1
2238
+ pandn $res_y(%rsp), %xmm0
2239
+ movdqa %xmm5, %xmm2
2240
+ pandn $res_y+0x10(%rsp), %xmm1
2241
+ movdqa %xmm5, %xmm3
2242
+ pand $in2_y(%rsp), %xmm2
2243
+ pand $in2_y+0x10(%rsp), %xmm3
2244
+ por %xmm0, %xmm2
2245
+ por %xmm1, %xmm3
2246
+
2247
+ movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty);
2248
+ movdqa %xmm4, %xmm1
2249
+ pandn %xmm2, %xmm0
2250
+ movdqa %xmm4, %xmm2
2251
+ pandn %xmm3, %xmm1
2252
+ movdqa %xmm4, %xmm3
2253
+ pand $in1_y(%rsp), %xmm2
2254
+ pand $in1_y+0x10(%rsp), %xmm3
2255
+ por %xmm0, %xmm2
2256
+ por %xmm1, %xmm3
2257
+ movdqu %xmm2, 0x20($r_ptr)
2258
+ movdqu %xmm3, 0x30($r_ptr)
2259
+
2260
+ .Ladd_done$x:
2261
+ add \$32*18+8, %rsp
2262
+ pop %r15
2263
+ pop %r14
2264
+ pop %r13
2265
+ pop %r12
2266
+ pop %rbx
2267
+ pop %rbp
2268
+ ret
2269
+ .size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
2270
+ ___
2271
+ }
2272
+ &gen_add("q");
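gen_add is the corresponding full Jacobian addition; the per-call comments spell out the textbook sequence:

$$U_1 = X_1 Z_2^2, \quad U_2 = X_2 Z_1^2, \quad S_1 = Y_1 Z_2^3, \quad S_2 = Y_2 Z_1^3,$$
$$H = U_2 - U_1, \quad R = S_2 - S_1, \quad Z_3 = H\,Z_1 Z_2,$$
$$X_3 = R^2 - H^3 - 2U_1 H^2, \qquad Y_3 = R\,(U_1 H^2 - X_3) - S_1 H^3.$$

The jnz/jz chain after H and R are formed covers the degenerate inputs: when H = 0, neither input is at infinity and R is non-zero, the operands are inverses of each other and the routine simply stores the all-zero encoding of the point at infinity. Inputs that are themselves at infinity are fixed up at the end by the pandn/pand copy_conditional blocks, and the equal-points case (which would require doubling) is not handled by this routine.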
2273
+
2274
+ sub gen_add_affine () {
2275
+ my $x = shift;
2276
+ my ($src0,$sfx,$bias);
2277
+ my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
2278
+ $res_x,$res_y,$res_z,
2279
+ $in1_x,$in1_y,$in1_z,
2280
+ $in2_x,$in2_y)=map(32*$_,(0..14));
2281
+ my $Z1sqr = $S2;
2282
+
2283
+ if ($x ne "x") {
2284
+ $src0 = "%rax";
2285
+ $sfx = "";
2286
+ $bias = 0;
2287
+
2288
+ $code.=<<___;
2289
+ .globl ecp_nistz256_point_add_affine
2290
+ .type ecp_nistz256_point_add_affine,\@function,3
2291
+ .align 32
2292
+ ecp_nistz256_point_add_affine:
2293
+ ___
2294
+ $code.=<<___ if ($addx);
2295
+ mov \$0x80100, %ecx
2296
+ and OPENSSL_ia32cap_P+8(%rip), %ecx
2297
+ cmp \$0x80100, %ecx
2298
+ je .Lpoint_add_affinex
2299
+ ___
2300
+ } else {
2301
+ $src0 = "%rdx";
2302
+ $sfx = "x";
2303
+ $bias = 128;
2304
+
2305
+ $code.=<<___;
2306
+ .type ecp_nistz256_point_add_affinex,\@function,3
2307
+ .align 32
2308
+ ecp_nistz256_point_add_affinex:
2309
+ .Lpoint_add_affinex:
2310
+ ___
2311
+ }
2312
+ $code.=<<___;
2313
+ push %rbp
2314
+ push %rbx
2315
+ push %r12
2316
+ push %r13
2317
+ push %r14
2318
+ push %r15
2319
+ sub \$32*15+8, %rsp
2320
+
2321
+ movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr
2322
+ mov $b_org, $b_ptr # reassign
2323
+ movdqu 0x10($a_ptr), %xmm1
2324
+ movdqu 0x20($a_ptr), %xmm2
2325
+ movdqu 0x30($a_ptr), %xmm3
2326
+ movdqu 0x40($a_ptr), %xmm4
2327
+ movdqu 0x50($a_ptr), %xmm5
2328
+ mov 0x40+8*0($a_ptr), $src0 # load original in1_z
2329
+ mov 0x40+8*1($a_ptr), $acc6
2330
+ mov 0x40+8*2($a_ptr), $acc7
2331
+ mov 0x40+8*3($a_ptr), $acc0
2332
+ movdqa %xmm0, $in1_x(%rsp)
2333
+ movdqa %xmm1, $in1_x+0x10(%rsp)
2334
+ por %xmm0, %xmm1
2335
+ movdqa %xmm2, $in1_y(%rsp)
2336
+ movdqa %xmm3, $in1_y+0x10(%rsp)
2337
+ por %xmm2, %xmm3
2338
+ movdqa %xmm4, $in1_z(%rsp)
2339
+ movdqa %xmm5, $in1_z+0x10(%rsp)
2340
+ por %xmm1, %xmm3
2341
+
2342
+ movdqu 0x00($b_ptr), %xmm0 # copy *(P256_POINT_AFFINE *)$b_ptr
2343
+ pshufd \$0xb1, %xmm3, %xmm5
2344
+ movdqu 0x10($b_ptr), %xmm1
2345
+ movdqu 0x20($b_ptr), %xmm2
2346
+ por %xmm3, %xmm5
2347
+ movdqu 0x30($b_ptr), %xmm3
2348
+ movdqa %xmm0, $in2_x(%rsp)
2349
+ pshufd \$0x1e, %xmm5, %xmm4
2350
+ movdqa %xmm1, $in2_x+0x10(%rsp)
2351
+ por %xmm0, %xmm1
2352
+ movq $r_ptr, %xmm0 # save $r_ptr
2353
+ movdqa %xmm2, $in2_y(%rsp)
2354
+ movdqa %xmm3, $in2_y+0x10(%rsp)
2355
+ por %xmm2, %xmm3
2356
+ por %xmm4, %xmm5
2357
+ pxor %xmm4, %xmm4
2358
+ por %xmm1, %xmm3
2359
+
2360
+ lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid
2361
+ lea $Z1sqr(%rsp), $r_ptr # Z1^2
2362
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z);
2363
+
2364
+ pcmpeqd %xmm4, %xmm5
2365
+ pshufd \$0xb1, %xmm3, %xmm4
2366
+ mov 0x00($b_ptr), $src0 # $b_ptr is still valid
2367
+ #lea 0x00($b_ptr), $b_ptr
2368
+ mov $acc4, $acc1 # harmonize sqr output and mul input
2369
+ por %xmm3, %xmm4
2370
+ pshufd \$0, %xmm5, %xmm5 # in1infty
2371
+ pshufd \$0x1e, %xmm4, %xmm3
2372
+ mov $acc5, $acc2
2373
+ por %xmm3, %xmm4
2374
+ pxor %xmm3, %xmm3
2375
+ mov $acc6, $acc3
2376
+ pcmpeqd %xmm3, %xmm4
2377
+ pshufd \$0, %xmm4, %xmm4 # in2infty
2378
+
2379
+ lea $Z1sqr-$bias(%rsp), $a_ptr
2380
+ mov $acc7, $acc4
2381
+ lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2
2382
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, Z1sqr, in2_x);
2383
+
2384
+ lea $in1_x(%rsp), $b_ptr
2385
+ lea $H(%rsp), $r_ptr # H = U2 - U1
2386
+ call __ecp_nistz256_sub_from$x # p256_sub(H, U2, in1_x);
2387
+
2388
+ `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
2389
+ lea $S2(%rsp), $r_ptr # S2 = Z1^3
2390
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z);
2391
+
2392
+ `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
2393
+ lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
2394
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z);
2395
+
2396
+ `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
2397
+ lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3
2398
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y);
2399
+
2400
+ lea $in1_y(%rsp), $b_ptr
2401
+ lea $R(%rsp), $r_ptr # R = S2 - S1
2402
+ call __ecp_nistz256_sub_from$x # p256_sub(R, S2, in1_y);
2403
+
2404
+ `&load_for_sqr("$H(%rsp)", "$src0")`
2405
+ lea $Hsqr(%rsp), $r_ptr # H^2
2406
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H);
2407
+
2408
+ `&load_for_sqr("$R(%rsp)", "$src0")`
2409
+ lea $Rsqr(%rsp), $r_ptr # R^2
2410
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R);
2411
+
2412
+ `&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
2413
+ lea $Hcub(%rsp), $r_ptr # H^3
2414
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H);
2415
+
2416
+ `&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
2417
+ lea $U2(%rsp), $r_ptr # U1*H^2
2418
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in1_x, Hsqr);
2419
+ ___
2420
+ {
2421
+ #######################################################################
2422
+ # operate in 4-5-0-1 "name space" that matches multiplication output
2423
+ #
2424
+ my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
2425
+ my ($poly1, $poly3)=($acc6,$acc7);
2426
+
2427
+ $code.=<<___;
2428
+ #lea $U2(%rsp), $a_ptr
2429
+ #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
2430
+ #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
2431
+
2432
+ add $acc0, $acc0 # a0:a3+a0:a3
2433
+ lea $Rsqr(%rsp), $a_ptr
2434
+ adc $acc1, $acc1
2435
+ mov $acc0, $t0
2436
+ adc $acc2, $acc2
2437
+ adc $acc3, $acc3
2438
+ mov $acc1, $t1
2439
+ sbb $t4, $t4
2440
+
2441
+ sub \$-1, $acc0
2442
+ mov $acc2, $t2
2443
+ sbb $poly1, $acc1
2444
+ sbb \$0, $acc2
2445
+ mov $acc3, $t3
2446
+ sbb $poly3, $acc3
2447
+ test $t4, $t4
2448
+
2449
+ cmovz $t0, $acc0
2450
+ mov 8*0($a_ptr), $t0
2451
+ cmovz $t1, $acc1
2452
+ mov 8*1($a_ptr), $t1
2453
+ cmovz $t2, $acc2
2454
+ mov 8*2($a_ptr), $t2
2455
+ cmovz $t3, $acc3
2456
+ mov 8*3($a_ptr), $t3
2457
+
2458
+ call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
2459
+
2460
+ lea $Hcub(%rsp), $b_ptr
2461
+ lea $res_x(%rsp), $r_ptr
2462
+ call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub);
2463
+
2464
+ mov $U2+8*0(%rsp), $t0
2465
+ mov $U2+8*1(%rsp), $t1
2466
+ mov $U2+8*2(%rsp), $t2
2467
+ mov $U2+8*3(%rsp), $t3
2468
+ lea $H(%rsp), $r_ptr
2469
+
2470
+ call __ecp_nistz256_sub$x # p256_sub(H, U2, res_x);
2471
+
2472
+ mov $acc0, 8*0($r_ptr) # save the result, as
2473
+ mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't store it
2474
+ mov $acc2, 8*2($r_ptr)
2475
+ mov $acc3, 8*3($r_ptr)
2476
+ ___
2477
+ }
2478
+ $code.=<<___;
2479
+ `&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
2480
+ lea $S2(%rsp), $r_ptr
2481
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Hcub, in1_y);
2482
+
2483
+ `&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
2484
+ lea $H(%rsp), $r_ptr
2485
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(H, H, R);
2486
+
2487
+ lea $S2(%rsp), $b_ptr
2488
+ lea $res_y(%rsp), $r_ptr
2489
+ call __ecp_nistz256_sub_from$x # p256_sub(res_y, H, S2);
2490
+
2491
+ movq %xmm0, $r_ptr # restore $r_ptr
2492
+
2493
+ movdqa %xmm5, %xmm0 # copy_conditional(res_z, ONE, in1infty);
2494
+ movdqa %xmm5, %xmm1
2495
+ pandn $res_z(%rsp), %xmm0
2496
+ movdqa %xmm5, %xmm2
2497
+ pandn $res_z+0x10(%rsp), %xmm1
2498
+ movdqa %xmm5, %xmm3
2499
+ pand .LONE_mont(%rip), %xmm2
2500
+ pand .LONE_mont+0x10(%rip), %xmm3
2501
+ por %xmm0, %xmm2
2502
+ por %xmm1, %xmm3
2503
+
2504
+ movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty);
2505
+ movdqa %xmm4, %xmm1
2506
+ pandn %xmm2, %xmm0
2507
+ movdqa %xmm4, %xmm2
2508
+ pandn %xmm3, %xmm1
2509
+ movdqa %xmm4, %xmm3
2510
+ pand $in1_z(%rsp), %xmm2
2511
+ pand $in1_z+0x10(%rsp), %xmm3
2512
+ por %xmm0, %xmm2
2513
+ por %xmm1, %xmm3
2514
+ movdqu %xmm2, 0x40($r_ptr)
2515
+ movdqu %xmm3, 0x50($r_ptr)
2516
+
2517
+ movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty);
2518
+ movdqa %xmm5, %xmm1
2519
+ pandn $res_x(%rsp), %xmm0
2520
+ movdqa %xmm5, %xmm2
2521
+ pandn $res_x+0x10(%rsp), %xmm1
2522
+ movdqa %xmm5, %xmm3
2523
+ pand $in2_x(%rsp), %xmm2
2524
+ pand $in2_x+0x10(%rsp), %xmm3
2525
+ por %xmm0, %xmm2
2526
+ por %xmm1, %xmm3
2527
+
2528
+ movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty);
2529
+ movdqa %xmm4, %xmm1
2530
+ pandn %xmm2, %xmm0
2531
+ movdqa %xmm4, %xmm2
2532
+ pandn %xmm3, %xmm1
2533
+ movdqa %xmm4, %xmm3
2534
+ pand $in1_x(%rsp), %xmm2
2535
+ pand $in1_x+0x10(%rsp), %xmm3
2536
+ por %xmm0, %xmm2
2537
+ por %xmm1, %xmm3
2538
+ movdqu %xmm2, 0x00($r_ptr)
2539
+ movdqu %xmm3, 0x10($r_ptr)
2540
+
2541
+ movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty);
2542
+ movdqa %xmm5, %xmm1
2543
+ pandn $res_y(%rsp), %xmm0
2544
+ movdqa %xmm5, %xmm2
2545
+ pandn $res_y+0x10(%rsp), %xmm1
2546
+ movdqa %xmm5, %xmm3
2547
+ pand $in2_y(%rsp), %xmm2
2548
+ pand $in2_y+0x10(%rsp), %xmm3
2549
+ por %xmm0, %xmm2
2550
+ por %xmm1, %xmm3
2551
+
2552
+ movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty);
2553
+ movdqa %xmm4, %xmm1
2554
+ pandn %xmm2, %xmm0
2555
+ movdqa %xmm4, %xmm2
2556
+ pandn %xmm3, %xmm1
2557
+ movdqa %xmm4, %xmm3
2558
+ pand $in1_y(%rsp), %xmm2
2559
+ pand $in1_y+0x10(%rsp), %xmm3
2560
+ por %xmm0, %xmm2
2561
+ por %xmm1, %xmm3
2562
+ movdqu %xmm2, 0x20($r_ptr)
2563
+ movdqu %xmm3, 0x30($r_ptr)
2564
+
2565
+ add \$32*15+8, %rsp
2566
+ pop %r15
2567
+ pop %r14
2568
+ pop %r13
2569
+ pop %r12
2570
+ pop %rbx
2571
+ pop %rbp
2572
+ ret
2573
+ .size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
2574
+ ___
2575
+ }
2576
+ &gen_add_affine("q");
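gen_add_affine specialises the same addition for an affine second operand (Z2 = 1): U1 collapses to X1 and S1 to Y1, which removes one squaring and three multiplications, and when the first input is the point at infinity the result's Z is taken from the Montgomery constant .LONE_mont because the affine point carries no Z of its own. In formulas, with U2 = X2*Z1^2 and S2 = Y2*Z1^3:

$$H = U_2 - X_1, \quad R = S_2 - Y_1, \quad Z_3 = H Z_1,$$
$$X_3 = R^2 - H^3 - 2X_1 H^2, \qquad Y_3 = R\,(X_1 H^2 - X_3) - Y_1 H^3.$$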
2577
+
2578
+ ########################################################################
2579
+ # AD*X magic
2580
+ #
2581
+ if ($addx) { {
2582
+ ########################################################################
2583
+ # operate in 4-5-0-1 "name space" that matches multiplication output
2584
+ #
2585
+ my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
2586
+
2587
+ $code.=<<___;
2588
+ .type __ecp_nistz256_add_tox,\@abi-omnipotent
2589
+ .align 32
2590
+ __ecp_nistz256_add_tox:
2591
+ xor $t4, $t4
2592
+ adc 8*0($b_ptr), $a0
2593
+ adc 8*1($b_ptr), $a1
2594
+ mov $a0, $t0
2595
+ adc 8*2($b_ptr), $a2
2596
+ adc 8*3($b_ptr), $a3
2597
+ mov $a1, $t1
2598
+ adc \$0, $t4
2599
+
2600
+ xor $t3, $t3
2601
+ sbb \$-1, $a0
2602
+ mov $a2, $t2
2603
+ sbb $poly1, $a1
2604
+ sbb \$0, $a2
2605
+ mov $a3, $t3
2606
+ sbb $poly3, $a3
2607
+
2608
+ bt \$0, $t4
2609
+ cmovnc $t0, $a0
2610
+ cmovnc $t1, $a1
2611
+ mov $a0, 8*0($r_ptr)
2612
+ cmovnc $t2, $a2
2613
+ mov $a1, 8*1($r_ptr)
2614
+ cmovnc $t3, $a3
2615
+ mov $a2, 8*2($r_ptr)
2616
+ mov $a3, 8*3($r_ptr)
2617
+
2618
+ ret
2619
+ .size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
2620
+
2621
+ .type __ecp_nistz256_sub_fromx,\@abi-omnipotent
2622
+ .align 32
2623
+ __ecp_nistz256_sub_fromx:
2624
+ xor $t4, $t4
2625
+ sbb 8*0($b_ptr), $a0
2626
+ sbb 8*1($b_ptr), $a1
2627
+ mov $a0, $t0
2628
+ sbb 8*2($b_ptr), $a2
2629
+ sbb 8*3($b_ptr), $a3
2630
+ mov $a1, $t1
2631
+ sbb \$0, $t4
2632
+
2633
+ xor $t3, $t3
2634
+ adc \$-1, $a0
2635
+ mov $a2, $t2
2636
+ adc $poly1, $a1
2637
+ adc \$0, $a2
2638
+ mov $a3, $t3
2639
+ adc $poly3, $a3
2640
+
2641
+ bt \$0, $t4
2642
+ cmovnc $t0, $a0
2643
+ cmovnc $t1, $a1
2644
+ mov $a0, 8*0($r_ptr)
2645
+ cmovnc $t2, $a2
2646
+ mov $a1, 8*1($r_ptr)
2647
+ cmovnc $t3, $a3
2648
+ mov $a2, 8*2($r_ptr)
2649
+ mov $a3, 8*3($r_ptr)
2650
+
2651
+ ret
2652
+ .size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
2653
+
2654
+ .type __ecp_nistz256_subx,\@abi-omnipotent
2655
+ .align 32
2656
+ __ecp_nistz256_subx:
2657
+ xor $t4, $t4
2658
+ sbb $a0, $t0
2659
+ sbb $a1, $t1
2660
+ mov $t0, $a0
2661
+ sbb $a2, $t2
2662
+ sbb $a3, $t3
2663
+ mov $t1, $a1
2664
+ sbb \$0, $t4
2665
+
2666
+ xor $a3, $a3
2667
+ adc \$-1, $t0
2668
+ mov $t2, $a2
2669
+ adc $poly1, $t1
2670
+ adc \$0, $t2
2671
+ mov $t3, $a3
2672
+ adc $poly3, $t3
2673
+
2674
+ bt \$0, $t4
2675
+ cmovc $t0, $a0
2676
+ cmovc $t1, $a1
2677
+ cmovc $t2, $a2
2678
+ cmovc $t3, $a3
2679
+
2680
+ ret
2681
+ .size __ecp_nistz256_subx,.-__ecp_nistz256_subx
2682
+
2683
+ .type __ecp_nistz256_mul_by_2x,\@abi-omnipotent
2684
+ .align 32
2685
+ __ecp_nistz256_mul_by_2x:
2686
+ xor $t4, $t4
2687
+ adc $a0, $a0 # a0:a3+a0:a3
2688
+ adc $a1, $a1
2689
+ mov $a0, $t0
2690
+ adc $a2, $a2
2691
+ adc $a3, $a3
2692
+ mov $a1, $t1
2693
+ adc \$0, $t4
2694
+
2695
+ xor $t3, $t3
2696
+ sbb \$-1, $a0
2697
+ mov $a2, $t2
2698
+ sbb $poly1, $a1
2699
+ sbb \$0, $a2
2700
+ mov $a3, $t3
2701
+ sbb $poly3, $a3
2702
+
2703
+ bt \$0, $t4
2704
+ cmovnc $t0, $a0
2705
+ cmovnc $t1, $a1
2706
+ mov $a0, 8*0($r_ptr)
2707
+ cmovnc $t2, $a2
2708
+ mov $a1, 8*1($r_ptr)
2709
+ cmovnc $t3, $a3
2710
+ mov $a2, 8*2($r_ptr)
2711
+ mov $a3, 8*3($r_ptr)
2712
+
2713
+ ret
2714
+ .size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
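The x-suffixed helpers compute the same field operations as their q counterparts but keep the decision bit in a register: the first chain leaves its carry (or borrow) in $t4, and `bt \$0, $t4` moves that bit back into CF so a cmovc/cmovnc pair can pick between the corrected and uncorrected copies. The branch-free select is equivalent to masking, roughly:

```c
#include <stdint.h>

/* Equivalent of the "bt $0, $t4 / cmovnc" tail: bit 0 of the saved flag
 * word chooses which of the two candidate limbs survives. */
static inline uint64_t select_by_carry(uint64_t t4, uint64_t if_carry,
                                       uint64_t if_no_carry) {
  uint64_t mask = (uint64_t)0 - (t4 & 1);  /* all-ones iff carry bit set */
  return (if_carry & mask) | (if_no_carry & ~mask);
}
```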
2715
+ ___
2716
+ }
2717
+ &gen_double("x");
2718
+ &gen_add("x");
2719
+ &gen_add_affine("x");
2720
+ }
2721
+ }}}
2722
+
2723
+ $code =~ s/\`([^\`]*)\`/eval $1/gem;
2724
+ print $code;
2725
+ close STDOUT;