ring-native 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (261)
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/Gemfile +3 -0
  4. data/README.md +22 -0
  5. data/Rakefile +1 -0
  6. data/ext/ring/extconf.rb +29 -0
  7. data/lib/ring/native.rb +8 -0
  8. data/lib/ring/native/version.rb +5 -0
  9. data/ring-native.gemspec +25 -0
  10. data/vendor/ring/BUILDING.md +40 -0
  11. data/vendor/ring/Cargo.toml +43 -0
  12. data/vendor/ring/LICENSE +185 -0
  13. data/vendor/ring/Makefile +35 -0
  14. data/vendor/ring/PORTING.md +163 -0
  15. data/vendor/ring/README.md +113 -0
  16. data/vendor/ring/STYLE.md +197 -0
  17. data/vendor/ring/appveyor.yml +27 -0
  18. data/vendor/ring/build.rs +108 -0
  19. data/vendor/ring/crypto/aes/aes.c +1142 -0
  20. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +25 -0
  21. data/vendor/ring/crypto/aes/aes_test.cc +93 -0
  22. data/vendor/ring/crypto/aes/asm/aes-586.pl +2368 -0
  23. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +1249 -0
  24. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +2246 -0
  25. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +1318 -0
  26. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +2084 -0
  27. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +675 -0
  28. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +1364 -0
  29. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +1565 -0
  30. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +841 -0
  31. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +1116 -0
  32. data/vendor/ring/crypto/aes/internal.h +87 -0
  33. data/vendor/ring/crypto/aes/mode_wrappers.c +61 -0
  34. data/vendor/ring/crypto/bn/add.c +394 -0
  35. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +694 -0
  36. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +1503 -0
  37. data/vendor/ring/crypto/bn/asm/bn-586.pl +774 -0
  38. data/vendor/ring/crypto/bn/asm/co-586.pl +287 -0
  39. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +1882 -0
  40. data/vendor/ring/crypto/bn/asm/x86-mont.pl +592 -0
  41. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +599 -0
  42. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +1393 -0
  43. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +3507 -0
  44. data/vendor/ring/crypto/bn/bn.c +352 -0
  45. data/vendor/ring/crypto/bn/bn_asn1.c +74 -0
  46. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +25 -0
  47. data/vendor/ring/crypto/bn/bn_test.cc +1696 -0
  48. data/vendor/ring/crypto/bn/cmp.c +200 -0
  49. data/vendor/ring/crypto/bn/convert.c +433 -0
  50. data/vendor/ring/crypto/bn/ctx.c +311 -0
  51. data/vendor/ring/crypto/bn/div.c +594 -0
  52. data/vendor/ring/crypto/bn/exponentiation.c +1335 -0
  53. data/vendor/ring/crypto/bn/gcd.c +711 -0
  54. data/vendor/ring/crypto/bn/generic.c +1019 -0
  55. data/vendor/ring/crypto/bn/internal.h +316 -0
  56. data/vendor/ring/crypto/bn/montgomery.c +516 -0
  57. data/vendor/ring/crypto/bn/mul.c +888 -0
  58. data/vendor/ring/crypto/bn/prime.c +829 -0
  59. data/vendor/ring/crypto/bn/random.c +334 -0
  60. data/vendor/ring/crypto/bn/rsaz_exp.c +262 -0
  61. data/vendor/ring/crypto/bn/rsaz_exp.h +53 -0
  62. data/vendor/ring/crypto/bn/shift.c +276 -0
  63. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +25 -0
  64. data/vendor/ring/crypto/bytestring/bytestring_test.cc +421 -0
  65. data/vendor/ring/crypto/bytestring/cbb.c +399 -0
  66. data/vendor/ring/crypto/bytestring/cbs.c +227 -0
  67. data/vendor/ring/crypto/bytestring/internal.h +46 -0
  68. data/vendor/ring/crypto/chacha/chacha_generic.c +140 -0
  69. data/vendor/ring/crypto/chacha/chacha_vec.c +323 -0
  70. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +1447 -0
  71. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +153 -0
  72. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +25 -0
  73. data/vendor/ring/crypto/cipher/e_aes.c +390 -0
  74. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +208 -0
  75. data/vendor/ring/crypto/cipher/internal.h +173 -0
  76. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +543 -0
  77. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +9 -0
  78. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +475 -0
  79. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +23 -0
  80. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +422 -0
  81. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +484 -0
  82. data/vendor/ring/crypto/cipher/test/cipher_test.txt +100 -0
  83. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +25 -0
  84. data/vendor/ring/crypto/constant_time_test.c +304 -0
  85. data/vendor/ring/crypto/cpu-arm-asm.S +32 -0
  86. data/vendor/ring/crypto/cpu-arm.c +199 -0
  87. data/vendor/ring/crypto/cpu-intel.c +261 -0
  88. data/vendor/ring/crypto/crypto.c +151 -0
  89. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +2118 -0
  90. data/vendor/ring/crypto/curve25519/curve25519.c +4888 -0
  91. data/vendor/ring/crypto/curve25519/x25519_test.cc +128 -0
  92. data/vendor/ring/crypto/digest/md32_common.h +181 -0
  93. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +2725 -0
  94. data/vendor/ring/crypto/ec/ec.c +193 -0
  95. data/vendor/ring/crypto/ec/ec_curves.c +61 -0
  96. data/vendor/ring/crypto/ec/ec_key.c +228 -0
  97. data/vendor/ring/crypto/ec/ec_montgomery.c +114 -0
  98. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +25 -0
  99. data/vendor/ring/crypto/ec/internal.h +243 -0
  100. data/vendor/ring/crypto/ec/oct.c +253 -0
  101. data/vendor/ring/crypto/ec/p256-64.c +1794 -0
  102. data/vendor/ring/crypto/ec/p256-x86_64-table.h +9548 -0
  103. data/vendor/ring/crypto/ec/p256-x86_64.c +509 -0
  104. data/vendor/ring/crypto/ec/simple.c +1007 -0
  105. data/vendor/ring/crypto/ec/util-64.c +183 -0
  106. data/vendor/ring/crypto/ec/wnaf.c +508 -0
  107. data/vendor/ring/crypto/ecdh/ecdh.c +155 -0
  108. data/vendor/ring/crypto/ecdsa/ecdsa.c +304 -0
  109. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +193 -0
  110. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +25 -0
  111. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +327 -0
  112. data/vendor/ring/crypto/header_removed.h +17 -0
  113. data/vendor/ring/crypto/internal.h +495 -0
  114. data/vendor/ring/crypto/libring.Windows.vcxproj +101 -0
  115. data/vendor/ring/crypto/mem.c +98 -0
  116. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +1045 -0
  117. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +517 -0
  118. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +1393 -0
  119. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +1741 -0
  120. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +422 -0
  121. data/vendor/ring/crypto/modes/ctr.c +226 -0
  122. data/vendor/ring/crypto/modes/gcm.c +1206 -0
  123. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +25 -0
  124. data/vendor/ring/crypto/modes/gcm_test.c +348 -0
  125. data/vendor/ring/crypto/modes/internal.h +299 -0
  126. data/vendor/ring/crypto/perlasm/arm-xlate.pl +170 -0
  127. data/vendor/ring/crypto/perlasm/readme +100 -0
  128. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +1164 -0
  129. data/vendor/ring/crypto/perlasm/x86asm.pl +292 -0
  130. data/vendor/ring/crypto/perlasm/x86gas.pl +263 -0
  131. data/vendor/ring/crypto/perlasm/x86masm.pl +200 -0
  132. data/vendor/ring/crypto/perlasm/x86nasm.pl +187 -0
  133. data/vendor/ring/crypto/poly1305/poly1305.c +331 -0
  134. data/vendor/ring/crypto/poly1305/poly1305_arm.c +301 -0
  135. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +2015 -0
  136. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +25 -0
  137. data/vendor/ring/crypto/poly1305/poly1305_test.cc +80 -0
  138. data/vendor/ring/crypto/poly1305/poly1305_test.txt +52 -0
  139. data/vendor/ring/crypto/poly1305/poly1305_vec.c +892 -0
  140. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +75 -0
  141. data/vendor/ring/crypto/rand/internal.h +32 -0
  142. data/vendor/ring/crypto/rand/rand.c +189 -0
  143. data/vendor/ring/crypto/rand/urandom.c +219 -0
  144. data/vendor/ring/crypto/rand/windows.c +56 -0
  145. data/vendor/ring/crypto/refcount_c11.c +66 -0
  146. data/vendor/ring/crypto/refcount_lock.c +53 -0
  147. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +25 -0
  148. data/vendor/ring/crypto/refcount_test.c +58 -0
  149. data/vendor/ring/crypto/rsa/blinding.c +462 -0
  150. data/vendor/ring/crypto/rsa/internal.h +108 -0
  151. data/vendor/ring/crypto/rsa/padding.c +300 -0
  152. data/vendor/ring/crypto/rsa/rsa.c +450 -0
  153. data/vendor/ring/crypto/rsa/rsa_asn1.c +261 -0
  154. data/vendor/ring/crypto/rsa/rsa_impl.c +944 -0
  155. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +25 -0
  156. data/vendor/ring/crypto/rsa/rsa_test.cc +437 -0
  157. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +436 -0
  158. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +2390 -0
  159. data/vendor/ring/crypto/sha/asm/sha256-586.pl +1275 -0
  160. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +735 -0
  161. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +14 -0
  162. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +14 -0
  163. data/vendor/ring/crypto/sha/asm/sha512-586.pl +911 -0
  164. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +666 -0
  165. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +14 -0
  166. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +14 -0
  167. data/vendor/ring/crypto/sha/sha1.c +271 -0
  168. data/vendor/ring/crypto/sha/sha256.c +204 -0
  169. data/vendor/ring/crypto/sha/sha512.c +355 -0
  170. data/vendor/ring/crypto/test/file_test.cc +326 -0
  171. data/vendor/ring/crypto/test/file_test.h +181 -0
  172. data/vendor/ring/crypto/test/malloc.cc +150 -0
  173. data/vendor/ring/crypto/test/scoped_types.h +95 -0
  174. data/vendor/ring/crypto/test/test.Windows.vcxproj +35 -0
  175. data/vendor/ring/crypto/test/test_util.cc +46 -0
  176. data/vendor/ring/crypto/test/test_util.h +41 -0
  177. data/vendor/ring/crypto/thread_none.c +55 -0
  178. data/vendor/ring/crypto/thread_pthread.c +165 -0
  179. data/vendor/ring/crypto/thread_test.Windows.vcxproj +25 -0
  180. data/vendor/ring/crypto/thread_test.c +200 -0
  181. data/vendor/ring/crypto/thread_win.c +282 -0
  182. data/vendor/ring/examples/checkdigest.rs +103 -0
  183. data/vendor/ring/include/openssl/aes.h +121 -0
  184. data/vendor/ring/include/openssl/arm_arch.h +129 -0
  185. data/vendor/ring/include/openssl/base.h +156 -0
  186. data/vendor/ring/include/openssl/bn.h +794 -0
  187. data/vendor/ring/include/openssl/buffer.h +18 -0
  188. data/vendor/ring/include/openssl/bytestring.h +235 -0
  189. data/vendor/ring/include/openssl/chacha.h +37 -0
  190. data/vendor/ring/include/openssl/cmac.h +76 -0
  191. data/vendor/ring/include/openssl/cpu.h +184 -0
  192. data/vendor/ring/include/openssl/crypto.h +43 -0
  193. data/vendor/ring/include/openssl/curve25519.h +88 -0
  194. data/vendor/ring/include/openssl/ec.h +225 -0
  195. data/vendor/ring/include/openssl/ec_key.h +129 -0
  196. data/vendor/ring/include/openssl/ecdh.h +110 -0
  197. data/vendor/ring/include/openssl/ecdsa.h +156 -0
  198. data/vendor/ring/include/openssl/err.h +201 -0
  199. data/vendor/ring/include/openssl/mem.h +101 -0
  200. data/vendor/ring/include/openssl/obj_mac.h +71 -0
  201. data/vendor/ring/include/openssl/opensslfeatures.h +68 -0
  202. data/vendor/ring/include/openssl/opensslv.h +18 -0
  203. data/vendor/ring/include/openssl/ossl_typ.h +18 -0
  204. data/vendor/ring/include/openssl/poly1305.h +51 -0
  205. data/vendor/ring/include/openssl/rand.h +70 -0
  206. data/vendor/ring/include/openssl/rsa.h +399 -0
  207. data/vendor/ring/include/openssl/thread.h +133 -0
  208. data/vendor/ring/include/openssl/type_check.h +71 -0
  209. data/vendor/ring/mk/Common.props +63 -0
  210. data/vendor/ring/mk/Windows.props +42 -0
  211. data/vendor/ring/mk/WindowsTest.props +18 -0
  212. data/vendor/ring/mk/appveyor.bat +62 -0
  213. data/vendor/ring/mk/bottom_of_makefile.mk +54 -0
  214. data/vendor/ring/mk/ring.mk +266 -0
  215. data/vendor/ring/mk/top_of_makefile.mk +214 -0
  216. data/vendor/ring/mk/travis.sh +40 -0
  217. data/vendor/ring/mk/update-travis-yml.py +229 -0
  218. data/vendor/ring/ring.sln +153 -0
  219. data/vendor/ring/src/aead.rs +682 -0
  220. data/vendor/ring/src/agreement.rs +248 -0
  221. data/vendor/ring/src/c.rs +129 -0
  222. data/vendor/ring/src/constant_time.rs +37 -0
  223. data/vendor/ring/src/der.rs +96 -0
  224. data/vendor/ring/src/digest.rs +690 -0
  225. data/vendor/ring/src/digest_tests.txt +57 -0
  226. data/vendor/ring/src/ecc.rs +28 -0
  227. data/vendor/ring/src/ecc_build.rs +279 -0
  228. data/vendor/ring/src/ecc_curves.rs +117 -0
  229. data/vendor/ring/src/ed25519_tests.txt +2579 -0
  230. data/vendor/ring/src/exe_tests.rs +46 -0
  231. data/vendor/ring/src/ffi.rs +29 -0
  232. data/vendor/ring/src/file_test.rs +187 -0
  233. data/vendor/ring/src/hkdf.rs +153 -0
  234. data/vendor/ring/src/hkdf_tests.txt +59 -0
  235. data/vendor/ring/src/hmac.rs +414 -0
  236. data/vendor/ring/src/hmac_tests.txt +97 -0
  237. data/vendor/ring/src/input.rs +312 -0
  238. data/vendor/ring/src/lib.rs +41 -0
  239. data/vendor/ring/src/pbkdf2.rs +265 -0
  240. data/vendor/ring/src/pbkdf2_tests.txt +113 -0
  241. data/vendor/ring/src/polyfill.rs +57 -0
  242. data/vendor/ring/src/rand.rs +28 -0
  243. data/vendor/ring/src/signature.rs +314 -0
  244. data/vendor/ring/third-party/NIST/README.md +9 -0
  245. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +263 -0
  246. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +309 -0
  247. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +267 -0
  248. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +263 -0
  249. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +309 -0
  250. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +267 -0
  251. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +263 -0
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +309 -0
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +267 -0
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +519 -0
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +309 -0
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +523 -0
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +519 -0
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +309 -0
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +523 -0
  260. data/vendor/ring/third-party/NIST/sha256sums.txt +1 -0
  261. metadata +333 -0
data/vendor/ring/crypto/curve25519/x25519_test.cc
@@ -0,0 +1,128 @@
+ /* Copyright (c) 2015, Google Inc.
+  *
+  * Permission to use, copy, modify, and/or distribute this software for any
+  * purpose with or without fee is hereby granted, provided that the above
+  * copyright notice and this permission notice appear in all copies.
+  *
+  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+  * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+  * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+  * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+ #include <stdint.h>
+ #include <stdio.h>
+ #include <string.h>
+
+ #include <openssl/curve25519.h>
+
+
+ static bool TestX25519() {
+   /* Taken from
+    * https://tools.ietf.org/html/draft-irtf-cfrg-curves-11#section-5.2 */
+   static const uint8_t kScalar1[32] = {
+       0xa5, 0x46, 0xe3, 0x6b, 0xf0, 0x52, 0x7c, 0x9d, 0x3b, 0x16, 0x15,
+       0x4b, 0x82, 0x46, 0x5e, 0xdd, 0x62, 0x14, 0x4c, 0x0a, 0xc1, 0xfc,
+       0x5a, 0x18, 0x50, 0x6a, 0x22, 0x44, 0xba, 0x44, 0x9a, 0xc4,
+   };
+   static const uint8_t kPoint1[32] = {
+       0xe6, 0xdb, 0x68, 0x67, 0x58, 0x30, 0x30, 0xdb, 0x35, 0x94, 0xc1,
+       0xa4, 0x24, 0xb1, 0x5f, 0x7c, 0x72, 0x66, 0x24, 0xec, 0x26, 0xb3,
+       0x35, 0x3b, 0x10, 0xa9, 0x03, 0xa6, 0xd0, 0xab, 0x1c, 0x4c,
+   };
+
+   uint8_t out[32];
+   X25519(out, kScalar1, kPoint1);
+
+   static const uint8_t kExpected1[32] = {
+       0xc3, 0xda, 0x55, 0x37, 0x9d, 0xe9, 0xc6, 0x90, 0x8e, 0x94, 0xea,
+       0x4d, 0xf2, 0x8d, 0x08, 0x4f, 0x32, 0xec, 0xcf, 0x03, 0x49, 0x1c,
+       0x71, 0xf7, 0x54, 0xb4, 0x07, 0x55, 0x77, 0xa2, 0x85, 0x52,
+   };
+   if (memcmp(kExpected1, out, sizeof(out)) != 0) {
+     fprintf(stderr, "X25519 test one failed.\n");
+     return false;
+   }
+
+   static const uint8_t kScalar2[32] = {
+       0x4b, 0x66, 0xe9, 0xd4, 0xd1, 0xb4, 0x67, 0x3c, 0x5a, 0xd2, 0x26,
+       0x91, 0x95, 0x7d, 0x6a, 0xf5, 0xc1, 0x1b, 0x64, 0x21, 0xe0, 0xea,
+       0x01, 0xd4, 0x2c, 0xa4, 0x16, 0x9e, 0x79, 0x18, 0xba, 0x0d,
+   };
+   static const uint8_t kPoint2[32] = {
+       0xe5, 0x21, 0x0f, 0x12, 0x78, 0x68, 0x11, 0xd3, 0xf4, 0xb7, 0x95,
+       0x9d, 0x05, 0x38, 0xae, 0x2c, 0x31, 0xdb, 0xe7, 0x10, 0x6f, 0xc0,
+       0x3c, 0x3e, 0xfc, 0x4c, 0xd5, 0x49, 0xc7, 0x15, 0xa4, 0x93,
+   };
+
+   X25519(out, kScalar2, kPoint2);
+
+   static const uint8_t kExpected2[32] = {
+       0x95, 0xcb, 0xde, 0x94, 0x76, 0xe8, 0x90, 0x7d, 0x7a, 0xad, 0xe4,
+       0x5c, 0xb4, 0xb8, 0x73, 0xf8, 0x8b, 0x59, 0x5a, 0x68, 0x79, 0x9f,
+       0xa1, 0x52, 0xe6, 0xf8, 0xf7, 0x64, 0x7a, 0xac, 0x79, 0x57,
+   };
+   if (memcmp(kExpected2, out, sizeof(out)) != 0) {
+     fprintf(stderr, "X25519 test two failed.\n");
+     return false;
+   }
+
+   return true;
+ }
+
+ static bool TestX25519SmallOrder() {
+   static const uint8_t kSmallOrderPoint[32] = {
+       0xe0, 0xeb, 0x7a, 0x7c, 0x3b, 0x41, 0xb8, 0xae, 0x16, 0x56, 0xe3,
+       0xfa, 0xf1, 0x9f, 0xc4, 0x6a, 0xda, 0x09, 0x8d, 0xeb, 0x9c, 0x32,
+       0xb1, 0xfd, 0x86, 0x62, 0x05, 0x16, 0x5f, 0x49, 0xb8,
+   };
+
+   uint8_t out[32], private_key[32];
+   memset(private_key, 0x11, sizeof(private_key));
+
+   if (X25519(out, private_key, kSmallOrderPoint)) {
+     fprintf(stderr, "X25519 returned success with a small-order input.\n");
+     return false;
+   }
+
+   return true;
+ }
+
+ static bool TestX25519Iterated() {
+   /* Taken from
+    * https://tools.ietf.org/html/draft-irtf-cfrg-curves-11#section-5.2 */
+   uint8_t scalar[32] = {9}, point[32] = {9}, out[32];
+
+   unsigned i;
+   for (i = 0; i < 1000; i++) {
+     X25519(out, scalar, point);
+     memcpy(point, scalar, sizeof(point));
+     memcpy(scalar, out, sizeof(scalar));
+   }
+
+   static const uint8_t kExpected[32] = {
+       0x68, 0x4c, 0xf5, 0x9b, 0xa8, 0x33, 0x09, 0x55, 0x28, 0x00, 0xef,
+       0x56, 0x6f, 0x2f, 0x4d, 0x3c, 0x1c, 0x38, 0x87, 0xc4, 0x93, 0x60,
+       0xe3, 0x87, 0x5f, 0x2e, 0xb9, 0x4d, 0x99, 0x53, 0x2c, 0x51,
+   };
+
+   if (memcmp(kExpected, scalar, sizeof(kExpected)) != 0) {
+     fprintf(stderr, "Iterated X25519 test failed\n");
+     return false;
+   }
+
+   return true;
+ }
+
+ int main(int argc, char **argv) {
+   if (!TestX25519() ||
+       !TestX25519Iterated() ||
+       !TestX25519SmallOrder()) {
+     return 1;
+   }
+
+   printf("PASS\n");
+   return 0;
+ }
data/vendor/ring/crypto/digest/md32_common.h
@@ -0,0 +1,181 @@
1
+ /* ====================================================================
2
+ * Copyright (c) 1999-2007 The OpenSSL Project. All rights reserved.
3
+ *
4
+ * Redistribution and use in source and binary forms, with or without
5
+ * modification, are permitted provided that the following conditions
6
+ * are met:
7
+ *
8
+ * 1. Redistributions of source code must retain the above copyright
9
+ * notice, this list of conditions and the following disclaimer.
10
+ *
11
+ * 2. Redistributions in binary form must reproduce the above copyright
12
+ * notice, this list of conditions and the following disclaimer in
13
+ * the documentation and/or other materials provided with the
14
+ * distribution.
15
+ *
16
+ * 3. All advertising materials mentioning features or use of this
17
+ * software must display the following acknowledgment:
18
+ * "This product includes software developed by the OpenSSL Project
19
+ * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
20
+ *
21
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22
+ * endorse or promote products derived from this software without
23
+ * prior written permission. For written permission, please contact
24
+ * licensing@OpenSSL.org.
25
+ *
26
+ * 5. Products derived from this software may not be called "OpenSSL"
27
+ * nor may "OpenSSL" appear in their names without prior written
28
+ * permission of the OpenSSL Project.
29
+ *
30
+ * 6. Redistributions of any form whatsoever must retain the following
31
+ * acknowledgment:
32
+ * "This product includes software developed by the OpenSSL Project
33
+ * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
34
+ *
35
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
39
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
47
+ * ==================================================================== */
48
+
49
+ #ifndef OPENSSL_HEADER_MD32_COMMON_H
50
+ #define OPENSSL_HEADER_MD32_COMMON_H
51
+
52
+ #include <openssl/base.h>
53
+
54
+
55
+ #if defined(__cplusplus)
56
+ extern "C" {
57
+ #endif
58
+
59
+ #define asm __asm__
60
+
61
+ /* One of |DATA_ORDER_IS_BIG_ENDIAN| or |DATA_ORDER_IS_LITTLE_ENDIAN| must be
62
+ * defined to specify the byte order of the input stream. */
63
+
64
+ #if !defined(DATA_ORDER_IS_BIG_ENDIAN) && !defined(DATA_ORDER_IS_LITTLE_ENDIAN)
65
+ #error "DATA_ORDER must be defined!"
66
+ #endif
67
+
68
+ /*
69
+ * Engage compiler specific rotate intrinsic function if available.
70
+ */
71
+ #undef ROTATE
72
+ # if defined(_MSC_VER)
73
+ # define ROTATE(a,n) _lrotl(a,n)
74
+ # elif defined(__ICC)
75
+ # define ROTATE(a,n) _rotl(a,n)
76
+ # elif defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM)
77
+ /*
78
+ * Some GNU C inline assembler templates. Note that these are
79
+ * rotates by *constant* number of bits! But that's exactly
80
+ * what we need here...
81
+ * <appro@fy.chalmers.se>
82
+ */
83
+ # if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
84
+ # define ROTATE(a,n) ({ register uint32_t ret; \
85
+ asm ( \
86
+ "roll %1,%0" \
87
+ : "=r"(ret) \
88
+ : "I"(n), "0"((uint32_t)(a)) \
89
+ : "cc"); \
90
+ ret; \
91
+ })
92
+ # endif /* OPENSSL_X86 || OPENSSL_X86_64 */
93
+ # endif /* COMPILER */
94
+
95
+ #ifndef ROTATE
96
+ #define ROTATE(a,n) (((a)<<(n))|(((a)&0xffffffff)>>(32-(n))))
97
+ #endif
98
+
99
+ #if defined(DATA_ORDER_IS_BIG_ENDIAN)
100
+
101
+ #ifndef PEDANTIC
102
+ # if defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM)
103
+ # if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
104
+ /*
105
+ * This gives ~30-40% performance improvement in SHA-256 compiled
106
+ * with gcc [on P4]. Well, first macro to be frank. We can pull
107
+ * this trick on x86* platforms only, because these CPUs can fetch
108
+ * unaligned data without raising an exception.
109
+ */
110
+ # define HOST_c2l(c,l) ({ uint32_t r=*((const uint32_t *)(c)); \
111
+ asm ("bswapl %0":"=r"(r):"0"(r)); \
112
+ (c)+=4; (l)=r; })
113
+ # define HOST_l2c(l,c) ({ uint32_t r=(l); \
114
+ asm ("bswapl %0":"=r"(r):"0"(r)); \
115
+ *((uint32_t *)(c))=r; (c)+=4; r; })
116
+ # elif defined(__aarch64__)
117
+ # if defined(__BYTE_ORDER__)
118
+ # if defined(__ORDER_LITTLE_ENDIAN__) && __BYTE_ORDER__==__ORDER_LITTLE_ENDIAN__
119
+ # define HOST_c2l(c,l) ({ uint32_t r; \
120
+ asm ("rev %w0,%w1" \
121
+ :"=r"(r) \
122
+ :"r"(*((const uint32_t *)(c))));\
123
+ (c)+=4; (l)=r; })
124
+ # define HOST_l2c(l,c) ({ uint32_t r; \
125
+ asm ("rev %w0,%w1" \
126
+ :"=r"(r) \
127
+ :"r"((uint32_t)(l))); \
128
+ *((uint32_t *)(c))=r; (c)+=4; r; })
129
+ # elif defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__==__ORDER_BIG_ENDIAN__
130
+ # define HOST_c2l(c,l) (void)((l)=*((const uint32_t *)(c)), (c)+=4)
131
+ # define HOST_l2c(l,c) (*((uint32_t *)(c))=(l), (c)+=4, (l))
132
+ # endif
133
+ # endif
134
+ # endif
135
+ # endif
136
+ #endif
137
+
138
+ #ifndef HOST_c2l
139
+ #define HOST_c2l(c,l) (void)(l =(((uint32_t)(*((c)++)))<<24), \
140
+ l|=(((uint32_t)(*((c)++)))<<16), \
141
+ l|=(((uint32_t)(*((c)++)))<< 8), \
142
+ l|=(((uint32_t)(*((c)++))) ))
143
+ #endif
144
+ #ifndef HOST_l2c
145
+ #define HOST_l2c(l,c) (*((c)++)=(uint8_t)(((l)>>24)&0xff), \
146
+ *((c)++)=(uint8_t)(((l)>>16)&0xff), \
147
+ *((c)++)=(uint8_t)(((l)>> 8)&0xff), \
148
+ *((c)++)=(uint8_t)(((l) )&0xff), \
149
+ l)
150
+ #endif
151
+
152
+ #elif defined(DATA_ORDER_IS_LITTLE_ENDIAN)
153
+
154
+ #if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
155
+ /* See comment in DATA_ORDER_IS_BIG_ENDIAN section. */
156
+ # define HOST_c2l(c,l) (void)((l)=*((const uint32_t *)(c)), (c)+=4)
157
+ # define HOST_l2c(l,c) (*((uint32_t *)(c))=(l), (c)+=4, l)
158
+ #endif
159
+
160
+ #ifndef HOST_c2l
161
+ #define HOST_c2l(c,l) (void)(l =(((uint32_t)(*((c)++))) ), \
162
+ l|=(((uint32_t)(*((c)++)))<< 8), \
163
+ l|=(((uint32_t)(*((c)++)))<<16), \
164
+ l|=(((uint32_t)(*((c)++)))<<24))
165
+ #endif
166
+ #ifndef HOST_l2c
167
+ #define HOST_l2c(l,c) (*((c)++)=(uint8_t)(((l) )&0xff), \
168
+ *((c)++)=(uint8_t)(((l)>> 8)&0xff), \
169
+ *((c)++)=(uint8_t)(((l)>>16)&0xff), \
170
+ *((c)++)=(uint8_t)(((l)>>24)&0xff), \
171
+ l)
172
+ #endif
173
+
174
+ #endif
175
+
176
+
177
+ #if defined(__cplusplus)
178
+ } /* extern C */
179
+ #endif
180
+
181
+ #endif /* OPENSSL_HEADER_MD32_COMMON_H */
data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl
@@ -0,0 +1,2725 @@
1
+ #!/usr/bin/env perl
2
+
3
+ # Copyright (c) 2014, Intel Corporation.
4
+ #
5
+ # Permission to use, copy, modify, and/or distribute this software for any
6
+ # purpose with or without fee is hereby granted, provided that the above
7
+ # copyright notice and this permission notice appear in all copies.
8
+ #
9
+ # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10
+ # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11
+ # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
12
+ # SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13
+ # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
14
+ # OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
15
+ # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16
+
17
+ # Developers and authors:
18
+ # Shay Gueron (1, 2), and Vlad Krasnov (1)
19
+ # (1) Intel Corporation, Israel Development Center
20
+ # (2) University of Haifa
21
+
22
+ # Reference:
23
+ # S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
24
+ # 256 Bit Primes"
25
+
26
+ # Further optimization by <appro@openssl.org>:
27
+ #
28
+ # this/original
29
+ # Opteron +12-49%
30
+ # Bulldozer +14-45%
31
+ # P4 +18-46%
32
+ # Westmere +12-34%
33
+ # Sandy Bridge +9-35%
34
+ # Ivy Bridge +9-35%
35
+ # Haswell +8-37%
36
+ # Broadwell +18-58%
37
+ # Atom +15-50%
38
+ # VIA Nano +43-160%
39
+ #
40
+ # Ranges denote minimum and maximum improvement coefficients depending
41
+ # on benchmark.
42
+
43
+ $flavour = shift;
44
+ $output = shift;
45
+ if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
46
+
47
+ $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
48
+
49
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
50
+ ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
51
+ ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
52
+ die "can't locate x86_64-xlate.pl";
53
+
54
+ open OUT,"| \"$^X\" $xlate $flavour $output";
55
+ *STDOUT=*OUT;
56
+
57
+ # TODO: enable these after testing. $avx goes to two and $addx to one.
58
+ $avx=0;
59
+ $addx=0;
60
+
61
+ $code.=<<___;
62
+ .text
63
+ .extern OPENSSL_ia32cap_P
64
+
65
+ # The polynomial
66
+ .align 64
67
+ .Lpoly:
68
+ .quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
69
+
70
+ .LOne:
71
+ .long 1,1,1,1,1,1,1,1
72
+ .LTwo:
73
+ .long 2,2,2,2,2,2,2,2
74
+ .LThree:
75
+ .long 3,3,3,3,3,3,3,3
76
+ .LONE_mont:
77
+ .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
78
+ ___
79
+
80
+ {
81
+ ################################################################################
82
+ # void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]);
83
+
84
+ my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
85
+ my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
86
+ my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
87
+
88
+ $code.=<<___;
89
+
90
+ .type ecp_nistz256_mul_by_2,\@function,2
91
+ .align 64
92
+ ecp_nistz256_mul_by_2:
93
+ push %r12
94
+ push %r13
95
+
96
+ mov 8*0($a_ptr), $a0
97
+ mov 8*1($a_ptr), $a1
98
+ add $a0, $a0 # a0:a3+a0:a3
99
+ mov 8*2($a_ptr), $a2
100
+ adc $a1, $a1
101
+ mov 8*3($a_ptr), $a3
102
+ lea .Lpoly(%rip), $a_ptr
103
+ mov $a0, $t0
104
+ adc $a2, $a2
105
+ adc $a3, $a3
106
+ mov $a1, $t1
107
+ sbb $t4, $t4
108
+
109
+ sub 8*0($a_ptr), $a0
110
+ mov $a2, $t2
111
+ sbb 8*1($a_ptr), $a1
112
+ sbb 8*2($a_ptr), $a2
113
+ mov $a3, $t3
114
+ sbb 8*3($a_ptr), $a3
115
+ test $t4, $t4
116
+
117
+ cmovz $t0, $a0
118
+ cmovz $t1, $a1
119
+ mov $a0, 8*0($r_ptr)
120
+ cmovz $t2, $a2
121
+ mov $a1, 8*1($r_ptr)
122
+ cmovz $t3, $a3
123
+ mov $a2, 8*2($r_ptr)
124
+ mov $a3, 8*3($r_ptr)
125
+
126
+ pop %r13
127
+ pop %r12
128
+ ret
129
+ .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
130
+
131
+ ################################################################################
132
+ # void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
133
+ .globl ecp_nistz256_neg
134
+ .type ecp_nistz256_neg,\@function,2
135
+ .align 32
136
+ ecp_nistz256_neg:
137
+ push %r12
138
+ push %r13
139
+
140
+ xor $a0, $a0
141
+ xor $a1, $a1
142
+ xor $a2, $a2
143
+ xor $a3, $a3
144
+ xor $t4, $t4
145
+
146
+ sub 8*0($a_ptr), $a0
147
+ sbb 8*1($a_ptr), $a1
148
+ sbb 8*2($a_ptr), $a2
149
+ mov $a0, $t0
150
+ sbb 8*3($a_ptr), $a3
151
+ lea .Lpoly(%rip), $a_ptr
152
+ mov $a1, $t1
153
+ sbb \$0, $t4
154
+
155
+ add 8*0($a_ptr), $a0
156
+ mov $a2, $t2
157
+ adc 8*1($a_ptr), $a1
158
+ adc 8*2($a_ptr), $a2
159
+ mov $a3, $t3
160
+ adc 8*3($a_ptr), $a3
161
+ test $t4, $t4
162
+
163
+ cmovz $t0, $a0
164
+ cmovz $t1, $a1
165
+ mov $a0, 8*0($r_ptr)
166
+ cmovz $t2, $a2
167
+ mov $a1, 8*1($r_ptr)
168
+ cmovz $t3, $a3
169
+ mov $a2, 8*2($r_ptr)
170
+ mov $a3, 8*3($r_ptr)
171
+
172
+ pop %r13
173
+ pop %r12
174
+ ret
175
+ .size ecp_nistz256_neg,.-ecp_nistz256_neg
176
+ ___
177
+ }
178
+ {
179
+ my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
180
+ my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
181
+ my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
182
+ my ($poly1,$poly3)=($acc6,$acc7);
183
+
184
+ $code.=<<___;
185
+ ################################################################################
186
+ # void ecp_nistz256_mul_mont(
187
+ # uint64_t res[4],
188
+ # uint64_t a[4],
189
+ # uint64_t b[4]);
190
+
191
+ .globl ecp_nistz256_mul_mont
192
+ .type ecp_nistz256_mul_mont,\@function,3
193
+ .align 32
194
+ ecp_nistz256_mul_mont:
195
+ ___
196
+ $code.=<<___ if ($addx);
197
+ mov \$0x80100, %ecx
198
+ and OPENSSL_ia32cap_P+8(%rip), %ecx
199
+ ___
200
+ $code.=<<___;
201
+ .Lmul_mont:
202
+ push %rbp
203
+ push %rbx
204
+ push %r12
205
+ push %r13
206
+ push %r14
207
+ push %r15
208
+ ___
209
+ $code.=<<___ if ($addx);
210
+ cmp \$0x80100, %ecx
211
+ je .Lmul_montx
212
+ ___
213
+ $code.=<<___;
214
+ mov $b_org, $b_ptr
215
+ mov 8*0($b_org), %rax
216
+ mov 8*0($a_ptr), $acc1
217
+ mov 8*1($a_ptr), $acc2
218
+ mov 8*2($a_ptr), $acc3
219
+ mov 8*3($a_ptr), $acc4
220
+
221
+ call __ecp_nistz256_mul_montq
222
+ ___
223
+ $code.=<<___ if ($addx);
224
+ jmp .Lmul_mont_done
225
+
226
+ .align 32
227
+ .Lmul_montx:
228
+ mov $b_org, $b_ptr
229
+ mov 8*0($b_org), %rdx
230
+ mov 8*0($a_ptr), $acc1
231
+ mov 8*1($a_ptr), $acc2
232
+ mov 8*2($a_ptr), $acc3
233
+ mov 8*3($a_ptr), $acc4
234
+ lea -128($a_ptr), $a_ptr # control u-op density
235
+
236
+ call __ecp_nistz256_mul_montx
237
+ ___
238
+ $code.=<<___;
239
+ .Lmul_mont_done:
240
+ pop %r15
241
+ pop %r14
242
+ pop %r13
243
+ pop %r12
244
+ pop %rbx
245
+ pop %rbp
246
+ ret
247
+ .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
248
+
249
+ .type __ecp_nistz256_mul_montq,\@abi-omnipotent
250
+ .align 32
251
+ __ecp_nistz256_mul_montq:
252
+ ########################################################################
253
+ # Multiply a by b[0]
254
+ mov %rax, $t1
255
+ mulq $acc1
256
+ mov .Lpoly+8*1(%rip),$poly1
257
+ mov %rax, $acc0
258
+ mov $t1, %rax
259
+ mov %rdx, $acc1
260
+
261
+ mulq $acc2
262
+ mov .Lpoly+8*3(%rip),$poly3
263
+ add %rax, $acc1
264
+ mov $t1, %rax
265
+ adc \$0, %rdx
266
+ mov %rdx, $acc2
267
+
268
+ mulq $acc3
269
+ add %rax, $acc2
270
+ mov $t1, %rax
271
+ adc \$0, %rdx
272
+ mov %rdx, $acc3
273
+
274
+ mulq $acc4
275
+ add %rax, $acc3
276
+ mov $acc0, %rax
277
+ adc \$0, %rdx
278
+ xor $acc5, $acc5
279
+ mov %rdx, $acc4
280
+
281
+ ########################################################################
282
+ # First reduction step
283
+ # Basically now we want to multiply acc[0] by p256,
284
+ # and add the result to the acc.
285
+ # Due to the special form of p256 we do some optimizations
286
+ #
287
+ # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
288
+ # then we add acc[0] and get acc[0] x 2^96
289
+
290
+ mov $acc0, $t1
291
+ shl \$32, $acc0
292
+ mulq $poly3
293
+ shr \$32, $t1
294
+ add $acc0, $acc1 # +=acc[0]<<96
295
+ adc $t1, $acc2
296
+ adc %rax, $acc3
297
+ mov 8*1($b_ptr), %rax
298
+ adc %rdx, $acc4
299
+ adc \$0, $acc5
300
+ xor $acc0, $acc0
301
+
302
+ ########################################################################
303
+ # Multiply by b[1]
304
+ mov %rax, $t1
305
+ mulq 8*0($a_ptr)
306
+ add %rax, $acc1
307
+ mov $t1, %rax
308
+ adc \$0, %rdx
309
+ mov %rdx, $t0
310
+
311
+ mulq 8*1($a_ptr)
312
+ add $t0, $acc2
313
+ adc \$0, %rdx
314
+ add %rax, $acc2
315
+ mov $t1, %rax
316
+ adc \$0, %rdx
317
+ mov %rdx, $t0
318
+
319
+ mulq 8*2($a_ptr)
320
+ add $t0, $acc3
321
+ adc \$0, %rdx
322
+ add %rax, $acc3
323
+ mov $t1, %rax
324
+ adc \$0, %rdx
325
+ mov %rdx, $t0
326
+
327
+ mulq 8*3($a_ptr)
328
+ add $t0, $acc4
329
+ adc \$0, %rdx
330
+ add %rax, $acc4
331
+ mov $acc1, %rax
332
+ adc %rdx, $acc5
333
+ adc \$0, $acc0
334
+
335
+ ########################################################################
336
+ # Second reduction step
337
+ mov $acc1, $t1
338
+ shl \$32, $acc1
339
+ mulq $poly3
340
+ shr \$32, $t1
341
+ add $acc1, $acc2
342
+ adc $t1, $acc3
343
+ adc %rax, $acc4
344
+ mov 8*2($b_ptr), %rax
345
+ adc %rdx, $acc5
346
+ adc \$0, $acc0
347
+ xor $acc1, $acc1
348
+
349
+ ########################################################################
350
+ # Multiply by b[2]
351
+ mov %rax, $t1
352
+ mulq 8*0($a_ptr)
353
+ add %rax, $acc2
354
+ mov $t1, %rax
355
+ adc \$0, %rdx
356
+ mov %rdx, $t0
357
+
358
+ mulq 8*1($a_ptr)
359
+ add $t0, $acc3
360
+ adc \$0, %rdx
361
+ add %rax, $acc3
362
+ mov $t1, %rax
363
+ adc \$0, %rdx
364
+ mov %rdx, $t0
365
+
366
+ mulq 8*2($a_ptr)
367
+ add $t0, $acc4
368
+ adc \$0, %rdx
369
+ add %rax, $acc4
370
+ mov $t1, %rax
371
+ adc \$0, %rdx
372
+ mov %rdx, $t0
373
+
374
+ mulq 8*3($a_ptr)
375
+ add $t0, $acc5
376
+ adc \$0, %rdx
377
+ add %rax, $acc5
378
+ mov $acc2, %rax
379
+ adc %rdx, $acc0
380
+ adc \$0, $acc1
381
+
382
+ ########################################################################
383
+ # Third reduction step
384
+ mov $acc2, $t1
385
+ shl \$32, $acc2
386
+ mulq $poly3
387
+ shr \$32, $t1
388
+ add $acc2, $acc3
389
+ adc $t1, $acc4
390
+ adc %rax, $acc5
391
+ mov 8*3($b_ptr), %rax
392
+ adc %rdx, $acc0
393
+ adc \$0, $acc1
394
+ xor $acc2, $acc2
395
+
396
+ ########################################################################
397
+ # Multiply by b[3]
398
+ mov %rax, $t1
399
+ mulq 8*0($a_ptr)
400
+ add %rax, $acc3
401
+ mov $t1, %rax
402
+ adc \$0, %rdx
403
+ mov %rdx, $t0
404
+
405
+ mulq 8*1($a_ptr)
406
+ add $t0, $acc4
407
+ adc \$0, %rdx
408
+ add %rax, $acc4
409
+ mov $t1, %rax
410
+ adc \$0, %rdx
411
+ mov %rdx, $t0
412
+
413
+ mulq 8*2($a_ptr)
414
+ add $t0, $acc5
415
+ adc \$0, %rdx
416
+ add %rax, $acc5
417
+ mov $t1, %rax
418
+ adc \$0, %rdx
419
+ mov %rdx, $t0
420
+
421
+ mulq 8*3($a_ptr)
422
+ add $t0, $acc0
423
+ adc \$0, %rdx
424
+ add %rax, $acc0
425
+ mov $acc3, %rax
426
+ adc %rdx, $acc1
427
+ adc \$0, $acc2
428
+
429
+ ########################################################################
430
+ # Final reduction step
431
+ mov $acc3, $t1
432
+ shl \$32, $acc3
433
+ mulq $poly3
434
+ shr \$32, $t1
435
+ add $acc3, $acc4
436
+ adc $t1, $acc5
437
+ mov $acc4, $t0
438
+ adc %rax, $acc0
439
+ adc %rdx, $acc1
440
+ mov $acc5, $t1
441
+ adc \$0, $acc2
442
+
443
+ ########################################################################
444
+ # Branch-less conditional subtraction of P
445
+ sub \$-1, $acc4 # .Lpoly[0]
446
+ mov $acc0, $t2
447
+ sbb $poly1, $acc5 # .Lpoly[1]
448
+ sbb \$0, $acc0 # .Lpoly[2]
449
+ mov $acc1, $t3
450
+ sbb $poly3, $acc1 # .Lpoly[3]
451
+ sbb \$0, $acc2
452
+
453
+ cmovc $t0, $acc4
454
+ cmovc $t1, $acc5
455
+ mov $acc4, 8*0($r_ptr)
456
+ cmovc $t2, $acc0
457
+ mov $acc5, 8*1($r_ptr)
458
+ cmovc $t3, $acc1
459
+ mov $acc0, 8*2($r_ptr)
460
+ mov $acc1, 8*3($r_ptr)
461
+
462
+ ret
463
+ .size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
464
+
465
+ ################################################################################
466
+ # void ecp_nistz256_sqr_mont(
467
+ # uint64_t res[4],
468
+ # uint64_t a[4]);
469
+
470
+ # we optimize the square according to S.Gueron and V.Krasnov,
471
+ # "Speeding up Big-Number Squaring"
472
+ .globl ecp_nistz256_sqr_mont
473
+ .type ecp_nistz256_sqr_mont,\@function,2
474
+ .align 32
475
+ ecp_nistz256_sqr_mont:
476
+ ___
477
+ $code.=<<___ if ($addx);
478
+ mov \$0x80100, %ecx
479
+ and OPENSSL_ia32cap_P+8(%rip), %ecx
480
+ ___
481
+ $code.=<<___;
482
+ push %rbp
483
+ push %rbx
484
+ push %r12
485
+ push %r13
486
+ push %r14
487
+ push %r15
488
+ ___
489
+ $code.=<<___ if ($addx);
490
+ cmp \$0x80100, %ecx
491
+ je .Lsqr_montx
492
+ ___
493
+ $code.=<<___;
494
+ mov 8*0($a_ptr), %rax
495
+ mov 8*1($a_ptr), $acc6
496
+ mov 8*2($a_ptr), $acc7
497
+ mov 8*3($a_ptr), $acc0
498
+
499
+ call __ecp_nistz256_sqr_montq
500
+ ___
501
+ $code.=<<___ if ($addx);
502
+ jmp .Lsqr_mont_done
503
+
504
+ .align 32
505
+ .Lsqr_montx:
506
+ mov 8*0($a_ptr), %rdx
507
+ mov 8*1($a_ptr), $acc6
508
+ mov 8*2($a_ptr), $acc7
509
+ mov 8*3($a_ptr), $acc0
510
+ lea -128($a_ptr), $a_ptr # control u-op density
511
+
512
+ call __ecp_nistz256_sqr_montx
513
+ ___
514
+ $code.=<<___;
515
+ .Lsqr_mont_done:
516
+ pop %r15
517
+ pop %r14
518
+ pop %r13
519
+ pop %r12
520
+ pop %rbx
521
+ pop %rbp
522
+ ret
523
+ .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
524
+
525
+ .type __ecp_nistz256_sqr_montq,\@abi-omnipotent
526
+ .align 32
527
+ __ecp_nistz256_sqr_montq:
528
+ mov %rax, $acc5
529
+ mulq $acc6 # a[1]*a[0]
530
+ mov %rax, $acc1
531
+ mov $acc7, %rax
532
+ mov %rdx, $acc2
533
+
534
+ mulq $acc5 # a[0]*a[2]
535
+ add %rax, $acc2
536
+ mov $acc0, %rax
537
+ adc \$0, %rdx
538
+ mov %rdx, $acc3
539
+
540
+ mulq $acc5 # a[0]*a[3]
541
+ add %rax, $acc3
542
+ mov $acc7, %rax
543
+ adc \$0, %rdx
544
+ mov %rdx, $acc4
545
+
546
+ #################################
547
+ mulq $acc6 # a[1]*a[2]
548
+ add %rax, $acc3
549
+ mov $acc0, %rax
550
+ adc \$0, %rdx
551
+ mov %rdx, $t1
552
+
553
+ mulq $acc6 # a[1]*a[3]
554
+ add %rax, $acc4
555
+ mov $acc0, %rax
556
+ adc \$0, %rdx
557
+ add $t1, $acc4
558
+ mov %rdx, $acc5
559
+ adc \$0, $acc5
560
+
561
+ #################################
562
+ mulq $acc7 # a[2]*a[3]
563
+ xor $acc7, $acc7
564
+ add %rax, $acc5
565
+ mov 8*0($a_ptr), %rax
566
+ mov %rdx, $acc6
567
+ adc \$0, $acc6
568
+
569
+ add $acc1, $acc1 # acc1:6<<1
570
+ adc $acc2, $acc2
571
+ adc $acc3, $acc3
572
+ adc $acc4, $acc4
573
+ adc $acc5, $acc5
574
+ adc $acc6, $acc6
575
+ adc \$0, $acc7
576
+
577
+ mulq %rax
578
+ mov %rax, $acc0
579
+ mov 8*1($a_ptr), %rax
580
+ mov %rdx, $t0
581
+
582
+ mulq %rax
583
+ add $t0, $acc1
584
+ adc %rax, $acc2
585
+ mov 8*2($a_ptr), %rax
586
+ adc \$0, %rdx
587
+ mov %rdx, $t0
588
+
589
+ mulq %rax
590
+ add $t0, $acc3
591
+ adc %rax, $acc4
592
+ mov 8*3($a_ptr), %rax
593
+ adc \$0, %rdx
594
+ mov %rdx, $t0
595
+
596
+ mulq %rax
597
+ add $t0, $acc5
598
+ adc %rax, $acc6
599
+ mov $acc0, %rax
600
+ adc %rdx, $acc7
601
+
602
+ mov .Lpoly+8*1(%rip), $a_ptr
603
+ mov .Lpoly+8*3(%rip), $t1
604
+
605
+ ##########################################
606
+ # Now the reduction
607
+ # First iteration
608
+ mov $acc0, $t0
609
+ shl \$32, $acc0
610
+ mulq $t1
611
+ shr \$32, $t0
612
+ add $acc0, $acc1 # +=acc[0]<<96
613
+ adc $t0, $acc2
614
+ adc %rax, $acc3
615
+ mov $acc1, %rax
616
+ adc \$0, %rdx
617
+
618
+ ##########################################
619
+ # Second iteration
620
+ mov $acc1, $t0
621
+ shl \$32, $acc1
622
+ mov %rdx, $acc0
623
+ mulq $t1
624
+ shr \$32, $t0
625
+ add $acc1, $acc2
626
+ adc $t0, $acc3
627
+ adc %rax, $acc0
628
+ mov $acc2, %rax
629
+ adc \$0, %rdx
630
+
631
+ ##########################################
632
+ # Third iteration
633
+ mov $acc2, $t0
634
+ shl \$32, $acc2
635
+ mov %rdx, $acc1
636
+ mulq $t1
637
+ shr \$32, $t0
638
+ add $acc2, $acc3
639
+ adc $t0, $acc0
640
+ adc %rax, $acc1
641
+ mov $acc3, %rax
642
+ adc \$0, %rdx
643
+
644
+ ###########################################
645
+ # Last iteration
646
+ mov $acc3, $t0
647
+ shl \$32, $acc3
648
+ mov %rdx, $acc2
649
+ mulq $t1
650
+ shr \$32, $t0
651
+ add $acc3, $acc0
652
+ adc $t0, $acc1
653
+ adc %rax, $acc2
654
+ adc \$0, %rdx
655
+ xor $acc3, $acc3
656
+
657
+ ############################################
658
+ # Add the rest of the acc
659
+ add $acc0, $acc4
660
+ adc $acc1, $acc5
661
+ mov $acc4, $acc0
662
+ adc $acc2, $acc6
663
+ adc %rdx, $acc7
664
+ mov $acc5, $acc1
665
+ adc \$0, $acc3
666
+
667
+ sub \$-1, $acc4 # .Lpoly[0]
668
+ mov $acc6, $acc2
669
+ sbb $a_ptr, $acc5 # .Lpoly[1]
670
+ sbb \$0, $acc6 # .Lpoly[2]
671
+ mov $acc7, $t0
672
+ sbb $t1, $acc7 # .Lpoly[3]
673
+ sbb \$0, $acc3
674
+
675
+ cmovc $acc0, $acc4
676
+ cmovc $acc1, $acc5
677
+ mov $acc4, 8*0($r_ptr)
678
+ cmovc $acc2, $acc6
679
+ mov $acc5, 8*1($r_ptr)
680
+ cmovc $t0, $acc7
681
+ mov $acc6, 8*2($r_ptr)
682
+ mov $acc7, 8*3($r_ptr)
683
+
684
+ ret
685
+ .size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
686
+ ___
687
+
688
+ if ($addx) {
689
+ $code.=<<___;
690
+ .type __ecp_nistz256_mul_montx,\@abi-omnipotent
691
+ .align 32
692
+ __ecp_nistz256_mul_montx:
693
+ ########################################################################
694
+ # Multiply by b[0]
695
+ mulx $acc1, $acc0, $acc1
696
+ mulx $acc2, $t0, $acc2
697
+ mov \$32, $poly1
698
+ xor $acc5, $acc5 # cf=0
699
+ mulx $acc3, $t1, $acc3
700
+ mov .Lpoly+8*3(%rip), $poly3
701
+ adc $t0, $acc1
702
+ mulx $acc4, $t0, $acc4
703
+ mov $acc0, %rdx
704
+ adc $t1, $acc2
705
+ shlx $poly1,$acc0,$t1
706
+ adc $t0, $acc3
707
+ shrx $poly1,$acc0,$t0
708
+ adc \$0, $acc4
709
+
710
+ ########################################################################
711
+ # First reduction step
712
+ add $t1, $acc1
713
+ adc $t0, $acc2
714
+
715
+ mulx $poly3, $t0, $t1
716
+ mov 8*1($b_ptr), %rdx
717
+ adc $t0, $acc3
718
+ adc $t1, $acc4
719
+ adc \$0, $acc5
720
+ xor $acc0, $acc0 # $acc0=0,cf=0,of=0
721
+
722
+ ########################################################################
723
+ # Multiply by b[1]
724
+ mulx 8*0+128($a_ptr), $t0, $t1
725
+ adcx $t0, $acc1
726
+ adox $t1, $acc2
727
+
728
+ mulx 8*1+128($a_ptr), $t0, $t1
729
+ adcx $t0, $acc2
730
+ adox $t1, $acc3
731
+
732
+ mulx 8*2+128($a_ptr), $t0, $t1
733
+ adcx $t0, $acc3
734
+ adox $t1, $acc4
735
+
736
+ mulx 8*3+128($a_ptr), $t0, $t1
737
+ mov $acc1, %rdx
738
+ adcx $t0, $acc4
739
+ shlx $poly1, $acc1, $t0
740
+ adox $t1, $acc5
741
+ shrx $poly1, $acc1, $t1
742
+
743
+ adcx $acc0, $acc5
744
+ adox $acc0, $acc0
745
+ adc \$0, $acc0
746
+
747
+ ########################################################################
748
+ # Second reduction step
749
+ add $t0, $acc2
750
+ adc $t1, $acc3
751
+
752
+ mulx $poly3, $t0, $t1
753
+ mov 8*2($b_ptr), %rdx
754
+ adc $t0, $acc4
755
+ adc $t1, $acc5
756
+ adc \$0, $acc0
757
+ xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0
758
+
759
+ ########################################################################
760
+ # Multiply by b[2]
761
+ mulx 8*0+128($a_ptr), $t0, $t1
762
+ adcx $t0, $acc2
763
+ adox $t1, $acc3
764
+
765
+ mulx 8*1+128($a_ptr), $t0, $t1
766
+ adcx $t0, $acc3
767
+ adox $t1, $acc4
768
+
769
+ mulx 8*2+128($a_ptr), $t0, $t1
770
+ adcx $t0, $acc4
771
+ adox $t1, $acc5
772
+
773
+ mulx 8*3+128($a_ptr), $t0, $t1
774
+ mov $acc2, %rdx
775
+ adcx $t0, $acc5
776
+ shlx $poly1, $acc2, $t0
777
+ adox $t1, $acc0
778
+ shrx $poly1, $acc2, $t1
779
+
780
+ adcx $acc1, $acc0
781
+ adox $acc1, $acc1
782
+ adc \$0, $acc1
783
+
784
+ ########################################################################
785
+ # Third reduction step
786
+ add $t0, $acc3
787
+ adc $t1, $acc4
788
+
789
+ mulx $poly3, $t0, $t1
790
+ mov 8*3($b_ptr), %rdx
791
+ adc $t0, $acc5
792
+ adc $t1, $acc0
793
+ adc \$0, $acc1
794
+ xor $acc2, $acc2 # $acc2=0,cf=0,of=0
795
+
796
+ ########################################################################
797
+ # Multiply by b[3]
798
+ mulx 8*0+128($a_ptr), $t0, $t1
799
+ adcx $t0, $acc3
800
+ adox $t1, $acc4
801
+
802
+ mulx 8*1+128($a_ptr), $t0, $t1
803
+ adcx $t0, $acc4
804
+ adox $t1, $acc5
805
+
806
+ mulx 8*2+128($a_ptr), $t0, $t1
807
+ adcx $t0, $acc5
808
+ adox $t1, $acc0
809
+
810
+ mulx 8*3+128($a_ptr), $t0, $t1
811
+ mov $acc3, %rdx
812
+ adcx $t0, $acc0
813
+ shlx $poly1, $acc3, $t0
814
+ adox $t1, $acc1
815
+ shrx $poly1, $acc3, $t1
816
+
817
+ adcx $acc2, $acc1
818
+ adox $acc2, $acc2
819
+ adc \$0, $acc2
820
+
821
+ ########################################################################
822
+ # Fourth reduction step
823
+ add $t0, $acc4
824
+ adc $t1, $acc5
825
+
826
+ mulx $poly3, $t0, $t1
827
+ mov $acc4, $t2
828
+ mov .Lpoly+8*1(%rip), $poly1
829
+ adc $t0, $acc0
830
+ mov $acc5, $t3
831
+ adc $t1, $acc1
832
+ adc \$0, $acc2
833
+
834
+ ########################################################################
835
+ # Branch-less conditional subtraction of P
836
+ xor %eax, %eax
837
+ mov $acc0, $t0
838
+ sbb \$-1, $acc4 # .Lpoly[0]
839
+ sbb $poly1, $acc5 # .Lpoly[1]
840
+ sbb \$0, $acc0 # .Lpoly[2]
841
+ mov $acc1, $t1
842
+ sbb $poly3, $acc1 # .Lpoly[3]
843
+ sbb \$0, $acc2
844
+
845
+ cmovc $t2, $acc4
846
+ cmovc $t3, $acc5
847
+ mov $acc4, 8*0($r_ptr)
848
+ cmovc $t0, $acc0
849
+ mov $acc5, 8*1($r_ptr)
850
+ cmovc $t1, $acc1
851
+ mov $acc0, 8*2($r_ptr)
852
+ mov $acc1, 8*3($r_ptr)
853
+
854
+ ret
855
+ .size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
856
+
857
+ .type __ecp_nistz256_sqr_montx,\@abi-omnipotent
858
+ .align 32
859
+ __ecp_nistz256_sqr_montx:
860
+ mulx $acc6, $acc1, $acc2 # a[0]*a[1]
861
+ mulx $acc7, $t0, $acc3 # a[0]*a[2]
862
+ xor %eax, %eax
863
+ adc $t0, $acc2
864
+ mulx $acc0, $t1, $acc4 # a[0]*a[3]
865
+ mov $acc6, %rdx
866
+ adc $t1, $acc3
867
+ adc \$0, $acc4
868
+ xor $acc5, $acc5 # $acc5=0,cf=0,of=0
869
+
870
+ #################################
871
+ mulx $acc7, $t0, $t1 # a[1]*a[2]
872
+ adcx $t0, $acc3
873
+ adox $t1, $acc4
874
+
875
+ mulx $acc0, $t0, $t1 # a[1]*a[3]
876
+ mov $acc7, %rdx
877
+ adcx $t0, $acc4
878
+ adox $t1, $acc5
879
+ adc \$0, $acc5
880
+
881
+ #################################
882
+ mulx $acc0, $t0, $acc6 # a[2]*a[3]
883
+ mov 8*0+128($a_ptr), %rdx
884
+ xor $acc7, $acc7 # $acc7=0,cf=0,of=0
885
+ adcx $acc1, $acc1 # acc1:6<<1
886
+ adox $t0, $acc5
887
+ adcx $acc2, $acc2
888
+ adox $acc7, $acc6 # of=0
889
+
890
+ mulx %rdx, $acc0, $t1
891
+ mov 8*1+128($a_ptr), %rdx
892
+ adcx $acc3, $acc3
893
+ adox $t1, $acc1
894
+ adcx $acc4, $acc4
895
+ mulx %rdx, $t0, $t4
896
+ mov 8*2+128($a_ptr), %rdx
897
+ adcx $acc5, $acc5
898
+ adox $t0, $acc2
899
+ adcx $acc6, $acc6
900
+ .byte 0x67
901
+ mulx %rdx, $t0, $t1
902
+ mov 8*3+128($a_ptr), %rdx
903
+ adox $t4, $acc3
904
+ adcx $acc7, $acc7
905
+ adox $t0, $acc4
906
+ mov \$32, $a_ptr
907
+ adox $t1, $acc5
908
+ .byte 0x67,0x67
909
+ mulx %rdx, $t0, $t4
910
+ mov $acc0, %rdx
911
+ adox $t0, $acc6
912
+ shlx $a_ptr, $acc0, $t0
913
+ adox $t4, $acc7
914
+ shrx $a_ptr, $acc0, $t4
915
+ mov .Lpoly+8*3(%rip), $t1
916
+
917
+ # reduction step 1
918
+ add $t0, $acc1
919
+ adc $t4, $acc2
920
+
921
+ mulx $t1, $t0, $acc0
922
+ mov $acc1, %rdx
923
+ adc $t0, $acc3
924
+ shlx $a_ptr, $acc1, $t0
925
+ adc \$0, $acc0
926
+ shrx $a_ptr, $acc1, $t4
927
+
928
+ # reduction step 2
929
+ add $t0, $acc2
930
+ adc $t4, $acc3
931
+
932
+ mulx $t1, $t0, $acc1
933
+ mov $acc2, %rdx
934
+ adc $t0, $acc0
935
+ shlx $a_ptr, $acc2, $t0
936
+ adc \$0, $acc1
937
+ shrx $a_ptr, $acc2, $t4
938
+
939
+ # reduction step 3
940
+ add $t0, $acc3
941
+ adc $t4, $acc0
942
+
943
+ mulx $t1, $t0, $acc2
944
+ mov $acc3, %rdx
945
+ adc $t0, $acc1
946
+ shlx $a_ptr, $acc3, $t0
947
+ adc \$0, $acc2
948
+ shrx $a_ptr, $acc3, $t4
949
+
950
+ # reduction step 4
951
+ add $t0, $acc0
952
+ adc $t4, $acc1
953
+
954
+ mulx $t1, $t0, $acc3
955
+ adc $t0, $acc2
956
+ adc \$0, $acc3
957
+
958
+ xor $t3, $t3 # cf=0
959
+ adc $acc0, $acc4 # accumulate upper half
960
+ mov .Lpoly+8*1(%rip), $a_ptr
961
+ adc $acc1, $acc5
962
+ mov $acc4, $acc0
963
+ adc $acc2, $acc6
964
+ adc $acc3, $acc7
965
+ mov $acc5, $acc1
966
+ adc \$0, $t3
967
+
968
+ xor %eax, %eax # cf=0
969
+ sbb \$-1, $acc4 # .Lpoly[0]
970
+ mov $acc6, $acc2
971
+ sbb $a_ptr, $acc5 # .Lpoly[1]
972
+ sbb \$0, $acc6 # .Lpoly[2]
973
+ mov $acc7, $acc3
974
+ sbb $t1, $acc7 # .Lpoly[3]
975
+ sbb \$0, $t3
976
+
977
+ cmovc $acc0, $acc4
978
+ cmovc $acc1, $acc5
979
+ mov $acc4, 8*0($r_ptr)
980
+ cmovc $acc2, $acc6
981
+ mov $acc5, 8*1($r_ptr)
982
+ cmovc $acc3, $acc7
983
+ mov $acc6, 8*2($r_ptr)
984
+ mov $acc7, 8*3($r_ptr)
985
+
986
+ ret
987
+ .size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
988
+ ___
989
+ }
990
+ }
991
+ {
992
+ my ($r_ptr,$in_ptr)=("%rdi","%rsi");
993
+ my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11));
994
+ my ($t0,$t1,$t2)=("%rcx","%r12","%r13");
995
+
996
+ $code.=<<___;
997
+ ################################################################################
998
+ # void ecp_nistz256_from_mont(
999
+ # uint64_t res[4],
1000
+ # uint64_t in[4]);
1001
+ # This one performs Montgomery multiplication by 1, so we only need the reduction
1002
+
1003
+ .globl ecp_nistz256_from_mont
1004
+ .type ecp_nistz256_from_mont,\@function,2
1005
+ .align 32
1006
+ ecp_nistz256_from_mont:
1007
+ push %r12
1008
+ push %r13
1009
+
1010
+ mov 8*0($in_ptr), %rax
1011
+ mov .Lpoly+8*3(%rip), $t2
1012
+ mov 8*1($in_ptr), $acc1
1013
+ mov 8*2($in_ptr), $acc2
1014
+ mov 8*3($in_ptr), $acc3
1015
+ mov %rax, $acc0
1016
+ mov .Lpoly+8*1(%rip), $t1
1017
+
1018
+ #########################################
1019
+ # First iteration
1020
+ mov %rax, $t0
1021
+ shl \$32, $acc0
1022
+ mulq $t2
1023
+ shr \$32, $t0
1024
+ add $acc0, $acc1
1025
+ adc $t0, $acc2
1026
+ adc %rax, $acc3
1027
+ mov $acc1, %rax
1028
+ adc \$0, %rdx
1029
+
1030
+ #########################################
1031
+ # Second iteration
1032
+ mov $acc1, $t0
1033
+ shl \$32, $acc1
1034
+ mov %rdx, $acc0
1035
+ mulq $t2
1036
+ shr \$32, $t0
1037
+ add $acc1, $acc2
1038
+ adc $t0, $acc3
1039
+ adc %rax, $acc0
1040
+ mov $acc2, %rax
1041
+ adc \$0, %rdx
1042
+
1043
+ ##########################################
1044
+ # Third iteration
1045
+ mov $acc2, $t0
1046
+ shl \$32, $acc2
1047
+ mov %rdx, $acc1
1048
+ mulq $t2
1049
+ shr \$32, $t0
1050
+ add $acc2, $acc3
1051
+ adc $t0, $acc0
1052
+ adc %rax, $acc1
1053
+ mov $acc3, %rax
1054
+ adc \$0, %rdx
1055
+
1056
+ ###########################################
1057
+ # Last iteration
1058
+ mov $acc3, $t0
1059
+ shl \$32, $acc3
1060
+ mov %rdx, $acc2
1061
+ mulq $t2
1062
+ shr \$32, $t0
1063
+ add $acc3, $acc0
1064
+ adc $t0, $acc1
1065
+ mov $acc0, $t0
1066
+ adc %rax, $acc2
1067
+ mov $acc1, $in_ptr
1068
+ adc \$0, %rdx
1069
+
1070
+ sub \$-1, $acc0
1071
+ mov $acc2, %rax
1072
+ sbb $t1, $acc1
1073
+ sbb \$0, $acc2
1074
+ mov %rdx, $acc3
1075
+ sbb $t2, %rdx
1076
+ sbb $t2, $t2
1077
+
1078
+ cmovnz $t0, $acc0
1079
+ cmovnz $in_ptr, $acc1
1080
+ mov $acc0, 8*0($r_ptr)
1081
+ cmovnz %rax, $acc2
1082
+ mov $acc1, 8*1($r_ptr)
1083
+ cmovz %rdx, $acc3
1084
+ mov $acc2, 8*2($r_ptr)
1085
+ mov $acc3, 8*3($r_ptr)
1086
+
1087
+ pop %r13
1088
+ pop %r12
1089
+ ret
1090
+ .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
1091
+ ___
1092
+ }
1093
+ {
1094
+ my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
1095
+ my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
1096
+ my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
1097
+ my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
1098
+
1099
+ $code.=<<___;
1100
+ ################################################################################
1101
+ # void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
1102
+ .globl ecp_nistz256_select_w5
1103
+ .type ecp_nistz256_select_w5,\@abi-omnipotent
1104
+ .align 32
1105
+ ecp_nistz256_select_w5:
1106
+ ___
1107
+ $code.=<<___ if ($avx>1);
1108
+ mov OPENSSL_ia32cap_P+8(%rip), %eax
1109
+ test \$`1<<5`, %eax
1110
+ jnz .Lavx2_select_w5
1111
+ ___
1112
+ $code.=<<___ if ($win64);
1113
+ lea -0x88(%rsp), %rax
1114
+ .LSEH_begin_ecp_nistz256_select_w5:
1115
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
1116
+ .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
1117
+ .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
1118
+ .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax)
1119
+ .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax)
1120
+ .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax)
1121
+ .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax)
1122
+ .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax)
1123
+ .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax)
1124
+ .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax)
1125
+ .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax)
1126
+ ___
1127
+ $code.=<<___;
1128
+ movdqa .LOne(%rip), $ONE
1129
+ movd $index, $INDEX
1130
+
1131
+ pxor $Ra, $Ra
1132
+ pxor $Rb, $Rb
1133
+ pxor $Rc, $Rc
1134
+ pxor $Rd, $Rd
1135
+ pxor $Re, $Re
1136
+ pxor $Rf, $Rf
1137
+
1138
+ movdqa $ONE, $M0
1139
+ pshufd \$0, $INDEX, $INDEX
1140
+
1141
+ mov \$16, %rax
1142
+ .Lselect_loop_sse_w5:
1143
+
1144
+ movdqa $M0, $TMP0
1145
+ paddd $ONE, $M0
1146
+ pcmpeqd $INDEX, $TMP0
1147
+
1148
+ movdqa 16*0($in_t), $T0a
1149
+ movdqa 16*1($in_t), $T0b
1150
+ movdqa 16*2($in_t), $T0c
1151
+ movdqa 16*3($in_t), $T0d
1152
+ movdqa 16*4($in_t), $T0e
1153
+ movdqa 16*5($in_t), $T0f
1154
+ lea 16*6($in_t), $in_t
1155
+
1156
+ pand $TMP0, $T0a
1157
+ pand $TMP0, $T0b
1158
+ por $T0a, $Ra
1159
+ pand $TMP0, $T0c
1160
+ por $T0b, $Rb
1161
+ pand $TMP0, $T0d
1162
+ por $T0c, $Rc
1163
+ pand $TMP0, $T0e
1164
+ por $T0d, $Rd
1165
+ pand $TMP0, $T0f
1166
+ por $T0e, $Re
1167
+ por $T0f, $Rf
1168
+
1169
+ dec %rax
1170
+ jnz .Lselect_loop_sse_w5
1171
+
1172
+ movdqu $Ra, 16*0($val)
1173
+ movdqu $Rb, 16*1($val)
1174
+ movdqu $Rc, 16*2($val)
1175
+ movdqu $Rd, 16*3($val)
1176
+ movdqu $Re, 16*4($val)
1177
+ movdqu $Rf, 16*5($val)
1178
+ ___
1179
+ $code.=<<___ if ($win64);
1180
+ movaps (%rsp), %xmm6
1181
+ movaps 0x10(%rsp), %xmm7
1182
+ movaps 0x20(%rsp), %xmm8
1183
+ movaps 0x30(%rsp), %xmm9
1184
+ movaps 0x40(%rsp), %xmm10
1185
+ movaps 0x50(%rsp), %xmm11
1186
+ movaps 0x60(%rsp), %xmm12
1187
+ movaps 0x70(%rsp), %xmm13
1188
+ movaps 0x80(%rsp), %xmm14
1189
+ movaps 0x90(%rsp), %xmm15
1190
+ lea 0xa8(%rsp), %rsp
1191
+ .LSEH_end_ecp_nistz256_select_w5:
1192
+ ___
1193
+ $code.=<<___;
1194
+ ret
1195
+ .size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
1196
+
1197
+ ################################################################################
1198
+ # void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
1199
+ .globl ecp_nistz256_select_w7
1200
+ .type ecp_nistz256_select_w7,\@abi-omnipotent
1201
+ .align 32
1202
+ ecp_nistz256_select_w7:
1203
+ ___
1204
+ $code.=<<___ if ($avx>1);
1205
+ mov OPENSSL_ia32cap_P+8(%rip), %eax
1206
+ test \$`1<<5`, %eax
1207
+ jnz .Lavx2_select_w7
1208
+ ___
1209
+ $code.=<<___ if ($win64);
1210
+ lea -0x88(%rsp), %rax
1211
+ .LSEH_begin_ecp_nistz256_select_w7:
1212
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
1213
+ .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
1214
+ .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
1215
+ .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax)
1216
+ .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax)
1217
+ .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax)
1218
+ .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax)
1219
+ .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax)
1220
+ .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax)
1221
+ .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax)
1222
+ .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax)
1223
+ ___
1224
+ $code.=<<___;
1225
+ movdqa .LOne(%rip), $M0
1226
+ movd $index, $INDEX
1227
+
1228
+ pxor $Ra, $Ra
1229
+ pxor $Rb, $Rb
1230
+ pxor $Rc, $Rc
1231
+ pxor $Rd, $Rd
1232
+
1233
+ movdqa $M0, $ONE
1234
+ pshufd \$0, $INDEX, $INDEX
1235
+ mov \$64, %rax
1236
+
1237
+ .Lselect_loop_sse_w7:
1238
+ movdqa $M0, $TMP0
1239
+ paddd $ONE, $M0
1240
+ movdqa 16*0($in_t), $T0a
1241
+ movdqa 16*1($in_t), $T0b
1242
+ pcmpeqd $INDEX, $TMP0
1243
+ movdqa 16*2($in_t), $T0c
1244
+ movdqa 16*3($in_t), $T0d
1245
+ lea 16*4($in_t), $in_t
1246
+
1247
+ pand $TMP0, $T0a
1248
+ pand $TMP0, $T0b
1249
+ por $T0a, $Ra
1250
+ pand $TMP0, $T0c
1251
+ por $T0b, $Rb
1252
+ pand $TMP0, $T0d
1253
+ por $T0c, $Rc
1254
+ prefetcht0 255($in_t)
1255
+ por $T0d, $Rd
1256
+
1257
+ dec %rax
1258
+ jnz .Lselect_loop_sse_w7
1259
+
1260
+ movdqu $Ra, 16*0($val)
1261
+ movdqu $Rb, 16*1($val)
1262
+ movdqu $Rc, 16*2($val)
1263
+ movdqu $Rd, 16*3($val)
1264
+ ___
1265
+ $code.=<<___ if ($win64);
1266
+ movaps (%rsp), %xmm6
1267
+ movaps 0x10(%rsp), %xmm7
1268
+ movaps 0x20(%rsp), %xmm8
1269
+ movaps 0x30(%rsp), %xmm9
1270
+ movaps 0x40(%rsp), %xmm10
1271
+ movaps 0x50(%rsp), %xmm11
1272
+ movaps 0x60(%rsp), %xmm12
1273
+ movaps 0x70(%rsp), %xmm13
1274
+ movaps 0x80(%rsp), %xmm14
1275
+ movaps 0x90(%rsp), %xmm15
1276
+ lea 0xa8(%rsp), %rsp
1277
+ .LSEH_end_ecp_nistz256_select_w7:
1278
+ ___
1279
+ $code.=<<___;
1280
+ ret
1281
+ .size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
1282
+ ___
1283
+ }
1284
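The two SSE routines above implement a constant-time table lookup: every pre-computed point is read, and pcmpeqd/pand/por masking keeps only the entry whose position equals the secret index, so the memory access pattern does not depend on that index. A minimal C sketch of the same data flow for the w5 case, with illustrative names and no claim that a compiler will keep it branch-free:

    #include <stdint.h>
    #include <string.h>

    /* Constant-time selection of entry `index` (1..16) from a table of 16
     * Jacobian points, each 12 limbs (x, y, z of 4 x 64 bits).  Mirrors the
     * compare/AND/OR pattern of ecp_nistz256_select_w5; sketch only. */
    static void select_w5_sketch(uint64_t val[12],
                                 const uint64_t in_t[16][12],
                                 uint32_t index) {
        uint64_t acc[12];
        memset(acc, 0, sizeof(acc));
        for (uint32_t i = 0; i < 16; i++) {
            /* all-ones iff this is the requested entry, all-zeros otherwise */
            uint64_t mask = 0 - (uint64_t)(i + 1 == index);
            for (int j = 0; j < 12; j++)
                acc[j] |= in_t[i][j] & mask;    /* pand + por */
        }
        memcpy(val, acc, sizeof(acc));
    }

The 96-byte stride in the assembly loop corresponds to the 12-limb entries assumed here.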
+ if ($avx>1) {
1285
+ my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
1286
+ my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4));
1287
+ my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9));
1288
+ my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));
1289
+
1290
+ $code.=<<___;
1291
+ ################################################################################
1292
+ # void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index);
1293
+ .type ecp_nistz256_avx2_select_w5,\@abi-omnipotent
1294
+ .align 32
1295
+ ecp_nistz256_avx2_select_w5:
1296
+ .Lavx2_select_w5:
1297
+ vzeroupper
1298
+ ___
1299
+ $code.=<<___ if ($win64);
1300
+ lea -0x88(%rsp), %rax
1301
+ .LSEH_begin_ecp_nistz256_avx2_select_w5:
1302
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
1303
+ .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax)
1304
+ .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax)
1305
+ .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8, 0(%rax)
1306
+ .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9, 0x10(%rax)
1307
+ .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10, 0x20(%rax)
1308
+ .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11, 0x30(%rax)
1309
+ .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12, 0x40(%rax)
1310
+ .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13, 0x50(%rax)
1311
+ .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14, 0x60(%rax)
1312
+ .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15, 0x70(%rax)
1313
+ ___
1314
+ $code.=<<___;
1315
+ vmovdqa .LTwo(%rip), $TWO
1316
+
1317
+ vpxor $Ra, $Ra, $Ra
1318
+ vpxor $Rb, $Rb, $Rb
1319
+ vpxor $Rc, $Rc, $Rc
1320
+
1321
+ vmovdqa .LOne(%rip), $M0
1322
+ vmovdqa .LTwo(%rip), $M1
1323
+
1324
+ vmovd $index, %xmm1
1325
+ vpermd $INDEX, $Ra, $INDEX
1326
+
1327
+ mov \$8, %rax
1328
+ .Lselect_loop_avx2_w5:
1329
+
1330
+ vmovdqa 32*0($in_t), $T0a
1331
+ vmovdqa 32*1($in_t), $T0b
1332
+ vmovdqa 32*2($in_t), $T0c
1333
+
1334
+ vmovdqa 32*3($in_t), $T1a
1335
+ vmovdqa 32*4($in_t), $T1b
1336
+ vmovdqa 32*5($in_t), $T1c
1337
+
1338
+ vpcmpeqd $INDEX, $M0, $TMP0
1339
+ vpcmpeqd $INDEX, $M1, $TMP1
1340
+
1341
+ vpaddd $TWO, $M0, $M0
1342
+ vpaddd $TWO, $M1, $M1
1343
+ lea 32*6($in_t), $in_t
1344
+
1345
+ vpand $TMP0, $T0a, $T0a
1346
+ vpand $TMP0, $T0b, $T0b
1347
+ vpand $TMP0, $T0c, $T0c
1348
+ vpand $TMP1, $T1a, $T1a
1349
+ vpand $TMP1, $T1b, $T1b
1350
+ vpand $TMP1, $T1c, $T1c
1351
+
1352
+ vpxor $T0a, $Ra, $Ra
1353
+ vpxor $T0b, $Rb, $Rb
1354
+ vpxor $T0c, $Rc, $Rc
1355
+ vpxor $T1a, $Ra, $Ra
1356
+ vpxor $T1b, $Rb, $Rb
1357
+ vpxor $T1c, $Rc, $Rc
1358
+
1359
+ dec %rax
1360
+ jnz .Lselect_loop_avx2_w5
1361
+
1362
+ vmovdqu $Ra, 32*0($val)
1363
+ vmovdqu $Rb, 32*1($val)
1364
+ vmovdqu $Rc, 32*2($val)
1365
+ vzeroupper
1366
+ ___
1367
+ $code.=<<___ if ($win64);
1368
+ movaps (%rsp), %xmm6
1369
+ movaps 0x10(%rsp), %xmm7
1370
+ movaps 0x20(%rsp), %xmm8
1371
+ movaps 0x30(%rsp), %xmm9
1372
+ movaps 0x40(%rsp), %xmm10
1373
+ movaps 0x50(%rsp), %xmm11
1374
+ movaps 0x60(%rsp), %xmm12
1375
+ movaps 0x70(%rsp), %xmm13
1376
+ movaps 0x80(%rsp), %xmm14
1377
+ movaps 0x90(%rsp), %xmm15
1378
+ lea 0xa8(%rsp), %rsp
1379
+ .LSEH_end_ecp_nistz256_avx2_select_w5:
1380
+ ___
1381
+ $code.=<<___;
1382
+ ret
1383
+ .size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
1384
+ ___
1385
+ }
1386
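The AVX2 variant covers the same 16-entry table in 8 iterations by testing two candidate positions per pass with independent masks and XOR-merging the masked entries; since at most one mask is ever non-zero, XOR- and OR-merging are equivalent. A rough C rendering of that unrolling, under the same illustrative naming as before:

    #include <stdint.h>
    #include <string.h>

    /* Same selection as above, two table entries per iteration, in the
     * spirit of ecp_nistz256_avx2_select_w5.  Illustrative sketch only. */
    static void select_w5_x2_sketch(uint64_t val[12],
                                    const uint64_t in_t[16][12],
                                    uint32_t index) {
        uint64_t acc[12];
        memset(acc, 0, sizeof(acc));
        for (uint32_t i = 0; i < 16; i += 2) {
            uint64_t m0 = 0 - (uint64_t)(i + 1 == index);
            uint64_t m1 = 0 - (uint64_t)(i + 2 == index);
            for (int j = 0; j < 12; j++)        /* vpand + vpxor */
                acc[j] ^= (in_t[i][j] & m0) ^ (in_t[i + 1][j] & m1);
        }
        memcpy(val, acc, sizeof(acc));
    }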
+ if ($avx>1) {
1387
+ my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
1388
+ my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3));
1389
+ my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7));
1390
+ my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11));
1391
+ my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));
1392
+
1393
+ $code.=<<___;
1394
+
1395
+ ################################################################################
1396
+ # void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index);
1397
+ .globl ecp_nistz256_avx2_select_w7
1398
+ .type ecp_nistz256_avx2_select_w7,\@abi-omnipotent
1399
+ .align 32
1400
+ ecp_nistz256_avx2_select_w7:
1401
+ .Lavx2_select_w7:
1402
+ vzeroupper
1403
+ ___
1404
+ $code.=<<___ if ($win64);
1405
+ lea -0x88(%rsp), %rax
1406
+ .LSEH_begin_ecp_nistz256_avx2_select_w7:
1407
+ .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
1408
+ .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax)
1409
+ .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax)
1410
+ .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8, 0(%rax)
1411
+ .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9, 0x10(%rax)
1412
+ .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10, 0x20(%rax)
1413
+ .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11, 0x30(%rax)
1414
+ .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12, 0x40(%rax)
1415
+ .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13, 0x50(%rax)
1416
+ .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14, 0x60(%rax)
1417
+ .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15, 0x70(%rax)
1418
+ ___
1419
+ $code.=<<___;
1420
+ vmovdqa .LThree(%rip), $THREE
1421
+
1422
+ vpxor $Ra, $Ra, $Ra
1423
+ vpxor $Rb, $Rb, $Rb
1424
+
1425
+ vmovdqa .LOne(%rip), $M0
1426
+ vmovdqa .LTwo(%rip), $M1
1427
+ vmovdqa .LThree(%rip), $M2
1428
+
1429
+ vmovd $index, %xmm1
1430
+ vpermd $INDEX, $Ra, $INDEX
1431
+ # Skip index = 0, because it is implicitly the point at infinity
1432
+
1433
+ mov \$21, %rax
1434
+ .Lselect_loop_avx2_w7:
1435
+
1436
+ vmovdqa 32*0($in_t), $T0a
1437
+ vmovdqa 32*1($in_t), $T0b
1438
+
1439
+ vmovdqa 32*2($in_t), $T1a
1440
+ vmovdqa 32*3($in_t), $T1b
1441
+
1442
+ vmovdqa 32*4($in_t), $T2a
1443
+ vmovdqa 32*5($in_t), $T2b
1444
+
1445
+ vpcmpeqd $INDEX, $M0, $TMP0
1446
+ vpcmpeqd $INDEX, $M1, $TMP1
1447
+ vpcmpeqd $INDEX, $M2, $TMP2
1448
+
1449
+ vpaddd $THREE, $M0, $M0
1450
+ vpaddd $THREE, $M1, $M1
1451
+ vpaddd $THREE, $M2, $M2
1452
+ lea 32*6($in_t), $in_t
1453
+
1454
+ vpand $TMP0, $T0a, $T0a
1455
+ vpand $TMP0, $T0b, $T0b
1456
+ vpand $TMP1, $T1a, $T1a
1457
+ vpand $TMP1, $T1b, $T1b
1458
+ vpand $TMP2, $T2a, $T2a
1459
+ vpand $TMP2, $T2b, $T2b
1460
+
1461
+ vpxor $T0a, $Ra, $Ra
1462
+ vpxor $T0b, $Rb, $Rb
1463
+ vpxor $T1a, $Ra, $Ra
1464
+ vpxor $T1b, $Rb, $Rb
1465
+ vpxor $T2a, $Ra, $Ra
1466
+ vpxor $T2b, $Rb, $Rb
1467
+
1468
+ dec %rax
1469
+ jnz .Lselect_loop_avx2_w7
1470
+
1471
+
1472
+ vmovdqa 32*0($in_t), $T0a
1473
+ vmovdqa 32*1($in_t), $T0b
1474
+
1475
+ vpcmpeqd $INDEX, $M0, $TMP0
1476
+
1477
+ vpand $TMP0, $T0a, $T0a
1478
+ vpand $TMP0, $T0b, $T0b
1479
+
1480
+ vpxor $T0a, $Ra, $Ra
1481
+ vpxor $T0b, $Rb, $Rb
1482
+
1483
+ vmovdqu $Ra, 32*0($val)
1484
+ vmovdqu $Rb, 32*1($val)
1485
+ vzeroupper
1486
+ ___
1487
+ $code.=<<___ if ($win64);
1488
+ movaps (%rsp), %xmm6
1489
+ movaps 0x10(%rsp), %xmm7
1490
+ movaps 0x20(%rsp), %xmm8
1491
+ movaps 0x30(%rsp), %xmm9
1492
+ movaps 0x40(%rsp), %xmm10
1493
+ movaps 0x50(%rsp), %xmm11
1494
+ movaps 0x60(%rsp), %xmm12
1495
+ movaps 0x70(%rsp), %xmm13
1496
+ movaps 0x80(%rsp), %xmm14
1497
+ movaps 0x90(%rsp), %xmm15
1498
+ lea 0xa8(%rsp), %rsp
1499
+ .LSEH_end_ecp_nistz256_avx2_select_w7:
1500
+ ___
1501
+ $code.=<<___;
1502
+ ret
1503
+ .size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
1504
+ ___
1505
+ } else {
1506
+ $code.=<<___;
1507
+ .globl ecp_nistz256_avx2_select_w7
1508
+ .type ecp_nistz256_avx2_select_w7,\@function,3
1509
+ .align 32
1510
+ ecp_nistz256_avx2_select_w7:
1511
+ .byte 0x0f,0x0b # ud2
1512
+ ret
1513
+ .size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
1514
+ ___
1515
+ }
1516
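ecp_nistz256_select_w7 applies the same masked scan to 64 affine entries of 64 bytes each. As the "Skip index = 0" comment above notes, a zero digit matches no counter value, so the accumulators stay zero and the caller treats the all-zero output as the point at infinity. A sketch under the same assumptions as the earlier examples:

    #include <stdint.h>
    #include <string.h>

    /* w7 lookup: 64 affine points of 8 limbs each; digit is in 0..64 and a
     * digit of 0 selects nothing, leaving the all-zero output that the
     * caller interprets as the point at infinity.  Sketch only. */
    static void select_w7_sketch(uint64_t val[8],
                                 const uint64_t in_t[64][8],
                                 uint32_t digit) {
        uint64_t acc[8];
        memset(acc, 0, sizeof(acc));
        for (uint32_t i = 0; i < 64; i++) {
            uint64_t mask = 0 - (uint64_t)(i + 1 == digit);
            for (int j = 0; j < 8; j++)
                acc[j] |= in_t[i][j] & mask;
        }
        memcpy(val, acc, sizeof(acc));
    }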
+ {{{
1517
+ ########################################################################
1518
+ # This block implements the higher-level point_double, point_add and
1519
+ # point_add_affine. The key to performance here is to let the
1520
+ # out-of-order execution logic overlap computations from the next step
1521
+ # with tail processing from the current step. By using a tailored calling
1522
+ # sequence we minimize inter-step overhead and give the processor a
1523
+ # better shot at overlapping operations...
1524
+ #
1525
+ # You will notice that the input data is copied to the stack. The trouble
1526
+ # is that there are no registers to spare for holding the original
1527
+ # pointers, and reloading those pointers would create undesired
1528
+ # dependencies on the effective-address calculation paths. In other
1529
+ # words, it is done this way to favour the out-of-order execution logic.
1530
+ # <appro@openssl.org>
1531
+
1532
+ my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
1533
+ my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
1534
+ my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
1535
+ my ($poly1,$poly3)=($acc6,$acc7);
1536
+
1537
+ sub load_for_mul () {
1538
+ my ($a,$b,$src0) = @_;
1539
+ my $bias = $src0 eq "%rax" ? 0 : -128;
1540
+
1541
+ " mov $b, $src0
1542
+ lea $b, $b_ptr
1543
+ mov 8*0+$a, $acc1
1544
+ mov 8*1+$a, $acc2
1545
+ lea $bias+$a, $a_ptr
1546
+ mov 8*2+$a, $acc3
1547
+ mov 8*3+$a, $acc4"
1548
+ }
1549
+
1550
+ sub load_for_sqr () {
1551
+ my ($a,$src0) = @_;
1552
+ my $bias = $src0 eq "%rax" ? 0 : -128;
1553
+
1554
+ " mov 8*0+$a, $src0
1555
+ mov 8*1+$a, $acc6
1556
+ lea $bias+$a, $a_ptr
1557
+ mov 8*2+$a, $acc7
1558
+ mov 8*3+$a, $acc0"
1559
+ }
1560
+
1561
+ {
1562
+ ########################################################################
1563
+ # operate in 4-5-0-1 "name space" that matches multiplication output
1564
+ #
1565
+ my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
1566
+
1567
+ $code.=<<___;
1568
+ .type __ecp_nistz256_add_toq,\@abi-omnipotent
1569
+ .align 32
1570
+ __ecp_nistz256_add_toq:
1571
+ add 8*0($b_ptr), $a0
1572
+ adc 8*1($b_ptr), $a1
1573
+ mov $a0, $t0
1574
+ adc 8*2($b_ptr), $a2
1575
+ adc 8*3($b_ptr), $a3
1576
+ mov $a1, $t1
1577
+ sbb $t4, $t4
1578
+
1579
+ sub \$-1, $a0
1580
+ mov $a2, $t2
1581
+ sbb $poly1, $a1
1582
+ sbb \$0, $a2
1583
+ mov $a3, $t3
1584
+ sbb $poly3, $a3
1585
+ test $t4, $t4
1586
+
1587
+ cmovz $t0, $a0
1588
+ cmovz $t1, $a1
1589
+ mov $a0, 8*0($r_ptr)
1590
+ cmovz $t2, $a2
1591
+ mov $a1, 8*1($r_ptr)
1592
+ cmovz $t3, $a3
1593
+ mov $a2, 8*2($r_ptr)
1594
+ mov $a3, 8*3($r_ptr)
1595
+
1596
+ ret
1597
+ .size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
1598
+
1599
+ .type __ecp_nistz256_sub_fromq,\@abi-omnipotent
1600
+ .align 32
1601
+ __ecp_nistz256_sub_fromq:
1602
+ sub 8*0($b_ptr), $a0
1603
+ sbb 8*1($b_ptr), $a1
1604
+ mov $a0, $t0
1605
+ sbb 8*2($b_ptr), $a2
1606
+ sbb 8*3($b_ptr), $a3
1607
+ mov $a1, $t1
1608
+ sbb $t4, $t4
1609
+
1610
+ add \$-1, $a0
1611
+ mov $a2, $t2
1612
+ adc $poly1, $a1
1613
+ adc \$0, $a2
1614
+ mov $a3, $t3
1615
+ adc $poly3, $a3
1616
+ test $t4, $t4
1617
+
1618
+ cmovz $t0, $a0
1619
+ cmovz $t1, $a1
1620
+ mov $a0, 8*0($r_ptr)
1621
+ cmovz $t2, $a2
1622
+ mov $a1, 8*1($r_ptr)
1623
+ cmovz $t3, $a3
1624
+ mov $a2, 8*2($r_ptr)
1625
+ mov $a3, 8*3($r_ptr)
1626
+
1627
+ ret
1628
+ .size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
1629
+
1630
+ .type __ecp_nistz256_subq,\@abi-omnipotent
1631
+ .align 32
1632
+ __ecp_nistz256_subq:
1633
+ sub $a0, $t0
1634
+ sbb $a1, $t1
1635
+ mov $t0, $a0
1636
+ sbb $a2, $t2
1637
+ sbb $a3, $t3
1638
+ mov $t1, $a1
1639
+ sbb $t4, $t4
1640
+
1641
+ add \$-1, $t0
1642
+ mov $t2, $a2
1643
+ adc $poly1, $t1
1644
+ adc \$0, $t2
1645
+ mov $t3, $a3
1646
+ adc $poly3, $t3
1647
+ test $t4, $t4
1648
+
1649
+ cmovnz $t0, $a0
1650
+ cmovnz $t1, $a1
1651
+ cmovnz $t2, $a2
1652
+ cmovnz $t3, $a3
1653
+
1654
+ ret
1655
+ .size __ecp_nistz256_subq,.-__ecp_nistz256_subq
1656
+
1657
+ .type __ecp_nistz256_mul_by_2q,\@abi-omnipotent
1658
+ .align 32
1659
+ __ecp_nistz256_mul_by_2q:
1660
+ add $a0, $a0 # a0:a3+a0:a3
1661
+ adc $a1, $a1
1662
+ mov $a0, $t0
1663
+ adc $a2, $a2
1664
+ adc $a3, $a3
1665
+ mov $a1, $t1
1666
+ sbb $t4, $t4
1667
+
1668
+ sub \$-1, $a0
1669
+ mov $a2, $t2
1670
+ sbb $poly1, $a1
1671
+ sbb \$0, $a2
1672
+ mov $a3, $t3
1673
+ sbb $poly3, $a3
1674
+ test $t4, $t4
1675
+
1676
+ cmovz $t0, $a0
1677
+ cmovz $t1, $a1
1678
+ mov $a0, 8*0($r_ptr)
1679
+ cmovz $t2, $a2
1680
+ mov $a1, 8*1($r_ptr)
1681
+ cmovz $t3, $a3
1682
+ mov $a2, 8*2($r_ptr)
1683
+ mov $a3, 8*3($r_ptr)
1684
+
1685
+ ret
1686
+ .size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
1687
+ ___
1688
+ }
1689
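The four helpers above perform field arithmetic modulo the P-256 prime without branches: they compute both the raw result and the result adjusted by p, then use cmov to keep the correct one. Below is a hedged C sketch of that select-with-a-mask idea for addition, using the P-256 prime limbs referenced via .Lpoly; it illustrates the pattern rather than reproducing the exact carry handling of __ecp_nistz256_add_toq, and a C compiler gives no constant-time guarantee.

    #include <stdint.h>

    /* p = 2^256 - 2^224 + 2^192 + 2^96 - 1, least-significant limb first. */
    static const uint64_t P256[4] = {
        0xffffffffffffffffULL, 0x00000000ffffffffULL,
        0x0000000000000000ULL, 0xffffffff00000001ULL,
    };

    /* Branch-free modular addition: add, tentatively subtract p, then pick
     * one of the two results with a mask.  Illustrative sketch only. */
    static void p256_add_sketch(uint64_t r[4],
                                const uint64_t a[4], const uint64_t b[4]) {
        uint64_t sum[4], red[4], carry = 0, borrow = 0;

        for (int i = 0; i < 4; i++) {            /* sum = a + b, with carry */
            unsigned __int128 t = (unsigned __int128)a[i] + b[i] + carry;
            sum[i] = (uint64_t)t;
            carry  = (uint64_t)(t >> 64);
        }
        for (int i = 0; i < 4; i++) {            /* red = sum - p, with borrow */
            uint64_t d = sum[i] - P256[i] - borrow;
            borrow = (sum[i] < P256[i]) | ((sum[i] == P256[i]) & borrow);
            red[i] = d;
        }
        /* Keep the reduced value iff the add overflowed 2^256 or sum >= p. */
        uint64_t mask = 0 - (uint64_t)(carry | (borrow ^ 1));
        for (int i = 0; i < 4; i++)
            r[i] = (red[i] & mask) | (sum[i] & ~mask);
    }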
+ sub gen_double () {
1690
+ my $x = shift;
1691
+ my ($src0,$sfx,$bias);
1692
+ my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
1693
+
1694
+ if ($x ne "x") {
1695
+ $src0 = "%rax";
1696
+ $sfx = "";
1697
+ $bias = 0;
1698
+
1699
+ $code.=<<___;
1700
+ .globl ecp_nistz256_point_double
1701
+ .type ecp_nistz256_point_double,\@function,2
1702
+ .align 32
1703
+ ecp_nistz256_point_double:
1704
+ ___
1705
+ $code.=<<___ if ($addx);
1706
+ mov \$0x80100, %ecx
1707
+ and OPENSSL_ia32cap_P+8(%rip), %ecx
1708
+ cmp \$0x80100, %ecx
1709
+ je .Lpoint_doublex
1710
+ ___
1711
+ } else {
1712
+ $src0 = "%rdx";
1713
+ $sfx = "x";
1714
+ $bias = 128;
1715
+
1716
+ $code.=<<___;
1717
+ .type ecp_nistz256_point_doublex,\@function,2
1718
+ .align 32
1719
+ ecp_nistz256_point_doublex:
1720
+ .Lpoint_doublex:
1721
+ ___
1722
+ }
1723
+ $code.=<<___;
1724
+ push %rbp
1725
+ push %rbx
1726
+ push %r12
1727
+ push %r13
1728
+ push %r14
1729
+ push %r15
1730
+ sub \$32*5+8, %rsp
1731
+
1732
+ movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x
1733
+ mov $a_ptr, $b_ptr # backup copy
1734
+ movdqu 0x10($a_ptr), %xmm1
1735
+ mov 0x20+8*0($a_ptr), $acc4 # load in_y in "5-4-0-1" order
1736
+ mov 0x20+8*1($a_ptr), $acc5
1737
+ mov 0x20+8*2($a_ptr), $acc0
1738
+ mov 0x20+8*3($a_ptr), $acc1
1739
+ mov .Lpoly+8*1(%rip), $poly1
1740
+ mov .Lpoly+8*3(%rip), $poly3
1741
+ movdqa %xmm0, $in_x(%rsp)
1742
+ movdqa %xmm1, $in_x+0x10(%rsp)
1743
+ lea 0x20($r_ptr), $acc2
1744
+ lea 0x40($r_ptr), $acc3
1745
+ movq $r_ptr, %xmm0
1746
+ movq $acc2, %xmm1
1747
+ movq $acc3, %xmm2
1748
+
1749
+ lea $S(%rsp), $r_ptr
1750
+ call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(S, in_y);
1751
+
1752
+ mov 0x40+8*0($a_ptr), $src0
1753
+ mov 0x40+8*1($a_ptr), $acc6
1754
+ mov 0x40+8*2($a_ptr), $acc7
1755
+ mov 0x40+8*3($a_ptr), $acc0
1756
+ lea 0x40-$bias($a_ptr), $a_ptr
1757
+ lea $Zsqr(%rsp), $r_ptr
1758
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Zsqr, in_z);
1759
+
1760
+ `&load_for_sqr("$S(%rsp)", "$src0")`
1761
+ lea $S(%rsp), $r_ptr
1762
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(S, S);
1763
+
1764
+ mov 0x20($b_ptr), $src0 # $b_ptr is still valid
1765
+ mov 0x40+8*0($b_ptr), $acc1
1766
+ mov 0x40+8*1($b_ptr), $acc2
1767
+ mov 0x40+8*2($b_ptr), $acc3
1768
+ mov 0x40+8*3($b_ptr), $acc4
1769
+ lea 0x40-$bias($b_ptr), $a_ptr
1770
+ lea 0x20($b_ptr), $b_ptr
1771
+ movq %xmm2, $r_ptr
1772
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, in_z, in_y);
1773
+ call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(res_z, res_z);
1774
+
1775
+ mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order
1776
+ mov $in_x+8*1(%rsp), $acc5
1777
+ lea $Zsqr(%rsp), $b_ptr
1778
+ mov $in_x+8*2(%rsp), $acc0
1779
+ mov $in_x+8*3(%rsp), $acc1
1780
+ lea $M(%rsp), $r_ptr
1781
+ call __ecp_nistz256_add_to$x # p256_add(M, in_x, Zsqr);
1782
+
1783
+ mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order
1784
+ mov $in_x+8*1(%rsp), $acc5
1785
+ lea $Zsqr(%rsp), $b_ptr
1786
+ mov $in_x+8*2(%rsp), $acc0
1787
+ mov $in_x+8*3(%rsp), $acc1
1788
+ lea $Zsqr(%rsp), $r_ptr
1789
+ call __ecp_nistz256_sub_from$x # p256_sub(Zsqr, in_x, Zsqr);
1790
+
1791
+ `&load_for_sqr("$S(%rsp)", "$src0")`
1792
+ movq %xmm1, $r_ptr
1793
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S);
1794
+ ___
1795
+ {
1796
+ ######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
1797
+ # operate in 4-5-6-7 "name space" that matches squaring output
1798
+ #
1799
+ my ($poly1,$poly3)=($a_ptr,$t1);
1800
+ my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);
1801
+
1802
+ $code.=<<___;
1803
+ xor $t4, $t4
1804
+ mov $a0, $t0
1805
+ add \$-1, $a0
1806
+ mov $a1, $t1
1807
+ adc $poly1, $a1
1808
+ mov $a2, $t2
1809
+ adc \$0, $a2
1810
+ mov $a3, $t3
1811
+ adc $poly3, $a3
1812
+ adc \$0, $t4
1813
+ xor $a_ptr, $a_ptr # borrow $a_ptr
1814
+ test \$1, $t0
1815
+
1816
+ cmovz $t0, $a0
1817
+ cmovz $t1, $a1
1818
+ cmovz $t2, $a2
1819
+ cmovz $t3, $a3
1820
+ cmovz $a_ptr, $t4
1821
+
1822
+ mov $a1, $t0 # a0:a3>>1
1823
+ shr \$1, $a0
1824
+ shl \$63, $t0
1825
+ mov $a2, $t1
1826
+ shr \$1, $a1
1827
+ or $t0, $a0
1828
+ shl \$63, $t1
1829
+ mov $a3, $t2
1830
+ shr \$1, $a2
1831
+ or $t1, $a1
1832
+ shl \$63, $t2
1833
+ mov $a0, 8*0($r_ptr)
1834
+ shr \$1, $a3
1835
+ mov $a1, 8*1($r_ptr)
1836
+ shl \$63, $t4
1837
+ or $t2, $a2
1838
+ or $t4, $a3
1839
+ mov $a2, 8*2($r_ptr)
1840
+ mov $a3, 8*3($r_ptr)
1841
+ ___
1842
+ }
1843
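The inline block above halves res_y modulo p: if the value is odd it first adds p (making the 257-bit intermediate even), then shifts the whole thing right by one limb-wise, feeding the add's carry into the top bit. A small C sketch of the same computation, illustrative only:

    #include <stdint.h>

    /* Halving modulo the P-256 prime, mirroring the inlined div_by_2 code. */
    static void p256_div_by_2_sketch(uint64_t r[4], const uint64_t a[4]) {
        static const uint64_t P256[4] = {
            0xffffffffffffffffULL, 0x00000000ffffffffULL,
            0x0000000000000000ULL, 0xffffffff00000001ULL,
        };
        uint64_t mask = 0 - (a[0] & 1);          /* add p only if a is odd */
        uint64_t t[4], carry = 0;

        for (int i = 0; i < 4; i++) {            /* t = a + (a odd ? p : 0) */
            unsigned __int128 s =
                (unsigned __int128)a[i] + (P256[i] & mask) + carry;
            t[i] = (uint64_t)s;
            carry = (uint64_t)(s >> 64);
        }
        for (int i = 0; i < 3; i++)              /* shift right by one bit */
            r[i] = (t[i] >> 1) | (t[i + 1] << 63);
        r[3] = (t[3] >> 1) | (carry << 63);      /* carry becomes the top bit */
    }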
+ $code.=<<___;
1844
+ `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
1845
+ lea $M(%rsp), $r_ptr
1846
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(M, M, Zsqr);
1847
+
1848
+ lea $tmp0(%rsp), $r_ptr
1849
+ call __ecp_nistz256_mul_by_2$x
1850
+
1851
+ lea $M(%rsp), $b_ptr
1852
+ lea $M(%rsp), $r_ptr
1853
+ call __ecp_nistz256_add_to$x # p256_mul_by_3(M, M);
1854
+
1855
+ `&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
1856
+ lea $S(%rsp), $r_ptr
1857
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, in_x);
1858
+
1859
+ lea $tmp0(%rsp), $r_ptr
1860
+ call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, S);
1861
+
1862
+ `&load_for_sqr("$M(%rsp)", "$src0")`
1863
+ movq %xmm0, $r_ptr
1864
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_x, M);
1865
+
1866
+ lea $tmp0(%rsp), $b_ptr
1867
+ mov $acc6, $acc0 # harmonize sqr output and sub input
1868
+ mov $acc7, $acc1
1869
+ mov $a_ptr, $poly1
1870
+ mov $t1, $poly3
1871
+ call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, tmp0);
1872
+
1873
+ mov $S+8*0(%rsp), $t0
1874
+ mov $S+8*1(%rsp), $t1
1875
+ mov $S+8*2(%rsp), $t2
1876
+ mov $S+8*3(%rsp), $acc2 # "4-5-0-1" order
1877
+ lea $S(%rsp), $r_ptr
1878
+ call __ecp_nistz256_sub$x # p256_sub(S, S, res_x);
1879
+
1880
+ mov $M(%rsp), $src0
1881
+ lea $M(%rsp), $b_ptr
1882
+ mov $acc4, $acc6 # harmonize sub output and mul input
1883
+ xor %ecx, %ecx
1884
+ mov $acc4, $S+8*0(%rsp) # have to save:-(
1885
+ mov $acc5, $acc2
1886
+ mov $acc5, $S+8*1(%rsp)
1887
+ cmovz $acc0, $acc3
1888
+ mov $acc0, $S+8*2(%rsp)
1889
+ lea $S-$bias(%rsp), $a_ptr
1890
+ cmovz $acc1, $acc4
1891
+ mov $acc1, $S+8*3(%rsp)
1892
+ mov $acc6, $acc1
1893
+ lea $S(%rsp), $r_ptr
1894
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, M);
1895
+
1896
+ movq %xmm1, $b_ptr
1897
+ movq %xmm1, $r_ptr
1898
+ call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y);
1899
+
1900
+ add \$32*5+8, %rsp
1901
+ pop %r15
1902
+ pop %r14
1903
+ pop %r13
1904
+ pop %r12
1905
+ pop %rbx
1906
+ pop %rbp
1907
+ ret
1908
+ .size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
1909
+ ___
1910
+ }
1911
+ &gen_double("q");
1912
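Collecting the p256_* comments that gen_double emits, the generated code follows the usual Jacobian doubling formulas for the P-256 curve, with all arithmetic kept in the Montgomery domain modulo p:

    \begin{aligned}
    M   &= 3\,(X - Z^2)(X + Z^2), \qquad S = 4 X Y^2,\\
    X_3 &= M^2 - 2S,\\
    Y_3 &= M\,(S - X_3) - 8 Y^4,\\
    Z_3 &= 2 Y Z .
    \end{aligned}

In the code, S is built as in_x * (2Y)^2 and 8Y^4 as ((2Y)^2)^2 / 2, which is exactly what the inlined div_by_2 block computes.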
+
1913
+ sub gen_add () {
1914
+ my $x = shift;
1915
+ my ($src0,$sfx,$bias);
1916
+ my ($H,$Hsqr,$R,$Rsqr,$Hcub,
1917
+ $U1,$U2,$S1,$S2,
1918
+ $res_x,$res_y,$res_z,
1919
+ $in1_x,$in1_y,$in1_z,
1920
+ $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
1921
+ my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
1922
+
1923
+ if ($x ne "x") {
1924
+ $src0 = "%rax";
1925
+ $sfx = "";
1926
+ $bias = 0;
1927
+
1928
+ $code.=<<___;
1929
+ .globl ecp_nistz256_point_add
1930
+ .type ecp_nistz256_point_add,\@function,3
1931
+ .align 32
1932
+ ecp_nistz256_point_add:
1933
+ ___
1934
+ $code.=<<___ if ($addx);
1935
+ mov \$0x80100, %ecx
1936
+ and OPENSSL_ia32cap_P+8(%rip), %ecx
1937
+ cmp \$0x80100, %ecx
1938
+ je .Lpoint_addx
1939
+ ___
1940
+ } else {
1941
+ $src0 = "%rdx";
1942
+ $sfx = "x";
1943
+ $bias = 128;
1944
+
1945
+ $code.=<<___;
1946
+ .type ecp_nistz256_point_addx,\@function,3
1947
+ .align 32
1948
+ ecp_nistz256_point_addx:
1949
+ .Lpoint_addx:
1950
+ ___
1951
+ }
1952
+ $code.=<<___;
1953
+ push %rbp
1954
+ push %rbx
1955
+ push %r12
1956
+ push %r13
1957
+ push %r14
1958
+ push %r15
1959
+ sub \$32*18+8, %rsp
1960
+
1961
+ movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr
1962
+ movdqu 0x10($a_ptr), %xmm1
1963
+ movdqu 0x20($a_ptr), %xmm2
1964
+ movdqu 0x30($a_ptr), %xmm3
1965
+ movdqu 0x40($a_ptr), %xmm4
1966
+ movdqu 0x50($a_ptr), %xmm5
1967
+ mov $a_ptr, $b_ptr # reassign
1968
+ mov $b_org, $a_ptr # reassign
1969
+ movdqa %xmm0, $in1_x(%rsp)
1970
+ movdqa %xmm1, $in1_x+0x10(%rsp)
1971
+ por %xmm0, %xmm1
1972
+ movdqa %xmm2, $in1_y(%rsp)
1973
+ movdqa %xmm3, $in1_y+0x10(%rsp)
1974
+ por %xmm2, %xmm3
1975
+ movdqa %xmm4, $in1_z(%rsp)
1976
+ movdqa %xmm5, $in1_z+0x10(%rsp)
1977
+ por %xmm1, %xmm3
1978
+
1979
+ movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr
1980
+ pshufd \$0xb1, %xmm3, %xmm5
1981
+ movdqu 0x10($a_ptr), %xmm1
1982
+ movdqu 0x20($a_ptr), %xmm2
1983
+ por %xmm3, %xmm5
1984
+ movdqu 0x30($a_ptr), %xmm3
1985
+ mov 0x40+8*0($a_ptr), $src0 # load original in2_z
1986
+ mov 0x40+8*1($a_ptr), $acc6
1987
+ mov 0x40+8*2($a_ptr), $acc7
1988
+ mov 0x40+8*3($a_ptr), $acc0
1989
+ movdqa %xmm0, $in2_x(%rsp)
1990
+ pshufd \$0x1e, %xmm5, %xmm4
1991
+ movdqa %xmm1, $in2_x+0x10(%rsp)
1992
+ por %xmm0, %xmm1
1993
+ movq $r_ptr, %xmm0 # save $r_ptr
1994
+ movdqa %xmm2, $in2_y(%rsp)
1995
+ movdqa %xmm3, $in2_y+0x10(%rsp)
1996
+ por %xmm2, %xmm3
1997
+ por %xmm4, %xmm5
1998
+ pxor %xmm4, %xmm4
1999
+ por %xmm1, %xmm3
2000
+
2001
+ lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid
2002
+ mov $src0, $in2_z+8*0(%rsp) # make in2_z copy
2003
+ mov $acc6, $in2_z+8*1(%rsp)
2004
+ mov $acc7, $in2_z+8*2(%rsp)
2005
+ mov $acc0, $in2_z+8*3(%rsp)
2006
+ lea $Z2sqr(%rsp), $r_ptr # Z2^2
2007
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z);
2008
+
2009
+ pcmpeqd %xmm4, %xmm5
2010
+ pshufd \$0xb1, %xmm3, %xmm4
2011
+ por %xmm3, %xmm4
2012
+ pshufd \$0, %xmm5, %xmm5 # in1infty
2013
+ pshufd \$0x1e, %xmm4, %xmm3
2014
+ por %xmm3, %xmm4
2015
+ pxor %xmm3, %xmm3
2016
+ pcmpeqd %xmm3, %xmm4
2017
+ pshufd \$0, %xmm4, %xmm4 # in2infty
2018
+ mov 0x40+8*0($b_ptr), $src0 # load original in1_z
2019
+ mov 0x40+8*1($b_ptr), $acc6
2020
+ mov 0x40+8*2($b_ptr), $acc7
2021
+ mov 0x40+8*3($b_ptr), $acc0
2022
+
2023
+ lea 0x40-$bias($b_ptr), $a_ptr
2024
+ lea $Z1sqr(%rsp), $r_ptr # Z1^2
2025
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z);
2026
+
2027
+ `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
2028
+ lea $S1(%rsp), $r_ptr # S1 = Z2^3
2029
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z);
2030
+
2031
+ `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
2032
+ lea $S2(%rsp), $r_ptr # S2 = Z1^3
2033
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z);
2034
+
2035
+ `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
2036
+ lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3
2037
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y);
2038
+
2039
+ `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
2040
+ lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3
2041
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y);
2042
+
2043
+ lea $S1(%rsp), $b_ptr
2044
+ lea $R(%rsp), $r_ptr # R = S2 - S1
2045
+ call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1);
2046
+
2047
+ or $acc5, $acc4 # see if result is zero
2048
+ movdqa %xmm4, %xmm2
2049
+ or $acc0, $acc4
2050
+ or $acc1, $acc4
2051
+ por %xmm5, %xmm2 # in1infty || in2infty
2052
+ movq $acc4, %xmm3
2053
+
2054
+ `&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
2055
+ lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2
2056
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr);
2057
+
2058
+ `&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
2059
+ lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2
2060
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr);
2061
+
2062
+ lea $U1(%rsp), $b_ptr
2063
+ lea $H(%rsp), $r_ptr # H = U2 - U1
2064
+ call __ecp_nistz256_sub_from$x # p256_sub(H, U2, U1);
2065
+
2066
+ or $acc5, $acc4 # see if result is zero
2067
+ or $acc0, $acc4
2068
+ or $acc1, $acc4
2069
+
2070
+ .byte 0x3e # predict taken
2071
+ jnz .Ladd_proceed$x # is_equal(U1,U2)?
2072
+ movq %xmm2, $acc0
2073
+ movq %xmm3, $acc1
2074
+ test $acc0, $acc0
2075
+ jnz .Ladd_proceed$x # (in1infty || in2infty)?
2076
+ test $acc1, $acc1
2077
+ jz .Ladd_proceed$x # is_equal(S1,S2)?
2078
+
2079
+ movq %xmm0, $r_ptr # restore $r_ptr
2080
+ pxor %xmm0, %xmm0
2081
+ movdqu %xmm0, 0x00($r_ptr)
2082
+ movdqu %xmm0, 0x10($r_ptr)
2083
+ movdqu %xmm0, 0x20($r_ptr)
2084
+ movdqu %xmm0, 0x30($r_ptr)
2085
+ movdqu %xmm0, 0x40($r_ptr)
2086
+ movdqu %xmm0, 0x50($r_ptr)
2087
+ jmp .Ladd_done$x
2088
+
2089
+ .align 32
2090
+ .Ladd_proceed$x:
2091
+ `&load_for_sqr("$R(%rsp)", "$src0")`
2092
+ lea $Rsqr(%rsp), $r_ptr # R^2
2093
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R);
2094
+
2095
+ `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
2096
+ lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
2097
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z);
2098
+
2099
+ `&load_for_sqr("$H(%rsp)", "$src0")`
2100
+ lea $Hsqr(%rsp), $r_ptr # H^2
2101
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H);
2102
+
2103
+ `&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
2104
+ lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
2105
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z);
2106
+
2107
+ `&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
2108
+ lea $Hcub(%rsp), $r_ptr # H^3
2109
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H);
2110
+
2111
+ `&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
2112
+ lea $U2(%rsp), $r_ptr # U1*H^2
2113
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr);
2114
+ ___
2115
+ {
2116
+ #######################################################################
2117
+ # operate in 4-5-0-1 "name space" that matches multiplication output
2118
+ #
2119
+ my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
2120
+ my ($poly1, $poly3)=($acc6,$acc7);
2121
+
2122
+ $code.=<<___;
2123
+ #lea $U2(%rsp), $a_ptr
2124
+ #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
2125
+ #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
2126
+
2127
+ add $acc0, $acc0 # a0:a3+a0:a3
2128
+ lea $Rsqr(%rsp), $a_ptr
2129
+ adc $acc1, $acc1
2130
+ mov $acc0, $t0
2131
+ adc $acc2, $acc2
2132
+ adc $acc3, $acc3
2133
+ mov $acc1, $t1
2134
+ sbb $t4, $t4
2135
+
2136
+ sub \$-1, $acc0
2137
+ mov $acc2, $t2
2138
+ sbb $poly1, $acc1
2139
+ sbb \$0, $acc2
2140
+ mov $acc3, $t3
2141
+ sbb $poly3, $acc3
2142
+ test $t4, $t4
2143
+
2144
+ cmovz $t0, $acc0
2145
+ mov 8*0($a_ptr), $t0
2146
+ cmovz $t1, $acc1
2147
+ mov 8*1($a_ptr), $t1
2148
+ cmovz $t2, $acc2
2149
+ mov 8*2($a_ptr), $t2
2150
+ cmovz $t3, $acc3
2151
+ mov 8*3($a_ptr), $t3
2152
+
2153
+ call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
2154
+
2155
+ lea $Hcub(%rsp), $b_ptr
2156
+ lea $res_x(%rsp), $r_ptr
2157
+ call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub);
2158
+
2159
+ mov $U2+8*0(%rsp), $t0
2160
+ mov $U2+8*1(%rsp), $t1
2161
+ mov $U2+8*2(%rsp), $t2
2162
+ mov $U2+8*3(%rsp), $t3
2163
+ lea $res_y(%rsp), $r_ptr
2164
+
2165
+ call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x);
2166
+
2167
+ mov $acc0, 8*0($r_ptr) # save the result, as
2168
+ mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't write it back
2169
+ mov $acc2, 8*2($r_ptr)
2170
+ mov $acc3, 8*3($r_ptr)
2171
+ ___
2172
+ }
2173
+ $code.=<<___;
2174
+ `&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
2175
+ lea $S2(%rsp), $r_ptr
2176
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1, Hcub);
2177
+
2178
+ `&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
2179
+ lea $res_y(%rsp), $r_ptr
2180
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y);
2181
+
2182
+ lea $S2(%rsp), $b_ptr
2183
+ lea $res_y(%rsp), $r_ptr
2184
+ call __ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2);
2185
+
2186
+ movq %xmm0, $r_ptr # restore $r_ptr
2187
+
2188
+ movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty);
2189
+ movdqa %xmm5, %xmm1
2190
+ pandn $res_z(%rsp), %xmm0
2191
+ movdqa %xmm5, %xmm2
2192
+ pandn $res_z+0x10(%rsp), %xmm1
2193
+ movdqa %xmm5, %xmm3
2194
+ pand $in2_z(%rsp), %xmm2
2195
+ pand $in2_z+0x10(%rsp), %xmm3
2196
+ por %xmm0, %xmm2
2197
+ por %xmm1, %xmm3
2198
+
2199
+ movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty);
2200
+ movdqa %xmm4, %xmm1
2201
+ pandn %xmm2, %xmm0
2202
+ movdqa %xmm4, %xmm2
2203
+ pandn %xmm3, %xmm1
2204
+ movdqa %xmm4, %xmm3
2205
+ pand $in1_z(%rsp), %xmm2
2206
+ pand $in1_z+0x10(%rsp), %xmm3
2207
+ por %xmm0, %xmm2
2208
+ por %xmm1, %xmm3
2209
+ movdqu %xmm2, 0x40($r_ptr)
2210
+ movdqu %xmm3, 0x50($r_ptr)
2211
+
2212
+ movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty);
2213
+ movdqa %xmm5, %xmm1
2214
+ pandn $res_x(%rsp), %xmm0
2215
+ movdqa %xmm5, %xmm2
2216
+ pandn $res_x+0x10(%rsp), %xmm1
2217
+ movdqa %xmm5, %xmm3
2218
+ pand $in2_x(%rsp), %xmm2
2219
+ pand $in2_x+0x10(%rsp), %xmm3
2220
+ por %xmm0, %xmm2
2221
+ por %xmm1, %xmm3
2222
+
2223
+ movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty);
2224
+ movdqa %xmm4, %xmm1
2225
+ pandn %xmm2, %xmm0
2226
+ movdqa %xmm4, %xmm2
2227
+ pandn %xmm3, %xmm1
2228
+ movdqa %xmm4, %xmm3
2229
+ pand $in1_x(%rsp), %xmm2
2230
+ pand $in1_x+0x10(%rsp), %xmm3
2231
+ por %xmm0, %xmm2
2232
+ por %xmm1, %xmm3
2233
+ movdqu %xmm2, 0x00($r_ptr)
2234
+ movdqu %xmm3, 0x10($r_ptr)
2235
+
2236
+ movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty);
2237
+ movdqa %xmm5, %xmm1
2238
+ pandn $res_y(%rsp), %xmm0
2239
+ movdqa %xmm5, %xmm2
2240
+ pandn $res_y+0x10(%rsp), %xmm1
2241
+ movdqa %xmm5, %xmm3
2242
+ pand $in2_y(%rsp), %xmm2
2243
+ pand $in2_y+0x10(%rsp), %xmm3
2244
+ por %xmm0, %xmm2
2245
+ por %xmm1, %xmm3
2246
+
2247
+ movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty);
2248
+ movdqa %xmm4, %xmm1
2249
+ pandn %xmm2, %xmm0
2250
+ movdqa %xmm4, %xmm2
2251
+ pandn %xmm3, %xmm1
2252
+ movdqa %xmm4, %xmm3
2253
+ pand $in1_y(%rsp), %xmm2
2254
+ pand $in1_y+0x10(%rsp), %xmm3
2255
+ por %xmm0, %xmm2
2256
+ por %xmm1, %xmm3
2257
+ movdqu %xmm2, 0x20($r_ptr)
2258
+ movdqu %xmm3, 0x30($r_ptr)
2259
+
2260
+ .Ladd_done$x:
2261
+ add \$32*18+8, %rsp
2262
+ pop %r15
2263
+ pop %r14
2264
+ pop %r13
2265
+ pop %r12
2266
+ pop %rbx
2267
+ pop %rbp
2268
+ ret
2269
+ .size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
2270
+ ___
2271
+ }
2272
+ &gen_add("q");
2273
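The call sequence in gen_add is the standard Jacobian point addition; writing the two inputs as (X1, Y1, Z1) and (X2, Y2, Z2), the intermediates named in the comments above are:

    \begin{aligned}
    U_1 &= X_1 Z_2^2, \quad U_2 = X_2 Z_1^2, \quad
    S_1 = Y_1 Z_2^3, \quad S_2 = Y_2 Z_1^3,\\
    H &= U_2 - U_1, \quad R = S_2 - S_1,\\
    X_3 &= R^2 - H^3 - 2 U_1 H^2,\\
    Y_3 &= R\,(U_1 H^2 - X_3) - S_1 H^3,\\
    Z_3 &= H Z_1 Z_2 .
    \end{aligned}

The conditional copies at the end substitute the other operand's coordinates when either input is the point at infinity.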
+
2274
+ sub gen_add_affine () {
2275
+ my $x = shift;
2276
+ my ($src0,$sfx,$bias);
2277
+ my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
2278
+ $res_x,$res_y,$res_z,
2279
+ $in1_x,$in1_y,$in1_z,
2280
+ $in2_x,$in2_y)=map(32*$_,(0..14));
2281
+ my $Z1sqr = $S2;
2282
+
2283
+ if ($x ne "x") {
2284
+ $src0 = "%rax";
2285
+ $sfx = "";
2286
+ $bias = 0;
2287
+
2288
+ $code.=<<___;
2289
+ .globl ecp_nistz256_point_add_affine
2290
+ .type ecp_nistz256_point_add_affine,\@function,3
2291
+ .align 32
2292
+ ecp_nistz256_point_add_affine:
2293
+ ___
2294
+ $code.=<<___ if ($addx);
2295
+ mov \$0x80100, %ecx
2296
+ and OPENSSL_ia32cap_P+8(%rip), %ecx
2297
+ cmp \$0x80100, %ecx
2298
+ je .Lpoint_add_affinex
2299
+ ___
2300
+ } else {
2301
+ $src0 = "%rdx";
2302
+ $sfx = "x";
2303
+ $bias = 128;
2304
+
2305
+ $code.=<<___;
2306
+ .type ecp_nistz256_point_add_affinex,\@function,3
2307
+ .align 32
2308
+ ecp_nistz256_point_add_affinex:
2309
+ .Lpoint_add_affinex:
2310
+ ___
2311
+ }
2312
+ $code.=<<___;
2313
+ push %rbp
2314
+ push %rbx
2315
+ push %r12
2316
+ push %r13
2317
+ push %r14
2318
+ push %r15
2319
+ sub \$32*15+8, %rsp
2320
+
2321
+ movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr
2322
+ mov $b_org, $b_ptr # reassign
2323
+ movdqu 0x10($a_ptr), %xmm1
2324
+ movdqu 0x20($a_ptr), %xmm2
2325
+ movdqu 0x30($a_ptr), %xmm3
2326
+ movdqu 0x40($a_ptr), %xmm4
2327
+ movdqu 0x50($a_ptr), %xmm5
2328
+ mov 0x40+8*0($a_ptr), $src0 # load original in1_z
2329
+ mov 0x40+8*1($a_ptr), $acc6
2330
+ mov 0x40+8*2($a_ptr), $acc7
2331
+ mov 0x40+8*3($a_ptr), $acc0
2332
+ movdqa %xmm0, $in1_x(%rsp)
2333
+ movdqa %xmm1, $in1_x+0x10(%rsp)
2334
+ por %xmm0, %xmm1
2335
+ movdqa %xmm2, $in1_y(%rsp)
2336
+ movdqa %xmm3, $in1_y+0x10(%rsp)
2337
+ por %xmm2, %xmm3
2338
+ movdqa %xmm4, $in1_z(%rsp)
2339
+ movdqa %xmm5, $in1_z+0x10(%rsp)
2340
+ por %xmm1, %xmm3
2341
+
2342
+ movdqu 0x00($b_ptr), %xmm0 # copy *(P256_POINT_AFFINE *)$b_ptr
2343
+ pshufd \$0xb1, %xmm3, %xmm5
2344
+ movdqu 0x10($b_ptr), %xmm1
2345
+ movdqu 0x20($b_ptr), %xmm2
2346
+ por %xmm3, %xmm5
2347
+ movdqu 0x30($b_ptr), %xmm3
2348
+ movdqa %xmm0, $in2_x(%rsp)
2349
+ pshufd \$0x1e, %xmm5, %xmm4
2350
+ movdqa %xmm1, $in2_x+0x10(%rsp)
2351
+ por %xmm0, %xmm1
2352
+ movq $r_ptr, %xmm0 # save $r_ptr
2353
+ movdqa %xmm2, $in2_y(%rsp)
2354
+ movdqa %xmm3, $in2_y+0x10(%rsp)
2355
+ por %xmm2, %xmm3
2356
+ por %xmm4, %xmm5
2357
+ pxor %xmm4, %xmm4
2358
+ por %xmm1, %xmm3
2359
+
2360
+ lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid
2361
+ lea $Z1sqr(%rsp), $r_ptr # Z1^2
2362
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z);
2363
+
2364
+ pcmpeqd %xmm4, %xmm5
2365
+ pshufd \$0xb1, %xmm3, %xmm4
2366
+ mov 0x00($b_ptr), $src0 # $b_ptr is still valid
2367
+ #lea 0x00($b_ptr), $b_ptr
2368
+ mov $acc4, $acc1 # harmonize sqr output and mul input
2369
+ por %xmm3, %xmm4
2370
+ pshufd \$0, %xmm5, %xmm5 # in1infty
2371
+ pshufd \$0x1e, %xmm4, %xmm3
2372
+ mov $acc5, $acc2
2373
+ por %xmm3, %xmm4
2374
+ pxor %xmm3, %xmm3
2375
+ mov $acc6, $acc3
2376
+ pcmpeqd %xmm3, %xmm4
2377
+ pshufd \$0, %xmm4, %xmm4 # in2infty
2378
+
2379
+ lea $Z1sqr-$bias(%rsp), $a_ptr
2380
+ mov $acc7, $acc4
2381
+ lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2
2382
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, Z1sqr, in2_x);
2383
+
2384
+ lea $in1_x(%rsp), $b_ptr
2385
+ lea $H(%rsp), $r_ptr # H = U2 - U1
2386
+ call __ecp_nistz256_sub_from$x # p256_sub(H, U2, in1_x);
2387
+
2388
+ `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
2389
+ lea $S2(%rsp), $r_ptr # S2 = Z1^3
2390
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z);
2391
+
2392
+ `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
2393
+ lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
2394
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z);
2395
+
2396
+ `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
2397
+ lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3
2398
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y);
2399
+
2400
+ lea $in1_y(%rsp), $b_ptr
2401
+ lea $R(%rsp), $r_ptr # R = S2 - S1
2402
+ call __ecp_nistz256_sub_from$x # p256_sub(R, S2, in1_y);
2403
+
2404
+ `&load_for_sqr("$H(%rsp)", "$src0")`
2405
+ lea $Hsqr(%rsp), $r_ptr # H^2
2406
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H);
2407
+
2408
+ `&load_for_sqr("$R(%rsp)", "$src0")`
2409
+ lea $Rsqr(%rsp), $r_ptr # R^2
2410
+ call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R);
2411
+
2412
+ `&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
2413
+ lea $Hcub(%rsp), $r_ptr # H^3
2414
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H);
2415
+
2416
+ `&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
2417
+ lea $U2(%rsp), $r_ptr # U1*H^2
2418
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in1_x, Hsqr);
2419
+ ___
2420
+ {
2421
+ #######################################################################
2422
+ # operate in 4-5-0-1 "name space" that matches multiplication output
2423
+ #
2424
+ my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
2425
+ my ($poly1, $poly3)=($acc6,$acc7);
2426
+
2427
+ $code.=<<___;
2428
+ #lea $U2(%rsp), $a_ptr
2429
+ #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
2430
+ #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
2431
+
2432
+ add $acc0, $acc0 # a0:a3+a0:a3
2433
+ lea $Rsqr(%rsp), $a_ptr
2434
+ adc $acc1, $acc1
2435
+ mov $acc0, $t0
2436
+ adc $acc2, $acc2
2437
+ adc $acc3, $acc3
2438
+ mov $acc1, $t1
2439
+ sbb $t4, $t4
2440
+
2441
+ sub \$-1, $acc0
2442
+ mov $acc2, $t2
2443
+ sbb $poly1, $acc1
2444
+ sbb \$0, $acc2
2445
+ mov $acc3, $t3
2446
+ sbb $poly3, $acc3
2447
+ test $t4, $t4
2448
+
2449
+ cmovz $t0, $acc0
2450
+ mov 8*0($a_ptr), $t0
2451
+ cmovz $t1, $acc1
2452
+ mov 8*1($a_ptr), $t1
2453
+ cmovz $t2, $acc2
2454
+ mov 8*2($a_ptr), $t2
2455
+ cmovz $t3, $acc3
2456
+ mov 8*3($a_ptr), $t3
2457
+
2458
+ call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
2459
+
2460
+ lea $Hcub(%rsp), $b_ptr
2461
+ lea $res_x(%rsp), $r_ptr
2462
+ call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub);
2463
+
2464
+ mov $U2+8*0(%rsp), $t0
2465
+ mov $U2+8*1(%rsp), $t1
2466
+ mov $U2+8*2(%rsp), $t2
2467
+ mov $U2+8*3(%rsp), $t3
2468
+ lea $H(%rsp), $r_ptr
2469
+
2470
+ call __ecp_nistz256_sub$x # p256_sub(H, U2, res_x);
2471
+
2472
+ mov $acc0, 8*0($r_ptr) # save the result, as
2473
+ mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't write it back
2474
+ mov $acc2, 8*2($r_ptr)
2475
+ mov $acc3, 8*3($r_ptr)
2476
+ ___
2477
+ }
2478
+ $code.=<<___;
2479
+ `&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
2480
+ lea $S2(%rsp), $r_ptr
2481
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Hcub, in1_y);
2482
+
2483
+ `&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
2484
+ lea $H(%rsp), $r_ptr
2485
+ call __ecp_nistz256_mul_mont$x # p256_mul_mont(H, H, R);
2486
+
2487
+ lea $S2(%rsp), $b_ptr
2488
+ lea $res_y(%rsp), $r_ptr
2489
+ call __ecp_nistz256_sub_from$x # p256_sub(res_y, H, S2);
2490
+
2491
+ movq %xmm0, $r_ptr # restore $r_ptr
2492
+
2493
+ movdqa %xmm5, %xmm0 # copy_conditional(res_z, ONE, in1infty);
2494
+ movdqa %xmm5, %xmm1
2495
+ pandn $res_z(%rsp), %xmm0
2496
+ movdqa %xmm5, %xmm2
2497
+ pandn $res_z+0x10(%rsp), %xmm1
2498
+ movdqa %xmm5, %xmm3
2499
+ pand .LONE_mont(%rip), %xmm2
2500
+ pand .LONE_mont+0x10(%rip), %xmm3
2501
+ por %xmm0, %xmm2
2502
+ por %xmm1, %xmm3
2503
+
2504
+ movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty);
2505
+ movdqa %xmm4, %xmm1
2506
+ pandn %xmm2, %xmm0
2507
+ movdqa %xmm4, %xmm2
2508
+ pandn %xmm3, %xmm1
2509
+ movdqa %xmm4, %xmm3
2510
+ pand $in1_z(%rsp), %xmm2
2511
+ pand $in1_z+0x10(%rsp), %xmm3
2512
+ por %xmm0, %xmm2
2513
+ por %xmm1, %xmm3
2514
+ movdqu %xmm2, 0x40($r_ptr)
2515
+ movdqu %xmm3, 0x50($r_ptr)
2516
+
2517
+ movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty);
2518
+ movdqa %xmm5, %xmm1
2519
+ pandn $res_x(%rsp), %xmm0
2520
+ movdqa %xmm5, %xmm2
2521
+ pandn $res_x+0x10(%rsp), %xmm1
2522
+ movdqa %xmm5, %xmm3
2523
+ pand $in2_x(%rsp), %xmm2
2524
+ pand $in2_x+0x10(%rsp), %xmm3
2525
+ por %xmm0, %xmm2
2526
+ por %xmm1, %xmm3
2527
+
2528
+ movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty);
2529
+ movdqa %xmm4, %xmm1
2530
+ pandn %xmm2, %xmm0
2531
+ movdqa %xmm4, %xmm2
2532
+ pandn %xmm3, %xmm1
2533
+ movdqa %xmm4, %xmm3
2534
+ pand $in1_x(%rsp), %xmm2
2535
+ pand $in1_x+0x10(%rsp), %xmm3
2536
+ por %xmm0, %xmm2
2537
+ por %xmm1, %xmm3
2538
+ movdqu %xmm2, 0x00($r_ptr)
2539
+ movdqu %xmm3, 0x10($r_ptr)
2540
+
2541
+ movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty);
2542
+ movdqa %xmm5, %xmm1
2543
+ pandn $res_y(%rsp), %xmm0
2544
+ movdqa %xmm5, %xmm2
2545
+ pandn $res_y+0x10(%rsp), %xmm1
2546
+ movdqa %xmm5, %xmm3
2547
+ pand $in2_y(%rsp), %xmm2
2548
+ pand $in2_y+0x10(%rsp), %xmm3
2549
+ por %xmm0, %xmm2
2550
+ por %xmm1, %xmm3
2551
+
2552
+ movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty);
2553
+ movdqa %xmm4, %xmm1
2554
+ pandn %xmm2, %xmm0
2555
+ movdqa %xmm4, %xmm2
2556
+ pandn %xmm3, %xmm1
2557
+ movdqa %xmm4, %xmm3
2558
+ pand $in1_y(%rsp), %xmm2
2559
+ pand $in1_y+0x10(%rsp), %xmm3
2560
+ por %xmm0, %xmm2
2561
+ por %xmm1, %xmm3
2562
+ movdqu %xmm2, 0x20($r_ptr)
2563
+ movdqu %xmm3, 0x30($r_ptr)
2564
+
2565
+ add \$32*15+8, %rsp
2566
+ pop %r15
2567
+ pop %r14
2568
+ pop %r13
2569
+ pop %r12
2570
+ pop %rbx
2571
+ pop %rbp
2572
+ ret
2573
+ .size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
2574
+ ___
2575
+ }
2576
+ &gen_add_affine("q");
2577
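gen_add_affine specializes the same addition to a second operand given in affine form (Z2 = 1), which saves the Z2-related squarings and multiplications:

    \begin{aligned}
    U_2 &= X_2 Z_1^2, \quad S_2 = Y_2 Z_1^3, \quad
    H = U_2 - X_1, \quad R = S_2 - Y_1,\\
    X_3 &= R^2 - H^3 - 2 X_1 H^2,\\
    Y_3 &= R\,(X_1 H^2 - X_3) - Y_1 H^3,\\
    Z_3 &= H Z_1 .
    \end{aligned}

When the Jacobian input is the point at infinity, the result's Z coordinate is taken from .LONE_mont instead, as the conditional copies above show.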
+
2578
+ ########################################################################
2579
+ # AD*X magic
2580
+ #
2581
+ if ($addx) { {
2582
+ ########################################################################
2583
+ # operate in 4-5-0-1 "name space" that matches multiplication output
2584
+ #
2585
+ my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
2586
+
2587
+ $code.=<<___;
2588
+ .type __ecp_nistz256_add_tox,\@abi-omnipotent
2589
+ .align 32
2590
+ __ecp_nistz256_add_tox:
2591
+ xor $t4, $t4
2592
+ adc 8*0($b_ptr), $a0
2593
+ adc 8*1($b_ptr), $a1
2594
+ mov $a0, $t0
2595
+ adc 8*2($b_ptr), $a2
2596
+ adc 8*3($b_ptr), $a3
2597
+ mov $a1, $t1
2598
+ adc \$0, $t4
2599
+
2600
+ xor $t3, $t3
2601
+ sbb \$-1, $a0
2602
+ mov $a2, $t2
2603
+ sbb $poly1, $a1
2604
+ sbb \$0, $a2
2605
+ mov $a3, $t3
2606
+ sbb $poly3, $a3
2607
+
2608
+ bt \$0, $t4
2609
+ cmovnc $t0, $a0
2610
+ cmovnc $t1, $a1
2611
+ mov $a0, 8*0($r_ptr)
2612
+ cmovnc $t2, $a2
2613
+ mov $a1, 8*1($r_ptr)
2614
+ cmovnc $t3, $a3
2615
+ mov $a2, 8*2($r_ptr)
2616
+ mov $a3, 8*3($r_ptr)
2617
+
2618
+ ret
2619
+ .size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
2620
+
2621
+ .type __ecp_nistz256_sub_fromx,\@abi-omnipotent
2622
+ .align 32
2623
+ __ecp_nistz256_sub_fromx:
2624
+ xor $t4, $t4
2625
+ sbb 8*0($b_ptr), $a0
2626
+ sbb 8*1($b_ptr), $a1
2627
+ mov $a0, $t0
2628
+ sbb 8*2($b_ptr), $a2
2629
+ sbb 8*3($b_ptr), $a3
2630
+ mov $a1, $t1
2631
+ sbb \$0, $t4
2632
+
2633
+ xor $t3, $t3
2634
+ adc \$-1, $a0
2635
+ mov $a2, $t2
2636
+ adc $poly1, $a1
2637
+ adc \$0, $a2
2638
+ mov $a3, $t3
2639
+ adc $poly3, $a3
2640
+
2641
+ bt \$0, $t4
2642
+ cmovnc $t0, $a0
2643
+ cmovnc $t1, $a1
2644
+ mov $a0, 8*0($r_ptr)
2645
+ cmovnc $t2, $a2
2646
+ mov $a1, 8*1($r_ptr)
2647
+ cmovnc $t3, $a3
2648
+ mov $a2, 8*2($r_ptr)
2649
+ mov $a3, 8*3($r_ptr)
2650
+
2651
+ ret
2652
+ .size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
2653
+
2654
+ .type __ecp_nistz256_subx,\@abi-omnipotent
2655
+ .align 32
2656
+ __ecp_nistz256_subx:
2657
+ xor $t4, $t4
2658
+ sbb $a0, $t0
2659
+ sbb $a1, $t1
2660
+ mov $t0, $a0
2661
+ sbb $a2, $t2
2662
+ sbb $a3, $t3
2663
+ mov $t1, $a1
2664
+ sbb \$0, $t4
2665
+
2666
+ xor $a3, $a3
2667
+ adc \$-1, $t0
2668
+ mov $t2, $a2
2669
+ adc $poly1, $t1
2670
+ adc \$0, $t2
2671
+ mov $t3, $a3
2672
+ adc $poly3, $t3
2673
+
2674
+ bt \$0, $t4
2675
+ cmovc $t0, $a0
2676
+ cmovc $t1, $a1
2677
+ cmovc $t2, $a2
2678
+ cmovc $t3, $a3
2679
+
2680
+ ret
2681
+ .size __ecp_nistz256_subx,.-__ecp_nistz256_subx
2682
+
2683
+ .type __ecp_nistz256_mul_by_2x,\@abi-omnipotent
2684
+ .align 32
2685
+ __ecp_nistz256_mul_by_2x:
2686
+ xor $t4, $t4
2687
+ adc $a0, $a0 # a0:a3+a0:a3
2688
+ adc $a1, $a1
2689
+ mov $a0, $t0
2690
+ adc $a2, $a2
2691
+ adc $a3, $a3
2692
+ mov $a1, $t1
2693
+ adc \$0, $t4
2694
+
2695
+ xor $t3, $t3
2696
+ sbb \$-1, $a0
2697
+ mov $a2, $t2
2698
+ sbb $poly1, $a1
2699
+ sbb \$0, $a2
2700
+ mov $a3, $t3
2701
+ sbb $poly3, $a3
2702
+
2703
+ bt \$0, $t4
2704
+ cmovnc $t0, $a0
2705
+ cmovnc $t1, $a1
2706
+ mov $a0, 8*0($r_ptr)
2707
+ cmovnc $t2, $a2
2708
+ mov $a1, 8*1($r_ptr)
2709
+ cmovnc $t3, $a3
2710
+ mov $a2, 8*2($r_ptr)
2711
+ mov $a3, 8*3($r_ptr)
2712
+
2713
+ ret
2714
+ .size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
2715
+ ___
2716
+ }
2717
+ &gen_double("x");
2718
+ &gen_add("x");
2719
+ &gen_add_affine("x");
2720
+ }
2721
+ }}}
2722
+
2723
+ $code =~ s/\`([^\`]*)\`/eval $1/gem;
2724
+ print $code;
2725
+ close STDOUT;