ring-native 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (261)
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/Gemfile +3 -0
  4. data/README.md +22 -0
  5. data/Rakefile +1 -0
  6. data/ext/ring/extconf.rb +29 -0
  7. data/lib/ring/native.rb +8 -0
  8. data/lib/ring/native/version.rb +5 -0
  9. data/ring-native.gemspec +25 -0
  10. data/vendor/ring/BUILDING.md +40 -0
  11. data/vendor/ring/Cargo.toml +43 -0
  12. data/vendor/ring/LICENSE +185 -0
  13. data/vendor/ring/Makefile +35 -0
  14. data/vendor/ring/PORTING.md +163 -0
  15. data/vendor/ring/README.md +113 -0
  16. data/vendor/ring/STYLE.md +197 -0
  17. data/vendor/ring/appveyor.yml +27 -0
  18. data/vendor/ring/build.rs +108 -0
  19. data/vendor/ring/crypto/aes/aes.c +1142 -0
  20. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +25 -0
  21. data/vendor/ring/crypto/aes/aes_test.cc +93 -0
  22. data/vendor/ring/crypto/aes/asm/aes-586.pl +2368 -0
  23. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +1249 -0
  24. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +2246 -0
  25. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +1318 -0
  26. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +2084 -0
  27. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +675 -0
  28. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +1364 -0
  29. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +1565 -0
  30. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +841 -0
  31. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +1116 -0
  32. data/vendor/ring/crypto/aes/internal.h +87 -0
  33. data/vendor/ring/crypto/aes/mode_wrappers.c +61 -0
  34. data/vendor/ring/crypto/bn/add.c +394 -0
  35. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +694 -0
  36. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +1503 -0
  37. data/vendor/ring/crypto/bn/asm/bn-586.pl +774 -0
  38. data/vendor/ring/crypto/bn/asm/co-586.pl +287 -0
  39. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +1882 -0
  40. data/vendor/ring/crypto/bn/asm/x86-mont.pl +592 -0
  41. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +599 -0
  42. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +1393 -0
  43. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +3507 -0
  44. data/vendor/ring/crypto/bn/bn.c +352 -0
  45. data/vendor/ring/crypto/bn/bn_asn1.c +74 -0
  46. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +25 -0
  47. data/vendor/ring/crypto/bn/bn_test.cc +1696 -0
  48. data/vendor/ring/crypto/bn/cmp.c +200 -0
  49. data/vendor/ring/crypto/bn/convert.c +433 -0
  50. data/vendor/ring/crypto/bn/ctx.c +311 -0
  51. data/vendor/ring/crypto/bn/div.c +594 -0
  52. data/vendor/ring/crypto/bn/exponentiation.c +1335 -0
  53. data/vendor/ring/crypto/bn/gcd.c +711 -0
  54. data/vendor/ring/crypto/bn/generic.c +1019 -0
  55. data/vendor/ring/crypto/bn/internal.h +316 -0
  56. data/vendor/ring/crypto/bn/montgomery.c +516 -0
  57. data/vendor/ring/crypto/bn/mul.c +888 -0
  58. data/vendor/ring/crypto/bn/prime.c +829 -0
  59. data/vendor/ring/crypto/bn/random.c +334 -0
  60. data/vendor/ring/crypto/bn/rsaz_exp.c +262 -0
  61. data/vendor/ring/crypto/bn/rsaz_exp.h +53 -0
  62. data/vendor/ring/crypto/bn/shift.c +276 -0
  63. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +25 -0
  64. data/vendor/ring/crypto/bytestring/bytestring_test.cc +421 -0
  65. data/vendor/ring/crypto/bytestring/cbb.c +399 -0
  66. data/vendor/ring/crypto/bytestring/cbs.c +227 -0
  67. data/vendor/ring/crypto/bytestring/internal.h +46 -0
  68. data/vendor/ring/crypto/chacha/chacha_generic.c +140 -0
  69. data/vendor/ring/crypto/chacha/chacha_vec.c +323 -0
  70. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +1447 -0
  71. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +153 -0
  72. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +25 -0
  73. data/vendor/ring/crypto/cipher/e_aes.c +390 -0
  74. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +208 -0
  75. data/vendor/ring/crypto/cipher/internal.h +173 -0
  76. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +543 -0
  77. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +9 -0
  78. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +475 -0
  79. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +23 -0
  80. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +422 -0
  81. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +484 -0
  82. data/vendor/ring/crypto/cipher/test/cipher_test.txt +100 -0
  83. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +25 -0
  84. data/vendor/ring/crypto/constant_time_test.c +304 -0
  85. data/vendor/ring/crypto/cpu-arm-asm.S +32 -0
  86. data/vendor/ring/crypto/cpu-arm.c +199 -0
  87. data/vendor/ring/crypto/cpu-intel.c +261 -0
  88. data/vendor/ring/crypto/crypto.c +151 -0
  89. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +2118 -0
  90. data/vendor/ring/crypto/curve25519/curve25519.c +4888 -0
  91. data/vendor/ring/crypto/curve25519/x25519_test.cc +128 -0
  92. data/vendor/ring/crypto/digest/md32_common.h +181 -0
  93. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +2725 -0
  94. data/vendor/ring/crypto/ec/ec.c +193 -0
  95. data/vendor/ring/crypto/ec/ec_curves.c +61 -0
  96. data/vendor/ring/crypto/ec/ec_key.c +228 -0
  97. data/vendor/ring/crypto/ec/ec_montgomery.c +114 -0
  98. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +25 -0
  99. data/vendor/ring/crypto/ec/internal.h +243 -0
  100. data/vendor/ring/crypto/ec/oct.c +253 -0
  101. data/vendor/ring/crypto/ec/p256-64.c +1794 -0
  102. data/vendor/ring/crypto/ec/p256-x86_64-table.h +9548 -0
  103. data/vendor/ring/crypto/ec/p256-x86_64.c +509 -0
  104. data/vendor/ring/crypto/ec/simple.c +1007 -0
  105. data/vendor/ring/crypto/ec/util-64.c +183 -0
  106. data/vendor/ring/crypto/ec/wnaf.c +508 -0
  107. data/vendor/ring/crypto/ecdh/ecdh.c +155 -0
  108. data/vendor/ring/crypto/ecdsa/ecdsa.c +304 -0
  109. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +193 -0
  110. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +25 -0
  111. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +327 -0
  112. data/vendor/ring/crypto/header_removed.h +17 -0
  113. data/vendor/ring/crypto/internal.h +495 -0
  114. data/vendor/ring/crypto/libring.Windows.vcxproj +101 -0
  115. data/vendor/ring/crypto/mem.c +98 -0
  116. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +1045 -0
  117. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +517 -0
  118. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +1393 -0
  119. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +1741 -0
  120. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +422 -0
  121. data/vendor/ring/crypto/modes/ctr.c +226 -0
  122. data/vendor/ring/crypto/modes/gcm.c +1206 -0
  123. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +25 -0
  124. data/vendor/ring/crypto/modes/gcm_test.c +348 -0
  125. data/vendor/ring/crypto/modes/internal.h +299 -0
  126. data/vendor/ring/crypto/perlasm/arm-xlate.pl +170 -0
  127. data/vendor/ring/crypto/perlasm/readme +100 -0
  128. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +1164 -0
  129. data/vendor/ring/crypto/perlasm/x86asm.pl +292 -0
  130. data/vendor/ring/crypto/perlasm/x86gas.pl +263 -0
  131. data/vendor/ring/crypto/perlasm/x86masm.pl +200 -0
  132. data/vendor/ring/crypto/perlasm/x86nasm.pl +187 -0
  133. data/vendor/ring/crypto/poly1305/poly1305.c +331 -0
  134. data/vendor/ring/crypto/poly1305/poly1305_arm.c +301 -0
  135. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +2015 -0
  136. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +25 -0
  137. data/vendor/ring/crypto/poly1305/poly1305_test.cc +80 -0
  138. data/vendor/ring/crypto/poly1305/poly1305_test.txt +52 -0
  139. data/vendor/ring/crypto/poly1305/poly1305_vec.c +892 -0
  140. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +75 -0
  141. data/vendor/ring/crypto/rand/internal.h +32 -0
  142. data/vendor/ring/crypto/rand/rand.c +189 -0
  143. data/vendor/ring/crypto/rand/urandom.c +219 -0
  144. data/vendor/ring/crypto/rand/windows.c +56 -0
  145. data/vendor/ring/crypto/refcount_c11.c +66 -0
  146. data/vendor/ring/crypto/refcount_lock.c +53 -0
  147. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +25 -0
  148. data/vendor/ring/crypto/refcount_test.c +58 -0
  149. data/vendor/ring/crypto/rsa/blinding.c +462 -0
  150. data/vendor/ring/crypto/rsa/internal.h +108 -0
  151. data/vendor/ring/crypto/rsa/padding.c +300 -0
  152. data/vendor/ring/crypto/rsa/rsa.c +450 -0
  153. data/vendor/ring/crypto/rsa/rsa_asn1.c +261 -0
  154. data/vendor/ring/crypto/rsa/rsa_impl.c +944 -0
  155. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +25 -0
  156. data/vendor/ring/crypto/rsa/rsa_test.cc +437 -0
  157. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +436 -0
  158. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +2390 -0
  159. data/vendor/ring/crypto/sha/asm/sha256-586.pl +1275 -0
  160. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +735 -0
  161. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +14 -0
  162. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +14 -0
  163. data/vendor/ring/crypto/sha/asm/sha512-586.pl +911 -0
  164. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +666 -0
  165. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +14 -0
  166. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +14 -0
  167. data/vendor/ring/crypto/sha/sha1.c +271 -0
  168. data/vendor/ring/crypto/sha/sha256.c +204 -0
  169. data/vendor/ring/crypto/sha/sha512.c +355 -0
  170. data/vendor/ring/crypto/test/file_test.cc +326 -0
  171. data/vendor/ring/crypto/test/file_test.h +181 -0
  172. data/vendor/ring/crypto/test/malloc.cc +150 -0
  173. data/vendor/ring/crypto/test/scoped_types.h +95 -0
  174. data/vendor/ring/crypto/test/test.Windows.vcxproj +35 -0
  175. data/vendor/ring/crypto/test/test_util.cc +46 -0
  176. data/vendor/ring/crypto/test/test_util.h +41 -0
  177. data/vendor/ring/crypto/thread_none.c +55 -0
  178. data/vendor/ring/crypto/thread_pthread.c +165 -0
  179. data/vendor/ring/crypto/thread_test.Windows.vcxproj +25 -0
  180. data/vendor/ring/crypto/thread_test.c +200 -0
  181. data/vendor/ring/crypto/thread_win.c +282 -0
  182. data/vendor/ring/examples/checkdigest.rs +103 -0
  183. data/vendor/ring/include/openssl/aes.h +121 -0
  184. data/vendor/ring/include/openssl/arm_arch.h +129 -0
  185. data/vendor/ring/include/openssl/base.h +156 -0
  186. data/vendor/ring/include/openssl/bn.h +794 -0
  187. data/vendor/ring/include/openssl/buffer.h +18 -0
  188. data/vendor/ring/include/openssl/bytestring.h +235 -0
  189. data/vendor/ring/include/openssl/chacha.h +37 -0
  190. data/vendor/ring/include/openssl/cmac.h +76 -0
  191. data/vendor/ring/include/openssl/cpu.h +184 -0
  192. data/vendor/ring/include/openssl/crypto.h +43 -0
  193. data/vendor/ring/include/openssl/curve25519.h +88 -0
  194. data/vendor/ring/include/openssl/ec.h +225 -0
  195. data/vendor/ring/include/openssl/ec_key.h +129 -0
  196. data/vendor/ring/include/openssl/ecdh.h +110 -0
  197. data/vendor/ring/include/openssl/ecdsa.h +156 -0
  198. data/vendor/ring/include/openssl/err.h +201 -0
  199. data/vendor/ring/include/openssl/mem.h +101 -0
  200. data/vendor/ring/include/openssl/obj_mac.h +71 -0
  201. data/vendor/ring/include/openssl/opensslfeatures.h +68 -0
  202. data/vendor/ring/include/openssl/opensslv.h +18 -0
  203. data/vendor/ring/include/openssl/ossl_typ.h +18 -0
  204. data/vendor/ring/include/openssl/poly1305.h +51 -0
  205. data/vendor/ring/include/openssl/rand.h +70 -0
  206. data/vendor/ring/include/openssl/rsa.h +399 -0
  207. data/vendor/ring/include/openssl/thread.h +133 -0
  208. data/vendor/ring/include/openssl/type_check.h +71 -0
  209. data/vendor/ring/mk/Common.props +63 -0
  210. data/vendor/ring/mk/Windows.props +42 -0
  211. data/vendor/ring/mk/WindowsTest.props +18 -0
  212. data/vendor/ring/mk/appveyor.bat +62 -0
  213. data/vendor/ring/mk/bottom_of_makefile.mk +54 -0
  214. data/vendor/ring/mk/ring.mk +266 -0
  215. data/vendor/ring/mk/top_of_makefile.mk +214 -0
  216. data/vendor/ring/mk/travis.sh +40 -0
  217. data/vendor/ring/mk/update-travis-yml.py +229 -0
  218. data/vendor/ring/ring.sln +153 -0
  219. data/vendor/ring/src/aead.rs +682 -0
  220. data/vendor/ring/src/agreement.rs +248 -0
  221. data/vendor/ring/src/c.rs +129 -0
  222. data/vendor/ring/src/constant_time.rs +37 -0
  223. data/vendor/ring/src/der.rs +96 -0
  224. data/vendor/ring/src/digest.rs +690 -0
  225. data/vendor/ring/src/digest_tests.txt +57 -0
  226. data/vendor/ring/src/ecc.rs +28 -0
  227. data/vendor/ring/src/ecc_build.rs +279 -0
  228. data/vendor/ring/src/ecc_curves.rs +117 -0
  229. data/vendor/ring/src/ed25519_tests.txt +2579 -0
  230. data/vendor/ring/src/exe_tests.rs +46 -0
  231. data/vendor/ring/src/ffi.rs +29 -0
  232. data/vendor/ring/src/file_test.rs +187 -0
  233. data/vendor/ring/src/hkdf.rs +153 -0
  234. data/vendor/ring/src/hkdf_tests.txt +59 -0
  235. data/vendor/ring/src/hmac.rs +414 -0
  236. data/vendor/ring/src/hmac_tests.txt +97 -0
  237. data/vendor/ring/src/input.rs +312 -0
  238. data/vendor/ring/src/lib.rs +41 -0
  239. data/vendor/ring/src/pbkdf2.rs +265 -0
  240. data/vendor/ring/src/pbkdf2_tests.txt +113 -0
  241. data/vendor/ring/src/polyfill.rs +57 -0
  242. data/vendor/ring/src/rand.rs +28 -0
  243. data/vendor/ring/src/signature.rs +314 -0
  244. data/vendor/ring/third-party/NIST/README.md +9 -0
  245. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +263 -0
  246. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +309 -0
  247. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +267 -0
  248. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +263 -0
  249. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +309 -0
  250. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +267 -0
  251. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +263 -0
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +309 -0
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +267 -0
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +519 -0
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +309 -0
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +523 -0
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +519 -0
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +309 -0
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +523 -0
  260. data/vendor/ring/third-party/NIST/sha256sums.txt +1 -0
  261. metadata +333 -0
@@ -0,0 +1,1503 @@
1
+ #!/usr/bin/env perl
2
+
3
+ # ====================================================================
4
+ # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5
+ # project. The module is, however, dual licensed under OpenSSL and
6
+ # CRYPTOGAMS licenses depending on where you obtain it. For further
7
+ # details see http://www.openssl.org/~appro/cryptogams/.
8
+ # ====================================================================
9
+
10
+ # March 2015
11
+ #
12
+ # "Teaser" Montgomery multiplication module for ARMv8. Needs more
13
+ # work. While it does improve RSA sign performance by 20-30% (less for
14
+ # longer keys) on most processors, for some reason RSA2048 is not
15
+ # faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
16
+ # instruction issue rate is limited on processor in question, meaning
17
+ # that dedicated squaring procedure is a must. Well, actually all
18
+ # contemporary AArch64 processors seem to have limited multiplication
19
+ # issue rate, i.e. they can't issue multiplication every cycle, which
20
+ # explains moderate improvement coefficients in comparison to
21
+ # compiler-generated code. Recall that compiler is instructed to use
22
+ # umulh and therefore uses same amount of multiplication instructions
23
+ # to do the job. Assembly's edge is to minimize number of "collateral"
24
+ # instructions and of course instruction scheduling.
25
+ #
26
+ # April 2015
27
+ #
28
+ # Squaring procedure that handles lengths divisible by 8 improves
29
+ # RSA/DSA performance by 25-40-60% depending on processor and key
30
+ # length. Overall improvement coefficients are always positive in
31
+ # comparison to compiler-generated code. On Cortex-A57 improvement
32
+ # is still modest on longest key lengths, while others exhibit e.g.
33
+ # 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
34
+ # on Cortex-A57 and ~60-100% faster on others.
35
+
36
+ $flavour = shift;
37
+ $output = shift;
38
+
39
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
40
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
41
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
42
+ die "can't locate arm-xlate.pl";
43
+
44
+ open OUT,"| \"$^X\" $xlate $flavour $output";
45
+ *STDOUT=*OUT;
46
+
47
+ ($lo0,$hi0,$aj,$m0,$alo,$ahi,
48
+ $lo1,$hi1,$nj,$m1,$nlo,$nhi,
49
+ $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);
50
+
51
+ # int bn_mul_mont(
52
+ $rp="x0"; # BN_ULONG *rp,
53
+ $ap="x1"; # const BN_ULONG *ap,
54
+ $bp="x2"; # const BN_ULONG *bp,
55
+ $np="x3"; # const BN_ULONG *np,
56
+ $n0="x4"; # const BN_ULONG *n0,
57
+ $num="x5"; # int num);
58
+
59
+ $code.=<<___;
60
+ .text
61
+
62
+ .globl bn_mul_mont
63
+ .type bn_mul_mont,%function
64
+ .align 5
65
+ bn_mul_mont:
66
+ tst $num,#7
67
+ b.eq __bn_sqr8x_mont
68
+ tst $num,#3
69
+ b.eq __bn_mul4x_mont
70
+ .Lmul_mont:
71
+ stp x29,x30,[sp,#-64]!
72
+ add x29,sp,#0
73
+ stp x19,x20,[sp,#16]
74
+ stp x21,x22,[sp,#32]
75
+ stp x23,x24,[sp,#48]
76
+
77
+ ldr $m0,[$bp],#8 // bp[0]
78
+ sub $tp,sp,$num,lsl#3
79
+ ldp $hi0,$aj,[$ap],#16 // ap[0..1]
80
+ lsl $num,$num,#3
81
+ ldr $n0,[$n0] // *n0
82
+ and $tp,$tp,#-16 // ABI says so
83
+ ldp $hi1,$nj,[$np],#16 // np[0..1]
84
+
85
+ mul $lo0,$hi0,$m0 // ap[0]*bp[0]
86
+ sub $j,$num,#16 // j=num-2
87
+ umulh $hi0,$hi0,$m0
88
+ mul $alo,$aj,$m0 // ap[1]*bp[0]
89
+ umulh $ahi,$aj,$m0
90
+
91
+ mul $m1,$lo0,$n0 // "tp[0]"*n0
92
+ mov sp,$tp // alloca
93
+
94
+ // (*) mul $lo1,$hi1,$m1 // np[0]*m1
95
+ umulh $hi1,$hi1,$m1
96
+ mul $nlo,$nj,$m1 // np[1]*m1
97
+ // (*) adds $lo1,$lo1,$lo0 // discarded
98
+ // (*) As for removal of first multiplication and addition
99
+ // instructions. The outcome of first addition is
100
+ // guaranteed to be zero, which leaves two computationally
101
+ // significant outcomes: it either carries or not. Then
102
+ // question is when does it carry? Is there alternative
103
+ // way to deduce it? If you follow operations, you can
104
+ // observe that condition for carry is quite simple:
105
+ // $lo0 being non-zero. So that carry can be calculated
106
+ // by adding -1 to $lo0. That's what next instruction does.
107
+ subs xzr,$lo0,#1 // (*)
108
+ umulh $nhi,$nj,$m1
109
+ adc $hi1,$hi1,xzr
110
+ cbz $j,.L1st_skip
111
+
112
+ .L1st:
113
+ ldr $aj,[$ap],#8
114
+ adds $lo0,$alo,$hi0
115
+ sub $j,$j,#8 // j--
116
+ adc $hi0,$ahi,xzr
117
+
118
+ ldr $nj,[$np],#8
119
+ adds $lo1,$nlo,$hi1
120
+ mul $alo,$aj,$m0 // ap[j]*bp[0]
121
+ adc $hi1,$nhi,xzr
122
+ umulh $ahi,$aj,$m0
123
+
124
+ adds $lo1,$lo1,$lo0
125
+ mul $nlo,$nj,$m1 // np[j]*m1
126
+ adc $hi1,$hi1,xzr
127
+ umulh $nhi,$nj,$m1
128
+ str $lo1,[$tp],#8 // tp[j-1]
129
+ cbnz $j,.L1st
130
+
131
+ .L1st_skip:
132
+ adds $lo0,$alo,$hi0
133
+ sub $ap,$ap,$num // rewind $ap
134
+ adc $hi0,$ahi,xzr
135
+
136
+ adds $lo1,$nlo,$hi1
137
+ sub $np,$np,$num // rewind $np
138
+ adc $hi1,$nhi,xzr
139
+
140
+ adds $lo1,$lo1,$lo0
141
+ sub $i,$num,#8 // i=num-1
142
+ adcs $hi1,$hi1,$hi0
143
+
144
+ adc $ovf,xzr,xzr // upmost overflow bit
145
+ stp $lo1,$hi1,[$tp]
146
+
147
+ .Louter:
148
+ ldr $m0,[$bp],#8 // bp[i]
149
+ ldp $hi0,$aj,[$ap],#16
150
+ ldr $tj,[sp] // tp[0]
151
+ add $tp,sp,#8
152
+
153
+ mul $lo0,$hi0,$m0 // ap[0]*bp[i]
154
+ sub $j,$num,#16 // j=num-2
155
+ umulh $hi0,$hi0,$m0
156
+ ldp $hi1,$nj,[$np],#16
157
+ mul $alo,$aj,$m0 // ap[1]*bp[i]
158
+ adds $lo0,$lo0,$tj
159
+ umulh $ahi,$aj,$m0
160
+ adc $hi0,$hi0,xzr
161
+
162
+ mul $m1,$lo0,$n0
163
+ sub $i,$i,#8 // i--
164
+
165
+ // (*) mul $lo1,$hi1,$m1 // np[0]*m1
166
+ umulh $hi1,$hi1,$m1
167
+ mul $nlo,$nj,$m1 // np[1]*m1
168
+ // (*) adds $lo1,$lo1,$lo0
169
+ subs xzr,$lo0,#1 // (*)
170
+ umulh $nhi,$nj,$m1
171
+ cbz $j,.Linner_skip
172
+
173
+ .Linner:
174
+ ldr $aj,[$ap],#8
175
+ adc $hi1,$hi1,xzr
176
+ ldr $tj,[$tp],#8 // tp[j]
177
+ adds $lo0,$alo,$hi0
178
+ sub $j,$j,#8 // j--
179
+ adc $hi0,$ahi,xzr
180
+
181
+ adds $lo1,$nlo,$hi1
182
+ ldr $nj,[$np],#8
183
+ adc $hi1,$nhi,xzr
184
+
185
+ mul $alo,$aj,$m0 // ap[j]*bp[i]
186
+ adds $lo0,$lo0,$tj
187
+ umulh $ahi,$aj,$m0
188
+ adc $hi0,$hi0,xzr
189
+
190
+ mul $nlo,$nj,$m1 // np[j]*m1
191
+ adds $lo1,$lo1,$lo0
192
+ umulh $nhi,$nj,$m1
193
+ str $lo1,[$tp,#-16] // tp[j-1]
194
+ cbnz $j,.Linner
195
+
196
+ .Linner_skip:
197
+ ldr $tj,[$tp],#8 // tp[j]
198
+ adc $hi1,$hi1,xzr
199
+ adds $lo0,$alo,$hi0
200
+ sub $ap,$ap,$num // rewind $ap
201
+ adc $hi0,$ahi,xzr
202
+
203
+ adds $lo1,$nlo,$hi1
204
+ sub $np,$np,$num // rewind $np
205
+ adcs $hi1,$nhi,$ovf
206
+ adc $ovf,xzr,xzr
207
+
208
+ adds $lo0,$lo0,$tj
209
+ adc $hi0,$hi0,xzr
210
+
211
+ adds $lo1,$lo1,$lo0
212
+ adcs $hi1,$hi1,$hi0
213
+ adc $ovf,$ovf,xzr // upmost overflow bit
214
+ stp $lo1,$hi1,[$tp,#-16]
215
+
216
+ cbnz $i,.Louter
217
+
218
+ // Final step. We see if result is larger than modulus, and
219
+ // if it is, subtract the modulus. But comparison implies
220
+ // subtraction. So we subtract modulus, see if it borrowed,
221
+ // and conditionally copy original value.
222
+ ldr $tj,[sp] // tp[0]
223
+ add $tp,sp,#8
224
+ ldr $nj,[$np],#8 // np[0]
225
+ subs $j,$num,#8 // j=num-1 and clear borrow
226
+ mov $ap,$rp
227
+ .Lsub:
228
+ sbcs $aj,$tj,$nj // tp[j]-np[j]
229
+ ldr $tj,[$tp],#8
230
+ sub $j,$j,#8 // j--
231
+ ldr $nj,[$np],#8
232
+ str $aj,[$ap],#8 // rp[j]=tp[j]-np[j]
233
+ cbnz $j,.Lsub
234
+
235
+ sbcs $aj,$tj,$nj
236
+ sbcs $ovf,$ovf,xzr // did it borrow?
237
+ str $aj,[$ap],#8 // rp[num-1]
238
+
239
+ ldr $tj,[sp] // tp[0]
240
+ add $tp,sp,#8
241
+ ldr $aj,[$rp],#8 // rp[0]
242
+ sub $num,$num,#8 // num--
243
+ nop
244
+ .Lcond_copy:
245
+ sub $num,$num,#8 // num--
246
+ csel $nj,$tj,$aj,lo // did it borrow?
247
+ ldr $tj,[$tp],#8
248
+ ldr $aj,[$rp],#8
249
+ str xzr,[$tp,#-16] // wipe tp
250
+ str $nj,[$rp,#-16]
251
+ cbnz $num,.Lcond_copy
252
+
253
+ csel $nj,$tj,$aj,lo
254
+ str xzr,[$tp,#-8] // wipe tp
255
+ str $nj,[$rp,#-8]
256
+
257
+ ldp x19,x20,[x29,#16]
258
+ mov sp,x29
259
+ ldp x21,x22,[x29,#32]
260
+ mov x0,#1
261
+ ldp x23,x24,[x29,#48]
262
+ ldr x29,[sp],#64
263
+ ret
264
+ .size bn_mul_mont,.-bn_mul_mont
265
+ ___
266
+ {
267
+ ########################################################################
268
+ # Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
269
+
270
+ my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
271
+ my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
272
+ my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
273
+ my ($cnt,$carry,$topmost)=("x27","x28","x30");
274
+ my ($tp,$ap_end,$na0)=($bp,$np,$carry);
275
+
276
+ $code.=<<___;
277
+ .type __bn_sqr8x_mont,%function
278
+ .align 5
279
+ __bn_sqr8x_mont:
280
+ cmp $ap,$bp
281
+ b.ne __bn_mul4x_mont
282
+ .Lsqr8x_mont:
283
+ stp x29,x30,[sp,#-128]!
284
+ add x29,sp,#0
285
+ stp x19,x20,[sp,#16]
286
+ stp x21,x22,[sp,#32]
287
+ stp x23,x24,[sp,#48]
288
+ stp x25,x26,[sp,#64]
289
+ stp x27,x28,[sp,#80]
290
+ stp $rp,$np,[sp,#96] // offload rp and np
291
+
292
+ ldp $a0,$a1,[$ap,#8*0]
293
+ ldp $a2,$a3,[$ap,#8*2]
294
+ ldp $a4,$a5,[$ap,#8*4]
295
+ ldp $a6,$a7,[$ap,#8*6]
296
+
297
+ sub $tp,sp,$num,lsl#4
298
+ lsl $num,$num,#3
299
+ ldr $n0,[$n0] // *n0
300
+ mov sp,$tp // alloca
301
+ sub $cnt,$num,#8*8
302
+ b .Lsqr8x_zero_start
303
+
304
+ .Lsqr8x_zero:
305
+ sub $cnt,$cnt,#8*8
306
+ stp xzr,xzr,[$tp,#8*0]
307
+ stp xzr,xzr,[$tp,#8*2]
308
+ stp xzr,xzr,[$tp,#8*4]
309
+ stp xzr,xzr,[$tp,#8*6]
310
+ .Lsqr8x_zero_start:
311
+ stp xzr,xzr,[$tp,#8*8]
312
+ stp xzr,xzr,[$tp,#8*10]
313
+ stp xzr,xzr,[$tp,#8*12]
314
+ stp xzr,xzr,[$tp,#8*14]
315
+ add $tp,$tp,#8*16
316
+ cbnz $cnt,.Lsqr8x_zero
317
+
318
+ add $ap_end,$ap,$num
319
+ add $ap,$ap,#8*8
320
+ mov $acc0,xzr
321
+ mov $acc1,xzr
322
+ mov $acc2,xzr
323
+ mov $acc3,xzr
324
+ mov $acc4,xzr
325
+ mov $acc5,xzr
326
+ mov $acc6,xzr
327
+ mov $acc7,xzr
328
+ mov $tp,sp
329
+ str $n0,[x29,#112] // offload n0
330
+
331
+ // Multiply everything but a[i]*a[i]
332
+ .align 4
333
+ .Lsqr8x_outer_loop:
334
+ // a[1]a[0] (i)
335
+ // a[2]a[0]
336
+ // a[3]a[0]
337
+ // a[4]a[0]
338
+ // a[5]a[0]
339
+ // a[6]a[0]
340
+ // a[7]a[0]
341
+ // a[2]a[1] (ii)
342
+ // a[3]a[1]
343
+ // a[4]a[1]
344
+ // a[5]a[1]
345
+ // a[6]a[1]
346
+ // a[7]a[1]
347
+ // a[3]a[2] (iii)
348
+ // a[4]a[2]
349
+ // a[5]a[2]
350
+ // a[6]a[2]
351
+ // a[7]a[2]
352
+ // a[4]a[3] (iv)
353
+ // a[5]a[3]
354
+ // a[6]a[3]
355
+ // a[7]a[3]
356
+ // a[5]a[4] (v)
357
+ // a[6]a[4]
358
+ // a[7]a[4]
359
+ // a[6]a[5] (vi)
360
+ // a[7]a[5]
361
+ // a[7]a[6] (vii)
362
+
363
+ mul $t0,$a1,$a0 // lo(a[1..7]*a[0]) (i)
364
+ mul $t1,$a2,$a0
365
+ mul $t2,$a3,$a0
366
+ mul $t3,$a4,$a0
367
+ adds $acc1,$acc1,$t0 // t[1]+lo(a[1]*a[0])
368
+ mul $t0,$a5,$a0
369
+ adcs $acc2,$acc2,$t1
370
+ mul $t1,$a6,$a0
371
+ adcs $acc3,$acc3,$t2
372
+ mul $t2,$a7,$a0
373
+ adcs $acc4,$acc4,$t3
374
+ umulh $t3,$a1,$a0 // hi(a[1..7]*a[0])
375
+ adcs $acc5,$acc5,$t0
376
+ umulh $t0,$a2,$a0
377
+ adcs $acc6,$acc6,$t1
378
+ umulh $t1,$a3,$a0
379
+ adcs $acc7,$acc7,$t2
380
+ umulh $t2,$a4,$a0
381
+ stp $acc0,$acc1,[$tp],#8*2 // t[0..1]
382
+ adc $acc0,xzr,xzr // t[8]
383
+ adds $acc2,$acc2,$t3 // t[2]+lo(a[1]*a[0])
384
+ umulh $t3,$a5,$a0
385
+ adcs $acc3,$acc3,$t0
386
+ umulh $t0,$a6,$a0
387
+ adcs $acc4,$acc4,$t1
388
+ umulh $t1,$a7,$a0
389
+ adcs $acc5,$acc5,$t2
390
+ mul $t2,$a2,$a1 // lo(a[2..7]*a[1]) (ii)
391
+ adcs $acc6,$acc6,$t3
392
+ mul $t3,$a3,$a1
393
+ adcs $acc7,$acc7,$t0
394
+ mul $t0,$a4,$a1
395
+ adc $acc0,$acc0,$t1
396
+
397
+ mul $t1,$a5,$a1
398
+ adds $acc3,$acc3,$t2
399
+ mul $t2,$a6,$a1
400
+ adcs $acc4,$acc4,$t3
401
+ mul $t3,$a7,$a1
402
+ adcs $acc5,$acc5,$t0
403
+ umulh $t0,$a2,$a1 // hi(a[2..7]*a[1])
404
+ adcs $acc6,$acc6,$t1
405
+ umulh $t1,$a3,$a1
406
+ adcs $acc7,$acc7,$t2
407
+ umulh $t2,$a4,$a1
408
+ adcs $acc0,$acc0,$t3
409
+ umulh $t3,$a5,$a1
410
+ stp $acc2,$acc3,[$tp],#8*2 // t[2..3]
411
+ adc $acc1,xzr,xzr // t[9]
412
+ adds $acc4,$acc4,$t0
413
+ umulh $t0,$a6,$a1
414
+ adcs $acc5,$acc5,$t1
415
+ umulh $t1,$a7,$a1
416
+ adcs $acc6,$acc6,$t2
417
+ mul $t2,$a3,$a2 // lo(a[3..7]*a[2]) (iii)
418
+ adcs $acc7,$acc7,$t3
419
+ mul $t3,$a4,$a2
420
+ adcs $acc0,$acc0,$t0
421
+ mul $t0,$a5,$a2
422
+ adc $acc1,$acc1,$t1
423
+
424
+ mul $t1,$a6,$a2
425
+ adds $acc5,$acc5,$t2
426
+ mul $t2,$a7,$a2
427
+ adcs $acc6,$acc6,$t3
428
+ umulh $t3,$a3,$a2 // hi(a[3..7]*a[2])
429
+ adcs $acc7,$acc7,$t0
430
+ umulh $t0,$a4,$a2
431
+ adcs $acc0,$acc0,$t1
432
+ umulh $t1,$a5,$a2
433
+ adcs $acc1,$acc1,$t2
434
+ umulh $t2,$a6,$a2
435
+ stp $acc4,$acc5,[$tp],#8*2 // t[4..5]
436
+ adc $acc2,xzr,xzr // t[10]
437
+ adds $acc6,$acc6,$t3
438
+ umulh $t3,$a7,$a2
439
+ adcs $acc7,$acc7,$t0
440
+ mul $t0,$a4,$a3 // lo(a[4..7]*a[3]) (iv)
441
+ adcs $acc0,$acc0,$t1
442
+ mul $t1,$a5,$a3
443
+ adcs $acc1,$acc1,$t2
444
+ mul $t2,$a6,$a3
445
+ adc $acc2,$acc2,$t3
446
+
447
+ mul $t3,$a7,$a3
448
+ adds $acc7,$acc7,$t0
449
+ umulh $t0,$a4,$a3 // hi(a[4..7]*a[3])
450
+ adcs $acc0,$acc0,$t1
451
+ umulh $t1,$a5,$a3
452
+ adcs $acc1,$acc1,$t2
453
+ umulh $t2,$a6,$a3
454
+ adcs $acc2,$acc2,$t3
455
+ umulh $t3,$a7,$a3
456
+ stp $acc6,$acc7,[$tp],#8*2 // t[6..7]
457
+ adc $acc3,xzr,xzr // t[11]
458
+ adds $acc0,$acc0,$t0
459
+ mul $t0,$a5,$a4 // lo(a[5..7]*a[4]) (v)
460
+ adcs $acc1,$acc1,$t1
461
+ mul $t1,$a6,$a4
462
+ adcs $acc2,$acc2,$t2
463
+ mul $t2,$a7,$a4
464
+ adc $acc3,$acc3,$t3
465
+
466
+ umulh $t3,$a5,$a4 // hi(a[5..7]*a[4])
467
+ adds $acc1,$acc1,$t0
468
+ umulh $t0,$a6,$a4
469
+ adcs $acc2,$acc2,$t1
470
+ umulh $t1,$a7,$a4
471
+ adcs $acc3,$acc3,$t2
472
+ mul $t2,$a6,$a5 // lo(a[6..7]*a[5]) (vi)
473
+ adc $acc4,xzr,xzr // t[12]
474
+ adds $acc2,$acc2,$t3
475
+ mul $t3,$a7,$a5
476
+ adcs $acc3,$acc3,$t0
477
+ umulh $t0,$a6,$a5 // hi(a[6..7]*a[5])
478
+ adc $acc4,$acc4,$t1
479
+
480
+ umulh $t1,$a7,$a5
481
+ adds $acc3,$acc3,$t2
482
+ mul $t2,$a7,$a6 // lo(a[7]*a[6]) (vii)
483
+ adcs $acc4,$acc4,$t3
484
+ umulh $t3,$a7,$a6 // hi(a[7]*a[6])
485
+ adc $acc5,xzr,xzr // t[13]
486
+ adds $acc4,$acc4,$t0
487
+ sub $cnt,$ap_end,$ap // done yet?
488
+ adc $acc5,$acc5,$t1
489
+
490
+ adds $acc5,$acc5,$t2
491
+ sub $t0,$ap_end,$num // rewinded ap
492
+ adc $acc6,xzr,xzr // t[14]
493
+ add $acc6,$acc6,$t3
494
+
495
+ cbz $cnt,.Lsqr8x_outer_break
496
+
497
+ mov $n0,$a0
498
+ ldp $a0,$a1,[$tp,#8*0]
499
+ ldp $a2,$a3,[$tp,#8*2]
500
+ ldp $a4,$a5,[$tp,#8*4]
501
+ ldp $a6,$a7,[$tp,#8*6]
502
+ adds $acc0,$acc0,$a0
503
+ adcs $acc1,$acc1,$a1
504
+ ldp $a0,$a1,[$ap,#8*0]
505
+ adcs $acc2,$acc2,$a2
506
+ adcs $acc3,$acc3,$a3
507
+ ldp $a2,$a3,[$ap,#8*2]
508
+ adcs $acc4,$acc4,$a4
509
+ adcs $acc5,$acc5,$a5
510
+ ldp $a4,$a5,[$ap,#8*4]
511
+ adcs $acc6,$acc6,$a6
512
+ mov $rp,$ap
513
+ adcs $acc7,xzr,$a7
514
+ ldp $a6,$a7,[$ap,#8*6]
515
+ add $ap,$ap,#8*8
516
+ //adc $carry,xzr,xzr // moved below
517
+ mov $cnt,#-8*8
518
+
519
+ // a[8]a[0]
520
+ // a[9]a[0]
521
+ // a[a]a[0]
522
+ // a[b]a[0]
523
+ // a[c]a[0]
524
+ // a[d]a[0]
525
+ // a[e]a[0]
526
+ // a[f]a[0]
527
+ // a[8]a[1]
528
+ // a[f]a[1]........................
529
+ // a[8]a[2]
530
+ // a[f]a[2]........................
531
+ // a[8]a[3]
532
+ // a[f]a[3]........................
533
+ // a[8]a[4]
534
+ // a[f]a[4]........................
535
+ // a[8]a[5]
536
+ // a[f]a[5]........................
537
+ // a[8]a[6]
538
+ // a[f]a[6]........................
539
+ // a[8]a[7]
540
+ // a[f]a[7]........................
541
+ .Lsqr8x_mul:
542
+ mul $t0,$a0,$n0
543
+ adc $carry,xzr,xzr // carry bit, modulo-scheduled
544
+ mul $t1,$a1,$n0
545
+ add $cnt,$cnt,#8
546
+ mul $t2,$a2,$n0
547
+ mul $t3,$a3,$n0
548
+ adds $acc0,$acc0,$t0
549
+ mul $t0,$a4,$n0
550
+ adcs $acc1,$acc1,$t1
551
+ mul $t1,$a5,$n0
552
+ adcs $acc2,$acc2,$t2
553
+ mul $t2,$a6,$n0
554
+ adcs $acc3,$acc3,$t3
555
+ mul $t3,$a7,$n0
556
+ adcs $acc4,$acc4,$t0
557
+ umulh $t0,$a0,$n0
558
+ adcs $acc5,$acc5,$t1
559
+ umulh $t1,$a1,$n0
560
+ adcs $acc6,$acc6,$t2
561
+ umulh $t2,$a2,$n0
562
+ adcs $acc7,$acc7,$t3
563
+ umulh $t3,$a3,$n0
564
+ adc $carry,$carry,xzr
565
+ str $acc0,[$tp],#8
566
+ adds $acc0,$acc1,$t0
567
+ umulh $t0,$a4,$n0
568
+ adcs $acc1,$acc2,$t1
569
+ umulh $t1,$a5,$n0
570
+ adcs $acc2,$acc3,$t2
571
+ umulh $t2,$a6,$n0
572
+ adcs $acc3,$acc4,$t3
573
+ umulh $t3,$a7,$n0
574
+ ldr $n0,[$rp,$cnt]
575
+ adcs $acc4,$acc5,$t0
576
+ adcs $acc5,$acc6,$t1
577
+ adcs $acc6,$acc7,$t2
578
+ adcs $acc7,$carry,$t3
579
+ //adc $carry,xzr,xzr // moved above
580
+ cbnz $cnt,.Lsqr8x_mul
581
+ // note that carry flag is guaranteed
582
+ // to be zero at this point
583
+ cmp $ap,$ap_end // done yet?
584
+ b.eq .Lsqr8x_break
585
+
586
+ ldp $a0,$a1,[$tp,#8*0]
587
+ ldp $a2,$a3,[$tp,#8*2]
588
+ ldp $a4,$a5,[$tp,#8*4]
589
+ ldp $a6,$a7,[$tp,#8*6]
590
+ adds $acc0,$acc0,$a0
591
+ ldr $n0,[$rp,#-8*8]
592
+ adcs $acc1,$acc1,$a1
593
+ ldp $a0,$a1,[$ap,#8*0]
594
+ adcs $acc2,$acc2,$a2
595
+ adcs $acc3,$acc3,$a3
596
+ ldp $a2,$a3,[$ap,#8*2]
597
+ adcs $acc4,$acc4,$a4
598
+ adcs $acc5,$acc5,$a5
599
+ ldp $a4,$a5,[$ap,#8*4]
600
+ adcs $acc6,$acc6,$a6
601
+ mov $cnt,#-8*8
602
+ adcs $acc7,$acc7,$a7
603
+ ldp $a6,$a7,[$ap,#8*6]
604
+ add $ap,$ap,#8*8
605
+ //adc $carry,xzr,xzr // moved above
606
+ b .Lsqr8x_mul
607
+
608
+ .align 4
609
+ .Lsqr8x_break:
610
+ ldp $a0,$a1,[$rp,#8*0]
611
+ add $ap,$rp,#8*8
612
+ ldp $a2,$a3,[$rp,#8*2]
613
+ sub $t0,$ap_end,$ap // is it last iteration?
614
+ ldp $a4,$a5,[$rp,#8*4]
615
+ sub $t1,$tp,$t0
616
+ ldp $a6,$a7,[$rp,#8*6]
617
+ cbz $t0,.Lsqr8x_outer_loop
618
+
619
+ stp $acc0,$acc1,[$tp,#8*0]
620
+ ldp $acc0,$acc1,[$t1,#8*0]
621
+ stp $acc2,$acc3,[$tp,#8*2]
622
+ ldp $acc2,$acc3,[$t1,#8*2]
623
+ stp $acc4,$acc5,[$tp,#8*4]
624
+ ldp $acc4,$acc5,[$t1,#8*4]
625
+ stp $acc6,$acc7,[$tp,#8*6]
626
+ mov $tp,$t1
627
+ ldp $acc6,$acc7,[$t1,#8*6]
628
+ b .Lsqr8x_outer_loop
629
+
630
+ .align 4
631
+ .Lsqr8x_outer_break:
632
+ // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
633
+ ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0]
634
+ ldp $t1,$t2,[sp,#8*1]
635
+ ldp $a5,$a7,[$t0,#8*2]
636
+ add $ap,$t0,#8*4
637
+ ldp $t3,$t0,[sp,#8*3]
638
+
639
+ stp $acc0,$acc1,[$tp,#8*0]
640
+ mul $acc0,$a1,$a1
641
+ stp $acc2,$acc3,[$tp,#8*2]
642
+ umulh $a1,$a1,$a1
643
+ stp $acc4,$acc5,[$tp,#8*4]
644
+ mul $a2,$a3,$a3
645
+ stp $acc6,$acc7,[$tp,#8*6]
646
+ mov $tp,sp
647
+ umulh $a3,$a3,$a3
648
+ adds $acc1,$a1,$t1,lsl#1
649
+ extr $t1,$t2,$t1,#63
650
+ sub $cnt,$num,#8*4
651
+
652
+ .Lsqr4x_shift_n_add:
653
+ adcs $acc2,$a2,$t1
654
+ extr $t2,$t3,$t2,#63
655
+ sub $cnt,$cnt,#8*4
656
+ adcs $acc3,$a3,$t2
657
+ ldp $t1,$t2,[$tp,#8*5]
658
+ mul $a4,$a5,$a5
659
+ ldp $a1,$a3,[$ap],#8*2
660
+ umulh $a5,$a5,$a5
661
+ mul $a6,$a7,$a7
662
+ umulh $a7,$a7,$a7
663
+ extr $t3,$t0,$t3,#63
664
+ stp $acc0,$acc1,[$tp,#8*0]
665
+ adcs $acc4,$a4,$t3
666
+ extr $t0,$t1,$t0,#63
667
+ stp $acc2,$acc3,[$tp,#8*2]
668
+ adcs $acc5,$a5,$t0
669
+ ldp $t3,$t0,[$tp,#8*7]
670
+ extr $t1,$t2,$t1,#63
671
+ adcs $acc6,$a6,$t1
672
+ extr $t2,$t3,$t2,#63
673
+ adcs $acc7,$a7,$t2
674
+ ldp $t1,$t2,[$tp,#8*9]
675
+ mul $a0,$a1,$a1
676
+ ldp $a5,$a7,[$ap],#8*2
677
+ umulh $a1,$a1,$a1
678
+ mul $a2,$a3,$a3
679
+ umulh $a3,$a3,$a3
680
+ stp $acc4,$acc5,[$tp,#8*4]
681
+ extr $t3,$t0,$t3,#63
682
+ stp $acc6,$acc7,[$tp,#8*6]
683
+ add $tp,$tp,#8*8
684
+ adcs $acc0,$a0,$t3
685
+ extr $t0,$t1,$t0,#63
686
+ adcs $acc1,$a1,$t0
687
+ ldp $t3,$t0,[$tp,#8*3]
688
+ extr $t1,$t2,$t1,#63
689
+ cbnz $cnt,.Lsqr4x_shift_n_add
690
+ ___
691
+ my ($np,$np_end)=($ap,$ap_end);
692
+ $code.=<<___;
693
+ ldp $np,$n0,[x29,#104] // pull np and n0
694
+
695
+ adcs $acc2,$a2,$t1
696
+ extr $t2,$t3,$t2,#63
697
+ adcs $acc3,$a3,$t2
698
+ ldp $t1,$t2,[$tp,#8*5]
699
+ mul $a4,$a5,$a5
700
+ umulh $a5,$a5,$a5
701
+ stp $acc0,$acc1,[$tp,#8*0]
702
+ mul $a6,$a7,$a7
703
+ umulh $a7,$a7,$a7
704
+ stp $acc2,$acc3,[$tp,#8*2]
705
+ extr $t3,$t0,$t3,#63
706
+ adcs $acc4,$a4,$t3
707
+ extr $t0,$t1,$t0,#63
708
+ ldp $acc0,$acc1,[sp,#8*0]
709
+ adcs $acc5,$a5,$t0
710
+ extr $t1,$t2,$t1,#63
711
+ ldp $a0,$a1,[$np,#8*0]
712
+ adcs $acc6,$a6,$t1
713
+ extr $t2,xzr,$t2,#63
714
+ ldp $a2,$a3,[$np,#8*2]
715
+ adc $acc7,$a7,$t2
716
+ ldp $a4,$a5,[$np,#8*4]
717
+
718
+ // Reduce by 512 bits per iteration
719
+ mul $na0,$n0,$acc0 // t[0]*n0
720
+ ldp $a6,$a7,[$np,#8*6]
721
+ add $np_end,$np,$num
722
+ ldp $acc2,$acc3,[sp,#8*2]
723
+ stp $acc4,$acc5,[$tp,#8*4]
724
+ ldp $acc4,$acc5,[sp,#8*4]
725
+ stp $acc6,$acc7,[$tp,#8*6]
726
+ ldp $acc6,$acc7,[sp,#8*6]
727
+ add $np,$np,#8*8
728
+ mov $topmost,xzr // initial top-most carry
729
+ mov $tp,sp
730
+ mov $cnt,#8
731
+
732
+ .Lsqr8x_reduction:
733
+ // (*) mul $t0,$a0,$na0 // lo(n[0-7])*lo(t[0]*n0)
734
+ mul $t1,$a1,$na0
735
+ sub $cnt,$cnt,#1
736
+ mul $t2,$a2,$na0
737
+ str $na0,[$tp],#8 // put aside t[0]*n0 for tail processing
738
+ mul $t3,$a3,$na0
739
+ // (*) adds xzr,$acc0,$t0
740
+ subs xzr,$acc0,#1 // (*)
741
+ mul $t0,$a4,$na0
742
+ adcs $acc0,$acc1,$t1
743
+ mul $t1,$a5,$na0
744
+ adcs $acc1,$acc2,$t2
745
+ mul $t2,$a6,$na0
746
+ adcs $acc2,$acc3,$t3
747
+ mul $t3,$a7,$na0
748
+ adcs $acc3,$acc4,$t0
749
+ umulh $t0,$a0,$na0 // hi(n[0-7])*lo(t[0]*n0)
750
+ adcs $acc4,$acc5,$t1
751
+ umulh $t1,$a1,$na0
752
+ adcs $acc5,$acc6,$t2
753
+ umulh $t2,$a2,$na0
754
+ adcs $acc6,$acc7,$t3
755
+ umulh $t3,$a3,$na0
756
+ adc $acc7,xzr,xzr
757
+ adds $acc0,$acc0,$t0
758
+ umulh $t0,$a4,$na0
759
+ adcs $acc1,$acc1,$t1
760
+ umulh $t1,$a5,$na0
761
+ adcs $acc2,$acc2,$t2
762
+ umulh $t2,$a6,$na0
763
+ adcs $acc3,$acc3,$t3
764
+ umulh $t3,$a7,$na0
765
+ mul $na0,$n0,$acc0 // next t[0]*n0
766
+ adcs $acc4,$acc4,$t0
767
+ adcs $acc5,$acc5,$t1
768
+ adcs $acc6,$acc6,$t2
769
+ adc $acc7,$acc7,$t3
770
+ cbnz $cnt,.Lsqr8x_reduction
771
+
772
+ ldp $t0,$t1,[$tp,#8*0]
773
+ ldp $t2,$t3,[$tp,#8*2]
774
+ mov $rp,$tp
775
+ sub $cnt,$np_end,$np // done yet?
776
+ adds $acc0,$acc0,$t0
777
+ adcs $acc1,$acc1,$t1
778
+ ldp $t0,$t1,[$tp,#8*4]
779
+ adcs $acc2,$acc2,$t2
780
+ adcs $acc3,$acc3,$t3
781
+ ldp $t2,$t3,[$tp,#8*6]
782
+ adcs $acc4,$acc4,$t0
783
+ adcs $acc5,$acc5,$t1
784
+ adcs $acc6,$acc6,$t2
785
+ adcs $acc7,$acc7,$t3
786
+ //adc $carry,xzr,xzr // moved below
787
+ cbz $cnt,.Lsqr8x8_post_condition
788
+
789
+ ldr $n0,[$tp,#-8*8]
790
+ ldp $a0,$a1,[$np,#8*0]
791
+ ldp $a2,$a3,[$np,#8*2]
792
+ ldp $a4,$a5,[$np,#8*4]
793
+ mov $cnt,#-8*8
794
+ ldp $a6,$a7,[$np,#8*6]
795
+ add $np,$np,#8*8
796
+
797
+ .Lsqr8x_tail:
798
+ mul $t0,$a0,$n0
799
+ adc $carry,xzr,xzr // carry bit, modulo-scheduled
800
+ mul $t1,$a1,$n0
801
+ add $cnt,$cnt,#8
802
+ mul $t2,$a2,$n0
803
+ mul $t3,$a3,$n0
804
+ adds $acc0,$acc0,$t0
805
+ mul $t0,$a4,$n0
806
+ adcs $acc1,$acc1,$t1
807
+ mul $t1,$a5,$n0
808
+ adcs $acc2,$acc2,$t2
809
+ mul $t2,$a6,$n0
810
+ adcs $acc3,$acc3,$t3
811
+ mul $t3,$a7,$n0
812
+ adcs $acc4,$acc4,$t0
813
+ umulh $t0,$a0,$n0
814
+ adcs $acc5,$acc5,$t1
815
+ umulh $t1,$a1,$n0
816
+ adcs $acc6,$acc6,$t2
817
+ umulh $t2,$a2,$n0
818
+ adcs $acc7,$acc7,$t3
819
+ umulh $t3,$a3,$n0
820
+ adc $carry,$carry,xzr
821
+ str $acc0,[$tp],#8
822
+ adds $acc0,$acc1,$t0
823
+ umulh $t0,$a4,$n0
824
+ adcs $acc1,$acc2,$t1
825
+ umulh $t1,$a5,$n0
826
+ adcs $acc2,$acc3,$t2
827
+ umulh $t2,$a6,$n0
828
+ adcs $acc3,$acc4,$t3
829
+ umulh $t3,$a7,$n0
830
+ ldr $n0,[$rp,$cnt]
831
+ adcs $acc4,$acc5,$t0
832
+ adcs $acc5,$acc6,$t1
833
+ adcs $acc6,$acc7,$t2
834
+ adcs $acc7,$carry,$t3
835
+ //adc $carry,xzr,xzr // moved above
836
+ cbnz $cnt,.Lsqr8x_tail
837
+ // note that carry flag is guaranteed
838
+ // to be zero at this point
839
+ ldp $a0,$a1,[$tp,#8*0]
840
+ sub $cnt,$np_end,$np // done yet?
841
+ sub $t2,$np_end,$num // rewinded np
842
+ ldp $a2,$a3,[$tp,#8*2]
843
+ ldp $a4,$a5,[$tp,#8*4]
844
+ ldp $a6,$a7,[$tp,#8*6]
845
+ cbz $cnt,.Lsqr8x_tail_break
846
+
847
+ ldr $n0,[$rp,#-8*8]
848
+ adds $acc0,$acc0,$a0
849
+ adcs $acc1,$acc1,$a1
850
+ ldp $a0,$a1,[$np,#8*0]
851
+ adcs $acc2,$acc2,$a2
852
+ adcs $acc3,$acc3,$a3
853
+ ldp $a2,$a3,[$np,#8*2]
854
+ adcs $acc4,$acc4,$a4
855
+ adcs $acc5,$acc5,$a5
856
+ ldp $a4,$a5,[$np,#8*4]
857
+ adcs $acc6,$acc6,$a6
858
+ mov $cnt,#-8*8
859
+ adcs $acc7,$acc7,$a7
860
+ ldp $a6,$a7,[$np,#8*6]
861
+ add $np,$np,#8*8
862
+ //adc $carry,xzr,xzr // moved above
863
+ b .Lsqr8x_tail
864
+
865
+ .align 4
866
+ .Lsqr8x_tail_break:
867
+ ldr $n0,[x29,#112] // pull n0
868
+ add $cnt,$tp,#8*8 // end of current t[num] window
869
+
870
+ subs xzr,$topmost,#1 // "move" top-most carry to carry bit
871
+ adcs $t0,$acc0,$a0
872
+ adcs $t1,$acc1,$a1
873
+ ldp $acc0,$acc1,[$rp,#8*0]
874
+ adcs $acc2,$acc2,$a2
875
+ ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0]
876
+ adcs $acc3,$acc3,$a3
877
+ ldp $a2,$a3,[$t2,#8*2]
878
+ adcs $acc4,$acc4,$a4
879
+ adcs $acc5,$acc5,$a5
880
+ ldp $a4,$a5,[$t2,#8*4]
881
+ adcs $acc6,$acc6,$a6
882
+ adcs $acc7,$acc7,$a7
883
+ ldp $a6,$a7,[$t2,#8*6]
884
+ add $np,$t2,#8*8
885
+ adc $topmost,xzr,xzr // top-most carry
886
+ mul $na0,$n0,$acc0
887
+ stp $t0,$t1,[$tp,#8*0]
888
+ stp $acc2,$acc3,[$tp,#8*2]
889
+ ldp $acc2,$acc3,[$rp,#8*2]
890
+ stp $acc4,$acc5,[$tp,#8*4]
891
+ ldp $acc4,$acc5,[$rp,#8*4]
892
+ cmp $cnt,x29 // did we hit the bottom?
893
+ stp $acc6,$acc7,[$tp,#8*6]
894
+ mov $tp,$rp // slide the window
895
+ ldp $acc6,$acc7,[$rp,#8*6]
896
+ mov $cnt,#8
897
+ b.ne .Lsqr8x_reduction
898
+
899
+ // Final step. We see if result is larger than modulus, and
900
+ // if it is, subtract the modulus. But comparison implies
901
+ // subtraction. So we subtract modulus, see if it borrowed,
902
+ // and conditionally copy original value.
903
+ ldr $rp,[x29,#96] // pull rp
904
+ add $tp,$tp,#8*8
905
+ subs $t0,$acc0,$a0
906
+ sbcs $t1,$acc1,$a1
907
+ sub $cnt,$num,#8*8
908
+ mov $ap_end,$rp // $rp copy
909
+
910
+ .Lsqr8x_sub:
911
+ sbcs $t2,$acc2,$a2
912
+ ldp $a0,$a1,[$np,#8*0]
913
+ sbcs $t3,$acc3,$a3
914
+ stp $t0,$t1,[$rp,#8*0]
915
+ sbcs $t0,$acc4,$a4
916
+ ldp $a2,$a3,[$np,#8*2]
917
+ sbcs $t1,$acc5,$a5
918
+ stp $t2,$t3,[$rp,#8*2]
919
+ sbcs $t2,$acc6,$a6
920
+ ldp $a4,$a5,[$np,#8*4]
921
+ sbcs $t3,$acc7,$a7
922
+ ldp $a6,$a7,[$np,#8*6]
923
+ add $np,$np,#8*8
924
+ ldp $acc0,$acc1,[$tp,#8*0]
925
+ sub $cnt,$cnt,#8*8
926
+ ldp $acc2,$acc3,[$tp,#8*2]
927
+ ldp $acc4,$acc5,[$tp,#8*4]
928
+ ldp $acc6,$acc7,[$tp,#8*6]
929
+ add $tp,$tp,#8*8
930
+ stp $t0,$t1,[$rp,#8*4]
931
+ sbcs $t0,$acc0,$a0
932
+ stp $t2,$t3,[$rp,#8*6]
933
+ add $rp,$rp,#8*8
934
+ sbcs $t1,$acc1,$a1
935
+ cbnz $cnt,.Lsqr8x_sub
936
+
937
+ sbcs $t2,$acc2,$a2
938
+ mov $tp,sp
939
+ add $ap,sp,$num
940
+ ldp $a0,$a1,[$ap_end,#8*0]
941
+ sbcs $t3,$acc3,$a3
942
+ stp $t0,$t1,[$rp,#8*0]
943
+ sbcs $t0,$acc4,$a4
944
+ ldp $a2,$a3,[$ap_end,#8*2]
945
+ sbcs $t1,$acc5,$a5
946
+ stp $t2,$t3,[$rp,#8*2]
947
+ sbcs $t2,$acc6,$a6
948
+ ldp $acc0,$acc1,[$ap,#8*0]
949
+ sbcs $t3,$acc7,$a7
950
+ ldp $acc2,$acc3,[$ap,#8*2]
951
+ sbcs xzr,$topmost,xzr // did it borrow?
952
+ ldr x30,[x29,#8] // pull return address
953
+ stp $t0,$t1,[$rp,#8*4]
954
+ stp $t2,$t3,[$rp,#8*6]
955
+
956
+ sub $cnt,$num,#8*4
957
+ .Lsqr4x_cond_copy:
958
+ sub $cnt,$cnt,#8*4
959
+ csel $t0,$acc0,$a0,lo
960
+ stp xzr,xzr,[$tp,#8*0]
961
+ csel $t1,$acc1,$a1,lo
962
+ ldp $a0,$a1,[$ap_end,#8*4]
963
+ ldp $acc0,$acc1,[$ap,#8*4]
964
+ csel $t2,$acc2,$a2,lo
965
+ stp xzr,xzr,[$tp,#8*2]
966
+ add $tp,$tp,#8*4
967
+ csel $t3,$acc3,$a3,lo
968
+ ldp $a2,$a3,[$ap_end,#8*6]
969
+ ldp $acc2,$acc3,[$ap,#8*6]
970
+ add $ap,$ap,#8*4
971
+ stp $t0,$t1,[$ap_end,#8*0]
972
+ stp $t2,$t3,[$ap_end,#8*2]
973
+ add $ap_end,$ap_end,#8*4
974
+ stp xzr,xzr,[$ap,#8*0]
975
+ stp xzr,xzr,[$ap,#8*2]
976
+ cbnz $cnt,.Lsqr4x_cond_copy
977
+
978
+ csel $t0,$acc0,$a0,lo
979
+ stp xzr,xzr,[$tp,#8*0]
980
+ csel $t1,$acc1,$a1,lo
981
+ stp xzr,xzr,[$tp,#8*2]
982
+ csel $t2,$acc2,$a2,lo
983
+ csel $t3,$acc3,$a3,lo
984
+ stp $t0,$t1,[$ap_end,#8*0]
985
+ stp $t2,$t3,[$ap_end,#8*2]
986
+
987
+ b .Lsqr8x_done
988
+
989
+ .align 4
990
+ .Lsqr8x8_post_condition:
991
+ adc $carry,xzr,xzr
992
+ ldr x30,[x29,#8] // pull return address
993
+ // $acc0-7,$carry hold result, $a0-7 hold modulus
994
+ subs $a0,$acc0,$a0
995
+ ldr $ap,[x29,#96] // pull rp
996
+ sbcs $a1,$acc1,$a1
997
+ stp xzr,xzr,[sp,#8*0]
998
+ sbcs $a2,$acc2,$a2
999
+ stp xzr,xzr,[sp,#8*2]
1000
+ sbcs $a3,$acc3,$a3
1001
+ stp xzr,xzr,[sp,#8*4]
1002
+ sbcs $a4,$acc4,$a4
1003
+ stp xzr,xzr,[sp,#8*6]
1004
+ sbcs $a5,$acc5,$a5
1005
+ stp xzr,xzr,[sp,#8*8]
1006
+ sbcs $a6,$acc6,$a6
1007
+ stp xzr,xzr,[sp,#8*10]
1008
+ sbcs $a7,$acc7,$a7
1009
+ stp xzr,xzr,[sp,#8*12]
1010
+ sbcs $carry,$carry,xzr // did it borrow?
1011
+ stp xzr,xzr,[sp,#8*14]
1012
+
1013
+ // $a0-7 hold result-modulus
1014
+ csel $a0,$acc0,$a0,lo
1015
+ csel $a1,$acc1,$a1,lo
1016
+ csel $a2,$acc2,$a2,lo
1017
+ csel $a3,$acc3,$a3,lo
1018
+ stp $a0,$a1,[$ap,#8*0]
1019
+ csel $a4,$acc4,$a4,lo
1020
+ csel $a5,$acc5,$a5,lo
1021
+ stp $a2,$a3,[$ap,#8*2]
1022
+ csel $a6,$acc6,$a6,lo
1023
+ csel $a7,$acc7,$a7,lo
1024
+ stp $a4,$a5,[$ap,#8*4]
1025
+ stp $a6,$a7,[$ap,#8*6]
1026
+
1027
+ .Lsqr8x_done:
1028
+ ldp x19,x20,[x29,#16]
1029
+ mov sp,x29
1030
+ ldp x21,x22,[x29,#32]
1031
+ mov x0,#1
1032
+ ldp x23,x24,[x29,#48]
1033
+ ldp x25,x26,[x29,#64]
1034
+ ldp x27,x28,[x29,#80]
1035
+ ldr x29,[sp],#128
1036
+ ret
1037
+ .size __bn_sqr8x_mont,.-__bn_sqr8x_mont
1038
+ ___
1039
+ }
1040
+
1041
+ {
1042
+ ########################################################################
1043
+ # Even though this might look as ARMv8 adaptation of mulx4x_mont from
1044
+ # x86_64-mont5 module, it's different in sense that it performs
1045
+ # reduction 256 bits at a time.
1046
+
1047
+ my ($a0,$a1,$a2,$a3,
1048
+ $t0,$t1,$t2,$t3,
1049
+ $m0,$m1,$m2,$m3,
1050
+ $acc0,$acc1,$acc2,$acc3,$acc4,
1051
+ $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
1052
+ my $bp_end=$rp;
1053
+ my ($carry,$topmost) = ($rp,"x30");
1054
+
1055
+ $code.=<<___;
1056
+ .type __bn_mul4x_mont,%function
1057
+ .align 5
1058
+ __bn_mul4x_mont:
1059
+ stp x29,x30,[sp,#-128]!
1060
+ add x29,sp,#0
1061
+ stp x19,x20,[sp,#16]
1062
+ stp x21,x22,[sp,#32]
1063
+ stp x23,x24,[sp,#48]
1064
+ stp x25,x26,[sp,#64]
1065
+ stp x27,x28,[sp,#80]
1066
+
1067
+ sub $tp,sp,$num,lsl#3
1068
+ lsl $num,$num,#3
1069
+ ldr $n0,[$n0] // *n0
1070
+ sub sp,$tp,#8*4 // alloca
1071
+
1072
+ add $t0,$bp,$num
1073
+ add $ap_end,$ap,$num
1074
+ stp $rp,$t0,[x29,#96] // offload rp and &b[num]
1075
+
1076
+ ldr $bi,[$bp,#8*0] // b[0]
1077
+ ldp $a0,$a1,[$ap,#8*0] // a[0..3]
1078
+ ldp $a2,$a3,[$ap,#8*2]
1079
+ add $ap,$ap,#8*4
1080
+ mov $acc0,xzr
1081
+ mov $acc1,xzr
1082
+ mov $acc2,xzr
1083
+ mov $acc3,xzr
1084
+ ldp $m0,$m1,[$np,#8*0] // n[0..3]
1085
+ ldp $m2,$m3,[$np,#8*2]
1086
+ adds $np,$np,#8*4 // clear carry bit
1087
+ mov $carry,xzr
1088
+ mov $cnt,#0
1089
+ mov $tp,sp
1090
+
1091
+ .Loop_mul4x_1st_reduction:
1092
+ mul $t0,$a0,$bi // lo(a[0..3]*b[0])
1093
+ adc $carry,$carry,xzr // modulo-scheduled
1094
+ mul $t1,$a1,$bi
1095
+ add $cnt,$cnt,#8
1096
+ mul $t2,$a2,$bi
1097
+ and $cnt,$cnt,#31
1098
+ mul $t3,$a3,$bi
1099
+ adds $acc0,$acc0,$t0
1100
+ umulh $t0,$a0,$bi // hi(a[0..3]*b[0])
1101
+ adcs $acc1,$acc1,$t1
1102
+ mul $mi,$acc0,$n0 // t[0]*n0
1103
+ adcs $acc2,$acc2,$t2
1104
+ umulh $t1,$a1,$bi
1105
+ adcs $acc3,$acc3,$t3
1106
+ umulh $t2,$a2,$bi
1107
+ adc $acc4,xzr,xzr
1108
+ umulh $t3,$a3,$bi
1109
+ ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
1110
+ adds $acc1,$acc1,$t0
1111
+ // (*) mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0)
1112
+ str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
1113
+ adcs $acc2,$acc2,$t1
1114
+ mul $t1,$m1,$mi
1115
+ adcs $acc3,$acc3,$t2
1116
+ mul $t2,$m2,$mi
1117
+ adc $acc4,$acc4,$t3 // can't overflow
1118
+ mul $t3,$m3,$mi
1119
+ // (*) adds xzr,$acc0,$t0
1120
+ subs xzr,$acc0,#1 // (*)
1121
+ umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
1122
+ adcs $acc0,$acc1,$t1
1123
+ umulh $t1,$m1,$mi
1124
+ adcs $acc1,$acc2,$t2
1125
+ umulh $t2,$m2,$mi
1126
+ adcs $acc2,$acc3,$t3
1127
+ umulh $t3,$m3,$mi
1128
+ adcs $acc3,$acc4,$carry
1129
+ adc $carry,xzr,xzr
1130
+ adds $acc0,$acc0,$t0
1131
+ sub $t0,$ap_end,$ap
1132
+ adcs $acc1,$acc1,$t1
1133
+ adcs $acc2,$acc2,$t2
1134
+ adcs $acc3,$acc3,$t3
1135
+ //adc $carry,$carry,xzr
1136
+ cbnz $cnt,.Loop_mul4x_1st_reduction
1137
+
1138
+ cbz $t0,.Lmul4x4_post_condition
1139
+
1140
+ ldp $a0,$a1,[$ap,#8*0] // a[4..7]
1141
+ ldp $a2,$a3,[$ap,#8*2]
1142
+ add $ap,$ap,#8*4
1143
+ ldr $mi,[sp] // a[0]*n0
1144
+ ldp $m0,$m1,[$np,#8*0] // n[4..7]
1145
+ ldp $m2,$m3,[$np,#8*2]
1146
+ add $np,$np,#8*4
1147
+
1148
+ .Loop_mul4x_1st_tail:
1149
+ mul $t0,$a0,$bi // lo(a[4..7]*b[i])
1150
+ adc $carry,$carry,xzr // modulo-scheduled
1151
+ mul $t1,$a1,$bi
1152
+ add $cnt,$cnt,#8
1153
+ mul $t2,$a2,$bi
1154
+ and $cnt,$cnt,#31
1155
+ mul $t3,$a3,$bi
1156
+ adds $acc0,$acc0,$t0
1157
+ umulh $t0,$a0,$bi // hi(a[4..7]*b[i])
1158
+ adcs $acc1,$acc1,$t1
1159
+ umulh $t1,$a1,$bi
1160
+ adcs $acc2,$acc2,$t2
1161
+ umulh $t2,$a2,$bi
1162
+ adcs $acc3,$acc3,$t3
1163
+ umulh $t3,$a3,$bi
1164
+ adc $acc4,xzr,xzr
1165
+ ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
1166
+ adds $acc1,$acc1,$t0
1167
+ mul $t0,$m0,$mi // lo(n[4..7]*a[0]*n0)
1168
+ adcs $acc2,$acc2,$t1
1169
+ mul $t1,$m1,$mi
1170
+ adcs $acc3,$acc3,$t2
1171
+ mul $t2,$m2,$mi
1172
+ adc $acc4,$acc4,$t3 // can't overflow
1173
+ mul $t3,$m3,$mi
1174
+ adds $acc0,$acc0,$t0
1175
+ umulh $t0,$m0,$mi // hi(n[4..7]*a[0]*n0)
1176
+ adcs $acc1,$acc1,$t1
1177
+ umulh $t1,$m1,$mi
1178
+ adcs $acc2,$acc2,$t2
1179
+ umulh $t2,$m2,$mi
1180
+ adcs $acc3,$acc3,$t3
1181
+ adcs $acc4,$acc4,$carry
1182
+ umulh $t3,$m3,$mi
1183
+ adc $carry,xzr,xzr
1184
+ ldr $mi,[sp,$cnt] // next t[0]*n0
1185
+ str $acc0,[$tp],#8 // result!!!
1186
+ adds $acc0,$acc1,$t0
1187
+ sub $t0,$ap_end,$ap // done yet?
1188
+ adcs $acc1,$acc2,$t1
1189
+ adcs $acc2,$acc3,$t2
1190
+ adcs $acc3,$acc4,$t3
1191
+ //adc $carry,$carry,xzr
1192
+ cbnz $cnt,.Loop_mul4x_1st_tail
1193
+
1194
+ sub $t1,$ap_end,$num // rewinded $ap
1195
+ cbz $t0,.Lmul4x_proceed
1196
+
1197
+ ldp $a0,$a1,[$ap,#8*0]
1198
+ ldp $a2,$a3,[$ap,#8*2]
1199
+ add $ap,$ap,#8*4
1200
+ ldp $m0,$m1,[$np,#8*0]
1201
+ ldp $m2,$m3,[$np,#8*2]
1202
+ add $np,$np,#8*4
1203
+ b .Loop_mul4x_1st_tail
1204
+
1205
+ .align 5
1206
+ .Lmul4x_proceed:
1207
+ ldr $bi,[$bp,#8*4]! // *++b
1208
+ adc $topmost,$carry,xzr
1209
+ ldp $a0,$a1,[$t1,#8*0] // a[0..3]
1210
+ sub $np,$np,$num // rewind np
1211
+ ldp $a2,$a3,[$t1,#8*2]
1212
+ add $ap,$t1,#8*4
1213
+
1214
+ stp $acc0,$acc1,[$tp,#8*0] // result!!!
1215
+ ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
1216
+ stp $acc2,$acc3,[$tp,#8*2] // result!!!
1217
+ ldp $acc2,$acc3,[sp,#8*6]
1218
+
1219
+ ldp $m0,$m1,[$np,#8*0] // n[0..3]
1220
+ mov $tp,sp
1221
+ ldp $m2,$m3,[$np,#8*2]
1222
+ adds $np,$np,#8*4 // clear carry bit
1223
+ mov $carry,xzr
1224
+
1225
+ .align 4
1226
+ .Loop_mul4x_reduction:
1227
+ mul $t0,$a0,$bi // lo(a[0..3]*b[4])
1228
+ adc $carry,$carry,xzr // modulo-scheduled
1229
+ mul $t1,$a1,$bi
1230
+ add $cnt,$cnt,#8
1231
+ mul $t2,$a2,$bi
1232
+ and $cnt,$cnt,#31
1233
+ mul $t3,$a3,$bi
1234
+ adds $acc0,$acc0,$t0
1235
+ umulh $t0,$a0,$bi // hi(a[0..3]*b[4])
1236
+ adcs $acc1,$acc1,$t1
1237
+ mul $mi,$acc0,$n0 // t[0]*n0
1238
+ adcs $acc2,$acc2,$t2
1239
+ umulh $t1,$a1,$bi
1240
+ adcs $acc3,$acc3,$t3
1241
+ umulh $t2,$a2,$bi
1242
+ adc $acc4,xzr,xzr
1243
+ umulh $t3,$a3,$bi
1244
+ ldr $bi,[$bp,$cnt] // next b[i]
1245
+ adds $acc1,$acc1,$t0
1246
+ // (*) mul $t0,$m0,$mi
1247
+ str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
1248
+ adcs $acc2,$acc2,$t1
1249
+ mul $t1,$m1,$mi // lo(n[0..3]*t[0]*n0
1250
+ adcs $acc3,$acc3,$t2
1251
+ mul $t2,$m2,$mi
1252
+ adc $acc4,$acc4,$t3 // can't overflow
1253
+ mul $t3,$m3,$mi
1254
+ // (*) adds xzr,$acc0,$t0
1255
+ subs xzr,$acc0,#1 // (*)
1256
+ umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0
1257
+ adcs $acc0,$acc1,$t1
1258
+ umulh $t1,$m1,$mi
1259
+ adcs $acc1,$acc2,$t2
1260
+ umulh $t2,$m2,$mi
1261
+ adcs $acc2,$acc3,$t3
1262
+ umulh $t3,$m3,$mi
1263
+ adcs $acc3,$acc4,$carry
1264
+ adc $carry,xzr,xzr
1265
+ adds $acc0,$acc0,$t0
1266
+ adcs $acc1,$acc1,$t1
1267
+ adcs $acc2,$acc2,$t2
1268
+ adcs $acc3,$acc3,$t3
1269
+ //adc $carry,$carry,xzr
1270
+ cbnz $cnt,.Loop_mul4x_reduction
1271
+
1272
+ adc $carry,$carry,xzr
1273
+ ldp $t0,$t1,[$tp,#8*4] // t[4..7]
1274
+ ldp $t2,$t3,[$tp,#8*6]
1275
+ ldp $a0,$a1,[$ap,#8*0] // a[4..7]
1276
+ ldp $a2,$a3,[$ap,#8*2]
1277
+ add $ap,$ap,#8*4
1278
+ adds $acc0,$acc0,$t0
1279
+ adcs $acc1,$acc1,$t1
1280
+ adcs $acc2,$acc2,$t2
1281
+ adcs $acc3,$acc3,$t3
1282
+ //adc $carry,$carry,xzr
1283
+
1284
+ ldr $mi,[sp] // t[0]*n0
1285
+ ldp $m0,$m1,[$np,#8*0] // n[4..7]
1286
+ ldp $m2,$m3,[$np,#8*2]
1287
+ add $np,$np,#8*4
1288
+
1289
+ .align 4
1290
+ .Loop_mul4x_tail:
1291
+ mul $t0,$a0,$bi // lo(a[4..7]*b[4])
1292
+ adc $carry,$carry,xzr // modulo-scheduled
1293
+ mul $t1,$a1,$bi
1294
+ add $cnt,$cnt,#8
1295
+ mul $t2,$a2,$bi
1296
+ and $cnt,$cnt,#31
1297
+ mul $t3,$a3,$bi
1298
+ adds $acc0,$acc0,$t0
1299
+ umulh $t0,$a0,$bi // hi(a[4..7]*b[4])
1300
+ adcs $acc1,$acc1,$t1
1301
+ umulh $t1,$a1,$bi
1302
+ adcs $acc2,$acc2,$t2
1303
+ umulh $t2,$a2,$bi
1304
+ adcs $acc3,$acc3,$t3
1305
+ umulh $t3,$a3,$bi
1306
+ adc $acc4,xzr,xzr
1307
+ ldr $bi,[$bp,$cnt] // next b[i]
1308
+ adds $acc1,$acc1,$t0
1309
+ mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0)
1310
+ adcs $acc2,$acc2,$t1
1311
+ mul $t1,$m1,$mi
1312
+ adcs $acc3,$acc3,$t2
1313
+ mul $t2,$m2,$mi
1314
+ adc $acc4,$acc4,$t3 // can't overflow
1315
+ mul $t3,$m3,$mi
1316
+ adds $acc0,$acc0,$t0
1317
+ umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0)
1318
+ adcs $acc1,$acc1,$t1
1319
+ umulh $t1,$m1,$mi
1320
+ adcs $acc2,$acc2,$t2
1321
+ umulh $t2,$m2,$mi
1322
+ adcs $acc3,$acc3,$t3
1323
+ umulh $t3,$m3,$mi
1324
+ adcs $acc4,$acc4,$carry
1325
+ ldr $mi,[sp,$cnt] // next a[0]*n0
1326
+ adc $carry,xzr,xzr
1327
+ str $acc0,[$tp],#8 // result!!!
1328
+ adds $acc0,$acc1,$t0
1329
+ sub $t0,$ap_end,$ap // done yet?
1330
+ adcs $acc1,$acc2,$t1
1331
+ adcs $acc2,$acc3,$t2
1332
+ adcs $acc3,$acc4,$t3
1333
+ //adc $carry,$carry,xzr
1334
+ cbnz $cnt,.Loop_mul4x_tail
1335
+
1336
+ sub $t1,$np,$num // rewinded np?
1337
+ adc $carry,$carry,xzr
1338
+ cbz $t0,.Loop_mul4x_break
1339
+
1340
+ ldp $t0,$t1,[$tp,#8*4]
1341
+ ldp $t2,$t3,[$tp,#8*6]
1342
+ ldp $a0,$a1,[$ap,#8*0]
1343
+ ldp $a2,$a3,[$ap,#8*2]
1344
+ add $ap,$ap,#8*4
1345
+ adds $acc0,$acc0,$t0
1346
+ adcs $acc1,$acc1,$t1
1347
+ adcs $acc2,$acc2,$t2
1348
+ adcs $acc3,$acc3,$t3
1349
+ //adc $carry,$carry,xzr
1350
+ ldp $m0,$m1,[$np,#8*0]
1351
+ ldp $m2,$m3,[$np,#8*2]
1352
+ add $np,$np,#8*4
1353
+ b .Loop_mul4x_tail
1354
+
1355
+ .align 4
1356
+ .Loop_mul4x_break:
1357
+ ldp $t2,$t3,[x29,#96] // pull rp and &b[num]
1358
+ adds $acc0,$acc0,$topmost
1359
+ add $bp,$bp,#8*4 // bp++
1360
+ adcs $acc1,$acc1,xzr
1361
+ sub $ap,$ap,$num // rewind ap
1362
+ adcs $acc2,$acc2,xzr
1363
+ stp $acc0,$acc1,[$tp,#8*0] // result!!!
1364
+ adcs $acc3,$acc3,xzr
1365
+ ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
1366
+ adc $topmost,$carry,xzr
1367
+ stp $acc2,$acc3,[$tp,#8*2] // result!!!
1368
+ cmp $bp,$t3 // done yet?
1369
+ ldp $acc2,$acc3,[sp,#8*6]
1370
+ ldp $m0,$m1,[$t1,#8*0] // n[0..3]
1371
+ ldp $m2,$m3,[$t1,#8*2]
1372
+ add $np,$t1,#8*4
1373
+ b.eq .Lmul4x_post
1374
+
1375
+ ldr $bi,[$bp]
1376
+ ldp $a0,$a1,[$ap,#8*0] // a[0..3]
1377
+ ldp $a2,$a3,[$ap,#8*2]
1378
+ adds $ap,$ap,#8*4 // clear carry bit
1379
+ mov $carry,xzr
1380
+ mov $tp,sp
1381
+ b .Loop_mul4x_reduction
1382
+
1383
+ .align 4
1384
+ .Lmul4x_post:
1385
+ // Final step. We see if result is larger than modulus, and
1386
+ // if it is, subtract the modulus. But comparison implies
1387
+ // subtraction. So we subtract modulus, see if it borrowed,
1388
+ // and conditionally copy original value.
1389
+ mov $rp,$t2
1390
+ mov $ap_end,$t2 // $rp copy
1391
+ subs $t0,$acc0,$m0
1392
+ add $tp,sp,#8*8
1393
+ sbcs $t1,$acc1,$m1
1394
+ sub $cnt,$num,#8*4
1395
+
1396
+ .Lmul4x_sub:
1397
+ sbcs $t2,$acc2,$m2
1398
+ ldp $m0,$m1,[$np,#8*0]
1399
+ sub $cnt,$cnt,#8*4
1400
+ ldp $acc0,$acc1,[$tp,#8*0]
1401
+ sbcs $t3,$acc3,$m3
1402
+ ldp $m2,$m3,[$np,#8*2]
1403
+ add $np,$np,#8*4
1404
+ ldp $acc2,$acc3,[$tp,#8*2]
1405
+ add $tp,$tp,#8*4
1406
+ stp $t0,$t1,[$rp,#8*0]
1407
+ sbcs $t0,$acc0,$m0
1408
+ stp $t2,$t3,[$rp,#8*2]
1409
+ add $rp,$rp,#8*4
1410
+ sbcs $t1,$acc1,$m1
1411
+ cbnz $cnt,.Lmul4x_sub
1412
+
1413
+ sbcs $t2,$acc2,$m2
1414
+ mov $tp,sp
1415
+ add $ap,sp,#8*4
1416
+ ldp $a0,$a1,[$ap_end,#8*0]
1417
+ sbcs $t3,$acc3,$m3
1418
+ stp $t0,$t1,[$rp,#8*0]
1419
+ ldp $a2,$a3,[$ap_end,#8*2]
1420
+ stp $t2,$t3,[$rp,#8*2]
1421
+ ldp $acc0,$acc1,[$ap,#8*0]
1422
+ ldp $acc2,$acc3,[$ap,#8*2]
1423
+ sbcs xzr,$topmost,xzr // did it borrow?
1424
+ ldr x30,[x29,#8] // pull return address
1425
+
1426
+ sub $cnt,$num,#8*4
1427
+ .Lmul4x_cond_copy:
1428
+ sub $cnt,$cnt,#8*4
1429
+ csel $t0,$acc0,$a0,lo
1430
+ stp xzr,xzr,[$tp,#8*0]
1431
+ csel $t1,$acc1,$a1,lo
1432
+ ldp $a0,$a1,[$ap_end,#8*4]
1433
+ ldp $acc0,$acc1,[$ap,#8*4]
1434
+ csel $t2,$acc2,$a2,lo
1435
+ stp xzr,xzr,[$tp,#8*2]
1436
+ add $tp,$tp,#8*4
1437
+ csel $t3,$acc3,$a3,lo
1438
+ ldp $a2,$a3,[$ap_end,#8*6]
1439
+ ldp $acc2,$acc3,[$ap,#8*6]
1440
+ add $ap,$ap,#8*4
1441
+ stp $t0,$t1,[$ap_end,#8*0]
1442
+ stp $t2,$t3,[$ap_end,#8*2]
1443
+ add $ap_end,$ap_end,#8*4
1444
+ cbnz $cnt,.Lmul4x_cond_copy
1445
+
1446
+ csel $t0,$acc0,$a0,lo
1447
+ stp xzr,xzr,[$tp,#8*0]
1448
+ csel $t1,$acc1,$a1,lo
1449
+ stp xzr,xzr,[$tp,#8*2]
1450
+ csel $t2,$acc2,$a2,lo
1451
+ stp xzr,xzr,[$tp,#8*3]
1452
+ csel $t3,$acc3,$a3,lo
1453
+ stp xzr,xzr,[$tp,#8*4]
1454
+ stp $t0,$t1,[$ap_end,#8*0]
1455
+ stp $t2,$t3,[$ap_end,#8*2]
1456
+
1457
+ b .Lmul4x_done
1458
+
1459
+ .align 4
1460
+ .Lmul4x4_post_condition:
1461
+ adc $carry,$carry,xzr
1462
+ ldr $ap,[x29,#96] // pull rp
1463
+ // $acc0-3,$carry hold result, $m0-7 hold modulus
1464
+ subs $a0,$acc0,$m0
1465
+ ldr x30,[x29,#8] // pull return address
1466
+ sbcs $a1,$acc1,$m1
1467
+ stp xzr,xzr,[sp,#8*0]
1468
+ sbcs $a2,$acc2,$m2
1469
+ stp xzr,xzr,[sp,#8*2]
1470
+ sbcs $a3,$acc3,$m3
1471
+ stp xzr,xzr,[sp,#8*4]
1472
+ sbcs xzr,$carry,xzr // did it borrow?
1473
+ stp xzr,xzr,[sp,#8*6]
1474
+
1475
+ // $a0-3 hold result-modulus
1476
+ csel $a0,$acc0,$a0,lo
1477
+ csel $a1,$acc1,$a1,lo
1478
+ csel $a2,$acc2,$a2,lo
1479
+ csel $a3,$acc3,$a3,lo
1480
+ stp $a0,$a1,[$ap,#8*0]
1481
+ stp $a2,$a3,[$ap,#8*2]
1482
+
1483
+ .Lmul4x_done:
1484
+ ldp x19,x20,[x29,#16]
1485
+ mov sp,x29
1486
+ ldp x21,x22,[x29,#32]
1487
+ mov x0,#1
1488
+ ldp x23,x24,[x29,#48]
1489
+ ldp x25,x26,[x29,#64]
1490
+ ldp x27,x28,[x29,#80]
1491
+ ldr x29,[sp],#128
1492
+ ret
1493
+ .size __bn_mul4x_mont,.-__bn_mul4x_mont
1494
+ ___
1495
+ }
1496
+ $code.=<<___;
1497
+ .asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
1498
+ .align 4
1499
+ ___
1500
+
1501
+ print $code;
1502
+
1503
+ close STDOUT;