ring-native 0.0.0

Files changed (261)
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/Gemfile +3 -0
  4. data/README.md +22 -0
  5. data/Rakefile +1 -0
  6. data/ext/ring/extconf.rb +29 -0
  7. data/lib/ring/native.rb +8 -0
  8. data/lib/ring/native/version.rb +5 -0
  9. data/ring-native.gemspec +25 -0
  10. data/vendor/ring/BUILDING.md +40 -0
  11. data/vendor/ring/Cargo.toml +43 -0
  12. data/vendor/ring/LICENSE +185 -0
  13. data/vendor/ring/Makefile +35 -0
  14. data/vendor/ring/PORTING.md +163 -0
  15. data/vendor/ring/README.md +113 -0
  16. data/vendor/ring/STYLE.md +197 -0
  17. data/vendor/ring/appveyor.yml +27 -0
  18. data/vendor/ring/build.rs +108 -0
  19. data/vendor/ring/crypto/aes/aes.c +1142 -0
  20. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +25 -0
  21. data/vendor/ring/crypto/aes/aes_test.cc +93 -0
  22. data/vendor/ring/crypto/aes/asm/aes-586.pl +2368 -0
  23. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +1249 -0
  24. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +2246 -0
  25. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +1318 -0
  26. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +2084 -0
  27. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +675 -0
  28. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +1364 -0
  29. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +1565 -0
  30. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +841 -0
  31. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +1116 -0
  32. data/vendor/ring/crypto/aes/internal.h +87 -0
  33. data/vendor/ring/crypto/aes/mode_wrappers.c +61 -0
  34. data/vendor/ring/crypto/bn/add.c +394 -0
  35. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +694 -0
  36. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +1503 -0
  37. data/vendor/ring/crypto/bn/asm/bn-586.pl +774 -0
  38. data/vendor/ring/crypto/bn/asm/co-586.pl +287 -0
  39. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +1882 -0
  40. data/vendor/ring/crypto/bn/asm/x86-mont.pl +592 -0
  41. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +599 -0
  42. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +1393 -0
  43. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +3507 -0
  44. data/vendor/ring/crypto/bn/bn.c +352 -0
  45. data/vendor/ring/crypto/bn/bn_asn1.c +74 -0
  46. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +25 -0
  47. data/vendor/ring/crypto/bn/bn_test.cc +1696 -0
  48. data/vendor/ring/crypto/bn/cmp.c +200 -0
  49. data/vendor/ring/crypto/bn/convert.c +433 -0
  50. data/vendor/ring/crypto/bn/ctx.c +311 -0
  51. data/vendor/ring/crypto/bn/div.c +594 -0
  52. data/vendor/ring/crypto/bn/exponentiation.c +1335 -0
  53. data/vendor/ring/crypto/bn/gcd.c +711 -0
  54. data/vendor/ring/crypto/bn/generic.c +1019 -0
  55. data/vendor/ring/crypto/bn/internal.h +316 -0
  56. data/vendor/ring/crypto/bn/montgomery.c +516 -0
  57. data/vendor/ring/crypto/bn/mul.c +888 -0
  58. data/vendor/ring/crypto/bn/prime.c +829 -0
  59. data/vendor/ring/crypto/bn/random.c +334 -0
  60. data/vendor/ring/crypto/bn/rsaz_exp.c +262 -0
  61. data/vendor/ring/crypto/bn/rsaz_exp.h +53 -0
  62. data/vendor/ring/crypto/bn/shift.c +276 -0
  63. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +25 -0
  64. data/vendor/ring/crypto/bytestring/bytestring_test.cc +421 -0
  65. data/vendor/ring/crypto/bytestring/cbb.c +399 -0
  66. data/vendor/ring/crypto/bytestring/cbs.c +227 -0
  67. data/vendor/ring/crypto/bytestring/internal.h +46 -0
  68. data/vendor/ring/crypto/chacha/chacha_generic.c +140 -0
  69. data/vendor/ring/crypto/chacha/chacha_vec.c +323 -0
  70. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +1447 -0
  71. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +153 -0
  72. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +25 -0
  73. data/vendor/ring/crypto/cipher/e_aes.c +390 -0
  74. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +208 -0
  75. data/vendor/ring/crypto/cipher/internal.h +173 -0
  76. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +543 -0
  77. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +9 -0
  78. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +475 -0
  79. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +23 -0
  80. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +422 -0
  81. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +484 -0
  82. data/vendor/ring/crypto/cipher/test/cipher_test.txt +100 -0
  83. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +25 -0
  84. data/vendor/ring/crypto/constant_time_test.c +304 -0
  85. data/vendor/ring/crypto/cpu-arm-asm.S +32 -0
  86. data/vendor/ring/crypto/cpu-arm.c +199 -0
  87. data/vendor/ring/crypto/cpu-intel.c +261 -0
  88. data/vendor/ring/crypto/crypto.c +151 -0
  89. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +2118 -0
  90. data/vendor/ring/crypto/curve25519/curve25519.c +4888 -0
  91. data/vendor/ring/crypto/curve25519/x25519_test.cc +128 -0
  92. data/vendor/ring/crypto/digest/md32_common.h +181 -0
  93. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +2725 -0
  94. data/vendor/ring/crypto/ec/ec.c +193 -0
  95. data/vendor/ring/crypto/ec/ec_curves.c +61 -0
  96. data/vendor/ring/crypto/ec/ec_key.c +228 -0
  97. data/vendor/ring/crypto/ec/ec_montgomery.c +114 -0
  98. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +25 -0
  99. data/vendor/ring/crypto/ec/internal.h +243 -0
  100. data/vendor/ring/crypto/ec/oct.c +253 -0
  101. data/vendor/ring/crypto/ec/p256-64.c +1794 -0
  102. data/vendor/ring/crypto/ec/p256-x86_64-table.h +9548 -0
  103. data/vendor/ring/crypto/ec/p256-x86_64.c +509 -0
  104. data/vendor/ring/crypto/ec/simple.c +1007 -0
  105. data/vendor/ring/crypto/ec/util-64.c +183 -0
  106. data/vendor/ring/crypto/ec/wnaf.c +508 -0
  107. data/vendor/ring/crypto/ecdh/ecdh.c +155 -0
  108. data/vendor/ring/crypto/ecdsa/ecdsa.c +304 -0
  109. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +193 -0
  110. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +25 -0
  111. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +327 -0
  112. data/vendor/ring/crypto/header_removed.h +17 -0
  113. data/vendor/ring/crypto/internal.h +495 -0
  114. data/vendor/ring/crypto/libring.Windows.vcxproj +101 -0
  115. data/vendor/ring/crypto/mem.c +98 -0
  116. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +1045 -0
  117. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +517 -0
  118. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +1393 -0
  119. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +1741 -0
  120. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +422 -0
  121. data/vendor/ring/crypto/modes/ctr.c +226 -0
  122. data/vendor/ring/crypto/modes/gcm.c +1206 -0
  123. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +25 -0
  124. data/vendor/ring/crypto/modes/gcm_test.c +348 -0
  125. data/vendor/ring/crypto/modes/internal.h +299 -0
  126. data/vendor/ring/crypto/perlasm/arm-xlate.pl +170 -0
  127. data/vendor/ring/crypto/perlasm/readme +100 -0
  128. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +1164 -0
  129. data/vendor/ring/crypto/perlasm/x86asm.pl +292 -0
  130. data/vendor/ring/crypto/perlasm/x86gas.pl +263 -0
  131. data/vendor/ring/crypto/perlasm/x86masm.pl +200 -0
  132. data/vendor/ring/crypto/perlasm/x86nasm.pl +187 -0
  133. data/vendor/ring/crypto/poly1305/poly1305.c +331 -0
  134. data/vendor/ring/crypto/poly1305/poly1305_arm.c +301 -0
  135. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +2015 -0
  136. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +25 -0
  137. data/vendor/ring/crypto/poly1305/poly1305_test.cc +80 -0
  138. data/vendor/ring/crypto/poly1305/poly1305_test.txt +52 -0
  139. data/vendor/ring/crypto/poly1305/poly1305_vec.c +892 -0
  140. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +75 -0
  141. data/vendor/ring/crypto/rand/internal.h +32 -0
  142. data/vendor/ring/crypto/rand/rand.c +189 -0
  143. data/vendor/ring/crypto/rand/urandom.c +219 -0
  144. data/vendor/ring/crypto/rand/windows.c +56 -0
  145. data/vendor/ring/crypto/refcount_c11.c +66 -0
  146. data/vendor/ring/crypto/refcount_lock.c +53 -0
  147. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +25 -0
  148. data/vendor/ring/crypto/refcount_test.c +58 -0
  149. data/vendor/ring/crypto/rsa/blinding.c +462 -0
  150. data/vendor/ring/crypto/rsa/internal.h +108 -0
  151. data/vendor/ring/crypto/rsa/padding.c +300 -0
  152. data/vendor/ring/crypto/rsa/rsa.c +450 -0
  153. data/vendor/ring/crypto/rsa/rsa_asn1.c +261 -0
  154. data/vendor/ring/crypto/rsa/rsa_impl.c +944 -0
  155. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +25 -0
  156. data/vendor/ring/crypto/rsa/rsa_test.cc +437 -0
  157. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +436 -0
  158. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +2390 -0
  159. data/vendor/ring/crypto/sha/asm/sha256-586.pl +1275 -0
  160. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +735 -0
  161. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +14 -0
  162. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +14 -0
  163. data/vendor/ring/crypto/sha/asm/sha512-586.pl +911 -0
  164. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +666 -0
  165. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +14 -0
  166. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +14 -0
  167. data/vendor/ring/crypto/sha/sha1.c +271 -0
  168. data/vendor/ring/crypto/sha/sha256.c +204 -0
  169. data/vendor/ring/crypto/sha/sha512.c +355 -0
  170. data/vendor/ring/crypto/test/file_test.cc +326 -0
  171. data/vendor/ring/crypto/test/file_test.h +181 -0
  172. data/vendor/ring/crypto/test/malloc.cc +150 -0
  173. data/vendor/ring/crypto/test/scoped_types.h +95 -0
  174. data/vendor/ring/crypto/test/test.Windows.vcxproj +35 -0
  175. data/vendor/ring/crypto/test/test_util.cc +46 -0
  176. data/vendor/ring/crypto/test/test_util.h +41 -0
  177. data/vendor/ring/crypto/thread_none.c +55 -0
  178. data/vendor/ring/crypto/thread_pthread.c +165 -0
  179. data/vendor/ring/crypto/thread_test.Windows.vcxproj +25 -0
  180. data/vendor/ring/crypto/thread_test.c +200 -0
  181. data/vendor/ring/crypto/thread_win.c +282 -0
  182. data/vendor/ring/examples/checkdigest.rs +103 -0
  183. data/vendor/ring/include/openssl/aes.h +121 -0
  184. data/vendor/ring/include/openssl/arm_arch.h +129 -0
  185. data/vendor/ring/include/openssl/base.h +156 -0
  186. data/vendor/ring/include/openssl/bn.h +794 -0
  187. data/vendor/ring/include/openssl/buffer.h +18 -0
  188. data/vendor/ring/include/openssl/bytestring.h +235 -0
  189. data/vendor/ring/include/openssl/chacha.h +37 -0
  190. data/vendor/ring/include/openssl/cmac.h +76 -0
  191. data/vendor/ring/include/openssl/cpu.h +184 -0
  192. data/vendor/ring/include/openssl/crypto.h +43 -0
  193. data/vendor/ring/include/openssl/curve25519.h +88 -0
  194. data/vendor/ring/include/openssl/ec.h +225 -0
  195. data/vendor/ring/include/openssl/ec_key.h +129 -0
  196. data/vendor/ring/include/openssl/ecdh.h +110 -0
  197. data/vendor/ring/include/openssl/ecdsa.h +156 -0
  198. data/vendor/ring/include/openssl/err.h +201 -0
  199. data/vendor/ring/include/openssl/mem.h +101 -0
  200. data/vendor/ring/include/openssl/obj_mac.h +71 -0
  201. data/vendor/ring/include/openssl/opensslfeatures.h +68 -0
  202. data/vendor/ring/include/openssl/opensslv.h +18 -0
  203. data/vendor/ring/include/openssl/ossl_typ.h +18 -0
  204. data/vendor/ring/include/openssl/poly1305.h +51 -0
  205. data/vendor/ring/include/openssl/rand.h +70 -0
  206. data/vendor/ring/include/openssl/rsa.h +399 -0
  207. data/vendor/ring/include/openssl/thread.h +133 -0
  208. data/vendor/ring/include/openssl/type_check.h +71 -0
  209. data/vendor/ring/mk/Common.props +63 -0
  210. data/vendor/ring/mk/Windows.props +42 -0
  211. data/vendor/ring/mk/WindowsTest.props +18 -0
  212. data/vendor/ring/mk/appveyor.bat +62 -0
  213. data/vendor/ring/mk/bottom_of_makefile.mk +54 -0
  214. data/vendor/ring/mk/ring.mk +266 -0
  215. data/vendor/ring/mk/top_of_makefile.mk +214 -0
  216. data/vendor/ring/mk/travis.sh +40 -0
  217. data/vendor/ring/mk/update-travis-yml.py +229 -0
  218. data/vendor/ring/ring.sln +153 -0
  219. data/vendor/ring/src/aead.rs +682 -0
  220. data/vendor/ring/src/agreement.rs +248 -0
  221. data/vendor/ring/src/c.rs +129 -0
  222. data/vendor/ring/src/constant_time.rs +37 -0
  223. data/vendor/ring/src/der.rs +96 -0
  224. data/vendor/ring/src/digest.rs +690 -0
  225. data/vendor/ring/src/digest_tests.txt +57 -0
  226. data/vendor/ring/src/ecc.rs +28 -0
  227. data/vendor/ring/src/ecc_build.rs +279 -0
  228. data/vendor/ring/src/ecc_curves.rs +117 -0
  229. data/vendor/ring/src/ed25519_tests.txt +2579 -0
  230. data/vendor/ring/src/exe_tests.rs +46 -0
  231. data/vendor/ring/src/ffi.rs +29 -0
  232. data/vendor/ring/src/file_test.rs +187 -0
  233. data/vendor/ring/src/hkdf.rs +153 -0
  234. data/vendor/ring/src/hkdf_tests.txt +59 -0
  235. data/vendor/ring/src/hmac.rs +414 -0
  236. data/vendor/ring/src/hmac_tests.txt +97 -0
  237. data/vendor/ring/src/input.rs +312 -0
  238. data/vendor/ring/src/lib.rs +41 -0
  239. data/vendor/ring/src/pbkdf2.rs +265 -0
  240. data/vendor/ring/src/pbkdf2_tests.txt +113 -0
  241. data/vendor/ring/src/polyfill.rs +57 -0
  242. data/vendor/ring/src/rand.rs +28 -0
  243. data/vendor/ring/src/signature.rs +314 -0
  244. data/vendor/ring/third-party/NIST/README.md +9 -0
  245. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +263 -0
  246. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +309 -0
  247. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +267 -0
  248. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +263 -0
  249. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +309 -0
  250. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +267 -0
  251. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +263 -0
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +309 -0
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +267 -0
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +519 -0
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +309 -0
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +523 -0
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +519 -0
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +309 -0
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +523 -0
  260. data/vendor/ring/third-party/NIST/sha256sums.txt +1 -0
  261. metadata +333 -0
data/vendor/ring/crypto/bn/asm/armv8-mont.pl (new file)
@@ -0,0 +1,1503 @@
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# March 2015
#
# "Teaser" Montgomery multiplication module for ARMv8. Needs more
# work. While it does improve RSA sign performance by 20-30% (less for
# longer keys) on most processors, for some reason RSA2048 is not
# faster and RSA4096 goes 15-20% slower on Cortex-A57. The
# multiplication instruction issue rate is limited on the processor in
# question, meaning that a dedicated squaring procedure is a must.
# Well, actually all contemporary AArch64 processors seem to have a
# limited multiplication issue rate, i.e. they can't issue a
# multiplication every cycle, which explains the moderate improvement
# coefficients in comparison to compiler-generated code. Recall that
# the compiler is instructed to use umulh and therefore uses the same
# number of multiplication instructions to do the job. Assembly's edge
# is minimizing the number of "collateral" instructions and, of
# course, instruction scheduling.
#
# April 2015
#
# A squaring procedure that handles lengths divisible by 8 improves
# RSA/DSA performance by 25-40-60%, depending on processor and key
# length. Overall improvement coefficients are always positive in
# comparison to compiler-generated code. On Cortex-A57 the improvement
# is still modest for the longest key lengths, while other processors
# exhibit e.g. 50-70% improvement for RSA4096 sign. RSA2048 sign is
# ~25% faster on Cortex-A57 and ~60-100% faster on others.

$flavour = shift;
$output = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
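
# Editorial note (not part of the original file): perlasm scripts such
# as this one are driven through the arm-xlate.pl translator located
# above. A usage sketch, with illustrative flavour/output names:
#
#   perl armv8-mont.pl linux64 armv8-mont.S
#
# $flavour selects the assembler dialect understood by arm-xlate.pl and
# $output names the generated assembly file written via the pipe above.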
46
+
47
+ ($lo0,$hi0,$aj,$m0,$alo,$ahi,
48
+ $lo1,$hi1,$nj,$m1,$nlo,$nhi,
49
+ $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);
50
+
51
+ # int bn_mul_mont(
52
+ $rp="x0"; # BN_ULONG *rp,
53
+ $ap="x1"; # const BN_ULONG *ap,
54
+ $bp="x2"; # const BN_ULONG *bp,
55
+ $np="x3"; # const BN_ULONG *np,
56
+ $n0="x4"; # const BN_ULONG *n0,
57
+ $num="x5"; # int num);
58
+
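# Editorial note, a sketch of the underlying math (standard Montgomery
# multiplication, stated here for orientation; it is not in the
# original header): with R = 2^(64*num) and *n0 = -np^(-1) mod 2^64,
# the routine returns
#
#   rp = ap * bp * R^(-1) mod np.
#
# Each outer iteration adds ap*bp[i] to the running total t, then adds
# m1*np with m1 = t[0]*n0 mod 2^64; that forces the lowest limb of t to
# zero, so t can be shifted down one limb without losing information.
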
$code.=<<___;
.text

.globl bn_mul_mont
.type bn_mul_mont,%function
.align 5
bn_mul_mont:
tst $num,#7
b.eq __bn_sqr8x_mont
tst $num,#3
b.eq __bn_mul4x_mont
.Lmul_mont:
stp x29,x30,[sp,#-64]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]

ldr $m0,[$bp],#8 // bp[0]
sub $tp,sp,$num,lsl#3
ldp $hi0,$aj,[$ap],#16 // ap[0..1]
lsl $num,$num,#3
ldr $n0,[$n0] // *n0
and $tp,$tp,#-16 // ABI says so
ldp $hi1,$nj,[$np],#16 // np[0..1]

mul $lo0,$hi0,$m0 // ap[0]*bp[0]
sub $j,$num,#16 // j=num-2
umulh $hi0,$hi0,$m0
mul $alo,$aj,$m0 // ap[1]*bp[0]
umulh $ahi,$aj,$m0

mul $m1,$lo0,$n0 // "tp[0]"*n0
mov sp,$tp // alloca

// (*) mul $lo1,$hi1,$m1 // np[0]*m1
umulh $hi1,$hi1,$m1
mul $nlo,$nj,$m1 // np[1]*m1
// (*) adds $lo1,$lo1,$lo0 // discarded
// (*) As for the removal of the first multiplication and addition
// instructions: the outcome of the first addition is
// guaranteed to be zero, which leaves two computationally
// significant outcomes: it either carries or it doesn't. The
// question is: when does it carry? Is there an alternative
// way to deduce it? If you follow the operations, you can
// observe that the condition for a carry is quite simple:
// $lo0 being non-zero. So the carry can be calculated
// by adding -1 to $lo0. That's what the next instruction does.
subs xzr,$lo0,#1 // (*)
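// Editorial note, a worked example of the trick above (not in the
// original): $lo0-1 equals $lo0+0xffff...ffff, which carries out of
// 64 bits exactly when $lo0 is non-zero. `subs xzr,...` discards the
// difference and keeps only that carry, which the `adc` below folds
// into $hi1.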
umulh $nhi,$nj,$m1
adc $hi1,$hi1,xzr
cbz $j,.L1st_skip

.L1st:
ldr $aj,[$ap],#8
adds $lo0,$alo,$hi0
sub $j,$j,#8 // j--
adc $hi0,$ahi,xzr

ldr $nj,[$np],#8
adds $lo1,$nlo,$hi1
mul $alo,$aj,$m0 // ap[j]*bp[0]
adc $hi1,$nhi,xzr
umulh $ahi,$aj,$m0

adds $lo1,$lo1,$lo0
mul $nlo,$nj,$m1 // np[j]*m1
adc $hi1,$hi1,xzr
umulh $nhi,$nj,$m1
str $lo1,[$tp],#8 // tp[j-1]
cbnz $j,.L1st

.L1st_skip:
adds $lo0,$alo,$hi0
sub $ap,$ap,$num // rewind $ap
adc $hi0,$ahi,xzr

adds $lo1,$nlo,$hi1
sub $np,$np,$num // rewind $np
adc $hi1,$nhi,xzr

adds $lo1,$lo1,$lo0
sub $i,$num,#8 // i=num-1
adcs $hi1,$hi1,$hi0

adc $ovf,xzr,xzr // upmost overflow bit
stp $lo1,$hi1,[$tp]

.Louter:
ldr $m0,[$bp],#8 // bp[i]
ldp $hi0,$aj,[$ap],#16
ldr $tj,[sp] // tp[0]
add $tp,sp,#8

mul $lo0,$hi0,$m0 // ap[0]*bp[i]
sub $j,$num,#16 // j=num-2
umulh $hi0,$hi0,$m0
ldp $hi1,$nj,[$np],#16
mul $alo,$aj,$m0 // ap[1]*bp[i]
adds $lo0,$lo0,$tj
umulh $ahi,$aj,$m0
adc $hi0,$hi0,xzr

mul $m1,$lo0,$n0
sub $i,$i,#8 // i--

// (*) mul $lo1,$hi1,$m1 // np[0]*m1
umulh $hi1,$hi1,$m1
mul $nlo,$nj,$m1 // np[1]*m1
// (*) adds $lo1,$lo1,$lo0
subs xzr,$lo0,#1 // (*)
umulh $nhi,$nj,$m1
cbz $j,.Linner_skip

.Linner:
ldr $aj,[$ap],#8
adc $hi1,$hi1,xzr
ldr $tj,[$tp],#8 // tp[j]
adds $lo0,$alo,$hi0
sub $j,$j,#8 // j--
adc $hi0,$ahi,xzr

adds $lo1,$nlo,$hi1
ldr $nj,[$np],#8
adc $hi1,$nhi,xzr

mul $alo,$aj,$m0 // ap[j]*bp[i]
adds $lo0,$lo0,$tj
umulh $ahi,$aj,$m0
adc $hi0,$hi0,xzr

mul $nlo,$nj,$m1 // np[j]*m1
adds $lo1,$lo1,$lo0
umulh $nhi,$nj,$m1
str $lo1,[$tp,#-16] // tp[j-1]
cbnz $j,.Linner

.Linner_skip:
ldr $tj,[$tp],#8 // tp[j]
adc $hi1,$hi1,xzr
adds $lo0,$alo,$hi0
sub $ap,$ap,$num // rewind $ap
adc $hi0,$ahi,xzr

adds $lo1,$nlo,$hi1
sub $np,$np,$num // rewind $np
adcs $hi1,$nhi,$ovf
adc $ovf,xzr,xzr

adds $lo0,$lo0,$tj
adc $hi0,$hi0,xzr

adds $lo1,$lo1,$lo0
adcs $hi1,$hi1,$hi0
adc $ovf,$ovf,xzr // upmost overflow bit
stp $lo1,$hi1,[$tp,#-16]

cbnz $i,.Louter

// Final step. We see if the result is larger than the modulus, and
// if it is, subtract the modulus. But comparison implies
// subtraction. So we subtract the modulus, see if it borrowed,
// and conditionally copy the original value.
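// Editorial note (a sketch of the reasoning, not from the original):
// the result t here satisfies 0 <= t < 2*N, so at most one subtraction
// of the modulus N is needed. Subtracting unconditionally and then
// selecting with csel, instead of branching, keeps the control flow
// independent of the secret operands.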
ldr $tj,[sp] // tp[0]
add $tp,sp,#8
ldr $nj,[$np],#8 // np[0]
subs $j,$num,#8 // j=num-1 and clear borrow
mov $ap,$rp
.Lsub:
sbcs $aj,$tj,$nj // tp[j]-np[j]
ldr $tj,[$tp],#8
sub $j,$j,#8 // j--
ldr $nj,[$np],#8
str $aj,[$ap],#8 // rp[j]=tp[j]-np[j]
cbnz $j,.Lsub

sbcs $aj,$tj,$nj
sbcs $ovf,$ovf,xzr // did it borrow?
str $aj,[$ap],#8 // rp[num-1]

ldr $tj,[sp] // tp[0]
add $tp,sp,#8
ldr $aj,[$rp],#8 // rp[0]
sub $num,$num,#8 // num--
nop
.Lcond_copy:
sub $num,$num,#8 // num--
csel $nj,$tj,$aj,lo // did it borrow?
ldr $tj,[$tp],#8
ldr $aj,[$rp],#8
str xzr,[$tp,#-16] // wipe tp
str $nj,[$rp,#-16]
cbnz $num,.Lcond_copy

csel $nj,$tj,$aj,lo
str xzr,[$tp,#-8] // wipe tp
str $nj,[$rp,#-8]

ldp x19,x20,[x29,#16]
mov sp,x29
ldp x21,x22,[x29,#32]
mov x0,#1
ldp x23,x24,[x29,#48]
ldr x29,[sp],#64
ret
.size bn_mul_mont,.-bn_mul_mont
___
{
########################################################################
# The following is an ARMv8 adaptation of sqrx8x_mont from the
# x86_64-mont5 module.

my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
my ($cnt,$carry,$topmost)=("x27","x28","x30");
my ($tp,$ap_end,$na0)=($bp,$np,$carry);

$code.=<<___;
.type __bn_sqr8x_mont,%function
.align 5
__bn_sqr8x_mont:
cmp $ap,$bp
b.ne __bn_mul4x_mont
.Lsqr8x_mont:
stp x29,x30,[sp,#-128]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
stp $rp,$np,[sp,#96] // offload rp and np

ldp $a0,$a1,[$ap,#8*0]
ldp $a2,$a3,[$ap,#8*2]
ldp $a4,$a5,[$ap,#8*4]
ldp $a6,$a7,[$ap,#8*6]

sub $tp,sp,$num,lsl#4
lsl $num,$num,#3
ldr $n0,[$n0] // *n0
mov sp,$tp // alloca
sub $cnt,$num,#8*8
b .Lsqr8x_zero_start

.Lsqr8x_zero:
sub $cnt,$cnt,#8*8
stp xzr,xzr,[$tp,#8*0]
stp xzr,xzr,[$tp,#8*2]
stp xzr,xzr,[$tp,#8*4]
stp xzr,xzr,[$tp,#8*6]
.Lsqr8x_zero_start:
stp xzr,xzr,[$tp,#8*8]
stp xzr,xzr,[$tp,#8*10]
stp xzr,xzr,[$tp,#8*12]
stp xzr,xzr,[$tp,#8*14]
add $tp,$tp,#8*16
cbnz $cnt,.Lsqr8x_zero

add $ap_end,$ap,$num
add $ap,$ap,#8*8
mov $acc0,xzr
mov $acc1,xzr
mov $acc2,xzr
mov $acc3,xzr
mov $acc4,xzr
mov $acc5,xzr
mov $acc6,xzr
mov $acc7,xzr
mov $tp,sp
str $n0,[x29,#112] // offload n0

// Multiply everything but a[i]*a[i]
.align 4
.Lsqr8x_outer_loop:
// a[1]a[0] (i)
// a[2]a[0]
// a[3]a[0]
// a[4]a[0]
// a[5]a[0]
// a[6]a[0]
// a[7]a[0]
// a[2]a[1] (ii)
// a[3]a[1]
// a[4]a[1]
// a[5]a[1]
// a[6]a[1]
// a[7]a[1]
// a[3]a[2] (iii)
// a[4]a[2]
// a[5]a[2]
// a[6]a[2]
// a[7]a[2]
// a[4]a[3] (iv)
// a[5]a[3]
// a[6]a[3]
// a[7]a[3]
// a[5]a[4] (v)
// a[6]a[4]
// a[7]a[4]
// a[6]a[5] (vi)
// a[7]a[5]
// a[7]a[6] (vii)

mul $t0,$a1,$a0 // lo(a[1..7]*a[0]) (i)
mul $t1,$a2,$a0
mul $t2,$a3,$a0
mul $t3,$a4,$a0
adds $acc1,$acc1,$t0 // t[1]+lo(a[1]*a[0])
mul $t0,$a5,$a0
adcs $acc2,$acc2,$t1
mul $t1,$a6,$a0
adcs $acc3,$acc3,$t2
mul $t2,$a7,$a0
adcs $acc4,$acc4,$t3
umulh $t3,$a1,$a0 // hi(a[1..7]*a[0])
adcs $acc5,$acc5,$t0
umulh $t0,$a2,$a0
adcs $acc6,$acc6,$t1
umulh $t1,$a3,$a0
adcs $acc7,$acc7,$t2
umulh $t2,$a4,$a0
stp $acc0,$acc1,[$tp],#8*2 // t[0..1]
adc $acc0,xzr,xzr // t[8]
adds $acc2,$acc2,$t3 // t[2]+hi(a[1]*a[0])
umulh $t3,$a5,$a0
adcs $acc3,$acc3,$t0
umulh $t0,$a6,$a0
adcs $acc4,$acc4,$t1
umulh $t1,$a7,$a0
adcs $acc5,$acc5,$t2
mul $t2,$a2,$a1 // lo(a[2..7]*a[1]) (ii)
adcs $acc6,$acc6,$t3
mul $t3,$a3,$a1
adcs $acc7,$acc7,$t0
mul $t0,$a4,$a1
adc $acc0,$acc0,$t1

mul $t1,$a5,$a1
adds $acc3,$acc3,$t2
mul $t2,$a6,$a1
adcs $acc4,$acc4,$t3
mul $t3,$a7,$a1
adcs $acc5,$acc5,$t0
umulh $t0,$a2,$a1 // hi(a[2..7]*a[1])
adcs $acc6,$acc6,$t1
umulh $t1,$a3,$a1
adcs $acc7,$acc7,$t2
umulh $t2,$a4,$a1
adcs $acc0,$acc0,$t3
umulh $t3,$a5,$a1
stp $acc2,$acc3,[$tp],#8*2 // t[2..3]
adc $acc1,xzr,xzr // t[9]
adds $acc4,$acc4,$t0
umulh $t0,$a6,$a1
adcs $acc5,$acc5,$t1
umulh $t1,$a7,$a1
adcs $acc6,$acc6,$t2
mul $t2,$a3,$a2 // lo(a[3..7]*a[2]) (iii)
adcs $acc7,$acc7,$t3
mul $t3,$a4,$a2
adcs $acc0,$acc0,$t0
mul $t0,$a5,$a2
adc $acc1,$acc1,$t1

mul $t1,$a6,$a2
adds $acc5,$acc5,$t2
mul $t2,$a7,$a2
adcs $acc6,$acc6,$t3
umulh $t3,$a3,$a2 // hi(a[3..7]*a[2])
adcs $acc7,$acc7,$t0
umulh $t0,$a4,$a2
adcs $acc0,$acc0,$t1
umulh $t1,$a5,$a2
adcs $acc1,$acc1,$t2
umulh $t2,$a6,$a2
stp $acc4,$acc5,[$tp],#8*2 // t[4..5]
adc $acc2,xzr,xzr // t[10]
adds $acc6,$acc6,$t3
umulh $t3,$a7,$a2
adcs $acc7,$acc7,$t0
mul $t0,$a4,$a3 // lo(a[4..7]*a[3]) (iv)
adcs $acc0,$acc0,$t1
mul $t1,$a5,$a3
adcs $acc1,$acc1,$t2
mul $t2,$a6,$a3
adc $acc2,$acc2,$t3

mul $t3,$a7,$a3
adds $acc7,$acc7,$t0
umulh $t0,$a4,$a3 // hi(a[4..7]*a[3])
adcs $acc0,$acc0,$t1
umulh $t1,$a5,$a3
adcs $acc1,$acc1,$t2
umulh $t2,$a6,$a3
adcs $acc2,$acc2,$t3
umulh $t3,$a7,$a3
stp $acc6,$acc7,[$tp],#8*2 // t[6..7]
adc $acc3,xzr,xzr // t[11]
adds $acc0,$acc0,$t0
mul $t0,$a5,$a4 // lo(a[5..7]*a[4]) (v)
adcs $acc1,$acc1,$t1
mul $t1,$a6,$a4
adcs $acc2,$acc2,$t2
mul $t2,$a7,$a4
adc $acc3,$acc3,$t3

umulh $t3,$a5,$a4 // hi(a[5..7]*a[4])
adds $acc1,$acc1,$t0
umulh $t0,$a6,$a4
adcs $acc2,$acc2,$t1
umulh $t1,$a7,$a4
adcs $acc3,$acc3,$t2
mul $t2,$a6,$a5 // lo(a[6..7]*a[5]) (vi)
adc $acc4,xzr,xzr // t[12]
adds $acc2,$acc2,$t3
mul $t3,$a7,$a5
adcs $acc3,$acc3,$t0
umulh $t0,$a6,$a5 // hi(a[6..7]*a[5])
adc $acc4,$acc4,$t1

umulh $t1,$a7,$a5
adds $acc3,$acc3,$t2
mul $t2,$a7,$a6 // lo(a[7]*a[6]) (vii)
adcs $acc4,$acc4,$t3
umulh $t3,$a7,$a6 // hi(a[7]*a[6])
adc $acc5,xzr,xzr // t[13]
adds $acc4,$acc4,$t0
sub $cnt,$ap_end,$ap // done yet?
adc $acc5,$acc5,$t1

adds $acc5,$acc5,$t2
sub $t0,$ap_end,$num // rewound ap
adc $acc6,xzr,xzr // t[14]
add $acc6,$acc6,$t3

cbz $cnt,.Lsqr8x_outer_break

mov $n0,$a0
ldp $a0,$a1,[$tp,#8*0]
ldp $a2,$a3,[$tp,#8*2]
ldp $a4,$a5,[$tp,#8*4]
ldp $a6,$a7,[$tp,#8*6]
adds $acc0,$acc0,$a0
adcs $acc1,$acc1,$a1
ldp $a0,$a1,[$ap,#8*0]
adcs $acc2,$acc2,$a2
adcs $acc3,$acc3,$a3
ldp $a2,$a3,[$ap,#8*2]
adcs $acc4,$acc4,$a4
adcs $acc5,$acc5,$a5
ldp $a4,$a5,[$ap,#8*4]
adcs $acc6,$acc6,$a6
mov $rp,$ap
adcs $acc7,xzr,$a7
ldp $a6,$a7,[$ap,#8*6]
add $ap,$ap,#8*8
//adc $carry,xzr,xzr // moved below
mov $cnt,#-8*8

// a[8]a[0]
// a[9]a[0]
// a[a]a[0]
// a[b]a[0]
// a[c]a[0]
// a[d]a[0]
// a[e]a[0]
// a[f]a[0]
// a[8]a[1]
// a[f]a[1]........................
// a[8]a[2]
// a[f]a[2]........................
// a[8]a[3]
// a[f]a[3]........................
// a[8]a[4]
// a[f]a[4]........................
// a[8]a[5]
// a[f]a[5]........................
// a[8]a[6]
// a[f]a[6]........................
// a[8]a[7]
// a[f]a[7]........................
.Lsqr8x_mul:
mul $t0,$a0,$n0
adc $carry,xzr,xzr // carry bit, modulo-scheduled
mul $t1,$a1,$n0
add $cnt,$cnt,#8
mul $t2,$a2,$n0
mul $t3,$a3,$n0
adds $acc0,$acc0,$t0
mul $t0,$a4,$n0
adcs $acc1,$acc1,$t1
mul $t1,$a5,$n0
adcs $acc2,$acc2,$t2
mul $t2,$a6,$n0
adcs $acc3,$acc3,$t3
mul $t3,$a7,$n0
adcs $acc4,$acc4,$t0
umulh $t0,$a0,$n0
adcs $acc5,$acc5,$t1
umulh $t1,$a1,$n0
adcs $acc6,$acc6,$t2
umulh $t2,$a2,$n0
adcs $acc7,$acc7,$t3
umulh $t3,$a3,$n0
adc $carry,$carry,xzr
str $acc0,[$tp],#8
adds $acc0,$acc1,$t0
umulh $t0,$a4,$n0
adcs $acc1,$acc2,$t1
umulh $t1,$a5,$n0
adcs $acc2,$acc3,$t2
umulh $t2,$a6,$n0
adcs $acc3,$acc4,$t3
umulh $t3,$a7,$n0
ldr $n0,[$rp,$cnt]
adcs $acc4,$acc5,$t0
adcs $acc5,$acc6,$t1
adcs $acc6,$acc7,$t2
adcs $acc7,$carry,$t3
//adc $carry,xzr,xzr // moved above
cbnz $cnt,.Lsqr8x_mul
// note that carry flag is guaranteed
// to be zero at this point
cmp $ap,$ap_end // done yet?
b.eq .Lsqr8x_break

ldp $a0,$a1,[$tp,#8*0]
ldp $a2,$a3,[$tp,#8*2]
ldp $a4,$a5,[$tp,#8*4]
ldp $a6,$a7,[$tp,#8*6]
adds $acc0,$acc0,$a0
ldr $n0,[$rp,#-8*8]
adcs $acc1,$acc1,$a1
ldp $a0,$a1,[$ap,#8*0]
adcs $acc2,$acc2,$a2
adcs $acc3,$acc3,$a3
ldp $a2,$a3,[$ap,#8*2]
adcs $acc4,$acc4,$a4
adcs $acc5,$acc5,$a5
ldp $a4,$a5,[$ap,#8*4]
adcs $acc6,$acc6,$a6
mov $cnt,#-8*8
adcs $acc7,$acc7,$a7
ldp $a6,$a7,[$ap,#8*6]
add $ap,$ap,#8*8
//adc $carry,xzr,xzr // moved above
b .Lsqr8x_mul

.align 4
.Lsqr8x_break:
ldp $a0,$a1,[$rp,#8*0]
add $ap,$rp,#8*8
ldp $a2,$a3,[$rp,#8*2]
sub $t0,$ap_end,$ap // is it last iteration?
ldp $a4,$a5,[$rp,#8*4]
sub $t1,$tp,$t0
ldp $a6,$a7,[$rp,#8*6]
cbz $t0,.Lsqr8x_outer_loop

stp $acc0,$acc1,[$tp,#8*0]
ldp $acc0,$acc1,[$t1,#8*0]
stp $acc2,$acc3,[$tp,#8*2]
ldp $acc2,$acc3,[$t1,#8*2]
stp $acc4,$acc5,[$tp,#8*4]
ldp $acc4,$acc5,[$t1,#8*4]
stp $acc6,$acc7,[$tp,#8*6]
mov $tp,$t1
ldp $acc6,$acc7,[$t1,#8*6]
b .Lsqr8x_outer_loop

.align 4
.Lsqr8x_outer_break:
// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
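// Editorial note, the identity behind this phase (standard squaring
// optimization, spelled out for clarity):
//   a^2 = 2*sum_{i<j} a[i]*a[j]*2^(64*(i+j)) + sum_i a[i]^2*2^(128*i)
// The cross products were accumulated above; the shift-and-add loop
// below doubles them by shifting one bit across limbs (the extr
// instructions) and adds in the diagonal terms a[i]*a[i].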
ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0]
ldp $t1,$t2,[sp,#8*1]
ldp $a5,$a7,[$t0,#8*2]
add $ap,$t0,#8*4
ldp $t3,$t0,[sp,#8*3]

stp $acc0,$acc1,[$tp,#8*0]
mul $acc0,$a1,$a1
stp $acc2,$acc3,[$tp,#8*2]
umulh $a1,$a1,$a1
stp $acc4,$acc5,[$tp,#8*4]
mul $a2,$a3,$a3
stp $acc6,$acc7,[$tp,#8*6]
mov $tp,sp
umulh $a3,$a3,$a3
adds $acc1,$a1,$t1,lsl#1
extr $t1,$t2,$t1,#63
sub $cnt,$num,#8*4

.Lsqr4x_shift_n_add:
adcs $acc2,$a2,$t1
extr $t2,$t3,$t2,#63
sub $cnt,$cnt,#8*4
adcs $acc3,$a3,$t2
ldp $t1,$t2,[$tp,#8*5]
mul $a4,$a5,$a5
ldp $a1,$a3,[$ap],#8*2
umulh $a5,$a5,$a5
mul $a6,$a7,$a7
umulh $a7,$a7,$a7
extr $t3,$t0,$t3,#63
stp $acc0,$acc1,[$tp,#8*0]
adcs $acc4,$a4,$t3
extr $t0,$t1,$t0,#63
stp $acc2,$acc3,[$tp,#8*2]
adcs $acc5,$a5,$t0
ldp $t3,$t0,[$tp,#8*7]
extr $t1,$t2,$t1,#63
adcs $acc6,$a6,$t1
extr $t2,$t3,$t2,#63
adcs $acc7,$a7,$t2
ldp $t1,$t2,[$tp,#8*9]
mul $a0,$a1,$a1
ldp $a5,$a7,[$ap],#8*2
umulh $a1,$a1,$a1
mul $a2,$a3,$a3
umulh $a3,$a3,$a3
stp $acc4,$acc5,[$tp,#8*4]
extr $t3,$t0,$t3,#63
stp $acc6,$acc7,[$tp,#8*6]
add $tp,$tp,#8*8
adcs $acc0,$a0,$t3
extr $t0,$t1,$t0,#63
adcs $acc1,$a1,$t0
ldp $t3,$t0,[$tp,#8*3]
extr $t1,$t2,$t1,#63
cbnz $cnt,.Lsqr4x_shift_n_add
___
my ($np,$np_end)=($ap,$ap_end);
$code.=<<___;
ldp $np,$n0,[x29,#104] // pull np and n0

adcs $acc2,$a2,$t1
extr $t2,$t3,$t2,#63
adcs $acc3,$a3,$t2
ldp $t1,$t2,[$tp,#8*5]
mul $a4,$a5,$a5
umulh $a5,$a5,$a5
stp $acc0,$acc1,[$tp,#8*0]
mul $a6,$a7,$a7
umulh $a7,$a7,$a7
stp $acc2,$acc3,[$tp,#8*2]
extr $t3,$t0,$t3,#63
adcs $acc4,$a4,$t3
extr $t0,$t1,$t0,#63
ldp $acc0,$acc1,[sp,#8*0]
adcs $acc5,$a5,$t0
extr $t1,$t2,$t1,#63
ldp $a0,$a1,[$np,#8*0]
adcs $acc6,$a6,$t1
extr $t2,xzr,$t2,#63
ldp $a2,$a3,[$np,#8*2]
adc $acc7,$a7,$t2
ldp $a4,$a5,[$np,#8*4]

// Reduce by 512 bits per iteration
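// Editorial note, a sketch of the loop below (not from the original):
// each pass of .Lsqr8x_reduction computes na0 = t[0]*n0 mod 2^64 and
// adds na0*n[0..7] to the window, which zeroes the lowest limb; eight
// passes retire eight limbs, i.e. 512 bits, after which the window
// slides up.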
mul $na0,$n0,$acc0 // t[0]*n0
ldp $a6,$a7,[$np,#8*6]
add $np_end,$np,$num
ldp $acc2,$acc3,[sp,#8*2]
stp $acc4,$acc5,[$tp,#8*4]
ldp $acc4,$acc5,[sp,#8*4]
stp $acc6,$acc7,[$tp,#8*6]
ldp $acc6,$acc7,[sp,#8*6]
add $np,$np,#8*8
mov $topmost,xzr // initial top-most carry
mov $tp,sp
mov $cnt,#8

.Lsqr8x_reduction:
// (*) mul $t0,$a0,$na0 // lo(n[0-7])*lo(t[0]*n0)
mul $t1,$a1,$na0
sub $cnt,$cnt,#1
mul $t2,$a2,$na0
str $na0,[$tp],#8 // put aside t[0]*n0 for tail processing
mul $t3,$a3,$na0
// (*) adds xzr,$acc0,$t0
subs xzr,$acc0,#1 // (*)
mul $t0,$a4,$na0
adcs $acc0,$acc1,$t1
mul $t1,$a5,$na0
adcs $acc1,$acc2,$t2
mul $t2,$a6,$na0
adcs $acc2,$acc3,$t3
mul $t3,$a7,$na0
adcs $acc3,$acc4,$t0
umulh $t0,$a0,$na0 // hi(n[0-7])*lo(t[0]*n0)
adcs $acc4,$acc5,$t1
umulh $t1,$a1,$na0
adcs $acc5,$acc6,$t2
umulh $t2,$a2,$na0
adcs $acc6,$acc7,$t3
umulh $t3,$a3,$na0
adc $acc7,xzr,xzr
adds $acc0,$acc0,$t0
umulh $t0,$a4,$na0
adcs $acc1,$acc1,$t1
umulh $t1,$a5,$na0
adcs $acc2,$acc2,$t2
umulh $t2,$a6,$na0
adcs $acc3,$acc3,$t3
umulh $t3,$a7,$na0
mul $na0,$n0,$acc0 // next t[0]*n0
adcs $acc4,$acc4,$t0
adcs $acc5,$acc5,$t1
adcs $acc6,$acc6,$t2
adc $acc7,$acc7,$t3
cbnz $cnt,.Lsqr8x_reduction

ldp $t0,$t1,[$tp,#8*0]
ldp $t2,$t3,[$tp,#8*2]
mov $rp,$tp
sub $cnt,$np_end,$np // done yet?
adds $acc0,$acc0,$t0
adcs $acc1,$acc1,$t1
ldp $t0,$t1,[$tp,#8*4]
adcs $acc2,$acc2,$t2
adcs $acc3,$acc3,$t3
ldp $t2,$t3,[$tp,#8*6]
adcs $acc4,$acc4,$t0
adcs $acc5,$acc5,$t1
adcs $acc6,$acc6,$t2
adcs $acc7,$acc7,$t3
//adc $carry,xzr,xzr // moved below
cbz $cnt,.Lsqr8x8_post_condition

ldr $n0,[$tp,#-8*8]
ldp $a0,$a1,[$np,#8*0]
ldp $a2,$a3,[$np,#8*2]
ldp $a4,$a5,[$np,#8*4]
mov $cnt,#-8*8
ldp $a6,$a7,[$np,#8*6]
add $np,$np,#8*8

.Lsqr8x_tail:
mul $t0,$a0,$n0
adc $carry,xzr,xzr // carry bit, modulo-scheduled
mul $t1,$a1,$n0
add $cnt,$cnt,#8
mul $t2,$a2,$n0
mul $t3,$a3,$n0
adds $acc0,$acc0,$t0
mul $t0,$a4,$n0
adcs $acc1,$acc1,$t1
mul $t1,$a5,$n0
adcs $acc2,$acc2,$t2
mul $t2,$a6,$n0
adcs $acc3,$acc3,$t3
mul $t3,$a7,$n0
adcs $acc4,$acc4,$t0
umulh $t0,$a0,$n0
adcs $acc5,$acc5,$t1
umulh $t1,$a1,$n0
adcs $acc6,$acc6,$t2
umulh $t2,$a2,$n0
adcs $acc7,$acc7,$t3
umulh $t3,$a3,$n0
adc $carry,$carry,xzr
str $acc0,[$tp],#8
adds $acc0,$acc1,$t0
umulh $t0,$a4,$n0
adcs $acc1,$acc2,$t1
umulh $t1,$a5,$n0
adcs $acc2,$acc3,$t2
umulh $t2,$a6,$n0
adcs $acc3,$acc4,$t3
umulh $t3,$a7,$n0
ldr $n0,[$rp,$cnt]
adcs $acc4,$acc5,$t0
adcs $acc5,$acc6,$t1
adcs $acc6,$acc7,$t2
adcs $acc7,$carry,$t3
//adc $carry,xzr,xzr // moved above
cbnz $cnt,.Lsqr8x_tail
// note that carry flag is guaranteed
// to be zero at this point
ldp $a0,$a1,[$tp,#8*0]
sub $cnt,$np_end,$np // done yet?
sub $t2,$np_end,$num // rewound np
ldp $a2,$a3,[$tp,#8*2]
ldp $a4,$a5,[$tp,#8*4]
ldp $a6,$a7,[$tp,#8*6]
cbz $cnt,.Lsqr8x_tail_break

ldr $n0,[$rp,#-8*8]
adds $acc0,$acc0,$a0
adcs $acc1,$acc1,$a1
ldp $a0,$a1,[$np,#8*0]
adcs $acc2,$acc2,$a2
adcs $acc3,$acc3,$a3
ldp $a2,$a3,[$np,#8*2]
adcs $acc4,$acc4,$a4
adcs $acc5,$acc5,$a5
ldp $a4,$a5,[$np,#8*4]
adcs $acc6,$acc6,$a6
mov $cnt,#-8*8
adcs $acc7,$acc7,$a7
ldp $a6,$a7,[$np,#8*6]
add $np,$np,#8*8
//adc $carry,xzr,xzr // moved above
b .Lsqr8x_tail

.align 4
.Lsqr8x_tail_break:
ldr $n0,[x29,#112] // pull n0
add $cnt,$tp,#8*8 // end of current t[num] window

subs xzr,$topmost,#1 // "move" top-most carry to carry bit
adcs $t0,$acc0,$a0
adcs $t1,$acc1,$a1
ldp $acc0,$acc1,[$rp,#8*0]
adcs $acc2,$acc2,$a2
ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0]
adcs $acc3,$acc3,$a3
ldp $a2,$a3,[$t2,#8*2]
adcs $acc4,$acc4,$a4
adcs $acc5,$acc5,$a5
ldp $a4,$a5,[$t2,#8*4]
adcs $acc6,$acc6,$a6
adcs $acc7,$acc7,$a7
ldp $a6,$a7,[$t2,#8*6]
add $np,$t2,#8*8
adc $topmost,xzr,xzr // top-most carry
mul $na0,$n0,$acc0
stp $t0,$t1,[$tp,#8*0]
stp $acc2,$acc3,[$tp,#8*2]
ldp $acc2,$acc3,[$rp,#8*2]
stp $acc4,$acc5,[$tp,#8*4]
ldp $acc4,$acc5,[$rp,#8*4]
cmp $cnt,x29 // did we hit the bottom?
stp $acc6,$acc7,[$tp,#8*6]
mov $tp,$rp // slide the window
ldp $acc6,$acc7,[$rp,#8*6]
mov $cnt,#8
b.ne .Lsqr8x_reduction

// Final step. We see if the result is larger than the modulus, and
// if it is, subtract the modulus. But comparison implies
// subtraction. So we subtract the modulus, see if it borrowed,
// and conditionally copy the original value.
ldr $rp,[x29,#96] // pull rp
add $tp,$tp,#8*8
subs $t0,$acc0,$a0
sbcs $t1,$acc1,$a1
sub $cnt,$num,#8*8
mov $ap_end,$rp // $rp copy

.Lsqr8x_sub:
sbcs $t2,$acc2,$a2
ldp $a0,$a1,[$np,#8*0]
sbcs $t3,$acc3,$a3
stp $t0,$t1,[$rp,#8*0]
sbcs $t0,$acc4,$a4
ldp $a2,$a3,[$np,#8*2]
sbcs $t1,$acc5,$a5
stp $t2,$t3,[$rp,#8*2]
sbcs $t2,$acc6,$a6
ldp $a4,$a5,[$np,#8*4]
sbcs $t3,$acc7,$a7
ldp $a6,$a7,[$np,#8*6]
add $np,$np,#8*8
ldp $acc0,$acc1,[$tp,#8*0]
sub $cnt,$cnt,#8*8
ldp $acc2,$acc3,[$tp,#8*2]
ldp $acc4,$acc5,[$tp,#8*4]
ldp $acc6,$acc7,[$tp,#8*6]
add $tp,$tp,#8*8
stp $t0,$t1,[$rp,#8*4]
sbcs $t0,$acc0,$a0
stp $t2,$t3,[$rp,#8*6]
add $rp,$rp,#8*8
sbcs $t1,$acc1,$a1
cbnz $cnt,.Lsqr8x_sub

sbcs $t2,$acc2,$a2
mov $tp,sp
add $ap,sp,$num
ldp $a0,$a1,[$ap_end,#8*0]
sbcs $t3,$acc3,$a3
stp $t0,$t1,[$rp,#8*0]
sbcs $t0,$acc4,$a4
ldp $a2,$a3,[$ap_end,#8*2]
sbcs $t1,$acc5,$a5
stp $t2,$t3,[$rp,#8*2]
sbcs $t2,$acc6,$a6
ldp $acc0,$acc1,[$ap,#8*0]
sbcs $t3,$acc7,$a7
ldp $acc2,$acc3,[$ap,#8*2]
sbcs xzr,$topmost,xzr // did it borrow?
ldr x30,[x29,#8] // pull return address
stp $t0,$t1,[$rp,#8*4]
stp $t2,$t3,[$rp,#8*6]

sub $cnt,$num,#8*4
.Lsqr4x_cond_copy:
sub $cnt,$cnt,#8*4
csel $t0,$acc0,$a0,lo
stp xzr,xzr,[$tp,#8*0]
csel $t1,$acc1,$a1,lo
ldp $a0,$a1,[$ap_end,#8*4]
ldp $acc0,$acc1,[$ap,#8*4]
csel $t2,$acc2,$a2,lo
stp xzr,xzr,[$tp,#8*2]
add $tp,$tp,#8*4
csel $t3,$acc3,$a3,lo
ldp $a2,$a3,[$ap_end,#8*6]
ldp $acc2,$acc3,[$ap,#8*6]
add $ap,$ap,#8*4
stp $t0,$t1,[$ap_end,#8*0]
stp $t2,$t3,[$ap_end,#8*2]
add $ap_end,$ap_end,#8*4
stp xzr,xzr,[$ap,#8*0]
stp xzr,xzr,[$ap,#8*2]
cbnz $cnt,.Lsqr4x_cond_copy

csel $t0,$acc0,$a0,lo
stp xzr,xzr,[$tp,#8*0]
csel $t1,$acc1,$a1,lo
stp xzr,xzr,[$tp,#8*2]
csel $t2,$acc2,$a2,lo
csel $t3,$acc3,$a3,lo
stp $t0,$t1,[$ap_end,#8*0]
stp $t2,$t3,[$ap_end,#8*2]

b .Lsqr8x_done

.align 4
.Lsqr8x8_post_condition:
adc $carry,xzr,xzr
ldr x30,[x29,#8] // pull return address
// $acc0-7,$carry hold result, $a0-7 hold modulus
subs $a0,$acc0,$a0
ldr $ap,[x29,#96] // pull rp
sbcs $a1,$acc1,$a1
stp xzr,xzr,[sp,#8*0]
sbcs $a2,$acc2,$a2
stp xzr,xzr,[sp,#8*2]
sbcs $a3,$acc3,$a3
stp xzr,xzr,[sp,#8*4]
sbcs $a4,$acc4,$a4
stp xzr,xzr,[sp,#8*6]
sbcs $a5,$acc5,$a5
stp xzr,xzr,[sp,#8*8]
sbcs $a6,$acc6,$a6
stp xzr,xzr,[sp,#8*10]
sbcs $a7,$acc7,$a7
stp xzr,xzr,[sp,#8*12]
sbcs $carry,$carry,xzr // did it borrow?
stp xzr,xzr,[sp,#8*14]

// $a0-7 hold result-modulus
csel $a0,$acc0,$a0,lo
csel $a1,$acc1,$a1,lo
csel $a2,$acc2,$a2,lo
csel $a3,$acc3,$a3,lo
stp $a0,$a1,[$ap,#8*0]
csel $a4,$acc4,$a4,lo
csel $a5,$acc5,$a5,lo
stp $a2,$a3,[$ap,#8*2]
csel $a6,$acc6,$a6,lo
csel $a7,$acc7,$a7,lo
stp $a4,$a5,[$ap,#8*4]
stp $a6,$a7,[$ap,#8*6]

.Lsqr8x_done:
ldp x19,x20,[x29,#16]
mov sp,x29
ldp x21,x22,[x29,#32]
mov x0,#1
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldr x29,[sp],#128
ret
.size __bn_sqr8x_mont,.-__bn_sqr8x_mont
___
}

{
########################################################################
# Even though this might look like an ARMv8 adaptation of mulx4x_mont
# from the x86_64-mont5 module, it differs in that it performs
# reduction 256 bits at a time.
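#
# Editorial note on the b[i] addressing used below (an observation
# about the code, not from the original comment): the inner loops step
# $cnt by 8 and mask it with `and $cnt,$cnt,#31`, so it cycles through
# 8, 16, 24, 0. `ldr $bi,[$bp,$cnt]` therefore fetches b[1..3] of the
# current 4-word window and then wraps back to b[0], matching the
# "next b[i] (or b[0])" comments in the loops.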

my ($a0,$a1,$a2,$a3,
 $t0,$t1,$t2,$t3,
 $m0,$m1,$m2,$m3,
 $acc0,$acc1,$acc2,$acc3,$acc4,
 $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
my $bp_end=$rp;
my ($carry,$topmost) = ($rp,"x30");

$code.=<<___;
.type __bn_mul4x_mont,%function
.align 5
__bn_mul4x_mont:
stp x29,x30,[sp,#-128]!
add x29,sp,#0
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]

sub $tp,sp,$num,lsl#3
lsl $num,$num,#3
ldr $n0,[$n0] // *n0
sub sp,$tp,#8*4 // alloca

add $t0,$bp,$num
add $ap_end,$ap,$num
stp $rp,$t0,[x29,#96] // offload rp and &b[num]

ldr $bi,[$bp,#8*0] // b[0]
ldp $a0,$a1,[$ap,#8*0] // a[0..3]
ldp $a2,$a3,[$ap,#8*2]
add $ap,$ap,#8*4
mov $acc0,xzr
mov $acc1,xzr
mov $acc2,xzr
mov $acc3,xzr
ldp $m0,$m1,[$np,#8*0] // n[0..3]
ldp $m2,$m3,[$np,#8*2]
adds $np,$np,#8*4 // clear carry bit
mov $carry,xzr
mov $cnt,#0
mov $tp,sp

.Loop_mul4x_1st_reduction:
mul $t0,$a0,$bi // lo(a[0..3]*b[0])
adc $carry,$carry,xzr // modulo-scheduled
mul $t1,$a1,$bi
add $cnt,$cnt,#8
mul $t2,$a2,$bi
and $cnt,$cnt,#31
mul $t3,$a3,$bi
adds $acc0,$acc0,$t0
umulh $t0,$a0,$bi // hi(a[0..3]*b[0])
adcs $acc1,$acc1,$t1
mul $mi,$acc0,$n0 // t[0]*n0
adcs $acc2,$acc2,$t2
umulh $t1,$a1,$bi
adcs $acc3,$acc3,$t3
umulh $t2,$a2,$bi
adc $acc4,xzr,xzr
umulh $t3,$a3,$bi
ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
adds $acc1,$acc1,$t0
// (*) mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0)
str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
adcs $acc2,$acc2,$t1
mul $t1,$m1,$mi
adcs $acc3,$acc3,$t2
mul $t2,$m2,$mi
adc $acc4,$acc4,$t3 // can't overflow
mul $t3,$m3,$mi
// (*) adds xzr,$acc0,$t0
subs xzr,$acc0,#1 // (*)
umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
adcs $acc0,$acc1,$t1
umulh $t1,$m1,$mi
adcs $acc1,$acc2,$t2
umulh $t2,$m2,$mi
adcs $acc2,$acc3,$t3
umulh $t3,$m3,$mi
adcs $acc3,$acc4,$carry
adc $carry,xzr,xzr
adds $acc0,$acc0,$t0
sub $t0,$ap_end,$ap
adcs $acc1,$acc1,$t1
adcs $acc2,$acc2,$t2
adcs $acc3,$acc3,$t3
//adc $carry,$carry,xzr
cbnz $cnt,.Loop_mul4x_1st_reduction

cbz $t0,.Lmul4x4_post_condition

ldp $a0,$a1,[$ap,#8*0] // a[4..7]
ldp $a2,$a3,[$ap,#8*2]
add $ap,$ap,#8*4
ldr $mi,[sp] // a[0]*n0
ldp $m0,$m1,[$np,#8*0] // n[4..7]
ldp $m2,$m3,[$np,#8*2]
add $np,$np,#8*4

.Loop_mul4x_1st_tail:
mul $t0,$a0,$bi // lo(a[4..7]*b[i])
adc $carry,$carry,xzr // modulo-scheduled
mul $t1,$a1,$bi
add $cnt,$cnt,#8
mul $t2,$a2,$bi
and $cnt,$cnt,#31
mul $t3,$a3,$bi
adds $acc0,$acc0,$t0
umulh $t0,$a0,$bi // hi(a[4..7]*b[i])
adcs $acc1,$acc1,$t1
umulh $t1,$a1,$bi
adcs $acc2,$acc2,$t2
umulh $t2,$a2,$bi
adcs $acc3,$acc3,$t3
umulh $t3,$a3,$bi
adc $acc4,xzr,xzr
ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
adds $acc1,$acc1,$t0
mul $t0,$m0,$mi // lo(n[4..7]*a[0]*n0)
adcs $acc2,$acc2,$t1
mul $t1,$m1,$mi
adcs $acc3,$acc3,$t2
mul $t2,$m2,$mi
adc $acc4,$acc4,$t3 // can't overflow
mul $t3,$m3,$mi
adds $acc0,$acc0,$t0
umulh $t0,$m0,$mi // hi(n[4..7]*a[0]*n0)
adcs $acc1,$acc1,$t1
umulh $t1,$m1,$mi
adcs $acc2,$acc2,$t2
umulh $t2,$m2,$mi
adcs $acc3,$acc3,$t3
adcs $acc4,$acc4,$carry
umulh $t3,$m3,$mi
adc $carry,xzr,xzr
ldr $mi,[sp,$cnt] // next t[0]*n0
str $acc0,[$tp],#8 // result!!!
adds $acc0,$acc1,$t0
sub $t0,$ap_end,$ap // done yet?
adcs $acc1,$acc2,$t1
adcs $acc2,$acc3,$t2
adcs $acc3,$acc4,$t3
//adc $carry,$carry,xzr
cbnz $cnt,.Loop_mul4x_1st_tail

sub $t1,$ap_end,$num // rewound $ap
cbz $t0,.Lmul4x_proceed

ldp $a0,$a1,[$ap,#8*0]
ldp $a2,$a3,[$ap,#8*2]
add $ap,$ap,#8*4
ldp $m0,$m1,[$np,#8*0]
ldp $m2,$m3,[$np,#8*2]
add $np,$np,#8*4
b .Loop_mul4x_1st_tail

.align 5
.Lmul4x_proceed:
ldr $bi,[$bp,#8*4]! // *++b
adc $topmost,$carry,xzr
ldp $a0,$a1,[$t1,#8*0] // a[0..3]
sub $np,$np,$num // rewind np
ldp $a2,$a3,[$t1,#8*2]
add $ap,$t1,#8*4

stp $acc0,$acc1,[$tp,#8*0] // result!!!
ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
stp $acc2,$acc3,[$tp,#8*2] // result!!!
ldp $acc2,$acc3,[sp,#8*6]

ldp $m0,$m1,[$np,#8*0] // n[0..3]
mov $tp,sp
ldp $m2,$m3,[$np,#8*2]
adds $np,$np,#8*4 // clear carry bit
mov $carry,xzr

.align 4
.Loop_mul4x_reduction:
mul $t0,$a0,$bi // lo(a[0..3]*b[4])
adc $carry,$carry,xzr // modulo-scheduled
mul $t1,$a1,$bi
add $cnt,$cnt,#8
mul $t2,$a2,$bi
and $cnt,$cnt,#31
mul $t3,$a3,$bi
adds $acc0,$acc0,$t0
umulh $t0,$a0,$bi // hi(a[0..3]*b[4])
adcs $acc1,$acc1,$t1
mul $mi,$acc0,$n0 // t[0]*n0
adcs $acc2,$acc2,$t2
umulh $t1,$a1,$bi
adcs $acc3,$acc3,$t3
umulh $t2,$a2,$bi
adc $acc4,xzr,xzr
umulh $t3,$a3,$bi
ldr $bi,[$bp,$cnt] // next b[i]
adds $acc1,$acc1,$t0
// (*) mul $t0,$m0,$mi
str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
adcs $acc2,$acc2,$t1
mul $t1,$m1,$mi // lo(n[0..3]*t[0]*n0)
adcs $acc3,$acc3,$t2
mul $t2,$m2,$mi
adc $acc4,$acc4,$t3 // can't overflow
mul $t3,$m3,$mi
// (*) adds xzr,$acc0,$t0
subs xzr,$acc0,#1 // (*)
umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
adcs $acc0,$acc1,$t1
umulh $t1,$m1,$mi
adcs $acc1,$acc2,$t2
umulh $t2,$m2,$mi
adcs $acc2,$acc3,$t3
umulh $t3,$m3,$mi
adcs $acc3,$acc4,$carry
adc $carry,xzr,xzr
adds $acc0,$acc0,$t0
adcs $acc1,$acc1,$t1
adcs $acc2,$acc2,$t2
adcs $acc3,$acc3,$t3
//adc $carry,$carry,xzr
cbnz $cnt,.Loop_mul4x_reduction

adc $carry,$carry,xzr
ldp $t0,$t1,[$tp,#8*4] // t[4..7]
ldp $t2,$t3,[$tp,#8*6]
ldp $a0,$a1,[$ap,#8*0] // a[4..7]
ldp $a2,$a3,[$ap,#8*2]
add $ap,$ap,#8*4
adds $acc0,$acc0,$t0
adcs $acc1,$acc1,$t1
adcs $acc2,$acc2,$t2
adcs $acc3,$acc3,$t3
//adc $carry,$carry,xzr

ldr $mi,[sp] // t[0]*n0
ldp $m0,$m1,[$np,#8*0] // n[4..7]
ldp $m2,$m3,[$np,#8*2]
add $np,$np,#8*4

.align 4
.Loop_mul4x_tail:
mul $t0,$a0,$bi // lo(a[4..7]*b[4])
adc $carry,$carry,xzr // modulo-scheduled
mul $t1,$a1,$bi
add $cnt,$cnt,#8
mul $t2,$a2,$bi
and $cnt,$cnt,#31
mul $t3,$a3,$bi
adds $acc0,$acc0,$t0
umulh $t0,$a0,$bi // hi(a[4..7]*b[4])
adcs $acc1,$acc1,$t1
umulh $t1,$a1,$bi
adcs $acc2,$acc2,$t2
umulh $t2,$a2,$bi
adcs $acc3,$acc3,$t3
umulh $t3,$a3,$bi
adc $acc4,xzr,xzr
ldr $bi,[$bp,$cnt] // next b[i]
adds $acc1,$acc1,$t0
mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0)
adcs $acc2,$acc2,$t1
mul $t1,$m1,$mi
adcs $acc3,$acc3,$t2
mul $t2,$m2,$mi
adc $acc4,$acc4,$t3 // can't overflow
mul $t3,$m3,$mi
adds $acc0,$acc0,$t0
umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0)
adcs $acc1,$acc1,$t1
umulh $t1,$m1,$mi
adcs $acc2,$acc2,$t2
umulh $t2,$m2,$mi
adcs $acc3,$acc3,$t3
umulh $t3,$m3,$mi
adcs $acc4,$acc4,$carry
ldr $mi,[sp,$cnt] // next a[0]*n0
adc $carry,xzr,xzr
str $acc0,[$tp],#8 // result!!!
adds $acc0,$acc1,$t0
sub $t0,$ap_end,$ap // done yet?
adcs $acc1,$acc2,$t1
adcs $acc2,$acc3,$t2
adcs $acc3,$acc4,$t3
//adc $carry,$carry,xzr
cbnz $cnt,.Loop_mul4x_tail

sub $t1,$np,$num // rewound np?
adc $carry,$carry,xzr
cbz $t0,.Loop_mul4x_break

ldp $t0,$t1,[$tp,#8*4]
ldp $t2,$t3,[$tp,#8*6]
ldp $a0,$a1,[$ap,#8*0]
ldp $a2,$a3,[$ap,#8*2]
add $ap,$ap,#8*4
adds $acc0,$acc0,$t0
adcs $acc1,$acc1,$t1
adcs $acc2,$acc2,$t2
adcs $acc3,$acc3,$t3
//adc $carry,$carry,xzr
ldp $m0,$m1,[$np,#8*0]
ldp $m2,$m3,[$np,#8*2]
add $np,$np,#8*4
b .Loop_mul4x_tail

.align 4
.Loop_mul4x_break:
ldp $t2,$t3,[x29,#96] // pull rp and &b[num]
adds $acc0,$acc0,$topmost
add $bp,$bp,#8*4 // bp++
adcs $acc1,$acc1,xzr
sub $ap,$ap,$num // rewind ap
adcs $acc2,$acc2,xzr
stp $acc0,$acc1,[$tp,#8*0] // result!!!
adcs $acc3,$acc3,xzr
ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
adc $topmost,$carry,xzr
stp $acc2,$acc3,[$tp,#8*2] // result!!!
cmp $bp,$t3 // done yet?
ldp $acc2,$acc3,[sp,#8*6]
ldp $m0,$m1,[$t1,#8*0] // n[0..3]
ldp $m2,$m3,[$t1,#8*2]
add $np,$t1,#8*4
b.eq .Lmul4x_post

ldr $bi,[$bp]
ldp $a0,$a1,[$ap,#8*0] // a[0..3]
ldp $a2,$a3,[$ap,#8*2]
adds $ap,$ap,#8*4 // clear carry bit
mov $carry,xzr
mov $tp,sp
b .Loop_mul4x_reduction

.align 4
.Lmul4x_post:
// Final step. We see if the result is larger than the modulus, and
// if it is, subtract the modulus. But comparison implies
// subtraction. So we subtract the modulus, see if it borrowed,
// and conditionally copy the original value.
mov $rp,$t2
mov $ap_end,$t2 // $rp copy
subs $t0,$acc0,$m0
add $tp,sp,#8*8
sbcs $t1,$acc1,$m1
sub $cnt,$num,#8*4

.Lmul4x_sub:
sbcs $t2,$acc2,$m2
ldp $m0,$m1,[$np,#8*0]
sub $cnt,$cnt,#8*4
ldp $acc0,$acc1,[$tp,#8*0]
sbcs $t3,$acc3,$m3
ldp $m2,$m3,[$np,#8*2]
add $np,$np,#8*4
ldp $acc2,$acc3,[$tp,#8*2]
add $tp,$tp,#8*4
stp $t0,$t1,[$rp,#8*0]
sbcs $t0,$acc0,$m0
stp $t2,$t3,[$rp,#8*2]
add $rp,$rp,#8*4
sbcs $t1,$acc1,$m1
cbnz $cnt,.Lmul4x_sub

sbcs $t2,$acc2,$m2
mov $tp,sp
add $ap,sp,#8*4
ldp $a0,$a1,[$ap_end,#8*0]
sbcs $t3,$acc3,$m3
stp $t0,$t1,[$rp,#8*0]
ldp $a2,$a3,[$ap_end,#8*2]
stp $t2,$t3,[$rp,#8*2]
ldp $acc0,$acc1,[$ap,#8*0]
ldp $acc2,$acc3,[$ap,#8*2]
sbcs xzr,$topmost,xzr // did it borrow?
ldr x30,[x29,#8] // pull return address

sub $cnt,$num,#8*4
.Lmul4x_cond_copy:
sub $cnt,$cnt,#8*4
csel $t0,$acc0,$a0,lo
stp xzr,xzr,[$tp,#8*0]
csel $t1,$acc1,$a1,lo
ldp $a0,$a1,[$ap_end,#8*4]
ldp $acc0,$acc1,[$ap,#8*4]
csel $t2,$acc2,$a2,lo
stp xzr,xzr,[$tp,#8*2]
add $tp,$tp,#8*4
csel $t3,$acc3,$a3,lo
ldp $a2,$a3,[$ap_end,#8*6]
ldp $acc2,$acc3,[$ap,#8*6]
add $ap,$ap,#8*4
stp $t0,$t1,[$ap_end,#8*0]
stp $t2,$t3,[$ap_end,#8*2]
add $ap_end,$ap_end,#8*4
cbnz $cnt,.Lmul4x_cond_copy

csel $t0,$acc0,$a0,lo
stp xzr,xzr,[$tp,#8*0]
csel $t1,$acc1,$a1,lo
stp xzr,xzr,[$tp,#8*2]
csel $t2,$acc2,$a2,lo
stp xzr,xzr,[$tp,#8*3]
csel $t3,$acc3,$a3,lo
stp xzr,xzr,[$tp,#8*4]
stp $t0,$t1,[$ap_end,#8*0]
stp $t2,$t3,[$ap_end,#8*2]

b .Lmul4x_done

.align 4
.Lmul4x4_post_condition:
adc $carry,$carry,xzr
ldr $ap,[x29,#96] // pull rp
// $acc0-3,$carry hold result, $m0-3 hold modulus
subs $a0,$acc0,$m0
ldr x30,[x29,#8] // pull return address
sbcs $a1,$acc1,$m1
stp xzr,xzr,[sp,#8*0]
sbcs $a2,$acc2,$m2
stp xzr,xzr,[sp,#8*2]
sbcs $a3,$acc3,$m3
stp xzr,xzr,[sp,#8*4]
sbcs xzr,$carry,xzr // did it borrow?
stp xzr,xzr,[sp,#8*6]

// $a0-3 hold result-modulus
csel $a0,$acc0,$a0,lo
csel $a1,$acc1,$a1,lo
csel $a2,$acc2,$a2,lo
csel $a3,$acc3,$a3,lo
stp $a0,$a1,[$ap,#8*0]
stp $a2,$a3,[$ap,#8*2]

.Lmul4x_done:
ldp x19,x20,[x29,#16]
mov sp,x29
ldp x21,x22,[x29,#32]
mov x0,#1
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldr x29,[sp],#128
ret
.size __bn_mul4x_mont,.-__bn_mul4x_mont
___
}
$code.=<<___;
.asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 4
___

print $code;

close STDOUT;