ring-native 0.0.0 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (267)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/CHANGES.md +7 -0
  4. data/Makefile +5 -0
  5. data/README.md +12 -5
  6. data/Rakefile +4 -0
  7. data/ext/ring/extconf.rb +4 -5
  8. data/lib/ring/native.rb +3 -1
  9. data/lib/ring/native/version.rb +5 -1
  10. data/ring-native.gemspec +6 -6
  11. data/vendor/ring-ffi/Cargo.lock +26 -0
  12. data/vendor/ring-ffi/Cargo.toml +45 -0
  13. data/vendor/ring-ffi/LICENSE +16 -0
  14. data/vendor/ring-ffi/README.md +59 -0
  15. data/vendor/ring-ffi/src/lib.rs +79 -0
  16. metadata +10 -255
  17. data/vendor/ring/BUILDING.md +0 -40
  18. data/vendor/ring/Cargo.toml +0 -43
  19. data/vendor/ring/LICENSE +0 -185
  20. data/vendor/ring/Makefile +0 -35
  21. data/vendor/ring/PORTING.md +0 -163
  22. data/vendor/ring/README.md +0 -113
  23. data/vendor/ring/STYLE.md +0 -197
  24. data/vendor/ring/appveyor.yml +0 -27
  25. data/vendor/ring/build.rs +0 -108
  26. data/vendor/ring/crypto/aes/aes.c +0 -1142
  27. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +0 -25
  28. data/vendor/ring/crypto/aes/aes_test.cc +0 -93
  29. data/vendor/ring/crypto/aes/asm/aes-586.pl +0 -2368
  30. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +0 -1249
  31. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +0 -2246
  32. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +0 -1318
  33. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +0 -2084
  34. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +0 -675
  35. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +0 -1364
  36. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +0 -1565
  37. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +0 -841
  38. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +0 -1116
  39. data/vendor/ring/crypto/aes/internal.h +0 -87
  40. data/vendor/ring/crypto/aes/mode_wrappers.c +0 -61
  41. data/vendor/ring/crypto/bn/add.c +0 -394
  42. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +0 -694
  43. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +0 -1503
  44. data/vendor/ring/crypto/bn/asm/bn-586.pl +0 -774
  45. data/vendor/ring/crypto/bn/asm/co-586.pl +0 -287
  46. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +0 -1882
  47. data/vendor/ring/crypto/bn/asm/x86-mont.pl +0 -592
  48. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +0 -599
  49. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +0 -1393
  50. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +0 -3507
  51. data/vendor/ring/crypto/bn/bn.c +0 -352
  52. data/vendor/ring/crypto/bn/bn_asn1.c +0 -74
  53. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +0 -25
  54. data/vendor/ring/crypto/bn/bn_test.cc +0 -1696
  55. data/vendor/ring/crypto/bn/cmp.c +0 -200
  56. data/vendor/ring/crypto/bn/convert.c +0 -433
  57. data/vendor/ring/crypto/bn/ctx.c +0 -311
  58. data/vendor/ring/crypto/bn/div.c +0 -594
  59. data/vendor/ring/crypto/bn/exponentiation.c +0 -1335
  60. data/vendor/ring/crypto/bn/gcd.c +0 -711
  61. data/vendor/ring/crypto/bn/generic.c +0 -1019
  62. data/vendor/ring/crypto/bn/internal.h +0 -316
  63. data/vendor/ring/crypto/bn/montgomery.c +0 -516
  64. data/vendor/ring/crypto/bn/mul.c +0 -888
  65. data/vendor/ring/crypto/bn/prime.c +0 -829
  66. data/vendor/ring/crypto/bn/random.c +0 -334
  67. data/vendor/ring/crypto/bn/rsaz_exp.c +0 -262
  68. data/vendor/ring/crypto/bn/rsaz_exp.h +0 -53
  69. data/vendor/ring/crypto/bn/shift.c +0 -276
  70. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +0 -25
  71. data/vendor/ring/crypto/bytestring/bytestring_test.cc +0 -421
  72. data/vendor/ring/crypto/bytestring/cbb.c +0 -399
  73. data/vendor/ring/crypto/bytestring/cbs.c +0 -227
  74. data/vendor/ring/crypto/bytestring/internal.h +0 -46
  75. data/vendor/ring/crypto/chacha/chacha_generic.c +0 -140
  76. data/vendor/ring/crypto/chacha/chacha_vec.c +0 -323
  77. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +0 -1447
  78. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +0 -153
  79. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +0 -25
  80. data/vendor/ring/crypto/cipher/e_aes.c +0 -390
  81. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +0 -208
  82. data/vendor/ring/crypto/cipher/internal.h +0 -173
  83. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +0 -543
  84. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +0 -9
  85. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +0 -475
  86. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +0 -23
  87. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +0 -422
  88. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +0 -484
  89. data/vendor/ring/crypto/cipher/test/cipher_test.txt +0 -100
  90. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +0 -25
  91. data/vendor/ring/crypto/constant_time_test.c +0 -304
  92. data/vendor/ring/crypto/cpu-arm-asm.S +0 -32
  93. data/vendor/ring/crypto/cpu-arm.c +0 -199
  94. data/vendor/ring/crypto/cpu-intel.c +0 -261
  95. data/vendor/ring/crypto/crypto.c +0 -151
  96. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +0 -2118
  97. data/vendor/ring/crypto/curve25519/curve25519.c +0 -4888
  98. data/vendor/ring/crypto/curve25519/x25519_test.cc +0 -128
  99. data/vendor/ring/crypto/digest/md32_common.h +0 -181
  100. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +0 -2725
  101. data/vendor/ring/crypto/ec/ec.c +0 -193
  102. data/vendor/ring/crypto/ec/ec_curves.c +0 -61
  103. data/vendor/ring/crypto/ec/ec_key.c +0 -228
  104. data/vendor/ring/crypto/ec/ec_montgomery.c +0 -114
  105. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +0 -25
  106. data/vendor/ring/crypto/ec/internal.h +0 -243
  107. data/vendor/ring/crypto/ec/oct.c +0 -253
  108. data/vendor/ring/crypto/ec/p256-64.c +0 -1794
  109. data/vendor/ring/crypto/ec/p256-x86_64-table.h +0 -9548
  110. data/vendor/ring/crypto/ec/p256-x86_64.c +0 -509
  111. data/vendor/ring/crypto/ec/simple.c +0 -1007
  112. data/vendor/ring/crypto/ec/util-64.c +0 -183
  113. data/vendor/ring/crypto/ec/wnaf.c +0 -508
  114. data/vendor/ring/crypto/ecdh/ecdh.c +0 -155
  115. data/vendor/ring/crypto/ecdsa/ecdsa.c +0 -304
  116. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +0 -193
  117. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +0 -25
  118. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +0 -327
  119. data/vendor/ring/crypto/header_removed.h +0 -17
  120. data/vendor/ring/crypto/internal.h +0 -495
  121. data/vendor/ring/crypto/libring.Windows.vcxproj +0 -101
  122. data/vendor/ring/crypto/mem.c +0 -98
  123. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +0 -1045
  124. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +0 -517
  125. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +0 -1393
  126. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +0 -1741
  127. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +0 -422
  128. data/vendor/ring/crypto/modes/ctr.c +0 -226
  129. data/vendor/ring/crypto/modes/gcm.c +0 -1206
  130. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +0 -25
  131. data/vendor/ring/crypto/modes/gcm_test.c +0 -348
  132. data/vendor/ring/crypto/modes/internal.h +0 -299
  133. data/vendor/ring/crypto/perlasm/arm-xlate.pl +0 -170
  134. data/vendor/ring/crypto/perlasm/readme +0 -100
  135. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +0 -1164
  136. data/vendor/ring/crypto/perlasm/x86asm.pl +0 -292
  137. data/vendor/ring/crypto/perlasm/x86gas.pl +0 -263
  138. data/vendor/ring/crypto/perlasm/x86masm.pl +0 -200
  139. data/vendor/ring/crypto/perlasm/x86nasm.pl +0 -187
  140. data/vendor/ring/crypto/poly1305/poly1305.c +0 -331
  141. data/vendor/ring/crypto/poly1305/poly1305_arm.c +0 -301
  142. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +0 -2015
  143. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +0 -25
  144. data/vendor/ring/crypto/poly1305/poly1305_test.cc +0 -80
  145. data/vendor/ring/crypto/poly1305/poly1305_test.txt +0 -52
  146. data/vendor/ring/crypto/poly1305/poly1305_vec.c +0 -892
  147. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +0 -75
  148. data/vendor/ring/crypto/rand/internal.h +0 -32
  149. data/vendor/ring/crypto/rand/rand.c +0 -189
  150. data/vendor/ring/crypto/rand/urandom.c +0 -219
  151. data/vendor/ring/crypto/rand/windows.c +0 -56
  152. data/vendor/ring/crypto/refcount_c11.c +0 -66
  153. data/vendor/ring/crypto/refcount_lock.c +0 -53
  154. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +0 -25
  155. data/vendor/ring/crypto/refcount_test.c +0 -58
  156. data/vendor/ring/crypto/rsa/blinding.c +0 -462
  157. data/vendor/ring/crypto/rsa/internal.h +0 -108
  158. data/vendor/ring/crypto/rsa/padding.c +0 -300
  159. data/vendor/ring/crypto/rsa/rsa.c +0 -450
  160. data/vendor/ring/crypto/rsa/rsa_asn1.c +0 -261
  161. data/vendor/ring/crypto/rsa/rsa_impl.c +0 -944
  162. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +0 -25
  163. data/vendor/ring/crypto/rsa/rsa_test.cc +0 -437
  164. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +0 -436
  165. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +0 -2390
  166. data/vendor/ring/crypto/sha/asm/sha256-586.pl +0 -1275
  167. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +0 -735
  168. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +0 -14
  169. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +0 -14
  170. data/vendor/ring/crypto/sha/asm/sha512-586.pl +0 -911
  171. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +0 -666
  172. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +0 -14
  173. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +0 -14
  174. data/vendor/ring/crypto/sha/sha1.c +0 -271
  175. data/vendor/ring/crypto/sha/sha256.c +0 -204
  176. data/vendor/ring/crypto/sha/sha512.c +0 -355
  177. data/vendor/ring/crypto/test/file_test.cc +0 -326
  178. data/vendor/ring/crypto/test/file_test.h +0 -181
  179. data/vendor/ring/crypto/test/malloc.cc +0 -150
  180. data/vendor/ring/crypto/test/scoped_types.h +0 -95
  181. data/vendor/ring/crypto/test/test.Windows.vcxproj +0 -35
  182. data/vendor/ring/crypto/test/test_util.cc +0 -46
  183. data/vendor/ring/crypto/test/test_util.h +0 -41
  184. data/vendor/ring/crypto/thread_none.c +0 -55
  185. data/vendor/ring/crypto/thread_pthread.c +0 -165
  186. data/vendor/ring/crypto/thread_test.Windows.vcxproj +0 -25
  187. data/vendor/ring/crypto/thread_test.c +0 -200
  188. data/vendor/ring/crypto/thread_win.c +0 -282
  189. data/vendor/ring/examples/checkdigest.rs +0 -103
  190. data/vendor/ring/include/openssl/aes.h +0 -121
  191. data/vendor/ring/include/openssl/arm_arch.h +0 -129
  192. data/vendor/ring/include/openssl/base.h +0 -156
  193. data/vendor/ring/include/openssl/bn.h +0 -794
  194. data/vendor/ring/include/openssl/buffer.h +0 -18
  195. data/vendor/ring/include/openssl/bytestring.h +0 -235
  196. data/vendor/ring/include/openssl/chacha.h +0 -37
  197. data/vendor/ring/include/openssl/cmac.h +0 -76
  198. data/vendor/ring/include/openssl/cpu.h +0 -184
  199. data/vendor/ring/include/openssl/crypto.h +0 -43
  200. data/vendor/ring/include/openssl/curve25519.h +0 -88
  201. data/vendor/ring/include/openssl/ec.h +0 -225
  202. data/vendor/ring/include/openssl/ec_key.h +0 -129
  203. data/vendor/ring/include/openssl/ecdh.h +0 -110
  204. data/vendor/ring/include/openssl/ecdsa.h +0 -156
  205. data/vendor/ring/include/openssl/err.h +0 -201
  206. data/vendor/ring/include/openssl/mem.h +0 -101
  207. data/vendor/ring/include/openssl/obj_mac.h +0 -71
  208. data/vendor/ring/include/openssl/opensslfeatures.h +0 -68
  209. data/vendor/ring/include/openssl/opensslv.h +0 -18
  210. data/vendor/ring/include/openssl/ossl_typ.h +0 -18
  211. data/vendor/ring/include/openssl/poly1305.h +0 -51
  212. data/vendor/ring/include/openssl/rand.h +0 -70
  213. data/vendor/ring/include/openssl/rsa.h +0 -399
  214. data/vendor/ring/include/openssl/thread.h +0 -133
  215. data/vendor/ring/include/openssl/type_check.h +0 -71
  216. data/vendor/ring/mk/Common.props +0 -63
  217. data/vendor/ring/mk/Windows.props +0 -42
  218. data/vendor/ring/mk/WindowsTest.props +0 -18
  219. data/vendor/ring/mk/appveyor.bat +0 -62
  220. data/vendor/ring/mk/bottom_of_makefile.mk +0 -54
  221. data/vendor/ring/mk/ring.mk +0 -266
  222. data/vendor/ring/mk/top_of_makefile.mk +0 -214
  223. data/vendor/ring/mk/travis.sh +0 -40
  224. data/vendor/ring/mk/update-travis-yml.py +0 -229
  225. data/vendor/ring/ring.sln +0 -153
  226. data/vendor/ring/src/aead.rs +0 -682
  227. data/vendor/ring/src/agreement.rs +0 -248
  228. data/vendor/ring/src/c.rs +0 -129
  229. data/vendor/ring/src/constant_time.rs +0 -37
  230. data/vendor/ring/src/der.rs +0 -96
  231. data/vendor/ring/src/digest.rs +0 -690
  232. data/vendor/ring/src/digest_tests.txt +0 -57
  233. data/vendor/ring/src/ecc.rs +0 -28
  234. data/vendor/ring/src/ecc_build.rs +0 -279
  235. data/vendor/ring/src/ecc_curves.rs +0 -117
  236. data/vendor/ring/src/ed25519_tests.txt +0 -2579
  237. data/vendor/ring/src/exe_tests.rs +0 -46
  238. data/vendor/ring/src/ffi.rs +0 -29
  239. data/vendor/ring/src/file_test.rs +0 -187
  240. data/vendor/ring/src/hkdf.rs +0 -153
  241. data/vendor/ring/src/hkdf_tests.txt +0 -59
  242. data/vendor/ring/src/hmac.rs +0 -414
  243. data/vendor/ring/src/hmac_tests.txt +0 -97
  244. data/vendor/ring/src/input.rs +0 -312
  245. data/vendor/ring/src/lib.rs +0 -41
  246. data/vendor/ring/src/pbkdf2.rs +0 -265
  247. data/vendor/ring/src/pbkdf2_tests.txt +0 -113
  248. data/vendor/ring/src/polyfill.rs +0 -57
  249. data/vendor/ring/src/rand.rs +0 -28
  250. data/vendor/ring/src/signature.rs +0 -314
  251. data/vendor/ring/third-party/NIST/README.md +0 -9
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +0 -263
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +0 -309
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +0 -267
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +0 -263
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +0 -309
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +0 -267
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +0 -263
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +0 -309
  260. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +0 -267
  261. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +0 -519
  262. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +0 -309
  263. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +0 -523
  264. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +0 -519
  265. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +0 -309
  266. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +0 -523
  267. data/vendor/ring/third-party/NIST/sha256sums.txt +0 -1
@@ -1,1503 +0,0 @@
1
- #!/usr/bin/env perl
2
-
3
- # ====================================================================
4
- # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5
- # project. The module is, however, dual licensed under OpenSSL and
6
- # CRYPTOGAMS licenses depending on where you obtain it. For further
7
- # details see http://www.openssl.org/~appro/cryptogams/.
8
- # ====================================================================
9
-
10
- # March 2015
11
- #
12
- # "Teaser" Montgomery multiplication module for ARMv8. Needs more
13
- # work. While it does improve RSA sign performance by 20-30% (less for
14
- # longer keys) on most processors, for some reason RSA2048 is not
15
- # faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
16
- # instruction issue rate is limited on processor in question, meaning
17
- # that dedicated squaring procedure is a must. Well, actually all
18
- # contemporary AArch64 processors seem to have limited multiplication
19
- # issue rate, i.e. they can't issue multiplication every cycle, which
20
- # explains moderate improvement coefficients in comparison to
21
- # compiler-generated code. Recall that compiler is instructed to use
22
- # umulh and therefore uses same amount of multiplication instructions
23
- # to do the job. Assembly's edge is to minimize number of "collateral"
24
- # instructions and of course instruction scheduling.
25
- #
26
- # April 2015
27
- #
28
- # Squaring procedure that handles lengths divisible by 8 improves
29
- # RSA/DSA performance by 25-40-60% depending on processor and key
30
- # length. Overall improvement coefficients are always positive in
31
- # comparison to compiler-generated code. On Cortex-A57 improvement
32
- # is still modest on longest key lengths, while others exhibit e.g.
33
- # 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
34
- # on Cortex-A57 and ~60-100% faster on others.
35
-
36
- $flavour = shift;
37
- $output = shift;
38
-
39
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
40
- ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
41
- ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
42
- die "can't locate arm-xlate.pl";
43
-
44
- open OUT,"| \"$^X\" $xlate $flavour $output";
45
- *STDOUT=*OUT;
46
-
47
- ($lo0,$hi0,$aj,$m0,$alo,$ahi,
48
- $lo1,$hi1,$nj,$m1,$nlo,$nhi,
49
- $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);
50
-
51
- # int bn_mul_mont(
52
- $rp="x0"; # BN_ULONG *rp,
53
- $ap="x1"; # const BN_ULONG *ap,
54
- $bp="x2"; # const BN_ULONG *bp,
55
- $np="x3"; # const BN_ULONG *np,
56
- $n0="x4"; # const BN_ULONG *n0,
57
- $num="x5"; # int num);
58
-
59
- $code.=<<___;
60
- .text
61
-
62
- .globl bn_mul_mont
63
- .type bn_mul_mont,%function
64
- .align 5
65
- bn_mul_mont:
66
- tst $num,#7
67
- b.eq __bn_sqr8x_mont
68
- tst $num,#3
69
- b.eq __bn_mul4x_mont
70
- .Lmul_mont:
71
- stp x29,x30,[sp,#-64]!
72
- add x29,sp,#0
73
- stp x19,x20,[sp,#16]
74
- stp x21,x22,[sp,#32]
75
- stp x23,x24,[sp,#48]
76
-
77
- ldr $m0,[$bp],#8 // bp[0]
78
- sub $tp,sp,$num,lsl#3
79
- ldp $hi0,$aj,[$ap],#16 // ap[0..1]
80
- lsl $num,$num,#3
81
- ldr $n0,[$n0] // *n0
82
- and $tp,$tp,#-16 // ABI says so
83
- ldp $hi1,$nj,[$np],#16 // np[0..1]
84
-
85
- mul $lo0,$hi0,$m0 // ap[0]*bp[0]
86
- sub $j,$num,#16 // j=num-2
87
- umulh $hi0,$hi0,$m0
88
- mul $alo,$aj,$m0 // ap[1]*bp[0]
89
- umulh $ahi,$aj,$m0
90
-
91
- mul $m1,$lo0,$n0 // "tp[0]"*n0
92
- mov sp,$tp // alloca
93
-
94
- // (*) mul $lo1,$hi1,$m1 // np[0]*m1
95
- umulh $hi1,$hi1,$m1
96
- mul $nlo,$nj,$m1 // np[1]*m1
97
- // (*) adds $lo1,$lo1,$lo0 // discarded
98
- // (*) As for removal of first multiplication and addition
99
- // instructions. The outcome of first addition is
100
- // guaranteed to be zero, which leaves two computationally
101
- // significant outcomes: it either carries or not. Then
102
- // question is when does it carry? Is there alternative
103
- // way to deduce it? If you follow operations, you can
104
- // observe that condition for carry is quite simple:
105
- // $lo0 being non-zero. So that carry can be calculated
106
- // by adding -1 to $lo0. That's what next instruction does.
107
- subs xzr,$lo0,#1 // (*)
108
- umulh $nhi,$nj,$m1
109
- adc $hi1,$hi1,xzr
110
- cbz $j,.L1st_skip
111
-
112
- .L1st:
113
- ldr $aj,[$ap],#8
114
- adds $lo0,$alo,$hi0
115
- sub $j,$j,#8 // j--
116
- adc $hi0,$ahi,xzr
117
-
118
- ldr $nj,[$np],#8
119
- adds $lo1,$nlo,$hi1
120
- mul $alo,$aj,$m0 // ap[j]*bp[0]
121
- adc $hi1,$nhi,xzr
122
- umulh $ahi,$aj,$m0
123
-
124
- adds $lo1,$lo1,$lo0
125
- mul $nlo,$nj,$m1 // np[j]*m1
126
- adc $hi1,$hi1,xzr
127
- umulh $nhi,$nj,$m1
128
- str $lo1,[$tp],#8 // tp[j-1]
129
- cbnz $j,.L1st
130
-
131
- .L1st_skip:
132
- adds $lo0,$alo,$hi0
133
- sub $ap,$ap,$num // rewind $ap
134
- adc $hi0,$ahi,xzr
135
-
136
- adds $lo1,$nlo,$hi1
137
- sub $np,$np,$num // rewind $np
138
- adc $hi1,$nhi,xzr
139
-
140
- adds $lo1,$lo1,$lo0
141
- sub $i,$num,#8 // i=num-1
142
- adcs $hi1,$hi1,$hi0
143
-
144
- adc $ovf,xzr,xzr // upmost overflow bit
145
- stp $lo1,$hi1,[$tp]
146
-
147
- .Louter:
148
- ldr $m0,[$bp],#8 // bp[i]
149
- ldp $hi0,$aj,[$ap],#16
150
- ldr $tj,[sp] // tp[0]
151
- add $tp,sp,#8
152
-
153
- mul $lo0,$hi0,$m0 // ap[0]*bp[i]
154
- sub $j,$num,#16 // j=num-2
155
- umulh $hi0,$hi0,$m0
156
- ldp $hi1,$nj,[$np],#16
157
- mul $alo,$aj,$m0 // ap[1]*bp[i]
158
- adds $lo0,$lo0,$tj
159
- umulh $ahi,$aj,$m0
160
- adc $hi0,$hi0,xzr
161
-
162
- mul $m1,$lo0,$n0
163
- sub $i,$i,#8 // i--
164
-
165
- // (*) mul $lo1,$hi1,$m1 // np[0]*m1
166
- umulh $hi1,$hi1,$m1
167
- mul $nlo,$nj,$m1 // np[1]*m1
168
- // (*) adds $lo1,$lo1,$lo0
169
- subs xzr,$lo0,#1 // (*)
170
- umulh $nhi,$nj,$m1
171
- cbz $j,.Linner_skip
172
-
173
- .Linner:
174
- ldr $aj,[$ap],#8
175
- adc $hi1,$hi1,xzr
176
- ldr $tj,[$tp],#8 // tp[j]
177
- adds $lo0,$alo,$hi0
178
- sub $j,$j,#8 // j--
179
- adc $hi0,$ahi,xzr
180
-
181
- adds $lo1,$nlo,$hi1
182
- ldr $nj,[$np],#8
183
- adc $hi1,$nhi,xzr
184
-
185
- mul $alo,$aj,$m0 // ap[j]*bp[i]
186
- adds $lo0,$lo0,$tj
187
- umulh $ahi,$aj,$m0
188
- adc $hi0,$hi0,xzr
189
-
190
- mul $nlo,$nj,$m1 // np[j]*m1
191
- adds $lo1,$lo1,$lo0
192
- umulh $nhi,$nj,$m1
193
- str $lo1,[$tp,#-16] // tp[j-1]
194
- cbnz $j,.Linner
195
-
196
- .Linner_skip:
197
- ldr $tj,[$tp],#8 // tp[j]
198
- adc $hi1,$hi1,xzr
199
- adds $lo0,$alo,$hi0
200
- sub $ap,$ap,$num // rewind $ap
201
- adc $hi0,$ahi,xzr
202
-
203
- adds $lo1,$nlo,$hi1
204
- sub $np,$np,$num // rewind $np
205
- adcs $hi1,$nhi,$ovf
206
- adc $ovf,xzr,xzr
207
-
208
- adds $lo0,$lo0,$tj
209
- adc $hi0,$hi0,xzr
210
-
211
- adds $lo1,$lo1,$lo0
212
- adcs $hi1,$hi1,$hi0
213
- adc $ovf,$ovf,xzr // upmost overflow bit
214
- stp $lo1,$hi1,[$tp,#-16]
215
-
216
- cbnz $i,.Louter
217
-
218
- // Final step. We see if result is larger than modulus, and
219
- // if it is, subtract the modulus. But comparison implies
220
- // subtraction. So we subtract modulus, see if it borrowed,
221
- // and conditionally copy original value.
222
- ldr $tj,[sp] // tp[0]
223
- add $tp,sp,#8
224
- ldr $nj,[$np],#8 // np[0]
225
- subs $j,$num,#8 // j=num-1 and clear borrow
226
- mov $ap,$rp
227
- .Lsub:
228
- sbcs $aj,$tj,$nj // tp[j]-np[j]
229
- ldr $tj,[$tp],#8
230
- sub $j,$j,#8 // j--
231
- ldr $nj,[$np],#8
232
- str $aj,[$ap],#8 // rp[j]=tp[j]-np[j]
233
- cbnz $j,.Lsub
234
-
235
- sbcs $aj,$tj,$nj
236
- sbcs $ovf,$ovf,xzr // did it borrow?
237
- str $aj,[$ap],#8 // rp[num-1]
238
-
239
- ldr $tj,[sp] // tp[0]
240
- add $tp,sp,#8
241
- ldr $aj,[$rp],#8 // rp[0]
242
- sub $num,$num,#8 // num--
243
- nop
244
- .Lcond_copy:
245
- sub $num,$num,#8 // num--
246
- csel $nj,$tj,$aj,lo // did it borrow?
247
- ldr $tj,[$tp],#8
248
- ldr $aj,[$rp],#8
249
- str xzr,[$tp,#-16] // wipe tp
250
- str $nj,[$rp,#-16]
251
- cbnz $num,.Lcond_copy
252
-
253
- csel $nj,$tj,$aj,lo
254
- str xzr,[$tp,#-8] // wipe tp
255
- str $nj,[$rp,#-8]
256
-
257
- ldp x19,x20,[x29,#16]
258
- mov sp,x29
259
- ldp x21,x22,[x29,#32]
260
- mov x0,#1
261
- ldp x23,x24,[x29,#48]
262
- ldr x29,[sp],#64
263
- ret
264
- .size bn_mul_mont,.-bn_mul_mont
265
- ___
266
- {
267
- ########################################################################
268
- # Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
269
-
270
- my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
271
- my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
272
- my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
273
- my ($cnt,$carry,$topmost)=("x27","x28","x30");
274
- my ($tp,$ap_end,$na0)=($bp,$np,$carry);
275
-
276
- $code.=<<___;
277
- .type __bn_sqr8x_mont,%function
278
- .align 5
279
- __bn_sqr8x_mont:
280
- cmp $ap,$bp
281
- b.ne __bn_mul4x_mont
282
- .Lsqr8x_mont:
283
- stp x29,x30,[sp,#-128]!
284
- add x29,sp,#0
285
- stp x19,x20,[sp,#16]
286
- stp x21,x22,[sp,#32]
287
- stp x23,x24,[sp,#48]
288
- stp x25,x26,[sp,#64]
289
- stp x27,x28,[sp,#80]
290
- stp $rp,$np,[sp,#96] // offload rp and np
291
-
292
- ldp $a0,$a1,[$ap,#8*0]
293
- ldp $a2,$a3,[$ap,#8*2]
294
- ldp $a4,$a5,[$ap,#8*4]
295
- ldp $a6,$a7,[$ap,#8*6]
296
-
297
- sub $tp,sp,$num,lsl#4
298
- lsl $num,$num,#3
299
- ldr $n0,[$n0] // *n0
300
- mov sp,$tp // alloca
301
- sub $cnt,$num,#8*8
302
- b .Lsqr8x_zero_start
303
-
304
- .Lsqr8x_zero:
305
- sub $cnt,$cnt,#8*8
306
- stp xzr,xzr,[$tp,#8*0]
307
- stp xzr,xzr,[$tp,#8*2]
308
- stp xzr,xzr,[$tp,#8*4]
309
- stp xzr,xzr,[$tp,#8*6]
310
- .Lsqr8x_zero_start:
311
- stp xzr,xzr,[$tp,#8*8]
312
- stp xzr,xzr,[$tp,#8*10]
313
- stp xzr,xzr,[$tp,#8*12]
314
- stp xzr,xzr,[$tp,#8*14]
315
- add $tp,$tp,#8*16
316
- cbnz $cnt,.Lsqr8x_zero
317
-
318
- add $ap_end,$ap,$num
319
- add $ap,$ap,#8*8
320
- mov $acc0,xzr
321
- mov $acc1,xzr
322
- mov $acc2,xzr
323
- mov $acc3,xzr
324
- mov $acc4,xzr
325
- mov $acc5,xzr
326
- mov $acc6,xzr
327
- mov $acc7,xzr
328
- mov $tp,sp
329
- str $n0,[x29,#112] // offload n0
330
-
331
- // Multiply everything but a[i]*a[i]
332
- .align 4
333
- .Lsqr8x_outer_loop:
334
- // a[1]a[0] (i)
335
- // a[2]a[0]
336
- // a[3]a[0]
337
- // a[4]a[0]
338
- // a[5]a[0]
339
- // a[6]a[0]
340
- // a[7]a[0]
341
- // a[2]a[1] (ii)
342
- // a[3]a[1]
343
- // a[4]a[1]
344
- // a[5]a[1]
345
- // a[6]a[1]
346
- // a[7]a[1]
347
- // a[3]a[2] (iii)
348
- // a[4]a[2]
349
- // a[5]a[2]
350
- // a[6]a[2]
351
- // a[7]a[2]
352
- // a[4]a[3] (iv)
353
- // a[5]a[3]
354
- // a[6]a[3]
355
- // a[7]a[3]
356
- // a[5]a[4] (v)
357
- // a[6]a[4]
358
- // a[7]a[4]
359
- // a[6]a[5] (vi)
360
- // a[7]a[5]
361
- // a[7]a[6] (vii)
362
-
363
- mul $t0,$a1,$a0 // lo(a[1..7]*a[0]) (i)
364
- mul $t1,$a2,$a0
365
- mul $t2,$a3,$a0
366
- mul $t3,$a4,$a0
367
- adds $acc1,$acc1,$t0 // t[1]+lo(a[1]*a[0])
368
- mul $t0,$a5,$a0
369
- adcs $acc2,$acc2,$t1
370
- mul $t1,$a6,$a0
371
- adcs $acc3,$acc3,$t2
372
- mul $t2,$a7,$a0
373
- adcs $acc4,$acc4,$t3
374
- umulh $t3,$a1,$a0 // hi(a[1..7]*a[0])
375
- adcs $acc5,$acc5,$t0
376
- umulh $t0,$a2,$a0
377
- adcs $acc6,$acc6,$t1
378
- umulh $t1,$a3,$a0
379
- adcs $acc7,$acc7,$t2
380
- umulh $t2,$a4,$a0
381
- stp $acc0,$acc1,[$tp],#8*2 // t[0..1]
382
- adc $acc0,xzr,xzr // t[8]
383
- adds $acc2,$acc2,$t3 // t[2]+lo(a[1]*a[0])
384
- umulh $t3,$a5,$a0
385
- adcs $acc3,$acc3,$t0
386
- umulh $t0,$a6,$a0
387
- adcs $acc4,$acc4,$t1
388
- umulh $t1,$a7,$a0
389
- adcs $acc5,$acc5,$t2
390
- mul $t2,$a2,$a1 // lo(a[2..7]*a[1]) (ii)
391
- adcs $acc6,$acc6,$t3
392
- mul $t3,$a3,$a1
393
- adcs $acc7,$acc7,$t0
394
- mul $t0,$a4,$a1
395
- adc $acc0,$acc0,$t1
396
-
397
- mul $t1,$a5,$a1
398
- adds $acc3,$acc3,$t2
399
- mul $t2,$a6,$a1
400
- adcs $acc4,$acc4,$t3
401
- mul $t3,$a7,$a1
402
- adcs $acc5,$acc5,$t0
403
- umulh $t0,$a2,$a1 // hi(a[2..7]*a[1])
404
- adcs $acc6,$acc6,$t1
405
- umulh $t1,$a3,$a1
406
- adcs $acc7,$acc7,$t2
407
- umulh $t2,$a4,$a1
408
- adcs $acc0,$acc0,$t3
409
- umulh $t3,$a5,$a1
410
- stp $acc2,$acc3,[$tp],#8*2 // t[2..3]
411
- adc $acc1,xzr,xzr // t[9]
412
- adds $acc4,$acc4,$t0
413
- umulh $t0,$a6,$a1
414
- adcs $acc5,$acc5,$t1
415
- umulh $t1,$a7,$a1
416
- adcs $acc6,$acc6,$t2
417
- mul $t2,$a3,$a2 // lo(a[3..7]*a[2]) (iii)
418
- adcs $acc7,$acc7,$t3
419
- mul $t3,$a4,$a2
420
- adcs $acc0,$acc0,$t0
421
- mul $t0,$a5,$a2
422
- adc $acc1,$acc1,$t1
423
-
424
- mul $t1,$a6,$a2
425
- adds $acc5,$acc5,$t2
426
- mul $t2,$a7,$a2
427
- adcs $acc6,$acc6,$t3
428
- umulh $t3,$a3,$a2 // hi(a[3..7]*a[2])
429
- adcs $acc7,$acc7,$t0
430
- umulh $t0,$a4,$a2
431
- adcs $acc0,$acc0,$t1
432
- umulh $t1,$a5,$a2
433
- adcs $acc1,$acc1,$t2
434
- umulh $t2,$a6,$a2
435
- stp $acc4,$acc5,[$tp],#8*2 // t[4..5]
436
- adc $acc2,xzr,xzr // t[10]
437
- adds $acc6,$acc6,$t3
438
- umulh $t3,$a7,$a2
439
- adcs $acc7,$acc7,$t0
440
- mul $t0,$a4,$a3 // lo(a[4..7]*a[3]) (iv)
441
- adcs $acc0,$acc0,$t1
442
- mul $t1,$a5,$a3
443
- adcs $acc1,$acc1,$t2
444
- mul $t2,$a6,$a3
445
- adc $acc2,$acc2,$t3
446
-
447
- mul $t3,$a7,$a3
448
- adds $acc7,$acc7,$t0
449
- umulh $t0,$a4,$a3 // hi(a[4..7]*a[3])
450
- adcs $acc0,$acc0,$t1
451
- umulh $t1,$a5,$a3
452
- adcs $acc1,$acc1,$t2
453
- umulh $t2,$a6,$a3
454
- adcs $acc2,$acc2,$t3
455
- umulh $t3,$a7,$a3
456
- stp $acc6,$acc7,[$tp],#8*2 // t[6..7]
457
- adc $acc3,xzr,xzr // t[11]
458
- adds $acc0,$acc0,$t0
459
- mul $t0,$a5,$a4 // lo(a[5..7]*a[4]) (v)
460
- adcs $acc1,$acc1,$t1
461
- mul $t1,$a6,$a4
462
- adcs $acc2,$acc2,$t2
463
- mul $t2,$a7,$a4
464
- adc $acc3,$acc3,$t3
465
-
466
- umulh $t3,$a5,$a4 // hi(a[5..7]*a[4])
467
- adds $acc1,$acc1,$t0
468
- umulh $t0,$a6,$a4
469
- adcs $acc2,$acc2,$t1
470
- umulh $t1,$a7,$a4
471
- adcs $acc3,$acc3,$t2
472
- mul $t2,$a6,$a5 // lo(a[6..7]*a[5]) (vi)
473
- adc $acc4,xzr,xzr // t[12]
474
- adds $acc2,$acc2,$t3
475
- mul $t3,$a7,$a5
476
- adcs $acc3,$acc3,$t0
477
- umulh $t0,$a6,$a5 // hi(a[6..7]*a[5])
478
- adc $acc4,$acc4,$t1
479
-
480
- umulh $t1,$a7,$a5
481
- adds $acc3,$acc3,$t2
482
- mul $t2,$a7,$a6 // lo(a[7]*a[6]) (vii)
483
- adcs $acc4,$acc4,$t3
484
- umulh $t3,$a7,$a6 // hi(a[7]*a[6])
485
- adc $acc5,xzr,xzr // t[13]
486
- adds $acc4,$acc4,$t0
487
- sub $cnt,$ap_end,$ap // done yet?
488
- adc $acc5,$acc5,$t1
489
-
490
- adds $acc5,$acc5,$t2
491
- sub $t0,$ap_end,$num // rewinded ap
492
- adc $acc6,xzr,xzr // t[14]
493
- add $acc6,$acc6,$t3
494
-
495
- cbz $cnt,.Lsqr8x_outer_break
496
-
497
- mov $n0,$a0
498
- ldp $a0,$a1,[$tp,#8*0]
499
- ldp $a2,$a3,[$tp,#8*2]
500
- ldp $a4,$a5,[$tp,#8*4]
501
- ldp $a6,$a7,[$tp,#8*6]
502
- adds $acc0,$acc0,$a0
503
- adcs $acc1,$acc1,$a1
504
- ldp $a0,$a1,[$ap,#8*0]
505
- adcs $acc2,$acc2,$a2
506
- adcs $acc3,$acc3,$a3
507
- ldp $a2,$a3,[$ap,#8*2]
508
- adcs $acc4,$acc4,$a4
509
- adcs $acc5,$acc5,$a5
510
- ldp $a4,$a5,[$ap,#8*4]
511
- adcs $acc6,$acc6,$a6
512
- mov $rp,$ap
513
- adcs $acc7,xzr,$a7
514
- ldp $a6,$a7,[$ap,#8*6]
515
- add $ap,$ap,#8*8
516
- //adc $carry,xzr,xzr // moved below
517
- mov $cnt,#-8*8
518
-
519
- // a[8]a[0]
520
- // a[9]a[0]
521
- // a[a]a[0]
522
- // a[b]a[0]
523
- // a[c]a[0]
524
- // a[d]a[0]
525
- // a[e]a[0]
526
- // a[f]a[0]
527
- // a[8]a[1]
528
- // a[f]a[1]........................
529
- // a[8]a[2]
530
- // a[f]a[2]........................
531
- // a[8]a[3]
532
- // a[f]a[3]........................
533
- // a[8]a[4]
534
- // a[f]a[4]........................
535
- // a[8]a[5]
536
- // a[f]a[5]........................
537
- // a[8]a[6]
538
- // a[f]a[6]........................
539
- // a[8]a[7]
540
- // a[f]a[7]........................
541
- .Lsqr8x_mul:
542
- mul $t0,$a0,$n0
543
- adc $carry,xzr,xzr // carry bit, modulo-scheduled
544
- mul $t1,$a1,$n0
545
- add $cnt,$cnt,#8
546
- mul $t2,$a2,$n0
547
- mul $t3,$a3,$n0
548
- adds $acc0,$acc0,$t0
549
- mul $t0,$a4,$n0
550
- adcs $acc1,$acc1,$t1
551
- mul $t1,$a5,$n0
552
- adcs $acc2,$acc2,$t2
553
- mul $t2,$a6,$n0
554
- adcs $acc3,$acc3,$t3
555
- mul $t3,$a7,$n0
556
- adcs $acc4,$acc4,$t0
557
- umulh $t0,$a0,$n0
558
- adcs $acc5,$acc5,$t1
559
- umulh $t1,$a1,$n0
560
- adcs $acc6,$acc6,$t2
561
- umulh $t2,$a2,$n0
562
- adcs $acc7,$acc7,$t3
563
- umulh $t3,$a3,$n0
564
- adc $carry,$carry,xzr
565
- str $acc0,[$tp],#8
566
- adds $acc0,$acc1,$t0
567
- umulh $t0,$a4,$n0
568
- adcs $acc1,$acc2,$t1
569
- umulh $t1,$a5,$n0
570
- adcs $acc2,$acc3,$t2
571
- umulh $t2,$a6,$n0
572
- adcs $acc3,$acc4,$t3
573
- umulh $t3,$a7,$n0
574
- ldr $n0,[$rp,$cnt]
575
- adcs $acc4,$acc5,$t0
576
- adcs $acc5,$acc6,$t1
577
- adcs $acc6,$acc7,$t2
578
- adcs $acc7,$carry,$t3
579
- //adc $carry,xzr,xzr // moved above
580
- cbnz $cnt,.Lsqr8x_mul
581
- // note that carry flag is guaranteed
582
- // to be zero at this point
583
- cmp $ap,$ap_end // done yet?
584
- b.eq .Lsqr8x_break
585
-
586
- ldp $a0,$a1,[$tp,#8*0]
587
- ldp $a2,$a3,[$tp,#8*2]
588
- ldp $a4,$a5,[$tp,#8*4]
589
- ldp $a6,$a7,[$tp,#8*6]
590
- adds $acc0,$acc0,$a0
591
- ldr $n0,[$rp,#-8*8]
592
- adcs $acc1,$acc1,$a1
593
- ldp $a0,$a1,[$ap,#8*0]
594
- adcs $acc2,$acc2,$a2
595
- adcs $acc3,$acc3,$a3
596
- ldp $a2,$a3,[$ap,#8*2]
597
- adcs $acc4,$acc4,$a4
598
- adcs $acc5,$acc5,$a5
599
- ldp $a4,$a5,[$ap,#8*4]
600
- adcs $acc6,$acc6,$a6
601
- mov $cnt,#-8*8
602
- adcs $acc7,$acc7,$a7
603
- ldp $a6,$a7,[$ap,#8*6]
604
- add $ap,$ap,#8*8
605
- //adc $carry,xzr,xzr // moved above
606
- b .Lsqr8x_mul
607
-
608
- .align 4
609
- .Lsqr8x_break:
610
- ldp $a0,$a1,[$rp,#8*0]
611
- add $ap,$rp,#8*8
612
- ldp $a2,$a3,[$rp,#8*2]
613
- sub $t0,$ap_end,$ap // is it last iteration?
614
- ldp $a4,$a5,[$rp,#8*4]
615
- sub $t1,$tp,$t0
616
- ldp $a6,$a7,[$rp,#8*6]
617
- cbz $t0,.Lsqr8x_outer_loop
618
-
619
- stp $acc0,$acc1,[$tp,#8*0]
620
- ldp $acc0,$acc1,[$t1,#8*0]
621
- stp $acc2,$acc3,[$tp,#8*2]
622
- ldp $acc2,$acc3,[$t1,#8*2]
623
- stp $acc4,$acc5,[$tp,#8*4]
624
- ldp $acc4,$acc5,[$t1,#8*4]
625
- stp $acc6,$acc7,[$tp,#8*6]
626
- mov $tp,$t1
627
- ldp $acc6,$acc7,[$t1,#8*6]
628
- b .Lsqr8x_outer_loop
629
-
630
- .align 4
631
- .Lsqr8x_outer_break:
632
- // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
633
- ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0]
634
- ldp $t1,$t2,[sp,#8*1]
635
- ldp $a5,$a7,[$t0,#8*2]
636
- add $ap,$t0,#8*4
637
- ldp $t3,$t0,[sp,#8*3]
638
-
639
- stp $acc0,$acc1,[$tp,#8*0]
640
- mul $acc0,$a1,$a1
641
- stp $acc2,$acc3,[$tp,#8*2]
642
- umulh $a1,$a1,$a1
643
- stp $acc4,$acc5,[$tp,#8*4]
644
- mul $a2,$a3,$a3
645
- stp $acc6,$acc7,[$tp,#8*6]
646
- mov $tp,sp
647
- umulh $a3,$a3,$a3
648
- adds $acc1,$a1,$t1,lsl#1
649
- extr $t1,$t2,$t1,#63
650
- sub $cnt,$num,#8*4
651
-
652
- .Lsqr4x_shift_n_add:
653
- adcs $acc2,$a2,$t1
654
- extr $t2,$t3,$t2,#63
655
- sub $cnt,$cnt,#8*4
656
- adcs $acc3,$a3,$t2
657
- ldp $t1,$t2,[$tp,#8*5]
658
- mul $a4,$a5,$a5
659
- ldp $a1,$a3,[$ap],#8*2
660
- umulh $a5,$a5,$a5
661
- mul $a6,$a7,$a7
662
- umulh $a7,$a7,$a7
663
- extr $t3,$t0,$t3,#63
664
- stp $acc0,$acc1,[$tp,#8*0]
665
- adcs $acc4,$a4,$t3
666
- extr $t0,$t1,$t0,#63
667
- stp $acc2,$acc3,[$tp,#8*2]
668
- adcs $acc5,$a5,$t0
669
- ldp $t3,$t0,[$tp,#8*7]
670
- extr $t1,$t2,$t1,#63
671
- adcs $acc6,$a6,$t1
672
- extr $t2,$t3,$t2,#63
673
- adcs $acc7,$a7,$t2
674
- ldp $t1,$t2,[$tp,#8*9]
675
- mul $a0,$a1,$a1
676
- ldp $a5,$a7,[$ap],#8*2
677
- umulh $a1,$a1,$a1
678
- mul $a2,$a3,$a3
679
- umulh $a3,$a3,$a3
680
- stp $acc4,$acc5,[$tp,#8*4]
681
- extr $t3,$t0,$t3,#63
682
- stp $acc6,$acc7,[$tp,#8*6]
683
- add $tp,$tp,#8*8
684
- adcs $acc0,$a0,$t3
685
- extr $t0,$t1,$t0,#63
686
- adcs $acc1,$a1,$t0
687
- ldp $t3,$t0,[$tp,#8*3]
688
- extr $t1,$t2,$t1,#63
689
- cbnz $cnt,.Lsqr4x_shift_n_add
690
- ___
691
- my ($np,$np_end)=($ap,$ap_end);
692
- $code.=<<___;
693
- ldp $np,$n0,[x29,#104] // pull np and n0
694
-
695
- adcs $acc2,$a2,$t1
696
- extr $t2,$t3,$t2,#63
697
- adcs $acc3,$a3,$t2
698
- ldp $t1,$t2,[$tp,#8*5]
699
- mul $a4,$a5,$a5
700
- umulh $a5,$a5,$a5
701
- stp $acc0,$acc1,[$tp,#8*0]
702
- mul $a6,$a7,$a7
703
- umulh $a7,$a7,$a7
704
- stp $acc2,$acc3,[$tp,#8*2]
705
- extr $t3,$t0,$t3,#63
706
- adcs $acc4,$a4,$t3
707
- extr $t0,$t1,$t0,#63
708
- ldp $acc0,$acc1,[sp,#8*0]
709
- adcs $acc5,$a5,$t0
710
- extr $t1,$t2,$t1,#63
711
- ldp $a0,$a1,[$np,#8*0]
712
- adcs $acc6,$a6,$t1
713
- extr $t2,xzr,$t2,#63
714
- ldp $a2,$a3,[$np,#8*2]
715
- adc $acc7,$a7,$t2
716
- ldp $a4,$a5,[$np,#8*4]
717
-
718
- // Reduce by 512 bits per iteration
719
- mul $na0,$n0,$acc0 // t[0]*n0
720
- ldp $a6,$a7,[$np,#8*6]
721
- add $np_end,$np,$num
722
- ldp $acc2,$acc3,[sp,#8*2]
723
- stp $acc4,$acc5,[$tp,#8*4]
724
- ldp $acc4,$acc5,[sp,#8*4]
725
- stp $acc6,$acc7,[$tp,#8*6]
726
- ldp $acc6,$acc7,[sp,#8*6]
727
- add $np,$np,#8*8
728
- mov $topmost,xzr // initial top-most carry
729
- mov $tp,sp
730
- mov $cnt,#8
731
-
732
- .Lsqr8x_reduction:
733
- // (*) mul $t0,$a0,$na0 // lo(n[0-7])*lo(t[0]*n0)
734
- mul $t1,$a1,$na0
735
- sub $cnt,$cnt,#1
736
- mul $t2,$a2,$na0
737
- str $na0,[$tp],#8 // put aside t[0]*n0 for tail processing
738
- mul $t3,$a3,$na0
739
- // (*) adds xzr,$acc0,$t0
740
- subs xzr,$acc0,#1 // (*)
741
- mul $t0,$a4,$na0
742
- adcs $acc0,$acc1,$t1
743
- mul $t1,$a5,$na0
744
- adcs $acc1,$acc2,$t2
745
- mul $t2,$a6,$na0
746
- adcs $acc2,$acc3,$t3
747
- mul $t3,$a7,$na0
748
- adcs $acc3,$acc4,$t0
749
- umulh $t0,$a0,$na0 // hi(n[0-7])*lo(t[0]*n0)
750
- adcs $acc4,$acc5,$t1
751
- umulh $t1,$a1,$na0
752
- adcs $acc5,$acc6,$t2
753
- umulh $t2,$a2,$na0
754
- adcs $acc6,$acc7,$t3
755
- umulh $t3,$a3,$na0
756
- adc $acc7,xzr,xzr
757
- adds $acc0,$acc0,$t0
758
- umulh $t0,$a4,$na0
759
- adcs $acc1,$acc1,$t1
760
- umulh $t1,$a5,$na0
761
- adcs $acc2,$acc2,$t2
762
- umulh $t2,$a6,$na0
763
- adcs $acc3,$acc3,$t3
764
- umulh $t3,$a7,$na0
765
- mul $na0,$n0,$acc0 // next t[0]*n0
766
- adcs $acc4,$acc4,$t0
767
- adcs $acc5,$acc5,$t1
768
- adcs $acc6,$acc6,$t2
769
- adc $acc7,$acc7,$t3
770
- cbnz $cnt,.Lsqr8x_reduction
771
-
772
- ldp $t0,$t1,[$tp,#8*0]
773
- ldp $t2,$t3,[$tp,#8*2]
774
- mov $rp,$tp
775
- sub $cnt,$np_end,$np // done yet?
776
- adds $acc0,$acc0,$t0
777
- adcs $acc1,$acc1,$t1
778
- ldp $t0,$t1,[$tp,#8*4]
779
- adcs $acc2,$acc2,$t2
780
- adcs $acc3,$acc3,$t3
781
- ldp $t2,$t3,[$tp,#8*6]
782
- adcs $acc4,$acc4,$t0
783
- adcs $acc5,$acc5,$t1
784
- adcs $acc6,$acc6,$t2
785
- adcs $acc7,$acc7,$t3
786
- //adc $carry,xzr,xzr // moved below
787
- cbz $cnt,.Lsqr8x8_post_condition
788
-
789
- ldr $n0,[$tp,#-8*8]
790
- ldp $a0,$a1,[$np,#8*0]
791
- ldp $a2,$a3,[$np,#8*2]
792
- ldp $a4,$a5,[$np,#8*4]
793
- mov $cnt,#-8*8
794
- ldp $a6,$a7,[$np,#8*6]
795
- add $np,$np,#8*8
796
-
797
- .Lsqr8x_tail:
798
- mul $t0,$a0,$n0
799
- adc $carry,xzr,xzr // carry bit, modulo-scheduled
800
- mul $t1,$a1,$n0
801
- add $cnt,$cnt,#8
802
- mul $t2,$a2,$n0
803
- mul $t3,$a3,$n0
804
- adds $acc0,$acc0,$t0
805
- mul $t0,$a4,$n0
806
- adcs $acc1,$acc1,$t1
807
- mul $t1,$a5,$n0
808
- adcs $acc2,$acc2,$t2
809
- mul $t2,$a6,$n0
810
- adcs $acc3,$acc3,$t3
811
- mul $t3,$a7,$n0
812
- adcs $acc4,$acc4,$t0
813
- umulh $t0,$a0,$n0
814
- adcs $acc5,$acc5,$t1
815
- umulh $t1,$a1,$n0
816
- adcs $acc6,$acc6,$t2
817
- umulh $t2,$a2,$n0
818
- adcs $acc7,$acc7,$t3
819
- umulh $t3,$a3,$n0
820
- adc $carry,$carry,xzr
821
- str $acc0,[$tp],#8
822
- adds $acc0,$acc1,$t0
823
- umulh $t0,$a4,$n0
824
- adcs $acc1,$acc2,$t1
825
- umulh $t1,$a5,$n0
826
- adcs $acc2,$acc3,$t2
827
- umulh $t2,$a6,$n0
828
- adcs $acc3,$acc4,$t3
829
- umulh $t3,$a7,$n0
830
- ldr $n0,[$rp,$cnt]
831
- adcs $acc4,$acc5,$t0
832
- adcs $acc5,$acc6,$t1
833
- adcs $acc6,$acc7,$t2
834
- adcs $acc7,$carry,$t3
835
- //adc $carry,xzr,xzr // moved above
836
- cbnz $cnt,.Lsqr8x_tail
837
- // note that carry flag is guaranteed
838
- // to be zero at this point
839
- ldp $a0,$a1,[$tp,#8*0]
840
- sub $cnt,$np_end,$np // done yet?
841
- sub $t2,$np_end,$num // rewinded np
842
- ldp $a2,$a3,[$tp,#8*2]
843
- ldp $a4,$a5,[$tp,#8*4]
844
- ldp $a6,$a7,[$tp,#8*6]
845
- cbz $cnt,.Lsqr8x_tail_break
846
-
847
- ldr $n0,[$rp,#-8*8]
848
- adds $acc0,$acc0,$a0
849
- adcs $acc1,$acc1,$a1
850
- ldp $a0,$a1,[$np,#8*0]
851
- adcs $acc2,$acc2,$a2
852
- adcs $acc3,$acc3,$a3
853
- ldp $a2,$a3,[$np,#8*2]
854
- adcs $acc4,$acc4,$a4
855
- adcs $acc5,$acc5,$a5
856
- ldp $a4,$a5,[$np,#8*4]
857
- adcs $acc6,$acc6,$a6
858
- mov $cnt,#-8*8
859
- adcs $acc7,$acc7,$a7
860
- ldp $a6,$a7,[$np,#8*6]
861
- add $np,$np,#8*8
862
- //adc $carry,xzr,xzr // moved above
863
- b .Lsqr8x_tail
864
-
865
- .align 4
866
- .Lsqr8x_tail_break:
867
- ldr $n0,[x29,#112] // pull n0
868
- add $cnt,$tp,#8*8 // end of current t[num] window
869
-
870
- subs xzr,$topmost,#1 // "move" top-most carry to carry bit
871
- adcs $t0,$acc0,$a0
872
- adcs $t1,$acc1,$a1
873
- ldp $acc0,$acc1,[$rp,#8*0]
874
- adcs $acc2,$acc2,$a2
875
- ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0]
876
- adcs $acc3,$acc3,$a3
877
- ldp $a2,$a3,[$t2,#8*2]
878
- adcs $acc4,$acc4,$a4
879
- adcs $acc5,$acc5,$a5
880
- ldp $a4,$a5,[$t2,#8*4]
881
- adcs $acc6,$acc6,$a6
882
- adcs $acc7,$acc7,$a7
883
- ldp $a6,$a7,[$t2,#8*6]
884
- add $np,$t2,#8*8
885
- adc $topmost,xzr,xzr // top-most carry
886
- mul $na0,$n0,$acc0
887
- stp $t0,$t1,[$tp,#8*0]
888
- stp $acc2,$acc3,[$tp,#8*2]
889
- ldp $acc2,$acc3,[$rp,#8*2]
890
- stp $acc4,$acc5,[$tp,#8*4]
891
- ldp $acc4,$acc5,[$rp,#8*4]
892
- cmp $cnt,x29 // did we hit the bottom?
893
- stp $acc6,$acc7,[$tp,#8*6]
894
- mov $tp,$rp // slide the window
895
- ldp $acc6,$acc7,[$rp,#8*6]
896
- mov $cnt,#8
897
- b.ne .Lsqr8x_reduction
898
-
899
- // Final step. We see if result is larger than modulus, and
900
- // if it is, subtract the modulus. But comparison implies
901
- // subtraction. So we subtract modulus, see if it borrowed,
902
- // and conditionally copy original value.
903
- ldr $rp,[x29,#96] // pull rp
904
- add $tp,$tp,#8*8
905
- subs $t0,$acc0,$a0
906
- sbcs $t1,$acc1,$a1
907
- sub $cnt,$num,#8*8
908
- mov $ap_end,$rp // $rp copy
909
-
910
- .Lsqr8x_sub:
911
- sbcs $t2,$acc2,$a2
912
- ldp $a0,$a1,[$np,#8*0]
913
- sbcs $t3,$acc3,$a3
914
- stp $t0,$t1,[$rp,#8*0]
915
- sbcs $t0,$acc4,$a4
916
- ldp $a2,$a3,[$np,#8*2]
917
- sbcs $t1,$acc5,$a5
918
- stp $t2,$t3,[$rp,#8*2]
919
- sbcs $t2,$acc6,$a6
920
- ldp $a4,$a5,[$np,#8*4]
921
- sbcs $t3,$acc7,$a7
922
- ldp $a6,$a7,[$np,#8*6]
923
- add $np,$np,#8*8
924
- ldp $acc0,$acc1,[$tp,#8*0]
925
- sub $cnt,$cnt,#8*8
926
- ldp $acc2,$acc3,[$tp,#8*2]
927
- ldp $acc4,$acc5,[$tp,#8*4]
928
- ldp $acc6,$acc7,[$tp,#8*6]
929
- add $tp,$tp,#8*8
930
- stp $t0,$t1,[$rp,#8*4]
931
- sbcs $t0,$acc0,$a0
932
- stp $t2,$t3,[$rp,#8*6]
933
- add $rp,$rp,#8*8
934
- sbcs $t1,$acc1,$a1
935
- cbnz $cnt,.Lsqr8x_sub
936
-
937
- sbcs $t2,$acc2,$a2
938
- mov $tp,sp
939
- add $ap,sp,$num
940
- ldp $a0,$a1,[$ap_end,#8*0]
941
- sbcs $t3,$acc3,$a3
942
- stp $t0,$t1,[$rp,#8*0]
943
- sbcs $t0,$acc4,$a4
944
- ldp $a2,$a3,[$ap_end,#8*2]
945
- sbcs $t1,$acc5,$a5
946
- stp $t2,$t3,[$rp,#8*2]
947
- sbcs $t2,$acc6,$a6
948
- ldp $acc0,$acc1,[$ap,#8*0]
949
- sbcs $t3,$acc7,$a7
950
- ldp $acc2,$acc3,[$ap,#8*2]
951
- sbcs xzr,$topmost,xzr // did it borrow?
952
- ldr x30,[x29,#8] // pull return address
953
- stp $t0,$t1,[$rp,#8*4]
954
- stp $t2,$t3,[$rp,#8*6]
955
-
956
- sub $cnt,$num,#8*4
957
- .Lsqr4x_cond_copy:
958
- sub $cnt,$cnt,#8*4
959
- csel $t0,$acc0,$a0,lo
960
- stp xzr,xzr,[$tp,#8*0]
961
- csel $t1,$acc1,$a1,lo
962
- ldp $a0,$a1,[$ap_end,#8*4]
963
- ldp $acc0,$acc1,[$ap,#8*4]
964
- csel $t2,$acc2,$a2,lo
965
- stp xzr,xzr,[$tp,#8*2]
966
- add $tp,$tp,#8*4
967
- csel $t3,$acc3,$a3,lo
968
- ldp $a2,$a3,[$ap_end,#8*6]
969
- ldp $acc2,$acc3,[$ap,#8*6]
970
- add $ap,$ap,#8*4
971
- stp $t0,$t1,[$ap_end,#8*0]
972
- stp $t2,$t3,[$ap_end,#8*2]
973
- add $ap_end,$ap_end,#8*4
974
- stp xzr,xzr,[$ap,#8*0]
975
- stp xzr,xzr,[$ap,#8*2]
976
- cbnz $cnt,.Lsqr4x_cond_copy
977
-
978
- csel $t0,$acc0,$a0,lo
979
- stp xzr,xzr,[$tp,#8*0]
980
- csel $t1,$acc1,$a1,lo
981
- stp xzr,xzr,[$tp,#8*2]
982
- csel $t2,$acc2,$a2,lo
983
- csel $t3,$acc3,$a3,lo
984
- stp $t0,$t1,[$ap_end,#8*0]
985
- stp $t2,$t3,[$ap_end,#8*2]
986
-
987
- b .Lsqr8x_done
988
-
989
- .align 4
990
- .Lsqr8x8_post_condition:
991
- adc $carry,xzr,xzr
992
- ldr x30,[x29,#8] // pull return address
993
- // $acc0-7,$carry hold result, $a0-7 hold modulus
994
- subs $a0,$acc0,$a0
995
- ldr $ap,[x29,#96] // pull rp
996
- sbcs $a1,$acc1,$a1
997
- stp xzr,xzr,[sp,#8*0]
998
- sbcs $a2,$acc2,$a2
999
- stp xzr,xzr,[sp,#8*2]
1000
- sbcs $a3,$acc3,$a3
1001
- stp xzr,xzr,[sp,#8*4]
1002
- sbcs $a4,$acc4,$a4
1003
- stp xzr,xzr,[sp,#8*6]
1004
- sbcs $a5,$acc5,$a5
1005
- stp xzr,xzr,[sp,#8*8]
1006
- sbcs $a6,$acc6,$a6
1007
- stp xzr,xzr,[sp,#8*10]
1008
- sbcs $a7,$acc7,$a7
1009
- stp xzr,xzr,[sp,#8*12]
1010
- sbcs $carry,$carry,xzr // did it borrow?
1011
- stp xzr,xzr,[sp,#8*14]
1012
-
1013
- // $a0-7 hold result-modulus
1014
- csel $a0,$acc0,$a0,lo
1015
- csel $a1,$acc1,$a1,lo
1016
- csel $a2,$acc2,$a2,lo
1017
- csel $a3,$acc3,$a3,lo
1018
- stp $a0,$a1,[$ap,#8*0]
1019
- csel $a4,$acc4,$a4,lo
1020
- csel $a5,$acc5,$a5,lo
1021
- stp $a2,$a3,[$ap,#8*2]
1022
- csel $a6,$acc6,$a6,lo
1023
- csel $a7,$acc7,$a7,lo
1024
- stp $a4,$a5,[$ap,#8*4]
1025
- stp $a6,$a7,[$ap,#8*6]
1026
-
1027
- .Lsqr8x_done:
1028
- ldp x19,x20,[x29,#16]
1029
- mov sp,x29
1030
- ldp x21,x22,[x29,#32]
1031
- mov x0,#1
1032
- ldp x23,x24,[x29,#48]
1033
- ldp x25,x26,[x29,#64]
1034
- ldp x27,x28,[x29,#80]
1035
- ldr x29,[sp],#128
1036
- ret
1037
- .size __bn_sqr8x_mont,.-__bn_sqr8x_mont
1038
- ___
1039
- }
1040
-
1041
- {
1042
- ########################################################################
1043
- # Even though this might look like an ARMv8 adaptation of mulx4x_mont from
1044
- # the x86_64-mont5 module, it is different in the sense that it performs
1045
- # reduction 256 bits at a time.
1046
-
1047
- my ($a0,$a1,$a2,$a3,
1048
- $t0,$t1,$t2,$t3,
1049
- $m0,$m1,$m2,$m3,
1050
- $acc0,$acc1,$acc2,$acc3,$acc4,
1051
- $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28)); # 22 names mapped onto x6-x17 and x19-x28, x18 is skipped
1052
- my $bp_end=$rp; # alias: same register as $rp
1053
- my ($carry,$topmost) = ($rp,"x30"); # $carry also shares $rp's register; $topmost lives in x30, which the routine reloads from the frame before returning
1054
-
1055
- $code.=<<___;
1056
- .type __bn_mul4x_mont,%function
1057
- .align 5
1058
- __bn_mul4x_mont:
1059
- stp x29,x30,[sp,#-128]! // open a 128-byte frame, fp and lr at its base
1060
- add x29,sp,#0 // x29 = frame base, used below to reach the saved values
1061
- stp x19,x20,[sp,#16] // save x19-x28 in the frame
1062
- stp x21,x22,[sp,#32]
1063
- stp x23,x24,[sp,#48]
1064
- stp x25,x26,[sp,#64]
1065
- stp x27,x28,[sp,#80]
1066
-
1067
- sub $tp,sp,$num,lsl#3 // step down by num*8 bytes
1068
- lsl $num,$num,#3 // num is scaled by 8 from here on
1069
- ldr $n0,[$n0] // *n0
1070
- sub sp,$tp,#8*4 // alloca: four more words below that
1071
-
1072
- add $t0,$bp,$num // &b[num]
1073
- add $ap_end,$ap,$num // &a[num], end marker for the walk over a[]
1074
- stp $rp,$t0,[x29,#96] // offload rp and &b[num]
1075
-
1076
- ldr $bi,[$bp,#8*0] // b[0]
1077
- ldp $a0,$a1,[$ap,#8*0] // a[0..3]
1078
- ldp $a2,$a3,[$ap,#8*2]
1079
- add $ap,$ap,#8*4 // ap now points at a[4]
1080
- mov $acc0,xzr // accumulator starts at zero
1081
- mov $acc1,xzr
1082
- mov $acc2,xzr
1083
- mov $acc3,xzr
1084
- ldp $m0,$m1,[$np,#8*0] // n[0..3]
1085
- ldp $m2,$m3,[$np,#8*2]
1086
- adds $np,$np,#8*4 // clear carry bit
1087
- mov $carry,xzr // carry word shares the rp register, which was offloaded above
1088
- mov $cnt,#0 // byte offset of the current b[] word within its group of four
1089
- mov $tp,sp // scratch pointer starts at the bottom of the alloca area
1090
-
1091
- .Loop_mul4x_1st_reduction:
1092
- mul $t0,$a0,$bi // lo(a[0..3]*b[0])
1093
- adc $carry,$carry,xzr // modulo-scheduled: absorbs the carry flag left by the previous pass
1094
- mul $t1,$a1,$bi
1095
- add $cnt,$cnt,#8 // step the byte offset within the current four b[] words
1096
- mul $t2,$a2,$bi
1097
- and $cnt,$cnt,#31 // wraps to 0 after the fourth word, which ends the loop
1098
- mul $t3,$a3,$bi
1099
- adds $acc0,$acc0,$t0
1100
- umulh $t0,$a0,$bi // hi(a[0..3]*b[0])
1101
- adcs $acc1,$acc1,$t1
1102
- mul $mi,$acc0,$n0 // t[0]*n0
1103
- adcs $acc2,$acc2,$t2
1104
- umulh $t1,$a1,$bi
1105
- adcs $acc3,$acc3,$t3
1106
- umulh $t2,$a2,$bi
1107
- adc $acc4,xzr,xzr
1108
- umulh $t3,$a3,$bi
1109
- ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
1110
- adds $acc1,$acc1,$t0
1111
- // (*) mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0)
1112
- str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
1113
- adcs $acc2,$acc2,$t1
1114
- mul $t1,$m1,$mi
1115
- adcs $acc3,$acc3,$t2
1116
- mul $t2,$m2,$mi
1117
- adc $acc4,$acc4,$t3 // can't overflow
1118
- mul $t3,$m3,$mi
1119
- // (*) adds xzr,$acc0,$t0
1120
- subs xzr,$acc0,#1 // (*) stands in for the two (*) lines above: carry is set iff the low accumulator word is non-zero
1121
- umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
1122
- adcs $acc0,$acc1,$t1
1123
- umulh $t1,$m1,$mi
1124
- adcs $acc1,$acc2,$t2
1125
- umulh $t2,$m2,$mi
1126
- adcs $acc2,$acc3,$t3
1127
- umulh $t3,$m3,$mi
1128
- adcs $acc3,$acc4,$carry
1129
- adc $carry,xzr,xzr
1130
- adds $acc0,$acc0,$t0
1131
- sub $t0,$ap_end,$ap // bytes of a[] not yet consumed
1132
- adcs $acc1,$acc1,$t1
1133
- adcs $acc2,$acc2,$t2
1134
- adcs $acc3,$acc3,$t3
1135
- //adc $carry,$carry,xzr
1136
- cbnz $cnt,.Loop_mul4x_1st_reduction
1137
-
1138
- cbz $t0,.Lmul4x4_post_condition // none left: only the final subtraction remains
1139
-
1140
- ldp $a0,$a1,[$ap,#8*0] // a[4..7]
1141
- ldp $a2,$a3,[$ap,#8*2]
1142
- add $ap,$ap,#8*4
1143
- ldr $mi,[sp] // first t[0]*n0 put aside by the reduction loop
1144
- ldp $m0,$m1,[$np,#8*0] // n[4..7]
1145
- ldp $m2,$m3,[$np,#8*2]
1146
- add $np,$np,#8*4
1147
-
1148
- .Loop_mul4x_1st_tail:
1149
- mul $t0,$a0,$bi // lo(a[4..7]*b[i])
1150
- adc $carry,$carry,xzr // modulo-scheduled: absorbs the carry flag left by the previous pass
1151
- mul $t1,$a1,$bi
1152
- add $cnt,$cnt,#8 // step the byte offset within the current four b[] words
1153
- mul $t2,$a2,$bi
1154
- and $cnt,$cnt,#31 // wraps to 0 after the fourth word, which ends the loop
1155
- mul $t3,$a3,$bi
1156
- adds $acc0,$acc0,$t0
1157
- umulh $t0,$a0,$bi // hi(a[4..7]*b[i])
1158
- adcs $acc1,$acc1,$t1
1159
- umulh $t1,$a1,$bi
1160
- adcs $acc2,$acc2,$t2
1161
- umulh $t2,$a2,$bi
1162
- adcs $acc3,$acc3,$t3
1163
- umulh $t3,$a3,$bi
1164
- adc $acc4,xzr,xzr
1165
- ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
1166
- adds $acc1,$acc1,$t0
1167
- mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0)
1168
- adcs $acc2,$acc2,$t1
1169
- mul $t1,$m1,$mi
1170
- adcs $acc3,$acc3,$t2
1171
- mul $t2,$m2,$mi
1172
- adc $acc4,$acc4,$t3 // can't overflow
1173
- mul $t3,$m3,$mi
1174
- adds $acc0,$acc0,$t0
1175
- umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0)
1176
- adcs $acc1,$acc1,$t1
1177
- umulh $t1,$m1,$mi
1178
- adcs $acc2,$acc2,$t2
1179
- umulh $t2,$m2,$mi
1180
- adcs $acc3,$acc3,$t3
1181
- adcs $acc4,$acc4,$carry
1182
- umulh $t3,$m3,$mi
1183
- adc $carry,xzr,xzr
1184
- ldr $mi,[sp,$cnt] // next t[0]*n0
1185
- str $acc0,[$tp],#8 // result!!!
1186
- adds $acc0,$acc1,$t0
1187
- sub $t0,$ap_end,$ap // done yet?
1188
- adcs $acc1,$acc2,$t1
1189
- adcs $acc2,$acc3,$t2
1190
- adcs $acc3,$acc4,$t3
1191
- //adc $carry,$carry,xzr
1192
- cbnz $cnt,.Loop_mul4x_1st_tail
1193
-
1194
- sub $t1,$ap_end,$num // rewound ap, i.e. &a[0]
1195
- cbz $t0,.Lmul4x_proceed // a[] exhausted for this group of b[] words
1196
-
1197
- ldp $a0,$a1,[$ap,#8*0] // otherwise fetch the next four words of a[] and n[]
1198
- ldp $a2,$a3,[$ap,#8*2]
1199
- add $ap,$ap,#8*4
1200
- ldp $m0,$m1,[$np,#8*0]
1201
- ldp $m2,$m3,[$np,#8*2]
1202
- add $np,$np,#8*4
1203
- b .Loop_mul4x_1st_tail
1204
-
1205
- .align 5
1206
- .Lmul4x_proceed:
1207
- ldr $bi,[$bp,#8*4]! // *++b: pre-indexed, so bp itself moves on four words
1208
- adc $topmost,$carry,xzr // top-most carry = carry word plus the pending carry flag
1209
- ldp $a0,$a1,[$t1,#8*0] // a[0..3]
1210
- sub $np,$np,$num // rewind np
1211
- ldp $a2,$a3,[$t1,#8*2]
1212
- add $ap,$t1,#8*4 // ap = &a[4]
1213
-
1214
- stp $acc0,$acc1,[$tp,#8*0] // result!!!
1215
- ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
1216
- stp $acc2,$acc3,[$tp,#8*2] // result!!!
1217
- ldp $acc2,$acc3,[sp,#8*6]
1218
-
1219
- ldp $m0,$m1,[$np,#8*0] // n[0..3]
1220
- mov $tp,sp // restart the scratch pointer
1221
- ldp $m2,$m3,[$np,#8*2]
1222
- adds $np,$np,#8*4 // clear carry bit
1223
- mov $carry,xzr // fresh carry word for the next pass
1224
-
1225
- .align 4
1226
- .Loop_mul4x_reduction:
1227
- mul $t0,$a0,$bi // lo(a[0..3]*b[4])
1228
- adc $carry,$carry,xzr // modulo-scheduled: absorbs the carry flag left by the previous pass
1229
- mul $t1,$a1,$bi
1230
- add $cnt,$cnt,#8 // step the byte offset within the current four b[] words
1231
- mul $t2,$a2,$bi
1232
- and $cnt,$cnt,#31 // wraps to 0 after the fourth word, which ends the loop
1233
- mul $t3,$a3,$bi
1234
- adds $acc0,$acc0,$t0
1235
- umulh $t0,$a0,$bi // hi(a[0..3]*b[4])
1236
- adcs $acc1,$acc1,$t1
1237
- mul $mi,$acc0,$n0 // t[0]*n0
1238
- adcs $acc2,$acc2,$t2
1239
- umulh $t1,$a1,$bi
1240
- adcs $acc3,$acc3,$t3
1241
- umulh $t2,$a2,$bi
1242
- adc $acc4,xzr,xzr
1243
- umulh $t3,$a3,$bi
1244
- ldr $bi,[$bp,$cnt] // next b[i]
1245
- adds $acc1,$acc1,$t0
1246
- // (*) mul $t0,$m0,$mi
1247
- str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
1248
- adcs $acc2,$acc2,$t1
1249
- mul $t1,$m1,$mi // lo(n[0..3]*t[0]*n0)
1250
- adcs $acc3,$acc3,$t2
1251
- mul $t2,$m2,$mi
1252
- adc $acc4,$acc4,$t3 // can't overflow
1253
- mul $t3,$m3,$mi
1254
- // (*) adds xzr,$acc0,$t0
1255
- subs xzr,$acc0,#1 // (*) stands in for the two (*) lines above: carry is set iff the low accumulator word is non-zero
1256
- umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
1257
- adcs $acc0,$acc1,$t1
1258
- umulh $t1,$m1,$mi
1259
- adcs $acc1,$acc2,$t2
1260
- umulh $t2,$m2,$mi
1261
- adcs $acc2,$acc3,$t3
1262
- umulh $t3,$m3,$mi
1263
- adcs $acc3,$acc4,$carry
1264
- adc $carry,xzr,xzr
1265
- adds $acc0,$acc0,$t0
1266
- adcs $acc1,$acc1,$t1
1267
- adcs $acc2,$acc2,$t2
1268
- adcs $acc3,$acc3,$t3
1269
- //adc $carry,$carry,xzr
1270
- cbnz $cnt,.Loop_mul4x_reduction
1271
-
1272
- adc $carry,$carry,xzr // collect the carry flag left by the loop
1273
- ldp $t0,$t1,[$tp,#8*4] // t[4..7]
1274
- ldp $t2,$t3,[$tp,#8*6]
1275
- ldp $a0,$a1,[$ap,#8*0] // a[4..7]
1276
- ldp $a2,$a3,[$ap,#8*2]
1277
- add $ap,$ap,#8*4
1278
- adds $acc0,$acc0,$t0
1279
- adcs $acc1,$acc1,$t1
1280
- adcs $acc2,$acc2,$t2
1281
- adcs $acc3,$acc3,$t3
1282
- //adc $carry,$carry,xzr
1283
-
1284
- ldr $mi,[sp] // t[0]*n0, first of the four put aside above
1285
- ldp $m0,$m1,[$np,#8*0] // n[4..7]
1286
- ldp $m2,$m3,[$np,#8*2]
1287
- add $np,$np,#8*4
1288
-
1289
- .align 4
1290
- .Loop_mul4x_tail:
1291
- mul $t0,$a0,$bi // lo(a[4..7]*b[4])
1292
- adc $carry,$carry,xzr // modulo-scheduled: absorbs the carry flag left by the previous pass
1293
- mul $t1,$a1,$bi
1294
- add $cnt,$cnt,#8 // step the byte offset within the current four b[] words
1295
- mul $t2,$a2,$bi
1296
- and $cnt,$cnt,#31 // wraps to 0 after the fourth word, which ends the loop
1297
- mul $t3,$a3,$bi
1298
- adds $acc0,$acc0,$t0
1299
- umulh $t0,$a0,$bi // hi(a[4..7]*b[4])
1300
- adcs $acc1,$acc1,$t1
1301
- umulh $t1,$a1,$bi
1302
- adcs $acc2,$acc2,$t2
1303
- umulh $t2,$a2,$bi
1304
- adcs $acc3,$acc3,$t3
1305
- umulh $t3,$a3,$bi
1306
- adc $acc4,xzr,xzr
1307
- ldr $bi,[$bp,$cnt] // next b[i]
1308
- adds $acc1,$acc1,$t0
1309
- mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0)
1310
- adcs $acc2,$acc2,$t1
1311
- mul $t1,$m1,$mi
1312
- adcs $acc3,$acc3,$t2
1313
- mul $t2,$m2,$mi
1314
- adc $acc4,$acc4,$t3 // can't overflow
1315
- mul $t3,$m3,$mi
1316
- adds $acc0,$acc0,$t0
1317
- umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0)
1318
- adcs $acc1,$acc1,$t1
1319
- umulh $t1,$m1,$mi
1320
- adcs $acc2,$acc2,$t2
1321
- umulh $t2,$m2,$mi
1322
- adcs $acc3,$acc3,$t3
1323
- umulh $t3,$m3,$mi
1324
- adcs $acc4,$acc4,$carry
1325
- ldr $mi,[sp,$cnt] // next t[0]*n0
1326
- adc $carry,xzr,xzr
1327
- str $acc0,[$tp],#8 // result!!!
1328
- adds $acc0,$acc1,$t0
1329
- sub $t0,$ap_end,$ap // done yet?
1330
- adcs $acc1,$acc2,$t1
1331
- adcs $acc2,$acc3,$t2
1332
- adcs $acc3,$acc4,$t3
1333
- //adc $carry,$carry,xzr
1334
- cbnz $cnt,.Loop_mul4x_tail
1335
-
1336
- sub $t1,$np,$num // rewound np, only meaningful when the branch below is taken (reloaded otherwise)
1337
- adc $carry,$carry,xzr // collect the carry flag left by the loop
1338
- cbz $t0,.Loop_mul4x_break // a[] exhausted
1339
-
1340
- ldp $t0,$t1,[$tp,#8*4] // next four words of t[], a[] and n[]
1341
- ldp $t2,$t3,[$tp,#8*6]
1342
- ldp $a0,$a1,[$ap,#8*0]
1343
- ldp $a2,$a3,[$ap,#8*2]
1344
- add $ap,$ap,#8*4
1345
- adds $acc0,$acc0,$t0
1346
- adcs $acc1,$acc1,$t1
1347
- adcs $acc2,$acc2,$t2
1348
- adcs $acc3,$acc3,$t3
1349
- //adc $carry,$carry,xzr
1350
- ldp $m0,$m1,[$np,#8*0]
1351
- ldp $m2,$m3,[$np,#8*2]
1352
- add $np,$np,#8*4
1353
- b .Loop_mul4x_tail
1354
-
1355
- .align 4
1356
- .Loop_mul4x_break:
1357
- ldp $t2,$t3,[x29,#96] // pull rp and &b[num]
1358
- adds $acc0,$acc0,$topmost // fold in the top-most carry from the previous pass
1359
- add $bp,$bp,#8*4 // bp++ (four words at a time)
1360
- adcs $acc1,$acc1,xzr
1361
- sub $ap,$ap,$num // rewind ap
1362
- adcs $acc2,$acc2,xzr
1363
- stp $acc0,$acc1,[$tp,#8*0] // result!!!
1364
- adcs $acc3,$acc3,xzr
1365
- ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
1366
- adc $topmost,$carry,xzr // new top-most carry
1367
- stp $acc2,$acc3,[$tp,#8*2] // result!!!
1368
- cmp $bp,$t3 // done yet?
1369
- ldp $acc2,$acc3,[sp,#8*6]
1370
- ldp $m0,$m1,[$t1,#8*0] // n[0..3]
1371
- ldp $m2,$m3,[$t1,#8*2]
1372
- add $np,$t1,#8*4 // np = &n[4]
1373
- b.eq .Lmul4x_post // all of b[] consumed
1374
-
1375
- ldr $bi,[$bp]
1376
- ldp $a0,$a1,[$ap,#8*0] // a[0..3]
1377
- ldp $a2,$a3,[$ap,#8*2]
1378
- adds $ap,$ap,#8*4 // clear carry bit
1379
- mov $carry,xzr // fresh carry word for the next pass
1380
- mov $tp,sp // restart the scratch pointer
1381
- b .Loop_mul4x_reduction
1382
-
1383
- .align 4
1384
- .Lmul4x_post:
1385
- // Final step. We see if result is larger than modulus, and
1386
- // if it is, subtract the modulus. But comparison implies
1387
- // subtraction. So we subtract modulus, see if it borrowed,
1388
- // and conditionally copy original value.
1389
- mov $rp,$t2 // rp, as pulled from the frame before branching here
1390
- mov $ap_end,$t2 // $rp copy
1391
- subs $t0,$acc0,$m0
1392
- add $tp,sp,#8*8 // next four t[] words, the first four are already in registers
1393
- sbcs $t1,$acc1,$m1
1394
- sub $cnt,$num,#8*4 // bytes left after the first four words
1395
-
1396
- .Lmul4x_sub:
1397
- sbcs $t2,$acc2,$m2
1398
- ldp $m0,$m1,[$np,#8*0]
1399
- sub $cnt,$cnt,#8*4
1400
- ldp $acc0,$acc1,[$tp,#8*0]
1401
- sbcs $t3,$acc3,$m3
1402
- ldp $m2,$m3,[$np,#8*2]
1403
- add $np,$np,#8*4
1404
- ldp $acc2,$acc3,[$tp,#8*2]
1405
- add $tp,$tp,#8*4
1406
- stp $t0,$t1,[$rp,#8*0] // store the difference through rp
1407
- sbcs $t0,$acc0,$m0
1408
- stp $t2,$t3,[$rp,#8*2]
1409
- add $rp,$rp,#8*4
1410
- sbcs $t1,$acc1,$m1
1411
- cbnz $cnt,.Lmul4x_sub
1412
-
1413
- sbcs $t2,$acc2,$m2
1414
- mov $tp,sp // tp: start of the scratch area to wipe
1415
- add $ap,sp,#8*4 // ap: the unreduced t[] kept on the stack
1416
- ldp $a0,$a1,[$ap_end,#8*0] // read back the difference just stored through rp
1417
- sbcs $t3,$acc3,$m3
1418
- stp $t0,$t1,[$rp,#8*0]
1419
- ldp $a2,$a3,[$ap_end,#8*2]
1420
- stp $t2,$t3,[$rp,#8*2]
1421
- ldp $acc0,$acc1,[$ap,#8*0]
1422
- ldp $acc2,$acc3,[$ap,#8*2]
1423
- sbcs xzr,$topmost,xzr // did it borrow?
1424
- ldr x30,[x29,#8] // pull return address, x30 served as the top-most carry until the line above
1425
-
1426
- sub $cnt,$num,#8*4
1427
- .Lmul4x_cond_copy:
1428
- sub $cnt,$cnt,#8*4
1429
- csel $t0,$acc0,$a0,lo // on borrow (lo) take the unreduced t[], otherwise the difference
1430
- stp xzr,xzr,[$tp,#8*0] // wipe scratch as we go
1431
- csel $t1,$acc1,$a1,lo
1432
- ldp $a0,$a1,[$ap_end,#8*4]
1433
- ldp $acc0,$acc1,[$ap,#8*4]
1434
- csel $t2,$acc2,$a2,lo
1435
- stp xzr,xzr,[$tp,#8*2]
1436
- add $tp,$tp,#8*4
1437
- csel $t3,$acc3,$a3,lo
1438
- ldp $a2,$a3,[$ap_end,#8*6]
1439
- ldp $acc2,$acc3,[$ap,#8*6]
1440
- add $ap,$ap,#8*4
1441
- stp $t0,$t1,[$ap_end,#8*0] // selected words go to the result
1442
- stp $t2,$t3,[$ap_end,#8*2]
1443
- add $ap_end,$ap_end,#8*4
1444
- cbnz $cnt,.Lmul4x_cond_copy
1445
-
1446
- csel $t0,$acc0,$a0,lo // last four words, same selection
1447
- stp xzr,xzr,[$tp,#8*0]
1448
- csel $t1,$acc1,$a1,lo
1449
- stp xzr,xzr,[$tp,#8*2]
1450
- csel $t2,$acc2,$a2,lo
1451
- stp xzr,xzr,[$tp,#8*3] // overlaps the previous store: covers words 3-4
1452
- csel $t3,$acc3,$a3,lo
1453
- stp xzr,xzr,[$tp,#8*4] // covers words 4-5. NOTE(review): words 6-7 of the remaining 8-word window are not written here, confirm that is intended
1454
- stp $t0,$t1,[$ap_end,#8*0]
1455
- stp $t2,$t3,[$ap_end,#8*2]
1456
-
1457
- b .Lmul4x_done
1458
-
1459
- .align 4
1460
- .Lmul4x4_post_condition:
1461
- adc $carry,$carry,xzr // collect the carry flag left by the loop
1462
- ldr $ap,[x29,#96] // pull rp (into a spare register, the rp register is in use as the carry word)
1463
- // $acc0-3,$carry hold result, $m0-3 hold modulus
1464
- subs $a0,$acc0,$m0
1465
- ldr x30,[x29,#8] // pull return address
1466
- sbcs $a1,$acc1,$m1
1467
- stp xzr,xzr,[sp,#8*0] // wipe the eight scratch words, interleaved with the subtraction
1468
- sbcs $a2,$acc2,$m2
1469
- stp xzr,xzr,[sp,#8*2]
1470
- sbcs $a3,$acc3,$m3
1471
- stp xzr,xzr,[sp,#8*4]
1472
- sbcs xzr,$carry,xzr // did it borrow?
1473
- stp xzr,xzr,[sp,#8*6]
1474
-
1475
- // $a0-3 hold result-modulus
1476
- csel $a0,$acc0,$a0,lo // on borrow (lo) keep the unreduced result, otherwise the difference
1477
- csel $a1,$acc1,$a1,lo
1478
- csel $a2,$acc2,$a2,lo
1479
- csel $a3,$acc3,$a3,lo
1480
- stp $a0,$a1,[$ap,#8*0] // write the four result words through the pulled rp
1481
- stp $a2,$a3,[$ap,#8*2]
1482
-
1483
- .Lmul4x_done:
1484
- ldp x19,x20,[x29,#16] // restore x19-x28 from the frame
1485
- mov sp,x29 // release the alloca area
1486
- ldp x21,x22,[x29,#32]
1487
- mov x0,#1 // return value 1
1488
- ldp x23,x24,[x29,#48]
1489
- ldp x25,x26,[x29,#64]
1490
- ldp x27,x28,[x29,#80]
1491
- ldr x29,[sp],#128 // restore fp and pop the 128-byte frame
1492
- ret
1493
- .size __bn_mul4x_mont,.-__bn_mul4x_mont
1494
- ___
1495
- }
1496
- $code.=<<___;
1497
- .asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
1498
- .align 4
1499
- ___
1500
-
1501
- print $code; # emit the accumulated assembly text
1502
-
1503
- close STDOUT or die "error closing STDOUT: $!"; # close flushes buffered output, so a failed write must not pass silently