ring-native 0.0.0 → 0.1.0

Files changed (267)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/CHANGES.md +7 -0
  4. data/Makefile +5 -0
  5. data/README.md +12 -5
  6. data/Rakefile +4 -0
  7. data/ext/ring/extconf.rb +4 -5
  8. data/lib/ring/native.rb +3 -1
  9. data/lib/ring/native/version.rb +5 -1
  10. data/ring-native.gemspec +6 -6
  11. data/vendor/ring-ffi/Cargo.lock +26 -0
  12. data/vendor/ring-ffi/Cargo.toml +45 -0
  13. data/vendor/ring-ffi/LICENSE +16 -0
  14. data/vendor/ring-ffi/README.md +59 -0
  15. data/vendor/ring-ffi/src/lib.rs +79 -0
  16. metadata +10 -255
  17. data/vendor/ring/BUILDING.md +0 -40
  18. data/vendor/ring/Cargo.toml +0 -43
  19. data/vendor/ring/LICENSE +0 -185
  20. data/vendor/ring/Makefile +0 -35
  21. data/vendor/ring/PORTING.md +0 -163
  22. data/vendor/ring/README.md +0 -113
  23. data/vendor/ring/STYLE.md +0 -197
  24. data/vendor/ring/appveyor.yml +0 -27
  25. data/vendor/ring/build.rs +0 -108
  26. data/vendor/ring/crypto/aes/aes.c +0 -1142
  27. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +0 -25
  28. data/vendor/ring/crypto/aes/aes_test.cc +0 -93
  29. data/vendor/ring/crypto/aes/asm/aes-586.pl +0 -2368
  30. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +0 -1249
  31. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +0 -2246
  32. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +0 -1318
  33. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +0 -2084
  34. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +0 -675
  35. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +0 -1364
  36. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +0 -1565
  37. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +0 -841
  38. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +0 -1116
  39. data/vendor/ring/crypto/aes/internal.h +0 -87
  40. data/vendor/ring/crypto/aes/mode_wrappers.c +0 -61
  41. data/vendor/ring/crypto/bn/add.c +0 -394
  42. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +0 -694
  43. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +0 -1503
  44. data/vendor/ring/crypto/bn/asm/bn-586.pl +0 -774
  45. data/vendor/ring/crypto/bn/asm/co-586.pl +0 -287
  46. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +0 -1882
  47. data/vendor/ring/crypto/bn/asm/x86-mont.pl +0 -592
  48. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +0 -599
  49. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +0 -1393
  50. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +0 -3507
  51. data/vendor/ring/crypto/bn/bn.c +0 -352
  52. data/vendor/ring/crypto/bn/bn_asn1.c +0 -74
  53. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +0 -25
  54. data/vendor/ring/crypto/bn/bn_test.cc +0 -1696
  55. data/vendor/ring/crypto/bn/cmp.c +0 -200
  56. data/vendor/ring/crypto/bn/convert.c +0 -433
  57. data/vendor/ring/crypto/bn/ctx.c +0 -311
  58. data/vendor/ring/crypto/bn/div.c +0 -594
  59. data/vendor/ring/crypto/bn/exponentiation.c +0 -1335
  60. data/vendor/ring/crypto/bn/gcd.c +0 -711
  61. data/vendor/ring/crypto/bn/generic.c +0 -1019
  62. data/vendor/ring/crypto/bn/internal.h +0 -316
  63. data/vendor/ring/crypto/bn/montgomery.c +0 -516
  64. data/vendor/ring/crypto/bn/mul.c +0 -888
  65. data/vendor/ring/crypto/bn/prime.c +0 -829
  66. data/vendor/ring/crypto/bn/random.c +0 -334
  67. data/vendor/ring/crypto/bn/rsaz_exp.c +0 -262
  68. data/vendor/ring/crypto/bn/rsaz_exp.h +0 -53
  69. data/vendor/ring/crypto/bn/shift.c +0 -276
  70. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +0 -25
  71. data/vendor/ring/crypto/bytestring/bytestring_test.cc +0 -421
  72. data/vendor/ring/crypto/bytestring/cbb.c +0 -399
  73. data/vendor/ring/crypto/bytestring/cbs.c +0 -227
  74. data/vendor/ring/crypto/bytestring/internal.h +0 -46
  75. data/vendor/ring/crypto/chacha/chacha_generic.c +0 -140
  76. data/vendor/ring/crypto/chacha/chacha_vec.c +0 -323
  77. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +0 -1447
  78. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +0 -153
  79. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +0 -25
  80. data/vendor/ring/crypto/cipher/e_aes.c +0 -390
  81. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +0 -208
  82. data/vendor/ring/crypto/cipher/internal.h +0 -173
  83. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +0 -543
  84. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +0 -9
  85. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +0 -475
  86. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +0 -23
  87. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +0 -422
  88. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +0 -484
  89. data/vendor/ring/crypto/cipher/test/cipher_test.txt +0 -100
  90. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +0 -25
  91. data/vendor/ring/crypto/constant_time_test.c +0 -304
  92. data/vendor/ring/crypto/cpu-arm-asm.S +0 -32
  93. data/vendor/ring/crypto/cpu-arm.c +0 -199
  94. data/vendor/ring/crypto/cpu-intel.c +0 -261
  95. data/vendor/ring/crypto/crypto.c +0 -151
  96. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +0 -2118
  97. data/vendor/ring/crypto/curve25519/curve25519.c +0 -4888
  98. data/vendor/ring/crypto/curve25519/x25519_test.cc +0 -128
  99. data/vendor/ring/crypto/digest/md32_common.h +0 -181
  100. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +0 -2725
  101. data/vendor/ring/crypto/ec/ec.c +0 -193
  102. data/vendor/ring/crypto/ec/ec_curves.c +0 -61
  103. data/vendor/ring/crypto/ec/ec_key.c +0 -228
  104. data/vendor/ring/crypto/ec/ec_montgomery.c +0 -114
  105. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +0 -25
  106. data/vendor/ring/crypto/ec/internal.h +0 -243
  107. data/vendor/ring/crypto/ec/oct.c +0 -253
  108. data/vendor/ring/crypto/ec/p256-64.c +0 -1794
  109. data/vendor/ring/crypto/ec/p256-x86_64-table.h +0 -9548
  110. data/vendor/ring/crypto/ec/p256-x86_64.c +0 -509
  111. data/vendor/ring/crypto/ec/simple.c +0 -1007
  112. data/vendor/ring/crypto/ec/util-64.c +0 -183
  113. data/vendor/ring/crypto/ec/wnaf.c +0 -508
  114. data/vendor/ring/crypto/ecdh/ecdh.c +0 -155
  115. data/vendor/ring/crypto/ecdsa/ecdsa.c +0 -304
  116. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +0 -193
  117. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +0 -25
  118. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +0 -327
  119. data/vendor/ring/crypto/header_removed.h +0 -17
  120. data/vendor/ring/crypto/internal.h +0 -495
  121. data/vendor/ring/crypto/libring.Windows.vcxproj +0 -101
  122. data/vendor/ring/crypto/mem.c +0 -98
  123. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +0 -1045
  124. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +0 -517
  125. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +0 -1393
  126. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +0 -1741
  127. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +0 -422
  128. data/vendor/ring/crypto/modes/ctr.c +0 -226
  129. data/vendor/ring/crypto/modes/gcm.c +0 -1206
  130. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +0 -25
  131. data/vendor/ring/crypto/modes/gcm_test.c +0 -348
  132. data/vendor/ring/crypto/modes/internal.h +0 -299
  133. data/vendor/ring/crypto/perlasm/arm-xlate.pl +0 -170
  134. data/vendor/ring/crypto/perlasm/readme +0 -100
  135. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +0 -1164
  136. data/vendor/ring/crypto/perlasm/x86asm.pl +0 -292
  137. data/vendor/ring/crypto/perlasm/x86gas.pl +0 -263
  138. data/vendor/ring/crypto/perlasm/x86masm.pl +0 -200
  139. data/vendor/ring/crypto/perlasm/x86nasm.pl +0 -187
  140. data/vendor/ring/crypto/poly1305/poly1305.c +0 -331
  141. data/vendor/ring/crypto/poly1305/poly1305_arm.c +0 -301
  142. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +0 -2015
  143. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +0 -25
  144. data/vendor/ring/crypto/poly1305/poly1305_test.cc +0 -80
  145. data/vendor/ring/crypto/poly1305/poly1305_test.txt +0 -52
  146. data/vendor/ring/crypto/poly1305/poly1305_vec.c +0 -892
  147. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +0 -75
  148. data/vendor/ring/crypto/rand/internal.h +0 -32
  149. data/vendor/ring/crypto/rand/rand.c +0 -189
  150. data/vendor/ring/crypto/rand/urandom.c +0 -219
  151. data/vendor/ring/crypto/rand/windows.c +0 -56
  152. data/vendor/ring/crypto/refcount_c11.c +0 -66
  153. data/vendor/ring/crypto/refcount_lock.c +0 -53
  154. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +0 -25
  155. data/vendor/ring/crypto/refcount_test.c +0 -58
  156. data/vendor/ring/crypto/rsa/blinding.c +0 -462
  157. data/vendor/ring/crypto/rsa/internal.h +0 -108
  158. data/vendor/ring/crypto/rsa/padding.c +0 -300
  159. data/vendor/ring/crypto/rsa/rsa.c +0 -450
  160. data/vendor/ring/crypto/rsa/rsa_asn1.c +0 -261
  161. data/vendor/ring/crypto/rsa/rsa_impl.c +0 -944
  162. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +0 -25
  163. data/vendor/ring/crypto/rsa/rsa_test.cc +0 -437
  164. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +0 -436
  165. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +0 -2390
  166. data/vendor/ring/crypto/sha/asm/sha256-586.pl +0 -1275
  167. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +0 -735
  168. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +0 -14
  169. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +0 -14
  170. data/vendor/ring/crypto/sha/asm/sha512-586.pl +0 -911
  171. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +0 -666
  172. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +0 -14
  173. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +0 -14
  174. data/vendor/ring/crypto/sha/sha1.c +0 -271
  175. data/vendor/ring/crypto/sha/sha256.c +0 -204
  176. data/vendor/ring/crypto/sha/sha512.c +0 -355
  177. data/vendor/ring/crypto/test/file_test.cc +0 -326
  178. data/vendor/ring/crypto/test/file_test.h +0 -181
  179. data/vendor/ring/crypto/test/malloc.cc +0 -150
  180. data/vendor/ring/crypto/test/scoped_types.h +0 -95
  181. data/vendor/ring/crypto/test/test.Windows.vcxproj +0 -35
  182. data/vendor/ring/crypto/test/test_util.cc +0 -46
  183. data/vendor/ring/crypto/test/test_util.h +0 -41
  184. data/vendor/ring/crypto/thread_none.c +0 -55
  185. data/vendor/ring/crypto/thread_pthread.c +0 -165
  186. data/vendor/ring/crypto/thread_test.Windows.vcxproj +0 -25
  187. data/vendor/ring/crypto/thread_test.c +0 -200
  188. data/vendor/ring/crypto/thread_win.c +0 -282
  189. data/vendor/ring/examples/checkdigest.rs +0 -103
  190. data/vendor/ring/include/openssl/aes.h +0 -121
  191. data/vendor/ring/include/openssl/arm_arch.h +0 -129
  192. data/vendor/ring/include/openssl/base.h +0 -156
  193. data/vendor/ring/include/openssl/bn.h +0 -794
  194. data/vendor/ring/include/openssl/buffer.h +0 -18
  195. data/vendor/ring/include/openssl/bytestring.h +0 -235
  196. data/vendor/ring/include/openssl/chacha.h +0 -37
  197. data/vendor/ring/include/openssl/cmac.h +0 -76
  198. data/vendor/ring/include/openssl/cpu.h +0 -184
  199. data/vendor/ring/include/openssl/crypto.h +0 -43
  200. data/vendor/ring/include/openssl/curve25519.h +0 -88
  201. data/vendor/ring/include/openssl/ec.h +0 -225
  202. data/vendor/ring/include/openssl/ec_key.h +0 -129
  203. data/vendor/ring/include/openssl/ecdh.h +0 -110
  204. data/vendor/ring/include/openssl/ecdsa.h +0 -156
  205. data/vendor/ring/include/openssl/err.h +0 -201
  206. data/vendor/ring/include/openssl/mem.h +0 -101
  207. data/vendor/ring/include/openssl/obj_mac.h +0 -71
  208. data/vendor/ring/include/openssl/opensslfeatures.h +0 -68
  209. data/vendor/ring/include/openssl/opensslv.h +0 -18
  210. data/vendor/ring/include/openssl/ossl_typ.h +0 -18
  211. data/vendor/ring/include/openssl/poly1305.h +0 -51
  212. data/vendor/ring/include/openssl/rand.h +0 -70
  213. data/vendor/ring/include/openssl/rsa.h +0 -399
  214. data/vendor/ring/include/openssl/thread.h +0 -133
  215. data/vendor/ring/include/openssl/type_check.h +0 -71
  216. data/vendor/ring/mk/Common.props +0 -63
  217. data/vendor/ring/mk/Windows.props +0 -42
  218. data/vendor/ring/mk/WindowsTest.props +0 -18
  219. data/vendor/ring/mk/appveyor.bat +0 -62
  220. data/vendor/ring/mk/bottom_of_makefile.mk +0 -54
  221. data/vendor/ring/mk/ring.mk +0 -266
  222. data/vendor/ring/mk/top_of_makefile.mk +0 -214
  223. data/vendor/ring/mk/travis.sh +0 -40
  224. data/vendor/ring/mk/update-travis-yml.py +0 -229
  225. data/vendor/ring/ring.sln +0 -153
  226. data/vendor/ring/src/aead.rs +0 -682
  227. data/vendor/ring/src/agreement.rs +0 -248
  228. data/vendor/ring/src/c.rs +0 -129
  229. data/vendor/ring/src/constant_time.rs +0 -37
  230. data/vendor/ring/src/der.rs +0 -96
  231. data/vendor/ring/src/digest.rs +0 -690
  232. data/vendor/ring/src/digest_tests.txt +0 -57
  233. data/vendor/ring/src/ecc.rs +0 -28
  234. data/vendor/ring/src/ecc_build.rs +0 -279
  235. data/vendor/ring/src/ecc_curves.rs +0 -117
  236. data/vendor/ring/src/ed25519_tests.txt +0 -2579
  237. data/vendor/ring/src/exe_tests.rs +0 -46
  238. data/vendor/ring/src/ffi.rs +0 -29
  239. data/vendor/ring/src/file_test.rs +0 -187
  240. data/vendor/ring/src/hkdf.rs +0 -153
  241. data/vendor/ring/src/hkdf_tests.txt +0 -59
  242. data/vendor/ring/src/hmac.rs +0 -414
  243. data/vendor/ring/src/hmac_tests.txt +0 -97
  244. data/vendor/ring/src/input.rs +0 -312
  245. data/vendor/ring/src/lib.rs +0 -41
  246. data/vendor/ring/src/pbkdf2.rs +0 -265
  247. data/vendor/ring/src/pbkdf2_tests.txt +0 -113
  248. data/vendor/ring/src/polyfill.rs +0 -57
  249. data/vendor/ring/src/rand.rs +0 -28
  250. data/vendor/ring/src/signature.rs +0 -314
  251. data/vendor/ring/third-party/NIST/README.md +0 -9
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +0 -263
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +0 -309
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +0 -267
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +0 -263
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +0 -309
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +0 -267
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +0 -263
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +0 -309
  260. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +0 -267
  261. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +0 -519
  262. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +0 -309
  263. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +0 -523
  264. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +0 -519
  265. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +0 -309
  266. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +0 -523
  267. data/vendor/ring/third-party/NIST/sha256sums.txt +0 -1
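In short: 0.1.0 drops the fully vendored ring source tree (C sources, Perl assembly generators, headers, and NIST test vectors) and vendors a small ring-ffi crate in its place (Cargo.lock, Cargo.toml, and a 79-line src/lib.rs), built via data/ext/ring/extconf.rb. As a rough sketch of what such a -ffi wrapper crate exposes over the C ABI (the actual src/lib.rs is not expanded on this page, so the `ring_sha256` symbol and the exact usage below are illustrative assumptions, not the gem's real API):

    // Hypothetical sketch of a C-ABI shim in the style of ring-ffi/src/lib.rs.
    // The real file is not shown in this diff; `ring_sha256` is an assumed name.
    extern crate ring;

    use ring::digest;

    /// Writes the 32-byte SHA-256 digest of `data[0..len]` into `out`.
    /// `out` must point to at least 32 writable bytes.
    #[no_mangle]
    pub extern "C" fn ring_sha256(data: *const u8, len: usize, out: *mut u8) {
        let input = unsafe { std::slice::from_raw_parts(data, len) };
        let d = digest::digest(&digest::SHA256, input);
        let bytes = d.as_ref();
        unsafe { std::ptr::copy_nonoverlapping(bytes.as_ptr(), out, bytes.len()) };
    }

On the Ruby side, a binding layer such as data/lib/ring/native.rb would then attach symbols like this with FFI, which is presumably why only a thin wrapper crate needs to be vendored rather than the whole source tree.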
data/vendor/ring/crypto/bn/asm/armv8-mont.pl
@@ -1,1503 +0,0 @@
- #!/usr/bin/env perl
-
- # ====================================================================
- # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
- # project. The module is, however, dual licensed under OpenSSL and
- # CRYPTOGAMS licenses depending on where you obtain it. For further
- # details see http://www.openssl.org/~appro/cryptogams/.
- # ====================================================================
-
- # March 2015
- #
- # "Teaser" Montgomery multiplication module for ARMv8. Needs more
- # work. While it does improve RSA sign performance by 20-30% (less for
- # longer keys) on most processors, for some reason RSA2048 is not
- # faster and RSA4096 goes 15-20% slower on Cortex-A57. Multiplication
- # instruction issue rate is limited on processor in question, meaning
- # that dedicated squaring procedure is a must. Well, actually all
- # contemporary AArch64 processors seem to have limited multiplication
- # issue rate, i.e. they can't issue multiplication every cycle, which
- # explains moderate improvement coefficients in comparison to
- # compiler-generated code. Recall that compiler is instructed to use
- # umulh and therefore uses same amount of multiplication instructions
- # to do the job. Assembly's edge is to minimize number of "collateral"
- # instructions and of course instruction scheduling.
- #
- # April 2015
- #
- # Squaring procedure that handles lengths divisible by 8 improves
- # RSA/DSA performance by 25-40-60% depending on processor and key
- # length. Overall improvement coefficients are always positive in
- # comparison to compiler-generated code. On Cortex-A57 improvement
- # is still modest on longest key lengths, while others exhibit e.g.
- # 50-70% improvement for RSA4096 sign. RSA2048 sign is ~25% faster
- # on Cortex-A57 and ~60-100% faster on others.
-
- $flavour = shift;
- $output = shift;
-
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
- die "can't locate arm-xlate.pl";
-
- open OUT,"| \"$^X\" $xlate $flavour $output";
- *STDOUT=*OUT;
-
- ($lo0,$hi0,$aj,$m0,$alo,$ahi,
- $lo1,$hi1,$nj,$m1,$nlo,$nhi,
- $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);
-
- # int bn_mul_mont(
- $rp="x0"; # BN_ULONG *rp,
- $ap="x1"; # const BN_ULONG *ap,
- $bp="x2"; # const BN_ULONG *bp,
- $np="x3"; # const BN_ULONG *np,
- $n0="x4"; # const BN_ULONG *n0,
- $num="x5"; # int num);
-
- $code.=<<___;
- .text
-
- .globl bn_mul_mont
- .type bn_mul_mont,%function
- .align 5
- bn_mul_mont:
- tst $num,#7
- b.eq __bn_sqr8x_mont
- tst $num,#3
- b.eq __bn_mul4x_mont
- .Lmul_mont:
- stp x29,x30,[sp,#-64]!
- add x29,sp,#0
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
-
- ldr $m0,[$bp],#8 // bp[0]
- sub $tp,sp,$num,lsl#3
- ldp $hi0,$aj,[$ap],#16 // ap[0..1]
- lsl $num,$num,#3
- ldr $n0,[$n0] // *n0
- and $tp,$tp,#-16 // ABI says so
- ldp $hi1,$nj,[$np],#16 // np[0..1]
-
- mul $lo0,$hi0,$m0 // ap[0]*bp[0]
- sub $j,$num,#16 // j=num-2
- umulh $hi0,$hi0,$m0
- mul $alo,$aj,$m0 // ap[1]*bp[0]
- umulh $ahi,$aj,$m0
-
- mul $m1,$lo0,$n0 // "tp[0]"*n0
- mov sp,$tp // alloca
-
- // (*) mul $lo1,$hi1,$m1 // np[0]*m1
- umulh $hi1,$hi1,$m1
- mul $nlo,$nj,$m1 // np[1]*m1
- // (*) adds $lo1,$lo1,$lo0 // discarded
- // (*) As for removal of first multiplication and addition
- // instructions. The outcome of first addition is
- // guaranteed to be zero, which leaves two computationally
- // significant outcomes: it either carries or not. Then
- // question is when does it carry? Is there alternative
- // way to deduce it? If you follow operations, you can
- // observe that condition for carry is quite simple:
- // $lo0 being non-zero. So that carry can be calculated
- // by adding -1 to $lo0. That's what next instruction does.
- subs xzr,$lo0,#1 // (*)
- umulh $nhi,$nj,$m1
- adc $hi1,$hi1,xzr
- cbz $j,.L1st_skip
-
- .L1st:
- ldr $aj,[$ap],#8
- adds $lo0,$alo,$hi0
- sub $j,$j,#8 // j--
- adc $hi0,$ahi,xzr
-
- ldr $nj,[$np],#8
- adds $lo1,$nlo,$hi1
- mul $alo,$aj,$m0 // ap[j]*bp[0]
- adc $hi1,$nhi,xzr
- umulh $ahi,$aj,$m0
-
- adds $lo1,$lo1,$lo0
- mul $nlo,$nj,$m1 // np[j]*m1
- adc $hi1,$hi1,xzr
- umulh $nhi,$nj,$m1
- str $lo1,[$tp],#8 // tp[j-1]
- cbnz $j,.L1st
-
- .L1st_skip:
- adds $lo0,$alo,$hi0
- sub $ap,$ap,$num // rewind $ap
- adc $hi0,$ahi,xzr
-
- adds $lo1,$nlo,$hi1
- sub $np,$np,$num // rewind $np
- adc $hi1,$nhi,xzr
-
- adds $lo1,$lo1,$lo0
- sub $i,$num,#8 // i=num-1
- adcs $hi1,$hi1,$hi0
-
- adc $ovf,xzr,xzr // upmost overflow bit
- stp $lo1,$hi1,[$tp]
-
- .Louter:
- ldr $m0,[$bp],#8 // bp[i]
- ldp $hi0,$aj,[$ap],#16
- ldr $tj,[sp] // tp[0]
- add $tp,sp,#8
-
- mul $lo0,$hi0,$m0 // ap[0]*bp[i]
- sub $j,$num,#16 // j=num-2
- umulh $hi0,$hi0,$m0
- ldp $hi1,$nj,[$np],#16
- mul $alo,$aj,$m0 // ap[1]*bp[i]
- adds $lo0,$lo0,$tj
- umulh $ahi,$aj,$m0
- adc $hi0,$hi0,xzr
-
- mul $m1,$lo0,$n0
- sub $i,$i,#8 // i--
-
- // (*) mul $lo1,$hi1,$m1 // np[0]*m1
- umulh $hi1,$hi1,$m1
- mul $nlo,$nj,$m1 // np[1]*m1
- // (*) adds $lo1,$lo1,$lo0
- subs xzr,$lo0,#1 // (*)
- umulh $nhi,$nj,$m1
- cbz $j,.Linner_skip
-
- .Linner:
- ldr $aj,[$ap],#8
- adc $hi1,$hi1,xzr
- ldr $tj,[$tp],#8 // tp[j]
- adds $lo0,$alo,$hi0
- sub $j,$j,#8 // j--
- adc $hi0,$ahi,xzr
-
- adds $lo1,$nlo,$hi1
- ldr $nj,[$np],#8
- adc $hi1,$nhi,xzr
-
- mul $alo,$aj,$m0 // ap[j]*bp[i]
- adds $lo0,$lo0,$tj
- umulh $ahi,$aj,$m0
- adc $hi0,$hi0,xzr
-
- mul $nlo,$nj,$m1 // np[j]*m1
- adds $lo1,$lo1,$lo0
- umulh $nhi,$nj,$m1
- str $lo1,[$tp,#-16] // tp[j-1]
- cbnz $j,.Linner
-
- .Linner_skip:
- ldr $tj,[$tp],#8 // tp[j]
- adc $hi1,$hi1,xzr
- adds $lo0,$alo,$hi0
- sub $ap,$ap,$num // rewind $ap
- adc $hi0,$ahi,xzr
-
- adds $lo1,$nlo,$hi1
- sub $np,$np,$num // rewind $np
- adcs $hi1,$nhi,$ovf
- adc $ovf,xzr,xzr
-
- adds $lo0,$lo0,$tj
- adc $hi0,$hi0,xzr
-
- adds $lo1,$lo1,$lo0
- adcs $hi1,$hi1,$hi0
- adc $ovf,$ovf,xzr // upmost overflow bit
- stp $lo1,$hi1,[$tp,#-16]
-
- cbnz $i,.Louter
-
- // Final step. We see if result is larger than modulus, and
- // if it is, subtract the modulus. But comparison implies
- // subtraction. So we subtract modulus, see if it borrowed,
- // and conditionally copy original value.
- ldr $tj,[sp] // tp[0]
- add $tp,sp,#8
- ldr $nj,[$np],#8 // np[0]
- subs $j,$num,#8 // j=num-1 and clear borrow
- mov $ap,$rp
- .Lsub:
- sbcs $aj,$tj,$nj // tp[j]-np[j]
- ldr $tj,[$tp],#8
- sub $j,$j,#8 // j--
- ldr $nj,[$np],#8
- str $aj,[$ap],#8 // rp[j]=tp[j]-np[j]
- cbnz $j,.Lsub
-
- sbcs $aj,$tj,$nj
- sbcs $ovf,$ovf,xzr // did it borrow?
- str $aj,[$ap],#8 // rp[num-1]
-
- ldr $tj,[sp] // tp[0]
- add $tp,sp,#8
- ldr $aj,[$rp],#8 // rp[0]
- sub $num,$num,#8 // num--
- nop
- .Lcond_copy:
- sub $num,$num,#8 // num--
- csel $nj,$tj,$aj,lo // did it borrow?
- ldr $tj,[$tp],#8
- ldr $aj,[$rp],#8
- str xzr,[$tp,#-16] // wipe tp
- str $nj,[$rp,#-16]
- cbnz $num,.Lcond_copy
-
- csel $nj,$tj,$aj,lo
- str xzr,[$tp,#-8] // wipe tp
- str $nj,[$rp,#-8]
-
- ldp x19,x20,[x29,#16]
- mov sp,x29
- ldp x21,x22,[x29,#32]
- mov x0,#1
- ldp x23,x24,[x29,#48]
- ldr x29,[sp],#64
- ret
- .size bn_mul_mont,.-bn_mul_mont
- ___
- {
- ########################################################################
- # Following is ARMv8 adaptation of sqrx8x_mont from x86_64-mont5 module.
-
- my ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("x$_",(6..13));
- my ($t0,$t1,$t2,$t3)=map("x$_",(14..17));
- my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26));
- my ($cnt,$carry,$topmost)=("x27","x28","x30");
- my ($tp,$ap_end,$na0)=($bp,$np,$carry);
-
- $code.=<<___;
- .type __bn_sqr8x_mont,%function
- .align 5
- __bn_sqr8x_mont:
- cmp $ap,$bp
- b.ne __bn_mul4x_mont
- .Lsqr8x_mont:
- stp x29,x30,[sp,#-128]!
- add x29,sp,#0
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
- stp $rp,$np,[sp,#96] // offload rp and np
-
- ldp $a0,$a1,[$ap,#8*0]
- ldp $a2,$a3,[$ap,#8*2]
- ldp $a4,$a5,[$ap,#8*4]
- ldp $a6,$a7,[$ap,#8*6]
-
- sub $tp,sp,$num,lsl#4
- lsl $num,$num,#3
- ldr $n0,[$n0] // *n0
- mov sp,$tp // alloca
- sub $cnt,$num,#8*8
- b .Lsqr8x_zero_start
-
- .Lsqr8x_zero:
- sub $cnt,$cnt,#8*8
- stp xzr,xzr,[$tp,#8*0]
- stp xzr,xzr,[$tp,#8*2]
- stp xzr,xzr,[$tp,#8*4]
- stp xzr,xzr,[$tp,#8*6]
- .Lsqr8x_zero_start:
- stp xzr,xzr,[$tp,#8*8]
- stp xzr,xzr,[$tp,#8*10]
- stp xzr,xzr,[$tp,#8*12]
- stp xzr,xzr,[$tp,#8*14]
- add $tp,$tp,#8*16
- cbnz $cnt,.Lsqr8x_zero
-
- add $ap_end,$ap,$num
- add $ap,$ap,#8*8
- mov $acc0,xzr
- mov $acc1,xzr
- mov $acc2,xzr
- mov $acc3,xzr
- mov $acc4,xzr
- mov $acc5,xzr
- mov $acc6,xzr
- mov $acc7,xzr
- mov $tp,sp
- str $n0,[x29,#112] // offload n0
-
- // Multiply everything but a[i]*a[i]
- .align 4
- .Lsqr8x_outer_loop:
- // a[1]a[0] (i)
- // a[2]a[0]
- // a[3]a[0]
- // a[4]a[0]
- // a[5]a[0]
- // a[6]a[0]
- // a[7]a[0]
- // a[2]a[1] (ii)
- // a[3]a[1]
- // a[4]a[1]
- // a[5]a[1]
- // a[6]a[1]
- // a[7]a[1]
- // a[3]a[2] (iii)
- // a[4]a[2]
- // a[5]a[2]
- // a[6]a[2]
- // a[7]a[2]
- // a[4]a[3] (iv)
- // a[5]a[3]
- // a[6]a[3]
- // a[7]a[3]
- // a[5]a[4] (v)
- // a[6]a[4]
- // a[7]a[4]
- // a[6]a[5] (vi)
- // a[7]a[5]
- // a[7]a[6] (vii)
-
- mul $t0,$a1,$a0 // lo(a[1..7]*a[0]) (i)
- mul $t1,$a2,$a0
- mul $t2,$a3,$a0
- mul $t3,$a4,$a0
- adds $acc1,$acc1,$t0 // t[1]+lo(a[1]*a[0])
- mul $t0,$a5,$a0
- adcs $acc2,$acc2,$t1
- mul $t1,$a6,$a0
- adcs $acc3,$acc3,$t2
- mul $t2,$a7,$a0
- adcs $acc4,$acc4,$t3
- umulh $t3,$a1,$a0 // hi(a[1..7]*a[0])
- adcs $acc5,$acc5,$t0
- umulh $t0,$a2,$a0
- adcs $acc6,$acc6,$t1
- umulh $t1,$a3,$a0
- adcs $acc7,$acc7,$t2
- umulh $t2,$a4,$a0
- stp $acc0,$acc1,[$tp],#8*2 // t[0..1]
- adc $acc0,xzr,xzr // t[8]
- adds $acc2,$acc2,$t3 // t[2]+lo(a[1]*a[0])
- umulh $t3,$a5,$a0
- adcs $acc3,$acc3,$t0
- umulh $t0,$a6,$a0
- adcs $acc4,$acc4,$t1
- umulh $t1,$a7,$a0
- adcs $acc5,$acc5,$t2
- mul $t2,$a2,$a1 // lo(a[2..7]*a[1]) (ii)
- adcs $acc6,$acc6,$t3
- mul $t3,$a3,$a1
- adcs $acc7,$acc7,$t0
- mul $t0,$a4,$a1
- adc $acc0,$acc0,$t1
-
- mul $t1,$a5,$a1
- adds $acc3,$acc3,$t2
- mul $t2,$a6,$a1
- adcs $acc4,$acc4,$t3
- mul $t3,$a7,$a1
- adcs $acc5,$acc5,$t0
- umulh $t0,$a2,$a1 // hi(a[2..7]*a[1])
- adcs $acc6,$acc6,$t1
- umulh $t1,$a3,$a1
- adcs $acc7,$acc7,$t2
- umulh $t2,$a4,$a1
- adcs $acc0,$acc0,$t3
- umulh $t3,$a5,$a1
- stp $acc2,$acc3,[$tp],#8*2 // t[2..3]
- adc $acc1,xzr,xzr // t[9]
- adds $acc4,$acc4,$t0
- umulh $t0,$a6,$a1
- adcs $acc5,$acc5,$t1
- umulh $t1,$a7,$a1
- adcs $acc6,$acc6,$t2
- mul $t2,$a3,$a2 // lo(a[3..7]*a[2]) (iii)
- adcs $acc7,$acc7,$t3
- mul $t3,$a4,$a2
- adcs $acc0,$acc0,$t0
- mul $t0,$a5,$a2
- adc $acc1,$acc1,$t1
-
- mul $t1,$a6,$a2
- adds $acc5,$acc5,$t2
- mul $t2,$a7,$a2
- adcs $acc6,$acc6,$t3
- umulh $t3,$a3,$a2 // hi(a[3..7]*a[2])
- adcs $acc7,$acc7,$t0
- umulh $t0,$a4,$a2
- adcs $acc0,$acc0,$t1
- umulh $t1,$a5,$a2
- adcs $acc1,$acc1,$t2
- umulh $t2,$a6,$a2
- stp $acc4,$acc5,[$tp],#8*2 // t[4..5]
- adc $acc2,xzr,xzr // t[10]
- adds $acc6,$acc6,$t3
- umulh $t3,$a7,$a2
- adcs $acc7,$acc7,$t0
- mul $t0,$a4,$a3 // lo(a[4..7]*a[3]) (iv)
- adcs $acc0,$acc0,$t1
- mul $t1,$a5,$a3
- adcs $acc1,$acc1,$t2
- mul $t2,$a6,$a3
- adc $acc2,$acc2,$t3
-
- mul $t3,$a7,$a3
- adds $acc7,$acc7,$t0
- umulh $t0,$a4,$a3 // hi(a[4..7]*a[3])
- adcs $acc0,$acc0,$t1
- umulh $t1,$a5,$a3
- adcs $acc1,$acc1,$t2
- umulh $t2,$a6,$a3
- adcs $acc2,$acc2,$t3
- umulh $t3,$a7,$a3
- stp $acc6,$acc7,[$tp],#8*2 // t[6..7]
- adc $acc3,xzr,xzr // t[11]
- adds $acc0,$acc0,$t0
- mul $t0,$a5,$a4 // lo(a[5..7]*a[4]) (v)
- adcs $acc1,$acc1,$t1
- mul $t1,$a6,$a4
- adcs $acc2,$acc2,$t2
- mul $t2,$a7,$a4
- adc $acc3,$acc3,$t3
-
- umulh $t3,$a5,$a4 // hi(a[5..7]*a[4])
- adds $acc1,$acc1,$t0
- umulh $t0,$a6,$a4
- adcs $acc2,$acc2,$t1
- umulh $t1,$a7,$a4
- adcs $acc3,$acc3,$t2
- mul $t2,$a6,$a5 // lo(a[6..7]*a[5]) (vi)
- adc $acc4,xzr,xzr // t[12]
- adds $acc2,$acc2,$t3
- mul $t3,$a7,$a5
- adcs $acc3,$acc3,$t0
- umulh $t0,$a6,$a5 // hi(a[6..7]*a[5])
- adc $acc4,$acc4,$t1
-
- umulh $t1,$a7,$a5
- adds $acc3,$acc3,$t2
- mul $t2,$a7,$a6 // lo(a[7]*a[6]) (vii)
- adcs $acc4,$acc4,$t3
- umulh $t3,$a7,$a6 // hi(a[7]*a[6])
- adc $acc5,xzr,xzr // t[13]
- adds $acc4,$acc4,$t0
- sub $cnt,$ap_end,$ap // done yet?
- adc $acc5,$acc5,$t1
-
- adds $acc5,$acc5,$t2
- sub $t0,$ap_end,$num // rewinded ap
- adc $acc6,xzr,xzr // t[14]
- add $acc6,$acc6,$t3
-
- cbz $cnt,.Lsqr8x_outer_break
-
- mov $n0,$a0
- ldp $a0,$a1,[$tp,#8*0]
- ldp $a2,$a3,[$tp,#8*2]
- ldp $a4,$a5,[$tp,#8*4]
- ldp $a6,$a7,[$tp,#8*6]
- adds $acc0,$acc0,$a0
- adcs $acc1,$acc1,$a1
- ldp $a0,$a1,[$ap,#8*0]
- adcs $acc2,$acc2,$a2
- adcs $acc3,$acc3,$a3
- ldp $a2,$a3,[$ap,#8*2]
- adcs $acc4,$acc4,$a4
- adcs $acc5,$acc5,$a5
- ldp $a4,$a5,[$ap,#8*4]
- adcs $acc6,$acc6,$a6
- mov $rp,$ap
- adcs $acc7,xzr,$a7
- ldp $a6,$a7,[$ap,#8*6]
- add $ap,$ap,#8*8
- //adc $carry,xzr,xzr // moved below
- mov $cnt,#-8*8
-
- // a[8]a[0]
- // a[9]a[0]
- // a[a]a[0]
- // a[b]a[0]
- // a[c]a[0]
- // a[d]a[0]
- // a[e]a[0]
- // a[f]a[0]
- // a[8]a[1]
- // a[f]a[1]........................
- // a[8]a[2]
- // a[f]a[2]........................
- // a[8]a[3]
- // a[f]a[3]........................
- // a[8]a[4]
- // a[f]a[4]........................
- // a[8]a[5]
- // a[f]a[5]........................
- // a[8]a[6]
- // a[f]a[6]........................
- // a[8]a[7]
- // a[f]a[7]........................
- .Lsqr8x_mul:
- mul $t0,$a0,$n0
- adc $carry,xzr,xzr // carry bit, modulo-scheduled
- mul $t1,$a1,$n0
- add $cnt,$cnt,#8
- mul $t2,$a2,$n0
- mul $t3,$a3,$n0
- adds $acc0,$acc0,$t0
- mul $t0,$a4,$n0
- adcs $acc1,$acc1,$t1
- mul $t1,$a5,$n0
- adcs $acc2,$acc2,$t2
- mul $t2,$a6,$n0
- adcs $acc3,$acc3,$t3
- mul $t3,$a7,$n0
- adcs $acc4,$acc4,$t0
- umulh $t0,$a0,$n0
- adcs $acc5,$acc5,$t1
- umulh $t1,$a1,$n0
- adcs $acc6,$acc6,$t2
- umulh $t2,$a2,$n0
- adcs $acc7,$acc7,$t3
- umulh $t3,$a3,$n0
- adc $carry,$carry,xzr
- str $acc0,[$tp],#8
- adds $acc0,$acc1,$t0
- umulh $t0,$a4,$n0
- adcs $acc1,$acc2,$t1
- umulh $t1,$a5,$n0
- adcs $acc2,$acc3,$t2
- umulh $t2,$a6,$n0
- adcs $acc3,$acc4,$t3
- umulh $t3,$a7,$n0
- ldr $n0,[$rp,$cnt]
- adcs $acc4,$acc5,$t0
- adcs $acc5,$acc6,$t1
- adcs $acc6,$acc7,$t2
- adcs $acc7,$carry,$t3
- //adc $carry,xzr,xzr // moved above
- cbnz $cnt,.Lsqr8x_mul
- // note that carry flag is guaranteed
- // to be zero at this point
- cmp $ap,$ap_end // done yet?
- b.eq .Lsqr8x_break
-
- ldp $a0,$a1,[$tp,#8*0]
- ldp $a2,$a3,[$tp,#8*2]
- ldp $a4,$a5,[$tp,#8*4]
- ldp $a6,$a7,[$tp,#8*6]
- adds $acc0,$acc0,$a0
- ldr $n0,[$rp,#-8*8]
- adcs $acc1,$acc1,$a1
- ldp $a0,$a1,[$ap,#8*0]
- adcs $acc2,$acc2,$a2
- adcs $acc3,$acc3,$a3
- ldp $a2,$a3,[$ap,#8*2]
- adcs $acc4,$acc4,$a4
- adcs $acc5,$acc5,$a5
- ldp $a4,$a5,[$ap,#8*4]
- adcs $acc6,$acc6,$a6
- mov $cnt,#-8*8
- adcs $acc7,$acc7,$a7
- ldp $a6,$a7,[$ap,#8*6]
- add $ap,$ap,#8*8
- //adc $carry,xzr,xzr // moved above
- b .Lsqr8x_mul
-
- .align 4
- .Lsqr8x_break:
- ldp $a0,$a1,[$rp,#8*0]
- add $ap,$rp,#8*8
- ldp $a2,$a3,[$rp,#8*2]
- sub $t0,$ap_end,$ap // is it last iteration?
- ldp $a4,$a5,[$rp,#8*4]
- sub $t1,$tp,$t0
- ldp $a6,$a7,[$rp,#8*6]
- cbz $t0,.Lsqr8x_outer_loop
-
- stp $acc0,$acc1,[$tp,#8*0]
- ldp $acc0,$acc1,[$t1,#8*0]
- stp $acc2,$acc3,[$tp,#8*2]
- ldp $acc2,$acc3,[$t1,#8*2]
- stp $acc4,$acc5,[$tp,#8*4]
- ldp $acc4,$acc5,[$t1,#8*4]
- stp $acc6,$acc7,[$tp,#8*6]
- mov $tp,$t1
- ldp $acc6,$acc7,[$t1,#8*6]
- b .Lsqr8x_outer_loop
-
- .align 4
- .Lsqr8x_outer_break:
- // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
- ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0]
- ldp $t1,$t2,[sp,#8*1]
- ldp $a5,$a7,[$t0,#8*2]
- add $ap,$t0,#8*4
- ldp $t3,$t0,[sp,#8*3]
-
- stp $acc0,$acc1,[$tp,#8*0]
- mul $acc0,$a1,$a1
- stp $acc2,$acc3,[$tp,#8*2]
- umulh $a1,$a1,$a1
- stp $acc4,$acc5,[$tp,#8*4]
- mul $a2,$a3,$a3
- stp $acc6,$acc7,[$tp,#8*6]
- mov $tp,sp
- umulh $a3,$a3,$a3
- adds $acc1,$a1,$t1,lsl#1
- extr $t1,$t2,$t1,#63
- sub $cnt,$num,#8*4
-
- .Lsqr4x_shift_n_add:
- adcs $acc2,$a2,$t1
- extr $t2,$t3,$t2,#63
- sub $cnt,$cnt,#8*4
- adcs $acc3,$a3,$t2
- ldp $t1,$t2,[$tp,#8*5]
- mul $a4,$a5,$a5
- ldp $a1,$a3,[$ap],#8*2
- umulh $a5,$a5,$a5
- mul $a6,$a7,$a7
- umulh $a7,$a7,$a7
- extr $t3,$t0,$t3,#63
- stp $acc0,$acc1,[$tp,#8*0]
- adcs $acc4,$a4,$t3
- extr $t0,$t1,$t0,#63
- stp $acc2,$acc3,[$tp,#8*2]
- adcs $acc5,$a5,$t0
- ldp $t3,$t0,[$tp,#8*7]
- extr $t1,$t2,$t1,#63
- adcs $acc6,$a6,$t1
- extr $t2,$t3,$t2,#63
- adcs $acc7,$a7,$t2
- ldp $t1,$t2,[$tp,#8*9]
- mul $a0,$a1,$a1
- ldp $a5,$a7,[$ap],#8*2
- umulh $a1,$a1,$a1
- mul $a2,$a3,$a3
- umulh $a3,$a3,$a3
- stp $acc4,$acc5,[$tp,#8*4]
- extr $t3,$t0,$t3,#63
- stp $acc6,$acc7,[$tp,#8*6]
- add $tp,$tp,#8*8
- adcs $acc0,$a0,$t3
- extr $t0,$t1,$t0,#63
- adcs $acc1,$a1,$t0
- ldp $t3,$t0,[$tp,#8*3]
- extr $t1,$t2,$t1,#63
- cbnz $cnt,.Lsqr4x_shift_n_add
- ___
- my ($np,$np_end)=($ap,$ap_end);
- $code.=<<___;
- ldp $np,$n0,[x29,#104] // pull np and n0
-
- adcs $acc2,$a2,$t1
- extr $t2,$t3,$t2,#63
- adcs $acc3,$a3,$t2
- ldp $t1,$t2,[$tp,#8*5]
- mul $a4,$a5,$a5
- umulh $a5,$a5,$a5
- stp $acc0,$acc1,[$tp,#8*0]
- mul $a6,$a7,$a7
- umulh $a7,$a7,$a7
- stp $acc2,$acc3,[$tp,#8*2]
- extr $t3,$t0,$t3,#63
- adcs $acc4,$a4,$t3
- extr $t0,$t1,$t0,#63
- ldp $acc0,$acc1,[sp,#8*0]
- adcs $acc5,$a5,$t0
- extr $t1,$t2,$t1,#63
- ldp $a0,$a1,[$np,#8*0]
- adcs $acc6,$a6,$t1
- extr $t2,xzr,$t2,#63
- ldp $a2,$a3,[$np,#8*2]
- adc $acc7,$a7,$t2
- ldp $a4,$a5,[$np,#8*4]
-
- // Reduce by 512 bits per iteration
- mul $na0,$n0,$acc0 // t[0]*n0
- ldp $a6,$a7,[$np,#8*6]
- add $np_end,$np,$num
- ldp $acc2,$acc3,[sp,#8*2]
- stp $acc4,$acc5,[$tp,#8*4]
- ldp $acc4,$acc5,[sp,#8*4]
- stp $acc6,$acc7,[$tp,#8*6]
- ldp $acc6,$acc7,[sp,#8*6]
- add $np,$np,#8*8
- mov $topmost,xzr // initial top-most carry
- mov $tp,sp
- mov $cnt,#8
-
- .Lsqr8x_reduction:
- // (*) mul $t0,$a0,$na0 // lo(n[0-7])*lo(t[0]*n0)
- mul $t1,$a1,$na0
- sub $cnt,$cnt,#1
- mul $t2,$a2,$na0
- str $na0,[$tp],#8 // put aside t[0]*n0 for tail processing
- mul $t3,$a3,$na0
- // (*) adds xzr,$acc0,$t0
- subs xzr,$acc0,#1 // (*)
- mul $t0,$a4,$na0
- adcs $acc0,$acc1,$t1
- mul $t1,$a5,$na0
- adcs $acc1,$acc2,$t2
- mul $t2,$a6,$na0
- adcs $acc2,$acc3,$t3
- mul $t3,$a7,$na0
- adcs $acc3,$acc4,$t0
- umulh $t0,$a0,$na0 // hi(n[0-7])*lo(t[0]*n0)
- adcs $acc4,$acc5,$t1
- umulh $t1,$a1,$na0
- adcs $acc5,$acc6,$t2
- umulh $t2,$a2,$na0
- adcs $acc6,$acc7,$t3
- umulh $t3,$a3,$na0
- adc $acc7,xzr,xzr
- adds $acc0,$acc0,$t0
- umulh $t0,$a4,$na0
- adcs $acc1,$acc1,$t1
- umulh $t1,$a5,$na0
- adcs $acc2,$acc2,$t2
- umulh $t2,$a6,$na0
- adcs $acc3,$acc3,$t3
- umulh $t3,$a7,$na0
- mul $na0,$n0,$acc0 // next t[0]*n0
- adcs $acc4,$acc4,$t0
- adcs $acc5,$acc5,$t1
- adcs $acc6,$acc6,$t2
- adc $acc7,$acc7,$t3
- cbnz $cnt,.Lsqr8x_reduction
-
- ldp $t0,$t1,[$tp,#8*0]
- ldp $t2,$t3,[$tp,#8*2]
- mov $rp,$tp
- sub $cnt,$np_end,$np // done yet?
- adds $acc0,$acc0,$t0
- adcs $acc1,$acc1,$t1
- ldp $t0,$t1,[$tp,#8*4]
- adcs $acc2,$acc2,$t2
- adcs $acc3,$acc3,$t3
- ldp $t2,$t3,[$tp,#8*6]
- adcs $acc4,$acc4,$t0
- adcs $acc5,$acc5,$t1
- adcs $acc6,$acc6,$t2
- adcs $acc7,$acc7,$t3
- //adc $carry,xzr,xzr // moved below
- cbz $cnt,.Lsqr8x8_post_condition
-
- ldr $n0,[$tp,#-8*8]
- ldp $a0,$a1,[$np,#8*0]
- ldp $a2,$a3,[$np,#8*2]
- ldp $a4,$a5,[$np,#8*4]
- mov $cnt,#-8*8
- ldp $a6,$a7,[$np,#8*6]
- add $np,$np,#8*8
-
- .Lsqr8x_tail:
- mul $t0,$a0,$n0
- adc $carry,xzr,xzr // carry bit, modulo-scheduled
- mul $t1,$a1,$n0
- add $cnt,$cnt,#8
- mul $t2,$a2,$n0
- mul $t3,$a3,$n0
- adds $acc0,$acc0,$t0
- mul $t0,$a4,$n0
- adcs $acc1,$acc1,$t1
- mul $t1,$a5,$n0
- adcs $acc2,$acc2,$t2
- mul $t2,$a6,$n0
- adcs $acc3,$acc3,$t3
- mul $t3,$a7,$n0
- adcs $acc4,$acc4,$t0
- umulh $t0,$a0,$n0
- adcs $acc5,$acc5,$t1
- umulh $t1,$a1,$n0
- adcs $acc6,$acc6,$t2
- umulh $t2,$a2,$n0
- adcs $acc7,$acc7,$t3
- umulh $t3,$a3,$n0
- adc $carry,$carry,xzr
- str $acc0,[$tp],#8
- adds $acc0,$acc1,$t0
- umulh $t0,$a4,$n0
- adcs $acc1,$acc2,$t1
- umulh $t1,$a5,$n0
- adcs $acc2,$acc3,$t2
- umulh $t2,$a6,$n0
- adcs $acc3,$acc4,$t3
- umulh $t3,$a7,$n0
- ldr $n0,[$rp,$cnt]
- adcs $acc4,$acc5,$t0
- adcs $acc5,$acc6,$t1
- adcs $acc6,$acc7,$t2
- adcs $acc7,$carry,$t3
- //adc $carry,xzr,xzr // moved above
- cbnz $cnt,.Lsqr8x_tail
- // note that carry flag is guaranteed
- // to be zero at this point
- ldp $a0,$a1,[$tp,#8*0]
- sub $cnt,$np_end,$np // done yet?
- sub $t2,$np_end,$num // rewinded np
- ldp $a2,$a3,[$tp,#8*2]
- ldp $a4,$a5,[$tp,#8*4]
- ldp $a6,$a7,[$tp,#8*6]
- cbz $cnt,.Lsqr8x_tail_break
-
- ldr $n0,[$rp,#-8*8]
- adds $acc0,$acc0,$a0
- adcs $acc1,$acc1,$a1
- ldp $a0,$a1,[$np,#8*0]
- adcs $acc2,$acc2,$a2
- adcs $acc3,$acc3,$a3
- ldp $a2,$a3,[$np,#8*2]
- adcs $acc4,$acc4,$a4
- adcs $acc5,$acc5,$a5
- ldp $a4,$a5,[$np,#8*4]
- adcs $acc6,$acc6,$a6
- mov $cnt,#-8*8
- adcs $acc7,$acc7,$a7
- ldp $a6,$a7,[$np,#8*6]
- add $np,$np,#8*8
- //adc $carry,xzr,xzr // moved above
- b .Lsqr8x_tail
-
- .align 4
- .Lsqr8x_tail_break:
- ldr $n0,[x29,#112] // pull n0
- add $cnt,$tp,#8*8 // end of current t[num] window
-
- subs xzr,$topmost,#1 // "move" top-most carry to carry bit
- adcs $t0,$acc0,$a0
- adcs $t1,$acc1,$a1
- ldp $acc0,$acc1,[$rp,#8*0]
- adcs $acc2,$acc2,$a2
- ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0]
- adcs $acc3,$acc3,$a3
- ldp $a2,$a3,[$t2,#8*2]
- adcs $acc4,$acc4,$a4
- adcs $acc5,$acc5,$a5
- ldp $a4,$a5,[$t2,#8*4]
- adcs $acc6,$acc6,$a6
- adcs $acc7,$acc7,$a7
- ldp $a6,$a7,[$t2,#8*6]
- add $np,$t2,#8*8
- adc $topmost,xzr,xzr // top-most carry
- mul $na0,$n0,$acc0
- stp $t0,$t1,[$tp,#8*0]
- stp $acc2,$acc3,[$tp,#8*2]
- ldp $acc2,$acc3,[$rp,#8*2]
- stp $acc4,$acc5,[$tp,#8*4]
- ldp $acc4,$acc5,[$rp,#8*4]
- cmp $cnt,x29 // did we hit the bottom?
- stp $acc6,$acc7,[$tp,#8*6]
- mov $tp,$rp // slide the window
- ldp $acc6,$acc7,[$rp,#8*6]
- mov $cnt,#8
- b.ne .Lsqr8x_reduction
-
- // Final step. We see if result is larger than modulus, and
- // if it is, subtract the modulus. But comparison implies
- // subtraction. So we subtract modulus, see if it borrowed,
- // and conditionally copy original value.
- ldr $rp,[x29,#96] // pull rp
- add $tp,$tp,#8*8
- subs $t0,$acc0,$a0
- sbcs $t1,$acc1,$a1
- sub $cnt,$num,#8*8
- mov $ap_end,$rp // $rp copy
-
- .Lsqr8x_sub:
- sbcs $t2,$acc2,$a2
- ldp $a0,$a1,[$np,#8*0]
- sbcs $t3,$acc3,$a3
- stp $t0,$t1,[$rp,#8*0]
- sbcs $t0,$acc4,$a4
- ldp $a2,$a3,[$np,#8*2]
- sbcs $t1,$acc5,$a5
- stp $t2,$t3,[$rp,#8*2]
- sbcs $t2,$acc6,$a6
- ldp $a4,$a5,[$np,#8*4]
- sbcs $t3,$acc7,$a7
- ldp $a6,$a7,[$np,#8*6]
- add $np,$np,#8*8
- ldp $acc0,$acc1,[$tp,#8*0]
- sub $cnt,$cnt,#8*8
- ldp $acc2,$acc3,[$tp,#8*2]
- ldp $acc4,$acc5,[$tp,#8*4]
- ldp $acc6,$acc7,[$tp,#8*6]
- add $tp,$tp,#8*8
- stp $t0,$t1,[$rp,#8*4]
- sbcs $t0,$acc0,$a0
- stp $t2,$t3,[$rp,#8*6]
- add $rp,$rp,#8*8
- sbcs $t1,$acc1,$a1
- cbnz $cnt,.Lsqr8x_sub
-
- sbcs $t2,$acc2,$a2
- mov $tp,sp
- add $ap,sp,$num
- ldp $a0,$a1,[$ap_end,#8*0]
- sbcs $t3,$acc3,$a3
- stp $t0,$t1,[$rp,#8*0]
- sbcs $t0,$acc4,$a4
- ldp $a2,$a3,[$ap_end,#8*2]
- sbcs $t1,$acc5,$a5
- stp $t2,$t3,[$rp,#8*2]
- sbcs $t2,$acc6,$a6
- ldp $acc0,$acc1,[$ap,#8*0]
- sbcs $t3,$acc7,$a7
- ldp $acc2,$acc3,[$ap,#8*2]
- sbcs xzr,$topmost,xzr // did it borrow?
- ldr x30,[x29,#8] // pull return address
- stp $t0,$t1,[$rp,#8*4]
- stp $t2,$t3,[$rp,#8*6]
-
- sub $cnt,$num,#8*4
- .Lsqr4x_cond_copy:
- sub $cnt,$cnt,#8*4
- csel $t0,$acc0,$a0,lo
- stp xzr,xzr,[$tp,#8*0]
- csel $t1,$acc1,$a1,lo
- ldp $a0,$a1,[$ap_end,#8*4]
- ldp $acc0,$acc1,[$ap,#8*4]
- csel $t2,$acc2,$a2,lo
- stp xzr,xzr,[$tp,#8*2]
- add $tp,$tp,#8*4
- csel $t3,$acc3,$a3,lo
- ldp $a2,$a3,[$ap_end,#8*6]
- ldp $acc2,$acc3,[$ap,#8*6]
- add $ap,$ap,#8*4
- stp $t0,$t1,[$ap_end,#8*0]
- stp $t2,$t3,[$ap_end,#8*2]
- add $ap_end,$ap_end,#8*4
- stp xzr,xzr,[$ap,#8*0]
- stp xzr,xzr,[$ap,#8*2]
- cbnz $cnt,.Lsqr4x_cond_copy
-
- csel $t0,$acc0,$a0,lo
- stp xzr,xzr,[$tp,#8*0]
- csel $t1,$acc1,$a1,lo
- stp xzr,xzr,[$tp,#8*2]
- csel $t2,$acc2,$a2,lo
- csel $t3,$acc3,$a3,lo
- stp $t0,$t1,[$ap_end,#8*0]
- stp $t2,$t3,[$ap_end,#8*2]
-
- b .Lsqr8x_done
-
- .align 4
- .Lsqr8x8_post_condition:
- adc $carry,xzr,xzr
- ldr x30,[x29,#8] // pull return address
- // $acc0-7,$carry hold result, $a0-7 hold modulus
- subs $a0,$acc0,$a0
- ldr $ap,[x29,#96] // pull rp
- sbcs $a1,$acc1,$a1
- stp xzr,xzr,[sp,#8*0]
- sbcs $a2,$acc2,$a2
- stp xzr,xzr,[sp,#8*2]
- sbcs $a3,$acc3,$a3
- stp xzr,xzr,[sp,#8*4]
- sbcs $a4,$acc4,$a4
- stp xzr,xzr,[sp,#8*6]
- sbcs $a5,$acc5,$a5
- stp xzr,xzr,[sp,#8*8]
- sbcs $a6,$acc6,$a6
- stp xzr,xzr,[sp,#8*10]
- sbcs $a7,$acc7,$a7
- stp xzr,xzr,[sp,#8*12]
- sbcs $carry,$carry,xzr // did it borrow?
- stp xzr,xzr,[sp,#8*14]
-
- // $a0-7 hold result-modulus
- csel $a0,$acc0,$a0,lo
- csel $a1,$acc1,$a1,lo
- csel $a2,$acc2,$a2,lo
- csel $a3,$acc3,$a3,lo
- stp $a0,$a1,[$ap,#8*0]
- csel $a4,$acc4,$a4,lo
- csel $a5,$acc5,$a5,lo
- stp $a2,$a3,[$ap,#8*2]
- csel $a6,$acc6,$a6,lo
- csel $a7,$acc7,$a7,lo
- stp $a4,$a5,[$ap,#8*4]
- stp $a6,$a7,[$ap,#8*6]
-
- .Lsqr8x_done:
- ldp x19,x20,[x29,#16]
- mov sp,x29
- ldp x21,x22,[x29,#32]
- mov x0,#1
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldr x29,[sp],#128
- ret
- .size __bn_sqr8x_mont,.-__bn_sqr8x_mont
- ___
- }
-
- {
- ########################################################################
- # Even though this might look as ARMv8 adaptation of mulx4x_mont from
- # x86_64-mont5 module, it's different in sense that it performs
- # reduction 256 bits at a time.
-
- my ($a0,$a1,$a2,$a3,
- $t0,$t1,$t2,$t3,
- $m0,$m1,$m2,$m3,
- $acc0,$acc1,$acc2,$acc3,$acc4,
- $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28));
- my $bp_end=$rp;
- my ($carry,$topmost) = ($rp,"x30");
-
- $code.=<<___;
- .type __bn_mul4x_mont,%function
- .align 5
- __bn_mul4x_mont:
- stp x29,x30,[sp,#-128]!
- add x29,sp,#0
- stp x19,x20,[sp,#16]
- stp x21,x22,[sp,#32]
- stp x23,x24,[sp,#48]
- stp x25,x26,[sp,#64]
- stp x27,x28,[sp,#80]
-
- sub $tp,sp,$num,lsl#3
- lsl $num,$num,#3
- ldr $n0,[$n0] // *n0
- sub sp,$tp,#8*4 // alloca
-
- add $t0,$bp,$num
- add $ap_end,$ap,$num
- stp $rp,$t0,[x29,#96] // offload rp and &b[num]
-
- ldr $bi,[$bp,#8*0] // b[0]
- ldp $a0,$a1,[$ap,#8*0] // a[0..3]
- ldp $a2,$a3,[$ap,#8*2]
- add $ap,$ap,#8*4
- mov $acc0,xzr
- mov $acc1,xzr
- mov $acc2,xzr
- mov $acc3,xzr
- ldp $m0,$m1,[$np,#8*0] // n[0..3]
- ldp $m2,$m3,[$np,#8*2]
- adds $np,$np,#8*4 // clear carry bit
- mov $carry,xzr
- mov $cnt,#0
- mov $tp,sp
-
- .Loop_mul4x_1st_reduction:
- mul $t0,$a0,$bi // lo(a[0..3]*b[0])
- adc $carry,$carry,xzr // modulo-scheduled
- mul $t1,$a1,$bi
- add $cnt,$cnt,#8
- mul $t2,$a2,$bi
- and $cnt,$cnt,#31
- mul $t3,$a3,$bi
- adds $acc0,$acc0,$t0
- umulh $t0,$a0,$bi // hi(a[0..3]*b[0])
- adcs $acc1,$acc1,$t1
- mul $mi,$acc0,$n0 // t[0]*n0
- adcs $acc2,$acc2,$t2
- umulh $t1,$a1,$bi
- adcs $acc3,$acc3,$t3
- umulh $t2,$a2,$bi
- adc $acc4,xzr,xzr
- umulh $t3,$a3,$bi
- ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
- adds $acc1,$acc1,$t0
- // (*) mul $t0,$m0,$mi // lo(n[0..3]*t[0]*n0)
- str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
- adcs $acc2,$acc2,$t1
- mul $t1,$m1,$mi
- adcs $acc3,$acc3,$t2
- mul $t2,$m2,$mi
- adc $acc4,$acc4,$t3 // can't overflow
- mul $t3,$m3,$mi
- // (*) adds xzr,$acc0,$t0
- subs xzr,$acc0,#1 // (*)
- umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0)
- adcs $acc0,$acc1,$t1
- umulh $t1,$m1,$mi
- adcs $acc1,$acc2,$t2
- umulh $t2,$m2,$mi
- adcs $acc2,$acc3,$t3
- umulh $t3,$m3,$mi
- adcs $acc3,$acc4,$carry
- adc $carry,xzr,xzr
- adds $acc0,$acc0,$t0
- sub $t0,$ap_end,$ap
- adcs $acc1,$acc1,$t1
- adcs $acc2,$acc2,$t2
- adcs $acc3,$acc3,$t3
- //adc $carry,$carry,xzr
- cbnz $cnt,.Loop_mul4x_1st_reduction
-
- cbz $t0,.Lmul4x4_post_condition
-
- ldp $a0,$a1,[$ap,#8*0] // a[4..7]
- ldp $a2,$a3,[$ap,#8*2]
- add $ap,$ap,#8*4
- ldr $mi,[sp] // a[0]*n0
- ldp $m0,$m1,[$np,#8*0] // n[4..7]
- ldp $m2,$m3,[$np,#8*2]
- add $np,$np,#8*4
-
- .Loop_mul4x_1st_tail:
- mul $t0,$a0,$bi // lo(a[4..7]*b[i])
- adc $carry,$carry,xzr // modulo-scheduled
- mul $t1,$a1,$bi
- add $cnt,$cnt,#8
- mul $t2,$a2,$bi
- and $cnt,$cnt,#31
- mul $t3,$a3,$bi
- adds $acc0,$acc0,$t0
- umulh $t0,$a0,$bi // hi(a[4..7]*b[i])
- adcs $acc1,$acc1,$t1
- umulh $t1,$a1,$bi
- adcs $acc2,$acc2,$t2
- umulh $t2,$a2,$bi
- adcs $acc3,$acc3,$t3
- umulh $t3,$a3,$bi
- adc $acc4,xzr,xzr
- ldr $bi,[$bp,$cnt] // next b[i] (or b[0])
- adds $acc1,$acc1,$t0
- mul $t0,$m0,$mi // lo(n[4..7]*a[0]*n0)
- adcs $acc2,$acc2,$t1
- mul $t1,$m1,$mi
- adcs $acc3,$acc3,$t2
- mul $t2,$m2,$mi
- adc $acc4,$acc4,$t3 // can't overflow
- mul $t3,$m3,$mi
- adds $acc0,$acc0,$t0
- umulh $t0,$m0,$mi // hi(n[4..7]*a[0]*n0)
- adcs $acc1,$acc1,$t1
- umulh $t1,$m1,$mi
- adcs $acc2,$acc2,$t2
- umulh $t2,$m2,$mi
- adcs $acc3,$acc3,$t3
- adcs $acc4,$acc4,$carry
- umulh $t3,$m3,$mi
- adc $carry,xzr,xzr
- ldr $mi,[sp,$cnt] // next t[0]*n0
- str $acc0,[$tp],#8 // result!!!
- adds $acc0,$acc1,$t0
- sub $t0,$ap_end,$ap // done yet?
- adcs $acc1,$acc2,$t1
- adcs $acc2,$acc3,$t2
- adcs $acc3,$acc4,$t3
- //adc $carry,$carry,xzr
- cbnz $cnt,.Loop_mul4x_1st_tail
-
- sub $t1,$ap_end,$num // rewinded $ap
- cbz $t0,.Lmul4x_proceed
-
- ldp $a0,$a1,[$ap,#8*0]
- ldp $a2,$a3,[$ap,#8*2]
- add $ap,$ap,#8*4
- ldp $m0,$m1,[$np,#8*0]
- ldp $m2,$m3,[$np,#8*2]
- add $np,$np,#8*4
- b .Loop_mul4x_1st_tail
-
- .align 5
- .Lmul4x_proceed:
- ldr $bi,[$bp,#8*4]! // *++b
- adc $topmost,$carry,xzr
- ldp $a0,$a1,[$t1,#8*0] // a[0..3]
- sub $np,$np,$num // rewind np
- ldp $a2,$a3,[$t1,#8*2]
- add $ap,$t1,#8*4
-
- stp $acc0,$acc1,[$tp,#8*0] // result!!!
- ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
- stp $acc2,$acc3,[$tp,#8*2] // result!!!
- ldp $acc2,$acc3,[sp,#8*6]
-
- ldp $m0,$m1,[$np,#8*0] // n[0..3]
- mov $tp,sp
- ldp $m2,$m3,[$np,#8*2]
- adds $np,$np,#8*4 // clear carry bit
- mov $carry,xzr
-
- .align 4
- .Loop_mul4x_reduction:
- mul $t0,$a0,$bi // lo(a[0..3]*b[4])
- adc $carry,$carry,xzr // modulo-scheduled
- mul $t1,$a1,$bi
- add $cnt,$cnt,#8
- mul $t2,$a2,$bi
- and $cnt,$cnt,#31
- mul $t3,$a3,$bi
- adds $acc0,$acc0,$t0
- umulh $t0,$a0,$bi // hi(a[0..3]*b[4])
- adcs $acc1,$acc1,$t1
- mul $mi,$acc0,$n0 // t[0]*n0
- adcs $acc2,$acc2,$t2
- umulh $t1,$a1,$bi
- adcs $acc3,$acc3,$t3
- umulh $t2,$a2,$bi
- adc $acc4,xzr,xzr
- umulh $t3,$a3,$bi
- ldr $bi,[$bp,$cnt] // next b[i]
- adds $acc1,$acc1,$t0
- // (*) mul $t0,$m0,$mi
- str $mi,[$tp],#8 // put aside t[0]*n0 for tail processing
- adcs $acc2,$acc2,$t1
- mul $t1,$m1,$mi // lo(n[0..3]*t[0]*n0
- adcs $acc3,$acc3,$t2
- mul $t2,$m2,$mi
- adc $acc4,$acc4,$t3 // can't overflow
- mul $t3,$m3,$mi
- // (*) adds xzr,$acc0,$t0
- subs xzr,$acc0,#1 // (*)
- umulh $t0,$m0,$mi // hi(n[0..3]*t[0]*n0
- adcs $acc0,$acc1,$t1
- umulh $t1,$m1,$mi
- adcs $acc1,$acc2,$t2
- umulh $t2,$m2,$mi
- adcs $acc2,$acc3,$t3
- umulh $t3,$m3,$mi
- adcs $acc3,$acc4,$carry
- adc $carry,xzr,xzr
- adds $acc0,$acc0,$t0
- adcs $acc1,$acc1,$t1
- adcs $acc2,$acc2,$t2
- adcs $acc3,$acc3,$t3
- //adc $carry,$carry,xzr
- cbnz $cnt,.Loop_mul4x_reduction
-
- adc $carry,$carry,xzr
- ldp $t0,$t1,[$tp,#8*4] // t[4..7]
- ldp $t2,$t3,[$tp,#8*6]
- ldp $a0,$a1,[$ap,#8*0] // a[4..7]
- ldp $a2,$a3,[$ap,#8*2]
- add $ap,$ap,#8*4
- adds $acc0,$acc0,$t0
- adcs $acc1,$acc1,$t1
- adcs $acc2,$acc2,$t2
- adcs $acc3,$acc3,$t3
- //adc $carry,$carry,xzr
-
- ldr $mi,[sp] // t[0]*n0
- ldp $m0,$m1,[$np,#8*0] // n[4..7]
- ldp $m2,$m3,[$np,#8*2]
- add $np,$np,#8*4
-
- .align 4
- .Loop_mul4x_tail:
- mul $t0,$a0,$bi // lo(a[4..7]*b[4])
- adc $carry,$carry,xzr // modulo-scheduled
- mul $t1,$a1,$bi
- add $cnt,$cnt,#8
- mul $t2,$a2,$bi
- and $cnt,$cnt,#31
- mul $t3,$a3,$bi
- adds $acc0,$acc0,$t0
- umulh $t0,$a0,$bi // hi(a[4..7]*b[4])
- adcs $acc1,$acc1,$t1
- umulh $t1,$a1,$bi
- adcs $acc2,$acc2,$t2
- umulh $t2,$a2,$bi
- adcs $acc3,$acc3,$t3
- umulh $t3,$a3,$bi
- adc $acc4,xzr,xzr
- ldr $bi,[$bp,$cnt] // next b[i]
- adds $acc1,$acc1,$t0
- mul $t0,$m0,$mi // lo(n[4..7]*t[0]*n0)
- adcs $acc2,$acc2,$t1
- mul $t1,$m1,$mi
- adcs $acc3,$acc3,$t2
- mul $t2,$m2,$mi
- adc $acc4,$acc4,$t3 // can't overflow
- mul $t3,$m3,$mi
- adds $acc0,$acc0,$t0
- umulh $t0,$m0,$mi // hi(n[4..7]*t[0]*n0)
- adcs $acc1,$acc1,$t1
- umulh $t1,$m1,$mi
- adcs $acc2,$acc2,$t2
- umulh $t2,$m2,$mi
- adcs $acc3,$acc3,$t3
- umulh $t3,$m3,$mi
- adcs $acc4,$acc4,$carry
- ldr $mi,[sp,$cnt] // next a[0]*n0
- adc $carry,xzr,xzr
- str $acc0,[$tp],#8 // result!!!
- adds $acc0,$acc1,$t0
- sub $t0,$ap_end,$ap // done yet?
- adcs $acc1,$acc2,$t1
- adcs $acc2,$acc3,$t2
- adcs $acc3,$acc4,$t3
- //adc $carry,$carry,xzr
- cbnz $cnt,.Loop_mul4x_tail
-
- sub $t1,$np,$num // rewinded np?
- adc $carry,$carry,xzr
- cbz $t0,.Loop_mul4x_break
-
- ldp $t0,$t1,[$tp,#8*4]
- ldp $t2,$t3,[$tp,#8*6]
- ldp $a0,$a1,[$ap,#8*0]
- ldp $a2,$a3,[$ap,#8*2]
- add $ap,$ap,#8*4
- adds $acc0,$acc0,$t0
- adcs $acc1,$acc1,$t1
- adcs $acc2,$acc2,$t2
- adcs $acc3,$acc3,$t3
- //adc $carry,$carry,xzr
- ldp $m0,$m1,[$np,#8*0]
- ldp $m2,$m3,[$np,#8*2]
- add $np,$np,#8*4
- b .Loop_mul4x_tail
-
- .align 4
- .Loop_mul4x_break:
- ldp $t2,$t3,[x29,#96] // pull rp and &b[num]
- adds $acc0,$acc0,$topmost
- add $bp,$bp,#8*4 // bp++
- adcs $acc1,$acc1,xzr
- sub $ap,$ap,$num // rewind ap
- adcs $acc2,$acc2,xzr
- stp $acc0,$acc1,[$tp,#8*0] // result!!!
- adcs $acc3,$acc3,xzr
- ldp $acc0,$acc1,[sp,#8*4] // t[0..3]
- adc $topmost,$carry,xzr
- stp $acc2,$acc3,[$tp,#8*2] // result!!!
- cmp $bp,$t3 // done yet?
- ldp $acc2,$acc3,[sp,#8*6]
- ldp $m0,$m1,[$t1,#8*0] // n[0..3]
- ldp $m2,$m3,[$t1,#8*2]
- add $np,$t1,#8*4
- b.eq .Lmul4x_post
-
- ldr $bi,[$bp]
- ldp $a0,$a1,[$ap,#8*0] // a[0..3]
- ldp $a2,$a3,[$ap,#8*2]
- adds $ap,$ap,#8*4 // clear carry bit
- mov $carry,xzr
- mov $tp,sp
- b .Loop_mul4x_reduction
-
- .align 4
- .Lmul4x_post:
- // Final step. We see if result is larger than modulus, and
- // if it is, subtract the modulus. But comparison implies
- // subtraction. So we subtract modulus, see if it borrowed,
- // and conditionally copy original value.
- mov $rp,$t2
- mov $ap_end,$t2 // $rp copy
- subs $t0,$acc0,$m0
- add $tp,sp,#8*8
- sbcs $t1,$acc1,$m1
- sub $cnt,$num,#8*4
-
- .Lmul4x_sub:
- sbcs $t2,$acc2,$m2
- ldp $m0,$m1,[$np,#8*0]
- sub $cnt,$cnt,#8*4
- ldp $acc0,$acc1,[$tp,#8*0]
- sbcs $t3,$acc3,$m3
- ldp $m2,$m3,[$np,#8*2]
- add $np,$np,#8*4
- ldp $acc2,$acc3,[$tp,#8*2]
- add $tp,$tp,#8*4
- stp $t0,$t1,[$rp,#8*0]
- sbcs $t0,$acc0,$m0
- stp $t2,$t3,[$rp,#8*2]
- add $rp,$rp,#8*4
- sbcs $t1,$acc1,$m1
- cbnz $cnt,.Lmul4x_sub
-
- sbcs $t2,$acc2,$m2
- mov $tp,sp
- add $ap,sp,#8*4
- ldp $a0,$a1,[$ap_end,#8*0]
- sbcs $t3,$acc3,$m3
- stp $t0,$t1,[$rp,#8*0]
- ldp $a2,$a3,[$ap_end,#8*2]
- stp $t2,$t3,[$rp,#8*2]
- ldp $acc0,$acc1,[$ap,#8*0]
- ldp $acc2,$acc3,[$ap,#8*2]
- sbcs xzr,$topmost,xzr // did it borrow?
- ldr x30,[x29,#8] // pull return address
-
- sub $cnt,$num,#8*4
- .Lmul4x_cond_copy:
- sub $cnt,$cnt,#8*4
- csel $t0,$acc0,$a0,lo
- stp xzr,xzr,[$tp,#8*0]
- csel $t1,$acc1,$a1,lo
- ldp $a0,$a1,[$ap_end,#8*4]
- ldp $acc0,$acc1,[$ap,#8*4]
- csel $t2,$acc2,$a2,lo
- stp xzr,xzr,[$tp,#8*2]
- add $tp,$tp,#8*4
- csel $t3,$acc3,$a3,lo
- ldp $a2,$a3,[$ap_end,#8*6]
- ldp $acc2,$acc3,[$ap,#8*6]
- add $ap,$ap,#8*4
- stp $t0,$t1,[$ap_end,#8*0]
- stp $t2,$t3,[$ap_end,#8*2]
- add $ap_end,$ap_end,#8*4
- cbnz $cnt,.Lmul4x_cond_copy
-
- csel $t0,$acc0,$a0,lo
- stp xzr,xzr,[$tp,#8*0]
- csel $t1,$acc1,$a1,lo
- stp xzr,xzr,[$tp,#8*2]
- csel $t2,$acc2,$a2,lo
- stp xzr,xzr,[$tp,#8*3]
- csel $t3,$acc3,$a3,lo
- stp xzr,xzr,[$tp,#8*4]
- stp $t0,$t1,[$ap_end,#8*0]
- stp $t2,$t3,[$ap_end,#8*2]
-
- b .Lmul4x_done
-
- .align 4
- .Lmul4x4_post_condition:
- adc $carry,$carry,xzr
- ldr $ap,[x29,#96] // pull rp
- // $acc0-3,$carry hold result, $m0-7 hold modulus
- subs $a0,$acc0,$m0
- ldr x30,[x29,#8] // pull return address
- sbcs $a1,$acc1,$m1
- stp xzr,xzr,[sp,#8*0]
- sbcs $a2,$acc2,$m2
- stp xzr,xzr,[sp,#8*2]
- sbcs $a3,$acc3,$m3
- stp xzr,xzr,[sp,#8*4]
- sbcs xzr,$carry,xzr // did it borrow?
- stp xzr,xzr,[sp,#8*6]
-
- // $a0-3 hold result-modulus
- csel $a0,$acc0,$a0,lo
- csel $a1,$acc1,$a1,lo
- csel $a2,$acc2,$a2,lo
- csel $a3,$acc3,$a3,lo
- stp $a0,$a1,[$ap,#8*0]
- stp $a2,$a3,[$ap,#8*2]
-
- .Lmul4x_done:
- ldp x19,x20,[x29,#16]
- mov sp,x29
- ldp x21,x22,[x29,#32]
- mov x0,#1
- ldp x23,x24,[x29,#48]
- ldp x25,x26,[x29,#64]
- ldp x27,x28,[x29,#80]
- ldr x29,[sp],#128
- ret
- .size __bn_mul4x_mont,.-__bn_mul4x_mont
- ___
- }
- $code.=<<___;
- .asciz "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
- .align 4
- ___
-
- print $code;
-
- close STDOUT;
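Two idioms from the deleted module's comments are worth spelling out. First, in each reduction step the initial multiply-and-add of np[0]*m1 into $lo0 is elided: its low word is zero by construction, and the addition carries exactly when $lo0 is non-zero, which `subs xzr,$lo0,#1` reproduces (the subtraction borrows only when $lo0 is zero, so the carry flag ends up set precisely when $lo0 is non-zero). Second, the final comparison with the modulus is branch-free: the modulus is subtracted unconditionally and the borrow then drives csel to keep either the difference or the original value. A small sketch of both ideas in portable Rust (illustrative only; the module emits hand-scheduled AArch64 assembly):

    /// The elided first add: the sum's low word is zero by construction,
    /// so it carries exactly when `lo0` is non-zero. This is the value the
    /// `subs xzr,$lo0,#1` / `adc` pair feeds into the carry chain.
    fn carry_of_discarded_add(lo0: u64) -> u64 {
        (lo0 != 0) as u64
    }

    /// The final step: unconditionally compute t - n, then select t or
    /// t - n from the borrow, as the `.Lsub`/`.Lcond_copy` loops do, so
    /// the executed code path never depends on the values being compared.
    fn final_subtract(t: &[u64], n: &[u64]) -> Vec<u64> {
        let mut diff = vec![0u64; t.len()];
        let mut borrow = 0u64;
        for i in 0..t.len() {
            // 64-bit subtract with borrow, least significant word first.
            let (d1, b1) = t[i].overflowing_sub(n[i]);
            let (d2, b2) = d1.overflowing_sub(borrow);
            diff[i] = d2;
            borrow = (b1 as u64) | (b2 as u64);
        }
        // All-ones mask iff the subtraction borrowed, i.e. t < n: keep t.
        let keep_t = borrow.wrapping_neg();
        t.iter()
            .zip(&diff)
            .map(|(&ti, &di)| (ti & keep_t) | (di & !keep_t))
            .collect()
    }

The data-independent select is what makes the result-vs-modulus comparison constant-time, which matters for RSA private-key operations like the ones this module accelerates.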