ring-native 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (261)
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/Gemfile +3 -0
  4. data/README.md +22 -0
  5. data/Rakefile +1 -0
  6. data/ext/ring/extconf.rb +29 -0
  7. data/lib/ring/native.rb +8 -0
  8. data/lib/ring/native/version.rb +5 -0
  9. data/ring-native.gemspec +25 -0
  10. data/vendor/ring/BUILDING.md +40 -0
  11. data/vendor/ring/Cargo.toml +43 -0
  12. data/vendor/ring/LICENSE +185 -0
  13. data/vendor/ring/Makefile +35 -0
  14. data/vendor/ring/PORTING.md +163 -0
  15. data/vendor/ring/README.md +113 -0
  16. data/vendor/ring/STYLE.md +197 -0
  17. data/vendor/ring/appveyor.yml +27 -0
  18. data/vendor/ring/build.rs +108 -0
  19. data/vendor/ring/crypto/aes/aes.c +1142 -0
  20. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +25 -0
  21. data/vendor/ring/crypto/aes/aes_test.cc +93 -0
  22. data/vendor/ring/crypto/aes/asm/aes-586.pl +2368 -0
  23. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +1249 -0
  24. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +2246 -0
  25. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +1318 -0
  26. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +2084 -0
  27. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +675 -0
  28. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +1364 -0
  29. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +1565 -0
  30. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +841 -0
  31. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +1116 -0
  32. data/vendor/ring/crypto/aes/internal.h +87 -0
  33. data/vendor/ring/crypto/aes/mode_wrappers.c +61 -0
  34. data/vendor/ring/crypto/bn/add.c +394 -0
  35. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +694 -0
  36. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +1503 -0
  37. data/vendor/ring/crypto/bn/asm/bn-586.pl +774 -0
  38. data/vendor/ring/crypto/bn/asm/co-586.pl +287 -0
  39. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +1882 -0
  40. data/vendor/ring/crypto/bn/asm/x86-mont.pl +592 -0
  41. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +599 -0
  42. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +1393 -0
  43. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +3507 -0
  44. data/vendor/ring/crypto/bn/bn.c +352 -0
  45. data/vendor/ring/crypto/bn/bn_asn1.c +74 -0
  46. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +25 -0
  47. data/vendor/ring/crypto/bn/bn_test.cc +1696 -0
  48. data/vendor/ring/crypto/bn/cmp.c +200 -0
  49. data/vendor/ring/crypto/bn/convert.c +433 -0
  50. data/vendor/ring/crypto/bn/ctx.c +311 -0
  51. data/vendor/ring/crypto/bn/div.c +594 -0
  52. data/vendor/ring/crypto/bn/exponentiation.c +1335 -0
  53. data/vendor/ring/crypto/bn/gcd.c +711 -0
  54. data/vendor/ring/crypto/bn/generic.c +1019 -0
  55. data/vendor/ring/crypto/bn/internal.h +316 -0
  56. data/vendor/ring/crypto/bn/montgomery.c +516 -0
  57. data/vendor/ring/crypto/bn/mul.c +888 -0
  58. data/vendor/ring/crypto/bn/prime.c +829 -0
  59. data/vendor/ring/crypto/bn/random.c +334 -0
  60. data/vendor/ring/crypto/bn/rsaz_exp.c +262 -0
  61. data/vendor/ring/crypto/bn/rsaz_exp.h +53 -0
  62. data/vendor/ring/crypto/bn/shift.c +276 -0
  63. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +25 -0
  64. data/vendor/ring/crypto/bytestring/bytestring_test.cc +421 -0
  65. data/vendor/ring/crypto/bytestring/cbb.c +399 -0
  66. data/vendor/ring/crypto/bytestring/cbs.c +227 -0
  67. data/vendor/ring/crypto/bytestring/internal.h +46 -0
  68. data/vendor/ring/crypto/chacha/chacha_generic.c +140 -0
  69. data/vendor/ring/crypto/chacha/chacha_vec.c +323 -0
  70. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +1447 -0
  71. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +153 -0
  72. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +25 -0
  73. data/vendor/ring/crypto/cipher/e_aes.c +390 -0
  74. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +208 -0
  75. data/vendor/ring/crypto/cipher/internal.h +173 -0
  76. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +543 -0
  77. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +9 -0
  78. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +475 -0
  79. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +23 -0
  80. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +422 -0
  81. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +484 -0
  82. data/vendor/ring/crypto/cipher/test/cipher_test.txt +100 -0
  83. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +25 -0
  84. data/vendor/ring/crypto/constant_time_test.c +304 -0
  85. data/vendor/ring/crypto/cpu-arm-asm.S +32 -0
  86. data/vendor/ring/crypto/cpu-arm.c +199 -0
  87. data/vendor/ring/crypto/cpu-intel.c +261 -0
  88. data/vendor/ring/crypto/crypto.c +151 -0
  89. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +2118 -0
  90. data/vendor/ring/crypto/curve25519/curve25519.c +4888 -0
  91. data/vendor/ring/crypto/curve25519/x25519_test.cc +128 -0
  92. data/vendor/ring/crypto/digest/md32_common.h +181 -0
  93. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +2725 -0
  94. data/vendor/ring/crypto/ec/ec.c +193 -0
  95. data/vendor/ring/crypto/ec/ec_curves.c +61 -0
  96. data/vendor/ring/crypto/ec/ec_key.c +228 -0
  97. data/vendor/ring/crypto/ec/ec_montgomery.c +114 -0
  98. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +25 -0
  99. data/vendor/ring/crypto/ec/internal.h +243 -0
  100. data/vendor/ring/crypto/ec/oct.c +253 -0
  101. data/vendor/ring/crypto/ec/p256-64.c +1794 -0
  102. data/vendor/ring/crypto/ec/p256-x86_64-table.h +9548 -0
  103. data/vendor/ring/crypto/ec/p256-x86_64.c +509 -0
  104. data/vendor/ring/crypto/ec/simple.c +1007 -0
  105. data/vendor/ring/crypto/ec/util-64.c +183 -0
  106. data/vendor/ring/crypto/ec/wnaf.c +508 -0
  107. data/vendor/ring/crypto/ecdh/ecdh.c +155 -0
  108. data/vendor/ring/crypto/ecdsa/ecdsa.c +304 -0
  109. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +193 -0
  110. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +25 -0
  111. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +327 -0
  112. data/vendor/ring/crypto/header_removed.h +17 -0
  113. data/vendor/ring/crypto/internal.h +495 -0
  114. data/vendor/ring/crypto/libring.Windows.vcxproj +101 -0
  115. data/vendor/ring/crypto/mem.c +98 -0
  116. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +1045 -0
  117. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +517 -0
  118. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +1393 -0
  119. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +1741 -0
  120. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +422 -0
  121. data/vendor/ring/crypto/modes/ctr.c +226 -0
  122. data/vendor/ring/crypto/modes/gcm.c +1206 -0
  123. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +25 -0
  124. data/vendor/ring/crypto/modes/gcm_test.c +348 -0
  125. data/vendor/ring/crypto/modes/internal.h +299 -0
  126. data/vendor/ring/crypto/perlasm/arm-xlate.pl +170 -0
  127. data/vendor/ring/crypto/perlasm/readme +100 -0
  128. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +1164 -0
  129. data/vendor/ring/crypto/perlasm/x86asm.pl +292 -0
  130. data/vendor/ring/crypto/perlasm/x86gas.pl +263 -0
  131. data/vendor/ring/crypto/perlasm/x86masm.pl +200 -0
  132. data/vendor/ring/crypto/perlasm/x86nasm.pl +187 -0
  133. data/vendor/ring/crypto/poly1305/poly1305.c +331 -0
  134. data/vendor/ring/crypto/poly1305/poly1305_arm.c +301 -0
  135. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +2015 -0
  136. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +25 -0
  137. data/vendor/ring/crypto/poly1305/poly1305_test.cc +80 -0
  138. data/vendor/ring/crypto/poly1305/poly1305_test.txt +52 -0
  139. data/vendor/ring/crypto/poly1305/poly1305_vec.c +892 -0
  140. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +75 -0
  141. data/vendor/ring/crypto/rand/internal.h +32 -0
  142. data/vendor/ring/crypto/rand/rand.c +189 -0
  143. data/vendor/ring/crypto/rand/urandom.c +219 -0
  144. data/vendor/ring/crypto/rand/windows.c +56 -0
  145. data/vendor/ring/crypto/refcount_c11.c +66 -0
  146. data/vendor/ring/crypto/refcount_lock.c +53 -0
  147. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +25 -0
  148. data/vendor/ring/crypto/refcount_test.c +58 -0
  149. data/vendor/ring/crypto/rsa/blinding.c +462 -0
  150. data/vendor/ring/crypto/rsa/internal.h +108 -0
  151. data/vendor/ring/crypto/rsa/padding.c +300 -0
  152. data/vendor/ring/crypto/rsa/rsa.c +450 -0
  153. data/vendor/ring/crypto/rsa/rsa_asn1.c +261 -0
  154. data/vendor/ring/crypto/rsa/rsa_impl.c +944 -0
  155. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +25 -0
  156. data/vendor/ring/crypto/rsa/rsa_test.cc +437 -0
  157. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +436 -0
  158. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +2390 -0
  159. data/vendor/ring/crypto/sha/asm/sha256-586.pl +1275 -0
  160. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +735 -0
  161. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +14 -0
  162. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +14 -0
  163. data/vendor/ring/crypto/sha/asm/sha512-586.pl +911 -0
  164. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +666 -0
  165. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +14 -0
  166. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +14 -0
  167. data/vendor/ring/crypto/sha/sha1.c +271 -0
  168. data/vendor/ring/crypto/sha/sha256.c +204 -0
  169. data/vendor/ring/crypto/sha/sha512.c +355 -0
  170. data/vendor/ring/crypto/test/file_test.cc +326 -0
  171. data/vendor/ring/crypto/test/file_test.h +181 -0
  172. data/vendor/ring/crypto/test/malloc.cc +150 -0
  173. data/vendor/ring/crypto/test/scoped_types.h +95 -0
  174. data/vendor/ring/crypto/test/test.Windows.vcxproj +35 -0
  175. data/vendor/ring/crypto/test/test_util.cc +46 -0
  176. data/vendor/ring/crypto/test/test_util.h +41 -0
  177. data/vendor/ring/crypto/thread_none.c +55 -0
  178. data/vendor/ring/crypto/thread_pthread.c +165 -0
  179. data/vendor/ring/crypto/thread_test.Windows.vcxproj +25 -0
  180. data/vendor/ring/crypto/thread_test.c +200 -0
  181. data/vendor/ring/crypto/thread_win.c +282 -0
  182. data/vendor/ring/examples/checkdigest.rs +103 -0
  183. data/vendor/ring/include/openssl/aes.h +121 -0
  184. data/vendor/ring/include/openssl/arm_arch.h +129 -0
  185. data/vendor/ring/include/openssl/base.h +156 -0
  186. data/vendor/ring/include/openssl/bn.h +794 -0
  187. data/vendor/ring/include/openssl/buffer.h +18 -0
  188. data/vendor/ring/include/openssl/bytestring.h +235 -0
  189. data/vendor/ring/include/openssl/chacha.h +37 -0
  190. data/vendor/ring/include/openssl/cmac.h +76 -0
  191. data/vendor/ring/include/openssl/cpu.h +184 -0
  192. data/vendor/ring/include/openssl/crypto.h +43 -0
  193. data/vendor/ring/include/openssl/curve25519.h +88 -0
  194. data/vendor/ring/include/openssl/ec.h +225 -0
  195. data/vendor/ring/include/openssl/ec_key.h +129 -0
  196. data/vendor/ring/include/openssl/ecdh.h +110 -0
  197. data/vendor/ring/include/openssl/ecdsa.h +156 -0
  198. data/vendor/ring/include/openssl/err.h +201 -0
  199. data/vendor/ring/include/openssl/mem.h +101 -0
  200. data/vendor/ring/include/openssl/obj_mac.h +71 -0
  201. data/vendor/ring/include/openssl/opensslfeatures.h +68 -0
  202. data/vendor/ring/include/openssl/opensslv.h +18 -0
  203. data/vendor/ring/include/openssl/ossl_typ.h +18 -0
  204. data/vendor/ring/include/openssl/poly1305.h +51 -0
  205. data/vendor/ring/include/openssl/rand.h +70 -0
  206. data/vendor/ring/include/openssl/rsa.h +399 -0
  207. data/vendor/ring/include/openssl/thread.h +133 -0
  208. data/vendor/ring/include/openssl/type_check.h +71 -0
  209. data/vendor/ring/mk/Common.props +63 -0
  210. data/vendor/ring/mk/Windows.props +42 -0
  211. data/vendor/ring/mk/WindowsTest.props +18 -0
  212. data/vendor/ring/mk/appveyor.bat +62 -0
  213. data/vendor/ring/mk/bottom_of_makefile.mk +54 -0
  214. data/vendor/ring/mk/ring.mk +266 -0
  215. data/vendor/ring/mk/top_of_makefile.mk +214 -0
  216. data/vendor/ring/mk/travis.sh +40 -0
  217. data/vendor/ring/mk/update-travis-yml.py +229 -0
  218. data/vendor/ring/ring.sln +153 -0
  219. data/vendor/ring/src/aead.rs +682 -0
  220. data/vendor/ring/src/agreement.rs +248 -0
  221. data/vendor/ring/src/c.rs +129 -0
  222. data/vendor/ring/src/constant_time.rs +37 -0
  223. data/vendor/ring/src/der.rs +96 -0
  224. data/vendor/ring/src/digest.rs +690 -0
  225. data/vendor/ring/src/digest_tests.txt +57 -0
  226. data/vendor/ring/src/ecc.rs +28 -0
  227. data/vendor/ring/src/ecc_build.rs +279 -0
  228. data/vendor/ring/src/ecc_curves.rs +117 -0
  229. data/vendor/ring/src/ed25519_tests.txt +2579 -0
  230. data/vendor/ring/src/exe_tests.rs +46 -0
  231. data/vendor/ring/src/ffi.rs +29 -0
  232. data/vendor/ring/src/file_test.rs +187 -0
  233. data/vendor/ring/src/hkdf.rs +153 -0
  234. data/vendor/ring/src/hkdf_tests.txt +59 -0
  235. data/vendor/ring/src/hmac.rs +414 -0
  236. data/vendor/ring/src/hmac_tests.txt +97 -0
  237. data/vendor/ring/src/input.rs +312 -0
  238. data/vendor/ring/src/lib.rs +41 -0
  239. data/vendor/ring/src/pbkdf2.rs +265 -0
  240. data/vendor/ring/src/pbkdf2_tests.txt +113 -0
  241. data/vendor/ring/src/polyfill.rs +57 -0
  242. data/vendor/ring/src/rand.rs +28 -0
  243. data/vendor/ring/src/signature.rs +314 -0
  244. data/vendor/ring/third-party/NIST/README.md +9 -0
  245. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +263 -0
  246. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +309 -0
  247. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +267 -0
  248. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +263 -0
  249. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +309 -0
  250. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +267 -0
  251. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +263 -0
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +309 -0
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +267 -0
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +519 -0
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +309 -0
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +523 -0
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +519 -0
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +309 -0
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +523 -0
  260. data/vendor/ring/third-party/NIST/sha256sums.txt +1 -0
  261. metadata +333 -0
@@ -0,0 +1,694 @@
1
+ #!/usr/bin/env perl
2
+
3
+ # ====================================================================
4
+ # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5
+ # project. The module is, however, dual licensed under OpenSSL and
6
+ # CRYPTOGAMS licenses depending on where you obtain it. For further
7
+ # details see http://www.openssl.org/~appro/cryptogams/.
8
+ # ====================================================================
9
+
10
+ # January 2007.
11
+
12
+ # Montgomery multiplication for ARMv4.
13
+ #
14
+ # Performance improvement naturally varies among CPU implementations
15
+ # and compilers. The code was observed to provide +65-35% improvement
16
+ # [depending on key length, less for longer keys] on ARM920T, and
17
+ # +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
18
+ # base and compiler generated code with in-lined umull and even umlal
19
+ # instructions. The latter means that this code didn't really have an
20
+ # "advantage" of utilizing some "secret" instruction.
21
+ #
22
+ # The code is interoperable with Thumb ISA and is rather compact, less
23
+ # than 1/2KB. Windows CE port would be trivial, as it's exclusively
24
+ # about decorations, ABI and instruction syntax are identical.
25
+
26
+ # November 2013
27
+ #
28
+ # Add NEON code path, which handles lengths divisible by 8. RSA/DSA
29
+ # performance improvement on Cortex-A8 is ~45-100% depending on key
30
+ # length, more for longer keys. On Cortex-A15 the span is ~10-105%.
31
+ # On Snapdragon S4 improvement was measured to vary from ~70% to
32
+ # incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
33
+ # rather because original integer-only code seems to perform
34
+ # suboptimally on S4. Situation on Cortex-A9 is unfortunately
35
+ # different. It's being looked into, but the trouble is that
36
+ # performance for vectors longer than 256 bits is actually couple
37
+ # of percent worse than for integer-only code. The code is chosen
38
+ # for execution on all NEON-capable processors, because gain on
39
+ # others outweighs the marginal loss on Cortex-A9.
40
+
41
+ $flavour = shift;
42
+ if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
43
+ else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
44
+
45
+ if ($flavour && $flavour ne "void") {
46
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
47
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
48
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
49
+ die "can't locate arm-xlate.pl";
50
+
51
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
52
+ } else {
53
+ open STDOUT,">$output";
54
+ }
55
+
56
+ $num="r0"; # starts as num argument, but holds &tp[num-1]
57
+ $ap="r1";
58
+ $bp="r2"; $bi="r2"; $rp="r2";
59
+ $np="r3";
60
+ $tp="r4";
61
+ $aj="r5";
62
+ $nj="r6";
63
+ $tj="r7";
64
+ $n0="r8";
65
+ ########### # r9 is reserved by ELF as platform specific, e.g. TLS pointer
66
+ $alo="r10"; # sl, gcc uses it to keep @GOT
67
+ $ahi="r11"; # fp
68
+ $nlo="r12"; # ip
69
+ ########### # r13 is stack pointer
70
+ $nhi="r14"; # lr
71
+ ########### # r15 is program counter
72
+
73
+ #### argument block layout relative to &tp[num-1], a.k.a. $num
74
+ $_rp="$num,#12*4";
75
+ # ap permanently resides in r1
76
+ $_bp="$num,#13*4";
77
+ # np permanently resides in r3
78
+ $_n0="$num,#14*4";
79
+ $_num="$num,#15*4"; $_bpend=$_num;
80
+
81
+ $code=<<___;
82
+ #include <openssl/arm_arch.h>
83
+
84
+ .text
85
+ .code 32
86
+
87
+ #if __ARM_MAX_ARCH__>=7
88
+ .align 5
89
+ .LOPENSSL_armcap:
90
+ .word OPENSSL_armcap_P-.Lbn_mul_mont
91
+ #endif
92
+
93
+ .global bn_mul_mont
94
+ .hidden bn_mul_mont
95
+ .type bn_mul_mont,%function
96
+
97
+ .align 5
98
+ bn_mul_mont:
99
+ .Lbn_mul_mont:
100
+ ldr ip,[sp,#4] @ load num
101
+ stmdb sp!,{r0,r2} @ sp points at argument block
102
+ #if __ARM_MAX_ARCH__>=7
103
+ tst ip,#7
104
+ bne .Lialu
105
+ adr r0,bn_mul_mont
106
+ ldr r2,.LOPENSSL_armcap
107
+ ldr r0,[r0,r2]
108
+ #ifdef __APPLE__
109
+ ldr r0,[r0]
110
+ #endif
111
+ tst r0,#1 @ NEON available?
112
+ ldmia sp, {r0,r2}
113
+ beq .Lialu
114
+ add sp,sp,#8
115
+ b bn_mul8x_mont_neon
116
+ .align 4
117
+ .Lialu:
118
+ #endif
119
+ cmp ip,#2
120
+ mov $num,ip @ load num
121
+ movlt r0,#0
122
+ addlt sp,sp,#2*4
123
+ blt .Labrt
124
+
125
+ stmdb sp!,{r4-r12,lr} @ save 10 registers
126
+
127
+ mov $num,$num,lsl#2 @ rescale $num for byte count
128
+ sub sp,sp,$num @ alloca(4*num)
129
+ sub sp,sp,#4 @ +extra dword
130
+ sub $num,$num,#4 @ "num=num-1"
131
+ add $tp,$bp,$num @ &bp[num-1]
132
+
133
+ add $num,sp,$num @ $num to point at &tp[num-1]
134
+ ldr $n0,[$_n0] @ &n0
135
+ ldr $bi,[$bp] @ bp[0]
136
+ ldr $aj,[$ap],#4 @ ap[0],ap++
137
+ ldr $nj,[$np],#4 @ np[0],np++
138
+ ldr $n0,[$n0] @ *n0
139
+ str $tp,[$_bpend] @ save &bp[num]
140
+
141
+ umull $alo,$ahi,$aj,$bi @ ap[0]*bp[0]
142
+ str $n0,[$_n0] @ save n0 value
143
+ mul $n0,$alo,$n0 @ "tp[0]"*n0
144
+ mov $nlo,#0
145
+ umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"t[0]"
146
+ mov $tp,sp
147
+
148
+ .L1st:
149
+ ldr $aj,[$ap],#4 @ ap[j],ap++
150
+ mov $alo,$ahi
151
+ ldr $nj,[$np],#4 @ np[j],np++
152
+ mov $ahi,#0
153
+ umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[0]
154
+ mov $nhi,#0
155
+ umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
156
+ adds $nlo,$nlo,$alo
157
+ str $nlo,[$tp],#4 @ tp[j-1]=,tp++
158
+ adc $nlo,$nhi,#0
159
+ cmp $tp,$num
160
+ bne .L1st
161
+
162
+ adds $nlo,$nlo,$ahi
163
+ ldr $tp,[$_bp] @ restore bp
164
+ mov $nhi,#0
165
+ ldr $n0,[$_n0] @ restore n0
166
+ adc $nhi,$nhi,#0
167
+ str $nlo,[$num] @ tp[num-1]=
168
+ str $nhi,[$num,#4] @ tp[num]=
169
+
170
+ .Louter:
171
+ sub $tj,$num,sp @ "original" $num-1 value
172
+ sub $ap,$ap,$tj @ "rewind" ap to &ap[1]
173
+ ldr $bi,[$tp,#4]! @ *(++bp)
174
+ sub $np,$np,$tj @ "rewind" np to &np[1]
175
+ ldr $aj,[$ap,#-4] @ ap[0]
176
+ ldr $alo,[sp] @ tp[0]
177
+ ldr $nj,[$np,#-4] @ np[0]
178
+ ldr $tj,[sp,#4] @ tp[1]
179
+
180
+ mov $ahi,#0
181
+ umlal $alo,$ahi,$aj,$bi @ ap[0]*bp[i]+tp[0]
182
+ str $tp,[$_bp] @ save bp
183
+ mul $n0,$alo,$n0
184
+ mov $nlo,#0
185
+ umlal $alo,$nlo,$nj,$n0 @ np[0]*n0+"tp[0]"
186
+ mov $tp,sp
187
+
188
+ .Linner:
189
+ ldr $aj,[$ap],#4 @ ap[j],ap++
190
+ adds $alo,$ahi,$tj @ +=tp[j]
191
+ ldr $nj,[$np],#4 @ np[j],np++
192
+ mov $ahi,#0
193
+ umlal $alo,$ahi,$aj,$bi @ ap[j]*bp[i]
194
+ mov $nhi,#0
195
+ umlal $nlo,$nhi,$nj,$n0 @ np[j]*n0
196
+ adc $ahi,$ahi,#0
197
+ ldr $tj,[$tp,#8] @ tp[j+1]
198
+ adds $nlo,$nlo,$alo
199
+ str $nlo,[$tp],#4 @ tp[j-1]=,tp++
200
+ adc $nlo,$nhi,#0
201
+ cmp $tp,$num
202
+ bne .Linner
203
+
204
+ adds $nlo,$nlo,$ahi
205
+ mov $nhi,#0
206
+ ldr $tp,[$_bp] @ restore bp
207
+ adc $nhi,$nhi,#0
208
+ ldr $n0,[$_n0] @ restore n0
209
+ adds $nlo,$nlo,$tj
210
+ ldr $tj,[$_bpend] @ restore &bp[num]
211
+ adc $nhi,$nhi,#0
212
+ str $nlo,[$num] @ tp[num-1]=
213
+ str $nhi,[$num,#4] @ tp[num]=
214
+
215
+ cmp $tp,$tj
216
+ bne .Louter
217
+
218
+ ldr $rp,[$_rp] @ pull rp
219
+ add $num,$num,#4 @ $num to point at &tp[num]
220
+ sub $aj,$num,sp @ "original" num value
221
+ mov $tp,sp @ "rewind" $tp
222
+ mov $ap,$tp @ "borrow" $ap
223
+ sub $np,$np,$aj @ "rewind" $np to &np[0]
224
+
225
+ subs $tj,$tj,$tj @ "clear" carry flag
226
+ .Lsub: ldr $tj,[$tp],#4
227
+ ldr $nj,[$np],#4
228
+ sbcs $tj,$tj,$nj @ tp[j]-np[j]
229
+ str $tj,[$rp],#4 @ rp[j]=
230
+ teq $tp,$num @ preserve carry
231
+ bne .Lsub
232
+ sbcs $nhi,$nhi,#0 @ upmost carry
233
+ mov $tp,sp @ "rewind" $tp
234
+ sub $rp,$rp,$aj @ "rewind" $rp
235
+
236
+ and $ap,$tp,$nhi
237
+ bic $np,$rp,$nhi
238
+ orr $ap,$ap,$np @ ap=borrow?tp:rp
239
+
240
+ .Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh
241
+ str sp,[$tp],#4 @ zap tp
242
+ str $tj,[$rp],#4
243
+ cmp $tp,$num
244
+ bne .Lcopy
245
+
246
+ add sp,$num,#4 @ skip over tp[num+1]
247
+ ldmia sp!,{r4-r12,lr} @ restore registers
248
+ add sp,sp,#2*4 @ skip over {r0,r2}
249
+ mov r0,#1
250
+ .Labrt:
251
+ #if __ARM_ARCH__>=5
252
+ ret @ bx lr
253
+ #else
254
+ tst lr,#1
255
+ moveq pc,lr @ be binary compatible with V4, yet
256
+ bx lr @ interoperable with Thumb ISA:-)
257
+ #endif
258
+ .size bn_mul_mont,.-bn_mul_mont
259
+ ___
260
+ {
261
+ sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
262
+ sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
263
+
264
+ my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
265
+ my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
266
+ my ($Z,$Temp)=("q4","q5");
267
+ my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));
268
+ my ($Bi,$Ni,$M0)=map("d$_",(28..31));
269
+ my $zero=&Dlo($Z);
270
+ my $temp=&Dlo($Temp);
271
+
272
+ my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
273
+ my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));
274
+
275
+ $code.=<<___;
276
+ #if __ARM_MAX_ARCH__>=7
277
+ .arch armv7-a
278
+ .fpu neon
279
+
280
+ .type bn_mul8x_mont_neon,%function
281
+ .align 5
282
+ bn_mul8x_mont_neon:
283
+ mov ip,sp
284
+ stmdb sp!,{r4-r11}
285
+ vstmdb sp!,{d8-d15} @ ABI specification says so
286
+ ldmia ip,{r4-r5} @ load rest of parameter block
287
+
288
+ sub $toutptr,sp,#16
289
+ vld1.32 {${Bi}[0]}, [$bptr,:32]!
290
+ sub $toutptr,$toutptr,$num,lsl#4
291
+ vld1.32 {$A0-$A3}, [$aptr]! @ can't specify :32 :-(
292
+ and $toutptr,$toutptr,#-64
293
+ vld1.32 {${M0}[0]}, [$n0,:32]
294
+ mov sp,$toutptr @ alloca
295
+ veor $zero,$zero,$zero
296
+ subs $inner,$num,#8
297
+ vzip.16 $Bi,$zero
298
+
299
+ vmull.u32 $A0xB,$Bi,${A0}[0]
300
+ vmull.u32 $A1xB,$Bi,${A0}[1]
301
+ vmull.u32 $A2xB,$Bi,${A1}[0]
302
+ vshl.i64 $temp,`&Dhi("$A0xB")`,#16
303
+ vmull.u32 $A3xB,$Bi,${A1}[1]
304
+
305
+ vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
306
+ veor $zero,$zero,$zero
307
+ vmul.u32 $Ni,$temp,$M0
308
+
309
+ vmull.u32 $A4xB,$Bi,${A2}[0]
310
+ vld1.32 {$N0-$N3}, [$nptr]!
311
+ vmull.u32 $A5xB,$Bi,${A2}[1]
312
+ vmull.u32 $A6xB,$Bi,${A3}[0]
313
+ vzip.16 $Ni,$zero
314
+ vmull.u32 $A7xB,$Bi,${A3}[1]
315
+
316
+ bne .LNEON_1st
317
+
318
+ @ special case for num=8, everything is in register bank...
319
+
320
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
321
+ sub $outer,$num,#1
322
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
323
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
324
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
325
+
326
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
327
+ vmov $Temp,$A0xB
328
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
329
+ vmov $A0xB,$A1xB
330
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
331
+ vmov $A1xB,$A2xB
332
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
333
+ vmov $A2xB,$A3xB
334
+ vmov $A3xB,$A4xB
335
+ vshr.u64 $temp,$temp,#16
336
+ vmov $A4xB,$A5xB
337
+ vmov $A5xB,$A6xB
338
+ vadd.u64 $temp,$temp,`&Dhi("$Temp")`
339
+ vmov $A6xB,$A7xB
340
+ veor $A7xB,$A7xB
341
+ vshr.u64 $temp,$temp,#16
342
+
343
+ b .LNEON_outer8
344
+
345
+ .align 4
346
+ .LNEON_outer8:
347
+ vld1.32 {${Bi}[0]}, [$bptr,:32]!
348
+ veor $zero,$zero,$zero
349
+ vzip.16 $Bi,$zero
350
+ vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
351
+
352
+ vmlal.u32 $A0xB,$Bi,${A0}[0]
353
+ vmlal.u32 $A1xB,$Bi,${A0}[1]
354
+ vmlal.u32 $A2xB,$Bi,${A1}[0]
355
+ vshl.i64 $temp,`&Dhi("$A0xB")`,#16
356
+ vmlal.u32 $A3xB,$Bi,${A1}[1]
357
+
358
+ vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
359
+ veor $zero,$zero,$zero
360
+ subs $outer,$outer,#1
361
+ vmul.u32 $Ni,$temp,$M0
362
+
363
+ vmlal.u32 $A4xB,$Bi,${A2}[0]
364
+ vmlal.u32 $A5xB,$Bi,${A2}[1]
365
+ vmlal.u32 $A6xB,$Bi,${A3}[0]
366
+ vzip.16 $Ni,$zero
367
+ vmlal.u32 $A7xB,$Bi,${A3}[1]
368
+
369
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
370
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
371
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
372
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
373
+
374
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
375
+ vmov $Temp,$A0xB
376
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
377
+ vmov $A0xB,$A1xB
378
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
379
+ vmov $A1xB,$A2xB
380
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
381
+ vmov $A2xB,$A3xB
382
+ vmov $A3xB,$A4xB
383
+ vshr.u64 $temp,$temp,#16
384
+ vmov $A4xB,$A5xB
385
+ vmov $A5xB,$A6xB
386
+ vadd.u64 $temp,$temp,`&Dhi("$Temp")`
387
+ vmov $A6xB,$A7xB
388
+ veor $A7xB,$A7xB
389
+ vshr.u64 $temp,$temp,#16
390
+
391
+ bne .LNEON_outer8
392
+
393
+ vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
394
+ mov $toutptr,sp
395
+ vshr.u64 $temp,`&Dlo("$A0xB")`,#16
396
+ mov $inner,$num
397
+ vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
398
+ add $tinptr,sp,#16
399
+ vshr.u64 $temp,`&Dhi("$A0xB")`,#16
400
+ vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")`
401
+
402
+ b .LNEON_tail2
403
+
404
+ .align 4
405
+ .LNEON_1st:
406
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
407
+ vld1.32 {$A0-$A3}, [$aptr]!
408
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
409
+ subs $inner,$inner,#8
410
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
411
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
412
+
413
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
414
+ vld1.32 {$N0-$N1}, [$nptr]!
415
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
416
+ vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
417
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
418
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
419
+ vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
420
+
421
+ vmull.u32 $A0xB,$Bi,${A0}[0]
422
+ vld1.32 {$N2-$N3}, [$nptr]!
423
+ vmull.u32 $A1xB,$Bi,${A0}[1]
424
+ vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
425
+ vmull.u32 $A2xB,$Bi,${A1}[0]
426
+ vmull.u32 $A3xB,$Bi,${A1}[1]
427
+ vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
428
+
429
+ vmull.u32 $A4xB,$Bi,${A2}[0]
430
+ vmull.u32 $A5xB,$Bi,${A2}[1]
431
+ vmull.u32 $A6xB,$Bi,${A3}[0]
432
+ vmull.u32 $A7xB,$Bi,${A3}[1]
433
+
434
+ bne .LNEON_1st
435
+
436
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
437
+ add $tinptr,sp,#16
438
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
439
+ sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr
440
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
441
+ vld1.64 {$Temp}, [sp,:128]
442
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
443
+ sub $outer,$num,#1
444
+
445
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
446
+ vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
447
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
448
+ vshr.u64 $temp,$temp,#16
449
+ vld1.64 {$A0xB}, [$tinptr, :128]!
450
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
451
+ vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
452
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
453
+
454
+ vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
455
+ vadd.u64 $temp,$temp,`&Dhi("$Temp")`
456
+ veor $Z,$Z,$Z
457
+ vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
458
+ vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
459
+ vst1.64 {$Z}, [$toutptr,:128]
460
+ vshr.u64 $temp,$temp,#16
461
+
462
+ b .LNEON_outer
463
+
464
+ .align 4
465
+ .LNEON_outer:
466
+ vld1.32 {${Bi}[0]}, [$bptr,:32]!
467
+ sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
468
+ vld1.32 {$A0-$A3}, [$aptr]!
469
+ veor $zero,$zero,$zero
470
+ mov $toutptr,sp
471
+ vzip.16 $Bi,$zero
472
+ sub $inner,$num,#8
473
+ vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
474
+
475
+ vmlal.u32 $A0xB,$Bi,${A0}[0]
476
+ vld1.64 {$A3xB-$A4xB},[$tinptr,:256]!
477
+ vmlal.u32 $A1xB,$Bi,${A0}[1]
478
+ vmlal.u32 $A2xB,$Bi,${A1}[0]
479
+ vld1.64 {$A5xB-$A6xB},[$tinptr,:256]!
480
+ vmlal.u32 $A3xB,$Bi,${A1}[1]
481
+
482
+ vshl.i64 $temp,`&Dhi("$A0xB")`,#16
483
+ veor $zero,$zero,$zero
484
+ vadd.u64 $temp,$temp,`&Dlo("$A0xB")`
485
+ vld1.64 {$A7xB},[$tinptr,:128]!
486
+ vmul.u32 $Ni,$temp,$M0
487
+
488
+ vmlal.u32 $A4xB,$Bi,${A2}[0]
489
+ vld1.32 {$N0-$N3}, [$nptr]!
490
+ vmlal.u32 $A5xB,$Bi,${A2}[1]
491
+ vmlal.u32 $A6xB,$Bi,${A3}[0]
492
+ vzip.16 $Ni,$zero
493
+ vmlal.u32 $A7xB,$Bi,${A3}[1]
494
+
495
+ .LNEON_inner:
496
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
497
+ vld1.32 {$A0-$A3}, [$aptr]!
498
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
499
+ subs $inner,$inner,#8
500
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
501
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
502
+ vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
503
+
504
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
505
+ vld1.64 {$A0xB}, [$tinptr, :128]!
506
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
507
+ vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
508
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
509
+ vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
510
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
511
+ vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
512
+
513
+ vmlal.u32 $A0xB,$Bi,${A0}[0]
514
+ vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]!
515
+ vmlal.u32 $A1xB,$Bi,${A0}[1]
516
+ vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
517
+ vmlal.u32 $A2xB,$Bi,${A1}[0]
518
+ vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]!
519
+ vmlal.u32 $A3xB,$Bi,${A1}[1]
520
+ vld1.32 {$N0-$N3}, [$nptr]!
521
+
522
+ vmlal.u32 $A4xB,$Bi,${A2}[0]
523
+ vld1.64 {$A7xB}, [$tinptr, :128]!
524
+ vmlal.u32 $A5xB,$Bi,${A2}[1]
525
+ vmlal.u32 $A6xB,$Bi,${A3}[0]
526
+ vmlal.u32 $A7xB,$Bi,${A3}[1]
527
+
528
+ bne .LNEON_inner
529
+
530
+ vmlal.u32 $A0xB,$Ni,${N0}[0]
531
+ add $tinptr,sp,#16
532
+ vmlal.u32 $A1xB,$Ni,${N0}[1]
533
+ sub $aptr,$aptr,$num,lsl#2 @ rewind $aptr
534
+ vmlal.u32 $A2xB,$Ni,${N1}[0]
535
+ vld1.64 {$Temp}, [sp,:128]
536
+ vmlal.u32 $A3xB,$Ni,${N1}[1]
537
+ subs $outer,$outer,#1
538
+
539
+ vmlal.u32 $A4xB,$Ni,${N2}[0]
540
+ vst1.64 {$A0xB-$A1xB}, [$toutptr,:256]!
541
+ vmlal.u32 $A5xB,$Ni,${N2}[1]
542
+ vld1.64 {$A0xB}, [$tinptr, :128]!
543
+ vshr.u64 $temp,$temp,#16
544
+ vst1.64 {$A2xB-$A3xB}, [$toutptr,:256]!
545
+ vmlal.u32 $A6xB,$Ni,${N3}[0]
546
+ vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
547
+ vmlal.u32 $A7xB,$Ni,${N3}[1]
548
+
549
+ vst1.64 {$A4xB-$A5xB}, [$toutptr,:256]!
550
+ vadd.u64 $temp,$temp,`&Dhi("$Temp")`
551
+ vst1.64 {$A6xB-$A7xB}, [$toutptr,:256]!
552
+ vshr.u64 $temp,$temp,#16
553
+
554
+ bne .LNEON_outer
555
+
556
+ mov $toutptr,sp
557
+ mov $inner,$num
558
+
559
+ .LNEON_tail:
560
+ vadd.u64 `&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
561
+ vld1.64 {$A3xB-$A4xB}, [$tinptr, :256]!
562
+ vshr.u64 $temp,`&Dlo("$A0xB")`,#16
563
+ vadd.u64 `&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
564
+ vld1.64 {$A5xB-$A6xB}, [$tinptr, :256]!
565
+ vshr.u64 $temp,`&Dhi("$A0xB")`,#16
566
+ vld1.64 {$A7xB}, [$tinptr, :128]!
567
+ vzip.16 `&Dlo("$A0xB")`,`&Dhi("$A0xB")`
568
+
569
+ .LNEON_tail2:
570
+ vadd.u64 `&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
571
+ vst1.32 {`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
572
+ vshr.u64 $temp,`&Dlo("$A1xB")`,#16
573
+ vadd.u64 `&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
574
+ vshr.u64 $temp,`&Dhi("$A1xB")`,#16
575
+ vzip.16 `&Dlo("$A1xB")`,`&Dhi("$A1xB")`
576
+
577
+ vadd.u64 `&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
578
+ vst1.32 {`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
579
+ vshr.u64 $temp,`&Dlo("$A2xB")`,#16
580
+ vadd.u64 `&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
581
+ vshr.u64 $temp,`&Dhi("$A2xB")`,#16
582
+ vzip.16 `&Dlo("$A2xB")`,`&Dhi("$A2xB")`
583
+
584
+ vadd.u64 `&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
585
+ vst1.32 {`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
586
+ vshr.u64 $temp,`&Dlo("$A3xB")`,#16
587
+ vadd.u64 `&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
588
+ vshr.u64 $temp,`&Dhi("$A3xB")`,#16
589
+ vzip.16 `&Dlo("$A3xB")`,`&Dhi("$A3xB")`
590
+
591
+ vadd.u64 `&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
592
+ vst1.32 {`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
593
+ vshr.u64 $temp,`&Dlo("$A4xB")`,#16
594
+ vadd.u64 `&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
595
+ vshr.u64 $temp,`&Dhi("$A4xB")`,#16
596
+ vzip.16 `&Dlo("$A4xB")`,`&Dhi("$A4xB")`
597
+
598
+ vadd.u64 `&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
599
+ vst1.32 {`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
600
+ vshr.u64 $temp,`&Dlo("$A5xB")`,#16
601
+ vadd.u64 `&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
602
+ vshr.u64 $temp,`&Dhi("$A5xB")`,#16
603
+ vzip.16 `&Dlo("$A5xB")`,`&Dhi("$A5xB")`
604
+
605
+ vadd.u64 `&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
606
+ vst1.32 {`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
607
+ vshr.u64 $temp,`&Dlo("$A6xB")`,#16
608
+ vadd.u64 `&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
609
+ vld1.64 {$A0xB}, [$tinptr, :128]!
610
+ vshr.u64 $temp,`&Dhi("$A6xB")`,#16
611
+ vzip.16 `&Dlo("$A6xB")`,`&Dhi("$A6xB")`
612
+
613
+ vadd.u64 `&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
614
+ vst1.32 {`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
615
+ vshr.u64 $temp,`&Dlo("$A7xB")`,#16
616
+ vadd.u64 `&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
617
+ vld1.64 {$A1xB-$A2xB}, [$tinptr, :256]!
618
+ vshr.u64 $temp,`&Dhi("$A7xB")`,#16
619
+ vzip.16 `&Dlo("$A7xB")`,`&Dhi("$A7xB")`
620
+ subs $inner,$inner,#8
621
+ vst1.32 {`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!
622
+
623
+ bne .LNEON_tail
624
+
625
+ vst1.32 {${temp}[0]}, [$toutptr, :32] @ top-most bit
626
+ sub $nptr,$nptr,$num,lsl#2 @ rewind $nptr
627
+ subs $aptr,sp,#0 @ clear carry flag
628
+ add $bptr,sp,$num,lsl#2
629
+
630
+ .LNEON_sub:
631
+ ldmia $aptr!, {r4-r7}
632
+ ldmia $nptr!, {r8-r11}
633
+ sbcs r8, r4,r8
634
+ sbcs r9, r5,r9
635
+ sbcs r10,r6,r10
636
+ sbcs r11,r7,r11
637
+ teq $aptr,$bptr @ preserves carry
638
+ stmia $rptr!, {r8-r11}
639
+ bne .LNEON_sub
640
+
641
+ ldr r10, [$aptr] @ load top-most bit
642
+ veor q0,q0,q0
643
+ sub r11,$bptr,sp @ this is num*4
644
+ veor q1,q1,q1
645
+ mov $aptr,sp
646
+ sub $rptr,$rptr,r11 @ rewind $rptr
647
+ mov $nptr,$bptr @ second 3/4th of frame
648
+ sbcs r10,r10,#0 @ result is carry flag
649
+
650
+ .LNEON_copy_n_zap:
651
+ ldmia $aptr!, {r4-r7}
652
+ ldmia $rptr, {r8-r11}
653
+ movcc r8, r4
654
+ vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
655
+ movcc r9, r5
656
+ movcc r10,r6
657
+ vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
658
+ movcc r11,r7
659
+ ldmia $aptr, {r4-r7}
660
+ stmia $rptr!, {r8-r11}
661
+ sub $aptr,$aptr,#16
662
+ ldmia $rptr, {r8-r11}
663
+ movcc r8, r4
664
+ vst1.64 {q0-q1}, [$aptr,:256]! @ wipe
665
+ movcc r9, r5
666
+ movcc r10,r6
667
+ vst1.64 {q0-q1}, [$nptr,:256]! @ wipe
668
+ movcc r11,r7
669
+ teq $aptr,$bptr @ preserves carry
670
+ stmia $rptr!, {r8-r11}
671
+ bne .LNEON_copy_n_zap
672
+
673
+ sub sp,ip,#96
674
+ vldmia sp!,{d8-d15}
675
+ ldmia sp!,{r4-r11}
676
+ ret @ bx lr
677
+ .size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
678
+ #endif
679
+ ___
680
+ }
681
+ $code.=<<___;
682
+ .asciz "Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
683
+ .align 2
684
+ #if __ARM_MAX_ARCH__>=7
685
+ .comm OPENSSL_armcap_P,4,4
686
+ .hidden OPENSSL_armcap_P
687
+ #endif
688
+ ___
689
+
690
+ $code =~ s/\`([^\`]*)\`/eval $1/gem;
691
+ $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
692
+ $code =~ s/\bret\b/bx lr/gm;
693
+ print $code;
694
+ close STDOUT;