ring-native 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (261) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/Gemfile +3 -0
  4. data/README.md +22 -0
  5. data/Rakefile +1 -0
  6. data/ext/ring/extconf.rb +29 -0
  7. data/lib/ring/native.rb +8 -0
  8. data/lib/ring/native/version.rb +5 -0
  9. data/ring-native.gemspec +25 -0
  10. data/vendor/ring/BUILDING.md +40 -0
  11. data/vendor/ring/Cargo.toml +43 -0
  12. data/vendor/ring/LICENSE +185 -0
  13. data/vendor/ring/Makefile +35 -0
  14. data/vendor/ring/PORTING.md +163 -0
  15. data/vendor/ring/README.md +113 -0
  16. data/vendor/ring/STYLE.md +197 -0
  17. data/vendor/ring/appveyor.yml +27 -0
  18. data/vendor/ring/build.rs +108 -0
  19. data/vendor/ring/crypto/aes/aes.c +1142 -0
  20. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +25 -0
  21. data/vendor/ring/crypto/aes/aes_test.cc +93 -0
  22. data/vendor/ring/crypto/aes/asm/aes-586.pl +2368 -0
  23. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +1249 -0
  24. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +2246 -0
  25. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +1318 -0
  26. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +2084 -0
  27. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +675 -0
  28. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +1364 -0
  29. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +1565 -0
  30. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +841 -0
  31. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +1116 -0
  32. data/vendor/ring/crypto/aes/internal.h +87 -0
  33. data/vendor/ring/crypto/aes/mode_wrappers.c +61 -0
  34. data/vendor/ring/crypto/bn/add.c +394 -0
  35. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +694 -0
  36. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +1503 -0
  37. data/vendor/ring/crypto/bn/asm/bn-586.pl +774 -0
  38. data/vendor/ring/crypto/bn/asm/co-586.pl +287 -0
  39. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +1882 -0
  40. data/vendor/ring/crypto/bn/asm/x86-mont.pl +592 -0
  41. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +599 -0
  42. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +1393 -0
  43. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +3507 -0
  44. data/vendor/ring/crypto/bn/bn.c +352 -0
  45. data/vendor/ring/crypto/bn/bn_asn1.c +74 -0
  46. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +25 -0
  47. data/vendor/ring/crypto/bn/bn_test.cc +1696 -0
  48. data/vendor/ring/crypto/bn/cmp.c +200 -0
  49. data/vendor/ring/crypto/bn/convert.c +433 -0
  50. data/vendor/ring/crypto/bn/ctx.c +311 -0
  51. data/vendor/ring/crypto/bn/div.c +594 -0
  52. data/vendor/ring/crypto/bn/exponentiation.c +1335 -0
  53. data/vendor/ring/crypto/bn/gcd.c +711 -0
  54. data/vendor/ring/crypto/bn/generic.c +1019 -0
  55. data/vendor/ring/crypto/bn/internal.h +316 -0
  56. data/vendor/ring/crypto/bn/montgomery.c +516 -0
  57. data/vendor/ring/crypto/bn/mul.c +888 -0
  58. data/vendor/ring/crypto/bn/prime.c +829 -0
  59. data/vendor/ring/crypto/bn/random.c +334 -0
  60. data/vendor/ring/crypto/bn/rsaz_exp.c +262 -0
  61. data/vendor/ring/crypto/bn/rsaz_exp.h +53 -0
  62. data/vendor/ring/crypto/bn/shift.c +276 -0
  63. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +25 -0
  64. data/vendor/ring/crypto/bytestring/bytestring_test.cc +421 -0
  65. data/vendor/ring/crypto/bytestring/cbb.c +399 -0
  66. data/vendor/ring/crypto/bytestring/cbs.c +227 -0
  67. data/vendor/ring/crypto/bytestring/internal.h +46 -0
  68. data/vendor/ring/crypto/chacha/chacha_generic.c +140 -0
  69. data/vendor/ring/crypto/chacha/chacha_vec.c +323 -0
  70. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +1447 -0
  71. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +153 -0
  72. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +25 -0
  73. data/vendor/ring/crypto/cipher/e_aes.c +390 -0
  74. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +208 -0
  75. data/vendor/ring/crypto/cipher/internal.h +173 -0
  76. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +543 -0
  77. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +9 -0
  78. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +475 -0
  79. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +23 -0
  80. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +422 -0
  81. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +484 -0
  82. data/vendor/ring/crypto/cipher/test/cipher_test.txt +100 -0
  83. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +25 -0
  84. data/vendor/ring/crypto/constant_time_test.c +304 -0
  85. data/vendor/ring/crypto/cpu-arm-asm.S +32 -0
  86. data/vendor/ring/crypto/cpu-arm.c +199 -0
  87. data/vendor/ring/crypto/cpu-intel.c +261 -0
  88. data/vendor/ring/crypto/crypto.c +151 -0
  89. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +2118 -0
  90. data/vendor/ring/crypto/curve25519/curve25519.c +4888 -0
  91. data/vendor/ring/crypto/curve25519/x25519_test.cc +128 -0
  92. data/vendor/ring/crypto/digest/md32_common.h +181 -0
  93. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +2725 -0
  94. data/vendor/ring/crypto/ec/ec.c +193 -0
  95. data/vendor/ring/crypto/ec/ec_curves.c +61 -0
  96. data/vendor/ring/crypto/ec/ec_key.c +228 -0
  97. data/vendor/ring/crypto/ec/ec_montgomery.c +114 -0
  98. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +25 -0
  99. data/vendor/ring/crypto/ec/internal.h +243 -0
  100. data/vendor/ring/crypto/ec/oct.c +253 -0
  101. data/vendor/ring/crypto/ec/p256-64.c +1794 -0
  102. data/vendor/ring/crypto/ec/p256-x86_64-table.h +9548 -0
  103. data/vendor/ring/crypto/ec/p256-x86_64.c +509 -0
  104. data/vendor/ring/crypto/ec/simple.c +1007 -0
  105. data/vendor/ring/crypto/ec/util-64.c +183 -0
  106. data/vendor/ring/crypto/ec/wnaf.c +508 -0
  107. data/vendor/ring/crypto/ecdh/ecdh.c +155 -0
  108. data/vendor/ring/crypto/ecdsa/ecdsa.c +304 -0
  109. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +193 -0
  110. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +25 -0
  111. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +327 -0
  112. data/vendor/ring/crypto/header_removed.h +17 -0
  113. data/vendor/ring/crypto/internal.h +495 -0
  114. data/vendor/ring/crypto/libring.Windows.vcxproj +101 -0
  115. data/vendor/ring/crypto/mem.c +98 -0
  116. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +1045 -0
  117. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +517 -0
  118. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +1393 -0
  119. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +1741 -0
  120. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +422 -0
  121. data/vendor/ring/crypto/modes/ctr.c +226 -0
  122. data/vendor/ring/crypto/modes/gcm.c +1206 -0
  123. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +25 -0
  124. data/vendor/ring/crypto/modes/gcm_test.c +348 -0
  125. data/vendor/ring/crypto/modes/internal.h +299 -0
  126. data/vendor/ring/crypto/perlasm/arm-xlate.pl +170 -0
  127. data/vendor/ring/crypto/perlasm/readme +100 -0
  128. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +1164 -0
  129. data/vendor/ring/crypto/perlasm/x86asm.pl +292 -0
  130. data/vendor/ring/crypto/perlasm/x86gas.pl +263 -0
  131. data/vendor/ring/crypto/perlasm/x86masm.pl +200 -0
  132. data/vendor/ring/crypto/perlasm/x86nasm.pl +187 -0
  133. data/vendor/ring/crypto/poly1305/poly1305.c +331 -0
  134. data/vendor/ring/crypto/poly1305/poly1305_arm.c +301 -0
  135. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +2015 -0
  136. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +25 -0
  137. data/vendor/ring/crypto/poly1305/poly1305_test.cc +80 -0
  138. data/vendor/ring/crypto/poly1305/poly1305_test.txt +52 -0
  139. data/vendor/ring/crypto/poly1305/poly1305_vec.c +892 -0
  140. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +75 -0
  141. data/vendor/ring/crypto/rand/internal.h +32 -0
  142. data/vendor/ring/crypto/rand/rand.c +189 -0
  143. data/vendor/ring/crypto/rand/urandom.c +219 -0
  144. data/vendor/ring/crypto/rand/windows.c +56 -0
  145. data/vendor/ring/crypto/refcount_c11.c +66 -0
  146. data/vendor/ring/crypto/refcount_lock.c +53 -0
  147. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +25 -0
  148. data/vendor/ring/crypto/refcount_test.c +58 -0
  149. data/vendor/ring/crypto/rsa/blinding.c +462 -0
  150. data/vendor/ring/crypto/rsa/internal.h +108 -0
  151. data/vendor/ring/crypto/rsa/padding.c +300 -0
  152. data/vendor/ring/crypto/rsa/rsa.c +450 -0
  153. data/vendor/ring/crypto/rsa/rsa_asn1.c +261 -0
  154. data/vendor/ring/crypto/rsa/rsa_impl.c +944 -0
  155. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +25 -0
  156. data/vendor/ring/crypto/rsa/rsa_test.cc +437 -0
  157. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +436 -0
  158. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +2390 -0
  159. data/vendor/ring/crypto/sha/asm/sha256-586.pl +1275 -0
  160. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +735 -0
  161. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +14 -0
  162. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +14 -0
  163. data/vendor/ring/crypto/sha/asm/sha512-586.pl +911 -0
  164. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +666 -0
  165. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +14 -0
  166. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +14 -0
  167. data/vendor/ring/crypto/sha/sha1.c +271 -0
  168. data/vendor/ring/crypto/sha/sha256.c +204 -0
  169. data/vendor/ring/crypto/sha/sha512.c +355 -0
  170. data/vendor/ring/crypto/test/file_test.cc +326 -0
  171. data/vendor/ring/crypto/test/file_test.h +181 -0
  172. data/vendor/ring/crypto/test/malloc.cc +150 -0
  173. data/vendor/ring/crypto/test/scoped_types.h +95 -0
  174. data/vendor/ring/crypto/test/test.Windows.vcxproj +35 -0
  175. data/vendor/ring/crypto/test/test_util.cc +46 -0
  176. data/vendor/ring/crypto/test/test_util.h +41 -0
  177. data/vendor/ring/crypto/thread_none.c +55 -0
  178. data/vendor/ring/crypto/thread_pthread.c +165 -0
  179. data/vendor/ring/crypto/thread_test.Windows.vcxproj +25 -0
  180. data/vendor/ring/crypto/thread_test.c +200 -0
  181. data/vendor/ring/crypto/thread_win.c +282 -0
  182. data/vendor/ring/examples/checkdigest.rs +103 -0
  183. data/vendor/ring/include/openssl/aes.h +121 -0
  184. data/vendor/ring/include/openssl/arm_arch.h +129 -0
  185. data/vendor/ring/include/openssl/base.h +156 -0
  186. data/vendor/ring/include/openssl/bn.h +794 -0
  187. data/vendor/ring/include/openssl/buffer.h +18 -0
  188. data/vendor/ring/include/openssl/bytestring.h +235 -0
  189. data/vendor/ring/include/openssl/chacha.h +37 -0
  190. data/vendor/ring/include/openssl/cmac.h +76 -0
  191. data/vendor/ring/include/openssl/cpu.h +184 -0
  192. data/vendor/ring/include/openssl/crypto.h +43 -0
  193. data/vendor/ring/include/openssl/curve25519.h +88 -0
  194. data/vendor/ring/include/openssl/ec.h +225 -0
  195. data/vendor/ring/include/openssl/ec_key.h +129 -0
  196. data/vendor/ring/include/openssl/ecdh.h +110 -0
  197. data/vendor/ring/include/openssl/ecdsa.h +156 -0
  198. data/vendor/ring/include/openssl/err.h +201 -0
  199. data/vendor/ring/include/openssl/mem.h +101 -0
  200. data/vendor/ring/include/openssl/obj_mac.h +71 -0
  201. data/vendor/ring/include/openssl/opensslfeatures.h +68 -0
  202. data/vendor/ring/include/openssl/opensslv.h +18 -0
  203. data/vendor/ring/include/openssl/ossl_typ.h +18 -0
  204. data/vendor/ring/include/openssl/poly1305.h +51 -0
  205. data/vendor/ring/include/openssl/rand.h +70 -0
  206. data/vendor/ring/include/openssl/rsa.h +399 -0
  207. data/vendor/ring/include/openssl/thread.h +133 -0
  208. data/vendor/ring/include/openssl/type_check.h +71 -0
  209. data/vendor/ring/mk/Common.props +63 -0
  210. data/vendor/ring/mk/Windows.props +42 -0
  211. data/vendor/ring/mk/WindowsTest.props +18 -0
  212. data/vendor/ring/mk/appveyor.bat +62 -0
  213. data/vendor/ring/mk/bottom_of_makefile.mk +54 -0
  214. data/vendor/ring/mk/ring.mk +266 -0
  215. data/vendor/ring/mk/top_of_makefile.mk +214 -0
  216. data/vendor/ring/mk/travis.sh +40 -0
  217. data/vendor/ring/mk/update-travis-yml.py +229 -0
  218. data/vendor/ring/ring.sln +153 -0
  219. data/vendor/ring/src/aead.rs +682 -0
  220. data/vendor/ring/src/agreement.rs +248 -0
  221. data/vendor/ring/src/c.rs +129 -0
  222. data/vendor/ring/src/constant_time.rs +37 -0
  223. data/vendor/ring/src/der.rs +96 -0
  224. data/vendor/ring/src/digest.rs +690 -0
  225. data/vendor/ring/src/digest_tests.txt +57 -0
  226. data/vendor/ring/src/ecc.rs +28 -0
  227. data/vendor/ring/src/ecc_build.rs +279 -0
  228. data/vendor/ring/src/ecc_curves.rs +117 -0
  229. data/vendor/ring/src/ed25519_tests.txt +2579 -0
  230. data/vendor/ring/src/exe_tests.rs +46 -0
  231. data/vendor/ring/src/ffi.rs +29 -0
  232. data/vendor/ring/src/file_test.rs +187 -0
  233. data/vendor/ring/src/hkdf.rs +153 -0
  234. data/vendor/ring/src/hkdf_tests.txt +59 -0
  235. data/vendor/ring/src/hmac.rs +414 -0
  236. data/vendor/ring/src/hmac_tests.txt +97 -0
  237. data/vendor/ring/src/input.rs +312 -0
  238. data/vendor/ring/src/lib.rs +41 -0
  239. data/vendor/ring/src/pbkdf2.rs +265 -0
  240. data/vendor/ring/src/pbkdf2_tests.txt +113 -0
  241. data/vendor/ring/src/polyfill.rs +57 -0
  242. data/vendor/ring/src/rand.rs +28 -0
  243. data/vendor/ring/src/signature.rs +314 -0
  244. data/vendor/ring/third-party/NIST/README.md +9 -0
  245. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +263 -0
  246. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +309 -0
  247. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +267 -0
  248. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +263 -0
  249. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +309 -0
  250. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +267 -0
  251. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +263 -0
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +309 -0
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +267 -0
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +519 -0
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +309 -0
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +523 -0
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +519 -0
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +309 -0
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +523 -0
  260. data/vendor/ring/third-party/NIST/sha256sums.txt +1 -0
  261. metadata +333 -0
@@ -0,0 +1,301 @@
1
+ /* Copyright (c) 2014, Google Inc.
2
+ *
3
+ * Permission to use, copy, modify, and/or distribute this software for any
4
+ * purpose with or without fee is hereby granted, provided that the above
5
+ * copyright notice and this permission notice appear in all copies.
6
+ *
7
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
10
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
12
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
13
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
14
+
15
+ /* This implementation was taken from the public domain, neon2 version in
16
+ * SUPERCOP by D. J. Bernstein and Peter Schwabe. */
17
+
18
+ #include <openssl/poly1305.h>
19
+
20
+ #if defined(OPENSSL_ARM) && !defined(OPENSSL_NO_ASM)
21
+
22
+ #include <string.h>
23
+
24
+
25
+ typedef struct {
26
+ uint32_t v[12]; /* for alignment; only using 10 */
27
+ } fe1305x2;
28
+
29
+ #define addmulmod openssl_poly1305_neon2_addmulmod
30
+ #define blocks openssl_poly1305_neon2_blocks
31
+
32
+ extern void addmulmod(fe1305x2 *r, const fe1305x2 *x, const fe1305x2 *y,
33
+ const fe1305x2 *c);
34
+
35
+ extern int blocks(fe1305x2 *h, const fe1305x2 *precomp, const uint8_t *in,
36
+ unsigned int inlen);
37
+
38
+ static void freeze(fe1305x2 *r) {
39
+ int i;
40
+
41
+ uint32_t x0 = r->v[0];
42
+ uint32_t x1 = r->v[2];
43
+ uint32_t x2 = r->v[4];
44
+ uint32_t x3 = r->v[6];
45
+ uint32_t x4 = r->v[8];
46
+ uint32_t y0;
47
+ uint32_t y1;
48
+ uint32_t y2;
49
+ uint32_t y3;
50
+ uint32_t y4;
51
+ uint32_t swap;
52
+
53
+ for (i = 0; i < 3; ++i) {
54
+ x1 += x0 >> 26;
55
+ x0 &= 0x3ffffff;
56
+ x2 += x1 >> 26;
57
+ x1 &= 0x3ffffff;
58
+ x3 += x2 >> 26;
59
+ x2 &= 0x3ffffff;
60
+ x4 += x3 >> 26;
61
+ x3 &= 0x3ffffff;
62
+ x0 += 5 * (x4 >> 26);
63
+ x4 &= 0x3ffffff;
64
+ }
65
+
66
+ y0 = x0 + 5;
67
+ y1 = x1 + (y0 >> 26);
68
+ y0 &= 0x3ffffff;
69
+ y2 = x2 + (y1 >> 26);
70
+ y1 &= 0x3ffffff;
71
+ y3 = x3 + (y2 >> 26);
72
+ y2 &= 0x3ffffff;
73
+ y4 = x4 + (y3 >> 26);
74
+ y3 &= 0x3ffffff;
75
+ swap = -(y4 >> 26);
76
+ y4 &= 0x3ffffff;
77
+
78
+ y0 ^= x0;
79
+ y1 ^= x1;
80
+ y2 ^= x2;
81
+ y3 ^= x3;
82
+ y4 ^= x4;
83
+
84
+ y0 &= swap;
85
+ y1 &= swap;
86
+ y2 &= swap;
87
+ y3 &= swap;
88
+ y4 &= swap;
89
+
90
+ y0 ^= x0;
91
+ y1 ^= x1;
92
+ y2 ^= x2;
93
+ y3 ^= x3;
94
+ y4 ^= x4;
95
+
96
+ r->v[0] = y0;
97
+ r->v[2] = y1;
98
+ r->v[4] = y2;
99
+ r->v[6] = y3;
100
+ r->v[8] = y4;
101
+ }
102
+
103
+ static void fe1305x2_tobytearray(uint8_t *r, fe1305x2 *x) {
104
+ uint32_t x0 = x->v[0];
105
+ uint32_t x1 = x->v[2];
106
+ uint32_t x2 = x->v[4];
107
+ uint32_t x3 = x->v[6];
108
+ uint32_t x4 = x->v[8];
109
+
110
+ x1 += x0 >> 26;
111
+ x0 &= 0x3ffffff;
112
+ x2 += x1 >> 26;
113
+ x1 &= 0x3ffffff;
114
+ x3 += x2 >> 26;
115
+ x2 &= 0x3ffffff;
116
+ x4 += x3 >> 26;
117
+ x3 &= 0x3ffffff;
118
+
119
+ *(uint32_t *)r = x0 + (x1 << 26);
120
+ *(uint32_t *)(r + 4) = (x1 >> 6) + (x2 << 20);
121
+ *(uint32_t *)(r + 8) = (x2 >> 12) + (x3 << 14);
122
+ *(uint32_t *)(r + 12) = (x3 >> 18) + (x4 << 8);
123
+ }
124
+
125
+ /* load32 exists to avoid breaking strict aliasing rules in
126
+ * fe1305x2_frombytearray. */
127
+ static uint32_t load32(uint8_t *t) {
128
+ uint32_t tmp;
129
+ memcpy(&tmp, t, sizeof(tmp));
130
+ return tmp;
131
+ }
132
+
133
+ static void fe1305x2_frombytearray(fe1305x2 *r, const uint8_t *x,
134
+ unsigned long long xlen) {
135
+ unsigned i;
136
+ uint8_t t[17];
137
+
138
+ for (i = 0; (i < 16) && (i < xlen); i++) {
139
+ t[i] = x[i];
140
+ }
141
+ xlen -= i;
142
+ x += i;
143
+ t[i++] = 1;
144
+ for (; i < 17; i++) {
145
+ t[i] = 0;
146
+ }
147
+
148
+ r->v[0] = 0x3ffffff & load32(t);
149
+ r->v[2] = 0x3ffffff & (load32(t + 3) >> 2);
150
+ r->v[4] = 0x3ffffff & (load32(t + 6) >> 4);
151
+ r->v[6] = 0x3ffffff & (load32(t + 9) >> 6);
152
+ r->v[8] = load32(t + 13);
153
+
154
+ if (xlen) {
155
+ for (i = 0; (i < 16) && (i < xlen); i++) {
156
+ t[i] = x[i];
157
+ }
158
+ t[i++] = 1;
159
+ for (; i < 17; i++) {
160
+ t[i] = 0;
161
+ }
162
+
163
+ r->v[1] = 0x3ffffff & load32(t);
164
+ r->v[3] = 0x3ffffff & (load32(t + 3) >> 2);
165
+ r->v[5] = 0x3ffffff & (load32(t + 6) >> 4);
166
+ r->v[7] = 0x3ffffff & (load32(t + 9) >> 6);
167
+ r->v[9] = load32(t + 13);
168
+ } else {
169
+ r->v[1] = r->v[3] = r->v[5] = r->v[7] = r->v[9] = 0;
170
+ }
171
+ }
172
+
173
+ static const fe1305x2 zero __attribute__((aligned(16)));
174
+
175
+ struct poly1305_state_st {
176
+ uint8_t data[sizeof(fe1305x2[5]) + 128];
177
+ uint8_t buf[32];
178
+ unsigned int buf_used;
179
+ uint8_t key[16];
180
+ };
181
+
182
+ void CRYPTO_poly1305_init_neon(poly1305_state *state, const uint8_t key[32]) {
183
+ struct poly1305_state_st *st = (struct poly1305_state_st *)(state);
184
+ fe1305x2 *const r = (fe1305x2 *)(st->data + (15 & (-(int)st->data)));
185
+ fe1305x2 *const h = r + 1;
186
+ fe1305x2 *const c = h + 1;
187
+ fe1305x2 *const precomp = c + 1;
188
+ unsigned int j;
189
+
190
+ r->v[1] = r->v[0] = 0x3ffffff & *(uint32_t *)key;
191
+ r->v[3] = r->v[2] = 0x3ffff03 & ((*(uint32_t *)(key + 3)) >> 2);
192
+ r->v[5] = r->v[4] = 0x3ffc0ff & ((*(uint32_t *)(key + 6)) >> 4);
193
+ r->v[7] = r->v[6] = 0x3f03fff & ((*(uint32_t *)(key + 9)) >> 6);
194
+ r->v[9] = r->v[8] = 0x00fffff & ((*(uint32_t *)(key + 12)) >> 8);
195
+
196
+ for (j = 0; j < 10; j++) {
197
+ h->v[j] = 0; /* XXX: should fast-forward a bit */
198
+ }
199
+
200
+ addmulmod(precomp, r, r, &zero); /* precompute r^2 */
201
+ addmulmod(precomp + 1, precomp, precomp, &zero); /* precompute r^4 */
202
+
203
+ memcpy(st->key, key + 16, 16);
204
+ st->buf_used = 0;
205
+ }
206
+
207
+ void CRYPTO_poly1305_update_neon(poly1305_state *state, const uint8_t *in,
208
+ size_t in_len) {
209
+ struct poly1305_state_st *st = (struct poly1305_state_st *)(state);
210
+ fe1305x2 *const r = (fe1305x2 *)(st->data + (15 & (-(int)st->data)));
211
+ fe1305x2 *const h = r + 1;
212
+ fe1305x2 *const c = h + 1;
213
+ fe1305x2 *const precomp = c + 1;
214
+ unsigned int i;
215
+
216
+ if (st->buf_used) {
217
+ unsigned int todo = 32 - st->buf_used;
218
+ if (todo > in_len) {
219
+ todo = in_len;
220
+ }
221
+ for (i = 0; i < todo; i++) {
222
+ st->buf[st->buf_used + i] = in[i];
223
+ }
224
+ st->buf_used += todo;
225
+ in_len -= todo;
226
+ in += todo;
227
+
228
+ if (st->buf_used == sizeof(st->buf) && in_len) {
229
+ addmulmod(h, h, precomp, &zero);
230
+ fe1305x2_frombytearray(c, st->buf, sizeof(st->buf));
231
+ for (i = 0; i < 10; i++) {
232
+ h->v[i] += c->v[i];
233
+ }
234
+ st->buf_used = 0;
235
+ }
236
+ }
237
+
238
+ while (in_len > 32) {
239
+ unsigned int tlen = 1048576;
240
+ if (in_len < tlen) {
241
+ tlen = in_len;
242
+ }
243
+ tlen -= blocks(h, precomp, in, tlen);
244
+ in_len -= tlen;
245
+ in += tlen;
246
+ }
247
+
248
+ if (in_len) {
249
+ for (i = 0; i < in_len; i++) {
250
+ st->buf[i] = in[i];
251
+ }
252
+ st->buf_used = in_len;
253
+ }
254
+ }
255
+
256
+ void CRYPTO_poly1305_finish_neon(poly1305_state *state, uint8_t mac[16]) {
257
+ struct poly1305_state_st *st = (struct poly1305_state_st *)(state);
258
+ fe1305x2 *const r = (fe1305x2 *)(st->data + (15 & (-(int)st->data)));
259
+ fe1305x2 *const h = r + 1;
260
+ fe1305x2 *const c = h + 1;
261
+ fe1305x2 *const precomp = c + 1;
262
+
263
+ addmulmod(h, h, precomp, &zero);
264
+
265
+ if (st->buf_used > 16) {
266
+ fe1305x2_frombytearray(c, st->buf, st->buf_used);
267
+ precomp->v[1] = r->v[1];
268
+ precomp->v[3] = r->v[3];
269
+ precomp->v[5] = r->v[5];
270
+ precomp->v[7] = r->v[7];
271
+ precomp->v[9] = r->v[9];
272
+ addmulmod(h, h, precomp, c);
273
+ } else if (st->buf_used > 0) {
274
+ fe1305x2_frombytearray(c, st->buf, st->buf_used);
275
+ r->v[1] = 1;
276
+ r->v[3] = 0;
277
+ r->v[5] = 0;
278
+ r->v[7] = 0;
279
+ r->v[9] = 0;
280
+ addmulmod(h, h, r, c);
281
+ }
282
+
283
+ h->v[0] += h->v[1];
284
+ h->v[2] += h->v[3];
285
+ h->v[4] += h->v[5];
286
+ h->v[6] += h->v[7];
287
+ h->v[8] += h->v[9];
288
+ freeze(h);
289
+
290
+ fe1305x2_frombytearray(c, st->key, 16);
291
+ c->v[8] ^= (1 << 24);
292
+
293
+ h->v[0] += c->v[0];
294
+ h->v[2] += c->v[2];
295
+ h->v[4] += c->v[4];
296
+ h->v[6] += c->v[6];
297
+ h->v[8] += c->v[8];
298
+ fe1305x2_tobytearray(mac, h);
299
+ }
300
+
301
+ #endif /* OPENSSL_ARM && !OPENSSL_NO_ASM */
@@ -0,0 +1,2015 @@
1
+ #if defined(__arm__) && !defined(OPENSSL_NO_ASM)
2
+
3
+ # This implementation was taken from the public domain, neon2 version in
4
+ # SUPERCOP by D. J. Bernstein and Peter Schwabe.
5
+
6
+ # qhasm: int32 input_0
7
+
8
+ # qhasm: int32 input_1
9
+
10
+ # qhasm: int32 input_2
11
+
12
+ # qhasm: int32 input_3
13
+
14
+ # qhasm: stack32 input_4
15
+
16
+ # qhasm: stack32 input_5
17
+
18
+ # qhasm: stack32 input_6
19
+
20
+ # qhasm: stack32 input_7
21
+
22
+ # qhasm: int32 caller_r4
23
+
24
+ # qhasm: int32 caller_r5
25
+
26
+ # qhasm: int32 caller_r6
27
+
28
+ # qhasm: int32 caller_r7
29
+
30
+ # qhasm: int32 caller_r8
31
+
32
+ # qhasm: int32 caller_r9
33
+
34
+ # qhasm: int32 caller_r10
35
+
36
+ # qhasm: int32 caller_r11
37
+
38
+ # qhasm: int32 caller_r12
39
+
40
+ # qhasm: int32 caller_r14
41
+
42
+ # qhasm: reg128 caller_q4
43
+
44
+ # qhasm: reg128 caller_q5
45
+
46
+ # qhasm: reg128 caller_q6
47
+
48
+ # qhasm: reg128 caller_q7
49
+
50
+ # qhasm: startcode
51
+ .fpu neon
52
+ .text
53
+
54
+ # qhasm: reg128 r0
55
+
56
+ # qhasm: reg128 r1
57
+
58
+ # qhasm: reg128 r2
59
+
60
+ # qhasm: reg128 r3
61
+
62
+ # qhasm: reg128 r4
63
+
64
+ # qhasm: reg128 x01
65
+
66
+ # qhasm: reg128 x23
67
+
68
+ # qhasm: reg128 x4
69
+
70
+ # qhasm: reg128 y0
71
+
72
+ # qhasm: reg128 y12
73
+
74
+ # qhasm: reg128 y34
75
+
76
+ # qhasm: reg128 5y12
77
+
78
+ # qhasm: reg128 5y34
79
+
80
+ # qhasm: stack128 y0_stack
81
+
82
+ # qhasm: stack128 y12_stack
83
+
84
+ # qhasm: stack128 y34_stack
85
+
86
+ # qhasm: stack128 5y12_stack
87
+
88
+ # qhasm: stack128 5y34_stack
89
+
90
+ # qhasm: reg128 z0
91
+
92
+ # qhasm: reg128 z12
93
+
94
+ # qhasm: reg128 z34
95
+
96
+ # qhasm: reg128 5z12
97
+
98
+ # qhasm: reg128 5z34
99
+
100
+ # qhasm: stack128 z0_stack
101
+
102
+ # qhasm: stack128 z12_stack
103
+
104
+ # qhasm: stack128 z34_stack
105
+
106
+ # qhasm: stack128 5z12_stack
107
+
108
+ # qhasm: stack128 5z34_stack
109
+
110
+ # qhasm: stack128 two24
111
+
112
+ # qhasm: int32 ptr
113
+
114
+ # qhasm: reg128 c01
115
+
116
+ # qhasm: reg128 c23
117
+
118
+ # qhasm: reg128 d01
119
+
120
+ # qhasm: reg128 d23
121
+
122
+ # qhasm: reg128 t0
123
+
124
+ # qhasm: reg128 t1
125
+
126
+ # qhasm: reg128 t2
127
+
128
+ # qhasm: reg128 t3
129
+
130
+ # qhasm: reg128 t4
131
+
132
+ # qhasm: reg128 mask
133
+
134
+ # qhasm: reg128 u0
135
+
136
+ # qhasm: reg128 u1
137
+
138
+ # qhasm: reg128 u2
139
+
140
+ # qhasm: reg128 u3
141
+
142
+ # qhasm: reg128 u4
143
+
144
+ # qhasm: reg128 v01
145
+
146
+ # qhasm: reg128 mid
147
+
148
+ # qhasm: reg128 v23
149
+
150
+ # qhasm: reg128 v4
151
+
152
+ # qhasm: int32 len
153
+
154
+ # qhasm: qpushenter crypto_onetimeauth_poly1305_neon2_blocks
155
+ .align 4
156
+ .global openssl_poly1305_neon2_blocks
157
+ .hidden openssl_poly1305_neon2_blocks
158
+ .type openssl_poly1305_neon2_blocks STT_FUNC
159
+ openssl_poly1305_neon2_blocks:
160
+ vpush {q4,q5,q6,q7}
161
+ mov r12,sp
162
+ sub sp,sp,#192
163
+ bic sp,sp,#31
164
+
165
+ # qhasm: len = input_3
166
+ # asm 1: mov >len=int32#4,<input_3=int32#4
167
+ # asm 2: mov >len=r3,<input_3=r3
168
+ mov r3,r3
169
+
170
+ # qhasm: new y0
171
+
172
+ # qhasm: y0 = mem64[input_1]y0[1]; input_1 += 8
173
+ # asm 1: vld1.8 {<y0=reg128#1%bot},[<input_1=int32#2]!
174
+ # asm 2: vld1.8 {<y0=d0},[<input_1=r1]!
175
+ vld1.8 {d0},[r1]!
176
+
177
+ # qhasm: y12 = mem128[input_1]; input_1 += 16
178
+ # asm 1: vld1.8 {>y12=reg128#2%bot->y12=reg128#2%top},[<input_1=int32#2]!
179
+ # asm 2: vld1.8 {>y12=d2->y12=d3},[<input_1=r1]!
180
+ vld1.8 {d2-d3},[r1]!
181
+
182
+ # qhasm: y34 = mem128[input_1]; input_1 += 16
183
+ # asm 1: vld1.8 {>y34=reg128#3%bot->y34=reg128#3%top},[<input_1=int32#2]!
184
+ # asm 2: vld1.8 {>y34=d4->y34=d5},[<input_1=r1]!
185
+ vld1.8 {d4-d5},[r1]!
186
+
187
+ # qhasm: input_1 += 8
188
+ # asm 1: add >input_1=int32#2,<input_1=int32#2,#8
189
+ # asm 2: add >input_1=r1,<input_1=r1,#8
190
+ add r1,r1,#8
191
+
192
+ # qhasm: new z0
193
+
194
+ # qhasm: z0 = mem64[input_1]z0[1]; input_1 += 8
195
+ # asm 1: vld1.8 {<z0=reg128#4%bot},[<input_1=int32#2]!
196
+ # asm 2: vld1.8 {<z0=d6},[<input_1=r1]!
197
+ vld1.8 {d6},[r1]!
198
+
199
+ # qhasm: z12 = mem128[input_1]; input_1 += 16
200
+ # asm 1: vld1.8 {>z12=reg128#5%bot->z12=reg128#5%top},[<input_1=int32#2]!
201
+ # asm 2: vld1.8 {>z12=d8->z12=d9},[<input_1=r1]!
202
+ vld1.8 {d8-d9},[r1]!
203
+
204
+ # qhasm: z34 = mem128[input_1]; input_1 += 16
205
+ # asm 1: vld1.8 {>z34=reg128#6%bot->z34=reg128#6%top},[<input_1=int32#2]!
206
+ # asm 2: vld1.8 {>z34=d10->z34=d11},[<input_1=r1]!
207
+ vld1.8 {d10-d11},[r1]!
208
+
209
+ # qhasm: 2x mask = 0xffffffff
210
+ # asm 1: vmov.i64 >mask=reg128#7,#0xffffffff
211
+ # asm 2: vmov.i64 >mask=q6,#0xffffffff
212
+ vmov.i64 q6,#0xffffffff
213
+
214
+ # qhasm: 2x u4 = 0xff
215
+ # asm 1: vmov.i64 >u4=reg128#8,#0xff
216
+ # asm 2: vmov.i64 >u4=q7,#0xff
217
+ vmov.i64 q7,#0xff
218
+
219
+ # qhasm: x01 aligned= mem128[input_0];input_0+=16
220
+ # asm 1: vld1.8 {>x01=reg128#9%bot->x01=reg128#9%top},[<input_0=int32#1,: 128]!
221
+ # asm 2: vld1.8 {>x01=d16->x01=d17},[<input_0=r0,: 128]!
222
+ vld1.8 {d16-d17},[r0,: 128]!
223
+
224
+ # qhasm: x23 aligned= mem128[input_0];input_0+=16
225
+ # asm 1: vld1.8 {>x23=reg128#10%bot->x23=reg128#10%top},[<input_0=int32#1,: 128]!
226
+ # asm 2: vld1.8 {>x23=d18->x23=d19},[<input_0=r0,: 128]!
227
+ vld1.8 {d18-d19},[r0,: 128]!
228
+
229
+ # qhasm: x4 aligned= mem64[input_0]x4[1]
230
+ # asm 1: vld1.8 {<x4=reg128#11%bot},[<input_0=int32#1,: 64]
231
+ # asm 2: vld1.8 {<x4=d20},[<input_0=r0,: 64]
232
+ vld1.8 {d20},[r0,: 64]
233
+
234
+ # qhasm: input_0 -= 32
235
+ # asm 1: sub >input_0=int32#1,<input_0=int32#1,#32
236
+ # asm 2: sub >input_0=r0,<input_0=r0,#32
237
+ sub r0,r0,#32
238
+
239
+ # qhasm: 2x mask unsigned>>=6
240
+ # asm 1: vshr.u64 >mask=reg128#7,<mask=reg128#7,#6
241
+ # asm 2: vshr.u64 >mask=q6,<mask=q6,#6
242
+ vshr.u64 q6,q6,#6
243
+
244
+ # qhasm: 2x u4 unsigned>>= 7
245
+ # asm 1: vshr.u64 >u4=reg128#8,<u4=reg128#8,#7
246
+ # asm 2: vshr.u64 >u4=q7,<u4=q7,#7
247
+ vshr.u64 q7,q7,#7
248
+
249
+ # qhasm: 4x 5y12 = y12 << 2
250
+ # asm 1: vshl.i32 >5y12=reg128#12,<y12=reg128#2,#2
251
+ # asm 2: vshl.i32 >5y12=q11,<y12=q1,#2
252
+ vshl.i32 q11,q1,#2
253
+
254
+ # qhasm: 4x 5y34 = y34 << 2
255
+ # asm 1: vshl.i32 >5y34=reg128#13,<y34=reg128#3,#2
256
+ # asm 2: vshl.i32 >5y34=q12,<y34=q2,#2
257
+ vshl.i32 q12,q2,#2
258
+
259
+ # qhasm: 4x 5y12 += y12
260
+ # asm 1: vadd.i32 >5y12=reg128#12,<5y12=reg128#12,<y12=reg128#2
261
+ # asm 2: vadd.i32 >5y12=q11,<5y12=q11,<y12=q1
262
+ vadd.i32 q11,q11,q1
263
+
264
+ # qhasm: 4x 5y34 += y34
265
+ # asm 1: vadd.i32 >5y34=reg128#13,<5y34=reg128#13,<y34=reg128#3
266
+ # asm 2: vadd.i32 >5y34=q12,<5y34=q12,<y34=q2
267
+ vadd.i32 q12,q12,q2
268
+
269
+ # qhasm: 2x u4 <<= 24
270
+ # asm 1: vshl.i64 >u4=reg128#8,<u4=reg128#8,#24
271
+ # asm 2: vshl.i64 >u4=q7,<u4=q7,#24
272
+ vshl.i64 q7,q7,#24
273
+
274
+ # qhasm: 4x 5z12 = z12 << 2
275
+ # asm 1: vshl.i32 >5z12=reg128#14,<z12=reg128#5,#2
276
+ # asm 2: vshl.i32 >5z12=q13,<z12=q4,#2
277
+ vshl.i32 q13,q4,#2
278
+
279
+ # qhasm: 4x 5z34 = z34 << 2
280
+ # asm 1: vshl.i32 >5z34=reg128#15,<z34=reg128#6,#2
281
+ # asm 2: vshl.i32 >5z34=q14,<z34=q5,#2
282
+ vshl.i32 q14,q5,#2
283
+
284
+ # qhasm: 4x 5z12 += z12
285
+ # asm 1: vadd.i32 >5z12=reg128#14,<5z12=reg128#14,<z12=reg128#5
286
+ # asm 2: vadd.i32 >5z12=q13,<5z12=q13,<z12=q4
287
+ vadd.i32 q13,q13,q4
288
+
289
+ # qhasm: 4x 5z34 += z34
290
+ # asm 1: vadd.i32 >5z34=reg128#15,<5z34=reg128#15,<z34=reg128#6
291
+ # asm 2: vadd.i32 >5z34=q14,<5z34=q14,<z34=q5
292
+ vadd.i32 q14,q14,q5
293
+
294
+ # qhasm: new two24
295
+
296
+ # qhasm: new y0_stack
297
+
298
+ # qhasm: new y12_stack
299
+
300
+ # qhasm: new y34_stack
301
+
302
+ # qhasm: new 5y12_stack
303
+
304
+ # qhasm: new 5y34_stack
305
+
306
+ # qhasm: new z0_stack
307
+
308
+ # qhasm: new z12_stack
309
+
310
+ # qhasm: new z34_stack
311
+
312
+ # qhasm: new 5z12_stack
313
+
314
+ # qhasm: new 5z34_stack
315
+
316
+ # qhasm: ptr = &two24
317
+ # asm 1: lea >ptr=int32#2,<two24=stack128#1
318
+ # asm 2: lea >ptr=r1,<two24=[sp,#0]
319
+ add r1,sp,#0
320
+
321
+ # qhasm: mem128[ptr] aligned= u4
322
+ # asm 1: vst1.8 {<u4=reg128#8%bot-<u4=reg128#8%top},[<ptr=int32#2,: 128]
323
+ # asm 2: vst1.8 {<u4=d14-<u4=d15},[<ptr=r1,: 128]
324
+ vst1.8 {d14-d15},[r1,: 128]
325
+
326
+ # qhasm: r4 = u4
327
+ # asm 1: vmov >r4=reg128#16,<u4=reg128#8
328
+ # asm 2: vmov >r4=q15,<u4=q7
329
+ vmov q15,q7
330
+
331
+ # qhasm: r0 = u4
332
+ # asm 1: vmov >r0=reg128#8,<u4=reg128#8
333
+ # asm 2: vmov >r0=q7,<u4=q7
334
+ vmov q7,q7
335
+
336
+ # qhasm: ptr = &y0_stack
337
+ # asm 1: lea >ptr=int32#2,<y0_stack=stack128#2
338
+ # asm 2: lea >ptr=r1,<y0_stack=[sp,#16]
339
+ add r1,sp,#16
340
+
341
+ # qhasm: mem128[ptr] aligned= y0
342
+ # asm 1: vst1.8 {<y0=reg128#1%bot-<y0=reg128#1%top},[<ptr=int32#2,: 128]
343
+ # asm 2: vst1.8 {<y0=d0-<y0=d1},[<ptr=r1,: 128]
344
+ vst1.8 {d0-d1},[r1,: 128]
345
+
346
+ # qhasm: ptr = &y12_stack
347
+ # asm 1: lea >ptr=int32#2,<y12_stack=stack128#3
348
+ # asm 2: lea >ptr=r1,<y12_stack=[sp,#32]
349
+ add r1,sp,#32
350
+
351
+ # qhasm: mem128[ptr] aligned= y12
352
+ # asm 1: vst1.8 {<y12=reg128#2%bot-<y12=reg128#2%top},[<ptr=int32#2,: 128]
353
+ # asm 2: vst1.8 {<y12=d2-<y12=d3},[<ptr=r1,: 128]
354
+ vst1.8 {d2-d3},[r1,: 128]
355
+
356
+ # qhasm: ptr = &y34_stack
357
+ # asm 1: lea >ptr=int32#2,<y34_stack=stack128#4
358
+ # asm 2: lea >ptr=r1,<y34_stack=[sp,#48]
359
+ add r1,sp,#48
360
+
361
+ # qhasm: mem128[ptr] aligned= y34
362
+ # asm 1: vst1.8 {<y34=reg128#3%bot-<y34=reg128#3%top},[<ptr=int32#2,: 128]
363
+ # asm 2: vst1.8 {<y34=d4-<y34=d5},[<ptr=r1,: 128]
364
+ vst1.8 {d4-d5},[r1,: 128]
365
+
366
+ # qhasm: ptr = &z0_stack
367
+ # asm 1: lea >ptr=int32#2,<z0_stack=stack128#7
368
+ # asm 2: lea >ptr=r1,<z0_stack=[sp,#96]
369
+ add r1,sp,#96
370
+
371
+ # qhasm: mem128[ptr] aligned= z0
372
+ # asm 1: vst1.8 {<z0=reg128#4%bot-<z0=reg128#4%top},[<ptr=int32#2,: 128]
373
+ # asm 2: vst1.8 {<z0=d6-<z0=d7},[<ptr=r1,: 128]
374
+ vst1.8 {d6-d7},[r1,: 128]
375
+
376
+ # qhasm: ptr = &z12_stack
377
+ # asm 1: lea >ptr=int32#2,<z12_stack=stack128#8
378
+ # asm 2: lea >ptr=r1,<z12_stack=[sp,#112]
379
+ add r1,sp,#112
380
+
381
+ # qhasm: mem128[ptr] aligned= z12
382
+ # asm 1: vst1.8 {<z12=reg128#5%bot-<z12=reg128#5%top},[<ptr=int32#2,: 128]
383
+ # asm 2: vst1.8 {<z12=d8-<z12=d9},[<ptr=r1,: 128]
384
+ vst1.8 {d8-d9},[r1,: 128]
385
+
386
+ # qhasm: ptr = &z34_stack
387
+ # asm 1: lea >ptr=int32#2,<z34_stack=stack128#9
388
+ # asm 2: lea >ptr=r1,<z34_stack=[sp,#128]
389
+ add r1,sp,#128
390
+
391
+ # qhasm: mem128[ptr] aligned= z34
392
+ # asm 1: vst1.8 {<z34=reg128#6%bot-<z34=reg128#6%top},[<ptr=int32#2,: 128]
393
+ # asm 2: vst1.8 {<z34=d10-<z34=d11},[<ptr=r1,: 128]
394
+ vst1.8 {d10-d11},[r1,: 128]
395
+
396
+ # qhasm: ptr = &5y12_stack
397
+ # asm 1: lea >ptr=int32#2,<5y12_stack=stack128#5
398
+ # asm 2: lea >ptr=r1,<5y12_stack=[sp,#64]
399
+ add r1,sp,#64
400
+
401
+ # qhasm: mem128[ptr] aligned= 5y12
402
+ # asm 1: vst1.8 {<5y12=reg128#12%bot-<5y12=reg128#12%top},[<ptr=int32#2,: 128]
403
+ # asm 2: vst1.8 {<5y12=d22-<5y12=d23},[<ptr=r1,: 128]
404
+ vst1.8 {d22-d23},[r1,: 128]
405
+
406
+ # qhasm: ptr = &5y34_stack
407
+ # asm 1: lea >ptr=int32#2,<5y34_stack=stack128#6
408
+ # asm 2: lea >ptr=r1,<5y34_stack=[sp,#80]
409
+ add r1,sp,#80
410
+
411
+ # qhasm: mem128[ptr] aligned= 5y34
412
+ # asm 1: vst1.8 {<5y34=reg128#13%bot-<5y34=reg128#13%top},[<ptr=int32#2,: 128]
413
+ # asm 2: vst1.8 {<5y34=d24-<5y34=d25},[<ptr=r1,: 128]
414
+ vst1.8 {d24-d25},[r1,: 128]
415
+
416
+ # qhasm: ptr = &5z12_stack
417
+ # asm 1: lea >ptr=int32#2,<5z12_stack=stack128#10
418
+ # asm 2: lea >ptr=r1,<5z12_stack=[sp,#144]
419
+ add r1,sp,#144
420
+
421
+ # qhasm: mem128[ptr] aligned= 5z12
422
+ # asm 1: vst1.8 {<5z12=reg128#14%bot-<5z12=reg128#14%top},[<ptr=int32#2,: 128]
423
+ # asm 2: vst1.8 {<5z12=d26-<5z12=d27},[<ptr=r1,: 128]
424
+ vst1.8 {d26-d27},[r1,: 128]
425
+
426
+ # qhasm: ptr = &5z34_stack
427
+ # asm 1: lea >ptr=int32#2,<5z34_stack=stack128#11
428
+ # asm 2: lea >ptr=r1,<5z34_stack=[sp,#160]
429
+ add r1,sp,#160
430
+
431
+ # qhasm: mem128[ptr] aligned= 5z34
432
+ # asm 1: vst1.8 {<5z34=reg128#15%bot-<5z34=reg128#15%top},[<ptr=int32#2,: 128]
433
+ # asm 2: vst1.8 {<5z34=d28-<5z34=d29},[<ptr=r1,: 128]
434
+ vst1.8 {d28-d29},[r1,: 128]
435
+
436
+ # qhasm: unsigned>? len - 64
437
+ # asm 1: cmp <len=int32#4,#64
438
+ # asm 2: cmp <len=r3,#64
439
+ cmp r3,#64
440
+
441
+ # qhasm: goto below64bytes if !unsigned>
442
+ bls ._below64bytes
443
+
444
+ # qhasm: input_2 += 32
445
+ # asm 1: add >input_2=int32#2,<input_2=int32#3,#32
446
+ # asm 2: add >input_2=r1,<input_2=r2,#32
447
+ add r1,r2,#32
448
+
449
+ # qhasm: mainloop2:
450
+ ._mainloop2:
451
+
452
+ # qhasm: c01 = mem128[input_2];input_2+=16
453
+ # asm 1: vld1.8 {>c01=reg128#1%bot->c01=reg128#1%top},[<input_2=int32#2]!
454
+ # asm 2: vld1.8 {>c01=d0->c01=d1},[<input_2=r1]!
455
+ vld1.8 {d0-d1},[r1]!
456
+
457
+ # qhasm: c23 = mem128[input_2];input_2+=16
458
+ # asm 1: vld1.8 {>c23=reg128#2%bot->c23=reg128#2%top},[<input_2=int32#2]!
459
+ # asm 2: vld1.8 {>c23=d2->c23=d3},[<input_2=r1]!
460
+ vld1.8 {d2-d3},[r1]!
461
+
462
+ # qhasm: r4[0,1] += x01[0] unsigned* z34[2]; r4[2,3] += x01[1] unsigned* z34[3]
463
+ # asm 1: vmlal.u32 <r4=reg128#16,<x01=reg128#9%bot,<z34=reg128#6%top
464
+ # asm 2: vmlal.u32 <r4=q15,<x01=d16,<z34=d11
465
+ vmlal.u32 q15,d16,d11
466
+
467
+ # qhasm: ptr = &z12_stack
468
+ # asm 1: lea >ptr=int32#3,<z12_stack=stack128#8
469
+ # asm 2: lea >ptr=r2,<z12_stack=[sp,#112]
470
+ add r2,sp,#112
471
+
472
+ # qhasm: z12 aligned= mem128[ptr]
473
+ # asm 1: vld1.8 {>z12=reg128#3%bot->z12=reg128#3%top},[<ptr=int32#3,: 128]
474
+ # asm 2: vld1.8 {>z12=d4->z12=d5},[<ptr=r2,: 128]
475
+ vld1.8 {d4-d5},[r2,: 128]
476
+
477
+ # qhasm: r4[0,1] += x01[2] unsigned* z34[0]; r4[2,3] += x01[3] unsigned* z34[1]
478
+ # asm 1: vmlal.u32 <r4=reg128#16,<x01=reg128#9%top,<z34=reg128#6%bot
479
+ # asm 2: vmlal.u32 <r4=q15,<x01=d17,<z34=d10
480
+ vmlal.u32 q15,d17,d10
481
+
482
+ # qhasm: ptr = &z0_stack
483
+ # asm 1: lea >ptr=int32#3,<z0_stack=stack128#7
484
+ # asm 2: lea >ptr=r2,<z0_stack=[sp,#96]
485
+ add r2,sp,#96
486
+
487
+ # qhasm: z0 aligned= mem128[ptr]
488
+ # asm 1: vld1.8 {>z0=reg128#4%bot->z0=reg128#4%top},[<ptr=int32#3,: 128]
489
+ # asm 2: vld1.8 {>z0=d6->z0=d7},[<ptr=r2,: 128]
490
+ vld1.8 {d6-d7},[r2,: 128]
491
+
492
+ # qhasm: r4[0,1] += x23[0] unsigned* z12[2]; r4[2,3] += x23[1] unsigned* z12[3]
493
+ # asm 1: vmlal.u32 <r4=reg128#16,<x23=reg128#10%bot,<z12=reg128#3%top
494
+ # asm 2: vmlal.u32 <r4=q15,<x23=d18,<z12=d5
495
+ vmlal.u32 q15,d18,d5
496
+
497
+ # qhasm: c01 c23 = c01[0]c01[1]c01[2]c23[2]c23[0]c23[1]c01[3]c23[3]
498
+ # asm 1: vtrn.32 <c01=reg128#1%top,<c23=reg128#2%top
499
+ # asm 2: vtrn.32 <c01=d1,<c23=d3
500
+ vtrn.32 d1,d3
501
+
502
+ # qhasm: r4[0,1] += x23[2] unsigned* z12[0]; r4[2,3] += x23[3] unsigned* z12[1]
503
+ # asm 1: vmlal.u32 <r4=reg128#16,<x23=reg128#10%top,<z12=reg128#3%bot
504
+ # asm 2: vmlal.u32 <r4=q15,<x23=d19,<z12=d4
505
+ vmlal.u32 q15,d19,d4
506
+
507
+ # qhasm: r4[0,1] += x4[0] unsigned* z0[0]; r4[2,3] += x4[1] unsigned* z0[1]
508
+ # asm 1: vmlal.u32 <r4=reg128#16,<x4=reg128#11%bot,<z0=reg128#4%bot
509
+ # asm 2: vmlal.u32 <r4=q15,<x4=d20,<z0=d6
510
+ vmlal.u32 q15,d20,d6
511
+
512
+ # qhasm: r3[0,1] = c23[2]<<18; r3[2,3] = c23[3]<<18
513
+ # asm 1: vshll.u32 >r3=reg128#5,<c23=reg128#2%top,#18
514
+ # asm 2: vshll.u32 >r3=q4,<c23=d3,#18
515
+ vshll.u32 q4,d3,#18
516
+
517
+ # qhasm: c01 c23 = c01[0]c23[0]c01[2]c01[3]c01[1]c23[1]c23[2]c23[3]
518
+ # asm 1: vtrn.32 <c01=reg128#1%bot,<c23=reg128#2%bot
519
+ # asm 2: vtrn.32 <c01=d0,<c23=d2
520
+ vtrn.32 d0,d2
521
+
522
+ # qhasm: r3[0,1] += x01[0] unsigned* z34[0]; r3[2,3] += x01[1] unsigned* z34[1]
523
+ # asm 1: vmlal.u32 <r3=reg128#5,<x01=reg128#9%bot,<z34=reg128#6%bot
524
+ # asm 2: vmlal.u32 <r3=q4,<x01=d16,<z34=d10
525
+ vmlal.u32 q4,d16,d10
526
+
527
+ # qhasm: r3[0,1] += x01[2] unsigned* z12[2]; r3[2,3] += x01[3] unsigned* z12[3]
528
+ # asm 1: vmlal.u32 <r3=reg128#5,<x01=reg128#9%top,<z12=reg128#3%top
529
+ # asm 2: vmlal.u32 <r3=q4,<x01=d17,<z12=d5
530
+ vmlal.u32 q4,d17,d5
531
+
532
+ # qhasm: r0 = r0[1]c01[0]r0[2,3]
533
+ # asm 1: vext.32 <r0=reg128#8%bot,<r0=reg128#8%bot,<c01=reg128#1%bot,#1
534
+ # asm 2: vext.32 <r0=d14,<r0=d14,<c01=d0,#1
535
+ vext.32 d14,d14,d0,#1
536
+
537
+ # qhasm: r3[0,1] += x23[0] unsigned* z12[0]; r3[2,3] += x23[1] unsigned* z12[1]
538
+ # asm 1: vmlal.u32 <r3=reg128#5,<x23=reg128#10%bot,<z12=reg128#3%bot
539
+ # asm 2: vmlal.u32 <r3=q4,<x23=d18,<z12=d4
540
+ vmlal.u32 q4,d18,d4
541
+
542
+ # qhasm: input_2 -= 64
543
+ # asm 1: sub >input_2=int32#2,<input_2=int32#2,#64
544
+ # asm 2: sub >input_2=r1,<input_2=r1,#64
545
+ sub r1,r1,#64
546
+
547
+ # qhasm: r3[0,1] += x23[2] unsigned* z0[0]; r3[2,3] += x23[3] unsigned* z0[1]
548
+ # asm 1: vmlal.u32 <r3=reg128#5,<x23=reg128#10%top,<z0=reg128#4%bot
549
+ # asm 2: vmlal.u32 <r3=q4,<x23=d19,<z0=d6
550
+ vmlal.u32 q4,d19,d6
551
+
552
+ # qhasm: ptr = &5z34_stack
553
+ # asm 1: lea >ptr=int32#3,<5z34_stack=stack128#11
554
+ # asm 2: lea >ptr=r2,<5z34_stack=[sp,#160]
555
+ add r2,sp,#160
556
+
557
+ # qhasm: 5z34 aligned= mem128[ptr]
558
+ # asm 1: vld1.8 {>5z34=reg128#6%bot->5z34=reg128#6%top},[<ptr=int32#3,: 128]
559
+ # asm 2: vld1.8 {>5z34=d10->5z34=d11},[<ptr=r2,: 128]
560
+ vld1.8 {d10-d11},[r2,: 128]
561
+
562
+ # qhasm: r3[0,1] += x4[0] unsigned* 5z34[2]; r3[2,3] += x4[1] unsigned* 5z34[3]
563
+ # asm 1: vmlal.u32 <r3=reg128#5,<x4=reg128#11%bot,<5z34=reg128#6%top
564
+ # asm 2: vmlal.u32 <r3=q4,<x4=d20,<5z34=d11
565
+ vmlal.u32 q4,d20,d11
566
+
567
+ # qhasm: r0 = r0[1]r0[0]r0[3]r0[2]
568
+ # asm 1: vrev64.i32 >r0=reg128#8,<r0=reg128#8
569
+ # asm 2: vrev64.i32 >r0=q7,<r0=q7
570
+ vrev64.i32 q7,q7
571
+
572
+ # qhasm: r2[0,1] = c01[2]<<12; r2[2,3] = c01[3]<<12
573
+ # asm 1: vshll.u32 >r2=reg128#14,<c01=reg128#1%top,#12
574
+ # asm 2: vshll.u32 >r2=q13,<c01=d1,#12
575
+ vshll.u32 q13,d1,#12
576
+
577
+ # qhasm: d01 = mem128[input_2];input_2+=16
578
+ # asm 1: vld1.8 {>d01=reg128#12%bot->d01=reg128#12%top},[<input_2=int32#2]!
579
+ # asm 2: vld1.8 {>d01=d22->d01=d23},[<input_2=r1]!
580
+ vld1.8 {d22-d23},[r1]!
581
+
582
+ # qhasm: r2[0,1] += x01[0] unsigned* z12[2]; r2[2,3] += x01[1] unsigned* z12[3]
583
+ # asm 1: vmlal.u32 <r2=reg128#14,<x01=reg128#9%bot,<z12=reg128#3%top
584
+ # asm 2: vmlal.u32 <r2=q13,<x01=d16,<z12=d5
585
+ vmlal.u32 q13,d16,d5
586
+
587
+ # qhasm: r2[0,1] += x01[2] unsigned* z12[0]; r2[2,3] += x01[3] unsigned* z12[1]
588
+ # asm 1: vmlal.u32 <r2=reg128#14,<x01=reg128#9%top,<z12=reg128#3%bot
589
+ # asm 2: vmlal.u32 <r2=q13,<x01=d17,<z12=d4
590
+ vmlal.u32 q13,d17,d4
591
+
592
+ # qhasm: r2[0,1] += x23[0] unsigned* z0[0]; r2[2,3] += x23[1] unsigned* z0[1]
593
+ # asm 1: vmlal.u32 <r2=reg128#14,<x23=reg128#10%bot,<z0=reg128#4%bot
594
+ # asm 2: vmlal.u32 <r2=q13,<x23=d18,<z0=d6
595
+ vmlal.u32 q13,d18,d6
596
+
597
+ # qhasm: r2[0,1] += x23[2] unsigned* 5z34[2]; r2[2,3] += x23[3] unsigned* 5z34[3]
598
+ # asm 1: vmlal.u32 <r2=reg128#14,<x23=reg128#10%top,<5z34=reg128#6%top
599
+ # asm 2: vmlal.u32 <r2=q13,<x23=d19,<5z34=d11
600
+ vmlal.u32 q13,d19,d11
601
+
602
+ # qhasm: r2[0,1] += x4[0] unsigned* 5z34[0]; r2[2,3] += x4[1] unsigned* 5z34[1]
603
+ # asm 1: vmlal.u32 <r2=reg128#14,<x4=reg128#11%bot,<5z34=reg128#6%bot
604
+ # asm 2: vmlal.u32 <r2=q13,<x4=d20,<5z34=d10
605
+ vmlal.u32 q13,d20,d10
606
+
607
+ # qhasm: r0 = r0[0,1]c01[1]r0[2]
608
+ # asm 1: vext.32 <r0=reg128#8%top,<c01=reg128#1%bot,<r0=reg128#8%top,#1
609
+ # asm 2: vext.32 <r0=d15,<c01=d0,<r0=d15,#1
610
+ vext.32 d15,d0,d15,#1
611
+
612
+ # qhasm: r1[0,1] = c23[0]<<6; r1[2,3] = c23[1]<<6
613
+ # asm 1: vshll.u32 >r1=reg128#15,<c23=reg128#2%bot,#6
614
+ # asm 2: vshll.u32 >r1=q14,<c23=d2,#6
615
+ vshll.u32 q14,d2,#6
616
+
617
+ # qhasm: r1[0,1] += x01[0] unsigned* z12[0]; r1[2,3] += x01[1] unsigned* z12[1]
618
+ # asm 1: vmlal.u32 <r1=reg128#15,<x01=reg128#9%bot,<z12=reg128#3%bot
619
+ # asm 2: vmlal.u32 <r1=q14,<x01=d16,<z12=d4
620
+ vmlal.u32 q14,d16,d4
621
+
622
+ # qhasm: r1[0,1] += x01[2] unsigned* z0[0]; r1[2,3] += x01[3] unsigned* z0[1]
623
+ # asm 1: vmlal.u32 <r1=reg128#15,<x01=reg128#9%top,<z0=reg128#4%bot
624
+ # asm 2: vmlal.u32 <r1=q14,<x01=d17,<z0=d6
625
+ vmlal.u32 q14,d17,d6
626
+
627
+ # qhasm: r1[0,1] += x23[0] unsigned* 5z34[2]; r1[2,3] += x23[1] unsigned* 5z34[3]
628
+ # asm 1: vmlal.u32 <r1=reg128#15,<x23=reg128#10%bot,<5z34=reg128#6%top
629
+ # asm 2: vmlal.u32 <r1=q14,<x23=d18,<5z34=d11
630
+ vmlal.u32 q14,d18,d11
631
+
632
+ # qhasm: r1[0,1] += x23[2] unsigned* 5z34[0]; r1[2,3] += x23[3] unsigned* 5z34[1]
633
+ # asm 1: vmlal.u32 <r1=reg128#15,<x23=reg128#10%top,<5z34=reg128#6%bot
634
+ # asm 2: vmlal.u32 <r1=q14,<x23=d19,<5z34=d10
635
+ vmlal.u32 q14,d19,d10
636
+
637
+ # qhasm: ptr = &5z12_stack
638
+ # asm 1: lea >ptr=int32#3,<5z12_stack=stack128#10
639
+ # asm 2: lea >ptr=r2,<5z12_stack=[sp,#144]
640
+ add r2,sp,#144
641
+
642
+ # qhasm: 5z12 aligned= mem128[ptr]
643
+ # asm 1: vld1.8 {>5z12=reg128#1%bot->5z12=reg128#1%top},[<ptr=int32#3,: 128]
644
+ # asm 2: vld1.8 {>5z12=d0->5z12=d1},[<ptr=r2,: 128]
645
+ vld1.8 {d0-d1},[r2,: 128]
646
+
647
+ # qhasm: r1[0,1] += x4[0] unsigned* 5z12[2]; r1[2,3] += x4[1] unsigned* 5z12[3]
648
+ # asm 1: vmlal.u32 <r1=reg128#15,<x4=reg128#11%bot,<5z12=reg128#1%top
649
+ # asm 2: vmlal.u32 <r1=q14,<x4=d20,<5z12=d1
650
+ vmlal.u32 q14,d20,d1
651
+
652
+ # qhasm: d23 = mem128[input_2];input_2+=16
653
+ # asm 1: vld1.8 {>d23=reg128#2%bot->d23=reg128#2%top},[<input_2=int32#2]!
654
+ # asm 2: vld1.8 {>d23=d2->d23=d3},[<input_2=r1]!
655
+ vld1.8 {d2-d3},[r1]!
656
+
657
+ # qhasm: input_2 += 32
658
+ # asm 1: add >input_2=int32#2,<input_2=int32#2,#32
659
+ # asm 2: add >input_2=r1,<input_2=r1,#32
660
+ add r1,r1,#32
661
+
662
+ # qhasm: r0[0,1] += x4[0] unsigned* 5z12[0]; r0[2,3] += x4[1] unsigned* 5z12[1]
663
+ # asm 1: vmlal.u32 <r0=reg128#8,<x4=reg128#11%bot,<5z12=reg128#1%bot
664
+ # asm 2: vmlal.u32 <r0=q7,<x4=d20,<5z12=d0
665
+ vmlal.u32 q7,d20,d0
666
+
667
+ # qhasm: r0[0,1] += x23[0] unsigned* 5z34[0]; r0[2,3] += x23[1] unsigned* 5z34[1]
668
+ # asm 1: vmlal.u32 <r0=reg128#8,<x23=reg128#10%bot,<5z34=reg128#6%bot
669
+ # asm 2: vmlal.u32 <r0=q7,<x23=d18,<5z34=d10
670
+ vmlal.u32 q7,d18,d10
671
+
672
+ # qhasm: d01 d23 = d01[0] d23[0] d01[1] d23[1]
673
+ # asm 1: vswp <d23=reg128#2%bot,<d01=reg128#12%top
674
+ # asm 2: vswp <d23=d2,<d01=d23
675
+ vswp d2,d23
676
+
677
+ # qhasm: r0[0,1] += x23[2] unsigned* 5z12[2]; r0[2,3] += x23[3] unsigned* 5z12[3]
678
+ # asm 1: vmlal.u32 <r0=reg128#8,<x23=reg128#10%top,<5z12=reg128#1%top
679
+ # asm 2: vmlal.u32 <r0=q7,<x23=d19,<5z12=d1
680
+ vmlal.u32 q7,d19,d1
681
+
682
+ # qhasm: r0[0,1] += x01[0] unsigned* z0[0]; r0[2,3] += x01[1] unsigned* z0[1]
683
+ # asm 1: vmlal.u32 <r0=reg128#8,<x01=reg128#9%bot,<z0=reg128#4%bot
684
+ # asm 2: vmlal.u32 <r0=q7,<x01=d16,<z0=d6
685
+ vmlal.u32 q7,d16,d6
686
+
687
+ # qhasm: new mid
688
+
689
+ # qhasm: 2x v4 = d23 unsigned>> 40
690
+ # asm 1: vshr.u64 >v4=reg128#4,<d23=reg128#2,#40
691
+ # asm 2: vshr.u64 >v4=q3,<d23=q1,#40
692
+ vshr.u64 q3,q1,#40
693
+
694
+ # qhasm: mid = d01[1]d23[0] mid[2,3]
695
+ # asm 1: vext.32 <mid=reg128#1%bot,<d01=reg128#12%bot,<d23=reg128#2%bot,#1
696
+ # asm 2: vext.32 <mid=d0,<d01=d22,<d23=d2,#1
697
+ vext.32 d0,d22,d2,#1
698
+
699
+ # qhasm: new v23
700
+
701
+ # qhasm: v23[2] = d23[0,1] unsigned>> 14; v23[3] = d23[2,3] unsigned>> 14
702
+ # asm 1: vshrn.u64 <v23=reg128#10%top,<d23=reg128#2,#14
703
+ # asm 2: vshrn.u64 <v23=d19,<d23=q1,#14
704
+ vshrn.u64 d19,q1,#14
705
+
706
+ # qhasm: mid = mid[0,1] d01[3]d23[2]
707
+ # asm 1: vext.32 <mid=reg128#1%top,<d01=reg128#12%top,<d23=reg128#2%top,#1
708
+ # asm 2: vext.32 <mid=d1,<d01=d23,<d23=d3,#1
709
+ vext.32 d1,d23,d3,#1
710
+
711
+ # qhasm: new v01
712
+
713
+ # qhasm: v01[2] = d01[0,1] unsigned>> 26; v01[3] = d01[2,3] unsigned>> 26
714
+ # asm 1: vshrn.u64 <v01=reg128#11%top,<d01=reg128#12,#26
715
+ # asm 2: vshrn.u64 <v01=d21,<d01=q11,#26
716
+ vshrn.u64 d21,q11,#26
717
+
718
+ # qhasm: v01 = d01[1]d01[0] v01[2,3]
719
+ # asm 1: vext.32 <v01=reg128#11%bot,<d01=reg128#12%bot,<d01=reg128#12%bot,#1
720
+ # asm 2: vext.32 <v01=d20,<d01=d22,<d01=d22,#1
721
+ vext.32 d20,d22,d22,#1
722
+
723
+ # qhasm: r0[0,1] += x01[2] unsigned* 5z34[2]; r0[2,3] += x01[3] unsigned* 5z34[3]
724
+ # asm 1: vmlal.u32 <r0=reg128#8,<x01=reg128#9%top,<5z34=reg128#6%top
725
+ # asm 2: vmlal.u32 <r0=q7,<x01=d17,<5z34=d11
726
+ vmlal.u32 q7,d17,d11
727
+
728
+ # qhasm: v01 = v01[1]d01[2] v01[2,3]
729
+ # asm 1: vext.32 <v01=reg128#11%bot,<v01=reg128#11%bot,<d01=reg128#12%top,#1
730
+ # asm 2: vext.32 <v01=d20,<v01=d20,<d01=d23,#1
731
+ vext.32 d20,d20,d23,#1
732
+
733
+ # qhasm: v23[0] = mid[0,1] unsigned>> 20; v23[1] = mid[2,3] unsigned>> 20
734
+ # asm 1: vshrn.u64 <v23=reg128#10%bot,<mid=reg128#1,#20
735
+ # asm 2: vshrn.u64 <v23=d18,<mid=q0,#20
736
+ vshrn.u64 d18,q0,#20
737
+
738
+ # qhasm: v4 = v4[0]v4[2]v4[1]v4[3]
739
+ # asm 1: vtrn.32 <v4=reg128#4%bot,<v4=reg128#4%top
740
+ # asm 2: vtrn.32 <v4=d6,<v4=d7
741
+ vtrn.32 d6,d7
742
+
743
+ # qhasm: 4x v01 &= 0x03ffffff
744
+ # asm 1: vand.i32 <v01=reg128#11,#0x03ffffff
745
+ # asm 2: vand.i32 <v01=q10,#0x03ffffff
746
+ vand.i32 q10,#0x03ffffff
747
+
748
+ # qhasm: ptr = &y34_stack
749
+ # asm 1: lea >ptr=int32#3,<y34_stack=stack128#4
750
+ # asm 2: lea >ptr=r2,<y34_stack=[sp,#48]
751
+ add r2,sp,#48
752
+
753
+ # qhasm: y34 aligned= mem128[ptr]
754
+ # asm 1: vld1.8 {>y34=reg128#3%bot->y34=reg128#3%top},[<ptr=int32#3,: 128]
755
+ # asm 2: vld1.8 {>y34=d4->y34=d5},[<ptr=r2,: 128]
756
+ vld1.8 {d4-d5},[r2,: 128]
757
+
758
+ # qhasm: 4x v23 &= 0x03ffffff
759
+ # asm 1: vand.i32 <v23=reg128#10,#0x03ffffff
760
+ # asm 2: vand.i32 <v23=q9,#0x03ffffff
761
+ vand.i32 q9,#0x03ffffff
762
+
763
+ # qhasm: ptr = &y12_stack
764
+ # asm 1: lea >ptr=int32#3,<y12_stack=stack128#3
765
+ # asm 2: lea >ptr=r2,<y12_stack=[sp,#32]
766
+ add r2,sp,#32
767
+
768
+ # qhasm: y12 aligned= mem128[ptr]
769
+ # asm 1: vld1.8 {>y12=reg128#2%bot->y12=reg128#2%top},[<ptr=int32#3,: 128]
770
+ # asm 2: vld1.8 {>y12=d2->y12=d3},[<ptr=r2,: 128]
771
+ vld1.8 {d2-d3},[r2,: 128]
772
+
773
+ # qhasm: 4x v4 |= 0x01000000
774
+ # asm 1: vorr.i32 <v4=reg128#4,#0x01000000
775
+ # asm 2: vorr.i32 <v4=q3,#0x01000000
776
+ vorr.i32 q3,#0x01000000
777
+
778
+ # qhasm: ptr = &y0_stack
779
+ # asm 1: lea >ptr=int32#3,<y0_stack=stack128#2
780
+ # asm 2: lea >ptr=r2,<y0_stack=[sp,#16]
781
+ add r2,sp,#16
782
+
783
+ # qhasm: y0 aligned= mem128[ptr]
784
+ # asm 1: vld1.8 {>y0=reg128#1%bot->y0=reg128#1%top},[<ptr=int32#3,: 128]
785
+ # asm 2: vld1.8 {>y0=d0->y0=d1},[<ptr=r2,: 128]
786
+ vld1.8 {d0-d1},[r2,: 128]
787
+
788
+ # qhasm: r4[0,1] += v01[0] unsigned* y34[2]; r4[2,3] += v01[1] unsigned* y34[3]
789
+ # asm 1: vmlal.u32 <r4=reg128#16,<v01=reg128#11%bot,<y34=reg128#3%top
790
+ # asm 2: vmlal.u32 <r4=q15,<v01=d20,<y34=d5
791
+ vmlal.u32 q15,d20,d5
792
+
793
+ # qhasm: r4[0,1] += v01[2] unsigned* y34[0]; r4[2,3] += v01[3] unsigned* y34[1]
794
+ # asm 1: vmlal.u32 <r4=reg128#16,<v01=reg128#11%top,<y34=reg128#3%bot
795
+ # asm 2: vmlal.u32 <r4=q15,<v01=d21,<y34=d4
796
+ vmlal.u32 q15,d21,d4
797
+
798
+ # qhasm: r4[0,1] += v23[0] unsigned* y12[2]; r4[2,3] += v23[1] unsigned* y12[3]
799
+ # asm 1: vmlal.u32 <r4=reg128#16,<v23=reg128#10%bot,<y12=reg128#2%top
800
+ # asm 2: vmlal.u32 <r4=q15,<v23=d18,<y12=d3
801
+ vmlal.u32 q15,d18,d3
802
+
803
+ # qhasm: r4[0,1] += v23[2] unsigned* y12[0]; r4[2,3] += v23[3] unsigned* y12[1]
804
+ # asm 1: vmlal.u32 <r4=reg128#16,<v23=reg128#10%top,<y12=reg128#2%bot
805
+ # asm 2: vmlal.u32 <r4=q15,<v23=d19,<y12=d2
806
+ vmlal.u32 q15,d19,d2
807
+
808
+ # qhasm: r4[0,1] += v4[0] unsigned* y0[0]; r4[2,3] += v4[1] unsigned* y0[1]
809
+ # asm 1: vmlal.u32 <r4=reg128#16,<v4=reg128#4%bot,<y0=reg128#1%bot
810
+ # asm 2: vmlal.u32 <r4=q15,<v4=d6,<y0=d0
811
+ vmlal.u32 q15,d6,d0
812
+
813
+ # qhasm: ptr = &5y34_stack
814
+ # asm 1: lea >ptr=int32#3,<5y34_stack=stack128#6
815
+ # asm 2: lea >ptr=r2,<5y34_stack=[sp,#80]
816
+ add r2,sp,#80
817
+
818
+ # qhasm: 5y34 aligned= mem128[ptr]
819
+ # asm 1: vld1.8 {>5y34=reg128#13%bot->5y34=reg128#13%top},[<ptr=int32#3,: 128]
820
+ # asm 2: vld1.8 {>5y34=d24->5y34=d25},[<ptr=r2,: 128]
821
+ vld1.8 {d24-d25},[r2,: 128]
822
+
823
+ # qhasm: r3[0,1] += v01[0] unsigned* y34[0]; r3[2,3] += v01[1] unsigned* y34[1]
824
+ # asm 1: vmlal.u32 <r3=reg128#5,<v01=reg128#11%bot,<y34=reg128#3%bot
825
+ # asm 2: vmlal.u32 <r3=q4,<v01=d20,<y34=d4
826
+ vmlal.u32 q4,d20,d4
827
+
828
+ # qhasm: r3[0,1] += v01[2] unsigned* y12[2]; r3[2,3] += v01[3] unsigned* y12[3]
829
+ # asm 1: vmlal.u32 <r3=reg128#5,<v01=reg128#11%top,<y12=reg128#2%top
830
+ # asm 2: vmlal.u32 <r3=q4,<v01=d21,<y12=d3
831
+ vmlal.u32 q4,d21,d3
832
+
833
+ # qhasm: r3[0,1] += v23[0] unsigned* y12[0]; r3[2,3] += v23[1] unsigned* y12[1]
834
+ # asm 1: vmlal.u32 <r3=reg128#5,<v23=reg128#10%bot,<y12=reg128#2%bot
835
+ # asm 2: vmlal.u32 <r3=q4,<v23=d18,<y12=d2
836
+ vmlal.u32 q4,d18,d2
837
+
838
+ # qhasm: r3[0,1] += v23[2] unsigned* y0[0]; r3[2,3] += v23[3] unsigned* y0[1]
839
+ # asm 1: vmlal.u32 <r3=reg128#5,<v23=reg128#10%top,<y0=reg128#1%bot
840
+ # asm 2: vmlal.u32 <r3=q4,<v23=d19,<y0=d0
841
+ vmlal.u32 q4,d19,d0
842
+
843
+ # qhasm: r3[0,1] += v4[0] unsigned* 5y34[2]; r3[2,3] += v4[1] unsigned* 5y34[3]
844
+ # asm 1: vmlal.u32 <r3=reg128#5,<v4=reg128#4%bot,<5y34=reg128#13%top
845
+ # asm 2: vmlal.u32 <r3=q4,<v4=d6,<5y34=d25
846
+ vmlal.u32 q4,d6,d25
847
+
848
+ # qhasm: ptr = &5y12_stack
849
+ # asm 1: lea >ptr=int32#3,<5y12_stack=stack128#5
850
+ # asm 2: lea >ptr=r2,<5y12_stack=[sp,#64]
851
+ add r2,sp,#64
852
+
853
+ # qhasm: 5y12 aligned= mem128[ptr]
854
+ # asm 1: vld1.8 {>5y12=reg128#12%bot->5y12=reg128#12%top},[<ptr=int32#3,: 128]
855
+ # asm 2: vld1.8 {>5y12=d22->5y12=d23},[<ptr=r2,: 128]
856
+ vld1.8 {d22-d23},[r2,: 128]
857
+
858
+ # qhasm: r0[0,1] += v4[0] unsigned* 5y12[0]; r0[2,3] += v4[1] unsigned* 5y12[1]
859
+ # asm 1: vmlal.u32 <r0=reg128#8,<v4=reg128#4%bot,<5y12=reg128#12%bot
860
+ # asm 2: vmlal.u32 <r0=q7,<v4=d6,<5y12=d22
861
+ vmlal.u32 q7,d6,d22
862
+
863
+ # qhasm: r0[0,1] += v23[0] unsigned* 5y34[0]; r0[2,3] += v23[1] unsigned* 5y34[1]
864
+ # asm 1: vmlal.u32 <r0=reg128#8,<v23=reg128#10%bot,<5y34=reg128#13%bot
865
+ # asm 2: vmlal.u32 <r0=q7,<v23=d18,<5y34=d24
866
+ vmlal.u32 q7,d18,d24
867
+
868
+ # qhasm: r0[0,1] += v23[2] unsigned* 5y12[2]; r0[2,3] += v23[3] unsigned* 5y12[3]
869
+ # asm 1: vmlal.u32 <r0=reg128#8,<v23=reg128#10%top,<5y12=reg128#12%top
870
+ # asm 2: vmlal.u32 <r0=q7,<v23=d19,<5y12=d23
871
+ vmlal.u32 q7,d19,d23
872
+
873
+ # qhasm: r0[0,1] += v01[0] unsigned* y0[0]; r0[2,3] += v01[1] unsigned* y0[1]
874
+ # asm 1: vmlal.u32 <r0=reg128#8,<v01=reg128#11%bot,<y0=reg128#1%bot
875
+ # asm 2: vmlal.u32 <r0=q7,<v01=d20,<y0=d0
876
+ vmlal.u32 q7,d20,d0
877
+
878
+ # qhasm: r0[0,1] += v01[2] unsigned* 5y34[2]; r0[2,3] += v01[3] unsigned* 5y34[3]
879
+ # asm 1: vmlal.u32 <r0=reg128#8,<v01=reg128#11%top,<5y34=reg128#13%top
880
+ # asm 2: vmlal.u32 <r0=q7,<v01=d21,<5y34=d25
881
+ vmlal.u32 q7,d21,d25
882
+
883
+ # qhasm: r1[0,1] += v01[0] unsigned* y12[0]; r1[2,3] += v01[1] unsigned* y12[1]
884
+ # asm 1: vmlal.u32 <r1=reg128#15,<v01=reg128#11%bot,<y12=reg128#2%bot
885
+ # asm 2: vmlal.u32 <r1=q14,<v01=d20,<y12=d2
886
+ vmlal.u32 q14,d20,d2
887
+
888
+ # qhasm: r1[0,1] += v01[2] unsigned* y0[0]; r1[2,3] += v01[3] unsigned* y0[1]
889
+ # asm 1: vmlal.u32 <r1=reg128#15,<v01=reg128#11%top,<y0=reg128#1%bot
890
+ # asm 2: vmlal.u32 <r1=q14,<v01=d21,<y0=d0
891
+ vmlal.u32 q14,d21,d0
892
+
893
+ # qhasm: r1[0,1] += v23[0] unsigned* 5y34[2]; r1[2,3] += v23[1] unsigned* 5y34[3]
894
+ # asm 1: vmlal.u32 <r1=reg128#15,<v23=reg128#10%bot,<5y34=reg128#13%top
895
+ # asm 2: vmlal.u32 <r1=q14,<v23=d18,<5y34=d25
896
+ vmlal.u32 q14,d18,d25
897
+
898
+ # qhasm: r1[0,1] += v23[2] unsigned* 5y34[0]; r1[2,3] += v23[3] unsigned* 5y34[1]
899
+ # asm 1: vmlal.u32 <r1=reg128#15,<v23=reg128#10%top,<5y34=reg128#13%bot
900
+ # asm 2: vmlal.u32 <r1=q14,<v23=d19,<5y34=d24
901
+ vmlal.u32 q14,d19,d24
902
+
903
+ # qhasm: r1[0,1] += v4[0] unsigned* 5y12[2]; r1[2,3] += v4[1] unsigned* 5y12[3]
904
+ # asm 1: vmlal.u32 <r1=reg128#15,<v4=reg128#4%bot,<5y12=reg128#12%top
905
+ # asm 2: vmlal.u32 <r1=q14,<v4=d6,<5y12=d23
906
+ vmlal.u32 q14,d6,d23
907
+
908
+ # qhasm: r2[0,1] += v01[0] unsigned* y12[2]; r2[2,3] += v01[1] unsigned* y12[3]
909
+ # asm 1: vmlal.u32 <r2=reg128#14,<v01=reg128#11%bot,<y12=reg128#2%top
910
+ # asm 2: vmlal.u32 <r2=q13,<v01=d20,<y12=d3
911
+ vmlal.u32 q13,d20,d3
912
+
913
+ # qhasm: r2[0,1] += v01[2] unsigned* y12[0]; r2[2,3] += v01[3] unsigned* y12[1]
914
+ # asm 1: vmlal.u32 <r2=reg128#14,<v01=reg128#11%top,<y12=reg128#2%bot
915
+ # asm 2: vmlal.u32 <r2=q13,<v01=d21,<y12=d2
916
+ vmlal.u32 q13,d21,d2
917
+
918
+ # qhasm: r2[0,1] += v23[0] unsigned* y0[0]; r2[2,3] += v23[1] unsigned* y0[1]
919
+ # asm 1: vmlal.u32 <r2=reg128#14,<v23=reg128#10%bot,<y0=reg128#1%bot
920
+ # asm 2: vmlal.u32 <r2=q13,<v23=d18,<y0=d0
921
+ vmlal.u32 q13,d18,d0
922
+
923
+ # qhasm: r2[0,1] += v23[2] unsigned* 5y34[2]; r2[2,3] += v23[3] unsigned* 5y34[3]
924
+ # asm 1: vmlal.u32 <r2=reg128#14,<v23=reg128#10%top,<5y34=reg128#13%top
925
+ # asm 2: vmlal.u32 <r2=q13,<v23=d19,<5y34=d25
926
+ vmlal.u32 q13,d19,d25
927
+
928
+ # qhasm: r2[0,1] += v4[0] unsigned* 5y34[0]; r2[2,3] += v4[1] unsigned* 5y34[1]
929
+ # asm 1: vmlal.u32 <r2=reg128#14,<v4=reg128#4%bot,<5y34=reg128#13%bot
930
+ # asm 2: vmlal.u32 <r2=q13,<v4=d6,<5y34=d24
931
+ vmlal.u32 q13,d6,d24
932
+
933
+ # qhasm: ptr = &two24
934
+ # asm 1: lea >ptr=int32#3,<two24=stack128#1
935
+ # asm 2: lea >ptr=r2,<two24=[sp,#0]
936
+ add r2,sp,#0
937
+
938
+ # qhasm: 2x t1 = r0 unsigned>> 26
939
+ # asm 1: vshr.u64 >t1=reg128#4,<r0=reg128#8,#26
940
+ # asm 2: vshr.u64 >t1=q3,<r0=q7,#26
941
+ vshr.u64 q3,q7,#26
942
+
943
+ # qhasm: len -= 64
944
+ # asm 1: sub >len=int32#4,<len=int32#4,#64
945
+ # asm 2: sub >len=r3,<len=r3,#64
946
+ sub r3,r3,#64
947
+
948
+ # qhasm: r0 &= mask
949
+ # asm 1: vand >r0=reg128#6,<r0=reg128#8,<mask=reg128#7
950
+ # asm 2: vand >r0=q5,<r0=q7,<mask=q6
951
+ vand q5,q7,q6
952
+
953
+ # qhasm: 2x r1 += t1
954
+ # asm 1: vadd.i64 >r1=reg128#4,<r1=reg128#15,<t1=reg128#4
955
+ # asm 2: vadd.i64 >r1=q3,<r1=q14,<t1=q3
956
+ vadd.i64 q3,q14,q3
957
+
958
+ # qhasm: 2x t4 = r3 unsigned>> 26
959
+ # asm 1: vshr.u64 >t4=reg128#8,<r3=reg128#5,#26
960
+ # asm 2: vshr.u64 >t4=q7,<r3=q4,#26
961
+ vshr.u64 q7,q4,#26
962
+
963
+ # qhasm: r3 &= mask
964
+ # asm 1: vand >r3=reg128#5,<r3=reg128#5,<mask=reg128#7
965
+ # asm 2: vand >r3=q4,<r3=q4,<mask=q6
966
+ vand q4,q4,q6
967
+
968
+ # qhasm: 2x x4 = r4 + t4
969
+ # asm 1: vadd.i64 >x4=reg128#8,<r4=reg128#16,<t4=reg128#8
970
+ # asm 2: vadd.i64 >x4=q7,<r4=q15,<t4=q7
971
+ vadd.i64 q7,q15,q7
972
+
973
+ # qhasm: r4 aligned= mem128[ptr]
974
+ # asm 1: vld1.8 {>r4=reg128#16%bot->r4=reg128#16%top},[<ptr=int32#3,: 128]
975
+ # asm 2: vld1.8 {>r4=d30->r4=d31},[<ptr=r2,: 128]
976
+ vld1.8 {d30-d31},[r2,: 128]
977
+
978
+ # qhasm: 2x t2 = r1 unsigned>> 26
979
+ # asm 1: vshr.u64 >t2=reg128#9,<r1=reg128#4,#26
980
+ # asm 2: vshr.u64 >t2=q8,<r1=q3,#26
981
+ vshr.u64 q8,q3,#26
982
+
983
+ # qhasm: r1 &= mask
984
+ # asm 1: vand >r1=reg128#4,<r1=reg128#4,<mask=reg128#7
985
+ # asm 2: vand >r1=q3,<r1=q3,<mask=q6
986
+ vand q3,q3,q6
987
+
988
+ # qhasm: 2x t0 = x4 unsigned>> 26
989
+ # asm 1: vshr.u64 >t0=reg128#10,<x4=reg128#8,#26
990
+ # asm 2: vshr.u64 >t0=q9,<x4=q7,#26
991
+ vshr.u64 q9,q7,#26
992
+
993
+ # qhasm: 2x r2 += t2
994
+ # asm 1: vadd.i64 >r2=reg128#9,<r2=reg128#14,<t2=reg128#9
995
+ # asm 2: vadd.i64 >r2=q8,<r2=q13,<t2=q8
996
+ vadd.i64 q8,q13,q8
997
+
998
+ # qhasm: x4 &= mask
999
+ # asm 1: vand >x4=reg128#11,<x4=reg128#8,<mask=reg128#7
1000
+ # asm 2: vand >x4=q10,<x4=q7,<mask=q6
1001
+ vand q10,q7,q6
1002
+
1003
+ # qhasm: 2x x01 = r0 + t0
1004
+ # asm 1: vadd.i64 >x01=reg128#6,<r0=reg128#6,<t0=reg128#10
1005
+ # asm 2: vadd.i64 >x01=q5,<r0=q5,<t0=q9
1006
+ vadd.i64 q5,q5,q9
1007
+
1008
+ # qhasm: r0 aligned= mem128[ptr]
1009
+ # asm 1: vld1.8 {>r0=reg128#8%bot->r0=reg128#8%top},[<ptr=int32#3,: 128]
1010
+ # asm 2: vld1.8 {>r0=d14->r0=d15},[<ptr=r2,: 128]
1011
+ vld1.8 {d14-d15},[r2,: 128]
1012
+
1013
+ # qhasm: ptr = &z34_stack
1014
+ # asm 1: lea >ptr=int32#3,<z34_stack=stack128#9
1015
+ # asm 2: lea >ptr=r2,<z34_stack=[sp,#128]
1016
+ add r2,sp,#128
1017
+
1018
+ # qhasm: 2x t0 <<= 2
1019
+ # asm 1: vshl.i64 >t0=reg128#10,<t0=reg128#10,#2
1020
+ # asm 2: vshl.i64 >t0=q9,<t0=q9,#2
1021
+ vshl.i64 q9,q9,#2
1022
+
1023
+ # qhasm: 2x t3 = r2 unsigned>> 26
1024
+ # asm 1: vshr.u64 >t3=reg128#14,<r2=reg128#9,#26
1025
+ # asm 2: vshr.u64 >t3=q13,<r2=q8,#26
1026
+ vshr.u64 q13,q8,#26
1027
+
1028
+ # qhasm: 2x x01 += t0
1029
+ # asm 1: vadd.i64 >x01=reg128#15,<x01=reg128#6,<t0=reg128#10
1030
+ # asm 2: vadd.i64 >x01=q14,<x01=q5,<t0=q9
1031
+ vadd.i64 q14,q5,q9
1032
+
1033
+ # qhasm: z34 aligned= mem128[ptr]
1034
+ # asm 1: vld1.8 {>z34=reg128#6%bot->z34=reg128#6%top},[<ptr=int32#3,: 128]
1035
+ # asm 2: vld1.8 {>z34=d10->z34=d11},[<ptr=r2,: 128]
1036
+ vld1.8 {d10-d11},[r2,: 128]
1037
+
1038
+ # qhasm: x23 = r2 & mask
1039
+ # asm 1: vand >x23=reg128#10,<r2=reg128#9,<mask=reg128#7
1040
+ # asm 2: vand >x23=q9,<r2=q8,<mask=q6
1041
+ vand q9,q8,q6
1042
+
1043
+ # qhasm: 2x r3 += t3
1044
+ # asm 1: vadd.i64 >r3=reg128#5,<r3=reg128#5,<t3=reg128#14
1045
+ # asm 2: vadd.i64 >r3=q4,<r3=q4,<t3=q13
1046
+ vadd.i64 q4,q4,q13
1047
+
1048
+ # qhasm: input_2 += 32
1049
+ # asm 1: add >input_2=int32#2,<input_2=int32#2,#32
1050
+ # asm 2: add >input_2=r1,<input_2=r1,#32
1051
+ add r1,r1,#32
1052
+
1053
+ # qhasm: 2x t1 = x01 unsigned>> 26
1054
+ # asm 1: vshr.u64 >t1=reg128#14,<x01=reg128#15,#26
1055
+ # asm 2: vshr.u64 >t1=q13,<x01=q14,#26
1056
+ vshr.u64 q13,q14,#26
1057
+
1058
+ # qhasm: x23 = x23[0,2,1,3]
1059
+ # asm 1: vtrn.32 <x23=reg128#10%bot,<x23=reg128#10%top
1060
+ # asm 2: vtrn.32 <x23=d18,<x23=d19
1061
+ vtrn.32 d18,d19
1062
+
1063
+ # qhasm: x01 = x01 & mask
1064
+ # asm 1: vand >x01=reg128#9,<x01=reg128#15,<mask=reg128#7
1065
+ # asm 2: vand >x01=q8,<x01=q14,<mask=q6
1066
+ vand q8,q14,q6
1067
+
1068
+ # qhasm: 2x r1 += t1
1069
+ # asm 1: vadd.i64 >r1=reg128#4,<r1=reg128#4,<t1=reg128#14
1070
+ # asm 2: vadd.i64 >r1=q3,<r1=q3,<t1=q13
1071
+ vadd.i64 q3,q3,q13
1072
+
1073
+ # qhasm: 2x t4 = r3 unsigned>> 26
1074
+ # asm 1: vshr.u64 >t4=reg128#14,<r3=reg128#5,#26
1075
+ # asm 2: vshr.u64 >t4=q13,<r3=q4,#26
1076
+ vshr.u64 q13,q4,#26
1077
+
1078
+ # qhasm: x01 = x01[0,2,1,3]
1079
+ # asm 1: vtrn.32 <x01=reg128#9%bot,<x01=reg128#9%top
1080
+ # asm 2: vtrn.32 <x01=d16,<x01=d17
1081
+ vtrn.32 d16,d17
1082
+
1083
+ # qhasm: r3 &= mask
1084
+ # asm 1: vand >r3=reg128#5,<r3=reg128#5,<mask=reg128#7
1085
+ # asm 2: vand >r3=q4,<r3=q4,<mask=q6
1086
+ vand q4,q4,q6
1087
+
1088
+ # qhasm: r1 = r1[0,2,1,3]
1089
+ # asm 1: vtrn.32 <r1=reg128#4%bot,<r1=reg128#4%top
1090
+ # asm 2: vtrn.32 <r1=d6,<r1=d7
1091
+ vtrn.32 d6,d7
1092
+
1093
+ # qhasm: 2x x4 += t4
1094
+ # asm 1: vadd.i64 >x4=reg128#11,<x4=reg128#11,<t4=reg128#14
1095
+ # asm 2: vadd.i64 >x4=q10,<x4=q10,<t4=q13
1096
+ vadd.i64 q10,q10,q13
1097
+
1098
+ # qhasm: r3 = r3[0,2,1,3]
1099
+ # asm 1: vtrn.32 <r3=reg128#5%bot,<r3=reg128#5%top
1100
+ # asm 2: vtrn.32 <r3=d8,<r3=d9
1101
+ vtrn.32 d8,d9
1102
+
1103
+ # qhasm: x01 = x01[0,1] r1[0,1]
1104
+ # asm 1: vext.32 <x01=reg128#9%top,<r1=reg128#4%bot,<r1=reg128#4%bot,#0
1105
+ # asm 2: vext.32 <x01=d17,<r1=d6,<r1=d6,#0
1106
+ vext.32 d17,d6,d6,#0
1107
+
1108
+ # qhasm: x23 = x23[0,1] r3[0,1]
1109
+ # asm 1: vext.32 <x23=reg128#10%top,<r3=reg128#5%bot,<r3=reg128#5%bot,#0
1110
+ # asm 2: vext.32 <x23=d19,<r3=d8,<r3=d8,#0
1111
+ vext.32 d19,d8,d8,#0
1112
+
1113
+ # qhasm: x4 = x4[0,2,1,3]
1114
+ # asm 1: vtrn.32 <x4=reg128#11%bot,<x4=reg128#11%top
1115
+ # asm 2: vtrn.32 <x4=d20,<x4=d21
1116
+ vtrn.32 d20,d21
1117
+
1118
+ # qhasm: unsigned>? len - 64
1119
+ # asm 1: cmp <len=int32#4,#64
1120
+ # asm 2: cmp <len=r3,#64
1121
+ cmp r3,#64
1122
+
1123
+ # qhasm: goto mainloop2 if unsigned>
1124
+ bhi ._mainloop2
1125
+
1126
+ # qhasm: input_2 -= 32
1127
+ # asm 1: sub >input_2=int32#3,<input_2=int32#2,#32
1128
+ # asm 2: sub >input_2=r2,<input_2=r1,#32
1129
+ sub r2,r1,#32
1130
+
1131
+ # qhasm: below64bytes:
1132
+ ._below64bytes:
1133
+
1134
+ # qhasm: unsigned>? len - 32
1135
+ # asm 1: cmp <len=int32#4,#32
1136
+ # asm 2: cmp <len=r3,#32
1137
+ cmp r3,#32
1138
+
1139
+ # qhasm: goto end if !unsigned>
1140
+ bls ._end
1141
+
1142
+ # qhasm: mainloop:
1143
+ ._mainloop:
1144
+
1145
+ # qhasm: new r0
1146
+
1147
+ # qhasm: ptr = &two24
1148
+ # asm 1: lea >ptr=int32#2,<two24=stack128#1
1149
+ # asm 2: lea >ptr=r1,<two24=[sp,#0]
1150
+ add r1,sp,#0
1151
+
1152
+ # qhasm: r4 aligned= mem128[ptr]
1153
+ # asm 1: vld1.8 {>r4=reg128#5%bot->r4=reg128#5%top},[<ptr=int32#2,: 128]
1154
+ # asm 2: vld1.8 {>r4=d8->r4=d9},[<ptr=r1,: 128]
1155
+ vld1.8 {d8-d9},[r1,: 128]
1156
+
1157
+ # qhasm: u4 aligned= mem128[ptr]
1158
+ # asm 1: vld1.8 {>u4=reg128#6%bot->u4=reg128#6%top},[<ptr=int32#2,: 128]
1159
+ # asm 2: vld1.8 {>u4=d10->u4=d11},[<ptr=r1,: 128]
1160
+ vld1.8 {d10-d11},[r1,: 128]
1161
+
1162
+ # qhasm: c01 = mem128[input_2];input_2+=16
1163
+ # asm 1: vld1.8 {>c01=reg128#8%bot->c01=reg128#8%top},[<input_2=int32#3]!
1164
+ # asm 2: vld1.8 {>c01=d14->c01=d15},[<input_2=r2]!
1165
+ vld1.8 {d14-d15},[r2]!
1166
+
1167
+ # qhasm: r4[0,1] += x01[0] unsigned* y34[2]; r4[2,3] += x01[1] unsigned* y34[3]
1168
+ # asm 1: vmlal.u32 <r4=reg128#5,<x01=reg128#9%bot,<y34=reg128#3%top
1169
+ # asm 2: vmlal.u32 <r4=q4,<x01=d16,<y34=d5
1170
+ vmlal.u32 q4,d16,d5
1171
+
1172
+ # qhasm: c23 = mem128[input_2];input_2+=16
1173
+ # asm 1: vld1.8 {>c23=reg128#14%bot->c23=reg128#14%top},[<input_2=int32#3]!
1174
+ # asm 2: vld1.8 {>c23=d26->c23=d27},[<input_2=r2]!
1175
+ vld1.8 {d26-d27},[r2]!
1176
+
1177
+ # qhasm: r4[0,1] += x01[2] unsigned* y34[0]; r4[2,3] += x01[3] unsigned* y34[1]
1178
+ # asm 1: vmlal.u32 <r4=reg128#5,<x01=reg128#9%top,<y34=reg128#3%bot
1179
+ # asm 2: vmlal.u32 <r4=q4,<x01=d17,<y34=d4
1180
+ vmlal.u32 q4,d17,d4
1181
+
1182
+ # qhasm: r0 = u4[1]c01[0]r0[2,3]
1183
+ # asm 1: vext.32 <r0=reg128#4%bot,<u4=reg128#6%bot,<c01=reg128#8%bot,#1
1184
+ # asm 2: vext.32 <r0=d6,<u4=d10,<c01=d14,#1
1185
+ vext.32 d6,d10,d14,#1
1186
+
1187
+ # qhasm: r4[0,1] += x23[0] unsigned* y12[2]; r4[2,3] += x23[1] unsigned* y12[3]
1188
+ # asm 1: vmlal.u32 <r4=reg128#5,<x23=reg128#10%bot,<y12=reg128#2%top
1189
+ # asm 2: vmlal.u32 <r4=q4,<x23=d18,<y12=d3
1190
+ vmlal.u32 q4,d18,d3
1191
+
1192
+ # qhasm: r0 = r0[0,1]u4[1]c23[0]
1193
+ # asm 1: vext.32 <r0=reg128#4%top,<u4=reg128#6%bot,<c23=reg128#14%bot,#1
1194
+ # asm 2: vext.32 <r0=d7,<u4=d10,<c23=d26,#1
1195
+ vext.32 d7,d10,d26,#1
1196
+
1197
+ # qhasm: r4[0,1] += x23[2] unsigned* y12[0]; r4[2,3] += x23[3] unsigned* y12[1]
1198
+ # asm 1: vmlal.u32 <r4=reg128#5,<x23=reg128#10%top,<y12=reg128#2%bot
1199
+ # asm 2: vmlal.u32 <r4=q4,<x23=d19,<y12=d2
1200
+ vmlal.u32 q4,d19,d2
1201
+
1202
+ # qhasm: r0 = r0[1]r0[0]r0[3]r0[2]
1203
+ # asm 1: vrev64.i32 >r0=reg128#4,<r0=reg128#4
1204
+ # asm 2: vrev64.i32 >r0=q3,<r0=q3
1205
+ vrev64.i32 q3,q3
1206
+
1207
+ # qhasm: r4[0,1] += x4[0] unsigned* y0[0]; r4[2,3] += x4[1] unsigned* y0[1]
1208
+ # asm 1: vmlal.u32 <r4=reg128#5,<x4=reg128#11%bot,<y0=reg128#1%bot
1209
+ # asm 2: vmlal.u32 <r4=q4,<x4=d20,<y0=d0
1210
+ vmlal.u32 q4,d20,d0
1211
+
1212
+ # qhasm: r0[0,1] += x4[0] unsigned* 5y12[0]; r0[2,3] += x4[1] unsigned* 5y12[1]
1213
+ # asm 1: vmlal.u32 <r0=reg128#4,<x4=reg128#11%bot,<5y12=reg128#12%bot
1214
+ # asm 2: vmlal.u32 <r0=q3,<x4=d20,<5y12=d22
1215
+ vmlal.u32 q3,d20,d22
1216
+
1217
+ # qhasm: r0[0,1] += x23[0] unsigned* 5y34[0]; r0[2,3] += x23[1] unsigned* 5y34[1]
1218
+ # asm 1: vmlal.u32 <r0=reg128#4,<x23=reg128#10%bot,<5y34=reg128#13%bot
1219
+ # asm 2: vmlal.u32 <r0=q3,<x23=d18,<5y34=d24
1220
+ vmlal.u32 q3,d18,d24
1221
+
1222
+ # qhasm: r0[0,1] += x23[2] unsigned* 5y12[2]; r0[2,3] += x23[3] unsigned* 5y12[3]
1223
+ # asm 1: vmlal.u32 <r0=reg128#4,<x23=reg128#10%top,<5y12=reg128#12%top
1224
+ # asm 2: vmlal.u32 <r0=q3,<x23=d19,<5y12=d23
1225
+ vmlal.u32 q3,d19,d23
1226
+
1227
+ # qhasm: c01 c23 = c01[0]c23[0]c01[2]c23[2]c01[1]c23[1]c01[3]c23[3]
1228
+ # asm 1: vtrn.32 <c01=reg128#8,<c23=reg128#14
1229
+ # asm 2: vtrn.32 <c01=q7,<c23=q13
1230
+ vtrn.32 q7,q13
1231
+
1232
+ # qhasm: r0[0,1] += x01[0] unsigned* y0[0]; r0[2,3] += x01[1] unsigned* y0[1]
1233
+ # asm 1: vmlal.u32 <r0=reg128#4,<x01=reg128#9%bot,<y0=reg128#1%bot
1234
+ # asm 2: vmlal.u32 <r0=q3,<x01=d16,<y0=d0
1235
+ vmlal.u32 q3,d16,d0
1236
+
1237
+ # qhasm: r3[0,1] = c23[2]<<18; r3[2,3] = c23[3]<<18
1238
+ # asm 1: vshll.u32 >r3=reg128#6,<c23=reg128#14%top,#18
1239
+ # asm 2: vshll.u32 >r3=q5,<c23=d27,#18
1240
+ vshll.u32 q5,d27,#18
1241
+
1242
+ # qhasm: r0[0,1] += x01[2] unsigned* 5y34[2]; r0[2,3] += x01[3] unsigned* 5y34[3]
1243
+ # asm 1: vmlal.u32 <r0=reg128#4,<x01=reg128#9%top,<5y34=reg128#13%top
1244
+ # asm 2: vmlal.u32 <r0=q3,<x01=d17,<5y34=d25
1245
+ vmlal.u32 q3,d17,d25
1246
+
1247
+ # qhasm: r3[0,1] += x01[0] unsigned* y34[0]; r3[2,3] += x01[1] unsigned* y34[1]
1248
+ # asm 1: vmlal.u32 <r3=reg128#6,<x01=reg128#9%bot,<y34=reg128#3%bot
1249
+ # asm 2: vmlal.u32 <r3=q5,<x01=d16,<y34=d4
1250
+ vmlal.u32 q5,d16,d4
1251
+
1252
+ # qhasm: r3[0,1] += x01[2] unsigned* y12[2]; r3[2,3] += x01[3] unsigned* y12[3]
1253
+ # asm 1: vmlal.u32 <r3=reg128#6,<x01=reg128#9%top,<y12=reg128#2%top
1254
+ # asm 2: vmlal.u32 <r3=q5,<x01=d17,<y12=d3
1255
+ vmlal.u32 q5,d17,d3
1256
+
1257
+ # qhasm: r3[0,1] += x23[0] unsigned* y12[0]; r3[2,3] += x23[1] unsigned* y12[1]
1258
+ # asm 1: vmlal.u32 <r3=reg128#6,<x23=reg128#10%bot,<y12=reg128#2%bot
1259
+ # asm 2: vmlal.u32 <r3=q5,<x23=d18,<y12=d2
1260
+ vmlal.u32 q5,d18,d2
1261
+
1262
+ # qhasm: r3[0,1] += x23[2] unsigned* y0[0]; r3[2,3] += x23[3] unsigned* y0[1]
1263
+ # asm 1: vmlal.u32 <r3=reg128#6,<x23=reg128#10%top,<y0=reg128#1%bot
1264
+ # asm 2: vmlal.u32 <r3=q5,<x23=d19,<y0=d0
1265
+ vmlal.u32 q5,d19,d0
1266
+
1267
+ # qhasm: r1[0,1] = c23[0]<<6; r1[2,3] = c23[1]<<6
1268
+ # asm 1: vshll.u32 >r1=reg128#14,<c23=reg128#14%bot,#6
1269
+ # asm 2: vshll.u32 >r1=q13,<c23=d26,#6
1270
+ vshll.u32 q13,d26,#6
1271
+
1272
+ # qhasm: r3[0,1] += x4[0] unsigned* 5y34[2]; r3[2,3] += x4[1] unsigned* 5y34[3]
1273
+ # asm 1: vmlal.u32 <r3=reg128#6,<x4=reg128#11%bot,<5y34=reg128#13%top
1274
+ # asm 2: vmlal.u32 <r3=q5,<x4=d20,<5y34=d25
1275
+ vmlal.u32 q5,d20,d25
1276
+
1277
+ # qhasm: r1[0,1] += x01[0] unsigned* y12[0]; r1[2,3] += x01[1] unsigned* y12[1]
1278
+ # asm 1: vmlal.u32 <r1=reg128#14,<x01=reg128#9%bot,<y12=reg128#2%bot
1279
+ # asm 2: vmlal.u32 <r1=q13,<x01=d16,<y12=d2
1280
+ vmlal.u32 q13,d16,d2
1281
+
1282
+ # qhasm: r1[0,1] += x01[2] unsigned* y0[0]; r1[2,3] += x01[3] unsigned* y0[1]
1283
+ # asm 1: vmlal.u32 <r1=reg128#14,<x01=reg128#9%top,<y0=reg128#1%bot
1284
+ # asm 2: vmlal.u32 <r1=q13,<x01=d17,<y0=d0
1285
+ vmlal.u32 q13,d17,d0
1286
+
1287
+ # qhasm: r1[0,1] += x23[0] unsigned* 5y34[2]; r1[2,3] += x23[1] unsigned* 5y34[3]
1288
+ # asm 1: vmlal.u32 <r1=reg128#14,<x23=reg128#10%bot,<5y34=reg128#13%top
1289
+ # asm 2: vmlal.u32 <r1=q13,<x23=d18,<5y34=d25
1290
+ vmlal.u32 q13,d18,d25
1291
+
1292
+ # qhasm: r1[0,1] += x23[2] unsigned* 5y34[0]; r1[2,3] += x23[3] unsigned* 5y34[1]
1293
+ # asm 1: vmlal.u32 <r1=reg128#14,<x23=reg128#10%top,<5y34=reg128#13%bot
1294
+ # asm 2: vmlal.u32 <r1=q13,<x23=d19,<5y34=d24
1295
+ vmlal.u32 q13,d19,d24
1296
+
1297
+ # qhasm: r2[0,1] = c01[2]<<12; r2[2,3] = c01[3]<<12
1298
+ # asm 1: vshll.u32 >r2=reg128#8,<c01=reg128#8%top,#12
1299
+ # asm 2: vshll.u32 >r2=q7,<c01=d15,#12
1300
+ vshll.u32 q7,d15,#12
1301
+
1302
+ # qhasm: r1[0,1] += x4[0] unsigned* 5y12[2]; r1[2,3] += x4[1] unsigned* 5y12[3]
1303
+ # asm 1: vmlal.u32 <r1=reg128#14,<x4=reg128#11%bot,<5y12=reg128#12%top
1304
+ # asm 2: vmlal.u32 <r1=q13,<x4=d20,<5y12=d23
1305
+ vmlal.u32 q13,d20,d23
1306
+
1307
+ # qhasm: r2[0,1] += x01[0] unsigned* y12[2]; r2[2,3] += x01[1] unsigned* y12[3]
1308
+ # asm 1: vmlal.u32 <r2=reg128#8,<x01=reg128#9%bot,<y12=reg128#2%top
1309
+ # asm 2: vmlal.u32 <r2=q7,<x01=d16,<y12=d3
1310
+ vmlal.u32 q7,d16,d3
1311
+
1312
+ # qhasm: r2[0,1] += x01[2] unsigned* y12[0]; r2[2,3] += x01[3] unsigned* y12[1]
1313
+ # asm 1: vmlal.u32 <r2=reg128#8,<x01=reg128#9%top,<y12=reg128#2%bot
1314
+ # asm 2: vmlal.u32 <r2=q7,<x01=d17,<y12=d2
1315
+ vmlal.u32 q7,d17,d2
1316
+
1317
+ # qhasm: r2[0,1] += x23[0] unsigned* y0[0]; r2[2,3] += x23[1] unsigned* y0[1]
1318
+ # asm 1: vmlal.u32 <r2=reg128#8,<x23=reg128#10%bot,<y0=reg128#1%bot
1319
+ # asm 2: vmlal.u32 <r2=q7,<x23=d18,<y0=d0
1320
+ vmlal.u32 q7,d18,d0
1321
+
1322
+ # qhasm: r2[0,1] += x23[2] unsigned* 5y34[2]; r2[2,3] += x23[3] unsigned* 5y34[3]
1323
+ # asm 1: vmlal.u32 <r2=reg128#8,<x23=reg128#10%top,<5y34=reg128#13%top
1324
+ # asm 2: vmlal.u32 <r2=q7,<x23=d19,<5y34=d25
1325
+ vmlal.u32 q7,d19,d25
1326
+
1327
+ # qhasm: r2[0,1] += x4[0] unsigned* 5y34[0]; r2[2,3] += x4[1] unsigned* 5y34[1]
1328
+ # asm 1: vmlal.u32 <r2=reg128#8,<x4=reg128#11%bot,<5y34=reg128#13%bot
1329
+ # asm 2: vmlal.u32 <r2=q7,<x4=d20,<5y34=d24
1330
+ vmlal.u32 q7,d20,d24
1331
+
1332
+ # qhasm: 2x t1 = r0 unsigned>> 26
1333
+ # asm 1: vshr.u64 >t1=reg128#9,<r0=reg128#4,#26
1334
+ # asm 2: vshr.u64 >t1=q8,<r0=q3,#26
1335
+ vshr.u64 q8,q3,#26
1336
+
1337
+ # qhasm: r0 &= mask
1338
+ # asm 1: vand >r0=reg128#4,<r0=reg128#4,<mask=reg128#7
1339
+ # asm 2: vand >r0=q3,<r0=q3,<mask=q6
1340
+ vand q3,q3,q6
1341
+
1342
+ # qhasm: 2x r1 += t1
1343
+ # asm 1: vadd.i64 >r1=reg128#9,<r1=reg128#14,<t1=reg128#9
1344
+ # asm 2: vadd.i64 >r1=q8,<r1=q13,<t1=q8
1345
+ vadd.i64 q8,q13,q8
1346
+
1347
+ # qhasm: 2x t4 = r3 unsigned>> 26
1348
+ # asm 1: vshr.u64 >t4=reg128#10,<r3=reg128#6,#26
1349
+ # asm 2: vshr.u64 >t4=q9,<r3=q5,#26
1350
+ vshr.u64 q9,q5,#26
1351
+
1352
+ # qhasm: r3 &= mask
1353
+ # asm 1: vand >r3=reg128#6,<r3=reg128#6,<mask=reg128#7
1354
+ # asm 2: vand >r3=q5,<r3=q5,<mask=q6
1355
+ vand q5,q5,q6
1356
+
1357
+ # qhasm: 2x r4 += t4
1358
+ # asm 1: vadd.i64 >r4=reg128#5,<r4=reg128#5,<t4=reg128#10
1359
+ # asm 2: vadd.i64 >r4=q4,<r4=q4,<t4=q9
1360
+ vadd.i64 q4,q4,q9
1361
+
1362
+ # qhasm: 2x t2 = r1 unsigned>> 26
1363
+ # asm 1: vshr.u64 >t2=reg128#10,<r1=reg128#9,#26
1364
+ # asm 2: vshr.u64 >t2=q9,<r1=q8,#26
1365
+ vshr.u64 q9,q8,#26
1366
+
1367
+ # qhasm: r1 &= mask
1368
+ # asm 1: vand >r1=reg128#11,<r1=reg128#9,<mask=reg128#7
1369
+ # asm 2: vand >r1=q10,<r1=q8,<mask=q6
1370
+ vand q10,q8,q6
1371
+
1372
+ # qhasm: 2x t0 = r4 unsigned>> 26
1373
+ # asm 1: vshr.u64 >t0=reg128#9,<r4=reg128#5,#26
1374
+ # asm 2: vshr.u64 >t0=q8,<r4=q4,#26
1375
+ vshr.u64 q8,q4,#26
1376
+
1377
+ # qhasm: 2x r2 += t2
1378
+ # asm 1: vadd.i64 >r2=reg128#8,<r2=reg128#8,<t2=reg128#10
1379
+ # asm 2: vadd.i64 >r2=q7,<r2=q7,<t2=q9
1380
+ vadd.i64 q7,q7,q9
1381
+
1382
+ # qhasm: r4 &= mask
1383
+ # asm 1: vand >r4=reg128#5,<r4=reg128#5,<mask=reg128#7
1384
+ # asm 2: vand >r4=q4,<r4=q4,<mask=q6
1385
+ vand q4,q4,q6
1386
+
1387
+ # qhasm: 2x r0 += t0
1388
+ # asm 1: vadd.i64 >r0=reg128#4,<r0=reg128#4,<t0=reg128#9
1389
+ # asm 2: vadd.i64 >r0=q3,<r0=q3,<t0=q8
1390
+ vadd.i64 q3,q3,q8
1391
+
1392
+ # qhasm: 2x t0 <<= 2
1393
+ # asm 1: vshl.i64 >t0=reg128#9,<t0=reg128#9,#2
1394
+ # asm 2: vshl.i64 >t0=q8,<t0=q8,#2
1395
+ vshl.i64 q8,q8,#2
1396
+
1397
+ # qhasm: 2x t3 = r2 unsigned>> 26
1398
+ # asm 1: vshr.u64 >t3=reg128#14,<r2=reg128#8,#26
1399
+ # asm 2: vshr.u64 >t3=q13,<r2=q7,#26
1400
+ vshr.u64 q13,q7,#26
1401
+
1402
+ # qhasm: 2x r0 += t0
1403
+ # asm 1: vadd.i64 >r0=reg128#4,<r0=reg128#4,<t0=reg128#9
1404
+ # asm 2: vadd.i64 >r0=q3,<r0=q3,<t0=q8
1405
+ vadd.i64 q3,q3,q8
1406
+
1407
+ # qhasm: x23 = r2 & mask
1408
+ # asm 1: vand >x23=reg128#10,<r2=reg128#8,<mask=reg128#7
1409
+ # asm 2: vand >x23=q9,<r2=q7,<mask=q6
1410
+ vand q9,q7,q6
1411
+
1412
+ # qhasm: 2x r3 += t3
1413
+ # asm 1: vadd.i64 >r3=reg128#6,<r3=reg128#6,<t3=reg128#14
1414
+ # asm 2: vadd.i64 >r3=q5,<r3=q5,<t3=q13
1415
+ vadd.i64 q5,q5,q13
1416
+
1417
+ # qhasm: 2x t1 = r0 unsigned>> 26
1418
+ # asm 1: vshr.u64 >t1=reg128#8,<r0=reg128#4,#26
1419
+ # asm 2: vshr.u64 >t1=q7,<r0=q3,#26
1420
+ vshr.u64 q7,q3,#26
1421
+
1422
+ # qhasm: x01 = r0 & mask
1423
+ # asm 1: vand >x01=reg128#9,<r0=reg128#4,<mask=reg128#7
1424
+ # asm 2: vand >x01=q8,<r0=q3,<mask=q6
1425
+ vand q8,q3,q6
1426
+
1427
+ # qhasm: 2x r1 += t1
1428
+ # asm 1: vadd.i64 >r1=reg128#4,<r1=reg128#11,<t1=reg128#8
1429
+ # asm 2: vadd.i64 >r1=q3,<r1=q10,<t1=q7
1430
+ vadd.i64 q3,q10,q7
1431
+
1432
+ # qhasm: 2x t4 = r3 unsigned>> 26
1433
+ # asm 1: vshr.u64 >t4=reg128#8,<r3=reg128#6,#26
1434
+ # asm 2: vshr.u64 >t4=q7,<r3=q5,#26
1435
+ vshr.u64 q7,q5,#26
1436
+
1437
+ # qhasm: r3 &= mask
1438
+ # asm 1: vand >r3=reg128#6,<r3=reg128#6,<mask=reg128#7
1439
+ # asm 2: vand >r3=q5,<r3=q5,<mask=q6
1440
+ vand q5,q5,q6
1441
+
1442
+ # qhasm: 2x x4 = r4 + t4
1443
+ # asm 1: vadd.i64 >x4=reg128#11,<r4=reg128#5,<t4=reg128#8
1444
+ # asm 2: vadd.i64 >x4=q10,<r4=q4,<t4=q7
1445
+ vadd.i64 q10,q4,q7
1446
+
1447
+ # qhasm: len -= 32
1448
+ # asm 1: sub >len=int32#4,<len=int32#4,#32
1449
+ # asm 2: sub >len=r3,<len=r3,#32
1450
+ sub r3,r3,#32
1451
+
1452
+ # qhasm: x01 = x01[0,2,1,3]
1453
+ # asm 1: vtrn.32 <x01=reg128#9%bot,<x01=reg128#9%top
1454
+ # asm 2: vtrn.32 <x01=d16,<x01=d17
1455
+ vtrn.32 d16,d17
1456
+
1457
+ # qhasm: x23 = x23[0,2,1,3]
1458
+ # asm 1: vtrn.32 <x23=reg128#10%bot,<x23=reg128#10%top
1459
+ # asm 2: vtrn.32 <x23=d18,<x23=d19
1460
+ vtrn.32 d18,d19
1461
+
1462
+ # qhasm: r1 = r1[0,2,1,3]
1463
+ # asm 1: vtrn.32 <r1=reg128#4%bot,<r1=reg128#4%top
1464
+ # asm 2: vtrn.32 <r1=d6,<r1=d7
1465
+ vtrn.32 d6,d7
1466
+
1467
+ # qhasm: r3 = r3[0,2,1,3]
1468
+ # asm 1: vtrn.32 <r3=reg128#6%bot,<r3=reg128#6%top
1469
+ # asm 2: vtrn.32 <r3=d10,<r3=d11
1470
+ vtrn.32 d10,d11
1471
+
1472
+ # qhasm: x4 = x4[0,2,1,3]
1473
+ # asm 1: vtrn.32 <x4=reg128#11%bot,<x4=reg128#11%top
1474
+ # asm 2: vtrn.32 <x4=d20,<x4=d21
1475
+ vtrn.32 d20,d21
1476
+
1477
+ # qhasm: x01 = x01[0,1] r1[0,1]
1478
+ # asm 1: vext.32 <x01=reg128#9%top,<r1=reg128#4%bot,<r1=reg128#4%bot,#0
1479
+ # asm 2: vext.32 <x01=d17,<r1=d6,<r1=d6,#0
1480
+ vext.32 d17,d6,d6,#0
1481
+
1482
+ # qhasm: x23 = x23[0,1] r3[0,1]
1483
+ # asm 1: vext.32 <x23=reg128#10%top,<r3=reg128#6%bot,<r3=reg128#6%bot,#0
1484
+ # asm 2: vext.32 <x23=d19,<r3=d10,<r3=d10,#0
1485
+ vext.32 d19,d10,d10,#0
1486
+
1487
+ # qhasm: unsigned>? len - 32
1488
+ # asm 1: cmp <len=int32#4,#32
1489
+ # asm 2: cmp <len=r3,#32
1490
+ cmp r3,#32
1491
+
1492
+ # qhasm: goto mainloop if unsigned>
1493
+ bhi ._mainloop
1494
+
1495
+ # qhasm: end:
1496
+ ._end:
1497
+
1498
+ # qhasm: mem128[input_0] = x01;input_0+=16
1499
+ # asm 1: vst1.8 {<x01=reg128#9%bot-<x01=reg128#9%top},[<input_0=int32#1]!
1500
+ # asm 2: vst1.8 {<x01=d16-<x01=d17},[<input_0=r0]!
1501
+ vst1.8 {d16-d17},[r0]!
1502
+
1503
+ # qhasm: mem128[input_0] = x23;input_0+=16
1504
+ # asm 1: vst1.8 {<x23=reg128#10%bot-<x23=reg128#10%top},[<input_0=int32#1]!
1505
+ # asm 2: vst1.8 {<x23=d18-<x23=d19},[<input_0=r0]!
1506
+ vst1.8 {d18-d19},[r0]!
1507
+
1508
+ # qhasm: mem64[input_0] = x4[0]
1509
+ # asm 1: vst1.8 <x4=reg128#11%bot,[<input_0=int32#1]
1510
+ # asm 2: vst1.8 <x4=d20,[<input_0=r0]
1511
+ vst1.8 d20,[r0]
1512
+
1513
+ # qhasm: len = len
1514
+ # asm 1: mov >len=int32#1,<len=int32#4
1515
+ # asm 2: mov >len=r0,<len=r3
1516
+ mov r0,r3
1517
+
1518
+ # qhasm: qpopreturn len
1519
+ mov sp,r12
1520
+ vpop {q4,q5,q6,q7}
1521
+ bx lr
1522
+
1523
+ # qhasm: int32 input_0
1524
+
1525
+ # qhasm: int32 input_1
1526
+
1527
+ # qhasm: int32 input_2
1528
+
1529
+ # qhasm: int32 input_3
1530
+
1531
+ # qhasm: stack32 input_4
1532
+
1533
+ # qhasm: stack32 input_5
1534
+
1535
+ # qhasm: stack32 input_6
1536
+
1537
+ # qhasm: stack32 input_7
1538
+
1539
+ # qhasm: int32 caller_r4
1540
+
1541
+ # qhasm: int32 caller_r5
1542
+
1543
+ # qhasm: int32 caller_r6
1544
+
1545
+ # qhasm: int32 caller_r7
1546
+
1547
+ # qhasm: int32 caller_r8
1548
+
1549
+ # qhasm: int32 caller_r9
1550
+
1551
+ # qhasm: int32 caller_r10
1552
+
1553
+ # qhasm: int32 caller_r11
1554
+
1555
+ # qhasm: int32 caller_r12
1556
+
1557
+ # qhasm: int32 caller_r14
1558
+
1559
+ # qhasm: reg128 caller_q4
1560
+
1561
+ # qhasm: reg128 caller_q5
1562
+
1563
+ # qhasm: reg128 caller_q6
1564
+
1565
+ # qhasm: reg128 caller_q7
1566
+
1567
+ # qhasm: reg128 r0
1568
+
1569
+ # qhasm: reg128 r1
1570
+
1571
+ # qhasm: reg128 r2
1572
+
1573
+ # qhasm: reg128 r3
1574
+
1575
+ # qhasm: reg128 r4
1576
+
1577
+ # qhasm: reg128 x01
1578
+
1579
+ # qhasm: reg128 x23
1580
+
1581
+ # qhasm: reg128 x4
1582
+
1583
+ # qhasm: reg128 y01
1584
+
1585
+ # qhasm: reg128 y23
1586
+
1587
+ # qhasm: reg128 y4
1588
+
1589
+ # qhasm: reg128 _5y01
1590
+
1591
+ # qhasm: reg128 _5y23
1592
+
1593
+ # qhasm: reg128 _5y4
1594
+
1595
+ # qhasm: reg128 c01
1596
+
1597
+ # qhasm: reg128 c23
1598
+
1599
+ # qhasm: reg128 c4
1600
+
1601
+ # qhasm: reg128 t0
1602
+
1603
+ # qhasm: reg128 t1
1604
+
1605
+ # qhasm: reg128 t2
1606
+
1607
+ # qhasm: reg128 t3
1608
+
1609
+ # qhasm: reg128 t4
1610
+
1611
+ # qhasm: reg128 mask
1612
+
1613
+ # qhasm: enter crypto_onetimeauth_poly1305_neon2_addmulmod
1614
+ .align 2
1615
+ .global openssl_poly1305_neon2_addmulmod
1616
+ .hidden openssl_poly1305_neon2_addmulmod
1617
+ .type openssl_poly1305_neon2_addmulmod STT_FUNC
1618
+ openssl_poly1305_neon2_addmulmod:
1619
+ sub sp,sp,#0
1620
+
1621
+ # qhasm: 2x mask = 0xffffffff
1622
+ # asm 1: vmov.i64 >mask=reg128#1,#0xffffffff
1623
+ # asm 2: vmov.i64 >mask=q0,#0xffffffff
1624
+ vmov.i64 q0,#0xffffffff
1625
+
1626
+ # qhasm: y01 aligned= mem128[input_2];input_2+=16
1627
+ # asm 1: vld1.8 {>y01=reg128#2%bot->y01=reg128#2%top},[<input_2=int32#3,: 128]!
1628
+ # asm 2: vld1.8 {>y01=d2->y01=d3},[<input_2=r2,: 128]!
1629
+ vld1.8 {d2-d3},[r2,: 128]!
1630
+
1631
+ # qhasm: 4x _5y01 = y01 << 2
1632
+ # asm 1: vshl.i32 >_5y01=reg128#3,<y01=reg128#2,#2
1633
+ # asm 2: vshl.i32 >_5y01=q2,<y01=q1,#2
1634
+ vshl.i32 q2,q1,#2
1635
+
1636
+ # qhasm: y23 aligned= mem128[input_2];input_2+=16
1637
+ # asm 1: vld1.8 {>y23=reg128#4%bot->y23=reg128#4%top},[<input_2=int32#3,: 128]!
1638
+ # asm 2: vld1.8 {>y23=d6->y23=d7},[<input_2=r2,: 128]!
1639
+ vld1.8 {d6-d7},[r2,: 128]!
1640
+
1641
+ # qhasm: 4x _5y23 = y23 << 2
1642
+ # asm 1: vshl.i32 >_5y23=reg128#9,<y23=reg128#4,#2
1643
+ # asm 2: vshl.i32 >_5y23=q8,<y23=q3,#2
1644
+ vshl.i32 q8,q3,#2
1645
+
1646
+ # qhasm: y4 aligned= mem64[input_2]y4[1]
1647
+ # asm 1: vld1.8 {<y4=reg128#10%bot},[<input_2=int32#3,: 64]
1648
+ # asm 2: vld1.8 {<y4=d18},[<input_2=r2,: 64]
1649
+ vld1.8 {d18},[r2,: 64]
1650
+
1651
+ # qhasm: 4x _5y4 = y4 << 2
1652
+ # asm 1: vshl.i32 >_5y4=reg128#11,<y4=reg128#10,#2
1653
+ # asm 2: vshl.i32 >_5y4=q10,<y4=q9,#2
1654
+ vshl.i32 q10,q9,#2
1655
+
1656
+ # qhasm: x01 aligned= mem128[input_1];input_1+=16
1657
+ # asm 1: vld1.8 {>x01=reg128#12%bot->x01=reg128#12%top},[<input_1=int32#2,: 128]!
1658
+ # asm 2: vld1.8 {>x01=d22->x01=d23},[<input_1=r1,: 128]!
1659
+ vld1.8 {d22-d23},[r1,: 128]!
1660
+
1661
+ # qhasm: 4x _5y01 += y01
1662
+ # asm 1: vadd.i32 >_5y01=reg128#3,<_5y01=reg128#3,<y01=reg128#2
1663
+ # asm 2: vadd.i32 >_5y01=q2,<_5y01=q2,<y01=q1
1664
+ vadd.i32 q2,q2,q1
1665
+
1666
+ # qhasm: x23 aligned= mem128[input_1];input_1+=16
1667
+ # asm 1: vld1.8 {>x23=reg128#13%bot->x23=reg128#13%top},[<input_1=int32#2,: 128]!
1668
+ # asm 2: vld1.8 {>x23=d24->x23=d25},[<input_1=r1,: 128]!
1669
+ vld1.8 {d24-d25},[r1,: 128]!
1670
+
1671
+ # qhasm: 4x _5y23 += y23
1672
+ # asm 1: vadd.i32 >_5y23=reg128#9,<_5y23=reg128#9,<y23=reg128#4
1673
+ # asm 2: vadd.i32 >_5y23=q8,<_5y23=q8,<y23=q3
1674
+ vadd.i32 q8,q8,q3
1675
+
1676
+ # qhasm: 4x _5y4 += y4
1677
+ # asm 1: vadd.i32 >_5y4=reg128#11,<_5y4=reg128#11,<y4=reg128#10
1678
+ # asm 2: vadd.i32 >_5y4=q10,<_5y4=q10,<y4=q9
1679
+ vadd.i32 q10,q10,q9
1680
+
1681
+ # qhasm: c01 aligned= mem128[input_3];input_3+=16
1682
+ # asm 1: vld1.8 {>c01=reg128#14%bot->c01=reg128#14%top},[<input_3=int32#4,: 128]!
1683
+ # asm 2: vld1.8 {>c01=d26->c01=d27},[<input_3=r3,: 128]!
1684
+ vld1.8 {d26-d27},[r3,: 128]!
1685
+
1686
+ # qhasm: 4x x01 += c01
1687
+ # asm 1: vadd.i32 >x01=reg128#12,<x01=reg128#12,<c01=reg128#14
1688
+ # asm 2: vadd.i32 >x01=q11,<x01=q11,<c01=q13
1689
+ vadd.i32 q11,q11,q13
1690
+
1691
+ # qhasm: c23 aligned= mem128[input_3];input_3+=16
1692
+ # asm 1: vld1.8 {>c23=reg128#14%bot->c23=reg128#14%top},[<input_3=int32#4,: 128]!
1693
+ # asm 2: vld1.8 {>c23=d26->c23=d27},[<input_3=r3,: 128]!
1694
+ vld1.8 {d26-d27},[r3,: 128]!
1695
+
1696
+ # qhasm: 4x x23 += c23
1697
+ # asm 1: vadd.i32 >x23=reg128#13,<x23=reg128#13,<c23=reg128#14
1698
+ # asm 2: vadd.i32 >x23=q12,<x23=q12,<c23=q13
1699
+ vadd.i32 q12,q12,q13
1700
+
1701
+ # qhasm: x4 aligned= mem64[input_1]x4[1]
1702
+ # asm 1: vld1.8 {<x4=reg128#14%bot},[<input_1=int32#2,: 64]
1703
+ # asm 2: vld1.8 {<x4=d26},[<input_1=r1,: 64]
1704
+ vld1.8 {d26},[r1,: 64]
1705
+
1706
+ # qhasm: 2x mask unsigned>>=6
1707
+ # asm 1: vshr.u64 >mask=reg128#1,<mask=reg128#1,#6
1708
+ # asm 2: vshr.u64 >mask=q0,<mask=q0,#6
1709
+ vshr.u64 q0,q0,#6
1710
+
1711
+ # qhasm: c4 aligned= mem64[input_3]c4[1]
1712
+ # asm 1: vld1.8 {<c4=reg128#15%bot},[<input_3=int32#4,: 64]
1713
+ # asm 2: vld1.8 {<c4=d28},[<input_3=r3,: 64]
1714
+ vld1.8 {d28},[r3,: 64]
1715
+
1716
+ # qhasm: 4x x4 += c4
1717
+ # asm 1: vadd.i32 >x4=reg128#14,<x4=reg128#14,<c4=reg128#15
1718
+ # asm 2: vadd.i32 >x4=q13,<x4=q13,<c4=q14
1719
+ vadd.i32 q13,q13,q14
1720
+
1721
+ # qhasm: r0[0,1] = x01[0] unsigned* y01[0]; r0[2,3] = x01[1] unsigned* y01[1]
1722
+ # asm 1: vmull.u32 >r0=reg128#15,<x01=reg128#12%bot,<y01=reg128#2%bot
1723
+ # asm 2: vmull.u32 >r0=q14,<x01=d22,<y01=d2
1724
+ vmull.u32 q14,d22,d2
1725
+
1726
+ # qhasm: r0[0,1] += x01[2] unsigned* _5y4[0]; r0[2,3] += x01[3] unsigned* _5y4[1]
1727
+ # asm 1: vmlal.u32 <r0=reg128#15,<x01=reg128#12%top,<_5y4=reg128#11%bot
1728
+ # asm 2: vmlal.u32 <r0=q14,<x01=d23,<_5y4=d20
1729
+ vmlal.u32 q14,d23,d20
1730
+
1731
+ # qhasm: r0[0,1] += x23[0] unsigned* _5y23[2]; r0[2,3] += x23[1] unsigned* _5y23[3]
1732
+ # asm 1: vmlal.u32 <r0=reg128#15,<x23=reg128#13%bot,<_5y23=reg128#9%top
1733
+ # asm 2: vmlal.u32 <r0=q14,<x23=d24,<_5y23=d17
1734
+ vmlal.u32 q14,d24,d17
1735
+
1736
+ # qhasm: r0[0,1] += x23[2] unsigned* _5y23[0]; r0[2,3] += x23[3] unsigned* _5y23[1]
1737
+ # asm 1: vmlal.u32 <r0=reg128#15,<x23=reg128#13%top,<_5y23=reg128#9%bot
1738
+ # asm 2: vmlal.u32 <r0=q14,<x23=d25,<_5y23=d16
1739
+ vmlal.u32 q14,d25,d16
1740
+
1741
+ # qhasm: r0[0,1] += x4[0] unsigned* _5y01[2]; r0[2,3] += x4[1] unsigned* _5y01[3]
1742
+ # asm 1: vmlal.u32 <r0=reg128#15,<x4=reg128#14%bot,<_5y01=reg128#3%top
1743
+ # asm 2: vmlal.u32 <r0=q14,<x4=d26,<_5y01=d5
1744
+ vmlal.u32 q14,d26,d5
1745
+
1746
+ # qhasm: r1[0,1] = x01[0] unsigned* y01[2]; r1[2,3] = x01[1] unsigned* y01[3]
1747
+ # asm 1: vmull.u32 >r1=reg128#3,<x01=reg128#12%bot,<y01=reg128#2%top
1748
+ # asm 2: vmull.u32 >r1=q2,<x01=d22,<y01=d3
1749
+ vmull.u32 q2,d22,d3
1750
+
1751
+ # qhasm: r1[0,1] += x01[2] unsigned* y01[0]; r1[2,3] += x01[3] unsigned* y01[1]
1752
+ # asm 1: vmlal.u32 <r1=reg128#3,<x01=reg128#12%top,<y01=reg128#2%bot
1753
+ # asm 2: vmlal.u32 <r1=q2,<x01=d23,<y01=d2
1754
+ vmlal.u32 q2,d23,d2
1755
+
1756
+ # qhasm: r1[0,1] += x23[0] unsigned* _5y4[0]; r1[2,3] += x23[1] unsigned* _5y4[1]
1757
+ # asm 1: vmlal.u32 <r1=reg128#3,<x23=reg128#13%bot,<_5y4=reg128#11%bot
1758
+ # asm 2: vmlal.u32 <r1=q2,<x23=d24,<_5y4=d20
1759
+ vmlal.u32 q2,d24,d20
1760
+
1761
+ # qhasm: r1[0,1] += x23[2] unsigned* _5y23[2]; r1[2,3] += x23[3] unsigned* _5y23[3]
1762
+ # asm 1: vmlal.u32 <r1=reg128#3,<x23=reg128#13%top,<_5y23=reg128#9%top
1763
+ # asm 2: vmlal.u32 <r1=q2,<x23=d25,<_5y23=d17
1764
+ vmlal.u32 q2,d25,d17
1765
+
1766
+ # qhasm: r1[0,1] += x4[0] unsigned* _5y23[0]; r1[2,3] += x4[1] unsigned* _5y23[1]
1767
+ # asm 1: vmlal.u32 <r1=reg128#3,<x4=reg128#14%bot,<_5y23=reg128#9%bot
1768
+ # asm 2: vmlal.u32 <r1=q2,<x4=d26,<_5y23=d16
1769
+ vmlal.u32 q2,d26,d16
1770
+
1771
+ # qhasm: r2[0,1] = x01[0] unsigned* y23[0]; r2[2,3] = x01[1] unsigned* y23[1]
1772
+ # asm 1: vmull.u32 >r2=reg128#16,<x01=reg128#12%bot,<y23=reg128#4%bot
1773
+ # asm 2: vmull.u32 >r2=q15,<x01=d22,<y23=d6
1774
+ vmull.u32 q15,d22,d6
1775
+
1776
+ # qhasm: r2[0,1] += x01[2] unsigned* y01[2]; r2[2,3] += x01[3] unsigned* y01[3]
1777
+ # asm 1: vmlal.u32 <r2=reg128#16,<x01=reg128#12%top,<y01=reg128#2%top
1778
+ # asm 2: vmlal.u32 <r2=q15,<x01=d23,<y01=d3
1779
+ vmlal.u32 q15,d23,d3
1780
+
1781
+ # qhasm: r2[0,1] += x23[0] unsigned* y01[0]; r2[2,3] += x23[1] unsigned* y01[1]
1782
+ # asm 1: vmlal.u32 <r2=reg128#16,<x23=reg128#13%bot,<y01=reg128#2%bot
1783
+ # asm 2: vmlal.u32 <r2=q15,<x23=d24,<y01=d2
1784
+ vmlal.u32 q15,d24,d2
1785
+
1786
+ # qhasm: r2[0,1] += x23[2] unsigned* _5y4[0]; r2[2,3] += x23[3] unsigned* _5y4[1]
1787
+ # asm 1: vmlal.u32 <r2=reg128#16,<x23=reg128#13%top,<_5y4=reg128#11%bot
1788
+ # asm 2: vmlal.u32 <r2=q15,<x23=d25,<_5y4=d20
1789
+ vmlal.u32 q15,d25,d20
1790
+
1791
+ # qhasm: r2[0,1] += x4[0] unsigned* _5y23[2]; r2[2,3] += x4[1] unsigned* _5y23[3]
1792
+ # asm 1: vmlal.u32 <r2=reg128#16,<x4=reg128#14%bot,<_5y23=reg128#9%top
1793
+ # asm 2: vmlal.u32 <r2=q15,<x4=d26,<_5y23=d17
1794
+ vmlal.u32 q15,d26,d17
1795
+
1796
+ # qhasm: r3[0,1] = x01[0] unsigned* y23[2]; r3[2,3] = x01[1] unsigned* y23[3]
1797
+ # asm 1: vmull.u32 >r3=reg128#9,<x01=reg128#12%bot,<y23=reg128#4%top
1798
+ # asm 2: vmull.u32 >r3=q8,<x01=d22,<y23=d7
1799
+ vmull.u32 q8,d22,d7
1800
+
1801
+ # qhasm: r3[0,1] += x01[2] unsigned* y23[0]; r3[2,3] += x01[3] unsigned* y23[1]
1802
+ # asm 1: vmlal.u32 <r3=reg128#9,<x01=reg128#12%top,<y23=reg128#4%bot
1803
+ # asm 2: vmlal.u32 <r3=q8,<x01=d23,<y23=d6
1804
+ vmlal.u32 q8,d23,d6
1805
+
1806
+ # qhasm: r3[0,1] += x23[0] unsigned* y01[2]; r3[2,3] += x23[1] unsigned* y01[3]
1807
+ # asm 1: vmlal.u32 <r3=reg128#9,<x23=reg128#13%bot,<y01=reg128#2%top
1808
+ # asm 2: vmlal.u32 <r3=q8,<x23=d24,<y01=d3
1809
+ vmlal.u32 q8,d24,d3
1810
+
1811
+ # qhasm: r3[0,1] += x23[2] unsigned* y01[0]; r3[2,3] += x23[3] unsigned* y01[1]
1812
+ # asm 1: vmlal.u32 <r3=reg128#9,<x23=reg128#13%top,<y01=reg128#2%bot
1813
+ # asm 2: vmlal.u32 <r3=q8,<x23=d25,<y01=d2
1814
+ vmlal.u32 q8,d25,d2
1815
+
1816
+ # qhasm: r3[0,1] += x4[0] unsigned* _5y4[0]; r3[2,3] += x4[1] unsigned* _5y4[1]
1817
+ # asm 1: vmlal.u32 <r3=reg128#9,<x4=reg128#14%bot,<_5y4=reg128#11%bot
1818
+ # asm 2: vmlal.u32 <r3=q8,<x4=d26,<_5y4=d20
1819
+ vmlal.u32 q8,d26,d20
1820
+
1821
+ # qhasm: r4[0,1] = x01[0] unsigned* y4[0]; r4[2,3] = x01[1] unsigned* y4[1]
1822
+ # asm 1: vmull.u32 >r4=reg128#10,<x01=reg128#12%bot,<y4=reg128#10%bot
1823
+ # asm 2: vmull.u32 >r4=q9,<x01=d22,<y4=d18
1824
+ vmull.u32 q9,d22,d18
1825
+
1826
+ # qhasm: r4[0,1] += x01[2] unsigned* y23[2]; r4[2,3] += x01[3] unsigned* y23[3]
1827
+ # asm 1: vmlal.u32 <r4=reg128#10,<x01=reg128#12%top,<y23=reg128#4%top
1828
+ # asm 2: vmlal.u32 <r4=q9,<x01=d23,<y23=d7
1829
+ vmlal.u32 q9,d23,d7
1830
+
1831
+ # qhasm: r4[0,1] += x23[0] unsigned* y23[0]; r4[2,3] += x23[1] unsigned* y23[1]
1832
+ # asm 1: vmlal.u32 <r4=reg128#10,<x23=reg128#13%bot,<y23=reg128#4%bot
1833
+ # asm 2: vmlal.u32 <r4=q9,<x23=d24,<y23=d6
1834
+ vmlal.u32 q9,d24,d6
1835
+
1836
+ # qhasm: r4[0,1] += x23[2] unsigned* y01[2]; r4[2,3] += x23[3] unsigned* y01[3]
1837
+ # asm 1: vmlal.u32 <r4=reg128#10,<x23=reg128#13%top,<y01=reg128#2%top
1838
+ # asm 2: vmlal.u32 <r4=q9,<x23=d25,<y01=d3
1839
+ vmlal.u32 q9,d25,d3
1840
+
1841
+ # qhasm: r4[0,1] += x4[0] unsigned* y01[0]; r4[2,3] += x4[1] unsigned* y01[1]
1842
+ # asm 1: vmlal.u32 <r4=reg128#10,<x4=reg128#14%bot,<y01=reg128#2%bot
1843
+ # asm 2: vmlal.u32 <r4=q9,<x4=d26,<y01=d2
1844
+ vmlal.u32 q9,d26,d2
1845
+
1846
+ # qhasm: 2x t1 = r0 unsigned>> 26
1847
+ # asm 1: vshr.u64 >t1=reg128#2,<r0=reg128#15,#26
1848
+ # asm 2: vshr.u64 >t1=q1,<r0=q14,#26
1849
+ vshr.u64 q1,q14,#26
1850
+
1851
+ # qhasm: r0 &= mask
1852
+ # asm 1: vand >r0=reg128#4,<r0=reg128#15,<mask=reg128#1
1853
+ # asm 2: vand >r0=q3,<r0=q14,<mask=q0
1854
+ vand q3,q14,q0
1855
+
1856
+ # qhasm: 2x r1 += t1
1857
+ # asm 1: vadd.i64 >r1=reg128#2,<r1=reg128#3,<t1=reg128#2
1858
+ # asm 2: vadd.i64 >r1=q1,<r1=q2,<t1=q1
1859
+ vadd.i64 q1,q2,q1
1860
+
1861
+ # qhasm: 2x t4 = r3 unsigned>> 26
1862
+ # asm 1: vshr.u64 >t4=reg128#3,<r3=reg128#9,#26
1863
+ # asm 2: vshr.u64 >t4=q2,<r3=q8,#26
1864
+ vshr.u64 q2,q8,#26
1865
+
1866
+ # qhasm: r3 &= mask
1867
+ # asm 1: vand >r3=reg128#9,<r3=reg128#9,<mask=reg128#1
1868
+ # asm 2: vand >r3=q8,<r3=q8,<mask=q0
1869
+ vand q8,q8,q0
1870
+
1871
+ # qhasm: 2x r4 += t4
1872
+ # asm 1: vadd.i64 >r4=reg128#3,<r4=reg128#10,<t4=reg128#3
1873
+ # asm 2: vadd.i64 >r4=q2,<r4=q9,<t4=q2
1874
+ vadd.i64 q2,q9,q2
1875
+
1876
+ # qhasm: 2x t2 = r1 unsigned>> 26
1877
+ # asm 1: vshr.u64 >t2=reg128#10,<r1=reg128#2,#26
1878
+ # asm 2: vshr.u64 >t2=q9,<r1=q1,#26
1879
+ vshr.u64 q9,q1,#26
1880
+
1881
+ # qhasm: r1 &= mask
1882
+ # asm 1: vand >r1=reg128#2,<r1=reg128#2,<mask=reg128#1
1883
+ # asm 2: vand >r1=q1,<r1=q1,<mask=q0
1884
+ vand q1,q1,q0
1885
+
1886
+ # qhasm: 2x t0 = r4 unsigned>> 26
1887
+ # asm 1: vshr.u64 >t0=reg128#11,<r4=reg128#3,#26
1888
+ # asm 2: vshr.u64 >t0=q10,<r4=q2,#26
1889
+ vshr.u64 q10,q2,#26
1890
+
1891
+ # qhasm: 2x r2 += t2
1892
+ # asm 1: vadd.i64 >r2=reg128#10,<r2=reg128#16,<t2=reg128#10
1893
+ # asm 2: vadd.i64 >r2=q9,<r2=q15,<t2=q9
1894
+ vadd.i64 q9,q15,q9
1895
+
1896
+ # qhasm: r4 &= mask
1897
+ # asm 1: vand >r4=reg128#3,<r4=reg128#3,<mask=reg128#1
1898
+ # asm 2: vand >r4=q2,<r4=q2,<mask=q0
1899
+ vand q2,q2,q0
1900
+
1901
+ # qhasm: 2x r0 += t0
1902
+ # asm 1: vadd.i64 >r0=reg128#4,<r0=reg128#4,<t0=reg128#11
1903
+ # asm 2: vadd.i64 >r0=q3,<r0=q3,<t0=q10
1904
+ vadd.i64 q3,q3,q10
1905
+
1906
+ # qhasm: 2x t0 <<= 2
1907
+ # asm 1: vshl.i64 >t0=reg128#11,<t0=reg128#11,#2
1908
+ # asm 2: vshl.i64 >t0=q10,<t0=q10,#2
1909
+ vshl.i64 q10,q10,#2
1910
+
1911
+ # qhasm: 2x t3 = r2 unsigned>> 26
1912
+ # asm 1: vshr.u64 >t3=reg128#12,<r2=reg128#10,#26
1913
+ # asm 2: vshr.u64 >t3=q11,<r2=q9,#26
1914
+ vshr.u64 q11,q9,#26
1915
+
1916
+ # qhasm: 2x r0 += t0
1917
+ # asm 1: vadd.i64 >r0=reg128#4,<r0=reg128#4,<t0=reg128#11
1918
+ # asm 2: vadd.i64 >r0=q3,<r0=q3,<t0=q10
1919
+ vadd.i64 q3,q3,q10
1920
+
1921
+ # qhasm: x23 = r2 & mask
1922
+ # asm 1: vand >x23=reg128#10,<r2=reg128#10,<mask=reg128#1
1923
+ # asm 2: vand >x23=q9,<r2=q9,<mask=q0
1924
+ vand q9,q9,q0
1925
+
1926
+ # qhasm: 2x r3 += t3
1927
+ # asm 1: vadd.i64 >r3=reg128#9,<r3=reg128#9,<t3=reg128#12
1928
+ # asm 2: vadd.i64 >r3=q8,<r3=q8,<t3=q11
1929
+ vadd.i64 q8,q8,q11
1930
+
1931
+ # qhasm: 2x t1 = r0 unsigned>> 26
1932
+ # asm 1: vshr.u64 >t1=reg128#11,<r0=reg128#4,#26
1933
+ # asm 2: vshr.u64 >t1=q10,<r0=q3,#26
1934
+ vshr.u64 q10,q3,#26
1935
+
1936
+ # qhasm: x23 = x23[0,2,1,3]
1937
+ # asm 1: vtrn.32 <x23=reg128#10%bot,<x23=reg128#10%top
1938
+ # asm 2: vtrn.32 <x23=d18,<x23=d19
1939
+ vtrn.32 d18,d19
1940
+
1941
+ # qhasm: x01 = r0 & mask
1942
+ # asm 1: vand >x01=reg128#4,<r0=reg128#4,<mask=reg128#1
1943
+ # asm 2: vand >x01=q3,<r0=q3,<mask=q0
1944
+ vand q3,q3,q0
1945
+
1946
+ # qhasm: 2x r1 += t1
1947
+ # asm 1: vadd.i64 >r1=reg128#2,<r1=reg128#2,<t1=reg128#11
1948
+ # asm 2: vadd.i64 >r1=q1,<r1=q1,<t1=q10
1949
+ vadd.i64 q1,q1,q10
1950
+
1951
+ # qhasm: 2x t4 = r3 unsigned>> 26
1952
+ # asm 1: vshr.u64 >t4=reg128#11,<r3=reg128#9,#26
1953
+ # asm 2: vshr.u64 >t4=q10,<r3=q8,#26
1954
+ vshr.u64 q10,q8,#26
1955
+
1956
+ # qhasm: x01 = x01[0,2,1,3]
1957
+ # asm 1: vtrn.32 <x01=reg128#4%bot,<x01=reg128#4%top
1958
+ # asm 2: vtrn.32 <x01=d6,<x01=d7
1959
+ vtrn.32 d6,d7
1960
+
1961
+ # qhasm: r3 &= mask
1962
+ # asm 1: vand >r3=reg128#1,<r3=reg128#9,<mask=reg128#1
1963
+ # asm 2: vand >r3=q0,<r3=q8,<mask=q0
1964
+ vand q0,q8,q0
1965
+
1966
+ # qhasm: r1 = r1[0,2,1,3]
1967
+ # asm 1: vtrn.32 <r1=reg128#2%bot,<r1=reg128#2%top
1968
+ # asm 2: vtrn.32 <r1=d2,<r1=d3
1969
+ vtrn.32 d2,d3
1970
+
1971
+ # qhasm: 2x x4 = r4 + t4
1972
+ # asm 1: vadd.i64 >x4=reg128#3,<r4=reg128#3,<t4=reg128#11
1973
+ # asm 2: vadd.i64 >x4=q2,<r4=q2,<t4=q10
1974
+ vadd.i64 q2,q2,q10
1975
+
1976
+ # qhasm: r3 = r3[0,2,1,3]
1977
+ # asm 1: vtrn.32 <r3=reg128#1%bot,<r3=reg128#1%top
1978
+ # asm 2: vtrn.32 <r3=d0,<r3=d1
1979
+ vtrn.32 d0,d1
1980
+
1981
+ # qhasm: x01 = x01[0,1] r1[0,1]
1982
+ # asm 1: vext.32 <x01=reg128#4%top,<r1=reg128#2%bot,<r1=reg128#2%bot,#0
1983
+ # asm 2: vext.32 <x01=d7,<r1=d2,<r1=d2,#0
1984
+ vext.32 d7,d2,d2,#0
1985
+
1986
+ # qhasm: x23 = x23[0,1] r3[0,1]
1987
+ # asm 1: vext.32 <x23=reg128#10%top,<r3=reg128#1%bot,<r3=reg128#1%bot,#0
1988
+ # asm 2: vext.32 <x23=d19,<r3=d0,<r3=d0,#0
1989
+ vext.32 d19,d0,d0,#0
1990
+
1991
+ # qhasm: x4 = x4[0,2,1,3]
1992
+ # asm 1: vtrn.32 <x4=reg128#3%bot,<x4=reg128#3%top
1993
+ # asm 2: vtrn.32 <x4=d4,<x4=d5
1994
+ vtrn.32 d4,d5
1995
+
1996
+ # qhasm: mem128[input_0] aligned= x01;input_0+=16
1997
+ # asm 1: vst1.8 {<x01=reg128#4%bot-<x01=reg128#4%top},[<input_0=int32#1,: 128]!
1998
+ # asm 2: vst1.8 {<x01=d6-<x01=d7},[<input_0=r0,: 128]!
1999
+ vst1.8 {d6-d7},[r0,: 128]!
2000
+
2001
+ # qhasm: mem128[input_0] aligned= x23;input_0+=16
2002
+ # asm 1: vst1.8 {<x23=reg128#10%bot-<x23=reg128#10%top},[<input_0=int32#1,: 128]!
2003
+ # asm 2: vst1.8 {<x23=d18-<x23=d19},[<input_0=r0,: 128]!
2004
+ vst1.8 {d18-d19},[r0,: 128]!
2005
+
2006
+ # qhasm: mem64[input_0] aligned= x4[0]
2007
+ # asm 1: vst1.8 <x4=reg128#3%bot,[<input_0=int32#1,: 64]
2008
+ # asm 2: vst1.8 <x4=d4,[<input_0=r0,: 64]
2009
+ vst1.8 d4,[r0,: 64]
2010
+
2011
+ # qhasm: return
2012
+ add sp,sp,#0
2013
+ bx lr
2014
+
2015
+ #endif /* __arm__ && !OPENSSL_NO_ASM */
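
The qhasm comments above (repeated "2x t = r unsigned>> 26", "r &= mask", "2x t0 <<= 2", "2x r0 += t0" steps) describe the radix-2^26 carry propagation used by this Poly1305 code: each 64-bit accumulator limb is reduced to 26 bits and the carry out of the top limb is folded back into the bottom limb times 5, since 2^130 is congruent to 5 modulo 2^130 - 5. The C sketch below is a hypothetical, sequential illustration of that carry-and-fold technique only; the limb array h[0..4] and the function name poly1305_carry_reduce are made up for this example, and the vendored NEON routine interleaves the same carries in a different order over vector pairs.

#include <stdint.h>
#include <stdio.h>

/* Illustrative only: propagate carries between five 26-bit limbs held in
 * 64-bit words, folding the top carry back as carry*5 (c + (c << 2)). */
static void poly1305_carry_reduce(uint64_t h[5]) {
    const uint64_t mask = (1ULL << 26) - 1;   /* same 26-bit mask as the NEON code */
    uint64_t c;

    c = h[0] >> 26; h[0] &= mask; h[1] += c;  /* vshr.u64 #26 / vand / vadd.i64 */
    c = h[1] >> 26; h[1] &= mask; h[2] += c;
    c = h[2] >> 26; h[2] &= mask; h[3] += c;
    c = h[3] >> 26; h[3] &= mask; h[4] += c;
    c = h[4] >> 26; h[4] &= mask;
    h[0] += c + (c << 2);                     /* fold carry*5: 2^130 = 5 mod 2^130 - 5 */
    c = h[0] >> 26; h[0] &= mask; h[1] += c;  /* final small carry out of h[0] */
}

int main(void) {
    /* arbitrary oversized limbs, as they might look after a multiply step */
    uint64_t h[5] = { 0x3ffffffffULL, 0x1ffffffULL, 0x7ffffffULL, 0x12345678ULL, 0x4000001ULL };
    poly1305_carry_reduce(h);
    for (int i = 0; i < 5; i++)
        printf("h[%d] = 0x%07llx\n", i, (unsigned long long)h[i]);
    return 0;
}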