ring-native 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (261)
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/Gemfile +3 -0
  4. data/README.md +22 -0
  5. data/Rakefile +1 -0
  6. data/ext/ring/extconf.rb +29 -0
  7. data/lib/ring/native.rb +8 -0
  8. data/lib/ring/native/version.rb +5 -0
  9. data/ring-native.gemspec +25 -0
  10. data/vendor/ring/BUILDING.md +40 -0
  11. data/vendor/ring/Cargo.toml +43 -0
  12. data/vendor/ring/LICENSE +185 -0
  13. data/vendor/ring/Makefile +35 -0
  14. data/vendor/ring/PORTING.md +163 -0
  15. data/vendor/ring/README.md +113 -0
  16. data/vendor/ring/STYLE.md +197 -0
  17. data/vendor/ring/appveyor.yml +27 -0
  18. data/vendor/ring/build.rs +108 -0
  19. data/vendor/ring/crypto/aes/aes.c +1142 -0
  20. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +25 -0
  21. data/vendor/ring/crypto/aes/aes_test.cc +93 -0
  22. data/vendor/ring/crypto/aes/asm/aes-586.pl +2368 -0
  23. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +1249 -0
  24. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +2246 -0
  25. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +1318 -0
  26. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +2084 -0
  27. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +675 -0
  28. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +1364 -0
  29. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +1565 -0
  30. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +841 -0
  31. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +1116 -0
  32. data/vendor/ring/crypto/aes/internal.h +87 -0
  33. data/vendor/ring/crypto/aes/mode_wrappers.c +61 -0
  34. data/vendor/ring/crypto/bn/add.c +394 -0
  35. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +694 -0
  36. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +1503 -0
  37. data/vendor/ring/crypto/bn/asm/bn-586.pl +774 -0
  38. data/vendor/ring/crypto/bn/asm/co-586.pl +287 -0
  39. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +1882 -0
  40. data/vendor/ring/crypto/bn/asm/x86-mont.pl +592 -0
  41. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +599 -0
  42. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +1393 -0
  43. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +3507 -0
  44. data/vendor/ring/crypto/bn/bn.c +352 -0
  45. data/vendor/ring/crypto/bn/bn_asn1.c +74 -0
  46. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +25 -0
  47. data/vendor/ring/crypto/bn/bn_test.cc +1696 -0
  48. data/vendor/ring/crypto/bn/cmp.c +200 -0
  49. data/vendor/ring/crypto/bn/convert.c +433 -0
  50. data/vendor/ring/crypto/bn/ctx.c +311 -0
  51. data/vendor/ring/crypto/bn/div.c +594 -0
  52. data/vendor/ring/crypto/bn/exponentiation.c +1335 -0
  53. data/vendor/ring/crypto/bn/gcd.c +711 -0
  54. data/vendor/ring/crypto/bn/generic.c +1019 -0
  55. data/vendor/ring/crypto/bn/internal.h +316 -0
  56. data/vendor/ring/crypto/bn/montgomery.c +516 -0
  57. data/vendor/ring/crypto/bn/mul.c +888 -0
  58. data/vendor/ring/crypto/bn/prime.c +829 -0
  59. data/vendor/ring/crypto/bn/random.c +334 -0
  60. data/vendor/ring/crypto/bn/rsaz_exp.c +262 -0
  61. data/vendor/ring/crypto/bn/rsaz_exp.h +53 -0
  62. data/vendor/ring/crypto/bn/shift.c +276 -0
  63. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +25 -0
  64. data/vendor/ring/crypto/bytestring/bytestring_test.cc +421 -0
  65. data/vendor/ring/crypto/bytestring/cbb.c +399 -0
  66. data/vendor/ring/crypto/bytestring/cbs.c +227 -0
  67. data/vendor/ring/crypto/bytestring/internal.h +46 -0
  68. data/vendor/ring/crypto/chacha/chacha_generic.c +140 -0
  69. data/vendor/ring/crypto/chacha/chacha_vec.c +323 -0
  70. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +1447 -0
  71. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +153 -0
  72. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +25 -0
  73. data/vendor/ring/crypto/cipher/e_aes.c +390 -0
  74. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +208 -0
  75. data/vendor/ring/crypto/cipher/internal.h +173 -0
  76. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +543 -0
  77. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +9 -0
  78. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +475 -0
  79. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +23 -0
  80. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +422 -0
  81. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +484 -0
  82. data/vendor/ring/crypto/cipher/test/cipher_test.txt +100 -0
  83. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +25 -0
  84. data/vendor/ring/crypto/constant_time_test.c +304 -0
  85. data/vendor/ring/crypto/cpu-arm-asm.S +32 -0
  86. data/vendor/ring/crypto/cpu-arm.c +199 -0
  87. data/vendor/ring/crypto/cpu-intel.c +261 -0
  88. data/vendor/ring/crypto/crypto.c +151 -0
  89. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +2118 -0
  90. data/vendor/ring/crypto/curve25519/curve25519.c +4888 -0
  91. data/vendor/ring/crypto/curve25519/x25519_test.cc +128 -0
  92. data/vendor/ring/crypto/digest/md32_common.h +181 -0
  93. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +2725 -0
  94. data/vendor/ring/crypto/ec/ec.c +193 -0
  95. data/vendor/ring/crypto/ec/ec_curves.c +61 -0
  96. data/vendor/ring/crypto/ec/ec_key.c +228 -0
  97. data/vendor/ring/crypto/ec/ec_montgomery.c +114 -0
  98. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +25 -0
  99. data/vendor/ring/crypto/ec/internal.h +243 -0
  100. data/vendor/ring/crypto/ec/oct.c +253 -0
  101. data/vendor/ring/crypto/ec/p256-64.c +1794 -0
  102. data/vendor/ring/crypto/ec/p256-x86_64-table.h +9548 -0
  103. data/vendor/ring/crypto/ec/p256-x86_64.c +509 -0
  104. data/vendor/ring/crypto/ec/simple.c +1007 -0
  105. data/vendor/ring/crypto/ec/util-64.c +183 -0
  106. data/vendor/ring/crypto/ec/wnaf.c +508 -0
  107. data/vendor/ring/crypto/ecdh/ecdh.c +155 -0
  108. data/vendor/ring/crypto/ecdsa/ecdsa.c +304 -0
  109. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +193 -0
  110. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +25 -0
  111. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +327 -0
  112. data/vendor/ring/crypto/header_removed.h +17 -0
  113. data/vendor/ring/crypto/internal.h +495 -0
  114. data/vendor/ring/crypto/libring.Windows.vcxproj +101 -0
  115. data/vendor/ring/crypto/mem.c +98 -0
  116. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +1045 -0
  117. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +517 -0
  118. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +1393 -0
  119. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +1741 -0
  120. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +422 -0
  121. data/vendor/ring/crypto/modes/ctr.c +226 -0
  122. data/vendor/ring/crypto/modes/gcm.c +1206 -0
  123. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +25 -0
  124. data/vendor/ring/crypto/modes/gcm_test.c +348 -0
  125. data/vendor/ring/crypto/modes/internal.h +299 -0
  126. data/vendor/ring/crypto/perlasm/arm-xlate.pl +170 -0
  127. data/vendor/ring/crypto/perlasm/readme +100 -0
  128. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +1164 -0
  129. data/vendor/ring/crypto/perlasm/x86asm.pl +292 -0
  130. data/vendor/ring/crypto/perlasm/x86gas.pl +263 -0
  131. data/vendor/ring/crypto/perlasm/x86masm.pl +200 -0
  132. data/vendor/ring/crypto/perlasm/x86nasm.pl +187 -0
  133. data/vendor/ring/crypto/poly1305/poly1305.c +331 -0
  134. data/vendor/ring/crypto/poly1305/poly1305_arm.c +301 -0
  135. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +2015 -0
  136. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +25 -0
  137. data/vendor/ring/crypto/poly1305/poly1305_test.cc +80 -0
  138. data/vendor/ring/crypto/poly1305/poly1305_test.txt +52 -0
  139. data/vendor/ring/crypto/poly1305/poly1305_vec.c +892 -0
  140. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +75 -0
  141. data/vendor/ring/crypto/rand/internal.h +32 -0
  142. data/vendor/ring/crypto/rand/rand.c +189 -0
  143. data/vendor/ring/crypto/rand/urandom.c +219 -0
  144. data/vendor/ring/crypto/rand/windows.c +56 -0
  145. data/vendor/ring/crypto/refcount_c11.c +66 -0
  146. data/vendor/ring/crypto/refcount_lock.c +53 -0
  147. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +25 -0
  148. data/vendor/ring/crypto/refcount_test.c +58 -0
  149. data/vendor/ring/crypto/rsa/blinding.c +462 -0
  150. data/vendor/ring/crypto/rsa/internal.h +108 -0
  151. data/vendor/ring/crypto/rsa/padding.c +300 -0
  152. data/vendor/ring/crypto/rsa/rsa.c +450 -0
  153. data/vendor/ring/crypto/rsa/rsa_asn1.c +261 -0
  154. data/vendor/ring/crypto/rsa/rsa_impl.c +944 -0
  155. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +25 -0
  156. data/vendor/ring/crypto/rsa/rsa_test.cc +437 -0
  157. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +436 -0
  158. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +2390 -0
  159. data/vendor/ring/crypto/sha/asm/sha256-586.pl +1275 -0
  160. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +735 -0
  161. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +14 -0
  162. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +14 -0
  163. data/vendor/ring/crypto/sha/asm/sha512-586.pl +911 -0
  164. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +666 -0
  165. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +14 -0
  166. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +14 -0
  167. data/vendor/ring/crypto/sha/sha1.c +271 -0
  168. data/vendor/ring/crypto/sha/sha256.c +204 -0
  169. data/vendor/ring/crypto/sha/sha512.c +355 -0
  170. data/vendor/ring/crypto/test/file_test.cc +326 -0
  171. data/vendor/ring/crypto/test/file_test.h +181 -0
  172. data/vendor/ring/crypto/test/malloc.cc +150 -0
  173. data/vendor/ring/crypto/test/scoped_types.h +95 -0
  174. data/vendor/ring/crypto/test/test.Windows.vcxproj +35 -0
  175. data/vendor/ring/crypto/test/test_util.cc +46 -0
  176. data/vendor/ring/crypto/test/test_util.h +41 -0
  177. data/vendor/ring/crypto/thread_none.c +55 -0
  178. data/vendor/ring/crypto/thread_pthread.c +165 -0
  179. data/vendor/ring/crypto/thread_test.Windows.vcxproj +25 -0
  180. data/vendor/ring/crypto/thread_test.c +200 -0
  181. data/vendor/ring/crypto/thread_win.c +282 -0
  182. data/vendor/ring/examples/checkdigest.rs +103 -0
  183. data/vendor/ring/include/openssl/aes.h +121 -0
  184. data/vendor/ring/include/openssl/arm_arch.h +129 -0
  185. data/vendor/ring/include/openssl/base.h +156 -0
  186. data/vendor/ring/include/openssl/bn.h +794 -0
  187. data/vendor/ring/include/openssl/buffer.h +18 -0
  188. data/vendor/ring/include/openssl/bytestring.h +235 -0
  189. data/vendor/ring/include/openssl/chacha.h +37 -0
  190. data/vendor/ring/include/openssl/cmac.h +76 -0
  191. data/vendor/ring/include/openssl/cpu.h +184 -0
  192. data/vendor/ring/include/openssl/crypto.h +43 -0
  193. data/vendor/ring/include/openssl/curve25519.h +88 -0
  194. data/vendor/ring/include/openssl/ec.h +225 -0
  195. data/vendor/ring/include/openssl/ec_key.h +129 -0
  196. data/vendor/ring/include/openssl/ecdh.h +110 -0
  197. data/vendor/ring/include/openssl/ecdsa.h +156 -0
  198. data/vendor/ring/include/openssl/err.h +201 -0
  199. data/vendor/ring/include/openssl/mem.h +101 -0
  200. data/vendor/ring/include/openssl/obj_mac.h +71 -0
  201. data/vendor/ring/include/openssl/opensslfeatures.h +68 -0
  202. data/vendor/ring/include/openssl/opensslv.h +18 -0
  203. data/vendor/ring/include/openssl/ossl_typ.h +18 -0
  204. data/vendor/ring/include/openssl/poly1305.h +51 -0
  205. data/vendor/ring/include/openssl/rand.h +70 -0
  206. data/vendor/ring/include/openssl/rsa.h +399 -0
  207. data/vendor/ring/include/openssl/thread.h +133 -0
  208. data/vendor/ring/include/openssl/type_check.h +71 -0
  209. data/vendor/ring/mk/Common.props +63 -0
  210. data/vendor/ring/mk/Windows.props +42 -0
  211. data/vendor/ring/mk/WindowsTest.props +18 -0
  212. data/vendor/ring/mk/appveyor.bat +62 -0
  213. data/vendor/ring/mk/bottom_of_makefile.mk +54 -0
  214. data/vendor/ring/mk/ring.mk +266 -0
  215. data/vendor/ring/mk/top_of_makefile.mk +214 -0
  216. data/vendor/ring/mk/travis.sh +40 -0
  217. data/vendor/ring/mk/update-travis-yml.py +229 -0
  218. data/vendor/ring/ring.sln +153 -0
  219. data/vendor/ring/src/aead.rs +682 -0
  220. data/vendor/ring/src/agreement.rs +248 -0
  221. data/vendor/ring/src/c.rs +129 -0
  222. data/vendor/ring/src/constant_time.rs +37 -0
  223. data/vendor/ring/src/der.rs +96 -0
  224. data/vendor/ring/src/digest.rs +690 -0
  225. data/vendor/ring/src/digest_tests.txt +57 -0
  226. data/vendor/ring/src/ecc.rs +28 -0
  227. data/vendor/ring/src/ecc_build.rs +279 -0
  228. data/vendor/ring/src/ecc_curves.rs +117 -0
  229. data/vendor/ring/src/ed25519_tests.txt +2579 -0
  230. data/vendor/ring/src/exe_tests.rs +46 -0
  231. data/vendor/ring/src/ffi.rs +29 -0
  232. data/vendor/ring/src/file_test.rs +187 -0
  233. data/vendor/ring/src/hkdf.rs +153 -0
  234. data/vendor/ring/src/hkdf_tests.txt +59 -0
  235. data/vendor/ring/src/hmac.rs +414 -0
  236. data/vendor/ring/src/hmac_tests.txt +97 -0
  237. data/vendor/ring/src/input.rs +312 -0
  238. data/vendor/ring/src/lib.rs +41 -0
  239. data/vendor/ring/src/pbkdf2.rs +265 -0
  240. data/vendor/ring/src/pbkdf2_tests.txt +113 -0
  241. data/vendor/ring/src/polyfill.rs +57 -0
  242. data/vendor/ring/src/rand.rs +28 -0
  243. data/vendor/ring/src/signature.rs +314 -0
  244. data/vendor/ring/third-party/NIST/README.md +9 -0
  245. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +263 -0
  246. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +309 -0
  247. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +267 -0
  248. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +263 -0
  249. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +309 -0
  250. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +267 -0
  251. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +263 -0
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +309 -0
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +267 -0
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +519 -0
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +309 -0
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +523 -0
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +519 -0
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +309 -0
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +523 -0
  260. data/vendor/ring/third-party/NIST/sha256sums.txt +1 -0
  261. metadata +333 -0
data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl
@@ -0,0 +1,1364 @@
+ #!/usr/bin/env perl
+
+ # ====================================================================
+ # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+ # project. The module is, however, dual licensed under OpenSSL and
+ # CRYPTOGAMS licenses depending on where you obtain it. For further
+ # details see http://www.openssl.org/~appro/cryptogams/.
+ #
+ # Specific modes and adaptation for Linux kernel by Ard Biesheuvel
+ # <ard.biesheuvel@linaro.org>. Permission to use under GPL terms is
+ # granted.
+ # ====================================================================
+
+ # Bit-sliced AES for ARM NEON
+ #
+ # February 2012.
+ #
+ # This implementation is direct adaptation of bsaes-x86_64 module for
+ # ARM NEON. Except that this module is endian-neutral [in sense that
+ # it can be compiled for either endianness] by courtesy of vld1.8's
+ # neutrality. Initial version doesn't implement interface to OpenSSL,
+ # only low-level primitives and unsupported entry points, just enough
+ # to collect performance results, which for Cortex-A8 core are:
+ #
+ # encrypt 19.5 cycles per byte processed with 128-bit key
+ # decrypt 22.1 cycles per byte processed with 128-bit key
+ # key conv. 440 cycles per 128-bit key/0.18 of 8x block
+ #
+ # Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7,
+ # which is [much] worse than anticipated (for further details see
+ # http://www.openssl.org/~appro/Snapdragon-S4.html).
+ #
+ # Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code
+ # manages in 20.0 cycles].
+ #
+ # When comparing to x86_64 results keep in mind that NEON unit is
+ # [mostly] single-issue and thus can't [fully] benefit from
+ # instruction-level parallelism. And when comparing to aes-armv4
+ # results keep in mind key schedule conversion overhead (see
+ # bsaes-x86_64.pl for further details)...
+ #
+ # <appro@openssl.org>
+
+ # April-August 2013
+ #
+ # Add CBC, CTR and XTS subroutines, adapt for kernel use.
+ #
+ # <ard.biesheuvel@linaro.org>
+
+ $flavour = shift;
+ if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+ else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+ if ($flavour && $flavour ne "void") {
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+ die "can't locate arm-xlate.pl";
+
+ open STDOUT,"| \"$^X\" $xlate $flavour $output";
+ } else {
+ open STDOUT,">$output";
+ }
+
+ my ($inp,$out,$len,$key)=("r0","r1","r2","r3");
+ my @XMM=map("q$_",(0..15));
+
+ {
+ my ($key,$rounds,$const)=("r4","r5","r6");
+
+ sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
+ sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
73
+
74
+ sub Sbox {
75
+ # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
76
+ # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
77
+ my @b=@_[0..7];
78
+ my @t=@_[8..11];
79
+ my @s=@_[12..15];
80
+ &InBasisChange (@b);
81
+ &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
82
+ &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
83
+ }
84
+
85
+ sub InBasisChange {
86
+ # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
87
+ # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
88
+ my @b=@_[0..7];
89
+ $code.=<<___;
90
+ veor @b[2], @b[2], @b[1]
91
+ veor @b[5], @b[5], @b[6]
92
+ veor @b[3], @b[3], @b[0]
93
+ veor @b[6], @b[6], @b[2]
94
+ veor @b[5], @b[5], @b[0]
95
+
96
+ veor @b[6], @b[6], @b[3]
97
+ veor @b[3], @b[3], @b[7]
98
+ veor @b[7], @b[7], @b[5]
99
+ veor @b[3], @b[3], @b[4]
100
+ veor @b[4], @b[4], @b[5]
101
+
102
+ veor @b[2], @b[2], @b[7]
103
+ veor @b[3], @b[3], @b[1]
104
+ veor @b[1], @b[1], @b[5]
105
+ ___
106
+ }
107
+
108
+ sub OutBasisChange {
109
+ # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
110
+ # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
111
+ my @b=@_[0..7];
112
+ $code.=<<___;
113
+ veor @b[0], @b[0], @b[6]
114
+ veor @b[1], @b[1], @b[4]
115
+ veor @b[4], @b[4], @b[6]
116
+ veor @b[2], @b[2], @b[0]
117
+ veor @b[6], @b[6], @b[1]
118
+
119
+ veor @b[1], @b[1], @b[5]
120
+ veor @b[5], @b[5], @b[3]
121
+ veor @b[3], @b[3], @b[7]
122
+ veor @b[7], @b[7], @b[5]
123
+ veor @b[2], @b[2], @b[5]
124
+
125
+ veor @b[4], @b[4], @b[7]
126
+ ___
127
+ }
128
+
129
+ sub InvSbox {
130
+ # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
131
+ # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
132
+ my @b=@_[0..7];
133
+ my @t=@_[8..11];
134
+ my @s=@_[12..15];
135
+ &InvInBasisChange (@b);
136
+ &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
137
+ &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
138
+ }
139
+
140
+ sub InvInBasisChange { # OutBasisChange in reverse (with twist)
141
+ my @b=@_[5,1,2,6,3,7,0,4];
142
+ $code.=<<___
143
+ veor @b[1], @b[1], @b[7]
144
+ veor @b[4], @b[4], @b[7]
145
+
146
+ veor @b[7], @b[7], @b[5]
147
+ veor @b[1], @b[1], @b[3]
148
+ veor @b[2], @b[2], @b[5]
149
+ veor @b[3], @b[3], @b[7]
150
+
151
+ veor @b[6], @b[6], @b[1]
152
+ veor @b[2], @b[2], @b[0]
153
+ veor @b[5], @b[5], @b[3]
154
+ veor @b[4], @b[4], @b[6]
155
+ veor @b[0], @b[0], @b[6]
156
+ veor @b[1], @b[1], @b[4]
157
+ ___
158
+ }
159
+
160
+ sub InvOutBasisChange { # InBasisChange in reverse
161
+ my @b=@_[2,5,7,3,6,1,0,4];
162
+ $code.=<<___;
163
+ veor @b[1], @b[1], @b[5]
164
+ veor @b[2], @b[2], @b[7]
165
+
166
+ veor @b[3], @b[3], @b[1]
167
+ veor @b[4], @b[4], @b[5]
168
+ veor @b[7], @b[7], @b[5]
169
+ veor @b[3], @b[3], @b[4]
170
+ veor @b[5], @b[5], @b[0]
171
+ veor @b[3], @b[3], @b[7]
172
+ veor @b[6], @b[6], @b[2]
173
+ veor @b[2], @b[2], @b[1]
174
+ veor @b[6], @b[6], @b[3]
175
+
176
+ veor @b[3], @b[3], @b[0]
177
+ veor @b[5], @b[5], @b[6]
178
+ ___
179
+ }
180
+
181
+ sub Mul_GF4 {
182
+ #;*************************************************************
183
+ #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
184
+ #;*************************************************************
185
+ my ($x0,$x1,$y0,$y1,$t0,$t1)=@_;
186
+ $code.=<<___;
187
+ veor $t0, $y0, $y1
188
+ vand $t0, $t0, $x0
189
+ veor $x0, $x0, $x1
190
+ vand $t1, $x1, $y0
191
+ vand $x0, $x0, $y1
192
+ veor $x1, $t1, $t0
193
+ veor $x0, $x0, $t1
194
+ ___
195
+ }
196
+
197
+ sub Mul_GF4_N { # not used, see next subroutine
198
+ # multiply and scale by N
199
+ my ($x0,$x1,$y0,$y1,$t0)=@_;
200
+ $code.=<<___;
201
+ veor $t0, $y0, $y1
202
+ vand $t0, $t0, $x0
203
+ veor $x0, $x0, $x1
204
+ vand $x1, $x1, $y0
205
+ vand $x0, $x0, $y1
206
+ veor $x1, $x1, $x0
207
+ veor $x0, $x0, $t0
208
+ ___
209
+ }
210
+
211
+ sub Mul_GF4_N_GF4 {
212
+ # interleaved Mul_GF4_N and Mul_GF4
213
+ my ($x0,$x1,$y0,$y1,$t0,
214
+ $x2,$x3,$y2,$y3,$t1)=@_;
215
+ $code.=<<___;
216
+ veor $t0, $y0, $y1
217
+ veor $t1, $y2, $y3
218
+ vand $t0, $t0, $x0
219
+ vand $t1, $t1, $x2
220
+ veor $x0, $x0, $x1
221
+ veor $x2, $x2, $x3
222
+ vand $x1, $x1, $y0
223
+ vand $x3, $x3, $y2
224
+ vand $x0, $x0, $y1
225
+ vand $x2, $x2, $y3
226
+ veor $x1, $x1, $x0
227
+ veor $x2, $x2, $x3
228
+ veor $x0, $x0, $t0
229
+ veor $x3, $x3, $t1
230
+ ___
231
+ }
232
+ sub Mul_GF16_2 {
233
+ my @x=@_[0..7];
234
+ my @y=@_[8..11];
235
+ my @t=@_[12..15];
236
+ $code.=<<___;
237
+ veor @t[0], @x[0], @x[2]
238
+ veor @t[1], @x[1], @x[3]
239
+ ___
240
+ &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2..3]);
241
+ $code.=<<___;
242
+ veor @y[0], @y[0], @y[2]
243
+ veor @y[1], @y[1], @y[3]
244
+ ___
245
+ Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
246
+ @x[2], @x[3], @y[2], @y[3], @t[2]);
247
+ $code.=<<___;
248
+ veor @x[0], @x[0], @t[0]
249
+ veor @x[2], @x[2], @t[0]
250
+ veor @x[1], @x[1], @t[1]
251
+ veor @x[3], @x[3], @t[1]
252
+
253
+ veor @t[0], @x[4], @x[6]
254
+ veor @t[1], @x[5], @x[7]
255
+ ___
256
+ &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
257
+ @x[6], @x[7], @y[2], @y[3], @t[2]);
258
+ $code.=<<___;
259
+ veor @y[0], @y[0], @y[2]
260
+ veor @y[1], @y[1], @y[3]
261
+ ___
262
+ &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[2..3]);
263
+ $code.=<<___;
264
+ veor @x[4], @x[4], @t[0]
265
+ veor @x[6], @x[6], @t[0]
266
+ veor @x[5], @x[5], @t[1]
267
+ veor @x[7], @x[7], @t[1]
268
+ ___
269
+ }
270
+ sub Inv_GF256 {
271
+ #;********************************************************************
272
+ #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
273
+ #;********************************************************************
274
+ my @x=@_[0..7];
275
+ my @t=@_[8..11];
276
+ my @s=@_[12..15];
277
+ # direct optimizations from hardware
278
+ $code.=<<___;
279
+ veor @t[3], @x[4], @x[6]
280
+ veor @t[2], @x[5], @x[7]
281
+ veor @t[1], @x[1], @x[3]
282
+ veor @s[1], @x[7], @x[6]
283
+ vmov @t[0], @t[2]
284
+ veor @s[0], @x[0], @x[2]
285
+
286
+ vorr @t[2], @t[2], @t[1]
287
+ veor @s[3], @t[3], @t[0]
288
+ vand @s[2], @t[3], @s[0]
289
+ vorr @t[3], @t[3], @s[0]
290
+ veor @s[0], @s[0], @t[1]
291
+ vand @t[0], @t[0], @t[1]
292
+ veor @t[1], @x[3], @x[2]
293
+ vand @s[3], @s[3], @s[0]
294
+ vand @s[1], @s[1], @t[1]
295
+ veor @t[1], @x[4], @x[5]
296
+ veor @s[0], @x[1], @x[0]
297
+ veor @t[3], @t[3], @s[1]
298
+ veor @t[2], @t[2], @s[1]
299
+ vand @s[1], @t[1], @s[0]
300
+ vorr @t[1], @t[1], @s[0]
301
+ veor @t[3], @t[3], @s[3]
302
+ veor @t[0], @t[0], @s[1]
303
+ veor @t[2], @t[2], @s[2]
304
+ veor @t[1], @t[1], @s[3]
305
+ veor @t[0], @t[0], @s[2]
306
+ vand @s[0], @x[7], @x[3]
307
+ veor @t[1], @t[1], @s[2]
308
+ vand @s[1], @x[6], @x[2]
309
+ vand @s[2], @x[5], @x[1]
310
+ vorr @s[3], @x[4], @x[0]
311
+ veor @t[3], @t[3], @s[0]
312
+ veor @t[1], @t[1], @s[2]
313
+ veor @t[0], @t[0], @s[3]
314
+ veor @t[2], @t[2], @s[1]
315
+
316
+ @ Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
317
+
318
+ @ new smaller inversion
319
+
320
+ vand @s[2], @t[3], @t[1]
321
+ vmov @s[0], @t[0]
322
+
323
+ veor @s[1], @t[2], @s[2]
324
+ veor @s[3], @t[0], @s[2]
325
+ veor @s[2], @t[0], @s[2] @ @s[2]=@s[3]
326
+
327
+ vbsl @s[1], @t[1], @t[0]
328
+ vbsl @s[3], @t[3], @t[2]
329
+ veor @t[3], @t[3], @t[2]
330
+
331
+ vbsl @s[0], @s[1], @s[2]
332
+ vbsl @t[0], @s[2], @s[1]
333
+
334
+ vand @s[2], @s[0], @s[3]
335
+ veor @t[1], @t[1], @t[0]
336
+
337
+ veor @s[2], @s[2], @t[3]
338
+ ___
339
+ # output in s3, s2, s1, t1
340
+
341
+ # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
342
+
343
+ # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
344
+ &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
345
+
346
+ ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
347
+ }
348
+
349
+ # AES linear components
350
+
351
+ sub ShiftRows {
352
+ my @x=@_[0..7];
353
+ my @t=@_[8..11];
354
+ my $mask=pop;
355
+ $code.=<<___;
356
+ vldmia $key!, {@t[0]-@t[3]}
357
+ veor @t[0], @t[0], @x[0]
358
+ veor @t[1], @t[1], @x[1]
359
+ vtbl.8 `&Dlo(@x[0])`, {@t[0]}, `&Dlo($mask)`
360
+ vtbl.8 `&Dhi(@x[0])`, {@t[0]}, `&Dhi($mask)`
361
+ vldmia $key!, {@t[0]}
362
+ veor @t[2], @t[2], @x[2]
363
+ vtbl.8 `&Dlo(@x[1])`, {@t[1]}, `&Dlo($mask)`
364
+ vtbl.8 `&Dhi(@x[1])`, {@t[1]}, `&Dhi($mask)`
365
+ vldmia $key!, {@t[1]}
366
+ veor @t[3], @t[3], @x[3]
367
+ vtbl.8 `&Dlo(@x[2])`, {@t[2]}, `&Dlo($mask)`
368
+ vtbl.8 `&Dhi(@x[2])`, {@t[2]}, `&Dhi($mask)`
369
+ vldmia $key!, {@t[2]}
370
+ vtbl.8 `&Dlo(@x[3])`, {@t[3]}, `&Dlo($mask)`
371
+ vtbl.8 `&Dhi(@x[3])`, {@t[3]}, `&Dhi($mask)`
372
+ vldmia $key!, {@t[3]}
373
+ veor @t[0], @t[0], @x[4]
374
+ veor @t[1], @t[1], @x[5]
375
+ vtbl.8 `&Dlo(@x[4])`, {@t[0]}, `&Dlo($mask)`
376
+ vtbl.8 `&Dhi(@x[4])`, {@t[0]}, `&Dhi($mask)`
377
+ veor @t[2], @t[2], @x[6]
378
+ vtbl.8 `&Dlo(@x[5])`, {@t[1]}, `&Dlo($mask)`
379
+ vtbl.8 `&Dhi(@x[5])`, {@t[1]}, `&Dhi($mask)`
380
+ veor @t[3], @t[3], @x[7]
381
+ vtbl.8 `&Dlo(@x[6])`, {@t[2]}, `&Dlo($mask)`
382
+ vtbl.8 `&Dhi(@x[6])`, {@t[2]}, `&Dhi($mask)`
383
+ vtbl.8 `&Dlo(@x[7])`, {@t[3]}, `&Dlo($mask)`
384
+ vtbl.8 `&Dhi(@x[7])`, {@t[3]}, `&Dhi($mask)`
385
+ ___
386
+ }
387
+
388
+ sub MixColumns {
389
+ # modified to emit output in order suitable for feeding back to aesenc[last]
390
+ my @x=@_[0..7];
391
+ my @t=@_[8..15];
392
+ my $inv=@_[16]; # optional
393
+ $code.=<<___;
394
+ vext.8 @t[0], @x[0], @x[0], #12 @ x0 <<< 32
395
+ vext.8 @t[1], @x[1], @x[1], #12
396
+ veor @x[0], @x[0], @t[0] @ x0 ^ (x0 <<< 32)
397
+ vext.8 @t[2], @x[2], @x[2], #12
398
+ veor @x[1], @x[1], @t[1]
399
+ vext.8 @t[3], @x[3], @x[3], #12
400
+ veor @x[2], @x[2], @t[2]
401
+ vext.8 @t[4], @x[4], @x[4], #12
402
+ veor @x[3], @x[3], @t[3]
403
+ vext.8 @t[5], @x[5], @x[5], #12
404
+ veor @x[4], @x[4], @t[4]
405
+ vext.8 @t[6], @x[6], @x[6], #12
406
+ veor @x[5], @x[5], @t[5]
407
+ vext.8 @t[7], @x[7], @x[7], #12
408
+ veor @x[6], @x[6], @t[6]
409
+
410
+ veor @t[1], @t[1], @x[0]
411
+ veor @x[7], @x[7], @t[7]
412
+ vext.8 @x[0], @x[0], @x[0], #8 @ (x0 ^ (x0 <<< 32)) <<< 64)
413
+ veor @t[2], @t[2], @x[1]
414
+ veor @t[0], @t[0], @x[7]
415
+ veor @t[1], @t[1], @x[7]
416
+ vext.8 @x[1], @x[1], @x[1], #8
417
+ veor @t[5], @t[5], @x[4]
418
+ veor @x[0], @x[0], @t[0]
419
+ veor @t[6], @t[6], @x[5]
420
+ veor @x[1], @x[1], @t[1]
421
+ vext.8 @t[0], @x[4], @x[4], #8
422
+ veor @t[4], @t[4], @x[3]
423
+ vext.8 @t[1], @x[5], @x[5], #8
424
+ veor @t[7], @t[7], @x[6]
425
+ vext.8 @x[4], @x[3], @x[3], #8
426
+ veor @t[3], @t[3], @x[2]
427
+ vext.8 @x[5], @x[7], @x[7], #8
428
+ veor @t[4], @t[4], @x[7]
429
+ vext.8 @x[3], @x[6], @x[6], #8
430
+ veor @t[3], @t[3], @x[7]
431
+ vext.8 @x[6], @x[2], @x[2], #8
432
+ veor @x[7], @t[1], @t[5]
433
+ ___
434
+ $code.=<<___ if (!$inv);
435
+ veor @x[2], @t[0], @t[4]
436
+ veor @x[4], @x[4], @t[3]
437
+ veor @x[5], @x[5], @t[7]
438
+ veor @x[3], @x[3], @t[6]
439
+ @ vmov @x[2], @t[0]
440
+ veor @x[6], @x[6], @t[2]
441
+ @ vmov @x[7], @t[1]
442
+ ___
443
+ $code.=<<___ if ($inv);
444
+ veor @t[3], @t[3], @x[4]
445
+ veor @x[5], @x[5], @t[7]
446
+ veor @x[2], @x[3], @t[6]
447
+ veor @x[3], @t[0], @t[4]
448
+ veor @x[4], @x[6], @t[2]
449
+ vmov @x[6], @t[3]
450
+ @ vmov @x[7], @t[1]
451
+ ___
452
+ }
453
+
454
+ sub InvMixColumns_orig {
455
+ my @x=@_[0..7];
456
+ my @t=@_[8..15];
457
+
458
+ $code.=<<___;
459
+ @ multiplication by 0x0e
460
+ vext.8 @t[7], @x[7], @x[7], #12
461
+ vmov @t[2], @x[2]
462
+ veor @x[2], @x[2], @x[5] @ 2 5
463
+ veor @x[7], @x[7], @x[5] @ 7 5
464
+ vext.8 @t[0], @x[0], @x[0], #12
465
+ vmov @t[5], @x[5]
466
+ veor @x[5], @x[5], @x[0] @ 5 0 [1]
467
+ veor @x[0], @x[0], @x[1] @ 0 1
468
+ vext.8 @t[1], @x[1], @x[1], #12
469
+ veor @x[1], @x[1], @x[2] @ 1 25
470
+ veor @x[0], @x[0], @x[6] @ 01 6 [2]
471
+ vext.8 @t[3], @x[3], @x[3], #12
472
+ veor @x[1], @x[1], @x[3] @ 125 3 [4]
473
+ veor @x[2], @x[2], @x[0] @ 25 016 [3]
474
+ veor @x[3], @x[3], @x[7] @ 3 75
475
+ veor @x[7], @x[7], @x[6] @ 75 6 [0]
476
+ vext.8 @t[6], @x[6], @x[6], #12
477
+ vmov @t[4], @x[4]
478
+ veor @x[6], @x[6], @x[4] @ 6 4
479
+ veor @x[4], @x[4], @x[3] @ 4 375 [6]
480
+ veor @x[3], @x[3], @x[7] @ 375 756=36
481
+ veor @x[6], @x[6], @t[5] @ 64 5 [7]
482
+ veor @x[3], @x[3], @t[2] @ 36 2
483
+ vext.8 @t[5], @t[5], @t[5], #12
484
+ veor @x[3], @x[3], @t[4] @ 362 4 [5]
485
+ ___
486
+ my @y = @x[7,5,0,2,1,3,4,6];
487
+ $code.=<<___;
488
+ @ multiplication by 0x0b
489
+ veor @y[1], @y[1], @y[0]
490
+ veor @y[0], @y[0], @t[0]
491
+ vext.8 @t[2], @t[2], @t[2], #12
492
+ veor @y[1], @y[1], @t[1]
493
+ veor @y[0], @y[0], @t[5]
494
+ vext.8 @t[4], @t[4], @t[4], #12
495
+ veor @y[1], @y[1], @t[6]
496
+ veor @y[0], @y[0], @t[7]
497
+ veor @t[7], @t[7], @t[6] @ clobber t[7]
498
+
499
+ veor @y[3], @y[3], @t[0]
500
+ veor @y[1], @y[1], @y[0]
501
+ vext.8 @t[0], @t[0], @t[0], #12
502
+ veor @y[2], @y[2], @t[1]
503
+ veor @y[4], @y[4], @t[1]
504
+ vext.8 @t[1], @t[1], @t[1], #12
505
+ veor @y[2], @y[2], @t[2]
506
+ veor @y[3], @y[3], @t[2]
507
+ veor @y[5], @y[5], @t[2]
508
+ veor @y[2], @y[2], @t[7]
509
+ vext.8 @t[2], @t[2], @t[2], #12
510
+ veor @y[3], @y[3], @t[3]
511
+ veor @y[6], @y[6], @t[3]
512
+ veor @y[4], @y[4], @t[3]
513
+ veor @y[7], @y[7], @t[4]
514
+ vext.8 @t[3], @t[3], @t[3], #12
515
+ veor @y[5], @y[5], @t[4]
516
+ veor @y[7], @y[7], @t[7]
517
+ veor @t[7], @t[7], @t[5] @ clobber t[7] even more
518
+ veor @y[3], @y[3], @t[5]
519
+ veor @y[4], @y[4], @t[4]
520
+
521
+ veor @y[5], @y[5], @t[7]
522
+ vext.8 @t[4], @t[4], @t[4], #12
523
+ veor @y[6], @y[6], @t[7]
524
+ veor @y[4], @y[4], @t[7]
525
+
526
+ veor @t[7], @t[7], @t[5]
527
+ vext.8 @t[5], @t[5], @t[5], #12
528
+
529
+ @ multiplication by 0x0d
530
+ veor @y[4], @y[4], @y[7]
531
+ veor @t[7], @t[7], @t[6] @ restore t[7]
532
+ veor @y[7], @y[7], @t[4]
533
+ vext.8 @t[6], @t[6], @t[6], #12
534
+ veor @y[2], @y[2], @t[0]
535
+ veor @y[7], @y[7], @t[5]
536
+ vext.8 @t[7], @t[7], @t[7], #12
537
+ veor @y[2], @y[2], @t[2]
538
+
539
+ veor @y[3], @y[3], @y[1]
540
+ veor @y[1], @y[1], @t[1]
541
+ veor @y[0], @y[0], @t[0]
542
+ veor @y[3], @y[3], @t[0]
543
+ veor @y[1], @y[1], @t[5]
544
+ veor @y[0], @y[0], @t[5]
545
+ vext.8 @t[0], @t[0], @t[0], #12
546
+ veor @y[1], @y[1], @t[7]
547
+ veor @y[0], @y[0], @t[6]
548
+ veor @y[3], @y[3], @y[1]
549
+ veor @y[4], @y[4], @t[1]
550
+ vext.8 @t[1], @t[1], @t[1], #12
551
+
552
+ veor @y[7], @y[7], @t[7]
553
+ veor @y[4], @y[4], @t[2]
554
+ veor @y[5], @y[5], @t[2]
555
+ veor @y[2], @y[2], @t[6]
556
+ veor @t[6], @t[6], @t[3] @ clobber t[6]
557
+ vext.8 @t[2], @t[2], @t[2], #12
558
+ veor @y[4], @y[4], @y[7]
559
+ veor @y[3], @y[3], @t[6]
560
+
561
+ veor @y[6], @y[6], @t[6]
562
+ veor @y[5], @y[5], @t[5]
563
+ vext.8 @t[5], @t[5], @t[5], #12
564
+ veor @y[6], @y[6], @t[4]
565
+ vext.8 @t[4], @t[4], @t[4], #12
566
+ veor @y[5], @y[5], @t[6]
567
+ veor @y[6], @y[6], @t[7]
568
+ vext.8 @t[7], @t[7], @t[7], #12
569
+ veor @t[6], @t[6], @t[3] @ restore t[6]
570
+ vext.8 @t[3], @t[3], @t[3], #12
571
+
572
+ @ multiplication by 0x09
573
+ veor @y[4], @y[4], @y[1]
574
+ veor @t[1], @t[1], @y[1] @ t[1]=y[1]
575
+ veor @t[0], @t[0], @t[5] @ clobber t[0]
576
+ vext.8 @t[6], @t[6], @t[6], #12
577
+ veor @t[1], @t[1], @t[5]
578
+ veor @y[3], @y[3], @t[0]
579
+ veor @t[0], @t[0], @y[0] @ t[0]=y[0]
580
+ veor @t[1], @t[1], @t[6]
581
+ veor @t[6], @t[6], @t[7] @ clobber t[6]
582
+ veor @y[4], @y[4], @t[1]
583
+ veor @y[7], @y[7], @t[4]
584
+ veor @y[6], @y[6], @t[3]
585
+ veor @y[5], @y[5], @t[2]
586
+ veor @t[4], @t[4], @y[4] @ t[4]=y[4]
587
+ veor @t[3], @t[3], @y[3] @ t[3]=y[3]
588
+ veor @t[5], @t[5], @y[5] @ t[5]=y[5]
589
+ veor @t[2], @t[2], @y[2] @ t[2]=y[2]
590
+ veor @t[3], @t[3], @t[7]
591
+ veor @XMM[5], @t[5], @t[6]
592
+ veor @XMM[6], @t[6], @y[6] @ t[6]=y[6]
593
+ veor @XMM[2], @t[2], @t[6]
594
+ veor @XMM[7], @t[7], @y[7] @ t[7]=y[7]
595
+
596
+ vmov @XMM[0], @t[0]
597
+ vmov @XMM[1], @t[1]
598
+ @ vmov @XMM[2], @t[2]
599
+ vmov @XMM[3], @t[3]
600
+ vmov @XMM[4], @t[4]
601
+ @ vmov @XMM[5], @t[5]
602
+ @ vmov @XMM[6], @t[6]
603
+ @ vmov @XMM[7], @t[7]
604
+ ___
605
+ }
606
+
607
+ sub InvMixColumns {
608
+ my @x=@_[0..7];
609
+ my @t=@_[8..15];
610
+
611
+ # Thanks to Jussi Kivilinna for providing pointer to
612
+ #
613
+ # | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
614
+ # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
615
+ # | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
616
+ # | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
617
+
618
+ $code.=<<___;
619
+ @ multiplication by 0x05-0x00-0x04-0x00
620
+ vext.8 @t[0], @x[0], @x[0], #8
621
+ vext.8 @t[6], @x[6], @x[6], #8
622
+ vext.8 @t[7], @x[7], @x[7], #8
623
+ veor @t[0], @t[0], @x[0]
624
+ vext.8 @t[1], @x[1], @x[1], #8
625
+ veor @t[6], @t[6], @x[6]
626
+ vext.8 @t[2], @x[2], @x[2], #8
627
+ veor @t[7], @t[7], @x[7]
628
+ vext.8 @t[3], @x[3], @x[3], #8
629
+ veor @t[1], @t[1], @x[1]
630
+ vext.8 @t[4], @x[4], @x[4], #8
631
+ veor @t[2], @t[2], @x[2]
632
+ vext.8 @t[5], @x[5], @x[5], #8
633
+ veor @t[3], @t[3], @x[3]
634
+ veor @t[4], @t[4], @x[4]
635
+ veor @t[5], @t[5], @x[5]
636
+
637
+ veor @x[0], @x[0], @t[6]
638
+ veor @x[1], @x[1], @t[6]
639
+ veor @x[2], @x[2], @t[0]
640
+ veor @x[4], @x[4], @t[2]
641
+ veor @x[3], @x[3], @t[1]
642
+ veor @x[1], @x[1], @t[7]
643
+ veor @x[2], @x[2], @t[7]
644
+ veor @x[4], @x[4], @t[6]
645
+ veor @x[5], @x[5], @t[3]
646
+ veor @x[3], @x[3], @t[6]
647
+ veor @x[6], @x[6], @t[4]
648
+ veor @x[4], @x[4], @t[7]
649
+ veor @x[5], @x[5], @t[7]
650
+ veor @x[7], @x[7], @t[5]
651
+ ___
652
+ &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
653
+ }
+
+ sub swapmove {
+ my ($a,$b,$n,$mask,$t)=@_;
+ $code.=<<___;
+ vshr.u64 $t, $b, #$n
+ veor $t, $t, $a
+ vand $t, $t, $mask
+ veor $a, $a, $t
+ vshl.u64 $t, $t, #$n
+ veor $b, $b, $t
+ ___
+ }
+ sub swapmove2x {
+ my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
+ $code.=<<___;
+ vshr.u64 $t0, $b0, #$n
+ vshr.u64 $t1, $b1, #$n
+ veor $t0, $t0, $a0
+ veor $t1, $t1, $a1
+ vand $t0, $t0, $mask
+ vand $t1, $t1, $mask
+ veor $a0, $a0, $t0
+ vshl.u64 $t0, $t0, #$n
+ veor $a1, $a1, $t1
+ vshl.u64 $t1, $t1, #$n
+ veor $b0, $b0, $t0
+ veor $b1, $b1, $t1
+ ___
+ }
683
+
684
+ sub bitslice {
685
+ my @x=reverse(@_[0..7]);
686
+ my ($t0,$t1,$t2,$t3)=@_[8..11];
687
+ $code.=<<___;
688
+ vmov.i8 $t0,#0x55 @ compose .LBS0
689
+ vmov.i8 $t1,#0x33 @ compose .LBS1
690
+ ___
691
+ &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
692
+ &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
693
+ $code.=<<___;
694
+ vmov.i8 $t0,#0x0f @ compose .LBS2
695
+ ___
696
+ &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
697
+ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
698
+
699
+ &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
700
+ &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
701
+ }
702
+
703
+ $code.=<<___;
704
+ #ifndef __KERNEL__
705
+ # include <openssl/arm_arch.h>
706
+
707
+ # define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
708
+ # define VFP_ABI_POP vldmia sp!,{d8-d15}
709
+ # define VFP_ABI_FRAME 0x40
710
+ #else
711
+ # define VFP_ABI_PUSH
712
+ # define VFP_ABI_POP
713
+ # define VFP_ABI_FRAME 0
714
+ # define BSAES_ASM_EXTENDED_KEY
715
+ # define XTS_CHAIN_TWEAK
716
+ # define __ARM_ARCH__ __LINUX_ARM_ARCH__
717
+ # define __ARM_MAX_ARCH__ 7
718
+ #endif
719
+
720
+ #ifdef __thumb__
721
+ # define adrl adr
722
+ #endif
723
+
724
+ #if __ARM_MAX_ARCH__>=7
725
+ .arch armv7-a
726
+ .fpu neon
727
+
728
+ .text
729
+ .syntax unified @ ARMv7-capable assembler is expected to handle this
730
+ #if defined(__thumb2__) && !defined(__APPLE__)
731
+ .thumb
732
+ #else
733
+ .code 32
734
+ #endif
735
+
736
+ .type _bsaes_decrypt8,%function
737
+ .align 4
738
+ _bsaes_decrypt8:
739
+ adr $const,_bsaes_decrypt8
740
+ vldmia $key!, {@XMM[9]} @ round 0 key
741
+ #ifdef __APPLE__
742
+ adr $const,.LM0ISR
743
+ #else
744
+ add $const,$const,#.LM0ISR-_bsaes_decrypt8
745
+ #endif
746
+
747
+ vldmia $const!, {@XMM[8]} @ .LM0ISR
748
+ veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
749
+ veor @XMM[11], @XMM[1], @XMM[9]
750
+ vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
751
+ vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
752
+ veor @XMM[12], @XMM[2], @XMM[9]
753
+ vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
754
+ vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
755
+ veor @XMM[13], @XMM[3], @XMM[9]
756
+ vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
757
+ vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
758
+ veor @XMM[14], @XMM[4], @XMM[9]
759
+ vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
760
+ vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
761
+ veor @XMM[15], @XMM[5], @XMM[9]
762
+ vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
763
+ vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
764
+ veor @XMM[10], @XMM[6], @XMM[9]
765
+ vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
766
+ vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
767
+ veor @XMM[11], @XMM[7], @XMM[9]
768
+ vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
769
+ vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
770
+ vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
771
+ vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
772
+ ___
773
+ &bitslice (@XMM[0..7, 8..11]);
774
+ $code.=<<___;
775
+ sub $rounds,$rounds,#1
776
+ b .Ldec_sbox
777
+ .align 4
778
+ .Ldec_loop:
779
+ ___
780
+ &ShiftRows (@XMM[0..7, 8..12]);
781
+ $code.=".Ldec_sbox:\n";
782
+ &InvSbox (@XMM[0..7, 8..15]);
783
+ $code.=<<___;
784
+ subs $rounds,$rounds,#1
785
+ bcc .Ldec_done
786
+ ___
787
+ &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
788
+ $code.=<<___;
789
+ vldmia $const, {@XMM[12]} @ .LISR
790
+ ite eq @ Thumb2 thing, sanity check in ARM
791
+ addeq $const,$const,#0x10
792
+ bne .Ldec_loop
793
+ vldmia $const, {@XMM[12]} @ .LISRM0
794
+ b .Ldec_loop
795
+ .align 4
796
+ .Ldec_done:
797
+ ___
798
+ &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
799
+ $code.=<<___;
800
+ vldmia $key, {@XMM[8]} @ last round key
801
+ veor @XMM[6], @XMM[6], @XMM[8]
802
+ veor @XMM[4], @XMM[4], @XMM[8]
803
+ veor @XMM[2], @XMM[2], @XMM[8]
804
+ veor @XMM[7], @XMM[7], @XMM[8]
805
+ veor @XMM[3], @XMM[3], @XMM[8]
806
+ veor @XMM[5], @XMM[5], @XMM[8]
807
+ veor @XMM[0], @XMM[0], @XMM[8]
808
+ veor @XMM[1], @XMM[1], @XMM[8]
809
+ bx lr
810
+ .size _bsaes_decrypt8,.-_bsaes_decrypt8
811
+
812
+ .type _bsaes_const,%object
813
+ .align 6
814
+ _bsaes_const:
815
+ .LM0ISR: @ InvShiftRows constants
816
+ .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
817
+ .LISR:
818
+ .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
819
+ .LISRM0:
820
+ .quad 0x01040b0e0205080f, 0x0306090c00070a0d
821
+ .LM0SR: @ ShiftRows constants
822
+ .quad 0x0a0e02060f03070b, 0x0004080c05090d01
823
+ .LSR:
824
+ .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
825
+ .LSRM0:
826
+ .quad 0x0304090e00050a0f, 0x01060b0c0207080d
827
+ .LM0:
828
+ .quad 0x02060a0e03070b0f, 0x0004080c0105090d
829
+ .LREVM0SR:
830
+ .quad 0x090d01050c000408, 0x03070b0f060a0e02
831
+ .asciz "Bit-sliced AES for NEON, CRYPTOGAMS by <appro\@openssl.org>"
832
+ .align 6
833
+ .size _bsaes_const,.-_bsaes_const
834
+
835
+ .type _bsaes_encrypt8,%function
836
+ .align 4
837
+ _bsaes_encrypt8:
838
+ adr $const,_bsaes_encrypt8
839
+ vldmia $key!, {@XMM[9]} @ round 0 key
840
+ #ifdef __APPLE__
841
+ adr $const,.LM0SR
842
+ #else
843
+ sub $const,$const,#_bsaes_encrypt8-.LM0SR
844
+ #endif
845
+
846
+ vldmia $const!, {@XMM[8]} @ .LM0SR
847
+ _bsaes_encrypt8_alt:
848
+ veor @XMM[10], @XMM[0], @XMM[9] @ xor with round0 key
849
+ veor @XMM[11], @XMM[1], @XMM[9]
850
+ vtbl.8 `&Dlo(@XMM[0])`, {@XMM[10]}, `&Dlo(@XMM[8])`
851
+ vtbl.8 `&Dhi(@XMM[0])`, {@XMM[10]}, `&Dhi(@XMM[8])`
852
+ veor @XMM[12], @XMM[2], @XMM[9]
853
+ vtbl.8 `&Dlo(@XMM[1])`, {@XMM[11]}, `&Dlo(@XMM[8])`
854
+ vtbl.8 `&Dhi(@XMM[1])`, {@XMM[11]}, `&Dhi(@XMM[8])`
855
+ veor @XMM[13], @XMM[3], @XMM[9]
856
+ vtbl.8 `&Dlo(@XMM[2])`, {@XMM[12]}, `&Dlo(@XMM[8])`
857
+ vtbl.8 `&Dhi(@XMM[2])`, {@XMM[12]}, `&Dhi(@XMM[8])`
858
+ veor @XMM[14], @XMM[4], @XMM[9]
859
+ vtbl.8 `&Dlo(@XMM[3])`, {@XMM[13]}, `&Dlo(@XMM[8])`
860
+ vtbl.8 `&Dhi(@XMM[3])`, {@XMM[13]}, `&Dhi(@XMM[8])`
861
+ veor @XMM[15], @XMM[5], @XMM[9]
862
+ vtbl.8 `&Dlo(@XMM[4])`, {@XMM[14]}, `&Dlo(@XMM[8])`
863
+ vtbl.8 `&Dhi(@XMM[4])`, {@XMM[14]}, `&Dhi(@XMM[8])`
864
+ veor @XMM[10], @XMM[6], @XMM[9]
865
+ vtbl.8 `&Dlo(@XMM[5])`, {@XMM[15]}, `&Dlo(@XMM[8])`
866
+ vtbl.8 `&Dhi(@XMM[5])`, {@XMM[15]}, `&Dhi(@XMM[8])`
867
+ veor @XMM[11], @XMM[7], @XMM[9]
868
+ vtbl.8 `&Dlo(@XMM[6])`, {@XMM[10]}, `&Dlo(@XMM[8])`
869
+ vtbl.8 `&Dhi(@XMM[6])`, {@XMM[10]}, `&Dhi(@XMM[8])`
870
+ vtbl.8 `&Dlo(@XMM[7])`, {@XMM[11]}, `&Dlo(@XMM[8])`
871
+ vtbl.8 `&Dhi(@XMM[7])`, {@XMM[11]}, `&Dhi(@XMM[8])`
872
+ _bsaes_encrypt8_bitslice:
873
+ ___
874
+ &bitslice (@XMM[0..7, 8..11]);
875
+ $code.=<<___;
876
+ sub $rounds,$rounds,#1
877
+ b .Lenc_sbox
878
+ .align 4
879
+ .Lenc_loop:
880
+ ___
881
+ &ShiftRows (@XMM[0..7, 8..12]);
882
+ $code.=".Lenc_sbox:\n";
883
+ &Sbox (@XMM[0..7, 8..15]);
884
+ $code.=<<___;
885
+ subs $rounds,$rounds,#1
886
+ bcc .Lenc_done
887
+ ___
888
+ &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
889
+ $code.=<<___;
890
+ vldmia $const, {@XMM[12]} @ .LSR
891
+ ite eq @ Thumb2 thing, sanity check in ARM
892
+ addeq $const,$const,#0x10
893
+ bne .Lenc_loop
894
+ vldmia $const, {@XMM[12]} @ .LSRM0
895
+ b .Lenc_loop
896
+ .align 4
897
+ .Lenc_done:
898
+ ___
899
+ # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
900
+ &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
901
+ $code.=<<___;
902
+ vldmia $key, {@XMM[8]} @ last round key
903
+ veor @XMM[4], @XMM[4], @XMM[8]
904
+ veor @XMM[6], @XMM[6], @XMM[8]
905
+ veor @XMM[3], @XMM[3], @XMM[8]
906
+ veor @XMM[7], @XMM[7], @XMM[8]
907
+ veor @XMM[2], @XMM[2], @XMM[8]
908
+ veor @XMM[5], @XMM[5], @XMM[8]
909
+ veor @XMM[0], @XMM[0], @XMM[8]
910
+ veor @XMM[1], @XMM[1], @XMM[8]
911
+ bx lr
912
+ .size _bsaes_encrypt8,.-_bsaes_encrypt8
913
+ ___
914
+ }
915
+ {
916
+ my ($out,$inp,$rounds,$const)=("r12","r4","r5","r6");
917
+
918
+ sub bitslice_key {
919
+ my @x=reverse(@_[0..7]);
920
+ my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
921
+
922
+ &swapmove (@x[0,1],1,$bs0,$t2,$t3);
923
+ $code.=<<___;
924
+ @ &swapmove(@x[2,3],1,$t0,$t2,$t3);
925
+ vmov @x[2], @x[0]
926
+ vmov @x[3], @x[1]
927
+ ___
928
+ #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
929
+
930
+ &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
931
+ $code.=<<___;
932
+ @ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
933
+ vmov @x[4], @x[0]
934
+ vmov @x[6], @x[2]
935
+ vmov @x[5], @x[1]
936
+ vmov @x[7], @x[3]
937
+ ___
938
+ &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
939
+ &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
940
+ }
941
+
942
+ $code.=<<___;
943
+ .type _bsaes_key_convert,%function
944
+ .align 4
945
+ _bsaes_key_convert:
946
+ adr $const,_bsaes_key_convert
947
+ vld1.8 {@XMM[7]}, [$inp]! @ load round 0 key
948
+ #ifdef __APPLE__
949
+ adr $const,.LM0
950
+ #else
951
+ sub $const,$const,#_bsaes_key_convert-.LM0
952
+ #endif
953
+ vld1.8 {@XMM[15]}, [$inp]! @ load round 1 key
954
+
955
+ vmov.i8 @XMM[8], #0x01 @ bit masks
956
+ vmov.i8 @XMM[9], #0x02
957
+ vmov.i8 @XMM[10], #0x04
958
+ vmov.i8 @XMM[11], #0x08
959
+ vmov.i8 @XMM[12], #0x10
960
+ vmov.i8 @XMM[13], #0x20
961
+ vldmia $const, {@XMM[14]} @ .LM0
962
+
963
+ #ifdef __ARMEL__
964
+ vrev32.8 @XMM[7], @XMM[7]
965
+ vrev32.8 @XMM[15], @XMM[15]
966
+ #endif
967
+ sub $rounds,$rounds,#1
968
+ vstmia $out!, {@XMM[7]} @ save round 0 key
969
+ b .Lkey_loop
970
+
971
+ .align 4
972
+ .Lkey_loop:
973
+ vtbl.8 `&Dlo(@XMM[7])`,{@XMM[15]},`&Dlo(@XMM[14])`
974
+ vtbl.8 `&Dhi(@XMM[7])`,{@XMM[15]},`&Dhi(@XMM[14])`
975
+ vmov.i8 @XMM[6], #0x40
976
+ vmov.i8 @XMM[15], #0x80
977
+
978
+ vtst.8 @XMM[0], @XMM[7], @XMM[8]
979
+ vtst.8 @XMM[1], @XMM[7], @XMM[9]
980
+ vtst.8 @XMM[2], @XMM[7], @XMM[10]
981
+ vtst.8 @XMM[3], @XMM[7], @XMM[11]
982
+ vtst.8 @XMM[4], @XMM[7], @XMM[12]
983
+ vtst.8 @XMM[5], @XMM[7], @XMM[13]
984
+ vtst.8 @XMM[6], @XMM[7], @XMM[6]
985
+ vtst.8 @XMM[7], @XMM[7], @XMM[15]
986
+ vld1.8 {@XMM[15]}, [$inp]! @ load next round key
987
+ vmvn @XMM[0], @XMM[0] @ "pnot"
988
+ vmvn @XMM[1], @XMM[1]
989
+ vmvn @XMM[5], @XMM[5]
990
+ vmvn @XMM[6], @XMM[6]
991
+ #ifdef __ARMEL__
992
+ vrev32.8 @XMM[15], @XMM[15]
993
+ #endif
994
+ subs $rounds,$rounds,#1
995
+ vstmia $out!,{@XMM[0]-@XMM[7]} @ write bit-sliced round key
996
+ bne .Lkey_loop
997
+
998
+ vmov.i8 @XMM[7],#0x63 @ compose .L63
999
+ @ don't save last round key
1000
+ bx lr
1001
+ .size _bsaes_key_convert,.-_bsaes_key_convert
1002
+ ___
1003
+ }
1004
+
1005
+ if (0) { # following four functions are unsupported interface
1006
+ # used for benchmarking...
1007
+ $code.=<<___;
1008
+ .globl bsaes_enc_key_convert
1009
+ .hidden bsaes_enc_key_convert
1010
+ .type bsaes_enc_key_convert,%function
1011
+ .align 4
1012
+ bsaes_enc_key_convert:
1013
+ stmdb sp!,{r4-r6,lr}
1014
+ vstmdb sp!,{d8-d15} @ ABI specification says so
1015
+
1016
+ ldr r5,[$inp,#240] @ pass rounds
1017
+ mov r4,$inp @ pass key
1018
+ mov r12,$out @ pass key schedule
1019
+ bl _bsaes_key_convert
1020
+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
1021
+ vstmia r12, {@XMM[7]} @ save last round key
1022
+
1023
+ vldmia sp!,{d8-d15}
1024
+ ldmia sp!,{r4-r6,pc}
1025
+ .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
1026
+
1027
+ .globl bsaes_encrypt_128
1028
+ .hidden bsaes_encrypt_128
1029
+ .type bsaes_encrypt_128,%function
1030
+ .align 4
1031
+ bsaes_encrypt_128:
1032
+ stmdb sp!,{r4-r6,lr}
1033
+ vstmdb sp!,{d8-d15} @ ABI specification says so
1034
+ .Lenc128_loop:
1035
+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
1036
+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
1037
+ mov r4,$key @ pass the key
1038
+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
1039
+ mov r5,#10 @ pass rounds
1040
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
1041
+
1042
+ bl _bsaes_encrypt8
1043
+
1044
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1045
+ vst1.8 {@XMM[4]}, [$out]!
1046
+ vst1.8 {@XMM[6]}, [$out]!
1047
+ vst1.8 {@XMM[3]}, [$out]!
1048
+ vst1.8 {@XMM[7]}, [$out]!
1049
+ vst1.8 {@XMM[2]}, [$out]!
1050
+ subs $len,$len,#0x80
1051
+ vst1.8 {@XMM[5]}, [$out]!
1052
+ bhi .Lenc128_loop
1053
+
1054
+ vldmia sp!,{d8-d15}
1055
+ ldmia sp!,{r4-r6,pc}
1056
+ .size bsaes_encrypt_128,.-bsaes_encrypt_128
1057
+
1058
+ .globl bsaes_dec_key_convert
1059
+ .hidden bsaes_dec_key_convert
1060
+ .type bsaes_dec_key_convert,%function
1061
+ .align 4
1062
+ bsaes_dec_key_convert:
1063
+ stmdb sp!,{r4-r6,lr}
1064
+ vstmdb sp!,{d8-d15} @ ABI specification says so
1065
+
1066
+ ldr r5,[$inp,#240] @ pass rounds
1067
+ mov r4,$inp @ pass key
1068
+ mov r12,$out @ pass key schedule
1069
+ bl _bsaes_key_convert
1070
+ vldmia $out, {@XMM[6]}
1071
+ vstmia r12, {@XMM[15]} @ save last round key
1072
+ veor @XMM[7], @XMM[7], @XMM[6] @ fix up round 0 key
1073
+ vstmia $out, {@XMM[7]}
1074
+
1075
+ vldmia sp!,{d8-d15}
1076
+ ldmia sp!,{r4-r6,pc}
1077
+ .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1078
+
1079
+ .globl bsaes_decrypt_128
1080
+ .hidden bsaes_decrypt_128
1081
+ .type bsaes_decrypt_128,%function
1082
+ .align 4
1083
+ bsaes_decrypt_128:
1084
+ stmdb sp!,{r4-r6,lr}
1085
+ vstmdb sp!,{d8-d15} @ ABI specification says so
1086
+ .Ldec128_loop:
1087
+ vld1.8 {@XMM[0]-@XMM[1]}, [$inp]! @ load input
1088
+ vld1.8 {@XMM[2]-@XMM[3]}, [$inp]!
1089
+ mov r4,$key @ pass the key
1090
+ vld1.8 {@XMM[4]-@XMM[5]}, [$inp]!
1091
+ mov r5,#10 @ pass rounds
1092
+ vld1.8 {@XMM[6]-@XMM[7]}, [$inp]!
1093
+
1094
+ bl _bsaes_decrypt8
1095
+
1096
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1097
+ vst1.8 {@XMM[6]}, [$out]!
1098
+ vst1.8 {@XMM[4]}, [$out]!
1099
+ vst1.8 {@XMM[2]}, [$out]!
1100
+ vst1.8 {@XMM[7]}, [$out]!
1101
+ vst1.8 {@XMM[3]}, [$out]!
1102
+ subs $len,$len,#0x80
1103
+ vst1.8 {@XMM[5]}, [$out]!
1104
+ bhi .Ldec128_loop
1105
+
1106
+ vldmia sp!,{d8-d15}
1107
+ ldmia sp!,{r4-r6,pc}
1108
+ .size bsaes_decrypt_128,.-bsaes_decrypt_128
1109
+ ___
1110
+ }
1111
+ {
1112
+ my ($inp,$out,$len,$key, $ctr,$fp,$rounds)=(map("r$_",(0..3,8..10)));
1113
+ my $const = "r6"; # shared with _bsaes_encrypt8_alt
1114
+ my $keysched = "sp";
1115
+
1116
+ $code.=<<___;
1117
+ .extern AES_encrypt
1118
+ .global bsaes_ctr32_encrypt_blocks
1119
+ .hidden bsaes_ctr32_encrypt_blocks
1120
+ .type bsaes_ctr32_encrypt_blocks,%function
1121
+ .align 5
1122
+ bsaes_ctr32_encrypt_blocks:
1123
+ cmp $len, #8 @ use plain AES for
1124
+ blo .Lctr_enc_short @ small sizes
1125
+
1126
+ mov ip, sp
1127
+ stmdb sp!, {r4-r10, lr}
1128
+ VFP_ABI_PUSH
1129
+ ldr $ctr, [ip] @ ctr is 1st arg on the stack
1130
+ sub sp, sp, #0x10 @ scratch space to carry over the ctr
1131
+ mov $fp, sp @ save sp
1132
+
1133
+ ldr $rounds, [$key, #240] @ get # of rounds
1134
+ #ifndef BSAES_ASM_EXTENDED_KEY
1135
+ @ allocate the key schedule on the stack
1136
+ sub r12, sp, $rounds, lsl#7 @ 128 bytes per inner round key
1137
+ add r12, #`128-32` @ size of bit-sliced key schedule
1138
+
1139
+ @ populate the key schedule
1140
+ mov r4, $key @ pass key
1141
+ mov r5, $rounds @ pass # of rounds
1142
+ mov sp, r12 @ sp is $keysched
1143
+ bl _bsaes_key_convert
1144
+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
1145
+ vstmia r12, {@XMM[7]} @ save last round key
1146
+
1147
+ vld1.8 {@XMM[0]}, [$ctr] @ load counter
1148
+ #ifdef __APPLE__
1149
+ mov $ctr, #:lower16:(.LREVM0SR-.LM0)
1150
+ add $ctr, $const, $ctr
1151
+ #else
1152
+ add $ctr, $const, #.LREVM0SR-.LM0 @ borrow $ctr
1153
+ #endif
1154
+ vldmia $keysched, {@XMM[4]} @ load round0 key
1155
+ #else
1156
+ ldr r12, [$key, #244]
1157
+ eors r12, #1
1158
+ beq 0f
1159
+
1160
+ @ populate the key schedule
1161
+ str r12, [$key, #244]
1162
+ mov r4, $key @ pass key
1163
+ mov r5, $rounds @ pass # of rounds
1164
+ add r12, $key, #248 @ pass key schedule
1165
+ bl _bsaes_key_convert
1166
+ veor @XMM[7],@XMM[7],@XMM[15] @ fix up last round key
1167
+ vstmia r12, {@XMM[7]} @ save last round key
1168
+
1169
+ .align 2
1170
+ 0: add r12, $key, #248
1171
+ vld1.8 {@XMM[0]}, [$ctr] @ load counter
1172
+ adrl $ctr, .LREVM0SR @ borrow $ctr
1173
+ vldmia r12, {@XMM[4]} @ load round0 key
1174
+ sub sp, #0x10 @ place for adjusted round0 key
1175
+ #endif
1176
+
1177
+ vmov.i32 @XMM[8],#1 @ compose 1<<96
1178
+ veor @XMM[9],@XMM[9],@XMM[9]
1179
+ vrev32.8 @XMM[0],@XMM[0]
1180
+ vext.8 @XMM[8],@XMM[9],@XMM[8],#4
1181
+ vrev32.8 @XMM[4],@XMM[4]
1182
+ vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
1183
+ vstmia $keysched, {@XMM[4]} @ save adjusted round0 key
1184
+ b .Lctr_enc_loop
1185
+
1186
+ .align 4
1187
+ .Lctr_enc_loop:
1188
+ vadd.u32 @XMM[10], @XMM[8], @XMM[9] @ compose 3<<96
1189
+ vadd.u32 @XMM[1], @XMM[0], @XMM[8] @ +1
1190
+ vadd.u32 @XMM[2], @XMM[0], @XMM[9] @ +2
1191
+ vadd.u32 @XMM[3], @XMM[0], @XMM[10] @ +3
1192
+ vadd.u32 @XMM[4], @XMM[1], @XMM[10]
1193
+ vadd.u32 @XMM[5], @XMM[2], @XMM[10]
1194
+ vadd.u32 @XMM[6], @XMM[3], @XMM[10]
1195
+ vadd.u32 @XMM[7], @XMM[4], @XMM[10]
1196
+ vadd.u32 @XMM[10], @XMM[5], @XMM[10] @ next counter
1197
+
1198
+ @ Borrow prologue from _bsaes_encrypt8 to use the opportunity
1199
+ @ to flip byte order in 32-bit counter
1200
+
1201
+ vldmia $keysched, {@XMM[9]} @ load round0 key
1202
+ #ifndef BSAES_ASM_EXTENDED_KEY
1203
+ add r4, $keysched, #0x10 @ pass next round key
1204
+ #else
1205
+ add r4, $key, #`248+16`
1206
+ #endif
1207
+ vldmia $ctr, {@XMM[8]} @ .LREVM0SR
1208
+ mov r5, $rounds @ pass rounds
1209
+ vstmia $fp, {@XMM[10]} @ save next counter
1210
+ #ifdef __APPLE__
1211
+ mov $const, #:lower16:(.LREVM0SR-.LSR)
1212
+ sub $const, $ctr, $const
1213
+ #else
1214
+ sub $const, $ctr, #.LREVM0SR-.LSR @ pass constants
1215
+ #endif
1216
+
1217
+ bl _bsaes_encrypt8_alt
1218
+
1219
+ subs $len, $len, #8
1220
+ blo .Lctr_enc_loop_done
1221
+
1222
+ vld1.8 {@XMM[8]-@XMM[9]}, [$inp]! @ load input
1223
+ vld1.8 {@XMM[10]-@XMM[11]}, [$inp]!
1224
+ veor @XMM[0], @XMM[8]
1225
+ veor @XMM[1], @XMM[9]
1226
+ vld1.8 {@XMM[12]-@XMM[13]}, [$inp]!
1227
+ veor @XMM[4], @XMM[10]
1228
+ veor @XMM[6], @XMM[11]
1229
+ vld1.8 {@XMM[14]-@XMM[15]}, [$inp]!
1230
+ veor @XMM[3], @XMM[12]
1231
+ vst1.8 {@XMM[0]-@XMM[1]}, [$out]! @ write output
1232
+ veor @XMM[7], @XMM[13]
1233
+ veor @XMM[2], @XMM[14]
1234
+ vst1.8 {@XMM[4]}, [$out]!
1235
+ veor @XMM[5], @XMM[15]
1236
+ vst1.8 {@XMM[6]}, [$out]!
1237
+ vmov.i32 @XMM[8], #1 @ compose 1<<96
1238
+ vst1.8 {@XMM[3]}, [$out]!
1239
+ veor @XMM[9], @XMM[9], @XMM[9]
1240
+ vst1.8 {@XMM[7]}, [$out]!
1241
+ vext.8 @XMM[8], @XMM[9], @XMM[8], #4
1242
+ vst1.8 {@XMM[2]}, [$out]!
1243
+ vadd.u32 @XMM[9],@XMM[8],@XMM[8] @ compose 2<<96
1244
+ vst1.8 {@XMM[5]}, [$out]!
1245
+ vldmia $fp, {@XMM[0]} @ load counter
1246
+
1247
+ bne .Lctr_enc_loop
1248
+ b .Lctr_enc_done
1249
+
1250
+ .align 4
1251
+ .Lctr_enc_loop_done:
1252
+ add $len, $len, #8
1253
+ vld1.8 {@XMM[8]}, [$inp]! @ load input
1254
+ veor @XMM[0], @XMM[8]
1255
+ vst1.8 {@XMM[0]}, [$out]! @ write output
1256
+ cmp $len, #2
1257
+ blo .Lctr_enc_done
1258
+ vld1.8 {@XMM[9]}, [$inp]!
1259
+ veor @XMM[1], @XMM[9]
1260
+ vst1.8 {@XMM[1]}, [$out]!
1261
+ beq .Lctr_enc_done
1262
+ vld1.8 {@XMM[10]}, [$inp]!
1263
+ veor @XMM[4], @XMM[10]
1264
+ vst1.8 {@XMM[4]}, [$out]!
1265
+ cmp $len, #4
1266
+ blo .Lctr_enc_done
1267
+ vld1.8 {@XMM[11]}, [$inp]!
1268
+ veor @XMM[6], @XMM[11]
1269
+ vst1.8 {@XMM[6]}, [$out]!
1270
+ beq .Lctr_enc_done
1271
+ vld1.8 {@XMM[12]}, [$inp]!
1272
+ veor @XMM[3], @XMM[12]
1273
+ vst1.8 {@XMM[3]}, [$out]!
1274
+ cmp $len, #6
1275
+ blo .Lctr_enc_done
1276
+ vld1.8 {@XMM[13]}, [$inp]!
1277
+ veor @XMM[7], @XMM[13]
1278
+ vst1.8 {@XMM[7]}, [$out]!
1279
+ beq .Lctr_enc_done
1280
+ vld1.8 {@XMM[14]}, [$inp]
1281
+ veor @XMM[2], @XMM[14]
1282
+ vst1.8 {@XMM[2]}, [$out]!
1283
+
1284
+ .Lctr_enc_done:
1285
+ vmov.i32 q0, #0
1286
+ vmov.i32 q1, #0
1287
+ #ifndef BSAES_ASM_EXTENDED_KEY
1288
+ .Lctr_enc_bzero: @ wipe key schedule [if any]
1289
+ vstmia $keysched!, {q0-q1}
1290
+ cmp $keysched, $fp
1291
+ bne .Lctr_enc_bzero
1292
+ #else
1293
+ vstmia $keysched, {q0-q1}
1294
+ #endif
1295
+
1296
+ mov sp, $fp
1297
+ add sp, #0x10 @ add sp,$fp,#0x10 is no good for thumb
1298
+ VFP_ABI_POP
1299
+ ldmia sp!, {r4-r10, pc} @ return
1300
+
1301
+ .align 4
1302
+ .Lctr_enc_short:
1303
+ ldr ip, [sp] @ ctr pointer is passed on stack
1304
+ stmdb sp!, {r4-r8, lr}
1305
+
1306
+ mov r4, $inp @ copy arguments
1307
+ mov r5, $out
1308
+ mov r6, $len
1309
+ mov r7, $key
1310
+ ldr r8, [ip, #12] @ load counter LSW
1311
+ vld1.8 {@XMM[1]}, [ip] @ load whole counter value
1312
+ #ifdef __ARMEL__
1313
+ rev r8, r8
1314
+ #endif
1315
+ sub sp, sp, #0x10
1316
+ vst1.8 {@XMM[1]}, [sp] @ copy counter value
1317
+ sub sp, sp, #0x10
1318
+
1319
+ .Lctr_enc_short_loop:
1320
+ add r0, sp, #0x10 @ input counter value
1321
+ mov r1, sp @ output on the stack
1322
+ mov r2, r7 @ key
1323
+
1324
+ bl AES_encrypt
1325
+
1326
+ vld1.8 {@XMM[0]}, [r4]! @ load input
1327
+ vld1.8 {@XMM[1]}, [sp] @ load encrypted counter
1328
+ add r8, r8, #1
1329
+ #ifdef __ARMEL__
1330
+ rev r0, r8
1331
+ str r0, [sp, #0x1c] @ next counter value
1332
+ #else
1333
+ str r8, [sp, #0x1c] @ next counter value
1334
+ #endif
1335
+ veor @XMM[0],@XMM[0],@XMM[1]
1336
+ vst1.8 {@XMM[0]}, [r5]! @ store output
1337
+ subs r6, r6, #1
1338
+ bne .Lctr_enc_short_loop
1339
+
1340
+ vmov.i32 q0, #0
1341
+ vmov.i32 q1, #0
1342
+ vstmia sp!, {q0-q1}
1343
+
1344
+ ldmia sp!, {r4-r8, pc}
1345
+ .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+ ___
+ }
+ $code.=<<___;
+ #endif
+ ___
+
+ $code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+ open SELF,$0;
+ while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/@/ and !/^$/);
+ print;
+ }
+ close SELF;
+
+ print $code;
+
+ close STDOUT;
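
The swapmove/swapmove2x helpers in the file above emit the NEON sequence vshr.u64 / veor / vand / veor / vshl.u64 / veor, a standard delta-swap that exchanges, for each set bit of the mask, a bit of one register with the bit n positions higher in the other; bitslice() applies it with masks 0x55, 0x33 and 0x0f at shifts 1, 2 and 4 to move eight AES states into bit-sliced form. The short Perl sketch below is an editorial illustration only, not part of the vendored bsaes-armv7.pl or of the ring-native gem; it models the same dataflow on scalar integers so the permutation is easy to check.

    #!/usr/bin/env perl
    # Illustration only: a scalar model of the delta-swap that swapmove()
    # emits as vshr.u64/veor/vand/veor/vshl.u64/veor.
    use strict;
    use warnings;

    # Exchange the bits of $a and $b selected by $mask at distance $n:
    #   t = ((b >> n) ^ a) & mask;  a ^= t;  b ^= t << n;
    sub swapmove_scalar {
        my ($a, $b, $n, $mask) = @_;
        my $t = (($b >> $n) ^ $a) & $mask;
        return ($a ^ $t, $b ^ ($t << $n));
    }

    # Same mask/shift pair the bitslice() helper starts with (0x55, shift 1),
    # here applied to single bytes instead of 64-bit NEON halves.
    my ($a, $b) = (0b1011_0010, 0b0110_1101);
    my ($a1, $b1) = swapmove_scalar($a, $b, 1, 0x55);
    printf "a=%08b b=%08b  ->  a'=%08b b'=%08b\n", $a, $b, $a1, $b1;

    # Applying the same swap twice restores the inputs: the delta-swap is an involution.
    my ($a2, $b2) = swapmove_scalar($a1, $b1, 1, 0x55);
    die "not an involution?" unless $a2 == $a && $b2 == $b;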