ring-native 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (261)
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/Gemfile +3 -0
  4. data/README.md +22 -0
  5. data/Rakefile +1 -0
  6. data/ext/ring/extconf.rb +29 -0
  7. data/lib/ring/native.rb +8 -0
  8. data/lib/ring/native/version.rb +5 -0
  9. data/ring-native.gemspec +25 -0
  10. data/vendor/ring/BUILDING.md +40 -0
  11. data/vendor/ring/Cargo.toml +43 -0
  12. data/vendor/ring/LICENSE +185 -0
  13. data/vendor/ring/Makefile +35 -0
  14. data/vendor/ring/PORTING.md +163 -0
  15. data/vendor/ring/README.md +113 -0
  16. data/vendor/ring/STYLE.md +197 -0
  17. data/vendor/ring/appveyor.yml +27 -0
  18. data/vendor/ring/build.rs +108 -0
  19. data/vendor/ring/crypto/aes/aes.c +1142 -0
  20. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +25 -0
  21. data/vendor/ring/crypto/aes/aes_test.cc +93 -0
  22. data/vendor/ring/crypto/aes/asm/aes-586.pl +2368 -0
  23. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +1249 -0
  24. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +2246 -0
  25. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +1318 -0
  26. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +2084 -0
  27. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +675 -0
  28. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +1364 -0
  29. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +1565 -0
  30. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +841 -0
  31. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +1116 -0
  32. data/vendor/ring/crypto/aes/internal.h +87 -0
  33. data/vendor/ring/crypto/aes/mode_wrappers.c +61 -0
  34. data/vendor/ring/crypto/bn/add.c +394 -0
  35. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +694 -0
  36. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +1503 -0
  37. data/vendor/ring/crypto/bn/asm/bn-586.pl +774 -0
  38. data/vendor/ring/crypto/bn/asm/co-586.pl +287 -0
  39. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +1882 -0
  40. data/vendor/ring/crypto/bn/asm/x86-mont.pl +592 -0
  41. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +599 -0
  42. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +1393 -0
  43. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +3507 -0
  44. data/vendor/ring/crypto/bn/bn.c +352 -0
  45. data/vendor/ring/crypto/bn/bn_asn1.c +74 -0
  46. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +25 -0
  47. data/vendor/ring/crypto/bn/bn_test.cc +1696 -0
  48. data/vendor/ring/crypto/bn/cmp.c +200 -0
  49. data/vendor/ring/crypto/bn/convert.c +433 -0
  50. data/vendor/ring/crypto/bn/ctx.c +311 -0
  51. data/vendor/ring/crypto/bn/div.c +594 -0
  52. data/vendor/ring/crypto/bn/exponentiation.c +1335 -0
  53. data/vendor/ring/crypto/bn/gcd.c +711 -0
  54. data/vendor/ring/crypto/bn/generic.c +1019 -0
  55. data/vendor/ring/crypto/bn/internal.h +316 -0
  56. data/vendor/ring/crypto/bn/montgomery.c +516 -0
  57. data/vendor/ring/crypto/bn/mul.c +888 -0
  58. data/vendor/ring/crypto/bn/prime.c +829 -0
  59. data/vendor/ring/crypto/bn/random.c +334 -0
  60. data/vendor/ring/crypto/bn/rsaz_exp.c +262 -0
  61. data/vendor/ring/crypto/bn/rsaz_exp.h +53 -0
  62. data/vendor/ring/crypto/bn/shift.c +276 -0
  63. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +25 -0
  64. data/vendor/ring/crypto/bytestring/bytestring_test.cc +421 -0
  65. data/vendor/ring/crypto/bytestring/cbb.c +399 -0
  66. data/vendor/ring/crypto/bytestring/cbs.c +227 -0
  67. data/vendor/ring/crypto/bytestring/internal.h +46 -0
  68. data/vendor/ring/crypto/chacha/chacha_generic.c +140 -0
  69. data/vendor/ring/crypto/chacha/chacha_vec.c +323 -0
  70. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +1447 -0
  71. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +153 -0
  72. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +25 -0
  73. data/vendor/ring/crypto/cipher/e_aes.c +390 -0
  74. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +208 -0
  75. data/vendor/ring/crypto/cipher/internal.h +173 -0
  76. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +543 -0
  77. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +9 -0
  78. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +475 -0
  79. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +23 -0
  80. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +422 -0
  81. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +484 -0
  82. data/vendor/ring/crypto/cipher/test/cipher_test.txt +100 -0
  83. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +25 -0
  84. data/vendor/ring/crypto/constant_time_test.c +304 -0
  85. data/vendor/ring/crypto/cpu-arm-asm.S +32 -0
  86. data/vendor/ring/crypto/cpu-arm.c +199 -0
  87. data/vendor/ring/crypto/cpu-intel.c +261 -0
  88. data/vendor/ring/crypto/crypto.c +151 -0
  89. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +2118 -0
  90. data/vendor/ring/crypto/curve25519/curve25519.c +4888 -0
  91. data/vendor/ring/crypto/curve25519/x25519_test.cc +128 -0
  92. data/vendor/ring/crypto/digest/md32_common.h +181 -0
  93. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +2725 -0
  94. data/vendor/ring/crypto/ec/ec.c +193 -0
  95. data/vendor/ring/crypto/ec/ec_curves.c +61 -0
  96. data/vendor/ring/crypto/ec/ec_key.c +228 -0
  97. data/vendor/ring/crypto/ec/ec_montgomery.c +114 -0
  98. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +25 -0
  99. data/vendor/ring/crypto/ec/internal.h +243 -0
  100. data/vendor/ring/crypto/ec/oct.c +253 -0
  101. data/vendor/ring/crypto/ec/p256-64.c +1794 -0
  102. data/vendor/ring/crypto/ec/p256-x86_64-table.h +9548 -0
  103. data/vendor/ring/crypto/ec/p256-x86_64.c +509 -0
  104. data/vendor/ring/crypto/ec/simple.c +1007 -0
  105. data/vendor/ring/crypto/ec/util-64.c +183 -0
  106. data/vendor/ring/crypto/ec/wnaf.c +508 -0
  107. data/vendor/ring/crypto/ecdh/ecdh.c +155 -0
  108. data/vendor/ring/crypto/ecdsa/ecdsa.c +304 -0
  109. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +193 -0
  110. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +25 -0
  111. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +327 -0
  112. data/vendor/ring/crypto/header_removed.h +17 -0
  113. data/vendor/ring/crypto/internal.h +495 -0
  114. data/vendor/ring/crypto/libring.Windows.vcxproj +101 -0
  115. data/vendor/ring/crypto/mem.c +98 -0
  116. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +1045 -0
  117. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +517 -0
  118. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +1393 -0
  119. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +1741 -0
  120. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +422 -0
  121. data/vendor/ring/crypto/modes/ctr.c +226 -0
  122. data/vendor/ring/crypto/modes/gcm.c +1206 -0
  123. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +25 -0
  124. data/vendor/ring/crypto/modes/gcm_test.c +348 -0
  125. data/vendor/ring/crypto/modes/internal.h +299 -0
  126. data/vendor/ring/crypto/perlasm/arm-xlate.pl +170 -0
  127. data/vendor/ring/crypto/perlasm/readme +100 -0
  128. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +1164 -0
  129. data/vendor/ring/crypto/perlasm/x86asm.pl +292 -0
  130. data/vendor/ring/crypto/perlasm/x86gas.pl +263 -0
  131. data/vendor/ring/crypto/perlasm/x86masm.pl +200 -0
  132. data/vendor/ring/crypto/perlasm/x86nasm.pl +187 -0
  133. data/vendor/ring/crypto/poly1305/poly1305.c +331 -0
  134. data/vendor/ring/crypto/poly1305/poly1305_arm.c +301 -0
  135. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +2015 -0
  136. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +25 -0
  137. data/vendor/ring/crypto/poly1305/poly1305_test.cc +80 -0
  138. data/vendor/ring/crypto/poly1305/poly1305_test.txt +52 -0
  139. data/vendor/ring/crypto/poly1305/poly1305_vec.c +892 -0
  140. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +75 -0
  141. data/vendor/ring/crypto/rand/internal.h +32 -0
  142. data/vendor/ring/crypto/rand/rand.c +189 -0
  143. data/vendor/ring/crypto/rand/urandom.c +219 -0
  144. data/vendor/ring/crypto/rand/windows.c +56 -0
  145. data/vendor/ring/crypto/refcount_c11.c +66 -0
  146. data/vendor/ring/crypto/refcount_lock.c +53 -0
  147. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +25 -0
  148. data/vendor/ring/crypto/refcount_test.c +58 -0
  149. data/vendor/ring/crypto/rsa/blinding.c +462 -0
  150. data/vendor/ring/crypto/rsa/internal.h +108 -0
  151. data/vendor/ring/crypto/rsa/padding.c +300 -0
  152. data/vendor/ring/crypto/rsa/rsa.c +450 -0
  153. data/vendor/ring/crypto/rsa/rsa_asn1.c +261 -0
  154. data/vendor/ring/crypto/rsa/rsa_impl.c +944 -0
  155. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +25 -0
  156. data/vendor/ring/crypto/rsa/rsa_test.cc +437 -0
  157. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +436 -0
  158. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +2390 -0
  159. data/vendor/ring/crypto/sha/asm/sha256-586.pl +1275 -0
  160. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +735 -0
  161. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +14 -0
  162. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +14 -0
  163. data/vendor/ring/crypto/sha/asm/sha512-586.pl +911 -0
  164. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +666 -0
  165. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +14 -0
  166. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +14 -0
  167. data/vendor/ring/crypto/sha/sha1.c +271 -0
  168. data/vendor/ring/crypto/sha/sha256.c +204 -0
  169. data/vendor/ring/crypto/sha/sha512.c +355 -0
  170. data/vendor/ring/crypto/test/file_test.cc +326 -0
  171. data/vendor/ring/crypto/test/file_test.h +181 -0
  172. data/vendor/ring/crypto/test/malloc.cc +150 -0
  173. data/vendor/ring/crypto/test/scoped_types.h +95 -0
  174. data/vendor/ring/crypto/test/test.Windows.vcxproj +35 -0
  175. data/vendor/ring/crypto/test/test_util.cc +46 -0
  176. data/vendor/ring/crypto/test/test_util.h +41 -0
  177. data/vendor/ring/crypto/thread_none.c +55 -0
  178. data/vendor/ring/crypto/thread_pthread.c +165 -0
  179. data/vendor/ring/crypto/thread_test.Windows.vcxproj +25 -0
  180. data/vendor/ring/crypto/thread_test.c +200 -0
  181. data/vendor/ring/crypto/thread_win.c +282 -0
  182. data/vendor/ring/examples/checkdigest.rs +103 -0
  183. data/vendor/ring/include/openssl/aes.h +121 -0
  184. data/vendor/ring/include/openssl/arm_arch.h +129 -0
  185. data/vendor/ring/include/openssl/base.h +156 -0
  186. data/vendor/ring/include/openssl/bn.h +794 -0
  187. data/vendor/ring/include/openssl/buffer.h +18 -0
  188. data/vendor/ring/include/openssl/bytestring.h +235 -0
  189. data/vendor/ring/include/openssl/chacha.h +37 -0
  190. data/vendor/ring/include/openssl/cmac.h +76 -0
  191. data/vendor/ring/include/openssl/cpu.h +184 -0
  192. data/vendor/ring/include/openssl/crypto.h +43 -0
  193. data/vendor/ring/include/openssl/curve25519.h +88 -0
  194. data/vendor/ring/include/openssl/ec.h +225 -0
  195. data/vendor/ring/include/openssl/ec_key.h +129 -0
  196. data/vendor/ring/include/openssl/ecdh.h +110 -0
  197. data/vendor/ring/include/openssl/ecdsa.h +156 -0
  198. data/vendor/ring/include/openssl/err.h +201 -0
  199. data/vendor/ring/include/openssl/mem.h +101 -0
  200. data/vendor/ring/include/openssl/obj_mac.h +71 -0
  201. data/vendor/ring/include/openssl/opensslfeatures.h +68 -0
  202. data/vendor/ring/include/openssl/opensslv.h +18 -0
  203. data/vendor/ring/include/openssl/ossl_typ.h +18 -0
  204. data/vendor/ring/include/openssl/poly1305.h +51 -0
  205. data/vendor/ring/include/openssl/rand.h +70 -0
  206. data/vendor/ring/include/openssl/rsa.h +399 -0
  207. data/vendor/ring/include/openssl/thread.h +133 -0
  208. data/vendor/ring/include/openssl/type_check.h +71 -0
  209. data/vendor/ring/mk/Common.props +63 -0
  210. data/vendor/ring/mk/Windows.props +42 -0
  211. data/vendor/ring/mk/WindowsTest.props +18 -0
  212. data/vendor/ring/mk/appveyor.bat +62 -0
  213. data/vendor/ring/mk/bottom_of_makefile.mk +54 -0
  214. data/vendor/ring/mk/ring.mk +266 -0
  215. data/vendor/ring/mk/top_of_makefile.mk +214 -0
  216. data/vendor/ring/mk/travis.sh +40 -0
  217. data/vendor/ring/mk/update-travis-yml.py +229 -0
  218. data/vendor/ring/ring.sln +153 -0
  219. data/vendor/ring/src/aead.rs +682 -0
  220. data/vendor/ring/src/agreement.rs +248 -0
  221. data/vendor/ring/src/c.rs +129 -0
  222. data/vendor/ring/src/constant_time.rs +37 -0
  223. data/vendor/ring/src/der.rs +96 -0
  224. data/vendor/ring/src/digest.rs +690 -0
  225. data/vendor/ring/src/digest_tests.txt +57 -0
  226. data/vendor/ring/src/ecc.rs +28 -0
  227. data/vendor/ring/src/ecc_build.rs +279 -0
  228. data/vendor/ring/src/ecc_curves.rs +117 -0
  229. data/vendor/ring/src/ed25519_tests.txt +2579 -0
  230. data/vendor/ring/src/exe_tests.rs +46 -0
  231. data/vendor/ring/src/ffi.rs +29 -0
  232. data/vendor/ring/src/file_test.rs +187 -0
  233. data/vendor/ring/src/hkdf.rs +153 -0
  234. data/vendor/ring/src/hkdf_tests.txt +59 -0
  235. data/vendor/ring/src/hmac.rs +414 -0
  236. data/vendor/ring/src/hmac_tests.txt +97 -0
  237. data/vendor/ring/src/input.rs +312 -0
  238. data/vendor/ring/src/lib.rs +41 -0
  239. data/vendor/ring/src/pbkdf2.rs +265 -0
  240. data/vendor/ring/src/pbkdf2_tests.txt +113 -0
  241. data/vendor/ring/src/polyfill.rs +57 -0
  242. data/vendor/ring/src/rand.rs +28 -0
  243. data/vendor/ring/src/signature.rs +314 -0
  244. data/vendor/ring/third-party/NIST/README.md +9 -0
  245. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +263 -0
  246. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +309 -0
  247. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +267 -0
  248. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +263 -0
  249. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +309 -0
  250. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +267 -0
  251. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +263 -0
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +309 -0
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +267 -0
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +519 -0
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +309 -0
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +523 -0
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +519 -0
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +309 -0
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +523 -0
  260. data/vendor/ring/third-party/NIST/sha256sums.txt +1 -0
  261. metadata +333 -0
data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl
@@ -0,0 +1,1565 @@
1
+ #!/usr/bin/env perl
2
+
3
+ ###################################################################
4
+ ### AES-128 [originally in CTR mode] ###
5
+ ### bitsliced implementation for Intel Core 2 processors ###
6
+ ### requires support of SSE extensions up to SSSE3 ###
7
+ ### Author: Emilia Käsper and Peter Schwabe ###
8
+ ### Date: 2009-03-19 ###
9
+ ### Public domain ###
10
+ ### ###
11
+ ### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
12
+ ### further information. ###
13
+ ###################################################################
14
+ #
15
+ # September 2011.
16
+ #
17
+ # Started as a transliteration to "perlasm", the original code has
18
+ # undergone the following changes:
19
+ #
20
+ # - code was made position-independent;
21
+ # - rounds were folded into a loop resulting in >5x size reduction
22
+ # from 12.5KB to 2.2KB;
23
+ # - the above was possible thanks to a mixcolumns() modification that
24
+ # allowed its output to be fed back to aesenc[last]; this was
25
+ # achieved at the cost of two additional inter-register moves;
26
+ # - some instruction reordering and interleaving;
27
+ # - this module doesn't implement a key setup subroutine; instead it
28
+ # relies on conversion of the "conventional" key schedule as returned
29
+ # by AES_set_encrypt_key (see discussion below);
30
+ # - first and last round keys are treated differently, which made it
31
+ # possible to skip one shiftrows(), reduce the bit-sliced key schedule
32
+ # and speed up conversion by 22%;
33
+ # - support for 192- and 256-bit keys was added;
34
+ #
35
+ # Resulting performance in CPU cycles spent to encrypt one byte out
36
+ # of 4096-byte buffer with 128-bit key is:
37
+ #
38
+ # Emilia's this(*) difference
39
+ #
40
+ # Core 2 9.30 8.69 +7%
41
+ # Nehalem(**) 7.63 6.88 +11%
42
+ # Atom 17.1 16.4 +4%
43
+ # Silvermont - 12.9
44
+ #
45
+ # (*) Comparison is not completely fair, because "this" is ECB,
46
+ # i.e. no extra processing such as counter values calculation
47
+ # and xor-ing input as in Emilia's CTR implementation is
48
+ # performed. However, the CTR calculations stand for not more
49
+ # than 1% of total time, so comparison is *rather* fair.
50
+ #
51
+ # (**) Results were collected on Westmere, which is considered to
52
+ # be equivalent to Nehalem for this code.
53
+ #
54
+ # As for the key schedule conversion subroutine: the interface to OpenSSL
55
+ # relies on per-invocation on-the-fly conversion. This naturally
56
+ # has an impact on performance, especially for short inputs. Conversion
57
+ # time in CPU cycles and its ratio to CPU cycles spent in 8x block
58
+ # function is:
59
+ #
60
+ # conversion conversion/8x block
61
+ # Core 2 240 0.22
62
+ # Nehalem 180 0.20
63
+ # Atom 430 0.20
64
+ #
65
+ # The ratio values mean that 128-byte blocks will be processed
66
+ # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
67
+ # etc. Then keep in mind that input sizes not divisible by 128 are
68
+ # *effectively* slower, especially shortest ones, e.g. consecutive
69
+ # 144-byte blocks are processed 44% slower than one would expect,
70
+ # 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
71
+ # it's still faster than ["hyper-threading-safe" code path in]
72
+ # aes-x86_64.pl on all lengths above 64 bytes...
73
+ #
74
+ # October 2011.
75
+ #
76
+ # Add decryption procedure. Performance in CPU cycles spent to decrypt
77
+ # one byte out of 4096-byte buffer with 128-bit key is:
78
+ #
79
+ # Core 2 9.98
80
+ # Nehalem 7.80
81
+ # Atom 17.9
82
+ # Silvermont 14.0
83
+ #
84
+ # November 2011.
85
+ #
86
+ # Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
87
+ # suboptimal, but XTS is meant to be used with larger blocks...
88
+ #
89
+ # <appro@openssl.org>
90
+
91
+ $flavour = shift;
92
+ $output = shift;
93
+ if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
94
+
95
+ $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
96
+
97
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
98
+ ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
99
+ ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
100
+ die "can't locate x86_64-xlate.pl";
101
+
102
+ open OUT,"| \"$^X\" $xlate $flavour $output";
103
+ *STDOUT=*OUT;
104
+
105
+ my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
106
+ my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
107
+
108
+ {
109
+ my ($key,$rounds,$const)=("%rax","%r10d","%r11");
110
+
111
+ sub Sbox {
112
+ # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
113
+ # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
114
+ my @b=@_[0..7];
115
+ my @t=@_[8..11];
116
+ my @s=@_[12..15];
117
+ &InBasisChange (@b);
118
+ &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
119
+ &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
120
+ }
121
+
122
+ sub InBasisChange {
123
+ # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
124
+ # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
125
+ my @b=@_[0..7];
126
+ $code.=<<___;
127
+ pxor @b[6], @b[5]
128
+ pxor @b[1], @b[2]
129
+ pxor @b[0], @b[3]
130
+ pxor @b[2], @b[6]
131
+ pxor @b[0], @b[5]
132
+
133
+ pxor @b[3], @b[6]
134
+ pxor @b[7], @b[3]
135
+ pxor @b[5], @b[7]
136
+ pxor @b[4], @b[3]
137
+ pxor @b[5], @b[4]
138
+ pxor @b[1], @b[3]
139
+
140
+ pxor @b[7], @b[2]
141
+ pxor @b[5], @b[1]
142
+ ___
143
+ }
144
+
145
+ sub OutBasisChange {
146
+ # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
147
+ # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
148
+ my @b=@_[0..7];
149
+ $code.=<<___;
150
+ pxor @b[6], @b[0]
151
+ pxor @b[4], @b[1]
152
+ pxor @b[0], @b[2]
153
+ pxor @b[6], @b[4]
154
+ pxor @b[1], @b[6]
155
+
156
+ pxor @b[5], @b[1]
157
+ pxor @b[3], @b[5]
158
+ pxor @b[7], @b[3]
159
+ pxor @b[5], @b[7]
160
+ pxor @b[5], @b[2]
161
+
162
+ pxor @b[7], @b[4]
163
+ ___
164
+ }
165
+
166
+ sub InvSbox {
167
+ # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
168
+ # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
169
+ my @b=@_[0..7];
170
+ my @t=@_[8..11];
171
+ my @s=@_[12..15];
172
+ &InvInBasisChange (@b);
173
+ &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
174
+ &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
175
+ }
176
+
177
+ sub InvInBasisChange { # OutBasisChange in reverse
178
+ my @b=@_[5,1,2,6,3,7,0,4];
179
+ $code.=<<___
180
+ pxor @b[7], @b[4]
181
+
182
+ pxor @b[5], @b[7]
183
+ pxor @b[5], @b[2]
184
+ pxor @b[7], @b[3]
185
+ pxor @b[3], @b[5]
186
+ pxor @b[5], @b[1]
187
+
188
+ pxor @b[1], @b[6]
189
+ pxor @b[0], @b[2]
190
+ pxor @b[6], @b[4]
191
+ pxor @b[6], @b[0]
192
+ pxor @b[4], @b[1]
193
+ ___
194
+ }
195
+
196
+ sub InvOutBasisChange { # InBasisChange in reverse
197
+ my @b=@_[2,5,7,3,6,1,0,4];
198
+ $code.=<<___;
199
+ pxor @b[5], @b[1]
200
+ pxor @b[7], @b[2]
201
+
202
+ pxor @b[1], @b[3]
203
+ pxor @b[5], @b[4]
204
+ pxor @b[5], @b[7]
205
+ pxor @b[4], @b[3]
206
+ pxor @b[0], @b[5]
207
+ pxor @b[7], @b[3]
208
+ pxor @b[2], @b[6]
209
+ pxor @b[1], @b[2]
210
+ pxor @b[3], @b[6]
211
+
212
+ pxor @b[0], @b[3]
213
+ pxor @b[6], @b[5]
214
+ ___
215
+ }
216
+
217
+ sub Mul_GF4 {
218
+ #;*************************************************************
219
+ #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
220
+ #;*************************************************************
221
+ my ($x0,$x1,$y0,$y1,$t0)=@_;
222
+ $code.=<<___;
223
+ movdqa $y0, $t0
224
+ pxor $y1, $t0
225
+ pand $x0, $t0
226
+ pxor $x1, $x0
227
+ pand $y0, $x1
228
+ pand $y1, $x0
229
+ pxor $x1, $x0
230
+ pxor $t0, $x1
231
+ ___
232
+ }
233
+
234
+ sub Mul_GF4_N { # not used, see next subroutine
235
+ # multiply and scale by N
236
+ my ($x0,$x1,$y0,$y1,$t0)=@_;
237
+ $code.=<<___;
238
+ movdqa $y0, $t0
239
+ pxor $y1, $t0
240
+ pand $x0, $t0
241
+ pxor $x1, $x0
242
+ pand $y0, $x1
243
+ pand $y1, $x0
244
+ pxor $x0, $x1
245
+ pxor $t0, $x0
246
+ ___
247
+ }
248
+
249
+ sub Mul_GF4_N_GF4 {
250
+ # interleaved Mul_GF4_N and Mul_GF4
251
+ my ($x0,$x1,$y0,$y1,$t0,
252
+ $x2,$x3,$y2,$y3,$t1)=@_;
253
+ $code.=<<___;
254
+ movdqa $y0, $t0
255
+ movdqa $y2, $t1
256
+ pxor $y1, $t0
257
+ pxor $y3, $t1
258
+ pand $x0, $t0
259
+ pand $x2, $t1
260
+ pxor $x1, $x0
261
+ pxor $x3, $x2
262
+ pand $y0, $x1
263
+ pand $y2, $x3
264
+ pand $y1, $x0
265
+ pand $y3, $x2
266
+ pxor $x0, $x1
267
+ pxor $x3, $x2
268
+ pxor $t0, $x0
269
+ pxor $t1, $x3
270
+ ___
271
+ }
272
+ sub Mul_GF16_2 {
273
+ my @x=@_[0..7];
274
+ my @y=@_[8..11];
275
+ my @t=@_[12..15];
276
+ $code.=<<___;
277
+ movdqa @x[0], @t[0]
278
+ movdqa @x[1], @t[1]
279
+ ___
280
+ &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
281
+ $code.=<<___;
282
+ pxor @x[2], @t[0]
283
+ pxor @x[3], @t[1]
284
+ pxor @y[2], @y[0]
285
+ pxor @y[3], @y[1]
286
+ ___
287
+ Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
288
+ @x[2], @x[3], @y[2], @y[3], @t[2]);
289
+ $code.=<<___;
290
+ pxor @t[0], @x[0]
291
+ pxor @t[0], @x[2]
292
+ pxor @t[1], @x[1]
293
+ pxor @t[1], @x[3]
294
+
295
+ movdqa @x[4], @t[0]
296
+ movdqa @x[5], @t[1]
297
+ pxor @x[6], @t[0]
298
+ pxor @x[7], @t[1]
299
+ ___
300
+ &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
301
+ @x[6], @x[7], @y[2], @y[3], @t[2]);
302
+ $code.=<<___;
303
+ pxor @y[2], @y[0]
304
+ pxor @y[3], @y[1]
305
+ ___
306
+ &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
307
+ $code.=<<___;
308
+ pxor @t[0], @x[4]
309
+ pxor @t[0], @x[6]
310
+ pxor @t[1], @x[5]
311
+ pxor @t[1], @x[7]
312
+ ___
313
+ }
314
+ sub Inv_GF256 {
315
+ #;********************************************************************
316
+ #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
317
+ #;********************************************************************
318
+ my @x=@_[0..7];
319
+ my @t=@_[8..11];
320
+ my @s=@_[12..15];
321
+ # direct optimizations from hardware
322
+ $code.=<<___;
323
+ movdqa @x[4], @t[3]
324
+ movdqa @x[5], @t[2]
325
+ movdqa @x[1], @t[1]
326
+ movdqa @x[7], @s[1]
327
+ movdqa @x[0], @s[0]
328
+
329
+ pxor @x[6], @t[3]
330
+ pxor @x[7], @t[2]
331
+ pxor @x[3], @t[1]
332
+ movdqa @t[3], @s[2]
333
+ pxor @x[6], @s[1]
334
+ movdqa @t[2], @t[0]
335
+ pxor @x[2], @s[0]
336
+ movdqa @t[3], @s[3]
337
+
338
+ por @t[1], @t[2]
339
+ por @s[0], @t[3]
340
+ pxor @t[0], @s[3]
341
+ pand @s[0], @s[2]
342
+ pxor @t[1], @s[0]
343
+ pand @t[1], @t[0]
344
+ pand @s[0], @s[3]
345
+ movdqa @x[3], @s[0]
346
+ pxor @x[2], @s[0]
347
+ pand @s[0], @s[1]
348
+ pxor @s[1], @t[3]
349
+ pxor @s[1], @t[2]
350
+ movdqa @x[4], @s[1]
351
+ movdqa @x[1], @s[0]
352
+ pxor @x[5], @s[1]
353
+ pxor @x[0], @s[0]
354
+ movdqa @s[1], @t[1]
355
+ pand @s[0], @s[1]
356
+ por @s[0], @t[1]
357
+ pxor @s[1], @t[0]
358
+ pxor @s[3], @t[3]
359
+ pxor @s[2], @t[2]
360
+ pxor @s[3], @t[1]
361
+ movdqa @x[7], @s[0]
362
+ pxor @s[2], @t[0]
363
+ movdqa @x[6], @s[1]
364
+ pxor @s[2], @t[1]
365
+ movdqa @x[5], @s[2]
366
+ pand @x[3], @s[0]
367
+ movdqa @x[4], @s[3]
368
+ pand @x[2], @s[1]
369
+ pand @x[1], @s[2]
370
+ por @x[0], @s[3]
371
+ pxor @s[0], @t[3]
372
+ pxor @s[1], @t[2]
373
+ pxor @s[2], @t[1]
374
+ pxor @s[3], @t[0]
375
+
376
+ #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
377
+
378
+ # new smaller inversion
379
+
380
+ movdqa @t[3], @s[0]
381
+ pand @t[1], @t[3]
382
+ pxor @t[2], @s[0]
383
+
384
+ movdqa @t[0], @s[2]
385
+ movdqa @s[0], @s[3]
386
+ pxor @t[3], @s[2]
387
+ pand @s[2], @s[3]
388
+
389
+ movdqa @t[1], @s[1]
390
+ pxor @t[2], @s[3]
391
+ pxor @t[0], @s[1]
392
+
393
+ pxor @t[2], @t[3]
394
+
395
+ pand @t[3], @s[1]
396
+
397
+ movdqa @s[2], @t[2]
398
+ pxor @t[0], @s[1]
399
+
400
+ pxor @s[1], @t[2]
401
+ pxor @s[1], @t[1]
402
+
403
+ pand @t[0], @t[2]
404
+
405
+ pxor @t[2], @s[2]
406
+ pxor @t[2], @t[1]
407
+
408
+ pand @s[3], @s[2]
409
+
410
+ pxor @s[0], @s[2]
411
+ ___
412
+ # output in s3, s2, s1, t1
413
+
414
+ # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
415
+
416
+ # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
417
+ &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
418
+
419
+ ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
420
+ }
421
+
422
+ # AES linear components
423
+
424
+ sub ShiftRows {
425
+ my @x=@_[0..7];
426
+ my $mask=pop;
427
+ $code.=<<___;
428
+ pxor 0x00($key),@x[0]
429
+ pxor 0x10($key),@x[1]
430
+ pxor 0x20($key),@x[2]
431
+ pxor 0x30($key),@x[3]
432
+ pshufb $mask,@x[0]
433
+ pshufb $mask,@x[1]
434
+ pxor 0x40($key),@x[4]
435
+ pxor 0x50($key),@x[5]
436
+ pshufb $mask,@x[2]
437
+ pshufb $mask,@x[3]
438
+ pxor 0x60($key),@x[6]
439
+ pxor 0x70($key),@x[7]
440
+ pshufb $mask,@x[4]
441
+ pshufb $mask,@x[5]
442
+ pshufb $mask,@x[6]
443
+ pshufb $mask,@x[7]
444
+ lea 0x80($key),$key
445
+ ___
446
+ }
447
+
448
+ sub MixColumns {
449
+ # modified to emit output in order suitable for feeding back to aesenc[last]
450
+ my @x=@_[0..7];
451
+ my @t=@_[8..15];
452
+ my $inv=@_[16]; # optional
453
+ $code.=<<___;
454
+ pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
455
+ pshufd \$0x93, @x[1], @t[1]
456
+ pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
457
+ pshufd \$0x93, @x[2], @t[2]
458
+ pxor @t[1], @x[1]
459
+ pshufd \$0x93, @x[3], @t[3]
460
+ pxor @t[2], @x[2]
461
+ pshufd \$0x93, @x[4], @t[4]
462
+ pxor @t[3], @x[3]
463
+ pshufd \$0x93, @x[5], @t[5]
464
+ pxor @t[4], @x[4]
465
+ pshufd \$0x93, @x[6], @t[6]
466
+ pxor @t[5], @x[5]
467
+ pshufd \$0x93, @x[7], @t[7]
468
+ pxor @t[6], @x[6]
469
+ pxor @t[7], @x[7]
470
+
471
+ pxor @x[0], @t[1]
472
+ pxor @x[7], @t[0]
473
+ pxor @x[7], @t[1]
474
+ pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
475
+ pxor @x[1], @t[2]
476
+ pshufd \$0x4E, @x[1], @x[1]
477
+ pxor @x[4], @t[5]
478
+ pxor @t[0], @x[0]
479
+ pxor @x[5], @t[6]
480
+ pxor @t[1], @x[1]
481
+ pxor @x[3], @t[4]
482
+ pshufd \$0x4E, @x[4], @t[0]
483
+ pxor @x[6], @t[7]
484
+ pshufd \$0x4E, @x[5], @t[1]
485
+ pxor @x[2], @t[3]
486
+ pshufd \$0x4E, @x[3], @x[4]
487
+ pxor @x[7], @t[3]
488
+ pshufd \$0x4E, @x[7], @x[5]
489
+ pxor @x[7], @t[4]
490
+ pshufd \$0x4E, @x[6], @x[3]
491
+ pxor @t[4], @t[0]
492
+ pshufd \$0x4E, @x[2], @x[6]
493
+ pxor @t[5], @t[1]
494
+ ___
495
+ $code.=<<___ if (!$inv);
496
+ pxor @t[3], @x[4]
497
+ pxor @t[7], @x[5]
498
+ pxor @t[6], @x[3]
499
+ movdqa @t[0], @x[2]
500
+ pxor @t[2], @x[6]
501
+ movdqa @t[1], @x[7]
502
+ ___
503
+ $code.=<<___ if ($inv);
504
+ pxor @x[4], @t[3]
505
+ pxor @t[7], @x[5]
506
+ pxor @x[3], @t[6]
507
+ movdqa @t[0], @x[3]
508
+ pxor @t[2], @x[6]
509
+ movdqa @t[6], @x[2]
510
+ movdqa @t[1], @x[7]
511
+ movdqa @x[6], @x[4]
512
+ movdqa @t[3], @x[6]
513
+ ___
514
+ }
515
+
516
+ sub InvMixColumns_orig {
517
+ my @x=@_[0..7];
518
+ my @t=@_[8..15];
519
+
520
+ $code.=<<___;
521
+ # multiplication by 0x0e
522
+ pshufd \$0x93, @x[7], @t[7]
523
+ movdqa @x[2], @t[2]
524
+ pxor @x[5], @x[7] # 7 5
525
+ pxor @x[5], @x[2] # 2 5
526
+ pshufd \$0x93, @x[0], @t[0]
527
+ movdqa @x[5], @t[5]
528
+ pxor @x[0], @x[5] # 5 0 [1]
529
+ pxor @x[1], @x[0] # 0 1
530
+ pshufd \$0x93, @x[1], @t[1]
531
+ pxor @x[2], @x[1] # 1 25
532
+ pxor @x[6], @x[0] # 01 6 [2]
533
+ pxor @x[3], @x[1] # 125 3 [4]
534
+ pshufd \$0x93, @x[3], @t[3]
535
+ pxor @x[0], @x[2] # 25 016 [3]
536
+ pxor @x[7], @x[3] # 3 75
537
+ pxor @x[6], @x[7] # 75 6 [0]
538
+ pshufd \$0x93, @x[6], @t[6]
539
+ movdqa @x[4], @t[4]
540
+ pxor @x[4], @x[6] # 6 4
541
+ pxor @x[3], @x[4] # 4 375 [6]
542
+ pxor @x[7], @x[3] # 375 756=36
543
+ pxor @t[5], @x[6] # 64 5 [7]
544
+ pxor @t[2], @x[3] # 36 2
545
+ pxor @t[4], @x[3] # 362 4 [5]
546
+ pshufd \$0x93, @t[5], @t[5]
547
+ ___
548
+ my @y = @x[7,5,0,2,1,3,4,6];
549
+ $code.=<<___;
550
+ # multiplication by 0x0b
551
+ pxor @y[0], @y[1]
552
+ pxor @t[0], @y[0]
553
+ pxor @t[1], @y[1]
554
+ pshufd \$0x93, @t[2], @t[2]
555
+ pxor @t[5], @y[0]
556
+ pxor @t[6], @y[1]
557
+ pxor @t[7], @y[0]
558
+ pshufd \$0x93, @t[4], @t[4]
559
+ pxor @t[6], @t[7] # clobber t[7]
560
+ pxor @y[0], @y[1]
561
+
562
+ pxor @t[0], @y[3]
563
+ pshufd \$0x93, @t[0], @t[0]
564
+ pxor @t[1], @y[2]
565
+ pxor @t[1], @y[4]
566
+ pxor @t[2], @y[2]
567
+ pshufd \$0x93, @t[1], @t[1]
568
+ pxor @t[2], @y[3]
569
+ pxor @t[2], @y[5]
570
+ pxor @t[7], @y[2]
571
+ pshufd \$0x93, @t[2], @t[2]
572
+ pxor @t[3], @y[3]
573
+ pxor @t[3], @y[6]
574
+ pxor @t[3], @y[4]
575
+ pshufd \$0x93, @t[3], @t[3]
576
+ pxor @t[4], @y[7]
577
+ pxor @t[4], @y[5]
578
+ pxor @t[7], @y[7]
579
+ pxor @t[5], @y[3]
580
+ pxor @t[4], @y[4]
581
+ pxor @t[5], @t[7] # clobber t[7] even more
582
+
583
+ pxor @t[7], @y[5]
584
+ pshufd \$0x93, @t[4], @t[4]
585
+ pxor @t[7], @y[6]
586
+ pxor @t[7], @y[4]
587
+
588
+ pxor @t[5], @t[7]
589
+ pshufd \$0x93, @t[5], @t[5]
590
+ pxor @t[6], @t[7] # restore t[7]
591
+
592
+ # multiplication by 0x0d
593
+ pxor @y[7], @y[4]
594
+ pxor @t[4], @y[7]
595
+ pshufd \$0x93, @t[6], @t[6]
596
+ pxor @t[0], @y[2]
597
+ pxor @t[5], @y[7]
598
+ pxor @t[2], @y[2]
599
+ pshufd \$0x93, @t[7], @t[7]
600
+
601
+ pxor @y[1], @y[3]
602
+ pxor @t[1], @y[1]
603
+ pxor @t[0], @y[0]
604
+ pxor @t[0], @y[3]
605
+ pxor @t[5], @y[1]
606
+ pxor @t[5], @y[0]
607
+ pxor @t[7], @y[1]
608
+ pshufd \$0x93, @t[0], @t[0]
609
+ pxor @t[6], @y[0]
610
+ pxor @y[1], @y[3]
611
+ pxor @t[1], @y[4]
612
+ pshufd \$0x93, @t[1], @t[1]
613
+
614
+ pxor @t[7], @y[7]
615
+ pxor @t[2], @y[4]
616
+ pxor @t[2], @y[5]
617
+ pshufd \$0x93, @t[2], @t[2]
618
+ pxor @t[6], @y[2]
619
+ pxor @t[3], @t[6] # clobber t[6]
620
+ pxor @y[7], @y[4]
621
+ pxor @t[6], @y[3]
622
+
623
+ pxor @t[6], @y[6]
624
+ pxor @t[5], @y[5]
625
+ pxor @t[4], @y[6]
626
+ pshufd \$0x93, @t[4], @t[4]
627
+ pxor @t[6], @y[5]
628
+ pxor @t[7], @y[6]
629
+ pxor @t[3], @t[6] # restore t[6]
630
+
631
+ pshufd \$0x93, @t[5], @t[5]
632
+ pshufd \$0x93, @t[6], @t[6]
633
+ pshufd \$0x93, @t[7], @t[7]
634
+ pshufd \$0x93, @t[3], @t[3]
635
+
636
+ # multiplication by 0x09
637
+ pxor @y[1], @y[4]
638
+ pxor @y[1], @t[1] # t[1]=y[1]
639
+ pxor @t[5], @t[0] # clobber t[0]
640
+ pxor @t[5], @t[1]
641
+ pxor @t[0], @y[3]
642
+ pxor @y[0], @t[0] # t[0]=y[0]
643
+ pxor @t[6], @t[1]
644
+ pxor @t[7], @t[6] # clobber t[6]
645
+ pxor @t[1], @y[4]
646
+ pxor @t[4], @y[7]
647
+ pxor @y[4], @t[4] # t[4]=y[4]
648
+ pxor @t[3], @y[6]
649
+ pxor @y[3], @t[3] # t[3]=y[3]
650
+ pxor @t[2], @y[5]
651
+ pxor @y[2], @t[2] # t[2]=y[2]
652
+ pxor @t[7], @t[3]
653
+ pxor @y[5], @t[5] # t[5]=y[5]
654
+ pxor @t[6], @t[2]
655
+ pxor @t[6], @t[5]
656
+ pxor @y[6], @t[6] # t[6]=y[6]
657
+ pxor @y[7], @t[7] # t[7]=y[7]
658
+
659
+ movdqa @t[0],@XMM[0]
660
+ movdqa @t[1],@XMM[1]
661
+ movdqa @t[2],@XMM[2]
662
+ movdqa @t[3],@XMM[3]
663
+ movdqa @t[4],@XMM[4]
664
+ movdqa @t[5],@XMM[5]
665
+ movdqa @t[6],@XMM[6]
666
+ movdqa @t[7],@XMM[7]
667
+ ___
668
+ }
669
+
670
+ sub InvMixColumns {
671
+ my @x=@_[0..7];
672
+ my @t=@_[8..15];
673
+
674
+ # Thanks to Jussi Kivilinna for providing pointer to
675
+ #
676
+ # | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
677
+ # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
678
+ # | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
679
+ # | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
680
+
681
+ $code.=<<___;
682
+ # multiplication by 0x05-0x00-0x04-0x00
683
+ pshufd \$0x4E, @x[0], @t[0]
684
+ pshufd \$0x4E, @x[6], @t[6]
685
+ pxor @x[0], @t[0]
686
+ pshufd \$0x4E, @x[7], @t[7]
687
+ pxor @x[6], @t[6]
688
+ pshufd \$0x4E, @x[1], @t[1]
689
+ pxor @x[7], @t[7]
690
+ pshufd \$0x4E, @x[2], @t[2]
691
+ pxor @x[1], @t[1]
692
+ pshufd \$0x4E, @x[3], @t[3]
693
+ pxor @x[2], @t[2]
694
+ pxor @t[6], @x[0]
695
+ pxor @t[6], @x[1]
696
+ pshufd \$0x4E, @x[4], @t[4]
697
+ pxor @x[3], @t[3]
698
+ pxor @t[0], @x[2]
699
+ pxor @t[1], @x[3]
700
+ pshufd \$0x4E, @x[5], @t[5]
701
+ pxor @x[4], @t[4]
702
+ pxor @t[7], @x[1]
703
+ pxor @t[2], @x[4]
704
+ pxor @x[5], @t[5]
705
+
706
+ pxor @t[7], @x[2]
707
+ pxor @t[6], @x[3]
708
+ pxor @t[6], @x[4]
709
+ pxor @t[3], @x[5]
710
+ pxor @t[4], @x[6]
711
+ pxor @t[7], @x[4]
712
+ pxor @t[7], @x[5]
713
+ pxor @t[5], @x[7]
714
+ ___
715
+ &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
716
+ }
717
+
718
+ sub aesenc { # not used
719
+ my @b=@_[0..7];
720
+ my @t=@_[8..15];
721
+ $code.=<<___;
722
+ movdqa 0x30($const),@t[0] # .LSR
723
+ ___
724
+ &ShiftRows (@b,@t[0]);
725
+ &Sbox (@b,@t);
726
+ &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
727
+ }
728
+
729
+ sub aesenclast { # not used
730
+ my @b=@_[0..7];
731
+ my @t=@_[8..15];
732
+ $code.=<<___;
733
+ movdqa 0x40($const),@t[0] # .LSRM0
734
+ ___
735
+ &ShiftRows (@b,@t[0]);
736
+ &Sbox (@b,@t);
737
+ $code.=<<___
738
+ pxor 0x00($key),@b[0]
739
+ pxor 0x10($key),@b[1]
740
+ pxor 0x20($key),@b[4]
741
+ pxor 0x30($key),@b[6]
742
+ pxor 0x40($key),@b[3]
743
+ pxor 0x50($key),@b[7]
744
+ pxor 0x60($key),@b[2]
745
+ pxor 0x70($key),@b[5]
746
+ ___
747
+ }
748
+
749
+ sub swapmove {
750
+ my ($a,$b,$n,$mask,$t)=@_;
751
+ $code.=<<___;
752
+ movdqa $b,$t
753
+ psrlq \$$n,$b
754
+ pxor $a,$b
755
+ pand $mask,$b
756
+ pxor $b,$a
757
+ psllq \$$n,$b
758
+ pxor $t,$b
759
+ ___
760
+ }
761
+ sub swapmove2x {
762
+ my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
763
+ $code.=<<___;
764
+ movdqa $b0,$t0
765
+ psrlq \$$n,$b0
766
+ movdqa $b1,$t1
767
+ psrlq \$$n,$b1
768
+ pxor $a0,$b0
769
+ pxor $a1,$b1
770
+ pand $mask,$b0
771
+ pand $mask,$b1
772
+ pxor $b0,$a0
773
+ psllq \$$n,$b0
774
+ pxor $b1,$a1
775
+ psllq \$$n,$b1
776
+ pxor $t0,$b0
777
+ pxor $t1,$b1
778
+ ___
779
+ }
780
+
781
+ sub bitslice {
782
+ my @x=reverse(@_[0..7]);
783
+ my ($t0,$t1,$t2,$t3)=@_[8..11];
784
+ $code.=<<___;
785
+ movdqa 0x00($const),$t0 # .LBS0
786
+ movdqa 0x10($const),$t1 # .LBS1
787
+ ___
788
+ &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
789
+ &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
790
+ $code.=<<___;
791
+ movdqa 0x20($const),$t0 # .LBS2
792
+ ___
793
+ &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
794
+ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
795
+
796
+ &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
797
+ &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
798
+ }
799
+
800
+ $code.=<<___;
801
+ .text
802
+
803
+ .extern asm_AES_encrypt
804
+ .extern asm_AES_decrypt
805
+
806
+ .type _bsaes_encrypt8,\@abi-omnipotent
807
+ .align 64
808
+ _bsaes_encrypt8:
809
+ lea .LBS0(%rip), $const # constants table
810
+
811
+ movdqa ($key), @XMM[9] # round 0 key
812
+ lea 0x10($key), $key
813
+ movdqa 0x50($const), @XMM[8] # .LM0SR
814
+ pxor @XMM[9], @XMM[0] # xor with round0 key
815
+ pxor @XMM[9], @XMM[1]
816
+ pxor @XMM[9], @XMM[2]
817
+ pxor @XMM[9], @XMM[3]
818
+ pshufb @XMM[8], @XMM[0]
819
+ pshufb @XMM[8], @XMM[1]
820
+ pxor @XMM[9], @XMM[4]
821
+ pxor @XMM[9], @XMM[5]
822
+ pshufb @XMM[8], @XMM[2]
823
+ pshufb @XMM[8], @XMM[3]
824
+ pxor @XMM[9], @XMM[6]
825
+ pxor @XMM[9], @XMM[7]
826
+ pshufb @XMM[8], @XMM[4]
827
+ pshufb @XMM[8], @XMM[5]
828
+ pshufb @XMM[8], @XMM[6]
829
+ pshufb @XMM[8], @XMM[7]
830
+ _bsaes_encrypt8_bitslice:
831
+ ___
832
+ &bitslice (@XMM[0..7, 8..11]);
833
+ $code.=<<___;
834
+ dec $rounds
835
+ jmp .Lenc_sbox
836
+ .align 16
837
+ .Lenc_loop:
838
+ ___
839
+ &ShiftRows (@XMM[0..7, 8]);
840
+ $code.=".Lenc_sbox:\n";
841
+ &Sbox (@XMM[0..7, 8..15]);
842
+ $code.=<<___;
843
+ dec $rounds
844
+ jl .Lenc_done
845
+ ___
846
+ &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
847
+ $code.=<<___;
848
+ movdqa 0x30($const), @XMM[8] # .LSR
849
+ jnz .Lenc_loop
850
+ movdqa 0x40($const), @XMM[8] # .LSRM0
851
+ jmp .Lenc_loop
852
+ .align 16
853
+ .Lenc_done:
854
+ ___
855
+ # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
856
+ &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
857
+ $code.=<<___;
858
+ movdqa ($key), @XMM[8] # last round key
859
+ pxor @XMM[8], @XMM[4]
860
+ pxor @XMM[8], @XMM[6]
861
+ pxor @XMM[8], @XMM[3]
862
+ pxor @XMM[8], @XMM[7]
863
+ pxor @XMM[8], @XMM[2]
864
+ pxor @XMM[8], @XMM[5]
865
+ pxor @XMM[8], @XMM[0]
866
+ pxor @XMM[8], @XMM[1]
867
+ ret
868
+ .size _bsaes_encrypt8,.-_bsaes_encrypt8
869
+
870
+ .type _bsaes_decrypt8,\@abi-omnipotent
871
+ .align 64
872
+ _bsaes_decrypt8:
873
+ lea .LBS0(%rip), $const # constants table
874
+
875
+ movdqa ($key), @XMM[9] # round 0 key
876
+ lea 0x10($key), $key
877
+ movdqa -0x30($const), @XMM[8] # .LM0ISR
878
+ pxor @XMM[9], @XMM[0] # xor with round0 key
879
+ pxor @XMM[9], @XMM[1]
880
+ pxor @XMM[9], @XMM[2]
881
+ pxor @XMM[9], @XMM[3]
882
+ pshufb @XMM[8], @XMM[0]
883
+ pshufb @XMM[8], @XMM[1]
884
+ pxor @XMM[9], @XMM[4]
885
+ pxor @XMM[9], @XMM[5]
886
+ pshufb @XMM[8], @XMM[2]
887
+ pshufb @XMM[8], @XMM[3]
888
+ pxor @XMM[9], @XMM[6]
889
+ pxor @XMM[9], @XMM[7]
890
+ pshufb @XMM[8], @XMM[4]
891
+ pshufb @XMM[8], @XMM[5]
892
+ pshufb @XMM[8], @XMM[6]
893
+ pshufb @XMM[8], @XMM[7]
894
+ ___
895
+ &bitslice (@XMM[0..7, 8..11]);
896
+ $code.=<<___;
897
+ dec $rounds
898
+ jmp .Ldec_sbox
899
+ .align 16
900
+ .Ldec_loop:
901
+ ___
902
+ &ShiftRows (@XMM[0..7, 8]);
903
+ $code.=".Ldec_sbox:\n";
904
+ &InvSbox (@XMM[0..7, 8..15]);
905
+ $code.=<<___;
906
+ dec $rounds
907
+ jl .Ldec_done
908
+ ___
909
+ &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
910
+ $code.=<<___;
911
+ movdqa -0x10($const), @XMM[8] # .LISR
912
+ jnz .Ldec_loop
913
+ movdqa -0x20($const), @XMM[8] # .LISRM0
914
+ jmp .Ldec_loop
915
+ .align 16
916
+ .Ldec_done:
917
+ ___
918
+ &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
919
+ $code.=<<___;
920
+ movdqa ($key), @XMM[8] # last round key
921
+ pxor @XMM[8], @XMM[6]
922
+ pxor @XMM[8], @XMM[4]
923
+ pxor @XMM[8], @XMM[2]
924
+ pxor @XMM[8], @XMM[7]
925
+ pxor @XMM[8], @XMM[3]
926
+ pxor @XMM[8], @XMM[5]
927
+ pxor @XMM[8], @XMM[0]
928
+ pxor @XMM[8], @XMM[1]
929
+ ret
930
+ .size _bsaes_decrypt8,.-_bsaes_decrypt8
931
+ ___
932
+ }
933
+ {
934
+ my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
935
+
936
+ sub bitslice_key {
937
+ my @x=reverse(@_[0..7]);
938
+ my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
939
+
940
+ &swapmove (@x[0,1],1,$bs0,$t2,$t3);
941
+ $code.=<<___;
942
+ #&swapmove(@x[2,3],1,$t0,$t2,$t3);
943
+ movdqa @x[0], @x[2]
944
+ movdqa @x[1], @x[3]
945
+ ___
946
+ #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
947
+
948
+ &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
949
+ $code.=<<___;
950
+ #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
951
+ movdqa @x[0], @x[4]
952
+ movdqa @x[2], @x[6]
953
+ movdqa @x[1], @x[5]
954
+ movdqa @x[3], @x[7]
955
+ ___
956
+ &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
957
+ &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
958
+ }
959
+
960
+ $code.=<<___;
961
+ .type _bsaes_key_convert,\@abi-omnipotent
962
+ .align 16
963
+ _bsaes_key_convert:
964
+ lea .Lmasks(%rip), $const
965
+ movdqu ($inp), %xmm7 # load round 0 key
966
+ lea 0x10($inp), $inp
967
+ movdqa 0x00($const), %xmm0 # 0x01...
968
+ movdqa 0x10($const), %xmm1 # 0x02...
969
+ movdqa 0x20($const), %xmm2 # 0x04...
970
+ movdqa 0x30($const), %xmm3 # 0x08...
971
+ movdqa 0x40($const), %xmm4 # .LM0
972
+ pcmpeqd %xmm5, %xmm5 # .LNOT
973
+
974
+ movdqu ($inp), %xmm6 # load round 1 key
975
+ movdqa %xmm7, ($out) # save round 0 key
976
+ lea 0x10($out), $out
977
+ dec $rounds
978
+ jmp .Lkey_loop
979
+ .align 16
980
+ .Lkey_loop:
981
+ pshufb %xmm4, %xmm6 # .LM0
982
+
983
+ movdqa %xmm0, %xmm8
984
+ movdqa %xmm1, %xmm9
985
+
986
+ pand %xmm6, %xmm8
987
+ pand %xmm6, %xmm9
988
+ movdqa %xmm2, %xmm10
989
+ pcmpeqb %xmm0, %xmm8
990
+ psllq \$4, %xmm0 # 0x10...
991
+ movdqa %xmm3, %xmm11
992
+ pcmpeqb %xmm1, %xmm9
993
+ psllq \$4, %xmm1 # 0x20...
994
+
995
+ pand %xmm6, %xmm10
996
+ pand %xmm6, %xmm11
997
+ movdqa %xmm0, %xmm12
998
+ pcmpeqb %xmm2, %xmm10
999
+ psllq \$4, %xmm2 # 0x40...
1000
+ movdqa %xmm1, %xmm13
1001
+ pcmpeqb %xmm3, %xmm11
1002
+ psllq \$4, %xmm3 # 0x80...
1003
+
1004
+ movdqa %xmm2, %xmm14
1005
+ movdqa %xmm3, %xmm15
1006
+ pxor %xmm5, %xmm8 # "pnot"
1007
+ pxor %xmm5, %xmm9
1008
+
1009
+ pand %xmm6, %xmm12
1010
+ pand %xmm6, %xmm13
1011
+ movdqa %xmm8, 0x00($out) # write bit-sliced round key
1012
+ pcmpeqb %xmm0, %xmm12
1013
+ psrlq \$4, %xmm0 # 0x01...
1014
+ movdqa %xmm9, 0x10($out)
1015
+ pcmpeqb %xmm1, %xmm13
1016
+ psrlq \$4, %xmm1 # 0x02...
1017
+ lea 0x10($inp), $inp
1018
+
1019
+ pand %xmm6, %xmm14
1020
+ pand %xmm6, %xmm15
1021
+ movdqa %xmm10, 0x20($out)
1022
+ pcmpeqb %xmm2, %xmm14
1023
+ psrlq \$4, %xmm2 # 0x04...
1024
+ movdqa %xmm11, 0x30($out)
1025
+ pcmpeqb %xmm3, %xmm15
1026
+ psrlq \$4, %xmm3 # 0x08...
1027
+ movdqu ($inp), %xmm6 # load next round key
1028
+
1029
+ pxor %xmm5, %xmm13 # "pnot"
1030
+ pxor %xmm5, %xmm14
1031
+ movdqa %xmm12, 0x40($out)
1032
+ movdqa %xmm13, 0x50($out)
1033
+ movdqa %xmm14, 0x60($out)
1034
+ movdqa %xmm15, 0x70($out)
1035
+ lea 0x80($out),$out
1036
+ dec $rounds
1037
+ jnz .Lkey_loop
1038
+
1039
+ movdqa 0x50($const), %xmm7 # .L63
1040
+ #movdqa %xmm6, ($out) # don't save last round key
1041
+ ret
1042
+ .size _bsaes_key_convert,.-_bsaes_key_convert
1043
+ ___
1044
+ }
1045
+
1046
+ if (0 && !$win64) { # following four functions are unsupported interface
1047
+ # used for benchmarking...
1048
+ $code.=<<___;
1049
+ .globl bsaes_enc_key_convert
1050
+ .type bsaes_enc_key_convert,\@function,2
1051
+ .align 16
1052
+ bsaes_enc_key_convert:
1053
+ mov 240($inp),%r10d # pass rounds
1054
+ mov $inp,%rcx # pass key
1055
+ mov $out,%rax # pass key schedule
1056
+ call _bsaes_key_convert
1057
+ pxor %xmm6,%xmm7 # fix up last round key
1058
+ movdqa %xmm7,(%rax) # save last round key
1059
+ ret
1060
+ .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
1061
+
1062
+ .globl bsaes_encrypt_128
1063
+ .type bsaes_encrypt_128,\@function,4
1064
+ .align 16
1065
+ bsaes_encrypt_128:
1066
+ .Lenc128_loop:
1067
+ movdqu 0x00($inp), @XMM[0] # load input
1068
+ movdqu 0x10($inp), @XMM[1]
1069
+ movdqu 0x20($inp), @XMM[2]
1070
+ movdqu 0x30($inp), @XMM[3]
1071
+ movdqu 0x40($inp), @XMM[4]
1072
+ movdqu 0x50($inp), @XMM[5]
1073
+ movdqu 0x60($inp), @XMM[6]
1074
+ movdqu 0x70($inp), @XMM[7]
1075
+ mov $key, %rax # pass the $key
1076
+ lea 0x80($inp), $inp
1077
+ mov \$10,%r10d
1078
+
1079
+ call _bsaes_encrypt8
1080
+
1081
+ movdqu @XMM[0], 0x00($out) # write output
1082
+ movdqu @XMM[1], 0x10($out)
1083
+ movdqu @XMM[4], 0x20($out)
1084
+ movdqu @XMM[6], 0x30($out)
1085
+ movdqu @XMM[3], 0x40($out)
1086
+ movdqu @XMM[7], 0x50($out)
1087
+ movdqu @XMM[2], 0x60($out)
1088
+ movdqu @XMM[5], 0x70($out)
1089
+ lea 0x80($out), $out
1090
+ sub \$0x80,$len
1091
+ ja .Lenc128_loop
1092
+ ret
1093
+ .size bsaes_encrypt_128,.-bsaes_encrypt_128
1094
+
1095
+ .globl bsaes_dec_key_convert
1096
+ .type bsaes_dec_key_convert,\@function,2
1097
+ .align 16
1098
+ bsaes_dec_key_convert:
1099
+ mov 240($inp),%r10d # pass rounds
1100
+ mov $inp,%rcx # pass key
1101
+ mov $out,%rax # pass key schedule
1102
+ call _bsaes_key_convert
1103
+ pxor ($out),%xmm7 # fix up round 0 key
1104
+ movdqa %xmm6,(%rax) # save last round key
1105
+ movdqa %xmm7,($out)
1106
+ ret
1107
+ .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1108
+
1109
+ .globl bsaes_decrypt_128
1110
+ .type bsaes_decrypt_128,\@function,4
1111
+ .align 16
1112
+ bsaes_decrypt_128:
1113
+ .Ldec128_loop:
1114
+ movdqu 0x00($inp), @XMM[0] # load input
1115
+ movdqu 0x10($inp), @XMM[1]
1116
+ movdqu 0x20($inp), @XMM[2]
1117
+ movdqu 0x30($inp), @XMM[3]
1118
+ movdqu 0x40($inp), @XMM[4]
1119
+ movdqu 0x50($inp), @XMM[5]
1120
+ movdqu 0x60($inp), @XMM[6]
1121
+ movdqu 0x70($inp), @XMM[7]
1122
+ mov $key, %rax # pass the $key
1123
+ lea 0x80($inp), $inp
1124
+ mov \$10,%r10d
1125
+
1126
+ call _bsaes_decrypt8
1127
+
1128
+ movdqu @XMM[0], 0x00($out) # write output
1129
+ movdqu @XMM[1], 0x10($out)
1130
+ movdqu @XMM[6], 0x20($out)
1131
+ movdqu @XMM[4], 0x30($out)
1132
+ movdqu @XMM[2], 0x40($out)
1133
+ movdqu @XMM[7], 0x50($out)
1134
+ movdqu @XMM[3], 0x60($out)
1135
+ movdqu @XMM[5], 0x70($out)
1136
+ lea 0x80($out), $out
1137
+ sub \$0x80,$len
1138
+ ja .Ldec128_loop
1139
+ ret
1140
+ .size bsaes_decrypt_128,.-bsaes_decrypt_128
1141
+ ___
1142
+ }
1143
+ {
1144
+ ######################################################################
1145
+ #
1146
+ # OpenSSL interface
1147
+ #
1148
+ my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1149
+ : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1150
+ my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1151
+
1152
+ $code.=<<___;
1153
+ .globl bsaes_ctr32_encrypt_blocks
1154
+ .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1155
+ .align 16
1156
+ bsaes_ctr32_encrypt_blocks:
1157
+ mov %rsp, %rax
1158
+ .Lctr_enc_prologue:
1159
+ push %rbp
1160
+ push %rbx
1161
+ push %r12
1162
+ push %r13
1163
+ push %r14
1164
+ push %r15
1165
+ lea -0x48(%rsp), %rsp
1166
+ ___
1167
+ $code.=<<___ if ($win64);
1168
+ mov 0xa0(%rsp),$arg5 # pull ivp
1169
+ lea -0xa0(%rsp), %rsp
1170
+ movaps %xmm6, 0x40(%rsp)
1171
+ movaps %xmm7, 0x50(%rsp)
1172
+ movaps %xmm8, 0x60(%rsp)
1173
+ movaps %xmm9, 0x70(%rsp)
1174
+ movaps %xmm10, 0x80(%rsp)
1175
+ movaps %xmm11, 0x90(%rsp)
1176
+ movaps %xmm12, 0xa0(%rsp)
1177
+ movaps %xmm13, 0xb0(%rsp)
1178
+ movaps %xmm14, 0xc0(%rsp)
1179
+ movaps %xmm15, 0xd0(%rsp)
1180
+ .Lctr_enc_body:
1181
+ ___
1182
+ $code.=<<___;
1183
+ mov %rsp, %rbp # backup %rsp
1184
+ movdqu ($arg5), %xmm0 # load counter
1185
+ mov 240($arg4), %eax # rounds
1186
+ mov $arg1, $inp # backup arguments
1187
+ mov $arg2, $out
1188
+ mov $arg3, $len
1189
+ mov $arg4, $key
1190
+ movdqa %xmm0, 0x20(%rbp) # copy counter
1191
+ cmp \$8, $arg3
1192
+ jb .Lctr_enc_short
1193
+
1194
+ mov %eax, %ebx # rounds
1195
+ shl \$7, %rax # 128 bytes per inner round key
1196
+ sub \$`128-32`, %rax # size of bit-sliced key schedule
1197
+ sub %rax, %rsp
1198
+
1199
+ mov %rsp, %rax # pass key schedule
1200
+ mov $key, %rcx # pass key
1201
+ mov %ebx, %r10d # pass rounds
1202
+ call _bsaes_key_convert
1203
+ pxor %xmm6,%xmm7 # fix up last round key
1204
+ movdqa %xmm7,(%rax) # save last round key
1205
+
1206
+ movdqa (%rsp), @XMM[9] # load round0 key
1207
+ lea .LADD1(%rip), %r11
1208
+ movdqa 0x20(%rbp), @XMM[0] # counter copy
1209
+ movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1210
+ pshufb @XMM[8], @XMM[9] # byte swap upper part
1211
+ pshufb @XMM[8], @XMM[0]
1212
+ movdqa @XMM[9], (%rsp) # save adjusted round0 key
1213
+ jmp .Lctr_enc_loop
1214
+ .align 16
1215
+ .Lctr_enc_loop:
1216
+ movdqa @XMM[0], 0x20(%rbp) # save counter
1217
+ movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1218
+ movdqa @XMM[0], @XMM[2]
1219
+ paddd 0x00(%r11), @XMM[1] # .LADD1
1220
+ movdqa @XMM[0], @XMM[3]
1221
+ paddd 0x10(%r11), @XMM[2] # .LADD2
1222
+ movdqa @XMM[0], @XMM[4]
1223
+ paddd 0x20(%r11), @XMM[3] # .LADD3
1224
+ movdqa @XMM[0], @XMM[5]
1225
+ paddd 0x30(%r11), @XMM[4] # .LADD4
1226
+ movdqa @XMM[0], @XMM[6]
1227
+ paddd 0x40(%r11), @XMM[5] # .LADD5
1228
+ movdqa @XMM[0], @XMM[7]
1229
+ paddd 0x50(%r11), @XMM[6] # .LADD6
1230
+ paddd 0x60(%r11), @XMM[7] # .LADD7
1231
+
1232
+ # Borrow prologue from _bsaes_encrypt8 to use the opportunity
1233
+ # to flip byte order in 32-bit counter
1234
+ movdqa (%rsp), @XMM[9] # round 0 key
1235
+ lea 0x10(%rsp), %rax # pass key schedule
1236
+ movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
1237
+ pxor @XMM[9], @XMM[0] # xor with round0 key
1238
+ pxor @XMM[9], @XMM[1]
1239
+ pxor @XMM[9], @XMM[2]
1240
+ pxor @XMM[9], @XMM[3]
1241
+ pshufb @XMM[8], @XMM[0]
1242
+ pshufb @XMM[8], @XMM[1]
1243
+ pxor @XMM[9], @XMM[4]
1244
+ pxor @XMM[9], @XMM[5]
1245
+ pshufb @XMM[8], @XMM[2]
1246
+ pshufb @XMM[8], @XMM[3]
1247
+ pxor @XMM[9], @XMM[6]
1248
+ pxor @XMM[9], @XMM[7]
1249
+ pshufb @XMM[8], @XMM[4]
1250
+ pshufb @XMM[8], @XMM[5]
1251
+ pshufb @XMM[8], @XMM[6]
1252
+ pshufb @XMM[8], @XMM[7]
1253
+ lea .LBS0(%rip), %r11 # constants table
1254
+ mov %ebx,%r10d # pass rounds
1255
+
1256
+ call _bsaes_encrypt8_bitslice
1257
+
1258
+ sub \$8,$len
1259
+ jc .Lctr_enc_loop_done
1260
+
1261
+ movdqu 0x00($inp), @XMM[8] # load input
1262
+ movdqu 0x10($inp), @XMM[9]
1263
+ movdqu 0x20($inp), @XMM[10]
1264
+ movdqu 0x30($inp), @XMM[11]
1265
+ movdqu 0x40($inp), @XMM[12]
1266
+ movdqu 0x50($inp), @XMM[13]
1267
+ movdqu 0x60($inp), @XMM[14]
1268
+ movdqu 0x70($inp), @XMM[15]
1269
+ lea 0x80($inp),$inp
1270
+ pxor @XMM[0], @XMM[8]
1271
+ movdqa 0x20(%rbp), @XMM[0] # load counter
1272
+ pxor @XMM[9], @XMM[1]
1273
+ movdqu @XMM[8], 0x00($out) # write output
1274
+ pxor @XMM[10], @XMM[4]
1275
+ movdqu @XMM[1], 0x10($out)
1276
+ pxor @XMM[11], @XMM[6]
1277
+ movdqu @XMM[4], 0x20($out)
1278
+ pxor @XMM[12], @XMM[3]
1279
+ movdqu @XMM[6], 0x30($out)
1280
+ pxor @XMM[13], @XMM[7]
1281
+ movdqu @XMM[3], 0x40($out)
1282
+ pxor @XMM[14], @XMM[2]
1283
+ movdqu @XMM[7], 0x50($out)
1284
+ pxor @XMM[15], @XMM[5]
1285
+ movdqu @XMM[2], 0x60($out)
1286
+ lea .LADD1(%rip), %r11
1287
+ movdqu @XMM[5], 0x70($out)
1288
+ lea 0x80($out), $out
1289
+ paddd 0x70(%r11), @XMM[0] # .LADD8
1290
+ jnz .Lctr_enc_loop
1291
+
1292
+ jmp .Lctr_enc_done
1293
+ .align 16
1294
+ .Lctr_enc_loop_done:
1295
+ add \$8, $len
1296
+ movdqu 0x00($inp), @XMM[8] # load input
1297
+ pxor @XMM[8], @XMM[0]
1298
+ movdqu @XMM[0], 0x00($out) # write output
1299
+ cmp \$2,$len
1300
+ jb .Lctr_enc_done
1301
+ movdqu 0x10($inp), @XMM[9]
1302
+ pxor @XMM[9], @XMM[1]
1303
+ movdqu @XMM[1], 0x10($out)
1304
+ je .Lctr_enc_done
1305
+ movdqu 0x20($inp), @XMM[10]
1306
+ pxor @XMM[10], @XMM[4]
1307
+ movdqu @XMM[4], 0x20($out)
1308
+ cmp \$4,$len
1309
+ jb .Lctr_enc_done
1310
+ movdqu 0x30($inp), @XMM[11]
1311
+ pxor @XMM[11], @XMM[6]
1312
+ movdqu @XMM[6], 0x30($out)
1313
+ je .Lctr_enc_done
1314
+ movdqu 0x40($inp), @XMM[12]
1315
+ pxor @XMM[12], @XMM[3]
1316
+ movdqu @XMM[3], 0x40($out)
1317
+ cmp \$6,$len
1318
+ jb .Lctr_enc_done
1319
+ movdqu 0x50($inp), @XMM[13]
1320
+ pxor @XMM[13], @XMM[7]
1321
+ movdqu @XMM[7], 0x50($out)
1322
+ je .Lctr_enc_done
1323
+ movdqu 0x60($inp), @XMM[14]
1324
+ pxor @XMM[14], @XMM[2]
1325
+ movdqu @XMM[2], 0x60($out)
1326
+ jmp .Lctr_enc_done
1327
+
1328
+ .align 16
1329
+ .Lctr_enc_short:
1330
+ lea 0x20(%rbp), $arg1
1331
+ lea 0x30(%rbp), $arg2
1332
+ lea ($key), $arg3
1333
+ call asm_AES_encrypt
1334
+ movdqu ($inp), @XMM[1]
1335
+ lea 16($inp), $inp
1336
+ mov 0x2c(%rbp), %eax # load 32-bit counter
1337
+ bswap %eax
1338
+ pxor 0x30(%rbp), @XMM[1]
1339
+ inc %eax # increment
1340
+ movdqu @XMM[1], ($out)
1341
+ bswap %eax
1342
+ lea 16($out), $out
1343
+ mov %eax, 0x2c(%rsp) # save 32-bit counter
1344
+ dec $len
1345
+ jnz .Lctr_enc_short
1346
+
1347
+ .Lctr_enc_done:
1348
+ lea (%rsp), %rax
1349
+ pxor %xmm0, %xmm0
1350
+ .Lctr_enc_bzero: # wipe key schedule [if any]
1351
+ movdqa %xmm0, 0x00(%rax)
1352
+ movdqa %xmm0, 0x10(%rax)
1353
+ lea 0x20(%rax), %rax
1354
+ cmp %rax, %rbp
1355
+ ja .Lctr_enc_bzero
1356
+
1357
+ lea (%rbp),%rsp # restore %rsp
1358
+ ___
1359
+ $code.=<<___ if ($win64);
1360
+ movaps 0x40(%rbp), %xmm6
1361
+ movaps 0x50(%rbp), %xmm7
1362
+ movaps 0x60(%rbp), %xmm8
1363
+ movaps 0x70(%rbp), %xmm9
1364
+ movaps 0x80(%rbp), %xmm10
1365
+ movaps 0x90(%rbp), %xmm11
1366
+ movaps 0xa0(%rbp), %xmm12
1367
+ movaps 0xb0(%rbp), %xmm13
1368
+ movaps 0xc0(%rbp), %xmm14
1369
+ movaps 0xd0(%rbp), %xmm15
1370
+ lea 0xa0(%rbp), %rsp
1371
+ ___
1372
+ $code.=<<___;
1373
+ mov 0x48(%rsp), %r15
1374
+ mov 0x50(%rsp), %r14
1375
+ mov 0x58(%rsp), %r13
1376
+ mov 0x60(%rsp), %r12
1377
+ mov 0x68(%rsp), %rbx
1378
+ mov 0x70(%rsp), %rax
1379
+ lea 0x78(%rsp), %rsp
1380
+ mov %rax, %rbp
1381
+ .Lctr_enc_epilogue:
1382
+ ret
1383
+ .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
1384
+ ___
1385
+ }
1386
+ $code.=<<___;
1387
+ .type _bsaes_const,\@object
1388
+ .align 64
1389
+ _bsaes_const:
1390
+ .LM0ISR: # InvShiftRows constants
1391
+ .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
1392
+ .LISRM0:
1393
+ .quad 0x01040b0e0205080f, 0x0306090c00070a0d
1394
+ .LISR:
1395
+ .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
1396
+ .LBS0: # bit-slice constants
1397
+ .quad 0x5555555555555555, 0x5555555555555555
1398
+ .LBS1:
1399
+ .quad 0x3333333333333333, 0x3333333333333333
1400
+ .LBS2:
1401
+ .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
1402
+ .LSR: # shiftrows constants
1403
+ .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
1404
+ .LSRM0:
1405
+ .quad 0x0304090e00050a0f, 0x01060b0c0207080d
1406
+ .LM0SR:
1407
+ .quad 0x0a0e02060f03070b, 0x0004080c05090d01
1408
+ .LSWPUP: # byte-swap upper dword
1409
+ .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
1410
+ .LSWPUPM0SR:
1411
+ .quad 0x0a0d02060c03070b, 0x0004080f05090e01
1412
+ .LADD1: # counter increment constants
1413
+ .quad 0x0000000000000000, 0x0000000100000000
1414
+ .LADD2:
1415
+ .quad 0x0000000000000000, 0x0000000200000000
1416
+ .LADD3:
1417
+ .quad 0x0000000000000000, 0x0000000300000000
1418
+ .LADD4:
1419
+ .quad 0x0000000000000000, 0x0000000400000000
1420
+ .LADD5:
1421
+ .quad 0x0000000000000000, 0x0000000500000000
1422
+ .LADD6:
1423
+ .quad 0x0000000000000000, 0x0000000600000000
1424
+ .LADD7:
1425
+ .quad 0x0000000000000000, 0x0000000700000000
1426
+ .LADD8:
1427
+ .quad 0x0000000000000000, 0x0000000800000000
1428
+ .Lmasks:
1429
+ .quad 0x0101010101010101, 0x0101010101010101
1430
+ .quad 0x0202020202020202, 0x0202020202020202
1431
+ .quad 0x0404040404040404, 0x0404040404040404
1432
+ .quad 0x0808080808080808, 0x0808080808080808
1433
+ .LM0:
1434
+ .quad 0x02060a0e03070b0f, 0x0004080c0105090d
1435
+ .L63:
1436
+ .quad 0x6363636363636363, 0x6363636363636363
1437
+ .asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
1438
+ .align 64
1439
+ .size _bsaes_const,.-_bsaes_const
1440
+ ___
1441
+
1442
+ # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1443
+ # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1444
+ if ($win64) {
1445
+ $rec="%rcx";
1446
+ $frame="%rdx";
1447
+ $context="%r8";
1448
+ $disp="%r9";
1449
+
1450
+ $code.=<<___;
1451
+ .extern __imp_RtlVirtualUnwind
1452
+ .type se_handler,\@abi-omnipotent
1453
+ .align 16
1454
+ se_handler:
1455
+ push %rsi
1456
+ push %rdi
1457
+ push %rbx
1458
+ push %rbp
1459
+ push %r12
1460
+ push %r13
1461
+ push %r14
1462
+ push %r15
1463
+ pushfq
1464
+ sub \$64,%rsp
1465
+
1466
+ mov 120($context),%rax # pull context->Rax
1467
+ mov 248($context),%rbx # pull context->Rip
1468
+
1469
+ mov 8($disp),%rsi # disp->ImageBase
1470
+ mov 56($disp),%r11 # disp->HandlerData
1471
+
1472
+ mov 0(%r11),%r10d # HandlerData[0]
1473
+ lea (%rsi,%r10),%r10 # prologue label
1474
+ cmp %r10,%rbx # context->Rip<prologue label
1475
+ jb .Lin_prologue
1476
+
1477
+ mov 152($context),%rax # pull context->Rsp
1478
+
1479
+ mov 4(%r11),%r10d # HandlerData[1]
1480
+ lea (%rsi,%r10),%r10 # epilogue label
1481
+ cmp %r10,%rbx # context->Rip>=epilogue label
1482
+ jae .Lin_prologue
1483
+
1484
+ mov 160($context),%rax # pull context->Rbp
1485
+
1486
+ lea 0x40(%rax),%rsi # %xmm save area
1487
+ lea 512($context),%rdi # &context.Xmm6
1488
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
1489
+ .long 0xa548f3fc # cld; rep movsq
1490
+ lea 0xa0(%rax),%rax # adjust stack pointer
1491
+
1492
+ mov 0x70(%rax),%rbp
1493
+ mov 0x68(%rax),%rbx
1494
+ mov 0x60(%rax),%r12
1495
+ mov 0x58(%rax),%r13
1496
+ mov 0x50(%rax),%r14
1497
+ mov 0x48(%rax),%r15
1498
+ lea 0x78(%rax),%rax # adjust stack pointer
1499
+ mov %rbx,144($context) # restore context->Rbx
1500
+ mov %rbp,160($context) # restore context->Rbp
1501
+ mov %r12,216($context) # restore context->R12
1502
+ mov %r13,224($context) # restore context->R13
1503
+ mov %r14,232($context) # restore context->R14
1504
+ mov %r15,240($context) # restore context->R15
1505
+
1506
+ .Lin_prologue:
1507
+ mov %rax,152($context) # restore context->Rsp
1508
+
1509
+ mov 40($disp),%rdi # disp->ContextRecord
1510
+ mov $context,%rsi # context
1511
+ mov \$`1232/8`,%ecx # sizeof(CONTEXT)
1512
+ .long 0xa548f3fc # cld; rep movsq
1513
+
1514
+ mov $disp,%rsi
1515
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1516
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
1517
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
1518
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1519
+ mov 40(%rsi),%r10 # disp->ContextRecord
1520
+ lea 56(%rsi),%r11 # &disp->HandlerData
1521
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
1522
+ mov %r10,32(%rsp) # arg5
1523
+ mov %r11,40(%rsp) # arg6
1524
+ mov %r12,48(%rsp) # arg7
1525
+ mov %rcx,56(%rsp) # arg8, (NULL)
1526
+ call *__imp_RtlVirtualUnwind(%rip)
1527
+
1528
+ mov \$1,%eax # ExceptionContinueSearch
1529
+ add \$64,%rsp
1530
+ popfq
1531
+ pop %r15
1532
+ pop %r14
1533
+ pop %r13
1534
+ pop %r12
1535
+ pop %rbp
1536
+ pop %rbx
1537
+ pop %rdi
1538
+ pop %rsi
1539
+ ret
1540
+ .size se_handler,.-se_handler
1541
+
1542
+ .section .pdata
1543
+ .align 4
1544
+ ___
1545
+ $code.=<<___;
1546
+ .rva .Lctr_enc_prologue
1547
+ .rva .Lctr_enc_epilogue
1548
+ .rva .Lctr_enc_info
1549
+
1550
+ .section .xdata
1551
+ .align 8
1552
+ ___
1553
+ $code.=<<___;
1554
+ .Lctr_enc_info:
1555
+ .byte 9,0,0,0
1556
+ .rva se_handler
1557
+ .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
1558
+ ___
1559
+ }
1560
+
1561
+ $code =~ s/\`([^\`]*)\`/eval($1)/gem;
1562
+
1563
+ print $code;
1564
+
1565
+ close STDOUT;
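
Of the upstream interfaces, only the CTR entry point (bsaes_ctr32_encrypt_blocks) plus the internal 8-block encrypt/decrypt and key-conversion helpers appear in this vendored copy of the script. For orientation, a minimal caller-side sketch in C follows. The prototype is inferred from the argument usage above ($arg1..$arg5 map to input, output, block count, expanded key, and counter block, with the round count loaded from offset 240 of the key); the demo key, buffers, and the exact declaration are illustrative assumptions, not taken from the package's own C sources.

#include <stddef.h>
#include <stdint.h>
#include <openssl/aes.h>   /* AES_KEY, AES_set_encrypt_key (vendored header) */

/* Assumed prototype, mirroring the OpenSSL-style declaration: |blocks| counts
 * 16-byte blocks and |ivec| holds the IV with a 32-bit big-endian counter in
 * its last four bytes. */
void bsaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t blocks,
                                const AES_KEY *key, const uint8_t ivec[16]);

int main(void) {
  static const uint8_t raw_key[16] = {0};  /* demo key (all zeros) */
  uint8_t counter[16] = {0};               /* demo IV/counter block */
  uint8_t in[8 * 16] = {0}, out[8 * 16];   /* eight blocks, the amount handled
                                              per inner-loop iteration */
  AES_KEY key;
  AES_set_encrypt_key(raw_key, 128, &key); /* conventional key schedule; the asm
                                              converts it to bit-sliced form */
  bsaes_ctr32_encrypt_blocks(in, out, 8, &key, counter);
  return 0;
}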