ring-native 0.0.0

Files changed (261)
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/Gemfile +3 -0
  4. data/README.md +22 -0
  5. data/Rakefile +1 -0
  6. data/ext/ring/extconf.rb +29 -0
  7. data/lib/ring/native.rb +8 -0
  8. data/lib/ring/native/version.rb +5 -0
  9. data/ring-native.gemspec +25 -0
  10. data/vendor/ring/BUILDING.md +40 -0
  11. data/vendor/ring/Cargo.toml +43 -0
  12. data/vendor/ring/LICENSE +185 -0
  13. data/vendor/ring/Makefile +35 -0
  14. data/vendor/ring/PORTING.md +163 -0
  15. data/vendor/ring/README.md +113 -0
  16. data/vendor/ring/STYLE.md +197 -0
  17. data/vendor/ring/appveyor.yml +27 -0
  18. data/vendor/ring/build.rs +108 -0
  19. data/vendor/ring/crypto/aes/aes.c +1142 -0
  20. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +25 -0
  21. data/vendor/ring/crypto/aes/aes_test.cc +93 -0
  22. data/vendor/ring/crypto/aes/asm/aes-586.pl +2368 -0
  23. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +1249 -0
  24. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +2246 -0
  25. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +1318 -0
  26. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +2084 -0
  27. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +675 -0
  28. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +1364 -0
  29. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +1565 -0
  30. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +841 -0
  31. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +1116 -0
  32. data/vendor/ring/crypto/aes/internal.h +87 -0
  33. data/vendor/ring/crypto/aes/mode_wrappers.c +61 -0
  34. data/vendor/ring/crypto/bn/add.c +394 -0
  35. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +694 -0
  36. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +1503 -0
  37. data/vendor/ring/crypto/bn/asm/bn-586.pl +774 -0
  38. data/vendor/ring/crypto/bn/asm/co-586.pl +287 -0
  39. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +1882 -0
  40. data/vendor/ring/crypto/bn/asm/x86-mont.pl +592 -0
  41. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +599 -0
  42. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +1393 -0
  43. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +3507 -0
  44. data/vendor/ring/crypto/bn/bn.c +352 -0
  45. data/vendor/ring/crypto/bn/bn_asn1.c +74 -0
  46. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +25 -0
  47. data/vendor/ring/crypto/bn/bn_test.cc +1696 -0
  48. data/vendor/ring/crypto/bn/cmp.c +200 -0
  49. data/vendor/ring/crypto/bn/convert.c +433 -0
  50. data/vendor/ring/crypto/bn/ctx.c +311 -0
  51. data/vendor/ring/crypto/bn/div.c +594 -0
  52. data/vendor/ring/crypto/bn/exponentiation.c +1335 -0
  53. data/vendor/ring/crypto/bn/gcd.c +711 -0
  54. data/vendor/ring/crypto/bn/generic.c +1019 -0
  55. data/vendor/ring/crypto/bn/internal.h +316 -0
  56. data/vendor/ring/crypto/bn/montgomery.c +516 -0
  57. data/vendor/ring/crypto/bn/mul.c +888 -0
  58. data/vendor/ring/crypto/bn/prime.c +829 -0
  59. data/vendor/ring/crypto/bn/random.c +334 -0
  60. data/vendor/ring/crypto/bn/rsaz_exp.c +262 -0
  61. data/vendor/ring/crypto/bn/rsaz_exp.h +53 -0
  62. data/vendor/ring/crypto/bn/shift.c +276 -0
  63. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +25 -0
  64. data/vendor/ring/crypto/bytestring/bytestring_test.cc +421 -0
  65. data/vendor/ring/crypto/bytestring/cbb.c +399 -0
  66. data/vendor/ring/crypto/bytestring/cbs.c +227 -0
  67. data/vendor/ring/crypto/bytestring/internal.h +46 -0
  68. data/vendor/ring/crypto/chacha/chacha_generic.c +140 -0
  69. data/vendor/ring/crypto/chacha/chacha_vec.c +323 -0
  70. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +1447 -0
  71. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +153 -0
  72. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +25 -0
  73. data/vendor/ring/crypto/cipher/e_aes.c +390 -0
  74. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +208 -0
  75. data/vendor/ring/crypto/cipher/internal.h +173 -0
  76. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +543 -0
  77. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +9 -0
  78. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +475 -0
  79. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +23 -0
  80. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +422 -0
  81. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +484 -0
  82. data/vendor/ring/crypto/cipher/test/cipher_test.txt +100 -0
  83. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +25 -0
  84. data/vendor/ring/crypto/constant_time_test.c +304 -0
  85. data/vendor/ring/crypto/cpu-arm-asm.S +32 -0
  86. data/vendor/ring/crypto/cpu-arm.c +199 -0
  87. data/vendor/ring/crypto/cpu-intel.c +261 -0
  88. data/vendor/ring/crypto/crypto.c +151 -0
  89. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +2118 -0
  90. data/vendor/ring/crypto/curve25519/curve25519.c +4888 -0
  91. data/vendor/ring/crypto/curve25519/x25519_test.cc +128 -0
  92. data/vendor/ring/crypto/digest/md32_common.h +181 -0
  93. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +2725 -0
  94. data/vendor/ring/crypto/ec/ec.c +193 -0
  95. data/vendor/ring/crypto/ec/ec_curves.c +61 -0
  96. data/vendor/ring/crypto/ec/ec_key.c +228 -0
  97. data/vendor/ring/crypto/ec/ec_montgomery.c +114 -0
  98. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +25 -0
  99. data/vendor/ring/crypto/ec/internal.h +243 -0
  100. data/vendor/ring/crypto/ec/oct.c +253 -0
  101. data/vendor/ring/crypto/ec/p256-64.c +1794 -0
  102. data/vendor/ring/crypto/ec/p256-x86_64-table.h +9548 -0
  103. data/vendor/ring/crypto/ec/p256-x86_64.c +509 -0
  104. data/vendor/ring/crypto/ec/simple.c +1007 -0
  105. data/vendor/ring/crypto/ec/util-64.c +183 -0
  106. data/vendor/ring/crypto/ec/wnaf.c +508 -0
  107. data/vendor/ring/crypto/ecdh/ecdh.c +155 -0
  108. data/vendor/ring/crypto/ecdsa/ecdsa.c +304 -0
  109. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +193 -0
  110. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +25 -0
  111. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +327 -0
  112. data/vendor/ring/crypto/header_removed.h +17 -0
  113. data/vendor/ring/crypto/internal.h +495 -0
  114. data/vendor/ring/crypto/libring.Windows.vcxproj +101 -0
  115. data/vendor/ring/crypto/mem.c +98 -0
  116. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +1045 -0
  117. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +517 -0
  118. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +1393 -0
  119. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +1741 -0
  120. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +422 -0
  121. data/vendor/ring/crypto/modes/ctr.c +226 -0
  122. data/vendor/ring/crypto/modes/gcm.c +1206 -0
  123. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +25 -0
  124. data/vendor/ring/crypto/modes/gcm_test.c +348 -0
  125. data/vendor/ring/crypto/modes/internal.h +299 -0
  126. data/vendor/ring/crypto/perlasm/arm-xlate.pl +170 -0
  127. data/vendor/ring/crypto/perlasm/readme +100 -0
  128. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +1164 -0
  129. data/vendor/ring/crypto/perlasm/x86asm.pl +292 -0
  130. data/vendor/ring/crypto/perlasm/x86gas.pl +263 -0
  131. data/vendor/ring/crypto/perlasm/x86masm.pl +200 -0
  132. data/vendor/ring/crypto/perlasm/x86nasm.pl +187 -0
  133. data/vendor/ring/crypto/poly1305/poly1305.c +331 -0
  134. data/vendor/ring/crypto/poly1305/poly1305_arm.c +301 -0
  135. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +2015 -0
  136. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +25 -0
  137. data/vendor/ring/crypto/poly1305/poly1305_test.cc +80 -0
  138. data/vendor/ring/crypto/poly1305/poly1305_test.txt +52 -0
  139. data/vendor/ring/crypto/poly1305/poly1305_vec.c +892 -0
  140. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +75 -0
  141. data/vendor/ring/crypto/rand/internal.h +32 -0
  142. data/vendor/ring/crypto/rand/rand.c +189 -0
  143. data/vendor/ring/crypto/rand/urandom.c +219 -0
  144. data/vendor/ring/crypto/rand/windows.c +56 -0
  145. data/vendor/ring/crypto/refcount_c11.c +66 -0
  146. data/vendor/ring/crypto/refcount_lock.c +53 -0
  147. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +25 -0
  148. data/vendor/ring/crypto/refcount_test.c +58 -0
  149. data/vendor/ring/crypto/rsa/blinding.c +462 -0
  150. data/vendor/ring/crypto/rsa/internal.h +108 -0
  151. data/vendor/ring/crypto/rsa/padding.c +300 -0
  152. data/vendor/ring/crypto/rsa/rsa.c +450 -0
  153. data/vendor/ring/crypto/rsa/rsa_asn1.c +261 -0
  154. data/vendor/ring/crypto/rsa/rsa_impl.c +944 -0
  155. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +25 -0
  156. data/vendor/ring/crypto/rsa/rsa_test.cc +437 -0
  157. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +436 -0
  158. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +2390 -0
  159. data/vendor/ring/crypto/sha/asm/sha256-586.pl +1275 -0
  160. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +735 -0
  161. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +14 -0
  162. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +14 -0
  163. data/vendor/ring/crypto/sha/asm/sha512-586.pl +911 -0
  164. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +666 -0
  165. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +14 -0
  166. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +14 -0
  167. data/vendor/ring/crypto/sha/sha1.c +271 -0
  168. data/vendor/ring/crypto/sha/sha256.c +204 -0
  169. data/vendor/ring/crypto/sha/sha512.c +355 -0
  170. data/vendor/ring/crypto/test/file_test.cc +326 -0
  171. data/vendor/ring/crypto/test/file_test.h +181 -0
  172. data/vendor/ring/crypto/test/malloc.cc +150 -0
  173. data/vendor/ring/crypto/test/scoped_types.h +95 -0
  174. data/vendor/ring/crypto/test/test.Windows.vcxproj +35 -0
  175. data/vendor/ring/crypto/test/test_util.cc +46 -0
  176. data/vendor/ring/crypto/test/test_util.h +41 -0
  177. data/vendor/ring/crypto/thread_none.c +55 -0
  178. data/vendor/ring/crypto/thread_pthread.c +165 -0
  179. data/vendor/ring/crypto/thread_test.Windows.vcxproj +25 -0
  180. data/vendor/ring/crypto/thread_test.c +200 -0
  181. data/vendor/ring/crypto/thread_win.c +282 -0
  182. data/vendor/ring/examples/checkdigest.rs +103 -0
  183. data/vendor/ring/include/openssl/aes.h +121 -0
  184. data/vendor/ring/include/openssl/arm_arch.h +129 -0
  185. data/vendor/ring/include/openssl/base.h +156 -0
  186. data/vendor/ring/include/openssl/bn.h +794 -0
  187. data/vendor/ring/include/openssl/buffer.h +18 -0
  188. data/vendor/ring/include/openssl/bytestring.h +235 -0
  189. data/vendor/ring/include/openssl/chacha.h +37 -0
  190. data/vendor/ring/include/openssl/cmac.h +76 -0
  191. data/vendor/ring/include/openssl/cpu.h +184 -0
  192. data/vendor/ring/include/openssl/crypto.h +43 -0
  193. data/vendor/ring/include/openssl/curve25519.h +88 -0
  194. data/vendor/ring/include/openssl/ec.h +225 -0
  195. data/vendor/ring/include/openssl/ec_key.h +129 -0
  196. data/vendor/ring/include/openssl/ecdh.h +110 -0
  197. data/vendor/ring/include/openssl/ecdsa.h +156 -0
  198. data/vendor/ring/include/openssl/err.h +201 -0
  199. data/vendor/ring/include/openssl/mem.h +101 -0
  200. data/vendor/ring/include/openssl/obj_mac.h +71 -0
  201. data/vendor/ring/include/openssl/opensslfeatures.h +68 -0
  202. data/vendor/ring/include/openssl/opensslv.h +18 -0
  203. data/vendor/ring/include/openssl/ossl_typ.h +18 -0
  204. data/vendor/ring/include/openssl/poly1305.h +51 -0
  205. data/vendor/ring/include/openssl/rand.h +70 -0
  206. data/vendor/ring/include/openssl/rsa.h +399 -0
  207. data/vendor/ring/include/openssl/thread.h +133 -0
  208. data/vendor/ring/include/openssl/type_check.h +71 -0
  209. data/vendor/ring/mk/Common.props +63 -0
  210. data/vendor/ring/mk/Windows.props +42 -0
  211. data/vendor/ring/mk/WindowsTest.props +18 -0
  212. data/vendor/ring/mk/appveyor.bat +62 -0
  213. data/vendor/ring/mk/bottom_of_makefile.mk +54 -0
  214. data/vendor/ring/mk/ring.mk +266 -0
  215. data/vendor/ring/mk/top_of_makefile.mk +214 -0
  216. data/vendor/ring/mk/travis.sh +40 -0
  217. data/vendor/ring/mk/update-travis-yml.py +229 -0
  218. data/vendor/ring/ring.sln +153 -0
  219. data/vendor/ring/src/aead.rs +682 -0
  220. data/vendor/ring/src/agreement.rs +248 -0
  221. data/vendor/ring/src/c.rs +129 -0
  222. data/vendor/ring/src/constant_time.rs +37 -0
  223. data/vendor/ring/src/der.rs +96 -0
  224. data/vendor/ring/src/digest.rs +690 -0
  225. data/vendor/ring/src/digest_tests.txt +57 -0
  226. data/vendor/ring/src/ecc.rs +28 -0
  227. data/vendor/ring/src/ecc_build.rs +279 -0
  228. data/vendor/ring/src/ecc_curves.rs +117 -0
  229. data/vendor/ring/src/ed25519_tests.txt +2579 -0
  230. data/vendor/ring/src/exe_tests.rs +46 -0
  231. data/vendor/ring/src/ffi.rs +29 -0
  232. data/vendor/ring/src/file_test.rs +187 -0
  233. data/vendor/ring/src/hkdf.rs +153 -0
  234. data/vendor/ring/src/hkdf_tests.txt +59 -0
  235. data/vendor/ring/src/hmac.rs +414 -0
  236. data/vendor/ring/src/hmac_tests.txt +97 -0
  237. data/vendor/ring/src/input.rs +312 -0
  238. data/vendor/ring/src/lib.rs +41 -0
  239. data/vendor/ring/src/pbkdf2.rs +265 -0
  240. data/vendor/ring/src/pbkdf2_tests.txt +113 -0
  241. data/vendor/ring/src/polyfill.rs +57 -0
  242. data/vendor/ring/src/rand.rs +28 -0
  243. data/vendor/ring/src/signature.rs +314 -0
  244. data/vendor/ring/third-party/NIST/README.md +9 -0
  245. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +263 -0
  246. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +309 -0
  247. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +267 -0
  248. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +263 -0
  249. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +309 -0
  250. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +267 -0
  251. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +263 -0
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +309 -0
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +267 -0
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +519 -0
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +309 -0
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +523 -0
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +519 -0
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +309 -0
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +523 -0
  260. data/vendor/ring/third-party/NIST/sha256sums.txt +1 -0
  261. metadata +333 -0
data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl
@@ -0,0 +1,1565 @@
1
+ #!/usr/bin/env perl
2
+
3
+ ###################################################################
4
+ ### AES-128 [originally in CTR mode] ###
5
+ ### bitsliced implementation for Intel Core 2 processors ###
6
+ ### requires support of SSE extensions up to SSSE3 ###
7
+ ### Author: Emilia Käsper and Peter Schwabe ###
8
+ ### Date: 2009-03-19 ###
9
+ ### Public domain ###
10
+ ### ###
11
+ ### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
12
+ ### further information. ###
13
+ ###################################################################
14
+ #
15
+ # September 2011.
16
+ #
17
+ # Started as a transliteration to "perlasm", the original code has
18
+ # undergone the following changes:
19
+ #
20
+ # - code was made position-independent;
21
+ # - rounds were folded into a loop resulting in >5x size reduction
22
+ # from 12.5KB to 2.2KB;
23
+ # - the above was possible thanks to a mixcolumns() modification that
24
+ # allowed its output to be fed back to aesenc[last]; this was
25
+ # achieved at the cost of two additional inter-register moves;
26
+ # - some instruction reordering and interleaving;
27
+ # - this module doesn't implement a key setup subroutine; instead it
28
+ # relies on conversion of the "conventional" key schedule as returned
29
+ # by AES_set_encrypt_key (see discussion below);
30
+ # - first and last round keys are treated differently, which made it
31
+ # possible to skip one shiftrows(), reduce the bit-sliced key schedule and
32
+ # speed up conversion by 22%;
33
+ # - support for 192- and 256-bit keys was added;
34
+ #
35
+ # Resulting performance in CPU cycles spent to encrypt one byte out
36
+ # of 4096-byte buffer with 128-bit key is:
37
+ #
38
+ # Emilia's this(*) difference
39
+ #
40
+ # Core 2 9.30 8.69 +7%
41
+ # Nehalem(**) 7.63 6.88 +11%
42
+ # Atom 17.1 16.4 +4%
43
+ # Silvermont - 12.9
44
+ #
45
+ # (*) Comparison is not completely fair, because "this" is ECB,
46
+ # i.e. no extra processing such as counter values calculation
47
+ # and xor-ing input as in Emilia's CTR implementation is
48
+ # performed. However, the CTR calculations account for no more
49
+ # than 1% of the total time, so the comparison is *rather* fair.
50
+ #
51
+ # (**) Results were collected on Westmere, which is considered to
52
+ # be equivalent to Nehalem for this code.
53
+ #
54
+ # As for the key schedule conversion subroutine: the OpenSSL interface
55
+ # relies on per-invocation on-the-fly conversion. This naturally
56
+ # has an impact on performance, especially for short inputs. Conversion
57
+ # time in CPU cycles and its ratio to CPU cycles spent in 8x block
58
+ # function is:
59
+ #
60
+ # conversion conversion/8x block
61
+ # Core 2 240 0.22
62
+ # Nehalem 180 0.20
63
+ # Atom 430 0.20
64
+ #
65
+ # The ratio values mean that 128-byte blocks will be processed
66
+ # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
67
+ # etc. Then keep in mind that input sizes not divisible by 128 are
68
+ # *effectively* slower, especially the shortest ones, e.g. consecutive
69
+ # 144-byte blocks are processed 44% slower than one would expect,
70
+ # 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
71
+ # it's still faster than ["hyper-threading-safe" code path in]
72
+ # aes-x86_64.pl on all lengths above 64 bytes...
73
+ #
74
+ # October 2011.
75
+ #
76
+ # Add decryption procedure. Performance in CPU cycles spent to decrypt
77
+ # one byte out of 4096-byte buffer with 128-bit key is:
78
+ #
79
+ # Core 2 9.98
80
+ # Nehalem 7.80
81
+ # Atom 17.9
82
+ # Silvermont 14.0
83
+ #
84
+ # November 2011.
85
+ #
86
+ # Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
87
+ # suboptimal, but XTS is meant to be used with larger blocks...
88
+ #
89
+ # <appro@openssl.org>
90
+
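The header above describes a bit-sliced design: instead of keeping one state byte per register lane, each register holds a single bit position of many state bytes, so the whole S-box layer can be evaluated with pxor/pand/por across all of them at once rather than with table lookups. A minimal stand-alone sketch of that re-packing, in plain Perl on 8 lanes (the helper names are illustrative and not part of this module, which performs the same transpose on 128-bit XMM registers via swapmove/bitslice further down):

    use strict;
    use warnings;

    # Re-pack up to 8 parallel state bytes so that plane $bit collects bit
    # $bit of every byte; boolean ops on the planes then act on all bytes.
    sub to_bitplanes {
        my @bytes  = @_;                  # up to 8 bytes, one per lane
        my @planes = (0) x 8;
        for my $lane (0 .. $#bytes) {
            for my $bit (0 .. 7) {
                $planes[$bit] |= (($bytes[$lane] >> $bit) & 1) << $lane;
            }
        }
        return @planes;
    }

    sub from_bitplanes {
        my @planes = @_;
        my @bytes  = (0) x 8;
        for my $lane (0 .. 7) {
            for my $bit (0 .. 7) {
                $bytes[$lane] |= (($planes[$bit] >> $lane) & 1) << $bit;
            }
        }
        return @bytes;
    }

    # Round-trip check on eight sample bytes.
    my @in  = (0x00, 0x01, 0x53, 0x63, 0x80, 0xAA, 0xF0, 0xFF);
    my @out = from_bitplanes(to_bitplanes(@in));
    printf "%02x %02x %02x %02x %02x %02x %02x %02x\n", @out;   # same bytes back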
91
+ $flavour = shift;
92
+ $output = shift;
93
+ if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
94
+
95
+ $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
96
+
97
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
98
+ ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
99
+ ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
100
+ die "can't locate x86_64-xlate.pl";
101
+
102
+ open OUT,"| \"$^X\" $xlate $flavour $output";
103
+ *STDOUT=*OUT;
104
+
105
+ my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
106
+ my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
107
+
108
+ {
109
+ my ($key,$rounds,$const)=("%rax","%r10d","%r11");
110
+
111
+ sub Sbox {
112
+ # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
113
+ # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
114
+ my @b=@_[0..7];
115
+ my @t=@_[8..11];
116
+ my @s=@_[12..15];
117
+ &InBasisChange (@b);
118
+ &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
119
+ &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
120
+ }
121
+
122
+ sub InBasisChange {
123
+ # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
124
+ # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
125
+ my @b=@_[0..7];
126
+ $code.=<<___;
127
+ pxor @b[6], @b[5]
128
+ pxor @b[1], @b[2]
129
+ pxor @b[0], @b[3]
130
+ pxor @b[2], @b[6]
131
+ pxor @b[0], @b[5]
132
+
133
+ pxor @b[3], @b[6]
134
+ pxor @b[7], @b[3]
135
+ pxor @b[5], @b[7]
136
+ pxor @b[4], @b[3]
137
+ pxor @b[5], @b[4]
138
+ pxor @b[1], @b[3]
139
+
140
+ pxor @b[7], @b[2]
141
+ pxor @b[5], @b[1]
142
+ ___
143
+ }
144
+
145
+ sub OutBasisChange {
146
+ # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
147
+ # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
148
+ my @b=@_[0..7];
149
+ $code.=<<___;
150
+ pxor @b[6], @b[0]
151
+ pxor @b[4], @b[1]
152
+ pxor @b[0], @b[2]
153
+ pxor @b[6], @b[4]
154
+ pxor @b[1], @b[6]
155
+
156
+ pxor @b[5], @b[1]
157
+ pxor @b[3], @b[5]
158
+ pxor @b[7], @b[3]
159
+ pxor @b[5], @b[7]
160
+ pxor @b[5], @b[2]
161
+
162
+ pxor @b[7], @b[4]
163
+ ___
164
+ }
165
+
166
+ sub InvSbox {
167
+ # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
168
+ # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
169
+ my @b=@_[0..7];
170
+ my @t=@_[8..11];
171
+ my @s=@_[12..15];
172
+ &InvInBasisChange (@b);
173
+ &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
174
+ &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
175
+ }
176
+
177
+ sub InvInBasisChange { # OutBasisChange in reverse
178
+ my @b=@_[5,1,2,6,3,7,0,4];
179
+ $code.=<<___
180
+ pxor @b[7], @b[4]
181
+
182
+ pxor @b[5], @b[7]
183
+ pxor @b[5], @b[2]
184
+ pxor @b[7], @b[3]
185
+ pxor @b[3], @b[5]
186
+ pxor @b[5], @b[1]
187
+
188
+ pxor @b[1], @b[6]
189
+ pxor @b[0], @b[2]
190
+ pxor @b[6], @b[4]
191
+ pxor @b[6], @b[0]
192
+ pxor @b[4], @b[1]
193
+ ___
194
+ }
195
+
196
+ sub InvOutBasisChange { # InBasisChange in reverse
197
+ my @b=@_[2,5,7,3,6,1,0,4];
198
+ $code.=<<___;
199
+ pxor @b[5], @b[1]
200
+ pxor @b[7], @b[2]
201
+
202
+ pxor @b[1], @b[3]
203
+ pxor @b[5], @b[4]
204
+ pxor @b[5], @b[7]
205
+ pxor @b[4], @b[3]
206
+ pxor @b[0], @b[5]
207
+ pxor @b[7], @b[3]
208
+ pxor @b[2], @b[6]
209
+ pxor @b[1], @b[2]
210
+ pxor @b[3], @b[6]
211
+
212
+ pxor @b[0], @b[3]
213
+ pxor @b[6], @b[5]
214
+ ___
215
+ }
216
+
217
+ sub Mul_GF4 {
218
+ #;*************************************************************
219
+ #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
220
+ #;*************************************************************
221
+ my ($x0,$x1,$y0,$y1,$t0)=@_;
222
+ $code.=<<___;
223
+ movdqa $y0, $t0
224
+ pxor $y1, $t0
225
+ pand $x0, $t0
226
+ pxor $x1, $x0
227
+ pand $y0, $x1
228
+ pand $y1, $x0
229
+ pxor $x1, $x0
230
+ pxor $t0, $x1
231
+ ___
232
+ }
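A scalar transcription of the Mul_GF4 sequence may make the register dance easier to follow: each argument stands for one bit-plane (an ordinary integer here instead of an XMM register), and every bit position holds one two-bit field element whose bits are split across the x0/x1 (or y0/y1) planes. The helper below is illustrative and simply mirrors the pxor/pand steps one for one:

    use strict;
    use warnings;

    # Bit-plane transcription of Mul_GF4: returns the product planes (r0, r1).
    sub mul_gf4_planes {
        my ($x0, $x1, $y0, $y1) = @_;
        my $t0 = ($y0 ^ $y1) & $x0;      # movdqa/pxor/pand: t0 = (y0^y1)&x0
        my $a  = ($x0 ^ $x1) & $y1;      # pxor $x1,$x0; pand $y1,$x0
        my $b  = $x1 & $y0;              # pand $y0,$x1
        return ($a ^ $b, $b ^ $t0);      # pxor $x1,$x0; pxor $t0,$x1
    }

    # One-lane sanity check (single-bit planes): in this encoding the
    # all-ones pair behaves as the field's multiplicative identity, so
    # multiplying (x1,x0) = (0,1) by it returns (0,1) unchanged.
    my ($r0, $r1) = mul_gf4_planes(1, 0, 1, 1);
    printf "r0=%d r1=%d\n", $r0, $r1;    # prints r0=1 r1=0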
233
+
234
+ sub Mul_GF4_N { # not used, see next subroutine
235
+ # multiply and scale by N
236
+ my ($x0,$x1,$y0,$y1,$t0)=@_;
237
+ $code.=<<___;
238
+ movdqa $y0, $t0
239
+ pxor $y1, $t0
240
+ pand $x0, $t0
241
+ pxor $x1, $x0
242
+ pand $y0, $x1
243
+ pand $y1, $x0
244
+ pxor $x0, $x1
245
+ pxor $t0, $x0
246
+ ___
247
+ }
248
+
249
+ sub Mul_GF4_N_GF4 {
250
+ # interleaved Mul_GF4_N and Mul_GF4
251
+ my ($x0,$x1,$y0,$y1,$t0,
252
+ $x2,$x3,$y2,$y3,$t1)=@_;
253
+ $code.=<<___;
254
+ movdqa $y0, $t0
255
+ movdqa $y2, $t1
256
+ pxor $y1, $t0
257
+ pxor $y3, $t1
258
+ pand $x0, $t0
259
+ pand $x2, $t1
260
+ pxor $x1, $x0
261
+ pxor $x3, $x2
262
+ pand $y0, $x1
263
+ pand $y2, $x3
264
+ pand $y1, $x0
265
+ pand $y3, $x2
266
+ pxor $x0, $x1
267
+ pxor $x3, $x2
268
+ pxor $t0, $x0
269
+ pxor $t1, $x3
270
+ ___
271
+ }
272
+ sub Mul_GF16_2 {
273
+ my @x=@_[0..7];
274
+ my @y=@_[8..11];
275
+ my @t=@_[12..15];
276
+ $code.=<<___;
277
+ movdqa @x[0], @t[0]
278
+ movdqa @x[1], @t[1]
279
+ ___
280
+ &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
281
+ $code.=<<___;
282
+ pxor @x[2], @t[0]
283
+ pxor @x[3], @t[1]
284
+ pxor @y[2], @y[0]
285
+ pxor @y[3], @y[1]
286
+ ___
287
+ Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
288
+ @x[2], @x[3], @y[2], @y[3], @t[2]);
289
+ $code.=<<___;
290
+ pxor @t[0], @x[0]
291
+ pxor @t[0], @x[2]
292
+ pxor @t[1], @x[1]
293
+ pxor @t[1], @x[3]
294
+
295
+ movdqa @x[4], @t[0]
296
+ movdqa @x[5], @t[1]
297
+ pxor @x[6], @t[0]
298
+ pxor @x[7], @t[1]
299
+ ___
300
+ &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
301
+ @x[6], @x[7], @y[2], @y[3], @t[2]);
302
+ $code.=<<___;
303
+ pxor @y[2], @y[0]
304
+ pxor @y[3], @y[1]
305
+ ___
306
+ &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
307
+ $code.=<<___;
308
+ pxor @t[0], @x[4]
309
+ pxor @t[0], @x[6]
310
+ pxor @t[1], @x[5]
311
+ pxor @t[1], @x[7]
312
+ ___
313
+ }
314
+ sub Inv_GF256 {
315
+ #;********************************************************************
316
+ #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
317
+ #;********************************************************************
318
+ my @x=@_[0..7];
319
+ my @t=@_[8..11];
320
+ my @s=@_[12..15];
321
+ # direct optimizations from hardware
322
+ $code.=<<___;
323
+ movdqa @x[4], @t[3]
324
+ movdqa @x[5], @t[2]
325
+ movdqa @x[1], @t[1]
326
+ movdqa @x[7], @s[1]
327
+ movdqa @x[0], @s[0]
328
+
329
+ pxor @x[6], @t[3]
330
+ pxor @x[7], @t[2]
331
+ pxor @x[3], @t[1]
332
+ movdqa @t[3], @s[2]
333
+ pxor @x[6], @s[1]
334
+ movdqa @t[2], @t[0]
335
+ pxor @x[2], @s[0]
336
+ movdqa @t[3], @s[3]
337
+
338
+ por @t[1], @t[2]
339
+ por @s[0], @t[3]
340
+ pxor @t[0], @s[3]
341
+ pand @s[0], @s[2]
342
+ pxor @t[1], @s[0]
343
+ pand @t[1], @t[0]
344
+ pand @s[0], @s[3]
345
+ movdqa @x[3], @s[0]
346
+ pxor @x[2], @s[0]
347
+ pand @s[0], @s[1]
348
+ pxor @s[1], @t[3]
349
+ pxor @s[1], @t[2]
350
+ movdqa @x[4], @s[1]
351
+ movdqa @x[1], @s[0]
352
+ pxor @x[5], @s[1]
353
+ pxor @x[0], @s[0]
354
+ movdqa @s[1], @t[1]
355
+ pand @s[0], @s[1]
356
+ por @s[0], @t[1]
357
+ pxor @s[1], @t[0]
358
+ pxor @s[3], @t[3]
359
+ pxor @s[2], @t[2]
360
+ pxor @s[3], @t[1]
361
+ movdqa @x[7], @s[0]
362
+ pxor @s[2], @t[0]
363
+ movdqa @x[6], @s[1]
364
+ pxor @s[2], @t[1]
365
+ movdqa @x[5], @s[2]
366
+ pand @x[3], @s[0]
367
+ movdqa @x[4], @s[3]
368
+ pand @x[2], @s[1]
369
+ pand @x[1], @s[2]
370
+ por @x[0], @s[3]
371
+ pxor @s[0], @t[3]
372
+ pxor @s[1], @t[2]
373
+ pxor @s[2], @t[1]
374
+ pxor @s[3], @t[0]
375
+
376
+ #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
377
+
378
+ # new smaller inversion
379
+
380
+ movdqa @t[3], @s[0]
381
+ pand @t[1], @t[3]
382
+ pxor @t[2], @s[0]
383
+
384
+ movdqa @t[0], @s[2]
385
+ movdqa @s[0], @s[3]
386
+ pxor @t[3], @s[2]
387
+ pand @s[2], @s[3]
388
+
389
+ movdqa @t[1], @s[1]
390
+ pxor @t[2], @s[3]
391
+ pxor @t[0], @s[1]
392
+
393
+ pxor @t[2], @t[3]
394
+
395
+ pand @t[3], @s[1]
396
+
397
+ movdqa @s[2], @t[2]
398
+ pxor @t[0], @s[1]
399
+
400
+ pxor @s[1], @t[2]
401
+ pxor @s[1], @t[1]
402
+
403
+ pand @t[0], @t[2]
404
+
405
+ pxor @t[2], @s[2]
406
+ pxor @t[2], @t[1]
407
+
408
+ pand @s[3], @s[2]
409
+
410
+ pxor @s[0], @s[2]
411
+ ___
412
+ # output in s3, s2, s1, t1
413
+
414
+ # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
415
+
416
+ # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
417
+ &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
418
+
419
+ ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
420
+ }
421
+
422
+ # AES linear components
423
+
424
+ sub ShiftRows {
425
+ my @x=@_[0..7];
426
+ my $mask=pop;
427
+ $code.=<<___;
428
+ pxor 0x00($key),@x[0]
429
+ pxor 0x10($key),@x[1]
430
+ pxor 0x20($key),@x[2]
431
+ pxor 0x30($key),@x[3]
432
+ pshufb $mask,@x[0]
433
+ pshufb $mask,@x[1]
434
+ pxor 0x40($key),@x[4]
435
+ pxor 0x50($key),@x[5]
436
+ pshufb $mask,@x[2]
437
+ pshufb $mask,@x[3]
438
+ pxor 0x60($key),@x[6]
439
+ pxor 0x70($key),@x[7]
440
+ pshufb $mask,@x[4]
441
+ pshufb $mask,@x[5]
442
+ pshufb $mask,@x[6]
443
+ pshufb $mask,@x[7]
444
+ lea 0x80($key),$key
445
+ ___
446
+ }
447
+
448
+ sub MixColumns {
449
+ # modified to emit output in order suitable for feeding back to aesenc[last]
450
+ my @x=@_[0..7];
451
+ my @t=@_[8..15];
452
+ my $inv=@_[16]; # optional
453
+ $code.=<<___;
454
+ pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
455
+ pshufd \$0x93, @x[1], @t[1]
456
+ pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
457
+ pshufd \$0x93, @x[2], @t[2]
458
+ pxor @t[1], @x[1]
459
+ pshufd \$0x93, @x[3], @t[3]
460
+ pxor @t[2], @x[2]
461
+ pshufd \$0x93, @x[4], @t[4]
462
+ pxor @t[3], @x[3]
463
+ pshufd \$0x93, @x[5], @t[5]
464
+ pxor @t[4], @x[4]
465
+ pshufd \$0x93, @x[6], @t[6]
466
+ pxor @t[5], @x[5]
467
+ pshufd \$0x93, @x[7], @t[7]
468
+ pxor @t[6], @x[6]
469
+ pxor @t[7], @x[7]
470
+
471
+ pxor @x[0], @t[1]
472
+ pxor @x[7], @t[0]
473
+ pxor @x[7], @t[1]
474
+ pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
475
+ pxor @x[1], @t[2]
476
+ pshufd \$0x4E, @x[1], @x[1]
477
+ pxor @x[4], @t[5]
478
+ pxor @t[0], @x[0]
479
+ pxor @x[5], @t[6]
480
+ pxor @t[1], @x[1]
481
+ pxor @x[3], @t[4]
482
+ pshufd \$0x4E, @x[4], @t[0]
483
+ pxor @x[6], @t[7]
484
+ pshufd \$0x4E, @x[5], @t[1]
485
+ pxor @x[2], @t[3]
486
+ pshufd \$0x4E, @x[3], @x[4]
487
+ pxor @x[7], @t[3]
488
+ pshufd \$0x4E, @x[7], @x[5]
489
+ pxor @x[7], @t[4]
490
+ pshufd \$0x4E, @x[6], @x[3]
491
+ pxor @t[4], @t[0]
492
+ pshufd \$0x4E, @x[2], @x[6]
493
+ pxor @t[5], @t[1]
494
+ ___
495
+ $code.=<<___ if (!$inv);
496
+ pxor @t[3], @x[4]
497
+ pxor @t[7], @x[5]
498
+ pxor @t[6], @x[3]
499
+ movdqa @t[0], @x[2]
500
+ pxor @t[2], @x[6]
501
+ movdqa @t[1], @x[7]
502
+ ___
503
+ $code.=<<___ if ($inv);
504
+ pxor @x[4], @t[3]
505
+ pxor @t[7], @x[5]
506
+ pxor @x[3], @t[6]
507
+ movdqa @t[0], @x[3]
508
+ pxor @t[2], @x[6]
509
+ movdqa @t[6], @x[2]
510
+ movdqa @t[1], @x[7]
511
+ movdqa @x[6], @x[4]
512
+ movdqa @t[3], @x[6]
513
+ ___
514
+ }
515
+
516
+ sub InvMixColumns_orig {
517
+ my @x=@_[0..7];
518
+ my @t=@_[8..15];
519
+
520
+ $code.=<<___;
521
+ # multiplication by 0x0e
522
+ pshufd \$0x93, @x[7], @t[7]
523
+ movdqa @x[2], @t[2]
524
+ pxor @x[5], @x[7] # 7 5
525
+ pxor @x[5], @x[2] # 2 5
526
+ pshufd \$0x93, @x[0], @t[0]
527
+ movdqa @x[5], @t[5]
528
+ pxor @x[0], @x[5] # 5 0 [1]
529
+ pxor @x[1], @x[0] # 0 1
530
+ pshufd \$0x93, @x[1], @t[1]
531
+ pxor @x[2], @x[1] # 1 25
532
+ pxor @x[6], @x[0] # 01 6 [2]
533
+ pxor @x[3], @x[1] # 125 3 [4]
534
+ pshufd \$0x93, @x[3], @t[3]
535
+ pxor @x[0], @x[2] # 25 016 [3]
536
+ pxor @x[7], @x[3] # 3 75
537
+ pxor @x[6], @x[7] # 75 6 [0]
538
+ pshufd \$0x93, @x[6], @t[6]
539
+ movdqa @x[4], @t[4]
540
+ pxor @x[4], @x[6] # 6 4
541
+ pxor @x[3], @x[4] # 4 375 [6]
542
+ pxor @x[7], @x[3] # 375 756=36
543
+ pxor @t[5], @x[6] # 64 5 [7]
544
+ pxor @t[2], @x[3] # 36 2
545
+ pxor @t[4], @x[3] # 362 4 [5]
546
+ pshufd \$0x93, @t[5], @t[5]
547
+ ___
548
+ my @y = @x[7,5,0,2,1,3,4,6];
549
+ $code.=<<___;
550
+ # multiplication by 0x0b
551
+ pxor @y[0], @y[1]
552
+ pxor @t[0], @y[0]
553
+ pxor @t[1], @y[1]
554
+ pshufd \$0x93, @t[2], @t[2]
555
+ pxor @t[5], @y[0]
556
+ pxor @t[6], @y[1]
557
+ pxor @t[7], @y[0]
558
+ pshufd \$0x93, @t[4], @t[4]
559
+ pxor @t[6], @t[7] # clobber t[7]
560
+ pxor @y[0], @y[1]
561
+
562
+ pxor @t[0], @y[3]
563
+ pshufd \$0x93, @t[0], @t[0]
564
+ pxor @t[1], @y[2]
565
+ pxor @t[1], @y[4]
566
+ pxor @t[2], @y[2]
567
+ pshufd \$0x93, @t[1], @t[1]
568
+ pxor @t[2], @y[3]
569
+ pxor @t[2], @y[5]
570
+ pxor @t[7], @y[2]
571
+ pshufd \$0x93, @t[2], @t[2]
572
+ pxor @t[3], @y[3]
573
+ pxor @t[3], @y[6]
574
+ pxor @t[3], @y[4]
575
+ pshufd \$0x93, @t[3], @t[3]
576
+ pxor @t[4], @y[7]
577
+ pxor @t[4], @y[5]
578
+ pxor @t[7], @y[7]
579
+ pxor @t[5], @y[3]
580
+ pxor @t[4], @y[4]
581
+ pxor @t[5], @t[7] # clobber t[7] even more
582
+
583
+ pxor @t[7], @y[5]
584
+ pshufd \$0x93, @t[4], @t[4]
585
+ pxor @t[7], @y[6]
586
+ pxor @t[7], @y[4]
587
+
588
+ pxor @t[5], @t[7]
589
+ pshufd \$0x93, @t[5], @t[5]
590
+ pxor @t[6], @t[7] # restore t[7]
591
+
592
+ # multiplication by 0x0d
593
+ pxor @y[7], @y[4]
594
+ pxor @t[4], @y[7]
595
+ pshufd \$0x93, @t[6], @t[6]
596
+ pxor @t[0], @y[2]
597
+ pxor @t[5], @y[7]
598
+ pxor @t[2], @y[2]
599
+ pshufd \$0x93, @t[7], @t[7]
600
+
601
+ pxor @y[1], @y[3]
602
+ pxor @t[1], @y[1]
603
+ pxor @t[0], @y[0]
604
+ pxor @t[0], @y[3]
605
+ pxor @t[5], @y[1]
606
+ pxor @t[5], @y[0]
607
+ pxor @t[7], @y[1]
608
+ pshufd \$0x93, @t[0], @t[0]
609
+ pxor @t[6], @y[0]
610
+ pxor @y[1], @y[3]
611
+ pxor @t[1], @y[4]
612
+ pshufd \$0x93, @t[1], @t[1]
613
+
614
+ pxor @t[7], @y[7]
615
+ pxor @t[2], @y[4]
616
+ pxor @t[2], @y[5]
617
+ pshufd \$0x93, @t[2], @t[2]
618
+ pxor @t[6], @y[2]
619
+ pxor @t[3], @t[6] # clobber t[6]
620
+ pxor @y[7], @y[4]
621
+ pxor @t[6], @y[3]
622
+
623
+ pxor @t[6], @y[6]
624
+ pxor @t[5], @y[5]
625
+ pxor @t[4], @y[6]
626
+ pshufd \$0x93, @t[4], @t[4]
627
+ pxor @t[6], @y[5]
628
+ pxor @t[7], @y[6]
629
+ pxor @t[3], @t[6] # restore t[6]
630
+
631
+ pshufd \$0x93, @t[5], @t[5]
632
+ pshufd \$0x93, @t[6], @t[6]
633
+ pshufd \$0x93, @t[7], @t[7]
634
+ pshufd \$0x93, @t[3], @t[3]
635
+
636
+ # multiplication by 0x09
637
+ pxor @y[1], @y[4]
638
+ pxor @y[1], @t[1] # t[1]=y[1]
639
+ pxor @t[5], @t[0] # clobber t[0]
640
+ pxor @t[5], @t[1]
641
+ pxor @t[0], @y[3]
642
+ pxor @y[0], @t[0] # t[0]=y[0]
643
+ pxor @t[6], @t[1]
644
+ pxor @t[7], @t[6] # clobber t[6]
645
+ pxor @t[1], @y[4]
646
+ pxor @t[4], @y[7]
647
+ pxor @y[4], @t[4] # t[4]=y[4]
648
+ pxor @t[3], @y[6]
649
+ pxor @y[3], @t[3] # t[3]=y[3]
650
+ pxor @t[2], @y[5]
651
+ pxor @y[2], @t[2] # t[2]=y[2]
652
+ pxor @t[7], @t[3]
653
+ pxor @y[5], @t[5] # t[5]=y[5]
654
+ pxor @t[6], @t[2]
655
+ pxor @t[6], @t[5]
656
+ pxor @y[6], @t[6] # t[6]=y[6]
657
+ pxor @y[7], @t[7] # t[7]=y[7]
658
+
659
+ movdqa @t[0],@XMM[0]
660
+ movdqa @t[1],@XMM[1]
661
+ movdqa @t[2],@XMM[2]
662
+ movdqa @t[3],@XMM[3]
663
+ movdqa @t[4],@XMM[4]
664
+ movdqa @t[5],@XMM[5]
665
+ movdqa @t[6],@XMM[6]
666
+ movdqa @t[7],@XMM[7]
667
+ ___
668
+ }
669
+
670
+ sub InvMixColumns {
671
+ my @x=@_[0..7];
672
+ my @t=@_[8..15];
673
+
674
+ # Thanks to Jussi Kivilinna for providing pointer to
675
+ #
676
+ # | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
677
+ # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
678
+ # | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
679
+ # | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
680
+
681
+ $code.=<<___;
682
+ # multiplication by 0x05-0x00-0x04-0x00
683
+ pshufd \$0x4E, @x[0], @t[0]
684
+ pshufd \$0x4E, @x[6], @t[6]
685
+ pxor @x[0], @t[0]
686
+ pshufd \$0x4E, @x[7], @t[7]
687
+ pxor @x[6], @t[6]
688
+ pshufd \$0x4E, @x[1], @t[1]
689
+ pxor @x[7], @t[7]
690
+ pshufd \$0x4E, @x[2], @t[2]
691
+ pxor @x[1], @t[1]
692
+ pshufd \$0x4E, @x[3], @t[3]
693
+ pxor @x[2], @t[2]
694
+ pxor @t[6], @x[0]
695
+ pxor @t[6], @x[1]
696
+ pshufd \$0x4E, @x[4], @t[4]
697
+ pxor @x[3], @t[3]
698
+ pxor @t[0], @x[2]
699
+ pxor @t[1], @x[3]
700
+ pshufd \$0x4E, @x[5], @t[5]
701
+ pxor @x[4], @t[4]
702
+ pxor @t[7], @x[1]
703
+ pxor @t[2], @x[4]
704
+ pxor @x[5], @t[5]
705
+
706
+ pxor @t[7], @x[2]
707
+ pxor @t[6], @x[3]
708
+ pxor @t[6], @x[4]
709
+ pxor @t[3], @x[5]
710
+ pxor @t[4], @x[6]
711
+ pxor @t[7], @x[4]
712
+ pxor @t[7], @x[5]
713
+ pxor @t[5], @x[7]
714
+ ___
715
+ &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
716
+ }
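The factorization credited to Jussi Kivilinna above can be checked directly: the circulant matrices correspond to polynomials over GF(2^8) reduced modulo x^4+1, so multiplying the MixColumns polynomial {03}x^3+{01}x^2+{01}x+{02} by {04}x^2+{05} has to yield the InvMixColumns polynomial {0b}x^3+{0d}x^2+{09}x+{0e}. A stand-alone verification sketch (the helper names are illustrative):

    use strict;
    use warnings;

    # GF(2^8) multiplication with the AES polynomial x^8 + x^4 + x^3 + x + 1.
    sub gf_mul {
        my ($a, $b) = @_;
        my $p = 0;
        for (1 .. 8) {
            $p ^= $a if $b & 1;
            my $carry = $a & 0x80;
            $a = ($a << 1) & 0xFF;
            $a ^= 0x1B if $carry;
            $b >>= 1;
        }
        return $p;
    }

    # Multiply two degree-3 polynomials with GF(2^8) coefficients modulo
    # x^4 + 1 (index 0 is the constant term).
    sub poly_mul_mod {
        my ($a, $b) = @_;
        my @c = (0) x 4;
        for my $i (0 .. 3) {
            for my $j (0 .. 3) {
                $c[($i + $j) % 4] ^= gf_mul($a->[$i], $b->[$j]);
            }
        }
        return @c;
    }

    my @mixcolumns = (0x02, 0x01, 0x01, 0x03);   # {03}x^3 + {01}x^2 + {01}x + {02}
    my @factor     = (0x05, 0x00, 0x04, 0x00);   # {04}x^2 + {05}
    printf "%02x %02x %02x %02x\n", poly_mul_mod(\@mixcolumns, \@factor);
    # prints: 0e 09 0d 0b  --  the InvMixColumns coefficients for x^0..x^3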
717
+
718
+ sub aesenc { # not used
719
+ my @b=@_[0..7];
720
+ my @t=@_[8..15];
721
+ $code.=<<___;
722
+ movdqa 0x30($const),@t[0] # .LSR
723
+ ___
724
+ &ShiftRows (@b,@t[0]);
725
+ &Sbox (@b,@t);
726
+ &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
727
+ }
728
+
729
+ sub aesenclast { # not used
730
+ my @b=@_[0..7];
731
+ my @t=@_[8..15];
732
+ $code.=<<___;
733
+ movdqa 0x40($const),@t[0] # .LSRM0
734
+ ___
735
+ &ShiftRows (@b,@t[0]);
736
+ &Sbox (@b,@t);
737
+ $code.=<<___
738
+ pxor 0x00($key),@b[0]
739
+ pxor 0x10($key),@b[1]
740
+ pxor 0x20($key),@b[4]
741
+ pxor 0x30($key),@b[6]
742
+ pxor 0x40($key),@b[3]
743
+ pxor 0x50($key),@b[7]
744
+ pxor 0x60($key),@b[2]
745
+ pxor 0x70($key),@b[5]
746
+ ___
747
+ }
748
+
749
+ sub swapmove {
750
+ my ($a,$b,$n,$mask,$t)=@_;
751
+ $code.=<<___;
752
+ movdqa $b,$t
753
+ psrlq \$$n,$b
754
+ pxor $a,$b
755
+ pand $mask,$b
756
+ pxor $b,$a
757
+ psllq \$$n,$b
758
+ pxor $t,$b
759
+ ___
760
+ }
761
+ sub swapmove2x {
762
+ my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
763
+ $code.=<<___;
764
+ movdqa $b0,$t0
765
+ psrlq \$$n,$b0
766
+ movdqa $b1,$t1
767
+ psrlq \$$n,$b1
768
+ pxor $a0,$b0
769
+ pxor $a1,$b1
770
+ pand $mask,$b0
771
+ pand $mask,$b1
772
+ pxor $b0,$a0
773
+ psllq \$$n,$b0
774
+ pxor $b1,$a1
775
+ psllq \$$n,$b1
776
+ pxor $t0,$b0
777
+ pxor $t1,$b1
778
+ ___
779
+ }
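swapmove and swapmove2x are the classic "delta swap": the bits of one register selected by $mask change places with the bits of the other register that sit $n positions above them. A scalar sketch with plain integers in place of XMM registers (the helper name is illustrative):

    use strict;
    use warnings;

    # Delta swap: exchange the bits of $a selected by $mask with the bits of
    # $b found $n positions above those mask bits; mirrors the
    # movdqa/psrlq/pxor/pand/pxor/psllq/pxor sequence emitted above.
    sub swapmove_scalar {
        my ($a, $b, $n, $mask) = @_;
        my $t = (($b >> $n) ^ $a) & $mask;
        return ($a ^ $t, $b ^ ($t << $n));
    }

    # Example: with mask 0x55 and n=1 the even-numbered bits of $a trade
    # places with the odd-numbered bits of $b.
    my ($a, $b) = swapmove_scalar(0x00, 0xFF, 1, 0x55);
    printf "a=%02x b=%02x\n", $a, $b;   # a=55 b=55

The bitslice() routine below chains this with n = 1, 2, 4 and the .LBS0/.LBS1/.LBS2 masks (0x55.., 0x33.., 0x0f.. in the constant table at the end of the file), which is in effect an 8x8 bit-matrix transpose turning eight state registers into eight bit-planes.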
780
+
781
+ sub bitslice {
782
+ my @x=reverse(@_[0..7]);
783
+ my ($t0,$t1,$t2,$t3)=@_[8..11];
784
+ $code.=<<___;
785
+ movdqa 0x00($const),$t0 # .LBS0
786
+ movdqa 0x10($const),$t1 # .LBS1
787
+ ___
788
+ &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
789
+ &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
790
+ $code.=<<___;
791
+ movdqa 0x20($const),$t0 # .LBS2
792
+ ___
793
+ &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
794
+ &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
795
+
796
+ &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
797
+ &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
798
+ }
799
+
800
+ $code.=<<___;
801
+ .text
802
+
803
+ .extern asm_AES_encrypt
804
+ .extern asm_AES_decrypt
805
+
806
+ .type _bsaes_encrypt8,\@abi-omnipotent
807
+ .align 64
808
+ _bsaes_encrypt8:
809
+ lea .LBS0(%rip), $const # constants table
810
+
811
+ movdqa ($key), @XMM[9] # round 0 key
812
+ lea 0x10($key), $key
813
+ movdqa 0x50($const), @XMM[8] # .LM0SR
814
+ pxor @XMM[9], @XMM[0] # xor with round0 key
815
+ pxor @XMM[9], @XMM[1]
816
+ pxor @XMM[9], @XMM[2]
817
+ pxor @XMM[9], @XMM[3]
818
+ pshufb @XMM[8], @XMM[0]
819
+ pshufb @XMM[8], @XMM[1]
820
+ pxor @XMM[9], @XMM[4]
821
+ pxor @XMM[9], @XMM[5]
822
+ pshufb @XMM[8], @XMM[2]
823
+ pshufb @XMM[8], @XMM[3]
824
+ pxor @XMM[9], @XMM[6]
825
+ pxor @XMM[9], @XMM[7]
826
+ pshufb @XMM[8], @XMM[4]
827
+ pshufb @XMM[8], @XMM[5]
828
+ pshufb @XMM[8], @XMM[6]
829
+ pshufb @XMM[8], @XMM[7]
830
+ _bsaes_encrypt8_bitslice:
831
+ ___
832
+ &bitslice (@XMM[0..7, 8..11]);
833
+ $code.=<<___;
834
+ dec $rounds
835
+ jmp .Lenc_sbox
836
+ .align 16
837
+ .Lenc_loop:
838
+ ___
839
+ &ShiftRows (@XMM[0..7, 8]);
840
+ $code.=".Lenc_sbox:\n";
841
+ &Sbox (@XMM[0..7, 8..15]);
842
+ $code.=<<___;
843
+ dec $rounds
844
+ jl .Lenc_done
845
+ ___
846
+ &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
847
+ $code.=<<___;
848
+ movdqa 0x30($const), @XMM[8] # .LSR
849
+ jnz .Lenc_loop
850
+ movdqa 0x40($const), @XMM[8] # .LSRM0
851
+ jmp .Lenc_loop
852
+ .align 16
853
+ .Lenc_done:
854
+ ___
855
+ # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
856
+ &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
857
+ $code.=<<___;
858
+ movdqa ($key), @XMM[8] # last round key
859
+ pxor @XMM[8], @XMM[4]
860
+ pxor @XMM[8], @XMM[6]
861
+ pxor @XMM[8], @XMM[3]
862
+ pxor @XMM[8], @XMM[7]
863
+ pxor @XMM[8], @XMM[2]
864
+ pxor @XMM[8], @XMM[5]
865
+ pxor @XMM[8], @XMM[0]
866
+ pxor @XMM[8], @XMM[1]
867
+ ret
868
+ .size _bsaes_encrypt8,.-_bsaes_encrypt8
869
+
870
+ .type _bsaes_decrypt8,\@abi-omnipotent
871
+ .align 64
872
+ _bsaes_decrypt8:
873
+ lea .LBS0(%rip), $const # constants table
874
+
875
+ movdqa ($key), @XMM[9] # round 0 key
876
+ lea 0x10($key), $key
877
+ movdqa -0x30($const), @XMM[8] # .LM0ISR
878
+ pxor @XMM[9], @XMM[0] # xor with round0 key
879
+ pxor @XMM[9], @XMM[1]
880
+ pxor @XMM[9], @XMM[2]
881
+ pxor @XMM[9], @XMM[3]
882
+ pshufb @XMM[8], @XMM[0]
883
+ pshufb @XMM[8], @XMM[1]
884
+ pxor @XMM[9], @XMM[4]
885
+ pxor @XMM[9], @XMM[5]
886
+ pshufb @XMM[8], @XMM[2]
887
+ pshufb @XMM[8], @XMM[3]
888
+ pxor @XMM[9], @XMM[6]
889
+ pxor @XMM[9], @XMM[7]
890
+ pshufb @XMM[8], @XMM[4]
891
+ pshufb @XMM[8], @XMM[5]
892
+ pshufb @XMM[8], @XMM[6]
893
+ pshufb @XMM[8], @XMM[7]
894
+ ___
895
+ &bitslice (@XMM[0..7, 8..11]);
896
+ $code.=<<___;
897
+ dec $rounds
898
+ jmp .Ldec_sbox
899
+ .align 16
900
+ .Ldec_loop:
901
+ ___
902
+ &ShiftRows (@XMM[0..7, 8]);
903
+ $code.=".Ldec_sbox:\n";
904
+ &InvSbox (@XMM[0..7, 8..15]);
905
+ $code.=<<___;
906
+ dec $rounds
907
+ jl .Ldec_done
908
+ ___
909
+ &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
910
+ $code.=<<___;
911
+ movdqa -0x10($const), @XMM[8] # .LISR
912
+ jnz .Ldec_loop
913
+ movdqa -0x20($const), @XMM[8] # .LISRM0
914
+ jmp .Ldec_loop
915
+ .align 16
916
+ .Ldec_done:
917
+ ___
918
+ &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
919
+ $code.=<<___;
920
+ movdqa ($key), @XMM[8] # last round key
921
+ pxor @XMM[8], @XMM[6]
922
+ pxor @XMM[8], @XMM[4]
923
+ pxor @XMM[8], @XMM[2]
924
+ pxor @XMM[8], @XMM[7]
925
+ pxor @XMM[8], @XMM[3]
926
+ pxor @XMM[8], @XMM[5]
927
+ pxor @XMM[8], @XMM[0]
928
+ pxor @XMM[8], @XMM[1]
929
+ ret
930
+ .size _bsaes_decrypt8,.-_bsaes_decrypt8
931
+ ___
932
+ }
933
+ {
934
+ my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
935
+
936
+ sub bitslice_key {
937
+ my @x=reverse(@_[0..7]);
938
+ my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
939
+
940
+ &swapmove (@x[0,1],1,$bs0,$t2,$t3);
941
+ $code.=<<___;
942
+ #&swapmove(@x[2,3],1,$t0,$t2,$t3);
943
+ movdqa @x[0], @x[2]
944
+ movdqa @x[1], @x[3]
945
+ ___
946
+ #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
947
+
948
+ &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
949
+ $code.=<<___;
950
+ #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
951
+ movdqa @x[0], @x[4]
952
+ movdqa @x[2], @x[6]
953
+ movdqa @x[1], @x[5]
954
+ movdqa @x[3], @x[7]
955
+ ___
956
+ &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
957
+ &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
958
+ }
959
+
960
+ $code.=<<___;
961
+ .type _bsaes_key_convert,\@abi-omnipotent
962
+ .align 16
963
+ _bsaes_key_convert:
964
+ lea .Lmasks(%rip), $const
965
+ movdqu ($inp), %xmm7 # load round 0 key
966
+ lea 0x10($inp), $inp
967
+ movdqa 0x00($const), %xmm0 # 0x01...
968
+ movdqa 0x10($const), %xmm1 # 0x02...
969
+ movdqa 0x20($const), %xmm2 # 0x04...
970
+ movdqa 0x30($const), %xmm3 # 0x08...
971
+ movdqa 0x40($const), %xmm4 # .LM0
972
+ pcmpeqd %xmm5, %xmm5 # .LNOT
973
+
974
+ movdqu ($inp), %xmm6 # load round 1 key
975
+ movdqa %xmm7, ($out) # save round 0 key
976
+ lea 0x10($out), $out
977
+ dec $rounds
978
+ jmp .Lkey_loop
979
+ .align 16
980
+ .Lkey_loop:
981
+ pshufb %xmm4, %xmm6 # .LM0
982
+
983
+ movdqa %xmm0, %xmm8
984
+ movdqa %xmm1, %xmm9
985
+
986
+ pand %xmm6, %xmm8
987
+ pand %xmm6, %xmm9
988
+ movdqa %xmm2, %xmm10
989
+ pcmpeqb %xmm0, %xmm8
990
+ psllq \$4, %xmm0 # 0x10...
991
+ movdqa %xmm3, %xmm11
992
+ pcmpeqb %xmm1, %xmm9
993
+ psllq \$4, %xmm1 # 0x20...
994
+
995
+ pand %xmm6, %xmm10
996
+ pand %xmm6, %xmm11
997
+ movdqa %xmm0, %xmm12
998
+ pcmpeqb %xmm2, %xmm10
999
+ psllq \$4, %xmm2 # 0x40...
1000
+ movdqa %xmm1, %xmm13
1001
+ pcmpeqb %xmm3, %xmm11
1002
+ psllq \$4, %xmm3 # 0x80...
1003
+
1004
+ movdqa %xmm2, %xmm14
1005
+ movdqa %xmm3, %xmm15
1006
+ pxor %xmm5, %xmm8 # "pnot"
1007
+ pxor %xmm5, %xmm9
1008
+
1009
+ pand %xmm6, %xmm12
1010
+ pand %xmm6, %xmm13
1011
+ movdqa %xmm8, 0x00($out) # write bit-sliced round key
1012
+ pcmpeqb %xmm0, %xmm12
1013
+ psrlq \$4, %xmm0 # 0x01...
1014
+ movdqa %xmm9, 0x10($out)
1015
+ pcmpeqb %xmm1, %xmm13
1016
+ psrlq \$4, %xmm1 # 0x02...
1017
+ lea 0x10($inp), $inp
1018
+
1019
+ pand %xmm6, %xmm14
1020
+ pand %xmm6, %xmm15
1021
+ movdqa %xmm10, 0x20($out)
1022
+ pcmpeqb %xmm2, %xmm14
1023
+ psrlq \$4, %xmm2 # 0x04...
1024
+ movdqa %xmm11, 0x30($out)
1025
+ pcmpeqb %xmm3, %xmm15
1026
+ psrlq \$4, %xmm3 # 0x08...
1027
+ movdqu ($inp), %xmm6 # load next round key
1028
+
1029
+ pxor %xmm5, %xmm13 # "pnot"
1030
+ pxor %xmm5, %xmm14
1031
+ movdqa %xmm12, 0x40($out)
1032
+ movdqa %xmm13, 0x50($out)
1033
+ movdqa %xmm14, 0x60($out)
1034
+ movdqa %xmm15, 0x70($out)
1035
+ lea 0x80($out),$out
1036
+ dec $rounds
1037
+ jnz .Lkey_loop
1038
+
1039
+ movdqa 0x50($const), %xmm7 # .L63
1040
+ #movdqa %xmm6, ($out) # don't save last round key
1041
+ ret
1042
+ .size _bsaes_key_convert,.-_bsaes_key_convert
1043
+ ___
1044
+ }
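Per round key, _bsaes_key_convert above emits eight 128-bit masks: after the .LM0 byte permutation, the pand/pcmpeqb pairs turn each output register into 0xff/0x00 bytes recording one key bit per byte position, and the planes for the bits of 0x63 (bits 0, 1, 5 and 6) are complemented by the "pnot" xors, which amounts to XORing 0x63 into every key byte (compensating, as I read the design, for the constant the bit-sliced S-box omits). A byte-level sketch of that expansion, leaving out the permutation step (illustrative helper):

    use strict;
    use warnings;

    # Expand one (already permuted) 16-byte round key into eight byte planes:
    # plane $bit holds 0xff wherever bit $bit of (key byte XOR 0x63) is set,
    # mirroring the pcmpeqb-against-mask loop plus the selective "pnot"s.
    sub round_key_to_planes {
        my @key = @_;                            # 16 round-key bytes
        my @planes;
        for my $bit (0 .. 7) {
            my $flip = (0x63 >> $bit) & 1;       # planes 0,1,5,6 complemented
            push @planes, [ map { ((($_ >> $bit) & 1) ^ $flip) ? 0xFF : 0x00 } @key ];
        }
        return @planes;                          # planes[0] = 0x01 bit ... planes[7] = 0x80 bit
    }

    # Example: an all-zero round key yields all-ones planes exactly where
    # 0x63 has a bit set (planes 0, 1, 5, 6) and all-zero planes elsewhere.
    my @planes = round_key_to_planes((0x00) x 16);
    printf "plane %d: %02x\n", $_, $planes[$_][0] for 0 .. 7;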
1045
+
1046
+ if (0 && !$win64) { # following four functions are unsupported interface
1047
+ # used for benchmarking...
1048
+ $code.=<<___;
1049
+ .globl bsaes_enc_key_convert
1050
+ .type bsaes_enc_key_convert,\@function,2
1051
+ .align 16
1052
+ bsaes_enc_key_convert:
1053
+ mov 240($inp),%r10d # pass rounds
1054
+ mov $inp,%rcx # pass key
1055
+ mov $out,%rax # pass key schedule
1056
+ call _bsaes_key_convert
1057
+ pxor %xmm6,%xmm7 # fix up last round key
1058
+ movdqa %xmm7,(%rax) # save last round key
1059
+ ret
1060
+ .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
1061
+
1062
+ .globl bsaes_encrypt_128
1063
+ .type bsaes_encrypt_128,\@function,4
1064
+ .align 16
1065
+ bsaes_encrypt_128:
1066
+ .Lenc128_loop:
1067
+ movdqu 0x00($inp), @XMM[0] # load input
1068
+ movdqu 0x10($inp), @XMM[1]
1069
+ movdqu 0x20($inp), @XMM[2]
1070
+ movdqu 0x30($inp), @XMM[3]
1071
+ movdqu 0x40($inp), @XMM[4]
1072
+ movdqu 0x50($inp), @XMM[5]
1073
+ movdqu 0x60($inp), @XMM[6]
1074
+ movdqu 0x70($inp), @XMM[7]
1075
+ mov $key, %rax # pass the $key
1076
+ lea 0x80($inp), $inp
1077
+ mov \$10,%r10d
1078
+
1079
+ call _bsaes_encrypt8
1080
+
1081
+ movdqu @XMM[0], 0x00($out) # write output
1082
+ movdqu @XMM[1], 0x10($out)
1083
+ movdqu @XMM[4], 0x20($out)
1084
+ movdqu @XMM[6], 0x30($out)
1085
+ movdqu @XMM[3], 0x40($out)
1086
+ movdqu @XMM[7], 0x50($out)
1087
+ movdqu @XMM[2], 0x60($out)
1088
+ movdqu @XMM[5], 0x70($out)
1089
+ lea 0x80($out), $out
1090
+ sub \$0x80,$len
1091
+ ja .Lenc128_loop
1092
+ ret
1093
+ .size bsaes_encrypt_128,.-bsaes_encrypt_128
1094
+
1095
+ .globl bsaes_dec_key_convert
1096
+ .type bsaes_dec_key_convert,\@function,2
1097
+ .align 16
1098
+ bsaes_dec_key_convert:
1099
+ mov 240($inp),%r10d # pass rounds
1100
+ mov $inp,%rcx # pass key
1101
+ mov $out,%rax # pass key schedule
1102
+ call _bsaes_key_convert
1103
+ pxor ($out),%xmm7 # fix up round 0 key
1104
+ movdqa %xmm6,(%rax) # save last round key
1105
+ movdqa %xmm7,($out)
1106
+ ret
1107
+ .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1108
+
1109
+ .globl bsaes_decrypt_128
1110
+ .type bsaes_decrypt_128,\@function,4
1111
+ .align 16
1112
+ bsaes_decrypt_128:
1113
+ .Ldec128_loop:
1114
+ movdqu 0x00($inp), @XMM[0] # load input
1115
+ movdqu 0x10($inp), @XMM[1]
1116
+ movdqu 0x20($inp), @XMM[2]
1117
+ movdqu 0x30($inp), @XMM[3]
1118
+ movdqu 0x40($inp), @XMM[4]
1119
+ movdqu 0x50($inp), @XMM[5]
1120
+ movdqu 0x60($inp), @XMM[6]
1121
+ movdqu 0x70($inp), @XMM[7]
1122
+ mov $key, %rax # pass the $key
1123
+ lea 0x80($inp), $inp
1124
+ mov \$10,%r10d
1125
+
1126
+ call _bsaes_decrypt8
1127
+
1128
+ movdqu @XMM[0], 0x00($out) # write output
1129
+ movdqu @XMM[1], 0x10($out)
1130
+ movdqu @XMM[6], 0x20($out)
1131
+ movdqu @XMM[4], 0x30($out)
1132
+ movdqu @XMM[2], 0x40($out)
1133
+ movdqu @XMM[7], 0x50($out)
1134
+ movdqu @XMM[3], 0x60($out)
1135
+ movdqu @XMM[5], 0x70($out)
1136
+ lea 0x80($out), $out
1137
+ sub \$0x80,$len
1138
+ ja .Ldec128_loop
1139
+ ret
1140
+ .size bsaes_decrypt_128,.-bsaes_decrypt_128
1141
+ ___
1142
+ }
1143
+ {
1144
+ ######################################################################
1145
+ #
1146
+ # OpenSSL interface
1147
+ #
1148
+ my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1149
+ : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1150
+ my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1151
+
1152
+ $code.=<<___;
1153
+ .globl bsaes_ctr32_encrypt_blocks
1154
+ .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1155
+ .align 16
1156
+ bsaes_ctr32_encrypt_blocks:
1157
+ mov %rsp, %rax
1158
+ .Lctr_enc_prologue:
1159
+ push %rbp
1160
+ push %rbx
1161
+ push %r12
1162
+ push %r13
1163
+ push %r14
1164
+ push %r15
1165
+ lea -0x48(%rsp), %rsp
1166
+ ___
1167
+ $code.=<<___ if ($win64);
1168
+ mov 0xa0(%rsp),$arg5 # pull ivp
1169
+ lea -0xa0(%rsp), %rsp
1170
+ movaps %xmm6, 0x40(%rsp)
1171
+ movaps %xmm7, 0x50(%rsp)
1172
+ movaps %xmm8, 0x60(%rsp)
1173
+ movaps %xmm9, 0x70(%rsp)
1174
+ movaps %xmm10, 0x80(%rsp)
1175
+ movaps %xmm11, 0x90(%rsp)
1176
+ movaps %xmm12, 0xa0(%rsp)
1177
+ movaps %xmm13, 0xb0(%rsp)
1178
+ movaps %xmm14, 0xc0(%rsp)
1179
+ movaps %xmm15, 0xd0(%rsp)
1180
+ .Lctr_enc_body:
1181
+ ___
1182
+ $code.=<<___;
1183
+ mov %rsp, %rbp # backup %rsp
1184
+ movdqu ($arg5), %xmm0 # load counter
1185
+ mov 240($arg4), %eax # rounds
1186
+ mov $arg1, $inp # backup arguments
1187
+ mov $arg2, $out
1188
+ mov $arg3, $len
1189
+ mov $arg4, $key
1190
+ movdqa %xmm0, 0x20(%rbp) # copy counter
1191
+ cmp \$8, $arg3
1192
+ jb .Lctr_enc_short
1193
+
1194
+ mov %eax, %ebx # rounds
1195
+ shl \$7, %rax # 128 bytes per inner round key
1196
+ sub \$`128-32`, %rax # size of bit-sliced key schedule
1197
+ sub %rax, %rsp
1198
+
1199
+ mov %rsp, %rax # pass key schedule
1200
+ mov $key, %rcx # pass key
1201
+ mov %ebx, %r10d # pass rounds
1202
+ call _bsaes_key_convert
1203
+ pxor %xmm6,%xmm7 # fix up last round key
1204
+ movdqa %xmm7,(%rax) # save last round key
1205
+
1206
+ movdqa (%rsp), @XMM[9] # load round0 key
1207
+ lea .LADD1(%rip), %r11
1208
+ movdqa 0x20(%rbp), @XMM[0] # counter copy
1209
+ movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1210
+ pshufb @XMM[8], @XMM[9] # byte swap upper part
1211
+ pshufb @XMM[8], @XMM[0]
1212
+ movdqa @XMM[9], (%rsp) # save adjusted round0 key
1213
+ jmp .Lctr_enc_loop
1214
+ .align 16
1215
+ .Lctr_enc_loop:
1216
+ movdqa @XMM[0], 0x20(%rbp) # save counter
1217
+ movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1218
+ movdqa @XMM[0], @XMM[2]
1219
+ paddd 0x00(%r11), @XMM[1] # .LADD1
1220
+ movdqa @XMM[0], @XMM[3]
1221
+ paddd 0x10(%r11), @XMM[2] # .LADD2
1222
+ movdqa @XMM[0], @XMM[4]
1223
+ paddd 0x20(%r11), @XMM[3] # .LADD3
1224
+ movdqa @XMM[0], @XMM[5]
1225
+ paddd 0x30(%r11), @XMM[4] # .LADD4
1226
+ movdqa @XMM[0], @XMM[6]
1227
+ paddd 0x40(%r11), @XMM[5] # .LADD5
1228
+ movdqa @XMM[0], @XMM[7]
1229
+ paddd 0x50(%r11), @XMM[6] # .LADD6
1230
+ paddd 0x60(%r11), @XMM[7] # .LADD7
1231
+
1232
+ # Borrow prologue from _bsaes_encrypt8 to use the opportunity
1233
+ # to flip byte order in 32-bit counter
1234
+ movdqa (%rsp), @XMM[9] # round 0 key
1235
+ lea 0x10(%rsp), %rax # pass key schedule
1236
+ movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
1237
+ pxor @XMM[9], @XMM[0] # xor with round0 key
1238
+ pxor @XMM[9], @XMM[1]
1239
+ pxor @XMM[9], @XMM[2]
1240
+ pxor @XMM[9], @XMM[3]
1241
+ pshufb @XMM[8], @XMM[0]
1242
+ pshufb @XMM[8], @XMM[1]
1243
+ pxor @XMM[9], @XMM[4]
1244
+ pxor @XMM[9], @XMM[5]
1245
+ pshufb @XMM[8], @XMM[2]
1246
+ pshufb @XMM[8], @XMM[3]
1247
+ pxor @XMM[9], @XMM[6]
1248
+ pxor @XMM[9], @XMM[7]
1249
+ pshufb @XMM[8], @XMM[4]
1250
+ pshufb @XMM[8], @XMM[5]
1251
+ pshufb @XMM[8], @XMM[6]
1252
+ pshufb @XMM[8], @XMM[7]
1253
+ lea .LBS0(%rip), %r11 # constants table
1254
+ mov %ebx,%r10d # pass rounds
1255
+
1256
+ call _bsaes_encrypt8_bitslice
1257
+
1258
+ sub \$8,$len
1259
+ jc .Lctr_enc_loop_done
1260
+
1261
+ movdqu 0x00($inp), @XMM[8] # load input
1262
+ movdqu 0x10($inp), @XMM[9]
1263
+ movdqu 0x20($inp), @XMM[10]
1264
+ movdqu 0x30($inp), @XMM[11]
1265
+ movdqu 0x40($inp), @XMM[12]
1266
+ movdqu 0x50($inp), @XMM[13]
1267
+ movdqu 0x60($inp), @XMM[14]
1268
+ movdqu 0x70($inp), @XMM[15]
1269
+ lea 0x80($inp),$inp
1270
+ pxor @XMM[0], @XMM[8]
1271
+ movdqa 0x20(%rbp), @XMM[0] # load counter
1272
+ pxor @XMM[9], @XMM[1]
1273
+ movdqu @XMM[8], 0x00($out) # write output
1274
+ pxor @XMM[10], @XMM[4]
1275
+ movdqu @XMM[1], 0x10($out)
1276
+ pxor @XMM[11], @XMM[6]
1277
+ movdqu @XMM[4], 0x20($out)
1278
+ pxor @XMM[12], @XMM[3]
1279
+ movdqu @XMM[6], 0x30($out)
1280
+ pxor @XMM[13], @XMM[7]
1281
+ movdqu @XMM[3], 0x40($out)
1282
+ pxor @XMM[14], @XMM[2]
1283
+ movdqu @XMM[7], 0x50($out)
1284
+ pxor @XMM[15], @XMM[5]
1285
+ movdqu @XMM[2], 0x60($out)
1286
+ lea .LADD1(%rip), %r11
1287
+ movdqu @XMM[5], 0x70($out)
1288
+ lea 0x80($out), $out
1289
+ paddd 0x70(%r11), @XMM[0] # .LADD8
1290
+ jnz .Lctr_enc_loop
1291
+
1292
+ jmp .Lctr_enc_done
1293
+ .align 16
1294
+ .Lctr_enc_loop_done:
1295
+ add \$8, $len
1296
+ movdqu 0x00($inp), @XMM[8] # load input
1297
+ pxor @XMM[8], @XMM[0]
1298
+ movdqu @XMM[0], 0x00($out) # write output
1299
+ cmp \$2,$len
1300
+ jb .Lctr_enc_done
1301
+ movdqu 0x10($inp), @XMM[9]
1302
+ pxor @XMM[9], @XMM[1]
1303
+ movdqu @XMM[1], 0x10($out)
1304
+ je .Lctr_enc_done
1305
+ movdqu 0x20($inp), @XMM[10]
1306
+ pxor @XMM[10], @XMM[4]
1307
+ movdqu @XMM[4], 0x20($out)
1308
+ cmp \$4,$len
1309
+ jb .Lctr_enc_done
1310
+ movdqu 0x30($inp), @XMM[11]
1311
+ pxor @XMM[11], @XMM[6]
1312
+ movdqu @XMM[6], 0x30($out)
1313
+ je .Lctr_enc_done
1314
+ movdqu 0x40($inp), @XMM[12]
1315
+ pxor @XMM[12], @XMM[3]
1316
+ movdqu @XMM[3], 0x40($out)
1317
+ cmp \$6,$len
1318
+ jb .Lctr_enc_done
1319
+ movdqu 0x50($inp), @XMM[13]
1320
+ pxor @XMM[13], @XMM[7]
1321
+ movdqu @XMM[7], 0x50($out)
1322
+ je .Lctr_enc_done
1323
+ movdqu 0x60($inp), @XMM[14]
1324
+ pxor @XMM[14], @XMM[2]
1325
+ movdqu @XMM[2], 0x60($out)
1326
+ jmp .Lctr_enc_done
1327
+
1328
+ .align 16
1329
+ .Lctr_enc_short:
1330
+ lea 0x20(%rbp), $arg1
1331
+ lea 0x30(%rbp), $arg2
1332
+ lea ($key), $arg3
1333
+ call asm_AES_encrypt
1334
+ movdqu ($inp), @XMM[1]
1335
+ lea 16($inp), $inp
1336
+ mov 0x2c(%rbp), %eax # load 32-bit counter
1337
+ bswap %eax
1338
+ pxor 0x30(%rbp), @XMM[1]
1339
+ inc %eax # increment
1340
+ movdqu @XMM[1], ($out)
1341
+ bswap %eax
1342
+ lea 16($out), $out
1343
+ mov %eax, 0x2c(%rsp) # save 32-bit counter
1344
+ dec $len
1345
+ jnz .Lctr_enc_short
1346
+
1347
+ .Lctr_enc_done:
1348
+ lea (%rsp), %rax
1349
+ pxor %xmm0, %xmm0
1350
+ .Lctr_enc_bzero: # wipe key schedule [if any]
1351
+ movdqa %xmm0, 0x00(%rax)
1352
+ movdqa %xmm0, 0x10(%rax)
1353
+ lea 0x20(%rax), %rax
1354
+ cmp %rax, %rbp
1355
+ ja .Lctr_enc_bzero
1356
+
1357
+ lea (%rbp),%rsp # restore %rsp
1358
+ ___
1359
+ $code.=<<___ if ($win64);
1360
+ movaps 0x40(%rbp), %xmm6
1361
+ movaps 0x50(%rbp), %xmm7
1362
+ movaps 0x60(%rbp), %xmm8
1363
+ movaps 0x70(%rbp), %xmm9
1364
+ movaps 0x80(%rbp), %xmm10
1365
+ movaps 0x90(%rbp), %xmm11
1366
+ movaps 0xa0(%rbp), %xmm12
1367
+ movaps 0xb0(%rbp), %xmm13
1368
+ movaps 0xc0(%rbp), %xmm14
1369
+ movaps 0xd0(%rbp), %xmm15
1370
+ lea 0xa0(%rbp), %rsp
1371
+ ___
1372
+ $code.=<<___;
1373
+ mov 0x48(%rsp), %r15
1374
+ mov 0x50(%rsp), %r14
1375
+ mov 0x58(%rsp), %r13
1376
+ mov 0x60(%rsp), %r12
1377
+ mov 0x68(%rsp), %rbx
1378
+ mov 0x70(%rsp), %rax
1379
+ lea 0x78(%rsp), %rsp
1380
+ mov %rax, %rbp
1381
+ .Lctr_enc_epilogue:
1382
+ ret
1383
+ .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
1384
+ ___
1385
+ }
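bsaes_ctr32_encrypt_blocks treats the last four bytes of the caller's IV as a big-endian 32-bit counter: block i of a batch is the encryption of the IV with counter+i, and only those 32 bits ever wrap, which the code implements with the .LSWPUP/.LSWPUPM0SR shuffles and the .LADD1-.LADD8 paddd constants so that eight counters are stepped at a time. A sketch of the counter stream being encrypted (illustrative helper; the routine itself never materialises the blocks this way):

    use strict;
    use warnings;

    # Generate the $n keystream-input blocks for a batch: bytes 0-11 of the
    # IV are kept as-is, bytes 12-15 are a big-endian 32-bit counter that
    # grows by one per block and wraps modulo 2^32, as "ctr32" implies.
    sub ctr32_blocks {
        my ($iv, $n) = @_;                               # $iv: 16 raw bytes
        my $prefix = substr($iv, 0, 12);
        my $ctr    = unpack("N", substr($iv, 12, 4));
        return map { $prefix . pack("N", ($ctr + $_) % 2**32) } 0 .. $n - 1;
    }

    # Example: starting one below the wrap shows only the low 32 bits roll.
    my $iv = ("\x00" x 12) . "\xff\xff\xff\xff";
    printf "%s\n", unpack("H32", $_) for ctr32_blocks($iv, 2);
    # prints 000000000000000000000000ffffffff
    # then   00000000000000000000000000000000   (prefix untouched)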
1386
+ $code.=<<___;
1387
+ .type _bsaes_const,\@object
1388
+ .align 64
1389
+ _bsaes_const:
1390
+ .LM0ISR: # InvShiftRows constants
1391
+ .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
1392
+ .LISRM0:
1393
+ .quad 0x01040b0e0205080f, 0x0306090c00070a0d
1394
+ .LISR:
1395
+ .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
1396
+ .LBS0: # bit-slice constants
1397
+ .quad 0x5555555555555555, 0x5555555555555555
1398
+ .LBS1:
1399
+ .quad 0x3333333333333333, 0x3333333333333333
1400
+ .LBS2:
1401
+ .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
1402
+ .LSR: # shiftrows constants
1403
+ .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
1404
+ .LSRM0:
1405
+ .quad 0x0304090e00050a0f, 0x01060b0c0207080d
1406
+ .LM0SR:
1407
+ .quad 0x0a0e02060f03070b, 0x0004080c05090d01
1408
+ .LSWPUP: # byte-swap upper dword
1409
+ .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
1410
+ .LSWPUPM0SR:
1411
+ .quad 0x0a0d02060c03070b, 0x0004080f05090e01
1412
+ .LADD1: # counter increment constants
1413
+ .quad 0x0000000000000000, 0x0000000100000000
1414
+ .LADD2:
1415
+ .quad 0x0000000000000000, 0x0000000200000000
1416
+ .LADD3:
1417
+ .quad 0x0000000000000000, 0x0000000300000000
1418
+ .LADD4:
1419
+ .quad 0x0000000000000000, 0x0000000400000000
1420
+ .LADD5:
1421
+ .quad 0x0000000000000000, 0x0000000500000000
1422
+ .LADD6:
1423
+ .quad 0x0000000000000000, 0x0000000600000000
1424
+ .LADD7:
1425
+ .quad 0x0000000000000000, 0x0000000700000000
1426
+ .LADD8:
1427
+ .quad 0x0000000000000000, 0x0000000800000000
1428
+ .Lmasks:
1429
+ .quad 0x0101010101010101, 0x0101010101010101
1430
+ .quad 0x0202020202020202, 0x0202020202020202
1431
+ .quad 0x0404040404040404, 0x0404040404040404
1432
+ .quad 0x0808080808080808, 0x0808080808080808
1433
+ .LM0:
1434
+ .quad 0x02060a0e03070b0f, 0x0004080c0105090d
1435
+ .L63:
1436
+ .quad 0x6363636363636363, 0x6363636363636363
1437
+ .asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
1438
+ .align 64
1439
+ .size _bsaes_const,.-_bsaes_const
1440
+ ___
1441
+
1442
+ # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1443
+ # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1444
+ if ($win64) {
1445
+ $rec="%rcx";
1446
+ $frame="%rdx";
1447
+ $context="%r8";
1448
+ $disp="%r9";
1449
+
1450
+ $code.=<<___;
1451
+ .extern __imp_RtlVirtualUnwind
1452
+ .type se_handler,\@abi-omnipotent
1453
+ .align 16
1454
+ se_handler:
1455
+ push %rsi
1456
+ push %rdi
1457
+ push %rbx
1458
+ push %rbp
1459
+ push %r12
1460
+ push %r13
1461
+ push %r14
1462
+ push %r15
1463
+ pushfq
1464
+ sub \$64,%rsp
1465
+
1466
+ mov 120($context),%rax # pull context->Rax
1467
+ mov 248($context),%rbx # pull context->Rip
1468
+
1469
+ mov 8($disp),%rsi # disp->ImageBase
1470
+ mov 56($disp),%r11 # disp->HandlerData
1471
+
1472
+ mov 0(%r11),%r10d # HandlerData[0]
1473
+ lea (%rsi,%r10),%r10 # prologue label
1474
+ cmp %r10,%rbx # context->Rip<prologue label
1475
+ jb .Lin_prologue
1476
+
1477
+ mov 152($context),%rax # pull context->Rsp
1478
+
1479
+ mov 4(%r11),%r10d # HandlerData[1]
1480
+ lea (%rsi,%r10),%r10 # epilogue label
1481
+ cmp %r10,%rbx # context->Rip>=epilogue label
1482
+ jae .Lin_prologue
1483
+
1484
+ mov 160($context),%rax # pull context->Rbp
1485
+
1486
+ lea 0x40(%rax),%rsi # %xmm save area
1487
+ lea 512($context),%rdi # &context.Xmm6
1488
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
1489
+ .long 0xa548f3fc # cld; rep movsq
1490
+ lea 0xa0(%rax),%rax # adjust stack pointer
1491
+
1492
+ mov 0x70(%rax),%rbp
1493
+ mov 0x68(%rax),%rbx
1494
+ mov 0x60(%rax),%r12
1495
+ mov 0x58(%rax),%r13
1496
+ mov 0x50(%rax),%r14
1497
+ mov 0x48(%rax),%r15
1498
+ lea 0x78(%rax),%rax # adjust stack pointer
1499
+ mov %rbx,144($context) # restore context->Rbx
1500
+ mov %rbp,160($context) # restore context->Rbp
1501
+ mov %r12,216($context) # restore context->R12
1502
+ mov %r13,224($context) # restore context->R13
1503
+ mov %r14,232($context) # restore context->R14
1504
+ mov %r15,240($context) # restore context->R15
1505
+
1506
+ .Lin_prologue:
1507
+ mov %rax,152($context) # restore context->Rsp
1508
+
1509
+ mov 40($disp),%rdi # disp->ContextRecord
1510
+ mov $context,%rsi # context
1511
+ mov \$`1232/8`,%ecx # sizeof(CONTEXT)
1512
+ .long 0xa548f3fc # cld; rep movsq
1513
+
1514
+ mov $disp,%rsi
1515
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1516
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
1517
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
1518
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1519
+ mov 40(%rsi),%r10 # disp->ContextRecord
1520
+ lea 56(%rsi),%r11 # &disp->HandlerData
1521
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
1522
+ mov %r10,32(%rsp) # arg5
1523
+ mov %r11,40(%rsp) # arg6
1524
+ mov %r12,48(%rsp) # arg7
1525
+ mov %rcx,56(%rsp) # arg8, (NULL)
1526
+ call *__imp_RtlVirtualUnwind(%rip)
1527
+
1528
+ mov \$1,%eax # ExceptionContinueSearch
1529
+ add \$64,%rsp
1530
+ popfq
1531
+ pop %r15
1532
+ pop %r14
1533
+ pop %r13
1534
+ pop %r12
1535
+ pop %rbp
1536
+ pop %rbx
1537
+ pop %rdi
1538
+ pop %rsi
1539
+ ret
1540
+ .size se_handler,.-se_handler
1541
+
1542
+ .section .pdata
1543
+ .align 4
1544
+ ___
1545
+ $code.=<<___;
1546
+ .rva .Lctr_enc_prologue
1547
+ .rva .Lctr_enc_epilogue
1548
+ .rva .Lctr_enc_info
1549
+
1550
+ .section .xdata
1551
+ .align 8
1552
+ ___
1553
+ $code.=<<___;
1554
+ .Lctr_enc_info:
1555
+ .byte 9,0,0,0
1556
+ .rva se_handler
1557
+ .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
1558
+ ___
1559
+ }
1560
+
1561
+ $code =~ s/\`([^\`]*)\`/eval($1)/gem;
1562
+
1563
+ print $code;
1564
+
1565
+ close STDOUT;