ring-native 0.0.0 → 0.1.0

Files changed (267)
  1. checksums.yaml +4 -4
  2. data/.gitignore +1 -0
  3. data/CHANGES.md +7 -0
  4. data/Makefile +5 -0
  5. data/README.md +12 -5
  6. data/Rakefile +4 -0
  7. data/ext/ring/extconf.rb +4 -5
  8. data/lib/ring/native.rb +3 -1
  9. data/lib/ring/native/version.rb +5 -1
  10. data/ring-native.gemspec +6 -6
  11. data/vendor/ring-ffi/Cargo.lock +26 -0
  12. data/vendor/ring-ffi/Cargo.toml +45 -0
  13. data/vendor/ring-ffi/LICENSE +16 -0
  14. data/vendor/ring-ffi/README.md +59 -0
  15. data/vendor/ring-ffi/src/lib.rs +79 -0
  16. metadata +10 -255
  17. data/vendor/ring/BUILDING.md +0 -40
  18. data/vendor/ring/Cargo.toml +0 -43
  19. data/vendor/ring/LICENSE +0 -185
  20. data/vendor/ring/Makefile +0 -35
  21. data/vendor/ring/PORTING.md +0 -163
  22. data/vendor/ring/README.md +0 -113
  23. data/vendor/ring/STYLE.md +0 -197
  24. data/vendor/ring/appveyor.yml +0 -27
  25. data/vendor/ring/build.rs +0 -108
  26. data/vendor/ring/crypto/aes/aes.c +0 -1142
  27. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +0 -25
  28. data/vendor/ring/crypto/aes/aes_test.cc +0 -93
  29. data/vendor/ring/crypto/aes/asm/aes-586.pl +0 -2368
  30. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +0 -1249
  31. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +0 -2246
  32. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +0 -1318
  33. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +0 -2084
  34. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +0 -675
  35. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +0 -1364
  36. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +0 -1565
  37. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +0 -841
  38. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +0 -1116
  39. data/vendor/ring/crypto/aes/internal.h +0 -87
  40. data/vendor/ring/crypto/aes/mode_wrappers.c +0 -61
  41. data/vendor/ring/crypto/bn/add.c +0 -394
  42. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +0 -694
  43. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +0 -1503
  44. data/vendor/ring/crypto/bn/asm/bn-586.pl +0 -774
  45. data/vendor/ring/crypto/bn/asm/co-586.pl +0 -287
  46. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +0 -1882
  47. data/vendor/ring/crypto/bn/asm/x86-mont.pl +0 -592
  48. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +0 -599
  49. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +0 -1393
  50. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +0 -3507
  51. data/vendor/ring/crypto/bn/bn.c +0 -352
  52. data/vendor/ring/crypto/bn/bn_asn1.c +0 -74
  53. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +0 -25
  54. data/vendor/ring/crypto/bn/bn_test.cc +0 -1696
  55. data/vendor/ring/crypto/bn/cmp.c +0 -200
  56. data/vendor/ring/crypto/bn/convert.c +0 -433
  57. data/vendor/ring/crypto/bn/ctx.c +0 -311
  58. data/vendor/ring/crypto/bn/div.c +0 -594
  59. data/vendor/ring/crypto/bn/exponentiation.c +0 -1335
  60. data/vendor/ring/crypto/bn/gcd.c +0 -711
  61. data/vendor/ring/crypto/bn/generic.c +0 -1019
  62. data/vendor/ring/crypto/bn/internal.h +0 -316
  63. data/vendor/ring/crypto/bn/montgomery.c +0 -516
  64. data/vendor/ring/crypto/bn/mul.c +0 -888
  65. data/vendor/ring/crypto/bn/prime.c +0 -829
  66. data/vendor/ring/crypto/bn/random.c +0 -334
  67. data/vendor/ring/crypto/bn/rsaz_exp.c +0 -262
  68. data/vendor/ring/crypto/bn/rsaz_exp.h +0 -53
  69. data/vendor/ring/crypto/bn/shift.c +0 -276
  70. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +0 -25
  71. data/vendor/ring/crypto/bytestring/bytestring_test.cc +0 -421
  72. data/vendor/ring/crypto/bytestring/cbb.c +0 -399
  73. data/vendor/ring/crypto/bytestring/cbs.c +0 -227
  74. data/vendor/ring/crypto/bytestring/internal.h +0 -46
  75. data/vendor/ring/crypto/chacha/chacha_generic.c +0 -140
  76. data/vendor/ring/crypto/chacha/chacha_vec.c +0 -323
  77. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +0 -1447
  78. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +0 -153
  79. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +0 -25
  80. data/vendor/ring/crypto/cipher/e_aes.c +0 -390
  81. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +0 -208
  82. data/vendor/ring/crypto/cipher/internal.h +0 -173
  83. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +0 -543
  84. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +0 -9
  85. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +0 -475
  86. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +0 -23
  87. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +0 -422
  88. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +0 -484
  89. data/vendor/ring/crypto/cipher/test/cipher_test.txt +0 -100
  90. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +0 -25
  91. data/vendor/ring/crypto/constant_time_test.c +0 -304
  92. data/vendor/ring/crypto/cpu-arm-asm.S +0 -32
  93. data/vendor/ring/crypto/cpu-arm.c +0 -199
  94. data/vendor/ring/crypto/cpu-intel.c +0 -261
  95. data/vendor/ring/crypto/crypto.c +0 -151
  96. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +0 -2118
  97. data/vendor/ring/crypto/curve25519/curve25519.c +0 -4888
  98. data/vendor/ring/crypto/curve25519/x25519_test.cc +0 -128
  99. data/vendor/ring/crypto/digest/md32_common.h +0 -181
  100. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +0 -2725
  101. data/vendor/ring/crypto/ec/ec.c +0 -193
  102. data/vendor/ring/crypto/ec/ec_curves.c +0 -61
  103. data/vendor/ring/crypto/ec/ec_key.c +0 -228
  104. data/vendor/ring/crypto/ec/ec_montgomery.c +0 -114
  105. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +0 -25
  106. data/vendor/ring/crypto/ec/internal.h +0 -243
  107. data/vendor/ring/crypto/ec/oct.c +0 -253
  108. data/vendor/ring/crypto/ec/p256-64.c +0 -1794
  109. data/vendor/ring/crypto/ec/p256-x86_64-table.h +0 -9548
  110. data/vendor/ring/crypto/ec/p256-x86_64.c +0 -509
  111. data/vendor/ring/crypto/ec/simple.c +0 -1007
  112. data/vendor/ring/crypto/ec/util-64.c +0 -183
  113. data/vendor/ring/crypto/ec/wnaf.c +0 -508
  114. data/vendor/ring/crypto/ecdh/ecdh.c +0 -155
  115. data/vendor/ring/crypto/ecdsa/ecdsa.c +0 -304
  116. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +0 -193
  117. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +0 -25
  118. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +0 -327
  119. data/vendor/ring/crypto/header_removed.h +0 -17
  120. data/vendor/ring/crypto/internal.h +0 -495
  121. data/vendor/ring/crypto/libring.Windows.vcxproj +0 -101
  122. data/vendor/ring/crypto/mem.c +0 -98
  123. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +0 -1045
  124. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +0 -517
  125. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +0 -1393
  126. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +0 -1741
  127. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +0 -422
  128. data/vendor/ring/crypto/modes/ctr.c +0 -226
  129. data/vendor/ring/crypto/modes/gcm.c +0 -1206
  130. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +0 -25
  131. data/vendor/ring/crypto/modes/gcm_test.c +0 -348
  132. data/vendor/ring/crypto/modes/internal.h +0 -299
  133. data/vendor/ring/crypto/perlasm/arm-xlate.pl +0 -170
  134. data/vendor/ring/crypto/perlasm/readme +0 -100
  135. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +0 -1164
  136. data/vendor/ring/crypto/perlasm/x86asm.pl +0 -292
  137. data/vendor/ring/crypto/perlasm/x86gas.pl +0 -263
  138. data/vendor/ring/crypto/perlasm/x86masm.pl +0 -200
  139. data/vendor/ring/crypto/perlasm/x86nasm.pl +0 -187
  140. data/vendor/ring/crypto/poly1305/poly1305.c +0 -331
  141. data/vendor/ring/crypto/poly1305/poly1305_arm.c +0 -301
  142. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +0 -2015
  143. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +0 -25
  144. data/vendor/ring/crypto/poly1305/poly1305_test.cc +0 -80
  145. data/vendor/ring/crypto/poly1305/poly1305_test.txt +0 -52
  146. data/vendor/ring/crypto/poly1305/poly1305_vec.c +0 -892
  147. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +0 -75
  148. data/vendor/ring/crypto/rand/internal.h +0 -32
  149. data/vendor/ring/crypto/rand/rand.c +0 -189
  150. data/vendor/ring/crypto/rand/urandom.c +0 -219
  151. data/vendor/ring/crypto/rand/windows.c +0 -56
  152. data/vendor/ring/crypto/refcount_c11.c +0 -66
  153. data/vendor/ring/crypto/refcount_lock.c +0 -53
  154. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +0 -25
  155. data/vendor/ring/crypto/refcount_test.c +0 -58
  156. data/vendor/ring/crypto/rsa/blinding.c +0 -462
  157. data/vendor/ring/crypto/rsa/internal.h +0 -108
  158. data/vendor/ring/crypto/rsa/padding.c +0 -300
  159. data/vendor/ring/crypto/rsa/rsa.c +0 -450
  160. data/vendor/ring/crypto/rsa/rsa_asn1.c +0 -261
  161. data/vendor/ring/crypto/rsa/rsa_impl.c +0 -944
  162. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +0 -25
  163. data/vendor/ring/crypto/rsa/rsa_test.cc +0 -437
  164. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +0 -436
  165. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +0 -2390
  166. data/vendor/ring/crypto/sha/asm/sha256-586.pl +0 -1275
  167. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +0 -735
  168. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +0 -14
  169. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +0 -14
  170. data/vendor/ring/crypto/sha/asm/sha512-586.pl +0 -911
  171. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +0 -666
  172. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +0 -14
  173. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +0 -14
  174. data/vendor/ring/crypto/sha/sha1.c +0 -271
  175. data/vendor/ring/crypto/sha/sha256.c +0 -204
  176. data/vendor/ring/crypto/sha/sha512.c +0 -355
  177. data/vendor/ring/crypto/test/file_test.cc +0 -326
  178. data/vendor/ring/crypto/test/file_test.h +0 -181
  179. data/vendor/ring/crypto/test/malloc.cc +0 -150
  180. data/vendor/ring/crypto/test/scoped_types.h +0 -95
  181. data/vendor/ring/crypto/test/test.Windows.vcxproj +0 -35
  182. data/vendor/ring/crypto/test/test_util.cc +0 -46
  183. data/vendor/ring/crypto/test/test_util.h +0 -41
  184. data/vendor/ring/crypto/thread_none.c +0 -55
  185. data/vendor/ring/crypto/thread_pthread.c +0 -165
  186. data/vendor/ring/crypto/thread_test.Windows.vcxproj +0 -25
  187. data/vendor/ring/crypto/thread_test.c +0 -200
  188. data/vendor/ring/crypto/thread_win.c +0 -282
  189. data/vendor/ring/examples/checkdigest.rs +0 -103
  190. data/vendor/ring/include/openssl/aes.h +0 -121
  191. data/vendor/ring/include/openssl/arm_arch.h +0 -129
  192. data/vendor/ring/include/openssl/base.h +0 -156
  193. data/vendor/ring/include/openssl/bn.h +0 -794
  194. data/vendor/ring/include/openssl/buffer.h +0 -18
  195. data/vendor/ring/include/openssl/bytestring.h +0 -235
  196. data/vendor/ring/include/openssl/chacha.h +0 -37
  197. data/vendor/ring/include/openssl/cmac.h +0 -76
  198. data/vendor/ring/include/openssl/cpu.h +0 -184
  199. data/vendor/ring/include/openssl/crypto.h +0 -43
  200. data/vendor/ring/include/openssl/curve25519.h +0 -88
  201. data/vendor/ring/include/openssl/ec.h +0 -225
  202. data/vendor/ring/include/openssl/ec_key.h +0 -129
  203. data/vendor/ring/include/openssl/ecdh.h +0 -110
  204. data/vendor/ring/include/openssl/ecdsa.h +0 -156
  205. data/vendor/ring/include/openssl/err.h +0 -201
  206. data/vendor/ring/include/openssl/mem.h +0 -101
  207. data/vendor/ring/include/openssl/obj_mac.h +0 -71
  208. data/vendor/ring/include/openssl/opensslfeatures.h +0 -68
  209. data/vendor/ring/include/openssl/opensslv.h +0 -18
  210. data/vendor/ring/include/openssl/ossl_typ.h +0 -18
  211. data/vendor/ring/include/openssl/poly1305.h +0 -51
  212. data/vendor/ring/include/openssl/rand.h +0 -70
  213. data/vendor/ring/include/openssl/rsa.h +0 -399
  214. data/vendor/ring/include/openssl/thread.h +0 -133
  215. data/vendor/ring/include/openssl/type_check.h +0 -71
  216. data/vendor/ring/mk/Common.props +0 -63
  217. data/vendor/ring/mk/Windows.props +0 -42
  218. data/vendor/ring/mk/WindowsTest.props +0 -18
  219. data/vendor/ring/mk/appveyor.bat +0 -62
  220. data/vendor/ring/mk/bottom_of_makefile.mk +0 -54
  221. data/vendor/ring/mk/ring.mk +0 -266
  222. data/vendor/ring/mk/top_of_makefile.mk +0 -214
  223. data/vendor/ring/mk/travis.sh +0 -40
  224. data/vendor/ring/mk/update-travis-yml.py +0 -229
  225. data/vendor/ring/ring.sln +0 -153
  226. data/vendor/ring/src/aead.rs +0 -682
  227. data/vendor/ring/src/agreement.rs +0 -248
  228. data/vendor/ring/src/c.rs +0 -129
  229. data/vendor/ring/src/constant_time.rs +0 -37
  230. data/vendor/ring/src/der.rs +0 -96
  231. data/vendor/ring/src/digest.rs +0 -690
  232. data/vendor/ring/src/digest_tests.txt +0 -57
  233. data/vendor/ring/src/ecc.rs +0 -28
  234. data/vendor/ring/src/ecc_build.rs +0 -279
  235. data/vendor/ring/src/ecc_curves.rs +0 -117
  236. data/vendor/ring/src/ed25519_tests.txt +0 -2579
  237. data/vendor/ring/src/exe_tests.rs +0 -46
  238. data/vendor/ring/src/ffi.rs +0 -29
  239. data/vendor/ring/src/file_test.rs +0 -187
  240. data/vendor/ring/src/hkdf.rs +0 -153
  241. data/vendor/ring/src/hkdf_tests.txt +0 -59
  242. data/vendor/ring/src/hmac.rs +0 -414
  243. data/vendor/ring/src/hmac_tests.txt +0 -97
  244. data/vendor/ring/src/input.rs +0 -312
  245. data/vendor/ring/src/lib.rs +0 -41
  246. data/vendor/ring/src/pbkdf2.rs +0 -265
  247. data/vendor/ring/src/pbkdf2_tests.txt +0 -113
  248. data/vendor/ring/src/polyfill.rs +0 -57
  249. data/vendor/ring/src/rand.rs +0 -28
  250. data/vendor/ring/src/signature.rs +0 -314
  251. data/vendor/ring/third-party/NIST/README.md +0 -9
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +0 -263
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +0 -309
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +0 -267
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +0 -263
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +0 -309
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +0 -267
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +0 -263
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +0 -309
  260. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +0 -267
  261. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +0 -519
  262. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +0 -309
  263. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +0 -523
  264. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +0 -519
  265. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +0 -309
  266. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +0 -523
  267. data/vendor/ring/third-party/NIST/sha256sums.txt +0 -1
data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +0 -2084
@@ -1,2084 +0,0 @@
- #!/usr/bin/env perl
- #
- # ====================================================================
- # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
- # project. The module is, however, dual licensed under OpenSSL and
- # CRYPTOGAMS licenses depending on where you obtain it. For further
- # details see http://www.openssl.org/~appro/cryptogams/.
- # ====================================================================
- #
- # This module implements support for Intel AES-NI extension. In
- # OpenSSL context it's used with Intel engine, but can also be used as
- # drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
- # details].
- #
- # Performance.
- #
- # Given aes(enc|dec) instructions' latency asymptotic performance for
- # non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
- # processed with 128-bit key. And given their throughput asymptotic
- # performance for parallelizable modes is 1.25 cycles per byte. Being
- # asymptotic limit it's not something you commonly achieve in reality,
- # but how close does one get? Below are results collected for
- # different modes and block sized. Pairs of numbers are for en-/
- # decryption.
- #
- # 16-byte 64-byte 256-byte 1-KB 8-KB
- # ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
- # CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
- # CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
- # CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
- # OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
- # CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
- #
- # ECB, CTR, CBC and CCM results are free from EVP overhead. This means
- # that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
- # [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
- # The results were collected with specially crafted speed.c benchmark
- # in order to compare them with results reported in "Intel Advanced
- # Encryption Standard (AES) New Instruction Set" White Paper Revision
- # 3.0 dated May 2010. All above results are consistently better. This
- # module also provides better performance for block sizes smaller than
- # 128 bytes in points *not* represented in the above table.
- #
- # Looking at the results for 8-KB buffer.
- #
- # CFB and OFB results are far from the limit, because implementation
- # uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
- # single-block aesni_encrypt, which is not the most optimal way to go.
- # CBC encrypt result is unexpectedly high and there is no documented
- # explanation for it. Seemingly there is a small penalty for feeding
- # the result back to AES unit the way it's done in CBC mode. There is
- # nothing one can do and the result appears optimal. CCM result is
- # identical to CBC, because CBC-MAC is essentially CBC encrypt without
- # saving output. CCM CTR "stays invisible," because it's neatly
- # interleaved wih CBC-MAC. This provides ~30% improvement over
- # "straghtforward" CCM implementation with CTR and CBC-MAC performed
- # disjointly. Parallelizable modes practically achieve the theoretical
- # limit.
- #
- # Looking at how results vary with buffer size.
- #
- # Curves are practically saturated at 1-KB buffer size. In most cases
- # "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
- # CTR curve doesn't follow this pattern and is "slowest" changing one
- # with "256-byte" result being 87% of "8-KB." This is because overhead
- # in CTR mode is most computationally intensive. Small-block CCM
- # decrypt is slower than encrypt, because first CTR and last CBC-MAC
- # iterations can't be interleaved.
- #
- # Results for 192- and 256-bit keys.
- #
- # EVP-free results were observed to scale perfectly with number of
- # rounds for larger block sizes, i.e. 192-bit result being 10/12 times
- # lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
- # are a tad smaller, because the above mentioned penalty biases all
- # results by same constant value. In similar way function call
- # overhead affects small-block performance, as well as OFB and CFB
- # results. Differences are not large, most common coefficients are
- # 10/11.7 and 10/13.4 (as opposite to 10/12.0 and 10/14.0), but one
- # observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
-
- # January 2011
- #
- # While Westmere processor features 6 cycles latency for aes[enc|dec]
- # instructions, which can be scheduled every second cycle, Sandy
- # Bridge spends 8 cycles per instruction, but it can schedule them
- # every cycle. This means that code targeting Westmere would perform
- # suboptimally on Sandy Bridge. Therefore this update.
- #
- # In addition, non-parallelizable CBC encrypt (as well as CCM) is
- # optimized. Relative improvement might appear modest, 8% on Westmere,
- # but in absolute terms it's 3.77 cycles per byte encrypted with
- # 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
- # should be compared to asymptotic limits of 3.75 for Westmere and
- # 5.00 for Sandy Bridge. Actually, the fact that they get this close
- # to asymptotic limits is quite amazing. Indeed, the limit is
- # calculated as latency times number of rounds, 10 for 128-bit key,
- # and divided by 16, the number of bytes in block, or in other words
- # it accounts *solely* for aesenc instructions. But there are extra
- # instructions, and numbers so close to the asymptotic limits mean
- # that it's as if it takes as little as *one* additional cycle to
- # execute all of them. How is it possible? It is possible thanks to
- # out-of-order execution logic, which manages to overlap post-
- # processing of previous block, things like saving the output, with
- # actual encryption of current block, as well as pre-processing of
- # current block, things like fetching input and xor-ing it with
- # 0-round element of the key schedule, with actual encryption of
- # previous block. Keep this in mind...
- #
- # For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
- # performance is achieved by interleaving instructions working on
- # independent blocks. In which case asymptotic limit for such modes
- # can be obtained by dividing above mentioned numbers by AES
- # instructions' interleave factor. Westmere can execute at most 3
- # instructions at a time, meaning that optimal interleave factor is 3,
- # and that's where the "magic" number of 1.25 come from. "Optimal
- # interleave factor" means that increase of interleave factor does
- # not improve performance. The formula has proven to reflect reality
- # pretty well on Westmere... Sandy Bridge on the other hand can
- # execute up to 8 AES instructions at a time, so how does varying
- # interleave factor affect the performance? Here is table for ECB
- # (numbers are cycles per byte processed with 128-bit key):
- #
- # instruction interleave factor 3x 6x 8x
- # theoretical asymptotic limit 1.67 0.83 0.625
- # measured performance for 8KB block 1.05 0.86 0.84
- #
- # "as if" interleave factor 4.7x 5.8x 6.0x
- #
- # Further data for other parallelizable modes:
- #
- # CBC decrypt 1.16 0.93 0.74
- # CTR 1.14 0.91 0.74
- #
- # Well, given 3x column it's probably inappropriate to call the limit
- # asymptotic, if it can be surpassed, isn't it? What happens there?
- # Rewind to CBC paragraph for the answer. Yes, out-of-order execution
- # magic is responsible for this. Processor overlaps not only the
- # additional instructions with AES ones, but even AES instuctions
- # processing adjacent triplets of independent blocks. In the 6x case
- # additional instructions still claim disproportionally small amount
- # of additional cycles, but in 8x case number of instructions must be
- # a tad too high for out-of-order logic to cope with, and AES unit
- # remains underutilized... As you can see 8x interleave is hardly
- # justifiable, so there no need to feel bad that 32-bit aesni-x86.pl
- # utilizies 6x interleave because of limited register bank capacity.
- #
- # Higher interleave factors do have negative impact on Westmere
- # performance. While for ECB mode it's negligible ~1.5%, other
- # parallelizables perform ~5% worse, which is outweighed by ~25%
- # improvement on Sandy Bridge. To balance regression on Westmere
- # CTR mode was implemented with 6x aesenc interleave factor.
-
- # April 2011
- #
- # Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
- # one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
- # in CTR mode AES instruction interleave factor was chosen to be 6x.
-
- ######################################################################
- # Current large-block performance in cycles per byte processed with
- # 128-bit key (less is better).
- #
- # CBC en-/decrypt CTR XTS ECB
- # Westmere 3.77/1.25 1.25 1.25 1.26
- # * Bridge 5.07/0.74 0.75 0.90 0.85
- # Haswell 4.44/0.63 0.63 0.73 0.63
- # Silvermont 5.75/3.54 3.56 4.12 3.87(*)
- # Bulldozer 5.77/0.70 0.72 0.90 0.70
- #
- # (*) Atom Silvermont ECB result is suboptimal because of penalties
- # incurred by operations on %xmm8-15. As ECB is not considered
- # critical, nothing was done to mitigate the problem.
-
- $PREFIX="aesni"; # if $PREFIX is set to "AES", the script
- # generates drop-in replacement for
- # crypto/aes/asm/aes-x86_64.pl:-)
-
- $flavour = shift;
- $output = shift;
- if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-
- $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
-
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
- die "can't locate x86_64-xlate.pl";
-
- open OUT,"| \"$^X\" $xlate $flavour $output";
- *STDOUT=*OUT;
-
- $movkey = $PREFIX eq "aesni" ? "movups" : "movups";
- @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
- ("%rdi","%rsi","%rdx","%rcx"); # Unix order
-
- $code=".text\n";
- $code.=".extern OPENSSL_ia32cap_P\n";
-
- $rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
- # this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
- $inp="%rdi";
- $out="%rsi";
- $len="%rdx";
- $key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
- $ivp="%r8"; # cbc, ctr, ...
-
- $rnds_="%r10d"; # backup copy for $rounds
- $key_="%r11"; # backup copy for $key
-
- # %xmm register layout
- $rndkey0="%xmm0"; $rndkey1="%xmm1";
- $inout0="%xmm2"; $inout1="%xmm3";
- $inout2="%xmm4"; $inout3="%xmm5";
- $inout4="%xmm6"; $inout5="%xmm7";
- $inout6="%xmm8"; $inout7="%xmm9";
-
- $in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
- $in0="%xmm8"; $iv="%xmm9";
-
- # Inline version of internal aesni_[en|de]crypt1.
- #
- # Why folded loop? Because aes[enc|dec] is slow enough to accommodate
- # cycles which take care of loop variables...
- { my $sn;
- sub aesni_generate1 {
- my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
- ++$sn;
- $code.=<<___;
- $movkey ($key),$rndkey0
- $movkey 16($key),$rndkey1
- ___
- $code.=<<___ if (defined($ivec));
- xorps $rndkey0,$ivec
- lea 32($key),$key
- xorps $ivec,$inout
- ___
- $code.=<<___ if (!defined($ivec));
- lea 32($key),$key
- xorps $rndkey0,$inout
- ___
- $code.=<<___;
- .Loop_${p}1_$sn:
- aes${p} $rndkey1,$inout
- dec $rounds
- $movkey ($key),$rndkey1
- lea 16($key),$key
- jnz .Loop_${p}1_$sn # loop body is 16 bytes
- aes${p}last $rndkey1,$inout
- ___
- }}
- # void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
- #
- { my ($inp,$out,$key) = @_4args;
-
- $code.=<<___;
- .globl ${PREFIX}_encrypt
- .type ${PREFIX}_encrypt,\@abi-omnipotent
- .align 16
- ${PREFIX}_encrypt:
- movups ($inp),$inout0 # load input
- mov 240($key),$rounds # key->rounds
- ___
- &aesni_generate1("enc",$key,$rounds);
- $code.=<<___;
- pxor $rndkey0,$rndkey0 # clear register bank
- pxor $rndkey1,$rndkey1
- movups $inout0,($out) # output
- pxor $inout0,$inout0
- ret
- .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
-
- .globl ${PREFIX}_decrypt
- .type ${PREFIX}_decrypt,\@abi-omnipotent
- .align 16
- ${PREFIX}_decrypt:
- movups ($inp),$inout0 # load input
- mov 240($key),$rounds # key->rounds
- ___
- &aesni_generate1("dec",$key,$rounds);
- $code.=<<___;
- pxor $rndkey0,$rndkey0 # clear register bank
- pxor $rndkey1,$rndkey1
- movups $inout0,($out) # output
- pxor $inout0,$inout0
- ret
- .size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
- ___
- }
-
- # _aesni_[en|de]cryptN are private interfaces, N denotes interleave
- # factor. Why 3x subroutine were originally used in loops? Even though
- # aes[enc|dec] latency was originally 6, it could be scheduled only
- # every *2nd* cycle. Thus 3x interleave was the one providing optimal
- # utilization, i.e. when subroutine's throughput is virtually same as
- # of non-interleaved subroutine [for number of input blocks up to 3].
- # This is why it originally made no sense to implement 2x subroutine.
- # But times change and it became appropriate to spend extra 192 bytes
- # on 2x subroutine on Atom Silvermont account. For processors that
- # can schedule aes[enc|dec] every cycle optimal interleave factor
- # equals to corresponding instructions latency. 8x is optimal for
- # * Bridge and "super-optimal" for other Intel CPUs...
-
- sub aesni_generate2 {
- my $dir=shift;
- # As already mentioned it takes in $key and $rounds, which are *not*
- # preserved. $inout[0-1] is cipher/clear text...
- $code.=<<___;
- .type _aesni_${dir}rypt2,\@abi-omnipotent
- .align 16
- _aesni_${dir}rypt2:
- $movkey ($key),$rndkey0
- shl \$4,$rounds
- $movkey 16($key),$rndkey1
- xorps $rndkey0,$inout0
- xorps $rndkey0,$inout1
- $movkey 32($key),$rndkey0
- lea 32($key,$rounds),$key
- neg %rax # $rounds
- add \$16,%rax
-
- .L${dir}_loop2:
- aes${dir} $rndkey1,$inout0
- aes${dir} $rndkey1,$inout1
- $movkey ($key,%rax),$rndkey1
- add \$32,%rax
- aes${dir} $rndkey0,$inout0
- aes${dir} $rndkey0,$inout1
- $movkey -16($key,%rax),$rndkey0
- jnz .L${dir}_loop2
-
- aes${dir} $rndkey1,$inout0
- aes${dir} $rndkey1,$inout1
- aes${dir}last $rndkey0,$inout0
- aes${dir}last $rndkey0,$inout1
- ret
- .size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2
- ___
- }
- sub aesni_generate3 {
- my $dir=shift;
- # As already mentioned it takes in $key and $rounds, which are *not*
- # preserved. $inout[0-2] is cipher/clear text...
- $code.=<<___;
- .type _aesni_${dir}rypt3,\@abi-omnipotent
- .align 16
- _aesni_${dir}rypt3:
- $movkey ($key),$rndkey0
- shl \$4,$rounds
- $movkey 16($key),$rndkey1
- xorps $rndkey0,$inout0
- xorps $rndkey0,$inout1
- xorps $rndkey0,$inout2
- $movkey 32($key),$rndkey0
- lea 32($key,$rounds),$key
- neg %rax # $rounds
- add \$16,%rax
-
- .L${dir}_loop3:
- aes${dir} $rndkey1,$inout0
- aes${dir} $rndkey1,$inout1
- aes${dir} $rndkey1,$inout2
- $movkey ($key,%rax),$rndkey1
- add \$32,%rax
- aes${dir} $rndkey0,$inout0
- aes${dir} $rndkey0,$inout1
- aes${dir} $rndkey0,$inout2
- $movkey -16($key,%rax),$rndkey0
- jnz .L${dir}_loop3
-
- aes${dir} $rndkey1,$inout0
- aes${dir} $rndkey1,$inout1
- aes${dir} $rndkey1,$inout2
- aes${dir}last $rndkey0,$inout0
- aes${dir}last $rndkey0,$inout1
- aes${dir}last $rndkey0,$inout2
- ret
- .size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
- ___
- }
- # 4x interleave is implemented to improve small block performance,
- # most notably [and naturally] 4 block by ~30%. One can argue that one
- # should have implemented 5x as well, but improvement would be <20%,
- # so it's not worth it...
- sub aesni_generate4 {
- my $dir=shift;
- # As already mentioned it takes in $key and $rounds, which are *not*
- # preserved. $inout[0-3] is cipher/clear text...
- $code.=<<___;
- .type _aesni_${dir}rypt4,\@abi-omnipotent
- .align 16
- _aesni_${dir}rypt4:
- $movkey ($key),$rndkey0
- shl \$4,$rounds
- $movkey 16($key),$rndkey1
- xorps $rndkey0,$inout0
- xorps $rndkey0,$inout1
- xorps $rndkey0,$inout2
- xorps $rndkey0,$inout3
- $movkey 32($key),$rndkey0
- lea 32($key,$rounds),$key
- neg %rax # $rounds
- .byte 0x0f,0x1f,0x00
- add \$16,%rax
-
- .L${dir}_loop4:
- aes${dir} $rndkey1,$inout0
- aes${dir} $rndkey1,$inout1
- aes${dir} $rndkey1,$inout2
- aes${dir} $rndkey1,$inout3
- $movkey ($key,%rax),$rndkey1
- add \$32,%rax
- aes${dir} $rndkey0,$inout0
- aes${dir} $rndkey0,$inout1
- aes${dir} $rndkey0,$inout2
- aes${dir} $rndkey0,$inout3
- $movkey -16($key,%rax),$rndkey0
- jnz .L${dir}_loop4
-
- aes${dir} $rndkey1,$inout0
- aes${dir} $rndkey1,$inout1
- aes${dir} $rndkey1,$inout2
- aes${dir} $rndkey1,$inout3
- aes${dir}last $rndkey0,$inout0
- aes${dir}last $rndkey0,$inout1
- aes${dir}last $rndkey0,$inout2
- aes${dir}last $rndkey0,$inout3
- ret
- .size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
- ___
- }
- sub aesni_generate6 {
- my $dir=shift;
- # As already mentioned it takes in $key and $rounds, which are *not*
- # preserved. $inout[0-5] is cipher/clear text...
- $code.=<<___;
- .type _aesni_${dir}rypt6,\@abi-omnipotent
- .align 16
- _aesni_${dir}rypt6:
- $movkey ($key),$rndkey0
- shl \$4,$rounds
- $movkey 16($key),$rndkey1
- xorps $rndkey0,$inout0
- pxor $rndkey0,$inout1
- pxor $rndkey0,$inout2
- aes${dir} $rndkey1,$inout0
- lea 32($key,$rounds),$key
- neg %rax # $rounds
- aes${dir} $rndkey1,$inout1
- pxor $rndkey0,$inout3
- pxor $rndkey0,$inout4
- aes${dir} $rndkey1,$inout2
- pxor $rndkey0,$inout5
- $movkey ($key,%rax),$rndkey0
- add \$16,%rax
- jmp .L${dir}_loop6_enter
- .align 16
- .L${dir}_loop6:
- aes${dir} $rndkey1,$inout0
- aes${dir} $rndkey1,$inout1
- aes${dir} $rndkey1,$inout2
- .L${dir}_loop6_enter:
- aes${dir} $rndkey1,$inout3
- aes${dir} $rndkey1,$inout4
- aes${dir} $rndkey1,$inout5
- $movkey ($key,%rax),$rndkey1
- add \$32,%rax
- aes${dir} $rndkey0,$inout0
- aes${dir} $rndkey0,$inout1
- aes${dir} $rndkey0,$inout2
- aes${dir} $rndkey0,$inout3
- aes${dir} $rndkey0,$inout4
- aes${dir} $rndkey0,$inout5
- $movkey -16($key,%rax),$rndkey0
- jnz .L${dir}_loop6
-
- aes${dir} $rndkey1,$inout0
- aes${dir} $rndkey1,$inout1
- aes${dir} $rndkey1,$inout2
- aes${dir} $rndkey1,$inout3
- aes${dir} $rndkey1,$inout4
- aes${dir} $rndkey1,$inout5
- aes${dir}last $rndkey0,$inout0
- aes${dir}last $rndkey0,$inout1
- aes${dir}last $rndkey0,$inout2
- aes${dir}last $rndkey0,$inout3
- aes${dir}last $rndkey0,$inout4
- aes${dir}last $rndkey0,$inout5
- ret
- .size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
- ___
- }
- sub aesni_generate8 {
- my $dir=shift;
- # As already mentioned it takes in $key and $rounds, which are *not*
- # preserved. $inout[0-7] is cipher/clear text...
- $code.=<<___;
- .type _aesni_${dir}rypt8,\@abi-omnipotent
- .align 16
- _aesni_${dir}rypt8:
- $movkey ($key),$rndkey0
- shl \$4,$rounds
- $movkey 16($key),$rndkey1
- xorps $rndkey0,$inout0
- xorps $rndkey0,$inout1
- pxor $rndkey0,$inout2
- pxor $rndkey0,$inout3
- pxor $rndkey0,$inout4
- lea 32($key,$rounds),$key
- neg %rax # $rounds
- aes${dir} $rndkey1,$inout0
- pxor $rndkey0,$inout5
- pxor $rndkey0,$inout6
- aes${dir} $rndkey1,$inout1
- pxor $rndkey0,$inout7
- $movkey ($key,%rax),$rndkey0
- add \$16,%rax
- jmp .L${dir}_loop8_inner
- .align 16
- .L${dir}_loop8:
- aes${dir} $rndkey1,$inout0
- aes${dir} $rndkey1,$inout1
- .L${dir}_loop8_inner:
- aes${dir} $rndkey1,$inout2
- aes${dir} $rndkey1,$inout3
- aes${dir} $rndkey1,$inout4
- aes${dir} $rndkey1,$inout5
- aes${dir} $rndkey1,$inout6
- aes${dir} $rndkey1,$inout7
- .L${dir}_loop8_enter:
- $movkey ($key,%rax),$rndkey1
- add \$32,%rax
- aes${dir} $rndkey0,$inout0
- aes${dir} $rndkey0,$inout1
- aes${dir} $rndkey0,$inout2
- aes${dir} $rndkey0,$inout3
- aes${dir} $rndkey0,$inout4
- aes${dir} $rndkey0,$inout5
- aes${dir} $rndkey0,$inout6
- aes${dir} $rndkey0,$inout7
- $movkey -16($key,%rax),$rndkey0
- jnz .L${dir}_loop8
-
- aes${dir} $rndkey1,$inout0
- aes${dir} $rndkey1,$inout1
- aes${dir} $rndkey1,$inout2
- aes${dir} $rndkey1,$inout3
- aes${dir} $rndkey1,$inout4
- aes${dir} $rndkey1,$inout5
- aes${dir} $rndkey1,$inout6
- aes${dir} $rndkey1,$inout7
- aes${dir}last $rndkey0,$inout0
- aes${dir}last $rndkey0,$inout1
- aes${dir}last $rndkey0,$inout2
- aes${dir}last $rndkey0,$inout3
- aes${dir}last $rndkey0,$inout4
- aes${dir}last $rndkey0,$inout5
- aes${dir}last $rndkey0,$inout6
- aes${dir}last $rndkey0,$inout7
- ret
- .size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
- ___
- }
- &aesni_generate2("enc") if ($PREFIX eq "aesni");
- &aesni_generate2("dec");
- &aesni_generate3("enc") if ($PREFIX eq "aesni");
- &aesni_generate3("dec");
- &aesni_generate4("enc") if ($PREFIX eq "aesni");
- &aesni_generate4("dec");
- &aesni_generate6("enc") if ($PREFIX eq "aesni");
- &aesni_generate6("dec");
- &aesni_generate8("enc") if ($PREFIX eq "aesni");
- &aesni_generate8("dec");
-
- if ($PREFIX eq "aesni") {
- {
- ######################################################################
- # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
- # size_t blocks, const AES_KEY *key,
- # const char *ivec,char *cmac);
- #
- # Handles only complete blocks, operates on 64-bit counter and
- # does not update *ivec! Nor does it finalize CMAC value
- # (see engine/eng_aesni.c for details)
- #
- {
- my $cmac="%r9"; # 6th argument
-
- my $increment="%xmm9";
- my $iv="%xmm6";
- my $bswap_mask="%xmm7";
-
- $code.=<<___;
- .globl aesni_ccm64_encrypt_blocks
- .type aesni_ccm64_encrypt_blocks,\@function,6
- .align 16
- aesni_ccm64_encrypt_blocks:
- ___
- $code.=<<___ if ($win64);
- lea -0x58(%rsp),%rsp
- movaps %xmm6,(%rsp) # $iv
- movaps %xmm7,0x10(%rsp) # $bswap_mask
- movaps %xmm8,0x20(%rsp) # $in0
- movaps %xmm9,0x30(%rsp) # $increment
- .Lccm64_enc_body:
- ___
- $code.=<<___;
- mov 240($key),$rounds # key->rounds
- movdqu ($ivp),$iv
- movdqa .Lincrement64(%rip),$increment
- movdqa .Lbswap_mask(%rip),$bswap_mask
-
- shl \$4,$rounds
- mov \$16,$rnds_
- lea 0($key),$key_
- movdqu ($cmac),$inout1
- movdqa $iv,$inout0
- lea 32($key,$rounds),$key # end of key schedule
- pshufb $bswap_mask,$iv
- sub %rax,%r10 # twisted $rounds
- jmp .Lccm64_enc_outer
- .align 16
- .Lccm64_enc_outer:
- $movkey ($key_),$rndkey0
- mov %r10,%rax
- movups ($inp),$in0 # load inp
-
- xorps $rndkey0,$inout0 # counter
- $movkey 16($key_),$rndkey1
- xorps $in0,$rndkey0
- xorps $rndkey0,$inout1 # cmac^=inp
- $movkey 32($key_),$rndkey0
-
- .Lccm64_enc2_loop:
- aesenc $rndkey1,$inout0
- aesenc $rndkey1,$inout1
- $movkey ($key,%rax),$rndkey1
- add \$32,%rax
- aesenc $rndkey0,$inout0
- aesenc $rndkey0,$inout1
- $movkey -16($key,%rax),$rndkey0
- jnz .Lccm64_enc2_loop
- aesenc $rndkey1,$inout0
- aesenc $rndkey1,$inout1
- paddq $increment,$iv
- dec $len # $len-- ($len is in blocks)
- aesenclast $rndkey0,$inout0
- aesenclast $rndkey0,$inout1
-
- lea 16($inp),$inp
- xorps $inout0,$in0 # inp ^= E(iv)
- movdqa $iv,$inout0
- movups $in0,($out) # save output
- pshufb $bswap_mask,$inout0
- lea 16($out),$out # $out+=16
- jnz .Lccm64_enc_outer # loop if ($len!=0)
-
- pxor $rndkey0,$rndkey0 # clear register bank
- pxor $rndkey1,$rndkey1
- pxor $inout0,$inout0
- movups $inout1,($cmac) # store resulting mac
- pxor $inout1,$inout1
- pxor $in0,$in0
- pxor $iv,$iv
- ___
- $code.=<<___ if ($win64);
- movaps (%rsp),%xmm6
- movaps %xmm0,(%rsp) # clear stack
- movaps 0x10(%rsp),%xmm7
- movaps %xmm0,0x10(%rsp)
- movaps 0x20(%rsp),%xmm8
- movaps %xmm0,0x20(%rsp)
- movaps 0x30(%rsp),%xmm9
- movaps %xmm0,0x30(%rsp)
- lea 0x58(%rsp),%rsp
- .Lccm64_enc_ret:
- ___
- $code.=<<___;
- ret
- .size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
- ___
- ######################################################################
- $code.=<<___;
- .globl aesni_ccm64_decrypt_blocks
- .type aesni_ccm64_decrypt_blocks,\@function,6
- .align 16
- aesni_ccm64_decrypt_blocks:
- ___
- $code.=<<___ if ($win64);
- lea -0x58(%rsp),%rsp
- movaps %xmm6,(%rsp) # $iv
- movaps %xmm7,0x10(%rsp) # $bswap_mask
- movaps %xmm8,0x20(%rsp) # $in8
- movaps %xmm9,0x30(%rsp) # $increment
- .Lccm64_dec_body:
- ___
- $code.=<<___;
- mov 240($key),$rounds # key->rounds
- movups ($ivp),$iv
- movdqu ($cmac),$inout1
- movdqa .Lincrement64(%rip),$increment
- movdqa .Lbswap_mask(%rip),$bswap_mask
-
- movaps $iv,$inout0
- mov $rounds,$rnds_
- mov $key,$key_
- pshufb $bswap_mask,$iv
- ___
- &aesni_generate1("enc",$key,$rounds);
- $code.=<<___;
- shl \$4,$rnds_
- mov \$16,$rounds
- movups ($inp),$in0 # load inp
- paddq $increment,$iv
- lea 16($inp),$inp # $inp+=16
- sub %r10,%rax # twisted $rounds
- lea 32($key_,$rnds_),$key # end of key schedule
- mov %rax,%r10
- jmp .Lccm64_dec_outer
- .align 16
- .Lccm64_dec_outer:
- xorps $inout0,$in0 # inp ^= E(iv)
- movdqa $iv,$inout0
- movups $in0,($out) # save output
- lea 16($out),$out # $out+=16
- pshufb $bswap_mask,$inout0
-
- sub \$1,$len # $len-- ($len is in blocks)
- jz .Lccm64_dec_break # if ($len==0) break
-
- $movkey ($key_),$rndkey0
- mov %r10,%rax
- $movkey 16($key_),$rndkey1
- xorps $rndkey0,$in0
- xorps $rndkey0,$inout0
- xorps $in0,$inout1 # cmac^=out
- $movkey 32($key_),$rndkey0
- jmp .Lccm64_dec2_loop
- .align 16
- .Lccm64_dec2_loop:
- aesenc $rndkey1,$inout0
- aesenc $rndkey1,$inout1
- $movkey ($key,%rax),$rndkey1
- add \$32,%rax
- aesenc $rndkey0,$inout0
- aesenc $rndkey0,$inout1
- $movkey -16($key,%rax),$rndkey0
- jnz .Lccm64_dec2_loop
- movups ($inp),$in0 # load input
- paddq $increment,$iv
- aesenc $rndkey1,$inout0
- aesenc $rndkey1,$inout1
- aesenclast $rndkey0,$inout0
- aesenclast $rndkey0,$inout1
- lea 16($inp),$inp # $inp+=16
- jmp .Lccm64_dec_outer
-
- .align 16
- .Lccm64_dec_break:
- #xorps $in0,$inout1 # cmac^=out
- mov 240($key_),$rounds
- ___
- &aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
- $code.=<<___;
- pxor $rndkey0,$rndkey0 # clear register bank
- pxor $rndkey1,$rndkey1
- pxor $inout0,$inout0
- movups $inout1,($cmac) # store resulting mac
- pxor $inout1,$inout1
- pxor $in0,$in0
- pxor $iv,$iv
- ___
- $code.=<<___ if ($win64);
- movaps (%rsp),%xmm6
- movaps %xmm0,(%rsp) # clear stack
- movaps 0x10(%rsp),%xmm7
- movaps %xmm0,0x10(%rsp)
- movaps 0x20(%rsp),%xmm8
- movaps %xmm0,0x20(%rsp)
- movaps 0x30(%rsp),%xmm9
- movaps %xmm0,0x30(%rsp)
- lea 0x58(%rsp),%rsp
- .Lccm64_dec_ret:
- ___
- $code.=<<___;
- ret
- .size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
- ___
- }
- ######################################################################
- # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
- # size_t blocks, const AES_KEY *key,
- # const char *ivec);
- #
- # Handles only complete blocks, operates on 32-bit counter and
- # does not update *ivec! (see crypto/modes/ctr128.c for details)
- #
- # Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
- # http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
- # Keywords are full unroll and modulo-schedule counter calculations
- # with zero-round key xor.
- {
- my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
- my ($key0,$ctr)=("${key_}d","${ivp}d");
- my $frame_size = 0x80 + ($win64?160:0);
-
- $code.=<<___;
- .globl aesni_ctr32_encrypt_blocks
- .type aesni_ctr32_encrypt_blocks,\@function,5
- .align 16
- aesni_ctr32_encrypt_blocks:
- cmp \$1,$len
- jne .Lctr32_bulk
-
- # handle single block without allocating stack frame,
- # useful when handling edges
- movups ($ivp),$inout0
- movups ($inp),$inout1
- mov 240($key),%edx # key->rounds
- ___
- &aesni_generate1("enc",$key,"%edx");
- $code.=<<___;
- pxor $rndkey0,$rndkey0 # clear register bank
- pxor $rndkey1,$rndkey1
- xorps $inout1,$inout0
- pxor $inout1,$inout1
- movups $inout0,($out)
- xorps $inout0,$inout0
- jmp .Lctr32_epilogue
-
- .align 16
- .Lctr32_bulk:
- lea (%rsp),%rax
- push %rbp
- sub \$$frame_size,%rsp
- and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
- ___
- $code.=<<___ if ($win64);
- movaps %xmm6,-0xa8(%rax) # offload everything
- movaps %xmm7,-0x98(%rax)
- movaps %xmm8,-0x88(%rax)
- movaps %xmm9,-0x78(%rax)
- movaps %xmm10,-0x68(%rax)
- movaps %xmm11,-0x58(%rax)
- movaps %xmm12,-0x48(%rax)
- movaps %xmm13,-0x38(%rax)
- movaps %xmm14,-0x28(%rax)
- movaps %xmm15,-0x18(%rax)
- .Lctr32_body:
- ___
- $code.=<<___;
- lea -8(%rax),%rbp
-
- # 8 16-byte words on top of stack are counter values
- # xor-ed with zero-round key
-
- movdqu ($ivp),$inout0
- movdqu ($key),$rndkey0
- mov 12($ivp),$ctr # counter LSB
- pxor $rndkey0,$inout0
- mov 12($key),$key0 # 0-round key LSB
- movdqa $inout0,0x00(%rsp) # populate counter block
- bswap $ctr
- movdqa $inout0,$inout1
- movdqa $inout0,$inout2
- movdqa $inout0,$inout3
- movdqa $inout0,0x40(%rsp)
- movdqa $inout0,0x50(%rsp)
- movdqa $inout0,0x60(%rsp)
- mov %rdx,%r10 # about to borrow %rdx
- movdqa $inout0,0x70(%rsp)
-
- lea 1($ctr),%rax
- lea 2($ctr),%rdx
- bswap %eax
- bswap %edx
- xor $key0,%eax
- xor $key0,%edx
- pinsrd \$3,%eax,$inout1
- lea 3($ctr),%rax
- movdqa $inout1,0x10(%rsp)
- pinsrd \$3,%edx,$inout2
- bswap %eax
- mov %r10,%rdx # restore %rdx
- lea 4($ctr),%r10
- movdqa $inout2,0x20(%rsp)
- xor $key0,%eax
- bswap %r10d
- pinsrd \$3,%eax,$inout3
- xor $key0,%r10d
- movdqa $inout3,0x30(%rsp)
- lea 5($ctr),%r9
- mov %r10d,0x40+12(%rsp)
- bswap %r9d
- lea 6($ctr),%r10
- mov 240($key),$rounds # key->rounds
- xor $key0,%r9d
- bswap %r10d
- mov %r9d,0x50+12(%rsp)
- xor $key0,%r10d
- lea 7($ctr),%r9
- mov %r10d,0x60+12(%rsp)
- bswap %r9d
- mov OPENSSL_ia32cap_P+4(%rip),%r10d
- xor $key0,%r9d
- and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
- mov %r9d,0x70+12(%rsp)
-
- $movkey 0x10($key),$rndkey1
-
- movdqa 0x40(%rsp),$inout4
- movdqa 0x50(%rsp),$inout5
-
- cmp \$8,$len # $len is in blocks
- jb .Lctr32_tail # short input if ($len<8)
-
- sub \$6,$len # $len is biased by -6
- cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE
- je .Lctr32_6x # [which denotes Atom Silvermont]
-
- lea 0x80($key),$key # size optimization
- sub \$2,$len # $len is biased by -8
- jmp .Lctr32_loop8
-
- .align 16
- .Lctr32_6x:
- shl \$4,$rounds
- mov \$48,$rnds_
- bswap $key0
- lea 32($key,$rounds),$key # end of key schedule
- sub %rax,%r10 # twisted $rounds
- jmp .Lctr32_loop6
-
- .align 16
- .Lctr32_loop6:
- add \$6,$ctr # next counter value
- $movkey -48($key,$rnds_),$rndkey0
- aesenc $rndkey1,$inout0
- mov $ctr,%eax
- xor $key0,%eax
- aesenc $rndkey1,$inout1
- movbe %eax,`0x00+12`(%rsp) # store next counter value
- lea 1($ctr),%eax
- aesenc $rndkey1,$inout2
- xor $key0,%eax
- movbe %eax,`0x10+12`(%rsp)
- aesenc $rndkey1,$inout3
- lea 2($ctr),%eax
- xor $key0,%eax
- aesenc $rndkey1,$inout4
- movbe %eax,`0x20+12`(%rsp)
- lea 3($ctr),%eax
- aesenc $rndkey1,$inout5
- $movkey -32($key,$rnds_),$rndkey1
- xor $key0,%eax
-
- aesenc $rndkey0,$inout0
- movbe %eax,`0x30+12`(%rsp)
- lea 4($ctr),%eax
- aesenc $rndkey0,$inout1
- xor $key0,%eax
- movbe %eax,`0x40+12`(%rsp)
- aesenc $rndkey0,$inout2
- lea 5($ctr),%eax
- xor $key0,%eax
- aesenc $rndkey0,$inout3
- movbe %eax,`0x50+12`(%rsp)
- mov %r10,%rax # mov $rnds_,$rounds
- aesenc $rndkey0,$inout4
- aesenc $rndkey0,$inout5
- $movkey -16($key,$rnds_),$rndkey0
-
- call .Lenc_loop6
-
- movdqu ($inp),$inout6 # load 6 input blocks
- movdqu 0x10($inp),$inout7
- movdqu 0x20($inp),$in0
- movdqu 0x30($inp),$in1
- movdqu 0x40($inp),$in2
- movdqu 0x50($inp),$in3
- lea 0x60($inp),$inp # $inp+=6*16
- $movkey -64($key,$rnds_),$rndkey1
- pxor $inout0,$inout6 # inp^=E(ctr)
- movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round]
- pxor $inout1,$inout7
- movaps 0x10(%rsp),$inout1
- pxor $inout2,$in0
- movaps 0x20(%rsp),$inout2
- pxor $inout3,$in1
- movaps 0x30(%rsp),$inout3
- pxor $inout4,$in2
- movaps 0x40(%rsp),$inout4
- pxor $inout5,$in3
- movaps 0x50(%rsp),$inout5
- movdqu $inout6,($out) # store 6 output blocks
- movdqu $inout7,0x10($out)
- movdqu $in0,0x20($out)
- movdqu $in1,0x30($out)
- movdqu $in2,0x40($out)
- movdqu $in3,0x50($out)
- lea 0x60($out),$out # $out+=6*16
-
- sub \$6,$len
- jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow
-
- add \$6,$len # restore real remaining $len
- jz .Lctr32_done # done if ($len==0)
-
- lea -48($rnds_),$rounds
- lea -80($key,$rnds_),$key # restore $key
- neg $rounds
- shr \$4,$rounds # restore $rounds
- jmp .Lctr32_tail
-
- .align 32
- .Lctr32_loop8:
- add \$8,$ctr # next counter value
- movdqa 0x60(%rsp),$inout6
- aesenc $rndkey1,$inout0
- mov $ctr,%r9d
- movdqa 0x70(%rsp),$inout7
- aesenc $rndkey1,$inout1
- bswap %r9d
- $movkey 0x20-0x80($key),$rndkey0
- aesenc $rndkey1,$inout2
- xor $key0,%r9d
- nop
- aesenc $rndkey1,$inout3
- mov %r9d,0x00+12(%rsp) # store next counter value
- lea 1($ctr),%r9
- aesenc $rndkey1,$inout4
- aesenc $rndkey1,$inout5
- aesenc $rndkey1,$inout6
- aesenc $rndkey1,$inout7
- $movkey 0x30-0x80($key),$rndkey1
- ___
- for($i=2;$i<8;$i++) {
- my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
- $code.=<<___;
- bswap %r9d
- aesenc $rndkeyx,$inout0
- aesenc $rndkeyx,$inout1
- xor $key0,%r9d
- .byte 0x66,0x90
- aesenc $rndkeyx,$inout2
- aesenc $rndkeyx,$inout3
- mov %r9d,`0x10*($i-1)`+12(%rsp)
- lea $i($ctr),%r9
- aesenc $rndkeyx,$inout4
- aesenc $rndkeyx,$inout5
- aesenc $rndkeyx,$inout6
- aesenc $rndkeyx,$inout7
- $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx
- ___
- }
- $code.=<<___;
- bswap %r9d
- aesenc $rndkey0,$inout0
- aesenc $rndkey0,$inout1
- aesenc $rndkey0,$inout2
- xor $key0,%r9d
- movdqu 0x00($inp),$in0 # start loading input
- aesenc $rndkey0,$inout3
- mov %r9d,0x70+12(%rsp)
- cmp \$11,$rounds
- aesenc $rndkey0,$inout4
- aesenc $rndkey0,$inout5
- aesenc $rndkey0,$inout6
- aesenc $rndkey0,$inout7
- $movkey 0xa0-0x80($key),$rndkey0
-
- jb .Lctr32_enc_done
-
- aesenc $rndkey1,$inout0
- aesenc $rndkey1,$inout1
- aesenc $rndkey1,$inout2
- aesenc $rndkey1,$inout3
- aesenc $rndkey1,$inout4
- aesenc $rndkey1,$inout5
- aesenc $rndkey1,$inout6
- aesenc $rndkey1,$inout7
- $movkey 0xb0-0x80($key),$rndkey1
-
- aesenc $rndkey0,$inout0
- aesenc $rndkey0,$inout1
- aesenc $rndkey0,$inout2
- aesenc $rndkey0,$inout3
- aesenc $rndkey0,$inout4
- aesenc $rndkey0,$inout5
- aesenc $rndkey0,$inout6
- aesenc $rndkey0,$inout7
- $movkey 0xc0-0x80($key),$rndkey0
- je .Lctr32_enc_done
-
- aesenc $rndkey1,$inout0
- aesenc $rndkey1,$inout1
- aesenc $rndkey1,$inout2
- aesenc $rndkey1,$inout3
- aesenc $rndkey1,$inout4
- aesenc $rndkey1,$inout5
- aesenc $rndkey1,$inout6
- aesenc $rndkey1,$inout7
- $movkey 0xd0-0x80($key),$rndkey1
-
- aesenc $rndkey0,$inout0
- aesenc $rndkey0,$inout1
- aesenc $rndkey0,$inout2
- aesenc $rndkey0,$inout3
- aesenc $rndkey0,$inout4
- aesenc $rndkey0,$inout5
- aesenc $rndkey0,$inout6
- aesenc $rndkey0,$inout7
- $movkey 0xe0-0x80($key),$rndkey0
- jmp .Lctr32_enc_done
-
- .align 16
- .Lctr32_enc_done:
- movdqu 0x10($inp),$in1
- pxor $rndkey0,$in0 # input^=round[last]
- movdqu 0x20($inp),$in2
- pxor $rndkey0,$in1
- movdqu 0x30($inp),$in3
- pxor $rndkey0,$in2
- movdqu 0x40($inp),$in4
- pxor $rndkey0,$in3
- movdqu 0x50($inp),$in5
- pxor $rndkey0,$in4
- pxor $rndkey0,$in5
- aesenc $rndkey1,$inout0
- aesenc $rndkey1,$inout1
- aesenc $rndkey1,$inout2
- aesenc $rndkey1,$inout3
- aesenc $rndkey1,$inout4
- aesenc $rndkey1,$inout5
- aesenc $rndkey1,$inout6
- aesenc $rndkey1,$inout7
- movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6]
- lea 0x80($inp),$inp # $inp+=8*16
-
- aesenclast $in0,$inout0 # $inN is inp[N]^round[last]
- pxor $rndkey0,$rndkey1 # borrowed $rndkey
- movdqu 0x70-0x80($inp),$in0
- aesenclast $in1,$inout1
- pxor $rndkey0,$in0
- movdqa 0x00(%rsp),$in1 # load next counter block
- aesenclast $in2,$inout2
- aesenclast $in3,$inout3
- movdqa 0x10(%rsp),$in2
- movdqa 0x20(%rsp),$in3
- aesenclast $in4,$inout4
- aesenclast $in5,$inout5
- movdqa 0x30(%rsp),$in4
- movdqa 0x40(%rsp),$in5
- aesenclast $rndkey1,$inout6
- movdqa 0x50(%rsp),$rndkey0
- $movkey 0x10-0x80($key),$rndkey1#real 1st-round key
- aesenclast $in0,$inout7
-
- movups $inout0,($out) # store 8 output blocks
- movdqa $in1,$inout0
- movups $inout1,0x10($out)
- movdqa $in2,$inout1
- movups $inout2,0x20($out)
- movdqa $in3,$inout2
- movups $inout3,0x30($out)
- movdqa $in4,$inout3
- movups $inout4,0x40($out)
- movdqa $in5,$inout4
- movups $inout5,0x50($out)
- movdqa $rndkey0,$inout5
- movups $inout6,0x60($out)
- movups $inout7,0x70($out)
- lea 0x80($out),$out # $out+=8*16
-
1175
- sub \$8,$len
1176
- jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow
1177
-
1178
- add \$8,$len # restore real remainig $len
1179
- jz .Lctr32_done # done if ($len==0)
1180
- lea -0x80($key),$key
1181
-
1182
- .Lctr32_tail:
1183
- # note that at this point $inout0..5 are populated with
1184
- # counter values xor-ed with 0-round key
1185
- lea 16($key),$key
1186
- cmp \$4,$len
1187
- jb .Lctr32_loop3
1188
- je .Lctr32_loop4
1189
-
1190
- # if ($len>4) compute 7 E(counter)
1191
- shl \$4,$rounds
1192
- movdqa 0x60(%rsp),$inout6
1193
- pxor $inout7,$inout7
1194
-
1195
- $movkey 16($key),$rndkey0
1196
- aesenc $rndkey1,$inout0
1197
- aesenc $rndkey1,$inout1
1198
- lea 32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
1199
- neg %rax
1200
- aesenc $rndkey1,$inout2
1201
- add \$16,%rax # prepare for .Lenc_loop8_enter
1202
- movups ($inp),$in0
1203
- aesenc $rndkey1,$inout3
1204
- aesenc $rndkey1,$inout4
1205
- movups 0x10($inp),$in1 # pre-load input
1206
- movups 0x20($inp),$in2
1207
- aesenc $rndkey1,$inout5
1208
- aesenc $rndkey1,$inout6
1209
-
1210
- call .Lenc_loop8_enter
1211
-
1212
- movdqu 0x30($inp),$in3
1213
- pxor $in0,$inout0
1214
- movdqu 0x40($inp),$in0
1215
- pxor $in1,$inout1
1216
- movdqu $inout0,($out) # store output
1217
- pxor $in2,$inout2
1218
- movdqu $inout1,0x10($out)
1219
- pxor $in3,$inout3
1220
- movdqu $inout2,0x20($out)
1221
- pxor $in0,$inout4
1222
- movdqu $inout3,0x30($out)
1223
- movdqu $inout4,0x40($out)
1224
- cmp \$6,$len
1225
- jb .Lctr32_done # $len was 5, stop store
1226
-
1227
- movups 0x50($inp),$in1
1228
- xorps $in1,$inout5
1229
- movups $inout5,0x50($out)
1230
- je .Lctr32_done # $len was 6, stop store
1231
-
1232
- movups 0x60($inp),$in2
1233
- xorps $in2,$inout6
1234
- movups $inout6,0x60($out)
1235
- jmp .Lctr32_done # $len was 7, stop store
1236
-
1237
- .align 32
1238
- .Lctr32_loop4:
1239
- aesenc $rndkey1,$inout0
1240
- lea 16($key),$key
1241
- dec $rounds
1242
- aesenc $rndkey1,$inout1
1243
- aesenc $rndkey1,$inout2
1244
- aesenc $rndkey1,$inout3
1245
- $movkey ($key),$rndkey1
1246
- jnz .Lctr32_loop4
1247
- aesenclast $rndkey1,$inout0
1248
- aesenclast $rndkey1,$inout1
1249
- movups ($inp),$in0 # load input
1250
- movups 0x10($inp),$in1
1251
- aesenclast $rndkey1,$inout2
1252
- aesenclast $rndkey1,$inout3
1253
- movups 0x20($inp),$in2
1254
- movups 0x30($inp),$in3
1255
-
1256
- xorps $in0,$inout0
1257
- movups $inout0,($out) # store output
1258
- xorps $in1,$inout1
1259
- movups $inout1,0x10($out)
1260
- pxor $in2,$inout2
1261
- movdqu $inout2,0x20($out)
1262
- pxor $in3,$inout3
1263
- movdqu $inout3,0x30($out)
1264
- jmp .Lctr32_done # $len was 4, stop store
1265
-
1266
- .align 32
1267
- .Lctr32_loop3:
1268
- aesenc $rndkey1,$inout0
1269
- lea 16($key),$key
1270
- dec $rounds
1271
- aesenc $rndkey1,$inout1
1272
- aesenc $rndkey1,$inout2
1273
- $movkey ($key),$rndkey1
1274
- jnz .Lctr32_loop3
1275
- aesenclast $rndkey1,$inout0
1276
- aesenclast $rndkey1,$inout1
1277
- aesenclast $rndkey1,$inout2
1278
-
1279
- movups ($inp),$in0 # load input
1280
- xorps $in0,$inout0
1281
- movups $inout0,($out) # store output
1282
- cmp \$2,$len
1283
- jb .Lctr32_done # $len was 1, stop store
1284
-
1285
- movups 0x10($inp),$in1
1286
- xorps $in1,$inout1
1287
- movups $inout1,0x10($out)
1288
- je .Lctr32_done # $len was 2, stop store
1289
-
1290
- movups 0x20($inp),$in2
1291
- xorps $in2,$inout2
1292
- movups $inout2,0x20($out) # $len was 3, stop store
1293
-
1294
- .Lctr32_done:
1295
- xorps %xmm0,%xmm0 # clear register bank
1296
- xor $key0,$key0
1297
- pxor %xmm1,%xmm1
1298
- pxor %xmm2,%xmm2
1299
- pxor %xmm3,%xmm3
1300
- pxor %xmm4,%xmm4
1301
- pxor %xmm5,%xmm5
1302
- ___
1303
- $code.=<<___ if (!$win64);
1304
- pxor %xmm6,%xmm6
1305
- pxor %xmm7,%xmm7
1306
- movaps %xmm0,0x00(%rsp) # clear stack
1307
- pxor %xmm8,%xmm8
1308
- movaps %xmm0,0x10(%rsp)
1309
- pxor %xmm9,%xmm9
1310
- movaps %xmm0,0x20(%rsp)
1311
- pxor %xmm10,%xmm10
1312
- movaps %xmm0,0x30(%rsp)
1313
- pxor %xmm11,%xmm11
1314
- movaps %xmm0,0x40(%rsp)
1315
- pxor %xmm12,%xmm12
1316
- movaps %xmm0,0x50(%rsp)
1317
- pxor %xmm13,%xmm13
1318
- movaps %xmm0,0x60(%rsp)
1319
- pxor %xmm14,%xmm14
1320
- movaps %xmm0,0x70(%rsp)
1321
- pxor %xmm15,%xmm15
1322
- ___
1323
- $code.=<<___ if ($win64);
1324
- movaps -0xa0(%rbp),%xmm6
1325
- movaps %xmm0,-0xa0(%rbp) # clear stack
1326
- movaps -0x90(%rbp),%xmm7
1327
- movaps %xmm0,-0x90(%rbp)
1328
- movaps -0x80(%rbp),%xmm8
1329
- movaps %xmm0,-0x80(%rbp)
1330
- movaps -0x70(%rbp),%xmm9
1331
- movaps %xmm0,-0x70(%rbp)
1332
- movaps -0x60(%rbp),%xmm10
1333
- movaps %xmm0,-0x60(%rbp)
1334
- movaps -0x50(%rbp),%xmm11
1335
- movaps %xmm0,-0x50(%rbp)
1336
- movaps -0x40(%rbp),%xmm12
1337
- movaps %xmm0,-0x40(%rbp)
1338
- movaps -0x30(%rbp),%xmm13
1339
- movaps %xmm0,-0x30(%rbp)
1340
- movaps -0x20(%rbp),%xmm14
1341
- movaps %xmm0,-0x20(%rbp)
1342
- movaps -0x10(%rbp),%xmm15
1343
- movaps %xmm0,-0x10(%rbp)
1344
- movaps %xmm0,0x00(%rsp)
1345
- movaps %xmm0,0x10(%rsp)
1346
- movaps %xmm0,0x20(%rsp)
1347
- movaps %xmm0,0x30(%rsp)
1348
- movaps %xmm0,0x40(%rsp)
1349
- movaps %xmm0,0x50(%rsp)
1350
- movaps %xmm0,0x60(%rsp)
1351
- movaps %xmm0,0x70(%rsp)
1352
- ___
1353
- $code.=<<___;
1354
- lea (%rbp),%rsp
1355
- pop %rbp
1356
- .Lctr32_epilogue:
1357
- ret
1358
- .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
1359
- ___
1360
- } }}
1361
-
1362
- # int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
1363
- # int bits, AES_KEY *key)
1364
- #
1365
- # input: $inp user-supplied key
1366
- # $bits $inp length in bits
1367
- # $key pointer to key schedule
1368
- # output: %eax 0 on success; -1 or -2 on failure (see C)
1369
- # *$key key schedule
1370
- #
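The block above is the C-callable contract. A minimal caller sketch, using only what those comments state (the aesni_ prefix, the 0 / -1 / -2 return codes, and an AES_KEY type that the surrounding project's C headers define; all of that is assumed here, not shown in this diff):

```c
/* Hypothetical caller, based only on the commented prototypes above. */
typedef struct aes_key_st AES_KEY;   /* real definition lives in the project's C headers */

int aesni_set_encrypt_key(const unsigned char *inp, int bits, AES_KEY *key);
int aesni_set_decrypt_key(const unsigned char *inp, int bits, AES_KEY *key);

static int expand_keys(const unsigned char user_key[16], AES_KEY *enc, AES_KEY *dec)
{
    /* 0 = success, -1 = NULL input/key pointer, -2 = unsupported key length */
    if (aesni_set_encrypt_key(user_key, 128, enc) != 0) return -1;
    if (aesni_set_decrypt_key(user_key, 128, dec) != 0) return -1;
    return 0;
}
```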
1371
- { my ($inp,$bits,$key) = @_4args;
1372
- $bits =~ s/%r/%e/;
1373
-
1374
- $code.=<<___;
1375
- .globl ${PREFIX}_set_decrypt_key
1376
- .type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
1377
- .align 16
1378
- ${PREFIX}_set_decrypt_key:
1379
- .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
1380
- call __aesni_set_encrypt_key
1381
- shl \$4,$bits # rounds-1 after __aesni_set_encrypt_key
1382
- test %eax,%eax
1383
- jnz .Ldec_key_ret
1384
- lea 16($key,$bits),$inp # points at the end of key schedule
1385
-
1386
- $movkey ($key),%xmm0 # just swap
1387
- $movkey ($inp),%xmm1
1388
- $movkey %xmm0,($inp)
1389
- $movkey %xmm1,($key)
1390
- lea 16($key),$key
1391
- lea -16($inp),$inp
1392
-
1393
- .Ldec_key_inverse:
1394
- $movkey ($key),%xmm0 # swap and inverse
1395
- $movkey ($inp),%xmm1
1396
- aesimc %xmm0,%xmm0
1397
- aesimc %xmm1,%xmm1
1398
- lea 16($key),$key
1399
- lea -16($inp),$inp
1400
- $movkey %xmm0,16($inp)
1401
- $movkey %xmm1,-16($key)
1402
- cmp $key,$inp
1403
- ja .Ldec_key_inverse
1404
-
1405
- $movkey ($key),%xmm0 # inverse middle
1406
- aesimc %xmm0,%xmm0
1407
- pxor %xmm1,%xmm1
1408
- $movkey %xmm0,($inp)
1409
- pxor %xmm0,%xmm0
1410
- .Ldec_key_ret:
1411
- add \$8,%rsp
1412
- ret
1413
- .LSEH_end_set_decrypt_key:
1414
- .size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
1415
- ___
1416
-
1417
- # This is based on a submission by
1418
- #
1419
- # Huang Ying <ying.huang@intel.com>
1420
- # Vinodh Gopal <vinodh.gopal@intel.com>
1421
- # Kahraman Akdemir
1422
- #
1423
- # Aggressively optimized with respect to aeskeygenassist's critical path,
1424
- # and contained in %xmm0-5 to meet the Win64 ABI requirement.
1425
- #
1426
- # int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
1427
- # int bits, AES_KEY * const key);
1428
- #
1429
- # input: $inp user-supplied key
1430
- # $bits $inp length in bits
1431
- # $key pointer to key schedule
1432
- # output: %eax 0 on success; -1 or -2 on failure (see C)
1433
- # $bits rounds-1 (used in aesni_set_decrypt_key)
1434
- # *$key key schedule
1435
- # $key pointer to key schedule (used in
1436
- # aesni_set_decrypt_key)
1437
- #
1438
- # Subroutine is frame-less, which means that only volatile registers
1439
- # are used. Note that it's declared "abi-omnipotent", which means that
1440
- # the number of volatile registers is smaller on Windows.
1441
- #
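For orientation: the key schedule this routine writes out is 4*(rounds+1) 32-bit round-key words followed by a round-count field stored 240 bytes into the structure (the `# 240(%rdx)` / `# 240($key)` annotations below). A sketch of that layout, assuming the conventional AES_MAXNR = 14 definition from the project's C headers, which are not part of this diff:

```c
#include <stdint.h>

#define AES_MAXNR 14   /* assumed: 14 rounds for 256-bit keys */

/* Inferred from the 240-byte offsets annotated below; the authoritative
 * definition is in the project's C headers, which this diff does not show. */
typedef struct aes_key_st {
    uint32_t rd_key[4 * (AES_MAXNR + 1)];  /* 60 words = 240 bytes of round keys */
    uint32_t rounds;                       /* the field written at offset 240 below */
} AES_KEY;
```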
1442
- $code.=<<___;
1443
- .globl ${PREFIX}_set_encrypt_key
1444
- .type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
1445
- .align 16
1446
- ${PREFIX}_set_encrypt_key:
1447
- __aesni_set_encrypt_key:
1448
- .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
1449
- mov \$-1,%rax
1450
- test $inp,$inp
1451
- jz .Lenc_key_ret
1452
- test $key,$key
1453
- jz .Lenc_key_ret
1454
-
1455
- mov \$`1<<28|1<<11`,%r10d # AVX and XOP bits
1456
- movups ($inp),%xmm0 # pull first 128 bits of *userKey
1457
- xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
1458
- and OPENSSL_ia32cap_P+4(%rip),%r10d
1459
- lea 16($key),%rax # %rax is used as modifiable copy of $key
1460
- cmp \$256,$bits
1461
- je .L14rounds
1462
- cmp \$192,$bits
1463
- je .L12rounds
1464
- cmp \$128,$bits
1465
- jne .Lbad_keybits
1466
-
1467
- .L10rounds:
1468
- mov \$9,$bits # 10 rounds for 128-bit key
1469
- cmp \$`1<<28`,%r10d # AVX, but no XOP
1470
- je .L10rounds_alt
1471
-
1472
- $movkey %xmm0,($key) # round 0
1473
- aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
1474
- call .Lkey_expansion_128_cold
1475
- aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
1476
- call .Lkey_expansion_128
1477
- aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
1478
- call .Lkey_expansion_128
1479
- aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
1480
- call .Lkey_expansion_128
1481
- aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
1482
- call .Lkey_expansion_128
1483
- aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
1484
- call .Lkey_expansion_128
1485
- aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
1486
- call .Lkey_expansion_128
1487
- aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
1488
- call .Lkey_expansion_128
1489
- aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
1490
- call .Lkey_expansion_128
1491
- aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
1492
- call .Lkey_expansion_128
1493
- $movkey %xmm0,(%rax)
1494
- mov $bits,80(%rax) # 240(%rdx)
1495
- xor %eax,%eax
1496
- jmp .Lenc_key_ret
1497
-
1498
- .align 16
1499
- .L10rounds_alt:
1500
- movdqa .Lkey_rotate(%rip),%xmm5
1501
- mov \$8,%r10d
1502
- movdqa .Lkey_rcon1(%rip),%xmm4
1503
- movdqa %xmm0,%xmm2
1504
- movdqu %xmm0,($key)
1505
- jmp .Loop_key128
1506
-
1507
- .align 16
1508
- .Loop_key128:
1509
- pshufb %xmm5,%xmm0
1510
- aesenclast %xmm4,%xmm0
1511
- pslld \$1,%xmm4
1512
- lea 16(%rax),%rax
1513
-
1514
- movdqa %xmm2,%xmm3
1515
- pslldq \$4,%xmm2
1516
- pxor %xmm2,%xmm3
1517
- pslldq \$4,%xmm2
1518
- pxor %xmm2,%xmm3
1519
- pslldq \$4,%xmm2
1520
- pxor %xmm3,%xmm2
1521
-
1522
- pxor %xmm2,%xmm0
1523
- movdqu %xmm0,-16(%rax)
1524
- movdqa %xmm0,%xmm2
1525
-
1526
- dec %r10d
1527
- jnz .Loop_key128
1528
-
1529
- movdqa .Lkey_rcon1b(%rip),%xmm4
1530
-
1531
- pshufb %xmm5,%xmm0
1532
- aesenclast %xmm4,%xmm0
1533
- pslld \$1,%xmm4
1534
-
1535
- movdqa %xmm2,%xmm3
1536
- pslldq \$4,%xmm2
1537
- pxor %xmm2,%xmm3
1538
- pslldq \$4,%xmm2
1539
- pxor %xmm2,%xmm3
1540
- pslldq \$4,%xmm2
1541
- pxor %xmm3,%xmm2
1542
-
1543
- pxor %xmm2,%xmm0
1544
- movdqu %xmm0,(%rax)
1545
-
1546
- movdqa %xmm0,%xmm2
1547
- pshufb %xmm5,%xmm0
1548
- aesenclast %xmm4,%xmm0
1549
-
1550
- movdqa %xmm2,%xmm3
1551
- pslldq \$4,%xmm2
1552
- pxor %xmm2,%xmm3
1553
- pslldq \$4,%xmm2
1554
- pxor %xmm2,%xmm3
1555
- pslldq \$4,%xmm2
1556
- pxor %xmm3,%xmm2
1557
-
1558
- pxor %xmm2,%xmm0
1559
- movdqu %xmm0,16(%rax)
1560
-
1561
- mov $bits,96(%rax) # 240($key)
1562
- xor %eax,%eax
1563
- jmp .Lenc_key_ret
1564
-
1565
- .align 16
1566
- .L12rounds:
1567
- movq 16($inp),%xmm2 # remaining 1/3 of *userKey
1568
- mov \$11,$bits # 12 rounds for 192
1569
- cmp \$`1<<28`,%r10d # AVX, but no XOP
1570
- je .L12rounds_alt
1571
-
1572
- $movkey %xmm0,($key) # round 0
1573
- aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
1574
- call .Lkey_expansion_192a_cold
1575
- aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
1576
- call .Lkey_expansion_192b
1577
- aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
1578
- call .Lkey_expansion_192a
1579
- aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
1580
- call .Lkey_expansion_192b
1581
- aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
1582
- call .Lkey_expansion_192a
1583
- aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
1584
- call .Lkey_expansion_192b
1585
- aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
1586
- call .Lkey_expansion_192a
1587
- aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
1588
- call .Lkey_expansion_192b
1589
- $movkey %xmm0,(%rax)
1590
- mov $bits,48(%rax) # 240(%rdx)
1591
- xor %rax, %rax
1592
- jmp .Lenc_key_ret
1593
-
1594
- .align 16
1595
- .L12rounds_alt:
1596
- movdqa .Lkey_rotate192(%rip),%xmm5
1597
- movdqa .Lkey_rcon1(%rip),%xmm4
1598
- mov \$8,%r10d
1599
- movdqu %xmm0,($key)
1600
- jmp .Loop_key192
1601
-
1602
- .align 16
1603
- .Loop_key192:
1604
- movq %xmm2,0(%rax)
1605
- movdqa %xmm2,%xmm1
1606
- pshufb %xmm5,%xmm2
1607
- aesenclast %xmm4,%xmm2
1608
- pslld \$1, %xmm4
1609
- lea 24(%rax),%rax
1610
-
1611
- movdqa %xmm0,%xmm3
1612
- pslldq \$4,%xmm0
1613
- pxor %xmm0,%xmm3
1614
- pslldq \$4,%xmm0
1615
- pxor %xmm0,%xmm3
1616
- pslldq \$4,%xmm0
1617
- pxor %xmm3,%xmm0
1618
-
1619
- pshufd \$0xff,%xmm0,%xmm3
1620
- pxor %xmm1,%xmm3
1621
- pslldq \$4,%xmm1
1622
- pxor %xmm1,%xmm3
1623
-
1624
- pxor %xmm2,%xmm0
1625
- pxor %xmm3,%xmm2
1626
- movdqu %xmm0,-16(%rax)
1627
-
1628
- dec %r10d
1629
- jnz .Loop_key192
1630
-
1631
- mov $bits,32(%rax) # 240($key)
1632
- xor %eax,%eax
1633
- jmp .Lenc_key_ret
1634
-
1635
- .align 16
1636
- .L14rounds:
1637
- movups 16($inp),%xmm2 # remaining half of *userKey
1638
- mov \$13,$bits # 14 rounds for 256
1639
- lea 16(%rax),%rax
1640
- cmp \$`1<<28`,%r10d # AVX, but no XOP
1641
- je .L14rounds_alt
1642
-
1643
- $movkey %xmm0,($key) # round 0
1644
- $movkey %xmm2,16($key) # round 1
1645
- aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
1646
- call .Lkey_expansion_256a_cold
1647
- aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
1648
- call .Lkey_expansion_256b
1649
- aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
1650
- call .Lkey_expansion_256a
1651
- aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
1652
- call .Lkey_expansion_256b
1653
- aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
1654
- call .Lkey_expansion_256a
1655
- aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
1656
- call .Lkey_expansion_256b
1657
- aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
1658
- call .Lkey_expansion_256a
1659
- aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
1660
- call .Lkey_expansion_256b
1661
- aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
1662
- call .Lkey_expansion_256a
1663
- aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
1664
- call .Lkey_expansion_256b
1665
- aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
1666
- call .Lkey_expansion_256a
1667
- aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
1668
- call .Lkey_expansion_256b
1669
- aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
1670
- call .Lkey_expansion_256a
1671
- $movkey %xmm0,(%rax)
1672
- mov $bits,16(%rax) # 240(%rdx)
1673
- xor %rax,%rax
1674
- jmp .Lenc_key_ret
1675
-
1676
- .align 16
1677
- .L14rounds_alt:
1678
- movdqa .Lkey_rotate(%rip),%xmm5
1679
- movdqa .Lkey_rcon1(%rip),%xmm4
1680
- mov \$7,%r10d
1681
- movdqu %xmm0,0($key)
1682
- movdqa %xmm2,%xmm1
1683
- movdqu %xmm2,16($key)
1684
- jmp .Loop_key256
1685
-
1686
- .align 16
1687
- .Loop_key256:
1688
- pshufb %xmm5,%xmm2
1689
- aesenclast %xmm4,%xmm2
1690
-
1691
- movdqa %xmm0,%xmm3
1692
- pslldq \$4,%xmm0
1693
- pxor %xmm0,%xmm3
1694
- pslldq \$4,%xmm0
1695
- pxor %xmm0,%xmm3
1696
- pslldq \$4,%xmm0
1697
- pxor %xmm3,%xmm0
1698
- pslld \$1,%xmm4
1699
-
1700
- pxor %xmm2,%xmm0
1701
- movdqu %xmm0,(%rax)
1702
-
1703
- dec %r10d
1704
- jz .Ldone_key256
1705
-
1706
- pshufd \$0xff,%xmm0,%xmm2
1707
- pxor %xmm3,%xmm3
1708
- aesenclast %xmm3,%xmm2
1709
-
1710
- movdqa %xmm1,%xmm3
1711
- pslldq \$4,%xmm1
1712
- pxor %xmm1,%xmm3
1713
- pslldq \$4,%xmm1
1714
- pxor %xmm1,%xmm3
1715
- pslldq \$4,%xmm1
1716
- pxor %xmm3,%xmm1
1717
-
1718
- pxor %xmm1,%xmm2
1719
- movdqu %xmm2,16(%rax)
1720
- lea 32(%rax),%rax
1721
- movdqa %xmm2,%xmm1
1722
-
1723
- jmp .Loop_key256
1724
-
1725
- .Ldone_key256:
1726
- mov $bits,16(%rax) # 240($key)
1727
- xor %eax,%eax
1728
- jmp .Lenc_key_ret
1729
-
1730
- .align 16
1731
- .Lbad_keybits:
1732
- mov \$-2,%rax
1733
- .Lenc_key_ret:
1734
- pxor %xmm0,%xmm0
1735
- pxor %xmm1,%xmm1
1736
- pxor %xmm2,%xmm2
1737
- pxor %xmm3,%xmm3
1738
- pxor %xmm4,%xmm4
1739
- pxor %xmm5,%xmm5
1740
- add \$8,%rsp
1741
- ret
1742
- .LSEH_end_set_encrypt_key:
1743
-
1744
- .align 16
1745
- .Lkey_expansion_128:
1746
- $movkey %xmm0,(%rax)
1747
- lea 16(%rax),%rax
1748
- .Lkey_expansion_128_cold:
1749
- shufps \$0b00010000,%xmm0,%xmm4
1750
- xorps %xmm4, %xmm0
1751
- shufps \$0b10001100,%xmm0,%xmm4
1752
- xorps %xmm4, %xmm0
1753
- shufps \$0b11111111,%xmm1,%xmm1 # critical path
1754
- xorps %xmm1,%xmm0
1755
- ret
1756
-
1757
- .align 16
1758
- .Lkey_expansion_192a:
1759
- $movkey %xmm0,(%rax)
1760
- lea 16(%rax),%rax
1761
- .Lkey_expansion_192a_cold:
1762
- movaps %xmm2, %xmm5
1763
- .Lkey_expansion_192b_warm:
1764
- shufps \$0b00010000,%xmm0,%xmm4
1765
- movdqa %xmm2,%xmm3
1766
- xorps %xmm4,%xmm0
1767
- shufps \$0b10001100,%xmm0,%xmm4
1768
- pslldq \$4,%xmm3
1769
- xorps %xmm4,%xmm0
1770
- pshufd \$0b01010101,%xmm1,%xmm1 # critical path
1771
- pxor %xmm3,%xmm2
1772
- pxor %xmm1,%xmm0
1773
- pshufd \$0b11111111,%xmm0,%xmm3
1774
- pxor %xmm3,%xmm2
1775
- ret
1776
-
1777
- .align 16
1778
- .Lkey_expansion_192b:
1779
- movaps %xmm0,%xmm3
1780
- shufps \$0b01000100,%xmm0,%xmm5
1781
- $movkey %xmm5,(%rax)
1782
- shufps \$0b01001110,%xmm2,%xmm3
1783
- $movkey %xmm3,16(%rax)
1784
- lea 32(%rax),%rax
1785
- jmp .Lkey_expansion_192b_warm
1786
-
1787
- .align 16
1788
- .Lkey_expansion_256a:
1789
- $movkey %xmm2,(%rax)
1790
- lea 16(%rax),%rax
1791
- .Lkey_expansion_256a_cold:
1792
- shufps \$0b00010000,%xmm0,%xmm4
1793
- xorps %xmm4,%xmm0
1794
- shufps \$0b10001100,%xmm0,%xmm4
1795
- xorps %xmm4,%xmm0
1796
- shufps \$0b11111111,%xmm1,%xmm1 # critical path
1797
- xorps %xmm1,%xmm0
1798
- ret
1799
-
1800
- .align 16
1801
- .Lkey_expansion_256b:
1802
- $movkey %xmm0,(%rax)
1803
- lea 16(%rax),%rax
1804
-
1805
- shufps \$0b00010000,%xmm2,%xmm4
1806
- xorps %xmm4,%xmm2
1807
- shufps \$0b10001100,%xmm2,%xmm4
1808
- xorps %xmm4,%xmm2
1809
- shufps \$0b10101010,%xmm1,%xmm1 # critical path
1810
- xorps %xmm1,%xmm2
1811
- ret
1812
- .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
1813
- .size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
1814
- ___
1815
- }
1816
-
1817
- $code.=<<___;
1818
- .align 64
1819
- .Lbswap_mask:
1820
- .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
1821
- .Lincrement32:
1822
- .long 6,6,6,0
1823
- .Lincrement64:
1824
- .long 1,0,0,0
1825
- .Lincrement1:
1826
- .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1827
- .Lkey_rotate:
1828
- .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
1829
- .Lkey_rotate192:
1830
- .long 0x04070605,0x04070605,0x04070605,0x04070605
1831
- .Lkey_rcon1:
1832
- .long 1,1,1,1
1833
- .Lkey_rcon1b:
1834
- .long 0x1b,0x1b,0x1b,0x1b
1835
-
1836
- .asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
1837
- .align 64
1838
- ___
1839
-
1840
- # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1841
- # CONTEXT *context,DISPATCHER_CONTEXT *disp)
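The two handlers that follow implement the standard Win64 language-specific exception handler callback named above; restated as a C prototype for orientation (types come from <windows.h>):

```c
#include <windows.h>

/* Shape of ccm64_se_handler / ctr_se_handler below: when the faulting RIP
 * lies between the prologue and epilogue labels recorded in HandlerData,
 * they copy the on-stack XMM save area back into the CONTEXT and let
 * RtlVirtualUnwind continue the unwind. */
EXCEPTION_DISPOSITION se_handler(EXCEPTION_RECORD *rec, ULONG64 frame,
                                 CONTEXT *context, DISPATCHER_CONTEXT *disp);
```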
1842
- if ($win64) {
1843
- $rec="%rcx";
1844
- $frame="%rdx";
1845
- $context="%r8";
1846
- $disp="%r9";
1847
-
1848
- $code.=<<___;
1849
- .extern __imp_RtlVirtualUnwind
1850
- ___
1851
- $code.=<<___ if ($PREFIX eq "aesni");
1852
- .type ccm64_se_handler,\@abi-omnipotent
1853
- .align 16
1854
- ccm64_se_handler:
1855
- push %rsi
1856
- push %rdi
1857
- push %rbx
1858
- push %rbp
1859
- push %r12
1860
- push %r13
1861
- push %r14
1862
- push %r15
1863
- pushfq
1864
- sub \$64,%rsp
1865
-
1866
- mov 120($context),%rax # pull context->Rax
1867
- mov 248($context),%rbx # pull context->Rip
1868
-
1869
- mov 8($disp),%rsi # disp->ImageBase
1870
- mov 56($disp),%r11 # disp->HandlerData
1871
-
1872
- mov 0(%r11),%r10d # HandlerData[0]
1873
- lea (%rsi,%r10),%r10 # prologue label
1874
- cmp %r10,%rbx # context->Rip<prologue label
1875
- jb .Lcommon_seh_tail
1876
-
1877
- mov 152($context),%rax # pull context->Rsp
1878
-
1879
- mov 4(%r11),%r10d # HandlerData[1]
1880
- lea (%rsi,%r10),%r10 # epilogue label
1881
- cmp %r10,%rbx # context->Rip>=epilogue label
1882
- jae .Lcommon_seh_tail
1883
-
1884
- lea 0(%rax),%rsi # %xmm save area
1885
- lea 512($context),%rdi # &context.Xmm6
1886
- mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
1887
- .long 0xa548f3fc # cld; rep movsq
1888
- lea 0x58(%rax),%rax # adjust stack pointer
1889
-
1890
- jmp .Lcommon_seh_tail
1891
- .size ccm64_se_handler,.-ccm64_se_handler
1892
-
1893
- .type ctr_se_handler,\@abi-omnipotent
1894
- .align 16
1895
- ctr_se_handler:
1896
- push %rsi
1897
- push %rdi
1898
- push %rbx
1899
- push %rbp
1900
- push %r12
1901
- push %r13
1902
- push %r14
1903
- push %r15
1904
- pushfq
1905
- sub \$64,%rsp
1906
-
1907
- mov 120($context),%rax # pull context->Rax
1908
- mov 248($context),%rbx # pull context->Rip
1909
-
1910
- mov 8($disp),%rsi # disp->ImageBase
1911
- mov 56($disp),%r11 # disp->HandlerData
1912
-
1913
- mov 0(%r11),%r10d # HandlerData[0]
1914
- lea (%rsi,%r10),%r10 # prologue label
1915
- cmp %r10,%rbx # context->Rip<prologue label
1916
- jb .Lcommon_seh_tail
1917
-
1918
- mov 152($context),%rax # pull context->Rsp
1919
-
1920
- mov 4(%r11),%r10d # HandlerData[1]
1921
- lea (%rsi,%r10),%r10 # epilogue label
1922
- cmp %r10,%rbx # context->Rip>=epilogue label
1923
- jae .Lcommon_seh_tail
1924
-
1925
- mov 160($context),%rax # pull context->Rbp
1926
- lea -0xa0(%rax),%rsi # %xmm save area
1927
- lea 512($context),%rdi # & context.Xmm6
1928
- mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
1929
- .long 0xa548f3fc # cld; rep movsq
1930
-
1931
- mov 160($context),%rax # pull context->Rbp
1932
- mov (%rax),%rbp # restore saved %rbp
1933
- lea 8(%rax),%rax # adjust stack pointer
1934
- mov %rbp,160($context) # restore context->Rbp
1935
-
1936
- mov 8(%rax),%rdi
1937
- mov 16(%rax),%rsi
1938
- mov %rax,152($context) # restore context->Rsp
1939
- mov %rsi,168($context) # restore context->Rsi
1940
- mov %rdi,176($context) # restore context->Rdi
1941
-
1942
- mov 40($disp),%rdi # disp->ContextRecord
1943
- mov $context,%rsi # context
1944
- mov \$154,%ecx # sizeof(CONTEXT)
1945
- .long 0xa548f3fc # cld; rep movsq
1946
-
1947
- mov $disp,%rsi
1948
- xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1949
- mov 8(%rsi),%rdx # arg2, disp->ImageBase
1950
- mov 0(%rsi),%r8 # arg3, disp->ControlPc
1951
- mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1952
- mov 40(%rsi),%r10 # disp->ContextRecord
1953
- lea 56(%rsi),%r11 # &disp->HandlerData
1954
- lea 24(%rsi),%r12 # &disp->EstablisherFrame
1955
- mov %r10,32(%rsp) # arg5
1956
- mov %r11,40(%rsp) # arg6
1957
- mov %r12,48(%rsp) # arg7
1958
- mov %rcx,56(%rsp) # arg8, (NULL)
1959
- call *__imp_RtlVirtualUnwind(%rip)
1960
-
1961
- mov \$1,%eax # ExceptionContinueSearch
1962
- add \$64,%rsp
1963
- popfq
1964
- pop %r15
1965
- pop %r14
1966
- pop %r13
1967
- pop %r12
1968
- pop %rbp
1969
- pop %rbx
1970
- pop %rdi
1971
- pop %rsi
1972
- ret
1973
- .size ctr_se_handler,.-ctr_se_handler
1974
-
1975
- .section .pdata
1976
- .align 4
1977
- ___
1978
- $code.=<<___ if ($PREFIX eq "aesni");
1979
- .rva .LSEH_begin_aesni_ccm64_encrypt_blocks
1980
- .rva .LSEH_end_aesni_ccm64_encrypt_blocks
1981
- .rva .LSEH_info_ccm64_enc
1982
-
1983
- .rva .LSEH_begin_aesni_ccm64_decrypt_blocks
1984
- .rva .LSEH_end_aesni_ccm64_decrypt_blocks
1985
- .rva .LSEH_info_ccm64_dec
1986
-
1987
- .rva .LSEH_begin_aesni_ctr32_encrypt_blocks
1988
- .rva .LSEH_end_aesni_ctr32_encrypt_blocks
1989
- .rva .LSEH_info_ctr32
1990
- ___
1991
- $code.=<<___;
1992
- .rva ${PREFIX}_set_decrypt_key
1993
- .rva .LSEH_end_set_decrypt_key
1994
- .rva .LSEH_info_key
1995
-
1996
- .rva ${PREFIX}_set_encrypt_key
1997
- .rva .LSEH_end_set_encrypt_key
1998
- .rva .LSEH_info_key
1999
- .section .xdata
2000
- .align 8
2001
- ___
2002
- $code.=<<___ if ($PREFIX eq "aesni");
2003
- .LSEH_info_ccm64_enc:
2004
- .byte 9,0,0,0
2005
- .rva ccm64_se_handler
2006
- .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[]
2007
- .LSEH_info_ccm64_dec:
2008
- .byte 9,0,0,0
2009
- .rva ccm64_se_handler
2010
- .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
2011
- .LSEH_info_ctr32:
2012
- .byte 9,0,0,0
2013
- .rva ctr_se_handler
2014
- .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[]
2015
- ___
2016
- $code.=<<___;
2017
- .LSEH_info_key:
2018
- .byte 0x01,0x04,0x01,0x00
2019
- .byte 0x04,0x02,0x00,0x00 # sub rsp,8
2020
- ___
2021
- }
2022
-
2023
- sub rex {
2024
- local *opcode=shift;
2025
- my ($dst,$src)=@_;
2026
- my $rex=0;
2027
-
2028
- $rex|=0x04 if($dst>=8);
2029
- $rex|=0x01 if($src>=8);
2030
- push @opcode,$rex|0x40 if($rex);
2031
- }
2032
-
2033
- sub aesni {
2034
- my $line=shift;
2035
- my @opcode=(0x66);
2036
-
2037
- if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
2038
- rex(\@opcode,$4,$3);
2039
- push @opcode,0x0f,0x3a,0xdf;
2040
- push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M
2041
- my $c=$2;
2042
- push @opcode,$c=~/^0/?oct($c):$c;
2043
- return ".byte\t".join(',',@opcode);
2044
- }
2045
- elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
2046
- my %opcodelet = (
2047
- "aesimc" => 0xdb,
2048
- "aesenc" => 0xdc, "aesenclast" => 0xdd,
2049
- "aesdec" => 0xde, "aesdeclast" => 0xdf
2050
- );
2051
- return undef if (!defined($opcodelet{$1}));
2052
- rex(\@opcode,$3,$2);
2053
- push @opcode,0x0f,0x38,$opcodelet{$1};
2054
- push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
2055
- return ".byte\t".join(',',@opcode);
2056
- }
2057
- elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
2058
- my %opcodelet = (
2059
- "aesenc" => 0xdc, "aesenclast" => 0xdd,
2060
- "aesdec" => 0xde, "aesdeclast" => 0xdf
2061
- );
2062
- return undef if (!defined($opcodelet{$1}));
2063
- my $off = $2;
2064
- push @opcode,0x44 if ($3>=8);
2065
- push @opcode,0x0f,0x38,$opcodelet{$1};
2066
- push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M
2067
- push @opcode,($off=~/^0/?oct($off):$off)&0xff;
2068
- return ".byte\t".join(',',@opcode);
2069
- }
2070
- return $line;
2071
- }
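The aesni() helper above rewrites AES-NI mnemonics into raw `.byte` sequences so the generated assembly still builds with toolchains that predate AES-NI support. As a concrete check (example operands of my own, not taken from this diff), the register form `aesenc %xmm1,%xmm2` is emitted as `.byte 0x66,0x0f,0x38,0xdc,0xd1`; the same bytes as C data:

```c
/* Encoding produced by aesni() for "aesenc %xmm1,%xmm2" (AT&T syntax:
 * source %xmm1, destination %xmm2). 0x66 0x0f 0x38 0xdc is the AESENC
 * opcode; 0xd1 = 0xc0 | src | (dst << 3) is the ModR/M byte. */
static const unsigned char aesenc_xmm1_xmm2[] = { 0x66, 0x0f, 0x38, 0xdc, 0xd1 };
```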
2072
-
2073
- sub movbe {
2074
- ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift;
2075
- }
2076
-
2077
- $code =~ s/\`([^\`]*)\`/eval($1)/gem;
2078
- $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
2079
- #$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact
2080
- $code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;
2081
-
2082
- print $code;
2083
-
2084
- close STDOUT;