ring-native 0.0.0

Files changed (261)
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/Gemfile +3 -0
  4. data/README.md +22 -0
  5. data/Rakefile +1 -0
  6. data/ext/ring/extconf.rb +29 -0
  7. data/lib/ring/native.rb +8 -0
  8. data/lib/ring/native/version.rb +5 -0
  9. data/ring-native.gemspec +25 -0
  10. data/vendor/ring/BUILDING.md +40 -0
  11. data/vendor/ring/Cargo.toml +43 -0
  12. data/vendor/ring/LICENSE +185 -0
  13. data/vendor/ring/Makefile +35 -0
  14. data/vendor/ring/PORTING.md +163 -0
  15. data/vendor/ring/README.md +113 -0
  16. data/vendor/ring/STYLE.md +197 -0
  17. data/vendor/ring/appveyor.yml +27 -0
  18. data/vendor/ring/build.rs +108 -0
  19. data/vendor/ring/crypto/aes/aes.c +1142 -0
  20. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +25 -0
  21. data/vendor/ring/crypto/aes/aes_test.cc +93 -0
  22. data/vendor/ring/crypto/aes/asm/aes-586.pl +2368 -0
  23. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +1249 -0
  24. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +2246 -0
  25. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +1318 -0
  26. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +2084 -0
  27. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +675 -0
  28. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +1364 -0
  29. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +1565 -0
  30. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +841 -0
  31. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +1116 -0
  32. data/vendor/ring/crypto/aes/internal.h +87 -0
  33. data/vendor/ring/crypto/aes/mode_wrappers.c +61 -0
  34. data/vendor/ring/crypto/bn/add.c +394 -0
  35. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +694 -0
  36. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +1503 -0
  37. data/vendor/ring/crypto/bn/asm/bn-586.pl +774 -0
  38. data/vendor/ring/crypto/bn/asm/co-586.pl +287 -0
  39. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +1882 -0
  40. data/vendor/ring/crypto/bn/asm/x86-mont.pl +592 -0
  41. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +599 -0
  42. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +1393 -0
  43. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +3507 -0
  44. data/vendor/ring/crypto/bn/bn.c +352 -0
  45. data/vendor/ring/crypto/bn/bn_asn1.c +74 -0
  46. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +25 -0
  47. data/vendor/ring/crypto/bn/bn_test.cc +1696 -0
  48. data/vendor/ring/crypto/bn/cmp.c +200 -0
  49. data/vendor/ring/crypto/bn/convert.c +433 -0
  50. data/vendor/ring/crypto/bn/ctx.c +311 -0
  51. data/vendor/ring/crypto/bn/div.c +594 -0
  52. data/vendor/ring/crypto/bn/exponentiation.c +1335 -0
  53. data/vendor/ring/crypto/bn/gcd.c +711 -0
  54. data/vendor/ring/crypto/bn/generic.c +1019 -0
  55. data/vendor/ring/crypto/bn/internal.h +316 -0
  56. data/vendor/ring/crypto/bn/montgomery.c +516 -0
  57. data/vendor/ring/crypto/bn/mul.c +888 -0
  58. data/vendor/ring/crypto/bn/prime.c +829 -0
  59. data/vendor/ring/crypto/bn/random.c +334 -0
  60. data/vendor/ring/crypto/bn/rsaz_exp.c +262 -0
  61. data/vendor/ring/crypto/bn/rsaz_exp.h +53 -0
  62. data/vendor/ring/crypto/bn/shift.c +276 -0
  63. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +25 -0
  64. data/vendor/ring/crypto/bytestring/bytestring_test.cc +421 -0
  65. data/vendor/ring/crypto/bytestring/cbb.c +399 -0
  66. data/vendor/ring/crypto/bytestring/cbs.c +227 -0
  67. data/vendor/ring/crypto/bytestring/internal.h +46 -0
  68. data/vendor/ring/crypto/chacha/chacha_generic.c +140 -0
  69. data/vendor/ring/crypto/chacha/chacha_vec.c +323 -0
  70. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +1447 -0
  71. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +153 -0
  72. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +25 -0
  73. data/vendor/ring/crypto/cipher/e_aes.c +390 -0
  74. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +208 -0
  75. data/vendor/ring/crypto/cipher/internal.h +173 -0
  76. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +543 -0
  77. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +9 -0
  78. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +475 -0
  79. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +23 -0
  80. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +422 -0
  81. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +484 -0
  82. data/vendor/ring/crypto/cipher/test/cipher_test.txt +100 -0
  83. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +25 -0
  84. data/vendor/ring/crypto/constant_time_test.c +304 -0
  85. data/vendor/ring/crypto/cpu-arm-asm.S +32 -0
  86. data/vendor/ring/crypto/cpu-arm.c +199 -0
  87. data/vendor/ring/crypto/cpu-intel.c +261 -0
  88. data/vendor/ring/crypto/crypto.c +151 -0
  89. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +2118 -0
  90. data/vendor/ring/crypto/curve25519/curve25519.c +4888 -0
  91. data/vendor/ring/crypto/curve25519/x25519_test.cc +128 -0
  92. data/vendor/ring/crypto/digest/md32_common.h +181 -0
  93. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +2725 -0
  94. data/vendor/ring/crypto/ec/ec.c +193 -0
  95. data/vendor/ring/crypto/ec/ec_curves.c +61 -0
  96. data/vendor/ring/crypto/ec/ec_key.c +228 -0
  97. data/vendor/ring/crypto/ec/ec_montgomery.c +114 -0
  98. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +25 -0
  99. data/vendor/ring/crypto/ec/internal.h +243 -0
  100. data/vendor/ring/crypto/ec/oct.c +253 -0
  101. data/vendor/ring/crypto/ec/p256-64.c +1794 -0
  102. data/vendor/ring/crypto/ec/p256-x86_64-table.h +9548 -0
  103. data/vendor/ring/crypto/ec/p256-x86_64.c +509 -0
  104. data/vendor/ring/crypto/ec/simple.c +1007 -0
  105. data/vendor/ring/crypto/ec/util-64.c +183 -0
  106. data/vendor/ring/crypto/ec/wnaf.c +508 -0
  107. data/vendor/ring/crypto/ecdh/ecdh.c +155 -0
  108. data/vendor/ring/crypto/ecdsa/ecdsa.c +304 -0
  109. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +193 -0
  110. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +25 -0
  111. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +327 -0
  112. data/vendor/ring/crypto/header_removed.h +17 -0
  113. data/vendor/ring/crypto/internal.h +495 -0
  114. data/vendor/ring/crypto/libring.Windows.vcxproj +101 -0
  115. data/vendor/ring/crypto/mem.c +98 -0
  116. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +1045 -0
  117. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +517 -0
  118. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +1393 -0
  119. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +1741 -0
  120. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +422 -0
  121. data/vendor/ring/crypto/modes/ctr.c +226 -0
  122. data/vendor/ring/crypto/modes/gcm.c +1206 -0
  123. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +25 -0
  124. data/vendor/ring/crypto/modes/gcm_test.c +348 -0
  125. data/vendor/ring/crypto/modes/internal.h +299 -0
  126. data/vendor/ring/crypto/perlasm/arm-xlate.pl +170 -0
  127. data/vendor/ring/crypto/perlasm/readme +100 -0
  128. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +1164 -0
  129. data/vendor/ring/crypto/perlasm/x86asm.pl +292 -0
  130. data/vendor/ring/crypto/perlasm/x86gas.pl +263 -0
  131. data/vendor/ring/crypto/perlasm/x86masm.pl +200 -0
  132. data/vendor/ring/crypto/perlasm/x86nasm.pl +187 -0
  133. data/vendor/ring/crypto/poly1305/poly1305.c +331 -0
  134. data/vendor/ring/crypto/poly1305/poly1305_arm.c +301 -0
  135. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +2015 -0
  136. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +25 -0
  137. data/vendor/ring/crypto/poly1305/poly1305_test.cc +80 -0
  138. data/vendor/ring/crypto/poly1305/poly1305_test.txt +52 -0
  139. data/vendor/ring/crypto/poly1305/poly1305_vec.c +892 -0
  140. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +75 -0
  141. data/vendor/ring/crypto/rand/internal.h +32 -0
  142. data/vendor/ring/crypto/rand/rand.c +189 -0
  143. data/vendor/ring/crypto/rand/urandom.c +219 -0
  144. data/vendor/ring/crypto/rand/windows.c +56 -0
  145. data/vendor/ring/crypto/refcount_c11.c +66 -0
  146. data/vendor/ring/crypto/refcount_lock.c +53 -0
  147. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +25 -0
  148. data/vendor/ring/crypto/refcount_test.c +58 -0
  149. data/vendor/ring/crypto/rsa/blinding.c +462 -0
  150. data/vendor/ring/crypto/rsa/internal.h +108 -0
  151. data/vendor/ring/crypto/rsa/padding.c +300 -0
  152. data/vendor/ring/crypto/rsa/rsa.c +450 -0
  153. data/vendor/ring/crypto/rsa/rsa_asn1.c +261 -0
  154. data/vendor/ring/crypto/rsa/rsa_impl.c +944 -0
  155. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +25 -0
  156. data/vendor/ring/crypto/rsa/rsa_test.cc +437 -0
  157. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +436 -0
  158. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +2390 -0
  159. data/vendor/ring/crypto/sha/asm/sha256-586.pl +1275 -0
  160. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +735 -0
  161. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +14 -0
  162. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +14 -0
  163. data/vendor/ring/crypto/sha/asm/sha512-586.pl +911 -0
  164. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +666 -0
  165. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +14 -0
  166. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +14 -0
  167. data/vendor/ring/crypto/sha/sha1.c +271 -0
  168. data/vendor/ring/crypto/sha/sha256.c +204 -0
  169. data/vendor/ring/crypto/sha/sha512.c +355 -0
  170. data/vendor/ring/crypto/test/file_test.cc +326 -0
  171. data/vendor/ring/crypto/test/file_test.h +181 -0
  172. data/vendor/ring/crypto/test/malloc.cc +150 -0
  173. data/vendor/ring/crypto/test/scoped_types.h +95 -0
  174. data/vendor/ring/crypto/test/test.Windows.vcxproj +35 -0
  175. data/vendor/ring/crypto/test/test_util.cc +46 -0
  176. data/vendor/ring/crypto/test/test_util.h +41 -0
  177. data/vendor/ring/crypto/thread_none.c +55 -0
  178. data/vendor/ring/crypto/thread_pthread.c +165 -0
  179. data/vendor/ring/crypto/thread_test.Windows.vcxproj +25 -0
  180. data/vendor/ring/crypto/thread_test.c +200 -0
  181. data/vendor/ring/crypto/thread_win.c +282 -0
  182. data/vendor/ring/examples/checkdigest.rs +103 -0
  183. data/vendor/ring/include/openssl/aes.h +121 -0
  184. data/vendor/ring/include/openssl/arm_arch.h +129 -0
  185. data/vendor/ring/include/openssl/base.h +156 -0
  186. data/vendor/ring/include/openssl/bn.h +794 -0
  187. data/vendor/ring/include/openssl/buffer.h +18 -0
  188. data/vendor/ring/include/openssl/bytestring.h +235 -0
  189. data/vendor/ring/include/openssl/chacha.h +37 -0
  190. data/vendor/ring/include/openssl/cmac.h +76 -0
  191. data/vendor/ring/include/openssl/cpu.h +184 -0
  192. data/vendor/ring/include/openssl/crypto.h +43 -0
  193. data/vendor/ring/include/openssl/curve25519.h +88 -0
  194. data/vendor/ring/include/openssl/ec.h +225 -0
  195. data/vendor/ring/include/openssl/ec_key.h +129 -0
  196. data/vendor/ring/include/openssl/ecdh.h +110 -0
  197. data/vendor/ring/include/openssl/ecdsa.h +156 -0
  198. data/vendor/ring/include/openssl/err.h +201 -0
  199. data/vendor/ring/include/openssl/mem.h +101 -0
  200. data/vendor/ring/include/openssl/obj_mac.h +71 -0
  201. data/vendor/ring/include/openssl/opensslfeatures.h +68 -0
  202. data/vendor/ring/include/openssl/opensslv.h +18 -0
  203. data/vendor/ring/include/openssl/ossl_typ.h +18 -0
  204. data/vendor/ring/include/openssl/poly1305.h +51 -0
  205. data/vendor/ring/include/openssl/rand.h +70 -0
  206. data/vendor/ring/include/openssl/rsa.h +399 -0
  207. data/vendor/ring/include/openssl/thread.h +133 -0
  208. data/vendor/ring/include/openssl/type_check.h +71 -0
  209. data/vendor/ring/mk/Common.props +63 -0
  210. data/vendor/ring/mk/Windows.props +42 -0
  211. data/vendor/ring/mk/WindowsTest.props +18 -0
  212. data/vendor/ring/mk/appveyor.bat +62 -0
  213. data/vendor/ring/mk/bottom_of_makefile.mk +54 -0
  214. data/vendor/ring/mk/ring.mk +266 -0
  215. data/vendor/ring/mk/top_of_makefile.mk +214 -0
  216. data/vendor/ring/mk/travis.sh +40 -0
  217. data/vendor/ring/mk/update-travis-yml.py +229 -0
  218. data/vendor/ring/ring.sln +153 -0
  219. data/vendor/ring/src/aead.rs +682 -0
  220. data/vendor/ring/src/agreement.rs +248 -0
  221. data/vendor/ring/src/c.rs +129 -0
  222. data/vendor/ring/src/constant_time.rs +37 -0
  223. data/vendor/ring/src/der.rs +96 -0
  224. data/vendor/ring/src/digest.rs +690 -0
  225. data/vendor/ring/src/digest_tests.txt +57 -0
  226. data/vendor/ring/src/ecc.rs +28 -0
  227. data/vendor/ring/src/ecc_build.rs +279 -0
  228. data/vendor/ring/src/ecc_curves.rs +117 -0
  229. data/vendor/ring/src/ed25519_tests.txt +2579 -0
  230. data/vendor/ring/src/exe_tests.rs +46 -0
  231. data/vendor/ring/src/ffi.rs +29 -0
  232. data/vendor/ring/src/file_test.rs +187 -0
  233. data/vendor/ring/src/hkdf.rs +153 -0
  234. data/vendor/ring/src/hkdf_tests.txt +59 -0
  235. data/vendor/ring/src/hmac.rs +414 -0
  236. data/vendor/ring/src/hmac_tests.txt +97 -0
  237. data/vendor/ring/src/input.rs +312 -0
  238. data/vendor/ring/src/lib.rs +41 -0
  239. data/vendor/ring/src/pbkdf2.rs +265 -0
  240. data/vendor/ring/src/pbkdf2_tests.txt +113 -0
  241. data/vendor/ring/src/polyfill.rs +57 -0
  242. data/vendor/ring/src/rand.rs +28 -0
  243. data/vendor/ring/src/signature.rs +314 -0
  244. data/vendor/ring/third-party/NIST/README.md +9 -0
  245. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +263 -0
  246. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +309 -0
  247. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +267 -0
  248. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +263 -0
  249. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +309 -0
  250. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +267 -0
  251. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +263 -0
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +309 -0
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +267 -0
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +519 -0
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +309 -0
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +523 -0
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +519 -0
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +309 -0
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +523 -0
  260. data/vendor/ring/third-party/NIST/sha256sums.txt +1 -0
  261. metadata +333 -0
data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl
@@ -0,0 +1,2084 @@
+ #!/usr/bin/env perl
+ #
+ # ====================================================================
+ # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+ # project. The module is, however, dual licensed under OpenSSL and
+ # CRYPTOGAMS licenses depending on where you obtain it. For further
+ # details see http://www.openssl.org/~appro/cryptogams/.
+ # ====================================================================
+ #
+ # This module implements support for the Intel AES-NI extension. In
+ # OpenSSL context it's used with the Intel engine, but it can also be
+ # used as a drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see
+ # below for details].
+ #
+ # Performance.
+ #
+ # Given aes(enc|dec) instructions' latency, asymptotic performance for
+ # non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
+ # processed with a 128-bit key. And given their throughput, asymptotic
+ # performance for parallelizable modes is 1.25 cycles per byte. Being
+ # an asymptotic limit, it's not something you commonly achieve in
+ # reality, but how close does one get? Below are results collected for
+ # different modes and block sizes. Pairs of numbers are for en-/
+ # decryption.
+ #
+ #		16-byte    64-byte    256-byte   1-KB       8-KB
+ # ECB	4.25/4.25  1.38/1.38  1.28/1.28  1.26/1.26  1.26/1.26
+ # CTR	5.42/5.42  1.92/1.92  1.44/1.44  1.28/1.28  1.26/1.26
+ # CBC	4.38/4.43  4.15/1.43  4.07/1.32  4.07/1.29  4.06/1.28
+ # CCM	5.66/9.42  4.42/5.41  4.16/4.40  4.09/4.15  4.06/4.07
+ # OFB	5.42/5.42  4.64/4.64  4.44/4.44  4.39/4.39  4.38/4.38
+ # CFB	5.73/5.85  5.56/5.62  5.48/5.56  5.47/5.55  5.47/5.55
+ #
+ # ECB, CTR, CBC and CCM results are free from EVP overhead. This means
+ # that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
+ # [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
+ # The results were collected with a specially crafted speed.c benchmark
+ # in order to compare them with results reported in "Intel Advanced
+ # Encryption Standard (AES) New Instruction Set" White Paper Revision
+ # 3.0 dated May 2010. All above results are consistently better. This
+ # module also provides better performance for block sizes smaller than
+ # 128 bytes in points *not* represented in the above table.
+ #
+ # Looking at the results for the 8-KB buffer.
+ #
+ # CFB and OFB results are far from the limit, because the
+ # implementation uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces
+ # relying on single-block aesni_encrypt, which is not the most optimal
+ # way to go. The CBC encrypt result is unexpectedly high and there is
+ # no documented explanation for it. Seemingly there is a small penalty
+ # for feeding the result back to the AES unit the way it's done in CBC
+ # mode. There is nothing one can do about it and the result appears
+ # optimal. The CCM result is identical to CBC, because CBC-MAC is
+ # essentially CBC encrypt without saving output. CCM's CTR "stays
+ # invisible," because it's neatly interleaved with CBC-MAC. This
+ # provides ~30% improvement over a "straightforward" CCM
+ # implementation with CTR and CBC-MAC performed disjointly.
+ # Parallelizable modes practically achieve the theoretical limit.
+ #
+ # Looking at how results vary with buffer size.
+ #
+ # Curves are practically saturated at 1-KB buffer size. In most cases
+ # "256-byte" performance is >95%, and "64-byte" is ~90%, of the "8-KB"
+ # one. The CTR curve doesn't follow this pattern and is the slowest-
+ # changing one, with the "256-byte" result being 87% of "8-KB." This
+ # is because overhead in CTR mode is most computationally intensive.
+ # Small-block CCM decrypt is slower than encrypt, because the first
+ # CTR and last CBC-MAC iterations can't be interleaved.
+ #
+ # Results for 192- and 256-bit keys.
+ #
+ # EVP-free results were observed to scale perfectly with the number of
+ # rounds for larger block sizes, i.e. the 192-bit result being 10/12
+ # times lower and the 256-bit one 10/14 times lower. In the CBC
+ # encrypt case the differences are a tad smaller, because the above
+ # mentioned penalty biases all results by the same constant value. In
+ # a similar way function call overhead affects small-block performance,
+ # as well as OFB and CFB results. Differences are not large; the most
+ # common coefficients are 10/11.7 and 10/13.4 (as opposed to 10/12.0
+ # and 10/14.0), but one observes even 10/11.2 and 10/12.4 (CTR, OFB,
+ # CFB)...
+
+ # January 2011
+ #
+ # While the Westmere processor features 6-cycle latency for aes[enc|dec]
+ # instructions, which can be scheduled every second cycle, Sandy
+ # Bridge spends 8 cycles per instruction but can schedule them every
+ # cycle. This means that code targeting Westmere would perform
+ # suboptimally on Sandy Bridge. Hence this update.
+ #
+ # In addition, non-parallelizable CBC encrypt (as well as CCM) is
+ # optimized. The relative improvement might appear modest, 8% on
+ # Westmere, but in absolute terms it's 3.77 cycles per byte encrypted
+ # with a 128-bit key on Westmere, and 5.07 on Sandy Bridge. These
+ # numbers should be compared to the asymptotic limits of 3.75 for
+ # Westmere and 5.00 for Sandy Bridge. Actually, the fact that they get
+ # this close to the asymptotic limits is quite amazing. Indeed, the
+ # limit is calculated as latency times number of rounds, 10 for a
+ # 128-bit key, divided by 16, the number of bytes in a block; in other
+ # words it accounts *solely* for aesenc instructions. But there are
+ # extra instructions, and numbers so close to the asymptotic limits
+ # mean that it's as if it takes as little as *one* additional cycle to
+ # execute all of them. How is that possible? It is possible thanks to
+ # out-of-order execution logic, which manages to overlap
+ # post-processing of the previous block, things like saving the
+ # output, with actual encryption of the current block, as well as
+ # pre-processing of the current block, things like fetching input and
+ # xor-ing it with the 0-round element of the key schedule, with actual
+ # encryption of the previous block. Keep this in mind...
+ #
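+ # (Worked example of the limit arithmetic above, an editorial
+ # addition: with 10 rounds for a 128-bit key and 16 bytes per block,
+ #	Westmere:	6 cycles * 10 rounds / 16 bytes = 3.75 cpb
+ #	Sandy Bridge:	8 cycles * 10 rounds / 16 bytes = 5.00 cpb
+ # i.e. exactly the serial limits quoted above.)
+ #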
+ # For parallelizable modes, such as ECB, CBC decrypt and CTR, higher
+ # performance is achieved by interleaving instructions working on
+ # independent blocks. In that case the asymptotic limit for such modes
+ # can be obtained by dividing the above-mentioned numbers by the AES
+ # instructions' interleave factor. Westmere can execute at most 3
+ # instructions at a time, meaning that the optimal interleave factor
+ # is 3, and that's where the "magic" number of 1.25 comes from.
+ # "Optimal interleave factor" means that increasing the interleave
+ # factor does not improve performance. The formula has proven to
+ # reflect reality pretty well on Westmere... Sandy Bridge on the other
+ # hand can execute up to 8 AES instructions at a time, so how does
+ # varying the interleave factor affect performance? Here is a table
+ # for ECB (numbers are cycles per byte processed with a 128-bit key):
+ #
+ # instruction interleave factor		3x	6x	8x
+ # theoretical asymptotic limit		1.67	0.83	0.625
+ # measured performance for 8KB block	1.05	0.86	0.84
+ #
+ # "as if" interleave factor			4.7x	5.8x	6.0x
+ #
+ # Further data for other parallelizable modes:
+ #
+ # CBC decrypt				1.16	0.93	0.74
+ # CTR					1.14	0.91	0.74
+ #
+ # Well, given the 3x column it's probably inappropriate to call the
+ # limit asymptotic, if it can be surpassed, isn't it? What happens
+ # there? Rewind to the CBC paragraph for the answer. Yes, out-of-order
+ # execution magic is responsible for this. The processor overlaps not
+ # only the additional instructions with the AES ones, but even AES
+ # instructions processing adjacent triplets of independent blocks. In
+ # the 6x case the additional instructions still claim a
+ # disproportionally small amount of additional cycles, but in the 8x
+ # case the number of instructions must be a tad too high for the
+ # out-of-order logic to cope with, and the AES unit remains
+ # underutilized... As you can see, 8x interleave is hardly
+ # justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
+ # utilizes 6x interleave because of limited register bank capacity.
+ #
+ # Higher interleave factors do have a negative impact on Westmere
+ # performance. While for ECB mode it's negligible, ~1.5%, other
+ # parallelizable modes perform ~5% worse, which is outweighed by ~25%
+ # improvement on Sandy Bridge. To balance the regression on Westmere,
+ # CTR mode was implemented with a 6x aesenc interleave factor.
+
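+ # (Illustrative arithmetic, an editorial addition: dividing the serial
+ # limits by the interleave factor reproduces the figures above, e.g.
+ # Sandy Bridge ECB: 5.00/3 = 1.67, 5.00/6 = 0.83, 5.00/8 = 0.625 cpb,
+ # and Westmere's "magic" number: 3.75/3 = 1.25 cpb.)
+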
+ # April 2011
+ #
+ # Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
+ # one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
+ # in CTR mode AES instruction interleave factor was chosen to be 6x.
+
+ ######################################################################
+ # Current large-block performance in cycles per byte processed with
+ # 128-bit key (less is better).
+ #
+ #		CBC en-/decrypt	CTR	XTS	ECB
+ # Westmere	3.77/1.25	1.25	1.25	1.26
+ # * Bridge	5.07/0.74	0.75	0.90	0.85
+ # Haswell	4.44/0.63	0.63	0.73	0.63
+ # Silvermont	5.75/3.54	3.56	4.12	3.87(*)
+ # Bulldozer	5.77/0.70	0.72	0.90	0.70
+ #
+ # (*) Atom Silvermont ECB result is suboptimal because of penalties
+ # incurred by operations on %xmm8-15. As ECB is not considered
+ # critical, nothing was done to mitigate the problem.
+
+ $PREFIX="aesni";	# if $PREFIX is set to "AES", the script
+			# generates drop-in replacement for
+			# crypto/aes/asm/aes-x86_64.pl:-)
+
+ $flavour = shift;
+ $output = shift;
+ if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+ $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+ die "can't locate x86_64-xlate.pl";
+
+ open OUT,"| \"$^X\" $xlate $flavour $output";
+ *STDOUT=*OUT;
+
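+ # (Usage sketch, an editorial addition: the script takes a perlasm
+ # "flavour" and an output path, and pipes its generated code through
+ # x86_64-xlate.pl, e.g.
+ #	perl aesni-x86_64.pl elf  aesni-x86_64.S	# ELF/gas syntax
+ #	perl aesni-x86_64.pl nasm aesni-x86_64.asm	# Windows/NASM
+ # If the first argument contains a dot it is treated as the output
+ # file instead, the flavour stays undefined, and only the .asm test
+ # above selects Win64 behaviour.)
+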
+ $movkey = $PREFIX eq "aesni" ? "movups" : "movups";
+ @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+
+ $code=".text\n";
+ $code.=".extern OPENSSL_ia32cap_P\n";
+
+ $rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
+ # this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
+ $inp="%rdi";
+ $out="%rsi";
+ $len="%rdx";
+ $key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
+ $ivp="%r8"; # cbc, ctr, ...
+
+ $rnds_="%r10d"; # backup copy for $rounds
+ $key_="%r11"; # backup copy for $key
+
+ # %xmm register layout
+ $rndkey0="%xmm0"; $rndkey1="%xmm1";
+ $inout0="%xmm2"; $inout1="%xmm3";
+ $inout2="%xmm4"; $inout3="%xmm5";
+ $inout4="%xmm6"; $inout5="%xmm7";
+ $inout6="%xmm8"; $inout7="%xmm9";
+
+ $in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
+ $in0="%xmm8"; $iv="%xmm9";
+
+ # Inline version of internal aesni_[en|de]crypt1.
+ #
+ # Why folded loop? Because aes[enc|dec] is slow enough to accommodate
+ # cycles which take care of loop variables...
+ { my $sn;
+ sub aesni_generate1 {
+ my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
+ ++$sn;
+ $code.=<<___;
+ $movkey ($key),$rndkey0
+ $movkey 16($key),$rndkey1
+ ___
+ $code.=<<___ if (defined($ivec));
+ xorps $rndkey0,$ivec
+ lea 32($key),$key
+ xorps $ivec,$inout
+ ___
+ $code.=<<___ if (!defined($ivec));
+ lea 32($key),$key
+ xorps $rndkey0,$inout
+ ___
+ $code.=<<___;
+ .Loop_${p}1_$sn:
+ aes${p} $rndkey1,$inout
+ dec $rounds
+ $movkey ($key),$rndkey1
+ lea 16($key),$key
+ jnz .Loop_${p}1_$sn # loop body is 16 bytes
+ aes${p}last $rndkey1,$inout
+ ___
+ }}
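+ # (Illustrative expansion, an editorial addition: with the default
+ # $inout0/%xmm2 and $movkey/movups, a call such as
+ # &aesni_generate1("enc",$key,$rounds) emits code of this shape:
+ #	movups	($key),%xmm0		# round-0 (whitening) key
+ #	movups	16($key),%xmm1		# first round key
+ #	lea	32($key),$key
+ #	xorps	%xmm0,%xmm2		# whiten the block
+ # .Loop_enc1_N:
+ #	aesenc	%xmm1,%xmm2		# one round per iteration
+ #	dec	%eax
+ #	movups	($key),%xmm1		# fetch next round key
+ #	lea	16($key),$key
+ #	jnz	.Loop_enc1_N
+ #	aesenclast	%xmm1,%xmm2	# final round
+ # i.e. the rounds are folded into a 16-byte loop body rather than
+ # unrolled per key size.)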
+ # void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
+ #
+ { my ($inp,$out,$key) = @_4args;
+
+ $code.=<<___;
+ .globl ${PREFIX}_encrypt
+ .type ${PREFIX}_encrypt,\@abi-omnipotent
+ .align 16
+ ${PREFIX}_encrypt:
+ movups ($inp),$inout0 # load input
+ mov 240($key),$rounds # key->rounds
+ ___
+ &aesni_generate1("enc",$key,$rounds);
+ $code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ movups $inout0,($out) # output
+ pxor $inout0,$inout0
+ ret
+ .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
+
+ .globl ${PREFIX}_decrypt
+ .type ${PREFIX}_decrypt,\@abi-omnipotent
+ .align 16
+ ${PREFIX}_decrypt:
+ movups ($inp),$inout0 # load input
+ mov 240($key),$rounds # key->rounds
+ ___
+ &aesni_generate1("dec",$key,$rounds);
+ $code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ movups $inout0,($out) # output
+ pxor $inout0,$inout0
+ ret
+ .size ${PREFIX}_decrypt,.-${PREFIX}_decrypt
+ ___
+ }
+
+ # _aesni_[en|de]cryptN are private interfaces, N denotes the interleave
+ # factor. Why were 3x subroutines originally used in loops? Even though
+ # aes[enc|dec] latency was originally 6, it could be scheduled only
+ # every *2nd* cycle. Thus 3x interleave was the one providing optimal
+ # utilization, i.e. when the subroutine's throughput is virtually the
+ # same as that of the non-interleaved subroutine [for up to 3 input
+ # blocks]. This is why it originally made no sense to implement a 2x
+ # subroutine. But times change and it became appropriate to spend the
+ # extra 192 bytes on a 2x subroutine on Atom Silvermont's account. For
+ # processors that can schedule aes[enc|dec] every cycle the optimal
+ # interleave factor equals the corresponding instruction latency. 8x is
+ # optimal for * Bridge and "super-optimal" for other Intel CPUs...
+
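+ # (Worked example, an editorial addition: the optimal interleave here
+ # is latency divided by issue interval, so Westmere gets 6 cycles / 2
+ # cycles = 3-way, while Sandy Bridge, which issues aes[enc|dec] every
+ # cycle at 8-cycle latency, gets 8 / 1 = 8-way.)
+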
+ sub aesni_generate2 {
+ my $dir=shift;
+ # As already mentioned it takes in $key and $rounds, which are *not*
+ # preserved. $inout[0-1] is cipher/clear text...
+ $code.=<<___;
+ .type _aesni_${dir}rypt2,\@abi-omnipotent
+ .align 16
+ _aesni_${dir}rypt2:
+ $movkey ($key),$rndkey0
+ shl \$4,$rounds
+ $movkey 16($key),$rndkey1
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ $movkey 32($key),$rndkey0
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
+ add \$16,%rax
+
+ .L${dir}_loop2:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ $movkey -16($key,%rax),$rndkey0
+ jnz .L${dir}_loop2
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ ret
+ .size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2
+ ___
+ }
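+ # (Editorial note on the indexing trick above, a best-effort reading
+ # of the code: "shl \$4,$rounds" turns the round count into a byte
+ # offset, "lea 32($key,$rounds),$key" parks $key past the end of the
+ # key schedule, and negating %rax yields a negative offset that walks
+ # forward through the round keys two at a time; "add \$32,%rax" then
+ # doubles as index update and loop condition, hitting zero at the end
+ # of the schedule. This is the "twisted $rounds" mentioned in later
+ # comments.)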
+ sub aesni_generate3 {
+ my $dir=shift;
+ # As already mentioned it takes in $key and $rounds, which are *not*
+ # preserved. $inout[0-2] is cipher/clear text...
+ $code.=<<___;
+ .type _aesni_${dir}rypt3,\@abi-omnipotent
+ .align 16
+ _aesni_${dir}rypt3:
+ $movkey ($key),$rndkey0
+ shl \$4,$rounds
+ $movkey 16($key),$rndkey1
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ xorps $rndkey0,$inout2
+ $movkey 32($key),$rndkey0
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
+ add \$16,%rax
+
+ .L${dir}_loop3:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ aes${dir} $rndkey0,$inout2
+ $movkey -16($key,%rax),$rndkey0
+ jnz .L${dir}_loop3
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ ret
+ .size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
+ ___
+ }
+ # 4x interleave is implemented to improve small block performance,
+ # most notably [and naturally] 4-block by ~30%. One can argue that one
+ # should have implemented 5x as well, but improvement would be <20%,
+ # so it's not worth it...
+ sub aesni_generate4 {
+ my $dir=shift;
+ # As already mentioned it takes in $key and $rounds, which are *not*
+ # preserved. $inout[0-3] is cipher/clear text...
+ $code.=<<___;
+ .type _aesni_${dir}rypt4,\@abi-omnipotent
+ .align 16
+ _aesni_${dir}rypt4:
+ $movkey ($key),$rndkey0
+ shl \$4,$rounds
+ $movkey 16($key),$rndkey1
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ xorps $rndkey0,$inout2
+ xorps $rndkey0,$inout3
+ $movkey 32($key),$rndkey0
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
+ .byte 0x0f,0x1f,0x00
+ add \$16,%rax
+
+ .L${dir}_loop4:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ aes${dir} $rndkey0,$inout2
+ aes${dir} $rndkey0,$inout3
+ $movkey -16($key,%rax),$rndkey0
+ jnz .L${dir}_loop4
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ aes${dir}last $rndkey0,$inout3
+ ret
+ .size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
+ ___
+ }
+ sub aesni_generate6 {
+ my $dir=shift;
+ # As already mentioned it takes in $key and $rounds, which are *not*
+ # preserved. $inout[0-5] is cipher/clear text...
+ $code.=<<___;
+ .type _aesni_${dir}rypt6,\@abi-omnipotent
+ .align 16
+ _aesni_${dir}rypt6:
+ $movkey ($key),$rndkey0
+ shl \$4,$rounds
+ $movkey 16($key),$rndkey1
+ xorps $rndkey0,$inout0
+ pxor $rndkey0,$inout1
+ pxor $rndkey0,$inout2
+ aes${dir} $rndkey1,$inout0
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
+ aes${dir} $rndkey1,$inout1
+ pxor $rndkey0,$inout3
+ pxor $rndkey0,$inout4
+ aes${dir} $rndkey1,$inout2
+ pxor $rndkey0,$inout5
+ $movkey ($key,%rax),$rndkey0
+ add \$16,%rax
+ jmp .L${dir}_loop6_enter
+ .align 16
+ .L${dir}_loop6:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ .L${dir}_loop6_enter:
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ aes${dir} $rndkey0,$inout2
+ aes${dir} $rndkey0,$inout3
+ aes${dir} $rndkey0,$inout4
+ aes${dir} $rndkey0,$inout5
+ $movkey -16($key,%rax),$rndkey0
+ jnz .L${dir}_loop6
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ aes${dir}last $rndkey0,$inout3
+ aes${dir}last $rndkey0,$inout4
+ aes${dir}last $rndkey0,$inout5
+ ret
+ .size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
+ ___
+ }
+ sub aesni_generate8 {
+ my $dir=shift;
+ # As already mentioned it takes in $key and $rounds, which are *not*
+ # preserved. $inout[0-7] is cipher/clear text...
+ $code.=<<___;
+ .type _aesni_${dir}rypt8,\@abi-omnipotent
+ .align 16
+ _aesni_${dir}rypt8:
+ $movkey ($key),$rndkey0
+ shl \$4,$rounds
+ $movkey 16($key),$rndkey1
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ pxor $rndkey0,$inout2
+ pxor $rndkey0,$inout3
+ pxor $rndkey0,$inout4
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
+ aes${dir} $rndkey1,$inout0
+ pxor $rndkey0,$inout5
+ pxor $rndkey0,$inout6
+ aes${dir} $rndkey1,$inout1
+ pxor $rndkey0,$inout7
+ $movkey ($key,%rax),$rndkey0
+ add \$16,%rax
+ jmp .L${dir}_loop8_inner
+ .align 16
+ .L${dir}_loop8:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ .L${dir}_loop8_inner:
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+ aes${dir} $rndkey1,$inout6
+ aes${dir} $rndkey1,$inout7
+ .L${dir}_loop8_enter:
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ aes${dir} $rndkey0,$inout2
+ aes${dir} $rndkey0,$inout3
+ aes${dir} $rndkey0,$inout4
+ aes${dir} $rndkey0,$inout5
+ aes${dir} $rndkey0,$inout6
+ aes${dir} $rndkey0,$inout7
+ $movkey -16($key,%rax),$rndkey0
+ jnz .L${dir}_loop8
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+ aes${dir} $rndkey1,$inout6
+ aes${dir} $rndkey1,$inout7
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ aes${dir}last $rndkey0,$inout3
+ aes${dir}last $rndkey0,$inout4
+ aes${dir}last $rndkey0,$inout5
+ aes${dir}last $rndkey0,$inout6
+ aes${dir}last $rndkey0,$inout7
+ ret
+ .size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
+ ___
+ }
+ &aesni_generate2("enc") if ($PREFIX eq "aesni");
+ &aesni_generate2("dec");
+ &aesni_generate3("enc") if ($PREFIX eq "aesni");
+ &aesni_generate3("dec");
+ &aesni_generate4("enc") if ($PREFIX eq "aesni");
+ &aesni_generate4("dec");
+ &aesni_generate6("enc") if ($PREFIX eq "aesni");
+ &aesni_generate6("dec");
+ &aesni_generate8("enc") if ($PREFIX eq "aesni");
+ &aesni_generate8("dec");
+
+ if ($PREFIX eq "aesni") {
+ {
+ ######################################################################
+ # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
+ # size_t blocks, const AES_KEY *key,
+ # const char *ivec,char *cmac);
+ #
+ # Handles only complete blocks, operates on 64-bit counter and
+ # does not update *ivec! Nor does it finalize CMAC value
+ # (see engine/eng_aesni.c for details)
+ #
+ {
+ my $cmac="%r9"; # 6th argument
+
+ my $increment="%xmm9";
+ my $iv="%xmm6";
+ my $bswap_mask="%xmm7";
+
+ $code.=<<___;
+ .globl aesni_ccm64_encrypt_blocks
+ .type aesni_ccm64_encrypt_blocks,\@function,6
+ .align 16
+ aesni_ccm64_encrypt_blocks:
+ ___
+ $code.=<<___ if ($win64);
+ lea -0x58(%rsp),%rsp
+ movaps %xmm6,(%rsp) # $iv
+ movaps %xmm7,0x10(%rsp) # $bswap_mask
+ movaps %xmm8,0x20(%rsp) # $in0
+ movaps %xmm9,0x30(%rsp) # $increment
+ .Lccm64_enc_body:
+ ___
+ $code.=<<___;
+ mov 240($key),$rounds # key->rounds
+ movdqu ($ivp),$iv
+ movdqa .Lincrement64(%rip),$increment
+ movdqa .Lbswap_mask(%rip),$bswap_mask
+
+ shl \$4,$rounds
+ mov \$16,$rnds_
+ lea 0($key),$key_
+ movdqu ($cmac),$inout1
+ movdqa $iv,$inout0
+ lea 32($key,$rounds),$key # end of key schedule
+ pshufb $bswap_mask,$iv
+ sub %rax,%r10 # twisted $rounds
+ jmp .Lccm64_enc_outer
+ .align 16
+ .Lccm64_enc_outer:
+ $movkey ($key_),$rndkey0
+ mov %r10,%rax
+ movups ($inp),$in0 # load inp
+
+ xorps $rndkey0,$inout0 # counter
+ $movkey 16($key_),$rndkey1
+ xorps $in0,$rndkey0
+ xorps $rndkey0,$inout1 # cmac^=inp
+ $movkey 32($key_),$rndkey0
+
+ .Lccm64_enc2_loop:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Lccm64_enc2_loop
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ paddq $increment,$iv
+ dec $len # $len-- ($len is in blocks)
+ aesenclast $rndkey0,$inout0
+ aesenclast $rndkey0,$inout1
+
+ lea 16($inp),$inp
+ xorps $inout0,$in0 # inp ^= E(iv)
+ movdqa $iv,$inout0
+ movups $in0,($out) # save output
+ pshufb $bswap_mask,$inout0
+ lea 16($out),$out # $out+=16
+ jnz .Lccm64_enc_outer # loop if ($len!=0)
+
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ pxor $inout0,$inout0
+ movups $inout1,($cmac) # store resulting mac
+ pxor $inout1,$inout1
+ pxor $in0,$in0
+ pxor $iv,$iv
+ ___
+ $code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps %xmm0,(%rsp) # clear stack
+ movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
+ movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
+ lea 0x58(%rsp),%rsp
+ .Lccm64_enc_ret:
+ ___
+ $code.=<<___;
+ ret
+ .size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
+ ___
+ ######################################################################
+ $code.=<<___;
+ .globl aesni_ccm64_decrypt_blocks
+ .type aesni_ccm64_decrypt_blocks,\@function,6
+ .align 16
+ aesni_ccm64_decrypt_blocks:
+ ___
+ $code.=<<___ if ($win64);
+ lea -0x58(%rsp),%rsp
+ movaps %xmm6,(%rsp) # $iv
+ movaps %xmm7,0x10(%rsp) # $bswap_mask
+ movaps %xmm8,0x20(%rsp) # $in0
+ movaps %xmm9,0x30(%rsp) # $increment
+ .Lccm64_dec_body:
+ ___
+ $code.=<<___;
+ mov 240($key),$rounds # key->rounds
+ movups ($ivp),$iv
+ movdqu ($cmac),$inout1
+ movdqa .Lincrement64(%rip),$increment
+ movdqa .Lbswap_mask(%rip),$bswap_mask
+
+ movaps $iv,$inout0
+ mov $rounds,$rnds_
+ mov $key,$key_
+ pshufb $bswap_mask,$iv
+ ___
+ &aesni_generate1("enc",$key,$rounds);
+ $code.=<<___;
+ shl \$4,$rnds_
+ mov \$16,$rounds
+ movups ($inp),$in0 # load inp
+ paddq $increment,$iv
+ lea 16($inp),$inp # $inp+=16
+ sub %r10,%rax # twisted $rounds
+ lea 32($key_,$rnds_),$key # end of key schedule
+ mov %rax,%r10
+ jmp .Lccm64_dec_outer
+ .align 16
+ .Lccm64_dec_outer:
+ xorps $inout0,$in0 # inp ^= E(iv)
+ movdqa $iv,$inout0
+ movups $in0,($out) # save output
+ lea 16($out),$out # $out+=16
+ pshufb $bswap_mask,$inout0
+
+ sub \$1,$len # $len-- ($len is in blocks)
+ jz .Lccm64_dec_break # if ($len==0) break
+
+ $movkey ($key_),$rndkey0
+ mov %r10,%rax
+ $movkey 16($key_),$rndkey1
+ xorps $rndkey0,$in0
+ xorps $rndkey0,$inout0
+ xorps $in0,$inout1 # cmac^=out
+ $movkey 32($key_),$rndkey0
+ jmp .Lccm64_dec2_loop
+ .align 16
+ .Lccm64_dec2_loop:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Lccm64_dec2_loop
+ movups ($inp),$in0 # load input
+ paddq $increment,$iv
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenclast $rndkey0,$inout0
+ aesenclast $rndkey0,$inout1
+ lea 16($inp),$inp # $inp+=16
+ jmp .Lccm64_dec_outer
+
+ .align 16
+ .Lccm64_dec_break:
+ #xorps $in0,$inout1 # cmac^=out
+ mov 240($key_),$rounds
+ ___
+ &aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
+ $code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ pxor $inout0,$inout0
+ movups $inout1,($cmac) # store resulting mac
+ pxor $inout1,$inout1
+ pxor $in0,$in0
+ pxor $iv,$iv
+ ___
+ $code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps %xmm0,(%rsp) # clear stack
+ movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
+ movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
+ lea 0x58(%rsp),%rsp
+ .Lccm64_dec_ret:
+ ___
+ $code.=<<___;
+ ret
+ .size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
+ ___
+ }
+ ######################################################################
+ # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
+ # size_t blocks, const AES_KEY *key,
+ # const char *ivec);
+ #
+ # Handles only complete blocks, operates on 32-bit counter and
+ # does not update *ivec! (see crypto/modes/ctr128.c for details)
+ #
+ # Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
+ # http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
+ # Keywords are full unroll and modulo-schedule counter calculations
+ # with zero-round key xor.
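+ #
+ # (Editorial gloss on "zero-round key xor": the counter blocks kept on
+ # the stack are stored already xor-ed with the 0-round key, so per
+ # block only the 4-byte big-endian counter word has to be recomputed -
+ # incremented, byte-swapped and xor-ed with the last dword of that key
+ # (the "mov 12($ivp),$ctr" / "mov 12($key),$key0" pair below) - while
+ # the remaining 12 bytes never change.)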
+ {
+ my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
+ my ($key0,$ctr)=("${key_}d","${ivp}d");
+ my $frame_size = 0x80 + ($win64?160:0);
+
+ $code.=<<___;
+ .globl aesni_ctr32_encrypt_blocks
+ .type aesni_ctr32_encrypt_blocks,\@function,5
+ .align 16
+ aesni_ctr32_encrypt_blocks:
+ cmp \$1,$len
+ jne .Lctr32_bulk
+
+ # handle single block without allocating stack frame,
+ # useful when handling edges
+ movups ($ivp),$inout0
+ movups ($inp),$inout1
+ mov 240($key),%edx # key->rounds
+ ___
+ &aesni_generate1("enc",$key,"%edx");
+ $code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ xorps $inout1,$inout0
+ pxor $inout1,$inout1
+ movups $inout0,($out)
+ xorps $inout0,$inout0
+ jmp .Lctr32_epilogue
+
+ .align 16
+ .Lctr32_bulk:
+ lea (%rsp),%rax
+ push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
+ ___
+ $code.=<<___ if ($win64);
+ movaps %xmm6,-0xa8(%rax) # offload everything
+ movaps %xmm7,-0x98(%rax)
+ movaps %xmm8,-0x88(%rax)
+ movaps %xmm9,-0x78(%rax)
+ movaps %xmm10,-0x68(%rax)
+ movaps %xmm11,-0x58(%rax)
+ movaps %xmm12,-0x48(%rax)
+ movaps %xmm13,-0x38(%rax)
+ movaps %xmm14,-0x28(%rax)
+ movaps %xmm15,-0x18(%rax)
+ .Lctr32_body:
+ ___
+ $code.=<<___;
+ lea -8(%rax),%rbp
+
+ # 8 16-byte words on top of stack are counter values
+ # xor-ed with zero-round key
+
+ movdqu ($ivp),$inout0
+ movdqu ($key),$rndkey0
+ mov 12($ivp),$ctr # counter LSB
+ pxor $rndkey0,$inout0
+ mov 12($key),$key0 # 0-round key LSB
+ movdqa $inout0,0x00(%rsp) # populate counter block
+ bswap $ctr
+ movdqa $inout0,$inout1
+ movdqa $inout0,$inout2
+ movdqa $inout0,$inout3
+ movdqa $inout0,0x40(%rsp)
+ movdqa $inout0,0x50(%rsp)
+ movdqa $inout0,0x60(%rsp)
+ mov %rdx,%r10 # about to borrow %rdx
+ movdqa $inout0,0x70(%rsp)
+
+ lea 1($ctr),%rax
+ lea 2($ctr),%rdx
+ bswap %eax
+ bswap %edx
+ xor $key0,%eax
+ xor $key0,%edx
+ pinsrd \$3,%eax,$inout1
+ lea 3($ctr),%rax
+ movdqa $inout1,0x10(%rsp)
+ pinsrd \$3,%edx,$inout2
+ bswap %eax
+ mov %r10,%rdx # restore %rdx
+ lea 4($ctr),%r10
+ movdqa $inout2,0x20(%rsp)
+ xor $key0,%eax
+ bswap %r10d
+ pinsrd \$3,%eax,$inout3
+ xor $key0,%r10d
+ movdqa $inout3,0x30(%rsp)
+ lea 5($ctr),%r9
+ mov %r10d,0x40+12(%rsp)
+ bswap %r9d
+ lea 6($ctr),%r10
+ mov 240($key),$rounds # key->rounds
+ xor $key0,%r9d
+ bswap %r10d
+ mov %r9d,0x50+12(%rsp)
+ xor $key0,%r10d
+ lea 7($ctr),%r9
+ mov %r10d,0x60+12(%rsp)
+ bswap %r9d
+ mov OPENSSL_ia32cap_P+4(%rip),%r10d
+ xor $key0,%r9d
+ and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
+ mov %r9d,0x70+12(%rsp)
+
+ $movkey 0x10($key),$rndkey1
+
+ movdqa 0x40(%rsp),$inout4
+ movdqa 0x50(%rsp),$inout5
+
+ cmp \$8,$len # $len is in blocks
+ jb .Lctr32_tail # short input if ($len<8)
+
+ sub \$6,$len # $len is biased by -6
+ cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE
+ je .Lctr32_6x # [which denotes Atom Silvermont]
+
+ lea 0x80($key),$key # size optimization
+ sub \$2,$len # $len is biased by -8
+ jmp .Lctr32_loop8
+
+ .align 16
+ .Lctr32_6x:
+ shl \$4,$rounds
+ mov \$48,$rnds_
+ bswap $key0
+ lea 32($key,$rounds),$key # end of key schedule
+ sub %rax,%r10 # twisted $rounds
+ jmp .Lctr32_loop6
+
+ .align 16
+ .Lctr32_loop6:
+ add \$6,$ctr # next counter value
+ $movkey -48($key,$rnds_),$rndkey0
+ aesenc $rndkey1,$inout0
+ mov $ctr,%eax
+ xor $key0,%eax
+ aesenc $rndkey1,$inout1
+ movbe %eax,`0x00+12`(%rsp) # store next counter value
+ lea 1($ctr),%eax
+ aesenc $rndkey1,$inout2
+ xor $key0,%eax
+ movbe %eax,`0x10+12`(%rsp)
+ aesenc $rndkey1,$inout3
+ lea 2($ctr),%eax
+ xor $key0,%eax
+ aesenc $rndkey1,$inout4
+ movbe %eax,`0x20+12`(%rsp)
+ lea 3($ctr),%eax
+ aesenc $rndkey1,$inout5
+ $movkey -32($key,$rnds_),$rndkey1
+ xor $key0,%eax
+
+ aesenc $rndkey0,$inout0
+ movbe %eax,`0x30+12`(%rsp)
+ lea 4($ctr),%eax
+ aesenc $rndkey0,$inout1
+ xor $key0,%eax
+ movbe %eax,`0x40+12`(%rsp)
+ aesenc $rndkey0,$inout2
+ lea 5($ctr),%eax
+ xor $key0,%eax
+ aesenc $rndkey0,$inout3
+ movbe %eax,`0x50+12`(%rsp)
+ mov %r10,%rax # mov $rnds_,$rounds
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey -16($key,$rnds_),$rndkey0
+
+ call .Lenc_loop6
+
+ movdqu ($inp),$inout6 # load 6 input blocks
+ movdqu 0x10($inp),$inout7
+ movdqu 0x20($inp),$in0
+ movdqu 0x30($inp),$in1
+ movdqu 0x40($inp),$in2
+ movdqu 0x50($inp),$in3
+ lea 0x60($inp),$inp # $inp+=6*16
+ $movkey -64($key,$rnds_),$rndkey1
+ pxor $inout0,$inout6 # inp^=E(ctr)
+ movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round]
+ pxor $inout1,$inout7
+ movaps 0x10(%rsp),$inout1
+ pxor $inout2,$in0
+ movaps 0x20(%rsp),$inout2
+ pxor $inout3,$in1
+ movaps 0x30(%rsp),$inout3
+ pxor $inout4,$in2
+ movaps 0x40(%rsp),$inout4
+ pxor $inout5,$in3
+ movaps 0x50(%rsp),$inout5
+ movdqu $inout6,($out) # store 6 output blocks
+ movdqu $inout7,0x10($out)
+ movdqu $in0,0x20($out)
+ movdqu $in1,0x30($out)
+ movdqu $in2,0x40($out)
+ movdqu $in3,0x50($out)
+ lea 0x60($out),$out # $out+=6*16
+
+ sub \$6,$len
+ jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow
+
+ add \$6,$len # restore real remaining $len
+ jz .Lctr32_done # done if ($len==0)
+
+ lea -48($rnds_),$rounds
+ lea -80($key,$rnds_),$key # restore $key
+ neg $rounds
+ shr \$4,$rounds # restore $rounds
+ jmp .Lctr32_tail
+
1015
+ .align 32
1016
+ .Lctr32_loop8:
1017
+ add \$8,$ctr # next counter value
1018
+ movdqa 0x60(%rsp),$inout6
1019
+ aesenc $rndkey1,$inout0
1020
+ mov $ctr,%r9d
1021
+ movdqa 0x70(%rsp),$inout7
1022
+ aesenc $rndkey1,$inout1
1023
+ bswap %r9d
1024
+ $movkey 0x20-0x80($key),$rndkey0
1025
+ aesenc $rndkey1,$inout2
1026
+ xor $key0,%r9d
1027
+ nop
1028
+ aesenc $rndkey1,$inout3
1029
+ mov %r9d,0x00+12(%rsp) # store next counter value
1030
+ lea 1($ctr),%r9
1031
+ aesenc $rndkey1,$inout4
1032
+ aesenc $rndkey1,$inout5
1033
+ aesenc $rndkey1,$inout6
1034
+ aesenc $rndkey1,$inout7
1035
+ $movkey 0x30-0x80($key),$rndkey1
1036
+ ___
1037
+ for($i=2;$i<8;$i++) {
1038
+ my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
1039
+ $code.=<<___;
1040
+ bswap %r9d
1041
+ aesenc $rndkeyx,$inout0
1042
+ aesenc $rndkeyx,$inout1
1043
+ xor $key0,%r9d
1044
+ .byte 0x66,0x90
1045
+ aesenc $rndkeyx,$inout2
1046
+ aesenc $rndkeyx,$inout3
1047
+ mov %r9d,`0x10*($i-1)`+12(%rsp)
1048
+ lea $i($ctr),%r9
1049
+ aesenc $rndkeyx,$inout4
1050
+ aesenc $rndkeyx,$inout5
1051
+ aesenc $rndkeyx,$inout6
1052
+ aesenc $rndkeyx,$inout7
1053
+ $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx
1054
+ ___
1055
+ }
1056
+ $code.=<<___;
1057
+ bswap %r9d
1058
+ aesenc $rndkey0,$inout0
1059
+ aesenc $rndkey0,$inout1
1060
+ aesenc $rndkey0,$inout2
1061
+ xor $key0,%r9d
1062
+ movdqu 0x00($inp),$in0 # start loading input
1063
+ aesenc $rndkey0,$inout3
1064
+ mov %r9d,0x70+12(%rsp)
1065
+ cmp \$11,$rounds
1066
+ aesenc $rndkey0,$inout4
1067
+ aesenc $rndkey0,$inout5
1068
+ aesenc $rndkey0,$inout6
1069
+ aesenc $rndkey0,$inout7
1070
+ $movkey 0xa0-0x80($key),$rndkey0
1071
+
1072
+ jb .Lctr32_enc_done
1073
+
1074
+ aesenc $rndkey1,$inout0
1075
+ aesenc $rndkey1,$inout1
1076
+ aesenc $rndkey1,$inout2
1077
+ aesenc $rndkey1,$inout3
1078
+ aesenc $rndkey1,$inout4
1079
+ aesenc $rndkey1,$inout5
1080
+ aesenc $rndkey1,$inout6
1081
+ aesenc $rndkey1,$inout7
1082
+ $movkey 0xb0-0x80($key),$rndkey1
1083
+
1084
+ aesenc $rndkey0,$inout0
1085
+ aesenc $rndkey0,$inout1
1086
+ aesenc $rndkey0,$inout2
1087
+ aesenc $rndkey0,$inout3
1088
+ aesenc $rndkey0,$inout4
1089
+ aesenc $rndkey0,$inout5
1090
+ aesenc $rndkey0,$inout6
1091
+ aesenc $rndkey0,$inout7
1092
+ $movkey 0xc0-0x80($key),$rndkey0
1093
+ je .Lctr32_enc_done
1094
+
1095
+ aesenc $rndkey1,$inout0
1096
+ aesenc $rndkey1,$inout1
1097
+ aesenc $rndkey1,$inout2
1098
+ aesenc $rndkey1,$inout3
1099
+ aesenc $rndkey1,$inout4
1100
+ aesenc $rndkey1,$inout5
1101
+ aesenc $rndkey1,$inout6
1102
+ aesenc $rndkey1,$inout7
1103
+ $movkey 0xd0-0x80($key),$rndkey1
1104
+
1105
+ aesenc $rndkey0,$inout0
1106
+ aesenc $rndkey0,$inout1
1107
+ aesenc $rndkey0,$inout2
1108
+ aesenc $rndkey0,$inout3
1109
+ aesenc $rndkey0,$inout4
1110
+ aesenc $rndkey0,$inout5
1111
+ aesenc $rndkey0,$inout6
1112
+ aesenc $rndkey0,$inout7
1113
+ $movkey 0xe0-0x80($key),$rndkey0
1114
+ jmp .Lctr32_enc_done
1115
+
1116
+ .align 16
1117
+ .Lctr32_enc_done:
1118
+ movdqu 0x10($inp),$in1
1119
+ pxor $rndkey0,$in0 # input^=round[last]
1120
+ movdqu 0x20($inp),$in2
1121
+ pxor $rndkey0,$in1
1122
+ movdqu 0x30($inp),$in3
1123
+ pxor $rndkey0,$in2
1124
+ movdqu 0x40($inp),$in4
1125
+ pxor $rndkey0,$in3
1126
+ movdqu 0x50($inp),$in5
1127
+ pxor $rndkey0,$in4
1128
+ pxor $rndkey0,$in5
1129
+ aesenc $rndkey1,$inout0
1130
+ aesenc $rndkey1,$inout1
1131
+ aesenc $rndkey1,$inout2
1132
+ aesenc $rndkey1,$inout3
1133
+ aesenc $rndkey1,$inout4
1134
+ aesenc $rndkey1,$inout5
1135
+ aesenc $rndkey1,$inout6
1136
+ aesenc $rndkey1,$inout7
1137
+ movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6]
1138
+ lea 0x80($inp),$inp # $inp+=8*16
1139
+
1140
+ aesenclast $in0,$inout0 # $inN is inp[N]^round[last]
1141
+ pxor $rndkey0,$rndkey1 # borrowed $rndkey
1142
+ movdqu 0x70-0x80($inp),$in0
1143
+ aesenclast $in1,$inout1
1144
+ pxor $rndkey0,$in0
1145
+ movdqa 0x00(%rsp),$in1 # load next counter block
1146
+ aesenclast $in2,$inout2
1147
+ aesenclast $in3,$inout3
1148
+ movdqa 0x10(%rsp),$in2
1149
+ movdqa 0x20(%rsp),$in3
1150
+ aesenclast $in4,$inout4
1151
+ aesenclast $in5,$inout5
1152
+ movdqa 0x30(%rsp),$in4
1153
+ movdqa 0x40(%rsp),$in5
1154
+ aesenclast $rndkey1,$inout6
1155
+ movdqa 0x50(%rsp),$rndkey0
1156
+ $movkey 0x10-0x80($key),$rndkey1#real 1st-round key
1157
+ aesenclast $in0,$inout7
1158
+
1159
+ movups $inout0,($out) # store 8 output blocks
1160
+ movdqa $in1,$inout0
1161
+ movups $inout1,0x10($out)
1162
+ movdqa $in2,$inout1
1163
+ movups $inout2,0x20($out)
1164
+ movdqa $in3,$inout2
1165
+ movups $inout3,0x30($out)
1166
+ movdqa $in4,$inout3
1167
+ movups $inout4,0x40($out)
1168
+ movdqa $in5,$inout4
1169
+ movups $inout5,0x50($out)
1170
+ movdqa $rndkey0,$inout5
1171
+ movups $inout6,0x60($out)
1172
+ movups $inout7,0x70($out)
1173
+ lea 0x80($out),$out # $out+=8*16
1174
+
1175
+ sub \$8,$len
1176
+ jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow
1177
+
1178
+ add \$8,$len # restore real remainig $len
1179
+ jz .Lctr32_done # done if ($len==0)
1180
+ lea -0x80($key),$key
1181
+
1182
+ .Lctr32_tail:
1183
+ # note that at this point $inout0..5 are populated with
1184
+ # counter values xor-ed with 0-round key
+ lea 16($key),$key
+ cmp \$4,$len
+ jb .Lctr32_loop3
+ je .Lctr32_loop4
+
+ # if ($len>4) compute 7 E(counter)
+ shl \$4,$rounds
+ movdqa 0x60(%rsp),$inout6
+ pxor $inout7,$inout7
+
+ $movkey 16($key),$rndkey0
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ lea 32-16($key,$rounds),$key # prepare for .Lenc_loop8_enter
+ neg %rax
+ aesenc $rndkey1,$inout2
+ add \$16,%rax # prepare for .Lenc_loop8_enter
+ movups ($inp),$in0
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ movups 0x10($inp),$in1 # pre-load input
+ movups 0x20($inp),$in2
+ aesenc $rndkey1,$inout5
+ aesenc $rndkey1,$inout6
+
+ call .Lenc_loop8_enter
+
+ movdqu 0x30($inp),$in3
+ pxor $in0,$inout0
+ movdqu 0x40($inp),$in0
+ pxor $in1,$inout1
+ movdqu $inout0,($out) # store output
+ pxor $in2,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $in3,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $in0,$inout4
+ movdqu $inout3,0x30($out)
+ movdqu $inout4,0x40($out)
+ cmp \$6,$len
+ jb .Lctr32_done # $len was 5, stop store
+
+ movups 0x50($inp),$in1
+ xorps $in1,$inout5
+ movups $inout5,0x50($out)
+ je .Lctr32_done # $len was 6, stop store
+
+ movups 0x60($inp),$in2
+ xorps $in2,$inout6
+ movups $inout6,0x60($out)
+ jmp .Lctr32_done # $len was 7, stop store
+
+ .align 32
+ .Lctr32_loop4:
+ aesenc $rndkey1,$inout0
+ lea 16($key),$key
+ dec $rounds
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ $movkey ($key),$rndkey1
+ jnz .Lctr32_loop4
+ aesenclast $rndkey1,$inout0
+ aesenclast $rndkey1,$inout1
+ movups ($inp),$in0 # load input
+ movups 0x10($inp),$in1
+ aesenclast $rndkey1,$inout2
+ aesenclast $rndkey1,$inout3
+ movups 0x20($inp),$in2
+ movups 0x30($inp),$in3
+
+ xorps $in0,$inout0
+ movups $inout0,($out) # store output
+ xorps $in1,$inout1
+ movups $inout1,0x10($out)
+ pxor $in2,$inout2
+ movdqu $inout2,0x20($out)
+ pxor $in3,$inout3
+ movdqu $inout3,0x30($out)
+ jmp .Lctr32_done # $len was 4, stop store
+
+ .align 32
+ .Lctr32_loop3:
+ aesenc $rndkey1,$inout0
+ lea 16($key),$key
+ dec $rounds
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ $movkey ($key),$rndkey1
+ jnz .Lctr32_loop3
+ aesenclast $rndkey1,$inout0
+ aesenclast $rndkey1,$inout1
+ aesenclast $rndkey1,$inout2
+
+ movups ($inp),$in0 # load input
+ xorps $in0,$inout0
+ movups $inout0,($out) # store output
+ cmp \$2,$len
+ jb .Lctr32_done # $len was 1, stop store
+
+ movups 0x10($inp),$in1
+ xorps $in1,$inout1
+ movups $inout1,0x10($out)
+ je .Lctr32_done # $len was 2, stop store
+
+ movups 0x20($inp),$in2
+ xorps $in2,$inout2
+ movups $inout2,0x20($out) # $len was 3, stop store
+
+ .Lctr32_done:
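+ # scrub the xmm register bank and the counter blocks spilled to the
+ # stack so no key or counter material survives the return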
+ xorps %xmm0,%xmm0 # clear register bank
+ xor $key0,$key0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ ___
+ $code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0x00(%rsp) # clear stack
+ pxor %xmm8,%xmm8
+ movaps %xmm0,0x10(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,0x20(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,0x30(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,0x40(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,0x50(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,0x60(%rsp)
+ pxor %xmm14,%xmm14
+ movaps %xmm0,0x70(%rsp)
+ pxor %xmm15,%xmm15
+ ___
+ $code.=<<___ if ($win64);
+ movaps -0xa0(%rbp),%xmm6
+ movaps %xmm0,-0xa0(%rbp) # clear stack
+ movaps -0x90(%rbp),%xmm7
+ movaps %xmm0,-0x90(%rbp)
+ movaps -0x80(%rbp),%xmm8
+ movaps %xmm0,-0x80(%rbp)
+ movaps -0x70(%rbp),%xmm9
+ movaps %xmm0,-0x70(%rbp)
+ movaps -0x60(%rbp),%xmm10
+ movaps %xmm0,-0x60(%rbp)
+ movaps -0x50(%rbp),%xmm11
+ movaps %xmm0,-0x50(%rbp)
+ movaps -0x40(%rbp),%xmm12
+ movaps %xmm0,-0x40(%rbp)
+ movaps -0x30(%rbp),%xmm13
+ movaps %xmm0,-0x30(%rbp)
+ movaps -0x20(%rbp),%xmm14
+ movaps %xmm0,-0x20(%rbp)
+ movaps -0x10(%rbp),%xmm15
+ movaps %xmm0,-0x10(%rbp)
+ movaps %xmm0,0x00(%rsp)
+ movaps %xmm0,0x10(%rsp)
+ movaps %xmm0,0x20(%rsp)
+ movaps %xmm0,0x30(%rsp)
+ movaps %xmm0,0x40(%rsp)
+ movaps %xmm0,0x50(%rsp)
+ movaps %xmm0,0x60(%rsp)
+ movaps %xmm0,0x70(%rsp)
+ ___
+ $code.=<<___;
+ lea (%rbp),%rsp
+ pop %rbp
+ .Lctr32_epilogue:
+ ret
+ .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
+ ___
+ } }}
+
+ # int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
+ # int bits, AES_KEY *key)
+ #
+ # input: $inp user-supplied key
+ # $bits $inp length in bits
+ # $key pointer to key schedule
+ # output: %eax 0 denoting success, -1 or -2 denoting failure (see C)
+ # *$key key schedule
+ #
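+ # the decryption schedule is derived from the encryption schedule via
+ # the AES "equivalent inverse cipher": the round keys are reversed in
+ # place and aesimc (InvMixColumns) is applied to every round key
+ # except the first and the last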
+ { my ($inp,$bits,$key) = @_4args;
+ $bits =~ s/%r/%e/;
+
+ $code.=<<___;
+ .globl ${PREFIX}_set_decrypt_key
+ .type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
+ .align 16
+ ${PREFIX}_set_decrypt_key:
+ .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
+ call __aesni_set_encrypt_key
+ shl \$4,$bits # rounds-1 after __aesni_set_encrypt_key
+ test %eax,%eax
+ jnz .Ldec_key_ret
+ lea 16($key,$bits),$inp # points at the end of key schedule
+
+ $movkey ($key),%xmm0 # just swap
+ $movkey ($inp),%xmm1
+ $movkey %xmm0,($inp)
+ $movkey %xmm1,($key)
+ lea 16($key),$key
+ lea -16($inp),$inp
+
+ .Ldec_key_inverse:
+ $movkey ($key),%xmm0 # swap and inverse
+ $movkey ($inp),%xmm1
+ aesimc %xmm0,%xmm0
+ aesimc %xmm1,%xmm1
+ lea 16($key),$key
+ lea -16($inp),$inp
+ $movkey %xmm0,16($inp)
+ $movkey %xmm1,-16($key)
+ cmp $key,$inp
+ ja .Ldec_key_inverse
+
+ $movkey ($key),%xmm0 # inverse middle
+ aesimc %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ $movkey %xmm0,($inp)
+ pxor %xmm0,%xmm0
+ .Ldec_key_ret:
+ add \$8,%rsp
+ ret
+ .LSEH_end_set_decrypt_key:
+ .size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
+ ___
+
+ # This is based on submission by
+ #
+ # Huang Ying <ying.huang@intel.com>
+ # Vinodh Gopal <vinodh.gopal@intel.com>
+ # Kahraman Akdemir
+ #
+ # Aggressively optimized with respect to aeskeygenassist's critical
+ # path, and contained in %xmm0-5 to meet the Win64 ABI requirement.
+ #
+ # int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
+ # int bits, AES_KEY * const key);
+ #
+ # input: $inp user-supplied key
+ # $bits $inp length in bits
+ # $key pointer to key schedule
+ # output: %eax 0 denoting success, -1 or -2 denoting failure (see C)
+ # $bits rounds-1 (used in aesni_set_decrypt_key)
+ # *$key key schedule
+ # $key pointer to key schedule (used in
+ # aesni_set_decrypt_key)
+ #
+ # Subroutine is frame-less, which means that only volatile registers
+ # are used. Note that it's declared "abi-omnipotent", which means that
+ # the number of volatile registers is smaller on Windows.
+ #
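+ # (-1 is returned for NULL input or key pointers, -2 for unsupported
+ # key sizes; see the key-length dispatch below)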
+ $code.=<<___;
+ .globl ${PREFIX}_set_encrypt_key
+ .type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
+ .align 16
+ ${PREFIX}_set_encrypt_key:
+ __aesni_set_encrypt_key:
+ .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
+ mov \$-1,%rax
+ test $inp,$inp
+ jz .Lenc_key_ret
+ test $key,$key
+ jz .Lenc_key_ret
+
+ mov \$`1<<28|1<<11`,%r10d # AVX and XOP bits
+ movups ($inp),%xmm0 # pull first 128 bits of *userKey
+ xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
+ and OPENSSL_ia32cap_P+4(%rip),%r10d
+ lea 16($key),%rax # %rax is used as modifiable copy of $key
+ cmp \$256,$bits
+ je .L14rounds
+ cmp \$192,$bits
+ je .L12rounds
+ cmp \$128,$bits
+ jne .Lbad_keybits
+
+ .L10rounds:
+ mov \$9,$bits # 10 rounds for 128-bit key
+ cmp \$`1<<28`,%r10d # AVX, but no XOP
+ je .L10rounds_alt
+
+ $movkey %xmm0,($key) # round 0
+ aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
+ call .Lkey_expansion_128_cold
+ aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
+ call .Lkey_expansion_128
+ $movkey %xmm0,(%rax)
+ mov $bits,80(%rax) # 240(%rdx)
+ xor %eax,%eax
+ jmp .Lenc_key_ret
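+ # the aeskeygenassist immediates above are the AES round constants:
+ # rcon doubles in GF(2^8) each round, giving 0x01,0x02,...,0x80,
+ # then 0x1b and 0x36 once doubling wraps past 0x80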
+
+ .align 16
+ .L10rounds_alt:
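+ # alternative schedule for CPUs advertising AVX but not XOP: pshufb
+ # against .Lkey_rotate broadcasts RotWord of the last dword, and with
+ # all four dwords equal ShiftRows is a no-op, so aesenclast amounts
+ # to SubWord plus the rcon xor held in %xmm4; the pslldq/pxor ladder
+ # below folds each dword with all lower ones, the prefix xor that the
+ # key-expansion recurrence calls for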
+ movdqa .Lkey_rotate(%rip),%xmm5
+ mov \$8,%r10d
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,($key)
+ jmp .Loop_key128
+
+ .align 16
+ .Loop_key128:
+ pshufb %xmm5,%xmm0
+ aesenclast %xmm4,%xmm0
+ pslld \$1,%xmm4
+ lea 16(%rax),%rax
+
+ movdqa %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%rax)
+ movdqa %xmm0,%xmm2
+
+ dec %r10d
+ jnz .Loop_key128
+
+ movdqa .Lkey_rcon1b(%rip),%xmm4
+
+ pshufb %xmm5,%xmm0
+ aesenclast %xmm4,%xmm0
+ pslld \$1,%xmm4
+
+ movdqa %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ movdqa %xmm0,%xmm2
+ pshufb %xmm5,%xmm0
+ aesenclast %xmm4,%xmm0
+
+ movdqa %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%rax)
+
+ mov $bits,96(%rax) # 240($key)
+ xor %eax,%eax
+ jmp .Lenc_key_ret
+
+ .align 16
+ .L12rounds:
+ movq 16($inp),%xmm2 # remaining 1/3 of *userKey
+ mov \$11,$bits # 12 rounds for 192
+ cmp \$`1<<28`,%r10d # AVX, but no XOP
+ je .L12rounds_alt
+
+ $movkey %xmm0,($key) # round 0
+ aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
+ call .Lkey_expansion_192a_cold
+ aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
+ call .Lkey_expansion_192b
+ aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
+ call .Lkey_expansion_192a
+ aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
+ call .Lkey_expansion_192b
+ aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
+ call .Lkey_expansion_192a
+ aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
+ call .Lkey_expansion_192b
+ aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
+ call .Lkey_expansion_192a
+ aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
+ call .Lkey_expansion_192b
+ $movkey %xmm0,(%rax)
+ mov $bits,48(%rax) # 240(%rdx)
+ xor %rax,%rax
+ jmp .Lenc_key_ret
+
+ .align 16
+ .L12rounds_alt:
+ movdqa .Lkey_rotate192(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ mov \$8,%r10d
+ movdqu %xmm0,($key)
+ jmp .Loop_key192
+
+ .align 16
+ .Loop_key192:
+ movq %xmm2,0(%rax)
+ movdqa %xmm2,%xmm1
+ pshufb %xmm5,%xmm2
+ aesenclast %xmm4,%xmm2
+ pslld \$1,%xmm4
+ lea 24(%rax),%rax
+
+ movdqa %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm3,%xmm0
+
+ pshufd \$0xff,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq \$4,%xmm1
+ pxor %xmm1,%xmm3
+
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%rax)
+
+ dec %r10d
+ jnz .Loop_key192
+
+ mov $bits,32(%rax) # 240($key)
+ xor %eax,%eax
+ jmp .Lenc_key_ret
+
+ .align 16
+ .L14rounds:
+ movups 16($inp),%xmm2 # remaining half of *userKey
+ mov \$13,$bits # 14 rounds for 256
+ lea 16(%rax),%rax
+ cmp \$`1<<28`,%r10d # AVX, but no XOP
+ je .L14rounds_alt
+
+ $movkey %xmm0,($key) # round 0
+ $movkey %xmm2,16($key) # round 1
+ aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
+ call .Lkey_expansion_256a_cold
+ aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
+ call .Lkey_expansion_256a
+ $movkey %xmm0,(%rax)
+ mov $bits,16(%rax) # 240(%rdx)
+ xor %rax,%rax
+ jmp .Lenc_key_ret
+
+ .align 16
+ .L14rounds_alt:
+ movdqa .Lkey_rotate(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ mov \$7,%r10d
+ movdqu %xmm0,0($key)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,16($key)
+ jmp .Loop_key256
+
+ .align 16
+ .Loop_key256:
+ pshufb %xmm5,%xmm2
+ aesenclast %xmm4,%xmm2
+
+ movdqa %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld \$1,%xmm4
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ dec %r10d
+ jz .Ldone_key256
+
+ pshufd \$0xff,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+ aesenclast %xmm3,%xmm2
+
+ movdqa %xmm1,%xmm3
+ pslldq \$4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq \$4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq \$4,%xmm1
+ pxor %xmm3,%xmm1
+
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%rax)
+ lea 32(%rax),%rax
+ movdqa %xmm2,%xmm1
+
+ jmp .Loop_key256
+
+ .Ldone_key256:
+ mov $bits,16(%rax) # 240($key)
+ xor %eax,%eax
+ jmp .Lenc_key_ret
+
+ .align 16
+ .Lbad_keybits:
+ mov \$-2,%rax
+ .Lenc_key_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ add \$8,%rsp
+ ret
+ .LSEH_end_set_encrypt_key:
+
+ .align 16
+ .Lkey_expansion_128:
+ $movkey %xmm0,(%rax)
+ lea 16(%rax),%rax
+ .Lkey_expansion_128_cold:
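+ # with %xmm4 zeroed on entry, the two shufps/xorps pairs below leave
+ # the prefix xor of the previous round key in %xmm0 (each dword xored
+ # with all lower dwords), and the shufps on %xmm1 broadcasts the
+ # SubWord/RotWord/rcon result that aeskeygenassist left in dword 3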
+ shufps \$0b00010000,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps \$0b11111111,%xmm1,%xmm1 # critical path
+ xorps %xmm1,%xmm0
+ ret
+
+ .align 16
+ .Lkey_expansion_192a:
+ $movkey %xmm0,(%rax)
+ lea 16(%rax),%rax
+ .Lkey_expansion_192a_cold:
+ movaps %xmm2,%xmm5
+ .Lkey_expansion_192b_warm:
+ shufps \$0b00010000,%xmm0,%xmm4
+ movdqa %xmm2,%xmm3
+ xorps %xmm4,%xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+ pslldq \$4,%xmm3
+ xorps %xmm4,%xmm0
+ pshufd \$0b01010101,%xmm1,%xmm1 # critical path
+ pxor %xmm3,%xmm2
+ pxor %xmm1,%xmm0
+ pshufd \$0b11111111,%xmm0,%xmm3
+ pxor %xmm3,%xmm2
+ ret
+
+ .align 16
+ .Lkey_expansion_192b:
+ movaps %xmm0,%xmm3
+ shufps \$0b01000100,%xmm0,%xmm5
+ $movkey %xmm5,(%rax)
+ shufps \$0b01001110,%xmm2,%xmm3
+ $movkey %xmm3,16(%rax)
+ lea 32(%rax),%rax
+ jmp .Lkey_expansion_192b_warm
+
+ .align 16
+ .Lkey_expansion_256a:
+ $movkey %xmm2,(%rax)
+ lea 16(%rax),%rax
+ .Lkey_expansion_256a_cold:
+ shufps \$0b00010000,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps \$0b11111111,%xmm1,%xmm1 # critical path
+ xorps %xmm1,%xmm0
+ ret
+
+ .align 16
+ .Lkey_expansion_256b:
+ $movkey %xmm0,(%rax)
+ lea 16(%rax),%rax
+
+ shufps \$0b00010000,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps \$0b10001100,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps \$0b10101010,%xmm1,%xmm1 # critical path
+ xorps %xmm1,%xmm2
+ ret
+ .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
+ .size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
+ ___
+ }
+
+ $code.=<<___;
+ .align 64
+ .Lbswap_mask:
+ .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+ .Lincrement32:
+ .long 6,6,6,0
+ .Lincrement64:
+ .long 1,0,0,0
+ .Lincrement1:
+ .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+ .Lkey_rotate:
+ .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+ .Lkey_rotate192:
+ .long 0x04070605,0x04070605,0x04070605,0x04070605
+ .Lkey_rcon1:
+ .long 1,1,1,1
+ .Lkey_rcon1b:
+ .long 0x1b,0x1b,0x1b,0x1b
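+ # .Lkey_rotate and .Lkey_rotate192 are pshufb masks implementing the
+ # RotWord byte rotations used by the _alt key schedules; .Lkey_rcon1
+ # is the initial round constant (doubled with pslld each round) and
+ # .Lkey_rcon1b supplies 0x1b once plain doubling would wrap GF(2^8)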
+
+ .asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
+ .align 64
+ ___
+
+ # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+ # CONTEXT *context,DISPATCHER_CONTEXT *disp)
+ if ($win64) {
+ $rec="%rcx";
+ $frame="%rdx";
+ $context="%r8";
+ $disp="%r9";
+
+ $code.=<<___;
+ .extern __imp_RtlVirtualUnwind
+ ___
+ $code.=<<___ if ($PREFIX eq "aesni");
+ .type ccm64_se_handler,\@abi-omnipotent
+ .align 16
+ ccm64_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ lea 0(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0x58(%rax),%rax # adjust stack pointer
+
+ jmp .Lcommon_seh_tail
+ .size ccm64_se_handler,.-ccm64_se_handler
+
+ .type ctr_se_handler,\@abi-omnipotent
+ .align 16
+ ctr_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ mov 160($context),%rax # pull context->Rbp
+ lea -0xa0(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov 160($context),%rax # pull context->Rbp
+ mov (%rax),%rbp # restore saved %rbp
+ lea 8(%rax),%rax # adjust stack pointer
+ mov %rbp,160($context) # restore context->Rbp
+
+ .Lcommon_seh_tail:
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+ .size ctr_se_handler,.-ctr_se_handler
+
+ .section .pdata
+ .align 4
+ ___
+ $code.=<<___ if ($PREFIX eq "aesni");
+ .rva .LSEH_begin_aesni_ccm64_encrypt_blocks
+ .rva .LSEH_end_aesni_ccm64_encrypt_blocks
+ .rva .LSEH_info_ccm64_enc
+
+ .rva .LSEH_begin_aesni_ccm64_decrypt_blocks
+ .rva .LSEH_end_aesni_ccm64_decrypt_blocks
+ .rva .LSEH_info_ccm64_dec
+
+ .rva .LSEH_begin_aesni_ctr32_encrypt_blocks
+ .rva .LSEH_end_aesni_ctr32_encrypt_blocks
+ .rva .LSEH_info_ctr32
+ ___
+ $code.=<<___;
+ .rva ${PREFIX}_set_decrypt_key
+ .rva .LSEH_end_set_decrypt_key
+ .rva .LSEH_info_key
+
+ .rva ${PREFIX}_set_encrypt_key
+ .rva .LSEH_end_set_encrypt_key
+ .rva .LSEH_info_key
+ .section .xdata
+ .align 8
+ ___
+ $code.=<<___ if ($PREFIX eq "aesni");
+ .LSEH_info_ccm64_enc:
+ .byte 9,0,0,0
+ .rva ccm64_se_handler
+ .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[]
+ .LSEH_info_ccm64_dec:
+ .byte 9,0,0,0
+ .rva ccm64_se_handler
+ .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
+ .LSEH_info_ctr32:
+ .byte 9,0,0,0
+ .rva ctr_se_handler
+ .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[]
+ ___
+ $code.=<<___;
+ .LSEH_info_key:
+ .byte 0x01,0x04,0x01,0x00
+ .byte 0x04,0x02,0x00,0x00 # sub rsp,8
+ ___
+ }
+
+ sub rex {
+ local *opcode=shift;
+ my ($dst,$src)=@_;
+ my $rex=0;
+
+ $rex|=0x04 if($dst>=8);
+ $rex|=0x01 if($src>=8);
+ push @opcode,$rex|0x40 if($rex);
+ }
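+ # (emits a REX prefix only when an xmm index is >= 8: 0x04 sets REX.R,
+ # extending the ModR/M reg field used for the destination, and 0x01
+ # sets REX.B, extending the rm field used for the source)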
+
+ sub aesni {
+ my $line=shift;
+ my @opcode=(0x66);
+
+ if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ rex(\@opcode,$4,$3);
+ push @opcode,0x0f,0x3a,0xdf;
+ push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M
+ my $c=$2;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ return ".byte\t".join(',',@opcode);
+ }
+ elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my %opcodelet = (
+ "aesimc" => 0xdb,
+ "aesenc" => 0xdc, "aesenclast" => 0xdd,
+ "aesdec" => 0xde, "aesdeclast" => 0xdf
+ );
+ return undef if (!defined($opcodelet{$1}));
+ rex(\@opcode,$3,$2);
+ push @opcode,0x0f,0x38,$opcodelet{$1};
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ return ".byte\t".join(',',@opcode);
+ }
+ elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
+ my %opcodelet = (
+ "aesenc" => 0xdc, "aesenclast" => 0xdd,
+ "aesdec" => 0xde, "aesdeclast" => 0xdf
+ );
+ return undef if (!defined($opcodelet{$1}));
+ my $off = $2;
+ push @opcode,0x44 if ($3>=8);
+ push @opcode,0x0f,0x38,$opcodelet{$1};
+ push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M
+ push @opcode,($off=~/^0/?oct($off):$off)&0xff;
+ return ".byte\t".join(',',@opcode);
+ }
+ return $line;
+ }
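+ # translates AES-NI mnemonics into raw .byte sequences so the output
+ # still assembles on toolchains that predate AES-NI support; e.g.
+ # "aesenc %xmm1,%xmm0" comes out as ".byte 0x66,0x0f,0x38,0xdc,0xc1"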
+
+ sub movbe {
+ ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift;
+ }
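+ # likewise hard-codes "movbe %eax,disp8(%rsp)": 0x0f,0x38,0xf1 is the
+ # movbe store opcode and 0x44,0x24 is the ModR/M+SIB pair for a
+ # disp8-off-%rsp operand, with the displacement appended from shift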
+
+ $code =~ s/\`([^\`]*)\`/eval($1)/gem;
+ $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
+ #$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact
+ $code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;
+
+ print $code;
+
+ close STDOUT;