ring-native 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (261)
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/Gemfile +3 -0
  4. data/README.md +22 -0
  5. data/Rakefile +1 -0
  6. data/ext/ring/extconf.rb +29 -0
  7. data/lib/ring/native.rb +8 -0
  8. data/lib/ring/native/version.rb +5 -0
  9. data/ring-native.gemspec +25 -0
  10. data/vendor/ring/BUILDING.md +40 -0
  11. data/vendor/ring/Cargo.toml +43 -0
  12. data/vendor/ring/LICENSE +185 -0
  13. data/vendor/ring/Makefile +35 -0
  14. data/vendor/ring/PORTING.md +163 -0
  15. data/vendor/ring/README.md +113 -0
  16. data/vendor/ring/STYLE.md +197 -0
  17. data/vendor/ring/appveyor.yml +27 -0
  18. data/vendor/ring/build.rs +108 -0
  19. data/vendor/ring/crypto/aes/aes.c +1142 -0
  20. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +25 -0
  21. data/vendor/ring/crypto/aes/aes_test.cc +93 -0
  22. data/vendor/ring/crypto/aes/asm/aes-586.pl +2368 -0
  23. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +1249 -0
  24. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +2246 -0
  25. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +1318 -0
  26. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +2084 -0
  27. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +675 -0
  28. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +1364 -0
  29. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +1565 -0
  30. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +841 -0
  31. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +1116 -0
  32. data/vendor/ring/crypto/aes/internal.h +87 -0
  33. data/vendor/ring/crypto/aes/mode_wrappers.c +61 -0
  34. data/vendor/ring/crypto/bn/add.c +394 -0
  35. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +694 -0
  36. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +1503 -0
  37. data/vendor/ring/crypto/bn/asm/bn-586.pl +774 -0
  38. data/vendor/ring/crypto/bn/asm/co-586.pl +287 -0
  39. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +1882 -0
  40. data/vendor/ring/crypto/bn/asm/x86-mont.pl +592 -0
  41. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +599 -0
  42. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +1393 -0
  43. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +3507 -0
  44. data/vendor/ring/crypto/bn/bn.c +352 -0
  45. data/vendor/ring/crypto/bn/bn_asn1.c +74 -0
  46. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +25 -0
  47. data/vendor/ring/crypto/bn/bn_test.cc +1696 -0
  48. data/vendor/ring/crypto/bn/cmp.c +200 -0
  49. data/vendor/ring/crypto/bn/convert.c +433 -0
  50. data/vendor/ring/crypto/bn/ctx.c +311 -0
  51. data/vendor/ring/crypto/bn/div.c +594 -0
  52. data/vendor/ring/crypto/bn/exponentiation.c +1335 -0
  53. data/vendor/ring/crypto/bn/gcd.c +711 -0
  54. data/vendor/ring/crypto/bn/generic.c +1019 -0
  55. data/vendor/ring/crypto/bn/internal.h +316 -0
  56. data/vendor/ring/crypto/bn/montgomery.c +516 -0
  57. data/vendor/ring/crypto/bn/mul.c +888 -0
  58. data/vendor/ring/crypto/bn/prime.c +829 -0
  59. data/vendor/ring/crypto/bn/random.c +334 -0
  60. data/vendor/ring/crypto/bn/rsaz_exp.c +262 -0
  61. data/vendor/ring/crypto/bn/rsaz_exp.h +53 -0
  62. data/vendor/ring/crypto/bn/shift.c +276 -0
  63. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +25 -0
  64. data/vendor/ring/crypto/bytestring/bytestring_test.cc +421 -0
  65. data/vendor/ring/crypto/bytestring/cbb.c +399 -0
  66. data/vendor/ring/crypto/bytestring/cbs.c +227 -0
  67. data/vendor/ring/crypto/bytestring/internal.h +46 -0
  68. data/vendor/ring/crypto/chacha/chacha_generic.c +140 -0
  69. data/vendor/ring/crypto/chacha/chacha_vec.c +323 -0
  70. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +1447 -0
  71. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +153 -0
  72. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +25 -0
  73. data/vendor/ring/crypto/cipher/e_aes.c +390 -0
  74. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +208 -0
  75. data/vendor/ring/crypto/cipher/internal.h +173 -0
  76. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +543 -0
  77. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +9 -0
  78. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +475 -0
  79. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +23 -0
  80. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +422 -0
  81. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +484 -0
  82. data/vendor/ring/crypto/cipher/test/cipher_test.txt +100 -0
  83. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +25 -0
  84. data/vendor/ring/crypto/constant_time_test.c +304 -0
  85. data/vendor/ring/crypto/cpu-arm-asm.S +32 -0
  86. data/vendor/ring/crypto/cpu-arm.c +199 -0
  87. data/vendor/ring/crypto/cpu-intel.c +261 -0
  88. data/vendor/ring/crypto/crypto.c +151 -0
  89. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +2118 -0
  90. data/vendor/ring/crypto/curve25519/curve25519.c +4888 -0
  91. data/vendor/ring/crypto/curve25519/x25519_test.cc +128 -0
  92. data/vendor/ring/crypto/digest/md32_common.h +181 -0
  93. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +2725 -0
  94. data/vendor/ring/crypto/ec/ec.c +193 -0
  95. data/vendor/ring/crypto/ec/ec_curves.c +61 -0
  96. data/vendor/ring/crypto/ec/ec_key.c +228 -0
  97. data/vendor/ring/crypto/ec/ec_montgomery.c +114 -0
  98. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +25 -0
  99. data/vendor/ring/crypto/ec/internal.h +243 -0
  100. data/vendor/ring/crypto/ec/oct.c +253 -0
  101. data/vendor/ring/crypto/ec/p256-64.c +1794 -0
  102. data/vendor/ring/crypto/ec/p256-x86_64-table.h +9548 -0
  103. data/vendor/ring/crypto/ec/p256-x86_64.c +509 -0
  104. data/vendor/ring/crypto/ec/simple.c +1007 -0
  105. data/vendor/ring/crypto/ec/util-64.c +183 -0
  106. data/vendor/ring/crypto/ec/wnaf.c +508 -0
  107. data/vendor/ring/crypto/ecdh/ecdh.c +155 -0
  108. data/vendor/ring/crypto/ecdsa/ecdsa.c +304 -0
  109. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +193 -0
  110. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +25 -0
  111. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +327 -0
  112. data/vendor/ring/crypto/header_removed.h +17 -0
  113. data/vendor/ring/crypto/internal.h +495 -0
  114. data/vendor/ring/crypto/libring.Windows.vcxproj +101 -0
  115. data/vendor/ring/crypto/mem.c +98 -0
  116. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +1045 -0
  117. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +517 -0
  118. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +1393 -0
  119. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +1741 -0
  120. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +422 -0
  121. data/vendor/ring/crypto/modes/ctr.c +226 -0
  122. data/vendor/ring/crypto/modes/gcm.c +1206 -0
  123. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +25 -0
  124. data/vendor/ring/crypto/modes/gcm_test.c +348 -0
  125. data/vendor/ring/crypto/modes/internal.h +299 -0
  126. data/vendor/ring/crypto/perlasm/arm-xlate.pl +170 -0
  127. data/vendor/ring/crypto/perlasm/readme +100 -0
  128. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +1164 -0
  129. data/vendor/ring/crypto/perlasm/x86asm.pl +292 -0
  130. data/vendor/ring/crypto/perlasm/x86gas.pl +263 -0
  131. data/vendor/ring/crypto/perlasm/x86masm.pl +200 -0
  132. data/vendor/ring/crypto/perlasm/x86nasm.pl +187 -0
  133. data/vendor/ring/crypto/poly1305/poly1305.c +331 -0
  134. data/vendor/ring/crypto/poly1305/poly1305_arm.c +301 -0
  135. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +2015 -0
  136. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +25 -0
  137. data/vendor/ring/crypto/poly1305/poly1305_test.cc +80 -0
  138. data/vendor/ring/crypto/poly1305/poly1305_test.txt +52 -0
  139. data/vendor/ring/crypto/poly1305/poly1305_vec.c +892 -0
  140. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +75 -0
  141. data/vendor/ring/crypto/rand/internal.h +32 -0
  142. data/vendor/ring/crypto/rand/rand.c +189 -0
  143. data/vendor/ring/crypto/rand/urandom.c +219 -0
  144. data/vendor/ring/crypto/rand/windows.c +56 -0
  145. data/vendor/ring/crypto/refcount_c11.c +66 -0
  146. data/vendor/ring/crypto/refcount_lock.c +53 -0
  147. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +25 -0
  148. data/vendor/ring/crypto/refcount_test.c +58 -0
  149. data/vendor/ring/crypto/rsa/blinding.c +462 -0
  150. data/vendor/ring/crypto/rsa/internal.h +108 -0
  151. data/vendor/ring/crypto/rsa/padding.c +300 -0
  152. data/vendor/ring/crypto/rsa/rsa.c +450 -0
  153. data/vendor/ring/crypto/rsa/rsa_asn1.c +261 -0
  154. data/vendor/ring/crypto/rsa/rsa_impl.c +944 -0
  155. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +25 -0
  156. data/vendor/ring/crypto/rsa/rsa_test.cc +437 -0
  157. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +436 -0
  158. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +2390 -0
  159. data/vendor/ring/crypto/sha/asm/sha256-586.pl +1275 -0
  160. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +735 -0
  161. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +14 -0
  162. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +14 -0
  163. data/vendor/ring/crypto/sha/asm/sha512-586.pl +911 -0
  164. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +666 -0
  165. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +14 -0
  166. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +14 -0
  167. data/vendor/ring/crypto/sha/sha1.c +271 -0
  168. data/vendor/ring/crypto/sha/sha256.c +204 -0
  169. data/vendor/ring/crypto/sha/sha512.c +355 -0
  170. data/vendor/ring/crypto/test/file_test.cc +326 -0
  171. data/vendor/ring/crypto/test/file_test.h +181 -0
  172. data/vendor/ring/crypto/test/malloc.cc +150 -0
  173. data/vendor/ring/crypto/test/scoped_types.h +95 -0
  174. data/vendor/ring/crypto/test/test.Windows.vcxproj +35 -0
  175. data/vendor/ring/crypto/test/test_util.cc +46 -0
  176. data/vendor/ring/crypto/test/test_util.h +41 -0
  177. data/vendor/ring/crypto/thread_none.c +55 -0
  178. data/vendor/ring/crypto/thread_pthread.c +165 -0
  179. data/vendor/ring/crypto/thread_test.Windows.vcxproj +25 -0
  180. data/vendor/ring/crypto/thread_test.c +200 -0
  181. data/vendor/ring/crypto/thread_win.c +282 -0
  182. data/vendor/ring/examples/checkdigest.rs +103 -0
  183. data/vendor/ring/include/openssl/aes.h +121 -0
  184. data/vendor/ring/include/openssl/arm_arch.h +129 -0
  185. data/vendor/ring/include/openssl/base.h +156 -0
  186. data/vendor/ring/include/openssl/bn.h +794 -0
  187. data/vendor/ring/include/openssl/buffer.h +18 -0
  188. data/vendor/ring/include/openssl/bytestring.h +235 -0
  189. data/vendor/ring/include/openssl/chacha.h +37 -0
  190. data/vendor/ring/include/openssl/cmac.h +76 -0
  191. data/vendor/ring/include/openssl/cpu.h +184 -0
  192. data/vendor/ring/include/openssl/crypto.h +43 -0
  193. data/vendor/ring/include/openssl/curve25519.h +88 -0
  194. data/vendor/ring/include/openssl/ec.h +225 -0
  195. data/vendor/ring/include/openssl/ec_key.h +129 -0
  196. data/vendor/ring/include/openssl/ecdh.h +110 -0
  197. data/vendor/ring/include/openssl/ecdsa.h +156 -0
  198. data/vendor/ring/include/openssl/err.h +201 -0
  199. data/vendor/ring/include/openssl/mem.h +101 -0
  200. data/vendor/ring/include/openssl/obj_mac.h +71 -0
  201. data/vendor/ring/include/openssl/opensslfeatures.h +68 -0
  202. data/vendor/ring/include/openssl/opensslv.h +18 -0
  203. data/vendor/ring/include/openssl/ossl_typ.h +18 -0
  204. data/vendor/ring/include/openssl/poly1305.h +51 -0
  205. data/vendor/ring/include/openssl/rand.h +70 -0
  206. data/vendor/ring/include/openssl/rsa.h +399 -0
  207. data/vendor/ring/include/openssl/thread.h +133 -0
  208. data/vendor/ring/include/openssl/type_check.h +71 -0
  209. data/vendor/ring/mk/Common.props +63 -0
  210. data/vendor/ring/mk/Windows.props +42 -0
  211. data/vendor/ring/mk/WindowsTest.props +18 -0
  212. data/vendor/ring/mk/appveyor.bat +62 -0
  213. data/vendor/ring/mk/bottom_of_makefile.mk +54 -0
  214. data/vendor/ring/mk/ring.mk +266 -0
  215. data/vendor/ring/mk/top_of_makefile.mk +214 -0
  216. data/vendor/ring/mk/travis.sh +40 -0
  217. data/vendor/ring/mk/update-travis-yml.py +229 -0
  218. data/vendor/ring/ring.sln +153 -0
  219. data/vendor/ring/src/aead.rs +682 -0
  220. data/vendor/ring/src/agreement.rs +248 -0
  221. data/vendor/ring/src/c.rs +129 -0
  222. data/vendor/ring/src/constant_time.rs +37 -0
  223. data/vendor/ring/src/der.rs +96 -0
  224. data/vendor/ring/src/digest.rs +690 -0
  225. data/vendor/ring/src/digest_tests.txt +57 -0
  226. data/vendor/ring/src/ecc.rs +28 -0
  227. data/vendor/ring/src/ecc_build.rs +279 -0
  228. data/vendor/ring/src/ecc_curves.rs +117 -0
  229. data/vendor/ring/src/ed25519_tests.txt +2579 -0
  230. data/vendor/ring/src/exe_tests.rs +46 -0
  231. data/vendor/ring/src/ffi.rs +29 -0
  232. data/vendor/ring/src/file_test.rs +187 -0
  233. data/vendor/ring/src/hkdf.rs +153 -0
  234. data/vendor/ring/src/hkdf_tests.txt +59 -0
  235. data/vendor/ring/src/hmac.rs +414 -0
  236. data/vendor/ring/src/hmac_tests.txt +97 -0
  237. data/vendor/ring/src/input.rs +312 -0
  238. data/vendor/ring/src/lib.rs +41 -0
  239. data/vendor/ring/src/pbkdf2.rs +265 -0
  240. data/vendor/ring/src/pbkdf2_tests.txt +113 -0
  241. data/vendor/ring/src/polyfill.rs +57 -0
  242. data/vendor/ring/src/rand.rs +28 -0
  243. data/vendor/ring/src/signature.rs +314 -0
  244. data/vendor/ring/third-party/NIST/README.md +9 -0
  245. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +263 -0
  246. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +309 -0
  247. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +267 -0
  248. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +263 -0
  249. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +309 -0
  250. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +267 -0
  251. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +263 -0
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +309 -0
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +267 -0
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +519 -0
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +309 -0
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +523 -0
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +519 -0
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +309 -0
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +523 -0
  260. data/vendor/ring/third-party/NIST/sha256sums.txt +1 -0
  261. metadata +333 -0
data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl (entry 26 above, +2084)
@@ -0,0 +1,2084 @@
+ #!/usr/bin/env perl
+ #
+ # ====================================================================
+ # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+ # project. The module is, however, dual licensed under OpenSSL and
+ # CRYPTOGAMS licenses depending on where you obtain it. For further
+ # details see http://www.openssl.org/~appro/cryptogams/.
+ # ====================================================================
+ #
+ # This module implements support for Intel AES-NI extension. In
+ # OpenSSL context it's used with Intel engine, but can also be used as
+ # drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
+ # details].
+ #
+ # Performance.
+ #
+ # Given aes(enc|dec) instructions' latency, asymptotic performance for
+ # non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
+ # processed with 128-bit key. And given their throughput, asymptotic
+ # performance for parallelizable modes is 1.25 cycles per byte. Being
+ # an asymptotic limit it's not something you commonly achieve in
+ # reality, but how close does one get? Below are results collected for
+ # different modes and block sizes. Pairs of numbers are for en-/
+ # decryption.
+ #
+ # 16-byte 64-byte 256-byte 1-KB 8-KB
+ # ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
+ # CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
+ # CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
+ # CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
+ # OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
+ # CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
+ #
+ # ECB, CTR, CBC and CCM results are free from EVP overhead. This means
+ # that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
+ # [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
+ # The results were collected with specially crafted speed.c benchmark
+ # in order to compare them with results reported in "Intel Advanced
+ # Encryption Standard (AES) New Instruction Set" White Paper Revision
+ # 3.0 dated May 2010. All above results are consistently better. This
+ # module also provides better performance for block sizes smaller than
+ # 128 bytes in points *not* represented in the above table.
+ #
+ # Looking at the results for 8-KB buffer.
+ #
+ # CFB and OFB results are far from the limit, because implementation
+ # uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
+ # single-block aesni_encrypt, which is not the most optimal way to go.
+ # CBC encrypt result is unexpectedly high and there is no documented
+ # explanation for it. Seemingly there is a small penalty for feeding
+ # the result back to AES unit the way it's done in CBC mode. There is
+ # nothing one can do and the result appears optimal. CCM result is
+ # identical to CBC, because CBC-MAC is essentially CBC encrypt without
+ # saving output. CCM CTR "stays invisible," because it's neatly
+ # interleaved with CBC-MAC. This provides ~30% improvement over
+ # "straightforward" CCM implementation with CTR and CBC-MAC performed
+ # disjointly. Parallelizable modes practically achieve the theoretical
+ # limit.
+ #
+ # Looking at how results vary with buffer size.
+ #
+ # Curves are practically saturated at 1-KB buffer size. In most cases
+ # "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
+ # CTR curve doesn't follow this pattern and is the "slowest"-changing
+ # one, with "256-byte" result being 87% of "8-KB." This is because
+ # overhead in CTR mode is most computationally intensive. Small-block
+ # CCM decrypt is slower than encrypt, because first CTR and last
+ # CBC-MAC iterations can't be interleaved.
+ #
+ # Results for 192- and 256-bit keys.
+ #
+ # EVP-free results were observed to scale perfectly with number of
+ # rounds for larger block sizes, i.e. 192-bit result being 10/12 times
+ # lower and 256-bit one 10/14. Well, in CBC encrypt case differences
+ # are a tad smaller, because the above mentioned penalty biases all
+ # results by same constant value. In similar way function call
+ # overhead affects small-block performance, as well as OFB and CFB
+ # results. Differences are not large, most common coefficients are
+ # 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
+ # observes even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
+
+ # January 2011
+ #
+ # While Westmere processor features 6 cycles latency for aes[enc|dec]
+ # instructions, which can be scheduled every second cycle, Sandy
+ # Bridge spends 8 cycles per instruction, but it can schedule them
+ # every cycle. This means that code targeting Westmere would perform
+ # suboptimally on Sandy Bridge. Therefore this update.
+ #
+ # In addition, non-parallelizable CBC encrypt (as well as CCM) is
+ # optimized. Relative improvement might appear modest, 8% on Westmere,
+ # but in absolute terms it's 3.77 cycles per byte encrypted with
+ # 128-bit key on Westmere, and 5.07 on Sandy Bridge. These numbers
+ # should be compared to asymptotic limits of 3.75 for Westmere and
+ # 5.00 for Sandy Bridge. Actually, the fact that they get this close
+ # to asymptotic limits is quite amazing. Indeed, the limit is
+ # calculated as latency times number of rounds, 10 for 128-bit key,
+ # and divided by 16, the number of bytes in block, or in other words
+ # it accounts *solely* for aesenc instructions. But there are extra
+ # instructions, and numbers so close to the asymptotic limits mean
+ # that it's as if it takes as little as *one* additional cycle to
+ # execute all of them. How is it possible? It is possible thanks to
+ # out-of-order execution logic, which manages to overlap post-
+ # processing of previous block, things like saving the output, with
+ # actual encryption of current block, as well as pre-processing of
+ # current block, things like fetching input and xor-ing it with
+ # 0-round element of the key schedule, with actual encryption of
+ # previous block. Keep this in mind...
+ #
+ # For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
+ # performance is achieved by interleaving instructions working on
+ # independent blocks. In which case asymptotic limit for such modes
+ # can be obtained by dividing above mentioned numbers by AES
+ # instructions' interleave factor. Westmere can execute at most 3
+ # instructions at a time, meaning that optimal interleave factor is 3,
+ # and that's where the "magic" number of 1.25 comes from. "Optimal
+ # interleave factor" means that increase of interleave factor does
+ # not improve performance. The formula has proven to reflect reality
+ # pretty well on Westmere... Sandy Bridge on the other hand can
+ # execute up to 8 AES instructions at a time, so how does varying
+ # interleave factor affect the performance? Here is table for ECB
+ # (numbers are cycles per byte processed with 128-bit key):
+ #
+ # instruction interleave factor 3x 6x 8x
+ # theoretical asymptotic limit 1.67 0.83 0.625
+ # measured performance for 8KB block 1.05 0.86 0.84
+ #
+ # "as if" interleave factor 4.7x 5.8x 6.0x
+ #
+ # Further data for other parallelizable modes:
+ #
+ # CBC decrypt 1.16 0.93 0.74
+ # CTR 1.14 0.91 0.74
+ #
+ # Well, given 3x column it's probably inappropriate to call the limit
+ # asymptotic, if it can be surpassed, isn't it? What happens there?
+ # Rewind to CBC paragraph for the answer. Yes, out-of-order execution
+ # magic is responsible for this. Processor overlaps not only the
+ # additional instructions with AES ones, but even AES instructions
+ # processing adjacent triplets of independent blocks. In the 6x case
+ # additional instructions still claim disproportionately small amount
+ # of additional cycles, but in 8x case number of instructions must be
+ # a tad too high for out-of-order logic to cope with, and AES unit
+ # remains underutilized... As you can see 8x interleave is hardly
+ # justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
+ # utilizes 6x interleave because of limited register bank capacity.
+ #
+ # Higher interleave factors do have negative impact on Westmere
+ # performance. While for ECB mode it's negligible ~1.5%, other
+ # parallelizables perform ~5% worse, which is outweighed by ~25%
+ # improvement on Sandy Bridge. To balance regression on Westmere,
+ # CTR mode was implemented with 6x aesenc interleave factor.
+
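The asymptotic arithmetic quoted in these comments is easy to sanity-check: the limit is latency times rounds divided by the 16-byte block, further divided by the interleave factor for parallelizable modes. A minimal Perl sketch, using only the latency, round and interleave figures named above (no fresh measurements):

    sub aes_asymptotic_limit {
        my ($latency, $rounds, $interleave) = @_;
        $interleave ||= 1;                               # non-parallelizable modes
        return $latency * $rounds / 16 / $interleave;    # cycles per byte
    }
    printf("Westmere CBC encrypt: %.2f\n", aes_asymptotic_limit(6, 10));     # 3.75
    printf("Sandy Bridge CBC:     %.2f\n", aes_asymptotic_limit(8, 10));     # 5.00
    printf("Westmere 3x parallel: %.2f\n", aes_asymptotic_limit(6, 10, 3));  # 1.25
    printf("Sandy Bridge 8x ECB:  %.3f\n", aes_asymptotic_limit(8, 10, 8));  # 0.625

The last three values reproduce the "magic" 1.25 and the 3x/8x theoretical-limit row of the ECB table above.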
+ # April 2011
+ #
+ # Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
+ # one byte out of 8KB with 128-bit key, Sandy Bridge 0.90. Just like
+ # in CTR mode AES instruction interleave factor was chosen to be 6x.
+
+ ######################################################################
+ # Current large-block performance in cycles per byte processed with
+ # 128-bit key (less is better).
+ #
+ # CBC en-/decrypt CTR XTS ECB
+ # Westmere 3.77/1.25 1.25 1.25 1.26
+ # Sandy Bridge 5.07/0.74 0.75 0.90 0.85
+ # Haswell 4.44/0.63 0.63 0.73 0.63
+ # Silvermont 5.75/3.54 3.56 4.12 3.87(*)
+ # Bulldozer 5.77/0.70 0.72 0.90 0.70
+ #
+ # (*) Atom Silvermont ECB result is suboptimal because of penalties
+ # incurred by operations on %xmm8-15. As ECB is not considered
+ # critical, nothing was done to mitigate the problem.
+
+ $PREFIX="aesni"; # if $PREFIX is set to "AES", the script
+ # generates drop-in replacement for
+ # crypto/aes/asm/aes-x86_64.pl:-)
+
+ $flavour = shift;
+ $output = shift;
+ if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+ $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+ ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+ ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+ die "can't locate x86_64-xlate.pl";
+
+ open OUT,"| \"$^X\" $xlate $flavour $output";
+ *STDOUT=*OUT;
+
+ $movkey = $PREFIX eq "aesni" ? "movups" : "movups";
+ @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
+ ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+
+ $code=".text\n";
+ $code.=".extern OPENSSL_ia32cap_P\n";
+
+ $rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
+ # this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
+ $inp="%rdi";
+ $out="%rsi";
+ $len="%rdx";
+ $key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
+ $ivp="%r8"; # cbc, ctr, ...
+
+ $rnds_="%r10d"; # backup copy for $rounds
+ $key_="%r11"; # backup copy for $key
+
+ # %xmm register layout
+ $rndkey0="%xmm0"; $rndkey1="%xmm1";
+ $inout0="%xmm2"; $inout1="%xmm3";
+ $inout2="%xmm4"; $inout3="%xmm5";
+ $inout4="%xmm6"; $inout5="%xmm7";
+ $inout6="%xmm8"; $inout7="%xmm9";
+
+ $in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
+ $in0="%xmm8"; $iv="%xmm9";
+
+ # Inline version of internal aesni_[en|de]crypt1.
+ #
+ # Why folded loop? Because aes[enc|dec] is slow enough to accommodate
+ # cycles which take care of loop variables...
+ { my $sn;
+ sub aesni_generate1 {
+ my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
+ ++$sn;
+ $code.=<<___;
+ $movkey ($key),$rndkey0
+ $movkey 16($key),$rndkey1
+ ___
+ $code.=<<___ if (defined($ivec));
+ xorps $rndkey0,$ivec
+ lea 32($key),$key
+ xorps $ivec,$inout
+ ___
+ $code.=<<___ if (!defined($ivec));
+ lea 32($key),$key
+ xorps $rndkey0,$inout
+ ___
+ $code.=<<___;
+ .Loop_${p}1_$sn:
+ aes${p} $rndkey1,$inout
+ dec $rounds
+ $movkey ($key),$rndkey1
+ lea 16($key),$key
+ jnz .Loop_${p}1_$sn # loop body is 16 bytes
+ aes${p}last $rndkey1,$inout
+ ___
+ }}
+ # void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
+ #
+ { my ($inp,$out,$key) = @_4args;
+
+ $code.=<<___;
+ .globl ${PREFIX}_encrypt
+ .type ${PREFIX}_encrypt,\@abi-omnipotent
+ .align 16
+ ${PREFIX}_encrypt:
+ movups ($inp),$inout0 # load input
+ mov 240($key),$rounds # key->rounds
+ ___
+ &aesni_generate1("enc",$key,$rounds);
+ $code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ movups $inout0,($out) # output
+ pxor $inout0,$inout0
+ ret
+ .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
+
+ .globl ${PREFIX}_decrypt
+ .type ${PREFIX}_decrypt,\@abi-omnipotent
+ .align 16
+ ${PREFIX}_decrypt:
+ movups ($inp),$inout0 # load input
+ mov 240($key),$rounds # key->rounds
+ ___
+ &aesni_generate1("dec",$key,$rounds);
+ $code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ movups $inout0,($out) # output
+ pxor $inout0,$inout0
+ ret
+ .size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
+ ___
+ }
+
+ # _aesni_[en|de]cryptN are private interfaces, N denotes interleave
+ # factor. Why were 3x subroutines originally used in loops? Even though
+ # aes[enc|dec] latency was originally 6, it could be scheduled only
+ # every *2nd* cycle. Thus 3x interleave was the one providing optimal
+ # utilization, i.e. when the subroutine's throughput is virtually the
+ # same as that of a non-interleaved subroutine [for up to 3 input
+ # blocks]. This is why it originally made no sense to implement 2x
+ # subroutine. But times change and it became appropriate to spend
+ # extra 192 bytes on 2x subroutine on Atom Silvermont's account. For
+ # processors that can schedule aes[enc|dec] every cycle, the optimal
+ # interleave factor equals the corresponding instruction latency. 8x
+ # is optimal for Sandy Bridge and "super-optimal" for other Intel
+ # CPUs...
+
+ sub aesni_generate2 {
+ my $dir=shift;
+ # As already mentioned it takes in $key and $rounds, which are *not*
+ # preserved. $inout[0-1] is cipher/clear text...
+ $code.=<<___;
+ .type _aesni_${dir}rypt2,\@abi-omnipotent
+ .align 16
+ _aesni_${dir}rypt2:
+ $movkey ($key),$rndkey0
+ shl \$4,$rounds
+ $movkey 16($key),$rndkey1
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ $movkey 32($key),$rndkey0
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
+ add \$16,%rax
+
+ .L${dir}_loop2:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ $movkey -16($key,%rax),$rndkey0
+ jnz .L${dir}_loop2
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ ret
+ .size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2
+ ___
+ }
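The shl \$4 / lea / neg %rax / add \$16 preamble above walks the key schedule from a pointer 32 bytes past its end, with %rax counting up toward zero. A hedged Perl simulation of the resulting loads; it assumes, consistent with the cmp \$11,$rounds test in the CTR code further down, that 240($key) holds the aesenc count (9/11/13) rather than the nominal round count:

    sub trace_encrypt2_loads {
        my $r   = shift;             # value read from 240($key): 9, 11 or 13
        my $end = 32 + 16 * $r;      # lea 32($key,$rounds),$key  (base taken as 0)
        my $rax = 16 - 16 * $r;      # neg %rax; add \$16,%rax
        my @loads = (0, 16, 32);     # round keys 0..2 are loaded up front
        while (1) {
            push @loads, $end + $rax;        # $movkey ($key,%rax),$rndkey1
            $rax += 32;                      # add \$32,%rax
            push @loads, $end + $rax - 16;   # $movkey -16($key,%rax),$rndkey0
            last if $rax == 0;               # jnz .L..._loop2 falls through
        }
        return @loads;
    }
    # AES-128: prints 0,16,32,...,160 -- every round-key offset exactly once
    print join(",", trace_encrypt2_loads(9)), "\n";

With this twist the loop needs no separate counter register: the same %rax both indexes the schedule and terminates the loop.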
+ sub aesni_generate3 {
+ my $dir=shift;
+ # As already mentioned it takes in $key and $rounds, which are *not*
+ # preserved. $inout[0-2] is cipher/clear text...
+ $code.=<<___;
+ .type _aesni_${dir}rypt3,\@abi-omnipotent
+ .align 16
+ _aesni_${dir}rypt3:
+ $movkey ($key),$rndkey0
+ shl \$4,$rounds
+ $movkey 16($key),$rndkey1
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ xorps $rndkey0,$inout2
+ $movkey 32($key),$rndkey0
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
+ add \$16,%rax
+
+ .L${dir}_loop3:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ aes${dir} $rndkey0,$inout2
+ $movkey -16($key,%rax),$rndkey0
+ jnz .L${dir}_loop3
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ ret
+ .size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
+ ___
+ }
+ # 4x interleave is implemented to improve small block performance,
+ # most notably [and naturally] 4 block by ~30%. One can argue that one
+ # should have implemented 5x as well, but improvement would be <20%,
+ # so it's not worth it...
+ sub aesni_generate4 {
+ my $dir=shift;
+ # As already mentioned it takes in $key and $rounds, which are *not*
+ # preserved. $inout[0-3] is cipher/clear text...
+ $code.=<<___;
+ .type _aesni_${dir}rypt4,\@abi-omnipotent
+ .align 16
+ _aesni_${dir}rypt4:
+ $movkey ($key),$rndkey0
+ shl \$4,$rounds
+ $movkey 16($key),$rndkey1
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ xorps $rndkey0,$inout2
+ xorps $rndkey0,$inout3
+ $movkey 32($key),$rndkey0
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
+ .byte 0x0f,0x1f,0x00
+ add \$16,%rax
+
+ .L${dir}_loop4:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ aes${dir} $rndkey0,$inout2
+ aes${dir} $rndkey0,$inout3
+ $movkey -16($key,%rax),$rndkey0
+ jnz .L${dir}_loop4
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ aes${dir}last $rndkey0,$inout3
+ ret
+ .size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
+ ___
+ }
+ sub aesni_generate6 {
+ my $dir=shift;
+ # As already mentioned it takes in $key and $rounds, which are *not*
+ # preserved. $inout[0-5] is cipher/clear text...
+ $code.=<<___;
+ .type _aesni_${dir}rypt6,\@abi-omnipotent
+ .align 16
+ _aesni_${dir}rypt6:
+ $movkey ($key),$rndkey0
+ shl \$4,$rounds
+ $movkey 16($key),$rndkey1
+ xorps $rndkey0,$inout0
+ pxor $rndkey0,$inout1
+ pxor $rndkey0,$inout2
+ aes${dir} $rndkey1,$inout0
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
+ aes${dir} $rndkey1,$inout1
+ pxor $rndkey0,$inout3
+ pxor $rndkey0,$inout4
+ aes${dir} $rndkey1,$inout2
+ pxor $rndkey0,$inout5
+ $movkey ($key,%rax),$rndkey0
+ add \$16,%rax
+ jmp .L${dir}_loop6_enter
+ .align 16
+ .L${dir}_loop6:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ .L${dir}_loop6_enter:
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ aes${dir} $rndkey0,$inout2
+ aes${dir} $rndkey0,$inout3
+ aes${dir} $rndkey0,$inout4
+ aes${dir} $rndkey0,$inout5
+ $movkey -16($key,%rax),$rndkey0
+ jnz .L${dir}_loop6
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ aes${dir}last $rndkey0,$inout3
+ aes${dir}last $rndkey0,$inout4
+ aes${dir}last $rndkey0,$inout5
+ ret
+ .size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
+ ___
+ }
+ sub aesni_generate8 {
+ my $dir=shift;
+ # As already mentioned it takes in $key and $rounds, which are *not*
+ # preserved. $inout[0-7] is cipher/clear text...
+ $code.=<<___;
+ .type _aesni_${dir}rypt8,\@abi-omnipotent
+ .align 16
+ _aesni_${dir}rypt8:
+ $movkey ($key),$rndkey0
+ shl \$4,$rounds
+ $movkey 16($key),$rndkey1
+ xorps $rndkey0,$inout0
+ xorps $rndkey0,$inout1
+ pxor $rndkey0,$inout2
+ pxor $rndkey0,$inout3
+ pxor $rndkey0,$inout4
+ lea 32($key,$rounds),$key
+ neg %rax # $rounds
+ aes${dir} $rndkey1,$inout0
+ pxor $rndkey0,$inout5
+ pxor $rndkey0,$inout6
+ aes${dir} $rndkey1,$inout1
+ pxor $rndkey0,$inout7
+ $movkey ($key,%rax),$rndkey0
+ add \$16,%rax
+ jmp .L${dir}_loop8_inner
+ .align 16
+ .L${dir}_loop8:
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ .L${dir}_loop8_inner:
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+ aes${dir} $rndkey1,$inout6
+ aes${dir} $rndkey1,$inout7
+ .L${dir}_loop8_enter:
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aes${dir} $rndkey0,$inout0
+ aes${dir} $rndkey0,$inout1
+ aes${dir} $rndkey0,$inout2
+ aes${dir} $rndkey0,$inout3
+ aes${dir} $rndkey0,$inout4
+ aes${dir} $rndkey0,$inout5
+ aes${dir} $rndkey0,$inout6
+ aes${dir} $rndkey0,$inout7
+ $movkey -16($key,%rax),$rndkey0
+ jnz .L${dir}_loop8
+
+ aes${dir} $rndkey1,$inout0
+ aes${dir} $rndkey1,$inout1
+ aes${dir} $rndkey1,$inout2
+ aes${dir} $rndkey1,$inout3
+ aes${dir} $rndkey1,$inout4
+ aes${dir} $rndkey1,$inout5
+ aes${dir} $rndkey1,$inout6
+ aes${dir} $rndkey1,$inout7
+ aes${dir}last $rndkey0,$inout0
+ aes${dir}last $rndkey0,$inout1
+ aes${dir}last $rndkey0,$inout2
+ aes${dir}last $rndkey0,$inout3
+ aes${dir}last $rndkey0,$inout4
+ aes${dir}last $rndkey0,$inout5
+ aes${dir}last $rndkey0,$inout6
+ aes${dir}last $rndkey0,$inout7
+ ret
+ .size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
+ ___
+ }
+ &aesni_generate2("enc") if ($PREFIX eq "aesni");
+ &aesni_generate2("dec");
+ &aesni_generate3("enc") if ($PREFIX eq "aesni");
+ &aesni_generate3("dec");
+ &aesni_generate4("enc") if ($PREFIX eq "aesni");
+ &aesni_generate4("dec");
+ &aesni_generate6("enc") if ($PREFIX eq "aesni");
+ &aesni_generate6("dec");
+ &aesni_generate8("enc") if ($PREFIX eq "aesni");
+ &aesni_generate8("dec");
+
+ if ($PREFIX eq "aesni") {
+ {
+ ######################################################################
+ # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
+ # size_t blocks, const AES_KEY *key,
+ # const char *ivec,char *cmac);
+ #
+ # Handles only complete blocks, operates on 64-bit counter and
+ # does not update *ivec! Nor does it finalize CMAC value
+ # (see engine/eng_aesni.c for details)
+ #
+ {
+ my $cmac="%r9"; # 6th argument
+
+ my $increment="%xmm9";
+ my $iv="%xmm6";
+ my $bswap_mask="%xmm7";
+
+ $code.=<<___;
+ .globl aesni_ccm64_encrypt_blocks
+ .type aesni_ccm64_encrypt_blocks,\@function,6
+ .align 16
+ aesni_ccm64_encrypt_blocks:
+ ___
+ $code.=<<___ if ($win64);
+ lea -0x58(%rsp),%rsp
+ movaps %xmm6,(%rsp) # $iv
+ movaps %xmm7,0x10(%rsp) # $bswap_mask
+ movaps %xmm8,0x20(%rsp) # $in0
+ movaps %xmm9,0x30(%rsp) # $increment
+ .Lccm64_enc_body:
+ ___
+ $code.=<<___;
+ mov 240($key),$rounds # key->rounds
+ movdqu ($ivp),$iv
+ movdqa .Lincrement64(%rip),$increment
+ movdqa .Lbswap_mask(%rip),$bswap_mask
+
+ shl \$4,$rounds
+ mov \$16,$rnds_
+ lea 0($key),$key_
+ movdqu ($cmac),$inout1
+ movdqa $iv,$inout0
+ lea 32($key,$rounds),$key # end of key schedule
+ pshufb $bswap_mask,$iv
+ sub %rax,%r10 # twisted $rounds
+ jmp .Lccm64_enc_outer
+ .align 16
+ .Lccm64_enc_outer:
+ $movkey ($key_),$rndkey0
+ mov %r10,%rax
+ movups ($inp),$in0 # load inp
+
+ xorps $rndkey0,$inout0 # counter
+ $movkey 16($key_),$rndkey1
+ xorps $in0,$rndkey0
+ xorps $rndkey0,$inout1 # cmac^=inp
+ $movkey 32($key_),$rndkey0
+
+ .Lccm64_enc2_loop:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Lccm64_enc2_loop
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ paddq $increment,$iv
+ dec $len # $len-- ($len is in blocks)
+ aesenclast $rndkey0,$inout0
+ aesenclast $rndkey0,$inout1
+
+ lea 16($inp),$inp
+ xorps $inout0,$in0 # inp ^= E(iv)
+ movdqa $iv,$inout0
+ movups $in0,($out) # save output
+ pshufb $bswap_mask,$inout0
+ lea 16($out),$out # $out+=16
+ jnz .Lccm64_enc_outer # loop if ($len!=0)
+
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ pxor $inout0,$inout0
+ movups $inout1,($cmac) # store resulting mac
+ pxor $inout1,$inout1
+ pxor $in0,$in0
+ pxor $iv,$iv
+ ___
+ $code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps %xmm0,(%rsp) # clear stack
+ movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
+ movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
+ lea 0x58(%rsp),%rsp
+ .Lccm64_enc_ret:
+ ___
+ $code.=<<___;
+ ret
+ .size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
+ ___
+ ######################################################################
+ $code.=<<___;
+ .globl aesni_ccm64_decrypt_blocks
+ .type aesni_ccm64_decrypt_blocks,\@function,6
+ .align 16
+ aesni_ccm64_decrypt_blocks:
+ ___
+ $code.=<<___ if ($win64);
+ lea -0x58(%rsp),%rsp
+ movaps %xmm6,(%rsp) # $iv
+ movaps %xmm7,0x10(%rsp) # $bswap_mask
+ movaps %xmm8,0x20(%rsp) # $in8
+ movaps %xmm9,0x30(%rsp) # $increment
+ .Lccm64_dec_body:
+ ___
+ $code.=<<___;
+ mov 240($key),$rounds # key->rounds
+ movups ($ivp),$iv
+ movdqu ($cmac),$inout1
+ movdqa .Lincrement64(%rip),$increment
+ movdqa .Lbswap_mask(%rip),$bswap_mask
+
+ movaps $iv,$inout0
+ mov $rounds,$rnds_
+ mov $key,$key_
+ pshufb $bswap_mask,$iv
+ ___
+ &aesni_generate1("enc",$key,$rounds);
+ $code.=<<___;
+ shl \$4,$rnds_
+ mov \$16,$rounds
+ movups ($inp),$in0 # load inp
+ paddq $increment,$iv
+ lea 16($inp),$inp # $inp+=16
+ sub %r10,%rax # twisted $rounds
+ lea 32($key_,$rnds_),$key # end of key schedule
+ mov %rax,%r10
+ jmp .Lccm64_dec_outer
+ .align 16
+ .Lccm64_dec_outer:
+ xorps $inout0,$in0 # inp ^= E(iv)
+ movdqa $iv,$inout0
+ movups $in0,($out) # save output
+ lea 16($out),$out # $out+=16
+ pshufb $bswap_mask,$inout0
+
+ sub \$1,$len # $len-- ($len is in blocks)
+ jz .Lccm64_dec_break # if ($len==0) break
+
+ $movkey ($key_),$rndkey0
+ mov %r10,%rax
+ $movkey 16($key_),$rndkey1
+ xorps $rndkey0,$in0
+ xorps $rndkey0,$inout0
+ xorps $in0,$inout1 # cmac^=out
+ $movkey 32($key_),$rndkey0
+ jmp .Lccm64_dec2_loop
+ .align 16
+ .Lccm64_dec2_loop:
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ $movkey ($key,%rax),$rndkey1
+ add \$32,%rax
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ $movkey -16($key,%rax),$rndkey0
+ jnz .Lccm64_dec2_loop
+ movups ($inp),$in0 # load input
+ paddq $increment,$iv
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenclast $rndkey0,$inout0
+ aesenclast $rndkey0,$inout1
+ lea 16($inp),$inp # $inp+=16
+ jmp .Lccm64_dec_outer
+
+ .align 16
+ .Lccm64_dec_break:
+ #xorps $in0,$inout1 # cmac^=out
+ mov 240($key_),$rounds
+ ___
+ &aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
+ $code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ pxor $inout0,$inout0
+ movups $inout1,($cmac) # store resulting mac
+ pxor $inout1,$inout1
+ pxor $in0,$in0
+ pxor $iv,$iv
+ ___
+ $code.=<<___ if ($win64);
+ movaps (%rsp),%xmm6
+ movaps %xmm0,(%rsp) # clear stack
+ movaps 0x10(%rsp),%xmm7
+ movaps %xmm0,0x10(%rsp)
+ movaps 0x20(%rsp),%xmm8
+ movaps %xmm0,0x20(%rsp)
+ movaps 0x30(%rsp),%xmm9
+ movaps %xmm0,0x30(%rsp)
+ lea 0x58(%rsp),%rsp
+ .Lccm64_dec_ret:
+ ___
+ $code.=<<___;
+ ret
+ .size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
+ ___
+ }
+ ######################################################################
+ # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
+ # size_t blocks, const AES_KEY *key,
+ # const char *ivec);
+ #
+ # Handles only complete blocks, operates on 32-bit counter and
+ # does not update *ivec! (see crypto/modes/ctr128.c for details)
+ #
+ # Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
+ # http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
+ # Keywords are full unroll and modulo-schedule counter calculations
+ # with zero-round key xor.
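The "zero-round key xor" keeps eight counter blocks on the stack already xor-ed with round key 0, so each fresh counter only needs its big-endian counter dword patched with ctr^K0[12..15]. That this patching commutes with the whole-block key xor is easy to confirm in a few lines of Perl (the IV and key bytes below are hypothetical placeholders):

    use strict;
    my $iv  = pack("H*", "00112233445566778899aabb00000000"); # hypothetical IV||ctr
    my $k0  = pack("H*", "000102030405060708090a0b0c0d0e0f"); # hypothetical round-0 key
    my $ctr = 7;
    my $pre = $iv ^ $k0;                       # block as kept on the stack
    substr($pre, 12, 4) = pack("N", $ctr) ^ substr($k0, 12, 4);  # per-block patch
    my $ref = $iv;                             # reference: patch first, xor after
    substr($ref, 12, 4) = pack("N", $ctr);
    $ref ^= $k0;
    print $pre eq $ref ? "match\n" : "mismatch\n";   # prints "match"

This is why the code below can store next-counter values straight into the stack slots (bswap-ed and xor-ed with the key's low dword) without ever rebuilding a full counter block.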
+ {
+ my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
+ my ($key0,$ctr)=("${key_}d","${ivp}d");
+ my $frame_size = 0x80 + ($win64?160:0);
+
+ $code.=<<___;
+ .globl aesni_ctr32_encrypt_blocks
+ .type aesni_ctr32_encrypt_blocks,\@function,5
+ .align 16
+ aesni_ctr32_encrypt_blocks:
+ cmp \$1,$len
+ jne .Lctr32_bulk
+
+ # handle single block without allocating stack frame,
+ # useful when handling edges
+ movups ($ivp),$inout0
+ movups ($inp),$inout1
+ mov 240($key),%edx # key->rounds
+ ___
+ &aesni_generate1("enc",$key,"%edx");
+ $code.=<<___;
+ pxor $rndkey0,$rndkey0 # clear register bank
+ pxor $rndkey1,$rndkey1
+ xorps $inout1,$inout0
+ pxor $inout1,$inout1
+ movups $inout0,($out)
+ xorps $inout0,$inout0
+ jmp .Lctr32_epilogue
+
+ .align 16
+ .Lctr32_bulk:
+ lea (%rsp),%rax
+ push %rbp
+ sub \$$frame_size,%rsp
+ and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
+ ___
+ $code.=<<___ if ($win64);
+ movaps %xmm6,-0xa8(%rax) # offload everything
+ movaps %xmm7,-0x98(%rax)
+ movaps %xmm8,-0x88(%rax)
+ movaps %xmm9,-0x78(%rax)
+ movaps %xmm10,-0x68(%rax)
+ movaps %xmm11,-0x58(%rax)
+ movaps %xmm12,-0x48(%rax)
+ movaps %xmm13,-0x38(%rax)
+ movaps %xmm14,-0x28(%rax)
+ movaps %xmm15,-0x18(%rax)
+ .Lctr32_body:
+ ___
+ $code.=<<___;
+ lea -8(%rax),%rbp
+
+ # 8 16-byte words on top of stack are counter values
+ # xor-ed with zero-round key
+
+ movdqu ($ivp),$inout0
+ movdqu ($key),$rndkey0
+ mov 12($ivp),$ctr # counter LSB
+ pxor $rndkey0,$inout0
+ mov 12($key),$key0 # 0-round key LSB
+ movdqa $inout0,0x00(%rsp) # populate counter block
+ bswap $ctr
+ movdqa $inout0,$inout1
+ movdqa $inout0,$inout2
+ movdqa $inout0,$inout3
+ movdqa $inout0,0x40(%rsp)
+ movdqa $inout0,0x50(%rsp)
+ movdqa $inout0,0x60(%rsp)
+ mov %rdx,%r10 # about to borrow %rdx
+ movdqa $inout0,0x70(%rsp)
+
+ lea 1($ctr),%rax
+ lea 2($ctr),%rdx
+ bswap %eax
+ bswap %edx
+ xor $key0,%eax
+ xor $key0,%edx
+ pinsrd \$3,%eax,$inout1
+ lea 3($ctr),%rax
+ movdqa $inout1,0x10(%rsp)
+ pinsrd \$3,%edx,$inout2
+ bswap %eax
+ mov %r10,%rdx # restore %rdx
+ lea 4($ctr),%r10
+ movdqa $inout2,0x20(%rsp)
+ xor $key0,%eax
+ bswap %r10d
+ pinsrd \$3,%eax,$inout3
+ xor $key0,%r10d
+ movdqa $inout3,0x30(%rsp)
+ lea 5($ctr),%r9
+ mov %r10d,0x40+12(%rsp)
+ bswap %r9d
+ lea 6($ctr),%r10
+ mov 240($key),$rounds # key->rounds
+ xor $key0,%r9d
+ bswap %r10d
+ mov %r9d,0x50+12(%rsp)
+ xor $key0,%r10d
+ lea 7($ctr),%r9
+ mov %r10d,0x60+12(%rsp)
+ bswap %r9d
+ mov OPENSSL_ia32cap_P+4(%rip),%r10d
+ xor $key0,%r9d
+ and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
+ mov %r9d,0x70+12(%rsp)
+
+ $movkey 0x10($key),$rndkey1
+
+ movdqa 0x40(%rsp),$inout4
+ movdqa 0x50(%rsp),$inout5
+
+ cmp \$8,$len # $len is in blocks
+ jb .Lctr32_tail # short input if ($len<8)
+
+ sub \$6,$len # $len is biased by -6
+ cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE
+ je .Lctr32_6x # [which denotes Atom Silvermont]
+
+ lea 0x80($key),$key # size optimization
+ sub \$2,$len # $len is biased by -8
+ jmp .Lctr32_loop8
+
+ .align 16
+ .Lctr32_6x:
+ shl \$4,$rounds
+ mov \$48,$rnds_
+ bswap $key0
+ lea 32($key,$rounds),$key # end of key schedule
+ sub %rax,%r10 # twisted $rounds
+ jmp .Lctr32_loop6
+
+ .align 16
+ .Lctr32_loop6:
+ add \$6,$ctr # next counter value
+ $movkey -48($key,$rnds_),$rndkey0
+ aesenc $rndkey1,$inout0
+ mov $ctr,%eax
+ xor $key0,%eax
+ aesenc $rndkey1,$inout1
+ movbe %eax,`0x00+12`(%rsp) # store next counter value
+ lea 1($ctr),%eax
+ aesenc $rndkey1,$inout2
+ xor $key0,%eax
+ movbe %eax,`0x10+12`(%rsp)
+ aesenc $rndkey1,$inout3
+ lea 2($ctr),%eax
+ xor $key0,%eax
+ aesenc $rndkey1,$inout4
+ movbe %eax,`0x20+12`(%rsp)
+ lea 3($ctr),%eax
+ aesenc $rndkey1,$inout5
+ $movkey -32($key,$rnds_),$rndkey1
+ xor $key0,%eax
+
+ aesenc $rndkey0,$inout0
+ movbe %eax,`0x30+12`(%rsp)
+ lea 4($ctr),%eax
+ aesenc $rndkey0,$inout1
+ xor $key0,%eax
+ movbe %eax,`0x40+12`(%rsp)
+ aesenc $rndkey0,$inout2
+ lea 5($ctr),%eax
+ xor $key0,%eax
+ aesenc $rndkey0,$inout3
+ movbe %eax,`0x50+12`(%rsp)
+ mov %r10,%rax # mov $rnds_,$rounds
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ $movkey -16($key,$rnds_),$rndkey0
+
+ call .Lenc_loop6
+
+ movdqu ($inp),$inout6 # load 6 input blocks
+ movdqu 0x10($inp),$inout7
+ movdqu 0x20($inp),$in0
+ movdqu 0x30($inp),$in1
+ movdqu 0x40($inp),$in2
+ movdqu 0x50($inp),$in3
+ lea 0x60($inp),$inp # $inp+=6*16
+ $movkey -64($key,$rnds_),$rndkey1
+ pxor $inout0,$inout6 # inp^=E(ctr)
+ movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round]
+ pxor $inout1,$inout7
+ movaps 0x10(%rsp),$inout1
+ pxor $inout2,$in0
+ movaps 0x20(%rsp),$inout2
+ pxor $inout3,$in1
+ movaps 0x30(%rsp),$inout3
+ pxor $inout4,$in2
+ movaps 0x40(%rsp),$inout4
+ pxor $inout5,$in3
+ movaps 0x50(%rsp),$inout5
+ movdqu $inout6,($out) # store 6 output blocks
+ movdqu $inout7,0x10($out)
+ movdqu $in0,0x20($out)
+ movdqu $in1,0x30($out)
+ movdqu $in2,0x40($out)
+ movdqu $in3,0x50($out)
+ lea 0x60($out),$out # $out+=6*16
+
+ sub \$6,$len
+ jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow
+
+ add \$6,$len # restore real remaining $len
+ jz .Lctr32_done # done if ($len==0)
+
+ lea -48($rnds_),$rounds
+ lea -80($key,$rnds_),$key # restore $key
+ neg $rounds
+ shr \$4,$rounds # restore $rounds
+ jmp .Lctr32_tail
+
+ .align 32
+ .Lctr32_loop8:
+ add \$8,$ctr # next counter value
+ movdqa 0x60(%rsp),$inout6
+ aesenc $rndkey1,$inout0
+ mov $ctr,%r9d
+ movdqa 0x70(%rsp),$inout7
+ aesenc $rndkey1,$inout1
+ bswap %r9d
+ $movkey 0x20-0x80($key),$rndkey0
+ aesenc $rndkey1,$inout2
+ xor $key0,%r9d
+ nop
+ aesenc $rndkey1,$inout3
+ mov %r9d,0x00+12(%rsp) # store next counter value
+ lea 1($ctr),%r9
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ $movkey 0x30-0x80($key),$rndkey1
+ ___
+ for($i=2;$i<8;$i++) {
+ my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
+ $code.=<<___;
+ bswap %r9d
+ aesenc $rndkeyx,$inout0
+ aesenc $rndkeyx,$inout1
+ xor $key0,%r9d
+ .byte 0x66,0x90
+ aesenc $rndkeyx,$inout2
+ aesenc $rndkeyx,$inout3
+ mov %r9d,`0x10*($i-1)`+12(%rsp)
+ lea $i($ctr),%r9
+ aesenc $rndkeyx,$inout4
+ aesenc $rndkeyx,$inout5
+ aesenc $rndkeyx,$inout6
+ aesenc $rndkeyx,$inout7
+ $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx
+ ___
+ }
+ $code.=<<___;
+ bswap %r9d
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ xor $key0,%r9d
+ movdqu 0x00($inp),$in0 # start loading input
+ aesenc $rndkey0,$inout3
+ mov %r9d,0x70+12(%rsp)
+ cmp \$11,$rounds
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ aesenc $rndkey0,$inout6
+ aesenc $rndkey0,$inout7
+ $movkey 0xa0-0x80($key),$rndkey0
+
+ jb .Lctr32_enc_done
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ $movkey 0xb0-0x80($key),$rndkey1
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ aesenc $rndkey0,$inout6
+ aesenc $rndkey0,$inout7
+ $movkey 0xc0-0x80($key),$rndkey0
+ je .Lctr32_enc_done
+
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ $movkey 0xd0-0x80($key),$rndkey1
+
+ aesenc $rndkey0,$inout0
+ aesenc $rndkey0,$inout1
+ aesenc $rndkey0,$inout2
+ aesenc $rndkey0,$inout3
+ aesenc $rndkey0,$inout4
+ aesenc $rndkey0,$inout5
+ aesenc $rndkey0,$inout6
+ aesenc $rndkey0,$inout7
+ $movkey 0xe0-0x80($key),$rndkey0
+ jmp .Lctr32_enc_done
+
+ .align 16
+ .Lctr32_enc_done:
+ movdqu 0x10($inp),$in1
+ pxor $rndkey0,$in0 # input^=round[last]
+ movdqu 0x20($inp),$in2
+ pxor $rndkey0,$in1
+ movdqu 0x30($inp),$in3
+ pxor $rndkey0,$in2
+ movdqu 0x40($inp),$in4
+ pxor $rndkey0,$in3
+ movdqu 0x50($inp),$in5
+ pxor $rndkey0,$in4
+ pxor $rndkey0,$in5
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ aesenc $rndkey1,$inout5
+ aesenc $rndkey1,$inout6
+ aesenc $rndkey1,$inout7
+ movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6]
+ lea 0x80($inp),$inp # $inp+=8*16
+
+ aesenclast $in0,$inout0 # $inN is inp[N]^round[last]
+ pxor $rndkey0,$rndkey1 # borrowed $rndkey
+ movdqu 0x70-0x80($inp),$in0
+ aesenclast $in1,$inout1
+ pxor $rndkey0,$in0
+ movdqa 0x00(%rsp),$in1 # load next counter block
+ aesenclast $in2,$inout2
+ aesenclast $in3,$inout3
+ movdqa 0x10(%rsp),$in2
+ movdqa 0x20(%rsp),$in3
+ aesenclast $in4,$inout4
+ aesenclast $in5,$inout5
+ movdqa 0x30(%rsp),$in4
+ movdqa 0x40(%rsp),$in5
+ aesenclast $rndkey1,$inout6
+ movdqa 0x50(%rsp),$rndkey0
+ $movkey 0x10-0x80($key),$rndkey1 # real 1st-round key
+ aesenclast $in0,$inout7
+
+ movups $inout0,($out) # store 8 output blocks
+ movdqa $in1,$inout0
+ movups $inout1,0x10($out)
+ movdqa $in2,$inout1
+ movups $inout2,0x20($out)
+ movdqa $in3,$inout2
+ movups $inout3,0x30($out)
+ movdqa $in4,$inout3
+ movups $inout4,0x40($out)
+ movdqa $in5,$inout4
+ movups $inout5,0x50($out)
+ movdqa $rndkey0,$inout5
+ movups $inout6,0x60($out)
+ movups $inout7,0x70($out)
+ lea 0x80($out),$out # $out+=8*16
+
+ sub \$8,$len
+ jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow
+
+ add \$8,$len # restore real remaining $len
+ jz .Lctr32_done # done if ($len==0)
+ lea -0x80($key),$key
+
+ .Lctr32_tail:
+ # note that at this point $inout0..5 are populated with
+ # counter values xor-ed with 0-round key
+ lea 16($key),$key
+ cmp \$4,$len
+ jb .Lctr32_loop3
+ je .Lctr32_loop4
+
+ # if ($len>4) compute 7 E(counter)
+ shl \$4,$rounds
+ movdqa 0x60(%rsp),$inout6
+ pxor $inout7,$inout7
+
+ $movkey 16($key),$rndkey0
+ aesenc $rndkey1,$inout0
+ aesenc $rndkey1,$inout1
+ lea 32-16($key,$rounds),$key # prepare for .Lenc_loop8_enter
+ neg %rax
+ aesenc $rndkey1,$inout2
+ add \$16,%rax # prepare for .Lenc_loop8_enter
+ movups ($inp),$in0
+ aesenc $rndkey1,$inout3
+ aesenc $rndkey1,$inout4
+ movups 0x10($inp),$in1 # pre-load input
+ movups 0x20($inp),$in2
+ aesenc $rndkey1,$inout5
+ aesenc $rndkey1,$inout6
+
+ call .Lenc_loop8_enter
+
+ movdqu 0x30($inp),$in3
+ pxor $in0,$inout0
+ movdqu 0x40($inp),$in0
+ pxor $in1,$inout1
+ movdqu $inout0,($out) # store output
+ pxor $in2,$inout2
+ movdqu $inout1,0x10($out)
+ pxor $in3,$inout3
+ movdqu $inout2,0x20($out)
+ pxor $in0,$inout4
+ movdqu $inout3,0x30($out)
+ movdqu $inout4,0x40($out)
+ cmp \$6,$len
+ jb .Lctr32_done # $len was 5, stop store
+
+ movups 0x50($inp),$in1
+ xorps $in1,$inout5
+ movups $inout5,0x50($out)
+ je .Lctr32_done # $len was 6, stop store
+
+ movups 0x60($inp),$in2
+ xorps $in2,$inout6
+ movups $inout6,0x60($out)
+ jmp .Lctr32_done # $len was 7, stop store
+
+ .align 32
+ .Lctr32_loop4:
+ aesenc $rndkey1,$inout0
+ lea 16($key),$key
+ dec $rounds
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ aesenc $rndkey1,$inout3
+ $movkey ($key),$rndkey1
+ jnz .Lctr32_loop4
+ aesenclast $rndkey1,$inout0
+ aesenclast $rndkey1,$inout1
+ movups ($inp),$in0 # load input
+ movups 0x10($inp),$in1
+ aesenclast $rndkey1,$inout2
+ aesenclast $rndkey1,$inout3
+ movups 0x20($inp),$in2
+ movups 0x30($inp),$in3
+
+ xorps $in0,$inout0
+ movups $inout0,($out) # store output
+ xorps $in1,$inout1
+ movups $inout1,0x10($out)
+ pxor $in2,$inout2
+ movdqu $inout2,0x20($out)
+ pxor $in3,$inout3
+ movdqu $inout3,0x30($out)
+ jmp .Lctr32_done # $len was 4, stop store
+
+ .align 32
+ .Lctr32_loop3:
+ aesenc $rndkey1,$inout0
+ lea 16($key),$key
+ dec $rounds
+ aesenc $rndkey1,$inout1
+ aesenc $rndkey1,$inout2
+ $movkey ($key),$rndkey1
+ jnz .Lctr32_loop3
+ aesenclast $rndkey1,$inout0
+ aesenclast $rndkey1,$inout1
+ aesenclast $rndkey1,$inout2
+
+ movups ($inp),$in0 # load input
+ xorps $in0,$inout0
+ movups $inout0,($out) # store output
+ cmp \$2,$len
+ jb .Lctr32_done # $len was 1, stop store
+
+ movups 0x10($inp),$in1
+ xorps $in1,$inout1
+ movups $inout1,0x10($out)
+ je .Lctr32_done # $len was 2, stop store
+
+ movups 0x20($inp),$in2
+ xorps $in2,$inout2
+ movups $inout2,0x20($out) # $len was 3, stop store
+
+ .Lctr32_done:
+ xorps %xmm0,%xmm0 # clear register bank
+ xor $key0,$key0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ ___
+ $code.=<<___ if (!$win64);
+ pxor %xmm6,%xmm6
+ pxor %xmm7,%xmm7
+ movaps %xmm0,0x00(%rsp) # clear stack
+ pxor %xmm8,%xmm8
+ movaps %xmm0,0x10(%rsp)
+ pxor %xmm9,%xmm9
+ movaps %xmm0,0x20(%rsp)
+ pxor %xmm10,%xmm10
+ movaps %xmm0,0x30(%rsp)
+ pxor %xmm11,%xmm11
+ movaps %xmm0,0x40(%rsp)
+ pxor %xmm12,%xmm12
+ movaps %xmm0,0x50(%rsp)
+ pxor %xmm13,%xmm13
+ movaps %xmm0,0x60(%rsp)
+ pxor %xmm14,%xmm14
+ movaps %xmm0,0x70(%rsp)
+ pxor %xmm15,%xmm15
+ ___
+ $code.=<<___ if ($win64);
+ movaps -0xa0(%rbp),%xmm6
+ movaps %xmm0,-0xa0(%rbp) # clear stack
+ movaps -0x90(%rbp),%xmm7
+ movaps %xmm0,-0x90(%rbp)
+ movaps -0x80(%rbp),%xmm8
+ movaps %xmm0,-0x80(%rbp)
+ movaps -0x70(%rbp),%xmm9
+ movaps %xmm0,-0x70(%rbp)
+ movaps -0x60(%rbp),%xmm10
+ movaps %xmm0,-0x60(%rbp)
+ movaps -0x50(%rbp),%xmm11
+ movaps %xmm0,-0x50(%rbp)
+ movaps -0x40(%rbp),%xmm12
+ movaps %xmm0,-0x40(%rbp)
+ movaps -0x30(%rbp),%xmm13
+ movaps %xmm0,-0x30(%rbp)
+ movaps -0x20(%rbp),%xmm14
1341
+ movaps %xmm0,-0x20(%rbp)
1342
+ movaps -0x10(%rbp),%xmm15
1343
+ movaps %xmm0,-0x10(%rbp)
1344
+ movaps %xmm0,0x00(%rsp)
1345
+ movaps %xmm0,0x10(%rsp)
1346
+ movaps %xmm0,0x20(%rsp)
1347
+ movaps %xmm0,0x30(%rsp)
1348
+ movaps %xmm0,0x40(%rsp)
1349
+ movaps %xmm0,0x50(%rsp)
1350
+ movaps %xmm0,0x60(%rsp)
1351
+ movaps %xmm0,0x70(%rsp)
1352
+ ___
1353
+ $code.=<<___;
1354
+ lea (%rbp),%rsp
1355
+ pop %rbp
1356
+ .Lctr32_epilogue:
1357
+ ret
1358
+ .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
1359
+ ___
1360
+ } }}
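The tail paths above reuse the trick from the eight-way loop: counter blocks enter the rounds already xor-ed with round key 0, and the final AddRoundKey is folded into `aesenclast` together with the plaintext xor. A one-block sketch of that data flow with AES-NI intrinsics (illustrative only; `rk` and the function name are assumptions, not this module's API):

```c
#include <wmmintrin.h> /* AES-NI intrinsics; compile with -maes */

/* One CTR block, mirroring the assembly's data flow: the counter is
 * pre-xored with round key 0, and the last-round xor is merged with
 * the plaintext xor in a single aesenclast. */
static __m128i ctr32_one_block(__m128i ctr, __m128i plaintext,
                               const __m128i *rk, int rounds) {
  __m128i state = _mm_xor_si128(ctr, rk[0]);      /* counter ^ round[0] */
  for (int i = 1; i < rounds; i++)
    state = _mm_aesenc_si128(state, rk[i]);
  /* input ^= round[last]; aesenclast then yields E(ctr) ^ plaintext */
  return _mm_aesenclast_si128(state, _mm_xor_si128(plaintext, rk[rounds]));
}
```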
1361
+
+ # int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
+ # int bits, AES_KEY *key)
+ #
+ # input: $inp user-supplied key
+ # $bits $inp length in bits
+ # $key pointer to key schedule
+ # output: %eax 0 denoting success, -1 or -2 denoting failure (see C)
+ # *$key key schedule
+ #
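A hedged usage sketch of the entry point documented above, assuming `$PREFIX` expands to `aesni` and the `AES_KEY` type from the library's headers; the wrapper name is illustrative:

```c
#include <openssl/aes.h>

/* Prototype as documented in the comment above. */
int aesni_set_decrypt_key(const unsigned char *inp, int bits, AES_KEY *key);

/* Expand a 128-bit user key into a decryption schedule.
 * Returns 0 on success, -1/-2 on the failures noted above. */
int make_dec_key(const unsigned char user_key[16], AES_KEY *key) {
  return aesni_set_decrypt_key(user_key, 128, key);
}
```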
1371
+ { my ($inp,$bits,$key) = @_4args;
+ $bits =~ s/%r/%e/;
+
+ $code.=<<___;
+ .globl ${PREFIX}_set_decrypt_key
+ .type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
+ .align 16
+ ${PREFIX}_set_decrypt_key:
+ .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
+ call __aesni_set_encrypt_key
+ shl \$4,$bits # rounds-1 after __aesni_set_encrypt_key
+ test %eax,%eax
+ jnz .Ldec_key_ret
+ lea 16($key,$bits),$inp # points at the end of key schedule
+
+ $movkey ($key),%xmm0 # just swap
+ $movkey ($inp),%xmm1
+ $movkey %xmm0,($inp)
+ $movkey %xmm1,($key)
+ lea 16($key),$key
+ lea -16($inp),$inp
+
+ .Ldec_key_inverse:
+ $movkey ($key),%xmm0 # swap and inverse
+ $movkey ($inp),%xmm1
+ aesimc %xmm0,%xmm0
+ aesimc %xmm1,%xmm1
+ lea 16($key),$key
+ lea -16($inp),$inp
+ $movkey %xmm0,16($inp)
+ $movkey %xmm1,-16($key)
+ cmp $key,$inp
+ ja .Ldec_key_inverse
+
+ $movkey ($key),%xmm0 # inverse middle
+ aesimc %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ $movkey %xmm0,($inp)
+ pxor %xmm0,%xmm0
+ .Ldec_key_ret:
+ add \$8,%rsp
+ ret
+ .LSEH_end_set_decrypt_key:
+ .size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
+ ___
1416
+
+ # This is based on a submission by
+ #
+ # Huang Ying <ying.huang@intel.com>
+ # Vinodh Gopal <vinodh.gopal@intel.com>
+ # Kahraman Akdemir
+ #
+ # Aggressively optimized with respect to aeskeygenassist's critical path,
+ # and contained in %xmm0-5 to meet the Win64 ABI requirement.
+ #
+ # int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
+ # int bits, AES_KEY * const key);
+ #
+ # input: $inp user-supplied key
+ # $bits $inp length in bits
+ # $key pointer to key schedule
+ # output: %eax 0 denoting success, -1 or -2 denoting failure (see C)
+ # $bits rounds-1 (used in aesni_set_decrypt_key)
+ # *$key key schedule
+ # $key pointer to key schedule (used in
+ # aesni_set_decrypt_key)
+ #
+ # The subroutine is frame-less, meaning that only volatile registers
+ # are used. Note that it is declared "abi-omnipotent", so the set of
+ # volatile registers is smaller on Windows.
+ #
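The `# 240(%rdx)` / `# 240($key)` annotations below refer to the `rounds` field of `AES_KEY`, which lands at byte offset 240 when `rd_key` holds 4*(14+1) 32-bit words. A sketch of that assumed layout (the conventional declaration, shown here only to explain the offsets):

```c
#include <stdint.h>

/* Layout assumed by the hard-coded offsets in the assembly below:
 * 60 round-key words = 240 bytes, so `rounds` sits at offset 240. */
struct aes_key_st {
  uint32_t rd_key[4 * (14 + 1)]; /* byte offsets 0..239 */
  unsigned rounds;               /* byte offset 240 */
};
```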
1442
+ $code.=<<___;
+ .globl ${PREFIX}_set_encrypt_key
+ .type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
+ .align 16
+ ${PREFIX}_set_encrypt_key:
+ __aesni_set_encrypt_key:
+ .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
+ mov \$-1,%rax
+ test $inp,$inp
+ jz .Lenc_key_ret
+ test $key,$key
+ jz .Lenc_key_ret
+
+ mov \$`1<<28|1<<11`,%r10d # AVX and XOP bits
+ movups ($inp),%xmm0 # pull first 128 bits of *userKey
+ xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
+ and OPENSSL_ia32cap_P+4(%rip),%r10d
+ lea 16($key),%rax # %rax is used as modifiable copy of $key
+ cmp \$256,$bits
+ je .L14rounds
+ cmp \$192,$bits
+ je .L12rounds
+ cmp \$128,$bits
+ jne .Lbad_keybits
+
+ .L10rounds:
+ mov \$9,$bits # 10 rounds for 128-bit key
+ cmp \$`1<<28`,%r10d # AVX, but no XOP
+ je .L10rounds_alt
+
+ $movkey %xmm0,($key) # round 0
+ aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
+ call .Lkey_expansion_128_cold
+ aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
+ call .Lkey_expansion_128
+ aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
+ call .Lkey_expansion_128
+ $movkey %xmm0,(%rax)
+ mov $bits,80(%rax) # 240(%rdx)
+ xor %eax,%eax
+ jmp .Lenc_key_ret
+
+ .align 16
+ .L10rounds_alt:
+ movdqa .Lkey_rotate(%rip),%xmm5
+ mov \$8,%r10d
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ movdqa %xmm0,%xmm2
+ movdqu %xmm0,($key)
+ jmp .Loop_key128
+
+ .align 16
+ .Loop_key128:
+ pshufb %xmm5,%xmm0
+ aesenclast %xmm4,%xmm0
+ pslld \$1,%xmm4
+ lea 16(%rax),%rax
+
+ movdqa %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,-16(%rax)
+ movdqa %xmm0,%xmm2
+
+ dec %r10d
+ jnz .Loop_key128
+
+ movdqa .Lkey_rcon1b(%rip),%xmm4
+
+ pshufb %xmm5,%xmm0
+ aesenclast %xmm4,%xmm0
+ pslld \$1,%xmm4
+
+ movdqa %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ movdqa %xmm0,%xmm2
+ pshufb %xmm5,%xmm0
+ aesenclast %xmm4,%xmm0
+
+ movdqa %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm2,%xmm3
+ pslldq \$4,%xmm2
+ pxor %xmm3,%xmm2
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,16(%rax)
+
+ mov $bits,96(%rax) # 240($key)
+ xor %eax,%eax
+ jmp .Lenc_key_ret
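`.Loop_key128` above is the aeskeygenassist-free path: `pshufb` with the `.Lkey_rotate` mask broadcasts RotWord of the last key word into every lane (so `aesenclast`'s ShiftRows is a no-op and it performs SubWord plus the xor with the round constant in %xmm4), while the `pslldq`/`pxor` ladder accumulates the running xor of the previous round key's words. One iteration restated with intrinsics (a readability sketch, not part of the module):

```c
#include <tmmintrin.h> /* _mm_shuffle_epi8 */
#include <wmmintrin.h> /* _mm_aesenclast_si128; compile with -mssse3 -maes */

/* One .Loop_key128 iteration: prev = previous round key, rotate = the
 * .Lkey_rotate mask, rcon = the broadcast round constant (%xmm4). */
static __m128i expand128_step(__m128i prev, __m128i rotate, __m128i rcon) {
  /* SubWord(RotWord(w3)) ^ rcon in every lane */
  __m128i t = _mm_aesenclast_si128(_mm_shuffle_epi8(prev, rotate), rcon);
  /* prev ^ prev<<4 ^ prev<<8 ^ prev<<12 (the pslldq/pxor ladder) */
  __m128i k = prev;
  k = _mm_xor_si128(k, _mm_slli_si128(k, 4));
  k = _mm_xor_si128(k, _mm_slli_si128(k, 4));
  k = _mm_xor_si128(k, _mm_slli_si128(k, 4));
  return _mm_xor_si128(k, t);
}
```

Each iteration the assembly also doubles the round constant with `pslld \$1,%xmm4`; a caller of this sketch would do the same with `rcon = _mm_slli_epi32(rcon, 1)`.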
1564
+
+ .align 16
+ .L12rounds:
+ movq 16($inp),%xmm2 # remaining 1/3 of *userKey
+ mov \$11,$bits # 12 rounds for 192
+ cmp \$`1<<28`,%r10d # AVX, but no XOP
+ je .L12rounds_alt
+
+ $movkey %xmm0,($key) # round 0
+ aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
+ call .Lkey_expansion_192a_cold
+ aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
+ call .Lkey_expansion_192b
+ aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
+ call .Lkey_expansion_192a
+ aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
+ call .Lkey_expansion_192b
+ aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
+ call .Lkey_expansion_192a
+ aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
+ call .Lkey_expansion_192b
+ aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
+ call .Lkey_expansion_192a
+ aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
+ call .Lkey_expansion_192b
+ $movkey %xmm0,(%rax)
+ mov $bits,48(%rax) # 240(%rdx)
+ xor %rax,%rax
+ jmp .Lenc_key_ret
+
+ .align 16
+ .L12rounds_alt:
+ movdqa .Lkey_rotate192(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ mov \$8,%r10d
+ movdqu %xmm0,($key)
+ jmp .Loop_key192
+
+ .align 16
+ .Loop_key192:
+ movq %xmm2,0(%rax)
+ movdqa %xmm2,%xmm1
+ pshufb %xmm5,%xmm2
+ aesenclast %xmm4,%xmm2
+ pslld \$1,%xmm4
+ lea 24(%rax),%rax
+
+ movdqa %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm3,%xmm0
+
+ pshufd \$0xff,%xmm0,%xmm3
+ pxor %xmm1,%xmm3
+ pslldq \$4,%xmm1
+ pxor %xmm1,%xmm3
+
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm2
+ movdqu %xmm0,-16(%rax)
+
+ dec %r10d
+ jnz .Loop_key192
+
+ mov $bits,32(%rax) # 240($key)
+ xor %eax,%eax
+ jmp .Lenc_key_ret
+
1635
+ .align 16
+ .L14rounds:
+ movups 16($inp),%xmm2 # remaining half of *userKey
+ mov \$13,$bits # 14 rounds for 256
+ lea 16(%rax),%rax
+ cmp \$`1<<28`,%r10d # AVX, but no XOP
+ je .L14rounds_alt
+
+ $movkey %xmm0,($key) # round 0
+ $movkey %xmm2,16($key) # round 1
+ aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
+ call .Lkey_expansion_256a_cold
+ aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
+ call .Lkey_expansion_256a
+ aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
+ call .Lkey_expansion_256b
+ aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
+ call .Lkey_expansion_256a
+ $movkey %xmm0,(%rax)
+ mov $bits,16(%rax) # 240(%rdx)
+ xor %rax,%rax
+ jmp .Lenc_key_ret
+
+ .align 16
+ .L14rounds_alt:
+ movdqa .Lkey_rotate(%rip),%xmm5
+ movdqa .Lkey_rcon1(%rip),%xmm4
+ mov \$7,%r10d
+ movdqu %xmm0,0($key)
+ movdqa %xmm2,%xmm1
+ movdqu %xmm2,16($key)
+ jmp .Loop_key256
+
+ .align 16
+ .Loop_key256:
+ pshufb %xmm5,%xmm2
+ aesenclast %xmm4,%xmm2
+
+ movdqa %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm0,%xmm3
+ pslldq \$4,%xmm0
+ pxor %xmm3,%xmm0
+ pslld \$1,%xmm4
+
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,(%rax)
+
+ dec %r10d
+ jz .Ldone_key256
+
+ pshufd \$0xff,%xmm0,%xmm2
+ pxor %xmm3,%xmm3
+ aesenclast %xmm3,%xmm2
+
+ movdqa %xmm1,%xmm3
+ pslldq \$4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq \$4,%xmm1
+ pxor %xmm1,%xmm3
+ pslldq \$4,%xmm1
+ pxor %xmm3,%xmm1
+
+ pxor %xmm1,%xmm2
+ movdqu %xmm2,16(%rax)
+ lea 32(%rax),%rax
+ movdqa %xmm2,%xmm1
+
+ jmp .Loop_key256
+
+ .Ldone_key256:
+ mov $bits,16(%rax) # 240($key)
+ xor %eax,%eax
+ jmp .Lenc_key_ret
+
1730
+ .align 16
+ .Lbad_keybits:
+ mov \$-2,%rax
+ .Lenc_key_ret:
+ pxor %xmm0,%xmm0
+ pxor %xmm1,%xmm1
+ pxor %xmm2,%xmm2
+ pxor %xmm3,%xmm3
+ pxor %xmm4,%xmm4
+ pxor %xmm5,%xmm5
+ add \$8,%rsp
+ ret
+ .LSEH_end_set_encrypt_key:
+
+ .align 16
+ .Lkey_expansion_128:
+ $movkey %xmm0,(%rax)
+ lea 16(%rax),%rax
+ .Lkey_expansion_128_cold:
+ shufps \$0b00010000,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps \$0b11111111,%xmm1,%xmm1 # critical path
+ xorps %xmm1,%xmm0
+ ret
+
+ .align 16
+ .Lkey_expansion_192a:
+ $movkey %xmm0,(%rax)
+ lea 16(%rax),%rax
+ .Lkey_expansion_192a_cold:
+ movaps %xmm2,%xmm5
+ .Lkey_expansion_192b_warm:
+ shufps \$0b00010000,%xmm0,%xmm4
+ movdqa %xmm2,%xmm3
+ xorps %xmm4,%xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+ pslldq \$4,%xmm3
+ xorps %xmm4,%xmm0
+ pshufd \$0b01010101,%xmm1,%xmm1 # critical path
+ pxor %xmm3,%xmm2
+ pxor %xmm1,%xmm0
+ pshufd \$0b11111111,%xmm0,%xmm3
+ pxor %xmm3,%xmm2
+ ret
+
+ .align 16
+ .Lkey_expansion_192b:
+ movaps %xmm0,%xmm3
+ shufps \$0b01000100,%xmm0,%xmm5
+ $movkey %xmm5,(%rax)
+ shufps \$0b01001110,%xmm2,%xmm3
+ $movkey %xmm3,16(%rax)
+ lea 32(%rax),%rax
+ jmp .Lkey_expansion_192b_warm
+
+ .align 16
+ .Lkey_expansion_256a:
+ $movkey %xmm2,(%rax)
+ lea 16(%rax),%rax
+ .Lkey_expansion_256a_cold:
+ shufps \$0b00010000,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps \$0b10001100,%xmm0,%xmm4
+ xorps %xmm4,%xmm0
+ shufps \$0b11111111,%xmm1,%xmm1 # critical path
+ xorps %xmm1,%xmm0
+ ret
+
+ .align 16
+ .Lkey_expansion_256b:
+ $movkey %xmm0,(%rax)
+ lea 16(%rax),%rax
+
+ shufps \$0b00010000,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps \$0b10001100,%xmm2,%xmm4
+ xorps %xmm4,%xmm2
+ shufps \$0b10101010,%xmm1,%xmm1 # critical path
+ xorps %xmm1,%xmm2
+ ret
+ .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
+ .size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
+ ___
+ }
1816
+
+ $code.=<<___;
+ .align 64
+ .Lbswap_mask:
+ .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
+ .Lincrement32:
+ .long 6,6,6,0
+ .Lincrement64:
+ .long 1,0,0,0
+ .Lincrement1:
+ .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
+ .Lkey_rotate:
+ .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
+ .Lkey_rotate192:
+ .long 0x04070605,0x04070605,0x04070605,0x04070605
+ .Lkey_rcon1:
+ .long 1,1,1,1
+ .Lkey_rcon1b:
+ .long 0x1b,0x1b,0x1b,0x1b
+
+ .asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
+ .align 64
+ ___
1839
+
+ # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+ # CONTEXT *context,DISPATCHER_CONTEXT *disp)
+ if ($win64) {
+ $rec="%rcx";
+ $frame="%rdx";
+ $context="%r8";
+ $disp="%r9";
+
+ $code.=<<___;
+ .extern __imp_RtlVirtualUnwind
+ ___
+ $code.=<<___ if ($PREFIX eq "aesni");
+ .type ccm64_se_handler,\@abi-omnipotent
+ .align 16
+ ccm64_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ lea 0(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+ lea 0x58(%rax),%rax # adjust stack pointer
+
+ jmp .Lcommon_seh_tail
+ .size ccm64_se_handler,.-ccm64_se_handler
+
+ .type ctr_se_handler,\@abi-omnipotent
+ .align 16
+ ctr_se_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ mov 8($disp),%rsi # disp->ImageBase
+ mov 56($disp),%r11 # disp->HandlerData
+
+ mov 0(%r11),%r10d # HandlerData[0]
+ lea (%rsi,%r10),%r10 # prologue label
+ cmp %r10,%rbx # context->Rip<prologue label
+ jb .Lcommon_seh_tail
+
+ mov 152($context),%rax # pull context->Rsp
+
+ mov 4(%r11),%r10d # HandlerData[1]
+ lea (%rsi,%r10),%r10 # epilogue label
+ cmp %r10,%rbx # context->Rip>=epilogue label
+ jae .Lcommon_seh_tail
+
+ mov 160($context),%rax # pull context->Rbp
+ lea -0xa0(%rax),%rsi # %xmm save area
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov 160($context),%rax # pull context->Rbp
+ mov (%rax),%rbp # restore saved %rbp
+ lea 8(%rax),%rax # adjust stack pointer
+ mov %rbp,160($context) # restore context->Rbp
+
+ mov 8(%rax),%rdi
+ mov 16(%rax),%rsi
+ mov %rax,152($context) # restore context->Rsp
+ mov %rsi,168($context) # restore context->Rsi
+ mov %rdi,176($context) # restore context->Rdi
+
+ mov 40($disp),%rdi # disp->ContextRecord
+ mov $context,%rsi # context
+ mov \$154,%ecx # sizeof(CONTEXT)
+ .long 0xa548f3fc # cld; rep movsq
+
+ mov $disp,%rsi
+ xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
+ mov 8(%rsi),%rdx # arg2, disp->ImageBase
+ mov 0(%rsi),%r8 # arg3, disp->ControlPc
+ mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
+ mov 40(%rsi),%r10 # disp->ContextRecord
+ lea 56(%rsi),%r11 # &disp->HandlerData
+ lea 24(%rsi),%r12 # &disp->EstablisherFrame
+ mov %r10,32(%rsp) # arg5
+ mov %r11,40(%rsp) # arg6
+ mov %r12,48(%rsp) # arg7
+ mov %rcx,56(%rsp) # arg8, (NULL)
+ call *__imp_RtlVirtualUnwind(%rip)
+
+ mov \$1,%eax # ExceptionContinueSearch
+ add \$64,%rsp
+ popfq
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ pop %rdi
+ pop %rsi
+ ret
+ .size ctr_se_handler,.-ctr_se_handler
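Both handlers index `$disp` with the fixed offsets annotated above. For reference, those offsets match the x64 `DISPATCHER_CONTEXT` layout; the sketch below lists only the fields the handlers touch, with abbreviated types (not a complete Windows declaration):

```c
#include <stdint.h>

/* x64 DISPATCHER_CONTEXT fields used above, with the byte offsets the
 * assembly hard-codes against $disp. Abbreviated sketch only. */
typedef struct {
  uint64_t ControlPc;        /*  0: disp->ControlPc */
  uint64_t ImageBase;        /*  8: disp->ImageBase */
  void    *FunctionEntry;    /* 16: disp->FunctionEntry */
  uint64_t EstablisherFrame; /* 24: &disp->EstablisherFrame */
  uint64_t TargetIp;         /* 32 */
  void    *ContextRecord;    /* 40: disp->ContextRecord */
  void    *LanguageHandler;  /* 48 */
  void    *HandlerData;      /* 56: disp->HandlerData */
} DISPATCHER_CONTEXT_SKETCH;
```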
1974
+
+ .section .pdata
+ .align 4
+ ___
+ $code.=<<___ if ($PREFIX eq "aesni");
+ .rva .LSEH_begin_aesni_ccm64_encrypt_blocks
+ .rva .LSEH_end_aesni_ccm64_encrypt_blocks
+ .rva .LSEH_info_ccm64_enc
+
+ .rva .LSEH_begin_aesni_ccm64_decrypt_blocks
+ .rva .LSEH_end_aesni_ccm64_decrypt_blocks
+ .rva .LSEH_info_ccm64_dec
+
+ .rva .LSEH_begin_aesni_ctr32_encrypt_blocks
+ .rva .LSEH_end_aesni_ctr32_encrypt_blocks
+ .rva .LSEH_info_ctr32
+ ___
+ $code.=<<___;
+ .rva ${PREFIX}_set_decrypt_key
+ .rva .LSEH_end_set_decrypt_key
+ .rva .LSEH_info_key
+
+ .rva ${PREFIX}_set_encrypt_key
+ .rva .LSEH_end_set_encrypt_key
+ .rva .LSEH_info_key
+ .section .xdata
+ .align 8
+ ___
+ $code.=<<___ if ($PREFIX eq "aesni");
+ .LSEH_info_ccm64_enc:
+ .byte 9,0,0,0
+ .rva ccm64_se_handler
+ .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[]
+ .LSEH_info_ccm64_dec:
+ .byte 9,0,0,0
+ .rva ccm64_se_handler
+ .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
+ .LSEH_info_ctr32:
+ .byte 9,0,0,0
+ .rva ctr_se_handler
+ .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[]
+ ___
+ $code.=<<___;
+ .LSEH_info_key:
+ .byte 0x01,0x04,0x01,0x00
+ .byte 0x04,0x02,0x00,0x00 # sub rsp,8
+ ___
+ }
2022
+
+ sub rex {
+ local *opcode=shift;
+ my ($dst,$src)=@_;
+ my $rex=0;
+
+ $rex|=0x04 if($dst>=8);
+ $rex|=0x01 if($src>=8);
+ push @opcode,$rex|0x40 if($rex);
+ }
+
+ sub aesni {
+ my $line=shift;
+ my @opcode=(0x66);
+
+ if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ rex(\@opcode,$4,$3);
+ push @opcode,0x0f,0x3a,0xdf;
+ push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M
+ my $c=$2;
+ push @opcode,$c=~/^0/?oct($c):$c;
+ return ".byte\t".join(',',@opcode);
+ }
+ elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
+ my %opcodelet = (
+ "aesimc" => 0xdb,
+ "aesenc" => 0xdc, "aesenclast" => 0xdd,
+ "aesdec" => 0xde, "aesdeclast" => 0xdf
+ );
+ return undef if (!defined($opcodelet{$1}));
+ rex(\@opcode,$3,$2);
+ push @opcode,0x0f,0x38,$opcodelet{$1};
+ push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
+ return ".byte\t".join(',',@opcode);
+ }
+ elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
+ my %opcodelet = (
+ "aesenc" => 0xdc, "aesenclast" => 0xdd,
+ "aesdec" => 0xde, "aesdeclast" => 0xdf
+ );
+ return undef if (!defined($opcodelet{$1}));
+ my $off = $2;
+ push @opcode,0x44 if ($3>=8);
+ push @opcode,0x0f,0x38,$opcodelet{$1};
+ push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M
+ push @opcode,($off=~/^0/?oct($off):$off)&0xff;
+ return ".byte\t".join(',',@opcode);
+ }
+ return $line;
+ }
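`sub aesni` exists because older assemblers do not know the AES-NI mnemonics: it re-encodes each matched instruction as raw bytes (the mandatory 0x66 prefix, an optional REX byte from `sub rex`, the 0x0f 0x38 or 0x0f 0x3a opcode, then ModR/M). A worked example of what it emits for register-to-register `aesenc %xmm1,%xmm2`, easy to check against the opcode table above:

```c
/* "aesenc %xmm1,%xmm2" as sub aesni() encodes it: 0x66 prefix,
 * opcode 0x0f 0x38 0xdc, then ModR/M 0xc0|(src&7)|((dst&7)<<3)
 * = 0xc0|1|(2<<3) = 0xd1. No REX byte, since both regs are < %xmm8. */
static const unsigned char aesenc_xmm1_xmm2[5] = {
  0x66, 0x0f, 0x38, 0xdc, 0xd1
};
```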
2072
+
+ sub movbe {
+ ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift;
+ }
+
+ $code =~ s/\`([^\`]*)\`/eval($1)/gem;
+ $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
+ #$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact
+ $code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;
+
+ print $code;
+
+ close STDOUT;