ring-native 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (261) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/Gemfile +3 -0
  4. data/README.md +22 -0
  5. data/Rakefile +1 -0
  6. data/ext/ring/extconf.rb +29 -0
  7. data/lib/ring/native.rb +8 -0
  8. data/lib/ring/native/version.rb +5 -0
  9. data/ring-native.gemspec +25 -0
  10. data/vendor/ring/BUILDING.md +40 -0
  11. data/vendor/ring/Cargo.toml +43 -0
  12. data/vendor/ring/LICENSE +185 -0
  13. data/vendor/ring/Makefile +35 -0
  14. data/vendor/ring/PORTING.md +163 -0
  15. data/vendor/ring/README.md +113 -0
  16. data/vendor/ring/STYLE.md +197 -0
  17. data/vendor/ring/appveyor.yml +27 -0
  18. data/vendor/ring/build.rs +108 -0
  19. data/vendor/ring/crypto/aes/aes.c +1142 -0
  20. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +25 -0
  21. data/vendor/ring/crypto/aes/aes_test.cc +93 -0
  22. data/vendor/ring/crypto/aes/asm/aes-586.pl +2368 -0
  23. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +1249 -0
  24. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +2246 -0
  25. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +1318 -0
  26. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +2084 -0
  27. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +675 -0
  28. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +1364 -0
  29. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +1565 -0
  30. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +841 -0
  31. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +1116 -0
  32. data/vendor/ring/crypto/aes/internal.h +87 -0
  33. data/vendor/ring/crypto/aes/mode_wrappers.c +61 -0
  34. data/vendor/ring/crypto/bn/add.c +394 -0
  35. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +694 -0
  36. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +1503 -0
  37. data/vendor/ring/crypto/bn/asm/bn-586.pl +774 -0
  38. data/vendor/ring/crypto/bn/asm/co-586.pl +287 -0
  39. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +1882 -0
  40. data/vendor/ring/crypto/bn/asm/x86-mont.pl +592 -0
  41. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +599 -0
  42. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +1393 -0
  43. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +3507 -0
  44. data/vendor/ring/crypto/bn/bn.c +352 -0
  45. data/vendor/ring/crypto/bn/bn_asn1.c +74 -0
  46. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +25 -0
  47. data/vendor/ring/crypto/bn/bn_test.cc +1696 -0
  48. data/vendor/ring/crypto/bn/cmp.c +200 -0
  49. data/vendor/ring/crypto/bn/convert.c +433 -0
  50. data/vendor/ring/crypto/bn/ctx.c +311 -0
  51. data/vendor/ring/crypto/bn/div.c +594 -0
  52. data/vendor/ring/crypto/bn/exponentiation.c +1335 -0
  53. data/vendor/ring/crypto/bn/gcd.c +711 -0
  54. data/vendor/ring/crypto/bn/generic.c +1019 -0
  55. data/vendor/ring/crypto/bn/internal.h +316 -0
  56. data/vendor/ring/crypto/bn/montgomery.c +516 -0
  57. data/vendor/ring/crypto/bn/mul.c +888 -0
  58. data/vendor/ring/crypto/bn/prime.c +829 -0
  59. data/vendor/ring/crypto/bn/random.c +334 -0
  60. data/vendor/ring/crypto/bn/rsaz_exp.c +262 -0
  61. data/vendor/ring/crypto/bn/rsaz_exp.h +53 -0
  62. data/vendor/ring/crypto/bn/shift.c +276 -0
  63. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +25 -0
  64. data/vendor/ring/crypto/bytestring/bytestring_test.cc +421 -0
  65. data/vendor/ring/crypto/bytestring/cbb.c +399 -0
  66. data/vendor/ring/crypto/bytestring/cbs.c +227 -0
  67. data/vendor/ring/crypto/bytestring/internal.h +46 -0
  68. data/vendor/ring/crypto/chacha/chacha_generic.c +140 -0
  69. data/vendor/ring/crypto/chacha/chacha_vec.c +323 -0
  70. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +1447 -0
  71. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +153 -0
  72. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +25 -0
  73. data/vendor/ring/crypto/cipher/e_aes.c +390 -0
  74. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +208 -0
  75. data/vendor/ring/crypto/cipher/internal.h +173 -0
  76. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +543 -0
  77. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +9 -0
  78. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +475 -0
  79. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +23 -0
  80. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +422 -0
  81. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +484 -0
  82. data/vendor/ring/crypto/cipher/test/cipher_test.txt +100 -0
  83. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +25 -0
  84. data/vendor/ring/crypto/constant_time_test.c +304 -0
  85. data/vendor/ring/crypto/cpu-arm-asm.S +32 -0
  86. data/vendor/ring/crypto/cpu-arm.c +199 -0
  87. data/vendor/ring/crypto/cpu-intel.c +261 -0
  88. data/vendor/ring/crypto/crypto.c +151 -0
  89. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +2118 -0
  90. data/vendor/ring/crypto/curve25519/curve25519.c +4888 -0
  91. data/vendor/ring/crypto/curve25519/x25519_test.cc +128 -0
  92. data/vendor/ring/crypto/digest/md32_common.h +181 -0
  93. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +2725 -0
  94. data/vendor/ring/crypto/ec/ec.c +193 -0
  95. data/vendor/ring/crypto/ec/ec_curves.c +61 -0
  96. data/vendor/ring/crypto/ec/ec_key.c +228 -0
  97. data/vendor/ring/crypto/ec/ec_montgomery.c +114 -0
  98. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +25 -0
  99. data/vendor/ring/crypto/ec/internal.h +243 -0
  100. data/vendor/ring/crypto/ec/oct.c +253 -0
  101. data/vendor/ring/crypto/ec/p256-64.c +1794 -0
  102. data/vendor/ring/crypto/ec/p256-x86_64-table.h +9548 -0
  103. data/vendor/ring/crypto/ec/p256-x86_64.c +509 -0
  104. data/vendor/ring/crypto/ec/simple.c +1007 -0
  105. data/vendor/ring/crypto/ec/util-64.c +183 -0
  106. data/vendor/ring/crypto/ec/wnaf.c +508 -0
  107. data/vendor/ring/crypto/ecdh/ecdh.c +155 -0
  108. data/vendor/ring/crypto/ecdsa/ecdsa.c +304 -0
  109. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +193 -0
  110. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +25 -0
  111. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +327 -0
  112. data/vendor/ring/crypto/header_removed.h +17 -0
  113. data/vendor/ring/crypto/internal.h +495 -0
  114. data/vendor/ring/crypto/libring.Windows.vcxproj +101 -0
  115. data/vendor/ring/crypto/mem.c +98 -0
  116. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +1045 -0
  117. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +517 -0
  118. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +1393 -0
  119. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +1741 -0
  120. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +422 -0
  121. data/vendor/ring/crypto/modes/ctr.c +226 -0
  122. data/vendor/ring/crypto/modes/gcm.c +1206 -0
  123. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +25 -0
  124. data/vendor/ring/crypto/modes/gcm_test.c +348 -0
  125. data/vendor/ring/crypto/modes/internal.h +299 -0
  126. data/vendor/ring/crypto/perlasm/arm-xlate.pl +170 -0
  127. data/vendor/ring/crypto/perlasm/readme +100 -0
  128. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +1164 -0
  129. data/vendor/ring/crypto/perlasm/x86asm.pl +292 -0
  130. data/vendor/ring/crypto/perlasm/x86gas.pl +263 -0
  131. data/vendor/ring/crypto/perlasm/x86masm.pl +200 -0
  132. data/vendor/ring/crypto/perlasm/x86nasm.pl +187 -0
  133. data/vendor/ring/crypto/poly1305/poly1305.c +331 -0
  134. data/vendor/ring/crypto/poly1305/poly1305_arm.c +301 -0
  135. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +2015 -0
  136. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +25 -0
  137. data/vendor/ring/crypto/poly1305/poly1305_test.cc +80 -0
  138. data/vendor/ring/crypto/poly1305/poly1305_test.txt +52 -0
  139. data/vendor/ring/crypto/poly1305/poly1305_vec.c +892 -0
  140. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +75 -0
  141. data/vendor/ring/crypto/rand/internal.h +32 -0
  142. data/vendor/ring/crypto/rand/rand.c +189 -0
  143. data/vendor/ring/crypto/rand/urandom.c +219 -0
  144. data/vendor/ring/crypto/rand/windows.c +56 -0
  145. data/vendor/ring/crypto/refcount_c11.c +66 -0
  146. data/vendor/ring/crypto/refcount_lock.c +53 -0
  147. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +25 -0
  148. data/vendor/ring/crypto/refcount_test.c +58 -0
  149. data/vendor/ring/crypto/rsa/blinding.c +462 -0
  150. data/vendor/ring/crypto/rsa/internal.h +108 -0
  151. data/vendor/ring/crypto/rsa/padding.c +300 -0
  152. data/vendor/ring/crypto/rsa/rsa.c +450 -0
  153. data/vendor/ring/crypto/rsa/rsa_asn1.c +261 -0
  154. data/vendor/ring/crypto/rsa/rsa_impl.c +944 -0
  155. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +25 -0
  156. data/vendor/ring/crypto/rsa/rsa_test.cc +437 -0
  157. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +436 -0
  158. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +2390 -0
  159. data/vendor/ring/crypto/sha/asm/sha256-586.pl +1275 -0
  160. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +735 -0
  161. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +14 -0
  162. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +14 -0
  163. data/vendor/ring/crypto/sha/asm/sha512-586.pl +911 -0
  164. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +666 -0
  165. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +14 -0
  166. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +14 -0
  167. data/vendor/ring/crypto/sha/sha1.c +271 -0
  168. data/vendor/ring/crypto/sha/sha256.c +204 -0
  169. data/vendor/ring/crypto/sha/sha512.c +355 -0
  170. data/vendor/ring/crypto/test/file_test.cc +326 -0
  171. data/vendor/ring/crypto/test/file_test.h +181 -0
  172. data/vendor/ring/crypto/test/malloc.cc +150 -0
  173. data/vendor/ring/crypto/test/scoped_types.h +95 -0
  174. data/vendor/ring/crypto/test/test.Windows.vcxproj +35 -0
  175. data/vendor/ring/crypto/test/test_util.cc +46 -0
  176. data/vendor/ring/crypto/test/test_util.h +41 -0
  177. data/vendor/ring/crypto/thread_none.c +55 -0
  178. data/vendor/ring/crypto/thread_pthread.c +165 -0
  179. data/vendor/ring/crypto/thread_test.Windows.vcxproj +25 -0
  180. data/vendor/ring/crypto/thread_test.c +200 -0
  181. data/vendor/ring/crypto/thread_win.c +282 -0
  182. data/vendor/ring/examples/checkdigest.rs +103 -0
  183. data/vendor/ring/include/openssl/aes.h +121 -0
  184. data/vendor/ring/include/openssl/arm_arch.h +129 -0
  185. data/vendor/ring/include/openssl/base.h +156 -0
  186. data/vendor/ring/include/openssl/bn.h +794 -0
  187. data/vendor/ring/include/openssl/buffer.h +18 -0
  188. data/vendor/ring/include/openssl/bytestring.h +235 -0
  189. data/vendor/ring/include/openssl/chacha.h +37 -0
  190. data/vendor/ring/include/openssl/cmac.h +76 -0
  191. data/vendor/ring/include/openssl/cpu.h +184 -0
  192. data/vendor/ring/include/openssl/crypto.h +43 -0
  193. data/vendor/ring/include/openssl/curve25519.h +88 -0
  194. data/vendor/ring/include/openssl/ec.h +225 -0
  195. data/vendor/ring/include/openssl/ec_key.h +129 -0
  196. data/vendor/ring/include/openssl/ecdh.h +110 -0
  197. data/vendor/ring/include/openssl/ecdsa.h +156 -0
  198. data/vendor/ring/include/openssl/err.h +201 -0
  199. data/vendor/ring/include/openssl/mem.h +101 -0
  200. data/vendor/ring/include/openssl/obj_mac.h +71 -0
  201. data/vendor/ring/include/openssl/opensslfeatures.h +68 -0
  202. data/vendor/ring/include/openssl/opensslv.h +18 -0
  203. data/vendor/ring/include/openssl/ossl_typ.h +18 -0
  204. data/vendor/ring/include/openssl/poly1305.h +51 -0
  205. data/vendor/ring/include/openssl/rand.h +70 -0
  206. data/vendor/ring/include/openssl/rsa.h +399 -0
  207. data/vendor/ring/include/openssl/thread.h +133 -0
  208. data/vendor/ring/include/openssl/type_check.h +71 -0
  209. data/vendor/ring/mk/Common.props +63 -0
  210. data/vendor/ring/mk/Windows.props +42 -0
  211. data/vendor/ring/mk/WindowsTest.props +18 -0
  212. data/vendor/ring/mk/appveyor.bat +62 -0
  213. data/vendor/ring/mk/bottom_of_makefile.mk +54 -0
  214. data/vendor/ring/mk/ring.mk +266 -0
  215. data/vendor/ring/mk/top_of_makefile.mk +214 -0
  216. data/vendor/ring/mk/travis.sh +40 -0
  217. data/vendor/ring/mk/update-travis-yml.py +229 -0
  218. data/vendor/ring/ring.sln +153 -0
  219. data/vendor/ring/src/aead.rs +682 -0
  220. data/vendor/ring/src/agreement.rs +248 -0
  221. data/vendor/ring/src/c.rs +129 -0
  222. data/vendor/ring/src/constant_time.rs +37 -0
  223. data/vendor/ring/src/der.rs +96 -0
  224. data/vendor/ring/src/digest.rs +690 -0
  225. data/vendor/ring/src/digest_tests.txt +57 -0
  226. data/vendor/ring/src/ecc.rs +28 -0
  227. data/vendor/ring/src/ecc_build.rs +279 -0
  228. data/vendor/ring/src/ecc_curves.rs +117 -0
  229. data/vendor/ring/src/ed25519_tests.txt +2579 -0
  230. data/vendor/ring/src/exe_tests.rs +46 -0
  231. data/vendor/ring/src/ffi.rs +29 -0
  232. data/vendor/ring/src/file_test.rs +187 -0
  233. data/vendor/ring/src/hkdf.rs +153 -0
  234. data/vendor/ring/src/hkdf_tests.txt +59 -0
  235. data/vendor/ring/src/hmac.rs +414 -0
  236. data/vendor/ring/src/hmac_tests.txt +97 -0
  237. data/vendor/ring/src/input.rs +312 -0
  238. data/vendor/ring/src/lib.rs +41 -0
  239. data/vendor/ring/src/pbkdf2.rs +265 -0
  240. data/vendor/ring/src/pbkdf2_tests.txt +113 -0
  241. data/vendor/ring/src/polyfill.rs +57 -0
  242. data/vendor/ring/src/rand.rs +28 -0
  243. data/vendor/ring/src/signature.rs +314 -0
  244. data/vendor/ring/third-party/NIST/README.md +9 -0
  245. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +263 -0
  246. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +309 -0
  247. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +267 -0
  248. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +263 -0
  249. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +309 -0
  250. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +267 -0
  251. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +263 -0
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +309 -0
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +267 -0
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +519 -0
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +309 -0
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +523 -0
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +519 -0
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +309 -0
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +523 -0
  260. data/vendor/ring/third-party/NIST/sha256sums.txt +1 -0
  261. metadata +333 -0
@@ -0,0 +1,1318 @@
1
+ #!/usr/bin/env perl
2
+
3
+ # ====================================================================
4
+ # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5
+ # project. The module is, however, dual licensed under OpenSSL and
6
+ # CRYPTOGAMS licenses depending on where you obtain it. For further
7
+ # details see http://www.openssl.org/~appro/cryptogams/.
8
+ # ====================================================================
9
+ #
10
+ # This module implements support for Intel AES-NI extension. In
11
+ # OpenSSL context it's used with Intel engine, but can also be used as
12
+ # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
13
+ # details].
14
+ #
15
+ # Performance.
16
+ #
17
+ # To start with see corresponding paragraph in aesni-x86_64.pl...
18
+ # Instead of filling table similar to one found there I've chosen to
19
+ # summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
20
+ # The simplified table below represents 32-bit performance relative
21
+ # to 64-bit one in every given point. Ratios vary for different
22
+ # encryption modes, therefore interval values.
23
+ #
24
+ # 16-byte 64-byte 256-byte 1-KB 8-KB
25
+ # 53-67% 67-84% 91-94% 95-98% 97-99.5%
26
+ #
27
+ # Lower ratios for smaller block sizes are perfectly understandable,
28
+ # because function call overhead is higher in 32-bit mode. Largest
29
+ # 8-KB block performance is virtually the same: 32-bit code is less than
30
+ # 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
31
+
32
+ # January 2011
33
+ #
34
+ # See aesni-x86_64.pl for details. Unlike x86_64 version this module
35
+ # interleaves at most 6 aes[enc|dec] instructions, because there are
36
+ # not enough registers for 8x interleave [which should be optimal for
37
+ # Sandy Bridge]. Actually, performance results for 6x interleave
38
+ # factor presented in aesni-x86_64.pl (except for CTR) are for this
39
+ # module.
40
+
41
+ # April 2011
42
+ #
43
+ # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
44
+ # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
45
+
46
+ ######################################################################
47
+ # Current large-block performance in cycles per byte processed with
48
+ # 128-bit key (less is better).
49
+ #
50
+ # CBC en-/decrypt CTR XTS ECB
51
+ # Westmere 3.77/1.37 1.37 1.52 1.27
52
+ # * Bridge 5.07/0.98 0.99 1.09 0.91
53
+ # Haswell 4.44/0.80 0.97 1.03 0.72
54
+ # Silvermont 5.77/3.56 3.67 4.03 3.46
55
+ # Bulldozer 5.80/0.98 1.05 1.24 0.93
56
+
57
+ $PREFIX="aesni"; # if $PREFIX is set to "AES", the script
58
+ # generates drop-in replacement for
59
+ # crypto/aes/asm/aes-586.pl:-)
60
+ $inline=1; # inline _aesni_[en|de]crypt
61
+
62
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
63
+ push(@INC,"${dir}","${dir}../../perlasm");
64
+ require "x86asm.pl";
65
+
66
+ &asm_init($ARGV[0],$0);
67
+
68
+ &external_label("OPENSSL_ia32cap_P");
69
+ &static_label("key_const");
70
+
71
+ if ($PREFIX eq "aesni") { $movekey=\&movups; }
72
+ else { $movekey=\&movups; }
73
+
74
+ $len="eax";
75
+ $rounds="ecx";
76
+ $key="edx";
77
+ $inp="esi";
78
+ $out="edi";
79
+ $rounds_="ebx"; # backup copy for $rounds
80
+ $key_="ebp"; # backup copy for $key
81
+
82
+ $rndkey0="xmm0";
83
+ $rndkey1="xmm1";
84
+ $inout0="xmm2";
85
+ $inout1="xmm3";
86
+ $inout2="xmm4";
87
+ $inout3="xmm5"; $in1="xmm5";
88
+ $inout4="xmm6"; $in0="xmm6";
89
+ $inout5="xmm7"; $ivec="xmm7";
90
+
91
+ # AESNI extension
92
+ sub aeskeygenassist
93
+ { my($dst,$src,$imm)=@_;
94
+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
95
+ { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); }
96
+ }
97
+ sub aescommon
98
+ { my($opcodelet,$dst,$src)=@_;
99
+ if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
100
+ { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
101
+ }
102
+ sub aesimc { aescommon(0xdb,@_); }
103
+ sub aesenc { aescommon(0xdc,@_); }
104
+ sub aesenclast { aescommon(0xdd,@_); }
105
+ sub aesdec { aescommon(0xde,@_); }
106
+ sub aesdeclast { aescommon(0xdf,@_); }
107
+
108
+ # Inline version of internal aesni_[en|de]crypt1
109
+ { my $sn;
110
+ sub aesni_inline_generate1
111
+ { my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
112
+ $sn++;
113
+
114
+ &$movekey ($rndkey0,&QWP(0,$key));
115
+ &$movekey ($rndkey1,&QWP(16,$key));
116
+ &xorps ($ivec,$rndkey0) if (defined($ivec));
117
+ &lea ($key,&DWP(32,$key));
118
+ &xorps ($inout,$ivec) if (defined($ivec));
119
+ &xorps ($inout,$rndkey0) if (!defined($ivec));
120
+ &set_label("${p}1_loop_$sn");
121
+ eval"&aes${p} ($inout,$rndkey1)";
122
+ &dec ($rounds);
123
+ &$movekey ($rndkey1,&QWP(0,$key));
124
+ &lea ($key,&DWP(16,$key));
125
+ &jnz (&label("${p}1_loop_$sn"));
126
+ eval"&aes${p}last ($inout,$rndkey1)";
127
+ }}
128
+
129
+ sub aesni_generate1 # fully unrolled loop
130
+ { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
131
+
132
+ &function_begin_B("_aesni_${p}rypt1");
133
+ &movups ($rndkey0,&QWP(0,$key));
134
+ &$movekey ($rndkey1,&QWP(0x10,$key));
135
+ &xorps ($inout,$rndkey0);
136
+ &$movekey ($rndkey0,&QWP(0x20,$key));
137
+ &lea ($key,&DWP(0x30,$key));
138
+ &cmp ($rounds,11);
139
+ &jb (&label("${p}128"));
140
+ &lea ($key,&DWP(0x20,$key));
141
+ &je (&label("${p}192"));
142
+ &lea ($key,&DWP(0x20,$key));
143
+ eval"&aes${p} ($inout,$rndkey1)";
144
+ &$movekey ($rndkey1,&QWP(-0x40,$key));
145
+ eval"&aes${p} ($inout,$rndkey0)";
146
+ &$movekey ($rndkey0,&QWP(-0x30,$key));
147
+ &set_label("${p}192");
148
+ eval"&aes${p} ($inout,$rndkey1)";
149
+ &$movekey ($rndkey1,&QWP(-0x20,$key));
150
+ eval"&aes${p} ($inout,$rndkey0)";
151
+ &$movekey ($rndkey0,&QWP(-0x10,$key));
152
+ &set_label("${p}128");
153
+ eval"&aes${p} ($inout,$rndkey1)";
154
+ &$movekey ($rndkey1,&QWP(0,$key));
155
+ eval"&aes${p} ($inout,$rndkey0)";
156
+ &$movekey ($rndkey0,&QWP(0x10,$key));
157
+ eval"&aes${p} ($inout,$rndkey1)";
158
+ &$movekey ($rndkey1,&QWP(0x20,$key));
159
+ eval"&aes${p} ($inout,$rndkey0)";
160
+ &$movekey ($rndkey0,&QWP(0x30,$key));
161
+ eval"&aes${p} ($inout,$rndkey1)";
162
+ &$movekey ($rndkey1,&QWP(0x40,$key));
163
+ eval"&aes${p} ($inout,$rndkey0)";
164
+ &$movekey ($rndkey0,&QWP(0x50,$key));
165
+ eval"&aes${p} ($inout,$rndkey1)";
166
+ &$movekey ($rndkey1,&QWP(0x60,$key));
167
+ eval"&aes${p} ($inout,$rndkey0)";
168
+ &$movekey ($rndkey0,&QWP(0x70,$key));
169
+ eval"&aes${p} ($inout,$rndkey1)";
170
+ eval"&aes${p}last ($inout,$rndkey0)";
171
+ &ret();
172
+ &function_end_B("_aesni_${p}rypt1");
173
+ }
174
+
175
+ # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
176
+ &aesni_generate1("enc") if (!$inline);
177
+ &function_begin_B("${PREFIX}_encrypt");
178
+ &mov ("eax",&wparam(0));
179
+ &mov ($key,&wparam(2));
180
+ &movups ($inout0,&QWP(0,"eax"));
181
+ &mov ($rounds,&DWP(240,$key));
182
+ &mov ("eax",&wparam(1));
183
+ if ($inline)
184
+ { &aesni_inline_generate1("enc"); }
185
+ else
186
+ { &call ("_aesni_encrypt1"); }
187
+ &pxor ($rndkey0,$rndkey0); # clear register bank
188
+ &pxor ($rndkey1,$rndkey1);
189
+ &movups (&QWP(0,"eax"),$inout0);
190
+ &pxor ($inout0,$inout0);
191
+ &ret ();
192
+ &function_end_B("${PREFIX}_encrypt");
193
+
194
+ # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
195
+ &aesni_generate1("dec") if(!$inline);
196
+ &function_begin_B("${PREFIX}_decrypt");
197
+ &mov ("eax",&wparam(0));
198
+ &mov ($key,&wparam(2));
199
+ &movups ($inout0,&QWP(0,"eax"));
200
+ &mov ($rounds,&DWP(240,$key));
201
+ &mov ("eax",&wparam(1));
202
+ if ($inline)
203
+ { &aesni_inline_generate1("dec"); }
204
+ else
205
+ { &call ("_aesni_decrypt1"); }
206
+ &pxor ($rndkey0,$rndkey0); # clear register bank
207
+ &pxor ($rndkey1,$rndkey1);
208
+ &movups (&QWP(0,"eax"),$inout0);
209
+ &pxor ($inout0,$inout0);
210
+ &ret ();
211
+ &function_end_B("${PREFIX}_decrypt");
212
+
213
+ # _aesni_[en|de]cryptN are private interfaces, N denotes interleave
214
+ # factor. Why was a 3x subroutine originally used in loops? Even though
215
+ # aes[enc|dec] latency was originally 6, it could be scheduled only
216
+ # every *2nd* cycle. Thus 3x interleave was the one providing optimal
217
+ # utilization, i.e. when subroutine's throughput is virtually the same as
218
+ # of non-interleaved subroutine [for number of input blocks up to 3].
219
+ # This is why it originally made no sense to implement 2x subroutine.
220
+ # But times change and it became appropriate to spend extra 192 bytes
221
+ # on 2x subroutine on account of Atom Silvermont. For processors that
222
+ # can schedule aes[enc|dec] every cycle optimal interleave factor
223
+ # equals the corresponding instruction latency. 8x is optimal for
224
+ # * Bridge, but it's unfeasible to accommodate such implementation
225
+ # in XMM registers addressable in 32-bit mode and therefore maximum
226
+ # of 6x is used instead...
227
+
228
+ sub aesni_generate2
229
+ { my $p=shift;
230
+
231
+ &function_begin_B("_aesni_${p}rypt2");
232
+ &$movekey ($rndkey0,&QWP(0,$key));
233
+ &shl ($rounds,4);
234
+ &$movekey ($rndkey1,&QWP(16,$key));
235
+ &xorps ($inout0,$rndkey0);
236
+ &pxor ($inout1,$rndkey0);
237
+ &$movekey ($rndkey0,&QWP(32,$key));
238
+ &lea ($key,&DWP(32,$key,$rounds));
239
+ &neg ($rounds);
240
+ &add ($rounds,16);
241
+
242
+ &set_label("${p}2_loop");
243
+ eval"&aes${p} ($inout0,$rndkey1)";
244
+ eval"&aes${p} ($inout1,$rndkey1)";
245
+ &$movekey ($rndkey1,&QWP(0,$key,$rounds));
246
+ &add ($rounds,32);
247
+ eval"&aes${p} ($inout0,$rndkey0)";
248
+ eval"&aes${p} ($inout1,$rndkey0)";
249
+ &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
250
+ &jnz (&label("${p}2_loop"));
251
+ eval"&aes${p} ($inout0,$rndkey1)";
252
+ eval"&aes${p} ($inout1,$rndkey1)";
253
+ eval"&aes${p}last ($inout0,$rndkey0)";
254
+ eval"&aes${p}last ($inout1,$rndkey0)";
255
+ &ret();
256
+ &function_end_B("_aesni_${p}rypt2");
257
+ }
258
+
259
+ sub aesni_generate3
260
+ { my $p=shift;
261
+
262
+ &function_begin_B("_aesni_${p}rypt3");
263
+ &$movekey ($rndkey0,&QWP(0,$key));
264
+ &shl ($rounds,4);
265
+ &$movekey ($rndkey1,&QWP(16,$key));
266
+ &xorps ($inout0,$rndkey0);
267
+ &pxor ($inout1,$rndkey0);
268
+ &pxor ($inout2,$rndkey0);
269
+ &$movekey ($rndkey0,&QWP(32,$key));
270
+ &lea ($key,&DWP(32,$key,$rounds));
271
+ &neg ($rounds);
272
+ &add ($rounds,16);
273
+
274
+ &set_label("${p}3_loop");
275
+ eval"&aes${p} ($inout0,$rndkey1)";
276
+ eval"&aes${p} ($inout1,$rndkey1)";
277
+ eval"&aes${p} ($inout2,$rndkey1)";
278
+ &$movekey ($rndkey1,&QWP(0,$key,$rounds));
279
+ &add ($rounds,32);
280
+ eval"&aes${p} ($inout0,$rndkey0)";
281
+ eval"&aes${p} ($inout1,$rndkey0)";
282
+ eval"&aes${p} ($inout2,$rndkey0)";
283
+ &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
284
+ &jnz (&label("${p}3_loop"));
285
+ eval"&aes${p} ($inout0,$rndkey1)";
286
+ eval"&aes${p} ($inout1,$rndkey1)";
287
+ eval"&aes${p} ($inout2,$rndkey1)";
288
+ eval"&aes${p}last ($inout0,$rndkey0)";
289
+ eval"&aes${p}last ($inout1,$rndkey0)";
290
+ eval"&aes${p}last ($inout2,$rndkey0)";
291
+ &ret();
292
+ &function_end_B("_aesni_${p}rypt3");
293
+ }
294
+
295
+ # 4x interleave is implemented to improve small block performance,
296
+ # most notably [and naturally] 4 block by ~30%. One can argue that one
297
+ # should have implemented 5x as well, but improvement would be <20%,
298
+ # so it's not worth it...
299
+ sub aesni_generate4
300
+ { my $p=shift;
301
+
302
+ &function_begin_B("_aesni_${p}rypt4");
303
+ &$movekey ($rndkey0,&QWP(0,$key));
304
+ &$movekey ($rndkey1,&QWP(16,$key));
305
+ &shl ($rounds,4);
306
+ &xorps ($inout0,$rndkey0);
307
+ &pxor ($inout1,$rndkey0);
308
+ &pxor ($inout2,$rndkey0);
309
+ &pxor ($inout3,$rndkey0);
310
+ &$movekey ($rndkey0,&QWP(32,$key));
311
+ &lea ($key,&DWP(32,$key,$rounds));
312
+ &neg ($rounds);
313
+ &data_byte (0x0f,0x1f,0x40,0x00);
314
+ &add ($rounds,16);
315
+
316
+ &set_label("${p}4_loop");
317
+ eval"&aes${p} ($inout0,$rndkey1)";
318
+ eval"&aes${p} ($inout1,$rndkey1)";
319
+ eval"&aes${p} ($inout2,$rndkey1)";
320
+ eval"&aes${p} ($inout3,$rndkey1)";
321
+ &$movekey ($rndkey1,&QWP(0,$key,$rounds));
322
+ &add ($rounds,32);
323
+ eval"&aes${p} ($inout0,$rndkey0)";
324
+ eval"&aes${p} ($inout1,$rndkey0)";
325
+ eval"&aes${p} ($inout2,$rndkey0)";
326
+ eval"&aes${p} ($inout3,$rndkey0)";
327
+ &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
328
+ &jnz (&label("${p}4_loop"));
329
+
330
+ eval"&aes${p} ($inout0,$rndkey1)";
331
+ eval"&aes${p} ($inout1,$rndkey1)";
332
+ eval"&aes${p} ($inout2,$rndkey1)";
333
+ eval"&aes${p} ($inout3,$rndkey1)";
334
+ eval"&aes${p}last ($inout0,$rndkey0)";
335
+ eval"&aes${p}last ($inout1,$rndkey0)";
336
+ eval"&aes${p}last ($inout2,$rndkey0)";
337
+ eval"&aes${p}last ($inout3,$rndkey0)";
338
+ &ret();
339
+ &function_end_B("_aesni_${p}rypt4");
340
+ }
341
+
342
+ sub aesni_generate6
343
+ { my $p=shift;
344
+
345
+ &function_begin_B("_aesni_${p}rypt6");
346
+ &static_label("_aesni_${p}rypt6_enter");
347
+ &$movekey ($rndkey0,&QWP(0,$key));
348
+ &shl ($rounds,4);
349
+ &$movekey ($rndkey1,&QWP(16,$key));
350
+ &xorps ($inout0,$rndkey0);
351
+ &pxor ($inout1,$rndkey0); # pxor does better here
352
+ &pxor ($inout2,$rndkey0);
353
+ eval"&aes${p} ($inout0,$rndkey1)";
354
+ &pxor ($inout3,$rndkey0);
355
+ &pxor ($inout4,$rndkey0);
356
+ eval"&aes${p} ($inout1,$rndkey1)";
357
+ &lea ($key,&DWP(32,$key,$rounds));
358
+ &neg ($rounds);
359
+ eval"&aes${p} ($inout2,$rndkey1)";
360
+ &pxor ($inout5,$rndkey0);
361
+ &$movekey ($rndkey0,&QWP(0,$key,$rounds));
362
+ &add ($rounds,16);
363
+ &jmp (&label("_aesni_${p}rypt6_inner"));
364
+
365
+ &set_label("${p}6_loop",16);
366
+ eval"&aes${p} ($inout0,$rndkey1)";
367
+ eval"&aes${p} ($inout1,$rndkey1)";
368
+ eval"&aes${p} ($inout2,$rndkey1)";
369
+ &set_label("_aesni_${p}rypt6_inner");
370
+ eval"&aes${p} ($inout3,$rndkey1)";
371
+ eval"&aes${p} ($inout4,$rndkey1)";
372
+ eval"&aes${p} ($inout5,$rndkey1)";
373
+ &set_label("_aesni_${p}rypt6_enter");
374
+ &$movekey ($rndkey1,&QWP(0,$key,$rounds));
375
+ &add ($rounds,32);
376
+ eval"&aes${p} ($inout0,$rndkey0)";
377
+ eval"&aes${p} ($inout1,$rndkey0)";
378
+ eval"&aes${p} ($inout2,$rndkey0)";
379
+ eval"&aes${p} ($inout3,$rndkey0)";
380
+ eval"&aes${p} ($inout4,$rndkey0)";
381
+ eval"&aes${p} ($inout5,$rndkey0)";
382
+ &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
383
+ &jnz (&label("${p}6_loop"));
384
+
385
+ eval"&aes${p} ($inout0,$rndkey1)";
386
+ eval"&aes${p} ($inout1,$rndkey1)";
387
+ eval"&aes${p} ($inout2,$rndkey1)";
388
+ eval"&aes${p} ($inout3,$rndkey1)";
389
+ eval"&aes${p} ($inout4,$rndkey1)";
390
+ eval"&aes${p} ($inout5,$rndkey1)";
391
+ eval"&aes${p}last ($inout0,$rndkey0)";
392
+ eval"&aes${p}last ($inout1,$rndkey0)";
393
+ eval"&aes${p}last ($inout2,$rndkey0)";
394
+ eval"&aes${p}last ($inout3,$rndkey0)";
395
+ eval"&aes${p}last ($inout4,$rndkey0)";
396
+ eval"&aes${p}last ($inout5,$rndkey0)";
397
+ &ret();
398
+ &function_end_B("_aesni_${p}rypt6");
399
+ }
400
+ &aesni_generate2("enc") if ($PREFIX eq "aesni");
401
+ &aesni_generate2("dec");
402
+ &aesni_generate3("enc") if ($PREFIX eq "aesni");
403
+ &aesni_generate3("dec");
404
+ &aesni_generate4("enc") if ($PREFIX eq "aesni");
405
+ &aesni_generate4("dec");
406
+ &aesni_generate6("enc") if ($PREFIX eq "aesni");
407
+ &aesni_generate6("dec");
408
+
409
+ if ($PREFIX eq "aesni") {
410
+ ######################################################################
411
+ # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
412
+ # size_t blocks, const AES_KEY *key,
413
+ # const char *ivec,char *cmac);
414
+ #
415
+ # Handles only complete blocks, operates on 64-bit counter and
416
+ # does not update *ivec! Nor does it finalize CMAC value
417
+ # (see engine/eng_aesni.c for details)
418
+ #
419
+ { my $cmac=$inout1;
420
# Emits aesni_ccm64_encrypt_blocks(in, out, blocks, key, ivec, cmac).
#
# Per 16-byte block the generated code runs two interleaved AES
# encryptions with the same key schedule: one over the counter block
# ($inout0) to produce key stream, one over the running CMAC ($cmac)
# after the plaintext block has been xor-ed into it.
#
# Scratch frame (16-byte aligned):
#	 0	pshufb mask that reverses all 16 bytes
#	16	counter increment, dwords 1,0,0,0
#	48	caller's %esp
&function_begin("aesni_ccm64_encrypt_blocks");
	&mov	($inp,    &wparam(0));
	&mov	($out,    &wparam(1));
	&mov	($len,    &wparam(2));
	&mov	($key,    &wparam(3));
	&mov	($rounds_,&wparam(4));		# ivec pointer, for now
	&mov	($rounds, &wparam(5));		# cmac pointer, for now
	&mov	($key_,"esp");			# remember caller's stack pointer
	&sub	("esp",60);
	&and	("esp",-16);			# 16-byte align the frame
	&mov	(&DWP(48,"esp"),$key_);

	&movdqu	($ivec,&QWP(0,$rounds_));	# fetch counter block
	&movdqu	($cmac,&QWP(0,$rounds));	# fetch running CMAC
	&mov	($rounds,&DWP(240,$key));	# round count stored in the key schedule

	# byte-reversal mask for pshufb, written one dword at a time
	foreach my $i (0..3) {
		&mov	(&DWP(4*$i,"esp"),
			 (0x0c0d0e0f,0x08090a0b,0x04050607,0x00010203)[$i]);
	}

	# counter increment vector: low dword 1, remaining dwords 0
	&mov	($rounds_,1);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds_);
	foreach my $off (20,24,28) {
		&mov	(&DWP($off,"esp"),$key_);
	}

	# $key is moved past the schedule and $rounds_ becomes the negative
	# byte offset 16-16*rounds, so the round loop below counts up to zero
	&shl	($rounds,4);
	&mov	($rounds_,16);
	&lea	($key_,&DWP(0,$key));		# $key_ keeps the schedule start
	&movdqa	($inout3,&QWP(0,"esp"));	# mask stays in a register
	&movdqa	($inout0,$ivec);		# block to encrypt, original byte order
	&lea	($key,&DWP(32,$key,$rounds));
	&sub	($rounds_,$rounds);
	&pshufb	($ivec,$inout3);		# byte-reversed copy, used for paddq

&set_label("ccm64_enc_outer");
	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov		($rounds,$rounds_);	# reload loop offset
	&movups		($in0,&QWP(0,$inp));	# next input block

	&xorps		($inout0,$rndkey0);	# round 0 on the counter block
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($rndkey0,$in0);
	&xorps		($cmac,$rndkey0);	# cmac ^= inp ^ round-0 key
	&$movekey	($rndkey0,&QWP(32,$key_));

&set_label("ccm64_enc2_loop");
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);		# sets ZF for the jnz below
	&aesenc		($inout0,$rndkey0);
	&aesenc		($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("ccm64_enc2_loop"));
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&paddq		($ivec,&QWP(16,"esp"));	# bump byte-reversed counter
	&dec		($len);			# sets ZF for the outer jnz
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);

	&lea	($inp,&DWP(16,$inp));
	&xorps	($in0,$inout0);			# out = inp ^ E(counter)
	&movdqa	($inout0,$ivec);		# stage next counter block...
	&movups	(&QWP(0,$out),$in0);		# store output block
	&pshufb	($inout0,$inout3);		# ...and restore its byte order
	&lea	($out,&DWP(16,$out));
	&jnz	(&label("ccm64_enc_outer"));

	&mov	("esp",&DWP(48,"esp"));		# back to caller's stack
	&mov	($out,&wparam(5));
	&movups	(&QWP(0,$out),$cmac);		# hand the updated CMAC back

	# wipe xmm0-xmm7 before returning
	foreach my $n (0..7) {
		&pxor	("xmm$n","xmm$n");
	}
&function_end("aesni_ccm64_encrypt_blocks");
507
+
508
# Emits aesni_ccm64_decrypt_blocks(in, out, blocks, key, ivec, cmac).
#
# The first counter block is encrypted up front. Each pass of the
# outer loop then turns one input block into output, and -- unless it
# was the last block -- folds that output into the CMAC while
# encrypting the next counter block alongside. The CMAC update for the
# final output block is done separately at ccm64_dec_break.
#
# Scratch frame (16-byte aligned):
#	 0	pshufb mask that reverses all 16 bytes
#	16	counter increment, dwords 1,0,0,0
#	48	caller's %esp
&function_begin("aesni_ccm64_decrypt_blocks");
	&mov	($inp,    &wparam(0));
	&mov	($out,    &wparam(1));
	&mov	($len,    &wparam(2));
	&mov	($key,    &wparam(3));
	&mov	($rounds_,&wparam(4));		# ivec pointer, for now
	&mov	($rounds, &wparam(5));		# cmac pointer, for now
	&mov	($key_,"esp");			# remember caller's stack pointer
	&sub	("esp",60);
	&and	("esp",-16);			# 16-byte align the frame
	&mov	(&DWP(48,"esp"),$key_);

	&movdqu	($ivec,&QWP(0,$rounds_));	# fetch counter block
	&movdqu	($cmac,&QWP(0,$rounds));	# fetch running CMAC
	&mov	($rounds,&DWP(240,$key));	# round count stored in the key schedule

	# byte-reversal mask for pshufb, written one dword at a time
	foreach my $i (0..3) {
		&mov	(&DWP(4*$i,"esp"),
			 (0x0c0d0e0f,0x08090a0b,0x04050607,0x00010203)[$i]);
	}

	# counter increment vector: low dword 1, remaining dwords 0
	&mov	($rounds_,1);
	&xor	($key_,$key_);
	&mov	(&DWP(16,"esp"),$rounds_);
	foreach my $off (20,24,28) {
		&mov	(&DWP($off,"esp"),$key_);
	}

	&movdqa	($inout3,&QWP(0,"esp"));	# mask stays in a register
	&movdqa	($inout0,$ivec);		# first counter block to encrypt

	&mov	($key_,$key);			# keep schedule start and round
	&mov	($rounds_,$rounds);		# count across the helper below

	&pshufb	($ivec,$inout3);		# byte-reversed copy, used for paddq

	# single-block encryption of $inout0, inlined or via the shared helper
	if ($inline) {
		&aesni_inline_generate1("enc");
	} else {
		&call	("_aesni_encrypt1");
	}

	# From here on $key points past the schedule and $rounds_ holds the
	# negative byte offset 16-16*rounds, so the round loop counts up to zero.
	&shl	($rounds_,4);
	&mov	($rounds,16);
	&movups	($in0,&QWP(0,$inp));		# first input block
	&paddq	($ivec,&QWP(16,"esp"));		# bump byte-reversed counter
	&lea	($inp,&QWP(16,$inp));
	&sub	($rounds,$rounds_);
	&lea	($key,&DWP(32,$key_,$rounds_));
	&mov	($rounds_,$rounds);
	&jmp	(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_outer",16);
	&xorps	($in0,$inout0);			# out = inp ^ E(counter)
	&movdqa	($inout0,$ivec);		# stage next counter block...
	&movups	(&QWP(0,$out),$in0);		# store output block
	&lea	($out,&DWP(16,$out));
	&pshufb	($inout0,$inout3);		# ...and restore its byte order

	&sub	($len,1);
	&jz	(&label("ccm64_dec_break"));	# that was the last block

	&$movekey	($rndkey0,&QWP(0,$key_));
	&mov		($rounds,$rounds_);	# reload loop offset
	&$movekey	($rndkey1,&QWP(16,$key_));
	&xorps		($in0,$rndkey0);
	&xorps		($inout0,$rndkey0);	# round 0 on the counter block
	&xorps		($cmac,$in0);		# cmac ^= out ^ round-0 key
	&$movekey	($rndkey0,&QWP(32,$key_));

&set_label("ccm64_dec2_loop");
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&$movekey	($rndkey1,&QWP(0,$key,$rounds));
	&add		($rounds,32);		# sets ZF for the jnz below
	&aesenc		($inout0,$rndkey0);
	&aesenc		($cmac,$rndkey0);
	&$movekey	($rndkey0,&QWP(-16,$key,$rounds));
	&jnz		(&label("ccm64_dec2_loop"));
	&movups		($in0,&QWP(0,$inp));	# next input block
	&paddq		($ivec,&QWP(16,"esp"));	# bump byte-reversed counter
	&aesenc		($inout0,$rndkey1);
	&aesenc		($cmac,$rndkey1);
	&aesenclast	($inout0,$rndkey0);
	&aesenclast	($cmac,$rndkey0);
	&lea		($inp,&QWP(16,$inp));
	&jmp		(&label("ccm64_dec_outer"));

&set_label("ccm64_dec_break",16);
	# final CMAC update: restore plain $key/$rounds for the one-block
	# helper, which is told to work on $cmac with $in0 as its input
	&mov	($rounds,&DWP(240,$key_));
	&mov	($key,$key_);
	if ($inline) {
		&aesni_inline_generate1("enc",$cmac,$in0);
	} else {
		&call	("_aesni_encrypt1",$cmac);
	}

	&mov	("esp",&DWP(48,"esp"));		# back to caller's stack
	&mov	($out,&wparam(5));
	&movups	(&QWP(0,$out),$cmac);		# hand the updated CMAC back

	# wipe xmm0-xmm7 before returning
	foreach my $n (0..7) {
		&pxor	("xmm$n","xmm$n");
	}
&function_end("aesni_ccm64_decrypt_blocks");
616
+ }
617
+
618
######################################################################
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
#                                  size_t blocks, const AES_KEY *key,
#                                  const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see crypto/modes/ctr128.c for details)
#
# Structure of the generated code:
#   - blocks == 1 takes the ctr32_one_shortcut path and encrypts *ivec
#     as-is;
#   - otherwise six counters are kept on the stack as two triplets and
#     the main loop (ctr32_loop6) handles six blocks per pass;
#   - 1..5 leftover blocks are dispatched to ctr32_one/two/three/four
#     or, for five, to a _aesni_encrypt6 call whose sixth result is
#     simply not stored.
#
# stack layout (16-byte aligned frame):
#	0	pshufb mask (reverses all 16 bytes)
#	16	vector addend: dwords 0..2 = 6, dword 3 = 0
#	32	counter-less ivec (xor-ed with key[0] while in ctr32_loop6)
#	48	1st triplet of counter vector
#	64	2nd triplet of counter vector
#	80	saved %esp

&function_begin("aesni_ctr32_encrypt_blocks");
	&mov	($inp,&wparam(0));
	&mov	($out,&wparam(1));
	&mov	($len,&wparam(2));
	&mov	($key,&wparam(3));
	&mov	($rounds_,&wparam(4));		# ivec pointer, for now
	&mov	($key_,"esp");			# remember caller's stack pointer
	&sub	("esp",88);
	&and	("esp",-16);			# align stack
	&mov	(&DWP(80,"esp"),$key_);

	&cmp	($len,1);
	&je	(&label("ctr32_one_shortcut"));

	&movdqu	($inout5,&QWP(0,$rounds_));	# load ivec

	# compose byte-swap control mask for pshufb on stack
	foreach my $i (0..3) {
		&mov	(&DWP(4*$i,"esp"),
			 (0x0c0d0e0f,0x08090a0b,0x04050607,0x00010203)[$i]);
	}

	# compose counter increment vector on stack
	&mov	($rounds,6);
	&xor	($key_,$key_);
	foreach my $off (16,20,24) {
		&mov	(&DWP($off,"esp"),$rounds);
	}
	&mov	(&DWP(28,"esp"),$key_);

	&pextrd	($rounds_,$inout5,3);		# pull 32-bit counter
	&pinsrd	($inout5,$key_,3);		# wipe 32-bit counter

	&mov	($rounds,&DWP(240,$key));	# key->rounds

	# compose 2 vectors of 3x32-bit counters:
	# $rndkey0 = ctr,ctr+1,ctr+2 and $rndkey1 = ctr+3,ctr+4,ctr+5
	&bswap	($rounds_);			# counter to native byte order
	&pxor	($rndkey0,$rndkey0);
	&pxor	($rndkey1,$rndkey1);
	&movdqa	($inout0,&QWP(0,"esp"));	# load byte-swap mask
	&pinsrd	($rndkey0,$rounds_,0);
	&lea	($key_,&DWP(3,$rounds_));
	&pinsrd	($rndkey1,$key_,0);
	&inc	($rounds_);
	&pinsrd	($rndkey0,$rounds_,1);
	&inc	($key_);
	&pinsrd	($rndkey1,$key_,1);
	&inc	($rounds_);
	&pinsrd	($rndkey0,$rounds_,2);
	&inc	($key_);
	&pinsrd	($rndkey1,$key_,2);
	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
	&pshufb	($rndkey0,$inout0);		# byte swap
	&movdqu	($inout4,&QWP(0,$key));		# key[0]
	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
	&pshufb	($rndkey1,$inout0);		# byte swap

	&pshufd	($inout0,$rndkey0,3<<6);	# place counter to upper dword
	&pshufd	($inout1,$rndkey0,2<<6);
	&cmp	($len,6);
	&jb	(&label("ctr32_tail"));
	&pxor	($inout5,$inout4);		# counter-less ivec^key[0]
	&shl	($rounds,4);
	&mov	($rounds_,16);
	&movdqa	(&QWP(32,"esp"),$inout5);	# save counter-less ivec^key[0]
	&mov	($key_,$key);			# backup $key
	&sub	($rounds_,$rounds);		# backup twisted $rounds
	&lea	($key,&DWP(32,$key,$rounds));
	&sub	($len,6);
	&jmp	(&label("ctr32_loop6"));

&set_label("ctr32_loop6",16);
	# inlining _aesni_encrypt6's prologue gives ~6% improvement...
	# (round 0 is the xor with the saved ivec^key[0]; round 1 is done
	# here, the remaining rounds at _aesni_encrypt6_enter)
	&pshufd	($inout2,$rndkey0,1<<6);
	&movdqa	($rndkey0,&QWP(32,"esp"));	# pull counter-less ivec
	&pshufd	($inout3,$rndkey1,3<<6);
	&pxor		($inout0,$rndkey0);	# merge counter-less ivec
	&pshufd	($inout4,$rndkey1,2<<6);
	&pxor		($inout1,$rndkey0);
	&pshufd	($inout5,$rndkey1,1<<6);
	&$movekey	($rndkey1,&QWP(16,$key_));
	&pxor		($inout2,$rndkey0);
	&pxor		($inout3,$rndkey0);
	&aesenc		($inout0,$rndkey1);
	&pxor		($inout4,$rndkey0);
	&pxor		($inout5,$rndkey0);
	&aesenc		($inout1,$rndkey1);
	&$movekey	($rndkey0,&QWP(32,$key_));
	&mov		($rounds,$rounds_);
	&aesenc		($inout2,$rndkey1);
	&aesenc		($inout3,$rndkey1);
	&aesenc		($inout4,$rndkey1);
	&aesenc		($inout5,$rndkey1);

	&call		(&label("_aesni_encrypt6_enter"));

	# xor key stream with input and store, interleaved with advancing
	# both counter triplets by 6 for the next pass
	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout1,$rndkey0);
	&movups	(&QWP(0,$out),$inout0);
	&movdqa	($rndkey0,&QWP(16,"esp"));	# load increment
	&xorps	($inout2,$rndkey1);
	&movdqa	($rndkey1,&QWP(64,"esp"));	# load 2nd triplet
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);

	&paddd	($rndkey1,$rndkey0);		# 2nd triplet increment
	&paddd	($rndkey0,&QWP(48,"esp"));	# 1st triplet increment
	&movdqa	($inout0,&QWP(0,"esp"));	# load byte swap mask

	&movups	($inout1,&QWP(0x30,$inp));
	&movups	($inout2,&QWP(0x40,$inp));
	&xorps	($inout3,$inout1);
	&movups	($inout1,&QWP(0x50,$inp));
	&lea	($inp,&DWP(0x60,$inp));
	&movdqa	(&QWP(48,"esp"),$rndkey0);	# save 1st triplet
	&pshufb	($rndkey0,$inout0);		# byte swap
	&xorps	($inout4,$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&xorps	($inout5,$inout1);
	&movdqa	(&QWP(64,"esp"),$rndkey1);	# save 2nd triplet
	&pshufb	($rndkey1,$inout0);		# byte swap
	&movups	(&QWP(0x40,$out),$inout4);
	&pshufd	($inout0,$rndkey0,3<<6);
	&movups	(&QWP(0x50,$out),$inout5);
	&lea	($out,&DWP(0x60,$out));

	&pshufd	($inout1,$rndkey0,2<<6);
	&sub	($len,6);
	&jnc	(&label("ctr32_loop6"));	# at least 6 blocks were left

	&add	($len,6);			# undo the last subtraction
	&jz	(&label("ctr32_ret"));
	# the saved value is ivec^key[0]; xor with key[0] once more to get
	# the plain counter-less ivec that the tail code expects
	&movdqu	($inout5,&QWP(0,$key_));
	&mov	($key,$key_);
	&pxor	($inout5,&QWP(32,"esp"));	# restore count-less ivec
	&mov	($rounds,&DWP(240,$key_));	# restore $rounds

&set_label("ctr32_tail");
	# 1..5 blocks left; counters are merged in on demand while the
	# comparisons decide how many blocks to process
	&por	($inout0,$inout5);
	&cmp	($len,2);
	&jb	(&label("ctr32_one"));

	&pshufd	($inout2,$rndkey0,1<<6);
	&por	($inout1,$inout5);
	&je	(&label("ctr32_two"));		# flags still from cmp $len,2

	&pshufd	($inout3,$rndkey1,3<<6);
	&por	($inout2,$inout5);
	&cmp	($len,4);
	&jb	(&label("ctr32_three"));

	&pshufd	($inout4,$rndkey1,2<<6);
	&por	($inout3,$inout5);
	&je	(&label("ctr32_four"));		# flags still from cmp $len,4

	# five blocks: run the 6-block helper, store only five results
	&por	($inout4,$inout5);
	&call	("_aesni_encrypt6");
	&movups	($rndkey1,&QWP(0,$inp));
	&movups	($rndkey0,&QWP(0x10,$inp));
	&xorps	($inout0,$rndkey1);
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout1,$rndkey0);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout2,$rndkey1);
	&movups	($rndkey1,&QWP(0x40,$inp));
	&xorps	($inout3,$rndkey0);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout4,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);
	&movups	(&QWP(0x40,$out),$inout4);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_one_shortcut",16);
	&movups	($inout0,&QWP(0,$rounds_));	# load ivec
	&mov	($rounds,&DWP(240,$key));

&set_label("ctr32_one");
	if ($inline)
	{   &aesni_inline_generate1("enc");	}
	else
	{   &call	("_aesni_encrypt1");	}
	&movups	($in0,&QWP(0,$inp));
	&xorps	($in0,$inout0);
	&movups	(&QWP(0,$out),$in0);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_two",16);
	&call	("_aesni_encrypt2");
	&movups	($inout3,&QWP(0,$inp));
	&movups	($inout4,&QWP(0x10,$inp));
	&xorps	($inout0,$inout3);
	&xorps	($inout1,$inout4);
	&movups	(&QWP(0,$out),$inout0);
	&movups	(&QWP(0x10,$out),$inout1);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_three",16);
	&call	("_aesni_encrypt3");
	&movups	($inout3,&QWP(0,$inp));
	&movups	($inout4,&QWP(0x10,$inp));
	&xorps	($inout0,$inout3);
	&movups	($inout5,&QWP(0x20,$inp));
	&xorps	($inout1,$inout4);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout2,$inout5);
	&movups	(&QWP(0x10,$out),$inout1);
	&movups	(&QWP(0x20,$out),$inout2);
	&jmp	(&label("ctr32_ret"));

&set_label("ctr32_four",16);
	&call	("_aesni_encrypt4");
	&movups	($inout4,&QWP(0,$inp));
	&movups	($inout5,&QWP(0x10,$inp));
	&movups	($rndkey1,&QWP(0x20,$inp));
	&xorps	($inout0,$inout4);
	&movups	($rndkey0,&QWP(0x30,$inp));
	&xorps	($inout1,$inout5);
	&movups	(&QWP(0,$out),$inout0);
	&xorps	($inout2,$rndkey1);
	&movups	(&QWP(0x10,$out),$inout1);
	&xorps	($inout3,$rndkey0);
	&movups	(&QWP(0x20,$out),$inout2);
	&movups	(&QWP(0x30,$out),$inout3);

&set_label("ctr32_ret");
	# wipe xmm0-xmm7 and the ivec/counter slots of the frame; the
	# stores reuse xmm0, which is already zero at that point
	&pxor	("xmm0","xmm0");		# clear register bank
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&movdqa	(&QWP(32,"esp"),"xmm0");	# clear stack
	&pxor	("xmm5","xmm5");
	&movdqa	(&QWP(48,"esp"),"xmm0");
	&pxor	("xmm6","xmm6");
	&movdqa	(&QWP(64,"esp"),"xmm0");
	&pxor	("xmm7","xmm7");
	&mov	("esp",&DWP(80,"esp"));
&function_end("aesni_ctr32_encrypt_blocks");
}	# closes: if ($PREFIX eq "aesni")
877
+
878
######################################################################
# Mechanical port from aesni-x86_64.pl.
#
# _aesni_set_encrypt_key is private interface,
# input:
#	"eax"	const unsigned char *userKey
#	$rounds	int bits
#	$key	AES_KEY *key
# output:
#	"eax"	return code: 0 on success, -1 if userKey or key is NULL,
#		-2 if bits is not 128, 192 or 256
#	$rounds	rounds-1 (9, 11 or 13); the same value is stored at
#		byte offset 240 of the key schedule
#
# Two code paths exist per key size. The default one is built on
# aeskeygenassist. The *_alt one is taken when the second dword of
# OPENSSL_ia32cap_P, masked to the AVX and XOP bits, equals 1<<28; it
# derives round keys with pshufb+aesenclast and the constants at
# key_const.
#
# ebp and ebx are saved and restored here; xmm0-xmm5 are zeroed on the
# success path.

&function_begin_B("_aesni_set_encrypt_key");
	&push	("ebp");
	&push	("ebx");
	&test	("eax","eax");
	&jz	(&label("bad_pointer"));
	&test	($key,$key);
	&jz	(&label("bad_pointer"));

	# position-independent address of key_const in ebx
	&call	(&label("pic"));
&set_label("pic");
	&blindpop("ebx");
	&lea	("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));

	&picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
	&movups	("xmm0",&QWP(0,"eax"));	# pull first 128 bits of *userKey
	&xorps	("xmm4","xmm4");	# low dword of xmm4 is assumed 0
	&mov	("ebp",&DWP(4,"ebp"));
	&lea	($key,&DWP(16,$key));
	&and	("ebp",1<<28|1<<11);	# AVX and XOP bits
	&cmp	($rounds,256);
	&je	(&label("14rounds"));
	&cmp	($rounds,192);
	&je	(&label("12rounds"));
	&cmp	($rounds,128);
	&jne	(&label("bad_keybits"));

&set_label("10rounds",16);
	&cmp		("ebp",1<<28);
	&je		(&label("10rounds_alt"));

	&mov		($rounds,9);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	&aeskeygenassist("xmm1","xmm0",0x01);		# round 1
	&call		(&label("key_128_cold"));
	&aeskeygenassist("xmm1","xmm0",0x2);		# round 2
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x04);		# round 3
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x08);		# round 4
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x10);		# round 5
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x20);		# round 6
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x40);		# round 7
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x80);		# round 8
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x1b);		# round 9
	&call		(&label("key_128"));
	&aeskeygenassist("xmm1","xmm0",0x36);		# round 10
	&call		(&label("key_128"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov		(&DWP(80,$key),$rounds);	# schedule offset 240

	&jmp	(&label("good_key"));

	# key_128 stores the previous round key and advances $key;
	# key_128_cold skips that for the very first call
&set_label("key_128",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));
&set_label("key_128_cold");
	&shufps		("xmm4","xmm0",0b00010000);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm1","xmm1",0b11111111);	# critical path
	&xorps		("xmm0","xmm1");
	&ret();

&set_label("10rounds_alt",16);
	&movdqa		("xmm5",&QWP(0x00,"ebx"));	# pshufb mask
	&mov		($rounds,8);
	&movdqa		("xmm4",&QWP(0x20,"ebx"));	# round constant, starts at 1
	&movdqa		("xmm2","xmm0");
	&movdqu		(&QWP(-16,$key),"xmm0");

&set_label("loop_key128");
	&pshufb		("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");
	&pslld		("xmm4",1);			# next round constant
	&lea		($key,&DWP(16,$key));

	&movdqa		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm2","xmm3");

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(-16,$key),"xmm0");
	&movdqa		("xmm2","xmm0");

	&dec		($rounds);
	&jnz		(&label("loop_key128"));

	# last two round keys use the 0x1b constant row (and its double)
	&movdqa		("xmm4",&QWP(0x30,"ebx"));

	&pshufb		("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");
	&pslld		("xmm4",1);

	&movdqa		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm2","xmm3");

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(0,$key),"xmm0");

	&movdqa		("xmm2","xmm0");
	&pshufb		("xmm0","xmm5");
	&aesenclast	("xmm0","xmm4");

	&movdqa		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm3","xmm2");
	&pslldq		("xmm2",4);
	&pxor		("xmm2","xmm3");

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(16,$key),"xmm0");

	&mov		($rounds,9);
	&mov		(&DWP(96,$key),$rounds);	# schedule offset 240

	&jmp		(&label("good_key"));

&set_label("12rounds",16);
	&movq		("xmm2",&QWP(16,"eax"));	# remaining 1/3 of *userKey
	&cmp		("ebp",1<<28);
	&je		(&label("12rounds_alt"));

	&mov		($rounds,11);
	&$movekey	(&QWP(-16,$key),"xmm0");	# round 0
	&aeskeygenassist("xmm1","xmm2",0x01);		# round 1,2
	&call		(&label("key_192a_cold"));
	&aeskeygenassist("xmm1","xmm2",0x02);		# round 2,3
	&call		(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x04);		# round 4,5
	&call		(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x08);		# round 5,6
	&call		(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x10);		# round 7,8
	&call		(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x20);		# round 8,9
	&call		(&label("key_192b"));
	&aeskeygenassist("xmm1","xmm2",0x40);		# round 10,11
	&call		(&label("key_192a"));
	&aeskeygenassist("xmm1","xmm2",0x80);		# round 11,12
	&call		(&label("key_192b"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov		(&DWP(48,$key),$rounds);	# schedule offset 240

	&jmp	(&label("good_key"));

&set_label("key_192a",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));
&set_label("key_192a_cold",16);
	&movaps		("xmm5","xmm2");
&set_label("key_192b_warm");
	&shufps		("xmm4","xmm0",0b00010000);
	&movdqa		("xmm3","xmm2");
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&pslldq		("xmm3",4);
	&xorps		("xmm0","xmm4");
	&pshufd		("xmm1","xmm1",0b01010101);	# critical path
	&pxor		("xmm2","xmm3");
	&pxor		("xmm0","xmm1");
	&pshufd		("xmm3","xmm0",0b11111111);
	&pxor		("xmm2","xmm3");
	&ret();

&set_label("key_192b",16);
	&movaps		("xmm3","xmm0");
	&shufps		("xmm5","xmm0",0b01000100);
	&$movekey	(&QWP(0,$key),"xmm5");
	&shufps		("xmm3","xmm2",0b01001110);
	&$movekey	(&QWP(16,$key),"xmm3");
	&lea		($key,&DWP(32,$key));
	&jmp		(&label("key_192b_warm"));

&set_label("12rounds_alt",16);
	&movdqa		("xmm5",&QWP(0x10,"ebx"));	# pshufb mask
	&movdqa		("xmm4",&QWP(0x20,"ebx"));	# round constant, starts at 1
	&mov		($rounds,8);
	&movdqu		(&QWP(-16,$key),"xmm0");

&set_label("loop_key192");
	&movq		(&QWP(0,$key),"xmm2");
	&movdqa		("xmm1","xmm2");
	&pshufb		("xmm2","xmm5");
	&aesenclast	("xmm2","xmm4");
	&pslld		("xmm4",1);			# next round constant
	&lea		($key,&DWP(24,$key));		# 192 bits per pass

	&movdqa		("xmm3","xmm0");
	&pslldq		("xmm0",4);
	&pxor		("xmm3","xmm0");
	&pslldq		("xmm0",4);
	&pxor		("xmm3","xmm0");
	&pslldq		("xmm0",4);
	&pxor		("xmm0","xmm3");

	&pshufd		("xmm3","xmm0",0xff);
	&pxor		("xmm3","xmm1");
	&pslldq		("xmm1",4);
	&pxor		("xmm3","xmm1");

	&pxor		("xmm0","xmm2");
	&pxor		("xmm2","xmm3");
	&movdqu		(&QWP(-16,$key),"xmm0");

	&dec		($rounds);
	&jnz		(&label("loop_key192"));

	&mov	($rounds,11);
	&mov	(&DWP(32,$key),$rounds);		# schedule offset 240

	&jmp	(&label("good_key"));

&set_label("14rounds",16);
	&movups		("xmm2",&QWP(16,"eax"));	# remaining half of *userKey
	&lea		($key,&DWP(16,$key));
	&cmp		("ebp",1<<28);
	&je		(&label("14rounds_alt"));

	&mov		($rounds,13);
	&$movekey	(&QWP(-32,$key),"xmm0");	# round 0
	&$movekey	(&QWP(-16,$key),"xmm2");	# round 1
	&aeskeygenassist("xmm1","xmm2",0x01);		# round 2
	&call		(&label("key_256a_cold"));
	&aeskeygenassist("xmm1","xmm0",0x01);		# round 3
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x02);		# round 4
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x02);		# round 5
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x04);		# round 6
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x04);		# round 7
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x08);		# round 8
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x08);		# round 9
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x10);		# round 10
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x10);		# round 11
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x20);		# round 12
	&call		(&label("key_256a"));
	&aeskeygenassist("xmm1","xmm0",0x20);		# round 13
	&call		(&label("key_256b"));
	&aeskeygenassist("xmm1","xmm2",0x40);		# round 14
	&call		(&label("key_256a"));
	&$movekey	(&QWP(0,$key),"xmm0");
	&mov		(&DWP(16,$key),$rounds);	# schedule offset 240
	&xor		("eax","eax");

	&jmp	(&label("good_key"));

&set_label("key_256a",16);
	&$movekey	(&QWP(0,$key),"xmm2");
	&lea		($key,&DWP(16,$key));
&set_label("key_256a_cold");
	&shufps		("xmm4","xmm0",0b00010000);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm4","xmm0",0b10001100);
	&xorps		("xmm0","xmm4");
	&shufps		("xmm1","xmm1",0b11111111);	# critical path
	&xorps		("xmm0","xmm1");
	&ret();

&set_label("key_256b",16);
	&$movekey	(&QWP(0,$key),"xmm0");
	&lea		($key,&DWP(16,$key));

	&shufps		("xmm4","xmm2",0b00010000);
	&xorps		("xmm2","xmm4");
	&shufps		("xmm4","xmm2",0b10001100);
	&xorps		("xmm2","xmm4");
	&shufps		("xmm1","xmm1",0b10101010);	# critical path
	&xorps		("xmm2","xmm1");
	&ret();

&set_label("14rounds_alt",16);
	&movdqa		("xmm5",&QWP(0x00,"ebx"));	# pshufb mask
	&movdqa		("xmm4",&QWP(0x20,"ebx"));	# round constant, starts at 1
	&mov		($rounds,7);
	&movdqu		(&QWP(-32,$key),"xmm0");
	&movdqa		("xmm1","xmm2");
	&movdqu		(&QWP(-16,$key),"xmm2");

&set_label("loop_key256");
	&pshufb		("xmm2","xmm5");
	&aesenclast	("xmm2","xmm4");

	&movdqa		("xmm3","xmm0");
	&pslldq		("xmm0",4);
	&pxor		("xmm3","xmm0");
	&pslldq		("xmm0",4);
	&pxor		("xmm3","xmm0");
	&pslldq		("xmm0",4);
	&pxor		("xmm0","xmm3");
	&pslld		("xmm4",1);			# next round constant

	&pxor		("xmm0","xmm2");
	&movdqu		(&QWP(0,$key),"xmm0");

	&dec		($rounds);
	&jz		(&label("done_key256"));

	&pshufd		("xmm2","xmm0",0xff);
	&pxor		("xmm3","xmm3");
	&aesenclast	("xmm2","xmm3");		# zero round key here

	&movdqa		("xmm3","xmm1");
	&pslldq		("xmm1",4);
	&pxor		("xmm3","xmm1");
	&pslldq		("xmm1",4);
	&pxor		("xmm3","xmm1");
	&pslldq		("xmm1",4);
	&pxor		("xmm1","xmm3");

	&pxor		("xmm2","xmm1");
	&movdqu		(&QWP(16,$key),"xmm2");
	&lea		($key,&DWP(32,$key));
	&movdqa		("xmm1","xmm2");
	&jmp		(&label("loop_key256"));

&set_label("done_key256");
	&mov		($rounds,13);
	&mov		(&DWP(16,$key),$rounds);	# schedule offset 240

&set_label("good_key");
	# scrub key material from the registers used above
	&pxor	("xmm0","xmm0");
	&pxor	("xmm1","xmm1");
	&pxor	("xmm2","xmm2");
	&pxor	("xmm3","xmm3");
	&pxor	("xmm4","xmm4");
	&pxor	("xmm5","xmm5");
	&xor	("eax","eax");			# return 0
	&pop	("ebx");
	&pop	("ebp");
	&ret	();

&set_label("bad_pointer",4);
	&mov	("eax",-1);
	&pop	("ebx");
	&pop	("ebp");
	&ret	();
&set_label("bad_keybits",4);
	&pxor	("xmm0","xmm0");		# userKey bytes were already loaded
	&mov	("eax",-2);
	&pop	("ebx");
	&pop	("ebp");
	&ret	();
&function_end_B("_aesni_set_encrypt_key");
1257
+
1258
# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Public entry point: moves the three stack arguments into the
# registers _aesni_set_encrypt_key expects ("eax", $rounds, $key) and
# returns whatever that routine leaves in "eax".
&function_begin_B($PREFIX."_set_encrypt_key");
	&mov	("eax",  &wparam(0));	# userKey
	&mov	($rounds,&wparam(1));	# bits
	&mov	($key,   &wparam(2));	# key
	&call	("_aesni_set_encrypt_key");
	&ret	();
&function_end_B($PREFIX."_set_encrypt_key");
1267
+
1268
# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Builds the encryption schedule via _aesni_set_encrypt_key and, if
# that succeeded, converts it in place: round keys are swapped end for
# end, and every key except the two outermost ones is passed through
# aesimc. A non-zero code from the helper is returned unchanged.
&function_begin_B($PREFIX."_set_decrypt_key");
	&mov	("eax",  &wparam(0));	# userKey
	&mov	($rounds,&wparam(1));	# bits
	&mov	($key,   &wparam(2));	# key
	&call	("_aesni_set_encrypt_key");
	&mov	($key,&wparam(2));	# helper advanced $key, reload it
	&shl	($rounds,4);		# rounds-1 after _aesni_set_encrypt_key
	&test	("eax","eax");
	&jnz	(&label("dec_key_ret"));	# propagate the error code
	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule

	# outermost pair: exchanged as-is
	&$movekey	("xmm0",&QWP(0,$key));
	&$movekey	("xmm1",&QWP(0,"eax"));
	&$movekey	(&QWP(0,"eax"),"xmm0");
	&$movekey	(&QWP(0,$key),"xmm1");
	&lea		($key,&DWP(16,$key));
	&lea		("eax",&DWP(-16,"eax"));

	# inner pairs: exchanged and inverted, walking $key up and "eax"
	# down until the two pointers meet
&set_label("dec_key_inverse");
	&$movekey	("xmm0",&QWP(0,$key));
	&$movekey	("xmm1",&QWP(0,"eax"));
	&aesimc		("xmm0","xmm0");
	&aesimc		("xmm1","xmm1");
	&lea		($key,&DWP(16,$key));
	&lea		("eax",&DWP(-16,"eax"));
	&$movekey	(&QWP(16,"eax"),"xmm0");
	&$movekey	(&QWP(-16,$key),"xmm1");
	&cmp		("eax",$key);
	&ja		(&label("dec_key_inverse"));

	# middle key: inverted where it stands
	&$movekey	("xmm0",&QWP(0,$key));
	&aesimc		("xmm0","xmm0");
	&$movekey	(&QWP(0,$key),"xmm0");

	# scrub the two scratch registers
	foreach my $n (0..1) {
		&pxor	("xmm$n","xmm$n");
	}
	&xor		("eax","eax");		# return success
&set_label("dec_key_ret");
	&ret	();
&function_end_B($PREFIX."_set_decrypt_key");
1310
+
1311
# Constants for the *_alt key-setup paths of _aesni_set_encrypt_key,
# addressed relative to key_const. One row of four identical dwords each:
#	0x00	pshufb mask used for 128- and 256-bit keys
#	0x10	pshufb mask used for 192-bit keys
#	0x20	initial round constant (1)
#	0x30	round constant 0x1b, loaded for the last two 128-bit round keys
&set_label("key_const",64);
foreach my $dword (0x0c0f0e0d,0x04070605,1,0x1b) {
	&data_word(($dword) x 4);
}
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");

# flush the generated assembly
&asm_finish();