ring-native 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (261)
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/Gemfile +3 -0
  4. data/README.md +22 -0
  5. data/Rakefile +1 -0
  6. data/ext/ring/extconf.rb +29 -0
  7. data/lib/ring/native.rb +8 -0
  8. data/lib/ring/native/version.rb +5 -0
  9. data/ring-native.gemspec +25 -0
  10. data/vendor/ring/BUILDING.md +40 -0
  11. data/vendor/ring/Cargo.toml +43 -0
  12. data/vendor/ring/LICENSE +185 -0
  13. data/vendor/ring/Makefile +35 -0
  14. data/vendor/ring/PORTING.md +163 -0
  15. data/vendor/ring/README.md +113 -0
  16. data/vendor/ring/STYLE.md +197 -0
  17. data/vendor/ring/appveyor.yml +27 -0
  18. data/vendor/ring/build.rs +108 -0
  19. data/vendor/ring/crypto/aes/aes.c +1142 -0
  20. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +25 -0
  21. data/vendor/ring/crypto/aes/aes_test.cc +93 -0
  22. data/vendor/ring/crypto/aes/asm/aes-586.pl +2368 -0
  23. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +1249 -0
  24. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +2246 -0
  25. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +1318 -0
  26. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +2084 -0
  27. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +675 -0
  28. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +1364 -0
  29. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +1565 -0
  30. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +841 -0
  31. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +1116 -0
  32. data/vendor/ring/crypto/aes/internal.h +87 -0
  33. data/vendor/ring/crypto/aes/mode_wrappers.c +61 -0
  34. data/vendor/ring/crypto/bn/add.c +394 -0
  35. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +694 -0
  36. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +1503 -0
  37. data/vendor/ring/crypto/bn/asm/bn-586.pl +774 -0
  38. data/vendor/ring/crypto/bn/asm/co-586.pl +287 -0
  39. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +1882 -0
  40. data/vendor/ring/crypto/bn/asm/x86-mont.pl +592 -0
  41. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +599 -0
  42. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +1393 -0
  43. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +3507 -0
  44. data/vendor/ring/crypto/bn/bn.c +352 -0
  45. data/vendor/ring/crypto/bn/bn_asn1.c +74 -0
  46. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +25 -0
  47. data/vendor/ring/crypto/bn/bn_test.cc +1696 -0
  48. data/vendor/ring/crypto/bn/cmp.c +200 -0
  49. data/vendor/ring/crypto/bn/convert.c +433 -0
  50. data/vendor/ring/crypto/bn/ctx.c +311 -0
  51. data/vendor/ring/crypto/bn/div.c +594 -0
  52. data/vendor/ring/crypto/bn/exponentiation.c +1335 -0
  53. data/vendor/ring/crypto/bn/gcd.c +711 -0
  54. data/vendor/ring/crypto/bn/generic.c +1019 -0
  55. data/vendor/ring/crypto/bn/internal.h +316 -0
  56. data/vendor/ring/crypto/bn/montgomery.c +516 -0
  57. data/vendor/ring/crypto/bn/mul.c +888 -0
  58. data/vendor/ring/crypto/bn/prime.c +829 -0
  59. data/vendor/ring/crypto/bn/random.c +334 -0
  60. data/vendor/ring/crypto/bn/rsaz_exp.c +262 -0
  61. data/vendor/ring/crypto/bn/rsaz_exp.h +53 -0
  62. data/vendor/ring/crypto/bn/shift.c +276 -0
  63. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +25 -0
  64. data/vendor/ring/crypto/bytestring/bytestring_test.cc +421 -0
  65. data/vendor/ring/crypto/bytestring/cbb.c +399 -0
  66. data/vendor/ring/crypto/bytestring/cbs.c +227 -0
  67. data/vendor/ring/crypto/bytestring/internal.h +46 -0
  68. data/vendor/ring/crypto/chacha/chacha_generic.c +140 -0
  69. data/vendor/ring/crypto/chacha/chacha_vec.c +323 -0
  70. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +1447 -0
  71. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +153 -0
  72. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +25 -0
  73. data/vendor/ring/crypto/cipher/e_aes.c +390 -0
  74. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +208 -0
  75. data/vendor/ring/crypto/cipher/internal.h +173 -0
  76. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +543 -0
  77. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +9 -0
  78. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +475 -0
  79. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +23 -0
  80. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +422 -0
  81. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +484 -0
  82. data/vendor/ring/crypto/cipher/test/cipher_test.txt +100 -0
  83. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +25 -0
  84. data/vendor/ring/crypto/constant_time_test.c +304 -0
  85. data/vendor/ring/crypto/cpu-arm-asm.S +32 -0
  86. data/vendor/ring/crypto/cpu-arm.c +199 -0
  87. data/vendor/ring/crypto/cpu-intel.c +261 -0
  88. data/vendor/ring/crypto/crypto.c +151 -0
  89. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +2118 -0
  90. data/vendor/ring/crypto/curve25519/curve25519.c +4888 -0
  91. data/vendor/ring/crypto/curve25519/x25519_test.cc +128 -0
  92. data/vendor/ring/crypto/digest/md32_common.h +181 -0
  93. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +2725 -0
  94. data/vendor/ring/crypto/ec/ec.c +193 -0
  95. data/vendor/ring/crypto/ec/ec_curves.c +61 -0
  96. data/vendor/ring/crypto/ec/ec_key.c +228 -0
  97. data/vendor/ring/crypto/ec/ec_montgomery.c +114 -0
  98. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +25 -0
  99. data/vendor/ring/crypto/ec/internal.h +243 -0
  100. data/vendor/ring/crypto/ec/oct.c +253 -0
  101. data/vendor/ring/crypto/ec/p256-64.c +1794 -0
  102. data/vendor/ring/crypto/ec/p256-x86_64-table.h +9548 -0
  103. data/vendor/ring/crypto/ec/p256-x86_64.c +509 -0
  104. data/vendor/ring/crypto/ec/simple.c +1007 -0
  105. data/vendor/ring/crypto/ec/util-64.c +183 -0
  106. data/vendor/ring/crypto/ec/wnaf.c +508 -0
  107. data/vendor/ring/crypto/ecdh/ecdh.c +155 -0
  108. data/vendor/ring/crypto/ecdsa/ecdsa.c +304 -0
  109. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +193 -0
  110. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +25 -0
  111. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +327 -0
  112. data/vendor/ring/crypto/header_removed.h +17 -0
  113. data/vendor/ring/crypto/internal.h +495 -0
  114. data/vendor/ring/crypto/libring.Windows.vcxproj +101 -0
  115. data/vendor/ring/crypto/mem.c +98 -0
  116. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +1045 -0
  117. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +517 -0
  118. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +1393 -0
  119. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +1741 -0
  120. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +422 -0
  121. data/vendor/ring/crypto/modes/ctr.c +226 -0
  122. data/vendor/ring/crypto/modes/gcm.c +1206 -0
  123. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +25 -0
  124. data/vendor/ring/crypto/modes/gcm_test.c +348 -0
  125. data/vendor/ring/crypto/modes/internal.h +299 -0
  126. data/vendor/ring/crypto/perlasm/arm-xlate.pl +170 -0
  127. data/vendor/ring/crypto/perlasm/readme +100 -0
  128. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +1164 -0
  129. data/vendor/ring/crypto/perlasm/x86asm.pl +292 -0
  130. data/vendor/ring/crypto/perlasm/x86gas.pl +263 -0
  131. data/vendor/ring/crypto/perlasm/x86masm.pl +200 -0
  132. data/vendor/ring/crypto/perlasm/x86nasm.pl +187 -0
  133. data/vendor/ring/crypto/poly1305/poly1305.c +331 -0
  134. data/vendor/ring/crypto/poly1305/poly1305_arm.c +301 -0
  135. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +2015 -0
  136. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +25 -0
  137. data/vendor/ring/crypto/poly1305/poly1305_test.cc +80 -0
  138. data/vendor/ring/crypto/poly1305/poly1305_test.txt +52 -0
  139. data/vendor/ring/crypto/poly1305/poly1305_vec.c +892 -0
  140. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +75 -0
  141. data/vendor/ring/crypto/rand/internal.h +32 -0
  142. data/vendor/ring/crypto/rand/rand.c +189 -0
  143. data/vendor/ring/crypto/rand/urandom.c +219 -0
  144. data/vendor/ring/crypto/rand/windows.c +56 -0
  145. data/vendor/ring/crypto/refcount_c11.c +66 -0
  146. data/vendor/ring/crypto/refcount_lock.c +53 -0
  147. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +25 -0
  148. data/vendor/ring/crypto/refcount_test.c +58 -0
  149. data/vendor/ring/crypto/rsa/blinding.c +462 -0
  150. data/vendor/ring/crypto/rsa/internal.h +108 -0
  151. data/vendor/ring/crypto/rsa/padding.c +300 -0
  152. data/vendor/ring/crypto/rsa/rsa.c +450 -0
  153. data/vendor/ring/crypto/rsa/rsa_asn1.c +261 -0
  154. data/vendor/ring/crypto/rsa/rsa_impl.c +944 -0
  155. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +25 -0
  156. data/vendor/ring/crypto/rsa/rsa_test.cc +437 -0
  157. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +436 -0
  158. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +2390 -0
  159. data/vendor/ring/crypto/sha/asm/sha256-586.pl +1275 -0
  160. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +735 -0
  161. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +14 -0
  162. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +14 -0
  163. data/vendor/ring/crypto/sha/asm/sha512-586.pl +911 -0
  164. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +666 -0
  165. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +14 -0
  166. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +14 -0
  167. data/vendor/ring/crypto/sha/sha1.c +271 -0
  168. data/vendor/ring/crypto/sha/sha256.c +204 -0
  169. data/vendor/ring/crypto/sha/sha512.c +355 -0
  170. data/vendor/ring/crypto/test/file_test.cc +326 -0
  171. data/vendor/ring/crypto/test/file_test.h +181 -0
  172. data/vendor/ring/crypto/test/malloc.cc +150 -0
  173. data/vendor/ring/crypto/test/scoped_types.h +95 -0
  174. data/vendor/ring/crypto/test/test.Windows.vcxproj +35 -0
  175. data/vendor/ring/crypto/test/test_util.cc +46 -0
  176. data/vendor/ring/crypto/test/test_util.h +41 -0
  177. data/vendor/ring/crypto/thread_none.c +55 -0
  178. data/vendor/ring/crypto/thread_pthread.c +165 -0
  179. data/vendor/ring/crypto/thread_test.Windows.vcxproj +25 -0
  180. data/vendor/ring/crypto/thread_test.c +200 -0
  181. data/vendor/ring/crypto/thread_win.c +282 -0
  182. data/vendor/ring/examples/checkdigest.rs +103 -0
  183. data/vendor/ring/include/openssl/aes.h +121 -0
  184. data/vendor/ring/include/openssl/arm_arch.h +129 -0
  185. data/vendor/ring/include/openssl/base.h +156 -0
  186. data/vendor/ring/include/openssl/bn.h +794 -0
  187. data/vendor/ring/include/openssl/buffer.h +18 -0
  188. data/vendor/ring/include/openssl/bytestring.h +235 -0
  189. data/vendor/ring/include/openssl/chacha.h +37 -0
  190. data/vendor/ring/include/openssl/cmac.h +76 -0
  191. data/vendor/ring/include/openssl/cpu.h +184 -0
  192. data/vendor/ring/include/openssl/crypto.h +43 -0
  193. data/vendor/ring/include/openssl/curve25519.h +88 -0
  194. data/vendor/ring/include/openssl/ec.h +225 -0
  195. data/vendor/ring/include/openssl/ec_key.h +129 -0
  196. data/vendor/ring/include/openssl/ecdh.h +110 -0
  197. data/vendor/ring/include/openssl/ecdsa.h +156 -0
  198. data/vendor/ring/include/openssl/err.h +201 -0
  199. data/vendor/ring/include/openssl/mem.h +101 -0
  200. data/vendor/ring/include/openssl/obj_mac.h +71 -0
  201. data/vendor/ring/include/openssl/opensslfeatures.h +68 -0
  202. data/vendor/ring/include/openssl/opensslv.h +18 -0
  203. data/vendor/ring/include/openssl/ossl_typ.h +18 -0
  204. data/vendor/ring/include/openssl/poly1305.h +51 -0
  205. data/vendor/ring/include/openssl/rand.h +70 -0
  206. data/vendor/ring/include/openssl/rsa.h +399 -0
  207. data/vendor/ring/include/openssl/thread.h +133 -0
  208. data/vendor/ring/include/openssl/type_check.h +71 -0
  209. data/vendor/ring/mk/Common.props +63 -0
  210. data/vendor/ring/mk/Windows.props +42 -0
  211. data/vendor/ring/mk/WindowsTest.props +18 -0
  212. data/vendor/ring/mk/appveyor.bat +62 -0
  213. data/vendor/ring/mk/bottom_of_makefile.mk +54 -0
  214. data/vendor/ring/mk/ring.mk +266 -0
  215. data/vendor/ring/mk/top_of_makefile.mk +214 -0
  216. data/vendor/ring/mk/travis.sh +40 -0
  217. data/vendor/ring/mk/update-travis-yml.py +229 -0
  218. data/vendor/ring/ring.sln +153 -0
  219. data/vendor/ring/src/aead.rs +682 -0
  220. data/vendor/ring/src/agreement.rs +248 -0
  221. data/vendor/ring/src/c.rs +129 -0
  222. data/vendor/ring/src/constant_time.rs +37 -0
  223. data/vendor/ring/src/der.rs +96 -0
  224. data/vendor/ring/src/digest.rs +690 -0
  225. data/vendor/ring/src/digest_tests.txt +57 -0
  226. data/vendor/ring/src/ecc.rs +28 -0
  227. data/vendor/ring/src/ecc_build.rs +279 -0
  228. data/vendor/ring/src/ecc_curves.rs +117 -0
  229. data/vendor/ring/src/ed25519_tests.txt +2579 -0
  230. data/vendor/ring/src/exe_tests.rs +46 -0
  231. data/vendor/ring/src/ffi.rs +29 -0
  232. data/vendor/ring/src/file_test.rs +187 -0
  233. data/vendor/ring/src/hkdf.rs +153 -0
  234. data/vendor/ring/src/hkdf_tests.txt +59 -0
  235. data/vendor/ring/src/hmac.rs +414 -0
  236. data/vendor/ring/src/hmac_tests.txt +97 -0
  237. data/vendor/ring/src/input.rs +312 -0
  238. data/vendor/ring/src/lib.rs +41 -0
  239. data/vendor/ring/src/pbkdf2.rs +265 -0
  240. data/vendor/ring/src/pbkdf2_tests.txt +113 -0
  241. data/vendor/ring/src/polyfill.rs +57 -0
  242. data/vendor/ring/src/rand.rs +28 -0
  243. data/vendor/ring/src/signature.rs +314 -0
  244. data/vendor/ring/third-party/NIST/README.md +9 -0
  245. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +263 -0
  246. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +309 -0
  247. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +267 -0
  248. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +263 -0
  249. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +309 -0
  250. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +267 -0
  251. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +263 -0
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +309 -0
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +267 -0
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +519 -0
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +309 -0
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +523 -0
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +519 -0
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +309 -0
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +523 -0
  260. data/vendor/ring/third-party/NIST/sha256sums.txt +1 -0
  261. metadata +333 -0
@@ -0,0 +1,25 @@
1
<?xml version="1.0" encoding="utf-8"?>
<!-- MSBuild project for the aes_test executable: compiles aes_test.cc and
     links it against libring and the shared test-harness project. -->
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <PropertyGroup Label="Globals">
    <ProjectGuid>{1C3071CC-26DA-4790-B48A-3936DDD0E7E7}</ProjectGuid>
    <TargetName>aes_test</TargetName>
  </PropertyGroup>
  <ImportGroup Label="PropertySheets">
    <Import Project="..\..\mk\WindowsTest.props" />
  </ImportGroup>
  <PropertyGroup Label="Configuration">
    <OutDir>$(OutRootDir)test\ring\crypto\aes\</OutDir>
  </PropertyGroup>
  <ItemGroup>
    <ClCompile Include="aes_test.cc" />
  </ItemGroup>
  <ItemGroup>
    <ProjectReference Include="..\libring.Windows.vcxproj">
      <Project>{f4c0a1b6-5e09-41c8-8242-3e1f6762fb18}</Project>
    </ProjectReference>
    <ProjectReference Include="..\test\test.Windows.vcxproj">
      <Project>{1dace503-6498-492d-b1ff-f9ee18624443}</Project>
    </ProjectReference>
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
</Project>
@@ -0,0 +1,93 @@
1
+ /* Copyright (c) 2015, Google Inc.
2
+ *
3
+ * Permission to use, copy, modify, and/or distribute this software for any
4
+ * purpose with or without fee is hereby granted, provided that the above
5
+ * copyright notice and this permission notice appear in all copies.
6
+ *
7
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
10
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
12
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
13
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
14
+
15
+ #include <stdio.h>
16
+ #include <string.h>
17
+
18
+ #include <openssl/aes.h>
19
+ #include <openssl/crypto.h>
20
+
21
+
22
+ static bool TestAES(const uint8_t *key, size_t key_len,
23
+ const uint8_t plaintext[AES_BLOCK_SIZE],
24
+ const uint8_t ciphertext[AES_BLOCK_SIZE]) {
25
+ AES_KEY aes_key;
26
+ if (AES_set_encrypt_key(key, key_len * 8, &aes_key) != 0) {
27
+ fprintf(stderr, "AES_set_encrypt_key failed\n");
28
+ return false;
29
+ }
30
+
31
+ // Test encryption.
32
+ uint8_t block[AES_BLOCK_SIZE];
33
+ AES_encrypt(plaintext, block, &aes_key);
34
+ if (memcmp(block, ciphertext, AES_BLOCK_SIZE) != 0) {
35
+ fprintf(stderr, "AES_encrypt gave the wrong output\n");
36
+ return false;
37
+ }
38
+
39
+ // Test in-place encryption.
40
+ memcpy(block, plaintext, AES_BLOCK_SIZE);
41
+ AES_encrypt(block, block, &aes_key);
42
+ if (memcmp(block, ciphertext, AES_BLOCK_SIZE) != 0) {
43
+ fprintf(stderr, "AES_encrypt gave the wrong output\n");
44
+ return false;
45
+ }
46
+
47
+ if (AES_set_decrypt_key(key, key_len * 8, &aes_key) != 0) {
48
+ fprintf(stderr, "AES_set_decrypt_key failed\n");
49
+ return false;
50
+ }
51
+
52
+ // Test decryption.
53
+ AES_decrypt(ciphertext, block, &aes_key);
54
+ if (memcmp(block, plaintext, AES_BLOCK_SIZE) != 0) {
55
+ fprintf(stderr, "AES_decrypt gave the wrong output\n");
56
+ return false;
57
+ }
58
+
59
+ // Test in-place decryption.
60
+ memcpy(block, ciphertext, AES_BLOCK_SIZE);
61
+ AES_decrypt(block, block, &aes_key);
62
+ if (memcmp(block, plaintext, AES_BLOCK_SIZE) != 0) {
63
+ fprintf(stderr, "AES_decrypt gave the wrong output\n");
64
+ return false;
65
+ }
66
+ return true;
67
+ }
68
+
69
+ int main() {
70
+ CRYPTO_library_init();
71
+
72
+ // Test vectors from FIPS-197, Appendix C.
73
+ if (!TestAES((const uint8_t *)"\x00\x01\x02\x03\x04\x05\x06\x07"
74
+ "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
75
+ 128 / 8,
76
+ (const uint8_t *)"\x00\x11\x22\x33\x44\x55\x66\x77"
77
+ "\x88\x99\xaa\xbb\xcc\xdd\xee\xff",
78
+ (const uint8_t *)"\x69\xc4\xe0\xd8\x6a\x7b\x04\x30"
79
+ "\xd8\xcd\xb7\x80\x70\xb4\xc5\x5a") ||
80
+ !TestAES((const uint8_t *)"\x00\x01\x02\x03\x04\x05\x06\x07"
81
+ "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
82
+ "\x10\x11\x12\x13\x14\x15\x16\x17"
83
+ "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f",
84
+ 256 / 8,
85
+ (const uint8_t *)"\x00\x11\x22\x33\x44\x55\x66\x77"
86
+ "\x88\x99\xaa\xbb\xcc\xdd\xee\xff",
87
+ (const uint8_t *)"\x8e\xa2\xb7\xca\x51\x67\x45\xbf"
88
+ "\xea\xfc\x49\x90\x4b\x49\x60\x89")) {
89
+ return false;
90
+ }
91
+
92
+ return 0;
93
+ }
@@ -0,0 +1,2368 @@
1
+ #!/usr/bin/env perl
2
+ #
3
+ # ====================================================================
4
+ # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5
+ # project. The module is, however, dual licensed under OpenSSL and
6
+ # CRYPTOGAMS licenses depending on where you obtain it. For further
7
+ # details see http://www.openssl.org/~appro/cryptogams/.
8
+ # ====================================================================
9
+ #
10
+ # Version 4.3.
11
+ #
12
+ # You might fail to appreciate this module performance from the first
13
+ # try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
14
+ # to be *the* best Intel C compiler without -KPIC, performance appears
15
+ # to be virtually identical... But try to re-configure with shared
16
+ # library support... Aha! Intel compiler "suddenly" lags behind by 30%
17
+ # [on P4, more on others]:-) And if compared to position-independent
18
+ # code generated by GNU C, this code performs *more* than *twice* as
19
+ # fast! Yes, all this buzz about PIC means that unlike other hand-
20
+ # coded implementations, this one was explicitly designed to be safe
21
+ # to use even in shared library context... This also means that this
22
+ # code isn't necessarily absolutely fastest "ever," because in order
23
+ # to achieve position independence an extra register has to be
24
+ # off-loaded to stack, which affects the benchmark result.
25
+ #
26
+ # Special note about instruction choice. Do you recall RC4_INT code
27
+ # performing poorly on P4? It might be the time to figure out why.
28
+ # RC4_INT code implies effective address calculations in base+offset*4
29
+ # form. Trouble is that it seems that offset scaling turned to be
30
+ # critical path... At least eliminating scaling resulted in 2.8x RC4
31
+ # performance improvement [as you might recall]. As AES code is hungry
32
+ # for scaling too, I [try to] avoid the latter by favoring off-by-2
33
+ # shifts and masking the result with 0xFF<<2 instead of "boring" 0xFF.
34
+ #
35
+ # As was shown by Dean Gaudet <dean@arctic.org>, the above note turned
36
+ # void. Performance improvement with off-by-2 shifts was observed on
37
+ # intermediate implementation, which was spilling yet another register
38
+ # to stack... Final offset*4 code below runs just a tad faster on P4,
39
+ # but exhibits up to 10% improvement on other cores.
40
+ #
41
+ # Second version is "monolithic" replacement for aes_core.c, which in
42
+ # addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
43
+ # This made it possible to implement little-endian variant of the
44
+ # algorithm without modifying the base C code. Motivating factor for
45
+ # the undertaken effort was that it appeared that in tight IA-32
46
+ # register window little-endian flavor could achieve slightly higher
47
+ # Instruction Level Parallelism, and it indeed resulted in up to 15%
48
+ # better performance on most recent µ-archs...
49
+ #
50
+ # Third version adds AES_cbc_encrypt implementation, which resulted in
51
+ # up to 40% performance improvement of CBC benchmark results. 40% was
52
+ # observed on P4 core, where "overall" improvement coefficient, i.e. if
53
+ # compared to PIC generated by GCC and in CBC mode, was observed to be
54
+ # as large as 4x:-) CBC performance is virtually identical to ECB now
55
+ # and on some platforms even better, e.g. 17.6 "small" cycles/byte on
56
+ # Opteron, because certain function prologues and epilogues are
57
+ # effectively taken out of the loop...
58
+ #
59
+ # Version 3.2 implements compressed tables and prefetch of these tables
60
+ # in CBC[!] mode. Former means that 3/4 of table references are now
61
+ # misaligned, which unfortunately has negative impact on elder IA-32
62
+ # implementations, Pentium suffered 30% penalty, PIII - 10%.
63
+ #
64
+ # Version 3.3 avoids L1 cache aliasing between stack frame and
65
+ # S-boxes, and 3.4 - L1 cache aliasing even between key schedule. The
66
+ # latter is achieved by copying the key schedule to controlled place in
67
+ # stack. This unfortunately has rather strong impact on small block CBC
68
+ # performance, ~2x deterioration on 16-byte block if compared to 3.3.
69
+ #
70
+ # Version 3.5 checks if there is L1 cache aliasing between user-supplied
71
+ # key schedule and S-boxes and abstains from copying the former if
72
+ # there is no. This allows end-user to consciously retain small block
73
+ # performance by aligning key schedule in specific manner.
74
+ #
75
+ # Version 3.6 compresses Td4 to 256 bytes and prefetches it in ECB.
76
+ #
77
+ # Current ECB performance numbers for 128-bit key in CPU cycles per
78
+ # processed byte [measure commonly used by AES benchmarkers] are:
79
+ #
80
+ # small footprint fully unrolled
81
+ # P4 24 22
82
+ # AMD K8 20 19
83
+ # PIII 25 23
84
+ # Pentium 81 78
85
+ #
86
+ # Version 3.7 reimplements outer rounds as "compact." Meaning that
87
+ # first and last rounds reference compact 256 bytes S-box. This means
88
+ # that first round consumes a lot more CPU cycles and that encrypt
89
+ # and decrypt performance becomes asymmetric. Encrypt performance
90
+ # drops by 10-12%, while decrypt - by 20-25%:-( 256 bytes S-box is
91
+ # aggressively pre-fetched.
92
+ #
93
+ # Version 4.0 effectively rolls back to 3.6 and instead implements
94
+ # additional set of functions, _[x86|sse]_AES_[en|de]crypt_compact,
95
+ # which use exclusively 256 byte S-box. These functions are to be
96
+ # called in modes not concealing plain text, such as ECB, or when
97
+ # we're asked to process smaller amount of data [or unconditionally
98
+ # on hyper-threading CPU]. Currently it's called unconditionally from
99
+ # AES_[en|de]crypt, which affects all modes, but CBC. CBC routine
100
+ # still needs to be modified to switch between slower and faster
101
+ # mode when appropriate... But in either case benchmark landscape
102
+ # changes dramatically and below numbers are CPU cycles per processed
103
+ # byte for 128-bit key.
104
+ #
105
+ # ECB encrypt ECB decrypt CBC large chunk
106
+ # P4 52[54] 83[95] 23
107
+ # AMD K8 46[41] 66[70] 18
108
+ # PIII 41[50] 60[77] 24
109
+ # Core 2 31[36] 45[64] 18.5
110
+ # Atom 76[100] 96[138] 60
111
+ # Pentium 115 150 77
112
+ #
113
+ # Version 4.1 switches to compact S-box even in key schedule setup.
114
+ #
115
+ # Version 4.2 prefetches compact S-box in every SSE round or in other
116
+ # words every cache-line is *guaranteed* to be accessed within ~50
117
+ # cycles window. Why just SSE? Because it's needed on hyper-threading
118
+ # CPU! Which is also why it's prefetched with 64 byte stride. Best
119
+ # part is that it has no negative effect on performance:-)
120
+ #
121
+ # Version 4.3 implements switch between compact and non-compact block
122
+ # functions in AES_cbc_encrypt depending on how much data was asked
123
+ # to be processed in one stroke.
124
+ #
125
+ ######################################################################
126
+ # Timing attacks are classified in two classes: synchronous when
127
+ # attacker consciously initiates cryptographic operation and collects
128
+ # timing data of various character afterwards, and asynchronous when
129
+ # malicious code is executed on same CPU simultaneously with AES,
130
+ # instruments itself and performs statistical analysis of this data.
131
+ #
132
+ # As far as synchronous attacks go the root to the AES timing
133
+ # vulnerability is twofold. Firstly, of 256 S-box elements at most 160
134
+ # are referred to in single 128-bit block operation. Well, in C
135
+ # implementation with 4 distinct tables it's actually as little as 40
136
+ # references per 256 elements table, but anyway... Secondly, even
137
+ # though S-box elements are clustered into smaller amount of cache-
138
+ # lines, smaller than 160 and even 40, it turned out that for certain
139
+ # plain-text pattern[s] or simply put chosen plain-text and given key
140
+ # few cache-lines remain unaccessed during block operation. Now, if
141
+ # attacker can figure out this access pattern, he can deduct the key
142
+ # [or at least part of it]. The natural way to mitigate this kind of
143
+ # attacks is to minimize the amount of cache-lines in S-box and/or
144
+ # prefetch them to ensure that every one is accessed for more uniform
145
+ # timing. But note that *if* plain-text was concealed in such way that
146
+ # input to block function is distributed *uniformly*, then attack
147
+ # wouldn't apply. Now note that some encryption modes, most notably
148
+ # CBC, do mask the plain-text in this exact way [secure cipher output
149
+ # is distributed uniformly]. Yes, one still might find input that
150
+ # would reveal the information about given key, but if amount of
151
+ # candidate inputs to be tried is larger than amount of possible key
152
+ # combinations then attack becomes infeasible. This is why revised
153
+ # AES_cbc_encrypt "dares" to switch to larger S-box when larger chunk
154
+ # of data is to be processed in one stroke. The current size limit of
155
+ # 512 bytes is chosen to provide same [diminishingly low] probability
156
+ # for cache-line to remain untouched in large chunk operation with
157
+ # large S-box as for single block operation with compact S-box and
158
+ # surely needs more careful consideration...
159
+ #
160
+ # As for asynchronous attacks. There are two flavours: attacker code
161
+ # being interleaved with AES on hyper-threading CPU at *instruction*
162
+ # level, and two processes time sharing single core. As for latter.
163
+ # Two vectors. 1. Given that attacker process has higher priority,
164
+ # yield execution to process performing AES just before timer fires
165
+ # off the scheduler, immediately regain control of CPU and analyze the
166
+ # cache state. For this attack to be efficient attacker would have to
167
+ # effectively slow down the operation by several *orders* of magnitude,
168
+ # by ratio of time slice to duration of handful of AES rounds, which
169
+ # unlikely to remain unnoticed. Not to mention that this also means
170
+ # that he would spend correspondingly more time to collect enough
171
+ # statistical data to mount the attack. It's probably appropriate to
172
+ # say that if an adversary reckons that this attack is beneficial and
173
+ # risks to be noticed, you probably have larger problems having him
174
+ # mere opportunity. In other words suggested code design expects you
175
+ # to preclude/mitigate this attack by overall system security design.
176
+ # 2. Attacker manages to make his code interrupt driven. In order for
177
+ # this kind of attack to be feasible, interrupt rate has to be high
178
+ # enough, again comparable to duration of handful of AES rounds. But
179
+ # is there interrupt source of such rate? Hardly, not even 1Gbps NIC
180
+ # generates interrupts at such raging rate...
181
+ #
182
+ # And now back to the former, hyper-threading CPU or more specifically
183
+ # Intel P4. Recall that asynchronous attack implies that malicious
184
+ # code instruments itself. And naturally instrumentation granularity
185
+ # has to be noticeably lower than duration of codepath accessing S-box.
186
+ # Given that all cache-lines are accessed during that time that is.
187
+ # Current implementation accesses *all* cache-lines within ~50 cycles
188
+ # window, which is actually *less* than RDTSC latency on Intel P4!
189
+
190
# Locate this script's directory so the shared perlasm helpers can be found.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC, "${dir}", "${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],"aes-586.pl",$x86only = $ARGV[$#ARGV] eq "386");
&static_label("AES_Te");
&static_label("AES_Td");

# Register roles used throughout the block routines.
$s0="eax";		# state word 0
$s1="ebx";		# state word 1
$s2="ecx";		# state word 2
$s3="edx";		# state word 3
$key="edi";		# key-schedule pointer
$acc="esi";		# scratch accumulator
$tbl="ebp";		# lookup-table base

# Stack-frame slots inside the _[x86|sse]_AES_* routines; the frame itself
# is allocated by the caller.
$__ra=&DWP(0,"esp");	# return address
$__s0=&DWP(4,"esp");	# backing store for s0
$__s1=&DWP(8,"esp");	# backing store for s1
$__s2=&DWP(12,"esp");	# backing store for s2
$__s3=&DWP(16,"esp");	# backing store for s3
$__key=&DWP(20,"esp");	# pointer to key schedule
$__end=&DWP(24,"esp");	# pointer to end of key schedule
$__tbl=&DWP(28,"esp");	# backing store for %ebp

# AES_[en|de]crypt use a frame shifted by 4 bytes that shares the %ebp slot.
$_tbl=&DWP(24,"esp");
$_esp=&DWP(28,"esp");

# Emit each argument twice: the lookup tables are stored with every word
# duplicated.
sub _data_word() { my $w; &data_word($w,$w) while (defined($w=shift)); }

$speed_limit=512;	# CBC chunks smaller than this use the compact routine
$small_footprint=1;	# compact code is ~5% slower on recent micro-archs but
			# ~5x smaller; favored to cut cache contention in the
			# hope of winning that 5% back in real applications
$vertical_spin=0;	# "vertical" shift grouping stays off: proof-of-concept
# There is no decvert(), and the last encryption round is always done with
# "horizontal" shifts. The "vertical" variant [grouping shifts on a given
# $s[i] into a "column" rather than across different $s[i] as a "row"] is
# work in progress: a few percent faster on Intel cores, but a whole 12%
# slower on AMD K8 -- a trade-off left unresolved, so the code stays dormant
# by default.
244
# One "vertical" encryption round (experimental, see $vertical_spin):
# shifts on a given state word are grouped to form a "column".
# In:  $te = round-table pointer, @s = four state registers.
# Uses $acc/$key as scratch ($v0/$v1); spills s1/s2 to the stack and
# reloads $key from $__key on the way out.
sub encvert()
{ my ($te,@s) = @_;
  my ($v0,$v1) = ($acc,$key);

	&mov	($v0,$s[3]);			# copy s3
	&mov	(&DWP(4,"esp"),$s[2]);		# save s2
	&mov	($v1,$s[0]);			# copy s0
	&mov	(&DWP(8,"esp"),$s[1]);		# save s1

	&movz	($s[2],&HB($s[0]));
	&and	($s[0],0xFF);
	&mov	($s[0],&DWP(0,$te,$s[0],8));	# s0>>0
	&shr	($v1,16);
	&mov	($s[3],&DWP(3,$te,$s[2],8));	# s0>>8
	&movz	($s[1],&HB($v1));
	&and	($v1,0xFF);
	&mov	($s[2],&DWP(2,$te,$v1,8));	# s0>>16
	&mov	($v1,$v0);
	&mov	($s[1],&DWP(1,$te,$s[1],8));	# s0>>24

	&and	($v0,0xFF);
	&xor	($s[3],&DWP(0,$te,$v0,8));	# s3>>0
	&movz	($v0,&HB($v1));
	&shr	($v1,16);
	&xor	($s[2],&DWP(3,$te,$v0,8));	# s3>>8
	&movz	($v0,&HB($v1));
	&and	($v1,0xFF);
	&xor	($s[1],&DWP(2,$te,$v1,8));	# s3>>16
	&mov	($v1,&DWP(4,"esp"));		# restore s2
	&xor	($s[0],&DWP(1,$te,$v0,8));	# s3>>24

	&mov	($v0,$v1);
	&and	($v1,0xFF);
	&xor	($s[2],&DWP(0,$te,$v1,8));	# s2>>0
	&movz	($v1,&HB($v0));
	&shr	($v0,16);
	&xor	($s[1],&DWP(3,$te,$v1,8));	# s2>>8
	&movz	($v1,&HB($v0));
	&and	($v0,0xFF);
	&xor	($s[0],&DWP(2,$te,$v0,8));	# s2>>16
	&mov	($v0,&DWP(8,"esp"));		# restore s1
	&xor	($s[3],&DWP(1,$te,$v1,8));	# s2>>24

	&mov	($v1,$v0);
	&and	($v0,0xFF);
	&xor	($s[1],&DWP(0,$te,$v0,8));	# s1>>0
	&movz	($v0,&HB($v1));
	&shr	($v1,16);
	&xor	($s[0],&DWP(3,$te,$v0,8));	# s1>>8
	&movz	($v0,&HB($v1));
	&and	($v1,0xFF);
	&xor	($s[3],&DWP(2,$te,$v1,8));	# s1>>16
	&mov	($key,$__key);			# reincarnate v1 as key
	&xor	($s[2],&DWP(1,$te,$v0,8));	# s1>>24
	}
299
+
300
# Another experimental routine, which features "horizontal spin," but
# eliminates one reference to the stack. Strangely enough it runs
# slower... ($te is taken from the caller's scope; digit comments track
# which byte positions of the four state words remain live.)
sub enchoriz()
{ my ($v0,$v1) = ($key,$acc);

	&movz	($v0,&LB($s0));			#  3, 2, 1, 0*
	&rotr	($s2,8);			#  8,11,10, 9
	&mov	($v1,&DWP(0,$te,$v0,8));	#  0
	&movz	($v0,&HB($s1));			#  7, 6, 5*, 4
	&rotr	($s3,16);			# 13,12,15,14
	&xor	($v1,&DWP(3,$te,$v0,8));	#  5
	&movz	($v0,&HB($s2));			#  8,11,10*, 9
	&rotr	($s0,16);			#  1, 0, 3, 2
	&xor	($v1,&DWP(2,$te,$v0,8));	# 10
	&movz	($v0,&HB($s3));			# 13,12,15*,14
	&xor	($v1,&DWP(1,$te,$v0,8));	# 15, t[0] collected
	&mov	($__s0,$v1);			# t[0] saved

	&movz	($v0,&LB($s1));			#  7, 6, 5, 4*
	&shr	($s1,16);			#  -, -, 7, 6
	&mov	($v1,&DWP(0,$te,$v0,8));	#  4
	&movz	($v0,&LB($s3));			# 13,12,15,14*
	&xor	($v1,&DWP(2,$te,$v0,8));	# 14
	&movz	($v0,&HB($s0));			#  1, 0, 3*, 2
	&and	($s3,0xffff0000);		# 13,12, -, -
	&xor	($v1,&DWP(1,$te,$v0,8));	#  3
	&movz	($v0,&LB($s2));			#  8,11,10, 9*
	&or	($s3,$s1);			# 13,12, 7, 6
	&xor	($v1,&DWP(3,$te,$v0,8));	#  9, t[1] collected
	&mov	($s1,$v1);			# s[1]=t[1]

	&movz	($v0,&LB($s0));			#  1, 0, 3, 2*
	&shr	($s2,16);			#  -, -, 8,11
	&mov	($v1,&DWP(2,$te,$v0,8));	#  2
	&movz	($v0,&HB($s3));			# 13,12, 7*, 6
	&xor	($v1,&DWP(1,$te,$v0,8));	#  7
	&movz	($v0,&HB($s2));			#  -, -, 8*,11
	&xor	($v1,&DWP(0,$te,$v0,8));	#  8
	&mov	($v0,$s3);
	&shr	($v0,24);			# 13
	&xor	($v1,&DWP(3,$te,$v0,8));	# 13, t[2] collected

	&movz	($v0,&LB($s2));			#  -, -, 8,11*
	&shr	($s0,24);			#  1*
	&mov	($s2,&DWP(1,$te,$v0,8));	# 11
	&xor	($s2,&DWP(3,$te,$s0,8));	#  1
	&mov	($s0,$__s0);			# s[0]=t[0]
	&movz	($v0,&LB($s3));			# 13,12, 7, 6*
	&shr	($s3,16);			#  ,  ,13,12
	&xor	($s2,&DWP(2,$te,$v0,8));	#  6
	&mov	($key,$__key);			# reincarnate v0 as key
	&and	($s3,0xff);			#  ,  ,13,12*
	&mov	($s3,&DWP(0,$te,$s3,8));	# 12
	&xor	($s3,$s2);			# s[2]=t[3] collected
	&mov	($s2,$v1);			# s[2]=t[2]
	}
356
+
357
# More experimental code... an SSE/MMX round body. Even though it
# eliminates *all* references to the stack, it is not faster...
# (Digit comments track byte positions per the MMX layout diagram
# further down in this file.)
sub sse_encbody()
{
	&movz	($acc,&LB("eax"));		#  0
	&mov	("ecx",&DWP(0,$tbl,$acc,8));	#  0
	&pshufw	("mm2","mm0",0x0d);		#  7, 6, 3, 2
	&movz	("edx",&HB("eax"));		#  1
	&mov	("edx",&DWP(3,$tbl,"edx",8));	#  1
	&shr	("eax",16);			#  5, 4

	&movz	($acc,&LB("ebx"));		# 10
	&xor	("ecx",&DWP(2,$tbl,$acc,8));	# 10
	&pshufw	("mm6","mm4",0x08);		# 13,12, 9, 8
	&movz	($acc,&HB("ebx"));		# 11
	&xor	("edx",&DWP(1,$tbl,$acc,8));	# 11
	&shr	("ebx",16);			# 15,14

	&movz	($acc,&HB("eax"));		#  5
	&xor	("ecx",&DWP(3,$tbl,$acc,8));	#  5
	&movq	("mm3",QWP(16,$key));
	&movz	($acc,&HB("ebx"));		# 15
	&xor	("ecx",&DWP(1,$tbl,$acc,8));	# 15
	&movd	("mm0","ecx");			# t[0] collected

	&movz	($acc,&LB("eax"));		#  4
	&mov	("ecx",&DWP(0,$tbl,$acc,8));	#  4
	&movd	("eax","mm2");			#  7, 6, 3, 2
	&movz	($acc,&LB("ebx"));		# 14
	&xor	("ecx",&DWP(2,$tbl,$acc,8));	# 14
	&movd	("ebx","mm6");			# 13,12, 9, 8

	&movz	($acc,&HB("eax"));		#  3
	&xor	("ecx",&DWP(1,$tbl,$acc,8));	#  3
	&movz	($acc,&HB("ebx"));		#  9
	&xor	("ecx",&DWP(3,$tbl,$acc,8));	#  9
	&movd	("mm1","ecx");			# t[1] collected

	&movz	($acc,&LB("eax"));		#  2
	&mov	("ecx",&DWP(2,$tbl,$acc,8));	#  2
	&shr	("eax",16);			#  7, 6
	&punpckldq	("mm0","mm1");		# t[0,1] collected
	&movz	($acc,&LB("ebx"));		#  8
	&xor	("ecx",&DWP(0,$tbl,$acc,8));	#  8
	&shr	("ebx",16);			# 13,12

	&movz	($acc,&HB("eax"));		#  7
	&xor	("ecx",&DWP(1,$tbl,$acc,8));	#  7
	&pxor	("mm0","mm3");
	&movz	("eax",&LB("eax"));		#  6
	&xor	("edx",&DWP(2,$tbl,"eax",8));	#  6
	&pshufw	("mm1","mm0",0x08);		#  5, 4, 1, 0
	&movz	($acc,&HB("ebx"));		# 13
	&xor	("ecx",&DWP(3,$tbl,$acc,8));	# 13
	&xor	("ecx",&DWP(24,$key));		# t[2]
	&movd	("mm4","ecx");			# t[2] collected
	&movz	("ebx",&LB("ebx"));		# 12
	&xor	("edx",&DWP(0,$tbl,"ebx",8));	# 12
	&shr	("ecx",16);
	&movd	("eax","mm1");			#  5, 4, 1, 0
	&mov	("ebx",&DWP(28,$key));		# t[3]
	&xor	("ebx","edx");
	&movd	("mm5","ebx");			# t[3] collected
	&and	("ebx",0xffff0000);
	&or	("ebx","ecx");

	&punpckldq	("mm4","mm5");		# t[2,3] collected
	}
425
+
426
+ ######################################################################
427
+ # "Compact" block function
428
+ ######################################################################
429
+
430
# One quarter of a "compact" encryption round using the 256-byte Te4
# S-box at $te-128. $i selects which state word is being produced;
# a 7th argument (any value) marks a non-final round, in which case
# $Fn stays \&mov and restores $key from the stack on the $i==3 pass.
sub enccompact()
{ my $Fn = \&mov;
  while ($#_>5) { pop(@_); $Fn=sub{}; }
  my ($i,$te,@s)=@_;
  my $tmp = $key;
  my $out = $i==3?$s[0]:$acc;

	# $Fn is used in the first compact round; its purpose is to
	# suppress restoration of some values from the stack, so that
	# after 4xenccompact with the extra argument the $key value is
	# left there...
	if ($i==3)  { &$Fn	($key,$__key); }##%edx
	else        { &mov	($out,$s[0]); }
	&and	($out,0xFF);
	if ($i==1)  { &shr	($s[0],16); }	#%ebx[1]
	if ($i==2)  { &shr	($s[0],24); }	#%ecx[2]
	&movz	($out,&BP(-128,$te,$out,1));

	if ($i==3)  { $tmp=$s[1]; }		##%eax
	&movz	($tmp,&HB($s[1]));
	&movz	($tmp,&BP(-128,$te,$tmp,1));
	&shl	($tmp,8);
	&xor	($out,$tmp);

	if ($i==3)  { $tmp=$s[2]; &mov ($s[1],$__s0); }	##%ebx
	else        { &mov	($tmp,$s[2]);
		      &shr	($tmp,16); }
	if ($i==2)  { &and	($s[1],0xFF); }	#%edx[2]
	&and	($tmp,0xFF);
	&movz	($tmp,&BP(-128,$te,$tmp,1));
	&shl	($tmp,16);
	&xor	($out,$tmp);

	if ($i==3)  { $tmp=$s[3]; &mov ($s[2],$__s1); }	##%ecx
	elsif($i==2){ &movz	($tmp,&HB($s[3])); }	#%ebx[2]
	else        { &mov	($tmp,$s[3]);
		      &shr	($tmp,24); }
	&movz	($tmp,&BP(-128,$te,$tmp,1));
	&shl	($tmp,24);
	&xor	($out,$tmp);
	if ($i<2)   { &mov	(&DWP(4+4*$i,"esp"),$out); }
	if ($i==3)  { &mov	($s[3],$acc); }
	&comment();
	}
473
+
474
# MixColumns for one state word in the "compact" encryption path,
# computed arithmetically (xtime via shift/mask/conditional 0x1b).
# Expects $tbl to hold the 0x80808080 mask on entry; reloads it for
# the next invocation except when $i==1 (the final one in the round).
sub enctransform()
{ my @s = ($s0,$s1,$s2,$s3);
  my $i = shift;
  my $tmp = $tbl;
  my $r2 = $key ;

	&and	($tmp,$s[$i]);
	&lea	($r2,&DWP(0,$s[$i],$s[$i]));
	&mov	($acc,$tmp);
	&shr	($tmp,7);
	&and	($r2,0xfefefefe);
	&sub	($acc,$tmp);
	&mov	($tmp,$s[$i]);
	&and	($acc,0x1b1b1b1b);
	&rotr	($tmp,16);
	&xor	($acc,$r2);	# r2
	&mov	($r2,$s[$i]);

	&xor	($s[$i],$acc);	# r0 ^ r2
	&rotr	($r2,16+8);
	&xor	($acc,$tmp);
	&rotl	($s[$i],24);
	&xor	($acc,$r2);
	&mov	($tmp,0x80808080)	if ($i!=1);
	&xor	($s[$i],$acc);	# ROTATE(r2^r0,24) ^ r2
	}
500
+
501
# Emit _x86_AES_encrypt_compact: full AES encryption of one block using
# the 256-byte Te4 table only (cache-timing hardened by touching every
# cache line of the table up front).
&function_begin_B("_x86_AES_encrypt_compact");
	# note that the caller is expected to allocate the stack frame!
	&mov	($__key,$key);			# save key

	&xor	($s0,&DWP(0,$key));		# xor with key
	&xor	($s1,&DWP(4,$key));
	&xor	($s2,&DWP(8,$key));
	&xor	($s3,&DWP(12,$key));

	&mov	($acc,&DWP(240,$key));		# load key->rounds
	&lea	($acc,&DWP(-2,$acc,$acc));	# = 2*rounds-2
	&lea	($acc,&DWP(0,$key,$acc,8));	# key + 16*(rounds-1)
	&mov	($__end,$acc);			# end of key schedule

	# prefetch Te4 (pull every cache line; loads are discarded)
	&mov	($key,&DWP(0-128,$tbl));
	&mov	($acc,&DWP(32-128,$tbl));
	&mov	($key,&DWP(64-128,$tbl));
	&mov	($acc,&DWP(96-128,$tbl));
	&mov	($key,&DWP(128-128,$tbl));
	&mov	($acc,&DWP(160-128,$tbl));
	&mov	($key,&DWP(192-128,$tbl));
	&mov	($acc,&DWP(224-128,$tbl));

	&set_label("loop",16);

		&enccompact(0,$tbl,$s0,$s1,$s2,$s3,1);
		&enccompact(1,$tbl,$s1,$s2,$s3,$s0,1);
		&enccompact(2,$tbl,$s2,$s3,$s0,$s1,1);
		&enccompact(3,$tbl,$s3,$s0,$s1,$s2,1);
		&mov	($tbl,0x80808080);	# mask for enctransform
		&enctransform(2);
		&enctransform(3);
		&enctransform(0);
		&enctransform(1);
		&mov	($key,$__key);
		&mov	($tbl,$__tbl);
		&add	($key,16);		# advance rd_key
		&xor	($s0,&DWP(0,$key));
		&xor	($s1,&DWP(4,$key));
		&xor	($s2,&DWP(8,$key));
		&xor	($s3,&DWP(12,$key));

	&cmp	($key,$__end);
	&mov	($__key,$key);
	&jb	(&label("loop"));

	# final round: SubBytes/ShiftRows without MixColumns
	&enccompact(0,$tbl,$s0,$s1,$s2,$s3);
	&enccompact(1,$tbl,$s1,$s2,$s3,$s0);
	&enccompact(2,$tbl,$s2,$s3,$s0,$s1);
	&enccompact(3,$tbl,$s3,$s0,$s1,$s2);

	&xor	($s0,&DWP(16,$key));
	&xor	($s1,&DWP(20,$key));
	&xor	($s2,&DWP(24,$key));
	&xor	($s3,&DWP(28,$key));

	&ret	();
&function_end_B("_x86_AES_encrypt_compact");
560
+
561
+ ######################################################################
562
+ # "Compact" SSE block function.
563
+ ######################################################################
564
+ #
565
+ # Performance is not actually extraordinary in comparison to pure
566
+ # x86 code. In particular encrypt performance is virtually the same.
567
+ # Decrypt performance on the other hand is 15-20% better on newer
568
+ # µ-archs [but we're thankful for *any* improvement here], and ~50%
569
+ # better on PIII:-) And additionally on the pros side this code
570
+ # eliminates redundant references to stack and thus relieves/
571
+ # minimizes the pressure on the memory bus.
572
+ #
573
+ # MMX register layout lsb
574
+ # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
575
+ # | mm4 | mm0 |
576
+ # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
577
+ # | s3 | s2 | s1 | s0 |
578
+ # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
579
+ # |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
580
+ # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
581
+ #
582
+ # Indexes translate as s[N/4]>>(8*(N%4)), e.g. 5 means s1>>8.
583
+ # In this terms encryption and decryption "compact" permutation
584
+ # matrices can be depicted as following:
585
+ #
586
+ # encryption lsb # decryption lsb
587
+ # +----++----+----+----+----+ # +----++----+----+----+----+
588
+ # | t0 || 15 | 10 | 5 | 0 | # | t0 || 7 | 10 | 13 | 0 |
589
+ # +----++----+----+----+----+ # +----++----+----+----+----+
590
+ # | t1 || 3 | 14 | 9 | 4 | # | t1 || 11 | 14 | 1 | 4 |
591
+ # +----++----+----+----+----+ # +----++----+----+----+----+
592
+ # | t2 || 7 | 2 | 13 | 8 | # | t2 || 15 | 2 | 5 | 8 |
593
+ # +----++----+----+----+----+ # +----++----+----+----+----+
594
+ # | t3 || 11 | 6 | 1 | 12 | # | t3 || 3 | 6 | 9 | 12 |
595
+ # +----++----+----+----+----+ # +----++----+----+----+----+
596
+ #
597
+ ######################################################################
598
+ # Why not xmm registers? Short answer. It was actually tested and
599
+ # was not any faster, but *contrary*, most notably on Intel CPUs.
600
+ # Longer answer. Main advantage of using mm registers is that movd
601
+ # latency is lower, especially on Intel P4. While arithmetic
602
+ # instructions are twice as many, they can be scheduled every cycle
603
+ # and not every second one when they are operating on xmm register,
604
+ # so that "arithmetic throughput" remains virtually the same. And
605
+ # finally the code can be executed even on elder SSE-only CPUs:-)
606
+
607
# One "compact" encryption round in the MMX/SSE flavour; state enters
# and leaves in mm0/mm4 per the layout diagram above. Spills $key to
# $__key while %edi doubles as a byte index, restoring it at the end.
sub sse_enccompact()
{
	&pshufw	("mm1","mm0",0x08);		#  5, 4, 1, 0
	&pshufw	("mm5","mm4",0x0d);		# 15,14,11,10
	&movd	("eax","mm1");			#  5, 4, 1, 0
	&movd	("ebx","mm5");			# 15,14,11,10
	&mov	($__key,$key);

	&movz	($acc,&LB("eax"));		#  0
	&movz	("edx",&HB("eax"));		#  1
	&pshufw	("mm2","mm0",0x0d);		#  7, 6, 3, 2
	&movz	("ecx",&BP(-128,$tbl,$acc,1));	#  0
	&movz	($key,&LB("ebx"));		# 10
	&movz	("edx",&BP(-128,$tbl,"edx",1));	#  1
	&shr	("eax",16);			#  5, 4
	&shl	("edx",8);			#  1

	&movz	($acc,&BP(-128,$tbl,$key,1));	# 10
	&movz	($key,&HB("ebx"));		# 11
	&shl	($acc,16);			# 10
	&pshufw	("mm6","mm4",0x08);		# 13,12, 9, 8
	&or	("ecx",$acc);			# 10
	&movz	($acc,&BP(-128,$tbl,$key,1));	# 11
	&movz	($key,&HB("eax"));		#  5
	&shl	($acc,24);			# 11
	&shr	("ebx",16);			# 15,14
	&or	("edx",$acc);			# 11

	&movz	($acc,&BP(-128,$tbl,$key,1));	#  5
	&movz	($key,&HB("ebx"));		# 15
	&shl	($acc,8);			#  5
	&or	("ecx",$acc);			#  5
	&movz	($acc,&BP(-128,$tbl,$key,1));	# 15
	&movz	($key,&LB("eax"));		#  4
	&shl	($acc,24);			# 15
	&or	("ecx",$acc);			# 15

	&movz	($acc,&BP(-128,$tbl,$key,1));	#  4
	&movz	($key,&LB("ebx"));		# 14
	&movd	("eax","mm2");			#  7, 6, 3, 2
	&movd	("mm0","ecx");			# t[0] collected
	&movz	("ecx",&BP(-128,$tbl,$key,1));	# 14
	&movz	($key,&HB("eax"));		#  3
	&shl	("ecx",16);			# 14
	&movd	("ebx","mm6");			# 13,12, 9, 8
	&or	("ecx",$acc);			# 14

	&movz	($acc,&BP(-128,$tbl,$key,1));	#  3
	&movz	($key,&HB("ebx"));		#  9
	&shl	($acc,24);			#  3
	&or	("ecx",$acc);			#  3
	&movz	($acc,&BP(-128,$tbl,$key,1));	#  9
	&movz	($key,&LB("ebx"));		#  8
	&shl	($acc,8);			#  9
	&shr	("ebx",16);			# 13,12
	&or	("ecx",$acc);			#  9

	&movz	($acc,&BP(-128,$tbl,$key,1));	#  8
	&movz	($key,&LB("eax"));		#  2
	&shr	("eax",16);			#  7, 6
	&movd	("mm1","ecx");			# t[1] collected
	&movz	("ecx",&BP(-128,$tbl,$key,1));	#  2
	&movz	($key,&HB("eax"));		#  7
	&shl	("ecx",16);			#  2
	&and	("eax",0xff);			#  6
	&or	("ecx",$acc);			#  2

	&punpckldq	("mm0","mm1");		# t[0,1] collected

	&movz	($acc,&BP(-128,$tbl,$key,1));	#  7
	&movz	($key,&HB("ebx"));		# 13
	&shl	($acc,24);			#  7
	&and	("ebx",0xff);			# 12
	&movz	("eax",&BP(-128,$tbl,"eax",1));	#  6
	&or	("ecx",$acc);			#  7
	&shl	("eax",16);			#  6
	&movz	($acc,&BP(-128,$tbl,$key,1));	# 13
	&or	("edx","eax");			#  6
	&shl	($acc,8);			# 13
	&movz	("ebx",&BP(-128,$tbl,"ebx",1));	# 12
	&or	("ecx",$acc);			# 13
	&or	("edx","ebx");			# 12
	&mov	($key,$__key);
	&movd	("mm4","ecx");			# t[2] collected
	&movd	("mm5","edx");			# t[3] collected

	&punpckldq	("mm4","mm5");		# t[2,3] collected
	}
695
+
696
# Emit _sse_AES_encrypt_compact (MMX/SSE flavour) unless a 386-only
# build was requested. Per-round MixColumns is done in parallel on
# mm0/mm4 with byte-wise xtime via pcmpgtb/paddb.
if (!$x86only) {
&function_begin_B("_sse_AES_encrypt_compact");
	&pxor	("mm0",&QWP(0,$key));	#  7, 6, 5, 4, 3, 2, 1, 0
	&pxor	("mm4",&QWP(8,$key));	# 15,14,13,12,11,10, 9, 8

	# note that the caller is expected to allocate the stack frame!
	&mov	($acc,&DWP(240,$key));		# load key->rounds
	&lea	($acc,&DWP(-2,$acc,$acc));
	&lea	($acc,&DWP(0,$key,$acc,8));
	&mov	($__end,$acc);			# end of key schedule

	&mov	($s0,0x1b1b1b1b);		# magic constant
	&mov	(&DWP(8,"esp"),$s0);
	&mov	(&DWP(12,"esp"),$s0);

	# prefetch Te4 (pull every cache line; loads are discarded)
	&mov	($s0,&DWP(0-128,$tbl));
	&mov	($s1,&DWP(32-128,$tbl));
	&mov	($s2,&DWP(64-128,$tbl));
	&mov	($s3,&DWP(96-128,$tbl));
	&mov	($s0,&DWP(128-128,$tbl));
	&mov	($s1,&DWP(160-128,$tbl));
	&mov	($s2,&DWP(192-128,$tbl));
	&mov	($s3,&DWP(224-128,$tbl));

	&set_label("loop",16);
		&sse_enccompact();
		&add	($key,16);
		&cmp	($key,$__end);
		&ja	(&label("out"));

		# MixColumns on both halves of the state in lock-step
		&movq	("mm2",&QWP(8,"esp"));
		&pxor	("mm3","mm3");		&pxor	("mm7","mm7");
		&movq	("mm1","mm0");		&movq	("mm5","mm4");	# r0
		&pcmpgtb("mm3","mm0");		&pcmpgtb("mm7","mm4");
		&pand	("mm3","mm2");		&pand	("mm7","mm2");
		&pshufw	("mm2","mm0",0xb1);	&pshufw	("mm6","mm4",0xb1);# ROTATE(r0,16)
		&paddb	("mm0","mm0");		&paddb	("mm4","mm4");
		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# = r2
		&pshufw	("mm3","mm2",0xb1);	&pshufw	("mm7","mm6",0xb1);# r0
		&pxor	("mm1","mm0");		&pxor	("mm5","mm4");	# r0^r2
		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");	# ^= ROTATE(r0,16)

		&movq	("mm2","mm3");		&movq	("mm6","mm7");
		&pslld	("mm3",8);		&pslld	("mm7",8);
		&psrld	("mm2",24);		&psrld	("mm6",24);
		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= r0<<8
		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");	# ^= r0>>24

		&movq	("mm3","mm1");		&movq	("mm7","mm5");
		&movq	("mm2",&QWP(0,$key));	&movq	("mm6",&QWP(8,$key));
		&psrld	("mm1",8);		&psrld	("mm5",8);
		&mov	($s0,&DWP(0-128,$tbl));
		&pslld	("mm3",24);		&pslld	("mm7",24);
		&mov	($s1,&DWP(64-128,$tbl));
		&pxor	("mm0","mm1");		&pxor	("mm4","mm5");	# ^= (r2^r0)<<8
		&mov	($s2,&DWP(128-128,$tbl));
		&pxor	("mm0","mm3");		&pxor	("mm4","mm7");	# ^= (r2^r0)>>24
		&mov	($s3,&DWP(192-128,$tbl));

		&pxor	("mm0","mm2");		&pxor	("mm4","mm6");	# xor round key
	&jmp	(&label("loop"));

	&set_label("out",16);
	&pxor	("mm0",&QWP(0,$key));
	&pxor	("mm4",&QWP(8,$key));

	&ret	();
&function_end_B("_sse_AES_encrypt_compact");
}
766
+
767
+ ######################################################################
768
+ # Vanilla block function.
769
+ ######################################################################
770
+
771
# One quarter of a regular (table-driven) encryption round using the
# 2KB Te table at $te. $i selects which state word is produced.
sub encstep()
{ my ($i,$te,@s) = @_;
  my $tmp = $key;
  my $out = $i==3?$s[0]:$acc;

	# lines marked with #%e?x[i] denote "reordered" instructions...
	if ($i==3)  { &mov	($key,$__key); }##%edx
	else        { &mov	($out,$s[0]);
		      &and	($out,0xFF); }
	if ($i==1)  { &shr	($s[0],16); }	#%ebx[1]
	if ($i==2)  { &shr	($s[0],24); }	#%ecx[2]
	&mov	($out,&DWP(0,$te,$out,8));

	if ($i==3)  { $tmp=$s[1]; }		##%eax
	&movz	($tmp,&HB($s[1]));
	&xor	($out,&DWP(3,$te,$tmp,8));

	if ($i==3)  { $tmp=$s[2]; &mov ($s[1],$__s0); }	##%ebx
	else        { &mov	($tmp,$s[2]);
		      &shr	($tmp,16); }
	if ($i==2)  { &and	($s[1],0xFF); }	#%edx[2]
	&and	($tmp,0xFF);
	&xor	($out,&DWP(2,$te,$tmp,8));

	if ($i==3)  { $tmp=$s[3]; &mov ($s[2],$__s1); }	##%ecx
	elsif($i==2){ &movz	($tmp,&HB($s[3])); }	#%ebx[2]
	else        { &mov	($tmp,$s[3]);
		      &shr	($tmp,24) }
	&xor	($out,&DWP(1,$te,$tmp,8));
	if ($i<2)   { &mov	(&DWP(4+4*$i,"esp"),$out); }
	if ($i==3)  { &mov	($s[3],$acc); }
	&comment();
	}
804
+
805
# One quarter of the last encryption round: like encstep(), but masks
# each Te lookup down to a single byte lane instead of mixing columns.
sub enclast()
{ my ($i,$te,@s)=@_;
  my $tmp = $key;
  my $out = $i==3?$s[0]:$acc;

	if ($i==3)  { &mov	($key,$__key); }##%edx
	else        { &mov	($out,$s[0]); }
	&and	($out,0xFF);
	if ($i==1)  { &shr	($s[0],16); }	#%ebx[1]
	if ($i==2)  { &shr	($s[0],24); }	#%ecx[2]
	&mov	($out,&DWP(2,$te,$out,8));
	&and	($out,0x000000ff);

	if ($i==3)  { $tmp=$s[1]; }		##%eax
	&movz	($tmp,&HB($s[1]));
	&mov	($tmp,&DWP(0,$te,$tmp,8));
	&and	($tmp,0x0000ff00);
	&xor	($out,$tmp);

	if ($i==3)  { $tmp=$s[2]; &mov ($s[1],$__s0); }	##%ebx
	else        { &mov	($tmp,$s[2]);
		      &shr	($tmp,16); }
	if ($i==2)  { &and	($s[1],0xFF); }	#%edx[2]
	&and	($tmp,0xFF);
	&mov	($tmp,&DWP(0,$te,$tmp,8));
	&and	($tmp,0x00ff0000);
	&xor	($out,$tmp);

	if ($i==3)  { $tmp=$s[3]; &mov ($s[2],$__s1); }	##%ecx
	elsif($i==2){ &movz	($tmp,&HB($s[3])); }	#%ebx[2]
	else        { &mov	($tmp,$s[3]);
		      &shr	($tmp,24); }
	&mov	($tmp,&DWP(2,$te,$tmp,8));
	&and	($tmp,0xff000000);
	&xor	($out,$tmp);
	if ($i<2)   { &mov	(&DWP(4+4*$i,"esp"),$out); }
	if ($i==3)  { &mov	($s[3],$acc); }
	}
843
+
844
+ &function_begin_B("_x86_AES_encrypt");
845
+ if ($vertical_spin) {
846
+ # I need high parts of volatile registers to be accessible...
847
+ &exch ($s1="edi",$key="ebx");
848
+ &mov ($s2="esi",$acc="ecx");
849
+ }
850
+
851
+ # note that caller is expected to allocate stack frame for me!
852
+ &mov ($__key,$key); # save key
853
+
854
+ &xor ($s0,&DWP(0,$key)); # xor with key
855
+ &xor ($s1,&DWP(4,$key));
856
+ &xor ($s2,&DWP(8,$key));
857
+ &xor ($s3,&DWP(12,$key));
858
+
859
+ &mov ($acc,&DWP(240,$key)); # load key->rounds
860
+
861
+ if ($small_footprint) {
862
+ &lea ($acc,&DWP(-2,$acc,$acc));
863
+ &lea ($acc,&DWP(0,$key,$acc,8));
864
+ &mov ($__end,$acc); # end of key schedule
865
+
866
+ &set_label("loop",16);
867
+ if ($vertical_spin) {
868
+ &encvert($tbl,$s0,$s1,$s2,$s3);
869
+ } else {
870
+ &encstep(0,$tbl,$s0,$s1,$s2,$s3);
871
+ &encstep(1,$tbl,$s1,$s2,$s3,$s0);
872
+ &encstep(2,$tbl,$s2,$s3,$s0,$s1);
873
+ &encstep(3,$tbl,$s3,$s0,$s1,$s2);
874
+ }
875
+ &add ($key,16); # advance rd_key
876
+ &xor ($s0,&DWP(0,$key));
877
+ &xor ($s1,&DWP(4,$key));
878
+ &xor ($s2,&DWP(8,$key));
879
+ &xor ($s3,&DWP(12,$key));
880
+ &cmp ($key,$__end);
881
+ &mov ($__key,$key);
882
+ &jb (&label("loop"));
883
+ }
884
+ else {
885
+ &cmp ($acc,10);
886
+ &jle (&label("10rounds"));
887
+ &cmp ($acc,12);
888
+ &jle (&label("12rounds"));
889
+
890
+ &set_label("14rounds",4);
891
+ for ($i=1;$i<3;$i++) {
892
+ if ($vertical_spin) {
893
+ &encvert($tbl,$s0,$s1,$s2,$s3);
894
+ } else {
895
+ &encstep(0,$tbl,$s0,$s1,$s2,$s3);
896
+ &encstep(1,$tbl,$s1,$s2,$s3,$s0);
897
+ &encstep(2,$tbl,$s2,$s3,$s0,$s1);
898
+ &encstep(3,$tbl,$s3,$s0,$s1,$s2);
899
+ }
900
+ &xor ($s0,&DWP(16*$i+0,$key));
901
+ &xor ($s1,&DWP(16*$i+4,$key));
902
+ &xor ($s2,&DWP(16*$i+8,$key));
903
+ &xor ($s3,&DWP(16*$i+12,$key));
904
+ }
905
+ &add ($key,32);
906
+ &mov ($__key,$key); # advance rd_key
907
+ &set_label("12rounds",4);
908
+ for ($i=1;$i<3;$i++) {
909
+ if ($vertical_spin) {
910
+ &encvert($tbl,$s0,$s1,$s2,$s3);
911
+ } else {
912
+ &encstep(0,$tbl,$s0,$s1,$s2,$s3);
913
+ &encstep(1,$tbl,$s1,$s2,$s3,$s0);
914
+ &encstep(2,$tbl,$s2,$s3,$s0,$s1);
915
+ &encstep(3,$tbl,$s3,$s0,$s1,$s2);
916
+ }
917
+ &xor ($s0,&DWP(16*$i+0,$key));
918
+ &xor ($s1,&DWP(16*$i+4,$key));
919
+ &xor ($s2,&DWP(16*$i+8,$key));
920
+ &xor ($s3,&DWP(16*$i+12,$key));
921
+ }
922
+ &add ($key,32);
923
+ &mov ($__key,$key); # advance rd_key
924
+ &set_label("10rounds",4);
925
+ for ($i=1;$i<10;$i++) {
926
+ if ($vertical_spin) {
927
+ &encvert($tbl,$s0,$s1,$s2,$s3);
928
+ } else {
929
+ &encstep(0,$tbl,$s0,$s1,$s2,$s3);
930
+ &encstep(1,$tbl,$s1,$s2,$s3,$s0);
931
+ &encstep(2,$tbl,$s2,$s3,$s0,$s1);
932
+ &encstep(3,$tbl,$s3,$s0,$s1,$s2);
933
+ }
934
+ &xor ($s0,&DWP(16*$i+0,$key));
935
+ &xor ($s1,&DWP(16*$i+4,$key));
936
+ &xor ($s2,&DWP(16*$i+8,$key));
937
+ &xor ($s3,&DWP(16*$i+12,$key));
938
+ }
939
+ }
940
+
941
+ if ($vertical_spin) {
942
+ # "reincarnate" some registers for "horizontal" spin...
943
+ &mov ($s1="ebx",$key="edi");
944
+ &mov ($s2="ecx",$acc="esi");
945
+ }
946
+ &enclast(0,$tbl,$s0,$s1,$s2,$s3);
947
+ &enclast(1,$tbl,$s1,$s2,$s3,$s0);
948
+ &enclast(2,$tbl,$s2,$s3,$s0,$s1);
949
+ &enclast(3,$tbl,$s3,$s0,$s1,$s2);
950
+
951
+ &add ($key,$small_footprint?16:160);
952
+ &xor ($s0,&DWP(0,$key));
953
+ &xor ($s1,&DWP(4,$key));
954
+ &xor ($s2,&DWP(8,$key));
955
+ &xor ($s3,&DWP(12,$key));
956
+
957
+ &ret ();
958
+
959
+ &set_label("AES_Te",64); # Yes! I keep it in the code segment!
960
+ &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
961
+ &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
962
+ &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
963
+ &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
964
+ &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
965
+ &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
966
+ &_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
967
+ &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
968
+ &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
969
+ &_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
970
+ &_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
971
+ &_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
972
+ &_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
973
+ &_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
974
+ &_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
975
+ &_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
976
+ &_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
977
+ &_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
978
+ &_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
979
+ &_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
980
+ &_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
981
+ &_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
982
+ &_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
983
+ &_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
984
+ &_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
985
+ &_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
986
+ &_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
987
+ &_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
988
+ &_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
989
+ &_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
990
+ &_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
991
+ &_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
992
+ &_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
993
+ &_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
994
+ &_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
995
+ &_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
996
+ &_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
997
+ &_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
998
+ &_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
999
+ &_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
1000
+ &_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
1001
+ &_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
1002
+ &_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
1003
+ &_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
1004
+ &_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
1005
+ &_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
1006
+ &_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
1007
+ &_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
1008
+ &_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
1009
+ &_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
1010
+ &_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
1011
+ &_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
1012
+ &_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
1013
+ &_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
1014
+ &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
1015
+ &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
1016
+ &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
1017
+ &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
1018
+ &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
1019
+ &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
1020
+ &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
1021
+ &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
1022
+ &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
1023
+ &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
1024
+
1025
+ #Te4 # four copies of Te4 to choose from to avoid L1 aliasing
1026
+ &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1027
+ &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1028
+ &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1029
+ &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1030
+ &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1031
+ &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1032
+ &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1033
+ &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1034
+ &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1035
+ &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1036
+ &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1037
+ &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1038
+ &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1039
+ &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1040
+ &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1041
+ &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1042
+ &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1043
+ &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1044
+ &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1045
+ &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1046
+ &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1047
+ &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1048
+ &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1049
+ &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1050
+ &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1051
+ &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1052
+ &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1053
+ &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1054
+ &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1055
+ &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1056
+ &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1057
+ &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1058
+
1059
+ &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1060
+ &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1061
+ &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1062
+ &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1063
+ &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1064
+ &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1065
+ &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1066
+ &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1067
+ &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1068
+ &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1069
+ &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1070
+ &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1071
+ &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1072
+ &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1073
+ &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1074
+ &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1075
+ &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1076
+ &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1077
+ &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1078
+ &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1079
+ &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1080
+ &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1081
+ &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1082
+ &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1083
+ &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1084
+ &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1085
+ &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1086
+ &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1087
+ &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1088
+ &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1089
+ &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1090
+ &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1091
+
1092
+ &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1093
+ &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1094
+ &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1095
+ &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1096
+ &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1097
+ &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1098
+ &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1099
+ &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1100
+ &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1101
+ &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1102
+ &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1103
+ &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1104
+ &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1105
+ &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1106
+ &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1107
+ &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1108
+ &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1109
+ &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1110
+ &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1111
+ &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1112
+ &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1113
+ &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1114
+ &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1115
+ &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1116
+ &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1117
+ &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1118
+ &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1119
+ &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1120
+ &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1121
+ &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1122
+ &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1123
+ &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1124
+
1125
+ &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1126
+ &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1127
+ &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1128
+ &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1129
+ &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1130
+ &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1131
+ &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1132
+ &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1133
+ &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1134
+ &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1135
+ &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1136
+ &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1137
+ &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1138
+ &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1139
+ &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1140
+ &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1141
+ &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1142
+ &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1143
+ &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1144
+ &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1145
+ &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1146
+ &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1147
+ &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1148
+ &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1149
+ &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1150
+ &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1151
+ &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1152
+ &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1153
+ &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1154
+ &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1155
+ &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1156
+ &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1157
+ #rcon:
1158
+ &data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008);
1159
+ &data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080);
1160
+ &data_word(0x0000001b, 0x00000036, 0x00000000, 0x00000000);
1161
+ &data_word(0x00000000, 0x00000000, 0x00000000, 0x00000000);
1162
+ &function_end_B("_x86_AES_encrypt");
1163
+
1164
+ # void asm_AES_encrypt (const void *inp,void *out,const AES_KEY *key);
1165
+ &function_begin("asm_AES_encrypt");
1166
+ &mov ($acc,&wparam(0)); # load inp
1167
+ &mov ($key,&wparam(2)); # load key
1168
+
1169
+ &mov ($s0,"esp");
1170
+ &sub ("esp",36);
1171
+ &and ("esp",-64); # align to cache-line
1172
+
1173
+ # place stack frame just "above" the key schedule
1174
+ &lea ($s1,&DWP(-64-63,$key));
1175
+ &sub ($s1,"esp");
1176
+ &neg ($s1);
1177
+ &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
1178
+ &sub ("esp",$s1);
1179
+ &add ("esp",4); # 4 is reserved for caller's return address
1180
+ &mov ($_esp,$s0); # save stack pointer
1181
+
1182
+ &call (&label("pic_point")); # make it PIC!
1183
+ &set_label("pic_point");
1184
+ &blindpop($tbl);
1185
+ &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if (!$x86only);
1186
+ &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
1187
+
1188
+ # pick Te4 copy which can't "overlap" with stack frame or key schedule
1189
+ &lea ($s1,&DWP(768-4,"esp"));
1190
+ &sub ($s1,$tbl);
1191
+ &and ($s1,0x300);
1192
+ &lea ($tbl,&DWP(2048+128,$tbl,$s1));
1193
+
1194
+ if (!$x86only) {
1195
+ &bt (&DWP(0,$s0),25); # check for SSE bit
1196
+ &jnc (&label("x86"));
1197
+
1198
+ &movq ("mm0",&QWP(0,$acc));
1199
+ &movq ("mm4",&QWP(8,$acc));
1200
+ &call ("_sse_AES_encrypt_compact");
1201
+ &mov ("esp",$_esp); # restore stack pointer
1202
+ &mov ($acc,&wparam(1)); # load out
1203
+ &movq (&QWP(0,$acc),"mm0"); # write output data
1204
+ &movq (&QWP(8,$acc),"mm4");
1205
+ &emms ();
1206
+ &function_end_A();
1207
+ }
1208
+ &set_label("x86",16);
1209
+ &mov ($_tbl,$tbl);
1210
+ &mov ($s0,&DWP(0,$acc)); # load input data
1211
+ &mov ($s1,&DWP(4,$acc));
1212
+ &mov ($s2,&DWP(8,$acc));
1213
+ &mov ($s3,&DWP(12,$acc));
1214
+ &call ("_x86_AES_encrypt_compact");
1215
+ &mov ("esp",$_esp); # restore stack pointer
1216
+ &mov ($acc,&wparam(1)); # load out
1217
+ &mov (&DWP(0,$acc),$s0); # write output data
1218
+ &mov (&DWP(4,$acc),$s1);
1219
+ &mov (&DWP(8,$acc),$s2);
1220
+ &mov (&DWP(12,$acc),$s3);
1221
+ &function_end("asm_AES_encrypt");
1222
+
1223
+ #--------------------------------------------------------------------#
1224
+
1225
+ ######################################################################
1226
+ # "Compact" block function
1227
+ ######################################################################
1228
+
1229
+ sub deccompact()
1230
+ { my $Fn = \&mov;
1231
+ while ($#_>5) { pop(@_); $Fn=sub{}; }
1232
+ my ($i,$td,@s)=@_;
1233
+ my $tmp = $key;
1234
+ my $out = $i==3?$s[0]:$acc;
1235
+
1236
+ # $Fn is used in first compact round and its purpose is to
1237
+ # void restoration of some values from stack, so that after
1238
+ # 4xdeccompact with extra argument $key, $s0 and $s1 values
1239
+ # are left there...
1240
+ if($i==3) { &$Fn ($key,$__key); }
1241
+ else { &mov ($out,$s[0]); }
1242
+ &and ($out,0xFF);
1243
+ &movz ($out,&BP(-128,$td,$out,1));
1244
+
1245
+ if ($i==3) { $tmp=$s[1]; }
1246
+ &movz ($tmp,&HB($s[1]));
1247
+ &movz ($tmp,&BP(-128,$td,$tmp,1));
1248
+ &shl ($tmp,8);
1249
+ &xor ($out,$tmp);
1250
+
1251
+ if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
1252
+ else { mov ($tmp,$s[2]); }
1253
+ &shr ($tmp,16);
1254
+ &and ($tmp,0xFF);
1255
+ &movz ($tmp,&BP(-128,$td,$tmp,1));
1256
+ &shl ($tmp,16);
1257
+ &xor ($out,$tmp);
1258
+
1259
+ if ($i==3) { $tmp=$s[3]; &$Fn ($s[2],$__s1); }
1260
+ else { &mov ($tmp,$s[3]); }
1261
+ &shr ($tmp,24);
1262
+ &movz ($tmp,&BP(-128,$td,$tmp,1));
1263
+ &shl ($tmp,24);
1264
+ &xor ($out,$tmp);
1265
+ if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
1266
+ if ($i==3) { &$Fn ($s[3],$__s0); }
1267
+ }
1268
+
1269
+ # must be called with 2,3,0,1 as argument sequence!!!
1270
+ sub dectransform()
1271
+ { my @s = ($s0,$s1,$s2,$s3);
1272
+ my $i = shift;
1273
+ my $tmp = $key;
1274
+ my $tp2 = @s[($i+2)%4]; $tp2 = @s[2] if ($i==1);
1275
+ my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1);
1276
+ my $tp8 = $tbl;
1277
+
1278
+ &mov ($tmp,0x80808080);
1279
+ &and ($tmp,$s[$i]);
1280
+ &mov ($acc,$tmp);
1281
+ &shr ($tmp,7);
1282
+ &lea ($tp2,&DWP(0,$s[$i],$s[$i]));
1283
+ &sub ($acc,$tmp);
1284
+ &and ($tp2,0xfefefefe);
1285
+ &and ($acc,0x1b1b1b1b);
1286
+ &xor ($tp2,$acc);
1287
+ &mov ($tmp,0x80808080);
1288
+
1289
+ &and ($tmp,$tp2);
1290
+ &mov ($acc,$tmp);
1291
+ &shr ($tmp,7);
1292
+ &lea ($tp4,&DWP(0,$tp2,$tp2));
1293
+ &sub ($acc,$tmp);
1294
+ &and ($tp4,0xfefefefe);
1295
+ &and ($acc,0x1b1b1b1b);
1296
+ &xor ($tp2,$s[$i]); # tp2^tp1
1297
+ &xor ($tp4,$acc);
1298
+ &mov ($tmp,0x80808080);
1299
+
1300
+ &and ($tmp,$tp4);
1301
+ &mov ($acc,$tmp);
1302
+ &shr ($tmp,7);
1303
+ &lea ($tp8,&DWP(0,$tp4,$tp4));
1304
+ &sub ($acc,$tmp);
1305
+ &and ($tp8,0xfefefefe);
1306
+ &and ($acc,0x1b1b1b1b);
1307
+ &xor ($tp4,$s[$i]); # tp4^tp1
1308
+ &rotl ($s[$i],8); # = ROTATE(tp1,8)
1309
+ &xor ($tp8,$acc);
1310
+
1311
+ &xor ($s[$i],$tp2);
1312
+ &xor ($tp2,$tp8);
1313
+ &xor ($s[$i],$tp4);
1314
+ &xor ($tp4,$tp8);
1315
+ &rotl ($tp2,24);
1316
+ &xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
1317
+ &rotl ($tp4,16);
1318
+ &xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
1319
+ &rotl ($tp8,8);
1320
+ &xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
1321
+ &mov ($s[0],$__s0) if($i==2); #prefetch $s0
1322
+ &mov ($s[1],$__s1) if($i==3); #prefetch $s1
1323
+ &mov ($s[2],$__s2) if($i==1);
1324
+ &xor ($s[$i],$tp8); # ^= ROTATE(tp8,8)
1325
+
1326
+ &mov ($s[3],$__s3) if($i==1);
1327
+ &mov (&DWP(4+4*$i,"esp"),$s[$i]) if($i>=2);
1328
+ }
1329
+
1330
+ &function_begin_B("_x86_AES_decrypt_compact");
1331
+ # note that caller is expected to allocate stack frame for me!
1332
+ &mov ($__key,$key); # save key
1333
+
1334
+ &xor ($s0,&DWP(0,$key)); # xor with key
1335
+ &xor ($s1,&DWP(4,$key));
1336
+ &xor ($s2,&DWP(8,$key));
1337
+ &xor ($s3,&DWP(12,$key));
1338
+
1339
+ &mov ($acc,&DWP(240,$key)); # load key->rounds
1340
+
1341
+ &lea ($acc,&DWP(-2,$acc,$acc));
1342
+ &lea ($acc,&DWP(0,$key,$acc,8));
1343
+ &mov ($__end,$acc); # end of key schedule
1344
+
1345
+ # prefetch Td4
1346
+ &mov ($key,&DWP(0-128,$tbl));
1347
+ &mov ($acc,&DWP(32-128,$tbl));
1348
+ &mov ($key,&DWP(64-128,$tbl));
1349
+ &mov ($acc,&DWP(96-128,$tbl));
1350
+ &mov ($key,&DWP(128-128,$tbl));
1351
+ &mov ($acc,&DWP(160-128,$tbl));
1352
+ &mov ($key,&DWP(192-128,$tbl));
1353
+ &mov ($acc,&DWP(224-128,$tbl));
1354
+
1355
+ &set_label("loop",16);
1356
+
1357
+ &deccompact(0,$tbl,$s0,$s3,$s2,$s1,1);
1358
+ &deccompact(1,$tbl,$s1,$s0,$s3,$s2,1);
1359
+ &deccompact(2,$tbl,$s2,$s1,$s0,$s3,1);
1360
+ &deccompact(3,$tbl,$s3,$s2,$s1,$s0,1);
1361
+ &dectransform(2);
1362
+ &dectransform(3);
1363
+ &dectransform(0);
1364
+ &dectransform(1);
1365
+ &mov ($key,$__key);
1366
+ &mov ($tbl,$__tbl);
1367
+ &add ($key,16); # advance rd_key
1368
+ &xor ($s0,&DWP(0,$key));
1369
+ &xor ($s1,&DWP(4,$key));
1370
+ &xor ($s2,&DWP(8,$key));
1371
+ &xor ($s3,&DWP(12,$key));
1372
+
1373
+ &cmp ($key,$__end);
1374
+ &mov ($__key,$key);
1375
+ &jb (&label("loop"));
1376
+
1377
+ &deccompact(0,$tbl,$s0,$s3,$s2,$s1);
1378
+ &deccompact(1,$tbl,$s1,$s0,$s3,$s2);
1379
+ &deccompact(2,$tbl,$s2,$s1,$s0,$s3);
1380
+ &deccompact(3,$tbl,$s3,$s2,$s1,$s0);
1381
+
1382
+ &xor ($s0,&DWP(16,$key));
1383
+ &xor ($s1,&DWP(20,$key));
1384
+ &xor ($s2,&DWP(24,$key));
1385
+ &xor ($s3,&DWP(28,$key));
1386
+
1387
+ &ret ();
1388
+ &function_end_B("_x86_AES_decrypt_compact");
1389
+
1390
+ ######################################################################
1391
+ # "Compact" SSE block function.
1392
+ ######################################################################
1393
+
1394
+ sub sse_deccompact()
1395
+ {
1396
+ &pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0
1397
+ &pshufw ("mm5","mm4",0x09); # 13,12,11,10
1398
+ &movd ("eax","mm1"); # 7, 6, 1, 0
1399
+ &movd ("ebx","mm5"); # 13,12,11,10
1400
+ &mov ($__key,$key);
1401
+
1402
+ &movz ($acc,&LB("eax")); # 0
1403
+ &movz ("edx",&HB("eax")); # 1
1404
+ &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4
1405
+ &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
1406
+ &movz ($key,&LB("ebx")); # 10
1407
+ &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
1408
+ &shr ("eax",16); # 7, 6
1409
+ &shl ("edx",8); # 1
1410
+
1411
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 10
1412
+ &movz ($key,&HB("ebx")); # 11
1413
+ &shl ($acc,16); # 10
1414
+ &pshufw ("mm6","mm4",0x03); # 9, 8,15,14
1415
+ &or ("ecx",$acc); # 10
1416
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 11
1417
+ &movz ($key,&HB("eax")); # 7
1418
+ &shl ($acc,24); # 11
1419
+ &shr ("ebx",16); # 13,12
1420
+ &or ("edx",$acc); # 11
1421
+
1422
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 7
1423
+ &movz ($key,&HB("ebx")); # 13
1424
+ &shl ($acc,24); # 7
1425
+ &or ("ecx",$acc); # 7
1426
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 13
1427
+ &movz ($key,&LB("eax")); # 6
1428
+ &shl ($acc,8); # 13
1429
+ &movd ("eax","mm2"); # 3, 2, 5, 4
1430
+ &or ("ecx",$acc); # 13
1431
+
1432
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 6
1433
+ &movz ($key,&LB("ebx")); # 12
1434
+ &shl ($acc,16); # 6
1435
+ &movd ("ebx","mm6"); # 9, 8,15,14
1436
+ &movd ("mm0","ecx"); # t[0] collected
1437
+ &movz ("ecx",&BP(-128,$tbl,$key,1)); # 12
1438
+ &movz ($key,&LB("eax")); # 4
1439
+ &or ("ecx",$acc); # 12
1440
+
1441
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 4
1442
+ &movz ($key,&LB("ebx")); # 14
1443
+ &or ("edx",$acc); # 4
1444
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 14
1445
+ &movz ($key,&HB("eax")); # 5
1446
+ &shl ($acc,16); # 14
1447
+ &shr ("eax",16); # 3, 2
1448
+ &or ("edx",$acc); # 14
1449
+
1450
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 5
1451
+ &movz ($key,&HB("ebx")); # 15
1452
+ &shr ("ebx",16); # 9, 8
1453
+ &shl ($acc,8); # 5
1454
+ &movd ("mm1","edx"); # t[1] collected
1455
+ &movz ("edx",&BP(-128,$tbl,$key,1)); # 15
1456
+ &movz ($key,&HB("ebx")); # 9
1457
+ &shl ("edx",24); # 15
1458
+ &and ("ebx",0xff); # 8
1459
+ &or ("edx",$acc); # 15
1460
+
1461
+ &punpckldq ("mm0","mm1"); # t[0,1] collected
1462
+
1463
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 9
1464
+ &movz ($key,&LB("eax")); # 2
1465
+ &shl ($acc,8); # 9
1466
+ &movz ("eax",&HB("eax")); # 3
1467
+ &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8
1468
+ &or ("ecx",$acc); # 9
1469
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 2
1470
+ &or ("edx","ebx"); # 8
1471
+ &shl ($acc,16); # 2
1472
+ &movz ("eax",&BP(-128,$tbl,"eax",1)); # 3
1473
+ &or ("edx",$acc); # 2
1474
+ &shl ("eax",24); # 3
1475
+ &or ("ecx","eax"); # 3
1476
+ &mov ($key,$__key);
1477
+ &movd ("mm4","edx"); # t[2] collected
1478
+ &movd ("mm5","ecx"); # t[3] collected
1479
+
1480
+ &punpckldq ("mm4","mm5"); # t[2,3] collected
1481
+ }
1482
+
1483
+ if (!$x86only) {
1484
+ &function_begin_B("_sse_AES_decrypt_compact");
1485
+ &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0
1486
+ &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8
1487
+
1488
+ # note that caller is expected to allocate stack frame for me!
1489
+ &mov ($acc,&DWP(240,$key)); # load key->rounds
1490
+ &lea ($acc,&DWP(-2,$acc,$acc));
1491
+ &lea ($acc,&DWP(0,$key,$acc,8));
1492
+ &mov ($__end,$acc); # end of key schedule
1493
+
1494
+ &mov ($s0,0x1b1b1b1b); # magic constant
1495
+ &mov (&DWP(8,"esp"),$s0);
1496
+ &mov (&DWP(12,"esp"),$s0);
1497
+
1498
+ # prefetch Td4
1499
+ &mov ($s0,&DWP(0-128,$tbl));
1500
+ &mov ($s1,&DWP(32-128,$tbl));
1501
+ &mov ($s2,&DWP(64-128,$tbl));
1502
+ &mov ($s3,&DWP(96-128,$tbl));
1503
+ &mov ($s0,&DWP(128-128,$tbl));
1504
+ &mov ($s1,&DWP(160-128,$tbl));
1505
+ &mov ($s2,&DWP(192-128,$tbl));
1506
+ &mov ($s3,&DWP(224-128,$tbl));
1507
+
1508
+ &set_label("loop",16);
1509
+ &sse_deccompact();
1510
+ &add ($key,16);
1511
+ &cmp ($key,$__end);
1512
+ &ja (&label("out"));
1513
+
1514
+ # ROTATE(x^y,N) == ROTATE(x,N)^ROTATE(y,N)
1515
+ &movq ("mm3","mm0"); &movq ("mm7","mm4");
1516
+ &movq ("mm2","mm0",1); &movq ("mm6","mm4",1);
1517
+ &movq ("mm1","mm0"); &movq ("mm5","mm4");
1518
+ &pshufw ("mm0","mm0",0xb1); &pshufw ("mm4","mm4",0xb1);# = ROTATE(tp0,16)
1519
+ &pslld ("mm2",8); &pslld ("mm6",8);
1520
+ &psrld ("mm3",8); &psrld ("mm7",8);
1521
+ &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<8
1522
+ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>8
1523
+ &pslld ("mm2",16); &pslld ("mm6",16);
1524
+ &psrld ("mm3",16); &psrld ("mm7",16);
1525
+ &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<24
1526
+ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>24
1527
+
1528
+ &movq ("mm3",&QWP(8,"esp"));
1529
+ &pxor ("mm2","mm2"); &pxor ("mm6","mm6");
1530
+ &pcmpgtb("mm2","mm1"); &pcmpgtb("mm6","mm5");
1531
+ &pand ("mm2","mm3"); &pand ("mm6","mm3");
1532
+ &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
1533
+ &pxor ("mm1","mm2"); &pxor ("mm5","mm6"); # tp2
1534
+ &movq ("mm3","mm1"); &movq ("mm7","mm5");
1535
+ &movq ("mm2","mm1"); &movq ("mm6","mm5");
1536
+ &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp2
1537
+ &pslld ("mm3",24); &pslld ("mm7",24);
1538
+ &psrld ("mm2",8); &psrld ("mm6",8);
1539
+ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp2<<24
1540
+ &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp2>>8
1541
+
1542
+ &movq ("mm2",&QWP(8,"esp"));
1543
+ &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
1544
+ &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
1545
+ &pand ("mm3","mm2"); &pand ("mm7","mm2");
1546
+ &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
1547
+ &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp4
1548
+ &pshufw ("mm3","mm1",0xb1); &pshufw ("mm7","mm5",0xb1);
1549
+ &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp4
1550
+ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16)
1551
+
1552
+ &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
1553
+ &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
1554
+ &pand ("mm3","mm2"); &pand ("mm7","mm2");
1555
+ &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
1556
+ &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp8
1557
+ &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8
1558
+ &movq ("mm3","mm1"); &movq ("mm7","mm5");
1559
+ &pshufw ("mm2","mm1",0xb1); &pshufw ("mm6","mm5",0xb1);
1560
+ &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(tp8,16)
1561
+ &pslld ("mm1",8); &pslld ("mm5",8);
1562
+ &psrld ("mm3",8); &psrld ("mm7",8);
1563
+ &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key));
1564
+ &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<8
1565
+ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>8
1566
+ &mov ($s0,&DWP(0-128,$tbl));
1567
+ &pslld ("mm1",16); &pslld ("mm5",16);
1568
+ &mov ($s1,&DWP(64-128,$tbl));
1569
+ &psrld ("mm3",16); &psrld ("mm7",16);
1570
+ &mov ($s2,&DWP(128-128,$tbl));
1571
+ &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<24
1572
+ &mov ($s3,&DWP(192-128,$tbl));
1573
+ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>24
1574
+
1575
+ &pxor ("mm0","mm2"); &pxor ("mm4","mm6");
1576
+ &jmp (&label("loop"));
1577
+
1578
+ &set_label("out",16);
1579
+ &pxor ("mm0",&QWP(0,$key));
1580
+ &pxor ("mm4",&QWP(8,$key));
1581
+
1582
+ &ret ();
1583
+ &function_end_B("_sse_AES_decrypt_compact");
1584
+ }
1585
+
1586
+ ######################################################################
1587
+ # Vanilla block function.
1588
+ ######################################################################
1589
+
1590
+ sub decstep()
1591
+ { my ($i,$td,@s) = @_;
1592
+ my $tmp = $key;
1593
+ my $out = $i==3?$s[0]:$acc;
1594
+
1595
+ # no instructions are reordered, as performance appears
1596
+ # optimal... or rather that all attempts to reorder didn't
1597
+ # result in better performance [which by the way is not a
1598
+ # bit lower than ecryption].
1599
+ if($i==3) { &mov ($key,$__key); }
1600
+ else { &mov ($out,$s[0]); }
1601
+ &and ($out,0xFF);
1602
+ &mov ($out,&DWP(0,$td,$out,8));
1603
+
1604
+ if ($i==3) { $tmp=$s[1]; }
1605
+ &movz ($tmp,&HB($s[1]));
1606
+ &xor ($out,&DWP(3,$td,$tmp,8));
1607
+
1608
+ if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
1609
+ else { &mov ($tmp,$s[2]); }
1610
+ &shr ($tmp,16);
1611
+ &and ($tmp,0xFF);
1612
+ &xor ($out,&DWP(2,$td,$tmp,8));
1613
+
1614
+ if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }
1615
+ else { &mov ($tmp,$s[3]); }
1616
+ &shr ($tmp,24);
1617
+ &xor ($out,&DWP(1,$td,$tmp,8));
1618
+ if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
1619
+ if ($i==3) { &mov ($s[3],$__s0); }
1620
+ &comment();
1621
+ }
1622
+
1623
+ sub declast()
1624
+ { my ($i,$td,@s)=@_;
1625
+ my $tmp = $key;
1626
+ my $out = $i==3?$s[0]:$acc;
1627
+
1628
+ if($i==0) { &lea ($td,&DWP(2048+128,$td));
1629
+ &mov ($tmp,&DWP(0-128,$td));
1630
+ &mov ($acc,&DWP(32-128,$td));
1631
+ &mov ($tmp,&DWP(64-128,$td));
1632
+ &mov ($acc,&DWP(96-128,$td));
1633
+ &mov ($tmp,&DWP(128-128,$td));
1634
+ &mov ($acc,&DWP(160-128,$td));
1635
+ &mov ($tmp,&DWP(192-128,$td));
1636
+ &mov ($acc,&DWP(224-128,$td));
1637
+ &lea ($td,&DWP(-128,$td)); }
1638
+ if($i==3) { &mov ($key,$__key); }
1639
+ else { &mov ($out,$s[0]); }
1640
+ &and ($out,0xFF);
1641
+ &movz ($out,&BP(0,$td,$out,1));
1642
+
1643
+ if ($i==3) { $tmp=$s[1]; }
1644
+ &movz ($tmp,&HB($s[1]));
1645
+ &movz ($tmp,&BP(0,$td,$tmp,1));
1646
+ &shl ($tmp,8);
1647
+ &xor ($out,$tmp);
1648
+
1649
+ if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
1650
+ else { mov ($tmp,$s[2]); }
1651
+ &shr ($tmp,16);
1652
+ &and ($tmp,0xFF);
1653
+ &movz ($tmp,&BP(0,$td,$tmp,1));
1654
+ &shl ($tmp,16);
1655
+ &xor ($out,$tmp);
1656
+
1657
+ if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }
1658
+ else { &mov ($tmp,$s[3]); }
1659
+ &shr ($tmp,24);
1660
+ &movz ($tmp,&BP(0,$td,$tmp,1));
1661
+ &shl ($tmp,24);
1662
+ &xor ($out,$tmp);
1663
+ if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); }
1664
+ if ($i==3) { &mov ($s[3],$__s0);
1665
+ &lea ($td,&DWP(-2048,$td)); }
1666
+ }
1667
+
1668
+ &function_begin_B("_x86_AES_decrypt");
1669
+ # note that caller is expected to allocate stack frame for me!
1670
+ &mov ($__key,$key); # save key
1671
+
1672
+ &xor ($s0,&DWP(0,$key)); # xor with key
1673
+ &xor ($s1,&DWP(4,$key));
1674
+ &xor ($s2,&DWP(8,$key));
1675
+ &xor ($s3,&DWP(12,$key));
1676
+
1677
+ &mov ($acc,&DWP(240,$key)); # load key->rounds
1678
+
1679
+ if ($small_footprint) {
1680
+ &lea ($acc,&DWP(-2,$acc,$acc));
1681
+ &lea ($acc,&DWP(0,$key,$acc,8));
1682
+ &mov ($__end,$acc); # end of key schedule
1683
+ &set_label("loop",16);
1684
+ &decstep(0,$tbl,$s0,$s3,$s2,$s1);
1685
+ &decstep(1,$tbl,$s1,$s0,$s3,$s2);
1686
+ &decstep(2,$tbl,$s2,$s1,$s0,$s3);
1687
+ &decstep(3,$tbl,$s3,$s2,$s1,$s0);
1688
+ &add ($key,16); # advance rd_key
1689
+ &xor ($s0,&DWP(0,$key));
1690
+ &xor ($s1,&DWP(4,$key));
1691
+ &xor ($s2,&DWP(8,$key));
1692
+ &xor ($s3,&DWP(12,$key));
1693
+ &cmp ($key,$__end);
1694
+ &mov ($__key,$key);
1695
+ &jb (&label("loop"));
1696
+ }
1697
+ else {
1698
+ &cmp ($acc,10);
1699
+ &jle (&label("10rounds"));
1700
+ &cmp ($acc,12);
1701
+ &jle (&label("12rounds"));
1702
+
1703
+ &set_label("14rounds",4);
1704
+ for ($i=1;$i<3;$i++) {
1705
+ &decstep(0,$tbl,$s0,$s3,$s2,$s1);
1706
+ &decstep(1,$tbl,$s1,$s0,$s3,$s2);
1707
+ &decstep(2,$tbl,$s2,$s1,$s0,$s3);
1708
+ &decstep(3,$tbl,$s3,$s2,$s1,$s0);
1709
+ &xor ($s0,&DWP(16*$i+0,$key));
1710
+ &xor ($s1,&DWP(16*$i+4,$key));
1711
+ &xor ($s2,&DWP(16*$i+8,$key));
1712
+ &xor ($s3,&DWP(16*$i+12,$key));
1713
+ }
1714
+ &add ($key,32);
1715
+ &mov ($__key,$key); # advance rd_key
1716
+ &set_label("12rounds",4);
1717
+ for ($i=1;$i<3;$i++) {
1718
+ &decstep(0,$tbl,$s0,$s3,$s2,$s1);
1719
+ &decstep(1,$tbl,$s1,$s0,$s3,$s2);
1720
+ &decstep(2,$tbl,$s2,$s1,$s0,$s3);
1721
+ &decstep(3,$tbl,$s3,$s2,$s1,$s0);
1722
+ &xor ($s0,&DWP(16*$i+0,$key));
1723
+ &xor ($s1,&DWP(16*$i+4,$key));
1724
+ &xor ($s2,&DWP(16*$i+8,$key));
1725
+ &xor ($s3,&DWP(16*$i+12,$key));
1726
+ }
1727
+ &add ($key,32);
1728
+ &mov ($__key,$key); # advance rd_key
1729
+ &set_label("10rounds",4);
1730
+ for ($i=1;$i<10;$i++) {
1731
+ &decstep(0,$tbl,$s0,$s3,$s2,$s1);
1732
+ &decstep(1,$tbl,$s1,$s0,$s3,$s2);
1733
+ &decstep(2,$tbl,$s2,$s1,$s0,$s3);
1734
+ &decstep(3,$tbl,$s3,$s2,$s1,$s0);
1735
+ &xor ($s0,&DWP(16*$i+0,$key));
1736
+ &xor ($s1,&DWP(16*$i+4,$key));
1737
+ &xor ($s2,&DWP(16*$i+8,$key));
1738
+ &xor ($s3,&DWP(16*$i+12,$key));
1739
+ }
1740
+ }
1741
+
1742
+ &declast(0,$tbl,$s0,$s3,$s2,$s1);
1743
+ &declast(1,$tbl,$s1,$s0,$s3,$s2);
1744
+ &declast(2,$tbl,$s2,$s1,$s0,$s3);
1745
+ &declast(3,$tbl,$s3,$s2,$s1,$s0);
1746
+
1747
+ &add ($key,$small_footprint?16:160);
1748
+ &xor ($s0,&DWP(0,$key));
1749
+ &xor ($s1,&DWP(4,$key));
1750
+ &xor ($s2,&DWP(8,$key));
1751
+ &xor ($s3,&DWP(12,$key));
1752
+
1753
+ &ret ();
1754
+
1755
+ &set_label("AES_Td",64); # Yes! I keep it in the code segment!
1756
+ &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
1757
+ &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
1758
+ &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
1759
+ &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
1760
+ &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
1761
+ &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
1762
+ &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
1763
+ &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
1764
+ &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
1765
+ &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
1766
+ &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
1767
+ &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
1768
+ &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
1769
+ &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
1770
+ &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
1771
+ &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
1772
+ &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
1773
+ &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
1774
+ &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
1775
+ &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
1776
+ &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
1777
+ &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
1778
+ &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
1779
+ &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
1780
+ &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
1781
+ &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
1782
+ &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
1783
+ &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
1784
+ &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
1785
+ &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
1786
+ &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
1787
+ &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
1788
+ &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
1789
+ &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
1790
+ &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
1791
+ &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
1792
+ &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
1793
+ &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
1794
+ &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
1795
+ &_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
1796
+ &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
1797
+ &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
1798
+ &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
1799
+ &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
1800
+ &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
1801
+ &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
1802
+ &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
1803
+ &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
1804
+ &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
1805
+ &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
1806
+ &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
1807
+ &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
1808
+ &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
1809
+ &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
1810
+ &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
1811
+ &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
1812
+ &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
1813
+ &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
1814
+ &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
1815
+ &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
1816
+ &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
1817
+ &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
1818
+ &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
1819
+ &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
1820
+
1821
+ #Td4: # four copies of Td4 to choose from to avoid L1 aliasing
1822
+ &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1823
+ &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1824
+ &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1825
+ &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1826
+ &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1827
+ &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1828
+ &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1829
+ &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1830
+ &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1831
+ &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1832
+ &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1833
+ &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1834
+ &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1835
+ &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1836
+ &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1837
+ &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1838
+ &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1839
+ &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1840
+ &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1841
+ &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1842
+ &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1843
+ &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1844
+ &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1845
+ &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1846
+ &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1847
+ &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1848
+ &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1849
+ &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1850
+ &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1851
+ &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1852
+ &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1853
+ &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1854
+
1855
+ &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1856
+ &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1857
+ &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1858
+ &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1859
+ &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1860
+ &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1861
+ &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1862
+ &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1863
+ &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1864
+ &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1865
+ &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1866
+ &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1867
+ &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1868
+ &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1869
+ &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1870
+ &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1871
+ &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1872
+ &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1873
+ &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1874
+ &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1875
+ &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1876
+ &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1877
+ &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1878
+ &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1879
+ &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1880
+ &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1881
+ &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1882
+ &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1883
+ &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1884
+ &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1885
+ &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1886
+ &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1887
+
1888
+ &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1889
+ &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1890
+ &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1891
+ &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1892
+ &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1893
+ &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1894
+ &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1895
+ &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1896
+ &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1897
+ &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1898
+ &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1899
+ &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1900
+ &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1901
+ &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1902
+ &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1903
+ &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1904
+ &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1905
+ &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1906
+ &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1907
+ &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1908
+ &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1909
+ &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1910
+ &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1911
+ &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1912
+ &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1913
+ &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1914
+ &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1915
+ &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1916
+ &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1917
+ &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1918
+ &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1919
+ &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1920
+
1921
+ &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1922
+ &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1923
+ &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1924
+ &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1925
+ &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1926
+ &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1927
+ &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1928
+ &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1929
+ &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1930
+ &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1931
+ &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1932
+ &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1933
+ &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1934
+ &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1935
+ &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1936
+ &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1937
+ &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1938
+ &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1939
+ &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1940
+ &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1941
+ &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1942
+ &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1943
+ &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1944
+ &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1945
+ &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1946
+ &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1947
+ &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1948
+ &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1949
+ &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1950
+ &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1951
+ &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1952
+ &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1953
+ &function_end_B("_x86_AES_decrypt");
1954
+
1955
+ # void asm_AES_decrypt (const void *inp,void *out,const AES_KEY *key);
+ # Public decrypt entry point: builds a cache-line-aligned stack frame just
+ # above the key schedule, locates AES_Td via the call/pop PIC idiom, picks a
+ # Td4 copy that cannot alias the frame or key schedule in L1, then dispatches
+ # to the SSE or integer compact decryptor.
1956
+ &function_begin("asm_AES_decrypt");
1957
+ &mov ($acc,&wparam(0)); # load inp
1958
+ &mov ($key,&wparam(2)); # load key
1959
+
1960
+ &mov ($s0,"esp"); # keep original esp so it can be restored on exit
1961
+ &sub ("esp",36); # reserve local frame
1962
+ &and ("esp",-64); # align to cache-line
1963
+
1964
+ # place stack frame just "above" the key schedule
1965
+ &lea ($s1,&DWP(-64-63,$key));
1966
+ &sub ($s1,"esp");
1967
+ &neg ($s1);
1968
+ &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
1969
+ &sub ("esp",$s1);
1970
+ &add ("esp",4); # 4 is reserved for caller's return address
1971
+ &mov ($_esp,$s0); # save stack pointer
1972
+
1973
+ &call (&label("pic_point")); # make it PIC!
1974
+ &set_label("pic_point");
1975
+ &blindpop($tbl); # pop return address = current PC
1976
+ &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only);
1977
+ &lea ($tbl,&DWP(&label("AES_Td")."-".&label("pic_point"),$tbl));
1978
+
1979
+ # pick Td4 copy which can't "overlap" with stack frame or key schedule
1980
+ &lea ($s1,&DWP(768-4,"esp"));
1981
+ &sub ($s1,$tbl);
1982
+ &and ($s1,0x300);
1983
+ &lea ($tbl,&DWP(2048+128,$tbl,$s1));
1984
+
1985
+ if (!$x86only) {
1986
+ &bt (&DWP(0,$s0),25); # check for SSE bit
1987
+ &jnc (&label("x86"));
1988
+
1989
+ &movq ("mm0",&QWP(0,$acc)); # load 16-byte input block into MMX regs
1990
+ &movq ("mm4",&QWP(8,$acc));
1991
+ &call ("_sse_AES_decrypt_compact");
1992
+ &mov ("esp",$_esp); # restore stack pointer
1993
+ &mov ($acc,&wparam(1)); # load out
1994
+ &movq (&QWP(0,$acc),"mm0"); # write output data
1995
+ &movq (&QWP(8,$acc),"mm4");
1996
+ &emms (); # leave MMX state clean for the caller
1997
+ &function_end_A();
1998
+ }
1999
+ &set_label("x86",16); # integer-only path (no SSE)
2000
+ &mov ($_tbl,$tbl);
2001
+ &mov ($s0,&DWP(0,$acc)); # load input data
2002
+ &mov ($s1,&DWP(4,$acc));
2003
+ &mov ($s2,&DWP(8,$acc));
2004
+ &mov ($s3,&DWP(12,$acc));
2005
+ &call ("_x86_AES_decrypt_compact");
2006
+ &mov ("esp",$_esp); # restore stack pointer
2007
+ &mov ($acc,&wparam(1)); # load out
2008
+ &mov (&DWP(0,$acc),$s0); # write output data
2009
+ &mov (&DWP(4,$acc),$s1);
2010
+ &mov (&DWP(8,$acc),$s2);
2011
+ &mov (&DWP(12,$acc),$s3);
2012
+ &function_end("asm_AES_decrypt");
2013
+
2014
+ #------------------------------------------------------------------#
2015
+
2016
+ # Key-expansion helper: emits code computing SubWord(RotWord(edx)) ^ rcon
+ # into "eax" (accumulated via xor, so eax must hold the previous rk word).
+ # The rotate is folded into the shifts: byte0<<24, byte1<<0, byte2<<8,
+ # byte3<<16. Te4 byte table lives at $tbl-128, rcon at $tbl+1024-128 indexed
+ # by "ecx". Clobbers ebx/esi.
+ sub enckey()
2017
+ {
2018
+ &movz ("esi",&LB("edx")); # rk[i]>>0
2019
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
2020
+ &movz ("esi",&HB("edx")); # rk[i]>>8
2021
+ &shl ("ebx",24);
2022
+ &xor ("eax","ebx");
2023
+
2024
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
2025
+ &shr ("edx",16);
2026
+ &movz ("esi",&LB("edx")); # rk[i]>>16
2027
+ &xor ("eax","ebx");
2028
+
2029
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
2030
+ &movz ("esi",&HB("edx")); # rk[i]>>24
2031
+ &shl ("ebx",8);
2032
+ &xor ("eax","ebx");
2033
+
2034
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
2035
+ &shl ("ebx",16);
2036
+ &xor ("eax","ebx");
2037
+
2038
+ &xor ("eax",&DWP(1024-128,$tbl,"ecx",4)); # rcon
2039
+ }
2040
+
2041
+ # Worker: expands the user key into the AES round-key schedule in-place.
+ # In:  wparam(1)=userKey ptr, wparam(2)=bits (128/192/256), wparam(3)=AES_KEY ptr
+ # Out: eax = 0 on success, -1 on NULL pointer, -2 on unsupported bit count.
+ # Uses the Te4 byte table (at $tbl-128) via enckey() for SubWord lookups.
+ &function_begin("_x86_AES_set_encrypt_key");
2042
+ &mov ("esi",&wparam(1)); # user supplied key
2043
+ &mov ("edi",&wparam(3)); # private key schedule
2044
+
2045
+ &test ("esi",-1); # NULL-pointer check
2046
+ &jz (&label("badpointer"));
2047
+ &test ("edi",-1); # NULL-pointer check
2048
+ &jz (&label("badpointer"));
2049
+
2050
+ &call (&label("pic_point")); # call/pop PIC idiom
2051
+ &set_label("pic_point");
2052
+ &blindpop($tbl);
2053
+ &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
2054
+ &lea ($tbl,&DWP(2048+128,$tbl)); # point past Te0..Te3 at Te4+128
2055
+
2056
+ # prefetch Te4
2057
+ &mov ("eax",&DWP(0-128,$tbl));
2058
+ &mov ("ebx",&DWP(32-128,$tbl));
2059
+ &mov ("ecx",&DWP(64-128,$tbl));
2060
+ &mov ("edx",&DWP(96-128,$tbl));
2061
+ &mov ("eax",&DWP(128-128,$tbl));
2062
+ &mov ("ebx",&DWP(160-128,$tbl));
2063
+ &mov ("ecx",&DWP(192-128,$tbl));
2064
+ &mov ("edx",&DWP(224-128,$tbl));
2065
+
2066
+ &mov ("ecx",&wparam(2)); # number of bits in key
2067
+ &cmp ("ecx",128);
2068
+ &je (&label("10rounds"));
2069
+ &cmp ("ecx",192);
2070
+ &je (&label("12rounds"));
2071
+ &cmp ("ecx",256);
2072
+ &je (&label("14rounds"));
2073
+ &mov ("eax",-2); # invalid number of bits
2074
+ &jmp (&label("exit"));
2075
+
2076
+ &set_label("10rounds"); # AES-128 schedule
2077
+ &mov ("eax",&DWP(0,"esi")); # copy first 4 dwords
2078
+ &mov ("ebx",&DWP(4,"esi"));
2079
+ &mov ("ecx",&DWP(8,"esi"));
2080
+ &mov ("edx",&DWP(12,"esi"));
2081
+ &mov (&DWP(0,"edi"),"eax");
2082
+ &mov (&DWP(4,"edi"),"ebx");
2083
+ &mov (&DWP(8,"edi"),"ecx");
2084
+ &mov (&DWP(12,"edi"),"edx");
2085
+
2086
+ &xor ("ecx","ecx"); # ecx = rcon index / loop counter
2087
+ &jmp (&label("10shortcut"));
2088
+
2089
+ &align (4);
2090
+ &set_label("10loop");
2091
+ &mov ("eax",&DWP(0,"edi")); # rk[0]
2092
+ &mov ("edx",&DWP(12,"edi")); # rk[3]
2093
+ &set_label("10shortcut");
2094
+ &enckey (); # eax = rk[0]^SubWord(RotWord(rk[3]))^rcon
2095
+
2096
+ &mov (&DWP(16,"edi"),"eax"); # rk[4]
2097
+ &xor ("eax",&DWP(4,"edi"));
2098
+ &mov (&DWP(20,"edi"),"eax"); # rk[5]
2099
+ &xor ("eax",&DWP(8,"edi"));
2100
+ &mov (&DWP(24,"edi"),"eax"); # rk[6]
2101
+ &xor ("eax",&DWP(12,"edi"));
2102
+ &mov (&DWP(28,"edi"),"eax"); # rk[7]
2103
+ &inc ("ecx");
2104
+ &add ("edi",16);
2105
+ &cmp ("ecx",10);
2106
+ &jl (&label("10loop"));
2107
+
2108
+ &mov (&DWP(80,"edi"),10); # setup number of rounds (edi advanced 160, so this is 240(key))
2109
+ &xor ("eax","eax"); # return success
2110
+ &jmp (&label("exit"));
2111
+
2112
+ &set_label("12rounds"); # AES-192 schedule
2113
+ &mov ("eax",&DWP(0,"esi")); # copy first 6 dwords
2114
+ &mov ("ebx",&DWP(4,"esi"));
2115
+ &mov ("ecx",&DWP(8,"esi"));
2116
+ &mov ("edx",&DWP(12,"esi"));
2117
+ &mov (&DWP(0,"edi"),"eax");
2118
+ &mov (&DWP(4,"edi"),"ebx");
2119
+ &mov (&DWP(8,"edi"),"ecx");
2120
+ &mov (&DWP(12,"edi"),"edx");
2121
+ &mov ("ecx",&DWP(16,"esi"));
2122
+ &mov ("edx",&DWP(20,"esi"));
2123
+ &mov (&DWP(16,"edi"),"ecx");
2124
+ &mov (&DWP(20,"edi"),"edx");
2125
+
2126
+ &xor ("ecx","ecx"); # ecx = rcon index / loop counter
2127
+ &jmp (&label("12shortcut"));
2128
+
2129
+ &align (4);
2130
+ &set_label("12loop");
2131
+ &mov ("eax",&DWP(0,"edi")); # rk[0]
2132
+ &mov ("edx",&DWP(20,"edi")); # rk[5]
2133
+ &set_label("12shortcut");
2134
+ &enckey (); # eax = rk[0]^SubWord(RotWord(rk[5]))^rcon
2135
+
2136
+ &mov (&DWP(24,"edi"),"eax"); # rk[6]
2137
+ &xor ("eax",&DWP(4,"edi"));
2138
+ &mov (&DWP(28,"edi"),"eax"); # rk[7]
2139
+ &xor ("eax",&DWP(8,"edi"));
2140
+ &mov (&DWP(32,"edi"),"eax"); # rk[8]
2141
+ &xor ("eax",&DWP(12,"edi"));
2142
+ &mov (&DWP(36,"edi"),"eax"); # rk[9]
2143
+
2144
+ &cmp ("ecx",7); # last iteration stops short of rk[10..11]
2145
+ &je (&label("12break"));
2146
+ &inc ("ecx");
2147
+
2148
+ &xor ("eax",&DWP(16,"edi"));
2149
+ &mov (&DWP(40,"edi"),"eax"); # rk[10]
2150
+ &xor ("eax",&DWP(20,"edi"));
2151
+ &mov (&DWP(44,"edi"),"eax"); # rk[11]
2152
+
2153
+ &add ("edi",24);
2154
+ &jmp (&label("12loop"));
2155
+
2156
+ &set_label("12break");
2157
+ &mov (&DWP(72,"edi"),12); # setup number of rounds (edi advanced 168, so this is 240(key))
2158
+ &xor ("eax","eax"); # return success
2159
+ &jmp (&label("exit"));
2160
+
2161
+ &set_label("14rounds"); # AES-256 schedule
2162
+ &mov ("eax",&DWP(0,"esi")); # copy first 8 dwords
2163
+ &mov ("ebx",&DWP(4,"esi"));
2164
+ &mov ("ecx",&DWP(8,"esi"));
2165
+ &mov ("edx",&DWP(12,"esi"));
2166
+ &mov (&DWP(0,"edi"),"eax");
2167
+ &mov (&DWP(4,"edi"),"ebx");
2168
+ &mov (&DWP(8,"edi"),"ecx");
2169
+ &mov (&DWP(12,"edi"),"edx");
2170
+ &mov ("eax",&DWP(16,"esi"));
2171
+ &mov ("ebx",&DWP(20,"esi"));
2172
+ &mov ("ecx",&DWP(24,"esi"));
2173
+ &mov ("edx",&DWP(28,"esi"));
2174
+ &mov (&DWP(16,"edi"),"eax");
2175
+ &mov (&DWP(20,"edi"),"ebx");
2176
+ &mov (&DWP(24,"edi"),"ecx");
2177
+ &mov (&DWP(28,"edi"),"edx");
2178
+
2179
+ &xor ("ecx","ecx"); # ecx = rcon index / loop counter
2180
+ &jmp (&label("14shortcut"));
2181
+
2182
+ &align (4);
2183
+ &set_label("14loop");
2184
+ &mov ("edx",&DWP(28,"edi")); # rk[7]
2185
+ &set_label("14shortcut");
2186
+ &mov ("eax",&DWP(0,"edi")); # rk[0]
2187
+
2188
+ &enckey (); # eax = rk[0]^SubWord(RotWord(rk[7]))^rcon
2189
+
2190
+ &mov (&DWP(32,"edi"),"eax"); # rk[8]
2191
+ &xor ("eax",&DWP(4,"edi"));
2192
+ &mov (&DWP(36,"edi"),"eax"); # rk[9]
2193
+ &xor ("eax",&DWP(8,"edi"));
2194
+ &mov (&DWP(40,"edi"),"eax"); # rk[10]
2195
+ &xor ("eax",&DWP(12,"edi"));
2196
+ &mov (&DWP(44,"edi"),"eax"); # rk[11]
2197
+
2198
+ &cmp ("ecx",6); # last iteration stops short of rk[12..15]
2199
+ &je (&label("14break"));
2200
+ &inc ("ecx");
2201
+
2202
+ # AES-256 extra step: SubWord (no rotate, no rcon) on the previous word.
+ &mov ("edx","eax");
2203
+ &mov ("eax",&DWP(16,"edi")); # rk[4]
2204
+ &movz ("esi",&LB("edx")); # rk[11]>>0
2205
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
2206
+ &movz ("esi",&HB("edx")); # rk[11]>>8
2207
+ &xor ("eax","ebx");
2208
+
2209
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
2210
+ &shr ("edx",16);
2211
+ &shl ("ebx",8);
2212
+ &movz ("esi",&LB("edx")); # rk[11]>>16
2213
+ &xor ("eax","ebx");
2214
+
2215
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
2216
+ &movz ("esi",&HB("edx")); # rk[11]>>24
2217
+ &shl ("ebx",16);
2218
+ &xor ("eax","ebx");
2219
+
2220
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
2221
+ &shl ("ebx",24);
2222
+ &xor ("eax","ebx");
2223
+
2224
+ &mov (&DWP(48,"edi"),"eax"); # rk[12]
2225
+ &xor ("eax",&DWP(20,"edi"));
2226
+ &mov (&DWP(52,"edi"),"eax"); # rk[13]
2227
+ &xor ("eax",&DWP(24,"edi"));
2228
+ &mov (&DWP(56,"edi"),"eax"); # rk[14]
2229
+ &xor ("eax",&DWP(28,"edi"));
2230
+ &mov (&DWP(60,"edi"),"eax"); # rk[15]
2231
+
2232
+ &add ("edi",32);
2233
+ &jmp (&label("14loop"));
2234
+
2235
+ &set_label("14break");
2236
+ &mov (&DWP(48,"edi"),14); # setup number of rounds (edi advanced 192, so this is 240(key))
2237
+ &xor ("eax","eax"); # return success
2238
+ &jmp (&label("exit"));
2239
+
2240
+ &set_label("badpointer");
2241
+ &mov ("eax",-1);
2242
+ &set_label("exit");
2243
+ &function_end("_x86_AES_set_encrypt_key");
2244
+
2245
+ # int asm_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
2246
+ # AES_KEY *key)
+ # Public entry point: thin wrapper that delegates to the worker and
+ # propagates its eax return code unchanged.
2247
+ &function_begin_B("asm_AES_set_encrypt_key");
2248
+ &call ("_x86_AES_set_encrypt_key");
2249
+ &ret ();
2250
+ &function_end_B("asm_AES_set_encrypt_key");
2251
+
2252
+ # Decrypt-key helper: emits code applying InvMixColumn to the schedule word
+ # $tp1, storing the result at 4*$i($key) and modulo-scheduling the load of
+ # the next word into $tp2. tp2/tp4/tp8 are successive GF(2^8) doublings
+ # (xtime) of tp1, built branch-free with the 0x80808080 high-bit mask and
+ # the 0x1b1b1b1b AES reduction-polynomial constant; the final value is
+ # combined from rotated xors of tp1/tp2/tp4/tp8. Clobbers $acc and $tbl.
+ sub deckey()
2253
+ { my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
2254
+ my $tmp = $tbl;
2255
+
2256
+ &mov ($tmp,0x80808080);
2257
+ &and ($tmp,$tp1); # isolate high bits of each byte
2258
+ &lea ($tp2,&DWP(0,$tp1,$tp1)); # tp2 = tp1<<1 (per-dword)
2259
+ &mov ($acc,$tmp);
2260
+ &shr ($tmp,7);
2261
+ &sub ($acc,$tmp); # 0x80 -> 0xff mask per byte
2262
+ &and ($tp2,0xfefefefe); # clear cross-byte carry bits
2263
+ &and ($acc,0x1b1b1b1b); # conditional reduction
2264
+ &xor ($tp2,$acc); # tp2 = xtime(tp1)
2265
+ &mov ($tmp,0x80808080);
2266
+
2267
+ &and ($tmp,$tp2);
2268
+ &lea ($tp4,&DWP(0,$tp2,$tp2));
2269
+ &mov ($acc,$tmp);
2270
+ &shr ($tmp,7);
2271
+ &sub ($acc,$tmp);
2272
+ &and ($tp4,0xfefefefe);
2273
+ &and ($acc,0x1b1b1b1b);
2274
+ &xor ($tp2,$tp1); # tp2^tp1
2275
+ &xor ($tp4,$acc); # tp4 = xtime(xtime(tp1))
2276
+ &mov ($tmp,0x80808080);
2277
+
2278
+ &and ($tmp,$tp4);
2279
+ &lea ($tp8,&DWP(0,$tp4,$tp4));
2280
+ &mov ($acc,$tmp);
2281
+ &shr ($tmp,7);
2282
+ &xor ($tp4,$tp1); # tp4^tp1
2283
+ &sub ($acc,$tmp);
2284
+ &and ($tp8,0xfefefefe);
2285
+ &and ($acc,0x1b1b1b1b);
2286
+ &rotl ($tp1,8); # = ROTATE(tp1,8)
2287
+ &xor ($tp8,$acc); # tp8 = xtime^3(tp1)
2288
+
2289
+ &mov ($tmp,&DWP(4*($i+1),$key)); # modulo-scheduled load
2290
+
2291
+ &xor ($tp1,$tp2);
2292
+ &xor ($tp2,$tp8);
2293
+ &xor ($tp1,$tp4);
2294
+ &rotl ($tp2,24);
2295
+ &xor ($tp4,$tp8);
2296
+ &xor ($tp1,$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
2297
+ &rotl ($tp4,16);
2298
+ &xor ($tp1,$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
2299
+ &rotl ($tp8,8);
2300
+ &xor ($tp1,$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
2301
+ &mov ($tp2,$tmp); # hand next word to the caller's rotation
2302
+ &xor ($tp1,$tp8); # ^= ROTATE(tp8,8)
2303
+
2304
+ &mov (&DWP(4*$i,$key),$tp1); # store InvMixColumn'd word
2305
+ }
2306
+
2307
+ # int asm_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
2308
+ # AES_KEY *key)
+ # Builds the decryption schedule: runs the encrypt-key worker, then
+ # (1) reverses the order of the 16-byte round-key chunks in place, and
+ # (2) applies deckey() (InvMixColumn) to every round key except the first
+ # and last. Returns the worker's error code unchanged on failure, 0 on
+ # success.
2309
+ &function_begin_B("asm_AES_set_decrypt_key");
2310
+ &call ("_x86_AES_set_encrypt_key");
2311
+ &cmp ("eax",0);
2312
+ &je (&label("proceed"));
2313
+ &ret (); # propagate worker's error code
2314
+
2315
+ &set_label("proceed");
2316
+ &push ("ebp"); # manual frame: function_begin_B emits no prologue
2317
+ &push ("ebx");
2318
+ &push ("esi");
2319
+ &push ("edi");
2320
+
2321
+ &mov ("esi",&wparam(2));
2322
+ &mov ("ecx",&DWP(240,"esi")); # pull number of rounds
2323
+ &lea ("ecx",&DWP(0,"","ecx",4)); # ecx = rounds*4 (dwords per schedule)
2324
+ &lea ("edi",&DWP(0,"esi","ecx",4)); # pointer to last chunk
2325
+
2326
+ &set_label("invert",4); # invert order of chunks
2327
+ &mov ("eax",&DWP(0,"esi")); # swap 16-byte chunks at esi and edi
2328
+ &mov ("ebx",&DWP(4,"esi"));
2329
+ &mov ("ecx",&DWP(0,"edi"));
2330
+ &mov ("edx",&DWP(4,"edi"));
2331
+ &mov (&DWP(0,"edi"),"eax");
2332
+ &mov (&DWP(4,"edi"),"ebx");
2333
+ &mov (&DWP(0,"esi"),"ecx");
2334
+ &mov (&DWP(4,"esi"),"edx");
2335
+ &mov ("eax",&DWP(8,"esi"));
2336
+ &mov ("ebx",&DWP(12,"esi"));
2337
+ &mov ("ecx",&DWP(8,"edi"));
2338
+ &mov ("edx",&DWP(12,"edi"));
2339
+ &mov (&DWP(8,"edi"),"eax");
2340
+ &mov (&DWP(12,"edi"),"ebx");
2341
+ &mov (&DWP(8,"esi"),"ecx");
2342
+ &mov (&DWP(12,"esi"),"edx");
2343
+ &add ("esi",16);
2344
+ &sub ("edi",16);
2345
+ &cmp ("esi","edi"); # meet in the middle
2346
+ &jne (&label("invert"));
2347
+
2348
+ &mov ($key,&wparam(2));
2349
+ &mov ($acc,&DWP(240,$key)); # pull number of rounds
2350
+ &lea ($acc,&DWP(-2,$acc,$acc)); # acc = 2*rounds-2
2351
+ &lea ($acc,&DWP(0,$key,$acc,8)); # end pointer: key + (2*rounds-2)*8
2352
+ &mov (&wparam(2),$acc); # reuse the bits arg slot as loop bound
2353
+
2354
+ &mov ($s0,&DWP(16,$key)); # modulo-scheduled load
2355
+ &set_label("permute",4); # permute the key schedule
2356
+ &add ($key,16);
2357
+ &deckey (0,$key,$s0,$s1,$s2,$s3);
2358
+ &deckey (1,$key,$s1,$s2,$s3,$s0);
2359
+ &deckey (2,$key,$s2,$s3,$s0,$s1);
2360
+ &deckey (3,$key,$s3,$s0,$s1,$s2);
2361
+ &cmp ($key,&wparam(2));
2362
+ &jb (&label("permute"));
2363
+
2364
+ &xor ("eax","eax"); # return success
2365
+ &function_end("asm_AES_set_decrypt_key");
2366
+ &asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>"); # embedded ID string
2367
+
2368
+ &asm_finish(); # flush/emit the generated assembly (x86asm.pl back end)