ring-native 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (261) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/Gemfile +3 -0
  4. data/README.md +22 -0
  5. data/Rakefile +1 -0
  6. data/ext/ring/extconf.rb +29 -0
  7. data/lib/ring/native.rb +8 -0
  8. data/lib/ring/native/version.rb +5 -0
  9. data/ring-native.gemspec +25 -0
  10. data/vendor/ring/BUILDING.md +40 -0
  11. data/vendor/ring/Cargo.toml +43 -0
  12. data/vendor/ring/LICENSE +185 -0
  13. data/vendor/ring/Makefile +35 -0
  14. data/vendor/ring/PORTING.md +163 -0
  15. data/vendor/ring/README.md +113 -0
  16. data/vendor/ring/STYLE.md +197 -0
  17. data/vendor/ring/appveyor.yml +27 -0
  18. data/vendor/ring/build.rs +108 -0
  19. data/vendor/ring/crypto/aes/aes.c +1142 -0
  20. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +25 -0
  21. data/vendor/ring/crypto/aes/aes_test.cc +93 -0
  22. data/vendor/ring/crypto/aes/asm/aes-586.pl +2368 -0
  23. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +1249 -0
  24. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +2246 -0
  25. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +1318 -0
  26. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +2084 -0
  27. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +675 -0
  28. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +1364 -0
  29. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +1565 -0
  30. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +841 -0
  31. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +1116 -0
  32. data/vendor/ring/crypto/aes/internal.h +87 -0
  33. data/vendor/ring/crypto/aes/mode_wrappers.c +61 -0
  34. data/vendor/ring/crypto/bn/add.c +394 -0
  35. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +694 -0
  36. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +1503 -0
  37. data/vendor/ring/crypto/bn/asm/bn-586.pl +774 -0
  38. data/vendor/ring/crypto/bn/asm/co-586.pl +287 -0
  39. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +1882 -0
  40. data/vendor/ring/crypto/bn/asm/x86-mont.pl +592 -0
  41. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +599 -0
  42. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +1393 -0
  43. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +3507 -0
  44. data/vendor/ring/crypto/bn/bn.c +352 -0
  45. data/vendor/ring/crypto/bn/bn_asn1.c +74 -0
  46. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +25 -0
  47. data/vendor/ring/crypto/bn/bn_test.cc +1696 -0
  48. data/vendor/ring/crypto/bn/cmp.c +200 -0
  49. data/vendor/ring/crypto/bn/convert.c +433 -0
  50. data/vendor/ring/crypto/bn/ctx.c +311 -0
  51. data/vendor/ring/crypto/bn/div.c +594 -0
  52. data/vendor/ring/crypto/bn/exponentiation.c +1335 -0
  53. data/vendor/ring/crypto/bn/gcd.c +711 -0
  54. data/vendor/ring/crypto/bn/generic.c +1019 -0
  55. data/vendor/ring/crypto/bn/internal.h +316 -0
  56. data/vendor/ring/crypto/bn/montgomery.c +516 -0
  57. data/vendor/ring/crypto/bn/mul.c +888 -0
  58. data/vendor/ring/crypto/bn/prime.c +829 -0
  59. data/vendor/ring/crypto/bn/random.c +334 -0
  60. data/vendor/ring/crypto/bn/rsaz_exp.c +262 -0
  61. data/vendor/ring/crypto/bn/rsaz_exp.h +53 -0
  62. data/vendor/ring/crypto/bn/shift.c +276 -0
  63. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +25 -0
  64. data/vendor/ring/crypto/bytestring/bytestring_test.cc +421 -0
  65. data/vendor/ring/crypto/bytestring/cbb.c +399 -0
  66. data/vendor/ring/crypto/bytestring/cbs.c +227 -0
  67. data/vendor/ring/crypto/bytestring/internal.h +46 -0
  68. data/vendor/ring/crypto/chacha/chacha_generic.c +140 -0
  69. data/vendor/ring/crypto/chacha/chacha_vec.c +323 -0
  70. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +1447 -0
  71. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +153 -0
  72. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +25 -0
  73. data/vendor/ring/crypto/cipher/e_aes.c +390 -0
  74. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +208 -0
  75. data/vendor/ring/crypto/cipher/internal.h +173 -0
  76. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +543 -0
  77. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +9 -0
  78. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +475 -0
  79. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +23 -0
  80. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +422 -0
  81. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +484 -0
  82. data/vendor/ring/crypto/cipher/test/cipher_test.txt +100 -0
  83. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +25 -0
  84. data/vendor/ring/crypto/constant_time_test.c +304 -0
  85. data/vendor/ring/crypto/cpu-arm-asm.S +32 -0
  86. data/vendor/ring/crypto/cpu-arm.c +199 -0
  87. data/vendor/ring/crypto/cpu-intel.c +261 -0
  88. data/vendor/ring/crypto/crypto.c +151 -0
  89. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +2118 -0
  90. data/vendor/ring/crypto/curve25519/curve25519.c +4888 -0
  91. data/vendor/ring/crypto/curve25519/x25519_test.cc +128 -0
  92. data/vendor/ring/crypto/digest/md32_common.h +181 -0
  93. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +2725 -0
  94. data/vendor/ring/crypto/ec/ec.c +193 -0
  95. data/vendor/ring/crypto/ec/ec_curves.c +61 -0
  96. data/vendor/ring/crypto/ec/ec_key.c +228 -0
  97. data/vendor/ring/crypto/ec/ec_montgomery.c +114 -0
  98. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +25 -0
  99. data/vendor/ring/crypto/ec/internal.h +243 -0
  100. data/vendor/ring/crypto/ec/oct.c +253 -0
  101. data/vendor/ring/crypto/ec/p256-64.c +1794 -0
  102. data/vendor/ring/crypto/ec/p256-x86_64-table.h +9548 -0
  103. data/vendor/ring/crypto/ec/p256-x86_64.c +509 -0
  104. data/vendor/ring/crypto/ec/simple.c +1007 -0
  105. data/vendor/ring/crypto/ec/util-64.c +183 -0
  106. data/vendor/ring/crypto/ec/wnaf.c +508 -0
  107. data/vendor/ring/crypto/ecdh/ecdh.c +155 -0
  108. data/vendor/ring/crypto/ecdsa/ecdsa.c +304 -0
  109. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +193 -0
  110. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +25 -0
  111. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +327 -0
  112. data/vendor/ring/crypto/header_removed.h +17 -0
  113. data/vendor/ring/crypto/internal.h +495 -0
  114. data/vendor/ring/crypto/libring.Windows.vcxproj +101 -0
  115. data/vendor/ring/crypto/mem.c +98 -0
  116. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +1045 -0
  117. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +517 -0
  118. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +1393 -0
  119. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +1741 -0
  120. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +422 -0
  121. data/vendor/ring/crypto/modes/ctr.c +226 -0
  122. data/vendor/ring/crypto/modes/gcm.c +1206 -0
  123. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +25 -0
  124. data/vendor/ring/crypto/modes/gcm_test.c +348 -0
  125. data/vendor/ring/crypto/modes/internal.h +299 -0
  126. data/vendor/ring/crypto/perlasm/arm-xlate.pl +170 -0
  127. data/vendor/ring/crypto/perlasm/readme +100 -0
  128. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +1164 -0
  129. data/vendor/ring/crypto/perlasm/x86asm.pl +292 -0
  130. data/vendor/ring/crypto/perlasm/x86gas.pl +263 -0
  131. data/vendor/ring/crypto/perlasm/x86masm.pl +200 -0
  132. data/vendor/ring/crypto/perlasm/x86nasm.pl +187 -0
  133. data/vendor/ring/crypto/poly1305/poly1305.c +331 -0
  134. data/vendor/ring/crypto/poly1305/poly1305_arm.c +301 -0
  135. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +2015 -0
  136. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +25 -0
  137. data/vendor/ring/crypto/poly1305/poly1305_test.cc +80 -0
  138. data/vendor/ring/crypto/poly1305/poly1305_test.txt +52 -0
  139. data/vendor/ring/crypto/poly1305/poly1305_vec.c +892 -0
  140. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +75 -0
  141. data/vendor/ring/crypto/rand/internal.h +32 -0
  142. data/vendor/ring/crypto/rand/rand.c +189 -0
  143. data/vendor/ring/crypto/rand/urandom.c +219 -0
  144. data/vendor/ring/crypto/rand/windows.c +56 -0
  145. data/vendor/ring/crypto/refcount_c11.c +66 -0
  146. data/vendor/ring/crypto/refcount_lock.c +53 -0
  147. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +25 -0
  148. data/vendor/ring/crypto/refcount_test.c +58 -0
  149. data/vendor/ring/crypto/rsa/blinding.c +462 -0
  150. data/vendor/ring/crypto/rsa/internal.h +108 -0
  151. data/vendor/ring/crypto/rsa/padding.c +300 -0
  152. data/vendor/ring/crypto/rsa/rsa.c +450 -0
  153. data/vendor/ring/crypto/rsa/rsa_asn1.c +261 -0
  154. data/vendor/ring/crypto/rsa/rsa_impl.c +944 -0
  155. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +25 -0
  156. data/vendor/ring/crypto/rsa/rsa_test.cc +437 -0
  157. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +436 -0
  158. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +2390 -0
  159. data/vendor/ring/crypto/sha/asm/sha256-586.pl +1275 -0
  160. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +735 -0
  161. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +14 -0
  162. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +14 -0
  163. data/vendor/ring/crypto/sha/asm/sha512-586.pl +911 -0
  164. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +666 -0
  165. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +14 -0
  166. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +14 -0
  167. data/vendor/ring/crypto/sha/sha1.c +271 -0
  168. data/vendor/ring/crypto/sha/sha256.c +204 -0
  169. data/vendor/ring/crypto/sha/sha512.c +355 -0
  170. data/vendor/ring/crypto/test/file_test.cc +326 -0
  171. data/vendor/ring/crypto/test/file_test.h +181 -0
  172. data/vendor/ring/crypto/test/malloc.cc +150 -0
  173. data/vendor/ring/crypto/test/scoped_types.h +95 -0
  174. data/vendor/ring/crypto/test/test.Windows.vcxproj +35 -0
  175. data/vendor/ring/crypto/test/test_util.cc +46 -0
  176. data/vendor/ring/crypto/test/test_util.h +41 -0
  177. data/vendor/ring/crypto/thread_none.c +55 -0
  178. data/vendor/ring/crypto/thread_pthread.c +165 -0
  179. data/vendor/ring/crypto/thread_test.Windows.vcxproj +25 -0
  180. data/vendor/ring/crypto/thread_test.c +200 -0
  181. data/vendor/ring/crypto/thread_win.c +282 -0
  182. data/vendor/ring/examples/checkdigest.rs +103 -0
  183. data/vendor/ring/include/openssl/aes.h +121 -0
  184. data/vendor/ring/include/openssl/arm_arch.h +129 -0
  185. data/vendor/ring/include/openssl/base.h +156 -0
  186. data/vendor/ring/include/openssl/bn.h +794 -0
  187. data/vendor/ring/include/openssl/buffer.h +18 -0
  188. data/vendor/ring/include/openssl/bytestring.h +235 -0
  189. data/vendor/ring/include/openssl/chacha.h +37 -0
  190. data/vendor/ring/include/openssl/cmac.h +76 -0
  191. data/vendor/ring/include/openssl/cpu.h +184 -0
  192. data/vendor/ring/include/openssl/crypto.h +43 -0
  193. data/vendor/ring/include/openssl/curve25519.h +88 -0
  194. data/vendor/ring/include/openssl/ec.h +225 -0
  195. data/vendor/ring/include/openssl/ec_key.h +129 -0
  196. data/vendor/ring/include/openssl/ecdh.h +110 -0
  197. data/vendor/ring/include/openssl/ecdsa.h +156 -0
  198. data/vendor/ring/include/openssl/err.h +201 -0
  199. data/vendor/ring/include/openssl/mem.h +101 -0
  200. data/vendor/ring/include/openssl/obj_mac.h +71 -0
  201. data/vendor/ring/include/openssl/opensslfeatures.h +68 -0
  202. data/vendor/ring/include/openssl/opensslv.h +18 -0
  203. data/vendor/ring/include/openssl/ossl_typ.h +18 -0
  204. data/vendor/ring/include/openssl/poly1305.h +51 -0
  205. data/vendor/ring/include/openssl/rand.h +70 -0
  206. data/vendor/ring/include/openssl/rsa.h +399 -0
  207. data/vendor/ring/include/openssl/thread.h +133 -0
  208. data/vendor/ring/include/openssl/type_check.h +71 -0
  209. data/vendor/ring/mk/Common.props +63 -0
  210. data/vendor/ring/mk/Windows.props +42 -0
  211. data/vendor/ring/mk/WindowsTest.props +18 -0
  212. data/vendor/ring/mk/appveyor.bat +62 -0
  213. data/vendor/ring/mk/bottom_of_makefile.mk +54 -0
  214. data/vendor/ring/mk/ring.mk +266 -0
  215. data/vendor/ring/mk/top_of_makefile.mk +214 -0
  216. data/vendor/ring/mk/travis.sh +40 -0
  217. data/vendor/ring/mk/update-travis-yml.py +229 -0
  218. data/vendor/ring/ring.sln +153 -0
  219. data/vendor/ring/src/aead.rs +682 -0
  220. data/vendor/ring/src/agreement.rs +248 -0
  221. data/vendor/ring/src/c.rs +129 -0
  222. data/vendor/ring/src/constant_time.rs +37 -0
  223. data/vendor/ring/src/der.rs +96 -0
  224. data/vendor/ring/src/digest.rs +690 -0
  225. data/vendor/ring/src/digest_tests.txt +57 -0
  226. data/vendor/ring/src/ecc.rs +28 -0
  227. data/vendor/ring/src/ecc_build.rs +279 -0
  228. data/vendor/ring/src/ecc_curves.rs +117 -0
  229. data/vendor/ring/src/ed25519_tests.txt +2579 -0
  230. data/vendor/ring/src/exe_tests.rs +46 -0
  231. data/vendor/ring/src/ffi.rs +29 -0
  232. data/vendor/ring/src/file_test.rs +187 -0
  233. data/vendor/ring/src/hkdf.rs +153 -0
  234. data/vendor/ring/src/hkdf_tests.txt +59 -0
  235. data/vendor/ring/src/hmac.rs +414 -0
  236. data/vendor/ring/src/hmac_tests.txt +97 -0
  237. data/vendor/ring/src/input.rs +312 -0
  238. data/vendor/ring/src/lib.rs +41 -0
  239. data/vendor/ring/src/pbkdf2.rs +265 -0
  240. data/vendor/ring/src/pbkdf2_tests.txt +113 -0
  241. data/vendor/ring/src/polyfill.rs +57 -0
  242. data/vendor/ring/src/rand.rs +28 -0
  243. data/vendor/ring/src/signature.rs +314 -0
  244. data/vendor/ring/third-party/NIST/README.md +9 -0
  245. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +263 -0
  246. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +309 -0
  247. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +267 -0
  248. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +263 -0
  249. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +309 -0
  250. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +267 -0
  251. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +263 -0
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +309 -0
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +267 -0
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +519 -0
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +309 -0
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +523 -0
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +519 -0
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +309 -0
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +523 -0
  260. data/vendor/ring/third-party/NIST/sha256sums.txt +1 -0
  261. metadata +333 -0
@@ -0,0 +1,25 @@
1
+ <?xml version="1.0" encoding="utf-8"?>
2
+ <Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
3
+ <PropertyGroup Label="Globals">
4
+ <ProjectGuid>{1C3071CC-26DA-4790-B48A-3936DDD0E7E7}</ProjectGuid>
5
+ <TargetName>aes_test</TargetName>
6
+ </PropertyGroup>
7
+ <ImportGroup Label="PropertySheets">
8
+ <Import Project="..\..\mk\WindowsTest.props" />
9
+ </ImportGroup>
10
+ <PropertyGroup Label="Configuration">
11
+ <OutDir>$(OutRootDir)test\ring\crypto\aes\</OutDir>
12
+ </PropertyGroup>
13
+ <ItemGroup>
14
+ <ClCompile Include="aes_test.cc" />
15
+ </ItemGroup>
16
+ <ItemGroup>
17
+ <ProjectReference Include="..\libring.Windows.vcxproj">
18
+ <Project>{f4c0a1b6-5e09-41c8-8242-3e1f6762fb18}</Project>
19
+ </ProjectReference>
20
+ <ProjectReference Include="..\test\test.Windows.vcxproj">
21
+ <Project>{1dace503-6498-492d-b1ff-f9ee18624443}</Project>
22
+ </ProjectReference>
23
+ </ItemGroup>
24
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
25
+ </Project>
@@ -0,0 +1,93 @@
1
+ /* Copyright (c) 2015, Google Inc.
2
+ *
3
+ * Permission to use, copy, modify, and/or distribute this software for any
4
+ * purpose with or without fee is hereby granted, provided that the above
5
+ * copyright notice and this permission notice appear in all copies.
6
+ *
7
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
10
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
12
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
13
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
14
+
15
+ #include <stdio.h>
16
+ #include <string.h>
17
+
18
+ #include <openssl/aes.h>
19
+ #include <openssl/crypto.h>
20
+ // TestAES checks AES_encrypt and AES_decrypt against one plaintext/ciphertext pair, out-of-place and in-place, using a key of |key_len| bytes.
21
+ // Returns true if every check passes; otherwise prints a message to stderr and returns false at the first failure.
22
+ static bool TestAES(const uint8_t *key, size_t key_len,
23
+ const uint8_t plaintext[AES_BLOCK_SIZE],
24
+ const uint8_t ciphertext[AES_BLOCK_SIZE]) {
25
+ AES_KEY aes_key;
26
+ if (AES_set_encrypt_key(key, key_len * 8, &aes_key) != 0) {
27
+ fprintf(stderr, "AES_set_encrypt_key failed\n");
28
+ return false;
29
+ }
30
+
31
+ // Test encryption.
32
+ uint8_t block[AES_BLOCK_SIZE];
33
+ AES_encrypt(plaintext, block, &aes_key);
34
+ if (memcmp(block, ciphertext, AES_BLOCK_SIZE) != 0) {
35
+ fprintf(stderr, "AES_encrypt gave the wrong output\n");
36
+ return false;
37
+ }
38
+
39
+ // Test in-place encryption.
40
+ memcpy(block, plaintext, AES_BLOCK_SIZE);
41
+ AES_encrypt(block, block, &aes_key);
42
+ if (memcmp(block, ciphertext, AES_BLOCK_SIZE) != 0) {
43
+ fprintf(stderr, "AES_encrypt gave the wrong output\n");
44
+ return false;
45
+ }
46
+ // Re-key |aes_key| for decryption with the same key bytes.
47
+ if (AES_set_decrypt_key(key, key_len * 8, &aes_key) != 0) {
48
+ fprintf(stderr, "AES_set_decrypt_key failed\n");
49
+ return false;
50
+ }
51
+
52
+ // Test decryption.
53
+ AES_decrypt(ciphertext, block, &aes_key);
54
+ if (memcmp(block, plaintext, AES_BLOCK_SIZE) != 0) {
55
+ fprintf(stderr, "AES_decrypt gave the wrong output\n");
56
+ return false;
57
+ }
58
+
59
+ // Test in-place decryption.
60
+ memcpy(block, ciphertext, AES_BLOCK_SIZE);
61
+ AES_decrypt(block, block, &aes_key);
62
+ if (memcmp(block, plaintext, AES_BLOCK_SIZE) != 0) {
63
+ fprintf(stderr, "AES_decrypt gave the wrong output\n");
64
+ return false;
65
+ }
66
+ return true;
67
+ }
68
+
69
+ int main() {
70
+ CRYPTO_library_init();
71
+ // Exit status: 0 if both vectors below pass, 1 as soon as one fails.
72
+ // Test vectors from FIPS-197, Appendix C.
73
+ if (!TestAES((const uint8_t *)"\x00\x01\x02\x03\x04\x05\x06\x07"
74
+ "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f",
75
+ 128 / 8,
76
+ (const uint8_t *)"\x00\x11\x22\x33\x44\x55\x66\x77"
77
+ "\x88\x99\xaa\xbb\xcc\xdd\xee\xff",
78
+ (const uint8_t *)"\x69\xc4\xe0\xd8\x6a\x7b\x04\x30"
79
+ "\xd8\xcd\xb7\x80\x70\xb4\xc5\x5a") ||
80
+ !TestAES((const uint8_t *)"\x00\x01\x02\x03\x04\x05\x06\x07"
81
+ "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f"
82
+ "\x10\x11\x12\x13\x14\x15\x16\x17"
83
+ "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f",
84
+ 256 / 8,
85
+ (const uint8_t *)"\x00\x11\x22\x33\x44\x55\x66\x77"
86
+ "\x88\x99\xaa\xbb\xcc\xdd\xee\xff",
87
+ (const uint8_t *)"\x8e\xa2\xb7\xca\x51\x67\x45\xbf"
88
+ "\xea\xfc\x49\x90\x4b\x49\x60\x89")) {
89
+ return 1; // non-zero exit; `return false` converts to 0 and would report success
90
+ }
91
+
92
+ return 0;
93
+ }
@@ -0,0 +1,2368 @@
1
+ #!/usr/bin/env perl
2
+ #
3
+ # ====================================================================
4
+ # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5
+ # project. The module is, however, dual licensed under OpenSSL and
6
+ # CRYPTOGAMS licenses depending on where you obtain it. For further
7
+ # details see http://www.openssl.org/~appro/cryptogams/.
8
+ # ====================================================================
9
+ #
10
+ # Version 4.3.
11
+ #
12
+ # You might fail to appreciate this module performance from the first
13
+ # try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
14
+ # to be *the* best Intel C compiler without -KPIC, performance appears
15
+ # to be virtually identical... But try to re-configure with shared
16
+ # library support... Aha! Intel compiler "suddenly" lags behind by 30%
17
+ # [on P4, more on others]:-) And if compared to position-independent
18
+ # code generated by GNU C, this code performs *more* than *twice* as
19
+ # fast! Yes, all this buzz about PIC means that unlike other hand-
20
+ # coded implementations, this one was explicitly designed to be safe
21
+ # to use even in shared library context... This also means that this
22
+ # code isn't necessarily absolutely fastest "ever," because in order
23
+ # to achieve position independence an extra register has to be
24
+ # off-loaded to stack, which affects the benchmark result.
25
+ #
26
+ # Special note about instruction choice. Do you recall RC4_INT code
27
+ # performing poorly on P4? It might be the time to figure out why.
28
+ # RC4_INT code implies effective address calculations in base+offset*4
29
+ # form. Trouble is that it seems that offset scaling turned to be
30
+ # critical path... At least eliminating scaling resulted in 2.8x RC4
31
+ # performance improvement [as you might recall]. As AES code is hungry
32
+ # for scaling too, I [try to] avoid the latter by favoring off-by-2
33
+ # shifts and masking the result with 0xFF<<2 instead of "boring" 0xFF.
34
+ #
35
+ # As was shown by Dean Gaudet <dean@arctic.org>, the above note turned
36
+ # void. Performance improvement with off-by-2 shifts was observed on
37
+ # intermediate implementation, which was spilling yet another register
38
+ # to stack... Final offset*4 code below runs just a tad faster on P4,
39
+ # but exhibits up to 10% improvement on other cores.
40
+ #
41
+ # Second version is "monolithic" replacement for aes_core.c, which in
42
+ # addition to AES_[de|en]crypt implements AES_set_[de|en]cryption_key.
43
+ # This made it possible to implement little-endian variant of the
44
+ # algorithm without modifying the base C code. Motivating factor for
45
+ # the undertaken effort was that it appeared that in tight IA-32
46
+ # register window little-endian flavor could achieve slightly higher
47
+ # Instruction Level Parallelism, and it indeed resulted in up to 15%
48
+ # better performance on most recent µ-archs...
49
+ #
50
+ # Third version adds AES_cbc_encrypt implementation, which resulted in
51
+ # up to 40% performance improvement of CBC benchmark results. 40% was
52
+ # observed on P4 core, where "overall" improvement coefficient, i.e. if
53
+ # compared to PIC generated by GCC and in CBC mode, was observed to be
54
+ # as large as 4x:-) CBC performance is virtually identical to ECB now
55
+ # and on some platforms even better, e.g. 17.6 "small" cycles/byte on
56
+ # Opteron, because certain function prologues and epilogues are
57
+ # effectively taken out of the loop...
58
+ #
59
+ # Version 3.2 implements compressed tables and prefetch of these tables
60
+ # in CBC[!] mode. Former means that 3/4 of table references are now
61
+ # misaligned, which unfortunately has negative impact on elder IA-32
62
+ # implementations, Pentium suffered 30% penalty, PIII - 10%.
63
+ #
64
+ # Version 3.3 avoids L1 cache aliasing between stack frame and
65
+ # S-boxes, and 3.4 - L1 cache aliasing even between key schedule. The
66
+ # latter is achieved by copying the key schedule to controlled place in
67
+ # stack. This unfortunately has rather strong impact on small block CBC
68
+ # performance, ~2x deterioration on 16-byte block if compared to 3.3.
69
+ #
70
+ # Version 3.5 checks if there is L1 cache aliasing between user-supplied
71
+ # key schedule and S-boxes and abstains from copying the former if
72
+ # there is no. This allows end-user to consciously retain small block
73
+ # performance by aligning key schedule in specific manner.
74
+ #
75
+ # Version 3.6 compresses Td4 to 256 bytes and prefetches it in ECB.
76
+ #
77
+ # Current ECB performance numbers for 128-bit key in CPU cycles per
78
+ # processed byte [measure commonly used by AES benchmarkers] are:
79
+ #
80
+ # small footprint fully unrolled
81
+ # P4 24 22
82
+ # AMD K8 20 19
83
+ # PIII 25 23
84
+ # Pentium 81 78
85
+ #
86
+ # Version 3.7 reimplements outer rounds as "compact." Meaning that
87
+ # first and last rounds reference compact 256 bytes S-box. This means
88
+ # that first round consumes a lot more CPU cycles and that encrypt
89
+ # and decrypt performance becomes asymmetric. Encrypt performance
90
+ # drops by 10-12%, while decrypt - by 20-25%:-( 256 bytes S-box is
91
+ # aggressively pre-fetched.
92
+ #
93
+ # Version 4.0 effectively rolls back to 3.6 and instead implements
94
+ # additional set of functions, _[x86|sse]_AES_[en|de]crypt_compact,
95
+ # which use exclusively 256 byte S-box. These functions are to be
96
+ # called in modes not concealing plain text, such as ECB, or when
97
+ # we're asked to process smaller amount of data [or unconditionally
98
+ # on hyper-threading CPU]. Currently it's called unconditionally from
99
+ # AES_[en|de]crypt, which affects all modes, but CBC. CBC routine
100
+ # still needs to be modified to switch between slower and faster
101
+ # mode when appropriate... But in either case benchmark landscape
102
+ # changes dramatically and below numbers are CPU cycles per processed
103
+ # byte for 128-bit key.
104
+ #
105
+ # ECB encrypt ECB decrypt CBC large chunk
106
+ # P4 52[54] 83[95] 23
107
+ # AMD K8 46[41] 66[70] 18
108
+ # PIII 41[50] 60[77] 24
109
+ # Core 2 31[36] 45[64] 18.5
110
+ # Atom 76[100] 96[138] 60
111
+ # Pentium 115 150 77
112
+ #
113
+ # Version 4.1 switches to compact S-box even in key schedule setup.
114
+ #
115
+ # Version 4.2 prefetches compact S-box in every SSE round or in other
116
+ # words every cache-line is *guaranteed* to be accessed within ~50
117
+ # cycles window. Why just SSE? Because it's needed on hyper-threading
118
+ # CPU! Which is also why it's prefetched with 64 byte stride. Best
119
+ # part is that it has no negative effect on performance:-)
120
+ #
121
+ # Version 4.3 implements switch between compact and non-compact block
122
+ # functions in AES_cbc_encrypt depending on how much data was asked
123
+ # to be processed in one stroke.
124
+ #
125
+ ######################################################################
126
+ # Timing attacks are classified in two classes: synchronous when
127
+ # attacker consciously initiates cryptographic operation and collects
128
+ # timing data of various character afterwards, and asynchronous when
129
+ # malicious code is executed on same CPU simultaneously with AES,
130
+ # instruments itself and performs statistical analysis of this data.
131
+ #
132
+ # As far as synchronous attacks go the root to the AES timing
133
+ # vulnerability is twofold. Firstly, of 256 S-box elements at most 160
134
+ # are referred to in single 128-bit block operation. Well, in C
135
+ # implementation with 4 distinct tables it's actually as little as 40
136
+ # references per 256 elements table, but anyway... Secondly, even
137
+ # though S-box elements are clustered into smaller amount of cache-
138
+ # lines, smaller than 160 and even 40, it turned out that for certain
139
+ # plain-text pattern[s] or simply put chosen plain-text and given key
140
+ # few cache-lines remain unaccessed during block operation. Now, if
141
+ # attacker can figure out this access pattern, he can deduce the key
142
+ # [or at least part of it]. The natural way to mitigate this kind of
143
+ # attacks is to minimize the amount of cache-lines in S-box and/or
144
+ # prefetch them to ensure that every one is accessed for more uniform
145
+ # timing. But note that *if* plain-text was concealed in such way that
146
+ # input to block function is distributed *uniformly*, then attack
147
+ # wouldn't apply. Now note that some encryption modes, most notably
148
+ # CBC, do mask the plain-text in this exact way [secure cipher output
149
+ # is distributed uniformly]. Yes, one still might find input that
150
+ # would reveal the information about given key, but if amount of
151
+ # candidate inputs to be tried is larger than amount of possible key
152
+ # combinations then attack becomes infeasible. This is why revised
153
+ # AES_cbc_encrypt "dares" to switch to larger S-box when larger chunk
154
+ # of data is to be processed in one stroke. The current size limit of
155
+ # 512 bytes is chosen to provide same [diminishingly low] probability
156
+ # for cache-line to remain untouched in large chunk operation with
157
+ # large S-box as for single block operation with compact S-box and
158
+ # surely needs more careful consideration...
159
+ #
160
+ # As for asynchronous attacks. There are two flavours: attacker code
161
+ # being interleaved with AES on hyper-threading CPU at *instruction*
162
+ # level, and two processes time sharing single core. As for latter.
163
+ # Two vectors. 1. Given that attacker process has higher priority,
164
+ # yield execution to process performing AES just before timer fires
165
+ # off the scheduler, immediately regain control of CPU and analyze the
166
+ # cache state. For this attack to be efficient attacker would have to
167
+ # effectively slow down the operation by several *orders* of magnitude,
168
+ # by ratio of time slice to duration of handful of AES rounds, which
169
+ # is unlikely to remain unnoticed. Not to mention that this also means
170
+ # that he would spend correspondingly more time to collect enough
171
+ # statistical data to mount the attack. It's probably appropriate to
172
+ # say that if adversary reckons that this attack is beneficial and
173
+ # risks to be noticed, you probably have larger problems having him
174
+ # mere opportunity. In other words suggested code design expects you
175
+ # to preclude/mitigate this attack by overall system security design.
176
+ # 2. Attacker manages to make his code interrupt driven. In order for
177
+ # this kind of attack to be feasible, interrupt rate has to be high
178
+ # enough, again comparable to duration of handful of AES rounds. But
179
+ # is there interrupt source of such rate? Hardly, not even 1Gbps NIC
180
+ # generates interrupts at such raging rate...
181
+ #
182
+ # And now back to the former, hyper-threading CPU or more specifically
183
+ # Intel P4. Recall that asynchronous attack implies that malicious
184
+ # code instruments itself. And naturally instrumentation granularity
185
+ # has to be noticeably lower than duration of codepath accessing S-box.
186
+ # Given that all cache-lines are accessed during that time that is.
187
+ # Current implementation accesses *all* cache-lines within ~50 cycles
188
+ # window, which is actually *less* than RDTSC latency on Intel P4!
189
+
190
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
191
+ push(@INC,"${dir}","${dir}../../perlasm");
192
+ require "x86asm.pl";
193
+
194
+ &asm_init($ARGV[0],"aes-586.pl",$x86only = $ARGV[$#ARGV] eq "386");
195
+ &static_label("AES_Te");
196
+ &static_label("AES_Td");
197
+
198
+ $s0="eax";
199
+ $s1="ebx";
200
+ $s2="ecx";
201
+ $s3="edx";
202
+ $key="edi";
203
+ $acc="esi";
204
+ $tbl="ebp";
205
+
206
+ # stack frame layout in _[x86|sse]_AES_* routines, frame is allocated
207
+ # by caller
208
+ $__ra=&DWP(0,"esp"); # return address
209
+ $__s0=&DWP(4,"esp"); # s0 backing store
210
+ $__s1=&DWP(8,"esp"); # s1 backing store
211
+ $__s2=&DWP(12,"esp"); # s2 backing store
212
+ $__s3=&DWP(16,"esp"); # s3 backing store
213
+ $__key=&DWP(20,"esp"); # pointer to key schedule
214
+ $__end=&DWP(24,"esp"); # pointer to end of key schedule
215
+ $__tbl=&DWP(28,"esp"); # %ebp backing store
216
+
217
+ # stack frame layout in AES_[en|de]crypt routines, which differs from
218
+ # above by 4 and overlaps by %ebp backing store
219
+ $_tbl=&DWP(24,"esp");
220
+ $_esp=&DWP(28,"esp");
221
+
222
+ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } } # calls data_word($i,$i) for each argument in order, stopping at the first undefined one
223
+
224
+ $speed_limit=512; # chunks smaller than $speed_limit are
225
+ # processed with compact routine in CBC mode
226
+ $small_footprint=1; # $small_footprint=1 [looped instead of unrolled
227
+ # rounds] code is ~5% slower on recent µ-archs, but ~5 times smaller!
228
+ # I favor compact code to minimize cache
229
+ # contention and in hope to "collect" 5% back
230
+ # in real-life applications...
231
+
232
+ $vertical_spin=0; # shift "vertically" [use encvert() instead of encstep()]; defaults to 0, because of
233
+ # its proof-of-concept status...
234
+ # Note that there is no decvert(), as well as last encryption round is
235
+ # performed with "horizontal" shifts. This is because this "vertical"
236
+ # implementation [one which groups shifts on a given $s[i] to form a
237
+ # "column," unlike "horizontal" one, which groups shifts on different
238
+ # $s[i] to form a "row"] is work in progress. It was observed to run
239
+ # few percents faster on Intel cores, but not AMD. On AMD K8 core it's
240
+ # whole 12% slower:-( So we face a trade-off... Shall it be resolved
241
+ # some day? Till then the code is considered experimental and by
242
+ # default remains dormant...
243
+
244
+ sub encvert() # emit one table-lookup round in "vertical" order: all four byte lookups of one input word are issued together
245
+ { my ($te,@s) = @_; # $te: register holding the table base; @s: the four state registers (s0..s3), updated in place
246
+ my ($v0,$v1) = ($acc,$key); # both scratch registers; $key is reloaded from $__key at the end
247
+
248
+ &mov ($v0,$s[3]); # copy s3
249
+ &mov (&DWP(4,"esp"),$s[2]); # save s2
250
+ &mov ($v1,$s[0]); # copy s0
251
+ &mov (&DWP(8,"esp"),$s[1]); # save s1
252
+
253
+ &movz ($s[2],&HB($s[0])); # byte 1 of s0, used as index two lines below
254
+ &and ($s[0],0xFF); # byte 0 of s0
255
+ &mov ($s[0],&DWP(0,$te,$s[0],8)); # s0>>0
256
+ &shr ($v1,16); # bring the upper half of s0 down
257
+ &mov ($s[3],&DWP(3,$te,$s[2],8)); # s0>>8
258
+ &movz ($s[1],&HB($v1)); # byte 3 of s0
259
+ &and ($v1,0xFF); # byte 2 of s0
260
+ &mov ($s[2],&DWP(2,$te,$v1,8)); # s0>>16
261
+ &mov ($v1,$v0); # second copy of s3
262
+ &mov ($s[1],&DWP(1,$te,$s[1],8)); # s0>>24
263
+
264
+ &and ($v0,0xFF); # byte 0 of s3
265
+ &xor ($s[3],&DWP(0,$te,$v0,8)); # s3>>0
266
+ &movz ($v0,&HB($v1)); # byte 1 of s3
267
+ &shr ($v1,16); # bring the upper half of s3 down
268
+ &xor ($s[2],&DWP(3,$te,$v0,8)); # s3>>8
269
+ &movz ($v0,&HB($v1)); # byte 3 of s3
270
+ &and ($v1,0xFF); # byte 2 of s3
271
+ &xor ($s[1],&DWP(2,$te,$v1,8)); # s3>>16
272
+ &mov ($v1,&DWP(4,"esp")); # restore s2
273
+ &xor ($s[0],&DWP(1,$te,$v0,8)); # s3>>24
274
+
275
+ &mov ($v0,$v1); # second copy of s2
276
+ &and ($v1,0xFF); # byte 0 of s2
277
+ &xor ($s[2],&DWP(0,$te,$v1,8)); # s2>>0
278
+ &movz ($v1,&HB($v0)); # byte 1 of s2
279
+ &shr ($v0,16); # bring the upper half of s2 down
280
+ &xor ($s[1],&DWP(3,$te,$v1,8)); # s2>>8
281
+ &movz ($v1,&HB($v0)); # byte 3 of s2
282
+ &and ($v0,0xFF); # byte 2 of s2
283
+ &xor ($s[0],&DWP(2,$te,$v0,8)); # s2>>16
284
+ &mov ($v0,&DWP(8,"esp")); # restore s1
285
+ &xor ($s[3],&DWP(1,$te,$v1,8)); # s2>>24
286
+
287
+ &mov ($v1,$v0); # second copy of s1
288
+ &and ($v0,0xFF); # byte 0 of s1
289
+ &xor ($s[1],&DWP(0,$te,$v0,8)); # s1>>0
290
+ &movz ($v0,&HB($v1)); # byte 1 of s1
291
+ &shr ($v1,16); # bring the upper half of s1 down
292
+ &xor ($s[0],&DWP(3,$te,$v0,8)); # s1>>8
293
+ &movz ($v0,&HB($v1)); # byte 3 of s1
294
+ &and ($v1,0xFF); # byte 2 of s1
295
+ &xor ($s[3],&DWP(2,$te,$v1,8)); # s1>>16
296
+ &mov ($key,$__key); # reincarnate v1 as key
297
+ &xor ($s[2],&DWP(1,$te,$v0,8)); # s1>>24
298
+ }
299
+
300
+ # Another experimental routine, which features "horizontal spin," but
301
+ # eliminates one reference to stack. Strangely enough runs slower...
302
+ sub enchoriz() # experimental round body; takes no arguments and works on the file-level $s0..$s3 directly
303
+ { my ($v0,$v1) = ($key,$acc); # NOTE(review): $te is used below but never bound in this sub, and no file-level $te is visible in this chunk - confirm before enabling
304
+
305
+ &movz ($v0,&LB($s0)); # 3, 2, 1, 0*
306
+ &rotr ($s2,8); # 8,11,10, 9
307
+ &mov ($v1,&DWP(0,$te,$v0,8)); # 0
308
+ &movz ($v0,&HB($s1)); # 7, 6, 5*, 4
309
+ &rotr ($s3,16); # 13,12,15,14
310
+ &xor ($v1,&DWP(3,$te,$v0,8)); # 5
311
+ &movz ($v0,&HB($s2)); # 8,11,10*, 9
312
+ &rotr ($s0,16); # 1, 0, 3, 2
313
+ &xor ($v1,&DWP(2,$te,$v0,8)); # 10
314
+ &movz ($v0,&HB($s3)); # 13,12,15*,14
315
+ &xor ($v1,&DWP(1,$te,$v0,8)); # 15, t[0] collected
316
+ &mov ($__s0,$v1); # t[0] saved
317
+
318
+ &movz ($v0,&LB($s1)); # 7, 6, 5, 4*
319
+ &shr ($s1,16); # -, -, 7, 6
320
+ &mov ($v1,&DWP(0,$te,$v0,8)); # 4
321
+ &movz ($v0,&LB($s3)); # 13,12,15,14*
322
+ &xor ($v1,&DWP(2,$te,$v0,8)); # 14
323
+ &movz ($v0,&HB($s0)); # 1, 0, 3*, 2
324
+ &and ($s3,0xffff0000); # 13,12, -, -
325
+ &xor ($v1,&DWP(1,$te,$v0,8)); # 3
326
+ &movz ($v0,&LB($s2)); # 8,11,10, 9*
327
+ &or ($s3,$s1); # 13,12, 7, 6
328
+ &xor ($v1,&DWP(3,$te,$v0,8)); # 9, t[1] collected
329
+ &mov ($s1,$v1); # s[1]=t[1]
330
+
331
+ &movz ($v0,&LB($s0)); # 1, 0, 3, 2*
332
+ &shr ($s2,16); # -, -, 8,11
333
+ &mov ($v1,&DWP(2,$te,$v0,8)); # 2
334
+ &movz ($v0,&HB($s3)); # 13,12, 7*, 6
335
+ &xor ($v1,&DWP(1,$te,$v0,8)); # 7
336
+ &movz ($v0,&HB($s2)); # -, -, 8*,11
337
+ &xor ($v1,&DWP(0,$te,$v0,8)); # 8
338
+ &mov ($v0,$s3); # copy so that byte 13 can be shifted down without losing $s3
339
+ &shr ($v0,24); # 13
340
+ &xor ($v1,&DWP(3,$te,$v0,8)); # 13, t[2] collected
341
+
342
+ &movz ($v0,&LB($s2)); # -, -, 8,11*
343
+ &shr ($s0,24); # 1*
344
+ &mov ($s2,&DWP(1,$te,$v0,8)); # 11
345
+ &xor ($s2,&DWP(3,$te,$s0,8)); # 1
346
+ &mov ($s0,$__s0); # s[0]=t[0]
347
+ &movz ($v0,&LB($s3)); # 13,12, 7, 6*
348
+ &shr ($s3,16); # , ,13,12
349
+ &xor ($s2,&DWP(2,$te,$v0,8)); # 6
350
+ &mov ($key,$__key); # reincarnate v0 as key
351
+ &and ($s3,0xff); # , ,13,12*
352
+ &mov ($s3,&DWP(0,$te,$s3,8)); # 12
353
+ &xor ($s3,$s2); # s[3]=t[3] collected
354
+ &mov ($s2,$v1); # s[2]=t[2]
355
+ }
356
+
357
+ # More experimental code... SSE one... Even though this one eliminates
358
+ # *all* references to stack, it's not faster...
359
+ sub sse_encbody() # experimental MMX round body: takes no arguments, hard-codes eax/ebx/ecx/edx and mm0-mm6, uses $acc as index scratch
360
+ { # the trailing numbers are state byte indexes; presumably eax/ebx hold bytes 5,4,1,0 / 15,14,11,10 on entry, which is also how this body leaves them
361
+ &movz ($acc,&LB("eax")); # 0
362
+ &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 0
363
+ &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
364
+ &movz ("edx",&HB("eax")); # 1
365
+ &mov ("edx",&DWP(3,$tbl,"edx",8)); # 1
366
+ &shr ("eax",16); # 5, 4
367
+
368
+ &movz ($acc,&LB("ebx")); # 10
369
+ &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 10
370
+ &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
371
+ &movz ($acc,&HB("ebx")); # 11
372
+ &xor ("edx",&DWP(1,$tbl,$acc,8)); # 11
373
+ &shr ("ebx",16); # 15,14
374
+
375
+ &movz ($acc,&HB("eax")); # 5
376
+ &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 5
377
+ &movq ("mm3",QWP(16,$key)); # 8 bytes of key material at $key+16, xored into t[0,1] below
378
+ &movz ($acc,&HB("ebx")); # 15
379
+ &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 15
380
+ &movd ("mm0","ecx"); # t[0] collected
381
+
382
+ &movz ($acc,&LB("eax")); # 4
383
+ &mov ("ecx",&DWP(0,$tbl,$acc,8)); # 4
384
+ &movd ("eax","mm2"); # 7, 6, 3, 2
385
+ &movz ($acc,&LB("ebx")); # 14
386
+ &xor ("ecx",&DWP(2,$tbl,$acc,8)); # 14
387
+ &movd ("ebx","mm6"); # 13,12, 9, 8
388
+
389
+ &movz ($acc,&HB("eax")); # 3
390
+ &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 3
391
+ &movz ($acc,&HB("ebx")); # 9
392
+ &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 9
393
+ &movd ("mm1","ecx"); # t[1] collected
394
+
395
+ &movz ($acc,&LB("eax")); # 2
396
+ &mov ("ecx",&DWP(2,$tbl,$acc,8)); # 2
397
+ &shr ("eax",16); # 7, 6
398
+ &punpckldq ("mm0","mm1"); # t[0,1] collected
399
+ &movz ($acc,&LB("ebx")); # 8
400
+ &xor ("ecx",&DWP(0,$tbl,$acc,8)); # 8
401
+ &shr ("ebx",16); # 13,12
402
+
403
+ &movz ($acc,&HB("eax")); # 7
404
+ &xor ("ecx",&DWP(1,$tbl,$acc,8)); # 7
405
+ &pxor ("mm0","mm3"); # t[0,1] ^= key material loaded from $key+16
406
+ &movz ("eax",&LB("eax")); # 6
407
+ &xor ("edx",&DWP(2,$tbl,"eax",8)); # 6
408
+ &pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0
409
+ &movz ($acc,&HB("ebx")); # 13
410
+ &xor ("ecx",&DWP(3,$tbl,$acc,8)); # 13
411
+ &xor ("ecx",&DWP(24,$key)); # t[2]
412
+ &movd ("mm4","ecx"); # t[2] collected
413
+ &movz ("ebx",&LB("ebx")); # 12
414
+ &xor ("edx",&DWP(0,$tbl,"ebx",8)); # 12
415
+ &shr ("ecx",16); # keep the upper half of t[2] (bytes 11,10) in the low 16 bits
416
+ &movd ("eax","mm1"); # 5, 4, 1, 0
417
+ &mov ("ebx",&DWP(28,$key)); # t[3]
418
+ &xor ("ebx","edx"); # ebx = word at $key+28 ^ the lookups accumulated in edx
419
+ &movd ("mm5","ebx"); # t[3] collected
420
+ &and ("ebx",0xffff0000); # keep bytes 15,14 of t[3]
421
+ &or ("ebx","ecx"); # ebx = 15,14,11,10
422
+
423
+ &punpckldq ("mm4","mm5"); # t[2,3] collected
424
+ }
425
+
426
+ ######################################################################
427
+ # "Compact" block function
428
+ ######################################################################
429
+
430
+ sub enccompact() # emit step $i (0..3) of a byte-table ("compact") round: four byte lookups at -128($te) shifted and merged into one output word
431
+ { my $Fn = \&mov; # emitter used for the $key reload in step 3
432
+ while ($#_>5) { pop(@_); $Fn=sub{}; } # any argument beyond the six regular ones is dropped and turns that reload into a no-op
433
+ my ($i,$te,@s)=@_; # $i: step number; $te: table base register; @s: state registers, rotated by the caller for each step
434
+ my $tmp = $key; # $key doubles as scratch in steps 0..2
435
+ my $out = $i==3?$s[0]:$acc; # step 3 builds its word in place in $s[0], the other steps build it in $acc
436
+
437
+ # $Fn emits the reload of $key from the stack in step 3. When the
438
+ # caller passes an extra trailing argument, $Fn is an empty sub and
439
+ # the reload is skipped, so $key is not restored by this step...
440
+ if ($i==3) { &$Fn ($key,$__key); }##%edx
441
+ else { &mov ($out,$s[0]); }
442
+ &and ($out,0xFF); # index = low byte
443
+ if ($i==1) { &shr ($s[0],16); }#%ebx[1]
444
+ if ($i==2) { &shr ($s[0],24); }#%ecx[2]
445
+ &movz ($out,&BP(-128,$te,$out,1)); # byte lookup, lands in bits 0-7
446
+
447
+ if ($i==3) { $tmp=$s[1]; }##%eax
448
+ &movz ($tmp,&HB($s[1])); # index = second byte of $s[1]
449
+ &movz ($tmp,&BP(-128,$te,$tmp,1));
450
+ &shl ($tmp,8); # move to bits 8-15
451
+ &xor ($out,$tmp);
452
+
453
+ if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
454
+ else { &mov ($tmp,$s[2]);
455
+ &shr ($tmp,16); }
456
+ if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
457
+ &and ($tmp,0xFF); # index = third byte in steps 0..2; in step 3 the register is used as left by the earlier steps
458
+ &movz ($tmp,&BP(-128,$te,$tmp,1));
459
+ &shl ($tmp,16); # move to bits 16-23
460
+ &xor ($out,$tmp);
461
+
462
+ if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
463
+ elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
464
+ else { &mov ($tmp,$s[3]);
465
+ &shr ($tmp,24); }
466
+ &movz ($tmp,&BP(-128,$te,$tmp,1));
467
+ &shl ($tmp,24); # move to bits 24-31
468
+ &xor ($out,$tmp);
469
+ if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } # steps 0 and 1 park their word in the 4(%esp)/8(%esp) slots ($__s0/$__s1)
470
+ if ($i==3) { &mov ($s[3],$acc); } # step 3 moves the word step 2 left in $acc into its state register
471
+ &comment();
472
+ }
473
+
474
+ sub enctransform() # emit the in-place column transform of state word $i; $tbl must hold 0x80808080 on entry; clobbers $acc, $key and $tbl
475
+ { my @s = ($s0,$s1,$s2,$s3);
476
+ my $i = shift; # index (0..3) of the state word to transform
477
+ my $tmp = $tbl; # $tbl is used as a scratch/mask register here
478
+ my $r2 = $key ; # $key is used as scratch; the visible caller reloads it from $__key afterwards
479
+
480
+ &and ($tmp,$s[$i]); # keep only the top bit of every byte
481
+ &lea ($r2,&DWP(0,$s[$i],$s[$i])); # r0+r0, i.e. every byte shifted left by one (carries removed two lines below)
482
+ &mov ($acc,$tmp);
483
+ &shr ($tmp,7); # 0x01 in every byte whose top bit was set
484
+ &and ($r2,0xfefefefe); # drop bits carried in from the neighbouring byte
485
+ &sub ($acc,$tmp); # 0x80-0x01 = 0x7f in every byte whose top bit was set
486
+ &mov ($tmp,$s[$i]);
487
+ &and ($acc,0x1b1b1b1b); # 0x1b in every byte whose top bit was set
488
+ &rotr ($tmp,16); # ROTATE(r0,16)
489
+ &xor ($acc,$r2); # r2
490
+ &mov ($r2,$s[$i]); # $r2 register now holds r0
491
+
492
+ &xor ($s[$i],$acc); # r0 ^ r2
493
+ &rotr ($r2,16+8); # r0 rotated right by 24
494
+ &xor ($acc,$tmp); # r2 ^ ROTATE(r0,16)
495
+ &rotl ($s[$i],24); # ROTATE(r2^r0,24)
496
+ &xor ($acc,$r2); # ... ^ (r0 rotated right by 24)
497
+ &mov ($tmp,0x80808080) if ($i!=1); # re-arm the mask in $tbl for the next call; skipped for $i==1, which the visible caller invokes last
498
+ &xor ($s[$i],$acc); # ROTATE(r2^r0,24) ^ r2
499
+ }
500
+
501
+ &function_begin_B("_x86_AES_encrypt_compact"); # state in $s0..$s3, key schedule pointer in $key, byte-table pointer (biased by +128) in $tbl
502
+ # note that caller is expected to allocate stack frame for me!
503
+ &mov ($__key,$key); # save key
504
+
505
+ &xor ($s0,&DWP(0,$key)); # xor with key
506
+ &xor ($s1,&DWP(4,$key));
507
+ &xor ($s2,&DWP(8,$key));
508
+ &xor ($s3,&DWP(12,$key));
509
+
510
+ &mov ($acc,&DWP(240,$key)); # load key->rounds
511
+ &lea ($acc,&DWP(-2,$acc,$acc)); # 2*rounds-2
512
+ &lea ($acc,&DWP(0,$key,$acc,8)); # key + 16*(rounds-1)
513
+ &mov ($__end,$acc); # end of key schedule
514
+
515
+ # prefetch Te4: one dummy load every 32 bytes across the 256-byte table
516
+ &mov ($key,&DWP(0-128,$tbl));
517
+ &mov ($acc,&DWP(32-128,$tbl));
518
+ &mov ($key,&DWP(64-128,$tbl));
519
+ &mov ($acc,&DWP(96-128,$tbl));
520
+ &mov ($key,&DWP(128-128,$tbl));
521
+ &mov ($acc,&DWP(160-128,$tbl));
522
+ &mov ($key,&DWP(192-128,$tbl));
523
+ &mov ($acc,&DWP(224-128,$tbl));
524
+
525
+ &set_label("loop",16);
526
+
527
+ &enccompact(0,$tbl,$s0,$s1,$s2,$s3,1); # the extra trailing argument suppresses the $key reload inside step 3
528
+ &enccompact(1,$tbl,$s1,$s2,$s3,$s0,1);
529
+ &enccompact(2,$tbl,$s2,$s3,$s0,$s1,1);
530
+ &enccompact(3,$tbl,$s3,$s0,$s1,$s2,1);
531
+ &mov ($tbl,0x80808080); # mask expected by enctransform; the table pointer is restored from $__tbl below
532
+ &enctransform(2);
533
+ &enctransform(3);
534
+ &enctransform(0);
535
+ &enctransform(1);
536
+ &mov ($key,$__key); # enctransform used $key as scratch
537
+ &mov ($tbl,$__tbl); # bring the table pointer back
538
+ &add ($key,16); # advance rd_key
539
+ &xor ($s0,&DWP(0,$key));
540
+ &xor ($s1,&DWP(4,$key));
541
+ &xor ($s2,&DWP(8,$key));
542
+ &xor ($s3,&DWP(12,$key));
543
+
544
+ &cmp ($key,$__end);
545
+ &mov ($__key,$key); # mov leaves the flags from the cmp above intact
546
+ &jb (&label("loop")); # unsigned compare: keep looping while $key is below $__end
547
+
548
+ &enccompact(0,$tbl,$s0,$s1,$s2,$s3); # last round: byte substitution only, no enctransform; step 3 reloads $key
549
+ &enccompact(1,$tbl,$s1,$s2,$s3,$s0);
550
+ &enccompact(2,$tbl,$s2,$s3,$s0,$s1);
551
+ &enccompact(3,$tbl,$s3,$s0,$s1,$s2);
552
+
553
+ &xor ($s0,&DWP(16,$key)); # final round key sits 16 bytes past the last one used in the loop
554
+ &xor ($s1,&DWP(20,$key));
555
+ &xor ($s2,&DWP(24,$key));
556
+ &xor ($s3,&DWP(28,$key));
557
+
558
+ &ret ();
559
+ &function_end_B("_x86_AES_encrypt_compact");
560
+
561
+ ######################################################################
562
+ # "Compact" SSE block function.
563
+ ######################################################################
564
+ #
565
+ # Performance is not actually extraordinary in comparison to pure
566
+ # x86 code. In particular encrypt performance is virtually the same.
567
+ # Decrypt performance on the other hand is 15-20% better on newer
568
+ # µ-archs [but we're thankful for *any* improvement here], and ~50%
569
+ # better on PIII:-) And additionally on the pros side this code
570
+ # eliminates redundant references to stack and thus relieves/
571
+ # minimizes the pressure on the memory bus.
572
+ #
573
+ # MMX register layout lsb
574
+ # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
575
+ # | mm4 | mm0 |
576
+ # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
577
+ # | s3 | s2 | s1 | s0 |
578
+ # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
579
+ # |15|14|13|12|11|10| 9| 8| 7| 6| 5| 4| 3| 2| 1| 0|
580
+ # +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
581
+ #
582
+ # Indexes translate as s[N/4]>>(8*(N%4)), e.g. 5 means s1>>8.
583
+ # In these terms encryption and decryption "compact" permutation
583
+ # matrices can be depicted as follows:
585
+ #
586
+ # encryption lsb # decryption lsb
587
+ # +----++----+----+----+----+ # +----++----+----+----+----+
588
+ # | t0 || 15 | 10 | 5 | 0 | # | t0 || 7 | 10 | 13 | 0 |
589
+ # +----++----+----+----+----+ # +----++----+----+----+----+
590
+ # | t1 || 3 | 14 | 9 | 4 | # | t1 || 11 | 14 | 1 | 4 |
591
+ # +----++----+----+----+----+ # +----++----+----+----+----+
592
+ # | t2 || 7 | 2 | 13 | 8 | # | t2 || 15 | 2 | 5 | 8 |
593
+ # +----++----+----+----+----+ # +----++----+----+----+----+
594
+ # | t3 || 11 | 6 | 1 | 12 | # | t3 || 3 | 6 | 9 | 12 |
595
+ # +----++----+----+----+----+ # +----++----+----+----+----+
596
+ #
597
+ ######################################################################
598
+ # Why not xmm registers? Short answer. It was actually tested and
599
+ # was not any faster, but on the *contrary*, most notably on Intel CPUs.
600
+ # Longer answer. Main advantage of using mm registers is that movd
601
+ # latency is lower, especially on Intel P4. While arithmetic
602
+ # instructions are twice as many, they can be scheduled every cycle
603
+ # and not every second one when they are operating on xmm register,
604
+ # so that "arithmetic throughput" remains virtually the same. And
605
+ # finally the code can be executed even on elder SSE-only CPUs:-)
606
+
607
+ sub sse_enccompact() # emit one byte-table substitution/permutation pass over the state held in mm0 (bytes 0-7) and mm4 (bytes 8-15); the result is left in mm0/mm4
608
+ { # takes no arguments; uses eax/ebx/ecx/edx, $acc, mm1/mm2/mm5/mm6 and $key as scratch ($key is parked in $__key and restored)
609
+ &pshufw ("mm1","mm0",0x08); # 5, 4, 1, 0
610
+ &pshufw ("mm5","mm4",0x0d); # 15,14,11,10
611
+ &movd ("eax","mm1"); # 5, 4, 1, 0
612
+ &movd ("ebx","mm5"); # 15,14,11,10
613
+ &mov ($__key,$key); # free $key for use as an index register
614
+
615
+ &movz ($acc,&LB("eax")); # 0
616
+ &movz ("edx",&HB("eax")); # 1
617
+ &pshufw ("mm2","mm0",0x0d); # 7, 6, 3, 2
618
+ &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
619
+ &movz ($key,&LB("ebx")); # 10
620
+ &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
621
+ &shr ("eax",16); # 5, 4
622
+ &shl ("edx",8); # 1
623
+
624
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 10
625
+ &movz ($key,&HB("ebx")); # 11
626
+ &shl ($acc,16); # 10
627
+ &pshufw ("mm6","mm4",0x08); # 13,12, 9, 8
628
+ &or ("ecx",$acc); # 10
629
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 11
630
+ &movz ($key,&HB("eax")); # 5
631
+ &shl ($acc,24); # 11
632
+ &shr ("ebx",16); # 15,14
633
+ &or ("edx",$acc); # 11
634
+
635
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 5
636
+ &movz ($key,&HB("ebx")); # 15
637
+ &shl ($acc,8); # 5
638
+ &or ("ecx",$acc); # 5
639
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 15
640
+ &movz ($key,&LB("eax")); # 4
641
+ &shl ($acc,24); # 15
642
+ &or ("ecx",$acc); # 15
643
+
644
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 4
645
+ &movz ($key,&LB("ebx")); # 14
646
+ &movd ("eax","mm2"); # 7, 6, 3, 2
647
+ &movd ("mm0","ecx"); # t[0] collected
648
+ &movz ("ecx",&BP(-128,$tbl,$key,1)); # 14
649
+ &movz ($key,&HB("eax")); # 3
650
+ &shl ("ecx",16); # 14
651
+ &movd ("ebx","mm6"); # 13,12, 9, 8
652
+ &or ("ecx",$acc); # 14
653
+
654
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 3
655
+ &movz ($key,&HB("ebx")); # 9
656
+ &shl ($acc,24); # 3
657
+ &or ("ecx",$acc); # 3
658
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 9
659
+ &movz ($key,&LB("ebx")); # 8
660
+ &shl ($acc,8); # 9
661
+ &shr ("ebx",16); # 13,12
662
+ &or ("ecx",$acc); # 9
663
+
664
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 8
665
+ &movz ($key,&LB("eax")); # 2
666
+ &shr ("eax",16); # 7, 6
667
+ &movd ("mm1","ecx"); # t[1] collected
668
+ &movz ("ecx",&BP(-128,$tbl,$key,1)); # 2
669
+ &movz ($key,&HB("eax")); # 7
670
+ &shl ("ecx",16); # 2
671
+ &and ("eax",0xff); # 6
672
+ &or ("ecx",$acc); # 2
673
+
674
+ &punpckldq ("mm0","mm1"); # t[0,1] collected
675
+
676
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 7
677
+ &movz ($key,&HB("ebx")); # 13
678
+ &shl ($acc,24); # 7
679
+ &and ("ebx",0xff); # 12
680
+ &movz ("eax",&BP(-128,$tbl,"eax",1)); # 6
681
+ &or ("ecx",$acc); # 7
682
+ &shl ("eax",16); # 6
683
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 13
684
+ &or ("edx","eax"); # 6
685
+ &shl ($acc,8); # 13
686
+ &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 12
687
+ &or ("ecx",$acc); # 13
688
+ &or ("edx","ebx"); # 12
689
+ &mov ($key,$__key); # restore the key pointer saved at the top
690
+ &movd ("mm4","ecx"); # t[2] collected
691
+ &movd ("mm5","edx"); # t[3] collected
692
+
693
+ &punpckldq ("mm4","mm5"); # t[2,3] collected
694
+ }
695
+
696
+ if (!$x86only) { # the MMX/SSE variant is not emitted when the generator was invoked with "386"
697
+ &function_begin_B("_sse_AES_encrypt_compact"); # state in mm0 (bytes 0-7) and mm4 (bytes 8-15), key schedule pointer in $key, byte-table pointer (biased by +128) in $tbl
698
+ &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0
699
+ &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8
700
+
701
+ # note that caller is expected to allocate stack frame for me!
702
+ &mov ($acc,&DWP(240,$key)); # load key->rounds
703
+ &lea ($acc,&DWP(-2,$acc,$acc)); # 2*rounds-2
704
+ &lea ($acc,&DWP(0,$key,$acc,8)); # key + 16*(rounds-1)
705
+ &mov ($__end,$acc); # end of key schedule
706
+
707
+ &mov ($s0,0x1b1b1b1b); # magic constant
708
+ &mov (&DWP(8,"esp"),$s0); # stored twice so that it can be read back as one
709
+ &mov (&DWP(12,"esp"),$s0); # 8-byte operand at 8(%esp) inside the loop
710
+
711
+ # prefetch Te4: one dummy load every 32 bytes across the 256-byte table
712
+ &mov ($s0,&DWP(0-128,$tbl));
713
+ &mov ($s1,&DWP(32-128,$tbl));
714
+ &mov ($s2,&DWP(64-128,$tbl));
715
+ &mov ($s3,&DWP(96-128,$tbl));
716
+ &mov ($s0,&DWP(128-128,$tbl));
717
+ &mov ($s1,&DWP(160-128,$tbl));
718
+ &mov ($s2,&DWP(192-128,$tbl));
719
+ &mov ($s3,&DWP(224-128,$tbl));
720
+
721
+ &set_label("loop",16);
722
+ &sse_enccompact(); # byte substitution + permutation, result in mm0/mm4
723
+ &add ($key,16); # advance to the next round key
724
+ &cmp ($key,$__end);
725
+ &ja (&label("out")); # unsigned compare: past $__end means this was the last round, which has no column transform
726
+
727
+ &movq ("mm2",&QWP(8,"esp")); # 0x1b in every byte
728
+ &pxor ("mm3","mm3"); &pxor ("mm7","mm7"); # zero, for the signed compare below
729
+ &movq ("mm1","mm0"); &movq ("mm5","mm4"); # r0
730
+ &pcmpgtb("mm3","mm0"); &pcmpgtb("mm7","mm4"); # 0xff in every byte whose top bit is set
731
+ &pand ("mm3","mm2"); &pand ("mm7","mm2"); # 0x1b in every byte whose top bit is set
732
+ &pshufw ("mm2","mm0",0xb1); &pshufw ("mm6","mm4",0xb1);# ROTATE(r0,16)
733
+ &paddb ("mm0","mm0"); &paddb ("mm4","mm4"); # every byte doubled, carries discarded
734
+ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # = r2
735
+ &pshufw ("mm3","mm2",0xb1); &pshufw ("mm7","mm6",0xb1);# r0
736
+ &pxor ("mm1","mm0"); &pxor ("mm5","mm4"); # r0^r2
737
+ &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(r0,16)
738
+
739
+ &movq ("mm2","mm3"); &movq ("mm6","mm7"); # second copy of r0 for the opposite shift
740
+ &pslld ("mm3",8); &pslld ("mm7",8);
741
+ &psrld ("mm2",24); &psrld ("mm6",24);
742
+ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= r0<<8
743
+ &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= r0>>24
744
+
745
+ &movq ("mm3","mm1"); &movq ("mm7","mm5"); # second copy of r0^r2 for the opposite shift
746
+ &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key)); # next round key
747
+ &psrld ("mm1",8); &psrld ("mm5",8);
748
+ &mov ($s0,&DWP(0-128,$tbl)); # dummy table loads, one every 64 bytes, interleaved with the MMX code
749
+ &pslld ("mm3",24); &pslld ("mm7",24);
750
+ &mov ($s1,&DWP(64-128,$tbl));
751
+ &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= (r2^r0)>>8
752
+ &mov ($s2,&DWP(128-128,$tbl));
753
+ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= (r2^r0)<<24
754
+ &mov ($s3,&DWP(192-128,$tbl));
755
+
756
+ &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # xor with the round key loaded above
757
+ &jmp (&label("loop"));
758
+
759
+ &set_label("out",16);
760
+ &pxor ("mm0",&QWP(0,$key)); # final round key
761
+ &pxor ("mm4",&QWP(8,$key));
762
+
763
+ &ret ();
764
+ &function_end_B("_sse_AES_encrypt_compact");
765
+ }
766
+
767
+ ######################################################################
768
+ # Vanilla block function.
769
+ ######################################################################
770
+
771
+ sub encstep() # emit step $i (0..3) of a full table-lookup round: four lookups in the 8-byte-stride table xored into one output word
772
+ { my ($i,$te,@s) = @_; # $i: step number; $te: table base register; @s: state registers, rotated by the caller for each step
773
+ my $tmp = $key; # $key doubles as scratch in steps 0..2
774
+ my $out = $i==3?$s[0]:$acc; # step 3 builds its word in place in $s[0], the other steps build it in $acc
775
+
776
+ # lines marked with #%e?x[i] denote "reordered" instructions...
777
+ if ($i==3) { &mov ($key,$__key); }##%edx
778
+ else { &mov ($out,$s[0]);
779
+ &and ($out,0xFF); }
780
+ if ($i==1) { &shr ($s[0],16); }#%ebx[1]
781
+ if ($i==2) { &shr ($s[0],24); }#%ecx[2]
782
+ &mov ($out,&DWP(0,$te,$out,8)); # table entry read at byte offset 0
783
+
784
+ if ($i==3) { $tmp=$s[1]; }##%eax
785
+ &movz ($tmp,&HB($s[1])); # index = second byte of $s[1]
786
+ &xor ($out,&DWP(3,$te,$tmp,8)); # same table read at byte offset 3 (entries are stored twice by _data_word)
787
+
788
+ if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
789
+ else { &mov ($tmp,$s[2]);
790
+ &shr ($tmp,16); }
791
+ if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
792
+ &and ($tmp,0xFF); # index = third byte in steps 0..2; in step 3 the register is used as left by the earlier steps
793
+ &xor ($out,&DWP(2,$te,$tmp,8)); # byte offset 2
794
+
795
+ if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
796
+ elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
797
+ else { &mov ($tmp,$s[3]);
798
+ &shr ($tmp,24) }
799
+ &xor ($out,&DWP(1,$te,$tmp,8)); # byte offset 1
800
+ if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } # steps 0 and 1 park their word in the 4(%esp)/8(%esp) slots ($__s0/$__s1)
801
+ if ($i==3) { &mov ($s[3],$acc); } # step 3 moves the word step 2 left in $acc into its state register
802
+ &comment();
803
+ }
804
+
805
+ sub enclast() # emit step $i (0..3) of the last round: same lookup pattern as encstep, but each loaded word is masked down to one byte lane
806
+ { my ($i,$te,@s)=@_; # $i: step number; $te: table base register; @s: state registers, rotated by the caller for each step
807
+ my $tmp = $key; # $key doubles as scratch in steps 0..2
808
+ my $out = $i==3?$s[0]:$acc; # step 3 builds its word in place in $s[0], the other steps build it in $acc
809
+
810
+ if ($i==3) { &mov ($key,$__key); }##%edx
811
+ else { &mov ($out,$s[0]); }
812
+ &and ($out,0xFF); # index = low byte
813
+ if ($i==1) { &shr ($s[0],16); }#%ebx[1]
814
+ if ($i==2) { &shr ($s[0],24); }#%ecx[2]
815
+ &mov ($out,&DWP(2,$te,$out,8)); # read at byte offset 2 so the wanted byte lands in bits 0-7
816
+ &and ($out,0x000000ff); # e.g. table entry 0 is 0xa56363c6 and this keeps 0x63
817
+
818
+ if ($i==3) { $tmp=$s[1]; }##%eax
819
+ &movz ($tmp,&HB($s[1])); # index = second byte of $s[1]
820
+ &mov ($tmp,&DWP(0,$te,$tmp,8));
821
+ &and ($tmp,0x0000ff00); # keep bits 8-15
822
+ &xor ($out,$tmp);
823
+
824
+ if ($i==3) { $tmp=$s[2]; &mov ($s[1],$__s0); }##%ebx
825
+ else { &mov ($tmp,$s[2]);
826
+ &shr ($tmp,16); }
827
+ if ($i==2) { &and ($s[1],0xFF); }#%edx[2]
828
+ &and ($tmp,0xFF); # index = third byte in steps 0..2; in step 3 the register is used as left by the earlier steps
829
+ &mov ($tmp,&DWP(0,$te,$tmp,8));
830
+ &and ($tmp,0x00ff0000); # keep bits 16-23
831
+ &xor ($out,$tmp);
832
+
833
+ if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }##%ecx
834
+ elsif($i==2){ &movz ($tmp,&HB($s[3])); }#%ebx[2]
835
+ else { &mov ($tmp,$s[3]);
836
+ &shr ($tmp,24); }
837
+ &mov ($tmp,&DWP(2,$te,$tmp,8)); # read at byte offset 2 so the wanted byte lands in bits 24-31
838
+ &and ($tmp,0xff000000); # keep bits 24-31
839
+ &xor ($out,$tmp);
840
+ if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } # steps 0 and 1 park their word in the 4(%esp)/8(%esp) slots ($__s0/$__s1)
841
+ if ($i==3) { &mov ($s[3],$acc); } # step 3 moves the word step 2 left in $acc into its state register
842
+ }
843
+
844
+ &function_begin_B("_x86_AES_encrypt");
845
+ if ($vertical_spin) {
846
+ # I need high parts of volatile registers to be accessible...
847
+ &exch ($s1="edi",$key="ebx");
848
+ &mov ($s2="esi",$acc="ecx");
849
+ }
850
+
851
+ # note that caller is expected to allocate stack frame for me!
852
+ &mov ($__key,$key); # save key
853
+
854
+ &xor ($s0,&DWP(0,$key)); # xor with key
855
+ &xor ($s1,&DWP(4,$key));
856
+ &xor ($s2,&DWP(8,$key));
857
+ &xor ($s3,&DWP(12,$key));
858
+
859
+ &mov ($acc,&DWP(240,$key)); # load key->rounds
860
+
861
+ if ($small_footprint) {
862
+ &lea ($acc,&DWP(-2,$acc,$acc));
863
+ &lea ($acc,&DWP(0,$key,$acc,8));
864
+ &mov ($__end,$acc); # end of key schedule
865
+
866
+ &set_label("loop",16);
867
+ if ($vertical_spin) {
868
+ &encvert($tbl,$s0,$s1,$s2,$s3);
869
+ } else {
870
+ &encstep(0,$tbl,$s0,$s1,$s2,$s3);
871
+ &encstep(1,$tbl,$s1,$s2,$s3,$s0);
872
+ &encstep(2,$tbl,$s2,$s3,$s0,$s1);
873
+ &encstep(3,$tbl,$s3,$s0,$s1,$s2);
874
+ }
875
+ &add ($key,16); # advance rd_key
876
+ &xor ($s0,&DWP(0,$key));
877
+ &xor ($s1,&DWP(4,$key));
878
+ &xor ($s2,&DWP(8,$key));
879
+ &xor ($s3,&DWP(12,$key));
880
+ &cmp ($key,$__end);
881
+ &mov ($__key,$key);
882
+ &jb (&label("loop"));
883
+ }
884
+ else {
885
+ &cmp ($acc,10);
886
+ &jle (&label("10rounds"));
887
+ &cmp ($acc,12);
888
+ &jle (&label("12rounds"));
889
+
890
+ &set_label("14rounds",4);
891
+ for ($i=1;$i<3;$i++) {
892
+ if ($vertical_spin) {
893
+ &encvert($tbl,$s0,$s1,$s2,$s3);
894
+ } else {
895
+ &encstep(0,$tbl,$s0,$s1,$s2,$s3);
896
+ &encstep(1,$tbl,$s1,$s2,$s3,$s0);
897
+ &encstep(2,$tbl,$s2,$s3,$s0,$s1);
898
+ &encstep(3,$tbl,$s3,$s0,$s1,$s2);
899
+ }
900
+ &xor ($s0,&DWP(16*$i+0,$key));
901
+ &xor ($s1,&DWP(16*$i+4,$key));
902
+ &xor ($s2,&DWP(16*$i+8,$key));
903
+ &xor ($s3,&DWP(16*$i+12,$key));
904
+ }
905
+ &add ($key,32);
906
+ &mov ($__key,$key); # advance rd_key
907
+ &set_label("12rounds",4);
908
+ for ($i=1;$i<3;$i++) {
909
+ if ($vertical_spin) {
910
+ &encvert($tbl,$s0,$s1,$s2,$s3);
911
+ } else {
912
+ &encstep(0,$tbl,$s0,$s1,$s2,$s3);
913
+ &encstep(1,$tbl,$s1,$s2,$s3,$s0);
914
+ &encstep(2,$tbl,$s2,$s3,$s0,$s1);
915
+ &encstep(3,$tbl,$s3,$s0,$s1,$s2);
916
+ }
917
+ &xor ($s0,&DWP(16*$i+0,$key));
918
+ &xor ($s1,&DWP(16*$i+4,$key));
919
+ &xor ($s2,&DWP(16*$i+8,$key));
920
+ &xor ($s3,&DWP(16*$i+12,$key));
921
+ }
922
+ &add ($key,32);
923
+ &mov ($__key,$key); # advance rd_key
924
+ &set_label("10rounds",4);
925
+ for ($i=1;$i<10;$i++) {
926
+ if ($vertical_spin) {
927
+ &encvert($tbl,$s0,$s1,$s2,$s3);
928
+ } else {
929
+ &encstep(0,$tbl,$s0,$s1,$s2,$s3);
930
+ &encstep(1,$tbl,$s1,$s2,$s3,$s0);
931
+ &encstep(2,$tbl,$s2,$s3,$s0,$s1);
932
+ &encstep(3,$tbl,$s3,$s0,$s1,$s2);
933
+ }
934
+ &xor ($s0,&DWP(16*$i+0,$key));
935
+ &xor ($s1,&DWP(16*$i+4,$key));
936
+ &xor ($s2,&DWP(16*$i+8,$key));
937
+ &xor ($s3,&DWP(16*$i+12,$key));
938
+ }
939
+ }
940
+
941
+ if ($vertical_spin) {
942
+ # "reincarnate" some registers for "horizontal" spin...
943
+ &mov ($s1="ebx",$key="edi");
944
+ &mov ($s2="ecx",$acc="esi");
945
+ }
946
+ &enclast(0,$tbl,$s0,$s1,$s2,$s3);
947
+ &enclast(1,$tbl,$s1,$s2,$s3,$s0);
948
+ &enclast(2,$tbl,$s2,$s3,$s0,$s1);
949
+ &enclast(3,$tbl,$s3,$s0,$s1,$s2);
950
+
951
+ &add ($key,$small_footprint?16:160);
952
+ &xor ($s0,&DWP(0,$key));
953
+ &xor ($s1,&DWP(4,$key));
954
+ &xor ($s2,&DWP(8,$key));
955
+ &xor ($s3,&DWP(12,$key));
956
+
957
+ &ret ();
958
+
959
+ &set_label("AES_Te",64); # Yes! I keep it in the code segment!
960
+ &_data_word(0xa56363c6, 0x847c7cf8, 0x997777ee, 0x8d7b7bf6);
961
+ &_data_word(0x0df2f2ff, 0xbd6b6bd6, 0xb16f6fde, 0x54c5c591);
962
+ &_data_word(0x50303060, 0x03010102, 0xa96767ce, 0x7d2b2b56);
963
+ &_data_word(0x19fefee7, 0x62d7d7b5, 0xe6abab4d, 0x9a7676ec);
964
+ &_data_word(0x45caca8f, 0x9d82821f, 0x40c9c989, 0x877d7dfa);
965
+ &_data_word(0x15fafaef, 0xeb5959b2, 0xc947478e, 0x0bf0f0fb);
966
+ &_data_word(0xecadad41, 0x67d4d4b3, 0xfda2a25f, 0xeaafaf45);
967
+ &_data_word(0xbf9c9c23, 0xf7a4a453, 0x967272e4, 0x5bc0c09b);
968
+ &_data_word(0xc2b7b775, 0x1cfdfde1, 0xae93933d, 0x6a26264c);
969
+ &_data_word(0x5a36366c, 0x413f3f7e, 0x02f7f7f5, 0x4fcccc83);
970
+ &_data_word(0x5c343468, 0xf4a5a551, 0x34e5e5d1, 0x08f1f1f9);
971
+ &_data_word(0x937171e2, 0x73d8d8ab, 0x53313162, 0x3f15152a);
972
+ &_data_word(0x0c040408, 0x52c7c795, 0x65232346, 0x5ec3c39d);
973
+ &_data_word(0x28181830, 0xa1969637, 0x0f05050a, 0xb59a9a2f);
974
+ &_data_word(0x0907070e, 0x36121224, 0x9b80801b, 0x3de2e2df);
975
+ &_data_word(0x26ebebcd, 0x6927274e, 0xcdb2b27f, 0x9f7575ea);
976
+ &_data_word(0x1b090912, 0x9e83831d, 0x742c2c58, 0x2e1a1a34);
977
+ &_data_word(0x2d1b1b36, 0xb26e6edc, 0xee5a5ab4, 0xfba0a05b);
978
+ &_data_word(0xf65252a4, 0x4d3b3b76, 0x61d6d6b7, 0xceb3b37d);
979
+ &_data_word(0x7b292952, 0x3ee3e3dd, 0x712f2f5e, 0x97848413);
980
+ &_data_word(0xf55353a6, 0x68d1d1b9, 0x00000000, 0x2cededc1);
981
+ &_data_word(0x60202040, 0x1ffcfce3, 0xc8b1b179, 0xed5b5bb6);
982
+ &_data_word(0xbe6a6ad4, 0x46cbcb8d, 0xd9bebe67, 0x4b393972);
983
+ &_data_word(0xde4a4a94, 0xd44c4c98, 0xe85858b0, 0x4acfcf85);
984
+ &_data_word(0x6bd0d0bb, 0x2aefefc5, 0xe5aaaa4f, 0x16fbfbed);
985
+ &_data_word(0xc5434386, 0xd74d4d9a, 0x55333366, 0x94858511);
986
+ &_data_word(0xcf45458a, 0x10f9f9e9, 0x06020204, 0x817f7ffe);
987
+ &_data_word(0xf05050a0, 0x443c3c78, 0xba9f9f25, 0xe3a8a84b);
988
+ &_data_word(0xf35151a2, 0xfea3a35d, 0xc0404080, 0x8a8f8f05);
989
+ &_data_word(0xad92923f, 0xbc9d9d21, 0x48383870, 0x04f5f5f1);
990
+ &_data_word(0xdfbcbc63, 0xc1b6b677, 0x75dadaaf, 0x63212142);
991
+ &_data_word(0x30101020, 0x1affffe5, 0x0ef3f3fd, 0x6dd2d2bf);
992
+ &_data_word(0x4ccdcd81, 0x140c0c18, 0x35131326, 0x2fececc3);
993
+ &_data_word(0xe15f5fbe, 0xa2979735, 0xcc444488, 0x3917172e);
994
+ &_data_word(0x57c4c493, 0xf2a7a755, 0x827e7efc, 0x473d3d7a);
995
+ &_data_word(0xac6464c8, 0xe75d5dba, 0x2b191932, 0x957373e6);
996
+ &_data_word(0xa06060c0, 0x98818119, 0xd14f4f9e, 0x7fdcdca3);
997
+ &_data_word(0x66222244, 0x7e2a2a54, 0xab90903b, 0x8388880b);
998
+ &_data_word(0xca46468c, 0x29eeeec7, 0xd3b8b86b, 0x3c141428);
999
+ &_data_word(0x79dedea7, 0xe25e5ebc, 0x1d0b0b16, 0x76dbdbad);
1000
+ &_data_word(0x3be0e0db, 0x56323264, 0x4e3a3a74, 0x1e0a0a14);
1001
+ &_data_word(0xdb494992, 0x0a06060c, 0x6c242448, 0xe45c5cb8);
1002
+ &_data_word(0x5dc2c29f, 0x6ed3d3bd, 0xefacac43, 0xa66262c4);
1003
+ &_data_word(0xa8919139, 0xa4959531, 0x37e4e4d3, 0x8b7979f2);
1004
+ &_data_word(0x32e7e7d5, 0x43c8c88b, 0x5937376e, 0xb76d6dda);
1005
+ &_data_word(0x8c8d8d01, 0x64d5d5b1, 0xd24e4e9c, 0xe0a9a949);
1006
+ &_data_word(0xb46c6cd8, 0xfa5656ac, 0x07f4f4f3, 0x25eaeacf);
1007
+ &_data_word(0xaf6565ca, 0x8e7a7af4, 0xe9aeae47, 0x18080810);
1008
+ &_data_word(0xd5baba6f, 0x887878f0, 0x6f25254a, 0x722e2e5c);
1009
+ &_data_word(0x241c1c38, 0xf1a6a657, 0xc7b4b473, 0x51c6c697);
1010
+ &_data_word(0x23e8e8cb, 0x7cdddda1, 0x9c7474e8, 0x211f1f3e);
1011
+ &_data_word(0xdd4b4b96, 0xdcbdbd61, 0x868b8b0d, 0x858a8a0f);
1012
+ &_data_word(0x907070e0, 0x423e3e7c, 0xc4b5b571, 0xaa6666cc);
1013
+ &_data_word(0xd8484890, 0x05030306, 0x01f6f6f7, 0x120e0e1c);
1014
+ &_data_word(0xa36161c2, 0x5f35356a, 0xf95757ae, 0xd0b9b969);
1015
+ &_data_word(0x91868617, 0x58c1c199, 0x271d1d3a, 0xb99e9e27);
1016
+ &_data_word(0x38e1e1d9, 0x13f8f8eb, 0xb398982b, 0x33111122);
1017
+ &_data_word(0xbb6969d2, 0x70d9d9a9, 0x898e8e07, 0xa7949433);
1018
+ &_data_word(0xb69b9b2d, 0x221e1e3c, 0x92878715, 0x20e9e9c9);
1019
+ &_data_word(0x49cece87, 0xff5555aa, 0x78282850, 0x7adfdfa5);
1020
+ &_data_word(0x8f8c8c03, 0xf8a1a159, 0x80898909, 0x170d0d1a);
1021
+ &_data_word(0xdabfbf65, 0x31e6e6d7, 0xc6424284, 0xb86868d0);
1022
+ &_data_word(0xc3414182, 0xb0999929, 0x772d2d5a, 0x110f0f1e);
1023
+ &_data_word(0xcbb0b07b, 0xfc5454a8, 0xd6bbbb6d, 0x3a16162c);
1024
+
1025
+ #Te4 # four copies of Te4 to choose from to avoid L1 aliasing
1026
+ &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1027
+ &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1028
+ &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1029
+ &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1030
+ &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1031
+ &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1032
+ &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1033
+ &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1034
+ &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1035
+ &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1036
+ &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1037
+ &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1038
+ &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1039
+ &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1040
+ &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1041
+ &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1042
+ &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1043
+ &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1044
+ &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1045
+ &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1046
+ &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1047
+ &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1048
+ &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1049
+ &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1050
+ &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1051
+ &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1052
+ &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1053
+ &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1054
+ &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1055
+ &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1056
+ &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1057
+ &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1058
+
1059
+ &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1060
+ &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1061
+ &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1062
+ &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1063
+ &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1064
+ &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1065
+ &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1066
+ &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1067
+ &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1068
+ &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1069
+ &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1070
+ &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1071
+ &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1072
+ &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1073
+ &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1074
+ &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1075
+ &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1076
+ &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1077
+ &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1078
+ &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1079
+ &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1080
+ &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1081
+ &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1082
+ &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1083
+ &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1084
+ &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1085
+ &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1086
+ &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1087
+ &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1088
+ &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1089
+ &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1090
+ &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1091
+
1092
+ &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1093
+ &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1094
+ &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1095
+ &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1096
+ &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1097
+ &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1098
+ &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1099
+ &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1100
+ &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1101
+ &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1102
+ &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1103
+ &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1104
+ &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1105
+ &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1106
+ &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1107
+ &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1108
+ &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1109
+ &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1110
+ &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1111
+ &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1112
+ &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1113
+ &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1114
+ &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1115
+ &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1116
+ &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1117
+ &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1118
+ &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1119
+ &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1120
+ &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1121
+ &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1122
+ &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1123
+ &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1124
+
1125
+ &data_byte(0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5);
1126
+ &data_byte(0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76);
1127
+ &data_byte(0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0);
1128
+ &data_byte(0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0);
1129
+ &data_byte(0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc);
1130
+ &data_byte(0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15);
1131
+ &data_byte(0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a);
1132
+ &data_byte(0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75);
1133
+ &data_byte(0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0);
1134
+ &data_byte(0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84);
1135
+ &data_byte(0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b);
1136
+ &data_byte(0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf);
1137
+ &data_byte(0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85);
1138
+ &data_byte(0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8);
1139
+ &data_byte(0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5);
1140
+ &data_byte(0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2);
1141
+ &data_byte(0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17);
1142
+ &data_byte(0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73);
1143
+ &data_byte(0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88);
1144
+ &data_byte(0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb);
1145
+ &data_byte(0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c);
1146
+ &data_byte(0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79);
1147
+ &data_byte(0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9);
1148
+ &data_byte(0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08);
1149
+ &data_byte(0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6);
1150
+ &data_byte(0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a);
1151
+ &data_byte(0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e);
1152
+ &data_byte(0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e);
1153
+ &data_byte(0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94);
1154
+ &data_byte(0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf);
1155
+ &data_byte(0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68);
1156
+ &data_byte(0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16);
1157
+ #rcon:
1158
+ &data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008);
1159
+ &data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080);
1160
+ &data_word(0x0000001b, 0x00000036, 0x00000000, 0x00000000);
1161
+ &data_word(0x00000000, 0x00000000, 0x00000000, 0x00000000);
1162
+ &function_end_B("_x86_AES_encrypt");
1163
+
1164
+ # void asm_AES_encrypt (const void *inp,void *out,const AES_KEY *key); reads 16 bytes at inp, writes 16 bytes to out, via _sse_AES_encrypt_compact or _x86_AES_encrypt_compact
1165
+ &function_begin("asm_AES_encrypt");
1166
+ &mov ($acc,&wparam(0)); # load inp
1167
+ &mov ($key,&wparam(2)); # load key
1168
+
1169
+ &mov ($s0,"esp"); # keep caller's esp; it is stored in $_esp once the frame is placed
1170
+ &sub ("esp",36);
1171
+ &and ("esp",-64); # align to cache-line
1172
+
1173
+ # place stack frame just "above" the key schedule
1174
+ &lea ($s1,&DWP(-64-63,$key));
1175
+ &sub ($s1,"esp");
1176
+ &neg ($s1);
1177
+ &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
1178
+ &sub ("esp",$s1);
1179
+ &add ("esp",4); # 4 is reserved for caller's return address
1180
+ &mov ($_esp,$s0); # save stack pointer
1181
+
1182
+ &call (&label("pic_point")); # make it PIC!
1183
+ &set_label("pic_point");
1184
+ &blindpop($tbl); # NOTE(review): presumably pops the return address pushed by the call above (address of pic_point) - confirm in x86asm.pl
1185
+ &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if (!$x86only); # $s0 is dereferenced by the bt below
1186
+ &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl)); # rebase $tbl by AES_Te-pic_point
1187
+
1188
+ # pick Te4 copy which can't "overlap" with stack frame or key schedule
1189
+ &lea ($s1,&DWP(768-4,"esp"));
1190
+ &sub ($s1,$tbl);
1191
+ &and ($s1,0x300); # one of four offsets: 0, 256, 512 or 768
1192
+ &lea ($tbl,&DWP(2048+128,$tbl,$s1)); # skip 2048 bytes, add the chosen offset, bias by +128
1193
+
1194
+ if (!$x86only) {
1195
+ &bt (&DWP(0,$s0),25); # check for SSE bit
1196
+ &jnc (&label("x86")); # bit clear: use the integer-only path
1197
+
1198
+ &movq ("mm0",&QWP(0,$acc)); # load input data, low 8 bytes
1199
+ &movq ("mm4",&QWP(8,$acc)); # load input data, high 8 bytes
1200
+ &call ("_sse_AES_encrypt_compact");
1201
+ &mov ("esp",$_esp); # restore stack pointer
1202
+ &mov ($acc,&wparam(1)); # load out
1203
+ &movq (&QWP(0,$acc),"mm0"); # write output data
1204
+ &movq (&QWP(8,$acc),"mm4");
1205
+ &emms ();
1206
+ &function_end_A();
1207
+ }
1208
+ &set_label("x86",16); # integer-only path
1209
+ &mov ($_tbl,$tbl);
1210
+ &mov ($s0,&DWP(0,$acc)); # load input data
1211
+ &mov ($s1,&DWP(4,$acc));
1212
+ &mov ($s2,&DWP(8,$acc));
1213
+ &mov ($s3,&DWP(12,$acc));
1214
+ &call ("_x86_AES_encrypt_compact");
1215
+ &mov ("esp",$_esp); # restore stack pointer
1216
+ &mov ($acc,&wparam(1)); # load out
1217
+ &mov (&DWP(0,$acc),$s0); # write output data
1218
+ &mov (&DWP(4,$acc),$s1);
1219
+ &mov (&DWP(8,$acc),$s2);
1220
+ &mov (&DWP(12,$acc),$s3);
1221
+ &function_end("asm_AES_encrypt");
1222
+
1223
+ #--------------------------------------------------------------------#
1224
+
1225
+ ######################################################################
1226
+ # "Compact" block function
1227
+ ######################################################################
1228
+
1229
+ sub deccompact() # args: ($i,$td,@s[,extra]); emits code building one 32-bit word from four byte lookups at $td-128
1230
+ { my $Fn = \&mov;
1231
+ while ($#_>5) { pop(@_); $Fn=sub{}; } # any extra trailing argument turns $Fn into a no-op
1232
+ my ($i,$td,@s)=@_;
1233
+ my $tmp = $key;
1234
+ my $out = $i==3?$s[0]:$acc; # the fourth word is built in place in $s[0], the others in $acc
1235
+
1236
+ # $Fn is used in first compact round and its purpose is to
1237
+ # void restoration of some values from stack, so that after
1238
+ # 4xdeccompact with extra argument $key, $s0 and $s1 values
1239
+ # are left there...
1240
+ if($i==3) { &$Fn ($key,$__key); }
1241
+ else { &mov ($out,$s[0]); }
1242
+ &and ($out,0xFF); # byte 0 of $s[0]
1243
+ &movz ($out,&BP(-128,$td,$out,1));
1244
+
1245
+ if ($i==3) { $tmp=$s[1]; }
1246
+ &movz ($tmp,&HB($s[1])); # byte 1 of $s[1]
1247
+ &movz ($tmp,&BP(-128,$td,$tmp,1));
1248
+ &shl ($tmp,8);
1249
+ &xor ($out,$tmp);
1250
+
1251
+ if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
1252
+ else { &mov ($tmp,$s[2]); }
1253
+ &shr ($tmp,16);
1254
+ &and ($tmp,0xFF); # byte 2 of $s[2]
1255
+ &movz ($tmp,&BP(-128,$td,$tmp,1));
1256
+ &shl ($tmp,16);
1257
+ &xor ($out,$tmp);
1258
+
1259
+ if ($i==3) { $tmp=$s[3]; &$Fn ($s[2],$__s1); }
1260
+ else { &mov ($tmp,$s[3]); }
1261
+ &shr ($tmp,24); # byte 3 of $s[3]
1262
+ &movz ($tmp,&BP(-128,$td,$tmp,1));
1263
+ &shl ($tmp,24);
1264
+ &xor ($out,$tmp);
1265
+ if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } # words 0 and 1 are parked at 4(%esp) and 8(%esp)
1266
+ if ($i==3) { &$Fn ($s[3],$__s0); }
1267
+ }
1268
+
1269
+ # must be called with 2,3,0,1 as argument sequence!!!
1270
+ sub dectransform() # arg: $i, index of the state word in ($s0..$s3) to transform in place
1271
+ { my @s = ($s0,$s1,$s2,$s3);
1272
+ my $i = shift;
1273
+ my $tmp = $key;
1274
+ my $tp2 = $s[($i+2)%4]; $tp2 = $s[2] if ($i==1);
1275
+ my $tp4 = $s[($i+3)%4]; $tp4 = $s[3] if ($i==1);
1276
+ my $tp8 = $tbl; # $tbl is reused as scratch here, so its previous contents are lost
1277
+
1278
+ &mov ($tmp,0x80808080); # mask selecting the top bit of every byte
1279
+ &and ($tmp,$s[$i]);
1280
+ &mov ($acc,$tmp);
1281
+ &shr ($tmp,7);
1282
+ &lea ($tp2,&DWP(0,$s[$i],$s[$i])); # tp2 = tp1+tp1
1283
+ &sub ($acc,$tmp); # 0x7f in every byte whose top bit was set
1284
+ &and ($tp2,0xfefefefe); # drop bits shifted in from the neighbouring byte
1285
+ &and ($acc,0x1b1b1b1b); # 0x1b in every byte whose top bit was set
1286
+ &xor ($tp2,$acc); # tp2 = every byte of tp1 shifted left by one, xor 0x1b on overflow
1287
+ &mov ($tmp,0x80808080);
1288
+
1289
+ &and ($tmp,$tp2); # same doubling step again: tp2 -> tp4
1290
+ &mov ($acc,$tmp);
1291
+ &shr ($tmp,7);
1292
+ &lea ($tp4,&DWP(0,$tp2,$tp2));
1293
+ &sub ($acc,$tmp);
1294
+ &and ($tp4,0xfefefefe);
1295
+ &and ($acc,0x1b1b1b1b);
1296
+ &xor ($tp2,$s[$i]); # tp2^tp1
1297
+ &xor ($tp4,$acc);
1298
+ &mov ($tmp,0x80808080);
1299
+
1300
+ &and ($tmp,$tp4); # same doubling step again: tp4 -> tp8
1301
+ &mov ($acc,$tmp);
1302
+ &shr ($tmp,7);
1303
+ &lea ($tp8,&DWP(0,$tp4,$tp4));
1304
+ &sub ($acc,$tmp);
1305
+ &and ($tp8,0xfefefefe);
1306
+ &and ($acc,0x1b1b1b1b);
1307
+ &xor ($tp4,$s[$i]); # tp4^tp1
1308
+ &rotl ($s[$i],8); # = ROTATE(tp1,8)
1309
+ &xor ($tp8,$acc);
1310
+
1311
+ &xor ($s[$i],$tp2);
1312
+ &xor ($tp2,$tp8);
1313
+ &xor ($s[$i],$tp4);
1314
+ &xor ($tp4,$tp8);
1315
+ &rotl ($tp2,24);
1316
+ &xor ($s[$i],$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
1317
+ &rotl ($tp4,16);
1318
+ &xor ($s[$i],$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
1319
+ &rotl ($tp8,8);
1320
+ &xor ($s[$i],$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
1321
+ &mov ($s[0],$__s0) if($i==2); #prefetch $s0
1322
+ &mov ($s[1],$__s1) if($i==3); #prefetch $s1
1323
+ &mov ($s[2],$__s2) if($i==1); # reload $s2, which served as tp2 for this call
1324
+ &xor ($s[$i],$tp8); # ^= ROTATE(tp8,8)
1325
+
1326
+ &mov ($s[3],$__s3) if($i==1); # reload $s3, which served as tp4 for this call
1327
+ &mov (&DWP(4+4*$i,"esp"),$s[$i]) if($i>=2); # words 2 and 3 are saved at 12(%esp) and 16(%esp)
1328
+ }
1329
+
1330
+ &function_begin_B("_x86_AES_decrypt_compact");
1331
+ # note that caller is expected to allocate stack frame for me!
1332
+ &mov ($__key,$key); # save key
1333
+
1334
+ &xor ($s0,&DWP(0,$key)); # xor with key
1335
+ &xor ($s1,&DWP(4,$key));
1336
+ &xor ($s2,&DWP(8,$key));
1337
+ &xor ($s3,&DWP(12,$key));
1338
+
1339
+ &mov ($acc,&DWP(240,$key)); # load key->rounds
1340
+
1341
+ &lea ($acc,&DWP(-2,$acc,$acc)); # $acc = 2*rounds-2
1342
+ &lea ($acc,&DWP(0,$key,$acc,8)); # $acc = $key+8*(2*rounds-2) = $key+16*(rounds-1)
1343
+ &mov ($__end,$acc); # end of key schedule
1344
+
1345
+ # prefetch Td4
1346
+ &mov ($key,&DWP(0-128,$tbl)); # eight loads 32 bytes apart; both target registers are overwritten before being read
1347
+ &mov ($acc,&DWP(32-128,$tbl));
1348
+ &mov ($key,&DWP(64-128,$tbl));
1349
+ &mov ($acc,&DWP(96-128,$tbl));
1350
+ &mov ($key,&DWP(128-128,$tbl));
1351
+ &mov ($acc,&DWP(160-128,$tbl));
1352
+ &mov ($key,&DWP(192-128,$tbl));
1353
+ &mov ($acc,&DWP(224-128,$tbl));
1354
+
1355
+ &set_label("loop",16);
1356
+
1357
+ &deccompact(0,$tbl,$s0,$s3,$s2,$s1,1); # trailing 1: deccompact skips its reloads from the stack frame
1358
+ &deccompact(1,$tbl,$s1,$s0,$s3,$s2,1);
1359
+ &deccompact(2,$tbl,$s2,$s1,$s0,$s3,1);
1360
+ &deccompact(3,$tbl,$s3,$s2,$s1,$s0,1);
1361
+ &dectransform(2); # order 2,3,0,1 is required by dectransform
1362
+ &dectransform(3);
1363
+ &dectransform(0);
1364
+ &dectransform(1);
1365
+ &mov ($key,$__key); # $key was used as scratch above
1366
+ &mov ($tbl,$__tbl); # dectransform used $tbl as scratch for tp8
1367
+ &add ($key,16); # advance rd_key
1368
+ &xor ($s0,&DWP(0,$key));
1369
+ &xor ($s1,&DWP(4,$key));
1370
+ &xor ($s2,&DWP(8,$key));
1371
+ &xor ($s3,&DWP(12,$key));
1372
+
1373
+ &cmp ($key,$__end);
1374
+ &mov ($__key,$key); # mov does not alter the flags set by cmp
1375
+ &jb (&label("loop")); # unsigned compare: loop while $key is below $__end
1376
+
1377
+ &deccompact(0,$tbl,$s0,$s3,$s2,$s1); # last round: table lookups only, no dectransform
1378
+ &deccompact(1,$tbl,$s1,$s0,$s3,$s2);
1379
+ &deccompact(2,$tbl,$s2,$s1,$s0,$s3);
1380
+ &deccompact(3,$tbl,$s3,$s2,$s1,$s0);
1381
+
1382
+ &xor ($s0,&DWP(16,$key)); # xor with the round key 16 bytes past $key
1383
+ &xor ($s1,&DWP(20,$key));
1384
+ &xor ($s2,&DWP(24,$key));
1385
+ &xor ($s3,&DWP(28,$key));
1386
+
1387
+ &ret ();
1388
+ &function_end_B("_x86_AES_decrypt_compact");
1389
+
1390
+ ######################################################################
1391
+ # "Compact" SSE block function.
1392
+ ######################################################################
1393
+
1394
+ sub sse_deccompact() # emits one pass of 16 byte lookups at $tbl-128: input bytes in mm0/mm4, collected words t[0..3] back in mm0/mm4
1395
+ {
1396
+ &pshufw ("mm1","mm0",0x0c); # 7, 6, 1, 0
1397
+ &pshufw ("mm5","mm4",0x09); # 13,12,11,10
1398
+ &movd ("eax","mm1"); # 7, 6, 1, 0
1399
+ &movd ("ebx","mm5"); # 13,12,11,10
1400
+ &mov ($__key,$key); # spill key pointer: $key serves as an index register below
1401
+
1402
+ &movz ($acc,&LB("eax")); # 0
1403
+ &movz ("edx",&HB("eax")); # 1
1404
+ &pshufw ("mm2","mm0",0x06); # 3, 2, 5, 4
1405
+ &movz ("ecx",&BP(-128,$tbl,$acc,1)); # 0
1406
+ &movz ($key,&LB("ebx")); # 10
1407
+ &movz ("edx",&BP(-128,$tbl,"edx",1)); # 1
1408
+ &shr ("eax",16); # 7, 6
1409
+ &shl ("edx",8); # 1
1410
+
1411
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 10
1412
+ &movz ($key,&HB("ebx")); # 11
1413
+ &shl ($acc,16); # 10
1414
+ &pshufw ("mm6","mm4",0x03); # 9, 8,15,14
1415
+ &or ("ecx",$acc); # 10
1416
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 11
1417
+ &movz ($key,&HB("eax")); # 7
1418
+ &shl ($acc,24); # 11
1419
+ &shr ("ebx",16); # 13,12
1420
+ &or ("edx",$acc); # 11
1421
+
1422
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 7
1423
+ &movz ($key,&HB("ebx")); # 13
1424
+ &shl ($acc,24); # 7
1425
+ &or ("ecx",$acc); # 7
1426
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 13
1427
+ &movz ($key,&LB("eax")); # 6
1428
+ &shl ($acc,8); # 13
1429
+ &movd ("eax","mm2"); # 3, 2, 5, 4
1430
+ &or ("ecx",$acc); # 13
1431
+
1432
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 6
1433
+ &movz ($key,&LB("ebx")); # 12
1434
+ &shl ($acc,16); # 6
1435
+ &movd ("ebx","mm6"); # 9, 8,15,14
1436
+ &movd ("mm0","ecx"); # t[0] collected
1437
+ &movz ("ecx",&BP(-128,$tbl,$key,1)); # 12
1438
+ &movz ($key,&LB("eax")); # 4
1439
+ &or ("ecx",$acc); # 12
1440
+
1441
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 4
1442
+ &movz ($key,&LB("ebx")); # 14
1443
+ &or ("edx",$acc); # 4
1444
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 14
1445
+ &movz ($key,&HB("eax")); # 5
1446
+ &shl ($acc,16); # 14
1447
+ &shr ("eax",16); # 3, 2
1448
+ &or ("edx",$acc); # 14
1449
+
1450
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 5
1451
+ &movz ($key,&HB("ebx")); # 15
1452
+ &shr ("ebx",16); # 9, 8
1453
+ &shl ($acc,8); # 5
1454
+ &movd ("mm1","edx"); # t[1] collected
1455
+ &movz ("edx",&BP(-128,$tbl,$key,1)); # 15
1456
+ &movz ($key,&HB("ebx")); # 9
1457
+ &shl ("edx",24); # 15
1458
+ &and ("ebx",0xff); # 8
1459
+ &or ("edx",$acc); # 15
1460
+
1461
+ &punpckldq ("mm0","mm1"); # t[0,1] collected
1462
+
1463
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 9
1464
+ &movz ($key,&LB("eax")); # 2
1465
+ &shl ($acc,8); # 9
1466
+ &movz ("eax",&HB("eax")); # 3
1467
+ &movz ("ebx",&BP(-128,$tbl,"ebx",1)); # 8
1468
+ &or ("ecx",$acc); # 9
1469
+ &movz ($acc,&BP(-128,$tbl,$key,1)); # 2
1470
+ &or ("edx","ebx"); # 8
1471
+ &shl ($acc,16); # 2
1472
+ &movz ("eax",&BP(-128,$tbl,"eax",1)); # 3
1473
+ &or ("edx",$acc); # 2
1474
+ &shl ("eax",24); # 3
1475
+ &or ("ecx","eax"); # 3
1476
+ &mov ($key,$__key); # reload the key pointer spilled at the top
1477
+ &movd ("mm4","edx"); # t[2] collected
1478
+ &movd ("mm5","ecx"); # t[3] collected
1479
+
1480
+ &punpckldq ("mm4","mm5"); # t[2,3] collected
1481
+ }
1482
+
1483
+ if (!$x86only) { # this routine is only generated when $x86only is false
1484
+ &function_begin_B("_sse_AES_decrypt_compact");
1485
+ &pxor ("mm0",&QWP(0,$key)); # 7, 6, 5, 4, 3, 2, 1, 0
1486
+ &pxor ("mm4",&QWP(8,$key)); # 15,14,13,12,11,10, 9, 8
1487
+
1488
+ # note that caller is expected to allocate stack frame for me!
1489
+ &mov ($acc,&DWP(240,$key)); # load key->rounds
1490
+ &lea ($acc,&DWP(-2,$acc,$acc)); # $acc = 2*rounds-2
1491
+ &lea ($acc,&DWP(0,$key,$acc,8)); # $acc = $key+16*(rounds-1)
1492
+ &mov ($__end,$acc); # end of key schedule
1493
+
1494
+ &mov ($s0,0x1b1b1b1b); # magic constant
1495
+ &mov (&DWP(8,"esp"),$s0); # stored twice so it can be read back as a qword from 8(%esp)
1496
+ &mov (&DWP(12,"esp"),$s0);
1497
+
1498
+ # prefetch Td4
1499
+ &mov ($s0,&DWP(0-128,$tbl)); # eight loads 32 bytes apart
1500
+ &mov ($s1,&DWP(32-128,$tbl));
1501
+ &mov ($s2,&DWP(64-128,$tbl));
1502
+ &mov ($s3,&DWP(96-128,$tbl));
1503
+ &mov ($s0,&DWP(128-128,$tbl));
1504
+ &mov ($s1,&DWP(160-128,$tbl));
1505
+ &mov ($s2,&DWP(192-128,$tbl));
1506
+ &mov ($s3,&DWP(224-128,$tbl));
1507
+
1508
+ &set_label("loop",16);
1509
+ &sse_deccompact();
1510
+ &add ($key,16); # advance to the next round key
1511
+ &cmp ($key,$__end);
1512
+ &ja (&label("out")); # unsigned compare: leave once $key is above $__end
1513
+
1514
+ # ROTATE(x^y,N) == ROTATE(x,N)^ROTATE(y,N)
1515
+ &movq ("mm3","mm0"); &movq ("mm7","mm4");
1516
+ &movq ("mm2","mm0",1); &movq ("mm6","mm4",1);
1517
+ &movq ("mm1","mm0"); &movq ("mm5","mm4");
1518
+ &pshufw ("mm0","mm0",0xb1); &pshufw ("mm4","mm4",0xb1);# = ROTATE(tp0,16)
1519
+ &pslld ("mm2",8); &pslld ("mm6",8);
1520
+ &psrld ("mm3",8); &psrld ("mm7",8);
1521
+ &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<8
1522
+ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>8
1523
+ &pslld ("mm2",16); &pslld ("mm6",16);
1524
+ &psrld ("mm3",16); &psrld ("mm7",16);
1525
+ &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp0<<24
1526
+ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp0>>24
1527
+
1528
+ &movq ("mm3",&QWP(8,"esp")); # the 0x1b1b1b1b pair stored at entry
1529
+ &pxor ("mm2","mm2"); &pxor ("mm6","mm6");
1530
+ &pcmpgtb("mm2","mm1"); &pcmpgtb("mm6","mm5"); # signed 0 > byte: all-ones where the top bit is set
1531
+ &pand ("mm2","mm3"); &pand ("mm6","mm3"); # 0x1b in those bytes
1532
+ &paddb ("mm1","mm1"); &paddb ("mm5","mm5"); # per-byte shift left by one
1533
+ &pxor ("mm1","mm2"); &pxor ("mm5","mm6"); # tp2
1534
+ &movq ("mm3","mm1"); &movq ("mm7","mm5");
1535
+ &movq ("mm2","mm1"); &movq ("mm6","mm5");
1536
+ &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp2
1537
+ &pslld ("mm3",24); &pslld ("mm7",24);
1538
+ &psrld ("mm2",8); &psrld ("mm6",8);
1539
+ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp2<<24
1540
+ &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= tp2>>8
1541
+
1542
+ &movq ("mm2",&QWP(8,"esp")); # 0x1b constant again, kept in mm2 for the next two doublings
1543
+ &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
1544
+ &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
1545
+ &pand ("mm3","mm2"); &pand ("mm7","mm2");
1546
+ &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
1547
+ &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp4
1548
+ &pshufw ("mm3","mm1",0xb1); &pshufw ("mm7","mm5",0xb1);
1549
+ &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp4
1550
+ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= ROTATE(tp4,16)
1551
+
1552
+ &pxor ("mm3","mm3"); &pxor ("mm7","mm7");
1553
+ &pcmpgtb("mm3","mm1"); &pcmpgtb("mm7","mm5");
1554
+ &pand ("mm3","mm2"); &pand ("mm7","mm2");
1555
+ &paddb ("mm1","mm1"); &paddb ("mm5","mm5");
1556
+ &pxor ("mm1","mm3"); &pxor ("mm5","mm7"); # tp8
1557
+ &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8
1558
+ &movq ("mm3","mm1"); &movq ("mm7","mm5");
1559
+ &pshufw ("mm2","mm1",0xb1); &pshufw ("mm6","mm5",0xb1);
1560
+ &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # ^= ROTATE(tp8,16)
1561
+ &pslld ("mm1",8); &pslld ("mm5",8);
1562
+ &psrld ("mm3",8); &psrld ("mm7",8);
1563
+ &movq ("mm2",&QWP(0,$key)); &movq ("mm6",&QWP(8,$key)); # fetch this round's key early; it is xored in just before the jmp
1564
+ &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<8
1565
+ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>8
1566
+ &mov ($s0,&DWP(0-128,$tbl)); # NOTE(review): presumably re-touches Td4 with the loaded values unused - confirm $s0..$s3 are dead here
1567
+ &pslld ("mm1",16); &pslld ("mm5",16);
1568
+ &mov ($s1,&DWP(64-128,$tbl));
1569
+ &psrld ("mm3",16); &psrld ("mm7",16);
1570
+ &mov ($s2,&DWP(128-128,$tbl));
1571
+ &pxor ("mm0","mm1"); &pxor ("mm4","mm5"); # ^= tp8<<24
1572
+ &mov ($s3,&DWP(192-128,$tbl));
1573
+ &pxor ("mm0","mm3"); &pxor ("mm4","mm7"); # ^= tp8>>24
1574
+
1575
+ &pxor ("mm0","mm2"); &pxor ("mm4","mm6"); # xor with the round key fetched above
1576
+ &jmp (&label("loop"));
1577
+
1578
+ &set_label("out",16);
1579
+ &pxor ("mm0",&QWP(0,$key)); # xor with the round key at the advanced $key
1580
+ &pxor ("mm4",&QWP(8,$key));
1581
+
1582
+ &ret ();
1583
+ &function_end_B("_sse_AES_decrypt_compact");
1584
+ }
1585
+
1586
+ ######################################################################
1587
+ # Vanilla block function.
1588
+ ######################################################################
1589
+
1590
+ sub decstep() # args: ($i,$td,@s); emits code computing one output word from four dword lookups in the table at $td
1591
+ { my ($i,$td,@s) = @_;
1592
+ my $tmp = $key;
1593
+ my $out = $i==3?$s[0]:$acc; # the fourth word is built in place in $s[0], the others in $acc
1594
+
1595
+ # no instructions are reordered, as performance appears
1596
+ # optimal... or rather that all attempts to reorder didn't
1597
+ # result in better performance [which by the way is not a
1598
+ # bit lower than encryption].
1599
+ if($i==3) { &mov ($key,$__key); }
1600
+ else { &mov ($out,$s[0]); }
1601
+ &and ($out,0xFF); # byte 0 of $s[0]
1602
+ &mov ($out,&DWP(0,$td,$out,8)); # entries are 8 bytes apart; the lookups below read at byte offsets 3, 2 and 1
1603
+
1604
+ if ($i==3) { $tmp=$s[1]; }
1605
+ &movz ($tmp,&HB($s[1])); # byte 1 of $s[1]
1606
+ &xor ($out,&DWP(3,$td,$tmp,8));
1607
+
1608
+ if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
1609
+ else { &mov ($tmp,$s[2]); }
1610
+ &shr ($tmp,16);
1611
+ &and ($tmp,0xFF); # byte 2 of $s[2]
1612
+ &xor ($out,&DWP(2,$td,$tmp,8));
1613
+
1614
+ if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }
1615
+ else { &mov ($tmp,$s[3]); }
1616
+ &shr ($tmp,24); # byte 3 of $s[3]
1617
+ &xor ($out,&DWP(1,$td,$tmp,8));
1618
+ if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } # words 0 and 1 are parked at 4(%esp) and 8(%esp)
1619
+ if ($i==3) { &mov ($s[3],$__s0); }
1620
+ &comment();
1621
+ }
1622
+
1623
+ sub declast() # args: ($i,$td,@s); emits code building one output word from four byte lookups in the table 2048 bytes past $td
1624
+ { my ($i,$td,@s)=@_;
1625
+ my $tmp = $key;
1626
+ my $out = $i==3?$s[0]:$acc; # the fourth word is built in place in $s[0], the others in $acc
1627
+
1628
+ if($i==0) { &lea ($td,&DWP(2048+128,$td)); # first call only: move $td 2048 bytes on, +128 bias for the loads below
1629
+ &mov ($tmp,&DWP(0-128,$td)); # eight loads 32 bytes apart; $tmp and $acc are overwritten before being read
1630
+ &mov ($acc,&DWP(32-128,$td));
1631
+ &mov ($tmp,&DWP(64-128,$td));
1632
+ &mov ($acc,&DWP(96-128,$td));
1633
+ &mov ($tmp,&DWP(128-128,$td));
1634
+ &mov ($acc,&DWP(160-128,$td));
1635
+ &mov ($tmp,&DWP(192-128,$td));
1636
+ &mov ($acc,&DWP(224-128,$td));
1637
+ &lea ($td,&DWP(-128,$td)); } # drop the 128 bias again
1638
+ if($i==3) { &mov ($key,$__key); }
1639
+ else { &mov ($out,$s[0]); }
1640
+ &and ($out,0xFF); # byte 0 of $s[0]
1641
+ &movz ($out,&BP(0,$td,$out,1));
1642
+
1643
+ if ($i==3) { $tmp=$s[1]; }
1644
+ &movz ($tmp,&HB($s[1])); # byte 1 of $s[1]
1645
+ &movz ($tmp,&BP(0,$td,$tmp,1));
1646
+ &shl ($tmp,8);
1647
+ &xor ($out,$tmp);
1648
+
1649
+ if ($i==3) { $tmp=$s[2]; &mov ($s[1],$acc); }
1650
+ else { &mov ($tmp,$s[2]); }
1651
+ &shr ($tmp,16);
1652
+ &and ($tmp,0xFF); # byte 2 of $s[2]
1653
+ &movz ($tmp,&BP(0,$td,$tmp,1));
1654
+ &shl ($tmp,16);
1655
+ &xor ($out,$tmp);
1656
+
1657
+ if ($i==3) { $tmp=$s[3]; &mov ($s[2],$__s1); }
1658
+ else { &mov ($tmp,$s[3]); }
1659
+ &shr ($tmp,24); # byte 3 of $s[3]
1660
+ &movz ($tmp,&BP(0,$td,$tmp,1));
1661
+ &shl ($tmp,24);
1662
+ &xor ($out,$tmp);
1663
+ if ($i<2) { &mov (&DWP(4+4*$i,"esp"),$out); } # words 0 and 1 are parked at 4(%esp) and 8(%esp)
1664
+ if ($i==3) { &mov ($s[3],$__s0);
1665
+ &lea ($td,&DWP(-2048,$td)); } # last call only: put $td back to its value before the first call
1666
+ }
1667
+
1668
+ &function_begin_B("_x86_AES_decrypt");
1669
+ # note that caller is expected to allocate stack frame for me!
1670
+ &mov ($__key,$key); # save key
1671
+
1672
+ &xor ($s0,&DWP(0,$key)); # xor with key
1673
+ &xor ($s1,&DWP(4,$key));
1674
+ &xor ($s2,&DWP(8,$key));
1675
+ &xor ($s3,&DWP(12,$key));
1676
+
1677
+ &mov ($acc,&DWP(240,$key)); # load key->rounds
1678
+
1679
+ if ($small_footprint) {
1680
+ &lea ($acc,&DWP(-2,$acc,$acc));
1681
+ &lea ($acc,&DWP(0,$key,$acc,8));
1682
+ &mov ($__end,$acc); # end of key schedule
1683
+ &set_label("loop",16);
1684
+ &decstep(0,$tbl,$s0,$s3,$s2,$s1);
1685
+ &decstep(1,$tbl,$s1,$s0,$s3,$s2);
1686
+ &decstep(2,$tbl,$s2,$s1,$s0,$s3);
1687
+ &decstep(3,$tbl,$s3,$s2,$s1,$s0);
1688
+ &add ($key,16); # advance rd_key
1689
+ &xor ($s0,&DWP(0,$key));
1690
+ &xor ($s1,&DWP(4,$key));
1691
+ &xor ($s2,&DWP(8,$key));
1692
+ &xor ($s3,&DWP(12,$key));
1693
+ &cmp ($key,$__end);
1694
+ &mov ($__key,$key);
1695
+ &jb (&label("loop"));
1696
+ }
1697
+ else {
1698
+ &cmp ($acc,10);
1699
+ &jle (&label("10rounds"));
1700
+ &cmp ($acc,12);
1701
+ &jle (&label("12rounds"));
1702
+
1703
+ &set_label("14rounds",4);
1704
+ for ($i=1;$i<3;$i++) {
1705
+ &decstep(0,$tbl,$s0,$s3,$s2,$s1);
1706
+ &decstep(1,$tbl,$s1,$s0,$s3,$s2);
1707
+ &decstep(2,$tbl,$s2,$s1,$s0,$s3);
1708
+ &decstep(3,$tbl,$s3,$s2,$s1,$s0);
1709
+ &xor ($s0,&DWP(16*$i+0,$key));
1710
+ &xor ($s1,&DWP(16*$i+4,$key));
1711
+ &xor ($s2,&DWP(16*$i+8,$key));
1712
+ &xor ($s3,&DWP(16*$i+12,$key));
1713
+ }
1714
+ &add ($key,32);
1715
+ &mov ($__key,$key); # advance rd_key
1716
+ &set_label("12rounds",4);
1717
+ for ($i=1;$i<3;$i++) {
1718
+ &decstep(0,$tbl,$s0,$s3,$s2,$s1);
1719
+ &decstep(1,$tbl,$s1,$s0,$s3,$s2);
1720
+ &decstep(2,$tbl,$s2,$s1,$s0,$s3);
1721
+ &decstep(3,$tbl,$s3,$s2,$s1,$s0);
1722
+ &xor ($s0,&DWP(16*$i+0,$key));
1723
+ &xor ($s1,&DWP(16*$i+4,$key));
1724
+ &xor ($s2,&DWP(16*$i+8,$key));
1725
+ &xor ($s3,&DWP(16*$i+12,$key));
1726
+ }
1727
+ &add ($key,32);
1728
+ &mov ($__key,$key); # advance rd_key
1729
+ &set_label("10rounds",4);
1730
+ for ($i=1;$i<10;$i++) {
1731
+ &decstep(0,$tbl,$s0,$s3,$s2,$s1);
1732
+ &decstep(1,$tbl,$s1,$s0,$s3,$s2);
1733
+ &decstep(2,$tbl,$s2,$s1,$s0,$s3);
1734
+ &decstep(3,$tbl,$s3,$s2,$s1,$s0);
1735
+ &xor ($s0,&DWP(16*$i+0,$key));
1736
+ &xor ($s1,&DWP(16*$i+4,$key));
1737
+ &xor ($s2,&DWP(16*$i+8,$key));
1738
+ &xor ($s3,&DWP(16*$i+12,$key));
1739
+ }
1740
+ }
1741
+
1742
+ &declast(0,$tbl,$s0,$s3,$s2,$s1);
1743
+ &declast(1,$tbl,$s1,$s0,$s3,$s2);
1744
+ &declast(2,$tbl,$s2,$s1,$s0,$s3);
1745
+ &declast(3,$tbl,$s3,$s2,$s1,$s0);
1746
+
1747
+ &add ($key,$small_footprint?16:160);
1748
+ &xor ($s0,&DWP(0,$key));
1749
+ &xor ($s1,&DWP(4,$key));
1750
+ &xor ($s2,&DWP(8,$key));
1751
+ &xor ($s3,&DWP(12,$key));
1752
+
1753
+ &ret ();
1754
+
1755
+ &set_label("AES_Td",64); # Yes! I keep it in the code segment!
1756
+ &_data_word(0x50a7f451, 0x5365417e, 0xc3a4171a, 0x965e273a);
1757
+ &_data_word(0xcb6bab3b, 0xf1459d1f, 0xab58faac, 0x9303e34b);
1758
+ &_data_word(0x55fa3020, 0xf66d76ad, 0x9176cc88, 0x254c02f5);
1759
+ &_data_word(0xfcd7e54f, 0xd7cb2ac5, 0x80443526, 0x8fa362b5);
1760
+ &_data_word(0x495ab1de, 0x671bba25, 0x980eea45, 0xe1c0fe5d);
1761
+ &_data_word(0x02752fc3, 0x12f04c81, 0xa397468d, 0xc6f9d36b);
1762
+ &_data_word(0xe75f8f03, 0x959c9215, 0xeb7a6dbf, 0xda595295);
1763
+ &_data_word(0x2d83bed4, 0xd3217458, 0x2969e049, 0x44c8c98e);
1764
+ &_data_word(0x6a89c275, 0x78798ef4, 0x6b3e5899, 0xdd71b927);
1765
+ &_data_word(0xb64fe1be, 0x17ad88f0, 0x66ac20c9, 0xb43ace7d);
1766
+ &_data_word(0x184adf63, 0x82311ae5, 0x60335197, 0x457f5362);
1767
+ &_data_word(0xe07764b1, 0x84ae6bbb, 0x1ca081fe, 0x942b08f9);
1768
+ &_data_word(0x58684870, 0x19fd458f, 0x876cde94, 0xb7f87b52);
1769
+ &_data_word(0x23d373ab, 0xe2024b72, 0x578f1fe3, 0x2aab5566);
1770
+ &_data_word(0x0728ebb2, 0x03c2b52f, 0x9a7bc586, 0xa50837d3);
1771
+ &_data_word(0xf2872830, 0xb2a5bf23, 0xba6a0302, 0x5c8216ed);
1772
+ &_data_word(0x2b1ccf8a, 0x92b479a7, 0xf0f207f3, 0xa1e2694e);
1773
+ &_data_word(0xcdf4da65, 0xd5be0506, 0x1f6234d1, 0x8afea6c4);
1774
+ &_data_word(0x9d532e34, 0xa055f3a2, 0x32e18a05, 0x75ebf6a4);
1775
+ &_data_word(0x39ec830b, 0xaaef6040, 0x069f715e, 0x51106ebd);
1776
+ &_data_word(0xf98a213e, 0x3d06dd96, 0xae053edd, 0x46bde64d);
1777
+ &_data_word(0xb58d5491, 0x055dc471, 0x6fd40604, 0xff155060);
1778
+ &_data_word(0x24fb9819, 0x97e9bdd6, 0xcc434089, 0x779ed967);
1779
+ &_data_word(0xbd42e8b0, 0x888b8907, 0x385b19e7, 0xdbeec879);
1780
+ &_data_word(0x470a7ca1, 0xe90f427c, 0xc91e84f8, 0x00000000);
1781
+ &_data_word(0x83868009, 0x48ed2b32, 0xac70111e, 0x4e725a6c);
1782
+ &_data_word(0xfbff0efd, 0x5638850f, 0x1ed5ae3d, 0x27392d36);
1783
+ &_data_word(0x64d90f0a, 0x21a65c68, 0xd1545b9b, 0x3a2e3624);
1784
+ &_data_word(0xb1670a0c, 0x0fe75793, 0xd296eeb4, 0x9e919b1b);
1785
+ &_data_word(0x4fc5c080, 0xa220dc61, 0x694b775a, 0x161a121c);
1786
+ &_data_word(0x0aba93e2, 0xe52aa0c0, 0x43e0223c, 0x1d171b12);
1787
+ &_data_word(0x0b0d090e, 0xadc78bf2, 0xb9a8b62d, 0xc8a91e14);
1788
+ &_data_word(0x8519f157, 0x4c0775af, 0xbbdd99ee, 0xfd607fa3);
1789
+ &_data_word(0x9f2601f7, 0xbcf5725c, 0xc53b6644, 0x347efb5b);
1790
+ &_data_word(0x7629438b, 0xdcc623cb, 0x68fcedb6, 0x63f1e4b8);
1791
+ &_data_word(0xcadc31d7, 0x10856342, 0x40229713, 0x2011c684);
1792
+ &_data_word(0x7d244a85, 0xf83dbbd2, 0x1132f9ae, 0x6da129c7);
1793
+ &_data_word(0x4b2f9e1d, 0xf330b2dc, 0xec52860d, 0xd0e3c177);
1794
+ &_data_word(0x6c16b32b, 0x99b970a9, 0xfa489411, 0x2264e947);
1795
+ &_data_word(0xc48cfca8, 0x1a3ff0a0, 0xd82c7d56, 0xef903322);
1796
+ &_data_word(0xc74e4987, 0xc1d138d9, 0xfea2ca8c, 0x360bd498);
1797
+ &_data_word(0xcf81f5a6, 0x28de7aa5, 0x268eb7da, 0xa4bfad3f);
1798
+ &_data_word(0xe49d3a2c, 0x0d927850, 0x9bcc5f6a, 0x62467e54);
1799
+ &_data_word(0xc2138df6, 0xe8b8d890, 0x5ef7392e, 0xf5afc382);
1800
+ &_data_word(0xbe805d9f, 0x7c93d069, 0xa92dd56f, 0xb31225cf);
1801
+ &_data_word(0x3b99acc8, 0xa77d1810, 0x6e639ce8, 0x7bbb3bdb);
1802
+ &_data_word(0x097826cd, 0xf418596e, 0x01b79aec, 0xa89a4f83);
1803
+ &_data_word(0x656e95e6, 0x7ee6ffaa, 0x08cfbc21, 0xe6e815ef);
1804
+ &_data_word(0xd99be7ba, 0xce366f4a, 0xd4099fea, 0xd67cb029);
1805
+ &_data_word(0xafb2a431, 0x31233f2a, 0x3094a5c6, 0xc066a235);
1806
+ &_data_word(0x37bc4e74, 0xa6ca82fc, 0xb0d090e0, 0x15d8a733);
1807
+ &_data_word(0x4a9804f1, 0xf7daec41, 0x0e50cd7f, 0x2ff69117);
1808
+ &_data_word(0x8dd64d76, 0x4db0ef43, 0x544daacc, 0xdf0496e4);
1809
+ &_data_word(0xe3b5d19e, 0x1b886a4c, 0xb81f2cc1, 0x7f516546);
1810
+ &_data_word(0x04ea5e9d, 0x5d358c01, 0x737487fa, 0x2e410bfb);
1811
+ &_data_word(0x5a1d67b3, 0x52d2db92, 0x335610e9, 0x1347d66d);
1812
+ &_data_word(0x8c61d79a, 0x7a0ca137, 0x8e14f859, 0x893c13eb);
1813
+ &_data_word(0xee27a9ce, 0x35c961b7, 0xede51ce1, 0x3cb1477a);
1814
+ &_data_word(0x59dfd29c, 0x3f73f255, 0x79ce1418, 0xbf37c773);
1815
+ &_data_word(0xeacdf753, 0x5baafd5f, 0x146f3ddf, 0x86db4478);
1816
+ &_data_word(0x81f3afca, 0x3ec468b9, 0x2c342438, 0x5f40a3c2);
1817
+ &_data_word(0x72c31d16, 0x0c25e2bc, 0x8b493c28, 0x41950dff);
1818
+ &_data_word(0x7101a839, 0xdeb30c08, 0x9ce4b4d8, 0x90c15664);
1819
+ &_data_word(0x6184cb7b, 0x70b632d5, 0x745c6c48, 0x4257b8d0);
1820
+
1821
+ #Td4: # four copies of Td4 to choose from to avoid L1 aliasing
1822
+ &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1823
+ &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1824
+ &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1825
+ &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1826
+ &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1827
+ &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1828
+ &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1829
+ &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1830
+ &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1831
+ &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1832
+ &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1833
+ &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1834
+ &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1835
+ &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1836
+ &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1837
+ &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1838
+ &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1839
+ &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1840
+ &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1841
+ &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1842
+ &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1843
+ &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1844
+ &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1845
+ &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1846
+ &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1847
+ &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1848
+ &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1849
+ &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1850
+ &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1851
+ &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1852
+ &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1853
+ &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1854
+
1855
+ &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1856
+ &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1857
+ &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1858
+ &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1859
+ &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1860
+ &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1861
+ &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1862
+ &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1863
+ &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1864
+ &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1865
+ &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1866
+ &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1867
+ &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1868
+ &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1869
+ &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1870
+ &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1871
+ &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1872
+ &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1873
+ &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1874
+ &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1875
+ &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1876
+ &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1877
+ &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1878
+ &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1879
+ &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1880
+ &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1881
+ &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1882
+ &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1883
+ &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1884
+ &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1885
+ &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1886
+ &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1887
+
1888
+ &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1889
+ &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1890
+ &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1891
+ &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1892
+ &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1893
+ &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1894
+ &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1895
+ &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1896
+ &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1897
+ &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1898
+ &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1899
+ &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1900
+ &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1901
+ &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1902
+ &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1903
+ &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1904
+ &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1905
+ &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1906
+ &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1907
+ &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1908
+ &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1909
+ &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1910
+ &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1911
+ &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1912
+ &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1913
+ &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1914
+ &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1915
+ &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1916
+ &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1917
+ &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1918
+ &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1919
+ &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1920
+
1921
+ &data_byte(0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38);
1922
+ &data_byte(0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb);
1923
+ &data_byte(0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87);
1924
+ &data_byte(0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb);
1925
+ &data_byte(0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d);
1926
+ &data_byte(0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e);
1927
+ &data_byte(0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2);
1928
+ &data_byte(0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25);
1929
+ &data_byte(0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16);
1930
+ &data_byte(0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92);
1931
+ &data_byte(0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda);
1932
+ &data_byte(0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84);
1933
+ &data_byte(0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a);
1934
+ &data_byte(0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06);
1935
+ &data_byte(0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02);
1936
+ &data_byte(0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b);
1937
+ &data_byte(0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea);
1938
+ &data_byte(0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73);
1939
+ &data_byte(0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85);
1940
+ &data_byte(0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e);
1941
+ &data_byte(0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89);
1942
+ &data_byte(0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b);
1943
+ &data_byte(0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20);
1944
+ &data_byte(0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4);
1945
+ &data_byte(0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31);
1946
+ &data_byte(0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f);
1947
+ &data_byte(0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d);
1948
+ &data_byte(0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef);
1949
+ &data_byte(0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0);
1950
+ &data_byte(0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61);
1951
+ &data_byte(0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26);
1952
+ &data_byte(0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d);
1953
+ &function_end_B("_x86_AES_decrypt");
1954
+
1955
+ # void asm_AES_decrypt (const void *inp,void *out,const AES_KEY *key); -- decrypts the 16 bytes at inp into out: builds a private cache-line-aligned stack frame, selects one Td4 copy, then runs either _sse_AES_decrypt_compact or _x86_AES_decrypt_compact
1956
+ &function_begin("asm_AES_decrypt");
1957
+ &mov ($acc,&wparam(0)); # load inp
1958
+ &mov ($key,&wparam(2)); # load key
1959
+ # --- carve out the working stack frame ---
1960
+ &mov ($s0,"esp"); # keep the incoming esp; it is stored into $_esp once the new frame exists
1961
+ &sub ("esp",36);
1962
+ &and ("esp",-64); # align to cache-line
1963
+
1964
+ # place stack frame just "above" the key schedule
1965
+ &lea ($s1,&DWP(-64-63,$key));
1966
+ &sub ($s1,"esp");
1967
+ &neg ($s1);
1968
+ &and ($s1,0x3C0); # modulo 1024, but aligned to cache-line
1969
+ &sub ("esp",$s1);
1970
+ &add ("esp",4); # 4 is reserved for caller's return address
1971
+ &mov ($_esp,$s0); # save stack pointer
1972
+ # --- locate the tables position-independently ---
1973
+ &call (&label("pic_point")); # make it PIC! (the call pushes the address of pic_point)
1974
+ &set_label("pic_point");
1975
+ &blindpop($tbl); # $tbl = run-time address of pic_point
1976
+ &picmeup($s0,"OPENSSL_ia32cap_P",$tbl,&label("pic_point")) if(!$x86only); # presumably leaves the address of OPENSSL_ia32cap_P in $s0 (helper defined outside this file -- confirm); $s0 is read by the bt below
1977
+ &lea ($tbl,&DWP(&label("AES_Td")."-".&label("pic_point"),$tbl)); # $tbl = run-time address of AES_Td
1978
+
1979
+ # pick Td4 copy which can't "overlap" with stack frame or key schedule
1980
+ &lea ($s1,&DWP(768-4,"esp"));
1981
+ &sub ($s1,$tbl);
1982
+ &and ($s1,0x300); # keep bits 8-9 only: an offset of 0, 256, 512 or 768
1983
+ &lea ($tbl,&DWP(2048+128,$tbl,$s1)); # step to one of the four 256-byte Td4 copies; 2048 presumably skips the Td table in front of Td4 and 128 looks like an addressing bias -- confirm in the compact routines
1984
+
1985
+ if (!$x86only) { # generation-time switch: this path is emitted only when the script is not restricted to plain x86
1986
+ &bt (&DWP(0,$s0),25); # check for SSE bit
1987
+ &jnc (&label("x86")); # bit clear: fall back to the integer-register path
1988
+
1989
+ &movq ("mm0",&QWP(0,$acc)); # input bytes 0-7
1990
+ &movq ("mm4",&QWP(8,$acc)); # input bytes 8-15
1991
+ &call ("_sse_AES_decrypt_compact");
1992
+ &mov ("esp",$_esp); # restore stack pointer
1993
+ &mov ($acc,&wparam(1)); # load out
1994
+ &movq (&QWP(0,$acc),"mm0"); # write output data
1995
+ &movq (&QWP(8,$acc),"mm4");
1996
+ &emms (); # leave MMX state before returning
1997
+ &function_end_A(); # presumably emits the return sequence for this path while keeping the function open for the x86 path below -- confirm in x86asm.pl
1998
+ }
1999
+ &set_label("x86",16); # second argument is presumably the label alignment
2000
+ &mov ($_tbl,$tbl); # hand the chosen table pointer over via $_tbl (defined outside this chunk; presumably a frame slot)
2001
+ &mov ($s0,&DWP(0,$acc)); # load input data
2002
+ &mov ($s1,&DWP(4,$acc));
2003
+ &mov ($s2,&DWP(8,$acc));
2004
+ &mov ($s3,&DWP(12,$acc));
2005
+ &call ("_x86_AES_decrypt_compact");
2006
+ &mov ("esp",$_esp); # restore stack pointer
2007
+ &mov ($acc,&wparam(1)); # load out
2008
+ &mov (&DWP(0,$acc),$s0); # write output data
2009
+ &mov (&DWP(4,$acc),$s1);
2010
+ &mov (&DWP(8,$acc),$s2);
2011
+ &mov (&DWP(12,$acc),$s3);
2012
+ &function_end("asm_AES_decrypt");
2013
+
2014
+ #------------------------------------------------------------------#
2015
+
2016
+ sub enckey() # emits one key-expansion step: eax ^= (table-substituted bytes of edx, rotated by one byte position) ^ rcon[ecx]; callers use the &enckey() form, so the empty prototype is not enforced
2017
+ { # in: eax = word to update, edx = source word, ecx = rcon index, $tbl = biased table pointer; clobbers ebx and esi, and leaves edx shifted right by 16
2018
+ &movz ("esi",&LB("edx")); # rk[i]>>0
2019
+ &movz ("ebx",&BP(-128,$tbl,"esi",1)); # byte-table lookup; -128 undoes the bias the caller added to $tbl
2020
+ &movz ("esi",&HB("edx")); # rk[i]>>8
2021
+ &shl ("ebx",24); # substituted byte 0 lands in bits 24-31
2022
+ &xor ("eax","ebx");
2023
+
2024
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
2025
+ &shr ("edx",16); # expose the upper two source bytes as dl/dh
2026
+ &movz ("esi",&LB("edx")); # rk[i]>>16
2027
+ &xor ("eax","ebx"); # substituted byte 1 lands in bits 0-7 (no shift)
2028
+
2029
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
2030
+ &movz ("esi",&HB("edx")); # rk[i]>>24
2031
+ &shl ("ebx",8); # substituted byte 2 lands in bits 8-15
2032
+ &xor ("eax","ebx");
2033
+
2034
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
2035
+ &shl ("ebx",16); # substituted byte 3 lands in bits 16-23
2036
+ &xor ("eax","ebx");
2037
+
2038
+ &xor ("eax",&DWP(1024-128,$tbl,"ecx",4)); # rcon (32-bit entries indexed by ecx, 1024 bytes past the byte table)
2039
+ }
2040
+
2041
+ &function_begin("_x86_AES_set_encrypt_key"); # internal worker: expands the user key into the round-key schedule; returns eax = 0 on success, -1 for a NULL pointer, -2 for a bit length other than 128/192/256
2041
+ &mov ("esi",&wparam(1)); # user supplied key
2042
+ &mov ("edi",&wparam(3)); # private key schedule
2043
+ # NOTE(review): arguments are read at wparam(1..3), one slot past the C prototype order -- presumably to skip the return address pushed by the wrapper's call; confirm against x86asm.pl
2044
+ &test ("esi",-1); # AND with all ones: ZF is set only when the pointer is zero
2045
+ &jz (&label("badpointer"));
2046
+ &test ("edi",-1);
2047
+ &jz (&label("badpointer"));
2048
+
2049
+ &call (&label("pic_point")); # position-independent addressing: the call pushes the address of pic_point
2050
+ &set_label("pic_point");
2051
+ &blindpop($tbl); # $tbl = run-time address of pic_point
2052
+ &lea ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl)); # $tbl = run-time address of AES_Te
2053
+ &lea ($tbl,&DWP(2048+128,$tbl)); # advance 2048 bytes and add a 128-byte bias; byte lookups below compensate with a -128 displacement
2054
+
2055
+ # prefetch Te4
2056
+ &mov ("eax",&DWP(0-128,$tbl)); # the eight loads touch the 256-byte table every 32 bytes; the loaded values are overwritten before use
2057
+ &mov ("ebx",&DWP(32-128,$tbl));
2058
+ &mov ("ecx",&DWP(64-128,$tbl));
2059
+ &mov ("edx",&DWP(96-128,$tbl));
2060
+ &mov ("eax",&DWP(128-128,$tbl));
2061
+ &mov ("ebx",&DWP(160-128,$tbl));
2062
+ &mov ("ecx",&DWP(192-128,$tbl));
2063
+ &mov ("edx",&DWP(224-128,$tbl));
2064
+
2065
+ &mov ("ecx",&wparam(2)); # number of bits in key
2066
+ &cmp ("ecx",128);
2067
+ &je (&label("10rounds"));
2068
+ &cmp ("ecx",192);
2069
+ &je (&label("12rounds"));
2070
+ &cmp ("ecx",256);
2071
+ &je (&label("14rounds"));
2072
+ &mov ("eax",-2); # invalid number of bits
2073
+ &jmp (&label("exit"));
2074
+ # --- 128-bit key ---
2075
+ &set_label("10rounds");
2076
+ &mov ("eax",&DWP(0,"esi")); # copy first 4 dwords
2077
+ &mov ("ebx",&DWP(4,"esi"));
2078
+ &mov ("ecx",&DWP(8,"esi"));
2079
+ &mov ("edx",&DWP(12,"esi"));
2080
+ &mov (&DWP(0,"edi"),"eax");
2081
+ &mov (&DWP(4,"edi"),"ebx");
2082
+ &mov (&DWP(8,"edi"),"ecx");
2083
+ &mov (&DWP(12,"edi"),"edx");
2084
+
2085
+ &xor ("ecx","ecx"); # ecx = pass counter, also the rcon index used by enckey
2086
+ &jmp (&label("10shortcut")); # first pass: eax and edx still hold rk[0] and rk[3] from the copy above
2087
+
2088
+ &align (4);
2089
+ &set_label("10loop");
2090
+ &mov ("eax",&DWP(0,"edi")); # rk[0]
2091
+ &mov ("edx",&DWP(12,"edi")); # rk[3]
2092
+ &set_label("10shortcut");
2093
+ &enckey (); # eax = rk[0] ^ transformed rk[3] ^ rcon; clobbers ebx, esi, edx
2094
+
2095
+ &mov (&DWP(16,"edi"),"eax"); # rk[4]
2096
+ &xor ("eax",&DWP(4,"edi"));
2097
+ &mov (&DWP(20,"edi"),"eax"); # rk[5]
2098
+ &xor ("eax",&DWP(8,"edi"));
2099
+ &mov (&DWP(24,"edi"),"eax"); # rk[6]
2100
+ &xor ("eax",&DWP(12,"edi"));
2101
+ &mov (&DWP(28,"edi"),"eax"); # rk[7]
2102
+ &inc ("ecx");
2103
+ &add ("edi",16); # slide the window forward by one 4-word group
2104
+ &cmp ("ecx",10);
2105
+ &jl (&label("10loop"));
2106
+
2107
+ &mov (&DWP(80,"edi"),10); # setup number of rounds (edi has advanced 10*16 bytes, so this is offset 240 of the schedule)
2108
+ &xor ("eax","eax"); # return 0
2109
+ &jmp (&label("exit"));
2110
+ # --- 192-bit key ---
2111
+ &set_label("12rounds");
2112
+ &mov ("eax",&DWP(0,"esi")); # copy first 6 dwords
2113
+ &mov ("ebx",&DWP(4,"esi"));
2114
+ &mov ("ecx",&DWP(8,"esi"));
2115
+ &mov ("edx",&DWP(12,"esi"));
2116
+ &mov (&DWP(0,"edi"),"eax");
2117
+ &mov (&DWP(4,"edi"),"ebx");
2118
+ &mov (&DWP(8,"edi"),"ecx");
2119
+ &mov (&DWP(12,"edi"),"edx");
2120
+ &mov ("ecx",&DWP(16,"esi")); # eax is deliberately left holding rk[0] for the first pass
2121
+ &mov ("edx",&DWP(20,"esi")); # edx ends up holding rk[5] for the first pass
2122
+ &mov (&DWP(16,"edi"),"ecx");
2123
+ &mov (&DWP(20,"edi"),"edx");
2124
+
2125
+ &xor ("ecx","ecx"); # ecx = pass counter / rcon index
2126
+ &jmp (&label("12shortcut"));
2127
+
2128
+ &align (4);
2129
+ &set_label("12loop");
2130
+ &mov ("eax",&DWP(0,"edi")); # rk[0]
2131
+ &mov ("edx",&DWP(20,"edi")); # rk[5]
2132
+ &set_label("12shortcut");
2133
+ &enckey ();
2134
+
2135
+ &mov (&DWP(24,"edi"),"eax"); # rk[6]
2136
+ &xor ("eax",&DWP(4,"edi"));
2137
+ &mov (&DWP(28,"edi"),"eax"); # rk[7]
2138
+ &xor ("eax",&DWP(8,"edi"));
2139
+ &mov (&DWP(32,"edi"),"eax"); # rk[8]
2140
+ &xor ("eax",&DWP(12,"edi"));
2141
+ &mov (&DWP(36,"edi"),"eax"); # rk[9]
2142
+
2143
+ &cmp ("ecx",7); # the eighth pass (ecx == 7) stops after four words instead of six
2144
+ &je (&label("12break"));
2145
+ &inc ("ecx");
2146
+
2147
+ &xor ("eax",&DWP(16,"edi"));
2148
+ &mov (&DWP(40,"edi"),"eax"); # rk[10]
2149
+ &xor ("eax",&DWP(20,"edi"));
2150
+ &mov (&DWP(44,"edi"),"eax"); # rk[11]
2151
+
2152
+ &add ("edi",24); # slide the window forward by one 6-word group
2153
+ &jmp (&label("12loop"));
2154
+
2155
+ &set_label("12break");
2156
+ &mov (&DWP(72,"edi"),12); # setup number of rounds (edi has advanced 7*24 bytes, so this is offset 240 of the schedule)
2157
+ &xor ("eax","eax"); # return 0
2158
+ &jmp (&label("exit"));
2159
+ # --- 256-bit key ---
2160
+ &set_label("14rounds");
2161
+ &mov ("eax",&DWP(0,"esi")); # copy first 8 dwords
2162
+ &mov ("ebx",&DWP(4,"esi"));
2163
+ &mov ("ecx",&DWP(8,"esi"));
2164
+ &mov ("edx",&DWP(12,"esi"));
2165
+ &mov (&DWP(0,"edi"),"eax");
2166
+ &mov (&DWP(4,"edi"),"ebx");
2167
+ &mov (&DWP(8,"edi"),"ecx");
2168
+ &mov (&DWP(12,"edi"),"edx");
2169
+ &mov ("eax",&DWP(16,"esi"));
2170
+ &mov ("ebx",&DWP(20,"esi"));
2171
+ &mov ("ecx",&DWP(24,"esi"));
2172
+ &mov ("edx",&DWP(28,"esi")); # edx ends up holding rk[7] for the first pass
2173
+ &mov (&DWP(16,"edi"),"eax");
2174
+ &mov (&DWP(20,"edi"),"ebx");
2175
+ &mov (&DWP(24,"edi"),"ecx");
2176
+ &mov (&DWP(28,"edi"),"edx");
2177
+
2178
+ &xor ("ecx","ecx"); # ecx = pass counter / rcon index
2179
+ &jmp (&label("14shortcut"));
2180
+
2181
+ &align (4);
2182
+ &set_label("14loop");
2183
+ &mov ("edx",&DWP(28,"edi")); # rk[7]
2184
+ &set_label("14shortcut"); # eax was overwritten during the copy, so rk[0] is reloaded on every pass, including the first
2185
+ &mov ("eax",&DWP(0,"edi")); # rk[0]
2186
+
2187
+ &enckey ();
2188
+
2189
+ &mov (&DWP(32,"edi"),"eax"); # rk[8]
2190
+ &xor ("eax",&DWP(4,"edi"));
2191
+ &mov (&DWP(36,"edi"),"eax"); # rk[9]
2192
+ &xor ("eax",&DWP(8,"edi"));
2193
+ &mov (&DWP(40,"edi"),"eax"); # rk[10]
2194
+ &xor ("eax",&DWP(12,"edi"));
2195
+ &mov (&DWP(44,"edi"),"eax"); # rk[11]
2196
+
2197
+ &cmp ("ecx",6); # the seventh pass (ecx == 6) stops after the first four words
2198
+ &je (&label("14break"));
2199
+ &inc ("ecx");
2200
+ # second half of the pass: rk[12] = rk[4] ^ byte-substituted rk[11], with no byte rotation and no rcon
2201
+ &mov ("edx","eax"); # edx = rk[11]
2202
+ &mov ("eax",&DWP(16,"edi")); # rk[4]
2203
+ &movz ("esi",&LB("edx")); # rk[11]>>0
2204
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
2205
+ &movz ("esi",&HB("edx")); # rk[11]>>8
2206
+ &xor ("eax","ebx"); # substituted byte 0 stays in bits 0-7
2207
+
2208
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
2209
+ &shr ("edx",16);
2210
+ &shl ("ebx",8); # substituted byte 1 goes to bits 8-15
2211
+ &movz ("esi",&LB("edx")); # rk[11]>>16
2212
+ &xor ("eax","ebx");
2213
+
2214
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
2215
+ &movz ("esi",&HB("edx")); # rk[11]>>24
2216
+ &shl ("ebx",16); # substituted byte 2 goes to bits 16-23
2217
+ &xor ("eax","ebx");
2218
+
2219
+ &movz ("ebx",&BP(-128,$tbl,"esi",1));
2220
+ &shl ("ebx",24); # substituted byte 3 goes to bits 24-31
2221
+ &xor ("eax","ebx");
2222
+
2223
+ &mov (&DWP(48,"edi"),"eax"); # rk[12]
2224
+ &xor ("eax",&DWP(20,"edi"));
2225
+ &mov (&DWP(52,"edi"),"eax"); # rk[13]
2226
+ &xor ("eax",&DWP(24,"edi"));
2227
+ &mov (&DWP(56,"edi"),"eax"); # rk[14]
2228
+ &xor ("eax",&DWP(28,"edi"));
2229
+ &mov (&DWP(60,"edi"),"eax"); # rk[15]
2230
+
2231
+ &add ("edi",32); # slide the window forward by one 8-word group
2232
+ &jmp (&label("14loop"));
2233
+
2234
+ &set_label("14break");
2235
+ &mov (&DWP(48,"edi"),14); # setup number of rounds (edi has advanced 6*32 bytes, so this is offset 240 of the schedule)
2236
+ &xor ("eax","eax"); # return 0
2237
+ &jmp (&label("exit"));
2238
+
2239
+ &set_label("badpointer");
2240
+ &mov ("eax",-1); # NULL userKey or key pointer; falls through to exit
2241
+ &set_label("exit");
2242
+ &function_end("_x86_AES_set_encrypt_key");
2244
+
2245
+ # int asm_AES_set_encrypt_key(const unsigned char *userKey, const int bits,
2246
+ # AES_KEY *key) -- public entry point: forwards to _x86_AES_set_encrypt_key and returns its eax result unchanged
2247
+ &function_begin_B("asm_AES_set_encrypt_key");
2248
+ &call ("_x86_AES_set_encrypt_key"); # the worker reads its arguments at wparam(1..3), presumably to account for the return address this call pushes -- confirm in x86asm.pl
2249
+ &ret ();
2250
+ &function_end_B("asm_AES_set_encrypt_key");
2251
+
2252
+ sub deckey() # emits code that transforms key word $i at $key in place: with tp2/tp4/tp8 the packed byte-wise x2/x4/x8 multiples of tp1, stores ROTATE(tp1,8)^tp2^tp4^tp8^ROTATE(tp8^tp2^tp1,24)^ROTATE(tp8^tp4^tp1,16)^ROTATE(tp8,8), i.e. the AES InvMixColumns combination for one column; callers use the &deckey(...) form, so the empty prototype is not enforced
2253
+ { my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_; # $i = word index within the current group, $key = pointer register, $tp1 = register already holding word $i, $tp2/$tp4/$tp8 = scratch registers
2254
+ my $tmp = $tbl; # the table register is reused as a temporary here; $acc is clobbered as well
2255
+ # tp2 = each byte of tp1 doubled, with 0x1b xored in where the byte's top bit was set
2256
+ &mov ($tmp,0x80808080);
2257
+ &and ($tmp,$tp1); # top bit of every byte
2258
+ &lea ($tp2,&DWP(0,$tp1,$tp1)); # tp1+tp1, i.e. shifted left by one
2259
+ &mov ($acc,$tmp);
2260
+ &shr ($tmp,7);
2261
+ &sub ($acc,$tmp); # 0x80-0x01 = 0x7f in every byte whose top bit was set, 0 elsewhere
2262
+ &and ($tp2,0xfefefefe); # drop the bits that crossed a byte boundary
2263
+ &and ($acc,0x1b1b1b1b); # 0x7f -> 0x1b
2264
+ &xor ($tp2,$acc);
2265
+ &mov ($tmp,0x80808080);
2266
+ # tp4 = tp2 doubled the same way
2267
+ &and ($tmp,$tp2);
2268
+ &lea ($tp4,&DWP(0,$tp2,$tp2));
2269
+ &mov ($acc,$tmp);
2270
+ &shr ($tmp,7);
2271
+ &sub ($acc,$tmp);
2272
+ &and ($tp4,0xfefefefe);
2273
+ &and ($acc,0x1b1b1b1b);
2274
+ &xor ($tp2,$tp1); # tp2^tp1
2275
+ &xor ($tp4,$acc);
2276
+ &mov ($tmp,0x80808080);
2277
+ # tp8 = tp4 doubled the same way
2278
+ &and ($tmp,$tp4);
2279
+ &lea ($tp8,&DWP(0,$tp4,$tp4));
2280
+ &mov ($acc,$tmp);
2281
+ &shr ($tmp,7);
2282
+ &xor ($tp4,$tp1); # tp4^tp1
2283
+ &sub ($acc,$tmp);
2284
+ &and ($tp8,0xfefefefe);
2285
+ &and ($acc,0x1b1b1b1b);
2286
+ &rotl ($tp1,8); # = ROTATE(tp1,8)
2287
+ &xor ($tp8,$acc);
2288
+
2289
+ &mov ($tmp,&DWP(4*($i+1),$key)); # modulo-scheduled load: fetch the next word early; it is moved into $tp2 below
2290
+ # combine the partial products into $tp1
2291
+ &xor ($tp1,$tp2);
2292
+ &xor ($tp2,$tp8);
2293
+ &xor ($tp1,$tp4);
2294
+ &rotl ($tp2,24);
2295
+ &xor ($tp4,$tp8);
2296
+ &xor ($tp1,$tp8); # ^= tp8^(tp4^tp1)^(tp2^tp1)
2297
+ &rotl ($tp4,16);
2298
+ &xor ($tp1,$tp2); # ^= ROTATE(tp8^tp2^tp1,24)
2299
+ &rotl ($tp8,8);
2300
+ &xor ($tp1,$tp4); # ^= ROTATE(tp8^tp4^tp1,16)
2301
+ &mov ($tp2,$tmp); # $tp2 now holds the next word, ready to be the $tp1 of the following deckey call
2302
+ &xor ($tp1,$tp8); # ^= ROTATE(tp8,8)
2303
+
2304
+ &mov (&DWP(4*$i,$key),$tp1); # write the transformed word back in place
2305
+ }
2306
+
2307
+ # int asm_AES_set_decrypt_key(const unsigned char *userKey, const int bits,
2308
+ # AES_KEY *key) -- builds the encryption schedule, reverses the order of its 16-byte round keys, then runs deckey over every round key except the first and the last; returns 0, or the worker's non-zero error code
2309
+ &function_begin_B("asm_AES_set_decrypt_key");
2310
+ &call ("_x86_AES_set_encrypt_key"); # called before anything is pushed, so the worker sees this function's own arguments
2311
+ &cmp ("eax",0);
2312
+ &je (&label("proceed"));
2313
+ &ret (); # key setup failed: return the error code in eax unchanged
2314
+
2315
+ &set_label("proceed");
2316
+ &push ("ebp"); # registers are saved by hand here, after the early-return path above
2317
+ &push ("ebx");
2318
+ &push ("esi");
2319
+ &push ("edi");
2320
+
2321
+ &mov ("esi",&wparam(2)); # esi = key (start of the schedule)
2322
+ &mov ("ecx",&DWP(240,"esi")); # pull number of rounds
2323
+ &lea ("ecx",&DWP(0,"","ecx",4)); # ecx = 4*rounds
2324
+ &lea ("edi",&DWP(0,"esi","ecx",4)); # pointer to last chunk (key + 16*rounds)
2325
+
2326
+ &set_label("invert",4); # invert order of chunks
2327
+ &mov ("eax",&DWP(0,"esi")); # swap the first 8 bytes of the two chunks
2328
+ &mov ("ebx",&DWP(4,"esi"));
2329
+ &mov ("ecx",&DWP(0,"edi"));
2330
+ &mov ("edx",&DWP(4,"edi"));
2331
+ &mov (&DWP(0,"edi"),"eax");
2332
+ &mov (&DWP(4,"edi"),"ebx");
2333
+ &mov (&DWP(0,"esi"),"ecx");
2334
+ &mov (&DWP(4,"esi"),"edx");
2335
+ &mov ("eax",&DWP(8,"esi")); # swap the remaining 8 bytes
2336
+ &mov ("ebx",&DWP(12,"esi"));
2337
+ &mov ("ecx",&DWP(8,"edi"));
2338
+ &mov ("edx",&DWP(12,"edi"));
2339
+ &mov (&DWP(8,"edi"),"eax");
2340
+ &mov (&DWP(12,"edi"),"ebx");
2341
+ &mov (&DWP(8,"esi"),"ecx");
2342
+ &mov (&DWP(12,"esi"),"edx");
2343
+ &add ("esi",16);
2344
+ &sub ("edi",16);
2345
+ &cmp ("esi","edi"); # the worker stores 10, 12 or 14 rounds (always even), so the pointers meet exactly on the middle chunk, which stays in place
2346
+ &jne (&label("invert"));
2347
+
2348
+ &mov ($key,&wparam(2));
2349
+ &mov ($acc,&DWP(240,$key)); # pull number of rounds
2350
+ &lea ($acc,&DWP(-2,$acc,$acc)); # 2*rounds-2
2351
+ &lea ($acc,&DWP(0,$key,$acc,8)); # key + 16*(rounds-1): address of the last chunk to transform
2352
+ &mov (&wparam(2),$acc); # the argument's stack slot is overwritten and reused as the loop bound
2353
+
2354
+ &mov ($s0,&DWP(16,$key)); # modulo-scheduled load
2355
+ &set_label("permute",4); # permute the key schedule
2356
+ &add ($key,16); # first pass starts at the second chunk; the chunk at offset 0 is never transformed
2357
+ &deckey (0,$key,$s0,$s1,$s2,$s3); # each call leaves the following word in its $tp2 argument, which is the next call's $tp1
2358
+ &deckey (1,$key,$s1,$s2,$s3,$s0);
2359
+ &deckey (2,$key,$s2,$s3,$s0,$s1);
2360
+ &deckey (3,$key,$s3,$s0,$s1,$s2); # preloads the first word of the next chunk into $s0
2361
+ &cmp ($key,&wparam(2));
2362
+ &jb (&label("permute")); # unsigned pointer compare against the bound stored above
2363
+
2364
+ &xor ("eax","eax"); # return success
2365
+ &function_end("asm_AES_set_decrypt_key");
2366
+ &asciz("AES for x86, CRYPTOGAMS by <appro\@openssl.org>"); # attribution string passed to the asciz helper for inclusion in the generated output
2367
+
2368
+ &asm_finish(); # final statement visible in this script; presumably finalizes and writes out the generated assembly (helper defined outside this file -- confirm in x86asm.pl)