ring-native 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (261) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/Gemfile +3 -0
  4. data/README.md +22 -0
  5. data/Rakefile +1 -0
  6. data/ext/ring/extconf.rb +29 -0
  7. data/lib/ring/native.rb +8 -0
  8. data/lib/ring/native/version.rb +5 -0
  9. data/ring-native.gemspec +25 -0
  10. data/vendor/ring/BUILDING.md +40 -0
  11. data/vendor/ring/Cargo.toml +43 -0
  12. data/vendor/ring/LICENSE +185 -0
  13. data/vendor/ring/Makefile +35 -0
  14. data/vendor/ring/PORTING.md +163 -0
  15. data/vendor/ring/README.md +113 -0
  16. data/vendor/ring/STYLE.md +197 -0
  17. data/vendor/ring/appveyor.yml +27 -0
  18. data/vendor/ring/build.rs +108 -0
  19. data/vendor/ring/crypto/aes/aes.c +1142 -0
  20. data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +25 -0
  21. data/vendor/ring/crypto/aes/aes_test.cc +93 -0
  22. data/vendor/ring/crypto/aes/asm/aes-586.pl +2368 -0
  23. data/vendor/ring/crypto/aes/asm/aes-armv4.pl +1249 -0
  24. data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +2246 -0
  25. data/vendor/ring/crypto/aes/asm/aesni-x86.pl +1318 -0
  26. data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +2084 -0
  27. data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +675 -0
  28. data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +1364 -0
  29. data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +1565 -0
  30. data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +841 -0
  31. data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +1116 -0
  32. data/vendor/ring/crypto/aes/internal.h +87 -0
  33. data/vendor/ring/crypto/aes/mode_wrappers.c +61 -0
  34. data/vendor/ring/crypto/bn/add.c +394 -0
  35. data/vendor/ring/crypto/bn/asm/armv4-mont.pl +694 -0
  36. data/vendor/ring/crypto/bn/asm/armv8-mont.pl +1503 -0
  37. data/vendor/ring/crypto/bn/asm/bn-586.pl +774 -0
  38. data/vendor/ring/crypto/bn/asm/co-586.pl +287 -0
  39. data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +1882 -0
  40. data/vendor/ring/crypto/bn/asm/x86-mont.pl +592 -0
  41. data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +599 -0
  42. data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +1393 -0
  43. data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +3507 -0
  44. data/vendor/ring/crypto/bn/bn.c +352 -0
  45. data/vendor/ring/crypto/bn/bn_asn1.c +74 -0
  46. data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +25 -0
  47. data/vendor/ring/crypto/bn/bn_test.cc +1696 -0
  48. data/vendor/ring/crypto/bn/cmp.c +200 -0
  49. data/vendor/ring/crypto/bn/convert.c +433 -0
  50. data/vendor/ring/crypto/bn/ctx.c +311 -0
  51. data/vendor/ring/crypto/bn/div.c +594 -0
  52. data/vendor/ring/crypto/bn/exponentiation.c +1335 -0
  53. data/vendor/ring/crypto/bn/gcd.c +711 -0
  54. data/vendor/ring/crypto/bn/generic.c +1019 -0
  55. data/vendor/ring/crypto/bn/internal.h +316 -0
  56. data/vendor/ring/crypto/bn/montgomery.c +516 -0
  57. data/vendor/ring/crypto/bn/mul.c +888 -0
  58. data/vendor/ring/crypto/bn/prime.c +829 -0
  59. data/vendor/ring/crypto/bn/random.c +334 -0
  60. data/vendor/ring/crypto/bn/rsaz_exp.c +262 -0
  61. data/vendor/ring/crypto/bn/rsaz_exp.h +53 -0
  62. data/vendor/ring/crypto/bn/shift.c +276 -0
  63. data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +25 -0
  64. data/vendor/ring/crypto/bytestring/bytestring_test.cc +421 -0
  65. data/vendor/ring/crypto/bytestring/cbb.c +399 -0
  66. data/vendor/ring/crypto/bytestring/cbs.c +227 -0
  67. data/vendor/ring/crypto/bytestring/internal.h +46 -0
  68. data/vendor/ring/crypto/chacha/chacha_generic.c +140 -0
  69. data/vendor/ring/crypto/chacha/chacha_vec.c +323 -0
  70. data/vendor/ring/crypto/chacha/chacha_vec_arm.S +1447 -0
  71. data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +153 -0
  72. data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +25 -0
  73. data/vendor/ring/crypto/cipher/e_aes.c +390 -0
  74. data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +208 -0
  75. data/vendor/ring/crypto/cipher/internal.h +173 -0
  76. data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +543 -0
  77. data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +9 -0
  78. data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +475 -0
  79. data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +23 -0
  80. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +422 -0
  81. data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +484 -0
  82. data/vendor/ring/crypto/cipher/test/cipher_test.txt +100 -0
  83. data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +25 -0
  84. data/vendor/ring/crypto/constant_time_test.c +304 -0
  85. data/vendor/ring/crypto/cpu-arm-asm.S +32 -0
  86. data/vendor/ring/crypto/cpu-arm.c +199 -0
  87. data/vendor/ring/crypto/cpu-intel.c +261 -0
  88. data/vendor/ring/crypto/crypto.c +151 -0
  89. data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +2118 -0
  90. data/vendor/ring/crypto/curve25519/curve25519.c +4888 -0
  91. data/vendor/ring/crypto/curve25519/x25519_test.cc +128 -0
  92. data/vendor/ring/crypto/digest/md32_common.h +181 -0
  93. data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +2725 -0
  94. data/vendor/ring/crypto/ec/ec.c +193 -0
  95. data/vendor/ring/crypto/ec/ec_curves.c +61 -0
  96. data/vendor/ring/crypto/ec/ec_key.c +228 -0
  97. data/vendor/ring/crypto/ec/ec_montgomery.c +114 -0
  98. data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +25 -0
  99. data/vendor/ring/crypto/ec/internal.h +243 -0
  100. data/vendor/ring/crypto/ec/oct.c +253 -0
  101. data/vendor/ring/crypto/ec/p256-64.c +1794 -0
  102. data/vendor/ring/crypto/ec/p256-x86_64-table.h +9548 -0
  103. data/vendor/ring/crypto/ec/p256-x86_64.c +509 -0
  104. data/vendor/ring/crypto/ec/simple.c +1007 -0
  105. data/vendor/ring/crypto/ec/util-64.c +183 -0
  106. data/vendor/ring/crypto/ec/wnaf.c +508 -0
  107. data/vendor/ring/crypto/ecdh/ecdh.c +155 -0
  108. data/vendor/ring/crypto/ecdsa/ecdsa.c +304 -0
  109. data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +193 -0
  110. data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +25 -0
  111. data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +327 -0
  112. data/vendor/ring/crypto/header_removed.h +17 -0
  113. data/vendor/ring/crypto/internal.h +495 -0
  114. data/vendor/ring/crypto/libring.Windows.vcxproj +101 -0
  115. data/vendor/ring/crypto/mem.c +98 -0
  116. data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +1045 -0
  117. data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +517 -0
  118. data/vendor/ring/crypto/modes/asm/ghash-x86.pl +1393 -0
  119. data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +1741 -0
  120. data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +422 -0
  121. data/vendor/ring/crypto/modes/ctr.c +226 -0
  122. data/vendor/ring/crypto/modes/gcm.c +1206 -0
  123. data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +25 -0
  124. data/vendor/ring/crypto/modes/gcm_test.c +348 -0
  125. data/vendor/ring/crypto/modes/internal.h +299 -0
  126. data/vendor/ring/crypto/perlasm/arm-xlate.pl +170 -0
  127. data/vendor/ring/crypto/perlasm/readme +100 -0
  128. data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +1164 -0
  129. data/vendor/ring/crypto/perlasm/x86asm.pl +292 -0
  130. data/vendor/ring/crypto/perlasm/x86gas.pl +263 -0
  131. data/vendor/ring/crypto/perlasm/x86masm.pl +200 -0
  132. data/vendor/ring/crypto/perlasm/x86nasm.pl +187 -0
  133. data/vendor/ring/crypto/poly1305/poly1305.c +331 -0
  134. data/vendor/ring/crypto/poly1305/poly1305_arm.c +301 -0
  135. data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +2015 -0
  136. data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +25 -0
  137. data/vendor/ring/crypto/poly1305/poly1305_test.cc +80 -0
  138. data/vendor/ring/crypto/poly1305/poly1305_test.txt +52 -0
  139. data/vendor/ring/crypto/poly1305/poly1305_vec.c +892 -0
  140. data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +75 -0
  141. data/vendor/ring/crypto/rand/internal.h +32 -0
  142. data/vendor/ring/crypto/rand/rand.c +189 -0
  143. data/vendor/ring/crypto/rand/urandom.c +219 -0
  144. data/vendor/ring/crypto/rand/windows.c +56 -0
  145. data/vendor/ring/crypto/refcount_c11.c +66 -0
  146. data/vendor/ring/crypto/refcount_lock.c +53 -0
  147. data/vendor/ring/crypto/refcount_test.Windows.vcxproj +25 -0
  148. data/vendor/ring/crypto/refcount_test.c +58 -0
  149. data/vendor/ring/crypto/rsa/blinding.c +462 -0
  150. data/vendor/ring/crypto/rsa/internal.h +108 -0
  151. data/vendor/ring/crypto/rsa/padding.c +300 -0
  152. data/vendor/ring/crypto/rsa/rsa.c +450 -0
  153. data/vendor/ring/crypto/rsa/rsa_asn1.c +261 -0
  154. data/vendor/ring/crypto/rsa/rsa_impl.c +944 -0
  155. data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +25 -0
  156. data/vendor/ring/crypto/rsa/rsa_test.cc +437 -0
  157. data/vendor/ring/crypto/sha/asm/sha-armv8.pl +436 -0
  158. data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +2390 -0
  159. data/vendor/ring/crypto/sha/asm/sha256-586.pl +1275 -0
  160. data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +735 -0
  161. data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +14 -0
  162. data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +14 -0
  163. data/vendor/ring/crypto/sha/asm/sha512-586.pl +911 -0
  164. data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +666 -0
  165. data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +14 -0
  166. data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +14 -0
  167. data/vendor/ring/crypto/sha/sha1.c +271 -0
  168. data/vendor/ring/crypto/sha/sha256.c +204 -0
  169. data/vendor/ring/crypto/sha/sha512.c +355 -0
  170. data/vendor/ring/crypto/test/file_test.cc +326 -0
  171. data/vendor/ring/crypto/test/file_test.h +181 -0
  172. data/vendor/ring/crypto/test/malloc.cc +150 -0
  173. data/vendor/ring/crypto/test/scoped_types.h +95 -0
  174. data/vendor/ring/crypto/test/test.Windows.vcxproj +35 -0
  175. data/vendor/ring/crypto/test/test_util.cc +46 -0
  176. data/vendor/ring/crypto/test/test_util.h +41 -0
  177. data/vendor/ring/crypto/thread_none.c +55 -0
  178. data/vendor/ring/crypto/thread_pthread.c +165 -0
  179. data/vendor/ring/crypto/thread_test.Windows.vcxproj +25 -0
  180. data/vendor/ring/crypto/thread_test.c +200 -0
  181. data/vendor/ring/crypto/thread_win.c +282 -0
  182. data/vendor/ring/examples/checkdigest.rs +103 -0
  183. data/vendor/ring/include/openssl/aes.h +121 -0
  184. data/vendor/ring/include/openssl/arm_arch.h +129 -0
  185. data/vendor/ring/include/openssl/base.h +156 -0
  186. data/vendor/ring/include/openssl/bn.h +794 -0
  187. data/vendor/ring/include/openssl/buffer.h +18 -0
  188. data/vendor/ring/include/openssl/bytestring.h +235 -0
  189. data/vendor/ring/include/openssl/chacha.h +37 -0
  190. data/vendor/ring/include/openssl/cmac.h +76 -0
  191. data/vendor/ring/include/openssl/cpu.h +184 -0
  192. data/vendor/ring/include/openssl/crypto.h +43 -0
  193. data/vendor/ring/include/openssl/curve25519.h +88 -0
  194. data/vendor/ring/include/openssl/ec.h +225 -0
  195. data/vendor/ring/include/openssl/ec_key.h +129 -0
  196. data/vendor/ring/include/openssl/ecdh.h +110 -0
  197. data/vendor/ring/include/openssl/ecdsa.h +156 -0
  198. data/vendor/ring/include/openssl/err.h +201 -0
  199. data/vendor/ring/include/openssl/mem.h +101 -0
  200. data/vendor/ring/include/openssl/obj_mac.h +71 -0
  201. data/vendor/ring/include/openssl/opensslfeatures.h +68 -0
  202. data/vendor/ring/include/openssl/opensslv.h +18 -0
  203. data/vendor/ring/include/openssl/ossl_typ.h +18 -0
  204. data/vendor/ring/include/openssl/poly1305.h +51 -0
  205. data/vendor/ring/include/openssl/rand.h +70 -0
  206. data/vendor/ring/include/openssl/rsa.h +399 -0
  207. data/vendor/ring/include/openssl/thread.h +133 -0
  208. data/vendor/ring/include/openssl/type_check.h +71 -0
  209. data/vendor/ring/mk/Common.props +63 -0
  210. data/vendor/ring/mk/Windows.props +42 -0
  211. data/vendor/ring/mk/WindowsTest.props +18 -0
  212. data/vendor/ring/mk/appveyor.bat +62 -0
  213. data/vendor/ring/mk/bottom_of_makefile.mk +54 -0
  214. data/vendor/ring/mk/ring.mk +266 -0
  215. data/vendor/ring/mk/top_of_makefile.mk +214 -0
  216. data/vendor/ring/mk/travis.sh +40 -0
  217. data/vendor/ring/mk/update-travis-yml.py +229 -0
  218. data/vendor/ring/ring.sln +153 -0
  219. data/vendor/ring/src/aead.rs +682 -0
  220. data/vendor/ring/src/agreement.rs +248 -0
  221. data/vendor/ring/src/c.rs +129 -0
  222. data/vendor/ring/src/constant_time.rs +37 -0
  223. data/vendor/ring/src/der.rs +96 -0
  224. data/vendor/ring/src/digest.rs +690 -0
  225. data/vendor/ring/src/digest_tests.txt +57 -0
  226. data/vendor/ring/src/ecc.rs +28 -0
  227. data/vendor/ring/src/ecc_build.rs +279 -0
  228. data/vendor/ring/src/ecc_curves.rs +117 -0
  229. data/vendor/ring/src/ed25519_tests.txt +2579 -0
  230. data/vendor/ring/src/exe_tests.rs +46 -0
  231. data/vendor/ring/src/ffi.rs +29 -0
  232. data/vendor/ring/src/file_test.rs +187 -0
  233. data/vendor/ring/src/hkdf.rs +153 -0
  234. data/vendor/ring/src/hkdf_tests.txt +59 -0
  235. data/vendor/ring/src/hmac.rs +414 -0
  236. data/vendor/ring/src/hmac_tests.txt +97 -0
  237. data/vendor/ring/src/input.rs +312 -0
  238. data/vendor/ring/src/lib.rs +41 -0
  239. data/vendor/ring/src/pbkdf2.rs +265 -0
  240. data/vendor/ring/src/pbkdf2_tests.txt +113 -0
  241. data/vendor/ring/src/polyfill.rs +57 -0
  242. data/vendor/ring/src/rand.rs +28 -0
  243. data/vendor/ring/src/signature.rs +314 -0
  244. data/vendor/ring/third-party/NIST/README.md +9 -0
  245. data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +263 -0
  246. data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +309 -0
  247. data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +267 -0
  248. data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +263 -0
  249. data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +309 -0
  250. data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +267 -0
  251. data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +263 -0
  252. data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +309 -0
  253. data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +267 -0
  254. data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +519 -0
  255. data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +309 -0
  256. data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +523 -0
  257. data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +519 -0
  258. data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +309 -0
  259. data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +523 -0
  260. data/vendor/ring/third-party/NIST/sha256sums.txt +1 -0
  261. metadata +333 -0
@@ -0,0 +1,841 @@
1
+ #!/usr/bin/env perl
2
+
3
+ ######################################################################
4
+ ## Constant-time SSSE3 AES core implementation.
5
+ ## version 0.1
6
+ ##
7
+ ## By Mike Hamburg (Stanford University), 2009
8
+ ## Public domain.
9
+ ##
10
+ ## For details see http://shiftleft.org/papers/vector_aes/ and
11
+ ## http://crypto.stanford.edu/vpaes/.
12
+
13
+ ######################################################################
14
+ # September 2011.
15
+ #
16
+ # Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
17
+ # aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
18
+ # doesn't handle partial vectors (doesn't have to if called from
19
+ # EVP only). "Drop-in" implies that this module doesn't share key
20
+ # schedule structure with the original nor does it make assumption
21
+ # about its alignment...
22
+ #
23
+ # Performance summary. aes-586.pl column lists large-block CBC
24
+ # encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
25
+ # byte processed with 128-bit key, and vpaes-x86.pl column - [also
26
+ # large-block CBC] encrypt/decrypt.
27
+ #
28
+ # aes-586.pl vpaes-x86.pl
29
+ #
30
+ # Core 2(**) 28.1/41.4/18.3 21.9/25.2(***)
31
+ # Nehalem 27.9/40.4/18.1 10.2/11.9
32
+ # Atom 70.7/92.1/60.1 61.1/75.4(***)
33
+ # Silvermont 45.4/62.9/24.1 49.2/61.1(***)
34
+ #
35
+ # (*) "Hyper-threading" in the context refers rather to cache shared
36
+ # among multiple cores, than to specifically Intel HTT. As vast
37
+ # majority of contemporary cores share cache, slower code path
38
+ # is commonplace. In other words "with-hyper-threading-off"
39
+ # results are presented mostly for reference purposes.
40
+ #
41
+ # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
42
+ #
43
+ # (***) Less impressive improvement on Core 2 and Atom is due to slow
44
+ # pshufb, yet it's respectable +28%/64% improvement on Core 2
45
+ # and +15% on Atom (as implied, over "hyper-threading-safe"
46
+ # code path).
47
+ #
48
+ # <appro@openssl.org>
49
+
50
+ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # $dir = leading directory part of this script's path ($0), trailing separator included
51
+ push(@INC,"${dir}","${dir}../../perlasm"); # let "require" search the script's directory and ../../perlasm relative to it
52
+ require "x86asm.pl";
53
+ # $ARGV[0] is handed to asm_init unchanged; $x86only becomes true when the last argument is "386".
54
+ &asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
55
+
56
+ $PREFIX="vpaes";
57
+ # Symbolic names for the 32-bit general-purpose registers used by every routine below.
58
+ my ($round, $base, $magic, $key, $const, $inp, $out)=
59
+ ("eax", "ebx", "ecx", "edx","ebp", "esi","edi");
60
+
61
+ &static_label("_vpaes_consts");
62
+ &static_label("_vpaes_schedule_low_round");
63
+
64
+ &set_label("_vpaes_consts",64); # start of the constant pool (second argument presumably requests 64-byte alignment - confirm in x86asm.pl)
65
+ $k_inv=-0x30; # inv, inva; first table after the label, so every $k_* offset is relative to _vpaes_consts+0x30
66
+ &data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
67
+ &data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);
68
+ # The $k_* offsets advance by 0x10 per data_word row, i.e. each row of four values is one 16-byte vector.
69
+ $k_s0F=-0x10; # s0F
70
+ &data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);
71
+
72
+ $k_ipt=0x00; # input transform (lo, hi)
73
+ &data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
74
+ &data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);
75
+
76
+ $k_sb1=0x20; # sb1u, sb1t
77
+ &data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
78
+ &data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
79
+ $k_sb2=0x40; # sb2u, sb2t
80
+ &data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
81
+ &data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
82
+ $k_sbo=0x60; # sbou, sbot
83
+ &data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
84
+ &data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);
85
+
86
+ $k_mc_forward=0x80; # mc_forward
87
+ &data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
88
+ &data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
89
+ &data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
90
+ &data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);
91
+
92
+ $k_mc_backward=0xc0; # mc_backward
93
+ &data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
94
+ &data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
95
+ &data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
96
+ &data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);
97
+
98
+ $k_sr=0x100; # sr
99
+ &data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
100
+ &data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
101
+ &data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
102
+ &data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);
103
+
104
+ $k_rcon=0x140; # rcon
105
+ &data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);
106
+
107
+ $k_s63=0x150; # s63: all equal to 0x63 transformed
108
+ &data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);
109
+
110
+ $k_opt=0x160; # output transform
111
+ &data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
112
+ &data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);
113
+
114
+ $k_deskew=0x180; # deskew tables: inverts the sbox's "skew"
115
+ &data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
116
+ &data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
117
+ ##
118
+ ## Decryption stuff
119
+ ## Key schedule constants
120
+ ##
121
+ $k_dksd=0x1a0; # decryption key schedule: invskew x*D
122
+ &data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
123
+ &data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
124
+ $k_dksb=0x1c0; # decryption key schedule: invskew x*B
125
+ &data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
126
+ &data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
127
+ $k_dkse=0x1e0; # decryption key schedule: invskew x*E + 0x63
128
+ &data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
129
+ &data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
130
+ $k_dks9=0x200; # decryption key schedule: invskew x*9
131
+ &data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
132
+ &data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);
133
+
134
+ ##
135
+ ## Decryption stuff
136
+ ## Round function constants
137
+ ##
138
+ $k_dipt=0x220; # decryption input transform
139
+ &data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
140
+ &data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);
141
+
142
+ $k_dsb9=0x240; # decryption sbox output *9*u, *9*t
143
+ &data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
144
+ &data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
145
+ $k_dsbd=0x260; # decryption sbox output *D*u, *D*t
146
+ &data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
147
+ &data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
148
+ $k_dsbb=0x280; # decryption sbox output *B*u, *B*t
149
+ &data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
150
+ &data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
151
+ $k_dsbe=0x2a0; # decryption sbox output *E*u, *E*t
152
+ &data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
153
+ &data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
154
+ $k_dsbo=0x2c0; # decryption sbox final output
155
+ &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
156
+ &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
157
+ &asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)"); # identification string placed after the tables
158
+ &align (64);
159
+
160
+ &function_begin_B("_vpaes_preheat"); # rebases $const and caches the k_inv / k_s0F vectors in xmm7 / xmm6
161
+ &add ($const,&DWP(0,"esp")); # $const += dword at [esp] (the return address on entry); NOTE(review): callers presumably preload $const with an offset relative to their return point - confirm
162
+ &movdqa ("xmm7",&QWP($k_inv,$const)); # xmm7 = first 16 bytes of the k_inv table
163
+ &movdqa ("xmm6",&QWP($k_s0F,$const)); # xmm6 = k_s0F (0x0F in every byte)
164
+ &ret ();
165
+ &function_end_B("_vpaes_preheat");
166
+
167
+ ##
168
+ ## _vpaes_encrypt_core
169
+ ##
170
+ ## AES-encrypt %xmm0.
171
+ ##
172
+ ## Inputs:
173
+ ## %xmm0 = input
174
+ ## %xmm6-%xmm7, %ebp (constants base) as in _vpaes_preheat
175
+ ## (%edx) = scheduled keys; the round count is read from offset 240
176
+ ##
177
+ ## Output in %xmm0
178
+ ## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
179
+ ##
180
+ ## The loop's jnz consumes ZF from "sub $round,1" (from "add $key,16" on first entry); nothing in between alters flags.
181
+ &function_begin_B("_vpaes_encrypt_core");
182
+ &mov ($magic,16); # $magic = byte offset into the mc/sr tables, kept within 0x00..0x30 by the "and" in the loop
183
+ &mov ($round,&DWP(240,$key)); # $round = loop counter read from offset 240 of the key schedule
184
+ &movdqa ("xmm1","xmm6"); # 1 = xmm6 (0x0F mask as set up by _vpaes_preheat)
185
+ &movdqa ("xmm2",&QWP($k_ipt,$const)); # 2 : ipt lo
186
+ &pandn ("xmm1","xmm0"); # 1 = high nibbles of input (still shifted left by 4)
187
+ &pand ("xmm0","xmm6"); # 0 = low nibbles of input
188
+ &movdqu ("xmm5",&QWP(0,$key)); # 5 = first round key (unaligned load)
189
+ &pshufb ("xmm2","xmm0"); # 2 = ipt lo looked up by low nibbles
190
+ &movdqa ("xmm0",&QWP($k_ipt+16,$const)); # 0 : ipt hi
191
+ &pxor ("xmm2","xmm5"); # 2 ^= round key
192
+ &psrld ("xmm1",4); # 1 = high nibbles
193
+ &add ($key,16); # next key; the non-zero result also leaves ZF clear for the first jnz
194
+ &pshufb ("xmm0","xmm1"); # 0 = ipt hi looked up by high nibbles
195
+ &lea ($base,&DWP($k_mc_backward,$const)); # $base = &k_mc_backward; -0x40 reaches k_mc_forward, +0x40 reaches k_sr
196
+ &pxor ("xmm0","xmm2"); # 0 = transformed input ^ round key
197
+ &jmp (&label("enc_entry"));
198
+
199
+
200
+ &set_label("enc_loop",16);
201
+ # middle of middle round
202
+ &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u
203
+ &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
204
+ &pshufb ("xmm4","xmm2"); # 4 = sb1u
205
+ &pshufb ("xmm0","xmm3"); # 0 = sb1t
206
+ &pxor ("xmm4","xmm5"); # 4 = sb1u + k
207
+ &movdqa ("xmm5",&QWP($k_sb2,$const)); # 5 : sb2u
208
+ &pxor ("xmm0","xmm4"); # 0 = A
209
+ &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
210
+ &pshufb ("xmm5","xmm2"); # 5 = sb2u
211
+ &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
212
+ &movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[]
213
+ &pshufb ("xmm2","xmm3"); # 2 = sb2t
214
+ &movdqa ("xmm3","xmm0"); # 3 = A
215
+ &pxor ("xmm2","xmm5"); # 2 = 2A
216
+ &pshufb ("xmm0","xmm1"); # 0 = B
217
+ &add ($key,16); # next key
218
+ &pxor ("xmm0","xmm2"); # 0 = 2A+B
219
+ &pshufb ("xmm3","xmm4"); # 3 = D
220
+ &add ($magic,16); # next mc
221
+ &pxor ("xmm3","xmm0"); # 3 = 2A+B+D
222
+ &pshufb ("xmm0","xmm1"); # 0 = 2B+C
223
+ &and ($magic,0x30); # ... mod 4
224
+ &sub ($round,1); # nr--; sets ZF for the jnz at the end of enc_entry
225
+ &pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D
226
+
227
+ &set_label("enc_entry");
228
+ # top of round
229
+ &movdqa ("xmm1","xmm6"); # 1 : i
230
+ &movdqa ("xmm5",&QWP($k_inv+16,$const));# 5 : a/k
231
+ &pandn ("xmm1","xmm0"); # 1 = i<<4
232
+ &psrld ("xmm1",4); # 1 = i
233
+ &pand ("xmm0","xmm6"); # 0 = k
234
+ &pshufb ("xmm5","xmm0"); # 5 = a/k
235
+ &movdqa ("xmm3","xmm7"); # 3 : 1/i
236
+ &pxor ("xmm0","xmm1"); # 0 = j
237
+ &pshufb ("xmm3","xmm1"); # 3 = 1/i
238
+ &movdqa ("xmm4","xmm7"); # 4 : 1/j
239
+ &pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k
240
+ &pshufb ("xmm4","xmm0"); # 4 = 1/j
241
+ &movdqa ("xmm2","xmm7"); # 2 : 1/iak
242
+ &pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k
243
+ &pshufb ("xmm2","xmm3"); # 2 = 1/iak
244
+ &movdqa ("xmm3","xmm7"); # 3 : 1/jak
245
+ &pxor ("xmm2","xmm0"); # 2 = io
246
+ &pshufb ("xmm3","xmm4"); # 3 = 1/jak
247
+ &movdqu ("xmm5",&QWP(0,$key)); # 5 = next round key (unaligned load)
248
+ &pxor ("xmm3","xmm1"); # 3 = jo
249
+ &jnz (&label("enc_loop")); # loop while $round != 0 (ZF from the last integer add/sub)
250
+
251
+ # middle of last round
252
+ &movdqa ("xmm4",&QWP($k_sbo,$const)); # 4 : sbou .Lk_sbo
253
+ &movdqa ("xmm0",&QWP($k_sbo+16,$const));# 0 : sbot .Lk_sbo+16
254
+ &pshufb ("xmm4","xmm2"); # 4 = sbou
255
+ &pxor ("xmm4","xmm5"); # 4 = sbou + k
256
+ &pshufb ("xmm0","xmm3"); # 0 = sbot
257
+ &movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
258
+ &pxor ("xmm0","xmm4"); # 0 = A
259
+ &pshufb ("xmm0","xmm1"); # 0 = A permuted by the selected .Lk_sr[] row
260
+ &ret ();
261
+ &function_end_B("_vpaes_encrypt_core");
262
+
263
+ ##
264
+ ## Decryption core
265
+ ##
266
+ ## Same API as encryption core.
267
+ ## $base is pointed at k_dsbd; the dsb9/dsbd/dsbb/dsbe/dsbo tables are addressed at -0x20..0x70 from it.
268
+ &function_begin_B("_vpaes_decrypt_core");
269
+ &lea ($base,&DWP($k_dsbd,$const)); # $base = &k_dsbd
270
+ &mov ($round,&DWP(240,$key)); # $round = loop counter read from offset 240 of the key schedule
271
+ &movdqa ("xmm1","xmm6"); # 1 = xmm6 (0x0F mask as set up by _vpaes_preheat)
272
+ &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base)); # 2 : dipt lo
273
+ &pandn ("xmm1","xmm0"); # 1 = high nibbles of input (still shifted left by 4)
274
+ &mov ($magic,$round); # $magic = (($round << 4) ^ 0x30) & 0x30, built over the next integer ops
275
+ &psrld ("xmm1",4); # 1 = high nibbles
276
+ &movdqu ("xmm5",&QWP(0,$key)); # 5 = first round key (unaligned load)
277
+ &shl ($magic,4);
278
+ &pand ("xmm0","xmm6"); # 0 = low nibbles of input
279
+ &pshufb ("xmm2","xmm0"); # 2 = dipt lo looked up by low nibbles
280
+ &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base)); # 0 : dipt hi
281
+ &xor ($magic,0x30);
282
+ &pshufb ("xmm0","xmm1"); # 0 = dipt hi looked up by high nibbles
283
+ &and ($magic,0x30);
284
+ &pxor ("xmm2","xmm5"); # 2 ^= round key
285
+ &movdqa ("xmm5",&QWP($k_mc_forward+48,$const)); # 5 = last row of k_mc_forward; rotated by palignr every round
286
+ &pxor ("xmm0","xmm2"); # 0 = transformed input ^ round key
287
+ &add ($key,16); # next key; the non-zero result also leaves ZF clear for the first jnz
288
+ &lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic)); # $magic = address of the selected k_sr row
289
+ &jmp (&label("dec_entry"));
290
+
291
+ &set_label("dec_loop",16);
292
+ ##
293
+ ## Inverse mix columns
294
+ ##
295
+ &movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u
296
+ &movdqa ("xmm1",&QWP(-0x10,$base)); # 1 : sb9t
297
+ &pshufb ("xmm4","xmm2"); # 4 = sb9u
298
+ &pshufb ("xmm1","xmm3"); # 1 = sb9t
299
+ &pxor ("xmm0","xmm4"); # 0 = round key ^ sb9u
300
+ &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu
301
+ &pxor ("xmm0","xmm1"); # 0 = ch
302
+ &movdqa ("xmm1",&QWP(0x10,$base)); # 1 : sbdt
303
+
304
+ &pshufb ("xmm4","xmm2"); # 4 = sbdu
305
+ &pshufb ("xmm0","xmm5"); # MC ch
306
+ &pshufb ("xmm1","xmm3"); # 1 = sbdt
307
+ &pxor ("xmm0","xmm4"); # 0 ^= sbdu
308
+ &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu
309
+ &pxor ("xmm0","xmm1"); # 0 = ch
310
+ &movdqa ("xmm1",&QWP(0x30,$base)); # 1 : sbbt
311
+
312
+ &pshufb ("xmm4","xmm2"); # 4 = sbbu
313
+ &pshufb ("xmm0","xmm5"); # MC ch
314
+ &pshufb ("xmm1","xmm3"); # 1 = sbbt
315
+ &pxor ("xmm0","xmm4"); # 0 ^= sbbu
316
+ &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu
317
+ &pxor ("xmm0","xmm1"); # 0 = ch
318
+ &movdqa ("xmm1",&QWP(0x50,$base)); # 1 : sbet
319
+
320
+ &pshufb ("xmm4","xmm2"); # 4 = sbeu
321
+ &pshufb ("xmm0","xmm5"); # MC ch
322
+ &pshufb ("xmm1","xmm3"); # 1 = sbet
323
+ &pxor ("xmm0","xmm4"); # 0 ^= sbeu
324
+ &add ($key,16); # next round key
325
+ &palignr("xmm5","xmm5",12); # rotate the MC shuffle mask for the next round
326
+ &pxor ("xmm0","xmm1"); # 0 = ch
327
+ &sub ($round,1); # nr--; sets ZF for the jnz at the end of dec_entry
328
+
329
+ &set_label("dec_entry");
330
+ # top of round
331
+ &movdqa ("xmm1","xmm6"); # 1 : i
332
+ &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
333
+ &pandn ("xmm1","xmm0"); # 1 = i<<4
334
+ &pand ("xmm0","xmm6"); # 0 = k
335
+ &psrld ("xmm1",4); # 1 = i
336
+ &pshufb ("xmm2","xmm0"); # 2 = a/k
337
+ &movdqa ("xmm3","xmm7"); # 3 : 1/i
338
+ &pxor ("xmm0","xmm1"); # 0 = j
339
+ &pshufb ("xmm3","xmm1"); # 3 = 1/i
340
+ &movdqa ("xmm4","xmm7"); # 4 : 1/j
341
+ &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
342
+ &pshufb ("xmm4","xmm0"); # 4 = 1/j
343
+ &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
344
+ &movdqa ("xmm2","xmm7"); # 2 : 1/iak
345
+ &pshufb ("xmm2","xmm3"); # 2 = 1/iak
346
+ &movdqa ("xmm3","xmm7"); # 3 : 1/jak
347
+ &pxor ("xmm2","xmm0"); # 2 = io
348
+ &pshufb ("xmm3","xmm4"); # 3 = 1/jak
349
+ &movdqu ("xmm0",&QWP(0,$key)); # 0 = next round key (unaligned load)
350
+ &pxor ("xmm3","xmm1"); # 3 = jo
351
+ &jnz (&label("dec_loop")); # loop while $round != 0 (ZF from the last integer add/sub)
352
+
353
+ # middle of last round
354
+ &movdqa ("xmm4",&QWP(0x60,$base)); # 4 : sbou
355
+ &pshufb ("xmm4","xmm2"); # 4 = sbou
356
+ &pxor ("xmm4","xmm0"); # 4 = sbou + k
357
+ &movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot
358
+ &movdqa ("xmm2",&QWP(0,$magic)); # 2 = k_sr row selected before the loop
359
+ &pshufb ("xmm0","xmm3"); # 0 = sbot
360
+ &pxor ("xmm0","xmm4"); # 0 = A
361
+ &pshufb ("xmm0","xmm2"); # 0 = A permuted by the k_sr row
362
+ &ret ();
363
+ &function_end_B("_vpaes_decrypt_core");
364
+
365
+ ########################################################
366
+ ## ##
367
+ ## AES key schedule ##
368
+ ## ##
369
+ ########################################################
370
+ &function_begin_B("_vpaes_schedule_core");
371
+ &add ($const,&DWP(0,"esp"));
372
+ &movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned)
373
+ &movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon
374
+
375
+ # input transform
376
+ &movdqa ("xmm3","xmm0");
377
+ &lea ($base,&DWP($k_ipt,$const));
378
+ &movdqa (&QWP(4,"esp"),"xmm2"); # xmm8
379
+ &call ("_vpaes_schedule_transform");
380
+ &movdqa ("xmm7","xmm0");
381
+
382
+ &test ($out,$out);
383
+ &jnz (&label("schedule_am_decrypting"));
384
+
385
+ # encrypting, output zeroth round key after transform
386
+ &movdqu (&QWP(0,$key),"xmm0");
387
+ &jmp (&label("schedule_go"));
388
+
389
+ &set_label("schedule_am_decrypting");
390
+ # decrypting, output zeroth round key after shiftrows
391
+ &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
392
+ &pshufb ("xmm3","xmm1");
393
+ &movdqu (&QWP(0,$key),"xmm3");
394
+ &xor ($magic,0x30);
395
+
396
+ &set_label("schedule_go");
397
+ &cmp ($round,192);
398
+ &ja (&label("schedule_256"));
399
+ &je (&label("schedule_192"));
400
+ # 128: fall through
401
+
402
+ ##
403
+ ## .schedule_128
404
+ ##
405
+ ## 128-bit specific part of key schedule.
406
+ ##
407
+ ## This schedule is really simple, because all its parts
408
+ ## are accomplished by the subroutines.
409
+ ##
410
+ &set_label("schedule_128");
411
+ &mov ($round,10);
412
+
413
+ &set_label("loop_schedule_128");
414
+ &call ("_vpaes_schedule_round");
415
+ &dec ($round);
416
+ &jz (&label("schedule_mangle_last"));
417
+ &call ("_vpaes_schedule_mangle"); # write output
418
+ &jmp (&label("loop_schedule_128"));
419
+
420
+ ##
421
+ ## .aes_schedule_192
422
+ ##
423
+ ## 192-bit specific part of key schedule.
424
+ ##
425
+ ## The main body of this schedule is the same as the 128-bit
426
+ ## schedule, but with more smearing. The long, high side is
427
+ ## stored in %xmm7 as before, and the short, low side is in
428
+ ## the high bits of %xmm6.
429
+ ##
430
+ ## This schedule is somewhat nastier, however, because each
431
+ ## round produces 192 bits of key material, or 1.5 round keys.
432
+ ## Therefore, on each cycle we do 2 rounds and produce 3 round
433
+ ## keys.
434
+ ##
435
+ &set_label("schedule_192",16);
436
+ &movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned)
437
+ &call ("_vpaes_schedule_transform"); # input transform
438
+ &movdqa ("xmm6","xmm0"); # save short part
439
+ &pxor ("xmm4","xmm4"); # clear 4
440
+ &movhlps("xmm6","xmm4"); # clobber low side with zeros
441
+ &mov ($round,4);
442
+
443
+ &set_label("loop_schedule_192");
444
+ &call ("_vpaes_schedule_round");
445
+ &palignr("xmm0","xmm6",8);
446
+ &call ("_vpaes_schedule_mangle"); # save key n
447
+ &call ("_vpaes_schedule_192_smear");
448
+ &call ("_vpaes_schedule_mangle"); # save key n+1
449
+ &call ("_vpaes_schedule_round");
450
+ &dec ($round);
451
+ &jz (&label("schedule_mangle_last"));
452
+ &call ("_vpaes_schedule_mangle"); # save key n+2
453
+ &call ("_vpaes_schedule_192_smear");
454
+ &jmp (&label("loop_schedule_192"));
455
+
456
+ ##
457
+ ## .aes_schedule_256
458
+ ##
459
+ ## 256-bit specific part of key schedule.
460
+ ##
461
+ ## The structure here is very similar to the 128-bit
462
+ ## schedule, but with an additional "low side" in
463
+ ## %xmm6. The low side's rounds are the same as the
464
+ ## high side's, except no rcon and no rotation.
465
+ ##
466
+ &set_label("schedule_256",16);
467
+ &movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
468
+ &call ("_vpaes_schedule_transform"); # input transform
469
+ &mov ($round,7);
470
+
471
+ &set_label("loop_schedule_256");
472
+ &call ("_vpaes_schedule_mangle"); # output low result
473
+ &movdqa ("xmm6","xmm0"); # save cur_lo in xmm6
474
+
475
+ # high round
476
+ &call ("_vpaes_schedule_round");
477
+ &dec ($round);
478
+ &jz (&label("schedule_mangle_last"));
479
+ &call ("_vpaes_schedule_mangle");
480
+
481
+ # low round. swap xmm7 and xmm6
482
+ &pshufd ("xmm0","xmm0",0xFF);
483
+ &movdqa (&QWP(20,"esp"),"xmm7");
484
+ &movdqa ("xmm7","xmm6");
485
+ &call ("_vpaes_schedule_low_round");
486
+ &movdqa ("xmm7",&QWP(20,"esp"));
487
+
488
+ &jmp (&label("loop_schedule_256"));
489
+
490
+ ##
491
+ ## .aes_schedule_mangle_last
492
+ ##
493
+ ## Mangler for last round of key schedule
494
+ ## Mangles %xmm0
495
+ ## when encrypting, outputs out(%xmm0) ^ 63
496
+ ## when decrypting, outputs unskew(%xmm0)
497
+ ##
498
+ ## Always called right before return... jumps to cleanup and exits
499
+ ##
500
+ &set_label("schedule_mangle_last",16);
501
+ # schedule last round key from xmm0
502
+ &lea ($base,&DWP($k_deskew,$const));
503
+ &test ($out,$out);
504
+ &jnz (&label("schedule_mangle_last_dec"));
505
+
506
+ # encrypting
507
+ &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
508
+ &pshufb ("xmm0","xmm1"); # output permute
509
+ &lea ($base,&DWP($k_opt,$const)); # prepare to output transform
510
+ &add ($key,32);
511
+
512
+ &set_label("schedule_mangle_last_dec");
513
+ &add ($key,-16);
514
+ &pxor ("xmm0",&QWP($k_s63,$const));
515
+ &call ("_vpaes_schedule_transform"); # output transform
516
+ &movdqu (&QWP(0,$key),"xmm0"); # save last key
517
+
518
+ # cleanup
519
+ &pxor ("xmm0","xmm0");
520
+ &pxor ("xmm1","xmm1");
521
+ &pxor ("xmm2","xmm2");
522
+ &pxor ("xmm3","xmm3");
523
+ &pxor ("xmm4","xmm4");
524
+ &pxor ("xmm5","xmm5");
525
+ &pxor ("xmm6","xmm6");
526
+ &pxor ("xmm7","xmm7");
527
+ &ret ();
528
+ &function_end_B("_vpaes_schedule_core");
529
+
530
+ ##
531
+ ## .aes_schedule_192_smear
532
+ ##
533
+ ## Smear the short, low side in the 192-bit key schedule.
534
+ ##
535
+ ## Inputs:
536
+ ## %xmm7: high side, b a x y
537
+ ## %xmm6: low side, d c 0 0
538
+ ## (no %xmm13 is read by this routine; the zero it needs is produced internally in %xmm1)
539
+ ##
540
+ ## Outputs:
541
+ ## %xmm6: b+c+d b+c 0 0
542
+ ## %xmm0: b+c+d b+c b a
543
+ ## Clobbers %xmm1 (it is left zeroed on return).
544
+ &function_begin_B("_vpaes_schedule_192_smear");
545
+ &pshufd ("xmm1","xmm6",0x80); # d c 0 0 -> c 0 0 0
546
+ &pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a
547
+ &pxor ("xmm6","xmm1"); # -> c+d c 0 0
548
+ &pxor ("xmm1","xmm1"); # xmm1 = 0, source of the zeros written into xmm6 below
549
+ &pxor ("xmm6","xmm0"); # -> b+c+d b+c b a
550
+ &movdqa ("xmm0","xmm6"); # xmm0 = full smeared value (the %xmm0 output)
551
+ &movhlps("xmm6","xmm1"); # clobber low side with zeros
552
+ &ret ();
553
+ &function_end_B("_vpaes_schedule_192_smear");
554
+
555
+ ##
556
+ ## .aes_schedule_round
557
+ ##
558
+ ## Runs one main round of the key schedule on %xmm0, %xmm7
559
+ ##
560
+ ## Specifically, runs subbytes on the high dword of %xmm0
561
+ ## then rotates it by one byte and xors into the low dword of
562
+ ## %xmm7.
563
+ ##
564
+ ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
565
+ ## next rcon.
566
+ ## (In this code the value called %xmm8 is kept in memory at 8(%esp) and worked on in %xmm2.)
567
+ ## Smears the dwords of %xmm7 by xoring the low into the
568
+ ## second low, result into third, result into highest.
569
+ ##
570
+ ## Returns results in %xmm7 = %xmm0.
571
+ ## Clobbers %xmm1-%xmm5.
572
+ ## The label _vpaes_schedule_low_round below is a second entry point that skips the rcon and rotate steps.
573
+ &function_begin_B("_vpaes_schedule_round");
574
+ # extract rcon from xmm8
575
+ &movdqa ("xmm2",&QWP(8,"esp")); # xmm8
576
+ &pxor ("xmm1","xmm1"); # xmm1 = 0 so that palignr shifts zeros in
577
+ &palignr("xmm1","xmm2",15);
578
+ &palignr("xmm2","xmm2",15); # rotate the rcon state for the next round
579
+ &pxor ("xmm7","xmm1"); # add the extracted rcon into xmm7
580
+
581
+ # rotate
582
+ &pshufd ("xmm0","xmm0",0xFF); # broadcast the high dword of xmm0 to all four dwords
583
+ &palignr("xmm0","xmm0",1); # rotate xmm0 by one byte
584
+
585
+ # store the rotated rcon state back, then fall through...
586
+ &movdqa (&QWP(8,"esp"),"xmm2"); # xmm8
587
+
588
+ # low round: same as high round, but no rotation and no rcon.
589
+ &set_label("_vpaes_schedule_low_round");
590
+ # smear xmm7
591
+ &movdqa ("xmm1","xmm7");
592
+ &pslldq ("xmm7",4);
593
+ &pxor ("xmm7","xmm1"); # xmm7 ^= xmm7 shifted left by 4 bytes
594
+ &movdqa ("xmm1","xmm7");
595
+ &pslldq ("xmm7",8);
596
+ &pxor ("xmm7","xmm1"); # xmm7 ^= xmm7 shifted left by 8 bytes
597
+ &pxor ("xmm7",&QWP($k_s63,$const)); # xor in the $k_s63 constant
598
+
599
+ # subbyte
600
+ &movdqa ("xmm4",&QWP($k_s0F,$const)); # 4 : $k_s0F mask
601
+ &movdqa ("xmm5",&QWP($k_inv,$const)); # 5 : $k_inv table, copied below before each lookup
602
+ &movdqa ("xmm1","xmm4");
603
+ &pandn ("xmm1","xmm0"); # 1 = xmm0 & ~mask
604
+ &psrld ("xmm1",4); # 1 = i
605
+ &pand ("xmm0","xmm4"); # 0 = k
606
+ &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
607
+ &pshufb ("xmm2","xmm0"); # 2 = a/k
608
+ &pxor ("xmm0","xmm1"); # 0 = j
609
+ &movdqa ("xmm3","xmm5"); # 3 : 1/i
610
+ &pshufb ("xmm3","xmm1"); # 3 = 1/i
611
+ &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
612
+ &movdqa ("xmm4","xmm5"); # 4 : 1/j
613
+ &pshufb ("xmm4","xmm0"); # 4 = 1/j
614
+ &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
615
+ &movdqa ("xmm2","xmm5"); # 2 : 1/iak
616
+ &pshufb ("xmm2","xmm3"); # 2 = 1/iak
617
+ &pxor ("xmm2","xmm0"); # 2 = io
618
+ &movdqa ("xmm3","xmm5"); # 3 : 1/jak
619
+ &pshufb ("xmm3","xmm4"); # 3 = 1/jak
620
+ &pxor ("xmm3","xmm1"); # 3 = jo
621
+ &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou
622
+ &pshufb ("xmm4","xmm2"); # 4 = sbou
623
+ &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot
624
+ &pshufb ("xmm0","xmm3"); # 0 = sbot
625
+ &pxor ("xmm0","xmm4"); # 0 = sbox output
626
+
627
+ # add in smeared stuff
628
+ &pxor ("xmm0","xmm7");
629
+ &movdqa ("xmm7","xmm0"); # result is returned in both xmm0 and xmm7
630
+ &ret ();
631
+ &function_end_B("_vpaes_schedule_round");
632
+
633
+ ##
633
+ ## .aes_schedule_transform
634
+ ##
635
+ ## Linear-transform %xmm0 according to tables at (%ebx)
636
+ ## Two 16-byte tables are used, at offsets 0 and 16 from $base; their pshufb lookups are xored together.
637
+ ## Output in %xmm0
638
+ ## Clobbers %xmm1, %xmm2
639
+ ##
640
+ &function_begin_B("_vpaes_schedule_transform");
641
+ &movdqa ("xmm2",&QWP($k_s0F,$const)); # xmm2 = $k_s0F mask
642
+ &movdqa ("xmm1","xmm2");
643
+ &pandn ("xmm1","xmm0"); # xmm1 = xmm0 & ~mask
644
+ &psrld ("xmm1",4); # ... shifted right by 4 bits
645
+ &pand ("xmm0","xmm2"); # xmm0 = xmm0 & mask
646
+ &movdqa ("xmm2",&QWP(0,$base)); # first table
647
+ &pshufb ("xmm2","xmm0"); # look up the masked part
648
+ &movdqa ("xmm0",&QWP(16,$base)); # second table
649
+ &pshufb ("xmm0","xmm1"); # look up the shifted part
650
+ &pxor ("xmm0","xmm2"); # combine both lookups into the result
651
+ &ret ();
652
+ &function_end_B("_vpaes_schedule_transform");
654
+
655
+ ##
655
+ ## .aes_schedule_mangle
656
+ ##
657
+ ## Mangle xmm0 from (basis-transformed) standard version
658
+ ## to our version.
659
+ ##
660
+ ## On encrypt,
661
+ ## xor with 0x63
662
+ ## multiply by circulant 0,1,1,1
663
+ ## apply shiftrows transform
664
+ ##
665
+ ## On decrypt,
666
+ ## xor with 0x63
667
+ ## multiply by "inverse mixcolumns" circulant E,B,D,9
668
+ ## deskew
669
+ ## apply shiftrows transform
670
+ ##
671
+ ## The path is selected by $out: zero takes the encrypt path, non-zero the decrypt path.
672
+ ## Writes out to (%edx), and increments or decrements it
673
+ ## Keeps track of round number mod 4 in %ecx
674
+ ## Preserves xmm0
675
+ ## Clobbers xmm1-xmm5
676
+ ## On the decrypt path $inp is also overwritten (it is loaded with the address of the $k_dksd tables).
677
+ &function_begin_B("_vpaes_schedule_mangle");
678
+ &movdqa ("xmm4","xmm0"); # save xmm0 for later
679
+ &movdqa ("xmm5",&QWP($k_mc_forward,$const)); # xmm5 = $k_mc_forward shuffle mask, used by both paths
680
+ &test ($out,$out); # $out == 0 means encrypting
681
+ &jnz (&label("schedule_mangle_dec"));
682
+
683
+ # encrypting
684
+ &add ($key,16); # advance the output pointer before the store below
685
+ &pxor ("xmm4",&QWP($k_s63,$const)); # xor with the $k_s63 constant
686
+ &pshufb ("xmm4","xmm5"); # shuffled once
687
+ &movdqa ("xmm3","xmm4"); # xmm3 = 1x shuffled
688
+ &pshufb ("xmm4","xmm5"); # shuffled twice
689
+ &pxor ("xmm3","xmm4"); # xmm3 ^= 2x shuffled
690
+ &pshufb ("xmm4","xmm5"); # shuffled three times
691
+ &pxor ("xmm3","xmm4"); # xmm3 ^= 3x shuffled
692
+
693
+ &jmp (&label("schedule_mangle_both"));
694
+
695
+ &set_label("schedule_mangle_dec",16);
696
+ # inverse mix columns
697
+ &movdqa ("xmm2",&QWP($k_s0F,$const));
698
+ &lea ($inp,&DWP($k_dksd,$const)); # $inp = address of the $k_dksd tables (previous $inp value is lost)
699
+ &movdqa ("xmm1","xmm2");
700
+ &pandn ("xmm1","xmm4");
701
+ &psrld ("xmm1",4); # 1 = hi
702
+ &pand ("xmm4","xmm2"); # 4 = lo
703
+ # tables at 0x00/0x10: look up by lo (xmm4) and hi (xmm1), xor, then shuffle with xmm5
704
+ &movdqa ("xmm2",&QWP(0,$inp));
705
+ &pshufb ("xmm2","xmm4");
706
+ &movdqa ("xmm3",&QWP(0x10,$inp));
707
+ &pshufb ("xmm3","xmm1");
708
+ &pxor ("xmm3","xmm2");
709
+ &pshufb ("xmm3","xmm5");
710
+ # tables at 0x20/0x30: same lookups, accumulated onto the previous result, then shuffled
711
+ &movdqa ("xmm2",&QWP(0x20,$inp));
712
+ &pshufb ("xmm2","xmm4");
713
+ &pxor ("xmm2","xmm3");
714
+ &movdqa ("xmm3",&QWP(0x30,$inp));
715
+ &pshufb ("xmm3","xmm1");
716
+ &pxor ("xmm3","xmm2");
717
+ &pshufb ("xmm3","xmm5");
718
+ # tables at 0x40/0x50: same lookups, accumulated onto the previous result, then shuffled
719
+ &movdqa ("xmm2",&QWP(0x40,$inp));
720
+ &pshufb ("xmm2","xmm4");
721
+ &pxor ("xmm2","xmm3");
722
+ &movdqa ("xmm3",&QWP(0x50,$inp));
723
+ &pshufb ("xmm3","xmm1");
724
+ &pxor ("xmm3","xmm2");
725
+ &pshufb ("xmm3","xmm5");
726
+ # tables at 0x60/0x70: same lookups, accumulated; no shuffle follows this last pair
727
+ &movdqa ("xmm2",&QWP(0x60,$inp));
728
+ &pshufb ("xmm2","xmm4");
729
+ &pxor ("xmm2","xmm3");
730
+ &movdqa ("xmm3",&QWP(0x70,$inp));
731
+ &pshufb ("xmm3","xmm1");
732
+ &pxor ("xmm3","xmm2");
733
+
734
+ &add ($key,-16); # step the output pointer back before the store below
735
+ # common tail: permute by the $k_sr entry selected by $magic, advance $magic, store the result
736
+ &set_label("schedule_mangle_both");
737
+ &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
738
+ &pshufb ("xmm3","xmm1");
739
+ &add ($magic,-16);
740
+ &and ($magic,0x30); # $magic steps down by 16 and wraps within 0x00..0x30
741
+ &movdqu (&QWP(0,$key),"xmm3"); # unaligned store of the mangled round key
742
+ &ret ();
743
+ &function_end_B("_vpaes_schedule_mangle");
745
+
746
+ #
747
+ # Interface to OpenSSL
748
+ # ${PREFIX}_set_encrypt_key(inp, bits, key): builds the encryption key schedule in key; eax is zeroed before returning.
749
+ &function_begin("${PREFIX}_set_encrypt_key");
750
+ &mov ($inp,&wparam(0)); # inp
751
+ &lea ($base,&DWP(-56,"esp")); # new stack top: at least 56 bytes below the current esp
752
+ &mov ($round,&wparam(1)); # bits
753
+ &and ($base,-16); # ... rounded down to a 16-byte boundary
754
+ &mov ($key,&wparam(2)); # key
755
+ &xchg ($base,"esp"); # alloca
756
+ &mov (&DWP(48,"esp"),$base); # save the previous esp so it can be restored at the end
757
+ # derive the round count from the key size and record it in the key structure
758
+ &mov ($base,$round);
759
+ &shr ($base,5);
760
+ &add ($base,5);
761
+ &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
762
+ &mov ($magic,0x30); # initial offset used to index $k_sr in the schedule code
763
+ &mov ($out,0); # $out == 0 selects the encrypt path in the schedule code
764
+ # $const is a pic_point-relative offset; presumably the callee adds its return address (== pic_point) -- confirm in _vpaes_schedule_core
765
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
766
+ &call ("_vpaes_schedule_core");
767
+ &set_label("pic_point");
768
+ # undo the alloca and return 0
769
+ &mov ("esp",&DWP(48,"esp"));
770
+ &xor ("eax","eax");
771
+ &function_end("${PREFIX}_set_encrypt_key");
772
+ # ${PREFIX}_set_decrypt_key(inp, bits, key): builds the decryption key schedule in key; eax is zeroed before returning.
773
+ &function_begin("${PREFIX}_set_decrypt_key");
774
+ &mov ($inp,&wparam(0)); # inp
775
+ &lea ($base,&DWP(-56,"esp")); # new stack top: at least 56 bytes below the current esp
776
+ &mov ($round,&wparam(1)); # bits
777
+ &and ($base,-16); # ... rounded down to a 16-byte boundary
778
+ &mov ($key,&wparam(2)); # key
779
+ &xchg ($base,"esp"); # alloca
780
+ &mov (&DWP(48,"esp"),$base); # save the previous esp so it can be restored at the end
781
+ # derive the round count, record it, then move $key towards the end of the schedule
782
+ &mov ($base,$round);
783
+ &shr ($base,5);
784
+ &add ($base,5);
785
+ &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
786
+ &shl ($base,4); # $base = 16*rounds
787
+ &lea ($key,&DWP(16,$key,$base)); # $key = key + 16 + 16*rounds (assuming DWP's default index scale of 1); the decrypt path stores downwards from here
788
+ # select the decrypt path and pick the initial $k_sr offset
789
+ &mov ($out,1); # $out != 0 selects the decrypt path in the schedule code
790
+ &mov ($magic,$round);
791
+ &shr ($magic,1);
792
+ &and ($magic,32);
793
+ &xor ($magic,32); # nbits==192?0:32;
794
+ # $const is a pic_point-relative offset; presumably the callee adds its return address (== pic_point) -- confirm in _vpaes_schedule_core
795
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
796
+ &call ("_vpaes_schedule_core");
797
+ &set_label("pic_point");
798
+ # undo the alloca and return 0
799
+ &mov ("esp",&DWP(48,"esp"));
800
+ &xor ("eax","eax");
801
+ &function_end("${PREFIX}_set_decrypt_key");
802
+ # ${PREFIX}_encrypt(inp, out, key): loads 16 bytes from inp, runs _vpaes_encrypt_core on them, stores 16 bytes to out.
803
+ &function_begin("${PREFIX}_encrypt");
804
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); # pic_point-relative offset of the constants
805
+ &call ("_vpaes_preheat"); # defined outside this section; presumably resolves $const and preloads constants -- confirm
806
+ &set_label("pic_point");
807
+ &mov ($inp,&wparam(0)); # inp
808
+ &lea ($base,&DWP(-56,"esp")); # new stack top: at least 56 bytes below the current esp
809
+ &mov ($out,&wparam(1)); # out
810
+ &and ($base,-16); # ... rounded down to a 16-byte boundary
811
+ &mov ($key,&wparam(2)); # key
812
+ &xchg ($base,"esp"); # alloca
813
+ &mov (&DWP(48,"esp"),$base); # save the previous esp so it can be restored at the end
814
+ # process one 16-byte block through xmm0; unaligned load/store, so inp and out need no alignment
815
+ &movdqu ("xmm0",&QWP(0,$inp));
816
+ &call ("_vpaes_encrypt_core");
817
+ &movdqu (&QWP(0,$out),"xmm0");
818
+ # undo the alloca
819
+ &mov ("esp",&DWP(48,"esp"));
820
+ &function_end("${PREFIX}_encrypt");
821
+ # ${PREFIX}_decrypt(inp, out, key): loads 16 bytes from inp, runs _vpaes_decrypt_core on them, stores 16 bytes to out.
822
+ &function_begin("${PREFIX}_decrypt");
823
+ &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point"))); # pic_point-relative offset of the constants
824
+ &call ("_vpaes_preheat"); # defined outside this section; presumably resolves $const and preloads constants -- confirm
825
+ &set_label("pic_point");
826
+ &mov ($inp,&wparam(0)); # inp
827
+ &lea ($base,&DWP(-56,"esp")); # new stack top: at least 56 bytes below the current esp
828
+ &mov ($out,&wparam(1)); # out
829
+ &and ($base,-16); # ... rounded down to a 16-byte boundary
830
+ &mov ($key,&wparam(2)); # key
831
+ &xchg ($base,"esp"); # alloca
832
+ &mov (&DWP(48,"esp"),$base); # save the previous esp so it can be restored at the end
833
+ # process one 16-byte block through xmm0; unaligned load/store, so inp and out need no alignment
834
+ &movdqu ("xmm0",&QWP(0,$inp));
835
+ &call ("_vpaes_decrypt_core");
836
+ &movdqu (&QWP(0,$out),"xmm0");
837
+ # undo the alloca
838
+ &mov ("esp",&DWP(48,"esp"));
839
+ &function_end("${PREFIX}_decrypt");
840
+ # All routines are defined; asm_finish is provided by the perlasm driver (presumably it emits the accumulated assembly -- confirm there).
841
+ &asm_finish();