ring-native 0.0.0
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/Gemfile +3 -0
- data/README.md +22 -0
- data/Rakefile +1 -0
- data/ext/ring/extconf.rb +29 -0
- data/lib/ring/native.rb +8 -0
- data/lib/ring/native/version.rb +5 -0
- data/ring-native.gemspec +25 -0
- data/vendor/ring/BUILDING.md +40 -0
- data/vendor/ring/Cargo.toml +43 -0
- data/vendor/ring/LICENSE +185 -0
- data/vendor/ring/Makefile +35 -0
- data/vendor/ring/PORTING.md +163 -0
- data/vendor/ring/README.md +113 -0
- data/vendor/ring/STYLE.md +197 -0
- data/vendor/ring/appveyor.yml +27 -0
- data/vendor/ring/build.rs +108 -0
- data/vendor/ring/crypto/aes/aes.c +1142 -0
- data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +25 -0
- data/vendor/ring/crypto/aes/aes_test.cc +93 -0
- data/vendor/ring/crypto/aes/asm/aes-586.pl +2368 -0
- data/vendor/ring/crypto/aes/asm/aes-armv4.pl +1249 -0
- data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +2246 -0
- data/vendor/ring/crypto/aes/asm/aesni-x86.pl +1318 -0
- data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +2084 -0
- data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +675 -0
- data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +1364 -0
- data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +1565 -0
- data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +841 -0
- data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +1116 -0
- data/vendor/ring/crypto/aes/internal.h +87 -0
- data/vendor/ring/crypto/aes/mode_wrappers.c +61 -0
- data/vendor/ring/crypto/bn/add.c +394 -0
- data/vendor/ring/crypto/bn/asm/armv4-mont.pl +694 -0
- data/vendor/ring/crypto/bn/asm/armv8-mont.pl +1503 -0
- data/vendor/ring/crypto/bn/asm/bn-586.pl +774 -0
- data/vendor/ring/crypto/bn/asm/co-586.pl +287 -0
- data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +1882 -0
- data/vendor/ring/crypto/bn/asm/x86-mont.pl +592 -0
- data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +599 -0
- data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +1393 -0
- data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +3507 -0
- data/vendor/ring/crypto/bn/bn.c +352 -0
- data/vendor/ring/crypto/bn/bn_asn1.c +74 -0
- data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +25 -0
- data/vendor/ring/crypto/bn/bn_test.cc +1696 -0
- data/vendor/ring/crypto/bn/cmp.c +200 -0
- data/vendor/ring/crypto/bn/convert.c +433 -0
- data/vendor/ring/crypto/bn/ctx.c +311 -0
- data/vendor/ring/crypto/bn/div.c +594 -0
- data/vendor/ring/crypto/bn/exponentiation.c +1335 -0
- data/vendor/ring/crypto/bn/gcd.c +711 -0
- data/vendor/ring/crypto/bn/generic.c +1019 -0
- data/vendor/ring/crypto/bn/internal.h +316 -0
- data/vendor/ring/crypto/bn/montgomery.c +516 -0
- data/vendor/ring/crypto/bn/mul.c +888 -0
- data/vendor/ring/crypto/bn/prime.c +829 -0
- data/vendor/ring/crypto/bn/random.c +334 -0
- data/vendor/ring/crypto/bn/rsaz_exp.c +262 -0
- data/vendor/ring/crypto/bn/rsaz_exp.h +53 -0
- data/vendor/ring/crypto/bn/shift.c +276 -0
- data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +25 -0
- data/vendor/ring/crypto/bytestring/bytestring_test.cc +421 -0
- data/vendor/ring/crypto/bytestring/cbb.c +399 -0
- data/vendor/ring/crypto/bytestring/cbs.c +227 -0
- data/vendor/ring/crypto/bytestring/internal.h +46 -0
- data/vendor/ring/crypto/chacha/chacha_generic.c +140 -0
- data/vendor/ring/crypto/chacha/chacha_vec.c +323 -0
- data/vendor/ring/crypto/chacha/chacha_vec_arm.S +1447 -0
- data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +153 -0
- data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +25 -0
- data/vendor/ring/crypto/cipher/e_aes.c +390 -0
- data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +208 -0
- data/vendor/ring/crypto/cipher/internal.h +173 -0
- data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +543 -0
- data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +9 -0
- data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +475 -0
- data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +23 -0
- data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +422 -0
- data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +484 -0
- data/vendor/ring/crypto/cipher/test/cipher_test.txt +100 -0
- data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +25 -0
- data/vendor/ring/crypto/constant_time_test.c +304 -0
- data/vendor/ring/crypto/cpu-arm-asm.S +32 -0
- data/vendor/ring/crypto/cpu-arm.c +199 -0
- data/vendor/ring/crypto/cpu-intel.c +261 -0
- data/vendor/ring/crypto/crypto.c +151 -0
- data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +2118 -0
- data/vendor/ring/crypto/curve25519/curve25519.c +4888 -0
- data/vendor/ring/crypto/curve25519/x25519_test.cc +128 -0
- data/vendor/ring/crypto/digest/md32_common.h +181 -0
- data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +2725 -0
- data/vendor/ring/crypto/ec/ec.c +193 -0
- data/vendor/ring/crypto/ec/ec_curves.c +61 -0
- data/vendor/ring/crypto/ec/ec_key.c +228 -0
- data/vendor/ring/crypto/ec/ec_montgomery.c +114 -0
- data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +25 -0
- data/vendor/ring/crypto/ec/internal.h +243 -0
- data/vendor/ring/crypto/ec/oct.c +253 -0
- data/vendor/ring/crypto/ec/p256-64.c +1794 -0
- data/vendor/ring/crypto/ec/p256-x86_64-table.h +9548 -0
- data/vendor/ring/crypto/ec/p256-x86_64.c +509 -0
- data/vendor/ring/crypto/ec/simple.c +1007 -0
- data/vendor/ring/crypto/ec/util-64.c +183 -0
- data/vendor/ring/crypto/ec/wnaf.c +508 -0
- data/vendor/ring/crypto/ecdh/ecdh.c +155 -0
- data/vendor/ring/crypto/ecdsa/ecdsa.c +304 -0
- data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +193 -0
- data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +25 -0
- data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +327 -0
- data/vendor/ring/crypto/header_removed.h +17 -0
- data/vendor/ring/crypto/internal.h +495 -0
- data/vendor/ring/crypto/libring.Windows.vcxproj +101 -0
- data/vendor/ring/crypto/mem.c +98 -0
- data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +1045 -0
- data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +517 -0
- data/vendor/ring/crypto/modes/asm/ghash-x86.pl +1393 -0
- data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +1741 -0
- data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +422 -0
- data/vendor/ring/crypto/modes/ctr.c +226 -0
- data/vendor/ring/crypto/modes/gcm.c +1206 -0
- data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +25 -0
- data/vendor/ring/crypto/modes/gcm_test.c +348 -0
- data/vendor/ring/crypto/modes/internal.h +299 -0
- data/vendor/ring/crypto/perlasm/arm-xlate.pl +170 -0
- data/vendor/ring/crypto/perlasm/readme +100 -0
- data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +1164 -0
- data/vendor/ring/crypto/perlasm/x86asm.pl +292 -0
- data/vendor/ring/crypto/perlasm/x86gas.pl +263 -0
- data/vendor/ring/crypto/perlasm/x86masm.pl +200 -0
- data/vendor/ring/crypto/perlasm/x86nasm.pl +187 -0
- data/vendor/ring/crypto/poly1305/poly1305.c +331 -0
- data/vendor/ring/crypto/poly1305/poly1305_arm.c +301 -0
- data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +2015 -0
- data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +25 -0
- data/vendor/ring/crypto/poly1305/poly1305_test.cc +80 -0
- data/vendor/ring/crypto/poly1305/poly1305_test.txt +52 -0
- data/vendor/ring/crypto/poly1305/poly1305_vec.c +892 -0
- data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +75 -0
- data/vendor/ring/crypto/rand/internal.h +32 -0
- data/vendor/ring/crypto/rand/rand.c +189 -0
- data/vendor/ring/crypto/rand/urandom.c +219 -0
- data/vendor/ring/crypto/rand/windows.c +56 -0
- data/vendor/ring/crypto/refcount_c11.c +66 -0
- data/vendor/ring/crypto/refcount_lock.c +53 -0
- data/vendor/ring/crypto/refcount_test.Windows.vcxproj +25 -0
- data/vendor/ring/crypto/refcount_test.c +58 -0
- data/vendor/ring/crypto/rsa/blinding.c +462 -0
- data/vendor/ring/crypto/rsa/internal.h +108 -0
- data/vendor/ring/crypto/rsa/padding.c +300 -0
- data/vendor/ring/crypto/rsa/rsa.c +450 -0
- data/vendor/ring/crypto/rsa/rsa_asn1.c +261 -0
- data/vendor/ring/crypto/rsa/rsa_impl.c +944 -0
- data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +25 -0
- data/vendor/ring/crypto/rsa/rsa_test.cc +437 -0
- data/vendor/ring/crypto/sha/asm/sha-armv8.pl +436 -0
- data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +2390 -0
- data/vendor/ring/crypto/sha/asm/sha256-586.pl +1275 -0
- data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +735 -0
- data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +14 -0
- data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +14 -0
- data/vendor/ring/crypto/sha/asm/sha512-586.pl +911 -0
- data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +666 -0
- data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +14 -0
- data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +14 -0
- data/vendor/ring/crypto/sha/sha1.c +271 -0
- data/vendor/ring/crypto/sha/sha256.c +204 -0
- data/vendor/ring/crypto/sha/sha512.c +355 -0
- data/vendor/ring/crypto/test/file_test.cc +326 -0
- data/vendor/ring/crypto/test/file_test.h +181 -0
- data/vendor/ring/crypto/test/malloc.cc +150 -0
- data/vendor/ring/crypto/test/scoped_types.h +95 -0
- data/vendor/ring/crypto/test/test.Windows.vcxproj +35 -0
- data/vendor/ring/crypto/test/test_util.cc +46 -0
- data/vendor/ring/crypto/test/test_util.h +41 -0
- data/vendor/ring/crypto/thread_none.c +55 -0
- data/vendor/ring/crypto/thread_pthread.c +165 -0
- data/vendor/ring/crypto/thread_test.Windows.vcxproj +25 -0
- data/vendor/ring/crypto/thread_test.c +200 -0
- data/vendor/ring/crypto/thread_win.c +282 -0
- data/vendor/ring/examples/checkdigest.rs +103 -0
- data/vendor/ring/include/openssl/aes.h +121 -0
- data/vendor/ring/include/openssl/arm_arch.h +129 -0
- data/vendor/ring/include/openssl/base.h +156 -0
- data/vendor/ring/include/openssl/bn.h +794 -0
- data/vendor/ring/include/openssl/buffer.h +18 -0
- data/vendor/ring/include/openssl/bytestring.h +235 -0
- data/vendor/ring/include/openssl/chacha.h +37 -0
- data/vendor/ring/include/openssl/cmac.h +76 -0
- data/vendor/ring/include/openssl/cpu.h +184 -0
- data/vendor/ring/include/openssl/crypto.h +43 -0
- data/vendor/ring/include/openssl/curve25519.h +88 -0
- data/vendor/ring/include/openssl/ec.h +225 -0
- data/vendor/ring/include/openssl/ec_key.h +129 -0
- data/vendor/ring/include/openssl/ecdh.h +110 -0
- data/vendor/ring/include/openssl/ecdsa.h +156 -0
- data/vendor/ring/include/openssl/err.h +201 -0
- data/vendor/ring/include/openssl/mem.h +101 -0
- data/vendor/ring/include/openssl/obj_mac.h +71 -0
- data/vendor/ring/include/openssl/opensslfeatures.h +68 -0
- data/vendor/ring/include/openssl/opensslv.h +18 -0
- data/vendor/ring/include/openssl/ossl_typ.h +18 -0
- data/vendor/ring/include/openssl/poly1305.h +51 -0
- data/vendor/ring/include/openssl/rand.h +70 -0
- data/vendor/ring/include/openssl/rsa.h +399 -0
- data/vendor/ring/include/openssl/thread.h +133 -0
- data/vendor/ring/include/openssl/type_check.h +71 -0
- data/vendor/ring/mk/Common.props +63 -0
- data/vendor/ring/mk/Windows.props +42 -0
- data/vendor/ring/mk/WindowsTest.props +18 -0
- data/vendor/ring/mk/appveyor.bat +62 -0
- data/vendor/ring/mk/bottom_of_makefile.mk +54 -0
- data/vendor/ring/mk/ring.mk +266 -0
- data/vendor/ring/mk/top_of_makefile.mk +214 -0
- data/vendor/ring/mk/travis.sh +40 -0
- data/vendor/ring/mk/update-travis-yml.py +229 -0
- data/vendor/ring/ring.sln +153 -0
- data/vendor/ring/src/aead.rs +682 -0
- data/vendor/ring/src/agreement.rs +248 -0
- data/vendor/ring/src/c.rs +129 -0
- data/vendor/ring/src/constant_time.rs +37 -0
- data/vendor/ring/src/der.rs +96 -0
- data/vendor/ring/src/digest.rs +690 -0
- data/vendor/ring/src/digest_tests.txt +57 -0
- data/vendor/ring/src/ecc.rs +28 -0
- data/vendor/ring/src/ecc_build.rs +279 -0
- data/vendor/ring/src/ecc_curves.rs +117 -0
- data/vendor/ring/src/ed25519_tests.txt +2579 -0
- data/vendor/ring/src/exe_tests.rs +46 -0
- data/vendor/ring/src/ffi.rs +29 -0
- data/vendor/ring/src/file_test.rs +187 -0
- data/vendor/ring/src/hkdf.rs +153 -0
- data/vendor/ring/src/hkdf_tests.txt +59 -0
- data/vendor/ring/src/hmac.rs +414 -0
- data/vendor/ring/src/hmac_tests.txt +97 -0
- data/vendor/ring/src/input.rs +312 -0
- data/vendor/ring/src/lib.rs +41 -0
- data/vendor/ring/src/pbkdf2.rs +265 -0
- data/vendor/ring/src/pbkdf2_tests.txt +113 -0
- data/vendor/ring/src/polyfill.rs +57 -0
- data/vendor/ring/src/rand.rs +28 -0
- data/vendor/ring/src/signature.rs +314 -0
- data/vendor/ring/third-party/NIST/README.md +9 -0
- data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +263 -0
- data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +309 -0
- data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +267 -0
- data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +263 -0
- data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +309 -0
- data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +267 -0
- data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +263 -0
- data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +309 -0
- data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +267 -0
- data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +519 -0
- data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +309 -0
- data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +523 -0
- data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +519 -0
- data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +309 -0
- data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +523 -0
- data/vendor/ring/third-party/NIST/sha256sums.txt +1 -0
- metadata +333 -0
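
The largest file added below is data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl (+1565 lines), OpenSSL's bit-sliced AES perlasm module, whose diff follows. Its transposition of eight blocks into bit-sliced form is built from a repeated swapmove step. As a minimal scalar sketch of that primitive (the helper name swapmove_scalar and the use of plain integers in place of the module's XMM registers are illustrative assumptions, not part of the package):

```perl
#!/usr/bin/env perl
# Scalar model of the swapmove() delta-swap used by bsaes-x86_64.pl:
# exchange the bits of $a selected by $mask with the bits of $b that
# sit $n positions to their left.
use strict;
use warnings;

sub swapmove_scalar {
    my ($a, $b, $n, $mask) = @_;
    my $t = (($b >> $n) ^ $a) & $mask;  # XOR-difference of the two bit groups
    $a ^= $t;                           # apply it to $a ...
    $b ^= ($t << $n);                   # ... and, shifted back, to $b
    return ($a, $b);
}

# First bitslice() step: n=1 with the 0x55... mask (.LBS0) swaps the
# even-indexed bits of one word with the odd-indexed bits of another.
my ($x, $y) = swapmove_scalar(0xFFFF, 0x0000, 1, 0x5555);
printf "%04x %04x\n", $x, $y;           # prints "aaaa aaaa"
```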
|
@@ -0,0 +1,1565 @@
|
|
|
1
|
+
#!/usr/bin/env perl
|
|
2
|
+
|
|
3
|
+
###################################################################
|
|
4
|
+
### AES-128 [originally in CTR mode] ###
|
|
5
|
+
### bitsliced implementation for Intel Core 2 processors ###
|
|
6
|
+
### requires support of SSE extensions up to SSSE3 ###
|
|
7
|
+
### Author: Emilia Käsper and Peter Schwabe ###
|
|
8
|
+
### Date: 2009-03-19 ###
|
|
9
|
+
### Public domain ###
|
|
10
|
+
### ###
|
|
11
|
+
### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
|
|
12
|
+
### further information. ###
|
|
13
|
+
###################################################################
|
|
14
|
+
#
|
|
15
|
+
# September 2011.
|
|
16
|
+
#
|
|
17
|
+
# Started as transliteration to "perlasm" the original code has
|
|
18
|
+
# undergone following changes:
|
|
19
|
+
#
|
|
20
|
+
# - code was made position-independent;
|
|
21
|
+
# - rounds were folded into a loop resulting in >5x size reduction
|
|
22
|
+
# from 12.5KB to 2.2KB;
|
|
23
|
+
# - above was possibile thanks to mixcolumns() modification that
|
|
24
|
+
# allowed to feed its output back to aesenc[last], this was
|
|
25
|
+
# achieved at cost of two additional inter-registers moves;
|
|
26
|
+
# - some instruction reordering and interleaving;
|
|
27
|
+
# - this module doesn't implement key setup subroutine, instead it
|
|
28
|
+
# relies on conversion of "conventional" key schedule as returned
|
|
29
|
+
# by AES_set_encrypt_key (see discussion below);
|
|
30
|
+
# - first and last round keys are treated differently, which allowed
|
|
31
|
+
# to skip one shiftrows(), reduce bit-sliced key schedule and
|
|
32
|
+
# speed-up conversion by 22%;
|
|
33
|
+
# - support for 192- and 256-bit keys was added;
|
|
34
|
+
#
|
|
35
|
+
# Resulting performance in CPU cycles spent to encrypt one byte out
|
|
36
|
+
# of 4096-byte buffer with 128-bit key is:
|
|
37
|
+
#
|
|
38
|
+
# Emilia's this(*) difference
|
|
39
|
+
#
|
|
40
|
+
# Core 2 9.30 8.69 +7%
|
|
41
|
+
# Nehalem(**) 7.63 6.88 +11%
|
|
42
|
+
# Atom 17.1 16.4 +4%
|
|
43
|
+
# Silvermont - 12.9
|
|
44
|
+
#
|
|
45
|
+
# (*) Comparison is not completely fair, because "this" is ECB,
|
|
46
|
+
# i.e. no extra processing such as counter values calculation
|
|
47
|
+
# and xor-ing input as in Emilia's CTR implementation is
|
|
48
|
+
# performed. However, the CTR calculations stand for not more
|
|
49
|
+
# than 1% of total time, so comparison is *rather* fair.
|
|
50
|
+
#
|
|
51
|
+
# (**) Results were collected on Westmere, which is considered to
|
|
52
|
+
# be equivalent to Nehalem for this code.
|
|
53
|
+
#
|
|
54
|
+
# As for key schedule conversion subroutine. Interface to OpenSSL
|
|
55
|
+
# relies on per-invocation on-the-fly conversion. This naturally
|
|
56
|
+
# has impact on performance, especially for short inputs. Conversion
|
|
57
|
+
# time in CPU cycles and its ratio to CPU cycles spent in 8x block
|
|
58
|
+
# function is:
|
|
59
|
+
#
|
|
60
|
+
# conversion conversion/8x block
|
|
61
|
+
# Core 2 240 0.22
|
|
62
|
+
# Nehalem 180 0.20
|
|
63
|
+
# Atom 430 0.20
|
|
64
|
+
#
|
|
65
|
+
# The ratio values mean that 128-byte blocks will be processed
|
|
66
|
+
# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
|
|
67
|
+
# etc. Then keep in mind that input sizes not divisible by 128 are
|
|
68
|
+
# *effectively* slower, especially shortest ones, e.g. consecutive
|
|
69
|
+
# 144-byte blocks are processed 44% slower than one would expect,
|
|
70
|
+
# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
|
|
71
|
+
# it's still faster than ["hyper-threading-safe" code path in]
|
|
72
|
+
# aes-x86_64.pl on all lengths above 64 bytes...
|
|
73
|
+
#
|
|
74
|
+
# October 2011.
|
|
75
|
+
#
|
|
76
|
+
# Add decryption procedure. Performance in CPU cycles spent to decrypt
|
|
77
|
+
# one byte out of 4096-byte buffer with 128-bit key is:
|
|
78
|
+
#
|
|
79
|
+
# Core 2 9.98
|
|
80
|
+
# Nehalem 7.80
|
|
81
|
+
# Atom 17.9
|
|
82
|
+
# Silvermont 14.0
|
|
83
|
+
#
|
|
84
|
+
# November 2011.
|
|
85
|
+
#
|
|
86
|
+
# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
|
|
87
|
+
# suboptimal, but XTS is meant to be used with larger blocks...
|
|
88
|
+
#
|
|
89
|
+
# <appro@openssl.org>
|
|
90
|
+
|
|
91
|
+
$flavour = shift;
|
|
92
|
+
$output = shift;
|
|
93
|
+
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
|
|
94
|
+
|
|
95
|
+
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
|
|
96
|
+
|
|
97
|
+
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
|
98
|
+
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
|
|
99
|
+
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
|
|
100
|
+
die "can't locate x86_64-xlate.pl";
|
|
101
|
+
|
|
102
|
+
open OUT,"| \"$^X\" $xlate $flavour $output";
|
|
103
|
+
*STDOUT=*OUT;
|
|
104
|
+
|
|
105
|
+
my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
|
|
106
|
+
my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
|
|
107
|
+
|
|
108
|
+
{
|
|
109
|
+
my ($key,$rounds,$const)=("%rax","%r10d","%r11");
|
|
110
|
+
|
|
111
|
+
sub Sbox {
|
|
112
|
+
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
|
|
113
|
+
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
|
|
114
|
+
my @b=@_[0..7];
|
|
115
|
+
my @t=@_[8..11];
|
|
116
|
+
my @s=@_[12..15];
|
|
117
|
+
&InBasisChange (@b);
|
|
118
|
+
&Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
|
|
119
|
+
&OutBasisChange (@b[7,1,4,2,6,5,0,3]);
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
sub InBasisChange {
|
|
123
|
+
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
|
|
124
|
+
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
|
|
125
|
+
my @b=@_[0..7];
|
|
126
|
+
$code.=<<___;
|
|
127
|
+
pxor @b[6], @b[5]
|
|
128
|
+
pxor @b[1], @b[2]
|
|
129
|
+
pxor @b[0], @b[3]
|
|
130
|
+
pxor @b[2], @b[6]
|
|
131
|
+
pxor @b[0], @b[5]
|
|
132
|
+
|
|
133
|
+
pxor @b[3], @b[6]
|
|
134
|
+
pxor @b[7], @b[3]
|
|
135
|
+
pxor @b[5], @b[7]
|
|
136
|
+
pxor @b[4], @b[3]
|
|
137
|
+
pxor @b[5], @b[4]
|
|
138
|
+
pxor @b[1], @b[3]
|
|
139
|
+
|
|
140
|
+
pxor @b[7], @b[2]
|
|
141
|
+
pxor @b[5], @b[1]
|
|
142
|
+
___
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
sub OutBasisChange {
|
|
146
|
+
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
|
|
147
|
+
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
|
|
148
|
+
my @b=@_[0..7];
|
|
149
|
+
$code.=<<___;
|
|
150
|
+
pxor @b[6], @b[0]
|
|
151
|
+
pxor @b[4], @b[1]
|
|
152
|
+
pxor @b[0], @b[2]
|
|
153
|
+
pxor @b[6], @b[4]
|
|
154
|
+
pxor @b[1], @b[6]
|
|
155
|
+
|
|
156
|
+
pxor @b[5], @b[1]
|
|
157
|
+
pxor @b[3], @b[5]
|
|
158
|
+
pxor @b[7], @b[3]
|
|
159
|
+
pxor @b[5], @b[7]
|
|
160
|
+
pxor @b[5], @b[2]
|
|
161
|
+
|
|
162
|
+
pxor @b[7], @b[4]
|
|
163
|
+
___
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
sub InvSbox {
|
|
167
|
+
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
|
|
168
|
+
# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
|
|
169
|
+
my @b=@_[0..7];
|
|
170
|
+
my @t=@_[8..11];
|
|
171
|
+
my @s=@_[12..15];
|
|
172
|
+
&InvInBasisChange (@b);
|
|
173
|
+
&Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
|
|
174
|
+
&InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
sub InvInBasisChange { # OutBasisChange in reverse
|
|
178
|
+
my @b=@_[5,1,2,6,3,7,0,4];
|
|
179
|
+
$code.=<<___
|
|
180
|
+
pxor @b[7], @b[4]
|
|
181
|
+
|
|
182
|
+
pxor @b[5], @b[7]
|
|
183
|
+
pxor @b[5], @b[2]
|
|
184
|
+
pxor @b[7], @b[3]
|
|
185
|
+
pxor @b[3], @b[5]
|
|
186
|
+
pxor @b[5], @b[1]
|
|
187
|
+
|
|
188
|
+
pxor @b[1], @b[6]
|
|
189
|
+
pxor @b[0], @b[2]
|
|
190
|
+
pxor @b[6], @b[4]
|
|
191
|
+
pxor @b[6], @b[0]
|
|
192
|
+
pxor @b[4], @b[1]
|
|
193
|
+
___
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
sub InvOutBasisChange { # InBasisChange in reverse
|
|
197
|
+
my @b=@_[2,5,7,3,6,1,0,4];
|
|
198
|
+
$code.=<<___;
|
|
199
|
+
pxor @b[5], @b[1]
|
|
200
|
+
pxor @b[7], @b[2]
|
|
201
|
+
|
|
202
|
+
pxor @b[1], @b[3]
|
|
203
|
+
pxor @b[5], @b[4]
|
|
204
|
+
pxor @b[5], @b[7]
|
|
205
|
+
pxor @b[4], @b[3]
|
|
206
|
+
pxor @b[0], @b[5]
|
|
207
|
+
pxor @b[7], @b[3]
|
|
208
|
+
pxor @b[2], @b[6]
|
|
209
|
+
pxor @b[1], @b[2]
|
|
210
|
+
pxor @b[3], @b[6]
|
|
211
|
+
|
|
212
|
+
pxor @b[0], @b[3]
|
|
213
|
+
pxor @b[6], @b[5]
|
|
214
|
+
___
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
sub Mul_GF4 {
|
|
218
|
+
#;*************************************************************
|
|
219
|
+
#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
|
|
220
|
+
#;*************************************************************
|
|
221
|
+
my ($x0,$x1,$y0,$y1,$t0)=@_;
|
|
222
|
+
$code.=<<___;
|
|
223
|
+
movdqa $y0, $t0
|
|
224
|
+
pxor $y1, $t0
|
|
225
|
+
pand $x0, $t0
|
|
226
|
+
pxor $x1, $x0
|
|
227
|
+
pand $y0, $x1
|
|
228
|
+
pand $y1, $x0
|
|
229
|
+
pxor $x1, $x0
|
|
230
|
+
pxor $t0, $x1
|
|
231
|
+
___
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
sub Mul_GF4_N { # not used, see next subroutine
|
|
235
|
+
# multiply and scale by N
|
|
236
|
+
my ($x0,$x1,$y0,$y1,$t0)=@_;
|
|
237
|
+
$code.=<<___;
|
|
238
|
+
movdqa $y0, $t0
|
|
239
|
+
pxor $y1, $t0
|
|
240
|
+
pand $x0, $t0
|
|
241
|
+
pxor $x1, $x0
|
|
242
|
+
pand $y0, $x1
|
|
243
|
+
pand $y1, $x0
|
|
244
|
+
pxor $x0, $x1
|
|
245
|
+
pxor $t0, $x0
|
|
246
|
+
___
|
|
247
|
+
}
|
|
248
|
+
|
|
249
|
+
sub Mul_GF4_N_GF4 {
|
|
250
|
+
# interleaved Mul_GF4_N and Mul_GF4
|
|
251
|
+
my ($x0,$x1,$y0,$y1,$t0,
|
|
252
|
+
$x2,$x3,$y2,$y3,$t1)=@_;
|
|
253
|
+
$code.=<<___;
|
|
254
|
+
movdqa $y0, $t0
|
|
255
|
+
movdqa $y2, $t1
|
|
256
|
+
pxor $y1, $t0
|
|
257
|
+
pxor $y3, $t1
|
|
258
|
+
pand $x0, $t0
|
|
259
|
+
pand $x2, $t1
|
|
260
|
+
pxor $x1, $x0
|
|
261
|
+
pxor $x3, $x2
|
|
262
|
+
pand $y0, $x1
|
|
263
|
+
pand $y2, $x3
|
|
264
|
+
pand $y1, $x0
|
|
265
|
+
pand $y3, $x2
|
|
266
|
+
pxor $x0, $x1
|
|
267
|
+
pxor $x3, $x2
|
|
268
|
+
pxor $t0, $x0
|
|
269
|
+
pxor $t1, $x3
|
|
270
|
+
___
|
|
271
|
+
}
|
|
272
|
+
sub Mul_GF16_2 {
|
|
273
|
+
my @x=@_[0..7];
|
|
274
|
+
my @y=@_[8..11];
|
|
275
|
+
my @t=@_[12..15];
|
|
276
|
+
$code.=<<___;
|
|
277
|
+
movdqa @x[0], @t[0]
|
|
278
|
+
movdqa @x[1], @t[1]
|
|
279
|
+
___
|
|
280
|
+
&Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
|
|
281
|
+
$code.=<<___;
|
|
282
|
+
pxor @x[2], @t[0]
|
|
283
|
+
pxor @x[3], @t[1]
|
|
284
|
+
pxor @y[2], @y[0]
|
|
285
|
+
pxor @y[3], @y[1]
|
|
286
|
+
___
|
|
287
|
+
Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
|
|
288
|
+
@x[2], @x[3], @y[2], @y[3], @t[2]);
|
|
289
|
+
$code.=<<___;
|
|
290
|
+
pxor @t[0], @x[0]
|
|
291
|
+
pxor @t[0], @x[2]
|
|
292
|
+
pxor @t[1], @x[1]
|
|
293
|
+
pxor @t[1], @x[3]
|
|
294
|
+
|
|
295
|
+
movdqa @x[4], @t[0]
|
|
296
|
+
movdqa @x[5], @t[1]
|
|
297
|
+
pxor @x[6], @t[0]
|
|
298
|
+
pxor @x[7], @t[1]
|
|
299
|
+
___
|
|
300
|
+
&Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
|
|
301
|
+
@x[6], @x[7], @y[2], @y[3], @t[2]);
|
|
302
|
+
$code.=<<___;
|
|
303
|
+
pxor @y[2], @y[0]
|
|
304
|
+
pxor @y[3], @y[1]
|
|
305
|
+
___
|
|
306
|
+
&Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
|
|
307
|
+
$code.=<<___;
|
|
308
|
+
pxor @t[0], @x[4]
|
|
309
|
+
pxor @t[0], @x[6]
|
|
310
|
+
pxor @t[1], @x[5]
|
|
311
|
+
pxor @t[1], @x[7]
|
|
312
|
+
___
|
|
313
|
+
}
|
|
314
|
+
sub Inv_GF256 {
|
|
315
|
+
#;********************************************************************
|
|
316
|
+
#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
|
|
317
|
+
#;********************************************************************
|
|
318
|
+
my @x=@_[0..7];
|
|
319
|
+
my @t=@_[8..11];
|
|
320
|
+
my @s=@_[12..15];
|
|
321
|
+
# direct optimizations from hardware
|
|
322
|
+
$code.=<<___;
|
|
323
|
+
movdqa @x[4], @t[3]
|
|
324
|
+
movdqa @x[5], @t[2]
|
|
325
|
+
movdqa @x[1], @t[1]
|
|
326
|
+
movdqa @x[7], @s[1]
|
|
327
|
+
movdqa @x[0], @s[0]
|
|
328
|
+
|
|
329
|
+
pxor @x[6], @t[3]
|
|
330
|
+
pxor @x[7], @t[2]
|
|
331
|
+
pxor @x[3], @t[1]
|
|
332
|
+
movdqa @t[3], @s[2]
|
|
333
|
+
pxor @x[6], @s[1]
|
|
334
|
+
movdqa @t[2], @t[0]
|
|
335
|
+
pxor @x[2], @s[0]
|
|
336
|
+
movdqa @t[3], @s[3]
|
|
337
|
+
|
|
338
|
+
por @t[1], @t[2]
|
|
339
|
+
por @s[0], @t[3]
|
|
340
|
+
pxor @t[0], @s[3]
|
|
341
|
+
pand @s[0], @s[2]
|
|
342
|
+
pxor @t[1], @s[0]
|
|
343
|
+
pand @t[1], @t[0]
|
|
344
|
+
pand @s[0], @s[3]
|
|
345
|
+
movdqa @x[3], @s[0]
|
|
346
|
+
pxor @x[2], @s[0]
|
|
347
|
+
pand @s[0], @s[1]
|
|
348
|
+
pxor @s[1], @t[3]
|
|
349
|
+
pxor @s[1], @t[2]
|
|
350
|
+
movdqa @x[4], @s[1]
|
|
351
|
+
movdqa @x[1], @s[0]
|
|
352
|
+
pxor @x[5], @s[1]
|
|
353
|
+
pxor @x[0], @s[0]
|
|
354
|
+
movdqa @s[1], @t[1]
|
|
355
|
+
pand @s[0], @s[1]
|
|
356
|
+
por @s[0], @t[1]
|
|
357
|
+
pxor @s[1], @t[0]
|
|
358
|
+
pxor @s[3], @t[3]
|
|
359
|
+
pxor @s[2], @t[2]
|
|
360
|
+
pxor @s[3], @t[1]
|
|
361
|
+
movdqa @x[7], @s[0]
|
|
362
|
+
pxor @s[2], @t[0]
|
|
363
|
+
movdqa @x[6], @s[1]
|
|
364
|
+
pxor @s[2], @t[1]
|
|
365
|
+
movdqa @x[5], @s[2]
|
|
366
|
+
pand @x[3], @s[0]
|
|
367
|
+
movdqa @x[4], @s[3]
|
|
368
|
+
pand @x[2], @s[1]
|
|
369
|
+
pand @x[1], @s[2]
|
|
370
|
+
por @x[0], @s[3]
|
|
371
|
+
pxor @s[0], @t[3]
|
|
372
|
+
pxor @s[1], @t[2]
|
|
373
|
+
pxor @s[2], @t[1]
|
|
374
|
+
pxor @s[3], @t[0]
|
|
375
|
+
|
|
376
|
+
#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
|
|
377
|
+
|
|
378
|
+
# new smaller inversion
|
|
379
|
+
|
|
380
|
+
movdqa @t[3], @s[0]
|
|
381
|
+
pand @t[1], @t[3]
|
|
382
|
+
pxor @t[2], @s[0]
|
|
383
|
+
|
|
384
|
+
movdqa @t[0], @s[2]
|
|
385
|
+
movdqa @s[0], @s[3]
|
|
386
|
+
pxor @t[3], @s[2]
|
|
387
|
+
pand @s[2], @s[3]
|
|
388
|
+
|
|
389
|
+
movdqa @t[1], @s[1]
|
|
390
|
+
pxor @t[2], @s[3]
|
|
391
|
+
pxor @t[0], @s[1]
|
|
392
|
+
|
|
393
|
+
pxor @t[2], @t[3]
|
|
394
|
+
|
|
395
|
+
pand @t[3], @s[1]
|
|
396
|
+
|
|
397
|
+
movdqa @s[2], @t[2]
|
|
398
|
+
pxor @t[0], @s[1]
|
|
399
|
+
|
|
400
|
+
pxor @s[1], @t[2]
|
|
401
|
+
pxor @s[1], @t[1]
|
|
402
|
+
|
|
403
|
+
pand @t[0], @t[2]
|
|
404
|
+
|
|
405
|
+
pxor @t[2], @s[2]
|
|
406
|
+
pxor @t[2], @t[1]
|
|
407
|
+
|
|
408
|
+
pand @s[3], @s[2]
|
|
409
|
+
|
|
410
|
+
pxor @s[0], @s[2]
|
|
411
|
+
___
|
|
412
|
+
# output in s3, s2, s1, t1
|
|
413
|
+
|
|
414
|
+
# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
|
|
415
|
+
|
|
416
|
+
# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
|
|
417
|
+
&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
|
|
418
|
+
|
|
419
|
+
### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
|
|
420
|
+
}
|
|
421
|
+
|
|
422
|
+
# AES linear components
|
|
423
|
+
|
|
424
|
+
sub ShiftRows {
|
|
425
|
+
my @x=@_[0..7];
|
|
426
|
+
my $mask=pop;
|
|
427
|
+
$code.=<<___;
|
|
428
|
+
pxor 0x00($key),@x[0]
|
|
429
|
+
pxor 0x10($key),@x[1]
|
|
430
|
+
pxor 0x20($key),@x[2]
|
|
431
|
+
pxor 0x30($key),@x[3]
|
|
432
|
+
pshufb $mask,@x[0]
|
|
433
|
+
pshufb $mask,@x[1]
|
|
434
|
+
pxor 0x40($key),@x[4]
|
|
435
|
+
pxor 0x50($key),@x[5]
|
|
436
|
+
pshufb $mask,@x[2]
|
|
437
|
+
pshufb $mask,@x[3]
|
|
438
|
+
pxor 0x60($key),@x[6]
|
|
439
|
+
pxor 0x70($key),@x[7]
|
|
440
|
+
pshufb $mask,@x[4]
|
|
441
|
+
pshufb $mask,@x[5]
|
|
442
|
+
pshufb $mask,@x[6]
|
|
443
|
+
pshufb $mask,@x[7]
|
|
444
|
+
lea 0x80($key),$key
|
|
445
|
+
___
|
|
446
|
+
}
|
|
447
|
+
|
|
448
|
+
sub MixColumns {
|
|
449
|
+
# modified to emit output in order suitable for feeding back to aesenc[last]
|
|
450
|
+
my @x=@_[0..7];
|
|
451
|
+
my @t=@_[8..15];
|
|
452
|
+
my $inv=@_[16]; # optional
|
|
453
|
+
$code.=<<___;
|
|
454
|
+
pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
|
|
455
|
+
pshufd \$0x93, @x[1], @t[1]
|
|
456
|
+
pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
|
|
457
|
+
pshufd \$0x93, @x[2], @t[2]
|
|
458
|
+
pxor @t[1], @x[1]
|
|
459
|
+
pshufd \$0x93, @x[3], @t[3]
|
|
460
|
+
pxor @t[2], @x[2]
|
|
461
|
+
pshufd \$0x93, @x[4], @t[4]
|
|
462
|
+
pxor @t[3], @x[3]
|
|
463
|
+
pshufd \$0x93, @x[5], @t[5]
|
|
464
|
+
pxor @t[4], @x[4]
|
|
465
|
+
pshufd \$0x93, @x[6], @t[6]
|
|
466
|
+
pxor @t[5], @x[5]
|
|
467
|
+
pshufd \$0x93, @x[7], @t[7]
|
|
468
|
+
pxor @t[6], @x[6]
|
|
469
|
+
pxor @t[7], @x[7]
|
|
470
|
+
|
|
471
|
+
pxor @x[0], @t[1]
|
|
472
|
+
pxor @x[7], @t[0]
|
|
473
|
+
pxor @x[7], @t[1]
|
|
474
|
+
pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
|
|
475
|
+
pxor @x[1], @t[2]
|
|
476
|
+
pshufd \$0x4E, @x[1], @x[1]
|
|
477
|
+
pxor @x[4], @t[5]
|
|
478
|
+
pxor @t[0], @x[0]
|
|
479
|
+
pxor @x[5], @t[6]
|
|
480
|
+
pxor @t[1], @x[1]
|
|
481
|
+
pxor @x[3], @t[4]
|
|
482
|
+
pshufd \$0x4E, @x[4], @t[0]
|
|
483
|
+
pxor @x[6], @t[7]
|
|
484
|
+
pshufd \$0x4E, @x[5], @t[1]
|
|
485
|
+
pxor @x[2], @t[3]
|
|
486
|
+
pshufd \$0x4E, @x[3], @x[4]
|
|
487
|
+
pxor @x[7], @t[3]
|
|
488
|
+
pshufd \$0x4E, @x[7], @x[5]
|
|
489
|
+
pxor @x[7], @t[4]
|
|
490
|
+
pshufd \$0x4E, @x[6], @x[3]
|
|
491
|
+
pxor @t[4], @t[0]
|
|
492
|
+
pshufd \$0x4E, @x[2], @x[6]
|
|
493
|
+
pxor @t[5], @t[1]
|
|
494
|
+
___
|
|
495
|
+
$code.=<<___ if (!$inv);
|
|
496
|
+
pxor @t[3], @x[4]
|
|
497
|
+
pxor @t[7], @x[5]
|
|
498
|
+
pxor @t[6], @x[3]
|
|
499
|
+
movdqa @t[0], @x[2]
|
|
500
|
+
pxor @t[2], @x[6]
|
|
501
|
+
movdqa @t[1], @x[7]
|
|
502
|
+
___
|
|
503
|
+
$code.=<<___ if ($inv);
|
|
504
|
+
pxor @x[4], @t[3]
|
|
505
|
+
pxor @t[7], @x[5]
|
|
506
|
+
pxor @x[3], @t[6]
|
|
507
|
+
movdqa @t[0], @x[3]
|
|
508
|
+
pxor @t[2], @x[6]
|
|
509
|
+
movdqa @t[6], @x[2]
|
|
510
|
+
movdqa @t[1], @x[7]
|
|
511
|
+
movdqa @x[6], @x[4]
|
|
512
|
+
movdqa @t[3], @x[6]
|
|
513
|
+
___
|
|
514
|
+
}
|
|
515
|
+
|
|
516
|
+
sub InvMixColumns_orig {
|
|
517
|
+
my @x=@_[0..7];
|
|
518
|
+
my @t=@_[8..15];
|
|
519
|
+
|
|
520
|
+
$code.=<<___;
|
|
521
|
+
# multiplication by 0x0e
|
|
522
|
+
pshufd \$0x93, @x[7], @t[7]
|
|
523
|
+
movdqa @x[2], @t[2]
|
|
524
|
+
pxor @x[5], @x[7] # 7 5
|
|
525
|
+
pxor @x[5], @x[2] # 2 5
|
|
526
|
+
pshufd \$0x93, @x[0], @t[0]
|
|
527
|
+
movdqa @x[5], @t[5]
|
|
528
|
+
pxor @x[0], @x[5] # 5 0 [1]
|
|
529
|
+
pxor @x[1], @x[0] # 0 1
|
|
530
|
+
pshufd \$0x93, @x[1], @t[1]
|
|
531
|
+
pxor @x[2], @x[1] # 1 25
|
|
532
|
+
pxor @x[6], @x[0] # 01 6 [2]
|
|
533
|
+
pxor @x[3], @x[1] # 125 3 [4]
|
|
534
|
+
pshufd \$0x93, @x[3], @t[3]
|
|
535
|
+
pxor @x[0], @x[2] # 25 016 [3]
|
|
536
|
+
pxor @x[7], @x[3] # 3 75
|
|
537
|
+
pxor @x[6], @x[7] # 75 6 [0]
|
|
538
|
+
pshufd \$0x93, @x[6], @t[6]
|
|
539
|
+
movdqa @x[4], @t[4]
|
|
540
|
+
pxor @x[4], @x[6] # 6 4
|
|
541
|
+
pxor @x[3], @x[4] # 4 375 [6]
|
|
542
|
+
pxor @x[7], @x[3] # 375 756=36
|
|
543
|
+
pxor @t[5], @x[6] # 64 5 [7]
|
|
544
|
+
pxor @t[2], @x[3] # 36 2
|
|
545
|
+
pxor @t[4], @x[3] # 362 4 [5]
|
|
546
|
+
pshufd \$0x93, @t[5], @t[5]
|
|
547
|
+
___
|
|
548
|
+
my @y = @x[7,5,0,2,1,3,4,6];
|
|
549
|
+
$code.=<<___;
|
|
550
|
+
# multiplication by 0x0b
|
|
551
|
+
pxor @y[0], @y[1]
|
|
552
|
+
pxor @t[0], @y[0]
|
|
553
|
+
pxor @t[1], @y[1]
|
|
554
|
+
pshufd \$0x93, @t[2], @t[2]
|
|
555
|
+
pxor @t[5], @y[0]
|
|
556
|
+
pxor @t[6], @y[1]
|
|
557
|
+
pxor @t[7], @y[0]
|
|
558
|
+
pshufd \$0x93, @t[4], @t[4]
|
|
559
|
+
pxor @t[6], @t[7] # clobber t[7]
|
|
560
|
+
pxor @y[0], @y[1]
|
|
561
|
+
|
|
562
|
+
pxor @t[0], @y[3]
|
|
563
|
+
pshufd \$0x93, @t[0], @t[0]
|
|
564
|
+
pxor @t[1], @y[2]
|
|
565
|
+
pxor @t[1], @y[4]
|
|
566
|
+
pxor @t[2], @y[2]
|
|
567
|
+
pshufd \$0x93, @t[1], @t[1]
|
|
568
|
+
pxor @t[2], @y[3]
|
|
569
|
+
pxor @t[2], @y[5]
|
|
570
|
+
pxor @t[7], @y[2]
|
|
571
|
+
pshufd \$0x93, @t[2], @t[2]
|
|
572
|
+
pxor @t[3], @y[3]
|
|
573
|
+
pxor @t[3], @y[6]
|
|
574
|
+
pxor @t[3], @y[4]
|
|
575
|
+
pshufd \$0x93, @t[3], @t[3]
|
|
576
|
+
pxor @t[4], @y[7]
|
|
577
|
+
pxor @t[4], @y[5]
|
|
578
|
+
pxor @t[7], @y[7]
|
|
579
|
+
pxor @t[5], @y[3]
|
|
580
|
+
pxor @t[4], @y[4]
|
|
581
|
+
pxor @t[5], @t[7] # clobber t[7] even more
|
|
582
|
+
|
|
583
|
+
pxor @t[7], @y[5]
|
|
584
|
+
pshufd \$0x93, @t[4], @t[4]
|
|
585
|
+
pxor @t[7], @y[6]
|
|
586
|
+
pxor @t[7], @y[4]
|
|
587
|
+
|
|
588
|
+
pxor @t[5], @t[7]
|
|
589
|
+
pshufd \$0x93, @t[5], @t[5]
|
|
590
|
+
pxor @t[6], @t[7] # restore t[7]
|
|
591
|
+
|
|
592
|
+
# multiplication by 0x0d
|
|
593
|
+
pxor @y[7], @y[4]
|
|
594
|
+
pxor @t[4], @y[7]
|
|
595
|
+
pshufd \$0x93, @t[6], @t[6]
|
|
596
|
+
pxor @t[0], @y[2]
|
|
597
|
+
pxor @t[5], @y[7]
|
|
598
|
+
pxor @t[2], @y[2]
|
|
599
|
+
pshufd \$0x93, @t[7], @t[7]
|
|
600
|
+
|
|
601
|
+
pxor @y[1], @y[3]
|
|
602
|
+
pxor @t[1], @y[1]
|
|
603
|
+
pxor @t[0], @y[0]
|
|
604
|
+
pxor @t[0], @y[3]
|
|
605
|
+
pxor @t[5], @y[1]
|
|
606
|
+
pxor @t[5], @y[0]
|
|
607
|
+
pxor @t[7], @y[1]
|
|
608
|
+
pshufd \$0x93, @t[0], @t[0]
|
|
609
|
+
pxor @t[6], @y[0]
|
|
610
|
+
pxor @y[1], @y[3]
|
|
611
|
+
pxor @t[1], @y[4]
|
|
612
|
+
pshufd \$0x93, @t[1], @t[1]
|
|
613
|
+
|
|
614
|
+
pxor @t[7], @y[7]
|
|
615
|
+
pxor @t[2], @y[4]
|
|
616
|
+
pxor @t[2], @y[5]
|
|
617
|
+
pshufd \$0x93, @t[2], @t[2]
|
|
618
|
+
pxor @t[6], @y[2]
|
|
619
|
+
pxor @t[3], @t[6] # clobber t[6]
|
|
620
|
+
pxor @y[7], @y[4]
|
|
621
|
+
pxor @t[6], @y[3]
|
|
622
|
+
|
|
623
|
+
pxor @t[6], @y[6]
|
|
624
|
+
pxor @t[5], @y[5]
|
|
625
|
+
pxor @t[4], @y[6]
|
|
626
|
+
pshufd \$0x93, @t[4], @t[4]
|
|
627
|
+
pxor @t[6], @y[5]
|
|
628
|
+
pxor @t[7], @y[6]
|
|
629
|
+
pxor @t[3], @t[6] # restore t[6]
|
|
630
|
+
|
|
631
|
+
pshufd \$0x93, @t[5], @t[5]
|
|
632
|
+
pshufd \$0x93, @t[6], @t[6]
|
|
633
|
+
pshufd \$0x93, @t[7], @t[7]
|
|
634
|
+
pshufd \$0x93, @t[3], @t[3]
|
|
635
|
+
|
|
636
|
+
# multiplication by 0x09
|
|
637
|
+
pxor @y[1], @y[4]
|
|
638
|
+
pxor @y[1], @t[1] # t[1]=y[1]
|
|
639
|
+
pxor @t[5], @t[0] # clobber t[0]
|
|
640
|
+
pxor @t[5], @t[1]
|
|
641
|
+
pxor @t[0], @y[3]
|
|
642
|
+
pxor @y[0], @t[0] # t[0]=y[0]
|
|
643
|
+
pxor @t[6], @t[1]
|
|
644
|
+
pxor @t[7], @t[6] # clobber t[6]
|
|
645
|
+
pxor @t[1], @y[4]
|
|
646
|
+
pxor @t[4], @y[7]
|
|
647
|
+
pxor @y[4], @t[4] # t[4]=y[4]
|
|
648
|
+
pxor @t[3], @y[6]
|
|
649
|
+
pxor @y[3], @t[3] # t[3]=y[3]
|
|
650
|
+
pxor @t[2], @y[5]
|
|
651
|
+
pxor @y[2], @t[2] # t[2]=y[2]
|
|
652
|
+
pxor @t[7], @t[3]
|
|
653
|
+
pxor @y[5], @t[5] # t[5]=y[5]
|
|
654
|
+
pxor @t[6], @t[2]
|
|
655
|
+
pxor @t[6], @t[5]
|
|
656
|
+
pxor @y[6], @t[6] # t[6]=y[6]
|
|
657
|
+
pxor @y[7], @t[7] # t[7]=y[7]
|
|
658
|
+
|
|
659
|
+
movdqa @t[0],@XMM[0]
|
|
660
|
+
movdqa @t[1],@XMM[1]
|
|
661
|
+
movdqa @t[2],@XMM[2]
|
|
662
|
+
movdqa @t[3],@XMM[3]
|
|
663
|
+
movdqa @t[4],@XMM[4]
|
|
664
|
+
movdqa @t[5],@XMM[5]
|
|
665
|
+
movdqa @t[6],@XMM[6]
|
|
666
|
+
movdqa @t[7],@XMM[7]
|
|
667
|
+
___
|
|
668
|
+
}
|
|
669
|
+
|
|
670
|
+
sub InvMixColumns {
|
|
671
|
+
my @x=@_[0..7];
|
|
672
|
+
my @t=@_[8..15];
|
|
673
|
+
|
|
674
|
+
# Thanks to Jussi Kivilinna for providing pointer to
|
|
675
|
+
#
|
|
676
|
+
# | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
|
|
677
|
+
# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
|
|
678
|
+
# | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
|
|
679
|
+
# | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
|
|
680
|
+
|
|
681
|
+
$code.=<<___;
|
|
682
|
+
# multiplication by 0x05-0x00-0x04-0x00
|
|
683
|
+
pshufd \$0x4E, @x[0], @t[0]
|
|
684
|
+
pshufd \$0x4E, @x[6], @t[6]
|
|
685
|
+
pxor @x[0], @t[0]
|
|
686
|
+
pshufd \$0x4E, @x[7], @t[7]
|
|
687
|
+
pxor @x[6], @t[6]
|
|
688
|
+
pshufd \$0x4E, @x[1], @t[1]
|
|
689
|
+
pxor @x[7], @t[7]
|
|
690
|
+
pshufd \$0x4E, @x[2], @t[2]
|
|
691
|
+
pxor @x[1], @t[1]
|
|
692
|
+
pshufd \$0x4E, @x[3], @t[3]
|
|
693
|
+
pxor @x[2], @t[2]
|
|
694
|
+
pxor @t[6], @x[0]
|
|
695
|
+
pxor @t[6], @x[1]
|
|
696
|
+
pshufd \$0x4E, @x[4], @t[4]
|
|
697
|
+
pxor @x[3], @t[3]
|
|
698
|
+
pxor @t[0], @x[2]
|
|
699
|
+
pxor @t[1], @x[3]
|
|
700
|
+
pshufd \$0x4E, @x[5], @t[5]
|
|
701
|
+
pxor @x[4], @t[4]
|
|
702
|
+
pxor @t[7], @x[1]
|
|
703
|
+
pxor @t[2], @x[4]
|
|
704
|
+
pxor @x[5], @t[5]
|
|
705
|
+
|
|
706
|
+
pxor @t[7], @x[2]
|
|
707
|
+
pxor @t[6], @x[3]
|
|
708
|
+
pxor @t[6], @x[4]
|
|
709
|
+
pxor @t[3], @x[5]
|
|
710
|
+
pxor @t[4], @x[6]
|
|
711
|
+
pxor @t[7], @x[4]
|
|
712
|
+
pxor @t[7], @x[5]
|
|
713
|
+
pxor @t[5], @x[7]
|
|
714
|
+
___
|
|
715
|
+
&MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
|
|
716
|
+
}
|
|
717
|
+
|
|
718
|
+
sub aesenc { # not used
|
|
719
|
+
my @b=@_[0..7];
|
|
720
|
+
my @t=@_[8..15];
|
|
721
|
+
$code.=<<___;
|
|
722
|
+
movdqa 0x30($const),@t[0] # .LSR
|
|
723
|
+
___
|
|
724
|
+
&ShiftRows (@b,@t[0]);
|
|
725
|
+
&Sbox (@b,@t);
|
|
726
|
+
&MixColumns (@b[0,1,4,6,3,7,2,5],@t);
|
|
727
|
+
}
|
|
728
|
+
|
|
729
|
+
sub aesenclast { # not used
|
|
730
|
+
my @b=@_[0..7];
|
|
731
|
+
my @t=@_[8..15];
|
|
732
|
+
$code.=<<___;
|
|
733
|
+
movdqa 0x40($const),@t[0] # .LSRM0
|
|
734
|
+
___
|
|
735
|
+
&ShiftRows (@b,@t[0]);
|
|
736
|
+
&Sbox (@b,@t);
|
|
737
|
+
$code.=<<___
|
|
738
|
+
pxor 0x00($key),@b[0]
|
|
739
|
+
pxor 0x10($key),@b[1]
|
|
740
|
+
pxor 0x20($key),@b[4]
|
|
741
|
+
pxor 0x30($key),@b[6]
|
|
742
|
+
pxor 0x40($key),@b[3]
|
|
743
|
+
pxor 0x50($key),@b[7]
|
|
744
|
+
pxor 0x60($key),@b[2]
|
|
745
|
+
pxor 0x70($key),@b[5]
|
|
746
|
+
___
|
|
747
|
+
}
|
|
748
|
+
|
|
749
|
+
sub swapmove {
|
|
750
|
+
my ($a,$b,$n,$mask,$t)=@_;
|
|
751
|
+
$code.=<<___;
|
|
752
|
+
movdqa $b,$t
|
|
753
|
+
psrlq \$$n,$b
|
|
754
|
+
pxor $a,$b
|
|
755
|
+
pand $mask,$b
|
|
756
|
+
pxor $b,$a
|
|
757
|
+
psllq \$$n,$b
|
|
758
|
+
pxor $t,$b
|
|
759
|
+
___
|
|
760
|
+
}
|
|
761
|
+
sub swapmove2x {
|
|
762
|
+
my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
|
|
763
|
+
$code.=<<___;
|
|
764
|
+
movdqa $b0,$t0
|
|
765
|
+
psrlq \$$n,$b0
|
|
766
|
+
movdqa $b1,$t1
|
|
767
|
+
psrlq \$$n,$b1
|
|
768
|
+
pxor $a0,$b0
|
|
769
|
+
pxor $a1,$b1
|
|
770
|
+
pand $mask,$b0
|
|
771
|
+
pand $mask,$b1
|
|
772
|
+
pxor $b0,$a0
|
|
773
|
+
psllq \$$n,$b0
|
|
774
|
+
pxor $b1,$a1
|
|
775
|
+
psllq \$$n,$b1
|
|
776
|
+
pxor $t0,$b0
|
|
777
|
+
pxor $t1,$b1
|
|
778
|
+
___
|
|
779
|
+
}
|
|
780
|
+
|
|
781
|
+
sub bitslice {
|
|
782
|
+
my @x=reverse(@_[0..7]);
|
|
783
|
+
my ($t0,$t1,$t2,$t3)=@_[8..11];
|
|
784
|
+
$code.=<<___;
|
|
785
|
+
movdqa 0x00($const),$t0 # .LBS0
|
|
786
|
+
movdqa 0x10($const),$t1 # .LBS1
|
|
787
|
+
___
|
|
788
|
+
&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
|
|
789
|
+
&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
|
|
790
|
+
$code.=<<___;
|
|
791
|
+
movdqa 0x20($const),$t0 # .LBS2
|
|
792
|
+
___
|
|
793
|
+
&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
|
|
794
|
+
&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
|
|
795
|
+
|
|
796
|
+
&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
|
|
797
|
+
&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
|
|
798
|
+
}
|
|
799
|
+
|
|
800
|
+
$code.=<<___;
|
|
801
|
+
.text
|
|
802
|
+
|
|
803
|
+
.extern asm_AES_encrypt
|
|
804
|
+
.extern asm_AES_decrypt
|
|
805
|
+
|
|
806
|
+
.type _bsaes_encrypt8,\@abi-omnipotent
|
|
807
|
+
.align 64
|
|
808
|
+
_bsaes_encrypt8:
|
|
809
|
+
lea .LBS0(%rip), $const # constants table
|
|
810
|
+
|
|
811
|
+
movdqa ($key), @XMM[9] # round 0 key
|
|
812
|
+
lea 0x10($key), $key
|
|
813
|
+
movdqa 0x50($const), @XMM[8] # .LM0SR
|
|
814
|
+
pxor @XMM[9], @XMM[0] # xor with round0 key
|
|
815
|
+
pxor @XMM[9], @XMM[1]
|
|
816
|
+
pxor @XMM[9], @XMM[2]
|
|
817
|
+
pxor @XMM[9], @XMM[3]
|
|
818
|
+
pshufb @XMM[8], @XMM[0]
|
|
819
|
+
pshufb @XMM[8], @XMM[1]
|
|
820
|
+
pxor @XMM[9], @XMM[4]
|
|
821
|
+
pxor @XMM[9], @XMM[5]
|
|
822
|
+
pshufb @XMM[8], @XMM[2]
|
|
823
|
+
pshufb @XMM[8], @XMM[3]
|
|
824
|
+
pxor @XMM[9], @XMM[6]
|
|
825
|
+
pxor @XMM[9], @XMM[7]
|
|
826
|
+
pshufb @XMM[8], @XMM[4]
|
|
827
|
+
pshufb @XMM[8], @XMM[5]
|
|
828
|
+
pshufb @XMM[8], @XMM[6]
|
|
829
|
+
pshufb @XMM[8], @XMM[7]
|
|
830
|
+
_bsaes_encrypt8_bitslice:
|
|
831
|
+
___
|
|
832
|
+
&bitslice (@XMM[0..7, 8..11]);
|
|
833
|
+
$code.=<<___;
|
|
834
|
+
dec $rounds
|
|
835
|
+
jmp .Lenc_sbox
|
|
836
|
+
.align 16
|
|
837
|
+
.Lenc_loop:
|
|
838
|
+
___
|
|
839
|
+
&ShiftRows (@XMM[0..7, 8]);
|
|
840
|
+
$code.=".Lenc_sbox:\n";
|
|
841
|
+
&Sbox (@XMM[0..7, 8..15]);
|
|
842
|
+
$code.=<<___;
|
|
843
|
+
dec $rounds
|
|
844
|
+
jl .Lenc_done
|
|
845
|
+
___
|
|
846
|
+
&MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
|
|
847
|
+
$code.=<<___;
|
|
848
|
+
movdqa 0x30($const), @XMM[8] # .LSR
|
|
849
|
+
jnz .Lenc_loop
|
|
850
|
+
movdqa 0x40($const), @XMM[8] # .LSRM0
|
|
851
|
+
jmp .Lenc_loop
|
|
852
|
+
.align 16
|
|
853
|
+
.Lenc_done:
|
|
854
|
+
___
|
|
855
|
+
# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
|
|
856
|
+
&bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
|
|
857
|
+
$code.=<<___;
|
|
858
|
+
movdqa ($key), @XMM[8] # last round key
|
|
859
|
+
pxor @XMM[8], @XMM[4]
|
|
860
|
+
pxor @XMM[8], @XMM[6]
|
|
861
|
+
pxor @XMM[8], @XMM[3]
|
|
862
|
+
pxor @XMM[8], @XMM[7]
|
|
863
|
+
pxor @XMM[8], @XMM[2]
|
|
864
|
+
pxor @XMM[8], @XMM[5]
|
|
865
|
+
pxor @XMM[8], @XMM[0]
|
|
866
|
+
pxor @XMM[8], @XMM[1]
|
|
867
|
+
ret
|
|
868
|
+
.size _bsaes_encrypt8,.-_bsaes_encrypt8
|
|
869
|
+
|
|
870
|
+
.type _bsaes_decrypt8,\@abi-omnipotent
|
|
871
|
+
.align 64
|
|
872
|
+
_bsaes_decrypt8:
|
|
873
|
+
lea .LBS0(%rip), $const # constants table
|
|
874
|
+
|
|
875
|
+
movdqa ($key), @XMM[9] # round 0 key
|
|
876
|
+
lea 0x10($key), $key
|
|
877
|
+
movdqa -0x30($const), @XMM[8] # .LM0ISR
|
|
878
|
+
pxor @XMM[9], @XMM[0] # xor with round0 key
|
|
879
|
+
pxor @XMM[9], @XMM[1]
|
|
880
|
+
pxor @XMM[9], @XMM[2]
|
|
881
|
+
pxor @XMM[9], @XMM[3]
|
|
882
|
+
pshufb @XMM[8], @XMM[0]
|
|
883
|
+
pshufb @XMM[8], @XMM[1]
|
|
884
|
+
pxor @XMM[9], @XMM[4]
|
|
885
|
+
pxor @XMM[9], @XMM[5]
|
|
886
|
+
pshufb @XMM[8], @XMM[2]
|
|
887
|
+
pshufb @XMM[8], @XMM[3]
|
|
888
|
+
pxor @XMM[9], @XMM[6]
|
|
889
|
+
pxor @XMM[9], @XMM[7]
|
|
890
|
+
pshufb @XMM[8], @XMM[4]
|
|
891
|
+
pshufb @XMM[8], @XMM[5]
|
|
892
|
+
pshufb @XMM[8], @XMM[6]
|
|
893
|
+
pshufb @XMM[8], @XMM[7]
|
|
894
|
+
___
|
|
895
|
+
&bitslice (@XMM[0..7, 8..11]);
|
|
896
|
+
$code.=<<___;
|
|
897
|
+
dec $rounds
|
|
898
|
+
jmp .Ldec_sbox
|
|
899
|
+
.align 16
|
|
900
|
+
.Ldec_loop:
|
|
901
|
+
___
|
|
902
|
+
&ShiftRows (@XMM[0..7, 8]);
|
|
903
|
+
$code.=".Ldec_sbox:\n";
|
|
904
|
+
&InvSbox (@XMM[0..7, 8..15]);
|
|
905
|
+
$code.=<<___;
|
|
906
|
+
dec $rounds
|
|
907
|
+
jl .Ldec_done
|
|
908
|
+
___
|
|
909
|
+
&InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
|
|
910
|
+
$code.=<<___;
|
|
911
|
+
movdqa -0x10($const), @XMM[8] # .LISR
|
|
912
|
+
jnz .Ldec_loop
|
|
913
|
+
movdqa -0x20($const), @XMM[8] # .LISRM0
|
|
914
|
+
jmp .Ldec_loop
|
|
915
|
+
.align 16
|
|
916
|
+
.Ldec_done:
|
|
917
|
+
___
|
|
918
|
+
&bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
|
|
919
|
+
$code.=<<___;
|
|
920
|
+
movdqa ($key), @XMM[8] # last round key
|
|
921
|
+
pxor @XMM[8], @XMM[6]
|
|
922
|
+
pxor @XMM[8], @XMM[4]
|
|
923
|
+
pxor @XMM[8], @XMM[2]
|
|
924
|
+
pxor @XMM[8], @XMM[7]
|
|
925
|
+
pxor @XMM[8], @XMM[3]
|
|
926
|
+
pxor @XMM[8], @XMM[5]
|
|
927
|
+
pxor @XMM[8], @XMM[0]
|
|
928
|
+
pxor @XMM[8], @XMM[1]
|
|
929
|
+
ret
|
|
930
|
+
.size _bsaes_decrypt8,.-_bsaes_decrypt8
|
|
931
|
+
___
|
|
932
|
+
}
|
|
933
|
+
{
|
|
934
|
+
my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
|
|
935
|
+
|
|
936
|
+
sub bitslice_key {
|
|
937
|
+
my @x=reverse(@_[0..7]);
|
|
938
|
+
my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
|
|
939
|
+
|
|
940
|
+
&swapmove (@x[0,1],1,$bs0,$t2,$t3);
|
|
941
|
+
$code.=<<___;
|
|
942
|
+
#&swapmove(@x[2,3],1,$t0,$t2,$t3);
|
|
943
|
+
movdqa @x[0], @x[2]
|
|
944
|
+
movdqa @x[1], @x[3]
|
|
945
|
+
___
|
|
946
|
+
#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
|
|
947
|
+
|
|
948
|
+
&swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
|
|
949
|
+
$code.=<<___;
|
|
950
|
+
#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
|
|
951
|
+
movdqa @x[0], @x[4]
|
|
952
|
+
movdqa @x[2], @x[6]
|
|
953
|
+
movdqa @x[1], @x[5]
|
|
954
|
+
movdqa @x[3], @x[7]
|
|
955
|
+
___
|
|
956
|
+
&swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
|
|
957
|
+
&swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
|
|
958
|
+
}
|
|
959
|
+
|
|
960
|
+
$code.=<<___;
|
|
961
|
+
.type _bsaes_key_convert,\@abi-omnipotent
|
|
962
|
+
.align 16
|
|
963
|
+
_bsaes_key_convert:
|
|
964
|
+
lea .Lmasks(%rip), $const
|
|
965
|
+
movdqu ($inp), %xmm7 # load round 0 key
|
|
966
|
+
lea 0x10($inp), $inp
|
|
967
|
+
movdqa 0x00($const), %xmm0 # 0x01...
|
|
968
|
+
movdqa 0x10($const), %xmm1 # 0x02...
|
|
969
|
+
movdqa 0x20($const), %xmm2 # 0x04...
|
|
970
|
+
movdqa 0x30($const), %xmm3 # 0x08...
|
|
971
|
+
movdqa 0x40($const), %xmm4 # .LM0
|
|
972
|
+
pcmpeqd %xmm5, %xmm5 # .LNOT
|
|
973
|
+
|
|
974
|
+
movdqu ($inp), %xmm6 # load round 1 key
|
|
975
|
+
movdqa %xmm7, ($out) # save round 0 key
|
|
976
|
+
lea 0x10($out), $out
|
|
977
|
+
dec $rounds
|
|
978
|
+
jmp .Lkey_loop
|
|
979
|
+
.align 16
|
|
980
|
+
.Lkey_loop:
|
|
981
|
+
pshufb %xmm4, %xmm6 # .LM0
|
|
982
|
+
|
|
983
|
+
movdqa %xmm0, %xmm8
|
|
984
|
+
movdqa %xmm1, %xmm9
|
|
985
|
+
|
|
986
|
+
pand %xmm6, %xmm8
|
|
987
|
+
pand %xmm6, %xmm9
|
|
988
|
+
movdqa %xmm2, %xmm10
|
|
989
|
+
pcmpeqb %xmm0, %xmm8
|
|
990
|
+
psllq \$4, %xmm0 # 0x10...
|
|
991
|
+
movdqa %xmm3, %xmm11
|
|
992
|
+
pcmpeqb %xmm1, %xmm9
|
|
993
|
+
psllq \$4, %xmm1 # 0x20...
|
|
994
|
+
|
|
995
|
+
pand %xmm6, %xmm10
|
|
996
|
+
pand %xmm6, %xmm11
|
|
997
|
+
movdqa %xmm0, %xmm12
|
|
998
|
+
pcmpeqb %xmm2, %xmm10
|
|
999
|
+
psllq \$4, %xmm2 # 0x40...
|
|
1000
|
+
movdqa %xmm1, %xmm13
|
|
1001
|
+
pcmpeqb %xmm3, %xmm11
|
|
1002
|
+
psllq \$4, %xmm3 # 0x80...
|
|
1003
|
+
|
|
1004
|
+
movdqa %xmm2, %xmm14
|
|
1005
|
+
movdqa %xmm3, %xmm15
|
|
1006
|
+
pxor %xmm5, %xmm8 # "pnot"
|
|
1007
|
+
pxor %xmm5, %xmm9
|
|
1008
|
+
|
|
1009
|
+
pand %xmm6, %xmm12
|
|
1010
|
+
pand %xmm6, %xmm13
|
|
1011
|
+
movdqa %xmm8, 0x00($out) # write bit-sliced round key
|
|
1012
|
+
pcmpeqb %xmm0, %xmm12
|
|
1013
|
+
psrlq \$4, %xmm0 # 0x01...
|
|
1014
|
+
movdqa %xmm9, 0x10($out)
|
|
1015
|
+
pcmpeqb %xmm1, %xmm13
|
|
1016
|
+
psrlq \$4, %xmm1 # 0x02...
|
|
1017
|
+
lea 0x10($inp), $inp
|
|
1018
|
+
|
|
1019
|
+
pand %xmm6, %xmm14
|
|
1020
|
+
pand %xmm6, %xmm15
|
|
1021
|
+
movdqa %xmm10, 0x20($out)
|
|
1022
|
+
pcmpeqb %xmm2, %xmm14
|
|
1023
|
+
psrlq \$4, %xmm2 # 0x04...
|
|
1024
|
+
movdqa %xmm11, 0x30($out)
|
|
1025
|
+
pcmpeqb %xmm3, %xmm15
|
|
1026
|
+
psrlq \$4, %xmm3 # 0x08...
|
|
1027
|
+
movdqu ($inp), %xmm6 # load next round key
|
|
1028
|
+
|
|
1029
|
+
pxor %xmm5, %xmm13 # "pnot"
|
|
1030
|
+
pxor %xmm5, %xmm14
|
|
1031
|
+
movdqa %xmm12, 0x40($out)
|
|
1032
|
+
movdqa %xmm13, 0x50($out)
|
|
1033
|
+
movdqa %xmm14, 0x60($out)
|
|
1034
|
+
movdqa %xmm15, 0x70($out)
|
|
1035
|
+
lea 0x80($out),$out
|
|
1036
|
+
dec $rounds
|
|
1037
|
+
jnz .Lkey_loop
|
|
1038
|
+
|
|
1039
|
+
movdqa 0x50($const), %xmm7 # .L63
|
|
1040
|
+
#movdqa %xmm6, ($out) # don't save last round key
|
|
1041
|
+
ret
|
|
1042
|
+
.size _bsaes_key_convert,.-_bsaes_key_convert
|
|
1043
|
+
___
|
|
1044
|
+
}
|
|
1045
|
+
|
|
1046
|
+
if (0 && !$win64) { # following four functions are unsupported interface
|
|
1047
|
+
# used for benchmarking...
|
|
1048
|
+
$code.=<<___;
|
|
1049
|
+
.globl bsaes_enc_key_convert
|
|
1050
|
+
.type bsaes_enc_key_convert,\@function,2
|
|
1051
|
+
.align 16
|
|
1052
|
+
bsaes_enc_key_convert:
|
|
1053
|
+
mov 240($inp),%r10d # pass rounds
|
|
1054
|
+
mov $inp,%rcx # pass key
|
|
1055
|
+
mov $out,%rax # pass key schedule
|
|
1056
|
+
call _bsaes_key_convert
|
|
1057
|
+
pxor %xmm6,%xmm7 # fix up last round key
|
|
1058
|
+
movdqa %xmm7,(%rax) # save last round key
|
|
1059
|
+
ret
|
|
1060
|
+
.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
|
|
1061
|
+
|
|
1062
|
+
.globl bsaes_encrypt_128
|
|
1063
|
+
.type bsaes_encrypt_128,\@function,4
|
|
1064
|
+
.align 16
|
|
1065
|
+
bsaes_encrypt_128:
|
|
1066
|
+
.Lenc128_loop:
|
|
1067
|
+
movdqu 0x00($inp), @XMM[0] # load input
|
|
1068
|
+
movdqu 0x10($inp), @XMM[1]
|
|
1069
|
+
movdqu 0x20($inp), @XMM[2]
|
|
1070
|
+
movdqu 0x30($inp), @XMM[3]
|
|
1071
|
+
movdqu 0x40($inp), @XMM[4]
|
|
1072
|
+
movdqu 0x50($inp), @XMM[5]
|
|
1073
|
+
movdqu 0x60($inp), @XMM[6]
|
|
1074
|
+
movdqu 0x70($inp), @XMM[7]
|
|
1075
|
+
mov $key, %rax # pass the $key
|
|
1076
|
+
lea 0x80($inp), $inp
|
|
1077
|
+
mov \$10,%r10d
|
|
1078
|
+
|
|
1079
|
+
call _bsaes_encrypt8
|
|
1080
|
+
|
|
1081
|
+
movdqu @XMM[0], 0x00($out) # write output
|
|
1082
|
+
movdqu @XMM[1], 0x10($out)
|
|
1083
|
+
movdqu @XMM[4], 0x20($out)
|
|
1084
|
+
movdqu @XMM[6], 0x30($out)
|
|
1085
|
+
movdqu @XMM[3], 0x40($out)
|
|
1086
|
+
movdqu @XMM[7], 0x50($out)
|
|
1087
|
+
movdqu @XMM[2], 0x60($out)
|
|
1088
|
+
movdqu @XMM[5], 0x70($out)
|
|
1089
|
+
lea 0x80($out), $out
|
|
1090
|
+
sub \$0x80,$len
|
|
1091
|
+
ja .Lenc128_loop
|
|
1092
|
+
ret
|
|
1093
|
+
.size bsaes_encrypt_128,.-bsaes_encrypt_128
|
|
1094
|
+
|
|
1095
|
+
.globl bsaes_dec_key_convert
|
|
1096
|
+
.type bsaes_dec_key_convert,\@function,2
|
|
1097
|
+
.align 16
|
|
1098
|
+
bsaes_dec_key_convert:
|
|
1099
|
+
mov 240($inp),%r10d # pass rounds
|
|
1100
|
+
mov $inp,%rcx # pass key
|
|
1101
|
+
mov $out,%rax # pass key schedule
|
|
1102
|
+
call _bsaes_key_convert
|
|
1103
|
+
pxor ($out),%xmm7 # fix up round 0 key
|
|
1104
|
+
movdqa %xmm6,(%rax) # save last round key
|
|
1105
|
+
movdqa %xmm7,($out)
|
|
1106
|
+
ret
|
|
1107
|
+
.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
|
|
1108
|
+
|
|
1109
|
+
.globl bsaes_decrypt_128
|
|
1110
|
+
.type bsaes_decrypt_128,\@function,4
|
|
1111
|
+
.align 16
|
|
1112
|
+
bsaes_decrypt_128:
|
|
1113
|
+
.Ldec128_loop:
|
|
1114
|
+
movdqu 0x00($inp), @XMM[0] # load input
|
|
1115
|
+
movdqu 0x10($inp), @XMM[1]
|
|
1116
|
+
movdqu 0x20($inp), @XMM[2]
|
|
1117
|
+
movdqu 0x30($inp), @XMM[3]
|
|
1118
|
+
movdqu 0x40($inp), @XMM[4]
|
|
1119
|
+
movdqu 0x50($inp), @XMM[5]
|
|
1120
|
+
movdqu 0x60($inp), @XMM[6]
|
|
1121
|
+
movdqu 0x70($inp), @XMM[7]
|
|
1122
|
+
mov $key, %rax # pass the $key
|
|
1123
|
+
lea 0x80($inp), $inp
|
|
1124
|
+
mov \$10,%r10d
|
|
1125
|
+
|
|
1126
|
+
call _bsaes_decrypt8
|
|
1127
|
+
|
|
1128
|
+
movdqu @XMM[0], 0x00($out) # write output
|
|
1129
|
+
movdqu @XMM[1], 0x10($out)
|
|
1130
|
+
movdqu @XMM[6], 0x20($out)
|
|
1131
|
+
movdqu @XMM[4], 0x30($out)
|
|
1132
|
+
movdqu @XMM[2], 0x40($out)
|
|
1133
|
+
movdqu @XMM[7], 0x50($out)
|
|
1134
|
+
movdqu @XMM[3], 0x60($out)
|
|
1135
|
+
movdqu @XMM[5], 0x70($out)
|
|
1136
|
+
lea 0x80($out), $out
|
|
1137
|
+
sub \$0x80,$len
|
|
1138
|
+
ja .Ldec128_loop
|
|
1139
|
+
ret
|
|
1140
|
+
.size bsaes_decrypt_128,.-bsaes_decrypt_128
|
|
1141
|
+
___
|
|
1142
|
+
}
|
|
1143
|
+
{
|
|
1144
|
+
######################################################################
|
|
1145
|
+
#
|
|
1146
|
+
# OpenSSL interface
|
|
1147
|
+
#
|
|
1148
|
+
my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
|
|
1149
|
+
: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
|
|
1150
|
+
my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
|
|
1151
|
+
|
|
1152
|
+
$code.=<<___;
|
|
1153
|
+
.globl bsaes_ctr32_encrypt_blocks
|
|
1154
|
+
.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
|
|
1155
|
+
.align 16
|
|
1156
|
+
bsaes_ctr32_encrypt_blocks:
|
|
1157
|
+
mov %rsp, %rax
|
|
1158
|
+
.Lctr_enc_prologue:
|
|
1159
|
+
push %rbp
|
|
1160
|
+
push %rbx
|
|
1161
|
+
push %r12
|
|
1162
|
+
push %r13
|
|
1163
|
+
push %r14
|
|
1164
|
+
push %r15
|
|
1165
|
+
lea -0x48(%rsp), %rsp
|
|
1166
|
+
___
|
|
1167
|
+
$code.=<<___ if ($win64);
|
|
1168
|
+
mov 0xa0(%rsp),$arg5 # pull ivp
|
|
1169
|
+
lea -0xa0(%rsp), %rsp
|
|
1170
|
+
movaps %xmm6, 0x40(%rsp)
|
|
1171
|
+
movaps %xmm7, 0x50(%rsp)
|
|
1172
|
+
movaps %xmm8, 0x60(%rsp)
|
|
1173
|
+
movaps %xmm9, 0x70(%rsp)
|
|
1174
|
+
movaps %xmm10, 0x80(%rsp)
|
|
1175
|
+
movaps %xmm11, 0x90(%rsp)
|
|
1176
|
+
movaps %xmm12, 0xa0(%rsp)
|
|
1177
|
+
movaps %xmm13, 0xb0(%rsp)
|
|
1178
|
+
movaps %xmm14, 0xc0(%rsp)
|
|
1179
|
+
movaps %xmm15, 0xd0(%rsp)
|
|
1180
|
+
.Lctr_enc_body:
|
|
1181
|
+
___
|
|
1182
|
+
$code.=<<___;
|
|
1183
|
+
mov %rsp, %rbp # backup %rsp
|
|
1184
|
+
movdqu ($arg5), %xmm0 # load counter
|
|
1185
|
+
mov 240($arg4), %eax # rounds
|
|
1186
|
+
mov $arg1, $inp # backup arguments
|
|
1187
|
+
mov $arg2, $out
|
|
1188
|
+
mov $arg3, $len
|
|
1189
|
+
mov $arg4, $key
|
|
1190
|
+
movdqa %xmm0, 0x20(%rbp) # copy counter
|
|
1191
|
+
cmp \$8, $arg3
|
|
1192
|
+
jb .Lctr_enc_short
|
|
1193
|
+
|
|
1194
|
+
mov %eax, %ebx # rounds
|
|
1195
|
+
shl \$7, %rax # 128 bytes per inner round key
|
|
1196
|
+
sub \$`128-32`, %rax # size of bit-sliced key schedule
|
|
1197
|
+
sub %rax, %rsp
|
|
1198
|
+
|
|
1199
|
+
mov %rsp, %rax # pass key schedule
|
|
1200
|
+
mov $key, %rcx # pass key
|
|
1201
|
+
mov %ebx, %r10d # pass rounds
|
|
1202
|
+
call _bsaes_key_convert
|
|
1203
|
+
pxor %xmm6,%xmm7 # fix up last round key
|
|
1204
|
+
movdqa %xmm7,(%rax) # save last round key
|
|
1205
|
+
|
|
1206
|
+
movdqa (%rsp), @XMM[9] # load round0 key
|
|
1207
|
+
lea .LADD1(%rip), %r11
|
|
1208
|
+
movdqa 0x20(%rbp), @XMM[0] # counter copy
|
|
1209
|
+
movdqa -0x20(%r11), @XMM[8] # .LSWPUP
|
|
1210
|
+
pshufb @XMM[8], @XMM[9] # byte swap upper part
|
|
1211
|
+
pshufb @XMM[8], @XMM[0]
|
|
1212
|
+
movdqa @XMM[9], (%rsp) # save adjusted round0 key
|
|
1213
|
+
jmp .Lctr_enc_loop
|
|
1214
|
+
.align 16
|
|
1215
|
+
.Lctr_enc_loop:
|
|
1216
|
+
movdqa @XMM[0], 0x20(%rbp) # save counter
|
|
1217
|
+
movdqa @XMM[0], @XMM[1] # prepare 8 counter values
|
|
1218
|
+
movdqa @XMM[0], @XMM[2]
|
|
1219
|
+
paddd 0x00(%r11), @XMM[1] # .LADD1
|
|
1220
|
+
movdqa @XMM[0], @XMM[3]
|
|
1221
|
+
paddd 0x10(%r11), @XMM[2] # .LADD2
|
|
1222
|
+
movdqa @XMM[0], @XMM[4]
|
|
1223
|
+
paddd 0x20(%r11), @XMM[3] # .LADD3
|
|
1224
|
+
movdqa @XMM[0], @XMM[5]
|
|
1225
|
+
paddd 0x30(%r11), @XMM[4] # .LADD4
|
|
1226
|
+
movdqa @XMM[0], @XMM[6]
|
|
1227
|
+
paddd 0x40(%r11), @XMM[5] # .LADD5
|
|
1228
|
+
movdqa @XMM[0], @XMM[7]
|
|
1229
|
+
paddd 0x50(%r11), @XMM[6] # .LADD6
|
|
1230
|
+
paddd 0x60(%r11), @XMM[7] # .LADD7
|
|
1231
|
+
|
|
1232
|
+
# Borrow prologue from _bsaes_encrypt8 to use the opportunity
|
|
1233
|
+
# to flip byte order in 32-bit counter
|
|
1234
|
+
movdqa (%rsp), @XMM[9] # round 0 key
|
|
1235
|
+
lea 0x10(%rsp), %rax # pass key schedule
|
|
1236
|
+
movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
|
|
1237
|
+
pxor @XMM[9], @XMM[0] # xor with round0 key
|
|
1238
|
+
pxor @XMM[9], @XMM[1]
|
|
1239
|
+
pxor @XMM[9], @XMM[2]
|
|
1240
|
+
pxor @XMM[9], @XMM[3]
|
|
1241
|
+
pshufb @XMM[8], @XMM[0]
|
|
1242
|
+
pshufb @XMM[8], @XMM[1]
|
|
1243
|
+
pxor @XMM[9], @XMM[4]
|
|
1244
|
+
pxor @XMM[9], @XMM[5]
|
|
1245
|
+
pshufb @XMM[8], @XMM[2]
|
|
1246
|
+
pshufb @XMM[8], @XMM[3]
|
|
1247
|
+
pxor @XMM[9], @XMM[6]
|
|
1248
|
+
pxor @XMM[9], @XMM[7]
|
|
1249
|
+
pshufb @XMM[8], @XMM[4]
|
|
1250
|
+
pshufb @XMM[8], @XMM[5]
|
|
1251
|
+
pshufb @XMM[8], @XMM[6]
|
|
1252
|
+
pshufb @XMM[8], @XMM[7]
	lea .LBS0(%rip), %r11 # constants table
	mov %ebx,%r10d # pass rounds

	call _bsaes_encrypt8_bitslice

	sub \$8,$len
	jc .Lctr_enc_loop_done

	movdqu 0x00($inp), @XMM[8] # load input
	movdqu 0x10($inp), @XMM[9]
	movdqu 0x20($inp), @XMM[10]
	movdqu 0x30($inp), @XMM[11]
	movdqu 0x40($inp), @XMM[12]
	movdqu 0x50($inp), @XMM[13]
	movdqu 0x60($inp), @XMM[14]
	movdqu 0x70($inp), @XMM[15]
	lea 0x80($inp),$inp
	pxor @XMM[0], @XMM[8]
	movdqa 0x20(%rbp), @XMM[0] # load counter
	pxor @XMM[9], @XMM[1]
	movdqu @XMM[8], 0x00($out) # write output
	pxor @XMM[10], @XMM[4]
	movdqu @XMM[1], 0x10($out)
	pxor @XMM[11], @XMM[6]
	movdqu @XMM[4], 0x20($out)
	pxor @XMM[12], @XMM[3]
	movdqu @XMM[6], 0x30($out)
	pxor @XMM[13], @XMM[7]
	movdqu @XMM[3], 0x40($out)
	pxor @XMM[14], @XMM[2]
	movdqu @XMM[7], 0x50($out)
	pxor @XMM[15], @XMM[5]
	movdqu @XMM[2], 0x60($out)
	lea .LADD1(%rip), %r11
	movdqu @XMM[5], 0x70($out)
	lea 0x80($out), $out
	paddd 0x70(%r11), @XMM[0] # .LADD8
	jnz .Lctr_enc_loop

	jmp .Lctr_enc_done
.align 16
.Lctr_enc_loop_done:
	add \$8, $len
	movdqu 0x00($inp), @XMM[8] # load input
	pxor @XMM[8], @XMM[0]
	movdqu @XMM[0], 0x00($out) # write output
	cmp \$2,$len
	jb .Lctr_enc_done
	movdqu 0x10($inp), @XMM[9]
	pxor @XMM[9], @XMM[1]
	movdqu @XMM[1], 0x10($out)
	je .Lctr_enc_done
	movdqu 0x20($inp), @XMM[10]
	pxor @XMM[10], @XMM[4]
	movdqu @XMM[4], 0x20($out)
	cmp \$4,$len
	jb .Lctr_enc_done
	movdqu 0x30($inp), @XMM[11]
	pxor @XMM[11], @XMM[6]
	movdqu @XMM[6], 0x30($out)
	je .Lctr_enc_done
	movdqu 0x40($inp), @XMM[12]
	pxor @XMM[12], @XMM[3]
	movdqu @XMM[3], 0x40($out)
	cmp \$6,$len
	jb .Lctr_enc_done
	movdqu 0x50($inp), @XMM[13]
	pxor @XMM[13], @XMM[7]
	movdqu @XMM[7], 0x50($out)
	je .Lctr_enc_done
	movdqu 0x60($inp), @XMM[14]
	pxor @XMM[14], @XMM[2]
	movdqu @XMM[2], 0x60($out)
	jmp .Lctr_enc_done
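	# This tail consumes the 1..7 blocks left over when $len is not a
	# multiple of 8.  Keystream lanes are used in the order
	# @XMM[0],1,4,6,3,7,2,5 - the register permutation in which
	# _bsaes_encrypt8_bitslice delivers its eight outputs, as the
	# store sequence in the main loop above also shows.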

.align 16
.Lctr_enc_short:
	lea 0x20(%rbp), $arg1
	lea 0x30(%rbp), $arg2
	lea ($key), $arg3
	call asm_AES_encrypt
	movdqu ($inp), @XMM[1]
	lea 16($inp), $inp
	mov 0x2c(%rbp), %eax # load 32-bit counter
	bswap %eax
	pxor 0x30(%rbp), @XMM[1]
	inc %eax # increment
	movdqu @XMM[1], ($out)
	bswap %eax
	lea 16($out), $out
	mov %eax, 0x2c(%rsp) # save 32-bit counter
	dec $len
	jnz .Lctr_enc_short
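	# Short inputs (fewer than 8 blocks) never build the bit-sliced
	# key schedule: each iteration runs the table-based
	# asm_AES_encrypt on the counter block kept at 0x20(%rbp), xors
	# the result into one block of input, and steps the big-endian
	# counter word with a bswap/inc/bswap round-trip.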

.Lctr_enc_done:
	lea (%rsp), %rax
	pxor %xmm0, %xmm0
.Lctr_enc_bzero: # wipe key schedule [if any]
	movdqa %xmm0, 0x00(%rax)
	movdqa %xmm0, 0x10(%rax)
	lea 0x20(%rax), %rax
	cmp %rax, %rbp
	ja .Lctr_enc_bzero

	lea (%rbp),%rsp # restore %rsp
___
$code.=<<___ if ($win64);
	movaps 0x40(%rbp), %xmm6
	movaps 0x50(%rbp), %xmm7
	movaps 0x60(%rbp), %xmm8
	movaps 0x70(%rbp), %xmm9
	movaps 0x80(%rbp), %xmm10
	movaps 0x90(%rbp), %xmm11
	movaps 0xa0(%rbp), %xmm12
	movaps 0xb0(%rbp), %xmm13
	movaps 0xc0(%rbp), %xmm14
	movaps 0xd0(%rbp), %xmm15
	lea 0xa0(%rbp), %rsp
___
$code.=<<___;
	mov 0x48(%rsp), %r15
	mov 0x50(%rsp), %r14
	mov 0x58(%rsp), %r13
	mov 0x60(%rsp), %r12
	mov 0x68(%rsp), %rbx
	mov 0x70(%rsp), %rax
	lea 0x78(%rsp), %rsp
	mov %rax, %rbp
.Lctr_enc_epilogue:
	ret
.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
}
$code.=<<___;
.type _bsaes_const,\@object
.align 64
_bsaes_const:
.LM0ISR: # InvShiftRows constants
	.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
.LISRM0:
	.quad 0x01040b0e0205080f, 0x0306090c00070a0d
.LISR:
	.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
.LBS0: # bit-slice constants
	.quad 0x5555555555555555, 0x5555555555555555
.LBS1:
	.quad 0x3333333333333333, 0x3333333333333333
.LBS2:
	.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.LSR: # shiftrows constants
	.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
	.quad 0x0304090e00050a0f, 0x01060b0c0207080d
.LM0SR:
	.quad 0x0a0e02060f03070b, 0x0004080c05090d01
.LSWPUP: # byte-swap upper dword
	.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
.LSWPUPM0SR:
	.quad 0x0a0d02060c03070b, 0x0004080f05090e01
.LADD1: # counter increment constants
	.quad 0x0000000000000000, 0x0000000100000000
.LADD2:
	.quad 0x0000000000000000, 0x0000000200000000
.LADD3:
	.quad 0x0000000000000000, 0x0000000300000000
.LADD4:
	.quad 0x0000000000000000, 0x0000000400000000
.LADD5:
	.quad 0x0000000000000000, 0x0000000500000000
.LADD6:
	.quad 0x0000000000000000, 0x0000000600000000
.LADD7:
	.quad 0x0000000000000000, 0x0000000700000000
.LADD8:
	.quad 0x0000000000000000, 0x0000000800000000
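	# Each .LADDn above places n in the top 32-bit lane of an xmm
	# register, i.e. the dword holding the (byte-swapped) block
	# counter, so a single paddd advances the counter by n while
	# leaving the other three dwords of the IV untouched.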
.Lmasks:
	.quad 0x0101010101010101, 0x0101010101010101
	.quad 0x0202020202020202, 0x0202020202020202
	.quad 0x0404040404040404, 0x0404040404040404
	.quad 0x0808080808080808, 0x0808080808080808
.LM0:
	.quad 0x02060a0e03070b0f, 0x0004080c0105090d
.L63:
	.quad 0x6363636363636363, 0x6363636363636363
.asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
.align 64
.size _bsaes_const,.-_bsaes_const
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
	push %rsi
	push %rdi
	push %rbx
	push %rbp
	push %r12
	push %r13
	push %r14
	push %r15
	pushfq
	sub \$64,%rsp

	mov 120($context),%rax # pull context->Rax
	mov 248($context),%rbx # pull context->Rip

	mov 8($disp),%rsi # disp->ImageBase
	mov 56($disp),%r11 # disp->HandlerData

	mov 0(%r11),%r10d # HandlerData[0]
	lea (%rsi,%r10),%r10 # prologue label
	cmp %r10,%rbx # context->Rip<prologue label
	jb .Lin_prologue

	mov 152($context),%rax # pull context->Rsp

	mov 4(%r11),%r10d # HandlerData[1]
	lea (%rsi,%r10),%r10 # epilogue label
	cmp %r10,%rbx # context->Rip>=epilogue label
	jae .Lin_prologue

	mov 160($context),%rax # pull context->Rbp

	lea 0x40(%rax),%rsi # %xmm save area
	lea 512($context),%rdi # &context.Xmm6
	mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
	.long 0xa548f3fc # cld; rep movsq
	lea 0xa0(%rax),%rax # adjust stack pointer

	mov 0x70(%rax),%rbp
	mov 0x68(%rax),%rbx
	mov 0x60(%rax),%r12
	mov 0x58(%rax),%r13
	mov 0x50(%rax),%r14
	mov 0x48(%rax),%r15
	lea 0x78(%rax),%rax # adjust stack pointer
	mov %rbx,144($context) # restore context->Rbx
	mov %rbp,160($context) # restore context->Rbp
	mov %r12,216($context) # restore context->R12
	mov %r13,224($context) # restore context->R13
	mov %r14,232($context) # restore context->R14
	mov %r15,240($context) # restore context->R15

.Lin_prologue:
	mov %rax,152($context) # restore context->Rsp

	mov 40($disp),%rdi # disp->ContextRecord
	mov $context,%rsi # context
	mov \$`1232/8`,%ecx # sizeof(CONTEXT)
	.long 0xa548f3fc # cld; rep movsq

	mov $disp,%rsi
	xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
	mov 8(%rsi),%rdx # arg2, disp->ImageBase
	mov 0(%rsi),%r8 # arg3, disp->ControlPc
	mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
	mov 40(%rsi),%r10 # disp->ContextRecord
	lea 56(%rsi),%r11 # &disp->HandlerData
	lea 24(%rsi),%r12 # &disp->EstablisherFrame
	mov %r10,32(%rsp) # arg5
	mov %r11,40(%rsp) # arg6
	mov %r12,48(%rsp) # arg7
	mov %rcx,56(%rsp) # arg8, (NULL)
	call *__imp_RtlVirtualUnwind(%rip)

	mov \$1,%eax # ExceptionContinueSearch
	add \$64,%rsp
	popfq
	pop %r15
	pop %r14
	pop %r13
	pop %r12
	pop %rbp
	pop %rbx
	pop %rdi
	pop %rsi
	ret
.size se_handler,.-se_handler
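	# se_handler checks the faulting RIP against the
	# [.Lctr_enc_body, .Lctr_enc_epilogue) window from HandlerData[]
	# (see .Lctr_enc_info below): inside it, the saved %xmm6-%xmm15
	# and non-volatile GPRs are recovered from the frame anchored at
	# context->Rbp and written back into the CONTEXT record before
	# RtlVirtualUnwind resumes the normal unwind.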

.section .pdata
.align 4
___
$code.=<<___;
	.rva .Lctr_enc_prologue
	.rva .Lctr_enc_epilogue
	.rva .Lctr_enc_info

.section .xdata
.align 8
___
$code.=<<___;
.Lctr_enc_info:
	.byte 9,0,0,0
	.rva se_handler
	.rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
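	# A minimal UNWIND_INFO record: .byte 9,0,0,0 encodes version 1
	# with UNW_FLAG_EHANDLER set and no unwind codes, so Windows
	# routes unwinds for this function through se_handler; the final
	# two .rva entries form the HandlerData[] window it tests.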
___
}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;
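# Standard perlasm constant folding: every backtick-quoted expression
# still embedded in $code (e.g. `128-32` or `1232/8` above) is
# evaluated by Perl and replaced with its numeric value before the
# finished assembly is emitted on stdout.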

print $code;

close STDOUT;