ring-native 0.0.0 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/CHANGES.md +7 -0
- data/Makefile +5 -0
- data/README.md +12 -5
- data/Rakefile +4 -0
- data/ext/ring/extconf.rb +4 -5
- data/lib/ring/native.rb +3 -1
- data/lib/ring/native/version.rb +5 -1
- data/ring-native.gemspec +6 -6
- data/vendor/ring-ffi/Cargo.lock +26 -0
- data/vendor/ring-ffi/Cargo.toml +45 -0
- data/vendor/ring-ffi/LICENSE +16 -0
- data/vendor/ring-ffi/README.md +59 -0
- data/vendor/ring-ffi/src/lib.rs +79 -0
- metadata +10 -255
- data/vendor/ring/BUILDING.md +0 -40
- data/vendor/ring/Cargo.toml +0 -43
- data/vendor/ring/LICENSE +0 -185
- data/vendor/ring/Makefile +0 -35
- data/vendor/ring/PORTING.md +0 -163
- data/vendor/ring/README.md +0 -113
- data/vendor/ring/STYLE.md +0 -197
- data/vendor/ring/appveyor.yml +0 -27
- data/vendor/ring/build.rs +0 -108
- data/vendor/ring/crypto/aes/aes.c +0 -1142
- data/vendor/ring/crypto/aes/aes_test.Windows.vcxproj +0 -25
- data/vendor/ring/crypto/aes/aes_test.cc +0 -93
- data/vendor/ring/crypto/aes/asm/aes-586.pl +0 -2368
- data/vendor/ring/crypto/aes/asm/aes-armv4.pl +0 -1249
- data/vendor/ring/crypto/aes/asm/aes-x86_64.pl +0 -2246
- data/vendor/ring/crypto/aes/asm/aesni-x86.pl +0 -1318
- data/vendor/ring/crypto/aes/asm/aesni-x86_64.pl +0 -2084
- data/vendor/ring/crypto/aes/asm/aesv8-armx.pl +0 -675
- data/vendor/ring/crypto/aes/asm/bsaes-armv7.pl +0 -1364
- data/vendor/ring/crypto/aes/asm/bsaes-x86_64.pl +0 -1565
- data/vendor/ring/crypto/aes/asm/vpaes-x86.pl +0 -841
- data/vendor/ring/crypto/aes/asm/vpaes-x86_64.pl +0 -1116
- data/vendor/ring/crypto/aes/internal.h +0 -87
- data/vendor/ring/crypto/aes/mode_wrappers.c +0 -61
- data/vendor/ring/crypto/bn/add.c +0 -394
- data/vendor/ring/crypto/bn/asm/armv4-mont.pl +0 -694
- data/vendor/ring/crypto/bn/asm/armv8-mont.pl +0 -1503
- data/vendor/ring/crypto/bn/asm/bn-586.pl +0 -774
- data/vendor/ring/crypto/bn/asm/co-586.pl +0 -287
- data/vendor/ring/crypto/bn/asm/rsaz-avx2.pl +0 -1882
- data/vendor/ring/crypto/bn/asm/x86-mont.pl +0 -592
- data/vendor/ring/crypto/bn/asm/x86_64-gcc.c +0 -599
- data/vendor/ring/crypto/bn/asm/x86_64-mont.pl +0 -1393
- data/vendor/ring/crypto/bn/asm/x86_64-mont5.pl +0 -3507
- data/vendor/ring/crypto/bn/bn.c +0 -352
- data/vendor/ring/crypto/bn/bn_asn1.c +0 -74
- data/vendor/ring/crypto/bn/bn_test.Windows.vcxproj +0 -25
- data/vendor/ring/crypto/bn/bn_test.cc +0 -1696
- data/vendor/ring/crypto/bn/cmp.c +0 -200
- data/vendor/ring/crypto/bn/convert.c +0 -433
- data/vendor/ring/crypto/bn/ctx.c +0 -311
- data/vendor/ring/crypto/bn/div.c +0 -594
- data/vendor/ring/crypto/bn/exponentiation.c +0 -1335
- data/vendor/ring/crypto/bn/gcd.c +0 -711
- data/vendor/ring/crypto/bn/generic.c +0 -1019
- data/vendor/ring/crypto/bn/internal.h +0 -316
- data/vendor/ring/crypto/bn/montgomery.c +0 -516
- data/vendor/ring/crypto/bn/mul.c +0 -888
- data/vendor/ring/crypto/bn/prime.c +0 -829
- data/vendor/ring/crypto/bn/random.c +0 -334
- data/vendor/ring/crypto/bn/rsaz_exp.c +0 -262
- data/vendor/ring/crypto/bn/rsaz_exp.h +0 -53
- data/vendor/ring/crypto/bn/shift.c +0 -276
- data/vendor/ring/crypto/bytestring/bytestring_test.Windows.vcxproj +0 -25
- data/vendor/ring/crypto/bytestring/bytestring_test.cc +0 -421
- data/vendor/ring/crypto/bytestring/cbb.c +0 -399
- data/vendor/ring/crypto/bytestring/cbs.c +0 -227
- data/vendor/ring/crypto/bytestring/internal.h +0 -46
- data/vendor/ring/crypto/chacha/chacha_generic.c +0 -140
- data/vendor/ring/crypto/chacha/chacha_vec.c +0 -323
- data/vendor/ring/crypto/chacha/chacha_vec_arm.S +0 -1447
- data/vendor/ring/crypto/chacha/chacha_vec_arm_generate.go +0 -153
- data/vendor/ring/crypto/cipher/cipher_test.Windows.vcxproj +0 -25
- data/vendor/ring/crypto/cipher/e_aes.c +0 -390
- data/vendor/ring/crypto/cipher/e_chacha20poly1305.c +0 -208
- data/vendor/ring/crypto/cipher/internal.h +0 -173
- data/vendor/ring/crypto/cipher/test/aes_128_gcm_tests.txt +0 -543
- data/vendor/ring/crypto/cipher/test/aes_128_key_wrap_tests.txt +0 -9
- data/vendor/ring/crypto/cipher/test/aes_256_gcm_tests.txt +0 -475
- data/vendor/ring/crypto/cipher/test/aes_256_key_wrap_tests.txt +0 -23
- data/vendor/ring/crypto/cipher/test/chacha20_poly1305_old_tests.txt +0 -422
- data/vendor/ring/crypto/cipher/test/chacha20_poly1305_tests.txt +0 -484
- data/vendor/ring/crypto/cipher/test/cipher_test.txt +0 -100
- data/vendor/ring/crypto/constant_time_test.Windows.vcxproj +0 -25
- data/vendor/ring/crypto/constant_time_test.c +0 -304
- data/vendor/ring/crypto/cpu-arm-asm.S +0 -32
- data/vendor/ring/crypto/cpu-arm.c +0 -199
- data/vendor/ring/crypto/cpu-intel.c +0 -261
- data/vendor/ring/crypto/crypto.c +0 -151
- data/vendor/ring/crypto/curve25519/asm/x25519-arm.S +0 -2118
- data/vendor/ring/crypto/curve25519/curve25519.c +0 -4888
- data/vendor/ring/crypto/curve25519/x25519_test.cc +0 -128
- data/vendor/ring/crypto/digest/md32_common.h +0 -181
- data/vendor/ring/crypto/ec/asm/p256-x86_64-asm.pl +0 -2725
- data/vendor/ring/crypto/ec/ec.c +0 -193
- data/vendor/ring/crypto/ec/ec_curves.c +0 -61
- data/vendor/ring/crypto/ec/ec_key.c +0 -228
- data/vendor/ring/crypto/ec/ec_montgomery.c +0 -114
- data/vendor/ring/crypto/ec/example_mul.Windows.vcxproj +0 -25
- data/vendor/ring/crypto/ec/internal.h +0 -243
- data/vendor/ring/crypto/ec/oct.c +0 -253
- data/vendor/ring/crypto/ec/p256-64.c +0 -1794
- data/vendor/ring/crypto/ec/p256-x86_64-table.h +0 -9548
- data/vendor/ring/crypto/ec/p256-x86_64.c +0 -509
- data/vendor/ring/crypto/ec/simple.c +0 -1007
- data/vendor/ring/crypto/ec/util-64.c +0 -183
- data/vendor/ring/crypto/ec/wnaf.c +0 -508
- data/vendor/ring/crypto/ecdh/ecdh.c +0 -155
- data/vendor/ring/crypto/ecdsa/ecdsa.c +0 -304
- data/vendor/ring/crypto/ecdsa/ecdsa_asn1.c +0 -193
- data/vendor/ring/crypto/ecdsa/ecdsa_test.Windows.vcxproj +0 -25
- data/vendor/ring/crypto/ecdsa/ecdsa_test.cc +0 -327
- data/vendor/ring/crypto/header_removed.h +0 -17
- data/vendor/ring/crypto/internal.h +0 -495
- data/vendor/ring/crypto/libring.Windows.vcxproj +0 -101
- data/vendor/ring/crypto/mem.c +0 -98
- data/vendor/ring/crypto/modes/asm/aesni-gcm-x86_64.pl +0 -1045
- data/vendor/ring/crypto/modes/asm/ghash-armv4.pl +0 -517
- data/vendor/ring/crypto/modes/asm/ghash-x86.pl +0 -1393
- data/vendor/ring/crypto/modes/asm/ghash-x86_64.pl +0 -1741
- data/vendor/ring/crypto/modes/asm/ghashv8-armx.pl +0 -422
- data/vendor/ring/crypto/modes/ctr.c +0 -226
- data/vendor/ring/crypto/modes/gcm.c +0 -1206
- data/vendor/ring/crypto/modes/gcm_test.Windows.vcxproj +0 -25
- data/vendor/ring/crypto/modes/gcm_test.c +0 -348
- data/vendor/ring/crypto/modes/internal.h +0 -299
- data/vendor/ring/crypto/perlasm/arm-xlate.pl +0 -170
- data/vendor/ring/crypto/perlasm/readme +0 -100
- data/vendor/ring/crypto/perlasm/x86_64-xlate.pl +0 -1164
- data/vendor/ring/crypto/perlasm/x86asm.pl +0 -292
- data/vendor/ring/crypto/perlasm/x86gas.pl +0 -263
- data/vendor/ring/crypto/perlasm/x86masm.pl +0 -200
- data/vendor/ring/crypto/perlasm/x86nasm.pl +0 -187
- data/vendor/ring/crypto/poly1305/poly1305.c +0 -331
- data/vendor/ring/crypto/poly1305/poly1305_arm.c +0 -301
- data/vendor/ring/crypto/poly1305/poly1305_arm_asm.S +0 -2015
- data/vendor/ring/crypto/poly1305/poly1305_test.Windows.vcxproj +0 -25
- data/vendor/ring/crypto/poly1305/poly1305_test.cc +0 -80
- data/vendor/ring/crypto/poly1305/poly1305_test.txt +0 -52
- data/vendor/ring/crypto/poly1305/poly1305_vec.c +0 -892
- data/vendor/ring/crypto/rand/asm/rdrand-x86_64.pl +0 -75
- data/vendor/ring/crypto/rand/internal.h +0 -32
- data/vendor/ring/crypto/rand/rand.c +0 -189
- data/vendor/ring/crypto/rand/urandom.c +0 -219
- data/vendor/ring/crypto/rand/windows.c +0 -56
- data/vendor/ring/crypto/refcount_c11.c +0 -66
- data/vendor/ring/crypto/refcount_lock.c +0 -53
- data/vendor/ring/crypto/refcount_test.Windows.vcxproj +0 -25
- data/vendor/ring/crypto/refcount_test.c +0 -58
- data/vendor/ring/crypto/rsa/blinding.c +0 -462
- data/vendor/ring/crypto/rsa/internal.h +0 -108
- data/vendor/ring/crypto/rsa/padding.c +0 -300
- data/vendor/ring/crypto/rsa/rsa.c +0 -450
- data/vendor/ring/crypto/rsa/rsa_asn1.c +0 -261
- data/vendor/ring/crypto/rsa/rsa_impl.c +0 -944
- data/vendor/ring/crypto/rsa/rsa_test.Windows.vcxproj +0 -25
- data/vendor/ring/crypto/rsa/rsa_test.cc +0 -437
- data/vendor/ring/crypto/sha/asm/sha-armv8.pl +0 -436
- data/vendor/ring/crypto/sha/asm/sha-x86_64.pl +0 -2390
- data/vendor/ring/crypto/sha/asm/sha256-586.pl +0 -1275
- data/vendor/ring/crypto/sha/asm/sha256-armv4.pl +0 -735
- data/vendor/ring/crypto/sha/asm/sha256-armv8.pl +0 -14
- data/vendor/ring/crypto/sha/asm/sha256-x86_64.pl +0 -14
- data/vendor/ring/crypto/sha/asm/sha512-586.pl +0 -911
- data/vendor/ring/crypto/sha/asm/sha512-armv4.pl +0 -666
- data/vendor/ring/crypto/sha/asm/sha512-armv8.pl +0 -14
- data/vendor/ring/crypto/sha/asm/sha512-x86_64.pl +0 -14
- data/vendor/ring/crypto/sha/sha1.c +0 -271
- data/vendor/ring/crypto/sha/sha256.c +0 -204
- data/vendor/ring/crypto/sha/sha512.c +0 -355
- data/vendor/ring/crypto/test/file_test.cc +0 -326
- data/vendor/ring/crypto/test/file_test.h +0 -181
- data/vendor/ring/crypto/test/malloc.cc +0 -150
- data/vendor/ring/crypto/test/scoped_types.h +0 -95
- data/vendor/ring/crypto/test/test.Windows.vcxproj +0 -35
- data/vendor/ring/crypto/test/test_util.cc +0 -46
- data/vendor/ring/crypto/test/test_util.h +0 -41
- data/vendor/ring/crypto/thread_none.c +0 -55
- data/vendor/ring/crypto/thread_pthread.c +0 -165
- data/vendor/ring/crypto/thread_test.Windows.vcxproj +0 -25
- data/vendor/ring/crypto/thread_test.c +0 -200
- data/vendor/ring/crypto/thread_win.c +0 -282
- data/vendor/ring/examples/checkdigest.rs +0 -103
- data/vendor/ring/include/openssl/aes.h +0 -121
- data/vendor/ring/include/openssl/arm_arch.h +0 -129
- data/vendor/ring/include/openssl/base.h +0 -156
- data/vendor/ring/include/openssl/bn.h +0 -794
- data/vendor/ring/include/openssl/buffer.h +0 -18
- data/vendor/ring/include/openssl/bytestring.h +0 -235
- data/vendor/ring/include/openssl/chacha.h +0 -37
- data/vendor/ring/include/openssl/cmac.h +0 -76
- data/vendor/ring/include/openssl/cpu.h +0 -184
- data/vendor/ring/include/openssl/crypto.h +0 -43
- data/vendor/ring/include/openssl/curve25519.h +0 -88
- data/vendor/ring/include/openssl/ec.h +0 -225
- data/vendor/ring/include/openssl/ec_key.h +0 -129
- data/vendor/ring/include/openssl/ecdh.h +0 -110
- data/vendor/ring/include/openssl/ecdsa.h +0 -156
- data/vendor/ring/include/openssl/err.h +0 -201
- data/vendor/ring/include/openssl/mem.h +0 -101
- data/vendor/ring/include/openssl/obj_mac.h +0 -71
- data/vendor/ring/include/openssl/opensslfeatures.h +0 -68
- data/vendor/ring/include/openssl/opensslv.h +0 -18
- data/vendor/ring/include/openssl/ossl_typ.h +0 -18
- data/vendor/ring/include/openssl/poly1305.h +0 -51
- data/vendor/ring/include/openssl/rand.h +0 -70
- data/vendor/ring/include/openssl/rsa.h +0 -399
- data/vendor/ring/include/openssl/thread.h +0 -133
- data/vendor/ring/include/openssl/type_check.h +0 -71
- data/vendor/ring/mk/Common.props +0 -63
- data/vendor/ring/mk/Windows.props +0 -42
- data/vendor/ring/mk/WindowsTest.props +0 -18
- data/vendor/ring/mk/appveyor.bat +0 -62
- data/vendor/ring/mk/bottom_of_makefile.mk +0 -54
- data/vendor/ring/mk/ring.mk +0 -266
- data/vendor/ring/mk/top_of_makefile.mk +0 -214
- data/vendor/ring/mk/travis.sh +0 -40
- data/vendor/ring/mk/update-travis-yml.py +0 -229
- data/vendor/ring/ring.sln +0 -153
- data/vendor/ring/src/aead.rs +0 -682
- data/vendor/ring/src/agreement.rs +0 -248
- data/vendor/ring/src/c.rs +0 -129
- data/vendor/ring/src/constant_time.rs +0 -37
- data/vendor/ring/src/der.rs +0 -96
- data/vendor/ring/src/digest.rs +0 -690
- data/vendor/ring/src/digest_tests.txt +0 -57
- data/vendor/ring/src/ecc.rs +0 -28
- data/vendor/ring/src/ecc_build.rs +0 -279
- data/vendor/ring/src/ecc_curves.rs +0 -117
- data/vendor/ring/src/ed25519_tests.txt +0 -2579
- data/vendor/ring/src/exe_tests.rs +0 -46
- data/vendor/ring/src/ffi.rs +0 -29
- data/vendor/ring/src/file_test.rs +0 -187
- data/vendor/ring/src/hkdf.rs +0 -153
- data/vendor/ring/src/hkdf_tests.txt +0 -59
- data/vendor/ring/src/hmac.rs +0 -414
- data/vendor/ring/src/hmac_tests.txt +0 -97
- data/vendor/ring/src/input.rs +0 -312
- data/vendor/ring/src/lib.rs +0 -41
- data/vendor/ring/src/pbkdf2.rs +0 -265
- data/vendor/ring/src/pbkdf2_tests.txt +0 -113
- data/vendor/ring/src/polyfill.rs +0 -57
- data/vendor/ring/src/rand.rs +0 -28
- data/vendor/ring/src/signature.rs +0 -314
- data/vendor/ring/third-party/NIST/README.md +0 -9
- data/vendor/ring/third-party/NIST/SHAVS/SHA1LongMsg.rsp +0 -263
- data/vendor/ring/third-party/NIST/SHAVS/SHA1Monte.rsp +0 -309
- data/vendor/ring/third-party/NIST/SHAVS/SHA1ShortMsg.rsp +0 -267
- data/vendor/ring/third-party/NIST/SHAVS/SHA224LongMsg.rsp +0 -263
- data/vendor/ring/third-party/NIST/SHAVS/SHA224Monte.rsp +0 -309
- data/vendor/ring/third-party/NIST/SHAVS/SHA224ShortMsg.rsp +0 -267
- data/vendor/ring/third-party/NIST/SHAVS/SHA256LongMsg.rsp +0 -263
- data/vendor/ring/third-party/NIST/SHAVS/SHA256Monte.rsp +0 -309
- data/vendor/ring/third-party/NIST/SHAVS/SHA256ShortMsg.rsp +0 -267
- data/vendor/ring/third-party/NIST/SHAVS/SHA384LongMsg.rsp +0 -519
- data/vendor/ring/third-party/NIST/SHAVS/SHA384Monte.rsp +0 -309
- data/vendor/ring/third-party/NIST/SHAVS/SHA384ShortMsg.rsp +0 -523
- data/vendor/ring/third-party/NIST/SHAVS/SHA512LongMsg.rsp +0 -519
- data/vendor/ring/third-party/NIST/SHAVS/SHA512Monte.rsp +0 -309
- data/vendor/ring/third-party/NIST/SHAVS/SHA512ShortMsg.rsp +0 -523
- data/vendor/ring/third-party/NIST/sha256sums.txt +0 -1
@@ -1,1741 +0,0 @@
|
|
1
|
-
#!/usr/bin/env perl
|
2
|
-
#
|
3
|
-
# ====================================================================
|
4
|
-
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
5
|
-
# project. The module is, however, dual licensed under OpenSSL and
|
6
|
-
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
7
|
-
# details see http://www.openssl.org/~appro/cryptogams/.
|
8
|
-
# ====================================================================
|
9
|
-
#
|
10
|
-
# March, June 2010
|
11
|
-
#
|
12
|
-
# The module implements "4-bit" GCM GHASH function and underlying
|
13
|
-
# single multiplication operation in GF(2^128). "4-bit" means that
|
14
|
-
# it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
|
15
|
-
# function features so called "528B" variant utilizing additional
|
16
|
-
# 256+16 bytes of per-key storage [+512 bytes shared table].
|
17
|
-
# Performance results are for this streamed GHASH subroutine and are
|
18
|
-
# expressed in cycles per processed byte, less is better:
|
19
|
-
#
|
20
|
-
# gcc 3.4.x(*) assembler
|
21
|
-
#
|
22
|
-
# P4 28.6 14.0 +100%
|
23
|
-
# Opteron 19.3 7.7 +150%
|
24
|
-
# Core2 17.8 8.1(**) +120%
|
25
|
-
# Atom 31.6 16.8 +88%
|
26
|
-
# VIA Nano 21.8 10.1 +115%
|
27
|
-
#
|
28
|
-
# (*) comparison is not completely fair, because C results are
|
29
|
-
# for vanilla "256B" implementation, while assembler results
|
30
|
-
# are for "528B";-)
|
31
|
-
# (**) it's mystery [to me] why Core2 result is not same as for
|
32
|
-
# Opteron;
|
33
|
-
|
34
|
-
# May 2010
|
35
|
-
#
|
36
|
-
# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
|
37
|
-
# See ghash-x86.pl for background information and details about coding
|
38
|
-
# techniques.
|
39
|
-
#
|
40
|
-
# Special thanks to David Woodhouse <dwmw2@infradead.org> for
|
41
|
-
# providing access to a Westmere-based system on behalf of Intel
|
42
|
-
# Open Source Technology Centre.
|
43
|
-
|
44
|
-
# December 2012
|
45
|
-
#
|
46
|
-
# Overhaul: aggregate Karatsuba post-processing, improve ILP in
|
47
|
-
# reduction_alg9, increase reduction aggregate factor to 4x. As for
|
48
|
-
# the latter. ghash-x86.pl discusses that it makes lesser sense to
|
49
|
-
# increase aggregate factor. Then why increase here? Critical path
|
50
|
-
# consists of 3 independent pclmulqdq instructions, Karatsuba post-
|
51
|
-
# processing and reduction. "On top" of this we lay down aggregated
|
52
|
-
# multiplication operations, triplets of independent pclmulqdq's. As
|
53
|
-
# issue rate for pclmulqdq is limited, it makes lesser sense to
|
54
|
-
# aggregate more multiplications than it takes to perform remaining
|
55
|
-
# non-multiplication operations. 2x is near-optimal coefficient for
|
56
|
-
# contemporary Intel CPUs (therefore modest improvement coefficient),
|
57
|
-
# but not for Bulldozer. Latter is because logical SIMD operations
|
58
|
-
# are twice as slow in comparison to Intel, so that critical path is
|
59
|
-
# longer. A CPU with higher pclmulqdq issue rate would also benefit
|
60
|
-
# from higher aggregate factor...
|
61
|
-
#
|
62
|
-
# Westmere 1.78(+13%)
|
63
|
-
# Sandy Bridge 1.80(+8%)
|
64
|
-
# Ivy Bridge 1.80(+7%)
|
65
|
-
# Haswell 0.55(+93%) (if system doesn't support AVX)
|
66
|
-
# Broadwell 0.45(+110%)(if system doesn't support AVX)
|
67
|
-
# Bulldozer 1.49(+27%)
|
68
|
-
# Silvermont 2.88(+13%)
|
69
|
-
|
70
|
-
# March 2013
|
71
|
-
#
|
72
|
-
# ... 8x aggregate factor AVX code path is using reduction algorithm
|
73
|
-
# suggested by Shay Gueron[1]. Even though contemporary AVX-capable
|
74
|
-
# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
|
75
|
-
# sub-optimally in comparison to above mentioned version. But thanks
|
76
|
-
# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
|
77
|
-
# it performs in 0.41 cycles per byte on Haswell processor, and in
|
78
|
-
# 0.29 on Broadwell.
|
79
|
-
#
|
80
|
-
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
|
81
|
-
|
82
|
-
$flavour = shift;
|
83
|
-
$output = shift;
|
84
|
-
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
|
85
|
-
|
86
|
-
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
|
87
|
-
|
88
|
-
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
89
|
-
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
|
90
|
-
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
|
91
|
-
die "can't locate x86_64-xlate.pl";
|
92
|
-
|
93
|
-
# In upstream, this is controlled by shelling out to the compiler to check
|
94
|
-
# versions, but BoringSSL is intended to be used with pre-generated perlasm
|
95
|
-
# output, so this isn't useful anyway.
|
96
|
-
#
|
97
|
-
# TODO(davidben): Enable this after testing. $avx goes up to 2.
|
98
|
-
$avx = 0;
|
99
|
-
|
100
|
-
open OUT,"| \"$^X\" $xlate $flavour $output";
|
101
|
-
*STDOUT=*OUT;
|
102
|
-
|
103
|
-
$do4xaggr=1;
|
104
|
-
|
105
|
-
# common register layout
|
106
|
-
$nlo="%rax";
|
107
|
-
$nhi="%rbx";
|
108
|
-
$Zlo="%r8";
|
109
|
-
$Zhi="%r9";
|
110
|
-
$tmp="%r10";
|
111
|
-
$rem_4bit = "%r11";
|
112
|
-
|
113
|
-
$Xi="%rdi";
|
114
|
-
$Htbl="%rsi";
|
115
|
-
|
116
|
-
# per-function register layout
|
117
|
-
$cnt="%rcx";
|
118
|
-
$rem="%rdx";
|
119
|
-
|
120
|
-
sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or
|
121
|
-
$r =~ s/%[er]([sd]i)/%\1l/ or
|
122
|
-
$r =~ s/%[er](bp)/%\1l/ or
|
123
|
-
$r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; }
|
124
|
-
|
125
|
-
sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
|
126
|
-
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
|
127
|
-
my $arg = pop;
|
128
|
-
$arg = "\$$arg" if ($arg*1 eq $arg);
|
129
|
-
$code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
|
130
|
-
}
|
131
|
-
|
132
|
-
{ my $N;
|
133
|
-
sub loop() {
|
134
|
-
my $inp = shift;
|
135
|
-
|
136
|
-
$N++;
|
137
|
-
$code.=<<___;
|
138
|
-
xor $nlo,$nlo
|
139
|
-
xor $nhi,$nhi
|
140
|
-
mov `&LB("$Zlo")`,`&LB("$nlo")`
|
141
|
-
mov `&LB("$Zlo")`,`&LB("$nhi")`
|
142
|
-
shl \$4,`&LB("$nlo")`
|
143
|
-
mov \$14,$cnt
|
144
|
-
mov 8($Htbl,$nlo),$Zlo
|
145
|
-
mov ($Htbl,$nlo),$Zhi
|
146
|
-
and \$0xf0,`&LB("$nhi")`
|
147
|
-
mov $Zlo,$rem
|
148
|
-
jmp .Loop$N
|
149
|
-
|
150
|
-
.align 16
|
151
|
-
.Loop$N:
|
152
|
-
shr \$4,$Zlo
|
153
|
-
and \$0xf,$rem
|
154
|
-
mov $Zhi,$tmp
|
155
|
-
mov ($inp,$cnt),`&LB("$nlo")`
|
156
|
-
shr \$4,$Zhi
|
157
|
-
xor 8($Htbl,$nhi),$Zlo
|
158
|
-
shl \$60,$tmp
|
159
|
-
xor ($Htbl,$nhi),$Zhi
|
160
|
-
mov `&LB("$nlo")`,`&LB("$nhi")`
|
161
|
-
xor ($rem_4bit,$rem,8),$Zhi
|
162
|
-
mov $Zlo,$rem
|
163
|
-
shl \$4,`&LB("$nlo")`
|
164
|
-
xor $tmp,$Zlo
|
165
|
-
dec $cnt
|
166
|
-
js .Lbreak$N
|
167
|
-
|
168
|
-
shr \$4,$Zlo
|
169
|
-
and \$0xf,$rem
|
170
|
-
mov $Zhi,$tmp
|
171
|
-
shr \$4,$Zhi
|
172
|
-
xor 8($Htbl,$nlo),$Zlo
|
173
|
-
shl \$60,$tmp
|
174
|
-
xor ($Htbl,$nlo),$Zhi
|
175
|
-
and \$0xf0,`&LB("$nhi")`
|
176
|
-
xor ($rem_4bit,$rem,8),$Zhi
|
177
|
-
mov $Zlo,$rem
|
178
|
-
xor $tmp,$Zlo
|
179
|
-
jmp .Loop$N
|
180
|
-
|
181
|
-
.align 16
|
182
|
-
.Lbreak$N:
|
183
|
-
shr \$4,$Zlo
|
184
|
-
and \$0xf,$rem
|
185
|
-
mov $Zhi,$tmp
|
186
|
-
shr \$4,$Zhi
|
187
|
-
xor 8($Htbl,$nlo),$Zlo
|
188
|
-
shl \$60,$tmp
|
189
|
-
xor ($Htbl,$nlo),$Zhi
|
190
|
-
and \$0xf0,`&LB("$nhi")`
|
191
|
-
xor ($rem_4bit,$rem,8),$Zhi
|
192
|
-
mov $Zlo,$rem
|
193
|
-
xor $tmp,$Zlo
|
194
|
-
|
195
|
-
shr \$4,$Zlo
|
196
|
-
and \$0xf,$rem
|
197
|
-
mov $Zhi,$tmp
|
198
|
-
shr \$4,$Zhi
|
199
|
-
xor 8($Htbl,$nhi),$Zlo
|
200
|
-
shl \$60,$tmp
|
201
|
-
xor ($Htbl,$nhi),$Zhi
|
202
|
-
xor $tmp,$Zlo
|
203
|
-
xor ($rem_4bit,$rem,8),$Zhi
|
204
|
-
|
205
|
-
bswap $Zlo
|
206
|
-
bswap $Zhi
|
207
|
-
___
|
208
|
-
}}
|
209
|
-
|
210
|
-
$code=<<___;
|
211
|
-
.text
|
212
|
-
.extern OPENSSL_ia32cap_P
|
213
|
-
|
214
|
-
.globl gcm_gmult_4bit
|
215
|
-
.type gcm_gmult_4bit,\@function,2
|
216
|
-
.align 16
|
217
|
-
gcm_gmult_4bit:
|
218
|
-
push %rbx
|
219
|
-
push %rbp # %rbp and %r12 are pushed exclusively in
|
220
|
-
push %r12 # order to reuse Win64 exception handler...
|
221
|
-
.Lgmult_prologue:
|
222
|
-
|
223
|
-
movzb 15($Xi),$Zlo
|
224
|
-
lea .Lrem_4bit(%rip),$rem_4bit
|
225
|
-
___
|
226
|
-
&loop ($Xi);
|
227
|
-
$code.=<<___;
|
228
|
-
mov $Zlo,8($Xi)
|
229
|
-
mov $Zhi,($Xi)
|
230
|
-
|
231
|
-
mov 16(%rsp),%rbx
|
232
|
-
lea 24(%rsp),%rsp
|
233
|
-
.Lgmult_epilogue:
|
234
|
-
ret
|
235
|
-
.size gcm_gmult_4bit,.-gcm_gmult_4bit
|
236
|
-
___
|
237
|
-
|
238
|
-
# per-function register layout
|
239
|
-
$inp="%rdx";
|
240
|
-
$len="%rcx";
|
241
|
-
$rem_8bit=$rem_4bit;
|
242
|
-
|
243
|
-
$code.=<<___;
|
244
|
-
.globl gcm_ghash_4bit
|
245
|
-
.type gcm_ghash_4bit,\@function,4
|
246
|
-
.align 16
|
247
|
-
gcm_ghash_4bit:
|
248
|
-
push %rbx
|
249
|
-
push %rbp
|
250
|
-
push %r12
|
251
|
-
push %r13
|
252
|
-
push %r14
|
253
|
-
push %r15
|
254
|
-
sub \$280,%rsp
|
255
|
-
.Lghash_prologue:
|
256
|
-
mov $inp,%r14 # reassign couple of args
|
257
|
-
mov $len,%r15
|
258
|
-
___
|
259
|
-
{ my $inp="%r14";
|
260
|
-
my $dat="%edx";
|
261
|
-
my $len="%r15";
|
262
|
-
my @nhi=("%ebx","%ecx");
|
263
|
-
my @rem=("%r12","%r13");
|
264
|
-
my $Hshr4="%rbp";
|
265
|
-
|
266
|
-
&sub ($Htbl,-128); # size optimization
|
267
|
-
&lea ($Hshr4,"16+128(%rsp)");
|
268
|
-
{ my @lo =($nlo,$nhi);
|
269
|
-
my @hi =($Zlo,$Zhi);
|
270
|
-
|
271
|
-
&xor ($dat,$dat);
|
272
|
-
for ($i=0,$j=-2;$i<18;$i++,$j++) {
|
273
|
-
&mov ("$j(%rsp)",&LB($dat)) if ($i>1);
|
274
|
-
&or ($lo[0],$tmp) if ($i>1);
|
275
|
-
&mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17);
|
276
|
-
&shr ($lo[1],4) if ($i>0 && $i<17);
|
277
|
-
&mov ($tmp,$hi[1]) if ($i>0 && $i<17);
|
278
|
-
&shr ($hi[1],4) if ($i>0 && $i<17);
|
279
|
-
&mov ("8*$j($Hshr4)",$hi[0]) if ($i>1);
|
280
|
-
&mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16);
|
281
|
-
&shl (&LB($dat),4) if ($i>0 && $i<17);
|
282
|
-
&mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1);
|
283
|
-
&mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16);
|
284
|
-
&shl ($tmp,60) if ($i>0 && $i<17);
|
285
|
-
|
286
|
-
push (@lo,shift(@lo));
|
287
|
-
push (@hi,shift(@hi));
|
288
|
-
}
|
289
|
-
}
|
290
|
-
&add ($Htbl,-128);
|
291
|
-
&mov ($Zlo,"8($Xi)");
|
292
|
-
&mov ($Zhi,"0($Xi)");
|
293
|
-
&add ($len,$inp); # pointer to the end of data
|
294
|
-
&lea ($rem_8bit,".Lrem_8bit(%rip)");
|
295
|
-
&jmp (".Louter_loop");
|
296
|
-
|
297
|
-
$code.=".align 16\n.Louter_loop:\n";
|
298
|
-
&xor ($Zhi,"($inp)");
|
299
|
-
&mov ("%rdx","8($inp)");
|
300
|
-
&lea ($inp,"16($inp)");
|
301
|
-
&xor ("%rdx",$Zlo);
|
302
|
-
&mov ("($Xi)",$Zhi);
|
303
|
-
&mov ("8($Xi)","%rdx");
|
304
|
-
&shr ("%rdx",32);
|
305
|
-
|
306
|
-
&xor ($nlo,$nlo);
|
307
|
-
&rol ($dat,8);
|
308
|
-
&mov (&LB($nlo),&LB($dat));
|
309
|
-
&movz ($nhi[0],&LB($dat));
|
310
|
-
&shl (&LB($nlo),4);
|
311
|
-
&shr ($nhi[0],4);
|
312
|
-
|
313
|
-
for ($j=11,$i=0;$i<15;$i++) {
|
314
|
-
&rol ($dat,8);
|
315
|
-
&xor ($Zlo,"8($Htbl,$nlo)") if ($i>0);
|
316
|
-
&xor ($Zhi,"($Htbl,$nlo)") if ($i>0);
|
317
|
-
&mov ($Zlo,"8($Htbl,$nlo)") if ($i==0);
|
318
|
-
&mov ($Zhi,"($Htbl,$nlo)") if ($i==0);
|
319
|
-
|
320
|
-
&mov (&LB($nlo),&LB($dat));
|
321
|
-
&xor ($Zlo,$tmp) if ($i>0);
|
322
|
-
&movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0);
|
323
|
-
|
324
|
-
&movz ($nhi[1],&LB($dat));
|
325
|
-
&shl (&LB($nlo),4);
|
326
|
-
&movzb ($rem[0],"(%rsp,$nhi[0])");
|
327
|
-
|
328
|
-
&shr ($nhi[1],4) if ($i<14);
|
329
|
-
&and ($nhi[1],0xf0) if ($i==14);
|
330
|
-
&shl ($rem[1],48) if ($i>0);
|
331
|
-
&xor ($rem[0],$Zlo);
|
332
|
-
|
333
|
-
&mov ($tmp,$Zhi);
|
334
|
-
&xor ($Zhi,$rem[1]) if ($i>0);
|
335
|
-
&shr ($Zlo,8);
|
336
|
-
|
337
|
-
&movz ($rem[0],&LB($rem[0]));
|
338
|
-
&mov ($dat,"$j($Xi)") if (--$j%4==0);
|
339
|
-
&shr ($Zhi,8);
|
340
|
-
|
341
|
-
&xor ($Zlo,"-128($Hshr4,$nhi[0],8)");
|
342
|
-
&shl ($tmp,56);
|
343
|
-
&xor ($Zhi,"($Hshr4,$nhi[0],8)");
|
344
|
-
|
345
|
-
unshift (@nhi,pop(@nhi)); # "rotate" registers
|
346
|
-
unshift (@rem,pop(@rem));
|
347
|
-
}
|
348
|
-
&movzw ($rem[1],"($rem_8bit,$rem[1],2)");
|
349
|
-
&xor ($Zlo,"8($Htbl,$nlo)");
|
350
|
-
&xor ($Zhi,"($Htbl,$nlo)");
|
351
|
-
|
352
|
-
&shl ($rem[1],48);
|
353
|
-
&xor ($Zlo,$tmp);
|
354
|
-
|
355
|
-
&xor ($Zhi,$rem[1]);
|
356
|
-
&movz ($rem[0],&LB($Zlo));
|
357
|
-
&shr ($Zlo,4);
|
358
|
-
|
359
|
-
&mov ($tmp,$Zhi);
|
360
|
-
&shl (&LB($rem[0]),4);
|
361
|
-
&shr ($Zhi,4);
|
362
|
-
|
363
|
-
&xor ($Zlo,"8($Htbl,$nhi[0])");
|
364
|
-
&movzw ($rem[0],"($rem_8bit,$rem[0],2)");
|
365
|
-
&shl ($tmp,60);
|
366
|
-
|
367
|
-
&xor ($Zhi,"($Htbl,$nhi[0])");
|
368
|
-
&xor ($Zlo,$tmp);
|
369
|
-
&shl ($rem[0],48);
|
370
|
-
|
371
|
-
&bswap ($Zlo);
|
372
|
-
&xor ($Zhi,$rem[0]);
|
373
|
-
|
374
|
-
&bswap ($Zhi);
|
375
|
-
&cmp ($inp,$len);
|
376
|
-
&jb (".Louter_loop");
|
377
|
-
}
|
378
|
-
$code.=<<___;
|
379
|
-
mov $Zlo,8($Xi)
|
380
|
-
mov $Zhi,($Xi)
|
381
|
-
|
382
|
-
lea 280(%rsp),%rsi
|
383
|
-
mov 0(%rsi),%r15
|
384
|
-
mov 8(%rsi),%r14
|
385
|
-
mov 16(%rsi),%r13
|
386
|
-
mov 24(%rsi),%r12
|
387
|
-
mov 32(%rsi),%rbp
|
388
|
-
mov 40(%rsi),%rbx
|
389
|
-
lea 48(%rsi),%rsp
|
390
|
-
.Lghash_epilogue:
|
391
|
-
ret
|
392
|
-
.size gcm_ghash_4bit,.-gcm_ghash_4bit
|
393
|
-
___
|
394
|
-
|
395
|
-
######################################################################
|
396
|
-
# PCLMULQDQ version.
|
397
|
-
|
398
|
-
# Integer registers carrying the first four arguments of the CLMUL/AVX
# entry points, selected by the target calling convention.
if ($win64)	{ @_4args = ("%rcx","%rdx","%r8", "%r9");  }	# Win64 order
else		{ @_4args = ("%rdi","%rsi","%rdx","%rcx"); }	# Unix order

# File-scope XMM register names shared by the helper subs below.
$Xi   = "%xmm0";
$Xhi  = "%xmm1";
$Hkey = "%xmm2";
$T1   = "%xmm3";
$T2   = "%xmm4";
$T3   = "%xmm5";
|
403
|
-
|
404
|
-
sub clmul64x64_T2 {	# minimal register pressure
    # Appends to $code a Karatsuba-style carry-less multiply of $Xi by $Hkey:
    # three PCLMULQDQs (low halves, high halves, xor-folded halves) whose
    # middle term is split and folded so that the high part of the product
    # ends up in $Xhi and the low part in $Xi.
    #
    # Arguments: ($Xhi, $Xi, $Hkey [, $HK])
    #   $HK - register whose low qword already holds Hkey.lo^Hkey.hi.
    #         When omitted, that value is computed into the file-scope $T2
    #         and $T2 is used in its place.
    # The file-scope $T1 and $T2 are used as scratch in both cases.
    my ($Xhi,$Xi,$Hkey,$HK) = @_;
    my $derive_hk = !defined($HK);

    $HK = $T2 if ($derive_hk);

    # Copy the operand and start folding its two halves together.
    $code.=<<___;
	movdqa		$Xi,$Xhi		#
	pshufd		\$0b01001110,$Xi,$T1
___
    # Same folding for the key, only when the caller did not supply it.
    $code.=<<___ if ($derive_hk);
	pshufd		\$0b01001110,$Hkey,$T2
___
    $code.=<<___;
	pxor		$Xi,$T1			#
___
    $code.=<<___ if ($derive_hk);
	pxor		$Hkey,$T2
___
    # The three multiplications, then distribute the middle term across
    # the boundary between $Xi (low) and $Xhi (high).
    $code.=<<___;
	pclmulqdq	\$0x00,$Hkey,$Xi	#######
	pclmulqdq	\$0x11,$Hkey,$Xhi	#######
	pclmulqdq	\$0x00,$HK,$T1		#######
	pxor		$Xi,$T1			#
	pxor		$Xhi,$T1		#

	movdqa		$T1,$T2			#
	psrldq		\$8,$T1
	pslldq		\$8,$T2			#
	pxor		$T1,$Xhi
	pxor		$T2,$Xi			#
___
}
|
436
|
-
|
437
|
-
sub reduction_alg9 {	# 17/11 times faster than Intel version
    # Appends to $code the two-phase shift/xor reduction of the double-width
    # value held in $Xhi:$Xi; the reduced result is left in $Xi.
    # Arguments: ($Xhi, $Xi).  The file-scope $T1 and $T2 are scratch.
    # NOTE(review): presumably this reduces modulo the GHASH polynomial;
    # the polynomial constant is not referenced in this sub - confirm.
    my ($Xhi,$Xi) = @_;

    # Phase 1: left shifts by 5, 1 and 57 combine to Xi<<57 ^ Xi<<62 ^ Xi<<63,
    # which is then split across $Xi and $Xhi.
    $code.=<<___;
	# 1st phase
	movdqa		$Xi,$T2			#
	movdqa		$Xi,$T1
	psllq		\$5,$Xi
	pxor		$Xi,$T1			#
	psllq		\$1,$Xi
	pxor		$T1,$Xi			#
	psllq		\$57,$Xi		#
	movdqa		$Xi,$T1			#
	pslldq		\$8,$Xi
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pxor		$T1,$Xhi		#

___
    # Phase 2: right shifts by 1, 5 and 1, accumulated and folded with $Xhi.
    $code.=<<___;
	# 2nd phase
	movdqa		$Xi,$T2
	psrlq		\$1,$Xi
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi
	pxor		$T2,$Xi			#
	psrlq		\$1,$Xi			#
	pxor		$Xhi,$Xi		#
___
}
|
466
|
-
|
467
|
-
{ my ($Htbl,$Xip)=@_4args;
  my $HK="%xmm6";

# Emits gcm_init_clmul.  The generated routine loads 16 bytes from ($Xip)
# as the hash key, applies the "<<1 twist" with conditional xor of
# .L0x1c2_polynomial, and stores to $Htbl: H at 0x00, H^2 at 0x10 and the
# Karatsuba "salt" for that pair at 0x20.  With $do4xaggr it additionally
# stores H^3 at 0x30, H^4 at 0x40 and their salt at 0x50.
# .L_init_clmul is a local alias of the entry point (gcm_init_avx jumps to
# it when AVX code is not generated).
$code.=<<___;
.globl	gcm_init_clmul
.type	gcm_init_clmul,\@abi-omnipotent
.align	16
gcm_init_clmul:
.L_init_clmul:
___
# Win64 only: %xmm6 is used as $HK, so it is spilled to a 0x18-byte frame.
# The instructions are hand-encoded as .byte sequences.  The `\$` in the
# trailing annotation matters: this is an interpolating heredoc, and an
# unescaped `$0x18` would expand Perl's `$0` (the script name) into the
# generated text.
$code.=<<___ if ($win64);
.LSEH_begin_gcm_init_clmul:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x83,0xec,0x18		#sub	\$0x18,%rsp
	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
___
$code.=<<___;
	movdqu		($Xip),$Hkey
	pshufd		\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	pshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	movdqa		$Hkey,$T1
	psllq		\$1,$Hkey
	pxor		$T3,$T3			#
	psrlq		\$63,$T1
	pcmpgtd		$T2,$T3			# broadcast carry bit
	pslldq		\$8,$T1
	por		$T1,$Hkey		# H<<=1

	# magic reduction
	pand		.L0x1c2_polynomial(%rip),$T3
	pxor		$T3,$Hkey		# if(carry) H^=0x1c2_polynomial

	# calculate H^2
	pshufd		\$0b01001110,$Hkey,$HK
	movdqa		$Hkey,$Xi
	pxor		$Hkey,$HK
___
	# $HK now holds H.lo^H.hi, so the helper skips recomputing it.
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufd		\$0b01001110,$Hkey,$T1
	pshufd		\$0b01001110,$Xi,$T2
	pxor		$Hkey,$T1		# Karatsuba pre-processing
	movdqu		$Hkey,0x00($Htbl)	# save H
	pxor		$Xi,$T2			# Karatsuba pre-processing
	movdqu		$Xi,0x10($Htbl)		# save H^2
	palignr		\$8,$T1,$T2		# low part is H.lo^H.hi...
	movdqu		$T2,0x20($Htbl)		# save Karatsuba "salt"
___
if ($do4xaggr) {
	# Two more multiplications by H; H^3 is parked in $T3, which neither
	# helper touches, while H^4 is computed.
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^3
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	movdqa		$Xi,$T3
___
	&clmul64x64_T2	($Xhi,$Xi,$Hkey,$HK);	# H^4
	&reduction_alg9	($Xhi,$Xi);
$code.=<<___;
	pshufd		\$0b01001110,$T3,$T1
	pshufd		\$0b01001110,$Xi,$T2
	pxor		$T3,$T1			# Karatsuba pre-processing
	movdqu		$T3,0x30($Htbl)		# save H^3
	pxor		$Xi,$T2			# Karatsuba pre-processing
	movdqu		$Xi,0x40($Htbl)		# save H^4
	palignr		\$8,$T1,$T2		# low part is H^3.lo^H^3.hi...
	movdqu		$T2,0x50($Htbl)		# save Karatsuba "salt"
___
}
# Win64 only: restore %xmm6 and release the frame set up in the prologue.
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	lea	0x18(%rsp),%rsp
.LSEH_end_gcm_init_clmul:
___
$code.=<<___;
	ret
.size	gcm_init_clmul,.-gcm_init_clmul
___
}
|
547
|
-
|
548
|
-
{
my ($Xip,$Htbl) = @_4args;

# Selects the reduction emitted after the multiplication.  0 emits the
# standard reduction_alg9 sequence; a true value emits the experimental
# inline alternative below instead.
my $experimental_reduction = 0;

# Emits gcm_gmult_clmul.  The generated routine loads Xi from ($Xip),
# byte-swaps it with .Lbswap_mask, multiplies it by the key at ($Htbl)
# using the precomputed value at 0x20($Htbl) as the Karatsuba operand,
# reduces, swaps back and stores the result to ($Xip).
# .L_gmult_clmul is a local alias that gcm_gmult_avx jumps to.
$code.=<<___;
.globl	gcm_gmult_clmul
.type	gcm_gmult_clmul,\@abi-omnipotent
.align	16
gcm_gmult_clmul:
.L_gmult_clmul:
	movdqu		($Xip),$Xi
	movdqa		.Lbswap_mask(%rip),$T3
	movdqu		($Htbl),$Hkey
	movdqu		0x20($Htbl),$T2
	pshufb		$T3,$Xi
___
&clmul64x64_T2	($Xhi,$Xi,$Hkey,$T2);

if ($experimental_reduction) {
$code.=<<___;
	# experimental alternative. special thing about is that there
	# no dependency between the two multiplications...
	mov		\$`0xE1<<1`,%eax
	mov		\$0xA040608020C0E000,%r10	# ((7..0)·0xE0)&0xff
	mov		\$0x07,%r11d
	movq		%rax,$T1
	movq		%r10,$T2
	movq		%r11,$T3		# borrow $T3
	pand		$Xi,$T3
	pshufb		$T3,$T2			# ($Xi&7)·0xE0
	movq		%rax,$T3
	pclmulqdq	\$0x00,$Xi,$T1		# ·(0xE1<<1)
	pxor		$Xi,$T2
	pslldq		\$15,$T2
	paddd		$T2,$T2			# <<(64+56+1)
	pxor		$T2,$Xi
	pclmulqdq	\$0x01,$T3,$Xi
	movdqa		.Lbswap_mask(%rip),$T3	# reload $T3
	psrldq		\$1,$T1
	pxor		$T1,$Xhi
	pslldq		\$7,$Xi
	pxor		$Xhi,$Xi
___
} else {
    &reduction_alg9($Xhi,$Xi);
}

$code.=<<___;
	pshufb		$T3,$Xi
	movdqu		$Xi,($Xip)
	ret
.size	gcm_gmult_clmul,.-gcm_gmult_clmul
___
}
|
594
|
-
|
595
|
-
{
# Emits gcm_ghash_clmul.  In the generated routine:
#   $Xip  - address the running hash Xi is loaded from and stored back to
#   $Htbl - table of key powers / Karatsuba values read at 0x00..0x50
#   $inp  - input pointer, advanced as blocks are consumed
#   $len  - remaining length, decremented in 16-byte units
# Lexical register names below shadow the file-scope $T1..$T3; the helper
# subs clmul64x64_T2/reduction_alg9 still use the file-scope ones, which
# coincide with %xmm3/%xmm4 ($Xln/$Xmn here).
my ($Xip,$Htbl,$inp,$len) = @_4args;
my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = ("%xmm3","%xmm4","%xmm5","%xmm6","%xmm7");
my ($T1,$T2,$T3) = ("%xmm8","%xmm9","%xmm10");

$code.=<<___;
.globl	gcm_ghash_clmul
.type	gcm_ghash_clmul,\@abi-omnipotent
.align	32
gcm_ghash_clmul:
.L_ghash_clmul:
___

if ($win64) {
# Win64 prologue: carve out 0xa8 bytes and spill %xmm6..%xmm15 into
# consecutive 16-byte slots starting at the new %rsp (hand-encoded).
$code.=<<___;
	lea	-0x88(%rsp),%rax
.LSEH_begin_gcm_ghash_clmul:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6,-0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7,-0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8,0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9,0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10,0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps	%xmm11,0x30(%rax)
	.byte	0x44,0x0f,0x29,0x60,0x40	#movaps	%xmm12,0x40(%rax)
	.byte	0x44,0x0f,0x29,0x68,0x50	#movaps	%xmm13,0x50(%rax)
	.byte	0x44,0x0f,0x29,0x70,0x60	#movaps	%xmm14,0x60(%rax)
	.byte	0x44,0x0f,0x29,0x78,0x70	#movaps	%xmm15,0x70(%rax)
___
}

# Common entry: load and byte-swap Xi, load H and its Karatsuba value.
# A single 16-byte block goes straight to .Lodd_tail.
$code.=<<___;
	movdqa		.Lbswap_mask(%rip),$T3

	movdqu		($Xip),$Xi
	movdqu		($Htbl),$Hkey
	movdqu		0x20($Htbl),$HK
	pshufb		$T3,$Xi

	sub		\$0x10,$len
	jz		.Lodd_tail

	movdqu		0x10($Htbl),$Hkey2
___

if ($do4xaggr) {
# Four-blocks-per-iteration path.  It is skipped (.Lskip4x) when fewer than
# 0x30 bytes remain after the first decrement, or when the capability word
# shows MOVBE without XSAVE.
my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4) = ("%xmm11","%xmm12","%xmm13","%xmm14","%xmm15");

$code.=<<___;
	mov		OPENSSL_ia32cap_P+4(%rip),%eax
	cmp		\$0x30,$len
	jb		.Lskip4x

	and		\$`1<<26|1<<22`,%eax	# isolate MOVBE+XSAVE
	cmp		\$`1<<22`,%eax		# check for MOVBE without XSAVE
	je		.Lskip4x

	sub		\$0x30,$len
	mov		\$0xA040608020C0E000,%rax	# ((7..0)·0xE0)&0xff
	movdqu		0x30($Htbl),$Hkey3
	movdqu		0x40($Htbl),$Hkey4

	#######
	# Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
	#
	movdqu		0x30($inp),$Xln
	movdqu		0x20($inp),$Xl
	pshufb		$T3,$Xln
	pshufb		$T3,$Xl
	movdqa		$Xln,$Xhn
	pshufd		\$0b01001110,$Xln,$Xmn
	pxor		$Xln,$Xmn
	pclmulqdq	\$0x00,$Hkey,$Xln
	pclmulqdq	\$0x11,$Hkey,$Xhn
	pclmulqdq	\$0x00,$HK,$Xmn

	movdqa		$Xl,$Xh
	pshufd		\$0b01001110,$Xl,$Xm
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey2,$Xl
	pclmulqdq	\$0x11,$Hkey2,$Xh
	pclmulqdq	\$0x10,$HK,$Xm
	xorps		$Xl,$Xln
	xorps		$Xh,$Xhn
	movups		0x50($Htbl),$HK
	xorps		$Xm,$Xmn

	movdqu		0x10($inp),$Xl
	movdqu		0($inp),$T1
	pshufb		$T3,$Xl
	pshufb		$T3,$T1
	movdqa		$Xl,$Xh
	pshufd		\$0b01001110,$Xl,$Xm
	pxor		$T1,$Xi
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey3,$Xl
	movdqa		$Xi,$Xhi
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1
	pclmulqdq	\$0x11,$Hkey3,$Xh
	pclmulqdq	\$0x00,$HK,$Xm
	xorps		$Xl,$Xln
	xorps		$Xh,$Xhn

	lea	0x40($inp),$inp
	sub	\$0x40,$len
	jc	.Ltail4x

	jmp	.Lmod4_loop
.align	32
.Lmod4_loop:
	pclmulqdq	\$0x00,$Hkey4,$Xi
	xorps		$Xm,$Xmn
	movdqu		0x30($inp),$Xl
	pshufb		$T3,$Xl
	pclmulqdq	\$0x11,$Hkey4,$Xhi
	xorps		$Xln,$Xi
	movdqu		0x20($inp),$Xln
	movdqa		$Xl,$Xh
	pclmulqdq	\$0x10,$HK,$T1
	pshufd		\$0b01001110,$Xl,$Xm
	xorps		$Xhn,$Xhi
	pxor		$Xl,$Xm
	pshufb		$T3,$Xln
	movups		0x20($Htbl),$HK
	xorps		$Xmn,$T1
	pclmulqdq	\$0x00,$Hkey,$Xl
	pshufd		\$0b01001110,$Xln,$Xmn

	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
	movdqa		$Xln,$Xhn
	pxor		$Xhi,$T1		#
	pxor		$Xln,$Xmn
	movdqa		$T1,$T2			#
	pclmulqdq	\$0x11,$Hkey,$Xh
	pslldq		\$8,$T1
	psrldq		\$8,$T2			#
	pxor		$T1,$Xi
	movdqa		.L7_mask(%rip),$T1
	pxor		$T2,$Xhi		#
	movq		%rax,$T2

	pand		$Xi,$T1			# 1st phase
	pshufb		$T1,$T2			#
	pxor		$Xi,$T2			#
	pclmulqdq	\$0x00,$HK,$Xm
	psllq		\$57,$T2		#
	movdqa		$T2,$T1			#
	pslldq		\$8,$T2
	pclmulqdq	\$0x00,$Hkey2,$Xln
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pxor		$T1,$Xhi		#
	movdqu		0($inp),$T1

	movdqa		$Xi,$T2			# 2nd phase
	psrlq		\$1,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhn
	xorps		$Xl,$Xln
	movdqu		0x10($inp),$Xl
	pshufb		$T3,$Xl
	pclmulqdq	\$0x10,$HK,$Xmn
	xorps		$Xh,$Xhn
	movups		0x50($Htbl),$HK
	pshufb		$T3,$T1
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi

	movdqa		$Xl,$Xh
	pxor		$Xm,$Xmn
	pshufd		\$0b01001110,$Xl,$Xm
	pxor		$T2,$Xi			#
	pxor		$T1,$Xhi
	pxor		$Xl,$Xm
	pclmulqdq	\$0x00,$Hkey3,$Xl
	psrlq		\$1,$Xi			#
	pxor		$Xhi,$Xi		#
	movdqa		$Xi,$Xhi
	pclmulqdq	\$0x11,$Hkey3,$Xh
	xorps		$Xl,$Xln
	pshufd		\$0b01001110,$Xi,$T1
	pxor		$Xi,$T1

	pclmulqdq	\$0x00,$HK,$Xm
	xorps		$Xh,$Xhn

	lea	0x40($inp),$inp
	sub	\$0x40,$len
	jnc	.Lmod4_loop

.Ltail4x:
	pclmulqdq	\$0x00,$Hkey4,$Xi
	pclmulqdq	\$0x11,$Hkey4,$Xhi
	pclmulqdq	\$0x10,$HK,$T1
	xorps		$Xm,$Xmn
	xorps		$Xln,$Xi
	xorps		$Xhn,$Xhi
	pxor		$Xi,$Xhi		# aggregated Karatsuba post-processing
	pxor		$Xmn,$T1

	pxor		$Xhi,$T1		#
	pxor		$Xi,$Xhi

	movdqa		$T1,$T2			#
	psrldq		\$8,$T1
	pslldq		\$8,$T2			#
	pxor		$T1,$Xhi
	pxor		$T2,$Xi			#
___
    &reduction_alg9($Xhi,$Xi);
# Leftover after the 4x path: nothing, exactly one block, or more.
$code.=<<___;
	add	\$0x40,$len
	jz	.Ldone
	movdqu	0x20($Htbl),$HK
	sub	\$0x10,$len
	jz	.Lodd_tail
.Lskip4x:
___
}

# Two-blocks-per-iteration path.
$code.=<<___;
	#######
	# Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
	#	[(H*Ii+1) + (H*Xi+1)] mod P =
	#	[(H*Ii+1) + H^2*(Ii+Xi)] mod P
	#
	movdqu		($inp),$T1		# Ii
	movdqu		16($inp),$Xln		# Ii+1
	pshufb		$T3,$T1
	pshufb		$T3,$Xln
	pxor		$T1,$Xi			# Ii+Xi

	movdqa		$Xln,$Xhn
	pshufd		\$0b01001110,$Xln,$Xmn
	pxor		$Xln,$Xmn
	pclmulqdq	\$0x00,$Hkey,$Xln
	pclmulqdq	\$0x11,$Hkey,$Xhn
	pclmulqdq	\$0x00,$HK,$Xmn

	lea		32($inp),$inp		# i+=2
	nop
	sub		\$0x20,$len
	jbe		.Leven_tail
	nop
	jmp		.Lmod_loop

.align	32
.Lmod_loop:
	movdqa		$Xi,$Xhi
	movdqa		$Xmn,$T1
	pshufd		\$0b01001110,$Xi,$Xmn	#
	pxor		$Xi,$Xmn		#

	pclmulqdq	\$0x00,$Hkey2,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhi
	pclmulqdq	\$0x10,$HK,$Xmn

	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
	pxor		$Xhn,$Xhi
	movdqu		($inp),$T2		# Ii
	pxor		$Xi,$T1			# aggregated Karatsuba post-processing
	pshufb		$T3,$T2
	movdqu		16($inp),$Xln		# Ii+1

	pxor		$Xhi,$T1
	pxor		$T2,$Xhi		# "Ii+Xi", consume early
	pxor		$T1,$Xmn
	pshufb		$T3,$Xln
	movdqa		$Xmn,$T1		#
	psrldq		\$8,$T1
	pslldq		\$8,$Xmn		#
	pxor		$T1,$Xhi
	pxor		$Xmn,$Xi		#

	movdqa		$Xln,$Xhn		#

	movdqa		$Xi,$T2			# 1st phase
	movdqa		$Xi,$T1
	psllq		\$5,$Xi
	pxor		$Xi,$T1			#
	pclmulqdq	\$0x00,$Hkey,$Xln	#######
	psllq		\$1,$Xi
	pxor		$T1,$Xi			#
	psllq		\$57,$Xi		#
	movdqa		$Xi,$T1			#
	pslldq		\$8,$Xi
	psrldq		\$8,$T1			#
	pxor		$T2,$Xi
	pshufd		\$0b01001110,$Xhn,$Xmn
	pxor		$T1,$Xhi		#
	pxor		$Xhn,$Xmn		#

	movdqa		$Xi,$T2			# 2nd phase
	psrlq		\$1,$Xi
	pclmulqdq	\$0x11,$Hkey,$Xhn	#######
	pxor		$T2,$Xhi		#
	pxor		$Xi,$T2
	psrlq		\$5,$Xi
	pxor		$T2,$Xi			#
	lea		32($inp),$inp
	psrlq		\$1,$Xi			#
	pclmulqdq	\$0x00,$HK,$Xmn		#######
	pxor		$Xhi,$Xi		#

	sub		\$0x20,$len
	ja		.Lmod_loop

.Leven_tail:
	movdqa		$Xi,$Xhi
	movdqa		$Xmn,$T1
	pshufd		\$0b01001110,$Xi,$Xmn	#
	pxor		$Xi,$Xmn		#

	pclmulqdq	\$0x00,$Hkey2,$Xi
	pclmulqdq	\$0x11,$Hkey2,$Xhi
	pclmulqdq	\$0x10,$HK,$Xmn

	pxor		$Xln,$Xi		# (H*Ii+1) + H^2*(Ii+Xi)
	pxor		$Xhn,$Xhi
	pxor		$Xi,$T1
	pxor		$Xhi,$T1
	pxor		$T1,$Xmn
	movdqa		$Xmn,$T1		#
	psrldq		\$8,$T1
	pslldq		\$8,$Xmn		#
	pxor		$T1,$Xhi
	pxor		$Xmn,$Xi		#
___
&reduction_alg9($Xhi,$Xi);

# Final single block, reached directly or after the loops above.
$code.=<<___;
	test		$len,$len
	jnz		.Ldone

.Lodd_tail:
	movdqu		($inp),$T1		# Ii
	pshufb		$T3,$T1
	pxor		$T1,$Xi			# Ii+Xi
___
&clmul64x64_T2($Xhi,$Xi,$Hkey,$HK);		# H*(Ii+Xi)
&reduction_alg9($Xhi,$Xi);

$code.=<<___;
.Ldone:
	pshufb		$T3,$Xi
	movdqu		$Xi,($Xip)
___

if ($win64) {
    # Win64 epilogue: reload %xmm6..%xmm15 from the ten 16-byte slots at
    # (%rsp) .. 0x90(%rsp), then release the 0xa8-byte frame.
    foreach my $slot (0..9) {
	my $disp = $slot ? sprintf("0x%x",16*$slot) : "";
	my $reg  = 6+$slot;
	$code.="\tmovaps\t${disp}(%rsp),%xmm$reg\n";
    }
$code.=<<___;
	lea	0xa8(%rsp),%rsp
.LSEH_end_gcm_ghash_clmul:
___
}

$code.=<<___;
	ret
.size	gcm_ghash_clmul,.-gcm_ghash_clmul
___
}
|
954
|
-
|
955
|
-
# Emits gcm_init_avx.  The symbol header is always generated; the body is
# either the AVX table setup (when $avx is set) or a jump to the CLMUL
# implementation's local label .L_init_clmul.
$code.=<<___;
.globl	gcm_init_avx
.type	gcm_init_avx,\@abi-omnipotent
.align	32
gcm_init_avx:
___
if ($avx) {
my ($Htbl,$Xip)=@_4args;
my $HK="%xmm6";

# Win64 only: %xmm6 is used as $HK, so it is spilled to a 0x18-byte frame.
# The `\$` in the trailing annotation matters: this is an interpolating
# heredoc, and an unescaped `$0x18` would expand Perl's `$0` (the script
# name) into the generated text.
$code.=<<___ if ($win64);
.LSEH_begin_gcm_init_avx:
	# I can't trust assembler to use specific encoding:-(
	.byte	0x48,0x83,0xec,0x18		#sub	\$0x18,%rsp
	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
___
# Load the key from ($Xip), apply the "<<1 twist" with conditional xor of
# .L0x1c2_polynomial, prepare $HK = H.lo^H.hi and enter the table loop at
# .Linit_start_avx with %r10 = 4 iterations.
$code.=<<___;
	vzeroupper

	vmovdqu		($Xip),$Hkey
	vpshufd		\$0b01001110,$Hkey,$Hkey	# dword swap

	# <<1 twist
	vpshufd		\$0b11111111,$Hkey,$T2	# broadcast uppermost dword
	vpsrlq		\$63,$Hkey,$T1
	vpsllq		\$1,$Hkey,$Hkey
	vpxor		$T3,$T3,$T3		#
	vpcmpgtd	$T2,$T3,$T3		# broadcast carry bit
	vpslldq		\$8,$T1,$T1
	vpor		$T1,$Hkey,$Hkey		# H<<=1

	# magic reduction
	vpand		.L0x1c2_polynomial(%rip),$T3,$T3
	vpxor		$T3,$Hkey,$Hkey		# if(carry) H^=0x1c2_polynomial

	vpunpckhqdq	$Hkey,$Hkey,$HK
	vmovdqa		$Hkey,$Xi
	vpxor		$Hkey,$HK,$HK
	mov		\$4,%r10		# up to H^8
	jmp		.Linit_start_avx
___

# Appends the AVX (three-operand) Karatsuba-style carry-less multiply of
# $Xi by $Hkey; high part of the product goes to $Xhi, low part stays in
# $Xi.  Arguments: ($Xhi, $Xi, $Hkey [, $HK]).  When $HK is omitted,
# Hkey.lo^Hkey.hi is computed into the file-scope $T2 and used instead.
# The file-scope $T1 and $T2 are scratch.
sub clmul64x64_avx {
my ($Xhi,$Xi,$Hkey,$HK)=@_;

if (!defined($HK)) {	$HK = $T2;
$code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpunpckhqdq	$Hkey,$Hkey,$T2
	vpxor		$Xi,$T1,$T1		#
	vpxor		$Hkey,$T2,$T2
___
} else {
$code.=<<___;
	vpunpckhqdq	$Xi,$Xi,$T1
	vpxor		$Xi,$T1,$T1		#
___
}
$code.=<<___;
	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xhi	#######
	vpclmulqdq	\$0x00,$Hkey,$Xi,$Xi	#######
	vpclmulqdq	\$0x00,$HK,$T1,$T1	#######
	vpxor		$Xi,$Xhi,$T2		#
	vpxor		$T2,$T1,$T1		#

	vpslldq		\$8,$T1,$T2		#
	vpsrldq		\$8,$T1,$T1
	vpxor		$T2,$Xi,$Xi		#
	vpxor		$T1,$Xhi,$Xhi
___
}

# Appends the AVX two-phase shift/xor reduction of $Xhi:$Xi, leaving the
# result in $Xi.  Arguments: ($Xhi, $Xi).  The file-scope $T1 and $T2 are
# scratch.
sub reduction_avx {
my ($Xhi,$Xi) = @_;

$code.=<<___;
	vpsllq		\$57,$Xi,$T1		# 1st phase
	vpsllq		\$62,$Xi,$T2
	vpxor		$T1,$T2,$T2		#
	vpsllq		\$63,$Xi,$T1
	vpxor		$T1,$T2,$T2		#
	vpslldq		\$8,$T2,$T1		#
	vpsrldq		\$8,$T2,$T2
	vpxor		$T1,$Xi,$Xi		#
	vpxor		$T2,$Xhi,$Xhi

	vpsrlq		\$1,$Xi,$T2		# 2nd phase
	vpxor		$Xi,$Xhi,$Xhi
	vpxor		$T2,$Xi,$Xi		#
	vpsrlq		\$5,$T2,$T2
	vpxor		$T2,$Xi,$Xi		#
	vpsrlq		\$1,$Xi,$Xi		#
	vpxor		$Xhi,$Xi,$Xi		#
___
}

# Table loop.  Each pass stores a pair of powers at 0x00/0x10($Htbl) and
# advances $Htbl by 0x30; the "salt" for the previous pair is written at
# -0x10($Htbl) on the next pass (or after the loop for the last pair).
$code.=<<___;
.align	32
.Linit_loop_avx:
	vpalignr	\$8,$T1,$T2,$T3		# low part is H.lo^H.hi...
	vmovdqu		$T3,-0x10($Htbl)	# save Karatsuba "salt"
___
	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^3,5,7
	&reduction_avx	($Xhi,$Xi);
$code.=<<___;
.Linit_start_avx:
	vmovdqa		$Xi,$T3
___
	&clmul64x64_avx	($Xhi,$Xi,$Hkey,$HK);	# calculate H^2,4,6,8
	&reduction_avx	($Xhi,$Xi);
$code.=<<___;
	vpshufd		\$0b01001110,$T3,$T1
	vpshufd		\$0b01001110,$Xi,$T2
	vpxor		$T3,$T1,$T1		# Karatsuba pre-processing
	vmovdqu		$T3,0x00($Htbl)		# save H^1,3,5,7
	vpxor		$Xi,$T2,$T2		# Karatsuba pre-processing
	vmovdqu		$Xi,0x10($Htbl)		# save H^2,4,6,8
	lea		0x30($Htbl),$Htbl
	sub		\$1,%r10
	jnz		.Linit_loop_avx

	vpalignr	\$8,$T2,$T1,$T3		# last "salt" is flipped
	vmovdqu		$T3,-0x10($Htbl)

	vzeroupper
___
# Win64 only: restore %xmm6 and release the frame set up in the prologue.
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	lea	0x18(%rsp),%rsp
.LSEH_end_gcm_init_avx:
___
$code.=<<___;
	ret
.size	gcm_init_avx,.-gcm_init_avx
___
} else {
# No AVX code generation: gcm_init_avx simply forwards to gcm_init_clmul.
$code.=<<___;
	jmp	.L_init_clmul
.size	gcm_init_avx,.-gcm_init_avx
___
}
|
1096
|
-
|
1097
|
-
# gcm_gmult_avx has no AVX-specific body: the exported symbol is a single
# jump to .L_gmult_clmul, the local label inside gcm_gmult_clmul.
# Nothing here needs interpolation, so a literal heredoc is used.
$code.=<<'___';
.globl	gcm_gmult_avx
.type	gcm_gmult_avx,@abi-omnipotent
.align	32
gcm_gmult_avx:
	jmp	.L_gmult_clmul
.size	gcm_gmult_avx,.-gcm_gmult_avx
___
|
1105
|
-
|
1106
|
-
$code.=<<___;
|
1107
|
-
.globl gcm_ghash_avx
|
1108
|
-
.type gcm_ghash_avx,\@abi-omnipotent
|
1109
|
-
.align 32
|
1110
|
-
gcm_ghash_avx:
|
1111
|
-
___
|
1112
|
-
if ($avx) {
|
1113
|
-
my ($Xip,$Htbl,$inp,$len)=@_4args;
|
1114
|
-
my ($Xlo,$Xhi,$Xmi,
|
1115
|
-
$Zlo,$Zhi,$Zmi,
|
1116
|
-
$Hkey,$HK,$T1,$T2,
|
1117
|
-
$Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));
|
1118
|
-
|
1119
|
-
$code.=<<___ if ($win64);
|
1120
|
-
lea -0x88(%rsp),%rax
|
1121
|
-
.LSEH_begin_gcm_ghash_avx:
|
1122
|
-
# I can't trust assembler to use specific encoding:-(
|
1123
|
-
.byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
|
1124
|
-
.byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
|
1125
|
-
.byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
|
1126
|
-
.byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
|
1127
|
-
.byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
|
1128
|
-
.byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
|
1129
|
-
.byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
|
1130
|
-
.byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
|
1131
|
-
.byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
|
1132
|
-
.byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
|
1133
|
-
.byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
|
1134
|
-
___
|
1135
|
-
$code.=<<___;
|
1136
|
-
vzeroupper
|
1137
|
-
|
1138
|
-
vmovdqu ($Xip),$Xi # load $Xi
|
1139
|
-
lea .L0x1c2_polynomial(%rip),%r10
|
1140
|
-
lea 0x40($Htbl),$Htbl # size optimization
|
1141
|
-
vmovdqu .Lbswap_mask(%rip),$bswap
|
1142
|
-
vpshufb $bswap,$Xi,$Xi
|
1143
|
-
cmp \$0x80,$len
|
1144
|
-
jb .Lshort_avx
|
1145
|
-
sub \$0x80,$len
|
1146
|
-
|
1147
|
-
vmovdqu 0x70($inp),$Ii # I[7]
|
1148
|
-
vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
|
1149
|
-
vpshufb $bswap,$Ii,$Ii
|
1150
|
-
vmovdqu 0x20-0x40($Htbl),$HK
|
1151
|
-
|
1152
|
-
vpunpckhqdq $Ii,$Ii,$T2
|
1153
|
-
vmovdqu 0x60($inp),$Ij # I[6]
|
1154
|
-
vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
|
1155
|
-
vpxor $Ii,$T2,$T2
|
1156
|
-
vpshufb $bswap,$Ij,$Ij
|
1157
|
-
vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
|
1158
|
-
vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
|
1159
|
-
vpunpckhqdq $Ij,$Ij,$T1
|
1160
|
-
vmovdqu 0x50($inp),$Ii # I[5]
|
1161
|
-
vpclmulqdq \$0x00,$HK,$T2,$Xmi
|
1162
|
-
vpxor $Ij,$T1,$T1
|
1163
|
-
|
1164
|
-
vpshufb $bswap,$Ii,$Ii
|
1165
|
-
vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
|
1166
|
-
vpunpckhqdq $Ii,$Ii,$T2
|
1167
|
-
vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
|
1168
|
-
vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
|
1169
|
-
vpxor $Ii,$T2,$T2
|
1170
|
-
vmovdqu 0x40($inp),$Ij # I[4]
|
1171
|
-
vpclmulqdq \$0x10,$HK,$T1,$Zmi
|
1172
|
-
vmovdqu 0x50-0x40($Htbl),$HK
|
1173
|
-
|
1174
|
-
vpshufb $bswap,$Ij,$Ij
|
1175
|
-
vpxor $Xlo,$Zlo,$Zlo
|
1176
|
-
vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
|
1177
|
-
vpxor $Xhi,$Zhi,$Zhi
|
1178
|
-
vpunpckhqdq $Ij,$Ij,$T1
|
1179
|
-
vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
|
1180
|
-
vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
|
1181
|
-
vpxor $Xmi,$Zmi,$Zmi
|
1182
|
-
vpclmulqdq \$0x00,$HK,$T2,$Xmi
|
1183
|
-
vpxor $Ij,$T1,$T1
|
1184
|
-
|
1185
|
-
vmovdqu 0x30($inp),$Ii # I[3]
|
1186
|
-
vpxor $Zlo,$Xlo,$Xlo
|
1187
|
-
vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
|
1188
|
-
vpxor $Zhi,$Xhi,$Xhi
|
1189
|
-
vpshufb $bswap,$Ii,$Ii
|
1190
|
-
vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
|
1191
|
-
vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
|
1192
|
-
vpxor $Zmi,$Xmi,$Xmi
|
1193
|
-
vpunpckhqdq $Ii,$Ii,$T2
|
1194
|
-
vpclmulqdq \$0x10,$HK,$T1,$Zmi
|
1195
|
-
vmovdqu 0x80-0x40($Htbl),$HK
|
1196
|
-
vpxor $Ii,$T2,$T2
|
1197
|
-
|
1198
|
-
vmovdqu 0x20($inp),$Ij # I[2]
|
1199
|
-
vpxor $Xlo,$Zlo,$Zlo
|
1200
|
-
vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
|
1201
|
-
vpxor $Xhi,$Zhi,$Zhi
|
1202
|
-
vpshufb $bswap,$Ij,$Ij
|
1203
|
-
vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
|
1204
|
-
vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
|
1205
|
-
vpxor $Xmi,$Zmi,$Zmi
|
1206
|
-
vpunpckhqdq $Ij,$Ij,$T1
|
1207
|
-
vpclmulqdq \$0x00,$HK,$T2,$Xmi
|
1208
|
-
vpxor $Ij,$T1,$T1
|
1209
|
-
|
1210
|
-
vmovdqu 0x10($inp),$Ii # I[1]
|
1211
|
-
vpxor $Zlo,$Xlo,$Xlo
|
1212
|
-
vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
|
1213
|
-
vpxor $Zhi,$Xhi,$Xhi
|
1214
|
-
vpshufb $bswap,$Ii,$Ii
|
1215
|
-
vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
|
1216
|
-
vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
|
1217
|
-
vpxor $Zmi,$Xmi,$Xmi
|
1218
|
-
vpunpckhqdq $Ii,$Ii,$T2
|
1219
|
-
vpclmulqdq \$0x10,$HK,$T1,$Zmi
|
1220
|
-
vmovdqu 0xb0-0x40($Htbl),$HK
|
1221
|
-
vpxor $Ii,$T2,$T2
|
1222
|
-
|
1223
|
-
vmovdqu ($inp),$Ij # I[0]
|
1224
|
-
vpxor $Xlo,$Zlo,$Zlo
|
1225
|
-
vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
|
1226
|
-
vpxor $Xhi,$Zhi,$Zhi
|
1227
|
-
vpshufb $bswap,$Ij,$Ij
|
1228
|
-
vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
|
1229
|
-
vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
|
1230
|
-
vpxor $Xmi,$Zmi,$Zmi
|
1231
|
-
vpclmulqdq \$0x10,$HK,$T2,$Xmi
|
1232
|
-
|
1233
|
-
lea 0x80($inp),$inp
|
1234
|
-
cmp \$0x80,$len
|
1235
|
-
jb .Ltail_avx
|
1236
|
-
|
1237
|
-
vpxor $Xi,$Ij,$Ij # accumulate $Xi
|
1238
|
-
sub \$0x80,$len
|
1239
|
-
jmp .Loop8x_avx
|
1240
|
-
|
1241
|
-
.align 32
|
1242
|
-
.Loop8x_avx:
|
1243
|
-
vpunpckhqdq $Ij,$Ij,$T1
|
1244
|
-
vmovdqu 0x70($inp),$Ii # I[7]
|
1245
|
-
vpxor $Xlo,$Zlo,$Zlo
|
1246
|
-
vpxor $Ij,$T1,$T1
|
1247
|
-
vpclmulqdq \$0x00,$Hkey,$Ij,$Xi
|
1248
|
-
vpshufb $bswap,$Ii,$Ii
|
1249
|
-
vpxor $Xhi,$Zhi,$Zhi
|
1250
|
-
vpclmulqdq \$0x11,$Hkey,$Ij,$Xo
|
1251
|
-
vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
|
1252
|
-
vpunpckhqdq $Ii,$Ii,$T2
|
1253
|
-
vpxor $Xmi,$Zmi,$Zmi
|
1254
|
-
vpclmulqdq \$0x00,$HK,$T1,$Tred
|
1255
|
-
vmovdqu 0x20-0x40($Htbl),$HK
|
1256
|
-
vpxor $Ii,$T2,$T2
|
1257
|
-
|
1258
|
-
vmovdqu 0x60($inp),$Ij # I[6]
|
1259
|
-
vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
|
1260
|
-
vpxor $Zlo,$Xi,$Xi # collect result
|
1261
|
-
vpshufb $bswap,$Ij,$Ij
|
1262
|
-
vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
|
1263
|
-
vxorps $Zhi,$Xo,$Xo
|
1264
|
-
vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
|
1265
|
-
vpunpckhqdq $Ij,$Ij,$T1
|
1266
|
-
vpclmulqdq \$0x00,$HK, $T2,$Xmi
|
1267
|
-
vpxor $Zmi,$Tred,$Tred
|
1268
|
-
vxorps $Ij,$T1,$T1
|
1269
|
-
|
1270
|
-
vmovdqu 0x50($inp),$Ii # I[5]
|
1271
|
-
vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing
|
1272
|
-
vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
|
1273
|
-
vpxor $Xo,$Tred,$Tred
|
1274
|
-
vpslldq \$8,$Tred,$T2
|
1275
|
-
vpxor $Xlo,$Zlo,$Zlo
|
1276
|
-
vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
|
1277
|
-
vpsrldq \$8,$Tred,$Tred
|
1278
|
-
vpxor $T2, $Xi, $Xi
|
1279
|
-
vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
|
1280
|
-
vpshufb $bswap,$Ii,$Ii
|
1281
|
-
vxorps $Tred,$Xo, $Xo
|
1282
|
-
vpxor $Xhi,$Zhi,$Zhi
|
1283
|
-
vpunpckhqdq $Ii,$Ii,$T2
|
1284
|
-
vpclmulqdq \$0x10,$HK, $T1,$Zmi
|
1285
|
-
vmovdqu 0x50-0x40($Htbl),$HK
|
1286
|
-
vpxor $Ii,$T2,$T2
|
1287
|
-
vpxor $Xmi,$Zmi,$Zmi
|
1288
|
-
|
1289
|
-
vmovdqu 0x40($inp),$Ij # I[4]
|
1290
|
-
vpalignr \$8,$Xi,$Xi,$Tred # 1st phase
|
1291
|
-
vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
|
1292
|
-
vpshufb $bswap,$Ij,$Ij
|
1293
|
-
vpxor $Zlo,$Xlo,$Xlo
|
1294
|
-
vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
|
1295
|
-
vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
|
1296
|
-
vpunpckhqdq $Ij,$Ij,$T1
|
1297
|
-
vpxor $Zhi,$Xhi,$Xhi
|
1298
|
-
vpclmulqdq \$0x00,$HK, $T2,$Xmi
|
1299
|
-
vxorps $Ij,$T1,$T1
|
1300
|
-
vpxor $Zmi,$Xmi,$Xmi
|
1301
|
-
|
1302
|
-
vmovdqu 0x30($inp),$Ii # I[3]
|
1303
|
-
vpclmulqdq \$0x10,(%r10),$Xi,$Xi
|
1304
|
-
vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
|
1305
|
-
vpshufb $bswap,$Ii,$Ii
|
1306
|
-
vpxor $Xlo,$Zlo,$Zlo
|
1307
|
-
vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
|
1308
|
-
vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
|
1309
|
-
vpunpckhqdq $Ii,$Ii,$T2
|
1310
|
-
vpxor $Xhi,$Zhi,$Zhi
|
1311
|
-
vpclmulqdq \$0x10,$HK, $T1,$Zmi
|
1312
|
-
vmovdqu 0x80-0x40($Htbl),$HK
|
1313
|
-
vpxor $Ii,$T2,$T2
|
1314
|
-
vpxor $Xmi,$Zmi,$Zmi
|
1315
|
-
|
1316
|
-
vmovdqu 0x20($inp),$Ij # I[2]
|
1317
|
-
vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
|
1318
|
-
vpshufb $bswap,$Ij,$Ij
|
1319
|
-
vpxor $Zlo,$Xlo,$Xlo
|
1320
|
-
vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
|
1321
|
-
vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
|
1322
|
-
vpunpckhqdq $Ij,$Ij,$T1
|
1323
|
-
vpxor $Zhi,$Xhi,$Xhi
|
1324
|
-
vpclmulqdq \$0x00,$HK, $T2,$Xmi
|
1325
|
-
vpxor $Ij,$T1,$T1
|
1326
|
-
vpxor $Zmi,$Xmi,$Xmi
|
1327
|
-
vxorps $Tred,$Xi,$Xi
|
1328
|
-
|
1329
|
-
vmovdqu 0x10($inp),$Ii # I[1]
|
1330
|
-
vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase
|
1331
|
-
vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
|
1332
|
-
vpshufb $bswap,$Ii,$Ii
|
1333
|
-
vpxor $Xlo,$Zlo,$Zlo
|
1334
|
-
vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
|
1335
|
-
vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
|
1336
|
-
vpclmulqdq \$0x10,(%r10),$Xi,$Xi
|
1337
|
-
vxorps $Xo,$Tred,$Tred
|
1338
|
-
vpunpckhqdq $Ii,$Ii,$T2
|
1339
|
-
vpxor $Xhi,$Zhi,$Zhi
|
1340
|
-
vpclmulqdq \$0x10,$HK, $T1,$Zmi
|
1341
|
-
vmovdqu 0xb0-0x40($Htbl),$HK
|
1342
|
-
vpxor $Ii,$T2,$T2
|
1343
|
-
vpxor $Xmi,$Zmi,$Zmi
|
1344
|
-
|
1345
|
-
vmovdqu ($inp),$Ij # I[0]
|
1346
|
-
vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
|
1347
|
-
vpshufb $bswap,$Ij,$Ij
|
1348
|
-
vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
|
1349
|
-
vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
|
1350
|
-
vpxor $Tred,$Ij,$Ij
|
1351
|
-
vpclmulqdq \$0x10,$HK, $T2,$Xmi
|
1352
|
-
vpxor $Xi,$Ij,$Ij # accumulate $Xi
|
1353
|
-
|
1354
|
-
lea 0x80($inp),$inp
|
1355
|
-
sub \$0x80,$len
|
1356
|
-
jnc .Loop8x_avx
|
1357
|
-
|
1358
|
-
add \$0x80,$len
|
1359
|
-
jmp .Ltail_no_xor_avx
|
1360
|
-
|
1361
|
-
.align 32
|
1362
|
-
.Lshort_avx:
|
1363
|
-
vmovdqu -0x10($inp,$len),$Ii # very last word
|
1364
|
-
lea ($inp,$len),$inp
|
1365
|
-
vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
|
1366
|
-
vmovdqu 0x20-0x40($Htbl),$HK
|
1367
|
-
vpshufb $bswap,$Ii,$Ij
|
1368
|
-
|
1369
|
-
vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo,
|
1370
|
-
vmovdqa $Xhi,$Zhi # $Zhi and
|
1371
|
-
vmovdqa $Xmi,$Zmi # $Zmi
|
1372
|
-
sub \$0x10,$len
|
1373
|
-
jz .Ltail_avx
|
1374
|
-
|
1375
|
-
vpunpckhqdq $Ij,$Ij,$T1
|
1376
|
-
vpxor $Xlo,$Zlo,$Zlo
|
1377
|
-
vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
|
1378
|
-
vpxor $Ij,$T1,$T1
|
1379
|
-
vmovdqu -0x20($inp),$Ii
|
1380
|
-
vpxor $Xhi,$Zhi,$Zhi
|
1381
|
-
vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
|
1382
|
-
vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
|
1383
|
-
vpshufb $bswap,$Ii,$Ij
|
1384
|
-
vpxor $Xmi,$Zmi,$Zmi
|
1385
|
-
vpclmulqdq \$0x00,$HK,$T1,$Xmi
|
1386
|
-
vpsrldq \$8,$HK,$HK
|
1387
|
-
sub \$0x10,$len
|
1388
|
-
jz .Ltail_avx
|
1389
|
-
|
1390
|
-
vpunpckhqdq $Ij,$Ij,$T1
|
1391
|
-
vpxor $Xlo,$Zlo,$Zlo
|
1392
|
-
vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
|
1393
|
-
vpxor $Ij,$T1,$T1
|
1394
|
-
vmovdqu -0x30($inp),$Ii
|
1395
|
-
vpxor $Xhi,$Zhi,$Zhi
|
1396
|
-
vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
|
1397
|
-
vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
|
1398
|
-
vpshufb $bswap,$Ii,$Ij
|
1399
|
-
vpxor $Xmi,$Zmi,$Zmi
|
1400
|
-
vpclmulqdq \$0x00,$HK,$T1,$Xmi
|
1401
|
-
vmovdqu 0x50-0x40($Htbl),$HK
|
1402
|
-
sub \$0x10,$len
|
1403
|
-
jz .Ltail_avx
|
1404
|
-
|
1405
|
-
vpunpckhqdq $Ij,$Ij,$T1
|
1406
|
-
vpxor $Xlo,$Zlo,$Zlo
|
1407
|
-
vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
|
1408
|
-
vpxor $Ij,$T1,$T1
|
1409
|
-
vmovdqu -0x40($inp),$Ii
|
1410
|
-
vpxor $Xhi,$Zhi,$Zhi
|
1411
|
-
vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
|
1412
|
-
vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
|
1413
|
-
vpshufb $bswap,$Ii,$Ij
|
1414
|
-
vpxor $Xmi,$Zmi,$Zmi
|
1415
|
-
vpclmulqdq \$0x00,$HK,$T1,$Xmi
|
1416
|
-
vpsrldq \$8,$HK,$HK
|
1417
|
-
sub \$0x10,$len
|
1418
|
-
jz .Ltail_avx
|
1419
|
-
|
1420
|
-
vpunpckhqdq $Ij,$Ij,$T1
|
1421
|
-
vpxor $Xlo,$Zlo,$Zlo
|
1422
|
-
vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
|
1423
|
-
vpxor $Ij,$T1,$T1
|
1424
|
-
vmovdqu -0x50($inp),$Ii
|
1425
|
-
vpxor $Xhi,$Zhi,$Zhi
|
1426
|
-
vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
|
1427
|
-
vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
|
1428
|
-
vpshufb $bswap,$Ii,$Ij
|
1429
|
-
vpxor $Xmi,$Zmi,$Zmi
|
1430
|
-
vpclmulqdq \$0x00,$HK,$T1,$Xmi
|
1431
|
-
vmovdqu 0x80-0x40($Htbl),$HK
|
1432
|
-
sub \$0x10,$len
|
1433
|
-
jz .Ltail_avx
|
1434
|
-
|
1435
|
-
vpunpckhqdq $Ij,$Ij,$T1
|
1436
|
-
vpxor $Xlo,$Zlo,$Zlo
|
1437
|
-
vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
|
1438
|
-
vpxor $Ij,$T1,$T1
|
1439
|
-
vmovdqu -0x60($inp),$Ii
|
1440
|
-
vpxor $Xhi,$Zhi,$Zhi
|
1441
|
-
vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
|
1442
|
-
vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
|
1443
|
-
vpshufb $bswap,$Ii,$Ij
|
1444
|
-
vpxor $Xmi,$Zmi,$Zmi
|
1445
|
-
vpclmulqdq \$0x00,$HK,$T1,$Xmi
|
1446
|
-
vpsrldq \$8,$HK,$HK
|
1447
|
-
sub \$0x10,$len
|
1448
|
-
jz .Ltail_avx
|
1449
|
-
|
1450
|
-
vpunpckhqdq $Ij,$Ij,$T1
|
1451
|
-
vpxor $Xlo,$Zlo,$Zlo
|
1452
|
-
vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
|
1453
|
-
vpxor $Ij,$T1,$T1
|
1454
|
-
vmovdqu -0x70($inp),$Ii
|
1455
|
-
vpxor $Xhi,$Zhi,$Zhi
|
1456
|
-
vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
|
1457
|
-
vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
|
1458
|
-
vpshufb $bswap,$Ii,$Ij
|
1459
|
-
vpxor $Xmi,$Zmi,$Zmi
|
1460
|
-
vpclmulqdq \$0x00,$HK,$T1,$Xmi
|
1461
|
-
vmovq 0xb8-0x40($Htbl),$HK
|
1462
|
-
sub \$0x10,$len
|
1463
|
-
jmp .Ltail_avx
|
1464
|
-
|
1465
|
-
.align 32
|
1466
|
-
.Ltail_avx:
|
1467
|
-
vpxor $Xi,$Ij,$Ij # accumulate $Xi
|
1468
|
-
.Ltail_no_xor_avx:
|
1469
|
-
vpunpckhqdq $Ij,$Ij,$T1
|
1470
|
-
vpxor $Xlo,$Zlo,$Zlo
|
1471
|
-
vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
|
1472
|
-
vpxor $Ij,$T1,$T1
|
1473
|
-
vpxor $Xhi,$Zhi,$Zhi
|
1474
|
-
vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
|
1475
|
-
vpxor $Xmi,$Zmi,$Zmi
|
1476
|
-
vpclmulqdq \$0x00,$HK,$T1,$Xmi
|
1477
|
-
|
1478
|
-
vmovdqu (%r10),$Tred
|
1479
|
-
|
1480
|
-
vpxor $Xlo,$Zlo,$Xi
|
1481
|
-
vpxor $Xhi,$Zhi,$Xo
|
1482
|
-
vpxor $Xmi,$Zmi,$Zmi
|
1483
|
-
|
1484
|
-
vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing
|
1485
|
-
vpxor $Xo, $Zmi,$Zmi
|
1486
|
-
vpslldq \$8, $Zmi,$T2
|
1487
|
-
vpsrldq \$8, $Zmi,$Zmi
|
1488
|
-
vpxor $T2, $Xi, $Xi
|
1489
|
-
vpxor $Zmi,$Xo, $Xo
|
1490
|
-
|
1491
|
-
vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase
|
1492
|
-
vpalignr \$8,$Xi,$Xi,$Xi
|
1493
|
-
vpxor $T2,$Xi,$Xi
|
1494
|
-
|
1495
|
-
vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase
|
1496
|
-
vpalignr \$8,$Xi,$Xi,$Xi
|
1497
|
-
vpxor $Xo,$Xi,$Xi
|
1498
|
-
vpxor $T2,$Xi,$Xi
|
1499
|
-
|
1500
|
-
cmp \$0,$len
|
1501
|
-
jne .Lshort_avx
|
1502
|
-
|
1503
|
-
vpshufb $bswap,$Xi,$Xi
|
1504
|
-
vmovdqu $Xi,($Xip)
|
1505
|
-
vzeroupper
|
1506
|
-
___
|
1507
|
-
$code.=<<___ if ($win64);
|
1508
|
-
movaps (%rsp),%xmm6
|
1509
|
-
movaps 0x10(%rsp),%xmm7
|
1510
|
-
movaps 0x20(%rsp),%xmm8
|
1511
|
-
movaps 0x30(%rsp),%xmm9
|
1512
|
-
movaps 0x40(%rsp),%xmm10
|
1513
|
-
movaps 0x50(%rsp),%xmm11
|
1514
|
-
movaps 0x60(%rsp),%xmm12
|
1515
|
-
movaps 0x70(%rsp),%xmm13
|
1516
|
-
movaps 0x80(%rsp),%xmm14
|
1517
|
-
movaps 0x90(%rsp),%xmm15
|
1518
|
-
lea 0xa8(%rsp),%rsp
|
1519
|
-
.LSEH_end_gcm_ghash_avx:
|
1520
|
-
___
|
1521
|
-
$code.=<<___;
|
1522
|
-
ret
|
1523
|
-
.size gcm_ghash_avx,.-gcm_ghash_avx
|
1524
|
-
___
|
1525
|
-
} else {
|
1526
|
-
$code.=<<___;
|
1527
|
-
jmp .L_ghash_clmul
|
1528
|
-
.size gcm_ghash_avx,.-gcm_ghash_avx
|
1529
|
-
___
|
1530
|
-
}
|
1531
|
-
|
1532
|
-
# Constant data appended to the generated assembly.  The text is emitted
# as written, except that back-quoted expressions are replaced by their
# evaluated value by the substitution applied to $code at the end of this
# script.
#
#   .Lbswap_mask        16 bytes counting down from 15 to 0.
#                       NOTE(review): presumably the byte-reversal shuffle
#                       mask used through $bswap - confirm against the loads
#                       earlier in the file.
#   .L0x1c2_polynomial  16 bytes: first byte 1, last byte 0xc2, zeros between.
#   .L7_mask            four 32-bit words 7,0,7,0.
#   .L7_mask_poly       four 32-bit words 7,0,(0xE1<<1),0.
#   .Lrem_4bit          16 entries of two 32-bit words each: a zero word
#                       followed by a 16-bit pattern shifted left by 16.
#   .Lrem_8bit          256 16-bit entries; every 16th entry carries the same
#                       16-bit pattern as the matching .Lrem_4bit entry.
#                       NOTE(review): judging by the names these look like
#                       reduction-remainder tables for the table-driven
#                       (4-bit) code path - verify against that code.
#
# The block ends with an identification string and 64-byte alignment.
$code.=<<___;
|
1533
|
-
.align 64
|
1534
|
-
.Lbswap_mask:
|
1535
|
-
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
|
1536
|
-
.L0x1c2_polynomial:
|
1537
|
-
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
|
1538
|
-
.L7_mask:
|
1539
|
-
.long 7,0,7,0
|
1540
|
-
.L7_mask_poly:
|
1541
|
-
.long 7,0,`0xE1<<1`,0
|
1542
|
-
.align 64
|
1543
|
-
.type .Lrem_4bit,\@object
|
1544
|
-
.Lrem_4bit:
|
1545
|
-
.long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
|
1546
|
-
.long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
|
1547
|
-
.long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
|
1548
|
-
.long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
|
1549
|
-
.type .Lrem_8bit,\@object
|
1550
|
-
.Lrem_8bit:
|
1551
|
-
.value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
|
1552
|
-
.value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
|
1553
|
-
.value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
|
1554
|
-
.value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
|
1555
|
-
.value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
|
1556
|
-
.value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
|
1557
|
-
.value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
|
1558
|
-
.value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
|
1559
|
-
.value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
|
1560
|
-
.value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
|
1561
|
-
.value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
|
1562
|
-
.value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
|
1563
|
-
.value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
|
1564
|
-
.value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
|
1565
|
-
.value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
|
1566
|
-
.value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
|
1567
|
-
.value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
|
1568
|
-
.value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
|
1569
|
-
.value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
|
1570
|
-
.value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
|
1571
|
-
.value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
|
1572
|
-
.value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
|
1573
|
-
.value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
|
1574
|
-
.value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
|
1575
|
-
.value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
|
1576
|
-
.value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
|
1577
|
-
.value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
|
1578
|
-
.value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
|
1579
|
-
.value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
|
1580
|
-
.value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
|
1581
|
-
.value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
|
1582
|
-
.value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
|
1583
|
-
|
1584
|
-
.asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
|
1585
|
-
.align 64
|
1586
|
-
___
|
1587
|
-
|
1588
|
-
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
|
1589
|
-
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
|
1590
|
-
if ($win64) {
|
1591
|
-
# Registers carrying the four handler arguments, in prototype order
# (rec, frame, context, disp).  These are the first four integer argument
# registers of the Microsoft x64 calling convention.  The handler code
# emitted below reads only $context and $disp.
$rec="%rcx";
|
1592
|
-
$frame="%rdx";
|
1593
|
-
$context="%r8";
|
1594
|
-
$disp="%r9";
|
1595
|
-
|
1596
|
-
# Emits the Win64 language-specific exception handler se_handler, followed
# by the .pdata entries for the non-AVX entry points.
#
# se_handler, as written below:
#   1. Saves rsi, rdi, rbx, rbp, r12-r15 and the flags, then reserves
#      64 bytes of stack (used for the stack-passed arguments 5..8 of the
#      call in step 5).
#   2. Loads context->Rax and context->Rip, then the two HandlerData words,
#      which are image-relative offsets of a prologue label and an epilogue
#      label (ImageBase is added to each before comparing).
#   3. If Rip is below the prologue label, or at/after the epilogue label,
#      it jumps to .Lin_prologue without touching Rbx/Rbp/R12.  In the first
#      case rax still holds context->Rax; in the second it holds
#      context->Rsp.  Otherwise rax becomes context->Rsp+24 and the three
#      quadwords just below it are written back as context->Rbx, Rbp, R12.
#   4. At .Lin_prologue it reloads rdi/rsi from 8(rax)/16(rax) and stores
#      rax, rsi, rdi into context->Rsp, Rsi, Rdi.
#   5. Copies the context (1232/8 quadwords) to disp->ContextRecord using
#      the hand-encoded "cld; rep movsq", calls RtlVirtualUnwind through its
#      import pointer with the eight arguments annotated inline, and returns
#      1 (ExceptionContinueSearch) after restoring the saved registers.
#
# Each .pdata entry is a triple of image-relative addresses: function
# begin label, function end label, and the matching .LSEH_info_* label.
# NOTE(review): the numeric CONTEXT / DISPATCHER_CONTEXT offsets are taken
# from the inline comments; confirm them against the Windows SDK headers
# before changing any of them.
$code.=<<___;
|
1597
|
-
.extern __imp_RtlVirtualUnwind
|
1598
|
-
.type se_handler,\@abi-omnipotent
|
1599
|
-
.align 16
|
1600
|
-
se_handler:
|
1601
|
-
push %rsi
|
1602
|
-
push %rdi
|
1603
|
-
push %rbx
|
1604
|
-
push %rbp
|
1605
|
-
push %r12
|
1606
|
-
push %r13
|
1607
|
-
push %r14
|
1608
|
-
push %r15
|
1609
|
-
pushfq
|
1610
|
-
sub \$64,%rsp
|
1611
|
-
|
1612
|
-
mov 120($context),%rax # pull context->Rax
|
1613
|
-
mov 248($context),%rbx # pull context->Rip
|
1614
|
-
|
1615
|
-
mov 8($disp),%rsi # disp->ImageBase
|
1616
|
-
mov 56($disp),%r11 # disp->HandlerData
|
1617
|
-
|
1618
|
-
mov 0(%r11),%r10d # HandlerData[0]
|
1619
|
-
lea (%rsi,%r10),%r10 # prologue label
|
1620
|
-
cmp %r10,%rbx # context->Rip<prologue label
|
1621
|
-
jb .Lin_prologue
|
1622
|
-
|
1623
|
-
mov 152($context),%rax # pull context->Rsp
|
1624
|
-
|
1625
|
-
mov 4(%r11),%r10d # HandlerData[1]
|
1626
|
-
lea (%rsi,%r10),%r10 # epilogue label
|
1627
|
-
cmp %r10,%rbx # context->Rip>=epilogue label
|
1628
|
-
jae .Lin_prologue
|
1629
|
-
|
1630
|
-
lea 24(%rax),%rax # adjust "rsp"
|
1631
|
-
|
1632
|
-
mov -8(%rax),%rbx
|
1633
|
-
mov -16(%rax),%rbp
|
1634
|
-
mov -24(%rax),%r12
|
1635
|
-
mov %rbx,144($context) # restore context->Rbx
|
1636
|
-
mov %rbp,160($context) # restore context->Rbp
|
1637
|
-
mov %r12,216($context) # restore context->R12
|
1638
|
-
|
1639
|
-
.Lin_prologue:
|
1640
|
-
mov 8(%rax),%rdi
|
1641
|
-
mov 16(%rax),%rsi
|
1642
|
-
mov %rax,152($context) # restore context->Rsp
|
1643
|
-
mov %rsi,168($context) # restore context->Rsi
|
1644
|
-
mov %rdi,176($context) # restore context->Rdi
|
1645
|
-
|
1646
|
-
mov 40($disp),%rdi # disp->ContextRecord
|
1647
|
-
mov $context,%rsi # context
|
1648
|
-
mov \$`1232/8`,%ecx # sizeof(CONTEXT)
|
1649
|
-
.long 0xa548f3fc # cld; rep movsq
|
1650
|
-
|
1651
|
-
mov $disp,%rsi
|
1652
|
-
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
|
1653
|
-
mov 8(%rsi),%rdx # arg2, disp->ImageBase
|
1654
|
-
mov 0(%rsi),%r8 # arg3, disp->ControlPc
|
1655
|
-
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
|
1656
|
-
mov 40(%rsi),%r10 # disp->ContextRecord
|
1657
|
-
lea 56(%rsi),%r11 # &disp->HandlerData
|
1658
|
-
lea 24(%rsi),%r12 # &disp->EstablisherFrame
|
1659
|
-
mov %r10,32(%rsp) # arg5
|
1660
|
-
mov %r11,40(%rsp) # arg6
|
1661
|
-
mov %r12,48(%rsp) # arg7
|
1662
|
-
mov %rcx,56(%rsp) # arg8, (NULL)
|
1663
|
-
call *__imp_RtlVirtualUnwind(%rip)
|
1664
|
-
|
1665
|
-
mov \$1,%eax # ExceptionContinueSearch
|
1666
|
-
add \$64,%rsp
|
1667
|
-
popfq
|
1668
|
-
pop %r15
|
1669
|
-
pop %r14
|
1670
|
-
pop %r13
|
1671
|
-
pop %r12
|
1672
|
-
pop %rbp
|
1673
|
-
pop %rbx
|
1674
|
-
pop %rdi
|
1675
|
-
pop %rsi
|
1676
|
-
ret
|
1677
|
-
.size se_handler,.-se_handler
|
1678
|
-
|
1679
|
-
.section .pdata
|
1680
|
-
.align 4
|
1681
|
-
.rva .LSEH_begin_gcm_gmult_4bit
|
1682
|
-
.rva .LSEH_end_gcm_gmult_4bit
|
1683
|
-
.rva .LSEH_info_gcm_gmult_4bit
|
1684
|
-
|
1685
|
-
.rva .LSEH_begin_gcm_ghash_4bit
|
1686
|
-
.rva .LSEH_end_gcm_ghash_4bit
|
1687
|
-
.rva .LSEH_info_gcm_ghash_4bit
|
1688
|
-
|
1689
|
-
.rva .LSEH_begin_gcm_init_clmul
|
1690
|
-
.rva .LSEH_end_gcm_init_clmul
|
1691
|
-
.rva .LSEH_info_gcm_init_clmul
|
1692
|
-
|
1693
|
-
.rva .LSEH_begin_gcm_ghash_clmul
|
1694
|
-
.rva .LSEH_end_gcm_ghash_clmul
|
1695
|
-
.rva .LSEH_info_gcm_ghash_clmul
|
1696
|
-
___
|
1697
|
-
# Extra .pdata entries, appended only when $avx is set.  The two AVX entry
# points have no unwind-info records of their own: gcm_init_avx points at
# the gcm_init_clmul record and gcm_ghash_avx at the gcm_ghash_clmul one.
# NOTE(review): this sharing is only valid while the AVX prologues keep the
# same stack layout as their CLMUL counterparts - confirm when editing
# either prologue.
$code.=<<___ if ($avx);
|
1698
|
-
.rva .LSEH_begin_gcm_init_avx
|
1699
|
-
.rva .LSEH_end_gcm_init_avx
|
1700
|
-
.rva .LSEH_info_gcm_init_clmul
|
1701
|
-
|
1702
|
-
.rva .LSEH_begin_gcm_ghash_avx
|
1703
|
-
.rva .LSEH_end_gcm_ghash_avx
|
1704
|
-
.rva .LSEH_info_gcm_ghash_clmul
|
1705
|
-
___
|
1706
|
-
# Emits the .xdata records referenced from the .pdata entries.
#
# The two 4-bit records each consist of the header bytes 9,0,0,0, the
# address of se_handler, and a HandlerData pair holding the function's
# prologue and epilogue labels (the two words se_handler reads).
#
# The two CLMUL records do not name a handler; they list unwind codes
# directly.  Going by the inline annotations, gcm_init_clmul describes
# "sub rsp,0x18" plus a save of xmm6, and gcm_ghash_clmul describes
# "sub rsp,0xa8" plus saves of xmm6..xmm15 at 0x00..0x90(rsp), listed in
# reverse order of the saves.
# NOTE(review): the first byte of each code is presumably the prologue
# offset of the described instruction - confirm against the emitted
# prologues before changing them.
$code.=<<___;
|
1707
|
-
.section .xdata
|
1708
|
-
.align 8
|
1709
|
-
.LSEH_info_gcm_gmult_4bit:
|
1710
|
-
.byte 9,0,0,0
|
1711
|
-
.rva se_handler
|
1712
|
-
.rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData
|
1713
|
-
.LSEH_info_gcm_ghash_4bit:
|
1714
|
-
.byte 9,0,0,0
|
1715
|
-
.rva se_handler
|
1716
|
-
.rva .Lghash_prologue,.Lghash_epilogue # HandlerData
|
1717
|
-
.LSEH_info_gcm_init_clmul:
|
1718
|
-
.byte 0x01,0x08,0x03,0x00
|
1719
|
-
.byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
|
1720
|
-
.byte 0x04,0x22,0x00,0x00 #sub rsp,0x18
|
1721
|
-
.LSEH_info_gcm_ghash_clmul:
|
1722
|
-
.byte 0x01,0x33,0x16,0x00
|
1723
|
-
.byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15
|
1724
|
-
.byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14
|
1725
|
-
.byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13
|
1726
|
-
.byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12
|
1727
|
-
.byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11
|
1728
|
-
.byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
|
1729
|
-
.byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
|
1730
|
-
.byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
|
1731
|
-
.byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
|
1732
|
-
.byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
|
1733
|
-
.byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
|
1734
|
-
___
|
1735
|
-
}
|
1736
|
-
|
1737
|
-
# Final pass: replace every back-quoted expression in the accumulated
# assembly text with the result of evaluating it as Perl, so constants
# written as expressions are emitted as plain numbers.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
|
1738
|
-
|
1739
|
-
# Write the finished assembly to STDOUT.
print $code;
|
1740
|
-
|
1741
|
-
# Output is buffered, so a write failure (full disk, closed pipe) may only
# show up when the handle is flushed on close.  Fail loudly rather than
# leave a silently truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";
|