rbnacl-libsodium 1.0.3 → 1.0.4
- checksums.yaml +4 -4
- data/CHANGES.md +4 -0
- data/lib/rbnacl/libsodium/version.rb +1 -1
- data/vendor/libsodium/AUTHORS +3 -0
- data/vendor/libsodium/ChangeLog +21 -0
- data/vendor/libsodium/Makefile.in +9 -0
- data/vendor/libsodium/THANKS +3 -0
- data/vendor/libsodium/aclocal.m4 +1 -0
- data/vendor/libsodium/autom4te.cache/output.1 +919 -85
- data/vendor/libsodium/autom4te.cache/output.5 +18351 -0
- data/vendor/libsodium/autom4te.cache/requests +960 -725
- data/vendor/libsodium/autom4te.cache/traces.1 +289 -247
- data/vendor/libsodium/autom4te.cache/traces.5 +3032 -0
- data/vendor/libsodium/build-aux/ltmain.sh +70 -11
- data/vendor/libsodium/builds/msvc/properties/ARM.props +20 -0
- data/vendor/libsodium/builds/msvc/properties/ReleaseDEXE.props +1 -1
- data/vendor/libsodium/builds/msvc/version.h +2 -2
- data/vendor/libsodium/builds/msvc/vs2010/libsodium/libsodium.vcxproj +3 -1
- data/vendor/libsodium/builds/msvc/vs2010/libsodium/libsodium.vcxproj.filters +13 -1
- data/vendor/libsodium/builds/msvc/vs2010/test/test.vcxproj +244 -241
- data/vendor/libsodium/builds/msvc/vs2010/test/test.vcxproj.filters +192 -189
- data/vendor/libsodium/builds/msvc/vs2012/libsodium/libsodium.vcxproj +2 -0
- data/vendor/libsodium/builds/msvc/vs2012/libsodium/libsodium.vcxproj.filters +13 -1
- data/vendor/libsodium/builds/msvc/vs2012/test/test.vcxproj +244 -241
- data/vendor/libsodium/builds/msvc/vs2012/test/test.vcxproj.filters +192 -189
- data/vendor/libsodium/builds/msvc/vs2013/libsodium/libsodium.vcxproj +3 -1
- data/vendor/libsodium/builds/msvc/vs2013/libsodium/libsodium.vcxproj.filters +13 -1
- data/vendor/libsodium/builds/msvc/vs2013/test/test.vcxproj +244 -241
- data/vendor/libsodium/builds/msvc/vs2013/test/test.vcxproj.filters +192 -189
- data/vendor/libsodium/builds/msvc/vs2015/libsodium/libsodium.vcxproj +3 -1
- data/vendor/libsodium/builds/msvc/vs2015/libsodium/libsodium.vcxproj.filters +13 -1
- data/vendor/libsodium/builds/msvc/vs2015/test/test.vcxproj +244 -241
- data/vendor/libsodium/builds/msvc/vs2015/test/test.vcxproj.filters +192 -189
- data/vendor/libsodium/configure +918 -84
- data/vendor/libsodium/configure.ac +89 -15
- data/vendor/libsodium/dist-build/Makefile.am +6 -2
- data/vendor/libsodium/dist-build/Makefile.in +15 -2
- data/vendor/libsodium/dist-build/android-armv8-a.sh +4 -0
- data/vendor/libsodium/dist-build/android-build.sh +9 -9
- data/vendor/libsodium/dist-build/android-mips64.sh +4 -0
- data/vendor/libsodium/dist-build/android-x86-64.sh +4 -0
- data/vendor/libsodium/dist-build/emscripten.sh +3 -3
- data/vendor/libsodium/dist-build/ios.sh +5 -5
- data/vendor/libsodium/dist-build/nativeclient.sh +28 -0
- data/vendor/libsodium/examples/Makefile +21 -0
- data/vendor/libsodium/examples/auth.c +68 -0
- data/vendor/libsodium/examples/box.c +133 -0
- data/vendor/libsodium/examples/box_detached.c +132 -0
- data/vendor/libsodium/examples/generichash.c +80 -0
- data/vendor/libsodium/examples/generichash_stream.c +58 -0
- data/vendor/libsodium/examples/shorthash.c +58 -0
- data/vendor/libsodium/examples/sign.c +78 -0
- data/vendor/libsodium/examples/utils.h +106 -0
- data/vendor/libsodium/libsodium-uninstalled.pc.in +1 -1
- data/vendor/libsodium/libsodium.vcxproj +2 -0
- data/vendor/libsodium/libsodium.vcxproj.filters +6 -0
- data/vendor/libsodium/m4/ax_check_compile_flag.m4 +2 -2
- data/vendor/libsodium/m4/ax_check_define.m4 +92 -0
- data/vendor/libsodium/m4/ax_check_link_flag.m4 +3 -2
- data/vendor/libsodium/m4/libtool.m4 +111 -60
- data/vendor/libsodium/m4/ltoptions.m4 +1 -1
- data/vendor/libsodium/m4/ltsugar.m4 +1 -1
- data/vendor/libsodium/m4/ltversion.m4 +6 -6
- data/vendor/libsodium/m4/lt~obsolete.m4 +1 -1
- data/vendor/libsodium/msvc-scripts/Makefile.in +9 -0
- data/vendor/libsodium/msvc-scripts/process.bat +2 -2
- data/vendor/libsodium/src/Makefile.in +9 -0
- data/vendor/libsodium/src/libsodium/Makefile.am +27 -3
- data/vendor/libsodium/src/libsodium/Makefile.in +170 -63
- data/vendor/libsodium/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c +852 -0
- data/vendor/libsodium/src/libsodium/crypto_aead/chacha20poly1305/sodium/aead_chacha20poly1305.c +137 -17
- data/vendor/libsodium/src/libsodium/crypto_auth/hmacsha256/cp/hmac_hmacsha256.c +1 -0
- data/vendor/libsodium/src/libsodium/crypto_auth/hmacsha512/cp/hmac_hmacsha512.c +1 -0
- data/vendor/libsodium/src/libsodium/crypto_generichash/blake2/ref/blake2b-ref.c +10 -6
- data/vendor/libsodium/src/libsodium/crypto_generichash/crypto_generichash.c +1 -1
- data/vendor/libsodium/src/libsodium/crypto_hash/sha256/cp/hash_sha256.c +29 -23
- data/vendor/libsodium/src/libsodium/crypto_hash/sha512/cp/hash_sha512.c +9 -10
- data/vendor/libsodium/src/libsodium/crypto_pwhash/scryptsalsa208sha256/crypto_scrypt-common.c +4 -2
- data/vendor/libsodium/src/libsodium/crypto_pwhash/scryptsalsa208sha256/crypto_scrypt.h +1 -0
- data/vendor/libsodium/src/libsodium/crypto_pwhash/scryptsalsa208sha256/nosse/pwhash_scryptsalsa208sha256_nosse.c +4 -0
- data/vendor/libsodium/src/libsodium/crypto_pwhash/scryptsalsa208sha256/pbkdf2-sha256.c +3 -0
- data/vendor/libsodium/src/libsodium/crypto_pwhash/scryptsalsa208sha256/sse/pwhash_scryptsalsa208sha256_sse.c +2 -1
- data/vendor/libsodium/src/libsodium/crypto_scalarmult/curve25519/ref10/fe_frombytes_curve25519_ref10.c +10 -10
- data/vendor/libsodium/src/libsodium/crypto_stream/chacha20/ref/api.h +10 -0
- data/vendor/libsodium/src/libsodium/crypto_stream/chacha20/ref/stream_chacha20_ref.c +51 -0
- data/vendor/libsodium/src/libsodium/crypto_stream/chacha20/stream_chacha20_api.c +29 -0
- data/vendor/libsodium/src/libsodium/include/Makefile.am +6 -0
- data/vendor/libsodium/src/libsodium/include/Makefile.in +67 -40
- data/vendor/libsodium/src/libsodium/include/sodium.h +4 -0
- data/vendor/libsodium/src/libsodium/include/sodium/crypto_aead_aes256gcm.h +88 -0
- data/vendor/libsodium/src/libsodium/include/sodium/crypto_aead_chacha20poly1305.h +28 -2
- data/vendor/libsodium/src/libsodium/include/sodium/crypto_generichash_blake2b.h +0 -6
- data/vendor/libsodium/src/libsodium/include/sodium/crypto_hash_sha256.h +1 -1
- data/vendor/libsodium/src/libsodium/include/sodium/crypto_stream_chacha20.h +23 -0
- data/vendor/libsodium/src/libsodium/include/sodium/export.h +8 -0
- data/vendor/libsodium/src/libsodium/include/sodium/randombytes_nativeclient.h +37 -0
- data/vendor/libsodium/src/libsodium/include/sodium/randombytes_salsa20_random.h +3 -2
- data/vendor/libsodium/src/libsodium/include/sodium/runtime.h +6 -0
- data/vendor/libsodium/src/libsodium/include/sodium/utils.h +15 -1
- data/vendor/libsodium/src/libsodium/randombytes/nativeclient/randombytes_nativeclient.c +49 -0
- data/vendor/libsodium/src/libsodium/randombytes/randombytes.c +11 -1
- data/vendor/libsodium/src/libsodium/randombytes/salsa20/randombytes_salsa20_random.c +71 -45
- data/vendor/libsodium/src/libsodium/randombytes/sysrandom/randombytes_sysrandom.c +12 -2
- data/vendor/libsodium/src/libsodium/sodium/runtime.c +26 -3
- data/vendor/libsodium/src/libsodium/sodium/utils.c +86 -13
- data/vendor/libsodium/test/Makefile.in +9 -0
- data/vendor/libsodium/test/default/Makefile.am +130 -0
- data/vendor/libsodium/test/default/Makefile.in +197 -50
- data/vendor/libsodium/test/default/aead_aes256gcm.c +3197 -0
- data/vendor/libsodium/test/default/aead_aes256gcm.exp +1 -0
- data/vendor/libsodium/test/default/aead_chacha20poly1305.c +150 -17
- data/vendor/libsodium/test/default/aead_chacha20poly1305.exp +51 -0
- data/vendor/libsodium/test/default/chacha20.c +80 -5
- data/vendor/libsodium/test/default/chacha20.exp +11 -0
- data/vendor/libsodium/test/default/generichash.c +1332 -1
- data/vendor/libsodium/test/default/generichash.exp +1 -1
- data/vendor/libsodium/test/default/generichash2.c +3 -2
- data/vendor/libsodium/test/default/generichash3.c +2 -1
- data/vendor/libsodium/test/default/nacl-test-wrapper.sh +26 -0
- data/vendor/libsodium/test/default/randombytes.c +4 -0
- data/vendor/libsodium/test/default/scalarmult.c +33 -14
- data/vendor/libsodium/test/default/scalarmult.exp +4 -4
- data/vendor/libsodium/test/default/secretbox.c +9 -0
- data/vendor/libsodium/test/default/secretbox.exp +19 -0
- data/vendor/libsodium/test/default/secretbox_easy.c +40 -4
- data/vendor/libsodium/test/default/secretbox_easy.exp +58 -1
- data/vendor/libsodium/test/default/sodium_core.c +2 -0
- data/vendor/libsodium/test/default/sodium_utils.c +46 -7
- data/vendor/libsodium/test/default/sodium_utils.exp +5 -0
- data/vendor/libsodium/test/default/stream.c +1 -0
- metadata +28 -3
data/vendor/libsodium/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c (new file)
@@ -0,0 +1,852 @@

/*
 * AES256-GCM, based on original code by Romain Dolbeau
 */

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include "crypto_aead_aes256gcm.h"
#include "export.h"
#include "runtime.h"
#include "utils.h"

#if defined(HAVE_WMMINTRIN_H) || \
    (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)))

#pragma GCC target("ssse3")
#pragma GCC target("aes")
#pragma GCC target("pclmul")

#include <immintrin.h>

#if defined(__INTEL_COMPILER) || defined(_bswap64)
#elif defined(_MSC_VER)
# define _bswap64(a) _byteswap_uint64(a)
#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))
# define _bswap64(a) __builtin_bswap64(a)
#else
static inline uint64_t
_bswap64(const uint64_t x)
{
    return
        ((x << 56) & 0xFF00000000000000UL) | ((x << 40) & 0x00FF000000000000UL) |
        ((x << 24) & 0x0000FF0000000000UL) | ((x << 8) & 0x000000FF00000000UL) |
        ((x >> 8) & 0x00000000FF000000UL) | ((x >> 24) & 0x0000000000FF0000UL) |
        ((x >> 40) & 0x000000000000FF00UL) | ((x >> 56) & 0x00000000000000FFUL);
}
#endif

typedef struct context {
    CRYPTO_ALIGN(16) unsigned char H[16];
    __m128i rkeys[16];
} context;

static inline void
aesni_key256_expand(const unsigned char *key, __m128 *rkeys)
{
    __m128 key0 = _mm_loadu_ps((const float *) (key + 0));
    __m128 key1 = _mm_loadu_ps((const float *) (key + 16));
    __m128 temp0, temp1, temp2, temp4;
    int idx = 0;

    rkeys[idx++] = key0;
    temp0 = key0;
    temp2 = key1;
    temp4 = _mm_setzero_ps();

    /* why single precision floating-point rather than integer instructions ?
       because _mm_shuffle_ps takes two inputs, while _mm_shuffle_epi32 only
       takes one - it doesn't perform the same computation...
       _mm_shuffle_ps takes the lower 64 bits of the result from the first
       operand, and the higher 64 bits of the result from the second operand
       (in both cases, all four input floats are accessible).
       I don't like the non-orthogonal naming scheme :-(

       This is all strongly inspired by the openssl assembly code.
    */
#define BLOCK1(IMM) \
    temp1 = _mm_castsi128_ps(_mm_aeskeygenassist_si128(_mm_castps_si128(temp2), IMM)); \
    rkeys[idx++] = temp2; \
    temp4 = _mm_shuffle_ps(temp4, temp0, 0x10); \
    temp0 = _mm_xor_ps(temp0, temp4); \
    temp4 = _mm_shuffle_ps(temp4, temp0, 0x8c); \
    temp0 = _mm_xor_ps(temp0, temp4); \
    temp1 = _mm_shuffle_ps(temp1, temp1, 0xff); \
    temp0 = _mm_xor_ps(temp0, temp1)

#define BLOCK2(IMM) \
    temp1 = _mm_castsi128_ps(_mm_aeskeygenassist_si128(_mm_castps_si128(temp0), IMM)); \
    rkeys[idx++] = temp0; \
    temp4 = _mm_shuffle_ps(temp4, temp2, 0x10); \
    temp2 = _mm_xor_ps(temp2, temp4); \
    temp4 = _mm_shuffle_ps(temp4, temp2, 0x8c); \
    temp2 = _mm_xor_ps(temp2, temp4); \
    temp1 = _mm_shuffle_ps(temp1, temp1, 0xaa); \
    temp2 = _mm_xor_ps(temp2, temp1)

    BLOCK1(0x01);
    BLOCK2(0x01);

    BLOCK1(0x02);
    BLOCK2(0x02);

    BLOCK1(0x04);
    BLOCK2(0x04);

    BLOCK1(0x08);
    BLOCK2(0x08);

    BLOCK1(0x10);
    BLOCK2(0x10);

    BLOCK1(0x20);
    BLOCK2(0x20);

    BLOCK1(0x40);
    rkeys[idx++] = temp0;
}

/** single, by-the-book AES encryption with AES-NI */
static inline void
aesni_encrypt1(unsigned char *out, __m128i nv, const __m128i *rkeys)
{
    __m128i temp = _mm_xor_si128(nv, rkeys[0]);
    int roundctr;

#pragma unroll(13)
    for (roundctr = 1; roundctr < 14; roundctr++) {
        temp = _mm_aesenc_si128(temp, rkeys[roundctr]);
    }
    temp = _mm_aesenclast_si128(temp, rkeys[14]);
    _mm_storeu_si128((__m128i *) out, temp);
}

/** multiple-blocks-at-once AES encryption with AES-NI ;
    on Haswell, aesenc has a latency of 7 and a throughput of 1
    so the sequence of aesenc should be bubble-free, if you
    have at least 8 blocks. Let's build an arbitrary-sized
    function */
/* Step 1 : loading the nonce */
/* load & increment the n vector (non-vectorized, unused for now) */
#define NVDECLx(a) \
    __m128i nv##a

#define NVx(a) \
    nv##a = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) n), pt); \
    n[3]++

/* Step 2 : define value in round one (xor with subkey #0, aka key) */
#define TEMPDECLx(a) \
    __m128i temp##a

#define TEMPx(a) \
    temp##a = _mm_xor_si128(nv##a, rkeys[0])

/* Step 3: one round of AES */
#define AESENCx(a) \
    temp##a = _mm_aesenc_si128(temp##a, rkeys[roundctr])

/* Step 4: last round of AES */
#define AESENCLASTx(a) \
    temp##a = _mm_aesenclast_si128(temp##a, rkeys[14])

/* Step 5: store result */
#define STOREx(a) \
    _mm_storeu_si128((__m128i *) (out + (a * 16)), temp##a)

/* all the MAKE* macros are for automatic explicit unrolling */
#define MAKE4(X) \
    X(0); \
    X(1); \
    X(2); \
    X(3)

#define MAKE8(X) \
    X(0); \
    X(1); \
    X(2); \
    X(3); \
    X(4); \
    X(5); \
    X(6); \
    X(7)

#define COUNTER_INC2(N) (*(uint32_t *) &(N)[12]) = (2U + (((*(uint32_t *) &(N)[12]))))

/* create a function of unrolling N ; the MAKEN is the unrolling
   macro, defined above. The N in MAKEN must match N, obviously. */
#define FUNC(N, MAKEN) \
static inline void aesni_encrypt##N(unsigned char *out, uint32_t *n, const __m128i *rkeys) \
{ \
    const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
    int roundctr; \
    MAKEN(NVDECLx); \
    MAKEN(TEMPDECLx); \
    \
    MAKEN(NVx); \
    MAKEN(TEMPx); \
    for (roundctr = 1; roundctr < 14; roundctr++) { \
        MAKEN(AESENCx); \
    } \
    MAKEN(AESENCLASTx); \
    MAKEN(STOREx); \
}

FUNC(8, MAKE8)

/* all GF(2^128) functions are by the book, meaning this one:
   <https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf>
*/

static inline void
addmul(unsigned char *c, const unsigned char *a, unsigned int xlen, const unsigned char *b)
{
    const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    __m128i A, B, C;
    __m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
    __m128i tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp18;
    __m128i tmp19, tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
    __m128i tmp28, tmp29, tmp30, tmp31, tmp32, tmp33, tmp34, tmp35, tmp36;

    if (xlen >= 16) {
        A = _mm_loadu_si128((const __m128i *) a);
    } else {
        CRYPTO_ALIGN(16) unsigned char padded[16];
        memset(padded, 0, 16);
        memcpy(padded, a, xlen);
        A = _mm_load_si128((const __m128i *) padded);
    }
    A = _mm_shuffle_epi8(A, rev);
    B = _mm_loadu_si128((const __m128i *) b);
    C = _mm_loadu_si128((const __m128i *) c);
    A = _mm_xor_si128(A, C);
    tmp3 = _mm_clmulepi64_si128(A, B, 0x00);
    tmp4 = _mm_clmulepi64_si128(A, B, 0x10);
    tmp5 = _mm_clmulepi64_si128(A, B, 0x01);
    tmp6 = _mm_clmulepi64_si128(A, B, 0x11);
    tmp10 = _mm_xor_si128(tmp4, tmp5);
    tmp13 = _mm_slli_si128(tmp10, 8);
    tmp11 = _mm_srli_si128(tmp10, 8);
    tmp15 = _mm_xor_si128(tmp3, tmp13);
    tmp17 = _mm_xor_si128(tmp6, tmp11);
    tmp7 = _mm_srli_epi32(tmp15, 31);
    tmp8 = _mm_srli_epi32(tmp17, 31);
    tmp16 = _mm_slli_epi32(tmp15, 1);
    tmp18 = _mm_slli_epi32(tmp17, 1);
    tmp9 = _mm_srli_si128(tmp7, 12);
    tmp22 = _mm_slli_si128(tmp8, 4);
    tmp25 = _mm_slli_si128(tmp7, 4);
    tmp29 = _mm_or_si128(tmp16, tmp25);
    tmp19 = _mm_or_si128(tmp18, tmp22);
    tmp20 = _mm_or_si128(tmp19, tmp9);
    tmp26 = _mm_slli_epi32(tmp29, 31);
    tmp23 = _mm_slli_epi32(tmp29, 30);
    tmp32 = _mm_slli_epi32(tmp29, 25);
    tmp27 = _mm_xor_si128(tmp26, tmp23);
    tmp28 = _mm_xor_si128(tmp27, tmp32);
    tmp24 = _mm_srli_si128(tmp28, 4);
    tmp33 = _mm_slli_si128(tmp28, 12);
    tmp30 = _mm_xor_si128(tmp29, tmp33);
    tmp2 = _mm_srli_epi32(tmp30, 1);
    tmp12 = _mm_srli_epi32(tmp30, 2);
    tmp14 = _mm_srli_epi32(tmp30, 7);
    tmp34 = _mm_xor_si128(tmp2, tmp12);
    tmp35 = _mm_xor_si128(tmp34, tmp14);
    tmp36 = _mm_xor_si128(tmp35, tmp24);
    tmp31 = _mm_xor_si128(tmp30, tmp36);
    tmp21 = _mm_xor_si128(tmp20, tmp31);
    _mm_storeu_si128((__m128i *) c, tmp21);
}

/* pure multiplication, for pre-computing powers of H */
static inline __m128i
mulv(__m128i A, __m128i B)
{
    __m128i tmp3 = _mm_clmulepi64_si128(A, B, 0x00);
    __m128i tmp4 = _mm_clmulepi64_si128(A, B, 0x10);
    __m128i tmp5 = _mm_clmulepi64_si128(A, B, 0x01);
    __m128i tmp6 = _mm_clmulepi64_si128(A, B, 0x11);
    __m128i tmp10 = _mm_xor_si128(tmp4, tmp5);
    __m128i tmp13 = _mm_slli_si128(tmp10, 8);
    __m128i tmp11 = _mm_srli_si128(tmp10, 8);
    __m128i tmp15 = _mm_xor_si128(tmp3, tmp13);
    __m128i tmp17 = _mm_xor_si128(tmp6, tmp11);
    __m128i tmp7 = _mm_srli_epi32(tmp15, 31);
    __m128i tmp8 = _mm_srli_epi32(tmp17, 31);
    __m128i tmp16 = _mm_slli_epi32(tmp15, 1);
    __m128i tmp18 = _mm_slli_epi32(tmp17, 1);
    __m128i tmp9 = _mm_srli_si128(tmp7, 12);
    __m128i tmp22 = _mm_slli_si128(tmp8, 4);
    __m128i tmp25 = _mm_slli_si128(tmp7, 4);
    __m128i tmp29 = _mm_or_si128(tmp16, tmp25);
    __m128i tmp19 = _mm_or_si128(tmp18, tmp22);
    __m128i tmp20 = _mm_or_si128(tmp19, tmp9);
    __m128i tmp26 = _mm_slli_epi32(tmp29, 31);
    __m128i tmp23 = _mm_slli_epi32(tmp29, 30);
    __m128i tmp32 = _mm_slli_epi32(tmp29, 25);
    __m128i tmp27 = _mm_xor_si128(tmp26, tmp23);
    __m128i tmp28 = _mm_xor_si128(tmp27, tmp32);
    __m128i tmp24 = _mm_srli_si128(tmp28, 4);
    __m128i tmp33 = _mm_slli_si128(tmp28, 12);
    __m128i tmp30 = _mm_xor_si128(tmp29, tmp33);
    __m128i tmp2 = _mm_srli_epi32(tmp30, 1);
    __m128i tmp12 = _mm_srli_epi32(tmp30, 2);
    __m128i tmp14 = _mm_srli_epi32(tmp30, 7);
    __m128i tmp34 = _mm_xor_si128(tmp2, tmp12);
    __m128i tmp35 = _mm_xor_si128(tmp34, tmp14);
    __m128i tmp36 = _mm_xor_si128(tmp35, tmp24);
    __m128i tmp31 = _mm_xor_si128(tmp30, tmp36);
    __m128i C = _mm_xor_si128(tmp20, tmp31);

    return C;
}

/* 4 multiply-accumulate at once; again
   <https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf>
   for the Aggregated Reduction Method & sample code.
   Algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */

#define RED_DECL(a) __m128i H##a##_X##a##_lo, H##a##_X##a##_hi, tmp##a, tmp##a##B
#define RED_SHUFFLE(a) X##a = _mm_shuffle_epi8(X##a, rev)
#define RED_MUL_LOW(a) H##a##_X##a##_lo = _mm_clmulepi64_si128(H##a, X##a, 0x00)
#define RED_MUL_HIGH(a) H##a##_X##a##_hi = _mm_clmulepi64_si128(H##a, X##a, 0x11)
#define RED_MUL_MID(a) \
    tmp##a = _mm_shuffle_epi32(H##a, 0x4e); \
    tmp##a##B = _mm_shuffle_epi32(X##a, 0x4e); \
    tmp##a = _mm_xor_si128(tmp##a, H##a); \
    tmp##a##B = _mm_xor_si128(tmp##a##B, X##a); \
    tmp##a = _mm_clmulepi64_si128(tmp##a, tmp##a##B, 0x00)

#define REDUCE4(rev, H0_, H1_, H2_, H3_, X0_, X1_, X2_, X3_, accv) \
do { \
    MAKE4(RED_DECL); \
    __m128i lo, hi; \
    __m128i tmp8, tmp9; \
    __m128i H0 = H0_; \
    __m128i H1 = H1_; \
    __m128i H2 = H2_; \
    __m128i H3 = H3_; \
    __m128i X0 = X0_; \
    __m128i X1 = X1_; \
    __m128i X2 = X2_; \
    __m128i X3 = X3_; \
    \
    /* byte-revert the inputs & xor the first one into the accumulator */ \
    \
    MAKE4(RED_SHUFFLE); \
    X3 = _mm_xor_si128(X3, accv); \
    \
    /* 4 low H*X (x0*h0) */ \
    \
    MAKE4(RED_MUL_LOW); \
    lo = _mm_xor_si128(H0_X0_lo, H1_X1_lo); \
    lo = _mm_xor_si128(lo, H2_X2_lo); \
    lo = _mm_xor_si128(lo, H3_X3_lo); \
    \
    /* 4 high H*X (x1*h1) */ \
    \
    MAKE4(RED_MUL_HIGH); \
    hi = _mm_xor_si128(H0_X0_hi, H1_X1_hi); \
    hi = _mm_xor_si128(hi, H2_X2_hi); \
    hi = _mm_xor_si128(hi, H3_X3_hi); \
    \
    /* 4 middle H*X, using Karatsuba, i.e. \
       x1*h0+x0*h1 =(x1+x0)*(h1+h0)-x1*h1-x0*h0 \
       we already have all x1y1 & x0y0 (accumulated in hi & lo) \
       (0 is low half and 1 is high half) \
    */ \
    /* permute the high and low 64 bits in H1 & X1, \
       so create (h0,h1) from (h1,h0) and (x0,x1) from (x1,x0), \
       then compute (h0+h1,h1+h0) and (x0+x1,x1+x0), \
       and finally multiply \
    */ \
    MAKE4(RED_MUL_MID); \
    \
    /* subtracts x1*h1 and x0*h0 */ \
    tmp0 = _mm_xor_si128(tmp0, lo); \
    tmp0 = _mm_xor_si128(tmp0, hi); \
    tmp0 = _mm_xor_si128(tmp1, tmp0); \
    tmp0 = _mm_xor_si128(tmp2, tmp0); \
    tmp0 = _mm_xor_si128(tmp3, tmp0); \
    \
    /* reduction */ \
    tmp0B = _mm_slli_si128(tmp0, 8); \
    tmp0 = _mm_srli_si128(tmp0, 8); \
    lo = _mm_xor_si128(tmp0B, lo); \
    hi = _mm_xor_si128(tmp0, hi); \
    tmp3 = lo; \
    tmp2B = hi; \
    tmp3B = _mm_srli_epi32(tmp3, 31); \
    tmp8 = _mm_srli_epi32(tmp2B, 31); \
    tmp3 = _mm_slli_epi32(tmp3, 1); \
    tmp2B = _mm_slli_epi32(tmp2B, 1); \
    tmp9 = _mm_srli_si128(tmp3B, 12); \
    tmp8 = _mm_slli_si128(tmp8, 4); \
    tmp3B = _mm_slli_si128(tmp3B, 4); \
    tmp3 = _mm_or_si128(tmp3, tmp3B); \
    tmp2B = _mm_or_si128(tmp2B, tmp8); \
    tmp2B = _mm_or_si128(tmp2B, tmp9); \
    tmp3B = _mm_slli_epi32(tmp3, 31); \
    tmp8 = _mm_slli_epi32(tmp3, 30); \
    tmp9 = _mm_slli_epi32(tmp3, 25); \
    tmp3B = _mm_xor_si128(tmp3B, tmp8); \
    tmp3B = _mm_xor_si128(tmp3B, tmp9); \
    tmp8 = _mm_srli_si128(tmp3B, 4); \
    tmp3B = _mm_slli_si128(tmp3B, 12); \
    tmp3 = _mm_xor_si128(tmp3, tmp3B); \
    tmp2 = _mm_srli_epi32(tmp3, 1); \
    tmp0B = _mm_srli_epi32(tmp3, 2); \
    tmp1B = _mm_srli_epi32(tmp3, 7); \
    tmp2 = _mm_xor_si128(tmp2, tmp0B); \
    tmp2 = _mm_xor_si128(tmp2, tmp1B); \
    tmp2 = _mm_xor_si128(tmp2, tmp8); \
    tmp3 = _mm_xor_si128(tmp3, tmp2); \
    tmp2B = _mm_xor_si128(tmp2B, tmp3); \
    \
    accv = tmp2B; \
} while(0)

#define XORx(a) \
    temp##a = _mm_xor_si128(temp##a, \
        _mm_loadu_si128((const __m128i *) (in + a * 16)))

#define LOADx(a) \
    __m128i in##a = _mm_loadu_si128((const __m128i *) (in + a * 16))

/* full encrypt & checksum 8 blocks at once */
#define aesni_encrypt8full(out_, n_, rkeys, in_, accum, hv_, h2v_, h3v_, h4v_, rev) \
do { \
    unsigned char *out = out_; \
    uint32_t *n = n_; \
    const unsigned char *in = in_; \
    const __m128i hv = hv_; \
    const __m128i h2v = h2v_; \
    const __m128i h3v = h3v_; \
    const __m128i h4v = h4v_; \
    const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
    __m128i accv_; \
    int roundctr; \
    \
    MAKE8(NVDECLx); \
    MAKE8(TEMPDECLx); \
    MAKE8(NVx); \
    MAKE8(TEMPx); \
    for (roundctr = 1; roundctr < 14; roundctr++) { \
        MAKE8(AESENCx); \
    } \
    MAKE8(AESENCLASTx); \
    MAKE8(XORx); \
    MAKE8(STOREx); \
    accv_ = _mm_load_si128((const __m128i *) accum); \
    REDUCE4(rev, hv, h2v, h3v, h4v, temp3, temp2, temp1, temp0, accv_); \
    REDUCE4(rev, hv, h2v, h3v, h4v, temp7, temp6, temp5, temp4, accv_); \
    _mm_store_si128((__m128i *) accum, accv_); \
} while(0)

/* checksum 8 blocks at once */
#define aesni_addmul8full(in_, accum, hv_, h2v_, h3v_, h4v_, rev) \
do { \
    const unsigned char *in = in_; \
    const __m128i hv = hv_; \
    const __m128i h2v = h2v_; \
    const __m128i h3v = h3v_; \
    const __m128i h4v = h4v_; \
    __m128i accv_; \
    \
    MAKE8(LOADx); \
    accv_ = _mm_load_si128((const __m128i *) accum); \
    REDUCE4(rev, hv, h2v, h3v, h4v, in3, in2, in1, in0, accv_); \
    REDUCE4(rev, hv, h2v, h3v, h4v, in7, in6, in5, in4, accv_); \
    _mm_store_si128((__m128i *) accum, accv_); \
} while(0)

/* decrypt 8 blocks at once */
#define aesni_decrypt8full(out_, n_, rkeys, in_) \
do { \
    unsigned char *out = out_; \
    uint32_t *n = n_; \
    const unsigned char *in = in_; \
    const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
    int roundctr; \
    \
    MAKE8(NVDECLx); \
    MAKE8(TEMPDECLx); \
    MAKE8(NVx); \
    MAKE8(TEMPx); \
    for (roundctr = 1; roundctr < 14; roundctr++) { \
        MAKE8(AESENCx); \
    } \
    MAKE8(AESENCLASTx); \
    MAKE8(XORx); \
    MAKE8(STOREx); \
} while(0)

int
crypto_aead_aes256gcm_beforenm(crypto_aead_aes256gcm_state *ctx_,
                               const unsigned char *k)
{
    context *ctx = (context *) ctx_;
    __m128i *rkeys = ctx->rkeys;
    __m128i zero = _mm_setzero_si128();
    unsigned char *H = ctx->H;

    (void) sizeof(int[(sizeof *ctx_) >= (sizeof *ctx) ? 1 : -1]);
    aesni_key256_expand(k, (__m128 *) rkeys);
    aesni_encrypt1(H, zero, rkeys);

    return 0;
}

int
crypto_aead_aes256gcm_encrypt_afternm(unsigned char *c, unsigned long long *clen,
                                      const unsigned char *m, unsigned long long mlen,
                                      const unsigned char *ad, unsigned long long adlen,
                                      const unsigned char *nsec,
                                      const unsigned char *npub,
                                      const crypto_aead_aes256gcm_state *ctx_)
{
    unsigned char H[16];
    const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    const context *ctx = (const context *) ctx_;
    const __m128i *rkeys = ctx->rkeys;
    __m128i Hv, H2v, H3v, H4v, accv;
    unsigned long long i, j;
    unsigned long long adlen_rnd64 = adlen & ~63ULL;
    unsigned long long mlen_rnd128 = mlen & ~127ULL;
    CRYPTO_ALIGN(16) unsigned char n2[16];
    CRYPTO_ALIGN(16) unsigned char T[16];
    CRYPTO_ALIGN(16) unsigned char accum[16];
    CRYPTO_ALIGN(16) unsigned char fb[16];

    (void) nsec;
    memcpy(H, ctx->H, sizeof H);
    if (mlen > 16ULL * (1ULL << 32)) {
        abort();
    }
    memcpy(&n2[0], npub, 12);
    *(uint32_t *) &n2[12] = 0x01000000;
    aesni_encrypt1(T, _mm_load_si128((const __m128i *) n2), rkeys);

    (*(uint64_t *) &fb[0]) = _bswap64((uint64_t) (8 * adlen));
    (*(uint64_t *) &fb[8]) = _bswap64((uint64_t) (8 * mlen));

    /* we store H (and its powers) byte-reverted once and for all */
    Hv = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) H), rev);
    _mm_store_si128((__m128i *) H, Hv);
    H2v = mulv(Hv, Hv);
    H3v = mulv(H2v, Hv);
    H4v = mulv(H3v, Hv);

    accv = _mm_setzero_si128();
    /* unrolled by 4 GCM (by 8 doesn't improve using REDUCE4) */
    for (i = 0; i < adlen_rnd64; i += 64) {
        __m128i X4_ = _mm_loadu_si128((const __m128i *) (ad + i + 0));
        __m128i X3_ = _mm_loadu_si128((const __m128i *) (ad + i + 16));
        __m128i X2_ = _mm_loadu_si128((const __m128i *) (ad + i + 32));
        __m128i X1_ = _mm_loadu_si128((const __m128i *) (ad + i + 48));
        REDUCE4(rev, Hv, H2v, H3v, H4v, X1_, X2_, X3_, X4_, accv);
    }
    _mm_store_si128((__m128i *) accum, accv);

    /* GCM remainder loop */
    for (i = adlen_rnd64; i < adlen; i += 16) {
        unsigned int blocklen = 16;

        if (i + (unsigned long long) blocklen > adlen) {
            blocklen = (unsigned int) (adlen - i);
        }
        addmul(accum, ad + i, blocklen, H);
    }

    /* this only does 8 full blocks, so no fancy bounds checking is necessary */
#define LOOPRND128 \
    do { \
        const int iter = 8; \
        const int lb = iter * 16; \
        \
        for (i = 0; i < mlen_rnd128; i += lb) { \
            aesni_encrypt8full(c + i, (uint32_t *) n2, rkeys, m + i, accum, Hv, H2v, H3v, H4v, rev); \
        } \
    } while(0)

    /* remainder loop, with the slower GCM update to accommodate partial blocks */
#define LOOPRMD128 \
    do { \
        const int iter = 8; \
        const int lb = iter * 16; \
        \
        for (i = mlen_rnd128; i < mlen; i += lb) { \
            CRYPTO_ALIGN(16) unsigned char outni[8 * 16]; \
            unsigned long long mj = lb; \
            \
            aesni_encrypt8(outni, (uint32_t *) n2, rkeys); \
            if ((i + mj) >= mlen) { \
                mj = mlen - i; \
            } \
            for (j = 0; j < mj; j++) { \
                c[i + j] = m[i + j] ^ outni[j]; \
            } \
            for (j = 0; j < mj; j += 16) { \
                unsigned int bl = 16; \
                \
                if (j + (unsigned long long) bl >= mj) { \
                    bl = (unsigned int) (mj - j); \
                } \
                addmul(accum, c + i + j, bl, H); \
            } \
        } \
    } while(0)

    n2[15] = 0;
    COUNTER_INC2(n2);
    LOOPRND128;
    LOOPRMD128;

    addmul(accum, fb, 16, H);

    for (i = 0; i < 16; ++i) {
        c[i + mlen] = T[i] ^ accum[15 - i];
    }
    if (clen != NULL) {
        *clen = mlen + 16;
    }
    return 0;
}

int
crypto_aead_aes256gcm_decrypt_afternm(unsigned char *m, unsigned long long *mlen_p,
                                      unsigned char *nsec,
                                      const unsigned char *c, unsigned long long clen,
                                      const unsigned char *ad, unsigned long long adlen,
                                      const unsigned char *npub,
                                      const crypto_aead_aes256gcm_state *ctx_)
{
    unsigned char H[16];
    const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    const context *ctx = (const context *) ctx_;
    const __m128i *rkeys = ctx->rkeys;
    __m128i Hv, H2v, H3v, H4v, accv;
    unsigned long long i, j;
    unsigned long long adlen_rnd64 = adlen & ~63ULL;
    unsigned long long mlen;
    unsigned long long mlen_rnd128;
    CRYPTO_ALIGN(16) unsigned char n2[16];
    CRYPTO_ALIGN(16) unsigned char T[16];
    CRYPTO_ALIGN(16) unsigned char accum[16];
    CRYPTO_ALIGN(16) unsigned char fb[16];

    (void) nsec;
    memcpy(H, ctx->H, sizeof H);
    if (clen > 16ULL * (1ULL << 32) - 16ULL) {
        abort();
    }
    mlen = clen - 16;
    if (mlen_p != NULL) {
        *mlen_p = 0U;
    }
    memcpy(&n2[0], npub, 12);
    *(uint32_t *) &n2[12] = 0x01000000;
    aesni_encrypt1(T, _mm_load_si128((const __m128i *) n2), rkeys);

    (*(uint64_t *) &fb[0]) = _bswap64((uint64_t)(8 * adlen));
    (*(uint64_t *) &fb[8]) = _bswap64((uint64_t)(8 * mlen));

    Hv = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) H), rev);
    _mm_store_si128((__m128i *) H, Hv);
    H2v = mulv(Hv, Hv);
    H3v = mulv(H2v, Hv);
    H4v = mulv(H3v, Hv);

    accv = _mm_setzero_si128();
    for (i = 0; i < adlen_rnd64; i += 64) {
        __m128i X4_ = _mm_loadu_si128((const __m128i *) (ad + i + 0));
        __m128i X3_ = _mm_loadu_si128((const __m128i *) (ad + i + 16));
        __m128i X2_ = _mm_loadu_si128((const __m128i *) (ad + i + 32));
        __m128i X1_ = _mm_loadu_si128((const __m128i *) (ad + i + 48));
        REDUCE4(rev, Hv, H2v, H3v, H4v, X1_, X2_, X3_, X4_, accv);
    }
    _mm_store_si128((__m128i *) accum, accv);

    for (i = adlen_rnd64; i < adlen; i += 16) {
        unsigned int blocklen = 16;
        if (i + (unsigned long long) blocklen > adlen) {
            blocklen = (unsigned int) (adlen - i);
        }
        addmul(accum, ad + i, blocklen, H);
    }

    mlen_rnd128 = mlen & ~127ULL;

#define LOOPACCUMDRND128 \
    do { \
        const int iter = 8; \
        const int lb = iter * 16; \
        for (i = 0; i < mlen_rnd128; i += lb) { \
            aesni_addmul8full(c + i, accum, Hv, H2v, H3v, H4v, rev); \
        } \
    } while(0)

#define LOOPDRND128 \
    do { \
        const int iter = 8; \
        const int lb = iter * 16; \
        \
        for (i = 0; i < mlen_rnd128; i += lb) { \
            aesni_decrypt8full(m + i, (uint32_t *) n2, rkeys, c + i); \
        } \
    } while(0)

#define LOOPACCUMDRMD128 \
    do { \
        const int iter = 8; \
        const int lb = iter * 16; \
        \
        for (i = mlen_rnd128; i < mlen; i += lb) { \
            unsigned long long mj = lb; \
            \
            if ((i + mj) >= mlen) { \
                mj = mlen - i; \
            } \
            for (j = 0; j < mj; j += 16) { \
                unsigned int bl = 16; \
                \
                if (j + (unsigned long long) bl >= mj) { \
                    bl = (unsigned int) (mj - j); \
                } \
                addmul(accum, c + i + j, bl, H); \
            } \
        } \
    } while(0)

#define LOOPDRMD128 \
    do { \
        const int iter = 8; \
        const int lb = iter * 16; \
        \
        for (i = mlen_rnd128; i < mlen; i += lb) { \
            CRYPTO_ALIGN(16) unsigned char outni[8 * 16]; \
            unsigned long long mj = lb; \
            \
            if ((i + mj) >= mlen) { \
                mj = mlen - i; \
            } \
            aesni_encrypt8(outni, (uint32_t *) n2, rkeys); \
            for (j = 0; j < mj; j++) { \
                m[i + j] = c[i + j] ^ outni[j]; \
            } \
        } \
    } while(0)
    n2[15] = 0;

    COUNTER_INC2(n2);
    LOOPACCUMDRND128;
    LOOPACCUMDRMD128;
    addmul(accum, fb, 16, H);
    {
        unsigned char d = 0;

        for (i = 0; i < 16; i++) {
            d |= (c[i + mlen] ^ (T[i] ^ accum[15 - i]));
        }
        if (d != 0) {
            return -1;
        }
    }
    *(uint32_t *) &n2[12] = 0;
    COUNTER_INC2(n2);
    LOOPDRND128;
    LOOPDRMD128;

    if (mlen_p != NULL) {
        *mlen_p = mlen;
    }
    return 0;
}

int
crypto_aead_aes256gcm_encrypt(unsigned char *c,
                              unsigned long long *clen_p,
                              const unsigned char *m,
                              unsigned long long mlen,
                              const unsigned char *ad,
                              unsigned long long adlen,
                              const unsigned char *nsec,
                              const unsigned char *npub,
                              const unsigned char *k)
{
    crypto_aead_aes256gcm_state ctx;

    crypto_aead_aes256gcm_beforenm(&ctx, k);

    return crypto_aead_aes256gcm_encrypt_afternm
        (c, clen_p, m, mlen, ad, adlen, nsec, npub,
         (const crypto_aead_aes256gcm_state *) &ctx);
}

int
crypto_aead_aes256gcm_decrypt(unsigned char *m,
                              unsigned long long *mlen_p,
                              unsigned char *nsec,
                              const unsigned char *c,
                              unsigned long long clen,
                              const unsigned char *ad,
                              unsigned long long adlen,
                              const unsigned char *npub,
                              const unsigned char *k)
{
    crypto_aead_aes256gcm_state ctx;

    crypto_aead_aes256gcm_beforenm(&ctx, k);

    return crypto_aead_aes256gcm_decrypt_afternm
        (m, mlen_p, nsec, c, clen, ad, adlen, npub,
         (const crypto_aead_aes256gcm_state *) &ctx);
}

int
crypto_aead_aes256gcm_is_available(void)
{
    return sodium_runtime_has_pclmul() & sodium_runtime_has_aesni();
}

size_t
crypto_aead_aes256gcm_keybytes(void)
{
    return crypto_aead_aes256gcm_KEYBYTES;
}

size_t
crypto_aead_aes256gcm_nsecbytes(void)
{
    return crypto_aead_aes256gcm_NSECBYTES;
}

size_t
crypto_aead_aes256gcm_npubbytes(void)
{
    return crypto_aead_aes256gcm_NPUBBYTES;
}

size_t
crypto_aead_aes256gcm_abytes(void)
{
    return crypto_aead_aes256gcm_ABYTES;
}

size_t
crypto_aead_aes256gcm_statebytes(void)
{
    return (sizeof(crypto_aead_aes256gcm_state) + (size_t) 15U) & ~(size_t) 15U;
}

#else

int
crypto_aead_aes256gcm_is_available(void)
{
    return 0;
}

#endif
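
For context, here is a minimal caller-side sketch (not part of the diff) showing how the functions defined above are meant to fit together: initialize libsodium, gate on crypto_aead_aes256gcm_is_available(), then use the one-shot combined encrypt/decrypt calls. It assumes the usual <sodium.h> umbrella header and a random key and nonce; the message and variable names are illustrative only.

/* usage sketch: AES256-GCM round trip with the new AEAD API */
#include <stdio.h>
#include <string.h>
#include <sodium.h>

int
main(void)
{
    unsigned char key[crypto_aead_aes256gcm_KEYBYTES];
    unsigned char nonce[crypto_aead_aes256gcm_NPUBBYTES];
    unsigned char msg[] = "test message";
    unsigned char c[sizeof msg + crypto_aead_aes256gcm_ABYTES];
    unsigned char m[sizeof msg];
    unsigned long long clen, mlen;

    if (sodium_init() == -1) {
        return 1;
    }
    if (!crypto_aead_aes256gcm_is_available()) {
        /* no AES-NI/PCLMULQDQ on this CPU: fall back to chacha20poly1305 */
        return 0;
    }
    randombytes_buf(key, sizeof key);
    randombytes_buf(nonce, sizeof nonce);

    /* ciphertext = message || 16-byte GCM tag */
    crypto_aead_aes256gcm_encrypt(c, &clen, msg, sizeof msg,
                                  NULL, 0, NULL, nonce, key);

    /* decryption verifies the tag before releasing the plaintext */
    if (crypto_aead_aes256gcm_decrypt(m, &mlen, NULL, c, clen,
                                      NULL, 0, nonce, key) != 0) {
        return 1; /* forged or corrupted ciphertext */
    }
    printf("round trip ok (%llu bytes)\n", mlen);
    return 0;
}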