rbnacl-libsodium 1.0.3 → 1.0.4

Files changed (131)
  1. checksums.yaml +4 -4
  2. data/CHANGES.md +4 -0
  3. data/lib/rbnacl/libsodium/version.rb +1 -1
  4. data/vendor/libsodium/AUTHORS +3 -0
  5. data/vendor/libsodium/ChangeLog +21 -0
  6. data/vendor/libsodium/Makefile.in +9 -0
  7. data/vendor/libsodium/THANKS +3 -0
  8. data/vendor/libsodium/aclocal.m4 +1 -0
  9. data/vendor/libsodium/autom4te.cache/output.1 +919 -85
  10. data/vendor/libsodium/autom4te.cache/output.5 +18351 -0
  11. data/vendor/libsodium/autom4te.cache/requests +960 -725
  12. data/vendor/libsodium/autom4te.cache/traces.1 +289 -247
  13. data/vendor/libsodium/autom4te.cache/traces.5 +3032 -0
  14. data/vendor/libsodium/build-aux/ltmain.sh +70 -11
  15. data/vendor/libsodium/builds/msvc/properties/ARM.props +20 -0
  16. data/vendor/libsodium/builds/msvc/properties/ReleaseDEXE.props +1 -1
  17. data/vendor/libsodium/builds/msvc/version.h +2 -2
  18. data/vendor/libsodium/builds/msvc/vs2010/libsodium/libsodium.vcxproj +3 -1
  19. data/vendor/libsodium/builds/msvc/vs2010/libsodium/libsodium.vcxproj.filters +13 -1
  20. data/vendor/libsodium/builds/msvc/vs2010/test/test.vcxproj +244 -241
  21. data/vendor/libsodium/builds/msvc/vs2010/test/test.vcxproj.filters +192 -189
  22. data/vendor/libsodium/builds/msvc/vs2012/libsodium/libsodium.vcxproj +2 -0
  23. data/vendor/libsodium/builds/msvc/vs2012/libsodium/libsodium.vcxproj.filters +13 -1
  24. data/vendor/libsodium/builds/msvc/vs2012/test/test.vcxproj +244 -241
  25. data/vendor/libsodium/builds/msvc/vs2012/test/test.vcxproj.filters +192 -189
  26. data/vendor/libsodium/builds/msvc/vs2013/libsodium/libsodium.vcxproj +3 -1
  27. data/vendor/libsodium/builds/msvc/vs2013/libsodium/libsodium.vcxproj.filters +13 -1
  28. data/vendor/libsodium/builds/msvc/vs2013/test/test.vcxproj +244 -241
  29. data/vendor/libsodium/builds/msvc/vs2013/test/test.vcxproj.filters +192 -189
  30. data/vendor/libsodium/builds/msvc/vs2015/libsodium/libsodium.vcxproj +3 -1
  31. data/vendor/libsodium/builds/msvc/vs2015/libsodium/libsodium.vcxproj.filters +13 -1
  32. data/vendor/libsodium/builds/msvc/vs2015/test/test.vcxproj +244 -241
  33. data/vendor/libsodium/builds/msvc/vs2015/test/test.vcxproj.filters +192 -189
  34. data/vendor/libsodium/configure +918 -84
  35. data/vendor/libsodium/configure.ac +89 -15
  36. data/vendor/libsodium/dist-build/Makefile.am +6 -2
  37. data/vendor/libsodium/dist-build/Makefile.in +15 -2
  38. data/vendor/libsodium/dist-build/android-armv8-a.sh +4 -0
  39. data/vendor/libsodium/dist-build/android-build.sh +9 -9
  40. data/vendor/libsodium/dist-build/android-mips64.sh +4 -0
  41. data/vendor/libsodium/dist-build/android-x86-64.sh +4 -0
  42. data/vendor/libsodium/dist-build/emscripten.sh +3 -3
  43. data/vendor/libsodium/dist-build/ios.sh +5 -5
  44. data/vendor/libsodium/dist-build/nativeclient.sh +28 -0
  45. data/vendor/libsodium/examples/Makefile +21 -0
  46. data/vendor/libsodium/examples/auth.c +68 -0
  47. data/vendor/libsodium/examples/box.c +133 -0
  48. data/vendor/libsodium/examples/box_detached.c +132 -0
  49. data/vendor/libsodium/examples/generichash.c +80 -0
  50. data/vendor/libsodium/examples/generichash_stream.c +58 -0
  51. data/vendor/libsodium/examples/shorthash.c +58 -0
  52. data/vendor/libsodium/examples/sign.c +78 -0
  53. data/vendor/libsodium/examples/utils.h +106 -0
  54. data/vendor/libsodium/libsodium-uninstalled.pc.in +1 -1
  55. data/vendor/libsodium/libsodium.vcxproj +2 -0
  56. data/vendor/libsodium/libsodium.vcxproj.filters +6 -0
  57. data/vendor/libsodium/m4/ax_check_compile_flag.m4 +2 -2
  58. data/vendor/libsodium/m4/ax_check_define.m4 +92 -0
  59. data/vendor/libsodium/m4/ax_check_link_flag.m4 +3 -2
  60. data/vendor/libsodium/m4/libtool.m4 +111 -60
  61. data/vendor/libsodium/m4/ltoptions.m4 +1 -1
  62. data/vendor/libsodium/m4/ltsugar.m4 +1 -1
  63. data/vendor/libsodium/m4/ltversion.m4 +6 -6
  64. data/vendor/libsodium/m4/lt~obsolete.m4 +1 -1
  65. data/vendor/libsodium/msvc-scripts/Makefile.in +9 -0
  66. data/vendor/libsodium/msvc-scripts/process.bat +2 -2
  67. data/vendor/libsodium/src/Makefile.in +9 -0
  68. data/vendor/libsodium/src/libsodium/Makefile.am +27 -3
  69. data/vendor/libsodium/src/libsodium/Makefile.in +170 -63
  70. data/vendor/libsodium/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c +852 -0
  71. data/vendor/libsodium/src/libsodium/crypto_aead/chacha20poly1305/sodium/aead_chacha20poly1305.c +137 -17
  72. data/vendor/libsodium/src/libsodium/crypto_auth/hmacsha256/cp/hmac_hmacsha256.c +1 -0
  73. data/vendor/libsodium/src/libsodium/crypto_auth/hmacsha512/cp/hmac_hmacsha512.c +1 -0
  74. data/vendor/libsodium/src/libsodium/crypto_generichash/blake2/ref/blake2b-ref.c +10 -6
  75. data/vendor/libsodium/src/libsodium/crypto_generichash/crypto_generichash.c +1 -1
  76. data/vendor/libsodium/src/libsodium/crypto_hash/sha256/cp/hash_sha256.c +29 -23
  77. data/vendor/libsodium/src/libsodium/crypto_hash/sha512/cp/hash_sha512.c +9 -10
  78. data/vendor/libsodium/src/libsodium/crypto_pwhash/scryptsalsa208sha256/crypto_scrypt-common.c +4 -2
  79. data/vendor/libsodium/src/libsodium/crypto_pwhash/scryptsalsa208sha256/crypto_scrypt.h +1 -0
  80. data/vendor/libsodium/src/libsodium/crypto_pwhash/scryptsalsa208sha256/nosse/pwhash_scryptsalsa208sha256_nosse.c +4 -0
  81. data/vendor/libsodium/src/libsodium/crypto_pwhash/scryptsalsa208sha256/pbkdf2-sha256.c +3 -0
  82. data/vendor/libsodium/src/libsodium/crypto_pwhash/scryptsalsa208sha256/sse/pwhash_scryptsalsa208sha256_sse.c +2 -1
  83. data/vendor/libsodium/src/libsodium/crypto_scalarmult/curve25519/ref10/fe_frombytes_curve25519_ref10.c +10 -10
  84. data/vendor/libsodium/src/libsodium/crypto_stream/chacha20/ref/api.h +10 -0
  85. data/vendor/libsodium/src/libsodium/crypto_stream/chacha20/ref/stream_chacha20_ref.c +51 -0
  86. data/vendor/libsodium/src/libsodium/crypto_stream/chacha20/stream_chacha20_api.c +29 -0
  87. data/vendor/libsodium/src/libsodium/include/Makefile.am +6 -0
  88. data/vendor/libsodium/src/libsodium/include/Makefile.in +67 -40
  89. data/vendor/libsodium/src/libsodium/include/sodium.h +4 -0
  90. data/vendor/libsodium/src/libsodium/include/sodium/crypto_aead_aes256gcm.h +88 -0
  91. data/vendor/libsodium/src/libsodium/include/sodium/crypto_aead_chacha20poly1305.h +28 -2
  92. data/vendor/libsodium/src/libsodium/include/sodium/crypto_generichash_blake2b.h +0 -6
  93. data/vendor/libsodium/src/libsodium/include/sodium/crypto_hash_sha256.h +1 -1
  94. data/vendor/libsodium/src/libsodium/include/sodium/crypto_stream_chacha20.h +23 -0
  95. data/vendor/libsodium/src/libsodium/include/sodium/export.h +8 -0
  96. data/vendor/libsodium/src/libsodium/include/sodium/randombytes_nativeclient.h +37 -0
  97. data/vendor/libsodium/src/libsodium/include/sodium/randombytes_salsa20_random.h +3 -2
  98. data/vendor/libsodium/src/libsodium/include/sodium/runtime.h +6 -0
  99. data/vendor/libsodium/src/libsodium/include/sodium/utils.h +15 -1
  100. data/vendor/libsodium/src/libsodium/randombytes/nativeclient/randombytes_nativeclient.c +49 -0
  101. data/vendor/libsodium/src/libsodium/randombytes/randombytes.c +11 -1
  102. data/vendor/libsodium/src/libsodium/randombytes/salsa20/randombytes_salsa20_random.c +71 -45
  103. data/vendor/libsodium/src/libsodium/randombytes/sysrandom/randombytes_sysrandom.c +12 -2
  104. data/vendor/libsodium/src/libsodium/sodium/runtime.c +26 -3
  105. data/vendor/libsodium/src/libsodium/sodium/utils.c +86 -13
  106. data/vendor/libsodium/test/Makefile.in +9 -0
  107. data/vendor/libsodium/test/default/Makefile.am +130 -0
  108. data/vendor/libsodium/test/default/Makefile.in +197 -50
  109. data/vendor/libsodium/test/default/aead_aes256gcm.c +3197 -0
  110. data/vendor/libsodium/test/default/aead_aes256gcm.exp +1 -0
  111. data/vendor/libsodium/test/default/aead_chacha20poly1305.c +150 -17
  112. data/vendor/libsodium/test/default/aead_chacha20poly1305.exp +51 -0
  113. data/vendor/libsodium/test/default/chacha20.c +80 -5
  114. data/vendor/libsodium/test/default/chacha20.exp +11 -0
  115. data/vendor/libsodium/test/default/generichash.c +1332 -1
  116. data/vendor/libsodium/test/default/generichash.exp +1 -1
  117. data/vendor/libsodium/test/default/generichash2.c +3 -2
  118. data/vendor/libsodium/test/default/generichash3.c +2 -1
  119. data/vendor/libsodium/test/default/nacl-test-wrapper.sh +26 -0
  120. data/vendor/libsodium/test/default/randombytes.c +4 -0
  121. data/vendor/libsodium/test/default/scalarmult.c +33 -14
  122. data/vendor/libsodium/test/default/scalarmult.exp +4 -4
  123. data/vendor/libsodium/test/default/secretbox.c +9 -0
  124. data/vendor/libsodium/test/default/secretbox.exp +19 -0
  125. data/vendor/libsodium/test/default/secretbox_easy.c +40 -4
  126. data/vendor/libsodium/test/default/secretbox_easy.exp +58 -1
  127. data/vendor/libsodium/test/default/sodium_core.c +2 -0
  128. data/vendor/libsodium/test/default/sodium_utils.c +46 -7
  129. data/vendor/libsodium/test/default/sodium_utils.exp +5 -0
  130. data/vendor/libsodium/test/default/stream.c +1 -0
  131. metadata +28 -3
data/vendor/libsodium/src/libsodium/crypto_aead/aes256gcm/aesni/aead_aes256gcm_aesni.c
@@ -0,0 +1,852 @@
+
+ /*
+  * AES256-GCM, based on original code by Romain Dolbeau
+  */
+
+ #include <stdint.h>
+ #include <stdlib.h>
+ #include <string.h>
+
+ #include "crypto_aead_aes256gcm.h"
+ #include "export.h"
+ #include "runtime.h"
+ #include "utils.h"
+
+ #if defined(HAVE_WMMINTRIN_H) || \
+     (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64) || defined(_M_IX86)))
+
+ #pragma GCC target("ssse3")
+ #pragma GCC target("aes")
+ #pragma GCC target("pclmul")
+
+ #include <immintrin.h>
+
+ #if defined(__INTEL_COMPILER) || defined(_bswap64)
+ #elif defined(_MSC_VER)
+ # define _bswap64(a) _byteswap_uint64(a)
+ #elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))
+ # define _bswap64(a) __builtin_bswap64(a)
+ #else
+ static inline uint64_t
+ _bswap64(const uint64_t x)
+ {
+     return
+         ((x << 56) & 0xFF00000000000000UL) | ((x << 40) & 0x00FF000000000000UL) |
+         ((x << 24) & 0x0000FF0000000000UL) | ((x << 8) & 0x000000FF00000000UL) |
+         ((x >> 8) & 0x00000000FF000000UL) | ((x >> 24) & 0x0000000000FF0000UL) |
+         ((x >> 40) & 0x000000000000FF00UL) | ((x >> 56) & 0x00000000000000FFUL);
+ }
+ #endif
+
+ typedef struct context {
+     CRYPTO_ALIGN(16) unsigned char H[16];
+     __m128i rkeys[16];
+ } context;
+
+ static inline void
+ aesni_key256_expand(const unsigned char *key, __m128 *rkeys)
+ {
+     __m128 key0 = _mm_loadu_ps((const float *) (key + 0));
+     __m128 key1 = _mm_loadu_ps((const float *) (key + 16));
+     __m128 temp0, temp1, temp2, temp4;
+     int idx = 0;
+
+     rkeys[idx++] = key0;
+     temp0 = key0;
+     temp2 = key1;
+     temp4 = _mm_setzero_ps();
+
+     /* why single precision floating-point rather than integer instructions?
+        because _mm_shuffle_ps takes two inputs, while _mm_shuffle_epi32 only
+        takes one - it doesn't perform the same computation...
+        _mm_shuffle_ps takes the lower 64 bits of the result from the first
+        operand, and the higher 64 bits of the result from the second operand
+        (in both cases, all four input floats are accessible).
+        I don't like the non-orthogonal naming scheme :-(
+
+        This is all strongly inspired by the openssl assembly code.
+      */
+ #define BLOCK1(IMM) \
+     temp1 = _mm_castsi128_ps(_mm_aeskeygenassist_si128(_mm_castps_si128(temp2), IMM)); \
+     rkeys[idx++] = temp2; \
+     temp4 = _mm_shuffle_ps(temp4, temp0, 0x10); \
+     temp0 = _mm_xor_ps(temp0, temp4); \
+     temp4 = _mm_shuffle_ps(temp4, temp0, 0x8c); \
+     temp0 = _mm_xor_ps(temp0, temp4); \
+     temp1 = _mm_shuffle_ps(temp1, temp1, 0xff); \
+     temp0 = _mm_xor_ps(temp0, temp1)
+
+ #define BLOCK2(IMM) \
+     temp1 = _mm_castsi128_ps(_mm_aeskeygenassist_si128(_mm_castps_si128(temp0), IMM)); \
+     rkeys[idx++] = temp0; \
+     temp4 = _mm_shuffle_ps(temp4, temp2, 0x10); \
+     temp2 = _mm_xor_ps(temp2, temp4); \
+     temp4 = _mm_shuffle_ps(temp4, temp2, 0x8c); \
+     temp2 = _mm_xor_ps(temp2, temp4); \
+     temp1 = _mm_shuffle_ps(temp1, temp1, 0xaa); \
+     temp2 = _mm_xor_ps(temp2, temp1)
+
+     BLOCK1(0x01);
+     BLOCK2(0x01);
+
+     BLOCK1(0x02);
+     BLOCK2(0x02);
+
+     BLOCK1(0x04);
+     BLOCK2(0x04);
+
+     BLOCK1(0x08);
+     BLOCK2(0x08);
+
+     BLOCK1(0x10);
+     BLOCK2(0x10);
+
+     BLOCK1(0x20);
+     BLOCK2(0x20);
+
+     BLOCK1(0x40);
+     rkeys[idx++] = temp0;
+ }
+
+ /** single, by-the-book AES encryption with AES-NI */
+ static inline void
+ aesni_encrypt1(unsigned char *out, __m128i nv, const __m128i *rkeys)
+ {
+     __m128i temp = _mm_xor_si128(nv, rkeys[0]);
+     int roundctr;
+
+ #pragma unroll(13)
+     for (roundctr = 1; roundctr < 14; roundctr++) {
+         temp = _mm_aesenc_si128(temp, rkeys[roundctr]);
+     }
+     temp = _mm_aesenclast_si128(temp, rkeys[14]);
+     _mm_storeu_si128((__m128i *) out, temp);
+ }
+
+ /** multiple-blocks-at-once AES encryption with AES-NI;
+     on Haswell, aesenc has a latency of 7 and a throughput of 1,
+     so the sequence of aesenc should be bubble-free if you
+     have at least 8 blocks. Let's build an arbitrary-sized
+     function */
+ /* Step 1: loading the nonce */
+ /* load & increment the n vector (non-vectorized, unused for now) */
+ #define NVDECLx(a) \
+     __m128i nv##a
+
+ #define NVx(a) \
+     nv##a = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) n), pt); \
+     n[3]++
+
+ /* Step 2: define value in round one (xor with subkey #0, aka key) */
+ #define TEMPDECLx(a) \
+     __m128i temp##a
+
+ #define TEMPx(a) \
+     temp##a = _mm_xor_si128(nv##a, rkeys[0])
+
+ /* Step 3: one round of AES */
+ #define AESENCx(a) \
+     temp##a = _mm_aesenc_si128(temp##a, rkeys[roundctr])
+
+ /* Step 4: last round of AES */
+ #define AESENCLASTx(a) \
+     temp##a = _mm_aesenclast_si128(temp##a, rkeys[14])
+
+ /* Step 5: store result */
+ #define STOREx(a) \
+     _mm_storeu_si128((__m128i *) (out + (a * 16)), temp##a)
+
+ /* all the MAKE* macros are for automatic explicit unrolling */
+ #define MAKE4(X) \
+     X(0); \
+     X(1); \
+     X(2); \
+     X(3)
+
+ #define MAKE8(X) \
+     X(0); \
+     X(1); \
+     X(2); \
+     X(3); \
+     X(4); \
+     X(5); \
+     X(6); \
+     X(7)
+
+ #define COUNTER_INC2(N) (*(uint32_t *) &(N)[12]) = (2U + (((*(uint32_t *) &(N)[12]))))
+
+ /* create a function of unrolling N; the MAKEN is the unrolling
+    macro, defined above. The N in MAKEN must match N, obviously. */
+ #define FUNC(N, MAKEN) \
+     static inline void aesni_encrypt##N(unsigned char *out, uint32_t *n, const __m128i *rkeys) \
+     { \
+         const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+         int roundctr; \
+         MAKEN(NVDECLx); \
+         MAKEN(TEMPDECLx); \
+ \
+         MAKEN(NVx); \
+         MAKEN(TEMPx); \
+         for (roundctr = 1; roundctr < 14; roundctr++) { \
+             MAKEN(AESENCx); \
+         } \
+         MAKEN(AESENCLASTx); \
+         MAKEN(STOREx); \
+     }
+
+ FUNC(8, MAKE8)
+
+ /* all GF(2^128) functions are by the book, meaning this one:
+    <https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf>
+  */
+
+ static inline void
+ addmul(unsigned char *c, const unsigned char *a, unsigned int xlen, const unsigned char *b)
+ {
+     const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+     __m128i A, B, C;
+     __m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
+     __m128i tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp18;
+     __m128i tmp19, tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
+     __m128i tmp28, tmp29, tmp30, tmp31, tmp32, tmp33, tmp34, tmp35, tmp36;
+
+     if (xlen >= 16) {
+         A = _mm_loadu_si128((const __m128i *) a);
+     } else {
+         CRYPTO_ALIGN(16) unsigned char padded[16];
+         memset(padded, 0, 16);
+         memcpy(padded, a, xlen);
+         A = _mm_load_si128((const __m128i *) padded);
+     }
+     A = _mm_shuffle_epi8(A, rev);
+     B = _mm_loadu_si128((const __m128i *) b);
+     C = _mm_loadu_si128((const __m128i *) c);
+     A = _mm_xor_si128(A, C);
+     tmp3 = _mm_clmulepi64_si128(A, B, 0x00);
+     tmp4 = _mm_clmulepi64_si128(A, B, 0x10);
+     tmp5 = _mm_clmulepi64_si128(A, B, 0x01);
+     tmp6 = _mm_clmulepi64_si128(A, B, 0x11);
+     tmp10 = _mm_xor_si128(tmp4, tmp5);
+     tmp13 = _mm_slli_si128(tmp10, 8);
+     tmp11 = _mm_srli_si128(tmp10, 8);
+     tmp15 = _mm_xor_si128(tmp3, tmp13);
+     tmp17 = _mm_xor_si128(tmp6, tmp11);
+     tmp7 = _mm_srli_epi32(tmp15, 31);
+     tmp8 = _mm_srli_epi32(tmp17, 31);
+     tmp16 = _mm_slli_epi32(tmp15, 1);
+     tmp18 = _mm_slli_epi32(tmp17, 1);
+     tmp9 = _mm_srli_si128(tmp7, 12);
+     tmp22 = _mm_slli_si128(tmp8, 4);
+     tmp25 = _mm_slli_si128(tmp7, 4);
+     tmp29 = _mm_or_si128(tmp16, tmp25);
+     tmp19 = _mm_or_si128(tmp18, tmp22);
+     tmp20 = _mm_or_si128(tmp19, tmp9);
+     tmp26 = _mm_slli_epi32(tmp29, 31);
+     tmp23 = _mm_slli_epi32(tmp29, 30);
+     tmp32 = _mm_slli_epi32(tmp29, 25);
+     tmp27 = _mm_xor_si128(tmp26, tmp23);
+     tmp28 = _mm_xor_si128(tmp27, tmp32);
+     tmp24 = _mm_srli_si128(tmp28, 4);
+     tmp33 = _mm_slli_si128(tmp28, 12);
+     tmp30 = _mm_xor_si128(tmp29, tmp33);
+     tmp2 = _mm_srli_epi32(tmp30, 1);
+     tmp12 = _mm_srli_epi32(tmp30, 2);
+     tmp14 = _mm_srli_epi32(tmp30, 7);
+     tmp34 = _mm_xor_si128(tmp2, tmp12);
+     tmp35 = _mm_xor_si128(tmp34, tmp14);
+     tmp36 = _mm_xor_si128(tmp35, tmp24);
+     tmp31 = _mm_xor_si128(tmp30, tmp36);
+     tmp21 = _mm_xor_si128(tmp20, tmp31);
+     _mm_storeu_si128((__m128i *) c, tmp21);
+ }
+
+ /* pure multiplication, for pre-computing powers of H */
+ static inline __m128i
+ mulv(__m128i A, __m128i B)
+ {
+     __m128i tmp3 = _mm_clmulepi64_si128(A, B, 0x00);
+     __m128i tmp4 = _mm_clmulepi64_si128(A, B, 0x10);
+     __m128i tmp5 = _mm_clmulepi64_si128(A, B, 0x01);
+     __m128i tmp6 = _mm_clmulepi64_si128(A, B, 0x11);
+     __m128i tmp10 = _mm_xor_si128(tmp4, tmp5);
+     __m128i tmp13 = _mm_slli_si128(tmp10, 8);
+     __m128i tmp11 = _mm_srli_si128(tmp10, 8);
+     __m128i tmp15 = _mm_xor_si128(tmp3, tmp13);
+     __m128i tmp17 = _mm_xor_si128(tmp6, tmp11);
+     __m128i tmp7 = _mm_srli_epi32(tmp15, 31);
+     __m128i tmp8 = _mm_srli_epi32(tmp17, 31);
+     __m128i tmp16 = _mm_slli_epi32(tmp15, 1);
+     __m128i tmp18 = _mm_slli_epi32(tmp17, 1);
+     __m128i tmp9 = _mm_srli_si128(tmp7, 12);
+     __m128i tmp22 = _mm_slli_si128(tmp8, 4);
+     __m128i tmp25 = _mm_slli_si128(tmp7, 4);
+     __m128i tmp29 = _mm_or_si128(tmp16, tmp25);
+     __m128i tmp19 = _mm_or_si128(tmp18, tmp22);
+     __m128i tmp20 = _mm_or_si128(tmp19, tmp9);
+     __m128i tmp26 = _mm_slli_epi32(tmp29, 31);
+     __m128i tmp23 = _mm_slli_epi32(tmp29, 30);
+     __m128i tmp32 = _mm_slli_epi32(tmp29, 25);
+     __m128i tmp27 = _mm_xor_si128(tmp26, tmp23);
+     __m128i tmp28 = _mm_xor_si128(tmp27, tmp32);
+     __m128i tmp24 = _mm_srli_si128(tmp28, 4);
+     __m128i tmp33 = _mm_slli_si128(tmp28, 12);
+     __m128i tmp30 = _mm_xor_si128(tmp29, tmp33);
+     __m128i tmp2 = _mm_srli_epi32(tmp30, 1);
+     __m128i tmp12 = _mm_srli_epi32(tmp30, 2);
+     __m128i tmp14 = _mm_srli_epi32(tmp30, 7);
+     __m128i tmp34 = _mm_xor_si128(tmp2, tmp12);
+     __m128i tmp35 = _mm_xor_si128(tmp34, tmp14);
+     __m128i tmp36 = _mm_xor_si128(tmp35, tmp24);
+     __m128i tmp31 = _mm_xor_si128(tmp30, tmp36);
+     __m128i C = _mm_xor_si128(tmp20, tmp31);
+
+     return C;
+ }
+
+ /* 4 multiply-accumulate at once; again
+    <https://software.intel.com/sites/default/files/managed/72/cc/clmul-wp-rev-2.02-2014-04-20.pdf>
+    for the Aggregated Reduction Method & sample code.
+    Algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */
+
+ #define RED_DECL(a) __m128i H##a##_X##a##_lo, H##a##_X##a##_hi, tmp##a, tmp##a##B
+ #define RED_SHUFFLE(a) X##a = _mm_shuffle_epi8(X##a, rev)
+ #define RED_MUL_LOW(a) H##a##_X##a##_lo = _mm_clmulepi64_si128(H##a, X##a, 0x00)
+ #define RED_MUL_HIGH(a) H##a##_X##a##_hi = _mm_clmulepi64_si128(H##a, X##a, 0x11)
+ #define RED_MUL_MID(a) \
+     tmp##a = _mm_shuffle_epi32(H##a, 0x4e); \
+     tmp##a##B = _mm_shuffle_epi32(X##a, 0x4e); \
+     tmp##a = _mm_xor_si128(tmp##a, H##a); \
+     tmp##a##B = _mm_xor_si128(tmp##a##B, X##a); \
+     tmp##a = _mm_clmulepi64_si128(tmp##a, tmp##a##B, 0x00)
+
+ #define REDUCE4(rev, H0_, H1_, H2_, H3_, X0_, X1_, X2_, X3_, accv) \
+     do { \
+         MAKE4(RED_DECL); \
+         __m128i lo, hi; \
+         __m128i tmp8, tmp9; \
+         __m128i H0 = H0_; \
+         __m128i H1 = H1_; \
+         __m128i H2 = H2_; \
+         __m128i H3 = H3_; \
+         __m128i X0 = X0_; \
+         __m128i X1 = X1_; \
+         __m128i X2 = X2_; \
+         __m128i X3 = X3_; \
+ \
+         /* byte-revert the inputs & xor the first one into the accumulator */ \
+ \
+         MAKE4(RED_SHUFFLE); \
+         X3 = _mm_xor_si128(X3, accv); \
+ \
+         /* 4 low H*X (x0*h0) */ \
+ \
+         MAKE4(RED_MUL_LOW); \
+         lo = _mm_xor_si128(H0_X0_lo, H1_X1_lo); \
+         lo = _mm_xor_si128(lo, H2_X2_lo); \
+         lo = _mm_xor_si128(lo, H3_X3_lo); \
+ \
+         /* 4 high H*X (x1*h1) */ \
+ \
+         MAKE4(RED_MUL_HIGH); \
+         hi = _mm_xor_si128(H0_X0_hi, H1_X1_hi); \
+         hi = _mm_xor_si128(hi, H2_X2_hi); \
+         hi = _mm_xor_si128(hi, H3_X3_hi); \
+ \
+         /* 4 middle H*X, using Karatsuba, i.e. \
+            x1*h0+x0*h1 = (x1+x0)*(h1+h0)-x1*h1-x0*h0 \
+            we already have all x1y1 & x0y0 (accumulated in hi & lo) \
+            (0 is low half and 1 is high half) \
+          */ \
+         /* permute the high and low 64 bits in H1 & X1, \
+            so create (h0,h1) from (h1,h0) and (x0,x1) from (x1,x0), \
+            then compute (h0+h1,h1+h0) and (x0+x1,x1+x0), \
+            and finally multiply \
+          */ \
+         MAKE4(RED_MUL_MID); \
+ \
+         /* subtracts x1*h1 and x0*h0 */ \
+         tmp0 = _mm_xor_si128(tmp0, lo); \
+         tmp0 = _mm_xor_si128(tmp0, hi); \
+         tmp0 = _mm_xor_si128(tmp1, tmp0); \
+         tmp0 = _mm_xor_si128(tmp2, tmp0); \
+         tmp0 = _mm_xor_si128(tmp3, tmp0); \
+ \
+         /* reduction */ \
+         tmp0B = _mm_slli_si128(tmp0, 8); \
+         tmp0 = _mm_srli_si128(tmp0, 8); \
+         lo = _mm_xor_si128(tmp0B, lo); \
+         hi = _mm_xor_si128(tmp0, hi); \
+         tmp3 = lo; \
+         tmp2B = hi; \
+         tmp3B = _mm_srli_epi32(tmp3, 31); \
+         tmp8 = _mm_srli_epi32(tmp2B, 31); \
+         tmp3 = _mm_slli_epi32(tmp3, 1); \
+         tmp2B = _mm_slli_epi32(tmp2B, 1); \
+         tmp9 = _mm_srli_si128(tmp3B, 12); \
+         tmp8 = _mm_slli_si128(tmp8, 4); \
+         tmp3B = _mm_slli_si128(tmp3B, 4); \
+         tmp3 = _mm_or_si128(tmp3, tmp3B); \
+         tmp2B = _mm_or_si128(tmp2B, tmp8); \
+         tmp2B = _mm_or_si128(tmp2B, tmp9); \
+         tmp3B = _mm_slli_epi32(tmp3, 31); \
+         tmp8 = _mm_slli_epi32(tmp3, 30); \
+         tmp9 = _mm_slli_epi32(tmp3, 25); \
+         tmp3B = _mm_xor_si128(tmp3B, tmp8); \
+         tmp3B = _mm_xor_si128(tmp3B, tmp9); \
+         tmp8 = _mm_srli_si128(tmp3B, 4); \
+         tmp3B = _mm_slli_si128(tmp3B, 12); \
+         tmp3 = _mm_xor_si128(tmp3, tmp3B); \
+         tmp2 = _mm_srli_epi32(tmp3, 1); \
+         tmp0B = _mm_srli_epi32(tmp3, 2); \
+         tmp1B = _mm_srli_epi32(tmp3, 7); \
+         tmp2 = _mm_xor_si128(tmp2, tmp0B); \
+         tmp2 = _mm_xor_si128(tmp2, tmp1B); \
+         tmp2 = _mm_xor_si128(tmp2, tmp8); \
+         tmp3 = _mm_xor_si128(tmp3, tmp2); \
+         tmp2B = _mm_xor_si128(tmp2B, tmp3); \
+ \
+         accv = tmp2B; \
+     } while(0)
+
+ #define XORx(a) \
+     temp##a = _mm_xor_si128(temp##a, \
+         _mm_loadu_si128((const __m128i *) (in + a * 16)))
+
+ #define LOADx(a) \
+     __m128i in##a = _mm_loadu_si128((const __m128i *) (in + a * 16))
+
+ /* full encrypt & checksum 8 blocks at once */
+ #define aesni_encrypt8full(out_, n_, rkeys, in_, accum, hv_, h2v_, h3v_, h4v_, rev) \
+     do { \
+         unsigned char *out = out_; \
+         uint32_t *n = n_; \
+         const unsigned char *in = in_; \
+         const __m128i hv = hv_; \
+         const __m128i h2v = h2v_; \
+         const __m128i h3v = h3v_; \
+         const __m128i h4v = h4v_; \
+         const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+         __m128i accv_; \
+         int roundctr; \
+ \
+         MAKE8(NVDECLx); \
+         MAKE8(TEMPDECLx); \
+         MAKE8(NVx); \
+         MAKE8(TEMPx); \
+         for (roundctr = 1; roundctr < 14; roundctr++) { \
+             MAKE8(AESENCx); \
+         } \
+         MAKE8(AESENCLASTx); \
+         MAKE8(XORx); \
+         MAKE8(STOREx); \
+         accv_ = _mm_load_si128((const __m128i *) accum); \
+         REDUCE4(rev, hv, h2v, h3v, h4v, temp3, temp2, temp1, temp0, accv_); \
+         REDUCE4(rev, hv, h2v, h3v, h4v, temp7, temp6, temp5, temp4, accv_); \
+         _mm_store_si128((__m128i *) accum, accv_); \
+     } while(0)
+
+ /* checksum 8 blocks at once */
+ #define aesni_addmul8full(in_, accum, hv_, h2v_, h3v_, h4v_, rev) \
+     do { \
+         const unsigned char *in = in_; \
+         const __m128i hv = hv_; \
+         const __m128i h2v = h2v_; \
+         const __m128i h3v = h3v_; \
+         const __m128i h4v = h4v_; \
+         __m128i accv_; \
+ \
+         MAKE8(LOADx); \
+         accv_ = _mm_load_si128((const __m128i *) accum); \
+         REDUCE4(rev, hv, h2v, h3v, h4v, in3, in2, in1, in0, accv_); \
+         REDUCE4(rev, hv, h2v, h3v, h4v, in7, in6, in5, in4, accv_); \
+         _mm_store_si128((__m128i *) accum, accv_); \
+     } while(0)
+
+ /* decrypt 8 blocks at once */
+ #define aesni_decrypt8full(out_, n_, rkeys, in_) \
+     do { \
+         unsigned char *out = out_; \
+         uint32_t *n = n_; \
+         const unsigned char *in = in_; \
+         const __m128i pt = _mm_set_epi8(12, 13, 14, 15, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); \
+         int roundctr; \
+ \
+         MAKE8(NVDECLx); \
+         MAKE8(TEMPDECLx); \
+         MAKE8(NVx); \
+         MAKE8(TEMPx); \
+         for (roundctr = 1; roundctr < 14; roundctr++) { \
+             MAKE8(AESENCx); \
+         } \
+         MAKE8(AESENCLASTx); \
+         MAKE8(XORx); \
+         MAKE8(STOREx); \
+     } while(0)
+
+ int
+ crypto_aead_aes256gcm_beforenm(crypto_aead_aes256gcm_state *ctx_,
+                                const unsigned char *k)
+ {
+     context *ctx = (context *) ctx_;
+     __m128i *rkeys = ctx->rkeys;
+     __m128i zero = _mm_setzero_si128();
+     unsigned char *H = ctx->H;
+
+     (void) sizeof(int[(sizeof *ctx_) >= (sizeof *ctx) ? 1 : -1]);
+     aesni_key256_expand(k, (__m128 *) rkeys);
+     aesni_encrypt1(H, zero, rkeys);
+
+     return 0;
+ }
+
+ int
+ crypto_aead_aes256gcm_encrypt_afternm(unsigned char *c, unsigned long long *clen,
+                                       const unsigned char *m, unsigned long long mlen,
+                                       const unsigned char *ad, unsigned long long adlen,
+                                       const unsigned char *nsec,
+                                       const unsigned char *npub,
+                                       const crypto_aead_aes256gcm_state *ctx_)
+ {
+     unsigned char H[16];
+     const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+     const context *ctx = (const context *) ctx_;
+     const __m128i *rkeys = ctx->rkeys;
+     __m128i Hv, H2v, H3v, H4v, accv;
+     unsigned long long i, j;
+     unsigned long long adlen_rnd64 = adlen & ~63ULL;
+     unsigned long long mlen_rnd128 = mlen & ~127ULL;
+     CRYPTO_ALIGN(16) unsigned char n2[16];
+     CRYPTO_ALIGN(16) unsigned char T[16];
+     CRYPTO_ALIGN(16) unsigned char accum[16];
+     CRYPTO_ALIGN(16) unsigned char fb[16];
+
+     (void) nsec;
+     memcpy(H, ctx->H, sizeof H);
+     if (mlen > 16ULL * (1ULL << 32)) {
+         abort();
+     }
+     memcpy(&n2[0], npub, 12);
+     *(uint32_t *) &n2[12] = 0x01000000;
+     aesni_encrypt1(T, _mm_load_si128((const __m128i *) n2), rkeys);
+
+     (*(uint64_t *) &fb[0]) = _bswap64((uint64_t) (8 * adlen));
+     (*(uint64_t *) &fb[8]) = _bswap64((uint64_t) (8 * mlen));
+
+     /* we store H (and its powers) byte-reverted once and for all */
+     Hv = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) H), rev);
+     _mm_store_si128((__m128i *) H, Hv);
+     H2v = mulv(Hv, Hv);
+     H3v = mulv(H2v, Hv);
+     H4v = mulv(H3v, Hv);
+
+     accv = _mm_setzero_si128();
+     /* unrolled by 4 GCM (by 8 doesn't improve using REDUCE4) */
+     for (i = 0; i < adlen_rnd64; i += 64) {
+         __m128i X4_ = _mm_loadu_si128((const __m128i *) (ad + i + 0));
+         __m128i X3_ = _mm_loadu_si128((const __m128i *) (ad + i + 16));
+         __m128i X2_ = _mm_loadu_si128((const __m128i *) (ad + i + 32));
+         __m128i X1_ = _mm_loadu_si128((const __m128i *) (ad + i + 48));
+         REDUCE4(rev, Hv, H2v, H3v, H4v, X1_, X2_, X3_, X4_, accv);
+     }
+     _mm_store_si128((__m128i *) accum, accv);
+
+     /* GCM remainder loop */
+     for (i = adlen_rnd64; i < adlen; i += 16) {
+         unsigned int blocklen = 16;
+
+         if (i + (unsigned long long) blocklen > adlen) {
+             blocklen = (unsigned int) (adlen - i);
+         }
+         addmul(accum, ad + i, blocklen, H);
+     }
+
+ /* this only does 8 full blocks, so no fancy bounds checking is necessary */
+ #define LOOPRND128 \
+     do { \
+         const int iter = 8; \
+         const int lb = iter * 16; \
+ \
+         for (i = 0; i < mlen_rnd128; i += lb) { \
+             aesni_encrypt8full(c + i, (uint32_t *) n2, rkeys, m + i, accum, Hv, H2v, H3v, H4v, rev); \
+         } \
+     } while(0)
+
+ /* remainder loop, with the slower GCM update to accommodate partial blocks */
+ #define LOOPRMD128 \
+     do { \
+         const int iter = 8; \
+         const int lb = iter * 16; \
+ \
+         for (i = mlen_rnd128; i < mlen; i += lb) { \
+             CRYPTO_ALIGN(16) unsigned char outni[8 * 16]; \
+             unsigned long long mj = lb; \
+ \
+             aesni_encrypt8(outni, (uint32_t *) n2, rkeys); \
+             if ((i + mj) >= mlen) { \
+                 mj = mlen - i; \
+             } \
+             for (j = 0; j < mj; j++) { \
+                 c[i + j] = m[i + j] ^ outni[j]; \
+             } \
+             for (j = 0; j < mj; j += 16) { \
+                 unsigned int bl = 16; \
+ \
+                 if (j + (unsigned long long) bl >= mj) { \
+                     bl = (unsigned int) (mj - j); \
+                 } \
+                 addmul(accum, c + i + j, bl, H); \
+             } \
+         } \
+     } while(0)
+
+     n2[15] = 0;
+     COUNTER_INC2(n2);
+     LOOPRND128;
+     LOOPRMD128;
+
+     addmul(accum, fb, 16, H);
+
+     for (i = 0; i < 16; ++i) {
+         c[i + mlen] = T[i] ^ accum[15 - i];
+     }
+     if (clen != NULL) {
+         *clen = mlen + 16;
+     }
+     return 0;
+ }
+
+ int
+ crypto_aead_aes256gcm_decrypt_afternm(unsigned char *m, unsigned long long *mlen_p,
+                                       unsigned char *nsec,
+                                       const unsigned char *c, unsigned long long clen,
+                                       const unsigned char *ad, unsigned long long adlen,
+                                       const unsigned char *npub,
+                                       const crypto_aead_aes256gcm_state *ctx_)
+ {
+     unsigned char H[16];
+     const __m128i rev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+     const context *ctx = (const context *) ctx_;
+     const __m128i *rkeys = ctx->rkeys;
+     __m128i Hv, H2v, H3v, H4v, accv;
+     unsigned long long i, j;
+     unsigned long long adlen_rnd64 = adlen & ~63ULL;
+     unsigned long long mlen;
+     unsigned long long mlen_rnd128;
+     CRYPTO_ALIGN(16) unsigned char n2[16];
+     CRYPTO_ALIGN(16) unsigned char T[16];
+     CRYPTO_ALIGN(16) unsigned char accum[16];
+     CRYPTO_ALIGN(16) unsigned char fb[16];
+
+     (void) nsec;
+     memcpy(H, ctx->H, sizeof H);
+     if (clen > 16ULL * (1ULL << 32) - 16ULL) {
+         abort();
+     }
+     mlen = clen - 16;
+     if (mlen_p != NULL) {
+         *mlen_p = 0U;
+     }
+     memcpy(&n2[0], npub, 12);
+     *(uint32_t *) &n2[12] = 0x01000000;
+     aesni_encrypt1(T, _mm_load_si128((const __m128i *) n2), rkeys);
+
+     (*(uint64_t *) &fb[0]) = _bswap64((uint64_t) (8 * adlen));
+     (*(uint64_t *) &fb[8]) = _bswap64((uint64_t) (8 * mlen));
+
+     Hv = _mm_shuffle_epi8(_mm_load_si128((const __m128i *) H), rev);
+     _mm_store_si128((__m128i *) H, Hv);
+     H2v = mulv(Hv, Hv);
+     H3v = mulv(H2v, Hv);
+     H4v = mulv(H3v, Hv);
+
+     accv = _mm_setzero_si128();
+     for (i = 0; i < adlen_rnd64; i += 64) {
+         __m128i X4_ = _mm_loadu_si128((const __m128i *) (ad + i + 0));
+         __m128i X3_ = _mm_loadu_si128((const __m128i *) (ad + i + 16));
+         __m128i X2_ = _mm_loadu_si128((const __m128i *) (ad + i + 32));
+         __m128i X1_ = _mm_loadu_si128((const __m128i *) (ad + i + 48));
+         REDUCE4(rev, Hv, H2v, H3v, H4v, X1_, X2_, X3_, X4_, accv);
+     }
+     _mm_store_si128((__m128i *) accum, accv);
+
+     for (i = adlen_rnd64; i < adlen; i += 16) {
+         unsigned int blocklen = 16;
+         if (i + (unsigned long long) blocklen > adlen) {
+             blocklen = (unsigned int) (adlen - i);
+         }
+         addmul(accum, ad + i, blocklen, H);
+     }
+
+     mlen_rnd128 = mlen & ~127ULL;
+
+ #define LOOPACCUMDRND128 \
+     do { \
+         const int iter = 8; \
+         const int lb = iter * 16; \
+         for (i = 0; i < mlen_rnd128; i += lb) { \
+             aesni_addmul8full(c + i, accum, Hv, H2v, H3v, H4v, rev); \
+         } \
+     } while(0)
+
+ #define LOOPDRND128 \
+     do { \
+         const int iter = 8; \
+         const int lb = iter * 16; \
+ \
+         for (i = 0; i < mlen_rnd128; i += lb) { \
+             aesni_decrypt8full(m + i, (uint32_t *) n2, rkeys, c + i); \
+         } \
+     } while(0)
+
+ #define LOOPACCUMDRMD128 \
+     do { \
+         const int iter = 8; \
+         const int lb = iter * 16; \
+ \
+         for (i = mlen_rnd128; i < mlen; i += lb) { \
+             unsigned long long mj = lb; \
+ \
+             if ((i + mj) >= mlen) { \
+                 mj = mlen - i; \
+             } \
+             for (j = 0; j < mj; j += 16) { \
+                 unsigned int bl = 16; \
+ \
+                 if (j + (unsigned long long) bl >= mj) { \
+                     bl = (unsigned int) (mj - j); \
+                 } \
+                 addmul(accum, c + i + j, bl, H); \
+             } \
+         } \
+     } while(0)
+
+ #define LOOPDRMD128 \
+     do { \
+         const int iter = 8; \
+         const int lb = iter * 16; \
+ \
+         for (i = mlen_rnd128; i < mlen; i += lb) { \
+             CRYPTO_ALIGN(16) unsigned char outni[8 * 16]; \
+             unsigned long long mj = lb; \
+ \
+             if ((i + mj) >= mlen) { \
+                 mj = mlen - i; \
+             } \
+             aesni_encrypt8(outni, (uint32_t *) n2, rkeys); \
+             for (j = 0; j < mj; j++) { \
+                 m[i + j] = c[i + j] ^ outni[j]; \
+             } \
+         } \
+     } while(0)
+     n2[15] = 0;
+
+     COUNTER_INC2(n2);
+     LOOPACCUMDRND128;
+     LOOPACCUMDRMD128;
+     addmul(accum, fb, 16, H);
+     {
+         unsigned char d = 0;
+
+         for (i = 0; i < 16; i++) {
+             d |= (c[i + mlen] ^ (T[i] ^ accum[15 - i]));
+         }
+         if (d != 0) {
+             return -1;
+         }
+     }
+     *(uint32_t *) &n2[12] = 0;
+     COUNTER_INC2(n2);
+     LOOPDRND128;
+     LOOPDRMD128;
+
+     if (mlen_p != NULL) {
+         *mlen_p = mlen;
+     }
+     return 0;
+ }
+
+ int
+ crypto_aead_aes256gcm_encrypt(unsigned char *c,
+                               unsigned long long *clen_p,
+                               const unsigned char *m,
+                               unsigned long long mlen,
+                               const unsigned char *ad,
+                               unsigned long long adlen,
+                               const unsigned char *nsec,
+                               const unsigned char *npub,
+                               const unsigned char *k)
+ {
+     crypto_aead_aes256gcm_state ctx;
+
+     crypto_aead_aes256gcm_beforenm(&ctx, k);
+
+     return crypto_aead_aes256gcm_encrypt_afternm
+         (c, clen_p, m, mlen, ad, adlen, nsec, npub,
+          (const crypto_aead_aes256gcm_state *) &ctx);
+ }
+
+ int
+ crypto_aead_aes256gcm_decrypt(unsigned char *m,
+                               unsigned long long *mlen_p,
+                               unsigned char *nsec,
+                               const unsigned char *c,
+                               unsigned long long clen,
+                               const unsigned char *ad,
+                               unsigned long long adlen,
+                               const unsigned char *npub,
+                               const unsigned char *k)
+ {
+     crypto_aead_aes256gcm_state ctx;
+
+     crypto_aead_aes256gcm_beforenm(&ctx, k);
+
+     return crypto_aead_aes256gcm_decrypt_afternm
+         (m, mlen_p, nsec, c, clen, ad, adlen, npub,
+          (const crypto_aead_aes256gcm_state *) &ctx);
+ }
+
+ int
+ crypto_aead_aes256gcm_is_available(void)
+ {
+     return sodium_runtime_has_pclmul() & sodium_runtime_has_aesni();
+ }
+
+ size_t
+ crypto_aead_aes256gcm_keybytes(void)
+ {
+     return crypto_aead_aes256gcm_KEYBYTES;
+ }
+
+ size_t
+ crypto_aead_aes256gcm_nsecbytes(void)
+ {
+     return crypto_aead_aes256gcm_NSECBYTES;
+ }
+
+ size_t
+ crypto_aead_aes256gcm_npubbytes(void)
+ {
+     return crypto_aead_aes256gcm_NPUBBYTES;
+ }
+
+ size_t
+ crypto_aead_aes256gcm_abytes(void)
+ {
+     return crypto_aead_aes256gcm_ABYTES;
+ }
+
+ size_t
+ crypto_aead_aes256gcm_statebytes(void)
+ {
+     return (sizeof(crypto_aead_aes256gcm_state) + (size_t) 15U) & ~(size_t) 15U;
+ }
+
+ #else
+
+ int
+ crypto_aead_aes256gcm_is_available(void)
+ {
+     return 0;
+ }
+
+ #endif
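
For reference, the following standalone sketch (not part of this changeset) shows one way the AES256-GCM API introduced above could be called from an application. It is a minimal sketch only: it assumes the libsodium 1.0.4 headers are installed and that sodium.h exposes the new crypto_aead_aes256gcm_* declarations, and it uses sodium_init() and randombytes_buf() from the existing libsodium API. crypto_aead_aes256gcm_is_available() must be checked first, because the AES-NI/PCLMUL code path above is only compiled for x86/x86_64 and the function returns 0 when the CPU lacks those instructions.

/* Hypothetical usage sketch for the AES256-GCM AEAD API added in this release.
   Not part of the diff above; error handling is reduced to the minimum. */
#include <stdio.h>
#include <string.h>

#include <sodium.h>

int
main(void)
{
    unsigned char      key[crypto_aead_aes256gcm_KEYBYTES];
    unsigned char      nonce[crypto_aead_aes256gcm_NPUBBYTES];
    unsigned char      msg[] = "test message";
    unsigned char      ciphertext[sizeof msg + crypto_aead_aes256gcm_ABYTES];
    unsigned char      decrypted[sizeof msg];
    unsigned long long ciphertext_len;
    unsigned long long decrypted_len;

    if (sodium_init() == -1) {
        return 1;
    }
    /* the AES-NI implementation is only available on CPUs with AES-NI and PCLMULQDQ */
    if (!crypto_aead_aes256gcm_is_available()) {
        fprintf(stderr, "AES256-GCM is not supported on this CPU\n");
        return 1;
    }
    randombytes_buf(key, sizeof key);
    randombytes_buf(nonce, sizeof nonce);

    /* encrypt: no additional data (ad = NULL, adlen = 0), nsec is unused */
    crypto_aead_aes256gcm_encrypt(ciphertext, &ciphertext_len,
                                  msg, sizeof msg,
                                  NULL, 0, NULL,
                                  nonce, key);

    /* decrypt and verify the 16-byte tag appended to the ciphertext */
    if (crypto_aead_aes256gcm_decrypt(decrypted, &decrypted_len, NULL,
                                      ciphertext, ciphertext_len,
                                      NULL, 0, nonce, key) != 0) {
        fprintf(stderr, "forged or corrupted ciphertext\n");
        return 1;
    }
    return 0;
}

Applications that encrypt many messages under the same key could instead call crypto_aead_aes256gcm_beforenm() once and then use the _encrypt_afternm()/_decrypt_afternm() variants defined above, which reuse the expanded round keys and the precomputed GHASH key stored in the context structure.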