ed25519_blake2b 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61)
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/CODE_OF_CONDUCT.md +74 -0
  4. data/Gemfile +6 -0
  5. data/Gemfile.lock +23 -0
  6. data/LICENSE +21 -0
  7. data/README.md +39 -0
  8. data/Rakefile +13 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/ed25519_blake2b.gemspec +31 -0
  12. data/ext/ed25519_blake2b/blake2-config.h +72 -0
  13. data/ext/ed25519_blake2b/blake2-impl.h +160 -0
  14. data/ext/ed25519_blake2b/blake2.h +195 -0
  15. data/ext/ed25519_blake2b/blake2b-load-sse2.h +68 -0
  16. data/ext/ed25519_blake2b/blake2b-load-sse41.h +402 -0
  17. data/ext/ed25519_blake2b/blake2b-ref.c +373 -0
  18. data/ext/ed25519_blake2b/blake2b-round.h +157 -0
  19. data/ext/ed25519_blake2b/curve25519-donna-32bit.h +579 -0
  20. data/ext/ed25519_blake2b/curve25519-donna-64bit.h +413 -0
  21. data/ext/ed25519_blake2b/curve25519-donna-helpers.h +67 -0
  22. data/ext/ed25519_blake2b/curve25519-donna-sse2.h +1112 -0
  23. data/ext/ed25519_blake2b/ed25519-donna-32bit-sse2.h +513 -0
  24. data/ext/ed25519_blake2b/ed25519-donna-32bit-tables.h +61 -0
  25. data/ext/ed25519_blake2b/ed25519-donna-64bit-sse2.h +436 -0
  26. data/ext/ed25519_blake2b/ed25519-donna-64bit-tables.h +53 -0
  27. data/ext/ed25519_blake2b/ed25519-donna-64bit-x86-32bit.h +435 -0
  28. data/ext/ed25519_blake2b/ed25519-donna-64bit-x86.h +351 -0
  29. data/ext/ed25519_blake2b/ed25519-donna-basepoint-table.h +259 -0
  30. data/ext/ed25519_blake2b/ed25519-donna-batchverify.h +275 -0
  31. data/ext/ed25519_blake2b/ed25519-donna-impl-base.h +364 -0
  32. data/ext/ed25519_blake2b/ed25519-donna-impl-sse2.h +390 -0
  33. data/ext/ed25519_blake2b/ed25519-donna-portable-identify.h +103 -0
  34. data/ext/ed25519_blake2b/ed25519-donna-portable.h +135 -0
  35. data/ext/ed25519_blake2b/ed25519-donna.h +115 -0
  36. data/ext/ed25519_blake2b/ed25519-hash-custom.c +28 -0
  37. data/ext/ed25519_blake2b/ed25519-hash-custom.h +30 -0
  38. data/ext/ed25519_blake2b/ed25519-hash.h +219 -0
  39. data/ext/ed25519_blake2b/ed25519-randombytes-custom.h +10 -0
  40. data/ext/ed25519_blake2b/ed25519-randombytes.h +91 -0
  41. data/ext/ed25519_blake2b/ed25519.c +150 -0
  42. data/ext/ed25519_blake2b/ed25519.h +30 -0
  43. data/ext/ed25519_blake2b/extconf.rb +3 -0
  44. data/ext/ed25519_blake2b/fuzz/README.md +173 -0
  45. data/ext/ed25519_blake2b/fuzz/build-nix.php +134 -0
  46. data/ext/ed25519_blake2b/fuzz/curve25519-ref10.c +1272 -0
  47. data/ext/ed25519_blake2b/fuzz/curve25519-ref10.h +8 -0
  48. data/ext/ed25519_blake2b/fuzz/ed25519-donna-sse2.c +3 -0
  49. data/ext/ed25519_blake2b/fuzz/ed25519-donna.c +1 -0
  50. data/ext/ed25519_blake2b/fuzz/ed25519-donna.h +34 -0
  51. data/ext/ed25519_blake2b/fuzz/ed25519-ref10.c +4647 -0
  52. data/ext/ed25519_blake2b/fuzz/ed25519-ref10.h +9 -0
  53. data/ext/ed25519_blake2b/fuzz/fuzz-curve25519.c +172 -0
  54. data/ext/ed25519_blake2b/fuzz/fuzz-ed25519.c +219 -0
  55. data/ext/ed25519_blake2b/modm-donna-32bit.h +469 -0
  56. data/ext/ed25519_blake2b/modm-donna-64bit.h +361 -0
  57. data/ext/ed25519_blake2b/rbext.c +25 -0
  58. data/ext/ed25519_blake2b/regression.h +1024 -0
  59. data/lib/ed25519_blake2b/ed25519_blake2b.rb +4 -0
  60. data/lib/ed25519_blake2b/version.rb +3 -0
  61. metadata +147 -0
@@ -0,0 +1,373 @@
1
+ /*
2
+ BLAKE2 reference source code package - optimized C implementations
3
+
4
+ Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
5
+ terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
6
+ your option. The terms of these licenses can be found at:
7
+
8
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
9
+ - OpenSSL license : https://www.openssl.org/source/license.html
10
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
11
+
12
+ More information about the BLAKE2 hash function can be found at
13
+ https://blake2.net.
14
+ */
15
+
16
+ #include <stdint.h>
17
+ #include <string.h>
18
+ #include <stdio.h>
19
+
20
+ #include "blake2.h"
21
+ #include "blake2-impl.h"
22
+
23
+ #include "blake2-config.h"
24
+
25
+ #ifdef _MSC_VER
26
+ #include <intrin.h> /* for _mm_set_epi64x */
27
+ #endif
28
+ #include <emmintrin.h>
29
+ #if defined(HAVE_SSSE3)
30
+ #include <tmmintrin.h>
31
+ #endif
32
+ #if defined(HAVE_SSE41)
33
+ #include <smmintrin.h>
34
+ #endif
35
+ #if defined(HAVE_AVX)
36
+ #include <immintrin.h>
37
+ #endif
38
+ #if defined(HAVE_XOP)
39
+ #include <x86intrin.h>
40
+ #endif
41
+
42
+ #include "blake2b-round.h"
43
+
44
/* BLAKE2b initialization vector: eight 64-bit words, one per chaining-value word. */
static const uint64_t blake2b_IV[8] =
{
  0x6a09e667f3bcc908ULL, /* IV[0] */
  0xbb67ae8584caa73bULL, /* IV[1] */
  0x3c6ef372fe94f82bULL, /* IV[2] */
  0xa54ff53a5f1d36f1ULL, /* IV[3] */
  0x510e527fade682d1ULL, /* IV[4] */
  0x9b05688c2b3e6c1fULL, /* IV[5] */
  0x1f83d9abfb41bd6bULL, /* IV[6] */
  0x5be0cd19137e2179ULL  /* IV[7] */
};
51
+
52
+ /* Some helper functions */
53
+ static void blake2b_set_lastnode( blake2b_state *S )
54
+ {
55
+ S->f[1] = (uint64_t)-1;
56
+ }
57
+
58
+ static int blake2b_is_lastblock( const blake2b_state *S )
59
+ {
60
+ return S->f[0] != 0;
61
+ }
62
+
63
+ static void blake2b_set_lastblock( blake2b_state *S )
64
+ {
65
+ if( S->last_node ) blake2b_set_lastnode( S );
66
+
67
+ S->f[0] = (uint64_t)-1;
68
+ }
69
+
70
+ static void blake2b_increment_counter( blake2b_state *S, const uint64_t inc )
71
+ {
72
+ S->t[0] += inc;
73
+ S->t[1] += ( S->t[0] < inc );
74
+ }
75
+
76
+ /* init xors IV with input parameter block */
77
+ int blake2b_init_param( blake2b_state *S, const blake2b_param *P )
78
+ {
79
+ size_t i;
80
+ /*blake2b_init0( S ); */
81
+ const unsigned char * v = ( const unsigned char * )( blake2b_IV );
82
+ const unsigned char * p = ( const unsigned char * )( P );
83
+ unsigned char * h = ( unsigned char * )( S->h );
84
+ /* IV XOR ParamBlock */
85
+ memset( S, 0, sizeof( blake2b_state ) );
86
+
87
+ for( i = 0; i < BLAKE2B_OUTBYTES; ++i ) h[i] = v[i] ^ p[i];
88
+
89
+ S->outlen = P->digest_length;
90
+ return 0;
91
+ }
92
+
93
+
94
+ /* Some sort of default parameter block initialization, for sequential blake2b */
95
+ int blake2b_init( blake2b_state *S, size_t outlen )
96
+ {
97
+ blake2b_param P[1];
98
+
99
+ if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;
100
+
101
+ P->digest_length = (uint8_t)outlen;
102
+ P->key_length = 0;
103
+ P->fanout = 1;
104
+ P->depth = 1;
105
+ store32( &P->leaf_length, 0 );
106
+ store32( &P->node_offset, 0 );
107
+ store32( &P->xof_length, 0 );
108
+ P->node_depth = 0;
109
+ P->inner_length = 0;
110
+ memset( P->reserved, 0, sizeof( P->reserved ) );
111
+ memset( P->salt, 0, sizeof( P->salt ) );
112
+ memset( P->personal, 0, sizeof( P->personal ) );
113
+
114
+ return blake2b_init_param( S, P );
115
+ }
116
+
117
+ int blake2b_init_key( blake2b_state *S, size_t outlen, const void *key, size_t keylen )
118
+ {
119
+ blake2b_param P[1];
120
+
121
+ if ( ( !outlen ) || ( outlen > BLAKE2B_OUTBYTES ) ) return -1;
122
+
123
+ if ( ( !keylen ) || keylen > BLAKE2B_KEYBYTES ) return -1;
124
+
125
+ P->digest_length = (uint8_t)outlen;
126
+ P->key_length = (uint8_t)keylen;
127
+ P->fanout = 1;
128
+ P->depth = 1;
129
+ store32( &P->leaf_length, 0 );
130
+ store32( &P->node_offset, 0 );
131
+ store32( &P->xof_length, 0 );
132
+ P->node_depth = 0;
133
+ P->inner_length = 0;
134
+ memset( P->reserved, 0, sizeof( P->reserved ) );
135
+ memset( P->salt, 0, sizeof( P->salt ) );
136
+ memset( P->personal, 0, sizeof( P->personal ) );
137
+
138
+ if( blake2b_init_param( S, P ) < 0 )
139
+ return 0;
140
+
141
+ {
142
+ uint8_t block[BLAKE2B_BLOCKBYTES];
143
+ memset( block, 0, BLAKE2B_BLOCKBYTES );
144
+ memcpy( block, key, keylen );
145
+ blake2b_update( S, block, BLAKE2B_BLOCKBYTES );
146
+ secure_zero_memory( block, BLAKE2B_BLOCKBYTES ); /* Burn the key from stack */
147
+ }
148
+ return 0;
149
+ }
150
+
151
+ static void blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
152
+ {
153
+ __m128i row1l, row1h;
154
+ __m128i row2l, row2h;
155
+ __m128i row3l, row3h;
156
+ __m128i row4l, row4h;
157
+ __m128i b0, b1;
158
+ __m128i t0, t1;
159
+ #if defined(HAVE_SSSE3) && !defined(HAVE_XOP)
160
+ const __m128i r16 = _mm_setr_epi8( 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9 );
161
+ const __m128i r24 = _mm_setr_epi8( 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10 );
162
+ #endif
163
+ #if defined(HAVE_SSE41)
164
+ const __m128i m0 = LOADU( block + 00 );
165
+ const __m128i m1 = LOADU( block + 16 );
166
+ const __m128i m2 = LOADU( block + 32 );
167
+ const __m128i m3 = LOADU( block + 48 );
168
+ const __m128i m4 = LOADU( block + 64 );
169
+ const __m128i m5 = LOADU( block + 80 );
170
+ const __m128i m6 = LOADU( block + 96 );
171
+ const __m128i m7 = LOADU( block + 112 );
172
+ #else
173
+ const uint64_t m0 = load64(block + 0 * sizeof(uint64_t));
174
+ const uint64_t m1 = load64(block + 1 * sizeof(uint64_t));
175
+ const uint64_t m2 = load64(block + 2 * sizeof(uint64_t));
176
+ const uint64_t m3 = load64(block + 3 * sizeof(uint64_t));
177
+ const uint64_t m4 = load64(block + 4 * sizeof(uint64_t));
178
+ const uint64_t m5 = load64(block + 5 * sizeof(uint64_t));
179
+ const uint64_t m6 = load64(block + 6 * sizeof(uint64_t));
180
+ const uint64_t m7 = load64(block + 7 * sizeof(uint64_t));
181
+ const uint64_t m8 = load64(block + 8 * sizeof(uint64_t));
182
+ const uint64_t m9 = load64(block + 9 * sizeof(uint64_t));
183
+ const uint64_t m10 = load64(block + 10 * sizeof(uint64_t));
184
+ const uint64_t m11 = load64(block + 11 * sizeof(uint64_t));
185
+ const uint64_t m12 = load64(block + 12 * sizeof(uint64_t));
186
+ const uint64_t m13 = load64(block + 13 * sizeof(uint64_t));
187
+ const uint64_t m14 = load64(block + 14 * sizeof(uint64_t));
188
+ const uint64_t m15 = load64(block + 15 * sizeof(uint64_t));
189
+ #endif
190
+ row1l = LOADU( &S->h[0] );
191
+ row1h = LOADU( &S->h[2] );
192
+ row2l = LOADU( &S->h[4] );
193
+ row2h = LOADU( &S->h[6] );
194
+ row3l = LOADU( &blake2b_IV[0] );
195
+ row3h = LOADU( &blake2b_IV[2] );
196
+ row4l = _mm_xor_si128( LOADU( &blake2b_IV[4] ), LOADU( &S->t[0] ) );
197
+ row4h = _mm_xor_si128( LOADU( &blake2b_IV[6] ), LOADU( &S->f[0] ) );
198
+ ROUND( 0 );
199
+ ROUND( 1 );
200
+ ROUND( 2 );
201
+ ROUND( 3 );
202
+ ROUND( 4 );
203
+ ROUND( 5 );
204
+ ROUND( 6 );
205
+ ROUND( 7 );
206
+ ROUND( 8 );
207
+ ROUND( 9 );
208
+ ROUND( 10 );
209
+ ROUND( 11 );
210
+ row1l = _mm_xor_si128( row3l, row1l );
211
+ row1h = _mm_xor_si128( row3h, row1h );
212
+ STOREU( &S->h[0], _mm_xor_si128( LOADU( &S->h[0] ), row1l ) );
213
+ STOREU( &S->h[2], _mm_xor_si128( LOADU( &S->h[2] ), row1h ) );
214
+ row2l = _mm_xor_si128( row4l, row2l );
215
+ row2h = _mm_xor_si128( row4h, row2h );
216
+ STOREU( &S->h[4], _mm_xor_si128( LOADU( &S->h[4] ), row2l ) );
217
+ STOREU( &S->h[6], _mm_xor_si128( LOADU( &S->h[6] ), row2h ) );
218
+ }
219
+
220
+
221
+ int blake2b_update( blake2b_state *S, const void *pin, size_t inlen )
222
+ {
223
+ const unsigned char * in = (const unsigned char *)pin;
224
+ if( inlen > 0 )
225
+ {
226
+ size_t left = S->buflen;
227
+ size_t fill = BLAKE2B_BLOCKBYTES - left;
228
+ if( inlen > fill )
229
+ {
230
+ S->buflen = 0;
231
+ memcpy( S->buf + left, in, fill ); /* Fill buffer */
232
+ blake2b_increment_counter( S, BLAKE2B_BLOCKBYTES );
233
+ blake2b_compress( S, S->buf ); /* Compress */
234
+ in += fill; inlen -= fill;
235
+ while(inlen > BLAKE2B_BLOCKBYTES) {
236
+ blake2b_increment_counter(S, BLAKE2B_BLOCKBYTES);
237
+ blake2b_compress( S, in );
238
+ in += BLAKE2B_BLOCKBYTES;
239
+ inlen -= BLAKE2B_BLOCKBYTES;
240
+ }
241
+ }
242
+ memcpy( S->buf + S->buflen, in, inlen );
243
+ S->buflen += inlen;
244
+ }
245
+ return 0;
246
+ }
247
+
248
+
249
+ int blake2b_final( blake2b_state *S, void *out, size_t outlen )
250
+ {
251
+ if( out == NULL || outlen < S->outlen )
252
+ return -1;
253
+
254
+ if( blake2b_is_lastblock( S ) )
255
+ return -1;
256
+
257
+ blake2b_increment_counter( S, S->buflen );
258
+ blake2b_set_lastblock( S );
259
+ memset( S->buf + S->buflen, 0, BLAKE2B_BLOCKBYTES - S->buflen ); /* Padding */
260
+ blake2b_compress( S, S->buf );
261
+
262
+ memcpy( out, &S->h[0], S->outlen );
263
+ return 0;
264
+ }
265
+
266
+
267
+ int blake2b( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen )
268
+ {
269
+ blake2b_state S[1];
270
+
271
+ /* Verify parameters */
272
+ if ( NULL == in && inlen > 0 ) return -1;
273
+
274
+ if ( NULL == out ) return -1;
275
+
276
+ if( NULL == key && keylen > 0 ) return -1;
277
+
278
+ if( !outlen || outlen > BLAKE2B_OUTBYTES ) return -1;
279
+
280
+ if( keylen > BLAKE2B_KEYBYTES ) return -1;
281
+
282
+ if( keylen )
283
+ {
284
+ if( blake2b_init_key( S, outlen, key, keylen ) < 0 ) return -1;
285
+ }
286
+ else
287
+ {
288
+ if( blake2b_init( S, outlen ) < 0 ) return -1;
289
+ }
290
+
291
+ blake2b_update( S, ( const uint8_t * )in, inlen );
292
+ blake2b_final( S, out, outlen );
293
+ return 0;
294
+ }
295
+
296
/* Generic entry point: forwards every argument unchanged to blake2b and returns its result. */
int blake2( void *out, size_t outlen, const void *in, size_t inlen, const void *key, size_t keylen )
{
  const int status = blake2b( out, outlen, in, inlen, key, keylen );

  return status;
}
299
+
300
#if defined(SUPERCOP)
/*
  crypto_hash entry point, compiled only when SUPERCOP is defined: unkeyed
  BLAKE2b of inlen bytes at in, writing a BLAKE2B_OUTBYTES-byte digest to out.
  Returns the result of blake2b.
*/
int crypto_hash( unsigned char *out, unsigned char *in, unsigned long long inlen )
{
  const int status = blake2b( out, BLAKE2B_OUTBYTES, in, inlen, NULL, 0 );

  return status;
}
#endif
306
+
307
#if defined(BLAKE2B_SELFTEST)
#include <string.h>
#include "blake2-kat.h"
/*
  Self-test, compiled only when BLAKE2B_SELFTEST is defined.

  Hashes every prefix of a counting-byte buffer under a counting-byte key and
  compares against blake2b_keyed_kat, first through the one-shot API and then
  through the streaming API with every chunk size below one block.
  Prints "ok" and returns 0 on success; prints "error" and returns -1 on the
  first mismatch or API failure.
*/
int main( void )
{
  uint8_t key[BLAKE2B_KEYBYTES];
  uint8_t buf[BLAKE2_KAT_LENGTH];
  size_t len;
  size_t chunk;

  for( len = 0; len < BLAKE2B_KEYBYTES; ++len )
  {
    key[len] = ( uint8_t )len;
  }

  for( len = 0; len < BLAKE2_KAT_LENGTH; ++len )
  {
    buf[len] = ( uint8_t )len;
  }

  /* Test simple API */
  for( len = 0; len < BLAKE2_KAT_LENGTH; ++len )
  {
    uint8_t digest[BLAKE2B_OUTBYTES];

    blake2b( digest, BLAKE2B_OUTBYTES, buf, len, key, BLAKE2B_KEYBYTES );

    if( memcmp( digest, blake2b_keyed_kat[len], BLAKE2B_OUTBYTES ) != 0 )
    {
      goto fail;
    }
  }

  /* Test streaming API */
  for( chunk = 1; chunk < BLAKE2B_BLOCKBYTES; ++chunk )
  {
    for( len = 0; len < BLAKE2_KAT_LENGTH; ++len )
    {
      uint8_t digest[BLAKE2B_OUTBYTES];
      blake2b_state S;
      const uint8_t *cursor = buf;
      size_t remaining = len;

      if( blake2b_init_key( &S, BLAKE2B_OUTBYTES, key, BLAKE2B_KEYBYTES ) < 0 )
      {
        goto fail;
      }

      /* Feed the message in chunk-sized pieces, then the remainder. */
      for( ; remaining >= chunk; remaining -= chunk, cursor += chunk )
      {
        if( blake2b_update( &S, cursor, chunk ) < 0 )
        {
          goto fail;
        }
      }

      if( blake2b_update( &S, cursor, remaining ) < 0 )
      {
        goto fail;
      }

      if( blake2b_final( &S, digest, BLAKE2B_OUTBYTES ) < 0 )
      {
        goto fail;
      }

      if( memcmp( digest, blake2b_keyed_kat[len], BLAKE2B_OUTBYTES ) != 0 )
      {
        goto fail;
      }
    }
  }

  puts( "ok" );
  return 0;
fail:
  puts("error");
  return -1;
}
#endif
@@ -0,0 +1,157 @@
1
+ /*
2
+ BLAKE2 reference source code package - optimized C implementations
3
+
4
+ Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
5
+ terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
6
+ your option. The terms of these licenses can be found at:
7
+
8
+ - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
9
+ - OpenSSL license : https://www.openssl.org/source/license.html
10
+ - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0
11
+
12
+ More information about the BLAKE2 hash function can be found at
13
+ https://blake2.net.
14
+ */
15
+ #ifndef BLAKE2B_ROUND_H
16
+ #define BLAKE2B_ROUND_H
17
+
18
/* Unaligned 128-bit load from / store to an arbitrary pointer. */
#define LOADU(p)  _mm_loadu_si128( (const __m128i *)(p) )
#define STOREU(p,r) _mm_storeu_si128( (__m128i *)(p), (r) )

/* Bit-preserving casts between the integer and single-precision vector types. */
#define TOF(reg) _mm_castsi128_ps((reg))
#define TOI(reg) _mm_castps_si128((reg))

/* Branch-prediction hint; plain pass-through where __builtin_expect is unavailable. */
#if defined(__GNUC__) || defined(__clang__)
#define LIKELY(x) __builtin_expect((x),1)
#else
#define LIKELY(x) (x)
#endif
25
+
26
+
27
/* Microarchitecture-specific macros */
/*
  _mm_roti_epi64(x, c): rotate each 64-bit lane of x right by -(c) bits
  (callers pass the amount negated, e.g. -32).

  With SSSE3 the amounts 32, 24 and 16 are done with shuffles (r24 and r16 are
  byte-shuffle masks that must be in scope at the point of use) and 63 uses
  x + x as the one-bit left shift; anything else falls back to shift + xor.
  The expansion is parenthesized as a whole so it stays a single operand
  wherever it is used.
*/
#ifndef HAVE_XOP
#ifdef HAVE_SSSE3
#define _mm_roti_epi64(x, c) \
    ( (-(c) == 32) ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2,3,0,1))  \
    : (-(c) == 24) ? _mm_shuffle_epi8((x), r24) \
    : (-(c) == 16) ? _mm_shuffle_epi8((x), r16) \
    : (-(c) == 63) ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_add_epi64((x), (x)))  \
    : _mm_xor_si128(_mm_srli_epi64((x), -(c)), _mm_slli_epi64((x), 64-(-(c)))) )
#else
#define _mm_roti_epi64(r, c) \
    ( _mm_xor_si128(_mm_srli_epi64( (r), -(c) ), _mm_slli_epi64( (r), 64-(-(c)) )) )
#endif
#else
/* HAVE_XOP: no definition here; _mm_roti_epi64 is presumably supplied by the XOP intrinsics header. */
#endif
42
+
43
+
44
+
45
/*
  First half of the mixing step, applied to the low and the high register
  halves side by side:
    a += m + b;  d = rotr64(d ^ a, 32);  c += d;  b = rotr64(b ^ c, 24)
  The trailing continuation line is kept so the expansion is unchanged.
*/
#define G1(a_lo,b_lo,c_lo,d_lo,a_hi,b_hi,c_hi,d_hi,m_lo,m_hi) \
  a_lo = _mm_add_epi64(_mm_add_epi64(a_lo, m_lo), b_lo); \
  a_hi = _mm_add_epi64(_mm_add_epi64(a_hi, m_hi), b_hi); \
  \
  d_lo = _mm_xor_si128(d_lo, a_lo); \
  d_hi = _mm_xor_si128(d_hi, a_hi); \
  \
  d_lo = _mm_roti_epi64(d_lo, -32); \
  d_hi = _mm_roti_epi64(d_hi, -32); \
  \
  c_lo = _mm_add_epi64(c_lo, d_lo); \
  c_hi = _mm_add_epi64(c_hi, d_hi); \
  \
  b_lo = _mm_xor_si128(b_lo, c_lo); \
  b_hi = _mm_xor_si128(b_hi, c_hi); \
  \
  b_lo = _mm_roti_epi64(b_lo, -24); \
  b_hi = _mm_roti_epi64(b_hi, -24); \

64
/*
  Second half of the mixing step, applied to the low and the high register
  halves side by side:
    a += m + b;  d = rotr64(d ^ a, 16);  c += d;  b = rotr64(b ^ c, 63)
  The trailing continuation line is kept so the expansion is unchanged.
*/
#define G2(a_lo,b_lo,c_lo,d_lo,a_hi,b_hi,c_hi,d_hi,m_lo,m_hi) \
  a_lo = _mm_add_epi64(_mm_add_epi64(a_lo, m_lo), b_lo); \
  a_hi = _mm_add_epi64(_mm_add_epi64(a_hi, m_hi), b_hi); \
  \
  d_lo = _mm_xor_si128(d_lo, a_lo); \
  d_hi = _mm_xor_si128(d_hi, a_hi); \
  \
  d_lo = _mm_roti_epi64(d_lo, -16); \
  d_hi = _mm_roti_epi64(d_hi, -16); \
  \
  c_lo = _mm_add_epi64(c_lo, d_lo); \
  c_hi = _mm_add_epi64(c_hi, d_hi); \
  \
  b_lo = _mm_xor_si128(b_lo, c_lo); \
  b_hi = _mm_xor_si128(b_hi, c_hi); \
  \
  b_lo = _mm_roti_epi64(b_lo, -63); \
  b_hi = _mm_roti_epi64(b_hi, -63); \

83
/*
  DIAGONALIZE / UNDIAGONALIZE permute the 64-bit lanes of rows 2, 3 and 4 (row 1
  is accepted but left untouched). Both use the scratch registers t0 and t1,
  which must be in scope at the point of use.

  SSSE3 variant: row 2 is rotated by one lane via _mm_alignr_epi8, the halves of
  row 3 are swapped, and row 4 is rotated by one lane in the opposite direction;
  UNDIAGONALIZE applies the reverse rotations. The SSE2 variant is built from
  unpack intrinsics and is presumably meant to yield the same permutation; its
  statement order matters because row4l doubles as a temporary.
*/
#if defined(HAVE_SSSE3)
#define DIAGONALIZE(a_lo,b_lo,c_lo,d_lo,a_hi,b_hi,c_hi,d_hi) \
  t0 = _mm_alignr_epi8(b_hi, b_lo, 8); \
  t1 = _mm_alignr_epi8(b_lo, b_hi, 8); \
  b_lo = t0; \
  b_hi = t1; \
  \
  t0 = c_lo; \
  c_lo = c_hi; \
  c_hi = t0;    \
  \
  t0 = _mm_alignr_epi8(d_hi, d_lo, 8); \
  t1 = _mm_alignr_epi8(d_lo, d_hi, 8); \
  d_lo = t1; \
  d_hi = t0;

#define UNDIAGONALIZE(a_lo,b_lo,c_lo,d_lo,a_hi,b_hi,c_hi,d_hi) \
  t0 = _mm_alignr_epi8(b_lo, b_hi, 8); \
  t1 = _mm_alignr_epi8(b_hi, b_lo, 8); \
  b_lo = t0; \
  b_hi = t1; \
  \
  t0 = c_lo; \
  c_lo = c_hi; \
  c_hi = t0; \
  \
  t0 = _mm_alignr_epi8(d_lo, d_hi, 8); \
  t1 = _mm_alignr_epi8(d_hi, d_lo, 8); \
  d_lo = t1; \
  d_hi = t0;
#else

#define DIAGONALIZE(a_lo,b_lo,c_lo,d_lo,a_hi,b_hi,c_hi,d_hi) \
  t0 = d_lo;\
  t1 = b_lo;\
  d_lo = c_lo;\
  c_lo = c_hi;\
  c_hi = d_lo;\
  d_lo = _mm_unpackhi_epi64(d_hi, _mm_unpacklo_epi64(t0, t0)); \
  d_hi = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(d_hi, d_hi)); \
  b_lo = _mm_unpackhi_epi64(b_lo, _mm_unpacklo_epi64(b_hi, b_hi)); \
  b_hi = _mm_unpackhi_epi64(b_hi, _mm_unpacklo_epi64(t1, t1))

#define UNDIAGONALIZE(a_lo,b_lo,c_lo,d_lo,a_hi,b_hi,c_hi,d_hi) \
  t0 = c_lo;\
  c_lo = c_hi;\
  c_hi = t0;\
  t0 = b_lo;\
  t1 = d_lo;\
  b_lo = _mm_unpackhi_epi64(b_hi, _mm_unpacklo_epi64(b_lo, b_lo)); \
  b_hi = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(b_hi, b_hi)); \
  d_lo = _mm_unpackhi_epi64(d_lo, _mm_unpacklo_epi64(d_hi, d_hi)); \
  d_hi = _mm_unpackhi_epi64(d_hi, _mm_unpacklo_epi64(t1, t1))

#endif
138
+
139
+ #if defined(HAVE_SSE41)
140
+ #include "blake2b-load-sse41.h"
141
+ #else
142
+ #include "blake2b-load-sse2.h"
143
+ #endif
144
+
145
/*
  One full round, number r: G1 and G2 on the rows as they stand, then the same
  pair again between DIAGONALIZE and UNDIAGONALIZE. Each G step is preceded by
  a LOAD_MSG_<r>_<k> macro (k = 1..4) that fills b0 and b1.

  Expands in the caller's scope and expects row1l..row4h, b0 and b1 (plus
  whatever the LOAD_MSG_* and DIAGONALIZE macros reference) to be declared there.
*/
#define ROUND(r) \
  LOAD_MSG_ ## r ## _1(b0, b1); \
  G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
  LOAD_MSG_ ## r ## _2(b0, b1); \
  G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
  DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \
  LOAD_MSG_ ## r ## _3(b0, b1); \
  G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
  LOAD_MSG_ ## r ## _4(b0, b1); \
  G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \
  UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);
156
+
157
+ #endif