sorcery-argon2 1.0.0 → 1.2.0
- checksums.yaml +4 -4
- data/.github/workflows/codeql.yml +74 -0
- data/.github/workflows/ruby.yml +13 -3
- data/.rubocop.yml +112 -2
- data/CHANGELOG.md +8 -0
- data/MAINTAINING.md +8 -3
- data/README.md +155 -14
- data/bin/setup +4 -0
- data/ext/argon2_wrap/{Makefile → Makefile.real} +1 -0
- data/ext/argon2_wrap/extconf.rb +4 -1
- data/ext/argon2_wrap/libargon2_wrap.so +0 -0
- data/ext/argon2_wrap/tests +0 -0
- data/ext/phc-winner-argon2/.git +1 -0
- data/ext/phc-winner-argon2/.gitattributes +10 -0
- data/ext/phc-winner-argon2/.gitignore +22 -0
- data/ext/phc-winner-argon2/.travis.yml +25 -0
- data/ext/phc-winner-argon2/Argon2.sln +158 -0
- data/ext/phc-winner-argon2/CHANGELOG.md +32 -0
- data/ext/phc-winner-argon2/LICENSE +314 -0
- data/ext/phc-winner-argon2/Makefile +255 -0
- data/ext/phc-winner-argon2/Package.swift +46 -0
- data/ext/phc-winner-argon2/README.md +303 -0
- data/ext/phc-winner-argon2/appveyor.yml +25 -0
- data/ext/phc-winner-argon2/argon2-specs.pdf +0 -0
- data/ext/phc-winner-argon2/export.sh +7 -0
- data/ext/phc-winner-argon2/include/argon2.h +437 -0
- data/ext/phc-winner-argon2/kats/argon2d +12304 -0
- data/ext/phc-winner-argon2/kats/argon2d.shasum +1 -0
- data/ext/phc-winner-argon2/kats/argon2d_v16 +12304 -0
- data/ext/phc-winner-argon2/kats/argon2d_v16.shasum +1 -0
- data/ext/phc-winner-argon2/kats/argon2i +12304 -0
- data/ext/phc-winner-argon2/kats/argon2i.shasum +1 -0
- data/ext/phc-winner-argon2/kats/argon2i_v16 +12304 -0
- data/ext/phc-winner-argon2/kats/argon2i_v16.shasum +1 -0
- data/ext/phc-winner-argon2/kats/argon2id +12304 -0
- data/ext/phc-winner-argon2/kats/argon2id.shasum +1 -0
- data/ext/phc-winner-argon2/kats/argon2id_v16 +12304 -0
- data/ext/phc-winner-argon2/kats/argon2id_v16.shasum +1 -0
- data/ext/phc-winner-argon2/kats/check-sums.ps1 +42 -0
- data/ext/phc-winner-argon2/kats/check-sums.sh +13 -0
- data/ext/phc-winner-argon2/kats/test.ps1 +50 -0
- data/ext/phc-winner-argon2/kats/test.sh +49 -0
- data/ext/phc-winner-argon2/latex/IEEEtran.cls +6347 -0
- data/ext/phc-winner-argon2/latex/Makefile +18 -0
- data/ext/phc-winner-argon2/latex/argon2-specs.tex +920 -0
- data/ext/phc-winner-argon2/latex/pics/argon2-par.pdf +0 -0
- data/ext/phc-winner-argon2/latex/pics/compression.pdf +0 -0
- data/ext/phc-winner-argon2/latex/pics/generic.pdf +0 -0
- data/ext/phc-winner-argon2/latex/pics/power-distribution.jpg +0 -0
- data/ext/phc-winner-argon2/latex/tradeoff.bib +822 -0
- data/ext/phc-winner-argon2/libargon2.pc.in +18 -0
- data/ext/phc-winner-argon2/man/argon2.1 +57 -0
- data/ext/phc-winner-argon2/src/argon2.c +452 -0
- data/ext/phc-winner-argon2/src/bench.c +111 -0
- data/ext/phc-winner-argon2/src/blake2/blake2-impl.h +156 -0
- data/ext/phc-winner-argon2/src/blake2/blake2.h +89 -0
- data/ext/phc-winner-argon2/src/blake2/blake2b.c +390 -0
- data/ext/phc-winner-argon2/src/blake2/blamka-round-opt.h +471 -0
- data/ext/phc-winner-argon2/src/blake2/blamka-round-ref.h +56 -0
- data/ext/phc-winner-argon2/src/core.c +648 -0
- data/ext/phc-winner-argon2/src/core.h +228 -0
- data/ext/phc-winner-argon2/src/encoding.c +463 -0
- data/ext/phc-winner-argon2/src/encoding.h +57 -0
- data/ext/phc-winner-argon2/src/genkat.c +213 -0
- data/ext/phc-winner-argon2/src/genkat.h +51 -0
- data/ext/phc-winner-argon2/src/opt.c +283 -0
- data/ext/phc-winner-argon2/src/ref.c +194 -0
- data/ext/phc-winner-argon2/src/run.c +337 -0
- data/ext/phc-winner-argon2/src/test.c +289 -0
- data/ext/phc-winner-argon2/src/thread.c +57 -0
- data/ext/phc-winner-argon2/src/thread.h +67 -0
- data/ext/phc-winner-argon2/vs2015/Argon2Opt/Argon2Opt.vcxproj +231 -0
- data/ext/phc-winner-argon2/vs2015/Argon2Opt/Argon2Opt.vcxproj.filters +69 -0
- data/ext/phc-winner-argon2/vs2015/Argon2OptBench/Argon2OptBench.vcxproj +231 -0
- data/ext/phc-winner-argon2/vs2015/Argon2OptBench/Argon2OptBench.vcxproj.filters +69 -0
- data/ext/phc-winner-argon2/vs2015/Argon2OptDll/Argon2OptDll.vcxproj +230 -0
- data/ext/phc-winner-argon2/vs2015/Argon2OptDll/Argon2OptDll.vcxproj.filters +66 -0
- data/ext/phc-winner-argon2/vs2015/Argon2OptGenKAT/Argon2OptGenKAT.vcxproj +244 -0
- data/ext/phc-winner-argon2/vs2015/Argon2OptGenKAT/Argon2OptGenKAT.vcxproj.filters +72 -0
- data/ext/phc-winner-argon2/vs2015/Argon2OptTestCI/Argon2OptTestCI.vcxproj +235 -0
- data/ext/phc-winner-argon2/vs2015/Argon2OptTestCI/Argon2OptTestCI.vcxproj.filters +69 -0
- data/ext/phc-winner-argon2/vs2015/Argon2Ref/Argon2Ref.vcxproj +243 -0
- data/ext/phc-winner-argon2/vs2015/Argon2Ref/Argon2Ref.vcxproj.filters +69 -0
- data/ext/phc-winner-argon2/vs2015/Argon2RefBench/Argon2RefBench.vcxproj +231 -0
- data/ext/phc-winner-argon2/vs2015/Argon2RefBench/Argon2RefBench.vcxproj.filters +69 -0
- data/ext/phc-winner-argon2/vs2015/Argon2RefDll/Argon2RefDll.vcxproj +230 -0
- data/ext/phc-winner-argon2/vs2015/Argon2RefDll/Argon2RefDll.vcxproj.filters +66 -0
- data/ext/phc-winner-argon2/vs2015/Argon2RefGenKAT/Argon2RefGenKAT.vcxproj +232 -0
- data/ext/phc-winner-argon2/vs2015/Argon2RefGenKAT/Argon2RefGenKAT.vcxproj.filters +72 -0
- data/ext/phc-winner-argon2/vs2015/Argon2RefTestCI/Argon2RefTestCI.vcxproj +231 -0
- data/ext/phc-winner-argon2/vs2015/Argon2RefTestCI/Argon2RefTestCI.vcxproj.filters +69 -0
- data/lib/argon2/ffi_engine.rb +4 -4
- data/lib/argon2/password.rb +28 -5
- data/lib/argon2/version.rb +1 -1
- data/sorcery-argon2.gemspec +3 -2
- metadata +91 -8
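
The listing above vendors the upstream phc-winner-argon2 C sources that back the gem's FFI layer. As a rough orientation, the sketch below calls the public API declared in the vendored include/argon2.h (argon2id_hash_encoded / argon2id_verify); the cost parameters, fixed salt, buffer sizes, and build command are illustrative assumptions for the example, not values taken from this release.

/* demo.c -- hedged sketch against the vendored include/argon2.h.
 * Build against the vendored library, e.g.: cc demo.c libargon2.a -o demo
 * (exact build wiring is an assumption; the gem normally drives this
 * through ext/argon2_wrap instead).
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include "argon2.h"

int main(void) {
    const char *pwd = "opensesame";        /* demo password            */
    uint8_t salt[16];
    memset(salt, 0x01, sizeof salt);       /* fixed salt: demo only    */

    char encoded[128];                     /* receives "$argon2id$..." */
    int rc = argon2id_hash_encoded(2,        /* t_cost: iterations     */
                                   1 << 16,  /* m_cost: KiB of memory  */
                                   1,        /* parallelism: lanes     */
                                   pwd, strlen(pwd),
                                   salt, sizeof salt,
                                   32,       /* raw hash length        */
                                   encoded, sizeof encoded);
    if (rc != ARGON2_OK) {
        fprintf(stderr, "hash failed: %s\n", argon2_error_message(rc));
        return 1;
    }
    printf("%s\n", encoded);

    rc = argon2id_verify(encoded, pwd, strlen(pwd));
    printf("verify: %s\n", rc == ARGON2_OK ? "ok" : argon2_error_message(rc));
    return rc == ARGON2_OK ? 0 : 1;
}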
data/ext/phc-winner-argon2/src/blake2/blamka-round-opt.h (new file)
@@ -0,0 +1,471 @@
/*
 * Argon2 reference source code package - reference C implementations
 *
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
 * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
 * - CC0 1.0 Universal : https://creativecommons.org/publicdomain/zero/1.0
 * - Apache 2.0        : https://www.apache.org/licenses/LICENSE-2.0
 *
 * You should have received a copy of both of these licenses along with this
 * software. If not, they may be obtained at the above URLs.
 */

#ifndef BLAKE_ROUND_MKA_OPT_H
#define BLAKE_ROUND_MKA_OPT_H

#include "blake2-impl.h"

#include <emmintrin.h>
#if defined(__SSSE3__)
#include <tmmintrin.h> /* for _mm_shuffle_epi8 and _mm_alignr_epi8 */
#endif

#if defined(__XOP__) && (defined(__GNUC__) || defined(__clang__))
#include <x86intrin.h>
#endif

#if !defined(__AVX512F__)
#if !defined(__AVX2__)
#if !defined(__XOP__)
#if defined(__SSSE3__)
#define r16 \
    (_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
#define r24 \
    (_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
#define _mm_roti_epi64(x, c) \
    (-(c) == 32) \
        ? _mm_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1)) \
        : (-(c) == 24) \
              ? _mm_shuffle_epi8((x), r24) \
              : (-(c) == 16) \
                    ? _mm_shuffle_epi8((x), r16) \
                    : (-(c) == 63) \
                          ? _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
                                          _mm_add_epi64((x), (x))) \
                          : _mm_xor_si128(_mm_srli_epi64((x), -(c)), \
                                          _mm_slli_epi64((x), 64 - (-(c))))
#else /* defined(__SSE2__) */
#define _mm_roti_epi64(r, c) \
    _mm_xor_si128(_mm_srli_epi64((r), -(c)), _mm_slli_epi64((r), 64 - (-(c))))
#endif
#else
#endif

static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
    const __m128i z = _mm_mul_epu32(x, y);
    return _mm_add_epi64(_mm_add_epi64(x, y), _mm_add_epi64(z, z));
}

#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        A0 = fBlaMka(A0, B0); \
        A1 = fBlaMka(A1, B1); \
        \
        D0 = _mm_xor_si128(D0, A0); \
        D1 = _mm_xor_si128(D1, A1); \
        \
        D0 = _mm_roti_epi64(D0, -32); \
        D1 = _mm_roti_epi64(D1, -32); \
        \
        C0 = fBlaMka(C0, D0); \
        C1 = fBlaMka(C1, D1); \
        \
        B0 = _mm_xor_si128(B0, C0); \
        B1 = _mm_xor_si128(B1, C1); \
        \
        B0 = _mm_roti_epi64(B0, -24); \
        B1 = _mm_roti_epi64(B1, -24); \
    } while ((void)0, 0)

#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        A0 = fBlaMka(A0, B0); \
        A1 = fBlaMka(A1, B1); \
        \
        D0 = _mm_xor_si128(D0, A0); \
        D1 = _mm_xor_si128(D1, A1); \
        \
        D0 = _mm_roti_epi64(D0, -16); \
        D1 = _mm_roti_epi64(D1, -16); \
        \
        C0 = fBlaMka(C0, D0); \
        C1 = fBlaMka(C1, D1); \
        \
        B0 = _mm_xor_si128(B0, C0); \
        B1 = _mm_xor_si128(B1, C1); \
        \
        B0 = _mm_roti_epi64(B0, -63); \
        B1 = _mm_roti_epi64(B1, -63); \
    } while ((void)0, 0)

#if defined(__SSSE3__)
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        __m128i t0 = _mm_alignr_epi8(B1, B0, 8); \
        __m128i t1 = _mm_alignr_epi8(B0, B1, 8); \
        B0 = t0; \
        B1 = t1; \
        \
        t0 = C0; \
        C0 = C1; \
        C1 = t0; \
        \
        t0 = _mm_alignr_epi8(D1, D0, 8); \
        t1 = _mm_alignr_epi8(D0, D1, 8); \
        D0 = t1; \
        D1 = t0; \
    } while ((void)0, 0)

#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        __m128i t0 = _mm_alignr_epi8(B0, B1, 8); \
        __m128i t1 = _mm_alignr_epi8(B1, B0, 8); \
        B0 = t0; \
        B1 = t1; \
        \
        t0 = C0; \
        C0 = C1; \
        C1 = t0; \
        \
        t0 = _mm_alignr_epi8(D0, D1, 8); \
        t1 = _mm_alignr_epi8(D1, D0, 8); \
        D0 = t1; \
        D1 = t0; \
    } while ((void)0, 0)
#else /* SSE2 */
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        __m128i t0 = D0; \
        __m128i t1 = B0; \
        D0 = C0; \
        C0 = C1; \
        C1 = D0; \
        D0 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t0, t0)); \
        D1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(D1, D1)); \
        B0 = _mm_unpackhi_epi64(B0, _mm_unpacklo_epi64(B1, B1)); \
        B1 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(t1, t1)); \
    } while ((void)0, 0)

#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        __m128i t0, t1; \
        t0 = C0; \
        C0 = C1; \
        C1 = t0; \
        t0 = B0; \
        t1 = D0; \
        B0 = _mm_unpackhi_epi64(B1, _mm_unpacklo_epi64(B0, B0)); \
        B1 = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(B1, B1)); \
        D0 = _mm_unpackhi_epi64(D0, _mm_unpacklo_epi64(D1, D1)); \
        D1 = _mm_unpackhi_epi64(D1, _mm_unpacklo_epi64(t1, t1)); \
    } while ((void)0, 0)
#endif

#define BLAKE2_ROUND(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
        \
        DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
        \
        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
        \
        UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
    } while ((void)0, 0)
#else /* __AVX2__ */

#include <immintrin.h>

#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
#define rotr16(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
#define rotr63(x) _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))

#define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i ml = _mm256_mul_epu32(A0, B0); \
        ml = _mm256_add_epi64(ml, ml); \
        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
        D0 = _mm256_xor_si256(D0, A0); \
        D0 = rotr32(D0); \
        \
        ml = _mm256_mul_epu32(C0, D0); \
        ml = _mm256_add_epi64(ml, ml); \
        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
        \
        B0 = _mm256_xor_si256(B0, C0); \
        B0 = rotr24(B0); \
        \
        ml = _mm256_mul_epu32(A1, B1); \
        ml = _mm256_add_epi64(ml, ml); \
        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
        D1 = _mm256_xor_si256(D1, A1); \
        D1 = rotr32(D1); \
        \
        ml = _mm256_mul_epu32(C1, D1); \
        ml = _mm256_add_epi64(ml, ml); \
        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
        \
        B1 = _mm256_xor_si256(B1, C1); \
        B1 = rotr24(B1); \
    } while((void)0, 0);

#define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i ml = _mm256_mul_epu32(A0, B0); \
        ml = _mm256_add_epi64(ml, ml); \
        A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
        D0 = _mm256_xor_si256(D0, A0); \
        D0 = rotr16(D0); \
        \
        ml = _mm256_mul_epu32(C0, D0); \
        ml = _mm256_add_epi64(ml, ml); \
        C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
        B0 = _mm256_xor_si256(B0, C0); \
        B0 = rotr63(B0); \
        \
        ml = _mm256_mul_epu32(A1, B1); \
        ml = _mm256_add_epi64(ml, ml); \
        A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
        D1 = _mm256_xor_si256(D1, A1); \
        D1 = rotr16(D1); \
        \
        ml = _mm256_mul_epu32(C1, D1); \
        ml = _mm256_add_epi64(ml, ml); \
        C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
        B1 = _mm256_xor_si256(B1, C1); \
        B1 = rotr63(B1); \
    } while((void)0, 0);

#define DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
        \
        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
    } while((void)0, 0);

#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
        B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
        B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
        \
        tmp1 = C0; \
        C0 = C1; \
        C1 = tmp1; \
        \
        tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
        tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
    } while(0);

#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
        C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
        D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
        \
        B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
        C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
        D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
    } while((void)0, 0);

#define UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        __m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
        __m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
        B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
        B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
        \
        tmp1 = C0; \
        C0 = C1; \
        C1 = tmp1; \
        \
        tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
        tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
        D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
        D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
    } while((void)0, 0);

#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
    do{ \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
        \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
    } while((void)0, 0);

#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do{ \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
        \
        UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    } while((void)0, 0);

#endif /* __AVX2__ */

#else /* __AVX512F__ */

#include <immintrin.h>

#define ror64(x, n) _mm512_ror_epi64((x), (n))

static __m512i muladd(__m512i x, __m512i y)
{
    __m512i z = _mm512_mul_epu32(x, y);
    return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
}

#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        A0 = muladd(A0, B0); \
        A1 = muladd(A1, B1); \
        \
        D0 = _mm512_xor_si512(D0, A0); \
        D1 = _mm512_xor_si512(D1, A1); \
        \
        D0 = ror64(D0, 32); \
        D1 = ror64(D1, 32); \
        \
        C0 = muladd(C0, D0); \
        C1 = muladd(C1, D1); \
        \
        B0 = _mm512_xor_si512(B0, C0); \
        B1 = _mm512_xor_si512(B1, C1); \
        \
        B0 = ror64(B0, 24); \
        B1 = ror64(B1, 24); \
    } while ((void)0, 0)

#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        A0 = muladd(A0, B0); \
        A1 = muladd(A1, B1); \
        \
        D0 = _mm512_xor_si512(D0, A0); \
        D1 = _mm512_xor_si512(D1, A1); \
        \
        D0 = ror64(D0, 16); \
        D1 = ror64(D1, 16); \
        \
        C0 = muladd(C0, D0); \
        C1 = muladd(C1, D1); \
        \
        B0 = _mm512_xor_si512(B0, C0); \
        B1 = _mm512_xor_si512(B1, C1); \
        \
        B0 = ror64(B0, 63); \
        B1 = ror64(B1, 63); \
    } while ((void)0, 0)

#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
        B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
        \
        C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
        C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
        \
        D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
        D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
    } while ((void)0, 0)

#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
        B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
        \
        C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
        C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
        \
        D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
        D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
    } while ((void)0, 0)

#define BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1) \
    do { \
        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
        \
        DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
        \
        G1(A0, B0, C0, D0, A1, B1, C1, D1); \
        G2(A0, B0, C0, D0, A1, B1, C1, D1); \
        \
        UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
    } while ((void)0, 0)

#define SWAP_HALVES(A0, A1) \
    do { \
        __m512i t0, t1; \
        t0 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \
        t1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \
        A0 = t0; \
        A1 = t1; \
    } while((void)0, 0)

#define SWAP_QUARTERS(A0, A1) \
    do { \
        SWAP_HALVES(A0, A1); \
        A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
        A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
    } while((void)0, 0)

#define UNSWAP_QUARTERS(A0, A1) \
    do { \
        A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
        A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
        SWAP_HALVES(A0, A1); \
    } while((void)0, 0)

#define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \
    do { \
        SWAP_HALVES(A0, B0); \
        SWAP_HALVES(C0, D0); \
        SWAP_HALVES(A1, B1); \
        SWAP_HALVES(C1, D1); \
        BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \
        SWAP_HALVES(A0, B0); \
        SWAP_HALVES(C0, D0); \
        SWAP_HALVES(A1, B1); \
        SWAP_HALVES(C1, D1); \
    } while ((void)0, 0)

#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
    do { \
        SWAP_QUARTERS(A0, A1); \
        SWAP_QUARTERS(B0, B1); \
        SWAP_QUARTERS(C0, C1); \
        SWAP_QUARTERS(D0, D1); \
        BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \
        UNSWAP_QUARTERS(A0, A1); \
        UNSWAP_QUARTERS(B0, B1); \
        UNSWAP_QUARTERS(C0, C1); \
        UNSWAP_QUARTERS(D0, D1); \
    } while ((void)0, 0)

#endif /* __AVX512F__ */
#endif /* BLAKE_ROUND_MKA_OPT_H */
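
Before the second header, a small scalar aside may help readers checking the SIMD code above: the SSE2 fallback expresses a 64-bit right rotation as a shift-right/shift-left XOR pair, and the -63 special case substitutes x + x for x << 1. The sketch below verifies both identities against a plain rotr64; the local rotr64 and the sample values are assumptions for the demo (upstream takes rotr64 from blake2-impl.h).

/* rot_check.c -- hedged sketch of the rotation identities used above. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t rotr64(uint64_t w, unsigned c) {
    return (w >> c) | (w << (64 - c));
}

int main(void) {
    const uint64_t samples[] = { 0x0123456789abcdefULL, 0xffffffff00000000ULL, 1ULL };
    for (unsigned i = 0; i < sizeof samples / sizeof samples[0]; i++) {
        uint64_t x = samples[i];
        /* shift/xor form mirrored by _mm_srli_epi64 / _mm_slli_epi64 */
        assert(rotr64(x, 16) == ((x >> 16) ^ (x << 48)));
        /* rotate by 63: "x << 1" written as x + x, as in the -63 branch */
        assert(rotr64(x, 63) == ((x >> 63) ^ (x + x)));
    }
    puts("rotation identities hold");
    return 0;
}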
data/ext/phc-winner-argon2/src/blake2/blamka-round-ref.h (new file)
@@ -0,0 +1,56 @@
/*
 * Argon2 reference source code package - reference C implementations
 *
 * Copyright 2015
 * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
 *
 * You may use this work under the terms of a Creative Commons CC0 1.0
 * License/Waiver or the Apache Public License 2.0, at your option. The terms of
 * these licenses can be found at:
 *
 * - CC0 1.0 Universal : https://creativecommons.org/publicdomain/zero/1.0
 * - Apache 2.0        : https://www.apache.org/licenses/LICENSE-2.0
 *
 * You should have received a copy of both of these licenses along with this
 * software. If not, they may be obtained at the above URLs.
 */

#ifndef BLAKE_ROUND_MKA_H
#define BLAKE_ROUND_MKA_H

#include "blake2.h"
#include "blake2-impl.h"

/* designed by the Lyra PHC team */
static BLAKE2_INLINE uint64_t fBlaMka(uint64_t x, uint64_t y) {
    const uint64_t m = UINT64_C(0xFFFFFFFF);
    const uint64_t xy = (x & m) * (y & m);
    return x + y + 2 * xy;
}

#define G(a, b, c, d) \
    do { \
        a = fBlaMka(a, b); \
        d = rotr64(d ^ a, 32); \
        c = fBlaMka(c, d); \
        b = rotr64(b ^ c, 24); \
        a = fBlaMka(a, b); \
        d = rotr64(d ^ a, 16); \
        c = fBlaMka(c, d); \
        b = rotr64(b ^ c, 63); \
    } while ((void)0, 0)

#define BLAKE2_ROUND_NOMSG(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, \
                           v12, v13, v14, v15) \
    do { \
        G(v0, v4, v8, v12); \
        G(v1, v5, v9, v13); \
        G(v2, v6, v10, v14); \
        G(v3, v7, v11, v15); \
        G(v0, v5, v10, v15); \
        G(v1, v6, v11, v12); \
        G(v2, v7, v8, v13); \
        G(v3, v4, v9, v14); \
    } while ((void)0, 0)

#endif
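
The reference header above defines the scalar BlaMka primitive (x + y + 2 * low32(x) * low32(y)) and the quarter-round G built from it. The self-contained sketch below writes one G step out by hand on made-up input words; fBlaMka and rotr64 are copied locally so it compiles on its own (in the real tree they come from blamka-round-ref.h and blake2-impl.h), and the input values have no significance.

/* g_round.c -- hedged sketch of one scalar BlaMka quarter-round. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t rotr64(uint64_t w, unsigned c) {
    return (w >> c) | (w << (64 - c));
}

/* BlaMka multiply-add, as in blamka-round-ref.h */
static uint64_t fBlaMka(uint64_t x, uint64_t y) {
    const uint64_t m = UINT64_C(0xFFFFFFFF);
    return x + y + 2 * ((x & m) * (y & m));
}

int main(void) {
    uint64_t a = 0x0123456789abcdefULL, b = 0xfedcba9876543210ULL;
    uint64_t c = 0x0f1e2d3c4b5a6978ULL, d = 0x8796a5b4c3d2e1f0ULL;

    /* one application of G(a, b, c, d), unrolled instead of using the macro */
    a = fBlaMka(a, b); d = rotr64(d ^ a, 32);
    c = fBlaMka(c, d); b = rotr64(b ^ c, 24);
    a = fBlaMka(a, b); d = rotr64(d ^ a, 16);
    c = fBlaMka(c, d); b = rotr64(b ^ c, 63);

    printf("a=%016" PRIx64 " b=%016" PRIx64 " c=%016" PRIx64 " d=%016" PRIx64 "\n",
           a, b, c, d);
    return 0;
}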