argon2 1.1.4 → 1.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.rubocop.yml +6 -10
- data/.travis.yml +2 -1
- data/Changelog.md +4 -0
- data/README.md +2 -1
- data/argon2.gemspec +4 -4
- data/ext/phc-winner-argon2/CHANGELOG.md +7 -0
- data/ext/phc-winner-argon2/Makefile +2 -2
- data/ext/phc-winner-argon2/README.md +4 -0
- data/ext/phc-winner-argon2/include/argon2.h +5 -3
- data/ext/phc-winner-argon2/src/argon2.c +1 -1
- data/ext/phc-winner-argon2/src/bench.c +2 -2
- data/ext/phc-winner-argon2/src/blake2/blake2-impl.h +1 -1
- data/ext/phc-winner-argon2/src/blake2/blake2.h +10 -12
- data/ext/phc-winner-argon2/src/blake2/blake2b.c +1 -1
- data/ext/phc-winner-argon2/src/blake2/blamka-round-opt.h +293 -2
- data/ext/phc-winner-argon2/src/blake2/blamka-round-ref.h +2 -2
- data/ext/phc-winner-argon2/src/core.c +1 -2
- data/ext/phc-winner-argon2/src/core.h +3 -9
- data/ext/phc-winner-argon2/src/encoding.c +1 -1
- data/ext/phc-winner-argon2/src/encoding.h +1 -1
- data/ext/phc-winner-argon2/src/genkat.c +3 -4
- data/ext/phc-winner-argon2/src/genkat.h +1 -1
- data/ext/phc-winner-argon2/src/opt.c +90 -2
- data/ext/phc-winner-argon2/src/ref.c +1 -1
- data/ext/phc-winner-argon2/src/run.c +43 -23
- data/ext/phc-winner-argon2/src/test.c +5 -6
- data/ext/phc-winner-argon2/src/thread.c +1 -1
- data/ext/phc-winner-argon2/src/thread.h +2 -2
- data/lib/argon2.rb +1 -0
- data/lib/argon2/constants.rb +2 -0
- data/lib/argon2/engine.rb +1 -0
- data/lib/argon2/errors.rb +4 -2
- data/lib/argon2/version.rb +3 -1
- metadata +19 -18
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 9425c5c639ac3e940ffffe3f9baa5967976ad2caa49d0716db0de43aa6d41b66
|
4
|
+
data.tar.gz: 607eff42d6c915f2528d5e61f95d264bdace086515a250188e0f937397b684b1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 60b5ff68f0e29fa1c40dc2e9a10e21f585dacd1d95af4a219166be38e9b2734d8bcae85010245ceb5a2d8782f8f65906bce4f951159eaf5bd1e9b8bccd98cf71
|
7
|
+
data.tar.gz: e8d1074c9ad878cb2a7b7e3791b377f0fbf573ead9d9432ad178d1e4c4af4de80d337455d7fb438fd0e5ad15e2bcfa0f5091254674b7c5d45145ac8ce9e5f6e1
|
data/.rubocop.yml
CHANGED
@@ -5,20 +5,16 @@ Metrics/CyclomaticComplexity:
|
|
5
5
|
Enabled: false
|
6
6
|
Metrics/PerceivedComplexity:
|
7
7
|
Enabled: false
|
8
|
-
#Style/MutableConstant:
|
9
|
-
# Exclude:
|
10
|
-
# - 'test/key_test.rb'
|
11
|
-
|
12
8
|
Metrics/LineLength:
|
13
9
|
Max: 160
|
14
10
|
|
15
11
|
Metrics/MethodLength:
|
16
12
|
Max: 24
|
17
13
|
|
18
|
-
|
14
|
+
Layout/AlignParameters:
|
19
15
|
Enabled: false
|
20
16
|
|
21
|
-
|
17
|
+
Layout/AlignArray:
|
22
18
|
Enabled: false
|
23
19
|
|
24
20
|
# Configuration parameters: Exclude.
|
@@ -30,7 +26,7 @@ Style/Documentation:
|
|
30
26
|
# Offense count: 16
|
31
27
|
# Cop supports --auto-correct.
|
32
28
|
# Configuration parameters: EnforcedStyle, SupportedStyles.
|
33
|
-
|
29
|
+
Layout/FirstParameterIndentation:
|
34
30
|
Exclude:
|
35
31
|
- 'lib/argon2.rb'
|
36
32
|
- 'test/low_level_test.rb'
|
@@ -41,13 +37,13 @@ Style/HashSyntax:
|
|
41
37
|
|
42
38
|
# Offense count: 1
|
43
39
|
# Cop supports --auto-correct.
|
44
|
-
|
40
|
+
Layout/IndentArray:
|
45
41
|
Exclude:
|
46
42
|
- 'lib/argon2/errors.rb'
|
47
43
|
|
48
44
|
# Offense count: 44
|
49
45
|
# Cop supports --auto-correct.
|
50
|
-
|
46
|
+
Layout/LeadingCommentSpace:
|
51
47
|
Exclude:
|
52
48
|
- 'ext/argon2_wrap/extconf.rb'
|
53
49
|
|
@@ -60,7 +56,7 @@ Style/StringLiterals:
|
|
60
56
|
Style/WordArray:
|
61
57
|
MinSize: 33
|
62
58
|
|
63
|
-
|
59
|
+
Layout/MultilineMethodCallBraceLayout:
|
64
60
|
Exclude:
|
65
61
|
- 'lib/argon2.rb'
|
66
62
|
- 'test/low_level_test.rb'
|
data/.travis.yml
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
language: ruby
|
2
|
+
sudo: required
|
2
3
|
rvm:
|
4
|
+
- 2.5.0
|
3
5
|
- 2.4.0
|
4
6
|
- 2.3.3
|
5
7
|
- jruby-9000
|
@@ -10,4 +12,3 @@ install: bin/setup
|
|
10
12
|
script:
|
11
13
|
- cd ext/argon2_wrap/ && make test && cd ../..
|
12
14
|
- bundle exec rake test
|
13
|
-
- CODECLIMATE_REPO_TOKEN=2b619b81040453ecbcf1cf0869e1238c4bbaab666a42e7dd94d762c747c0f51a bundle exec codeclimate-test-reporter
|
data/Changelog.md
CHANGED
data/README.md
CHANGED
@@ -5,7 +5,7 @@ This Ruby Gem provides FFI bindings, and a simplified interface, to the Argon2 a
|
|
5
5
|
|
6
6
|
[Build Status](https://travis-ci.org/technion/ruby-argon2)
|
7
7
|
[Code Climate](https://codeclimate.com/github/technion/ruby-argon2)
|
8
|
-
[Coverage Status](https://coveralls.io/github/technion/ruby-argon2)
|
9
9
|
|
10
10
|
## Design
|
11
11
|
|
@@ -44,6 +44,7 @@ hasher = Argon2::Password.new
|
|
44
44
|
hasher.create("password")
|
45
45
|
```
|
46
46
|
|
47
|
+
If you follow this pattern, it is important to create a new `Argon2::Password` every time you generate a hash, in order to ensure a unique salt. See [issue 23](https://github.com/technion/ruby-argon2/issues/23) for more information.
|
47
48
|
Alternatively, use this shortcut:
|
48
49
|
|
49
50
|
```ruby
|
data/argon2.gemspec
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
# coding: utf-8
|
2
1
|
lib = File.expand_path('../lib', __FILE__)
|
2
|
+
|
3
3
|
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
4
|
require 'argon2/version'
|
5
5
|
|
@@ -24,9 +24,9 @@ Gem::Specification.new do |spec|
|
|
24
24
|
spec.add_dependency 'ffi-compiler', '~> 0.1'
|
25
25
|
|
26
26
|
spec.add_development_dependency "bundler", '~> 1.10', '>= 1.10.5'
|
27
|
-
spec.add_development_dependency "
|
27
|
+
spec.add_development_dependency "coveralls", '~> 0.8'
|
28
28
|
spec.add_development_dependency "minitest", '~> 5.8'
|
29
|
-
spec.add_development_dependency "
|
30
|
-
spec.add_development_dependency "
|
29
|
+
spec.add_development_dependency "rake", '~> 10.4', '>= 10.4.2'
|
30
|
+
spec.add_development_dependency "rubocop", '~> 0.49'
|
31
31
|
spec.extensions << 'ext/argon2_wrap/extconf.rb'
|
32
32
|
end
|
@@ -20,7 +20,7 @@ BENCH = bench
|
|
20
20
|
GENKAT = genkat
|
21
21
|
|
22
22
|
# Increment on an ABI breaking change
|
23
|
-
ABI_VERSION =
|
23
|
+
ABI_VERSION = 1
|
24
24
|
|
25
25
|
DIST = phc-winner-argon2
|
26
26
|
|
@@ -123,7 +123,7 @@ INST_BINARY = $(DESTDIR)$(PREFIX)/$(BINARY_REL)
|
|
123
123
|
|
124
124
|
.PHONY: clean dist format $(GENKAT) all install
|
125
125
|
|
126
|
-
all:
|
126
|
+
all: $(RUN) libs
|
127
127
|
libs: $(LIBRARIES)
|
128
128
|
|
129
129
|
$(RUN): $(SRC) $(SRC_RUN)
|
@@ -256,10 +256,14 @@ their documentation):
|
|
256
256
|
* [OCaml](https://github.com/Khady/ocaml-argon2) by [@Khady](https://github.com/Khady)
|
257
257
|
* [Python (native)](https://pypi.python.org/pypi/argon2), by [@flamewow](https://github.com/flamewow)
|
258
258
|
* [Python (ffi)](https://pypi.python.org/pypi/argon2_cffi), by [@hynek](https://github.com/hynek)
|
259
|
+
* [Python (ffi, with keyed hashing)](https://github.com/thusoy/porridge), by [@thusoy](https://github.com/thusoy)
|
260
|
+
* [R](https://cran.r-project.org/package=argon2) by [@wrathematics](https://github.com/wrathematics)
|
259
261
|
* [Ruby](https://github.com/technion/ruby-argon2) by [@technion](https://github.com/technion)
|
260
262
|
* [Rust](https://github.com/quininer/argon2-rs) by [@quininer](https://github.com/quininer)
|
261
263
|
* [C#/.NET CoreCLR](https://github.com/kmaragon/Konscious.Security.Cryptography) by [@kmaragon](https://github.com/kmaragon)
|
262
264
|
* [Perl](https://github.com/Leont/crypt-argon2) by [@leont](https://github.com/Leont)
|
265
|
+
* [mruby](https://github.com/Asmod4n/mruby-argon2) by [@Asmod4n](https://github.com/Asmod4n)
|
266
|
+
* [Swift](https://github.com/ImKcat/CatCrypto) by [@ImKcat](https://github.com/ImKcat)
|
263
267
|
|
264
268
|
|
265
269
|
## Test suite
|
@@ -4,7 +4,7 @@
|
|
4
4
|
* Copyright 2015
|
5
5
|
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
|
6
6
|
*
|
7
|
-
* You may use this work under the terms of a Creative Commons CC0 1.0
|
7
|
+
* You may use this work under the terms of a Creative Commons CC0 1.0
|
8
8
|
* License/Waiver or the Apache Public License 2.0, at your option. The terms of
|
9
9
|
* these licenses can be found at:
|
10
10
|
*
|
@@ -29,10 +29,13 @@ extern "C" {
|
|
29
29
|
/* Symbols visibility control */
|
30
30
|
#ifdef A2_VISCTL
|
31
31
|
#define ARGON2_PUBLIC __attribute__((visibility("default")))
|
32
|
+
#define ARGON2_LOCAL __attribute__ ((visibility ("hidden")))
|
32
33
|
#elif _MSC_VER
|
33
34
|
#define ARGON2_PUBLIC __declspec(dllexport)
|
35
|
+
#define ARGON2_LOCAL
|
34
36
|
#else
|
35
37
|
#define ARGON2_PUBLIC
|
38
|
+
#define ARGON2_LOCAL
|
36
39
|
#endif
|
37
40
|
|
38
41
|
/*
|
@@ -267,8 +270,7 @@ ARGON2_PUBLIC int argon2i_hash_encoded(const uint32_t t_cost,
|
|
267
270
|
const size_t encodedlen);
|
268
271
|
|
269
272
|
/**
|
270
|
-
* Hashes a password with Argon2i, producing a raw hash
|
271
|
-
* @hash
|
273
|
+
* Hashes a password with Argon2i, producing a raw hash at @hash
|
272
274
|
* @param t_cost Number of iterations
|
273
275
|
* @param m_cost Sets memory usage to m_cost kibibytes
|
274
276
|
* @param parallelism Number of threads and compute lanes
|
@@ -4,7 +4,7 @@
|
|
4
4
|
* Copyright 2015
|
5
5
|
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
|
6
6
|
*
|
7
|
-
* You may use this work under the terms of a Creative Commons CC0 1.0
|
7
|
+
* You may use this work under the terms of a Creative Commons CC0 1.0
|
8
8
|
* License/Waiver or the Apache Public License 2.0, at your option. The terms of
|
9
9
|
* these licenses can be found at:
|
10
10
|
*
|
@@ -4,7 +4,7 @@
|
|
4
4
|
* Copyright 2015
|
5
5
|
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
|
6
6
|
*
|
7
|
-
* You may use this work under the terms of a Creative Commons CC0 1.0
|
7
|
+
* You may use this work under the terms of a Creative Commons CC0 1.0
|
8
8
|
* License/Waiver or the Apache Public License 2.0, at your option. The terms of
|
9
9
|
* these licenses can be found at:
|
10
10
|
*
|
@@ -45,7 +45,7 @@ static uint64_t rdtsc(void) {
|
|
45
45
|
}
|
46
46
|
|
47
47
|
/*
|
48
|
-
* Benchmarks Argon2 with salt length 16, password length 16, t_cost
|
48
|
+
* Benchmarks Argon2 with salt length 16, password length 16, t_cost 3,
|
49
49
|
and different m_cost and threads
|
50
50
|
*/
|
51
51
|
static void benchmark() {
|
@@ -4,7 +4,7 @@
|
|
4
4
|
* Copyright 2015
|
5
5
|
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
|
6
6
|
*
|
7
|
-
* You may use this work under the terms of a Creative Commons CC0 1.0
|
7
|
+
* You may use this work under the terms of a Creative Commons CC0 1.0
|
8
8
|
* License/Waiver or the Apache Public License 2.0, at your option. The terms of
|
9
9
|
* these licenses can be found at:
|
10
10
|
*
|
@@ -4,7 +4,7 @@
|
|
4
4
|
* Copyright 2015
|
5
5
|
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
|
6
6
|
*
|
7
|
-
* You may use this work under the terms of a Creative Commons CC0 1.0
|
7
|
+
* You may use this work under the terms of a Creative Commons CC0 1.0
|
8
8
|
* License/Waiver or the Apache Public License 2.0, at your option. The terms of
|
9
9
|
* these licenses can be found at:
|
10
10
|
*
|
@@ -18,9 +18,7 @@
|
|
18
18
|
#ifndef PORTABLE_BLAKE2_H
|
19
19
|
#define PORTABLE_BLAKE2_H
|
20
20
|
|
21
|
-
#include <
|
22
|
-
#include <stdint.h>
|
23
|
-
#include <limits.h>
|
21
|
+
#include <argon2.h>
|
24
22
|
|
25
23
|
#if defined(__cplusplus)
|
26
24
|
extern "C" {
|
@@ -69,19 +67,19 @@ enum {
|
|
69
67
|
};
|
70
68
|
|
71
69
|
/* Streaming API */
|
72
|
-
int blake2b_init(blake2b_state *S, size_t outlen);
|
73
|
-
int blake2b_init_key(blake2b_state *S, size_t outlen, const void *key,
|
70
|
+
ARGON2_LOCAL int blake2b_init(blake2b_state *S, size_t outlen);
|
71
|
+
ARGON2_LOCAL int blake2b_init_key(blake2b_state *S, size_t outlen, const void *key,
|
74
72
|
size_t keylen);
|
75
|
-
int blake2b_init_param(blake2b_state *S, const blake2b_param *P);
|
76
|
-
int blake2b_update(blake2b_state *S, const void *in, size_t inlen);
|
77
|
-
int blake2b_final(blake2b_state *S, void *out, size_t outlen);
|
73
|
+
ARGON2_LOCAL int blake2b_init_param(blake2b_state *S, const blake2b_param *P);
|
74
|
+
ARGON2_LOCAL int blake2b_update(blake2b_state *S, const void *in, size_t inlen);
|
75
|
+
ARGON2_LOCAL int blake2b_final(blake2b_state *S, void *out, size_t outlen);
|
78
76
|
|
79
77
|
/* Simple API */
|
80
|
-
int blake2b(void *out, size_t outlen, const void *in, size_t inlen,
|
81
|
-
|
78
|
+
ARGON2_LOCAL int blake2b(void *out, size_t outlen, const void *in, size_t inlen,
|
79
|
+
const void *key, size_t keylen);
|
82
80
|
|
83
81
|
/* Argon2 Team - Begin Code */
|
84
|
-
int blake2b_long(void *out, size_t outlen, const void *in, size_t inlen);
|
82
|
+
ARGON2_LOCAL int blake2b_long(void *out, size_t outlen, const void *in, size_t inlen);
|
85
83
|
/* Argon2 Team - End Code */
|
86
84
|
|
87
85
|
#if defined(__cplusplus)
|
@@ -4,7 +4,7 @@
|
|
4
4
|
* Copyright 2015
|
5
5
|
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
|
6
6
|
*
|
7
|
-
* You may use this work under the terms of a Creative Commons CC0 1.0
|
7
|
+
* You may use this work under the terms of a Creative Commons CC0 1.0
|
8
8
|
* License/Waiver or the Apache Public License 2.0, at your option. The terms of
|
9
9
|
* these licenses can be found at:
|
10
10
|
*
|
@@ -4,7 +4,7 @@
|
|
4
4
|
* Copyright 2015
|
5
5
|
* Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves
|
6
6
|
*
|
7
|
-
* You may use this work under the terms of a Creative Commons CC0 1.0
|
7
|
+
* You may use this work under the terms of a Creative Commons CC0 1.0
|
8
8
|
* License/Waiver or the Apache Public License 2.0, at your option. The terms of
|
9
9
|
* these licenses can be found at:
|
10
10
|
*
|
@@ -29,6 +29,8 @@
|
|
29
29
|
#include <x86intrin.h>
|
30
30
|
#endif
|
31
31
|
|
32
|
+
#if !defined(__AVX512F__)
|
33
|
+
#if !defined(__AVX2__)
|
32
34
|
#if !defined(__XOP__)
|
33
35
|
#if defined(__SSSE3__)
|
34
36
|
#define r16 \
|
@@ -176,5 +178,294 @@ static BLAKE2_INLINE __m128i fBlaMka(__m128i x, __m128i y) {
|
|
176
178
|
\
|
177
179
|
UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
|
178
180
|
} while ((void)0, 0)
|
181
|
+
#else /* __AVX2__ */
|
179
182
|
|
180
|
-
#
|
183
|
+
#include <immintrin.h>
|
184
|
+
|
185
|
+
#define rotr32(x) _mm256_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1))
|
186
|
+
#define rotr24(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10))
|
187
|
+
#define rotr16(x) _mm256_shuffle_epi8(x, _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9))
|
188
|
+
#define rotr63(x) _mm256_xor_si256(_mm256_srli_epi64((x), 63), _mm256_add_epi64((x), (x)))
|
189
|
+
|
190
|
+
#define G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
191
|
+
do { \
|
192
|
+
__m256i ml = _mm256_mul_epu32(A0, B0); \
|
193
|
+
ml = _mm256_add_epi64(ml, ml); \
|
194
|
+
A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
|
195
|
+
D0 = _mm256_xor_si256(D0, A0); \
|
196
|
+
D0 = rotr32(D0); \
|
197
|
+
\
|
198
|
+
ml = _mm256_mul_epu32(C0, D0); \
|
199
|
+
ml = _mm256_add_epi64(ml, ml); \
|
200
|
+
C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
|
201
|
+
\
|
202
|
+
B0 = _mm256_xor_si256(B0, C0); \
|
203
|
+
B0 = rotr24(B0); \
|
204
|
+
\
|
205
|
+
ml = _mm256_mul_epu32(A1, B1); \
|
206
|
+
ml = _mm256_add_epi64(ml, ml); \
|
207
|
+
A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
|
208
|
+
D1 = _mm256_xor_si256(D1, A1); \
|
209
|
+
D1 = rotr32(D1); \
|
210
|
+
\
|
211
|
+
ml = _mm256_mul_epu32(C1, D1); \
|
212
|
+
ml = _mm256_add_epi64(ml, ml); \
|
213
|
+
C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
|
214
|
+
\
|
215
|
+
B1 = _mm256_xor_si256(B1, C1); \
|
216
|
+
B1 = rotr24(B1); \
|
217
|
+
} while((void)0, 0);
|
218
|
+
|
219
|
+
#define G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
220
|
+
do { \
|
221
|
+
__m256i ml = _mm256_mul_epu32(A0, B0); \
|
222
|
+
ml = _mm256_add_epi64(ml, ml); \
|
223
|
+
A0 = _mm256_add_epi64(A0, _mm256_add_epi64(B0, ml)); \
|
224
|
+
D0 = _mm256_xor_si256(D0, A0); \
|
225
|
+
D0 = rotr16(D0); \
|
226
|
+
\
|
227
|
+
ml = _mm256_mul_epu32(C0, D0); \
|
228
|
+
ml = _mm256_add_epi64(ml, ml); \
|
229
|
+
C0 = _mm256_add_epi64(C0, _mm256_add_epi64(D0, ml)); \
|
230
|
+
B0 = _mm256_xor_si256(B0, C0); \
|
231
|
+
B0 = rotr63(B0); \
|
232
|
+
\
|
233
|
+
ml = _mm256_mul_epu32(A1, B1); \
|
234
|
+
ml = _mm256_add_epi64(ml, ml); \
|
235
|
+
A1 = _mm256_add_epi64(A1, _mm256_add_epi64(B1, ml)); \
|
236
|
+
D1 = _mm256_xor_si256(D1, A1); \
|
237
|
+
D1 = rotr16(D1); \
|
238
|
+
\
|
239
|
+
ml = _mm256_mul_epu32(C1, D1); \
|
240
|
+
ml = _mm256_add_epi64(ml, ml); \
|
241
|
+
C1 = _mm256_add_epi64(C1, _mm256_add_epi64(D1, ml)); \
|
242
|
+
B1 = _mm256_xor_si256(B1, C1); \
|
243
|
+
B1 = rotr63(B1); \
|
244
|
+
} while((void)0, 0);
|
245
|
+
|
246
|
+
#define DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
|
247
|
+
do { \
|
248
|
+
B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
|
249
|
+
C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
|
250
|
+
D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
|
251
|
+
\
|
252
|
+
B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
|
253
|
+
C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
|
254
|
+
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
|
255
|
+
} while((void)0, 0);
|
256
|
+
|
257
|
+
#define DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
258
|
+
do { \
|
259
|
+
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
|
260
|
+
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
|
261
|
+
B1 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
|
262
|
+
B0 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
|
263
|
+
\
|
264
|
+
tmp1 = C0; \
|
265
|
+
C0 = C1; \
|
266
|
+
C1 = tmp1; \
|
267
|
+
\
|
268
|
+
tmp1 = _mm256_blend_epi32(D0, D1, 0xCC); \
|
269
|
+
tmp2 = _mm256_blend_epi32(D0, D1, 0x33); \
|
270
|
+
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
|
271
|
+
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
|
272
|
+
} while(0);
|
273
|
+
|
274
|
+
#define UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
|
275
|
+
do { \
|
276
|
+
B0 = _mm256_permute4x64_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
|
277
|
+
C0 = _mm256_permute4x64_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
|
278
|
+
D0 = _mm256_permute4x64_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
|
279
|
+
\
|
280
|
+
B1 = _mm256_permute4x64_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
|
281
|
+
C1 = _mm256_permute4x64_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
|
282
|
+
D1 = _mm256_permute4x64_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
|
283
|
+
} while((void)0, 0);
|
284
|
+
|
285
|
+
#define UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
286
|
+
do { \
|
287
|
+
__m256i tmp1 = _mm256_blend_epi32(B0, B1, 0xCC); \
|
288
|
+
__m256i tmp2 = _mm256_blend_epi32(B0, B1, 0x33); \
|
289
|
+
B0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
|
290
|
+
B1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
|
291
|
+
\
|
292
|
+
tmp1 = C0; \
|
293
|
+
C0 = C1; \
|
294
|
+
C1 = tmp1; \
|
295
|
+
\
|
296
|
+
tmp1 = _mm256_blend_epi32(D0, D1, 0x33); \
|
297
|
+
tmp2 = _mm256_blend_epi32(D0, D1, 0xCC); \
|
298
|
+
D0 = _mm256_permute4x64_epi64(tmp1, _MM_SHUFFLE(2,3,0,1)); \
|
299
|
+
D1 = _mm256_permute4x64_epi64(tmp2, _MM_SHUFFLE(2,3,0,1)); \
|
300
|
+
} while((void)0, 0);
|
301
|
+
|
302
|
+
#define BLAKE2_ROUND_1(A0, A1, B0, B1, C0, C1, D0, D1) \
|
303
|
+
do{ \
|
304
|
+
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
305
|
+
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
306
|
+
\
|
307
|
+
DIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
|
308
|
+
\
|
309
|
+
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
310
|
+
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
311
|
+
\
|
312
|
+
UNDIAGONALIZE_1(A0, B0, C0, D0, A1, B1, C1, D1) \
|
313
|
+
} while((void)0, 0);
|
314
|
+
|
315
|
+
#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
316
|
+
do{ \
|
317
|
+
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
318
|
+
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
319
|
+
\
|
320
|
+
DIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
321
|
+
\
|
322
|
+
G1_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
323
|
+
G2_AVX2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
324
|
+
\
|
325
|
+
UNDIAGONALIZE_2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
326
|
+
} while((void)0, 0);
|
327
|
+
|
328
|
+
#endif /* __AVX2__ */
|
329
|
+
|
330
|
+
#else /* __AVX512F__ */
|
331
|
+
|
332
|
+
#include <immintrin.h>
|
333
|
+
|
334
|
+
#define ror64(x, n) _mm512_ror_epi64((x), (n))
|
335
|
+
|
336
|
+
static __m512i muladd(__m512i x, __m512i y)
|
337
|
+
{
|
338
|
+
__m512i z = _mm512_mul_epu32(x, y);
|
339
|
+
return _mm512_add_epi64(_mm512_add_epi64(x, y), _mm512_add_epi64(z, z));
|
340
|
+
}
|
341
|
+
|
342
|
+
#define G1(A0, B0, C0, D0, A1, B1, C1, D1) \
|
343
|
+
do { \
|
344
|
+
A0 = muladd(A0, B0); \
|
345
|
+
A1 = muladd(A1, B1); \
|
346
|
+
\
|
347
|
+
D0 = _mm512_xor_si512(D0, A0); \
|
348
|
+
D1 = _mm512_xor_si512(D1, A1); \
|
349
|
+
\
|
350
|
+
D0 = ror64(D0, 32); \
|
351
|
+
D1 = ror64(D1, 32); \
|
352
|
+
\
|
353
|
+
C0 = muladd(C0, D0); \
|
354
|
+
C1 = muladd(C1, D1); \
|
355
|
+
\
|
356
|
+
B0 = _mm512_xor_si512(B0, C0); \
|
357
|
+
B1 = _mm512_xor_si512(B1, C1); \
|
358
|
+
\
|
359
|
+
B0 = ror64(B0, 24); \
|
360
|
+
B1 = ror64(B1, 24); \
|
361
|
+
} while ((void)0, 0)
|
362
|
+
|
363
|
+
#define G2(A0, B0, C0, D0, A1, B1, C1, D1) \
|
364
|
+
do { \
|
365
|
+
A0 = muladd(A0, B0); \
|
366
|
+
A1 = muladd(A1, B1); \
|
367
|
+
\
|
368
|
+
D0 = _mm512_xor_si512(D0, A0); \
|
369
|
+
D1 = _mm512_xor_si512(D1, A1); \
|
370
|
+
\
|
371
|
+
D0 = ror64(D0, 16); \
|
372
|
+
D1 = ror64(D1, 16); \
|
373
|
+
\
|
374
|
+
C0 = muladd(C0, D0); \
|
375
|
+
C1 = muladd(C1, D1); \
|
376
|
+
\
|
377
|
+
B0 = _mm512_xor_si512(B0, C0); \
|
378
|
+
B1 = _mm512_xor_si512(B1, C1); \
|
379
|
+
\
|
380
|
+
B0 = ror64(B0, 63); \
|
381
|
+
B1 = ror64(B1, 63); \
|
382
|
+
} while ((void)0, 0)
|
383
|
+
|
384
|
+
#define DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
|
385
|
+
do { \
|
386
|
+
B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(0, 3, 2, 1)); \
|
387
|
+
B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(0, 3, 2, 1)); \
|
388
|
+
\
|
389
|
+
C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
|
390
|
+
C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
|
391
|
+
\
|
392
|
+
D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(2, 1, 0, 3)); \
|
393
|
+
D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(2, 1, 0, 3)); \
|
394
|
+
} while ((void)0, 0)
|
395
|
+
|
396
|
+
#define UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1) \
|
397
|
+
do { \
|
398
|
+
B0 = _mm512_permutex_epi64(B0, _MM_SHUFFLE(2, 1, 0, 3)); \
|
399
|
+
B1 = _mm512_permutex_epi64(B1, _MM_SHUFFLE(2, 1, 0, 3)); \
|
400
|
+
\
|
401
|
+
C0 = _mm512_permutex_epi64(C0, _MM_SHUFFLE(1, 0, 3, 2)); \
|
402
|
+
C1 = _mm512_permutex_epi64(C1, _MM_SHUFFLE(1, 0, 3, 2)); \
|
403
|
+
\
|
404
|
+
D0 = _mm512_permutex_epi64(D0, _MM_SHUFFLE(0, 3, 2, 1)); \
|
405
|
+
D1 = _mm512_permutex_epi64(D1, _MM_SHUFFLE(0, 3, 2, 1)); \
|
406
|
+
} while ((void)0, 0)
|
407
|
+
|
408
|
+
#define BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1) \
|
409
|
+
do { \
|
410
|
+
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
|
411
|
+
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
|
412
|
+
\
|
413
|
+
DIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
|
414
|
+
\
|
415
|
+
G1(A0, B0, C0, D0, A1, B1, C1, D1); \
|
416
|
+
G2(A0, B0, C0, D0, A1, B1, C1, D1); \
|
417
|
+
\
|
418
|
+
UNDIAGONALIZE(A0, B0, C0, D0, A1, B1, C1, D1); \
|
419
|
+
} while ((void)0, 0)
|
420
|
+
|
421
|
+
#define SWAP_HALVES(A0, A1) \
|
422
|
+
do { \
|
423
|
+
__m512i t0, t1; \
|
424
|
+
t0 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(1, 0, 1, 0)); \
|
425
|
+
t1 = _mm512_shuffle_i64x2(A0, A1, _MM_SHUFFLE(3, 2, 3, 2)); \
|
426
|
+
A0 = t0; \
|
427
|
+
A1 = t1; \
|
428
|
+
} while((void)0, 0)
|
429
|
+
|
430
|
+
#define SWAP_QUARTERS(A0, A1) \
|
431
|
+
do { \
|
432
|
+
SWAP_HALVES(A0, A1); \
|
433
|
+
A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
|
434
|
+
A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
|
435
|
+
} while((void)0, 0)
|
436
|
+
|
437
|
+
#define UNSWAP_QUARTERS(A0, A1) \
|
438
|
+
do { \
|
439
|
+
A0 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A0); \
|
440
|
+
A1 = _mm512_permutexvar_epi64(_mm512_setr_epi64(0, 1, 4, 5, 2, 3, 6, 7), A1); \
|
441
|
+
SWAP_HALVES(A0, A1); \
|
442
|
+
} while((void)0, 0)
|
443
|
+
|
444
|
+
#define BLAKE2_ROUND_1(A0, C0, B0, D0, A1, C1, B1, D1) \
|
445
|
+
do { \
|
446
|
+
SWAP_HALVES(A0, B0); \
|
447
|
+
SWAP_HALVES(C0, D0); \
|
448
|
+
SWAP_HALVES(A1, B1); \
|
449
|
+
SWAP_HALVES(C1, D1); \
|
450
|
+
BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \
|
451
|
+
SWAP_HALVES(A0, B0); \
|
452
|
+
SWAP_HALVES(C0, D0); \
|
453
|
+
SWAP_HALVES(A1, B1); \
|
454
|
+
SWAP_HALVES(C1, D1); \
|
455
|
+
} while ((void)0, 0)
|
456
|
+
|
457
|
+
#define BLAKE2_ROUND_2(A0, A1, B0, B1, C0, C1, D0, D1) \
|
458
|
+
do { \
|
459
|
+
SWAP_QUARTERS(A0, A1); \
|
460
|
+
SWAP_QUARTERS(B0, B1); \
|
461
|
+
SWAP_QUARTERS(C0, C1); \
|
462
|
+
SWAP_QUARTERS(D0, D1); \
|
463
|
+
BLAKE2_ROUND(A0, B0, C0, D0, A1, B1, C1, D1); \
|
464
|
+
UNSWAP_QUARTERS(A0, A1); \
|
465
|
+
UNSWAP_QUARTERS(B0, B1); \
|
466
|
+
UNSWAP_QUARTERS(C0, C1); \
|
467
|
+
UNSWAP_QUARTERS(D0, D1); \
|
468
|
+
} while ((void)0, 0)
|
469
|
+
|
470
|
+
#endif /* __AVX512F__ */
|
471
|
+
#endif /* BLAKE_ROUND_MKA_OPT_H */
|