digest-blake3 0.37.0.1 → 1.3.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -4
- data/ext/digest/blake3/blake3.c +16 -3
- data/ext/digest/blake3/blake3.h +4 -2
- data/ext/digest/blake3/blake3_avx2.c +4 -3
- data/ext/digest/blake3/blake3_avx512.c +26 -10
- data/ext/digest/blake3/blake3_dispatch.c +11 -5
- data/ext/digest/blake3/blake3_impl.h +17 -5
- data/ext/digest/blake3/blake3_neon.c +6 -1
- data/ext/digest/blake3/blake3_sse2.c +5 -4
- data/ext/digest/blake3/blake3_sse2_x86-64_unix.S +2 -2
- data/ext/digest/blake3/blake3_sse2_x86-64_windows_gnu.S +8 -8
- data/ext/digest/blake3/blake3_sse2_x86-64_windows_msvc.asm +10 -10
- data/ext/digest/blake3/blake3_sse41.c +4 -3
- data/ext/digest/blake3/blake3_sse41_x86-64_windows_msvc.asm +4 -4
- data/ext/digest/blake3/extconf.rb +6 -4
- data/lib/digest/blake3/version.rb +1 -1
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6e9f7d8e4619bac26fee8eceb10b221935755588c05f333e53275080c61b83a5
|
4
|
+
data.tar.gz: fecaddd526e8bf374d675e80d49aefbcbed12dc491a6be50ea9beea335b943fd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6ecd8cdc8f5b320f152e4d89c263b68803f63ec609235965eebc11a8093d146bb7223b86302c24c0dc65abb69cbf6040e5635704fb07caeb115c8f76effe38c2
|
7
|
+
data.tar.gz: ec642c10fb95e51620ad78fa2915d467dc665b7c1683f8f30c56ab9ab5560a98983e6afdd4fca1c18d2088f0bd0e58248ea30a05230bbed48ff95d74851cd056
|
data/Gemfile.lock
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
digest-blake3 (
|
4
|
+
digest-blake3 (1.3.3.1)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
8
8
|
specs:
|
9
|
-
minitest (5.
|
10
|
-
rake (13.0.
|
9
|
+
minitest (5.16.3)
|
10
|
+
rake (13.0.6)
|
11
11
|
|
12
12
|
PLATFORMS
|
13
13
|
ruby
|
@@ -19,4 +19,4 @@ DEPENDENCIES
|
|
19
19
|
rake
|
20
20
|
|
21
21
|
BUNDLED WITH
|
22
|
-
|
22
|
+
2.3.1
|
data/ext/digest/blake3/blake3.c
CHANGED
@@ -5,6 +5,8 @@
|
|
5
5
|
#include "blake3.h"
|
6
6
|
#include "blake3_impl.h"
|
7
7
|
|
8
|
+
const char *blake3_version(void) { return BLAKE3_VERSION_STRING; }
|
9
|
+
|
8
10
|
INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8],
|
9
11
|
uint8_t flags) {
|
10
12
|
memcpy(self->cv, key, BLAKE3_KEY_LEN);
|
@@ -244,7 +246,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
|
|
244
246
|
|
245
247
|
// The wide helper function returns (writes out) an array of chaining values
|
246
248
|
// and returns the length of that array. The number of chaining values returned
|
247
|
-
// is the
|
249
|
+
// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
|
248
250
|
// if the input is shorter than that many chunks. The reason for maintaining a
|
249
251
|
// wide array of chaining values going back up the tree, is to allow the
|
250
252
|
// implementation to hash as many parents in parallel as possible.
|
@@ -252,7 +254,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
|
|
252
254
|
// As a special case when the SIMD degree is 1, this function will still return
|
253
255
|
// at least 2 outputs. This guarantees that this function doesn't perform the
|
254
256
|
// root compression. (If it did, it would use the wrong flags, and also we
|
255
|
-
// wouldn't be able to implement extendable
|
257
|
+
// wouldn't be able to implement extendable output.) Note that this function is
|
256
258
|
// not used when the whole input is only 1 chunk long; that's a different
|
257
259
|
// codepath.
|
258
260
|
//
|
@@ -338,12 +340,18 @@ INLINE void compress_subtree_to_parent_node(
|
|
338
340
|
uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
|
339
341
|
size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
|
340
342
|
chunk_counter, flags, cv_array);
|
343
|
+
assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
|
341
344
|
|
342
345
|
// If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
|
343
346
|
// compress_subtree_wide() returns more than 2 chaining values. Condense
|
344
347
|
// them into 2 by forming parent nodes repeatedly.
|
345
348
|
uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
|
346
|
-
|
349
|
+
// The second half of this loop condition is always true, and we just
|
350
|
+
// asserted it above. But GCC can't tell that it's always true, and if NDEBUG
|
351
|
+
// is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
|
352
|
+
// warnings here. GCC 8.5 is particularly sensitive, so if you're changing
|
353
|
+
// this code, test it against that version.
|
354
|
+
while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
|
347
355
|
num_cvs =
|
348
356
|
compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
|
349
357
|
memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
|
@@ -601,3 +609,8 @@ void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
|
|
601
609
|
}
|
602
610
|
output_root_bytes(&output, seek, out, out_len);
|
603
611
|
}
|
612
|
+
|
613
|
+
void blake3_hasher_reset(blake3_hasher *self) {
|
614
|
+
chunk_state_reset(&self->chunk, self->key, 0);
|
615
|
+
self->cv_stack_len = 0;
|
616
|
+
}
|
data/ext/digest/blake3/blake3.h
CHANGED
@@ -8,12 +8,12 @@
|
|
8
8
|
extern "C" {
|
9
9
|
#endif
|
10
10
|
|
11
|
+
#define BLAKE3_VERSION_STRING "1.3.3"
|
11
12
|
#define BLAKE3_KEY_LEN 32
|
12
13
|
#define BLAKE3_OUT_LEN 32
|
13
14
|
#define BLAKE3_BLOCK_LEN 64
|
14
15
|
#define BLAKE3_CHUNK_LEN 1024
|
15
16
|
#define BLAKE3_MAX_DEPTH 54
|
16
|
-
#define BLAKE3_MAX_SIMD_DEGREE 16
|
17
17
|
|
18
18
|
// This struct is a private implementation detail. It has to be here because
|
19
19
|
// it's part of blake3_hasher below.
|
@@ -38,11 +38,12 @@ typedef struct {
|
|
38
38
|
uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
|
39
39
|
} blake3_hasher;
|
40
40
|
|
41
|
+
const char *blake3_version(void);
|
41
42
|
void blake3_hasher_init(blake3_hasher *self);
|
42
43
|
void blake3_hasher_init_keyed(blake3_hasher *self,
|
43
44
|
const uint8_t key[BLAKE3_KEY_LEN]);
|
44
45
|
void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
|
45
|
-
void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
|
46
|
+
void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
|
46
47
|
size_t context_len);
|
47
48
|
void blake3_hasher_update(blake3_hasher *self, const void *input,
|
48
49
|
size_t input_len);
|
@@ -50,6 +51,7 @@ void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
|
|
50
51
|
size_t out_len);
|
51
52
|
void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
|
52
53
|
uint8_t *out, size_t out_len);
|
54
|
+
void blake3_hasher_reset(blake3_hasher *self);
|
53
55
|
|
54
56
|
#ifdef __cplusplus
|
55
57
|
}
|
@@ -208,7 +208,7 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
|
|
208
208
|
out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
|
209
209
|
out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
|
210
210
|
for (size_t i = 0; i < 8; ++i) {
|
211
|
-
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
211
|
+
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
|
212
212
|
}
|
213
213
|
transpose_vecs(&out[0]);
|
214
214
|
transpose_vecs(&out[8]);
|
@@ -219,14 +219,15 @@ INLINE void load_counters(uint64_t counter, bool increment_counter,
|
|
219
219
|
const __m256i mask = _mm256_set1_epi32(-(int32_t)increment_counter);
|
220
220
|
const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
|
221
221
|
const __m256i add1 = _mm256_and_si256(mask, add0);
|
222
|
-
__m256i l = _mm256_add_epi32(_mm256_set1_epi32(counter), add1);
|
222
|
+
__m256i l = _mm256_add_epi32(_mm256_set1_epi32((int32_t)counter), add1);
|
223
223
|
__m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)),
|
224
224
|
_mm256_xor_si256( l, _mm256_set1_epi32(0x80000000)));
|
225
|
-
__m256i h = _mm256_sub_epi32(_mm256_set1_epi32(counter >> 32), carry);
|
225
|
+
__m256i h = _mm256_sub_epi32(_mm256_set1_epi32((int32_t)(counter >> 32)), carry);
|
226
226
|
*out_lo = l;
|
227
227
|
*out_hi = h;
|
228
228
|
}
|
229
229
|
|
230
|
+
static
|
230
231
|
void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks,
|
231
232
|
const uint32_t key[8], uint64_t counter,
|
232
233
|
bool increment_counter, uint8_t flags,
|
@@ -468,7 +468,7 @@ INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
|
|
468
468
|
out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
|
469
469
|
out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
|
470
470
|
for (size_t i = 0; i < 4; ++i) {
|
471
|
-
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
471
|
+
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
|
472
472
|
}
|
473
473
|
transpose_vecs_128(&out[0]);
|
474
474
|
transpose_vecs_128(&out[4]);
|
@@ -488,6 +488,7 @@ INLINE void load_counters4(uint64_t counter, bool increment_counter,
|
|
488
488
|
*out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32));
|
489
489
|
}
|
490
490
|
|
491
|
+
static
|
491
492
|
void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks,
|
492
493
|
const uint32_t key[8], uint64_t counter,
|
493
494
|
bool increment_counter, uint8_t flags,
|
@@ -724,7 +725,7 @@ INLINE void transpose_msg_vecs8(const uint8_t *const *inputs,
|
|
724
725
|
out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
|
725
726
|
out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
|
726
727
|
for (size_t i = 0; i < 8; ++i) {
|
727
|
-
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
728
|
+
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
|
728
729
|
}
|
729
730
|
transpose_vecs_256(&out[0]);
|
730
731
|
transpose_vecs_256(&out[8]);
|
@@ -742,6 +743,7 @@ INLINE void load_counters8(uint64_t counter, bool increment_counter,
|
|
742
743
|
*out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32));
|
743
744
|
}
|
744
745
|
|
746
|
+
static
|
745
747
|
void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks,
|
746
748
|
const uint32_t key[8], uint64_t counter,
|
747
749
|
bool increment_counter, uint8_t flags,
|
@@ -1037,7 +1039,7 @@ INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
|
|
1037
1039
|
out[14] = loadu_512(&inputs[14][block_offset]);
|
1038
1040
|
out[15] = loadu_512(&inputs[15][block_offset]);
|
1039
1041
|
for (size_t i = 0; i < 16; ++i) {
|
1040
|
-
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
1042
|
+
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
|
1041
1043
|
}
|
1042
1044
|
transpose_vecs_512(out);
|
1043
1045
|
}
|
@@ -1045,15 +1047,29 @@ INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
|
|
1045
1047
|
INLINE void load_counters16(uint64_t counter, bool increment_counter,
|
1046
1048
|
__m512i *out_lo, __m512i *out_hi) {
|
1047
1049
|
const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter);
|
1048
|
-
const __m512i
|
1049
|
-
const __m512i
|
1050
|
-
__m512i
|
1051
|
-
|
1052
|
-
|
1053
|
-
|
1054
|
-
|
1050
|
+
const __m512i deltas = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
1051
|
+
const __m512i masked_deltas = _mm512_and_si512(deltas, mask);
|
1052
|
+
const __m512i low_words = _mm512_add_epi32(
|
1053
|
+
_mm512_set1_epi32((int32_t)counter),
|
1054
|
+
masked_deltas);
|
1055
|
+
// The carry bit is 1 if the high bit of the word was 1 before addition and is
|
1056
|
+
// 0 after.
|
1057
|
+
// NOTE: It would be a bit more natural to use _mm512_cmp_epu32_mask to
|
1058
|
+
// compute the carry bits here, and originally we did, but that intrinsic is
|
1059
|
+
// broken under GCC 5.4. See https://github.com/BLAKE3-team/BLAKE3/issues/271.
|
1060
|
+
const __m512i carries = _mm512_srli_epi32(
|
1061
|
+
_mm512_andnot_si512(
|
1062
|
+
low_words, // 0 after (gets inverted by andnot)
|
1063
|
+
_mm512_set1_epi32((int32_t)counter)), // and 1 before
|
1064
|
+
31);
|
1065
|
+
const __m512i high_words = _mm512_add_epi32(
|
1066
|
+
_mm512_set1_epi32((int32_t)(counter >> 32)),
|
1067
|
+
carries);
|
1068
|
+
*out_lo = low_words;
|
1069
|
+
*out_hi = high_words;
|
1055
1070
|
}
|
1056
1071
|
|
1072
|
+
static
|
1057
1073
|
void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
|
1058
1074
|
const uint32_t key[8], uint64_t counter,
|
1059
1075
|
bool increment_counter, uint8_t flags,
|
@@ -10,12 +10,14 @@
|
|
10
10
|
#elif defined(__GNUC__)
|
11
11
|
#include <immintrin.h>
|
12
12
|
#else
|
13
|
-
#
|
13
|
+
#undef IS_X86 /* Unimplemented! */
|
14
14
|
#endif
|
15
15
|
#endif
|
16
16
|
|
17
|
+
#define MAYBE_UNUSED(x) (void)((x))
|
18
|
+
|
17
19
|
#if defined(IS_X86)
|
18
|
-
static uint64_t xgetbv() {
|
20
|
+
static uint64_t xgetbv(void) {
|
19
21
|
#if defined(_MSC_VER)
|
20
22
|
return _xgetbv(0);
|
21
23
|
#else
|
@@ -80,7 +82,7 @@ static /* Allow the variable to be controlled manually for testing */
|
|
80
82
|
static
|
81
83
|
#endif
|
82
84
|
enum cpu_feature
|
83
|
-
get_cpu_features() {
|
85
|
+
get_cpu_features(void) {
|
84
86
|
|
85
87
|
if (g_cpu_features != UNDEFINED) {
|
86
88
|
return g_cpu_features;
|
@@ -137,6 +139,7 @@ void blake3_compress_in_place(uint32_t cv[8],
|
|
137
139
|
uint8_t flags) {
|
138
140
|
#if defined(IS_X86)
|
139
141
|
const enum cpu_feature features = get_cpu_features();
|
142
|
+
MAYBE_UNUSED(features);
|
140
143
|
#if !defined(BLAKE3_NO_AVX512)
|
141
144
|
if (features & AVX512VL) {
|
142
145
|
blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
|
@@ -165,6 +168,7 @@ void blake3_compress_xof(const uint32_t cv[8],
|
|
165
168
|
uint8_t out[64]) {
|
166
169
|
#if defined(IS_X86)
|
167
170
|
const enum cpu_feature features = get_cpu_features();
|
171
|
+
MAYBE_UNUSED(features);
|
168
172
|
#if !defined(BLAKE3_NO_AVX512)
|
169
173
|
if (features & AVX512VL) {
|
170
174
|
blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
|
@@ -193,6 +197,7 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
|
|
193
197
|
uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
|
194
198
|
#if defined(IS_X86)
|
195
199
|
const enum cpu_feature features = get_cpu_features();
|
200
|
+
MAYBE_UNUSED(features);
|
196
201
|
#if !defined(BLAKE3_NO_AVX512)
|
197
202
|
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
|
198
203
|
blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
|
@@ -227,7 +232,7 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
|
|
227
232
|
#endif
|
228
233
|
#endif
|
229
234
|
|
230
|
-
#if
|
235
|
+
#if BLAKE3_USE_NEON == 1
|
231
236
|
blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
|
232
237
|
increment_counter, flags, flags_start, flags_end, out);
|
233
238
|
return;
|
@@ -242,6 +247,7 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
|
|
242
247
|
size_t blake3_simd_degree(void) {
|
243
248
|
#if defined(IS_X86)
|
244
249
|
const enum cpu_feature features = get_cpu_features();
|
250
|
+
MAYBE_UNUSED(features);
|
245
251
|
#if !defined(BLAKE3_NO_AVX512)
|
246
252
|
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
|
247
253
|
return 16;
|
@@ -263,7 +269,7 @@ size_t blake3_simd_degree(void) {
|
|
263
269
|
}
|
264
270
|
#endif
|
265
271
|
#endif
|
266
|
-
#if
|
272
|
+
#if BLAKE3_USE_NEON == 1
|
267
273
|
return 4;
|
268
274
|
#endif
|
269
275
|
return 1;
|
@@ -38,16 +38,28 @@ enum blake3_flags {
|
|
38
38
|
#define IS_X86_32
|
39
39
|
#endif
|
40
40
|
|
41
|
+
#if defined(__aarch64__) || defined(_M_ARM64)
|
42
|
+
#define IS_AARCH64
|
43
|
+
#endif
|
44
|
+
|
41
45
|
#if defined(IS_X86)
|
42
46
|
#if defined(_MSC_VER)
|
43
47
|
#include <intrin.h>
|
44
48
|
#endif
|
45
|
-
#
|
49
|
+
#endif
|
50
|
+
|
51
|
+
#if !defined(BLAKE3_USE_NEON)
|
52
|
+
// If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
|
53
|
+
#if defined(IS_AARCH64)
|
54
|
+
#define BLAKE3_USE_NEON 1
|
55
|
+
#else
|
56
|
+
#define BLAKE3_USE_NEON 0
|
57
|
+
#endif
|
46
58
|
#endif
|
47
59
|
|
48
60
|
#if defined(IS_X86)
|
49
61
|
#define MAX_SIMD_DEGREE 16
|
50
|
-
#elif
|
62
|
+
#elif BLAKE3_USE_NEON == 1
|
51
63
|
#define MAX_SIMD_DEGREE 4
|
52
64
|
#else
|
53
65
|
#define MAX_SIMD_DEGREE 1
|
@@ -83,11 +95,11 @@ static unsigned int highest_one(uint64_t x) {
|
|
83
95
|
#elif defined(_MSC_VER) && defined(IS_X86_32)
|
84
96
|
if(x >> 32) {
|
85
97
|
unsigned long index;
|
86
|
-
_BitScanReverse(&index, x >> 32);
|
98
|
+
_BitScanReverse(&index, (unsigned long)(x >> 32));
|
87
99
|
return 32 + index;
|
88
100
|
} else {
|
89
101
|
unsigned long index;
|
90
|
-
_BitScanReverse(&index, x);
|
102
|
+
_BitScanReverse(&index, (unsigned long)x);
|
91
103
|
return index;
|
92
104
|
}
|
93
105
|
#else
|
@@ -257,7 +269,7 @@ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
|
|
257
269
|
#endif
|
258
270
|
#endif
|
259
271
|
|
260
|
-
#if
|
272
|
+
#if BLAKE3_USE_NEON == 1
|
261
273
|
void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
|
262
274
|
size_t blocks, const uint32_t key[8],
|
263
275
|
uint64_t counter, bool increment_counter,
|
@@ -2,7 +2,12 @@
|
|
2
2
|
|
3
3
|
#include <arm_neon.h>
|
4
4
|
|
5
|
-
|
5
|
+
#ifdef __ARM_BIG_ENDIAN
|
6
|
+
#error "This implementation only supports little-endian ARM."
|
7
|
+
// It might be that all we need for big-endian support here is to get the loads
|
8
|
+
// and stores right, but step zero would be finding a way to test it in CI.
|
9
|
+
#endif
|
10
|
+
|
6
11
|
INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
|
7
12
|
// vld1q_u32 has alignment requirements. Don't use it.
|
8
13
|
uint32x4_t x;
|
@@ -78,7 +78,7 @@ INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
|
|
78
78
|
*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
|
79
79
|
}
|
80
80
|
|
81
|
-
INLINE __m128i blend_epi16(__m128i a, __m128i b, const
|
81
|
+
INLINE __m128i blend_epi16(__m128i a, __m128i b, const int16_t imm8) {
|
82
82
|
const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
|
83
83
|
__m128i mask = _mm_set1_epi16(imm8);
|
84
84
|
mask = _mm_and_si128(mask, bits);
|
@@ -435,7 +435,7 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
|
|
435
435
|
out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
|
436
436
|
out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
|
437
437
|
for (size_t i = 0; i < 4; ++i) {
|
438
|
-
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
438
|
+
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
|
439
439
|
}
|
440
440
|
transpose_vecs(&out[0]);
|
441
441
|
transpose_vecs(&out[4]);
|
@@ -448,14 +448,15 @@ INLINE void load_counters(uint64_t counter, bool increment_counter,
|
|
448
448
|
const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
|
449
449
|
const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
|
450
450
|
const __m128i add1 = _mm_and_si128(mask, add0);
|
451
|
-
__m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1);
|
451
|
+
__m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
|
452
452
|
__m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
|
453
453
|
_mm_xor_si128( l, _mm_set1_epi32(0x80000000)));
|
454
|
-
__m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry);
|
454
|
+
__m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
|
455
455
|
*out_lo = l;
|
456
456
|
*out_hi = h;
|
457
457
|
}
|
458
458
|
|
459
|
+
static
|
459
460
|
void blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks,
|
460
461
|
const uint32_t key[8], uint64_t counter,
|
461
462
|
bool increment_counter, uint8_t flags,
|
@@ -1704,7 +1704,7 @@ blake3_hash_many_sse2:
|
|
1704
1704
|
pshufd xmm15, xmm11, 0x93
|
1705
1705
|
shl rax, 0x20
|
1706
1706
|
or rax, 0x40
|
1707
|
-
|
1707
|
+
movq xmm3, rax
|
1708
1708
|
movdqa xmmword ptr [rsp+0x20], xmm3
|
1709
1709
|
movaps xmm3, xmmword ptr [rsp]
|
1710
1710
|
movaps xmm11, xmmword ptr [rsp+0x10]
|
@@ -1917,7 +1917,7 @@ blake3_hash_many_sse2:
|
|
1917
1917
|
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
|
1918
1918
|
shl rax, 32
|
1919
1919
|
or rax, 64
|
1920
|
-
|
1920
|
+
movq xmm12, rax
|
1921
1921
|
movdqa xmm3, xmm13
|
1922
1922
|
punpcklqdq xmm3, xmm12
|
1923
1923
|
movups xmm4, xmmword ptr [r8+rdx-0x40]
|
@@ -1715,7 +1715,7 @@ blake3_hash_many_sse2:
|
|
1715
1715
|
pshufd xmm15, xmm11, 0x93
|
1716
1716
|
shl rax, 0x20
|
1717
1717
|
or rax, 0x40
|
1718
|
-
|
1718
|
+
movq xmm3, rax
|
1719
1719
|
movdqa xmmword ptr [rsp+0x20], xmm3
|
1720
1720
|
movaps xmm3, xmmword ptr [rsp]
|
1721
1721
|
movaps xmm11, xmmword ptr [rsp+0x10]
|
@@ -1928,7 +1928,7 @@ blake3_hash_many_sse2:
|
|
1928
1928
|
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
|
1929
1929
|
shl rax, 32
|
1930
1930
|
or rax, 64
|
1931
|
-
|
1931
|
+
movq xmm12, rax
|
1932
1932
|
movdqa xmm3, xmm13
|
1933
1933
|
punpcklqdq xmm3, xmm12
|
1934
1934
|
movups xmm4, xmmword ptr [r8+rdx-0x40]
|
@@ -2137,10 +2137,10 @@ _blake3_compress_in_place_sse2:
|
|
2137
2137
|
por xmm9, xmm8
|
2138
2138
|
movdqa xmm8, xmm7
|
2139
2139
|
punpcklqdq xmm8, xmm5
|
2140
|
-
movdqa
|
2140
|
+
movdqa xmm14, xmm6
|
2141
2141
|
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
|
2142
|
-
pand
|
2143
|
-
por xmm8,
|
2142
|
+
pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip]
|
2143
|
+
por xmm8, xmm14
|
2144
2144
|
pshufd xmm8, xmm8, 0x78
|
2145
2145
|
punpckhdq xmm5, xmm7
|
2146
2146
|
punpckldq xmm6, xmm5
|
@@ -2268,10 +2268,10 @@ blake3_compress_xof_sse2:
|
|
2268
2268
|
por xmm9, xmm8
|
2269
2269
|
movdqa xmm8, xmm7
|
2270
2270
|
punpcklqdq xmm8, xmm5
|
2271
|
-
movdqa
|
2271
|
+
movdqa xmm14, xmm6
|
2272
2272
|
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
|
2273
|
-
pand
|
2274
|
-
por xmm8,
|
2273
|
+
pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip]
|
2274
|
+
por xmm8, xmm14
|
2275
2275
|
pshufd xmm8, xmm8, 0x78
|
2276
2276
|
punpckhdq xmm5, xmm7
|
2277
2277
|
punpckldq xmm6, xmm5
|
@@ -2054,8 +2054,8 @@ _blake3_compress_in_place_sse2 PROC
|
|
2054
2054
|
movzx r8d, r8b
|
2055
2055
|
shl rax, 32
|
2056
2056
|
add r8, rax
|
2057
|
-
|
2058
|
-
|
2057
|
+
movd xmm3, r9
|
2058
|
+
movd xmm4, r8
|
2059
2059
|
punpcklqdq xmm3, xmm4
|
2060
2060
|
movups xmm4, xmmword ptr [rdx]
|
2061
2061
|
movups xmm5, xmmword ptr [rdx+10H]
|
@@ -2139,10 +2139,10 @@ _blake3_compress_in_place_sse2 PROC
|
|
2139
2139
|
por xmm9, xmm8
|
2140
2140
|
movdqa xmm8, xmm7
|
2141
2141
|
punpcklqdq xmm8, xmm5
|
2142
|
-
movdqa
|
2142
|
+
movdqa xmm14, xmm6
|
2143
2143
|
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
|
2144
|
-
pand
|
2145
|
-
por xmm8,
|
2144
|
+
pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK]
|
2145
|
+
por xmm8, xmm14
|
2146
2146
|
pshufd xmm8, xmm8, 78H
|
2147
2147
|
punpckhdq xmm5, xmm7
|
2148
2148
|
punpckldq xmm6, xmm5
|
@@ -2186,8 +2186,8 @@ _blake3_compress_xof_sse2 PROC
|
|
2186
2186
|
mov r10, qword ptr [rsp+0A8H]
|
2187
2187
|
shl rax, 32
|
2188
2188
|
add r8, rax
|
2189
|
-
|
2190
|
-
|
2189
|
+
movd xmm3, r9
|
2190
|
+
movd xmm4, r8
|
2191
2191
|
punpcklqdq xmm3, xmm4
|
2192
2192
|
movups xmm4, xmmword ptr [rdx]
|
2193
2193
|
movups xmm5, xmmword ptr [rdx+10H]
|
@@ -2271,10 +2271,10 @@ _blake3_compress_xof_sse2 PROC
|
|
2271
2271
|
por xmm9, xmm8
|
2272
2272
|
movdqa xmm8, xmm7
|
2273
2273
|
punpcklqdq xmm8, xmm5
|
2274
|
-
movdqa
|
2274
|
+
movdqa xmm14, xmm6
|
2275
2275
|
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
|
2276
|
-
pand
|
2277
|
-
por xmm8,
|
2276
|
+
pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK]
|
2277
|
+
por xmm8, xmm14
|
2278
2278
|
pshufd xmm8, xmm8, 78H
|
2279
2279
|
punpckhdq xmm5, xmm7
|
2280
2280
|
punpckldq xmm6, xmm5
|
@@ -429,7 +429,7 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
|
|
429
429
|
out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
|
430
430
|
out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
|
431
431
|
for (size_t i = 0; i < 4; ++i) {
|
432
|
-
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
432
|
+
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
|
433
433
|
}
|
434
434
|
transpose_vecs(&out[0]);
|
435
435
|
transpose_vecs(&out[4]);
|
@@ -442,14 +442,15 @@ INLINE void load_counters(uint64_t counter, bool increment_counter,
|
|
442
442
|
const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
|
443
443
|
const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
|
444
444
|
const __m128i add1 = _mm_and_si128(mask, add0);
|
445
|
-
__m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1);
|
445
|
+
__m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
|
446
446
|
__m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
|
447
447
|
_mm_xor_si128( l, _mm_set1_epi32(0x80000000)));
|
448
|
-
__m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry);
|
448
|
+
__m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
|
449
449
|
*out_lo = l;
|
450
450
|
*out_hi = h;
|
451
451
|
}
|
452
452
|
|
453
|
+
static
|
453
454
|
void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks,
|
454
455
|
const uint32_t key[8], uint64_t counter,
|
455
456
|
bool increment_counter, uint8_t flags,
|
@@ -1817,8 +1817,8 @@ _blake3_compress_in_place_sse41 PROC
|
|
1817
1817
|
movzx r8d, r8b
|
1818
1818
|
shl rax, 32
|
1819
1819
|
add r8, rax
|
1820
|
-
|
1821
|
-
|
1820
|
+
movd xmm3, r9
|
1821
|
+
movd xmm4, r8
|
1822
1822
|
punpcklqdq xmm3, xmm4
|
1823
1823
|
movups xmm4, xmmword ptr [rdx]
|
1824
1824
|
movups xmm5, xmmword ptr [rdx+10H]
|
@@ -1938,8 +1938,8 @@ _blake3_compress_xof_sse41 PROC
|
|
1938
1938
|
mov r10, qword ptr [rsp+0A8H]
|
1939
1939
|
shl rax, 32
|
1940
1940
|
add r8, rax
|
1941
|
-
|
1942
|
-
|
1941
|
+
movd xmm3, r9
|
1942
|
+
movd xmm4, r8
|
1943
1943
|
punpcklqdq xmm3, xmm4
|
1944
1944
|
movups xmm4, xmmword ptr [rdx]
|
1945
1945
|
movups xmm5, xmmword ptr [rdx+10H]
|
@@ -34,10 +34,12 @@ def check_supported_flags(flags, obj_if_enabled, def_if_disabled)
|
|
34
34
|
end
|
35
35
|
end
|
36
36
|
|
37
|
-
|
38
|
-
check_supported_flags("-
|
39
|
-
check_supported_flags("-
|
40
|
-
check_supported_flags("-
|
37
|
+
unless RUBY_PLATFORM.include? 'arm64' or RUBY_PLATFORM.include? 'aarch64'
|
38
|
+
check_supported_flags("-msse2", "blake3_sse2.o", "-DBLAKE3_NO_SSE2")
|
39
|
+
check_supported_flags("-msse4.1", "blake3_sse41.o", "-DBLAKE3_NO_SSE41")
|
40
|
+
check_supported_flags("-mavx2", "blake3_avx2.o", "-DBLAKE3_NO_AVX2")
|
41
|
+
check_supported_flags("-mavx512f -mavx512vl -mavx512bw", "blake3_avx512.o", "-DBLAKE3_NO_AVX512")
|
42
|
+
end
|
41
43
|
|
42
44
|
if have_header("arm_neon.h")
|
43
45
|
$objs << "blake3_neon.o"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: digest-blake3
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 1.3.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Will Bryant
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-12-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -100,7 +100,7 @@ homepage: https://github.com/willbryant/digest-blake3
|
|
100
100
|
licenses:
|
101
101
|
- MIT
|
102
102
|
metadata: {}
|
103
|
-
post_install_message:
|
103
|
+
post_install_message:
|
104
104
|
rdoc_options: []
|
105
105
|
require_paths:
|
106
106
|
- lib
|
@@ -116,8 +116,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
116
116
|
- !ruby/object:Gem::Version
|
117
117
|
version: '0'
|
118
118
|
requirements: []
|
119
|
-
rubygems_version: 3.
|
120
|
-
signing_key:
|
119
|
+
rubygems_version: 3.1.6
|
120
|
+
signing_key:
|
121
121
|
specification_version: 4
|
122
122
|
summary: BLAKE3 for Ruby
|
123
123
|
test_files: []
|