digest-blake3 1.2.0.0 → 1.3.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -4
- data/ext/digest/blake3/blake3.c +7 -2
- data/ext/digest/blake3/blake3.h +2 -1
- data/ext/digest/blake3/blake3_avx2.c +4 -3
- data/ext/digest/blake3/blake3_avx512.c +26 -10
- data/ext/digest/blake3/blake3_dispatch.c +3 -3
- data/ext/digest/blake3/blake3_impl.h +2 -3
- data/ext/digest/blake3/blake3_sse2.c +5 -4
- data/ext/digest/blake3/blake3_sse41.c +4 -3
- data/ext/digest/blake3/extconf.rb +6 -4
- data/lib/digest/blake3/version.rb +1 -1
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6e9f7d8e4619bac26fee8eceb10b221935755588c05f333e53275080c61b83a5
|
4
|
+
data.tar.gz: fecaddd526e8bf374d675e80d49aefbcbed12dc491a6be50ea9beea335b943fd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6ecd8cdc8f5b320f152e4d89c263b68803f63ec609235965eebc11a8093d146bb7223b86302c24c0dc65abb69cbf6040e5635704fb07caeb115c8f76effe38c2
|
7
|
+
data.tar.gz: ec642c10fb95e51620ad78fa2915d467dc665b7c1683f8f30c56ab9ab5560a98983e6afdd4fca1c18d2088f0bd0e58248ea30a05230bbed48ff95d74851cd056
|
data/Gemfile.lock
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
digest-blake3 (1.2.0.0)
|
4
|
+
digest-blake3 (1.3.3.1)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
8
8
|
specs:
|
9
|
-
minitest (5.
|
10
|
-
rake (13.0.
|
9
|
+
minitest (5.16.3)
|
10
|
+
rake (13.0.6)
|
11
11
|
|
12
12
|
PLATFORMS
|
13
13
|
ruby
|
@@ -19,4 +19,4 @@ DEPENDENCIES
|
|
19
19
|
rake
|
20
20
|
|
21
21
|
BUNDLED WITH
|
22
|
-
|
22
|
+
2.3.1
|
data/ext/digest/blake3/blake3.c
CHANGED
@@ -246,7 +246,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
|
|
246
246
|
|
247
247
|
// The wide helper function returns (writes out) an array of chaining values
|
248
248
|
// and returns the length of that array. The number of chaining values returned
|
249
|
-
// is the
|
249
|
+
// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
|
250
250
|
// if the input is shorter than that many chunks. The reason for maintaining a
|
251
251
|
// wide array of chaining values going back up the tree, is to allow the
|
252
252
|
// implementation to hash as many parents in parallel as possible.
|
@@ -254,7 +254,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
|
|
254
254
|
// As a special case when the SIMD degree is 1, this function will still return
|
255
255
|
// at least 2 outputs. This guarantees that this function doesn't perform the
|
256
256
|
// root compression. (If it did, it would use the wrong flags, and also we
|
257
|
-
// wouldn't be able to implement exendable
|
257
|
+
// wouldn't be able to implement exendable output.) Note that this function is
|
258
258
|
// not used when the whole input is only 1 chunk long; that's a different
|
259
259
|
// codepath.
|
260
260
|
//
|
@@ -609,3 +609,8 @@ void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
|
|
609
609
|
}
|
610
610
|
output_root_bytes(&output, seek, out, out_len);
|
611
611
|
}
|
612
|
+
|
613
|
+
void blake3_hasher_reset(blake3_hasher *self) {
|
614
|
+
chunk_state_reset(&self->chunk, self->key, 0);
|
615
|
+
self->cv_stack_len = 0;
|
616
|
+
}
|
data/ext/digest/blake3/blake3.h
CHANGED
@@ -8,7 +8,7 @@
|
|
8
8
|
extern "C" {
|
9
9
|
#endif
|
10
10
|
|
11
|
-
#define BLAKE3_VERSION_STRING "1.
|
11
|
+
#define BLAKE3_VERSION_STRING "1.3.3"
|
12
12
|
#define BLAKE3_KEY_LEN 32
|
13
13
|
#define BLAKE3_OUT_LEN 32
|
14
14
|
#define BLAKE3_BLOCK_LEN 64
|
@@ -51,6 +51,7 @@ void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
|
|
51
51
|
size_t out_len);
|
52
52
|
void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
|
53
53
|
uint8_t *out, size_t out_len);
|
54
|
+
void blake3_hasher_reset(blake3_hasher *self);
|
54
55
|
|
55
56
|
#ifdef __cplusplus
|
56
57
|
}
|
@@ -208,7 +208,7 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
|
|
208
208
|
out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
|
209
209
|
out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
|
210
210
|
for (size_t i = 0; i < 8; ++i) {
|
211
|
-
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
211
|
+
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
|
212
212
|
}
|
213
213
|
transpose_vecs(&out[0]);
|
214
214
|
transpose_vecs(&out[8]);
|
@@ -219,14 +219,15 @@ INLINE void load_counters(uint64_t counter, bool increment_counter,
|
|
219
219
|
const __m256i mask = _mm256_set1_epi32(-(int32_t)increment_counter);
|
220
220
|
const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
|
221
221
|
const __m256i add1 = _mm256_and_si256(mask, add0);
|
222
|
-
__m256i l = _mm256_add_epi32(_mm256_set1_epi32(counter), add1);
|
222
|
+
__m256i l = _mm256_add_epi32(_mm256_set1_epi32((int32_t)counter), add1);
|
223
223
|
__m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)),
|
224
224
|
_mm256_xor_si256( l, _mm256_set1_epi32(0x80000000)));
|
225
|
-
__m256i h = _mm256_sub_epi32(_mm256_set1_epi32(counter >> 32), carry);
|
225
|
+
__m256i h = _mm256_sub_epi32(_mm256_set1_epi32((int32_t)(counter >> 32)), carry);
|
226
226
|
*out_lo = l;
|
227
227
|
*out_hi = h;
|
228
228
|
}
|
229
229
|
|
230
|
+
static
|
230
231
|
void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks,
|
231
232
|
const uint32_t key[8], uint64_t counter,
|
232
233
|
bool increment_counter, uint8_t flags,
|
@@ -468,7 +468,7 @@ INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
|
|
468
468
|
out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
|
469
469
|
out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
|
470
470
|
for (size_t i = 0; i < 4; ++i) {
|
471
|
-
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
471
|
+
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
|
472
472
|
}
|
473
473
|
transpose_vecs_128(&out[0]);
|
474
474
|
transpose_vecs_128(&out[4]);
|
@@ -488,6 +488,7 @@ INLINE void load_counters4(uint64_t counter, bool increment_counter,
|
|
488
488
|
*out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32));
|
489
489
|
}
|
490
490
|
|
491
|
+
static
|
491
492
|
void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks,
|
492
493
|
const uint32_t key[8], uint64_t counter,
|
493
494
|
bool increment_counter, uint8_t flags,
|
@@ -724,7 +725,7 @@ INLINE void transpose_msg_vecs8(const uint8_t *const *inputs,
|
|
724
725
|
out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
|
725
726
|
out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
|
726
727
|
for (size_t i = 0; i < 8; ++i) {
|
727
|
-
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
728
|
+
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
|
728
729
|
}
|
729
730
|
transpose_vecs_256(&out[0]);
|
730
731
|
transpose_vecs_256(&out[8]);
|
@@ -742,6 +743,7 @@ INLINE void load_counters8(uint64_t counter, bool increment_counter,
|
|
742
743
|
*out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32));
|
743
744
|
}
|
744
745
|
|
746
|
+
static
|
745
747
|
void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks,
|
746
748
|
const uint32_t key[8], uint64_t counter,
|
747
749
|
bool increment_counter, uint8_t flags,
|
@@ -1037,7 +1039,7 @@ INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
|
|
1037
1039
|
out[14] = loadu_512(&inputs[14][block_offset]);
|
1038
1040
|
out[15] = loadu_512(&inputs[15][block_offset]);
|
1039
1041
|
for (size_t i = 0; i < 16; ++i) {
|
1040
|
-
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
1042
|
+
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
|
1041
1043
|
}
|
1042
1044
|
transpose_vecs_512(out);
|
1043
1045
|
}
|
@@ -1045,15 +1047,29 @@ INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
|
|
1045
1047
|
INLINE void load_counters16(uint64_t counter, bool increment_counter,
|
1046
1048
|
__m512i *out_lo, __m512i *out_hi) {
|
1047
1049
|
const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter);
|
1048
|
-
const __m512i
|
1049
|
-
const __m512i
|
1050
|
-
__m512i
|
1051
|
-
|
1052
|
-
|
1053
|
-
|
1054
|
-
|
1050
|
+
const __m512i deltas = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
1051
|
+
const __m512i masked_deltas = _mm512_and_si512(deltas, mask);
|
1052
|
+
const __m512i low_words = _mm512_add_epi32(
|
1053
|
+
_mm512_set1_epi32((int32_t)counter),
|
1054
|
+
masked_deltas);
|
1055
|
+
// The carry bit is 1 if the high bit of the word was 1 before addition and is
|
1056
|
+
// 0 after.
|
1057
|
+
// NOTE: It would be a bit more natural to use _mm512_cmp_epu32_mask to
|
1058
|
+
// compute the carry bits here, and originally we did, but that intrinsic is
|
1059
|
+
// broken under GCC 5.4. See https://github.com/BLAKE3-team/BLAKE3/issues/271.
|
1060
|
+
const __m512i carries = _mm512_srli_epi32(
|
1061
|
+
_mm512_andnot_si512(
|
1062
|
+
low_words, // 0 after (gets inverted by andnot)
|
1063
|
+
_mm512_set1_epi32((int32_t)counter)), // and 1 before
|
1064
|
+
31);
|
1065
|
+
const __m512i high_words = _mm512_add_epi32(
|
1066
|
+
_mm512_set1_epi32((int32_t)(counter >> 32)),
|
1067
|
+
carries);
|
1068
|
+
*out_lo = low_words;
|
1069
|
+
*out_hi = high_words;
|
1055
1070
|
}
|
1056
1071
|
|
1072
|
+
static
|
1057
1073
|
void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
|
1058
1074
|
const uint32_t key[8], uint64_t counter,
|
1059
1075
|
bool increment_counter, uint8_t flags,
|
@@ -10,14 +10,14 @@
|
|
10
10
|
#elif defined(__GNUC__)
|
11
11
|
#include <immintrin.h>
|
12
12
|
#else
|
13
|
-
#
|
13
|
+
#undef IS_X86 /* Unimplemented! */
|
14
14
|
#endif
|
15
15
|
#endif
|
16
16
|
|
17
17
|
#define MAYBE_UNUSED(x) (void)((x))
|
18
18
|
|
19
19
|
#if defined(IS_X86)
|
20
|
-
static uint64_t xgetbv() {
|
20
|
+
static uint64_t xgetbv(void) {
|
21
21
|
#if defined(_MSC_VER)
|
22
22
|
return _xgetbv(0);
|
23
23
|
#else
|
@@ -82,7 +82,7 @@ static /* Allow the variable to be controlled manually for testing */
|
|
82
82
|
static
|
83
83
|
#endif
|
84
84
|
enum cpu_feature
|
85
|
-
get_cpu_features() {
|
85
|
+
get_cpu_features(void) {
|
86
86
|
|
87
87
|
if (g_cpu_features != UNDEFINED) {
|
88
88
|
return g_cpu_features;
|
@@ -46,7 +46,6 @@ enum blake3_flags {
|
|
46
46
|
#if defined(_MSC_VER)
|
47
47
|
#include <intrin.h>
|
48
48
|
#endif
|
49
|
-
#include <immintrin.h>
|
50
49
|
#endif
|
51
50
|
|
52
51
|
#if !defined(BLAKE3_USE_NEON)
|
@@ -96,11 +95,11 @@ static unsigned int highest_one(uint64_t x) {
|
|
96
95
|
#elif defined(_MSC_VER) && defined(IS_X86_32)
|
97
96
|
if(x >> 32) {
|
98
97
|
unsigned long index;
|
99
|
-
_BitScanReverse(&index, x >> 32);
|
98
|
+
_BitScanReverse(&index, (unsigned long)(x >> 32));
|
100
99
|
return 32 + index;
|
101
100
|
} else {
|
102
101
|
unsigned long index;
|
103
|
-
_BitScanReverse(&index, x);
|
102
|
+
_BitScanReverse(&index, (unsigned long)x);
|
104
103
|
return index;
|
105
104
|
}
|
106
105
|
#else
|
@@ -78,7 +78,7 @@ INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
|
|
78
78
|
*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
|
79
79
|
}
|
80
80
|
|
81
|
-
INLINE __m128i blend_epi16(__m128i a, __m128i b, const
|
81
|
+
INLINE __m128i blend_epi16(__m128i a, __m128i b, const int16_t imm8) {
|
82
82
|
const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
|
83
83
|
__m128i mask = _mm_set1_epi16(imm8);
|
84
84
|
mask = _mm_and_si128(mask, bits);
|
@@ -435,7 +435,7 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
|
|
435
435
|
out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
|
436
436
|
out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
|
437
437
|
for (size_t i = 0; i < 4; ++i) {
|
438
|
-
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
438
|
+
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
|
439
439
|
}
|
440
440
|
transpose_vecs(&out[0]);
|
441
441
|
transpose_vecs(&out[4]);
|
@@ -448,14 +448,15 @@ INLINE void load_counters(uint64_t counter, bool increment_counter,
|
|
448
448
|
const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
|
449
449
|
const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
|
450
450
|
const __m128i add1 = _mm_and_si128(mask, add0);
|
451
|
-
__m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1);
|
451
|
+
__m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
|
452
452
|
__m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
|
453
453
|
_mm_xor_si128( l, _mm_set1_epi32(0x80000000)));
|
454
|
-
__m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry);
|
454
|
+
__m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
|
455
455
|
*out_lo = l;
|
456
456
|
*out_hi = h;
|
457
457
|
}
|
458
458
|
|
459
|
+
static
|
459
460
|
void blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks,
|
460
461
|
const uint32_t key[8], uint64_t counter,
|
461
462
|
bool increment_counter, uint8_t flags,
|
@@ -429,7 +429,7 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
|
|
429
429
|
out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
|
430
430
|
out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
|
431
431
|
for (size_t i = 0; i < 4; ++i) {
|
432
|
-
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
432
|
+
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
|
433
433
|
}
|
434
434
|
transpose_vecs(&out[0]);
|
435
435
|
transpose_vecs(&out[4]);
|
@@ -442,14 +442,15 @@ INLINE void load_counters(uint64_t counter, bool increment_counter,
|
|
442
442
|
const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
|
443
443
|
const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
|
444
444
|
const __m128i add1 = _mm_and_si128(mask, add0);
|
445
|
-
__m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1);
|
445
|
+
__m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
|
446
446
|
__m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
|
447
447
|
_mm_xor_si128( l, _mm_set1_epi32(0x80000000)));
|
448
|
-
__m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry);
|
448
|
+
__m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
|
449
449
|
*out_lo = l;
|
450
450
|
*out_hi = h;
|
451
451
|
}
|
452
452
|
|
453
|
+
static
|
453
454
|
void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks,
|
454
455
|
const uint32_t key[8], uint64_t counter,
|
455
456
|
bool increment_counter, uint8_t flags,
|
@@ -34,10 +34,12 @@ def check_supported_flags(flags, obj_if_enabled, def_if_disabled)
|
|
34
34
|
end
|
35
35
|
end
|
36
36
|
|
37
|
-
|
38
|
-
check_supported_flags("-
|
39
|
-
check_supported_flags("-
|
40
|
-
check_supported_flags("-
|
37
|
+
unless RUBY_PLATFORM.include? 'arm64' or RUBY_PLATFORM.include? 'aarch64'
|
38
|
+
check_supported_flags("-msse2", "blake3_sse2.o", "-DBLAKE3_NO_SSE2")
|
39
|
+
check_supported_flags("-msse4.1", "blake3_sse41.o", "-DBLAKE3_NO_SSE41")
|
40
|
+
check_supported_flags("-mavx2", "blake3_avx2.o", "-DBLAKE3_NO_AVX2")
|
41
|
+
check_supported_flags("-mavx512f -mavx512vl -mavx512bw", "blake3_avx512.o", "-DBLAKE3_NO_AVX512")
|
42
|
+
end
|
41
43
|
|
42
44
|
if have_header("arm_neon.h")
|
43
45
|
$objs << "blake3_neon.o"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: digest-blake3
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.2.0.0
|
4
|
+
version: 1.3.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Will Bryant
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-12-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -100,7 +100,7 @@ homepage: https://github.com/willbryant/digest-blake3
|
|
100
100
|
licenses:
|
101
101
|
- MIT
|
102
102
|
metadata: {}
|
103
|
-
post_install_message:
|
103
|
+
post_install_message:
|
104
104
|
rdoc_options: []
|
105
105
|
require_paths:
|
106
106
|
- lib
|
@@ -116,8 +116,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
116
116
|
- !ruby/object:Gem::Version
|
117
117
|
version: '0'
|
118
118
|
requirements: []
|
119
|
-
rubygems_version: 3.
|
120
|
-
signing_key:
|
119
|
+
rubygems_version: 3.1.6
|
120
|
+
signing_key:
|
121
121
|
specification_version: 4
|
122
122
|
summary: BLAKE3 for Ruby
|
123
123
|
test_files: []
|