digest-blake3 1.2.0.0 → 1.4.0.0
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -4
- data/ext/digest/blake3/blake3.c +7 -2
- data/ext/digest/blake3/blake3.h +2 -1
- data/ext/digest/blake3/blake3_avx2.c +4 -3
- data/ext/digest/blake3/blake3_avx512.c +26 -10
- data/ext/digest/blake3/blake3_dispatch.c +3 -3
- data/ext/digest/blake3/blake3_impl.h +2 -3
- data/ext/digest/blake3/blake3_sse2.c +5 -4
- data/ext/digest/blake3/blake3_sse41.c +4 -3
- data/ext/digest/blake3/extconf.rb +7 -5
- data/lib/digest/blake3/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 600afca6f08145f3e28b49fbe757b661368d58dc8ec20e1778a915407dcc660a
|
4
|
+
data.tar.gz: 1cd455e9caf97fd0f514623ba6b9d7f74249071af8cb8b5377ba10de91c5eb34
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d515228fab5f92576d9b1f67d66ffff97623f154dcab9a1dcb140b9e69884325797991d1f87d7f5cb26cb6397a86e989a1c1f2163b478641efdc4b85b5772026
|
7
|
+
data.tar.gz: ed75418dda098a8700554b871189c9995e1e6e4969d86cc8d9afd46d8439ecf985fb111f1a6d6adb9173f68cb33c7c261b35b0820439c48a92e354da2f662e35
|
data/Gemfile.lock
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
digest-blake3 (1.
|
4
|
+
digest-blake3 (1.3.3.1)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
8
8
|
specs:
|
9
|
-
minitest (5.
|
10
|
-
rake (13.0.
|
9
|
+
minitest (5.16.3)
|
10
|
+
rake (13.0.6)
|
11
11
|
|
12
12
|
PLATFORMS
|
13
13
|
ruby
|
@@ -19,4 +19,4 @@ DEPENDENCIES
|
|
19
19
|
rake
|
20
20
|
|
21
21
|
BUNDLED WITH
|
22
|
-
|
22
|
+
2.3.1
|
data/ext/digest/blake3/blake3.c
CHANGED
@@ -246,7 +246,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
|
|
246
246
|
|
247
247
|
// The wide helper function returns (writes out) an array of chaining values
|
248
248
|
// and returns the length of that array. The number of chaining values returned
|
249
|
-
// is the
|
249
|
+
// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
|
250
250
|
// if the input is shorter than that many chunks. The reason for maintaining a
|
251
251
|
// wide array of chaining values going back up the tree, is to allow the
|
252
252
|
// implementation to hash as many parents in parallel as possible.
|
@@ -254,7 +254,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
|
|
254
254
|
// As a special case when the SIMD degree is 1, this function will still return
|
255
255
|
// at least 2 outputs. This guarantees that this function doesn't perform the
|
256
256
|
// root compression. (If it did, it would use the wrong flags, and also we
|
257
|
-
// wouldn't be able to implement exendable
|
257
|
+
// wouldn't be able to implement extendable output.) Note that this function is
|
258
258
|
// not used when the whole input is only 1 chunk long; that's a different
|
259
259
|
// codepath.
|
260
260
|
//
|
@@ -609,3 +609,8 @@ void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
|
|
609
609
|
}
|
610
610
|
output_root_bytes(&output, seek, out, out_len);
|
611
611
|
}
|
612
|
+
|
613
|
+
void blake3_hasher_reset(blake3_hasher *self) {
|
614
|
+
chunk_state_reset(&self->chunk, self->key, 0);
|
615
|
+
self->cv_stack_len = 0;
|
616
|
+
}
|
data/ext/digest/blake3/blake3.h
CHANGED
@@ -8,7 +8,7 @@
|
|
8
8
|
extern "C" {
|
9
9
|
#endif
|
10
10
|
|
11
|
-
#define BLAKE3_VERSION_STRING "1.
|
11
|
+
#define BLAKE3_VERSION_STRING "1.3.3"
|
12
12
|
#define BLAKE3_KEY_LEN 32
|
13
13
|
#define BLAKE3_OUT_LEN 32
|
14
14
|
#define BLAKE3_BLOCK_LEN 64
|
@@ -51,6 +51,7 @@ void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
|
|
51
51
|
size_t out_len);
|
52
52
|
void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
|
53
53
|
uint8_t *out, size_t out_len);
|
54
|
+
void blake3_hasher_reset(blake3_hasher *self);
|
54
55
|
|
55
56
|
#ifdef __cplusplus
|
56
57
|
}
|
data/ext/digest/blake3/blake3_avx2.c
CHANGED
@@ -208,7 +208,7 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
|
|
208
208
|
out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
|
209
209
|
out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
|
210
210
|
for (size_t i = 0; i < 8; ++i) {
|
211
|
-
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
211
|
+
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
|
212
212
|
}
|
213
213
|
transpose_vecs(&out[0]);
|
214
214
|
transpose_vecs(&out[8]);
|
@@ -219,14 +219,15 @@ INLINE void load_counters(uint64_t counter, bool increment_counter,
|
|
219
219
|
const __m256i mask = _mm256_set1_epi32(-(int32_t)increment_counter);
|
220
220
|
const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
|
221
221
|
const __m256i add1 = _mm256_and_si256(mask, add0);
|
222
|
-
__m256i l = _mm256_add_epi32(_mm256_set1_epi32(counter), add1);
|
222
|
+
__m256i l = _mm256_add_epi32(_mm256_set1_epi32((int32_t)counter), add1);
|
223
223
|
__m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)),
|
224
224
|
_mm256_xor_si256( l, _mm256_set1_epi32(0x80000000)));
|
225
|
-
__m256i h = _mm256_sub_epi32(_mm256_set1_epi32(counter >> 32), carry);
|
225
|
+
__m256i h = _mm256_sub_epi32(_mm256_set1_epi32((int32_t)(counter >> 32)), carry);
|
226
226
|
*out_lo = l;
|
227
227
|
*out_hi = h;
|
228
228
|
}
|
229
229
|
|
230
|
+
static
|
230
231
|
void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks,
|
231
232
|
const uint32_t key[8], uint64_t counter,
|
232
233
|
bool increment_counter, uint8_t flags,
|
data/ext/digest/blake3/blake3_avx512.c
CHANGED
@@ -468,7 +468,7 @@ INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
|
|
468
468
|
out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
|
469
469
|
out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
|
470
470
|
for (size_t i = 0; i < 4; ++i) {
|
471
|
-
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
471
|
+
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
|
472
472
|
}
|
473
473
|
transpose_vecs_128(&out[0]);
|
474
474
|
transpose_vecs_128(&out[4]);
|
@@ -488,6 +488,7 @@ INLINE void load_counters4(uint64_t counter, bool increment_counter,
|
|
488
488
|
*out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32));
|
489
489
|
}
|
490
490
|
|
491
|
+
static
|
491
492
|
void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks,
|
492
493
|
const uint32_t key[8], uint64_t counter,
|
493
494
|
bool increment_counter, uint8_t flags,
|
@@ -724,7 +725,7 @@ INLINE void transpose_msg_vecs8(const uint8_t *const *inputs,
|
|
724
725
|
out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
|
725
726
|
out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
|
726
727
|
for (size_t i = 0; i < 8; ++i) {
|
727
|
-
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
728
|
+
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
|
728
729
|
}
|
729
730
|
transpose_vecs_256(&out[0]);
|
730
731
|
transpose_vecs_256(&out[8]);
|
@@ -742,6 +743,7 @@ INLINE void load_counters8(uint64_t counter, bool increment_counter,
|
|
742
743
|
*out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32));
|
743
744
|
}
|
744
745
|
|
746
|
+
static
|
745
747
|
void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks,
|
746
748
|
const uint32_t key[8], uint64_t counter,
|
747
749
|
bool increment_counter, uint8_t flags,
|
@@ -1037,7 +1039,7 @@ INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
|
|
1037
1039
|
out[14] = loadu_512(&inputs[14][block_offset]);
|
1038
1040
|
out[15] = loadu_512(&inputs[15][block_offset]);
|
1039
1041
|
for (size_t i = 0; i < 16; ++i) {
|
1040
|
-
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
1042
|
+
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
|
1041
1043
|
}
|
1042
1044
|
transpose_vecs_512(out);
|
1043
1045
|
}
|
@@ -1045,15 +1047,29 @@ INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
|
|
1045
1047
|
INLINE void load_counters16(uint64_t counter, bool increment_counter,
|
1046
1048
|
__m512i *out_lo, __m512i *out_hi) {
|
1047
1049
|
const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter);
|
1048
|
-
const __m512i
|
1049
|
-
const __m512i
|
1050
|
-
__m512i
|
1051
|
-
|
1052
|
-
|
1053
|
-
|
1054
|
-
|
1050
|
+
const __m512i deltas = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
1051
|
+
const __m512i masked_deltas = _mm512_and_si512(deltas, mask);
|
1052
|
+
const __m512i low_words = _mm512_add_epi32(
|
1053
|
+
_mm512_set1_epi32((int32_t)counter),
|
1054
|
+
masked_deltas);
|
1055
|
+
// The carry bit is 1 if the high bit of the word was 1 before addition and is
|
1056
|
+
// 0 after.
|
1057
|
+
// NOTE: It would be a bit more natural to use _mm512_cmp_epu32_mask to
|
1058
|
+
// compute the carry bits here, and originally we did, but that intrinsic is
|
1059
|
+
// broken under GCC 5.4. See https://github.com/BLAKE3-team/BLAKE3/issues/271.
|
1060
|
+
const __m512i carries = _mm512_srli_epi32(
|
1061
|
+
_mm512_andnot_si512(
|
1062
|
+
low_words, // 0 after (gets inverted by andnot)
|
1063
|
+
_mm512_set1_epi32((int32_t)counter)), // and 1 before
|
1064
|
+
31);
|
1065
|
+
const __m512i high_words = _mm512_add_epi32(
|
1066
|
+
_mm512_set1_epi32((int32_t)(counter >> 32)),
|
1067
|
+
carries);
|
1068
|
+
*out_lo = low_words;
|
1069
|
+
*out_hi = high_words;
|
1055
1070
|
}
|
1056
1071
|
|
1072
|
+
static
|
1057
1073
|
void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
|
1058
1074
|
const uint32_t key[8], uint64_t counter,
|
1059
1075
|
bool increment_counter, uint8_t flags,
|
data/ext/digest/blake3/blake3_dispatch.c
CHANGED
@@ -10,14 +10,14 @@
|
|
10
10
|
#elif defined(__GNUC__)
|
11
11
|
#include <immintrin.h>
|
12
12
|
#else
|
13
|
-
#
|
13
|
+
#undef IS_X86 /* Unimplemented! */
|
14
14
|
#endif
|
15
15
|
#endif
|
16
16
|
|
17
17
|
#define MAYBE_UNUSED(x) (void)((x))
|
18
18
|
|
19
19
|
#if defined(IS_X86)
|
20
|
-
static uint64_t xgetbv() {
|
20
|
+
static uint64_t xgetbv(void) {
|
21
21
|
#if defined(_MSC_VER)
|
22
22
|
return _xgetbv(0);
|
23
23
|
#else
|
@@ -82,7 +82,7 @@ static /* Allow the variable to be controlled manually for testing */
|
|
82
82
|
static
|
83
83
|
#endif
|
84
84
|
enum cpu_feature
|
85
|
-
get_cpu_features() {
|
85
|
+
get_cpu_features(void) {
|
86
86
|
|
87
87
|
if (g_cpu_features != UNDEFINED) {
|
88
88
|
return g_cpu_features;
|
data/ext/digest/blake3/blake3_impl.h
CHANGED
@@ -46,7 +46,6 @@ enum blake3_flags {
|
|
46
46
|
#if defined(_MSC_VER)
|
47
47
|
#include <intrin.h>
|
48
48
|
#endif
|
49
|
-
#include <immintrin.h>
|
50
49
|
#endif
|
51
50
|
|
52
51
|
#if !defined(BLAKE3_USE_NEON)
|
@@ -96,11 +95,11 @@ static unsigned int highest_one(uint64_t x) {
|
|
96
95
|
#elif defined(_MSC_VER) && defined(IS_X86_32)
|
97
96
|
if(x >> 32) {
|
98
97
|
unsigned long index;
|
99
|
-
_BitScanReverse(&index, x >> 32);
|
98
|
+
_BitScanReverse(&index, (unsigned long)(x >> 32));
|
100
99
|
return 32 + index;
|
101
100
|
} else {
|
102
101
|
unsigned long index;
|
103
|
-
_BitScanReverse(&index, x);
|
102
|
+
_BitScanReverse(&index, (unsigned long)x);
|
104
103
|
return index;
|
105
104
|
}
|
106
105
|
#else
|
data/ext/digest/blake3/blake3_sse2.c
CHANGED
@@ -78,7 +78,7 @@ INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
|
|
78
78
|
*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
|
79
79
|
}
|
80
80
|
|
81
|
-
INLINE __m128i blend_epi16(__m128i a, __m128i b, const
|
81
|
+
INLINE __m128i blend_epi16(__m128i a, __m128i b, const int16_t imm8) {
|
82
82
|
const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
|
83
83
|
__m128i mask = _mm_set1_epi16(imm8);
|
84
84
|
mask = _mm_and_si128(mask, bits);
|
@@ -435,7 +435,7 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
|
|
435
435
|
out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
|
436
436
|
out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
|
437
437
|
for (size_t i = 0; i < 4; ++i) {
|
438
|
-
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
438
|
+
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
|
439
439
|
}
|
440
440
|
transpose_vecs(&out[0]);
|
441
441
|
transpose_vecs(&out[4]);
|
@@ -448,14 +448,15 @@ INLINE void load_counters(uint64_t counter, bool increment_counter,
|
|
448
448
|
const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
|
449
449
|
const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
|
450
450
|
const __m128i add1 = _mm_and_si128(mask, add0);
|
451
|
-
__m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1);
|
451
|
+
__m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
|
452
452
|
__m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
|
453
453
|
_mm_xor_si128( l, _mm_set1_epi32(0x80000000)));
|
454
|
-
__m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry);
|
454
|
+
__m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
|
455
455
|
*out_lo = l;
|
456
456
|
*out_hi = h;
|
457
457
|
}
|
458
458
|
|
459
|
+
static
|
459
460
|
void blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks,
|
460
461
|
const uint32_t key[8], uint64_t counter,
|
461
462
|
bool increment_counter, uint8_t flags,
|
data/ext/digest/blake3/blake3_sse41.c
CHANGED
@@ -429,7 +429,7 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
|
|
429
429
|
out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
|
430
430
|
out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
|
431
431
|
for (size_t i = 0; i < 4; ++i) {
|
432
|
-
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
432
|
+
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
|
433
433
|
}
|
434
434
|
transpose_vecs(&out[0]);
|
435
435
|
transpose_vecs(&out[4]);
|
@@ -442,14 +442,15 @@ INLINE void load_counters(uint64_t counter, bool increment_counter,
|
|
442
442
|
const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
|
443
443
|
const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
|
444
444
|
const __m128i add1 = _mm_and_si128(mask, add0);
|
445
|
-
__m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1);
|
445
|
+
__m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
|
446
446
|
__m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
|
447
447
|
_mm_xor_si128( l, _mm_set1_epi32(0x80000000)));
|
448
|
-
__m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry);
|
448
|
+
__m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
|
449
449
|
*out_lo = l;
|
450
450
|
*out_hi = h;
|
451
451
|
}
|
452
452
|
|
453
|
+
static
|
453
454
|
void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks,
|
454
455
|
const uint32_t key[8], uint64_t counter,
|
455
456
|
bool increment_counter, uint8_t flags,
|
data/ext/digest/blake3/extconf.rb
CHANGED
@@ -34,17 +34,19 @@ def check_supported_flags(flags, obj_if_enabled, def_if_disabled)
|
|
34
34
|
end
|
35
35
|
end
|
36
36
|
|
37
|
-
|
38
|
-
check_supported_flags("-
|
39
|
-
check_supported_flags("-
|
40
|
-
check_supported_flags("-
|
37
|
+
unless RUBY_PLATFORM.include? 'arm64' or RUBY_PLATFORM.include? 'aarch64'
|
38
|
+
check_supported_flags("-msse2", "blake3_sse2.o", "-DBLAKE3_NO_SSE2")
|
39
|
+
check_supported_flags("-msse4.1", "blake3_sse41.o", "-DBLAKE3_NO_SSE41")
|
40
|
+
check_supported_flags("-mavx2", "blake3_avx2.o", "-DBLAKE3_NO_AVX2")
|
41
|
+
check_supported_flags("-mavx512f -mavx512vl -mavx512bw", "blake3_avx512.o", "-DBLAKE3_NO_AVX512")
|
42
|
+
end
|
41
43
|
|
42
44
|
if have_header("arm_neon.h")
|
43
45
|
$objs << "blake3_neon.o"
|
44
46
|
$defs << "-DBLAKE3_USE_NEON"
|
45
47
|
end
|
46
48
|
|
47
|
-
create_makefile("digest/blake3") do |conf|
|
49
|
+
create_makefile("digest/blake3/blake3") do |conf|
|
48
50
|
# annoyingly, we have to repeat this line from the default output, so that it appears above the
|
49
51
|
# defines we add below and therefore becomes the default target. otherwise running 'make' with
|
50
52
|
# no arguments builds the first of our .o files instead of the library.
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: digest-blake3
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.4.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Will Bryant
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-09-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -116,7 +116,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
116
116
|
- !ruby/object:Gem::Version
|
117
117
|
version: '0'
|
118
118
|
requirements: []
|
119
|
-
rubygems_version: 3.
|
119
|
+
rubygems_version: 3.4.10
|
120
120
|
signing_key:
|
121
121
|
specification_version: 4
|
122
122
|
summary: BLAKE3 for Ruby
|