digest-blake3 0.37.0.1 → 1.3.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -4
- data/ext/digest/blake3/blake3.c +16 -3
- data/ext/digest/blake3/blake3.h +4 -2
- data/ext/digest/blake3/blake3_avx2.c +4 -3
- data/ext/digest/blake3/blake3_avx512.c +26 -10
- data/ext/digest/blake3/blake3_dispatch.c +11 -5
- data/ext/digest/blake3/blake3_impl.h +17 -5
- data/ext/digest/blake3/blake3_neon.c +6 -1
- data/ext/digest/blake3/blake3_sse2.c +5 -4
- data/ext/digest/blake3/blake3_sse2_x86-64_unix.S +2 -2
- data/ext/digest/blake3/blake3_sse2_x86-64_windows_gnu.S +8 -8
- data/ext/digest/blake3/blake3_sse2_x86-64_windows_msvc.asm +10 -10
- data/ext/digest/blake3/blake3_sse41.c +4 -3
- data/ext/digest/blake3/blake3_sse41_x86-64_windows_msvc.asm +4 -4
- data/ext/digest/blake3/extconf.rb +6 -4
- data/lib/digest/blake3/version.rb +1 -1
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6e9f7d8e4619bac26fee8eceb10b221935755588c05f333e53275080c61b83a5
|
4
|
+
data.tar.gz: fecaddd526e8bf374d675e80d49aefbcbed12dc491a6be50ea9beea335b943fd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6ecd8cdc8f5b320f152e4d89c263b68803f63ec609235965eebc11a8093d146bb7223b86302c24c0dc65abb69cbf6040e5635704fb07caeb115c8f76effe38c2
|
7
|
+
data.tar.gz: ec642c10fb95e51620ad78fa2915d467dc665b7c1683f8f30c56ab9ab5560a98983e6afdd4fca1c18d2088f0bd0e58248ea30a05230bbed48ff95d74851cd056
|
data/Gemfile.lock
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
digest-blake3 (
|
4
|
+
digest-blake3 (1.3.3.1)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
8
8
|
specs:
|
9
|
-
minitest (5.
|
10
|
-
rake (13.0.
|
9
|
+
minitest (5.16.3)
|
10
|
+
rake (13.0.6)
|
11
11
|
|
12
12
|
PLATFORMS
|
13
13
|
ruby
|
@@ -19,4 +19,4 @@ DEPENDENCIES
|
|
19
19
|
rake
|
20
20
|
|
21
21
|
BUNDLED WITH
|
22
|
-
|
22
|
+
2.3.1
|
data/ext/digest/blake3/blake3.c
CHANGED
@@ -5,6 +5,8 @@
|
|
5
5
|
#include "blake3.h"
|
6
6
|
#include "blake3_impl.h"
|
7
7
|
|
8
|
+
const char *blake3_version(void) { return BLAKE3_VERSION_STRING; }
|
9
|
+
|
8
10
|
INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8],
|
9
11
|
uint8_t flags) {
|
10
12
|
memcpy(self->cv, key, BLAKE3_KEY_LEN);
|
@@ -244,7 +246,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
|
|
244
246
|
|
245
247
|
// The wide helper function returns (writes out) an array of chaining values
|
246
248
|
// and returns the length of that array. The number of chaining values returned
|
247
|
-
// is the
|
249
|
+
// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
|
248
250
|
// if the input is shorter than that many chunks. The reason for maintaining a
|
249
251
|
// wide array of chaining values going back up the tree, is to allow the
|
250
252
|
// implementation to hash as many parents in parallel as possible.
|
@@ -252,7 +254,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
|
|
252
254
|
// As a special case when the SIMD degree is 1, this function will still return
|
253
255
|
// at least 2 outputs. This guarantees that this function doesn't perform the
|
254
256
|
// root compression. (If it did, it would use the wrong flags, and also we
|
255
|
-
// wouldn't be able to implement extendable
|
257
|
+
// wouldn't be able to implement extendable output.) Note that this function is
|
256
258
|
// not used when the whole input is only 1 chunk long; that's a different
|
257
259
|
// codepath.
|
258
260
|
//
|
@@ -338,12 +340,18 @@ INLINE void compress_subtree_to_parent_node(
|
|
338
340
|
uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
|
339
341
|
size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
|
340
342
|
chunk_counter, flags, cv_array);
|
343
|
+
assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
|
341
344
|
|
342
345
|
// If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
|
343
346
|
// compress_subtree_wide() returns more than 2 chaining values. Condense
|
344
347
|
// them into 2 by forming parent nodes repeatedly.
|
345
348
|
uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
|
346
|
-
|
349
|
+
// The second half of this loop condition is always true, and we just
|
350
|
+
// asserted it above. But GCC can't tell that it's always true, and if NDEBUG
|
351
|
+
// is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
|
352
|
+
// warnings here. GCC 8.5 is particularly sensitive, so if you're changing
|
353
|
+
// this code, test it against that version.
|
354
|
+
while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
|
347
355
|
num_cvs =
|
348
356
|
compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
|
349
357
|
memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
|
@@ -601,3 +609,8 @@ void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
|
|
601
609
|
}
|
602
610
|
output_root_bytes(&output, seek, out, out_len);
|
603
611
|
}
|
612
|
+
|
613
|
+
void blake3_hasher_reset(blake3_hasher *self) {
|
614
|
+
chunk_state_reset(&self->chunk, self->key, 0);
|
615
|
+
self->cv_stack_len = 0;
|
616
|
+
}
|
data/ext/digest/blake3/blake3.h
CHANGED
@@ -8,12 +8,12 @@
|
|
8
8
|
extern "C" {
|
9
9
|
#endif
|
10
10
|
|
11
|
+
#define BLAKE3_VERSION_STRING "1.3.3"
|
11
12
|
#define BLAKE3_KEY_LEN 32
|
12
13
|
#define BLAKE3_OUT_LEN 32
|
13
14
|
#define BLAKE3_BLOCK_LEN 64
|
14
15
|
#define BLAKE3_CHUNK_LEN 1024
|
15
16
|
#define BLAKE3_MAX_DEPTH 54
|
16
|
-
#define BLAKE3_MAX_SIMD_DEGREE 16
|
17
17
|
|
18
18
|
// This struct is a private implementation detail. It has to be here because
|
19
19
|
// it's part of blake3_hasher below.
|
@@ -38,11 +38,12 @@ typedef struct {
|
|
38
38
|
uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
|
39
39
|
} blake3_hasher;
|
40
40
|
|
41
|
+
const char *blake3_version(void);
|
41
42
|
void blake3_hasher_init(blake3_hasher *self);
|
42
43
|
void blake3_hasher_init_keyed(blake3_hasher *self,
|
43
44
|
const uint8_t key[BLAKE3_KEY_LEN]);
|
44
45
|
void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
|
45
|
-
void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
|
46
|
+
void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
|
46
47
|
size_t context_len);
|
47
48
|
void blake3_hasher_update(blake3_hasher *self, const void *input,
|
48
49
|
size_t input_len);
|
@@ -50,6 +51,7 @@ void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
|
|
50
51
|
size_t out_len);
|
51
52
|
void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
|
52
53
|
uint8_t *out, size_t out_len);
|
54
|
+
void blake3_hasher_reset(blake3_hasher *self);
|
53
55
|
|
54
56
|
#ifdef __cplusplus
|
55
57
|
}
|
@@ -208,7 +208,7 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
|
|
208
208
|
out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
|
209
209
|
out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
|
210
210
|
for (size_t i = 0; i < 8; ++i) {
|
211
|
-
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
211
|
+
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
|
212
212
|
}
|
213
213
|
transpose_vecs(&out[0]);
|
214
214
|
transpose_vecs(&out[8]);
|
@@ -219,14 +219,15 @@ INLINE void load_counters(uint64_t counter, bool increment_counter,
|
|
219
219
|
const __m256i mask = _mm256_set1_epi32(-(int32_t)increment_counter);
|
220
220
|
const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
|
221
221
|
const __m256i add1 = _mm256_and_si256(mask, add0);
|
222
|
-
__m256i l = _mm256_add_epi32(_mm256_set1_epi32(counter), add1);
|
222
|
+
__m256i l = _mm256_add_epi32(_mm256_set1_epi32((int32_t)counter), add1);
|
223
223
|
__m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)),
|
224
224
|
_mm256_xor_si256( l, _mm256_set1_epi32(0x80000000)));
|
225
|
-
__m256i h = _mm256_sub_epi32(_mm256_set1_epi32(counter >> 32), carry);
|
225
|
+
__m256i h = _mm256_sub_epi32(_mm256_set1_epi32((int32_t)(counter >> 32)), carry);
|
226
226
|
*out_lo = l;
|
227
227
|
*out_hi = h;
|
228
228
|
}
|
229
229
|
|
230
|
+
static
|
230
231
|
void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks,
|
231
232
|
const uint32_t key[8], uint64_t counter,
|
232
233
|
bool increment_counter, uint8_t flags,
|
@@ -468,7 +468,7 @@ INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
|
|
468
468
|
out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
|
469
469
|
out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
|
470
470
|
for (size_t i = 0; i < 4; ++i) {
|
471
|
-
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
471
|
+
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
|
472
472
|
}
|
473
473
|
transpose_vecs_128(&out[0]);
|
474
474
|
transpose_vecs_128(&out[4]);
|
@@ -488,6 +488,7 @@ INLINE void load_counters4(uint64_t counter, bool increment_counter,
|
|
488
488
|
*out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32));
|
489
489
|
}
|
490
490
|
|
491
|
+
static
|
491
492
|
void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks,
|
492
493
|
const uint32_t key[8], uint64_t counter,
|
493
494
|
bool increment_counter, uint8_t flags,
|
@@ -724,7 +725,7 @@ INLINE void transpose_msg_vecs8(const uint8_t *const *inputs,
|
|
724
725
|
out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
|
725
726
|
out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
|
726
727
|
for (size_t i = 0; i < 8; ++i) {
|
727
|
-
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
728
|
+
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
|
728
729
|
}
|
729
730
|
transpose_vecs_256(&out[0]);
|
730
731
|
transpose_vecs_256(&out[8]);
|
@@ -742,6 +743,7 @@ INLINE void load_counters8(uint64_t counter, bool increment_counter,
|
|
742
743
|
*out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32));
|
743
744
|
}
|
744
745
|
|
746
|
+
static
|
745
747
|
void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks,
|
746
748
|
const uint32_t key[8], uint64_t counter,
|
747
749
|
bool increment_counter, uint8_t flags,
|
@@ -1037,7 +1039,7 @@ INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
|
|
1037
1039
|
out[14] = loadu_512(&inputs[14][block_offset]);
|
1038
1040
|
out[15] = loadu_512(&inputs[15][block_offset]);
|
1039
1041
|
for (size_t i = 0; i < 16; ++i) {
|
1040
|
-
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
1042
|
+
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
|
1041
1043
|
}
|
1042
1044
|
transpose_vecs_512(out);
|
1043
1045
|
}
|
@@ -1045,15 +1047,29 @@ INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
|
|
1045
1047
|
INLINE void load_counters16(uint64_t counter, bool increment_counter,
|
1046
1048
|
__m512i *out_lo, __m512i *out_hi) {
|
1047
1049
|
const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter);
|
1048
|
-
const __m512i
|
1049
|
-
const __m512i
|
1050
|
-
__m512i
|
1051
|
-
|
1052
|
-
|
1053
|
-
|
1054
|
-
|
1050
|
+
const __m512i deltas = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
|
1051
|
+
const __m512i masked_deltas = _mm512_and_si512(deltas, mask);
|
1052
|
+
const __m512i low_words = _mm512_add_epi32(
|
1053
|
+
_mm512_set1_epi32((int32_t)counter),
|
1054
|
+
masked_deltas);
|
1055
|
+
// The carry bit is 1 if the high bit of the word was 1 before addition and is
|
1056
|
+
// 0 after.
|
1057
|
+
// NOTE: It would be a bit more natural to use _mm512_cmp_epu32_mask to
|
1058
|
+
// compute the carry bits here, and originally we did, but that intrinsic is
|
1059
|
+
// broken under GCC 5.4. See https://github.com/BLAKE3-team/BLAKE3/issues/271.
|
1060
|
+
const __m512i carries = _mm512_srli_epi32(
|
1061
|
+
_mm512_andnot_si512(
|
1062
|
+
low_words, // 0 after (gets inverted by andnot)
|
1063
|
+
_mm512_set1_epi32((int32_t)counter)), // and 1 before
|
1064
|
+
31);
|
1065
|
+
const __m512i high_words = _mm512_add_epi32(
|
1066
|
+
_mm512_set1_epi32((int32_t)(counter >> 32)),
|
1067
|
+
carries);
|
1068
|
+
*out_lo = low_words;
|
1069
|
+
*out_hi = high_words;
|
1055
1070
|
}
|
1056
1071
|
|
1072
|
+
static
|
1057
1073
|
void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
|
1058
1074
|
const uint32_t key[8], uint64_t counter,
|
1059
1075
|
bool increment_counter, uint8_t flags,
|
@@ -10,12 +10,14 @@
|
|
10
10
|
#elif defined(__GNUC__)
|
11
11
|
#include <immintrin.h>
|
12
12
|
#else
|
13
|
-
#
|
13
|
+
#undef IS_X86 /* Unimplemented! */
|
14
14
|
#endif
|
15
15
|
#endif
|
16
16
|
|
17
|
+
#define MAYBE_UNUSED(x) (void)((x))
|
18
|
+
|
17
19
|
#if defined(IS_X86)
|
18
|
-
static uint64_t xgetbv() {
|
20
|
+
static uint64_t xgetbv(void) {
|
19
21
|
#if defined(_MSC_VER)
|
20
22
|
return _xgetbv(0);
|
21
23
|
#else
|
@@ -80,7 +82,7 @@ static /* Allow the variable to be controlled manually for testing */
|
|
80
82
|
static
|
81
83
|
#endif
|
82
84
|
enum cpu_feature
|
83
|
-
get_cpu_features() {
|
85
|
+
get_cpu_features(void) {
|
84
86
|
|
85
87
|
if (g_cpu_features != UNDEFINED) {
|
86
88
|
return g_cpu_features;
|
@@ -137,6 +139,7 @@ void blake3_compress_in_place(uint32_t cv[8],
|
|
137
139
|
uint8_t flags) {
|
138
140
|
#if defined(IS_X86)
|
139
141
|
const enum cpu_feature features = get_cpu_features();
|
142
|
+
MAYBE_UNUSED(features);
|
140
143
|
#if !defined(BLAKE3_NO_AVX512)
|
141
144
|
if (features & AVX512VL) {
|
142
145
|
blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
|
@@ -165,6 +168,7 @@ void blake3_compress_xof(const uint32_t cv[8],
|
|
165
168
|
uint8_t out[64]) {
|
166
169
|
#if defined(IS_X86)
|
167
170
|
const enum cpu_feature features = get_cpu_features();
|
171
|
+
MAYBE_UNUSED(features);
|
168
172
|
#if !defined(BLAKE3_NO_AVX512)
|
169
173
|
if (features & AVX512VL) {
|
170
174
|
blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
|
@@ -193,6 +197,7 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
|
|
193
197
|
uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
|
194
198
|
#if defined(IS_X86)
|
195
199
|
const enum cpu_feature features = get_cpu_features();
|
200
|
+
MAYBE_UNUSED(features);
|
196
201
|
#if !defined(BLAKE3_NO_AVX512)
|
197
202
|
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
|
198
203
|
blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
|
@@ -227,7 +232,7 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
|
|
227
232
|
#endif
|
228
233
|
#endif
|
229
234
|
|
230
|
-
#if
|
235
|
+
#if BLAKE3_USE_NEON == 1
|
231
236
|
blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
|
232
237
|
increment_counter, flags, flags_start, flags_end, out);
|
233
238
|
return;
|
@@ -242,6 +247,7 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
|
|
242
247
|
size_t blake3_simd_degree(void) {
|
243
248
|
#if defined(IS_X86)
|
244
249
|
const enum cpu_feature features = get_cpu_features();
|
250
|
+
MAYBE_UNUSED(features);
|
245
251
|
#if !defined(BLAKE3_NO_AVX512)
|
246
252
|
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
|
247
253
|
return 16;
|
@@ -263,7 +269,7 @@ size_t blake3_simd_degree(void) {
|
|
263
269
|
}
|
264
270
|
#endif
|
265
271
|
#endif
|
266
|
-
#if
|
272
|
+
#if BLAKE3_USE_NEON == 1
|
267
273
|
return 4;
|
268
274
|
#endif
|
269
275
|
return 1;
|
@@ -38,16 +38,28 @@ enum blake3_flags {
|
|
38
38
|
#define IS_X86_32
|
39
39
|
#endif
|
40
40
|
|
41
|
+
#if defined(__aarch64__) || defined(_M_ARM64)
|
42
|
+
#define IS_AARCH64
|
43
|
+
#endif
|
44
|
+
|
41
45
|
#if defined(IS_X86)
|
42
46
|
#if defined(_MSC_VER)
|
43
47
|
#include <intrin.h>
|
44
48
|
#endif
|
45
|
-
#
|
49
|
+
#endif
|
50
|
+
|
51
|
+
#if !defined(BLAKE3_USE_NEON)
|
52
|
+
// If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
|
53
|
+
#if defined(IS_AARCH64)
|
54
|
+
#define BLAKE3_USE_NEON 1
|
55
|
+
#else
|
56
|
+
#define BLAKE3_USE_NEON 0
|
57
|
+
#endif
|
46
58
|
#endif
|
47
59
|
|
48
60
|
#if defined(IS_X86)
|
49
61
|
#define MAX_SIMD_DEGREE 16
|
50
|
-
#elif
|
62
|
+
#elif BLAKE3_USE_NEON == 1
|
51
63
|
#define MAX_SIMD_DEGREE 4
|
52
64
|
#else
|
53
65
|
#define MAX_SIMD_DEGREE 1
|
@@ -83,11 +95,11 @@ static unsigned int highest_one(uint64_t x) {
|
|
83
95
|
#elif defined(_MSC_VER) && defined(IS_X86_32)
|
84
96
|
if(x >> 32) {
|
85
97
|
unsigned long index;
|
86
|
-
_BitScanReverse(&index, x >> 32);
|
98
|
+
_BitScanReverse(&index, (unsigned long)(x >> 32));
|
87
99
|
return 32 + index;
|
88
100
|
} else {
|
89
101
|
unsigned long index;
|
90
|
-
_BitScanReverse(&index, x);
|
102
|
+
_BitScanReverse(&index, (unsigned long)x);
|
91
103
|
return index;
|
92
104
|
}
|
93
105
|
#else
|
@@ -257,7 +269,7 @@ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
|
|
257
269
|
#endif
|
258
270
|
#endif
|
259
271
|
|
260
|
-
#if
|
272
|
+
#if BLAKE3_USE_NEON == 1
|
261
273
|
void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
|
262
274
|
size_t blocks, const uint32_t key[8],
|
263
275
|
uint64_t counter, bool increment_counter,
|
@@ -2,7 +2,12 @@
|
|
2
2
|
|
3
3
|
#include <arm_neon.h>
|
4
4
|
|
5
|
-
|
5
|
+
#ifdef __ARM_BIG_ENDIAN
|
6
|
+
#error "This implementation only supports little-endian ARM."
|
7
|
+
// It might be that all we need for big-endian support here is to get the loads
|
8
|
+
// and stores right, but step zero would be finding a way to test it in CI.
|
9
|
+
#endif
|
10
|
+
|
6
11
|
INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
|
7
12
|
// vld1q_u32 has alignment requirements. Don't use it.
|
8
13
|
uint32x4_t x;
|
@@ -78,7 +78,7 @@ INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
|
|
78
78
|
*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
|
79
79
|
}
|
80
80
|
|
81
|
-
INLINE __m128i blend_epi16(__m128i a, __m128i b, const
|
81
|
+
INLINE __m128i blend_epi16(__m128i a, __m128i b, const int16_t imm8) {
|
82
82
|
const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
|
83
83
|
__m128i mask = _mm_set1_epi16(imm8);
|
84
84
|
mask = _mm_and_si128(mask, bits);
|
@@ -435,7 +435,7 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
|
|
435
435
|
out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
|
436
436
|
out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
|
437
437
|
for (size_t i = 0; i < 4; ++i) {
|
438
|
-
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
438
|
+
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
|
439
439
|
}
|
440
440
|
transpose_vecs(&out[0]);
|
441
441
|
transpose_vecs(&out[4]);
|
@@ -448,14 +448,15 @@ INLINE void load_counters(uint64_t counter, bool increment_counter,
|
|
448
448
|
const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
|
449
449
|
const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
|
450
450
|
const __m128i add1 = _mm_and_si128(mask, add0);
|
451
|
-
__m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1);
|
451
|
+
__m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
|
452
452
|
__m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
|
453
453
|
_mm_xor_si128( l, _mm_set1_epi32(0x80000000)));
|
454
|
-
__m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry);
|
454
|
+
__m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
|
455
455
|
*out_lo = l;
|
456
456
|
*out_hi = h;
|
457
457
|
}
|
458
458
|
|
459
|
+
static
|
459
460
|
void blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks,
|
460
461
|
const uint32_t key[8], uint64_t counter,
|
461
462
|
bool increment_counter, uint8_t flags,
|
@@ -1704,7 +1704,7 @@ blake3_hash_many_sse2:
|
|
1704
1704
|
pshufd xmm15, xmm11, 0x93
|
1705
1705
|
shl rax, 0x20
|
1706
1706
|
or rax, 0x40
|
1707
|
-
|
1707
|
+
movq xmm3, rax
|
1708
1708
|
movdqa xmmword ptr [rsp+0x20], xmm3
|
1709
1709
|
movaps xmm3, xmmword ptr [rsp]
|
1710
1710
|
movaps xmm11, xmmword ptr [rsp+0x10]
|
@@ -1917,7 +1917,7 @@ blake3_hash_many_sse2:
|
|
1917
1917
|
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
|
1918
1918
|
shl rax, 32
|
1919
1919
|
or rax, 64
|
1920
|
-
|
1920
|
+
movq xmm12, rax
|
1921
1921
|
movdqa xmm3, xmm13
|
1922
1922
|
punpcklqdq xmm3, xmm12
|
1923
1923
|
movups xmm4, xmmword ptr [r8+rdx-0x40]
|
@@ -1715,7 +1715,7 @@ blake3_hash_many_sse2:
|
|
1715
1715
|
pshufd xmm15, xmm11, 0x93
|
1716
1716
|
shl rax, 0x20
|
1717
1717
|
or rax, 0x40
|
1718
|
-
|
1718
|
+
movq xmm3, rax
|
1719
1719
|
movdqa xmmword ptr [rsp+0x20], xmm3
|
1720
1720
|
movaps xmm3, xmmword ptr [rsp]
|
1721
1721
|
movaps xmm11, xmmword ptr [rsp+0x10]
|
@@ -1928,7 +1928,7 @@ blake3_hash_many_sse2:
|
|
1928
1928
|
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
|
1929
1929
|
shl rax, 32
|
1930
1930
|
or rax, 64
|
1931
|
-
|
1931
|
+
movq xmm12, rax
|
1932
1932
|
movdqa xmm3, xmm13
|
1933
1933
|
punpcklqdq xmm3, xmm12
|
1934
1934
|
movups xmm4, xmmword ptr [r8+rdx-0x40]
|
@@ -2137,10 +2137,10 @@ _blake3_compress_in_place_sse2:
|
|
2137
2137
|
por xmm9, xmm8
|
2138
2138
|
movdqa xmm8, xmm7
|
2139
2139
|
punpcklqdq xmm8, xmm5
|
2140
|
-
movdqa
|
2140
|
+
movdqa xmm14, xmm6
|
2141
2141
|
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
|
2142
|
-
pand
|
2143
|
-
por xmm8,
|
2142
|
+
pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip]
|
2143
|
+
por xmm8, xmm14
|
2144
2144
|
pshufd xmm8, xmm8, 0x78
|
2145
2145
|
punpckhdq xmm5, xmm7
|
2146
2146
|
punpckldq xmm6, xmm5
|
@@ -2268,10 +2268,10 @@ blake3_compress_xof_sse2:
|
|
2268
2268
|
por xmm9, xmm8
|
2269
2269
|
movdqa xmm8, xmm7
|
2270
2270
|
punpcklqdq xmm8, xmm5
|
2271
|
-
movdqa
|
2271
|
+
movdqa xmm14, xmm6
|
2272
2272
|
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
|
2273
|
-
pand
|
2274
|
-
por xmm8,
|
2273
|
+
pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip]
|
2274
|
+
por xmm8, xmm14
|
2275
2275
|
pshufd xmm8, xmm8, 0x78
|
2276
2276
|
punpckhdq xmm5, xmm7
|
2277
2277
|
punpckldq xmm6, xmm5
|
@@ -2054,8 +2054,8 @@ _blake3_compress_in_place_sse2 PROC
|
|
2054
2054
|
movzx r8d, r8b
|
2055
2055
|
shl rax, 32
|
2056
2056
|
add r8, rax
|
2057
|
-
|
2058
|
-
|
2057
|
+
movd xmm3, r9
|
2058
|
+
movd xmm4, r8
|
2059
2059
|
punpcklqdq xmm3, xmm4
|
2060
2060
|
movups xmm4, xmmword ptr [rdx]
|
2061
2061
|
movups xmm5, xmmword ptr [rdx+10H]
|
@@ -2139,10 +2139,10 @@ _blake3_compress_in_place_sse2 PROC
|
|
2139
2139
|
por xmm9, xmm8
|
2140
2140
|
movdqa xmm8, xmm7
|
2141
2141
|
punpcklqdq xmm8, xmm5
|
2142
|
-
movdqa
|
2142
|
+
movdqa xmm14, xmm6
|
2143
2143
|
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
|
2144
|
-
pand
|
2145
|
-
por xmm8,
|
2144
|
+
pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK]
|
2145
|
+
por xmm8, xmm14
|
2146
2146
|
pshufd xmm8, xmm8, 78H
|
2147
2147
|
punpckhdq xmm5, xmm7
|
2148
2148
|
punpckldq xmm6, xmm5
|
@@ -2186,8 +2186,8 @@ _blake3_compress_xof_sse2 PROC
|
|
2186
2186
|
mov r10, qword ptr [rsp+0A8H]
|
2187
2187
|
shl rax, 32
|
2188
2188
|
add r8, rax
|
2189
|
-
|
2190
|
-
|
2189
|
+
movd xmm3, r9
|
2190
|
+
movd xmm4, r8
|
2191
2191
|
punpcklqdq xmm3, xmm4
|
2192
2192
|
movups xmm4, xmmword ptr [rdx]
|
2193
2193
|
movups xmm5, xmmword ptr [rdx+10H]
|
@@ -2271,10 +2271,10 @@ _blake3_compress_xof_sse2 PROC
|
|
2271
2271
|
por xmm9, xmm8
|
2272
2272
|
movdqa xmm8, xmm7
|
2273
2273
|
punpcklqdq xmm8, xmm5
|
2274
|
-
movdqa
|
2274
|
+
movdqa xmm14, xmm6
|
2275
2275
|
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
|
2276
|
-
pand
|
2277
|
-
por xmm8,
|
2276
|
+
pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK]
|
2277
|
+
por xmm8, xmm14
|
2278
2278
|
pshufd xmm8, xmm8, 78H
|
2279
2279
|
punpckhdq xmm5, xmm7
|
2280
2280
|
punpckldq xmm6, xmm5
|
@@ -429,7 +429,7 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
|
|
429
429
|
out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
|
430
430
|
out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
|
431
431
|
for (size_t i = 0; i < 4; ++i) {
|
432
|
-
_mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
|
432
|
+
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
|
433
433
|
}
|
434
434
|
transpose_vecs(&out[0]);
|
435
435
|
transpose_vecs(&out[4]);
|
@@ -442,14 +442,15 @@ INLINE void load_counters(uint64_t counter, bool increment_counter,
|
|
442
442
|
const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
|
443
443
|
const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
|
444
444
|
const __m128i add1 = _mm_and_si128(mask, add0);
|
445
|
-
__m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1);
|
445
|
+
__m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
|
446
446
|
__m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
|
447
447
|
_mm_xor_si128( l, _mm_set1_epi32(0x80000000)));
|
448
|
-
__m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry);
|
448
|
+
__m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
|
449
449
|
*out_lo = l;
|
450
450
|
*out_hi = h;
|
451
451
|
}
|
452
452
|
|
453
|
+
static
|
453
454
|
void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks,
|
454
455
|
const uint32_t key[8], uint64_t counter,
|
455
456
|
bool increment_counter, uint8_t flags,
|
@@ -1817,8 +1817,8 @@ _blake3_compress_in_place_sse41 PROC
|
|
1817
1817
|
movzx r8d, r8b
|
1818
1818
|
shl rax, 32
|
1819
1819
|
add r8, rax
|
1820
|
-
|
1821
|
-
|
1820
|
+
movd xmm3, r9
|
1821
|
+
movd xmm4, r8
|
1822
1822
|
punpcklqdq xmm3, xmm4
|
1823
1823
|
movups xmm4, xmmword ptr [rdx]
|
1824
1824
|
movups xmm5, xmmword ptr [rdx+10H]
|
@@ -1938,8 +1938,8 @@ _blake3_compress_xof_sse41 PROC
|
|
1938
1938
|
mov r10, qword ptr [rsp+0A8H]
|
1939
1939
|
shl rax, 32
|
1940
1940
|
add r8, rax
|
1941
|
-
|
1942
|
-
|
1941
|
+
movd xmm3, r9
|
1942
|
+
movd xmm4, r8
|
1943
1943
|
punpcklqdq xmm3, xmm4
|
1944
1944
|
movups xmm4, xmmword ptr [rdx]
|
1945
1945
|
movups xmm5, xmmword ptr [rdx+10H]
|
@@ -34,10 +34,12 @@ def check_supported_flags(flags, obj_if_enabled, def_if_disabled)
|
|
34
34
|
end
|
35
35
|
end
|
36
36
|
|
37
|
-
|
38
|
-
check_supported_flags("-
|
39
|
-
check_supported_flags("-
|
40
|
-
check_supported_flags("-
|
37
|
+
unless RUBY_PLATFORM.include? 'arm64' or RUBY_PLATFORM.include? 'aarch64'
|
38
|
+
check_supported_flags("-msse2", "blake3_sse2.o", "-DBLAKE3_NO_SSE2")
|
39
|
+
check_supported_flags("-msse4.1", "blake3_sse41.o", "-DBLAKE3_NO_SSE41")
|
40
|
+
check_supported_flags("-mavx2", "blake3_avx2.o", "-DBLAKE3_NO_AVX2")
|
41
|
+
check_supported_flags("-mavx512f -mavx512vl -mavx512bw", "blake3_avx512.o", "-DBLAKE3_NO_AVX512")
|
42
|
+
end
|
41
43
|
|
42
44
|
if have_header("arm_neon.h")
|
43
45
|
$objs << "blake3_neon.o"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: digest-blake3
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 1.3.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Will Bryant
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-12-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -100,7 +100,7 @@ homepage: https://github.com/willbryant/digest-blake3
|
|
100
100
|
licenses:
|
101
101
|
- MIT
|
102
102
|
metadata: {}
|
103
|
-
post_install_message:
|
103
|
+
post_install_message:
|
104
104
|
rdoc_options: []
|
105
105
|
require_paths:
|
106
106
|
- lib
|
@@ -116,8 +116,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
116
116
|
- !ruby/object:Gem::Version
|
117
117
|
version: '0'
|
118
118
|
requirements: []
|
119
|
-
rubygems_version: 3.
|
120
|
-
signing_key:
|
119
|
+
rubygems_version: 3.1.6
|
120
|
+
signing_key:
|
121
121
|
specification_version: 4
|
122
122
|
summary: BLAKE3 for Ruby
|
123
123
|
test_files: []
|