digest-blake3 0.22.1 → 1.2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +1 -1
- data/ext/digest/blake3/blake3.c +42 -20
- data/ext/digest/blake3/blake3.h +8 -3
- data/ext/digest/blake3/blake3_avx2_x86-64_unix.S +15 -0
- data/ext/digest/blake3/blake3_avx512_x86-64_unix.S +34 -18
- data/ext/digest/blake3/blake3_avx512_x86-64_windows_gnu.S +17 -17
- data/ext/digest/blake3/blake3_avx512_x86-64_windows_msvc.asm +17 -17
- data/ext/digest/blake3/blake3_dispatch.c +35 -71
- data/ext/digest/blake3/blake3_impl.h +117 -2
- data/ext/digest/blake3/blake3_neon.c +6 -1
- data/ext/digest/blake3/blake3_portable.c +1 -9
- data/ext/digest/blake3/blake3_sse2.c +565 -0
- data/ext/digest/blake3/blake3_sse2_x86-64_unix.S +2291 -0
- data/ext/digest/blake3/blake3_sse2_x86-64_windows_gnu.S +2332 -0
- data/ext/digest/blake3/blake3_sse2_x86-64_windows_msvc.asm +2350 -0
- data/ext/digest/blake3/blake3_sse41_x86-64_unix.S +17 -0
- data/ext/digest/blake3/blake3_sse41_x86-64_windows_gnu.S +19 -7
- data/ext/digest/blake3/blake3_sse41_x86-64_windows_msvc.asm +23 -11
- data/ext/digest/blake3/extconf.rb +4 -3
- data/lib/digest/blake3/version.rb +1 -1
- metadata +10 -7
@@ -14,72 +14,7 @@
|
|
14
14
|
#endif
|
15
15
|
#endif
|
16
16
|
|
17
|
-
|
18
|
-
void blake3_compress_in_place_portable(uint32_t cv[8],
|
19
|
-
const uint8_t block[BLAKE3_BLOCK_LEN],
|
20
|
-
uint8_t block_len, uint64_t counter,
|
21
|
-
uint8_t flags);
|
22
|
-
|
23
|
-
void blake3_compress_xof_portable(const uint32_t cv[8],
|
24
|
-
const uint8_t block[BLAKE3_BLOCK_LEN],
|
25
|
-
uint8_t block_len, uint64_t counter,
|
26
|
-
uint8_t flags, uint8_t out[64]);
|
27
|
-
|
28
|
-
void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
|
29
|
-
size_t blocks, const uint32_t key[8],
|
30
|
-
uint64_t counter, bool increment_counter,
|
31
|
-
uint8_t flags, uint8_t flags_start,
|
32
|
-
uint8_t flags_end, uint8_t *out);
|
33
|
-
|
34
|
-
#if defined(IS_X86)
|
35
|
-
#if !defined(BLAKE3_NO_SSE41)
|
36
|
-
void blake3_compress_in_place_sse41(uint32_t cv[8],
|
37
|
-
const uint8_t block[BLAKE3_BLOCK_LEN],
|
38
|
-
uint8_t block_len, uint64_t counter,
|
39
|
-
uint8_t flags);
|
40
|
-
void blake3_compress_xof_sse41(const uint32_t cv[8],
|
41
|
-
const uint8_t block[BLAKE3_BLOCK_LEN],
|
42
|
-
uint8_t block_len, uint64_t counter,
|
43
|
-
uint8_t flags, uint8_t out[64]);
|
44
|
-
void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
|
45
|
-
size_t blocks, const uint32_t key[8],
|
46
|
-
uint64_t counter, bool increment_counter,
|
47
|
-
uint8_t flags, uint8_t flags_start,
|
48
|
-
uint8_t flags_end, uint8_t *out);
|
49
|
-
#endif
|
50
|
-
#if !defined(BLAKE3_NO_AVX2)
|
51
|
-
void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
|
52
|
-
size_t blocks, const uint32_t key[8],
|
53
|
-
uint64_t counter, bool increment_counter,
|
54
|
-
uint8_t flags, uint8_t flags_start,
|
55
|
-
uint8_t flags_end, uint8_t *out);
|
56
|
-
#endif
|
57
|
-
#if !defined(BLAKE3_NO_AVX512)
|
58
|
-
void blake3_compress_in_place_avx512(uint32_t cv[8],
|
59
|
-
const uint8_t block[BLAKE3_BLOCK_LEN],
|
60
|
-
uint8_t block_len, uint64_t counter,
|
61
|
-
uint8_t flags);
|
62
|
-
|
63
|
-
void blake3_compress_xof_avx512(const uint32_t cv[8],
|
64
|
-
const uint8_t block[BLAKE3_BLOCK_LEN],
|
65
|
-
uint8_t block_len, uint64_t counter,
|
66
|
-
uint8_t flags, uint8_t out[64]);
|
67
|
-
|
68
|
-
void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
|
69
|
-
size_t blocks, const uint32_t key[8],
|
70
|
-
uint64_t counter, bool increment_counter,
|
71
|
-
uint8_t flags, uint8_t flags_start,
|
72
|
-
uint8_t flags_end, uint8_t *out);
|
73
|
-
#endif
|
74
|
-
#endif
|
75
|
-
|
76
|
-
#if defined(BLAKE3_USE_NEON)
|
77
|
-
void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
|
78
|
-
size_t blocks, const uint32_t key[8],
|
79
|
-
uint64_t counter, bool increment_counter,
|
80
|
-
uint8_t flags, uint8_t flags_start,
|
81
|
-
uint8_t flags_end, uint8_t *out);
|
82
|
-
#endif
|
17
|
+
#define MAYBE_UNUSED(x) (void)((x))
|
83
18
|
|
84
19
|
#if defined(IS_X86)
|
85
20
|
static uint64_t xgetbv() {
|
@@ -204,6 +139,7 @@ void blake3_compress_in_place(uint32_t cv[8],
|
|
204
139
|
uint8_t flags) {
|
205
140
|
#if defined(IS_X86)
|
206
141
|
const enum cpu_feature features = get_cpu_features();
|
142
|
+
MAYBE_UNUSED(features);
|
207
143
|
#if !defined(BLAKE3_NO_AVX512)
|
208
144
|
if (features & AVX512VL) {
|
209
145
|
blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
|
@@ -216,6 +152,12 @@ void blake3_compress_in_place(uint32_t cv[8],
|
|
216
152
|
return;
|
217
153
|
}
|
218
154
|
#endif
|
155
|
+
#if !defined(BLAKE3_NO_SSE2)
|
156
|
+
if (features & SSE2) {
|
157
|
+
blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
|
158
|
+
return;
|
159
|
+
}
|
160
|
+
#endif
|
219
161
|
#endif
|
220
162
|
blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
|
221
163
|
}
|
@@ -226,6 +168,7 @@ void blake3_compress_xof(const uint32_t cv[8],
|
|
226
168
|
uint8_t out[64]) {
|
227
169
|
#if defined(IS_X86)
|
228
170
|
const enum cpu_feature features = get_cpu_features();
|
171
|
+
MAYBE_UNUSED(features);
|
229
172
|
#if !defined(BLAKE3_NO_AVX512)
|
230
173
|
if (features & AVX512VL) {
|
231
174
|
blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
|
@@ -238,6 +181,12 @@ void blake3_compress_xof(const uint32_t cv[8],
|
|
238
181
|
return;
|
239
182
|
}
|
240
183
|
#endif
|
184
|
+
#if !defined(BLAKE3_NO_SSE2)
|
185
|
+
if (features & SSE2) {
|
186
|
+
blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
|
187
|
+
return;
|
188
|
+
}
|
189
|
+
#endif
|
241
190
|
#endif
|
242
191
|
blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
|
243
192
|
}
|
@@ -248,8 +197,9 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
|
|
248
197
|
uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
|
249
198
|
#if defined(IS_X86)
|
250
199
|
const enum cpu_feature features = get_cpu_features();
|
200
|
+
MAYBE_UNUSED(features);
|
251
201
|
#if !defined(BLAKE3_NO_AVX512)
|
252
|
-
if (features & AVX512F) {
|
202
|
+
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
|
253
203
|
blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
|
254
204
|
increment_counter, flags, flags_start, flags_end,
|
255
205
|
out);
|
@@ -272,9 +222,17 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
|
|
272
222
|
return;
|
273
223
|
}
|
274
224
|
#endif
|
225
|
+
#if !defined(BLAKE3_NO_SSE2)
|
226
|
+
if (features & SSE2) {
|
227
|
+
blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
|
228
|
+
increment_counter, flags, flags_start, flags_end,
|
229
|
+
out);
|
230
|
+
return;
|
231
|
+
}
|
232
|
+
#endif
|
275
233
|
#endif
|
276
234
|
|
277
|
-
#if defined(BLAKE3_USE_NEON)
|
235
|
+
#if BLAKE3_USE_NEON == 1
|
278
236
|
blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
|
279
237
|
increment_counter, flags, flags_start, flags_end, out);
|
280
238
|
return;
|
@@ -286,11 +244,12 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
|
|
286
244
|
}
|
287
245
|
|
288
246
|
// The dynamically detected SIMD degree of the current platform.
|
289
|
-
size_t blake3_simd_degree() {
|
247
|
+
size_t blake3_simd_degree(void) {
|
290
248
|
#if defined(IS_X86)
|
291
249
|
const enum cpu_feature features = get_cpu_features();
|
250
|
+
MAYBE_UNUSED(features);
|
292
251
|
#if !defined(BLAKE3_NO_AVX512)
|
293
|
-
if (features & AVX512F) {
|
252
|
+
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
|
294
253
|
return 16;
|
295
254
|
}
|
296
255
|
#endif
|
@@ -304,8 +263,13 @@ size_t blake3_simd_degree() {
|
|
304
263
|
return 4;
|
305
264
|
}
|
306
265
|
#endif
|
266
|
+
#if !defined(BLAKE3_NO_SSE2)
|
267
|
+
if (features & SSE2) {
|
268
|
+
return 4;
|
269
|
+
}
|
270
|
+
#endif
|
307
271
|
#endif
|
308
|
-
#if defined(BLAKE3_USE_NEON)
|
272
|
+
#if BLAKE3_USE_NEON == 1
|
309
273
|
return 4;
|
310
274
|
#endif
|
311
275
|
return 1;
|
@@ -38,6 +38,10 @@ enum blake3_flags {
|
|
38
38
|
#define IS_X86_32
|
39
39
|
#endif
|
40
40
|
|
41
|
+
#if defined(__aarch64__) || defined(_M_ARM64)
|
42
|
+
#define IS_AARCH64
|
43
|
+
#endif
|
44
|
+
|
41
45
|
#if defined(IS_X86)
|
42
46
|
#if defined(_MSC_VER)
|
43
47
|
#include <intrin.h>
|
@@ -45,9 +49,18 @@ enum blake3_flags {
|
|
45
49
|
#include <immintrin.h>
|
46
50
|
#endif
|
47
51
|
|
52
|
+
#if !defined(BLAKE3_USE_NEON)
|
53
|
+
// If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
|
54
|
+
#if defined(IS_AARCH64)
|
55
|
+
#define BLAKE3_USE_NEON 1
|
56
|
+
#else
|
57
|
+
#define BLAKE3_USE_NEON 0
|
58
|
+
#endif
|
59
|
+
#endif
|
60
|
+
|
48
61
|
#if defined(IS_X86)
|
49
62
|
#define MAX_SIMD_DEGREE 16
|
50
|
-
#elif defined(BLAKE3_USE_NEON)
|
63
|
+
#elif BLAKE3_USE_NEON == 1
|
51
64
|
#define MAX_SIMD_DEGREE 4
|
52
65
|
#else
|
53
66
|
#define MAX_SIMD_DEGREE 1
|
@@ -146,6 +159,25 @@ INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
|
|
146
159
|
key_words[7] = load32(&key[7 * 4]);
|
147
160
|
}
|
148
161
|
|
162
|
+
INLINE void store32(void *dst, uint32_t w) {
|
163
|
+
uint8_t *p = (uint8_t *)dst;
|
164
|
+
p[0] = (uint8_t)(w >> 0);
|
165
|
+
p[1] = (uint8_t)(w >> 8);
|
166
|
+
p[2] = (uint8_t)(w >> 16);
|
167
|
+
p[3] = (uint8_t)(w >> 24);
|
168
|
+
}
|
169
|
+
|
170
|
+
INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) {
|
171
|
+
store32(&bytes_out[0 * 4], cv_words[0]);
|
172
|
+
store32(&bytes_out[1 * 4], cv_words[1]);
|
173
|
+
store32(&bytes_out[2 * 4], cv_words[2]);
|
174
|
+
store32(&bytes_out[3 * 4], cv_words[3]);
|
175
|
+
store32(&bytes_out[4 * 4], cv_words[4]);
|
176
|
+
store32(&bytes_out[5 * 4], cv_words[5]);
|
177
|
+
store32(&bytes_out[6 * 4], cv_words[6]);
|
178
|
+
store32(&bytes_out[7 * 4], cv_words[7]);
|
179
|
+
}
|
180
|
+
|
149
181
|
void blake3_compress_in_place(uint32_t cv[8],
|
150
182
|
const uint8_t block[BLAKE3_BLOCK_LEN],
|
151
183
|
uint8_t block_len, uint64_t counter,
|
@@ -161,7 +193,90 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
|
|
161
193
|
bool increment_counter, uint8_t flags,
|
162
194
|
uint8_t flags_start, uint8_t flags_end, uint8_t *out);
|
163
195
|
|
164
|
-
size_t blake3_simd_degree();
|
196
|
+
size_t blake3_simd_degree(void);
|
197
|
+
|
198
|
+
|
199
|
+
// Declarations for implementation-specific functions.
|
200
|
+
void blake3_compress_in_place_portable(uint32_t cv[8],
|
201
|
+
const uint8_t block[BLAKE3_BLOCK_LEN],
|
202
|
+
uint8_t block_len, uint64_t counter,
|
203
|
+
uint8_t flags);
|
204
|
+
|
205
|
+
void blake3_compress_xof_portable(const uint32_t cv[8],
|
206
|
+
const uint8_t block[BLAKE3_BLOCK_LEN],
|
207
|
+
uint8_t block_len, uint64_t counter,
|
208
|
+
uint8_t flags, uint8_t out[64]);
|
209
|
+
|
210
|
+
void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
|
211
|
+
size_t blocks, const uint32_t key[8],
|
212
|
+
uint64_t counter, bool increment_counter,
|
213
|
+
uint8_t flags, uint8_t flags_start,
|
214
|
+
uint8_t flags_end, uint8_t *out);
|
215
|
+
|
216
|
+
#if defined(IS_X86)
|
217
|
+
#if !defined(BLAKE3_NO_SSE2)
|
218
|
+
void blake3_compress_in_place_sse2(uint32_t cv[8],
|
219
|
+
const uint8_t block[BLAKE3_BLOCK_LEN],
|
220
|
+
uint8_t block_len, uint64_t counter,
|
221
|
+
uint8_t flags);
|
222
|
+
void blake3_compress_xof_sse2(const uint32_t cv[8],
|
223
|
+
const uint8_t block[BLAKE3_BLOCK_LEN],
|
224
|
+
uint8_t block_len, uint64_t counter,
|
225
|
+
uint8_t flags, uint8_t out[64]);
|
226
|
+
void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
|
227
|
+
size_t blocks, const uint32_t key[8],
|
228
|
+
uint64_t counter, bool increment_counter,
|
229
|
+
uint8_t flags, uint8_t flags_start,
|
230
|
+
uint8_t flags_end, uint8_t *out);
|
231
|
+
#endif
|
232
|
+
#if !defined(BLAKE3_NO_SSE41)
|
233
|
+
void blake3_compress_in_place_sse41(uint32_t cv[8],
|
234
|
+
const uint8_t block[BLAKE3_BLOCK_LEN],
|
235
|
+
uint8_t block_len, uint64_t counter,
|
236
|
+
uint8_t flags);
|
237
|
+
void blake3_compress_xof_sse41(const uint32_t cv[8],
|
238
|
+
const uint8_t block[BLAKE3_BLOCK_LEN],
|
239
|
+
uint8_t block_len, uint64_t counter,
|
240
|
+
uint8_t flags, uint8_t out[64]);
|
241
|
+
void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
|
242
|
+
size_t blocks, const uint32_t key[8],
|
243
|
+
uint64_t counter, bool increment_counter,
|
244
|
+
uint8_t flags, uint8_t flags_start,
|
245
|
+
uint8_t flags_end, uint8_t *out);
|
246
|
+
#endif
|
247
|
+
#if !defined(BLAKE3_NO_AVX2)
|
248
|
+
void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
|
249
|
+
size_t blocks, const uint32_t key[8],
|
250
|
+
uint64_t counter, bool increment_counter,
|
251
|
+
uint8_t flags, uint8_t flags_start,
|
252
|
+
uint8_t flags_end, uint8_t *out);
|
253
|
+
#endif
|
254
|
+
#if !defined(BLAKE3_NO_AVX512)
|
255
|
+
void blake3_compress_in_place_avx512(uint32_t cv[8],
|
256
|
+
const uint8_t block[BLAKE3_BLOCK_LEN],
|
257
|
+
uint8_t block_len, uint64_t counter,
|
258
|
+
uint8_t flags);
|
259
|
+
|
260
|
+
void blake3_compress_xof_avx512(const uint32_t cv[8],
|
261
|
+
const uint8_t block[BLAKE3_BLOCK_LEN],
|
262
|
+
uint8_t block_len, uint64_t counter,
|
263
|
+
uint8_t flags, uint8_t out[64]);
|
264
|
+
|
265
|
+
void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
|
266
|
+
size_t blocks, const uint32_t key[8],
|
267
|
+
uint64_t counter, bool increment_counter,
|
268
|
+
uint8_t flags, uint8_t flags_start,
|
269
|
+
uint8_t flags_end, uint8_t *out);
|
270
|
+
#endif
|
271
|
+
#endif
|
272
|
+
|
273
|
+
#if BLAKE3_USE_NEON == 1
|
274
|
+
void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
|
275
|
+
size_t blocks, const uint32_t key[8],
|
276
|
+
uint64_t counter, bool increment_counter,
|
277
|
+
uint8_t flags, uint8_t flags_start,
|
278
|
+
uint8_t flags_end, uint8_t *out);
|
279
|
+
#endif
|
165
280
|
|
166
281
|
|
167
282
|
#endif /* BLAKE3_IMPL_H */
|
@@ -2,7 +2,12 @@
|
|
2
2
|
|
3
3
|
#include <arm_neon.h>
|
4
4
|
|
5
|
-
|
5
|
+
#ifdef __ARM_BIG_ENDIAN
|
6
|
+
#error "This implementation only supports little-endian ARM."
|
7
|
+
// It might be that all we need for big-endian support here is to get the loads
|
8
|
+
// and stores right, but step zero would be finding a way to test it in CI.
|
9
|
+
#endif
|
10
|
+
|
6
11
|
INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
|
7
12
|
// vld1q_u32 has alignment requirements. Don't use it.
|
8
13
|
uint32x4_t x;
|
@@ -1,14 +1,6 @@
|
|
1
1
|
#include "blake3_impl.h"
|
2
2
|
#include <string.h>
|
3
3
|
|
4
|
-
INLINE void store32(void *dst, uint32_t w) {
|
5
|
-
uint8_t *p = (uint8_t *)dst;
|
6
|
-
p[0] = (uint8_t)(w >> 0);
|
7
|
-
p[1] = (uint8_t)(w >> 8);
|
8
|
-
p[2] = (uint8_t)(w >> 16);
|
9
|
-
p[3] = (uint8_t)(w >> 24);
|
10
|
-
}
|
11
|
-
|
12
4
|
INLINE uint32_t rotr32(uint32_t w, uint32_t c) {
|
13
5
|
return (w >> c) | (w << (32 - c));
|
14
6
|
}
|
@@ -147,7 +139,7 @@ INLINE void hash_one_portable(const uint8_t *input, size_t blocks,
|
|
147
139
|
blocks -= 1;
|
148
140
|
block_flags = flags;
|
149
141
|
}
|
150
|
-
|
142
|
+
store_cv_words(out, cv);
|
151
143
|
}
|
152
144
|
|
153
145
|
void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
|