digest-blake3 0.22.1 → 1.2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +1 -1
- data/ext/digest/blake3/blake3.c +42 -20
- data/ext/digest/blake3/blake3.h +8 -3
- data/ext/digest/blake3/blake3_avx2_x86-64_unix.S +15 -0
- data/ext/digest/blake3/blake3_avx512_x86-64_unix.S +34 -18
- data/ext/digest/blake3/blake3_avx512_x86-64_windows_gnu.S +17 -17
- data/ext/digest/blake3/blake3_avx512_x86-64_windows_msvc.asm +17 -17
- data/ext/digest/blake3/blake3_dispatch.c +35 -71
- data/ext/digest/blake3/blake3_impl.h +117 -2
- data/ext/digest/blake3/blake3_neon.c +6 -1
- data/ext/digest/blake3/blake3_portable.c +1 -9
- data/ext/digest/blake3/blake3_sse2.c +565 -0
- data/ext/digest/blake3/blake3_sse2_x86-64_unix.S +2291 -0
- data/ext/digest/blake3/blake3_sse2_x86-64_windows_gnu.S +2332 -0
- data/ext/digest/blake3/blake3_sse2_x86-64_windows_msvc.asm +2350 -0
- data/ext/digest/blake3/blake3_sse41_x86-64_unix.S +17 -0
- data/ext/digest/blake3/blake3_sse41_x86-64_windows_gnu.S +19 -7
- data/ext/digest/blake3/blake3_sse41_x86-64_windows_msvc.asm +23 -11
- data/ext/digest/blake3/extconf.rb +4 -3
- data/lib/digest/blake3/version.rb +1 -1
- metadata +10 -7
@@ -14,72 +14,7 @@
|
|
14
14
|
#endif
|
15
15
|
#endif
|
16
16
|
|
17
|
-
|
18
|
-
void blake3_compress_in_place_portable(uint32_t cv[8],
|
19
|
-
const uint8_t block[BLAKE3_BLOCK_LEN],
|
20
|
-
uint8_t block_len, uint64_t counter,
|
21
|
-
uint8_t flags);
|
22
|
-
|
23
|
-
void blake3_compress_xof_portable(const uint32_t cv[8],
|
24
|
-
const uint8_t block[BLAKE3_BLOCK_LEN],
|
25
|
-
uint8_t block_len, uint64_t counter,
|
26
|
-
uint8_t flags, uint8_t out[64]);
|
27
|
-
|
28
|
-
void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
|
29
|
-
size_t blocks, const uint32_t key[8],
|
30
|
-
uint64_t counter, bool increment_counter,
|
31
|
-
uint8_t flags, uint8_t flags_start,
|
32
|
-
uint8_t flags_end, uint8_t *out);
|
33
|
-
|
34
|
-
#if defined(IS_X86)
|
35
|
-
#if !defined(BLAKE3_NO_SSE41)
|
36
|
-
void blake3_compress_in_place_sse41(uint32_t cv[8],
|
37
|
-
const uint8_t block[BLAKE3_BLOCK_LEN],
|
38
|
-
uint8_t block_len, uint64_t counter,
|
39
|
-
uint8_t flags);
|
40
|
-
void blake3_compress_xof_sse41(const uint32_t cv[8],
|
41
|
-
const uint8_t block[BLAKE3_BLOCK_LEN],
|
42
|
-
uint8_t block_len, uint64_t counter,
|
43
|
-
uint8_t flags, uint8_t out[64]);
|
44
|
-
void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
|
45
|
-
size_t blocks, const uint32_t key[8],
|
46
|
-
uint64_t counter, bool increment_counter,
|
47
|
-
uint8_t flags, uint8_t flags_start,
|
48
|
-
uint8_t flags_end, uint8_t *out);
|
49
|
-
#endif
|
50
|
-
#if !defined(BLAKE3_NO_AVX2)
|
51
|
-
void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
|
52
|
-
size_t blocks, const uint32_t key[8],
|
53
|
-
uint64_t counter, bool increment_counter,
|
54
|
-
uint8_t flags, uint8_t flags_start,
|
55
|
-
uint8_t flags_end, uint8_t *out);
|
56
|
-
#endif
|
57
|
-
#if !defined(BLAKE3_NO_AVX512)
|
58
|
-
void blake3_compress_in_place_avx512(uint32_t cv[8],
|
59
|
-
const uint8_t block[BLAKE3_BLOCK_LEN],
|
60
|
-
uint8_t block_len, uint64_t counter,
|
61
|
-
uint8_t flags);
|
62
|
-
|
63
|
-
void blake3_compress_xof_avx512(const uint32_t cv[8],
|
64
|
-
const uint8_t block[BLAKE3_BLOCK_LEN],
|
65
|
-
uint8_t block_len, uint64_t counter,
|
66
|
-
uint8_t flags, uint8_t out[64]);
|
67
|
-
|
68
|
-
void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
|
69
|
-
size_t blocks, const uint32_t key[8],
|
70
|
-
uint64_t counter, bool increment_counter,
|
71
|
-
uint8_t flags, uint8_t flags_start,
|
72
|
-
uint8_t flags_end, uint8_t *out);
|
73
|
-
#endif
|
74
|
-
#endif
|
75
|
-
|
76
|
-
#if defined(BLAKE3_USE_NEON)
|
77
|
-
void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
|
78
|
-
size_t blocks, const uint32_t key[8],
|
79
|
-
uint64_t counter, bool increment_counter,
|
80
|
-
uint8_t flags, uint8_t flags_start,
|
81
|
-
uint8_t flags_end, uint8_t *out);
|
82
|
-
#endif
|
17
|
+
#define MAYBE_UNUSED(x) (void)((x))
|
83
18
|
|
84
19
|
#if defined(IS_X86)
|
85
20
|
static uint64_t xgetbv() {
|
@@ -204,6 +139,7 @@ void blake3_compress_in_place(uint32_t cv[8],
|
|
204
139
|
uint8_t flags) {
|
205
140
|
#if defined(IS_X86)
|
206
141
|
const enum cpu_feature features = get_cpu_features();
|
142
|
+
MAYBE_UNUSED(features);
|
207
143
|
#if !defined(BLAKE3_NO_AVX512)
|
208
144
|
if (features & AVX512VL) {
|
209
145
|
blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
|
@@ -216,6 +152,12 @@ void blake3_compress_in_place(uint32_t cv[8],
|
|
216
152
|
return;
|
217
153
|
}
|
218
154
|
#endif
|
155
|
+
#if !defined(BLAKE3_NO_SSE2)
|
156
|
+
if (features & SSE2) {
|
157
|
+
blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
|
158
|
+
return;
|
159
|
+
}
|
160
|
+
#endif
|
219
161
|
#endif
|
220
162
|
blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
|
221
163
|
}
|
@@ -226,6 +168,7 @@ void blake3_compress_xof(const uint32_t cv[8],
|
|
226
168
|
uint8_t out[64]) {
|
227
169
|
#if defined(IS_X86)
|
228
170
|
const enum cpu_feature features = get_cpu_features();
|
171
|
+
MAYBE_UNUSED(features);
|
229
172
|
#if !defined(BLAKE3_NO_AVX512)
|
230
173
|
if (features & AVX512VL) {
|
231
174
|
blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
|
@@ -238,6 +181,12 @@ void blake3_compress_xof(const uint32_t cv[8],
|
|
238
181
|
return;
|
239
182
|
}
|
240
183
|
#endif
|
184
|
+
#if !defined(BLAKE3_NO_SSE2)
|
185
|
+
if (features & SSE2) {
|
186
|
+
blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
|
187
|
+
return;
|
188
|
+
}
|
189
|
+
#endif
|
241
190
|
#endif
|
242
191
|
blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
|
243
192
|
}
|
@@ -248,8 +197,9 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
|
|
248
197
|
uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
|
249
198
|
#if defined(IS_X86)
|
250
199
|
const enum cpu_feature features = get_cpu_features();
|
200
|
+
MAYBE_UNUSED(features);
|
251
201
|
#if !defined(BLAKE3_NO_AVX512)
|
252
|
-
if (features & AVX512F) {
|
202
|
+
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
|
253
203
|
blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
|
254
204
|
increment_counter, flags, flags_start, flags_end,
|
255
205
|
out);
|
@@ -272,9 +222,17 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
|
|
272
222
|
return;
|
273
223
|
}
|
274
224
|
#endif
|
225
|
+
#if !defined(BLAKE3_NO_SSE2)
|
226
|
+
if (features & SSE2) {
|
227
|
+
blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
|
228
|
+
increment_counter, flags, flags_start, flags_end,
|
229
|
+
out);
|
230
|
+
return;
|
231
|
+
}
|
232
|
+
#endif
|
275
233
|
#endif
|
276
234
|
|
277
|
-
#if defined(BLAKE3_USE_NEON)
|
235
|
+
#if BLAKE3_USE_NEON == 1
|
278
236
|
blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
|
279
237
|
increment_counter, flags, flags_start, flags_end, out);
|
280
238
|
return;
|
@@ -286,11 +244,12 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
|
|
286
244
|
}
|
287
245
|
|
288
246
|
// The dynamically detected SIMD degree of the current platform.
|
289
|
-
size_t blake3_simd_degree() {
|
247
|
+
size_t blake3_simd_degree(void) {
|
290
248
|
#if defined(IS_X86)
|
291
249
|
const enum cpu_feature features = get_cpu_features();
|
250
|
+
MAYBE_UNUSED(features);
|
292
251
|
#if !defined(BLAKE3_NO_AVX512)
|
293
|
-
if (features & AVX512F) {
|
252
|
+
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
|
294
253
|
return 16;
|
295
254
|
}
|
296
255
|
#endif
|
@@ -304,8 +263,13 @@ size_t blake3_simd_degree() {
|
|
304
263
|
return 4;
|
305
264
|
}
|
306
265
|
#endif
|
266
|
+
#if !defined(BLAKE3_NO_SSE2)
|
267
|
+
if (features & SSE2) {
|
268
|
+
return 4;
|
269
|
+
}
|
270
|
+
#endif
|
307
271
|
#endif
|
308
|
-
#if defined(BLAKE3_USE_NEON)
|
272
|
+
#if BLAKE3_USE_NEON == 1
|
309
273
|
return 4;
|
310
274
|
#endif
|
311
275
|
return 1;
|
@@ -38,6 +38,10 @@ enum blake3_flags {
|
|
38
38
|
#define IS_X86_32
|
39
39
|
#endif
|
40
40
|
|
41
|
+
#if defined(__aarch64__) || defined(_M_ARM64)
|
42
|
+
#define IS_AARCH64
|
43
|
+
#endif
|
44
|
+
|
41
45
|
#if defined(IS_X86)
|
42
46
|
#if defined(_MSC_VER)
|
43
47
|
#include <intrin.h>
|
@@ -45,9 +49,18 @@ enum blake3_flags {
|
|
45
49
|
#include <immintrin.h>
|
46
50
|
#endif
|
47
51
|
|
52
|
+
#if !defined(BLAKE3_USE_NEON)
|
53
|
+
// If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
|
54
|
+
#if defined(IS_AARCH64)
|
55
|
+
#define BLAKE3_USE_NEON 1
|
56
|
+
#else
|
57
|
+
#define BLAKE3_USE_NEON 0
|
58
|
+
#endif
|
59
|
+
#endif
|
60
|
+
|
48
61
|
#if defined(IS_X86)
|
49
62
|
#define MAX_SIMD_DEGREE 16
|
50
|
-
#elif defined(BLAKE3_USE_NEON)
|
63
|
+
#elif BLAKE3_USE_NEON == 1
|
51
64
|
#define MAX_SIMD_DEGREE 4
|
52
65
|
#else
|
53
66
|
#define MAX_SIMD_DEGREE 1
|
@@ -146,6 +159,25 @@ INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
|
|
146
159
|
key_words[7] = load32(&key[7 * 4]);
|
147
160
|
}
|
148
161
|
|
162
|
+
INLINE void store32(void *dst, uint32_t w) {
|
163
|
+
uint8_t *p = (uint8_t *)dst;
|
164
|
+
p[0] = (uint8_t)(w >> 0);
|
165
|
+
p[1] = (uint8_t)(w >> 8);
|
166
|
+
p[2] = (uint8_t)(w >> 16);
|
167
|
+
p[3] = (uint8_t)(w >> 24);
|
168
|
+
}
|
169
|
+
|
170
|
+
INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) {
|
171
|
+
store32(&bytes_out[0 * 4], cv_words[0]);
|
172
|
+
store32(&bytes_out[1 * 4], cv_words[1]);
|
173
|
+
store32(&bytes_out[2 * 4], cv_words[2]);
|
174
|
+
store32(&bytes_out[3 * 4], cv_words[3]);
|
175
|
+
store32(&bytes_out[4 * 4], cv_words[4]);
|
176
|
+
store32(&bytes_out[5 * 4], cv_words[5]);
|
177
|
+
store32(&bytes_out[6 * 4], cv_words[6]);
|
178
|
+
store32(&bytes_out[7 * 4], cv_words[7]);
|
179
|
+
}
|
180
|
+
|
149
181
|
void blake3_compress_in_place(uint32_t cv[8],
|
150
182
|
const uint8_t block[BLAKE3_BLOCK_LEN],
|
151
183
|
uint8_t block_len, uint64_t counter,
|
@@ -161,7 +193,90 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
|
|
161
193
|
bool increment_counter, uint8_t flags,
|
162
194
|
uint8_t flags_start, uint8_t flags_end, uint8_t *out);
|
163
195
|
|
164
|
-
size_t blake3_simd_degree();
|
196
|
+
size_t blake3_simd_degree(void);
|
197
|
+
|
198
|
+
|
199
|
+
// Declarations for implementation-specific functions.
|
200
|
+
void blake3_compress_in_place_portable(uint32_t cv[8],
|
201
|
+
const uint8_t block[BLAKE3_BLOCK_LEN],
|
202
|
+
uint8_t block_len, uint64_t counter,
|
203
|
+
uint8_t flags);
|
204
|
+
|
205
|
+
void blake3_compress_xof_portable(const uint32_t cv[8],
|
206
|
+
const uint8_t block[BLAKE3_BLOCK_LEN],
|
207
|
+
uint8_t block_len, uint64_t counter,
|
208
|
+
uint8_t flags, uint8_t out[64]);
|
209
|
+
|
210
|
+
void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
|
211
|
+
size_t blocks, const uint32_t key[8],
|
212
|
+
uint64_t counter, bool increment_counter,
|
213
|
+
uint8_t flags, uint8_t flags_start,
|
214
|
+
uint8_t flags_end, uint8_t *out);
|
215
|
+
|
216
|
+
#if defined(IS_X86)
|
217
|
+
#if !defined(BLAKE3_NO_SSE2)
|
218
|
+
void blake3_compress_in_place_sse2(uint32_t cv[8],
|
219
|
+
const uint8_t block[BLAKE3_BLOCK_LEN],
|
220
|
+
uint8_t block_len, uint64_t counter,
|
221
|
+
uint8_t flags);
|
222
|
+
void blake3_compress_xof_sse2(const uint32_t cv[8],
|
223
|
+
const uint8_t block[BLAKE3_BLOCK_LEN],
|
224
|
+
uint8_t block_len, uint64_t counter,
|
225
|
+
uint8_t flags, uint8_t out[64]);
|
226
|
+
void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
|
227
|
+
size_t blocks, const uint32_t key[8],
|
228
|
+
uint64_t counter, bool increment_counter,
|
229
|
+
uint8_t flags, uint8_t flags_start,
|
230
|
+
uint8_t flags_end, uint8_t *out);
|
231
|
+
#endif
|
232
|
+
#if !defined(BLAKE3_NO_SSE41)
|
233
|
+
void blake3_compress_in_place_sse41(uint32_t cv[8],
|
234
|
+
const uint8_t block[BLAKE3_BLOCK_LEN],
|
235
|
+
uint8_t block_len, uint64_t counter,
|
236
|
+
uint8_t flags);
|
237
|
+
void blake3_compress_xof_sse41(const uint32_t cv[8],
|
238
|
+
const uint8_t block[BLAKE3_BLOCK_LEN],
|
239
|
+
uint8_t block_len, uint64_t counter,
|
240
|
+
uint8_t flags, uint8_t out[64]);
|
241
|
+
void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
|
242
|
+
size_t blocks, const uint32_t key[8],
|
243
|
+
uint64_t counter, bool increment_counter,
|
244
|
+
uint8_t flags, uint8_t flags_start,
|
245
|
+
uint8_t flags_end, uint8_t *out);
|
246
|
+
#endif
|
247
|
+
#if !defined(BLAKE3_NO_AVX2)
|
248
|
+
void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
|
249
|
+
size_t blocks, const uint32_t key[8],
|
250
|
+
uint64_t counter, bool increment_counter,
|
251
|
+
uint8_t flags, uint8_t flags_start,
|
252
|
+
uint8_t flags_end, uint8_t *out);
|
253
|
+
#endif
|
254
|
+
#if !defined(BLAKE3_NO_AVX512)
|
255
|
+
void blake3_compress_in_place_avx512(uint32_t cv[8],
|
256
|
+
const uint8_t block[BLAKE3_BLOCK_LEN],
|
257
|
+
uint8_t block_len, uint64_t counter,
|
258
|
+
uint8_t flags);
|
259
|
+
|
260
|
+
void blake3_compress_xof_avx512(const uint32_t cv[8],
|
261
|
+
const uint8_t block[BLAKE3_BLOCK_LEN],
|
262
|
+
uint8_t block_len, uint64_t counter,
|
263
|
+
uint8_t flags, uint8_t out[64]);
|
264
|
+
|
265
|
+
void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
|
266
|
+
size_t blocks, const uint32_t key[8],
|
267
|
+
uint64_t counter, bool increment_counter,
|
268
|
+
uint8_t flags, uint8_t flags_start,
|
269
|
+
uint8_t flags_end, uint8_t *out);
|
270
|
+
#endif
|
271
|
+
#endif
|
272
|
+
|
273
|
+
#if BLAKE3_USE_NEON == 1
|
274
|
+
void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
|
275
|
+
size_t blocks, const uint32_t key[8],
|
276
|
+
uint64_t counter, bool increment_counter,
|
277
|
+
uint8_t flags, uint8_t flags_start,
|
278
|
+
uint8_t flags_end, uint8_t *out);
|
279
|
+
#endif
|
165
280
|
|
166
281
|
|
167
282
|
#endif /* BLAKE3_IMPL_H */
|
@@ -2,7 +2,12 @@
|
|
2
2
|
|
3
3
|
#include <arm_neon.h>
|
4
4
|
|
5
|
-
|
5
|
+
#ifdef __ARM_BIG_ENDIAN
|
6
|
+
#error "This implementation only supports little-endian ARM."
|
7
|
+
// It might be that all we need for big-endian support here is to get the loads
|
8
|
+
// and stores right, but step zero would be finding a way to test it in CI.
|
9
|
+
#endif
|
10
|
+
|
6
11
|
INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
|
7
12
|
// vld1q_u32 has alignment requirements. Don't use it.
|
8
13
|
uint32x4_t x;
|
@@ -1,14 +1,6 @@
|
|
1
1
|
#include "blake3_impl.h"
|
2
2
|
#include <string.h>
|
3
3
|
|
4
|
-
INLINE void store32(void *dst, uint32_t w) {
|
5
|
-
uint8_t *p = (uint8_t *)dst;
|
6
|
-
p[0] = (uint8_t)(w >> 0);
|
7
|
-
p[1] = (uint8_t)(w >> 8);
|
8
|
-
p[2] = (uint8_t)(w >> 16);
|
9
|
-
p[3] = (uint8_t)(w >> 24);
|
10
|
-
}
|
11
|
-
|
12
4
|
INLINE uint32_t rotr32(uint32_t w, uint32_t c) {
|
13
5
|
return (w >> c) | (w << (32 - c));
|
14
6
|
}
|
@@ -147,7 +139,7 @@ INLINE void hash_one_portable(const uint8_t *input, size_t blocks,
|
|
147
139
|
blocks -= 1;
|
148
140
|
block_flags = flags;
|
149
141
|
}
|
150
|
-
|
142
|
+
store_cv_words(out, cv);
|
151
143
|
}
|
152
144
|
|
153
145
|
void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
|