uncle_blake3 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE.md +27 -0
- data/README.md +89 -0
- data/ext/Rakefile +55 -0
- data/ext/binding/uncle_blake3.c +41 -0
- data/ext/blake3/c/Makefile.testing +82 -0
- data/ext/blake3/c/README.md +316 -0
- data/ext/blake3/c/blake3.c +616 -0
- data/ext/blake3/c/blake3.h +60 -0
- data/ext/blake3/c/blake3_avx2.c +326 -0
- data/ext/blake3/c/blake3_avx2_x86-64_unix.S +1815 -0
- data/ext/blake3/c/blake3_avx2_x86-64_windows_gnu.S +1817 -0
- data/ext/blake3/c/blake3_avx2_x86-64_windows_msvc.asm +1828 -0
- data/ext/blake3/c/blake3_avx512.c +1207 -0
- data/ext/blake3/c/blake3_avx512_x86-64_unix.S +2585 -0
- data/ext/blake3/c/blake3_avx512_x86-64_windows_gnu.S +2615 -0
- data/ext/blake3/c/blake3_avx512_x86-64_windows_msvc.asm +2634 -0
- data/ext/blake3/c/blake3_dispatch.c +276 -0
- data/ext/blake3/c/blake3_impl.h +282 -0
- data/ext/blake3/c/blake3_neon.c +351 -0
- data/ext/blake3/c/blake3_portable.c +160 -0
- data/ext/blake3/c/blake3_sse2.c +566 -0
- data/ext/blake3/c/blake3_sse2_x86-64_unix.S +2291 -0
- data/ext/blake3/c/blake3_sse2_x86-64_windows_gnu.S +2332 -0
- data/ext/blake3/c/blake3_sse2_x86-64_windows_msvc.asm +2350 -0
- data/ext/blake3/c/blake3_sse41.c +560 -0
- data/ext/blake3/c/blake3_sse41_x86-64_unix.S +2028 -0
- data/ext/blake3/c/blake3_sse41_x86-64_windows_gnu.S +2069 -0
- data/ext/blake3/c/blake3_sse41_x86-64_windows_msvc.asm +2089 -0
- data/ext/blake3/c/example.c +37 -0
- data/ext/blake3/c/main.c +166 -0
- data/ext/blake3/c/test.py +97 -0
- data/lib/uncle_blake3/binding.rb +20 -0
- data/lib/uncle_blake3/build/loader.rb +40 -0
- data/lib/uncle_blake3/build/platform.rb +37 -0
- data/lib/uncle_blake3/build.rb +4 -0
- data/lib/uncle_blake3/digest.rb +119 -0
- data/lib/uncle_blake3/version.rb +5 -0
- data/lib/uncle_blake3.rb +7 -0
- metadata +112 -0
@@ -0,0 +1,276 @@
|
|
1
|
+
#include <stdbool.h>
|
2
|
+
#include <stddef.h>
|
3
|
+
#include <stdint.h>
|
4
|
+
|
5
|
+
#include "blake3_impl.h"
|
6
|
+
|
7
|
+
#if defined(IS_X86)
|
8
|
+
#if defined(_MSC_VER)
|
9
|
+
#include <intrin.h>
|
10
|
+
#elif defined(__GNUC__)
|
11
|
+
#include <immintrin.h>
|
12
|
+
#else
|
13
|
+
#error "Unimplemented!"
|
14
|
+
#endif
|
15
|
+
#endif
|
16
|
+
|
17
|
+
#define MAYBE_UNUSED(x) (void)((x))
|
18
|
+
|
19
|
+
#if defined(IS_X86)
|
20
|
+
/* Read extended control register 0 (XGETBV with ECX = 0) and return it as a
 * single 64-bit value, EDX in the high half and EAX in the low half.
 * The caller in this file only invokes it after CPUID reports OSXSAVE. */
static uint64_t xgetbv(void) {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  uint32_t eax = 0, edx = 0;
  __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));
  return ((uint64_t)edx << 32) | eax;
#endif
}
|
29
|
+
|
30
|
+
/* Run CPUID for leaf `id` and store EAX, EBX, ECX, EDX in out[0..3]. */
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
  __cpuid((int *)out, id);
#else
  uint32_t a, b, c, d;
#if defined(__i386__) || defined(_M_IX86)
  /* 32-bit build: copy EBX into a scratch register first and swap it back
   * after CPUID, so the EBX result ends up in `b` and EBX is restored. */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(a), "=r"(b), "=c"(c), "=d"(d)
                       : "a"(id));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
                       : "a"(id));
#endif
  out[0] = a;
  out[1] = b;
  out[2] = c;
  out[3] = d;
#endif
}
|
45
|
+
|
46
|
+
/* Run CPUID for leaf `id`, sub-leaf `sid` (passed in ECX), and store
 * EAX, EBX, ECX, EDX in out[0..3]. */
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
  __cpuidex((int *)out, id, sid);
#else
  uint32_t a, b, c, d;
#if defined(__i386__) || defined(_M_IX86)
  /* 32-bit build: copy EBX into a scratch register first and swap it back
   * after CPUID, so the EBX result ends up in `b` and EBX is restored. */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(a), "=r"(b), "=c"(c), "=d"(d)
                       : "a"(id), "c"(sid));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
                       : "a"(id), "c"(sid));
#endif
  out[0] = a;
  out[1] = b;
  out[2] = c;
  out[3] = d;
#endif
}
|
61
|
+
|
62
|
+
#endif
|
63
|
+
|
64
|
+
/* One bit per x86 SIMD extension the dispatcher can test for. Values are
 * combined into a mask with bitwise OR. */
enum cpu_feature {
  SSE2 = 0x01,
  SSSE3 = 0x02,
  SSE41 = 0x04,
  AVX = 0x08,
  AVX2 = 0x10,
  AVX512F = 0x20,
  AVX512VL = 0x40,
  /* ... */
  /* Bit 30: reserved marker value, distinct from any feature combination. */
  UNDEFINED = 0x40000000
};
|
75
|
+
|
76
|
+
#if !defined(BLAKE3_TESTING)
|
77
|
+
static /* Allow the variable to be controlled manually for testing */
|
78
|
+
#endif
|
79
|
+
enum cpu_feature g_cpu_features = UNDEFINED;
|
80
|
+
|
81
|
+
#if !defined(BLAKE3_TESTING)
|
82
|
+
static
|
83
|
+
#endif
|
84
|
+
enum cpu_feature
|
85
|
+
get_cpu_features() {
|
86
|
+
|
87
|
+
if (g_cpu_features != UNDEFINED) {
|
88
|
+
return g_cpu_features;
|
89
|
+
} else {
|
90
|
+
#if defined(IS_X86)
|
91
|
+
uint32_t regs[4] = {0};
|
92
|
+
uint32_t *eax = ®s[0], *ebx = ®s[1], *ecx = ®s[2], *edx = ®s[3];
|
93
|
+
(void)edx;
|
94
|
+
enum cpu_feature features = 0;
|
95
|
+
cpuid(regs, 0);
|
96
|
+
const int max_id = *eax;
|
97
|
+
cpuid(regs, 1);
|
98
|
+
#if defined(__amd64__) || defined(_M_X64)
|
99
|
+
features |= SSE2;
|
100
|
+
#else
|
101
|
+
if (*edx & (1UL << 26))
|
102
|
+
features |= SSE2;
|
103
|
+
#endif
|
104
|
+
if (*ecx & (1UL << 0))
|
105
|
+
features |= SSSE3;
|
106
|
+
if (*ecx & (1UL << 19))
|
107
|
+
features |= SSE41;
|
108
|
+
|
109
|
+
if (*ecx & (1UL << 27)) { // OSXSAVE
|
110
|
+
const uint64_t mask = xgetbv();
|
111
|
+
if ((mask & 6) == 6) { // SSE and AVX states
|
112
|
+
if (*ecx & (1UL << 28))
|
113
|
+
features |= AVX;
|
114
|
+
if (max_id >= 7) {
|
115
|
+
cpuidex(regs, 7, 0);
|
116
|
+
if (*ebx & (1UL << 5))
|
117
|
+
features |= AVX2;
|
118
|
+
if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
|
119
|
+
if (*ebx & (1UL << 31))
|
120
|
+
features |= AVX512VL;
|
121
|
+
if (*ebx & (1UL << 16))
|
122
|
+
features |= AVX512F;
|
123
|
+
}
|
124
|
+
}
|
125
|
+
}
|
126
|
+
}
|
127
|
+
g_cpu_features = features;
|
128
|
+
return features;
|
129
|
+
#else
|
130
|
+
/* How to detect NEON? */
|
131
|
+
return 0;
|
132
|
+
#endif
|
133
|
+
}
|
134
|
+
}
|
135
|
+
|
136
|
+
void blake3_compress_in_place(uint32_t cv[8],
|
137
|
+
const uint8_t block[BLAKE3_BLOCK_LEN],
|
138
|
+
uint8_t block_len, uint64_t counter,
|
139
|
+
uint8_t flags) {
|
140
|
+
#if defined(IS_X86)
|
141
|
+
const enum cpu_feature features = get_cpu_features();
|
142
|
+
MAYBE_UNUSED(features);
|
143
|
+
#if !defined(BLAKE3_NO_AVX512)
|
144
|
+
if (features & AVX512VL) {
|
145
|
+
blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
|
146
|
+
return;
|
147
|
+
}
|
148
|
+
#endif
|
149
|
+
#if !defined(BLAKE3_NO_SSE41)
|
150
|
+
if (features & SSE41) {
|
151
|
+
blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
|
152
|
+
return;
|
153
|
+
}
|
154
|
+
#endif
|
155
|
+
#if !defined(BLAKE3_NO_SSE2)
|
156
|
+
if (features & SSE2) {
|
157
|
+
blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
|
158
|
+
return;
|
159
|
+
}
|
160
|
+
#endif
|
161
|
+
#endif
|
162
|
+
blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
|
163
|
+
}
|
164
|
+
|
165
|
+
void blake3_compress_xof(const uint32_t cv[8],
|
166
|
+
const uint8_t block[BLAKE3_BLOCK_LEN],
|
167
|
+
uint8_t block_len, uint64_t counter, uint8_t flags,
|
168
|
+
uint8_t out[64]) {
|
169
|
+
#if defined(IS_X86)
|
170
|
+
const enum cpu_feature features = get_cpu_features();
|
171
|
+
MAYBE_UNUSED(features);
|
172
|
+
#if !defined(BLAKE3_NO_AVX512)
|
173
|
+
if (features & AVX512VL) {
|
174
|
+
blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
|
175
|
+
return;
|
176
|
+
}
|
177
|
+
#endif
|
178
|
+
#if !defined(BLAKE3_NO_SSE41)
|
179
|
+
if (features & SSE41) {
|
180
|
+
blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
|
181
|
+
return;
|
182
|
+
}
|
183
|
+
#endif
|
184
|
+
#if !defined(BLAKE3_NO_SSE2)
|
185
|
+
if (features & SSE2) {
|
186
|
+
blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
|
187
|
+
return;
|
188
|
+
}
|
189
|
+
#endif
|
190
|
+
#endif
|
191
|
+
blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
|
192
|
+
}
|
193
|
+
|
194
|
+
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
|
195
|
+
size_t blocks, const uint32_t key[8], uint64_t counter,
|
196
|
+
bool increment_counter, uint8_t flags,
|
197
|
+
uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
|
198
|
+
#if defined(IS_X86)
|
199
|
+
const enum cpu_feature features = get_cpu_features();
|
200
|
+
MAYBE_UNUSED(features);
|
201
|
+
#if !defined(BLAKE3_NO_AVX512)
|
202
|
+
if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
|
203
|
+
blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
|
204
|
+
increment_counter, flags, flags_start, flags_end,
|
205
|
+
out);
|
206
|
+
return;
|
207
|
+
}
|
208
|
+
#endif
|
209
|
+
#if !defined(BLAKE3_NO_AVX2)
|
210
|
+
if (features & AVX2) {
|
211
|
+
blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
|
212
|
+
increment_counter, flags, flags_start, flags_end,
|
213
|
+
out);
|
214
|
+
return;
|
215
|
+
}
|
216
|
+
#endif
|
217
|
+
#if !defined(BLAKE3_NO_SSE41)
|
218
|
+
if (features & SSE41) {
|
219
|
+
blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
|
220
|
+
increment_counter, flags, flags_start, flags_end,
|
221
|
+
out);
|
222
|
+
return;
|
223
|
+
}
|
224
|
+
#endif
|
225
|
+
#if !defined(BLAKE3_NO_SSE2)
|
226
|
+
if (features & SSE2) {
|
227
|
+
blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
|
228
|
+
increment_counter, flags, flags_start, flags_end,
|
229
|
+
out);
|
230
|
+
return;
|
231
|
+
}
|
232
|
+
#endif
|
233
|
+
#endif
|
234
|
+
|
235
|
+
#if BLAKE3_USE_NEON == 1
|
236
|
+
blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
|
237
|
+
increment_counter, flags, flags_start, flags_end, out);
|
238
|
+
return;
|
239
|
+
#endif
|
240
|
+
|
241
|
+
blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
|
242
|
+
increment_counter, flags, flags_start, flags_end,
|
243
|
+
out);
|
244
|
+
}
|
245
|
+
|
246
|
+
// SIMD degree of the current platform, detected at run time: 16 with AVX-512
// (both AVX512F and AVX512VL), 8 with AVX2, 4 with SSE4.1 or SSE2, 4 when
// BLAKE3_USE_NEON == 1 and no x86 path applies, and 1 otherwise.
size_t blake3_simd_degree(void) {
  size_t degree = 1;

#if BLAKE3_USE_NEON == 1
  degree = 4;
#endif

#if defined(IS_X86)
  const enum cpu_feature features = get_cpu_features();
  MAYBE_UNUSED(features);
  // Tested from least to most capable: the last match wins.
#if !defined(BLAKE3_NO_SSE2)
  if (features & SSE2) {
    degree = 4;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (features & SSE41) {
    degree = 4;
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if (features & AVX2) {
    degree = 8;
  }
#endif
#if !defined(BLAKE3_NO_AVX512)
  if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
    degree = 16;
  }
#endif
#endif

  return degree;
}
|
@@ -0,0 +1,282 @@
|
|
1
|
+
#ifndef BLAKE3_IMPL_H
|
2
|
+
#define BLAKE3_IMPL_H
|
3
|
+
|
4
|
+
#include <assert.h>
|
5
|
+
#include <stdbool.h>
|
6
|
+
#include <stddef.h>
|
7
|
+
#include <stdint.h>
|
8
|
+
#include <string.h>
|
9
|
+
|
10
|
+
#include "blake3.h"
|
11
|
+
|
12
|
+
// Internal flags: one bit each, combined with bitwise OR.
enum blake3_flags {
  CHUNK_START = 0x01,
  CHUNK_END = 0x02,
  PARENT = 0x04,
  ROOT = 0x08,
  KEYED_HASH = 0x10,
  DERIVE_KEY_CONTEXT = 0x20,
  DERIVE_KEY_MATERIAL = 0x40,
};
|
22
|
+
|
23
|
+
// This C implementation tries to support recent versions of GCC, Clang, and
|
24
|
+
// MSVC.
|
25
|
+
#if defined(_MSC_VER)
|
26
|
+
#define INLINE static __forceinline
|
27
|
+
#else
|
28
|
+
#define INLINE static inline __attribute__((always_inline))
|
29
|
+
#endif
|
30
|
+
|
31
|
+
#if defined(__x86_64__) || defined(_M_X64)
|
32
|
+
#define IS_X86
|
33
|
+
#define IS_X86_64
|
34
|
+
#endif
|
35
|
+
|
36
|
+
#if defined(__i386__) || defined(_M_IX86)
|
37
|
+
#define IS_X86
|
38
|
+
#define IS_X86_32
|
39
|
+
#endif
|
40
|
+
|
41
|
+
#if defined(__aarch64__) || defined(_M_ARM64)
|
42
|
+
#define IS_AARCH64
|
43
|
+
#endif
|
44
|
+
|
45
|
+
#if defined(IS_X86)
|
46
|
+
#if defined(_MSC_VER)
|
47
|
+
#include <intrin.h>
|
48
|
+
#endif
|
49
|
+
#include <immintrin.h>
|
50
|
+
#endif
|
51
|
+
|
52
|
+
#if !defined(BLAKE3_USE_NEON)
|
53
|
+
// If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
|
54
|
+
#if defined(IS_AARCH64)
|
55
|
+
#define BLAKE3_USE_NEON 1
|
56
|
+
#else
|
57
|
+
#define BLAKE3_USE_NEON 0
|
58
|
+
#endif
|
59
|
+
#endif
|
60
|
+
|
61
|
+
#if defined(IS_X86)
|
62
|
+
#define MAX_SIMD_DEGREE 16
|
63
|
+
#elif BLAKE3_USE_NEON == 1
|
64
|
+
#define MAX_SIMD_DEGREE 4
|
65
|
+
#else
|
66
|
+
#define MAX_SIMD_DEGREE 1
|
67
|
+
#endif
|
68
|
+
|
69
|
+
// There are some places where we want a static size that's equal to the
|
70
|
+
// MAX_SIMD_DEGREE, but also at least 2.
|
71
|
+
#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)
|
72
|
+
|
73
|
+
// Eight constant 32-bit words (presumably the hash's initialization vector,
// going by the name).
static const uint32_t IV[8] = {
    0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
    0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL,
};
|
76
|
+
|
77
|
+
// Seven rows, each a permutation of the indices 0..15. Row 0 is the identity
// and every later row is the previous row re-indexed through row 1.
static const uint8_t MSG_SCHEDULE[7][16] = {
    { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15},
    { 2,  6,  3, 10,  7,  0,  4, 13,  1, 11, 12,  5,  9, 14, 15,  8},
    { 3,  4, 10, 12, 13,  2,  7, 14,  6,  5,  9,  0, 11, 15,  8,  1},
    {10,  7, 12,  9, 14,  3, 13, 15,  4,  0, 11,  2,  5,  8,  1,  6},
    {12, 13,  9, 11, 15, 10, 14,  8,  7,  2,  5,  3,  0,  1,  6,  4},
    { 9, 14, 11,  5,  8, 12, 15,  1, 13,  3,  0, 10,  2,  6,  4,  7},
    {11, 15,  5,  0,  1,  9,  8,  6, 14, 10,  2, 12,  3,  4,  7, 13},
};
|
86
|
+
|
87
|
+
/* Return the zero-based index of the most significant set bit of x.
 * x is assumed to be nonzero. */
static unsigned int highest_one(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
  const int leading_zeros = __builtin_clzll(x);
  return 63 ^ leading_zeros;
#elif defined(_MSC_VER) && defined(IS_X86_64)
  unsigned long bit;
  _BitScanReverse64(&bit, x);
  return bit;
#elif defined(_MSC_VER) && defined(IS_X86_32)
  /* No 64-bit scan here: look at the upper half first, then the lower. */
  unsigned long bit;
  if (x >> 32) {
    _BitScanReverse(&bit, (unsigned long)(x >> 32));
    return 32 + bit;
  }
  _BitScanReverse(&bit, (unsigned long)x);
  return bit;
#else
  /* Binary search: halve the window (32, 16, 8, 4, 2, 1) and move into the
   * upper part whenever it holds a set bit. */
  unsigned int index = 0;
  for (unsigned int width = 32; width != 0; width >>= 1) {
    if (x >> width) {
      x >>= width;
      index += width;
    }
  }
  return index;
#endif
}
|
117
|
+
|
118
|
+
// Count the number of 1 bits.
|
119
|
+
INLINE unsigned int popcnt(uint64_t x) {
|
120
|
+
#if defined(__GNUC__) || defined(__clang__)
|
121
|
+
return __builtin_popcountll(x);
|
122
|
+
#else
|
123
|
+
unsigned int count = 0;
|
124
|
+
while (x != 0) {
|
125
|
+
count += 1;
|
126
|
+
x &= x - 1;
|
127
|
+
}
|
128
|
+
return count;
|
129
|
+
#endif
|
130
|
+
}
|
131
|
+
|
132
|
+
// Largest power of two less than or equal to x. As a special case, returns 1
|
133
|
+
// when x is 0.
|
134
|
+
INLINE uint64_t round_down_to_power_of_2(uint64_t x) {
|
135
|
+
return 1ULL << highest_one(x | 1);
|
136
|
+
}
|
137
|
+
|
138
|
+
// Low 32 bits of the 64-bit counter.
INLINE uint32_t counter_low(uint64_t counter) {
  return (uint32_t)(counter & 0xFFFFFFFFu);
}
|
139
|
+
|
140
|
+
// High 32 bits of the 64-bit counter.
INLINE uint32_t counter_high(uint64_t counter) {
  const uint64_t upper = counter >> 32;
  return (uint32_t)upper;
}
|
143
|
+
|
144
|
+
// Read four bytes at src as a little-endian 32-bit word. Access is
// byte-by-byte, so src needs no particular alignment.
INLINE uint32_t load32(const void *src) {
  const uint8_t *bytes = (const uint8_t *)src;
  uint32_t word = 0;
  // Start with the most significant byte and shift it up as others arrive.
  for (int i = 3; i >= 0; i--) {
    word = (word << 8) | (uint32_t)bytes[i];
  }
  return word;
}
|
149
|
+
|
150
|
+
// Decode the first 32 bytes of key into eight 32-bit words using load32,
// four bytes per word, in order.
INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
                           uint32_t key_words[8]) {
  for (size_t word = 0; word < 8; word++) {
    key_words[word] = load32(&key[word * 4]);
  }
}
|
161
|
+
|
162
|
+
// Write w to the four bytes at dst in little-endian order. Access is
// byte-by-byte, so dst needs no particular alignment.
INLINE void store32(void *dst, uint32_t w) {
  uint8_t *bytes = (uint8_t *)dst;
  for (unsigned int i = 0; i < 4; i++) {
    bytes[i] = (uint8_t)(w >> (8 * i));
  }
}
|
169
|
+
|
170
|
+
// Encode the eight 32-bit words of cv_words into bytes_out using store32,
// four bytes per word, in order (32 bytes written in total).
// cv_words is only read, so it is const-qualified; callers passing a
// non-const array are unaffected.
INLINE void store_cv_words(uint8_t bytes_out[32], const uint32_t cv_words[8]) {
  for (size_t word = 0; word < 8; word++) {
    store32(&bytes_out[word * 4], cv_words[word]);
  }
}
|
180
|
+
|
181
|
+
void blake3_compress_in_place(uint32_t cv[8],
|
182
|
+
const uint8_t block[BLAKE3_BLOCK_LEN],
|
183
|
+
uint8_t block_len, uint64_t counter,
|
184
|
+
uint8_t flags);
|
185
|
+
|
186
|
+
void blake3_compress_xof(const uint32_t cv[8],
|
187
|
+
const uint8_t block[BLAKE3_BLOCK_LEN],
|
188
|
+
uint8_t block_len, uint64_t counter, uint8_t flags,
|
189
|
+
uint8_t out[64]);
|
190
|
+
|
191
|
+
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
|
192
|
+
size_t blocks, const uint32_t key[8], uint64_t counter,
|
193
|
+
bool increment_counter, uint8_t flags,
|
194
|
+
uint8_t flags_start, uint8_t flags_end, uint8_t *out);
|
195
|
+
|
196
|
+
size_t blake3_simd_degree(void);
|
197
|
+
|
198
|
+
|
199
|
+
// Declarations for implementation-specific functions.
|
200
|
+
void blake3_compress_in_place_portable(uint32_t cv[8],
|
201
|
+
const uint8_t block[BLAKE3_BLOCK_LEN],
|
202
|
+
uint8_t block_len, uint64_t counter,
|
203
|
+
uint8_t flags);
|
204
|
+
|
205
|
+
void blake3_compress_xof_portable(const uint32_t cv[8],
|
206
|
+
const uint8_t block[BLAKE3_BLOCK_LEN],
|
207
|
+
uint8_t block_len, uint64_t counter,
|
208
|
+
uint8_t flags, uint8_t out[64]);
|
209
|
+
|
210
|
+
void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
|
211
|
+
size_t blocks, const uint32_t key[8],
|
212
|
+
uint64_t counter, bool increment_counter,
|
213
|
+
uint8_t flags, uint8_t flags_start,
|
214
|
+
uint8_t flags_end, uint8_t *out);
|
215
|
+
|
216
|
+
#if defined(IS_X86)
|
217
|
+
#if !defined(BLAKE3_NO_SSE2)
|
218
|
+
void blake3_compress_in_place_sse2(uint32_t cv[8],
|
219
|
+
const uint8_t block[BLAKE3_BLOCK_LEN],
|
220
|
+
uint8_t block_len, uint64_t counter,
|
221
|
+
uint8_t flags);
|
222
|
+
void blake3_compress_xof_sse2(const uint32_t cv[8],
|
223
|
+
const uint8_t block[BLAKE3_BLOCK_LEN],
|
224
|
+
uint8_t block_len, uint64_t counter,
|
225
|
+
uint8_t flags, uint8_t out[64]);
|
226
|
+
void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
|
227
|
+
size_t blocks, const uint32_t key[8],
|
228
|
+
uint64_t counter, bool increment_counter,
|
229
|
+
uint8_t flags, uint8_t flags_start,
|
230
|
+
uint8_t flags_end, uint8_t *out);
|
231
|
+
#endif
|
232
|
+
#if !defined(BLAKE3_NO_SSE41)
|
233
|
+
void blake3_compress_in_place_sse41(uint32_t cv[8],
|
234
|
+
const uint8_t block[BLAKE3_BLOCK_LEN],
|
235
|
+
uint8_t block_len, uint64_t counter,
|
236
|
+
uint8_t flags);
|
237
|
+
void blake3_compress_xof_sse41(const uint32_t cv[8],
|
238
|
+
const uint8_t block[BLAKE3_BLOCK_LEN],
|
239
|
+
uint8_t block_len, uint64_t counter,
|
240
|
+
uint8_t flags, uint8_t out[64]);
|
241
|
+
void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
|
242
|
+
size_t blocks, const uint32_t key[8],
|
243
|
+
uint64_t counter, bool increment_counter,
|
244
|
+
uint8_t flags, uint8_t flags_start,
|
245
|
+
uint8_t flags_end, uint8_t *out);
|
246
|
+
#endif
|
247
|
+
#if !defined(BLAKE3_NO_AVX2)
|
248
|
+
void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
|
249
|
+
size_t blocks, const uint32_t key[8],
|
250
|
+
uint64_t counter, bool increment_counter,
|
251
|
+
uint8_t flags, uint8_t flags_start,
|
252
|
+
uint8_t flags_end, uint8_t *out);
|
253
|
+
#endif
|
254
|
+
#if !defined(BLAKE3_NO_AVX512)
|
255
|
+
void blake3_compress_in_place_avx512(uint32_t cv[8],
|
256
|
+
const uint8_t block[BLAKE3_BLOCK_LEN],
|
257
|
+
uint8_t block_len, uint64_t counter,
|
258
|
+
uint8_t flags);
|
259
|
+
|
260
|
+
void blake3_compress_xof_avx512(const uint32_t cv[8],
|
261
|
+
const uint8_t block[BLAKE3_BLOCK_LEN],
|
262
|
+
uint8_t block_len, uint64_t counter,
|
263
|
+
uint8_t flags, uint8_t out[64]);
|
264
|
+
|
265
|
+
void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
|
266
|
+
size_t blocks, const uint32_t key[8],
|
267
|
+
uint64_t counter, bool increment_counter,
|
268
|
+
uint8_t flags, uint8_t flags_start,
|
269
|
+
uint8_t flags_end, uint8_t *out);
|
270
|
+
#endif
|
271
|
+
#endif
|
272
|
+
|
273
|
+
#if BLAKE3_USE_NEON == 1
|
274
|
+
void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
|
275
|
+
size_t blocks, const uint32_t key[8],
|
276
|
+
uint64_t counter, bool increment_counter,
|
277
|
+
uint8_t flags, uint8_t flags_start,
|
278
|
+
uint8_t flags_end, uint8_t *out);
|
279
|
+
#endif
|
280
|
+
|
281
|
+
|
282
|
+
#endif /* BLAKE3_IMPL_H */
|