uncle_blake3 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.md +27 -0
  3. data/README.md +89 -0
  4. data/ext/Rakefile +55 -0
  5. data/ext/binding/uncle_blake3.c +41 -0
  6. data/ext/blake3/c/Makefile.testing +82 -0
  7. data/ext/blake3/c/README.md +316 -0
  8. data/ext/blake3/c/blake3.c +616 -0
  9. data/ext/blake3/c/blake3.h +60 -0
  10. data/ext/blake3/c/blake3_avx2.c +326 -0
  11. data/ext/blake3/c/blake3_avx2_x86-64_unix.S +1815 -0
  12. data/ext/blake3/c/blake3_avx2_x86-64_windows_gnu.S +1817 -0
  13. data/ext/blake3/c/blake3_avx2_x86-64_windows_msvc.asm +1828 -0
  14. data/ext/blake3/c/blake3_avx512.c +1207 -0
  15. data/ext/blake3/c/blake3_avx512_x86-64_unix.S +2585 -0
  16. data/ext/blake3/c/blake3_avx512_x86-64_windows_gnu.S +2615 -0
  17. data/ext/blake3/c/blake3_avx512_x86-64_windows_msvc.asm +2634 -0
  18. data/ext/blake3/c/blake3_dispatch.c +276 -0
  19. data/ext/blake3/c/blake3_impl.h +282 -0
  20. data/ext/blake3/c/blake3_neon.c +351 -0
  21. data/ext/blake3/c/blake3_portable.c +160 -0
  22. data/ext/blake3/c/blake3_sse2.c +566 -0
  23. data/ext/blake3/c/blake3_sse2_x86-64_unix.S +2291 -0
  24. data/ext/blake3/c/blake3_sse2_x86-64_windows_gnu.S +2332 -0
  25. data/ext/blake3/c/blake3_sse2_x86-64_windows_msvc.asm +2350 -0
  26. data/ext/blake3/c/blake3_sse41.c +560 -0
  27. data/ext/blake3/c/blake3_sse41_x86-64_unix.S +2028 -0
  28. data/ext/blake3/c/blake3_sse41_x86-64_windows_gnu.S +2069 -0
  29. data/ext/blake3/c/blake3_sse41_x86-64_windows_msvc.asm +2089 -0
  30. data/ext/blake3/c/example.c +37 -0
  31. data/ext/blake3/c/main.c +166 -0
  32. data/ext/blake3/c/test.py +97 -0
  33. data/lib/uncle_blake3/binding.rb +20 -0
  34. data/lib/uncle_blake3/build/loader.rb +40 -0
  35. data/lib/uncle_blake3/build/platform.rb +37 -0
  36. data/lib/uncle_blake3/build.rb +4 -0
  37. data/lib/uncle_blake3/digest.rb +119 -0
  38. data/lib/uncle_blake3/version.rb +5 -0
  39. data/lib/uncle_blake3.rb +7 -0
  40. metadata +112 -0
@@ -0,0 +1,276 @@
1
+ #include <stdbool.h>
2
+ #include <stddef.h>
3
+ #include <stdint.h>
4
+
5
+ #include "blake3_impl.h"
6
+
7
+ #if defined(IS_X86)
8
+ #if defined(_MSC_VER)
9
+ #include <intrin.h>
10
+ #elif defined(__GNUC__)
11
+ #include <immintrin.h>
12
+ #else
13
+ #error "Unimplemented!"
14
+ #endif
15
+ #endif
16
+
17
+ #define MAYBE_UNUSED(x) (void)((x))
18
+
19
+ #if defined(IS_X86)
20
/*
 * Read extended control register 0 with the XGETBV instruction and return
 * its 64-bit value (EDX holds the high half, EAX the low half).
 *
 * The only caller in this file invokes it after CPUID has reported OSXSAVE.
 * NOTE(review): executing XGETBV without that OS support is expected to
 * fault -- confirm against the Intel SDM before adding new callers.
 */
static uint64_t xgetbv(void) {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  uint32_t eax = 0, edx = 0;
  /* ECX = 0 selects the register to read. */
  __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));
  return ((uint64_t)edx << 32) | eax;
#endif
}
29
+
30
/* Run the CPUID instruction for leaf `id` and store the resulting EAX, EBX,
 * ECX and EDX values in out[0], out[1], out[2] and out[3] respectively.
 * The GCC-style asm paths pass only `id` (in EAX) as input; cpuidex() is the
 * variant that also supplies a sub-leaf in ECX. */
+ static void cpuid(uint32_t out[4], uint32_t id) {
31
+ #if defined(_MSC_VER)
32
+ __cpuid((int *)out, id); /* compiler intrinsic; writes the four ints of out */
33
+ #elif defined(__i386__) || defined(_M_IX86)
/* 32-bit path: EBX is never named as an asm output. It is saved into scratch
 * operand %1 before CPUID and swapped back afterwards.
 * NOTE(review): presumably because EBX can be reserved (e.g. as the PIC
 * base register) on 32-bit x86 -- confirm. */
34
+ __asm__ __volatile__("movl %%ebx, %1\n" /* stash the caller's EBX in %1 */
35
+ "cpuid\n"
36
+ "xchgl %1, %%ebx\n" /* %1 <- CPUID's EBX, EBX <- stashed value */
37
+ : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
38
+ : "a"(id));
39
+ #else
40
+ __asm__ __volatile__("cpuid\n"
41
+ : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
42
+ : "a"(id));
43
+ #endif
44
+ }
45
+
46
/* Run the CPUID instruction for leaf `id` with sub-leaf `sid` (passed in ECX)
 * and store the resulting EAX, EBX, ECX and EDX values in out[0], out[1],
 * out[2] and out[3] respectively. */
+ static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
47
+ #if defined(_MSC_VER)
48
+ __cpuidex((int *)out, id, sid); /* compiler intrinsic; writes the four ints of out */
49
+ #elif defined(__i386__) || defined(_M_IX86)
/* 32-bit path: EBX is never named as an asm output. It is saved into scratch
 * operand %1 before CPUID and swapped back afterwards.
 * NOTE(review): presumably because EBX can be reserved (e.g. as the PIC
 * base register) on 32-bit x86 -- confirm. */
50
+ __asm__ __volatile__("movl %%ebx, %1\n" /* stash the caller's EBX in %1 */
51
+ "cpuid\n"
52
+ "xchgl %1, %%ebx\n" /* %1 <- CPUID's EBX, EBX <- stashed value */
53
+ : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
54
+ : "a"(id), "c"(sid));
55
+ #else
56
+ __asm__ __volatile__("cpuid\n"
57
+ : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
58
+ : "a"(id), "c"(sid));
59
+ #endif
60
+ }
61
+
62
+ #endif
63
+
64
/* Bit flags for the x86 instruction-set extensions the dispatcher can detect.
 * Values are single bits so that several flags can be OR-ed together. */
enum cpu_feature {
  SSE2      = 0x00000001, /* bit 0 */
  SSSE3     = 0x00000002, /* bit 1 */
  SSE41     = 0x00000004, /* bit 2 */
  AVX       = 0x00000008, /* bit 3 */
  AVX2      = 0x00000010, /* bit 4 */
  AVX512F   = 0x00000020, /* bit 5 */
  AVX512VL  = 0x00000040, /* bit 6 */
  /* ... */
  UNDEFINED = 0x40000000  /* bit 30: initial value of g_cpu_features */
};
75
+
76
/* File-scope cache of the detected CPU feature flags. It starts as UNDEFINED;
 * get_cpu_features() overwrites it with the detected flags on x86 builds and
 * returns the cached value on later calls. It has internal linkage unless
 * BLAKE3_TESTING is defined. */
+ #if !defined(BLAKE3_TESTING)
77
+ static /* Allow the variable to be controlled manually for testing */
78
+ #endif
79
+ enum cpu_feature g_cpu_features = UNDEFINED;
80
+
81
+ #if !defined(BLAKE3_TESTING)
82
+ static
83
+ #endif
84
+ enum cpu_feature
85
+ get_cpu_features() {
86
+
87
+ if (g_cpu_features != UNDEFINED) {
88
+ return g_cpu_features;
89
+ } else {
90
+ #if defined(IS_X86)
91
+ uint32_t regs[4] = {0};
92
+ uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
93
+ (void)edx;
94
+ enum cpu_feature features = 0;
95
+ cpuid(regs, 0);
96
+ const int max_id = *eax;
97
+ cpuid(regs, 1);
98
+ #if defined(__amd64__) || defined(_M_X64)
99
+ features |= SSE2;
100
+ #else
101
+ if (*edx & (1UL << 26))
102
+ features |= SSE2;
103
+ #endif
104
+ if (*ecx & (1UL << 0))
105
+ features |= SSSE3;
106
+ if (*ecx & (1UL << 19))
107
+ features |= SSE41;
108
+
109
+ if (*ecx & (1UL << 27)) { // OSXSAVE
110
+ const uint64_t mask = xgetbv();
111
+ if ((mask & 6) == 6) { // SSE and AVX states
112
+ if (*ecx & (1UL << 28))
113
+ features |= AVX;
114
+ if (max_id >= 7) {
115
+ cpuidex(regs, 7, 0);
116
+ if (*ebx & (1UL << 5))
117
+ features |= AVX2;
118
+ if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
119
+ if (*ebx & (1UL << 31))
120
+ features |= AVX512VL;
121
+ if (*ebx & (1UL << 16))
122
+ features |= AVX512F;
123
+ }
124
+ }
125
+ }
126
+ }
127
+ g_cpu_features = features;
128
+ return features;
129
+ #else
130
+ /* How to detect NEON? */
131
+ return 0;
132
+ #endif
133
+ }
134
+ }
135
+
136
+ void blake3_compress_in_place(uint32_t cv[8],
137
+ const uint8_t block[BLAKE3_BLOCK_LEN],
138
+ uint8_t block_len, uint64_t counter,
139
+ uint8_t flags) {
140
+ #if defined(IS_X86)
141
+ const enum cpu_feature features = get_cpu_features();
142
+ MAYBE_UNUSED(features);
143
+ #if !defined(BLAKE3_NO_AVX512)
144
+ if (features & AVX512VL) {
145
+ blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
146
+ return;
147
+ }
148
+ #endif
149
+ #if !defined(BLAKE3_NO_SSE41)
150
+ if (features & SSE41) {
151
+ blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
152
+ return;
153
+ }
154
+ #endif
155
+ #if !defined(BLAKE3_NO_SSE2)
156
+ if (features & SSE2) {
157
+ blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
158
+ return;
159
+ }
160
+ #endif
161
+ #endif
162
+ blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
163
+ }
164
+
165
+ void blake3_compress_xof(const uint32_t cv[8],
166
+ const uint8_t block[BLAKE3_BLOCK_LEN],
167
+ uint8_t block_len, uint64_t counter, uint8_t flags,
168
+ uint8_t out[64]) {
169
+ #if defined(IS_X86)
170
+ const enum cpu_feature features = get_cpu_features();
171
+ MAYBE_UNUSED(features);
172
+ #if !defined(BLAKE3_NO_AVX512)
173
+ if (features & AVX512VL) {
174
+ blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
175
+ return;
176
+ }
177
+ #endif
178
+ #if !defined(BLAKE3_NO_SSE41)
179
+ if (features & SSE41) {
180
+ blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
181
+ return;
182
+ }
183
+ #endif
184
+ #if !defined(BLAKE3_NO_SSE2)
185
+ if (features & SSE2) {
186
+ blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
187
+ return;
188
+ }
189
+ #endif
190
+ #endif
191
+ blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
192
+ }
193
+
194
/*
 * Runtime dispatcher for the multi-input hashing routine.
 *
 * On x86 builds the detected CPU features select the first available
 * implementation in the order AVX-512 (needs both AVX512F and AVX512VL),
 * AVX2, SSE4.1, SSE2; each can be compiled out with the matching
 * BLAKE3_NO_* macro. If none was taken and BLAKE3_USE_NEON == 1 the NEON
 * implementation is used; otherwise the portable one. All arguments are
 * forwarded unchanged.
 */
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
                      uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
#if defined(IS_X86)
  const enum cpu_feature detected = get_cpu_features();
  MAYBE_UNUSED(detected);
#if !defined(BLAKE3_NO_AVX512)
  {
    const int avx512_bits = AVX512F | AVX512VL;
    if ((detected & avx512_bits) == avx512_bits) {
      blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
                              increment_counter, flags, flags_start,
                              flags_end, out);
      return;
    }
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if ((detected & AVX2) != 0) {
    blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if ((detected & SSE41) != 0) {
    blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                           increment_counter, flags, flags_start, flags_end,
                           out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if ((detected & SSE2) != 0) {
    blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#endif

#if BLAKE3_USE_NEON == 1
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
                        increment_counter, flags, flags_start, flags_end, out);
  return;
#endif

  /* Fallback when no SIMD implementation was selected above. */
  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
}
245
+
246
// Report the SIMD degree chosen at runtime for the current platform.
//
// x86 builds return 16 when both AVX512F and AVX512VL are detected, 8 for
// AVX2, and 4 for SSE4.1 or SSE2 (each subject to its BLAKE3_NO_* macro).
// Otherwise the result is 4 when BLAKE3_USE_NEON == 1 and 1 in all other
// cases.
size_t blake3_simd_degree(void) {
#if defined(IS_X86)
  const enum cpu_feature detected = get_cpu_features();
  MAYBE_UNUSED(detected);
#if !defined(BLAKE3_NO_AVX512)
  {
    const int avx512_bits = AVX512F | AVX512VL;
    if ((detected & avx512_bits) == avx512_bits) {
      return 16;
    }
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if ((detected & AVX2) != 0) {
    return 8;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if ((detected & SSE41) != 0) {
    return 4;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if ((detected & SSE2) != 0) {
    return 4;
  }
#endif
#endif
#if BLAKE3_USE_NEON == 1
  return 4;
#endif
  return 1;
}
@@ -0,0 +1,282 @@
1
+ #ifndef BLAKE3_IMPL_H
2
+ #define BLAKE3_IMPL_H
3
+
4
+ #include <assert.h>
5
+ #include <stdbool.h>
6
+ #include <stddef.h>
7
+ #include <stdint.h>
8
+ #include <string.h>
9
+
10
+ #include "blake3.h"
11
+
12
// Internal flag bits. Each enumerator occupies its own bit so that several
// flags can be OR-ed into a single byte-sized flags value.
enum blake3_flags {
  CHUNK_START         = 0x01, // bit 0
  CHUNK_END           = 0x02, // bit 1
  PARENT              = 0x04, // bit 2
  ROOT                = 0x08, // bit 3
  KEYED_HASH          = 0x10, // bit 4
  DERIVE_KEY_CONTEXT  = 0x20, // bit 5
  DERIVE_KEY_MATERIAL = 0x40, // bit 6
};
22
+
23
+ // This C implementation tries to support recent versions of GCC, Clang, and
24
+ // MSVC.
25
+ #if defined(_MSC_VER)
26
+ #define INLINE static __forceinline
27
+ #else
28
+ #define INLINE static inline __attribute__((always_inline))
29
+ #endif
30
+
31
+ #if defined(__x86_64__) || defined(_M_X64)
32
+ #define IS_X86
33
+ #define IS_X86_64
34
+ #endif
35
+
36
+ #if defined(__i386__) || defined(_M_IX86)
37
+ #define IS_X86
38
+ #define IS_X86_32
39
+ #endif
40
+
41
+ #if defined(__aarch64__) || defined(_M_ARM64)
42
+ #define IS_AARCH64
43
+ #endif
44
+
45
+ #if defined(IS_X86)
46
+ #if defined(_MSC_VER)
47
+ #include <intrin.h>
48
+ #endif
49
+ #include <immintrin.h>
50
+ #endif
51
+
52
+ #if !defined(BLAKE3_USE_NEON)
53
+ // If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
54
+ #if defined(IS_AARCH64)
55
+ #define BLAKE3_USE_NEON 1
56
+ #else
57
+ #define BLAKE3_USE_NEON 0
58
+ #endif
59
+ #endif
60
+
61
+ #if defined(IS_X86)
62
+ #define MAX_SIMD_DEGREE 16
63
+ #elif BLAKE3_USE_NEON == 1
64
+ #define MAX_SIMD_DEGREE 4
65
+ #else
66
+ #define MAX_SIMD_DEGREE 1
67
+ #endif
68
+
69
+ // There are some places where we want a static size that's equal to the
70
+ // MAX_SIMD_DEGREE, but also at least 2.
71
+ #define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)
72
+
73
// The eight 32-bit initialization words.
// NOTE(review): presumably the IV defined by the BLAKE3 specification --
// confirm against the spec before changing any value.
static const uint32_t IV[8] = {
    0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU,
    0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U,
};
76
+
77
// Message word ordering table: seven rows, each a permutation of the indices
// 0..15. NOTE(review): presumably one row per compression round -- confirm
// against the code that indexes this table.
static const uint8_t MSG_SCHEDULE[7][16] = {
    /* row 0 */ { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15},
    /* row 1 */ { 2,  6,  3, 10,  7,  0,  4, 13,  1, 11, 12,  5,  9, 14, 15,  8},
    /* row 2 */ { 3,  4, 10, 12, 13,  2,  7, 14,  6,  5,  9,  0, 11, 15,  8,  1},
    /* row 3 */ {10,  7, 12,  9, 14,  3, 13, 15,  4,  0, 11,  2,  5,  8,  1,  6},
    /* row 4 */ {12, 13,  9, 11, 15, 10, 14,  8,  7,  2,  5,  3,  0,  1,  6,  4},
    /* row 5 */ { 9, 14, 11,  5,  8, 12, 15,  1, 13,  3,  0, 10,  2,  6,  4,  7},
    /* row 6 */ {11, 15,  5,  0,  1,  9,  8,  6, 14, 10,  2, 12,  3,  4,  7, 13},
};
86
+
87
/* Return the zero-based index of the most significant set bit of x. */
/* The caller must pass a nonzero x. */
static unsigned int highest_one(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
  /* For a leading-zero count in 0..63, 63 - clz equals 63 ^ clz. */
  return (unsigned int)(63 - __builtin_clzll(x));
#elif defined(_MSC_VER) && defined(IS_X86_64)
  unsigned long bit_pos;
  _BitScanReverse64(&bit_pos, x);
  return bit_pos;
#elif defined(_MSC_VER) && defined(IS_X86_32)
  /* Only a 32-bit scan is used here: check the upper half first. */
  unsigned long bit_pos;
  const unsigned long upper = (unsigned long)(x >> 32);
  if (upper != 0) {
    _BitScanReverse(&bit_pos, upper);
    return 32 + bit_pos;
  }
  _BitScanReverse(&bit_pos, (unsigned long)x);
  return bit_pos;
#else
  /* Binary search: halve the window each step, keeping the upper part
   * whenever it contains a set bit. */
  unsigned int pos = 0;
  for (unsigned int step = 32; step != 0; step >>= 1) {
    if ((x >> step) != 0) {
      x >>= step;
      pos += step;
    }
  }
  return pos;
#endif
}
117
+
118
// Return how many bits of x are set to 1.
INLINE unsigned int popcnt(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
  return (unsigned int)__builtin_popcountll(x);
#else
  // Each iteration clears the lowest set bit, so the loop runs once per 1 bit.
  unsigned int ones = 0;
  for (; x != 0; x &= x - 1) {
    ++ones;
  }
  return ones;
#endif
}
131
+
132
+ // Largest power of two less than or equal to x. As a special case, returns 1
133
+ // when x is 0.
134
+ INLINE uint64_t round_down_to_power_of_2(uint64_t x) {
135
+ return 1ULL << highest_one(x | 1);
136
+ }
137
+
138
// Return the low 32 bits of the 64-bit counter.
INLINE uint32_t counter_low(uint64_t counter) {
  return (uint32_t)(counter & 0xFFFFFFFFULL);
}
139
+
140
// Return the high 32 bits of the 64-bit counter.
INLINE uint32_t counter_high(uint64_t counter) {
  const uint64_t upper = counter >> 32;
  return (uint32_t)upper;
}
143
+
144
// Read four bytes from src and assemble them into a 32-bit word in
// little-endian order (src[0] is the least significant byte). Access is
// byte-wise, so src does not need any particular alignment.
INLINE uint32_t load32(const void *src) {
  const uint8_t *bytes = (const uint8_t *)src;
  uint32_t word = 0;
  // Start from the most significant byte and shift earlier bytes upward.
  for (int i = 3; i >= 0; i--) {
    word = (word << 8) | bytes[i];
  }
  return word;
}
149
+
150
+ INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
151
+ uint32_t key_words[8]) {
152
+ key_words[0] = load32(&key[0 * 4]);
153
+ key_words[1] = load32(&key[1 * 4]);
154
+ key_words[2] = load32(&key[2 * 4]);
155
+ key_words[3] = load32(&key[3 * 4]);
156
+ key_words[4] = load32(&key[4 * 4]);
157
+ key_words[5] = load32(&key[5 * 4]);
158
+ key_words[6] = load32(&key[6 * 4]);
159
+ key_words[7] = load32(&key[7 * 4]);
160
+ }
161
+
162
// Write the 32-bit word w to dst as four bytes in little-endian order
// (dst[0] receives the least significant byte). Access is byte-wise, so dst
// does not need any particular alignment.
INLINE void store32(void *dst, uint32_t w) {
  uint8_t *bytes = (uint8_t *)dst;
  for (unsigned int i = 0; i < 4; i++) {
    bytes[i] = (uint8_t)(w >> (8 * i));
  }
}
169
+
170
+ INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) {
171
+ store32(&bytes_out[0 * 4], cv_words[0]);
172
+ store32(&bytes_out[1 * 4], cv_words[1]);
173
+ store32(&bytes_out[2 * 4], cv_words[2]);
174
+ store32(&bytes_out[3 * 4], cv_words[3]);
175
+ store32(&bytes_out[4 * 4], cv_words[4]);
176
+ store32(&bytes_out[5 * 4], cv_words[5]);
177
+ store32(&bytes_out[6 * 4], cv_words[6]);
178
+ store32(&bytes_out[7 * 4], cv_words[7]);
179
+ }
180
+
181
+ void blake3_compress_in_place(uint32_t cv[8],
182
+ const uint8_t block[BLAKE3_BLOCK_LEN],
183
+ uint8_t block_len, uint64_t counter,
184
+ uint8_t flags);
185
+
186
+ void blake3_compress_xof(const uint32_t cv[8],
187
+ const uint8_t block[BLAKE3_BLOCK_LEN],
188
+ uint8_t block_len, uint64_t counter, uint8_t flags,
189
+ uint8_t out[64]);
190
+
191
+ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
192
+ size_t blocks, const uint32_t key[8], uint64_t counter,
193
+ bool increment_counter, uint8_t flags,
194
+ uint8_t flags_start, uint8_t flags_end, uint8_t *out);
195
+
196
+ size_t blake3_simd_degree(void);
197
+
198
+
199
+ // Declarations for implementation-specific functions.
200
+ void blake3_compress_in_place_portable(uint32_t cv[8],
201
+ const uint8_t block[BLAKE3_BLOCK_LEN],
202
+ uint8_t block_len, uint64_t counter,
203
+ uint8_t flags);
204
+
205
+ void blake3_compress_xof_portable(const uint32_t cv[8],
206
+ const uint8_t block[BLAKE3_BLOCK_LEN],
207
+ uint8_t block_len, uint64_t counter,
208
+ uint8_t flags, uint8_t out[64]);
209
+
210
+ void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
211
+ size_t blocks, const uint32_t key[8],
212
+ uint64_t counter, bool increment_counter,
213
+ uint8_t flags, uint8_t flags_start,
214
+ uint8_t flags_end, uint8_t *out);
215
+
216
+ #if defined(IS_X86)
217
+ #if !defined(BLAKE3_NO_SSE2)
218
+ void blake3_compress_in_place_sse2(uint32_t cv[8],
219
+ const uint8_t block[BLAKE3_BLOCK_LEN],
220
+ uint8_t block_len, uint64_t counter,
221
+ uint8_t flags);
222
+ void blake3_compress_xof_sse2(const uint32_t cv[8],
223
+ const uint8_t block[BLAKE3_BLOCK_LEN],
224
+ uint8_t block_len, uint64_t counter,
225
+ uint8_t flags, uint8_t out[64]);
226
+ void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
227
+ size_t blocks, const uint32_t key[8],
228
+ uint64_t counter, bool increment_counter,
229
+ uint8_t flags, uint8_t flags_start,
230
+ uint8_t flags_end, uint8_t *out);
231
+ #endif
232
+ #if !defined(BLAKE3_NO_SSE41)
233
+ void blake3_compress_in_place_sse41(uint32_t cv[8],
234
+ const uint8_t block[BLAKE3_BLOCK_LEN],
235
+ uint8_t block_len, uint64_t counter,
236
+ uint8_t flags);
237
+ void blake3_compress_xof_sse41(const uint32_t cv[8],
238
+ const uint8_t block[BLAKE3_BLOCK_LEN],
239
+ uint8_t block_len, uint64_t counter,
240
+ uint8_t flags, uint8_t out[64]);
241
+ void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
242
+ size_t blocks, const uint32_t key[8],
243
+ uint64_t counter, bool increment_counter,
244
+ uint8_t flags, uint8_t flags_start,
245
+ uint8_t flags_end, uint8_t *out);
246
+ #endif
247
+ #if !defined(BLAKE3_NO_AVX2)
248
+ void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
249
+ size_t blocks, const uint32_t key[8],
250
+ uint64_t counter, bool increment_counter,
251
+ uint8_t flags, uint8_t flags_start,
252
+ uint8_t flags_end, uint8_t *out);
253
+ #endif
254
+ #if !defined(BLAKE3_NO_AVX512)
255
+ void blake3_compress_in_place_avx512(uint32_t cv[8],
256
+ const uint8_t block[BLAKE3_BLOCK_LEN],
257
+ uint8_t block_len, uint64_t counter,
258
+ uint8_t flags);
259
+
260
+ void blake3_compress_xof_avx512(const uint32_t cv[8],
261
+ const uint8_t block[BLAKE3_BLOCK_LEN],
262
+ uint8_t block_len, uint64_t counter,
263
+ uint8_t flags, uint8_t out[64]);
264
+
265
+ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
266
+ size_t blocks, const uint32_t key[8],
267
+ uint64_t counter, bool increment_counter,
268
+ uint8_t flags, uint8_t flags_start,
269
+ uint8_t flags_end, uint8_t *out);
270
+ #endif
271
+ #endif
272
+
273
+ #if BLAKE3_USE_NEON == 1
274
+ void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
275
+ size_t blocks, const uint32_t key[8],
276
+ uint64_t counter, bool increment_counter,
277
+ uint8_t flags, uint8_t flags_start,
278
+ uint8_t flags_end, uint8_t *out);
279
+ #endif
280
+
281
+
282
+ #endif /* BLAKE3_IMPL_H */