uncle_blake3 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.md +27 -0
  3. data/README.md +89 -0
  4. data/ext/Rakefile +55 -0
  5. data/ext/binding/uncle_blake3.c +41 -0
  6. data/ext/blake3/c/Makefile.testing +82 -0
  7. data/ext/blake3/c/README.md +316 -0
  8. data/ext/blake3/c/blake3.c +616 -0
  9. data/ext/blake3/c/blake3.h +60 -0
  10. data/ext/blake3/c/blake3_avx2.c +326 -0
  11. data/ext/blake3/c/blake3_avx2_x86-64_unix.S +1815 -0
  12. data/ext/blake3/c/blake3_avx2_x86-64_windows_gnu.S +1817 -0
  13. data/ext/blake3/c/blake3_avx2_x86-64_windows_msvc.asm +1828 -0
  14. data/ext/blake3/c/blake3_avx512.c +1207 -0
  15. data/ext/blake3/c/blake3_avx512_x86-64_unix.S +2585 -0
  16. data/ext/blake3/c/blake3_avx512_x86-64_windows_gnu.S +2615 -0
  17. data/ext/blake3/c/blake3_avx512_x86-64_windows_msvc.asm +2634 -0
  18. data/ext/blake3/c/blake3_dispatch.c +276 -0
  19. data/ext/blake3/c/blake3_impl.h +282 -0
  20. data/ext/blake3/c/blake3_neon.c +351 -0
  21. data/ext/blake3/c/blake3_portable.c +160 -0
  22. data/ext/blake3/c/blake3_sse2.c +566 -0
  23. data/ext/blake3/c/blake3_sse2_x86-64_unix.S +2291 -0
  24. data/ext/blake3/c/blake3_sse2_x86-64_windows_gnu.S +2332 -0
  25. data/ext/blake3/c/blake3_sse2_x86-64_windows_msvc.asm +2350 -0
  26. data/ext/blake3/c/blake3_sse41.c +560 -0
  27. data/ext/blake3/c/blake3_sse41_x86-64_unix.S +2028 -0
  28. data/ext/blake3/c/blake3_sse41_x86-64_windows_gnu.S +2069 -0
  29. data/ext/blake3/c/blake3_sse41_x86-64_windows_msvc.asm +2089 -0
  30. data/ext/blake3/c/example.c +37 -0
  31. data/ext/blake3/c/main.c +166 -0
  32. data/ext/blake3/c/test.py +97 -0
  33. data/lib/uncle_blake3/binding.rb +20 -0
  34. data/lib/uncle_blake3/build/loader.rb +40 -0
  35. data/lib/uncle_blake3/build/platform.rb +37 -0
  36. data/lib/uncle_blake3/build.rb +4 -0
  37. data/lib/uncle_blake3/digest.rb +119 -0
  38. data/lib/uncle_blake3/version.rb +5 -0
  39. data/lib/uncle_blake3.rb +7 -0
  40. metadata +112 -0
@@ -0,0 +1,276 @@
1
+ #include <stdbool.h>
2
+ #include <stddef.h>
3
+ #include <stdint.h>
4
+
5
+ #include "blake3_impl.h"
6
+
7
+ #if defined(IS_X86)
8
+ #if defined(_MSC_VER)
9
+ #include <intrin.h>
10
+ #elif defined(__GNUC__)
11
+ #include <immintrin.h>
12
+ #else
13
+ #error "Unimplemented!"
14
+ #endif
15
+ #endif
16
+
17
/* Evaluates `x` and discards the value; used to silence unused-variable
 * warnings when every user of a local is compiled out. */
#define MAYBE_UNUSED(x) ((void)(x))
18
+
19
+ #if defined(IS_X86)
20
/*
 * Reads extended control register 0 via the XGETBV instruction.
 *
 * Returns the 64-bit register value: EDX supplies the upper 32 bits and EAX
 * the lower 32 bits. On MSVC the _xgetbv intrinsic is used; elsewhere the
 * instruction is issued through inline assembly with ECX = 0.
 *
 * NOTE: XGETBV is only valid once the OS has enabled XSAVE (CPUID reports
 * OSXSAVE); callers are expected to check that first.
 */
static uint64_t xgetbv(void) {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  uint32_t eax = 0, edx = 0;
  __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));
  return ((uint64_t)edx << 32) | eax;
#endif
}
29
+
30
/*
 * Executes CPUID for leaf `id` and writes the resulting EAX, EBX, ECX and EDX
 * values to out[0], out[1], out[2] and out[3] respectively.
 */
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
  __cpuid((int *)out, id);
#else
  uint32_t a, b, c, d;
#if defined(__i386__) || defined(_M_IX86)
  /* EBX is copied aside before CPUID and swapped back afterwards, so the
   * EBX result is returned through a scratch register instead
   * (presumably because EBX can be reserved on 32-bit builds). */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(a), "=r"(b), "=c"(c), "=d"(d)
                       : "a"(id));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
                       : "a"(id));
#endif
  out[0] = a;
  out[1] = b;
  out[2] = c;
  out[3] = d;
#endif
}
45
+
46
/*
 * Executes CPUID for leaf `id` and sub-leaf `sid` (passed in ECX) and writes
 * the resulting EAX, EBX, ECX and EDX values to out[0..3] in that order.
 */
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
  __cpuidex((int *)out, id, sid);
#else
  uint32_t a, b, c, d;
#if defined(__i386__) || defined(_M_IX86)
  /* Same EBX save/restore dance as the 32-bit path of cpuid(). */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(a), "=r"(b), "=c"(c), "=d"(d)
                       : "a"(id), "c"(sid));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(a), "=b"(b), "=c"(c), "=d"(d)
                       : "a"(id), "c"(sid));
#endif
  out[0] = a;
  out[1] = b;
  out[2] = c;
  out[3] = d;
#endif
}
61
+
62
+ #endif
63
+
64
/*
 * Bit flags for the SIMD capabilities probed at runtime. Values are distinct
 * single bits so that several can be OR-ed into one word; UNDEFINED marks a
 * feature word that has not been computed yet.
 */
enum cpu_feature {
  SSE2 = 0x00000001,
  SSSE3 = 0x00000002,
  SSE41 = 0x00000004,
  AVX = 0x00000008,
  AVX2 = 0x00000010,
  AVX512F = 0x00000020,
  AVX512VL = 0x00000040,
  /* ... */
  UNDEFINED = 0x40000000
};
75
+
76
+ #if !defined(BLAKE3_TESTING)
77
+ static /* Allow the variable to be controlled manually for testing */
78
+ #endif
79
+ enum cpu_feature g_cpu_features = UNDEFINED;
80
+
81
+ #if !defined(BLAKE3_TESTING)
82
+ static
83
+ #endif
84
+ enum cpu_feature
85
+ get_cpu_features() {
86
+
87
+ if (g_cpu_features != UNDEFINED) {
88
+ return g_cpu_features;
89
+ } else {
90
+ #if defined(IS_X86)
91
+ uint32_t regs[4] = {0};
92
+ uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
93
+ (void)edx;
94
+ enum cpu_feature features = 0;
95
+ cpuid(regs, 0);
96
+ const int max_id = *eax;
97
+ cpuid(regs, 1);
98
+ #if defined(__amd64__) || defined(_M_X64)
99
+ features |= SSE2;
100
+ #else
101
+ if (*edx & (1UL << 26))
102
+ features |= SSE2;
103
+ #endif
104
+ if (*ecx & (1UL << 0))
105
+ features |= SSSE3;
106
+ if (*ecx & (1UL << 19))
107
+ features |= SSE41;
108
+
109
+ if (*ecx & (1UL << 27)) { // OSXSAVE
110
+ const uint64_t mask = xgetbv();
111
+ if ((mask & 6) == 6) { // SSE and AVX states
112
+ if (*ecx & (1UL << 28))
113
+ features |= AVX;
114
+ if (max_id >= 7) {
115
+ cpuidex(regs, 7, 0);
116
+ if (*ebx & (1UL << 5))
117
+ features |= AVX2;
118
+ if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
119
+ if (*ebx & (1UL << 31))
120
+ features |= AVX512VL;
121
+ if (*ebx & (1UL << 16))
122
+ features |= AVX512F;
123
+ }
124
+ }
125
+ }
126
+ }
127
+ g_cpu_features = features;
128
+ return features;
129
+ #else
130
+ /* How to detect NEON? */
131
+ return 0;
132
+ #endif
133
+ }
134
+ }
135
+
136
+ void blake3_compress_in_place(uint32_t cv[8],
137
+ const uint8_t block[BLAKE3_BLOCK_LEN],
138
+ uint8_t block_len, uint64_t counter,
139
+ uint8_t flags) {
140
+ #if defined(IS_X86)
141
+ const enum cpu_feature features = get_cpu_features();
142
+ MAYBE_UNUSED(features);
143
+ #if !defined(BLAKE3_NO_AVX512)
144
+ if (features & AVX512VL) {
145
+ blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
146
+ return;
147
+ }
148
+ #endif
149
+ #if !defined(BLAKE3_NO_SSE41)
150
+ if (features & SSE41) {
151
+ blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
152
+ return;
153
+ }
154
+ #endif
155
+ #if !defined(BLAKE3_NO_SSE2)
156
+ if (features & SSE2) {
157
+ blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
158
+ return;
159
+ }
160
+ #endif
161
+ #endif
162
+ blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
163
+ }
164
+
165
+ void blake3_compress_xof(const uint32_t cv[8],
166
+ const uint8_t block[BLAKE3_BLOCK_LEN],
167
+ uint8_t block_len, uint64_t counter, uint8_t flags,
168
+ uint8_t out[64]) {
169
+ #if defined(IS_X86)
170
+ const enum cpu_feature features = get_cpu_features();
171
+ MAYBE_UNUSED(features);
172
+ #if !defined(BLAKE3_NO_AVX512)
173
+ if (features & AVX512VL) {
174
+ blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
175
+ return;
176
+ }
177
+ #endif
178
+ #if !defined(BLAKE3_NO_SSE41)
179
+ if (features & SSE41) {
180
+ blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
181
+ return;
182
+ }
183
+ #endif
184
+ #if !defined(BLAKE3_NO_SSE2)
185
+ if (features & SSE2) {
186
+ blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
187
+ return;
188
+ }
189
+ #endif
190
+ #endif
191
+ blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
192
+ }
193
+
194
/*
 * Runtime dispatcher for the multi-input hashing routine.
 *
 * On x86 builds the arguments are forwarded unchanged to the first compiled-in
 * implementation whose feature bits are reported by get_cpu_features():
 * AVX-512 (requires both AVX512F and AVX512VL), AVX2, SSE4.1, then SSE2.
 * If none is taken, the NEON implementation is used when BLAKE3_USE_NEON is 1,
 * and the portable implementation otherwise.
 */
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
                      uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu);
#if !defined(BLAKE3_NO_AVX512)
  {
    const int avx512_bits = AVX512F | AVX512VL;
    if ((cpu & avx512_bits) == avx512_bits) {
      blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
                              increment_counter, flags, flags_start, flags_end,
                              out);
      return;
    }
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if ((cpu & AVX2) != 0) {
    blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                           increment_counter, flags, flags_start, flags_end,
                           out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if ((cpu & SSE2) != 0) {
    blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#endif

#if BLAKE3_USE_NEON == 1
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
                        increment_counter, flags, flags_start, flags_end, out);
#else
  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
#endif
}
245
+
246
// The dynamically detected SIMD degree of the current platform.
//
// Returns 16 when AVX512F and AVX512VL are both reported, 8 for AVX2, 4 for
// SSE4.1 or SSE2 (each subject to its BLAKE3_NO_* switch), 4 when
// BLAKE3_USE_NEON is 1, and 1 otherwise.
size_t blake3_simd_degree(void) {
#if defined(IS_X86)
  const enum cpu_feature cpu = get_cpu_features();
  MAYBE_UNUSED(cpu);
#if !defined(BLAKE3_NO_AVX512)
  {
    const int avx512_bits = AVX512F | AVX512VL;
    if ((cpu & avx512_bits) == avx512_bits) {
      return 16;
    }
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if ((cpu & AVX2) != 0) {
    return 8;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if ((cpu & SSE41) != 0) {
    return 4;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if ((cpu & SSE2) != 0) {
    return 4;
  }
#endif
#endif
#if BLAKE3_USE_NEON == 1
  return 4;
#else
  return 1;
#endif
}
@@ -0,0 +1,282 @@
1
+ #ifndef BLAKE3_IMPL_H
2
+ #define BLAKE3_IMPL_H
3
+
4
+ #include <assert.h>
5
+ #include <stdbool.h>
6
+ #include <stddef.h>
7
+ #include <stdint.h>
8
+ #include <string.h>
9
+
10
+ #include "blake3.h"
11
+
12
// Internal flag bits. Each enumerator is a distinct single bit, so several
// flags can be OR-ed together into one byte.
enum blake3_flags {
  CHUNK_START = 0x01,
  CHUNK_END = 0x02,
  PARENT = 0x04,
  ROOT = 0x08,
  KEYED_HASH = 0x10,
  DERIVE_KEY_CONTEXT = 0x20,
  DERIVE_KEY_MATERIAL = 0x40,
};
22
+
23
// This C implementation tries to support recent versions of GCC, Clang, and
// MSVC. INLINE expands to the compiler-specific spelling of a static,
// force-inlined function.
#if !defined(_MSC_VER)
#define INLINE static inline __attribute__((always_inline))
#else
#define INLINE static __forceinline
#endif
30
+
31
// Target architecture detection from compiler-predefined macros.
// IS_X86 is defined whenever either the 64-bit or the 32-bit x86 macro is.
#if defined(__x86_64__) || defined(_M_X64)
#define IS_X86_64
#endif

#if defined(__i386__) || defined(_M_IX86)
#define IS_X86_32
#endif

#if defined(IS_X86_64) || defined(IS_X86_32)
#define IS_X86
#endif

#if defined(__aarch64__) || defined(_M_ARM64)
#define IS_AARCH64
#endif
44
+
45
+ #if defined(IS_X86)
46
+ #if defined(_MSC_VER)
47
+ #include <intrin.h>
48
+ #endif
49
+ #include <immintrin.h>
50
+ #endif
51
+
52
// If BLAKE3_USE_NEON is not manually set, autodetect based on AArch64ness:
// 1 when IS_AARCH64 is defined, 0 otherwise. A user-provided value is kept.
#if !defined(BLAKE3_USE_NEON) && defined(IS_AARCH64)
#define BLAKE3_USE_NEON 1
#elif !defined(BLAKE3_USE_NEON)
#define BLAKE3_USE_NEON 0
#endif
60
+
61
// Compile-time upper bound on the SIMD degree: 16 on x86, 4 when NEON is
// enabled on a non-x86 target, and 1 otherwise.
#if !defined(IS_X86) && BLAKE3_USE_NEON != 1
#define MAX_SIMD_DEGREE 1
#elif defined(IS_X86)
#define MAX_SIMD_DEGREE 16
#else
#define MAX_SIMD_DEGREE 4
#endif

// There are some places where we want a static size that's equal to the
// MAX_SIMD_DEGREE, but also at least 2.
#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE < 2 ? 2 : MAX_SIMD_DEGREE)
72
+
73
// Eight 32-bit constant words (presumably the hash's initialization vector,
// going by the name).
static const uint32_t IV[8] = {
    0x6a09e667UL, 0xbb67ae85UL, /* words 0, 1 */
    0x3c6ef372UL, 0xa54ff53aUL, /* words 2, 3 */
    0x510e527fUL, 0x9b05688cUL, /* words 4, 5 */
    0x1f83d9abUL, 0x5be0cd19UL, /* words 6, 7 */
};
76
+
77
// Seven rows, each a permutation of the indices 0..15 (presumably the message
// word order used by each round - TODO confirm against the compression code).
static const uint8_t MSG_SCHEDULE[7][16] = {
    /* row 0 */ { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15},
    /* row 1 */ { 2,  6,  3, 10,  7,  0,  4, 13,  1, 11, 12,  5,  9, 14, 15,  8},
    /* row 2 */ { 3,  4, 10, 12, 13,  2,  7, 14,  6,  5,  9,  0, 11, 15,  8,  1},
    /* row 3 */ {10,  7, 12,  9, 14,  3, 13, 15,  4,  0, 11,  2,  5,  8,  1,  6},
    /* row 4 */ {12, 13,  9, 11, 15, 10, 14,  8,  7,  2,  5,  3,  0,  1,  6,  4},
    /* row 5 */ { 9, 14, 11,  5,  8, 12, 15,  1, 13,  3,  0, 10,  2,  6,  4,  7},
    /* row 6 */ {11, 15,  5,  0,  1,  9,  8,  6, 14, 10,  2, 12,  3,  4,  7, 13},
};
86
+
87
/* Returns the zero-based index of the most significant set bit of x. */
/* x is assumed to be nonzero. */
static unsigned int highest_one(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
  /* The leading-zero count is in 0..63 for nonzero x. */
  return (unsigned int)(63 - __builtin_clzll(x));
#elif defined(_MSC_VER) && defined(IS_X86_64)
  unsigned long index;
  _BitScanReverse64(&index, x);
  return index;
#elif defined(_MSC_VER) && defined(IS_X86_32)
  /* Only a 32-bit scan is used here: look at the upper half first. */
  unsigned long index;
  const unsigned long upper = (unsigned long)(x >> 32);
  if (upper != 0) {
    _BitScanReverse(&index, upper);
    return 32 + index;
  }
  _BitScanReverse(&index, (unsigned long)x);
  return index;
#else
  /* Binary search: halve the window while the upper part is non-empty. */
  unsigned int position = 0;
  unsigned int width;
  for (width = 32; width != 0; width >>= 1) {
    if ((x >> width) != 0) {
      x >>= width;
      position += width;
    }
  }
  return position;
#endif
}
117
+
118
+ // Count the number of 1 bits.
119
+ INLINE unsigned int popcnt(uint64_t x) {
120
+ #if defined(__GNUC__) || defined(__clang__)
121
+ return __builtin_popcountll(x);
122
+ #else
123
+ unsigned int count = 0;
124
+ while (x != 0) {
125
+ count += 1;
126
+ x &= x - 1;
127
+ }
128
+ return count;
129
+ #endif
130
+ }
131
+
132
+ // Largest power of two less than or equal to x. As a special case, returns 1
133
+ // when x is 0.
134
+ INLINE uint64_t round_down_to_power_of_2(uint64_t x) {
135
+ return 1ULL << highest_one(x | 1);
136
+ }
137
+
138
+ INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; }
139
+
140
+ INLINE uint32_t counter_high(uint64_t counter) {
141
+ return (uint32_t)(counter >> 32);
142
+ }
143
+
144
+ INLINE uint32_t load32(const void *src) {
145
+ const uint8_t *p = (const uint8_t *)src;
146
+ return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) |
147
+ ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24);
148
+ }
149
+
150
+ INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
151
+ uint32_t key_words[8]) {
152
+ key_words[0] = load32(&key[0 * 4]);
153
+ key_words[1] = load32(&key[1 * 4]);
154
+ key_words[2] = load32(&key[2 * 4]);
155
+ key_words[3] = load32(&key[3 * 4]);
156
+ key_words[4] = load32(&key[4 * 4]);
157
+ key_words[5] = load32(&key[5 * 4]);
158
+ key_words[6] = load32(&key[6 * 4]);
159
+ key_words[7] = load32(&key[7 * 4]);
160
+ }
161
+
162
+ INLINE void store32(void *dst, uint32_t w) {
163
+ uint8_t *p = (uint8_t *)dst;
164
+ p[0] = (uint8_t)(w >> 0);
165
+ p[1] = (uint8_t)(w >> 8);
166
+ p[2] = (uint8_t)(w >> 16);
167
+ p[3] = (uint8_t)(w >> 24);
168
+ }
169
+
170
+ INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) {
171
+ store32(&bytes_out[0 * 4], cv_words[0]);
172
+ store32(&bytes_out[1 * 4], cv_words[1]);
173
+ store32(&bytes_out[2 * 4], cv_words[2]);
174
+ store32(&bytes_out[3 * 4], cv_words[3]);
175
+ store32(&bytes_out[4 * 4], cv_words[4]);
176
+ store32(&bytes_out[5 * 4], cv_words[5]);
177
+ store32(&bytes_out[6 * 4], cv_words[6]);
178
+ store32(&bytes_out[7 * 4], cv_words[7]);
179
+ }
180
+
181
+ void blake3_compress_in_place(uint32_t cv[8],
182
+ const uint8_t block[BLAKE3_BLOCK_LEN],
183
+ uint8_t block_len, uint64_t counter,
184
+ uint8_t flags);
185
+
186
+ void blake3_compress_xof(const uint32_t cv[8],
187
+ const uint8_t block[BLAKE3_BLOCK_LEN],
188
+ uint8_t block_len, uint64_t counter, uint8_t flags,
189
+ uint8_t out[64]);
190
+
191
+ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
192
+ size_t blocks, const uint32_t key[8], uint64_t counter,
193
+ bool increment_counter, uint8_t flags,
194
+ uint8_t flags_start, uint8_t flags_end, uint8_t *out);
195
+
196
+ size_t blake3_simd_degree(void);
197
+
198
+
199
+ // Declarations for implementation-specific functions.
200
+ void blake3_compress_in_place_portable(uint32_t cv[8],
201
+ const uint8_t block[BLAKE3_BLOCK_LEN],
202
+ uint8_t block_len, uint64_t counter,
203
+ uint8_t flags);
204
+
205
+ void blake3_compress_xof_portable(const uint32_t cv[8],
206
+ const uint8_t block[BLAKE3_BLOCK_LEN],
207
+ uint8_t block_len, uint64_t counter,
208
+ uint8_t flags, uint8_t out[64]);
209
+
210
+ void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
211
+ size_t blocks, const uint32_t key[8],
212
+ uint64_t counter, bool increment_counter,
213
+ uint8_t flags, uint8_t flags_start,
214
+ uint8_t flags_end, uint8_t *out);
215
+
216
// x86-only implementations. Each instruction-set family can be compiled out
// with its BLAKE3_NO_* macro, which also removes the declarations below.
#if defined(IS_X86)

#if !defined(BLAKE3_NO_SSE2)
void blake3_compress_in_place_sse2(
    uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
    uint64_t counter, uint8_t flags);
void blake3_compress_xof_sse2(
    const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN],
    uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]);
void blake3_hash_many_sse2(
    const uint8_t *const *inputs, size_t num_inputs, size_t blocks,
    const uint32_t key[8], uint64_t counter, bool increment_counter,
    uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out);
#endif

#if !defined(BLAKE3_NO_SSE41)
void blake3_compress_in_place_sse41(
    uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
    uint64_t counter, uint8_t flags);
void blake3_compress_xof_sse41(
    const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN],
    uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]);
void blake3_hash_many_sse41(
    const uint8_t *const *inputs, size_t num_inputs, size_t blocks,
    const uint32_t key[8], uint64_t counter, bool increment_counter,
    uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out);
#endif

// AVX2 provides only the multi-input routine.
#if !defined(BLAKE3_NO_AVX2)
void blake3_hash_many_avx2(
    const uint8_t *const *inputs, size_t num_inputs, size_t blocks,
    const uint32_t key[8], uint64_t counter, bool increment_counter,
    uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out);
#endif

#if !defined(BLAKE3_NO_AVX512)
void blake3_compress_in_place_avx512(
    uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len,
    uint64_t counter, uint8_t flags);

void blake3_compress_xof_avx512(
    const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN],
    uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64]);

void blake3_hash_many_avx512(
    const uint8_t *const *inputs, size_t num_inputs, size_t blocks,
    const uint32_t key[8], uint64_t counter, bool increment_counter,
    uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out);
#endif

#endif
272
+
273
// NEON implementation; declared only when BLAKE3_USE_NEON is 1.
#if BLAKE3_USE_NEON == 1
void blake3_hash_many_neon(
    const uint8_t *const *inputs, size_t num_inputs, size_t blocks,
    const uint32_t key[8], uint64_t counter, bool increment_counter,
    uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out);
#endif
280
+
281
+
282
+ #endif /* BLAKE3_IMPL_H */