digest-blake3 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,312 @@
1
+ #include <stdbool.h>
2
+ #include <stddef.h>
3
+ #include <stdint.h>
4
+
5
+ #include "blake3_impl.h"
6
+
7
+ #if defined(IS_X86)
8
+ #if defined(_MSC_VER)
9
+ #include <intrin.h>
10
+ #elif defined(__GNUC__)
11
+ #include <immintrin.h>
12
+ #else
13
+ #error "Unimplemented!"
14
+ #endif
15
+ #endif
16
+
17
// Declarations for implementation-specific functions.
//
// The *_portable functions are only declared in this file; their definitions
// are not visible here. The dispatch wrappers below call them when no SIMD
// implementation is selected.

// Portable counterpart of blake3_compress_in_place(). `cv` is the only
// non-const pointer parameter, so the result is presumably written back into
// it (definition not visible here).
void blake3_compress_in_place_portable(uint32_t cv[8],
                                       const uint8_t block[BLAKE3_BLOCK_LEN],
                                       uint8_t block_len, uint64_t counter,
                                       uint8_t flags);

// Portable counterpart of blake3_compress_xof(). `cv` is const here and the
// extra `out` parameter is a 64-byte buffer.
void blake3_compress_xof_portable(const uint32_t cv[8],
                                  const uint8_t block[BLAKE3_BLOCK_LEN],
                                  uint8_t block_len, uint64_t counter,
                                  uint8_t flags, uint8_t out[64]);

// Portable counterpart of blake3_hash_many(); takes the same parameter list.
void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
                               size_t blocks, const uint32_t key[8],
                               uint64_t counter, bool increment_counter,
                               uint8_t flags, uint8_t flags_start,
                               uint8_t flags_end, uint8_t *out);
33
+
34
// x86-only SIMD implementations. Each group is declared unless the matching
// BLAKE3_NO_* macro is defined; the definitions are not visible in this file.
#if defined(IS_X86)
#if !defined(BLAKE3_NO_SSE41)
// SSE4.1 variants of all three entry points.
void blake3_compress_in_place_sse41(uint32_t cv[8],
                                    const uint8_t block[BLAKE3_BLOCK_LEN],
                                    uint8_t block_len, uint64_t counter,
                                    uint8_t flags);
void blake3_compress_xof_sse41(const uint32_t cv[8],
                               const uint8_t block[BLAKE3_BLOCK_LEN],
                               uint8_t block_len, uint64_t counter,
                               uint8_t flags, uint8_t out[64]);
void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
                            size_t blocks, const uint32_t key[8],
                            uint64_t counter, bool increment_counter,
                            uint8_t flags, uint8_t flags_start,
                            uint8_t flags_end, uint8_t *out);
#endif
#if !defined(BLAKE3_NO_AVX2)
// AVX2 only provides a hash_many variant; no AVX2 compress functions are
// declared.
void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
                           size_t blocks, const uint32_t key[8],
                           uint64_t counter, bool increment_counter,
                           uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t *out);
#endif
#if !defined(BLAKE3_NO_AVX512)
// AVX-512 variants of all three entry points.
void blake3_compress_in_place_avx512(uint32_t cv[8],
                                     const uint8_t block[BLAKE3_BLOCK_LEN],
                                     uint8_t block_len, uint64_t counter,
                                     uint8_t flags);

void blake3_compress_xof_avx512(const uint32_t cv[8],
                                const uint8_t block[BLAKE3_BLOCK_LEN],
                                uint8_t block_len, uint64_t counter,
                                uint8_t flags, uint8_t out[64]);

void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
                             size_t blocks, const uint32_t key[8],
                             uint64_t counter, bool increment_counter,
                             uint8_t flags, uint8_t flags_start,
                             uint8_t flags_end, uint8_t *out);
#endif
#endif
75
+
76
// NEON implementation of hash_many, declared only when the build defines
// BLAKE3_USE_NEON (there is no runtime detection for it in this file). The
// definition is not visible here.
#if defined(BLAKE3_USE_NEON)
void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
                           size_t blocks, const uint32_t key[8],
                           uint64_t counter, bool increment_counter,
                           uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t *out);
#endif
83
+
84
+ #if defined(IS_X86)
85
// Executes the XGETBV instruction with ECX = 0 and returns the 64-bit result,
// with EDX in the high 32 bits and EAX in the low 32 bits.
// get_cpu_features() only calls this after CPUID has reported OSXSAVE.
static uint64_t xgetbv() {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  uint32_t eax = 0, edx = 0;
  // Input: ECX = 0. Outputs: EAX and EDX.
  __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));
  return ((uint64_t)edx << 32) | eax;
#endif
}
94
+
95
// Runs the CPUID instruction with EAX = `id` and stores the resulting
// EAX, EBX, ECX, EDX in out[0], out[1], out[2], out[3] respectively.
// The inline-assembly paths do not initialise ECX on entry; use cpuidex()
// for leaves that need a sub-leaf index.
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
  __cpuid((int *)out, id);
#elif defined(__i386__) || defined(_M_IX86)
  // 32-bit path: EBX is copied into a scratch register before CPUID and
  // swapped back afterwards, so the scratch register (out[1]) ends up holding
  // CPUID's EBX result while EBX itself keeps its original value.
  // NOTE(review): presumably needed because EBX can be reserved by the
  // compiler on 32-bit targets (e.g. as the PIC register) - confirm.
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id));
#endif
}
110
+
111
// Runs the CPUID instruction with EAX = `id` and ECX = `sid` (the sub-leaf
// index) and stores the resulting EAX, EBX, ECX, EDX in out[0..3].
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
  __cpuidex((int *)out, id, sid);
#elif defined(__i386__) || defined(_M_IX86)
  // 32-bit path: EBX is saved to a scratch register and restored with xchgl,
  // which also moves CPUID's EBX result into the scratch register (out[1]).
  // NOTE(review): presumably needed because EBX can be reserved by the
  // compiler on 32-bit targets (e.g. as the PIC register) - confirm.
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id), "c"(sid));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id), "c"(sid));
#endif
}
126
+
127
+ #endif
128
+
129
// Bit flags for the x86 SIMD extensions reported by get_cpu_features().
// Each flag occupies its own bit so that several can be OR-ed into one mask.
// UNDEFINED is the initial value of g_cpu_features ("not probed yet").
enum cpu_feature {
  SSE2 = 0x01,
  SSSE3 = 0x02,
  SSE41 = 0x04,
  AVX = 0x08,
  AVX2 = 0x10,
  AVX512F = 0x20,
  AVX512VL = 0x40,
  /* ... */
  UNDEFINED = 0x40000000
};
140
+
141
// Cached result of get_cpu_features(). Starts out as UNDEFINED; the x86
// detection path in get_cpu_features() stores the detected mask here.
// The variable has internal linkage unless BLAKE3_TESTING is defined.
#if !defined(BLAKE3_TESTING)
static /* Allow the variable to be controlled manually for testing */
#endif
    enum cpu_feature g_cpu_features = UNDEFINED;
145
+
146
+ #if !defined(BLAKE3_TESTING)
147
+ static
148
+ #endif
149
+ enum cpu_feature
150
+ get_cpu_features() {
151
+
152
+ if (g_cpu_features != UNDEFINED) {
153
+ return g_cpu_features;
154
+ } else {
155
+ #if defined(IS_X86)
156
+ uint32_t regs[4] = {0};
157
+ uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
158
+ (void)edx;
159
+ enum cpu_feature features = 0;
160
+ cpuid(regs, 0);
161
+ const int max_id = *eax;
162
+ cpuid(regs, 1);
163
+ #if defined(__amd64__) || defined(_M_X64)
164
+ features |= SSE2;
165
+ #else
166
+ if (*edx & (1UL << 26))
167
+ features |= SSE2;
168
+ #endif
169
+ if (*ecx & (1UL << 0))
170
+ features |= SSSE3;
171
+ if (*ecx & (1UL << 19))
172
+ features |= SSE41;
173
+
174
+ if (*ecx & (1UL << 27)) { // OSXSAVE
175
+ const uint64_t mask = xgetbv();
176
+ if ((mask & 6) == 6) { // SSE and AVX states
177
+ if (*ecx & (1UL << 28))
178
+ features |= AVX;
179
+ if (max_id >= 7) {
180
+ cpuidex(regs, 7, 0);
181
+ if (*ebx & (1UL << 5))
182
+ features |= AVX2;
183
+ if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
184
+ if (*ebx & (1UL << 31))
185
+ features |= AVX512VL;
186
+ if (*ebx & (1UL << 16))
187
+ features |= AVX512F;
188
+ }
189
+ }
190
+ }
191
+ }
192
+ g_cpu_features = features;
193
+ return features;
194
+ #else
195
+ /* How to detect NEON? */
196
+ return 0;
197
+ #endif
198
+ }
199
+ }
200
+
201
+ void blake3_compress_in_place(uint32_t cv[8],
202
+ const uint8_t block[BLAKE3_BLOCK_LEN],
203
+ uint8_t block_len, uint64_t counter,
204
+ uint8_t flags) {
205
+ #if defined(IS_X86)
206
+ const enum cpu_feature features = get_cpu_features();
207
+ #if !defined(BLAKE3_NO_AVX512)
208
+ if (features & AVX512VL) {
209
+ blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
210
+ return;
211
+ }
212
+ #endif
213
+ #if !defined(BLAKE3_NO_SSE41)
214
+ if (features & SSE41) {
215
+ blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
216
+ return;
217
+ }
218
+ #endif
219
+ #endif
220
+ blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
221
+ }
222
+
223
+ void blake3_compress_xof(const uint32_t cv[8],
224
+ const uint8_t block[BLAKE3_BLOCK_LEN],
225
+ uint8_t block_len, uint64_t counter, uint8_t flags,
226
+ uint8_t out[64]) {
227
+ #if defined(IS_X86)
228
+ const enum cpu_feature features = get_cpu_features();
229
+ #if !defined(BLAKE3_NO_AVX512)
230
+ if (features & AVX512VL) {
231
+ blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
232
+ return;
233
+ }
234
+ #endif
235
+ #if !defined(BLAKE3_NO_SSE41)
236
+ if (features & SSE41) {
237
+ blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
238
+ return;
239
+ }
240
+ #endif
241
+ #endif
242
+ blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
243
+ }
244
+
245
// Forwards a multi-input hashing request to the best implementation
// available. All arguments are passed on unchanged.
//
// On x86 the order of preference is AVX-512, AVX2, SSE4.1; each can be
// compiled out with the matching BLAKE3_NO_* macro. When none of them is
// taken, the NEON implementation is used if the build defines
// BLAKE3_USE_NEON, and the portable implementation otherwise.
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
                      uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
#if defined(IS_X86)
  const enum cpu_feature features = get_cpu_features();
  // Avoids an unused-variable warning when every x86 backend is compiled out.
  (void)features;
#if !defined(BLAKE3_NO_AVX512)
  // Require both AVX-512F and AVX-512VL. The single-block AVX-512 wrappers in
  // this file are gated on VL, and an F-only CPU must not be sent down a path
  // that may use VL-encoded 128/256-bit instructions.
  // NOTE(review): confirm against the AVX-512 implementation file.
  if ((features & (AVX512F | AVX512VL)) == (AVX512F | AVX512VL)) {
    blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if (features & AVX2) {
    blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (features & SSE41) {
    blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                           increment_counter, flags, flags_start, flags_end,
                           out);
    return;
  }
#endif
#endif

  // Exactly one fallback is compiled in, so no unreachable code is left
  // behind the NEON call.
#if defined(BLAKE3_USE_NEON)
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
                        increment_counter, flags, flags_start, flags_end, out);
#else
  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
#endif
}
287
+
288
// The dynamically detected SIMD degree of the current platform.
//
// Returns 16, 8 or 4 on x86 depending on whether AVX-512, AVX2 or SSE4.1 was
// detected (and not compiled out with the matching BLAKE3_NO_* macro).
// Otherwise returns 4 when the build defines BLAKE3_USE_NEON, and 1 when no
// SIMD implementation applies.
size_t blake3_simd_degree() {
#if defined(IS_X86)
  const enum cpu_feature features = get_cpu_features();
  // Avoids an unused-variable warning when every x86 backend is compiled out.
  (void)features;
#if !defined(BLAKE3_NO_AVX512)
  // Report 16 only when both AVX-512F and AVX-512VL were detected, so an
  // F-only CPU is not advertised as having the full AVX-512 implementation.
  if ((features & (AVX512F | AVX512VL)) == (AVX512F | AVX512VL)) {
    return 16;
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if (features & AVX2) {
    return 8;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (features & SSE41) {
    return 4;
  }
#endif
#endif
#if defined(BLAKE3_USE_NEON)
  return 4;
#else
  return 1;
#endif
}
@@ -0,0 +1,167 @@
1
+ #ifndef BLAKE3_IMPL_H
2
+ #define BLAKE3_IMPL_H
3
+
4
+ #include <assert.h>
5
+ #include <stdbool.h>
6
+ #include <stddef.h>
7
+ #include <stdint.h>
8
+ #include <string.h>
9
+
10
+ #include "blake3.h"
11
+
12
// Internal flag bits. Each flag occupies its own bit so that several can be
// OR-ed together into a single flags value.
enum blake3_flags {
  CHUNK_START = 0x01,
  CHUNK_END = 0x02,
  PARENT = 0x04,
  ROOT = 0x08,
  KEYED_HASH = 0x10,
  DERIVE_KEY_CONTEXT = 0x20,
  DERIVE_KEY_MATERIAL = 0x40,
};
22
+
23
// This C implementation tries to support recent versions of GCC, Clang, and
// MSVC.
//
// INLINE marks a function as file-local and force-inlined: __forceinline on
// MSVC, the always_inline attribute everywhere else.
#if defined(_MSC_VER)
#define INLINE static __forceinline
#else
#define INLINE static inline __attribute__((always_inline))
#endif

// Architecture detection. IS_X86 is defined for both 32-bit and 64-bit x86;
// IS_X86_64 / IS_X86_32 tell the two apart.
#if defined(__x86_64__) || defined(_M_X64)
#define IS_X86
#define IS_X86_64
#endif

#if defined(__i386__) || defined(_M_IX86)
#define IS_X86
#define IS_X86_32
#endif

// Intrinsics headers are only pulled in on x86.
#if defined(IS_X86)
#if defined(_MSC_VER)
#include <intrin.h>
#endif
#include <immintrin.h>
#endif

// Compile-time upper bound on the SIMD degree: 16 on x86, 4 when the build
// defines BLAKE3_USE_NEON, 1 otherwise.
#if defined(IS_X86)
#define MAX_SIMD_DEGREE 16
#elif defined(BLAKE3_USE_NEON)
#define MAX_SIMD_DEGREE 4
#else
#define MAX_SIMD_DEGREE 1
#endif

// There are some places where we want a static size that's equal to the
// MAX_SIMD_DEGREE, but also at least 2.
#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)
59
+
60
// Eight 32-bit initialisation constants (presumably the BLAKE3 IV; the
// visible code in this header does not reference them).
static const uint32_t IV[8] = {
    UINT32_C(0x6A09E667), UINT32_C(0xBB67AE85),
    UINT32_C(0x3C6EF372), UINT32_C(0xA54FF53A),
    UINT32_C(0x510E527F), UINT32_C(0x9B05688C),
    UINT32_C(0x1F83D9AB), UINT32_C(0x5BE0CD19),
};
63
+
64
// Message word schedule: 7 rows of 16 indices. Row 0 is the identity order
// 0..15 and every row is a permutation of 0..15.
// NOTE(review): presumably one row per compression round - the code that
// consumes this table is not visible in this header.
static const uint8_t MSG_SCHEDULE[7][16] = {
    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
    {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8},
    {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1},
    {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6},
    {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4},
    {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7},
    {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13},
};
73
+
74
/* Returns the zero-based position of the most significant set bit of x. */
/* The caller must pass a nonzero x. */
static unsigned int highest_one(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
  /* The leading-zero count is in 0..63 for nonzero x. */
  return 63 - __builtin_clzll(x);
#elif defined(_MSC_VER) && defined(IS_X86_64)
  unsigned long bit_pos;
  _BitScanReverse64(&bit_pos, x);
  return bit_pos;
#elif defined(_MSC_VER) && defined(IS_X86_32)
  /* No 64-bit scan on 32-bit MSVC: look at the upper half first. */
  unsigned long bit_pos;
  if (x >> 32) {
    _BitScanReverse(&bit_pos, x >> 32);
    return 32 + bit_pos;
  }
  _BitScanReverse(&bit_pos, x);
  return bit_pos;
#else
  /* Portable binary search: halve the window from 32 bits down to 1. */
  unsigned int bit_pos = 0;
  unsigned int shift;
  for (shift = 32; shift != 0; shift >>= 1) {
    if (x >> shift) {
      x >>= shift;
      bit_pos += shift;
    }
  }
  return bit_pos;
#endif
}
104
+
105
+ // Count the number of 1 bits.
106
+ INLINE unsigned int popcnt(uint64_t x) {
107
+ #if defined(__GNUC__) || defined(__clang__)
108
+ return __builtin_popcountll(x);
109
+ #else
110
+ unsigned int count = 0;
111
+ while (x != 0) {
112
+ count += 1;
113
+ x &= x - 1;
114
+ }
115
+ return count;
116
+ #endif
117
+ }
118
+
119
+ // Largest power of two less than or equal to x. As a special case, returns 1
120
+ // when x is 0.
121
+ INLINE uint64_t round_down_to_power_of_2(uint64_t x) {
122
+ return 1ULL << highest_one(x | 1);
123
+ }
124
+
125
+ INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; }
126
+
127
+ INLINE uint32_t counter_high(uint64_t counter) {
128
+ return (uint32_t)(counter >> 32);
129
+ }
130
+
131
+ INLINE uint32_t load32(const void *src) {
132
+ const uint8_t *p = (const uint8_t *)src;
133
+ return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) |
134
+ ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24);
135
+ }
136
+
137
+ INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
138
+ uint32_t key_words[8]) {
139
+ key_words[0] = load32(&key[0 * 4]);
140
+ key_words[1] = load32(&key[1 * 4]);
141
+ key_words[2] = load32(&key[2 * 4]);
142
+ key_words[3] = load32(&key[3 * 4]);
143
+ key_words[4] = load32(&key[4 * 4]);
144
+ key_words[5] = load32(&key[5 * 4]);
145
+ key_words[6] = load32(&key[6 * 4]);
146
+ key_words[7] = load32(&key[7 * 4]);
147
+ }
148
+
149
// Entry points declared for the rest of the library. Their definitions are
// not part of this header.

// Single-block compression; `cv` is the only non-const pointer parameter.
void blake3_compress_in_place(uint32_t cv[8],
                              const uint8_t block[BLAKE3_BLOCK_LEN],
                              uint8_t block_len, uint64_t counter,
                              uint8_t flags);

// Single-block compression with a separate 64-byte `out` buffer; `cv` is
// const here.
void blake3_compress_xof(const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags,
                         uint8_t out[64]);

// Multi-input hashing: takes `num_inputs` input pointers and a single `out`
// buffer.
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
                      uint8_t flags_start, uint8_t flags_end, uint8_t *out);

// Returns the SIMD degree as a size_t.
size_t blake3_simd_degree();
165
+
166
+
167
+ #endif /* BLAKE3_IMPL_H */