digest-blake3 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,312 @@
1
+ #include <stdbool.h>
2
+ #include <stddef.h>
3
+ #include <stdint.h>
4
+
5
+ #include "blake3_impl.h"
6
+
7
+ #if defined(IS_X86)
8
+ #if defined(_MSC_VER)
9
+ #include <intrin.h>
10
+ #elif defined(__GNUC__)
11
+ #include <immintrin.h>
12
+ #else
13
+ #error "Unimplemented!"
14
+ #endif
15
+ #endif
16
+
17
+ // Declarations for implementation-specific functions.
18
+ void blake3_compress_in_place_portable(uint32_t cv[8],
19
+ const uint8_t block[BLAKE3_BLOCK_LEN],
20
+ uint8_t block_len, uint64_t counter,
21
+ uint8_t flags);
22
+
23
+ void blake3_compress_xof_portable(const uint32_t cv[8],
24
+ const uint8_t block[BLAKE3_BLOCK_LEN],
25
+ uint8_t block_len, uint64_t counter,
26
+ uint8_t flags, uint8_t out[64]);
27
+
28
+ void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
29
+ size_t blocks, const uint32_t key[8],
30
+ uint64_t counter, bool increment_counter,
31
+ uint8_t flags, uint8_t flags_start,
32
+ uint8_t flags_end, uint8_t *out);
33
+
34
// Prototypes for the x86 SIMD back ends. Each family can be compiled out
// individually with its BLAKE3_NO_* macro.
#if defined(IS_X86)

#if !defined(BLAKE3_NO_SSE41)
void blake3_compress_in_place_sse41(
    uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN],
    uint8_t block_len,
    uint64_t counter,
    uint8_t flags);
void blake3_compress_xof_sse41(
    const uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN],
    uint8_t block_len,
    uint64_t counter,
    uint8_t flags,
    uint8_t out[64]);
void blake3_hash_many_sse41(
    const uint8_t *const *inputs,
    size_t num_inputs,
    size_t blocks,
    const uint32_t key[8],
    uint64_t counter,
    bool increment_counter,
    uint8_t flags,
    uint8_t flags_start,
    uint8_t flags_end,
    uint8_t *out);
#endif

// AVX2 only provides the many-input kernel; no compress functions are
// declared for it here.
#if !defined(BLAKE3_NO_AVX2)
void blake3_hash_many_avx2(
    const uint8_t *const *inputs,
    size_t num_inputs,
    size_t blocks,
    const uint32_t key[8],
    uint64_t counter,
    bool increment_counter,
    uint8_t flags,
    uint8_t flags_start,
    uint8_t flags_end,
    uint8_t *out);
#endif

#if !defined(BLAKE3_NO_AVX512)
void blake3_compress_in_place_avx512(
    uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN],
    uint8_t block_len,
    uint64_t counter,
    uint8_t flags);

void blake3_compress_xof_avx512(
    const uint32_t cv[8],
    const uint8_t block[BLAKE3_BLOCK_LEN],
    uint8_t block_len,
    uint64_t counter,
    uint8_t flags,
    uint8_t out[64]);

void blake3_hash_many_avx512(
    const uint8_t *const *inputs,
    size_t num_inputs,
    size_t blocks,
    const uint32_t key[8],
    uint64_t counter,
    bool increment_counter,
    uint8_t flags,
    uint8_t flags_start,
    uint8_t flags_end,
    uint8_t *out);
#endif

#endif
75
+
76
// Prototype for the NEON many-input kernel, declared only when the build
// opts in with BLAKE3_USE_NEON.
#if defined(BLAKE3_USE_NEON)
void blake3_hash_many_neon(
    const uint8_t *const *inputs,
    size_t num_inputs,
    size_t blocks,
    const uint32_t key[8],
    uint64_t counter,
    bool increment_counter,
    uint8_t flags,
    uint8_t flags_start,
    uint8_t flags_end,
    uint8_t *out);
#endif
83
+
84
+ #if defined(IS_X86)
85
// Read extended control register 0 with XGETBV and return it as one 64-bit
// value. The caller must first confirm OSXSAVE via CPUID.
static uint64_t xgetbv() {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  uint32_t lo = 0;
  uint32_t hi = 0;
  // The register index goes in ECX; the result comes back in EDX:EAX.
  __asm__ __volatile__("xgetbv\n" : "=a"(lo), "=d"(hi) : "c"(0));
  const uint64_t upper = (uint64_t)hi << 32;
  return upper | lo;
#endif
}
94
+
95
// Run CPUID for leaf `id` and store EAX, EBX, ECX, EDX in out[0..3].
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
  __cpuid((int *)out, id);
#elif defined(__i386__) || defined(_M_IX86)
  // 32-bit path: EBX is never named as an operand. Its value is parked in a
  // scratch register before CPUID and swapped back afterwards, so the scratch
  // register ends up holding CPUID's EBX result. Presumably this is because
  // EBX can be reserved on 32-bit targets (e.g. as the PIC register).
  __asm__ __volatile__(
      "movl %%ebx, %1\n"
      "cpuid\n"
      "xchgl %1, %%ebx\n"
      : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
      : "a"(id));
#else
  __asm__ __volatile__(
      "cpuid\n"
      : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
      : "a"(id));
#endif
}
110
+
111
// Run CPUID for leaf `id`, sub-leaf `sid`, and store EAX, EBX, ECX, EDX in
// out[0..3].
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
  __cpuidex((int *)out, id, sid);
#elif defined(__i386__) || defined(_M_IX86)
  // 32-bit path: EBX is saved to a scratch register before CPUID and swapped
  // back afterwards, leaving CPUID's EBX result in the scratch register.
  // Presumably this is because EBX can be reserved on 32-bit targets.
  __asm__ __volatile__(
      "movl %%ebx, %1\n"
      "cpuid\n"
      "xchgl %1, %%ebx\n"
      : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
      : "a"(id), "c"(sid));
#else
  __asm__ __volatile__(
      "cpuid\n"
      : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
      : "a"(id), "c"(sid));
#endif
}
126
+
127
+ #endif
128
+
129
// Bit flags for the x86 SIMD extensions this file dispatches on. Each
// enumerator is a distinct bit so a set of features fits in one value.
// UNDEFINED is a sentinel meaning "not probed yet".
enum cpu_feature {
  SSE2 = 0x01,
  SSSE3 = 0x02,
  SSE41 = 0x04,
  AVX = 0x08,
  AVX2 = 0x10,
  AVX512F = 0x20,
  AVX512VL = 0x40,
  /* ... */
  UNDEFINED = 0x40000000
};
140
+
141
+ #if !defined(BLAKE3_TESTING)
142
+ static /* Allow the variable to be controlled manually for testing */
143
+ #endif
144
+ enum cpu_feature g_cpu_features = UNDEFINED;
145
+
146
+ #if !defined(BLAKE3_TESTING)
147
+ static
148
+ #endif
149
+ enum cpu_feature
150
+ get_cpu_features() {
151
+
152
+ if (g_cpu_features != UNDEFINED) {
153
+ return g_cpu_features;
154
+ } else {
155
+ #if defined(IS_X86)
156
+ uint32_t regs[4] = {0};
157
+ uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
158
+ (void)edx;
159
+ enum cpu_feature features = 0;
160
+ cpuid(regs, 0);
161
+ const int max_id = *eax;
162
+ cpuid(regs, 1);
163
+ #if defined(__amd64__) || defined(_M_X64)
164
+ features |= SSE2;
165
+ #else
166
+ if (*edx & (1UL << 26))
167
+ features |= SSE2;
168
+ #endif
169
+ if (*ecx & (1UL << 0))
170
+ features |= SSSE3;
171
+ if (*ecx & (1UL << 19))
172
+ features |= SSE41;
173
+
174
+ if (*ecx & (1UL << 27)) { // OSXSAVE
175
+ const uint64_t mask = xgetbv();
176
+ if ((mask & 6) == 6) { // SSE and AVX states
177
+ if (*ecx & (1UL << 28))
178
+ features |= AVX;
179
+ if (max_id >= 7) {
180
+ cpuidex(regs, 7, 0);
181
+ if (*ebx & (1UL << 5))
182
+ features |= AVX2;
183
+ if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm
184
+ if (*ebx & (1UL << 31))
185
+ features |= AVX512VL;
186
+ if (*ebx & (1UL << 16))
187
+ features |= AVX512F;
188
+ }
189
+ }
190
+ }
191
+ }
192
+ g_cpu_features = features;
193
+ return features;
194
+ #else
195
+ /* How to detect NEON? */
196
+ return 0;
197
+ #endif
198
+ }
199
+ }
200
+
201
+ void blake3_compress_in_place(uint32_t cv[8],
202
+ const uint8_t block[BLAKE3_BLOCK_LEN],
203
+ uint8_t block_len, uint64_t counter,
204
+ uint8_t flags) {
205
+ #if defined(IS_X86)
206
+ const enum cpu_feature features = get_cpu_features();
207
+ #if !defined(BLAKE3_NO_AVX512)
208
+ if (features & AVX512VL) {
209
+ blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
210
+ return;
211
+ }
212
+ #endif
213
+ #if !defined(BLAKE3_NO_SSE41)
214
+ if (features & SSE41) {
215
+ blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
216
+ return;
217
+ }
218
+ #endif
219
+ #endif
220
+ blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
221
+ }
222
+
223
+ void blake3_compress_xof(const uint32_t cv[8],
224
+ const uint8_t block[BLAKE3_BLOCK_LEN],
225
+ uint8_t block_len, uint64_t counter, uint8_t flags,
226
+ uint8_t out[64]) {
227
+ #if defined(IS_X86)
228
+ const enum cpu_feature features = get_cpu_features();
229
+ #if !defined(BLAKE3_NO_AVX512)
230
+ if (features & AVX512VL) {
231
+ blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
232
+ return;
233
+ }
234
+ #endif
235
+ #if !defined(BLAKE3_NO_SSE41)
236
+ if (features & SSE41) {
237
+ blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
238
+ return;
239
+ }
240
+ #endif
241
+ #endif
242
+ blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
243
+ }
244
+
245
// Hash `num_inputs` inputs with the widest available back end. All arguments
// are forwarded unchanged; their meaning is defined by the back ends.
//
// Selection order on x86 is AVX-512, AVX2, SSE4.1. If none is usable (or this
// is not an x86 build) the NEON kernel is used when BLAKE3_USE_NEON is
// defined, otherwise the portable one.
//
// The AVX-512 kernel is chosen only when both AVX512F and AVX512VL are
// reported. This matches the compress dispatchers in this file, which gate the
// same back end on AVX512VL.
// NOTE(review): presumably the AVX-512 kernel also uses 128/256-bit
// VL-encoded instructions for partial batches -- confirm against the AVX-512
// implementation.
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
                      uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
#if defined(IS_X86)
  const enum cpu_feature features = get_cpu_features();
  (void)features; // Unused when every x86 back end is compiled out.
#if !defined(BLAKE3_NO_AVX512)
  if ((features & (AVX512F | AVX512VL)) == (AVX512F | AVX512VL)) {
    blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if (features & AVX2) {
    blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (features & SSE41) {
    blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                           increment_counter, flags, flags_start, flags_end,
                           out);
    return;
  }
#endif
#endif

  // Exactly one fallback is compiled in, so no call is left unreachable.
#if defined(BLAKE3_USE_NEON)
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
                        increment_counter, flags, flags_start, flags_end, out);
#else
  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
#endif
}
287
+
288
// The dynamically detected SIMD degree of the current platform.
//
// Returns 16 for AVX-512, 8 for AVX2, 4 for SSE4.1 or NEON, and 1 when only
// the portable back end is available. The x86 checks run in the same order as
// in blake3_hash_many.
//
// 16 is reported only when both AVX512F and AVX512VL are present, consistent
// with the compress dispatchers in this file, which gate the AVX-512 back end
// on AVX512VL.
size_t blake3_simd_degree(void) {
#if defined(IS_X86)
  const enum cpu_feature features = get_cpu_features();
  (void)features; // Unused when every x86 back end is compiled out.
#if !defined(BLAKE3_NO_AVX512)
  if ((features & (AVX512F | AVX512VL)) == (AVX512F | AVX512VL)) {
    return 16;
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if (features & AVX2) {
    return 8;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (features & SSE41) {
    return 4;
  }
#endif
#endif
  // Exactly one fallback is compiled in, so no return is left unreachable.
#if defined(BLAKE3_USE_NEON)
  return 4;
#else
  return 1;
#endif
}
@@ -0,0 +1,167 @@
1
+ #ifndef BLAKE3_IMPL_H
2
+ #define BLAKE3_IMPL_H
3
+
4
+ #include <assert.h>
5
+ #include <stdbool.h>
6
+ #include <stddef.h>
7
+ #include <stdint.h>
8
+ #include <string.h>
9
+
10
+ #include "blake3.h"
11
+
12
// Internal flag bits. Each enumerator is a distinct bit so several can be
// OR-ed together; all seven fit in one byte.
enum blake3_flags {
  CHUNK_START = 0x01,
  CHUNK_END = 0x02,
  PARENT = 0x04,
  ROOT = 0x08,
  KEYED_HASH = 0x10,
  DERIVE_KEY_CONTEXT = 0x20,
  DERIVE_KEY_MATERIAL = 0x40,
};
22
+
23
// Supported compilers are recent GCC, Clang and MSVC. INLINE expands to each
// compiler's spelling of "static, always inlined".
#if !defined(_MSC_VER)
#define INLINE static inline __attribute__((always_inline))
#else
#define INLINE static __forceinline
#endif

// Target detection: IS_X86 is defined for any x86 build, together with exactly
// one of IS_X86_64 / IS_X86_32 naming the word size.
#if defined(__x86_64__) || defined(_M_X64)
#define IS_X86_64
#define IS_X86
#endif

#if defined(__i386__) || defined(_M_IX86)
#define IS_X86_32
#define IS_X86
#endif
40
+
41
+ #if defined(IS_X86)
42
+ #if defined(_MSC_VER)
43
+ #include <intrin.h>
44
+ #endif
45
+ #include <immintrin.h>
46
+ #endif
47
+
48
// Compile-time upper bound on the SIMD degree: 16 on x86, 4 when the NEON
// back end is enabled, 1 otherwise.
#if !defined(IS_X86) && !defined(BLAKE3_USE_NEON)
#define MAX_SIMD_DEGREE 1
#elif defined(IS_X86)
#define MAX_SIMD_DEGREE 16
#else
#define MAX_SIMD_DEGREE 4
#endif

// Same bound, clamped to at least 2, for places that need a static size of
// two or more.
#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? MAX_SIMD_DEGREE : 2)
59
+
60
// The eight 32-bit initialization vector words.
static const uint32_t IV[8] = {
    0x6A09E667UL,
    0xBB67AE85UL,
    0x3C6EF372UL,
    0xA54FF53AUL,
    0x510E527FUL,
    0x9B05688CUL,
    0x1F83D9ABUL,
    0x5BE0CD19UL,
};
63
+
64
// Message word order, one row per round (7 rows). Every row is a permutation
// of the indices 0..15; row 0 is the identity.
static const uint8_t MSG_SCHEDULE[7][16] = {
    /* 0 */ {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
    /* 1 */ {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8},
    /* 2 */ {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1},
    /* 3 */ {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6},
    /* 4 */ {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4},
    /* 5 */ {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7},
    /* 6 */ {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13},
};
73
+
74
/* Return the zero-based index of the most significant set bit of x. */
/* x must be nonzero; the result for 0 is not meaningful. */
static unsigned int highest_one(uint64_t x) {
#if defined(__GNUC__) || defined(__clang__)
  /* The leading-zero count is in 0..63, so 63 minus it is the bit index. */
  return 63u - (unsigned int)__builtin_clzll(x);
#elif defined(_MSC_VER) && defined(IS_X86_64)
  unsigned long bit;
  _BitScanReverse64(&bit, x);
  return (unsigned int)bit;
#elif defined(_MSC_VER) && defined(IS_X86_32)
  /* Only a 32-bit scan is available: scan the upper half if it is nonzero, */
  /* otherwise the lower half. */
  const unsigned int base = (x >> 32) ? 32u : 0u;
  unsigned long bit;
  _BitScanReverse(&bit, (unsigned long)(x >> base));
  return base + (unsigned int)bit;
#else
  /* Binary search: halve the window each step, keeping the upper half */
  /* whenever it contains a set bit. */
  unsigned int pos = 0;
  for (unsigned int step = 32; step != 0; step >>= 1) {
    if ((x >> step) != 0) {
      x >>= step;
      pos += step;
    }
  }
  return pos;
#endif
}
104
+
105
+ // Count the number of 1 bits.
106
+ INLINE unsigned int popcnt(uint64_t x) {
107
+ #if defined(__GNUC__) || defined(__clang__)
108
+ return __builtin_popcountll(x);
109
+ #else
110
+ unsigned int count = 0;
111
+ while (x != 0) {
112
+ count += 1;
113
+ x &= x - 1;
114
+ }
115
+ return count;
116
+ #endif
117
+ }
118
+
119
+ // Largest power of two less than or equal to x. As a special case, returns 1
120
+ // when x is 0.
121
+ INLINE uint64_t round_down_to_power_of_2(uint64_t x) {
122
+ return 1ULL << highest_one(x | 1);
123
+ }
124
+
125
+ INLINE uint32_t counter_low(uint64_t counter) { return (uint32_t)counter; }
126
+
127
+ INLINE uint32_t counter_high(uint64_t counter) {
128
+ return (uint32_t)(counter >> 32);
129
+ }
130
+
131
+ INLINE uint32_t load32(const void *src) {
132
+ const uint8_t *p = (const uint8_t *)src;
133
+ return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) |
134
+ ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24);
135
+ }
136
+
137
+ INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
138
+ uint32_t key_words[8]) {
139
+ key_words[0] = load32(&key[0 * 4]);
140
+ key_words[1] = load32(&key[1 * 4]);
141
+ key_words[2] = load32(&key[2 * 4]);
142
+ key_words[3] = load32(&key[3 * 4]);
143
+ key_words[4] = load32(&key[4 * 4]);
144
+ key_words[5] = load32(&key[5 * 4]);
145
+ key_words[6] = load32(&key[6 * 4]);
146
+ key_words[7] = load32(&key[7 * 4]);
147
+ }
148
+
149
+ void blake3_compress_in_place(uint32_t cv[8],
150
+ const uint8_t block[BLAKE3_BLOCK_LEN],
151
+ uint8_t block_len, uint64_t counter,
152
+ uint8_t flags);
153
+
154
+ void blake3_compress_xof(const uint32_t cv[8],
155
+ const uint8_t block[BLAKE3_BLOCK_LEN],
156
+ uint8_t block_len, uint64_t counter, uint8_t flags,
157
+ uint8_t out[64]);
158
+
159
+ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
160
+ size_t blocks, const uint32_t key[8], uint64_t counter,
161
+ bool increment_counter, uint8_t flags,
162
+ uint8_t flags_start, uint8_t flags_end, uint8_t *out);
163
+
164
+ size_t blake3_simd_degree();
165
+
166
+
167
+ #endif /* BLAKE3_IMPL_H */