digest-blake3 0.22.1 → 1.2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -14,72 +14,7 @@
14
14
  #endif
15
15
  #endif
16
16
 
17
- // Declarations for implementation-specific functions.
18
- void blake3_compress_in_place_portable(uint32_t cv[8],
19
- const uint8_t block[BLAKE3_BLOCK_LEN],
20
- uint8_t block_len, uint64_t counter,
21
- uint8_t flags);
22
-
23
- void blake3_compress_xof_portable(const uint32_t cv[8],
24
- const uint8_t block[BLAKE3_BLOCK_LEN],
25
- uint8_t block_len, uint64_t counter,
26
- uint8_t flags, uint8_t out[64]);
27
-
28
- void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
29
- size_t blocks, const uint32_t key[8],
30
- uint64_t counter, bool increment_counter,
31
- uint8_t flags, uint8_t flags_start,
32
- uint8_t flags_end, uint8_t *out);
33
-
34
- #if defined(IS_X86)
35
- #if !defined(BLAKE3_NO_SSE41)
36
- void blake3_compress_in_place_sse41(uint32_t cv[8],
37
- const uint8_t block[BLAKE3_BLOCK_LEN],
38
- uint8_t block_len, uint64_t counter,
39
- uint8_t flags);
40
- void blake3_compress_xof_sse41(const uint32_t cv[8],
41
- const uint8_t block[BLAKE3_BLOCK_LEN],
42
- uint8_t block_len, uint64_t counter,
43
- uint8_t flags, uint8_t out[64]);
44
- void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
45
- size_t blocks, const uint32_t key[8],
46
- uint64_t counter, bool increment_counter,
47
- uint8_t flags, uint8_t flags_start,
48
- uint8_t flags_end, uint8_t *out);
49
- #endif
50
- #if !defined(BLAKE3_NO_AVX2)
51
- void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
52
- size_t blocks, const uint32_t key[8],
53
- uint64_t counter, bool increment_counter,
54
- uint8_t flags, uint8_t flags_start,
55
- uint8_t flags_end, uint8_t *out);
56
- #endif
57
- #if !defined(BLAKE3_NO_AVX512)
58
- void blake3_compress_in_place_avx512(uint32_t cv[8],
59
- const uint8_t block[BLAKE3_BLOCK_LEN],
60
- uint8_t block_len, uint64_t counter,
61
- uint8_t flags);
62
-
63
- void blake3_compress_xof_avx512(const uint32_t cv[8],
64
- const uint8_t block[BLAKE3_BLOCK_LEN],
65
- uint8_t block_len, uint64_t counter,
66
- uint8_t flags, uint8_t out[64]);
67
-
68
- void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
69
- size_t blocks, const uint32_t key[8],
70
- uint64_t counter, bool increment_counter,
71
- uint8_t flags, uint8_t flags_start,
72
- uint8_t flags_end, uint8_t *out);
73
- #endif
74
- #endif
75
-
76
- #if defined(BLAKE3_USE_NEON)
77
- void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
78
- size_t blocks, const uint32_t key[8],
79
- uint64_t counter, bool increment_counter,
80
- uint8_t flags, uint8_t flags_start,
81
- uint8_t flags_end, uint8_t *out);
82
- #endif
17
+ #define MAYBE_UNUSED(x) (void)((x))
83
18
 
84
19
  #if defined(IS_X86)
85
20
  static uint64_t xgetbv() {
@@ -204,6 +139,7 @@ void blake3_compress_in_place(uint32_t cv[8],
204
139
  uint8_t flags) {
205
140
  #if defined(IS_X86)
206
141
  const enum cpu_feature features = get_cpu_features();
142
+ MAYBE_UNUSED(features);
207
143
  #if !defined(BLAKE3_NO_AVX512)
208
144
  if (features & AVX512VL) {
209
145
  blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
@@ -216,6 +152,12 @@ void blake3_compress_in_place(uint32_t cv[8],
216
152
  return;
217
153
  }
218
154
  #endif
155
+ #if !defined(BLAKE3_NO_SSE2)
156
+ if (features & SSE2) {
157
+ blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
158
+ return;
159
+ }
160
+ #endif
219
161
  #endif
220
162
  blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
221
163
  }
@@ -226,6 +168,7 @@ void blake3_compress_xof(const uint32_t cv[8],
226
168
  uint8_t out[64]) {
227
169
  #if defined(IS_X86)
228
170
  const enum cpu_feature features = get_cpu_features();
171
+ MAYBE_UNUSED(features);
229
172
  #if !defined(BLAKE3_NO_AVX512)
230
173
  if (features & AVX512VL) {
231
174
  blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
@@ -238,6 +181,12 @@ void blake3_compress_xof(const uint32_t cv[8],
238
181
  return;
239
182
  }
240
183
  #endif
184
+ #if !defined(BLAKE3_NO_SSE2)
185
+ if (features & SSE2) {
186
+ blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
187
+ return;
188
+ }
189
+ #endif
241
190
  #endif
242
191
  blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
243
192
  }
@@ -248,8 +197,9 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
248
197
  uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
249
198
  #if defined(IS_X86)
250
199
  const enum cpu_feature features = get_cpu_features();
200
+ MAYBE_UNUSED(features);
251
201
  #if !defined(BLAKE3_NO_AVX512)
252
- if (features & AVX512F) {
202
+ if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
253
203
  blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
254
204
  increment_counter, flags, flags_start, flags_end,
255
205
  out);
@@ -272,9 +222,17 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
272
222
  return;
273
223
  }
274
224
  #endif
225
+ #if !defined(BLAKE3_NO_SSE2)
226
+ if (features & SSE2) {
227
+ blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
228
+ increment_counter, flags, flags_start, flags_end,
229
+ out);
230
+ return;
231
+ }
232
+ #endif
275
233
  #endif
276
234
 
277
- #if defined(BLAKE3_USE_NEON)
235
+ #if BLAKE3_USE_NEON == 1
278
236
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
279
237
  increment_counter, flags, flags_start, flags_end, out);
280
238
  return;
@@ -286,11 +244,12 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
286
244
  }
287
245
 
288
246
  // The dynamically detected SIMD degree of the current platform.
289
- size_t blake3_simd_degree() {
247
+ size_t blake3_simd_degree(void) {
290
248
  #if defined(IS_X86)
291
249
  const enum cpu_feature features = get_cpu_features();
250
+ MAYBE_UNUSED(features);
292
251
  #if !defined(BLAKE3_NO_AVX512)
293
- if (features & AVX512F) {
252
+ if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
294
253
  return 16;
295
254
  }
296
255
  #endif
@@ -304,8 +263,13 @@ size_t blake3_simd_degree() {
304
263
  return 4;
305
264
  }
306
265
  #endif
266
+ #if !defined(BLAKE3_NO_SSE2)
267
+ if (features & SSE2) {
268
+ return 4;
269
+ }
270
+ #endif
307
271
  #endif
308
- #if defined(BLAKE3_USE_NEON)
272
+ #if BLAKE3_USE_NEON == 1
309
273
  return 4;
310
274
  #endif
311
275
  return 1;
@@ -38,6 +38,10 @@ enum blake3_flags {
38
38
  #define IS_X86_32
39
39
  #endif
40
40
 
41
+ #if defined(__aarch64__) || defined(_M_ARM64)
42
+ #define IS_AARCH64
43
+ #endif
44
+
41
45
  #if defined(IS_X86)
42
46
  #if defined(_MSC_VER)
43
47
  #include <intrin.h>
@@ -45,9 +49,18 @@ enum blake3_flags {
45
49
  #include <immintrin.h>
46
50
  #endif
47
51
 
52
+ #if !defined(BLAKE3_USE_NEON)
53
+ // If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
54
+ #if defined(IS_AARCH64)
55
+ #define BLAKE3_USE_NEON 1
56
+ #else
57
+ #define BLAKE3_USE_NEON 0
58
+ #endif
59
+ #endif
60
+
48
61
  #if defined(IS_X86)
49
62
  #define MAX_SIMD_DEGREE 16
50
- #elif defined(BLAKE3_USE_NEON)
63
+ #elif BLAKE3_USE_NEON == 1
51
64
  #define MAX_SIMD_DEGREE 4
52
65
  #else
53
66
  #define MAX_SIMD_DEGREE 1
@@ -146,6 +159,25 @@ INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
146
159
  key_words[7] = load32(&key[7 * 4]);
147
160
  }
148
161
 
162
+ INLINE void store32(void *dst, uint32_t w) {
163
+ uint8_t *p = (uint8_t *)dst;
164
+ p[0] = (uint8_t)(w >> 0);
165
+ p[1] = (uint8_t)(w >> 8);
166
+ p[2] = (uint8_t)(w >> 16);
167
+ p[3] = (uint8_t)(w >> 24);
168
+ }
169
+
170
+ INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) {
171
+ store32(&bytes_out[0 * 4], cv_words[0]);
172
+ store32(&bytes_out[1 * 4], cv_words[1]);
173
+ store32(&bytes_out[2 * 4], cv_words[2]);
174
+ store32(&bytes_out[3 * 4], cv_words[3]);
175
+ store32(&bytes_out[4 * 4], cv_words[4]);
176
+ store32(&bytes_out[5 * 4], cv_words[5]);
177
+ store32(&bytes_out[6 * 4], cv_words[6]);
178
+ store32(&bytes_out[7 * 4], cv_words[7]);
179
+ }
180
+
149
181
  void blake3_compress_in_place(uint32_t cv[8],
150
182
  const uint8_t block[BLAKE3_BLOCK_LEN],
151
183
  uint8_t block_len, uint64_t counter,
@@ -161,7 +193,90 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
161
193
  bool increment_counter, uint8_t flags,
162
194
  uint8_t flags_start, uint8_t flags_end, uint8_t *out);
163
195
 
164
- size_t blake3_simd_degree();
196
+ size_t blake3_simd_degree(void);
197
+
198
+
199
+ // Declarations for implementation-specific functions.
200
+ void blake3_compress_in_place_portable(uint32_t cv[8],
201
+ const uint8_t block[BLAKE3_BLOCK_LEN],
202
+ uint8_t block_len, uint64_t counter,
203
+ uint8_t flags);
204
+
205
+ void blake3_compress_xof_portable(const uint32_t cv[8],
206
+ const uint8_t block[BLAKE3_BLOCK_LEN],
207
+ uint8_t block_len, uint64_t counter,
208
+ uint8_t flags, uint8_t out[64]);
209
+
210
+ void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
211
+ size_t blocks, const uint32_t key[8],
212
+ uint64_t counter, bool increment_counter,
213
+ uint8_t flags, uint8_t flags_start,
214
+ uint8_t flags_end, uint8_t *out);
215
+
216
+ #if defined(IS_X86)
217
+ #if !defined(BLAKE3_NO_SSE2)
218
+ void blake3_compress_in_place_sse2(uint32_t cv[8],
219
+ const uint8_t block[BLAKE3_BLOCK_LEN],
220
+ uint8_t block_len, uint64_t counter,
221
+ uint8_t flags);
222
+ void blake3_compress_xof_sse2(const uint32_t cv[8],
223
+ const uint8_t block[BLAKE3_BLOCK_LEN],
224
+ uint8_t block_len, uint64_t counter,
225
+ uint8_t flags, uint8_t out[64]);
226
+ void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
227
+ size_t blocks, const uint32_t key[8],
228
+ uint64_t counter, bool increment_counter,
229
+ uint8_t flags, uint8_t flags_start,
230
+ uint8_t flags_end, uint8_t *out);
231
+ #endif
232
+ #if !defined(BLAKE3_NO_SSE41)
233
+ void blake3_compress_in_place_sse41(uint32_t cv[8],
234
+ const uint8_t block[BLAKE3_BLOCK_LEN],
235
+ uint8_t block_len, uint64_t counter,
236
+ uint8_t flags);
237
+ void blake3_compress_xof_sse41(const uint32_t cv[8],
238
+ const uint8_t block[BLAKE3_BLOCK_LEN],
239
+ uint8_t block_len, uint64_t counter,
240
+ uint8_t flags, uint8_t out[64]);
241
+ void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
242
+ size_t blocks, const uint32_t key[8],
243
+ uint64_t counter, bool increment_counter,
244
+ uint8_t flags, uint8_t flags_start,
245
+ uint8_t flags_end, uint8_t *out);
246
+ #endif
247
+ #if !defined(BLAKE3_NO_AVX2)
248
+ void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
249
+ size_t blocks, const uint32_t key[8],
250
+ uint64_t counter, bool increment_counter,
251
+ uint8_t flags, uint8_t flags_start,
252
+ uint8_t flags_end, uint8_t *out);
253
+ #endif
254
+ #if !defined(BLAKE3_NO_AVX512)
255
+ void blake3_compress_in_place_avx512(uint32_t cv[8],
256
+ const uint8_t block[BLAKE3_BLOCK_LEN],
257
+ uint8_t block_len, uint64_t counter,
258
+ uint8_t flags);
259
+
260
+ void blake3_compress_xof_avx512(const uint32_t cv[8],
261
+ const uint8_t block[BLAKE3_BLOCK_LEN],
262
+ uint8_t block_len, uint64_t counter,
263
+ uint8_t flags, uint8_t out[64]);
264
+
265
+ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
266
+ size_t blocks, const uint32_t key[8],
267
+ uint64_t counter, bool increment_counter,
268
+ uint8_t flags, uint8_t flags_start,
269
+ uint8_t flags_end, uint8_t *out);
270
+ #endif
271
+ #endif
272
+
273
+ #if BLAKE3_USE_NEON == 1
274
+ void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
275
+ size_t blocks, const uint32_t key[8],
276
+ uint64_t counter, bool increment_counter,
277
+ uint8_t flags, uint8_t flags_start,
278
+ uint8_t flags_end, uint8_t *out);
279
+ #endif
165
280
 
166
281
 
167
282
  #endif /* BLAKE3_IMPL_H */
@@ -2,7 +2,12 @@
2
2
 
3
3
  #include <arm_neon.h>
4
4
 
5
- // TODO: This is probably incorrect for big-endian ARM. How should that work?
5
+ #ifdef __ARM_BIG_ENDIAN
6
+ #error "This implementation only supports little-endian ARM."
7
+ // It might be that all we need for big-endian support here is to get the loads
8
+ // and stores right, but step zero would be finding a way to test it in CI.
9
+ #endif
10
+
6
11
  INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
7
12
  // vld1q_u32 has alignment requirements. Don't use it.
8
13
  uint32x4_t x;
@@ -1,14 +1,6 @@
1
1
  #include "blake3_impl.h"
2
2
  #include <string.h>
3
3
 
4
- INLINE void store32(void *dst, uint32_t w) {
5
- uint8_t *p = (uint8_t *)dst;
6
- p[0] = (uint8_t)(w >> 0);
7
- p[1] = (uint8_t)(w >> 8);
8
- p[2] = (uint8_t)(w >> 16);
9
- p[3] = (uint8_t)(w >> 24);
10
- }
11
-
12
4
  INLINE uint32_t rotr32(uint32_t w, uint32_t c) {
13
5
  return (w >> c) | (w << (32 - c));
14
6
  }
@@ -147,7 +139,7 @@ INLINE void hash_one_portable(const uint8_t *input, size_t blocks,
147
139
  blocks -= 1;
148
140
  block_flags = flags;
149
141
  }
150
- memcpy(out, cv, 32);
142
+ store_cv_words(out, cv);
151
143
  }
152
144
 
153
145
  void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,