digest-blake3 0.22.1 → 1.2.0.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
@@ -14,72 +14,7 @@
14
14
  #endif
15
15
  #endif
16
16
 
17
- // Declarations for implementation-specific functions.
18
- void blake3_compress_in_place_portable(uint32_t cv[8],
19
- const uint8_t block[BLAKE3_BLOCK_LEN],
20
- uint8_t block_len, uint64_t counter,
21
- uint8_t flags);
22
-
23
- void blake3_compress_xof_portable(const uint32_t cv[8],
24
- const uint8_t block[BLAKE3_BLOCK_LEN],
25
- uint8_t block_len, uint64_t counter,
26
- uint8_t flags, uint8_t out[64]);
27
-
28
- void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
29
- size_t blocks, const uint32_t key[8],
30
- uint64_t counter, bool increment_counter,
31
- uint8_t flags, uint8_t flags_start,
32
- uint8_t flags_end, uint8_t *out);
33
-
34
- #if defined(IS_X86)
35
- #if !defined(BLAKE3_NO_SSE41)
36
- void blake3_compress_in_place_sse41(uint32_t cv[8],
37
- const uint8_t block[BLAKE3_BLOCK_LEN],
38
- uint8_t block_len, uint64_t counter,
39
- uint8_t flags);
40
- void blake3_compress_xof_sse41(const uint32_t cv[8],
41
- const uint8_t block[BLAKE3_BLOCK_LEN],
42
- uint8_t block_len, uint64_t counter,
43
- uint8_t flags, uint8_t out[64]);
44
- void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
45
- size_t blocks, const uint32_t key[8],
46
- uint64_t counter, bool increment_counter,
47
- uint8_t flags, uint8_t flags_start,
48
- uint8_t flags_end, uint8_t *out);
49
- #endif
50
- #if !defined(BLAKE3_NO_AVX2)
51
- void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
52
- size_t blocks, const uint32_t key[8],
53
- uint64_t counter, bool increment_counter,
54
- uint8_t flags, uint8_t flags_start,
55
- uint8_t flags_end, uint8_t *out);
56
- #endif
57
- #if !defined(BLAKE3_NO_AVX512)
58
- void blake3_compress_in_place_avx512(uint32_t cv[8],
59
- const uint8_t block[BLAKE3_BLOCK_LEN],
60
- uint8_t block_len, uint64_t counter,
61
- uint8_t flags);
62
-
63
- void blake3_compress_xof_avx512(const uint32_t cv[8],
64
- const uint8_t block[BLAKE3_BLOCK_LEN],
65
- uint8_t block_len, uint64_t counter,
66
- uint8_t flags, uint8_t out[64]);
67
-
68
- void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
69
- size_t blocks, const uint32_t key[8],
70
- uint64_t counter, bool increment_counter,
71
- uint8_t flags, uint8_t flags_start,
72
- uint8_t flags_end, uint8_t *out);
73
- #endif
74
- #endif
75
-
76
- #if defined(BLAKE3_USE_NEON)
77
- void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
78
- size_t blocks, const uint32_t key[8],
79
- uint64_t counter, bool increment_counter,
80
- uint8_t flags, uint8_t flags_start,
81
- uint8_t flags_end, uint8_t *out);
82
- #endif
17
+ #define MAYBE_UNUSED(x) (void)((x))
83
18
 
84
19
  #if defined(IS_X86)
85
20
  static uint64_t xgetbv() {
@@ -204,6 +139,7 @@ void blake3_compress_in_place(uint32_t cv[8],
204
139
  uint8_t flags) {
205
140
  #if defined(IS_X86)
206
141
  const enum cpu_feature features = get_cpu_features();
142
+ MAYBE_UNUSED(features);
207
143
  #if !defined(BLAKE3_NO_AVX512)
208
144
  if (features & AVX512VL) {
209
145
  blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
@@ -216,6 +152,12 @@ void blake3_compress_in_place(uint32_t cv[8],
216
152
  return;
217
153
  }
218
154
  #endif
155
+ #if !defined(BLAKE3_NO_SSE2)
156
+ if (features & SSE2) {
157
+ blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
158
+ return;
159
+ }
160
+ #endif
219
161
  #endif
220
162
  blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
221
163
  }
@@ -226,6 +168,7 @@ void blake3_compress_xof(const uint32_t cv[8],
226
168
  uint8_t out[64]) {
227
169
  #if defined(IS_X86)
228
170
  const enum cpu_feature features = get_cpu_features();
171
+ MAYBE_UNUSED(features);
229
172
  #if !defined(BLAKE3_NO_AVX512)
230
173
  if (features & AVX512VL) {
231
174
  blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
@@ -238,6 +181,12 @@ void blake3_compress_xof(const uint32_t cv[8],
238
181
  return;
239
182
  }
240
183
  #endif
184
+ #if !defined(BLAKE3_NO_SSE2)
185
+ if (features & SSE2) {
186
+ blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
187
+ return;
188
+ }
189
+ #endif
241
190
  #endif
242
191
  blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
243
192
  }
@@ -248,8 +197,9 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
248
197
  uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
249
198
  #if defined(IS_X86)
250
199
  const enum cpu_feature features = get_cpu_features();
200
+ MAYBE_UNUSED(features);
251
201
  #if !defined(BLAKE3_NO_AVX512)
252
- if (features & AVX512F) {
202
+ if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
253
203
  blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
254
204
  increment_counter, flags, flags_start, flags_end,
255
205
  out);
@@ -272,9 +222,17 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
272
222
  return;
273
223
  }
274
224
  #endif
225
+ #if !defined(BLAKE3_NO_SSE2)
226
+ if (features & SSE2) {
227
+ blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
228
+ increment_counter, flags, flags_start, flags_end,
229
+ out);
230
+ return;
231
+ }
232
+ #endif
275
233
  #endif
276
234
 
277
- #if defined(BLAKE3_USE_NEON)
235
+ #if BLAKE3_USE_NEON == 1
278
236
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
279
237
  increment_counter, flags, flags_start, flags_end, out);
280
238
  return;
@@ -286,11 +244,12 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
286
244
  }
287
245
 
288
246
  // The dynamically detected SIMD degree of the current platform.
289
- size_t blake3_simd_degree() {
247
+ size_t blake3_simd_degree(void) {
290
248
  #if defined(IS_X86)
291
249
  const enum cpu_feature features = get_cpu_features();
250
+ MAYBE_UNUSED(features);
292
251
  #if !defined(BLAKE3_NO_AVX512)
293
- if (features & AVX512F) {
252
+ if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
294
253
  return 16;
295
254
  }
296
255
  #endif
@@ -304,8 +263,13 @@ size_t blake3_simd_degree() {
304
263
  return 4;
305
264
  }
306
265
  #endif
266
+ #if !defined(BLAKE3_NO_SSE2)
267
+ if (features & SSE2) {
268
+ return 4;
269
+ }
270
+ #endif
307
271
  #endif
308
- #if defined(BLAKE3_USE_NEON)
272
+ #if BLAKE3_USE_NEON == 1
309
273
  return 4;
310
274
  #endif
311
275
  return 1;
@@ -38,6 +38,10 @@ enum blake3_flags {
38
38
  #define IS_X86_32
39
39
  #endif
40
40
 
41
+ #if defined(__aarch64__) || defined(_M_ARM64)
42
+ #define IS_AARCH64
43
+ #endif
44
+
41
45
  #if defined(IS_X86)
42
46
  #if defined(_MSC_VER)
43
47
  #include <intrin.h>
@@ -45,9 +49,18 @@ enum blake3_flags {
45
49
  #include <immintrin.h>
46
50
  #endif
47
51
 
52
+ #if !defined(BLAKE3_USE_NEON)
53
+ // If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
54
+ #if defined(IS_AARCH64)
55
+ #define BLAKE3_USE_NEON 1
56
+ #else
57
+ #define BLAKE3_USE_NEON 0
58
+ #endif
59
+ #endif
60
+
48
61
  #if defined(IS_X86)
49
62
  #define MAX_SIMD_DEGREE 16
50
- #elif defined(BLAKE3_USE_NEON)
63
+ #elif BLAKE3_USE_NEON == 1
51
64
  #define MAX_SIMD_DEGREE 4
52
65
  #else
53
66
  #define MAX_SIMD_DEGREE 1
@@ -146,6 +159,25 @@ INLINE void load_key_words(const uint8_t key[BLAKE3_KEY_LEN],
146
159
  key_words[7] = load32(&key[7 * 4]);
147
160
  }
148
161
 
162
+ INLINE void store32(void *dst, uint32_t w) {
163
+ uint8_t *p = (uint8_t *)dst;
164
+ p[0] = (uint8_t)(w >> 0);
165
+ p[1] = (uint8_t)(w >> 8);
166
+ p[2] = (uint8_t)(w >> 16);
167
+ p[3] = (uint8_t)(w >> 24);
168
+ }
169
+
170
+ INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) {
171
+ store32(&bytes_out[0 * 4], cv_words[0]);
172
+ store32(&bytes_out[1 * 4], cv_words[1]);
173
+ store32(&bytes_out[2 * 4], cv_words[2]);
174
+ store32(&bytes_out[3 * 4], cv_words[3]);
175
+ store32(&bytes_out[4 * 4], cv_words[4]);
176
+ store32(&bytes_out[5 * 4], cv_words[5]);
177
+ store32(&bytes_out[6 * 4], cv_words[6]);
178
+ store32(&bytes_out[7 * 4], cv_words[7]);
179
+ }
180
+
149
181
  void blake3_compress_in_place(uint32_t cv[8],
150
182
  const uint8_t block[BLAKE3_BLOCK_LEN],
151
183
  uint8_t block_len, uint64_t counter,
@@ -161,7 +193,90 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
161
193
  bool increment_counter, uint8_t flags,
162
194
  uint8_t flags_start, uint8_t flags_end, uint8_t *out);
163
195
 
164
- size_t blake3_simd_degree();
196
+ size_t blake3_simd_degree(void);
197
+
198
+
199
+ // Declarations for implementation-specific functions.
200
+ void blake3_compress_in_place_portable(uint32_t cv[8],
201
+ const uint8_t block[BLAKE3_BLOCK_LEN],
202
+ uint8_t block_len, uint64_t counter,
203
+ uint8_t flags);
204
+
205
+ void blake3_compress_xof_portable(const uint32_t cv[8],
206
+ const uint8_t block[BLAKE3_BLOCK_LEN],
207
+ uint8_t block_len, uint64_t counter,
208
+ uint8_t flags, uint8_t out[64]);
209
+
210
+ void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
211
+ size_t blocks, const uint32_t key[8],
212
+ uint64_t counter, bool increment_counter,
213
+ uint8_t flags, uint8_t flags_start,
214
+ uint8_t flags_end, uint8_t *out);
215
+
216
+ #if defined(IS_X86)
217
+ #if !defined(BLAKE3_NO_SSE2)
218
+ void blake3_compress_in_place_sse2(uint32_t cv[8],
219
+ const uint8_t block[BLAKE3_BLOCK_LEN],
220
+ uint8_t block_len, uint64_t counter,
221
+ uint8_t flags);
222
+ void blake3_compress_xof_sse2(const uint32_t cv[8],
223
+ const uint8_t block[BLAKE3_BLOCK_LEN],
224
+ uint8_t block_len, uint64_t counter,
225
+ uint8_t flags, uint8_t out[64]);
226
+ void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs,
227
+ size_t blocks, const uint32_t key[8],
228
+ uint64_t counter, bool increment_counter,
229
+ uint8_t flags, uint8_t flags_start,
230
+ uint8_t flags_end, uint8_t *out);
231
+ #endif
232
+ #if !defined(BLAKE3_NO_SSE41)
233
+ void blake3_compress_in_place_sse41(uint32_t cv[8],
234
+ const uint8_t block[BLAKE3_BLOCK_LEN],
235
+ uint8_t block_len, uint64_t counter,
236
+ uint8_t flags);
237
+ void blake3_compress_xof_sse41(const uint32_t cv[8],
238
+ const uint8_t block[BLAKE3_BLOCK_LEN],
239
+ uint8_t block_len, uint64_t counter,
240
+ uint8_t flags, uint8_t out[64]);
241
+ void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
242
+ size_t blocks, const uint32_t key[8],
243
+ uint64_t counter, bool increment_counter,
244
+ uint8_t flags, uint8_t flags_start,
245
+ uint8_t flags_end, uint8_t *out);
246
+ #endif
247
+ #if !defined(BLAKE3_NO_AVX2)
248
+ void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
249
+ size_t blocks, const uint32_t key[8],
250
+ uint64_t counter, bool increment_counter,
251
+ uint8_t flags, uint8_t flags_start,
252
+ uint8_t flags_end, uint8_t *out);
253
+ #endif
254
+ #if !defined(BLAKE3_NO_AVX512)
255
+ void blake3_compress_in_place_avx512(uint32_t cv[8],
256
+ const uint8_t block[BLAKE3_BLOCK_LEN],
257
+ uint8_t block_len, uint64_t counter,
258
+ uint8_t flags);
259
+
260
+ void blake3_compress_xof_avx512(const uint32_t cv[8],
261
+ const uint8_t block[BLAKE3_BLOCK_LEN],
262
+ uint8_t block_len, uint64_t counter,
263
+ uint8_t flags, uint8_t out[64]);
264
+
265
+ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
266
+ size_t blocks, const uint32_t key[8],
267
+ uint64_t counter, bool increment_counter,
268
+ uint8_t flags, uint8_t flags_start,
269
+ uint8_t flags_end, uint8_t *out);
270
+ #endif
271
+ #endif
272
+
273
+ #if BLAKE3_USE_NEON == 1
274
+ void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
275
+ size_t blocks, const uint32_t key[8],
276
+ uint64_t counter, bool increment_counter,
277
+ uint8_t flags, uint8_t flags_start,
278
+ uint8_t flags_end, uint8_t *out);
279
+ #endif
165
280
 
166
281
 
167
282
  #endif /* BLAKE3_IMPL_H */
@@ -2,7 +2,12 @@
2
2
 
3
3
  #include <arm_neon.h>
4
4
 
5
- // TODO: This is probably incorrect for big-endian ARM. How should that work?
5
+ #ifdef __ARM_BIG_ENDIAN
6
+ #error "This implementation only supports little-endian ARM."
7
+ // It might be that all we need for big-endian support here is to get the loads
8
+ // and stores right, but step zero would be finding a way to test it in CI.
9
+ #endif
10
+
6
11
  INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
7
12
  // vld1q_u32 has alignment requirements. Don't use it.
8
13
  uint32x4_t x;
@@ -1,14 +1,6 @@
1
1
  #include "blake3_impl.h"
2
2
  #include <string.h>
3
3
 
4
- INLINE void store32(void *dst, uint32_t w) {
5
- uint8_t *p = (uint8_t *)dst;
6
- p[0] = (uint8_t)(w >> 0);
7
- p[1] = (uint8_t)(w >> 8);
8
- p[2] = (uint8_t)(w >> 16);
9
- p[3] = (uint8_t)(w >> 24);
10
- }
11
-
12
4
  INLINE uint32_t rotr32(uint32_t w, uint32_t c) {
13
5
  return (w >> c) | (w << (32 - c));
14
6
  }
@@ -147,7 +139,7 @@ INLINE void hash_one_portable(const uint8_t *input, size_t blocks,
147
139
  blocks -= 1;
148
140
  block_flags = flags;
149
141
  }
150
- memcpy(out, cv, 32);
142
+ store_cv_words(out, cv);
151
143
  }
152
144
 
153
145
  void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,