digest-blake3 0.22.1 → 1.2.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cc9e904530fd556aa81345371cdcc84ba0f96af0f34ad88e7d5fca03ea334413
4
- data.tar.gz: e7eb9d2902ea6b314598a476e36ee3d2ec1253a063283cce93b6bc9dbc33ad15
3
+ metadata.gz: ec7a77d6875b688e1cb1fbe8470cbf67278f9fe3f2f8e516bafe7abc0bf54bc4
4
+ data.tar.gz: 74e13b2480eccd5c2fe3fa913a0962217c1f07c95b5db80b8303086488ee5d9f
5
5
  SHA512:
6
- metadata.gz: 0d1a215201ad7aae6cebca040db27f4336861582be4870d6d57adcf9b345fa3c44fd5dd7443b0417fc9fe407bf52d98d03eb7065d92f699091f6adfb10fa67c2
7
- data.tar.gz: 262293d252c00c9aaa00bcf7cbeb066a3be416a4b2314473740627906fbbec466a2b1cf199d1fdfa0aecfa1568bf40fe661ca2c22162243c2cacd49f43aed9dc
6
+ metadata.gz: de0fb7b5ccce755c313da8e547a430950d181170c64561746890ce8855ce5e09d3232b16316f36d22320ae5d23cf7904e8221a26358e96d9566ba247ef613214
7
+ data.tar.gz: 33e15e9469128ba227dbe6b57d9c44fe55078b9031975bf9db783a469c93342c7ccbf38b763ddfed7f09c941a42a6df89302cfda0e38b0ad4967a12acac4b18a
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- digest-blake3 (0.22.1)
4
+ digest-blake3 (1.2.0.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
data/README.md CHANGED
@@ -58,7 +58,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
58
58
 
59
59
  ## Contributing
60
60
 
61
- Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/digest-blake3.
61
+ Bug reports and pull requests are welcome on GitHub at https://github.com/willbryant/digest-blake3.
62
62
 
63
63
  ## License
64
64
 
@@ -5,6 +5,8 @@
5
5
  #include "blake3.h"
6
6
  #include "blake3_impl.h"
7
7
 
8
+ const char *blake3_version(void) { return BLAKE3_VERSION_STRING; }
9
+
8
10
  INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8],
9
11
  uint8_t flags) {
10
12
  memcpy(self->cv, key, BLAKE3_KEY_LEN);
@@ -81,26 +83,29 @@ INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) {
81
83
  memcpy(cv_words, self->input_cv, 32);
82
84
  blake3_compress_in_place(cv_words, self->block, self->block_len,
83
85
  self->counter, self->flags);
84
- memcpy(cv, cv_words, 32);
86
+ store_cv_words(cv, cv_words);
85
87
  }
86
88
 
87
- INLINE void output_root_bytes(const output_t *self, uint8_t *out,
89
+ INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out,
88
90
  size_t out_len) {
89
- uint64_t output_block_counter = 0;
91
+ uint64_t output_block_counter = seek / 64;
92
+ size_t offset_within_block = seek % 64;
90
93
  uint8_t wide_buf[64];
91
94
  while (out_len > 0) {
92
95
  blake3_compress_xof(self->input_cv, self->block, self->block_len,
93
96
  output_block_counter, self->flags | ROOT, wide_buf);
97
+ size_t available_bytes = 64 - offset_within_block;
94
98
  size_t memcpy_len;
95
- if (out_len > 64) {
96
- memcpy_len = 64;
99
+ if (out_len > available_bytes) {
100
+ memcpy_len = available_bytes;
97
101
  } else {
98
102
  memcpy_len = out_len;
99
103
  }
100
- memcpy(out, wide_buf, memcpy_len);
104
+ memcpy(out, wide_buf + offset_within_block, memcpy_len);
101
105
  out += memcpy_len;
102
106
  out_len -= memcpy_len;
103
107
  output_block_counter += 1;
108
+ offset_within_block = 0;
104
109
  }
105
110
  }
106
111
 
@@ -256,10 +261,11 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
256
261
  // Why not just have the caller split the input on the first update(), instead
257
262
  // of implementing this special rule? Because we don't want to limit SIMD or
258
263
  // multi-threading parallelism for that update().
259
- size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len,
260
- const uint32_t key[8],
261
- uint64_t chunk_counter, uint8_t flags,
262
- uint8_t *out) {
264
+ static size_t blake3_compress_subtree_wide(const uint8_t *input,
265
+ size_t input_len,
266
+ const uint32_t key[8],
267
+ uint64_t chunk_counter,
268
+ uint8_t flags, uint8_t *out) {
263
269
  // Note that the single chunk case does *not* bump the SIMD degree up to 2
264
270
  // when it is 1. If this implementation adds multi-threading in the future,
265
271
  // this gives us the option of multi-threading even the 2-chunk case, which
@@ -331,15 +337,21 @@ INLINE void compress_subtree_to_parent_node(
331
337
  assert(input_len > BLAKE3_CHUNK_LEN);
332
338
  #endif
333
339
 
334
- uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
340
+ uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
335
341
  size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
336
342
  chunk_counter, flags, cv_array);
343
+ assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
337
344
 
338
345
  // If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
339
346
  // compress_subtree_wide() returns more than 2 chaining values. Condense
340
347
  // them into 2 by forming parent nodes repeatedly.
341
348
  uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
342
- while (num_cvs > 2) {
349
+ // The second half of this loop condition is always true, and we just
350
+ // asserted it above. But GCC can't tell that it's always true, and if NDEBUG
351
+ // is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
352
+ // warnings here. GCC 8.5 is particularly sensitive, so if you're changing
353
+ // this code, test it against that version.
354
+ while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
343
355
  num_cvs =
344
356
  compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
345
357
  memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
@@ -363,10 +375,11 @@ void blake3_hasher_init_keyed(blake3_hasher *self,
363
375
  hasher_init_base(self, key_words, KEYED_HASH);
364
376
  }
365
377
 
366
- void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) {
378
+ void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
379
+ size_t context_len) {
367
380
  blake3_hasher context_hasher;
368
381
  hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT);
369
- blake3_hasher_update(&context_hasher, context, strlen(context));
382
+ blake3_hasher_update(&context_hasher, context, context_len);
370
383
  uint8_t context_key[BLAKE3_KEY_LEN];
371
384
  blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN);
372
385
  uint32_t context_key_words[8];
@@ -374,6 +387,10 @@ void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) {
374
387
  hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL);
375
388
  }
376
389
 
390
+ void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) {
391
+ blake3_hasher_init_derive_key_raw(self, context, strlen(context));
392
+ }
393
+
377
394
  // As described in hasher_push_cv() below, we do "lazy merging", delaying
378
395
  // merges until right before the next CV is about to be added. This is
379
396
  // different from the reference implementation. Another difference is that we
@@ -425,8 +442,8 @@ INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) {
425
442
  // compress_subtree_to_parent_node(). That function always returns the top
426
443
  // *two* chaining values of the subtree it's compressing. We then do lazy
427
444
  // merging with each of them separately, so that the second CV will always
428
- // remain unmerged. (The compress_subtree_to_parent_node also helps us support
429
- // extendable output when we're hashing an input all-at-once.)
445
+ // remain unmerged. (That also helps us support extendable output when we're
446
+ // hashing an input all-at-once.)
430
447
  INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN],
431
448
  uint64_t chunk_counter) {
432
449
  hasher_merge_cv_stack(self, chunk_counter);
@@ -472,8 +489,8 @@ void blake3_hasher_update(blake3_hasher *self, const void *input,
472
489
 
473
490
  // Now the chunk_state is clear, and we have more input. If there's more than
474
491
  // a single chunk (so, definitely not the root chunk), hash the largest whole
475
- // subtree we can, with the full benefits of SIMD and multi-threading
476
- // parallelism. Two restrictions:
492
+ // subtree we can, with the full benefits of SIMD (and maybe in the future,
493
+ // multi-threading) parallelism. Two restrictions:
477
494
  // - The subtree has to be a power-of-2 number of chunks. Only subtrees along
478
495
  // the right edge can be incomplete, and we don't know where the right edge
479
496
  // is going to be until we get to finalize().
@@ -546,6 +563,11 @@ void blake3_hasher_update(blake3_hasher *self, const void *input,
546
563
 
547
564
  void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
548
565
  size_t out_len) {
566
+ blake3_hasher_finalize_seek(self, 0, out, out_len);
567
+ }
568
+
569
+ void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
570
+ uint8_t *out, size_t out_len) {
549
571
  // Explicitly checking for zero avoids causing UB by passing a null pointer
550
572
  // to memcpy. This comes up in practice with things like:
551
573
  // std::vector<uint8_t> v;
@@ -557,7 +579,7 @@ void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
557
579
  // If the subtree stack is empty, then the current chunk is the root.
558
580
  if (self->cv_stack_len == 0) {
559
581
  output_t output = chunk_state_output(&self->chunk);
560
- output_root_bytes(&output, out, out_len);
582
+ output_root_bytes(&output, seek, out, out_len);
561
583
  return;
562
584
  }
563
585
  // If there are any bytes in the chunk state, finalize that chunk and do a
@@ -585,5 +607,5 @@ void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
585
607
  output_chaining_value(&output, &parent_block[32]);
586
608
  output = parent_output(parent_block, self->key, self->chunk.flags);
587
609
  }
588
- output_root_bytes(&output, out, out_len);
610
+ output_root_bytes(&output, seek, out, out_len);
589
611
  }
@@ -4,16 +4,16 @@
4
4
  #include <stddef.h>
5
5
  #include <stdint.h>
6
6
 
7
- #ifdef __cplusplus
7
+ #ifdef __cplusplus
8
8
  extern "C" {
9
9
  #endif
10
10
 
11
+ #define BLAKE3_VERSION_STRING "1.2.0"
11
12
  #define BLAKE3_KEY_LEN 32
12
13
  #define BLAKE3_OUT_LEN 32
13
14
  #define BLAKE3_BLOCK_LEN 64
14
15
  #define BLAKE3_CHUNK_LEN 1024
15
16
  #define BLAKE3_MAX_DEPTH 54
16
- #define BLAKE3_MAX_SIMD_DEGREE 16
17
17
 
18
18
  // This struct is a private implementation detail. It has to be here because
19
19
  // it's part of blake3_hasher below.
@@ -38,16 +38,21 @@ typedef struct {
38
38
  uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
39
39
  } blake3_hasher;
40
40
 
41
+ const char *blake3_version(void);
41
42
  void blake3_hasher_init(blake3_hasher *self);
42
43
  void blake3_hasher_init_keyed(blake3_hasher *self,
43
44
  const uint8_t key[BLAKE3_KEY_LEN]);
44
45
  void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
46
+ void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
47
+ size_t context_len);
45
48
  void blake3_hasher_update(blake3_hasher *self, const void *input,
46
49
  size_t input_len);
47
50
  void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
48
51
  size_t out_len);
52
+ void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
53
+ uint8_t *out, size_t out_len);
49
54
 
50
- #ifdef __cplusplus
55
+ #ifdef __cplusplus
51
56
  }
52
57
  #endif
53
58
 
@@ -1,3 +1,17 @@
1
+ #if defined(__ELF__) && defined(__linux__)
2
+ .section .note.GNU-stack,"",%progbits
3
+ #endif
4
+
5
+ #if defined(__ELF__) && defined(__CET__) && defined(__has_include)
6
+ #if __has_include(<cet.h>)
7
+ #include <cet.h>
8
+ #endif
9
+ #endif
10
+
11
+ #if !defined(_CET_ENDBR)
12
+ #define _CET_ENDBR
13
+ #endif
14
+
1
15
  .intel_syntax noprefix
2
16
  .global _blake3_hash_many_avx2
3
17
  .global blake3_hash_many_avx2
@@ -9,6 +23,7 @@
9
23
  .p2align 6
10
24
  _blake3_hash_many_avx2:
11
25
  blake3_hash_many_avx2:
26
+ _CET_ENDBR
12
27
  push r15
13
28
  push r14
14
29
  push r13
@@ -1,5 +1,18 @@
1
- .intel_syntax noprefix
1
+ #if defined(__ELF__) && defined(__linux__)
2
+ .section .note.GNU-stack,"",%progbits
3
+ #endif
4
+
5
+ #if defined(__ELF__) && defined(__CET__) && defined(__has_include)
6
+ #if __has_include(<cet.h>)
7
+ #include <cet.h>
8
+ #endif
9
+ #endif
10
+
11
+ #if !defined(_CET_ENDBR)
12
+ #define _CET_ENDBR
13
+ #endif
2
14
 
15
+ .intel_syntax noprefix
3
16
  .global _blake3_hash_many_avx512
4
17
  .global blake3_hash_many_avx512
5
18
  .global blake3_compress_in_place_avx512
@@ -15,6 +28,7 @@
15
28
  .p2align 6
16
29
  _blake3_hash_many_avx512:
17
30
  blake3_hash_many_avx512:
31
+ _CET_ENDBR
18
32
  push r15
19
33
  push r14
20
34
  push r13
@@ -82,15 +96,15 @@ blake3_hash_many_avx512:
82
96
  mov r14, qword ptr [rdi+0x50]
83
97
  mov r15, qword ptr [rdi+0x58]
84
98
  vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
85
- vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
99
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
86
100
  vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
87
- vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
101
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
88
102
  vpunpcklqdq zmm8, zmm16, zmm17
89
103
  vpunpckhqdq zmm9, zmm16, zmm17
90
104
  vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
91
- vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
105
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
92
106
  vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
93
- vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
107
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
94
108
  vpunpcklqdq zmm10, zmm18, zmm19
95
109
  vpunpckhqdq zmm11, zmm18, zmm19
96
110
  mov r8, qword ptr [rdi+0x20]
@@ -102,15 +116,15 @@ blake3_hash_many_avx512:
102
116
  mov r14, qword ptr [rdi+0x70]
103
117
  mov r15, qword ptr [rdi+0x78]
104
118
  vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
105
- vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
119
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
106
120
  vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
107
- vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
121
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
108
122
  vpunpcklqdq zmm12, zmm16, zmm17
109
123
  vpunpckhqdq zmm13, zmm16, zmm17
110
124
  vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
111
- vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
125
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
112
126
  vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
113
- vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
127
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
114
128
  vpunpcklqdq zmm14, zmm18, zmm19
115
129
  vpunpckhqdq zmm15, zmm18, zmm19
116
130
  vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
@@ -144,15 +158,15 @@ blake3_hash_many_avx512:
144
158
  mov r14, qword ptr [rdi+0x50]
145
159
  mov r15, qword ptr [rdi+0x58]
146
160
  vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
147
- vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
161
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
148
162
  vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
149
- vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
163
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
150
164
  vpunpcklqdq zmm8, zmm24, zmm25
151
165
  vpunpckhqdq zmm9, zmm24, zmm25
152
166
  vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
153
- vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
167
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
154
168
  vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
155
- vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
169
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
156
170
  vpunpcklqdq zmm10, zmm24, zmm25
157
171
  vpunpckhqdq zmm11, zmm24, zmm25
158
172
  prefetcht0 [r8+rdx+0x80]
@@ -172,15 +186,15 @@ blake3_hash_many_avx512:
172
186
  mov r14, qword ptr [rdi+0x70]
173
187
  mov r15, qword ptr [rdi+0x78]
174
188
  vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
175
- vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
189
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
176
190
  vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
177
- vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
191
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
178
192
  vpunpcklqdq zmm12, zmm24, zmm25
179
193
  vpunpckhqdq zmm13, zmm24, zmm25
180
194
  vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
181
- vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
195
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
182
196
  vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
183
- vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
197
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
184
198
  vpunpcklqdq zmm14, zmm24, zmm25
185
199
  vpunpckhqdq zmm15, zmm24, zmm25
186
200
  prefetcht0 [r8+rdx+0x80]
@@ -2039,7 +2053,7 @@ blake3_hash_many_avx512:
2039
2053
  vpermq ymm14, ymm14, 0xDC
2040
2054
  vpermq ymm15, ymm15, 0xDC
2041
2055
  vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
2042
- vinserti32x8 zmm13, zmm14, ymm15, 0x01
2056
+ vinserti64x4 zmm13, zmm14, ymm15, 0x01
2043
2057
  mov eax, 17476
2044
2058
  kmovw k2, eax
2045
2059
  vpblendmd zmm13 {k2}, zmm13, zmm12
@@ -2372,6 +2386,7 @@ blake3_hash_many_avx512:
2372
2386
  .p2align 6
2373
2387
  _blake3_compress_in_place_avx512:
2374
2388
  blake3_compress_in_place_avx512:
2389
+ _CET_ENDBR
2375
2390
  vmovdqu xmm0, xmmword ptr [rdi]
2376
2391
  vmovdqu xmm1, xmmword ptr [rdi+0x10]
2377
2392
  movzx eax, r8b
@@ -2454,6 +2469,7 @@ blake3_compress_in_place_avx512:
2454
2469
  .p2align 6
2455
2470
  _blake3_compress_xof_avx512:
2456
2471
  blake3_compress_xof_avx512:
2472
+ _CET_ENDBR
2457
2473
  vmovdqu xmm0, xmmword ptr [rdi]
2458
2474
  vmovdqu xmm1, xmmword ptr [rdi+0x10]
2459
2475
  movzx eax, r8b
@@ -96,15 +96,15 @@ blake3_hash_many_avx512:
96
96
  mov r14, qword ptr [rdi+0x50]
97
97
  mov r15, qword ptr [rdi+0x58]
98
98
  vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
99
- vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
99
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
100
100
  vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
101
- vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
101
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
102
102
  vpunpcklqdq zmm8, zmm16, zmm17
103
103
  vpunpckhqdq zmm9, zmm16, zmm17
104
104
  vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
105
- vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
105
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
106
106
  vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
107
- vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
107
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
108
108
  vpunpcklqdq zmm10, zmm18, zmm19
109
109
  vpunpckhqdq zmm11, zmm18, zmm19
110
110
  mov r8, qword ptr [rdi+0x20]
@@ -116,15 +116,15 @@ blake3_hash_many_avx512:
116
116
  mov r14, qword ptr [rdi+0x70]
117
117
  mov r15, qword ptr [rdi+0x78]
118
118
  vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
119
- vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
119
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
120
120
  vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
121
- vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
121
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
122
122
  vpunpcklqdq zmm12, zmm16, zmm17
123
123
  vpunpckhqdq zmm13, zmm16, zmm17
124
124
  vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
125
- vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
125
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
126
126
  vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
127
- vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
127
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
128
128
  vpunpcklqdq zmm14, zmm18, zmm19
129
129
  vpunpckhqdq zmm15, zmm18, zmm19
130
130
  vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
@@ -158,15 +158,15 @@ blake3_hash_many_avx512:
158
158
  mov r14, qword ptr [rdi+0x50]
159
159
  mov r15, qword ptr [rdi+0x58]
160
160
  vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
161
- vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
161
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
162
162
  vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
163
- vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
163
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
164
164
  vpunpcklqdq zmm8, zmm24, zmm25
165
165
  vpunpckhqdq zmm9, zmm24, zmm25
166
166
  vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
167
- vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
167
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
168
168
  vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
169
- vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
169
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
170
170
  vpunpcklqdq zmm10, zmm24, zmm25
171
171
  vpunpckhqdq zmm11, zmm24, zmm25
172
172
  prefetcht0 [r8+rdx+0x80]
@@ -186,15 +186,15 @@ blake3_hash_many_avx512:
186
186
  mov r14, qword ptr [rdi+0x70]
187
187
  mov r15, qword ptr [rdi+0x78]
188
188
  vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
189
- vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
189
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
190
190
  vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
191
- vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
191
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
192
192
  vpunpcklqdq zmm12, zmm24, zmm25
193
193
  vpunpckhqdq zmm13, zmm24, zmm25
194
194
  vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
195
- vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
195
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
196
196
  vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
197
- vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
197
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
198
198
  vpunpcklqdq zmm14, zmm24, zmm25
199
199
  vpunpckhqdq zmm15, zmm24, zmm25
200
200
  prefetcht0 [r8+rdx+0x80]
@@ -2065,7 +2065,7 @@ blake3_hash_many_avx512:
2065
2065
  vpermq ymm14, ymm14, 0xDC
2066
2066
  vpermq ymm15, ymm15, 0xDC
2067
2067
  vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
2068
- vinserti32x8 zmm13, zmm14, ymm15, 0x01
2068
+ vinserti64x4 zmm13, zmm14, ymm15, 0x01
2069
2069
  mov eax, 17476
2070
2070
  kmovw k2, eax
2071
2071
  vpblendmd zmm13 {k2}, zmm13, zmm12
@@ -99,15 +99,15 @@ innerloop16:
99
99
  mov r14, qword ptr [rdi+50H]
100
100
  mov r15, qword ptr [rdi+58H]
101
101
  vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H]
102
- vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
102
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
103
103
  vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H]
104
- vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
104
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
105
105
  vpunpcklqdq zmm8, zmm16, zmm17
106
106
  vpunpckhqdq zmm9, zmm16, zmm17
107
107
  vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H]
108
- vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
108
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
109
109
  vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H]
110
- vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
110
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
111
111
  vpunpcklqdq zmm10, zmm18, zmm19
112
112
  vpunpckhqdq zmm11, zmm18, zmm19
113
113
  mov r8, qword ptr [rdi+20H]
@@ -119,15 +119,15 @@ innerloop16:
119
119
  mov r14, qword ptr [rdi+70H]
120
120
  mov r15, qword ptr [rdi+78H]
121
121
  vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H]
122
- vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
122
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
123
123
  vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H]
124
- vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
124
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
125
125
  vpunpcklqdq zmm12, zmm16, zmm17
126
126
  vpunpckhqdq zmm13, zmm16, zmm17
127
127
  vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H]
128
- vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
128
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
129
129
  vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H]
130
- vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
130
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
131
131
  vpunpcklqdq zmm14, zmm18, zmm19
132
132
  vpunpckhqdq zmm15, zmm18, zmm19
133
133
  vmovdqa32 zmm27, zmmword ptr [INDEX0]
@@ -161,15 +161,15 @@ innerloop16:
161
161
  mov r14, qword ptr [rdi+50H]
162
162
  mov r15, qword ptr [rdi+58H]
163
163
  vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H]
164
- vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
164
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
165
165
  vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H]
166
- vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
166
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
167
167
  vpunpcklqdq zmm8, zmm24, zmm25
168
168
  vpunpckhqdq zmm9, zmm24, zmm25
169
169
  vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H]
170
- vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
170
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
171
171
  vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H]
172
- vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
172
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
173
173
  vpunpcklqdq zmm10, zmm24, zmm25
174
174
  vpunpckhqdq zmm11, zmm24, zmm25
175
175
  prefetcht0 byte ptr [r8+rdx+80H]
@@ -189,15 +189,15 @@ innerloop16:
189
189
  mov r14, qword ptr [rdi+70H]
190
190
  mov r15, qword ptr [rdi+78H]
191
191
  vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H]
192
- vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
192
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
193
193
  vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H]
194
- vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
194
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
195
195
  vpunpcklqdq zmm12, zmm24, zmm25
196
196
  vpunpckhqdq zmm13, zmm24, zmm25
197
197
  vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H]
198
- vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
198
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
199
199
  vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H]
200
- vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
200
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
201
201
  vpunpcklqdq zmm14, zmm24, zmm25
202
202
  vpunpckhqdq zmm15, zmm24, zmm25
203
203
  prefetcht0 byte ptr [r8+rdx+80H]
@@ -2073,7 +2073,7 @@ final7blocks:
2073
2073
  vpermq ymm14, ymm14, 0DCH
2074
2074
  vpermq ymm15, ymm15, 0DCH
2075
2075
  vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN]
2076
- vinserti32x8 zmm13, zmm14, ymm15, 01H
2076
+ vinserti64x4 zmm13, zmm14, ymm15, 01H
2077
2077
  mov eax, 17476
2078
2078
  kmovw k2, eax
2079
2079
  vpblendmd zmm13 {k2}, zmm13, zmm12