digest-blake3 0.0.1 → 0.37.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dd017f37b8a918a01192a1bc37736f0e3c5f4304a168e9dc86c0b248e9df938a
4
- data.tar.gz: 99e2195b8415c0165e8bda5118c32b6136cb92b7fc0372b5539edbac045b4792
3
+ metadata.gz: 4dc981436633bde6ba4fb278252d8a4a1ba58d039d0b1c8c794e36c4e47fa4a0
4
+ data.tar.gz: ae40be72a0252730792f3e82a00da765546c9606d91405ff69d3bad078ad307f
5
5
  SHA512:
6
- metadata.gz: dea6e7b5ca985d34bd630c75a432ebee442908079bd5099cd667e83da4db30bb5169c07cf216e7c1e84c8d4fe5124e8ace7960455d179aa7c2450b943f284b27
7
- data.tar.gz: bcfa94c0b78ce490aa0149412dd56f660bd6577469c13618f46ae8132d9a12c7cb2c7afb8329db8810adf6455c1e1554b51c918d94e14fe1a4fa6065ca37b09a
6
+ metadata.gz: c18ca69b1f4b47ac8308ee00cc6db861eb48bd3921a85c291cb09ea595534b1a476988453c1931cb9982e9f96e7d14e4fa4356cbecaa9c584252c9b7ad30ac62
7
+ data.tar.gz: b9cf5f04daf5d83a797191caa2f2c30e068ddd6b771d887acb96963b93550171e77ea81128fffc316b49641391a4f245a8484b8b909d53502be3a2fb3170ad76
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- digest-blake3 (0.0.1)
4
+ digest-blake3 (0.37.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
data/README.md CHANGED
@@ -58,7 +58,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
58
58
 
59
59
  ## Contributing
60
60
 
61
- Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/digest-blake3.
61
+ Bug reports and pull requests are welcome on GitHub at https://github.com/willbryant/digest-blake3.
62
62
 
63
63
  ## License
64
64
 
@@ -81,26 +81,29 @@ INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) {
81
81
  memcpy(cv_words, self->input_cv, 32);
82
82
  blake3_compress_in_place(cv_words, self->block, self->block_len,
83
83
  self->counter, self->flags);
84
- memcpy(cv, cv_words, 32);
84
+ store_cv_words(cv, cv_words);
85
85
  }
86
86
 
87
- INLINE void output_root_bytes(const output_t *self, uint8_t *out,
87
+ INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out,
88
88
  size_t out_len) {
89
- uint64_t output_block_counter = 0;
89
+ uint64_t output_block_counter = seek / 64;
90
+ size_t offset_within_block = seek % 64;
90
91
  uint8_t wide_buf[64];
91
92
  while (out_len > 0) {
92
93
  blake3_compress_xof(self->input_cv, self->block, self->block_len,
93
94
  output_block_counter, self->flags | ROOT, wide_buf);
95
+ size_t available_bytes = 64 - offset_within_block;
94
96
  size_t memcpy_len;
95
- if (out_len > 64) {
96
- memcpy_len = 64;
97
+ if (out_len > available_bytes) {
98
+ memcpy_len = available_bytes;
97
99
  } else {
98
100
  memcpy_len = out_len;
99
101
  }
100
- memcpy(out, wide_buf, memcpy_len);
102
+ memcpy(out, wide_buf + offset_within_block, memcpy_len);
101
103
  out += memcpy_len;
102
104
  out_len -= memcpy_len;
103
105
  output_block_counter += 1;
106
+ offset_within_block = 0;
104
107
  }
105
108
  }
106
109
 
@@ -256,10 +259,11 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
256
259
  // Why not just have the caller split the input on the first update(), instead
257
260
  // of implementing this special rule? Because we don't want to limit SIMD or
258
261
  // multi-threading parallelism for that update().
259
- size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len,
260
- const uint32_t key[8],
261
- uint64_t chunk_counter, uint8_t flags,
262
- uint8_t *out) {
262
+ static size_t blake3_compress_subtree_wide(const uint8_t *input,
263
+ size_t input_len,
264
+ const uint32_t key[8],
265
+ uint64_t chunk_counter,
266
+ uint8_t flags, uint8_t *out) {
263
267
  // Note that the single chunk case does *not* bump the SIMD degree up to 2
264
268
  // when it is 1. If this implementation adds multi-threading in the future,
265
269
  // this gives us the option of multi-threading even the 2-chunk case, which
@@ -331,7 +335,7 @@ INLINE void compress_subtree_to_parent_node(
331
335
  assert(input_len > BLAKE3_CHUNK_LEN);
332
336
  #endif
333
337
 
334
- uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
338
+ uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
335
339
  size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
336
340
  chunk_counter, flags, cv_array);
337
341
 
@@ -363,10 +367,11 @@ void blake3_hasher_init_keyed(blake3_hasher *self,
363
367
  hasher_init_base(self, key_words, KEYED_HASH);
364
368
  }
365
369
 
366
- void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) {
370
+ void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
371
+ size_t context_len) {
367
372
  blake3_hasher context_hasher;
368
373
  hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT);
369
- blake3_hasher_update(&context_hasher, context, strlen(context));
374
+ blake3_hasher_update(&context_hasher, context, context_len);
370
375
  uint8_t context_key[BLAKE3_KEY_LEN];
371
376
  blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN);
372
377
  uint32_t context_key_words[8];
@@ -374,6 +379,10 @@ void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) {
374
379
  hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL);
375
380
  }
376
381
 
382
+ void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) {
383
+ blake3_hasher_init_derive_key_raw(self, context, strlen(context));
384
+ }
385
+
377
386
  // As described in hasher_push_cv() below, we do "lazy merging", delaying
378
387
  // merges until right before the next CV is about to be added. This is
379
388
  // different from the reference implementation. Another difference is that we
@@ -425,8 +434,8 @@ INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) {
425
434
  // compress_subtree_to_parent_node(). That function always returns the top
426
435
  // *two* chaining values of the subtree it's compressing. We then do lazy
427
436
  // merging with each of them separately, so that the second CV will always
428
- // remain unmerged. (The compress_subtree_to_parent_node also helps us support
429
- // extendable output when we're hashing an input all-at-once.)
437
+ // remain unmerged. (That also helps us support extendable output when we're
438
+ // hashing an input all-at-once.)
430
439
  INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN],
431
440
  uint64_t chunk_counter) {
432
441
  hasher_merge_cv_stack(self, chunk_counter);
@@ -472,8 +481,8 @@ void blake3_hasher_update(blake3_hasher *self, const void *input,
472
481
 
473
482
  // Now the chunk_state is clear, and we have more input. If there's more than
474
483
  // a single chunk (so, definitely not the root chunk), hash the largest whole
475
- // subtree we can, with the full benefits of SIMD and multi-threading
476
- // parallelism. Two restrictions:
484
+ // subtree we can, with the full benefits of SIMD (and maybe in the future,
485
+ // multi-threading) parallelism. Two restrictions:
477
486
  // - The subtree has to be a power-of-2 number of chunks. Only subtrees along
478
487
  // the right edge can be incomplete, and we don't know where the right edge
479
488
  // is going to be until we get to finalize().
@@ -546,6 +555,11 @@ void blake3_hasher_update(blake3_hasher *self, const void *input,
546
555
 
547
556
  void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
548
557
  size_t out_len) {
558
+ blake3_hasher_finalize_seek(self, 0, out, out_len);
559
+ }
560
+
561
+ void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
562
+ uint8_t *out, size_t out_len) {
549
563
  // Explicitly checking for zero avoids causing UB by passing a null pointer
550
564
  // to memcpy. This comes up in practice with things like:
551
565
  // std::vector<uint8_t> v;
@@ -557,7 +571,7 @@ void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
557
571
  // If the subtree stack is empty, then the current chunk is the root.
558
572
  if (self->cv_stack_len == 0) {
559
573
  output_t output = chunk_state_output(&self->chunk);
560
- output_root_bytes(&output, out, out_len);
574
+ output_root_bytes(&output, seek, out, out_len);
561
575
  return;
562
576
  }
563
577
  // If there are any bytes in the chunk state, finalize that chunk and do a
@@ -585,5 +599,5 @@ void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
585
599
  output_chaining_value(&output, &parent_block[32]);
586
600
  output = parent_output(parent_block, self->key, self->chunk.flags);
587
601
  }
588
- output_root_bytes(&output, out, out_len);
602
+ output_root_bytes(&output, seek, out, out_len);
589
603
  }
@@ -4,7 +4,7 @@
4
4
  #include <stddef.h>
5
5
  #include <stdint.h>
6
6
 
7
- #ifdef __cplusplus
7
+ #ifdef __cplusplus
8
8
  extern "C" {
9
9
  #endif
10
10
 
@@ -42,12 +42,16 @@ void blake3_hasher_init(blake3_hasher *self);
42
42
  void blake3_hasher_init_keyed(blake3_hasher *self,
43
43
  const uint8_t key[BLAKE3_KEY_LEN]);
44
44
  void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
45
+ void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
46
+ size_t context_len);
45
47
  void blake3_hasher_update(blake3_hasher *self, const void *input,
46
48
  size_t input_len);
47
49
  void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
48
50
  size_t out_len);
51
+ void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
52
+ uint8_t *out, size_t out_len);
49
53
 
50
- #ifdef __cplusplus
54
+ #ifdef __cplusplus
51
55
  }
52
56
  #endif
53
57
 
@@ -1,3 +1,17 @@
1
+ #if defined(__ELF__) && defined(__linux__)
2
+ .section .note.GNU-stack,"",%progbits
3
+ #endif
4
+
5
+ #if defined(__ELF__) && defined(__CET__) && defined(__has_include)
6
+ #if __has_include(<cet.h>)
7
+ #include <cet.h>
8
+ #endif
9
+ #endif
10
+
11
+ #if !defined(_CET_ENDBR)
12
+ #define _CET_ENDBR
13
+ #endif
14
+
1
15
  .intel_syntax noprefix
2
16
  .global _blake3_hash_many_avx2
3
17
  .global blake3_hash_many_avx2
@@ -9,6 +23,7 @@
9
23
  .p2align 6
10
24
  _blake3_hash_many_avx2:
11
25
  blake3_hash_many_avx2:
26
+ _CET_ENDBR
12
27
  push r15
13
28
  push r14
14
29
  push r13
@@ -1,5 +1,18 @@
1
- .intel_syntax noprefix
1
+ #if defined(__ELF__) && defined(__linux__)
2
+ .section .note.GNU-stack,"",%progbits
3
+ #endif
4
+
5
+ #if defined(__ELF__) && defined(__CET__) && defined(__has_include)
6
+ #if __has_include(<cet.h>)
7
+ #include <cet.h>
8
+ #endif
9
+ #endif
10
+
11
+ #if !defined(_CET_ENDBR)
12
+ #define _CET_ENDBR
13
+ #endif
2
14
 
15
+ .intel_syntax noprefix
3
16
  .global _blake3_hash_many_avx512
4
17
  .global blake3_hash_many_avx512
5
18
  .global blake3_compress_in_place_avx512
@@ -15,6 +28,7 @@
15
28
  .p2align 6
16
29
  _blake3_hash_many_avx512:
17
30
  blake3_hash_many_avx512:
31
+ _CET_ENDBR
18
32
  push r15
19
33
  push r14
20
34
  push r13
@@ -82,15 +96,15 @@ blake3_hash_many_avx512:
82
96
  mov r14, qword ptr [rdi+0x50]
83
97
  mov r15, qword ptr [rdi+0x58]
84
98
  vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
85
- vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
99
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
86
100
  vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
87
- vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
101
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
88
102
  vpunpcklqdq zmm8, zmm16, zmm17
89
103
  vpunpckhqdq zmm9, zmm16, zmm17
90
104
  vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
91
- vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
105
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
92
106
  vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
93
- vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
107
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
94
108
  vpunpcklqdq zmm10, zmm18, zmm19
95
109
  vpunpckhqdq zmm11, zmm18, zmm19
96
110
  mov r8, qword ptr [rdi+0x20]
@@ -102,15 +116,15 @@ blake3_hash_many_avx512:
102
116
  mov r14, qword ptr [rdi+0x70]
103
117
  mov r15, qword ptr [rdi+0x78]
104
118
  vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
105
- vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
119
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
106
120
  vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
107
- vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
121
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
108
122
  vpunpcklqdq zmm12, zmm16, zmm17
109
123
  vpunpckhqdq zmm13, zmm16, zmm17
110
124
  vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
111
- vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
125
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
112
126
  vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
113
- vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
127
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
114
128
  vpunpcklqdq zmm14, zmm18, zmm19
115
129
  vpunpckhqdq zmm15, zmm18, zmm19
116
130
  vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
@@ -144,15 +158,15 @@ blake3_hash_many_avx512:
144
158
  mov r14, qword ptr [rdi+0x50]
145
159
  mov r15, qword ptr [rdi+0x58]
146
160
  vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
147
- vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
161
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
148
162
  vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
149
- vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
163
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
150
164
  vpunpcklqdq zmm8, zmm24, zmm25
151
165
  vpunpckhqdq zmm9, zmm24, zmm25
152
166
  vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
153
- vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
167
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
154
168
  vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
155
- vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
169
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
156
170
  vpunpcklqdq zmm10, zmm24, zmm25
157
171
  vpunpckhqdq zmm11, zmm24, zmm25
158
172
  prefetcht0 [r8+rdx+0x80]
@@ -172,15 +186,15 @@ blake3_hash_many_avx512:
172
186
  mov r14, qword ptr [rdi+0x70]
173
187
  mov r15, qword ptr [rdi+0x78]
174
188
  vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
175
- vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
189
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
176
190
  vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
177
- vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
191
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
178
192
  vpunpcklqdq zmm12, zmm24, zmm25
179
193
  vpunpckhqdq zmm13, zmm24, zmm25
180
194
  vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
181
- vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
195
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
182
196
  vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
183
- vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
197
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
184
198
  vpunpcklqdq zmm14, zmm24, zmm25
185
199
  vpunpckhqdq zmm15, zmm24, zmm25
186
200
  prefetcht0 [r8+rdx+0x80]
@@ -2039,7 +2053,7 @@ blake3_hash_many_avx512:
2039
2053
  vpermq ymm14, ymm14, 0xDC
2040
2054
  vpermq ymm15, ymm15, 0xDC
2041
2055
  vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
2042
- vinserti32x8 zmm13, zmm14, ymm15, 0x01
2056
+ vinserti64x4 zmm13, zmm14, ymm15, 0x01
2043
2057
  mov eax, 17476
2044
2058
  kmovw k2, eax
2045
2059
  vpblendmd zmm13 {k2}, zmm13, zmm12
@@ -2372,6 +2386,7 @@ blake3_hash_many_avx512:
2372
2386
  .p2align 6
2373
2387
  _blake3_compress_in_place_avx512:
2374
2388
  blake3_compress_in_place_avx512:
2389
+ _CET_ENDBR
2375
2390
  vmovdqu xmm0, xmmword ptr [rdi]
2376
2391
  vmovdqu xmm1, xmmword ptr [rdi+0x10]
2377
2392
  movzx eax, r8b
@@ -2454,6 +2469,7 @@ blake3_compress_in_place_avx512:
2454
2469
  .p2align 6
2455
2470
  _blake3_compress_xof_avx512:
2456
2471
  blake3_compress_xof_avx512:
2472
+ _CET_ENDBR
2457
2473
  vmovdqu xmm0, xmmword ptr [rdi]
2458
2474
  vmovdqu xmm1, xmmword ptr [rdi+0x10]
2459
2475
  movzx eax, r8b
@@ -96,15 +96,15 @@ blake3_hash_many_avx512:
96
96
  mov r14, qword ptr [rdi+0x50]
97
97
  mov r15, qword ptr [rdi+0x58]
98
98
  vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
99
- vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
99
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
100
100
  vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
101
- vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
101
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
102
102
  vpunpcklqdq zmm8, zmm16, zmm17
103
103
  vpunpckhqdq zmm9, zmm16, zmm17
104
104
  vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
105
- vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
105
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
106
106
  vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
107
- vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
107
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
108
108
  vpunpcklqdq zmm10, zmm18, zmm19
109
109
  vpunpckhqdq zmm11, zmm18, zmm19
110
110
  mov r8, qword ptr [rdi+0x20]
@@ -116,15 +116,15 @@ blake3_hash_many_avx512:
116
116
  mov r14, qword ptr [rdi+0x70]
117
117
  mov r15, qword ptr [rdi+0x78]
118
118
  vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
119
- vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
119
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
120
120
  vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
121
- vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
121
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
122
122
  vpunpcklqdq zmm12, zmm16, zmm17
123
123
  vpunpckhqdq zmm13, zmm16, zmm17
124
124
  vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
125
- vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
125
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
126
126
  vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
127
- vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
127
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
128
128
  vpunpcklqdq zmm14, zmm18, zmm19
129
129
  vpunpckhqdq zmm15, zmm18, zmm19
130
130
  vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
@@ -158,15 +158,15 @@ blake3_hash_many_avx512:
158
158
  mov r14, qword ptr [rdi+0x50]
159
159
  mov r15, qword ptr [rdi+0x58]
160
160
  vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
161
- vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
161
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
162
162
  vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
163
- vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
163
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
164
164
  vpunpcklqdq zmm8, zmm24, zmm25
165
165
  vpunpckhqdq zmm9, zmm24, zmm25
166
166
  vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
167
- vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
167
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
168
168
  vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
169
- vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
169
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
170
170
  vpunpcklqdq zmm10, zmm24, zmm25
171
171
  vpunpckhqdq zmm11, zmm24, zmm25
172
172
  prefetcht0 [r8+rdx+0x80]
@@ -186,15 +186,15 @@ blake3_hash_many_avx512:
186
186
  mov r14, qword ptr [rdi+0x70]
187
187
  mov r15, qword ptr [rdi+0x78]
188
188
  vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
189
- vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
189
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
190
190
  vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
191
- vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
191
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
192
192
  vpunpcklqdq zmm12, zmm24, zmm25
193
193
  vpunpckhqdq zmm13, zmm24, zmm25
194
194
  vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
195
- vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
195
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
196
196
  vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
197
- vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
197
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
198
198
  vpunpcklqdq zmm14, zmm24, zmm25
199
199
  vpunpckhqdq zmm15, zmm24, zmm25
200
200
  prefetcht0 [r8+rdx+0x80]
@@ -2065,7 +2065,7 @@ blake3_hash_many_avx512:
2065
2065
  vpermq ymm14, ymm14, 0xDC
2066
2066
  vpermq ymm15, ymm15, 0xDC
2067
2067
  vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
2068
- vinserti32x8 zmm13, zmm14, ymm15, 0x01
2068
+ vinserti64x4 zmm13, zmm14, ymm15, 0x01
2069
2069
  mov eax, 17476
2070
2070
  kmovw k2, eax
2071
2071
  vpblendmd zmm13 {k2}, zmm13, zmm12
@@ -99,15 +99,15 @@ innerloop16:
99
99
  mov r14, qword ptr [rdi+50H]
100
100
  mov r15, qword ptr [rdi+58H]
101
101
  vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H]
102
- vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
102
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
103
103
  vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H]
104
- vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
104
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
105
105
  vpunpcklqdq zmm8, zmm16, zmm17
106
106
  vpunpckhqdq zmm9, zmm16, zmm17
107
107
  vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H]
108
- vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
108
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
109
109
  vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H]
110
- vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
110
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
111
111
  vpunpcklqdq zmm10, zmm18, zmm19
112
112
  vpunpckhqdq zmm11, zmm18, zmm19
113
113
  mov r8, qword ptr [rdi+20H]
@@ -119,15 +119,15 @@ innerloop16:
119
119
  mov r14, qword ptr [rdi+70H]
120
120
  mov r15, qword ptr [rdi+78H]
121
121
  vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H]
122
- vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
122
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
123
123
  vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H]
124
- vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
124
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
125
125
  vpunpcklqdq zmm12, zmm16, zmm17
126
126
  vpunpckhqdq zmm13, zmm16, zmm17
127
127
  vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H]
128
- vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
128
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
129
129
  vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H]
130
- vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
130
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
131
131
  vpunpcklqdq zmm14, zmm18, zmm19
132
132
  vpunpckhqdq zmm15, zmm18, zmm19
133
133
  vmovdqa32 zmm27, zmmword ptr [INDEX0]
@@ -161,15 +161,15 @@ innerloop16:
161
161
  mov r14, qword ptr [rdi+50H]
162
162
  mov r15, qword ptr [rdi+58H]
163
163
  vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H]
164
- vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
164
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
165
165
  vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H]
166
- vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
166
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
167
167
  vpunpcklqdq zmm8, zmm24, zmm25
168
168
  vpunpckhqdq zmm9, zmm24, zmm25
169
169
  vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H]
170
- vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
170
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
171
171
  vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H]
172
- vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
172
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
173
173
  vpunpcklqdq zmm10, zmm24, zmm25
174
174
  vpunpckhqdq zmm11, zmm24, zmm25
175
175
  prefetcht0 byte ptr [r8+rdx+80H]
@@ -189,15 +189,15 @@ innerloop16:
189
189
  mov r14, qword ptr [rdi+70H]
190
190
  mov r15, qword ptr [rdi+78H]
191
191
  vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H]
192
- vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
192
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
193
193
  vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H]
194
- vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
194
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
195
195
  vpunpcklqdq zmm12, zmm24, zmm25
196
196
  vpunpckhqdq zmm13, zmm24, zmm25
197
197
  vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H]
198
- vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
198
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
199
199
  vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H]
200
- vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
200
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
201
201
  vpunpcklqdq zmm14, zmm24, zmm25
202
202
  vpunpckhqdq zmm15, zmm24, zmm25
203
203
  prefetcht0 byte ptr [r8+rdx+80H]
@@ -2073,7 +2073,7 @@ final7blocks:
2073
2073
  vpermq ymm14, ymm14, 0DCH
2074
2074
  vpermq ymm15, ymm15, 0DCH
2075
2075
  vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN]
2076
- vinserti32x8 zmm13, zmm14, ymm15, 01H
2076
+ vinserti64x4 zmm13, zmm14, ymm15, 01H
2077
2077
  mov eax, 17476
2078
2078
  kmovw k2, eax
2079
2079
  vpblendmd zmm13 {k2}, zmm13, zmm12