digest-blake3 0.0.1 → 0.37.0.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dd017f37b8a918a01192a1bc37736f0e3c5f4304a168e9dc86c0b248e9df938a
4
- data.tar.gz: 99e2195b8415c0165e8bda5118c32b6136cb92b7fc0372b5539edbac045b4792
3
+ metadata.gz: 4dc981436633bde6ba4fb278252d8a4a1ba58d039d0b1c8c794e36c4e47fa4a0
4
+ data.tar.gz: ae40be72a0252730792f3e82a00da765546c9606d91405ff69d3bad078ad307f
5
5
  SHA512:
6
- metadata.gz: dea6e7b5ca985d34bd630c75a432ebee442908079bd5099cd667e83da4db30bb5169c07cf216e7c1e84c8d4fe5124e8ace7960455d179aa7c2450b943f284b27
7
- data.tar.gz: bcfa94c0b78ce490aa0149412dd56f660bd6577469c13618f46ae8132d9a12c7cb2c7afb8329db8810adf6455c1e1554b51c918d94e14fe1a4fa6065ca37b09a
6
+ metadata.gz: c18ca69b1f4b47ac8308ee00cc6db861eb48bd3921a85c291cb09ea595534b1a476988453c1931cb9982e9f96e7d14e4fa4356cbecaa9c584252c9b7ad30ac62
7
+ data.tar.gz: b9cf5f04daf5d83a797191caa2f2c30e068ddd6b771d887acb96963b93550171e77ea81128fffc316b49641391a4f245a8484b8b909d53502be3a2fb3170ad76
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- digest-blake3 (0.0.1)
4
+ digest-blake3 (0.37.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
data/README.md CHANGED
@@ -58,7 +58,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
58
58
 
59
59
  ## Contributing
60
60
 
61
- Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/digest-blake3.
61
+ Bug reports and pull requests are welcome on GitHub at https://github.com/willbryant/digest-blake3.
62
62
 
63
63
  ## License
64
64
 
@@ -81,26 +81,29 @@ INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) {
81
81
  memcpy(cv_words, self->input_cv, 32);
82
82
  blake3_compress_in_place(cv_words, self->block, self->block_len,
83
83
  self->counter, self->flags);
84
- memcpy(cv, cv_words, 32);
84
+ store_cv_words(cv, cv_words);
85
85
  }
86
86
 
87
- INLINE void output_root_bytes(const output_t *self, uint8_t *out,
87
+ INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out,
88
88
  size_t out_len) {
89
- uint64_t output_block_counter = 0;
89
+ uint64_t output_block_counter = seek / 64;
90
+ size_t offset_within_block = seek % 64;
90
91
  uint8_t wide_buf[64];
91
92
  while (out_len > 0) {
92
93
  blake3_compress_xof(self->input_cv, self->block, self->block_len,
93
94
  output_block_counter, self->flags | ROOT, wide_buf);
95
+ size_t available_bytes = 64 - offset_within_block;
94
96
  size_t memcpy_len;
95
- if (out_len > 64) {
96
- memcpy_len = 64;
97
+ if (out_len > available_bytes) {
98
+ memcpy_len = available_bytes;
97
99
  } else {
98
100
  memcpy_len = out_len;
99
101
  }
100
- memcpy(out, wide_buf, memcpy_len);
102
+ memcpy(out, wide_buf + offset_within_block, memcpy_len);
101
103
  out += memcpy_len;
102
104
  out_len -= memcpy_len;
103
105
  output_block_counter += 1;
106
+ offset_within_block = 0;
104
107
  }
105
108
  }
106
109
 
@@ -256,10 +259,11 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
256
259
  // Why not just have the caller split the input on the first update(), instead
257
260
  // of implementing this special rule? Because we don't want to limit SIMD or
258
261
  // multi-threading parallelism for that update().
259
- size_t blake3_compress_subtree_wide(const uint8_t *input, size_t input_len,
260
- const uint32_t key[8],
261
- uint64_t chunk_counter, uint8_t flags,
262
- uint8_t *out) {
262
+ static size_t blake3_compress_subtree_wide(const uint8_t *input,
263
+ size_t input_len,
264
+ const uint32_t key[8],
265
+ uint64_t chunk_counter,
266
+ uint8_t flags, uint8_t *out) {
263
267
  // Note that the single chunk case does *not* bump the SIMD degree up to 2
264
268
  // when it is 1. If this implementation adds multi-threading in the future,
265
269
  // this gives us the option of multi-threading even the 2-chunk case, which
@@ -331,7 +335,7 @@ INLINE void compress_subtree_to_parent_node(
331
335
  assert(input_len > BLAKE3_CHUNK_LEN);
332
336
  #endif
333
337
 
334
- uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
338
+ uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
335
339
  size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
336
340
  chunk_counter, flags, cv_array);
337
341
 
@@ -363,10 +367,11 @@ void blake3_hasher_init_keyed(blake3_hasher *self,
363
367
  hasher_init_base(self, key_words, KEYED_HASH);
364
368
  }
365
369
 
366
- void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) {
370
+ void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
371
+ size_t context_len) {
367
372
  blake3_hasher context_hasher;
368
373
  hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT);
369
- blake3_hasher_update(&context_hasher, context, strlen(context));
374
+ blake3_hasher_update(&context_hasher, context, context_len);
370
375
  uint8_t context_key[BLAKE3_KEY_LEN];
371
376
  blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN);
372
377
  uint32_t context_key_words[8];
@@ -374,6 +379,10 @@ void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) {
374
379
  hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL);
375
380
  }
376
381
 
382
+ void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) {
383
+ blake3_hasher_init_derive_key_raw(self, context, strlen(context));
384
+ }
385
+
377
386
  // As described in hasher_push_cv() below, we do "lazy merging", delaying
378
387
  // merges until right before the next CV is about to be added. This is
379
388
  // different from the reference implementation. Another difference is that we
@@ -425,8 +434,8 @@ INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) {
425
434
  // compress_subtree_to_parent_node(). That function always returns the top
426
435
  // *two* chaining values of the subtree it's compressing. We then do lazy
427
436
  // merging with each of them separately, so that the second CV will always
428
- // remain unmerged. (The compress_subtree_to_parent_node also helps us support
429
- // extendable output when we're hashing an input all-at-once.)
437
+ // remain unmerged. (That also helps us support extendable output when we're
438
+ // hashing an input all-at-once.)
430
439
  INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN],
431
440
  uint64_t chunk_counter) {
432
441
  hasher_merge_cv_stack(self, chunk_counter);
@@ -472,8 +481,8 @@ void blake3_hasher_update(blake3_hasher *self, const void *input,
472
481
 
473
482
  // Now the chunk_state is clear, and we have more input. If there's more than
474
483
  // a single chunk (so, definitely not the root chunk), hash the largest whole
475
- // subtree we can, with the full benefits of SIMD and multi-threading
476
- // parallelism. Two restrictions:
484
+ // subtree we can, with the full benefits of SIMD (and maybe in the future,
485
+ // multi-threading) parallelism. Two restrictions:
477
486
  // - The subtree has to be a power-of-2 number of chunks. Only subtrees along
478
487
  // the right edge can be incomplete, and we don't know where the right edge
479
488
  // is going to be until we get to finalize().
@@ -546,6 +555,11 @@ void blake3_hasher_update(blake3_hasher *self, const void *input,
546
555
 
547
556
  void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
548
557
  size_t out_len) {
558
+ blake3_hasher_finalize_seek(self, 0, out, out_len);
559
+ }
560
+
561
+ void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
562
+ uint8_t *out, size_t out_len) {
549
563
  // Explicitly checking for zero avoids causing UB by passing a null pointer
550
564
  // to memcpy. This comes up in practice with things like:
551
565
  // std::vector<uint8_t> v;
@@ -557,7 +571,7 @@ void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
557
571
  // If the subtree stack is empty, then the current chunk is the root.
558
572
  if (self->cv_stack_len == 0) {
559
573
  output_t output = chunk_state_output(&self->chunk);
560
- output_root_bytes(&output, out, out_len);
574
+ output_root_bytes(&output, seek, out, out_len);
561
575
  return;
562
576
  }
563
577
  // If there are any bytes in the chunk state, finalize that chunk and do a
@@ -585,5 +599,5 @@ void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
585
599
  output_chaining_value(&output, &parent_block[32]);
586
600
  output = parent_output(parent_block, self->key, self->chunk.flags);
587
601
  }
588
- output_root_bytes(&output, out, out_len);
602
+ output_root_bytes(&output, seek, out, out_len);
589
603
  }
@@ -4,7 +4,7 @@
4
4
  #include <stddef.h>
5
5
  #include <stdint.h>
6
6
 
7
- #ifdef __cplusplus
7
+ #ifdef __cplusplus
8
8
  extern "C" {
9
9
  #endif
10
10
 
@@ -42,12 +42,16 @@ void blake3_hasher_init(blake3_hasher *self);
42
42
  void blake3_hasher_init_keyed(blake3_hasher *self,
43
43
  const uint8_t key[BLAKE3_KEY_LEN]);
44
44
  void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
45
+ void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
46
+ size_t context_len);
45
47
  void blake3_hasher_update(blake3_hasher *self, const void *input,
46
48
  size_t input_len);
47
49
  void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
48
50
  size_t out_len);
51
+ void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
52
+ uint8_t *out, size_t out_len);
49
53
 
50
- #ifdef __cplusplus
54
+ #ifdef __cplusplus
51
55
  }
52
56
  #endif
53
57
 
@@ -1,3 +1,17 @@
1
+ #if defined(__ELF__) && defined(__linux__)
2
+ .section .note.GNU-stack,"",%progbits
3
+ #endif
4
+
5
+ #if defined(__ELF__) && defined(__CET__) && defined(__has_include)
6
+ #if __has_include(<cet.h>)
7
+ #include <cet.h>
8
+ #endif
9
+ #endif
10
+
11
+ #if !defined(_CET_ENDBR)
12
+ #define _CET_ENDBR
13
+ #endif
14
+
1
15
  .intel_syntax noprefix
2
16
  .global _blake3_hash_many_avx2
3
17
  .global blake3_hash_many_avx2
@@ -9,6 +23,7 @@
9
23
  .p2align 6
10
24
  _blake3_hash_many_avx2:
11
25
  blake3_hash_many_avx2:
26
+ _CET_ENDBR
12
27
  push r15
13
28
  push r14
14
29
  push r13
@@ -1,5 +1,18 @@
1
- .intel_syntax noprefix
1
+ #if defined(__ELF__) && defined(__linux__)
2
+ .section .note.GNU-stack,"",%progbits
3
+ #endif
4
+
5
+ #if defined(__ELF__) && defined(__CET__) && defined(__has_include)
6
+ #if __has_include(<cet.h>)
7
+ #include <cet.h>
8
+ #endif
9
+ #endif
10
+
11
+ #if !defined(_CET_ENDBR)
12
+ #define _CET_ENDBR
13
+ #endif
2
14
 
15
+ .intel_syntax noprefix
3
16
  .global _blake3_hash_many_avx512
4
17
  .global blake3_hash_many_avx512
5
18
  .global blake3_compress_in_place_avx512
@@ -15,6 +28,7 @@
15
28
  .p2align 6
16
29
  _blake3_hash_many_avx512:
17
30
  blake3_hash_many_avx512:
31
+ _CET_ENDBR
18
32
  push r15
19
33
  push r14
20
34
  push r13
@@ -82,15 +96,15 @@ blake3_hash_many_avx512:
82
96
  mov r14, qword ptr [rdi+0x50]
83
97
  mov r15, qword ptr [rdi+0x58]
84
98
  vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
85
- vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
99
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
86
100
  vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
87
- vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
101
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
88
102
  vpunpcklqdq zmm8, zmm16, zmm17
89
103
  vpunpckhqdq zmm9, zmm16, zmm17
90
104
  vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
91
- vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
105
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
92
106
  vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
93
- vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
107
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
94
108
  vpunpcklqdq zmm10, zmm18, zmm19
95
109
  vpunpckhqdq zmm11, zmm18, zmm19
96
110
  mov r8, qword ptr [rdi+0x20]
@@ -102,15 +116,15 @@ blake3_hash_many_avx512:
102
116
  mov r14, qword ptr [rdi+0x70]
103
117
  mov r15, qword ptr [rdi+0x78]
104
118
  vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
105
- vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
119
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
106
120
  vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
107
- vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
121
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
108
122
  vpunpcklqdq zmm12, zmm16, zmm17
109
123
  vpunpckhqdq zmm13, zmm16, zmm17
110
124
  vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
111
- vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
125
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
112
126
  vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
113
- vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
127
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
114
128
  vpunpcklqdq zmm14, zmm18, zmm19
115
129
  vpunpckhqdq zmm15, zmm18, zmm19
116
130
  vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
@@ -144,15 +158,15 @@ blake3_hash_many_avx512:
144
158
  mov r14, qword ptr [rdi+0x50]
145
159
  mov r15, qword ptr [rdi+0x58]
146
160
  vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
147
- vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
161
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
148
162
  vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
149
- vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
163
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
150
164
  vpunpcklqdq zmm8, zmm24, zmm25
151
165
  vpunpckhqdq zmm9, zmm24, zmm25
152
166
  vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
153
- vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
167
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
154
168
  vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
155
- vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
169
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
156
170
  vpunpcklqdq zmm10, zmm24, zmm25
157
171
  vpunpckhqdq zmm11, zmm24, zmm25
158
172
  prefetcht0 [r8+rdx+0x80]
@@ -172,15 +186,15 @@ blake3_hash_many_avx512:
172
186
  mov r14, qword ptr [rdi+0x70]
173
187
  mov r15, qword ptr [rdi+0x78]
174
188
  vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
175
- vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
189
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
176
190
  vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
177
- vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
191
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
178
192
  vpunpcklqdq zmm12, zmm24, zmm25
179
193
  vpunpckhqdq zmm13, zmm24, zmm25
180
194
  vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
181
- vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
195
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
182
196
  vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
183
- vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
197
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
184
198
  vpunpcklqdq zmm14, zmm24, zmm25
185
199
  vpunpckhqdq zmm15, zmm24, zmm25
186
200
  prefetcht0 [r8+rdx+0x80]
@@ -2039,7 +2053,7 @@ blake3_hash_many_avx512:
2039
2053
  vpermq ymm14, ymm14, 0xDC
2040
2054
  vpermq ymm15, ymm15, 0xDC
2041
2055
  vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
2042
- vinserti32x8 zmm13, zmm14, ymm15, 0x01
2056
+ vinserti64x4 zmm13, zmm14, ymm15, 0x01
2043
2057
  mov eax, 17476
2044
2058
  kmovw k2, eax
2045
2059
  vpblendmd zmm13 {k2}, zmm13, zmm12
@@ -2372,6 +2386,7 @@ blake3_hash_many_avx512:
2372
2386
  .p2align 6
2373
2387
  _blake3_compress_in_place_avx512:
2374
2388
  blake3_compress_in_place_avx512:
2389
+ _CET_ENDBR
2375
2390
  vmovdqu xmm0, xmmword ptr [rdi]
2376
2391
  vmovdqu xmm1, xmmword ptr [rdi+0x10]
2377
2392
  movzx eax, r8b
@@ -2454,6 +2469,7 @@ blake3_compress_in_place_avx512:
2454
2469
  .p2align 6
2455
2470
  _blake3_compress_xof_avx512:
2456
2471
  blake3_compress_xof_avx512:
2472
+ _CET_ENDBR
2457
2473
  vmovdqu xmm0, xmmword ptr [rdi]
2458
2474
  vmovdqu xmm1, xmmword ptr [rdi+0x10]
2459
2475
  movzx eax, r8b
@@ -96,15 +96,15 @@ blake3_hash_many_avx512:
96
96
  mov r14, qword ptr [rdi+0x50]
97
97
  mov r15, qword ptr [rdi+0x58]
98
98
  vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
99
- vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
99
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
100
100
  vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
101
- vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
101
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
102
102
  vpunpcklqdq zmm8, zmm16, zmm17
103
103
  vpunpckhqdq zmm9, zmm16, zmm17
104
104
  vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
105
- vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
105
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
106
106
  vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
107
- vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
107
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
108
108
  vpunpcklqdq zmm10, zmm18, zmm19
109
109
  vpunpckhqdq zmm11, zmm18, zmm19
110
110
  mov r8, qword ptr [rdi+0x20]
@@ -116,15 +116,15 @@ blake3_hash_many_avx512:
116
116
  mov r14, qword ptr [rdi+0x70]
117
117
  mov r15, qword ptr [rdi+0x78]
118
118
  vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
119
- vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
119
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
120
120
  vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
121
- vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
121
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
122
122
  vpunpcklqdq zmm12, zmm16, zmm17
123
123
  vpunpckhqdq zmm13, zmm16, zmm17
124
124
  vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
125
- vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
125
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
126
126
  vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
127
- vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
127
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
128
128
  vpunpcklqdq zmm14, zmm18, zmm19
129
129
  vpunpckhqdq zmm15, zmm18, zmm19
130
130
  vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
@@ -158,15 +158,15 @@ blake3_hash_many_avx512:
158
158
  mov r14, qword ptr [rdi+0x50]
159
159
  mov r15, qword ptr [rdi+0x58]
160
160
  vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
161
- vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
161
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
162
162
  vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
163
- vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
163
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
164
164
  vpunpcklqdq zmm8, zmm24, zmm25
165
165
  vpunpckhqdq zmm9, zmm24, zmm25
166
166
  vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
167
- vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
167
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
168
168
  vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
169
- vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
169
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
170
170
  vpunpcklqdq zmm10, zmm24, zmm25
171
171
  vpunpckhqdq zmm11, zmm24, zmm25
172
172
  prefetcht0 [r8+rdx+0x80]
@@ -186,15 +186,15 @@ blake3_hash_many_avx512:
186
186
  mov r14, qword ptr [rdi+0x70]
187
187
  mov r15, qword ptr [rdi+0x78]
188
188
  vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
189
- vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
189
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
190
190
  vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
191
- vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
191
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
192
192
  vpunpcklqdq zmm12, zmm24, zmm25
193
193
  vpunpckhqdq zmm13, zmm24, zmm25
194
194
  vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
195
- vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
195
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
196
196
  vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
197
- vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
197
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
198
198
  vpunpcklqdq zmm14, zmm24, zmm25
199
199
  vpunpckhqdq zmm15, zmm24, zmm25
200
200
  prefetcht0 [r8+rdx+0x80]
@@ -2065,7 +2065,7 @@ blake3_hash_many_avx512:
2065
2065
  vpermq ymm14, ymm14, 0xDC
2066
2066
  vpermq ymm15, ymm15, 0xDC
2067
2067
  vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
2068
- vinserti32x8 zmm13, zmm14, ymm15, 0x01
2068
+ vinserti64x4 zmm13, zmm14, ymm15, 0x01
2069
2069
  mov eax, 17476
2070
2070
  kmovw k2, eax
2071
2071
  vpblendmd zmm13 {k2}, zmm13, zmm12
@@ -99,15 +99,15 @@ innerloop16:
99
99
  mov r14, qword ptr [rdi+50H]
100
100
  mov r15, qword ptr [rdi+58H]
101
101
  vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H]
102
- vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
102
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
103
103
  vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H]
104
- vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
104
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
105
105
  vpunpcklqdq zmm8, zmm16, zmm17
106
106
  vpunpckhqdq zmm9, zmm16, zmm17
107
107
  vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H]
108
- vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
108
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
109
109
  vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H]
110
- vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
110
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
111
111
  vpunpcklqdq zmm10, zmm18, zmm19
112
112
  vpunpckhqdq zmm11, zmm18, zmm19
113
113
  mov r8, qword ptr [rdi+20H]
@@ -119,15 +119,15 @@ innerloop16:
119
119
  mov r14, qword ptr [rdi+70H]
120
120
  mov r15, qword ptr [rdi+78H]
121
121
  vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H]
122
- vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
122
+ vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
123
123
  vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H]
124
- vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
124
+ vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
125
125
  vpunpcklqdq zmm12, zmm16, zmm17
126
126
  vpunpckhqdq zmm13, zmm16, zmm17
127
127
  vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H]
128
- vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
128
+ vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
129
129
  vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H]
130
- vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
130
+ vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
131
131
  vpunpcklqdq zmm14, zmm18, zmm19
132
132
  vpunpckhqdq zmm15, zmm18, zmm19
133
133
  vmovdqa32 zmm27, zmmword ptr [INDEX0]
@@ -161,15 +161,15 @@ innerloop16:
161
161
  mov r14, qword ptr [rdi+50H]
162
162
  mov r15, qword ptr [rdi+58H]
163
163
  vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H]
164
- vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
164
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
165
165
  vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H]
166
- vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
166
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
167
167
  vpunpcklqdq zmm8, zmm24, zmm25
168
168
  vpunpckhqdq zmm9, zmm24, zmm25
169
169
  vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H]
170
- vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
170
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
171
171
  vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H]
172
- vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
172
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
173
173
  vpunpcklqdq zmm10, zmm24, zmm25
174
174
  vpunpckhqdq zmm11, zmm24, zmm25
175
175
  prefetcht0 byte ptr [r8+rdx+80H]
@@ -189,15 +189,15 @@ innerloop16:
189
189
  mov r14, qword ptr [rdi+70H]
190
190
  mov r15, qword ptr [rdi+78H]
191
191
  vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H]
192
- vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
192
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
193
193
  vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H]
194
- vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
194
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
195
195
  vpunpcklqdq zmm12, zmm24, zmm25
196
196
  vpunpckhqdq zmm13, zmm24, zmm25
197
197
  vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H]
198
- vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
198
+ vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
199
199
  vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H]
200
- vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
200
+ vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
201
201
  vpunpcklqdq zmm14, zmm24, zmm25
202
202
  vpunpckhqdq zmm15, zmm24, zmm25
203
203
  prefetcht0 byte ptr [r8+rdx+80H]
@@ -2073,7 +2073,7 @@ final7blocks:
2073
2073
  vpermq ymm14, ymm14, 0DCH
2074
2074
  vpermq ymm15, ymm15, 0DCH
2075
2075
  vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN]
2076
- vinserti32x8 zmm13, zmm14, ymm15, 01H
2076
+ vinserti64x4 zmm13, zmm14, ymm15, 01H
2077
2077
  mov eax, 17476
2078
2078
  kmovw k2, eax
2079
2079
  vpblendmd zmm13 {k2}, zmm13, zmm12