digest-blake3 0.0.1 → 0.37.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +1 -1
- data/ext/digest/blake3/blake3.c +33 -19
- data/ext/digest/blake3/blake3.h +6 -2
- data/ext/digest/blake3/blake3_avx2_x86-64_unix.S +15 -0
- data/ext/digest/blake3/blake3_avx512_x86-64_unix.S +34 -18
- data/ext/digest/blake3/blake3_avx512_x86-64_windows_gnu.S +17 -17
- data/ext/digest/blake3/blake3_avx512_x86-64_windows_msvc.asm +17 -17
- data/ext/digest/blake3/blake3_dispatch.c +28 -70
- data/ext/digest/blake3/blake3_impl.h +103 -1
- data/ext/digest/blake3/blake3_portable.c +1 -9
- data/ext/digest/blake3/blake3_sse2.c +565 -0
- data/ext/digest/blake3/blake3_sse2_x86-64_unix.S +2291 -0
- data/ext/digest/blake3/blake3_sse2_x86-64_windows_gnu.S +2332 -0
- data/ext/digest/blake3/blake3_sse2_x86-64_windows_msvc.asm +2350 -0
- data/ext/digest/blake3/blake3_sse41_x86-64_unix.S +17 -0
- data/ext/digest/blake3/blake3_sse41_x86-64_windows_gnu.S +19 -7
- data/ext/digest/blake3/blake3_sse41_x86-64_windows_msvc.asm +19 -7
- data/ext/digest/blake3/extconf.rb +6 -3
- data/lib/digest/blake3/version.rb +1 -1
- metadata +10 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4dc981436633bde6ba4fb278252d8a4a1ba58d039d0b1c8c794e36c4e47fa4a0
|
4
|
+
data.tar.gz: ae40be72a0252730792f3e82a00da765546c9606d91405ff69d3bad078ad307f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c18ca69b1f4b47ac8308ee00cc6db861eb48bd3921a85c291cb09ea595534b1a476988453c1931cb9982e9f96e7d14e4fa4356cbecaa9c584252c9b7ad30ac62
|
7
|
+
data.tar.gz: b9cf5f04daf5d83a797191caa2f2c30e068ddd6b771d887acb96963b93550171e77ea81128fffc316b49641391a4f245a8484b8b909d53502be3a2fb3170ad76
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -58,7 +58,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
58
58
|
|
59
59
|
## Contributing
|
60
60
|
|
61
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
61
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/willbryant/digest-blake3.
|
62
62
|
|
63
63
|
## License
|
64
64
|
|
data/ext/digest/blake3/blake3.c
CHANGED
@@ -81,26 +81,29 @@ INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) {
|
|
81
81
|
memcpy(cv_words, self->input_cv, 32);
|
82
82
|
blake3_compress_in_place(cv_words, self->block, self->block_len,
|
83
83
|
self->counter, self->flags);
|
84
|
-
|
84
|
+
store_cv_words(cv, cv_words);
|
85
85
|
}
|
86
86
|
|
87
|
-
INLINE void output_root_bytes(const output_t *self, uint8_t *out,
|
87
|
+
INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out,
|
88
88
|
size_t out_len) {
|
89
|
-
uint64_t output_block_counter =
|
89
|
+
uint64_t output_block_counter = seek / 64;
|
90
|
+
size_t offset_within_block = seek % 64;
|
90
91
|
uint8_t wide_buf[64];
|
91
92
|
while (out_len > 0) {
|
92
93
|
blake3_compress_xof(self->input_cv, self->block, self->block_len,
|
93
94
|
output_block_counter, self->flags | ROOT, wide_buf);
|
95
|
+
size_t available_bytes = 64 - offset_within_block;
|
94
96
|
size_t memcpy_len;
|
95
|
-
if (out_len >
|
96
|
-
memcpy_len =
|
97
|
+
if (out_len > available_bytes) {
|
98
|
+
memcpy_len = available_bytes;
|
97
99
|
} else {
|
98
100
|
memcpy_len = out_len;
|
99
101
|
}
|
100
|
-
memcpy(out, wide_buf, memcpy_len);
|
102
|
+
memcpy(out, wide_buf + offset_within_block, memcpy_len);
|
101
103
|
out += memcpy_len;
|
102
104
|
out_len -= memcpy_len;
|
103
105
|
output_block_counter += 1;
|
106
|
+
offset_within_block = 0;
|
104
107
|
}
|
105
108
|
}
|
106
109
|
|
@@ -256,10 +259,11 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
|
|
256
259
|
// Why not just have the caller split the input on the first update(), instead
|
257
260
|
// of implementing this special rule? Because we don't want to limit SIMD or
|
258
261
|
// multi-threading parallelism for that update().
|
259
|
-
size_t blake3_compress_subtree_wide(const uint8_t *input,
|
260
|
-
|
261
|
-
|
262
|
-
|
262
|
+
static size_t blake3_compress_subtree_wide(const uint8_t *input,
|
263
|
+
size_t input_len,
|
264
|
+
const uint32_t key[8],
|
265
|
+
uint64_t chunk_counter,
|
266
|
+
uint8_t flags, uint8_t *out) {
|
263
267
|
// Note that the single chunk case does *not* bump the SIMD degree up to 2
|
264
268
|
// when it is 1. If this implementation adds multi-threading in the future,
|
265
269
|
// this gives us the option of multi-threading even the 2-chunk case, which
|
@@ -331,7 +335,7 @@ INLINE void compress_subtree_to_parent_node(
|
|
331
335
|
assert(input_len > BLAKE3_CHUNK_LEN);
|
332
336
|
#endif
|
333
337
|
|
334
|
-
uint8_t cv_array[
|
338
|
+
uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
|
335
339
|
size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
|
336
340
|
chunk_counter, flags, cv_array);
|
337
341
|
|
@@ -363,10 +367,11 @@ void blake3_hasher_init_keyed(blake3_hasher *self,
|
|
363
367
|
hasher_init_base(self, key_words, KEYED_HASH);
|
364
368
|
}
|
365
369
|
|
366
|
-
void
|
370
|
+
void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
|
371
|
+
size_t context_len) {
|
367
372
|
blake3_hasher context_hasher;
|
368
373
|
hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT);
|
369
|
-
blake3_hasher_update(&context_hasher, context,
|
374
|
+
blake3_hasher_update(&context_hasher, context, context_len);
|
370
375
|
uint8_t context_key[BLAKE3_KEY_LEN];
|
371
376
|
blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN);
|
372
377
|
uint32_t context_key_words[8];
|
@@ -374,6 +379,10 @@ void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) {
|
|
374
379
|
hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL);
|
375
380
|
}
|
376
381
|
|
382
|
+
void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) {
|
383
|
+
blake3_hasher_init_derive_key_raw(self, context, strlen(context));
|
384
|
+
}
|
385
|
+
|
377
386
|
// As described in hasher_push_cv() below, we do "lazy merging", delaying
|
378
387
|
// merges until right before the next CV is about to be added. This is
|
379
388
|
// different from the reference implementation. Another difference is that we
|
@@ -425,8 +434,8 @@ INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) {
|
|
425
434
|
// compress_subtree_to_parent_node(). That function always returns the top
|
426
435
|
// *two* chaining values of the subtree it's compressing. We then do lazy
|
427
436
|
// merging with each of them separately, so that the second CV will always
|
428
|
-
// remain unmerged. (
|
429
|
-
//
|
437
|
+
// remain unmerged. (That also helps us support extendable output when we're
|
438
|
+
// hashing an input all-at-once.)
|
430
439
|
INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN],
|
431
440
|
uint64_t chunk_counter) {
|
432
441
|
hasher_merge_cv_stack(self, chunk_counter);
|
@@ -472,8 +481,8 @@ void blake3_hasher_update(blake3_hasher *self, const void *input,
|
|
472
481
|
|
473
482
|
// Now the chunk_state is clear, and we have more input. If there's more than
|
474
483
|
// a single chunk (so, definitely not the root chunk), hash the largest whole
|
475
|
-
// subtree we can, with the full benefits of SIMD and
|
476
|
-
// parallelism. Two restrictions:
|
484
|
+
// subtree we can, with the full benefits of SIMD (and maybe in the future,
|
485
|
+
// multi-threading) parallelism. Two restrictions:
|
477
486
|
// - The subtree has to be a power-of-2 number of chunks. Only subtrees along
|
478
487
|
// the right edge can be incomplete, and we don't know where the right edge
|
479
488
|
// is going to be until we get to finalize().
|
@@ -546,6 +555,11 @@ void blake3_hasher_update(blake3_hasher *self, const void *input,
|
|
546
555
|
|
547
556
|
void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
|
548
557
|
size_t out_len) {
|
558
|
+
blake3_hasher_finalize_seek(self, 0, out, out_len);
|
559
|
+
}
|
560
|
+
|
561
|
+
void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
|
562
|
+
uint8_t *out, size_t out_len) {
|
549
563
|
// Explicitly checking for zero avoids causing UB by passing a null pointer
|
550
564
|
// to memcpy. This comes up in practice with things like:
|
551
565
|
// std::vector<uint8_t> v;
|
@@ -557,7 +571,7 @@ void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
|
|
557
571
|
// If the subtree stack is empty, then the current chunk is the root.
|
558
572
|
if (self->cv_stack_len == 0) {
|
559
573
|
output_t output = chunk_state_output(&self->chunk);
|
560
|
-
output_root_bytes(&output, out, out_len);
|
574
|
+
output_root_bytes(&output, seek, out, out_len);
|
561
575
|
return;
|
562
576
|
}
|
563
577
|
// If there are any bytes in the chunk state, finalize that chunk and do a
|
@@ -585,5 +599,5 @@ void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
|
|
585
599
|
output_chaining_value(&output, &parent_block[32]);
|
586
600
|
output = parent_output(parent_block, self->key, self->chunk.flags);
|
587
601
|
}
|
588
|
-
output_root_bytes(&output, out, out_len);
|
602
|
+
output_root_bytes(&output, seek, out, out_len);
|
589
603
|
}
|
data/ext/digest/blake3/blake3.h
CHANGED
@@ -4,7 +4,7 @@
|
|
4
4
|
#include <stddef.h>
|
5
5
|
#include <stdint.h>
|
6
6
|
|
7
|
-
#ifdef
|
7
|
+
#ifdef __cplusplus
|
8
8
|
extern "C" {
|
9
9
|
#endif
|
10
10
|
|
@@ -42,12 +42,16 @@ void blake3_hasher_init(blake3_hasher *self);
|
|
42
42
|
void blake3_hasher_init_keyed(blake3_hasher *self,
|
43
43
|
const uint8_t key[BLAKE3_KEY_LEN]);
|
44
44
|
void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
|
45
|
+
void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
|
46
|
+
size_t context_len);
|
45
47
|
void blake3_hasher_update(blake3_hasher *self, const void *input,
|
46
48
|
size_t input_len);
|
47
49
|
void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
|
48
50
|
size_t out_len);
|
51
|
+
void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
|
52
|
+
uint8_t *out, size_t out_len);
|
49
53
|
|
50
|
-
#ifdef
|
54
|
+
#ifdef __cplusplus
|
51
55
|
}
|
52
56
|
#endif
|
53
57
|
|
data/ext/digest/blake3/blake3_avx2_x86-64_unix.S
CHANGED
@@ -1,3 +1,17 @@
|
|
1
|
+
#if defined(__ELF__) && defined(__linux__)
|
2
|
+
.section .note.GNU-stack,"",%progbits
|
3
|
+
#endif
|
4
|
+
|
5
|
+
#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
|
6
|
+
#if __has_include(<cet.h>)
|
7
|
+
#include <cet.h>
|
8
|
+
#endif
|
9
|
+
#endif
|
10
|
+
|
11
|
+
#if !defined(_CET_ENDBR)
|
12
|
+
#define _CET_ENDBR
|
13
|
+
#endif
|
14
|
+
|
1
15
|
.intel_syntax noprefix
|
2
16
|
.global _blake3_hash_many_avx2
|
3
17
|
.global blake3_hash_many_avx2
|
@@ -9,6 +23,7 @@
|
|
9
23
|
.p2align 6
|
10
24
|
_blake3_hash_many_avx2:
|
11
25
|
blake3_hash_many_avx2:
|
26
|
+
_CET_ENDBR
|
12
27
|
push r15
|
13
28
|
push r14
|
14
29
|
push r13
|
data/ext/digest/blake3/blake3_avx512_x86-64_unix.S
CHANGED
@@ -1,5 +1,18 @@
|
|
1
|
-
|
1
|
+
#if defined(__ELF__) && defined(__linux__)
|
2
|
+
.section .note.GNU-stack,"",%progbits
|
3
|
+
#endif
|
4
|
+
|
5
|
+
#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
|
6
|
+
#if __has_include(<cet.h>)
|
7
|
+
#include <cet.h>
|
8
|
+
#endif
|
9
|
+
#endif
|
10
|
+
|
11
|
+
#if !defined(_CET_ENDBR)
|
12
|
+
#define _CET_ENDBR
|
13
|
+
#endif
|
2
14
|
|
15
|
+
.intel_syntax noprefix
|
3
16
|
.global _blake3_hash_many_avx512
|
4
17
|
.global blake3_hash_many_avx512
|
5
18
|
.global blake3_compress_in_place_avx512
|
@@ -15,6 +28,7 @@
|
|
15
28
|
.p2align 6
|
16
29
|
_blake3_hash_many_avx512:
|
17
30
|
blake3_hash_many_avx512:
|
31
|
+
_CET_ENDBR
|
18
32
|
push r15
|
19
33
|
push r14
|
20
34
|
push r13
|
@@ -82,15 +96,15 @@ blake3_hash_many_avx512:
|
|
82
96
|
mov r14, qword ptr [rdi+0x50]
|
83
97
|
mov r15, qword ptr [rdi+0x58]
|
84
98
|
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
|
85
|
-
|
99
|
+
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
|
86
100
|
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
|
87
|
-
|
101
|
+
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
|
88
102
|
vpunpcklqdq zmm8, zmm16, zmm17
|
89
103
|
vpunpckhqdq zmm9, zmm16, zmm17
|
90
104
|
vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
|
91
|
-
|
105
|
+
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
|
92
106
|
vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
|
93
|
-
|
107
|
+
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
|
94
108
|
vpunpcklqdq zmm10, zmm18, zmm19
|
95
109
|
vpunpckhqdq zmm11, zmm18, zmm19
|
96
110
|
mov r8, qword ptr [rdi+0x20]
|
@@ -102,15 +116,15 @@ blake3_hash_many_avx512:
|
|
102
116
|
mov r14, qword ptr [rdi+0x70]
|
103
117
|
mov r15, qword ptr [rdi+0x78]
|
104
118
|
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
|
105
|
-
|
119
|
+
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
|
106
120
|
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
|
107
|
-
|
121
|
+
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
|
108
122
|
vpunpcklqdq zmm12, zmm16, zmm17
|
109
123
|
vpunpckhqdq zmm13, zmm16, zmm17
|
110
124
|
vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
|
111
|
-
|
125
|
+
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
|
112
126
|
vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
|
113
|
-
|
127
|
+
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
|
114
128
|
vpunpcklqdq zmm14, zmm18, zmm19
|
115
129
|
vpunpckhqdq zmm15, zmm18, zmm19
|
116
130
|
vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
|
@@ -144,15 +158,15 @@ blake3_hash_many_avx512:
|
|
144
158
|
mov r14, qword ptr [rdi+0x50]
|
145
159
|
mov r15, qword ptr [rdi+0x58]
|
146
160
|
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
|
147
|
-
|
161
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
|
148
162
|
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
|
149
|
-
|
163
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
|
150
164
|
vpunpcklqdq zmm8, zmm24, zmm25
|
151
165
|
vpunpckhqdq zmm9, zmm24, zmm25
|
152
166
|
vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
|
153
|
-
|
167
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
|
154
168
|
vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
|
155
|
-
|
169
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
|
156
170
|
vpunpcklqdq zmm10, zmm24, zmm25
|
157
171
|
vpunpckhqdq zmm11, zmm24, zmm25
|
158
172
|
prefetcht0 [r8+rdx+0x80]
|
@@ -172,15 +186,15 @@ blake3_hash_many_avx512:
|
|
172
186
|
mov r14, qword ptr [rdi+0x70]
|
173
187
|
mov r15, qword ptr [rdi+0x78]
|
174
188
|
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
|
175
|
-
|
189
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
|
176
190
|
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
|
177
|
-
|
191
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
|
178
192
|
vpunpcklqdq zmm12, zmm24, zmm25
|
179
193
|
vpunpckhqdq zmm13, zmm24, zmm25
|
180
194
|
vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
|
181
|
-
|
195
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
|
182
196
|
vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
|
183
|
-
|
197
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
|
184
198
|
vpunpcklqdq zmm14, zmm24, zmm25
|
185
199
|
vpunpckhqdq zmm15, zmm24, zmm25
|
186
200
|
prefetcht0 [r8+rdx+0x80]
|
@@ -2039,7 +2053,7 @@ blake3_hash_many_avx512:
|
|
2039
2053
|
vpermq ymm14, ymm14, 0xDC
|
2040
2054
|
vpermq ymm15, ymm15, 0xDC
|
2041
2055
|
vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
|
2042
|
-
|
2056
|
+
vinserti64x4 zmm13, zmm14, ymm15, 0x01
|
2043
2057
|
mov eax, 17476
|
2044
2058
|
kmovw k2, eax
|
2045
2059
|
vpblendmd zmm13 {k2}, zmm13, zmm12
|
@@ -2372,6 +2386,7 @@ blake3_hash_many_avx512:
|
|
2372
2386
|
.p2align 6
|
2373
2387
|
_blake3_compress_in_place_avx512:
|
2374
2388
|
blake3_compress_in_place_avx512:
|
2389
|
+
_CET_ENDBR
|
2375
2390
|
vmovdqu xmm0, xmmword ptr [rdi]
|
2376
2391
|
vmovdqu xmm1, xmmword ptr [rdi+0x10]
|
2377
2392
|
movzx eax, r8b
|
@@ -2454,6 +2469,7 @@ blake3_compress_in_place_avx512:
|
|
2454
2469
|
.p2align 6
|
2455
2470
|
_blake3_compress_xof_avx512:
|
2456
2471
|
blake3_compress_xof_avx512:
|
2472
|
+
_CET_ENDBR
|
2457
2473
|
vmovdqu xmm0, xmmword ptr [rdi]
|
2458
2474
|
vmovdqu xmm1, xmmword ptr [rdi+0x10]
|
2459
2475
|
movzx eax, r8b
|
data/ext/digest/blake3/blake3_avx512_x86-64_windows_gnu.S
CHANGED
@@ -96,15 +96,15 @@ blake3_hash_many_avx512:
|
|
96
96
|
mov r14, qword ptr [rdi+0x50]
|
97
97
|
mov r15, qword ptr [rdi+0x58]
|
98
98
|
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
|
99
|
-
|
99
|
+
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
|
100
100
|
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
|
101
|
-
|
101
|
+
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
|
102
102
|
vpunpcklqdq zmm8, zmm16, zmm17
|
103
103
|
vpunpckhqdq zmm9, zmm16, zmm17
|
104
104
|
vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
|
105
|
-
|
105
|
+
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
|
106
106
|
vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
|
107
|
-
|
107
|
+
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
|
108
108
|
vpunpcklqdq zmm10, zmm18, zmm19
|
109
109
|
vpunpckhqdq zmm11, zmm18, zmm19
|
110
110
|
mov r8, qword ptr [rdi+0x20]
|
@@ -116,15 +116,15 @@ blake3_hash_many_avx512:
|
|
116
116
|
mov r14, qword ptr [rdi+0x70]
|
117
117
|
mov r15, qword ptr [rdi+0x78]
|
118
118
|
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
|
119
|
-
|
119
|
+
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
|
120
120
|
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
|
121
|
-
|
121
|
+
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
|
122
122
|
vpunpcklqdq zmm12, zmm16, zmm17
|
123
123
|
vpunpckhqdq zmm13, zmm16, zmm17
|
124
124
|
vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
|
125
|
-
|
125
|
+
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
|
126
126
|
vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
|
127
|
-
|
127
|
+
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
|
128
128
|
vpunpcklqdq zmm14, zmm18, zmm19
|
129
129
|
vpunpckhqdq zmm15, zmm18, zmm19
|
130
130
|
vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
|
@@ -158,15 +158,15 @@ blake3_hash_many_avx512:
|
|
158
158
|
mov r14, qword ptr [rdi+0x50]
|
159
159
|
mov r15, qword ptr [rdi+0x58]
|
160
160
|
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
|
161
|
-
|
161
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
|
162
162
|
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
|
163
|
-
|
163
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
|
164
164
|
vpunpcklqdq zmm8, zmm24, zmm25
|
165
165
|
vpunpckhqdq zmm9, zmm24, zmm25
|
166
166
|
vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
|
167
|
-
|
167
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
|
168
168
|
vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
|
169
|
-
|
169
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
|
170
170
|
vpunpcklqdq zmm10, zmm24, zmm25
|
171
171
|
vpunpckhqdq zmm11, zmm24, zmm25
|
172
172
|
prefetcht0 [r8+rdx+0x80]
|
@@ -186,15 +186,15 @@ blake3_hash_many_avx512:
|
|
186
186
|
mov r14, qword ptr [rdi+0x70]
|
187
187
|
mov r15, qword ptr [rdi+0x78]
|
188
188
|
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
|
189
|
-
|
189
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
|
190
190
|
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
|
191
|
-
|
191
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
|
192
192
|
vpunpcklqdq zmm12, zmm24, zmm25
|
193
193
|
vpunpckhqdq zmm13, zmm24, zmm25
|
194
194
|
vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
|
195
|
-
|
195
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
|
196
196
|
vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
|
197
|
-
|
197
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
|
198
198
|
vpunpcklqdq zmm14, zmm24, zmm25
|
199
199
|
vpunpckhqdq zmm15, zmm24, zmm25
|
200
200
|
prefetcht0 [r8+rdx+0x80]
|
@@ -2065,7 +2065,7 @@ blake3_hash_many_avx512:
|
|
2065
2065
|
vpermq ymm14, ymm14, 0xDC
|
2066
2066
|
vpermq ymm15, ymm15, 0xDC
|
2067
2067
|
vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
|
2068
|
-
|
2068
|
+
vinserti64x4 zmm13, zmm14, ymm15, 0x01
|
2069
2069
|
mov eax, 17476
|
2070
2070
|
kmovw k2, eax
|
2071
2071
|
vpblendmd zmm13 {k2}, zmm13, zmm12
|
data/ext/digest/blake3/blake3_avx512_x86-64_windows_msvc.asm
CHANGED
@@ -99,15 +99,15 @@ innerloop16:
|
|
99
99
|
mov r14, qword ptr [rdi+50H]
|
100
100
|
mov r15, qword ptr [rdi+58H]
|
101
101
|
vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H]
|
102
|
-
|
102
|
+
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
|
103
103
|
vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H]
|
104
|
-
|
104
|
+
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
|
105
105
|
vpunpcklqdq zmm8, zmm16, zmm17
|
106
106
|
vpunpckhqdq zmm9, zmm16, zmm17
|
107
107
|
vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H]
|
108
|
-
|
108
|
+
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
|
109
109
|
vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H]
|
110
|
-
|
110
|
+
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
|
111
111
|
vpunpcklqdq zmm10, zmm18, zmm19
|
112
112
|
vpunpckhqdq zmm11, zmm18, zmm19
|
113
113
|
mov r8, qword ptr [rdi+20H]
|
@@ -119,15 +119,15 @@ innerloop16:
|
|
119
119
|
mov r14, qword ptr [rdi+70H]
|
120
120
|
mov r15, qword ptr [rdi+78H]
|
121
121
|
vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H]
|
122
|
-
|
122
|
+
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
|
123
123
|
vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H]
|
124
|
-
|
124
|
+
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
|
125
125
|
vpunpcklqdq zmm12, zmm16, zmm17
|
126
126
|
vpunpckhqdq zmm13, zmm16, zmm17
|
127
127
|
vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H]
|
128
|
-
|
128
|
+
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
|
129
129
|
vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H]
|
130
|
-
|
130
|
+
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
|
131
131
|
vpunpcklqdq zmm14, zmm18, zmm19
|
132
132
|
vpunpckhqdq zmm15, zmm18, zmm19
|
133
133
|
vmovdqa32 zmm27, zmmword ptr [INDEX0]
|
@@ -161,15 +161,15 @@ innerloop16:
|
|
161
161
|
mov r14, qword ptr [rdi+50H]
|
162
162
|
mov r15, qword ptr [rdi+58H]
|
163
163
|
vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H]
|
164
|
-
|
164
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
|
165
165
|
vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H]
|
166
|
-
|
166
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
|
167
167
|
vpunpcklqdq zmm8, zmm24, zmm25
|
168
168
|
vpunpckhqdq zmm9, zmm24, zmm25
|
169
169
|
vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H]
|
170
|
-
|
170
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
|
171
171
|
vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H]
|
172
|
-
|
172
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
|
173
173
|
vpunpcklqdq zmm10, zmm24, zmm25
|
174
174
|
vpunpckhqdq zmm11, zmm24, zmm25
|
175
175
|
prefetcht0 byte ptr [r8+rdx+80H]
|
@@ -189,15 +189,15 @@ innerloop16:
|
|
189
189
|
mov r14, qword ptr [rdi+70H]
|
190
190
|
mov r15, qword ptr [rdi+78H]
|
191
191
|
vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H]
|
192
|
-
|
192
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
|
193
193
|
vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H]
|
194
|
-
|
194
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
|
195
195
|
vpunpcklqdq zmm12, zmm24, zmm25
|
196
196
|
vpunpckhqdq zmm13, zmm24, zmm25
|
197
197
|
vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H]
|
198
|
-
|
198
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
|
199
199
|
vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H]
|
200
|
-
|
200
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
|
201
201
|
vpunpcklqdq zmm14, zmm24, zmm25
|
202
202
|
vpunpckhqdq zmm15, zmm24, zmm25
|
203
203
|
prefetcht0 byte ptr [r8+rdx+80H]
|
@@ -2073,7 +2073,7 @@ final7blocks:
|
|
2073
2073
|
vpermq ymm14, ymm14, 0DCH
|
2074
2074
|
vpermq ymm15, ymm15, 0DCH
|
2075
2075
|
vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN]
|
2076
|
-
|
2076
|
+
vinserti64x4 zmm13, zmm14, ymm15, 01H
|
2077
2077
|
mov eax, 17476
|
2078
2078
|
kmovw k2, eax
|
2079
2079
|
vpblendmd zmm13 {k2}, zmm13, zmm12
|