digest-blake3 0.22.1 → 1.2.0.0
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects those changes as they appear in the respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +1 -1
- data/ext/digest/blake3/blake3.c +42 -20
- data/ext/digest/blake3/blake3.h +8 -3
- data/ext/digest/blake3/blake3_avx2_x86-64_unix.S +15 -0
- data/ext/digest/blake3/blake3_avx512_x86-64_unix.S +34 -18
- data/ext/digest/blake3/blake3_avx512_x86-64_windows_gnu.S +17 -17
- data/ext/digest/blake3/blake3_avx512_x86-64_windows_msvc.asm +17 -17
- data/ext/digest/blake3/blake3_dispatch.c +35 -71
- data/ext/digest/blake3/blake3_impl.h +117 -2
- data/ext/digest/blake3/blake3_neon.c +6 -1
- data/ext/digest/blake3/blake3_portable.c +1 -9
- data/ext/digest/blake3/blake3_sse2.c +565 -0
- data/ext/digest/blake3/blake3_sse2_x86-64_unix.S +2291 -0
- data/ext/digest/blake3/blake3_sse2_x86-64_windows_gnu.S +2332 -0
- data/ext/digest/blake3/blake3_sse2_x86-64_windows_msvc.asm +2350 -0
- data/ext/digest/blake3/blake3_sse41_x86-64_unix.S +17 -0
- data/ext/digest/blake3/blake3_sse41_x86-64_windows_gnu.S +19 -7
- data/ext/digest/blake3/blake3_sse41_x86-64_windows_msvc.asm +23 -11
- data/ext/digest/blake3/extconf.rb +4 -3
- data/lib/digest/blake3/version.rb +1 -1
- metadata +10 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ec7a77d6875b688e1cb1fbe8470cbf67278f9fe3f2f8e516bafe7abc0bf54bc4
|
4
|
+
data.tar.gz: 74e13b2480eccd5c2fe3fa913a0962217c1f07c95b5db80b8303086488ee5d9f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: de0fb7b5ccce755c313da8e547a430950d181170c64561746890ce8855ce5e09d3232b16316f36d22320ae5d23cf7904e8221a26358e96d9566ba247ef613214
|
7
|
+
data.tar.gz: 33e15e9469128ba227dbe6b57d9c44fe55078b9031975bf9db783a469c93342c7ccbf38b763ddfed7f09c941a42a6df89302cfda0e38b0ad4967a12acac4b18a
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -58,7 +58,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
58
58
|
|
59
59
|
## Contributing
|
60
60
|
|
61
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
61
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/willbryant/digest-blake3.
|
62
62
|
|
63
63
|
## License
|
64
64
|
|
data/ext/digest/blake3/blake3.c
CHANGED
@@ -5,6 +5,8 @@
|
|
5
5
|
#include "blake3.h"
|
6
6
|
#include "blake3_impl.h"
|
7
7
|
|
8
|
+
const char *blake3_version(void) { return BLAKE3_VERSION_STRING; }
|
9
|
+
|
8
10
|
INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8],
|
9
11
|
uint8_t flags) {
|
10
12
|
memcpy(self->cv, key, BLAKE3_KEY_LEN);
|
@@ -81,26 +83,29 @@ INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) {
|
|
81
83
|
memcpy(cv_words, self->input_cv, 32);
|
82
84
|
blake3_compress_in_place(cv_words, self->block, self->block_len,
|
83
85
|
self->counter, self->flags);
|
84
|
-
|
86
|
+
store_cv_words(cv, cv_words);
|
85
87
|
}
|
86
88
|
|
87
|
-
INLINE void output_root_bytes(const output_t *self, uint8_t *out,
|
89
|
+
INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out,
|
88
90
|
size_t out_len) {
|
89
|
-
uint64_t output_block_counter =
|
91
|
+
uint64_t output_block_counter = seek / 64;
|
92
|
+
size_t offset_within_block = seek % 64;
|
90
93
|
uint8_t wide_buf[64];
|
91
94
|
while (out_len > 0) {
|
92
95
|
blake3_compress_xof(self->input_cv, self->block, self->block_len,
|
93
96
|
output_block_counter, self->flags | ROOT, wide_buf);
|
97
|
+
size_t available_bytes = 64 - offset_within_block;
|
94
98
|
size_t memcpy_len;
|
95
|
-
if (out_len >
|
96
|
-
memcpy_len =
|
99
|
+
if (out_len > available_bytes) {
|
100
|
+
memcpy_len = available_bytes;
|
97
101
|
} else {
|
98
102
|
memcpy_len = out_len;
|
99
103
|
}
|
100
|
-
memcpy(out, wide_buf, memcpy_len);
|
104
|
+
memcpy(out, wide_buf + offset_within_block, memcpy_len);
|
101
105
|
out += memcpy_len;
|
102
106
|
out_len -= memcpy_len;
|
103
107
|
output_block_counter += 1;
|
108
|
+
offset_within_block = 0;
|
104
109
|
}
|
105
110
|
}
|
106
111
|
|
@@ -256,10 +261,11 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
|
|
256
261
|
// Why not just have the caller split the input on the first update(), instead
|
257
262
|
// of implementing this special rule? Because we don't want to limit SIMD or
|
258
263
|
// multi-threading parallelism for that update().
|
259
|
-
size_t blake3_compress_subtree_wide(const uint8_t *input,
|
260
|
-
|
261
|
-
|
262
|
-
|
264
|
+
static size_t blake3_compress_subtree_wide(const uint8_t *input,
|
265
|
+
size_t input_len,
|
266
|
+
const uint32_t key[8],
|
267
|
+
uint64_t chunk_counter,
|
268
|
+
uint8_t flags, uint8_t *out) {
|
263
269
|
// Note that the single chunk case does *not* bump the SIMD degree up to 2
|
264
270
|
// when it is 1. If this implementation adds multi-threading in the future,
|
265
271
|
// this gives us the option of multi-threading even the 2-chunk case, which
|
@@ -331,15 +337,21 @@ INLINE void compress_subtree_to_parent_node(
|
|
331
337
|
assert(input_len > BLAKE3_CHUNK_LEN);
|
332
338
|
#endif
|
333
339
|
|
334
|
-
uint8_t cv_array[
|
340
|
+
uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
|
335
341
|
size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
|
336
342
|
chunk_counter, flags, cv_array);
|
343
|
+
assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
|
337
344
|
|
338
345
|
// If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
|
339
346
|
// compress_subtree_wide() returns more than 2 chaining values. Condense
|
340
347
|
// them into 2 by forming parent nodes repeatedly.
|
341
348
|
uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
|
342
|
-
|
349
|
+
// The second half of this loop condition is always true, and we just
|
350
|
+
// asserted it above. But GCC can't tell that it's always true, and if NDEBUG
|
351
|
+
// is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
|
352
|
+
// warnings here. GCC 8.5 is particularly sensitive, so if you're changing
|
353
|
+
// this code, test it against that version.
|
354
|
+
while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
|
343
355
|
num_cvs =
|
344
356
|
compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
|
345
357
|
memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
|
@@ -363,10 +375,11 @@ void blake3_hasher_init_keyed(blake3_hasher *self,
|
|
363
375
|
hasher_init_base(self, key_words, KEYED_HASH);
|
364
376
|
}
|
365
377
|
|
366
|
-
void
|
378
|
+
void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
|
379
|
+
size_t context_len) {
|
367
380
|
blake3_hasher context_hasher;
|
368
381
|
hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT);
|
369
|
-
blake3_hasher_update(&context_hasher, context,
|
382
|
+
blake3_hasher_update(&context_hasher, context, context_len);
|
370
383
|
uint8_t context_key[BLAKE3_KEY_LEN];
|
371
384
|
blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN);
|
372
385
|
uint32_t context_key_words[8];
|
@@ -374,6 +387,10 @@ void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) {
|
|
374
387
|
hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL);
|
375
388
|
}
|
376
389
|
|
390
|
+
void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) {
|
391
|
+
blake3_hasher_init_derive_key_raw(self, context, strlen(context));
|
392
|
+
}
|
393
|
+
|
377
394
|
// As described in hasher_push_cv() below, we do "lazy merging", delaying
|
378
395
|
// merges until right before the next CV is about to be added. This is
|
379
396
|
// different from the reference implementation. Another difference is that we
|
@@ -425,8 +442,8 @@ INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) {
|
|
425
442
|
// compress_subtree_to_parent_node(). That function always returns the top
|
426
443
|
// *two* chaining values of the subtree it's compressing. We then do lazy
|
427
444
|
// merging with each of them separately, so that the second CV will always
|
428
|
-
// remain unmerged. (
|
429
|
-
//
|
445
|
+
// remain unmerged. (That also helps us support extendable output when we're
|
446
|
+
// hashing an input all-at-once.)
|
430
447
|
INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN],
|
431
448
|
uint64_t chunk_counter) {
|
432
449
|
hasher_merge_cv_stack(self, chunk_counter);
|
@@ -472,8 +489,8 @@ void blake3_hasher_update(blake3_hasher *self, const void *input,
|
|
472
489
|
|
473
490
|
// Now the chunk_state is clear, and we have more input. If there's more than
|
474
491
|
// a single chunk (so, definitely not the root chunk), hash the largest whole
|
475
|
-
// subtree we can, with the full benefits of SIMD and
|
476
|
-
// parallelism. Two restrictions:
|
492
|
+
// subtree we can, with the full benefits of SIMD (and maybe in the future,
|
493
|
+
// multi-threading) parallelism. Two restrictions:
|
477
494
|
// - The subtree has to be a power-of-2 number of chunks. Only subtrees along
|
478
495
|
// the right edge can be incomplete, and we don't know where the right edge
|
479
496
|
// is going to be until we get to finalize().
|
@@ -546,6 +563,11 @@ void blake3_hasher_update(blake3_hasher *self, const void *input,
|
|
546
563
|
|
547
564
|
void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
|
548
565
|
size_t out_len) {
|
566
|
+
blake3_hasher_finalize_seek(self, 0, out, out_len);
|
567
|
+
}
|
568
|
+
|
569
|
+
void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
|
570
|
+
uint8_t *out, size_t out_len) {
|
549
571
|
// Explicitly checking for zero avoids causing UB by passing a null pointer
|
550
572
|
// to memcpy. This comes up in practice with things like:
|
551
573
|
// std::vector<uint8_t> v;
|
@@ -557,7 +579,7 @@ void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
|
|
557
579
|
// If the subtree stack is empty, then the current chunk is the root.
|
558
580
|
if (self->cv_stack_len == 0) {
|
559
581
|
output_t output = chunk_state_output(&self->chunk);
|
560
|
-
output_root_bytes(&output, out, out_len);
|
582
|
+
output_root_bytes(&output, seek, out, out_len);
|
561
583
|
return;
|
562
584
|
}
|
563
585
|
// If there are any bytes in the chunk state, finalize that chunk and do a
|
@@ -585,5 +607,5 @@ void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
|
|
585
607
|
output_chaining_value(&output, &parent_block[32]);
|
586
608
|
output = parent_output(parent_block, self->key, self->chunk.flags);
|
587
609
|
}
|
588
|
-
output_root_bytes(&output, out, out_len);
|
610
|
+
output_root_bytes(&output, seek, out, out_len);
|
589
611
|
}
|
data/ext/digest/blake3/blake3.h
CHANGED
@@ -4,16 +4,16 @@
|
|
4
4
|
#include <stddef.h>
|
5
5
|
#include <stdint.h>
|
6
6
|
|
7
|
-
#ifdef
|
7
|
+
#ifdef __cplusplus
|
8
8
|
extern "C" {
|
9
9
|
#endif
|
10
10
|
|
11
|
+
#define BLAKE3_VERSION_STRING "1.2.0"
|
11
12
|
#define BLAKE3_KEY_LEN 32
|
12
13
|
#define BLAKE3_OUT_LEN 32
|
13
14
|
#define BLAKE3_BLOCK_LEN 64
|
14
15
|
#define BLAKE3_CHUNK_LEN 1024
|
15
16
|
#define BLAKE3_MAX_DEPTH 54
|
16
|
-
#define BLAKE3_MAX_SIMD_DEGREE 16
|
17
17
|
|
18
18
|
// This struct is a private implementation detail. It has to be here because
|
19
19
|
// it's part of blake3_hasher below.
|
@@ -38,16 +38,21 @@ typedef struct {
|
|
38
38
|
uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
|
39
39
|
} blake3_hasher;
|
40
40
|
|
41
|
+
const char *blake3_version(void);
|
41
42
|
void blake3_hasher_init(blake3_hasher *self);
|
42
43
|
void blake3_hasher_init_keyed(blake3_hasher *self,
|
43
44
|
const uint8_t key[BLAKE3_KEY_LEN]);
|
44
45
|
void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
|
46
|
+
void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
|
47
|
+
size_t context_len);
|
45
48
|
void blake3_hasher_update(blake3_hasher *self, const void *input,
|
46
49
|
size_t input_len);
|
47
50
|
void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
|
48
51
|
size_t out_len);
|
52
|
+
void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
|
53
|
+
uint8_t *out, size_t out_len);
|
49
54
|
|
50
|
-
#ifdef
|
55
|
+
#ifdef __cplusplus
|
51
56
|
}
|
52
57
|
#endif
|
53
58
|
|
data/ext/digest/blake3/blake3_avx2_x86-64_unix.S
CHANGED
@@ -1,3 +1,17 @@
|
|
1
|
+
#if defined(__ELF__) && defined(__linux__)
|
2
|
+
.section .note.GNU-stack,"",%progbits
|
3
|
+
#endif
|
4
|
+
|
5
|
+
#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
|
6
|
+
#if __has_include(<cet.h>)
|
7
|
+
#include <cet.h>
|
8
|
+
#endif
|
9
|
+
#endif
|
10
|
+
|
11
|
+
#if !defined(_CET_ENDBR)
|
12
|
+
#define _CET_ENDBR
|
13
|
+
#endif
|
14
|
+
|
1
15
|
.intel_syntax noprefix
|
2
16
|
.global _blake3_hash_many_avx2
|
3
17
|
.global blake3_hash_many_avx2
|
@@ -9,6 +23,7 @@
|
|
9
23
|
.p2align 6
|
10
24
|
_blake3_hash_many_avx2:
|
11
25
|
blake3_hash_many_avx2:
|
26
|
+
_CET_ENDBR
|
12
27
|
push r15
|
13
28
|
push r14
|
14
29
|
push r13
|
data/ext/digest/blake3/blake3_avx512_x86-64_unix.S
CHANGED
@@ -1,5 +1,18 @@
|
|
1
|
-
|
1
|
+
#if defined(__ELF__) && defined(__linux__)
|
2
|
+
.section .note.GNU-stack,"",%progbits
|
3
|
+
#endif
|
4
|
+
|
5
|
+
#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
|
6
|
+
#if __has_include(<cet.h>)
|
7
|
+
#include <cet.h>
|
8
|
+
#endif
|
9
|
+
#endif
|
10
|
+
|
11
|
+
#if !defined(_CET_ENDBR)
|
12
|
+
#define _CET_ENDBR
|
13
|
+
#endif
|
2
14
|
|
15
|
+
.intel_syntax noprefix
|
3
16
|
.global _blake3_hash_many_avx512
|
4
17
|
.global blake3_hash_many_avx512
|
5
18
|
.global blake3_compress_in_place_avx512
|
@@ -15,6 +28,7 @@
|
|
15
28
|
.p2align 6
|
16
29
|
_blake3_hash_many_avx512:
|
17
30
|
blake3_hash_many_avx512:
|
31
|
+
_CET_ENDBR
|
18
32
|
push r15
|
19
33
|
push r14
|
20
34
|
push r13
|
@@ -82,15 +96,15 @@ blake3_hash_many_avx512:
|
|
82
96
|
mov r14, qword ptr [rdi+0x50]
|
83
97
|
mov r15, qword ptr [rdi+0x58]
|
84
98
|
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
|
85
|
-
|
99
|
+
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
|
86
100
|
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
|
87
|
-
|
101
|
+
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
|
88
102
|
vpunpcklqdq zmm8, zmm16, zmm17
|
89
103
|
vpunpckhqdq zmm9, zmm16, zmm17
|
90
104
|
vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
|
91
|
-
|
105
|
+
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
|
92
106
|
vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
|
93
|
-
|
107
|
+
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
|
94
108
|
vpunpcklqdq zmm10, zmm18, zmm19
|
95
109
|
vpunpckhqdq zmm11, zmm18, zmm19
|
96
110
|
mov r8, qword ptr [rdi+0x20]
|
@@ -102,15 +116,15 @@ blake3_hash_many_avx512:
|
|
102
116
|
mov r14, qword ptr [rdi+0x70]
|
103
117
|
mov r15, qword ptr [rdi+0x78]
|
104
118
|
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
|
105
|
-
|
119
|
+
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
|
106
120
|
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
|
107
|
-
|
121
|
+
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
|
108
122
|
vpunpcklqdq zmm12, zmm16, zmm17
|
109
123
|
vpunpckhqdq zmm13, zmm16, zmm17
|
110
124
|
vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
|
111
|
-
|
125
|
+
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
|
112
126
|
vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
|
113
|
-
|
127
|
+
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
|
114
128
|
vpunpcklqdq zmm14, zmm18, zmm19
|
115
129
|
vpunpckhqdq zmm15, zmm18, zmm19
|
116
130
|
vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
|
@@ -144,15 +158,15 @@ blake3_hash_many_avx512:
|
|
144
158
|
mov r14, qword ptr [rdi+0x50]
|
145
159
|
mov r15, qword ptr [rdi+0x58]
|
146
160
|
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
|
147
|
-
|
161
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
|
148
162
|
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
|
149
|
-
|
163
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
|
150
164
|
vpunpcklqdq zmm8, zmm24, zmm25
|
151
165
|
vpunpckhqdq zmm9, zmm24, zmm25
|
152
166
|
vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
|
153
|
-
|
167
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
|
154
168
|
vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
|
155
|
-
|
169
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
|
156
170
|
vpunpcklqdq zmm10, zmm24, zmm25
|
157
171
|
vpunpckhqdq zmm11, zmm24, zmm25
|
158
172
|
prefetcht0 [r8+rdx+0x80]
|
@@ -172,15 +186,15 @@ blake3_hash_many_avx512:
|
|
172
186
|
mov r14, qword ptr [rdi+0x70]
|
173
187
|
mov r15, qword ptr [rdi+0x78]
|
174
188
|
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
|
175
|
-
|
189
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
|
176
190
|
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
|
177
|
-
|
191
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
|
178
192
|
vpunpcklqdq zmm12, zmm24, zmm25
|
179
193
|
vpunpckhqdq zmm13, zmm24, zmm25
|
180
194
|
vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
|
181
|
-
|
195
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
|
182
196
|
vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
|
183
|
-
|
197
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
|
184
198
|
vpunpcklqdq zmm14, zmm24, zmm25
|
185
199
|
vpunpckhqdq zmm15, zmm24, zmm25
|
186
200
|
prefetcht0 [r8+rdx+0x80]
|
@@ -2039,7 +2053,7 @@ blake3_hash_many_avx512:
|
|
2039
2053
|
vpermq ymm14, ymm14, 0xDC
|
2040
2054
|
vpermq ymm15, ymm15, 0xDC
|
2041
2055
|
vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
|
2042
|
-
|
2056
|
+
vinserti64x4 zmm13, zmm14, ymm15, 0x01
|
2043
2057
|
mov eax, 17476
|
2044
2058
|
kmovw k2, eax
|
2045
2059
|
vpblendmd zmm13 {k2}, zmm13, zmm12
|
@@ -2372,6 +2386,7 @@ blake3_hash_many_avx512:
|
|
2372
2386
|
.p2align 6
|
2373
2387
|
_blake3_compress_in_place_avx512:
|
2374
2388
|
blake3_compress_in_place_avx512:
|
2389
|
+
_CET_ENDBR
|
2375
2390
|
vmovdqu xmm0, xmmword ptr [rdi]
|
2376
2391
|
vmovdqu xmm1, xmmword ptr [rdi+0x10]
|
2377
2392
|
movzx eax, r8b
|
@@ -2454,6 +2469,7 @@ blake3_compress_in_place_avx512:
|
|
2454
2469
|
.p2align 6
|
2455
2470
|
_blake3_compress_xof_avx512:
|
2456
2471
|
blake3_compress_xof_avx512:
|
2472
|
+
_CET_ENDBR
|
2457
2473
|
vmovdqu xmm0, xmmword ptr [rdi]
|
2458
2474
|
vmovdqu xmm1, xmmword ptr [rdi+0x10]
|
2459
2475
|
movzx eax, r8b
|
data/ext/digest/blake3/blake3_avx512_x86-64_windows_gnu.S
CHANGED
@@ -96,15 +96,15 @@ blake3_hash_many_avx512:
|
|
96
96
|
mov r14, qword ptr [rdi+0x50]
|
97
97
|
mov r15, qword ptr [rdi+0x58]
|
98
98
|
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
|
99
|
-
|
99
|
+
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
|
100
100
|
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
|
101
|
-
|
101
|
+
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
|
102
102
|
vpunpcklqdq zmm8, zmm16, zmm17
|
103
103
|
vpunpckhqdq zmm9, zmm16, zmm17
|
104
104
|
vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
|
105
|
-
|
105
|
+
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
|
106
106
|
vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
|
107
|
-
|
107
|
+
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
|
108
108
|
vpunpcklqdq zmm10, zmm18, zmm19
|
109
109
|
vpunpckhqdq zmm11, zmm18, zmm19
|
110
110
|
mov r8, qword ptr [rdi+0x20]
|
@@ -116,15 +116,15 @@ blake3_hash_many_avx512:
|
|
116
116
|
mov r14, qword ptr [rdi+0x70]
|
117
117
|
mov r15, qword ptr [rdi+0x78]
|
118
118
|
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
|
119
|
-
|
119
|
+
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
|
120
120
|
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
|
121
|
-
|
121
|
+
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
|
122
122
|
vpunpcklqdq zmm12, zmm16, zmm17
|
123
123
|
vpunpckhqdq zmm13, zmm16, zmm17
|
124
124
|
vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
|
125
|
-
|
125
|
+
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
|
126
126
|
vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
|
127
|
-
|
127
|
+
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
|
128
128
|
vpunpcklqdq zmm14, zmm18, zmm19
|
129
129
|
vpunpckhqdq zmm15, zmm18, zmm19
|
130
130
|
vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
|
@@ -158,15 +158,15 @@ blake3_hash_many_avx512:
|
|
158
158
|
mov r14, qword ptr [rdi+0x50]
|
159
159
|
mov r15, qword ptr [rdi+0x58]
|
160
160
|
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
|
161
|
-
|
161
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
|
162
162
|
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
|
163
|
-
|
163
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
|
164
164
|
vpunpcklqdq zmm8, zmm24, zmm25
|
165
165
|
vpunpckhqdq zmm9, zmm24, zmm25
|
166
166
|
vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
|
167
|
-
|
167
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
|
168
168
|
vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
|
169
|
-
|
169
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
|
170
170
|
vpunpcklqdq zmm10, zmm24, zmm25
|
171
171
|
vpunpckhqdq zmm11, zmm24, zmm25
|
172
172
|
prefetcht0 [r8+rdx+0x80]
|
@@ -186,15 +186,15 @@ blake3_hash_many_avx512:
|
|
186
186
|
mov r14, qword ptr [rdi+0x70]
|
187
187
|
mov r15, qword ptr [rdi+0x78]
|
188
188
|
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
|
189
|
-
|
189
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
|
190
190
|
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
|
191
|
-
|
191
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
|
192
192
|
vpunpcklqdq zmm12, zmm24, zmm25
|
193
193
|
vpunpckhqdq zmm13, zmm24, zmm25
|
194
194
|
vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
|
195
|
-
|
195
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
|
196
196
|
vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
|
197
|
-
|
197
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
|
198
198
|
vpunpcklqdq zmm14, zmm24, zmm25
|
199
199
|
vpunpckhqdq zmm15, zmm24, zmm25
|
200
200
|
prefetcht0 [r8+rdx+0x80]
|
@@ -2065,7 +2065,7 @@ blake3_hash_many_avx512:
|
|
2065
2065
|
vpermq ymm14, ymm14, 0xDC
|
2066
2066
|
vpermq ymm15, ymm15, 0xDC
|
2067
2067
|
vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
|
2068
|
-
|
2068
|
+
vinserti64x4 zmm13, zmm14, ymm15, 0x01
|
2069
2069
|
mov eax, 17476
|
2070
2070
|
kmovw k2, eax
|
2071
2071
|
vpblendmd zmm13 {k2}, zmm13, zmm12
|
data/ext/digest/blake3/blake3_avx512_x86-64_windows_msvc.asm
CHANGED
@@ -99,15 +99,15 @@ innerloop16:
|
|
99
99
|
mov r14, qword ptr [rdi+50H]
|
100
100
|
mov r15, qword ptr [rdi+58H]
|
101
101
|
vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H]
|
102
|
-
|
102
|
+
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
|
103
103
|
vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H]
|
104
|
-
|
104
|
+
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
|
105
105
|
vpunpcklqdq zmm8, zmm16, zmm17
|
106
106
|
vpunpckhqdq zmm9, zmm16, zmm17
|
107
107
|
vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H]
|
108
|
-
|
108
|
+
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
|
109
109
|
vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H]
|
110
|
-
|
110
|
+
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
|
111
111
|
vpunpcklqdq zmm10, zmm18, zmm19
|
112
112
|
vpunpckhqdq zmm11, zmm18, zmm19
|
113
113
|
mov r8, qword ptr [rdi+20H]
|
@@ -119,15 +119,15 @@ innerloop16:
|
|
119
119
|
mov r14, qword ptr [rdi+70H]
|
120
120
|
mov r15, qword ptr [rdi+78H]
|
121
121
|
vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H]
|
122
|
-
|
122
|
+
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
|
123
123
|
vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H]
|
124
|
-
|
124
|
+
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
|
125
125
|
vpunpcklqdq zmm12, zmm16, zmm17
|
126
126
|
vpunpckhqdq zmm13, zmm16, zmm17
|
127
127
|
vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H]
|
128
|
-
|
128
|
+
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
|
129
129
|
vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H]
|
130
|
-
|
130
|
+
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
|
131
131
|
vpunpcklqdq zmm14, zmm18, zmm19
|
132
132
|
vpunpckhqdq zmm15, zmm18, zmm19
|
133
133
|
vmovdqa32 zmm27, zmmword ptr [INDEX0]
|
@@ -161,15 +161,15 @@ innerloop16:
|
|
161
161
|
mov r14, qword ptr [rdi+50H]
|
162
162
|
mov r15, qword ptr [rdi+58H]
|
163
163
|
vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H]
|
164
|
-
|
164
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
|
165
165
|
vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H]
|
166
|
-
|
166
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
|
167
167
|
vpunpcklqdq zmm8, zmm24, zmm25
|
168
168
|
vpunpckhqdq zmm9, zmm24, zmm25
|
169
169
|
vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H]
|
170
|
-
|
170
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
|
171
171
|
vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H]
|
172
|
-
|
172
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
|
173
173
|
vpunpcklqdq zmm10, zmm24, zmm25
|
174
174
|
vpunpckhqdq zmm11, zmm24, zmm25
|
175
175
|
prefetcht0 byte ptr [r8+rdx+80H]
|
@@ -189,15 +189,15 @@ innerloop16:
|
|
189
189
|
mov r14, qword ptr [rdi+70H]
|
190
190
|
mov r15, qword ptr [rdi+78H]
|
191
191
|
vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H]
|
192
|
-
|
192
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
|
193
193
|
vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H]
|
194
|
-
|
194
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
|
195
195
|
vpunpcklqdq zmm12, zmm24, zmm25
|
196
196
|
vpunpckhqdq zmm13, zmm24, zmm25
|
197
197
|
vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H]
|
198
|
-
|
198
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
|
199
199
|
vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H]
|
200
|
-
|
200
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
|
201
201
|
vpunpcklqdq zmm14, zmm24, zmm25
|
202
202
|
vpunpckhqdq zmm15, zmm24, zmm25
|
203
203
|
prefetcht0 byte ptr [r8+rdx+80H]
|
@@ -2073,7 +2073,7 @@ final7blocks:
|
|
2073
2073
|
vpermq ymm14, ymm14, 0DCH
|
2074
2074
|
vpermq ymm15, ymm15, 0DCH
|
2075
2075
|
vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN]
|
2076
|
-
|
2076
|
+
vinserti64x4 zmm13, zmm14, ymm15, 01H
|
2077
2077
|
mov eax, 17476
|
2078
2078
|
kmovw k2, eax
|
2079
2079
|
vpblendmd zmm13 {k2}, zmm13, zmm12
|