digest-blake3 0.22.1 → 1.2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/README.md +1 -1
- data/ext/digest/blake3/blake3.c +42 -20
- data/ext/digest/blake3/blake3.h +8 -3
- data/ext/digest/blake3/blake3_avx2_x86-64_unix.S +15 -0
- data/ext/digest/blake3/blake3_avx512_x86-64_unix.S +34 -18
- data/ext/digest/blake3/blake3_avx512_x86-64_windows_gnu.S +17 -17
- data/ext/digest/blake3/blake3_avx512_x86-64_windows_msvc.asm +17 -17
- data/ext/digest/blake3/blake3_dispatch.c +35 -71
- data/ext/digest/blake3/blake3_impl.h +117 -2
- data/ext/digest/blake3/blake3_neon.c +6 -1
- data/ext/digest/blake3/blake3_portable.c +1 -9
- data/ext/digest/blake3/blake3_sse2.c +565 -0
- data/ext/digest/blake3/blake3_sse2_x86-64_unix.S +2291 -0
- data/ext/digest/blake3/blake3_sse2_x86-64_windows_gnu.S +2332 -0
- data/ext/digest/blake3/blake3_sse2_x86-64_windows_msvc.asm +2350 -0
- data/ext/digest/blake3/blake3_sse41_x86-64_unix.S +17 -0
- data/ext/digest/blake3/blake3_sse41_x86-64_windows_gnu.S +19 -7
- data/ext/digest/blake3/blake3_sse41_x86-64_windows_msvc.asm +23 -11
- data/ext/digest/blake3/extconf.rb +4 -3
- data/lib/digest/blake3/version.rb +1 -1
- metadata +10 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ec7a77d6875b688e1cb1fbe8470cbf67278f9fe3f2f8e516bafe7abc0bf54bc4
|
4
|
+
data.tar.gz: 74e13b2480eccd5c2fe3fa913a0962217c1f07c95b5db80b8303086488ee5d9f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: de0fb7b5ccce755c313da8e547a430950d181170c64561746890ce8855ce5e09d3232b16316f36d22320ae5d23cf7904e8221a26358e96d9566ba247ef613214
|
7
|
+
data.tar.gz: 33e15e9469128ba227dbe6b57d9c44fe55078b9031975bf9db783a469c93342c7ccbf38b763ddfed7f09c941a42a6df89302cfda0e38b0ad4967a12acac4b18a
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -58,7 +58,7 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
|
|
58
58
|
|
59
59
|
## Contributing
|
60
60
|
|
61
|
-
Bug reports and pull requests are welcome on GitHub at https://github.com/
|
61
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/willbryant/digest-blake3.
|
62
62
|
|
63
63
|
## License
|
64
64
|
|
data/ext/digest/blake3/blake3.c
CHANGED
@@ -5,6 +5,8 @@
|
|
5
5
|
#include "blake3.h"
|
6
6
|
#include "blake3_impl.h"
|
7
7
|
|
8
|
+
const char *blake3_version(void) { return BLAKE3_VERSION_STRING; }
|
9
|
+
|
8
10
|
INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8],
|
9
11
|
uint8_t flags) {
|
10
12
|
memcpy(self->cv, key, BLAKE3_KEY_LEN);
|
@@ -81,26 +83,29 @@ INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) {
|
|
81
83
|
memcpy(cv_words, self->input_cv, 32);
|
82
84
|
blake3_compress_in_place(cv_words, self->block, self->block_len,
|
83
85
|
self->counter, self->flags);
|
84
|
-
|
86
|
+
store_cv_words(cv, cv_words);
|
85
87
|
}
|
86
88
|
|
87
|
-
INLINE void output_root_bytes(const output_t *self, uint8_t *out,
|
89
|
+
INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out,
|
88
90
|
size_t out_len) {
|
89
|
-
uint64_t output_block_counter =
|
91
|
+
uint64_t output_block_counter = seek / 64;
|
92
|
+
size_t offset_within_block = seek % 64;
|
90
93
|
uint8_t wide_buf[64];
|
91
94
|
while (out_len > 0) {
|
92
95
|
blake3_compress_xof(self->input_cv, self->block, self->block_len,
|
93
96
|
output_block_counter, self->flags | ROOT, wide_buf);
|
97
|
+
size_t available_bytes = 64 - offset_within_block;
|
94
98
|
size_t memcpy_len;
|
95
|
-
if (out_len >
|
96
|
-
memcpy_len =
|
99
|
+
if (out_len > available_bytes) {
|
100
|
+
memcpy_len = available_bytes;
|
97
101
|
} else {
|
98
102
|
memcpy_len = out_len;
|
99
103
|
}
|
100
|
-
memcpy(out, wide_buf, memcpy_len);
|
104
|
+
memcpy(out, wide_buf + offset_within_block, memcpy_len);
|
101
105
|
out += memcpy_len;
|
102
106
|
out_len -= memcpy_len;
|
103
107
|
output_block_counter += 1;
|
108
|
+
offset_within_block = 0;
|
104
109
|
}
|
105
110
|
}
|
106
111
|
|
@@ -256,10 +261,11 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
|
|
256
261
|
// Why not just have the caller split the input on the first update(), instead
|
257
262
|
// of implementing this special rule? Because we don't want to limit SIMD or
|
258
263
|
// multi-threading parallelism for that update().
|
259
|
-
size_t blake3_compress_subtree_wide(const uint8_t *input,
|
260
|
-
|
261
|
-
|
262
|
-
|
264
|
+
static size_t blake3_compress_subtree_wide(const uint8_t *input,
|
265
|
+
size_t input_len,
|
266
|
+
const uint32_t key[8],
|
267
|
+
uint64_t chunk_counter,
|
268
|
+
uint8_t flags, uint8_t *out) {
|
263
269
|
// Note that the single chunk case does *not* bump the SIMD degree up to 2
|
264
270
|
// when it is 1. If this implementation adds multi-threading in the future,
|
265
271
|
// this gives us the option of multi-threading even the 2-chunk case, which
|
@@ -331,15 +337,21 @@ INLINE void compress_subtree_to_parent_node(
|
|
331
337
|
assert(input_len > BLAKE3_CHUNK_LEN);
|
332
338
|
#endif
|
333
339
|
|
334
|
-
uint8_t cv_array[
|
340
|
+
uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
|
335
341
|
size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
|
336
342
|
chunk_counter, flags, cv_array);
|
343
|
+
assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
|
337
344
|
|
338
345
|
// If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
|
339
346
|
// compress_subtree_wide() returns more than 2 chaining values. Condense
|
340
347
|
// them into 2 by forming parent nodes repeatedly.
|
341
348
|
uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
|
342
|
-
|
349
|
+
// The second half of this loop condition is always true, and we just
|
350
|
+
// asserted it above. But GCC can't tell that it's always true, and if NDEBUG
|
351
|
+
// is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
|
352
|
+
// warnings here. GCC 8.5 is particularly sensitive, so if you're changing
|
353
|
+
// this code, test it against that version.
|
354
|
+
while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
|
343
355
|
num_cvs =
|
344
356
|
compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
|
345
357
|
memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
|
@@ -363,10 +375,11 @@ void blake3_hasher_init_keyed(blake3_hasher *self,
|
|
363
375
|
hasher_init_base(self, key_words, KEYED_HASH);
|
364
376
|
}
|
365
377
|
|
366
|
-
void
|
378
|
+
void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
|
379
|
+
size_t context_len) {
|
367
380
|
blake3_hasher context_hasher;
|
368
381
|
hasher_init_base(&context_hasher, IV, DERIVE_KEY_CONTEXT);
|
369
|
-
blake3_hasher_update(&context_hasher, context,
|
382
|
+
blake3_hasher_update(&context_hasher, context, context_len);
|
370
383
|
uint8_t context_key[BLAKE3_KEY_LEN];
|
371
384
|
blake3_hasher_finalize(&context_hasher, context_key, BLAKE3_KEY_LEN);
|
372
385
|
uint32_t context_key_words[8];
|
@@ -374,6 +387,10 @@ void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) {
|
|
374
387
|
hasher_init_base(self, context_key_words, DERIVE_KEY_MATERIAL);
|
375
388
|
}
|
376
389
|
|
390
|
+
void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context) {
|
391
|
+
blake3_hasher_init_derive_key_raw(self, context, strlen(context));
|
392
|
+
}
|
393
|
+
|
377
394
|
// As described in hasher_push_cv() below, we do "lazy merging", delaying
|
378
395
|
// merges until right before the next CV is about to be added. This is
|
379
396
|
// different from the reference implementation. Another difference is that we
|
@@ -425,8 +442,8 @@ INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) {
|
|
425
442
|
// compress_subtree_to_parent_node(). That function always returns the top
|
426
443
|
// *two* chaining values of the subtree it's compressing. We then do lazy
|
427
444
|
// merging with each of them separately, so that the second CV will always
|
428
|
-
// remain unmerged. (
|
429
|
-
//
|
445
|
+
// remain unmerged. (That also helps us support extendable output when we're
|
446
|
+
// hashing an input all-at-once.)
|
430
447
|
INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN],
|
431
448
|
uint64_t chunk_counter) {
|
432
449
|
hasher_merge_cv_stack(self, chunk_counter);
|
@@ -472,8 +489,8 @@ void blake3_hasher_update(blake3_hasher *self, const void *input,
|
|
472
489
|
|
473
490
|
// Now the chunk_state is clear, and we have more input. If there's more than
|
474
491
|
// a single chunk (so, definitely not the root chunk), hash the largest whole
|
475
|
-
// subtree we can, with the full benefits of SIMD and
|
476
|
-
// parallelism. Two restrictions:
|
492
|
+
// subtree we can, with the full benefits of SIMD (and maybe in the future,
|
493
|
+
// multi-threading) parallelism. Two restrictions:
|
477
494
|
// - The subtree has to be a power-of-2 number of chunks. Only subtrees along
|
478
495
|
// the right edge can be incomplete, and we don't know where the right edge
|
479
496
|
// is going to be until we get to finalize().
|
@@ -546,6 +563,11 @@ void blake3_hasher_update(blake3_hasher *self, const void *input,
|
|
546
563
|
|
547
564
|
void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
|
548
565
|
size_t out_len) {
|
566
|
+
blake3_hasher_finalize_seek(self, 0, out, out_len);
|
567
|
+
}
|
568
|
+
|
569
|
+
void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
|
570
|
+
uint8_t *out, size_t out_len) {
|
549
571
|
// Explicitly checking for zero avoids causing UB by passing a null pointer
|
550
572
|
// to memcpy. This comes up in practice with things like:
|
551
573
|
// std::vector<uint8_t> v;
|
@@ -557,7 +579,7 @@ void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
|
|
557
579
|
// If the subtree stack is empty, then the current chunk is the root.
|
558
580
|
if (self->cv_stack_len == 0) {
|
559
581
|
output_t output = chunk_state_output(&self->chunk);
|
560
|
-
output_root_bytes(&output, out, out_len);
|
582
|
+
output_root_bytes(&output, seek, out, out_len);
|
561
583
|
return;
|
562
584
|
}
|
563
585
|
// If there are any bytes in the chunk state, finalize that chunk and do a
|
@@ -585,5 +607,5 @@ void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
|
|
585
607
|
output_chaining_value(&output, &parent_block[32]);
|
586
608
|
output = parent_output(parent_block, self->key, self->chunk.flags);
|
587
609
|
}
|
588
|
-
output_root_bytes(&output, out, out_len);
|
610
|
+
output_root_bytes(&output, seek, out, out_len);
|
589
611
|
}
|
data/ext/digest/blake3/blake3.h
CHANGED
@@ -4,16 +4,16 @@
|
|
4
4
|
#include <stddef.h>
|
5
5
|
#include <stdint.h>
|
6
6
|
|
7
|
-
#ifdef
|
7
|
+
#ifdef __cplusplus
|
8
8
|
extern "C" {
|
9
9
|
#endif
|
10
10
|
|
11
|
+
#define BLAKE3_VERSION_STRING "1.2.0"
|
11
12
|
#define BLAKE3_KEY_LEN 32
|
12
13
|
#define BLAKE3_OUT_LEN 32
|
13
14
|
#define BLAKE3_BLOCK_LEN 64
|
14
15
|
#define BLAKE3_CHUNK_LEN 1024
|
15
16
|
#define BLAKE3_MAX_DEPTH 54
|
16
|
-
#define BLAKE3_MAX_SIMD_DEGREE 16
|
17
17
|
|
18
18
|
// This struct is a private implementation detail. It has to be here because
|
19
19
|
// it's part of blake3_hasher below.
|
@@ -38,16 +38,21 @@ typedef struct {
|
|
38
38
|
uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
|
39
39
|
} blake3_hasher;
|
40
40
|
|
41
|
+
const char *blake3_version(void);
|
41
42
|
void blake3_hasher_init(blake3_hasher *self);
|
42
43
|
void blake3_hasher_init_keyed(blake3_hasher *self,
|
43
44
|
const uint8_t key[BLAKE3_KEY_LEN]);
|
44
45
|
void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
|
46
|
+
void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
|
47
|
+
size_t context_len);
|
45
48
|
void blake3_hasher_update(blake3_hasher *self, const void *input,
|
46
49
|
size_t input_len);
|
47
50
|
void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
|
48
51
|
size_t out_len);
|
52
|
+
void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
|
53
|
+
uint8_t *out, size_t out_len);
|
49
54
|
|
50
|
-
#ifdef
|
55
|
+
#ifdef __cplusplus
|
51
56
|
}
|
52
57
|
#endif
|
53
58
|
|
@@ -1,3 +1,17 @@
|
|
1
|
+
#if defined(__ELF__) && defined(__linux__)
|
2
|
+
.section .note.GNU-stack,"",%progbits
|
3
|
+
#endif
|
4
|
+
|
5
|
+
#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
|
6
|
+
#if __has_include(<cet.h>)
|
7
|
+
#include <cet.h>
|
8
|
+
#endif
|
9
|
+
#endif
|
10
|
+
|
11
|
+
#if !defined(_CET_ENDBR)
|
12
|
+
#define _CET_ENDBR
|
13
|
+
#endif
|
14
|
+
|
1
15
|
.intel_syntax noprefix
|
2
16
|
.global _blake3_hash_many_avx2
|
3
17
|
.global blake3_hash_many_avx2
|
@@ -9,6 +23,7 @@
|
|
9
23
|
.p2align 6
|
10
24
|
_blake3_hash_many_avx2:
|
11
25
|
blake3_hash_many_avx2:
|
26
|
+
_CET_ENDBR
|
12
27
|
push r15
|
13
28
|
push r14
|
14
29
|
push r13
|
@@ -1,5 +1,18 @@
|
|
1
|
-
|
1
|
+
#if defined(__ELF__) && defined(__linux__)
|
2
|
+
.section .note.GNU-stack,"",%progbits
|
3
|
+
#endif
|
4
|
+
|
5
|
+
#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
|
6
|
+
#if __has_include(<cet.h>)
|
7
|
+
#include <cet.h>
|
8
|
+
#endif
|
9
|
+
#endif
|
10
|
+
|
11
|
+
#if !defined(_CET_ENDBR)
|
12
|
+
#define _CET_ENDBR
|
13
|
+
#endif
|
2
14
|
|
15
|
+
.intel_syntax noprefix
|
3
16
|
.global _blake3_hash_many_avx512
|
4
17
|
.global blake3_hash_many_avx512
|
5
18
|
.global blake3_compress_in_place_avx512
|
@@ -15,6 +28,7 @@
|
|
15
28
|
.p2align 6
|
16
29
|
_blake3_hash_many_avx512:
|
17
30
|
blake3_hash_many_avx512:
|
31
|
+
_CET_ENDBR
|
18
32
|
push r15
|
19
33
|
push r14
|
20
34
|
push r13
|
@@ -82,15 +96,15 @@ blake3_hash_many_avx512:
|
|
82
96
|
mov r14, qword ptr [rdi+0x50]
|
83
97
|
mov r15, qword ptr [rdi+0x58]
|
84
98
|
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
|
85
|
-
|
99
|
+
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
|
86
100
|
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
|
87
|
-
|
101
|
+
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
|
88
102
|
vpunpcklqdq zmm8, zmm16, zmm17
|
89
103
|
vpunpckhqdq zmm9, zmm16, zmm17
|
90
104
|
vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
|
91
|
-
|
105
|
+
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
|
92
106
|
vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
|
93
|
-
|
107
|
+
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
|
94
108
|
vpunpcklqdq zmm10, zmm18, zmm19
|
95
109
|
vpunpckhqdq zmm11, zmm18, zmm19
|
96
110
|
mov r8, qword ptr [rdi+0x20]
|
@@ -102,15 +116,15 @@ blake3_hash_many_avx512:
|
|
102
116
|
mov r14, qword ptr [rdi+0x70]
|
103
117
|
mov r15, qword ptr [rdi+0x78]
|
104
118
|
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
|
105
|
-
|
119
|
+
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
|
106
120
|
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
|
107
|
-
|
121
|
+
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
|
108
122
|
vpunpcklqdq zmm12, zmm16, zmm17
|
109
123
|
vpunpckhqdq zmm13, zmm16, zmm17
|
110
124
|
vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
|
111
|
-
|
125
|
+
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
|
112
126
|
vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
|
113
|
-
|
127
|
+
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
|
114
128
|
vpunpcklqdq zmm14, zmm18, zmm19
|
115
129
|
vpunpckhqdq zmm15, zmm18, zmm19
|
116
130
|
vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
|
@@ -144,15 +158,15 @@ blake3_hash_many_avx512:
|
|
144
158
|
mov r14, qword ptr [rdi+0x50]
|
145
159
|
mov r15, qword ptr [rdi+0x58]
|
146
160
|
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
|
147
|
-
|
161
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
|
148
162
|
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
|
149
|
-
|
163
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
|
150
164
|
vpunpcklqdq zmm8, zmm24, zmm25
|
151
165
|
vpunpckhqdq zmm9, zmm24, zmm25
|
152
166
|
vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
|
153
|
-
|
167
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
|
154
168
|
vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
|
155
|
-
|
169
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
|
156
170
|
vpunpcklqdq zmm10, zmm24, zmm25
|
157
171
|
vpunpckhqdq zmm11, zmm24, zmm25
|
158
172
|
prefetcht0 [r8+rdx+0x80]
|
@@ -172,15 +186,15 @@ blake3_hash_many_avx512:
|
|
172
186
|
mov r14, qword ptr [rdi+0x70]
|
173
187
|
mov r15, qword ptr [rdi+0x78]
|
174
188
|
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
|
175
|
-
|
189
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
|
176
190
|
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
|
177
|
-
|
191
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
|
178
192
|
vpunpcklqdq zmm12, zmm24, zmm25
|
179
193
|
vpunpckhqdq zmm13, zmm24, zmm25
|
180
194
|
vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
|
181
|
-
|
195
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
|
182
196
|
vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
|
183
|
-
|
197
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
|
184
198
|
vpunpcklqdq zmm14, zmm24, zmm25
|
185
199
|
vpunpckhqdq zmm15, zmm24, zmm25
|
186
200
|
prefetcht0 [r8+rdx+0x80]
|
@@ -2039,7 +2053,7 @@ blake3_hash_many_avx512:
|
|
2039
2053
|
vpermq ymm14, ymm14, 0xDC
|
2040
2054
|
vpermq ymm15, ymm15, 0xDC
|
2041
2055
|
vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
|
2042
|
-
|
2056
|
+
vinserti64x4 zmm13, zmm14, ymm15, 0x01
|
2043
2057
|
mov eax, 17476
|
2044
2058
|
kmovw k2, eax
|
2045
2059
|
vpblendmd zmm13 {k2}, zmm13, zmm12
|
@@ -2372,6 +2386,7 @@ blake3_hash_many_avx512:
|
|
2372
2386
|
.p2align 6
|
2373
2387
|
_blake3_compress_in_place_avx512:
|
2374
2388
|
blake3_compress_in_place_avx512:
|
2389
|
+
_CET_ENDBR
|
2375
2390
|
vmovdqu xmm0, xmmword ptr [rdi]
|
2376
2391
|
vmovdqu xmm1, xmmword ptr [rdi+0x10]
|
2377
2392
|
movzx eax, r8b
|
@@ -2454,6 +2469,7 @@ blake3_compress_in_place_avx512:
|
|
2454
2469
|
.p2align 6
|
2455
2470
|
_blake3_compress_xof_avx512:
|
2456
2471
|
blake3_compress_xof_avx512:
|
2472
|
+
_CET_ENDBR
|
2457
2473
|
vmovdqu xmm0, xmmword ptr [rdi]
|
2458
2474
|
vmovdqu xmm1, xmmword ptr [rdi+0x10]
|
2459
2475
|
movzx eax, r8b
|
@@ -96,15 +96,15 @@ blake3_hash_many_avx512:
|
|
96
96
|
mov r14, qword ptr [rdi+0x50]
|
97
97
|
mov r15, qword ptr [rdi+0x58]
|
98
98
|
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
|
99
|
-
|
99
|
+
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
|
100
100
|
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
|
101
|
-
|
101
|
+
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
|
102
102
|
vpunpcklqdq zmm8, zmm16, zmm17
|
103
103
|
vpunpckhqdq zmm9, zmm16, zmm17
|
104
104
|
vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
|
105
|
-
|
105
|
+
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
|
106
106
|
vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
|
107
|
-
|
107
|
+
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
|
108
108
|
vpunpcklqdq zmm10, zmm18, zmm19
|
109
109
|
vpunpckhqdq zmm11, zmm18, zmm19
|
110
110
|
mov r8, qword ptr [rdi+0x20]
|
@@ -116,15 +116,15 @@ blake3_hash_many_avx512:
|
|
116
116
|
mov r14, qword ptr [rdi+0x70]
|
117
117
|
mov r15, qword ptr [rdi+0x78]
|
118
118
|
vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
|
119
|
-
|
119
|
+
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
|
120
120
|
vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
|
121
|
-
|
121
|
+
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
|
122
122
|
vpunpcklqdq zmm12, zmm16, zmm17
|
123
123
|
vpunpckhqdq zmm13, zmm16, zmm17
|
124
124
|
vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
|
125
|
-
|
125
|
+
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
|
126
126
|
vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
|
127
|
-
|
127
|
+
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
|
128
128
|
vpunpcklqdq zmm14, zmm18, zmm19
|
129
129
|
vpunpckhqdq zmm15, zmm18, zmm19
|
130
130
|
vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
|
@@ -158,15 +158,15 @@ blake3_hash_many_avx512:
|
|
158
158
|
mov r14, qword ptr [rdi+0x50]
|
159
159
|
mov r15, qword ptr [rdi+0x58]
|
160
160
|
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
|
161
|
-
|
161
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
|
162
162
|
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
|
163
|
-
|
163
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
|
164
164
|
vpunpcklqdq zmm8, zmm24, zmm25
|
165
165
|
vpunpckhqdq zmm9, zmm24, zmm25
|
166
166
|
vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
|
167
|
-
|
167
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
|
168
168
|
vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
|
169
|
-
|
169
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
|
170
170
|
vpunpcklqdq zmm10, zmm24, zmm25
|
171
171
|
vpunpckhqdq zmm11, zmm24, zmm25
|
172
172
|
prefetcht0 [r8+rdx+0x80]
|
@@ -186,15 +186,15 @@ blake3_hash_many_avx512:
|
|
186
186
|
mov r14, qword ptr [rdi+0x70]
|
187
187
|
mov r15, qword ptr [rdi+0x78]
|
188
188
|
vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
|
189
|
-
|
189
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
|
190
190
|
vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
|
191
|
-
|
191
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
|
192
192
|
vpunpcklqdq zmm12, zmm24, zmm25
|
193
193
|
vpunpckhqdq zmm13, zmm24, zmm25
|
194
194
|
vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
|
195
|
-
|
195
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
|
196
196
|
vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
|
197
|
-
|
197
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
|
198
198
|
vpunpcklqdq zmm14, zmm24, zmm25
|
199
199
|
vpunpckhqdq zmm15, zmm24, zmm25
|
200
200
|
prefetcht0 [r8+rdx+0x80]
|
@@ -2065,7 +2065,7 @@ blake3_hash_many_avx512:
|
|
2065
2065
|
vpermq ymm14, ymm14, 0xDC
|
2066
2066
|
vpermq ymm15, ymm15, 0xDC
|
2067
2067
|
vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
|
2068
|
-
|
2068
|
+
vinserti64x4 zmm13, zmm14, ymm15, 0x01
|
2069
2069
|
mov eax, 17476
|
2070
2070
|
kmovw k2, eax
|
2071
2071
|
vpblendmd zmm13 {k2}, zmm13, zmm12
|
@@ -99,15 +99,15 @@ innerloop16:
|
|
99
99
|
mov r14, qword ptr [rdi+50H]
|
100
100
|
mov r15, qword ptr [rdi+58H]
|
101
101
|
vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H]
|
102
|
-
|
102
|
+
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
|
103
103
|
vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H]
|
104
|
-
|
104
|
+
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
|
105
105
|
vpunpcklqdq zmm8, zmm16, zmm17
|
106
106
|
vpunpckhqdq zmm9, zmm16, zmm17
|
107
107
|
vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H]
|
108
|
-
|
108
|
+
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
|
109
109
|
vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H]
|
110
|
-
|
110
|
+
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
|
111
111
|
vpunpcklqdq zmm10, zmm18, zmm19
|
112
112
|
vpunpckhqdq zmm11, zmm18, zmm19
|
113
113
|
mov r8, qword ptr [rdi+20H]
|
@@ -119,15 +119,15 @@ innerloop16:
|
|
119
119
|
mov r14, qword ptr [rdi+70H]
|
120
120
|
mov r15, qword ptr [rdi+78H]
|
121
121
|
vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H]
|
122
|
-
|
122
|
+
vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
|
123
123
|
vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H]
|
124
|
-
|
124
|
+
vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
|
125
125
|
vpunpcklqdq zmm12, zmm16, zmm17
|
126
126
|
vpunpckhqdq zmm13, zmm16, zmm17
|
127
127
|
vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H]
|
128
|
-
|
128
|
+
vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
|
129
129
|
vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H]
|
130
|
-
|
130
|
+
vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
|
131
131
|
vpunpcklqdq zmm14, zmm18, zmm19
|
132
132
|
vpunpckhqdq zmm15, zmm18, zmm19
|
133
133
|
vmovdqa32 zmm27, zmmword ptr [INDEX0]
|
@@ -161,15 +161,15 @@ innerloop16:
|
|
161
161
|
mov r14, qword ptr [rdi+50H]
|
162
162
|
mov r15, qword ptr [rdi+58H]
|
163
163
|
vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H]
|
164
|
-
|
164
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
|
165
165
|
vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H]
|
166
|
-
|
166
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
|
167
167
|
vpunpcklqdq zmm8, zmm24, zmm25
|
168
168
|
vpunpckhqdq zmm9, zmm24, zmm25
|
169
169
|
vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H]
|
170
|
-
|
170
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
|
171
171
|
vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H]
|
172
|
-
|
172
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
|
173
173
|
vpunpcklqdq zmm10, zmm24, zmm25
|
174
174
|
vpunpckhqdq zmm11, zmm24, zmm25
|
175
175
|
prefetcht0 byte ptr [r8+rdx+80H]
|
@@ -189,15 +189,15 @@ innerloop16:
|
|
189
189
|
mov r14, qword ptr [rdi+70H]
|
190
190
|
mov r15, qword ptr [rdi+78H]
|
191
191
|
vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H]
|
192
|
-
|
192
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
|
193
193
|
vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H]
|
194
|
-
|
194
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
|
195
195
|
vpunpcklqdq zmm12, zmm24, zmm25
|
196
196
|
vpunpckhqdq zmm13, zmm24, zmm25
|
197
197
|
vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H]
|
198
|
-
|
198
|
+
vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
|
199
199
|
vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H]
|
200
|
-
|
200
|
+
vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
|
201
201
|
vpunpcklqdq zmm14, zmm24, zmm25
|
202
202
|
vpunpckhqdq zmm15, zmm24, zmm25
|
203
203
|
prefetcht0 byte ptr [r8+rdx+80H]
|
@@ -2073,7 +2073,7 @@ final7blocks:
|
|
2073
2073
|
vpermq ymm14, ymm14, 0DCH
|
2074
2074
|
vpermq ymm15, ymm15, 0DCH
|
2075
2075
|
vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN]
|
2076
|
-
|
2076
|
+
vinserti64x4 zmm13, zmm14, ymm15, 01H
|
2077
2077
|
mov eax, 17476
|
2078
2078
|
kmovw k2, eax
|
2079
2079
|
vpblendmd zmm13 {k2}, zmm13, zmm12
|