digest-blake3 0.22.1 → 0.34.0
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/ext/digest/blake3/blake3.c +24 -15
- data/ext/digest/blake3/blake3.h +4 -2
- data/ext/digest/blake3/blake3_avx512_x86-64_unix.S +17 -17
- data/ext/digest/blake3/blake3_avx512_x86-64_windows_gnu.S +17 -17
- data/ext/digest/blake3/blake3_avx512_x86-64_windows_msvc.asm +17 -17
- data/ext/digest/blake3/blake3_dispatch.c +3 -70
- data/ext/digest/blake3/blake3_impl.h +69 -1
- data/ext/digest/blake3/blake3_sse41_x86-64_windows_gnu.S +19 -7
- data/ext/digest/blake3/blake3_sse41_x86-64_windows_msvc.asm +19 -7
- data/lib/digest/blake3/version.rb +1 -1
- metadata +3 -4
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA256:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: a3db2fab1165a083a1a83d5c656c1c737d53f853de91babcb6c9c0e74ec7e23a
         | 
| 4 | 
            +
              data.tar.gz: d4692ef2c6326a70ffa0cad5ed90219daa96c0940c0e9986d9ee7b4469d6b48d
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 7ef86ba9e54408a68179d43678d7863d1af3d51e6002315d4607e377f2a142d374f4dc0e4d5f8ddde641063d3b5e2f93214fb10274aba849eee757d5f884d854
         | 
| 7 | 
            +
              data.tar.gz: e8bf900ad7eece0df62964ca7695af5c8681cbe23c3b7e6cc2af4b0ac2c1d6b3f74de987d7998f4627a5bfcb164c2bafa7ffda8bf8f96be5075ca44761aa2c23
         | 
    
        data/Gemfile.lock
    CHANGED
    
    
    
        data/ext/digest/blake3/blake3.c
    CHANGED
    
    | @@ -84,23 +84,26 @@ INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) { | |
| 84 84 | 
             
              memcpy(cv, cv_words, 32);
         | 
| 85 85 | 
             
            }
         | 
| 86 86 |  | 
| 87 | 
            -
            INLINE void output_root_bytes(const output_t *self, uint8_t *out,
         | 
| 87 | 
            +
            INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out,
         | 
| 88 88 | 
             
                                          size_t out_len) {
         | 
| 89 | 
            -
              uint64_t output_block_counter =  | 
| 89 | 
            +
              uint64_t output_block_counter = seek / 64;
         | 
| 90 | 
            +
              size_t offset_within_block = seek % 64;
         | 
| 90 91 | 
             
              uint8_t wide_buf[64];
         | 
| 91 92 | 
             
              while (out_len > 0) {
         | 
| 92 93 | 
             
                blake3_compress_xof(self->input_cv, self->block, self->block_len,
         | 
| 93 94 | 
             
                                    output_block_counter, self->flags | ROOT, wide_buf);
         | 
| 95 | 
            +
                size_t available_bytes = 64 - offset_within_block;
         | 
| 94 96 | 
             
                size_t memcpy_len;
         | 
| 95 | 
            -
                if (out_len >  | 
| 96 | 
            -
                  memcpy_len =  | 
| 97 | 
            +
                if (out_len > available_bytes) {
         | 
| 98 | 
            +
                  memcpy_len = available_bytes;
         | 
| 97 99 | 
             
                } else {
         | 
| 98 100 | 
             
                  memcpy_len = out_len;
         | 
| 99 101 | 
             
                }
         | 
| 100 | 
            -
                memcpy(out, wide_buf, memcpy_len);
         | 
| 102 | 
            +
                memcpy(out, wide_buf + offset_within_block, memcpy_len);
         | 
| 101 103 | 
             
                out += memcpy_len;
         | 
| 102 104 | 
             
                out_len -= memcpy_len;
         | 
| 103 105 | 
             
                output_block_counter += 1;
         | 
| 106 | 
            +
                offset_within_block = 0;
         | 
| 104 107 | 
             
              }
         | 
| 105 108 | 
             
            }
         | 
| 106 109 |  | 
| @@ -256,10 +259,11 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values, | |
| 256 259 | 
             
            // Why not just have the caller split the input on the first update(), instead
         | 
| 257 260 | 
             
            // of implementing this special rule? Because we don't want to limit SIMD or
         | 
| 258 261 | 
             
            // multi-threading parallelism for that update().
         | 
| 259 | 
            -
            size_t blake3_compress_subtree_wide(const uint8_t *input, | 
| 260 | 
            -
             | 
| 261 | 
            -
             | 
| 262 | 
            -
             | 
| 262 | 
            +
            static size_t blake3_compress_subtree_wide(const uint8_t *input,
         | 
| 263 | 
            +
                                                       size_t input_len,
         | 
| 264 | 
            +
                                                       const uint32_t key[8],
         | 
| 265 | 
            +
                                                       uint64_t chunk_counter,
         | 
| 266 | 
            +
                                                       uint8_t flags, uint8_t *out) {
         | 
| 263 267 | 
             
              // Note that the single chunk case does *not* bump the SIMD degree up to 2
         | 
| 264 268 | 
             
              // when it is 1. If this implementation adds multi-threading in the future,
         | 
| 265 269 | 
             
              // this gives us the option of multi-threading even the 2-chunk case, which
         | 
| @@ -425,8 +429,8 @@ INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) { | |
| 425 429 | 
             
            // compress_subtree_to_parent_node(). That function always returns the top
         | 
| 426 430 | 
             
            // *two* chaining values of the subtree it's compressing. We then do lazy
         | 
| 427 431 | 
             
            // merging with each of them separately, so that the second CV will always
         | 
| 428 | 
            -
            // remain unmerged. ( | 
| 429 | 
            -
            //  | 
| 432 | 
            +
            // remain unmerged. (That also helps us support extendable output when we're
         | 
| 433 | 
            +
            // hashing an input all-at-once.)
         | 
| 430 434 | 
             
            INLINE void hasher_push_cv(blake3_hasher *self, uint8_t new_cv[BLAKE3_OUT_LEN],
         | 
| 431 435 | 
             
                                       uint64_t chunk_counter) {
         | 
| 432 436 | 
             
              hasher_merge_cv_stack(self, chunk_counter);
         | 
| @@ -472,8 +476,8 @@ void blake3_hasher_update(blake3_hasher *self, const void *input, | |
| 472 476 |  | 
| 473 477 | 
             
              // Now the chunk_state is clear, and we have more input. If there's more than
         | 
| 474 478 | 
             
              // a single chunk (so, definitely not the root chunk), hash the largest whole
         | 
| 475 | 
            -
              // subtree we can, with the full benefits of SIMD and  | 
| 476 | 
            -
              // parallelism. Two restrictions:
         | 
| 479 | 
            +
              // subtree we can, with the full benefits of SIMD (and maybe in the future,
         | 
| 480 | 
            +
              // multi-threading) parallelism. Two restrictions:
         | 
| 477 481 | 
             
              // - The subtree has to be a power-of-2 number of chunks. Only subtrees along
         | 
| 478 482 | 
             
              //   the right edge can be incomplete, and we don't know where the right edge
         | 
| 479 483 | 
             
              //   is going to be until we get to finalize().
         | 
| @@ -546,6 +550,11 @@ void blake3_hasher_update(blake3_hasher *self, const void *input, | |
| 546 550 |  | 
| 547 551 | 
             
            void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
         | 
| 548 552 | 
             
                                        size_t out_len) {
         | 
| 553 | 
            +
              blake3_hasher_finalize_seek(self, 0, out, out_len);
         | 
| 554 | 
            +
            }
         | 
| 555 | 
            +
             | 
| 556 | 
            +
            void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
         | 
| 557 | 
            +
                                             uint8_t *out, size_t out_len) {
         | 
| 549 558 | 
             
              // Explicitly checking for zero avoids causing UB by passing a null pointer
         | 
| 550 559 | 
             
              // to memcpy. This comes up in practice with things like:
         | 
| 551 560 | 
             
              //   std::vector<uint8_t> v;
         | 
| @@ -557,7 +566,7 @@ void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, | |
| 557 566 | 
             
              // If the subtree stack is empty, then the current chunk is the root.
         | 
| 558 567 | 
             
              if (self->cv_stack_len == 0) {
         | 
| 559 568 | 
             
                output_t output = chunk_state_output(&self->chunk);
         | 
| 560 | 
            -
                output_root_bytes(&output, out, out_len);
         | 
| 569 | 
            +
                output_root_bytes(&output, seek, out, out_len);
         | 
| 561 570 | 
             
                return;
         | 
| 562 571 | 
             
              }
         | 
| 563 572 | 
             
              // If there are any bytes in the chunk state, finalize that chunk and do a
         | 
| @@ -585,5 +594,5 @@ void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, | |
| 585 594 | 
             
                output_chaining_value(&output, &parent_block[32]);
         | 
| 586 595 | 
             
                output = parent_output(parent_block, self->key, self->chunk.flags);
         | 
| 587 596 | 
             
              }
         | 
| 588 | 
            -
              output_root_bytes(&output, out, out_len);
         | 
| 597 | 
            +
              output_root_bytes(&output, seek, out, out_len);
         | 
| 589 598 | 
             
            }
         | 
    
        data/ext/digest/blake3/blake3.h
    CHANGED
    
    | @@ -4,7 +4,7 @@ | |
| 4 4 | 
             
            #include <stddef.h>
         | 
| 5 5 | 
             
            #include <stdint.h>
         | 
| 6 6 |  | 
| 7 | 
            -
            #ifdef | 
| 7 | 
            +
            #ifdef __cplusplus
         | 
| 8 8 | 
             
            extern "C" {
         | 
| 9 9 | 
             
            #endif
         | 
| 10 10 |  | 
| @@ -46,8 +46,10 @@ void blake3_hasher_update(blake3_hasher *self, const void *input, | |
| 46 46 | 
             
                                      size_t input_len);
         | 
| 47 47 | 
             
            void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
         | 
| 48 48 | 
             
                                        size_t out_len);
         | 
| 49 | 
            +
            void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
         | 
| 50 | 
            +
                                             uint8_t *out, size_t out_len);
         | 
| 49 51 |  | 
| 50 | 
            -
            #ifdef | 
| 52 | 
            +
            #ifdef __cplusplus
         | 
| 51 53 | 
             
            }
         | 
| 52 54 | 
             
            #endif
         | 
| 53 55 |  | 
| @@ -82,15 +82,15 @@ blake3_hash_many_avx512: | |
| 82 82 | 
             
                    mov     r14, qword ptr [rdi+0x50]
         | 
| 83 83 | 
             
                    mov     r15, qword ptr [rdi+0x58]
         | 
| 84 84 | 
             
                    vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
         | 
| 85 | 
            -
                     | 
| 85 | 
            +
                    vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
         | 
| 86 86 | 
             
                    vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
         | 
| 87 | 
            -
                     | 
| 87 | 
            +
                    vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
         | 
| 88 88 | 
             
                    vpunpcklqdq zmm8, zmm16, zmm17
         | 
| 89 89 | 
             
                    vpunpckhqdq zmm9, zmm16, zmm17
         | 
| 90 90 | 
             
                    vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
         | 
| 91 | 
            -
                     | 
| 91 | 
            +
                    vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
         | 
| 92 92 | 
             
                    vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
         | 
| 93 | 
            -
                     | 
| 93 | 
            +
                    vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
         | 
| 94 94 | 
             
                    vpunpcklqdq zmm10, zmm18, zmm19
         | 
| 95 95 | 
             
                    vpunpckhqdq zmm11, zmm18, zmm19
         | 
| 96 96 | 
             
                    mov     r8, qword ptr [rdi+0x20]
         | 
| @@ -102,15 +102,15 @@ blake3_hash_many_avx512: | |
| 102 102 | 
             
                    mov     r14, qword ptr [rdi+0x70]
         | 
| 103 103 | 
             
                    mov     r15, qword ptr [rdi+0x78]
         | 
| 104 104 | 
             
                    vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
         | 
| 105 | 
            -
                     | 
| 105 | 
            +
                    vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
         | 
| 106 106 | 
             
                    vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
         | 
| 107 | 
            -
                     | 
| 107 | 
            +
                    vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
         | 
| 108 108 | 
             
                    vpunpcklqdq zmm12, zmm16, zmm17
         | 
| 109 109 | 
             
                    vpunpckhqdq zmm13, zmm16, zmm17
         | 
| 110 110 | 
             
                    vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
         | 
| 111 | 
            -
                     | 
| 111 | 
            +
                    vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
         | 
| 112 112 | 
             
                    vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
         | 
| 113 | 
            -
                     | 
| 113 | 
            +
                    vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
         | 
| 114 114 | 
             
                    vpunpcklqdq zmm14, zmm18, zmm19
         | 
| 115 115 | 
             
                    vpunpckhqdq zmm15, zmm18, zmm19
         | 
| 116 116 | 
             
                    vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
         | 
| @@ -144,15 +144,15 @@ blake3_hash_many_avx512: | |
| 144 144 | 
             
                    mov     r14, qword ptr [rdi+0x50]
         | 
| 145 145 | 
             
                    mov     r15, qword ptr [rdi+0x58]
         | 
| 146 146 | 
             
                    vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
         | 
| 147 | 
            -
                     | 
| 147 | 
            +
                    vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
         | 
| 148 148 | 
             
                    vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
         | 
| 149 | 
            -
                     | 
| 149 | 
            +
                    vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
         | 
| 150 150 | 
             
                    vpunpcklqdq zmm8, zmm24, zmm25
         | 
| 151 151 | 
             
                    vpunpckhqdq zmm9, zmm24, zmm25
         | 
| 152 152 | 
             
                    vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
         | 
| 153 | 
            -
                     | 
| 153 | 
            +
                    vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
         | 
| 154 154 | 
             
                    vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
         | 
| 155 | 
            -
                     | 
| 155 | 
            +
                    vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
         | 
| 156 156 | 
             
                    vpunpcklqdq zmm10, zmm24, zmm25
         | 
| 157 157 | 
             
                    vpunpckhqdq zmm11, zmm24, zmm25
         | 
| 158 158 | 
             
                    prefetcht0 [r8+rdx+0x80]
         | 
| @@ -172,15 +172,15 @@ blake3_hash_many_avx512: | |
| 172 172 | 
             
                    mov     r14, qword ptr [rdi+0x70]
         | 
| 173 173 | 
             
                    mov     r15, qword ptr [rdi+0x78]
         | 
| 174 174 | 
             
                    vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
         | 
| 175 | 
            -
                     | 
| 175 | 
            +
                    vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
         | 
| 176 176 | 
             
                    vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
         | 
| 177 | 
            -
                     | 
| 177 | 
            +
                    vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
         | 
| 178 178 | 
             
                    vpunpcklqdq zmm12, zmm24, zmm25
         | 
| 179 179 | 
             
                    vpunpckhqdq zmm13, zmm24, zmm25
         | 
| 180 180 | 
             
                    vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
         | 
| 181 | 
            -
                     | 
| 181 | 
            +
                    vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
         | 
| 182 182 | 
             
                    vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
         | 
| 183 | 
            -
                     | 
| 183 | 
            +
                    vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
         | 
| 184 184 | 
             
                    vpunpcklqdq zmm14, zmm24, zmm25
         | 
| 185 185 | 
             
                    vpunpckhqdq zmm15, zmm24, zmm25
         | 
| 186 186 | 
             
                    prefetcht0 [r8+rdx+0x80]
         | 
| @@ -2039,7 +2039,7 @@ blake3_hash_many_avx512: | |
| 2039 2039 | 
             
                    vpermq  ymm14, ymm14, 0xDC
         | 
| 2040 2040 | 
             
                    vpermq  ymm15, ymm15, 0xDC
         | 
| 2041 2041 | 
             
                    vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
         | 
| 2042 | 
            -
                     | 
| 2042 | 
            +
                    vinserti64x4 zmm13, zmm14, ymm15, 0x01
         | 
| 2043 2043 | 
             
                    mov     eax, 17476
         | 
| 2044 2044 | 
             
                    kmovw   k2, eax
         | 
| 2045 2045 | 
             
                    vpblendmd zmm13 {k2}, zmm13, zmm12
         | 
| @@ -96,15 +96,15 @@ blake3_hash_many_avx512: | |
| 96 96 | 
             
                    mov     r14, qword ptr [rdi+0x50]
         | 
| 97 97 | 
             
                    mov     r15, qword ptr [rdi+0x58]
         | 
| 98 98 | 
             
                    vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
         | 
| 99 | 
            -
                     | 
| 99 | 
            +
                    vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
         | 
| 100 100 | 
             
                    vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
         | 
| 101 | 
            -
                     | 
| 101 | 
            +
                    vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
         | 
| 102 102 | 
             
                    vpunpcklqdq zmm8, zmm16, zmm17
         | 
| 103 103 | 
             
                    vpunpckhqdq zmm9, zmm16, zmm17
         | 
| 104 104 | 
             
                    vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
         | 
| 105 | 
            -
                     | 
| 105 | 
            +
                    vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
         | 
| 106 106 | 
             
                    vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
         | 
| 107 | 
            -
                     | 
| 107 | 
            +
                    vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
         | 
| 108 108 | 
             
                    vpunpcklqdq zmm10, zmm18, zmm19
         | 
| 109 109 | 
             
                    vpunpckhqdq zmm11, zmm18, zmm19
         | 
| 110 110 | 
             
                    mov     r8, qword ptr [rdi+0x20]
         | 
| @@ -116,15 +116,15 @@ blake3_hash_many_avx512: | |
| 116 116 | 
             
                    mov     r14, qword ptr [rdi+0x70]
         | 
| 117 117 | 
             
                    mov     r15, qword ptr [rdi+0x78]
         | 
| 118 118 | 
             
                    vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20]
         | 
| 119 | 
            -
                     | 
| 119 | 
            +
                    vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01
         | 
| 120 120 | 
             
                    vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20]
         | 
| 121 | 
            -
                     | 
| 121 | 
            +
                    vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01
         | 
| 122 122 | 
             
                    vpunpcklqdq zmm12, zmm16, zmm17
         | 
| 123 123 | 
             
                    vpunpckhqdq zmm13, zmm16, zmm17
         | 
| 124 124 | 
             
                    vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20]
         | 
| 125 | 
            -
                     | 
| 125 | 
            +
                    vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01
         | 
| 126 126 | 
             
                    vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20]
         | 
| 127 | 
            -
                     | 
| 127 | 
            +
                    vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01
         | 
| 128 128 | 
             
                    vpunpcklqdq zmm14, zmm18, zmm19
         | 
| 129 129 | 
             
                    vpunpckhqdq zmm15, zmm18, zmm19
         | 
| 130 130 | 
             
                    vmovdqa32 zmm27, zmmword ptr [INDEX0+rip]
         | 
| @@ -158,15 +158,15 @@ blake3_hash_many_avx512: | |
| 158 158 | 
             
                    mov     r14, qword ptr [rdi+0x50]
         | 
| 159 159 | 
             
                    mov     r15, qword ptr [rdi+0x58]
         | 
| 160 160 | 
             
                    vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
         | 
| 161 | 
            -
                     | 
| 161 | 
            +
                    vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
         | 
| 162 162 | 
             
                    vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
         | 
| 163 | 
            -
                     | 
| 163 | 
            +
                    vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
         | 
| 164 164 | 
             
                    vpunpcklqdq zmm8, zmm24, zmm25
         | 
| 165 165 | 
             
                    vpunpckhqdq zmm9, zmm24, zmm25
         | 
| 166 166 | 
             
                    vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
         | 
| 167 | 
            -
                     | 
| 167 | 
            +
                    vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
         | 
| 168 168 | 
             
                    vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
         | 
| 169 | 
            -
                     | 
| 169 | 
            +
                    vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
         | 
| 170 170 | 
             
                    vpunpcklqdq zmm10, zmm24, zmm25
         | 
| 171 171 | 
             
                    vpunpckhqdq zmm11, zmm24, zmm25
         | 
| 172 172 | 
             
                    prefetcht0 [r8+rdx+0x80]
         | 
| @@ -186,15 +186,15 @@ blake3_hash_many_avx512: | |
| 186 186 | 
             
                    mov     r14, qword ptr [rdi+0x70]
         | 
| 187 187 | 
             
                    mov     r15, qword ptr [rdi+0x78]
         | 
| 188 188 | 
             
                    vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20]
         | 
| 189 | 
            -
                     | 
| 189 | 
            +
                    vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01
         | 
| 190 190 | 
             
                    vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20]
         | 
| 191 | 
            -
                     | 
| 191 | 
            +
                    vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01
         | 
| 192 192 | 
             
                    vpunpcklqdq zmm12, zmm24, zmm25
         | 
| 193 193 | 
             
                    vpunpckhqdq zmm13, zmm24, zmm25
         | 
| 194 194 | 
             
                    vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20]
         | 
| 195 | 
            -
                     | 
| 195 | 
            +
                    vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01
         | 
| 196 196 | 
             
                    vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20]
         | 
| 197 | 
            -
                     | 
| 197 | 
            +
                    vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01
         | 
| 198 198 | 
             
                    vpunpcklqdq zmm14, zmm24, zmm25
         | 
| 199 199 | 
             
                    vpunpckhqdq zmm15, zmm24, zmm25
         | 
| 200 200 | 
             
                    prefetcht0 [r8+rdx+0x80]
         | 
| @@ -2065,7 +2065,7 @@ blake3_hash_many_avx512: | |
| 2065 2065 | 
             
                    vpermq  ymm14, ymm14, 0xDC
         | 
| 2066 2066 | 
             
                    vpermq  ymm15, ymm15, 0xDC
         | 
| 2067 2067 | 
             
                    vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip]
         | 
| 2068 | 
            -
                     | 
| 2068 | 
            +
                    vinserti64x4 zmm13, zmm14, ymm15, 0x01
         | 
| 2069 2069 | 
             
                    mov     eax, 17476
         | 
| 2070 2070 | 
             
                    kmovw   k2, eax
         | 
| 2071 2071 | 
             
                    vpblendmd zmm13 {k2}, zmm13, zmm12
         | 
| @@ -99,15 +99,15 @@ innerloop16: | |
| 99 99 | 
             
                    mov     r14, qword ptr [rdi+50H]
         | 
| 100 100 | 
             
                    mov     r15, qword ptr [rdi+58H]
         | 
| 101 101 | 
             
                    vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H]
         | 
| 102 | 
            -
                     | 
| 102 | 
            +
                    vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
         | 
| 103 103 | 
             
                    vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H]
         | 
| 104 | 
            -
                     | 
| 104 | 
            +
                    vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
         | 
| 105 105 | 
             
                    vpunpcklqdq zmm8, zmm16, zmm17
         | 
| 106 106 | 
             
                    vpunpckhqdq zmm9, zmm16, zmm17
         | 
| 107 107 | 
             
                    vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H]
         | 
| 108 | 
            -
                     | 
| 108 | 
            +
                    vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
         | 
| 109 109 | 
             
                    vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H]
         | 
| 110 | 
            -
                     | 
| 110 | 
            +
                    vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
         | 
| 111 111 | 
             
                    vpunpcklqdq zmm10, zmm18, zmm19
         | 
| 112 112 | 
             
                    vpunpckhqdq zmm11, zmm18, zmm19
         | 
| 113 113 | 
             
                    mov     r8, qword ptr [rdi+20H]
         | 
| @@ -119,15 +119,15 @@ innerloop16: | |
| 119 119 | 
             
                    mov     r14, qword ptr [rdi+70H]
         | 
| 120 120 | 
             
                    mov     r15, qword ptr [rdi+78H]
         | 
| 121 121 | 
             
                    vmovdqu32 ymm16, ymmword ptr [rdx+r8-2H*20H]
         | 
| 122 | 
            -
                     | 
| 122 | 
            +
                    vinserti64x4 zmm16, zmm16, ymmword ptr [rdx+r12-2H*20H], 01H
         | 
| 123 123 | 
             
                    vmovdqu32 ymm17, ymmword ptr [rdx+r9-2H*20H]
         | 
| 124 | 
            -
                     | 
| 124 | 
            +
                    vinserti64x4 zmm17, zmm17, ymmword ptr [rdx+r13-2H*20H], 01H
         | 
| 125 125 | 
             
                    vpunpcklqdq zmm12, zmm16, zmm17
         | 
| 126 126 | 
             
                    vpunpckhqdq zmm13, zmm16, zmm17
         | 
| 127 127 | 
             
                    vmovdqu32 ymm18, ymmword ptr [rdx+r10-2H*20H]
         | 
| 128 | 
            -
                     | 
| 128 | 
            +
                    vinserti64x4 zmm18, zmm18, ymmword ptr [rdx+r14-2H*20H], 01H
         | 
| 129 129 | 
             
                    vmovdqu32 ymm19, ymmword ptr [rdx+r11-2H*20H]
         | 
| 130 | 
            -
                     | 
| 130 | 
            +
                    vinserti64x4 zmm19, zmm19, ymmword ptr [rdx+r15-2H*20H], 01H
         | 
| 131 131 | 
             
                    vpunpcklqdq zmm14, zmm18, zmm19
         | 
| 132 132 | 
             
                    vpunpckhqdq zmm15, zmm18, zmm19
         | 
| 133 133 | 
             
                    vmovdqa32 zmm27, zmmword ptr [INDEX0]
         | 
| @@ -161,15 +161,15 @@ innerloop16: | |
| 161 161 | 
             
                    mov     r14, qword ptr [rdi+50H]
         | 
| 162 162 | 
             
                    mov     r15, qword ptr [rdi+58H]
         | 
| 163 163 | 
             
                    vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H]
         | 
| 164 | 
            -
                     | 
| 164 | 
            +
                    vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
         | 
| 165 165 | 
             
                    vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H]
         | 
| 166 | 
            -
                     | 
| 166 | 
            +
                    vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
         | 
| 167 167 | 
             
                    vpunpcklqdq zmm8, zmm24, zmm25
         | 
| 168 168 | 
             
                    vpunpckhqdq zmm9, zmm24, zmm25
         | 
| 169 169 | 
             
                    vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H]
         | 
| 170 | 
            -
                     | 
| 170 | 
            +
                    vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
         | 
| 171 171 | 
             
                    vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H]
         | 
| 172 | 
            -
                     | 
| 172 | 
            +
                    vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
         | 
| 173 173 | 
             
                    vpunpcklqdq zmm10, zmm24, zmm25
         | 
| 174 174 | 
             
                    vpunpckhqdq zmm11, zmm24, zmm25
         | 
| 175 175 | 
             
                    prefetcht0 byte ptr [r8+rdx+80H]
         | 
| @@ -189,15 +189,15 @@ innerloop16: | |
| 189 189 | 
             
                    mov     r14, qword ptr [rdi+70H]
         | 
| 190 190 | 
             
                    mov     r15, qword ptr [rdi+78H]
         | 
| 191 191 | 
             
                    vmovdqu32 ymm24, ymmword ptr [r8+rdx-1H*20H]
         | 
| 192 | 
            -
                     | 
| 192 | 
            +
                    vinserti64x4 zmm24, zmm24, ymmword ptr [r12+rdx-1H*20H], 01H
         | 
| 193 193 | 
             
                    vmovdqu32 ymm25, ymmword ptr [r9+rdx-1H*20H]
         | 
| 194 | 
            -
                     | 
| 194 | 
            +
                    vinserti64x4 zmm25, zmm25, ymmword ptr [r13+rdx-1H*20H], 01H
         | 
| 195 195 | 
             
                    vpunpcklqdq zmm12, zmm24, zmm25
         | 
| 196 196 | 
             
                    vpunpckhqdq zmm13, zmm24, zmm25
         | 
| 197 197 | 
             
                    vmovdqu32 ymm24, ymmword ptr [r10+rdx-1H*20H]
         | 
| 198 | 
            -
                     | 
| 198 | 
            +
                    vinserti64x4 zmm24, zmm24, ymmword ptr [r14+rdx-1H*20H], 01H
         | 
| 199 199 | 
             
                    vmovdqu32 ymm25, ymmword ptr [r11+rdx-1H*20H]
         | 
| 200 | 
            -
                     | 
| 200 | 
            +
                    vinserti64x4 zmm25, zmm25, ymmword ptr [r15+rdx-1H*20H], 01H
         | 
| 201 201 | 
             
                    vpunpcklqdq zmm14, zmm24, zmm25
         | 
| 202 202 | 
             
                    vpunpckhqdq zmm15, zmm24, zmm25
         | 
| 203 203 | 
             
                    prefetcht0 byte  ptr [r8+rdx+80H]
         | 
| @@ -2073,7 +2073,7 @@ final7blocks: | |
| 2073 2073 | 
             
                    vpermq  ymm14, ymm14, 0DCH
         | 
| 2074 2074 | 
             
                    vpermq  ymm15, ymm15, 0DCH
         | 
| 2075 2075 | 
             
                    vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN]
         | 
| 2076 | 
            -
                     | 
| 2076 | 
            +
                    vinserti64x4 zmm13, zmm14, ymm15, 01H
         | 
| 2077 2077 | 
             
                    mov     eax, 17476
         | 
| 2078 2078 | 
             
                    kmovw   k2, eax
         | 
| 2079 2079 | 
             
                    vpblendmd zmm13 {k2}, zmm13, zmm12
         | 
| @@ -14,73 +14,6 @@ | |
| 14 14 | 
             
            #endif
         | 
| 15 15 | 
             
            #endif
         | 
| 16 16 |  | 
| 17 | 
            -
            // Declarations for implementation-specific functions.
         | 
| 18 | 
            -
            void blake3_compress_in_place_portable(uint32_t cv[8],
         | 
| 19 | 
            -
                                                   const uint8_t block[BLAKE3_BLOCK_LEN],
         | 
| 20 | 
            -
                                                   uint8_t block_len, uint64_t counter,
         | 
| 21 | 
            -
                                                   uint8_t flags);
         | 
| 22 | 
            -
             | 
| 23 | 
            -
            void blake3_compress_xof_portable(const uint32_t cv[8],
         | 
| 24 | 
            -
                                              const uint8_t block[BLAKE3_BLOCK_LEN],
         | 
| 25 | 
            -
                                              uint8_t block_len, uint64_t counter,
         | 
| 26 | 
            -
                                              uint8_t flags, uint8_t out[64]);
         | 
| 27 | 
            -
             | 
| 28 | 
            -
            void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
         | 
| 29 | 
            -
                                           size_t blocks, const uint32_t key[8],
         | 
| 30 | 
            -
                                           uint64_t counter, bool increment_counter,
         | 
| 31 | 
            -
                                           uint8_t flags, uint8_t flags_start,
         | 
| 32 | 
            -
                                           uint8_t flags_end, uint8_t *out);
         | 
| 33 | 
            -
             | 
| 34 | 
            -
            #if defined(IS_X86)
         | 
| 35 | 
            -
            #if !defined(BLAKE3_NO_SSE41)
         | 
| 36 | 
            -
            void blake3_compress_in_place_sse41(uint32_t cv[8],
         | 
| 37 | 
            -
                                                const uint8_t block[BLAKE3_BLOCK_LEN],
         | 
| 38 | 
            -
                                                uint8_t block_len, uint64_t counter,
         | 
| 39 | 
            -
                                                uint8_t flags);
         | 
| 40 | 
            -
            void blake3_compress_xof_sse41(const uint32_t cv[8],
         | 
| 41 | 
            -
                                           const uint8_t block[BLAKE3_BLOCK_LEN],
         | 
| 42 | 
            -
                                           uint8_t block_len, uint64_t counter,
         | 
| 43 | 
            -
                                           uint8_t flags, uint8_t out[64]);
         | 
| 44 | 
            -
            void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
         | 
| 45 | 
            -
                                        size_t blocks, const uint32_t key[8],
         | 
| 46 | 
            -
                                        uint64_t counter, bool increment_counter,
         | 
| 47 | 
            -
                                        uint8_t flags, uint8_t flags_start,
         | 
| 48 | 
            -
                                        uint8_t flags_end, uint8_t *out);
         | 
| 49 | 
            -
            #endif
         | 
| 50 | 
            -
            #if !defined(BLAKE3_NO_AVX2)
         | 
| 51 | 
            -
            void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
         | 
| 52 | 
            -
                                       size_t blocks, const uint32_t key[8],
         | 
| 53 | 
            -
                                       uint64_t counter, bool increment_counter,
         | 
| 54 | 
            -
                                       uint8_t flags, uint8_t flags_start,
         | 
| 55 | 
            -
                                       uint8_t flags_end, uint8_t *out);
         | 
| 56 | 
            -
            #endif
         | 
| 57 | 
            -
            #if !defined(BLAKE3_NO_AVX512)
         | 
| 58 | 
            -
            void blake3_compress_in_place_avx512(uint32_t cv[8],
         | 
| 59 | 
            -
                                                 const uint8_t block[BLAKE3_BLOCK_LEN],
         | 
| 60 | 
            -
                                                 uint8_t block_len, uint64_t counter,
         | 
| 61 | 
            -
                                                 uint8_t flags);
         | 
| 62 | 
            -
             | 
| 63 | 
            -
            void blake3_compress_xof_avx512(const uint32_t cv[8],
         | 
| 64 | 
            -
                                            const uint8_t block[BLAKE3_BLOCK_LEN],
         | 
| 65 | 
            -
                                            uint8_t block_len, uint64_t counter,
         | 
| 66 | 
            -
                                            uint8_t flags, uint8_t out[64]);
         | 
| 67 | 
            -
             | 
| 68 | 
            -
            void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
         | 
| 69 | 
            -
                                         size_t blocks, const uint32_t key[8],
         | 
| 70 | 
            -
                                         uint64_t counter, bool increment_counter,
         | 
| 71 | 
            -
                                         uint8_t flags, uint8_t flags_start,
         | 
| 72 | 
            -
                                         uint8_t flags_end, uint8_t *out);
         | 
| 73 | 
            -
            #endif
         | 
| 74 | 
            -
            #endif
         | 
| 75 | 
            -
             | 
| 76 | 
            -
            #if defined(BLAKE3_USE_NEON)
         | 
| 77 | 
            -
            void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
         | 
| 78 | 
            -
                                       size_t blocks, const uint32_t key[8],
         | 
| 79 | 
            -
                                       uint64_t counter, bool increment_counter,
         | 
| 80 | 
            -
                                       uint8_t flags, uint8_t flags_start,
         | 
| 81 | 
            -
                                       uint8_t flags_end, uint8_t *out);
         | 
| 82 | 
            -
            #endif
         | 
| 83 | 
            -
             | 
| 84 17 | 
             
            #if defined(IS_X86)
         | 
| 85 18 | 
             
            static uint64_t xgetbv() {
         | 
| 86 19 | 
             
            #if defined(_MSC_VER)
         | 
| @@ -249,7 +182,7 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, | |
| 249 182 | 
             
            #if defined(IS_X86)
         | 
| 250 183 | 
             
              const enum cpu_feature features = get_cpu_features();
         | 
| 251 184 | 
             
            #if !defined(BLAKE3_NO_AVX512)
         | 
| 252 | 
            -
              if (features & AVX512F) {
         | 
| 185 | 
            +
              if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
         | 
| 253 186 | 
             
                blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
         | 
| 254 187 | 
             
                                        increment_counter, flags, flags_start, flags_end,
         | 
| 255 188 | 
             
                                        out);
         | 
| @@ -286,11 +219,11 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, | |
| 286 219 | 
             
            }
         | 
| 287 220 |  | 
| 288 221 | 
             
            // The dynamically detected SIMD degree of the current platform.
         | 
| 289 | 
            -
            size_t blake3_simd_degree() {
         | 
| 222 | 
            +
            size_t blake3_simd_degree(void) {
         | 
| 290 223 | 
             
            #if defined(IS_X86)
         | 
| 291 224 | 
             
              const enum cpu_feature features = get_cpu_features();
         | 
| 292 225 | 
             
            #if !defined(BLAKE3_NO_AVX512)
         | 
| 293 | 
            -
              if (features & AVX512F) {
         | 
| 226 | 
            +
              if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
         | 
| 294 227 | 
             
                return 16;
         | 
| 295 228 | 
             
              }
         | 
| 296 229 | 
             
            #endif
         | 
| @@ -161,7 +161,75 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, | |
| 161 161 | 
             
                                  bool increment_counter, uint8_t flags,
         | 
| 162 162 | 
             
                                  uint8_t flags_start, uint8_t flags_end, uint8_t *out);
         | 
| 163 163 |  | 
| 164 | 
            -
            size_t blake3_simd_degree();
         | 
| 164 | 
            +
            size_t blake3_simd_degree(void);
         | 
| 165 | 
            +
             | 
| 166 | 
            +
             | 
| 167 | 
            +
            // Declarations for implementation-specific functions.
         | 
| 168 | 
            +
            void blake3_compress_in_place_portable(uint32_t cv[8],
         | 
| 169 | 
            +
                                                   const uint8_t block[BLAKE3_BLOCK_LEN],
         | 
| 170 | 
            +
                                                   uint8_t block_len, uint64_t counter,
         | 
| 171 | 
            +
                                                   uint8_t flags);
         | 
| 172 | 
            +
             | 
| 173 | 
            +
            void blake3_compress_xof_portable(const uint32_t cv[8],
         | 
| 174 | 
            +
                                              const uint8_t block[BLAKE3_BLOCK_LEN],
         | 
| 175 | 
            +
                                              uint8_t block_len, uint64_t counter,
         | 
| 176 | 
            +
                                              uint8_t flags, uint8_t out[64]);
         | 
| 177 | 
            +
             | 
| 178 | 
            +
            void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
         | 
| 179 | 
            +
                                           size_t blocks, const uint32_t key[8],
         | 
| 180 | 
            +
                                           uint64_t counter, bool increment_counter,
         | 
| 181 | 
            +
                                           uint8_t flags, uint8_t flags_start,
         | 
| 182 | 
            +
                                           uint8_t flags_end, uint8_t *out);
         | 
| 183 | 
            +
             | 
| 184 | 
            +
            #if defined(IS_X86)
         | 
| 185 | 
            +
            #if !defined(BLAKE3_NO_SSE41)
         | 
| 186 | 
            +
            void blake3_compress_in_place_sse41(uint32_t cv[8],
         | 
| 187 | 
            +
                                                const uint8_t block[BLAKE3_BLOCK_LEN],
         | 
| 188 | 
            +
                                                uint8_t block_len, uint64_t counter,
         | 
| 189 | 
            +
                                                uint8_t flags);
         | 
| 190 | 
            +
            void blake3_compress_xof_sse41(const uint32_t cv[8],
         | 
| 191 | 
            +
                                           const uint8_t block[BLAKE3_BLOCK_LEN],
         | 
| 192 | 
            +
                                           uint8_t block_len, uint64_t counter,
         | 
| 193 | 
            +
                                           uint8_t flags, uint8_t out[64]);
         | 
| 194 | 
            +
            void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
         | 
| 195 | 
            +
                                        size_t blocks, const uint32_t key[8],
         | 
| 196 | 
            +
                                        uint64_t counter, bool increment_counter,
         | 
| 197 | 
            +
                                        uint8_t flags, uint8_t flags_start,
         | 
| 198 | 
            +
                                        uint8_t flags_end, uint8_t *out);
         | 
| 199 | 
            +
            #endif
         | 
| 200 | 
            +
            #if !defined(BLAKE3_NO_AVX2)
         | 
| 201 | 
            +
            void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
         | 
| 202 | 
            +
                                       size_t blocks, const uint32_t key[8],
         | 
| 203 | 
            +
                                       uint64_t counter, bool increment_counter,
         | 
| 204 | 
            +
                                       uint8_t flags, uint8_t flags_start,
         | 
| 205 | 
            +
                                       uint8_t flags_end, uint8_t *out);
         | 
| 206 | 
            +
            #endif
         | 
| 207 | 
            +
            #if !defined(BLAKE3_NO_AVX512)
         | 
| 208 | 
            +
            void blake3_compress_in_place_avx512(uint32_t cv[8],
         | 
| 209 | 
            +
                                                 const uint8_t block[BLAKE3_BLOCK_LEN],
         | 
| 210 | 
            +
                                                 uint8_t block_len, uint64_t counter,
         | 
| 211 | 
            +
                                                 uint8_t flags);
         | 
| 212 | 
            +
             | 
| 213 | 
            +
            void blake3_compress_xof_avx512(const uint32_t cv[8],
         | 
| 214 | 
            +
                                            const uint8_t block[BLAKE3_BLOCK_LEN],
         | 
| 215 | 
            +
                                            uint8_t block_len, uint64_t counter,
         | 
| 216 | 
            +
                                            uint8_t flags, uint8_t out[64]);
         | 
| 217 | 
            +
             | 
| 218 | 
            +
            void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
         | 
| 219 | 
            +
                                         size_t blocks, const uint32_t key[8],
         | 
| 220 | 
            +
                                         uint64_t counter, bool increment_counter,
         | 
| 221 | 
            +
                                         uint8_t flags, uint8_t flags_start,
         | 
| 222 | 
            +
                                         uint8_t flags_end, uint8_t *out);
         | 
| 223 | 
            +
            #endif
         | 
| 224 | 
            +
            #endif
         | 
| 225 | 
            +
             | 
| 226 | 
            +
            #if defined(BLAKE3_USE_NEON)
         | 
| 227 | 
            +
            void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
         | 
| 228 | 
            +
                                       size_t blocks, const uint32_t key[8],
         | 
| 229 | 
            +
                                       uint64_t counter, bool increment_counter,
         | 
| 230 | 
            +
                                       uint8_t flags, uint8_t flags_start,
         | 
| 231 | 
            +
                                       uint8_t flags_end, uint8_t *out);
         | 
| 232 | 
            +
            #endif
         | 
| 165 233 |  | 
| 166 234 |  | 
| 167 235 | 
             
            #endif /* BLAKE3_IMPL_H */
         | 
| @@ -1800,15 +1800,18 @@ blake3_hash_many_sse41: | |
| 1800 1800 | 
             
            .p2align 6
         | 
| 1801 1801 | 
             
            blake3_compress_in_place_sse41:
         | 
| 1802 1802 | 
             
            _blake3_compress_in_place_sse41:
         | 
| 1803 | 
            -
                    sub     rsp,  | 
| 1803 | 
            +
                    sub     rsp, 120
         | 
| 1804 1804 | 
             
                    movdqa  xmmword ptr [rsp], xmm6
         | 
| 1805 1805 | 
             
                    movdqa  xmmword ptr [rsp+0x10], xmm7
         | 
| 1806 1806 | 
             
                    movdqa  xmmword ptr [rsp+0x20], xmm8
         | 
| 1807 1807 | 
             
                    movdqa  xmmword ptr [rsp+0x30], xmm9
         | 
| 1808 | 
            +
                    movdqa  xmmword ptr [rsp+0x40], xmm11
         | 
| 1809 | 
            +
                    movdqa  xmmword ptr [rsp+0x50], xmm14
         | 
| 1810 | 
            +
                    movdqa  xmmword ptr [rsp+0x60], xmm15
         | 
| 1808 1811 | 
             
                    movups  xmm0, xmmword ptr [rcx]
         | 
| 1809 1812 | 
             
                    movups  xmm1, xmmword ptr [rcx+0x10]
         | 
| 1810 1813 | 
             
                    movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
         | 
| 1811 | 
            -
                    movzx   eax, byte ptr [rsp+ | 
| 1814 | 
            +
                    movzx   eax, byte ptr [rsp+0xA0]
         | 
| 1812 1815 | 
             
                    movzx   r8d, r8b
         | 
| 1813 1816 | 
             
                    shl     rax, 32
         | 
| 1814 1817 | 
             
                    add     r8, rax
         | 
| @@ -1906,24 +1909,30 @@ _blake3_compress_in_place_sse41: | |
| 1906 1909 | 
             
                    movdqa  xmm7, xmmword ptr [rsp+0x10]
         | 
| 1907 1910 | 
             
                    movdqa  xmm8, xmmword ptr [rsp+0x20]
         | 
| 1908 1911 | 
             
                    movdqa  xmm9, xmmword ptr [rsp+0x30]
         | 
| 1909 | 
            -
                     | 
| 1912 | 
            +
                    movdqa  xmm11, xmmword ptr [rsp+0x40]
         | 
| 1913 | 
            +
                    movdqa  xmm14, xmmword ptr [rsp+0x50]
         | 
| 1914 | 
            +
                    movdqa  xmm15, xmmword ptr [rsp+0x60]
         | 
| 1915 | 
            +
                    add     rsp, 120
         | 
| 1910 1916 | 
             
                    ret
         | 
| 1911 1917 |  | 
| 1912 1918 |  | 
| 1913 1919 | 
             
            .p2align 6
         | 
| 1914 1920 | 
             
            _blake3_compress_xof_sse41:
         | 
| 1915 1921 | 
             
            blake3_compress_xof_sse41:
         | 
| 1916 | 
            -
                    sub     rsp,  | 
| 1922 | 
            +
                    sub     rsp, 120
         | 
| 1917 1923 | 
             
                    movdqa  xmmword ptr [rsp], xmm6
         | 
| 1918 1924 | 
             
                    movdqa  xmmword ptr [rsp+0x10], xmm7
         | 
| 1919 1925 | 
             
                    movdqa  xmmword ptr [rsp+0x20], xmm8
         | 
| 1920 1926 | 
             
                    movdqa  xmmword ptr [rsp+0x30], xmm9
         | 
| 1927 | 
            +
                    movdqa  xmmword ptr [rsp+0x40], xmm11
         | 
| 1928 | 
            +
                    movdqa  xmmword ptr [rsp+0x50], xmm14
         | 
| 1929 | 
            +
                    movdqa  xmmword ptr [rsp+0x60], xmm15
         | 
| 1921 1930 | 
             
                    movups  xmm0, xmmword ptr [rcx]
         | 
| 1922 1931 | 
             
                    movups  xmm1, xmmword ptr [rcx+0x10]
         | 
| 1923 1932 | 
             
                    movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
         | 
| 1924 | 
            -
                    movzx   eax, byte ptr [rsp+ | 
| 1933 | 
            +
                    movzx   eax, byte ptr [rsp+0xA0]
         | 
| 1925 1934 | 
             
                    movzx   r8d, r8b
         | 
| 1926 | 
            -
                    mov     r10, qword ptr [rsp+ | 
| 1935 | 
            +
                    mov     r10, qword ptr [rsp+0xA8]
         | 
| 1927 1936 | 
             
                    shl     rax, 32
         | 
| 1928 1937 | 
             
                    add     r8, rax
         | 
| 1929 1938 | 
             
                    movq    xmm3, r9
         | 
| @@ -2026,7 +2035,10 @@ blake3_compress_xof_sse41: | |
| 2026 2035 | 
             
                    movdqa  xmm7, xmmword ptr [rsp+0x10]
         | 
| 2027 2036 | 
             
                    movdqa  xmm8, xmmword ptr [rsp+0x20]
         | 
| 2028 2037 | 
             
                    movdqa  xmm9, xmmword ptr [rsp+0x30]
         | 
| 2029 | 
            -
                     | 
| 2038 | 
            +
                    movdqa  xmm11, xmmword ptr [rsp+0x40]
         | 
| 2039 | 
            +
                    movdqa  xmm14, xmmword ptr [rsp+0x50]
         | 
| 2040 | 
            +
                    movdqa  xmm15, xmmword ptr [rsp+0x60]
         | 
| 2041 | 
            +
                    add     rsp, 120
         | 
| 2030 2042 | 
             
                    ret
         | 
| 2031 2043 |  | 
| 2032 2044 |  | 
| @@ -1802,15 +1802,18 @@ blake3_hash_many_sse41 ENDP | |
| 1802 1802 |  | 
| 1803 1803 | 
             
            blake3_compress_in_place_sse41 PROC
         | 
| 1804 1804 | 
             
            _blake3_compress_in_place_sse41 PROC
         | 
| 1805 | 
            -
                    sub     rsp,  | 
| 1805 | 
            +
                    sub     rsp, 120
         | 
| 1806 1806 | 
             
                    movdqa  xmmword ptr [rsp], xmm6
         | 
| 1807 1807 | 
             
                    movdqa  xmmword ptr [rsp+10H], xmm7
         | 
| 1808 1808 | 
             
                    movdqa  xmmword ptr [rsp+20H], xmm8
         | 
| 1809 1809 | 
             
                    movdqa  xmmword ptr [rsp+30H], xmm9
         | 
| 1810 | 
            +
                    movdqa  xmmword ptr [rsp+40H], xmm11
         | 
| 1811 | 
            +
                    movdqa  xmmword ptr [rsp+50H], xmm14
         | 
| 1812 | 
            +
                    movdqa  xmmword ptr [rsp+60H], xmm15
         | 
| 1810 1813 | 
             
                    movups  xmm0, xmmword ptr [rcx]
         | 
| 1811 1814 | 
             
                    movups  xmm1, xmmword ptr [rcx+10H]
         | 
| 1812 1815 | 
             
                    movaps  xmm2, xmmword ptr [BLAKE3_IV]
         | 
| 1813 | 
            -
                    movzx   eax, byte ptr [rsp+ | 
| 1816 | 
            +
                    movzx   eax, byte ptr [rsp+0A0H]
         | 
| 1814 1817 | 
             
                    movzx   r8d, r8b
         | 
| 1815 1818 | 
             
                    shl     rax, 32
         | 
| 1816 1819 | 
             
                    add     r8, rax
         | 
| @@ -1908,7 +1911,10 @@ _blake3_compress_in_place_sse41 PROC | |
| 1908 1911 | 
             
                    movdqa  xmm7, xmmword ptr [rsp+10H]
         | 
| 1909 1912 | 
             
                    movdqa  xmm8, xmmword ptr [rsp+20H]
         | 
| 1910 1913 | 
             
                    movdqa  xmm9, xmmword ptr [rsp+30H]
         | 
| 1911 | 
            -
                     | 
| 1914 | 
            +
                    movdqa  xmm11, xmmword ptr [rsp+40H]
         | 
| 1915 | 
            +
                    movdqa  xmm14, xmmword ptr [rsp+50H]
         | 
| 1916 | 
            +
                    movdqa  xmm15, xmmword ptr [rsp+60H]
         | 
| 1917 | 
            +
                    add     rsp, 120
         | 
| 1912 1918 | 
             
                    ret
         | 
| 1913 1919 | 
             
            _blake3_compress_in_place_sse41 ENDP
         | 
| 1914 1920 | 
             
            blake3_compress_in_place_sse41 ENDP
         | 
| @@ -1916,17 +1922,20 @@ blake3_compress_in_place_sse41 ENDP | |
| 1916 1922 | 
             
            ALIGN 16
         | 
| 1917 1923 | 
             
            blake3_compress_xof_sse41 PROC
         | 
| 1918 1924 | 
             
            _blake3_compress_xof_sse41 PROC
         | 
| 1919 | 
            -
                    sub     rsp,  | 
| 1925 | 
            +
                    sub     rsp, 120
         | 
| 1920 1926 | 
             
                    movdqa  xmmword ptr [rsp], xmm6
         | 
| 1921 1927 | 
             
                    movdqa  xmmword ptr [rsp+10H], xmm7
         | 
| 1922 1928 | 
             
                    movdqa  xmmword ptr [rsp+20H], xmm8
         | 
| 1923 1929 | 
             
                    movdqa  xmmword ptr [rsp+30H], xmm9
         | 
| 1930 | 
            +
                    movdqa  xmmword ptr [rsp+40H], xmm11
         | 
| 1931 | 
            +
                    movdqa  xmmword ptr [rsp+50H], xmm14
         | 
| 1932 | 
            +
                    movdqa  xmmword ptr [rsp+60H], xmm15
         | 
| 1924 1933 | 
             
                    movups  xmm0, xmmword ptr [rcx]
         | 
| 1925 1934 | 
             
                    movups  xmm1, xmmword ptr [rcx+10H]
         | 
| 1926 1935 | 
             
                    movaps  xmm2, xmmword ptr [BLAKE3_IV]
         | 
| 1927 | 
            -
                    movzx   eax, byte ptr [rsp+ | 
| 1936 | 
            +
                    movzx   eax, byte ptr [rsp+0A0H]
         | 
| 1928 1937 | 
             
                    movzx   r8d, r8b
         | 
| 1929 | 
            -
                    mov     r10, qword ptr [rsp+ | 
| 1938 | 
            +
                    mov     r10, qword ptr [rsp+0A8H]
         | 
| 1930 1939 | 
             
                    shl     rax, 32
         | 
| 1931 1940 | 
             
                    add     r8, rax
         | 
| 1932 1941 | 
             
                    movq    xmm3, r9
         | 
| @@ -2029,7 +2038,10 @@ _blake3_compress_xof_sse41 PROC | |
| 2029 2038 | 
             
                    movdqa  xmm7, xmmword ptr [rsp+10H]
         | 
| 2030 2039 | 
             
                    movdqa  xmm8, xmmword ptr [rsp+20H]
         | 
| 2031 2040 | 
             
                    movdqa  xmm9, xmmword ptr [rsp+30H]
         | 
| 2032 | 
            -
                     | 
| 2041 | 
            +
                    movdqa  xmm11, xmmword ptr [rsp+40H]
         | 
| 2042 | 
            +
                    movdqa  xmm14, xmmword ptr [rsp+50H]
         | 
| 2043 | 
            +
                    movdqa  xmm15, xmmword ptr [rsp+60H]
         | 
| 2044 | 
            +
                    add     rsp, 120
         | 
| 2033 2045 | 
             
                    ret
         | 
| 2034 2046 | 
             
            _blake3_compress_xof_sse41 ENDP
         | 
| 2035 2047 | 
             
            blake3_compress_xof_sse41 ENDP
         | 
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: digest-blake3
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0. | 
| 4 | 
            +
              version: 0.34.0
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Will Bryant
         | 
| 8 8 | 
             
            autorequire: 
         | 
| 9 9 | 
             
            bindir: exe
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date: 2020- | 
| 11 | 
            +
            date: 2020-06-28 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: bundler
         | 
| @@ -112,8 +112,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement | |
| 112 112 | 
             
                - !ruby/object:Gem::Version
         | 
| 113 113 | 
             
                  version: '0'
         | 
| 114 114 | 
             
            requirements: []
         | 
| 115 | 
            -
             | 
| 116 | 
            -
            rubygems_version: 2.7.6
         | 
| 115 | 
            +
            rubygems_version: 3.0.3
         | 
| 117 116 | 
             
            signing_key: 
         | 
| 118 117 | 
             
            specification_version: 4
         | 
| 119 118 | 
             
            summary: BLAKE3 for Ruby
         |