RubyGems - digest-blake3 - Versions diffs - 0.37.0.1 → 1.3.3.1 - Mend

digest-blake3 0.37.0.1 → 1.3.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

checksums.yaml +4 -4
data/Gemfile.lock +4 -4
data/ext/digest/blake3/blake3.c +16 -3
data/ext/digest/blake3/blake3.h +4 -2
data/ext/digest/blake3/blake3_avx2.c +4 -3
data/ext/digest/blake3/blake3_avx512.c +26 -10
data/ext/digest/blake3/blake3_dispatch.c +11 -5
data/ext/digest/blake3/blake3_impl.h +17 -5
data/ext/digest/blake3/blake3_neon.c +6 -1
data/ext/digest/blake3/blake3_sse2.c +5 -4
data/ext/digest/blake3/blake3_sse2_x86-64_unix.S +2 -2
data/ext/digest/blake3/blake3_sse2_x86-64_windows_gnu.S +8 -8
data/ext/digest/blake3/blake3_sse2_x86-64_windows_msvc.asm +10 -10
data/ext/digest/blake3/blake3_sse41.c +4 -3
data/ext/digest/blake3/blake3_sse41_x86-64_windows_msvc.asm +4 -4
data/ext/digest/blake3/extconf.rb +6 -4
data/lib/digest/blake3/version.rb +1 -1
metadata +6 -6

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 4dc981436633bde6ba4fb278252d8a4a1ba58d039d0b1c8c794e36c4e47fa4a0
-  data.tar.gz: ae40be72a0252730792f3e82a00da765546c9606d91405ff69d3bad078ad307f
+  metadata.gz: 6e9f7d8e4619bac26fee8eceb10b221935755588c05f333e53275080c61b83a5
+  data.tar.gz: fecaddd526e8bf374d675e80d49aefbcbed12dc491a6be50ea9beea335b943fd
 SHA512:
-  metadata.gz: c18ca69b1f4b47ac8308ee00cc6db861eb48bd3921a85c291cb09ea595534b1a476988453c1931cb9982e9f96e7d14e4fa4356cbecaa9c584252c9b7ad30ac62
-  data.tar.gz: b9cf5f04daf5d83a797191caa2f2c30e068ddd6b771d887acb96963b93550171e77ea81128fffc316b49641391a4f245a8484b8b909d53502be3a2fb3170ad76
+  metadata.gz: 6ecd8cdc8f5b320f152e4d89c263b68803f63ec609235965eebc11a8093d146bb7223b86302c24c0dc65abb69cbf6040e5635704fb07caeb115c8f76effe38c2
+  data.tar.gz: ec642c10fb95e51620ad78fa2915d467dc665b7c1683f8f30c56ab9ab5560a98983e6afdd4fca1c18d2088f0bd0e58248ea30a05230bbed48ff95d74851cd056

data/Gemfile.lock CHANGED Viewed

@@ -1,13 +1,13 @@
 PATH
   remote: .
   specs:
-    digest-blake3 (0.37.0)
+    digest-blake3 (1.3.3.1)
 GEM
   remote: https://rubygems.org/
   specs:
-    minitest (5.14.0)
-    rake (13.0.1)
+    minitest (5.16.3)
+    rake (13.0.6)
 PLATFORMS
   ruby
@@ -19,4 +19,4 @@ DEPENDENCIES
   rake
 BUNDLED WITH
-   1.17.3
+   2.3.1

data/ext/digest/blake3/blake3.c CHANGED Viewed

@@ -5,6 +5,8 @@
 #include "blake3.h"
 #include "blake3_impl.h"
+const char *blake3_version(void) { return BLAKE3_VERSION_STRING; }
 INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8],
                              uint8_t flags) {
   memcpy(self->cv, key, BLAKE3_KEY_LEN);
@@ -244,7 +246,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
 // The wide helper function returns (writes out) an array of chaining values
 // and returns the length of that array. The number of chaining values returned
-// is the dyanmically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
+// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
 // if the input is shorter than that many chunks. The reason for maintaining a
 // wide array of chaining values going back up the tree, is to allow the
 // implementation to hash as many parents in parallel as possible.
@@ -252,7 +254,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
 // As a special case when the SIMD degree is 1, this function will still return
 // at least 2 outputs. This guarantees that this function doesn't perform the
 // root compression. (If it did, it would use the wrong flags, and also we
-// wouldn't be able to implement exendable ouput.) Note that this function is
+// wouldn't be able to implement extendable output.) Note that this function is
 // not used when the whole input is only 1 chunk long; that's a different
 // codepath.
 //
@@ -338,12 +340,18 @@ INLINE void compress_subtree_to_parent_node(
   uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
   size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
                                                 chunk_counter, flags, cv_array);
+  assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
   // If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
   // compress_subtree_wide() returns more than 2 chaining values. Condense
   // them into 2 by forming parent nodes repeatedly.
   uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
-  while (num_cvs > 2) {
+  // The second half of this loop condition is always true, and we just
+  // asserted it above. But GCC can't tell that it's always true, and if NDEBUG
+  // is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
+  // warnings here. GCC 8.5 is particularly sensitive, so if you're changing
+  // this code, test it against that version.
+  while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
     num_cvs =
         compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
     memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
@@ -601,3 +609,8 @@ void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
   }
   output_root_bytes(&output, seek, out, out_len);
 }
+void blake3_hasher_reset(blake3_hasher *self) {
+  chunk_state_reset(&self->chunk, self->key, 0);
+  self->cv_stack_len = 0;
+}

data/ext/digest/blake3/blake3.h CHANGED Viewed

@@ -8,12 +8,12 @@
 extern "C" {
 #endif
+#define BLAKE3_VERSION_STRING "1.3.3"
 #define BLAKE3_KEY_LEN 32
 #define BLAKE3_OUT_LEN 32
 #define BLAKE3_BLOCK_LEN 64
 #define BLAKE3_CHUNK_LEN 1024
 #define BLAKE3_MAX_DEPTH 54
-#define BLAKE3_MAX_SIMD_DEGREE 16
 // This struct is a private implementation detail. It has to be here because
 // it's part of blake3_hasher below.
@@ -38,11 +38,12 @@ typedef struct {
   uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
 } blake3_hasher;
+const char *blake3_version(void);
 void blake3_hasher_init(blake3_hasher *self);
 void blake3_hasher_init_keyed(blake3_hasher *self,
                               const uint8_t key[BLAKE3_KEY_LEN]);
 void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
-void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
+void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
                                        size_t context_len);
 void blake3_hasher_update(blake3_hasher *self, const void *input,
                           size_t input_len);
@@ -50,6 +51,7 @@ void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
                             size_t out_len);
 void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
                                  uint8_t *out, size_t out_len);
+void blake3_hasher_reset(blake3_hasher *self);
 #ifdef __cplusplus
 }

data/ext/digest/blake3/blake3_avx2.c CHANGED Viewed

@@ -208,7 +208,7 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
   out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
   out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
   for (size_t i = 0; i < 8; ++i) {
-    _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
+    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
   }
   transpose_vecs(&out[0]);
   transpose_vecs(&out[8]);
@@ -219,14 +219,15 @@ INLINE void load_counters(uint64_t counter, bool increment_counter,
   const __m256i mask = _mm256_set1_epi32(-(int32_t)increment_counter);
   const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
   const __m256i add1 = _mm256_and_si256(mask, add0);
-  __m256i l = _mm256_add_epi32(_mm256_set1_epi32(counter), add1);
+  __m256i l = _mm256_add_epi32(_mm256_set1_epi32((int32_t)counter), add1);
   __m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)),
                                      _mm256_xor_si256(   l, _mm256_set1_epi32(0x80000000)));
-  __m256i h = _mm256_sub_epi32(_mm256_set1_epi32(counter >> 32), carry);
+  __m256i h = _mm256_sub_epi32(_mm256_set1_epi32((int32_t)(counter >> 32)), carry);
   *out_lo = l;
   *out_hi = h;
 }
+static
 void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks,
                        const uint32_t key[8], uint64_t counter,
                        bool increment_counter, uint8_t flags,

data/ext/digest/blake3/blake3_avx512.c CHANGED Viewed

@@ -468,7 +468,7 @@ INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
   out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
   out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
   for (size_t i = 0; i < 4; ++i) {
-    _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
+    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
   }
   transpose_vecs_128(&out[0]);
   transpose_vecs_128(&out[4]);
@@ -488,6 +488,7 @@ INLINE void load_counters4(uint64_t counter, bool increment_counter,
   *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32));
 }
+static
 void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks,
                          const uint32_t key[8], uint64_t counter,
                          bool increment_counter, uint8_t flags,
@@ -724,7 +725,7 @@ INLINE void transpose_msg_vecs8(const uint8_t *const *inputs,
   out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
   out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
   for (size_t i = 0; i < 8; ++i) {
-    _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
+    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
   }
   transpose_vecs_256(&out[0]);
   transpose_vecs_256(&out[8]);
@@ -742,6 +743,7 @@ INLINE void load_counters8(uint64_t counter, bool increment_counter,
   *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32));
 }
+static
 void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks,
                          const uint32_t key[8], uint64_t counter,
                          bool increment_counter, uint8_t flags,
@@ -1037,7 +1039,7 @@ INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
   out[14] = loadu_512(&inputs[14][block_offset]);
   out[15] = loadu_512(&inputs[15][block_offset]);
   for (size_t i = 0; i < 16; ++i) {
-    _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
+    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
   }
   transpose_vecs_512(out);
 }
@@ -1045,15 +1047,29 @@ INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
 INLINE void load_counters16(uint64_t counter, bool increment_counter,
                             __m512i *out_lo, __m512i *out_hi) {
   const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter);
-  const __m512i add0 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-  const __m512i add1 = _mm512_and_si512(mask, add0);
-  __m512i l = _mm512_add_epi32(_mm512_set1_epi32(counter), add1);
-  __mmask16 carry = _mm512_cmp_epu32_mask(l, add1, _MM_CMPINT_LT);
-  __m512i h = _mm512_mask_add_epi32(_mm512_set1_epi32(counter >> 32), carry, _mm512_set1_epi32(counter >> 32), _mm512_set1_epi32(1));
-  *out_lo = l;
-  *out_hi = h;
+  const __m512i deltas = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  const __m512i masked_deltas = _mm512_and_si512(deltas, mask);
+  const __m512i low_words = _mm512_add_epi32(
+    _mm512_set1_epi32((int32_t)counter),
+    masked_deltas);
+  // The carry bit is 1 if the high bit of the word was 1 before addition and is
+  // 0 after.
+  // NOTE: It would be a bit more natural to use _mm512_cmp_epu32_mask to
+  // compute the carry bits here, and originally we did, but that intrinsic is
+  // broken under GCC 5.4. See https://github.com/BLAKE3-team/BLAKE3/issues/271.
+  const __m512i carries = _mm512_srli_epi32(
+    _mm512_andnot_si512(
+        low_words, // 0 after (gets inverted by andnot)
+        _mm512_set1_epi32((int32_t)counter)), // and 1 before
+    31);
+  const __m512i high_words = _mm512_add_epi32(
+    _mm512_set1_epi32((int32_t)(counter >> 32)),
+    carries);
+  *out_lo = low_words;
+  *out_hi = high_words;
 }
+static
 void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
                           const uint32_t key[8], uint64_t counter,
                           bool increment_counter, uint8_t flags,

data/ext/digest/blake3/blake3_dispatch.c CHANGED Viewed

@@ -10,12 +10,14 @@
 #elif defined(__GNUC__)
 #include <immintrin.h>
 #else
-#error "Unimplemented!"
+#undef IS_X86 /* Unimplemented! */
 #endif
 #endif
+#define MAYBE_UNUSED(x) (void)((x))
 #if defined(IS_X86)
-static uint64_t xgetbv() {
+static uint64_t xgetbv(void) {
 #if defined(_MSC_VER)
   return _xgetbv(0);
 #else
@@ -80,7 +82,7 @@ static /* Allow the variable to be controlled manually for testing */
 static
 #endif
     enum cpu_feature
-    get_cpu_features() {
+    get_cpu_features(void) {
   if (g_cpu_features != UNDEFINED) {
     return g_cpu_features;
@@ -137,6 +139,7 @@ void blake3_compress_in_place(uint32_t cv[8],
                               uint8_t flags) {
 #if defined(IS_X86)
   const enum cpu_feature features = get_cpu_features();
+  MAYBE_UNUSED(features);
 #if !defined(BLAKE3_NO_AVX512)
   if (features & AVX512VL) {
     blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
@@ -165,6 +168,7 @@ void blake3_compress_xof(const uint32_t cv[8],
                          uint8_t out[64]) {
 #if defined(IS_X86)
   const enum cpu_feature features = get_cpu_features();
+  MAYBE_UNUSED(features);
 #if !defined(BLAKE3_NO_AVX512)
   if (features & AVX512VL) {
     blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
@@ -193,6 +197,7 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                       uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
 #if defined(IS_X86)
   const enum cpu_feature features = get_cpu_features();
+  MAYBE_UNUSED(features);
 #if !defined(BLAKE3_NO_AVX512)
   if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
     blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
@@ -227,7 +232,7 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
 #endif
 #endif
-#if defined(BLAKE3_USE_NEON)
+#if BLAKE3_USE_NEON == 1
   blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
                         increment_counter, flags, flags_start, flags_end, out);
   return;
@@ -242,6 +247,7 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
 size_t blake3_simd_degree(void) {
 #if defined(IS_X86)
   const enum cpu_feature features = get_cpu_features();
+  MAYBE_UNUSED(features);
 #if !defined(BLAKE3_NO_AVX512)
   if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
     return 16;
@@ -263,7 +269,7 @@ size_t blake3_simd_degree(void) {
   }
 #endif
 #endif
-#if defined(BLAKE3_USE_NEON)
+#if BLAKE3_USE_NEON == 1
   return 4;
 #endif
   return 1;

data/ext/digest/blake3/blake3_impl.h CHANGED Viewed

@@ -38,16 +38,28 @@ enum blake3_flags {
 #define IS_X86_32
 #endif
+#if defined(__aarch64__) || defined(_M_ARM64)
+#define IS_AARCH64
+#endif
 #if defined(IS_X86)
 #if defined(_MSC_VER)
 #include <intrin.h>
 #endif
-#include <immintrin.h>
+#endif
+#if !defined(BLAKE3_USE_NEON)
+  // If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
+  #if defined(IS_AARCH64)
+    #define BLAKE3_USE_NEON 1
+  #else
+    #define BLAKE3_USE_NEON 0
+  #endif
 #endif
 #if defined(IS_X86)
 #define MAX_SIMD_DEGREE 16
-#elif defined(BLAKE3_USE_NEON)
+#elif BLAKE3_USE_NEON == 1
 #define MAX_SIMD_DEGREE 4
 #else
 #define MAX_SIMD_DEGREE 1
@@ -83,11 +95,11 @@ static unsigned int highest_one(uint64_t x) {
 #elif defined(_MSC_VER) && defined(IS_X86_32)
   if(x >> 32) {
     unsigned long index;
-    _BitScanReverse(&index, x >> 32);
+    _BitScanReverse(&index, (unsigned long)(x >> 32));
     return 32 + index;
   } else {
     unsigned long index;
-    _BitScanReverse(&index, x);
+    _BitScanReverse(&index, (unsigned long)x);
     return index;
   }
 #else
@@ -257,7 +269,7 @@ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
 #endif
 #endif
-#if defined(BLAKE3_USE_NEON)
+#if BLAKE3_USE_NEON == 1
 void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
                            size_t blocks, const uint32_t key[8],
                            uint64_t counter, bool increment_counter,

data/ext/digest/blake3/blake3_neon.c CHANGED Viewed

@@ -2,7 +2,12 @@
 #include <arm_neon.h>
-// TODO: This is probably incorrect for big-endian ARM. How should that work?
+#ifdef __ARM_BIG_ENDIAN
+#error "This implementation only supports little-endian ARM."
+// It might be that all we need for big-endian support here is to get the loads
+// and stores right, but step zero would be finding a way to test it in CI.
+#endif
 INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
   // vld1q_u32 has alignment requirements. Don't use it.
   uint32x4_t x;

data/ext/digest/blake3/blake3_sse2.c CHANGED Viewed

@@ -78,7 +78,7 @@ INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
   *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
 }
-INLINE __m128i blend_epi16(__m128i a, __m128i b, const int imm8) {
+INLINE __m128i blend_epi16(__m128i a, __m128i b, const int16_t imm8) {
   const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
   __m128i mask = _mm_set1_epi16(imm8);
   mask = _mm_and_si128(mask, bits);
@@ -435,7 +435,7 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
   out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
   out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
   for (size_t i = 0; i < 4; ++i) {
-    _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
+    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
   }
   transpose_vecs(&out[0]);
   transpose_vecs(&out[4]);
@@ -448,14 +448,15 @@ INLINE void load_counters(uint64_t counter, bool increment_counter,
   const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
   const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
   const __m128i add1 = _mm_and_si128(mask, add0);
-  __m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1);
+  __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
   __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
                                   _mm_xor_si128(   l, _mm_set1_epi32(0x80000000)));
-  __m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry);
+  __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
   *out_lo = l;
   *out_hi = h;
 }
+static
 void blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks,
                        const uint32_t key[8], uint64_t counter,
                        bool increment_counter, uint8_t flags,

data/ext/digest/blake3/blake3_sse2_x86-64_unix.S CHANGED Viewed

@@ -1704,7 +1704,7 @@ blake3_hash_many_sse2:
         pshufd  xmm15, xmm11, 0x93
         shl     rax, 0x20
         or      rax, 0x40
-        movd    xmm3, rax
+        movq    xmm3, rax
         movdqa  xmmword ptr [rsp+0x20], xmm3
         movaps  xmm3, xmmword ptr [rsp]
         movaps  xmm11, xmmword ptr [rsp+0x10]
@@ -1917,7 +1917,7 @@ blake3_hash_many_sse2:
         movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
         shl     rax, 32
         or      rax, 64
-        movd    xmm12, rax
+        movq    xmm12, rax
         movdqa  xmm3, xmm13
         punpcklqdq xmm3, xmm12
         movups  xmm4, xmmword ptr [r8+rdx-0x40]

data/ext/digest/blake3/blake3_sse2_x86-64_windows_gnu.S CHANGED Viewed

@@ -1715,7 +1715,7 @@ blake3_hash_many_sse2:
         pshufd  xmm15, xmm11, 0x93
         shl     rax, 0x20
         or      rax, 0x40
-        movd    xmm3, rax
+        movq    xmm3, rax
         movdqa  xmmword ptr [rsp+0x20], xmm3
         movaps  xmm3, xmmword ptr [rsp]
         movaps  xmm11, xmmword ptr [rsp+0x10]
@@ -1928,7 +1928,7 @@ blake3_hash_many_sse2:
         movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
         shl     rax, 32
         or      rax, 64
-        movd    xmm12, rax
+        movq    xmm12, rax
         movdqa  xmm3, xmm13
         punpcklqdq xmm3, xmm12
         movups  xmm4, xmmword ptr [r8+rdx-0x40]
@@ -2137,10 +2137,10 @@ _blake3_compress_in_place_sse2:
         por     xmm9, xmm8
         movdqa  xmm8, xmm7
         punpcklqdq xmm8, xmm5
-        movdqa  xmm10, xmm6
+        movdqa  xmm14, xmm6
         pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
-        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
-        por     xmm8, xmm10
+        pand    xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+        por     xmm8, xmm14
         pshufd  xmm8, xmm8, 0x78
         punpckhdq xmm5, xmm7
         punpckldq xmm6, xmm5
@@ -2268,10 +2268,10 @@ blake3_compress_xof_sse2:
         por     xmm9, xmm8
         movdqa  xmm8, xmm7
         punpcklqdq xmm8, xmm5
-        movdqa  xmm10, xmm6
+        movdqa  xmm14, xmm6
         pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
-        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
-        por     xmm8, xmm10
+        pand    xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip]
+        por     xmm8, xmm14
         pshufd  xmm8, xmm8, 0x78
         punpckhdq xmm5, xmm7
         punpckldq xmm6, xmm5

data/ext/digest/blake3/blake3_sse2_x86-64_windows_msvc.asm CHANGED Viewed

@@ -2054,8 +2054,8 @@ _blake3_compress_in_place_sse2 PROC
         movzx   r8d, r8b
         shl     rax, 32
         add     r8, rax
-        movq    xmm3, r9
-        movq    xmm4, r8
+        movd    xmm3, r9
+        movd    xmm4, r8
         punpcklqdq xmm3, xmm4
         movups  xmm4, xmmword ptr [rdx]
         movups  xmm5, xmmword ptr [rdx+10H]
@@ -2139,10 +2139,10 @@ _blake3_compress_in_place_sse2 PROC
         por     xmm9, xmm8
         movdqa  xmm8, xmm7
         punpcklqdq xmm8, xmm5
-        movdqa  xmm10, xmm6
+        movdqa  xmm14, xmm6
         pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
-        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
-        por     xmm8, xmm10
+        pand    xmm14, xmmword ptr [PBLENDW_0xC0_MASK]
+        por     xmm8, xmm14
         pshufd  xmm8, xmm8, 78H
         punpckhdq xmm5, xmm7
         punpckldq xmm6, xmm5
@@ -2186,8 +2186,8 @@ _blake3_compress_xof_sse2 PROC
         mov     r10, qword ptr [rsp+0A8H]
         shl     rax, 32
         add     r8, rax
-        movq    xmm3, r9
-        movq    xmm4, r8
+        movd    xmm3, r9
+        movd    xmm4, r8
         punpcklqdq xmm3, xmm4
         movups  xmm4, xmmword ptr [rdx]
         movups  xmm5, xmmword ptr [rdx+10H]
@@ -2271,10 +2271,10 @@ _blake3_compress_xof_sse2 PROC
         por     xmm9, xmm8
         movdqa  xmm8, xmm7
         punpcklqdq xmm8, xmm5
-        movdqa  xmm10, xmm6
+        movdqa  xmm14, xmm6
         pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
-        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
-        por     xmm8, xmm10
+        pand    xmm14, xmmword ptr [PBLENDW_0xC0_MASK]
+        por     xmm8, xmm14
         pshufd  xmm8, xmm8, 78H
         punpckhdq xmm5, xmm7
         punpckldq xmm6, xmm5

data/ext/digest/blake3/blake3_sse41.c CHANGED Viewed

@@ -429,7 +429,7 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
   out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
   out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
   for (size_t i = 0; i < 4; ++i) {
-    _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
+    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
   }
   transpose_vecs(&out[0]);
   transpose_vecs(&out[4]);
@@ -442,14 +442,15 @@ INLINE void load_counters(uint64_t counter, bool increment_counter,
   const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
   const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
   const __m128i add1 = _mm_and_si128(mask, add0);
-  __m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1);
+  __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
   __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
                                   _mm_xor_si128(   l, _mm_set1_epi32(0x80000000)));
-  __m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry);
+  __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
   *out_lo = l;
   *out_hi = h;
 }
+static
 void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks,
                         const uint32_t key[8], uint64_t counter,
                         bool increment_counter, uint8_t flags,

data/ext/digest/blake3/blake3_sse41_x86-64_windows_msvc.asm CHANGED Viewed

@@ -1817,8 +1817,8 @@ _blake3_compress_in_place_sse41 PROC
         movzx   r8d, r8b
         shl     rax, 32
         add     r8, rax
-        movq    xmm3, r9
-        movq    xmm4, r8
+        movd    xmm3, r9
+        movd    xmm4, r8
         punpcklqdq xmm3, xmm4
         movups  xmm4, xmmword ptr [rdx]
         movups  xmm5, xmmword ptr [rdx+10H]
@@ -1938,8 +1938,8 @@ _blake3_compress_xof_sse41 PROC
         mov     r10, qword ptr [rsp+0A8H]
         shl     rax, 32
         add     r8, rax
-        movq    xmm3, r9
-        movq    xmm4, r8
+        movd    xmm3, r9
+        movd    xmm4, r8
         punpcklqdq xmm3, xmm4
         movups  xmm4, xmmword ptr [rdx]
         movups  xmm5, xmmword ptr [rdx+10H]

data/ext/digest/blake3/extconf.rb CHANGED Viewed

@@ -34,10 +34,12 @@ def check_supported_flags(flags, obj_if_enabled, def_if_disabled)
   end
 end
-check_supported_flags("-msse2",                          "blake3_sse2.o",   "-DBLAKE3_NO_SSE2")
-check_supported_flags("-msse4.1",                        "blake3_sse41.o",  "-DBLAKE3_NO_SSE41")
-check_supported_flags("-mavx2",                          "blake3_avx2.o",   "-DBLAKE3_NO_AVX2")
-check_supported_flags("-mavx512f -mavx512vl -mavx512bw", "blake3_avx512.o", "-DBLAKE3_NO_AVX512")
+unless RUBY_PLATFORM.include? 'arm64' or RUBY_PLATFORM.include? 'aarch64'
+  check_supported_flags("-msse2",                          "blake3_sse2.o",   "-DBLAKE3_NO_SSE2")
+  check_supported_flags("-msse4.1",                        "blake3_sse41.o",  "-DBLAKE3_NO_SSE41")
+  check_supported_flags("-mavx2",                          "blake3_avx2.o",   "-DBLAKE3_NO_AVX2")
+  check_supported_flags("-mavx512f -mavx512vl -mavx512bw", "blake3_avx512.o", "-DBLAKE3_NO_AVX512")
+end
 if have_header("arm_neon.h")
   $objs << "blake3_neon.o"

data/lib/digest/blake3/version.rb CHANGED Viewed

@@ -2,6 +2,6 @@ require 'digest'
 module Digest
   class BLAKE3 < Base
-    VERSION = "0.37.0.1"
+    VERSION = "1.3.3.1"
   end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: digest-blake3
 version: !ruby/object:Gem::Version
-  version: 0.37.0.1
+  version: 1.3.3.1
 platform: ruby
 authors:
 - Will Bryant
-autorequire:
+autorequire:
 bindir: exe
 cert_chain: []
-date: 2020-10-18 00:00:00.000000000 Z
+date: 2022-12-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -100,7 +100,7 @@ homepage: https://github.com/willbryant/digest-blake3
 licenses:
 - MIT
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -116,8 +116,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.0.3
-signing_key:
+rubygems_version: 3.1.6
+signing_key:
 specification_version: 4
 summary: BLAKE3 for Ruby
 test_files: []