digest-blake3 1.4.0.0 → 1.5.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 600afca6f08145f3e28b49fbe757b661368d58dc8ec20e1778a915407dcc660a
4
- data.tar.gz: 1cd455e9caf97fd0f514623ba6b9d7f74249071af8cb8b5377ba10de91c5eb34
3
+ metadata.gz: 2bf10e44aaa74a31f9a334b67ecfadfeda7f31a4d3055bd48c5f1a8609e53661
4
+ data.tar.gz: 53072abb4749ecdfd6748360fc33d39789078413078858953bffcd1ae1cfcdaf
5
5
  SHA512:
6
- metadata.gz: d515228fab5f92576d9b1f67d66ffff97623f154dcab9a1dcb140b9e69884325797991d1f87d7f5cb26cb6397a86e989a1c1f2163b478641efdc4b85b5772026
7
- data.tar.gz: ed75418dda098a8700554b871189c9995e1e6e4969d86cc8d9afd46d8439ecf985fb111f1a6d6adb9173f68cb33c7c261b35b0820439c48a92e354da2f662e35
6
+ metadata.gz: b93a9bdf8b7f2fa4986090e466dfe6c0661d6e9e2de7864cb83846f314a07f198ce16bb6289edb50cf1a7333463136049d61c5ee6fa3db9c8c4855e229fd93a7
7
+ data.tar.gz: df226266cb38882b121c401d074ad84559265751c897fccda004a8f3d230c2e67215ee6086cfac13fc7043316955511179de4c024dd9d4b5f9048a948ea52080
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- digest-blake3 (1.3.3.1)
4
+ digest-blake3 (1.4.1.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -254,7 +254,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
254
254
  // As a special case when the SIMD degree is 1, this function will still return
255
255
  // at least 2 outputs. This guarantees that this function doesn't perform the
256
256
  // root compression. (If it did, it would use the wrong flags, and also we
257
- // wouldn't be able to implement exendable output.) Note that this function is
257
+ // wouldn't be able to implement extendable output.) Note that this function is
258
258
  // not used when the whole input is only 1 chunk long; that's a different
259
259
  // codepath.
260
260
  //
@@ -341,21 +341,24 @@ INLINE void compress_subtree_to_parent_node(
341
341
  size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
342
342
  chunk_counter, flags, cv_array);
343
343
  assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
344
-
345
- // If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
344
+ // The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because
345
+ // as we just asserted, num_cvs will always be <=2 in that case. But GCC
346
+ // (particularly GCC 8.5) can't tell that it never executes, and if NDEBUG is
347
+ // set then it emits incorrect warnings here. We tried a few different
348
+ // hacks to silence these, but in the end our hacks just produced different
349
+ // warnings (see https://github.com/BLAKE3-team/BLAKE3/pull/380). Out of
350
+ // desperation, we ifdef out this entire loop when we know it's not needed.
351
+ #if MAX_SIMD_DEGREE_OR_2 > 2
352
+ // If MAX_SIMD_DEGREE_OR_2 is greater than 2 and there's enough input,
346
353
  // compress_subtree_wide() returns more than 2 chaining values. Condense
347
354
  // them into 2 by forming parent nodes repeatedly.
348
355
  uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
349
- // The second half of this loop condition is always true, and we just
350
- // asserted it above. But GCC can't tell that it's always true, and if NDEBUG
351
- // is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
352
- // warnings here. GCC 8.5 is particularly sensitive, so if you're changing
353
- // this code, test it against that version.
354
- while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
356
+ while (num_cvs > 2) {
355
357
  num_cvs =
356
358
  compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
357
359
  memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
358
360
  }
361
+ #endif
359
362
  memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
360
363
  }
361
364
 
@@ -4,11 +4,33 @@
4
4
  #include <stddef.h>
5
5
  #include <stdint.h>
6
6
 
7
+ #if !defined(BLAKE3_API)
8
+ # if defined(_WIN32) || defined(__CYGWIN__)
9
+ # if defined(BLAKE3_DLL)
10
+ # if defined(BLAKE3_DLL_EXPORTS)
11
+ # define BLAKE3_API __declspec(dllexport)
12
+ # else
13
+ # define BLAKE3_API __declspec(dllimport)
14
+ # endif
15
+ # define BLAKE3_PRIVATE
16
+ # else
17
+ # define BLAKE3_API
18
+ # define BLAKE3_PRIVATE
19
+ # endif
20
+ # elif __GNUC__ >= 4
21
+ # define BLAKE3_API __attribute__((visibility("default")))
22
+ # define BLAKE3_PRIVATE __attribute__((visibility("hidden")))
23
+ # else
24
+ # define BLAKE3_API
25
+ # define BLAKE3_PRIVATE
26
+ # endif
27
+ #endif
28
+
7
29
  #ifdef __cplusplus
8
30
  extern "C" {
9
31
  #endif
10
32
 
11
- #define BLAKE3_VERSION_STRING "1.3.3"
33
+ #define BLAKE3_VERSION_STRING "1.5.1"
12
34
  #define BLAKE3_KEY_LEN 32
13
35
  #define BLAKE3_OUT_LEN 32
14
36
  #define BLAKE3_BLOCK_LEN 64
@@ -38,20 +60,20 @@ typedef struct {
38
60
  uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
39
61
  } blake3_hasher;
40
62
 
41
- const char *blake3_version(void);
42
- void blake3_hasher_init(blake3_hasher *self);
43
- void blake3_hasher_init_keyed(blake3_hasher *self,
44
- const uint8_t key[BLAKE3_KEY_LEN]);
45
- void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
46
- void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
47
- size_t context_len);
48
- void blake3_hasher_update(blake3_hasher *self, const void *input,
49
- size_t input_len);
50
- void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
51
- size_t out_len);
52
- void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
53
- uint8_t *out, size_t out_len);
54
- void blake3_hasher_reset(blake3_hasher *self);
63
+ BLAKE3_API const char *blake3_version(void);
64
+ BLAKE3_API void blake3_hasher_init(blake3_hasher *self);
65
+ BLAKE3_API void blake3_hasher_init_keyed(blake3_hasher *self,
66
+ const uint8_t key[BLAKE3_KEY_LEN]);
67
+ BLAKE3_API void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
68
+ BLAKE3_API void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
69
+ size_t context_len);
70
+ BLAKE3_API void blake3_hasher_update(blake3_hasher *self, const void *input,
71
+ size_t input_len);
72
+ BLAKE3_API void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
73
+ size_t out_len);
74
+ BLAKE3_API void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
75
+ uint8_t *out, size_t out_len);
76
+ BLAKE3_API void blake3_hasher_reset(blake3_hasher *self);
55
77
 
56
78
  #ifdef __cplusplus
57
79
  }
@@ -167,7 +167,7 @@ INLINE void transpose_vecs(__m256i vecs[DEGREE]) {
167
167
  __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
168
168
  __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
169
169
 
170
- // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is
170
+ // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
171
171
  // 11/33.
172
172
  __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
173
173
  __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
@@ -1784,7 +1784,7 @@ blake3_hash_many_avx2:
1784
1784
  vmovdqu xmmword ptr [rbx+0x10], xmm1
1785
1785
  jmp 4b
1786
1786
 
1787
- .section .rodata
1787
+ .section .rdata
1788
1788
  .p2align 6
1789
1789
  ADD0:
1790
1790
  .long 0, 1, 2, 3, 4, 5, 6, 7
@@ -429,7 +429,7 @@ INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) {
429
429
  }
430
430
 
431
431
  INLINE void transpose_vecs_128(__m128i vecs[4]) {
432
- // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is
432
+ // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
433
433
  // 22/33. Note that this doesn't split the vector into two lanes, as the
434
434
  // AVX2 counterparts do.
435
435
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
@@ -684,7 +684,7 @@ INLINE void transpose_vecs_256(__m256i vecs[8]) {
684
684
  __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
685
685
  __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
686
686
 
687
- // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is
687
+ // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
688
688
  // 11/33.
689
689
  __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
690
690
  __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
@@ -959,7 +959,7 @@ INLINE void transpose_vecs_512(__m512i vecs[16]) {
959
959
  __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]);
960
960
  __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]);
961
961
 
962
- // Interleave 64-bit lates. The _0 unpack is lanes
962
+ // Interleave 64-bit lanes. The _0 unpack is lanes
963
963
  // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes
964
964
  // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes
965
965
  // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes
@@ -2587,7 +2587,7 @@ blake3_compress_xof_avx512:
2587
2587
  add rsp, 72
2588
2588
  ret
2589
2589
 
2590
- .section .rodata
2590
+ .section .rdata
2591
2591
  .p2align 6
2592
2592
  INDEX0:
2593
2593
  .long 0, 1, 2, 3, 16, 17, 18, 19
@@ -6,6 +6,7 @@
6
6
 
7
7
  #if defined(IS_X86)
8
8
  #if defined(_MSC_VER)
9
+ #include <Windows.h>
9
10
  #include <intrin.h>
10
11
  #elif defined(__GNUC__)
11
12
  #include <immintrin.h>
@@ -14,6 +15,32 @@
14
15
  #endif
15
16
  #endif
16
17
 
18
+ #if !defined(BLAKE3_ATOMICS)
19
+ #if defined(__has_include)
20
+ #if __has_include(<stdatomic.h>) && !defined(_MSC_VER)
21
+ #define BLAKE3_ATOMICS 1
22
+ #else
23
+ #define BLAKE3_ATOMICS 0
24
+ #endif /* __has_include(<stdatomic.h>) && !defined(_MSC_VER) */
25
+ #else
26
+ #define BLAKE3_ATOMICS 0
27
+ #endif /* defined(__has_include) */
28
+ #endif /* BLAKE3_ATOMICS */
29
+
30
+ #if BLAKE3_ATOMICS
31
+ #define ATOMIC_INT _Atomic int
32
+ #define ATOMIC_LOAD(x) x
33
+ #define ATOMIC_STORE(x, y) x = y
34
+ #elif defined(_MSC_VER)
35
+ #define ATOMIC_INT LONG
36
+ #define ATOMIC_LOAD(x) InterlockedOr(&x, 0)
37
+ #define ATOMIC_STORE(x, y) InterlockedExchange(&x, y)
38
+ #else
39
+ #define ATOMIC_INT int
40
+ #define ATOMIC_LOAD(x) x
41
+ #define ATOMIC_STORE(x, y) x = y
42
+ #endif
43
+
17
44
  #define MAYBE_UNUSED(x) (void)((x))
18
45
 
19
46
  #if defined(IS_X86)
@@ -76,7 +103,7 @@ enum cpu_feature {
76
103
  #if !defined(BLAKE3_TESTING)
77
104
  static /* Allow the variable to be controlled manually for testing */
78
105
  #endif
79
- enum cpu_feature g_cpu_features = UNDEFINED;
106
+ ATOMIC_INT g_cpu_features = UNDEFINED;
80
107
 
81
108
  #if !defined(BLAKE3_TESTING)
82
109
  static
@@ -84,14 +111,16 @@ static
84
111
  enum cpu_feature
85
112
  get_cpu_features(void) {
86
113
 
87
- if (g_cpu_features != UNDEFINED) {
88
- return g_cpu_features;
114
+ /* If TSAN detects a data race here, try compiling with -DBLAKE3_ATOMICS=1 */
115
+ enum cpu_feature features = ATOMIC_LOAD(g_cpu_features);
116
+ if (features != UNDEFINED) {
117
+ return features;
89
118
  } else {
90
119
  #if defined(IS_X86)
91
120
  uint32_t regs[4] = {0};
92
121
  uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
93
122
  (void)edx;
94
- enum cpu_feature features = 0;
123
+ features = 0;
95
124
  cpuid(regs, 0);
96
125
  const int max_id = *eax;
97
126
  cpuid(regs, 1);
@@ -101,7 +130,7 @@ static
101
130
  if (*edx & (1UL << 26))
102
131
  features |= SSE2;
103
132
  #endif
104
- if (*ecx & (1UL << 0))
133
+ if (*ecx & (1UL << 9))
105
134
  features |= SSSE3;
106
135
  if (*ecx & (1UL << 19))
107
136
  features |= SSE41;
@@ -124,7 +153,7 @@ static
124
153
  }
125
154
  }
126
155
  }
127
- g_cpu_features = features;
156
+ ATOMIC_STORE(g_cpu_features, features);
128
157
  return features;
129
158
  #else
130
159
  /* How to detect NEON? */
@@ -51,7 +51,11 @@ enum blake3_flags {
51
51
  #if !defined(BLAKE3_USE_NEON)
52
52
  // If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
53
53
  #if defined(IS_AARCH64)
54
- #define BLAKE3_USE_NEON 1
54
+ #if defined(__ARM_BIG_ENDIAN)
55
+ #define BLAKE3_USE_NEON 0
56
+ #else
57
+ #define BLAKE3_USE_NEON 1
58
+ #endif
55
59
  #else
56
60
  #define BLAKE3_USE_NEON 0
57
61
  #endif
@@ -87,7 +91,7 @@ static const uint8_t MSG_SCHEDULE[7][16] = {
87
91
  /* x is assumed to be nonzero. */
88
92
  static unsigned int highest_one(uint64_t x) {
89
93
  #if defined(__GNUC__) || defined(__clang__)
90
- return 63 ^ __builtin_clzll(x);
94
+ return 63 ^ (unsigned int)__builtin_clzll(x);
91
95
  #elif defined(_MSC_VER) && defined(IS_X86_64)
92
96
  unsigned long index;
93
97
  _BitScanReverse64(&index, x);
@@ -117,7 +121,7 @@ static unsigned int highest_one(uint64_t x) {
117
121
  // Count the number of 1 bits.
118
122
  INLINE unsigned int popcnt(uint64_t x) {
119
123
  #if defined(__GNUC__) || defined(__clang__)
120
- return __builtin_popcountll(x);
124
+ return (unsigned int)__builtin_popcountll(x);
121
125
  #else
122
126
  unsigned int count = 0;
123
127
  while (x != 0) {
@@ -10,14 +10,12 @@
10
10
 
11
11
  INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
12
12
  // vld1q_u32 has alignment requirements. Don't use it.
13
- uint32x4_t x;
14
- memcpy(&x, src, 16);
15
- return x;
13
+ return vreinterpretq_u32_u8(vld1q_u8(src));
16
14
  }
17
15
 
18
16
  INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) {
19
17
  // vst1q_u32 has alignment requirements. Don't use it.
20
- memcpy(dest, &src, 16);
18
+ vst1q_u8(dest, vreinterpretq_u8_u32(src));
21
19
  }
22
20
 
23
21
  INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) {
@@ -36,19 +34,36 @@ INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
36
34
  }
37
35
 
38
36
  INLINE uint32x4_t rot16_128(uint32x4_t x) {
39
- return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
37
+ // The straightforward implementation would be two shifts and an or, but that's
38
+ // slower on microarchitectures we've tested. See
39
+ // https://github.com/BLAKE3-team/BLAKE3/pull/319.
40
+ // return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
41
+ return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)));
40
42
  }
41
43
 
42
44
  INLINE uint32x4_t rot12_128(uint32x4_t x) {
43
- return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
45
+ // See comment in rot16_128.
46
+ // return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
47
+ return vsriq_n_u32(vshlq_n_u32(x, 32-12), x, 12);
44
48
  }
45
49
 
46
50
  INLINE uint32x4_t rot8_128(uint32x4_t x) {
47
- return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
51
+ // See comment in rot16_128.
52
+ // return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
53
+ #if defined(__clang__)
54
+ return vreinterpretq_u32_u8(__builtin_shufflevector(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12));
55
+ #elif __GNUC__ * 10000 + __GNUC_MINOR__ * 100 >=40700
56
+ static const uint8x16_t r8 = {1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12};
57
+ return vreinterpretq_u32_u8(__builtin_shuffle(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), r8));
58
+ #else
59
+ return vsriq_n_u32(vshlq_n_u32(x, 32-8), x, 8);
60
+ #endif
48
61
  }
49
62
 
50
63
  INLINE uint32x4_t rot7_128(uint32x4_t x) {
51
- return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
64
+ // See comment in rot16_128.
65
+ // return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
66
+ return vsriq_n_u32(vshlq_n_u32(x, 32-7), x, 7);
52
67
  }
53
68
 
54
69
  // TODO: compress_neon
@@ -396,7 +396,7 @@ INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
396
396
  }
397
397
 
398
398
  INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
399
- // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is
399
+ // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
400
400
  // 22/33. Note that this doesn't split the vector into two lanes, as the
401
401
  // AVX2 counterparts do.
402
402
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
@@ -2301,7 +2301,7 @@ blake3_compress_xof_sse2:
2301
2301
  ret
2302
2302
 
2303
2303
 
2304
- .section .rodata
2304
+ .section .rdata
2305
2305
  .p2align 6
2306
2306
  BLAKE3_IV:
2307
2307
  .long 0x6A09E667, 0xBB67AE85
@@ -390,7 +390,7 @@ INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
390
390
  }
391
391
 
392
392
  INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
393
- // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is
393
+ // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
394
394
  // 22/33. Note that this doesn't split the vector into two lanes, as the
395
395
  // AVX2 counterparts do.
396
396
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
@@ -2042,7 +2042,7 @@ blake3_compress_xof_sse41:
2042
2042
  ret
2043
2043
 
2044
2044
 
2045
- .section .rodata
2045
+ .section .rdata
2046
2046
  .p2align 6
2047
2047
  BLAKE3_IV:
2048
2048
  .long 0x6A09E667, 0xBB67AE85
@@ -2,6 +2,6 @@ require 'digest'
2
2
 
3
3
  module Digest
4
4
  class BLAKE3 < Base
5
- VERSION = "1.4.0.0"
5
+ VERSION = "1.5.1.0"
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: digest-blake3
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.0.0
4
+ version: 1.5.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Will Bryant
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-09-17 00:00:00.000000000 Z
11
+ date: 2024-07-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler