digest-blake3 1.4.0.0 → 1.5.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 600afca6f08145f3e28b49fbe757b661368d58dc8ec20e1778a915407dcc660a
4
- data.tar.gz: 1cd455e9caf97fd0f514623ba6b9d7f74249071af8cb8b5377ba10de91c5eb34
3
+ metadata.gz: 2bf10e44aaa74a31f9a334b67ecfadfeda7f31a4d3055bd48c5f1a8609e53661
4
+ data.tar.gz: 53072abb4749ecdfd6748360fc33d39789078413078858953bffcd1ae1cfcdaf
5
5
  SHA512:
6
- metadata.gz: d515228fab5f92576d9b1f67d66ffff97623f154dcab9a1dcb140b9e69884325797991d1f87d7f5cb26cb6397a86e989a1c1f2163b478641efdc4b85b5772026
7
- data.tar.gz: ed75418dda098a8700554b871189c9995e1e6e4969d86cc8d9afd46d8439ecf985fb111f1a6d6adb9173f68cb33c7c261b35b0820439c48a92e354da2f662e35
6
+ metadata.gz: b93a9bdf8b7f2fa4986090e466dfe6c0661d6e9e2de7864cb83846f314a07f198ce16bb6289edb50cf1a7333463136049d61c5ee6fa3db9c8c4855e229fd93a7
7
+ data.tar.gz: df226266cb38882b121c401d074ad84559265751c897fccda004a8f3d230c2e67215ee6086cfac13fc7043316955511179de4c024dd9d4b5f9048a948ea52080
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- digest-blake3 (1.3.3.1)
4
+ digest-blake3 (1.4.1.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -254,7 +254,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
254
254
  // As a special case when the SIMD degree is 1, this function will still return
255
255
  // at least 2 outputs. This guarantees that this function doesn't perform the
256
256
  // root compression. (If it did, it would use the wrong flags, and also we
257
- // wouldn't be able to implement exendable output.) Note that this function is
257
+ // wouldn't be able to implement extendable output.) Note that this function is
258
258
  // not used when the whole input is only 1 chunk long; that's a different
259
259
  // codepath.
260
260
  //
@@ -341,21 +341,24 @@ INLINE void compress_subtree_to_parent_node(
341
341
  size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
342
342
  chunk_counter, flags, cv_array);
343
343
  assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
344
-
345
- // If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
344
+ // The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because
345
+ // as we just asserted, num_cvs will always be <=2 in that case. But GCC
346
+ // (particularly GCC 8.5) can't tell that it never executes, and if NDEBUG is
347
+ // set then it emits incorrect warnings here. We tried a few different
348
+ // hacks to silence these, but in the end our hacks just produced different
349
+ // warnings (see https://github.com/BLAKE3-team/BLAKE3/pull/380). Out of
350
+ // desperation, we ifdef out this entire loop when we know it's not needed.
351
+ #if MAX_SIMD_DEGREE_OR_2 > 2
352
+ // If MAX_SIMD_DEGREE_OR_2 is greater than 2 and there's enough input,
346
353
  // compress_subtree_wide() returns more than 2 chaining values. Condense
347
354
  // them into 2 by forming parent nodes repeatedly.
348
355
  uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
349
- // The second half of this loop condition is always true, and we just
350
- // asserted it above. But GCC can't tell that it's always true, and if NDEBUG
351
- // is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
352
- // warnings here. GCC 8.5 is particularly sensitive, so if you're changing
353
- // this code, test it against that version.
354
- while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
356
+ while (num_cvs > 2) {
355
357
  num_cvs =
356
358
  compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
357
359
  memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
358
360
  }
361
+ #endif
359
362
  memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
360
363
  }
361
364
 
@@ -4,11 +4,33 @@
4
4
  #include <stddef.h>
5
5
  #include <stdint.h>
6
6
 
7
+ #if !defined(BLAKE3_API)
8
+ # if defined(_WIN32) || defined(__CYGWIN__)
9
+ # if defined(BLAKE3_DLL)
10
+ # if defined(BLAKE3_DLL_EXPORTS)
11
+ # define BLAKE3_API __declspec(dllexport)
12
+ # else
13
+ # define BLAKE3_API __declspec(dllimport)
14
+ # endif
15
+ # define BLAKE3_PRIVATE
16
+ # else
17
+ # define BLAKE3_API
18
+ # define BLAKE3_PRIVATE
19
+ # endif
20
+ # elif __GNUC__ >= 4
21
+ # define BLAKE3_API __attribute__((visibility("default")))
22
+ # define BLAKE3_PRIVATE __attribute__((visibility("hidden")))
23
+ # else
24
+ # define BLAKE3_API
25
+ # define BLAKE3_PRIVATE
26
+ # endif
27
+ #endif
28
+
7
29
  #ifdef __cplusplus
8
30
  extern "C" {
9
31
  #endif
10
32
 
11
- #define BLAKE3_VERSION_STRING "1.3.3"
33
+ #define BLAKE3_VERSION_STRING "1.5.1"
12
34
  #define BLAKE3_KEY_LEN 32
13
35
  #define BLAKE3_OUT_LEN 32
14
36
  #define BLAKE3_BLOCK_LEN 64
@@ -38,20 +60,20 @@ typedef struct {
38
60
  uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
39
61
  } blake3_hasher;
40
62
 
41
- const char *blake3_version(void);
42
- void blake3_hasher_init(blake3_hasher *self);
43
- void blake3_hasher_init_keyed(blake3_hasher *self,
44
- const uint8_t key[BLAKE3_KEY_LEN]);
45
- void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
46
- void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
47
- size_t context_len);
48
- void blake3_hasher_update(blake3_hasher *self, const void *input,
49
- size_t input_len);
50
- void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
51
- size_t out_len);
52
- void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
53
- uint8_t *out, size_t out_len);
54
- void blake3_hasher_reset(blake3_hasher *self);
63
+ BLAKE3_API const char *blake3_version(void);
64
+ BLAKE3_API void blake3_hasher_init(blake3_hasher *self);
65
+ BLAKE3_API void blake3_hasher_init_keyed(blake3_hasher *self,
66
+ const uint8_t key[BLAKE3_KEY_LEN]);
67
+ BLAKE3_API void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
68
+ BLAKE3_API void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
69
+ size_t context_len);
70
+ BLAKE3_API void blake3_hasher_update(blake3_hasher *self, const void *input,
71
+ size_t input_len);
72
+ BLAKE3_API void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
73
+ size_t out_len);
74
+ BLAKE3_API void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
75
+ uint8_t *out, size_t out_len);
76
+ BLAKE3_API void blake3_hasher_reset(blake3_hasher *self);
55
77
 
56
78
  #ifdef __cplusplus
57
79
  }
@@ -167,7 +167,7 @@ INLINE void transpose_vecs(__m256i vecs[DEGREE]) {
167
167
  __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
168
168
  __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
169
169
 
170
- // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is
170
+ // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
171
171
  // 11/33.
172
172
  __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
173
173
  __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
@@ -1784,7 +1784,7 @@ blake3_hash_many_avx2:
1784
1784
  vmovdqu xmmword ptr [rbx+0x10], xmm1
1785
1785
  jmp 4b
1786
1786
 
1787
- .section .rodata
1787
+ .section .rdata
1788
1788
  .p2align 6
1789
1789
  ADD0:
1790
1790
  .long 0, 1, 2, 3, 4, 5, 6, 7
@@ -429,7 +429,7 @@ INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) {
429
429
  }
430
430
 
431
431
  INLINE void transpose_vecs_128(__m128i vecs[4]) {
432
- // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is
432
+ // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
433
433
  // 22/33. Note that this doesn't split the vector into two lanes, as the
434
434
  // AVX2 counterparts do.
435
435
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
@@ -684,7 +684,7 @@ INLINE void transpose_vecs_256(__m256i vecs[8]) {
684
684
  __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
685
685
  __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
686
686
 
687
- // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is
687
+ // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
688
688
  // 11/33.
689
689
  __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
690
690
  __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
@@ -959,7 +959,7 @@ INLINE void transpose_vecs_512(__m512i vecs[16]) {
959
959
  __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]);
960
960
  __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]);
961
961
 
962
- // Interleave 64-bit lates. The _0 unpack is lanes
962
+ // Interleave 64-bit lanes. The _0 unpack is lanes
963
963
  // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes
964
964
  // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes
965
965
  // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes
@@ -2587,7 +2587,7 @@ blake3_compress_xof_avx512:
2587
2587
  add rsp, 72
2588
2588
  ret
2589
2589
 
2590
- .section .rodata
2590
+ .section .rdata
2591
2591
  .p2align 6
2592
2592
  INDEX0:
2593
2593
  .long 0, 1, 2, 3, 16, 17, 18, 19
@@ -6,6 +6,7 @@
6
6
 
7
7
  #if defined(IS_X86)
8
8
  #if defined(_MSC_VER)
9
+ #include <Windows.h>
9
10
  #include <intrin.h>
10
11
  #elif defined(__GNUC__)
11
12
  #include <immintrin.h>
@@ -14,6 +15,32 @@
14
15
  #endif
15
16
  #endif
16
17
 
18
+ #if !defined(BLAKE3_ATOMICS)
19
+ #if defined(__has_include)
20
+ #if __has_include(<stdatomic.h>) && !defined(_MSC_VER)
21
+ #define BLAKE3_ATOMICS 1
22
+ #else
23
+ #define BLAKE3_ATOMICS 0
24
+ #endif /* __has_include(<stdatomic.h>) && !defined(_MSC_VER) */
25
+ #else
26
+ #define BLAKE3_ATOMICS 0
27
+ #endif /* defined(__has_include) */
28
+ #endif /* BLAKE3_ATOMICS */
29
+
30
+ #if BLAKE3_ATOMICS
31
+ #define ATOMIC_INT _Atomic int
32
+ #define ATOMIC_LOAD(x) x
33
+ #define ATOMIC_STORE(x, y) x = y
34
+ #elif defined(_MSC_VER)
35
+ #define ATOMIC_INT LONG
36
+ #define ATOMIC_LOAD(x) InterlockedOr(&x, 0)
37
+ #define ATOMIC_STORE(x, y) InterlockedExchange(&x, y)
38
+ #else
39
+ #define ATOMIC_INT int
40
+ #define ATOMIC_LOAD(x) x
41
+ #define ATOMIC_STORE(x, y) x = y
42
+ #endif
43
+
17
44
  #define MAYBE_UNUSED(x) (void)((x))
18
45
 
19
46
  #if defined(IS_X86)
@@ -76,7 +103,7 @@ enum cpu_feature {
76
103
  #if !defined(BLAKE3_TESTING)
77
104
  static /* Allow the variable to be controlled manually for testing */
78
105
  #endif
79
- enum cpu_feature g_cpu_features = UNDEFINED;
106
+ ATOMIC_INT g_cpu_features = UNDEFINED;
80
107
 
81
108
  #if !defined(BLAKE3_TESTING)
82
109
  static
@@ -84,14 +111,16 @@ static
84
111
  enum cpu_feature
85
112
  get_cpu_features(void) {
86
113
 
87
- if (g_cpu_features != UNDEFINED) {
88
- return g_cpu_features;
114
+ /* If TSAN detects a data race here, try compiling with -DBLAKE3_ATOMICS=1 */
115
+ enum cpu_feature features = ATOMIC_LOAD(g_cpu_features);
116
+ if (features != UNDEFINED) {
117
+ return features;
89
118
  } else {
90
119
  #if defined(IS_X86)
91
120
  uint32_t regs[4] = {0};
92
121
  uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
93
122
  (void)edx;
94
- enum cpu_feature features = 0;
123
+ features = 0;
95
124
  cpuid(regs, 0);
96
125
  const int max_id = *eax;
97
126
  cpuid(regs, 1);
@@ -101,7 +130,7 @@ static
101
130
  if (*edx & (1UL << 26))
102
131
  features |= SSE2;
103
132
  #endif
104
- if (*ecx & (1UL << 0))
133
+ if (*ecx & (1UL << 9))
105
134
  features |= SSSE3;
106
135
  if (*ecx & (1UL << 19))
107
136
  features |= SSE41;
@@ -124,7 +153,7 @@ static
124
153
  }
125
154
  }
126
155
  }
127
- g_cpu_features = features;
156
+ ATOMIC_STORE(g_cpu_features, features);
128
157
  return features;
129
158
  #else
130
159
  /* How to detect NEON? */
@@ -51,7 +51,11 @@ enum blake3_flags {
51
51
  #if !defined(BLAKE3_USE_NEON)
52
52
  // If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
53
53
  #if defined(IS_AARCH64)
54
- #define BLAKE3_USE_NEON 1
54
+ #if defined(__ARM_BIG_ENDIAN)
55
+ #define BLAKE3_USE_NEON 0
56
+ #else
57
+ #define BLAKE3_USE_NEON 1
58
+ #endif
55
59
  #else
56
60
  #define BLAKE3_USE_NEON 0
57
61
  #endif
@@ -87,7 +91,7 @@ static const uint8_t MSG_SCHEDULE[7][16] = {
87
91
  /* x is assumed to be nonzero. */
88
92
  static unsigned int highest_one(uint64_t x) {
89
93
  #if defined(__GNUC__) || defined(__clang__)
90
- return 63 ^ __builtin_clzll(x);
94
+ return 63 ^ (unsigned int)__builtin_clzll(x);
91
95
  #elif defined(_MSC_VER) && defined(IS_X86_64)
92
96
  unsigned long index;
93
97
  _BitScanReverse64(&index, x);
@@ -117,7 +121,7 @@ static unsigned int highest_one(uint64_t x) {
117
121
  // Count the number of 1 bits.
118
122
  INLINE unsigned int popcnt(uint64_t x) {
119
123
  #if defined(__GNUC__) || defined(__clang__)
120
- return __builtin_popcountll(x);
124
+ return (unsigned int)__builtin_popcountll(x);
121
125
  #else
122
126
  unsigned int count = 0;
123
127
  while (x != 0) {
@@ -10,14 +10,12 @@
10
10
 
11
11
  INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
12
12
  // vld1q_u32 has alignment requirements. Don't use it.
13
- uint32x4_t x;
14
- memcpy(&x, src, 16);
15
- return x;
13
+ return vreinterpretq_u32_u8(vld1q_u8(src));
16
14
  }
17
15
 
18
16
  INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) {
19
17
  // vst1q_u32 has alignment requirements. Don't use it.
20
- memcpy(dest, &src, 16);
18
+ vst1q_u8(dest, vreinterpretq_u8_u32(src));
21
19
  }
22
20
 
23
21
  INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) {
@@ -36,19 +34,36 @@ INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
36
34
  }
37
35
 
38
36
  INLINE uint32x4_t rot16_128(uint32x4_t x) {
39
- return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
37
+ // The straightforward implementation would be two shifts and an or, but that's
38
+ // slower on microarchitectures we've tested. See
39
+ // https://github.com/BLAKE3-team/BLAKE3/pull/319.
40
+ // return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
41
+ return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)));
40
42
  }
41
43
 
42
44
  INLINE uint32x4_t rot12_128(uint32x4_t x) {
43
- return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
45
+ // See comment in rot16_128.
46
+ // return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
47
+ return vsriq_n_u32(vshlq_n_u32(x, 32-12), x, 12);
44
48
  }
45
49
 
46
50
  INLINE uint32x4_t rot8_128(uint32x4_t x) {
47
- return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
51
+ // See comment in rot16_128.
52
+ // return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
53
+ #if defined(__clang__)
54
+ return vreinterpretq_u32_u8(__builtin_shufflevector(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12));
55
+ #elif __GNUC__ * 10000 + __GNUC_MINOR__ * 100 >=40700
56
+ static const uint8x16_t r8 = {1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12};
57
+ return vreinterpretq_u32_u8(__builtin_shuffle(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), r8));
58
+ #else
59
+ return vsriq_n_u32(vshlq_n_u32(x, 32-8), x, 8);
60
+ #endif
48
61
  }
49
62
 
50
63
  INLINE uint32x4_t rot7_128(uint32x4_t x) {
51
- return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
64
+ // See comment in rot16_128.
65
+ // return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
66
+ return vsriq_n_u32(vshlq_n_u32(x, 32-7), x, 7);
52
67
  }
53
68
 
54
69
  // TODO: compress_neon
@@ -396,7 +396,7 @@ INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
396
396
  }
397
397
 
398
398
  INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
399
- // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is
399
+ // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
400
400
  // 22/33. Note that this doesn't split the vector into two lanes, as the
401
401
  // AVX2 counterparts do.
402
402
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
@@ -2301,7 +2301,7 @@ blake3_compress_xof_sse2:
2301
2301
  ret
2302
2302
 
2303
2303
 
2304
- .section .rodata
2304
+ .section .rdata
2305
2305
  .p2align 6
2306
2306
  BLAKE3_IV:
2307
2307
  .long 0x6A09E667, 0xBB67AE85
@@ -390,7 +390,7 @@ INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
390
390
  }
391
391
 
392
392
  INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
393
- // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is
393
+ // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
394
394
  // 22/33. Note that this doesn't split the vector into two lanes, as the
395
395
  // AVX2 counterparts do.
396
396
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
@@ -2042,7 +2042,7 @@ blake3_compress_xof_sse41:
2042
2042
  ret
2043
2043
 
2044
2044
 
2045
- .section .rodata
2045
+ .section .rdata
2046
2046
  .p2align 6
2047
2047
  BLAKE3_IV:
2048
2048
  .long 0x6A09E667, 0xBB67AE85
@@ -2,6 +2,6 @@ require 'digest'
2
2
 
3
3
  module Digest
4
4
  class BLAKE3 < Base
5
- VERSION = "1.4.0.0"
5
+ VERSION = "1.5.1.0"
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: digest-blake3
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.0.0
4
+ version: 1.5.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Will Bryant
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-09-17 00:00:00.000000000 Z
11
+ date: 2024-07-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler