digest-blake3 1.4.0.0 → 1.5.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 600afca6f08145f3e28b49fbe757b661368d58dc8ec20e1778a915407dcc660a
4
- data.tar.gz: 1cd455e9caf97fd0f514623ba6b9d7f74249071af8cb8b5377ba10de91c5eb34
3
+ metadata.gz: 2bf10e44aaa74a31f9a334b67ecfadfeda7f31a4d3055bd48c5f1a8609e53661
4
+ data.tar.gz: 53072abb4749ecdfd6748360fc33d39789078413078858953bffcd1ae1cfcdaf
5
5
  SHA512:
6
- metadata.gz: d515228fab5f92576d9b1f67d66ffff97623f154dcab9a1dcb140b9e69884325797991d1f87d7f5cb26cb6397a86e989a1c1f2163b478641efdc4b85b5772026
7
- data.tar.gz: ed75418dda098a8700554b871189c9995e1e6e4969d86cc8d9afd46d8439ecf985fb111f1a6d6adb9173f68cb33c7c261b35b0820439c48a92e354da2f662e35
6
+ metadata.gz: b93a9bdf8b7f2fa4986090e466dfe6c0661d6e9e2de7864cb83846f314a07f198ce16bb6289edb50cf1a7333463136049d61c5ee6fa3db9c8c4855e229fd93a7
7
+ data.tar.gz: df226266cb38882b121c401d074ad84559265751c897fccda004a8f3d230c2e67215ee6086cfac13fc7043316955511179de4c024dd9d4b5f9048a948ea52080
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- digest-blake3 (1.3.3.1)
4
+ digest-blake3 (1.4.1.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -254,7 +254,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
254
254
  // As a special case when the SIMD degree is 1, this function will still return
255
255
  // at least 2 outputs. This guarantees that this function doesn't perform the
256
256
  // root compression. (If it did, it would use the wrong flags, and also we
257
- // wouldn't be able to implement exendable output.) Note that this function is
257
+ // wouldn't be able to implement extendable output.) Note that this function is
258
258
  // not used when the whole input is only 1 chunk long; that's a different
259
259
  // codepath.
260
260
  //
@@ -341,21 +341,24 @@ INLINE void compress_subtree_to_parent_node(
341
341
  size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
342
342
  chunk_counter, flags, cv_array);
343
343
  assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
344
-
345
- // If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
344
+ // The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because
345
+ // as we just asserted, num_cvs will always be <=2 in that case. But GCC
346
+ // (particularly GCC 8.5) can't tell that it never executes, and if NDEBUG is
347
+ // set then it emits incorrect warnings here. We tried a few different
348
+ // hacks to silence these, but in the end our hacks just produced different
349
+ // warnings (see https://github.com/BLAKE3-team/BLAKE3/pull/380). Out of
350
+ // desperation, we ifdef out this entire loop when we know it's not needed.
351
+ #if MAX_SIMD_DEGREE_OR_2 > 2
352
+ // If MAX_SIMD_DEGREE_OR_2 is greater than 2 and there's enough input,
346
353
  // compress_subtree_wide() returns more than 2 chaining values. Condense
347
354
  // them into 2 by forming parent nodes repeatedly.
348
355
  uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
349
- // The second half of this loop condition is always true, and we just
350
- // asserted it above. But GCC can't tell that it's always true, and if NDEBUG
351
- // is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
352
- // warnings here. GCC 8.5 is particularly sensitive, so if you're changing
353
- // this code, test it against that version.
354
- while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
356
+ while (num_cvs > 2) {
355
357
  num_cvs =
356
358
  compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
357
359
  memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
358
360
  }
361
+ #endif
359
362
  memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
360
363
  }
361
364
 
@@ -4,11 +4,33 @@
4
4
  #include <stddef.h>
5
5
  #include <stdint.h>
6
6
 
7
+ #if !defined(BLAKE3_API)
8
+ # if defined(_WIN32) || defined(__CYGWIN__)
9
+ # if defined(BLAKE3_DLL)
10
+ # if defined(BLAKE3_DLL_EXPORTS)
11
+ # define BLAKE3_API __declspec(dllexport)
12
+ # else
13
+ # define BLAKE3_API __declspec(dllimport)
14
+ # endif
15
+ # define BLAKE3_PRIVATE
16
+ # else
17
+ # define BLAKE3_API
18
+ # define BLAKE3_PRIVATE
19
+ # endif
20
+ # elif __GNUC__ >= 4
21
+ # define BLAKE3_API __attribute__((visibility("default")))
22
+ # define BLAKE3_PRIVATE __attribute__((visibility("hidden")))
23
+ # else
24
+ # define BLAKE3_API
25
+ # define BLAKE3_PRIVATE
26
+ # endif
27
+ #endif
28
+
7
29
  #ifdef __cplusplus
8
30
  extern "C" {
9
31
  #endif
10
32
 
11
- #define BLAKE3_VERSION_STRING "1.3.3"
33
+ #define BLAKE3_VERSION_STRING "1.5.1"
12
34
  #define BLAKE3_KEY_LEN 32
13
35
  #define BLAKE3_OUT_LEN 32
14
36
  #define BLAKE3_BLOCK_LEN 64
@@ -38,20 +60,20 @@ typedef struct {
38
60
  uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
39
61
  } blake3_hasher;
40
62
 
41
- const char *blake3_version(void);
42
- void blake3_hasher_init(blake3_hasher *self);
43
- void blake3_hasher_init_keyed(blake3_hasher *self,
44
- const uint8_t key[BLAKE3_KEY_LEN]);
45
- void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
46
- void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
47
- size_t context_len);
48
- void blake3_hasher_update(blake3_hasher *self, const void *input,
49
- size_t input_len);
50
- void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
51
- size_t out_len);
52
- void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
53
- uint8_t *out, size_t out_len);
54
- void blake3_hasher_reset(blake3_hasher *self);
63
+ BLAKE3_API const char *blake3_version(void);
64
+ BLAKE3_API void blake3_hasher_init(blake3_hasher *self);
65
+ BLAKE3_API void blake3_hasher_init_keyed(blake3_hasher *self,
66
+ const uint8_t key[BLAKE3_KEY_LEN]);
67
+ BLAKE3_API void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
68
+ BLAKE3_API void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
69
+ size_t context_len);
70
+ BLAKE3_API void blake3_hasher_update(blake3_hasher *self, const void *input,
71
+ size_t input_len);
72
+ BLAKE3_API void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
73
+ size_t out_len);
74
+ BLAKE3_API void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
75
+ uint8_t *out, size_t out_len);
76
+ BLAKE3_API void blake3_hasher_reset(blake3_hasher *self);
55
77
 
56
78
  #ifdef __cplusplus
57
79
  }
@@ -167,7 +167,7 @@ INLINE void transpose_vecs(__m256i vecs[DEGREE]) {
167
167
  __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
168
168
  __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
169
169
 
170
- // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is
170
+ // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
171
171
  // 11/33.
172
172
  __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
173
173
  __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
@@ -1784,7 +1784,7 @@ blake3_hash_many_avx2:
1784
1784
  vmovdqu xmmword ptr [rbx+0x10], xmm1
1785
1785
  jmp 4b
1786
1786
 
1787
- .section .rodata
1787
+ .section .rdata
1788
1788
  .p2align 6
1789
1789
  ADD0:
1790
1790
  .long 0, 1, 2, 3, 4, 5, 6, 7
@@ -429,7 +429,7 @@ INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) {
429
429
  }
430
430
 
431
431
  INLINE void transpose_vecs_128(__m128i vecs[4]) {
432
- // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is
432
+ // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
433
433
  // 22/33. Note that this doesn't split the vector into two lanes, as the
434
434
  // AVX2 counterparts do.
435
435
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
@@ -684,7 +684,7 @@ INLINE void transpose_vecs_256(__m256i vecs[8]) {
684
684
  __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
685
685
  __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
686
686
 
687
- // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is
687
+ // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
688
688
  // 11/33.
689
689
  __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
690
690
  __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
@@ -959,7 +959,7 @@ INLINE void transpose_vecs_512(__m512i vecs[16]) {
959
959
  __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]);
960
960
  __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]);
961
961
 
962
- // Interleave 64-bit lates. The _0 unpack is lanes
962
+ // Interleave 64-bit lanes. The _0 unpack is lanes
963
963
  // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes
964
964
  // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes
965
965
  // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes
@@ -2587,7 +2587,7 @@ blake3_compress_xof_avx512:
2587
2587
  add rsp, 72
2588
2588
  ret
2589
2589
 
2590
- .section .rodata
2590
+ .section .rdata
2591
2591
  .p2align 6
2592
2592
  INDEX0:
2593
2593
  .long 0, 1, 2, 3, 16, 17, 18, 19
@@ -6,6 +6,7 @@
6
6
 
7
7
  #if defined(IS_X86)
8
8
  #if defined(_MSC_VER)
9
+ #include <Windows.h>
9
10
  #include <intrin.h>
10
11
  #elif defined(__GNUC__)
11
12
  #include <immintrin.h>
@@ -14,6 +15,32 @@
14
15
  #endif
15
16
  #endif
16
17
 
18
+ #if !defined(BLAKE3_ATOMICS)
19
+ #if defined(__has_include)
20
+ #if __has_include(<stdatomic.h>) && !defined(_MSC_VER)
21
+ #define BLAKE3_ATOMICS 1
22
+ #else
23
+ #define BLAKE3_ATOMICS 0
24
+ #endif /* __has_include(<stdatomic.h>) && !defined(_MSC_VER) */
25
+ #else
26
+ #define BLAKE3_ATOMICS 0
27
+ #endif /* defined(__has_include) */
28
+ #endif /* BLAKE3_ATOMICS */
29
+
30
+ #if BLAKE3_ATOMICS
31
+ #define ATOMIC_INT _Atomic int
32
+ #define ATOMIC_LOAD(x) x
33
+ #define ATOMIC_STORE(x, y) x = y
34
+ #elif defined(_MSC_VER)
35
+ #define ATOMIC_INT LONG
36
+ #define ATOMIC_LOAD(x) InterlockedOr(&x, 0)
37
+ #define ATOMIC_STORE(x, y) InterlockedExchange(&x, y)
38
+ #else
39
+ #define ATOMIC_INT int
40
+ #define ATOMIC_LOAD(x) x
41
+ #define ATOMIC_STORE(x, y) x = y
42
+ #endif
43
+
17
44
  #define MAYBE_UNUSED(x) (void)((x))
18
45
 
19
46
  #if defined(IS_X86)
@@ -76,7 +103,7 @@ enum cpu_feature {
76
103
  #if !defined(BLAKE3_TESTING)
77
104
  static /* Allow the variable to be controlled manually for testing */
78
105
  #endif
79
- enum cpu_feature g_cpu_features = UNDEFINED;
106
+ ATOMIC_INT g_cpu_features = UNDEFINED;
80
107
 
81
108
  #if !defined(BLAKE3_TESTING)
82
109
  static
@@ -84,14 +111,16 @@ static
84
111
  enum cpu_feature
85
112
  get_cpu_features(void) {
86
113
 
87
- if (g_cpu_features != UNDEFINED) {
88
- return g_cpu_features;
114
+ /* If TSAN detects a data race here, try compiling with -DBLAKE3_ATOMICS=1 */
115
+ enum cpu_feature features = ATOMIC_LOAD(g_cpu_features);
116
+ if (features != UNDEFINED) {
117
+ return features;
89
118
  } else {
90
119
  #if defined(IS_X86)
91
120
  uint32_t regs[4] = {0};
92
121
  uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
93
122
  (void)edx;
94
- enum cpu_feature features = 0;
123
+ features = 0;
95
124
  cpuid(regs, 0);
96
125
  const int max_id = *eax;
97
126
  cpuid(regs, 1);
@@ -101,7 +130,7 @@ static
101
130
  if (*edx & (1UL << 26))
102
131
  features |= SSE2;
103
132
  #endif
104
- if (*ecx & (1UL << 0))
133
+ if (*ecx & (1UL << 9))
105
134
  features |= SSSE3;
106
135
  if (*ecx & (1UL << 19))
107
136
  features |= SSE41;
@@ -124,7 +153,7 @@ static
124
153
  }
125
154
  }
126
155
  }
127
- g_cpu_features = features;
156
+ ATOMIC_STORE(g_cpu_features, features);
128
157
  return features;
129
158
  #else
130
159
  /* How to detect NEON? */
@@ -51,7 +51,11 @@ enum blake3_flags {
51
51
  #if !defined(BLAKE3_USE_NEON)
52
52
  // If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
53
53
  #if defined(IS_AARCH64)
54
- #define BLAKE3_USE_NEON 1
54
+ #if defined(__ARM_BIG_ENDIAN)
55
+ #define BLAKE3_USE_NEON 0
56
+ #else
57
+ #define BLAKE3_USE_NEON 1
58
+ #endif
55
59
  #else
56
60
  #define BLAKE3_USE_NEON 0
57
61
  #endif
@@ -87,7 +91,7 @@ static const uint8_t MSG_SCHEDULE[7][16] = {
87
91
  /* x is assumed to be nonzero. */
88
92
  static unsigned int highest_one(uint64_t x) {
89
93
  #if defined(__GNUC__) || defined(__clang__)
90
- return 63 ^ __builtin_clzll(x);
94
+ return 63 ^ (unsigned int)__builtin_clzll(x);
91
95
  #elif defined(_MSC_VER) && defined(IS_X86_64)
92
96
  unsigned long index;
93
97
  _BitScanReverse64(&index, x);
@@ -117,7 +121,7 @@ static unsigned int highest_one(uint64_t x) {
117
121
  // Count the number of 1 bits.
118
122
  INLINE unsigned int popcnt(uint64_t x) {
119
123
  #if defined(__GNUC__) || defined(__clang__)
120
- return __builtin_popcountll(x);
124
+ return (unsigned int)__builtin_popcountll(x);
121
125
  #else
122
126
  unsigned int count = 0;
123
127
  while (x != 0) {
@@ -10,14 +10,12 @@
10
10
 
11
11
  INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
12
12
  // vld1q_u32 has alignment requirements. Don't use it.
13
- uint32x4_t x;
14
- memcpy(&x, src, 16);
15
- return x;
13
+ return vreinterpretq_u32_u8(vld1q_u8(src));
16
14
  }
17
15
 
18
16
  INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) {
19
17
  // vst1q_u32 has alignment requirements. Don't use it.
20
- memcpy(dest, &src, 16);
18
+ vst1q_u8(dest, vreinterpretq_u8_u32(src));
21
19
  }
22
20
 
23
21
  INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) {
@@ -36,19 +34,36 @@ INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
36
34
  }
37
35
 
38
36
  INLINE uint32x4_t rot16_128(uint32x4_t x) {
39
- return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
37
+ // The straightforward implementation would be two shifts and an or, but that's
38
+ // slower on microarchitectures we've tested. See
39
+ // https://github.com/BLAKE3-team/BLAKE3/pull/319.
40
+ // return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
41
+ return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)));
40
42
  }
41
43
 
42
44
  INLINE uint32x4_t rot12_128(uint32x4_t x) {
43
- return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
45
+ // See comment in rot16_128.
46
+ // return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
47
+ return vsriq_n_u32(vshlq_n_u32(x, 32-12), x, 12);
44
48
  }
45
49
 
46
50
  INLINE uint32x4_t rot8_128(uint32x4_t x) {
47
- return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
51
+ // See comment in rot16_128.
52
+ // return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
53
+ #if defined(__clang__)
54
+ return vreinterpretq_u32_u8(__builtin_shufflevector(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12));
55
+ #elif __GNUC__ * 10000 + __GNUC_MINOR__ * 100 >=40700
56
+ static const uint8x16_t r8 = {1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12};
57
+ return vreinterpretq_u32_u8(__builtin_shuffle(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), r8));
58
+ #else
59
+ return vsriq_n_u32(vshlq_n_u32(x, 32-8), x, 8);
60
+ #endif
48
61
  }
49
62
 
50
63
  INLINE uint32x4_t rot7_128(uint32x4_t x) {
51
- return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
64
+ // See comment in rot16_128.
65
+ // return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
66
+ return vsriq_n_u32(vshlq_n_u32(x, 32-7), x, 7);
52
67
  }
53
68
 
54
69
  // TODO: compress_neon
@@ -396,7 +396,7 @@ INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
396
396
  }
397
397
 
398
398
  INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
399
- // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is
399
+ // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
400
400
  // 22/33. Note that this doesn't split the vector into two lanes, as the
401
401
  // AVX2 counterparts do.
402
402
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
@@ -2301,7 +2301,7 @@ blake3_compress_xof_sse2:
2301
2301
  ret
2302
2302
 
2303
2303
 
2304
- .section .rodata
2304
+ .section .rdata
2305
2305
  .p2align 6
2306
2306
  BLAKE3_IV:
2307
2307
  .long 0x6A09E667, 0xBB67AE85
@@ -390,7 +390,7 @@ INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
390
390
  }
391
391
 
392
392
  INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
393
- // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is
393
+ // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
394
394
  // 22/33. Note that this doesn't split the vector into two lanes, as the
395
395
  // AVX2 counterparts do.
396
396
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
@@ -2042,7 +2042,7 @@ blake3_compress_xof_sse41:
2042
2042
  ret
2043
2043
 
2044
2044
 
2045
- .section .rodata
2045
+ .section .rdata
2046
2046
  .p2align 6
2047
2047
  BLAKE3_IV:
2048
2048
  .long 0x6A09E667, 0xBB67AE85
@@ -2,6 +2,6 @@ require 'digest'
2
2
 
3
3
  module Digest
4
4
  class BLAKE3 < Base
5
- VERSION = "1.4.0.0"
5
+ VERSION = "1.5.1.0"
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: digest-blake3
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.0.0
4
+ version: 1.5.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Will Bryant
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-09-17 00:00:00.000000000 Z
11
+ date: 2024-07-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler