digest-blake3 1.4.0.0 → 1.4.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 600afca6f08145f3e28b49fbe757b661368d58dc8ec20e1778a915407dcc660a
4
- data.tar.gz: 1cd455e9caf97fd0f514623ba6b9d7f74249071af8cb8b5377ba10de91c5eb34
3
+ metadata.gz: c1fde08d1bb8e1e082e30d543fff61c7bbad77a577109d400883c733c72404df
4
+ data.tar.gz: e8705dcc092f4cfe63d95d9d837233717a3aeea0f994e2fc49498dae947c7a42
5
5
  SHA512:
6
- metadata.gz: d515228fab5f92576d9b1f67d66ffff97623f154dcab9a1dcb140b9e69884325797991d1f87d7f5cb26cb6397a86e989a1c1f2163b478641efdc4b85b5772026
7
- data.tar.gz: ed75418dda098a8700554b871189c9995e1e6e4969d86cc8d9afd46d8439ecf985fb111f1a6d6adb9173f68cb33c7c261b35b0820439c48a92e354da2f662e35
6
+ metadata.gz: ca517b08b45f06d098a2938f9cf321a85069d0f59861606fa3ddb44f192d54ca6c8a3dc8607a7704692b81c0c695f1fe9d98bf14fc0068273b165b2772375047
7
+ data.tar.gz: c295b09c4cbbe7f840bfe4088d522b9e2ca362ed9118962b23dd7756b5b4c916bd974b6132e65bb08ea375d83e5fe5b5281f29fdb40be44b03f6da9451bfbfac
@@ -254,7 +254,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
254
254
  // As a special case when the SIMD degree is 1, this function will still return
255
255
  // at least 2 outputs. This guarantees that this function doesn't perform the
256
256
  // root compression. (If it did, it would use the wrong flags, and also we
257
- // wouldn't be able to implement exendable output.) Note that this function is
257
+ // wouldn't be able to implement extendable output.) Note that this function is
258
258
  // not used when the whole input is only 1 chunk long; that's a different
259
259
  // codepath.
260
260
  //
@@ -4,11 +4,33 @@
4
4
  #include <stddef.h>
5
5
  #include <stdint.h>
6
6
 
7
+ #if !defined(BLAKE3_API)
8
+ # if defined(_WIN32) || defined(__CYGWIN__)
9
+ # if defined(BLAKE3_DLL)
10
+ # if defined(BLAKE3_DLL_EXPORTS)
11
+ # define BLAKE3_API __declspec(dllexport)
12
+ # else
13
+ # define BLAKE3_API __declspec(dllimport)
14
+ # endif
15
+ # define BLAKE3_PRIVATE
16
+ # else
17
+ # define BLAKE3_API
18
+ # define BLAKE3_PRIVATE
19
+ # endif
20
+ # elif __GNUC__ >= 4
21
+ # define BLAKE3_API __attribute__((visibility("default")))
22
+ # define BLAKE3_PRIVATE __attribute__((visibility("hidden")))
23
+ # else
24
+ # define BLAKE3_API
25
+ # define BLAKE3_PRIVATE
26
+ # endif
27
+ #endif
28
+
7
29
  #ifdef __cplusplus
8
30
  extern "C" {
9
31
  #endif
10
32
 
11
- #define BLAKE3_VERSION_STRING "1.3.3"
33
+ #define BLAKE3_VERSION_STRING "1.4.1"
12
34
  #define BLAKE3_KEY_LEN 32
13
35
  #define BLAKE3_OUT_LEN 32
14
36
  #define BLAKE3_BLOCK_LEN 64
@@ -38,20 +60,20 @@ typedef struct {
38
60
  uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
39
61
  } blake3_hasher;
40
62
 
41
- const char *blake3_version(void);
42
- void blake3_hasher_init(blake3_hasher *self);
43
- void blake3_hasher_init_keyed(blake3_hasher *self,
44
- const uint8_t key[BLAKE3_KEY_LEN]);
45
- void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
46
- void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
47
- size_t context_len);
48
- void blake3_hasher_update(blake3_hasher *self, const void *input,
49
- size_t input_len);
50
- void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
51
- size_t out_len);
52
- void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
53
- uint8_t *out, size_t out_len);
54
- void blake3_hasher_reset(blake3_hasher *self);
63
+ BLAKE3_API const char *blake3_version(void);
64
+ BLAKE3_API void blake3_hasher_init(blake3_hasher *self);
65
+ BLAKE3_API void blake3_hasher_init_keyed(blake3_hasher *self,
66
+ const uint8_t key[BLAKE3_KEY_LEN]);
67
+ BLAKE3_API void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
68
+ BLAKE3_API void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
69
+ size_t context_len);
70
+ BLAKE3_API void blake3_hasher_update(blake3_hasher *self, const void *input,
71
+ size_t input_len);
72
+ BLAKE3_API void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
73
+ size_t out_len);
74
+ BLAKE3_API void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
75
+ uint8_t *out, size_t out_len);
76
+ BLAKE3_API void blake3_hasher_reset(blake3_hasher *self);
55
77
 
56
78
  #ifdef __cplusplus
57
79
  }
@@ -167,7 +167,7 @@ INLINE void transpose_vecs(__m256i vecs[DEGREE]) {
167
167
  __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
168
168
  __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
169
169
 
170
- // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is
170
+ // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
171
171
  // 11/33.
172
172
  __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
173
173
  __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
@@ -1784,7 +1784,7 @@ blake3_hash_many_avx2:
1784
1784
  vmovdqu xmmword ptr [rbx+0x10], xmm1
1785
1785
  jmp 4b
1786
1786
 
1787
- .section .rodata
1787
+ .section .rdata
1788
1788
  .p2align 6
1789
1789
  ADD0:
1790
1790
  .long 0, 1, 2, 3, 4, 5, 6, 7
@@ -429,7 +429,7 @@ INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) {
429
429
  }
430
430
 
431
431
  INLINE void transpose_vecs_128(__m128i vecs[4]) {
432
- // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is
432
+ // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
433
433
  // 22/33. Note that this doesn't split the vector into two lanes, as the
434
434
  // AVX2 counterparts do.
435
435
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
@@ -684,7 +684,7 @@ INLINE void transpose_vecs_256(__m256i vecs[8]) {
684
684
  __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
685
685
  __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
686
686
 
687
- // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is
687
+ // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
688
688
  // 11/33.
689
689
  __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
690
690
  __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
@@ -959,7 +959,7 @@ INLINE void transpose_vecs_512(__m512i vecs[16]) {
959
959
  __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]);
960
960
  __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]);
961
961
 
962
- // Interleave 64-bit lates. The _0 unpack is lanes
962
+ // Interleave 64-bit lanes. The _0 unpack is lanes
963
963
  // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes
964
964
  // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes
965
965
  // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes
@@ -2587,7 +2587,7 @@ blake3_compress_xof_avx512:
2587
2587
  add rsp, 72
2588
2588
  ret
2589
2589
 
2590
- .section .rodata
2590
+ .section .rdata
2591
2591
  .p2align 6
2592
2592
  INDEX0:
2593
2593
  .long 0, 1, 2, 3, 16, 17, 18, 19
@@ -101,7 +101,7 @@ static
101
101
  if (*edx & (1UL << 26))
102
102
  features |= SSE2;
103
103
  #endif
104
- if (*ecx & (1UL << 0))
104
+ if (*ecx & (1UL << 9))
105
105
  features |= SSSE3;
106
106
  if (*ecx & (1UL << 19))
107
107
  features |= SSE41;
@@ -87,7 +87,7 @@ static const uint8_t MSG_SCHEDULE[7][16] = {
87
87
  /* x is assumed to be nonzero. */
88
88
  static unsigned int highest_one(uint64_t x) {
89
89
  #if defined(__GNUC__) || defined(__clang__)
90
- return 63 ^ __builtin_clzll(x);
90
+ return 63 ^ (unsigned int)__builtin_clzll(x);
91
91
  #elif defined(_MSC_VER) && defined(IS_X86_64)
92
92
  unsigned long index;
93
93
  _BitScanReverse64(&index, x);
@@ -117,7 +117,7 @@ static unsigned int highest_one(uint64_t x) {
117
117
  // Count the number of 1 bits.
118
118
  INLINE unsigned int popcnt(uint64_t x) {
119
119
  #if defined(__GNUC__) || defined(__clang__)
120
- return __builtin_popcountll(x);
120
+ return (unsigned int)__builtin_popcountll(x);
121
121
  #else
122
122
  unsigned int count = 0;
123
123
  while (x != 0) {
@@ -36,19 +36,36 @@ INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
36
36
  }
37
37
 
38
38
  INLINE uint32x4_t rot16_128(uint32x4_t x) {
39
- return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
39
+ // The straightforward implementation would be two shifts and an or, but that's
40
+ // slower on microarchitectures we've tested. See
41
+ // https://github.com/BLAKE3-team/BLAKE3/pull/319.
42
+ // return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
43
+ return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)));
40
44
  }
41
45
 
42
46
  INLINE uint32x4_t rot12_128(uint32x4_t x) {
43
- return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
47
+ // See comment in rot16_128.
48
+ // return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
49
+ return vsriq_n_u32(vshlq_n_u32(x, 32-12), x, 12);
44
50
  }
45
51
 
46
52
  INLINE uint32x4_t rot8_128(uint32x4_t x) {
47
- return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
53
+ // See comment in rot16_128.
54
+ // return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
55
+ #if defined(__clang__)
56
+ return vreinterpretq_u32_u8(__builtin_shufflevector(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12));
57
+ #elif __GNUC__ * 10000 + __GNUC_MINOR__ * 100 >=40700
58
+ static const uint8x16_t r8 = {1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12};
59
+ return vreinterpretq_u32_u8(__builtin_shuffle(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), r8));
60
+ #else
61
+ return vsriq_n_u32(vshlq_n_u32(x, 32-8), x, 8);
62
+ #endif
48
63
  }
49
64
 
50
65
  INLINE uint32x4_t rot7_128(uint32x4_t x) {
51
- return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
66
+ // See comment in rot16_128.
67
+ // return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
68
+ return vsriq_n_u32(vshlq_n_u32(x, 32-7), x, 7);
52
69
  }
53
70
 
54
71
  // TODO: compress_neon
@@ -396,7 +396,7 @@ INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
396
396
  }
397
397
 
398
398
  INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
399
- // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is
399
+ // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
400
400
  // 22/33. Note that this doesn't split the vector into two lanes, as the
401
401
  // AVX2 counterparts do.
402
402
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
@@ -2301,7 +2301,7 @@ blake3_compress_xof_sse2:
2301
2301
  ret
2302
2302
 
2303
2303
 
2304
- .section .rodata
2304
+ .section .rdata
2305
2305
  .p2align 6
2306
2306
  BLAKE3_IV:
2307
2307
  .long 0x6A09E667, 0xBB67AE85
@@ -390,7 +390,7 @@ INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
390
390
  }
391
391
 
392
392
  INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
393
- // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is
393
+ // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
394
394
  // 22/33. Note that this doesn't split the vector into two lanes, as the
395
395
  // AVX2 counterparts do.
396
396
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
@@ -2042,7 +2042,7 @@ blake3_compress_xof_sse41:
2042
2042
  ret
2043
2043
 
2044
2044
 
2045
- .section .rodata
2045
+ .section .rdata
2046
2046
  .p2align 6
2047
2047
  BLAKE3_IV:
2048
2048
  .long 0x6A09E667, 0xBB67AE85
@@ -2,6 +2,6 @@ require 'digest'
2
2
 
3
3
  module Digest
4
4
  class BLAKE3 < Base
5
- VERSION = "1.4.0.0"
5
+ VERSION = "1.4.1.0"
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: digest-blake3
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.4.0.0
4
+ version: 1.4.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Will Bryant