digest-blake3 1.4.0.0 → 1.4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/digest/blake3/blake3.c +1 -1
- data/ext/digest/blake3/blake3.h +37 -15
- data/ext/digest/blake3/blake3_avx2.c +1 -1
- data/ext/digest/blake3/blake3_avx2_x86-64_windows_gnu.S +1 -1
- data/ext/digest/blake3/blake3_avx512.c +3 -3
- data/ext/digest/blake3/blake3_avx512_x86-64_windows_gnu.S +1 -1
- data/ext/digest/blake3/blake3_dispatch.c +1 -1
- data/ext/digest/blake3/blake3_impl.h +2 -2
- data/ext/digest/blake3/blake3_neon.c +21 -4
- data/ext/digest/blake3/blake3_sse2.c +1 -1
- data/ext/digest/blake3/blake3_sse2_x86-64_windows_gnu.S +1 -1
- data/ext/digest/blake3/blake3_sse41.c +1 -1
- data/ext/digest/blake3/blake3_sse41_x86-64_windows_gnu.S +1 -1
- data/lib/digest/blake3/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c1fde08d1bb8e1e082e30d543fff61c7bbad77a577109d400883c733c72404df
|
4
|
+
data.tar.gz: e8705dcc092f4cfe63d95d9d837233717a3aeea0f994e2fc49498dae947c7a42
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ca517b08b45f06d098a2938f9cf321a85069d0f59861606fa3ddb44f192d54ca6c8a3dc8607a7704692b81c0c695f1fe9d98bf14fc0068273b165b2772375047
|
7
|
+
data.tar.gz: c295b09c4cbbe7f840bfe4088d522b9e2ca362ed9118962b23dd7756b5b4c916bd974b6132e65bb08ea375d83e5fe5b5281f29fdb40be44b03f6da9451bfbfac
|
data/ext/digest/blake3/blake3.c
CHANGED
@@ -254,7 +254,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
|
|
254
254
|
// As a special case when the SIMD degree is 1, this function will still return
|
255
255
|
// at least 2 outputs. This guarantees that this function doesn't perform the
|
256
256
|
// root compression. (If it did, it would use the wrong flags, and also we
|
257
|
-
// wouldn't be able to implement
|
257
|
+
// wouldn't be able to implement extendable output.) Note that this function is
|
258
258
|
// not used when the whole input is only 1 chunk long; that's a different
|
259
259
|
// codepath.
|
260
260
|
//
|
data/ext/digest/blake3/blake3.h
CHANGED
@@ -4,11 +4,33 @@
|
|
4
4
|
#include <stddef.h>
|
5
5
|
#include <stdint.h>
|
6
6
|
|
7
|
+
#if !defined(BLAKE3_API)
|
8
|
+
# if defined(_WIN32) || defined(__CYGWIN__)
|
9
|
+
# if defined(BLAKE3_DLL)
|
10
|
+
# if defined(BLAKE3_DLL_EXPORTS)
|
11
|
+
# define BLAKE3_API __declspec(dllexport)
|
12
|
+
# else
|
13
|
+
# define BLAKE3_API __declspec(dllimport)
|
14
|
+
# endif
|
15
|
+
# define BLAKE3_PRIVATE
|
16
|
+
# else
|
17
|
+
# define BLAKE3_API
|
18
|
+
# define BLAKE3_PRIVATE
|
19
|
+
# endif
|
20
|
+
# elif __GNUC__ >= 4
|
21
|
+
# define BLAKE3_API __attribute__((visibility("default")))
|
22
|
+
# define BLAKE3_PRIVATE __attribute__((visibility("hidden")))
|
23
|
+
# else
|
24
|
+
# define BLAKE3_API
|
25
|
+
# define BLAKE3_PRIVATE
|
26
|
+
# endif
|
27
|
+
#endif
|
28
|
+
|
7
29
|
#ifdef __cplusplus
|
8
30
|
extern "C" {
|
9
31
|
#endif
|
10
32
|
|
11
|
-
#define BLAKE3_VERSION_STRING "1.4.0"
|
33
|
+
#define BLAKE3_VERSION_STRING "1.4.1"
|
12
34
|
#define BLAKE3_KEY_LEN 32
|
13
35
|
#define BLAKE3_OUT_LEN 32
|
14
36
|
#define BLAKE3_BLOCK_LEN 64
|
@@ -38,20 +60,20 @@ typedef struct {
|
|
38
60
|
uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
|
39
61
|
} blake3_hasher;
|
40
62
|
|
41
|
-
const char *blake3_version(void);
|
42
|
-
void blake3_hasher_init(blake3_hasher *self);
|
43
|
-
void blake3_hasher_init_keyed(blake3_hasher *self,
|
44
|
-
|
45
|
-
void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
|
46
|
-
void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
|
47
|
-
|
48
|
-
void blake3_hasher_update(blake3_hasher *self, const void *input,
|
49
|
-
|
50
|
-
void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
|
51
|
-
|
52
|
-
void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
|
53
|
-
|
54
|
-
void blake3_hasher_reset(blake3_hasher *self);
|
63
|
+
BLAKE3_API const char *blake3_version(void);
|
64
|
+
BLAKE3_API void blake3_hasher_init(blake3_hasher *self);
|
65
|
+
BLAKE3_API void blake3_hasher_init_keyed(blake3_hasher *self,
|
66
|
+
const uint8_t key[BLAKE3_KEY_LEN]);
|
67
|
+
BLAKE3_API void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
|
68
|
+
BLAKE3_API void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
|
69
|
+
size_t context_len);
|
70
|
+
BLAKE3_API void blake3_hasher_update(blake3_hasher *self, const void *input,
|
71
|
+
size_t input_len);
|
72
|
+
BLAKE3_API void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
|
73
|
+
size_t out_len);
|
74
|
+
BLAKE3_API void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
|
75
|
+
uint8_t *out, size_t out_len);
|
76
|
+
BLAKE3_API void blake3_hasher_reset(blake3_hasher *self);
|
55
77
|
|
56
78
|
#ifdef __cplusplus
|
57
79
|
}
|
@@ -167,7 +167,7 @@ INLINE void transpose_vecs(__m256i vecs[DEGREE]) {
|
|
167
167
|
__m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
|
168
168
|
__m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
|
169
169
|
|
170
|
-
// Interleave 64-bit
|
170
|
+
// Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
|
171
171
|
// 11/33.
|
172
172
|
__m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
|
173
173
|
__m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
|
@@ -429,7 +429,7 @@ INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) {
|
|
429
429
|
}
|
430
430
|
|
431
431
|
INLINE void transpose_vecs_128(__m128i vecs[4]) {
|
432
|
-
// Interleave 32-bit
|
432
|
+
// Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
|
433
433
|
// 22/33. Note that this doesn't split the vector into two lanes, as the
|
434
434
|
// AVX2 counterparts do.
|
435
435
|
__m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
|
@@ -684,7 +684,7 @@ INLINE void transpose_vecs_256(__m256i vecs[8]) {
|
|
684
684
|
__m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
|
685
685
|
__m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
|
686
686
|
|
687
|
-
// Interleave 64-bit
|
687
|
+
// Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
|
688
688
|
// 11/33.
|
689
689
|
__m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
|
690
690
|
__m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
|
@@ -959,7 +959,7 @@ INLINE void transpose_vecs_512(__m512i vecs[16]) {
|
|
959
959
|
__m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]);
|
960
960
|
__m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]);
|
961
961
|
|
962
|
-
// Interleave 64-bit
|
962
|
+
// Interleave 64-bit lanes. The _0 unpack is lanes
|
963
963
|
// 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes
|
964
964
|
// 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes
|
965
965
|
// 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes
|
@@ -87,7 +87,7 @@ static const uint8_t MSG_SCHEDULE[7][16] = {
|
|
87
87
|
/* x is assumed to be nonzero. */
|
88
88
|
static unsigned int highest_one(uint64_t x) {
|
89
89
|
#if defined(__GNUC__) || defined(__clang__)
|
90
|
-
return 63 ^ __builtin_clzll(x);
|
90
|
+
return 63 ^ (unsigned int)__builtin_clzll(x);
|
91
91
|
#elif defined(_MSC_VER) && defined(IS_X86_64)
|
92
92
|
unsigned long index;
|
93
93
|
_BitScanReverse64(&index, x);
|
@@ -117,7 +117,7 @@ static unsigned int highest_one(uint64_t x) {
|
|
117
117
|
// Count the number of 1 bits.
|
118
118
|
INLINE unsigned int popcnt(uint64_t x) {
|
119
119
|
#if defined(__GNUC__) || defined(__clang__)
|
120
|
-
return __builtin_popcountll(x);
|
120
|
+
return (unsigned int)__builtin_popcountll(x);
|
121
121
|
#else
|
122
122
|
unsigned int count = 0;
|
123
123
|
while (x != 0) {
|
@@ -36,19 +36,36 @@ INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
|
|
36
36
|
}
|
37
37
|
|
38
38
|
INLINE uint32x4_t rot16_128(uint32x4_t x) {
|
39
|
-
|
39
|
+
// The straightforward implementation would be two shifts and an or, but that's
|
40
|
+
// slower on microarchitectures we've tested. See
|
41
|
+
// https://github.com/BLAKE3-team/BLAKE3/pull/319.
|
42
|
+
// return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
|
43
|
+
return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)));
|
40
44
|
}
|
41
45
|
|
42
46
|
INLINE uint32x4_t rot12_128(uint32x4_t x) {
|
43
|
-
|
47
|
+
// See comment in rot16_128.
|
48
|
+
// return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
|
49
|
+
return vsriq_n_u32(vshlq_n_u32(x, 32-12), x, 12);
|
44
50
|
}
|
45
51
|
|
46
52
|
INLINE uint32x4_t rot8_128(uint32x4_t x) {
|
47
|
-
|
53
|
+
// See comment in rot16_128.
|
54
|
+
// return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
|
55
|
+
#if defined(__clang__)
|
56
|
+
return vreinterpretq_u32_u8(__builtin_shufflevector(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12));
|
57
|
+
#elif __GNUC__ * 10000 + __GNUC_MINOR__ * 100 >=40700
|
58
|
+
static const uint8x16_t r8 = {1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12};
|
59
|
+
return vreinterpretq_u32_u8(__builtin_shuffle(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), r8));
|
60
|
+
#else
|
61
|
+
return vsriq_n_u32(vshlq_n_u32(x, 32-8), x, 8);
|
62
|
+
#endif
|
48
63
|
}
|
49
64
|
|
50
65
|
INLINE uint32x4_t rot7_128(uint32x4_t x) {
|
51
|
-
|
66
|
+
// See comment in rot16_128.
|
67
|
+
// return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
|
68
|
+
return vsriq_n_u32(vshlq_n_u32(x, 32-7), x, 7);
|
52
69
|
}
|
53
70
|
|
54
71
|
// TODO: compress_neon
|
@@ -396,7 +396,7 @@ INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
|
|
396
396
|
}
|
397
397
|
|
398
398
|
INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
|
399
|
-
// Interleave 32-bit
|
399
|
+
// Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
|
400
400
|
// 22/33. Note that this doesn't split the vector into two lanes, as the
|
401
401
|
// AVX2 counterparts do.
|
402
402
|
__m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
|
@@ -390,7 +390,7 @@ INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
|
|
390
390
|
}
|
391
391
|
|
392
392
|
INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
|
393
|
-
// Interleave 32-bit
|
393
|
+
// Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
|
394
394
|
// 22/33. Note that this doesn't split the vector into two lanes, as the
|
395
395
|
// AVX2 counterparts do.
|
396
396
|
__m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
|