digest-blake3 1.4.0.0 → 1.4.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/ext/digest/blake3/blake3.c +1 -1
- data/ext/digest/blake3/blake3.h +37 -15
- data/ext/digest/blake3/blake3_avx2.c +1 -1
- data/ext/digest/blake3/blake3_avx2_x86-64_windows_gnu.S +1 -1
- data/ext/digest/blake3/blake3_avx512.c +3 -3
- data/ext/digest/blake3/blake3_avx512_x86-64_windows_gnu.S +1 -1
- data/ext/digest/blake3/blake3_dispatch.c +1 -1
- data/ext/digest/blake3/blake3_impl.h +2 -2
- data/ext/digest/blake3/blake3_neon.c +21 -4
- data/ext/digest/blake3/blake3_sse2.c +1 -1
- data/ext/digest/blake3/blake3_sse2_x86-64_windows_gnu.S +1 -1
- data/ext/digest/blake3/blake3_sse41.c +1 -1
- data/ext/digest/blake3/blake3_sse41_x86-64_windows_gnu.S +1 -1
- data/lib/digest/blake3/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c1fde08d1bb8e1e082e30d543fff61c7bbad77a577109d400883c733c72404df
|
4
|
+
data.tar.gz: e8705dcc092f4cfe63d95d9d837233717a3aeea0f994e2fc49498dae947c7a42
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ca517b08b45f06d098a2938f9cf321a85069d0f59861606fa3ddb44f192d54ca6c8a3dc8607a7704692b81c0c695f1fe9d98bf14fc0068273b165b2772375047
|
7
|
+
data.tar.gz: c295b09c4cbbe7f840bfe4088d522b9e2ca362ed9118962b23dd7756b5b4c916bd974b6132e65bb08ea375d83e5fe5b5281f29fdb40be44b03f6da9451bfbfac
|
data/ext/digest/blake3/blake3.c
CHANGED
@@ -254,7 +254,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
|
|
254
254
|
// As a special case when the SIMD degree is 1, this function will still return
|
255
255
|
// at least 2 outputs. This guarantees that this function doesn't perform the
|
256
256
|
// root compression. (If it did, it would use the wrong flags, and also we
|
257
|
-
// wouldn't be able to implement
|
257
|
+
// wouldn't be able to implement extendable output.) Note that this function is
|
258
258
|
// not used when the whole input is only 1 chunk long; that's a different
|
259
259
|
// codepath.
|
260
260
|
//
|
data/ext/digest/blake3/blake3.h
CHANGED
@@ -4,11 +4,33 @@
|
|
4
4
|
#include <stddef.h>
|
5
5
|
#include <stdint.h>
|
6
6
|
|
7
|
+
#if !defined(BLAKE3_API)
|
8
|
+
# if defined(_WIN32) || defined(__CYGWIN__)
|
9
|
+
# if defined(BLAKE3_DLL)
|
10
|
+
# if defined(BLAKE3_DLL_EXPORTS)
|
11
|
+
# define BLAKE3_API __declspec(dllexport)
|
12
|
+
# else
|
13
|
+
# define BLAKE3_API __declspec(dllimport)
|
14
|
+
# endif
|
15
|
+
# define BLAKE3_PRIVATE
|
16
|
+
# else
|
17
|
+
# define BLAKE3_API
|
18
|
+
# define BLAKE3_PRIVATE
|
19
|
+
# endif
|
20
|
+
# elif __GNUC__ >= 4
|
21
|
+
# define BLAKE3_API __attribute__((visibility("default")))
|
22
|
+
# define BLAKE3_PRIVATE __attribute__((visibility("hidden")))
|
23
|
+
# else
|
24
|
+
# define BLAKE3_API
|
25
|
+
# define BLAKE3_PRIVATE
|
26
|
+
# endif
|
27
|
+
#endif
|
28
|
+
|
7
29
|
#ifdef __cplusplus
|
8
30
|
extern "C" {
|
9
31
|
#endif
|
10
32
|
|
11
|
-
#define BLAKE3_VERSION_STRING "1.
|
33
|
+
#define BLAKE3_VERSION_STRING "1.4.1"
|
12
34
|
#define BLAKE3_KEY_LEN 32
|
13
35
|
#define BLAKE3_OUT_LEN 32
|
14
36
|
#define BLAKE3_BLOCK_LEN 64
|
@@ -38,20 +60,20 @@ typedef struct {
|
|
38
60
|
uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
|
39
61
|
} blake3_hasher;
|
40
62
|
|
41
|
-
const char *blake3_version(void);
|
42
|
-
void blake3_hasher_init(blake3_hasher *self);
|
43
|
-
void blake3_hasher_init_keyed(blake3_hasher *self,
|
44
|
-
|
45
|
-
void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
|
46
|
-
void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
|
47
|
-
|
48
|
-
void blake3_hasher_update(blake3_hasher *self, const void *input,
|
49
|
-
|
50
|
-
void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
|
51
|
-
|
52
|
-
void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
|
53
|
-
|
54
|
-
void blake3_hasher_reset(blake3_hasher *self);
|
63
|
+
BLAKE3_API const char *blake3_version(void);
|
64
|
+
BLAKE3_API void blake3_hasher_init(blake3_hasher *self);
|
65
|
+
BLAKE3_API void blake3_hasher_init_keyed(blake3_hasher *self,
|
66
|
+
const uint8_t key[BLAKE3_KEY_LEN]);
|
67
|
+
BLAKE3_API void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
|
68
|
+
BLAKE3_API void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
|
69
|
+
size_t context_len);
|
70
|
+
BLAKE3_API void blake3_hasher_update(blake3_hasher *self, const void *input,
|
71
|
+
size_t input_len);
|
72
|
+
BLAKE3_API void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
|
73
|
+
size_t out_len);
|
74
|
+
BLAKE3_API void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
|
75
|
+
uint8_t *out, size_t out_len);
|
76
|
+
BLAKE3_API void blake3_hasher_reset(blake3_hasher *self);
|
55
77
|
|
56
78
|
#ifdef __cplusplus
|
57
79
|
}
|
@@ -167,7 +167,7 @@ INLINE void transpose_vecs(__m256i vecs[DEGREE]) {
|
|
167
167
|
__m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
|
168
168
|
__m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
|
169
169
|
|
170
|
-
// Interleave 64-bit
|
170
|
+
// Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
|
171
171
|
// 11/33.
|
172
172
|
__m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
|
173
173
|
__m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
|
@@ -429,7 +429,7 @@ INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) {
|
|
429
429
|
}
|
430
430
|
|
431
431
|
INLINE void transpose_vecs_128(__m128i vecs[4]) {
|
432
|
-
// Interleave 32-bit
|
432
|
+
// Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
|
433
433
|
// 22/33. Note that this doesn't split the vector into two lanes, as the
|
434
434
|
// AVX2 counterparts do.
|
435
435
|
__m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
|
@@ -684,7 +684,7 @@ INLINE void transpose_vecs_256(__m256i vecs[8]) {
|
|
684
684
|
__m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
|
685
685
|
__m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
|
686
686
|
|
687
|
-
// Interleave 64-bit
|
687
|
+
// Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
|
688
688
|
// 11/33.
|
689
689
|
__m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
|
690
690
|
__m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
|
@@ -959,7 +959,7 @@ INLINE void transpose_vecs_512(__m512i vecs[16]) {
|
|
959
959
|
__m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]);
|
960
960
|
__m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]);
|
961
961
|
|
962
|
-
// Interleave 64-bit
|
962
|
+
// Interleave 64-bit lanes. The _0 unpack is lanes
|
963
963
|
// 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes
|
964
964
|
// 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes
|
965
965
|
// 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes
|
@@ -87,7 +87,7 @@ static const uint8_t MSG_SCHEDULE[7][16] = {
|
|
87
87
|
/* x is assumed to be nonzero. */
|
88
88
|
static unsigned int highest_one(uint64_t x) {
|
89
89
|
#if defined(__GNUC__) || defined(__clang__)
|
90
|
-
return 63 ^ __builtin_clzll(x);
|
90
|
+
return 63 ^ (unsigned int)__builtin_clzll(x);
|
91
91
|
#elif defined(_MSC_VER) && defined(IS_X86_64)
|
92
92
|
unsigned long index;
|
93
93
|
_BitScanReverse64(&index, x);
|
@@ -117,7 +117,7 @@ static unsigned int highest_one(uint64_t x) {
|
|
117
117
|
// Count the number of 1 bits.
|
118
118
|
INLINE unsigned int popcnt(uint64_t x) {
|
119
119
|
#if defined(__GNUC__) || defined(__clang__)
|
120
|
-
return __builtin_popcountll(x);
|
120
|
+
return (unsigned int)__builtin_popcountll(x);
|
121
121
|
#else
|
122
122
|
unsigned int count = 0;
|
123
123
|
while (x != 0) {
|
@@ -36,19 +36,36 @@ INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
|
|
36
36
|
}
|
37
37
|
|
38
38
|
INLINE uint32x4_t rot16_128(uint32x4_t x) {
|
39
|
-
|
39
|
+
// The straightforward implementation would be two shifts and an or, but that's
|
40
|
+
// slower on microarchitectures we've tested. See
|
41
|
+
// https://github.com/BLAKE3-team/BLAKE3/pull/319.
|
42
|
+
// return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
|
43
|
+
return vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x)));
|
40
44
|
}
|
41
45
|
|
42
46
|
INLINE uint32x4_t rot12_128(uint32x4_t x) {
|
43
|
-
|
47
|
+
// See comment in rot16_128.
|
48
|
+
// return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
|
49
|
+
return vsriq_n_u32(vshlq_n_u32(x, 32-12), x, 12);
|
44
50
|
}
|
45
51
|
|
46
52
|
INLINE uint32x4_t rot8_128(uint32x4_t x) {
|
47
|
-
|
53
|
+
// See comment in rot16_128.
|
54
|
+
// return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
|
55
|
+
#if defined(__clang__)
|
56
|
+
return vreinterpretq_u32_u8(__builtin_shufflevector(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), 1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12));
|
57
|
+
#elif __GNUC__ * 10000 + __GNUC_MINOR__ * 100 >=40700
|
58
|
+
static const uint8x16_t r8 = {1,2,3,0,5,6,7,4,9,10,11,8,13,14,15,12};
|
59
|
+
return vreinterpretq_u32_u8(__builtin_shuffle(vreinterpretq_u8_u32(x), vreinterpretq_u8_u32(x), r8));
|
60
|
+
#else
|
61
|
+
return vsriq_n_u32(vshlq_n_u32(x, 32-8), x, 8);
|
62
|
+
#endif
|
48
63
|
}
|
49
64
|
|
50
65
|
INLINE uint32x4_t rot7_128(uint32x4_t x) {
|
51
|
-
|
66
|
+
// See comment in rot16_128.
|
67
|
+
// return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
|
68
|
+
return vsriq_n_u32(vshlq_n_u32(x, 32-7), x, 7);
|
52
69
|
}
|
53
70
|
|
54
71
|
// TODO: compress_neon
|
@@ -396,7 +396,7 @@ INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
|
|
396
396
|
}
|
397
397
|
|
398
398
|
INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
|
399
|
-
// Interleave 32-bit
|
399
|
+
// Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
|
400
400
|
// 22/33. Note that this doesn't split the vector into two lanes, as the
|
401
401
|
// AVX2 counterparts do.
|
402
402
|
__m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
|
@@ -390,7 +390,7 @@ INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
|
|
390
390
|
}
|
391
391
|
|
392
392
|
INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
|
393
|
-
// Interleave 32-bit
|
393
|
+
// Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
|
394
394
|
// 22/33. Note that this doesn't split the vector into two lanes, as the
|
395
395
|
// AVX2 counterparts do.
|
396
396
|
__m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
|