digest-blake3 0.37.0.1 → 1.3.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4dc981436633bde6ba4fb278252d8a4a1ba58d039d0b1c8c794e36c4e47fa4a0
4
- data.tar.gz: ae40be72a0252730792f3e82a00da765546c9606d91405ff69d3bad078ad307f
3
+ metadata.gz: 6e9f7d8e4619bac26fee8eceb10b221935755588c05f333e53275080c61b83a5
4
+ data.tar.gz: fecaddd526e8bf374d675e80d49aefbcbed12dc491a6be50ea9beea335b943fd
5
5
  SHA512:
6
- metadata.gz: c18ca69b1f4b47ac8308ee00cc6db861eb48bd3921a85c291cb09ea595534b1a476988453c1931cb9982e9f96e7d14e4fa4356cbecaa9c584252c9b7ad30ac62
7
- data.tar.gz: b9cf5f04daf5d83a797191caa2f2c30e068ddd6b771d887acb96963b93550171e77ea81128fffc316b49641391a4f245a8484b8b909d53502be3a2fb3170ad76
6
+ metadata.gz: 6ecd8cdc8f5b320f152e4d89c263b68803f63ec609235965eebc11a8093d146bb7223b86302c24c0dc65abb69cbf6040e5635704fb07caeb115c8f76effe38c2
7
+ data.tar.gz: ec642c10fb95e51620ad78fa2915d467dc665b7c1683f8f30c56ab9ab5560a98983e6afdd4fca1c18d2088f0bd0e58248ea30a05230bbed48ff95d74851cd056
data/Gemfile.lock CHANGED
@@ -1,13 +1,13 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- digest-blake3 (0.37.0)
4
+ digest-blake3 (1.3.3.1)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
8
8
  specs:
9
- minitest (5.14.0)
10
- rake (13.0.1)
9
+ minitest (5.16.3)
10
+ rake (13.0.6)
11
11
 
12
12
  PLATFORMS
13
13
  ruby
@@ -19,4 +19,4 @@ DEPENDENCIES
19
19
  rake
20
20
 
21
21
  BUNDLED WITH
22
- 1.17.3
22
+ 2.3.1
@@ -5,6 +5,8 @@
5
5
  #include "blake3.h"
6
6
  #include "blake3_impl.h"
7
7
 
8
+ const char *blake3_version(void) { return BLAKE3_VERSION_STRING; }
9
+
8
10
  INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8],
9
11
  uint8_t flags) {
10
12
  memcpy(self->cv, key, BLAKE3_KEY_LEN);
@@ -244,7 +246,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
244
246
 
245
247
  // The wide helper function returns (writes out) an array of chaining values
246
248
  // and returns the length of that array. The number of chaining values returned
247
- // is the dyanmically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
249
+ // is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
248
250
  // if the input is shorter than that many chunks. The reason for maintaining a
249
251
  // wide array of chaining values going back up the tree, is to allow the
250
252
  // implementation to hash as many parents in parallel as possible.
@@ -252,7 +254,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
252
254
  // As a special case when the SIMD degree is 1, this function will still return
253
255
  // at least 2 outputs. This guarantees that this function doesn't perform the
254
256
  // root compression. (If it did, it would use the wrong flags, and also we
255
- // wouldn't be able to implement exendable ouput.) Note that this function is
257
+ // wouldn't be able to implement extendable output.) Note that this function is
256
258
  // not used when the whole input is only 1 chunk long; that's a different
257
259
  // codepath.
258
260
  //
@@ -338,12 +340,18 @@ INLINE void compress_subtree_to_parent_node(
338
340
  uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
339
341
  size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
340
342
  chunk_counter, flags, cv_array);
343
+ assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
341
344
 
342
345
  // If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
343
346
  // compress_subtree_wide() returns more than 2 chaining values. Condense
344
347
  // them into 2 by forming parent nodes repeatedly.
345
348
  uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
346
- while (num_cvs > 2) {
349
+ // The second half of this loop condition is always true, and we just
350
+ // asserted it above. But GCC can't tell that it's always true, and if NDEBUG
351
+ // is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
352
+ // warnings here. GCC 8.5 is particularly sensitive, so if you're changing
353
+ // this code, test it against that version.
354
+ while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
347
355
  num_cvs =
348
356
  compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
349
357
  memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
@@ -601,3 +609,8 @@ void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
601
609
  }
602
610
  output_root_bytes(&output, seek, out, out_len);
603
611
  }
612
+
613
+ void blake3_hasher_reset(blake3_hasher *self) {
614
+ chunk_state_reset(&self->chunk, self->key, 0);
615
+ self->cv_stack_len = 0;
616
+ }
@@ -8,12 +8,12 @@
8
8
  extern "C" {
9
9
  #endif
10
10
 
11
+ #define BLAKE3_VERSION_STRING "1.3.3"
11
12
  #define BLAKE3_KEY_LEN 32
12
13
  #define BLAKE3_OUT_LEN 32
13
14
  #define BLAKE3_BLOCK_LEN 64
14
15
  #define BLAKE3_CHUNK_LEN 1024
15
16
  #define BLAKE3_MAX_DEPTH 54
16
- #define BLAKE3_MAX_SIMD_DEGREE 16
17
17
 
18
18
  // This struct is a private implementation detail. It has to be here because
19
19
  // it's part of blake3_hasher below.
@@ -38,11 +38,12 @@ typedef struct {
38
38
  uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
39
39
  } blake3_hasher;
40
40
 
41
+ const char *blake3_version(void);
41
42
  void blake3_hasher_init(blake3_hasher *self);
42
43
  void blake3_hasher_init_keyed(blake3_hasher *self,
43
44
  const uint8_t key[BLAKE3_KEY_LEN]);
44
45
  void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
45
- void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
46
+ void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
46
47
  size_t context_len);
47
48
  void blake3_hasher_update(blake3_hasher *self, const void *input,
48
49
  size_t input_len);
@@ -50,6 +51,7 @@ void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
50
51
  size_t out_len);
51
52
  void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
52
53
  uint8_t *out, size_t out_len);
54
+ void blake3_hasher_reset(blake3_hasher *self);
53
55
 
54
56
  #ifdef __cplusplus
55
57
  }
@@ -208,7 +208,7 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
208
208
  out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
209
209
  out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
210
210
  for (size_t i = 0; i < 8; ++i) {
211
- _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
211
+ _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
212
212
  }
213
213
  transpose_vecs(&out[0]);
214
214
  transpose_vecs(&out[8]);
@@ -219,14 +219,15 @@ INLINE void load_counters(uint64_t counter, bool increment_counter,
219
219
  const __m256i mask = _mm256_set1_epi32(-(int32_t)increment_counter);
220
220
  const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
221
221
  const __m256i add1 = _mm256_and_si256(mask, add0);
222
- __m256i l = _mm256_add_epi32(_mm256_set1_epi32(counter), add1);
222
+ __m256i l = _mm256_add_epi32(_mm256_set1_epi32((int32_t)counter), add1);
223
223
  __m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)),
224
224
  _mm256_xor_si256( l, _mm256_set1_epi32(0x80000000)));
225
- __m256i h = _mm256_sub_epi32(_mm256_set1_epi32(counter >> 32), carry);
225
+ __m256i h = _mm256_sub_epi32(_mm256_set1_epi32((int32_t)(counter >> 32)), carry);
226
226
  *out_lo = l;
227
227
  *out_hi = h;
228
228
  }
229
229
 
230
+ static
230
231
  void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks,
231
232
  const uint32_t key[8], uint64_t counter,
232
233
  bool increment_counter, uint8_t flags,
@@ -468,7 +468,7 @@ INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
468
468
  out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
469
469
  out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
470
470
  for (size_t i = 0; i < 4; ++i) {
471
- _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
471
+ _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
472
472
  }
473
473
  transpose_vecs_128(&out[0]);
474
474
  transpose_vecs_128(&out[4]);
@@ -488,6 +488,7 @@ INLINE void load_counters4(uint64_t counter, bool increment_counter,
488
488
  *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32));
489
489
  }
490
490
 
491
+ static
491
492
  void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks,
492
493
  const uint32_t key[8], uint64_t counter,
493
494
  bool increment_counter, uint8_t flags,
@@ -724,7 +725,7 @@ INLINE void transpose_msg_vecs8(const uint8_t *const *inputs,
724
725
  out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
725
726
  out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
726
727
  for (size_t i = 0; i < 8; ++i) {
727
- _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
728
+ _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
728
729
  }
729
730
  transpose_vecs_256(&out[0]);
730
731
  transpose_vecs_256(&out[8]);
@@ -742,6 +743,7 @@ INLINE void load_counters8(uint64_t counter, bool increment_counter,
742
743
  *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32));
743
744
  }
744
745
 
746
+ static
745
747
  void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks,
746
748
  const uint32_t key[8], uint64_t counter,
747
749
  bool increment_counter, uint8_t flags,
@@ -1037,7 +1039,7 @@ INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
1037
1039
  out[14] = loadu_512(&inputs[14][block_offset]);
1038
1040
  out[15] = loadu_512(&inputs[15][block_offset]);
1039
1041
  for (size_t i = 0; i < 16; ++i) {
1040
- _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
1042
+ _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
1041
1043
  }
1042
1044
  transpose_vecs_512(out);
1043
1045
  }
@@ -1045,15 +1047,29 @@ INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
1045
1047
  INLINE void load_counters16(uint64_t counter, bool increment_counter,
1046
1048
  __m512i *out_lo, __m512i *out_hi) {
1047
1049
  const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter);
1048
- const __m512i add0 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1049
- const __m512i add1 = _mm512_and_si512(mask, add0);
1050
- __m512i l = _mm512_add_epi32(_mm512_set1_epi32(counter), add1);
1051
- __mmask16 carry = _mm512_cmp_epu32_mask(l, add1, _MM_CMPINT_LT);
1052
- __m512i h = _mm512_mask_add_epi32(_mm512_set1_epi32(counter >> 32), carry, _mm512_set1_epi32(counter >> 32), _mm512_set1_epi32(1));
1053
- *out_lo = l;
1054
- *out_hi = h;
1050
+ const __m512i deltas = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1051
+ const __m512i masked_deltas = _mm512_and_si512(deltas, mask);
1052
+ const __m512i low_words = _mm512_add_epi32(
1053
+ _mm512_set1_epi32((int32_t)counter),
1054
+ masked_deltas);
1055
+ // The carry bit is 1 if the high bit of the word was 1 before addition and is
1056
+ // 0 after.
1057
+ // NOTE: It would be a bit more natural to use _mm512_cmp_epu32_mask to
1058
+ // compute the carry bits here, and originally we did, but that intrinsic is
1059
+ // broken under GCC 5.4. See https://github.com/BLAKE3-team/BLAKE3/issues/271.
1060
+ const __m512i carries = _mm512_srli_epi32(
1061
+ _mm512_andnot_si512(
1062
+ low_words, // 0 after (gets inverted by andnot)
1063
+ _mm512_set1_epi32((int32_t)counter)), // and 1 before
1064
+ 31);
1065
+ const __m512i high_words = _mm512_add_epi32(
1066
+ _mm512_set1_epi32((int32_t)(counter >> 32)),
1067
+ carries);
1068
+ *out_lo = low_words;
1069
+ *out_hi = high_words;
1055
1070
  }
1056
1071
 
1072
+ static
1057
1073
  void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
1058
1074
  const uint32_t key[8], uint64_t counter,
1059
1075
  bool increment_counter, uint8_t flags,
@@ -10,12 +10,14 @@
10
10
  #elif defined(__GNUC__)
11
11
  #include <immintrin.h>
12
12
  #else
13
- #error "Unimplemented!"
13
+ #undef IS_X86 /* Unimplemented! */
14
14
  #endif
15
15
  #endif
16
16
 
17
+ #define MAYBE_UNUSED(x) (void)((x))
18
+
17
19
  #if defined(IS_X86)
18
- static uint64_t xgetbv() {
20
+ static uint64_t xgetbv(void) {
19
21
  #if defined(_MSC_VER)
20
22
  return _xgetbv(0);
21
23
  #else
@@ -80,7 +82,7 @@ static /* Allow the variable to be controlled manually for testing */
80
82
  static
81
83
  #endif
82
84
  enum cpu_feature
83
- get_cpu_features() {
85
+ get_cpu_features(void) {
84
86
 
85
87
  if (g_cpu_features != UNDEFINED) {
86
88
  return g_cpu_features;
@@ -137,6 +139,7 @@ void blake3_compress_in_place(uint32_t cv[8],
137
139
  uint8_t flags) {
138
140
  #if defined(IS_X86)
139
141
  const enum cpu_feature features = get_cpu_features();
142
+ MAYBE_UNUSED(features);
140
143
  #if !defined(BLAKE3_NO_AVX512)
141
144
  if (features & AVX512VL) {
142
145
  blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
@@ -165,6 +168,7 @@ void blake3_compress_xof(const uint32_t cv[8],
165
168
  uint8_t out[64]) {
166
169
  #if defined(IS_X86)
167
170
  const enum cpu_feature features = get_cpu_features();
171
+ MAYBE_UNUSED(features);
168
172
  #if !defined(BLAKE3_NO_AVX512)
169
173
  if (features & AVX512VL) {
170
174
  blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
@@ -193,6 +197,7 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
193
197
  uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
194
198
  #if defined(IS_X86)
195
199
  const enum cpu_feature features = get_cpu_features();
200
+ MAYBE_UNUSED(features);
196
201
  #if !defined(BLAKE3_NO_AVX512)
197
202
  if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
198
203
  blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
@@ -227,7 +232,7 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
227
232
  #endif
228
233
  #endif
229
234
 
230
- #if defined(BLAKE3_USE_NEON)
235
+ #if BLAKE3_USE_NEON == 1
231
236
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
232
237
  increment_counter, flags, flags_start, flags_end, out);
233
238
  return;
@@ -242,6 +247,7 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
242
247
  size_t blake3_simd_degree(void) {
243
248
  #if defined(IS_X86)
244
249
  const enum cpu_feature features = get_cpu_features();
250
+ MAYBE_UNUSED(features);
245
251
  #if !defined(BLAKE3_NO_AVX512)
246
252
  if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
247
253
  return 16;
@@ -263,7 +269,7 @@ size_t blake3_simd_degree(void) {
263
269
  }
264
270
  #endif
265
271
  #endif
266
- #if defined(BLAKE3_USE_NEON)
272
+ #if BLAKE3_USE_NEON == 1
267
273
  return 4;
268
274
  #endif
269
275
  return 1;
@@ -38,16 +38,28 @@ enum blake3_flags {
38
38
  #define IS_X86_32
39
39
  #endif
40
40
 
41
+ #if defined(__aarch64__) || defined(_M_ARM64)
42
+ #define IS_AARCH64
43
+ #endif
44
+
41
45
  #if defined(IS_X86)
42
46
  #if defined(_MSC_VER)
43
47
  #include <intrin.h>
44
48
  #endif
45
- #include <immintrin.h>
49
+ #endif
50
+
51
+ #if !defined(BLAKE3_USE_NEON)
52
+ // If BLAKE3_USE_NEON not manually set, autodetect based on AArch64ness
53
+ #if defined(IS_AARCH64)
54
+ #define BLAKE3_USE_NEON 1
55
+ #else
56
+ #define BLAKE3_USE_NEON 0
57
+ #endif
46
58
  #endif
47
59
 
48
60
  #if defined(IS_X86)
49
61
  #define MAX_SIMD_DEGREE 16
50
- #elif defined(BLAKE3_USE_NEON)
62
+ #elif BLAKE3_USE_NEON == 1
51
63
  #define MAX_SIMD_DEGREE 4
52
64
  #else
53
65
  #define MAX_SIMD_DEGREE 1
@@ -83,11 +95,11 @@ static unsigned int highest_one(uint64_t x) {
83
95
  #elif defined(_MSC_VER) && defined(IS_X86_32)
84
96
  if(x >> 32) {
85
97
  unsigned long index;
86
- _BitScanReverse(&index, x >> 32);
98
+ _BitScanReverse(&index, (unsigned long)(x >> 32));
87
99
  return 32 + index;
88
100
  } else {
89
101
  unsigned long index;
90
- _BitScanReverse(&index, x);
102
+ _BitScanReverse(&index, (unsigned long)x);
91
103
  return index;
92
104
  }
93
105
  #else
@@ -257,7 +269,7 @@ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
257
269
  #endif
258
270
  #endif
259
271
 
260
- #if defined(BLAKE3_USE_NEON)
272
+ #if BLAKE3_USE_NEON == 1
261
273
  void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
262
274
  size_t blocks, const uint32_t key[8],
263
275
  uint64_t counter, bool increment_counter,
@@ -2,7 +2,12 @@
2
2
 
3
3
  #include <arm_neon.h>
4
4
 
5
- // TODO: This is probably incorrect for big-endian ARM. How should that work?
5
+ #ifdef __ARM_BIG_ENDIAN
6
+ #error "This implementation only supports little-endian ARM."
7
+ // It might be that all we need for big-endian support here is to get the loads
8
+ // and stores right, but step zero would be finding a way to test it in CI.
9
+ #endif
10
+
6
11
  INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
7
12
  // vld1q_u32 has alignment requirements. Don't use it.
8
13
  uint32x4_t x;
@@ -78,7 +78,7 @@ INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
78
78
  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
79
79
  }
80
80
 
81
- INLINE __m128i blend_epi16(__m128i a, __m128i b, const int imm8) {
81
+ INLINE __m128i blend_epi16(__m128i a, __m128i b, const int16_t imm8) {
82
82
  const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
83
83
  __m128i mask = _mm_set1_epi16(imm8);
84
84
  mask = _mm_and_si128(mask, bits);
@@ -435,7 +435,7 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
435
435
  out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
436
436
  out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
437
437
  for (size_t i = 0; i < 4; ++i) {
438
- _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
438
+ _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
439
439
  }
440
440
  transpose_vecs(&out[0]);
441
441
  transpose_vecs(&out[4]);
@@ -448,14 +448,15 @@ INLINE void load_counters(uint64_t counter, bool increment_counter,
448
448
  const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
449
449
  const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
450
450
  const __m128i add1 = _mm_and_si128(mask, add0);
451
- __m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1);
451
+ __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
452
452
  __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
453
453
  _mm_xor_si128( l, _mm_set1_epi32(0x80000000)));
454
- __m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry);
454
+ __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
455
455
  *out_lo = l;
456
456
  *out_hi = h;
457
457
  }
458
458
 
459
+ static
459
460
  void blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks,
460
461
  const uint32_t key[8], uint64_t counter,
461
462
  bool increment_counter, uint8_t flags,
@@ -1704,7 +1704,7 @@ blake3_hash_many_sse2:
1704
1704
  pshufd xmm15, xmm11, 0x93
1705
1705
  shl rax, 0x20
1706
1706
  or rax, 0x40
1707
- movd xmm3, rax
1707
+ movq xmm3, rax
1708
1708
  movdqa xmmword ptr [rsp+0x20], xmm3
1709
1709
  movaps xmm3, xmmword ptr [rsp]
1710
1710
  movaps xmm11, xmmword ptr [rsp+0x10]
@@ -1917,7 +1917,7 @@ blake3_hash_many_sse2:
1917
1917
  movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1918
1918
  shl rax, 32
1919
1919
  or rax, 64
1920
- movd xmm12, rax
1920
+ movq xmm12, rax
1921
1921
  movdqa xmm3, xmm13
1922
1922
  punpcklqdq xmm3, xmm12
1923
1923
  movups xmm4, xmmword ptr [r8+rdx-0x40]
@@ -1715,7 +1715,7 @@ blake3_hash_many_sse2:
1715
1715
  pshufd xmm15, xmm11, 0x93
1716
1716
  shl rax, 0x20
1717
1717
  or rax, 0x40
1718
- movd xmm3, rax
1718
+ movq xmm3, rax
1719
1719
  movdqa xmmword ptr [rsp+0x20], xmm3
1720
1720
  movaps xmm3, xmmword ptr [rsp]
1721
1721
  movaps xmm11, xmmword ptr [rsp+0x10]
@@ -1928,7 +1928,7 @@ blake3_hash_many_sse2:
1928
1928
  movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1929
1929
  shl rax, 32
1930
1930
  or rax, 64
1931
- movd xmm12, rax
1931
+ movq xmm12, rax
1932
1932
  movdqa xmm3, xmm13
1933
1933
  punpcklqdq xmm3, xmm12
1934
1934
  movups xmm4, xmmword ptr [r8+rdx-0x40]
@@ -2137,10 +2137,10 @@ _blake3_compress_in_place_sse2:
2137
2137
  por xmm9, xmm8
2138
2138
  movdqa xmm8, xmm7
2139
2139
  punpcklqdq xmm8, xmm5
2140
- movdqa xmm10, xmm6
2140
+ movdqa xmm14, xmm6
2141
2141
  pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
2142
- pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2143
- por xmm8, xmm10
2142
+ pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2143
+ por xmm8, xmm14
2144
2144
  pshufd xmm8, xmm8, 0x78
2145
2145
  punpckhdq xmm5, xmm7
2146
2146
  punpckldq xmm6, xmm5
@@ -2268,10 +2268,10 @@ blake3_compress_xof_sse2:
2268
2268
  por xmm9, xmm8
2269
2269
  movdqa xmm8, xmm7
2270
2270
  punpcklqdq xmm8, xmm5
2271
- movdqa xmm10, xmm6
2271
+ movdqa xmm14, xmm6
2272
2272
  pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
2273
- pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2274
- por xmm8, xmm10
2273
+ pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2274
+ por xmm8, xmm14
2275
2275
  pshufd xmm8, xmm8, 0x78
2276
2276
  punpckhdq xmm5, xmm7
2277
2277
  punpckldq xmm6, xmm5
@@ -2054,8 +2054,8 @@ _blake3_compress_in_place_sse2 PROC
2054
2054
  movzx r8d, r8b
2055
2055
  shl rax, 32
2056
2056
  add r8, rax
2057
- movq xmm3, r9
2058
- movq xmm4, r8
2057
+ movd xmm3, r9
2058
+ movd xmm4, r8
2059
2059
  punpcklqdq xmm3, xmm4
2060
2060
  movups xmm4, xmmword ptr [rdx]
2061
2061
  movups xmm5, xmmword ptr [rdx+10H]
@@ -2139,10 +2139,10 @@ _blake3_compress_in_place_sse2 PROC
2139
2139
  por xmm9, xmm8
2140
2140
  movdqa xmm8, xmm7
2141
2141
  punpcklqdq xmm8, xmm5
2142
- movdqa xmm10, xmm6
2142
+ movdqa xmm14, xmm6
2143
2143
  pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
2144
- pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
2145
- por xmm8, xmm10
2144
+ pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK]
2145
+ por xmm8, xmm14
2146
2146
  pshufd xmm8, xmm8, 78H
2147
2147
  punpckhdq xmm5, xmm7
2148
2148
  punpckldq xmm6, xmm5
@@ -2186,8 +2186,8 @@ _blake3_compress_xof_sse2 PROC
2186
2186
  mov r10, qword ptr [rsp+0A8H]
2187
2187
  shl rax, 32
2188
2188
  add r8, rax
2189
- movq xmm3, r9
2190
- movq xmm4, r8
2189
+ movd xmm3, r9
2190
+ movd xmm4, r8
2191
2191
  punpcklqdq xmm3, xmm4
2192
2192
  movups xmm4, xmmword ptr [rdx]
2193
2193
  movups xmm5, xmmword ptr [rdx+10H]
@@ -2271,10 +2271,10 @@ _blake3_compress_xof_sse2 PROC
2271
2271
  por xmm9, xmm8
2272
2272
  movdqa xmm8, xmm7
2273
2273
  punpcklqdq xmm8, xmm5
2274
- movdqa xmm10, xmm6
2274
+ movdqa xmm14, xmm6
2275
2275
  pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
2276
- pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
2277
- por xmm8, xmm10
2276
+ pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK]
2277
+ por xmm8, xmm14
2278
2278
  pshufd xmm8, xmm8, 78H
2279
2279
  punpckhdq xmm5, xmm7
2280
2280
  punpckldq xmm6, xmm5
@@ -429,7 +429,7 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
429
429
  out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
430
430
  out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
431
431
  for (size_t i = 0; i < 4; ++i) {
432
- _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
432
+ _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
433
433
  }
434
434
  transpose_vecs(&out[0]);
435
435
  transpose_vecs(&out[4]);
@@ -442,14 +442,15 @@ INLINE void load_counters(uint64_t counter, bool increment_counter,
442
442
  const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
443
443
  const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
444
444
  const __m128i add1 = _mm_and_si128(mask, add0);
445
- __m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1);
445
+ __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
446
446
  __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
447
447
  _mm_xor_si128( l, _mm_set1_epi32(0x80000000)));
448
- __m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry);
448
+ __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
449
449
  *out_lo = l;
450
450
  *out_hi = h;
451
451
  }
452
452
 
453
+ static
453
454
  void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks,
454
455
  const uint32_t key[8], uint64_t counter,
455
456
  bool increment_counter, uint8_t flags,
@@ -1817,8 +1817,8 @@ _blake3_compress_in_place_sse41 PROC
1817
1817
  movzx r8d, r8b
1818
1818
  shl rax, 32
1819
1819
  add r8, rax
1820
- movq xmm3, r9
1821
- movq xmm4, r8
1820
+ movd xmm3, r9
1821
+ movd xmm4, r8
1822
1822
  punpcklqdq xmm3, xmm4
1823
1823
  movups xmm4, xmmword ptr [rdx]
1824
1824
  movups xmm5, xmmword ptr [rdx+10H]
@@ -1938,8 +1938,8 @@ _blake3_compress_xof_sse41 PROC
1938
1938
  mov r10, qword ptr [rsp+0A8H]
1939
1939
  shl rax, 32
1940
1940
  add r8, rax
1941
- movq xmm3, r9
1942
- movq xmm4, r8
1941
+ movd xmm3, r9
1942
+ movd xmm4, r8
1943
1943
  punpcklqdq xmm3, xmm4
1944
1944
  movups xmm4, xmmword ptr [rdx]
1945
1945
  movups xmm5, xmmword ptr [rdx+10H]
@@ -34,10 +34,12 @@ def check_supported_flags(flags, obj_if_enabled, def_if_disabled)
34
34
  end
35
35
  end
36
36
 
37
- check_supported_flags("-msse2", "blake3_sse2.o", "-DBLAKE3_NO_SSE2")
38
- check_supported_flags("-msse4.1", "blake3_sse41.o", "-DBLAKE3_NO_SSE41")
39
- check_supported_flags("-mavx2", "blake3_avx2.o", "-DBLAKE3_NO_AVX2")
40
- check_supported_flags("-mavx512f -mavx512vl -mavx512bw", "blake3_avx512.o", "-DBLAKE3_NO_AVX512")
37
+ unless RUBY_PLATFORM.include? 'arm64' or RUBY_PLATFORM.include? 'aarch64'
38
+ check_supported_flags("-msse2", "blake3_sse2.o", "-DBLAKE3_NO_SSE2")
39
+ check_supported_flags("-msse4.1", "blake3_sse41.o", "-DBLAKE3_NO_SSE41")
40
+ check_supported_flags("-mavx2", "blake3_avx2.o", "-DBLAKE3_NO_AVX2")
41
+ check_supported_flags("-mavx512f -mavx512vl -mavx512bw", "blake3_avx512.o", "-DBLAKE3_NO_AVX512")
42
+ end
41
43
 
42
44
  if have_header("arm_neon.h")
43
45
  $objs << "blake3_neon.o"
@@ -2,6 +2,6 @@ require 'digest'
2
2
 
3
3
  module Digest
4
4
  class BLAKE3 < Base
5
- VERSION = "0.37.0.1"
5
+ VERSION = "1.3.3.1"
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: digest-blake3
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.37.0.1
4
+ version: 1.3.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Will Bryant
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-10-18 00:00:00.000000000 Z
11
+ date: 2022-12-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -100,7 +100,7 @@ homepage: https://github.com/willbryant/digest-blake3
100
100
  licenses:
101
101
  - MIT
102
102
  metadata: {}
103
- post_install_message:
103
+ post_install_message:
104
104
  rdoc_options: []
105
105
  require_paths:
106
106
  - lib
@@ -116,8 +116,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
116
116
  - !ruby/object:Gem::Version
117
117
  version: '0'
118
118
  requirements: []
119
- rubygems_version: 3.0.3
120
- signing_key:
119
+ rubygems_version: 3.1.6
120
+ signing_key:
121
121
  specification_version: 4
122
122
  summary: BLAKE3 for Ruby
123
123
  test_files: []