digest-blake3 0.37.0.1 → 1.3.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4dc981436633bde6ba4fb278252d8a4a1ba58d039d0b1c8c794e36c4e47fa4a0
4
- data.tar.gz: ae40be72a0252730792f3e82a00da765546c9606d91405ff69d3bad078ad307f
3
+ metadata.gz: 6e9f7d8e4619bac26fee8eceb10b221935755588c05f333e53275080c61b83a5
4
+ data.tar.gz: fecaddd526e8bf374d675e80d49aefbcbed12dc491a6be50ea9beea335b943fd
5
5
  SHA512:
6
- metadata.gz: c18ca69b1f4b47ac8308ee00cc6db861eb48bd3921a85c291cb09ea595534b1a476988453c1931cb9982e9f96e7d14e4fa4356cbecaa9c584252c9b7ad30ac62
7
- data.tar.gz: b9cf5f04daf5d83a797191caa2f2c30e068ddd6b771d887acb96963b93550171e77ea81128fffc316b49641391a4f245a8484b8b909d53502be3a2fb3170ad76
6
+ metadata.gz: 6ecd8cdc8f5b320f152e4d89c263b68803f63ec609235965eebc11a8093d146bb7223b86302c24c0dc65abb69cbf6040e5635704fb07caeb115c8f76effe38c2
7
+ data.tar.gz: ec642c10fb95e51620ad78fa2915d467dc665b7c1683f8f30c56ab9ab5560a98983e6afdd4fca1c18d2088f0bd0e58248ea30a05230bbed48ff95d74851cd056
data/Gemfile.lock CHANGED
@@ -1,13 +1,13 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- digest-blake3 (0.37.0)
4
+ digest-blake3 (1.3.3.1)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
8
8
  specs:
9
- minitest (5.14.0)
10
- rake (13.0.1)
9
+ minitest (5.16.3)
10
+ rake (13.0.6)
11
11
 
12
12
  PLATFORMS
13
13
  ruby
@@ -19,4 +19,4 @@ DEPENDENCIES
19
19
  rake
20
20
 
21
21
  BUNDLED WITH
22
- 1.17.3
22
+ 2.3.1
@@ -5,6 +5,8 @@
5
5
  #include "blake3.h"
6
6
  #include "blake3_impl.h"
7
7
 
8
+ const char *blake3_version(void) { return BLAKE3_VERSION_STRING; }
9
+
8
10
  INLINE void chunk_state_init(blake3_chunk_state *self, const uint32_t key[8],
9
11
  uint8_t flags) {
10
12
  memcpy(self->cv, key, BLAKE3_KEY_LEN);
@@ -244,7 +246,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
244
246
 
245
247
  // The wide helper function returns (writes out) an array of chaining values
246
248
  // and returns the length of that array. The number of chaining values returned
247
- // is the dyanmically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
249
+ // is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer,
248
250
  // if the input is shorter than that many chunks. The reason for maintaining a
249
251
  // wide array of chaining values going back up the tree, is to allow the
250
252
  // implementation to hash as many parents in parallel as possible.
@@ -252,7 +254,7 @@ INLINE size_t compress_parents_parallel(const uint8_t *child_chaining_values,
252
254
  // As a special case when the SIMD degree is 1, this function will still return
253
255
  // at least 2 outputs. This guarantees that this function doesn't perform the
254
256
  // root compression. (If it did, it would use the wrong flags, and also we
255
- // wouldn't be able to implement exendable ouput.) Note that this function is
257
+ // wouldn't be able to implement extendable output.) Note that this function is
256
258
  // not used when the whole input is only 1 chunk long; that's a different
257
259
  // codepath.
258
260
  //
@@ -338,12 +340,18 @@ INLINE void compress_subtree_to_parent_node(
338
340
  uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN];
339
341
  size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
340
342
  chunk_counter, flags, cv_array);
343
+ assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);
341
344
 
342
345
  // If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
343
346
  // compress_subtree_wide() returns more than 2 chaining values. Condense
344
347
  // them into 2 by forming parent nodes repeatedly.
345
348
  uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
346
- while (num_cvs > 2) {
349
+ // The second half of this loop condition is always true, and we just
350
+ // asserted it above. But GCC can't tell that it's always true, and if NDEBUG
351
+ // is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
352
+ // warnings here. GCC 8.5 is particularly sensitive, so if you're changing
353
+ // this code, test it against that version.
354
+ while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
347
355
  num_cvs =
348
356
  compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
349
357
  memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
@@ -601,3 +609,8 @@ void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
601
609
  }
602
610
  output_root_bytes(&output, seek, out, out_len);
603
611
  }
612
+
613
+ void blake3_hasher_reset(blake3_hasher *self) {
614
+ chunk_state_reset(&self->chunk, self->key, 0);
615
+ self->cv_stack_len = 0;
616
+ }
@@ -8,12 +8,12 @@
8
8
  extern "C" {
9
9
  #endif
10
10
 
11
+ #define BLAKE3_VERSION_STRING "1.3.3"
11
12
  #define BLAKE3_KEY_LEN 32
12
13
  #define BLAKE3_OUT_LEN 32
13
14
  #define BLAKE3_BLOCK_LEN 64
14
15
  #define BLAKE3_CHUNK_LEN 1024
15
16
  #define BLAKE3_MAX_DEPTH 54
16
- #define BLAKE3_MAX_SIMD_DEGREE 16
17
17
 
18
18
  // This struct is a private implementation detail. It has to be here because
19
19
  // it's part of blake3_hasher below.
@@ -38,11 +38,12 @@ typedef struct {
38
38
  uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
39
39
  } blake3_hasher;
40
40
 
41
+ const char *blake3_version(void);
41
42
  void blake3_hasher_init(blake3_hasher *self);
42
43
  void blake3_hasher_init_keyed(blake3_hasher *self,
43
44
  const uint8_t key[BLAKE3_KEY_LEN]);
44
45
  void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
45
- void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
46
+ void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
46
47
  size_t context_len);
47
48
  void blake3_hasher_update(blake3_hasher *self, const void *input,
48
49
  size_t input_len);
@@ -50,6 +51,7 @@ void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
50
51
  size_t out_len);
51
52
  void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
52
53
  uint8_t *out, size_t out_len);
54
+ void blake3_hasher_reset(blake3_hasher *self);
53
55
 
54
56
  #ifdef __cplusplus
55
57
  }
@@ -208,7 +208,7 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
208
208
  out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
209
209
  out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
210
210
  for (size_t i = 0; i < 8; ++i) {
211
- _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
211
+ _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
212
212
  }
213
213
  transpose_vecs(&out[0]);
214
214
  transpose_vecs(&out[8]);
@@ -219,14 +219,15 @@ INLINE void load_counters(uint64_t counter, bool increment_counter,
219
219
  const __m256i mask = _mm256_set1_epi32(-(int32_t)increment_counter);
220
220
  const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
221
221
  const __m256i add1 = _mm256_and_si256(mask, add0);
222
- __m256i l = _mm256_add_epi32(_mm256_set1_epi32(counter), add1);
222
+ __m256i l = _mm256_add_epi32(_mm256_set1_epi32((int32_t)counter), add1);
223
223
  __m256i carry = _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)),
224
224
  _mm256_xor_si256( l, _mm256_set1_epi32(0x80000000)));
225
- __m256i h = _mm256_sub_epi32(_mm256_set1_epi32(counter >> 32), carry);
225
+ __m256i h = _mm256_sub_epi32(_mm256_set1_epi32((int32_t)(counter >> 32)), carry);
226
226
  *out_lo = l;
227
227
  *out_hi = h;
228
228
  }
229
229
 
230
+ static
230
231
  void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks,
231
232
  const uint32_t key[8], uint64_t counter,
232
233
  bool increment_counter, uint8_t flags,
@@ -468,7 +468,7 @@ INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
468
468
  out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
469
469
  out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
470
470
  for (size_t i = 0; i < 4; ++i) {
471
- _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
471
+ _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
472
472
  }
473
473
  transpose_vecs_128(&out[0]);
474
474
  transpose_vecs_128(&out[4]);
@@ -488,6 +488,7 @@ INLINE void load_counters4(uint64_t counter, bool increment_counter,
488
488
  *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32));
489
489
  }
490
490
 
491
+ static
491
492
  void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks,
492
493
  const uint32_t key[8], uint64_t counter,
493
494
  bool increment_counter, uint8_t flags,
@@ -724,7 +725,7 @@ INLINE void transpose_msg_vecs8(const uint8_t *const *inputs,
724
725
  out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
725
726
  out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
726
727
  for (size_t i = 0; i < 8; ++i) {
727
- _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
728
+ _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
728
729
  }
729
730
  transpose_vecs_256(&out[0]);
730
731
  transpose_vecs_256(&out[8]);
@@ -742,6 +743,7 @@ INLINE void load_counters8(uint64_t counter, bool increment_counter,
742
743
  *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32));
743
744
  }
744
745
 
746
+ static
745
747
  void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks,
746
748
  const uint32_t key[8], uint64_t counter,
747
749
  bool increment_counter, uint8_t flags,
@@ -1037,7 +1039,7 @@ INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
1037
1039
  out[14] = loadu_512(&inputs[14][block_offset]);
1038
1040
  out[15] = loadu_512(&inputs[15][block_offset]);
1039
1041
  for (size_t i = 0; i < 16; ++i) {
1040
- _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
1042
+ _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
1041
1043
  }
1042
1044
  transpose_vecs_512(out);
1043
1045
  }
@@ -1045,15 +1047,29 @@ INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
1045
1047
  INLINE void load_counters16(uint64_t counter, bool increment_counter,
1046
1048
  __m512i *out_lo, __m512i *out_hi) {
1047
1049
  const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter);
1048
- const __m512i add0 = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1049
- const __m512i add1 = _mm512_and_si512(mask, add0);
1050
- __m512i l = _mm512_add_epi32(_mm512_set1_epi32(counter), add1);
1051
- __mmask16 carry = _mm512_cmp_epu32_mask(l, add1, _MM_CMPINT_LT);
1052
- __m512i h = _mm512_mask_add_epi32(_mm512_set1_epi32(counter >> 32), carry, _mm512_set1_epi32(counter >> 32), _mm512_set1_epi32(1));
1053
- *out_lo = l;
1054
- *out_hi = h;
1050
+ const __m512i deltas = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1051
+ const __m512i masked_deltas = _mm512_and_si512(deltas, mask);
1052
+ const __m512i low_words = _mm512_add_epi32(
1053
+ _mm512_set1_epi32((int32_t)counter),
1054
+ masked_deltas);
1055
+ // The carry bit is 1 if the high bit of the word was 1 before addition and is
1056
+ // 0 after.
1057
+ // NOTE: It would be a bit more natural to use _mm512_cmp_epu32_mask to
1058
+ // compute the carry bits here, and originally we did, but that intrinsic is
1059
+ // broken under GCC 5.4. See https://github.com/BLAKE3-team/BLAKE3/issues/271.
1060
+ const __m512i carries = _mm512_srli_epi32(
1061
+ _mm512_andnot_si512(
1062
+ low_words, // 0 after (gets inverted by andnot)
1063
+ _mm512_set1_epi32((int32_t)counter)), // and 1 before
1064
+ 31);
1065
+ const __m512i high_words = _mm512_add_epi32(
1066
+ _mm512_set1_epi32((int32_t)(counter >> 32)),
1067
+ carries);
1068
+ *out_lo = low_words;
1069
+ *out_hi = high_words;
1055
1070
  }
1056
1071
 
1072
+ static
1057
1073
  void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
1058
1074
  const uint32_t key[8], uint64_t counter,
1059
1075
  bool increment_counter, uint8_t flags,
@@ -10,12 +10,14 @@
10
10
  #elif defined(__GNUC__)
11
11
  #include <immintrin.h>
12
12
  #else
13
- #error "Unimplemented!"
13
+ #undef IS_X86 /* Unimplemented! */
14
14
  #endif
15
15
  #endif
16
16
 
17
+ #define MAYBE_UNUSED(x) (void)((x))
18
+
17
19
  #if defined(IS_X86)
18
- static uint64_t xgetbv() {
20
+ static uint64_t xgetbv(void) {
19
21
  #if defined(_MSC_VER)
20
22
  return _xgetbv(0);
21
23
  #else
@@ -80,7 +82,7 @@ static /* Allow the variable to be controlled manually for testing */
80
82
  static
81
83
  #endif
82
84
  enum cpu_feature
83
- get_cpu_features() {
85
+ get_cpu_features(void) {
84
86
 
85
87
  if (g_cpu_features != UNDEFINED) {
86
88
  return g_cpu_features;
@@ -137,6 +139,7 @@ void blake3_compress_in_place(uint32_t cv[8],
137
139
  uint8_t flags) {
138
140
  #if defined(IS_X86)
139
141
  const enum cpu_feature features = get_cpu_features();
142
+ MAYBE_UNUSED(features);
140
143
  #if !defined(BLAKE3_NO_AVX512)
141
144
  if (features & AVX512VL) {
142
145
  blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
@@ -165,6 +168,7 @@ void blake3_compress_xof(const uint32_t cv[8],
165
168
  uint8_t out[64]) {
166
169
  #if defined(IS_X86)
167
170
  const enum cpu_feature features = get_cpu_features();
171
+ MAYBE_UNUSED(features);
168
172
  #if !defined(BLAKE3_NO_AVX512)
169
173
  if (features & AVX512VL) {
170
174
  blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
@@ -193,6 +197,7 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
193
197
  uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
194
198
  #if defined(IS_X86)
195
199
  const enum cpu_feature features = get_cpu_features();
200
+ MAYBE_UNUSED(features);
196
201
  #if !defined(BLAKE3_NO_AVX512)
197
202
  if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
198
203
  blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
@@ -227,7 +232,7 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
227
232
  #endif
228
233
  #endif
229
234
 
230
- #if defined(BLAKE3_USE_NEON)
235
+ #if BLAKE3_USE_NEON == 1
231
236
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
232
237
  increment_counter, flags, flags_start, flags_end, out);
233
238
  return;
@@ -242,6 +247,7 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
242
247
  size_t blake3_simd_degree(void) {
243
248
  #if defined(IS_X86)
244
249
  const enum cpu_feature features = get_cpu_features();
250
+ MAYBE_UNUSED(features);
245
251
  #if !defined(BLAKE3_NO_AVX512)
246
252
  if ((features & (AVX512F|AVX512VL)) == (AVX512F|AVX512VL)) {
247
253
  return 16;
@@ -263,7 +269,7 @@ size_t blake3_simd_degree(void) {
263
269
  }
264
270
  #endif
265
271
  #endif
266
- #if defined(BLAKE3_USE_NEON)
272
+ #if BLAKE3_USE_NEON == 1
267
273
  return 4;
268
274
  #endif
269
275
  return 1;
@@ -38,16 +38,28 @@ enum blake3_flags {
38
38
  #define IS_X86_32
39
39
  #endif
40
40
 
41
+ #if defined(__aarch64__) || defined(_M_ARM64)
42
+ #define IS_AARCH64
43
+ #endif
44
+
41
45
  #if defined(IS_X86)
42
46
  #if defined(_MSC_VER)
43
47
  #include <intrin.h>
44
48
  #endif
45
- #include <immintrin.h>
49
+ #endif
50
+
51
+ #if !defined(BLAKE3_USE_NEON)
52
// If BLAKE3_USE_NEON is not manually set, autodetect based on AArch64ness
53
+ #if defined(IS_AARCH64)
54
+ #define BLAKE3_USE_NEON 1
55
+ #else
56
+ #define BLAKE3_USE_NEON 0
57
+ #endif
46
58
  #endif
47
59
 
48
60
  #if defined(IS_X86)
49
61
  #define MAX_SIMD_DEGREE 16
50
- #elif defined(BLAKE3_USE_NEON)
62
+ #elif BLAKE3_USE_NEON == 1
51
63
  #define MAX_SIMD_DEGREE 4
52
64
  #else
53
65
  #define MAX_SIMD_DEGREE 1
@@ -83,11 +95,11 @@ static unsigned int highest_one(uint64_t x) {
83
95
  #elif defined(_MSC_VER) && defined(IS_X86_32)
84
96
  if(x >> 32) {
85
97
  unsigned long index;
86
- _BitScanReverse(&index, x >> 32);
98
+ _BitScanReverse(&index, (unsigned long)(x >> 32));
87
99
  return 32 + index;
88
100
  } else {
89
101
  unsigned long index;
90
- _BitScanReverse(&index, x);
102
+ _BitScanReverse(&index, (unsigned long)x);
91
103
  return index;
92
104
  }
93
105
  #else
@@ -257,7 +269,7 @@ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
257
269
  #endif
258
270
  #endif
259
271
 
260
- #if defined(BLAKE3_USE_NEON)
272
+ #if BLAKE3_USE_NEON == 1
261
273
  void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
262
274
  size_t blocks, const uint32_t key[8],
263
275
  uint64_t counter, bool increment_counter,
@@ -2,7 +2,12 @@
2
2
 
3
3
  #include <arm_neon.h>
4
4
 
5
- // TODO: This is probably incorrect for big-endian ARM. How should that work?
5
+ #ifdef __ARM_BIG_ENDIAN
6
+ #error "This implementation only supports little-endian ARM."
7
+ // It might be that all we need for big-endian support here is to get the loads
8
+ // and stores right, but step zero would be finding a way to test it in CI.
9
+ #endif
10
+
6
11
  INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
7
12
  // vld1q_u32 has alignment requirements. Don't use it.
8
13
  uint32x4_t x;
@@ -78,7 +78,7 @@ INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
78
78
  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
79
79
  }
80
80
 
81
- INLINE __m128i blend_epi16(__m128i a, __m128i b, const int imm8) {
81
+ INLINE __m128i blend_epi16(__m128i a, __m128i b, const int16_t imm8) {
82
82
  const __m128i bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
83
83
  __m128i mask = _mm_set1_epi16(imm8);
84
84
  mask = _mm_and_si128(mask, bits);
@@ -435,7 +435,7 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
435
435
  out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
436
436
  out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
437
437
  for (size_t i = 0; i < 4; ++i) {
438
- _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
438
+ _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
439
439
  }
440
440
  transpose_vecs(&out[0]);
441
441
  transpose_vecs(&out[4]);
@@ -448,14 +448,15 @@ INLINE void load_counters(uint64_t counter, bool increment_counter,
448
448
  const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
449
449
  const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
450
450
  const __m128i add1 = _mm_and_si128(mask, add0);
451
- __m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1);
451
+ __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
452
452
  __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
453
453
  _mm_xor_si128( l, _mm_set1_epi32(0x80000000)));
454
- __m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry);
454
+ __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
455
455
  *out_lo = l;
456
456
  *out_hi = h;
457
457
  }
458
458
 
459
+ static
459
460
  void blake3_hash4_sse2(const uint8_t *const *inputs, size_t blocks,
460
461
  const uint32_t key[8], uint64_t counter,
461
462
  bool increment_counter, uint8_t flags,
@@ -1704,7 +1704,7 @@ blake3_hash_many_sse2:
1704
1704
  pshufd xmm15, xmm11, 0x93
1705
1705
  shl rax, 0x20
1706
1706
  or rax, 0x40
1707
- movd xmm3, rax
1707
+ movq xmm3, rax
1708
1708
  movdqa xmmword ptr [rsp+0x20], xmm3
1709
1709
  movaps xmm3, xmmword ptr [rsp]
1710
1710
  movaps xmm11, xmmword ptr [rsp+0x10]
@@ -1917,7 +1917,7 @@ blake3_hash_many_sse2:
1917
1917
  movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1918
1918
  shl rax, 32
1919
1919
  or rax, 64
1920
- movd xmm12, rax
1920
+ movq xmm12, rax
1921
1921
  movdqa xmm3, xmm13
1922
1922
  punpcklqdq xmm3, xmm12
1923
1923
  movups xmm4, xmmword ptr [r8+rdx-0x40]
@@ -1715,7 +1715,7 @@ blake3_hash_many_sse2:
1715
1715
  pshufd xmm15, xmm11, 0x93
1716
1716
  shl rax, 0x20
1717
1717
  or rax, 0x40
1718
- movd xmm3, rax
1718
+ movq xmm3, rax
1719
1719
  movdqa xmmword ptr [rsp+0x20], xmm3
1720
1720
  movaps xmm3, xmmword ptr [rsp]
1721
1721
  movaps xmm11, xmmword ptr [rsp+0x10]
@@ -1928,7 +1928,7 @@ blake3_hash_many_sse2:
1928
1928
  movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1929
1929
  shl rax, 32
1930
1930
  or rax, 64
1931
- movd xmm12, rax
1931
+ movq xmm12, rax
1932
1932
  movdqa xmm3, xmm13
1933
1933
  punpcklqdq xmm3, xmm12
1934
1934
  movups xmm4, xmmword ptr [r8+rdx-0x40]
@@ -2137,10 +2137,10 @@ _blake3_compress_in_place_sse2:
2137
2137
  por xmm9, xmm8
2138
2138
  movdqa xmm8, xmm7
2139
2139
  punpcklqdq xmm8, xmm5
2140
- movdqa xmm10, xmm6
2140
+ movdqa xmm14, xmm6
2141
2141
  pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
2142
- pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2143
- por xmm8, xmm10
2142
+ pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2143
+ por xmm8, xmm14
2144
2144
  pshufd xmm8, xmm8, 0x78
2145
2145
  punpckhdq xmm5, xmm7
2146
2146
  punpckldq xmm6, xmm5
@@ -2268,10 +2268,10 @@ blake3_compress_xof_sse2:
2268
2268
  por xmm9, xmm8
2269
2269
  movdqa xmm8, xmm7
2270
2270
  punpcklqdq xmm8, xmm5
2271
- movdqa xmm10, xmm6
2271
+ movdqa xmm14, xmm6
2272
2272
  pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
2273
- pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2274
- por xmm8, xmm10
2273
+ pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2274
+ por xmm8, xmm14
2275
2275
  pshufd xmm8, xmm8, 0x78
2276
2276
  punpckhdq xmm5, xmm7
2277
2277
  punpckldq xmm6, xmm5
@@ -2054,8 +2054,8 @@ _blake3_compress_in_place_sse2 PROC
2054
2054
  movzx r8d, r8b
2055
2055
  shl rax, 32
2056
2056
  add r8, rax
2057
- movq xmm3, r9
2058
- movq xmm4, r8
2057
+ movd xmm3, r9
2058
+ movd xmm4, r8
2059
2059
  punpcklqdq xmm3, xmm4
2060
2060
  movups xmm4, xmmword ptr [rdx]
2061
2061
  movups xmm5, xmmword ptr [rdx+10H]
@@ -2139,10 +2139,10 @@ _blake3_compress_in_place_sse2 PROC
2139
2139
  por xmm9, xmm8
2140
2140
  movdqa xmm8, xmm7
2141
2141
  punpcklqdq xmm8, xmm5
2142
- movdqa xmm10, xmm6
2142
+ movdqa xmm14, xmm6
2143
2143
  pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
2144
- pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
2145
- por xmm8, xmm10
2144
+ pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK]
2145
+ por xmm8, xmm14
2146
2146
  pshufd xmm8, xmm8, 78H
2147
2147
  punpckhdq xmm5, xmm7
2148
2148
  punpckldq xmm6, xmm5
@@ -2186,8 +2186,8 @@ _blake3_compress_xof_sse2 PROC
2186
2186
  mov r10, qword ptr [rsp+0A8H]
2187
2187
  shl rax, 32
2188
2188
  add r8, rax
2189
- movq xmm3, r9
2190
- movq xmm4, r8
2189
+ movd xmm3, r9
2190
+ movd xmm4, r8
2191
2191
  punpcklqdq xmm3, xmm4
2192
2192
  movups xmm4, xmmword ptr [rdx]
2193
2193
  movups xmm5, xmmword ptr [rdx+10H]
@@ -2271,10 +2271,10 @@ _blake3_compress_xof_sse2 PROC
2271
2271
  por xmm9, xmm8
2272
2272
  movdqa xmm8, xmm7
2273
2273
  punpcklqdq xmm8, xmm5
2274
- movdqa xmm10, xmm6
2274
+ movdqa xmm14, xmm6
2275
2275
  pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
2276
- pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
2277
- por xmm8, xmm10
2276
+ pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK]
2277
+ por xmm8, xmm14
2278
2278
  pshufd xmm8, xmm8, 78H
2279
2279
  punpckhdq xmm5, xmm7
2280
2280
  punpckldq xmm6, xmm5
@@ -429,7 +429,7 @@ INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
429
429
  out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
430
430
  out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
431
431
  for (size_t i = 0; i < 4; ++i) {
432
- _mm_prefetch(&inputs[i][block_offset + 256], _MM_HINT_T0);
432
+ _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
433
433
  }
434
434
  transpose_vecs(&out[0]);
435
435
  transpose_vecs(&out[4]);
@@ -442,14 +442,15 @@ INLINE void load_counters(uint64_t counter, bool increment_counter,
442
442
  const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
443
443
  const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
444
444
  const __m128i add1 = _mm_and_si128(mask, add0);
445
- __m128i l = _mm_add_epi32(_mm_set1_epi32(counter), add1);
445
+ __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
446
446
  __m128i carry = _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
447
447
  _mm_xor_si128( l, _mm_set1_epi32(0x80000000)));
448
- __m128i h = _mm_sub_epi32(_mm_set1_epi32(counter >> 32), carry);
448
+ __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
449
449
  *out_lo = l;
450
450
  *out_hi = h;
451
451
  }
452
452
 
453
+ static
453
454
  void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks,
454
455
  const uint32_t key[8], uint64_t counter,
455
456
  bool increment_counter, uint8_t flags,
@@ -1817,8 +1817,8 @@ _blake3_compress_in_place_sse41 PROC
1817
1817
  movzx r8d, r8b
1818
1818
  shl rax, 32
1819
1819
  add r8, rax
1820
- movq xmm3, r9
1821
- movq xmm4, r8
1820
+ movd xmm3, r9
1821
+ movd xmm4, r8
1822
1822
  punpcklqdq xmm3, xmm4
1823
1823
  movups xmm4, xmmword ptr [rdx]
1824
1824
  movups xmm5, xmmword ptr [rdx+10H]
@@ -1938,8 +1938,8 @@ _blake3_compress_xof_sse41 PROC
1938
1938
  mov r10, qword ptr [rsp+0A8H]
1939
1939
  shl rax, 32
1940
1940
  add r8, rax
1941
- movq xmm3, r9
1942
- movq xmm4, r8
1941
+ movd xmm3, r9
1942
+ movd xmm4, r8
1943
1943
  punpcklqdq xmm3, xmm4
1944
1944
  movups xmm4, xmmword ptr [rdx]
1945
1945
  movups xmm5, xmmword ptr [rdx+10H]
@@ -34,10 +34,12 @@ def check_supported_flags(flags, obj_if_enabled, def_if_disabled)
34
34
  end
35
35
  end
36
36
 
37
- check_supported_flags("-msse2", "blake3_sse2.o", "-DBLAKE3_NO_SSE2")
38
- check_supported_flags("-msse4.1", "blake3_sse41.o", "-DBLAKE3_NO_SSE41")
39
- check_supported_flags("-mavx2", "blake3_avx2.o", "-DBLAKE3_NO_AVX2")
40
- check_supported_flags("-mavx512f -mavx512vl -mavx512bw", "blake3_avx512.o", "-DBLAKE3_NO_AVX512")
37
+ unless RUBY_PLATFORM.include? 'arm64' or RUBY_PLATFORM.include? 'aarch64'
38
+ check_supported_flags("-msse2", "blake3_sse2.o", "-DBLAKE3_NO_SSE2")
39
+ check_supported_flags("-msse4.1", "blake3_sse41.o", "-DBLAKE3_NO_SSE41")
40
+ check_supported_flags("-mavx2", "blake3_avx2.o", "-DBLAKE3_NO_AVX2")
41
+ check_supported_flags("-mavx512f -mavx512vl -mavx512bw", "blake3_avx512.o", "-DBLAKE3_NO_AVX512")
42
+ end
41
43
 
42
44
  if have_header("arm_neon.h")
43
45
  $objs << "blake3_neon.o"
@@ -2,6 +2,6 @@ require 'digest'
2
2
 
3
3
  module Digest
4
4
  class BLAKE3 < Base
5
- VERSION = "0.37.0.1"
5
+ VERSION = "1.3.3.1"
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: digest-blake3
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.37.0.1
4
+ version: 1.3.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Will Bryant
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-10-18 00:00:00.000000000 Z
11
+ date: 2022-12-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -100,7 +100,7 @@ homepage: https://github.com/willbryant/digest-blake3
100
100
  licenses:
101
101
  - MIT
102
102
  metadata: {}
103
- post_install_message:
103
+ post_install_message:
104
104
  rdoc_options: []
105
105
  require_paths:
106
106
  - lib
@@ -116,8 +116,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
116
116
  - !ruby/object:Gem::Version
117
117
  version: '0'
118
118
  requirements: []
119
- rubygems_version: 3.0.3
120
- signing_key:
119
+ rubygems_version: 3.1.6
120
+ signing_key:
121
121
  specification_version: 4
122
122
  summary: BLAKE3 for Ruby
123
123
  test_files: []