uncle_blake3 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.md +27 -0
  3. data/README.md +89 -0
  4. data/ext/Rakefile +55 -0
  5. data/ext/binding/uncle_blake3.c +41 -0
  6. data/ext/blake3/c/Makefile.testing +82 -0
  7. data/ext/blake3/c/README.md +316 -0
  8. data/ext/blake3/c/blake3.c +616 -0
  9. data/ext/blake3/c/blake3.h +60 -0
  10. data/ext/blake3/c/blake3_avx2.c +326 -0
  11. data/ext/blake3/c/blake3_avx2_x86-64_unix.S +1815 -0
  12. data/ext/blake3/c/blake3_avx2_x86-64_windows_gnu.S +1817 -0
  13. data/ext/blake3/c/blake3_avx2_x86-64_windows_msvc.asm +1828 -0
  14. data/ext/blake3/c/blake3_avx512.c +1207 -0
  15. data/ext/blake3/c/blake3_avx512_x86-64_unix.S +2585 -0
  16. data/ext/blake3/c/blake3_avx512_x86-64_windows_gnu.S +2615 -0
  17. data/ext/blake3/c/blake3_avx512_x86-64_windows_msvc.asm +2634 -0
  18. data/ext/blake3/c/blake3_dispatch.c +276 -0
  19. data/ext/blake3/c/blake3_impl.h +282 -0
  20. data/ext/blake3/c/blake3_neon.c +351 -0
  21. data/ext/blake3/c/blake3_portable.c +160 -0
  22. data/ext/blake3/c/blake3_sse2.c +566 -0
  23. data/ext/blake3/c/blake3_sse2_x86-64_unix.S +2291 -0
  24. data/ext/blake3/c/blake3_sse2_x86-64_windows_gnu.S +2332 -0
  25. data/ext/blake3/c/blake3_sse2_x86-64_windows_msvc.asm +2350 -0
  26. data/ext/blake3/c/blake3_sse41.c +560 -0
  27. data/ext/blake3/c/blake3_sse41_x86-64_unix.S +2028 -0
  28. data/ext/blake3/c/blake3_sse41_x86-64_windows_gnu.S +2069 -0
  29. data/ext/blake3/c/blake3_sse41_x86-64_windows_msvc.asm +2089 -0
  30. data/ext/blake3/c/example.c +37 -0
  31. data/ext/blake3/c/main.c +166 -0
  32. data/ext/blake3/c/test.py +97 -0
  33. data/lib/uncle_blake3/binding.rb +20 -0
  34. data/lib/uncle_blake3/build/loader.rb +40 -0
  35. data/lib/uncle_blake3/build/platform.rb +37 -0
  36. data/lib/uncle_blake3/build.rb +4 -0
  37. data/lib/uncle_blake3/digest.rb +119 -0
  38. data/lib/uncle_blake3/version.rb +5 -0
  39. data/lib/uncle_blake3.rb +7 -0
  40. metadata +112 -0
@@ -0,0 +1,351 @@
1
+ #include "blake3_impl.h"
2
+
3
+ #include <arm_neon.h>
4
+
5
+ #ifdef __ARM_BIG_ENDIAN
6
+ #error "This implementation only supports little-endian ARM."
7
+ // It might be that all we need for big-endian support here is to get the loads
8
+ // and stores right, but step zero would be finding a way to test it in CI.
9
+ #endif
10
+
11
+ INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
12
+ // vld1q_u32 has alignment requirements. Don't use it.
13
+ uint32x4_t x;
14
+ memcpy(&x, src, 16);
15
+ return x;
16
+ }
17
+
18
+ INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) {
19
+ // vst1q_u32 has alignment requirements. Don't use it.
20
+ memcpy(dest, &src, 16);
21
+ }
22
+
23
+ INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) {
24
+ return vaddq_u32(a, b);
25
+ }
26
+
27
+ INLINE uint32x4_t xor_128(uint32x4_t a, uint32x4_t b) {
28
+ return veorq_u32(a, b);
29
+ }
30
+
31
+ INLINE uint32x4_t set1_128(uint32_t x) { return vld1q_dup_u32(&x); }
32
+
33
+ INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
34
+ uint32_t array[4] = {a, b, c, d};
35
+ return vld1q_u32(array);
36
+ }
37
+
38
+ INLINE uint32x4_t rot16_128(uint32x4_t x) {
39
+ return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
40
+ }
41
+
42
+ INLINE uint32x4_t rot12_128(uint32x4_t x) {
43
+ return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
44
+ }
45
+
46
+ INLINE uint32x4_t rot8_128(uint32x4_t x) {
47
+ return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
48
+ }
49
+
50
+ INLINE uint32x4_t rot7_128(uint32x4_t x) {
51
+ return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
52
+ }
53
+
54
+ // TODO: compress_neon
55
+
56
+ // TODO: hash2_neon
57
+
58
+ /*
59
+ * ----------------------------------------------------------------------------
60
+ * hash4_neon
61
+ * ----------------------------------------------------------------------------
62
+ */
63
+
64
+ INLINE void round_fn4(uint32x4_t v[16], uint32x4_t m[16], size_t r) {
65
+ v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
66
+ v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
67
+ v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
68
+ v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
69
+ v[0] = add_128(v[0], v[4]);
70
+ v[1] = add_128(v[1], v[5]);
71
+ v[2] = add_128(v[2], v[6]);
72
+ v[3] = add_128(v[3], v[7]);
73
+ v[12] = xor_128(v[12], v[0]);
74
+ v[13] = xor_128(v[13], v[1]);
75
+ v[14] = xor_128(v[14], v[2]);
76
+ v[15] = xor_128(v[15], v[3]);
77
+ v[12] = rot16_128(v[12]);
78
+ v[13] = rot16_128(v[13]);
79
+ v[14] = rot16_128(v[14]);
80
+ v[15] = rot16_128(v[15]);
81
+ v[8] = add_128(v[8], v[12]);
82
+ v[9] = add_128(v[9], v[13]);
83
+ v[10] = add_128(v[10], v[14]);
84
+ v[11] = add_128(v[11], v[15]);
85
+ v[4] = xor_128(v[4], v[8]);
86
+ v[5] = xor_128(v[5], v[9]);
87
+ v[6] = xor_128(v[6], v[10]);
88
+ v[7] = xor_128(v[7], v[11]);
89
+ v[4] = rot12_128(v[4]);
90
+ v[5] = rot12_128(v[5]);
91
+ v[6] = rot12_128(v[6]);
92
+ v[7] = rot12_128(v[7]);
93
+ v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
94
+ v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
95
+ v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
96
+ v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
97
+ v[0] = add_128(v[0], v[4]);
98
+ v[1] = add_128(v[1], v[5]);
99
+ v[2] = add_128(v[2], v[6]);
100
+ v[3] = add_128(v[3], v[7]);
101
+ v[12] = xor_128(v[12], v[0]);
102
+ v[13] = xor_128(v[13], v[1]);
103
+ v[14] = xor_128(v[14], v[2]);
104
+ v[15] = xor_128(v[15], v[3]);
105
+ v[12] = rot8_128(v[12]);
106
+ v[13] = rot8_128(v[13]);
107
+ v[14] = rot8_128(v[14]);
108
+ v[15] = rot8_128(v[15]);
109
+ v[8] = add_128(v[8], v[12]);
110
+ v[9] = add_128(v[9], v[13]);
111
+ v[10] = add_128(v[10], v[14]);
112
+ v[11] = add_128(v[11], v[15]);
113
+ v[4] = xor_128(v[4], v[8]);
114
+ v[5] = xor_128(v[5], v[9]);
115
+ v[6] = xor_128(v[6], v[10]);
116
+ v[7] = xor_128(v[7], v[11]);
117
+ v[4] = rot7_128(v[4]);
118
+ v[5] = rot7_128(v[5]);
119
+ v[6] = rot7_128(v[6]);
120
+ v[7] = rot7_128(v[7]);
121
+
122
+ v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
123
+ v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
124
+ v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
125
+ v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
126
+ v[0] = add_128(v[0], v[5]);
127
+ v[1] = add_128(v[1], v[6]);
128
+ v[2] = add_128(v[2], v[7]);
129
+ v[3] = add_128(v[3], v[4]);
130
+ v[15] = xor_128(v[15], v[0]);
131
+ v[12] = xor_128(v[12], v[1]);
132
+ v[13] = xor_128(v[13], v[2]);
133
+ v[14] = xor_128(v[14], v[3]);
134
+ v[15] = rot16_128(v[15]);
135
+ v[12] = rot16_128(v[12]);
136
+ v[13] = rot16_128(v[13]);
137
+ v[14] = rot16_128(v[14]);
138
+ v[10] = add_128(v[10], v[15]);
139
+ v[11] = add_128(v[11], v[12]);
140
+ v[8] = add_128(v[8], v[13]);
141
+ v[9] = add_128(v[9], v[14]);
142
+ v[5] = xor_128(v[5], v[10]);
143
+ v[6] = xor_128(v[6], v[11]);
144
+ v[7] = xor_128(v[7], v[8]);
145
+ v[4] = xor_128(v[4], v[9]);
146
+ v[5] = rot12_128(v[5]);
147
+ v[6] = rot12_128(v[6]);
148
+ v[7] = rot12_128(v[7]);
149
+ v[4] = rot12_128(v[4]);
150
+ v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
151
+ v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
152
+ v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
153
+ v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
154
+ v[0] = add_128(v[0], v[5]);
155
+ v[1] = add_128(v[1], v[6]);
156
+ v[2] = add_128(v[2], v[7]);
157
+ v[3] = add_128(v[3], v[4]);
158
+ v[15] = xor_128(v[15], v[0]);
159
+ v[12] = xor_128(v[12], v[1]);
160
+ v[13] = xor_128(v[13], v[2]);
161
+ v[14] = xor_128(v[14], v[3]);
162
+ v[15] = rot8_128(v[15]);
163
+ v[12] = rot8_128(v[12]);
164
+ v[13] = rot8_128(v[13]);
165
+ v[14] = rot8_128(v[14]);
166
+ v[10] = add_128(v[10], v[15]);
167
+ v[11] = add_128(v[11], v[12]);
168
+ v[8] = add_128(v[8], v[13]);
169
+ v[9] = add_128(v[9], v[14]);
170
+ v[5] = xor_128(v[5], v[10]);
171
+ v[6] = xor_128(v[6], v[11]);
172
+ v[7] = xor_128(v[7], v[8]);
173
+ v[4] = xor_128(v[4], v[9]);
174
+ v[5] = rot7_128(v[5]);
175
+ v[6] = rot7_128(v[6]);
176
+ v[7] = rot7_128(v[7]);
177
+ v[4] = rot7_128(v[4]);
178
+ }
179
+
180
+ INLINE void transpose_vecs_128(uint32x4_t vecs[4]) {
181
+ // Individually transpose the four 2x2 sub-matrices in each corner.
182
+ uint32x4x2_t rows01 = vtrnq_u32(vecs[0], vecs[1]);
183
+ uint32x4x2_t rows23 = vtrnq_u32(vecs[2], vecs[3]);
184
+
185
+ // Swap the top-right and bottom-left 2x2s (which just got transposed).
186
+ vecs[0] =
187
+ vcombine_u32(vget_low_u32(rows01.val[0]), vget_low_u32(rows23.val[0]));
188
+ vecs[1] =
189
+ vcombine_u32(vget_low_u32(rows01.val[1]), vget_low_u32(rows23.val[1]));
190
+ vecs[2] =
191
+ vcombine_u32(vget_high_u32(rows01.val[0]), vget_high_u32(rows23.val[0]));
192
+ vecs[3] =
193
+ vcombine_u32(vget_high_u32(rows01.val[1]), vget_high_u32(rows23.val[1]));
194
+ }
195
+
196
+ INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
197
+ size_t block_offset, uint32x4_t out[16]) {
198
+ out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(uint32x4_t)]);
199
+ out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(uint32x4_t)]);
200
+ out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(uint32x4_t)]);
201
+ out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(uint32x4_t)]);
202
+ out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(uint32x4_t)]);
203
+ out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(uint32x4_t)]);
204
+ out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(uint32x4_t)]);
205
+ out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(uint32x4_t)]);
206
+ out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(uint32x4_t)]);
207
+ out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(uint32x4_t)]);
208
+ out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(uint32x4_t)]);
209
+ out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(uint32x4_t)]);
210
+ out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(uint32x4_t)]);
211
+ out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(uint32x4_t)]);
212
+ out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(uint32x4_t)]);
213
+ out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(uint32x4_t)]);
214
+ transpose_vecs_128(&out[0]);
215
+ transpose_vecs_128(&out[4]);
216
+ transpose_vecs_128(&out[8]);
217
+ transpose_vecs_128(&out[12]);
218
+ }
219
+
220
+ INLINE void load_counters4(uint64_t counter, bool increment_counter,
221
+ uint32x4_t *out_low, uint32x4_t *out_high) {
222
+ uint64_t mask = (increment_counter ? ~0 : 0);
223
+ *out_low = set4(
224
+ counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)),
225
+ counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)));
226
+ *out_high = set4(
227
+ counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)),
228
+ counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)));
229
+ }
230
+
231
+ void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
232
+ const uint32_t key[8], uint64_t counter,
233
+ bool increment_counter, uint8_t flags,
234
+ uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
235
+ uint32x4_t h_vecs[8] = {
236
+ set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
237
+ set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
238
+ };
239
+ uint32x4_t counter_low_vec, counter_high_vec;
240
+ load_counters4(counter, increment_counter, &counter_low_vec,
241
+ &counter_high_vec);
242
+ uint8_t block_flags = flags | flags_start;
243
+
244
+ for (size_t block = 0; block < blocks; block++) {
245
+ if (block + 1 == blocks) {
246
+ block_flags |= flags_end;
247
+ }
248
+ uint32x4_t block_len_vec = set1_128(BLAKE3_BLOCK_LEN);
249
+ uint32x4_t block_flags_vec = set1_128(block_flags);
250
+ uint32x4_t msg_vecs[16];
251
+ transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
252
+
253
+ uint32x4_t v[16] = {
254
+ h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
255
+ h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
256
+ set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]),
257
+ counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
258
+ };
259
+ round_fn4(v, msg_vecs, 0);
260
+ round_fn4(v, msg_vecs, 1);
261
+ round_fn4(v, msg_vecs, 2);
262
+ round_fn4(v, msg_vecs, 3);
263
+ round_fn4(v, msg_vecs, 4);
264
+ round_fn4(v, msg_vecs, 5);
265
+ round_fn4(v, msg_vecs, 6);
266
+ h_vecs[0] = xor_128(v[0], v[8]);
267
+ h_vecs[1] = xor_128(v[1], v[9]);
268
+ h_vecs[2] = xor_128(v[2], v[10]);
269
+ h_vecs[3] = xor_128(v[3], v[11]);
270
+ h_vecs[4] = xor_128(v[4], v[12]);
271
+ h_vecs[5] = xor_128(v[5], v[13]);
272
+ h_vecs[6] = xor_128(v[6], v[14]);
273
+ h_vecs[7] = xor_128(v[7], v[15]);
274
+
275
+ block_flags = flags;
276
+ }
277
+
278
+ transpose_vecs_128(&h_vecs[0]);
279
+ transpose_vecs_128(&h_vecs[4]);
280
+ // The first four vecs now contain the first half of each output, and the
281
+ // second four vecs contain the second half of each output.
282
+ storeu_128(h_vecs[0], &out[0 * sizeof(uint32x4_t)]);
283
+ storeu_128(h_vecs[4], &out[1 * sizeof(uint32x4_t)]);
284
+ storeu_128(h_vecs[1], &out[2 * sizeof(uint32x4_t)]);
285
+ storeu_128(h_vecs[5], &out[3 * sizeof(uint32x4_t)]);
286
+ storeu_128(h_vecs[2], &out[4 * sizeof(uint32x4_t)]);
287
+ storeu_128(h_vecs[6], &out[5 * sizeof(uint32x4_t)]);
288
+ storeu_128(h_vecs[3], &out[6 * sizeof(uint32x4_t)]);
289
+ storeu_128(h_vecs[7], &out[7 * sizeof(uint32x4_t)]);
290
+ }
291
+
292
+ /*
293
+ * ----------------------------------------------------------------------------
294
+ * hash_many_neon
295
+ * ----------------------------------------------------------------------------
296
+ */
297
+
298
+ void blake3_compress_in_place_portable(uint32_t cv[8],
299
+ const uint8_t block[BLAKE3_BLOCK_LEN],
300
+ uint8_t block_len, uint64_t counter,
301
+ uint8_t flags);
302
+
303
+ INLINE void hash_one_neon(const uint8_t *input, size_t blocks,
304
+ const uint32_t key[8], uint64_t counter,
305
+ uint8_t flags, uint8_t flags_start, uint8_t flags_end,
306
+ uint8_t out[BLAKE3_OUT_LEN]) {
307
+ uint32_t cv[8];
308
+ memcpy(cv, key, BLAKE3_KEY_LEN);
309
+ uint8_t block_flags = flags | flags_start;
310
+ while (blocks > 0) {
311
+ if (blocks == 1) {
312
+ block_flags |= flags_end;
313
+ }
314
+ // TODO: Implement compress_neon. However note that according to
315
+ // https://github.com/BLAKE2/BLAKE2/commit/7965d3e6e1b4193438b8d3a656787587d2579227,
316
+ // compress_neon might not be any faster than compress_portable.
317
+ blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter,
318
+ block_flags);
319
+ input = &input[BLAKE3_BLOCK_LEN];
320
+ blocks -= 1;
321
+ block_flags = flags;
322
+ }
323
+ memcpy(out, cv, BLAKE3_OUT_LEN);
324
+ }
325
+
326
+ void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
327
+ size_t blocks, const uint32_t key[8],
328
+ uint64_t counter, bool increment_counter,
329
+ uint8_t flags, uint8_t flags_start,
330
+ uint8_t flags_end, uint8_t *out) {
331
+ while (num_inputs >= 4) {
332
+ blake3_hash4_neon(inputs, blocks, key, counter, increment_counter, flags,
333
+ flags_start, flags_end, out);
334
+ if (increment_counter) {
335
+ counter += 4;
336
+ }
337
+ inputs += 4;
338
+ num_inputs -= 4;
339
+ out = &out[4 * BLAKE3_OUT_LEN];
340
+ }
341
+ while (num_inputs > 0) {
342
+ hash_one_neon(inputs[0], blocks, key, counter, flags, flags_start,
343
+ flags_end, out);
344
+ if (increment_counter) {
345
+ counter += 1;
346
+ }
347
+ inputs += 1;
348
+ num_inputs -= 1;
349
+ out = &out[BLAKE3_OUT_LEN];
350
+ }
351
+ }
@@ -0,0 +1,160 @@
1
+ #include "blake3_impl.h"
2
+ #include <string.h>
3
+
4
+ INLINE uint32_t rotr32(uint32_t w, uint32_t c) {
5
+ return (w >> c) | (w << (32 - c));
6
+ }
7
+
8
+ INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d,
9
+ uint32_t x, uint32_t y) {
10
+ state[a] = state[a] + state[b] + x;
11
+ state[d] = rotr32(state[d] ^ state[a], 16);
12
+ state[c] = state[c] + state[d];
13
+ state[b] = rotr32(state[b] ^ state[c], 12);
14
+ state[a] = state[a] + state[b] + y;
15
+ state[d] = rotr32(state[d] ^ state[a], 8);
16
+ state[c] = state[c] + state[d];
17
+ state[b] = rotr32(state[b] ^ state[c], 7);
18
+ }
19
+
20
+ INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) {
21
+ // Select the message schedule based on the round.
22
+ const uint8_t *schedule = MSG_SCHEDULE[round];
23
+
24
+ // Mix the columns.
25
+ g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
26
+ g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
27
+ g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
28
+ g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);
29
+
30
+ // Mix the rows.
31
+ g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
32
+ g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
33
+ g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
34
+ g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
35
+ }
36
+
37
+ INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8],
38
+ const uint8_t block[BLAKE3_BLOCK_LEN],
39
+ uint8_t block_len, uint64_t counter, uint8_t flags) {
40
+ uint32_t block_words[16];
41
+ block_words[0] = load32(block + 4 * 0);
42
+ block_words[1] = load32(block + 4 * 1);
43
+ block_words[2] = load32(block + 4 * 2);
44
+ block_words[3] = load32(block + 4 * 3);
45
+ block_words[4] = load32(block + 4 * 4);
46
+ block_words[5] = load32(block + 4 * 5);
47
+ block_words[6] = load32(block + 4 * 6);
48
+ block_words[7] = load32(block + 4 * 7);
49
+ block_words[8] = load32(block + 4 * 8);
50
+ block_words[9] = load32(block + 4 * 9);
51
+ block_words[10] = load32(block + 4 * 10);
52
+ block_words[11] = load32(block + 4 * 11);
53
+ block_words[12] = load32(block + 4 * 12);
54
+ block_words[13] = load32(block + 4 * 13);
55
+ block_words[14] = load32(block + 4 * 14);
56
+ block_words[15] = load32(block + 4 * 15);
57
+
58
+ state[0] = cv[0];
59
+ state[1] = cv[1];
60
+ state[2] = cv[2];
61
+ state[3] = cv[3];
62
+ state[4] = cv[4];
63
+ state[5] = cv[5];
64
+ state[6] = cv[6];
65
+ state[7] = cv[7];
66
+ state[8] = IV[0];
67
+ state[9] = IV[1];
68
+ state[10] = IV[2];
69
+ state[11] = IV[3];
70
+ state[12] = counter_low(counter);
71
+ state[13] = counter_high(counter);
72
+ state[14] = (uint32_t)block_len;
73
+ state[15] = (uint32_t)flags;
74
+
75
+ round_fn(state, &block_words[0], 0);
76
+ round_fn(state, &block_words[0], 1);
77
+ round_fn(state, &block_words[0], 2);
78
+ round_fn(state, &block_words[0], 3);
79
+ round_fn(state, &block_words[0], 4);
80
+ round_fn(state, &block_words[0], 5);
81
+ round_fn(state, &block_words[0], 6);
82
+ }
83
+
84
+ void blake3_compress_in_place_portable(uint32_t cv[8],
85
+ const uint8_t block[BLAKE3_BLOCK_LEN],
86
+ uint8_t block_len, uint64_t counter,
87
+ uint8_t flags) {
88
+ uint32_t state[16];
89
+ compress_pre(state, cv, block, block_len, counter, flags);
90
+ cv[0] = state[0] ^ state[8];
91
+ cv[1] = state[1] ^ state[9];
92
+ cv[2] = state[2] ^ state[10];
93
+ cv[3] = state[3] ^ state[11];
94
+ cv[4] = state[4] ^ state[12];
95
+ cv[5] = state[5] ^ state[13];
96
+ cv[6] = state[6] ^ state[14];
97
+ cv[7] = state[7] ^ state[15];
98
+ }
99
+
100
+ void blake3_compress_xof_portable(const uint32_t cv[8],
101
+ const uint8_t block[BLAKE3_BLOCK_LEN],
102
+ uint8_t block_len, uint64_t counter,
103
+ uint8_t flags, uint8_t out[64]) {
104
+ uint32_t state[16];
105
+ compress_pre(state, cv, block, block_len, counter, flags);
106
+
107
+ store32(&out[0 * 4], state[0] ^ state[8]);
108
+ store32(&out[1 * 4], state[1] ^ state[9]);
109
+ store32(&out[2 * 4], state[2] ^ state[10]);
110
+ store32(&out[3 * 4], state[3] ^ state[11]);
111
+ store32(&out[4 * 4], state[4] ^ state[12]);
112
+ store32(&out[5 * 4], state[5] ^ state[13]);
113
+ store32(&out[6 * 4], state[6] ^ state[14]);
114
+ store32(&out[7 * 4], state[7] ^ state[15]);
115
+ store32(&out[8 * 4], state[8] ^ cv[0]);
116
+ store32(&out[9 * 4], state[9] ^ cv[1]);
117
+ store32(&out[10 * 4], state[10] ^ cv[2]);
118
+ store32(&out[11 * 4], state[11] ^ cv[3]);
119
+ store32(&out[12 * 4], state[12] ^ cv[4]);
120
+ store32(&out[13 * 4], state[13] ^ cv[5]);
121
+ store32(&out[14 * 4], state[14] ^ cv[6]);
122
+ store32(&out[15 * 4], state[15] ^ cv[7]);
123
+ }
124
+
125
+ INLINE void hash_one_portable(const uint8_t *input, size_t blocks,
126
+ const uint32_t key[8], uint64_t counter,
127
+ uint8_t flags, uint8_t flags_start,
128
+ uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
129
+ uint32_t cv[8];
130
+ memcpy(cv, key, BLAKE3_KEY_LEN);
131
+ uint8_t block_flags = flags | flags_start;
132
+ while (blocks > 0) {
133
+ if (blocks == 1) {
134
+ block_flags |= flags_end;
135
+ }
136
+ blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter,
137
+ block_flags);
138
+ input = &input[BLAKE3_BLOCK_LEN];
139
+ blocks -= 1;
140
+ block_flags = flags;
141
+ }
142
+ store_cv_words(out, cv);
143
+ }
144
+
145
+ void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
146
+ size_t blocks, const uint32_t key[8],
147
+ uint64_t counter, bool increment_counter,
148
+ uint8_t flags, uint8_t flags_start,
149
+ uint8_t flags_end, uint8_t *out) {
150
+ while (num_inputs > 0) {
151
+ hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start,
152
+ flags_end, out);
153
+ if (increment_counter) {
154
+ counter += 1;
155
+ }
156
+ inputs += 1;
157
+ num_inputs -= 1;
158
+ out = &out[BLAKE3_OUT_LEN];
159
+ }
160
+ }