digest-blake3 0.0.1

@@ -0,0 +1,346 @@
+#include "blake3_impl.h"
+
+#include <arm_neon.h>
+
+// TODO: This is probably incorrect for big-endian ARM. How should that work?
+INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
+  // vld1q_u32 has alignment requirements. Don't use it.
+  uint32x4_t x;
+  memcpy(&x, src, 16);
+  return x;
+}
+
+INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) {
+  // vst1q_u32 has alignment requirements. Don't use it.
+  memcpy(dest, &src, 16);
+}
+
+INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) {
+  return vaddq_u32(a, b);
+}
+
+INLINE uint32x4_t xor_128(uint32x4_t a, uint32x4_t b) {
+  return veorq_u32(a, b);
+}
+
+INLINE uint32x4_t set1_128(uint32_t x) { return vld1q_dup_u32(&x); }
+
+INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+  uint32_t array[4] = {a, b, c, d};
+  return vld1q_u32(array);
+}
+
+INLINE uint32x4_t rot16_128(uint32x4_t x) {
+  return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
+}
+
+INLINE uint32x4_t rot12_128(uint32x4_t x) {
+  return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
+}
+
+INLINE uint32x4_t rot8_128(uint32x4_t x) {
+  return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
+}
+
+INLINE uint32x4_t rot7_128(uint32x4_t x) {
+  return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
+}
+
+// TODO: compress_neon
+
+// TODO: hash2_neon
+
+/*
+ * ----------------------------------------------------------------------------
+ * hash4_neon
+ * ----------------------------------------------------------------------------
+ */
+
+INLINE void round_fn4(uint32x4_t v[16], uint32x4_t m[16], size_t r) {
+  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
+  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
+  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
+  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
+  v[0] = add_128(v[0], v[4]);
+  v[1] = add_128(v[1], v[5]);
+  v[2] = add_128(v[2], v[6]);
+  v[3] = add_128(v[3], v[7]);
+  v[12] = xor_128(v[12], v[0]);
+  v[13] = xor_128(v[13], v[1]);
+  v[14] = xor_128(v[14], v[2]);
+  v[15] = xor_128(v[15], v[3]);
+  v[12] = rot16_128(v[12]);
+  v[13] = rot16_128(v[13]);
+  v[14] = rot16_128(v[14]);
+  v[15] = rot16_128(v[15]);
+  v[8] = add_128(v[8], v[12]);
+  v[9] = add_128(v[9], v[13]);
+  v[10] = add_128(v[10], v[14]);
+  v[11] = add_128(v[11], v[15]);
+  v[4] = xor_128(v[4], v[8]);
+  v[5] = xor_128(v[5], v[9]);
+  v[6] = xor_128(v[6], v[10]);
+  v[7] = xor_128(v[7], v[11]);
+  v[4] = rot12_128(v[4]);
+  v[5] = rot12_128(v[5]);
+  v[6] = rot12_128(v[6]);
+  v[7] = rot12_128(v[7]);
+  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
+  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
+  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
+  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
+  v[0] = add_128(v[0], v[4]);
+  v[1] = add_128(v[1], v[5]);
+  v[2] = add_128(v[2], v[6]);
+  v[3] = add_128(v[3], v[7]);
+  v[12] = xor_128(v[12], v[0]);
+  v[13] = xor_128(v[13], v[1]);
+  v[14] = xor_128(v[14], v[2]);
+  v[15] = xor_128(v[15], v[3]);
+  v[12] = rot8_128(v[12]);
+  v[13] = rot8_128(v[13]);
+  v[14] = rot8_128(v[14]);
+  v[15] = rot8_128(v[15]);
+  v[8] = add_128(v[8], v[12]);
+  v[9] = add_128(v[9], v[13]);
+  v[10] = add_128(v[10], v[14]);
+  v[11] = add_128(v[11], v[15]);
+  v[4] = xor_128(v[4], v[8]);
+  v[5] = xor_128(v[5], v[9]);
+  v[6] = xor_128(v[6], v[10]);
+  v[7] = xor_128(v[7], v[11]);
+  v[4] = rot7_128(v[4]);
+  v[5] = rot7_128(v[5]);
+  v[6] = rot7_128(v[6]);
+  v[7] = rot7_128(v[7]);
+
+  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
+  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
+  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
+  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
+  v[0] = add_128(v[0], v[5]);
+  v[1] = add_128(v[1], v[6]);
+  v[2] = add_128(v[2], v[7]);
+  v[3] = add_128(v[3], v[4]);
+  v[15] = xor_128(v[15], v[0]);
+  v[12] = xor_128(v[12], v[1]);
+  v[13] = xor_128(v[13], v[2]);
+  v[14] = xor_128(v[14], v[3]);
+  v[15] = rot16_128(v[15]);
+  v[12] = rot16_128(v[12]);
+  v[13] = rot16_128(v[13]);
+  v[14] = rot16_128(v[14]);
+  v[10] = add_128(v[10], v[15]);
+  v[11] = add_128(v[11], v[12]);
+  v[8] = add_128(v[8], v[13]);
+  v[9] = add_128(v[9], v[14]);
+  v[5] = xor_128(v[5], v[10]);
+  v[6] = xor_128(v[6], v[11]);
+  v[7] = xor_128(v[7], v[8]);
+  v[4] = xor_128(v[4], v[9]);
+  v[5] = rot12_128(v[5]);
+  v[6] = rot12_128(v[6]);
+  v[7] = rot12_128(v[7]);
+  v[4] = rot12_128(v[4]);
+  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
+  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
+  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
+  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
+  v[0] = add_128(v[0], v[5]);
+  v[1] = add_128(v[1], v[6]);
+  v[2] = add_128(v[2], v[7]);
+  v[3] = add_128(v[3], v[4]);
+  v[15] = xor_128(v[15], v[0]);
+  v[12] = xor_128(v[12], v[1]);
+  v[13] = xor_128(v[13], v[2]);
+  v[14] = xor_128(v[14], v[3]);
+  v[15] = rot8_128(v[15]);
+  v[12] = rot8_128(v[12]);
+  v[13] = rot8_128(v[13]);
+  v[14] = rot8_128(v[14]);
+  v[10] = add_128(v[10], v[15]);
+  v[11] = add_128(v[11], v[12]);
+  v[8] = add_128(v[8], v[13]);
+  v[9] = add_128(v[9], v[14]);
+  v[5] = xor_128(v[5], v[10]);
+  v[6] = xor_128(v[6], v[11]);
+  v[7] = xor_128(v[7], v[8]);
+  v[4] = xor_128(v[4], v[9]);
+  v[5] = rot7_128(v[5]);
+  v[6] = rot7_128(v[6]);
+  v[7] = rot7_128(v[7]);
+  v[4] = rot7_128(v[4]);
+}
+
+INLINE void transpose_vecs_128(uint32x4_t vecs[4]) {
+  // Individually transpose the four 2x2 sub-matrices in each corner.
+  uint32x4x2_t rows01 = vtrnq_u32(vecs[0], vecs[1]);
+  uint32x4x2_t rows23 = vtrnq_u32(vecs[2], vecs[3]);
+
+  // Swap the top-right and bottom-left 2x2s (which just got transposed).
+  vecs[0] =
+      vcombine_u32(vget_low_u32(rows01.val[0]), vget_low_u32(rows23.val[0]));
+  vecs[1] =
+      vcombine_u32(vget_low_u32(rows01.val[1]), vget_low_u32(rows23.val[1]));
+  vecs[2] =
+      vcombine_u32(vget_high_u32(rows01.val[0]), vget_high_u32(rows23.val[0]));
+  vecs[3] =
+      vcombine_u32(vget_high_u32(rows01.val[1]), vget_high_u32(rows23.val[1]));
+}
+
+INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
+                                size_t block_offset, uint32x4_t out[16]) {
+  out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(uint32x4_t)]);
+  out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(uint32x4_t)]);
+  out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(uint32x4_t)]);
+  out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(uint32x4_t)]);
+  out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(uint32x4_t)]);
+  out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(uint32x4_t)]);
+  out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(uint32x4_t)]);
+  out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(uint32x4_t)]);
+  out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(uint32x4_t)]);
+  out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(uint32x4_t)]);
+  out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(uint32x4_t)]);
+  out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(uint32x4_t)]);
+  out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(uint32x4_t)]);
+  out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(uint32x4_t)]);
+  out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(uint32x4_t)]);
+  out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(uint32x4_t)]);
+  transpose_vecs_128(&out[0]);
+  transpose_vecs_128(&out[4]);
+  transpose_vecs_128(&out[8]);
+  transpose_vecs_128(&out[12]);
+}
+
+INLINE void load_counters4(uint64_t counter, bool increment_counter,
+                           uint32x4_t *out_low, uint32x4_t *out_high) {
+  uint64_t mask = (increment_counter ? ~0 : 0);
+  *out_low = set4(
+      counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)),
+      counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)));
+  *out_high = set4(
+      counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)),
+      counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)));
+}
+
+void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
+                       const uint32_t key[8], uint64_t counter,
+                       bool increment_counter, uint8_t flags,
+                       uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
+  uint32x4_t h_vecs[8] = {
+      set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
+      set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
+  };
+  uint32x4_t counter_low_vec, counter_high_vec;
+  load_counters4(counter, increment_counter, &counter_low_vec,
+                 &counter_high_vec);
+  uint8_t block_flags = flags | flags_start;
+
+  for (size_t block = 0; block < blocks; block++) {
+    if (block + 1 == blocks) {
+      block_flags |= flags_end;
+    }
+    uint32x4_t block_len_vec = set1_128(BLAKE3_BLOCK_LEN);
+    uint32x4_t block_flags_vec = set1_128(block_flags);
+    uint32x4_t msg_vecs[16];
+    transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
+
+    uint32x4_t v[16] = {
+        h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
+        h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
+        set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]),
+        counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
+    };
+    round_fn4(v, msg_vecs, 0);
+    round_fn4(v, msg_vecs, 1);
+    round_fn4(v, msg_vecs, 2);
+    round_fn4(v, msg_vecs, 3);
+    round_fn4(v, msg_vecs, 4);
+    round_fn4(v, msg_vecs, 5);
+    round_fn4(v, msg_vecs, 6);
+    h_vecs[0] = xor_128(v[0], v[8]);
+    h_vecs[1] = xor_128(v[1], v[9]);
+    h_vecs[2] = xor_128(v[2], v[10]);
+    h_vecs[3] = xor_128(v[3], v[11]);
+    h_vecs[4] = xor_128(v[4], v[12]);
+    h_vecs[5] = xor_128(v[5], v[13]);
+    h_vecs[6] = xor_128(v[6], v[14]);
+    h_vecs[7] = xor_128(v[7], v[15]);
+
+    block_flags = flags;
+  }
+
+  transpose_vecs_128(&h_vecs[0]);
+  transpose_vecs_128(&h_vecs[4]);
+  // The first four vecs now contain the first half of each output, and the
+  // second four vecs contain the second half of each output.
+  storeu_128(h_vecs[0], &out[0 * sizeof(uint32x4_t)]);
+  storeu_128(h_vecs[4], &out[1 * sizeof(uint32x4_t)]);
+  storeu_128(h_vecs[1], &out[2 * sizeof(uint32x4_t)]);
+  storeu_128(h_vecs[5], &out[3 * sizeof(uint32x4_t)]);
+  storeu_128(h_vecs[2], &out[4 * sizeof(uint32x4_t)]);
+  storeu_128(h_vecs[6], &out[5 * sizeof(uint32x4_t)]);
+  storeu_128(h_vecs[3], &out[6 * sizeof(uint32x4_t)]);
+  storeu_128(h_vecs[7], &out[7 * sizeof(uint32x4_t)]);
+}
+
+/*
+ * ----------------------------------------------------------------------------
+ * hash_many_neon
+ * ----------------------------------------------------------------------------
+ */
+
+void blake3_compress_in_place_portable(uint32_t cv[8],
+                                       const uint8_t block[BLAKE3_BLOCK_LEN],
+                                       uint8_t block_len, uint64_t counter,
+                                       uint8_t flags);
+
+INLINE void hash_one_neon(const uint8_t *input, size_t blocks,
+                          const uint32_t key[8], uint64_t counter,
+                          uint8_t flags, uint8_t flags_start, uint8_t flags_end,
+                          uint8_t out[BLAKE3_OUT_LEN]) {
+  uint32_t cv[8];
+  memcpy(cv, key, BLAKE3_KEY_LEN);
+  uint8_t block_flags = flags | flags_start;
+  while (blocks > 0) {
+    if (blocks == 1) {
+      block_flags |= flags_end;
+    }
+    // TODO: Implement compress_neon. However note that according to
+    // https://github.com/BLAKE2/BLAKE2/commit/7965d3e6e1b4193438b8d3a656787587d2579227,
+    // compress_neon might not be any faster than compress_portable.
+    blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter,
+                                      block_flags);
+    input = &input[BLAKE3_BLOCK_LEN];
+    blocks -= 1;
+    block_flags = flags;
+  }
+  memcpy(out, cv, BLAKE3_OUT_LEN);
+}
+
+void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
+                           size_t blocks, const uint32_t key[8],
+                           uint64_t counter, bool increment_counter,
+                           uint8_t flags, uint8_t flags_start,
+                           uint8_t flags_end, uint8_t *out) {
+  while (num_inputs >= 4) {
+    blake3_hash4_neon(inputs, blocks, key, counter, increment_counter, flags,
+                      flags_start, flags_end, out);
+    if (increment_counter) {
+      counter += 4;
+    }
+    inputs += 4;
+    num_inputs -= 4;
+    out = &out[4 * BLAKE3_OUT_LEN];
+  }
+  while (num_inputs > 0) {
+    hash_one_neon(inputs[0], blocks, key, counter, flags, flags_start,
+                  flags_end, out);
+    if (increment_counter) {
+      counter += 1;
+    }
+    inputs += 1;
+    num_inputs -= 1;
+    out = &out[BLAKE3_OUT_LEN];
+  }
+}
@@ -0,0 +1,168 @@
+#include "blake3_impl.h"
+#include <string.h>
+
+INLINE void store32(void *dst, uint32_t w) {
+  uint8_t *p = (uint8_t *)dst;
+  p[0] = (uint8_t)(w >> 0);
+  p[1] = (uint8_t)(w >> 8);
+  p[2] = (uint8_t)(w >> 16);
+  p[3] = (uint8_t)(w >> 24);
+}
+
+INLINE uint32_t rotr32(uint32_t w, uint32_t c) {
+  return (w >> c) | (w << (32 - c));
+}
+
+INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d,
+              uint32_t x, uint32_t y) {
+  state[a] = state[a] + state[b] + x;
+  state[d] = rotr32(state[d] ^ state[a], 16);
+  state[c] = state[c] + state[d];
+  state[b] = rotr32(state[b] ^ state[c], 12);
+  state[a] = state[a] + state[b] + y;
+  state[d] = rotr32(state[d] ^ state[a], 8);
+  state[c] = state[c] + state[d];
+  state[b] = rotr32(state[b] ^ state[c], 7);
+}
+
+INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) {
+  // Select the message schedule based on the round.
+  const uint8_t *schedule = MSG_SCHEDULE[round];
+
+  // Mix the columns.
+  g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
+  g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
+  g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
+  g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);
+
+  // Mix the rows.
+  g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
+  g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
+  g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
+  g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
+}
+
+INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8],
+                         const uint8_t block[BLAKE3_BLOCK_LEN],
+                         uint8_t block_len, uint64_t counter, uint8_t flags) {
+  uint32_t block_words[16];
+  block_words[0] = load32(block + 4 * 0);
+  block_words[1] = load32(block + 4 * 1);
+  block_words[2] = load32(block + 4 * 2);
+  block_words[3] = load32(block + 4 * 3);
+  block_words[4] = load32(block + 4 * 4);
+  block_words[5] = load32(block + 4 * 5);
+  block_words[6] = load32(block + 4 * 6);
+  block_words[7] = load32(block + 4 * 7);
+  block_words[8] = load32(block + 4 * 8);
+  block_words[9] = load32(block + 4 * 9);
+  block_words[10] = load32(block + 4 * 10);
+  block_words[11] = load32(block + 4 * 11);
+  block_words[12] = load32(block + 4 * 12);
+  block_words[13] = load32(block + 4 * 13);
+  block_words[14] = load32(block + 4 * 14);
+  block_words[15] = load32(block + 4 * 15);
+
+  state[0] = cv[0];
+  state[1] = cv[1];
+  state[2] = cv[2];
+  state[3] = cv[3];
+  state[4] = cv[4];
+  state[5] = cv[5];
+  state[6] = cv[6];
+  state[7] = cv[7];
+  state[8] = IV[0];
+  state[9] = IV[1];
+  state[10] = IV[2];
+  state[11] = IV[3];
+  state[12] = counter_low(counter);
+  state[13] = counter_high(counter);
+  state[14] = (uint32_t)block_len;
+  state[15] = (uint32_t)flags;
+
+  round_fn(state, &block_words[0], 0);
+  round_fn(state, &block_words[0], 1);
+  round_fn(state, &block_words[0], 2);
+  round_fn(state, &block_words[0], 3);
+  round_fn(state, &block_words[0], 4);
+  round_fn(state, &block_words[0], 5);
+  round_fn(state, &block_words[0], 6);
+}
+
+void blake3_compress_in_place_portable(uint32_t cv[8],
+                                       const uint8_t block[BLAKE3_BLOCK_LEN],
+                                       uint8_t block_len, uint64_t counter,
+                                       uint8_t flags) {
+  uint32_t state[16];
+  compress_pre(state, cv, block, block_len, counter, flags);
+  cv[0] = state[0] ^ state[8];
+  cv[1] = state[1] ^ state[9];
+  cv[2] = state[2] ^ state[10];
+  cv[3] = state[3] ^ state[11];
+  cv[4] = state[4] ^ state[12];
+  cv[5] = state[5] ^ state[13];
+  cv[6] = state[6] ^ state[14];
+  cv[7] = state[7] ^ state[15];
+}
+
+void blake3_compress_xof_portable(const uint32_t cv[8],
+                                  const uint8_t block[BLAKE3_BLOCK_LEN],
+                                  uint8_t block_len, uint64_t counter,
+                                  uint8_t flags, uint8_t out[64]) {
+  uint32_t state[16];
+  compress_pre(state, cv, block, block_len, counter, flags);
+
+  store32(&out[0 * 4], state[0] ^ state[8]);
+  store32(&out[1 * 4], state[1] ^ state[9]);
+  store32(&out[2 * 4], state[2] ^ state[10]);
+  store32(&out[3 * 4], state[3] ^ state[11]);
+  store32(&out[4 * 4], state[4] ^ state[12]);
+  store32(&out[5 * 4], state[5] ^ state[13]);
+  store32(&out[6 * 4], state[6] ^ state[14]);
+  store32(&out[7 * 4], state[7] ^ state[15]);
+  store32(&out[8 * 4], state[8] ^ cv[0]);
+  store32(&out[9 * 4], state[9] ^ cv[1]);
+  store32(&out[10 * 4], state[10] ^ cv[2]);
+  store32(&out[11 * 4], state[11] ^ cv[3]);
+  store32(&out[12 * 4], state[12] ^ cv[4]);
+  store32(&out[13 * 4], state[13] ^ cv[5]);
+  store32(&out[14 * 4], state[14] ^ cv[6]);
+  store32(&out[15 * 4], state[15] ^ cv[7]);
+}
+
+INLINE void hash_one_portable(const uint8_t *input, size_t blocks,
+                              const uint32_t key[8], uint64_t counter,
+                              uint8_t flags, uint8_t flags_start,
+                              uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
+  uint32_t cv[8];
+  memcpy(cv, key, BLAKE3_KEY_LEN);
+  uint8_t block_flags = flags | flags_start;
+  while (blocks > 0) {
+    if (blocks == 1) {
+      block_flags |= flags_end;
+    }
+    blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter,
+                                      block_flags);
+    input = &input[BLAKE3_BLOCK_LEN];
+    blocks -= 1;
+    block_flags = flags;
+  }
+  memcpy(out, cv, 32);
+}
+
+void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
+                               size_t blocks, const uint32_t key[8],
+                               uint64_t counter, bool increment_counter,
+                               uint8_t flags, uint8_t flags_start,
+                               uint8_t flags_end, uint8_t *out) {
+  while (num_inputs > 0) {
+    hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start,
+                      flags_end, out);
+    if (increment_counter) {
+      counter += 1;
+    }
+    inputs += 1;
+    num_inputs -= 1;
+    out = &out[BLAKE3_OUT_LEN];
+  }
+}
@@ -0,0 +1,38 @@
+#include <ruby.h>
+#include <ruby/digest.h>
+#include "blake3.h"
+
+int blake3_init(void *ctx) {
+  blake3_hasher_init((blake3_hasher *)ctx);
+  return 1;
+}
+
+int blake3_finish(void *ctx, unsigned char *out) {
+  blake3_hasher_finalize((blake3_hasher *)ctx, out, BLAKE3_OUT_LEN);
+  return 1;
+}
+
+static const rb_digest_metadata_t blake3 = {
+    RUBY_DIGEST_API_VERSION,
+    BLAKE3_OUT_LEN,
+    BLAKE3_BLOCK_LEN,
+    sizeof(blake3_hasher),
+    (rb_digest_hash_init_func_t)blake3_init,
+    (rb_digest_hash_update_func_t)blake3_hasher_update,
+    (rb_digest_hash_finish_func_t)blake3_finish,
+};
+
+void Init_blake3(void) {
+  VALUE mDigest, cDigest_Base, cDigest_BLAKE3;
+
+  rb_require("digest");
+
+  mDigest = rb_path2class("Digest");
+  cDigest_Base = rb_path2class("Digest::Base");
+  cDigest_BLAKE3 = rb_define_class_under(mDigest, "BLAKE3", cDigest_Base);
+
+#undef RUBY_UNTYPED_DATA_WARNING
+#define RUBY_UNTYPED_DATA_WARNING 0
+  rb_iv_set(cDigest_BLAKE3, "metadata",
+            Data_Wrap_Struct(0, 0, 0, (void *)&blake3));
+}
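
Because the class above is registered through Ruby's Digest framework (an rb_digest_metadata_t wired into a Digest::Base subclass), the standard Digest API applies to Digest::BLAKE3 once the extension is built and loaded. A minimal usage sketch follows; the require path is an assumption about how the gem exposes the compiled extension and may differ from the actual layout.

    require "digest/blake3"  # assumed require path for the compiled extension

    # One-shot hashing via the class-level helpers provided by Digest::Class.
    Digest::BLAKE3.hexdigest("hello world")

    # Incremental hashing via the instance API inherited from Digest::Base.
    hasher = Digest::BLAKE3.new
    hasher.update("hello ")
    hasher.update("world")
    hasher.hexdigest

    # These values come from the metadata struct above (BLAKE3_OUT_LEN and
    # BLAKE3_BLOCK_LEN in blake3.h).
    Digest::BLAKE3.new.digest_length  # => 32
    Digest::BLAKE3.new.block_length   # => 64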