uncle_blake3 0.0.1
- checksums.yaml +7 -0
- data/LICENSE.md +27 -0
- data/README.md +89 -0
- data/ext/Rakefile +55 -0
- data/ext/binding/uncle_blake3.c +41 -0
- data/ext/blake3/c/Makefile.testing +82 -0
- data/ext/blake3/c/README.md +316 -0
- data/ext/blake3/c/blake3.c +616 -0
- data/ext/blake3/c/blake3.h +60 -0
- data/ext/blake3/c/blake3_avx2.c +326 -0
- data/ext/blake3/c/blake3_avx2_x86-64_unix.S +1815 -0
- data/ext/blake3/c/blake3_avx2_x86-64_windows_gnu.S +1817 -0
- data/ext/blake3/c/blake3_avx2_x86-64_windows_msvc.asm +1828 -0
- data/ext/blake3/c/blake3_avx512.c +1207 -0
- data/ext/blake3/c/blake3_avx512_x86-64_unix.S +2585 -0
- data/ext/blake3/c/blake3_avx512_x86-64_windows_gnu.S +2615 -0
- data/ext/blake3/c/blake3_avx512_x86-64_windows_msvc.asm +2634 -0
- data/ext/blake3/c/blake3_dispatch.c +276 -0
- data/ext/blake3/c/blake3_impl.h +282 -0
- data/ext/blake3/c/blake3_neon.c +351 -0
- data/ext/blake3/c/blake3_portable.c +160 -0
- data/ext/blake3/c/blake3_sse2.c +566 -0
- data/ext/blake3/c/blake3_sse2_x86-64_unix.S +2291 -0
- data/ext/blake3/c/blake3_sse2_x86-64_windows_gnu.S +2332 -0
- data/ext/blake3/c/blake3_sse2_x86-64_windows_msvc.asm +2350 -0
- data/ext/blake3/c/blake3_sse41.c +560 -0
- data/ext/blake3/c/blake3_sse41_x86-64_unix.S +2028 -0
- data/ext/blake3/c/blake3_sse41_x86-64_windows_gnu.S +2069 -0
- data/ext/blake3/c/blake3_sse41_x86-64_windows_msvc.asm +2089 -0
- data/ext/blake3/c/example.c +37 -0
- data/ext/blake3/c/main.c +166 -0
- data/ext/blake3/c/test.py +97 -0
- data/lib/uncle_blake3/binding.rb +20 -0
- data/lib/uncle_blake3/build/loader.rb +40 -0
- data/lib/uncle_blake3/build/platform.rb +37 -0
- data/lib/uncle_blake3/build.rb +4 -0
- data/lib/uncle_blake3/digest.rb +119 -0
- data/lib/uncle_blake3/version.rb +5 -0
- data/lib/uncle_blake3.rb +7 -0
- metadata +112 -0
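The diff bodies below only reproduce the vendored C sources; the Ruby-facing glue in data/ext/binding/uncle_blake3.c is listed above but not shown on this page. For orientation, here is a minimal sketch of what a Ruby C extension wrapping the one-shot BLAKE3 API typically looks like. The blake3_hasher_* calls and BLAKE3_OUT_LEN are the upstream BLAKE3 C API; the module name, method name, and C function names are illustrative assumptions, not the gem's actual binding.

```c
/* Hypothetical sketch only -- the gem's real binding lives in
 * data/ext/binding/uncle_blake3.c and is not reproduced here. */
#include <ruby.h>
#include "blake3.h"

/* One-shot hash: takes a Ruby String, returns a 32-byte binary String. */
static VALUE rb_blake3_digest(VALUE self, VALUE input) {
  uint8_t output[BLAKE3_OUT_LEN];
  blake3_hasher hasher;
  blake3_hasher_init(&hasher);
  blake3_hasher_update(&hasher, StringValuePtr(input), RSTRING_LEN(input));
  blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN);
  return rb_str_new((const char *)output, BLAKE3_OUT_LEN);
}

/* Ruby calls Init_<basename> when the extension is required. */
void Init_uncle_blake3(void) {
  VALUE mUncleBlake3 = rb_define_module("UncleBlake3");
  rb_define_singleton_method(mUncleBlake3, "digest", rb_blake3_digest, 1);
}
```

Under those assumptions it would be called from Ruby as `UncleBlake3.digest('some input')`; the gem's own data/lib/uncle_blake3/digest.rb presumably layers its real API on top of a binding of this general shape.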
@@ -0,0 +1,351 @@ (data/ext/blake3/c/blake3_neon.c, per the manifest above)

#include "blake3_impl.h"

#include <arm_neon.h>

#ifdef __ARM_BIG_ENDIAN
#error "This implementation only supports little-endian ARM."
// It might be that all we need for big-endian support here is to get the loads
// and stores right, but step zero would be finding a way to test it in CI.
#endif

INLINE uint32x4_t loadu_128(const uint8_t src[16]) {
  // vld1q_u32 has alignment requirements. Don't use it.
  uint32x4_t x;
  memcpy(&x, src, 16);
  return x;
}

INLINE void storeu_128(uint32x4_t src, uint8_t dest[16]) {
  // vst1q_u32 has alignment requirements. Don't use it.
  memcpy(dest, &src, 16);
}

INLINE uint32x4_t add_128(uint32x4_t a, uint32x4_t b) {
  return vaddq_u32(a, b);
}

INLINE uint32x4_t xor_128(uint32x4_t a, uint32x4_t b) {
  return veorq_u32(a, b);
}

INLINE uint32x4_t set1_128(uint32_t x) { return vld1q_dup_u32(&x); }

INLINE uint32x4_t set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  uint32_t array[4] = {a, b, c, d};
  return vld1q_u32(array);
}

INLINE uint32x4_t rot16_128(uint32x4_t x) {
  return vorrq_u32(vshrq_n_u32(x, 16), vshlq_n_u32(x, 32 - 16));
}

INLINE uint32x4_t rot12_128(uint32x4_t x) {
  return vorrq_u32(vshrq_n_u32(x, 12), vshlq_n_u32(x, 32 - 12));
}

INLINE uint32x4_t rot8_128(uint32x4_t x) {
  return vorrq_u32(vshrq_n_u32(x, 8), vshlq_n_u32(x, 32 - 8));
}

INLINE uint32x4_t rot7_128(uint32x4_t x) {
  return vorrq_u32(vshrq_n_u32(x, 7), vshlq_n_u32(x, 32 - 7));
}

// TODO: compress_neon

// TODO: hash2_neon

/*
 * ----------------------------------------------------------------------------
 * hash4_neon
 * ----------------------------------------------------------------------------
 */

INLINE void round_fn4(uint32x4_t v[16], uint32x4_t m[16], size_t r) {
  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
  v[0] = add_128(v[0], v[4]);
  v[1] = add_128(v[1], v[5]);
  v[2] = add_128(v[2], v[6]);
  v[3] = add_128(v[3], v[7]);
  v[12] = xor_128(v[12], v[0]);
  v[13] = xor_128(v[13], v[1]);
  v[14] = xor_128(v[14], v[2]);
  v[15] = xor_128(v[15], v[3]);
  v[12] = rot16_128(v[12]);
  v[13] = rot16_128(v[13]);
  v[14] = rot16_128(v[14]);
  v[15] = rot16_128(v[15]);
  v[8] = add_128(v[8], v[12]);
  v[9] = add_128(v[9], v[13]);
  v[10] = add_128(v[10], v[14]);
  v[11] = add_128(v[11], v[15]);
  v[4] = xor_128(v[4], v[8]);
  v[5] = xor_128(v[5], v[9]);
  v[6] = xor_128(v[6], v[10]);
  v[7] = xor_128(v[7], v[11]);
  v[4] = rot12_128(v[4]);
  v[5] = rot12_128(v[5]);
  v[6] = rot12_128(v[6]);
  v[7] = rot12_128(v[7]);
  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
  v[0] = add_128(v[0], v[4]);
  v[1] = add_128(v[1], v[5]);
  v[2] = add_128(v[2], v[6]);
  v[3] = add_128(v[3], v[7]);
  v[12] = xor_128(v[12], v[0]);
  v[13] = xor_128(v[13], v[1]);
  v[14] = xor_128(v[14], v[2]);
  v[15] = xor_128(v[15], v[3]);
  v[12] = rot8_128(v[12]);
  v[13] = rot8_128(v[13]);
  v[14] = rot8_128(v[14]);
  v[15] = rot8_128(v[15]);
  v[8] = add_128(v[8], v[12]);
  v[9] = add_128(v[9], v[13]);
  v[10] = add_128(v[10], v[14]);
  v[11] = add_128(v[11], v[15]);
  v[4] = xor_128(v[4], v[8]);
  v[5] = xor_128(v[5], v[9]);
  v[6] = xor_128(v[6], v[10]);
  v[7] = xor_128(v[7], v[11]);
  v[4] = rot7_128(v[4]);
  v[5] = rot7_128(v[5]);
  v[6] = rot7_128(v[6]);
  v[7] = rot7_128(v[7]);

  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
  v[0] = add_128(v[0], v[5]);
  v[1] = add_128(v[1], v[6]);
  v[2] = add_128(v[2], v[7]);
  v[3] = add_128(v[3], v[4]);
  v[15] = xor_128(v[15], v[0]);
  v[12] = xor_128(v[12], v[1]);
  v[13] = xor_128(v[13], v[2]);
  v[14] = xor_128(v[14], v[3]);
  v[15] = rot16_128(v[15]);
  v[12] = rot16_128(v[12]);
  v[13] = rot16_128(v[13]);
  v[14] = rot16_128(v[14]);
  v[10] = add_128(v[10], v[15]);
  v[11] = add_128(v[11], v[12]);
  v[8] = add_128(v[8], v[13]);
  v[9] = add_128(v[9], v[14]);
  v[5] = xor_128(v[5], v[10]);
  v[6] = xor_128(v[6], v[11]);
  v[7] = xor_128(v[7], v[8]);
  v[4] = xor_128(v[4], v[9]);
  v[5] = rot12_128(v[5]);
  v[6] = rot12_128(v[6]);
  v[7] = rot12_128(v[7]);
  v[4] = rot12_128(v[4]);
  v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
  v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
  v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
  v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
  v[0] = add_128(v[0], v[5]);
  v[1] = add_128(v[1], v[6]);
  v[2] = add_128(v[2], v[7]);
  v[3] = add_128(v[3], v[4]);
  v[15] = xor_128(v[15], v[0]);
  v[12] = xor_128(v[12], v[1]);
  v[13] = xor_128(v[13], v[2]);
  v[14] = xor_128(v[14], v[3]);
  v[15] = rot8_128(v[15]);
  v[12] = rot8_128(v[12]);
  v[13] = rot8_128(v[13]);
  v[14] = rot8_128(v[14]);
  v[10] = add_128(v[10], v[15]);
  v[11] = add_128(v[11], v[12]);
  v[8] = add_128(v[8], v[13]);
  v[9] = add_128(v[9], v[14]);
  v[5] = xor_128(v[5], v[10]);
  v[6] = xor_128(v[6], v[11]);
  v[7] = xor_128(v[7], v[8]);
  v[4] = xor_128(v[4], v[9]);
  v[5] = rot7_128(v[5]);
  v[6] = rot7_128(v[6]);
  v[7] = rot7_128(v[7]);
  v[4] = rot7_128(v[4]);
}

INLINE void transpose_vecs_128(uint32x4_t vecs[4]) {
  // Individually transpose the four 2x2 sub-matrices in each corner.
  uint32x4x2_t rows01 = vtrnq_u32(vecs[0], vecs[1]);
  uint32x4x2_t rows23 = vtrnq_u32(vecs[2], vecs[3]);

  // Swap the top-right and bottom-left 2x2s (which just got transposed).
  vecs[0] =
      vcombine_u32(vget_low_u32(rows01.val[0]), vget_low_u32(rows23.val[0]));
  vecs[1] =
      vcombine_u32(vget_low_u32(rows01.val[1]), vget_low_u32(rows23.val[1]));
  vecs[2] =
      vcombine_u32(vget_high_u32(rows01.val[0]), vget_high_u32(rows23.val[0]));
  vecs[3] =
      vcombine_u32(vget_high_u32(rows01.val[1]), vget_high_u32(rows23.val[1]));
}

INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
                                size_t block_offset, uint32x4_t out[16]) {
  out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(uint32x4_t)]);
  out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(uint32x4_t)]);
  out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(uint32x4_t)]);
  out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(uint32x4_t)]);
  out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(uint32x4_t)]);
  out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(uint32x4_t)]);
  out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(uint32x4_t)]);
  out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(uint32x4_t)]);
  out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(uint32x4_t)]);
  out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(uint32x4_t)]);
  out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(uint32x4_t)]);
  out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(uint32x4_t)]);
  out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(uint32x4_t)]);
  out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(uint32x4_t)]);
  out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(uint32x4_t)]);
  out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(uint32x4_t)]);
  transpose_vecs_128(&out[0]);
  transpose_vecs_128(&out[4]);
  transpose_vecs_128(&out[8]);
  transpose_vecs_128(&out[12]);
}

INLINE void load_counters4(uint64_t counter, bool increment_counter,
                           uint32x4_t *out_low, uint32x4_t *out_high) {
  uint64_t mask = (increment_counter ? ~0 : 0);
  *out_low = set4(
      counter_low(counter + (mask & 0)), counter_low(counter + (mask & 1)),
      counter_low(counter + (mask & 2)), counter_low(counter + (mask & 3)));
  *out_high = set4(
      counter_high(counter + (mask & 0)), counter_high(counter + (mask & 1)),
      counter_high(counter + (mask & 2)), counter_high(counter + (mask & 3)));
}

void blake3_hash4_neon(const uint8_t *const *inputs, size_t blocks,
                       const uint32_t key[8], uint64_t counter,
                       bool increment_counter, uint8_t flags,
                       uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
  uint32x4_t h_vecs[8] = {
      set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
      set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
  };
  uint32x4_t counter_low_vec, counter_high_vec;
  load_counters4(counter, increment_counter, &counter_low_vec,
                 &counter_high_vec);
  uint8_t block_flags = flags | flags_start;

  for (size_t block = 0; block < blocks; block++) {
    if (block + 1 == blocks) {
      block_flags |= flags_end;
    }
    uint32x4_t block_len_vec = set1_128(BLAKE3_BLOCK_LEN);
    uint32x4_t block_flags_vec = set1_128(block_flags);
    uint32x4_t msg_vecs[16];
    transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);

    uint32x4_t v[16] = {
        h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
        h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
        set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]),
        counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
    };
    round_fn4(v, msg_vecs, 0);
    round_fn4(v, msg_vecs, 1);
    round_fn4(v, msg_vecs, 2);
    round_fn4(v, msg_vecs, 3);
    round_fn4(v, msg_vecs, 4);
    round_fn4(v, msg_vecs, 5);
    round_fn4(v, msg_vecs, 6);
    h_vecs[0] = xor_128(v[0], v[8]);
    h_vecs[1] = xor_128(v[1], v[9]);
    h_vecs[2] = xor_128(v[2], v[10]);
    h_vecs[3] = xor_128(v[3], v[11]);
    h_vecs[4] = xor_128(v[4], v[12]);
    h_vecs[5] = xor_128(v[5], v[13]);
    h_vecs[6] = xor_128(v[6], v[14]);
    h_vecs[7] = xor_128(v[7], v[15]);

    block_flags = flags;
  }

  transpose_vecs_128(&h_vecs[0]);
  transpose_vecs_128(&h_vecs[4]);
  // The first four vecs now contain the first half of each output, and the
  // second four vecs contain the second half of each output.
  storeu_128(h_vecs[0], &out[0 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[4], &out[1 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[1], &out[2 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[5], &out[3 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[2], &out[4 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[6], &out[5 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[3], &out[6 * sizeof(uint32x4_t)]);
  storeu_128(h_vecs[7], &out[7 * sizeof(uint32x4_t)]);
}

/*
 * ----------------------------------------------------------------------------
 * hash_many_neon
 * ----------------------------------------------------------------------------
 */

void blake3_compress_in_place_portable(uint32_t cv[8],
                                       const uint8_t block[BLAKE3_BLOCK_LEN],
                                       uint8_t block_len, uint64_t counter,
                                       uint8_t flags);

INLINE void hash_one_neon(const uint8_t *input, size_t blocks,
                          const uint32_t key[8], uint64_t counter,
                          uint8_t flags, uint8_t flags_start, uint8_t flags_end,
                          uint8_t out[BLAKE3_OUT_LEN]) {
  uint32_t cv[8];
  memcpy(cv, key, BLAKE3_KEY_LEN);
  uint8_t block_flags = flags | flags_start;
  while (blocks > 0) {
    if (blocks == 1) {
      block_flags |= flags_end;
    }
    // TODO: Implement compress_neon. However note that according to
    // https://github.com/BLAKE2/BLAKE2/commit/7965d3e6e1b4193438b8d3a656787587d2579227,
    // compress_neon might not be any faster than compress_portable.
    blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter,
                                      block_flags);
    input = &input[BLAKE3_BLOCK_LEN];
    blocks -= 1;
    block_flags = flags;
  }
  memcpy(out, cv, BLAKE3_OUT_LEN);
}

void blake3_hash_many_neon(const uint8_t *const *inputs, size_t num_inputs,
                           size_t blocks, const uint32_t key[8],
                           uint64_t counter, bool increment_counter,
                           uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t *out) {
  while (num_inputs >= 4) {
    blake3_hash4_neon(inputs, blocks, key, counter, increment_counter, flags,
                      flags_start, flags_end, out);
    if (increment_counter) {
      counter += 4;
    }
    inputs += 4;
    num_inputs -= 4;
    out = &out[4 * BLAKE3_OUT_LEN];
  }
  while (num_inputs > 0) {
    hash_one_neon(inputs[0], blocks, key, counter, flags, flags_start,
                  flags_end, out);
    if (increment_counter) {
      counter += 1;
    }
    inputs += 1;
    num_inputs -= 1;
    out = &out[BLAKE3_OUT_LEN];
  }
}
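The vtrnq_u32/vcombine_u32 dance in transpose_vecs_128 above is easier to follow as a plain 4x4 matrix transpose over 32-bit words: after it runs, lane i of every vector holds words that all came from input vector i, which is what lets hash4_neon process one block from each of four inputs per SIMD lane. A scalar model of the same operation (illustrative only, not part of the gem):

```c
#include <stdint.h>
#include <stdio.h>

/* Scalar model of transpose_vecs_128: treat vecs[0..3] as the rows of a
 * 4x4 matrix of 32-bit words and transpose it in place. */
static void transpose_4x4(uint32_t vecs[4][4]) {
  for (int row = 0; row < 4; row++) {
    for (int col = row + 1; col < 4; col++) {
      uint32_t tmp = vecs[row][col];
      vecs[row][col] = vecs[col][row];
      vecs[col][row] = tmp;
    }
  }
}

int main(void) {
  /* Fill row r, column c with 10*r + c so the transpose is visible. */
  uint32_t m[4][4];
  for (int r = 0; r < 4; r++)
    for (int c = 0; c < 4; c++)
      m[r][c] = 10u * (uint32_t)r + (uint32_t)c;
  transpose_4x4(m);
  /* Each printed row is now a former column: 0 10 20 30, 1 11 21 31, ... */
  for (int r = 0; r < 4; r++)
    printf("%2u %2u %2u %2u\n", (unsigned)m[r][0], (unsigned)m[r][1],
           (unsigned)m[r][2], (unsigned)m[r][3]);
  return 0;
}
```

transpose_msg_vecs4 applies this transpose to each group of four loaded vectors, turning "one vector per 16-byte chunk of one input" into "one vector per message word across four inputs", which is the layout round_fn4 expects.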
@@ -0,0 +1,160 @@ (data/ext/blake3/c/blake3_portable.c, per the manifest above)

#include "blake3_impl.h"
#include <string.h>

INLINE uint32_t rotr32(uint32_t w, uint32_t c) {
  return (w >> c) | (w << (32 - c));
}

INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d,
              uint32_t x, uint32_t y) {
  state[a] = state[a] + state[b] + x;
  state[d] = rotr32(state[d] ^ state[a], 16);
  state[c] = state[c] + state[d];
  state[b] = rotr32(state[b] ^ state[c], 12);
  state[a] = state[a] + state[b] + y;
  state[d] = rotr32(state[d] ^ state[a], 8);
  state[c] = state[c] + state[d];
  state[b] = rotr32(state[b] ^ state[c], 7);
}

INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) {
  // Select the message schedule based on the round.
  const uint8_t *schedule = MSG_SCHEDULE[round];

  // Mix the columns.
  g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
  g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
  g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
  g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);

  // Mix the rows.
  g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
  g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
  g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
  g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
}

INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags) {
  uint32_t block_words[16];
  block_words[0] = load32(block + 4 * 0);
  block_words[1] = load32(block + 4 * 1);
  block_words[2] = load32(block + 4 * 2);
  block_words[3] = load32(block + 4 * 3);
  block_words[4] = load32(block + 4 * 4);
  block_words[5] = load32(block + 4 * 5);
  block_words[6] = load32(block + 4 * 6);
  block_words[7] = load32(block + 4 * 7);
  block_words[8] = load32(block + 4 * 8);
  block_words[9] = load32(block + 4 * 9);
  block_words[10] = load32(block + 4 * 10);
  block_words[11] = load32(block + 4 * 11);
  block_words[12] = load32(block + 4 * 12);
  block_words[13] = load32(block + 4 * 13);
  block_words[14] = load32(block + 4 * 14);
  block_words[15] = load32(block + 4 * 15);

  state[0] = cv[0];
  state[1] = cv[1];
  state[2] = cv[2];
  state[3] = cv[3];
  state[4] = cv[4];
  state[5] = cv[5];
  state[6] = cv[6];
  state[7] = cv[7];
  state[8] = IV[0];
  state[9] = IV[1];
  state[10] = IV[2];
  state[11] = IV[3];
  state[12] = counter_low(counter);
  state[13] = counter_high(counter);
  state[14] = (uint32_t)block_len;
  state[15] = (uint32_t)flags;

  round_fn(state, &block_words[0], 0);
  round_fn(state, &block_words[0], 1);
  round_fn(state, &block_words[0], 2);
  round_fn(state, &block_words[0], 3);
  round_fn(state, &block_words[0], 4);
  round_fn(state, &block_words[0], 5);
  round_fn(state, &block_words[0], 6);
}

void blake3_compress_in_place_portable(uint32_t cv[8],
                                       const uint8_t block[BLAKE3_BLOCK_LEN],
                                       uint8_t block_len, uint64_t counter,
                                       uint8_t flags) {
  uint32_t state[16];
  compress_pre(state, cv, block, block_len, counter, flags);
  cv[0] = state[0] ^ state[8];
  cv[1] = state[1] ^ state[9];
  cv[2] = state[2] ^ state[10];
  cv[3] = state[3] ^ state[11];
  cv[4] = state[4] ^ state[12];
  cv[5] = state[5] ^ state[13];
  cv[6] = state[6] ^ state[14];
  cv[7] = state[7] ^ state[15];
}

void blake3_compress_xof_portable(const uint32_t cv[8],
                                  const uint8_t block[BLAKE3_BLOCK_LEN],
                                  uint8_t block_len, uint64_t counter,
                                  uint8_t flags, uint8_t out[64]) {
  uint32_t state[16];
  compress_pre(state, cv, block, block_len, counter, flags);

  store32(&out[0 * 4], state[0] ^ state[8]);
  store32(&out[1 * 4], state[1] ^ state[9]);
  store32(&out[2 * 4], state[2] ^ state[10]);
  store32(&out[3 * 4], state[3] ^ state[11]);
  store32(&out[4 * 4], state[4] ^ state[12]);
  store32(&out[5 * 4], state[5] ^ state[13]);
  store32(&out[6 * 4], state[6] ^ state[14]);
  store32(&out[7 * 4], state[7] ^ state[15]);
  store32(&out[8 * 4], state[8] ^ cv[0]);
  store32(&out[9 * 4], state[9] ^ cv[1]);
  store32(&out[10 * 4], state[10] ^ cv[2]);
  store32(&out[11 * 4], state[11] ^ cv[3]);
  store32(&out[12 * 4], state[12] ^ cv[4]);
  store32(&out[13 * 4], state[13] ^ cv[5]);
  store32(&out[14 * 4], state[14] ^ cv[6]);
  store32(&out[15 * 4], state[15] ^ cv[7]);
}

INLINE void hash_one_portable(const uint8_t *input, size_t blocks,
                              const uint32_t key[8], uint64_t counter,
                              uint8_t flags, uint8_t flags_start,
                              uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
  uint32_t cv[8];
  memcpy(cv, key, BLAKE3_KEY_LEN);
  uint8_t block_flags = flags | flags_start;
  while (blocks > 0) {
    if (blocks == 1) {
      block_flags |= flags_end;
    }
    blake3_compress_in_place_portable(cv, input, BLAKE3_BLOCK_LEN, counter,
                                      block_flags);
    input = &input[BLAKE3_BLOCK_LEN];
    blocks -= 1;
    block_flags = flags;
  }
  store_cv_words(out, cv);
}

void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
                               size_t blocks, const uint32_t key[8],
                               uint64_t counter, bool increment_counter,
                               uint8_t flags, uint8_t flags_start,
                               uint8_t flags_end, uint8_t *out) {
  while (num_inputs > 0) {
    hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start,
                      flags_end, out);
    if (increment_counter) {
      counter += 1;
    }
    inputs += 1;
    num_inputs -= 1;
    out = &out[BLAKE3_OUT_LEN];
  }
}
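Both files lean on small helpers declared in blake3_impl.h, which this page's diff does not show: load32/store32 for little-endian word access, counter_low/counter_high to split the 64-bit chunk counter into state words 12 and 13, and store_cv_words to serialize a chaining value. A sketch of their likely shape, for orientation only; the vendored data/ext/blake3/c/blake3_impl.h is authoritative:

```c
#include <stdint.h>

/* Little-endian 32-bit load/store, byte by byte, so neither alignment nor
 * host endianness matters. */
static inline uint32_t load32(const void *src) {
  const uint8_t *p = (const uint8_t *)src;
  return ((uint32_t)p[0] << 0) | ((uint32_t)p[1] << 8) |
         ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

static inline void store32(void *dst, uint32_t w) {
  uint8_t *p = (uint8_t *)dst;
  p[0] = (uint8_t)(w >> 0);
  p[1] = (uint8_t)(w >> 8);
  p[2] = (uint8_t)(w >> 16);
  p[3] = (uint8_t)(w >> 24);
}

/* The 64-bit chunk counter enters the compression state as two 32-bit
 * halves (state[12] and state[13] in compress_pre above). */
static inline uint32_t counter_low(uint64_t counter) {
  return (uint32_t)counter;
}

static inline uint32_t counter_high(uint64_t counter) {
  return (uint32_t)(counter >> 32);
}

/* Store an 8-word chaining value as 32 little-endian output bytes. */
static inline void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) {
  for (int i = 0; i < 8; i++) {
    store32(&bytes_out[i * 4], cv_words[i]);
  }
}
```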