uncle_blake3 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.md +27 -0
  3. data/README.md +89 -0
  4. data/ext/Rakefile +55 -0
  5. data/ext/binding/uncle_blake3.c +41 -0
  6. data/ext/blake3/c/Makefile.testing +82 -0
  7. data/ext/blake3/c/README.md +316 -0
  8. data/ext/blake3/c/blake3.c +616 -0
  9. data/ext/blake3/c/blake3.h +60 -0
  10. data/ext/blake3/c/blake3_avx2.c +326 -0
  11. data/ext/blake3/c/blake3_avx2_x86-64_unix.S +1815 -0
  12. data/ext/blake3/c/blake3_avx2_x86-64_windows_gnu.S +1817 -0
  13. data/ext/blake3/c/blake3_avx2_x86-64_windows_msvc.asm +1828 -0
  14. data/ext/blake3/c/blake3_avx512.c +1207 -0
  15. data/ext/blake3/c/blake3_avx512_x86-64_unix.S +2585 -0
  16. data/ext/blake3/c/blake3_avx512_x86-64_windows_gnu.S +2615 -0
  17. data/ext/blake3/c/blake3_avx512_x86-64_windows_msvc.asm +2634 -0
  18. data/ext/blake3/c/blake3_dispatch.c +276 -0
  19. data/ext/blake3/c/blake3_impl.h +282 -0
  20. data/ext/blake3/c/blake3_neon.c +351 -0
  21. data/ext/blake3/c/blake3_portable.c +160 -0
  22. data/ext/blake3/c/blake3_sse2.c +566 -0
  23. data/ext/blake3/c/blake3_sse2_x86-64_unix.S +2291 -0
  24. data/ext/blake3/c/blake3_sse2_x86-64_windows_gnu.S +2332 -0
  25. data/ext/blake3/c/blake3_sse2_x86-64_windows_msvc.asm +2350 -0
  26. data/ext/blake3/c/blake3_sse41.c +560 -0
  27. data/ext/blake3/c/blake3_sse41_x86-64_unix.S +2028 -0
  28. data/ext/blake3/c/blake3_sse41_x86-64_windows_gnu.S +2069 -0
  29. data/ext/blake3/c/blake3_sse41_x86-64_windows_msvc.asm +2089 -0
  30. data/ext/blake3/c/example.c +37 -0
  31. data/ext/blake3/c/main.c +166 -0
  32. data/ext/blake3/c/test.py +97 -0
  33. data/lib/uncle_blake3/binding.rb +20 -0
  34. data/lib/uncle_blake3/build/loader.rb +40 -0
  35. data/lib/uncle_blake3/build/platform.rb +37 -0
  36. data/lib/uncle_blake3/build.rb +4 -0
  37. data/lib/uncle_blake3/digest.rb +119 -0
  38. data/lib/uncle_blake3/version.rb +5 -0
  39. data/lib/uncle_blake3.rb +7 -0
  40. metadata +112 -0
@@ -0,0 +1,2028 @@
1
+ #if defined(__ELF__) && defined(__linux__)
2
+ .section .note.GNU-stack,"",%progbits /* ELF/Linux only: emit an empty .note.GNU-stack section (conventionally marks the object as not needing an executable stack) */
3
+ #endif
4
+ /* When targeting ELF with CET enabled and <cet.h> is available, include it; NOTE(review): presumably it supplies _CET_ENDBR - confirm */
5
+ #if defined(__ELF__) && defined(__CET__) && defined(__has_include)
6
+ #if __has_include(<cet.h>)
7
+ #include <cet.h>
8
+ #endif
9
+ #endif
10
+ /* Fallback: if nothing defined _CET_ENDBR, define it as empty so its use at function entry expands to nothing */
11
+ #if !defined(_CET_ENDBR)
12
+ #define _CET_ENDBR
13
+ #endif
14
+ /* Everything below uses Intel operand order (destination first) with unprefixed register names */
15
+ .intel_syntax noprefix
16
+ .global blake3_hash_many_sse41 /* each entry point is exported under a plain and an underscore-prefixed name; presumably for targets whose C symbols carry a leading underscore - confirm */
17
+ .global _blake3_hash_many_sse41
18
+ .global blake3_compress_in_place_sse41
19
+ .global _blake3_compress_in_place_sse41
20
+ .global blake3_compress_xof_sse41
21
+ .global _blake3_compress_xof_sse41
22
+ #ifdef __APPLE__
23
+ .text /* Apple builds: select the code section with the bare .text directive */
24
+ #else
25
+ .section .text /* all other builds: select the code section with the explicit .section form */
26
+ #endif
27
+ .p2align 6
28
+ _blake3_hash_many_sse41:
29
+ blake3_hash_many_sse41:
30
+ _CET_ENDBR
31
+ push r15
32
+ push r14
33
+ push r13
34
+ push r12
35
+ push rbx
36
+ push rbp
37
+ mov rbp, rsp
38
+ sub rsp, 360
39
+ and rsp, 0xFFFFFFFFFFFFFFC0
40
+ neg r9d
41
+ movd xmm0, r9d
42
+ pshufd xmm0, xmm0, 0x00
43
+ movdqa xmmword ptr [rsp+0x130], xmm0
44
+ movdqa xmm1, xmm0
45
+ pand xmm1, xmmword ptr [ADD0+rip]
46
+ pand xmm0, xmmword ptr [ADD1+rip]
47
+ movdqa xmmword ptr [rsp+0x150], xmm0
48
+ movd xmm0, r8d
49
+ pshufd xmm0, xmm0, 0x00
50
+ paddd xmm0, xmm1
51
+ movdqa xmmword ptr [rsp+0x110], xmm0
52
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
53
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
54
+ pcmpgtd xmm1, xmm0
55
+ shr r8, 32
56
+ movd xmm2, r8d
57
+ pshufd xmm2, xmm2, 0x00
58
+ psubd xmm2, xmm1
59
+ movdqa xmmword ptr [rsp+0x120], xmm2
60
+ mov rbx, qword ptr [rbp+0x50]
61
+ mov r15, rdx
62
+ shl r15, 6
63
+ movzx r13d, byte ptr [rbp+0x38]
64
+ movzx r12d, byte ptr [rbp+0x48]
65
+ cmp rsi, 4
66
+ jc 3f
67
+ 2:
68
+ movdqu xmm3, xmmword ptr [rcx]
69
+ pshufd xmm0, xmm3, 0x00
70
+ pshufd xmm1, xmm3, 0x55
71
+ pshufd xmm2, xmm3, 0xAA
72
+ pshufd xmm3, xmm3, 0xFF
73
+ movdqu xmm7, xmmword ptr [rcx+0x10]
74
+ pshufd xmm4, xmm7, 0x00
75
+ pshufd xmm5, xmm7, 0x55
76
+ pshufd xmm6, xmm7, 0xAA
77
+ pshufd xmm7, xmm7, 0xFF
78
+ mov r8, qword ptr [rdi]
79
+ mov r9, qword ptr [rdi+0x8]
80
+ mov r10, qword ptr [rdi+0x10]
81
+ mov r11, qword ptr [rdi+0x18]
82
+ movzx eax, byte ptr [rbp+0x40]
83
+ or eax, r13d
84
+ xor edx, edx
85
+ 9:
86
+ mov r14d, eax
87
+ or eax, r12d
88
+ add rdx, 64
89
+ cmp rdx, r15
90
+ cmovne eax, r14d
91
+ movdqu xmm8, xmmword ptr [r8+rdx-0x40]
92
+ movdqu xmm9, xmmword ptr [r9+rdx-0x40]
93
+ movdqu xmm10, xmmword ptr [r10+rdx-0x40]
94
+ movdqu xmm11, xmmword ptr [r11+rdx-0x40]
95
+ movdqa xmm12, xmm8
96
+ punpckldq xmm8, xmm9
97
+ punpckhdq xmm12, xmm9
98
+ movdqa xmm14, xmm10
99
+ punpckldq xmm10, xmm11
100
+ punpckhdq xmm14, xmm11
101
+ movdqa xmm9, xmm8
102
+ punpcklqdq xmm8, xmm10
103
+ punpckhqdq xmm9, xmm10
104
+ movdqa xmm13, xmm12
105
+ punpcklqdq xmm12, xmm14
106
+ punpckhqdq xmm13, xmm14
107
+ movdqa xmmword ptr [rsp], xmm8
108
+ movdqa xmmword ptr [rsp+0x10], xmm9
109
+ movdqa xmmword ptr [rsp+0x20], xmm12
110
+ movdqa xmmword ptr [rsp+0x30], xmm13
111
+ movdqu xmm8, xmmword ptr [r8+rdx-0x30]
112
+ movdqu xmm9, xmmword ptr [r9+rdx-0x30]
113
+ movdqu xmm10, xmmword ptr [r10+rdx-0x30]
114
+ movdqu xmm11, xmmword ptr [r11+rdx-0x30]
115
+ movdqa xmm12, xmm8
116
+ punpckldq xmm8, xmm9
117
+ punpckhdq xmm12, xmm9
118
+ movdqa xmm14, xmm10
119
+ punpckldq xmm10, xmm11
120
+ punpckhdq xmm14, xmm11
121
+ movdqa xmm9, xmm8
122
+ punpcklqdq xmm8, xmm10
123
+ punpckhqdq xmm9, xmm10
124
+ movdqa xmm13, xmm12
125
+ punpcklqdq xmm12, xmm14
126
+ punpckhqdq xmm13, xmm14
127
+ movdqa xmmword ptr [rsp+0x40], xmm8
128
+ movdqa xmmword ptr [rsp+0x50], xmm9
129
+ movdqa xmmword ptr [rsp+0x60], xmm12
130
+ movdqa xmmword ptr [rsp+0x70], xmm13
131
+ movdqu xmm8, xmmword ptr [r8+rdx-0x20]
132
+ movdqu xmm9, xmmword ptr [r9+rdx-0x20]
133
+ movdqu xmm10, xmmword ptr [r10+rdx-0x20]
134
+ movdqu xmm11, xmmword ptr [r11+rdx-0x20]
135
+ movdqa xmm12, xmm8
136
+ punpckldq xmm8, xmm9
137
+ punpckhdq xmm12, xmm9
138
+ movdqa xmm14, xmm10
139
+ punpckldq xmm10, xmm11
140
+ punpckhdq xmm14, xmm11
141
+ movdqa xmm9, xmm8
142
+ punpcklqdq xmm8, xmm10
143
+ punpckhqdq xmm9, xmm10
144
+ movdqa xmm13, xmm12
145
+ punpcklqdq xmm12, xmm14
146
+ punpckhqdq xmm13, xmm14
147
+ movdqa xmmword ptr [rsp+0x80], xmm8
148
+ movdqa xmmword ptr [rsp+0x90], xmm9
149
+ movdqa xmmword ptr [rsp+0xA0], xmm12
150
+ movdqa xmmword ptr [rsp+0xB0], xmm13
151
+ movdqu xmm8, xmmword ptr [r8+rdx-0x10]
152
+ movdqu xmm9, xmmword ptr [r9+rdx-0x10]
153
+ movdqu xmm10, xmmword ptr [r10+rdx-0x10]
154
+ movdqu xmm11, xmmword ptr [r11+rdx-0x10]
155
+ movdqa xmm12, xmm8
156
+ punpckldq xmm8, xmm9
157
+ punpckhdq xmm12, xmm9
158
+ movdqa xmm14, xmm10
159
+ punpckldq xmm10, xmm11
160
+ punpckhdq xmm14, xmm11
161
+ movdqa xmm9, xmm8
162
+ punpcklqdq xmm8, xmm10
163
+ punpckhqdq xmm9, xmm10
164
+ movdqa xmm13, xmm12
165
+ punpcklqdq xmm12, xmm14
166
+ punpckhqdq xmm13, xmm14
167
+ movdqa xmmword ptr [rsp+0xC0], xmm8
168
+ movdqa xmmword ptr [rsp+0xD0], xmm9
169
+ movdqa xmmword ptr [rsp+0xE0], xmm12
170
+ movdqa xmmword ptr [rsp+0xF0], xmm13
171
+ movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
172
+ movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
173
+ movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
174
+ movdqa xmm12, xmmword ptr [rsp+0x110]
175
+ movdqa xmm13, xmmword ptr [rsp+0x120]
176
+ movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
177
+ movd xmm15, eax
178
+ pshufd xmm15, xmm15, 0x00
179
+ prefetcht0 [r8+rdx+0x80]
180
+ prefetcht0 [r9+rdx+0x80]
181
+ prefetcht0 [r10+rdx+0x80]
182
+ prefetcht0 [r11+rdx+0x80]
183
+ paddd xmm0, xmmword ptr [rsp]
184
+ paddd xmm1, xmmword ptr [rsp+0x20]
185
+ paddd xmm2, xmmword ptr [rsp+0x40]
186
+ paddd xmm3, xmmword ptr [rsp+0x60]
187
+ paddd xmm0, xmm4
188
+ paddd xmm1, xmm5
189
+ paddd xmm2, xmm6
190
+ paddd xmm3, xmm7
191
+ pxor xmm12, xmm0
192
+ pxor xmm13, xmm1
193
+ pxor xmm14, xmm2
194
+ pxor xmm15, xmm3
195
+ movdqa xmm8, xmmword ptr [ROT16+rip]
196
+ pshufb xmm12, xmm8
197
+ pshufb xmm13, xmm8
198
+ pshufb xmm14, xmm8
199
+ pshufb xmm15, xmm8
200
+ movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
201
+ paddd xmm8, xmm12
202
+ paddd xmm9, xmm13
203
+ paddd xmm10, xmm14
204
+ paddd xmm11, xmm15
205
+ pxor xmm4, xmm8
206
+ pxor xmm5, xmm9
207
+ pxor xmm6, xmm10
208
+ pxor xmm7, xmm11
209
+ movdqa xmmword ptr [rsp+0x100], xmm8
210
+ movdqa xmm8, xmm4
211
+ psrld xmm8, 12
212
+ pslld xmm4, 20
213
+ por xmm4, xmm8
214
+ movdqa xmm8, xmm5
215
+ psrld xmm8, 12
216
+ pslld xmm5, 20
217
+ por xmm5, xmm8
218
+ movdqa xmm8, xmm6
219
+ psrld xmm8, 12
220
+ pslld xmm6, 20
221
+ por xmm6, xmm8
222
+ movdqa xmm8, xmm7
223
+ psrld xmm8, 12
224
+ pslld xmm7, 20
225
+ por xmm7, xmm8
226
+ paddd xmm0, xmmword ptr [rsp+0x10]
227
+ paddd xmm1, xmmword ptr [rsp+0x30]
228
+ paddd xmm2, xmmword ptr [rsp+0x50]
229
+ paddd xmm3, xmmword ptr [rsp+0x70]
230
+ paddd xmm0, xmm4
231
+ paddd xmm1, xmm5
232
+ paddd xmm2, xmm6
233
+ paddd xmm3, xmm7
234
+ pxor xmm12, xmm0
235
+ pxor xmm13, xmm1
236
+ pxor xmm14, xmm2
237
+ pxor xmm15, xmm3
238
+ movdqa xmm8, xmmword ptr [ROT8+rip]
239
+ pshufb xmm12, xmm8
240
+ pshufb xmm13, xmm8
241
+ pshufb xmm14, xmm8
242
+ pshufb xmm15, xmm8
243
+ movdqa xmm8, xmmword ptr [rsp+0x100]
244
+ paddd xmm8, xmm12
245
+ paddd xmm9, xmm13
246
+ paddd xmm10, xmm14
247
+ paddd xmm11, xmm15
248
+ pxor xmm4, xmm8
249
+ pxor xmm5, xmm9
250
+ pxor xmm6, xmm10
251
+ pxor xmm7, xmm11
252
+ movdqa xmmword ptr [rsp+0x100], xmm8
253
+ movdqa xmm8, xmm4
254
+ psrld xmm8, 7
255
+ pslld xmm4, 25
256
+ por xmm4, xmm8
257
+ movdqa xmm8, xmm5
258
+ psrld xmm8, 7
259
+ pslld xmm5, 25
260
+ por xmm5, xmm8
261
+ movdqa xmm8, xmm6
262
+ psrld xmm8, 7
263
+ pslld xmm6, 25
264
+ por xmm6, xmm8
265
+ movdqa xmm8, xmm7
266
+ psrld xmm8, 7
267
+ pslld xmm7, 25
268
+ por xmm7, xmm8
269
+ paddd xmm0, xmmword ptr [rsp+0x80]
270
+ paddd xmm1, xmmword ptr [rsp+0xA0]
271
+ paddd xmm2, xmmword ptr [rsp+0xC0]
272
+ paddd xmm3, xmmword ptr [rsp+0xE0]
273
+ paddd xmm0, xmm5
274
+ paddd xmm1, xmm6
275
+ paddd xmm2, xmm7
276
+ paddd xmm3, xmm4
277
+ pxor xmm15, xmm0
278
+ pxor xmm12, xmm1
279
+ pxor xmm13, xmm2
280
+ pxor xmm14, xmm3
281
+ movdqa xmm8, xmmword ptr [ROT16+rip]
282
+ pshufb xmm15, xmm8
283
+ pshufb xmm12, xmm8
284
+ pshufb xmm13, xmm8
285
+ pshufb xmm14, xmm8
286
+ paddd xmm10, xmm15
287
+ paddd xmm11, xmm12
288
+ movdqa xmm8, xmmword ptr [rsp+0x100]
289
+ paddd xmm8, xmm13
290
+ paddd xmm9, xmm14
291
+ pxor xmm5, xmm10
292
+ pxor xmm6, xmm11
293
+ pxor xmm7, xmm8
294
+ pxor xmm4, xmm9
295
+ movdqa xmmword ptr [rsp+0x100], xmm8
296
+ movdqa xmm8, xmm5
297
+ psrld xmm8, 12
298
+ pslld xmm5, 20
299
+ por xmm5, xmm8
300
+ movdqa xmm8, xmm6
301
+ psrld xmm8, 12
302
+ pslld xmm6, 20
303
+ por xmm6, xmm8
304
+ movdqa xmm8, xmm7
305
+ psrld xmm8, 12
306
+ pslld xmm7, 20
307
+ por xmm7, xmm8
308
+ movdqa xmm8, xmm4
309
+ psrld xmm8, 12
310
+ pslld xmm4, 20
311
+ por xmm4, xmm8
312
+ paddd xmm0, xmmword ptr [rsp+0x90]
313
+ paddd xmm1, xmmword ptr [rsp+0xB0]
314
+ paddd xmm2, xmmword ptr [rsp+0xD0]
315
+ paddd xmm3, xmmword ptr [rsp+0xF0]
316
+ paddd xmm0, xmm5
317
+ paddd xmm1, xmm6
318
+ paddd xmm2, xmm7
319
+ paddd xmm3, xmm4
320
+ pxor xmm15, xmm0
321
+ pxor xmm12, xmm1
322
+ pxor xmm13, xmm2
323
+ pxor xmm14, xmm3
324
+ movdqa xmm8, xmmword ptr [ROT8+rip]
325
+ pshufb xmm15, xmm8
326
+ pshufb xmm12, xmm8
327
+ pshufb xmm13, xmm8
328
+ pshufb xmm14, xmm8
329
+ paddd xmm10, xmm15
330
+ paddd xmm11, xmm12
331
+ movdqa xmm8, xmmword ptr [rsp+0x100]
332
+ paddd xmm8, xmm13
333
+ paddd xmm9, xmm14
334
+ pxor xmm5, xmm10
335
+ pxor xmm6, xmm11
336
+ pxor xmm7, xmm8
337
+ pxor xmm4, xmm9
338
+ movdqa xmmword ptr [rsp+0x100], xmm8
339
+ movdqa xmm8, xmm5
340
+ psrld xmm8, 7
341
+ pslld xmm5, 25
342
+ por xmm5, xmm8
343
+ movdqa xmm8, xmm6
344
+ psrld xmm8, 7
345
+ pslld xmm6, 25
346
+ por xmm6, xmm8
347
+ movdqa xmm8, xmm7
348
+ psrld xmm8, 7
349
+ pslld xmm7, 25
350
+ por xmm7, xmm8
351
+ movdqa xmm8, xmm4
352
+ psrld xmm8, 7
353
+ pslld xmm4, 25
354
+ por xmm4, xmm8
355
+ paddd xmm0, xmmword ptr [rsp+0x20]
356
+ paddd xmm1, xmmword ptr [rsp+0x30]
357
+ paddd xmm2, xmmword ptr [rsp+0x70]
358
+ paddd xmm3, xmmword ptr [rsp+0x40]
359
+ paddd xmm0, xmm4
360
+ paddd xmm1, xmm5
361
+ paddd xmm2, xmm6
362
+ paddd xmm3, xmm7
363
+ pxor xmm12, xmm0
364
+ pxor xmm13, xmm1
365
+ pxor xmm14, xmm2
366
+ pxor xmm15, xmm3
367
+ movdqa xmm8, xmmword ptr [ROT16+rip]
368
+ pshufb xmm12, xmm8
369
+ pshufb xmm13, xmm8
370
+ pshufb xmm14, xmm8
371
+ pshufb xmm15, xmm8
372
+ movdqa xmm8, xmmword ptr [rsp+0x100]
373
+ paddd xmm8, xmm12
374
+ paddd xmm9, xmm13
375
+ paddd xmm10, xmm14
376
+ paddd xmm11, xmm15
377
+ pxor xmm4, xmm8
378
+ pxor xmm5, xmm9
379
+ pxor xmm6, xmm10
380
+ pxor xmm7, xmm11
381
+ movdqa xmmword ptr [rsp+0x100], xmm8
382
+ movdqa xmm8, xmm4
383
+ psrld xmm8, 12
384
+ pslld xmm4, 20
385
+ por xmm4, xmm8
386
+ movdqa xmm8, xmm5
387
+ psrld xmm8, 12
388
+ pslld xmm5, 20
389
+ por xmm5, xmm8
390
+ movdqa xmm8, xmm6
391
+ psrld xmm8, 12
392
+ pslld xmm6, 20
393
+ por xmm6, xmm8
394
+ movdqa xmm8, xmm7
395
+ psrld xmm8, 12
396
+ pslld xmm7, 20
397
+ por xmm7, xmm8
398
+ paddd xmm0, xmmword ptr [rsp+0x60]
399
+ paddd xmm1, xmmword ptr [rsp+0xA0]
400
+ paddd xmm2, xmmword ptr [rsp]
401
+ paddd xmm3, xmmword ptr [rsp+0xD0]
402
+ paddd xmm0, xmm4
403
+ paddd xmm1, xmm5
404
+ paddd xmm2, xmm6
405
+ paddd xmm3, xmm7
406
+ pxor xmm12, xmm0
407
+ pxor xmm13, xmm1
408
+ pxor xmm14, xmm2
409
+ pxor xmm15, xmm3
410
+ movdqa xmm8, xmmword ptr [ROT8+rip]
411
+ pshufb xmm12, xmm8
412
+ pshufb xmm13, xmm8
413
+ pshufb xmm14, xmm8
414
+ pshufb xmm15, xmm8
415
+ movdqa xmm8, xmmword ptr [rsp+0x100]
416
+ paddd xmm8, xmm12
417
+ paddd xmm9, xmm13
418
+ paddd xmm10, xmm14
419
+ paddd xmm11, xmm15
420
+ pxor xmm4, xmm8
421
+ pxor xmm5, xmm9
422
+ pxor xmm6, xmm10
423
+ pxor xmm7, xmm11
424
+ movdqa xmmword ptr [rsp+0x100], xmm8
425
+ movdqa xmm8, xmm4
426
+ psrld xmm8, 7
427
+ pslld xmm4, 25
428
+ por xmm4, xmm8
429
+ movdqa xmm8, xmm5
430
+ psrld xmm8, 7
431
+ pslld xmm5, 25
432
+ por xmm5, xmm8
433
+ movdqa xmm8, xmm6
434
+ psrld xmm8, 7
435
+ pslld xmm6, 25
436
+ por xmm6, xmm8
437
+ movdqa xmm8, xmm7
438
+ psrld xmm8, 7
439
+ pslld xmm7, 25
440
+ por xmm7, xmm8
441
+ paddd xmm0, xmmword ptr [rsp+0x10]
442
+ paddd xmm1, xmmword ptr [rsp+0xC0]
443
+ paddd xmm2, xmmword ptr [rsp+0x90]
444
+ paddd xmm3, xmmword ptr [rsp+0xF0]
445
+ paddd xmm0, xmm5
446
+ paddd xmm1, xmm6
447
+ paddd xmm2, xmm7
448
+ paddd xmm3, xmm4
449
+ pxor xmm15, xmm0
450
+ pxor xmm12, xmm1
451
+ pxor xmm13, xmm2
452
+ pxor xmm14, xmm3
453
+ movdqa xmm8, xmmword ptr [ROT16+rip]
454
+ pshufb xmm15, xmm8
455
+ pshufb xmm12, xmm8
456
+ pshufb xmm13, xmm8
457
+ pshufb xmm14, xmm8
458
+ paddd xmm10, xmm15
459
+ paddd xmm11, xmm12
460
+ movdqa xmm8, xmmword ptr [rsp+0x100]
461
+ paddd xmm8, xmm13
462
+ paddd xmm9, xmm14
463
+ pxor xmm5, xmm10
464
+ pxor xmm6, xmm11
465
+ pxor xmm7, xmm8
466
+ pxor xmm4, xmm9
467
+ movdqa xmmword ptr [rsp+0x100], xmm8
468
+ movdqa xmm8, xmm5
469
+ psrld xmm8, 12
470
+ pslld xmm5, 20
471
+ por xmm5, xmm8
472
+ movdqa xmm8, xmm6
473
+ psrld xmm8, 12
474
+ pslld xmm6, 20
475
+ por xmm6, xmm8
476
+ movdqa xmm8, xmm7
477
+ psrld xmm8, 12
478
+ pslld xmm7, 20
479
+ por xmm7, xmm8
480
+ movdqa xmm8, xmm4
481
+ psrld xmm8, 12
482
+ pslld xmm4, 20
483
+ por xmm4, xmm8
484
+ paddd xmm0, xmmword ptr [rsp+0xB0]
485
+ paddd xmm1, xmmword ptr [rsp+0x50]
486
+ paddd xmm2, xmmword ptr [rsp+0xE0]
487
+ paddd xmm3, xmmword ptr [rsp+0x80]
488
+ paddd xmm0, xmm5
489
+ paddd xmm1, xmm6
490
+ paddd xmm2, xmm7
491
+ paddd xmm3, xmm4
492
+ pxor xmm15, xmm0
493
+ pxor xmm12, xmm1
494
+ pxor xmm13, xmm2
495
+ pxor xmm14, xmm3
496
+ movdqa xmm8, xmmword ptr [ROT8+rip]
497
+ pshufb xmm15, xmm8
498
+ pshufb xmm12, xmm8
499
+ pshufb xmm13, xmm8
500
+ pshufb xmm14, xmm8
501
+ paddd xmm10, xmm15
502
+ paddd xmm11, xmm12
503
+ movdqa xmm8, xmmword ptr [rsp+0x100]
504
+ paddd xmm8, xmm13
505
+ paddd xmm9, xmm14
506
+ pxor xmm5, xmm10
507
+ pxor xmm6, xmm11
508
+ pxor xmm7, xmm8
509
+ pxor xmm4, xmm9
510
+ movdqa xmmword ptr [rsp+0x100], xmm8
511
+ movdqa xmm8, xmm5
512
+ psrld xmm8, 7
513
+ pslld xmm5, 25
514
+ por xmm5, xmm8
515
+ movdqa xmm8, xmm6
516
+ psrld xmm8, 7
517
+ pslld xmm6, 25
518
+ por xmm6, xmm8
519
+ movdqa xmm8, xmm7
520
+ psrld xmm8, 7
521
+ pslld xmm7, 25
522
+ por xmm7, xmm8
523
+ movdqa xmm8, xmm4
524
+ psrld xmm8, 7
525
+ pslld xmm4, 25
526
+ por xmm4, xmm8
527
+ paddd xmm0, xmmword ptr [rsp+0x30]
528
+ paddd xmm1, xmmword ptr [rsp+0xA0]
529
+ paddd xmm2, xmmword ptr [rsp+0xD0]
530
+ paddd xmm3, xmmword ptr [rsp+0x70]
531
+ paddd xmm0, xmm4
532
+ paddd xmm1, xmm5
533
+ paddd xmm2, xmm6
534
+ paddd xmm3, xmm7
535
+ pxor xmm12, xmm0
536
+ pxor xmm13, xmm1
537
+ pxor xmm14, xmm2
538
+ pxor xmm15, xmm3
539
+ movdqa xmm8, xmmword ptr [ROT16+rip]
540
+ pshufb xmm12, xmm8
541
+ pshufb xmm13, xmm8
542
+ pshufb xmm14, xmm8
543
+ pshufb xmm15, xmm8
544
+ movdqa xmm8, xmmword ptr [rsp+0x100]
545
+ paddd xmm8, xmm12
546
+ paddd xmm9, xmm13
547
+ paddd xmm10, xmm14
548
+ paddd xmm11, xmm15
549
+ pxor xmm4, xmm8
550
+ pxor xmm5, xmm9
551
+ pxor xmm6, xmm10
552
+ pxor xmm7, xmm11
553
+ movdqa xmmword ptr [rsp+0x100], xmm8
554
+ movdqa xmm8, xmm4
555
+ psrld xmm8, 12
556
+ pslld xmm4, 20
557
+ por xmm4, xmm8
558
+ movdqa xmm8, xmm5
559
+ psrld xmm8, 12
560
+ pslld xmm5, 20
561
+ por xmm5, xmm8
562
+ movdqa xmm8, xmm6
563
+ psrld xmm8, 12
564
+ pslld xmm6, 20
565
+ por xmm6, xmm8
566
+ movdqa xmm8, xmm7
567
+ psrld xmm8, 12
568
+ pslld xmm7, 20
569
+ por xmm7, xmm8
570
+ paddd xmm0, xmmword ptr [rsp+0x40]
571
+ paddd xmm1, xmmword ptr [rsp+0xC0]
572
+ paddd xmm2, xmmword ptr [rsp+0x20]
573
+ paddd xmm3, xmmword ptr [rsp+0xE0]
574
+ paddd xmm0, xmm4
575
+ paddd xmm1, xmm5
576
+ paddd xmm2, xmm6
577
+ paddd xmm3, xmm7
578
+ pxor xmm12, xmm0
579
+ pxor xmm13, xmm1
580
+ pxor xmm14, xmm2
581
+ pxor xmm15, xmm3
582
+ movdqa xmm8, xmmword ptr [ROT8+rip]
583
+ pshufb xmm12, xmm8
584
+ pshufb xmm13, xmm8
585
+ pshufb xmm14, xmm8
586
+ pshufb xmm15, xmm8
587
+ movdqa xmm8, xmmword ptr [rsp+0x100]
588
+ paddd xmm8, xmm12
589
+ paddd xmm9, xmm13
590
+ paddd xmm10, xmm14
591
+ paddd xmm11, xmm15
592
+ pxor xmm4, xmm8
593
+ pxor xmm5, xmm9
594
+ pxor xmm6, xmm10
595
+ pxor xmm7, xmm11
596
+ movdqa xmmword ptr [rsp+0x100], xmm8
597
+ movdqa xmm8, xmm4
598
+ psrld xmm8, 7
599
+ pslld xmm4, 25
600
+ por xmm4, xmm8
601
+ movdqa xmm8, xmm5
602
+ psrld xmm8, 7
603
+ pslld xmm5, 25
604
+ por xmm5, xmm8
605
+ movdqa xmm8, xmm6
606
+ psrld xmm8, 7
607
+ pslld xmm6, 25
608
+ por xmm6, xmm8
609
+ movdqa xmm8, xmm7
610
+ psrld xmm8, 7
611
+ pslld xmm7, 25
612
+ por xmm7, xmm8
613
+ paddd xmm0, xmmword ptr [rsp+0x60]
614
+ paddd xmm1, xmmword ptr [rsp+0x90]
615
+ paddd xmm2, xmmword ptr [rsp+0xB0]
616
+ paddd xmm3, xmmword ptr [rsp+0x80]
617
+ paddd xmm0, xmm5
618
+ paddd xmm1, xmm6
619
+ paddd xmm2, xmm7
620
+ paddd xmm3, xmm4
621
+ pxor xmm15, xmm0
622
+ pxor xmm12, xmm1
623
+ pxor xmm13, xmm2
624
+ pxor xmm14, xmm3
625
+ movdqa xmm8, xmmword ptr [ROT16+rip]
626
+ pshufb xmm15, xmm8
627
+ pshufb xmm12, xmm8
628
+ pshufb xmm13, xmm8
629
+ pshufb xmm14, xmm8
630
+ paddd xmm10, xmm15
631
+ paddd xmm11, xmm12
632
+ movdqa xmm8, xmmword ptr [rsp+0x100]
633
+ paddd xmm8, xmm13
634
+ paddd xmm9, xmm14
635
+ pxor xmm5, xmm10
636
+ pxor xmm6, xmm11
637
+ pxor xmm7, xmm8
638
+ pxor xmm4, xmm9
639
+ movdqa xmmword ptr [rsp+0x100], xmm8
640
+ movdqa xmm8, xmm5
641
+ psrld xmm8, 12
642
+ pslld xmm5, 20
643
+ por xmm5, xmm8
644
+ movdqa xmm8, xmm6
645
+ psrld xmm8, 12
646
+ pslld xmm6, 20
647
+ por xmm6, xmm8
648
+ movdqa xmm8, xmm7
649
+ psrld xmm8, 12
650
+ pslld xmm7, 20
651
+ por xmm7, xmm8
652
+ movdqa xmm8, xmm4
653
+ psrld xmm8, 12
654
+ pslld xmm4, 20
655
+ por xmm4, xmm8
656
+ paddd xmm0, xmmword ptr [rsp+0x50]
657
+ paddd xmm1, xmmword ptr [rsp]
658
+ paddd xmm2, xmmword ptr [rsp+0xF0]
659
+ paddd xmm3, xmmword ptr [rsp+0x10]
660
+ paddd xmm0, xmm5
661
+ paddd xmm1, xmm6
662
+ paddd xmm2, xmm7
663
+ paddd xmm3, xmm4
664
+ pxor xmm15, xmm0
665
+ pxor xmm12, xmm1
666
+ pxor xmm13, xmm2
667
+ pxor xmm14, xmm3
668
+ movdqa xmm8, xmmword ptr [ROT8+rip]
669
+ pshufb xmm15, xmm8
670
+ pshufb xmm12, xmm8
671
+ pshufb xmm13, xmm8
672
+ pshufb xmm14, xmm8
673
+ paddd xmm10, xmm15
674
+ paddd xmm11, xmm12
675
+ movdqa xmm8, xmmword ptr [rsp+0x100]
676
+ paddd xmm8, xmm13
677
+ paddd xmm9, xmm14
678
+ pxor xmm5, xmm10
679
+ pxor xmm6, xmm11
680
+ pxor xmm7, xmm8
681
+ pxor xmm4, xmm9
682
+ movdqa xmmword ptr [rsp+0x100], xmm8
683
+ movdqa xmm8, xmm5
684
+ psrld xmm8, 7
685
+ pslld xmm5, 25
686
+ por xmm5, xmm8
687
+ movdqa xmm8, xmm6
688
+ psrld xmm8, 7
689
+ pslld xmm6, 25
690
+ por xmm6, xmm8
691
+ movdqa xmm8, xmm7
692
+ psrld xmm8, 7
693
+ pslld xmm7, 25
694
+ por xmm7, xmm8
695
+ movdqa xmm8, xmm4
696
+ psrld xmm8, 7
697
+ pslld xmm4, 25
698
+ por xmm4, xmm8
699
+ paddd xmm0, xmmword ptr [rsp+0xA0]
700
+ paddd xmm1, xmmword ptr [rsp+0xC0]
701
+ paddd xmm2, xmmword ptr [rsp+0xE0]
702
+ paddd xmm3, xmmword ptr [rsp+0xD0]
703
+ paddd xmm0, xmm4
704
+ paddd xmm1, xmm5
705
+ paddd xmm2, xmm6
706
+ paddd xmm3, xmm7
707
+ pxor xmm12, xmm0
708
+ pxor xmm13, xmm1
709
+ pxor xmm14, xmm2
710
+ pxor xmm15, xmm3
711
+ movdqa xmm8, xmmword ptr [ROT16+rip]
712
+ pshufb xmm12, xmm8
713
+ pshufb xmm13, xmm8
714
+ pshufb xmm14, xmm8
715
+ pshufb xmm15, xmm8
716
+ movdqa xmm8, xmmword ptr [rsp+0x100]
717
+ paddd xmm8, xmm12
718
+ paddd xmm9, xmm13
719
+ paddd xmm10, xmm14
720
+ paddd xmm11, xmm15
721
+ pxor xmm4, xmm8
722
+ pxor xmm5, xmm9
723
+ pxor xmm6, xmm10
724
+ pxor xmm7, xmm11
725
+ movdqa xmmword ptr [rsp+0x100], xmm8
726
+ movdqa xmm8, xmm4
727
+ psrld xmm8, 12
728
+ pslld xmm4, 20
729
+ por xmm4, xmm8
730
+ movdqa xmm8, xmm5
731
+ psrld xmm8, 12
732
+ pslld xmm5, 20
733
+ por xmm5, xmm8
734
+ movdqa xmm8, xmm6
735
+ psrld xmm8, 12
736
+ pslld xmm6, 20
737
+ por xmm6, xmm8
738
+ movdqa xmm8, xmm7
739
+ psrld xmm8, 12
740
+ pslld xmm7, 20
741
+ por xmm7, xmm8
742
+ paddd xmm0, xmmword ptr [rsp+0x70]
743
+ paddd xmm1, xmmword ptr [rsp+0x90]
744
+ paddd xmm2, xmmword ptr [rsp+0x30]
745
+ paddd xmm3, xmmword ptr [rsp+0xF0]
746
+ paddd xmm0, xmm4
747
+ paddd xmm1, xmm5
748
+ paddd xmm2, xmm6
749
+ paddd xmm3, xmm7
750
+ pxor xmm12, xmm0
751
+ pxor xmm13, xmm1
752
+ pxor xmm14, xmm2
753
+ pxor xmm15, xmm3
754
+ movdqa xmm8, xmmword ptr [ROT8+rip]
755
+ pshufb xmm12, xmm8
756
+ pshufb xmm13, xmm8
757
+ pshufb xmm14, xmm8
758
+ pshufb xmm15, xmm8
759
+ movdqa xmm8, xmmword ptr [rsp+0x100]
760
+ paddd xmm8, xmm12
761
+ paddd xmm9, xmm13
762
+ paddd xmm10, xmm14
763
+ paddd xmm11, xmm15
764
+ pxor xmm4, xmm8
765
+ pxor xmm5, xmm9
766
+ pxor xmm6, xmm10
767
+ pxor xmm7, xmm11
768
+ movdqa xmmword ptr [rsp+0x100], xmm8
769
+ movdqa xmm8, xmm4
770
+ psrld xmm8, 7
771
+ pslld xmm4, 25
772
+ por xmm4, xmm8
773
+ movdqa xmm8, xmm5
774
+ psrld xmm8, 7
775
+ pslld xmm5, 25
776
+ por xmm5, xmm8
777
+ movdqa xmm8, xmm6
778
+ psrld xmm8, 7
779
+ pslld xmm6, 25
780
+ por xmm6, xmm8
781
+ movdqa xmm8, xmm7
782
+ psrld xmm8, 7
783
+ pslld xmm7, 25
784
+ por xmm7, xmm8
785
+ paddd xmm0, xmmword ptr [rsp+0x40]
786
+ paddd xmm1, xmmword ptr [rsp+0xB0]
787
+ paddd xmm2, xmmword ptr [rsp+0x50]
788
+ paddd xmm3, xmmword ptr [rsp+0x10]
789
+ paddd xmm0, xmm5
790
+ paddd xmm1, xmm6
791
+ paddd xmm2, xmm7
792
+ paddd xmm3, xmm4
793
+ pxor xmm15, xmm0
794
+ pxor xmm12, xmm1
795
+ pxor xmm13, xmm2
796
+ pxor xmm14, xmm3
797
+ movdqa xmm8, xmmword ptr [ROT16+rip]
798
+ pshufb xmm15, xmm8
799
+ pshufb xmm12, xmm8
800
+ pshufb xmm13, xmm8
801
+ pshufb xmm14, xmm8
802
+ paddd xmm10, xmm15
803
+ paddd xmm11, xmm12
804
+ movdqa xmm8, xmmword ptr [rsp+0x100]
805
+ paddd xmm8, xmm13
806
+ paddd xmm9, xmm14
807
+ pxor xmm5, xmm10
808
+ pxor xmm6, xmm11
809
+ pxor xmm7, xmm8
810
+ pxor xmm4, xmm9
811
+ movdqa xmmword ptr [rsp+0x100], xmm8
812
+ movdqa xmm8, xmm5
813
+ psrld xmm8, 12
814
+ pslld xmm5, 20
815
+ por xmm5, xmm8
816
+ movdqa xmm8, xmm6
817
+ psrld xmm8, 12
818
+ pslld xmm6, 20
819
+ por xmm6, xmm8
820
+ movdqa xmm8, xmm7
821
+ psrld xmm8, 12
822
+ pslld xmm7, 20
823
+ por xmm7, xmm8
824
+ movdqa xmm8, xmm4
825
+ psrld xmm8, 12
826
+ pslld xmm4, 20
827
+ por xmm4, xmm8
828
+ paddd xmm0, xmmword ptr [rsp]
829
+ paddd xmm1, xmmword ptr [rsp+0x20]
830
+ paddd xmm2, xmmword ptr [rsp+0x80]
831
+ paddd xmm3, xmmword ptr [rsp+0x60]
832
+ paddd xmm0, xmm5
833
+ paddd xmm1, xmm6
834
+ paddd xmm2, xmm7
835
+ paddd xmm3, xmm4
836
+ pxor xmm15, xmm0
837
+ pxor xmm12, xmm1
838
+ pxor xmm13, xmm2
839
+ pxor xmm14, xmm3
840
+ movdqa xmm8, xmmword ptr [ROT8+rip]
841
+ pshufb xmm15, xmm8
842
+ pshufb xmm12, xmm8
843
+ pshufb xmm13, xmm8
844
+ pshufb xmm14, xmm8
845
+ paddd xmm10, xmm15
846
+ paddd xmm11, xmm12
847
+ movdqa xmm8, xmmword ptr [rsp+0x100]
848
+ paddd xmm8, xmm13
849
+ paddd xmm9, xmm14
850
+ pxor xmm5, xmm10
851
+ pxor xmm6, xmm11
852
+ pxor xmm7, xmm8
853
+ pxor xmm4, xmm9
854
+ movdqa xmmword ptr [rsp+0x100], xmm8
855
+ movdqa xmm8, xmm5
856
+ psrld xmm8, 7
857
+ pslld xmm5, 25
858
+ por xmm5, xmm8
859
+ movdqa xmm8, xmm6
860
+ psrld xmm8, 7
861
+ pslld xmm6, 25
862
+ por xmm6, xmm8
863
+ movdqa xmm8, xmm7
864
+ psrld xmm8, 7
865
+ pslld xmm7, 25
866
+ por xmm7, xmm8
867
+ movdqa xmm8, xmm4
868
+ psrld xmm8, 7
869
+ pslld xmm4, 25
870
+ por xmm4, xmm8
871
+ paddd xmm0, xmmword ptr [rsp+0xC0]
872
+ paddd xmm1, xmmword ptr [rsp+0x90]
873
+ paddd xmm2, xmmword ptr [rsp+0xF0]
874
+ paddd xmm3, xmmword ptr [rsp+0xE0]
875
+ paddd xmm0, xmm4
876
+ paddd xmm1, xmm5
877
+ paddd xmm2, xmm6
878
+ paddd xmm3, xmm7
879
+ pxor xmm12, xmm0
880
+ pxor xmm13, xmm1
881
+ pxor xmm14, xmm2
882
+ pxor xmm15, xmm3
883
+ movdqa xmm8, xmmword ptr [ROT16+rip]
884
+ pshufb xmm12, xmm8
885
+ pshufb xmm13, xmm8
886
+ pshufb xmm14, xmm8
887
+ pshufb xmm15, xmm8
888
+ movdqa xmm8, xmmword ptr [rsp+0x100]
889
+ paddd xmm8, xmm12
890
+ paddd xmm9, xmm13
891
+ paddd xmm10, xmm14
892
+ paddd xmm11, xmm15
893
+ pxor xmm4, xmm8
894
+ pxor xmm5, xmm9
895
+ pxor xmm6, xmm10
896
+ pxor xmm7, xmm11
897
+ movdqa xmmword ptr [rsp+0x100], xmm8
898
+ movdqa xmm8, xmm4
899
+ psrld xmm8, 12
900
+ pslld xmm4, 20
901
+ por xmm4, xmm8
902
+ movdqa xmm8, xmm5
903
+ psrld xmm8, 12
904
+ pslld xmm5, 20
905
+ por xmm5, xmm8
906
+ movdqa xmm8, xmm6
907
+ psrld xmm8, 12
908
+ pslld xmm6, 20
909
+ por xmm6, xmm8
910
+ movdqa xmm8, xmm7
911
+ psrld xmm8, 12
912
+ pslld xmm7, 20
913
+ por xmm7, xmm8
914
+ paddd xmm0, xmmword ptr [rsp+0xD0]
915
+ paddd xmm1, xmmword ptr [rsp+0xB0]
916
+ paddd xmm2, xmmword ptr [rsp+0xA0]
917
+ paddd xmm3, xmmword ptr [rsp+0x80]
918
+ paddd xmm0, xmm4
919
+ paddd xmm1, xmm5
920
+ paddd xmm2, xmm6
921
+ paddd xmm3, xmm7
922
+ pxor xmm12, xmm0
923
+ pxor xmm13, xmm1
924
+ pxor xmm14, xmm2
925
+ pxor xmm15, xmm3
926
+ movdqa xmm8, xmmword ptr [ROT8+rip]
927
+ pshufb xmm12, xmm8
928
+ pshufb xmm13, xmm8
929
+ pshufb xmm14, xmm8
930
+ pshufb xmm15, xmm8
931
+ movdqa xmm8, xmmword ptr [rsp+0x100]
932
+ paddd xmm8, xmm12
933
+ paddd xmm9, xmm13
934
+ paddd xmm10, xmm14
935
+ paddd xmm11, xmm15
936
+ pxor xmm4, xmm8
937
+ pxor xmm5, xmm9
938
+ pxor xmm6, xmm10
939
+ pxor xmm7, xmm11
940
+ movdqa xmmword ptr [rsp+0x100], xmm8
941
+ movdqa xmm8, xmm4
942
+ psrld xmm8, 7
943
+ pslld xmm4, 25
944
+ por xmm4, xmm8
945
+ movdqa xmm8, xmm5
946
+ psrld xmm8, 7
947
+ pslld xmm5, 25
948
+ por xmm5, xmm8
949
+ movdqa xmm8, xmm6
950
+ psrld xmm8, 7
951
+ pslld xmm6, 25
952
+ por xmm6, xmm8
953
+ movdqa xmm8, xmm7
954
+ psrld xmm8, 7
955
+ pslld xmm7, 25
956
+ por xmm7, xmm8
957
+ paddd xmm0, xmmword ptr [rsp+0x70]
958
+ paddd xmm1, xmmword ptr [rsp+0x50]
959
+ paddd xmm2, xmmword ptr [rsp]
960
+ paddd xmm3, xmmword ptr [rsp+0x60]
961
+ paddd xmm0, xmm5
962
+ paddd xmm1, xmm6
963
+ paddd xmm2, xmm7
964
+ paddd xmm3, xmm4
965
+ pxor xmm15, xmm0
966
+ pxor xmm12, xmm1
967
+ pxor xmm13, xmm2
968
+ pxor xmm14, xmm3
969
+ movdqa xmm8, xmmword ptr [ROT16+rip]
970
+ pshufb xmm15, xmm8
971
+ pshufb xmm12, xmm8
972
+ pshufb xmm13, xmm8
973
+ pshufb xmm14, xmm8
974
+ paddd xmm10, xmm15
975
+ paddd xmm11, xmm12
976
+ movdqa xmm8, xmmword ptr [rsp+0x100]
977
+ paddd xmm8, xmm13
978
+ paddd xmm9, xmm14
979
+ pxor xmm5, xmm10
980
+ pxor xmm6, xmm11
981
+ pxor xmm7, xmm8
982
+ pxor xmm4, xmm9
983
+ movdqa xmmword ptr [rsp+0x100], xmm8
984
+ movdqa xmm8, xmm5
985
+ psrld xmm8, 12
986
+ pslld xmm5, 20
987
+ por xmm5, xmm8
988
+ movdqa xmm8, xmm6
989
+ psrld xmm8, 12
990
+ pslld xmm6, 20
991
+ por xmm6, xmm8
992
+ movdqa xmm8, xmm7
993
+ psrld xmm8, 12
994
+ pslld xmm7, 20
995
+ por xmm7, xmm8
996
+ movdqa xmm8, xmm4
997
+ psrld xmm8, 12
998
+ pslld xmm4, 20
999
+ por xmm4, xmm8
1000
+ paddd xmm0, xmmword ptr [rsp+0x20]
1001
+ paddd xmm1, xmmword ptr [rsp+0x30]
1002
+ paddd xmm2, xmmword ptr [rsp+0x10]
1003
+ paddd xmm3, xmmword ptr [rsp+0x40]
1004
+ paddd xmm0, xmm5
1005
+ paddd xmm1, xmm6
1006
+ paddd xmm2, xmm7
1007
+ paddd xmm3, xmm4
1008
+ pxor xmm15, xmm0
1009
+ pxor xmm12, xmm1
1010
+ pxor xmm13, xmm2
1011
+ pxor xmm14, xmm3
1012
+ movdqa xmm8, xmmword ptr [ROT8+rip]
1013
+ pshufb xmm15, xmm8
1014
+ pshufb xmm12, xmm8
1015
+ pshufb xmm13, xmm8
1016
+ pshufb xmm14, xmm8
1017
+ paddd xmm10, xmm15
1018
+ paddd xmm11, xmm12
1019
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1020
+ paddd xmm8, xmm13
1021
+ paddd xmm9, xmm14
1022
+ pxor xmm5, xmm10
1023
+ pxor xmm6, xmm11
1024
+ pxor xmm7, xmm8
1025
+ pxor xmm4, xmm9
1026
+ movdqa xmmword ptr [rsp+0x100], xmm8
1027
+ movdqa xmm8, xmm5
1028
+ psrld xmm8, 7
1029
+ pslld xmm5, 25
1030
+ por xmm5, xmm8
1031
+ movdqa xmm8, xmm6
1032
+ psrld xmm8, 7
1033
+ pslld xmm6, 25
1034
+ por xmm6, xmm8
1035
+ movdqa xmm8, xmm7
1036
+ psrld xmm8, 7
1037
+ pslld xmm7, 25
1038
+ por xmm7, xmm8
1039
+ movdqa xmm8, xmm4
1040
+ psrld xmm8, 7
1041
+ pslld xmm4, 25
1042
+ por xmm4, xmm8
1043
+ paddd xmm0, xmmword ptr [rsp+0x90]
1044
+ paddd xmm1, xmmword ptr [rsp+0xB0]
1045
+ paddd xmm2, xmmword ptr [rsp+0x80]
1046
+ paddd xmm3, xmmword ptr [rsp+0xF0]
1047
+ paddd xmm0, xmm4
1048
+ paddd xmm1, xmm5
1049
+ paddd xmm2, xmm6
1050
+ paddd xmm3, xmm7
1051
+ pxor xmm12, xmm0
1052
+ pxor xmm13, xmm1
1053
+ pxor xmm14, xmm2
1054
+ pxor xmm15, xmm3
1055
+ movdqa xmm8, xmmword ptr [ROT16+rip]
1056
+ pshufb xmm12, xmm8
1057
+ pshufb xmm13, xmm8
1058
+ pshufb xmm14, xmm8
1059
+ pshufb xmm15, xmm8
1060
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1061
+ paddd xmm8, xmm12
1062
+ paddd xmm9, xmm13
1063
+ paddd xmm10, xmm14
1064
+ paddd xmm11, xmm15
1065
+ pxor xmm4, xmm8
1066
+ pxor xmm5, xmm9
1067
+ pxor xmm6, xmm10
1068
+ pxor xmm7, xmm11
1069
+ movdqa xmmword ptr [rsp+0x100], xmm8
1070
+ movdqa xmm8, xmm4
1071
+ psrld xmm8, 12
1072
+ pslld xmm4, 20
1073
+ por xmm4, xmm8
1074
+ movdqa xmm8, xmm5
1075
+ psrld xmm8, 12
1076
+ pslld xmm5, 20
1077
+ por xmm5, xmm8
1078
+ movdqa xmm8, xmm6
1079
+ psrld xmm8, 12
1080
+ pslld xmm6, 20
1081
+ por xmm6, xmm8
1082
+ movdqa xmm8, xmm7
1083
+ psrld xmm8, 12
1084
+ pslld xmm7, 20
1085
+ por xmm7, xmm8
1086
+ paddd xmm0, xmmword ptr [rsp+0xE0]
1087
+ paddd xmm1, xmmword ptr [rsp+0x50]
1088
+ paddd xmm2, xmmword ptr [rsp+0xC0]
1089
+ paddd xmm3, xmmword ptr [rsp+0x10]
1090
+ paddd xmm0, xmm4
1091
+ paddd xmm1, xmm5
1092
+ paddd xmm2, xmm6
1093
+ paddd xmm3, xmm7
1094
+ pxor xmm12, xmm0
1095
+ pxor xmm13, xmm1
1096
+ pxor xmm14, xmm2
1097
+ pxor xmm15, xmm3
1098
+ movdqa xmm8, xmmword ptr [ROT8+rip]
1099
+ pshufb xmm12, xmm8
1100
+ pshufb xmm13, xmm8
1101
+ pshufb xmm14, xmm8
1102
+ pshufb xmm15, xmm8
1103
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1104
+ paddd xmm8, xmm12
1105
+ paddd xmm9, xmm13
1106
+ paddd xmm10, xmm14
1107
+ paddd xmm11, xmm15
1108
+ pxor xmm4, xmm8
1109
+ pxor xmm5, xmm9
1110
+ pxor xmm6, xmm10
1111
+ pxor xmm7, xmm11
1112
+ movdqa xmmword ptr [rsp+0x100], xmm8
1113
+ movdqa xmm8, xmm4
1114
+ psrld xmm8, 7
1115
+ pslld xmm4, 25
1116
+ por xmm4, xmm8
1117
+ movdqa xmm8, xmm5
1118
+ psrld xmm8, 7
1119
+ pslld xmm5, 25
1120
+ por xmm5, xmm8
1121
+ movdqa xmm8, xmm6
1122
+ psrld xmm8, 7
1123
+ pslld xmm6, 25
1124
+ por xmm6, xmm8
1125
+ movdqa xmm8, xmm7
1126
+ psrld xmm8, 7
1127
+ pslld xmm7, 25
1128
+ por xmm7, xmm8
1129
+ paddd xmm0, xmmword ptr [rsp+0xD0]
1130
+ paddd xmm1, xmmword ptr [rsp]
1131
+ paddd xmm2, xmmword ptr [rsp+0x20]
1132
+ paddd xmm3, xmmword ptr [rsp+0x40]
1133
+ paddd xmm0, xmm5
1134
+ paddd xmm1, xmm6
1135
+ paddd xmm2, xmm7
1136
+ paddd xmm3, xmm4
1137
+ pxor xmm15, xmm0
1138
+ pxor xmm12, xmm1
1139
+ pxor xmm13, xmm2
1140
+ pxor xmm14, xmm3
1141
+ movdqa xmm8, xmmword ptr [ROT16+rip]
1142
+ pshufb xmm15, xmm8
1143
+ pshufb xmm12, xmm8
1144
+ pshufb xmm13, xmm8
1145
+ pshufb xmm14, xmm8
1146
+ paddd xmm10, xmm15
1147
+ paddd xmm11, xmm12
1148
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1149
+ paddd xmm8, xmm13
1150
+ paddd xmm9, xmm14
1151
+ pxor xmm5, xmm10
1152
+ pxor xmm6, xmm11
1153
+ pxor xmm7, xmm8
1154
+ pxor xmm4, xmm9
1155
+ movdqa xmmword ptr [rsp+0x100], xmm8
1156
+ movdqa xmm8, xmm5
1157
+ psrld xmm8, 12
1158
+ pslld xmm5, 20
1159
+ por xmm5, xmm8
1160
+ movdqa xmm8, xmm6
1161
+ psrld xmm8, 12
1162
+ pslld xmm6, 20
1163
+ por xmm6, xmm8
1164
+ movdqa xmm8, xmm7
1165
+ psrld xmm8, 12
1166
+ pslld xmm7, 20
1167
+ por xmm7, xmm8
1168
+ movdqa xmm8, xmm4
1169
+ psrld xmm8, 12
1170
+ pslld xmm4, 20
1171
+ por xmm4, xmm8
1172
+ paddd xmm0, xmmword ptr [rsp+0x30]
1173
+ paddd xmm1, xmmword ptr [rsp+0xA0]
1174
+ paddd xmm2, xmmword ptr [rsp+0x60]
1175
+ paddd xmm3, xmmword ptr [rsp+0x70]
1176
+ paddd xmm0, xmm5
1177
+ paddd xmm1, xmm6
1178
+ paddd xmm2, xmm7
1179
+ paddd xmm3, xmm4
1180
+ pxor xmm15, xmm0
1181
+ pxor xmm12, xmm1
1182
+ pxor xmm13, xmm2
1183
+ pxor xmm14, xmm3
1184
+ movdqa xmm8, xmmword ptr [ROT8+rip]
1185
+ pshufb xmm15, xmm8
1186
+ pshufb xmm12, xmm8
1187
+ pshufb xmm13, xmm8
1188
+ pshufb xmm14, xmm8
1189
+ paddd xmm10, xmm15
1190
+ paddd xmm11, xmm12
1191
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1192
+ paddd xmm8, xmm13
1193
+ paddd xmm9, xmm14
1194
+ pxor xmm5, xmm10
1195
+ pxor xmm6, xmm11
1196
+ pxor xmm7, xmm8
1197
+ pxor xmm4, xmm9
1198
+ movdqa xmmword ptr [rsp+0x100], xmm8
1199
+ movdqa xmm8, xmm5
1200
+ psrld xmm8, 7
1201
+ pslld xmm5, 25
1202
+ por xmm5, xmm8
1203
+ movdqa xmm8, xmm6
1204
+ psrld xmm8, 7
1205
+ pslld xmm6, 25
1206
+ por xmm6, xmm8
1207
+ movdqa xmm8, xmm7
1208
+ psrld xmm8, 7
1209
+ pslld xmm7, 25
1210
+ por xmm7, xmm8
1211
+ movdqa xmm8, xmm4
1212
+ psrld xmm8, 7
1213
+ pslld xmm4, 25
1214
+ por xmm4, xmm8
1215
+ paddd xmm0, xmmword ptr [rsp+0xB0]
1216
+ paddd xmm1, xmmword ptr [rsp+0x50]
1217
+ paddd xmm2, xmmword ptr [rsp+0x10]
1218
+ paddd xmm3, xmmword ptr [rsp+0x80]
1219
+ paddd xmm0, xmm4
1220
+ paddd xmm1, xmm5
1221
+ paddd xmm2, xmm6
1222
+ paddd xmm3, xmm7
1223
+ pxor xmm12, xmm0
1224
+ pxor xmm13, xmm1
1225
+ pxor xmm14, xmm2
1226
+ pxor xmm15, xmm3
1227
+ movdqa xmm8, xmmword ptr [ROT16+rip]
1228
+ pshufb xmm12, xmm8
1229
+ pshufb xmm13, xmm8
1230
+ pshufb xmm14, xmm8
1231
+ pshufb xmm15, xmm8
1232
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1233
+ paddd xmm8, xmm12
1234
+ paddd xmm9, xmm13
1235
+ paddd xmm10, xmm14
1236
+ paddd xmm11, xmm15
1237
+ pxor xmm4, xmm8
1238
+ pxor xmm5, xmm9
1239
+ pxor xmm6, xmm10
1240
+ pxor xmm7, xmm11
1241
+ movdqa xmmword ptr [rsp+0x100], xmm8
1242
+ movdqa xmm8, xmm4
1243
+ psrld xmm8, 12
1244
+ pslld xmm4, 20
1245
+ por xmm4, xmm8
1246
+ movdqa xmm8, xmm5
1247
+ psrld xmm8, 12
1248
+ pslld xmm5, 20
1249
+ por xmm5, xmm8
1250
+ movdqa xmm8, xmm6
1251
+ psrld xmm8, 12
1252
+ pslld xmm6, 20
1253
+ por xmm6, xmm8
1254
+ movdqa xmm8, xmm7
1255
+ psrld xmm8, 12
1256
+ pslld xmm7, 20
1257
+ por xmm7, xmm8
1258
+ paddd xmm0, xmmword ptr [rsp+0xF0]
1259
+ paddd xmm1, xmmword ptr [rsp]
1260
+ paddd xmm2, xmmword ptr [rsp+0x90]
1261
+ paddd xmm3, xmmword ptr [rsp+0x60]
1262
+ paddd xmm0, xmm4
1263
+ paddd xmm1, xmm5
1264
+ paddd xmm2, xmm6
1265
+ paddd xmm3, xmm7
1266
+ pxor xmm12, xmm0
1267
+ pxor xmm13, xmm1
1268
+ pxor xmm14, xmm2
1269
+ pxor xmm15, xmm3
1270
+ movdqa xmm8, xmmword ptr [ROT8+rip]
1271
+ pshufb xmm12, xmm8
1272
+ pshufb xmm13, xmm8
1273
+ pshufb xmm14, xmm8
1274
+ pshufb xmm15, xmm8
1275
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1276
+ paddd xmm8, xmm12
1277
+ paddd xmm9, xmm13
1278
+ paddd xmm10, xmm14
1279
+ paddd xmm11, xmm15
1280
+ pxor xmm4, xmm8
1281
+ pxor xmm5, xmm9
1282
+ pxor xmm6, xmm10
1283
+ pxor xmm7, xmm11
1284
+ movdqa xmmword ptr [rsp+0x100], xmm8
1285
+ movdqa xmm8, xmm4
1286
+ psrld xmm8, 7
1287
+ pslld xmm4, 25
1288
+ por xmm4, xmm8
1289
+ movdqa xmm8, xmm5
1290
+ psrld xmm8, 7
1291
+ pslld xmm5, 25
1292
+ por xmm5, xmm8
1293
+ movdqa xmm8, xmm6
1294
+ psrld xmm8, 7
1295
+ pslld xmm6, 25
1296
+ por xmm6, xmm8
1297
+ movdqa xmm8, xmm7
1298
+ psrld xmm8, 7
1299
+ pslld xmm7, 25
1300
+ por xmm7, xmm8
1301
+ paddd xmm0, xmmword ptr [rsp+0xE0]
1302
+ paddd xmm1, xmmword ptr [rsp+0x20]
1303
+ paddd xmm2, xmmword ptr [rsp+0x30]
1304
+ paddd xmm3, xmmword ptr [rsp+0x70]
1305
+ paddd xmm0, xmm5
1306
+ paddd xmm1, xmm6
1307
+ paddd xmm2, xmm7
1308
+ paddd xmm3, xmm4
1309
+ pxor xmm15, xmm0
1310
+ pxor xmm12, xmm1
1311
+ pxor xmm13, xmm2
1312
+ pxor xmm14, xmm3
1313
+ movdqa xmm8, xmmword ptr [ROT16+rip]
1314
+ pshufb xmm15, xmm8
1315
+ pshufb xmm12, xmm8
1316
+ pshufb xmm13, xmm8
1317
+ pshufb xmm14, xmm8
1318
+ paddd xmm10, xmm15
1319
+ paddd xmm11, xmm12
1320
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1321
+ paddd xmm8, xmm13
1322
+ paddd xmm9, xmm14
1323
+ pxor xmm5, xmm10
1324
+ pxor xmm6, xmm11
1325
+ pxor xmm7, xmm8
1326
+ pxor xmm4, xmm9
1327
+ movdqa xmmword ptr [rsp+0x100], xmm8
1328
+ movdqa xmm8, xmm5
1329
+ psrld xmm8, 12
1330
+ pslld xmm5, 20
1331
+ por xmm5, xmm8
1332
+ movdqa xmm8, xmm6
1333
+ psrld xmm8, 12
1334
+ pslld xmm6, 20
1335
+ por xmm6, xmm8
1336
+ movdqa xmm8, xmm7
1337
+ psrld xmm8, 12
1338
+ pslld xmm7, 20
1339
+ por xmm7, xmm8
1340
+ movdqa xmm8, xmm4
1341
+ psrld xmm8, 12
1342
+ pslld xmm4, 20
1343
+ por xmm4, xmm8
1344
+ paddd xmm0, xmmword ptr [rsp+0xA0]
1345
+ paddd xmm1, xmmword ptr [rsp+0xC0]
1346
+ paddd xmm2, xmmword ptr [rsp+0x40]
1347
+ paddd xmm3, xmmword ptr [rsp+0xD0]
1348
+ paddd xmm0, xmm5
1349
+ paddd xmm1, xmm6
1350
+ paddd xmm2, xmm7
1351
+ paddd xmm3, xmm4
1352
+ pxor xmm15, xmm0
1353
+ pxor xmm12, xmm1
1354
+ pxor xmm13, xmm2
1355
+ pxor xmm14, xmm3
1356
+ movdqa xmm8, xmmword ptr [ROT8+rip]
1357
+ pshufb xmm15, xmm8
1358
+ pshufb xmm12, xmm8
1359
+ pshufb xmm13, xmm8
1360
+ pshufb xmm14, xmm8
1361
+ paddd xmm10, xmm15
1362
+ paddd xmm11, xmm12
1363
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1364
+ paddd xmm8, xmm13
1365
+ paddd xmm9, xmm14
1366
+ pxor xmm5, xmm10
1367
+ pxor xmm6, xmm11
1368
+ pxor xmm7, xmm8
1369
+ pxor xmm4, xmm9
1370
+ pxor xmm0, xmm8
1371
+ pxor xmm1, xmm9
1372
+ pxor xmm2, xmm10
1373
+ pxor xmm3, xmm11
1374
+ movdqa xmm8, xmm5
1375
+ psrld xmm8, 7
1376
+ pslld xmm5, 25
1377
+ por xmm5, xmm8
1378
+ movdqa xmm8, xmm6
1379
+ psrld xmm8, 7
1380
+ pslld xmm6, 25
1381
+ por xmm6, xmm8
1382
+ movdqa xmm8, xmm7
1383
+ psrld xmm8, 7
1384
+ pslld xmm7, 25
1385
+ por xmm7, xmm8
1386
+ movdqa xmm8, xmm4
1387
+ psrld xmm8, 7
1388
+ pslld xmm4, 25
1389
+ por xmm4, xmm8
1390
+ pxor xmm4, xmm12
1391
+ pxor xmm5, xmm13
1392
+ pxor xmm6, xmm14
1393
+ pxor xmm7, xmm15
1394
+ mov eax, r13d
1395
+ jne 9b
1396
+ movdqa xmm9, xmm0
1397
+ punpckldq xmm0, xmm1
1398
+ punpckhdq xmm9, xmm1
1399
+ movdqa xmm11, xmm2
1400
+ punpckldq xmm2, xmm3
1401
+ punpckhdq xmm11, xmm3
1402
+ movdqa xmm1, xmm0
1403
+ punpcklqdq xmm0, xmm2
1404
+ punpckhqdq xmm1, xmm2
1405
+ movdqa xmm3, xmm9
1406
+ punpcklqdq xmm9, xmm11
1407
+ punpckhqdq xmm3, xmm11
1408
+ movdqu xmmword ptr [rbx], xmm0
1409
+ movdqu xmmword ptr [rbx+0x20], xmm1
1410
+ movdqu xmmword ptr [rbx+0x40], xmm9
1411
+ movdqu xmmword ptr [rbx+0x60], xmm3
1412
+ movdqa xmm9, xmm4
1413
+ punpckldq xmm4, xmm5
1414
+ punpckhdq xmm9, xmm5
1415
+ movdqa xmm11, xmm6
1416
+ punpckldq xmm6, xmm7
1417
+ punpckhdq xmm11, xmm7
1418
+ movdqa xmm5, xmm4
1419
+ punpcklqdq xmm4, xmm6
1420
+ punpckhqdq xmm5, xmm6
1421
+ movdqa xmm7, xmm9
1422
+ punpcklqdq xmm9, xmm11
1423
+ punpckhqdq xmm7, xmm11
1424
+ movdqu xmmword ptr [rbx+0x10], xmm4
1425
+ movdqu xmmword ptr [rbx+0x30], xmm5
1426
+ movdqu xmmword ptr [rbx+0x50], xmm9
1427
+ movdqu xmmword ptr [rbx+0x70], xmm7
1428
+ movdqa xmm1, xmmword ptr [rsp+0x110]
1429
+ movdqa xmm0, xmm1
1430
+ paddd xmm1, xmmword ptr [rsp+0x150]
1431
+ movdqa xmmword ptr [rsp+0x110], xmm1
1432
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1433
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1434
+ pcmpgtd xmm0, xmm1
1435
+ movdqa xmm1, xmmword ptr [rsp+0x120]
1436
+ psubd xmm1, xmm0
1437
+ movdqa xmmword ptr [rsp+0x120], xmm1
1438
+ add rbx, 128
1439
+ add rdi, 32
1440
+ sub rsi, 4
1441
+ cmp rsi, 4
1442
+ jnc 2b
1443
+ test rsi, rsi
1444
+ jnz 3f
1445
+ 4:
1446
+ mov rsp, rbp
1447
+ pop rbp
1448
+ pop rbx
1449
+ pop r12
1450
+ pop r13
1451
+ pop r14
1452
+ pop r15
1453
+ ret
1454
+ .p2align 5
1455
+ 3:
1456
+ test esi, 0x2
1457
+ je 3f
1458
+ movups xmm0, xmmword ptr [rcx]
1459
+ movups xmm1, xmmword ptr [rcx+0x10]
1460
+ movaps xmm8, xmm0
1461
+ movaps xmm9, xmm1
1462
+ movd xmm13, dword ptr [rsp+0x110]
1463
+ pinsrd xmm13, dword ptr [rsp+0x120], 1
1464
+ pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1465
+ movaps xmmword ptr [rsp], xmm13
1466
+ movd xmm14, dword ptr [rsp+0x114]
1467
+ pinsrd xmm14, dword ptr [rsp+0x124], 1
1468
+ pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1469
+ movaps xmmword ptr [rsp+0x10], xmm14
1470
+ mov r8, qword ptr [rdi]
1471
+ mov r9, qword ptr [rdi+0x8]
1472
+ movzx eax, byte ptr [rbp+0x40]
1473
+ or eax, r13d
1474
+ xor edx, edx
1475
+ 2:
1476
+ mov r14d, eax
1477
+ or eax, r12d
1478
+ add rdx, 64
1479
+ cmp rdx, r15
1480
+ cmovne eax, r14d
1481
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1482
+ movaps xmm10, xmm2
1483
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
1484
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
1485
+ movaps xmm3, xmm4
1486
+ shufps xmm4, xmm5, 136
1487
+ shufps xmm3, xmm5, 221
1488
+ movaps xmm5, xmm3
1489
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
1490
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
1491
+ movaps xmm3, xmm6
1492
+ shufps xmm6, xmm7, 136
1493
+ pshufd xmm6, xmm6, 0x93
1494
+ shufps xmm3, xmm7, 221
1495
+ pshufd xmm7, xmm3, 0x93
1496
+ movups xmm12, xmmword ptr [r9+rdx-0x40]
1497
+ movups xmm13, xmmword ptr [r9+rdx-0x30]
1498
+ movaps xmm11, xmm12
1499
+ shufps xmm12, xmm13, 136
1500
+ shufps xmm11, xmm13, 221
1501
+ movaps xmm13, xmm11
1502
+ movups xmm14, xmmword ptr [r9+rdx-0x20]
1503
+ movups xmm15, xmmword ptr [r9+rdx-0x10]
1504
+ movaps xmm11, xmm14
1505
+ shufps xmm14, xmm15, 136
1506
+ pshufd xmm14, xmm14, 0x93
1507
+ shufps xmm11, xmm15, 221
1508
+ pshufd xmm15, xmm11, 0x93
1509
+ movaps xmm3, xmmword ptr [rsp]
1510
+ movaps xmm11, xmmword ptr [rsp+0x10]
1511
+ pinsrd xmm3, eax, 3
1512
+ pinsrd xmm11, eax, 3
1513
+ mov al, 7
1514
+ 9:
1515
+ paddd xmm0, xmm4
1516
+ paddd xmm8, xmm12
1517
+ movaps xmmword ptr [rsp+0x20], xmm4
1518
+ movaps xmmword ptr [rsp+0x30], xmm12
1519
+ paddd xmm0, xmm1
1520
+ paddd xmm8, xmm9
1521
+ pxor xmm3, xmm0
1522
+ pxor xmm11, xmm8
1523
+ movaps xmm12, xmmword ptr [ROT16+rip]
1524
+ pshufb xmm3, xmm12
1525
+ pshufb xmm11, xmm12
1526
+ paddd xmm2, xmm3
1527
+ paddd xmm10, xmm11
1528
+ pxor xmm1, xmm2
1529
+ pxor xmm9, xmm10
1530
+ movdqa xmm4, xmm1
1531
+ pslld xmm1, 20
1532
+ psrld xmm4, 12
1533
+ por xmm1, xmm4
1534
+ movdqa xmm4, xmm9
1535
+ pslld xmm9, 20
1536
+ psrld xmm4, 12
1537
+ por xmm9, xmm4
1538
+ paddd xmm0, xmm5
1539
+ paddd xmm8, xmm13
1540
+ movaps xmmword ptr [rsp+0x40], xmm5
1541
+ movaps xmmword ptr [rsp+0x50], xmm13
1542
+ paddd xmm0, xmm1
1543
+ paddd xmm8, xmm9
1544
+ pxor xmm3, xmm0
1545
+ pxor xmm11, xmm8
1546
+ movaps xmm13, xmmword ptr [ROT8+rip]
1547
+ pshufb xmm3, xmm13
1548
+ pshufb xmm11, xmm13
1549
+ paddd xmm2, xmm3
1550
+ paddd xmm10, xmm11
1551
+ pxor xmm1, xmm2
1552
+ pxor xmm9, xmm10
1553
+ movdqa xmm4, xmm1
1554
+ pslld xmm1, 25
1555
+ psrld xmm4, 7
1556
+ por xmm1, xmm4
1557
+ movdqa xmm4, xmm9
1558
+ pslld xmm9, 25
1559
+ psrld xmm4, 7
1560
+ por xmm9, xmm4
1561
+ pshufd xmm0, xmm0, 0x93
1562
+ pshufd xmm8, xmm8, 0x93
1563
+ pshufd xmm3, xmm3, 0x4E
1564
+ pshufd xmm11, xmm11, 0x4E
1565
+ pshufd xmm2, xmm2, 0x39
1566
+ pshufd xmm10, xmm10, 0x39
1567
+ paddd xmm0, xmm6
1568
+ paddd xmm8, xmm14
1569
+ paddd xmm0, xmm1
1570
+ paddd xmm8, xmm9
1571
+ pxor xmm3, xmm0
1572
+ pxor xmm11, xmm8
1573
+ pshufb xmm3, xmm12
1574
+ pshufb xmm11, xmm12
1575
+ paddd xmm2, xmm3
1576
+ paddd xmm10, xmm11
1577
+ pxor xmm1, xmm2
1578
+ pxor xmm9, xmm10
1579
+ movdqa xmm4, xmm1
1580
+ pslld xmm1, 20
1581
+ psrld xmm4, 12
1582
+ por xmm1, xmm4
1583
+ movdqa xmm4, xmm9
1584
+ pslld xmm9, 20
1585
+ psrld xmm4, 12
1586
+ por xmm9, xmm4
1587
+ paddd xmm0, xmm7
1588
+ paddd xmm8, xmm15
1589
+ paddd xmm0, xmm1
1590
+ paddd xmm8, xmm9
1591
+ pxor xmm3, xmm0
1592
+ pxor xmm11, xmm8
1593
+ pshufb xmm3, xmm13
1594
+ pshufb xmm11, xmm13
1595
+ paddd xmm2, xmm3
1596
+ paddd xmm10, xmm11
1597
+ pxor xmm1, xmm2
1598
+ pxor xmm9, xmm10
1599
+ movdqa xmm4, xmm1
1600
+ pslld xmm1, 25
1601
+ psrld xmm4, 7
1602
+ por xmm1, xmm4
1603
+ movdqa xmm4, xmm9
1604
+ pslld xmm9, 25
1605
+ psrld xmm4, 7
1606
+ por xmm9, xmm4
1607
+ pshufd xmm0, xmm0, 0x39
1608
+ pshufd xmm8, xmm8, 0x39
1609
+ pshufd xmm3, xmm3, 0x4E
1610
+ pshufd xmm11, xmm11, 0x4E
1611
+ pshufd xmm2, xmm2, 0x93
1612
+ pshufd xmm10, xmm10, 0x93
1613
+ dec al
1614
+ je 9f
1615
+ movdqa xmm12, xmmword ptr [rsp+0x20]
1616
+ movdqa xmm5, xmmword ptr [rsp+0x40]
1617
+ pshufd xmm13, xmm12, 0x0F
1618
+ shufps xmm12, xmm5, 214
1619
+ pshufd xmm4, xmm12, 0x39
1620
+ movdqa xmm12, xmm6
1621
+ shufps xmm12, xmm7, 250
1622
+ pblendw xmm13, xmm12, 0xCC
1623
+ movdqa xmm12, xmm7
1624
+ punpcklqdq xmm12, xmm5
1625
+ pblendw xmm12, xmm6, 0xC0
1626
+ pshufd xmm12, xmm12, 0x78
1627
+ punpckhdq xmm5, xmm7
1628
+ punpckldq xmm6, xmm5
1629
+ pshufd xmm7, xmm6, 0x1E
1630
+ movdqa xmmword ptr [rsp+0x20], xmm13
1631
+ movdqa xmmword ptr [rsp+0x40], xmm12
1632
+ movdqa xmm5, xmmword ptr [rsp+0x30]
1633
+ movdqa xmm13, xmmword ptr [rsp+0x50]
1634
+ pshufd xmm6, xmm5, 0x0F
1635
+ shufps xmm5, xmm13, 214
1636
+ pshufd xmm12, xmm5, 0x39
1637
+ movdqa xmm5, xmm14
1638
+ shufps xmm5, xmm15, 250
1639
+ pblendw xmm6, xmm5, 0xCC
1640
+ movdqa xmm5, xmm15
1641
+ punpcklqdq xmm5, xmm13
1642
+ pblendw xmm5, xmm14, 0xC0
1643
+ pshufd xmm5, xmm5, 0x78
1644
+ punpckhdq xmm13, xmm15
1645
+ punpckldq xmm14, xmm13
1646
+ pshufd xmm15, xmm14, 0x1E
1647
+ movdqa xmm13, xmm6
1648
+ movdqa xmm14, xmm5
1649
+ movdqa xmm5, xmmword ptr [rsp+0x20]
1650
+ movdqa xmm6, xmmword ptr [rsp+0x40]
1651
+ jmp 9b
1652
+ 9:
1653
+ pxor xmm0, xmm2
1654
+ pxor xmm1, xmm3
1655
+ pxor xmm8, xmm10
1656
+ pxor xmm9, xmm11
1657
+ mov eax, r13d
1658
+ cmp rdx, r15
1659
+ jne 2b
1660
+ movups xmmword ptr [rbx], xmm0
1661
+ movups xmmword ptr [rbx+0x10], xmm1
1662
+ movups xmmword ptr [rbx+0x20], xmm8
1663
+ movups xmmword ptr [rbx+0x30], xmm9
1664
+ movdqa xmm0, xmmword ptr [rsp+0x130]
1665
+ movdqa xmm1, xmmword ptr [rsp+0x110]
1666
+ movdqa xmm2, xmmword ptr [rsp+0x120]
1667
+ movdqu xmm3, xmmword ptr [rsp+0x118]
1668
+ movdqu xmm4, xmmword ptr [rsp+0x128]
1669
+ blendvps xmm1, xmm3, xmm0
1670
+ blendvps xmm2, xmm4, xmm0
1671
+ movdqa xmmword ptr [rsp+0x110], xmm1
1672
+ movdqa xmmword ptr [rsp+0x120], xmm2
1673
+ add rdi, 16
1674
+ add rbx, 64
1675
+ sub rsi, 2
1676
+ 3:
1677
+ test esi, 0x1
1678
+ je 4b
1679
+ movups xmm0, xmmword ptr [rcx]
1680
+ movups xmm1, xmmword ptr [rcx+0x10]
1681
+ movd xmm13, dword ptr [rsp+0x110]
1682
+ pinsrd xmm13, dword ptr [rsp+0x120], 1
1683
+ pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1684
+ movaps xmm14, xmmword ptr [ROT8+rip]
1685
+ movaps xmm15, xmmword ptr [ROT16+rip]
1686
+ mov r8, qword ptr [rdi]
1687
+ movzx eax, byte ptr [rbp+0x40]
1688
+ or eax, r13d
1689
+ xor edx, edx
1690
+ 2:
1691
+ mov r14d, eax
1692
+ or eax, r12d
1693
+ add rdx, 64
1694
+ cmp rdx, r15
1695
+ cmovne eax, r14d
1696
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1697
+ movaps xmm3, xmm13
1698
+ pinsrd xmm3, eax, 3
1699
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
1700
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
1701
+ movaps xmm8, xmm4
1702
+ shufps xmm4, xmm5, 136
1703
+ shufps xmm8, xmm5, 221
1704
+ movaps xmm5, xmm8
1705
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
1706
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
1707
+ movaps xmm8, xmm6
1708
+ shufps xmm6, xmm7, 136
1709
+ pshufd xmm6, xmm6, 0x93
1710
+ shufps xmm8, xmm7, 221
1711
+ pshufd xmm7, xmm8, 0x93
1712
+ mov al, 7
1713
+ 9:
1714
+ paddd xmm0, xmm4
1715
+ paddd xmm0, xmm1
1716
+ pxor xmm3, xmm0
1717
+ pshufb xmm3, xmm15
1718
+ paddd xmm2, xmm3
1719
+ pxor xmm1, xmm2
1720
+ movdqa xmm11, xmm1
1721
+ pslld xmm1, 20
1722
+ psrld xmm11, 12
1723
+ por xmm1, xmm11
1724
+ paddd xmm0, xmm5
1725
+ paddd xmm0, xmm1
1726
+ pxor xmm3, xmm0
1727
+ pshufb xmm3, xmm14
1728
+ paddd xmm2, xmm3
1729
+ pxor xmm1, xmm2
1730
+ movdqa xmm11, xmm1
1731
+ pslld xmm1, 25
1732
+ psrld xmm11, 7
1733
+ por xmm1, xmm11
1734
+ pshufd xmm0, xmm0, 0x93
1735
+ pshufd xmm3, xmm3, 0x4E
1736
+ pshufd xmm2, xmm2, 0x39
1737
+ paddd xmm0, xmm6
1738
+ paddd xmm0, xmm1
1739
+ pxor xmm3, xmm0
1740
+ pshufb xmm3, xmm15
1741
+ paddd xmm2, xmm3
1742
+ pxor xmm1, xmm2
1743
+ movdqa xmm11, xmm1
1744
+ pslld xmm1, 20
1745
+ psrld xmm11, 12
1746
+ por xmm1, xmm11
1747
+ paddd xmm0, xmm7
1748
+ paddd xmm0, xmm1
1749
+ pxor xmm3, xmm0
1750
+ pshufb xmm3, xmm14
1751
+ paddd xmm2, xmm3
1752
+ pxor xmm1, xmm2
1753
+ movdqa xmm11, xmm1
1754
+ pslld xmm1, 25
1755
+ psrld xmm11, 7
1756
+ por xmm1, xmm11
1757
+ pshufd xmm0, xmm0, 0x39
1758
+ pshufd xmm3, xmm3, 0x4E
1759
+ pshufd xmm2, xmm2, 0x93
1760
+ dec al
1761
+ jz 9f
1762
+ movdqa xmm8, xmm4
1763
+ shufps xmm8, xmm5, 214
1764
+ pshufd xmm9, xmm4, 0x0F
1765
+ pshufd xmm4, xmm8, 0x39
1766
+ movdqa xmm8, xmm6
1767
+ shufps xmm8, xmm7, 250
1768
+ pblendw xmm9, xmm8, 0xCC
1769
+ movdqa xmm8, xmm7
1770
+ punpcklqdq xmm8, xmm5
1771
+ pblendw xmm8, xmm6, 0xC0
1772
+ pshufd xmm8, xmm8, 0x78
1773
+ punpckhdq xmm5, xmm7
1774
+ punpckldq xmm6, xmm5
1775
+ pshufd xmm7, xmm6, 0x1E
1776
+ movdqa xmm5, xmm9
1777
+ movdqa xmm6, xmm8
1778
+ jmp 9b
1779
+ 9:
1780
+ pxor xmm0, xmm2
1781
+ pxor xmm1, xmm3
1782
+ mov eax, r13d
1783
+ cmp rdx, r15
1784
+ jne 2b
1785
+ movups xmmword ptr [rbx], xmm0
1786
+ movups xmmword ptr [rbx+0x10], xmm1
1787
+ jmp 4b
1788
+
1789
/*
 * blake3_compress_in_place_sse41 / _blake3_compress_in_place_sse41
 *
 * Runs one 7-round compression over a 4x4 state of 32-bit words and writes
 * the 32-byte result back over the input at [rdi].
 *
 * ABI: System V AMD64, Intel operand order. Leaf routine, no stack use.
 *
 * Inputs as consumed by the code (argument names are assumed from the C
 * prototype in blake3_impl.h -- TODO confirm):
 *   rdi  = 32-byte chaining value, read and then overwritten (unaligned ok)
 *   rsi  = 64-byte message block (unaligned ok)
 *   dl   = block length   (only the low byte is used)
 *   rcx  = 64-bit counter
 *   r8b  = flags          (only the low byte is used)
 *
 * Register roles inside the loop:
 *   xmm0..xmm3   state rows 0..3
 *   xmm4..xmm7   message words, pre-shuffled for the current round
 *   xmm14/xmm15  pshufb masks ROT8 / ROT16
 *   xmm8, xmm9, xmm11  scratch
 *   al           rounds remaining
 *
 * Clobbers: rax, rdx, xmm0-xmm9, xmm11, xmm14, xmm15, flags.
 */
        .p2align 6
blake3_compress_in_place_sse41:
_blake3_compress_in_place_sse41:
        _CET_ENDBR                      /* macro defined elsewhere in this file */

        /* Rows 0 and 1 come from [rdi]; row 2 is the IV constant. */
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]

        /*
         * Row 3 = (counter low, counter high, block length, flags).
         * Both byte-sized inputs are zero-extended from dl / r8b before they
         * are packed, so stale upper bits in rdx or r8 can never reach the
         * state. This is the same sequence blake3_compress_xof_sse41 uses.
         */
        movzx   eax, r8b
        movzx   edx, dl
        shl     rax, 32
        add     rdx, rax                /* rdx = block length | flags << 32 */
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4           /* xmm3 = [rcx : rdx] */

        /* Message words 0..7: xmm4 = even-indexed, xmm5 = odd-indexed. */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8

        /* Message words 8..15: same even/odd split, each rotated one lane. */
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93

        movaps  xmm14, xmmword ptr [ROT8+rip]
        movaps  xmm15, xmmword ptr [ROT16+rip]
        mov     al, 7                   /* seven rounds */
9:
        /* Column step, first half: add message, xor, rotate 16 then 12. */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11

        /* Column step, second half: rotate 8 then 7. */
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11

        /* Rotate the lanes of rows 0, 3 and 2 so the next step mixes diagonals. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39

        /* Diagonal step, first half. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11

        /* Diagonal step, second half. */
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11

        /* Undo the lane rotation (inverse shuffles of the ones above). */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93

        dec     al
        jz      9f

        /* Rearrange the four message vectors for the next round. */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0xCC
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0xC0
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /* Fold the lower rows into the upper ones and store 32 bytes to [rdi]. */
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        movups  xmmword ptr [rdi], xmm0
        movups  xmmword ptr [rdi+0x10], xmm1
        ret
1889
+
1890
/*
 * blake3_compress_xof_sse41 / _blake3_compress_xof_sse41
 *
 * Runs the same 7-round compression as the in-place routine, but leaves the
 * input at [rdi] untouched and writes a full 64-byte result to [r9].
 *
 * ABI: System V AMD64, Intel operand order. Leaf routine, no stack use.
 *
 * Inputs as consumed by the code (argument names are assumed from the C
 * prototype in blake3_impl.h -- TODO confirm):
 *   rdi  = 32-byte chaining value, read only (unaligned ok)
 *   rsi  = 64-byte message block (unaligned ok)
 *   dl   = block length   (zero-extended here)
 *   rcx  = 64-bit counter
 *   r8b  = flags          (zero-extended here)
 *   r9   = 64-byte output buffer (unaligned ok)
 *
 * Output layout at [r9]:
 *   +0x00  row0 ^ row2        +0x20  row2 ^ input[0x00..0x0F]
 *   +0x10  row1 ^ row3        +0x30  row3 ^ input[0x10..0x1F]
 *
 * Clobbers: rax, rdx, xmm0-xmm9, xmm11, xmm14, xmm15, flags.
 */
        .p2align 6
blake3_compress_xof_sse41:
_blake3_compress_xof_sse41:
        _CET_ENDBR                      /* macro defined elsewhere in this file */

        /* State rows 0..2, plus the two byte-rotation masks. */
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        movaps  xmm14, xmmword ptr [ROT8+rip]
        movaps  xmm15, xmmword ptr [ROT16+rip]

        /* Row 3 = (counter low, counter high, block length, flags). */
        movzx   eax, r8b
        movzx   edx, dl
        shl     rax, 32
        add     rdx, rax                /* rdx = block length | flags << 32 */
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4

        /* First half of the block: even words to xmm4, odd words to xmm5. */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8

        /* Second half: same split into xmm6 / xmm7, each rotated one lane. */
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93

        mov     al, 7                   /* round counter */
1:
        /* --- columns: message xmm4, rotations 16 and 12 --- */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11

        /* --- columns: message xmm5, rotations 8 and 7 --- */
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11

        /* Line the diagonals up as columns. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39

        /* --- diagonals: message xmm6, rotations 16 and 12 --- */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11

        /* --- diagonals: message xmm7, rotations 8 and 7 --- */
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11

        /* Restore the original lane order of rows 0, 3 and 2. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93

        dec     al
        je      2f                      /* all seven rounds done */

        /* Shuffle xmm4..xmm7 into the word order the next round expects. */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0xCC
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0xC0
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     1b
2:
        /* First 32 output bytes: upper rows xor lower rows. */
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3

        /* Last 32 output bytes: lower rows xor the input at [rdi]. */
        movdqu  xmm4, xmmword ptr [rdi]
        movdqu  xmm5, xmmword ptr [rdi+0x10]
        pxor    xmm2, xmm4
        pxor    xmm3, xmm5

        movups  xmmword ptr [r9], xmm0
        movups  xmmword ptr [r9+0x10], xmm1
        movups  xmmword ptr [r9+0x20], xmm2
        movups  xmmword ptr [r9+0x30], xmm3
        ret
1998
+
1999
+
2000
/*
 * Read-only constant tables for the SSE4.1 routines in this file.
 *
 * Layout invariant: the section starts on a 64-byte boundary and every table
 * below is a multiple of 16 bytes long, so each label is 16-byte aligned.
 * The code loads these with movaps / movdqa and uses them as pxor memory
 * operands, all of which need that alignment. Keep the order and sizes intact
 * when editing.
 */
#ifdef __APPLE__
.static_data
#else
.section .rodata
#endif
        .p2align 6

/* The four IV words, one per lane; loaded as state row 2. */
BLAKE3_IV:
        .long   0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A

/* pshufb mask: swaps the two 16-bit halves of every dword (rotate by 16). */
ROT16:
        .byte   2, 3, 0, 1
        .byte   6, 7, 4, 5
        .byte   10, 11, 8, 9
        .byte   14, 15, 12, 13

/* pshufb mask: rotates every dword right by 8 bits. */
ROT8:
        .byte   1, 2, 3, 0
        .byte   5, 6, 7, 4
        .byte   9, 10, 11, 8
        .byte   13, 14, 15, 12

/*
 * Per-lane constants (0,1,2,3) and (4,4,4,4). Their use sites are outside the
 * portion reviewed here; presumably per-lane counter offsets and the 4-input
 * stride -- confirm at the use site.
 */
ADD0:
        .long   0, 1, 2, 3
ADD1:
        .long   4, 4, 4, 4

/* Each IV word broadcast to all four lanes. */
BLAKE3_IV_0:
        .long   0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
BLAKE3_IV_1:
        .long   0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
BLAKE3_IV_2:
        .long   0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
BLAKE3_IV_3:
        .long   0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A

/* The value 64 in every lane; the hash_many tails read one dword via pinsrd. */
BLAKE3_BLOCK_LEN:
        .long   64, 64, 64, 64

/*
 * Sign-bit mask. XORed into both operands ahead of pcmpgtd in the 4-way
 * counter update, which turns the signed compare into an unsigned one.
 */
CMP_MSB_MASK:
        .long   0x80000000, 0x80000000, 0x80000000, 0x80000000