digest-blake3 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,2057 @@
1
/* Assembler setup: Intel operand order (destination first), register names written without a % prefix. */
+ .intel_syntax noprefix
2
/* Export three SSE4.1 entry points, each under its plain name and an underscore-prefixed alias. */
/* NOTE(review): the underscore aliases are presumably for toolchains that prepend "_" to C symbol names - confirm. */
+ .global blake3_hash_many_sse41
3
+ .global _blake3_hash_many_sse41
4
+ .global blake3_compress_in_place_sse41
5
+ .global _blake3_compress_in_place_sse41
6
+ .global blake3_compress_xof_sse41
7
+ .global _blake3_compress_xof_sse41
8
/* Everything that follows is emitted into the code section. */
+ .section .text
9
/* Align the next label to a 2^6 = 64-byte boundary. */
+ .p2align 6
10
+ _blake3_hash_many_sse41:
11
+ blake3_hash_many_sse41:
12
+ push r15
13
+ push r14
14
+ push r13
15
+ push r12
16
+ push rsi
17
+ push rdi
18
+ push rbx
19
+ push rbp
20
+ mov rbp, rsp
21
+ sub rsp, 528
22
+ and rsp, 0xFFFFFFFFFFFFFFC0
23
+ movdqa xmmword ptr [rsp+0x170], xmm6
24
+ movdqa xmmword ptr [rsp+0x180], xmm7
25
+ movdqa xmmword ptr [rsp+0x190], xmm8
26
+ movdqa xmmword ptr [rsp+0x1A0], xmm9
27
+ movdqa xmmword ptr [rsp+0x1B0], xmm10
28
+ movdqa xmmword ptr [rsp+0x1C0], xmm11
29
+ movdqa xmmword ptr [rsp+0x1D0], xmm12
30
+ movdqa xmmword ptr [rsp+0x1E0], xmm13
31
+ movdqa xmmword ptr [rsp+0x1F0], xmm14
32
+ movdqa xmmword ptr [rsp+0x200], xmm15
33
+ mov rdi, rcx
34
+ mov rsi, rdx
35
+ mov rdx, r8
36
+ mov rcx, r9
37
+ mov r8, qword ptr [rbp+0x68]
38
+ movzx r9, byte ptr [rbp+0x70]
39
+ neg r9d
40
+ movd xmm0, r9d
41
+ pshufd xmm0, xmm0, 0x00
42
+ movdqa xmmword ptr [rsp+0x130], xmm0
43
+ movdqa xmm1, xmm0
44
+ pand xmm1, xmmword ptr [ADD0+rip]
45
+ pand xmm0, xmmword ptr [ADD1+rip]
46
+ movdqa xmmword ptr [rsp+0x150], xmm0
47
+ movd xmm0, r8d
48
+ pshufd xmm0, xmm0, 0x00
49
+ paddd xmm0, xmm1
50
+ movdqa xmmword ptr [rsp+0x110], xmm0
51
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
52
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
53
+ pcmpgtd xmm1, xmm0
54
+ shr r8, 32
55
+ movd xmm2, r8d
56
+ pshufd xmm2, xmm2, 0x00
57
+ psubd xmm2, xmm1
58
+ movdqa xmmword ptr [rsp+0x120], xmm2
59
+ mov rbx, qword ptr [rbp+0x90]
60
+ mov r15, rdx
61
+ shl r15, 6
62
+ movzx r13d, byte ptr [rbp+0x78]
63
+ movzx r12d, byte ptr [rbp+0x88]
64
+ cmp rsi, 4
65
+ jc 3f
66
+ 2:
67
+ movdqu xmm3, xmmword ptr [rcx]
68
+ pshufd xmm0, xmm3, 0x00
69
+ pshufd xmm1, xmm3, 0x55
70
+ pshufd xmm2, xmm3, 0xAA
71
+ pshufd xmm3, xmm3, 0xFF
72
+ movdqu xmm7, xmmword ptr [rcx+0x10]
73
+ pshufd xmm4, xmm7, 0x00
74
+ pshufd xmm5, xmm7, 0x55
75
+ pshufd xmm6, xmm7, 0xAA
76
+ pshufd xmm7, xmm7, 0xFF
77
+ mov r8, qword ptr [rdi]
78
+ mov r9, qword ptr [rdi+0x8]
79
+ mov r10, qword ptr [rdi+0x10]
80
+ mov r11, qword ptr [rdi+0x18]
81
+ movzx eax, byte ptr [rbp+0x80]
82
+ or eax, r13d
83
+ xor edx, edx
84
+ 9:
85
+ mov r14d, eax
86
+ or eax, r12d
87
+ add rdx, 64
88
+ cmp rdx, r15
89
+ cmovne eax, r14d
90
+ movdqu xmm8, xmmword ptr [r8+rdx-0x40]
91
+ movdqu xmm9, xmmword ptr [r9+rdx-0x40]
92
+ movdqu xmm10, xmmword ptr [r10+rdx-0x40]
93
+ movdqu xmm11, xmmword ptr [r11+rdx-0x40]
94
+ movdqa xmm12, xmm8
95
+ punpckldq xmm8, xmm9
96
+ punpckhdq xmm12, xmm9
97
+ movdqa xmm14, xmm10
98
+ punpckldq xmm10, xmm11
99
+ punpckhdq xmm14, xmm11
100
+ movdqa xmm9, xmm8
101
+ punpcklqdq xmm8, xmm10
102
+ punpckhqdq xmm9, xmm10
103
+ movdqa xmm13, xmm12
104
+ punpcklqdq xmm12, xmm14
105
+ punpckhqdq xmm13, xmm14
106
+ movdqa xmmword ptr [rsp], xmm8
107
+ movdqa xmmword ptr [rsp+0x10], xmm9
108
+ movdqa xmmword ptr [rsp+0x20], xmm12
109
+ movdqa xmmword ptr [rsp+0x30], xmm13
110
+ movdqu xmm8, xmmword ptr [r8+rdx-0x30]
111
+ movdqu xmm9, xmmword ptr [r9+rdx-0x30]
112
+ movdqu xmm10, xmmword ptr [r10+rdx-0x30]
113
+ movdqu xmm11, xmmword ptr [r11+rdx-0x30]
114
+ movdqa xmm12, xmm8
115
+ punpckldq xmm8, xmm9
116
+ punpckhdq xmm12, xmm9
117
+ movdqa xmm14, xmm10
118
+ punpckldq xmm10, xmm11
119
+ punpckhdq xmm14, xmm11
120
+ movdqa xmm9, xmm8
121
+ punpcklqdq xmm8, xmm10
122
+ punpckhqdq xmm9, xmm10
123
+ movdqa xmm13, xmm12
124
+ punpcklqdq xmm12, xmm14
125
+ punpckhqdq xmm13, xmm14
126
+ movdqa xmmword ptr [rsp+0x40], xmm8
127
+ movdqa xmmword ptr [rsp+0x50], xmm9
128
+ movdqa xmmword ptr [rsp+0x60], xmm12
129
+ movdqa xmmword ptr [rsp+0x70], xmm13
130
+ movdqu xmm8, xmmword ptr [r8+rdx-0x20]
131
+ movdqu xmm9, xmmword ptr [r9+rdx-0x20]
132
+ movdqu xmm10, xmmword ptr [r10+rdx-0x20]
133
+ movdqu xmm11, xmmword ptr [r11+rdx-0x20]
134
+ movdqa xmm12, xmm8
135
+ punpckldq xmm8, xmm9
136
+ punpckhdq xmm12, xmm9
137
+ movdqa xmm14, xmm10
138
+ punpckldq xmm10, xmm11
139
+ punpckhdq xmm14, xmm11
140
+ movdqa xmm9, xmm8
141
+ punpcklqdq xmm8, xmm10
142
+ punpckhqdq xmm9, xmm10
143
+ movdqa xmm13, xmm12
144
+ punpcklqdq xmm12, xmm14
145
+ punpckhqdq xmm13, xmm14
146
+ movdqa xmmword ptr [rsp+0x80], xmm8
147
+ movdqa xmmword ptr [rsp+0x90], xmm9
148
+ movdqa xmmword ptr [rsp+0xA0], xmm12
149
+ movdqa xmmword ptr [rsp+0xB0], xmm13
150
+ movdqu xmm8, xmmword ptr [r8+rdx-0x10]
151
+ movdqu xmm9, xmmword ptr [r9+rdx-0x10]
152
+ movdqu xmm10, xmmword ptr [r10+rdx-0x10]
153
+ movdqu xmm11, xmmword ptr [r11+rdx-0x10]
154
+ movdqa xmm12, xmm8
155
+ punpckldq xmm8, xmm9
156
+ punpckhdq xmm12, xmm9
157
+ movdqa xmm14, xmm10
158
+ punpckldq xmm10, xmm11
159
+ punpckhdq xmm14, xmm11
160
+ movdqa xmm9, xmm8
161
+ punpcklqdq xmm8, xmm10
162
+ punpckhqdq xmm9, xmm10
163
+ movdqa xmm13, xmm12
164
+ punpcklqdq xmm12, xmm14
165
+ punpckhqdq xmm13, xmm14
166
+ movdqa xmmword ptr [rsp+0xC0], xmm8
167
+ movdqa xmmword ptr [rsp+0xD0], xmm9
168
+ movdqa xmmword ptr [rsp+0xE0], xmm12
169
+ movdqa xmmword ptr [rsp+0xF0], xmm13
170
+ movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
171
+ movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
172
+ movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
173
+ movdqa xmm12, xmmword ptr [rsp+0x110]
174
+ movdqa xmm13, xmmword ptr [rsp+0x120]
175
+ movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
176
+ movd xmm15, eax
177
+ pshufd xmm15, xmm15, 0x00
178
+ prefetcht0 [r8+rdx+0x80]
179
+ prefetcht0 [r9+rdx+0x80]
180
+ prefetcht0 [r10+rdx+0x80]
181
+ prefetcht0 [r11+rdx+0x80]
182
+ paddd xmm0, xmmword ptr [rsp]
183
+ paddd xmm1, xmmword ptr [rsp+0x20]
184
+ paddd xmm2, xmmword ptr [rsp+0x40]
185
+ paddd xmm3, xmmword ptr [rsp+0x60]
186
+ paddd xmm0, xmm4
187
+ paddd xmm1, xmm5
188
+ paddd xmm2, xmm6
189
+ paddd xmm3, xmm7
190
+ pxor xmm12, xmm0
191
+ pxor xmm13, xmm1
192
+ pxor xmm14, xmm2
193
+ pxor xmm15, xmm3
194
+ movdqa xmm8, xmmword ptr [ROT16+rip]
195
+ pshufb xmm12, xmm8
196
+ pshufb xmm13, xmm8
197
+ pshufb xmm14, xmm8
198
+ pshufb xmm15, xmm8
199
+ movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
200
+ paddd xmm8, xmm12
201
+ paddd xmm9, xmm13
202
+ paddd xmm10, xmm14
203
+ paddd xmm11, xmm15
204
+ pxor xmm4, xmm8
205
+ pxor xmm5, xmm9
206
+ pxor xmm6, xmm10
207
+ pxor xmm7, xmm11
208
+ movdqa xmmword ptr [rsp+0x100], xmm8
209
+ movdqa xmm8, xmm4
210
+ psrld xmm8, 12
211
+ pslld xmm4, 20
212
+ por xmm4, xmm8
213
+ movdqa xmm8, xmm5
214
+ psrld xmm8, 12
215
+ pslld xmm5, 20
216
+ por xmm5, xmm8
217
+ movdqa xmm8, xmm6
218
+ psrld xmm8, 12
219
+ pslld xmm6, 20
220
+ por xmm6, xmm8
221
+ movdqa xmm8, xmm7
222
+ psrld xmm8, 12
223
+ pslld xmm7, 20
224
+ por xmm7, xmm8
225
+ paddd xmm0, xmmword ptr [rsp+0x10]
226
+ paddd xmm1, xmmword ptr [rsp+0x30]
227
+ paddd xmm2, xmmword ptr [rsp+0x50]
228
+ paddd xmm3, xmmword ptr [rsp+0x70]
229
+ paddd xmm0, xmm4
230
+ paddd xmm1, xmm5
231
+ paddd xmm2, xmm6
232
+ paddd xmm3, xmm7
233
+ pxor xmm12, xmm0
234
+ pxor xmm13, xmm1
235
+ pxor xmm14, xmm2
236
+ pxor xmm15, xmm3
237
+ movdqa xmm8, xmmword ptr [ROT8+rip]
238
+ pshufb xmm12, xmm8
239
+ pshufb xmm13, xmm8
240
+ pshufb xmm14, xmm8
241
+ pshufb xmm15, xmm8
242
+ movdqa xmm8, xmmword ptr [rsp+0x100]
243
+ paddd xmm8, xmm12
244
+ paddd xmm9, xmm13
245
+ paddd xmm10, xmm14
246
+ paddd xmm11, xmm15
247
+ pxor xmm4, xmm8
248
+ pxor xmm5, xmm9
249
+ pxor xmm6, xmm10
250
+ pxor xmm7, xmm11
251
+ movdqa xmmword ptr [rsp+0x100], xmm8
252
+ movdqa xmm8, xmm4
253
+ psrld xmm8, 7
254
+ pslld xmm4, 25
255
+ por xmm4, xmm8
256
+ movdqa xmm8, xmm5
257
+ psrld xmm8, 7
258
+ pslld xmm5, 25
259
+ por xmm5, xmm8
260
+ movdqa xmm8, xmm6
261
+ psrld xmm8, 7
262
+ pslld xmm6, 25
263
+ por xmm6, xmm8
264
+ movdqa xmm8, xmm7
265
+ psrld xmm8, 7
266
+ pslld xmm7, 25
267
+ por xmm7, xmm8
268
+ paddd xmm0, xmmword ptr [rsp+0x80]
269
+ paddd xmm1, xmmword ptr [rsp+0xA0]
270
+ paddd xmm2, xmmword ptr [rsp+0xC0]
271
+ paddd xmm3, xmmword ptr [rsp+0xE0]
272
+ paddd xmm0, xmm5
273
+ paddd xmm1, xmm6
274
+ paddd xmm2, xmm7
275
+ paddd xmm3, xmm4
276
+ pxor xmm15, xmm0
277
+ pxor xmm12, xmm1
278
+ pxor xmm13, xmm2
279
+ pxor xmm14, xmm3
280
+ movdqa xmm8, xmmword ptr [ROT16+rip]
281
+ pshufb xmm15, xmm8
282
+ pshufb xmm12, xmm8
283
+ pshufb xmm13, xmm8
284
+ pshufb xmm14, xmm8
285
+ paddd xmm10, xmm15
286
+ paddd xmm11, xmm12
287
+ movdqa xmm8, xmmword ptr [rsp+0x100]
288
+ paddd xmm8, xmm13
289
+ paddd xmm9, xmm14
290
+ pxor xmm5, xmm10
291
+ pxor xmm6, xmm11
292
+ pxor xmm7, xmm8
293
+ pxor xmm4, xmm9
294
+ movdqa xmmword ptr [rsp+0x100], xmm8
295
+ movdqa xmm8, xmm5
296
+ psrld xmm8, 12
297
+ pslld xmm5, 20
298
+ por xmm5, xmm8
299
+ movdqa xmm8, xmm6
300
+ psrld xmm8, 12
301
+ pslld xmm6, 20
302
+ por xmm6, xmm8
303
+ movdqa xmm8, xmm7
304
+ psrld xmm8, 12
305
+ pslld xmm7, 20
306
+ por xmm7, xmm8
307
+ movdqa xmm8, xmm4
308
+ psrld xmm8, 12
309
+ pslld xmm4, 20
310
+ por xmm4, xmm8
311
+ paddd xmm0, xmmword ptr [rsp+0x90]
312
+ paddd xmm1, xmmword ptr [rsp+0xB0]
313
+ paddd xmm2, xmmword ptr [rsp+0xD0]
314
+ paddd xmm3, xmmword ptr [rsp+0xF0]
315
+ paddd xmm0, xmm5
316
+ paddd xmm1, xmm6
317
+ paddd xmm2, xmm7
318
+ paddd xmm3, xmm4
319
+ pxor xmm15, xmm0
320
+ pxor xmm12, xmm1
321
+ pxor xmm13, xmm2
322
+ pxor xmm14, xmm3
323
+ movdqa xmm8, xmmword ptr [ROT8+rip]
324
+ pshufb xmm15, xmm8
325
+ pshufb xmm12, xmm8
326
+ pshufb xmm13, xmm8
327
+ pshufb xmm14, xmm8
328
+ paddd xmm10, xmm15
329
+ paddd xmm11, xmm12
330
+ movdqa xmm8, xmmword ptr [rsp+0x100]
331
+ paddd xmm8, xmm13
332
+ paddd xmm9, xmm14
333
+ pxor xmm5, xmm10
334
+ pxor xmm6, xmm11
335
+ pxor xmm7, xmm8
336
+ pxor xmm4, xmm9
337
+ movdqa xmmword ptr [rsp+0x100], xmm8
338
+ movdqa xmm8, xmm5
339
+ psrld xmm8, 7
340
+ pslld xmm5, 25
341
+ por xmm5, xmm8
342
+ movdqa xmm8, xmm6
343
+ psrld xmm8, 7
344
+ pslld xmm6, 25
345
+ por xmm6, xmm8
346
+ movdqa xmm8, xmm7
347
+ psrld xmm8, 7
348
+ pslld xmm7, 25
349
+ por xmm7, xmm8
350
+ movdqa xmm8, xmm4
351
+ psrld xmm8, 7
352
+ pslld xmm4, 25
353
+ por xmm4, xmm8
354
+ paddd xmm0, xmmword ptr [rsp+0x20]
355
+ paddd xmm1, xmmword ptr [rsp+0x30]
356
+ paddd xmm2, xmmword ptr [rsp+0x70]
357
+ paddd xmm3, xmmword ptr [rsp+0x40]
358
+ paddd xmm0, xmm4
359
+ paddd xmm1, xmm5
360
+ paddd xmm2, xmm6
361
+ paddd xmm3, xmm7
362
+ pxor xmm12, xmm0
363
+ pxor xmm13, xmm1
364
+ pxor xmm14, xmm2
365
+ pxor xmm15, xmm3
366
+ movdqa xmm8, xmmword ptr [ROT16+rip]
367
+ pshufb xmm12, xmm8
368
+ pshufb xmm13, xmm8
369
+ pshufb xmm14, xmm8
370
+ pshufb xmm15, xmm8
371
+ movdqa xmm8, xmmword ptr [rsp+0x100]
372
+ paddd xmm8, xmm12
373
+ paddd xmm9, xmm13
374
+ paddd xmm10, xmm14
375
+ paddd xmm11, xmm15
376
+ pxor xmm4, xmm8
377
+ pxor xmm5, xmm9
378
+ pxor xmm6, xmm10
379
+ pxor xmm7, xmm11
380
+ movdqa xmmword ptr [rsp+0x100], xmm8
381
+ movdqa xmm8, xmm4
382
+ psrld xmm8, 12
383
+ pslld xmm4, 20
384
+ por xmm4, xmm8
385
+ movdqa xmm8, xmm5
386
+ psrld xmm8, 12
387
+ pslld xmm5, 20
388
+ por xmm5, xmm8
389
+ movdqa xmm8, xmm6
390
+ psrld xmm8, 12
391
+ pslld xmm6, 20
392
+ por xmm6, xmm8
393
+ movdqa xmm8, xmm7
394
+ psrld xmm8, 12
395
+ pslld xmm7, 20
396
+ por xmm7, xmm8
397
+ paddd xmm0, xmmword ptr [rsp+0x60]
398
+ paddd xmm1, xmmword ptr [rsp+0xA0]
399
+ paddd xmm2, xmmword ptr [rsp]
400
+ paddd xmm3, xmmword ptr [rsp+0xD0]
401
+ paddd xmm0, xmm4
402
+ paddd xmm1, xmm5
403
+ paddd xmm2, xmm6
404
+ paddd xmm3, xmm7
405
+ pxor xmm12, xmm0
406
+ pxor xmm13, xmm1
407
+ pxor xmm14, xmm2
408
+ pxor xmm15, xmm3
409
+ movdqa xmm8, xmmword ptr [ROT8+rip]
410
+ pshufb xmm12, xmm8
411
+ pshufb xmm13, xmm8
412
+ pshufb xmm14, xmm8
413
+ pshufb xmm15, xmm8
414
+ movdqa xmm8, xmmword ptr [rsp+0x100]
415
+ paddd xmm8, xmm12
416
+ paddd xmm9, xmm13
417
+ paddd xmm10, xmm14
418
+ paddd xmm11, xmm15
419
+ pxor xmm4, xmm8
420
+ pxor xmm5, xmm9
421
+ pxor xmm6, xmm10
422
+ pxor xmm7, xmm11
423
+ movdqa xmmword ptr [rsp+0x100], xmm8
424
+ movdqa xmm8, xmm4
425
+ psrld xmm8, 7
426
+ pslld xmm4, 25
427
+ por xmm4, xmm8
428
+ movdqa xmm8, xmm5
429
+ psrld xmm8, 7
430
+ pslld xmm5, 25
431
+ por xmm5, xmm8
432
+ movdqa xmm8, xmm6
433
+ psrld xmm8, 7
434
+ pslld xmm6, 25
435
+ por xmm6, xmm8
436
+ movdqa xmm8, xmm7
437
+ psrld xmm8, 7
438
+ pslld xmm7, 25
439
+ por xmm7, xmm8
440
+ paddd xmm0, xmmword ptr [rsp+0x10]
441
+ paddd xmm1, xmmword ptr [rsp+0xC0]
442
+ paddd xmm2, xmmword ptr [rsp+0x90]
443
+ paddd xmm3, xmmword ptr [rsp+0xF0]
444
+ paddd xmm0, xmm5
445
+ paddd xmm1, xmm6
446
+ paddd xmm2, xmm7
447
+ paddd xmm3, xmm4
448
+ pxor xmm15, xmm0
449
+ pxor xmm12, xmm1
450
+ pxor xmm13, xmm2
451
+ pxor xmm14, xmm3
452
+ movdqa xmm8, xmmword ptr [ROT16+rip]
453
+ pshufb xmm15, xmm8
454
+ pshufb xmm12, xmm8
455
+ pshufb xmm13, xmm8
456
+ pshufb xmm14, xmm8
457
+ paddd xmm10, xmm15
458
+ paddd xmm11, xmm12
459
+ movdqa xmm8, xmmword ptr [rsp+0x100]
460
+ paddd xmm8, xmm13
461
+ paddd xmm9, xmm14
462
+ pxor xmm5, xmm10
463
+ pxor xmm6, xmm11
464
+ pxor xmm7, xmm8
465
+ pxor xmm4, xmm9
466
+ movdqa xmmword ptr [rsp+0x100], xmm8
467
+ movdqa xmm8, xmm5
468
+ psrld xmm8, 12
469
+ pslld xmm5, 20
470
+ por xmm5, xmm8
471
+ movdqa xmm8, xmm6
472
+ psrld xmm8, 12
473
+ pslld xmm6, 20
474
+ por xmm6, xmm8
475
+ movdqa xmm8, xmm7
476
+ psrld xmm8, 12
477
+ pslld xmm7, 20
478
+ por xmm7, xmm8
479
+ movdqa xmm8, xmm4
480
+ psrld xmm8, 12
481
+ pslld xmm4, 20
482
+ por xmm4, xmm8
483
+ paddd xmm0, xmmword ptr [rsp+0xB0]
484
+ paddd xmm1, xmmword ptr [rsp+0x50]
485
+ paddd xmm2, xmmword ptr [rsp+0xE0]
486
+ paddd xmm3, xmmword ptr [rsp+0x80]
487
+ paddd xmm0, xmm5
488
+ paddd xmm1, xmm6
489
+ paddd xmm2, xmm7
490
+ paddd xmm3, xmm4
491
+ pxor xmm15, xmm0
492
+ pxor xmm12, xmm1
493
+ pxor xmm13, xmm2
494
+ pxor xmm14, xmm3
495
+ movdqa xmm8, xmmword ptr [ROT8+rip]
496
+ pshufb xmm15, xmm8
497
+ pshufb xmm12, xmm8
498
+ pshufb xmm13, xmm8
499
+ pshufb xmm14, xmm8
500
+ paddd xmm10, xmm15
501
+ paddd xmm11, xmm12
502
+ movdqa xmm8, xmmword ptr [rsp+0x100]
503
+ paddd xmm8, xmm13
504
+ paddd xmm9, xmm14
505
+ pxor xmm5, xmm10
506
+ pxor xmm6, xmm11
507
+ pxor xmm7, xmm8
508
+ pxor xmm4, xmm9
509
+ movdqa xmmword ptr [rsp+0x100], xmm8
510
+ movdqa xmm8, xmm5
511
+ psrld xmm8, 7
512
+ pslld xmm5, 25
513
+ por xmm5, xmm8
514
+ movdqa xmm8, xmm6
515
+ psrld xmm8, 7
516
+ pslld xmm6, 25
517
+ por xmm6, xmm8
518
+ movdqa xmm8, xmm7
519
+ psrld xmm8, 7
520
+ pslld xmm7, 25
521
+ por xmm7, xmm8
522
+ movdqa xmm8, xmm4
523
+ psrld xmm8, 7
524
+ pslld xmm4, 25
525
+ por xmm4, xmm8
526
+ paddd xmm0, xmmword ptr [rsp+0x30]
527
+ paddd xmm1, xmmword ptr [rsp+0xA0]
528
+ paddd xmm2, xmmword ptr [rsp+0xD0]
529
+ paddd xmm3, xmmword ptr [rsp+0x70]
530
+ paddd xmm0, xmm4
531
+ paddd xmm1, xmm5
532
+ paddd xmm2, xmm6
533
+ paddd xmm3, xmm7
534
+ pxor xmm12, xmm0
535
+ pxor xmm13, xmm1
536
+ pxor xmm14, xmm2
537
+ pxor xmm15, xmm3
538
+ movdqa xmm8, xmmword ptr [ROT16+rip]
539
+ pshufb xmm12, xmm8
540
+ pshufb xmm13, xmm8
541
+ pshufb xmm14, xmm8
542
+ pshufb xmm15, xmm8
543
+ movdqa xmm8, xmmword ptr [rsp+0x100]
544
+ paddd xmm8, xmm12
545
+ paddd xmm9, xmm13
546
+ paddd xmm10, xmm14
547
+ paddd xmm11, xmm15
548
+ pxor xmm4, xmm8
549
+ pxor xmm5, xmm9
550
+ pxor xmm6, xmm10
551
+ pxor xmm7, xmm11
552
+ movdqa xmmword ptr [rsp+0x100], xmm8
553
+ movdqa xmm8, xmm4
554
+ psrld xmm8, 12
555
+ pslld xmm4, 20
556
+ por xmm4, xmm8
557
+ movdqa xmm8, xmm5
558
+ psrld xmm8, 12
559
+ pslld xmm5, 20
560
+ por xmm5, xmm8
561
+ movdqa xmm8, xmm6
562
+ psrld xmm8, 12
563
+ pslld xmm6, 20
564
+ por xmm6, xmm8
565
+ movdqa xmm8, xmm7
566
+ psrld xmm8, 12
567
+ pslld xmm7, 20
568
+ por xmm7, xmm8
569
+ paddd xmm0, xmmword ptr [rsp+0x40]
570
+ paddd xmm1, xmmword ptr [rsp+0xC0]
571
+ paddd xmm2, xmmword ptr [rsp+0x20]
572
+ paddd xmm3, xmmword ptr [rsp+0xE0]
573
+ paddd xmm0, xmm4
574
+ paddd xmm1, xmm5
575
+ paddd xmm2, xmm6
576
+ paddd xmm3, xmm7
577
+ pxor xmm12, xmm0
578
+ pxor xmm13, xmm1
579
+ pxor xmm14, xmm2
580
+ pxor xmm15, xmm3
581
+ movdqa xmm8, xmmword ptr [ROT8+rip]
582
+ pshufb xmm12, xmm8
583
+ pshufb xmm13, xmm8
584
+ pshufb xmm14, xmm8
585
+ pshufb xmm15, xmm8
586
+ movdqa xmm8, xmmword ptr [rsp+0x100]
587
+ paddd xmm8, xmm12
588
+ paddd xmm9, xmm13
589
+ paddd xmm10, xmm14
590
+ paddd xmm11, xmm15
591
+ pxor xmm4, xmm8
592
+ pxor xmm5, xmm9
593
+ pxor xmm6, xmm10
594
+ pxor xmm7, xmm11
595
+ movdqa xmmword ptr [rsp+0x100], xmm8
596
+ movdqa xmm8, xmm4
597
+ psrld xmm8, 7
598
+ pslld xmm4, 25
599
+ por xmm4, xmm8
600
+ movdqa xmm8, xmm5
601
+ psrld xmm8, 7
602
+ pslld xmm5, 25
603
+ por xmm5, xmm8
604
+ movdqa xmm8, xmm6
605
+ psrld xmm8, 7
606
+ pslld xmm6, 25
607
+ por xmm6, xmm8
608
+ movdqa xmm8, xmm7
609
+ psrld xmm8, 7
610
+ pslld xmm7, 25
611
+ por xmm7, xmm8
612
+ paddd xmm0, xmmword ptr [rsp+0x60]
613
+ paddd xmm1, xmmword ptr [rsp+0x90]
614
+ paddd xmm2, xmmword ptr [rsp+0xB0]
615
+ paddd xmm3, xmmword ptr [rsp+0x80]
616
+ paddd xmm0, xmm5
617
+ paddd xmm1, xmm6
618
+ paddd xmm2, xmm7
619
+ paddd xmm3, xmm4
620
+ pxor xmm15, xmm0
621
+ pxor xmm12, xmm1
622
+ pxor xmm13, xmm2
623
+ pxor xmm14, xmm3
624
+ movdqa xmm8, xmmword ptr [ROT16+rip]
625
+ pshufb xmm15, xmm8
626
+ pshufb xmm12, xmm8
627
+ pshufb xmm13, xmm8
628
+ pshufb xmm14, xmm8
629
+ paddd xmm10, xmm15
630
+ paddd xmm11, xmm12
631
+ movdqa xmm8, xmmword ptr [rsp+0x100]
632
+ paddd xmm8, xmm13
633
+ paddd xmm9, xmm14
634
+ pxor xmm5, xmm10
635
+ pxor xmm6, xmm11
636
+ pxor xmm7, xmm8
637
+ pxor xmm4, xmm9
638
+ movdqa xmmword ptr [rsp+0x100], xmm8
639
+ movdqa xmm8, xmm5
640
+ psrld xmm8, 12
641
+ pslld xmm5, 20
642
+ por xmm5, xmm8
643
+ movdqa xmm8, xmm6
644
+ psrld xmm8, 12
645
+ pslld xmm6, 20
646
+ por xmm6, xmm8
647
+ movdqa xmm8, xmm7
648
+ psrld xmm8, 12
649
+ pslld xmm7, 20
650
+ por xmm7, xmm8
651
+ movdqa xmm8, xmm4
652
+ psrld xmm8, 12
653
+ pslld xmm4, 20
654
+ por xmm4, xmm8
655
+ paddd xmm0, xmmword ptr [rsp+0x50]
656
+ paddd xmm1, xmmword ptr [rsp]
657
+ paddd xmm2, xmmword ptr [rsp+0xF0]
658
+ paddd xmm3, xmmword ptr [rsp+0x10]
659
+ paddd xmm0, xmm5
660
+ paddd xmm1, xmm6
661
+ paddd xmm2, xmm7
662
+ paddd xmm3, xmm4
663
+ pxor xmm15, xmm0
664
+ pxor xmm12, xmm1
665
+ pxor xmm13, xmm2
666
+ pxor xmm14, xmm3
667
+ movdqa xmm8, xmmword ptr [ROT8+rip]
668
+ pshufb xmm15, xmm8
669
+ pshufb xmm12, xmm8
670
+ pshufb xmm13, xmm8
671
+ pshufb xmm14, xmm8
672
+ paddd xmm10, xmm15
673
+ paddd xmm11, xmm12
674
+ movdqa xmm8, xmmword ptr [rsp+0x100]
675
+ paddd xmm8, xmm13
676
+ paddd xmm9, xmm14
677
+ pxor xmm5, xmm10
678
+ pxor xmm6, xmm11
679
+ pxor xmm7, xmm8
680
+ pxor xmm4, xmm9
681
+ movdqa xmmword ptr [rsp+0x100], xmm8
682
+ movdqa xmm8, xmm5
683
+ psrld xmm8, 7
684
+ pslld xmm5, 25
685
+ por xmm5, xmm8
686
+ movdqa xmm8, xmm6
687
+ psrld xmm8, 7
688
+ pslld xmm6, 25
689
+ por xmm6, xmm8
690
+ movdqa xmm8, xmm7
691
+ psrld xmm8, 7
692
+ pslld xmm7, 25
693
+ por xmm7, xmm8
694
+ movdqa xmm8, xmm4
695
+ psrld xmm8, 7
696
+ pslld xmm4, 25
697
+ por xmm4, xmm8
698
+ paddd xmm0, xmmword ptr [rsp+0xA0]
699
+ paddd xmm1, xmmword ptr [rsp+0xC0]
700
+ paddd xmm2, xmmword ptr [rsp+0xE0]
701
+ paddd xmm3, xmmword ptr [rsp+0xD0]
702
+ paddd xmm0, xmm4
703
+ paddd xmm1, xmm5
704
+ paddd xmm2, xmm6
705
+ paddd xmm3, xmm7
706
+ pxor xmm12, xmm0
707
+ pxor xmm13, xmm1
708
+ pxor xmm14, xmm2
709
+ pxor xmm15, xmm3
710
+ movdqa xmm8, xmmword ptr [ROT16+rip]
711
+ pshufb xmm12, xmm8
712
+ pshufb xmm13, xmm8
713
+ pshufb xmm14, xmm8
714
+ pshufb xmm15, xmm8
715
+ movdqa xmm8, xmmword ptr [rsp+0x100]
716
+ paddd xmm8, xmm12
717
+ paddd xmm9, xmm13
718
+ paddd xmm10, xmm14
719
+ paddd xmm11, xmm15
720
+ pxor xmm4, xmm8
721
+ pxor xmm5, xmm9
722
+ pxor xmm6, xmm10
723
+ pxor xmm7, xmm11
724
+ movdqa xmmword ptr [rsp+0x100], xmm8
725
+ movdqa xmm8, xmm4
726
+ psrld xmm8, 12
727
+ pslld xmm4, 20
728
+ por xmm4, xmm8
729
+ movdqa xmm8, xmm5
730
+ psrld xmm8, 12
731
+ pslld xmm5, 20
732
+ por xmm5, xmm8
733
+ movdqa xmm8, xmm6
734
+ psrld xmm8, 12
735
+ pslld xmm6, 20
736
+ por xmm6, xmm8
737
+ movdqa xmm8, xmm7
738
+ psrld xmm8, 12
739
+ pslld xmm7, 20
740
+ por xmm7, xmm8
741
+ paddd xmm0, xmmword ptr [rsp+0x70]
742
+ paddd xmm1, xmmword ptr [rsp+0x90]
743
+ paddd xmm2, xmmword ptr [rsp+0x30]
744
+ paddd xmm3, xmmword ptr [rsp+0xF0]
745
+ paddd xmm0, xmm4
746
+ paddd xmm1, xmm5
747
+ paddd xmm2, xmm6
748
+ paddd xmm3, xmm7
749
+ pxor xmm12, xmm0
750
+ pxor xmm13, xmm1
751
+ pxor xmm14, xmm2
752
+ pxor xmm15, xmm3
753
+ movdqa xmm8, xmmword ptr [ROT8+rip]
754
+ pshufb xmm12, xmm8
755
+ pshufb xmm13, xmm8
756
+ pshufb xmm14, xmm8
757
+ pshufb xmm15, xmm8
758
+ movdqa xmm8, xmmword ptr [rsp+0x100]
759
+ paddd xmm8, xmm12
760
+ paddd xmm9, xmm13
761
+ paddd xmm10, xmm14
762
+ paddd xmm11, xmm15
763
+ pxor xmm4, xmm8
764
+ pxor xmm5, xmm9
765
+ pxor xmm6, xmm10
766
+ pxor xmm7, xmm11
767
+ movdqa xmmword ptr [rsp+0x100], xmm8
768
+ movdqa xmm8, xmm4
769
+ psrld xmm8, 7
770
+ pslld xmm4, 25
771
+ por xmm4, xmm8
772
+ movdqa xmm8, xmm5
773
+ psrld xmm8, 7
774
+ pslld xmm5, 25
775
+ por xmm5, xmm8
776
+ movdqa xmm8, xmm6
777
+ psrld xmm8, 7
778
+ pslld xmm6, 25
779
+ por xmm6, xmm8
780
+ movdqa xmm8, xmm7
781
+ psrld xmm8, 7
782
+ pslld xmm7, 25
783
+ por xmm7, xmm8
784
+ paddd xmm0, xmmword ptr [rsp+0x40]
785
+ paddd xmm1, xmmword ptr [rsp+0xB0]
786
+ paddd xmm2, xmmword ptr [rsp+0x50]
787
+ paddd xmm3, xmmword ptr [rsp+0x10]
788
+ paddd xmm0, xmm5
789
+ paddd xmm1, xmm6
790
+ paddd xmm2, xmm7
791
+ paddd xmm3, xmm4
792
+ pxor xmm15, xmm0
793
+ pxor xmm12, xmm1
794
+ pxor xmm13, xmm2
795
+ pxor xmm14, xmm3
796
+ movdqa xmm8, xmmword ptr [ROT16+rip]
797
+ pshufb xmm15, xmm8
798
+ pshufb xmm12, xmm8
799
+ pshufb xmm13, xmm8
800
+ pshufb xmm14, xmm8
801
+ paddd xmm10, xmm15
802
+ paddd xmm11, xmm12
803
+ movdqa xmm8, xmmword ptr [rsp+0x100]
804
+ paddd xmm8, xmm13
805
+ paddd xmm9, xmm14
806
+ pxor xmm5, xmm10
807
+ pxor xmm6, xmm11
808
+ pxor xmm7, xmm8
809
+ pxor xmm4, xmm9
810
+ movdqa xmmword ptr [rsp+0x100], xmm8
811
+ movdqa xmm8, xmm5
812
+ psrld xmm8, 12
813
+ pslld xmm5, 20
814
+ por xmm5, xmm8
815
+ movdqa xmm8, xmm6
816
+ psrld xmm8, 12
817
+ pslld xmm6, 20
818
+ por xmm6, xmm8
819
+ movdqa xmm8, xmm7
820
+ psrld xmm8, 12
821
+ pslld xmm7, 20
822
+ por xmm7, xmm8
823
+ movdqa xmm8, xmm4
824
+ psrld xmm8, 12
825
+ pslld xmm4, 20
826
+ por xmm4, xmm8
827
+ paddd xmm0, xmmword ptr [rsp]
828
+ paddd xmm1, xmmword ptr [rsp+0x20]
829
+ paddd xmm2, xmmword ptr [rsp+0x80]
830
+ paddd xmm3, xmmword ptr [rsp+0x60]
831
+ paddd xmm0, xmm5
832
+ paddd xmm1, xmm6
833
+ paddd xmm2, xmm7
834
+ paddd xmm3, xmm4
835
+ pxor xmm15, xmm0
836
+ pxor xmm12, xmm1
837
+ pxor xmm13, xmm2
838
+ pxor xmm14, xmm3
839
+ movdqa xmm8, xmmword ptr [ROT8+rip]
840
+ pshufb xmm15, xmm8
841
+ pshufb xmm12, xmm8
842
+ pshufb xmm13, xmm8
843
+ pshufb xmm14, xmm8
844
+ paddd xmm10, xmm15
845
+ paddd xmm11, xmm12
846
+ movdqa xmm8, xmmword ptr [rsp+0x100]
847
+ paddd xmm8, xmm13
848
+ paddd xmm9, xmm14
849
+ pxor xmm5, xmm10
850
+ pxor xmm6, xmm11
851
+ pxor xmm7, xmm8
852
+ pxor xmm4, xmm9
853
+ movdqa xmmword ptr [rsp+0x100], xmm8
854
+ movdqa xmm8, xmm5
855
+ psrld xmm8, 7
856
+ pslld xmm5, 25
857
+ por xmm5, xmm8
858
+ movdqa xmm8, xmm6
859
+ psrld xmm8, 7
860
+ pslld xmm6, 25
861
+ por xmm6, xmm8
862
+ movdqa xmm8, xmm7
863
+ psrld xmm8, 7
864
+ pslld xmm7, 25
865
+ por xmm7, xmm8
866
+ movdqa xmm8, xmm4
867
+ psrld xmm8, 7
868
+ pslld xmm4, 25
869
+ por xmm4, xmm8
870
+ paddd xmm0, xmmword ptr [rsp+0xC0]
871
+ paddd xmm1, xmmword ptr [rsp+0x90]
872
+ paddd xmm2, xmmword ptr [rsp+0xF0]
873
+ paddd xmm3, xmmword ptr [rsp+0xE0]
874
+ paddd xmm0, xmm4
875
+ paddd xmm1, xmm5
876
+ paddd xmm2, xmm6
877
+ paddd xmm3, xmm7
878
+ pxor xmm12, xmm0
879
+ pxor xmm13, xmm1
880
+ pxor xmm14, xmm2
881
+ pxor xmm15, xmm3
882
+ movdqa xmm8, xmmword ptr [ROT16+rip]
883
+ pshufb xmm12, xmm8
884
+ pshufb xmm13, xmm8
885
+ pshufb xmm14, xmm8
886
+ pshufb xmm15, xmm8
887
+ movdqa xmm8, xmmword ptr [rsp+0x100]
888
+ paddd xmm8, xmm12
889
+ paddd xmm9, xmm13
890
+ paddd xmm10, xmm14
891
+ paddd xmm11, xmm15
892
+ pxor xmm4, xmm8
893
+ pxor xmm5, xmm9
894
+ pxor xmm6, xmm10
895
+ pxor xmm7, xmm11
896
+ movdqa xmmword ptr [rsp+0x100], xmm8
897
+ movdqa xmm8, xmm4
898
+ psrld xmm8, 12
899
+ pslld xmm4, 20
900
+ por xmm4, xmm8
901
+ movdqa xmm8, xmm5
902
+ psrld xmm8, 12
903
+ pslld xmm5, 20
904
+ por xmm5, xmm8
905
+ movdqa xmm8, xmm6
906
+ psrld xmm8, 12
907
+ pslld xmm6, 20
908
+ por xmm6, xmm8
909
+ movdqa xmm8, xmm7
910
+ psrld xmm8, 12
911
+ pslld xmm7, 20
912
+ por xmm7, xmm8
913
+ paddd xmm0, xmmword ptr [rsp+0xD0]
914
+ paddd xmm1, xmmword ptr [rsp+0xB0]
915
+ paddd xmm2, xmmword ptr [rsp+0xA0]
916
+ paddd xmm3, xmmword ptr [rsp+0x80]
917
+ paddd xmm0, xmm4
918
+ paddd xmm1, xmm5
919
+ paddd xmm2, xmm6
920
+ paddd xmm3, xmm7
921
+ pxor xmm12, xmm0
922
+ pxor xmm13, xmm1
923
+ pxor xmm14, xmm2
924
+ pxor xmm15, xmm3
925
+ movdqa xmm8, xmmword ptr [ROT8+rip]
926
+ pshufb xmm12, xmm8
927
+ pshufb xmm13, xmm8
928
+ pshufb xmm14, xmm8
929
+ pshufb xmm15, xmm8
930
+ movdqa xmm8, xmmword ptr [rsp+0x100]
931
+ paddd xmm8, xmm12
932
+ paddd xmm9, xmm13
933
+ paddd xmm10, xmm14
934
+ paddd xmm11, xmm15
935
+ pxor xmm4, xmm8
936
+ pxor xmm5, xmm9
937
+ pxor xmm6, xmm10
938
+ pxor xmm7, xmm11
939
+ movdqa xmmword ptr [rsp+0x100], xmm8
940
+ movdqa xmm8, xmm4
941
+ psrld xmm8, 7
942
+ pslld xmm4, 25
943
+ por xmm4, xmm8
944
+ movdqa xmm8, xmm5
945
+ psrld xmm8, 7
946
+ pslld xmm5, 25
947
+ por xmm5, xmm8
948
+ movdqa xmm8, xmm6
949
+ psrld xmm8, 7
950
+ pslld xmm6, 25
951
+ por xmm6, xmm8
952
+ movdqa xmm8, xmm7
953
+ psrld xmm8, 7
954
+ pslld xmm7, 25
955
+ por xmm7, xmm8
956
+ paddd xmm0, xmmword ptr [rsp+0x70]
957
+ paddd xmm1, xmmword ptr [rsp+0x50]
958
+ paddd xmm2, xmmword ptr [rsp]
959
+ paddd xmm3, xmmword ptr [rsp+0x60]
960
+ paddd xmm0, xmm5
961
+ paddd xmm1, xmm6
962
+ paddd xmm2, xmm7
963
+ paddd xmm3, xmm4
964
+ pxor xmm15, xmm0
965
+ pxor xmm12, xmm1
966
+ pxor xmm13, xmm2
967
+ pxor xmm14, xmm3
968
+ movdqa xmm8, xmmword ptr [ROT16+rip]
969
+ pshufb xmm15, xmm8
970
+ pshufb xmm12, xmm8
971
+ pshufb xmm13, xmm8
972
+ pshufb xmm14, xmm8
973
+ paddd xmm10, xmm15
974
+ paddd xmm11, xmm12
975
+ movdqa xmm8, xmmword ptr [rsp+0x100]
976
+ paddd xmm8, xmm13
977
+ paddd xmm9, xmm14
978
+ pxor xmm5, xmm10
979
+ pxor xmm6, xmm11
980
+ pxor xmm7, xmm8
981
+ pxor xmm4, xmm9
982
+ movdqa xmmword ptr [rsp+0x100], xmm8
983
+ movdqa xmm8, xmm5
984
+ psrld xmm8, 12
985
+ pslld xmm5, 20
986
+ por xmm5, xmm8
987
+ movdqa xmm8, xmm6
988
+ psrld xmm8, 12
989
+ pslld xmm6, 20
990
+ por xmm6, xmm8
991
+ movdqa xmm8, xmm7
992
+ psrld xmm8, 12
993
+ pslld xmm7, 20
994
+ por xmm7, xmm8
995
+ movdqa xmm8, xmm4
996
+ psrld xmm8, 12
997
+ pslld xmm4, 20
998
+ por xmm4, xmm8
999
+ paddd xmm0, xmmword ptr [rsp+0x20]
1000
+ paddd xmm1, xmmword ptr [rsp+0x30]
1001
+ paddd xmm2, xmmword ptr [rsp+0x10]
1002
+ paddd xmm3, xmmword ptr [rsp+0x40]
1003
+ paddd xmm0, xmm5
1004
+ paddd xmm1, xmm6
1005
+ paddd xmm2, xmm7
1006
+ paddd xmm3, xmm4
1007
+ pxor xmm15, xmm0
1008
+ pxor xmm12, xmm1
1009
+ pxor xmm13, xmm2
1010
+ pxor xmm14, xmm3
1011
+ movdqa xmm8, xmmword ptr [ROT8+rip]
1012
+ pshufb xmm15, xmm8
1013
+ pshufb xmm12, xmm8
1014
+ pshufb xmm13, xmm8
1015
+ pshufb xmm14, xmm8
1016
+ paddd xmm10, xmm15
1017
+ paddd xmm11, xmm12
1018
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1019
+ paddd xmm8, xmm13
1020
+ paddd xmm9, xmm14
1021
+ pxor xmm5, xmm10
1022
+ pxor xmm6, xmm11
1023
+ pxor xmm7, xmm8
1024
+ pxor xmm4, xmm9
1025
+ movdqa xmmword ptr [rsp+0x100], xmm8
1026
+ movdqa xmm8, xmm5
1027
+ psrld xmm8, 7
1028
+ pslld xmm5, 25
1029
+ por xmm5, xmm8
1030
+ movdqa xmm8, xmm6
1031
+ psrld xmm8, 7
1032
+ pslld xmm6, 25
1033
+ por xmm6, xmm8
1034
+ movdqa xmm8, xmm7
1035
+ psrld xmm8, 7
1036
+ pslld xmm7, 25
1037
+ por xmm7, xmm8
1038
+ movdqa xmm8, xmm4
1039
+ psrld xmm8, 7
1040
+ pslld xmm4, 25
1041
+ por xmm4, xmm8
1042
+ paddd xmm0, xmmword ptr [rsp+0x90]
1043
+ paddd xmm1, xmmword ptr [rsp+0xB0]
1044
+ paddd xmm2, xmmword ptr [rsp+0x80]
1045
+ paddd xmm3, xmmword ptr [rsp+0xF0]
1046
+ paddd xmm0, xmm4
1047
+ paddd xmm1, xmm5
1048
+ paddd xmm2, xmm6
1049
+ paddd xmm3, xmm7
1050
+ pxor xmm12, xmm0
1051
+ pxor xmm13, xmm1
1052
+ pxor xmm14, xmm2
1053
+ pxor xmm15, xmm3
1054
+ movdqa xmm8, xmmword ptr [ROT16+rip]
1055
+ pshufb xmm12, xmm8
1056
+ pshufb xmm13, xmm8
1057
+ pshufb xmm14, xmm8
1058
+ pshufb xmm15, xmm8
1059
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1060
+ paddd xmm8, xmm12
1061
+ paddd xmm9, xmm13
1062
+ paddd xmm10, xmm14
1063
+ paddd xmm11, xmm15
1064
+ pxor xmm4, xmm8
1065
+ pxor xmm5, xmm9
1066
+ pxor xmm6, xmm10
1067
+ pxor xmm7, xmm11
1068
+ movdqa xmmword ptr [rsp+0x100], xmm8
1069
+ movdqa xmm8, xmm4
1070
+ psrld xmm8, 12
1071
+ pslld xmm4, 20
1072
+ por xmm4, xmm8
1073
+ movdqa xmm8, xmm5
1074
+ psrld xmm8, 12
1075
+ pslld xmm5, 20
1076
+ por xmm5, xmm8
1077
+ movdqa xmm8, xmm6
1078
+ psrld xmm8, 12
1079
+ pslld xmm6, 20
1080
+ por xmm6, xmm8
1081
+ movdqa xmm8, xmm7
1082
+ psrld xmm8, 12
1083
+ pslld xmm7, 20
1084
+ por xmm7, xmm8
1085
+ paddd xmm0, xmmword ptr [rsp+0xE0]
1086
+ paddd xmm1, xmmword ptr [rsp+0x50]
1087
+ paddd xmm2, xmmword ptr [rsp+0xC0]
1088
+ paddd xmm3, xmmword ptr [rsp+0x10]
1089
+ paddd xmm0, xmm4
1090
+ paddd xmm1, xmm5
1091
+ paddd xmm2, xmm6
1092
+ paddd xmm3, xmm7
1093
+ pxor xmm12, xmm0
1094
+ pxor xmm13, xmm1
1095
+ pxor xmm14, xmm2
1096
+ pxor xmm15, xmm3
1097
+ movdqa xmm8, xmmword ptr [ROT8+rip]
1098
+ pshufb xmm12, xmm8
1099
+ pshufb xmm13, xmm8
1100
+ pshufb xmm14, xmm8
1101
+ pshufb xmm15, xmm8
1102
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1103
+ paddd xmm8, xmm12
1104
+ paddd xmm9, xmm13
1105
+ paddd xmm10, xmm14
1106
+ paddd xmm11, xmm15
1107
+ pxor xmm4, xmm8
1108
+ pxor xmm5, xmm9
1109
+ pxor xmm6, xmm10
1110
+ pxor xmm7, xmm11
1111
+ movdqa xmmword ptr [rsp+0x100], xmm8
1112
+ movdqa xmm8, xmm4
1113
+ psrld xmm8, 7
1114
+ pslld xmm4, 25
1115
+ por xmm4, xmm8
1116
+ movdqa xmm8, xmm5
1117
+ psrld xmm8, 7
1118
+ pslld xmm5, 25
1119
+ por xmm5, xmm8
1120
+ movdqa xmm8, xmm6
1121
+ psrld xmm8, 7
1122
+ pslld xmm6, 25
1123
+ por xmm6, xmm8
1124
+ movdqa xmm8, xmm7
1125
+ psrld xmm8, 7
1126
+ pslld xmm7, 25
1127
+ por xmm7, xmm8
1128
+ paddd xmm0, xmmword ptr [rsp+0xD0]
1129
+ paddd xmm1, xmmword ptr [rsp]
1130
+ paddd xmm2, xmmword ptr [rsp+0x20]
1131
+ paddd xmm3, xmmword ptr [rsp+0x40]
1132
+ paddd xmm0, xmm5
1133
+ paddd xmm1, xmm6
1134
+ paddd xmm2, xmm7
1135
+ paddd xmm3, xmm4
1136
+ pxor xmm15, xmm0
1137
+ pxor xmm12, xmm1
1138
+ pxor xmm13, xmm2
1139
+ pxor xmm14, xmm3
1140
+ movdqa xmm8, xmmword ptr [ROT16+rip]
1141
+ pshufb xmm15, xmm8
1142
+ pshufb xmm12, xmm8
1143
+ pshufb xmm13, xmm8
1144
+ pshufb xmm14, xmm8
1145
+ paddd xmm10, xmm15
1146
+ paddd xmm11, xmm12
1147
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1148
+ paddd xmm8, xmm13
1149
+ paddd xmm9, xmm14
1150
+ pxor xmm5, xmm10
1151
+ pxor xmm6, xmm11
1152
+ pxor xmm7, xmm8
1153
+ pxor xmm4, xmm9
1154
+ movdqa xmmword ptr [rsp+0x100], xmm8
1155
+ movdqa xmm8, xmm5
1156
+ psrld xmm8, 12
1157
+ pslld xmm5, 20
1158
+ por xmm5, xmm8
1159
+ movdqa xmm8, xmm6
1160
+ psrld xmm8, 12
1161
+ pslld xmm6, 20
1162
+ por xmm6, xmm8
1163
+ movdqa xmm8, xmm7
1164
+ psrld xmm8, 12
1165
+ pslld xmm7, 20
1166
+ por xmm7, xmm8
1167
+ movdqa xmm8, xmm4
1168
+ psrld xmm8, 12
1169
+ pslld xmm4, 20
1170
+ por xmm4, xmm8
1171
+ paddd xmm0, xmmword ptr [rsp+0x30]
1172
+ paddd xmm1, xmmword ptr [rsp+0xA0]
1173
+ paddd xmm2, xmmword ptr [rsp+0x60]
1174
+ paddd xmm3, xmmword ptr [rsp+0x70]
1175
+ paddd xmm0, xmm5
1176
+ paddd xmm1, xmm6
1177
+ paddd xmm2, xmm7
1178
+ paddd xmm3, xmm4
1179
+ pxor xmm15, xmm0
1180
+ pxor xmm12, xmm1
1181
+ pxor xmm13, xmm2
1182
+ pxor xmm14, xmm3
1183
+ movdqa xmm8, xmmword ptr [ROT8+rip]
1184
+ pshufb xmm15, xmm8
1185
+ pshufb xmm12, xmm8
1186
+ pshufb xmm13, xmm8
1187
+ pshufb xmm14, xmm8
1188
+ paddd xmm10, xmm15
1189
+ paddd xmm11, xmm12
1190
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1191
+ paddd xmm8, xmm13
1192
+ paddd xmm9, xmm14
1193
+ pxor xmm5, xmm10
1194
+ pxor xmm6, xmm11
1195
+ pxor xmm7, xmm8
1196
+ pxor xmm4, xmm9
1197
+ movdqa xmmword ptr [rsp+0x100], xmm8
1198
+ movdqa xmm8, xmm5
1199
+ psrld xmm8, 7
1200
+ pslld xmm5, 25
1201
+ por xmm5, xmm8
1202
+ movdqa xmm8, xmm6
1203
+ psrld xmm8, 7
1204
+ pslld xmm6, 25
1205
+ por xmm6, xmm8
1206
+ movdqa xmm8, xmm7
1207
+ psrld xmm8, 7
1208
+ pslld xmm7, 25
1209
+ por xmm7, xmm8
1210
+ movdqa xmm8, xmm4
1211
+ psrld xmm8, 7
1212
+ pslld xmm4, 25
1213
+ por xmm4, xmm8
1214
+ paddd xmm0, xmmword ptr [rsp+0xB0]
1215
+ paddd xmm1, xmmword ptr [rsp+0x50]
1216
+ paddd xmm2, xmmword ptr [rsp+0x10]
1217
+ paddd xmm3, xmmword ptr [rsp+0x80]
1218
+ paddd xmm0, xmm4
1219
+ paddd xmm1, xmm5
1220
+ paddd xmm2, xmm6
1221
+ paddd xmm3, xmm7
1222
+ pxor xmm12, xmm0
1223
+ pxor xmm13, xmm1
1224
+ pxor xmm14, xmm2
1225
+ pxor xmm15, xmm3
1226
+ movdqa xmm8, xmmword ptr [ROT16+rip]
1227
+ pshufb xmm12, xmm8
1228
+ pshufb xmm13, xmm8
1229
+ pshufb xmm14, xmm8
1230
+ pshufb xmm15, xmm8
1231
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1232
+ paddd xmm8, xmm12
1233
+ paddd xmm9, xmm13
1234
+ paddd xmm10, xmm14
1235
+ paddd xmm11, xmm15
1236
+ pxor xmm4, xmm8
1237
+ pxor xmm5, xmm9
1238
+ pxor xmm6, xmm10
1239
+ pxor xmm7, xmm11
1240
+ movdqa xmmword ptr [rsp+0x100], xmm8
1241
+ movdqa xmm8, xmm4
1242
+ psrld xmm8, 12
1243
+ pslld xmm4, 20
1244
+ por xmm4, xmm8
1245
+ movdqa xmm8, xmm5
1246
+ psrld xmm8, 12
1247
+ pslld xmm5, 20
1248
+ por xmm5, xmm8
1249
+ movdqa xmm8, xmm6
1250
+ psrld xmm8, 12
1251
+ pslld xmm6, 20
1252
+ por xmm6, xmm8
1253
+ movdqa xmm8, xmm7
1254
+ psrld xmm8, 12
1255
+ pslld xmm7, 20
1256
+ por xmm7, xmm8
1257
+ paddd xmm0, xmmword ptr [rsp+0xF0]
1258
+ paddd xmm1, xmmword ptr [rsp]
1259
+ paddd xmm2, xmmword ptr [rsp+0x90]
1260
+ paddd xmm3, xmmword ptr [rsp+0x60]
1261
+ paddd xmm0, xmm4
1262
+ paddd xmm1, xmm5
1263
+ paddd xmm2, xmm6
1264
+ paddd xmm3, xmm7
1265
+ pxor xmm12, xmm0
1266
+ pxor xmm13, xmm1
1267
+ pxor xmm14, xmm2
1268
+ pxor xmm15, xmm3
1269
+ movdqa xmm8, xmmword ptr [ROT8+rip]
1270
+ pshufb xmm12, xmm8
1271
+ pshufb xmm13, xmm8
1272
+ pshufb xmm14, xmm8
1273
+ pshufb xmm15, xmm8
1274
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1275
+ paddd xmm8, xmm12
1276
+ paddd xmm9, xmm13
1277
+ paddd xmm10, xmm14
1278
+ paddd xmm11, xmm15
1279
+ pxor xmm4, xmm8
1280
+ pxor xmm5, xmm9
1281
+ pxor xmm6, xmm10
1282
+ pxor xmm7, xmm11
1283
+ movdqa xmmword ptr [rsp+0x100], xmm8
1284
+ movdqa xmm8, xmm4
1285
+ psrld xmm8, 7
1286
+ pslld xmm4, 25
1287
+ por xmm4, xmm8
1288
+ movdqa xmm8, xmm5
1289
+ psrld xmm8, 7
1290
+ pslld xmm5, 25
1291
+ por xmm5, xmm8
1292
+ movdqa xmm8, xmm6
1293
+ psrld xmm8, 7
1294
+ pslld xmm6, 25
1295
+ por xmm6, xmm8
1296
+ movdqa xmm8, xmm7
1297
+ psrld xmm8, 7
1298
+ pslld xmm7, 25
1299
+ por xmm7, xmm8
1300
+ paddd xmm0, xmmword ptr [rsp+0xE0]
1301
+ paddd xmm1, xmmword ptr [rsp+0x20]
1302
+ paddd xmm2, xmmword ptr [rsp+0x30]
1303
+ paddd xmm3, xmmword ptr [rsp+0x70]
1304
+ paddd xmm0, xmm5
1305
+ paddd xmm1, xmm6
1306
+ paddd xmm2, xmm7
1307
+ paddd xmm3, xmm4
1308
+ pxor xmm15, xmm0
1309
+ pxor xmm12, xmm1
1310
+ pxor xmm13, xmm2
1311
+ pxor xmm14, xmm3
1312
+ movdqa xmm8, xmmword ptr [ROT16+rip]
1313
+ pshufb xmm15, xmm8
1314
+ pshufb xmm12, xmm8
1315
+ pshufb xmm13, xmm8
1316
+ pshufb xmm14, xmm8
1317
+ paddd xmm10, xmm15
1318
+ paddd xmm11, xmm12
1319
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1320
+ paddd xmm8, xmm13
1321
+ paddd xmm9, xmm14
1322
+ pxor xmm5, xmm10
1323
+ pxor xmm6, xmm11
1324
+ pxor xmm7, xmm8
1325
+ pxor xmm4, xmm9
1326
+ movdqa xmmword ptr [rsp+0x100], xmm8
1327
+ movdqa xmm8, xmm5
1328
+ psrld xmm8, 12
1329
+ pslld xmm5, 20
1330
+ por xmm5, xmm8
1331
+ movdqa xmm8, xmm6
1332
+ psrld xmm8, 12
1333
+ pslld xmm6, 20
1334
+ por xmm6, xmm8
1335
+ movdqa xmm8, xmm7
1336
+ psrld xmm8, 12
1337
+ pslld xmm7, 20
1338
+ por xmm7, xmm8
1339
+ movdqa xmm8, xmm4
1340
+ psrld xmm8, 12
1341
+ pslld xmm4, 20
1342
+ por xmm4, xmm8
1343
+ paddd xmm0, xmmword ptr [rsp+0xA0]
1344
+ paddd xmm1, xmmword ptr [rsp+0xC0]
1345
+ paddd xmm2, xmmword ptr [rsp+0x40]
1346
+ paddd xmm3, xmmword ptr [rsp+0xD0]
1347
+ paddd xmm0, xmm5
1348
+ paddd xmm1, xmm6
1349
+ paddd xmm2, xmm7
1350
+ paddd xmm3, xmm4
1351
+ pxor xmm15, xmm0
1352
+ pxor xmm12, xmm1
1353
+ pxor xmm13, xmm2
1354
+ pxor xmm14, xmm3
1355
+ movdqa xmm8, xmmword ptr [ROT8+rip]
1356
+ pshufb xmm15, xmm8
1357
+ pshufb xmm12, xmm8
1358
+ pshufb xmm13, xmm8
1359
+ pshufb xmm14, xmm8
1360
+ paddd xmm10, xmm15
1361
+ paddd xmm11, xmm12
1362
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1363
+ paddd xmm8, xmm13
1364
+ paddd xmm9, xmm14
1365
+ pxor xmm5, xmm10
1366
+ pxor xmm6, xmm11
1367
+ pxor xmm7, xmm8
1368
+ pxor xmm4, xmm9
1369
+ pxor xmm0, xmm8
1370
+ pxor xmm1, xmm9
1371
+ pxor xmm2, xmm10
1372
+ pxor xmm3, xmm11
1373
+ movdqa xmm8, xmm5
1374
+ psrld xmm8, 7
1375
+ pslld xmm5, 25
1376
+ por xmm5, xmm8
1377
+ movdqa xmm8, xmm6
1378
+ psrld xmm8, 7
1379
+ pslld xmm6, 25
1380
+ por xmm6, xmm8
1381
+ movdqa xmm8, xmm7
1382
+ psrld xmm8, 7
1383
+ pslld xmm7, 25
1384
+ por xmm7, xmm8
1385
+ movdqa xmm8, xmm4
1386
+ psrld xmm8, 7
1387
+ pslld xmm4, 25
1388
+ por xmm4, xmm8
1389
+ pxor xmm4, xmm12
1390
+ pxor xmm5, xmm13
1391
+ pxor xmm6, xmm14
1392
+ pxor xmm7, xmm15
1393
+ mov eax, r13d
1394
+ jne 9b
1395
+ movdqa xmm9, xmm0
1396
+ punpckldq xmm0, xmm1
1397
+ punpckhdq xmm9, xmm1
1398
+ movdqa xmm11, xmm2
1399
+ punpckldq xmm2, xmm3
1400
+ punpckhdq xmm11, xmm3
1401
+ movdqa xmm1, xmm0
1402
+ punpcklqdq xmm0, xmm2
1403
+ punpckhqdq xmm1, xmm2
1404
+ movdqa xmm3, xmm9
1405
+ punpcklqdq xmm9, xmm11
1406
+ punpckhqdq xmm3, xmm11
1407
+ movdqu xmmword ptr [rbx], xmm0
1408
+ movdqu xmmword ptr [rbx+0x20], xmm1
1409
+ movdqu xmmword ptr [rbx+0x40], xmm9
1410
+ movdqu xmmword ptr [rbx+0x60], xmm3
1411
+ movdqa xmm9, xmm4
1412
+ punpckldq xmm4, xmm5
1413
+ punpckhdq xmm9, xmm5
1414
+ movdqa xmm11, xmm6
1415
+ punpckldq xmm6, xmm7
1416
+ punpckhdq xmm11, xmm7
1417
+ movdqa xmm5, xmm4
1418
+ punpcklqdq xmm4, xmm6
1419
+ punpckhqdq xmm5, xmm6
1420
+ movdqa xmm7, xmm9
1421
+ punpcklqdq xmm9, xmm11
1422
+ punpckhqdq xmm7, xmm11
1423
+ movdqu xmmword ptr [rbx+0x10], xmm4
1424
+ movdqu xmmword ptr [rbx+0x30], xmm5
1425
+ movdqu xmmword ptr [rbx+0x50], xmm9
1426
+ movdqu xmmword ptr [rbx+0x70], xmm7
1427
+ movdqa xmm1, xmmword ptr [rsp+0x110]
1428
+ movdqa xmm0, xmm1
1429
+ paddd xmm1, xmmword ptr [rsp+0x150]
1430
+ movdqa xmmword ptr [rsp+0x110], xmm1
1431
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1432
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1433
+ pcmpgtd xmm0, xmm1
1434
+ movdqa xmm1, xmmword ptr [rsp+0x120]
1435
+ psubd xmm1, xmm0
1436
+ movdqa xmmword ptr [rsp+0x120], xmm1
1437
+ add rbx, 128
1438
+ add rdi, 32
1439
+ sub rsi, 4
1440
+ cmp rsi, 4
1441
+ jnc 2b
1442
+ test rsi, rsi
1443
+ jne 3f
1444
+ 4:
1445
+ movdqa xmm6, xmmword ptr [rsp+0x170]
1446
+ movdqa xmm7, xmmword ptr [rsp+0x180]
1447
+ movdqa xmm8, xmmword ptr [rsp+0x190]
1448
+ movdqa xmm9, xmmword ptr [rsp+0x1A0]
1449
+ movdqa xmm10, xmmword ptr [rsp+0x1B0]
1450
+ movdqa xmm11, xmmword ptr [rsp+0x1C0]
1451
+ movdqa xmm12, xmmword ptr [rsp+0x1D0]
1452
+ movdqa xmm13, xmmword ptr [rsp+0x1E0]
1453
+ movdqa xmm14, xmmword ptr [rsp+0x1F0]
1454
+ movdqa xmm15, xmmword ptr [rsp+0x200]
1455
+ mov rsp, rbp
1456
+ pop rbp
1457
+ pop rbx
1458
+ pop rdi
1459
+ pop rsi
1460
+ pop r12
1461
+ pop r13
1462
+ pop r14
1463
+ pop r15
1464
+ ret
1465
+ .p2align 5
1466
+ 3:
1467
+ test esi, 0x2
1468
+ je 3f
1469
+ movups xmm0, xmmword ptr [rcx]
1470
+ movups xmm1, xmmword ptr [rcx+0x10]
1471
+ movaps xmm8, xmm0
1472
+ movaps xmm9, xmm1
1473
+ movd xmm13, dword ptr [rsp+0x110]
1474
+ pinsrd xmm13, dword ptr [rsp+0x120], 1
1475
+ pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1476
+ movaps xmmword ptr [rsp], xmm13
1477
+ movd xmm14, dword ptr [rsp+0x114]
1478
+ pinsrd xmm14, dword ptr [rsp+0x124], 1
1479
+ pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1480
+ movaps xmmword ptr [rsp+0x10], xmm14
1481
+ mov r8, qword ptr [rdi]
1482
+ mov r9, qword ptr [rdi+0x8]
1483
+ movzx eax, byte ptr [rbp+0x80]
1484
+ or eax, r13d
1485
+ xor edx, edx
1486
+ 2:
1487
+ mov r14d, eax
1488
+ or eax, r12d
1489
+ add rdx, 64
1490
+ cmp rdx, r15
1491
+ cmovne eax, r14d
1492
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1493
+ movaps xmm10, xmm2
1494
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
1495
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
1496
+ movaps xmm3, xmm4
1497
+ shufps xmm4, xmm5, 136
1498
+ shufps xmm3, xmm5, 221
1499
+ movaps xmm5, xmm3
1500
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
1501
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
1502
+ movaps xmm3, xmm6
1503
+ shufps xmm6, xmm7, 136
1504
+ pshufd xmm6, xmm6, 0x93
1505
+ shufps xmm3, xmm7, 221
1506
+ pshufd xmm7, xmm3, 0x93
1507
+ movups xmm12, xmmword ptr [r9+rdx-0x40]
1508
+ movups xmm13, xmmword ptr [r9+rdx-0x30]
1509
+ movaps xmm11, xmm12
1510
+ shufps xmm12, xmm13, 136
1511
+ shufps xmm11, xmm13, 221
1512
+ movaps xmm13, xmm11
1513
+ movups xmm14, xmmword ptr [r9+rdx-0x20]
1514
+ movups xmm15, xmmword ptr [r9+rdx-0x10]
1515
+ movaps xmm11, xmm14
1516
+ shufps xmm14, xmm15, 136
1517
+ pshufd xmm14, xmm14, 0x93
1518
+ shufps xmm11, xmm15, 221
1519
+ pshufd xmm15, xmm11, 0x93
1520
+ movaps xmm3, xmmword ptr [rsp]
1521
+ movaps xmm11, xmmword ptr [rsp+0x10]
1522
+ pinsrd xmm3, eax, 3
1523
+ pinsrd xmm11, eax, 3
1524
+ mov al, 7
1525
+ 9:
1526
+ paddd xmm0, xmm4
1527
+ paddd xmm8, xmm12
1528
+ movaps xmmword ptr [rsp+0x20], xmm4
1529
+ movaps xmmword ptr [rsp+0x30], xmm12
1530
+ paddd xmm0, xmm1
1531
+ paddd xmm8, xmm9
1532
+ pxor xmm3, xmm0
1533
+ pxor xmm11, xmm8
1534
+ movaps xmm12, xmmword ptr [ROT16+rip]
1535
+ pshufb xmm3, xmm12
1536
+ pshufb xmm11, xmm12
1537
+ paddd xmm2, xmm3
1538
+ paddd xmm10, xmm11
1539
+ pxor xmm1, xmm2
1540
+ pxor xmm9, xmm10
1541
+ movdqa xmm4, xmm1
1542
+ pslld xmm1, 20
1543
+ psrld xmm4, 12
1544
+ por xmm1, xmm4
1545
+ movdqa xmm4, xmm9
1546
+ pslld xmm9, 20
1547
+ psrld xmm4, 12
1548
+ por xmm9, xmm4
1549
+ paddd xmm0, xmm5
1550
+ paddd xmm8, xmm13
1551
+ movaps xmmword ptr [rsp+0x40], xmm5
1552
+ movaps xmmword ptr [rsp+0x50], xmm13
1553
+ paddd xmm0, xmm1
1554
+ paddd xmm8, xmm9
1555
+ pxor xmm3, xmm0
1556
+ pxor xmm11, xmm8
1557
+ movaps xmm13, xmmword ptr [ROT8+rip]
1558
+ pshufb xmm3, xmm13
1559
+ pshufb xmm11, xmm13
1560
+ paddd xmm2, xmm3
1561
+ paddd xmm10, xmm11
1562
+ pxor xmm1, xmm2
1563
+ pxor xmm9, xmm10
1564
+ movdqa xmm4, xmm1
1565
+ pslld xmm1, 25
1566
+ psrld xmm4, 7
1567
+ por xmm1, xmm4
1568
+ movdqa xmm4, xmm9
1569
+ pslld xmm9, 25
1570
+ psrld xmm4, 7
1571
+ por xmm9, xmm4
1572
+ pshufd xmm0, xmm0, 0x93
1573
+ pshufd xmm8, xmm8, 0x93
1574
+ pshufd xmm3, xmm3, 0x4E
1575
+ pshufd xmm11, xmm11, 0x4E
1576
+ pshufd xmm2, xmm2, 0x39
1577
+ pshufd xmm10, xmm10, 0x39
1578
+ paddd xmm0, xmm6
1579
+ paddd xmm8, xmm14
1580
+ paddd xmm0, xmm1
1581
+ paddd xmm8, xmm9
1582
+ pxor xmm3, xmm0
1583
+ pxor xmm11, xmm8
1584
+ pshufb xmm3, xmm12
1585
+ pshufb xmm11, xmm12
1586
+ paddd xmm2, xmm3
1587
+ paddd xmm10, xmm11
1588
+ pxor xmm1, xmm2
1589
+ pxor xmm9, xmm10
1590
+ movdqa xmm4, xmm1
1591
+ pslld xmm1, 20
1592
+ psrld xmm4, 12
1593
+ por xmm1, xmm4
1594
+ movdqa xmm4, xmm9
1595
+ pslld xmm9, 20
1596
+ psrld xmm4, 12
1597
+ por xmm9, xmm4
1598
+ paddd xmm0, xmm7
1599
+ paddd xmm8, xmm15
1600
+ paddd xmm0, xmm1
1601
+ paddd xmm8, xmm9
1602
+ pxor xmm3, xmm0
1603
+ pxor xmm11, xmm8
1604
+ pshufb xmm3, xmm13
1605
+ pshufb xmm11, xmm13
1606
+ paddd xmm2, xmm3
1607
+ paddd xmm10, xmm11
1608
+ pxor xmm1, xmm2
1609
+ pxor xmm9, xmm10
1610
+ movdqa xmm4, xmm1
1611
+ pslld xmm1, 25
1612
+ psrld xmm4, 7
1613
+ por xmm1, xmm4
1614
+ movdqa xmm4, xmm9
1615
+ pslld xmm9, 25
1616
+ psrld xmm4, 7
1617
+ por xmm9, xmm4
1618
+ pshufd xmm0, xmm0, 0x39
1619
+ pshufd xmm8, xmm8, 0x39
1620
+ pshufd xmm3, xmm3, 0x4E
1621
+ pshufd xmm11, xmm11, 0x4E
1622
+ pshufd xmm2, xmm2, 0x93
1623
+ pshufd xmm10, xmm10, 0x93
1624
+ dec al
1625
+ je 9f
1626
+ movdqa xmm12, xmmword ptr [rsp+0x20]
1627
+ movdqa xmm5, xmmword ptr [rsp+0x40]
1628
+ pshufd xmm13, xmm12, 0x0F
1629
+ shufps xmm12, xmm5, 214
1630
+ pshufd xmm4, xmm12, 0x39
1631
+ movdqa xmm12, xmm6
1632
+ shufps xmm12, xmm7, 250
1633
+ pblendw xmm13, xmm12, 0xCC
1634
+ movdqa xmm12, xmm7
1635
+ punpcklqdq xmm12, xmm5
1636
+ pblendw xmm12, xmm6, 0xC0
1637
+ pshufd xmm12, xmm12, 0x78
1638
+ punpckhdq xmm5, xmm7
1639
+ punpckldq xmm6, xmm5
1640
+ pshufd xmm7, xmm6, 0x1E
1641
+ movdqa xmmword ptr [rsp+0x20], xmm13
1642
+ movdqa xmmword ptr [rsp+0x40], xmm12
1643
+ movdqa xmm5, xmmword ptr [rsp+0x30]
1644
+ movdqa xmm13, xmmword ptr [rsp+0x50]
1645
+ pshufd xmm6, xmm5, 0x0F
1646
+ shufps xmm5, xmm13, 214
1647
+ pshufd xmm12, xmm5, 0x39
1648
+ movdqa xmm5, xmm14
1649
+ shufps xmm5, xmm15, 250
1650
+ pblendw xmm6, xmm5, 0xCC
1651
+ movdqa xmm5, xmm15
1652
+ punpcklqdq xmm5, xmm13
1653
+ pblendw xmm5, xmm14, 0xC0
1654
+ pshufd xmm5, xmm5, 0x78
1655
+ punpckhdq xmm13, xmm15
1656
+ punpckldq xmm14, xmm13
1657
+ pshufd xmm15, xmm14, 0x1E
1658
+ movdqa xmm13, xmm6
1659
+ movdqa xmm14, xmm5
1660
+ movdqa xmm5, xmmword ptr [rsp+0x20]
1661
+ movdqa xmm6, xmmword ptr [rsp+0x40]
1662
+ jmp 9b
1663
+ 9:
1664
+ pxor xmm0, xmm2
1665
+ pxor xmm1, xmm3
1666
+ pxor xmm8, xmm10
1667
+ pxor xmm9, xmm11
1668
+ mov eax, r13d
1669
+ cmp rdx, r15
1670
+ jne 2b
1671
+ movups xmmword ptr [rbx], xmm0
1672
+ movups xmmword ptr [rbx+0x10], xmm1
1673
+ movups xmmword ptr [rbx+0x20], xmm8
1674
+ movups xmmword ptr [rbx+0x30], xmm9
1675
+ movdqa xmm0, xmmword ptr [rsp+0x130]
1676
+ movdqa xmm1, xmmword ptr [rsp+0x110]
1677
+ movdqa xmm2, xmmword ptr [rsp+0x120]
1678
+ movdqu xmm3, xmmword ptr [rsp+0x118]
1679
+ movdqu xmm4, xmmword ptr [rsp+0x128]
1680
+ blendvps xmm1, xmm3, xmm0
1681
+ blendvps xmm2, xmm4, xmm0
1682
+ movdqa xmmword ptr [rsp+0x110], xmm1
1683
+ movdqa xmmword ptr [rsp+0x120], xmm2
1684
+ add rdi, 16
1685
+ add rbx, 64
1686
+ sub rsi, 2
1687
+ 3:
1688
+ test esi, 0x1
1689
+ je 4b
1690
+ movups xmm0, xmmword ptr [rcx]
1691
+ movups xmm1, xmmword ptr [rcx+0x10]
1692
+ movd xmm13, dword ptr [rsp+0x110]
1693
+ pinsrd xmm13, dword ptr [rsp+0x120], 1
1694
+ pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1695
+ movaps xmm14, xmmword ptr [ROT8+rip]
1696
+ movaps xmm15, xmmword ptr [ROT16+rip]
1697
+ mov r8, qword ptr [rdi]
1698
+ movzx eax, byte ptr [rbp+0x80]
1699
+ or eax, r13d
1700
+ xor edx, edx
1701
+ 2:
1702
+ mov r14d, eax
1703
+ or eax, r12d
1704
+ add rdx, 64
1705
+ cmp rdx, r15
1706
+ cmovne eax, r14d
1707
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1708
+ movaps xmm3, xmm13
1709
+ pinsrd xmm3, eax, 3
1710
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
1711
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
1712
+ movaps xmm8, xmm4
1713
+ shufps xmm4, xmm5, 136
1714
+ shufps xmm8, xmm5, 221
1715
+ movaps xmm5, xmm8
1716
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
1717
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
1718
+ movaps xmm8, xmm6
1719
+ shufps xmm6, xmm7, 136
1720
+ pshufd xmm6, xmm6, 0x93
1721
+ shufps xmm8, xmm7, 221
1722
+ pshufd xmm7, xmm8, 0x93
1723
+ mov al, 7
1724
+ 9:
1725
+ paddd xmm0, xmm4
1726
+ paddd xmm0, xmm1
1727
+ pxor xmm3, xmm0
1728
+ pshufb xmm3, xmm15
1729
+ paddd xmm2, xmm3
1730
+ pxor xmm1, xmm2
1731
+ movdqa xmm11, xmm1
1732
+ pslld xmm1, 20
1733
+ psrld xmm11, 12
1734
+ por xmm1, xmm11
1735
+ paddd xmm0, xmm5
1736
+ paddd xmm0, xmm1
1737
+ pxor xmm3, xmm0
1738
+ pshufb xmm3, xmm14
1739
+ paddd xmm2, xmm3
1740
+ pxor xmm1, xmm2
1741
+ movdqa xmm11, xmm1
1742
+ pslld xmm1, 25
1743
+ psrld xmm11, 7
1744
+ por xmm1, xmm11
1745
+ pshufd xmm0, xmm0, 0x93
1746
+ pshufd xmm3, xmm3, 0x4E
1747
+ pshufd xmm2, xmm2, 0x39
1748
+ paddd xmm0, xmm6
1749
+ paddd xmm0, xmm1
1750
+ pxor xmm3, xmm0
1751
+ pshufb xmm3, xmm15
1752
+ paddd xmm2, xmm3
1753
+ pxor xmm1, xmm2
1754
+ movdqa xmm11, xmm1
1755
+ pslld xmm1, 20
1756
+ psrld xmm11, 12
1757
+ por xmm1, xmm11
1758
+ paddd xmm0, xmm7
1759
+ paddd xmm0, xmm1
1760
+ pxor xmm3, xmm0
1761
+ pshufb xmm3, xmm14
1762
+ paddd xmm2, xmm3
1763
+ pxor xmm1, xmm2
1764
+ movdqa xmm11, xmm1
1765
+ pslld xmm1, 25
1766
+ psrld xmm11, 7
1767
+ por xmm1, xmm11
1768
+ pshufd xmm0, xmm0, 0x39
1769
+ pshufd xmm3, xmm3, 0x4E
1770
+ pshufd xmm2, xmm2, 0x93
1771
+ dec al
1772
+ jz 9f
1773
+ movdqa xmm8, xmm4
1774
+ shufps xmm8, xmm5, 214
1775
+ pshufd xmm9, xmm4, 0x0F
1776
+ pshufd xmm4, xmm8, 0x39
1777
+ movdqa xmm8, xmm6
1778
+ shufps xmm8, xmm7, 250
1779
+ pblendw xmm9, xmm8, 0xCC
1780
+ movdqa xmm8, xmm7
1781
+ punpcklqdq xmm8, xmm5
1782
+ pblendw xmm8, xmm6, 0xC0
1783
+ pshufd xmm8, xmm8, 0x78
1784
+ punpckhdq xmm5, xmm7
1785
+ punpckldq xmm6, xmm5
1786
+ pshufd xmm7, xmm6, 0x1E
1787
+ movdqa xmm5, xmm9
1788
+ movdqa xmm6, xmm8
1789
+ jmp 9b
1790
+ 9:
1791
+ pxor xmm0, xmm2
1792
+ pxor xmm1, xmm3
1793
+ mov eax, r13d
1794
+ cmp rdx, r15
1795
+ jne 2b
1796
+ movups xmmword ptr [rbx], xmm0
1797
+ movups xmmword ptr [rbx+0x10], xmm1
1798
+ jmp 4b
1799
+
1800
/*
 * blake3_compress_in_place_sse41 (also exported as
 * _blake3_compress_in_place_sse41)
 *
 * Runs seven rounds of the compression loop over one 64-byte message block
 * and writes the 32-byte result back over the chaining value.
 *
 * ABI: Microsoft x64.
 *   rcx            pointer to the 32-byte chaining value (read, then
 *                  overwritten with the result; unaligned access is fine)
 *   rdx            pointer to the 64-byte message block (unaligned loads)
 *   r8b            byte stored in lane 2 of the fourth state row
 *                  (presumably block_len; confirm against the C prototype)
 *   r9             64-bit value stored in lanes 0-1 of the fourth state row
 *                  (presumably the counter; confirm against the C prototype)
 *   [rsp+0x28]     at entry: byte stored in lane 3 of the fourth state row
 *                  (presumably the flags; confirm against the C prototype)
 *
 * Clobbers: rax, r8, xmm0-xmm5, flags.
 *
 * Stack frame (120 bytes): seven 16-byte spill slots plus 8 bytes of padding.
 * rsp is 8 mod 16 at entry, so after the 120-byte adjustment it is 16-byte
 * aligned, which the movdqa spills require.
 *
 * xmm6-xmm15 are callee-saved in this ABI.  Besides xmm6-xmm9 the round loop
 * also uses xmm11 (rotate scratch) and xmm14/xmm15 (byte-rotate masks), so
 * all seven registers are spilled here and restored before returning.
 */
        .p2align 6
blake3_compress_in_place_sse41:
_blake3_compress_in_place_sse41:
        sub     rsp, 120
        movdqa  xmmword ptr [rsp], xmm6
        movdqa  xmmword ptr [rsp+0x10], xmm7
        movdqa  xmmword ptr [rsp+0x20], xmm8
        movdqa  xmmword ptr [rsp+0x30], xmm9
        movdqa  xmmword ptr [rsp+0x40], xmm11
        movdqa  xmmword ptr [rsp+0x50], xmm14
        movdqa  xmmword ptr [rsp+0x60], xmm15

        /* State rows: xmm0/xmm1 = chaining value, xmm2 = IV. */
        movups  xmm0, xmmword ptr [rcx]
        movups  xmm1, xmmword ptr [rcx+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]

        /* Fifth argument: 120 (frame) + 8 (return address) + 32 (shadow). */
        movzx   eax, byte ptr [rsp+0xA0]
        movzx   r8d, r8b
        shl     rax, 32
        add     r8, rax                         /* r8 = r8b | (stack byte << 32) */
        movq    xmm3, r9
        movq    xmm4, r8
        punpcklqdq xmm3, xmm4                   /* xmm3 = fourth state row */

        /* Load the block and split it into even/odd 32-bit words. */
        movups  xmm4, xmmword ptr [rdx]
        movups  xmm5, xmmword ptr [rdx+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136                 /* even words of bytes 0-31 */
        shufps  xmm8, xmm5, 221                 /* odd words of bytes 0-31 */
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rdx+0x20]
        movups  xmm7, xmmword ptr [rdx+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136                 /* even words of bytes 32-63 */
        pshufd  xmm6, xmm6, 0x93                /* pre-rotated for the diagonal step */
        shufps  xmm8, xmm7, 221                 /* odd words of bytes 32-63 */
        pshufd  xmm7, xmm8, 0x93

        movaps  xmm14, xmmword ptr [ROT8+rip]   /* pshufb mask: rotate right 8 */
        movaps  xmm15, xmmword ptr [ROT16+rip]  /* pshufb mask: rotate right 16 */
        mov     al, 7                           /* rounds remaining */
9:
        /* Column step, first half: message words in xmm4. */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11                     /* rotate right 12 */

        /* Column step, second half: message words in xmm5. */
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11                     /* rotate right 7 */

        /* Rotate the rows so the diagonals line up as columns. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39

        /* Diagonal step, first half: message words in xmm6. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11

        /* Diagonal step, second half: message words in xmm7. */
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11

        /* Undo the row rotation. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93

        dec     al
        jz      9f

        /* Rearrange the message words in xmm4-xmm7 for the next round. */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0xCC
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0xC0
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /* Fold the lower state half into the upper half and store it. */
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        movups  xmmword ptr [rcx], xmm0
        movups  xmmword ptr [rcx+0x10], xmm1

        movdqa  xmm6, xmmword ptr [rsp]
        movdqa  xmm7, xmmword ptr [rsp+0x10]
        movdqa  xmm8, xmmword ptr [rsp+0x20]
        movdqa  xmm9, xmmword ptr [rsp+0x30]
        movdqa  xmm11, xmmword ptr [rsp+0x40]
        movdqa  xmm14, xmmword ptr [rsp+0x50]
        movdqa  xmm15, xmmword ptr [rsp+0x60]
        add     rsp, 120
        ret
1911
+
1912
+
1913
/*
 * blake3_compress_xof_sse41 (also exported as _blake3_compress_xof_sse41)
 *
 * Runs seven rounds of the compression loop over one 64-byte message block
 * and writes the full 64-byte state to a separate output buffer.  The
 * chaining value is only read.
 *
 * ABI: Microsoft x64.
 *   rcx            pointer to the 32-byte chaining value (read only)
 *   rdx            pointer to the 64-byte message block (unaligned loads)
 *   r8b            byte stored in lane 2 of the fourth state row
 *                  (presumably block_len; confirm against the C prototype)
 *   r9             64-bit value stored in lanes 0-1 of the fourth state row
 *                  (presumably the counter; confirm against the C prototype)
 *   [rsp+0x28]     at entry: byte stored in lane 3 of the fourth state row
 *                  (presumably the flags; confirm against the C prototype)
 *   [rsp+0x30]     at entry: pointer to the 64-byte output buffer
 *                  (unaligned stores)
 *
 * Clobbers: rax, r8, r10, xmm0-xmm5, flags.
 *
 * Stack frame (120 bytes): seven 16-byte spill slots plus 8 bytes of padding.
 * rsp is 8 mod 16 at entry, so after the 120-byte adjustment it is 16-byte
 * aligned, which the movdqa spills require.
 *
 * xmm6-xmm15 are callee-saved in this ABI.  Besides xmm6-xmm9 the round loop
 * also uses xmm11 (rotate scratch) and xmm14/xmm15 (byte-rotate masks), so
 * all seven registers are spilled here and restored before returning.
 */
        .p2align 6
_blake3_compress_xof_sse41:
blake3_compress_xof_sse41:
        sub     rsp, 120
        movdqa  xmmword ptr [rsp], xmm6
        movdqa  xmmword ptr [rsp+0x10], xmm7
        movdqa  xmmword ptr [rsp+0x20], xmm8
        movdqa  xmmword ptr [rsp+0x30], xmm9
        movdqa  xmmword ptr [rsp+0x40], xmm11
        movdqa  xmmword ptr [rsp+0x50], xmm14
        movdqa  xmmword ptr [rsp+0x60], xmm15

        /* State rows: xmm0/xmm1 = chaining value, xmm2 = IV. */
        movups  xmm0, xmmword ptr [rcx]
        movups  xmm1, xmmword ptr [rcx+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]

        /*
         * Stack arguments sit above frame (120) + return address (8) +
         * shadow space (32): fifth at +0xA0, sixth at +0xA8.
         */
        movzx   eax, byte ptr [rsp+0xA0]
        movzx   r8d, r8b
        mov     r10, qword ptr [rsp+0xA8]       /* r10 = output pointer */
        shl     rax, 32
        add     r8, rax                         /* r8 = r8b | (stack byte << 32) */
        movq    xmm3, r9
        movq    xmm4, r8
        punpcklqdq xmm3, xmm4                   /* xmm3 = fourth state row */

        /* Load the block and split it into even/odd 32-bit words. */
        movups  xmm4, xmmword ptr [rdx]
        movups  xmm5, xmmword ptr [rdx+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136                 /* even words of bytes 0-31 */
        shufps  xmm8, xmm5, 221                 /* odd words of bytes 0-31 */
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rdx+0x20]
        movups  xmm7, xmmword ptr [rdx+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136                 /* even words of bytes 32-63 */
        pshufd  xmm6, xmm6, 0x93                /* pre-rotated for the diagonal step */
        shufps  xmm8, xmm7, 221                 /* odd words of bytes 32-63 */
        pshufd  xmm7, xmm8, 0x93

        movaps  xmm14, xmmword ptr [ROT8+rip]   /* pshufb mask: rotate right 8 */
        movaps  xmm15, xmmword ptr [ROT16+rip]  /* pshufb mask: rotate right 16 */
        mov     al, 7                           /* rounds remaining */
9:
        /* Column step, first half: message words in xmm4. */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11                     /* rotate right 12 */

        /* Column step, second half: message words in xmm5. */
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11                     /* rotate right 7 */

        /* Rotate the rows so the diagonals line up as columns. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39

        /* Diagonal step, first half: message words in xmm6. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11

        /* Diagonal step, second half: message words in xmm7. */
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11

        /* Undo the row rotation. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93

        dec     al
        jz      9f

        /* Rearrange the message words in xmm4-xmm7 for the next round. */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0xCC
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0xC0
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /*
         * Output bytes 0-31:  upper state half XOR lower state half.
         * Output bytes 32-63: lower state half XOR the input chaining value.
         */
        movdqu  xmm4, xmmword ptr [rcx]
        movdqu  xmm5, xmmword ptr [rcx+0x10]
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        pxor    xmm2, xmm4
        pxor    xmm3, xmm5
        movups  xmmword ptr [r10], xmm0
        movups  xmmword ptr [r10+0x10], xmm1
        movups  xmmword ptr [r10+0x20], xmm2
        movups  xmmword ptr [r10+0x30], xmm3

        movdqa  xmm6, xmmword ptr [rsp]
        movdqa  xmm7, xmmword ptr [rsp+0x10]
        movdqa  xmm8, xmmword ptr [rsp+0x20]
        movdqa  xmm9, xmmword ptr [rsp+0x30]
        movdqa  xmm11, xmmword ptr [rsp+0x40]
        movdqa  xmm14, xmmword ptr [rsp+0x50]
        movdqa  xmm15, xmmword ptr [rsp+0x60]
        add     rsp, 120
        ret
2031
+
2032
+
2033
/*
 * Read-only constant tables.  Every table is exactly 16 bytes and the first
 * one starts on a 64-byte boundary, so each label is 16-byte aligned and can
 * be used as an aligned xmmword memory operand.
 */
        .section .rodata
        .balign 64

/* Four 32-bit initialization words, loaded as the third state row. */
BLAKE3_IV:
        .long   0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A

/* pshufb mask: rotate every 32-bit lane right by 16 bits. */
ROT16:
        .byte    2,  3,  0,  1
        .byte    6,  7,  4,  5
        .byte   10, 11,  8,  9
        .byte   14, 15, 12, 13

/* pshufb mask: rotate every 32-bit lane right by 8 bits. */
ROT8:
        .byte    1,  2,  3,  0
        .byte    5,  6,  7,  4
        .byte    9, 10, 11,  8
        .byte   13, 14, 15, 12

/* Per-lane offsets 0..3, masked and added to the broadcast counter. */
ADD0:
        .long   0
        .long   1
        .long   2
        .long   3

/* Per-lane step of 4, masked and added to the counters after each batch. */
ADD1:
        .fill   4, 4, 4

/* Each initialization word broadcast to all four lanes. */
BLAKE3_IV_0:
        .fill   4, 4, 0x6A09E667
BLAKE3_IV_1:
        .fill   4, 4, 0xBB67AE85
BLAKE3_IV_2:
        .fill   4, 4, 0x3C6EF372
BLAKE3_IV_3:
        .fill   4, 4, 0xA54FF53A

/* The value 64 in every lane; lane 0 is also read as a single dword. */
BLAKE3_BLOCK_LEN:
        .fill   4, 4, 64

/*
 * Sign-bit mask.  XORing both operands with it before pcmpgtd makes the
 * signed compare order the lanes as unsigned values.
 */
CMP_MSB_MASK:
        .fill   4, 4, 0x80000000