digest-blake3 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,2011 @@
1
+ .intel_syntax noprefix /* GNU as: Intel operand order (dst, src), registers written without a '%' prefix */
2
+ .global blake3_hash_many_sse41 /* export the entry point under its plain name */
3
+ .global _blake3_hash_many_sse41 /* underscore-prefixed alias; presumably for targets whose C symbols carry a leading '_' (e.g. Mach-O) - confirm */
4
+ .global blake3_compress_in_place_sse41 /* exported here; its label is not within this part of the file */
5
+ .global _blake3_compress_in_place_sse41 /* underscore-prefixed alias of the above */
6
+ .global blake3_compress_xof_sse41 /* exported here; its label is not within this part of the file */
7
+ .global _blake3_compress_xof_sse41 /* underscore-prefixed alias of the above */
8
+ #ifdef __APPLE__ /* preprocessor switch: pick the section directive spelling per toolchain */
9
+ .text /* Apple build: switch to the code section with the bare .text directive */
10
+ #else
11
+ .section .text /* all other builds: name the code section explicitly */
12
+ #endif
13
+ .p2align 6 /* pad to a 2^6 = 64-byte boundary before the labels that follow */
14
+ _blake3_hash_many_sse41:
15
+ blake3_hash_many_sse41:
16
+ push r15
17
+ push r14
18
+ push r13
19
+ push r12
20
+ push rbx
21
+ push rbp
22
+ mov rbp, rsp
23
+ sub rsp, 360
24
+ and rsp, 0xFFFFFFFFFFFFFFC0
25
+ neg r9d
26
+ movd xmm0, r9d
27
+ pshufd xmm0, xmm0, 0x00
28
+ movdqa xmmword ptr [rsp+0x130], xmm0
29
+ movdqa xmm1, xmm0
30
+ pand xmm1, xmmword ptr [ADD0+rip]
31
+ pand xmm0, xmmword ptr [ADD1+rip]
32
+ movdqa xmmword ptr [rsp+0x150], xmm0
33
+ movd xmm0, r8d
34
+ pshufd xmm0, xmm0, 0x00
35
+ paddd xmm0, xmm1
36
+ movdqa xmmword ptr [rsp+0x110], xmm0
37
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
38
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
39
+ pcmpgtd xmm1, xmm0
40
+ shr r8, 32
41
+ movd xmm2, r8d
42
+ pshufd xmm2, xmm2, 0x00
43
+ psubd xmm2, xmm1
44
+ movdqa xmmword ptr [rsp+0x120], xmm2
45
+ mov rbx, qword ptr [rbp+0x50]
46
+ mov r15, rdx
47
+ shl r15, 6
48
+ movzx r13d, byte ptr [rbp+0x38]
49
+ movzx r12d, byte ptr [rbp+0x48]
50
+ cmp rsi, 4
51
+ jc 3f
52
+ 2:
53
+ movdqu xmm3, xmmword ptr [rcx]
54
+ pshufd xmm0, xmm3, 0x00
55
+ pshufd xmm1, xmm3, 0x55
56
+ pshufd xmm2, xmm3, 0xAA
57
+ pshufd xmm3, xmm3, 0xFF
58
+ movdqu xmm7, xmmword ptr [rcx+0x10]
59
+ pshufd xmm4, xmm7, 0x00
60
+ pshufd xmm5, xmm7, 0x55
61
+ pshufd xmm6, xmm7, 0xAA
62
+ pshufd xmm7, xmm7, 0xFF
63
+ mov r8, qword ptr [rdi]
64
+ mov r9, qword ptr [rdi+0x8]
65
+ mov r10, qword ptr [rdi+0x10]
66
+ mov r11, qword ptr [rdi+0x18]
67
+ movzx eax, byte ptr [rbp+0x40]
68
+ or eax, r13d
69
+ xor edx, edx
70
+ 9:
71
+ mov r14d, eax
72
+ or eax, r12d
73
+ add rdx, 64
74
+ cmp rdx, r15
75
+ cmovne eax, r14d
76
+ movdqu xmm8, xmmword ptr [r8+rdx-0x40]
77
+ movdqu xmm9, xmmword ptr [r9+rdx-0x40]
78
+ movdqu xmm10, xmmword ptr [r10+rdx-0x40]
79
+ movdqu xmm11, xmmword ptr [r11+rdx-0x40]
80
+ movdqa xmm12, xmm8
81
+ punpckldq xmm8, xmm9
82
+ punpckhdq xmm12, xmm9
83
+ movdqa xmm14, xmm10
84
+ punpckldq xmm10, xmm11
85
+ punpckhdq xmm14, xmm11
86
+ movdqa xmm9, xmm8
87
+ punpcklqdq xmm8, xmm10
88
+ punpckhqdq xmm9, xmm10
89
+ movdqa xmm13, xmm12
90
+ punpcklqdq xmm12, xmm14
91
+ punpckhqdq xmm13, xmm14
92
+ movdqa xmmword ptr [rsp], xmm8
93
+ movdqa xmmword ptr [rsp+0x10], xmm9
94
+ movdqa xmmword ptr [rsp+0x20], xmm12
95
+ movdqa xmmword ptr [rsp+0x30], xmm13
96
+ movdqu xmm8, xmmword ptr [r8+rdx-0x30]
97
+ movdqu xmm9, xmmword ptr [r9+rdx-0x30]
98
+ movdqu xmm10, xmmword ptr [r10+rdx-0x30]
99
+ movdqu xmm11, xmmword ptr [r11+rdx-0x30]
100
+ movdqa xmm12, xmm8
101
+ punpckldq xmm8, xmm9
102
+ punpckhdq xmm12, xmm9
103
+ movdqa xmm14, xmm10
104
+ punpckldq xmm10, xmm11
105
+ punpckhdq xmm14, xmm11
106
+ movdqa xmm9, xmm8
107
+ punpcklqdq xmm8, xmm10
108
+ punpckhqdq xmm9, xmm10
109
+ movdqa xmm13, xmm12
110
+ punpcklqdq xmm12, xmm14
111
+ punpckhqdq xmm13, xmm14
112
+ movdqa xmmword ptr [rsp+0x40], xmm8
113
+ movdqa xmmword ptr [rsp+0x50], xmm9
114
+ movdqa xmmword ptr [rsp+0x60], xmm12
115
+ movdqa xmmword ptr [rsp+0x70], xmm13
116
+ movdqu xmm8, xmmword ptr [r8+rdx-0x20]
117
+ movdqu xmm9, xmmword ptr [r9+rdx-0x20]
118
+ movdqu xmm10, xmmword ptr [r10+rdx-0x20]
119
+ movdqu xmm11, xmmword ptr [r11+rdx-0x20]
120
+ movdqa xmm12, xmm8
121
+ punpckldq xmm8, xmm9
122
+ punpckhdq xmm12, xmm9
123
+ movdqa xmm14, xmm10
124
+ punpckldq xmm10, xmm11
125
+ punpckhdq xmm14, xmm11
126
+ movdqa xmm9, xmm8
127
+ punpcklqdq xmm8, xmm10
128
+ punpckhqdq xmm9, xmm10
129
+ movdqa xmm13, xmm12
130
+ punpcklqdq xmm12, xmm14
131
+ punpckhqdq xmm13, xmm14
132
+ movdqa xmmword ptr [rsp+0x80], xmm8
133
+ movdqa xmmword ptr [rsp+0x90], xmm9
134
+ movdqa xmmword ptr [rsp+0xA0], xmm12
135
+ movdqa xmmword ptr [rsp+0xB0], xmm13
136
+ movdqu xmm8, xmmword ptr [r8+rdx-0x10]
137
+ movdqu xmm9, xmmword ptr [r9+rdx-0x10]
138
+ movdqu xmm10, xmmword ptr [r10+rdx-0x10]
139
+ movdqu xmm11, xmmword ptr [r11+rdx-0x10]
140
+ movdqa xmm12, xmm8
141
+ punpckldq xmm8, xmm9
142
+ punpckhdq xmm12, xmm9
143
+ movdqa xmm14, xmm10
144
+ punpckldq xmm10, xmm11
145
+ punpckhdq xmm14, xmm11
146
+ movdqa xmm9, xmm8
147
+ punpcklqdq xmm8, xmm10
148
+ punpckhqdq xmm9, xmm10
149
+ movdqa xmm13, xmm12
150
+ punpcklqdq xmm12, xmm14
151
+ punpckhqdq xmm13, xmm14
152
+ movdqa xmmword ptr [rsp+0xC0], xmm8
153
+ movdqa xmmword ptr [rsp+0xD0], xmm9
154
+ movdqa xmmword ptr [rsp+0xE0], xmm12
155
+ movdqa xmmword ptr [rsp+0xF0], xmm13
156
+ movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
157
+ movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
158
+ movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
159
+ movdqa xmm12, xmmword ptr [rsp+0x110]
160
+ movdqa xmm13, xmmword ptr [rsp+0x120]
161
+ movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
162
+ movd xmm15, eax
163
+ pshufd xmm15, xmm15, 0x00
164
+ prefetcht0 [r8+rdx+0x80]
165
+ prefetcht0 [r9+rdx+0x80]
166
+ prefetcht0 [r10+rdx+0x80]
167
+ prefetcht0 [r11+rdx+0x80]
168
+ paddd xmm0, xmmword ptr [rsp]
169
+ paddd xmm1, xmmword ptr [rsp+0x20]
170
+ paddd xmm2, xmmword ptr [rsp+0x40]
171
+ paddd xmm3, xmmword ptr [rsp+0x60]
172
+ paddd xmm0, xmm4
173
+ paddd xmm1, xmm5
174
+ paddd xmm2, xmm6
175
+ paddd xmm3, xmm7
176
+ pxor xmm12, xmm0
177
+ pxor xmm13, xmm1
178
+ pxor xmm14, xmm2
179
+ pxor xmm15, xmm3
180
+ movdqa xmm8, xmmword ptr [ROT16+rip]
181
+ pshufb xmm12, xmm8
182
+ pshufb xmm13, xmm8
183
+ pshufb xmm14, xmm8
184
+ pshufb xmm15, xmm8
185
+ movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
186
+ paddd xmm8, xmm12
187
+ paddd xmm9, xmm13
188
+ paddd xmm10, xmm14
189
+ paddd xmm11, xmm15
190
+ pxor xmm4, xmm8
191
+ pxor xmm5, xmm9
192
+ pxor xmm6, xmm10
193
+ pxor xmm7, xmm11
194
+ movdqa xmmword ptr [rsp+0x100], xmm8
195
+ movdqa xmm8, xmm4
196
+ psrld xmm8, 12
197
+ pslld xmm4, 20
198
+ por xmm4, xmm8
199
+ movdqa xmm8, xmm5
200
+ psrld xmm8, 12
201
+ pslld xmm5, 20
202
+ por xmm5, xmm8
203
+ movdqa xmm8, xmm6
204
+ psrld xmm8, 12
205
+ pslld xmm6, 20
206
+ por xmm6, xmm8
207
+ movdqa xmm8, xmm7
208
+ psrld xmm8, 12
209
+ pslld xmm7, 20
210
+ por xmm7, xmm8
211
+ paddd xmm0, xmmword ptr [rsp+0x10]
212
+ paddd xmm1, xmmword ptr [rsp+0x30]
213
+ paddd xmm2, xmmword ptr [rsp+0x50]
214
+ paddd xmm3, xmmword ptr [rsp+0x70]
215
+ paddd xmm0, xmm4
216
+ paddd xmm1, xmm5
217
+ paddd xmm2, xmm6
218
+ paddd xmm3, xmm7
219
+ pxor xmm12, xmm0
220
+ pxor xmm13, xmm1
221
+ pxor xmm14, xmm2
222
+ pxor xmm15, xmm3
223
+ movdqa xmm8, xmmword ptr [ROT8+rip]
224
+ pshufb xmm12, xmm8
225
+ pshufb xmm13, xmm8
226
+ pshufb xmm14, xmm8
227
+ pshufb xmm15, xmm8
228
+ movdqa xmm8, xmmword ptr [rsp+0x100]
229
+ paddd xmm8, xmm12
230
+ paddd xmm9, xmm13
231
+ paddd xmm10, xmm14
232
+ paddd xmm11, xmm15
233
+ pxor xmm4, xmm8
234
+ pxor xmm5, xmm9
235
+ pxor xmm6, xmm10
236
+ pxor xmm7, xmm11
237
+ movdqa xmmword ptr [rsp+0x100], xmm8
238
+ movdqa xmm8, xmm4
239
+ psrld xmm8, 7
240
+ pslld xmm4, 25
241
+ por xmm4, xmm8
242
+ movdqa xmm8, xmm5
243
+ psrld xmm8, 7
244
+ pslld xmm5, 25
245
+ por xmm5, xmm8
246
+ movdqa xmm8, xmm6
247
+ psrld xmm8, 7
248
+ pslld xmm6, 25
249
+ por xmm6, xmm8
250
+ movdqa xmm8, xmm7
251
+ psrld xmm8, 7
252
+ pslld xmm7, 25
253
+ por xmm7, xmm8
254
+ paddd xmm0, xmmword ptr [rsp+0x80]
255
+ paddd xmm1, xmmword ptr [rsp+0xA0]
256
+ paddd xmm2, xmmword ptr [rsp+0xC0]
257
+ paddd xmm3, xmmword ptr [rsp+0xE0]
258
+ paddd xmm0, xmm5
259
+ paddd xmm1, xmm6
260
+ paddd xmm2, xmm7
261
+ paddd xmm3, xmm4
262
+ pxor xmm15, xmm0
263
+ pxor xmm12, xmm1
264
+ pxor xmm13, xmm2
265
+ pxor xmm14, xmm3
266
+ movdqa xmm8, xmmword ptr [ROT16+rip]
267
+ pshufb xmm15, xmm8
268
+ pshufb xmm12, xmm8
269
+ pshufb xmm13, xmm8
270
+ pshufb xmm14, xmm8
271
+ paddd xmm10, xmm15
272
+ paddd xmm11, xmm12
273
+ movdqa xmm8, xmmword ptr [rsp+0x100]
274
+ paddd xmm8, xmm13
275
+ paddd xmm9, xmm14
276
+ pxor xmm5, xmm10
277
+ pxor xmm6, xmm11
278
+ pxor xmm7, xmm8
279
+ pxor xmm4, xmm9
280
+ movdqa xmmword ptr [rsp+0x100], xmm8
281
+ movdqa xmm8, xmm5
282
+ psrld xmm8, 12
283
+ pslld xmm5, 20
284
+ por xmm5, xmm8
285
+ movdqa xmm8, xmm6
286
+ psrld xmm8, 12
287
+ pslld xmm6, 20
288
+ por xmm6, xmm8
289
+ movdqa xmm8, xmm7
290
+ psrld xmm8, 12
291
+ pslld xmm7, 20
292
+ por xmm7, xmm8
293
+ movdqa xmm8, xmm4
294
+ psrld xmm8, 12
295
+ pslld xmm4, 20
296
+ por xmm4, xmm8
297
+ paddd xmm0, xmmword ptr [rsp+0x90]
298
+ paddd xmm1, xmmword ptr [rsp+0xB0]
299
+ paddd xmm2, xmmword ptr [rsp+0xD0]
300
+ paddd xmm3, xmmword ptr [rsp+0xF0]
301
+ paddd xmm0, xmm5
302
+ paddd xmm1, xmm6
303
+ paddd xmm2, xmm7
304
+ paddd xmm3, xmm4
305
+ pxor xmm15, xmm0
306
+ pxor xmm12, xmm1
307
+ pxor xmm13, xmm2
308
+ pxor xmm14, xmm3
309
+ movdqa xmm8, xmmword ptr [ROT8+rip]
310
+ pshufb xmm15, xmm8
311
+ pshufb xmm12, xmm8
312
+ pshufb xmm13, xmm8
313
+ pshufb xmm14, xmm8
314
+ paddd xmm10, xmm15
315
+ paddd xmm11, xmm12
316
+ movdqa xmm8, xmmword ptr [rsp+0x100]
317
+ paddd xmm8, xmm13
318
+ paddd xmm9, xmm14
319
+ pxor xmm5, xmm10
320
+ pxor xmm6, xmm11
321
+ pxor xmm7, xmm8
322
+ pxor xmm4, xmm9
323
+ movdqa xmmword ptr [rsp+0x100], xmm8
324
+ movdqa xmm8, xmm5
325
+ psrld xmm8, 7
326
+ pslld xmm5, 25
327
+ por xmm5, xmm8
328
+ movdqa xmm8, xmm6
329
+ psrld xmm8, 7
330
+ pslld xmm6, 25
331
+ por xmm6, xmm8
332
+ movdqa xmm8, xmm7
333
+ psrld xmm8, 7
334
+ pslld xmm7, 25
335
+ por xmm7, xmm8
336
+ movdqa xmm8, xmm4
337
+ psrld xmm8, 7
338
+ pslld xmm4, 25
339
+ por xmm4, xmm8
340
+ paddd xmm0, xmmword ptr [rsp+0x20]
341
+ paddd xmm1, xmmword ptr [rsp+0x30]
342
+ paddd xmm2, xmmword ptr [rsp+0x70]
343
+ paddd xmm3, xmmword ptr [rsp+0x40]
344
+ paddd xmm0, xmm4
345
+ paddd xmm1, xmm5
346
+ paddd xmm2, xmm6
347
+ paddd xmm3, xmm7
348
+ pxor xmm12, xmm0
349
+ pxor xmm13, xmm1
350
+ pxor xmm14, xmm2
351
+ pxor xmm15, xmm3
352
+ movdqa xmm8, xmmword ptr [ROT16+rip]
353
+ pshufb xmm12, xmm8
354
+ pshufb xmm13, xmm8
355
+ pshufb xmm14, xmm8
356
+ pshufb xmm15, xmm8
357
+ movdqa xmm8, xmmword ptr [rsp+0x100]
358
+ paddd xmm8, xmm12
359
+ paddd xmm9, xmm13
360
+ paddd xmm10, xmm14
361
+ paddd xmm11, xmm15
362
+ pxor xmm4, xmm8
363
+ pxor xmm5, xmm9
364
+ pxor xmm6, xmm10
365
+ pxor xmm7, xmm11
366
+ movdqa xmmword ptr [rsp+0x100], xmm8
367
+ movdqa xmm8, xmm4
368
+ psrld xmm8, 12
369
+ pslld xmm4, 20
370
+ por xmm4, xmm8
371
+ movdqa xmm8, xmm5
372
+ psrld xmm8, 12
373
+ pslld xmm5, 20
374
+ por xmm5, xmm8
375
+ movdqa xmm8, xmm6
376
+ psrld xmm8, 12
377
+ pslld xmm6, 20
378
+ por xmm6, xmm8
379
+ movdqa xmm8, xmm7
380
+ psrld xmm8, 12
381
+ pslld xmm7, 20
382
+ por xmm7, xmm8
383
+ paddd xmm0, xmmword ptr [rsp+0x60]
384
+ paddd xmm1, xmmword ptr [rsp+0xA0]
385
+ paddd xmm2, xmmword ptr [rsp]
386
+ paddd xmm3, xmmword ptr [rsp+0xD0]
387
+ paddd xmm0, xmm4
388
+ paddd xmm1, xmm5
389
+ paddd xmm2, xmm6
390
+ paddd xmm3, xmm7
391
+ pxor xmm12, xmm0
392
+ pxor xmm13, xmm1
393
+ pxor xmm14, xmm2
394
+ pxor xmm15, xmm3
395
+ movdqa xmm8, xmmword ptr [ROT8+rip]
396
+ pshufb xmm12, xmm8
397
+ pshufb xmm13, xmm8
398
+ pshufb xmm14, xmm8
399
+ pshufb xmm15, xmm8
400
+ movdqa xmm8, xmmword ptr [rsp+0x100]
401
+ paddd xmm8, xmm12
402
+ paddd xmm9, xmm13
403
+ paddd xmm10, xmm14
404
+ paddd xmm11, xmm15
405
+ pxor xmm4, xmm8
406
+ pxor xmm5, xmm9
407
+ pxor xmm6, xmm10
408
+ pxor xmm7, xmm11
409
+ movdqa xmmword ptr [rsp+0x100], xmm8
410
+ movdqa xmm8, xmm4
411
+ psrld xmm8, 7
412
+ pslld xmm4, 25
413
+ por xmm4, xmm8
414
+ movdqa xmm8, xmm5
415
+ psrld xmm8, 7
416
+ pslld xmm5, 25
417
+ por xmm5, xmm8
418
+ movdqa xmm8, xmm6
419
+ psrld xmm8, 7
420
+ pslld xmm6, 25
421
+ por xmm6, xmm8
422
+ movdqa xmm8, xmm7
423
+ psrld xmm8, 7
424
+ pslld xmm7, 25
425
+ por xmm7, xmm8
426
+ paddd xmm0, xmmword ptr [rsp+0x10]
427
+ paddd xmm1, xmmword ptr [rsp+0xC0]
428
+ paddd xmm2, xmmword ptr [rsp+0x90]
429
+ paddd xmm3, xmmword ptr [rsp+0xF0]
430
+ paddd xmm0, xmm5
431
+ paddd xmm1, xmm6
432
+ paddd xmm2, xmm7
433
+ paddd xmm3, xmm4
434
+ pxor xmm15, xmm0
435
+ pxor xmm12, xmm1
436
+ pxor xmm13, xmm2
437
+ pxor xmm14, xmm3
438
+ movdqa xmm8, xmmword ptr [ROT16+rip]
439
+ pshufb xmm15, xmm8
440
+ pshufb xmm12, xmm8
441
+ pshufb xmm13, xmm8
442
+ pshufb xmm14, xmm8
443
+ paddd xmm10, xmm15
444
+ paddd xmm11, xmm12
445
+ movdqa xmm8, xmmword ptr [rsp+0x100]
446
+ paddd xmm8, xmm13
447
+ paddd xmm9, xmm14
448
+ pxor xmm5, xmm10
449
+ pxor xmm6, xmm11
450
+ pxor xmm7, xmm8
451
+ pxor xmm4, xmm9
452
+ movdqa xmmword ptr [rsp+0x100], xmm8
453
+ movdqa xmm8, xmm5
454
+ psrld xmm8, 12
455
+ pslld xmm5, 20
456
+ por xmm5, xmm8
457
+ movdqa xmm8, xmm6
458
+ psrld xmm8, 12
459
+ pslld xmm6, 20
460
+ por xmm6, xmm8
461
+ movdqa xmm8, xmm7
462
+ psrld xmm8, 12
463
+ pslld xmm7, 20
464
+ por xmm7, xmm8
465
+ movdqa xmm8, xmm4
466
+ psrld xmm8, 12
467
+ pslld xmm4, 20
468
+ por xmm4, xmm8
469
+ paddd xmm0, xmmword ptr [rsp+0xB0]
470
+ paddd xmm1, xmmword ptr [rsp+0x50]
471
+ paddd xmm2, xmmword ptr [rsp+0xE0]
472
+ paddd xmm3, xmmword ptr [rsp+0x80]
473
+ paddd xmm0, xmm5
474
+ paddd xmm1, xmm6
475
+ paddd xmm2, xmm7
476
+ paddd xmm3, xmm4
477
+ pxor xmm15, xmm0
478
+ pxor xmm12, xmm1
479
+ pxor xmm13, xmm2
480
+ pxor xmm14, xmm3
481
+ movdqa xmm8, xmmword ptr [ROT8+rip]
482
+ pshufb xmm15, xmm8
483
+ pshufb xmm12, xmm8
484
+ pshufb xmm13, xmm8
485
+ pshufb xmm14, xmm8
486
+ paddd xmm10, xmm15
487
+ paddd xmm11, xmm12
488
+ movdqa xmm8, xmmword ptr [rsp+0x100]
489
+ paddd xmm8, xmm13
490
+ paddd xmm9, xmm14
491
+ pxor xmm5, xmm10
492
+ pxor xmm6, xmm11
493
+ pxor xmm7, xmm8
494
+ pxor xmm4, xmm9
495
+ movdqa xmmword ptr [rsp+0x100], xmm8
496
+ movdqa xmm8, xmm5
497
+ psrld xmm8, 7
498
+ pslld xmm5, 25
499
+ por xmm5, xmm8
500
+ movdqa xmm8, xmm6
501
+ psrld xmm8, 7
502
+ pslld xmm6, 25
503
+ por xmm6, xmm8
504
+ movdqa xmm8, xmm7
505
+ psrld xmm8, 7
506
+ pslld xmm7, 25
507
+ por xmm7, xmm8
508
+ movdqa xmm8, xmm4
509
+ psrld xmm8, 7
510
+ pslld xmm4, 25
511
+ por xmm4, xmm8
512
+ paddd xmm0, xmmword ptr [rsp+0x30]
513
+ paddd xmm1, xmmword ptr [rsp+0xA0]
514
+ paddd xmm2, xmmword ptr [rsp+0xD0]
515
+ paddd xmm3, xmmword ptr [rsp+0x70]
516
+ paddd xmm0, xmm4
517
+ paddd xmm1, xmm5
518
+ paddd xmm2, xmm6
519
+ paddd xmm3, xmm7
520
+ pxor xmm12, xmm0
521
+ pxor xmm13, xmm1
522
+ pxor xmm14, xmm2
523
+ pxor xmm15, xmm3
524
+ movdqa xmm8, xmmword ptr [ROT16+rip]
525
+ pshufb xmm12, xmm8
526
+ pshufb xmm13, xmm8
527
+ pshufb xmm14, xmm8
528
+ pshufb xmm15, xmm8
529
+ movdqa xmm8, xmmword ptr [rsp+0x100]
530
+ paddd xmm8, xmm12
531
+ paddd xmm9, xmm13
532
+ paddd xmm10, xmm14
533
+ paddd xmm11, xmm15
534
+ pxor xmm4, xmm8
535
+ pxor xmm5, xmm9
536
+ pxor xmm6, xmm10
537
+ pxor xmm7, xmm11
538
+ movdqa xmmword ptr [rsp+0x100], xmm8
539
+ movdqa xmm8, xmm4
540
+ psrld xmm8, 12
541
+ pslld xmm4, 20
542
+ por xmm4, xmm8
543
+ movdqa xmm8, xmm5
544
+ psrld xmm8, 12
545
+ pslld xmm5, 20
546
+ por xmm5, xmm8
547
+ movdqa xmm8, xmm6
548
+ psrld xmm8, 12
549
+ pslld xmm6, 20
550
+ por xmm6, xmm8
551
+ movdqa xmm8, xmm7
552
+ psrld xmm8, 12
553
+ pslld xmm7, 20
554
+ por xmm7, xmm8
555
+ paddd xmm0, xmmword ptr [rsp+0x40]
556
+ paddd xmm1, xmmword ptr [rsp+0xC0]
557
+ paddd xmm2, xmmword ptr [rsp+0x20]
558
+ paddd xmm3, xmmword ptr [rsp+0xE0]
559
+ paddd xmm0, xmm4
560
+ paddd xmm1, xmm5
561
+ paddd xmm2, xmm6
562
+ paddd xmm3, xmm7
563
+ pxor xmm12, xmm0
564
+ pxor xmm13, xmm1
565
+ pxor xmm14, xmm2
566
+ pxor xmm15, xmm3
567
+ movdqa xmm8, xmmword ptr [ROT8+rip]
568
+ pshufb xmm12, xmm8
569
+ pshufb xmm13, xmm8
570
+ pshufb xmm14, xmm8
571
+ pshufb xmm15, xmm8
572
+ movdqa xmm8, xmmword ptr [rsp+0x100]
573
+ paddd xmm8, xmm12
574
+ paddd xmm9, xmm13
575
+ paddd xmm10, xmm14
576
+ paddd xmm11, xmm15
577
+ pxor xmm4, xmm8
578
+ pxor xmm5, xmm9
579
+ pxor xmm6, xmm10
580
+ pxor xmm7, xmm11
581
+ movdqa xmmword ptr [rsp+0x100], xmm8
582
+ movdqa xmm8, xmm4
583
+ psrld xmm8, 7
584
+ pslld xmm4, 25
585
+ por xmm4, xmm8
586
+ movdqa xmm8, xmm5
587
+ psrld xmm8, 7
588
+ pslld xmm5, 25
589
+ por xmm5, xmm8
590
+ movdqa xmm8, xmm6
591
+ psrld xmm8, 7
592
+ pslld xmm6, 25
593
+ por xmm6, xmm8
594
+ movdqa xmm8, xmm7
595
+ psrld xmm8, 7
596
+ pslld xmm7, 25
597
+ por xmm7, xmm8
598
+ paddd xmm0, xmmword ptr [rsp+0x60]
599
+ paddd xmm1, xmmword ptr [rsp+0x90]
600
+ paddd xmm2, xmmword ptr [rsp+0xB0]
601
+ paddd xmm3, xmmword ptr [rsp+0x80]
602
+ paddd xmm0, xmm5
603
+ paddd xmm1, xmm6
604
+ paddd xmm2, xmm7
605
+ paddd xmm3, xmm4
606
+ pxor xmm15, xmm0
607
+ pxor xmm12, xmm1
608
+ pxor xmm13, xmm2
609
+ pxor xmm14, xmm3
610
+ movdqa xmm8, xmmword ptr [ROT16+rip]
611
+ pshufb xmm15, xmm8
612
+ pshufb xmm12, xmm8
613
+ pshufb xmm13, xmm8
614
+ pshufb xmm14, xmm8
615
+ paddd xmm10, xmm15
616
+ paddd xmm11, xmm12
617
+ movdqa xmm8, xmmword ptr [rsp+0x100]
618
+ paddd xmm8, xmm13
619
+ paddd xmm9, xmm14
620
+ pxor xmm5, xmm10
621
+ pxor xmm6, xmm11
622
+ pxor xmm7, xmm8
623
+ pxor xmm4, xmm9
624
+ movdqa xmmword ptr [rsp+0x100], xmm8
625
+ movdqa xmm8, xmm5
626
+ psrld xmm8, 12
627
+ pslld xmm5, 20
628
+ por xmm5, xmm8
629
+ movdqa xmm8, xmm6
630
+ psrld xmm8, 12
631
+ pslld xmm6, 20
632
+ por xmm6, xmm8
633
+ movdqa xmm8, xmm7
634
+ psrld xmm8, 12
635
+ pslld xmm7, 20
636
+ por xmm7, xmm8
637
+ movdqa xmm8, xmm4
638
+ psrld xmm8, 12
639
+ pslld xmm4, 20
640
+ por xmm4, xmm8
641
+ paddd xmm0, xmmword ptr [rsp+0x50]
642
+ paddd xmm1, xmmword ptr [rsp]
643
+ paddd xmm2, xmmword ptr [rsp+0xF0]
644
+ paddd xmm3, xmmword ptr [rsp+0x10]
645
+ paddd xmm0, xmm5
646
+ paddd xmm1, xmm6
647
+ paddd xmm2, xmm7
648
+ paddd xmm3, xmm4
649
+ pxor xmm15, xmm0
650
+ pxor xmm12, xmm1
651
+ pxor xmm13, xmm2
652
+ pxor xmm14, xmm3
653
+ movdqa xmm8, xmmword ptr [ROT8+rip]
654
+ pshufb xmm15, xmm8
655
+ pshufb xmm12, xmm8
656
+ pshufb xmm13, xmm8
657
+ pshufb xmm14, xmm8
658
+ paddd xmm10, xmm15
659
+ paddd xmm11, xmm12
660
+ movdqa xmm8, xmmword ptr [rsp+0x100]
661
+ paddd xmm8, xmm13
662
+ paddd xmm9, xmm14
663
+ pxor xmm5, xmm10
664
+ pxor xmm6, xmm11
665
+ pxor xmm7, xmm8
666
+ pxor xmm4, xmm9
667
+ movdqa xmmword ptr [rsp+0x100], xmm8
668
+ movdqa xmm8, xmm5
669
+ psrld xmm8, 7
670
+ pslld xmm5, 25
671
+ por xmm5, xmm8
672
+ movdqa xmm8, xmm6
673
+ psrld xmm8, 7
674
+ pslld xmm6, 25
675
+ por xmm6, xmm8
676
+ movdqa xmm8, xmm7
677
+ psrld xmm8, 7
678
+ pslld xmm7, 25
679
+ por xmm7, xmm8
680
+ movdqa xmm8, xmm4
681
+ psrld xmm8, 7
682
+ pslld xmm4, 25
683
+ por xmm4, xmm8
684
+ paddd xmm0, xmmword ptr [rsp+0xA0]
685
+ paddd xmm1, xmmword ptr [rsp+0xC0]
686
+ paddd xmm2, xmmword ptr [rsp+0xE0]
687
+ paddd xmm3, xmmword ptr [rsp+0xD0]
688
+ paddd xmm0, xmm4
689
+ paddd xmm1, xmm5
690
+ paddd xmm2, xmm6
691
+ paddd xmm3, xmm7
692
+ pxor xmm12, xmm0
693
+ pxor xmm13, xmm1
694
+ pxor xmm14, xmm2
695
+ pxor xmm15, xmm3
696
+ movdqa xmm8, xmmword ptr [ROT16+rip]
697
+ pshufb xmm12, xmm8
698
+ pshufb xmm13, xmm8
699
+ pshufb xmm14, xmm8
700
+ pshufb xmm15, xmm8
701
+ movdqa xmm8, xmmword ptr [rsp+0x100]
702
+ paddd xmm8, xmm12
703
+ paddd xmm9, xmm13
704
+ paddd xmm10, xmm14
705
+ paddd xmm11, xmm15
706
+ pxor xmm4, xmm8
707
+ pxor xmm5, xmm9
708
+ pxor xmm6, xmm10
709
+ pxor xmm7, xmm11
710
+ movdqa xmmword ptr [rsp+0x100], xmm8
711
+ movdqa xmm8, xmm4
712
+ psrld xmm8, 12
713
+ pslld xmm4, 20
714
+ por xmm4, xmm8
715
+ movdqa xmm8, xmm5
716
+ psrld xmm8, 12
717
+ pslld xmm5, 20
718
+ por xmm5, xmm8
719
+ movdqa xmm8, xmm6
720
+ psrld xmm8, 12
721
+ pslld xmm6, 20
722
+ por xmm6, xmm8
723
+ movdqa xmm8, xmm7
724
+ psrld xmm8, 12
725
+ pslld xmm7, 20
726
+ por xmm7, xmm8
727
+ paddd xmm0, xmmword ptr [rsp+0x70]
728
+ paddd xmm1, xmmword ptr [rsp+0x90]
729
+ paddd xmm2, xmmword ptr [rsp+0x30]
730
+ paddd xmm3, xmmword ptr [rsp+0xF0]
731
+ paddd xmm0, xmm4
732
+ paddd xmm1, xmm5
733
+ paddd xmm2, xmm6
734
+ paddd xmm3, xmm7
735
+ pxor xmm12, xmm0
736
+ pxor xmm13, xmm1
737
+ pxor xmm14, xmm2
738
+ pxor xmm15, xmm3
739
+ movdqa xmm8, xmmword ptr [ROT8+rip]
740
+ pshufb xmm12, xmm8
741
+ pshufb xmm13, xmm8
742
+ pshufb xmm14, xmm8
743
+ pshufb xmm15, xmm8
744
+ movdqa xmm8, xmmword ptr [rsp+0x100]
745
+ paddd xmm8, xmm12
746
+ paddd xmm9, xmm13
747
+ paddd xmm10, xmm14
748
+ paddd xmm11, xmm15
749
+ pxor xmm4, xmm8
750
+ pxor xmm5, xmm9
751
+ pxor xmm6, xmm10
752
+ pxor xmm7, xmm11
753
+ movdqa xmmword ptr [rsp+0x100], xmm8
754
+ movdqa xmm8, xmm4
755
+ psrld xmm8, 7
756
+ pslld xmm4, 25
757
+ por xmm4, xmm8
758
+ movdqa xmm8, xmm5
759
+ psrld xmm8, 7
760
+ pslld xmm5, 25
761
+ por xmm5, xmm8
762
+ movdqa xmm8, xmm6
763
+ psrld xmm8, 7
764
+ pslld xmm6, 25
765
+ por xmm6, xmm8
766
+ movdqa xmm8, xmm7
767
+ psrld xmm8, 7
768
+ pslld xmm7, 25
769
+ por xmm7, xmm8
770
+ paddd xmm0, xmmword ptr [rsp+0x40]
771
+ paddd xmm1, xmmword ptr [rsp+0xB0]
772
+ paddd xmm2, xmmword ptr [rsp+0x50]
773
+ paddd xmm3, xmmword ptr [rsp+0x10]
774
+ paddd xmm0, xmm5
775
+ paddd xmm1, xmm6
776
+ paddd xmm2, xmm7
777
+ paddd xmm3, xmm4
778
+ pxor xmm15, xmm0
779
+ pxor xmm12, xmm1
780
+ pxor xmm13, xmm2
781
+ pxor xmm14, xmm3
782
+ movdqa xmm8, xmmword ptr [ROT16+rip]
783
+ pshufb xmm15, xmm8
784
+ pshufb xmm12, xmm8
785
+ pshufb xmm13, xmm8
786
+ pshufb xmm14, xmm8
787
+ paddd xmm10, xmm15
788
+ paddd xmm11, xmm12
789
+ movdqa xmm8, xmmword ptr [rsp+0x100]
790
+ paddd xmm8, xmm13
791
+ paddd xmm9, xmm14
792
+ pxor xmm5, xmm10
793
+ pxor xmm6, xmm11
794
+ pxor xmm7, xmm8
795
+ pxor xmm4, xmm9
796
+ movdqa xmmword ptr [rsp+0x100], xmm8
797
+ movdqa xmm8, xmm5
798
+ psrld xmm8, 12
799
+ pslld xmm5, 20
800
+ por xmm5, xmm8
801
+ movdqa xmm8, xmm6
802
+ psrld xmm8, 12
803
+ pslld xmm6, 20
804
+ por xmm6, xmm8
805
+ movdqa xmm8, xmm7
806
+ psrld xmm8, 12
807
+ pslld xmm7, 20
808
+ por xmm7, xmm8
809
+ movdqa xmm8, xmm4
810
+ psrld xmm8, 12
811
+ pslld xmm4, 20
812
+ por xmm4, xmm8
813
+ paddd xmm0, xmmword ptr [rsp]
814
+ paddd xmm1, xmmword ptr [rsp+0x20]
815
+ paddd xmm2, xmmword ptr [rsp+0x80]
816
+ paddd xmm3, xmmword ptr [rsp+0x60]
817
+ paddd xmm0, xmm5
818
+ paddd xmm1, xmm6
819
+ paddd xmm2, xmm7
820
+ paddd xmm3, xmm4
821
+ pxor xmm15, xmm0
822
+ pxor xmm12, xmm1
823
+ pxor xmm13, xmm2
824
+ pxor xmm14, xmm3
825
+ movdqa xmm8, xmmword ptr [ROT8+rip]
826
+ pshufb xmm15, xmm8
827
+ pshufb xmm12, xmm8
828
+ pshufb xmm13, xmm8
829
+ pshufb xmm14, xmm8
830
+ paddd xmm10, xmm15
831
+ paddd xmm11, xmm12
832
+ movdqa xmm8, xmmword ptr [rsp+0x100]
833
+ paddd xmm8, xmm13
834
+ paddd xmm9, xmm14
835
+ pxor xmm5, xmm10
836
+ pxor xmm6, xmm11
837
+ pxor xmm7, xmm8
838
+ pxor xmm4, xmm9
839
+ movdqa xmmword ptr [rsp+0x100], xmm8
840
+ movdqa xmm8, xmm5
841
+ psrld xmm8, 7
842
+ pslld xmm5, 25
843
+ por xmm5, xmm8
844
+ movdqa xmm8, xmm6
845
+ psrld xmm8, 7
846
+ pslld xmm6, 25
847
+ por xmm6, xmm8
848
+ movdqa xmm8, xmm7
849
+ psrld xmm8, 7
850
+ pslld xmm7, 25
851
+ por xmm7, xmm8
852
+ movdqa xmm8, xmm4
853
+ psrld xmm8, 7
854
+ pslld xmm4, 25
855
+ por xmm4, xmm8
856
+ paddd xmm0, xmmword ptr [rsp+0xC0]
857
+ paddd xmm1, xmmword ptr [rsp+0x90]
858
+ paddd xmm2, xmmword ptr [rsp+0xF0]
859
+ paddd xmm3, xmmword ptr [rsp+0xE0]
860
+ paddd xmm0, xmm4
861
+ paddd xmm1, xmm5
862
+ paddd xmm2, xmm6
863
+ paddd xmm3, xmm7
864
+ pxor xmm12, xmm0
865
+ pxor xmm13, xmm1
866
+ pxor xmm14, xmm2
867
+ pxor xmm15, xmm3
868
+ movdqa xmm8, xmmword ptr [ROT16+rip]
869
+ pshufb xmm12, xmm8
870
+ pshufb xmm13, xmm8
871
+ pshufb xmm14, xmm8
872
+ pshufb xmm15, xmm8
873
+ movdqa xmm8, xmmword ptr [rsp+0x100]
874
+ paddd xmm8, xmm12
875
+ paddd xmm9, xmm13
876
+ paddd xmm10, xmm14
877
+ paddd xmm11, xmm15
878
+ pxor xmm4, xmm8
879
+ pxor xmm5, xmm9
880
+ pxor xmm6, xmm10
881
+ pxor xmm7, xmm11
882
+ movdqa xmmword ptr [rsp+0x100], xmm8
883
+ movdqa xmm8, xmm4
884
+ psrld xmm8, 12
885
+ pslld xmm4, 20
886
+ por xmm4, xmm8
887
+ movdqa xmm8, xmm5
888
+ psrld xmm8, 12
889
+ pslld xmm5, 20
890
+ por xmm5, xmm8
891
+ movdqa xmm8, xmm6
892
+ psrld xmm8, 12
893
+ pslld xmm6, 20
894
+ por xmm6, xmm8
895
+ movdqa xmm8, xmm7
896
+ psrld xmm8, 12
897
+ pslld xmm7, 20
898
+ por xmm7, xmm8
899
+ paddd xmm0, xmmword ptr [rsp+0xD0]
900
+ paddd xmm1, xmmword ptr [rsp+0xB0]
901
+ paddd xmm2, xmmword ptr [rsp+0xA0]
902
+ paddd xmm3, xmmword ptr [rsp+0x80]
903
+ paddd xmm0, xmm4
904
+ paddd xmm1, xmm5
905
+ paddd xmm2, xmm6
906
+ paddd xmm3, xmm7
907
+ pxor xmm12, xmm0
908
+ pxor xmm13, xmm1
909
+ pxor xmm14, xmm2
910
+ pxor xmm15, xmm3
911
+ movdqa xmm8, xmmword ptr [ROT8+rip]
912
+ pshufb xmm12, xmm8
913
+ pshufb xmm13, xmm8
914
+ pshufb xmm14, xmm8
915
+ pshufb xmm15, xmm8
916
+ movdqa xmm8, xmmword ptr [rsp+0x100]
917
+ paddd xmm8, xmm12
918
+ paddd xmm9, xmm13
919
+ paddd xmm10, xmm14
920
+ paddd xmm11, xmm15
921
+ pxor xmm4, xmm8
922
+ pxor xmm5, xmm9
923
+ pxor xmm6, xmm10
924
+ pxor xmm7, xmm11
925
+ movdqa xmmword ptr [rsp+0x100], xmm8
926
+ movdqa xmm8, xmm4
927
+ psrld xmm8, 7
928
+ pslld xmm4, 25
929
+ por xmm4, xmm8
930
+ movdqa xmm8, xmm5
931
+ psrld xmm8, 7
932
+ pslld xmm5, 25
933
+ por xmm5, xmm8
934
+ movdqa xmm8, xmm6
935
+ psrld xmm8, 7
936
+ pslld xmm6, 25
937
+ por xmm6, xmm8
938
+ movdqa xmm8, xmm7
939
+ psrld xmm8, 7
940
+ pslld xmm7, 25
941
+ por xmm7, xmm8
942
+ paddd xmm0, xmmword ptr [rsp+0x70]
943
+ paddd xmm1, xmmword ptr [rsp+0x50]
944
+ paddd xmm2, xmmword ptr [rsp]
945
+ paddd xmm3, xmmword ptr [rsp+0x60]
946
+ paddd xmm0, xmm5
947
+ paddd xmm1, xmm6
948
+ paddd xmm2, xmm7
949
+ paddd xmm3, xmm4
950
+ pxor xmm15, xmm0
951
+ pxor xmm12, xmm1
952
+ pxor xmm13, xmm2
953
+ pxor xmm14, xmm3
954
+ movdqa xmm8, xmmword ptr [ROT16+rip]
955
+ pshufb xmm15, xmm8
956
+ pshufb xmm12, xmm8
957
+ pshufb xmm13, xmm8
958
+ pshufb xmm14, xmm8
959
+ paddd xmm10, xmm15
960
+ paddd xmm11, xmm12
961
+ movdqa xmm8, xmmword ptr [rsp+0x100]
962
+ paddd xmm8, xmm13
963
+ paddd xmm9, xmm14
964
+ pxor xmm5, xmm10
965
+ pxor xmm6, xmm11
966
+ pxor xmm7, xmm8
967
+ pxor xmm4, xmm9
968
+ movdqa xmmword ptr [rsp+0x100], xmm8
969
+ movdqa xmm8, xmm5
970
+ psrld xmm8, 12
971
+ pslld xmm5, 20
972
+ por xmm5, xmm8
973
+ movdqa xmm8, xmm6
974
+ psrld xmm8, 12
975
+ pslld xmm6, 20
976
+ por xmm6, xmm8
977
+ movdqa xmm8, xmm7
978
+ psrld xmm8, 12
979
+ pslld xmm7, 20
980
+ por xmm7, xmm8
981
+ movdqa xmm8, xmm4
982
+ psrld xmm8, 12
983
+ pslld xmm4, 20
984
+ por xmm4, xmm8
985
+ paddd xmm0, xmmword ptr [rsp+0x20]
986
+ paddd xmm1, xmmword ptr [rsp+0x30]
987
+ paddd xmm2, xmmword ptr [rsp+0x10]
988
+ paddd xmm3, xmmword ptr [rsp+0x40]
989
+ paddd xmm0, xmm5
990
+ paddd xmm1, xmm6
991
+ paddd xmm2, xmm7
992
+ paddd xmm3, xmm4
993
+ pxor xmm15, xmm0
994
+ pxor xmm12, xmm1
995
+ pxor xmm13, xmm2
996
+ pxor xmm14, xmm3
997
+ movdqa xmm8, xmmword ptr [ROT8+rip]
998
+ pshufb xmm15, xmm8
999
+ pshufb xmm12, xmm8
1000
+ pshufb xmm13, xmm8
1001
+ pshufb xmm14, xmm8
1002
+ paddd xmm10, xmm15
1003
+ paddd xmm11, xmm12
1004
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1005
+ paddd xmm8, xmm13
1006
+ paddd xmm9, xmm14
1007
+ pxor xmm5, xmm10
1008
+ pxor xmm6, xmm11
1009
+ pxor xmm7, xmm8
1010
+ pxor xmm4, xmm9
1011
+ movdqa xmmword ptr [rsp+0x100], xmm8
1012
+ movdqa xmm8, xmm5
1013
+ psrld xmm8, 7
1014
+ pslld xmm5, 25
1015
+ por xmm5, xmm8
1016
+ movdqa xmm8, xmm6
1017
+ psrld xmm8, 7
1018
+ pslld xmm6, 25
1019
+ por xmm6, xmm8
1020
+ movdqa xmm8, xmm7
1021
+ psrld xmm8, 7
1022
+ pslld xmm7, 25
1023
+ por xmm7, xmm8
1024
+ movdqa xmm8, xmm4
1025
+ psrld xmm8, 7
1026
+ pslld xmm4, 25
1027
+ por xmm4, xmm8
1028
+ paddd xmm0, xmmword ptr [rsp+0x90]
1029
+ paddd xmm1, xmmword ptr [rsp+0xB0]
1030
+ paddd xmm2, xmmword ptr [rsp+0x80]
1031
+ paddd xmm3, xmmword ptr [rsp+0xF0]
1032
+ paddd xmm0, xmm4
1033
+ paddd xmm1, xmm5
1034
+ paddd xmm2, xmm6
1035
+ paddd xmm3, xmm7
1036
+ pxor xmm12, xmm0
1037
+ pxor xmm13, xmm1
1038
+ pxor xmm14, xmm2
1039
+ pxor xmm15, xmm3
1040
+ movdqa xmm8, xmmword ptr [ROT16+rip]
1041
+ pshufb xmm12, xmm8
1042
+ pshufb xmm13, xmm8
1043
+ pshufb xmm14, xmm8
1044
+ pshufb xmm15, xmm8
1045
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1046
+ paddd xmm8, xmm12
1047
+ paddd xmm9, xmm13
1048
+ paddd xmm10, xmm14
1049
+ paddd xmm11, xmm15
1050
+ pxor xmm4, xmm8
1051
+ pxor xmm5, xmm9
1052
+ pxor xmm6, xmm10
1053
+ pxor xmm7, xmm11
1054
+ movdqa xmmword ptr [rsp+0x100], xmm8
1055
+ movdqa xmm8, xmm4
1056
+ psrld xmm8, 12
1057
+ pslld xmm4, 20
1058
+ por xmm4, xmm8
1059
+ movdqa xmm8, xmm5
1060
+ psrld xmm8, 12
1061
+ pslld xmm5, 20
1062
+ por xmm5, xmm8
1063
+ movdqa xmm8, xmm6
1064
+ psrld xmm8, 12
1065
+ pslld xmm6, 20
1066
+ por xmm6, xmm8
1067
+ movdqa xmm8, xmm7
1068
+ psrld xmm8, 12
1069
+ pslld xmm7, 20
1070
+ por xmm7, xmm8
1071
+ paddd xmm0, xmmword ptr [rsp+0xE0]
1072
+ paddd xmm1, xmmword ptr [rsp+0x50]
1073
+ paddd xmm2, xmmword ptr [rsp+0xC0]
1074
+ paddd xmm3, xmmword ptr [rsp+0x10]
1075
+ paddd xmm0, xmm4
1076
+ paddd xmm1, xmm5
1077
+ paddd xmm2, xmm6
1078
+ paddd xmm3, xmm7
1079
+ pxor xmm12, xmm0
1080
+ pxor xmm13, xmm1
1081
+ pxor xmm14, xmm2
1082
+ pxor xmm15, xmm3
1083
+ movdqa xmm8, xmmword ptr [ROT8+rip]
1084
+ pshufb xmm12, xmm8
1085
+ pshufb xmm13, xmm8
1086
+ pshufb xmm14, xmm8
1087
+ pshufb xmm15, xmm8
1088
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1089
+ paddd xmm8, xmm12
1090
+ paddd xmm9, xmm13
1091
+ paddd xmm10, xmm14
1092
+ paddd xmm11, xmm15
1093
+ pxor xmm4, xmm8
1094
+ pxor xmm5, xmm9
1095
+ pxor xmm6, xmm10
1096
+ pxor xmm7, xmm11
1097
+ movdqa xmmword ptr [rsp+0x100], xmm8
1098
+ movdqa xmm8, xmm4
1099
+ psrld xmm8, 7
1100
+ pslld xmm4, 25
1101
+ por xmm4, xmm8
1102
+ movdqa xmm8, xmm5
1103
+ psrld xmm8, 7
1104
+ pslld xmm5, 25
1105
+ por xmm5, xmm8
1106
+ movdqa xmm8, xmm6
1107
+ psrld xmm8, 7
1108
+ pslld xmm6, 25
1109
+ por xmm6, xmm8
1110
+ movdqa xmm8, xmm7
1111
+ psrld xmm8, 7
1112
+ pslld xmm7, 25
1113
+ por xmm7, xmm8
1114
+ paddd xmm0, xmmword ptr [rsp+0xD0]
1115
+ paddd xmm1, xmmword ptr [rsp]
1116
+ paddd xmm2, xmmword ptr [rsp+0x20]
1117
+ paddd xmm3, xmmword ptr [rsp+0x40]
1118
+ paddd xmm0, xmm5
1119
+ paddd xmm1, xmm6
1120
+ paddd xmm2, xmm7
1121
+ paddd xmm3, xmm4
1122
+ pxor xmm15, xmm0
1123
+ pxor xmm12, xmm1
1124
+ pxor xmm13, xmm2
1125
+ pxor xmm14, xmm3
1126
+ movdqa xmm8, xmmword ptr [ROT16+rip]
1127
+ pshufb xmm15, xmm8
1128
+ pshufb xmm12, xmm8
1129
+ pshufb xmm13, xmm8
1130
+ pshufb xmm14, xmm8
1131
+ paddd xmm10, xmm15
1132
+ paddd xmm11, xmm12
1133
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1134
+ paddd xmm8, xmm13
1135
+ paddd xmm9, xmm14
1136
+ pxor xmm5, xmm10
1137
+ pxor xmm6, xmm11
1138
+ pxor xmm7, xmm8
1139
+ pxor xmm4, xmm9
1140
+ movdqa xmmword ptr [rsp+0x100], xmm8
1141
+ movdqa xmm8, xmm5
1142
+ psrld xmm8, 12
1143
+ pslld xmm5, 20
1144
+ por xmm5, xmm8
1145
+ movdqa xmm8, xmm6
1146
+ psrld xmm8, 12
1147
+ pslld xmm6, 20
1148
+ por xmm6, xmm8
1149
+ movdqa xmm8, xmm7
1150
+ psrld xmm8, 12
1151
+ pslld xmm7, 20
1152
+ por xmm7, xmm8
1153
+ movdqa xmm8, xmm4
1154
+ psrld xmm8, 12
1155
+ pslld xmm4, 20
1156
+ por xmm4, xmm8
1157
+ paddd xmm0, xmmword ptr [rsp+0x30]
1158
+ paddd xmm1, xmmword ptr [rsp+0xA0]
1159
+ paddd xmm2, xmmword ptr [rsp+0x60]
1160
+ paddd xmm3, xmmword ptr [rsp+0x70]
1161
+ paddd xmm0, xmm5
1162
+ paddd xmm1, xmm6
1163
+ paddd xmm2, xmm7
1164
+ paddd xmm3, xmm4
1165
+ pxor xmm15, xmm0
1166
+ pxor xmm12, xmm1
1167
+ pxor xmm13, xmm2
1168
+ pxor xmm14, xmm3
1169
+ movdqa xmm8, xmmword ptr [ROT8+rip]
1170
+ pshufb xmm15, xmm8
1171
+ pshufb xmm12, xmm8
1172
+ pshufb xmm13, xmm8
1173
+ pshufb xmm14, xmm8
1174
+ paddd xmm10, xmm15
1175
+ paddd xmm11, xmm12
1176
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1177
+ paddd xmm8, xmm13
1178
+ paddd xmm9, xmm14
1179
+ pxor xmm5, xmm10
1180
+ pxor xmm6, xmm11
1181
+ pxor xmm7, xmm8
1182
+ pxor xmm4, xmm9
1183
+ movdqa xmmword ptr [rsp+0x100], xmm8
1184
+ movdqa xmm8, xmm5
1185
+ psrld xmm8, 7
1186
+ pslld xmm5, 25
1187
+ por xmm5, xmm8
1188
+ movdqa xmm8, xmm6
1189
+ psrld xmm8, 7
1190
+ pslld xmm6, 25
1191
+ por xmm6, xmm8
1192
+ movdqa xmm8, xmm7
1193
+ psrld xmm8, 7
1194
+ pslld xmm7, 25
1195
+ por xmm7, xmm8
1196
+ movdqa xmm8, xmm4
1197
+ psrld xmm8, 7
1198
+ pslld xmm4, 25
1199
+ por xmm4, xmm8
1200
+ paddd xmm0, xmmword ptr [rsp+0xB0]
1201
+ paddd xmm1, xmmword ptr [rsp+0x50]
1202
+ paddd xmm2, xmmword ptr [rsp+0x10]
1203
+ paddd xmm3, xmmword ptr [rsp+0x80]
1204
+ paddd xmm0, xmm4
1205
+ paddd xmm1, xmm5
1206
+ paddd xmm2, xmm6
1207
+ paddd xmm3, xmm7
1208
+ pxor xmm12, xmm0
1209
+ pxor xmm13, xmm1
1210
+ pxor xmm14, xmm2
1211
+ pxor xmm15, xmm3
1212
+ movdqa xmm8, xmmword ptr [ROT16+rip]
1213
+ pshufb xmm12, xmm8
1214
+ pshufb xmm13, xmm8
1215
+ pshufb xmm14, xmm8
1216
+ pshufb xmm15, xmm8
1217
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1218
+ paddd xmm8, xmm12
1219
+ paddd xmm9, xmm13
1220
+ paddd xmm10, xmm14
1221
+ paddd xmm11, xmm15
1222
+ pxor xmm4, xmm8
1223
+ pxor xmm5, xmm9
1224
+ pxor xmm6, xmm10
1225
+ pxor xmm7, xmm11
1226
+ movdqa xmmword ptr [rsp+0x100], xmm8
1227
+ movdqa xmm8, xmm4
1228
+ psrld xmm8, 12
1229
+ pslld xmm4, 20
1230
+ por xmm4, xmm8
1231
+ movdqa xmm8, xmm5
1232
+ psrld xmm8, 12
1233
+ pslld xmm5, 20
1234
+ por xmm5, xmm8
1235
+ movdqa xmm8, xmm6
1236
+ psrld xmm8, 12
1237
+ pslld xmm6, 20
1238
+ por xmm6, xmm8
1239
+ movdqa xmm8, xmm7
1240
+ psrld xmm8, 12
1241
+ pslld xmm7, 20
1242
+ por xmm7, xmm8
1243
+ paddd xmm0, xmmword ptr [rsp+0xF0]
1244
+ paddd xmm1, xmmword ptr [rsp]
1245
+ paddd xmm2, xmmword ptr [rsp+0x90]
1246
+ paddd xmm3, xmmword ptr [rsp+0x60]
1247
+ paddd xmm0, xmm4
1248
+ paddd xmm1, xmm5
1249
+ paddd xmm2, xmm6
1250
+ paddd xmm3, xmm7
1251
+ pxor xmm12, xmm0
1252
+ pxor xmm13, xmm1
1253
+ pxor xmm14, xmm2
1254
+ pxor xmm15, xmm3
1255
+ movdqa xmm8, xmmword ptr [ROT8+rip]
1256
+ pshufb xmm12, xmm8
1257
+ pshufb xmm13, xmm8
1258
+ pshufb xmm14, xmm8
1259
+ pshufb xmm15, xmm8
1260
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1261
+ paddd xmm8, xmm12
1262
+ paddd xmm9, xmm13
1263
+ paddd xmm10, xmm14
1264
+ paddd xmm11, xmm15
1265
+ pxor xmm4, xmm8
1266
+ pxor xmm5, xmm9
1267
+ pxor xmm6, xmm10
1268
+ pxor xmm7, xmm11
1269
+ movdqa xmmword ptr [rsp+0x100], xmm8
1270
+ movdqa xmm8, xmm4
1271
+ psrld xmm8, 7
1272
+ pslld xmm4, 25
1273
+ por xmm4, xmm8
1274
+ movdqa xmm8, xmm5
1275
+ psrld xmm8, 7
1276
+ pslld xmm5, 25
1277
+ por xmm5, xmm8
1278
+ movdqa xmm8, xmm6
1279
+ psrld xmm8, 7
1280
+ pslld xmm6, 25
1281
+ por xmm6, xmm8
1282
+ movdqa xmm8, xmm7
1283
+ psrld xmm8, 7
1284
+ pslld xmm7, 25
1285
+ por xmm7, xmm8
1286
+ paddd xmm0, xmmword ptr [rsp+0xE0]
1287
+ paddd xmm1, xmmword ptr [rsp+0x20]
1288
+ paddd xmm2, xmmword ptr [rsp+0x30]
1289
+ paddd xmm3, xmmword ptr [rsp+0x70]
1290
+ paddd xmm0, xmm5
1291
+ paddd xmm1, xmm6
1292
+ paddd xmm2, xmm7
1293
+ paddd xmm3, xmm4
1294
+ pxor xmm15, xmm0
1295
+ pxor xmm12, xmm1
1296
+ pxor xmm13, xmm2
1297
+ pxor xmm14, xmm3
1298
+ movdqa xmm8, xmmword ptr [ROT16+rip]
1299
+ pshufb xmm15, xmm8
1300
+ pshufb xmm12, xmm8
1301
+ pshufb xmm13, xmm8
1302
+ pshufb xmm14, xmm8
1303
+ paddd xmm10, xmm15
1304
+ paddd xmm11, xmm12
1305
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1306
+ paddd xmm8, xmm13
1307
+ paddd xmm9, xmm14
1308
+ pxor xmm5, xmm10
1309
+ pxor xmm6, xmm11
1310
+ pxor xmm7, xmm8
1311
+ pxor xmm4, xmm9
1312
+ movdqa xmmword ptr [rsp+0x100], xmm8
1313
+ movdqa xmm8, xmm5
1314
+ psrld xmm8, 12
1315
+ pslld xmm5, 20
1316
+ por xmm5, xmm8
1317
+ movdqa xmm8, xmm6
1318
+ psrld xmm8, 12
1319
+ pslld xmm6, 20
1320
+ por xmm6, xmm8
1321
+ movdqa xmm8, xmm7
1322
+ psrld xmm8, 12
1323
+ pslld xmm7, 20
1324
+ por xmm7, xmm8
1325
+ movdqa xmm8, xmm4
1326
+ psrld xmm8, 12
1327
+ pslld xmm4, 20
1328
+ por xmm4, xmm8
1329
+ paddd xmm0, xmmword ptr [rsp+0xA0]
1330
+ paddd xmm1, xmmword ptr [rsp+0xC0]
1331
+ paddd xmm2, xmmword ptr [rsp+0x40]
1332
+ paddd xmm3, xmmword ptr [rsp+0xD0]
1333
+ paddd xmm0, xmm5
1334
+ paddd xmm1, xmm6
1335
+ paddd xmm2, xmm7
1336
+ paddd xmm3, xmm4
1337
+ pxor xmm15, xmm0
1338
+ pxor xmm12, xmm1
1339
+ pxor xmm13, xmm2
1340
+ pxor xmm14, xmm3
1341
+ movdqa xmm8, xmmword ptr [ROT8+rip]
1342
+ pshufb xmm15, xmm8
1343
+ pshufb xmm12, xmm8
1344
+ pshufb xmm13, xmm8
1345
+ pshufb xmm14, xmm8
1346
+ paddd xmm10, xmm15
1347
+ paddd xmm11, xmm12
1348
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1349
+ paddd xmm8, xmm13
1350
+ paddd xmm9, xmm14
1351
+ pxor xmm5, xmm10
1352
+ pxor xmm6, xmm11
1353
+ pxor xmm7, xmm8
1354
+ pxor xmm4, xmm9
1355
+ pxor xmm0, xmm8
1356
+ pxor xmm1, xmm9
1357
+ pxor xmm2, xmm10
1358
+ pxor xmm3, xmm11
1359
+ movdqa xmm8, xmm5
1360
+ psrld xmm8, 7
1361
+ pslld xmm5, 25
1362
+ por xmm5, xmm8
1363
+ movdqa xmm8, xmm6
1364
+ psrld xmm8, 7
1365
+ pslld xmm6, 25
1366
+ por xmm6, xmm8
1367
+ movdqa xmm8, xmm7
1368
+ psrld xmm8, 7
1369
+ pslld xmm7, 25
1370
+ por xmm7, xmm8
1371
+ movdqa xmm8, xmm4
1372
+ psrld xmm8, 7
1373
+ pslld xmm4, 25
1374
+ por xmm4, xmm8
1375
+ pxor xmm4, xmm12
1376
+ pxor xmm5, xmm13
1377
+ pxor xmm6, xmm14
1378
+ pxor xmm7, xmm15
1379
+ mov eax, r13d
1380
+ jne 9b
1381
+ movdqa xmm9, xmm0
1382
+ punpckldq xmm0, xmm1
1383
+ punpckhdq xmm9, xmm1
1384
+ movdqa xmm11, xmm2
1385
+ punpckldq xmm2, xmm3
1386
+ punpckhdq xmm11, xmm3
1387
+ movdqa xmm1, xmm0
1388
+ punpcklqdq xmm0, xmm2
1389
+ punpckhqdq xmm1, xmm2
1390
+ movdqa xmm3, xmm9
1391
+ punpcklqdq xmm9, xmm11
1392
+ punpckhqdq xmm3, xmm11
1393
+ movdqu xmmword ptr [rbx], xmm0
1394
+ movdqu xmmword ptr [rbx+0x20], xmm1
1395
+ movdqu xmmword ptr [rbx+0x40], xmm9
1396
+ movdqu xmmword ptr [rbx+0x60], xmm3
1397
+ movdqa xmm9, xmm4
1398
+ punpckldq xmm4, xmm5
1399
+ punpckhdq xmm9, xmm5
1400
+ movdqa xmm11, xmm6
1401
+ punpckldq xmm6, xmm7
1402
+ punpckhdq xmm11, xmm7
1403
+ movdqa xmm5, xmm4
1404
+ punpcklqdq xmm4, xmm6
1405
+ punpckhqdq xmm5, xmm6
1406
+ movdqa xmm7, xmm9
1407
+ punpcklqdq xmm9, xmm11
1408
+ punpckhqdq xmm7, xmm11
1409
+ movdqu xmmword ptr [rbx+0x10], xmm4
1410
+ movdqu xmmword ptr [rbx+0x30], xmm5
1411
+ movdqu xmmword ptr [rbx+0x50], xmm9
1412
+ movdqu xmmword ptr [rbx+0x70], xmm7
1413
+ movdqa xmm1, xmmword ptr [rsp+0x110]
1414
+ movdqa xmm0, xmm1
1415
+ paddd xmm1, xmmword ptr [rsp+0x150]
1416
+ movdqa xmmword ptr [rsp+0x110], xmm1
1417
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1418
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1419
+ pcmpgtd xmm0, xmm1
1420
+ movdqa xmm1, xmmword ptr [rsp+0x120]
1421
+ psubd xmm1, xmm0
1422
+ movdqa xmmword ptr [rsp+0x120], xmm1
1423
+ add rbx, 128
1424
+ add rdi, 32
1425
+ sub rsi, 4
1426
+ cmp rsi, 4
1427
+ jnc 2b
1428
+ test rsi, rsi
1429
+ jnz 3f
1430
+ 4:
1431
+ mov rsp, rbp
1432
+ pop rbp
1433
+ pop rbx
1434
+ pop r12
1435
+ pop r13
1436
+ pop r14
1437
+ pop r15
1438
+ ret
1439
+ .p2align 5
1440
+ 3:
1441
+ test esi, 0x2
1442
+ je 3f
1443
+ movups xmm0, xmmword ptr [rcx]
1444
+ movups xmm1, xmmword ptr [rcx+0x10]
1445
+ movaps xmm8, xmm0
1446
+ movaps xmm9, xmm1
1447
+ movd xmm13, dword ptr [rsp+0x110]
1448
+ pinsrd xmm13, dword ptr [rsp+0x120], 1
1449
+ pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1450
+ movaps xmmword ptr [rsp], xmm13
1451
+ movd xmm14, dword ptr [rsp+0x114]
1452
+ pinsrd xmm14, dword ptr [rsp+0x124], 1
1453
+ pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1454
+ movaps xmmword ptr [rsp+0x10], xmm14
1455
+ mov r8, qword ptr [rdi]
1456
+ mov r9, qword ptr [rdi+0x8]
1457
+ movzx eax, byte ptr [rbp+0x40]
1458
+ or eax, r13d
1459
+ xor edx, edx
1460
+ 2:
1461
+ mov r14d, eax
1462
+ or eax, r12d
1463
+ add rdx, 64
1464
+ cmp rdx, r15
1465
+ cmovne eax, r14d
1466
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1467
+ movaps xmm10, xmm2
1468
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
1469
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
1470
+ movaps xmm3, xmm4
1471
+ shufps xmm4, xmm5, 136
1472
+ shufps xmm3, xmm5, 221
1473
+ movaps xmm5, xmm3
1474
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
1475
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
1476
+ movaps xmm3, xmm6
1477
+ shufps xmm6, xmm7, 136
1478
+ pshufd xmm6, xmm6, 0x93
1479
+ shufps xmm3, xmm7, 221
1480
+ pshufd xmm7, xmm3, 0x93
1481
+ movups xmm12, xmmword ptr [r9+rdx-0x40]
1482
+ movups xmm13, xmmword ptr [r9+rdx-0x30]
1483
+ movaps xmm11, xmm12
1484
+ shufps xmm12, xmm13, 136
1485
+ shufps xmm11, xmm13, 221
1486
+ movaps xmm13, xmm11
1487
+ movups xmm14, xmmword ptr [r9+rdx-0x20]
1488
+ movups xmm15, xmmword ptr [r9+rdx-0x10]
1489
+ movaps xmm11, xmm14
1490
+ shufps xmm14, xmm15, 136
1491
+ pshufd xmm14, xmm14, 0x93
1492
+ shufps xmm11, xmm15, 221
1493
+ pshufd xmm15, xmm11, 0x93
1494
+ movaps xmm3, xmmword ptr [rsp]
1495
+ movaps xmm11, xmmword ptr [rsp+0x10]
1496
+ pinsrd xmm3, eax, 3
1497
+ pinsrd xmm11, eax, 3
1498
+ mov al, 7
1499
+ 9:
1500
+ paddd xmm0, xmm4
1501
+ paddd xmm8, xmm12
1502
+ movaps xmmword ptr [rsp+0x20], xmm4
1503
+ movaps xmmword ptr [rsp+0x30], xmm12
1504
+ paddd xmm0, xmm1
1505
+ paddd xmm8, xmm9
1506
+ pxor xmm3, xmm0
1507
+ pxor xmm11, xmm8
1508
+ movaps xmm12, xmmword ptr [ROT16+rip]
1509
+ pshufb xmm3, xmm12
1510
+ pshufb xmm11, xmm12
1511
+ paddd xmm2, xmm3
1512
+ paddd xmm10, xmm11
1513
+ pxor xmm1, xmm2
1514
+ pxor xmm9, xmm10
1515
+ movdqa xmm4, xmm1
1516
+ pslld xmm1, 20
1517
+ psrld xmm4, 12
1518
+ por xmm1, xmm4
1519
+ movdqa xmm4, xmm9
1520
+ pslld xmm9, 20
1521
+ psrld xmm4, 12
1522
+ por xmm9, xmm4
1523
+ paddd xmm0, xmm5
1524
+ paddd xmm8, xmm13
1525
+ movaps xmmword ptr [rsp+0x40], xmm5
1526
+ movaps xmmword ptr [rsp+0x50], xmm13
1527
+ paddd xmm0, xmm1
1528
+ paddd xmm8, xmm9
1529
+ pxor xmm3, xmm0
1530
+ pxor xmm11, xmm8
1531
+ movaps xmm13, xmmword ptr [ROT8+rip]
1532
+ pshufb xmm3, xmm13
1533
+ pshufb xmm11, xmm13
1534
+ paddd xmm2, xmm3
1535
+ paddd xmm10, xmm11
1536
+ pxor xmm1, xmm2
1537
+ pxor xmm9, xmm10
1538
+ movdqa xmm4, xmm1
1539
+ pslld xmm1, 25
1540
+ psrld xmm4, 7
1541
+ por xmm1, xmm4
1542
+ movdqa xmm4, xmm9
1543
+ pslld xmm9, 25
1544
+ psrld xmm4, 7
1545
+ por xmm9, xmm4
1546
+ pshufd xmm0, xmm0, 0x93
1547
+ pshufd xmm8, xmm8, 0x93
1548
+ pshufd xmm3, xmm3, 0x4E
1549
+ pshufd xmm11, xmm11, 0x4E
1550
+ pshufd xmm2, xmm2, 0x39
1551
+ pshufd xmm10, xmm10, 0x39
1552
+ paddd xmm0, xmm6
1553
+ paddd xmm8, xmm14
1554
+ paddd xmm0, xmm1
1555
+ paddd xmm8, xmm9
1556
+ pxor xmm3, xmm0
1557
+ pxor xmm11, xmm8
1558
+ pshufb xmm3, xmm12
1559
+ pshufb xmm11, xmm12
1560
+ paddd xmm2, xmm3
1561
+ paddd xmm10, xmm11
1562
+ pxor xmm1, xmm2
1563
+ pxor xmm9, xmm10
1564
+ movdqa xmm4, xmm1
1565
+ pslld xmm1, 20
1566
+ psrld xmm4, 12
1567
+ por xmm1, xmm4
1568
+ movdqa xmm4, xmm9
1569
+ pslld xmm9, 20
1570
+ psrld xmm4, 12
1571
+ por xmm9, xmm4
1572
+ paddd xmm0, xmm7
1573
+ paddd xmm8, xmm15
1574
+ paddd xmm0, xmm1
1575
+ paddd xmm8, xmm9
1576
+ pxor xmm3, xmm0
1577
+ pxor xmm11, xmm8
1578
+ pshufb xmm3, xmm13
1579
+ pshufb xmm11, xmm13
1580
+ paddd xmm2, xmm3
1581
+ paddd xmm10, xmm11
1582
+ pxor xmm1, xmm2
1583
+ pxor xmm9, xmm10
1584
+ movdqa xmm4, xmm1
1585
+ pslld xmm1, 25
1586
+ psrld xmm4, 7
1587
+ por xmm1, xmm4
1588
+ movdqa xmm4, xmm9
1589
+ pslld xmm9, 25
1590
+ psrld xmm4, 7
1591
+ por xmm9, xmm4
1592
+ pshufd xmm0, xmm0, 0x39
1593
+ pshufd xmm8, xmm8, 0x39
1594
+ pshufd xmm3, xmm3, 0x4E
1595
+ pshufd xmm11, xmm11, 0x4E
1596
+ pshufd xmm2, xmm2, 0x93
1597
+ pshufd xmm10, xmm10, 0x93
1598
+ dec al
1599
+ je 9f
1600
+ movdqa xmm12, xmmword ptr [rsp+0x20]
1601
+ movdqa xmm5, xmmword ptr [rsp+0x40]
1602
+ pshufd xmm13, xmm12, 0x0F
1603
+ shufps xmm12, xmm5, 214
1604
+ pshufd xmm4, xmm12, 0x39
1605
+ movdqa xmm12, xmm6
1606
+ shufps xmm12, xmm7, 250
1607
+ pblendw xmm13, xmm12, 0xCC
1608
+ movdqa xmm12, xmm7
1609
+ punpcklqdq xmm12, xmm5
1610
+ pblendw xmm12, xmm6, 0xC0
1611
+ pshufd xmm12, xmm12, 0x78
1612
+ punpckhdq xmm5, xmm7
1613
+ punpckldq xmm6, xmm5
1614
+ pshufd xmm7, xmm6, 0x1E
1615
+ movdqa xmmword ptr [rsp+0x20], xmm13
1616
+ movdqa xmmword ptr [rsp+0x40], xmm12
1617
+ movdqa xmm5, xmmword ptr [rsp+0x30]
1618
+ movdqa xmm13, xmmword ptr [rsp+0x50]
1619
+ pshufd xmm6, xmm5, 0x0F
1620
+ shufps xmm5, xmm13, 214
1621
+ pshufd xmm12, xmm5, 0x39
1622
+ movdqa xmm5, xmm14
1623
+ shufps xmm5, xmm15, 250
1624
+ pblendw xmm6, xmm5, 0xCC
1625
+ movdqa xmm5, xmm15
1626
+ punpcklqdq xmm5, xmm13
1627
+ pblendw xmm5, xmm14, 0xC0
1628
+ pshufd xmm5, xmm5, 0x78
1629
+ punpckhdq xmm13, xmm15
1630
+ punpckldq xmm14, xmm13
1631
+ pshufd xmm15, xmm14, 0x1E
1632
+ movdqa xmm13, xmm6
1633
+ movdqa xmm14, xmm5
1634
+ movdqa xmm5, xmmword ptr [rsp+0x20]
1635
+ movdqa xmm6, xmmword ptr [rsp+0x40]
1636
+ jmp 9b
1637
+ 9:
1638
+ pxor xmm0, xmm2
1639
+ pxor xmm1, xmm3
1640
+ pxor xmm8, xmm10
1641
+ pxor xmm9, xmm11
1642
+ mov eax, r13d
1643
+ cmp rdx, r15
1644
+ jne 2b
1645
+ movups xmmword ptr [rbx], xmm0
1646
+ movups xmmword ptr [rbx+0x10], xmm1
1647
+ movups xmmword ptr [rbx+0x20], xmm8
1648
+ movups xmmword ptr [rbx+0x30], xmm9
1649
+ movdqa xmm0, xmmword ptr [rsp+0x130]
1650
+ movdqa xmm1, xmmword ptr [rsp+0x110]
1651
+ movdqa xmm2, xmmword ptr [rsp+0x120]
1652
+ movdqu xmm3, xmmword ptr [rsp+0x118]
1653
+ movdqu xmm4, xmmword ptr [rsp+0x128]
1654
+ blendvps xmm1, xmm3, xmm0
1655
+ blendvps xmm2, xmm4, xmm0
1656
+ movdqa xmmword ptr [rsp+0x110], xmm1
1657
+ movdqa xmmword ptr [rsp+0x120], xmm2
1658
+ add rdi, 16
1659
+ add rbx, 64
1660
+ sub rsi, 2
1661
+ 3:
1662
+ test esi, 0x1
1663
+ je 4b
1664
+ movups xmm0, xmmword ptr [rcx]
1665
+ movups xmm1, xmmword ptr [rcx+0x10]
1666
+ movd xmm13, dword ptr [rsp+0x110]
1667
+ pinsrd xmm13, dword ptr [rsp+0x120], 1
1668
+ pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1669
+ movaps xmm14, xmmword ptr [ROT8+rip]
1670
+ movaps xmm15, xmmword ptr [ROT16+rip]
1671
+ mov r8, qword ptr [rdi]
1672
+ movzx eax, byte ptr [rbp+0x40]
1673
+ or eax, r13d
1674
+ xor edx, edx
1675
+ 2:
1676
+ mov r14d, eax
1677
+ or eax, r12d
1678
+ add rdx, 64
1679
+ cmp rdx, r15
1680
+ cmovne eax, r14d
1681
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1682
+ movaps xmm3, xmm13
1683
+ pinsrd xmm3, eax, 3
1684
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
1685
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
1686
+ movaps xmm8, xmm4
1687
+ shufps xmm4, xmm5, 136
1688
+ shufps xmm8, xmm5, 221
1689
+ movaps xmm5, xmm8
1690
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
1691
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
1692
+ movaps xmm8, xmm6
1693
+ shufps xmm6, xmm7, 136
1694
+ pshufd xmm6, xmm6, 0x93
1695
+ shufps xmm8, xmm7, 221
1696
+ pshufd xmm7, xmm8, 0x93
1697
+ mov al, 7
1698
+ 9:
1699
+ paddd xmm0, xmm4
1700
+ paddd xmm0, xmm1
1701
+ pxor xmm3, xmm0
1702
+ pshufb xmm3, xmm15
1703
+ paddd xmm2, xmm3
1704
+ pxor xmm1, xmm2
1705
+ movdqa xmm11, xmm1
1706
+ pslld xmm1, 20
1707
+ psrld xmm11, 12
1708
+ por xmm1, xmm11
1709
+ paddd xmm0, xmm5
1710
+ paddd xmm0, xmm1
1711
+ pxor xmm3, xmm0
1712
+ pshufb xmm3, xmm14
1713
+ paddd xmm2, xmm3
1714
+ pxor xmm1, xmm2
1715
+ movdqa xmm11, xmm1
1716
+ pslld xmm1, 25
1717
+ psrld xmm11, 7
1718
+ por xmm1, xmm11
1719
+ pshufd xmm0, xmm0, 0x93
1720
+ pshufd xmm3, xmm3, 0x4E
1721
+ pshufd xmm2, xmm2, 0x39
1722
+ paddd xmm0, xmm6
1723
+ paddd xmm0, xmm1
1724
+ pxor xmm3, xmm0
1725
+ pshufb xmm3, xmm15
1726
+ paddd xmm2, xmm3
1727
+ pxor xmm1, xmm2
1728
+ movdqa xmm11, xmm1
1729
+ pslld xmm1, 20
1730
+ psrld xmm11, 12
1731
+ por xmm1, xmm11
1732
+ paddd xmm0, xmm7
1733
+ paddd xmm0, xmm1
1734
+ pxor xmm3, xmm0
1735
+ pshufb xmm3, xmm14
1736
+ paddd xmm2, xmm3
1737
+ pxor xmm1, xmm2
1738
+ movdqa xmm11, xmm1
1739
+ pslld xmm1, 25
1740
+ psrld xmm11, 7
1741
+ por xmm1, xmm11
1742
+ pshufd xmm0, xmm0, 0x39
1743
+ pshufd xmm3, xmm3, 0x4E
1744
+ pshufd xmm2, xmm2, 0x93
1745
+ dec al
1746
+ jz 9f
1747
+ movdqa xmm8, xmm4
1748
+ shufps xmm8, xmm5, 214
1749
+ pshufd xmm9, xmm4, 0x0F
1750
+ pshufd xmm4, xmm8, 0x39
1751
+ movdqa xmm8, xmm6
1752
+ shufps xmm8, xmm7, 250
1753
+ pblendw xmm9, xmm8, 0xCC
1754
+ movdqa xmm8, xmm7
1755
+ punpcklqdq xmm8, xmm5
1756
+ pblendw xmm8, xmm6, 0xC0
1757
+ pshufd xmm8, xmm8, 0x78
1758
+ punpckhdq xmm5, xmm7
1759
+ punpckldq xmm6, xmm5
1760
+ pshufd xmm7, xmm6, 0x1E
1761
+ movdqa xmm5, xmm9
1762
+ movdqa xmm6, xmm8
1763
+ jmp 9b
1764
+ 9:
1765
+ pxor xmm0, xmm2
1766
+ pxor xmm1, xmm3
1767
+ mov eax, r13d
1768
+ cmp rdx, r15
1769
+ jne 2b
1770
+ movups xmmword ptr [rbx], xmm0
1771
+ movups xmmword ptr [rbx+0x10], xmm1
1772
+ jmp 4b
1773
+
1774
+ .p2align 6
1775
/*
 * blake3_compress_in_place_sse41 / _blake3_compress_in_place_sse41
 *
 * Runs the 7-round compression over one 64-byte message block and writes
 * the 32-byte result back over the input state.
 *
 * Registers on entry (System V AMD64 argument registers):
 *   rdi  -> 32-byte state; read at entry, overwritten at exit (unaligned ok)
 *   rsi  -> 64-byte message block (unaligned ok)
 *   dl   =  8-bit value placed in state word 14
 *   rcx  =  64-bit value placed in state words 12..13
 *   r8b  =  8-bit value placed in state word 15
 *   NOTE(review): by analogy with the hash_many tail in this file, word 14
 *   is presumably the block length and word 15 the flag bits; confirm
 *   against the C prototype.
 *
 * Clobbers: rax, rdx, xmm0-xmm9, xmm11, xmm14, xmm15, flags.
 * Leaf function: no stack use, no calls.
 */
blake3_compress_in_place_sse41:
_blake3_compress_in_place_sse41:
        /* Rows 0..1 = caller state, row 2 = IV constants. */
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        /*
         * Row 3 = [rcx.lo, rcx.hi, dl, r8b].
         * Only the low byte of rdx and r8 is meaningful, so zero-extend
         * both before packing them (same sequence as
         * blake3_compress_xof_sse41). Using the full 64-bit registers
         * would let stale upper bits from the caller leak into the state.
         */
        movzx   eax, r8b
        movzx   edx, dl
        shl     rax, 32
        add     rdx, rax
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4
        /*
         * Load the 16 message words and de-interleave them:
         *   xmm4 = even words of the first 32 bytes
         *   xmm5 = odd  words of the first 32 bytes
         *   xmm6 = even words of the last 32 bytes, lanes rotated by 0x93
         *   xmm7 = odd  words of the last 32 bytes, lanes rotated by 0x93
         */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 0x88
        shufps  xmm8, xmm5, 0xDD
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 0x88
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 0xDD
        pshufd  xmm7, xmm8, 0x93
        /* Byte-shuffle masks used for the rotate-by-8 / rotate-by-16 steps. */
        movaps  xmm14, xmmword ptr [ROT8+rip]
        movaps  xmm15, xmmword ptr [ROT16+rip]
        mov     al, 7                   /* al = rounds remaining */
9:
        /* ---- first half-round: rows as loaded ---- */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15             /* rotate each dword by 16 */
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1             /* rotate right by 12 via shift pair */
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14             /* rotate each dword right by 8 */
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1             /* rotate right by 7 via shift pair */
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Rotate lanes of rows 0, 3 and 2 so the next half-round mixes diagonals. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        /* ---- second half-round: rotated rows ---- */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11
        /* Undo the lane rotation. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93
        dec     al
        jz      9f                      /* all 7 rounds done */
        /* Re-shuffle the message vectors xmm4..xmm7 for the next round. */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 0xD6
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 0xFA
        pblendw xmm9, xmm8, 0xCC
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0xC0
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /* Fold rows 2..3 into rows 0..1 and store the new state in place. */
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        movups  xmmword ptr [rdi], xmm0
        movups  xmmword ptr [rdi+0x10], xmm1
        ret
1873
+
1874
+ .p2align 6
1875
/*
 * blake3_compress_xof_sse41 / _blake3_compress_xof_sse41
 *
 * Runs the 7-round compression over one 64-byte message block and stores
 * all 64 bytes of output at [r9]; the input state at [rdi] is only read.
 *
 * Registers on entry (System V AMD64 argument registers):
 *   rdi  -> 32-byte input state (read at entry and again at exit)
 *   rsi  -> 64-byte message block
 *   dl   =  8-bit value placed in state word 14
 *   rcx  =  64-bit value placed in state words 12..13
 *   r8b  =  8-bit value placed in state word 15
 *   r9   -> 64-byte output buffer
 *
 * Clobbers: rax, rdx, xmm0-xmm9, xmm11, xmm14, xmm15, flags.
 * Leaf function: no stack use, no calls.
 */
blake3_compress_xof_sse41:
_blake3_compress_xof_sse41:
        /* Row 3 = [rcx.lo, rcx.hi, dl, r8b]; the byte arguments are zero-extended first. */
        movzx   eax, r8b
        movzx   edx, dl
        shl     rax, 32
        add     rdx, rax
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4
        /* Constants: rotate masks and row 2 (IV). */
        movaps  xmm15, xmmword ptr [ROT16+rip]
        movaps  xmm14, xmmword ptr [ROT8+rip]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        /* Rows 0..1 = caller state. */
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        /* First 32 message bytes: xmm4 = even words, xmm5 = odd words. */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 0x88
        shufps  xmm8, xmm5, 0xDD
        movaps  xmm5, xmm8
        /* Last 32 message bytes: xmm6 = even words, xmm7 = odd words, lanes rotated by 0x93. */
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 0x88
        shufps  xmm8, xmm7, 0xDD
        pshufd  xmm6, xmm6, 0x93
        pshufd  xmm7, xmm8, 0x93
        mov     al, 7                   /* round counter */
9:
        /* Half-round on the rows as loaded, message vectors xmm4 / xmm5. */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15             /* dword rotate by 16 */
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        psrld   xmm11, 12               /* dword rotate right by 12 */
        pslld   xmm1, 20
        por     xmm1, xmm11
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14             /* dword rotate right by 8 */
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        psrld   xmm11, 7                /* dword rotate right by 7 */
        pslld   xmm1, 25
        por     xmm1, xmm11
        /* Rotate the lanes of rows 2, 3 and 0 for the diagonal half-round. */
        pshufd  xmm2, xmm2, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm0, xmm0, 0x93
        /* Half-round on the rotated rows, message vectors xmm6 / xmm7. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        psrld   xmm11, 12
        pslld   xmm1, 20
        por     xmm1, xmm11
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        psrld   xmm11, 7
        pslld   xmm1, 25
        por     xmm1, xmm11
        /* Rotate the lanes back. */
        pshufd  xmm2, xmm2, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm0, xmm0, 0x39
        dec     al
        je      9f                      /* counter hit zero: finished */
        /* Shuffle xmm4..xmm7 into the message order used by the next round. */
        pshufd  xmm9, xmm4, 0x0F
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 0xD6
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 0xFA
        pblendw xmm9, xmm8, 0xCC
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0xC0
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /* Output bytes 0..31: rows 0..1 XOR rows 2..3. */
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        /* Output bytes 32..63: rows 2..3 XOR the original input state. */
        movdqu  xmm4, xmmword ptr [rdi]
        movdqu  xmm5, xmmword ptr [rdi+0x10]
        pxor    xmm2, xmm4
        pxor    xmm3, xmm5
        movups  xmmword ptr [r9], xmm0
        movups  xmmword ptr [r9+0x10], xmm1
        movups  xmmword ptr [r9+0x20], xmm2
        movups  xmmword ptr [r9+0x30], xmm3
        ret
1981
+
1982
+
1983
/*
 * Read-only constant tables.
 *
 * The section starts 64-byte aligned and every table is exactly 16 bytes,
 * so each label below is 16-byte aligned. The code relies on this: it
 * reads these tables with aligned loads and SSE memory operands
 * (for example movaps from BLAKE3_IV, ROT8 and ROT16).
 * Do not reorder or resize entries without preserving that alignment.
 */
#ifdef __APPLE__
.static_data
#else
.section .rodata
#endif
.p2align 6
/* The four 32-bit IV words loaded as state row 2. */
BLAKE3_IV:
        .long   0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
/* pshufb mask: swaps the 16-bit halves of every dword (rotate by 16). */
ROT16:
        .byte    2,  3,  0,  1
        .byte    6,  7,  4,  5
        .byte   10, 11,  8,  9
        .byte   14, 15, 12, 13
/* pshufb mask: rotates every dword right by 8 bits. */
ROT8:
        .byte    1,  2,  3,  0
        .byte    5,  6,  7,  4
        .byte    9, 10, 11,  8
        .byte   13, 14, 15, 12
/* Per-lane offsets 0..3. */
ADD0:
        .long   0, 1, 2, 3
/* Constant 4 in every lane. */
ADD1:
        .long   4, 4, 4, 4
/* Each IV word replicated across all four lanes. */
BLAKE3_IV_0:
        .long   0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
BLAKE3_IV_1:
        .long   0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
BLAKE3_IV_2:
        .long   0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
BLAKE3_IV_3:
        .long   0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
/* The value 64 in every lane. */
BLAKE3_BLOCK_LEN:
        .long   64, 64, 64, 64
/* Sign bit in every lane; XORed into both operands before pcmpgtd so the signed compare acts as an unsigned one. */
CMP_MSB_MASK:
        .long   0x80000000, 0x80000000, 0x80000000, 0x80000000