digest-blake3 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2011 @@
1
+ .intel_syntax noprefix
2
+ .global blake3_hash_many_sse41
3
+ .global _blake3_hash_many_sse41
4
+ .global blake3_compress_in_place_sse41
5
+ .global _blake3_compress_in_place_sse41
6
+ .global blake3_compress_xof_sse41
7
+ .global _blake3_compress_xof_sse41
8
+ #ifdef __APPLE__
9
+ .text
10
+ #else
11
+ .section .text
12
+ #endif
13
+ .p2align 6
14
+ _blake3_hash_many_sse41:
15
+ blake3_hash_many_sse41:
16
+ push r15
17
+ push r14
18
+ push r13
19
+ push r12
20
+ push rbx
21
+ push rbp
22
+ mov rbp, rsp
23
+ sub rsp, 360
24
+ and rsp, 0xFFFFFFFFFFFFFFC0
25
+ neg r9d
26
+ movd xmm0, r9d
27
+ pshufd xmm0, xmm0, 0x00
28
+ movdqa xmmword ptr [rsp+0x130], xmm0
29
+ movdqa xmm1, xmm0
30
+ pand xmm1, xmmword ptr [ADD0+rip]
31
+ pand xmm0, xmmword ptr [ADD1+rip]
32
+ movdqa xmmword ptr [rsp+0x150], xmm0
33
+ movd xmm0, r8d
34
+ pshufd xmm0, xmm0, 0x00
35
+ paddd xmm0, xmm1
36
+ movdqa xmmword ptr [rsp+0x110], xmm0
37
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
38
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
39
+ pcmpgtd xmm1, xmm0
40
+ shr r8, 32
41
+ movd xmm2, r8d
42
+ pshufd xmm2, xmm2, 0x00
43
+ psubd xmm2, xmm1
44
+ movdqa xmmword ptr [rsp+0x120], xmm2
45
+ mov rbx, qword ptr [rbp+0x50]
46
+ mov r15, rdx
47
+ shl r15, 6
48
+ movzx r13d, byte ptr [rbp+0x38]
49
+ movzx r12d, byte ptr [rbp+0x48]
50
+ cmp rsi, 4
51
+ jc 3f
52
+ 2:
53
+ movdqu xmm3, xmmword ptr [rcx]
54
+ pshufd xmm0, xmm3, 0x00
55
+ pshufd xmm1, xmm3, 0x55
56
+ pshufd xmm2, xmm3, 0xAA
57
+ pshufd xmm3, xmm3, 0xFF
58
+ movdqu xmm7, xmmword ptr [rcx+0x10]
59
+ pshufd xmm4, xmm7, 0x00
60
+ pshufd xmm5, xmm7, 0x55
61
+ pshufd xmm6, xmm7, 0xAA
62
+ pshufd xmm7, xmm7, 0xFF
63
+ mov r8, qword ptr [rdi]
64
+ mov r9, qword ptr [rdi+0x8]
65
+ mov r10, qword ptr [rdi+0x10]
66
+ mov r11, qword ptr [rdi+0x18]
67
+ movzx eax, byte ptr [rbp+0x40]
68
+ or eax, r13d
69
+ xor edx, edx
70
+ 9:
71
+ mov r14d, eax
72
+ or eax, r12d
73
+ add rdx, 64
74
+ cmp rdx, r15
75
+ cmovne eax, r14d
76
+ movdqu xmm8, xmmword ptr [r8+rdx-0x40]
77
+ movdqu xmm9, xmmword ptr [r9+rdx-0x40]
78
+ movdqu xmm10, xmmword ptr [r10+rdx-0x40]
79
+ movdqu xmm11, xmmword ptr [r11+rdx-0x40]
80
+ movdqa xmm12, xmm8
81
+ punpckldq xmm8, xmm9
82
+ punpckhdq xmm12, xmm9
83
+ movdqa xmm14, xmm10
84
+ punpckldq xmm10, xmm11
85
+ punpckhdq xmm14, xmm11
86
+ movdqa xmm9, xmm8
87
+ punpcklqdq xmm8, xmm10
88
+ punpckhqdq xmm9, xmm10
89
+ movdqa xmm13, xmm12
90
+ punpcklqdq xmm12, xmm14
91
+ punpckhqdq xmm13, xmm14
92
+ movdqa xmmword ptr [rsp], xmm8
93
+ movdqa xmmword ptr [rsp+0x10], xmm9
94
+ movdqa xmmword ptr [rsp+0x20], xmm12
95
+ movdqa xmmword ptr [rsp+0x30], xmm13
96
+ movdqu xmm8, xmmword ptr [r8+rdx-0x30]
97
+ movdqu xmm9, xmmword ptr [r9+rdx-0x30]
98
+ movdqu xmm10, xmmword ptr [r10+rdx-0x30]
99
+ movdqu xmm11, xmmword ptr [r11+rdx-0x30]
100
+ movdqa xmm12, xmm8
101
+ punpckldq xmm8, xmm9
102
+ punpckhdq xmm12, xmm9
103
+ movdqa xmm14, xmm10
104
+ punpckldq xmm10, xmm11
105
+ punpckhdq xmm14, xmm11
106
+ movdqa xmm9, xmm8
107
+ punpcklqdq xmm8, xmm10
108
+ punpckhqdq xmm9, xmm10
109
+ movdqa xmm13, xmm12
110
+ punpcklqdq xmm12, xmm14
111
+ punpckhqdq xmm13, xmm14
112
+ movdqa xmmword ptr [rsp+0x40], xmm8
113
+ movdqa xmmword ptr [rsp+0x50], xmm9
114
+ movdqa xmmword ptr [rsp+0x60], xmm12
115
+ movdqa xmmword ptr [rsp+0x70], xmm13
116
+ movdqu xmm8, xmmword ptr [r8+rdx-0x20]
117
+ movdqu xmm9, xmmword ptr [r9+rdx-0x20]
118
+ movdqu xmm10, xmmword ptr [r10+rdx-0x20]
119
+ movdqu xmm11, xmmword ptr [r11+rdx-0x20]
120
+ movdqa xmm12, xmm8
121
+ punpckldq xmm8, xmm9
122
+ punpckhdq xmm12, xmm9
123
+ movdqa xmm14, xmm10
124
+ punpckldq xmm10, xmm11
125
+ punpckhdq xmm14, xmm11
126
+ movdqa xmm9, xmm8
127
+ punpcklqdq xmm8, xmm10
128
+ punpckhqdq xmm9, xmm10
129
+ movdqa xmm13, xmm12
130
+ punpcklqdq xmm12, xmm14
131
+ punpckhqdq xmm13, xmm14
132
+ movdqa xmmword ptr [rsp+0x80], xmm8
133
+ movdqa xmmword ptr [rsp+0x90], xmm9
134
+ movdqa xmmword ptr [rsp+0xA0], xmm12
135
+ movdqa xmmword ptr [rsp+0xB0], xmm13
136
+ movdqu xmm8, xmmword ptr [r8+rdx-0x10]
137
+ movdqu xmm9, xmmword ptr [r9+rdx-0x10]
138
+ movdqu xmm10, xmmword ptr [r10+rdx-0x10]
139
+ movdqu xmm11, xmmword ptr [r11+rdx-0x10]
140
+ movdqa xmm12, xmm8
141
+ punpckldq xmm8, xmm9
142
+ punpckhdq xmm12, xmm9
143
+ movdqa xmm14, xmm10
144
+ punpckldq xmm10, xmm11
145
+ punpckhdq xmm14, xmm11
146
+ movdqa xmm9, xmm8
147
+ punpcklqdq xmm8, xmm10
148
+ punpckhqdq xmm9, xmm10
149
+ movdqa xmm13, xmm12
150
+ punpcklqdq xmm12, xmm14
151
+ punpckhqdq xmm13, xmm14
152
+ movdqa xmmword ptr [rsp+0xC0], xmm8
153
+ movdqa xmmword ptr [rsp+0xD0], xmm9
154
+ movdqa xmmword ptr [rsp+0xE0], xmm12
155
+ movdqa xmmword ptr [rsp+0xF0], xmm13
156
+ movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
157
+ movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
158
+ movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
159
+ movdqa xmm12, xmmword ptr [rsp+0x110]
160
+ movdqa xmm13, xmmword ptr [rsp+0x120]
161
+ movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
162
+ movd xmm15, eax
163
+ pshufd xmm15, xmm15, 0x00
164
+ prefetcht0 [r8+rdx+0x80]
165
+ prefetcht0 [r9+rdx+0x80]
166
+ prefetcht0 [r10+rdx+0x80]
167
+ prefetcht0 [r11+rdx+0x80]
168
+ paddd xmm0, xmmword ptr [rsp]
169
+ paddd xmm1, xmmword ptr [rsp+0x20]
170
+ paddd xmm2, xmmword ptr [rsp+0x40]
171
+ paddd xmm3, xmmword ptr [rsp+0x60]
172
+ paddd xmm0, xmm4
173
+ paddd xmm1, xmm5
174
+ paddd xmm2, xmm6
175
+ paddd xmm3, xmm7
176
+ pxor xmm12, xmm0
177
+ pxor xmm13, xmm1
178
+ pxor xmm14, xmm2
179
+ pxor xmm15, xmm3
180
+ movdqa xmm8, xmmword ptr [ROT16+rip]
181
+ pshufb xmm12, xmm8
182
+ pshufb xmm13, xmm8
183
+ pshufb xmm14, xmm8
184
+ pshufb xmm15, xmm8
185
+ movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
186
+ paddd xmm8, xmm12
187
+ paddd xmm9, xmm13
188
+ paddd xmm10, xmm14
189
+ paddd xmm11, xmm15
190
+ pxor xmm4, xmm8
191
+ pxor xmm5, xmm9
192
+ pxor xmm6, xmm10
193
+ pxor xmm7, xmm11
194
+ movdqa xmmword ptr [rsp+0x100], xmm8
195
+ movdqa xmm8, xmm4
196
+ psrld xmm8, 12
197
+ pslld xmm4, 20
198
+ por xmm4, xmm8
199
+ movdqa xmm8, xmm5
200
+ psrld xmm8, 12
201
+ pslld xmm5, 20
202
+ por xmm5, xmm8
203
+ movdqa xmm8, xmm6
204
+ psrld xmm8, 12
205
+ pslld xmm6, 20
206
+ por xmm6, xmm8
207
+ movdqa xmm8, xmm7
208
+ psrld xmm8, 12
209
+ pslld xmm7, 20
210
+ por xmm7, xmm8
211
+ paddd xmm0, xmmword ptr [rsp+0x10]
212
+ paddd xmm1, xmmword ptr [rsp+0x30]
213
+ paddd xmm2, xmmword ptr [rsp+0x50]
214
+ paddd xmm3, xmmword ptr [rsp+0x70]
215
+ paddd xmm0, xmm4
216
+ paddd xmm1, xmm5
217
+ paddd xmm2, xmm6
218
+ paddd xmm3, xmm7
219
+ pxor xmm12, xmm0
220
+ pxor xmm13, xmm1
221
+ pxor xmm14, xmm2
222
+ pxor xmm15, xmm3
223
+ movdqa xmm8, xmmword ptr [ROT8+rip]
224
+ pshufb xmm12, xmm8
225
+ pshufb xmm13, xmm8
226
+ pshufb xmm14, xmm8
227
+ pshufb xmm15, xmm8
228
+ movdqa xmm8, xmmword ptr [rsp+0x100]
229
+ paddd xmm8, xmm12
230
+ paddd xmm9, xmm13
231
+ paddd xmm10, xmm14
232
+ paddd xmm11, xmm15
233
+ pxor xmm4, xmm8
234
+ pxor xmm5, xmm9
235
+ pxor xmm6, xmm10
236
+ pxor xmm7, xmm11
237
+ movdqa xmmword ptr [rsp+0x100], xmm8
238
+ movdqa xmm8, xmm4
239
+ psrld xmm8, 7
240
+ pslld xmm4, 25
241
+ por xmm4, xmm8
242
+ movdqa xmm8, xmm5
243
+ psrld xmm8, 7
244
+ pslld xmm5, 25
245
+ por xmm5, xmm8
246
+ movdqa xmm8, xmm6
247
+ psrld xmm8, 7
248
+ pslld xmm6, 25
249
+ por xmm6, xmm8
250
+ movdqa xmm8, xmm7
251
+ psrld xmm8, 7
252
+ pslld xmm7, 25
253
+ por xmm7, xmm8
254
+ paddd xmm0, xmmword ptr [rsp+0x80]
255
+ paddd xmm1, xmmword ptr [rsp+0xA0]
256
+ paddd xmm2, xmmword ptr [rsp+0xC0]
257
+ paddd xmm3, xmmword ptr [rsp+0xE0]
258
+ paddd xmm0, xmm5
259
+ paddd xmm1, xmm6
260
+ paddd xmm2, xmm7
261
+ paddd xmm3, xmm4
262
+ pxor xmm15, xmm0
263
+ pxor xmm12, xmm1
264
+ pxor xmm13, xmm2
265
+ pxor xmm14, xmm3
266
+ movdqa xmm8, xmmword ptr [ROT16+rip]
267
+ pshufb xmm15, xmm8
268
+ pshufb xmm12, xmm8
269
+ pshufb xmm13, xmm8
270
+ pshufb xmm14, xmm8
271
+ paddd xmm10, xmm15
272
+ paddd xmm11, xmm12
273
+ movdqa xmm8, xmmword ptr [rsp+0x100]
274
+ paddd xmm8, xmm13
275
+ paddd xmm9, xmm14
276
+ pxor xmm5, xmm10
277
+ pxor xmm6, xmm11
278
+ pxor xmm7, xmm8
279
+ pxor xmm4, xmm9
280
+ movdqa xmmword ptr [rsp+0x100], xmm8
281
+ movdqa xmm8, xmm5
282
+ psrld xmm8, 12
283
+ pslld xmm5, 20
284
+ por xmm5, xmm8
285
+ movdqa xmm8, xmm6
286
+ psrld xmm8, 12
287
+ pslld xmm6, 20
288
+ por xmm6, xmm8
289
+ movdqa xmm8, xmm7
290
+ psrld xmm8, 12
291
+ pslld xmm7, 20
292
+ por xmm7, xmm8
293
+ movdqa xmm8, xmm4
294
+ psrld xmm8, 12
295
+ pslld xmm4, 20
296
+ por xmm4, xmm8
297
+ paddd xmm0, xmmword ptr [rsp+0x90]
298
+ paddd xmm1, xmmword ptr [rsp+0xB0]
299
+ paddd xmm2, xmmword ptr [rsp+0xD0]
300
+ paddd xmm3, xmmword ptr [rsp+0xF0]
301
+ paddd xmm0, xmm5
302
+ paddd xmm1, xmm6
303
+ paddd xmm2, xmm7
304
+ paddd xmm3, xmm4
305
+ pxor xmm15, xmm0
306
+ pxor xmm12, xmm1
307
+ pxor xmm13, xmm2
308
+ pxor xmm14, xmm3
309
+ movdqa xmm8, xmmword ptr [ROT8+rip]
310
+ pshufb xmm15, xmm8
311
+ pshufb xmm12, xmm8
312
+ pshufb xmm13, xmm8
313
+ pshufb xmm14, xmm8
314
+ paddd xmm10, xmm15
315
+ paddd xmm11, xmm12
316
+ movdqa xmm8, xmmword ptr [rsp+0x100]
317
+ paddd xmm8, xmm13
318
+ paddd xmm9, xmm14
319
+ pxor xmm5, xmm10
320
+ pxor xmm6, xmm11
321
+ pxor xmm7, xmm8
322
+ pxor xmm4, xmm9
323
+ movdqa xmmword ptr [rsp+0x100], xmm8
324
+ movdqa xmm8, xmm5
325
+ psrld xmm8, 7
326
+ pslld xmm5, 25
327
+ por xmm5, xmm8
328
+ movdqa xmm8, xmm6
329
+ psrld xmm8, 7
330
+ pslld xmm6, 25
331
+ por xmm6, xmm8
332
+ movdqa xmm8, xmm7
333
+ psrld xmm8, 7
334
+ pslld xmm7, 25
335
+ por xmm7, xmm8
336
+ movdqa xmm8, xmm4
337
+ psrld xmm8, 7
338
+ pslld xmm4, 25
339
+ por xmm4, xmm8
340
+ paddd xmm0, xmmword ptr [rsp+0x20]
341
+ paddd xmm1, xmmword ptr [rsp+0x30]
342
+ paddd xmm2, xmmword ptr [rsp+0x70]
343
+ paddd xmm3, xmmword ptr [rsp+0x40]
344
+ paddd xmm0, xmm4
345
+ paddd xmm1, xmm5
346
+ paddd xmm2, xmm6
347
+ paddd xmm3, xmm7
348
+ pxor xmm12, xmm0
349
+ pxor xmm13, xmm1
350
+ pxor xmm14, xmm2
351
+ pxor xmm15, xmm3
352
+ movdqa xmm8, xmmword ptr [ROT16+rip]
353
+ pshufb xmm12, xmm8
354
+ pshufb xmm13, xmm8
355
+ pshufb xmm14, xmm8
356
+ pshufb xmm15, xmm8
357
+ movdqa xmm8, xmmword ptr [rsp+0x100]
358
+ paddd xmm8, xmm12
359
+ paddd xmm9, xmm13
360
+ paddd xmm10, xmm14
361
+ paddd xmm11, xmm15
362
+ pxor xmm4, xmm8
363
+ pxor xmm5, xmm9
364
+ pxor xmm6, xmm10
365
+ pxor xmm7, xmm11
366
+ movdqa xmmword ptr [rsp+0x100], xmm8
367
+ movdqa xmm8, xmm4
368
+ psrld xmm8, 12
369
+ pslld xmm4, 20
370
+ por xmm4, xmm8
371
+ movdqa xmm8, xmm5
372
+ psrld xmm8, 12
373
+ pslld xmm5, 20
374
+ por xmm5, xmm8
375
+ movdqa xmm8, xmm6
376
+ psrld xmm8, 12
377
+ pslld xmm6, 20
378
+ por xmm6, xmm8
379
+ movdqa xmm8, xmm7
380
+ psrld xmm8, 12
381
+ pslld xmm7, 20
382
+ por xmm7, xmm8
383
+ paddd xmm0, xmmword ptr [rsp+0x60]
384
+ paddd xmm1, xmmword ptr [rsp+0xA0]
385
+ paddd xmm2, xmmword ptr [rsp]
386
+ paddd xmm3, xmmword ptr [rsp+0xD0]
387
+ paddd xmm0, xmm4
388
+ paddd xmm1, xmm5
389
+ paddd xmm2, xmm6
390
+ paddd xmm3, xmm7
391
+ pxor xmm12, xmm0
392
+ pxor xmm13, xmm1
393
+ pxor xmm14, xmm2
394
+ pxor xmm15, xmm3
395
+ movdqa xmm8, xmmword ptr [ROT8+rip]
396
+ pshufb xmm12, xmm8
397
+ pshufb xmm13, xmm8
398
+ pshufb xmm14, xmm8
399
+ pshufb xmm15, xmm8
400
+ movdqa xmm8, xmmword ptr [rsp+0x100]
401
+ paddd xmm8, xmm12
402
+ paddd xmm9, xmm13
403
+ paddd xmm10, xmm14
404
+ paddd xmm11, xmm15
405
+ pxor xmm4, xmm8
406
+ pxor xmm5, xmm9
407
+ pxor xmm6, xmm10
408
+ pxor xmm7, xmm11
409
+ movdqa xmmword ptr [rsp+0x100], xmm8
410
+ movdqa xmm8, xmm4
411
+ psrld xmm8, 7
412
+ pslld xmm4, 25
413
+ por xmm4, xmm8
414
+ movdqa xmm8, xmm5
415
+ psrld xmm8, 7
416
+ pslld xmm5, 25
417
+ por xmm5, xmm8
418
+ movdqa xmm8, xmm6
419
+ psrld xmm8, 7
420
+ pslld xmm6, 25
421
+ por xmm6, xmm8
422
+ movdqa xmm8, xmm7
423
+ psrld xmm8, 7
424
+ pslld xmm7, 25
425
+ por xmm7, xmm8
426
+ paddd xmm0, xmmword ptr [rsp+0x10]
427
+ paddd xmm1, xmmword ptr [rsp+0xC0]
428
+ paddd xmm2, xmmword ptr [rsp+0x90]
429
+ paddd xmm3, xmmword ptr [rsp+0xF0]
430
+ paddd xmm0, xmm5
431
+ paddd xmm1, xmm6
432
+ paddd xmm2, xmm7
433
+ paddd xmm3, xmm4
434
+ pxor xmm15, xmm0
435
+ pxor xmm12, xmm1
436
+ pxor xmm13, xmm2
437
+ pxor xmm14, xmm3
438
+ movdqa xmm8, xmmword ptr [ROT16+rip]
439
+ pshufb xmm15, xmm8
440
+ pshufb xmm12, xmm8
441
+ pshufb xmm13, xmm8
442
+ pshufb xmm14, xmm8
443
+ paddd xmm10, xmm15
444
+ paddd xmm11, xmm12
445
+ movdqa xmm8, xmmword ptr [rsp+0x100]
446
+ paddd xmm8, xmm13
447
+ paddd xmm9, xmm14
448
+ pxor xmm5, xmm10
449
+ pxor xmm6, xmm11
450
+ pxor xmm7, xmm8
451
+ pxor xmm4, xmm9
452
+ movdqa xmmword ptr [rsp+0x100], xmm8
453
+ movdqa xmm8, xmm5
454
+ psrld xmm8, 12
455
+ pslld xmm5, 20
456
+ por xmm5, xmm8
457
+ movdqa xmm8, xmm6
458
+ psrld xmm8, 12
459
+ pslld xmm6, 20
460
+ por xmm6, xmm8
461
+ movdqa xmm8, xmm7
462
+ psrld xmm8, 12
463
+ pslld xmm7, 20
464
+ por xmm7, xmm8
465
+ movdqa xmm8, xmm4
466
+ psrld xmm8, 12
467
+ pslld xmm4, 20
468
+ por xmm4, xmm8
469
+ paddd xmm0, xmmword ptr [rsp+0xB0]
470
+ paddd xmm1, xmmword ptr [rsp+0x50]
471
+ paddd xmm2, xmmword ptr [rsp+0xE0]
472
+ paddd xmm3, xmmword ptr [rsp+0x80]
473
+ paddd xmm0, xmm5
474
+ paddd xmm1, xmm6
475
+ paddd xmm2, xmm7
476
+ paddd xmm3, xmm4
477
+ pxor xmm15, xmm0
478
+ pxor xmm12, xmm1
479
+ pxor xmm13, xmm2
480
+ pxor xmm14, xmm3
481
+ movdqa xmm8, xmmword ptr [ROT8+rip]
482
+ pshufb xmm15, xmm8
483
+ pshufb xmm12, xmm8
484
+ pshufb xmm13, xmm8
485
+ pshufb xmm14, xmm8
486
+ paddd xmm10, xmm15
487
+ paddd xmm11, xmm12
488
+ movdqa xmm8, xmmword ptr [rsp+0x100]
489
+ paddd xmm8, xmm13
490
+ paddd xmm9, xmm14
491
+ pxor xmm5, xmm10
492
+ pxor xmm6, xmm11
493
+ pxor xmm7, xmm8
494
+ pxor xmm4, xmm9
495
+ movdqa xmmword ptr [rsp+0x100], xmm8
496
+ movdqa xmm8, xmm5
497
+ psrld xmm8, 7
498
+ pslld xmm5, 25
499
+ por xmm5, xmm8
500
+ movdqa xmm8, xmm6
501
+ psrld xmm8, 7
502
+ pslld xmm6, 25
503
+ por xmm6, xmm8
504
+ movdqa xmm8, xmm7
505
+ psrld xmm8, 7
506
+ pslld xmm7, 25
507
+ por xmm7, xmm8
508
+ movdqa xmm8, xmm4
509
+ psrld xmm8, 7
510
+ pslld xmm4, 25
511
+ por xmm4, xmm8
512
+ paddd xmm0, xmmword ptr [rsp+0x30]
513
+ paddd xmm1, xmmword ptr [rsp+0xA0]
514
+ paddd xmm2, xmmword ptr [rsp+0xD0]
515
+ paddd xmm3, xmmword ptr [rsp+0x70]
516
+ paddd xmm0, xmm4
517
+ paddd xmm1, xmm5
518
+ paddd xmm2, xmm6
519
+ paddd xmm3, xmm7
520
+ pxor xmm12, xmm0
521
+ pxor xmm13, xmm1
522
+ pxor xmm14, xmm2
523
+ pxor xmm15, xmm3
524
+ movdqa xmm8, xmmword ptr [ROT16+rip]
525
+ pshufb xmm12, xmm8
526
+ pshufb xmm13, xmm8
527
+ pshufb xmm14, xmm8
528
+ pshufb xmm15, xmm8
529
+ movdqa xmm8, xmmword ptr [rsp+0x100]
530
+ paddd xmm8, xmm12
531
+ paddd xmm9, xmm13
532
+ paddd xmm10, xmm14
533
+ paddd xmm11, xmm15
534
+ pxor xmm4, xmm8
535
+ pxor xmm5, xmm9
536
+ pxor xmm6, xmm10
537
+ pxor xmm7, xmm11
538
+ movdqa xmmword ptr [rsp+0x100], xmm8
539
+ movdqa xmm8, xmm4
540
+ psrld xmm8, 12
541
+ pslld xmm4, 20
542
+ por xmm4, xmm8
543
+ movdqa xmm8, xmm5
544
+ psrld xmm8, 12
545
+ pslld xmm5, 20
546
+ por xmm5, xmm8
547
+ movdqa xmm8, xmm6
548
+ psrld xmm8, 12
549
+ pslld xmm6, 20
550
+ por xmm6, xmm8
551
+ movdqa xmm8, xmm7
552
+ psrld xmm8, 12
553
+ pslld xmm7, 20
554
+ por xmm7, xmm8
555
+ paddd xmm0, xmmword ptr [rsp+0x40]
556
+ paddd xmm1, xmmword ptr [rsp+0xC0]
557
+ paddd xmm2, xmmword ptr [rsp+0x20]
558
+ paddd xmm3, xmmword ptr [rsp+0xE0]
559
+ paddd xmm0, xmm4
560
+ paddd xmm1, xmm5
561
+ paddd xmm2, xmm6
562
+ paddd xmm3, xmm7
563
+ pxor xmm12, xmm0
564
+ pxor xmm13, xmm1
565
+ pxor xmm14, xmm2
566
+ pxor xmm15, xmm3
567
+ movdqa xmm8, xmmword ptr [ROT8+rip]
568
+ pshufb xmm12, xmm8
569
+ pshufb xmm13, xmm8
570
+ pshufb xmm14, xmm8
571
+ pshufb xmm15, xmm8
572
+ movdqa xmm8, xmmword ptr [rsp+0x100]
573
+ paddd xmm8, xmm12
574
+ paddd xmm9, xmm13
575
+ paddd xmm10, xmm14
576
+ paddd xmm11, xmm15
577
+ pxor xmm4, xmm8
578
+ pxor xmm5, xmm9
579
+ pxor xmm6, xmm10
580
+ pxor xmm7, xmm11
581
+ movdqa xmmword ptr [rsp+0x100], xmm8
582
+ movdqa xmm8, xmm4
583
+ psrld xmm8, 7
584
+ pslld xmm4, 25
585
+ por xmm4, xmm8
586
+ movdqa xmm8, xmm5
587
+ psrld xmm8, 7
588
+ pslld xmm5, 25
589
+ por xmm5, xmm8
590
+ movdqa xmm8, xmm6
591
+ psrld xmm8, 7
592
+ pslld xmm6, 25
593
+ por xmm6, xmm8
594
+ movdqa xmm8, xmm7
595
+ psrld xmm8, 7
596
+ pslld xmm7, 25
597
+ por xmm7, xmm8
598
+ paddd xmm0, xmmword ptr [rsp+0x60]
599
+ paddd xmm1, xmmword ptr [rsp+0x90]
600
+ paddd xmm2, xmmword ptr [rsp+0xB0]
601
+ paddd xmm3, xmmword ptr [rsp+0x80]
602
+ paddd xmm0, xmm5
603
+ paddd xmm1, xmm6
604
+ paddd xmm2, xmm7
605
+ paddd xmm3, xmm4
606
+ pxor xmm15, xmm0
607
+ pxor xmm12, xmm1
608
+ pxor xmm13, xmm2
609
+ pxor xmm14, xmm3
610
+ movdqa xmm8, xmmword ptr [ROT16+rip]
611
+ pshufb xmm15, xmm8
612
+ pshufb xmm12, xmm8
613
+ pshufb xmm13, xmm8
614
+ pshufb xmm14, xmm8
615
+ paddd xmm10, xmm15
616
+ paddd xmm11, xmm12
617
+ movdqa xmm8, xmmword ptr [rsp+0x100]
618
+ paddd xmm8, xmm13
619
+ paddd xmm9, xmm14
620
+ pxor xmm5, xmm10
621
+ pxor xmm6, xmm11
622
+ pxor xmm7, xmm8
623
+ pxor xmm4, xmm9
624
+ movdqa xmmword ptr [rsp+0x100], xmm8
625
+ movdqa xmm8, xmm5
626
+ psrld xmm8, 12
627
+ pslld xmm5, 20
628
+ por xmm5, xmm8
629
+ movdqa xmm8, xmm6
630
+ psrld xmm8, 12
631
+ pslld xmm6, 20
632
+ por xmm6, xmm8
633
+ movdqa xmm8, xmm7
634
+ psrld xmm8, 12
635
+ pslld xmm7, 20
636
+ por xmm7, xmm8
637
+ movdqa xmm8, xmm4
638
+ psrld xmm8, 12
639
+ pslld xmm4, 20
640
+ por xmm4, xmm8
641
+ paddd xmm0, xmmword ptr [rsp+0x50]
642
+ paddd xmm1, xmmword ptr [rsp]
643
+ paddd xmm2, xmmword ptr [rsp+0xF0]
644
+ paddd xmm3, xmmword ptr [rsp+0x10]
645
+ paddd xmm0, xmm5
646
+ paddd xmm1, xmm6
647
+ paddd xmm2, xmm7
648
+ paddd xmm3, xmm4
649
+ pxor xmm15, xmm0
650
+ pxor xmm12, xmm1
651
+ pxor xmm13, xmm2
652
+ pxor xmm14, xmm3
653
+ movdqa xmm8, xmmword ptr [ROT8+rip]
654
+ pshufb xmm15, xmm8
655
+ pshufb xmm12, xmm8
656
+ pshufb xmm13, xmm8
657
+ pshufb xmm14, xmm8
658
+ paddd xmm10, xmm15
659
+ paddd xmm11, xmm12
660
+ movdqa xmm8, xmmword ptr [rsp+0x100]
661
+ paddd xmm8, xmm13
662
+ paddd xmm9, xmm14
663
+ pxor xmm5, xmm10
664
+ pxor xmm6, xmm11
665
+ pxor xmm7, xmm8
666
+ pxor xmm4, xmm9
667
+ movdqa xmmword ptr [rsp+0x100], xmm8
668
+ movdqa xmm8, xmm5
669
+ psrld xmm8, 7
670
+ pslld xmm5, 25
671
+ por xmm5, xmm8
672
+ movdqa xmm8, xmm6
673
+ psrld xmm8, 7
674
+ pslld xmm6, 25
675
+ por xmm6, xmm8
676
+ movdqa xmm8, xmm7
677
+ psrld xmm8, 7
678
+ pslld xmm7, 25
679
+ por xmm7, xmm8
680
+ movdqa xmm8, xmm4
681
+ psrld xmm8, 7
682
+ pslld xmm4, 25
683
+ por xmm4, xmm8
684
+ paddd xmm0, xmmword ptr [rsp+0xA0]
685
+ paddd xmm1, xmmword ptr [rsp+0xC0]
686
+ paddd xmm2, xmmword ptr [rsp+0xE0]
687
+ paddd xmm3, xmmword ptr [rsp+0xD0]
688
+ paddd xmm0, xmm4
689
+ paddd xmm1, xmm5
690
+ paddd xmm2, xmm6
691
+ paddd xmm3, xmm7
692
+ pxor xmm12, xmm0
693
+ pxor xmm13, xmm1
694
+ pxor xmm14, xmm2
695
+ pxor xmm15, xmm3
696
+ movdqa xmm8, xmmword ptr [ROT16+rip]
697
+ pshufb xmm12, xmm8
698
+ pshufb xmm13, xmm8
699
+ pshufb xmm14, xmm8
700
+ pshufb xmm15, xmm8
701
+ movdqa xmm8, xmmword ptr [rsp+0x100]
702
+ paddd xmm8, xmm12
703
+ paddd xmm9, xmm13
704
+ paddd xmm10, xmm14
705
+ paddd xmm11, xmm15
706
+ pxor xmm4, xmm8
707
+ pxor xmm5, xmm9
708
+ pxor xmm6, xmm10
709
+ pxor xmm7, xmm11
710
+ movdqa xmmword ptr [rsp+0x100], xmm8
711
+ movdqa xmm8, xmm4
712
+ psrld xmm8, 12
713
+ pslld xmm4, 20
714
+ por xmm4, xmm8
715
+ movdqa xmm8, xmm5
716
+ psrld xmm8, 12
717
+ pslld xmm5, 20
718
+ por xmm5, xmm8
719
+ movdqa xmm8, xmm6
720
+ psrld xmm8, 12
721
+ pslld xmm6, 20
722
+ por xmm6, xmm8
723
+ movdqa xmm8, xmm7
724
+ psrld xmm8, 12
725
+ pslld xmm7, 20
726
+ por xmm7, xmm8
727
+ paddd xmm0, xmmword ptr [rsp+0x70]
728
+ paddd xmm1, xmmword ptr [rsp+0x90]
729
+ paddd xmm2, xmmword ptr [rsp+0x30]
730
+ paddd xmm3, xmmword ptr [rsp+0xF0]
731
+ paddd xmm0, xmm4
732
+ paddd xmm1, xmm5
733
+ paddd xmm2, xmm6
734
+ paddd xmm3, xmm7
735
+ pxor xmm12, xmm0
736
+ pxor xmm13, xmm1
737
+ pxor xmm14, xmm2
738
+ pxor xmm15, xmm3
739
+ movdqa xmm8, xmmword ptr [ROT8+rip]
740
+ pshufb xmm12, xmm8
741
+ pshufb xmm13, xmm8
742
+ pshufb xmm14, xmm8
743
+ pshufb xmm15, xmm8
744
+ movdqa xmm8, xmmword ptr [rsp+0x100]
745
+ paddd xmm8, xmm12
746
+ paddd xmm9, xmm13
747
+ paddd xmm10, xmm14
748
+ paddd xmm11, xmm15
749
+ pxor xmm4, xmm8
750
+ pxor xmm5, xmm9
751
+ pxor xmm6, xmm10
752
+ pxor xmm7, xmm11
753
+ movdqa xmmword ptr [rsp+0x100], xmm8
754
+ movdqa xmm8, xmm4
755
+ psrld xmm8, 7
756
+ pslld xmm4, 25
757
+ por xmm4, xmm8
758
+ movdqa xmm8, xmm5
759
+ psrld xmm8, 7
760
+ pslld xmm5, 25
761
+ por xmm5, xmm8
762
+ movdqa xmm8, xmm6
763
+ psrld xmm8, 7
764
+ pslld xmm6, 25
765
+ por xmm6, xmm8
766
+ movdqa xmm8, xmm7
767
+ psrld xmm8, 7
768
+ pslld xmm7, 25
769
+ por xmm7, xmm8
770
+ paddd xmm0, xmmword ptr [rsp+0x40]
771
+ paddd xmm1, xmmword ptr [rsp+0xB0]
772
+ paddd xmm2, xmmword ptr [rsp+0x50]
773
+ paddd xmm3, xmmword ptr [rsp+0x10]
774
+ paddd xmm0, xmm5
775
+ paddd xmm1, xmm6
776
+ paddd xmm2, xmm7
777
+ paddd xmm3, xmm4
778
+ pxor xmm15, xmm0
779
+ pxor xmm12, xmm1
780
+ pxor xmm13, xmm2
781
+ pxor xmm14, xmm3
782
+ movdqa xmm8, xmmword ptr [ROT16+rip]
783
+ pshufb xmm15, xmm8
784
+ pshufb xmm12, xmm8
785
+ pshufb xmm13, xmm8
786
+ pshufb xmm14, xmm8
787
+ paddd xmm10, xmm15
788
+ paddd xmm11, xmm12
789
+ movdqa xmm8, xmmword ptr [rsp+0x100]
790
+ paddd xmm8, xmm13
791
+ paddd xmm9, xmm14
792
+ pxor xmm5, xmm10
793
+ pxor xmm6, xmm11
794
+ pxor xmm7, xmm8
795
+ pxor xmm4, xmm9
796
+ movdqa xmmword ptr [rsp+0x100], xmm8
797
+ movdqa xmm8, xmm5
798
+ psrld xmm8, 12
799
+ pslld xmm5, 20
800
+ por xmm5, xmm8
801
+ movdqa xmm8, xmm6
802
+ psrld xmm8, 12
803
+ pslld xmm6, 20
804
+ por xmm6, xmm8
805
+ movdqa xmm8, xmm7
806
+ psrld xmm8, 12
807
+ pslld xmm7, 20
808
+ por xmm7, xmm8
809
+ movdqa xmm8, xmm4
810
+ psrld xmm8, 12
811
+ pslld xmm4, 20
812
+ por xmm4, xmm8
813
+ paddd xmm0, xmmword ptr [rsp]
814
+ paddd xmm1, xmmword ptr [rsp+0x20]
815
+ paddd xmm2, xmmword ptr [rsp+0x80]
816
+ paddd xmm3, xmmword ptr [rsp+0x60]
817
+ paddd xmm0, xmm5
818
+ paddd xmm1, xmm6
819
+ paddd xmm2, xmm7
820
+ paddd xmm3, xmm4
821
+ pxor xmm15, xmm0
822
+ pxor xmm12, xmm1
823
+ pxor xmm13, xmm2
824
+ pxor xmm14, xmm3
825
+ movdqa xmm8, xmmword ptr [ROT8+rip]
826
+ pshufb xmm15, xmm8
827
+ pshufb xmm12, xmm8
828
+ pshufb xmm13, xmm8
829
+ pshufb xmm14, xmm8
830
+ paddd xmm10, xmm15
831
+ paddd xmm11, xmm12
832
+ movdqa xmm8, xmmword ptr [rsp+0x100]
833
+ paddd xmm8, xmm13
834
+ paddd xmm9, xmm14
835
+ pxor xmm5, xmm10
836
+ pxor xmm6, xmm11
837
+ pxor xmm7, xmm8
838
+ pxor xmm4, xmm9
839
+ movdqa xmmword ptr [rsp+0x100], xmm8
840
+ movdqa xmm8, xmm5
841
+ psrld xmm8, 7
842
+ pslld xmm5, 25
843
+ por xmm5, xmm8
844
+ movdqa xmm8, xmm6
845
+ psrld xmm8, 7
846
+ pslld xmm6, 25
847
+ por xmm6, xmm8
848
+ movdqa xmm8, xmm7
849
+ psrld xmm8, 7
850
+ pslld xmm7, 25
851
+ por xmm7, xmm8
852
+ movdqa xmm8, xmm4
853
+ psrld xmm8, 7
854
+ pslld xmm4, 25
855
+ por xmm4, xmm8
856
+ paddd xmm0, xmmword ptr [rsp+0xC0]
857
+ paddd xmm1, xmmword ptr [rsp+0x90]
858
+ paddd xmm2, xmmword ptr [rsp+0xF0]
859
+ paddd xmm3, xmmword ptr [rsp+0xE0]
860
+ paddd xmm0, xmm4
861
+ paddd xmm1, xmm5
862
+ paddd xmm2, xmm6
863
+ paddd xmm3, xmm7
864
+ pxor xmm12, xmm0
865
+ pxor xmm13, xmm1
866
+ pxor xmm14, xmm2
867
+ pxor xmm15, xmm3
868
+ movdqa xmm8, xmmword ptr [ROT16+rip]
869
+ pshufb xmm12, xmm8
870
+ pshufb xmm13, xmm8
871
+ pshufb xmm14, xmm8
872
+ pshufb xmm15, xmm8
873
+ movdqa xmm8, xmmword ptr [rsp+0x100]
874
+ paddd xmm8, xmm12
875
+ paddd xmm9, xmm13
876
+ paddd xmm10, xmm14
877
+ paddd xmm11, xmm15
878
+ pxor xmm4, xmm8
879
+ pxor xmm5, xmm9
880
+ pxor xmm6, xmm10
881
+ pxor xmm7, xmm11
882
+ movdqa xmmword ptr [rsp+0x100], xmm8
883
+ movdqa xmm8, xmm4
884
+ psrld xmm8, 12
885
+ pslld xmm4, 20
886
+ por xmm4, xmm8
887
+ movdqa xmm8, xmm5
888
+ psrld xmm8, 12
889
+ pslld xmm5, 20
890
+ por xmm5, xmm8
891
+ movdqa xmm8, xmm6
892
+ psrld xmm8, 12
893
+ pslld xmm6, 20
894
+ por xmm6, xmm8
895
+ movdqa xmm8, xmm7
896
+ psrld xmm8, 12
897
+ pslld xmm7, 20
898
+ por xmm7, xmm8
899
+ paddd xmm0, xmmword ptr [rsp+0xD0]
900
+ paddd xmm1, xmmword ptr [rsp+0xB0]
901
+ paddd xmm2, xmmword ptr [rsp+0xA0]
902
+ paddd xmm3, xmmword ptr [rsp+0x80]
903
+ paddd xmm0, xmm4
904
+ paddd xmm1, xmm5
905
+ paddd xmm2, xmm6
906
+ paddd xmm3, xmm7
907
+ pxor xmm12, xmm0
908
+ pxor xmm13, xmm1
909
+ pxor xmm14, xmm2
910
+ pxor xmm15, xmm3
911
+ movdqa xmm8, xmmword ptr [ROT8+rip]
912
+ pshufb xmm12, xmm8
913
+ pshufb xmm13, xmm8
914
+ pshufb xmm14, xmm8
915
+ pshufb xmm15, xmm8
916
+ movdqa xmm8, xmmword ptr [rsp+0x100]
917
+ paddd xmm8, xmm12
918
+ paddd xmm9, xmm13
919
+ paddd xmm10, xmm14
920
+ paddd xmm11, xmm15
921
+ pxor xmm4, xmm8
922
+ pxor xmm5, xmm9
923
+ pxor xmm6, xmm10
924
+ pxor xmm7, xmm11
925
+ movdqa xmmword ptr [rsp+0x100], xmm8
926
+ movdqa xmm8, xmm4
927
+ psrld xmm8, 7
928
+ pslld xmm4, 25
929
+ por xmm4, xmm8
930
+ movdqa xmm8, xmm5
931
+ psrld xmm8, 7
932
+ pslld xmm5, 25
933
+ por xmm5, xmm8
934
+ movdqa xmm8, xmm6
935
+ psrld xmm8, 7
936
+ pslld xmm6, 25
937
+ por xmm6, xmm8
938
+ movdqa xmm8, xmm7
939
+ psrld xmm8, 7
940
+ pslld xmm7, 25
941
+ por xmm7, xmm8
942
+ paddd xmm0, xmmword ptr [rsp+0x70]
943
+ paddd xmm1, xmmword ptr [rsp+0x50]
944
+ paddd xmm2, xmmword ptr [rsp]
945
+ paddd xmm3, xmmword ptr [rsp+0x60]
946
+ paddd xmm0, xmm5
947
+ paddd xmm1, xmm6
948
+ paddd xmm2, xmm7
949
+ paddd xmm3, xmm4
950
+ pxor xmm15, xmm0
951
+ pxor xmm12, xmm1
952
+ pxor xmm13, xmm2
953
+ pxor xmm14, xmm3
954
+ movdqa xmm8, xmmword ptr [ROT16+rip]
955
+ pshufb xmm15, xmm8
956
+ pshufb xmm12, xmm8
957
+ pshufb xmm13, xmm8
958
+ pshufb xmm14, xmm8
959
+ paddd xmm10, xmm15
960
+ paddd xmm11, xmm12
961
+ movdqa xmm8, xmmword ptr [rsp+0x100]
962
+ paddd xmm8, xmm13
963
+ paddd xmm9, xmm14
964
+ pxor xmm5, xmm10
965
+ pxor xmm6, xmm11
966
+ pxor xmm7, xmm8
967
+ pxor xmm4, xmm9
968
+ movdqa xmmword ptr [rsp+0x100], xmm8
969
+ movdqa xmm8, xmm5
970
+ psrld xmm8, 12
971
+ pslld xmm5, 20
972
+ por xmm5, xmm8
973
+ movdqa xmm8, xmm6
974
+ psrld xmm8, 12
975
+ pslld xmm6, 20
976
+ por xmm6, xmm8
977
+ movdqa xmm8, xmm7
978
+ psrld xmm8, 12
979
+ pslld xmm7, 20
980
+ por xmm7, xmm8
981
+ movdqa xmm8, xmm4
982
+ psrld xmm8, 12
983
+ pslld xmm4, 20
984
+ por xmm4, xmm8
985
+ paddd xmm0, xmmword ptr [rsp+0x20]
986
+ paddd xmm1, xmmword ptr [rsp+0x30]
987
+ paddd xmm2, xmmword ptr [rsp+0x10]
988
+ paddd xmm3, xmmword ptr [rsp+0x40]
989
+ paddd xmm0, xmm5
990
+ paddd xmm1, xmm6
991
+ paddd xmm2, xmm7
992
+ paddd xmm3, xmm4
993
+ pxor xmm15, xmm0
994
+ pxor xmm12, xmm1
995
+ pxor xmm13, xmm2
996
+ pxor xmm14, xmm3
997
+ movdqa xmm8, xmmword ptr [ROT8+rip]
998
+ pshufb xmm15, xmm8
999
+ pshufb xmm12, xmm8
1000
+ pshufb xmm13, xmm8
1001
+ pshufb xmm14, xmm8
1002
+ paddd xmm10, xmm15
1003
+ paddd xmm11, xmm12
1004
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1005
+ paddd xmm8, xmm13
1006
+ paddd xmm9, xmm14
1007
+ pxor xmm5, xmm10
1008
+ pxor xmm6, xmm11
1009
+ pxor xmm7, xmm8
1010
+ pxor xmm4, xmm9
1011
+ movdqa xmmword ptr [rsp+0x100], xmm8
1012
+ movdqa xmm8, xmm5
1013
+ psrld xmm8, 7
1014
+ pslld xmm5, 25
1015
+ por xmm5, xmm8
1016
+ movdqa xmm8, xmm6
1017
+ psrld xmm8, 7
1018
+ pslld xmm6, 25
1019
+ por xmm6, xmm8
1020
+ movdqa xmm8, xmm7
1021
+ psrld xmm8, 7
1022
+ pslld xmm7, 25
1023
+ por xmm7, xmm8
1024
+ movdqa xmm8, xmm4
1025
+ psrld xmm8, 7
1026
+ pslld xmm4, 25
1027
+ por xmm4, xmm8
1028
+ paddd xmm0, xmmword ptr [rsp+0x90]
1029
+ paddd xmm1, xmmword ptr [rsp+0xB0]
1030
+ paddd xmm2, xmmword ptr [rsp+0x80]
1031
+ paddd xmm3, xmmword ptr [rsp+0xF0]
1032
+ paddd xmm0, xmm4
1033
+ paddd xmm1, xmm5
1034
+ paddd xmm2, xmm6
1035
+ paddd xmm3, xmm7
1036
+ pxor xmm12, xmm0
1037
+ pxor xmm13, xmm1
1038
+ pxor xmm14, xmm2
1039
+ pxor xmm15, xmm3
1040
+ movdqa xmm8, xmmword ptr [ROT16+rip]
1041
+ pshufb xmm12, xmm8
1042
+ pshufb xmm13, xmm8
1043
+ pshufb xmm14, xmm8
1044
+ pshufb xmm15, xmm8
1045
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1046
+ paddd xmm8, xmm12
1047
+ paddd xmm9, xmm13
1048
+ paddd xmm10, xmm14
1049
+ paddd xmm11, xmm15
1050
+ pxor xmm4, xmm8
1051
+ pxor xmm5, xmm9
1052
+ pxor xmm6, xmm10
1053
+ pxor xmm7, xmm11
1054
+ movdqa xmmword ptr [rsp+0x100], xmm8
1055
+ movdqa xmm8, xmm4
1056
+ psrld xmm8, 12
1057
+ pslld xmm4, 20
1058
+ por xmm4, xmm8
1059
+ movdqa xmm8, xmm5
1060
+ psrld xmm8, 12
1061
+ pslld xmm5, 20
1062
+ por xmm5, xmm8
1063
+ movdqa xmm8, xmm6
1064
+ psrld xmm8, 12
1065
+ pslld xmm6, 20
1066
+ por xmm6, xmm8
1067
+ movdqa xmm8, xmm7
1068
+ psrld xmm8, 12
1069
+ pslld xmm7, 20
1070
+ por xmm7, xmm8
1071
+ paddd xmm0, xmmword ptr [rsp+0xE0]
1072
+ paddd xmm1, xmmword ptr [rsp+0x50]
1073
+ paddd xmm2, xmmword ptr [rsp+0xC0]
1074
+ paddd xmm3, xmmword ptr [rsp+0x10]
1075
+ paddd xmm0, xmm4
1076
+ paddd xmm1, xmm5
1077
+ paddd xmm2, xmm6
1078
+ paddd xmm3, xmm7
1079
+ pxor xmm12, xmm0
1080
+ pxor xmm13, xmm1
1081
+ pxor xmm14, xmm2
1082
+ pxor xmm15, xmm3
1083
+ movdqa xmm8, xmmword ptr [ROT8+rip]
1084
+ pshufb xmm12, xmm8
1085
+ pshufb xmm13, xmm8
1086
+ pshufb xmm14, xmm8
1087
+ pshufb xmm15, xmm8
1088
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1089
+ paddd xmm8, xmm12
1090
+ paddd xmm9, xmm13
1091
+ paddd xmm10, xmm14
1092
+ paddd xmm11, xmm15
1093
+ pxor xmm4, xmm8
1094
+ pxor xmm5, xmm9
1095
+ pxor xmm6, xmm10
1096
+ pxor xmm7, xmm11
1097
+ movdqa xmmword ptr [rsp+0x100], xmm8
1098
+ movdqa xmm8, xmm4
1099
+ psrld xmm8, 7
1100
+ pslld xmm4, 25
1101
+ por xmm4, xmm8
1102
+ movdqa xmm8, xmm5
1103
+ psrld xmm8, 7
1104
+ pslld xmm5, 25
1105
+ por xmm5, xmm8
1106
+ movdqa xmm8, xmm6
1107
+ psrld xmm8, 7
1108
+ pslld xmm6, 25
1109
+ por xmm6, xmm8
1110
+ movdqa xmm8, xmm7
1111
+ psrld xmm8, 7
1112
+ pslld xmm7, 25
1113
+ por xmm7, xmm8
1114
+ paddd xmm0, xmmword ptr [rsp+0xD0]
1115
+ paddd xmm1, xmmword ptr [rsp]
1116
+ paddd xmm2, xmmword ptr [rsp+0x20]
1117
+ paddd xmm3, xmmword ptr [rsp+0x40]
1118
+ paddd xmm0, xmm5
1119
+ paddd xmm1, xmm6
1120
+ paddd xmm2, xmm7
1121
+ paddd xmm3, xmm4
1122
+ pxor xmm15, xmm0
1123
+ pxor xmm12, xmm1
1124
+ pxor xmm13, xmm2
1125
+ pxor xmm14, xmm3
1126
+ movdqa xmm8, xmmword ptr [ROT16+rip]
1127
+ pshufb xmm15, xmm8
1128
+ pshufb xmm12, xmm8
1129
+ pshufb xmm13, xmm8
1130
+ pshufb xmm14, xmm8
1131
+ paddd xmm10, xmm15
1132
+ paddd xmm11, xmm12
1133
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1134
+ paddd xmm8, xmm13
1135
+ paddd xmm9, xmm14
1136
+ pxor xmm5, xmm10
1137
+ pxor xmm6, xmm11
1138
+ pxor xmm7, xmm8
1139
+ pxor xmm4, xmm9
1140
+ movdqa xmmword ptr [rsp+0x100], xmm8
1141
+ movdqa xmm8, xmm5
1142
+ psrld xmm8, 12
1143
+ pslld xmm5, 20
1144
+ por xmm5, xmm8
1145
+ movdqa xmm8, xmm6
1146
+ psrld xmm8, 12
1147
+ pslld xmm6, 20
1148
+ por xmm6, xmm8
1149
+ movdqa xmm8, xmm7
1150
+ psrld xmm8, 12
1151
+ pslld xmm7, 20
1152
+ por xmm7, xmm8
1153
+ movdqa xmm8, xmm4
1154
+ psrld xmm8, 12
1155
+ pslld xmm4, 20
1156
+ por xmm4, xmm8
1157
+ paddd xmm0, xmmword ptr [rsp+0x30]
1158
+ paddd xmm1, xmmword ptr [rsp+0xA0]
1159
+ paddd xmm2, xmmword ptr [rsp+0x60]
1160
+ paddd xmm3, xmmword ptr [rsp+0x70]
1161
+ paddd xmm0, xmm5
1162
+ paddd xmm1, xmm6
1163
+ paddd xmm2, xmm7
1164
+ paddd xmm3, xmm4
1165
+ pxor xmm15, xmm0
1166
+ pxor xmm12, xmm1
1167
+ pxor xmm13, xmm2
1168
+ pxor xmm14, xmm3
1169
+ movdqa xmm8, xmmword ptr [ROT8+rip]
1170
+ pshufb xmm15, xmm8
1171
+ pshufb xmm12, xmm8
1172
+ pshufb xmm13, xmm8
1173
+ pshufb xmm14, xmm8
1174
+ paddd xmm10, xmm15
1175
+ paddd xmm11, xmm12
1176
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1177
+ paddd xmm8, xmm13
1178
+ paddd xmm9, xmm14
1179
+ pxor xmm5, xmm10
1180
+ pxor xmm6, xmm11
1181
+ pxor xmm7, xmm8
1182
+ pxor xmm4, xmm9
1183
+ movdqa xmmword ptr [rsp+0x100], xmm8
1184
+ movdqa xmm8, xmm5
1185
+ psrld xmm8, 7
1186
+ pslld xmm5, 25
1187
+ por xmm5, xmm8
1188
+ movdqa xmm8, xmm6
1189
+ psrld xmm8, 7
1190
+ pslld xmm6, 25
1191
+ por xmm6, xmm8
1192
+ movdqa xmm8, xmm7
1193
+ psrld xmm8, 7
1194
+ pslld xmm7, 25
1195
+ por xmm7, xmm8
1196
+ movdqa xmm8, xmm4
1197
+ psrld xmm8, 7
1198
+ pslld xmm4, 25
1199
+ por xmm4, xmm8
1200
+ paddd xmm0, xmmword ptr [rsp+0xB0]
1201
+ paddd xmm1, xmmword ptr [rsp+0x50]
1202
+ paddd xmm2, xmmword ptr [rsp+0x10]
1203
+ paddd xmm3, xmmword ptr [rsp+0x80]
1204
+ paddd xmm0, xmm4
1205
+ paddd xmm1, xmm5
1206
+ paddd xmm2, xmm6
1207
+ paddd xmm3, xmm7
1208
+ pxor xmm12, xmm0
1209
+ pxor xmm13, xmm1
1210
+ pxor xmm14, xmm2
1211
+ pxor xmm15, xmm3
1212
+ movdqa xmm8, xmmword ptr [ROT16+rip]
1213
+ pshufb xmm12, xmm8
1214
+ pshufb xmm13, xmm8
1215
+ pshufb xmm14, xmm8
1216
+ pshufb xmm15, xmm8
1217
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1218
+ paddd xmm8, xmm12
1219
+ paddd xmm9, xmm13
1220
+ paddd xmm10, xmm14
1221
+ paddd xmm11, xmm15
1222
+ pxor xmm4, xmm8
1223
+ pxor xmm5, xmm9
1224
+ pxor xmm6, xmm10
1225
+ pxor xmm7, xmm11
1226
+ movdqa xmmword ptr [rsp+0x100], xmm8
1227
+ movdqa xmm8, xmm4
1228
+ psrld xmm8, 12
1229
+ pslld xmm4, 20
1230
+ por xmm4, xmm8
1231
+ movdqa xmm8, xmm5
1232
+ psrld xmm8, 12
1233
+ pslld xmm5, 20
1234
+ por xmm5, xmm8
1235
+ movdqa xmm8, xmm6
1236
+ psrld xmm8, 12
1237
+ pslld xmm6, 20
1238
+ por xmm6, xmm8
1239
+ movdqa xmm8, xmm7
1240
+ psrld xmm8, 12
1241
+ pslld xmm7, 20
1242
+ por xmm7, xmm8
1243
+ paddd xmm0, xmmword ptr [rsp+0xF0]
1244
+ paddd xmm1, xmmword ptr [rsp]
1245
+ paddd xmm2, xmmword ptr [rsp+0x90]
1246
+ paddd xmm3, xmmword ptr [rsp+0x60]
1247
+ paddd xmm0, xmm4
1248
+ paddd xmm1, xmm5
1249
+ paddd xmm2, xmm6
1250
+ paddd xmm3, xmm7
1251
+ pxor xmm12, xmm0
1252
+ pxor xmm13, xmm1
1253
+ pxor xmm14, xmm2
1254
+ pxor xmm15, xmm3
1255
+ movdqa xmm8, xmmword ptr [ROT8+rip]
1256
+ pshufb xmm12, xmm8
1257
+ pshufb xmm13, xmm8
1258
+ pshufb xmm14, xmm8
1259
+ pshufb xmm15, xmm8
1260
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1261
+ paddd xmm8, xmm12
1262
+ paddd xmm9, xmm13
1263
+ paddd xmm10, xmm14
1264
+ paddd xmm11, xmm15
1265
+ pxor xmm4, xmm8
1266
+ pxor xmm5, xmm9
1267
+ pxor xmm6, xmm10
1268
+ pxor xmm7, xmm11
1269
+ movdqa xmmword ptr [rsp+0x100], xmm8
1270
+ movdqa xmm8, xmm4
1271
+ psrld xmm8, 7
1272
+ pslld xmm4, 25
1273
+ por xmm4, xmm8
1274
+ movdqa xmm8, xmm5
1275
+ psrld xmm8, 7
1276
+ pslld xmm5, 25
1277
+ por xmm5, xmm8
1278
+ movdqa xmm8, xmm6
1279
+ psrld xmm8, 7
1280
+ pslld xmm6, 25
1281
+ por xmm6, xmm8
1282
+ movdqa xmm8, xmm7
1283
+ psrld xmm8, 7
1284
+ pslld xmm7, 25
1285
+ por xmm7, xmm8
1286
+ paddd xmm0, xmmword ptr [rsp+0xE0]
1287
+ paddd xmm1, xmmword ptr [rsp+0x20]
1288
+ paddd xmm2, xmmword ptr [rsp+0x30]
1289
+ paddd xmm3, xmmword ptr [rsp+0x70]
1290
+ paddd xmm0, xmm5
1291
+ paddd xmm1, xmm6
1292
+ paddd xmm2, xmm7
1293
+ paddd xmm3, xmm4
1294
+ pxor xmm15, xmm0
1295
+ pxor xmm12, xmm1
1296
+ pxor xmm13, xmm2
1297
+ pxor xmm14, xmm3
1298
+ movdqa xmm8, xmmword ptr [ROT16+rip]
1299
+ pshufb xmm15, xmm8
1300
+ pshufb xmm12, xmm8
1301
+ pshufb xmm13, xmm8
1302
+ pshufb xmm14, xmm8
1303
+ paddd xmm10, xmm15
1304
+ paddd xmm11, xmm12
1305
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1306
+ paddd xmm8, xmm13
1307
+ paddd xmm9, xmm14
1308
+ pxor xmm5, xmm10
1309
+ pxor xmm6, xmm11
1310
+ pxor xmm7, xmm8
1311
+ pxor xmm4, xmm9
1312
+ movdqa xmmword ptr [rsp+0x100], xmm8
1313
+ movdqa xmm8, xmm5
1314
+ psrld xmm8, 12
1315
+ pslld xmm5, 20
1316
+ por xmm5, xmm8
1317
+ movdqa xmm8, xmm6
1318
+ psrld xmm8, 12
1319
+ pslld xmm6, 20
1320
+ por xmm6, xmm8
1321
+ movdqa xmm8, xmm7
1322
+ psrld xmm8, 12
1323
+ pslld xmm7, 20
1324
+ por xmm7, xmm8
1325
+ movdqa xmm8, xmm4
1326
+ psrld xmm8, 12
1327
+ pslld xmm4, 20
1328
+ por xmm4, xmm8
1329
+ paddd xmm0, xmmword ptr [rsp+0xA0]
1330
+ paddd xmm1, xmmword ptr [rsp+0xC0]
1331
+ paddd xmm2, xmmword ptr [rsp+0x40]
1332
+ paddd xmm3, xmmword ptr [rsp+0xD0]
1333
+ paddd xmm0, xmm5
1334
+ paddd xmm1, xmm6
1335
+ paddd xmm2, xmm7
1336
+ paddd xmm3, xmm4
1337
+ pxor xmm15, xmm0
1338
+ pxor xmm12, xmm1
1339
+ pxor xmm13, xmm2
1340
+ pxor xmm14, xmm3
1341
+ movdqa xmm8, xmmword ptr [ROT8+rip]
1342
+ pshufb xmm15, xmm8
1343
+ pshufb xmm12, xmm8
1344
+ pshufb xmm13, xmm8
1345
+ pshufb xmm14, xmm8
1346
+ paddd xmm10, xmm15
1347
+ paddd xmm11, xmm12
1348
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1349
+ paddd xmm8, xmm13
1350
+ paddd xmm9, xmm14
1351
+ pxor xmm5, xmm10
1352
+ pxor xmm6, xmm11
1353
+ pxor xmm7, xmm8
1354
+ pxor xmm4, xmm9
1355
+ pxor xmm0, xmm8
1356
+ pxor xmm1, xmm9
1357
+ pxor xmm2, xmm10
1358
+ pxor xmm3, xmm11
1359
+ movdqa xmm8, xmm5
1360
+ psrld xmm8, 7
1361
+ pslld xmm5, 25
1362
+ por xmm5, xmm8
1363
+ movdqa xmm8, xmm6
1364
+ psrld xmm8, 7
1365
+ pslld xmm6, 25
1366
+ por xmm6, xmm8
1367
+ movdqa xmm8, xmm7
1368
+ psrld xmm8, 7
1369
+ pslld xmm7, 25
1370
+ por xmm7, xmm8
1371
+ movdqa xmm8, xmm4
1372
+ psrld xmm8, 7
1373
+ pslld xmm4, 25
1374
+ por xmm4, xmm8
1375
+ pxor xmm4, xmm12
1376
+ pxor xmm5, xmm13
1377
+ pxor xmm6, xmm14
1378
+ pxor xmm7, xmm15
1379
+ mov eax, r13d
1380
+ jne 9b
1381
+ movdqa xmm9, xmm0
1382
+ punpckldq xmm0, xmm1
1383
+ punpckhdq xmm9, xmm1
1384
+ movdqa xmm11, xmm2
1385
+ punpckldq xmm2, xmm3
1386
+ punpckhdq xmm11, xmm3
1387
+ movdqa xmm1, xmm0
1388
+ punpcklqdq xmm0, xmm2
1389
+ punpckhqdq xmm1, xmm2
1390
+ movdqa xmm3, xmm9
1391
+ punpcklqdq xmm9, xmm11
1392
+ punpckhqdq xmm3, xmm11
1393
+ movdqu xmmword ptr [rbx], xmm0
1394
+ movdqu xmmword ptr [rbx+0x20], xmm1
1395
+ movdqu xmmword ptr [rbx+0x40], xmm9
1396
+ movdqu xmmword ptr [rbx+0x60], xmm3
1397
+ movdqa xmm9, xmm4
1398
+ punpckldq xmm4, xmm5
1399
+ punpckhdq xmm9, xmm5
1400
+ movdqa xmm11, xmm6
1401
+ punpckldq xmm6, xmm7
1402
+ punpckhdq xmm11, xmm7
1403
+ movdqa xmm5, xmm4
1404
+ punpcklqdq xmm4, xmm6
1405
+ punpckhqdq xmm5, xmm6
1406
+ movdqa xmm7, xmm9
1407
+ punpcklqdq xmm9, xmm11
1408
+ punpckhqdq xmm7, xmm11
1409
+ movdqu xmmword ptr [rbx+0x10], xmm4
1410
+ movdqu xmmword ptr [rbx+0x30], xmm5
1411
+ movdqu xmmword ptr [rbx+0x50], xmm9
1412
+ movdqu xmmword ptr [rbx+0x70], xmm7
1413
+ movdqa xmm1, xmmword ptr [rsp+0x110]
1414
+ movdqa xmm0, xmm1
1415
+ paddd xmm1, xmmword ptr [rsp+0x150]
1416
+ movdqa xmmword ptr [rsp+0x110], xmm1
1417
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1418
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1419
+ pcmpgtd xmm0, xmm1
1420
+ movdqa xmm1, xmmword ptr [rsp+0x120]
1421
+ psubd xmm1, xmm0
1422
+ movdqa xmmword ptr [rsp+0x120], xmm1
1423
+ add rbx, 128
1424
+ add rdi, 32
1425
+ sub rsi, 4
1426
+ cmp rsi, 4
1427
+ jnc 2b
1428
+ test rsi, rsi
1429
+ jnz 3f
1430
+ 4:
1431
+ mov rsp, rbp
1432
+ pop rbp
1433
+ pop rbx
1434
+ pop r12
1435
+ pop r13
1436
+ pop r14
1437
+ pop r15
1438
+ ret
1439
+ .p2align 5
1440
+ 3:
1441
+ test esi, 0x2
1442
+ je 3f
1443
+ movups xmm0, xmmword ptr [rcx]
1444
+ movups xmm1, xmmword ptr [rcx+0x10]
1445
+ movaps xmm8, xmm0
1446
+ movaps xmm9, xmm1
1447
+ movd xmm13, dword ptr [rsp+0x110]
1448
+ pinsrd xmm13, dword ptr [rsp+0x120], 1
1449
+ pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1450
+ movaps xmmword ptr [rsp], xmm13
1451
+ movd xmm14, dword ptr [rsp+0x114]
1452
+ pinsrd xmm14, dword ptr [rsp+0x124], 1
1453
+ pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1454
+ movaps xmmword ptr [rsp+0x10], xmm14
1455
+ mov r8, qword ptr [rdi]
1456
+ mov r9, qword ptr [rdi+0x8]
1457
+ movzx eax, byte ptr [rbp+0x40]
1458
+ or eax, r13d
1459
+ xor edx, edx
1460
+ 2:
1461
+ mov r14d, eax
1462
+ or eax, r12d
1463
+ add rdx, 64
1464
+ cmp rdx, r15
1465
+ cmovne eax, r14d
1466
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1467
+ movaps xmm10, xmm2
1468
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
1469
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
1470
+ movaps xmm3, xmm4
1471
+ shufps xmm4, xmm5, 136
1472
+ shufps xmm3, xmm5, 221
1473
+ movaps xmm5, xmm3
1474
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
1475
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
1476
+ movaps xmm3, xmm6
1477
+ shufps xmm6, xmm7, 136
1478
+ pshufd xmm6, xmm6, 0x93
1479
+ shufps xmm3, xmm7, 221
1480
+ pshufd xmm7, xmm3, 0x93
1481
+ movups xmm12, xmmword ptr [r9+rdx-0x40]
1482
+ movups xmm13, xmmword ptr [r9+rdx-0x30]
1483
+ movaps xmm11, xmm12
1484
+ shufps xmm12, xmm13, 136
1485
+ shufps xmm11, xmm13, 221
1486
+ movaps xmm13, xmm11
1487
+ movups xmm14, xmmword ptr [r9+rdx-0x20]
1488
+ movups xmm15, xmmword ptr [r9+rdx-0x10]
1489
+ movaps xmm11, xmm14
1490
+ shufps xmm14, xmm15, 136
1491
+ pshufd xmm14, xmm14, 0x93
1492
+ shufps xmm11, xmm15, 221
1493
+ pshufd xmm15, xmm11, 0x93
1494
+ movaps xmm3, xmmword ptr [rsp]
1495
+ movaps xmm11, xmmword ptr [rsp+0x10]
1496
+ pinsrd xmm3, eax, 3
1497
+ pinsrd xmm11, eax, 3
1498
+ mov al, 7
1499
+ 9:
1500
+ paddd xmm0, xmm4
1501
+ paddd xmm8, xmm12
1502
+ movaps xmmword ptr [rsp+0x20], xmm4
1503
+ movaps xmmword ptr [rsp+0x30], xmm12
1504
+ paddd xmm0, xmm1
1505
+ paddd xmm8, xmm9
1506
+ pxor xmm3, xmm0
1507
+ pxor xmm11, xmm8
1508
+ movaps xmm12, xmmword ptr [ROT16+rip]
1509
+ pshufb xmm3, xmm12
1510
+ pshufb xmm11, xmm12
1511
+ paddd xmm2, xmm3
1512
+ paddd xmm10, xmm11
1513
+ pxor xmm1, xmm2
1514
+ pxor xmm9, xmm10
1515
+ movdqa xmm4, xmm1
1516
+ pslld xmm1, 20
1517
+ psrld xmm4, 12
1518
+ por xmm1, xmm4
1519
+ movdqa xmm4, xmm9
1520
+ pslld xmm9, 20
1521
+ psrld xmm4, 12
1522
+ por xmm9, xmm4
1523
+ paddd xmm0, xmm5
1524
+ paddd xmm8, xmm13
1525
+ movaps xmmword ptr [rsp+0x40], xmm5
1526
+ movaps xmmword ptr [rsp+0x50], xmm13
1527
+ paddd xmm0, xmm1
1528
+ paddd xmm8, xmm9
1529
+ pxor xmm3, xmm0
1530
+ pxor xmm11, xmm8
1531
+ movaps xmm13, xmmword ptr [ROT8+rip]
1532
+ pshufb xmm3, xmm13
1533
+ pshufb xmm11, xmm13
1534
+ paddd xmm2, xmm3
1535
+ paddd xmm10, xmm11
1536
+ pxor xmm1, xmm2
1537
+ pxor xmm9, xmm10
1538
+ movdqa xmm4, xmm1
1539
+ pslld xmm1, 25
1540
+ psrld xmm4, 7
1541
+ por xmm1, xmm4
1542
+ movdqa xmm4, xmm9
1543
+ pslld xmm9, 25
1544
+ psrld xmm4, 7
1545
+ por xmm9, xmm4
1546
+ pshufd xmm0, xmm0, 0x93
1547
+ pshufd xmm8, xmm8, 0x93
1548
+ pshufd xmm3, xmm3, 0x4E
1549
+ pshufd xmm11, xmm11, 0x4E
1550
+ pshufd xmm2, xmm2, 0x39
1551
+ pshufd xmm10, xmm10, 0x39
1552
+ paddd xmm0, xmm6
1553
+ paddd xmm8, xmm14
1554
+ paddd xmm0, xmm1
1555
+ paddd xmm8, xmm9
1556
+ pxor xmm3, xmm0
1557
+ pxor xmm11, xmm8
1558
+ pshufb xmm3, xmm12
1559
+ pshufb xmm11, xmm12
1560
+ paddd xmm2, xmm3
1561
+ paddd xmm10, xmm11
1562
+ pxor xmm1, xmm2
1563
+ pxor xmm9, xmm10
1564
+ movdqa xmm4, xmm1
1565
+ pslld xmm1, 20
1566
+ psrld xmm4, 12
1567
+ por xmm1, xmm4
1568
+ movdqa xmm4, xmm9
1569
+ pslld xmm9, 20
1570
+ psrld xmm4, 12
1571
+ por xmm9, xmm4
1572
+ paddd xmm0, xmm7
1573
+ paddd xmm8, xmm15
1574
+ paddd xmm0, xmm1
1575
+ paddd xmm8, xmm9
1576
+ pxor xmm3, xmm0
1577
+ pxor xmm11, xmm8
1578
+ pshufb xmm3, xmm13
1579
+ pshufb xmm11, xmm13
1580
+ paddd xmm2, xmm3
1581
+ paddd xmm10, xmm11
1582
+ pxor xmm1, xmm2
1583
+ pxor xmm9, xmm10
1584
+ movdqa xmm4, xmm1
1585
+ pslld xmm1, 25
1586
+ psrld xmm4, 7
1587
+ por xmm1, xmm4
1588
+ movdqa xmm4, xmm9
1589
+ pslld xmm9, 25
1590
+ psrld xmm4, 7
1591
+ por xmm9, xmm4
1592
+ pshufd xmm0, xmm0, 0x39
1593
+ pshufd xmm8, xmm8, 0x39
1594
+ pshufd xmm3, xmm3, 0x4E
1595
+ pshufd xmm11, xmm11, 0x4E
1596
+ pshufd xmm2, xmm2, 0x93
1597
+ pshufd xmm10, xmm10, 0x93
1598
+ dec al
1599
+ je 9f
1600
+ movdqa xmm12, xmmword ptr [rsp+0x20]
1601
+ movdqa xmm5, xmmword ptr [rsp+0x40]
1602
+ pshufd xmm13, xmm12, 0x0F
1603
+ shufps xmm12, xmm5, 214
1604
+ pshufd xmm4, xmm12, 0x39
1605
+ movdqa xmm12, xmm6
1606
+ shufps xmm12, xmm7, 250
1607
+ pblendw xmm13, xmm12, 0xCC
1608
+ movdqa xmm12, xmm7
1609
+ punpcklqdq xmm12, xmm5
1610
+ pblendw xmm12, xmm6, 0xC0
1611
+ pshufd xmm12, xmm12, 0x78
1612
+ punpckhdq xmm5, xmm7
1613
+ punpckldq xmm6, xmm5
1614
+ pshufd xmm7, xmm6, 0x1E
1615
+ movdqa xmmword ptr [rsp+0x20], xmm13
1616
+ movdqa xmmword ptr [rsp+0x40], xmm12
1617
+ movdqa xmm5, xmmword ptr [rsp+0x30]
1618
+ movdqa xmm13, xmmword ptr [rsp+0x50]
1619
+ pshufd xmm6, xmm5, 0x0F
1620
+ shufps xmm5, xmm13, 214
1621
+ pshufd xmm12, xmm5, 0x39
1622
+ movdqa xmm5, xmm14
1623
+ shufps xmm5, xmm15, 250
1624
+ pblendw xmm6, xmm5, 0xCC
1625
+ movdqa xmm5, xmm15
1626
+ punpcklqdq xmm5, xmm13
1627
+ pblendw xmm5, xmm14, 0xC0
1628
+ pshufd xmm5, xmm5, 0x78
1629
+ punpckhdq xmm13, xmm15
1630
+ punpckldq xmm14, xmm13
1631
+ pshufd xmm15, xmm14, 0x1E
1632
+ movdqa xmm13, xmm6
1633
+ movdqa xmm14, xmm5
1634
+ movdqa xmm5, xmmword ptr [rsp+0x20]
1635
+ movdqa xmm6, xmmword ptr [rsp+0x40]
1636
+ jmp 9b
1637
+ 9:
1638
+ pxor xmm0, xmm2
1639
+ pxor xmm1, xmm3
1640
+ pxor xmm8, xmm10
1641
+ pxor xmm9, xmm11
1642
+ mov eax, r13d
1643
+ cmp rdx, r15
1644
+ jne 2b
1645
+ movups xmmword ptr [rbx], xmm0
1646
+ movups xmmword ptr [rbx+0x10], xmm1
1647
+ movups xmmword ptr [rbx+0x20], xmm8
1648
+ movups xmmword ptr [rbx+0x30], xmm9
1649
+ movdqa xmm0, xmmword ptr [rsp+0x130]
1650
+ movdqa xmm1, xmmword ptr [rsp+0x110]
1651
+ movdqa xmm2, xmmword ptr [rsp+0x120]
1652
+ movdqu xmm3, xmmword ptr [rsp+0x118]
1653
+ movdqu xmm4, xmmword ptr [rsp+0x128]
1654
+ blendvps xmm1, xmm3, xmm0
1655
+ blendvps xmm2, xmm4, xmm0
1656
+ movdqa xmmword ptr [rsp+0x110], xmm1
1657
+ movdqa xmmword ptr [rsp+0x120], xmm2
1658
+ add rdi, 16
1659
+ add rbx, 64
1660
+ sub rsi, 2
1661
+ 3:
1662
+ test esi, 0x1
1663
+ je 4b
1664
+ movups xmm0, xmmword ptr [rcx]
1665
+ movups xmm1, xmmword ptr [rcx+0x10]
1666
+ movd xmm13, dword ptr [rsp+0x110]
1667
+ pinsrd xmm13, dword ptr [rsp+0x120], 1
1668
+ pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1669
+ movaps xmm14, xmmword ptr [ROT8+rip]
1670
+ movaps xmm15, xmmword ptr [ROT16+rip]
1671
+ mov r8, qword ptr [rdi]
1672
+ movzx eax, byte ptr [rbp+0x40]
1673
+ or eax, r13d
1674
+ xor edx, edx
1675
+ 2:
1676
+ mov r14d, eax
1677
+ or eax, r12d
1678
+ add rdx, 64
1679
+ cmp rdx, r15
1680
+ cmovne eax, r14d
1681
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1682
+ movaps xmm3, xmm13
1683
+ pinsrd xmm3, eax, 3
1684
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
1685
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
1686
+ movaps xmm8, xmm4
1687
+ shufps xmm4, xmm5, 136
1688
+ shufps xmm8, xmm5, 221
1689
+ movaps xmm5, xmm8
1690
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
1691
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
1692
+ movaps xmm8, xmm6
1693
+ shufps xmm6, xmm7, 136
1694
+ pshufd xmm6, xmm6, 0x93
1695
+ shufps xmm8, xmm7, 221
1696
+ pshufd xmm7, xmm8, 0x93
1697
+ mov al, 7
1698
+ 9:
1699
+ paddd xmm0, xmm4
1700
+ paddd xmm0, xmm1
1701
+ pxor xmm3, xmm0
1702
+ pshufb xmm3, xmm15
1703
+ paddd xmm2, xmm3
1704
+ pxor xmm1, xmm2
1705
+ movdqa xmm11, xmm1
1706
+ pslld xmm1, 20
1707
+ psrld xmm11, 12
1708
+ por xmm1, xmm11
1709
+ paddd xmm0, xmm5
1710
+ paddd xmm0, xmm1
1711
+ pxor xmm3, xmm0
1712
+ pshufb xmm3, xmm14
1713
+ paddd xmm2, xmm3
1714
+ pxor xmm1, xmm2
1715
+ movdqa xmm11, xmm1
1716
+ pslld xmm1, 25
1717
+ psrld xmm11, 7
1718
+ por xmm1, xmm11
1719
+ pshufd xmm0, xmm0, 0x93
1720
+ pshufd xmm3, xmm3, 0x4E
1721
+ pshufd xmm2, xmm2, 0x39
1722
+ paddd xmm0, xmm6
1723
+ paddd xmm0, xmm1
1724
+ pxor xmm3, xmm0
1725
+ pshufb xmm3, xmm15
1726
+ paddd xmm2, xmm3
1727
+ pxor xmm1, xmm2
1728
+ movdqa xmm11, xmm1
1729
+ pslld xmm1, 20
1730
+ psrld xmm11, 12
1731
+ por xmm1, xmm11
1732
+ paddd xmm0, xmm7
1733
+ paddd xmm0, xmm1
1734
+ pxor xmm3, xmm0
1735
+ pshufb xmm3, xmm14
1736
+ paddd xmm2, xmm3
1737
+ pxor xmm1, xmm2
1738
+ movdqa xmm11, xmm1
1739
+ pslld xmm1, 25
1740
+ psrld xmm11, 7
1741
+ por xmm1, xmm11
1742
+ pshufd xmm0, xmm0, 0x39
1743
+ pshufd xmm3, xmm3, 0x4E
1744
+ pshufd xmm2, xmm2, 0x93
1745
+ dec al
1746
+ jz 9f
1747
+ movdqa xmm8, xmm4
1748
+ shufps xmm8, xmm5, 214
1749
+ pshufd xmm9, xmm4, 0x0F
1750
+ pshufd xmm4, xmm8, 0x39
1751
+ movdqa xmm8, xmm6
1752
+ shufps xmm8, xmm7, 250
1753
+ pblendw xmm9, xmm8, 0xCC
1754
+ movdqa xmm8, xmm7
1755
+ punpcklqdq xmm8, xmm5
1756
+ pblendw xmm8, xmm6, 0xC0
1757
+ pshufd xmm8, xmm8, 0x78
1758
+ punpckhdq xmm5, xmm7
1759
+ punpckldq xmm6, xmm5
1760
+ pshufd xmm7, xmm6, 0x1E
1761
+ movdqa xmm5, xmm9
1762
+ movdqa xmm6, xmm8
1763
+ jmp 9b
1764
+ 9:
1765
+ pxor xmm0, xmm2
1766
+ pxor xmm1, xmm3
1767
+ mov eax, r13d
1768
+ cmp rdx, r15
1769
+ jne 2b
1770
+ movups xmmword ptr [rbx], xmm0
1771
+ movups xmmword ptr [rbx+0x10], xmm1
1772
+ jmp 4b
1773
+
1774
/*
 * blake3_compress_in_place_sse41  (also exported as _blake3_compress_in_place_sse41)
 *
 * Runs the 7-iteration compression loop below on one 64-byte message block
 * and writes the 32-byte result back over the state it was given.
 *
 * ABI: System V AMD64, leaf function, no stack usage.
 *
 * Inputs as consumed by the code (names in parentheses follow the upstream
 * BLAKE3 C prototype -- NOTE(review): confirm against this package's header):
 *   rdi  -> 32 bytes, loaded at entry and overwritten at exit     (cv)
 *   rsi  -> 64 bytes of message, read only                        (block)
 *   dl   8-bit value placed in lane 2 of the fourth state row     (block_len)
 *   rcx  64-bit value placed in lanes 0-1 of the fourth state row (counter)
 *   r8b  8-bit value placed in lane 3 of the fourth state row     (flags)
 *
 * Clobbers: rax, rdx, xmm0-xmm9, xmm11, xmm14, xmm15, flags.
 *
 * [rdi] and [rsi] are accessed with unaligned moves; BLAKE3_IV, ROT8 and
 * ROT16 are loaded with movaps and therefore must be 16-byte aligned.
 */
        .p2align 6
blake3_compress_in_place_sse41:
_blake3_compress_in_place_sse41:
        /* State rows: xmm0 = a, xmm1 = b, xmm2 = c, xmm3 = d. */
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]

        /*
         * Build row d = (rcx low, rcx high, dl, r8b).
         * Only the low byte of rdx and r8 is meaningful (the sibling
         * blake3_compress_xof_sse41 treats the same arguments this way), and
         * the psABI does not promise that a caller cleared the remaining bits,
         * so zero-extend explicitly before packing them into one qword.
         * rax is free to use: al is overwritten by the round counter below.
         */
        movzx   eax, r8b
        movzx   edx, dl
        shl     rax, 32
        add     rdx, rax
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4

        /*
         * De-interleave the 16 message words:
         *   xmm4 = (m0, m2, m4, m6)      xmm5 = (m1, m3, m5, m7)
         *   xmm6 = (m14, m8, m10, m12)   xmm7 = (m15, m9, m11, m13)
         * The last two are rotated by one lane (pshufd 0x93) so that they line
         * up with the lane-rotated state rows used in the second half-round.
         */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93

        /* Byte-shuffle masks: pshufb with these rotates every 32-bit lane. */
        movaps  xmm14, xmmword ptr [ROT8+rip]
        movaps  xmm15, xmmword ptr [ROT16+rip]

        mov     al, 7                   /* loop counter: 7 iterations */
9:
        /* Column step, first half: a += m + b; d = rot16(d ^ a); c += d; b = rotr12(b ^ c) */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11

        /* Column step, second half: a += m + b; d = rotr8(d ^ a); c += d; b = rotr7(b ^ c) */
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11

        /* Rotate the lanes of rows a, d and c so the next step mixes diagonals. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39

        /* Diagonal step, first half (same arithmetic, message vector xmm6). */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11

        /* Diagonal step, second half (message vector xmm7). */
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11

        /* Undo the lane rotation (inverse shuffles of the three above). */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93

        /* The SSE shuffles above leave the flags alone, so ZF comes from dec. */
        dec     al
        jz      9f

        /*
         * Re-shuffle the four message vectors for the next iteration
         * (presumably the BLAKE3 message permutation; not re-derived lane by
         * lane here). xmm8 and xmm9 are scratch.
         */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0xCC
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0xC0
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /* Fold the lower rows into the upper ones and store the new 32-byte state. */
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        movups  xmmword ptr [rdi], xmm0
        movups  xmmword ptr [rdi+0x10], xmm1
        ret
1873
+
1874
/*
 * blake3_compress_xof_sse41  (also exported as _blake3_compress_xof_sse41)
 *
 * Same 7-iteration compression loop as the in-place variant, but the input
 * state at [rdi] is left untouched and a 64-byte result is written to [r9]:
 *   out[ 0..15] = a ^ c          out[32..47] = c ^ state[ 0..15]
 *   out[16..31] = b ^ d          out[48..63] = d ^ state[16..31]
 * where a, b, c, d are the four state rows after the last iteration.
 *
 * ABI: System V AMD64, leaf function, no stack usage.
 *
 * Inputs as consumed by the code (names in parentheses follow the upstream
 * BLAKE3 C prototype -- NOTE(review): confirm against this package's header):
 *   rdi  -> 32 bytes of input state, read only                    (cv)
 *   rsi  -> 64 bytes of message, read only                        (block)
 *   dl   8-bit value, zero-extended into lane 2 of row d          (block_len)
 *   rcx  64-bit value placed in lanes 0-1 of row d                (counter)
 *   r8b  8-bit value, zero-extended into lane 3 of row d          (flags)
 *   r9   -> 64 bytes of output                                    (out)
 *
 * Clobbers: rax, rdx, xmm0-xmm9, xmm11, xmm14, xmm15, flags.
 *
 * [rdi], [rsi] and [r9] are accessed with unaligned moves; BLAKE3_IV, ROT8
 * and ROT16 are loaded with movaps and must be 16-byte aligned.
 */
        .p2align 6
blake3_compress_xof_sse41:
_blake3_compress_xof_sse41:
        /* Lane-rotation masks for pshufb, kept in registers for the whole loop. */
        movaps      xmm15, xmmword ptr [ROT16+rip]
        movaps      xmm14, xmmword ptr [ROT8+rip]

        /* State rows: xmm0 = a, xmm1 = b, xmm2 = c (the BLAKE3_IV words). */
        movaps      xmm2, xmmword ptr [BLAKE3_IV+rip]
        movups      xmm0, xmmword ptr [rdi]
        movups      xmm1, xmmword ptr [rdi+0x10]

        /*
         * Row d = (rcx low, rcx high, dl, r8b). The two byte arguments are
         * zero-extended first; their bit ranges do not overlap, so OR-ing them
         * together yields the packed qword.
         */
        movzx       edx, dl
        movzx       eax, r8b
        shl         rax, 32
        or          rdx, rax
        movq        xmm3, rcx
        pinsrq      xmm3, rdx, 1

        /*
         * De-interleave the message (xmm8 is scratch for the odd 16-byte halves):
         *   xmm4 = (m0, m2, m4, m6)      xmm5 = (m1, m3, m5, m7)
         *   xmm6 = (m14, m8, m10, m12)   xmm7 = (m15, m9, m11, m13)
         */
        movups      xmm4, xmmword ptr [rsi]
        movups      xmm8, xmmword ptr [rsi+0x10]
        movaps      xmm5, xmm4
        shufps      xmm4, xmm8, 0x88
        shufps      xmm5, xmm8, 0xDD
        movups      xmm6, xmmword ptr [rsi+0x20]
        movups      xmm8, xmmword ptr [rsi+0x30]
        movaps      xmm7, xmm6
        shufps      xmm6, xmm8, 0x88
        shufps      xmm7, xmm8, 0xDD
        pshufd      xmm6, xmm6, 0x93
        pshufd      xmm7, xmm7, 0x93

        mov         al, 7                   /* seven passes through label 1 */
1:
        /* Columns, part 1: a += b + m; d = rot16(d ^ a); c += d; b = rotr12(b ^ c) */
        paddd       xmm0, xmm1
        paddd       xmm0, xmm4
        pxor        xmm3, xmm0
        pshufb      xmm3, xmm15
        paddd       xmm2, xmm3
        pxor        xmm1, xmm2
        movdqa      xmm11, xmm1
        psrld       xmm11, 12
        pslld       xmm1, 20
        por         xmm1, xmm11

        /* Columns, part 2: a += b + m; d = rotr8(d ^ a); c += d; b = rotr7(b ^ c) */
        paddd       xmm0, xmm1
        paddd       xmm0, xmm5
        pxor        xmm3, xmm0
        pshufb      xmm3, xmm14
        paddd       xmm2, xmm3
        pxor        xmm1, xmm2
        movdqa      xmm11, xmm1
        psrld       xmm11, 7
        pslld       xmm1, 25
        por         xmm1, xmm11

        /* Rotate lanes of c, d and a so the following steps mix diagonals. */
        pshufd      xmm2, xmm2, 0x39
        pshufd      xmm3, xmm3, 0x4E
        pshufd      xmm0, xmm0, 0x93

        /* Diagonals, part 1 (message vector xmm6). */
        paddd       xmm0, xmm1
        paddd       xmm0, xmm6
        pxor        xmm3, xmm0
        pshufb      xmm3, xmm15
        paddd       xmm2, xmm3
        pxor        xmm1, xmm2
        movdqa      xmm11, xmm1
        psrld       xmm11, 12
        pslld       xmm1, 20
        por         xmm1, xmm11

        /* Diagonals, part 2 (message vector xmm7). */
        paddd       xmm0, xmm1
        paddd       xmm0, xmm7
        pxor        xmm3, xmm0
        pshufb      xmm3, xmm14
        paddd       xmm2, xmm3
        pxor        xmm1, xmm2
        movdqa      xmm11, xmm1
        psrld       xmm11, 7
        pslld       xmm1, 25
        por         xmm1, xmm11

        /* Put the lanes back in column order. */
        pshufd      xmm2, xmm2, 0x93
        pshufd      xmm3, xmm3, 0x4E
        pshufd      xmm0, xmm0, 0x39

        /* Leave after the seventh pass; otherwise prepare the next one. */
        dec         al
        je          2f

        /*
         * Re-shuffle xmm4..xmm7 for the next pass (presumably the BLAKE3
         * message permutation; not re-derived lane by lane here).
         * xmm8 / xmm9 are scratch.
         */
        pshufd      xmm9, xmm4, 0x0F
        movdqa      xmm8, xmm4
        shufps      xmm8, xmm5, 0xD6
        pshufd      xmm4, xmm8, 0x39
        movdqa      xmm8, xmm6
        shufps      xmm8, xmm7, 0xFA
        pblendw     xmm9, xmm8, 0xCC
        movdqa      xmm8, xmm7
        punpcklqdq  xmm8, xmm5
        pblendw     xmm8, xmm6, 0xC0
        pshufd      xmm8, xmm8, 0x78
        punpckhdq   xmm5, xmm7
        punpckldq   xmm6, xmm5
        pshufd      xmm7, xmm6, 0x1E
        movdqa      xmm5, xmm9
        movdqa      xmm6, xmm8
        jmp         1b
2:
        /*
         * Finalise. Both loads from [rdi] are issued before any store to [r9]
         * so the result is the same even if the two buffers overlap.
         */
        pxor        xmm0, xmm2
        pxor        xmm1, xmm3
        movdqu      xmm4, xmmword ptr [rdi]
        movdqu      xmm5, xmmword ptr [rdi+0x10]
        pxor        xmm2, xmm4
        pxor        xmm3, xmm5
        movups      xmmword ptr [r9], xmm0
        movups      xmmword ptr [r9+0x10], xmm1
        movups      xmmword ptr [r9+0x20], xmm2
        movups      xmmword ptr [r9+0x30], xmm3
        ret
1981
+
1982
+
1983
/*
 * Constant tables shared by the routines in this file.
 *
 * The section starts 64-byte aligned and every table is exactly 16 bytes,
 * so each label is 16-byte aligned and may be used as an aligned memory
 * operand (movaps / movdqa / pand / pxor).
 */
#ifdef __APPLE__
        .static_data
#else
        .section .rodata
#endif
        .p2align 6

/* Four 32-bit words loaded into the third state row by the compress routines. */
BLAKE3_IV:
        .long   0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A

/* pshufb mask: swaps the 16-bit halves of every 32-bit lane (rotate by 16). */
ROT16:
        .byte   2, 3, 0, 1,  6, 7, 4, 5,  10, 11, 8, 9,  14, 15, 12, 13

/* pshufb mask: rotates every 32-bit lane right by 8 bits. */
ROT8:
        .byte   1, 2, 3, 0,  5, 6, 7, 4,  9, 10, 11, 8,  13, 14, 15, 12

/* Per-lane offsets 0..3, masked and added to the broadcast counter in the hash_many prologue. */
ADD0:
        .long   0, 1, 2, 3

/* Per-lane step of 4, masked in the hash_many prologue and added to the counters after each 4-way batch. */
ADD1:
        .long   4, 4, 4, 4

/*
 * Each BLAKE3_IV word broadcast to all four lanes. Not referenced in the
 * part of the file reviewed here; presumably used by the 4-way path of
 * blake3_hash_many_sse41 -- confirm.
 */
BLAKE3_IV_0:
        .long   0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
BLAKE3_IV_1:
        .long   0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
BLAKE3_IV_2:
        .long   0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
BLAKE3_IV_3:
        .long   0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A

/* The value 64 in every lane; the hash_many tail paths insert its first dword into lane 2 of the fourth row. */
BLAKE3_BLOCK_LEN:
        .long   64, 64, 64, 64

/* Sign-bit mask: XOR-ed into both operands so that signed pcmpgtd acts as an unsigned compare. */
CMP_MSB_MASK:
        .long   0x80000000, 0x80000000, 0x80000000, 0x80000000

/*
 * Mark the object as not needing an executable stack. Without this note the
 * GNU linker assumes hand-written assembly may require one and makes the
 * stack of the final binary executable (recent binutils also warn).
 * Kept at the very end because it switches the current section.
 */
#if defined(__ELF__) && defined(__linux__)
        .section .note.GNU-stack,"",%progbits
#endif