digest-blake3 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2057 @@
1
+ .intel_syntax noprefix # GAS Intel-syntax mode: destination operand first, registers written without a % prefix
2
+ .global blake3_hash_many_sse41 # exported symbol; its label is defined directly after this preamble
3
+ .global _blake3_hash_many_sse41 # leading-underscore spelling of the same entry point (both labels mark one address)
4
+ .global blake3_compress_in_place_sse41 # exported symbol, plain spelling
5
+ .global _blake3_compress_in_place_sse41 # exported symbol, leading-underscore spelling
6
+ .global blake3_compress_xof_sse41 # exported symbol, plain spelling
7
+ .global _blake3_compress_xof_sse41 # exported symbol, leading-underscore spelling
8
+ .section .text # the code that follows is emitted into the .text section
9
+ .p2align 6 # align the next label to a 2^6 = 64-byte boundary
10
+ _blake3_hash_many_sse41:
11
+ blake3_hash_many_sse41:
12
+ push r15
13
+ push r14
14
+ push r13
15
+ push r12
16
+ push rsi
17
+ push rdi
18
+ push rbx
19
+ push rbp
20
+ mov rbp, rsp
21
+ sub rsp, 528
22
+ and rsp, 0xFFFFFFFFFFFFFFC0
23
+ movdqa xmmword ptr [rsp+0x170], xmm6
24
+ movdqa xmmword ptr [rsp+0x180], xmm7
25
+ movdqa xmmword ptr [rsp+0x190], xmm8
26
+ movdqa xmmword ptr [rsp+0x1A0], xmm9
27
+ movdqa xmmword ptr [rsp+0x1B0], xmm10
28
+ movdqa xmmword ptr [rsp+0x1C0], xmm11
29
+ movdqa xmmword ptr [rsp+0x1D0], xmm12
30
+ movdqa xmmword ptr [rsp+0x1E0], xmm13
31
+ movdqa xmmword ptr [rsp+0x1F0], xmm14
32
+ movdqa xmmword ptr [rsp+0x200], xmm15
33
+ mov rdi, rcx
34
+ mov rsi, rdx
35
+ mov rdx, r8
36
+ mov rcx, r9
37
+ mov r8, qword ptr [rbp+0x68]
38
+ movzx r9, byte ptr [rbp+0x70]
39
+ neg r9d
40
+ movd xmm0, r9d
41
+ pshufd xmm0, xmm0, 0x00
42
+ movdqa xmmword ptr [rsp+0x130], xmm0
43
+ movdqa xmm1, xmm0
44
+ pand xmm1, xmmword ptr [ADD0+rip]
45
+ pand xmm0, xmmword ptr [ADD1+rip]
46
+ movdqa xmmword ptr [rsp+0x150], xmm0
47
+ movd xmm0, r8d
48
+ pshufd xmm0, xmm0, 0x00
49
+ paddd xmm0, xmm1
50
+ movdqa xmmword ptr [rsp+0x110], xmm0
51
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
52
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
53
+ pcmpgtd xmm1, xmm0
54
+ shr r8, 32
55
+ movd xmm2, r8d
56
+ pshufd xmm2, xmm2, 0x00
57
+ psubd xmm2, xmm1
58
+ movdqa xmmword ptr [rsp+0x120], xmm2
59
+ mov rbx, qword ptr [rbp+0x90]
60
+ mov r15, rdx
61
+ shl r15, 6
62
+ movzx r13d, byte ptr [rbp+0x78]
63
+ movzx r12d, byte ptr [rbp+0x88]
64
+ cmp rsi, 4
65
+ jc 3f
66
+ 2:
67
+ movdqu xmm3, xmmword ptr [rcx]
68
+ pshufd xmm0, xmm3, 0x00
69
+ pshufd xmm1, xmm3, 0x55
70
+ pshufd xmm2, xmm3, 0xAA
71
+ pshufd xmm3, xmm3, 0xFF
72
+ movdqu xmm7, xmmword ptr [rcx+0x10]
73
+ pshufd xmm4, xmm7, 0x00
74
+ pshufd xmm5, xmm7, 0x55
75
+ pshufd xmm6, xmm7, 0xAA
76
+ pshufd xmm7, xmm7, 0xFF
77
+ mov r8, qword ptr [rdi]
78
+ mov r9, qword ptr [rdi+0x8]
79
+ mov r10, qword ptr [rdi+0x10]
80
+ mov r11, qword ptr [rdi+0x18]
81
+ movzx eax, byte ptr [rbp+0x80]
82
+ or eax, r13d
83
+ xor edx, edx
84
+ 9:
85
+ mov r14d, eax
86
+ or eax, r12d
87
+ add rdx, 64
88
+ cmp rdx, r15
89
+ cmovne eax, r14d
90
+ movdqu xmm8, xmmword ptr [r8+rdx-0x40]
91
+ movdqu xmm9, xmmword ptr [r9+rdx-0x40]
92
+ movdqu xmm10, xmmword ptr [r10+rdx-0x40]
93
+ movdqu xmm11, xmmword ptr [r11+rdx-0x40]
94
+ movdqa xmm12, xmm8
95
+ punpckldq xmm8, xmm9
96
+ punpckhdq xmm12, xmm9
97
+ movdqa xmm14, xmm10
98
+ punpckldq xmm10, xmm11
99
+ punpckhdq xmm14, xmm11
100
+ movdqa xmm9, xmm8
101
+ punpcklqdq xmm8, xmm10
102
+ punpckhqdq xmm9, xmm10
103
+ movdqa xmm13, xmm12
104
+ punpcklqdq xmm12, xmm14
105
+ punpckhqdq xmm13, xmm14
106
+ movdqa xmmword ptr [rsp], xmm8
107
+ movdqa xmmword ptr [rsp+0x10], xmm9
108
+ movdqa xmmword ptr [rsp+0x20], xmm12
109
+ movdqa xmmword ptr [rsp+0x30], xmm13
110
+ movdqu xmm8, xmmword ptr [r8+rdx-0x30]
111
+ movdqu xmm9, xmmword ptr [r9+rdx-0x30]
112
+ movdqu xmm10, xmmword ptr [r10+rdx-0x30]
113
+ movdqu xmm11, xmmword ptr [r11+rdx-0x30]
114
+ movdqa xmm12, xmm8
115
+ punpckldq xmm8, xmm9
116
+ punpckhdq xmm12, xmm9
117
+ movdqa xmm14, xmm10
118
+ punpckldq xmm10, xmm11
119
+ punpckhdq xmm14, xmm11
120
+ movdqa xmm9, xmm8
121
+ punpcklqdq xmm8, xmm10
122
+ punpckhqdq xmm9, xmm10
123
+ movdqa xmm13, xmm12
124
+ punpcklqdq xmm12, xmm14
125
+ punpckhqdq xmm13, xmm14
126
+ movdqa xmmword ptr [rsp+0x40], xmm8
127
+ movdqa xmmword ptr [rsp+0x50], xmm9
128
+ movdqa xmmword ptr [rsp+0x60], xmm12
129
+ movdqa xmmword ptr [rsp+0x70], xmm13
130
+ movdqu xmm8, xmmword ptr [r8+rdx-0x20]
131
+ movdqu xmm9, xmmword ptr [r9+rdx-0x20]
132
+ movdqu xmm10, xmmword ptr [r10+rdx-0x20]
133
+ movdqu xmm11, xmmword ptr [r11+rdx-0x20]
134
+ movdqa xmm12, xmm8
135
+ punpckldq xmm8, xmm9
136
+ punpckhdq xmm12, xmm9
137
+ movdqa xmm14, xmm10
138
+ punpckldq xmm10, xmm11
139
+ punpckhdq xmm14, xmm11
140
+ movdqa xmm9, xmm8
141
+ punpcklqdq xmm8, xmm10
142
+ punpckhqdq xmm9, xmm10
143
+ movdqa xmm13, xmm12
144
+ punpcklqdq xmm12, xmm14
145
+ punpckhqdq xmm13, xmm14
146
+ movdqa xmmword ptr [rsp+0x80], xmm8
147
+ movdqa xmmword ptr [rsp+0x90], xmm9
148
+ movdqa xmmword ptr [rsp+0xA0], xmm12
149
+ movdqa xmmword ptr [rsp+0xB0], xmm13
150
+ movdqu xmm8, xmmword ptr [r8+rdx-0x10]
151
+ movdqu xmm9, xmmword ptr [r9+rdx-0x10]
152
+ movdqu xmm10, xmmword ptr [r10+rdx-0x10]
153
+ movdqu xmm11, xmmword ptr [r11+rdx-0x10]
154
+ movdqa xmm12, xmm8
155
+ punpckldq xmm8, xmm9
156
+ punpckhdq xmm12, xmm9
157
+ movdqa xmm14, xmm10
158
+ punpckldq xmm10, xmm11
159
+ punpckhdq xmm14, xmm11
160
+ movdqa xmm9, xmm8
161
+ punpcklqdq xmm8, xmm10
162
+ punpckhqdq xmm9, xmm10
163
+ movdqa xmm13, xmm12
164
+ punpcklqdq xmm12, xmm14
165
+ punpckhqdq xmm13, xmm14
166
+ movdqa xmmword ptr [rsp+0xC0], xmm8
167
+ movdqa xmmword ptr [rsp+0xD0], xmm9
168
+ movdqa xmmword ptr [rsp+0xE0], xmm12
169
+ movdqa xmmword ptr [rsp+0xF0], xmm13
170
+ movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
171
+ movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
172
+ movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
173
+ movdqa xmm12, xmmword ptr [rsp+0x110]
174
+ movdqa xmm13, xmmword ptr [rsp+0x120]
175
+ movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
176
+ movd xmm15, eax
177
+ pshufd xmm15, xmm15, 0x00
178
+ prefetcht0 [r8+rdx+0x80]
179
+ prefetcht0 [r9+rdx+0x80]
180
+ prefetcht0 [r10+rdx+0x80]
181
+ prefetcht0 [r11+rdx+0x80]
182
+ paddd xmm0, xmmword ptr [rsp]
183
+ paddd xmm1, xmmword ptr [rsp+0x20]
184
+ paddd xmm2, xmmword ptr [rsp+0x40]
185
+ paddd xmm3, xmmword ptr [rsp+0x60]
186
+ paddd xmm0, xmm4
187
+ paddd xmm1, xmm5
188
+ paddd xmm2, xmm6
189
+ paddd xmm3, xmm7
190
+ pxor xmm12, xmm0
191
+ pxor xmm13, xmm1
192
+ pxor xmm14, xmm2
193
+ pxor xmm15, xmm3
194
+ movdqa xmm8, xmmword ptr [ROT16+rip]
195
+ pshufb xmm12, xmm8
196
+ pshufb xmm13, xmm8
197
+ pshufb xmm14, xmm8
198
+ pshufb xmm15, xmm8
199
+ movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
200
+ paddd xmm8, xmm12
201
+ paddd xmm9, xmm13
202
+ paddd xmm10, xmm14
203
+ paddd xmm11, xmm15
204
+ pxor xmm4, xmm8
205
+ pxor xmm5, xmm9
206
+ pxor xmm6, xmm10
207
+ pxor xmm7, xmm11
208
+ movdqa xmmword ptr [rsp+0x100], xmm8
209
+ movdqa xmm8, xmm4
210
+ psrld xmm8, 12
211
+ pslld xmm4, 20
212
+ por xmm4, xmm8
213
+ movdqa xmm8, xmm5
214
+ psrld xmm8, 12
215
+ pslld xmm5, 20
216
+ por xmm5, xmm8
217
+ movdqa xmm8, xmm6
218
+ psrld xmm8, 12
219
+ pslld xmm6, 20
220
+ por xmm6, xmm8
221
+ movdqa xmm8, xmm7
222
+ psrld xmm8, 12
223
+ pslld xmm7, 20
224
+ por xmm7, xmm8
225
+ paddd xmm0, xmmword ptr [rsp+0x10]
226
+ paddd xmm1, xmmword ptr [rsp+0x30]
227
+ paddd xmm2, xmmword ptr [rsp+0x50]
228
+ paddd xmm3, xmmword ptr [rsp+0x70]
229
+ paddd xmm0, xmm4
230
+ paddd xmm1, xmm5
231
+ paddd xmm2, xmm6
232
+ paddd xmm3, xmm7
233
+ pxor xmm12, xmm0
234
+ pxor xmm13, xmm1
235
+ pxor xmm14, xmm2
236
+ pxor xmm15, xmm3
237
+ movdqa xmm8, xmmword ptr [ROT8+rip]
238
+ pshufb xmm12, xmm8
239
+ pshufb xmm13, xmm8
240
+ pshufb xmm14, xmm8
241
+ pshufb xmm15, xmm8
242
+ movdqa xmm8, xmmword ptr [rsp+0x100]
243
+ paddd xmm8, xmm12
244
+ paddd xmm9, xmm13
245
+ paddd xmm10, xmm14
246
+ paddd xmm11, xmm15
247
+ pxor xmm4, xmm8
248
+ pxor xmm5, xmm9
249
+ pxor xmm6, xmm10
250
+ pxor xmm7, xmm11
251
+ movdqa xmmword ptr [rsp+0x100], xmm8
252
+ movdqa xmm8, xmm4
253
+ psrld xmm8, 7
254
+ pslld xmm4, 25
255
+ por xmm4, xmm8
256
+ movdqa xmm8, xmm5
257
+ psrld xmm8, 7
258
+ pslld xmm5, 25
259
+ por xmm5, xmm8
260
+ movdqa xmm8, xmm6
261
+ psrld xmm8, 7
262
+ pslld xmm6, 25
263
+ por xmm6, xmm8
264
+ movdqa xmm8, xmm7
265
+ psrld xmm8, 7
266
+ pslld xmm7, 25
267
+ por xmm7, xmm8
268
+ paddd xmm0, xmmword ptr [rsp+0x80]
269
+ paddd xmm1, xmmword ptr [rsp+0xA0]
270
+ paddd xmm2, xmmword ptr [rsp+0xC0]
271
+ paddd xmm3, xmmword ptr [rsp+0xE0]
272
+ paddd xmm0, xmm5
273
+ paddd xmm1, xmm6
274
+ paddd xmm2, xmm7
275
+ paddd xmm3, xmm4
276
+ pxor xmm15, xmm0
277
+ pxor xmm12, xmm1
278
+ pxor xmm13, xmm2
279
+ pxor xmm14, xmm3
280
+ movdqa xmm8, xmmword ptr [ROT16+rip]
281
+ pshufb xmm15, xmm8
282
+ pshufb xmm12, xmm8
283
+ pshufb xmm13, xmm8
284
+ pshufb xmm14, xmm8
285
+ paddd xmm10, xmm15
286
+ paddd xmm11, xmm12
287
+ movdqa xmm8, xmmword ptr [rsp+0x100]
288
+ paddd xmm8, xmm13
289
+ paddd xmm9, xmm14
290
+ pxor xmm5, xmm10
291
+ pxor xmm6, xmm11
292
+ pxor xmm7, xmm8
293
+ pxor xmm4, xmm9
294
+ movdqa xmmword ptr [rsp+0x100], xmm8
295
+ movdqa xmm8, xmm5
296
+ psrld xmm8, 12
297
+ pslld xmm5, 20
298
+ por xmm5, xmm8
299
+ movdqa xmm8, xmm6
300
+ psrld xmm8, 12
301
+ pslld xmm6, 20
302
+ por xmm6, xmm8
303
+ movdqa xmm8, xmm7
304
+ psrld xmm8, 12
305
+ pslld xmm7, 20
306
+ por xmm7, xmm8
307
+ movdqa xmm8, xmm4
308
+ psrld xmm8, 12
309
+ pslld xmm4, 20
310
+ por xmm4, xmm8
311
+ paddd xmm0, xmmword ptr [rsp+0x90]
312
+ paddd xmm1, xmmword ptr [rsp+0xB0]
313
+ paddd xmm2, xmmword ptr [rsp+0xD0]
314
+ paddd xmm3, xmmword ptr [rsp+0xF0]
315
+ paddd xmm0, xmm5
316
+ paddd xmm1, xmm6
317
+ paddd xmm2, xmm7
318
+ paddd xmm3, xmm4
319
+ pxor xmm15, xmm0
320
+ pxor xmm12, xmm1
321
+ pxor xmm13, xmm2
322
+ pxor xmm14, xmm3
323
+ movdqa xmm8, xmmword ptr [ROT8+rip]
324
+ pshufb xmm15, xmm8
325
+ pshufb xmm12, xmm8
326
+ pshufb xmm13, xmm8
327
+ pshufb xmm14, xmm8
328
+ paddd xmm10, xmm15
329
+ paddd xmm11, xmm12
330
+ movdqa xmm8, xmmword ptr [rsp+0x100]
331
+ paddd xmm8, xmm13
332
+ paddd xmm9, xmm14
333
+ pxor xmm5, xmm10
334
+ pxor xmm6, xmm11
335
+ pxor xmm7, xmm8
336
+ pxor xmm4, xmm9
337
+ movdqa xmmword ptr [rsp+0x100], xmm8
338
+ movdqa xmm8, xmm5
339
+ psrld xmm8, 7
340
+ pslld xmm5, 25
341
+ por xmm5, xmm8
342
+ movdqa xmm8, xmm6
343
+ psrld xmm8, 7
344
+ pslld xmm6, 25
345
+ por xmm6, xmm8
346
+ movdqa xmm8, xmm7
347
+ psrld xmm8, 7
348
+ pslld xmm7, 25
349
+ por xmm7, xmm8
350
+ movdqa xmm8, xmm4
351
+ psrld xmm8, 7
352
+ pslld xmm4, 25
353
+ por xmm4, xmm8
354
+ paddd xmm0, xmmword ptr [rsp+0x20]
355
+ paddd xmm1, xmmword ptr [rsp+0x30]
356
+ paddd xmm2, xmmword ptr [rsp+0x70]
357
+ paddd xmm3, xmmword ptr [rsp+0x40]
358
+ paddd xmm0, xmm4
359
+ paddd xmm1, xmm5
360
+ paddd xmm2, xmm6
361
+ paddd xmm3, xmm7
362
+ pxor xmm12, xmm0
363
+ pxor xmm13, xmm1
364
+ pxor xmm14, xmm2
365
+ pxor xmm15, xmm3
366
+ movdqa xmm8, xmmword ptr [ROT16+rip]
367
+ pshufb xmm12, xmm8
368
+ pshufb xmm13, xmm8
369
+ pshufb xmm14, xmm8
370
+ pshufb xmm15, xmm8
371
+ movdqa xmm8, xmmword ptr [rsp+0x100]
372
+ paddd xmm8, xmm12
373
+ paddd xmm9, xmm13
374
+ paddd xmm10, xmm14
375
+ paddd xmm11, xmm15
376
+ pxor xmm4, xmm8
377
+ pxor xmm5, xmm9
378
+ pxor xmm6, xmm10
379
+ pxor xmm7, xmm11
380
+ movdqa xmmword ptr [rsp+0x100], xmm8
381
+ movdqa xmm8, xmm4
382
+ psrld xmm8, 12
383
+ pslld xmm4, 20
384
+ por xmm4, xmm8
385
+ movdqa xmm8, xmm5
386
+ psrld xmm8, 12
387
+ pslld xmm5, 20
388
+ por xmm5, xmm8
389
+ movdqa xmm8, xmm6
390
+ psrld xmm8, 12
391
+ pslld xmm6, 20
392
+ por xmm6, xmm8
393
+ movdqa xmm8, xmm7
394
+ psrld xmm8, 12
395
+ pslld xmm7, 20
396
+ por xmm7, xmm8
397
+ paddd xmm0, xmmword ptr [rsp+0x60]
398
+ paddd xmm1, xmmword ptr [rsp+0xA0]
399
+ paddd xmm2, xmmword ptr [rsp]
400
+ paddd xmm3, xmmword ptr [rsp+0xD0]
401
+ paddd xmm0, xmm4
402
+ paddd xmm1, xmm5
403
+ paddd xmm2, xmm6
404
+ paddd xmm3, xmm7
405
+ pxor xmm12, xmm0
406
+ pxor xmm13, xmm1
407
+ pxor xmm14, xmm2
408
+ pxor xmm15, xmm3
409
+ movdqa xmm8, xmmword ptr [ROT8+rip]
410
+ pshufb xmm12, xmm8
411
+ pshufb xmm13, xmm8
412
+ pshufb xmm14, xmm8
413
+ pshufb xmm15, xmm8
414
+ movdqa xmm8, xmmword ptr [rsp+0x100]
415
+ paddd xmm8, xmm12
416
+ paddd xmm9, xmm13
417
+ paddd xmm10, xmm14
418
+ paddd xmm11, xmm15
419
+ pxor xmm4, xmm8
420
+ pxor xmm5, xmm9
421
+ pxor xmm6, xmm10
422
+ pxor xmm7, xmm11
423
+ movdqa xmmword ptr [rsp+0x100], xmm8
424
+ movdqa xmm8, xmm4
425
+ psrld xmm8, 7
426
+ pslld xmm4, 25
427
+ por xmm4, xmm8
428
+ movdqa xmm8, xmm5
429
+ psrld xmm8, 7
430
+ pslld xmm5, 25
431
+ por xmm5, xmm8
432
+ movdqa xmm8, xmm6
433
+ psrld xmm8, 7
434
+ pslld xmm6, 25
435
+ por xmm6, xmm8
436
+ movdqa xmm8, xmm7
437
+ psrld xmm8, 7
438
+ pslld xmm7, 25
439
+ por xmm7, xmm8
440
+ paddd xmm0, xmmword ptr [rsp+0x10]
441
+ paddd xmm1, xmmword ptr [rsp+0xC0]
442
+ paddd xmm2, xmmword ptr [rsp+0x90]
443
+ paddd xmm3, xmmword ptr [rsp+0xF0]
444
+ paddd xmm0, xmm5
445
+ paddd xmm1, xmm6
446
+ paddd xmm2, xmm7
447
+ paddd xmm3, xmm4
448
+ pxor xmm15, xmm0
449
+ pxor xmm12, xmm1
450
+ pxor xmm13, xmm2
451
+ pxor xmm14, xmm3
452
+ movdqa xmm8, xmmword ptr [ROT16+rip]
453
+ pshufb xmm15, xmm8
454
+ pshufb xmm12, xmm8
455
+ pshufb xmm13, xmm8
456
+ pshufb xmm14, xmm8
457
+ paddd xmm10, xmm15
458
+ paddd xmm11, xmm12
459
+ movdqa xmm8, xmmword ptr [rsp+0x100]
460
+ paddd xmm8, xmm13
461
+ paddd xmm9, xmm14
462
+ pxor xmm5, xmm10
463
+ pxor xmm6, xmm11
464
+ pxor xmm7, xmm8
465
+ pxor xmm4, xmm9
466
+ movdqa xmmword ptr [rsp+0x100], xmm8
467
+ movdqa xmm8, xmm5
468
+ psrld xmm8, 12
469
+ pslld xmm5, 20
470
+ por xmm5, xmm8
471
+ movdqa xmm8, xmm6
472
+ psrld xmm8, 12
473
+ pslld xmm6, 20
474
+ por xmm6, xmm8
475
+ movdqa xmm8, xmm7
476
+ psrld xmm8, 12
477
+ pslld xmm7, 20
478
+ por xmm7, xmm8
479
+ movdqa xmm8, xmm4
480
+ psrld xmm8, 12
481
+ pslld xmm4, 20
482
+ por xmm4, xmm8
483
+ paddd xmm0, xmmword ptr [rsp+0xB0]
484
+ paddd xmm1, xmmword ptr [rsp+0x50]
485
+ paddd xmm2, xmmword ptr [rsp+0xE0]
486
+ paddd xmm3, xmmword ptr [rsp+0x80]
487
+ paddd xmm0, xmm5
488
+ paddd xmm1, xmm6
489
+ paddd xmm2, xmm7
490
+ paddd xmm3, xmm4
491
+ pxor xmm15, xmm0
492
+ pxor xmm12, xmm1
493
+ pxor xmm13, xmm2
494
+ pxor xmm14, xmm3
495
+ movdqa xmm8, xmmword ptr [ROT8+rip]
496
+ pshufb xmm15, xmm8
497
+ pshufb xmm12, xmm8
498
+ pshufb xmm13, xmm8
499
+ pshufb xmm14, xmm8
500
+ paddd xmm10, xmm15
501
+ paddd xmm11, xmm12
502
+ movdqa xmm8, xmmword ptr [rsp+0x100]
503
+ paddd xmm8, xmm13
504
+ paddd xmm9, xmm14
505
+ pxor xmm5, xmm10
506
+ pxor xmm6, xmm11
507
+ pxor xmm7, xmm8
508
+ pxor xmm4, xmm9
509
+ movdqa xmmword ptr [rsp+0x100], xmm8
510
+ movdqa xmm8, xmm5
511
+ psrld xmm8, 7
512
+ pslld xmm5, 25
513
+ por xmm5, xmm8
514
+ movdqa xmm8, xmm6
515
+ psrld xmm8, 7
516
+ pslld xmm6, 25
517
+ por xmm6, xmm8
518
+ movdqa xmm8, xmm7
519
+ psrld xmm8, 7
520
+ pslld xmm7, 25
521
+ por xmm7, xmm8
522
+ movdqa xmm8, xmm4
523
+ psrld xmm8, 7
524
+ pslld xmm4, 25
525
+ por xmm4, xmm8
526
+ paddd xmm0, xmmword ptr [rsp+0x30]
527
+ paddd xmm1, xmmword ptr [rsp+0xA0]
528
+ paddd xmm2, xmmword ptr [rsp+0xD0]
529
+ paddd xmm3, xmmword ptr [rsp+0x70]
530
+ paddd xmm0, xmm4
531
+ paddd xmm1, xmm5
532
+ paddd xmm2, xmm6
533
+ paddd xmm3, xmm7
534
+ pxor xmm12, xmm0
535
+ pxor xmm13, xmm1
536
+ pxor xmm14, xmm2
537
+ pxor xmm15, xmm3
538
+ movdqa xmm8, xmmword ptr [ROT16+rip]
539
+ pshufb xmm12, xmm8
540
+ pshufb xmm13, xmm8
541
+ pshufb xmm14, xmm8
542
+ pshufb xmm15, xmm8
543
+ movdqa xmm8, xmmword ptr [rsp+0x100]
544
+ paddd xmm8, xmm12
545
+ paddd xmm9, xmm13
546
+ paddd xmm10, xmm14
547
+ paddd xmm11, xmm15
548
+ pxor xmm4, xmm8
549
+ pxor xmm5, xmm9
550
+ pxor xmm6, xmm10
551
+ pxor xmm7, xmm11
552
+ movdqa xmmword ptr [rsp+0x100], xmm8
553
+ movdqa xmm8, xmm4
554
+ psrld xmm8, 12
555
+ pslld xmm4, 20
556
+ por xmm4, xmm8
557
+ movdqa xmm8, xmm5
558
+ psrld xmm8, 12
559
+ pslld xmm5, 20
560
+ por xmm5, xmm8
561
+ movdqa xmm8, xmm6
562
+ psrld xmm8, 12
563
+ pslld xmm6, 20
564
+ por xmm6, xmm8
565
+ movdqa xmm8, xmm7
566
+ psrld xmm8, 12
567
+ pslld xmm7, 20
568
+ por xmm7, xmm8
569
+ paddd xmm0, xmmword ptr [rsp+0x40]
570
+ paddd xmm1, xmmword ptr [rsp+0xC0]
571
+ paddd xmm2, xmmword ptr [rsp+0x20]
572
+ paddd xmm3, xmmword ptr [rsp+0xE0]
573
+ paddd xmm0, xmm4
574
+ paddd xmm1, xmm5
575
+ paddd xmm2, xmm6
576
+ paddd xmm3, xmm7
577
+ pxor xmm12, xmm0
578
+ pxor xmm13, xmm1
579
+ pxor xmm14, xmm2
580
+ pxor xmm15, xmm3
581
+ movdqa xmm8, xmmword ptr [ROT8+rip]
582
+ pshufb xmm12, xmm8
583
+ pshufb xmm13, xmm8
584
+ pshufb xmm14, xmm8
585
+ pshufb xmm15, xmm8
586
+ movdqa xmm8, xmmword ptr [rsp+0x100]
587
+ paddd xmm8, xmm12
588
+ paddd xmm9, xmm13
589
+ paddd xmm10, xmm14
590
+ paddd xmm11, xmm15
591
+ pxor xmm4, xmm8
592
+ pxor xmm5, xmm9
593
+ pxor xmm6, xmm10
594
+ pxor xmm7, xmm11
595
+ movdqa xmmword ptr [rsp+0x100], xmm8
596
+ movdqa xmm8, xmm4
597
+ psrld xmm8, 7
598
+ pslld xmm4, 25
599
+ por xmm4, xmm8
600
+ movdqa xmm8, xmm5
601
+ psrld xmm8, 7
602
+ pslld xmm5, 25
603
+ por xmm5, xmm8
604
+ movdqa xmm8, xmm6
605
+ psrld xmm8, 7
606
+ pslld xmm6, 25
607
+ por xmm6, xmm8
608
+ movdqa xmm8, xmm7
609
+ psrld xmm8, 7
610
+ pslld xmm7, 25
611
+ por xmm7, xmm8
612
+ paddd xmm0, xmmword ptr [rsp+0x60]
613
+ paddd xmm1, xmmword ptr [rsp+0x90]
614
+ paddd xmm2, xmmword ptr [rsp+0xB0]
615
+ paddd xmm3, xmmword ptr [rsp+0x80]
616
+ paddd xmm0, xmm5
617
+ paddd xmm1, xmm6
618
+ paddd xmm2, xmm7
619
+ paddd xmm3, xmm4
620
+ pxor xmm15, xmm0
621
+ pxor xmm12, xmm1
622
+ pxor xmm13, xmm2
623
+ pxor xmm14, xmm3
624
+ movdqa xmm8, xmmword ptr [ROT16+rip]
625
+ pshufb xmm15, xmm8
626
+ pshufb xmm12, xmm8
627
+ pshufb xmm13, xmm8
628
+ pshufb xmm14, xmm8
629
+ paddd xmm10, xmm15
630
+ paddd xmm11, xmm12
631
+ movdqa xmm8, xmmword ptr [rsp+0x100]
632
+ paddd xmm8, xmm13
633
+ paddd xmm9, xmm14
634
+ pxor xmm5, xmm10
635
+ pxor xmm6, xmm11
636
+ pxor xmm7, xmm8
637
+ pxor xmm4, xmm9
638
+ movdqa xmmword ptr [rsp+0x100], xmm8
639
+ movdqa xmm8, xmm5
640
+ psrld xmm8, 12
641
+ pslld xmm5, 20
642
+ por xmm5, xmm8
643
+ movdqa xmm8, xmm6
644
+ psrld xmm8, 12
645
+ pslld xmm6, 20
646
+ por xmm6, xmm8
647
+ movdqa xmm8, xmm7
648
+ psrld xmm8, 12
649
+ pslld xmm7, 20
650
+ por xmm7, xmm8
651
+ movdqa xmm8, xmm4
652
+ psrld xmm8, 12
653
+ pslld xmm4, 20
654
+ por xmm4, xmm8
655
+ paddd xmm0, xmmword ptr [rsp+0x50]
656
+ paddd xmm1, xmmword ptr [rsp]
657
+ paddd xmm2, xmmword ptr [rsp+0xF0]
658
+ paddd xmm3, xmmword ptr [rsp+0x10]
659
+ paddd xmm0, xmm5
660
+ paddd xmm1, xmm6
661
+ paddd xmm2, xmm7
662
+ paddd xmm3, xmm4
663
+ pxor xmm15, xmm0
664
+ pxor xmm12, xmm1
665
+ pxor xmm13, xmm2
666
+ pxor xmm14, xmm3
667
+ movdqa xmm8, xmmword ptr [ROT8+rip]
668
+ pshufb xmm15, xmm8
669
+ pshufb xmm12, xmm8
670
+ pshufb xmm13, xmm8
671
+ pshufb xmm14, xmm8
672
+ paddd xmm10, xmm15
673
+ paddd xmm11, xmm12
674
+ movdqa xmm8, xmmword ptr [rsp+0x100]
675
+ paddd xmm8, xmm13
676
+ paddd xmm9, xmm14
677
+ pxor xmm5, xmm10
678
+ pxor xmm6, xmm11
679
+ pxor xmm7, xmm8
680
+ pxor xmm4, xmm9
681
+ movdqa xmmword ptr [rsp+0x100], xmm8
682
+ movdqa xmm8, xmm5
683
+ psrld xmm8, 7
684
+ pslld xmm5, 25
685
+ por xmm5, xmm8
686
+ movdqa xmm8, xmm6
687
+ psrld xmm8, 7
688
+ pslld xmm6, 25
689
+ por xmm6, xmm8
690
+ movdqa xmm8, xmm7
691
+ psrld xmm8, 7
692
+ pslld xmm7, 25
693
+ por xmm7, xmm8
694
+ movdqa xmm8, xmm4
695
+ psrld xmm8, 7
696
+ pslld xmm4, 25
697
+ por xmm4, xmm8
698
+ paddd xmm0, xmmword ptr [rsp+0xA0]
699
+ paddd xmm1, xmmword ptr [rsp+0xC0]
700
+ paddd xmm2, xmmword ptr [rsp+0xE0]
701
+ paddd xmm3, xmmword ptr [rsp+0xD0]
702
+ paddd xmm0, xmm4
703
+ paddd xmm1, xmm5
704
+ paddd xmm2, xmm6
705
+ paddd xmm3, xmm7
706
+ pxor xmm12, xmm0
707
+ pxor xmm13, xmm1
708
+ pxor xmm14, xmm2
709
+ pxor xmm15, xmm3
710
+ movdqa xmm8, xmmword ptr [ROT16+rip]
711
+ pshufb xmm12, xmm8
712
+ pshufb xmm13, xmm8
713
+ pshufb xmm14, xmm8
714
+ pshufb xmm15, xmm8
715
+ movdqa xmm8, xmmword ptr [rsp+0x100]
716
+ paddd xmm8, xmm12
717
+ paddd xmm9, xmm13
718
+ paddd xmm10, xmm14
719
+ paddd xmm11, xmm15
720
+ pxor xmm4, xmm8
721
+ pxor xmm5, xmm9
722
+ pxor xmm6, xmm10
723
+ pxor xmm7, xmm11
724
+ movdqa xmmword ptr [rsp+0x100], xmm8
725
+ movdqa xmm8, xmm4
726
+ psrld xmm8, 12
727
+ pslld xmm4, 20
728
+ por xmm4, xmm8
729
+ movdqa xmm8, xmm5
730
+ psrld xmm8, 12
731
+ pslld xmm5, 20
732
+ por xmm5, xmm8
733
+ movdqa xmm8, xmm6
734
+ psrld xmm8, 12
735
+ pslld xmm6, 20
736
+ por xmm6, xmm8
737
+ movdqa xmm8, xmm7
738
+ psrld xmm8, 12
739
+ pslld xmm7, 20
740
+ por xmm7, xmm8
741
+ paddd xmm0, xmmword ptr [rsp+0x70]
742
+ paddd xmm1, xmmword ptr [rsp+0x90]
743
+ paddd xmm2, xmmword ptr [rsp+0x30]
744
+ paddd xmm3, xmmword ptr [rsp+0xF0]
745
+ paddd xmm0, xmm4
746
+ paddd xmm1, xmm5
747
+ paddd xmm2, xmm6
748
+ paddd xmm3, xmm7
749
+ pxor xmm12, xmm0
750
+ pxor xmm13, xmm1
751
+ pxor xmm14, xmm2
752
+ pxor xmm15, xmm3
753
+ movdqa xmm8, xmmword ptr [ROT8+rip]
754
+ pshufb xmm12, xmm8
755
+ pshufb xmm13, xmm8
756
+ pshufb xmm14, xmm8
757
+ pshufb xmm15, xmm8
758
+ movdqa xmm8, xmmword ptr [rsp+0x100]
759
+ paddd xmm8, xmm12
760
+ paddd xmm9, xmm13
761
+ paddd xmm10, xmm14
762
+ paddd xmm11, xmm15
763
+ pxor xmm4, xmm8
764
+ pxor xmm5, xmm9
765
+ pxor xmm6, xmm10
766
+ pxor xmm7, xmm11
767
+ movdqa xmmword ptr [rsp+0x100], xmm8
768
+ movdqa xmm8, xmm4
769
+ psrld xmm8, 7
770
+ pslld xmm4, 25
771
+ por xmm4, xmm8
772
+ movdqa xmm8, xmm5
773
+ psrld xmm8, 7
774
+ pslld xmm5, 25
775
+ por xmm5, xmm8
776
+ movdqa xmm8, xmm6
777
+ psrld xmm8, 7
778
+ pslld xmm6, 25
779
+ por xmm6, xmm8
780
+ movdqa xmm8, xmm7
781
+ psrld xmm8, 7
782
+ pslld xmm7, 25
783
+ por xmm7, xmm8
784
+ paddd xmm0, xmmword ptr [rsp+0x40]
785
+ paddd xmm1, xmmword ptr [rsp+0xB0]
786
+ paddd xmm2, xmmword ptr [rsp+0x50]
787
+ paddd xmm3, xmmword ptr [rsp+0x10]
788
+ paddd xmm0, xmm5
789
+ paddd xmm1, xmm6
790
+ paddd xmm2, xmm7
791
+ paddd xmm3, xmm4
792
+ pxor xmm15, xmm0
793
+ pxor xmm12, xmm1
794
+ pxor xmm13, xmm2
795
+ pxor xmm14, xmm3
796
+ movdqa xmm8, xmmword ptr [ROT16+rip]
797
+ pshufb xmm15, xmm8
798
+ pshufb xmm12, xmm8
799
+ pshufb xmm13, xmm8
800
+ pshufb xmm14, xmm8
801
+ paddd xmm10, xmm15
802
+ paddd xmm11, xmm12
803
+ movdqa xmm8, xmmword ptr [rsp+0x100]
804
+ paddd xmm8, xmm13
805
+ paddd xmm9, xmm14
806
+ pxor xmm5, xmm10
807
+ pxor xmm6, xmm11
808
+ pxor xmm7, xmm8
809
+ pxor xmm4, xmm9
810
+ movdqa xmmword ptr [rsp+0x100], xmm8
811
+ movdqa xmm8, xmm5
812
+ psrld xmm8, 12
813
+ pslld xmm5, 20
814
+ por xmm5, xmm8
815
+ movdqa xmm8, xmm6
816
+ psrld xmm8, 12
817
+ pslld xmm6, 20
818
+ por xmm6, xmm8
819
+ movdqa xmm8, xmm7
820
+ psrld xmm8, 12
821
+ pslld xmm7, 20
822
+ por xmm7, xmm8
823
+ movdqa xmm8, xmm4
824
+ psrld xmm8, 12
825
+ pslld xmm4, 20
826
+ por xmm4, xmm8
827
+ paddd xmm0, xmmword ptr [rsp]
828
+ paddd xmm1, xmmword ptr [rsp+0x20]
829
+ paddd xmm2, xmmword ptr [rsp+0x80]
830
+ paddd xmm3, xmmword ptr [rsp+0x60]
831
+ paddd xmm0, xmm5
832
+ paddd xmm1, xmm6
833
+ paddd xmm2, xmm7
834
+ paddd xmm3, xmm4
835
+ pxor xmm15, xmm0
836
+ pxor xmm12, xmm1
837
+ pxor xmm13, xmm2
838
+ pxor xmm14, xmm3
839
+ movdqa xmm8, xmmword ptr [ROT8+rip]
840
+ pshufb xmm15, xmm8
841
+ pshufb xmm12, xmm8
842
+ pshufb xmm13, xmm8
843
+ pshufb xmm14, xmm8
844
+ paddd xmm10, xmm15
845
+ paddd xmm11, xmm12
846
+ movdqa xmm8, xmmword ptr [rsp+0x100]
847
+ paddd xmm8, xmm13
848
+ paddd xmm9, xmm14
849
+ pxor xmm5, xmm10
850
+ pxor xmm6, xmm11
851
+ pxor xmm7, xmm8
852
+ pxor xmm4, xmm9
853
+ movdqa xmmword ptr [rsp+0x100], xmm8
854
+ movdqa xmm8, xmm5
855
+ psrld xmm8, 7
856
+ pslld xmm5, 25
857
+ por xmm5, xmm8
858
+ movdqa xmm8, xmm6
859
+ psrld xmm8, 7
860
+ pslld xmm6, 25
861
+ por xmm6, xmm8
862
+ movdqa xmm8, xmm7
863
+ psrld xmm8, 7
864
+ pslld xmm7, 25
865
+ por xmm7, xmm8
866
+ movdqa xmm8, xmm4
867
+ psrld xmm8, 7
868
+ pslld xmm4, 25
869
+ por xmm4, xmm8
870
+ paddd xmm0, xmmword ptr [rsp+0xC0]
871
+ paddd xmm1, xmmword ptr [rsp+0x90]
872
+ paddd xmm2, xmmword ptr [rsp+0xF0]
873
+ paddd xmm3, xmmword ptr [rsp+0xE0]
874
+ paddd xmm0, xmm4
875
+ paddd xmm1, xmm5
876
+ paddd xmm2, xmm6
877
+ paddd xmm3, xmm7
878
+ pxor xmm12, xmm0
879
+ pxor xmm13, xmm1
880
+ pxor xmm14, xmm2
881
+ pxor xmm15, xmm3
882
+ movdqa xmm8, xmmword ptr [ROT16+rip]
883
+ pshufb xmm12, xmm8
884
+ pshufb xmm13, xmm8
885
+ pshufb xmm14, xmm8
886
+ pshufb xmm15, xmm8
887
+ movdqa xmm8, xmmword ptr [rsp+0x100]
888
+ paddd xmm8, xmm12
889
+ paddd xmm9, xmm13
890
+ paddd xmm10, xmm14
891
+ paddd xmm11, xmm15
892
+ pxor xmm4, xmm8
893
+ pxor xmm5, xmm9
894
+ pxor xmm6, xmm10
895
+ pxor xmm7, xmm11
896
+ movdqa xmmword ptr [rsp+0x100], xmm8
897
+ movdqa xmm8, xmm4
898
+ psrld xmm8, 12
899
+ pslld xmm4, 20
900
+ por xmm4, xmm8
901
+ movdqa xmm8, xmm5
902
+ psrld xmm8, 12
903
+ pslld xmm5, 20
904
+ por xmm5, xmm8
905
+ movdqa xmm8, xmm6
906
+ psrld xmm8, 12
907
+ pslld xmm6, 20
908
+ por xmm6, xmm8
909
+ movdqa xmm8, xmm7
910
+ psrld xmm8, 12
911
+ pslld xmm7, 20
912
+ por xmm7, xmm8
913
+ paddd xmm0, xmmword ptr [rsp+0xD0]
914
+ paddd xmm1, xmmword ptr [rsp+0xB0]
915
+ paddd xmm2, xmmword ptr [rsp+0xA0]
916
+ paddd xmm3, xmmword ptr [rsp+0x80]
917
+ paddd xmm0, xmm4
918
+ paddd xmm1, xmm5
919
+ paddd xmm2, xmm6
920
+ paddd xmm3, xmm7
921
+ pxor xmm12, xmm0
922
+ pxor xmm13, xmm1
923
+ pxor xmm14, xmm2
924
+ pxor xmm15, xmm3
925
+ movdqa xmm8, xmmword ptr [ROT8+rip]
926
+ pshufb xmm12, xmm8
927
+ pshufb xmm13, xmm8
928
+ pshufb xmm14, xmm8
929
+ pshufb xmm15, xmm8
930
+ movdqa xmm8, xmmword ptr [rsp+0x100]
931
+ paddd xmm8, xmm12
932
+ paddd xmm9, xmm13
933
+ paddd xmm10, xmm14
934
+ paddd xmm11, xmm15
935
+ pxor xmm4, xmm8
936
+ pxor xmm5, xmm9
937
+ pxor xmm6, xmm10
938
+ pxor xmm7, xmm11
939
+ movdqa xmmword ptr [rsp+0x100], xmm8
940
+ movdqa xmm8, xmm4
941
+ psrld xmm8, 7
942
+ pslld xmm4, 25
943
+ por xmm4, xmm8
944
+ movdqa xmm8, xmm5
945
+ psrld xmm8, 7
946
+ pslld xmm5, 25
947
+ por xmm5, xmm8
948
+ movdqa xmm8, xmm6
949
+ psrld xmm8, 7
950
+ pslld xmm6, 25
951
+ por xmm6, xmm8
952
+ movdqa xmm8, xmm7
953
+ psrld xmm8, 7
954
+ pslld xmm7, 25
955
+ por xmm7, xmm8
956
+ paddd xmm0, xmmword ptr [rsp+0x70]
957
+ paddd xmm1, xmmword ptr [rsp+0x50]
958
+ paddd xmm2, xmmword ptr [rsp]
959
+ paddd xmm3, xmmword ptr [rsp+0x60]
960
+ paddd xmm0, xmm5
961
+ paddd xmm1, xmm6
962
+ paddd xmm2, xmm7
963
+ paddd xmm3, xmm4
964
+ pxor xmm15, xmm0
965
+ pxor xmm12, xmm1
966
+ pxor xmm13, xmm2
967
+ pxor xmm14, xmm3
968
+ movdqa xmm8, xmmword ptr [ROT16+rip]
969
+ pshufb xmm15, xmm8
970
+ pshufb xmm12, xmm8
971
+ pshufb xmm13, xmm8
972
+ pshufb xmm14, xmm8
973
+ paddd xmm10, xmm15
974
+ paddd xmm11, xmm12
975
+ movdqa xmm8, xmmword ptr [rsp+0x100]
976
+ paddd xmm8, xmm13
977
+ paddd xmm9, xmm14
978
+ pxor xmm5, xmm10
979
+ pxor xmm6, xmm11
980
+ pxor xmm7, xmm8
981
+ pxor xmm4, xmm9
982
+ movdqa xmmword ptr [rsp+0x100], xmm8
983
+ movdqa xmm8, xmm5
984
+ psrld xmm8, 12
985
+ pslld xmm5, 20
986
+ por xmm5, xmm8
987
+ movdqa xmm8, xmm6
988
+ psrld xmm8, 12
989
+ pslld xmm6, 20
990
+ por xmm6, xmm8
991
+ movdqa xmm8, xmm7
992
+ psrld xmm8, 12
993
+ pslld xmm7, 20
994
+ por xmm7, xmm8
995
+ movdqa xmm8, xmm4
996
+ psrld xmm8, 12
997
+ pslld xmm4, 20
998
+ por xmm4, xmm8
999
+ paddd xmm0, xmmword ptr [rsp+0x20]
1000
+ paddd xmm1, xmmword ptr [rsp+0x30]
1001
+ paddd xmm2, xmmword ptr [rsp+0x10]
1002
+ paddd xmm3, xmmword ptr [rsp+0x40]
1003
+ paddd xmm0, xmm5
1004
+ paddd xmm1, xmm6
1005
+ paddd xmm2, xmm7
1006
+ paddd xmm3, xmm4
1007
+ pxor xmm15, xmm0
1008
+ pxor xmm12, xmm1
1009
+ pxor xmm13, xmm2
1010
+ pxor xmm14, xmm3
1011
+ movdqa xmm8, xmmword ptr [ROT8+rip]
1012
+ pshufb xmm15, xmm8
1013
+ pshufb xmm12, xmm8
1014
+ pshufb xmm13, xmm8
1015
+ pshufb xmm14, xmm8
1016
+ paddd xmm10, xmm15
1017
+ paddd xmm11, xmm12
1018
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1019
+ paddd xmm8, xmm13
1020
+ paddd xmm9, xmm14
1021
+ pxor xmm5, xmm10
1022
+ pxor xmm6, xmm11
1023
+ pxor xmm7, xmm8
1024
+ pxor xmm4, xmm9
1025
+ movdqa xmmword ptr [rsp+0x100], xmm8
1026
+ movdqa xmm8, xmm5
1027
+ psrld xmm8, 7
1028
+ pslld xmm5, 25
1029
+ por xmm5, xmm8
1030
+ movdqa xmm8, xmm6
1031
+ psrld xmm8, 7
1032
+ pslld xmm6, 25
1033
+ por xmm6, xmm8
1034
+ movdqa xmm8, xmm7
1035
+ psrld xmm8, 7
1036
+ pslld xmm7, 25
1037
+ por xmm7, xmm8
1038
+ movdqa xmm8, xmm4
1039
+ psrld xmm8, 7
1040
+ pslld xmm4, 25
1041
+ por xmm4, xmm8
1042
+ paddd xmm0, xmmword ptr [rsp+0x90]
1043
+ paddd xmm1, xmmword ptr [rsp+0xB0]
1044
+ paddd xmm2, xmmword ptr [rsp+0x80]
1045
+ paddd xmm3, xmmword ptr [rsp+0xF0]
1046
+ paddd xmm0, xmm4
1047
+ paddd xmm1, xmm5
1048
+ paddd xmm2, xmm6
1049
+ paddd xmm3, xmm7
1050
+ pxor xmm12, xmm0
1051
+ pxor xmm13, xmm1
1052
+ pxor xmm14, xmm2
1053
+ pxor xmm15, xmm3
1054
+ movdqa xmm8, xmmword ptr [ROT16+rip]
1055
+ pshufb xmm12, xmm8
1056
+ pshufb xmm13, xmm8
1057
+ pshufb xmm14, xmm8
1058
+ pshufb xmm15, xmm8
1059
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1060
+ paddd xmm8, xmm12
1061
+ paddd xmm9, xmm13
1062
+ paddd xmm10, xmm14
1063
+ paddd xmm11, xmm15
1064
+ pxor xmm4, xmm8
1065
+ pxor xmm5, xmm9
1066
+ pxor xmm6, xmm10
1067
+ pxor xmm7, xmm11
1068
+ movdqa xmmword ptr [rsp+0x100], xmm8
1069
+ movdqa xmm8, xmm4
1070
+ psrld xmm8, 12
1071
+ pslld xmm4, 20
1072
+ por xmm4, xmm8
1073
+ movdqa xmm8, xmm5
1074
+ psrld xmm8, 12
1075
+ pslld xmm5, 20
1076
+ por xmm5, xmm8
1077
+ movdqa xmm8, xmm6
1078
+ psrld xmm8, 12
1079
+ pslld xmm6, 20
1080
+ por xmm6, xmm8
1081
+ movdqa xmm8, xmm7
1082
+ psrld xmm8, 12
1083
+ pslld xmm7, 20
1084
+ por xmm7, xmm8
1085
+ paddd xmm0, xmmword ptr [rsp+0xE0]
1086
+ paddd xmm1, xmmword ptr [rsp+0x50]
1087
+ paddd xmm2, xmmword ptr [rsp+0xC0]
1088
+ paddd xmm3, xmmword ptr [rsp+0x10]
1089
+ paddd xmm0, xmm4
1090
+ paddd xmm1, xmm5
1091
+ paddd xmm2, xmm6
1092
+ paddd xmm3, xmm7
1093
+ pxor xmm12, xmm0
1094
+ pxor xmm13, xmm1
1095
+ pxor xmm14, xmm2
1096
+ pxor xmm15, xmm3
1097
+ movdqa xmm8, xmmword ptr [ROT8+rip]
1098
+ pshufb xmm12, xmm8
1099
+ pshufb xmm13, xmm8
1100
+ pshufb xmm14, xmm8
1101
+ pshufb xmm15, xmm8
1102
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1103
+ paddd xmm8, xmm12
1104
+ paddd xmm9, xmm13
1105
+ paddd xmm10, xmm14
1106
+ paddd xmm11, xmm15
1107
+ pxor xmm4, xmm8
1108
+ pxor xmm5, xmm9
1109
+ pxor xmm6, xmm10
1110
+ pxor xmm7, xmm11
1111
+ movdqa xmmword ptr [rsp+0x100], xmm8
1112
+ movdqa xmm8, xmm4
1113
+ psrld xmm8, 7
1114
+ pslld xmm4, 25
1115
+ por xmm4, xmm8
1116
+ movdqa xmm8, xmm5
1117
+ psrld xmm8, 7
1118
+ pslld xmm5, 25
1119
+ por xmm5, xmm8
1120
+ movdqa xmm8, xmm6
1121
+ psrld xmm8, 7
1122
+ pslld xmm6, 25
1123
+ por xmm6, xmm8
1124
+ movdqa xmm8, xmm7
1125
+ psrld xmm8, 7
1126
+ pslld xmm7, 25
1127
+ por xmm7, xmm8
1128
+ paddd xmm0, xmmword ptr [rsp+0xD0]
1129
+ paddd xmm1, xmmword ptr [rsp]
1130
+ paddd xmm2, xmmword ptr [rsp+0x20]
1131
+ paddd xmm3, xmmword ptr [rsp+0x40]
1132
+ paddd xmm0, xmm5
1133
+ paddd xmm1, xmm6
1134
+ paddd xmm2, xmm7
1135
+ paddd xmm3, xmm4
1136
+ pxor xmm15, xmm0
1137
+ pxor xmm12, xmm1
1138
+ pxor xmm13, xmm2
1139
+ pxor xmm14, xmm3
1140
+ movdqa xmm8, xmmword ptr [ROT16+rip]
1141
+ pshufb xmm15, xmm8
1142
+ pshufb xmm12, xmm8
1143
+ pshufb xmm13, xmm8
1144
+ pshufb xmm14, xmm8
1145
+ paddd xmm10, xmm15
1146
+ paddd xmm11, xmm12
1147
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1148
+ paddd xmm8, xmm13
1149
+ paddd xmm9, xmm14
1150
+ pxor xmm5, xmm10
1151
+ pxor xmm6, xmm11
1152
+ pxor xmm7, xmm8
1153
+ pxor xmm4, xmm9
1154
+ movdqa xmmword ptr [rsp+0x100], xmm8
1155
+ movdqa xmm8, xmm5
1156
+ psrld xmm8, 12
1157
+ pslld xmm5, 20
1158
+ por xmm5, xmm8
1159
+ movdqa xmm8, xmm6
1160
+ psrld xmm8, 12
1161
+ pslld xmm6, 20
1162
+ por xmm6, xmm8
1163
+ movdqa xmm8, xmm7
1164
+ psrld xmm8, 12
1165
+ pslld xmm7, 20
1166
+ por xmm7, xmm8
1167
+ movdqa xmm8, xmm4
1168
+ psrld xmm8, 12
1169
+ pslld xmm4, 20
1170
+ por xmm4, xmm8
1171
+ paddd xmm0, xmmword ptr [rsp+0x30]
1172
+ paddd xmm1, xmmword ptr [rsp+0xA0]
1173
+ paddd xmm2, xmmword ptr [rsp+0x60]
1174
+ paddd xmm3, xmmword ptr [rsp+0x70]
1175
+ paddd xmm0, xmm5
1176
+ paddd xmm1, xmm6
1177
+ paddd xmm2, xmm7
1178
+ paddd xmm3, xmm4
1179
+ pxor xmm15, xmm0
1180
+ pxor xmm12, xmm1
1181
+ pxor xmm13, xmm2
1182
+ pxor xmm14, xmm3
1183
+ movdqa xmm8, xmmword ptr [ROT8+rip]
1184
+ pshufb xmm15, xmm8
1185
+ pshufb xmm12, xmm8
1186
+ pshufb xmm13, xmm8
1187
+ pshufb xmm14, xmm8
1188
+ paddd xmm10, xmm15
1189
+ paddd xmm11, xmm12
1190
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1191
+ paddd xmm8, xmm13
1192
+ paddd xmm9, xmm14
1193
+ pxor xmm5, xmm10
1194
+ pxor xmm6, xmm11
1195
+ pxor xmm7, xmm8
1196
+ pxor xmm4, xmm9
1197
+ movdqa xmmword ptr [rsp+0x100], xmm8
1198
+ movdqa xmm8, xmm5
1199
+ psrld xmm8, 7
1200
+ pslld xmm5, 25
1201
+ por xmm5, xmm8
1202
+ movdqa xmm8, xmm6
1203
+ psrld xmm8, 7
1204
+ pslld xmm6, 25
1205
+ por xmm6, xmm8
1206
+ movdqa xmm8, xmm7
1207
+ psrld xmm8, 7
1208
+ pslld xmm7, 25
1209
+ por xmm7, xmm8
1210
+ movdqa xmm8, xmm4
1211
+ psrld xmm8, 7
1212
+ pslld xmm4, 25
1213
+ por xmm4, xmm8
1214
+ paddd xmm0, xmmword ptr [rsp+0xB0]
1215
+ paddd xmm1, xmmword ptr [rsp+0x50]
1216
+ paddd xmm2, xmmword ptr [rsp+0x10]
1217
+ paddd xmm3, xmmword ptr [rsp+0x80]
1218
+ paddd xmm0, xmm4
1219
+ paddd xmm1, xmm5
1220
+ paddd xmm2, xmm6
1221
+ paddd xmm3, xmm7
1222
+ pxor xmm12, xmm0
1223
+ pxor xmm13, xmm1
1224
+ pxor xmm14, xmm2
1225
+ pxor xmm15, xmm3
1226
+ movdqa xmm8, xmmword ptr [ROT16+rip]
1227
+ pshufb xmm12, xmm8
1228
+ pshufb xmm13, xmm8
1229
+ pshufb xmm14, xmm8
1230
+ pshufb xmm15, xmm8
1231
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1232
+ paddd xmm8, xmm12
1233
+ paddd xmm9, xmm13
1234
+ paddd xmm10, xmm14
1235
+ paddd xmm11, xmm15
1236
+ pxor xmm4, xmm8
1237
+ pxor xmm5, xmm9
1238
+ pxor xmm6, xmm10
1239
+ pxor xmm7, xmm11
1240
+ movdqa xmmword ptr [rsp+0x100], xmm8
1241
+ movdqa xmm8, xmm4
1242
+ psrld xmm8, 12
1243
+ pslld xmm4, 20
1244
+ por xmm4, xmm8
1245
+ movdqa xmm8, xmm5
1246
+ psrld xmm8, 12
1247
+ pslld xmm5, 20
1248
+ por xmm5, xmm8
1249
+ movdqa xmm8, xmm6
1250
+ psrld xmm8, 12
1251
+ pslld xmm6, 20
1252
+ por xmm6, xmm8
1253
+ movdqa xmm8, xmm7
1254
+ psrld xmm8, 12
1255
+ pslld xmm7, 20
1256
+ por xmm7, xmm8
1257
+ paddd xmm0, xmmword ptr [rsp+0xF0]
1258
+ paddd xmm1, xmmword ptr [rsp]
1259
+ paddd xmm2, xmmword ptr [rsp+0x90]
1260
+ paddd xmm3, xmmword ptr [rsp+0x60]
1261
+ paddd xmm0, xmm4
1262
+ paddd xmm1, xmm5
1263
+ paddd xmm2, xmm6
1264
+ paddd xmm3, xmm7
1265
+ pxor xmm12, xmm0
1266
+ pxor xmm13, xmm1
1267
+ pxor xmm14, xmm2
1268
+ pxor xmm15, xmm3
1269
+ movdqa xmm8, xmmword ptr [ROT8+rip]
1270
+ pshufb xmm12, xmm8
1271
+ pshufb xmm13, xmm8
1272
+ pshufb xmm14, xmm8
1273
+ pshufb xmm15, xmm8
1274
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1275
+ paddd xmm8, xmm12
1276
+ paddd xmm9, xmm13
1277
+ paddd xmm10, xmm14
1278
+ paddd xmm11, xmm15
1279
+ pxor xmm4, xmm8
1280
+ pxor xmm5, xmm9
1281
+ pxor xmm6, xmm10
1282
+ pxor xmm7, xmm11
1283
+ movdqa xmmword ptr [rsp+0x100], xmm8
1284
+ movdqa xmm8, xmm4
1285
+ psrld xmm8, 7
1286
+ pslld xmm4, 25
1287
+ por xmm4, xmm8
1288
+ movdqa xmm8, xmm5
1289
+ psrld xmm8, 7
1290
+ pslld xmm5, 25
1291
+ por xmm5, xmm8
1292
+ movdqa xmm8, xmm6
1293
+ psrld xmm8, 7
1294
+ pslld xmm6, 25
1295
+ por xmm6, xmm8
1296
+ movdqa xmm8, xmm7
1297
+ psrld xmm8, 7
1298
+ pslld xmm7, 25
1299
+ por xmm7, xmm8
1300
+ paddd xmm0, xmmword ptr [rsp+0xE0]
1301
+ paddd xmm1, xmmword ptr [rsp+0x20]
1302
+ paddd xmm2, xmmword ptr [rsp+0x30]
1303
+ paddd xmm3, xmmword ptr [rsp+0x70]
1304
+ paddd xmm0, xmm5
1305
+ paddd xmm1, xmm6
1306
+ paddd xmm2, xmm7
1307
+ paddd xmm3, xmm4
1308
+ pxor xmm15, xmm0
1309
+ pxor xmm12, xmm1
1310
+ pxor xmm13, xmm2
1311
+ pxor xmm14, xmm3
1312
+ movdqa xmm8, xmmword ptr [ROT16+rip]
1313
+ pshufb xmm15, xmm8
1314
+ pshufb xmm12, xmm8
1315
+ pshufb xmm13, xmm8
1316
+ pshufb xmm14, xmm8
1317
+ paddd xmm10, xmm15
1318
+ paddd xmm11, xmm12
1319
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1320
+ paddd xmm8, xmm13
1321
+ paddd xmm9, xmm14
1322
+ pxor xmm5, xmm10
1323
+ pxor xmm6, xmm11
1324
+ pxor xmm7, xmm8
1325
+ pxor xmm4, xmm9
1326
+ movdqa xmmword ptr [rsp+0x100], xmm8
1327
+ movdqa xmm8, xmm5
1328
+ psrld xmm8, 12
1329
+ pslld xmm5, 20
1330
+ por xmm5, xmm8
1331
+ movdqa xmm8, xmm6
1332
+ psrld xmm8, 12
1333
+ pslld xmm6, 20
1334
+ por xmm6, xmm8
1335
+ movdqa xmm8, xmm7
1336
+ psrld xmm8, 12
1337
+ pslld xmm7, 20
1338
+ por xmm7, xmm8
1339
+ movdqa xmm8, xmm4
1340
+ psrld xmm8, 12
1341
+ pslld xmm4, 20
1342
+ por xmm4, xmm8
1343
+ paddd xmm0, xmmword ptr [rsp+0xA0]
1344
+ paddd xmm1, xmmword ptr [rsp+0xC0]
1345
+ paddd xmm2, xmmword ptr [rsp+0x40]
1346
+ paddd xmm3, xmmword ptr [rsp+0xD0]
1347
+ paddd xmm0, xmm5
1348
+ paddd xmm1, xmm6
1349
+ paddd xmm2, xmm7
1350
+ paddd xmm3, xmm4
1351
+ pxor xmm15, xmm0
1352
+ pxor xmm12, xmm1
1353
+ pxor xmm13, xmm2
1354
+ pxor xmm14, xmm3
1355
+ movdqa xmm8, xmmword ptr [ROT8+rip]
1356
+ pshufb xmm15, xmm8
1357
+ pshufb xmm12, xmm8
1358
+ pshufb xmm13, xmm8
1359
+ pshufb xmm14, xmm8
1360
+ paddd xmm10, xmm15
1361
+ paddd xmm11, xmm12
1362
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1363
+ paddd xmm8, xmm13
1364
+ paddd xmm9, xmm14
1365
+ pxor xmm5, xmm10
1366
+ pxor xmm6, xmm11
1367
+ pxor xmm7, xmm8
1368
+ pxor xmm4, xmm9
1369
+ pxor xmm0, xmm8
1370
+ pxor xmm1, xmm9
1371
+ pxor xmm2, xmm10
1372
+ pxor xmm3, xmm11
1373
+ movdqa xmm8, xmm5
1374
+ psrld xmm8, 7
1375
+ pslld xmm5, 25
1376
+ por xmm5, xmm8
1377
+ movdqa xmm8, xmm6
1378
+ psrld xmm8, 7
1379
+ pslld xmm6, 25
1380
+ por xmm6, xmm8
1381
+ movdqa xmm8, xmm7
1382
+ psrld xmm8, 7
1383
+ pslld xmm7, 25
1384
+ por xmm7, xmm8
1385
+ movdqa xmm8, xmm4
1386
+ psrld xmm8, 7
1387
+ pslld xmm4, 25
1388
+ por xmm4, xmm8
1389
+ pxor xmm4, xmm12
1390
+ pxor xmm5, xmm13
1391
+ pxor xmm6, xmm14
1392
+ pxor xmm7, xmm15
1393
+ mov eax, r13d
1394
+ jne 9b
1395
+ movdqa xmm9, xmm0
1396
+ punpckldq xmm0, xmm1
1397
+ punpckhdq xmm9, xmm1
1398
+ movdqa xmm11, xmm2
1399
+ punpckldq xmm2, xmm3
1400
+ punpckhdq xmm11, xmm3
1401
+ movdqa xmm1, xmm0
1402
+ punpcklqdq xmm0, xmm2
1403
+ punpckhqdq xmm1, xmm2
1404
+ movdqa xmm3, xmm9
1405
+ punpcklqdq xmm9, xmm11
1406
+ punpckhqdq xmm3, xmm11
1407
+ movdqu xmmword ptr [rbx], xmm0
1408
+ movdqu xmmword ptr [rbx+0x20], xmm1
1409
+ movdqu xmmword ptr [rbx+0x40], xmm9
1410
+ movdqu xmmword ptr [rbx+0x60], xmm3
1411
+ movdqa xmm9, xmm4
1412
+ punpckldq xmm4, xmm5
1413
+ punpckhdq xmm9, xmm5
1414
+ movdqa xmm11, xmm6
1415
+ punpckldq xmm6, xmm7
1416
+ punpckhdq xmm11, xmm7
1417
+ movdqa xmm5, xmm4
1418
+ punpcklqdq xmm4, xmm6
1419
+ punpckhqdq xmm5, xmm6
1420
+ movdqa xmm7, xmm9
1421
+ punpcklqdq xmm9, xmm11
1422
+ punpckhqdq xmm7, xmm11
1423
+ movdqu xmmword ptr [rbx+0x10], xmm4
1424
+ movdqu xmmword ptr [rbx+0x30], xmm5
1425
+ movdqu xmmword ptr [rbx+0x50], xmm9
1426
+ movdqu xmmword ptr [rbx+0x70], xmm7
1427
+ movdqa xmm1, xmmword ptr [rsp+0x110]
1428
+ movdqa xmm0, xmm1
1429
+ paddd xmm1, xmmword ptr [rsp+0x150]
1430
+ movdqa xmmword ptr [rsp+0x110], xmm1
1431
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1432
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1433
+ pcmpgtd xmm0, xmm1
1434
+ movdqa xmm1, xmmword ptr [rsp+0x120]
1435
+ psubd xmm1, xmm0
1436
+ movdqa xmmword ptr [rsp+0x120], xmm1
1437
+ add rbx, 128
1438
+ add rdi, 32
1439
+ sub rsi, 4
1440
+ cmp rsi, 4
1441
+ jnc 2b
1442
+ test rsi, rsi
1443
+ jne 3f
1444
+ 4:
1445
+ movdqa xmm6, xmmword ptr [rsp+0x170]
1446
+ movdqa xmm7, xmmword ptr [rsp+0x180]
1447
+ movdqa xmm8, xmmword ptr [rsp+0x190]
1448
+ movdqa xmm9, xmmword ptr [rsp+0x1A0]
1449
+ movdqa xmm10, xmmword ptr [rsp+0x1B0]
1450
+ movdqa xmm11, xmmword ptr [rsp+0x1C0]
1451
+ movdqa xmm12, xmmword ptr [rsp+0x1D0]
1452
+ movdqa xmm13, xmmword ptr [rsp+0x1E0]
1453
+ movdqa xmm14, xmmword ptr [rsp+0x1F0]
1454
+ movdqa xmm15, xmmword ptr [rsp+0x200]
1455
+ mov rsp, rbp
1456
+ pop rbp
1457
+ pop rbx
1458
+ pop rdi
1459
+ pop rsi
1460
+ pop r12
1461
+ pop r13
1462
+ pop r14
1463
+ pop r15
1464
+ ret
1465
+ .p2align 5
1466
+ 3:
1467
+ test esi, 0x2
1468
+ je 3f
1469
+ movups xmm0, xmmword ptr [rcx]
1470
+ movups xmm1, xmmword ptr [rcx+0x10]
1471
+ movaps xmm8, xmm0
1472
+ movaps xmm9, xmm1
1473
+ movd xmm13, dword ptr [rsp+0x110]
1474
+ pinsrd xmm13, dword ptr [rsp+0x120], 1
1475
+ pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1476
+ movaps xmmword ptr [rsp], xmm13
1477
+ movd xmm14, dword ptr [rsp+0x114]
1478
+ pinsrd xmm14, dword ptr [rsp+0x124], 1
1479
+ pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1480
+ movaps xmmword ptr [rsp+0x10], xmm14
1481
+ mov r8, qword ptr [rdi]
1482
+ mov r9, qword ptr [rdi+0x8]
1483
+ movzx eax, byte ptr [rbp+0x80]
1484
+ or eax, r13d
1485
+ xor edx, edx
1486
+ 2:
1487
+ mov r14d, eax
1488
+ or eax, r12d
1489
+ add rdx, 64
1490
+ cmp rdx, r15
1491
+ cmovne eax, r14d
1492
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1493
+ movaps xmm10, xmm2
1494
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
1495
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
1496
+ movaps xmm3, xmm4
1497
+ shufps xmm4, xmm5, 136
1498
+ shufps xmm3, xmm5, 221
1499
+ movaps xmm5, xmm3
1500
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
1501
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
1502
+ movaps xmm3, xmm6
1503
+ shufps xmm6, xmm7, 136
1504
+ pshufd xmm6, xmm6, 0x93
1505
+ shufps xmm3, xmm7, 221
1506
+ pshufd xmm7, xmm3, 0x93
1507
+ movups xmm12, xmmword ptr [r9+rdx-0x40]
1508
+ movups xmm13, xmmword ptr [r9+rdx-0x30]
1509
+ movaps xmm11, xmm12
1510
+ shufps xmm12, xmm13, 136
1511
+ shufps xmm11, xmm13, 221
1512
+ movaps xmm13, xmm11
1513
+ movups xmm14, xmmword ptr [r9+rdx-0x20]
1514
+ movups xmm15, xmmword ptr [r9+rdx-0x10]
1515
+ movaps xmm11, xmm14
1516
+ shufps xmm14, xmm15, 136
1517
+ pshufd xmm14, xmm14, 0x93
1518
+ shufps xmm11, xmm15, 221
1519
+ pshufd xmm15, xmm11, 0x93
1520
+ movaps xmm3, xmmword ptr [rsp]
1521
+ movaps xmm11, xmmword ptr [rsp+0x10]
1522
+ pinsrd xmm3, eax, 3
1523
+ pinsrd xmm11, eax, 3
1524
+ mov al, 7
1525
+ 9:
1526
+ paddd xmm0, xmm4
1527
+ paddd xmm8, xmm12
1528
+ movaps xmmword ptr [rsp+0x20], xmm4
1529
+ movaps xmmword ptr [rsp+0x30], xmm12
1530
+ paddd xmm0, xmm1
1531
+ paddd xmm8, xmm9
1532
+ pxor xmm3, xmm0
1533
+ pxor xmm11, xmm8
1534
+ movaps xmm12, xmmword ptr [ROT16+rip]
1535
+ pshufb xmm3, xmm12
1536
+ pshufb xmm11, xmm12
1537
+ paddd xmm2, xmm3
1538
+ paddd xmm10, xmm11
1539
+ pxor xmm1, xmm2
1540
+ pxor xmm9, xmm10
1541
+ movdqa xmm4, xmm1
1542
+ pslld xmm1, 20
1543
+ psrld xmm4, 12
1544
+ por xmm1, xmm4
1545
+ movdqa xmm4, xmm9
1546
+ pslld xmm9, 20
1547
+ psrld xmm4, 12
1548
+ por xmm9, xmm4
1549
+ paddd xmm0, xmm5
1550
+ paddd xmm8, xmm13
1551
+ movaps xmmword ptr [rsp+0x40], xmm5
1552
+ movaps xmmword ptr [rsp+0x50], xmm13
1553
+ paddd xmm0, xmm1
1554
+ paddd xmm8, xmm9
1555
+ pxor xmm3, xmm0
1556
+ pxor xmm11, xmm8
1557
+ movaps xmm13, xmmword ptr [ROT8+rip]
1558
+ pshufb xmm3, xmm13
1559
+ pshufb xmm11, xmm13
1560
+ paddd xmm2, xmm3
1561
+ paddd xmm10, xmm11
1562
+ pxor xmm1, xmm2
1563
+ pxor xmm9, xmm10
1564
+ movdqa xmm4, xmm1
1565
+ pslld xmm1, 25
1566
+ psrld xmm4, 7
1567
+ por xmm1, xmm4
1568
+ movdqa xmm4, xmm9
1569
+ pslld xmm9, 25
1570
+ psrld xmm4, 7
1571
+ por xmm9, xmm4
1572
+ pshufd xmm0, xmm0, 0x93
1573
+ pshufd xmm8, xmm8, 0x93
1574
+ pshufd xmm3, xmm3, 0x4E
1575
+ pshufd xmm11, xmm11, 0x4E
1576
+ pshufd xmm2, xmm2, 0x39
1577
+ pshufd xmm10, xmm10, 0x39
1578
+ paddd xmm0, xmm6
1579
+ paddd xmm8, xmm14
1580
+ paddd xmm0, xmm1
1581
+ paddd xmm8, xmm9
1582
+ pxor xmm3, xmm0
1583
+ pxor xmm11, xmm8
1584
+ pshufb xmm3, xmm12
1585
+ pshufb xmm11, xmm12
1586
+ paddd xmm2, xmm3
1587
+ paddd xmm10, xmm11
1588
+ pxor xmm1, xmm2
1589
+ pxor xmm9, xmm10
1590
+ movdqa xmm4, xmm1
1591
+ pslld xmm1, 20
1592
+ psrld xmm4, 12
1593
+ por xmm1, xmm4
1594
+ movdqa xmm4, xmm9
1595
+ pslld xmm9, 20
1596
+ psrld xmm4, 12
1597
+ por xmm9, xmm4
1598
+ paddd xmm0, xmm7
1599
+ paddd xmm8, xmm15
1600
+ paddd xmm0, xmm1
1601
+ paddd xmm8, xmm9
1602
+ pxor xmm3, xmm0
1603
+ pxor xmm11, xmm8
1604
+ pshufb xmm3, xmm13
1605
+ pshufb xmm11, xmm13
1606
+ paddd xmm2, xmm3
1607
+ paddd xmm10, xmm11
1608
+ pxor xmm1, xmm2
1609
+ pxor xmm9, xmm10
1610
+ movdqa xmm4, xmm1
1611
+ pslld xmm1, 25
1612
+ psrld xmm4, 7
1613
+ por xmm1, xmm4
1614
+ movdqa xmm4, xmm9
1615
+ pslld xmm9, 25
1616
+ psrld xmm4, 7
1617
+ por xmm9, xmm4
1618
+ pshufd xmm0, xmm0, 0x39
1619
+ pshufd xmm8, xmm8, 0x39
1620
+ pshufd xmm3, xmm3, 0x4E
1621
+ pshufd xmm11, xmm11, 0x4E
1622
+ pshufd xmm2, xmm2, 0x93
1623
+ pshufd xmm10, xmm10, 0x93
1624
+ dec al
1625
+ je 9f
1626
+ movdqa xmm12, xmmword ptr [rsp+0x20]
1627
+ movdqa xmm5, xmmword ptr [rsp+0x40]
1628
+ pshufd xmm13, xmm12, 0x0F
1629
+ shufps xmm12, xmm5, 214
1630
+ pshufd xmm4, xmm12, 0x39
1631
+ movdqa xmm12, xmm6
1632
+ shufps xmm12, xmm7, 250
1633
+ pblendw xmm13, xmm12, 0xCC
1634
+ movdqa xmm12, xmm7
1635
+ punpcklqdq xmm12, xmm5
1636
+ pblendw xmm12, xmm6, 0xC0
1637
+ pshufd xmm12, xmm12, 0x78
1638
+ punpckhdq xmm5, xmm7
1639
+ punpckldq xmm6, xmm5
1640
+ pshufd xmm7, xmm6, 0x1E
1641
+ movdqa xmmword ptr [rsp+0x20], xmm13
1642
+ movdqa xmmword ptr [rsp+0x40], xmm12
1643
+ movdqa xmm5, xmmword ptr [rsp+0x30]
1644
+ movdqa xmm13, xmmword ptr [rsp+0x50]
1645
+ pshufd xmm6, xmm5, 0x0F
1646
+ shufps xmm5, xmm13, 214
1647
+ pshufd xmm12, xmm5, 0x39
1648
+ movdqa xmm5, xmm14
1649
+ shufps xmm5, xmm15, 250
1650
+ pblendw xmm6, xmm5, 0xCC
1651
+ movdqa xmm5, xmm15
1652
+ punpcklqdq xmm5, xmm13
1653
+ pblendw xmm5, xmm14, 0xC0
1654
+ pshufd xmm5, xmm5, 0x78
1655
+ punpckhdq xmm13, xmm15
1656
+ punpckldq xmm14, xmm13
1657
+ pshufd xmm15, xmm14, 0x1E
1658
+ movdqa xmm13, xmm6
1659
+ movdqa xmm14, xmm5
1660
+ movdqa xmm5, xmmword ptr [rsp+0x20]
1661
+ movdqa xmm6, xmmword ptr [rsp+0x40]
1662
+ jmp 9b
1663
+ 9:
1664
+ pxor xmm0, xmm2
1665
+ pxor xmm1, xmm3
1666
+ pxor xmm8, xmm10
1667
+ pxor xmm9, xmm11
1668
+ mov eax, r13d
1669
+ cmp rdx, r15
1670
+ jne 2b
1671
+ movups xmmword ptr [rbx], xmm0
1672
+ movups xmmword ptr [rbx+0x10], xmm1
1673
+ movups xmmword ptr [rbx+0x20], xmm8
1674
+ movups xmmword ptr [rbx+0x30], xmm9
1675
+ movdqa xmm0, xmmword ptr [rsp+0x130]
1676
+ movdqa xmm1, xmmword ptr [rsp+0x110]
1677
+ movdqa xmm2, xmmword ptr [rsp+0x120]
1678
+ movdqu xmm3, xmmword ptr [rsp+0x118]
1679
+ movdqu xmm4, xmmword ptr [rsp+0x128]
1680
+ blendvps xmm1, xmm3, xmm0
1681
+ blendvps xmm2, xmm4, xmm0
1682
+ movdqa xmmword ptr [rsp+0x110], xmm1
1683
+ movdqa xmmword ptr [rsp+0x120], xmm2
1684
+ add rdi, 16
1685
+ add rbx, 64
1686
+ sub rsi, 2
1687
+ 3:
1688
+ test esi, 0x1
1689
+ je 4b
1690
+ movups xmm0, xmmword ptr [rcx]
1691
+ movups xmm1, xmmword ptr [rcx+0x10]
1692
+ movd xmm13, dword ptr [rsp+0x110]
1693
+ pinsrd xmm13, dword ptr [rsp+0x120], 1
1694
+ pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2
1695
+ movaps xmm14, xmmword ptr [ROT8+rip]
1696
+ movaps xmm15, xmmword ptr [ROT16+rip]
1697
+ mov r8, qword ptr [rdi]
1698
+ movzx eax, byte ptr [rbp+0x80]
1699
+ or eax, r13d
1700
+ xor edx, edx
1701
+ 2:
1702
+ mov r14d, eax
1703
+ or eax, r12d
1704
+ add rdx, 64
1705
+ cmp rdx, r15
1706
+ cmovne eax, r14d
1707
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1708
+ movaps xmm3, xmm13
1709
+ pinsrd xmm3, eax, 3
1710
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
1711
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
1712
+ movaps xmm8, xmm4
1713
+ shufps xmm4, xmm5, 136
1714
+ shufps xmm8, xmm5, 221
1715
+ movaps xmm5, xmm8
1716
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
1717
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
1718
+ movaps xmm8, xmm6
1719
+ shufps xmm6, xmm7, 136
1720
+ pshufd xmm6, xmm6, 0x93
1721
+ shufps xmm8, xmm7, 221
1722
+ pshufd xmm7, xmm8, 0x93
1723
+ mov al, 7
1724
+ 9:
1725
+ paddd xmm0, xmm4
1726
+ paddd xmm0, xmm1
1727
+ pxor xmm3, xmm0
1728
+ pshufb xmm3, xmm15
1729
+ paddd xmm2, xmm3
1730
+ pxor xmm1, xmm2
1731
+ movdqa xmm11, xmm1
1732
+ pslld xmm1, 20
1733
+ psrld xmm11, 12
1734
+ por xmm1, xmm11
1735
+ paddd xmm0, xmm5
1736
+ paddd xmm0, xmm1
1737
+ pxor xmm3, xmm0
1738
+ pshufb xmm3, xmm14
1739
+ paddd xmm2, xmm3
1740
+ pxor xmm1, xmm2
1741
+ movdqa xmm11, xmm1
1742
+ pslld xmm1, 25
1743
+ psrld xmm11, 7
1744
+ por xmm1, xmm11
1745
+ pshufd xmm0, xmm0, 0x93
1746
+ pshufd xmm3, xmm3, 0x4E
1747
+ pshufd xmm2, xmm2, 0x39
1748
+ paddd xmm0, xmm6
1749
+ paddd xmm0, xmm1
1750
+ pxor xmm3, xmm0
1751
+ pshufb xmm3, xmm15
1752
+ paddd xmm2, xmm3
1753
+ pxor xmm1, xmm2
1754
+ movdqa xmm11, xmm1
1755
+ pslld xmm1, 20
1756
+ psrld xmm11, 12
1757
+ por xmm1, xmm11
1758
+ paddd xmm0, xmm7
1759
+ paddd xmm0, xmm1
1760
+ pxor xmm3, xmm0
1761
+ pshufb xmm3, xmm14
1762
+ paddd xmm2, xmm3
1763
+ pxor xmm1, xmm2
1764
+ movdqa xmm11, xmm1
1765
+ pslld xmm1, 25
1766
+ psrld xmm11, 7
1767
+ por xmm1, xmm11
1768
+ pshufd xmm0, xmm0, 0x39
1769
+ pshufd xmm3, xmm3, 0x4E
1770
+ pshufd xmm2, xmm2, 0x93
1771
+ dec al
1772
+ jz 9f
1773
+ movdqa xmm8, xmm4
1774
+ shufps xmm8, xmm5, 214
1775
+ pshufd xmm9, xmm4, 0x0F
1776
+ pshufd xmm4, xmm8, 0x39
1777
+ movdqa xmm8, xmm6
1778
+ shufps xmm8, xmm7, 250
1779
+ pblendw xmm9, xmm8, 0xCC
1780
+ movdqa xmm8, xmm7
1781
+ punpcklqdq xmm8, xmm5
1782
+ pblendw xmm8, xmm6, 0xC0
1783
+ pshufd xmm8, xmm8, 0x78
1784
+ punpckhdq xmm5, xmm7
1785
+ punpckldq xmm6, xmm5
1786
+ pshufd xmm7, xmm6, 0x1E
1787
+ movdqa xmm5, xmm9
1788
+ movdqa xmm6, xmm8
1789
+ jmp 9b
1790
+ 9:
1791
+ pxor xmm0, xmm2
1792
+ pxor xmm1, xmm3
1793
+ mov eax, r13d
1794
+ cmp rdx, r15
1795
+ jne 2b
1796
+ movups xmmword ptr [rbx], xmm0
1797
+ movups xmmword ptr [rbx+0x10], xmm1
1798
+ jmp 4b
1799
+
1800
+ .p2align 6
+ /*
+  * blake3_compress_in_place_sse41
+  *
+  * Runs one 7-round compression over a 64-byte block and writes the
+  * 32-byte result back over the chaining value.
+  *
+  * ABI: Microsoft x64 (register args in rcx/rdx/r8/r9, fifth arg on the
+  * stack above the 32-byte shadow space).
+  *
+  * In:
+  *   rcx        pointer to the 32-byte chaining value (read, then overwritten)
+  *   rdx        pointer to the 64-byte message block (unaligned loads)
+  *   r8b        byte placed in lane 2 of state row 3
+  *              (presumably block_len -- confirm against the C prototype)
+  *   r9         64-bit value placed in lanes 0-1 of state row 3
+  *              (presumably the block counter -- confirm)
+  *   [entry rsp+0x28]  byte placed in lane 3 of state row 3
+  *              (presumably the flags -- confirm)
+  * Out:
+  *   [rcx .. rcx+31] = (row0 ^ row2, row1 ^ row3)
+  * Clobbers: rax, r8, xmm0-xmm5, flags.
+  *
+  * Every non-volatile XMM register this routine touches is saved and
+  * restored: xmm6-xmm9 (message words / permutation scratch), xmm11
+  * (rotate scratch) and xmm14/xmm15 (ROT8/ROT16 shuffle masks).
+  * xmm6-xmm15 are callee-saved under the Microsoft x64 convention, so
+  * leaving any of them modified corrupts the caller's state.
+  *
+  * Frame: 7 * 16 bytes of XMM spill + 8 bytes of padding = 120, which
+  * brings rsp (entry rsp is 8 mod 16) to 16-byte alignment for movdqa.
+  */
+ blake3_compress_in_place_sse41:
+ _blake3_compress_in_place_sse41:
+ sub rsp, 120
+ movdqa xmmword ptr [rsp], xmm6
+ movdqa xmmword ptr [rsp+0x10], xmm7
+ movdqa xmmword ptr [rsp+0x20], xmm8
+ movdqa xmmword ptr [rsp+0x30], xmm9
+ movdqa xmmword ptr [rsp+0x40], xmm11
+ movdqa xmmword ptr [rsp+0x50], xmm14
+ movdqa xmmword ptr [rsp+0x60], xmm15
+ /* State rows 0-2: chaining value (rows 0,1) and the IV (row 2). */
+ movups xmm0, xmmword ptr [rcx]
+ movups xmm1, xmmword ptr [rcx+0x10]
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ /* Fifth argument: 120 (frame) + 8 (return address) + 32 (shadow) = 0xA0. */
+ movzx eax, byte ptr [rsp+0xA0]
+ movzx r8d, r8b
+ shl rax, 32
+ add r8, rax
+ /* Row 3 = [ r9 low, r9 high, r8b, stack byte ]. */
+ movq xmm3, r9
+ movq xmm4, r8
+ punpcklqdq xmm3, xmm4
+ /* Load the 16 message words, de-interleaved into even/odd words:      */
+ /*   xmm4 = m0,m2,m4,m6       xmm5 = m1,m3,m5,m7                        */
+ /*   xmm6 = m14,m8,m10,m12    xmm7 = m15,m9,m11,m13  (rotated by 0x93)  */
+ movups xmm4, xmmword ptr [rdx]
+ movups xmm5, xmmword ptr [rdx+0x10]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm8, xmm5, 221
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [rdx+0x20]
+ movups xmm7, xmmword ptr [rdx+0x30]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm8, xmm7, 221
+ pshufd xmm7, xmm8, 0x93
+ /* Byte-shuffle masks used for the 8- and 16-bit rotations. */
+ movaps xmm14, xmmword ptr [ROT8+rip]
+ movaps xmm15, xmmword ptr [ROT16+rip]
+ /* al = rounds remaining. */
+ mov al, 7
+ 9:
+ /* Column step, first half: add message + row 1, rotate by 16 then 12. */
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ /* Column step, second half: rotate by 8 then 7. */
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ /* Rotate rows 0, 3 and 2 so the diagonals line up with row 1. */
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ /* Diagonal step, first half: rotate by 16 then 12. */
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ /* Diagonal step, second half: rotate by 8 then 7. */
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ /* Undo the row rotation. */
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+ /* Not the last round: permute the message words in xmm4-xmm7 for the */
+ /* next round, using xmm8/xmm9 as scratch.                             */
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pblendw xmm9, xmm8, 0xCC
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ pblendw xmm8, xmm6, 0xC0
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp 9b
+ 9:
+ /* Feed-forward: fold the lower half of the state into the upper half */
+ /* and store the new chaining value over the old one.                 */
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ movups xmmword ptr [rcx], xmm0
+ movups xmmword ptr [rcx+0x10], xmm1
+ /* Restore every callee-saved XMM register spilled in the prologue. */
+ movdqa xmm6, xmmword ptr [rsp]
+ movdqa xmm7, xmmword ptr [rsp+0x10]
+ movdqa xmm8, xmmword ptr [rsp+0x20]
+ movdqa xmm9, xmmword ptr [rsp+0x30]
+ movdqa xmm11, xmmword ptr [rsp+0x40]
+ movdqa xmm14, xmmword ptr [rsp+0x50]
+ movdqa xmm15, xmmword ptr [rsp+0x60]
+ add rsp, 120
+ ret
1911
+
1912
+
1913
+ .p2align 6
+ /*
+  * blake3_compress_xof_sse41
+  *
+  * Runs one 7-round compression over a 64-byte block and writes the full
+  * 64-byte result to a separate output buffer; the chaining value itself
+  * is only read.
+  *
+  * ABI: Microsoft x64 (register args in rcx/rdx/r8/r9, fifth and sixth
+  * args on the stack above the 32-byte shadow space).
+  *
+  * In:
+  *   rcx        pointer to the 32-byte chaining value (read only)
+  *   rdx        pointer to the 64-byte message block (unaligned loads)
+  *   r8b        byte placed in lane 2 of state row 3
+  *              (presumably block_len -- confirm against the C prototype)
+  *   r9         64-bit value placed in lanes 0-1 of state row 3
+  *              (presumably the block counter -- confirm)
+  *   [entry rsp+0x28]  byte placed in lane 3 of state row 3
+  *              (presumably the flags -- confirm)
+  *   [entry rsp+0x30]  pointer to the 64-byte output buffer
+  * Out:
+  *   out[ 0..31] = (row0 ^ row2, row1 ^ row3)
+  *   out[32..63] = (row2 ^ cv[0..15], row3 ^ cv[16..31])
+  * Clobbers: rax, r8, r10, xmm0-xmm5, flags.
+  *
+  * Every non-volatile XMM register this routine touches is saved and
+  * restored: xmm6-xmm9 (message words / permutation scratch), xmm11
+  * (rotate scratch) and xmm14/xmm15 (ROT8/ROT16 shuffle masks).
+  * xmm6-xmm15 are callee-saved under the Microsoft x64 convention, so
+  * leaving any of them modified corrupts the caller's state.
+  *
+  * Frame: 7 * 16 bytes of XMM spill + 8 bytes of padding = 120, which
+  * brings rsp (entry rsp is 8 mod 16) to 16-byte alignment for movdqa.
+  */
+ _blake3_compress_xof_sse41:
+ blake3_compress_xof_sse41:
+ sub rsp, 120
+ movdqa xmmword ptr [rsp], xmm6
+ movdqa xmmword ptr [rsp+0x10], xmm7
+ movdqa xmmword ptr [rsp+0x20], xmm8
+ movdqa xmmword ptr [rsp+0x30], xmm9
+ movdqa xmmword ptr [rsp+0x40], xmm11
+ movdqa xmmword ptr [rsp+0x50], xmm14
+ movdqa xmmword ptr [rsp+0x60], xmm15
+ /* State rows 0-2: chaining value (rows 0,1) and the IV (row 2). */
+ movups xmm0, xmmword ptr [rcx]
+ movups xmm1, xmmword ptr [rcx+0x10]
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
+ /* Stack args: 120 (frame) + 8 (return address) + 32 (shadow) = 0xA0, */
+ /* and the next 8-byte slot at 0xA8.                                   */
+ movzx eax, byte ptr [rsp+0xA0]
+ movzx r8d, r8b
+ mov r10, qword ptr [rsp+0xA8]
+ shl rax, 32
+ add r8, rax
+ /* Row 3 = [ r9 low, r9 high, r8b, stack byte ]. */
+ movq xmm3, r9
+ movq xmm4, r8
+ punpcklqdq xmm3, xmm4
+ /* Load the 16 message words, de-interleaved into even/odd words:      */
+ /*   xmm4 = m0,m2,m4,m6       xmm5 = m1,m3,m5,m7                        */
+ /*   xmm6 = m14,m8,m10,m12    xmm7 = m15,m9,m11,m13  (rotated by 0x93)  */
+ movups xmm4, xmmword ptr [rdx]
+ movups xmm5, xmmword ptr [rdx+0x10]
+ movaps xmm8, xmm4
+ shufps xmm4, xmm5, 136
+ shufps xmm8, xmm5, 221
+ movaps xmm5, xmm8
+ movups xmm6, xmmword ptr [rdx+0x20]
+ movups xmm7, xmmword ptr [rdx+0x30]
+ movaps xmm8, xmm6
+ shufps xmm6, xmm7, 136
+ pshufd xmm6, xmm6, 0x93
+ shufps xmm8, xmm7, 221
+ pshufd xmm7, xmm8, 0x93
+ /* Byte-shuffle masks used for the 8- and 16-bit rotations. */
+ movaps xmm14, xmmword ptr [ROT8+rip]
+ movaps xmm15, xmmword ptr [ROT16+rip]
+ /* al = rounds remaining. */
+ mov al, 7
+ 9:
+ /* Column step, first half: add message + row 1, rotate by 16 then 12. */
+ paddd xmm0, xmm4
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ /* Column step, second half: rotate by 8 then 7. */
+ paddd xmm0, xmm5
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ /* Rotate rows 0, 3 and 2 so the diagonals line up with row 1. */
+ pshufd xmm0, xmm0, 0x93
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x39
+ /* Diagonal step, first half: rotate by 16 then 12. */
+ paddd xmm0, xmm6
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm15
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 20
+ psrld xmm11, 12
+ por xmm1, xmm11
+ /* Diagonal step, second half: rotate by 8 then 7. */
+ paddd xmm0, xmm7
+ paddd xmm0, xmm1
+ pxor xmm3, xmm0
+ pshufb xmm3, xmm14
+ paddd xmm2, xmm3
+ pxor xmm1, xmm2
+ movdqa xmm11, xmm1
+ pslld xmm1, 25
+ psrld xmm11, 7
+ por xmm1, xmm11
+ /* Undo the row rotation. */
+ pshufd xmm0, xmm0, 0x39
+ pshufd xmm3, xmm3, 0x4E
+ pshufd xmm2, xmm2, 0x93
+ dec al
+ jz 9f
+ /* Not the last round: permute the message words in xmm4-xmm7 for the */
+ /* next round, using xmm8/xmm9 as scratch.                             */
+ movdqa xmm8, xmm4
+ shufps xmm8, xmm5, 214
+ pshufd xmm9, xmm4, 0x0F
+ pshufd xmm4, xmm8, 0x39
+ movdqa xmm8, xmm6
+ shufps xmm8, xmm7, 250
+ pblendw xmm9, xmm8, 0xCC
+ movdqa xmm8, xmm7
+ punpcklqdq xmm8, xmm5
+ pblendw xmm8, xmm6, 0xC0
+ pshufd xmm8, xmm8, 0x78
+ punpckhdq xmm5, xmm7
+ punpckldq xmm6, xmm5
+ pshufd xmm7, xmm6, 0x1E
+ movdqa xmm5, xmm9
+ movdqa xmm6, xmm8
+ jmp 9b
+ 9:
+ /* Reload the original chaining value (xmm4/xmm5 are free now) for the */
+ /* upper-half feed-forward.                                            */
+ movdqu xmm4, xmmword ptr [rcx]
+ movdqu xmm5, xmmword ptr [rcx+0x10]
+ /* Lower output half = rows 0,1 ^ rows 2,3; upper half = rows 2,3 ^ cv. */
+ pxor xmm0, xmm2
+ pxor xmm1, xmm3
+ pxor xmm2, xmm4
+ pxor xmm3, xmm5
+ movups xmmword ptr [r10], xmm0
+ movups xmmword ptr [r10+0x10], xmm1
+ movups xmmword ptr [r10+0x20], xmm2
+ movups xmmword ptr [r10+0x30], xmm3
+ /* Restore every callee-saved XMM register spilled in the prologue. */
+ movdqa xmm6, xmmword ptr [rsp]
+ movdqa xmm7, xmmword ptr [rsp+0x10]
+ movdqa xmm8, xmmword ptr [rsp+0x20]
+ movdqa xmm9, xmmword ptr [rsp+0x30]
+ movdqa xmm11, xmmword ptr [rsp+0x40]
+ movdqa xmm14, xmmword ptr [rsp+0x50]
+ movdqa xmm15, xmmword ptr [rsp+0x60]
+ add rsp, 120
+ ret
2031
+
2032
+
2033
+ .section .rodata
+ /* 64-byte alignment for the first table; every table below is exactly */
+ /* 16 bytes, so each label stays 16-byte aligned for movdqa/movaps and  */
+ /* for aligned memory operands of paddd/pand/pxor.                      */
+ .p2align 6
+ /* The four IV words loaded into state row 2 by the compress routines. */
+ BLAKE3_IV:
+ .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
+ /* pshufb mask: rotates every 32-bit lane by 16 bits. */
+ ROT16:
+ .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13
+ /* pshufb mask: rotates every 32-bit lane right by 8 bits. */
+ ROT8:
+ .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
+ /* Per-lane counter offsets 0..3 for the 4-way path. */
+ ADD0:
+ .long 0, 1, 2, 3
+ /* Per-lane counter stride (4 in every lane). */
+ ADD1:
+ .fill 4, 4, 4
+ /* Each IV word broadcast to all four lanes. */
+ BLAKE3_IV_0:
+ .fill 4, 4, 0x6A09E667
+ BLAKE3_IV_1:
+ .fill 4, 4, 0xBB67AE85
+ BLAKE3_IV_2:
+ .fill 4, 4, 0x3C6EF372
+ BLAKE3_IV_3:
+ .fill 4, 4, 0xA54FF53A
+ /* The value 64 in every lane; inserted into lane 2 of state row 3. */
+ BLAKE3_BLOCK_LEN:
+ .fill 4, 4, 64
+ /* Sign-bit mask: XORed into both operands so the signed pcmpgtd */
+ /* behaves as an unsigned compare when detecting counter carry.  */
+ CMP_MSB_MASK:
+ .fill 4, 4, 0x80000000