digest-blake3 0.34.0 → 0.37.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,2332 @@
1
+ .intel_syntax noprefix  # GAS Intel syntax: destination operand first, registers written without the percent prefix
2
+ .global blake3_hash_many_sse2  # export the hash_many entry point defined below
3
+ .global _blake3_hash_many_sse2  # underscore-prefixed name for the same entry; both labels are placed on the same address below
4
+ .global blake3_compress_in_place_sse2  # exported here; its definition is not visible in this excerpt
5
+ .global _blake3_compress_in_place_sse2  # underscore-prefixed counterpart (presumably for toolchains that prepend an underscore to C symbols - confirm)
6
+ .global blake3_compress_xof_sse2  # exported here; its definition is not visible in this excerpt
7
+ .global _blake3_compress_xof_sse2  # underscore-prefixed counterpart, same caveat as above
8
+ .section .text  # everything that follows is emitted into the code section
9
+ .p2align 6  # align the next label to 2^6 = 64 bytes (presumably one cache line - confirm for the target CPU)
10
+ _blake3_hash_many_sse2:
11
+ blake3_hash_many_sse2:
12
+ push r15
13
+ push r14
14
+ push r13
15
+ push r12
16
+ push rsi
17
+ push rdi
18
+ push rbx
19
+ push rbp
20
+ mov rbp, rsp
21
+ sub rsp, 528
22
+ and rsp, 0xFFFFFFFFFFFFFFC0
23
+ movdqa xmmword ptr [rsp+0x170], xmm6
24
+ movdqa xmmword ptr [rsp+0x180], xmm7
25
+ movdqa xmmword ptr [rsp+0x190], xmm8
26
+ movdqa xmmword ptr [rsp+0x1A0], xmm9
27
+ movdqa xmmword ptr [rsp+0x1B0], xmm10
28
+ movdqa xmmword ptr [rsp+0x1C0], xmm11
29
+ movdqa xmmword ptr [rsp+0x1D0], xmm12
30
+ movdqa xmmword ptr [rsp+0x1E0], xmm13
31
+ movdqa xmmword ptr [rsp+0x1F0], xmm14
32
+ movdqa xmmword ptr [rsp+0x200], xmm15
33
+ mov rdi, rcx
34
+ mov rsi, rdx
35
+ mov rdx, r8
36
+ mov rcx, r9
37
+ mov r8, qword ptr [rbp+0x68]
38
+ movzx r9, byte ptr [rbp+0x70]
39
+ neg r9d
40
+ movd xmm0, r9d
41
+ pshufd xmm0, xmm0, 0x00
42
+ movdqa xmmword ptr [rsp+0x130], xmm0
43
+ movdqa xmm1, xmm0
44
+ pand xmm1, xmmword ptr [ADD0+rip]
45
+ pand xmm0, xmmword ptr [ADD1+rip]
46
+ movdqa xmmword ptr [rsp+0x150], xmm0
47
+ movd xmm0, r8d
48
+ pshufd xmm0, xmm0, 0x00
49
+ paddd xmm0, xmm1
50
+ movdqa xmmword ptr [rsp+0x110], xmm0
51
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
52
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
53
+ pcmpgtd xmm1, xmm0
54
+ shr r8, 32
55
+ movd xmm2, r8d
56
+ pshufd xmm2, xmm2, 0x00
57
+ psubd xmm2, xmm1
58
+ movdqa xmmword ptr [rsp+0x120], xmm2
59
+ mov rbx, qword ptr [rbp+0x90]
60
+ mov r15, rdx
61
+ shl r15, 6
62
+ movzx r13d, byte ptr [rbp+0x78]
63
+ movzx r12d, byte ptr [rbp+0x88]
64
+ cmp rsi, 4
65
+ jc 3f
66
+ 2:
67
+ movdqu xmm3, xmmword ptr [rcx]
68
+ pshufd xmm0, xmm3, 0x00
69
+ pshufd xmm1, xmm3, 0x55
70
+ pshufd xmm2, xmm3, 0xAA
71
+ pshufd xmm3, xmm3, 0xFF
72
+ movdqu xmm7, xmmword ptr [rcx+0x10]
73
+ pshufd xmm4, xmm7, 0x00
74
+ pshufd xmm5, xmm7, 0x55
75
+ pshufd xmm6, xmm7, 0xAA
76
+ pshufd xmm7, xmm7, 0xFF
77
+ mov r8, qword ptr [rdi]
78
+ mov r9, qword ptr [rdi+0x8]
79
+ mov r10, qword ptr [rdi+0x10]
80
+ mov r11, qword ptr [rdi+0x18]
81
+ movzx eax, byte ptr [rbp+0x80]
82
+ or eax, r13d
83
+ xor edx, edx
84
+ 9:
85
+ mov r14d, eax
86
+ or eax, r12d
87
+ add rdx, 64
88
+ cmp rdx, r15
89
+ cmovne eax, r14d
90
+ movdqu xmm8, xmmword ptr [r8+rdx-0x40]
91
+ movdqu xmm9, xmmword ptr [r9+rdx-0x40]
92
+ movdqu xmm10, xmmword ptr [r10+rdx-0x40]
93
+ movdqu xmm11, xmmword ptr [r11+rdx-0x40]
94
+ movdqa xmm12, xmm8
95
+ punpckldq xmm8, xmm9
96
+ punpckhdq xmm12, xmm9
97
+ movdqa xmm14, xmm10
98
+ punpckldq xmm10, xmm11
99
+ punpckhdq xmm14, xmm11
100
+ movdqa xmm9, xmm8
101
+ punpcklqdq xmm8, xmm10
102
+ punpckhqdq xmm9, xmm10
103
+ movdqa xmm13, xmm12
104
+ punpcklqdq xmm12, xmm14
105
+ punpckhqdq xmm13, xmm14
106
+ movdqa xmmword ptr [rsp], xmm8
107
+ movdqa xmmword ptr [rsp+0x10], xmm9
108
+ movdqa xmmword ptr [rsp+0x20], xmm12
109
+ movdqa xmmword ptr [rsp+0x30], xmm13
110
+ movdqu xmm8, xmmword ptr [r8+rdx-0x30]
111
+ movdqu xmm9, xmmword ptr [r9+rdx-0x30]
112
+ movdqu xmm10, xmmword ptr [r10+rdx-0x30]
113
+ movdqu xmm11, xmmword ptr [r11+rdx-0x30]
114
+ movdqa xmm12, xmm8
115
+ punpckldq xmm8, xmm9
116
+ punpckhdq xmm12, xmm9
117
+ movdqa xmm14, xmm10
118
+ punpckldq xmm10, xmm11
119
+ punpckhdq xmm14, xmm11
120
+ movdqa xmm9, xmm8
121
+ punpcklqdq xmm8, xmm10
122
+ punpckhqdq xmm9, xmm10
123
+ movdqa xmm13, xmm12
124
+ punpcklqdq xmm12, xmm14
125
+ punpckhqdq xmm13, xmm14
126
+ movdqa xmmword ptr [rsp+0x40], xmm8
127
+ movdqa xmmword ptr [rsp+0x50], xmm9
128
+ movdqa xmmword ptr [rsp+0x60], xmm12
129
+ movdqa xmmword ptr [rsp+0x70], xmm13
130
+ movdqu xmm8, xmmword ptr [r8+rdx-0x20]
131
+ movdqu xmm9, xmmword ptr [r9+rdx-0x20]
132
+ movdqu xmm10, xmmword ptr [r10+rdx-0x20]
133
+ movdqu xmm11, xmmword ptr [r11+rdx-0x20]
134
+ movdqa xmm12, xmm8
135
+ punpckldq xmm8, xmm9
136
+ punpckhdq xmm12, xmm9
137
+ movdqa xmm14, xmm10
138
+ punpckldq xmm10, xmm11
139
+ punpckhdq xmm14, xmm11
140
+ movdqa xmm9, xmm8
141
+ punpcklqdq xmm8, xmm10
142
+ punpckhqdq xmm9, xmm10
143
+ movdqa xmm13, xmm12
144
+ punpcklqdq xmm12, xmm14
145
+ punpckhqdq xmm13, xmm14
146
+ movdqa xmmword ptr [rsp+0x80], xmm8
147
+ movdqa xmmword ptr [rsp+0x90], xmm9
148
+ movdqa xmmword ptr [rsp+0xA0], xmm12
149
+ movdqa xmmword ptr [rsp+0xB0], xmm13
150
+ movdqu xmm8, xmmword ptr [r8+rdx-0x10]
151
+ movdqu xmm9, xmmword ptr [r9+rdx-0x10]
152
+ movdqu xmm10, xmmword ptr [r10+rdx-0x10]
153
+ movdqu xmm11, xmmword ptr [r11+rdx-0x10]
154
+ movdqa xmm12, xmm8
155
+ punpckldq xmm8, xmm9
156
+ punpckhdq xmm12, xmm9
157
+ movdqa xmm14, xmm10
158
+ punpckldq xmm10, xmm11
159
+ punpckhdq xmm14, xmm11
160
+ movdqa xmm9, xmm8
161
+ punpcklqdq xmm8, xmm10
162
+ punpckhqdq xmm9, xmm10
163
+ movdqa xmm13, xmm12
164
+ punpcklqdq xmm12, xmm14
165
+ punpckhqdq xmm13, xmm14
166
+ movdqa xmmword ptr [rsp+0xC0], xmm8
167
+ movdqa xmmword ptr [rsp+0xD0], xmm9
168
+ movdqa xmmword ptr [rsp+0xE0], xmm12
169
+ movdqa xmmword ptr [rsp+0xF0], xmm13
170
+ movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
171
+ movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
172
+ movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
173
+ movdqa xmm12, xmmword ptr [rsp+0x110]
174
+ movdqa xmm13, xmmword ptr [rsp+0x120]
175
+ movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
176
+ movd xmm15, eax
177
+ pshufd xmm15, xmm15, 0x00
178
+ prefetcht0 [r8+rdx+0x80]
179
+ prefetcht0 [r9+rdx+0x80]
180
+ prefetcht0 [r10+rdx+0x80]
181
+ prefetcht0 [r11+rdx+0x80]
182
+ paddd xmm0, xmmword ptr [rsp]
183
+ paddd xmm1, xmmword ptr [rsp+0x20]
184
+ paddd xmm2, xmmword ptr [rsp+0x40]
185
+ paddd xmm3, xmmword ptr [rsp+0x60]
186
+ paddd xmm0, xmm4
187
+ paddd xmm1, xmm5
188
+ paddd xmm2, xmm6
189
+ paddd xmm3, xmm7
190
+ pxor xmm12, xmm0
191
+ pxor xmm13, xmm1
192
+ pxor xmm14, xmm2
193
+ pxor xmm15, xmm3
194
+ pshuflw xmm12, xmm12, 0xB1
195
+ pshufhw xmm12, xmm12, 0xB1
196
+ pshuflw xmm13, xmm13, 0xB1
197
+ pshufhw xmm13, xmm13, 0xB1
198
+ pshuflw xmm14, xmm14, 0xB1
199
+ pshufhw xmm14, xmm14, 0xB1
200
+ pshuflw xmm15, xmm15, 0xB1
201
+ pshufhw xmm15, xmm15, 0xB1
202
+ movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
203
+ paddd xmm8, xmm12
204
+ paddd xmm9, xmm13
205
+ paddd xmm10, xmm14
206
+ paddd xmm11, xmm15
207
+ pxor xmm4, xmm8
208
+ pxor xmm5, xmm9
209
+ pxor xmm6, xmm10
210
+ pxor xmm7, xmm11
211
+ movdqa xmmword ptr [rsp+0x100], xmm8
212
+ movdqa xmm8, xmm4
213
+ psrld xmm8, 12
214
+ pslld xmm4, 20
215
+ por xmm4, xmm8
216
+ movdqa xmm8, xmm5
217
+ psrld xmm8, 12
218
+ pslld xmm5, 20
219
+ por xmm5, xmm8
220
+ movdqa xmm8, xmm6
221
+ psrld xmm8, 12
222
+ pslld xmm6, 20
223
+ por xmm6, xmm8
224
+ movdqa xmm8, xmm7
225
+ psrld xmm8, 12
226
+ pslld xmm7, 20
227
+ por xmm7, xmm8
228
+ paddd xmm0, xmmword ptr [rsp+0x10]
229
+ paddd xmm1, xmmword ptr [rsp+0x30]
230
+ paddd xmm2, xmmword ptr [rsp+0x50]
231
+ paddd xmm3, xmmword ptr [rsp+0x70]
232
+ paddd xmm0, xmm4
233
+ paddd xmm1, xmm5
234
+ paddd xmm2, xmm6
235
+ paddd xmm3, xmm7
236
+ pxor xmm12, xmm0
237
+ pxor xmm13, xmm1
238
+ pxor xmm14, xmm2
239
+ pxor xmm15, xmm3
240
+ movdqa xmm8, xmm12
241
+ psrld xmm12, 8
242
+ pslld xmm8, 24
243
+ pxor xmm12, xmm8
244
+ movdqa xmm8, xmm13
245
+ psrld xmm13, 8
246
+ pslld xmm8, 24
247
+ pxor xmm13, xmm8
248
+ movdqa xmm8, xmm14
249
+ psrld xmm14, 8
250
+ pslld xmm8, 24
251
+ pxor xmm14, xmm8
252
+ movdqa xmm8, xmm15
253
+ psrld xmm15, 8
254
+ pslld xmm8, 24
255
+ pxor xmm15, xmm8
256
+ movdqa xmm8, xmmword ptr [rsp+0x100]
257
+ paddd xmm8, xmm12
258
+ paddd xmm9, xmm13
259
+ paddd xmm10, xmm14
260
+ paddd xmm11, xmm15
261
+ pxor xmm4, xmm8
262
+ pxor xmm5, xmm9
263
+ pxor xmm6, xmm10
264
+ pxor xmm7, xmm11
265
+ movdqa xmmword ptr [rsp+0x100], xmm8
266
+ movdqa xmm8, xmm4
267
+ psrld xmm8, 7
268
+ pslld xmm4, 25
269
+ por xmm4, xmm8
270
+ movdqa xmm8, xmm5
271
+ psrld xmm8, 7
272
+ pslld xmm5, 25
273
+ por xmm5, xmm8
274
+ movdqa xmm8, xmm6
275
+ psrld xmm8, 7
276
+ pslld xmm6, 25
277
+ por xmm6, xmm8
278
+ movdqa xmm8, xmm7
279
+ psrld xmm8, 7
280
+ pslld xmm7, 25
281
+ por xmm7, xmm8
282
+ paddd xmm0, xmmword ptr [rsp+0x80]
283
+ paddd xmm1, xmmword ptr [rsp+0xA0]
284
+ paddd xmm2, xmmword ptr [rsp+0xC0]
285
+ paddd xmm3, xmmword ptr [rsp+0xE0]
286
+ paddd xmm0, xmm5
287
+ paddd xmm1, xmm6
288
+ paddd xmm2, xmm7
289
+ paddd xmm3, xmm4
290
+ pxor xmm15, xmm0
291
+ pxor xmm12, xmm1
292
+ pxor xmm13, xmm2
293
+ pxor xmm14, xmm3
294
+ pshuflw xmm15, xmm15, 0xB1
295
+ pshufhw xmm15, xmm15, 0xB1
296
+ pshuflw xmm12, xmm12, 0xB1
297
+ pshufhw xmm12, xmm12, 0xB1
298
+ pshuflw xmm13, xmm13, 0xB1
299
+ pshufhw xmm13, xmm13, 0xB1
300
+ pshuflw xmm14, xmm14, 0xB1
301
+ pshufhw xmm14, xmm14, 0xB1
302
+ paddd xmm10, xmm15
303
+ paddd xmm11, xmm12
304
+ movdqa xmm8, xmmword ptr [rsp+0x100]
305
+ paddd xmm8, xmm13
306
+ paddd xmm9, xmm14
307
+ pxor xmm5, xmm10
308
+ pxor xmm6, xmm11
309
+ pxor xmm7, xmm8
310
+ pxor xmm4, xmm9
311
+ movdqa xmmword ptr [rsp+0x100], xmm8
312
+ movdqa xmm8, xmm5
313
+ psrld xmm8, 12
314
+ pslld xmm5, 20
315
+ por xmm5, xmm8
316
+ movdqa xmm8, xmm6
317
+ psrld xmm8, 12
318
+ pslld xmm6, 20
319
+ por xmm6, xmm8
320
+ movdqa xmm8, xmm7
321
+ psrld xmm8, 12
322
+ pslld xmm7, 20
323
+ por xmm7, xmm8
324
+ movdqa xmm8, xmm4
325
+ psrld xmm8, 12
326
+ pslld xmm4, 20
327
+ por xmm4, xmm8
328
+ paddd xmm0, xmmword ptr [rsp+0x90]
329
+ paddd xmm1, xmmword ptr [rsp+0xB0]
330
+ paddd xmm2, xmmword ptr [rsp+0xD0]
331
+ paddd xmm3, xmmword ptr [rsp+0xF0]
332
+ paddd xmm0, xmm5
333
+ paddd xmm1, xmm6
334
+ paddd xmm2, xmm7
335
+ paddd xmm3, xmm4
336
+ pxor xmm15, xmm0
337
+ pxor xmm12, xmm1
338
+ pxor xmm13, xmm2
339
+ pxor xmm14, xmm3
340
+ movdqa xmm8, xmm15
341
+ psrld xmm15, 8
342
+ pslld xmm8, 24
343
+ pxor xmm15, xmm8
344
+ movdqa xmm8, xmm12
345
+ psrld xmm12, 8
346
+ pslld xmm8, 24
347
+ pxor xmm12, xmm8
348
+ movdqa xmm8, xmm13
349
+ psrld xmm13, 8
350
+ pslld xmm8, 24
351
+ pxor xmm13, xmm8
352
+ movdqa xmm8, xmm14
353
+ psrld xmm14, 8
354
+ pslld xmm8, 24
355
+ pxor xmm14, xmm8
356
+ paddd xmm10, xmm15
357
+ paddd xmm11, xmm12
358
+ movdqa xmm8, xmmword ptr [rsp+0x100]
359
+ paddd xmm8, xmm13
360
+ paddd xmm9, xmm14
361
+ pxor xmm5, xmm10
362
+ pxor xmm6, xmm11
363
+ pxor xmm7, xmm8
364
+ pxor xmm4, xmm9
365
+ movdqa xmmword ptr [rsp+0x100], xmm8
366
+ movdqa xmm8, xmm5
367
+ psrld xmm8, 7
368
+ pslld xmm5, 25
369
+ por xmm5, xmm8
370
+ movdqa xmm8, xmm6
371
+ psrld xmm8, 7
372
+ pslld xmm6, 25
373
+ por xmm6, xmm8
374
+ movdqa xmm8, xmm7
375
+ psrld xmm8, 7
376
+ pslld xmm7, 25
377
+ por xmm7, xmm8
378
+ movdqa xmm8, xmm4
379
+ psrld xmm8, 7
380
+ pslld xmm4, 25
381
+ por xmm4, xmm8
382
+ paddd xmm0, xmmword ptr [rsp+0x20]
383
+ paddd xmm1, xmmword ptr [rsp+0x30]
384
+ paddd xmm2, xmmword ptr [rsp+0x70]
385
+ paddd xmm3, xmmword ptr [rsp+0x40]
386
+ paddd xmm0, xmm4
387
+ paddd xmm1, xmm5
388
+ paddd xmm2, xmm6
389
+ paddd xmm3, xmm7
390
+ pxor xmm12, xmm0
391
+ pxor xmm13, xmm1
392
+ pxor xmm14, xmm2
393
+ pxor xmm15, xmm3
394
+ pshuflw xmm12, xmm12, 0xB1
395
+ pshufhw xmm12, xmm12, 0xB1
396
+ pshuflw xmm13, xmm13, 0xB1
397
+ pshufhw xmm13, xmm13, 0xB1
398
+ pshuflw xmm14, xmm14, 0xB1
399
+ pshufhw xmm14, xmm14, 0xB1
400
+ pshuflw xmm15, xmm15, 0xB1
401
+ pshufhw xmm15, xmm15, 0xB1
402
+ movdqa xmm8, xmmword ptr [rsp+0x100]
403
+ paddd xmm8, xmm12
404
+ paddd xmm9, xmm13
405
+ paddd xmm10, xmm14
406
+ paddd xmm11, xmm15
407
+ pxor xmm4, xmm8
408
+ pxor xmm5, xmm9
409
+ pxor xmm6, xmm10
410
+ pxor xmm7, xmm11
411
+ movdqa xmmword ptr [rsp+0x100], xmm8
412
+ movdqa xmm8, xmm4
413
+ psrld xmm8, 12
414
+ pslld xmm4, 20
415
+ por xmm4, xmm8
416
+ movdqa xmm8, xmm5
417
+ psrld xmm8, 12
418
+ pslld xmm5, 20
419
+ por xmm5, xmm8
420
+ movdqa xmm8, xmm6
421
+ psrld xmm8, 12
422
+ pslld xmm6, 20
423
+ por xmm6, xmm8
424
+ movdqa xmm8, xmm7
425
+ psrld xmm8, 12
426
+ pslld xmm7, 20
427
+ por xmm7, xmm8
428
+ paddd xmm0, xmmword ptr [rsp+0x60]
429
+ paddd xmm1, xmmword ptr [rsp+0xA0]
430
+ paddd xmm2, xmmword ptr [rsp]
431
+ paddd xmm3, xmmword ptr [rsp+0xD0]
432
+ paddd xmm0, xmm4
433
+ paddd xmm1, xmm5
434
+ paddd xmm2, xmm6
435
+ paddd xmm3, xmm7
436
+ pxor xmm12, xmm0
437
+ pxor xmm13, xmm1
438
+ pxor xmm14, xmm2
439
+ pxor xmm15, xmm3
440
+ movdqa xmm8, xmm12
441
+ psrld xmm12, 8
442
+ pslld xmm8, 24
443
+ pxor xmm12, xmm8
444
+ movdqa xmm8, xmm13
445
+ psrld xmm13, 8
446
+ pslld xmm8, 24
447
+ pxor xmm13, xmm8
448
+ movdqa xmm8, xmm14
449
+ psrld xmm14, 8
450
+ pslld xmm8, 24
451
+ pxor xmm14, xmm8
452
+ movdqa xmm8, xmm15
453
+ psrld xmm15, 8
454
+ pslld xmm8, 24
455
+ pxor xmm15, xmm8
456
+ movdqa xmm8, xmmword ptr [rsp+0x100]
457
+ paddd xmm8, xmm12
458
+ paddd xmm9, xmm13
459
+ paddd xmm10, xmm14
460
+ paddd xmm11, xmm15
461
+ pxor xmm4, xmm8
462
+ pxor xmm5, xmm9
463
+ pxor xmm6, xmm10
464
+ pxor xmm7, xmm11
465
+ movdqa xmmword ptr [rsp+0x100], xmm8
466
+ movdqa xmm8, xmm4
467
+ psrld xmm8, 7
468
+ pslld xmm4, 25
469
+ por xmm4, xmm8
470
+ movdqa xmm8, xmm5
471
+ psrld xmm8, 7
472
+ pslld xmm5, 25
473
+ por xmm5, xmm8
474
+ movdqa xmm8, xmm6
475
+ psrld xmm8, 7
476
+ pslld xmm6, 25
477
+ por xmm6, xmm8
478
+ movdqa xmm8, xmm7
479
+ psrld xmm8, 7
480
+ pslld xmm7, 25
481
+ por xmm7, xmm8
482
+ paddd xmm0, xmmword ptr [rsp+0x10]
483
+ paddd xmm1, xmmword ptr [rsp+0xC0]
484
+ paddd xmm2, xmmword ptr [rsp+0x90]
485
+ paddd xmm3, xmmword ptr [rsp+0xF0]
486
+ paddd xmm0, xmm5
487
+ paddd xmm1, xmm6
488
+ paddd xmm2, xmm7
489
+ paddd xmm3, xmm4
490
+ pxor xmm15, xmm0
491
+ pxor xmm12, xmm1
492
+ pxor xmm13, xmm2
493
+ pxor xmm14, xmm3
494
+ pshuflw xmm15, xmm15, 0xB1
495
+ pshufhw xmm15, xmm15, 0xB1
496
+ pshuflw xmm12, xmm12, 0xB1
497
+ pshufhw xmm12, xmm12, 0xB1
498
+ pshuflw xmm13, xmm13, 0xB1
499
+ pshufhw xmm13, xmm13, 0xB1
500
+ pshuflw xmm14, xmm14, 0xB1
501
+ pshufhw xmm14, xmm14, 0xB1
502
+ paddd xmm10, xmm15
503
+ paddd xmm11, xmm12
504
+ movdqa xmm8, xmmword ptr [rsp+0x100]
505
+ paddd xmm8, xmm13
506
+ paddd xmm9, xmm14
507
+ pxor xmm5, xmm10
508
+ pxor xmm6, xmm11
509
+ pxor xmm7, xmm8
510
+ pxor xmm4, xmm9
511
+ movdqa xmmword ptr [rsp+0x100], xmm8
512
+ movdqa xmm8, xmm5
513
+ psrld xmm8, 12
514
+ pslld xmm5, 20
515
+ por xmm5, xmm8
516
+ movdqa xmm8, xmm6
517
+ psrld xmm8, 12
518
+ pslld xmm6, 20
519
+ por xmm6, xmm8
520
+ movdqa xmm8, xmm7
521
+ psrld xmm8, 12
522
+ pslld xmm7, 20
523
+ por xmm7, xmm8
524
+ movdqa xmm8, xmm4
525
+ psrld xmm8, 12
526
+ pslld xmm4, 20
527
+ por xmm4, xmm8
528
+ paddd xmm0, xmmword ptr [rsp+0xB0]
529
+ paddd xmm1, xmmword ptr [rsp+0x50]
530
+ paddd xmm2, xmmword ptr [rsp+0xE0]
531
+ paddd xmm3, xmmword ptr [rsp+0x80]
532
+ paddd xmm0, xmm5
533
+ paddd xmm1, xmm6
534
+ paddd xmm2, xmm7
535
+ paddd xmm3, xmm4
536
+ pxor xmm15, xmm0
537
+ pxor xmm12, xmm1
538
+ pxor xmm13, xmm2
539
+ pxor xmm14, xmm3
540
+ movdqa xmm8, xmm15
541
+ psrld xmm15, 8
542
+ pslld xmm8, 24
543
+ pxor xmm15, xmm8
544
+ movdqa xmm8, xmm12
545
+ psrld xmm12, 8
546
+ pslld xmm8, 24
547
+ pxor xmm12, xmm8
548
+ movdqa xmm8, xmm13
549
+ psrld xmm13, 8
550
+ pslld xmm8, 24
551
+ pxor xmm13, xmm8
552
+ movdqa xmm8, xmm14
553
+ psrld xmm14, 8
554
+ pslld xmm8, 24
555
+ pxor xmm14, xmm8
556
+ paddd xmm10, xmm15
557
+ paddd xmm11, xmm12
558
+ movdqa xmm8, xmmword ptr [rsp+0x100]
559
+ paddd xmm8, xmm13
560
+ paddd xmm9, xmm14
561
+ pxor xmm5, xmm10
562
+ pxor xmm6, xmm11
563
+ pxor xmm7, xmm8
564
+ pxor xmm4, xmm9
565
+ movdqa xmmword ptr [rsp+0x100], xmm8
566
+ movdqa xmm8, xmm5
567
+ psrld xmm8, 7
568
+ pslld xmm5, 25
569
+ por xmm5, xmm8
570
+ movdqa xmm8, xmm6
571
+ psrld xmm8, 7
572
+ pslld xmm6, 25
573
+ por xmm6, xmm8
574
+ movdqa xmm8, xmm7
575
+ psrld xmm8, 7
576
+ pslld xmm7, 25
577
+ por xmm7, xmm8
578
+ movdqa xmm8, xmm4
579
+ psrld xmm8, 7
580
+ pslld xmm4, 25
581
+ por xmm4, xmm8
582
+ paddd xmm0, xmmword ptr [rsp+0x30]
583
+ paddd xmm1, xmmword ptr [rsp+0xA0]
584
+ paddd xmm2, xmmword ptr [rsp+0xD0]
585
+ paddd xmm3, xmmword ptr [rsp+0x70]
586
+ paddd xmm0, xmm4
587
+ paddd xmm1, xmm5
588
+ paddd xmm2, xmm6
589
+ paddd xmm3, xmm7
590
+ pxor xmm12, xmm0
591
+ pxor xmm13, xmm1
592
+ pxor xmm14, xmm2
593
+ pxor xmm15, xmm3
594
+ pshuflw xmm12, xmm12, 0xB1
595
+ pshufhw xmm12, xmm12, 0xB1
596
+ pshuflw xmm13, xmm13, 0xB1
597
+ pshufhw xmm13, xmm13, 0xB1
598
+ pshuflw xmm14, xmm14, 0xB1
599
+ pshufhw xmm14, xmm14, 0xB1
600
+ pshuflw xmm15, xmm15, 0xB1
601
+ pshufhw xmm15, xmm15, 0xB1
602
+ movdqa xmm8, xmmword ptr [rsp+0x100]
603
+ paddd xmm8, xmm12
604
+ paddd xmm9, xmm13
605
+ paddd xmm10, xmm14
606
+ paddd xmm11, xmm15
607
+ pxor xmm4, xmm8
608
+ pxor xmm5, xmm9
609
+ pxor xmm6, xmm10
610
+ pxor xmm7, xmm11
611
+ movdqa xmmword ptr [rsp+0x100], xmm8
612
+ movdqa xmm8, xmm4
613
+ psrld xmm8, 12
614
+ pslld xmm4, 20
615
+ por xmm4, xmm8
616
+ movdqa xmm8, xmm5
617
+ psrld xmm8, 12
618
+ pslld xmm5, 20
619
+ por xmm5, xmm8
620
+ movdqa xmm8, xmm6
621
+ psrld xmm8, 12
622
+ pslld xmm6, 20
623
+ por xmm6, xmm8
624
+ movdqa xmm8, xmm7
625
+ psrld xmm8, 12
626
+ pslld xmm7, 20
627
+ por xmm7, xmm8
628
+ paddd xmm0, xmmword ptr [rsp+0x40]
629
+ paddd xmm1, xmmword ptr [rsp+0xC0]
630
+ paddd xmm2, xmmword ptr [rsp+0x20]
631
+ paddd xmm3, xmmword ptr [rsp+0xE0]
632
+ paddd xmm0, xmm4
633
+ paddd xmm1, xmm5
634
+ paddd xmm2, xmm6
635
+ paddd xmm3, xmm7
636
+ pxor xmm12, xmm0
637
+ pxor xmm13, xmm1
638
+ pxor xmm14, xmm2
639
+ pxor xmm15, xmm3
640
+ movdqa xmm8, xmm12
641
+ psrld xmm12, 8
642
+ pslld xmm8, 24
643
+ pxor xmm12, xmm8
644
+ movdqa xmm8, xmm13
645
+ psrld xmm13, 8
646
+ pslld xmm8, 24
647
+ pxor xmm13, xmm8
648
+ movdqa xmm8, xmm14
649
+ psrld xmm14, 8
650
+ pslld xmm8, 24
651
+ pxor xmm14, xmm8
652
+ movdqa xmm8, xmm15
653
+ psrld xmm15, 8
654
+ pslld xmm8, 24
655
+ pxor xmm15, xmm8
656
+ movdqa xmm8, xmmword ptr [rsp+0x100]
657
+ paddd xmm8, xmm12
658
+ paddd xmm9, xmm13
659
+ paddd xmm10, xmm14
660
+ paddd xmm11, xmm15
661
+ pxor xmm4, xmm8
662
+ pxor xmm5, xmm9
663
+ pxor xmm6, xmm10
664
+ pxor xmm7, xmm11
665
+ movdqa xmmword ptr [rsp+0x100], xmm8
666
+ movdqa xmm8, xmm4
667
+ psrld xmm8, 7
668
+ pslld xmm4, 25
669
+ por xmm4, xmm8
670
+ movdqa xmm8, xmm5
671
+ psrld xmm8, 7
672
+ pslld xmm5, 25
673
+ por xmm5, xmm8
674
+ movdqa xmm8, xmm6
675
+ psrld xmm8, 7
676
+ pslld xmm6, 25
677
+ por xmm6, xmm8
678
+ movdqa xmm8, xmm7
679
+ psrld xmm8, 7
680
+ pslld xmm7, 25
681
+ por xmm7, xmm8
682
+ paddd xmm0, xmmword ptr [rsp+0x60]
683
+ paddd xmm1, xmmword ptr [rsp+0x90]
684
+ paddd xmm2, xmmword ptr [rsp+0xB0]
685
+ paddd xmm3, xmmword ptr [rsp+0x80]
686
+ paddd xmm0, xmm5
687
+ paddd xmm1, xmm6
688
+ paddd xmm2, xmm7
689
+ paddd xmm3, xmm4
690
+ pxor xmm15, xmm0
691
+ pxor xmm12, xmm1
692
+ pxor xmm13, xmm2
693
+ pxor xmm14, xmm3
694
+ pshuflw xmm15, xmm15, 0xB1
695
+ pshufhw xmm15, xmm15, 0xB1
696
+ pshuflw xmm12, xmm12, 0xB1
697
+ pshufhw xmm12, xmm12, 0xB1
698
+ pshuflw xmm13, xmm13, 0xB1
699
+ pshufhw xmm13, xmm13, 0xB1
700
+ pshuflw xmm14, xmm14, 0xB1
701
+ pshufhw xmm14, xmm14, 0xB1
702
+ paddd xmm10, xmm15
703
+ paddd xmm11, xmm12
704
+ movdqa xmm8, xmmword ptr [rsp+0x100]
705
+ paddd xmm8, xmm13
706
+ paddd xmm9, xmm14
707
+ pxor xmm5, xmm10
708
+ pxor xmm6, xmm11
709
+ pxor xmm7, xmm8
710
+ pxor xmm4, xmm9
711
+ movdqa xmmword ptr [rsp+0x100], xmm8
712
+ movdqa xmm8, xmm5
713
+ psrld xmm8, 12
714
+ pslld xmm5, 20
715
+ por xmm5, xmm8
716
+ movdqa xmm8, xmm6
717
+ psrld xmm8, 12
718
+ pslld xmm6, 20
719
+ por xmm6, xmm8
720
+ movdqa xmm8, xmm7
721
+ psrld xmm8, 12
722
+ pslld xmm7, 20
723
+ por xmm7, xmm8
724
+ movdqa xmm8, xmm4
725
+ psrld xmm8, 12
726
+ pslld xmm4, 20
727
+ por xmm4, xmm8
728
+ paddd xmm0, xmmword ptr [rsp+0x50]
729
+ paddd xmm1, xmmword ptr [rsp]
730
+ paddd xmm2, xmmword ptr [rsp+0xF0]
731
+ paddd xmm3, xmmword ptr [rsp+0x10]
732
+ paddd xmm0, xmm5
733
+ paddd xmm1, xmm6
734
+ paddd xmm2, xmm7
735
+ paddd xmm3, xmm4
736
+ pxor xmm15, xmm0
737
+ pxor xmm12, xmm1
738
+ pxor xmm13, xmm2
739
+ pxor xmm14, xmm3
740
+ movdqa xmm8, xmm15
741
+ psrld xmm15, 8
742
+ pslld xmm8, 24
743
+ pxor xmm15, xmm8
744
+ movdqa xmm8, xmm12
745
+ psrld xmm12, 8
746
+ pslld xmm8, 24
747
+ pxor xmm12, xmm8
748
+ movdqa xmm8, xmm13
749
+ psrld xmm13, 8
750
+ pslld xmm8, 24
751
+ pxor xmm13, xmm8
752
+ movdqa xmm8, xmm14
753
+ psrld xmm14, 8
754
+ pslld xmm8, 24
755
+ pxor xmm14, xmm8
756
+ paddd xmm10, xmm15
757
+ paddd xmm11, xmm12
758
+ movdqa xmm8, xmmword ptr [rsp+0x100]
759
+ paddd xmm8, xmm13
760
+ paddd xmm9, xmm14
761
+ pxor xmm5, xmm10
762
+ pxor xmm6, xmm11
763
+ pxor xmm7, xmm8
764
+ pxor xmm4, xmm9
765
+ movdqa xmmword ptr [rsp+0x100], xmm8
766
+ movdqa xmm8, xmm5
767
+ psrld xmm8, 7
768
+ pslld xmm5, 25
769
+ por xmm5, xmm8
770
+ movdqa xmm8, xmm6
771
+ psrld xmm8, 7
772
+ pslld xmm6, 25
773
+ por xmm6, xmm8
774
+ movdqa xmm8, xmm7
775
+ psrld xmm8, 7
776
+ pslld xmm7, 25
777
+ por xmm7, xmm8
778
+ movdqa xmm8, xmm4
779
+ psrld xmm8, 7
780
+ pslld xmm4, 25
781
+ por xmm4, xmm8
782
+ paddd xmm0, xmmword ptr [rsp+0xA0]
783
+ paddd xmm1, xmmword ptr [rsp+0xC0]
784
+ paddd xmm2, xmmword ptr [rsp+0xE0]
785
+ paddd xmm3, xmmword ptr [rsp+0xD0]
786
+ paddd xmm0, xmm4
787
+ paddd xmm1, xmm5
788
+ paddd xmm2, xmm6
789
+ paddd xmm3, xmm7
790
+ pxor xmm12, xmm0
791
+ pxor xmm13, xmm1
792
+ pxor xmm14, xmm2
793
+ pxor xmm15, xmm3
794
+ pshuflw xmm12, xmm12, 0xB1
795
+ pshufhw xmm12, xmm12, 0xB1
796
+ pshuflw xmm13, xmm13, 0xB1
797
+ pshufhw xmm13, xmm13, 0xB1
798
+ pshuflw xmm14, xmm14, 0xB1
799
+ pshufhw xmm14, xmm14, 0xB1
800
+ pshuflw xmm15, xmm15, 0xB1
801
+ pshufhw xmm15, xmm15, 0xB1
802
+ movdqa xmm8, xmmword ptr [rsp+0x100]
803
+ paddd xmm8, xmm12
804
+ paddd xmm9, xmm13
805
+ paddd xmm10, xmm14
806
+ paddd xmm11, xmm15
807
+ pxor xmm4, xmm8
808
+ pxor xmm5, xmm9
809
+ pxor xmm6, xmm10
810
+ pxor xmm7, xmm11
811
+ movdqa xmmword ptr [rsp+0x100], xmm8
812
+ movdqa xmm8, xmm4
813
+ psrld xmm8, 12
814
+ pslld xmm4, 20
815
+ por xmm4, xmm8
816
+ movdqa xmm8, xmm5
817
+ psrld xmm8, 12
818
+ pslld xmm5, 20
819
+ por xmm5, xmm8
820
+ movdqa xmm8, xmm6
821
+ psrld xmm8, 12
822
+ pslld xmm6, 20
823
+ por xmm6, xmm8
824
+ movdqa xmm8, xmm7
825
+ psrld xmm8, 12
826
+ pslld xmm7, 20
827
+ por xmm7, xmm8
828
+ paddd xmm0, xmmword ptr [rsp+0x70]
829
+ paddd xmm1, xmmword ptr [rsp+0x90]
830
+ paddd xmm2, xmmword ptr [rsp+0x30]
831
+ paddd xmm3, xmmword ptr [rsp+0xF0]
832
+ paddd xmm0, xmm4
833
+ paddd xmm1, xmm5
834
+ paddd xmm2, xmm6
835
+ paddd xmm3, xmm7
836
+ pxor xmm12, xmm0
837
+ pxor xmm13, xmm1
838
+ pxor xmm14, xmm2
839
+ pxor xmm15, xmm3
840
+ movdqa xmm8, xmm12
841
+ psrld xmm12, 8
842
+ pslld xmm8, 24
843
+ pxor xmm12, xmm8
844
+ movdqa xmm8, xmm13
845
+ psrld xmm13, 8
846
+ pslld xmm8, 24
847
+ pxor xmm13, xmm8
848
+ movdqa xmm8, xmm14
849
+ psrld xmm14, 8
850
+ pslld xmm8, 24
851
+ pxor xmm14, xmm8
852
+ movdqa xmm8, xmm15
853
+ psrld xmm15, 8
854
+ pslld xmm8, 24
855
+ pxor xmm15, xmm8
856
+ movdqa xmm8, xmmword ptr [rsp+0x100]
857
+ paddd xmm8, xmm12
858
+ paddd xmm9, xmm13
859
+ paddd xmm10, xmm14
860
+ paddd xmm11, xmm15
861
+ pxor xmm4, xmm8
862
+ pxor xmm5, xmm9
863
+ pxor xmm6, xmm10
864
+ pxor xmm7, xmm11
865
+ movdqa xmmword ptr [rsp+0x100], xmm8
866
+ movdqa xmm8, xmm4
867
+ psrld xmm8, 7
868
+ pslld xmm4, 25
869
+ por xmm4, xmm8
870
+ movdqa xmm8, xmm5
871
+ psrld xmm8, 7
872
+ pslld xmm5, 25
873
+ por xmm5, xmm8
874
+ movdqa xmm8, xmm6
875
+ psrld xmm8, 7
876
+ pslld xmm6, 25
877
+ por xmm6, xmm8
878
+ movdqa xmm8, xmm7
879
+ psrld xmm8, 7
880
+ pslld xmm7, 25
881
+ por xmm7, xmm8
882
+ paddd xmm0, xmmword ptr [rsp+0x40]
883
+ paddd xmm1, xmmword ptr [rsp+0xB0]
884
+ paddd xmm2, xmmword ptr [rsp+0x50]
885
+ paddd xmm3, xmmword ptr [rsp+0x10]
886
+ paddd xmm0, xmm5
887
+ paddd xmm1, xmm6
888
+ paddd xmm2, xmm7
889
+ paddd xmm3, xmm4
890
+ pxor xmm15, xmm0
891
+ pxor xmm12, xmm1
892
+ pxor xmm13, xmm2
893
+ pxor xmm14, xmm3
894
+ pshuflw xmm15, xmm15, 0xB1
895
+ pshufhw xmm15, xmm15, 0xB1
896
+ pshuflw xmm12, xmm12, 0xB1
897
+ pshufhw xmm12, xmm12, 0xB1
898
+ pshuflw xmm13, xmm13, 0xB1
899
+ pshufhw xmm13, xmm13, 0xB1
900
+ pshuflw xmm14, xmm14, 0xB1
901
+ pshufhw xmm14, xmm14, 0xB1
902
+ paddd xmm10, xmm15
903
+ paddd xmm11, xmm12
904
+ movdqa xmm8, xmmword ptr [rsp+0x100]
905
+ paddd xmm8, xmm13
906
+ paddd xmm9, xmm14
907
+ pxor xmm5, xmm10
908
+ pxor xmm6, xmm11
909
+ pxor xmm7, xmm8
910
+ pxor xmm4, xmm9
911
+ movdqa xmmword ptr [rsp+0x100], xmm8
912
+ movdqa xmm8, xmm5
913
+ psrld xmm8, 12
914
+ pslld xmm5, 20
915
+ por xmm5, xmm8
916
+ movdqa xmm8, xmm6
917
+ psrld xmm8, 12
918
+ pslld xmm6, 20
919
+ por xmm6, xmm8
920
+ movdqa xmm8, xmm7
921
+ psrld xmm8, 12
922
+ pslld xmm7, 20
923
+ por xmm7, xmm8
924
+ movdqa xmm8, xmm4
925
+ psrld xmm8, 12
926
+ pslld xmm4, 20
927
+ por xmm4, xmm8
928
+ paddd xmm0, xmmword ptr [rsp]
929
+ paddd xmm1, xmmword ptr [rsp+0x20]
930
+ paddd xmm2, xmmword ptr [rsp+0x80]
931
+ paddd xmm3, xmmword ptr [rsp+0x60]
932
+ paddd xmm0, xmm5
933
+ paddd xmm1, xmm6
934
+ paddd xmm2, xmm7
935
+ paddd xmm3, xmm4
936
+ pxor xmm15, xmm0
937
+ pxor xmm12, xmm1
938
+ pxor xmm13, xmm2
939
+ pxor xmm14, xmm3
940
+ movdqa xmm8, xmm15
941
+ psrld xmm15, 8
942
+ pslld xmm8, 24
943
+ pxor xmm15, xmm8
944
+ movdqa xmm8, xmm12
945
+ psrld xmm12, 8
946
+ pslld xmm8, 24
947
+ pxor xmm12, xmm8
948
+ movdqa xmm8, xmm13
949
+ psrld xmm13, 8
950
+ pslld xmm8, 24
951
+ pxor xmm13, xmm8
952
+ movdqa xmm8, xmm14
953
+ psrld xmm14, 8
954
+ pslld xmm8, 24
955
+ pxor xmm14, xmm8
956
+ paddd xmm10, xmm15
957
+ paddd xmm11, xmm12
958
+ movdqa xmm8, xmmword ptr [rsp+0x100]
959
+ paddd xmm8, xmm13
960
+ paddd xmm9, xmm14
961
+ pxor xmm5, xmm10
962
+ pxor xmm6, xmm11
963
+ pxor xmm7, xmm8
964
+ pxor xmm4, xmm9
965
+ movdqa xmmword ptr [rsp+0x100], xmm8
966
+ movdqa xmm8, xmm5
967
+ psrld xmm8, 7
968
+ pslld xmm5, 25
969
+ por xmm5, xmm8
970
+ movdqa xmm8, xmm6
971
+ psrld xmm8, 7
972
+ pslld xmm6, 25
973
+ por xmm6, xmm8
974
+ movdqa xmm8, xmm7
975
+ psrld xmm8, 7
976
+ pslld xmm7, 25
977
+ por xmm7, xmm8
978
+ movdqa xmm8, xmm4
979
+ psrld xmm8, 7
980
+ pslld xmm4, 25
981
+ por xmm4, xmm8
982
+ paddd xmm0, xmmword ptr [rsp+0xC0]
983
+ paddd xmm1, xmmword ptr [rsp+0x90]
984
+ paddd xmm2, xmmword ptr [rsp+0xF0]
985
+ paddd xmm3, xmmword ptr [rsp+0xE0]
986
+ paddd xmm0, xmm4
987
+ paddd xmm1, xmm5
988
+ paddd xmm2, xmm6
989
+ paddd xmm3, xmm7
990
+ pxor xmm12, xmm0
991
+ pxor xmm13, xmm1
992
+ pxor xmm14, xmm2
993
+ pxor xmm15, xmm3
994
+ pshuflw xmm12, xmm12, 0xB1
995
+ pshufhw xmm12, xmm12, 0xB1
996
+ pshuflw xmm13, xmm13, 0xB1
997
+ pshufhw xmm13, xmm13, 0xB1
998
+ pshuflw xmm14, xmm14, 0xB1
999
+ pshufhw xmm14, xmm14, 0xB1
1000
+ pshuflw xmm15, xmm15, 0xB1
1001
+ pshufhw xmm15, xmm15, 0xB1
1002
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1003
+ paddd xmm8, xmm12
1004
+ paddd xmm9, xmm13
1005
+ paddd xmm10, xmm14
1006
+ paddd xmm11, xmm15
1007
+ pxor xmm4, xmm8
1008
+ pxor xmm5, xmm9
1009
+ pxor xmm6, xmm10
1010
+ pxor xmm7, xmm11
1011
+ movdqa xmmword ptr [rsp+0x100], xmm8
1012
+ movdqa xmm8, xmm4
1013
+ psrld xmm8, 12
1014
+ pslld xmm4, 20
1015
+ por xmm4, xmm8
1016
+ movdqa xmm8, xmm5
1017
+ psrld xmm8, 12
1018
+ pslld xmm5, 20
1019
+ por xmm5, xmm8
1020
+ movdqa xmm8, xmm6
1021
+ psrld xmm8, 12
1022
+ pslld xmm6, 20
1023
+ por xmm6, xmm8
1024
+ movdqa xmm8, xmm7
1025
+ psrld xmm8, 12
1026
+ pslld xmm7, 20
1027
+ por xmm7, xmm8
1028
+ paddd xmm0, xmmword ptr [rsp+0xD0]
1029
+ paddd xmm1, xmmword ptr [rsp+0xB0]
1030
+ paddd xmm2, xmmword ptr [rsp+0xA0]
1031
+ paddd xmm3, xmmword ptr [rsp+0x80]
1032
+ paddd xmm0, xmm4
1033
+ paddd xmm1, xmm5
1034
+ paddd xmm2, xmm6
1035
+ paddd xmm3, xmm7
1036
+ pxor xmm12, xmm0
1037
+ pxor xmm13, xmm1
1038
+ pxor xmm14, xmm2
1039
+ pxor xmm15, xmm3
1040
+ movdqa xmm8, xmm12
1041
+ psrld xmm12, 8
1042
+ pslld xmm8, 24
1043
+ pxor xmm12, xmm8
1044
+ movdqa xmm8, xmm13
1045
+ psrld xmm13, 8
1046
+ pslld xmm8, 24
1047
+ pxor xmm13, xmm8
1048
+ movdqa xmm8, xmm14
1049
+ psrld xmm14, 8
1050
+ pslld xmm8, 24
1051
+ pxor xmm14, xmm8
1052
+ movdqa xmm8, xmm15
1053
+ psrld xmm15, 8
1054
+ pslld xmm8, 24
1055
+ pxor xmm15, xmm8
1056
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1057
+ paddd xmm8, xmm12
1058
+ paddd xmm9, xmm13
1059
+ paddd xmm10, xmm14
1060
+ paddd xmm11, xmm15
1061
+ pxor xmm4, xmm8
1062
+ pxor xmm5, xmm9
1063
+ pxor xmm6, xmm10
1064
+ pxor xmm7, xmm11
1065
+ movdqa xmmword ptr [rsp+0x100], xmm8
1066
+ movdqa xmm8, xmm4
1067
+ psrld xmm8, 7
1068
+ pslld xmm4, 25
1069
+ por xmm4, xmm8
1070
+ movdqa xmm8, xmm5
1071
+ psrld xmm8, 7
1072
+ pslld xmm5, 25
1073
+ por xmm5, xmm8
1074
+ movdqa xmm8, xmm6
1075
+ psrld xmm8, 7
1076
+ pslld xmm6, 25
1077
+ por xmm6, xmm8
1078
+ movdqa xmm8, xmm7
1079
+ psrld xmm8, 7
1080
+ pslld xmm7, 25
1081
+ por xmm7, xmm8
1082
+ paddd xmm0, xmmword ptr [rsp+0x70]
1083
+ paddd xmm1, xmmword ptr [rsp+0x50]
1084
+ paddd xmm2, xmmword ptr [rsp]
1085
+ paddd xmm3, xmmword ptr [rsp+0x60]
1086
+ paddd xmm0, xmm5
1087
+ paddd xmm1, xmm6
1088
+ paddd xmm2, xmm7
1089
+ paddd xmm3, xmm4
1090
+ pxor xmm15, xmm0
1091
+ pxor xmm12, xmm1
1092
+ pxor xmm13, xmm2
1093
+ pxor xmm14, xmm3
1094
+ pshuflw xmm15, xmm15, 0xB1
1095
+ pshufhw xmm15, xmm15, 0xB1
1096
+ pshuflw xmm12, xmm12, 0xB1
1097
+ pshufhw xmm12, xmm12, 0xB1
1098
+ pshuflw xmm13, xmm13, 0xB1
1099
+ pshufhw xmm13, xmm13, 0xB1
1100
+ pshuflw xmm14, xmm14, 0xB1
1101
+ pshufhw xmm14, xmm14, 0xB1
1102
+ paddd xmm10, xmm15
1103
+ paddd xmm11, xmm12
1104
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1105
+ paddd xmm8, xmm13
1106
+ paddd xmm9, xmm14
1107
+ pxor xmm5, xmm10
1108
+ pxor xmm6, xmm11
1109
+ pxor xmm7, xmm8
1110
+ pxor xmm4, xmm9
1111
+ movdqa xmmword ptr [rsp+0x100], xmm8
1112
+ movdqa xmm8, xmm5
1113
+ psrld xmm8, 12
1114
+ pslld xmm5, 20
1115
+ por xmm5, xmm8
1116
+ movdqa xmm8, xmm6
1117
+ psrld xmm8, 12
1118
+ pslld xmm6, 20
1119
+ por xmm6, xmm8
1120
+ movdqa xmm8, xmm7
1121
+ psrld xmm8, 12
1122
+ pslld xmm7, 20
1123
+ por xmm7, xmm8
1124
+ movdqa xmm8, xmm4
1125
+ psrld xmm8, 12
1126
+ pslld xmm4, 20
1127
+ por xmm4, xmm8
1128
+ paddd xmm0, xmmword ptr [rsp+0x20]
1129
+ paddd xmm1, xmmword ptr [rsp+0x30]
1130
+ paddd xmm2, xmmword ptr [rsp+0x10]
1131
+ paddd xmm3, xmmword ptr [rsp+0x40]
1132
+ paddd xmm0, xmm5
1133
+ paddd xmm1, xmm6
1134
+ paddd xmm2, xmm7
1135
+ paddd xmm3, xmm4
1136
+ pxor xmm15, xmm0
1137
+ pxor xmm12, xmm1
1138
+ pxor xmm13, xmm2
1139
+ pxor xmm14, xmm3
1140
+ movdqa xmm8, xmm15
1141
+ psrld xmm15, 8
1142
+ pslld xmm8, 24
1143
+ pxor xmm15, xmm8
1144
+ movdqa xmm8, xmm12
1145
+ psrld xmm12, 8
1146
+ pslld xmm8, 24
1147
+ pxor xmm12, xmm8
1148
+ movdqa xmm8, xmm13
1149
+ psrld xmm13, 8
1150
+ pslld xmm8, 24
1151
+ pxor xmm13, xmm8
1152
+ movdqa xmm8, xmm14
1153
+ psrld xmm14, 8
1154
+ pslld xmm8, 24
1155
+ pxor xmm14, xmm8
1156
+ paddd xmm10, xmm15
1157
+ paddd xmm11, xmm12
1158
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1159
+ paddd xmm8, xmm13
1160
+ paddd xmm9, xmm14
1161
+ pxor xmm5, xmm10
1162
+ pxor xmm6, xmm11
1163
+ pxor xmm7, xmm8
1164
+ pxor xmm4, xmm9
1165
+ movdqa xmmword ptr [rsp+0x100], xmm8
1166
+ movdqa xmm8, xmm5
1167
+ psrld xmm8, 7
1168
+ pslld xmm5, 25
1169
+ por xmm5, xmm8
1170
+ movdqa xmm8, xmm6
1171
+ psrld xmm8, 7
1172
+ pslld xmm6, 25
1173
+ por xmm6, xmm8
1174
+ movdqa xmm8, xmm7
1175
+ psrld xmm8, 7
1176
+ pslld xmm7, 25
1177
+ por xmm7, xmm8
1178
+ movdqa xmm8, xmm4
1179
+ psrld xmm8, 7
1180
+ pslld xmm4, 25
1181
+ por xmm4, xmm8
1182
+ paddd xmm0, xmmword ptr [rsp+0x90]
1183
+ paddd xmm1, xmmword ptr [rsp+0xB0]
1184
+ paddd xmm2, xmmword ptr [rsp+0x80]
1185
+ paddd xmm3, xmmword ptr [rsp+0xF0]
1186
+ paddd xmm0, xmm4
1187
+ paddd xmm1, xmm5
1188
+ paddd xmm2, xmm6
1189
+ paddd xmm3, xmm7
1190
+ pxor xmm12, xmm0
1191
+ pxor xmm13, xmm1
1192
+ pxor xmm14, xmm2
1193
+ pxor xmm15, xmm3
1194
+ pshuflw xmm12, xmm12, 0xB1
1195
+ pshufhw xmm12, xmm12, 0xB1
1196
+ pshuflw xmm13, xmm13, 0xB1
1197
+ pshufhw xmm13, xmm13, 0xB1
1198
+ pshuflw xmm14, xmm14, 0xB1
1199
+ pshufhw xmm14, xmm14, 0xB1
1200
+ pshuflw xmm15, xmm15, 0xB1
1201
+ pshufhw xmm15, xmm15, 0xB1
1202
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1203
+ paddd xmm8, xmm12
1204
+ paddd xmm9, xmm13
1205
+ paddd xmm10, xmm14
1206
+ paddd xmm11, xmm15
1207
+ pxor xmm4, xmm8
1208
+ pxor xmm5, xmm9
1209
+ pxor xmm6, xmm10
1210
+ pxor xmm7, xmm11
1211
+ movdqa xmmword ptr [rsp+0x100], xmm8
1212
+ movdqa xmm8, xmm4
1213
+ psrld xmm8, 12
1214
+ pslld xmm4, 20
1215
+ por xmm4, xmm8
1216
+ movdqa xmm8, xmm5
1217
+ psrld xmm8, 12
1218
+ pslld xmm5, 20
1219
+ por xmm5, xmm8
1220
+ movdqa xmm8, xmm6
1221
+ psrld xmm8, 12
1222
+ pslld xmm6, 20
1223
+ por xmm6, xmm8
1224
+ movdqa xmm8, xmm7
1225
+ psrld xmm8, 12
1226
+ pslld xmm7, 20
1227
+ por xmm7, xmm8
1228
+ paddd xmm0, xmmword ptr [rsp+0xE0]
1229
+ paddd xmm1, xmmword ptr [rsp+0x50]
1230
+ paddd xmm2, xmmword ptr [rsp+0xC0]
1231
+ paddd xmm3, xmmword ptr [rsp+0x10]
1232
+ paddd xmm0, xmm4
1233
+ paddd xmm1, xmm5
1234
+ paddd xmm2, xmm6
1235
+ paddd xmm3, xmm7
1236
+ pxor xmm12, xmm0
1237
+ pxor xmm13, xmm1
1238
+ pxor xmm14, xmm2
1239
+ pxor xmm15, xmm3
1240
+ movdqa xmm8, xmm12
1241
+ psrld xmm12, 8
1242
+ pslld xmm8, 24
1243
+ pxor xmm12, xmm8
1244
+ movdqa xmm8, xmm13
1245
+ psrld xmm13, 8
1246
+ pslld xmm8, 24
1247
+ pxor xmm13, xmm8
1248
+ movdqa xmm8, xmm14
1249
+ psrld xmm14, 8
1250
+ pslld xmm8, 24
1251
+ pxor xmm14, xmm8
1252
+ movdqa xmm8, xmm15
1253
+ psrld xmm15, 8
1254
+ pslld xmm8, 24
1255
+ pxor xmm15, xmm8
1256
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1257
+ paddd xmm8, xmm12
1258
+ paddd xmm9, xmm13
1259
+ paddd xmm10, xmm14
1260
+ paddd xmm11, xmm15
1261
+ pxor xmm4, xmm8
1262
+ pxor xmm5, xmm9
1263
+ pxor xmm6, xmm10
1264
+ pxor xmm7, xmm11
1265
+ movdqa xmmword ptr [rsp+0x100], xmm8
1266
+ movdqa xmm8, xmm4
1267
+ psrld xmm8, 7
1268
+ pslld xmm4, 25
1269
+ por xmm4, xmm8
1270
+ movdqa xmm8, xmm5
1271
+ psrld xmm8, 7
1272
+ pslld xmm5, 25
1273
+ por xmm5, xmm8
1274
+ movdqa xmm8, xmm6
1275
+ psrld xmm8, 7
1276
+ pslld xmm6, 25
1277
+ por xmm6, xmm8
1278
+ movdqa xmm8, xmm7
1279
+ psrld xmm8, 7
1280
+ pslld xmm7, 25
1281
+ por xmm7, xmm8
1282
+ paddd xmm0, xmmword ptr [rsp+0xD0]
1283
+ paddd xmm1, xmmword ptr [rsp]
1284
+ paddd xmm2, xmmword ptr [rsp+0x20]
1285
+ paddd xmm3, xmmword ptr [rsp+0x40]
1286
+ paddd xmm0, xmm5
1287
+ paddd xmm1, xmm6
1288
+ paddd xmm2, xmm7
1289
+ paddd xmm3, xmm4
1290
+ pxor xmm15, xmm0
1291
+ pxor xmm12, xmm1
1292
+ pxor xmm13, xmm2
1293
+ pxor xmm14, xmm3
1294
+ pshuflw xmm15, xmm15, 0xB1
1295
+ pshufhw xmm15, xmm15, 0xB1
1296
+ pshuflw xmm12, xmm12, 0xB1
1297
+ pshufhw xmm12, xmm12, 0xB1
1298
+ pshuflw xmm13, xmm13, 0xB1
1299
+ pshufhw xmm13, xmm13, 0xB1
1300
+ pshuflw xmm14, xmm14, 0xB1
1301
+ pshufhw xmm14, xmm14, 0xB1
1302
+ paddd xmm10, xmm15
1303
+ paddd xmm11, xmm12
1304
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1305
+ paddd xmm8, xmm13
1306
+ paddd xmm9, xmm14
1307
+ pxor xmm5, xmm10
1308
+ pxor xmm6, xmm11
1309
+ pxor xmm7, xmm8
1310
+ pxor xmm4, xmm9
1311
+ movdqa xmmword ptr [rsp+0x100], xmm8
1312
+ movdqa xmm8, xmm5
1313
+ psrld xmm8, 12
1314
+ pslld xmm5, 20
1315
+ por xmm5, xmm8
1316
+ movdqa xmm8, xmm6
1317
+ psrld xmm8, 12
1318
+ pslld xmm6, 20
1319
+ por xmm6, xmm8
1320
+ movdqa xmm8, xmm7
1321
+ psrld xmm8, 12
1322
+ pslld xmm7, 20
1323
+ por xmm7, xmm8
1324
+ movdqa xmm8, xmm4
1325
+ psrld xmm8, 12
1326
+ pslld xmm4, 20
1327
+ por xmm4, xmm8
1328
+ paddd xmm0, xmmword ptr [rsp+0x30]
1329
+ paddd xmm1, xmmword ptr [rsp+0xA0]
1330
+ paddd xmm2, xmmword ptr [rsp+0x60]
1331
+ paddd xmm3, xmmword ptr [rsp+0x70]
1332
+ paddd xmm0, xmm5
1333
+ paddd xmm1, xmm6
1334
+ paddd xmm2, xmm7
1335
+ paddd xmm3, xmm4
1336
+ pxor xmm15, xmm0
1337
+ pxor xmm12, xmm1
1338
+ pxor xmm13, xmm2
1339
+ pxor xmm14, xmm3
1340
+ movdqa xmm8, xmm15
1341
+ psrld xmm15, 8
1342
+ pslld xmm8, 24
1343
+ pxor xmm15, xmm8
1344
+ movdqa xmm8, xmm12
1345
+ psrld xmm12, 8
1346
+ pslld xmm8, 24
1347
+ pxor xmm12, xmm8
1348
+ movdqa xmm8, xmm13
1349
+ psrld xmm13, 8
1350
+ pslld xmm8, 24
1351
+ pxor xmm13, xmm8
1352
+ movdqa xmm8, xmm14
1353
+ psrld xmm14, 8
1354
+ pslld xmm8, 24
1355
+ pxor xmm14, xmm8
1356
+ paddd xmm10, xmm15
1357
+ paddd xmm11, xmm12
1358
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1359
+ paddd xmm8, xmm13
1360
+ paddd xmm9, xmm14
1361
+ pxor xmm5, xmm10
1362
+ pxor xmm6, xmm11
1363
+ pxor xmm7, xmm8
1364
+ pxor xmm4, xmm9
1365
+ movdqa xmmword ptr [rsp+0x100], xmm8
1366
+ movdqa xmm8, xmm5
1367
+ psrld xmm8, 7
1368
+ pslld xmm5, 25
1369
+ por xmm5, xmm8
1370
+ movdqa xmm8, xmm6
1371
+ psrld xmm8, 7
1372
+ pslld xmm6, 25
1373
+ por xmm6, xmm8
1374
+ movdqa xmm8, xmm7
1375
+ psrld xmm8, 7
1376
+ pslld xmm7, 25
1377
+ por xmm7, xmm8
1378
+ movdqa xmm8, xmm4
1379
+ psrld xmm8, 7
1380
+ pslld xmm4, 25
1381
+ por xmm4, xmm8
1382
+ paddd xmm0, xmmword ptr [rsp+0xB0]
1383
+ paddd xmm1, xmmword ptr [rsp+0x50]
1384
+ paddd xmm2, xmmword ptr [rsp+0x10]
1385
+ paddd xmm3, xmmword ptr [rsp+0x80]
1386
+ paddd xmm0, xmm4
1387
+ paddd xmm1, xmm5
1388
+ paddd xmm2, xmm6
1389
+ paddd xmm3, xmm7
1390
+ pxor xmm12, xmm0
1391
+ pxor xmm13, xmm1
1392
+ pxor xmm14, xmm2
1393
+ pxor xmm15, xmm3
1394
+ pshuflw xmm12, xmm12, 0xB1
1395
+ pshufhw xmm12, xmm12, 0xB1
1396
+ pshuflw xmm13, xmm13, 0xB1
1397
+ pshufhw xmm13, xmm13, 0xB1
1398
+ pshuflw xmm14, xmm14, 0xB1
1399
+ pshufhw xmm14, xmm14, 0xB1
1400
+ pshuflw xmm15, xmm15, 0xB1
1401
+ pshufhw xmm15, xmm15, 0xB1
1402
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1403
+ paddd xmm8, xmm12
1404
+ paddd xmm9, xmm13
1405
+ paddd xmm10, xmm14
1406
+ paddd xmm11, xmm15
1407
+ pxor xmm4, xmm8
1408
+ pxor xmm5, xmm9
1409
+ pxor xmm6, xmm10
1410
+ pxor xmm7, xmm11
1411
+ movdqa xmmword ptr [rsp+0x100], xmm8
1412
+ movdqa xmm8, xmm4
1413
+ psrld xmm8, 12
1414
+ pslld xmm4, 20
1415
+ por xmm4, xmm8
1416
+ movdqa xmm8, xmm5
1417
+ psrld xmm8, 12
1418
+ pslld xmm5, 20
1419
+ por xmm5, xmm8
1420
+ movdqa xmm8, xmm6
1421
+ psrld xmm8, 12
1422
+ pslld xmm6, 20
1423
+ por xmm6, xmm8
1424
+ movdqa xmm8, xmm7
1425
+ psrld xmm8, 12
1426
+ pslld xmm7, 20
1427
+ por xmm7, xmm8
1428
+ paddd xmm0, xmmword ptr [rsp+0xF0]
1429
+ paddd xmm1, xmmword ptr [rsp]
1430
+ paddd xmm2, xmmword ptr [rsp+0x90]
1431
+ paddd xmm3, xmmword ptr [rsp+0x60]
1432
+ paddd xmm0, xmm4
1433
+ paddd xmm1, xmm5
1434
+ paddd xmm2, xmm6
1435
+ paddd xmm3, xmm7
1436
+ pxor xmm12, xmm0
1437
+ pxor xmm13, xmm1
1438
+ pxor xmm14, xmm2
1439
+ pxor xmm15, xmm3
1440
+ movdqa xmm8, xmm12
1441
+ psrld xmm12, 8
1442
+ pslld xmm8, 24
1443
+ pxor xmm12, xmm8
1444
+ movdqa xmm8, xmm13
1445
+ psrld xmm13, 8
1446
+ pslld xmm8, 24
1447
+ pxor xmm13, xmm8
1448
+ movdqa xmm8, xmm14
1449
+ psrld xmm14, 8
1450
+ pslld xmm8, 24
1451
+ pxor xmm14, xmm8
1452
+ movdqa xmm8, xmm15
1453
+ psrld xmm15, 8
1454
+ pslld xmm8, 24
1455
+ pxor xmm15, xmm8
1456
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1457
+ paddd xmm8, xmm12
1458
+ paddd xmm9, xmm13
1459
+ paddd xmm10, xmm14
1460
+ paddd xmm11, xmm15
1461
+ pxor xmm4, xmm8
1462
+ pxor xmm5, xmm9
1463
+ pxor xmm6, xmm10
1464
+ pxor xmm7, xmm11
1465
+ movdqa xmmword ptr [rsp+0x100], xmm8
1466
+ movdqa xmm8, xmm4
1467
+ psrld xmm8, 7
1468
+ pslld xmm4, 25
1469
+ por xmm4, xmm8
1470
+ movdqa xmm8, xmm5
1471
+ psrld xmm8, 7
1472
+ pslld xmm5, 25
1473
+ por xmm5, xmm8
1474
+ movdqa xmm8, xmm6
1475
+ psrld xmm8, 7
1476
+ pslld xmm6, 25
1477
+ por xmm6, xmm8
1478
+ movdqa xmm8, xmm7
1479
+ psrld xmm8, 7
1480
+ pslld xmm7, 25
1481
+ por xmm7, xmm8
1482
+ paddd xmm0, xmmword ptr [rsp+0xE0]
1483
+ paddd xmm1, xmmword ptr [rsp+0x20]
1484
+ paddd xmm2, xmmword ptr [rsp+0x30]
1485
+ paddd xmm3, xmmword ptr [rsp+0x70]
1486
+ paddd xmm0, xmm5
1487
+ paddd xmm1, xmm6
1488
+ paddd xmm2, xmm7
1489
+ paddd xmm3, xmm4
1490
+ pxor xmm15, xmm0
1491
+ pxor xmm12, xmm1
1492
+ pxor xmm13, xmm2
1493
+ pxor xmm14, xmm3
1494
+ pshuflw xmm15, xmm15, 0xB1
1495
+ pshufhw xmm15, xmm15, 0xB1
1496
+ pshuflw xmm12, xmm12, 0xB1
1497
+ pshufhw xmm12, xmm12, 0xB1
1498
+ pshuflw xmm13, xmm13, 0xB1
1499
+ pshufhw xmm13, xmm13, 0xB1
1500
+ pshuflw xmm14, xmm14, 0xB1
1501
+ pshufhw xmm14, xmm14, 0xB1
1502
+ paddd xmm10, xmm15
1503
+ paddd xmm11, xmm12
1504
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1505
+ paddd xmm8, xmm13
1506
+ paddd xmm9, xmm14
1507
+ pxor xmm5, xmm10
1508
+ pxor xmm6, xmm11
1509
+ pxor xmm7, xmm8
1510
+ pxor xmm4, xmm9
1511
+ movdqa xmmword ptr [rsp+0x100], xmm8
1512
+ movdqa xmm8, xmm5
1513
+ psrld xmm8, 12
1514
+ pslld xmm5, 20
1515
+ por xmm5, xmm8
1516
+ movdqa xmm8, xmm6
1517
+ psrld xmm8, 12
1518
+ pslld xmm6, 20
1519
+ por xmm6, xmm8
1520
+ movdqa xmm8, xmm7
1521
+ psrld xmm8, 12
1522
+ pslld xmm7, 20
1523
+ por xmm7, xmm8
1524
+ movdqa xmm8, xmm4
1525
+ psrld xmm8, 12
1526
+ pslld xmm4, 20
1527
+ por xmm4, xmm8
1528
+ paddd xmm0, xmmword ptr [rsp+0xA0]
1529
+ paddd xmm1, xmmword ptr [rsp+0xC0]
1530
+ paddd xmm2, xmmword ptr [rsp+0x40]
1531
+ paddd xmm3, xmmword ptr [rsp+0xD0]
1532
+ paddd xmm0, xmm5
1533
+ paddd xmm1, xmm6
1534
+ paddd xmm2, xmm7
1535
+ paddd xmm3, xmm4
1536
+ pxor xmm15, xmm0
1537
+ pxor xmm12, xmm1
1538
+ pxor xmm13, xmm2
1539
+ pxor xmm14, xmm3
1540
+ movdqa xmm8, xmm15
1541
+ psrld xmm15, 8
1542
+ pslld xmm8, 24
1543
+ pxor xmm15, xmm8
1544
+ movdqa xmm8, xmm12
1545
+ psrld xmm12, 8
1546
+ pslld xmm8, 24
1547
+ pxor xmm12, xmm8
1548
+ movdqa xmm8, xmm13
1549
+ psrld xmm13, 8
1550
+ pslld xmm8, 24
1551
+ pxor xmm13, xmm8
1552
+ movdqa xmm8, xmm14
1553
+ psrld xmm14, 8
1554
+ pslld xmm8, 24
1555
+ pxor xmm14, xmm8
1556
+ paddd xmm10, xmm15
1557
+ paddd xmm11, xmm12
1558
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1559
+ paddd xmm8, xmm13
1560
+ paddd xmm9, xmm14
1561
+ pxor xmm5, xmm10
1562
+ pxor xmm6, xmm11
1563
+ pxor xmm7, xmm8
1564
+ pxor xmm4, xmm9
1565
+ pxor xmm0, xmm8
1566
+ pxor xmm1, xmm9
1567
+ pxor xmm2, xmm10
1568
+ pxor xmm3, xmm11
1569
+ movdqa xmm8, xmm5
1570
+ psrld xmm8, 7
1571
+ pslld xmm5, 25
1572
+ por xmm5, xmm8
1573
+ movdqa xmm8, xmm6
1574
+ psrld xmm8, 7
1575
+ pslld xmm6, 25
1576
+ por xmm6, xmm8
1577
+ movdqa xmm8, xmm7
1578
+ psrld xmm8, 7
1579
+ pslld xmm7, 25
1580
+ por xmm7, xmm8
1581
+ movdqa xmm8, xmm4
1582
+ psrld xmm8, 7
1583
+ pslld xmm4, 25
1584
+ por xmm4, xmm8
1585
+ pxor xmm4, xmm12
1586
+ pxor xmm5, xmm13
1587
+ pxor xmm6, xmm14
1588
+ pxor xmm7, xmm15
1589
+ mov eax, r13d
1590
+ jne 9b
1591
+ movdqa xmm9, xmm0
1592
+ punpckldq xmm0, xmm1
1593
+ punpckhdq xmm9, xmm1
1594
+ movdqa xmm11, xmm2
1595
+ punpckldq xmm2, xmm3
1596
+ punpckhdq xmm11, xmm3
1597
+ movdqa xmm1, xmm0
1598
+ punpcklqdq xmm0, xmm2
1599
+ punpckhqdq xmm1, xmm2
1600
+ movdqa xmm3, xmm9
1601
+ punpcklqdq xmm9, xmm11
1602
+ punpckhqdq xmm3, xmm11
1603
+ movdqu xmmword ptr [rbx], xmm0
1604
+ movdqu xmmword ptr [rbx+0x20], xmm1
1605
+ movdqu xmmword ptr [rbx+0x40], xmm9
1606
+ movdqu xmmword ptr [rbx+0x60], xmm3
1607
+ movdqa xmm9, xmm4
1608
+ punpckldq xmm4, xmm5
1609
+ punpckhdq xmm9, xmm5
1610
+ movdqa xmm11, xmm6
1611
+ punpckldq xmm6, xmm7
1612
+ punpckhdq xmm11, xmm7
1613
+ movdqa xmm5, xmm4
1614
+ punpcklqdq xmm4, xmm6
1615
+ punpckhqdq xmm5, xmm6
1616
+ movdqa xmm7, xmm9
1617
+ punpcklqdq xmm9, xmm11
1618
+ punpckhqdq xmm7, xmm11
1619
+ movdqu xmmword ptr [rbx+0x10], xmm4
1620
+ movdqu xmmword ptr [rbx+0x30], xmm5
1621
+ movdqu xmmword ptr [rbx+0x50], xmm9
1622
+ movdqu xmmword ptr [rbx+0x70], xmm7
1623
+ movdqa xmm1, xmmword ptr [rsp+0x110]
1624
+ movdqa xmm0, xmm1
1625
+ paddd xmm1, xmmword ptr [rsp+0x150]
1626
+ movdqa xmmword ptr [rsp+0x110], xmm1
1627
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1628
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1629
+ pcmpgtd xmm0, xmm1
1630
+ movdqa xmm1, xmmword ptr [rsp+0x120]
1631
+ psubd xmm1, xmm0
1632
+ movdqa xmmword ptr [rsp+0x120], xmm1
1633
+ add rbx, 128
1634
+ add rdi, 32
1635
+ sub rsi, 4
1636
+ cmp rsi, 4
1637
+ jnc 2b
1638
+ test rsi, rsi
1639
+ jne 3f
1640
+ 4:
1641
+ movdqa xmm6, xmmword ptr [rsp+0x170]
1642
+ movdqa xmm7, xmmword ptr [rsp+0x180]
1643
+ movdqa xmm8, xmmword ptr [rsp+0x190]
1644
+ movdqa xmm9, xmmword ptr [rsp+0x1A0]
1645
+ movdqa xmm10, xmmword ptr [rsp+0x1B0]
1646
+ movdqa xmm11, xmmword ptr [rsp+0x1C0]
1647
+ movdqa xmm12, xmmword ptr [rsp+0x1D0]
1648
+ movdqa xmm13, xmmword ptr [rsp+0x1E0]
1649
+ movdqa xmm14, xmmword ptr [rsp+0x1F0]
1650
+ movdqa xmm15, xmmword ptr [rsp+0x200]
1651
+ mov rsp, rbp
1652
+ pop rbp
1653
+ pop rbx
1654
+ pop rdi
1655
+ pop rsi
1656
+ pop r12
1657
+ pop r13
1658
+ pop r14
1659
+ pop r15
1660
+ ret
1661
+ .p2align 5
1662
+ 3:
1663
+ test esi, 0x2
1664
+ je 3f
1665
+ movups xmm0, xmmword ptr [rcx]
1666
+ movups xmm1, xmmword ptr [rcx+0x10]
1667
+ movaps xmm8, xmm0
1668
+ movaps xmm9, xmm1
1669
+ movd xmm13, dword ptr [rsp+0x110]
1670
+ movd xmm14, dword ptr [rsp+0x120]
1671
+ punpckldq xmm13, xmm14
1672
+ movaps xmmword ptr [rsp], xmm13
1673
+ movd xmm14, dword ptr [rsp+0x114]
1674
+ movd xmm13, dword ptr [rsp+0x124]
1675
+ punpckldq xmm14, xmm13
1676
+ movaps xmmword ptr [rsp+0x10], xmm14
1677
+ mov r8, qword ptr [rdi]
1678
+ mov r9, qword ptr [rdi+0x8]
1679
+ movzx eax, byte ptr [rbp+0x80]
1680
+ or eax, r13d
1681
+ xor edx, edx
1682
+ 2:
1683
+ mov r14d, eax
1684
+ or eax, r12d
1685
+ add rdx, 64
1686
+ cmp rdx, r15
1687
+ cmovne eax, r14d
1688
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1689
+ movaps xmm10, xmm2
1690
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
1691
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
1692
+ movaps xmm3, xmm4
1693
+ shufps xmm4, xmm5, 136
1694
+ shufps xmm3, xmm5, 221
1695
+ movaps xmm5, xmm3
1696
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
1697
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
1698
+ movaps xmm3, xmm6
1699
+ shufps xmm6, xmm7, 136
1700
+ pshufd xmm6, xmm6, 0x93
1701
+ shufps xmm3, xmm7, 221
1702
+ pshufd xmm7, xmm3, 0x93
1703
+ movups xmm12, xmmword ptr [r9+rdx-0x40]
1704
+ movups xmm13, xmmword ptr [r9+rdx-0x30]
1705
+ movaps xmm11, xmm12
1706
+ shufps xmm12, xmm13, 136
1707
+ shufps xmm11, xmm13, 221
1708
+ movaps xmm13, xmm11
1709
+ movups xmm14, xmmword ptr [r9+rdx-0x20]
1710
+ movups xmm15, xmmword ptr [r9+rdx-0x10]
1711
+ movaps xmm11, xmm14
1712
+ shufps xmm14, xmm15, 136
1713
+ pshufd xmm14, xmm14, 0x93
1714
+ shufps xmm11, xmm15, 221
1715
+ pshufd xmm15, xmm11, 0x93
1716
+ shl rax, 0x20
1717
+ or rax, 0x40
1718
+ movd xmm3, rax
1719
+ movdqa xmmword ptr [rsp+0x20], xmm3
1720
+ movaps xmm3, xmmword ptr [rsp]
1721
+ movaps xmm11, xmmword ptr [rsp+0x10]
1722
+ punpcklqdq xmm3, xmmword ptr [rsp+0x20]
1723
+ punpcklqdq xmm11, xmmword ptr [rsp+0x20]
1724
+ mov al, 7
1725
+ 9:
1726
+ paddd xmm0, xmm4
1727
+ paddd xmm8, xmm12
1728
+ movaps xmmword ptr [rsp+0x20], xmm4
1729
+ movaps xmmword ptr [rsp+0x30], xmm12
1730
+ paddd xmm0, xmm1
1731
+ paddd xmm8, xmm9
1732
+ pxor xmm3, xmm0
1733
+ pxor xmm11, xmm8
1734
+ pshuflw xmm3, xmm3, 0xB1
1735
+ pshufhw xmm3, xmm3, 0xB1
1736
+ pshuflw xmm11, xmm11, 0xB1
1737
+ pshufhw xmm11, xmm11, 0xB1
1738
+ paddd xmm2, xmm3
1739
+ paddd xmm10, xmm11
1740
+ pxor xmm1, xmm2
1741
+ pxor xmm9, xmm10
1742
+ movdqa xmm4, xmm1
1743
+ pslld xmm1, 20
1744
+ psrld xmm4, 12
1745
+ por xmm1, xmm4
1746
+ movdqa xmm4, xmm9
1747
+ pslld xmm9, 20
1748
+ psrld xmm4, 12
1749
+ por xmm9, xmm4
1750
+ paddd xmm0, xmm5
1751
+ paddd xmm8, xmm13
1752
+ movaps xmmword ptr [rsp+0x40], xmm5
1753
+ movaps xmmword ptr [rsp+0x50], xmm13
1754
+ paddd xmm0, xmm1
1755
+ paddd xmm8, xmm9
1756
+ pxor xmm3, xmm0
1757
+ pxor xmm11, xmm8
1758
+ movdqa xmm13, xmm3
1759
+ psrld xmm3, 8
1760
+ pslld xmm13, 24
1761
+ pxor xmm3, xmm13
1762
+ movdqa xmm13, xmm11
1763
+ psrld xmm11, 8
1764
+ pslld xmm13, 24
1765
+ pxor xmm11, xmm13
1766
+ paddd xmm2, xmm3
1767
+ paddd xmm10, xmm11
1768
+ pxor xmm1, xmm2
1769
+ pxor xmm9, xmm10
1770
+ movdqa xmm4, xmm1
1771
+ pslld xmm1, 25
1772
+ psrld xmm4, 7
1773
+ por xmm1, xmm4
1774
+ movdqa xmm4, xmm9
1775
+ pslld xmm9, 25
1776
+ psrld xmm4, 7
1777
+ por xmm9, xmm4
1778
+ pshufd xmm0, xmm0, 0x93
1779
+ pshufd xmm8, xmm8, 0x93
1780
+ pshufd xmm3, xmm3, 0x4E
1781
+ pshufd xmm11, xmm11, 0x4E
1782
+ pshufd xmm2, xmm2, 0x39
1783
+ pshufd xmm10, xmm10, 0x39
1784
+ paddd xmm0, xmm6
1785
+ paddd xmm8, xmm14
1786
+ paddd xmm0, xmm1
1787
+ paddd xmm8, xmm9
1788
+ pxor xmm3, xmm0
1789
+ pxor xmm11, xmm8
1790
+ pshuflw xmm3, xmm3, 0xB1
1791
+ pshufhw xmm3, xmm3, 0xB1
1792
+ pshuflw xmm11, xmm11, 0xB1
1793
+ pshufhw xmm11, xmm11, 0xB1
1794
+ paddd xmm2, xmm3
1795
+ paddd xmm10, xmm11
1796
+ pxor xmm1, xmm2
1797
+ pxor xmm9, xmm10
1798
+ movdqa xmm4, xmm1
1799
+ pslld xmm1, 20
1800
+ psrld xmm4, 12
1801
+ por xmm1, xmm4
1802
+ movdqa xmm4, xmm9
1803
+ pslld xmm9, 20
1804
+ psrld xmm4, 12
1805
+ por xmm9, xmm4
1806
+ paddd xmm0, xmm7
1807
+ paddd xmm8, xmm15
1808
+ paddd xmm0, xmm1
1809
+ paddd xmm8, xmm9
1810
+ pxor xmm3, xmm0
1811
+ pxor xmm11, xmm8
1812
+ movdqa xmm13, xmm3
1813
+ psrld xmm3, 8
1814
+ pslld xmm13, 24
1815
+ pxor xmm3, xmm13
1816
+ movdqa xmm13, xmm11
1817
+ psrld xmm11, 8
1818
+ pslld xmm13, 24
1819
+ pxor xmm11, xmm13
1820
+ paddd xmm2, xmm3
1821
+ paddd xmm10, xmm11
1822
+ pxor xmm1, xmm2
1823
+ pxor xmm9, xmm10
1824
+ movdqa xmm4, xmm1
1825
+ pslld xmm1, 25
1826
+ psrld xmm4, 7
1827
+ por xmm1, xmm4
1828
+ movdqa xmm4, xmm9
1829
+ pslld xmm9, 25
1830
+ psrld xmm4, 7
1831
+ por xmm9, xmm4
1832
+ pshufd xmm0, xmm0, 0x39
1833
+ pshufd xmm8, xmm8, 0x39
1834
+ pshufd xmm3, xmm3, 0x4E
1835
+ pshufd xmm11, xmm11, 0x4E
1836
+ pshufd xmm2, xmm2, 0x93
1837
+ pshufd xmm10, xmm10, 0x93
1838
+ dec al
1839
+ je 9f
1840
+ movdqa xmm12, xmmword ptr [rsp+0x20]
1841
+ movdqa xmm5, xmmword ptr [rsp+0x40]
1842
+ pshufd xmm13, xmm12, 0x0F
1843
+ shufps xmm12, xmm5, 214
1844
+ pshufd xmm4, xmm12, 0x39
1845
+ movdqa xmm12, xmm6
1846
+ shufps xmm12, xmm7, 250
1847
+ pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
1848
+ pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
1849
+ por xmm13, xmm12
1850
+ movdqa xmmword ptr [rsp+0x20], xmm13
1851
+ movdqa xmm12, xmm7
1852
+ punpcklqdq xmm12, xmm5
1853
+ movdqa xmm13, xmm6
1854
+ pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
1855
+ pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
1856
+ por xmm12, xmm13
1857
+ pshufd xmm12, xmm12, 0x78
1858
+ punpckhdq xmm5, xmm7
1859
+ punpckldq xmm6, xmm5
1860
+ pshufd xmm7, xmm6, 0x1E
1861
+ movdqa xmmword ptr [rsp+0x40], xmm12
1862
+ movdqa xmm5, xmmword ptr [rsp+0x30]
1863
+ movdqa xmm13, xmmword ptr [rsp+0x50]
1864
+ pshufd xmm6, xmm5, 0x0F
1865
+ shufps xmm5, xmm13, 214
1866
+ pshufd xmm12, xmm5, 0x39
1867
+ movdqa xmm5, xmm14
1868
+ shufps xmm5, xmm15, 250
1869
+ pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
1870
+ pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
1871
+ por xmm6, xmm5
1872
+ movdqa xmm5, xmm15
1873
+ punpcklqdq xmm5, xmm13
1874
+ movdqa xmmword ptr [rsp+0x30], xmm2
1875
+ movdqa xmm2, xmm14
1876
+ pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
1877
+ pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
1878
+ por xmm5, xmm2
1879
+ movdqa xmm2, xmmword ptr [rsp+0x30]
1880
+ pshufd xmm5, xmm5, 0x78
1881
+ punpckhdq xmm13, xmm15
1882
+ punpckldq xmm14, xmm13
1883
+ pshufd xmm15, xmm14, 0x1E
1884
+ movdqa xmm13, xmm6
1885
+ movdqa xmm14, xmm5
1886
+ movdqa xmm5, xmmword ptr [rsp+0x20]
1887
+ movdqa xmm6, xmmword ptr [rsp+0x40]
1888
+ jmp 9b
1889
+ 9:
1890
+ pxor xmm0, xmm2
1891
+ pxor xmm1, xmm3
1892
+ pxor xmm8, xmm10
1893
+ pxor xmm9, xmm11
1894
+ mov eax, r13d
1895
+ cmp rdx, r15
1896
+ jne 2b
1897
+ movups xmmword ptr [rbx], xmm0
1898
+ movups xmmword ptr [rbx+0x10], xmm1
1899
+ movups xmmword ptr [rbx+0x20], xmm8
1900
+ movups xmmword ptr [rbx+0x30], xmm9
1901
+ mov eax, dword ptr [rsp+0x130]
1902
+ neg eax
1903
+ mov r10d, dword ptr [rsp+0x110+8*rax]
1904
+ mov r11d, dword ptr [rsp+0x120+8*rax]
1905
+ mov dword ptr [rsp+0x110], r10d
1906
+ mov dword ptr [rsp+0x120], r11d
1907
+ add rdi, 16
1908
+ add rbx, 64
1909
+ sub rsi, 2
1910
+ 3:
1911
+ test esi, 0x1
1912
+ je 4b
1913
+ movups xmm0, xmmword ptr [rcx]
1914
+ movups xmm1, xmmword ptr [rcx+0x10]
1915
+ movd xmm13, dword ptr [rsp+0x110]
1916
+ movd xmm14, dword ptr [rsp+0x120]
1917
+ punpckldq xmm13, xmm14
1918
+ mov r8, qword ptr [rdi]
1919
+ movzx eax, byte ptr [rbp+0x80]
1920
+ or eax, r13d
1921
+ xor edx, edx
1922
+ 2:
1923
+ mov r14d, eax
1924
+ or eax, r12d
1925
+ add rdx, 64
1926
+ cmp rdx, r15
1927
+ cmovne eax, r14d
1928
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1929
+ shl rax, 32
1930
+ or rax, 64
1931
+ movd xmm12, rax
1932
+ movdqa xmm3, xmm13
1933
+ punpcklqdq xmm3, xmm12
1934
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
1935
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
1936
+ movaps xmm8, xmm4
1937
+ shufps xmm4, xmm5, 136
1938
+ shufps xmm8, xmm5, 221
1939
+ movaps xmm5, xmm8
1940
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
1941
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
1942
+ movaps xmm8, xmm6
1943
+ shufps xmm6, xmm7, 136
1944
+ pshufd xmm6, xmm6, 0x93
1945
+ shufps xmm8, xmm7, 221
1946
+ pshufd xmm7, xmm8, 0x93
1947
+ mov al, 7
1948
+ 9:
1949
+ paddd xmm0, xmm4
1950
+ paddd xmm0, xmm1
1951
+ pxor xmm3, xmm0
1952
+ pshuflw xmm3, xmm3, 0xB1
1953
+ pshufhw xmm3, xmm3, 0xB1
1954
+ paddd xmm2, xmm3
1955
+ pxor xmm1, xmm2
1956
+ movdqa xmm11, xmm1
1957
+ pslld xmm1, 20
1958
+ psrld xmm11, 12
1959
+ por xmm1, xmm11
1960
+ paddd xmm0, xmm5
1961
+ paddd xmm0, xmm1
1962
+ pxor xmm3, xmm0
1963
+ movdqa xmm14, xmm3
1964
+ psrld xmm3, 8
1965
+ pslld xmm14, 24
1966
+ pxor xmm3, xmm14
1967
+ paddd xmm2, xmm3
1968
+ pxor xmm1, xmm2
1969
+ movdqa xmm11, xmm1
1970
+ pslld xmm1, 25
1971
+ psrld xmm11, 7
1972
+ por xmm1, xmm11
1973
+ pshufd xmm0, xmm0, 0x93
1974
+ pshufd xmm3, xmm3, 0x4E
1975
+ pshufd xmm2, xmm2, 0x39
1976
+ paddd xmm0, xmm6
1977
+ paddd xmm0, xmm1
1978
+ pxor xmm3, xmm0
1979
+ pshuflw xmm3, xmm3, 0xB1
1980
+ pshufhw xmm3, xmm3, 0xB1
1981
+ paddd xmm2, xmm3
1982
+ pxor xmm1, xmm2
1983
+ movdqa xmm11, xmm1
1984
+ pslld xmm1, 20
1985
+ psrld xmm11, 12
1986
+ por xmm1, xmm11
1987
+ paddd xmm0, xmm7
1988
+ paddd xmm0, xmm1
1989
+ pxor xmm3, xmm0
1990
+ movdqa xmm14, xmm3
1991
+ psrld xmm3, 8
1992
+ pslld xmm14, 24
1993
+ pxor xmm3, xmm14
1994
+ paddd xmm2, xmm3
1995
+ pxor xmm1, xmm2
1996
+ movdqa xmm11, xmm1
1997
+ pslld xmm1, 25
1998
+ psrld xmm11, 7
1999
+ por xmm1, xmm11
2000
+ pshufd xmm0, xmm0, 0x39
2001
+ pshufd xmm3, xmm3, 0x4E
2002
+ pshufd xmm2, xmm2, 0x93
2003
+ dec al
2004
+ jz 9f
2005
+ movdqa xmm8, xmm4
2006
+ shufps xmm8, xmm5, 214
2007
+ pshufd xmm9, xmm4, 0x0F
2008
+ pshufd xmm4, xmm8, 0x39
2009
+ movdqa xmm8, xmm6
2010
+ shufps xmm8, xmm7, 250
2011
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
2012
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
2013
+ por xmm9, xmm8
2014
+ movdqa xmm8, xmm7
2015
+ punpcklqdq xmm8, xmm5
2016
+ movdqa xmm10, xmm6
2017
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
2018
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2019
+ por xmm8, xmm10
2020
+ pshufd xmm8, xmm8, 0x78
2021
+ punpckhdq xmm5, xmm7
2022
+ punpckldq xmm6, xmm5
2023
+ pshufd xmm7, xmm6, 0x1E
2024
+ movdqa xmm5, xmm9
2025
+ movdqa xmm6, xmm8
2026
+ jmp 9b
2027
+ 9:
2028
+ pxor xmm0, xmm2
2029
+ pxor xmm1, xmm3
2030
+ mov eax, r13d
2031
+ cmp rdx, r15
2032
+ jne 2b
2033
+ movups xmmword ptr [rbx], xmm0
2034
+ movups xmmword ptr [rbx+0x10], xmm1
2035
+ jmp 4b
2036
+
2037
+ .p2align 6
2038
/*
 * blake3_compress_in_place_sse2 / _blake3_compress_in_place_sse2
 *
 * Runs one BLAKE3 compression (7 rounds) with SSE2 instructions only and
 * writes the resulting chaining value back over the input chaining value.
 *
 * ABI: Microsoft x64 (arguments in rcx, rdx, r8, r9, then the stack).
 * Presumed C prototype (confirm against the header that declares it):
 *   void blake3_compress_in_place_sse2(uint32_t cv[8],
 *                                      const uint8_t block[64],
 *                                      uint8_t block_len,
 *                                      uint64_t counter,
 *                                      uint8_t flags);
 *
 * In:   rcx            = cv, 32 bytes, read and then overwritten
 *       rdx            = block, 64 bytes, read with unaligned loads
 *       r8b            = block_len (only the low byte is used)
 *       r9             = counter (64 bits)
 *       [rsp+0x28]     = flags, fifth argument (only the low byte is used);
 *                        offset is relative to rsp at function entry
 * Out:  [rcx .. rcx+31] = new chaining value
 * Clobbers: rax, r8, xmm0-xmm5, flags.
 *
 * Stack: 120 bytes = 7 x 16-byte spill slots + 8 bytes of padding, which
 * brings rsp to 16-byte alignment so the movdqa spills are legal.
 *
 * xmm6-xmm15 are callee-saved in this ABI. The function writes xmm6-xmm11
 * and xmm14, so exactly those seven registers are spilled and restored.
 * xmm10 is used as a scratch register in the message permutation and must
 * be part of that set; xmm15 is never written here and needs no slot.
 *
 * Register roles inside the round loop:
 *   xmm0..xmm3   state rows 0..3
 *   xmm4..xmm7   message words, four per register
 *   xmm8..xmm10  scratch for the message permutation
 *   xmm11, xmm14 scratch for the rotations
 *   al           remaining round count
 */
blake3_compress_in_place_sse2:
_blake3_compress_in_place_sse2:
        sub     rsp, 120
        movdqa  xmmword ptr [rsp], xmm6
        movdqa  xmmword ptr [rsp+0x10], xmm7
        movdqa  xmmword ptr [rsp+0x20], xmm8
        movdqa  xmmword ptr [rsp+0x30], xmm9
        movdqa  xmmword ptr [rsp+0x40], xmm10
        movdqa  xmmword ptr [rsp+0x50], xmm11
        movdqa  xmmword ptr [rsp+0x60], xmm14

        /* Build the 4x4 state: rows 0/1 from cv, row 2 from the IV. */
        movups  xmm0, xmmword ptr [rcx]
        movups  xmm1, xmmword ptr [rcx+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        /* Row 3 = { counter low, counter high, block_len, flags }. */
        movzx   eax, byte ptr [rsp+0xA0]        /* flags: 0x28 at entry + 120 + ... = 0xA0 */
        movzx   r8d, r8b                        /* zero-extend block_len */
        shl     rax, 32
        add     r8, rax                         /* r8 = flags << 32 | block_len */
        movq    xmm3, r9
        movq    xmm4, r8
        punpcklqdq xmm3, xmm4

        /*
         * Load the 16 message words and de-interleave them:
         *   xmm4 = even words of bytes 0..31, xmm5 = odd words of bytes 0..31,
         *   xmm6 / xmm7 = even / odd words of bytes 32..63, each rotated up
         *   by one lane (pshufd 0x93) to line up with the diagonal step.
         */
        movups  xmm4, xmmword ptr [rdx]
        movups  xmm5, xmmword ptr [rdx+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rdx+0x20]
        movups  xmm7, xmmword ptr [rdx+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93

        mov     al, 7                           /* round counter */
9:
        /* ---- column step, first half ---- */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1                /* swap 16-bit halves of each dword: */
        pshufhw xmm3, xmm3, 0xB1                /* rotate right by 16 */
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11                     /* rotate right by 12 */
        /* ---- column step, second half ---- */
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14                     /* rotate right by 8 */
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11                     /* rotate right by 7 */
        /* Rotate the lanes of rows 0, 3 and 2 so diagonals become columns. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        /* ---- diagonal step, first half ---- */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1
        pshufhw xmm3, xmm3, 0xB1                /* rotate right by 16 */
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11                     /* rotate right by 12 */
        /* ---- diagonal step, second half ---- */
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14                     /* rotate right by 8 */
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11                     /* rotate right by 7 */
        /* Undo the lane rotation (inverse shuffles of the ones above). */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93

        dec     al
        jz      9f                              /* no permutation after the last round */

        /*
         * Permute the message words for the next round. Each pand/pand/por
         * triple merges two vectors lane-wise under complementary masks;
         * the mask names suggest they stand in for SSE4.1 pblendw.
         */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
        por     xmm9, xmm8
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa  xmm10, xmm6                     /* xmm10 is callee-saved: spilled in the prologue */
        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
        por     xmm8, xmm10
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /* Feed-forward: new cv = (row0 ^ row2, row1 ^ row3). */
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        movups  xmmword ptr [rcx], xmm0
        movups  xmmword ptr [rcx+0x10], xmm1

        /* Restore the callee-saved vector registers and release the frame. */
        movdqa  xmm6, xmmword ptr [rsp]
        movdqa  xmm7, xmmword ptr [rsp+0x10]
        movdqa  xmm8, xmmword ptr [rsp+0x20]
        movdqa  xmm9, xmmword ptr [rsp+0x30]
        movdqa  xmm10, xmmword ptr [rsp+0x40]
        movdqa  xmm11, xmmword ptr [rsp+0x50]
        movdqa  xmm14, xmmword ptr [rsp+0x60]
        add     rsp, 120
        ret
2165
+
2166
+
2167
+ .p2align 6
2168
/*
 * _blake3_compress_xof_sse2 / blake3_compress_xof_sse2
 *
 * Runs one BLAKE3 compression (7 rounds) with SSE2 instructions only and
 * writes the full 64-byte output to a separate buffer. The input chaining
 * value is only read, never modified.
 *
 * ABI: Microsoft x64 (arguments in rcx, rdx, r8, r9, then the stack).
 * Presumed C prototype (confirm against the header that declares it):
 *   void blake3_compress_xof_sse2(const uint32_t cv[8],
 *                                 const uint8_t block[64],
 *                                 uint8_t block_len,
 *                                 uint64_t counter,
 *                                 uint8_t flags,
 *                                 uint8_t out[64]);
 *
 * In:   rcx            = cv, 32 bytes, read only
 *       rdx            = block, 64 bytes, read with unaligned loads
 *       r8b            = block_len (only the low byte is used)
 *       r9             = counter (64 bits)
 *       [rsp+0x28]     = flags, fifth argument (only the low byte is used)
 *       [rsp+0x30]     = out, sixth argument, pointer to 64 writable bytes
 *                        (both offsets are relative to rsp at function entry)
 * Out:  out[ 0..31] = (row0 ^ row2, row1 ^ row3)
 *       out[32..63] = (row2 ^ cv[0..3], row3 ^ cv[4..7])
 * Clobbers: rax, r8, r10, xmm0-xmm5, flags.
 *
 * Stack: 120 bytes = 7 x 16-byte spill slots + 8 bytes of padding, which
 * brings rsp to 16-byte alignment so the movdqa spills are legal.
 *
 * xmm6-xmm15 are callee-saved in this ABI. The function writes xmm6-xmm11
 * and xmm14, so exactly those seven registers are spilled and restored.
 * xmm10 is used as a scratch register in the message permutation and must
 * be part of that set; xmm15 is never written here and needs no slot.
 *
 * Register roles inside the round loop:
 *   xmm0..xmm3   state rows 0..3
 *   xmm4..xmm7   message words, four per register
 *   xmm8..xmm10  scratch for the message permutation
 *   xmm11, xmm14 scratch for the rotations
 *   al           remaining round count
 *   r10          output pointer
 */
_blake3_compress_xof_sse2:
blake3_compress_xof_sse2:
        sub     rsp, 120
        movdqa  xmmword ptr [rsp], xmm6
        movdqa  xmmword ptr [rsp+0x10], xmm7
        movdqa  xmmword ptr [rsp+0x20], xmm8
        movdqa  xmmword ptr [rsp+0x30], xmm9
        movdqa  xmmword ptr [rsp+0x40], xmm10
        movdqa  xmmword ptr [rsp+0x50], xmm11
        movdqa  xmmword ptr [rsp+0x60], xmm14

        /* Build the 4x4 state: rows 0/1 from cv, row 2 from the IV. */
        movups  xmm0, xmmword ptr [rcx]
        movups  xmm1, xmmword ptr [rcx+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]
        /* Row 3 = { counter low, counter high, block_len, flags }. */
        movzx   eax, byte ptr [rsp+0xA0]        /* flags: fifth argument */
        movzx   r8d, r8b                        /* zero-extend block_len */
        mov     r10, qword ptr [rsp+0xA8]       /* out: sixth argument */
        shl     rax, 32
        add     r8, rax                         /* r8 = flags << 32 | block_len */
        movq    xmm3, r9
        movq    xmm4, r8
        punpcklqdq xmm3, xmm4

        /*
         * Load the 16 message words and de-interleave them:
         *   xmm4 = even words of bytes 0..31, xmm5 = odd words of bytes 0..31,
         *   xmm6 / xmm7 = even / odd words of bytes 32..63, each rotated up
         *   by one lane (pshufd 0x93) to line up with the diagonal step.
         */
        movups  xmm4, xmmword ptr [rdx]
        movups  xmm5, xmmword ptr [rdx+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rdx+0x20]
        movups  xmm7, xmmword ptr [rdx+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 0x93
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 0x93

        mov     al, 7                           /* round counter */
9:
        /* ---- column step, first half ---- */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1                /* swap 16-bit halves of each dword: */
        pshufhw xmm3, xmm3, 0xB1                /* rotate right by 16 */
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11                     /* rotate right by 12 */
        /* ---- column step, second half ---- */
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14                     /* rotate right by 8 */
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11                     /* rotate right by 7 */
        /* Rotate the lanes of rows 0, 3 and 2 so diagonals become columns. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39
        /* ---- diagonal step, first half ---- */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1
        pshufhw xmm3, xmm3, 0xB1                /* rotate right by 16 */
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11                     /* rotate right by 12 */
        /* ---- diagonal step, second half ---- */
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14                     /* rotate right by 8 */
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11                     /* rotate right by 7 */
        /* Undo the lane rotation (inverse shuffles of the ones above). */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93

        dec     al
        jz      9f                              /* no permutation after the last round */

        /*
         * Permute the message words for the next round. Each pand/pand/por
         * triple merges two vectors lane-wise under complementary masks;
         * the mask names suggest they stand in for SSE4.1 pblendw.
         */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
        por     xmm9, xmm8
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa  xmm10, xmm6                     /* xmm10 is callee-saved: spilled in the prologue */
        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
        por     xmm8, xmm10
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /*
         * Produce all 64 output bytes. The message registers are dead now,
         * so xmm4/xmm5 are reused to reload the untouched input cv.
         */
        movdqu  xmm4, xmmword ptr [rcx]
        movdqu  xmm5, xmmword ptr [rcx+0x10]
        pxor    xmm0, xmm2                      /* out[ 0..15] = row0 ^ row2 */
        pxor    xmm1, xmm3                      /* out[16..31] = row1 ^ row3 */
        pxor    xmm2, xmm4                      /* out[32..47] = row2 ^ cv[0..3] */
        pxor    xmm3, xmm5                      /* out[48..63] = row3 ^ cv[4..7] */
        movups  xmmword ptr [r10], xmm0
        movups  xmmword ptr [r10+0x10], xmm1
        movups  xmmword ptr [r10+0x20], xmm2
        movups  xmmword ptr [r10+0x30], xmm3

        /* Restore the callee-saved vector registers and release the frame. */
        movdqa  xmm6, xmmword ptr [rsp]
        movdqa  xmm7, xmmword ptr [rsp+0x10]
        movdqa  xmm8, xmmword ptr [rsp+0x20]
        movdqa  xmm9, xmmword ptr [rsp+0x30]
        movdqa  xmm10, xmmword ptr [rsp+0x40]
        movdqa  xmm11, xmmword ptr [rsp+0x50]
        movdqa  xmm14, xmmword ptr [rsp+0x60]
        add     rsp, 120
        ret
2302
+
2303
+
2304
/*
 * Read-only 128-bit constants used by the SSE2 BLAKE3 routines.
 *
 * The table starts on a 64-byte boundary and every entry is exactly
 * 16 bytes, so each label is 16-byte aligned and may be used as an
 * aligned memory operand (movaps / pand / pxor / paddd).
 *
 * ".fill 4, 4, V" emits the 32-bit value V four times, i.e. V broadcast
 * to all four dword lanes.
 */
        .section .rodata
        .p2align 6

/* First four words of the BLAKE3 IV; loaded as state row 2 by the
 * single-block compression routines. */
BLAKE3_IV:
        .long   0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A

/* Per-lane counter offsets for the 4-way path (lanes 0..3) ... */
ADD0:
        .long   0, 1, 2, 3
/* ... and the amount the counters advance after each group of four. */
ADD1:
        .fill   4, 4, 4

/* Each IV word broadcast to all lanes. Not referenced in the part of the
 * file reviewed here; presumably consumed by the 4-way path -- confirm. */
BLAKE3_IV_0:
        .fill   4, 4, 0x6A09E667
BLAKE3_IV_1:
        .fill   4, 4, 0xBB67AE85
BLAKE3_IV_2:
        .fill   4, 4, 0x3C6EF372
BLAKE3_IV_3:
        .fill   4, 4, 0xA54FF53A

/* The value 64 in every lane. Also not referenced in the reviewed part;
 * the name suggests the BLAKE3 block length -- confirm at the use site. */
BLAKE3_BLOCK_LEN:
        .fill   4, 4, 64

/* Sign bit of every lane. XORing both operands with it lets the signed
 * pcmpgtd act as an unsigned compare (used for counter carry detection). */
CMP_MSB_MASK:
        .fill   4, 4, 0x80000000

/* Lane-select masks for pand/pand/por merges. The hex in each name is
 * the 16-bit-word bitmap of lanes kept, in the style of a pblendw
 * immediate: 0x33 keeps dwords 0 and 2, 0xCC keeps dwords 1 and 3,
 * 0x3F keeps dwords 0..2, 0xC0 keeps dword 3. */
PBLENDW_0x33_MASK:
        .long   0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
PBLENDW_0xCC_MASK:
        .long   0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
PBLENDW_0x3F_MASK:
        .long   0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
PBLENDW_0xC0_MASK:
        .long   0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF