digest-blake3 0.22.1 → 1.2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2332 @@
1
+ .intel_syntax noprefix
2
+ .global blake3_hash_many_sse2
3
+ .global _blake3_hash_many_sse2
4
+ .global blake3_compress_in_place_sse2
5
+ .global _blake3_compress_in_place_sse2
6
+ .global blake3_compress_xof_sse2
7
+ .global _blake3_compress_xof_sse2
8
+ .section .text
9
+ .p2align 6
10
+ _blake3_hash_many_sse2:
11
+ blake3_hash_many_sse2:
12
+ push r15
13
+ push r14
14
+ push r13
15
+ push r12
16
+ push rsi
17
+ push rdi
18
+ push rbx
19
+ push rbp
20
+ mov rbp, rsp
21
+ sub rsp, 528
22
+ and rsp, 0xFFFFFFFFFFFFFFC0
23
+ movdqa xmmword ptr [rsp+0x170], xmm6
24
+ movdqa xmmword ptr [rsp+0x180], xmm7
25
+ movdqa xmmword ptr [rsp+0x190], xmm8
26
+ movdqa xmmword ptr [rsp+0x1A0], xmm9
27
+ movdqa xmmword ptr [rsp+0x1B0], xmm10
28
+ movdqa xmmword ptr [rsp+0x1C0], xmm11
29
+ movdqa xmmword ptr [rsp+0x1D0], xmm12
30
+ movdqa xmmword ptr [rsp+0x1E0], xmm13
31
+ movdqa xmmword ptr [rsp+0x1F0], xmm14
32
+ movdqa xmmword ptr [rsp+0x200], xmm15
33
+ mov rdi, rcx
34
+ mov rsi, rdx
35
+ mov rdx, r8
36
+ mov rcx, r9
37
+ mov r8, qword ptr [rbp+0x68]
38
+ movzx r9, byte ptr [rbp+0x70]
39
+ neg r9d
40
+ movd xmm0, r9d
41
+ pshufd xmm0, xmm0, 0x00
42
+ movdqa xmmword ptr [rsp+0x130], xmm0
43
+ movdqa xmm1, xmm0
44
+ pand xmm1, xmmword ptr [ADD0+rip]
45
+ pand xmm0, xmmword ptr [ADD1+rip]
46
+ movdqa xmmword ptr [rsp+0x150], xmm0
47
+ movd xmm0, r8d
48
+ pshufd xmm0, xmm0, 0x00
49
+ paddd xmm0, xmm1
50
+ movdqa xmmword ptr [rsp+0x110], xmm0
51
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
52
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
53
+ pcmpgtd xmm1, xmm0
54
+ shr r8, 32
55
+ movd xmm2, r8d
56
+ pshufd xmm2, xmm2, 0x00
57
+ psubd xmm2, xmm1
58
+ movdqa xmmword ptr [rsp+0x120], xmm2
59
+ mov rbx, qword ptr [rbp+0x90]
60
+ mov r15, rdx
61
+ shl r15, 6
62
+ movzx r13d, byte ptr [rbp+0x78]
63
+ movzx r12d, byte ptr [rbp+0x88]
64
+ cmp rsi, 4
65
+ jc 3f
66
+ 2:
67
+ movdqu xmm3, xmmword ptr [rcx]
68
+ pshufd xmm0, xmm3, 0x00
69
+ pshufd xmm1, xmm3, 0x55
70
+ pshufd xmm2, xmm3, 0xAA
71
+ pshufd xmm3, xmm3, 0xFF
72
+ movdqu xmm7, xmmword ptr [rcx+0x10]
73
+ pshufd xmm4, xmm7, 0x00
74
+ pshufd xmm5, xmm7, 0x55
75
+ pshufd xmm6, xmm7, 0xAA
76
+ pshufd xmm7, xmm7, 0xFF
77
+ mov r8, qword ptr [rdi]
78
+ mov r9, qword ptr [rdi+0x8]
79
+ mov r10, qword ptr [rdi+0x10]
80
+ mov r11, qword ptr [rdi+0x18]
81
+ movzx eax, byte ptr [rbp+0x80]
82
+ or eax, r13d
83
+ xor edx, edx
84
+ 9:
85
+ mov r14d, eax
86
+ or eax, r12d
87
+ add rdx, 64
88
+ cmp rdx, r15
89
+ cmovne eax, r14d
90
+ movdqu xmm8, xmmword ptr [r8+rdx-0x40]
91
+ movdqu xmm9, xmmword ptr [r9+rdx-0x40]
92
+ movdqu xmm10, xmmword ptr [r10+rdx-0x40]
93
+ movdqu xmm11, xmmword ptr [r11+rdx-0x40]
94
+ movdqa xmm12, xmm8
95
+ punpckldq xmm8, xmm9
96
+ punpckhdq xmm12, xmm9
97
+ movdqa xmm14, xmm10
98
+ punpckldq xmm10, xmm11
99
+ punpckhdq xmm14, xmm11
100
+ movdqa xmm9, xmm8
101
+ punpcklqdq xmm8, xmm10
102
+ punpckhqdq xmm9, xmm10
103
+ movdqa xmm13, xmm12
104
+ punpcklqdq xmm12, xmm14
105
+ punpckhqdq xmm13, xmm14
106
+ movdqa xmmword ptr [rsp], xmm8
107
+ movdqa xmmword ptr [rsp+0x10], xmm9
108
+ movdqa xmmword ptr [rsp+0x20], xmm12
109
+ movdqa xmmword ptr [rsp+0x30], xmm13
110
+ movdqu xmm8, xmmword ptr [r8+rdx-0x30]
111
+ movdqu xmm9, xmmword ptr [r9+rdx-0x30]
112
+ movdqu xmm10, xmmword ptr [r10+rdx-0x30]
113
+ movdqu xmm11, xmmword ptr [r11+rdx-0x30]
114
+ movdqa xmm12, xmm8
115
+ punpckldq xmm8, xmm9
116
+ punpckhdq xmm12, xmm9
117
+ movdqa xmm14, xmm10
118
+ punpckldq xmm10, xmm11
119
+ punpckhdq xmm14, xmm11
120
+ movdqa xmm9, xmm8
121
+ punpcklqdq xmm8, xmm10
122
+ punpckhqdq xmm9, xmm10
123
+ movdqa xmm13, xmm12
124
+ punpcklqdq xmm12, xmm14
125
+ punpckhqdq xmm13, xmm14
126
+ movdqa xmmword ptr [rsp+0x40], xmm8
127
+ movdqa xmmword ptr [rsp+0x50], xmm9
128
+ movdqa xmmword ptr [rsp+0x60], xmm12
129
+ movdqa xmmword ptr [rsp+0x70], xmm13
130
+ movdqu xmm8, xmmword ptr [r8+rdx-0x20]
131
+ movdqu xmm9, xmmword ptr [r9+rdx-0x20]
132
+ movdqu xmm10, xmmword ptr [r10+rdx-0x20]
133
+ movdqu xmm11, xmmword ptr [r11+rdx-0x20]
134
+ movdqa xmm12, xmm8
135
+ punpckldq xmm8, xmm9
136
+ punpckhdq xmm12, xmm9
137
+ movdqa xmm14, xmm10
138
+ punpckldq xmm10, xmm11
139
+ punpckhdq xmm14, xmm11
140
+ movdqa xmm9, xmm8
141
+ punpcklqdq xmm8, xmm10
142
+ punpckhqdq xmm9, xmm10
143
+ movdqa xmm13, xmm12
144
+ punpcklqdq xmm12, xmm14
145
+ punpckhqdq xmm13, xmm14
146
+ movdqa xmmword ptr [rsp+0x80], xmm8
147
+ movdqa xmmword ptr [rsp+0x90], xmm9
148
+ movdqa xmmword ptr [rsp+0xA0], xmm12
149
+ movdqa xmmword ptr [rsp+0xB0], xmm13
150
+ movdqu xmm8, xmmword ptr [r8+rdx-0x10]
151
+ movdqu xmm9, xmmword ptr [r9+rdx-0x10]
152
+ movdqu xmm10, xmmword ptr [r10+rdx-0x10]
153
+ movdqu xmm11, xmmword ptr [r11+rdx-0x10]
154
+ movdqa xmm12, xmm8
155
+ punpckldq xmm8, xmm9
156
+ punpckhdq xmm12, xmm9
157
+ movdqa xmm14, xmm10
158
+ punpckldq xmm10, xmm11
159
+ punpckhdq xmm14, xmm11
160
+ movdqa xmm9, xmm8
161
+ punpcklqdq xmm8, xmm10
162
+ punpckhqdq xmm9, xmm10
163
+ movdqa xmm13, xmm12
164
+ punpcklqdq xmm12, xmm14
165
+ punpckhqdq xmm13, xmm14
166
+ movdqa xmmword ptr [rsp+0xC0], xmm8
167
+ movdqa xmmword ptr [rsp+0xD0], xmm9
168
+ movdqa xmmword ptr [rsp+0xE0], xmm12
169
+ movdqa xmmword ptr [rsp+0xF0], xmm13
170
+ movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
171
+ movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
172
+ movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
173
+ movdqa xmm12, xmmword ptr [rsp+0x110]
174
+ movdqa xmm13, xmmword ptr [rsp+0x120]
175
+ movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
176
+ movd xmm15, eax
177
+ pshufd xmm15, xmm15, 0x00
178
+ prefetcht0 [r8+rdx+0x80]
179
+ prefetcht0 [r9+rdx+0x80]
180
+ prefetcht0 [r10+rdx+0x80]
181
+ prefetcht0 [r11+rdx+0x80]
182
+ paddd xmm0, xmmword ptr [rsp]
183
+ paddd xmm1, xmmword ptr [rsp+0x20]
184
+ paddd xmm2, xmmword ptr [rsp+0x40]
185
+ paddd xmm3, xmmword ptr [rsp+0x60]
186
+ paddd xmm0, xmm4
187
+ paddd xmm1, xmm5
188
+ paddd xmm2, xmm6
189
+ paddd xmm3, xmm7
190
+ pxor xmm12, xmm0
191
+ pxor xmm13, xmm1
192
+ pxor xmm14, xmm2
193
+ pxor xmm15, xmm3
194
+ pshuflw xmm12, xmm12, 0xB1
195
+ pshufhw xmm12, xmm12, 0xB1
196
+ pshuflw xmm13, xmm13, 0xB1
197
+ pshufhw xmm13, xmm13, 0xB1
198
+ pshuflw xmm14, xmm14, 0xB1
199
+ pshufhw xmm14, xmm14, 0xB1
200
+ pshuflw xmm15, xmm15, 0xB1
201
+ pshufhw xmm15, xmm15, 0xB1
202
+ movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
203
+ paddd xmm8, xmm12
204
+ paddd xmm9, xmm13
205
+ paddd xmm10, xmm14
206
+ paddd xmm11, xmm15
207
+ pxor xmm4, xmm8
208
+ pxor xmm5, xmm9
209
+ pxor xmm6, xmm10
210
+ pxor xmm7, xmm11
211
+ movdqa xmmword ptr [rsp+0x100], xmm8
212
+ movdqa xmm8, xmm4
213
+ psrld xmm8, 12
214
+ pslld xmm4, 20
215
+ por xmm4, xmm8
216
+ movdqa xmm8, xmm5
217
+ psrld xmm8, 12
218
+ pslld xmm5, 20
219
+ por xmm5, xmm8
220
+ movdqa xmm8, xmm6
221
+ psrld xmm8, 12
222
+ pslld xmm6, 20
223
+ por xmm6, xmm8
224
+ movdqa xmm8, xmm7
225
+ psrld xmm8, 12
226
+ pslld xmm7, 20
227
+ por xmm7, xmm8
228
+ paddd xmm0, xmmword ptr [rsp+0x10]
229
+ paddd xmm1, xmmword ptr [rsp+0x30]
230
+ paddd xmm2, xmmword ptr [rsp+0x50]
231
+ paddd xmm3, xmmword ptr [rsp+0x70]
232
+ paddd xmm0, xmm4
233
+ paddd xmm1, xmm5
234
+ paddd xmm2, xmm6
235
+ paddd xmm3, xmm7
236
+ pxor xmm12, xmm0
237
+ pxor xmm13, xmm1
238
+ pxor xmm14, xmm2
239
+ pxor xmm15, xmm3
240
+ movdqa xmm8, xmm12
241
+ psrld xmm12, 8
242
+ pslld xmm8, 24
243
+ pxor xmm12, xmm8
244
+ movdqa xmm8, xmm13
245
+ psrld xmm13, 8
246
+ pslld xmm8, 24
247
+ pxor xmm13, xmm8
248
+ movdqa xmm8, xmm14
249
+ psrld xmm14, 8
250
+ pslld xmm8, 24
251
+ pxor xmm14, xmm8
252
+ movdqa xmm8, xmm15
253
+ psrld xmm15, 8
254
+ pslld xmm8, 24
255
+ pxor xmm15, xmm8
256
+ movdqa xmm8, xmmword ptr [rsp+0x100]
257
+ paddd xmm8, xmm12
258
+ paddd xmm9, xmm13
259
+ paddd xmm10, xmm14
260
+ paddd xmm11, xmm15
261
+ pxor xmm4, xmm8
262
+ pxor xmm5, xmm9
263
+ pxor xmm6, xmm10
264
+ pxor xmm7, xmm11
265
+ movdqa xmmword ptr [rsp+0x100], xmm8
266
+ movdqa xmm8, xmm4
267
+ psrld xmm8, 7
268
+ pslld xmm4, 25
269
+ por xmm4, xmm8
270
+ movdqa xmm8, xmm5
271
+ psrld xmm8, 7
272
+ pslld xmm5, 25
273
+ por xmm5, xmm8
274
+ movdqa xmm8, xmm6
275
+ psrld xmm8, 7
276
+ pslld xmm6, 25
277
+ por xmm6, xmm8
278
+ movdqa xmm8, xmm7
279
+ psrld xmm8, 7
280
+ pslld xmm7, 25
281
+ por xmm7, xmm8
282
+ paddd xmm0, xmmword ptr [rsp+0x80]
283
+ paddd xmm1, xmmword ptr [rsp+0xA0]
284
+ paddd xmm2, xmmword ptr [rsp+0xC0]
285
+ paddd xmm3, xmmword ptr [rsp+0xE0]
286
+ paddd xmm0, xmm5
287
+ paddd xmm1, xmm6
288
+ paddd xmm2, xmm7
289
+ paddd xmm3, xmm4
290
+ pxor xmm15, xmm0
291
+ pxor xmm12, xmm1
292
+ pxor xmm13, xmm2
293
+ pxor xmm14, xmm3
294
+ pshuflw xmm15, xmm15, 0xB1
295
+ pshufhw xmm15, xmm15, 0xB1
296
+ pshuflw xmm12, xmm12, 0xB1
297
+ pshufhw xmm12, xmm12, 0xB1
298
+ pshuflw xmm13, xmm13, 0xB1
299
+ pshufhw xmm13, xmm13, 0xB1
300
+ pshuflw xmm14, xmm14, 0xB1
301
+ pshufhw xmm14, xmm14, 0xB1
302
+ paddd xmm10, xmm15
303
+ paddd xmm11, xmm12
304
+ movdqa xmm8, xmmword ptr [rsp+0x100]
305
+ paddd xmm8, xmm13
306
+ paddd xmm9, xmm14
307
+ pxor xmm5, xmm10
308
+ pxor xmm6, xmm11
309
+ pxor xmm7, xmm8
310
+ pxor xmm4, xmm9
311
+ movdqa xmmword ptr [rsp+0x100], xmm8
312
+ movdqa xmm8, xmm5
313
+ psrld xmm8, 12
314
+ pslld xmm5, 20
315
+ por xmm5, xmm8
316
+ movdqa xmm8, xmm6
317
+ psrld xmm8, 12
318
+ pslld xmm6, 20
319
+ por xmm6, xmm8
320
+ movdqa xmm8, xmm7
321
+ psrld xmm8, 12
322
+ pslld xmm7, 20
323
+ por xmm7, xmm8
324
+ movdqa xmm8, xmm4
325
+ psrld xmm8, 12
326
+ pslld xmm4, 20
327
+ por xmm4, xmm8
328
+ paddd xmm0, xmmword ptr [rsp+0x90]
329
+ paddd xmm1, xmmword ptr [rsp+0xB0]
330
+ paddd xmm2, xmmword ptr [rsp+0xD0]
331
+ paddd xmm3, xmmword ptr [rsp+0xF0]
332
+ paddd xmm0, xmm5
333
+ paddd xmm1, xmm6
334
+ paddd xmm2, xmm7
335
+ paddd xmm3, xmm4
336
+ pxor xmm15, xmm0
337
+ pxor xmm12, xmm1
338
+ pxor xmm13, xmm2
339
+ pxor xmm14, xmm3
340
+ movdqa xmm8, xmm15
341
+ psrld xmm15, 8
342
+ pslld xmm8, 24
343
+ pxor xmm15, xmm8
344
+ movdqa xmm8, xmm12
345
+ psrld xmm12, 8
346
+ pslld xmm8, 24
347
+ pxor xmm12, xmm8
348
+ movdqa xmm8, xmm13
349
+ psrld xmm13, 8
350
+ pslld xmm8, 24
351
+ pxor xmm13, xmm8
352
+ movdqa xmm8, xmm14
353
+ psrld xmm14, 8
354
+ pslld xmm8, 24
355
+ pxor xmm14, xmm8
356
+ paddd xmm10, xmm15
357
+ paddd xmm11, xmm12
358
+ movdqa xmm8, xmmword ptr [rsp+0x100]
359
+ paddd xmm8, xmm13
360
+ paddd xmm9, xmm14
361
+ pxor xmm5, xmm10
362
+ pxor xmm6, xmm11
363
+ pxor xmm7, xmm8
364
+ pxor xmm4, xmm9
365
+ movdqa xmmword ptr [rsp+0x100], xmm8
366
+ movdqa xmm8, xmm5
367
+ psrld xmm8, 7
368
+ pslld xmm5, 25
369
+ por xmm5, xmm8
370
+ movdqa xmm8, xmm6
371
+ psrld xmm8, 7
372
+ pslld xmm6, 25
373
+ por xmm6, xmm8
374
+ movdqa xmm8, xmm7
375
+ psrld xmm8, 7
376
+ pslld xmm7, 25
377
+ por xmm7, xmm8
378
+ movdqa xmm8, xmm4
379
+ psrld xmm8, 7
380
+ pslld xmm4, 25
381
+ por xmm4, xmm8
382
+ paddd xmm0, xmmword ptr [rsp+0x20]
383
+ paddd xmm1, xmmword ptr [rsp+0x30]
384
+ paddd xmm2, xmmword ptr [rsp+0x70]
385
+ paddd xmm3, xmmword ptr [rsp+0x40]
386
+ paddd xmm0, xmm4
387
+ paddd xmm1, xmm5
388
+ paddd xmm2, xmm6
389
+ paddd xmm3, xmm7
390
+ pxor xmm12, xmm0
391
+ pxor xmm13, xmm1
392
+ pxor xmm14, xmm2
393
+ pxor xmm15, xmm3
394
+ pshuflw xmm12, xmm12, 0xB1
395
+ pshufhw xmm12, xmm12, 0xB1
396
+ pshuflw xmm13, xmm13, 0xB1
397
+ pshufhw xmm13, xmm13, 0xB1
398
+ pshuflw xmm14, xmm14, 0xB1
399
+ pshufhw xmm14, xmm14, 0xB1
400
+ pshuflw xmm15, xmm15, 0xB1
401
+ pshufhw xmm15, xmm15, 0xB1
402
+ movdqa xmm8, xmmword ptr [rsp+0x100]
403
+ paddd xmm8, xmm12
404
+ paddd xmm9, xmm13
405
+ paddd xmm10, xmm14
406
+ paddd xmm11, xmm15
407
+ pxor xmm4, xmm8
408
+ pxor xmm5, xmm9
409
+ pxor xmm6, xmm10
410
+ pxor xmm7, xmm11
411
+ movdqa xmmword ptr [rsp+0x100], xmm8
412
+ movdqa xmm8, xmm4
413
+ psrld xmm8, 12
414
+ pslld xmm4, 20
415
+ por xmm4, xmm8
416
+ movdqa xmm8, xmm5
417
+ psrld xmm8, 12
418
+ pslld xmm5, 20
419
+ por xmm5, xmm8
420
+ movdqa xmm8, xmm6
421
+ psrld xmm8, 12
422
+ pslld xmm6, 20
423
+ por xmm6, xmm8
424
+ movdqa xmm8, xmm7
425
+ psrld xmm8, 12
426
+ pslld xmm7, 20
427
+ por xmm7, xmm8
428
+ paddd xmm0, xmmword ptr [rsp+0x60]
429
+ paddd xmm1, xmmword ptr [rsp+0xA0]
430
+ paddd xmm2, xmmword ptr [rsp]
431
+ paddd xmm3, xmmword ptr [rsp+0xD0]
432
+ paddd xmm0, xmm4
433
+ paddd xmm1, xmm5
434
+ paddd xmm2, xmm6
435
+ paddd xmm3, xmm7
436
+ pxor xmm12, xmm0
437
+ pxor xmm13, xmm1
438
+ pxor xmm14, xmm2
439
+ pxor xmm15, xmm3
440
+ movdqa xmm8, xmm12
441
+ psrld xmm12, 8
442
+ pslld xmm8, 24
443
+ pxor xmm12, xmm8
444
+ movdqa xmm8, xmm13
445
+ psrld xmm13, 8
446
+ pslld xmm8, 24
447
+ pxor xmm13, xmm8
448
+ movdqa xmm8, xmm14
449
+ psrld xmm14, 8
450
+ pslld xmm8, 24
451
+ pxor xmm14, xmm8
452
+ movdqa xmm8, xmm15
453
+ psrld xmm15, 8
454
+ pslld xmm8, 24
455
+ pxor xmm15, xmm8
456
+ movdqa xmm8, xmmword ptr [rsp+0x100]
457
+ paddd xmm8, xmm12
458
+ paddd xmm9, xmm13
459
+ paddd xmm10, xmm14
460
+ paddd xmm11, xmm15
461
+ pxor xmm4, xmm8
462
+ pxor xmm5, xmm9
463
+ pxor xmm6, xmm10
464
+ pxor xmm7, xmm11
465
+ movdqa xmmword ptr [rsp+0x100], xmm8
466
+ movdqa xmm8, xmm4
467
+ psrld xmm8, 7
468
+ pslld xmm4, 25
469
+ por xmm4, xmm8
470
+ movdqa xmm8, xmm5
471
+ psrld xmm8, 7
472
+ pslld xmm5, 25
473
+ por xmm5, xmm8
474
+ movdqa xmm8, xmm6
475
+ psrld xmm8, 7
476
+ pslld xmm6, 25
477
+ por xmm6, xmm8
478
+ movdqa xmm8, xmm7
479
+ psrld xmm8, 7
480
+ pslld xmm7, 25
481
+ por xmm7, xmm8
482
+ paddd xmm0, xmmword ptr [rsp+0x10]
483
+ paddd xmm1, xmmword ptr [rsp+0xC0]
484
+ paddd xmm2, xmmword ptr [rsp+0x90]
485
+ paddd xmm3, xmmword ptr [rsp+0xF0]
486
+ paddd xmm0, xmm5
487
+ paddd xmm1, xmm6
488
+ paddd xmm2, xmm7
489
+ paddd xmm3, xmm4
490
+ pxor xmm15, xmm0
491
+ pxor xmm12, xmm1
492
+ pxor xmm13, xmm2
493
+ pxor xmm14, xmm3
494
+ pshuflw xmm15, xmm15, 0xB1
495
+ pshufhw xmm15, xmm15, 0xB1
496
+ pshuflw xmm12, xmm12, 0xB1
497
+ pshufhw xmm12, xmm12, 0xB1
498
+ pshuflw xmm13, xmm13, 0xB1
499
+ pshufhw xmm13, xmm13, 0xB1
500
+ pshuflw xmm14, xmm14, 0xB1
501
+ pshufhw xmm14, xmm14, 0xB1
502
+ paddd xmm10, xmm15
503
+ paddd xmm11, xmm12
504
+ movdqa xmm8, xmmword ptr [rsp+0x100]
505
+ paddd xmm8, xmm13
506
+ paddd xmm9, xmm14
507
+ pxor xmm5, xmm10
508
+ pxor xmm6, xmm11
509
+ pxor xmm7, xmm8
510
+ pxor xmm4, xmm9
511
+ movdqa xmmword ptr [rsp+0x100], xmm8
512
+ movdqa xmm8, xmm5
513
+ psrld xmm8, 12
514
+ pslld xmm5, 20
515
+ por xmm5, xmm8
516
+ movdqa xmm8, xmm6
517
+ psrld xmm8, 12
518
+ pslld xmm6, 20
519
+ por xmm6, xmm8
520
+ movdqa xmm8, xmm7
521
+ psrld xmm8, 12
522
+ pslld xmm7, 20
523
+ por xmm7, xmm8
524
+ movdqa xmm8, xmm4
525
+ psrld xmm8, 12
526
+ pslld xmm4, 20
527
+ por xmm4, xmm8
528
+ paddd xmm0, xmmword ptr [rsp+0xB0]
529
+ paddd xmm1, xmmword ptr [rsp+0x50]
530
+ paddd xmm2, xmmword ptr [rsp+0xE0]
531
+ paddd xmm3, xmmword ptr [rsp+0x80]
532
+ paddd xmm0, xmm5
533
+ paddd xmm1, xmm6
534
+ paddd xmm2, xmm7
535
+ paddd xmm3, xmm4
536
+ pxor xmm15, xmm0
537
+ pxor xmm12, xmm1
538
+ pxor xmm13, xmm2
539
+ pxor xmm14, xmm3
540
+ movdqa xmm8, xmm15
541
+ psrld xmm15, 8
542
+ pslld xmm8, 24
543
+ pxor xmm15, xmm8
544
+ movdqa xmm8, xmm12
545
+ psrld xmm12, 8
546
+ pslld xmm8, 24
547
+ pxor xmm12, xmm8
548
+ movdqa xmm8, xmm13
549
+ psrld xmm13, 8
550
+ pslld xmm8, 24
551
+ pxor xmm13, xmm8
552
+ movdqa xmm8, xmm14
553
+ psrld xmm14, 8
554
+ pslld xmm8, 24
555
+ pxor xmm14, xmm8
556
+ paddd xmm10, xmm15
557
+ paddd xmm11, xmm12
558
+ movdqa xmm8, xmmword ptr [rsp+0x100]
559
+ paddd xmm8, xmm13
560
+ paddd xmm9, xmm14
561
+ pxor xmm5, xmm10
562
+ pxor xmm6, xmm11
563
+ pxor xmm7, xmm8
564
+ pxor xmm4, xmm9
565
+ movdqa xmmword ptr [rsp+0x100], xmm8
566
+ movdqa xmm8, xmm5
567
+ psrld xmm8, 7
568
+ pslld xmm5, 25
569
+ por xmm5, xmm8
570
+ movdqa xmm8, xmm6
571
+ psrld xmm8, 7
572
+ pslld xmm6, 25
573
+ por xmm6, xmm8
574
+ movdqa xmm8, xmm7
575
+ psrld xmm8, 7
576
+ pslld xmm7, 25
577
+ por xmm7, xmm8
578
+ movdqa xmm8, xmm4
579
+ psrld xmm8, 7
580
+ pslld xmm4, 25
581
+ por xmm4, xmm8
582
+ paddd xmm0, xmmword ptr [rsp+0x30]
583
+ paddd xmm1, xmmword ptr [rsp+0xA0]
584
+ paddd xmm2, xmmword ptr [rsp+0xD0]
585
+ paddd xmm3, xmmword ptr [rsp+0x70]
586
+ paddd xmm0, xmm4
587
+ paddd xmm1, xmm5
588
+ paddd xmm2, xmm6
589
+ paddd xmm3, xmm7
590
+ pxor xmm12, xmm0
591
+ pxor xmm13, xmm1
592
+ pxor xmm14, xmm2
593
+ pxor xmm15, xmm3
594
+ pshuflw xmm12, xmm12, 0xB1
595
+ pshufhw xmm12, xmm12, 0xB1
596
+ pshuflw xmm13, xmm13, 0xB1
597
+ pshufhw xmm13, xmm13, 0xB1
598
+ pshuflw xmm14, xmm14, 0xB1
599
+ pshufhw xmm14, xmm14, 0xB1
600
+ pshuflw xmm15, xmm15, 0xB1
601
+ pshufhw xmm15, xmm15, 0xB1
602
+ movdqa xmm8, xmmword ptr [rsp+0x100]
603
+ paddd xmm8, xmm12
604
+ paddd xmm9, xmm13
605
+ paddd xmm10, xmm14
606
+ paddd xmm11, xmm15
607
+ pxor xmm4, xmm8
608
+ pxor xmm5, xmm9
609
+ pxor xmm6, xmm10
610
+ pxor xmm7, xmm11
611
+ movdqa xmmword ptr [rsp+0x100], xmm8
612
+ movdqa xmm8, xmm4
613
+ psrld xmm8, 12
614
+ pslld xmm4, 20
615
+ por xmm4, xmm8
616
+ movdqa xmm8, xmm5
617
+ psrld xmm8, 12
618
+ pslld xmm5, 20
619
+ por xmm5, xmm8
620
+ movdqa xmm8, xmm6
621
+ psrld xmm8, 12
622
+ pslld xmm6, 20
623
+ por xmm6, xmm8
624
+ movdqa xmm8, xmm7
625
+ psrld xmm8, 12
626
+ pslld xmm7, 20
627
+ por xmm7, xmm8
628
+ paddd xmm0, xmmword ptr [rsp+0x40]
629
+ paddd xmm1, xmmword ptr [rsp+0xC0]
630
+ paddd xmm2, xmmword ptr [rsp+0x20]
631
+ paddd xmm3, xmmword ptr [rsp+0xE0]
632
+ paddd xmm0, xmm4
633
+ paddd xmm1, xmm5
634
+ paddd xmm2, xmm6
635
+ paddd xmm3, xmm7
636
+ pxor xmm12, xmm0
637
+ pxor xmm13, xmm1
638
+ pxor xmm14, xmm2
639
+ pxor xmm15, xmm3
640
+ movdqa xmm8, xmm12
641
+ psrld xmm12, 8
642
+ pslld xmm8, 24
643
+ pxor xmm12, xmm8
644
+ movdqa xmm8, xmm13
645
+ psrld xmm13, 8
646
+ pslld xmm8, 24
647
+ pxor xmm13, xmm8
648
+ movdqa xmm8, xmm14
649
+ psrld xmm14, 8
650
+ pslld xmm8, 24
651
+ pxor xmm14, xmm8
652
+ movdqa xmm8, xmm15
653
+ psrld xmm15, 8
654
+ pslld xmm8, 24
655
+ pxor xmm15, xmm8
656
+ movdqa xmm8, xmmword ptr [rsp+0x100]
657
+ paddd xmm8, xmm12
658
+ paddd xmm9, xmm13
659
+ paddd xmm10, xmm14
660
+ paddd xmm11, xmm15
661
+ pxor xmm4, xmm8
662
+ pxor xmm5, xmm9
663
+ pxor xmm6, xmm10
664
+ pxor xmm7, xmm11
665
+ movdqa xmmword ptr [rsp+0x100], xmm8
666
+ movdqa xmm8, xmm4
667
+ psrld xmm8, 7
668
+ pslld xmm4, 25
669
+ por xmm4, xmm8
670
+ movdqa xmm8, xmm5
671
+ psrld xmm8, 7
672
+ pslld xmm5, 25
673
+ por xmm5, xmm8
674
+ movdqa xmm8, xmm6
675
+ psrld xmm8, 7
676
+ pslld xmm6, 25
677
+ por xmm6, xmm8
678
+ movdqa xmm8, xmm7
679
+ psrld xmm8, 7
680
+ pslld xmm7, 25
681
+ por xmm7, xmm8
682
+ paddd xmm0, xmmword ptr [rsp+0x60]
683
+ paddd xmm1, xmmword ptr [rsp+0x90]
684
+ paddd xmm2, xmmword ptr [rsp+0xB0]
685
+ paddd xmm3, xmmword ptr [rsp+0x80]
686
+ paddd xmm0, xmm5
687
+ paddd xmm1, xmm6
688
+ paddd xmm2, xmm7
689
+ paddd xmm3, xmm4
690
+ pxor xmm15, xmm0
691
+ pxor xmm12, xmm1
692
+ pxor xmm13, xmm2
693
+ pxor xmm14, xmm3
694
+ pshuflw xmm15, xmm15, 0xB1
695
+ pshufhw xmm15, xmm15, 0xB1
696
+ pshuflw xmm12, xmm12, 0xB1
697
+ pshufhw xmm12, xmm12, 0xB1
698
+ pshuflw xmm13, xmm13, 0xB1
699
+ pshufhw xmm13, xmm13, 0xB1
700
+ pshuflw xmm14, xmm14, 0xB1
701
+ pshufhw xmm14, xmm14, 0xB1
702
+ paddd xmm10, xmm15
703
+ paddd xmm11, xmm12
704
+ movdqa xmm8, xmmword ptr [rsp+0x100]
705
+ paddd xmm8, xmm13
706
+ paddd xmm9, xmm14
707
+ pxor xmm5, xmm10
708
+ pxor xmm6, xmm11
709
+ pxor xmm7, xmm8
710
+ pxor xmm4, xmm9
711
+ movdqa xmmword ptr [rsp+0x100], xmm8
712
+ movdqa xmm8, xmm5
713
+ psrld xmm8, 12
714
+ pslld xmm5, 20
715
+ por xmm5, xmm8
716
+ movdqa xmm8, xmm6
717
+ psrld xmm8, 12
718
+ pslld xmm6, 20
719
+ por xmm6, xmm8
720
+ movdqa xmm8, xmm7
721
+ psrld xmm8, 12
722
+ pslld xmm7, 20
723
+ por xmm7, xmm8
724
+ movdqa xmm8, xmm4
725
+ psrld xmm8, 12
726
+ pslld xmm4, 20
727
+ por xmm4, xmm8
728
+ paddd xmm0, xmmword ptr [rsp+0x50]
729
+ paddd xmm1, xmmword ptr [rsp]
730
+ paddd xmm2, xmmword ptr [rsp+0xF0]
731
+ paddd xmm3, xmmword ptr [rsp+0x10]
732
+ paddd xmm0, xmm5
733
+ paddd xmm1, xmm6
734
+ paddd xmm2, xmm7
735
+ paddd xmm3, xmm4
736
+ pxor xmm15, xmm0
737
+ pxor xmm12, xmm1
738
+ pxor xmm13, xmm2
739
+ pxor xmm14, xmm3
740
+ movdqa xmm8, xmm15
741
+ psrld xmm15, 8
742
+ pslld xmm8, 24
743
+ pxor xmm15, xmm8
744
+ movdqa xmm8, xmm12
745
+ psrld xmm12, 8
746
+ pslld xmm8, 24
747
+ pxor xmm12, xmm8
748
+ movdqa xmm8, xmm13
749
+ psrld xmm13, 8
750
+ pslld xmm8, 24
751
+ pxor xmm13, xmm8
752
+ movdqa xmm8, xmm14
753
+ psrld xmm14, 8
754
+ pslld xmm8, 24
755
+ pxor xmm14, xmm8
756
+ paddd xmm10, xmm15
757
+ paddd xmm11, xmm12
758
+ movdqa xmm8, xmmword ptr [rsp+0x100]
759
+ paddd xmm8, xmm13
760
+ paddd xmm9, xmm14
761
+ pxor xmm5, xmm10
762
+ pxor xmm6, xmm11
763
+ pxor xmm7, xmm8
764
+ pxor xmm4, xmm9
765
+ movdqa xmmword ptr [rsp+0x100], xmm8
766
+ movdqa xmm8, xmm5
767
+ psrld xmm8, 7
768
+ pslld xmm5, 25
769
+ por xmm5, xmm8
770
+ movdqa xmm8, xmm6
771
+ psrld xmm8, 7
772
+ pslld xmm6, 25
773
+ por xmm6, xmm8
774
+ movdqa xmm8, xmm7
775
+ psrld xmm8, 7
776
+ pslld xmm7, 25
777
+ por xmm7, xmm8
778
+ movdqa xmm8, xmm4
779
+ psrld xmm8, 7
780
+ pslld xmm4, 25
781
+ por xmm4, xmm8
782
+ paddd xmm0, xmmword ptr [rsp+0xA0]
783
+ paddd xmm1, xmmword ptr [rsp+0xC0]
784
+ paddd xmm2, xmmword ptr [rsp+0xE0]
785
+ paddd xmm3, xmmword ptr [rsp+0xD0]
786
+ paddd xmm0, xmm4
787
+ paddd xmm1, xmm5
788
+ paddd xmm2, xmm6
789
+ paddd xmm3, xmm7
790
+ pxor xmm12, xmm0
791
+ pxor xmm13, xmm1
792
+ pxor xmm14, xmm2
793
+ pxor xmm15, xmm3
794
+ pshuflw xmm12, xmm12, 0xB1
795
+ pshufhw xmm12, xmm12, 0xB1
796
+ pshuflw xmm13, xmm13, 0xB1
797
+ pshufhw xmm13, xmm13, 0xB1
798
+ pshuflw xmm14, xmm14, 0xB1
799
+ pshufhw xmm14, xmm14, 0xB1
800
+ pshuflw xmm15, xmm15, 0xB1
801
+ pshufhw xmm15, xmm15, 0xB1
802
+ movdqa xmm8, xmmword ptr [rsp+0x100]
803
+ paddd xmm8, xmm12
804
+ paddd xmm9, xmm13
805
+ paddd xmm10, xmm14
806
+ paddd xmm11, xmm15
807
+ pxor xmm4, xmm8
808
+ pxor xmm5, xmm9
809
+ pxor xmm6, xmm10
810
+ pxor xmm7, xmm11
811
+ movdqa xmmword ptr [rsp+0x100], xmm8
812
+ movdqa xmm8, xmm4
813
+ psrld xmm8, 12
814
+ pslld xmm4, 20
815
+ por xmm4, xmm8
816
+ movdqa xmm8, xmm5
817
+ psrld xmm8, 12
818
+ pslld xmm5, 20
819
+ por xmm5, xmm8
820
+ movdqa xmm8, xmm6
821
+ psrld xmm8, 12
822
+ pslld xmm6, 20
823
+ por xmm6, xmm8
824
+ movdqa xmm8, xmm7
825
+ psrld xmm8, 12
826
+ pslld xmm7, 20
827
+ por xmm7, xmm8
828
+ paddd xmm0, xmmword ptr [rsp+0x70]
829
+ paddd xmm1, xmmword ptr [rsp+0x90]
830
+ paddd xmm2, xmmword ptr [rsp+0x30]
831
+ paddd xmm3, xmmword ptr [rsp+0xF0]
832
+ paddd xmm0, xmm4
833
+ paddd xmm1, xmm5
834
+ paddd xmm2, xmm6
835
+ paddd xmm3, xmm7
836
+ pxor xmm12, xmm0
837
+ pxor xmm13, xmm1
838
+ pxor xmm14, xmm2
839
+ pxor xmm15, xmm3
840
+ movdqa xmm8, xmm12
841
+ psrld xmm12, 8
842
+ pslld xmm8, 24
843
+ pxor xmm12, xmm8
844
+ movdqa xmm8, xmm13
845
+ psrld xmm13, 8
846
+ pslld xmm8, 24
847
+ pxor xmm13, xmm8
848
+ movdqa xmm8, xmm14
849
+ psrld xmm14, 8
850
+ pslld xmm8, 24
851
+ pxor xmm14, xmm8
852
+ movdqa xmm8, xmm15
853
+ psrld xmm15, 8
854
+ pslld xmm8, 24
855
+ pxor xmm15, xmm8
856
+ movdqa xmm8, xmmword ptr [rsp+0x100]
857
+ paddd xmm8, xmm12
858
+ paddd xmm9, xmm13
859
+ paddd xmm10, xmm14
860
+ paddd xmm11, xmm15
861
+ pxor xmm4, xmm8
862
+ pxor xmm5, xmm9
863
+ pxor xmm6, xmm10
864
+ pxor xmm7, xmm11
865
+ movdqa xmmword ptr [rsp+0x100], xmm8
866
+ movdqa xmm8, xmm4
867
+ psrld xmm8, 7
868
+ pslld xmm4, 25
869
+ por xmm4, xmm8
870
+ movdqa xmm8, xmm5
871
+ psrld xmm8, 7
872
+ pslld xmm5, 25
873
+ por xmm5, xmm8
874
+ movdqa xmm8, xmm6
875
+ psrld xmm8, 7
876
+ pslld xmm6, 25
877
+ por xmm6, xmm8
878
+ movdqa xmm8, xmm7
879
+ psrld xmm8, 7
880
+ pslld xmm7, 25
881
+ por xmm7, xmm8
882
+ paddd xmm0, xmmword ptr [rsp+0x40]
883
+ paddd xmm1, xmmword ptr [rsp+0xB0]
884
+ paddd xmm2, xmmword ptr [rsp+0x50]
885
+ paddd xmm3, xmmword ptr [rsp+0x10]
886
+ paddd xmm0, xmm5
887
+ paddd xmm1, xmm6
888
+ paddd xmm2, xmm7
889
+ paddd xmm3, xmm4
890
+ pxor xmm15, xmm0
891
+ pxor xmm12, xmm1
892
+ pxor xmm13, xmm2
893
+ pxor xmm14, xmm3
894
+ pshuflw xmm15, xmm15, 0xB1
895
+ pshufhw xmm15, xmm15, 0xB1
896
+ pshuflw xmm12, xmm12, 0xB1
897
+ pshufhw xmm12, xmm12, 0xB1
898
+ pshuflw xmm13, xmm13, 0xB1
899
+ pshufhw xmm13, xmm13, 0xB1
900
+ pshuflw xmm14, xmm14, 0xB1
901
+ pshufhw xmm14, xmm14, 0xB1
902
+ paddd xmm10, xmm15
903
+ paddd xmm11, xmm12
904
+ movdqa xmm8, xmmword ptr [rsp+0x100]
905
+ paddd xmm8, xmm13
906
+ paddd xmm9, xmm14
907
+ pxor xmm5, xmm10
908
+ pxor xmm6, xmm11
909
+ pxor xmm7, xmm8
910
+ pxor xmm4, xmm9
911
+ movdqa xmmword ptr [rsp+0x100], xmm8
912
+ movdqa xmm8, xmm5
913
+ psrld xmm8, 12
914
+ pslld xmm5, 20
915
+ por xmm5, xmm8
916
+ movdqa xmm8, xmm6
917
+ psrld xmm8, 12
918
+ pslld xmm6, 20
919
+ por xmm6, xmm8
920
+ movdqa xmm8, xmm7
921
+ psrld xmm8, 12
922
+ pslld xmm7, 20
923
+ por xmm7, xmm8
924
+ movdqa xmm8, xmm4
925
+ psrld xmm8, 12
926
+ pslld xmm4, 20
927
+ por xmm4, xmm8
928
+ paddd xmm0, xmmword ptr [rsp]
929
+ paddd xmm1, xmmword ptr [rsp+0x20]
930
+ paddd xmm2, xmmword ptr [rsp+0x80]
931
+ paddd xmm3, xmmword ptr [rsp+0x60]
932
+ paddd xmm0, xmm5
933
+ paddd xmm1, xmm6
934
+ paddd xmm2, xmm7
935
+ paddd xmm3, xmm4
936
+ pxor xmm15, xmm0
937
+ pxor xmm12, xmm1
938
+ pxor xmm13, xmm2
939
+ pxor xmm14, xmm3
940
+ movdqa xmm8, xmm15
941
+ psrld xmm15, 8
942
+ pslld xmm8, 24
943
+ pxor xmm15, xmm8
944
+ movdqa xmm8, xmm12
945
+ psrld xmm12, 8
946
+ pslld xmm8, 24
947
+ pxor xmm12, xmm8
948
+ movdqa xmm8, xmm13
949
+ psrld xmm13, 8
950
+ pslld xmm8, 24
951
+ pxor xmm13, xmm8
952
+ movdqa xmm8, xmm14
953
+ psrld xmm14, 8
954
+ pslld xmm8, 24
955
+ pxor xmm14, xmm8
956
+ paddd xmm10, xmm15
957
+ paddd xmm11, xmm12
958
+ movdqa xmm8, xmmword ptr [rsp+0x100]
959
+ paddd xmm8, xmm13
960
+ paddd xmm9, xmm14
961
+ pxor xmm5, xmm10
962
+ pxor xmm6, xmm11
963
+ pxor xmm7, xmm8
964
+ pxor xmm4, xmm9
965
+ movdqa xmmword ptr [rsp+0x100], xmm8
966
+ movdqa xmm8, xmm5
967
+ psrld xmm8, 7
968
+ pslld xmm5, 25
969
+ por xmm5, xmm8
970
+ movdqa xmm8, xmm6
971
+ psrld xmm8, 7
972
+ pslld xmm6, 25
973
+ por xmm6, xmm8
974
+ movdqa xmm8, xmm7
975
+ psrld xmm8, 7
976
+ pslld xmm7, 25
977
+ por xmm7, xmm8
978
+ movdqa xmm8, xmm4
979
+ psrld xmm8, 7
980
+ pslld xmm4, 25
981
+ por xmm4, xmm8
982
+ paddd xmm0, xmmword ptr [rsp+0xC0]
983
+ paddd xmm1, xmmword ptr [rsp+0x90]
984
+ paddd xmm2, xmmword ptr [rsp+0xF0]
985
+ paddd xmm3, xmmword ptr [rsp+0xE0]
986
+ paddd xmm0, xmm4
987
+ paddd xmm1, xmm5
988
+ paddd xmm2, xmm6
989
+ paddd xmm3, xmm7
990
+ pxor xmm12, xmm0
991
+ pxor xmm13, xmm1
992
+ pxor xmm14, xmm2
993
+ pxor xmm15, xmm3
994
+ pshuflw xmm12, xmm12, 0xB1
995
+ pshufhw xmm12, xmm12, 0xB1
996
+ pshuflw xmm13, xmm13, 0xB1
997
+ pshufhw xmm13, xmm13, 0xB1
998
+ pshuflw xmm14, xmm14, 0xB1
999
+ pshufhw xmm14, xmm14, 0xB1
1000
+ pshuflw xmm15, xmm15, 0xB1
1001
+ pshufhw xmm15, xmm15, 0xB1
1002
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1003
+ paddd xmm8, xmm12
1004
+ paddd xmm9, xmm13
1005
+ paddd xmm10, xmm14
1006
+ paddd xmm11, xmm15
1007
+ pxor xmm4, xmm8
1008
+ pxor xmm5, xmm9
1009
+ pxor xmm6, xmm10
1010
+ pxor xmm7, xmm11
1011
+ movdqa xmmword ptr [rsp+0x100], xmm8
1012
+ movdqa xmm8, xmm4
1013
+ psrld xmm8, 12
1014
+ pslld xmm4, 20
1015
+ por xmm4, xmm8
1016
+ movdqa xmm8, xmm5
1017
+ psrld xmm8, 12
1018
+ pslld xmm5, 20
1019
+ por xmm5, xmm8
1020
+ movdqa xmm8, xmm6
1021
+ psrld xmm8, 12
1022
+ pslld xmm6, 20
1023
+ por xmm6, xmm8
1024
+ movdqa xmm8, xmm7
1025
+ psrld xmm8, 12
1026
+ pslld xmm7, 20
1027
+ por xmm7, xmm8
1028
+ paddd xmm0, xmmword ptr [rsp+0xD0]
1029
+ paddd xmm1, xmmword ptr [rsp+0xB0]
1030
+ paddd xmm2, xmmword ptr [rsp+0xA0]
1031
+ paddd xmm3, xmmword ptr [rsp+0x80]
1032
+ paddd xmm0, xmm4
1033
+ paddd xmm1, xmm5
1034
+ paddd xmm2, xmm6
1035
+ paddd xmm3, xmm7
1036
+ pxor xmm12, xmm0
1037
+ pxor xmm13, xmm1
1038
+ pxor xmm14, xmm2
1039
+ pxor xmm15, xmm3
1040
+ movdqa xmm8, xmm12
1041
+ psrld xmm12, 8
1042
+ pslld xmm8, 24
1043
+ pxor xmm12, xmm8
1044
+ movdqa xmm8, xmm13
1045
+ psrld xmm13, 8
1046
+ pslld xmm8, 24
1047
+ pxor xmm13, xmm8
1048
+ movdqa xmm8, xmm14
1049
+ psrld xmm14, 8
1050
+ pslld xmm8, 24
1051
+ pxor xmm14, xmm8
1052
+ movdqa xmm8, xmm15
1053
+ psrld xmm15, 8
1054
+ pslld xmm8, 24
1055
+ pxor xmm15, xmm8
1056
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1057
+ paddd xmm8, xmm12
1058
+ paddd xmm9, xmm13
1059
+ paddd xmm10, xmm14
1060
+ paddd xmm11, xmm15
1061
+ pxor xmm4, xmm8
1062
+ pxor xmm5, xmm9
1063
+ pxor xmm6, xmm10
1064
+ pxor xmm7, xmm11
1065
+ movdqa xmmword ptr [rsp+0x100], xmm8
1066
+ movdqa xmm8, xmm4
1067
+ psrld xmm8, 7
1068
+ pslld xmm4, 25
1069
+ por xmm4, xmm8
1070
+ movdqa xmm8, xmm5
1071
+ psrld xmm8, 7
1072
+ pslld xmm5, 25
1073
+ por xmm5, xmm8
1074
+ movdqa xmm8, xmm6
1075
+ psrld xmm8, 7
1076
+ pslld xmm6, 25
1077
+ por xmm6, xmm8
1078
+ movdqa xmm8, xmm7
1079
+ psrld xmm8, 7
1080
+ pslld xmm7, 25
1081
+ por xmm7, xmm8
1082
+ paddd xmm0, xmmword ptr [rsp+0x70]
1083
+ paddd xmm1, xmmword ptr [rsp+0x50]
1084
+ paddd xmm2, xmmword ptr [rsp]
1085
+ paddd xmm3, xmmword ptr [rsp+0x60]
1086
+ paddd xmm0, xmm5
1087
+ paddd xmm1, xmm6
1088
+ paddd xmm2, xmm7
1089
+ paddd xmm3, xmm4
1090
+ pxor xmm15, xmm0
1091
+ pxor xmm12, xmm1
1092
+ pxor xmm13, xmm2
1093
+ pxor xmm14, xmm3
1094
+ pshuflw xmm15, xmm15, 0xB1
1095
+ pshufhw xmm15, xmm15, 0xB1
1096
+ pshuflw xmm12, xmm12, 0xB1
1097
+ pshufhw xmm12, xmm12, 0xB1
1098
+ pshuflw xmm13, xmm13, 0xB1
1099
+ pshufhw xmm13, xmm13, 0xB1
1100
+ pshuflw xmm14, xmm14, 0xB1
1101
+ pshufhw xmm14, xmm14, 0xB1
1102
+ paddd xmm10, xmm15
1103
+ paddd xmm11, xmm12
1104
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1105
+ paddd xmm8, xmm13
1106
+ paddd xmm9, xmm14
1107
+ pxor xmm5, xmm10
1108
+ pxor xmm6, xmm11
1109
+ pxor xmm7, xmm8
1110
+ pxor xmm4, xmm9
1111
+ movdqa xmmword ptr [rsp+0x100], xmm8
1112
+ movdqa xmm8, xmm5
1113
+ psrld xmm8, 12
1114
+ pslld xmm5, 20
1115
+ por xmm5, xmm8
1116
+ movdqa xmm8, xmm6
1117
+ psrld xmm8, 12
1118
+ pslld xmm6, 20
1119
+ por xmm6, xmm8
1120
+ movdqa xmm8, xmm7
1121
+ psrld xmm8, 12
1122
+ pslld xmm7, 20
1123
+ por xmm7, xmm8
1124
+ movdqa xmm8, xmm4
1125
+ psrld xmm8, 12
1126
+ pslld xmm4, 20
1127
+ por xmm4, xmm8
1128
+ paddd xmm0, xmmword ptr [rsp+0x20]
1129
+ paddd xmm1, xmmword ptr [rsp+0x30]
1130
+ paddd xmm2, xmmword ptr [rsp+0x10]
1131
+ paddd xmm3, xmmword ptr [rsp+0x40]
1132
+ paddd xmm0, xmm5
1133
+ paddd xmm1, xmm6
1134
+ paddd xmm2, xmm7
1135
+ paddd xmm3, xmm4
1136
+ pxor xmm15, xmm0
1137
+ pxor xmm12, xmm1
1138
+ pxor xmm13, xmm2
1139
+ pxor xmm14, xmm3
1140
+ movdqa xmm8, xmm15
1141
+ psrld xmm15, 8
1142
+ pslld xmm8, 24
1143
+ pxor xmm15, xmm8
1144
+ movdqa xmm8, xmm12
1145
+ psrld xmm12, 8
1146
+ pslld xmm8, 24
1147
+ pxor xmm12, xmm8
1148
+ movdqa xmm8, xmm13
1149
+ psrld xmm13, 8
1150
+ pslld xmm8, 24
1151
+ pxor xmm13, xmm8
1152
+ movdqa xmm8, xmm14
1153
+ psrld xmm14, 8
1154
+ pslld xmm8, 24
1155
+ pxor xmm14, xmm8
1156
+ paddd xmm10, xmm15
1157
+ paddd xmm11, xmm12
1158
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1159
+ paddd xmm8, xmm13
1160
+ paddd xmm9, xmm14
1161
+ pxor xmm5, xmm10
1162
+ pxor xmm6, xmm11
1163
+ pxor xmm7, xmm8
1164
+ pxor xmm4, xmm9
1165
+ movdqa xmmword ptr [rsp+0x100], xmm8
1166
+ movdqa xmm8, xmm5
1167
+ psrld xmm8, 7
1168
+ pslld xmm5, 25
1169
+ por xmm5, xmm8
1170
+ movdqa xmm8, xmm6
1171
+ psrld xmm8, 7
1172
+ pslld xmm6, 25
1173
+ por xmm6, xmm8
1174
+ movdqa xmm8, xmm7
1175
+ psrld xmm8, 7
1176
+ pslld xmm7, 25
1177
+ por xmm7, xmm8
1178
+ movdqa xmm8, xmm4
1179
+ psrld xmm8, 7
1180
+ pslld xmm4, 25
1181
+ por xmm4, xmm8
1182
+ paddd xmm0, xmmword ptr [rsp+0x90]
1183
+ paddd xmm1, xmmword ptr [rsp+0xB0]
1184
+ paddd xmm2, xmmword ptr [rsp+0x80]
1185
+ paddd xmm3, xmmword ptr [rsp+0xF0]
1186
+ paddd xmm0, xmm4
1187
+ paddd xmm1, xmm5
1188
+ paddd xmm2, xmm6
1189
+ paddd xmm3, xmm7
1190
+ pxor xmm12, xmm0
1191
+ pxor xmm13, xmm1
1192
+ pxor xmm14, xmm2
1193
+ pxor xmm15, xmm3
1194
+ pshuflw xmm12, xmm12, 0xB1
1195
+ pshufhw xmm12, xmm12, 0xB1
1196
+ pshuflw xmm13, xmm13, 0xB1
1197
+ pshufhw xmm13, xmm13, 0xB1
1198
+ pshuflw xmm14, xmm14, 0xB1
1199
+ pshufhw xmm14, xmm14, 0xB1
1200
+ pshuflw xmm15, xmm15, 0xB1
1201
+ pshufhw xmm15, xmm15, 0xB1
1202
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1203
+ paddd xmm8, xmm12
1204
+ paddd xmm9, xmm13
1205
+ paddd xmm10, xmm14
1206
+ paddd xmm11, xmm15
1207
+ pxor xmm4, xmm8
1208
+ pxor xmm5, xmm9
1209
+ pxor xmm6, xmm10
1210
+ pxor xmm7, xmm11
1211
+ movdqa xmmword ptr [rsp+0x100], xmm8
1212
+ movdqa xmm8, xmm4
1213
+ psrld xmm8, 12
1214
+ pslld xmm4, 20
1215
+ por xmm4, xmm8
1216
+ movdqa xmm8, xmm5
1217
+ psrld xmm8, 12
1218
+ pslld xmm5, 20
1219
+ por xmm5, xmm8
1220
+ movdqa xmm8, xmm6
1221
+ psrld xmm8, 12
1222
+ pslld xmm6, 20
1223
+ por xmm6, xmm8
1224
+ movdqa xmm8, xmm7
1225
+ psrld xmm8, 12
1226
+ pslld xmm7, 20
1227
+ por xmm7, xmm8
1228
+ paddd xmm0, xmmword ptr [rsp+0xE0]
1229
+ paddd xmm1, xmmword ptr [rsp+0x50]
1230
+ paddd xmm2, xmmword ptr [rsp+0xC0]
1231
+ paddd xmm3, xmmword ptr [rsp+0x10]
1232
+ paddd xmm0, xmm4
1233
+ paddd xmm1, xmm5
1234
+ paddd xmm2, xmm6
1235
+ paddd xmm3, xmm7
1236
+ pxor xmm12, xmm0
1237
+ pxor xmm13, xmm1
1238
+ pxor xmm14, xmm2
1239
+ pxor xmm15, xmm3
1240
+ movdqa xmm8, xmm12
1241
+ psrld xmm12, 8
1242
+ pslld xmm8, 24
1243
+ pxor xmm12, xmm8
1244
+ movdqa xmm8, xmm13
1245
+ psrld xmm13, 8
1246
+ pslld xmm8, 24
1247
+ pxor xmm13, xmm8
1248
+ movdqa xmm8, xmm14
1249
+ psrld xmm14, 8
1250
+ pslld xmm8, 24
1251
+ pxor xmm14, xmm8
1252
+ movdqa xmm8, xmm15
1253
+ psrld xmm15, 8
1254
+ pslld xmm8, 24
1255
+ pxor xmm15, xmm8
1256
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1257
+ paddd xmm8, xmm12
1258
+ paddd xmm9, xmm13
1259
+ paddd xmm10, xmm14
1260
+ paddd xmm11, xmm15
1261
+ pxor xmm4, xmm8
1262
+ pxor xmm5, xmm9
1263
+ pxor xmm6, xmm10
1264
+ pxor xmm7, xmm11
1265
+ movdqa xmmword ptr [rsp+0x100], xmm8
1266
+ movdqa xmm8, xmm4
1267
+ psrld xmm8, 7
1268
+ pslld xmm4, 25
1269
+ por xmm4, xmm8
1270
+ movdqa xmm8, xmm5
1271
+ psrld xmm8, 7
1272
+ pslld xmm5, 25
1273
+ por xmm5, xmm8
1274
+ movdqa xmm8, xmm6
1275
+ psrld xmm8, 7
1276
+ pslld xmm6, 25
1277
+ por xmm6, xmm8
1278
+ movdqa xmm8, xmm7
1279
+ psrld xmm8, 7
1280
+ pslld xmm7, 25
1281
+ por xmm7, xmm8
1282
+ paddd xmm0, xmmword ptr [rsp+0xD0]
1283
+ paddd xmm1, xmmword ptr [rsp]
1284
+ paddd xmm2, xmmword ptr [rsp+0x20]
1285
+ paddd xmm3, xmmword ptr [rsp+0x40]
1286
+ paddd xmm0, xmm5
1287
+ paddd xmm1, xmm6
1288
+ paddd xmm2, xmm7
1289
+ paddd xmm3, xmm4
1290
+ pxor xmm15, xmm0
1291
+ pxor xmm12, xmm1
1292
+ pxor xmm13, xmm2
1293
+ pxor xmm14, xmm3
1294
+ pshuflw xmm15, xmm15, 0xB1
1295
+ pshufhw xmm15, xmm15, 0xB1
1296
+ pshuflw xmm12, xmm12, 0xB1
1297
+ pshufhw xmm12, xmm12, 0xB1
1298
+ pshuflw xmm13, xmm13, 0xB1
1299
+ pshufhw xmm13, xmm13, 0xB1
1300
+ pshuflw xmm14, xmm14, 0xB1
1301
+ pshufhw xmm14, xmm14, 0xB1
1302
+ paddd xmm10, xmm15
1303
+ paddd xmm11, xmm12
1304
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1305
+ paddd xmm8, xmm13
1306
+ paddd xmm9, xmm14
1307
+ pxor xmm5, xmm10
1308
+ pxor xmm6, xmm11
1309
+ pxor xmm7, xmm8
1310
+ pxor xmm4, xmm9
1311
+ movdqa xmmword ptr [rsp+0x100], xmm8
1312
+ movdqa xmm8, xmm5
1313
+ psrld xmm8, 12
1314
+ pslld xmm5, 20
1315
+ por xmm5, xmm8
1316
+ movdqa xmm8, xmm6
1317
+ psrld xmm8, 12
1318
+ pslld xmm6, 20
1319
+ por xmm6, xmm8
1320
+ movdqa xmm8, xmm7
1321
+ psrld xmm8, 12
1322
+ pslld xmm7, 20
1323
+ por xmm7, xmm8
1324
+ movdqa xmm8, xmm4
1325
+ psrld xmm8, 12
1326
+ pslld xmm4, 20
1327
+ por xmm4, xmm8
1328
+ paddd xmm0, xmmword ptr [rsp+0x30]
1329
+ paddd xmm1, xmmword ptr [rsp+0xA0]
1330
+ paddd xmm2, xmmword ptr [rsp+0x60]
1331
+ paddd xmm3, xmmword ptr [rsp+0x70]
1332
+ paddd xmm0, xmm5
1333
+ paddd xmm1, xmm6
1334
+ paddd xmm2, xmm7
1335
+ paddd xmm3, xmm4
1336
+ pxor xmm15, xmm0
1337
+ pxor xmm12, xmm1
1338
+ pxor xmm13, xmm2
1339
+ pxor xmm14, xmm3
1340
+ movdqa xmm8, xmm15
1341
+ psrld xmm15, 8
1342
+ pslld xmm8, 24
1343
+ pxor xmm15, xmm8
1344
+ movdqa xmm8, xmm12
1345
+ psrld xmm12, 8
1346
+ pslld xmm8, 24
1347
+ pxor xmm12, xmm8
1348
+ movdqa xmm8, xmm13
1349
+ psrld xmm13, 8
1350
+ pslld xmm8, 24
1351
+ pxor xmm13, xmm8
1352
+ movdqa xmm8, xmm14
1353
+ psrld xmm14, 8
1354
+ pslld xmm8, 24
1355
+ pxor xmm14, xmm8
1356
+ paddd xmm10, xmm15
1357
+ paddd xmm11, xmm12
1358
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1359
+ paddd xmm8, xmm13
1360
+ paddd xmm9, xmm14
1361
+ pxor xmm5, xmm10
1362
+ pxor xmm6, xmm11
1363
+ pxor xmm7, xmm8
1364
+ pxor xmm4, xmm9
1365
+ movdqa xmmword ptr [rsp+0x100], xmm8
1366
+ movdqa xmm8, xmm5
1367
+ psrld xmm8, 7
1368
+ pslld xmm5, 25
1369
+ por xmm5, xmm8
1370
+ movdqa xmm8, xmm6
1371
+ psrld xmm8, 7
1372
+ pslld xmm6, 25
1373
+ por xmm6, xmm8
1374
+ movdqa xmm8, xmm7
1375
+ psrld xmm8, 7
1376
+ pslld xmm7, 25
1377
+ por xmm7, xmm8
1378
+ movdqa xmm8, xmm4
1379
+ psrld xmm8, 7
1380
+ pslld xmm4, 25
1381
+ por xmm4, xmm8
1382
+ paddd xmm0, xmmword ptr [rsp+0xB0]
1383
+ paddd xmm1, xmmword ptr [rsp+0x50]
1384
+ paddd xmm2, xmmword ptr [rsp+0x10]
1385
+ paddd xmm3, xmmword ptr [rsp+0x80]
1386
+ paddd xmm0, xmm4
1387
+ paddd xmm1, xmm5
1388
+ paddd xmm2, xmm6
1389
+ paddd xmm3, xmm7
1390
+ pxor xmm12, xmm0
1391
+ pxor xmm13, xmm1
1392
+ pxor xmm14, xmm2
1393
+ pxor xmm15, xmm3
1394
+ pshuflw xmm12, xmm12, 0xB1
1395
+ pshufhw xmm12, xmm12, 0xB1
1396
+ pshuflw xmm13, xmm13, 0xB1
1397
+ pshufhw xmm13, xmm13, 0xB1
1398
+ pshuflw xmm14, xmm14, 0xB1
1399
+ pshufhw xmm14, xmm14, 0xB1
1400
+ pshuflw xmm15, xmm15, 0xB1
1401
+ pshufhw xmm15, xmm15, 0xB1
1402
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1403
+ paddd xmm8, xmm12
1404
+ paddd xmm9, xmm13
1405
+ paddd xmm10, xmm14
1406
+ paddd xmm11, xmm15
1407
+ pxor xmm4, xmm8
1408
+ pxor xmm5, xmm9
1409
+ pxor xmm6, xmm10
1410
+ pxor xmm7, xmm11
1411
+ movdqa xmmword ptr [rsp+0x100], xmm8
1412
+ movdqa xmm8, xmm4
1413
+ psrld xmm8, 12
1414
+ pslld xmm4, 20
1415
+ por xmm4, xmm8
1416
+ movdqa xmm8, xmm5
1417
+ psrld xmm8, 12
1418
+ pslld xmm5, 20
1419
+ por xmm5, xmm8
1420
+ movdqa xmm8, xmm6
1421
+ psrld xmm8, 12
1422
+ pslld xmm6, 20
1423
+ por xmm6, xmm8
1424
+ movdqa xmm8, xmm7
1425
+ psrld xmm8, 12
1426
+ pslld xmm7, 20
1427
+ por xmm7, xmm8
1428
+ paddd xmm0, xmmword ptr [rsp+0xF0]
1429
+ paddd xmm1, xmmword ptr [rsp]
1430
+ paddd xmm2, xmmword ptr [rsp+0x90]
1431
+ paddd xmm3, xmmword ptr [rsp+0x60]
1432
+ paddd xmm0, xmm4
1433
+ paddd xmm1, xmm5
1434
+ paddd xmm2, xmm6
1435
+ paddd xmm3, xmm7
1436
+ pxor xmm12, xmm0
1437
+ pxor xmm13, xmm1
1438
+ pxor xmm14, xmm2
1439
+ pxor xmm15, xmm3
1440
+ movdqa xmm8, xmm12
1441
+ psrld xmm12, 8
1442
+ pslld xmm8, 24
1443
+ pxor xmm12, xmm8
1444
+ movdqa xmm8, xmm13
1445
+ psrld xmm13, 8
1446
+ pslld xmm8, 24
1447
+ pxor xmm13, xmm8
1448
+ movdqa xmm8, xmm14
1449
+ psrld xmm14, 8
1450
+ pslld xmm8, 24
1451
+ pxor xmm14, xmm8
1452
+ movdqa xmm8, xmm15
1453
+ psrld xmm15, 8
1454
+ pslld xmm8, 24
1455
+ pxor xmm15, xmm8
1456
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1457
+ paddd xmm8, xmm12
1458
+ paddd xmm9, xmm13
1459
+ paddd xmm10, xmm14
1460
+ paddd xmm11, xmm15
1461
+ pxor xmm4, xmm8
1462
+ pxor xmm5, xmm9
1463
+ pxor xmm6, xmm10
1464
+ pxor xmm7, xmm11
1465
+ movdqa xmmword ptr [rsp+0x100], xmm8
1466
+ movdqa xmm8, xmm4
1467
+ psrld xmm8, 7
1468
+ pslld xmm4, 25
1469
+ por xmm4, xmm8
1470
+ movdqa xmm8, xmm5
1471
+ psrld xmm8, 7
1472
+ pslld xmm5, 25
1473
+ por xmm5, xmm8
1474
+ movdqa xmm8, xmm6
1475
+ psrld xmm8, 7
1476
+ pslld xmm6, 25
1477
+ por xmm6, xmm8
1478
+ movdqa xmm8, xmm7
1479
+ psrld xmm8, 7
1480
+ pslld xmm7, 25
1481
+ por xmm7, xmm8
1482
+ paddd xmm0, xmmword ptr [rsp+0xE0]
1483
+ paddd xmm1, xmmword ptr [rsp+0x20]
1484
+ paddd xmm2, xmmword ptr [rsp+0x30]
1485
+ paddd xmm3, xmmword ptr [rsp+0x70]
1486
+ paddd xmm0, xmm5
1487
+ paddd xmm1, xmm6
1488
+ paddd xmm2, xmm7
1489
+ paddd xmm3, xmm4
1490
+ pxor xmm15, xmm0
1491
+ pxor xmm12, xmm1
1492
+ pxor xmm13, xmm2
1493
+ pxor xmm14, xmm3
1494
+ pshuflw xmm15, xmm15, 0xB1
1495
+ pshufhw xmm15, xmm15, 0xB1
1496
+ pshuflw xmm12, xmm12, 0xB1
1497
+ pshufhw xmm12, xmm12, 0xB1
1498
+ pshuflw xmm13, xmm13, 0xB1
1499
+ pshufhw xmm13, xmm13, 0xB1
1500
+ pshuflw xmm14, xmm14, 0xB1
1501
+ pshufhw xmm14, xmm14, 0xB1
1502
+ paddd xmm10, xmm15
1503
+ paddd xmm11, xmm12
1504
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1505
+ paddd xmm8, xmm13
1506
+ paddd xmm9, xmm14
1507
+ pxor xmm5, xmm10
1508
+ pxor xmm6, xmm11
1509
+ pxor xmm7, xmm8
1510
+ pxor xmm4, xmm9
1511
+ movdqa xmmword ptr [rsp+0x100], xmm8
1512
+ movdqa xmm8, xmm5
1513
+ psrld xmm8, 12
1514
+ pslld xmm5, 20
1515
+ por xmm5, xmm8
1516
+ movdqa xmm8, xmm6
1517
+ psrld xmm8, 12
1518
+ pslld xmm6, 20
1519
+ por xmm6, xmm8
1520
+ movdqa xmm8, xmm7
1521
+ psrld xmm8, 12
1522
+ pslld xmm7, 20
1523
+ por xmm7, xmm8
1524
+ movdqa xmm8, xmm4
1525
+ psrld xmm8, 12
1526
+ pslld xmm4, 20
1527
+ por xmm4, xmm8
1528
+ paddd xmm0, xmmword ptr [rsp+0xA0]
1529
+ paddd xmm1, xmmword ptr [rsp+0xC0]
1530
+ paddd xmm2, xmmword ptr [rsp+0x40]
1531
+ paddd xmm3, xmmword ptr [rsp+0xD0]
1532
+ paddd xmm0, xmm5
1533
+ paddd xmm1, xmm6
1534
+ paddd xmm2, xmm7
1535
+ paddd xmm3, xmm4
1536
+ pxor xmm15, xmm0
1537
+ pxor xmm12, xmm1
1538
+ pxor xmm13, xmm2
1539
+ pxor xmm14, xmm3
1540
+ movdqa xmm8, xmm15
1541
+ psrld xmm15, 8
1542
+ pslld xmm8, 24
1543
+ pxor xmm15, xmm8
1544
+ movdqa xmm8, xmm12
1545
+ psrld xmm12, 8
1546
+ pslld xmm8, 24
1547
+ pxor xmm12, xmm8
1548
+ movdqa xmm8, xmm13
1549
+ psrld xmm13, 8
1550
+ pslld xmm8, 24
1551
+ pxor xmm13, xmm8
1552
+ movdqa xmm8, xmm14
1553
+ psrld xmm14, 8
1554
+ pslld xmm8, 24
1555
+ pxor xmm14, xmm8
1556
+ paddd xmm10, xmm15
1557
+ paddd xmm11, xmm12
1558
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1559
+ paddd xmm8, xmm13
1560
+ paddd xmm9, xmm14
1561
+ pxor xmm5, xmm10
1562
+ pxor xmm6, xmm11
1563
+ pxor xmm7, xmm8
1564
+ pxor xmm4, xmm9
1565
+ pxor xmm0, xmm8
1566
+ pxor xmm1, xmm9
1567
+ pxor xmm2, xmm10
1568
+ pxor xmm3, xmm11
1569
+ movdqa xmm8, xmm5
1570
+ psrld xmm8, 7
1571
+ pslld xmm5, 25
1572
+ por xmm5, xmm8
1573
+ movdqa xmm8, xmm6
1574
+ psrld xmm8, 7
1575
+ pslld xmm6, 25
1576
+ por xmm6, xmm8
1577
+ movdqa xmm8, xmm7
1578
+ psrld xmm8, 7
1579
+ pslld xmm7, 25
1580
+ por xmm7, xmm8
1581
+ movdqa xmm8, xmm4
1582
+ psrld xmm8, 7
1583
+ pslld xmm4, 25
1584
+ por xmm4, xmm8
1585
+ pxor xmm4, xmm12
1586
+ pxor xmm5, xmm13
1587
+ pxor xmm6, xmm14
1588
+ pxor xmm7, xmm15
1589
+ mov eax, r13d
1590
+ jne 9b
1591
+ movdqa xmm9, xmm0
1592
+ punpckldq xmm0, xmm1
1593
+ punpckhdq xmm9, xmm1
1594
+ movdqa xmm11, xmm2
1595
+ punpckldq xmm2, xmm3
1596
+ punpckhdq xmm11, xmm3
1597
+ movdqa xmm1, xmm0
1598
+ punpcklqdq xmm0, xmm2
1599
+ punpckhqdq xmm1, xmm2
1600
+ movdqa xmm3, xmm9
1601
+ punpcklqdq xmm9, xmm11
1602
+ punpckhqdq xmm3, xmm11
1603
+ movdqu xmmword ptr [rbx], xmm0
1604
+ movdqu xmmword ptr [rbx+0x20], xmm1
1605
+ movdqu xmmword ptr [rbx+0x40], xmm9
1606
+ movdqu xmmword ptr [rbx+0x60], xmm3
1607
+ movdqa xmm9, xmm4
1608
+ punpckldq xmm4, xmm5
1609
+ punpckhdq xmm9, xmm5
1610
+ movdqa xmm11, xmm6
1611
+ punpckldq xmm6, xmm7
1612
+ punpckhdq xmm11, xmm7
1613
+ movdqa xmm5, xmm4
1614
+ punpcklqdq xmm4, xmm6
1615
+ punpckhqdq xmm5, xmm6
1616
+ movdqa xmm7, xmm9
1617
+ punpcklqdq xmm9, xmm11
1618
+ punpckhqdq xmm7, xmm11
1619
+ movdqu xmmword ptr [rbx+0x10], xmm4
1620
+ movdqu xmmword ptr [rbx+0x30], xmm5
1621
+ movdqu xmmword ptr [rbx+0x50], xmm9
1622
+ movdqu xmmword ptr [rbx+0x70], xmm7
1623
+ movdqa xmm1, xmmword ptr [rsp+0x110]
1624
+ movdqa xmm0, xmm1
1625
+ paddd xmm1, xmmword ptr [rsp+0x150]
1626
+ movdqa xmmword ptr [rsp+0x110], xmm1
1627
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1628
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1629
+ pcmpgtd xmm0, xmm1
1630
+ movdqa xmm1, xmmword ptr [rsp+0x120]
1631
+ psubd xmm1, xmm0
1632
+ movdqa xmmword ptr [rsp+0x120], xmm1
1633
+ add rbx, 128
1634
+ add rdi, 32
1635
+ sub rsi, 4
1636
+ cmp rsi, 4
1637
+ jnc 2b
1638
+ test rsi, rsi
1639
+ jne 3f
1640
+ 4:
1641
+ movdqa xmm6, xmmword ptr [rsp+0x170]
1642
+ movdqa xmm7, xmmword ptr [rsp+0x180]
1643
+ movdqa xmm8, xmmword ptr [rsp+0x190]
1644
+ movdqa xmm9, xmmword ptr [rsp+0x1A0]
1645
+ movdqa xmm10, xmmword ptr [rsp+0x1B0]
1646
+ movdqa xmm11, xmmword ptr [rsp+0x1C0]
1647
+ movdqa xmm12, xmmword ptr [rsp+0x1D0]
1648
+ movdqa xmm13, xmmword ptr [rsp+0x1E0]
1649
+ movdqa xmm14, xmmword ptr [rsp+0x1F0]
1650
+ movdqa xmm15, xmmword ptr [rsp+0x200]
1651
+ mov rsp, rbp
1652
+ pop rbp
1653
+ pop rbx
1654
+ pop rdi
1655
+ pop rsi
1656
+ pop r12
1657
+ pop r13
1658
+ pop r14
1659
+ pop r15
1660
+ ret
1661
+ .p2align 5
1662
+ 3:
1663
+ test esi, 0x2
1664
+ je 3f
1665
+ movups xmm0, xmmword ptr [rcx]
1666
+ movups xmm1, xmmword ptr [rcx+0x10]
1667
+ movaps xmm8, xmm0
1668
+ movaps xmm9, xmm1
1669
+ movd xmm13, dword ptr [rsp+0x110]
1670
+ movd xmm14, dword ptr [rsp+0x120]
1671
+ punpckldq xmm13, xmm14
1672
+ movaps xmmword ptr [rsp], xmm13
1673
+ movd xmm14, dword ptr [rsp+0x114]
1674
+ movd xmm13, dword ptr [rsp+0x124]
1675
+ punpckldq xmm14, xmm13
1676
+ movaps xmmword ptr [rsp+0x10], xmm14
1677
+ mov r8, qword ptr [rdi]
1678
+ mov r9, qword ptr [rdi+0x8]
1679
+ movzx eax, byte ptr [rbp+0x80]
1680
+ or eax, r13d
1681
+ xor edx, edx
1682
+ 2:
1683
+ mov r14d, eax
1684
+ or eax, r12d
1685
+ add rdx, 64
1686
+ cmp rdx, r15
1687
+ cmovne eax, r14d
1688
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1689
+ movaps xmm10, xmm2
1690
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
1691
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
1692
+ movaps xmm3, xmm4
1693
+ shufps xmm4, xmm5, 136
1694
+ shufps xmm3, xmm5, 221
1695
+ movaps xmm5, xmm3
1696
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
1697
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
1698
+ movaps xmm3, xmm6
1699
+ shufps xmm6, xmm7, 136
1700
+ pshufd xmm6, xmm6, 0x93
1701
+ shufps xmm3, xmm7, 221
1702
+ pshufd xmm7, xmm3, 0x93
1703
+ movups xmm12, xmmword ptr [r9+rdx-0x40]
1704
+ movups xmm13, xmmword ptr [r9+rdx-0x30]
1705
+ movaps xmm11, xmm12
1706
+ shufps xmm12, xmm13, 136
1707
+ shufps xmm11, xmm13, 221
1708
+ movaps xmm13, xmm11
1709
+ movups xmm14, xmmword ptr [r9+rdx-0x20]
1710
+ movups xmm15, xmmword ptr [r9+rdx-0x10]
1711
+ movaps xmm11, xmm14
1712
+ shufps xmm14, xmm15, 136
1713
+ pshufd xmm14, xmm14, 0x93
1714
+ shufps xmm11, xmm15, 221
1715
+ pshufd xmm15, xmm11, 0x93
1716
+ shl rax, 0x20
1717
+ or rax, 0x40
1718
+ movq xmm3, rax
1719
+ movdqa xmmword ptr [rsp+0x20], xmm3
1720
+ movaps xmm3, xmmword ptr [rsp]
1721
+ movaps xmm11, xmmword ptr [rsp+0x10]
1722
+ punpcklqdq xmm3, xmmword ptr [rsp+0x20]
1723
+ punpcklqdq xmm11, xmmword ptr [rsp+0x20]
1724
+ mov al, 7
1725
+ 9:
1726
+ paddd xmm0, xmm4
1727
+ paddd xmm8, xmm12
1728
+ movaps xmmword ptr [rsp+0x20], xmm4
1729
+ movaps xmmword ptr [rsp+0x30], xmm12
1730
+ paddd xmm0, xmm1
1731
+ paddd xmm8, xmm9
1732
+ pxor xmm3, xmm0
1733
+ pxor xmm11, xmm8
1734
+ pshuflw xmm3, xmm3, 0xB1
1735
+ pshufhw xmm3, xmm3, 0xB1
1736
+ pshuflw xmm11, xmm11, 0xB1
1737
+ pshufhw xmm11, xmm11, 0xB1
1738
+ paddd xmm2, xmm3
1739
+ paddd xmm10, xmm11
1740
+ pxor xmm1, xmm2
1741
+ pxor xmm9, xmm10
1742
+ movdqa xmm4, xmm1
1743
+ pslld xmm1, 20
1744
+ psrld xmm4, 12
1745
+ por xmm1, xmm4
1746
+ movdqa xmm4, xmm9
1747
+ pslld xmm9, 20
1748
+ psrld xmm4, 12
1749
+ por xmm9, xmm4
1750
+ paddd xmm0, xmm5
1751
+ paddd xmm8, xmm13
1752
+ movaps xmmword ptr [rsp+0x40], xmm5
1753
+ movaps xmmword ptr [rsp+0x50], xmm13
1754
+ paddd xmm0, xmm1
1755
+ paddd xmm8, xmm9
1756
+ pxor xmm3, xmm0
1757
+ pxor xmm11, xmm8
1758
+ movdqa xmm13, xmm3
1759
+ psrld xmm3, 8
1760
+ pslld xmm13, 24
1761
+ pxor xmm3, xmm13
1762
+ movdqa xmm13, xmm11
1763
+ psrld xmm11, 8
1764
+ pslld xmm13, 24
1765
+ pxor xmm11, xmm13
1766
+ paddd xmm2, xmm3
1767
+ paddd xmm10, xmm11
1768
+ pxor xmm1, xmm2
1769
+ pxor xmm9, xmm10
1770
+ movdqa xmm4, xmm1
1771
+ pslld xmm1, 25
1772
+ psrld xmm4, 7
1773
+ por xmm1, xmm4
1774
+ movdqa xmm4, xmm9
1775
+ pslld xmm9, 25
1776
+ psrld xmm4, 7
1777
+ por xmm9, xmm4
1778
+ pshufd xmm0, xmm0, 0x93
1779
+ pshufd xmm8, xmm8, 0x93
1780
+ pshufd xmm3, xmm3, 0x4E
1781
+ pshufd xmm11, xmm11, 0x4E
1782
+ pshufd xmm2, xmm2, 0x39
1783
+ pshufd xmm10, xmm10, 0x39
1784
+ paddd xmm0, xmm6
1785
+ paddd xmm8, xmm14
1786
+ paddd xmm0, xmm1
1787
+ paddd xmm8, xmm9
1788
+ pxor xmm3, xmm0
1789
+ pxor xmm11, xmm8
1790
+ pshuflw xmm3, xmm3, 0xB1
1791
+ pshufhw xmm3, xmm3, 0xB1
1792
+ pshuflw xmm11, xmm11, 0xB1
1793
+ pshufhw xmm11, xmm11, 0xB1
1794
+ paddd xmm2, xmm3
1795
+ paddd xmm10, xmm11
1796
+ pxor xmm1, xmm2
1797
+ pxor xmm9, xmm10
1798
+ movdqa xmm4, xmm1
1799
+ pslld xmm1, 20
1800
+ psrld xmm4, 12
1801
+ por xmm1, xmm4
1802
+ movdqa xmm4, xmm9
1803
+ pslld xmm9, 20
1804
+ psrld xmm4, 12
1805
+ por xmm9, xmm4
1806
+ paddd xmm0, xmm7
1807
+ paddd xmm8, xmm15
1808
+ paddd xmm0, xmm1
1809
+ paddd xmm8, xmm9
1810
+ pxor xmm3, xmm0
1811
+ pxor xmm11, xmm8
1812
+ movdqa xmm13, xmm3
1813
+ psrld xmm3, 8
1814
+ pslld xmm13, 24
1815
+ pxor xmm3, xmm13
1816
+ movdqa xmm13, xmm11
1817
+ psrld xmm11, 8
1818
+ pslld xmm13, 24
1819
+ pxor xmm11, xmm13
1820
+ paddd xmm2, xmm3
1821
+ paddd xmm10, xmm11
1822
+ pxor xmm1, xmm2
1823
+ pxor xmm9, xmm10
1824
+ movdqa xmm4, xmm1
1825
+ pslld xmm1, 25
1826
+ psrld xmm4, 7
1827
+ por xmm1, xmm4
1828
+ movdqa xmm4, xmm9
1829
+ pslld xmm9, 25
1830
+ psrld xmm4, 7
1831
+ por xmm9, xmm4
1832
+ pshufd xmm0, xmm0, 0x39
1833
+ pshufd xmm8, xmm8, 0x39
1834
+ pshufd xmm3, xmm3, 0x4E
1835
+ pshufd xmm11, xmm11, 0x4E
1836
+ pshufd xmm2, xmm2, 0x93
1837
+ pshufd xmm10, xmm10, 0x93
1838
+ dec al
1839
+ je 9f
1840
+ movdqa xmm12, xmmword ptr [rsp+0x20]
1841
+ movdqa xmm5, xmmword ptr [rsp+0x40]
1842
+ pshufd xmm13, xmm12, 0x0F
1843
+ shufps xmm12, xmm5, 214
1844
+ pshufd xmm4, xmm12, 0x39
1845
+ movdqa xmm12, xmm6
1846
+ shufps xmm12, xmm7, 250
1847
+ pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
1848
+ pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
1849
+ por xmm13, xmm12
1850
+ movdqa xmmword ptr [rsp+0x20], xmm13
1851
+ movdqa xmm12, xmm7
1852
+ punpcklqdq xmm12, xmm5
1853
+ movdqa xmm13, xmm6
1854
+ pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
1855
+ pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
1856
+ por xmm12, xmm13
1857
+ pshufd xmm12, xmm12, 0x78
1858
+ punpckhdq xmm5, xmm7
1859
+ punpckldq xmm6, xmm5
1860
+ pshufd xmm7, xmm6, 0x1E
1861
+ movdqa xmmword ptr [rsp+0x40], xmm12
1862
+ movdqa xmm5, xmmword ptr [rsp+0x30]
1863
+ movdqa xmm13, xmmword ptr [rsp+0x50]
1864
+ pshufd xmm6, xmm5, 0x0F
1865
+ shufps xmm5, xmm13, 214
1866
+ pshufd xmm12, xmm5, 0x39
1867
+ movdqa xmm5, xmm14
1868
+ shufps xmm5, xmm15, 250
1869
+ pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
1870
+ pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
1871
+ por xmm6, xmm5
1872
+ movdqa xmm5, xmm15
1873
+ punpcklqdq xmm5, xmm13
1874
+ movdqa xmmword ptr [rsp+0x30], xmm2
1875
+ movdqa xmm2, xmm14
1876
+ pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
1877
+ pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
1878
+ por xmm5, xmm2
1879
+ movdqa xmm2, xmmword ptr [rsp+0x30]
1880
+ pshufd xmm5, xmm5, 0x78
1881
+ punpckhdq xmm13, xmm15
1882
+ punpckldq xmm14, xmm13
1883
+ pshufd xmm15, xmm14, 0x1E
1884
+ movdqa xmm13, xmm6
1885
+ movdqa xmm14, xmm5
1886
+ movdqa xmm5, xmmword ptr [rsp+0x20]
1887
+ movdqa xmm6, xmmword ptr [rsp+0x40]
1888
+ jmp 9b
1889
+ 9:
1890
+ pxor xmm0, xmm2
1891
+ pxor xmm1, xmm3
1892
+ pxor xmm8, xmm10
1893
+ pxor xmm9, xmm11
1894
+ mov eax, r13d
1895
+ cmp rdx, r15
1896
+ jne 2b
1897
+ movups xmmword ptr [rbx], xmm0
1898
+ movups xmmword ptr [rbx+0x10], xmm1
1899
+ movups xmmword ptr [rbx+0x20], xmm8
1900
+ movups xmmword ptr [rbx+0x30], xmm9
1901
+ mov eax, dword ptr [rsp+0x130]
1902
+ neg eax
1903
+ mov r10d, dword ptr [rsp+0x110+8*rax]
1904
+ mov r11d, dword ptr [rsp+0x120+8*rax]
1905
+ mov dword ptr [rsp+0x110], r10d
1906
+ mov dword ptr [rsp+0x120], r11d
1907
+ add rdi, 16
1908
+ add rbx, 64
1909
+ sub rsi, 2
1910
+ 3:
1911
+ test esi, 0x1
1912
+ je 4b
1913
+ movups xmm0, xmmword ptr [rcx]
1914
+ movups xmm1, xmmword ptr [rcx+0x10]
1915
+ movd xmm13, dword ptr [rsp+0x110]
1916
+ movd xmm14, dword ptr [rsp+0x120]
1917
+ punpckldq xmm13, xmm14
1918
+ mov r8, qword ptr [rdi]
1919
+ movzx eax, byte ptr [rbp+0x80]
1920
+ or eax, r13d
1921
+ xor edx, edx
1922
+ 2:
1923
+ mov r14d, eax
1924
+ or eax, r12d
1925
+ add rdx, 64
1926
+ cmp rdx, r15
1927
+ cmovne eax, r14d
1928
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1929
+ shl rax, 32
1930
+ or rax, 64
1931
+ movq xmm12, rax
1932
+ movdqa xmm3, xmm13
1933
+ punpcklqdq xmm3, xmm12
1934
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
1935
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
1936
+ movaps xmm8, xmm4
1937
+ shufps xmm4, xmm5, 136
1938
+ shufps xmm8, xmm5, 221
1939
+ movaps xmm5, xmm8
1940
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
1941
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
1942
+ movaps xmm8, xmm6
1943
+ shufps xmm6, xmm7, 136
1944
+ pshufd xmm6, xmm6, 0x93
1945
+ shufps xmm8, xmm7, 221
1946
+ pshufd xmm7, xmm8, 0x93
1947
+ mov al, 7
1948
+ 9:
1949
+ paddd xmm0, xmm4
1950
+ paddd xmm0, xmm1
1951
+ pxor xmm3, xmm0
1952
+ pshuflw xmm3, xmm3, 0xB1
1953
+ pshufhw xmm3, xmm3, 0xB1
1954
+ paddd xmm2, xmm3
1955
+ pxor xmm1, xmm2
1956
+ movdqa xmm11, xmm1
1957
+ pslld xmm1, 20
1958
+ psrld xmm11, 12
1959
+ por xmm1, xmm11
1960
+ paddd xmm0, xmm5
1961
+ paddd xmm0, xmm1
1962
+ pxor xmm3, xmm0
1963
+ movdqa xmm14, xmm3
1964
+ psrld xmm3, 8
1965
+ pslld xmm14, 24
1966
+ pxor xmm3, xmm14
1967
+ paddd xmm2, xmm3
1968
+ pxor xmm1, xmm2
1969
+ movdqa xmm11, xmm1
1970
+ pslld xmm1, 25
1971
+ psrld xmm11, 7
1972
+ por xmm1, xmm11
1973
+ pshufd xmm0, xmm0, 0x93
1974
+ pshufd xmm3, xmm3, 0x4E
1975
+ pshufd xmm2, xmm2, 0x39
1976
+ paddd xmm0, xmm6
1977
+ paddd xmm0, xmm1
1978
+ pxor xmm3, xmm0
1979
+ pshuflw xmm3, xmm3, 0xB1
1980
+ pshufhw xmm3, xmm3, 0xB1
1981
+ paddd xmm2, xmm3
1982
+ pxor xmm1, xmm2
1983
+ movdqa xmm11, xmm1
1984
+ pslld xmm1, 20
1985
+ psrld xmm11, 12
1986
+ por xmm1, xmm11
1987
+ paddd xmm0, xmm7
1988
+ paddd xmm0, xmm1
1989
+ pxor xmm3, xmm0
1990
+ movdqa xmm14, xmm3
1991
+ psrld xmm3, 8
1992
+ pslld xmm14, 24
1993
+ pxor xmm3, xmm14
1994
+ paddd xmm2, xmm3
1995
+ pxor xmm1, xmm2
1996
+ movdqa xmm11, xmm1
1997
+ pslld xmm1, 25
1998
+ psrld xmm11, 7
1999
+ por xmm1, xmm11
2000
+ pshufd xmm0, xmm0, 0x39
2001
+ pshufd xmm3, xmm3, 0x4E
2002
+ pshufd xmm2, xmm2, 0x93
2003
+ dec al
2004
+ jz 9f
2005
+ movdqa xmm8, xmm4
2006
+ shufps xmm8, xmm5, 214
2007
+ pshufd xmm9, xmm4, 0x0F
2008
+ pshufd xmm4, xmm8, 0x39
2009
+ movdqa xmm8, xmm6
2010
+ shufps xmm8, xmm7, 250
2011
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
2012
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
2013
+ por xmm9, xmm8
2014
+ movdqa xmm8, xmm7
2015
+ punpcklqdq xmm8, xmm5
2016
+ movdqa xmm10, xmm6
2017
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
2018
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2019
+ por xmm8, xmm10
2020
+ pshufd xmm8, xmm8, 0x78
2021
+ punpckhdq xmm5, xmm7
2022
+ punpckldq xmm6, xmm5
2023
+ pshufd xmm7, xmm6, 0x1E
2024
+ movdqa xmm5, xmm9
2025
+ movdqa xmm6, xmm8
2026
+ jmp 9b
2027
+ 9:
2028
+ pxor xmm0, xmm2
2029
+ pxor xmm1, xmm3
2030
+ mov eax, r13d
2031
+ cmp rdx, r15
2032
+ jne 2b
2033
+ movups xmmword ptr [rbx], xmm0
2034
+ movups xmmword ptr [rbx+0x10], xmm1
2035
+ jmp 4b
2036
+
2037
+ .p2align 6 # start the routine on a 64-byte boundary
2038
+ blake3_compress_in_place_sse2: # one compression of the 64-byte block at [rdx] into the 32-byte state at [rcx], overwriting that state; inputs: rcx, rdx, r8b, r9, one stack byte
2039
+ _blake3_compress_in_place_sse2: # second label for the same entry point (leading-underscore spelling)
2040
+ sub rsp, 120 # 7 x 16-byte save slots (112 bytes) + 8; the movdqa slots are 16-byte aligned assuming rsp % 16 == 8 on entry
2041
+ movdqa xmmword ptr [rsp], xmm6 # xmm6-xmm15 are callee-saved in the Microsoft x64 convention, so they are spilled before use
2042
+ movdqa xmmword ptr [rsp+0x10], xmm7
2043
+ movdqa xmmword ptr [rsp+0x20], xmm8
2044
+ movdqa xmmword ptr [rsp+0x30], xmm9
2045
+ movdqa xmmword ptr [rsp+0x40], xmm11
2046
+ movdqa xmmword ptr [rsp+0x50], xmm14
2047
+ movdqa xmmword ptr [rsp+0x60], xmm15 # NOTE(review): no instruction in this routine writes xmm15; this save/restore looks redundant
2048
+ movups xmm0, xmmword ptr [rcx] # row a = state words 0-3 (unaligned load)
2049
+ movups xmm1, xmmword ptr [rcx+0x10] # row b = state words 4-7
2050
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip] # row c = the four BLAKE3_IV constants
2051
+ movzx eax, byte ptr [rsp+0xA0] # stack argument byte at entry rsp+0x28 (0xA0 = 120 + 0x28); presumably the flags byte - confirm against the C prototype
2052
+ movzx r8d, r8b # third register argument, zero-extended from its low byte; presumably the block length
2053
+ shl rax, 32
2054
+ add r8, rax # r8 = (stack byte << 32) | r8b
2055
+ movq xmm3, r9 # low 64 bits of row d = r9 (presumably the 64-bit counter)
2056
+ movq xmm4, r8
2057
+ punpcklqdq xmm3, xmm4 # row d dwords = {r9 low, r9 high, r8b, stack byte}
2058
+ movups xmm4, xmmword ptr [rdx] # block words 0-3
2059
+ movups xmm5, xmmword ptr [rdx+0x10] # block words 4-7
2060
+ movaps xmm8, xmm4
2061
+ shufps xmm4, xmm5, 136 # xmm4 = words 0,2,4,6
2062
+ shufps xmm8, xmm5, 221 # xmm8 = words 1,3,5,7
2063
+ movaps xmm5, xmm8
2064
+ movups xmm6, xmmword ptr [rdx+0x20] # block words 8-11
2065
+ movups xmm7, xmmword ptr [rdx+0x30] # block words 12-15
2066
+ movaps xmm8, xmm6
2067
+ shufps xmm6, xmm7, 136 # words 8,10,12,14
2068
+ pshufd xmm6, xmm6, 0x93 # lane rotation: xmm6 = words 14,8,10,12
2069
+ shufps xmm8, xmm7, 221 # words 9,11,13,15
2070
+ pshufd xmm7, xmm8, 0x93 # xmm7 = words 15,9,11,13
2071
+ mov al, 7 # al = rounds remaining
2072
+ 9: # top of one round; xmm4..xmm7 hold the block words in the order this round consumes them
2073
+ paddd xmm0, xmm4
2074
+ paddd xmm0, xmm1 # a += m + b
2075
+ pxor xmm3, xmm0 # d ^= a
2076
+ pshuflw xmm3, xmm3, 0xB1
2077
+ pshufhw xmm3, xmm3, 0xB1 # swap the 16-bit halves of every dword: d rotated by 16
2078
+ paddd xmm2, xmm3 # c += d
2079
+ pxor xmm1, xmm2 # b ^= c
2080
+ movdqa xmm11, xmm1
2081
+ pslld xmm1, 20
2082
+ psrld xmm11, 12
2083
+ por xmm1, xmm11 # b rotated right by 12 (shift, shift, or)
2084
+ paddd xmm0, xmm5
2085
+ paddd xmm0, xmm1 # a += m + b
2086
+ pxor xmm3, xmm0 # d ^= a
2087
+ movdqa xmm14, xmm3
2088
+ psrld xmm3, 8
2089
+ pslld xmm14, 24
2090
+ pxor xmm3, xmm14 # d rotated right by 8 (the two shifted parts share no bits, so pxor merges them)
2091
+ paddd xmm2, xmm3 # c += d
2092
+ pxor xmm1, xmm2 # b ^= c
2093
+ movdqa xmm11, xmm1
2094
+ pslld xmm1, 25
2095
+ psrld xmm11, 7
2096
+ por xmm1, xmm11 # b rotated right by 7
2097
+ pshufd xmm0, xmm0, 0x93 # rotate the lanes of rows a, d, c relative to row b so the second half of the round pairs different lanes
2098
+ pshufd xmm3, xmm3, 0x4E
2099
+ pshufd xmm2, xmm2, 0x39
2100
+ paddd xmm0, xmm6
2101
+ paddd xmm0, xmm1 # a += m + b
2102
+ pxor xmm3, xmm0 # d ^= a
2103
+ pshuflw xmm3, xmm3, 0xB1
2104
+ pshufhw xmm3, xmm3, 0xB1 # d rotated by 16
2105
+ paddd xmm2, xmm3 # c += d
2106
+ pxor xmm1, xmm2 # b ^= c
2107
+ movdqa xmm11, xmm1
2108
+ pslld xmm1, 20
2109
+ psrld xmm11, 12
2110
+ por xmm1, xmm11 # b rotated right by 12
2111
+ paddd xmm0, xmm7
2112
+ paddd xmm0, xmm1 # a += m + b
2113
+ pxor xmm3, xmm0 # d ^= a
2114
+ movdqa xmm14, xmm3
2115
+ psrld xmm3, 8
2116
+ pslld xmm14, 24
2117
+ pxor xmm3, xmm14 # d rotated right by 8
2118
+ paddd xmm2, xmm3 # c += d
2119
+ pxor xmm1, xmm2 # b ^= c
2120
+ movdqa xmm11, xmm1
2121
+ pslld xmm1, 25
2122
+ psrld xmm11, 7
2123
+ por xmm1, xmm11 # b rotated right by 7
2124
+ pshufd xmm0, xmm0, 0x39 # inverse lane rotations: rows a, d, c return to their original lane order
2125
+ pshufd xmm3, xmm3, 0x4E
2126
+ pshufd xmm2, xmm2, 0x93
2127
+ dec al
2128
+ jz 9f # all 7 rounds done: skip the reshuffle and finish
2129
+ movdqa xmm8, xmm4 # from here to the jmp: rearrange xmm4..xmm7 into the word order for the next round
2130
+ shufps xmm8, xmm5, 214
2131
+ pshufd xmm9, xmm4, 0x0F
2132
+ pshufd xmm4, xmm8, 0x39 # next xmm4
2133
+ movdqa xmm8, xmm6
2134
+ shufps xmm8, xmm7, 250
2135
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] # pand/pand/por with complementary masks blends dwords from two registers
2136
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
2137
+ por xmm9, xmm8 # xmm9 = next xmm5 (copied at the end)
2138
+ movdqa xmm8, xmm7
2139
+ punpcklqdq xmm8, xmm5
2140
+ movdqa xmm14, xmm6
2141
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
2142
+ pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2143
+ por xmm8, xmm14
2144
+ pshufd xmm8, xmm8, 0x78 # xmm8 = next xmm6 (copied at the end)
2145
+ punpckhdq xmm5, xmm7
2146
+ punpckldq xmm6, xmm5
2147
+ pshufd xmm7, xmm6, 0x1E # next xmm7
2148
+ movdqa xmm5, xmm9
2149
+ movdqa xmm6, xmm8
2150
+ jmp 9b # back to the round label above
2151
+ 9: # finalization
2152
+ pxor xmm0, xmm2 # output words 0-3 = row a ^ row c
2153
+ pxor xmm1, xmm3 # output words 4-7 = row b ^ row d
2154
+ movups xmmword ptr [rcx], xmm0 # overwrite the 32-byte state at [rcx]
2155
+ movups xmmword ptr [rcx+0x10], xmm1
2156
+ movdqa xmm6, xmmword ptr [rsp] # restore the spilled xmm registers
2157
+ movdqa xmm7, xmmword ptr [rsp+0x10]
2158
+ movdqa xmm8, xmmword ptr [rsp+0x20]
2159
+ movdqa xmm9, xmmword ptr [rsp+0x30]
2160
+ movdqa xmm11, xmmword ptr [rsp+0x40]
2161
+ movdqa xmm14, xmmword ptr [rsp+0x50]
2162
+ movdqa xmm15, xmmword ptr [rsp+0x60]
2163
+ add rsp, 120 # release the frame
2164
+ ret
2165
+
2166
+
2167
+ .p2align 6 # start the routine on a 64-byte boundary
2168
+ _blake3_compress_xof_sse2: # leading-underscore spelling of the entry point below
2169
+ blake3_compress_xof_sse2: # one compression of the 64-byte block at [rdx] with the 32-byte state at [rcx]; writes 64 bytes to the pointer passed on the stack and leaves [rcx] unmodified
2170
+ sub rsp, 120 # 7 x 16-byte save slots (112 bytes) + 8; the movdqa slots are 16-byte aligned assuming rsp % 16 == 8 on entry
2171
+ movdqa xmmword ptr [rsp], xmm6 # xmm6-xmm15 are callee-saved in the Microsoft x64 convention, so they are spilled before use
2172
+ movdqa xmmword ptr [rsp+0x10], xmm7
2173
+ movdqa xmmword ptr [rsp+0x20], xmm8
2174
+ movdqa xmmword ptr [rsp+0x30], xmm9
2175
+ movdqa xmmword ptr [rsp+0x40], xmm11
2176
+ movdqa xmmword ptr [rsp+0x50], xmm14
2177
+ movdqa xmmword ptr [rsp+0x60], xmm15 # NOTE(review): no instruction in this routine writes xmm15; this save/restore looks redundant
2178
+ movups xmm0, xmmword ptr [rcx] # row a = state words 0-3 (unaligned load)
2179
+ movups xmm1, xmmword ptr [rcx+0x10] # row b = state words 4-7
2180
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip] # row c = the four BLAKE3_IV constants
2181
+ movzx eax, byte ptr [rsp+0xA0] # stack argument byte at entry rsp+0x28 (0xA0 = 120 + 0x28); presumably the flags byte - confirm against the C prototype
2182
+ movzx r8d, r8b # third register argument, zero-extended from its low byte; presumably the block length
2183
+ mov r10, qword ptr [rsp+0xA8] # next stack argument at entry rsp+0x30: destination pointer for the 64 output bytes
2184
+ shl rax, 32
2185
+ add r8, rax # r8 = (stack byte << 32) | r8b
2186
+ movq xmm3, r9 # low 64 bits of row d = r9 (presumably the 64-bit counter)
2187
+ movq xmm4, r8
2188
+ punpcklqdq xmm3, xmm4 # row d dwords = {r9 low, r9 high, r8b, stack byte}
2189
+ movups xmm4, xmmword ptr [rdx] # block words 0-3
2190
+ movups xmm5, xmmword ptr [rdx+0x10] # block words 4-7
2191
+ movaps xmm8, xmm4
2192
+ shufps xmm4, xmm5, 136 # xmm4 = words 0,2,4,6
2193
+ shufps xmm8, xmm5, 221 # xmm8 = words 1,3,5,7
2194
+ movaps xmm5, xmm8
2195
+ movups xmm6, xmmword ptr [rdx+0x20] # block words 8-11
2196
+ movups xmm7, xmmword ptr [rdx+0x30] # block words 12-15
2197
+ movaps xmm8, xmm6
2198
+ shufps xmm6, xmm7, 136 # words 8,10,12,14
2199
+ pshufd xmm6, xmm6, 0x93 # lane rotation: xmm6 = words 14,8,10,12
2200
+ shufps xmm8, xmm7, 221 # words 9,11,13,15
2201
+ pshufd xmm7, xmm8, 0x93 # xmm7 = words 15,9,11,13
2202
+ mov al, 7 # al = rounds remaining
2203
+ 9: # top of one round; xmm4..xmm7 hold the block words in the order this round consumes them
2204
+ paddd xmm0, xmm4
2205
+ paddd xmm0, xmm1 # a += m + b
2206
+ pxor xmm3, xmm0 # d ^= a
2207
+ pshuflw xmm3, xmm3, 0xB1
2208
+ pshufhw xmm3, xmm3, 0xB1 # swap the 16-bit halves of every dword: d rotated by 16
2209
+ paddd xmm2, xmm3 # c += d
2210
+ pxor xmm1, xmm2 # b ^= c
2211
+ movdqa xmm11, xmm1
2212
+ pslld xmm1, 20
2213
+ psrld xmm11, 12
2214
+ por xmm1, xmm11 # b rotated right by 12 (shift, shift, or)
2215
+ paddd xmm0, xmm5
2216
+ paddd xmm0, xmm1 # a += m + b
2217
+ pxor xmm3, xmm0 # d ^= a
2218
+ movdqa xmm14, xmm3
2219
+ psrld xmm3, 8
2220
+ pslld xmm14, 24
2221
+ pxor xmm3, xmm14 # d rotated right by 8 (the two shifted parts share no bits, so pxor merges them)
2222
+ paddd xmm2, xmm3 # c += d
2223
+ pxor xmm1, xmm2 # b ^= c
2224
+ movdqa xmm11, xmm1
2225
+ pslld xmm1, 25
2226
+ psrld xmm11, 7
2227
+ por xmm1, xmm11 # b rotated right by 7
2228
+ pshufd xmm0, xmm0, 0x93 # rotate the lanes of rows a, d, c relative to row b so the second half of the round pairs different lanes
2229
+ pshufd xmm3, xmm3, 0x4E
2230
+ pshufd xmm2, xmm2, 0x39
2231
+ paddd xmm0, xmm6
2232
+ paddd xmm0, xmm1 # a += m + b
2233
+ pxor xmm3, xmm0 # d ^= a
2234
+ pshuflw xmm3, xmm3, 0xB1
2235
+ pshufhw xmm3, xmm3, 0xB1 # d rotated by 16
2236
+ paddd xmm2, xmm3 # c += d
2237
+ pxor xmm1, xmm2 # b ^= c
2238
+ movdqa xmm11, xmm1
2239
+ pslld xmm1, 20
2240
+ psrld xmm11, 12
2241
+ por xmm1, xmm11 # b rotated right by 12
2242
+ paddd xmm0, xmm7
2243
+ paddd xmm0, xmm1 # a += m + b
2244
+ pxor xmm3, xmm0 # d ^= a
2245
+ movdqa xmm14, xmm3
2246
+ psrld xmm3, 8
2247
+ pslld xmm14, 24
2248
+ pxor xmm3, xmm14 # d rotated right by 8
2249
+ paddd xmm2, xmm3 # c += d
2250
+ pxor xmm1, xmm2 # b ^= c
2251
+ movdqa xmm11, xmm1
2252
+ pslld xmm1, 25
2253
+ psrld xmm11, 7
2254
+ por xmm1, xmm11 # b rotated right by 7
2255
+ pshufd xmm0, xmm0, 0x39 # inverse lane rotations: rows a, d, c return to their original lane order
2256
+ pshufd xmm3, xmm3, 0x4E
2257
+ pshufd xmm2, xmm2, 0x93
2258
+ dec al
2259
+ jz 9f # all 7 rounds done: skip the reshuffle and finish
2260
+ movdqa xmm8, xmm4 # from here to the jmp: rearrange xmm4..xmm7 into the word order for the next round
2261
+ shufps xmm8, xmm5, 214
2262
+ pshufd xmm9, xmm4, 0x0F
2263
+ pshufd xmm4, xmm8, 0x39 # next xmm4
2264
+ movdqa xmm8, xmm6
2265
+ shufps xmm8, xmm7, 250
2266
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] # pand/pand/por with complementary masks blends dwords from two registers
2267
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
2268
+ por xmm9, xmm8 # xmm9 = next xmm5 (copied at the end)
2269
+ movdqa xmm8, xmm7
2270
+ punpcklqdq xmm8, xmm5
2271
+ movdqa xmm14, xmm6
2272
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
2273
+ pand xmm14, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2274
+ por xmm8, xmm14
2275
+ pshufd xmm8, xmm8, 0x78 # xmm8 = next xmm6 (copied at the end)
2276
+ punpckhdq xmm5, xmm7
2277
+ punpckldq xmm6, xmm5
2278
+ pshufd xmm7, xmm6, 0x1E # next xmm7
2279
+ movdqa xmm5, xmm9
2280
+ movdqa xmm6, xmm8
2281
+ jmp 9b # back to the round label above
2282
+ 9: # finalization: 64 bytes of output instead of the 32 written by the in-place variant
2283
+ movdqu xmm4, xmmword ptr [rcx] # reload the untouched input state words 0-3
2284
+ movdqu xmm5, xmmword ptr [rcx+0x10] # and words 4-7
2285
+ pxor xmm0, xmm2 # output words 0-3 = row a ^ row c
2286
+ pxor xmm1, xmm3 # output words 4-7 = row b ^ row d
2287
+ pxor xmm2, xmm4 # output words 8-11 = row c ^ input state words 0-3
2288
+ pxor xmm3, xmm5 # output words 12-15 = row d ^ input state words 4-7
2289
+ movups xmmword ptr [r10], xmm0 # unaligned stores to the destination buffer
2290
+ movups xmmword ptr [r10+0x10], xmm1
2291
+ movups xmmword ptr [r10+0x20], xmm2
2292
+ movups xmmword ptr [r10+0x30], xmm3
2293
+ movdqa xmm6, xmmword ptr [rsp] # restore the spilled xmm registers
2294
+ movdqa xmm7, xmmword ptr [rsp+0x10]
2295
+ movdqa xmm8, xmmword ptr [rsp+0x20]
2296
+ movdqa xmm9, xmmword ptr [rsp+0x30]
2297
+ movdqa xmm11, xmmword ptr [rsp+0x40]
2298
+ movdqa xmm14, xmmword ptr [rsp+0x50]
2299
+ movdqa xmm15, xmmword ptr [rsp+0x60]
2300
+ add rsp, 120 # release the frame
2301
+ ret
2302
+
2303
+
2304
+ .section .rodata # constant tables for the routines above. NOTE(review): on PE/COFF targets the conventional read-only data section is .rdata - confirm that .rodata receives read-only flags with the toolchain in use
2305
+ .p2align 6 # 64-byte alignment; every table below is exactly 16 bytes, so each label stays 16-byte aligned as movaps and SSE memory operands require
2306
+ BLAKE3_IV: # four IV words in one vector; loaded into xmm2 by the compress routines
2307
+ .long 0x6A09E667, 0xBB67AE85
2308
+ .long 0x3C6EF372, 0xA54FF53A
2309
+ ADD0: # per-lane offsets 0..3; masked with pand in the blake3_hash_many_sse2 prologue before being added to the broadcast counter
2310
+ .long 0, 1, 2, 3
2311
+ ADD1: # per-lane step of 4; masked the same way and kept at [rsp+0x150] for the per-batch counter increment
2312
+ .long 4, 4, 4, 4
2313
+ BLAKE3_IV_0: # IV word 0 replicated in all four lanes
2314
+ .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
2315
+ BLAKE3_IV_1: # IV word 1 replicated in all four lanes
2316
+ .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
2317
+ BLAKE3_IV_2: # IV word 2 replicated in all four lanes
2318
+ .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
2319
+ BLAKE3_IV_3: # IV word 3 replicated in all four lanes
2320
+ .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
2321
+ BLAKE3_BLOCK_LEN: # the value 64 replicated in all four lanes
2322
+ .long 64, 64, 64, 64
2323
+ CMP_MSB_MASK: # sign-bit mask; XORed into both pcmpgtd operands so the signed compare orders the values as unsigned (used to detect counter wrap)
2324
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
2325
+ PBLENDW_0x33_MASK: # keeps dwords 0 and 2; the PBLENDW_* masks are used in pand/por pairs to blend dwords from two registers
2326
+ .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
2327
+ PBLENDW_0xCC_MASK: # keeps dwords 1 and 3 (complement of the 0x33 mask)
2328
+ .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
2329
+ PBLENDW_0x3F_MASK: # keeps dwords 0, 1 and 2
2330
+ .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
2331
+ PBLENDW_0xC0_MASK: # keeps dword 3 (complement of the 0x3F mask)
2332
+ .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF