digest-blake3 0.0.1 → 0.37.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,2291 @@
1
+ #if defined(__ELF__) && defined(__linux__) /* File preamble: ELF/Linux builds only for the next directive. */
2
+ .section .note.GNU-stack,"",%progbits /* Empty GNU-stack note: by GNU toolchain convention this tells the linker the object does not need an executable stack. */
3
+ #endif
4
+
5
+ #if defined(__ELF__) && defined(__CET__) && defined(__has_include) /* Only probe for <cet.h> on ELF builds with CET enabled and a preprocessor that has __has_include. */
6
+ #if __has_include(<cet.h>) /* Include the header only when it is actually available. */
7
+ #include <cet.h> /* Expected to supply _CET_ENDBR - NOTE(review): confirm against the toolchain's cet.h. */
8
+ #endif
9
+ #endif
10
+
11
+ #if !defined(_CET_ENDBR) /* Fallback when the header was not included or did not define the macro. */
12
+ #define _CET_ENDBR /* Defined empty, so each _CET_ENDBR use at a function entry expands to nothing. */
13
+ #endif
14
+
15
+ .intel_syntax noprefix /* Intel operand order (dst, src); registers are written without a % prefix. */
16
+ .global blake3_hash_many_sse2 /* Each entry point is exported twice: plain name and underscore-prefixed name. */
17
+ .global _blake3_hash_many_sse2 /* Underscore form is presumably for platforms that prefix C symbols with '_' (e.g. Mach-O) - verify. */
18
+ .global blake3_compress_in_place_sse2 /* Label for this symbol is not visible in this chunk; expected later in the file. */
19
+ .global _blake3_compress_in_place_sse2
20
+ .global blake3_compress_xof_sse2 /* Label for this symbol is not visible in this chunk; expected later in the file. */
21
+ .global _blake3_compress_xof_sse2
22
+ #ifdef __APPLE__ /* Select the code section using the spelling each platform's assembler accepts. */
23
+ .text /* Apple toolchain: plain .text directive. */
24
+ #else
25
+ .section .text /* All other targets: explicit .section form. */
26
+ #endif
27
+ .p2align 6
28
+ _blake3_hash_many_sse2:
29
+ blake3_hash_many_sse2:
30
+ _CET_ENDBR
31
+ push r15
32
+ push r14
33
+ push r13
34
+ push r12
35
+ push rbx
36
+ push rbp
37
+ mov rbp, rsp
38
+ sub rsp, 360
39
+ and rsp, 0xFFFFFFFFFFFFFFC0
40
+ neg r9d
41
+ movd xmm0, r9d
42
+ pshufd xmm0, xmm0, 0x00
43
+ movdqa xmmword ptr [rsp+0x130], xmm0
44
+ movdqa xmm1, xmm0
45
+ pand xmm1, xmmword ptr [ADD0+rip]
46
+ pand xmm0, xmmword ptr [ADD1+rip]
47
+ movdqa xmmword ptr [rsp+0x150], xmm0
48
+ movd xmm0, r8d
49
+ pshufd xmm0, xmm0, 0x00
50
+ paddd xmm0, xmm1
51
+ movdqa xmmword ptr [rsp+0x110], xmm0
52
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
53
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
54
+ pcmpgtd xmm1, xmm0
55
+ shr r8, 32
56
+ movd xmm2, r8d
57
+ pshufd xmm2, xmm2, 0x00
58
+ psubd xmm2, xmm1
59
+ movdqa xmmword ptr [rsp+0x120], xmm2
60
+ mov rbx, qword ptr [rbp+0x50]
61
+ mov r15, rdx
62
+ shl r15, 6
63
+ movzx r13d, byte ptr [rbp+0x38]
64
+ movzx r12d, byte ptr [rbp+0x48]
65
+ cmp rsi, 4
66
+ jc 3f
67
+ 2:
68
+ movdqu xmm3, xmmword ptr [rcx]
69
+ pshufd xmm0, xmm3, 0x00
70
+ pshufd xmm1, xmm3, 0x55
71
+ pshufd xmm2, xmm3, 0xAA
72
+ pshufd xmm3, xmm3, 0xFF
73
+ movdqu xmm7, xmmword ptr [rcx+0x10]
74
+ pshufd xmm4, xmm7, 0x00
75
+ pshufd xmm5, xmm7, 0x55
76
+ pshufd xmm6, xmm7, 0xAA
77
+ pshufd xmm7, xmm7, 0xFF
78
+ mov r8, qword ptr [rdi]
79
+ mov r9, qword ptr [rdi+0x8]
80
+ mov r10, qword ptr [rdi+0x10]
81
+ mov r11, qword ptr [rdi+0x18]
82
+ movzx eax, byte ptr [rbp+0x40]
83
+ or eax, r13d
84
+ xor edx, edx
85
+ 9:
86
+ mov r14d, eax
87
+ or eax, r12d
88
+ add rdx, 64
89
+ cmp rdx, r15
90
+ cmovne eax, r14d
91
+ movdqu xmm8, xmmword ptr [r8+rdx-0x40]
92
+ movdqu xmm9, xmmword ptr [r9+rdx-0x40]
93
+ movdqu xmm10, xmmword ptr [r10+rdx-0x40]
94
+ movdqu xmm11, xmmword ptr [r11+rdx-0x40]
95
+ movdqa xmm12, xmm8
96
+ punpckldq xmm8, xmm9
97
+ punpckhdq xmm12, xmm9
98
+ movdqa xmm14, xmm10
99
+ punpckldq xmm10, xmm11
100
+ punpckhdq xmm14, xmm11
101
+ movdqa xmm9, xmm8
102
+ punpcklqdq xmm8, xmm10
103
+ punpckhqdq xmm9, xmm10
104
+ movdqa xmm13, xmm12
105
+ punpcklqdq xmm12, xmm14
106
+ punpckhqdq xmm13, xmm14
107
+ movdqa xmmword ptr [rsp], xmm8
108
+ movdqa xmmword ptr [rsp+0x10], xmm9
109
+ movdqa xmmword ptr [rsp+0x20], xmm12
110
+ movdqa xmmword ptr [rsp+0x30], xmm13
111
+ movdqu xmm8, xmmword ptr [r8+rdx-0x30]
112
+ movdqu xmm9, xmmword ptr [r9+rdx-0x30]
113
+ movdqu xmm10, xmmword ptr [r10+rdx-0x30]
114
+ movdqu xmm11, xmmword ptr [r11+rdx-0x30]
115
+ movdqa xmm12, xmm8
116
+ punpckldq xmm8, xmm9
117
+ punpckhdq xmm12, xmm9
118
+ movdqa xmm14, xmm10
119
+ punpckldq xmm10, xmm11
120
+ punpckhdq xmm14, xmm11
121
+ movdqa xmm9, xmm8
122
+ punpcklqdq xmm8, xmm10
123
+ punpckhqdq xmm9, xmm10
124
+ movdqa xmm13, xmm12
125
+ punpcklqdq xmm12, xmm14
126
+ punpckhqdq xmm13, xmm14
127
+ movdqa xmmword ptr [rsp+0x40], xmm8
128
+ movdqa xmmword ptr [rsp+0x50], xmm9
129
+ movdqa xmmword ptr [rsp+0x60], xmm12
130
+ movdqa xmmword ptr [rsp+0x70], xmm13
131
+ movdqu xmm8, xmmword ptr [r8+rdx-0x20]
132
+ movdqu xmm9, xmmword ptr [r9+rdx-0x20]
133
+ movdqu xmm10, xmmword ptr [r10+rdx-0x20]
134
+ movdqu xmm11, xmmword ptr [r11+rdx-0x20]
135
+ movdqa xmm12, xmm8
136
+ punpckldq xmm8, xmm9
137
+ punpckhdq xmm12, xmm9
138
+ movdqa xmm14, xmm10
139
+ punpckldq xmm10, xmm11
140
+ punpckhdq xmm14, xmm11
141
+ movdqa xmm9, xmm8
142
+ punpcklqdq xmm8, xmm10
143
+ punpckhqdq xmm9, xmm10
144
+ movdqa xmm13, xmm12
145
+ punpcklqdq xmm12, xmm14
146
+ punpckhqdq xmm13, xmm14
147
+ movdqa xmmword ptr [rsp+0x80], xmm8
148
+ movdqa xmmword ptr [rsp+0x90], xmm9
149
+ movdqa xmmword ptr [rsp+0xA0], xmm12
150
+ movdqa xmmword ptr [rsp+0xB0], xmm13
151
+ movdqu xmm8, xmmword ptr [r8+rdx-0x10]
152
+ movdqu xmm9, xmmword ptr [r9+rdx-0x10]
153
+ movdqu xmm10, xmmword ptr [r10+rdx-0x10]
154
+ movdqu xmm11, xmmword ptr [r11+rdx-0x10]
155
+ movdqa xmm12, xmm8
156
+ punpckldq xmm8, xmm9
157
+ punpckhdq xmm12, xmm9
158
+ movdqa xmm14, xmm10
159
+ punpckldq xmm10, xmm11
160
+ punpckhdq xmm14, xmm11
161
+ movdqa xmm9, xmm8
162
+ punpcklqdq xmm8, xmm10
163
+ punpckhqdq xmm9, xmm10
164
+ movdqa xmm13, xmm12
165
+ punpcklqdq xmm12, xmm14
166
+ punpckhqdq xmm13, xmm14
167
+ movdqa xmmword ptr [rsp+0xC0], xmm8
168
+ movdqa xmmword ptr [rsp+0xD0], xmm9
169
+ movdqa xmmword ptr [rsp+0xE0], xmm12
170
+ movdqa xmmword ptr [rsp+0xF0], xmm13
171
+ movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
172
+ movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
173
+ movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
174
+ movdqa xmm12, xmmword ptr [rsp+0x110]
175
+ movdqa xmm13, xmmword ptr [rsp+0x120]
176
+ movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
177
+ movd xmm15, eax
178
+ pshufd xmm15, xmm15, 0x00
179
+ prefetcht0 [r8+rdx+0x80]
180
+ prefetcht0 [r9+rdx+0x80]
181
+ prefetcht0 [r10+rdx+0x80]
182
+ prefetcht0 [r11+rdx+0x80]
183
+ paddd xmm0, xmmword ptr [rsp]
184
+ paddd xmm1, xmmword ptr [rsp+0x20]
185
+ paddd xmm2, xmmword ptr [rsp+0x40]
186
+ paddd xmm3, xmmword ptr [rsp+0x60]
187
+ paddd xmm0, xmm4
188
+ paddd xmm1, xmm5
189
+ paddd xmm2, xmm6
190
+ paddd xmm3, xmm7
191
+ pxor xmm12, xmm0
192
+ pxor xmm13, xmm1
193
+ pxor xmm14, xmm2
194
+ pxor xmm15, xmm3
195
+ pshuflw xmm12, xmm12, 0xB1
196
+ pshufhw xmm12, xmm12, 0xB1
197
+ pshuflw xmm13, xmm13, 0xB1
198
+ pshufhw xmm13, xmm13, 0xB1
199
+ pshuflw xmm14, xmm14, 0xB1
200
+ pshufhw xmm14, xmm14, 0xB1
201
+ pshuflw xmm15, xmm15, 0xB1
202
+ pshufhw xmm15, xmm15, 0xB1
203
+ movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
204
+ paddd xmm8, xmm12
205
+ paddd xmm9, xmm13
206
+ paddd xmm10, xmm14
207
+ paddd xmm11, xmm15
208
+ pxor xmm4, xmm8
209
+ pxor xmm5, xmm9
210
+ pxor xmm6, xmm10
211
+ pxor xmm7, xmm11
212
+ movdqa xmmword ptr [rsp+0x100], xmm8
213
+ movdqa xmm8, xmm4
214
+ psrld xmm8, 12
215
+ pslld xmm4, 20
216
+ por xmm4, xmm8
217
+ movdqa xmm8, xmm5
218
+ psrld xmm8, 12
219
+ pslld xmm5, 20
220
+ por xmm5, xmm8
221
+ movdqa xmm8, xmm6
222
+ psrld xmm8, 12
223
+ pslld xmm6, 20
224
+ por xmm6, xmm8
225
+ movdqa xmm8, xmm7
226
+ psrld xmm8, 12
227
+ pslld xmm7, 20
228
+ por xmm7, xmm8
229
+ paddd xmm0, xmmword ptr [rsp+0x10]
230
+ paddd xmm1, xmmword ptr [rsp+0x30]
231
+ paddd xmm2, xmmword ptr [rsp+0x50]
232
+ paddd xmm3, xmmword ptr [rsp+0x70]
233
+ paddd xmm0, xmm4
234
+ paddd xmm1, xmm5
235
+ paddd xmm2, xmm6
236
+ paddd xmm3, xmm7
237
+ pxor xmm12, xmm0
238
+ pxor xmm13, xmm1
239
+ pxor xmm14, xmm2
240
+ pxor xmm15, xmm3
241
+ movdqa xmm8, xmm12
242
+ psrld xmm12, 8
243
+ pslld xmm8, 24
244
+ pxor xmm12, xmm8
245
+ movdqa xmm8, xmm13
246
+ psrld xmm13, 8
247
+ pslld xmm8, 24
248
+ pxor xmm13, xmm8
249
+ movdqa xmm8, xmm14
250
+ psrld xmm14, 8
251
+ pslld xmm8, 24
252
+ pxor xmm14, xmm8
253
+ movdqa xmm8, xmm15
254
+ psrld xmm15, 8
255
+ pslld xmm8, 24
256
+ pxor xmm15, xmm8
257
+ movdqa xmm8, xmmword ptr [rsp+0x100]
258
+ paddd xmm8, xmm12
259
+ paddd xmm9, xmm13
260
+ paddd xmm10, xmm14
261
+ paddd xmm11, xmm15
262
+ pxor xmm4, xmm8
263
+ pxor xmm5, xmm9
264
+ pxor xmm6, xmm10
265
+ pxor xmm7, xmm11
266
+ movdqa xmmword ptr [rsp+0x100], xmm8
267
+ movdqa xmm8, xmm4
268
+ psrld xmm8, 7
269
+ pslld xmm4, 25
270
+ por xmm4, xmm8
271
+ movdqa xmm8, xmm5
272
+ psrld xmm8, 7
273
+ pslld xmm5, 25
274
+ por xmm5, xmm8
275
+ movdqa xmm8, xmm6
276
+ psrld xmm8, 7
277
+ pslld xmm6, 25
278
+ por xmm6, xmm8
279
+ movdqa xmm8, xmm7
280
+ psrld xmm8, 7
281
+ pslld xmm7, 25
282
+ por xmm7, xmm8
283
+ paddd xmm0, xmmword ptr [rsp+0x80]
284
+ paddd xmm1, xmmword ptr [rsp+0xA0]
285
+ paddd xmm2, xmmword ptr [rsp+0xC0]
286
+ paddd xmm3, xmmword ptr [rsp+0xE0]
287
+ paddd xmm0, xmm5
288
+ paddd xmm1, xmm6
289
+ paddd xmm2, xmm7
290
+ paddd xmm3, xmm4
291
+ pxor xmm15, xmm0
292
+ pxor xmm12, xmm1
293
+ pxor xmm13, xmm2
294
+ pxor xmm14, xmm3
295
+ pshuflw xmm15, xmm15, 0xB1
296
+ pshufhw xmm15, xmm15, 0xB1
297
+ pshuflw xmm12, xmm12, 0xB1
298
+ pshufhw xmm12, xmm12, 0xB1
299
+ pshuflw xmm13, xmm13, 0xB1
300
+ pshufhw xmm13, xmm13, 0xB1
301
+ pshuflw xmm14, xmm14, 0xB1
302
+ pshufhw xmm14, xmm14, 0xB1
303
+ paddd xmm10, xmm15
304
+ paddd xmm11, xmm12
305
+ movdqa xmm8, xmmword ptr [rsp+0x100]
306
+ paddd xmm8, xmm13
307
+ paddd xmm9, xmm14
308
+ pxor xmm5, xmm10
309
+ pxor xmm6, xmm11
310
+ pxor xmm7, xmm8
311
+ pxor xmm4, xmm9
312
+ movdqa xmmword ptr [rsp+0x100], xmm8
313
+ movdqa xmm8, xmm5
314
+ psrld xmm8, 12
315
+ pslld xmm5, 20
316
+ por xmm5, xmm8
317
+ movdqa xmm8, xmm6
318
+ psrld xmm8, 12
319
+ pslld xmm6, 20
320
+ por xmm6, xmm8
321
+ movdqa xmm8, xmm7
322
+ psrld xmm8, 12
323
+ pslld xmm7, 20
324
+ por xmm7, xmm8
325
+ movdqa xmm8, xmm4
326
+ psrld xmm8, 12
327
+ pslld xmm4, 20
328
+ por xmm4, xmm8
329
+ paddd xmm0, xmmword ptr [rsp+0x90]
330
+ paddd xmm1, xmmword ptr [rsp+0xB0]
331
+ paddd xmm2, xmmword ptr [rsp+0xD0]
332
+ paddd xmm3, xmmword ptr [rsp+0xF0]
333
+ paddd xmm0, xmm5
334
+ paddd xmm1, xmm6
335
+ paddd xmm2, xmm7
336
+ paddd xmm3, xmm4
337
+ pxor xmm15, xmm0
338
+ pxor xmm12, xmm1
339
+ pxor xmm13, xmm2
340
+ pxor xmm14, xmm3
341
+ movdqa xmm8, xmm15
342
+ psrld xmm15, 8
343
+ pslld xmm8, 24
344
+ pxor xmm15, xmm8
345
+ movdqa xmm8, xmm12
346
+ psrld xmm12, 8
347
+ pslld xmm8, 24
348
+ pxor xmm12, xmm8
349
+ movdqa xmm8, xmm13
350
+ psrld xmm13, 8
351
+ pslld xmm8, 24
352
+ pxor xmm13, xmm8
353
+ movdqa xmm8, xmm14
354
+ psrld xmm14, 8
355
+ pslld xmm8, 24
356
+ pxor xmm14, xmm8
357
+ paddd xmm10, xmm15
358
+ paddd xmm11, xmm12
359
+ movdqa xmm8, xmmword ptr [rsp+0x100]
360
+ paddd xmm8, xmm13
361
+ paddd xmm9, xmm14
362
+ pxor xmm5, xmm10
363
+ pxor xmm6, xmm11
364
+ pxor xmm7, xmm8
365
+ pxor xmm4, xmm9
366
+ movdqa xmmword ptr [rsp+0x100], xmm8
367
+ movdqa xmm8, xmm5
368
+ psrld xmm8, 7
369
+ pslld xmm5, 25
370
+ por xmm5, xmm8
371
+ movdqa xmm8, xmm6
372
+ psrld xmm8, 7
373
+ pslld xmm6, 25
374
+ por xmm6, xmm8
375
+ movdqa xmm8, xmm7
376
+ psrld xmm8, 7
377
+ pslld xmm7, 25
378
+ por xmm7, xmm8
379
+ movdqa xmm8, xmm4
380
+ psrld xmm8, 7
381
+ pslld xmm4, 25
382
+ por xmm4, xmm8
383
+ paddd xmm0, xmmword ptr [rsp+0x20]
384
+ paddd xmm1, xmmword ptr [rsp+0x30]
385
+ paddd xmm2, xmmword ptr [rsp+0x70]
386
+ paddd xmm3, xmmword ptr [rsp+0x40]
387
+ paddd xmm0, xmm4
388
+ paddd xmm1, xmm5
389
+ paddd xmm2, xmm6
390
+ paddd xmm3, xmm7
391
+ pxor xmm12, xmm0
392
+ pxor xmm13, xmm1
393
+ pxor xmm14, xmm2
394
+ pxor xmm15, xmm3
395
+ pshuflw xmm12, xmm12, 0xB1
396
+ pshufhw xmm12, xmm12, 0xB1
397
+ pshuflw xmm13, xmm13, 0xB1
398
+ pshufhw xmm13, xmm13, 0xB1
399
+ pshuflw xmm14, xmm14, 0xB1
400
+ pshufhw xmm14, xmm14, 0xB1
401
+ pshuflw xmm15, xmm15, 0xB1
402
+ pshufhw xmm15, xmm15, 0xB1
403
+ movdqa xmm8, xmmword ptr [rsp+0x100]
404
+ paddd xmm8, xmm12
405
+ paddd xmm9, xmm13
406
+ paddd xmm10, xmm14
407
+ paddd xmm11, xmm15
408
+ pxor xmm4, xmm8
409
+ pxor xmm5, xmm9
410
+ pxor xmm6, xmm10
411
+ pxor xmm7, xmm11
412
+ movdqa xmmword ptr [rsp+0x100], xmm8
413
+ movdqa xmm8, xmm4
414
+ psrld xmm8, 12
415
+ pslld xmm4, 20
416
+ por xmm4, xmm8
417
+ movdqa xmm8, xmm5
418
+ psrld xmm8, 12
419
+ pslld xmm5, 20
420
+ por xmm5, xmm8
421
+ movdqa xmm8, xmm6
422
+ psrld xmm8, 12
423
+ pslld xmm6, 20
424
+ por xmm6, xmm8
425
+ movdqa xmm8, xmm7
426
+ psrld xmm8, 12
427
+ pslld xmm7, 20
428
+ por xmm7, xmm8
429
+ paddd xmm0, xmmword ptr [rsp+0x60]
430
+ paddd xmm1, xmmword ptr [rsp+0xA0]
431
+ paddd xmm2, xmmword ptr [rsp]
432
+ paddd xmm3, xmmword ptr [rsp+0xD0]
433
+ paddd xmm0, xmm4
434
+ paddd xmm1, xmm5
435
+ paddd xmm2, xmm6
436
+ paddd xmm3, xmm7
437
+ pxor xmm12, xmm0
438
+ pxor xmm13, xmm1
439
+ pxor xmm14, xmm2
440
+ pxor xmm15, xmm3
441
+ movdqa xmm8, xmm12
442
+ psrld xmm12, 8
443
+ pslld xmm8, 24
444
+ pxor xmm12, xmm8
445
+ movdqa xmm8, xmm13
446
+ psrld xmm13, 8
447
+ pslld xmm8, 24
448
+ pxor xmm13, xmm8
449
+ movdqa xmm8, xmm14
450
+ psrld xmm14, 8
451
+ pslld xmm8, 24
452
+ pxor xmm14, xmm8
453
+ movdqa xmm8, xmm15
454
+ psrld xmm15, 8
455
+ pslld xmm8, 24
456
+ pxor xmm15, xmm8
457
+ movdqa xmm8, xmmword ptr [rsp+0x100]
458
+ paddd xmm8, xmm12
459
+ paddd xmm9, xmm13
460
+ paddd xmm10, xmm14
461
+ paddd xmm11, xmm15
462
+ pxor xmm4, xmm8
463
+ pxor xmm5, xmm9
464
+ pxor xmm6, xmm10
465
+ pxor xmm7, xmm11
466
+ movdqa xmmword ptr [rsp+0x100], xmm8
467
+ movdqa xmm8, xmm4
468
+ psrld xmm8, 7
469
+ pslld xmm4, 25
470
+ por xmm4, xmm8
471
+ movdqa xmm8, xmm5
472
+ psrld xmm8, 7
473
+ pslld xmm5, 25
474
+ por xmm5, xmm8
475
+ movdqa xmm8, xmm6
476
+ psrld xmm8, 7
477
+ pslld xmm6, 25
478
+ por xmm6, xmm8
479
+ movdqa xmm8, xmm7
480
+ psrld xmm8, 7
481
+ pslld xmm7, 25
482
+ por xmm7, xmm8
483
+ paddd xmm0, xmmword ptr [rsp+0x10]
484
+ paddd xmm1, xmmword ptr [rsp+0xC0]
485
+ paddd xmm2, xmmword ptr [rsp+0x90]
486
+ paddd xmm3, xmmword ptr [rsp+0xF0]
487
+ paddd xmm0, xmm5
488
+ paddd xmm1, xmm6
489
+ paddd xmm2, xmm7
490
+ paddd xmm3, xmm4
491
+ pxor xmm15, xmm0
492
+ pxor xmm12, xmm1
493
+ pxor xmm13, xmm2
494
+ pxor xmm14, xmm3
495
+ pshuflw xmm15, xmm15, 0xB1
496
+ pshufhw xmm15, xmm15, 0xB1
497
+ pshuflw xmm12, xmm12, 0xB1
498
+ pshufhw xmm12, xmm12, 0xB1
499
+ pshuflw xmm13, xmm13, 0xB1
500
+ pshufhw xmm13, xmm13, 0xB1
501
+ pshuflw xmm14, xmm14, 0xB1
502
+ pshufhw xmm14, xmm14, 0xB1
503
+ paddd xmm10, xmm15
504
+ paddd xmm11, xmm12
505
+ movdqa xmm8, xmmword ptr [rsp+0x100]
506
+ paddd xmm8, xmm13
507
+ paddd xmm9, xmm14
508
+ pxor xmm5, xmm10
509
+ pxor xmm6, xmm11
510
+ pxor xmm7, xmm8
511
+ pxor xmm4, xmm9
512
+ movdqa xmmword ptr [rsp+0x100], xmm8
513
+ movdqa xmm8, xmm5
514
+ psrld xmm8, 12
515
+ pslld xmm5, 20
516
+ por xmm5, xmm8
517
+ movdqa xmm8, xmm6
518
+ psrld xmm8, 12
519
+ pslld xmm6, 20
520
+ por xmm6, xmm8
521
+ movdqa xmm8, xmm7
522
+ psrld xmm8, 12
523
+ pslld xmm7, 20
524
+ por xmm7, xmm8
525
+ movdqa xmm8, xmm4
526
+ psrld xmm8, 12
527
+ pslld xmm4, 20
528
+ por xmm4, xmm8
529
+ paddd xmm0, xmmword ptr [rsp+0xB0]
530
+ paddd xmm1, xmmword ptr [rsp+0x50]
531
+ paddd xmm2, xmmword ptr [rsp+0xE0]
532
+ paddd xmm3, xmmword ptr [rsp+0x80]
533
+ paddd xmm0, xmm5
534
+ paddd xmm1, xmm6
535
+ paddd xmm2, xmm7
536
+ paddd xmm3, xmm4
537
+ pxor xmm15, xmm0
538
+ pxor xmm12, xmm1
539
+ pxor xmm13, xmm2
540
+ pxor xmm14, xmm3
541
+ movdqa xmm8, xmm15
542
+ psrld xmm15, 8
543
+ pslld xmm8, 24
544
+ pxor xmm15, xmm8
545
+ movdqa xmm8, xmm12
546
+ psrld xmm12, 8
547
+ pslld xmm8, 24
548
+ pxor xmm12, xmm8
549
+ movdqa xmm8, xmm13
550
+ psrld xmm13, 8
551
+ pslld xmm8, 24
552
+ pxor xmm13, xmm8
553
+ movdqa xmm8, xmm14
554
+ psrld xmm14, 8
555
+ pslld xmm8, 24
556
+ pxor xmm14, xmm8
557
+ paddd xmm10, xmm15
558
+ paddd xmm11, xmm12
559
+ movdqa xmm8, xmmword ptr [rsp+0x100]
560
+ paddd xmm8, xmm13
561
+ paddd xmm9, xmm14
562
+ pxor xmm5, xmm10
563
+ pxor xmm6, xmm11
564
+ pxor xmm7, xmm8
565
+ pxor xmm4, xmm9
566
+ movdqa xmmword ptr [rsp+0x100], xmm8
567
+ movdqa xmm8, xmm5
568
+ psrld xmm8, 7
569
+ pslld xmm5, 25
570
+ por xmm5, xmm8
571
+ movdqa xmm8, xmm6
572
+ psrld xmm8, 7
573
+ pslld xmm6, 25
574
+ por xmm6, xmm8
575
+ movdqa xmm8, xmm7
576
+ psrld xmm8, 7
577
+ pslld xmm7, 25
578
+ por xmm7, xmm8
579
+ movdqa xmm8, xmm4
580
+ psrld xmm8, 7
581
+ pslld xmm4, 25
582
+ por xmm4, xmm8
583
+ paddd xmm0, xmmword ptr [rsp+0x30]
584
+ paddd xmm1, xmmword ptr [rsp+0xA0]
585
+ paddd xmm2, xmmword ptr [rsp+0xD0]
586
+ paddd xmm3, xmmword ptr [rsp+0x70]
587
+ paddd xmm0, xmm4
588
+ paddd xmm1, xmm5
589
+ paddd xmm2, xmm6
590
+ paddd xmm3, xmm7
591
+ pxor xmm12, xmm0
592
+ pxor xmm13, xmm1
593
+ pxor xmm14, xmm2
594
+ pxor xmm15, xmm3
595
+ pshuflw xmm12, xmm12, 0xB1
596
+ pshufhw xmm12, xmm12, 0xB1
597
+ pshuflw xmm13, xmm13, 0xB1
598
+ pshufhw xmm13, xmm13, 0xB1
599
+ pshuflw xmm14, xmm14, 0xB1
600
+ pshufhw xmm14, xmm14, 0xB1
601
+ pshuflw xmm15, xmm15, 0xB1
602
+ pshufhw xmm15, xmm15, 0xB1
603
+ movdqa xmm8, xmmword ptr [rsp+0x100]
604
+ paddd xmm8, xmm12
605
+ paddd xmm9, xmm13
606
+ paddd xmm10, xmm14
607
+ paddd xmm11, xmm15
608
+ pxor xmm4, xmm8
609
+ pxor xmm5, xmm9
610
+ pxor xmm6, xmm10
611
+ pxor xmm7, xmm11
612
+ movdqa xmmword ptr [rsp+0x100], xmm8
613
+ movdqa xmm8, xmm4
614
+ psrld xmm8, 12
615
+ pslld xmm4, 20
616
+ por xmm4, xmm8
617
+ movdqa xmm8, xmm5
618
+ psrld xmm8, 12
619
+ pslld xmm5, 20
620
+ por xmm5, xmm8
621
+ movdqa xmm8, xmm6
622
+ psrld xmm8, 12
623
+ pslld xmm6, 20
624
+ por xmm6, xmm8
625
+ movdqa xmm8, xmm7
626
+ psrld xmm8, 12
627
+ pslld xmm7, 20
628
+ por xmm7, xmm8
629
+ paddd xmm0, xmmword ptr [rsp+0x40]
630
+ paddd xmm1, xmmword ptr [rsp+0xC0]
631
+ paddd xmm2, xmmword ptr [rsp+0x20]
632
+ paddd xmm3, xmmword ptr [rsp+0xE0]
633
+ paddd xmm0, xmm4
634
+ paddd xmm1, xmm5
635
+ paddd xmm2, xmm6
636
+ paddd xmm3, xmm7
637
+ pxor xmm12, xmm0
638
+ pxor xmm13, xmm1
639
+ pxor xmm14, xmm2
640
+ pxor xmm15, xmm3
641
+ movdqa xmm8, xmm12
642
+ psrld xmm12, 8
643
+ pslld xmm8, 24
644
+ pxor xmm12, xmm8
645
+ movdqa xmm8, xmm13
646
+ psrld xmm13, 8
647
+ pslld xmm8, 24
648
+ pxor xmm13, xmm8
649
+ movdqa xmm8, xmm14
650
+ psrld xmm14, 8
651
+ pslld xmm8, 24
652
+ pxor xmm14, xmm8
653
+ movdqa xmm8, xmm15
654
+ psrld xmm15, 8
655
+ pslld xmm8, 24
656
+ pxor xmm15, xmm8
657
+ movdqa xmm8, xmmword ptr [rsp+0x100]
658
+ paddd xmm8, xmm12
659
+ paddd xmm9, xmm13
660
+ paddd xmm10, xmm14
661
+ paddd xmm11, xmm15
662
+ pxor xmm4, xmm8
663
+ pxor xmm5, xmm9
664
+ pxor xmm6, xmm10
665
+ pxor xmm7, xmm11
666
+ movdqa xmmword ptr [rsp+0x100], xmm8
667
+ movdqa xmm8, xmm4
668
+ psrld xmm8, 7
669
+ pslld xmm4, 25
670
+ por xmm4, xmm8
671
+ movdqa xmm8, xmm5
672
+ psrld xmm8, 7
673
+ pslld xmm5, 25
674
+ por xmm5, xmm8
675
+ movdqa xmm8, xmm6
676
+ psrld xmm8, 7
677
+ pslld xmm6, 25
678
+ por xmm6, xmm8
679
+ movdqa xmm8, xmm7
680
+ psrld xmm8, 7
681
+ pslld xmm7, 25
682
+ por xmm7, xmm8
683
+ paddd xmm0, xmmword ptr [rsp+0x60]
684
+ paddd xmm1, xmmword ptr [rsp+0x90]
685
+ paddd xmm2, xmmword ptr [rsp+0xB0]
686
+ paddd xmm3, xmmword ptr [rsp+0x80]
687
+ paddd xmm0, xmm5
688
+ paddd xmm1, xmm6
689
+ paddd xmm2, xmm7
690
+ paddd xmm3, xmm4
691
+ pxor xmm15, xmm0
692
+ pxor xmm12, xmm1
693
+ pxor xmm13, xmm2
694
+ pxor xmm14, xmm3
695
+ pshuflw xmm15, xmm15, 0xB1
696
+ pshufhw xmm15, xmm15, 0xB1
697
+ pshuflw xmm12, xmm12, 0xB1
698
+ pshufhw xmm12, xmm12, 0xB1
699
+ pshuflw xmm13, xmm13, 0xB1
700
+ pshufhw xmm13, xmm13, 0xB1
701
+ pshuflw xmm14, xmm14, 0xB1
702
+ pshufhw xmm14, xmm14, 0xB1
703
+ paddd xmm10, xmm15
704
+ paddd xmm11, xmm12
705
+ movdqa xmm8, xmmword ptr [rsp+0x100]
706
+ paddd xmm8, xmm13
707
+ paddd xmm9, xmm14
708
+ pxor xmm5, xmm10
709
+ pxor xmm6, xmm11
710
+ pxor xmm7, xmm8
711
+ pxor xmm4, xmm9
712
+ movdqa xmmword ptr [rsp+0x100], xmm8
713
+ movdqa xmm8, xmm5
714
+ psrld xmm8, 12
715
+ pslld xmm5, 20
716
+ por xmm5, xmm8
717
+ movdqa xmm8, xmm6
718
+ psrld xmm8, 12
719
+ pslld xmm6, 20
720
+ por xmm6, xmm8
721
+ movdqa xmm8, xmm7
722
+ psrld xmm8, 12
723
+ pslld xmm7, 20
724
+ por xmm7, xmm8
725
+ movdqa xmm8, xmm4
726
+ psrld xmm8, 12
727
+ pslld xmm4, 20
728
+ por xmm4, xmm8
729
+ paddd xmm0, xmmword ptr [rsp+0x50]
730
+ paddd xmm1, xmmword ptr [rsp]
731
+ paddd xmm2, xmmword ptr [rsp+0xF0]
732
+ paddd xmm3, xmmword ptr [rsp+0x10]
733
+ paddd xmm0, xmm5
734
+ paddd xmm1, xmm6
735
+ paddd xmm2, xmm7
736
+ paddd xmm3, xmm4
737
+ pxor xmm15, xmm0
738
+ pxor xmm12, xmm1
739
+ pxor xmm13, xmm2
740
+ pxor xmm14, xmm3
741
+ movdqa xmm8, xmm15
742
+ psrld xmm15, 8
743
+ pslld xmm8, 24
744
+ pxor xmm15, xmm8
745
+ movdqa xmm8, xmm12
746
+ psrld xmm12, 8
747
+ pslld xmm8, 24
748
+ pxor xmm12, xmm8
749
+ movdqa xmm8, xmm13
750
+ psrld xmm13, 8
751
+ pslld xmm8, 24
752
+ pxor xmm13, xmm8
753
+ movdqa xmm8, xmm14
754
+ psrld xmm14, 8
755
+ pslld xmm8, 24
756
+ pxor xmm14, xmm8
757
+ paddd xmm10, xmm15
758
+ paddd xmm11, xmm12
759
+ movdqa xmm8, xmmword ptr [rsp+0x100]
760
+ paddd xmm8, xmm13
761
+ paddd xmm9, xmm14
762
+ pxor xmm5, xmm10
763
+ pxor xmm6, xmm11
764
+ pxor xmm7, xmm8
765
+ pxor xmm4, xmm9
766
+ movdqa xmmword ptr [rsp+0x100], xmm8
767
+ movdqa xmm8, xmm5
768
+ psrld xmm8, 7
769
+ pslld xmm5, 25
770
+ por xmm5, xmm8
771
+ movdqa xmm8, xmm6
772
+ psrld xmm8, 7
773
+ pslld xmm6, 25
774
+ por xmm6, xmm8
775
+ movdqa xmm8, xmm7
776
+ psrld xmm8, 7
777
+ pslld xmm7, 25
778
+ por xmm7, xmm8
779
+ movdqa xmm8, xmm4
780
+ psrld xmm8, 7
781
+ pslld xmm4, 25
782
+ por xmm4, xmm8
783
+ paddd xmm0, xmmword ptr [rsp+0xA0]
784
+ paddd xmm1, xmmword ptr [rsp+0xC0]
785
+ paddd xmm2, xmmword ptr [rsp+0xE0]
786
+ paddd xmm3, xmmword ptr [rsp+0xD0]
787
+ paddd xmm0, xmm4
788
+ paddd xmm1, xmm5
789
+ paddd xmm2, xmm6
790
+ paddd xmm3, xmm7
791
+ pxor xmm12, xmm0
792
+ pxor xmm13, xmm1
793
+ pxor xmm14, xmm2
794
+ pxor xmm15, xmm3
795
+ pshuflw xmm12, xmm12, 0xB1
796
+ pshufhw xmm12, xmm12, 0xB1
797
+ pshuflw xmm13, xmm13, 0xB1
798
+ pshufhw xmm13, xmm13, 0xB1
799
+ pshuflw xmm14, xmm14, 0xB1
800
+ pshufhw xmm14, xmm14, 0xB1
801
+ pshuflw xmm15, xmm15, 0xB1
802
+ pshufhw xmm15, xmm15, 0xB1
803
+ movdqa xmm8, xmmword ptr [rsp+0x100]
804
+ paddd xmm8, xmm12
805
+ paddd xmm9, xmm13
806
+ paddd xmm10, xmm14
807
+ paddd xmm11, xmm15
808
+ pxor xmm4, xmm8
809
+ pxor xmm5, xmm9
810
+ pxor xmm6, xmm10
811
+ pxor xmm7, xmm11
812
+ movdqa xmmword ptr [rsp+0x100], xmm8
813
+ movdqa xmm8, xmm4
814
+ psrld xmm8, 12
815
+ pslld xmm4, 20
816
+ por xmm4, xmm8
817
+ movdqa xmm8, xmm5
818
+ psrld xmm8, 12
819
+ pslld xmm5, 20
820
+ por xmm5, xmm8
821
+ movdqa xmm8, xmm6
822
+ psrld xmm8, 12
823
+ pslld xmm6, 20
824
+ por xmm6, xmm8
825
+ movdqa xmm8, xmm7
826
+ psrld xmm8, 12
827
+ pslld xmm7, 20
828
+ por xmm7, xmm8
829
+ paddd xmm0, xmmword ptr [rsp+0x70]
830
+ paddd xmm1, xmmword ptr [rsp+0x90]
831
+ paddd xmm2, xmmword ptr [rsp+0x30]
832
+ paddd xmm3, xmmword ptr [rsp+0xF0]
833
+ paddd xmm0, xmm4
834
+ paddd xmm1, xmm5
835
+ paddd xmm2, xmm6
836
+ paddd xmm3, xmm7
837
+ pxor xmm12, xmm0
838
+ pxor xmm13, xmm1
839
+ pxor xmm14, xmm2
840
+ pxor xmm15, xmm3
841
+ movdqa xmm8, xmm12
842
+ psrld xmm12, 8
843
+ pslld xmm8, 24
844
+ pxor xmm12, xmm8
845
+ movdqa xmm8, xmm13
846
+ psrld xmm13, 8
847
+ pslld xmm8, 24
848
+ pxor xmm13, xmm8
849
+ movdqa xmm8, xmm14
850
+ psrld xmm14, 8
851
+ pslld xmm8, 24
852
+ pxor xmm14, xmm8
853
+ movdqa xmm8, xmm15
854
+ psrld xmm15, 8
855
+ pslld xmm8, 24
856
+ pxor xmm15, xmm8
857
+ movdqa xmm8, xmmword ptr [rsp+0x100]
858
+ paddd xmm8, xmm12
859
+ paddd xmm9, xmm13
860
+ paddd xmm10, xmm14
861
+ paddd xmm11, xmm15
862
+ pxor xmm4, xmm8
863
+ pxor xmm5, xmm9
864
+ pxor xmm6, xmm10
865
+ pxor xmm7, xmm11
866
+ movdqa xmmword ptr [rsp+0x100], xmm8
867
+ movdqa xmm8, xmm4
868
+ psrld xmm8, 7
869
+ pslld xmm4, 25
870
+ por xmm4, xmm8
871
+ movdqa xmm8, xmm5
872
+ psrld xmm8, 7
873
+ pslld xmm5, 25
874
+ por xmm5, xmm8
875
+ movdqa xmm8, xmm6
876
+ psrld xmm8, 7
877
+ pslld xmm6, 25
878
+ por xmm6, xmm8
879
+ movdqa xmm8, xmm7
880
+ psrld xmm8, 7
881
+ pslld xmm7, 25
882
+ por xmm7, xmm8
883
+ paddd xmm0, xmmword ptr [rsp+0x40]
884
+ paddd xmm1, xmmword ptr [rsp+0xB0]
885
+ paddd xmm2, xmmword ptr [rsp+0x50]
886
+ paddd xmm3, xmmword ptr [rsp+0x10]
887
+ paddd xmm0, xmm5
888
+ paddd xmm1, xmm6
889
+ paddd xmm2, xmm7
890
+ paddd xmm3, xmm4
891
+ pxor xmm15, xmm0
892
+ pxor xmm12, xmm1
893
+ pxor xmm13, xmm2
894
+ pxor xmm14, xmm3
895
+ pshuflw xmm15, xmm15, 0xB1
896
+ pshufhw xmm15, xmm15, 0xB1
897
+ pshuflw xmm12, xmm12, 0xB1
898
+ pshufhw xmm12, xmm12, 0xB1
899
+ pshuflw xmm13, xmm13, 0xB1
900
+ pshufhw xmm13, xmm13, 0xB1
901
+ pshuflw xmm14, xmm14, 0xB1
902
+ pshufhw xmm14, xmm14, 0xB1
903
+ paddd xmm10, xmm15
904
+ paddd xmm11, xmm12
905
+ movdqa xmm8, xmmword ptr [rsp+0x100]
906
+ paddd xmm8, xmm13
907
+ paddd xmm9, xmm14
908
+ pxor xmm5, xmm10
909
+ pxor xmm6, xmm11
910
+ pxor xmm7, xmm8
911
+ pxor xmm4, xmm9
912
+ movdqa xmmword ptr [rsp+0x100], xmm8
913
+ movdqa xmm8, xmm5
914
+ psrld xmm8, 12
915
+ pslld xmm5, 20
916
+ por xmm5, xmm8
917
+ movdqa xmm8, xmm6
918
+ psrld xmm8, 12
919
+ pslld xmm6, 20
920
+ por xmm6, xmm8
921
+ movdqa xmm8, xmm7
922
+ psrld xmm8, 12
923
+ pslld xmm7, 20
924
+ por xmm7, xmm8
925
+ movdqa xmm8, xmm4
926
+ psrld xmm8, 12
927
+ pslld xmm4, 20
928
+ por xmm4, xmm8
929
+ paddd xmm0, xmmword ptr [rsp]
930
+ paddd xmm1, xmmword ptr [rsp+0x20]
931
+ paddd xmm2, xmmword ptr [rsp+0x80]
932
+ paddd xmm3, xmmword ptr [rsp+0x60]
933
+ paddd xmm0, xmm5
934
+ paddd xmm1, xmm6
935
+ paddd xmm2, xmm7
936
+ paddd xmm3, xmm4
937
+ pxor xmm15, xmm0
938
+ pxor xmm12, xmm1
939
+ pxor xmm13, xmm2
940
+ pxor xmm14, xmm3
941
+ movdqa xmm8, xmm15
942
+ psrld xmm15, 8
943
+ pslld xmm8, 24
944
+ pxor xmm15, xmm8
945
+ movdqa xmm8, xmm12
946
+ psrld xmm12, 8
947
+ pslld xmm8, 24
948
+ pxor xmm12, xmm8
949
+ movdqa xmm8, xmm13
950
+ psrld xmm13, 8
951
+ pslld xmm8, 24
952
+ pxor xmm13, xmm8
953
+ movdqa xmm8, xmm14
954
+ psrld xmm14, 8
955
+ pslld xmm8, 24
956
+ pxor xmm14, xmm8
957
+ paddd xmm10, xmm15
958
+ paddd xmm11, xmm12
959
+ movdqa xmm8, xmmword ptr [rsp+0x100]
960
+ paddd xmm8, xmm13
961
+ paddd xmm9, xmm14
962
+ pxor xmm5, xmm10
963
+ pxor xmm6, xmm11
964
+ pxor xmm7, xmm8
965
+ pxor xmm4, xmm9
966
+ movdqa xmmword ptr [rsp+0x100], xmm8
967
+ movdqa xmm8, xmm5
968
+ psrld xmm8, 7
969
+ pslld xmm5, 25
970
+ por xmm5, xmm8
971
+ movdqa xmm8, xmm6
972
+ psrld xmm8, 7
973
+ pslld xmm6, 25
974
+ por xmm6, xmm8
975
+ movdqa xmm8, xmm7
976
+ psrld xmm8, 7
977
+ pslld xmm7, 25
978
+ por xmm7, xmm8
979
+ movdqa xmm8, xmm4
980
+ psrld xmm8, 7
981
+ pslld xmm4, 25
982
+ por xmm4, xmm8
983
+ paddd xmm0, xmmword ptr [rsp+0xC0]
984
+ paddd xmm1, xmmword ptr [rsp+0x90]
985
+ paddd xmm2, xmmword ptr [rsp+0xF0]
986
+ paddd xmm3, xmmword ptr [rsp+0xE0]
987
+ paddd xmm0, xmm4
988
+ paddd xmm1, xmm5
989
+ paddd xmm2, xmm6
990
+ paddd xmm3, xmm7
991
+ pxor xmm12, xmm0
992
+ pxor xmm13, xmm1
993
+ pxor xmm14, xmm2
994
+ pxor xmm15, xmm3
995
+ pshuflw xmm12, xmm12, 0xB1
996
+ pshufhw xmm12, xmm12, 0xB1
997
+ pshuflw xmm13, xmm13, 0xB1
998
+ pshufhw xmm13, xmm13, 0xB1
999
+ pshuflw xmm14, xmm14, 0xB1
1000
+ pshufhw xmm14, xmm14, 0xB1
1001
+ pshuflw xmm15, xmm15, 0xB1
1002
+ pshufhw xmm15, xmm15, 0xB1
1003
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1004
+ paddd xmm8, xmm12
1005
+ paddd xmm9, xmm13
1006
+ paddd xmm10, xmm14
1007
+ paddd xmm11, xmm15
1008
+ pxor xmm4, xmm8
1009
+ pxor xmm5, xmm9
1010
+ pxor xmm6, xmm10
1011
+ pxor xmm7, xmm11
1012
+ movdqa xmmword ptr [rsp+0x100], xmm8
1013
+ movdqa xmm8, xmm4
1014
+ psrld xmm8, 12
1015
+ pslld xmm4, 20
1016
+ por xmm4, xmm8
1017
+ movdqa xmm8, xmm5
1018
+ psrld xmm8, 12
1019
+ pslld xmm5, 20
1020
+ por xmm5, xmm8
1021
+ movdqa xmm8, xmm6
1022
+ psrld xmm8, 12
1023
+ pslld xmm6, 20
1024
+ por xmm6, xmm8
1025
+ movdqa xmm8, xmm7
1026
+ psrld xmm8, 12
1027
+ pslld xmm7, 20
1028
+ por xmm7, xmm8
1029
+ paddd xmm0, xmmword ptr [rsp+0xD0]
1030
+ paddd xmm1, xmmword ptr [rsp+0xB0]
1031
+ paddd xmm2, xmmword ptr [rsp+0xA0]
1032
+ paddd xmm3, xmmword ptr [rsp+0x80]
1033
+ paddd xmm0, xmm4
1034
+ paddd xmm1, xmm5
1035
+ paddd xmm2, xmm6
1036
+ paddd xmm3, xmm7
1037
+ pxor xmm12, xmm0
1038
+ pxor xmm13, xmm1
1039
+ pxor xmm14, xmm2
1040
+ pxor xmm15, xmm3
1041
+ movdqa xmm8, xmm12
1042
+ psrld xmm12, 8
1043
+ pslld xmm8, 24
1044
+ pxor xmm12, xmm8
1045
+ movdqa xmm8, xmm13
1046
+ psrld xmm13, 8
1047
+ pslld xmm8, 24
1048
+ pxor xmm13, xmm8
1049
+ movdqa xmm8, xmm14
1050
+ psrld xmm14, 8
1051
+ pslld xmm8, 24
1052
+ pxor xmm14, xmm8
1053
+ movdqa xmm8, xmm15
1054
+ psrld xmm15, 8
1055
+ pslld xmm8, 24
1056
+ pxor xmm15, xmm8
1057
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1058
+ paddd xmm8, xmm12
1059
+ paddd xmm9, xmm13
1060
+ paddd xmm10, xmm14
1061
+ paddd xmm11, xmm15
1062
+ pxor xmm4, xmm8
1063
+ pxor xmm5, xmm9
1064
+ pxor xmm6, xmm10
1065
+ pxor xmm7, xmm11
1066
+ movdqa xmmword ptr [rsp+0x100], xmm8
1067
+ movdqa xmm8, xmm4
1068
+ psrld xmm8, 7
1069
+ pslld xmm4, 25
1070
+ por xmm4, xmm8
1071
+ movdqa xmm8, xmm5
1072
+ psrld xmm8, 7
1073
+ pslld xmm5, 25
1074
+ por xmm5, xmm8
1075
+ movdqa xmm8, xmm6
1076
+ psrld xmm8, 7
1077
+ pslld xmm6, 25
1078
+ por xmm6, xmm8
1079
+ movdqa xmm8, xmm7
1080
+ psrld xmm8, 7
1081
+ pslld xmm7, 25
1082
+ por xmm7, xmm8
1083
+ paddd xmm0, xmmword ptr [rsp+0x70]
1084
+ paddd xmm1, xmmword ptr [rsp+0x50]
1085
+ paddd xmm2, xmmword ptr [rsp]
1086
+ paddd xmm3, xmmword ptr [rsp+0x60]
1087
+ paddd xmm0, xmm5
1088
+ paddd xmm1, xmm6
1089
+ paddd xmm2, xmm7
1090
+ paddd xmm3, xmm4
1091
+ pxor xmm15, xmm0
1092
+ pxor xmm12, xmm1
1093
+ pxor xmm13, xmm2
1094
+ pxor xmm14, xmm3
1095
+ pshuflw xmm15, xmm15, 0xB1
1096
+ pshufhw xmm15, xmm15, 0xB1
1097
+ pshuflw xmm12, xmm12, 0xB1
1098
+ pshufhw xmm12, xmm12, 0xB1
1099
+ pshuflw xmm13, xmm13, 0xB1
1100
+ pshufhw xmm13, xmm13, 0xB1
1101
+ pshuflw xmm14, xmm14, 0xB1
1102
+ pshufhw xmm14, xmm14, 0xB1
1103
+ paddd xmm10, xmm15
1104
+ paddd xmm11, xmm12
1105
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1106
+ paddd xmm8, xmm13
1107
+ paddd xmm9, xmm14
1108
+ pxor xmm5, xmm10
1109
+ pxor xmm6, xmm11
1110
+ pxor xmm7, xmm8
1111
+ pxor xmm4, xmm9
1112
+ movdqa xmmword ptr [rsp+0x100], xmm8
1113
+ movdqa xmm8, xmm5
1114
+ psrld xmm8, 12
1115
+ pslld xmm5, 20
1116
+ por xmm5, xmm8
1117
+ movdqa xmm8, xmm6
1118
+ psrld xmm8, 12
1119
+ pslld xmm6, 20
1120
+ por xmm6, xmm8
1121
+ movdqa xmm8, xmm7
1122
+ psrld xmm8, 12
1123
+ pslld xmm7, 20
1124
+ por xmm7, xmm8
1125
+ movdqa xmm8, xmm4
1126
+ psrld xmm8, 12
1127
+ pslld xmm4, 20
1128
+ por xmm4, xmm8
1129
+ paddd xmm0, xmmword ptr [rsp+0x20]
1130
+ paddd xmm1, xmmword ptr [rsp+0x30]
1131
+ paddd xmm2, xmmword ptr [rsp+0x10]
1132
+ paddd xmm3, xmmword ptr [rsp+0x40]
1133
+ paddd xmm0, xmm5
1134
+ paddd xmm1, xmm6
1135
+ paddd xmm2, xmm7
1136
+ paddd xmm3, xmm4
1137
+ pxor xmm15, xmm0
1138
+ pxor xmm12, xmm1
1139
+ pxor xmm13, xmm2
1140
+ pxor xmm14, xmm3
1141
+ movdqa xmm8, xmm15
1142
+ psrld xmm15, 8
1143
+ pslld xmm8, 24
1144
+ pxor xmm15, xmm8
1145
+ movdqa xmm8, xmm12
1146
+ psrld xmm12, 8
1147
+ pslld xmm8, 24
1148
+ pxor xmm12, xmm8
1149
+ movdqa xmm8, xmm13
1150
+ psrld xmm13, 8
1151
+ pslld xmm8, 24
1152
+ pxor xmm13, xmm8
1153
+ movdqa xmm8, xmm14
1154
+ psrld xmm14, 8
1155
+ pslld xmm8, 24
1156
+ pxor xmm14, xmm8
1157
+ paddd xmm10, xmm15
1158
+ paddd xmm11, xmm12
1159
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1160
+ paddd xmm8, xmm13
1161
+ paddd xmm9, xmm14
1162
+ pxor xmm5, xmm10
1163
+ pxor xmm6, xmm11
1164
+ pxor xmm7, xmm8
1165
+ pxor xmm4, xmm9
1166
+ movdqa xmmword ptr [rsp+0x100], xmm8
1167
+ movdqa xmm8, xmm5
1168
+ psrld xmm8, 7
1169
+ pslld xmm5, 25
1170
+ por xmm5, xmm8
1171
+ movdqa xmm8, xmm6
1172
+ psrld xmm8, 7
1173
+ pslld xmm6, 25
1174
+ por xmm6, xmm8
1175
+ movdqa xmm8, xmm7
1176
+ psrld xmm8, 7
1177
+ pslld xmm7, 25
1178
+ por xmm7, xmm8
1179
+ movdqa xmm8, xmm4
1180
+ psrld xmm8, 7
1181
+ pslld xmm4, 25
1182
+ por xmm4, xmm8
1183
+ paddd xmm0, xmmword ptr [rsp+0x90]
1184
+ paddd xmm1, xmmword ptr [rsp+0xB0]
1185
+ paddd xmm2, xmmword ptr [rsp+0x80]
1186
+ paddd xmm3, xmmword ptr [rsp+0xF0]
1187
+ paddd xmm0, xmm4
1188
+ paddd xmm1, xmm5
1189
+ paddd xmm2, xmm6
1190
+ paddd xmm3, xmm7
1191
+ pxor xmm12, xmm0
1192
+ pxor xmm13, xmm1
1193
+ pxor xmm14, xmm2
1194
+ pxor xmm15, xmm3
1195
+ pshuflw xmm12, xmm12, 0xB1
1196
+ pshufhw xmm12, xmm12, 0xB1
1197
+ pshuflw xmm13, xmm13, 0xB1
1198
+ pshufhw xmm13, xmm13, 0xB1
1199
+ pshuflw xmm14, xmm14, 0xB1
1200
+ pshufhw xmm14, xmm14, 0xB1
1201
+ pshuflw xmm15, xmm15, 0xB1
1202
+ pshufhw xmm15, xmm15, 0xB1
1203
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1204
+ paddd xmm8, xmm12
1205
+ paddd xmm9, xmm13
1206
+ paddd xmm10, xmm14
1207
+ paddd xmm11, xmm15
1208
+ pxor xmm4, xmm8
1209
+ pxor xmm5, xmm9
1210
+ pxor xmm6, xmm10
1211
+ pxor xmm7, xmm11
1212
+ movdqa xmmword ptr [rsp+0x100], xmm8
1213
+ movdqa xmm8, xmm4
1214
+ psrld xmm8, 12
1215
+ pslld xmm4, 20
1216
+ por xmm4, xmm8
1217
+ movdqa xmm8, xmm5
1218
+ psrld xmm8, 12
1219
+ pslld xmm5, 20
1220
+ por xmm5, xmm8
1221
+ movdqa xmm8, xmm6
1222
+ psrld xmm8, 12
1223
+ pslld xmm6, 20
1224
+ por xmm6, xmm8
1225
+ movdqa xmm8, xmm7
1226
+ psrld xmm8, 12
1227
+ pslld xmm7, 20
1228
+ por xmm7, xmm8
1229
+ paddd xmm0, xmmword ptr [rsp+0xE0]
1230
+ paddd xmm1, xmmword ptr [rsp+0x50]
1231
+ paddd xmm2, xmmword ptr [rsp+0xC0]
1232
+ paddd xmm3, xmmword ptr [rsp+0x10]
1233
+ paddd xmm0, xmm4
1234
+ paddd xmm1, xmm5
1235
+ paddd xmm2, xmm6
1236
+ paddd xmm3, xmm7
1237
+ pxor xmm12, xmm0
1238
+ pxor xmm13, xmm1
1239
+ pxor xmm14, xmm2
1240
+ pxor xmm15, xmm3
1241
+ movdqa xmm8, xmm12
1242
+ psrld xmm12, 8
1243
+ pslld xmm8, 24
1244
+ pxor xmm12, xmm8
1245
+ movdqa xmm8, xmm13
1246
+ psrld xmm13, 8
1247
+ pslld xmm8, 24
1248
+ pxor xmm13, xmm8
1249
+ movdqa xmm8, xmm14
1250
+ psrld xmm14, 8
1251
+ pslld xmm8, 24
1252
+ pxor xmm14, xmm8
1253
+ movdqa xmm8, xmm15
1254
+ psrld xmm15, 8
1255
+ pslld xmm8, 24
1256
+ pxor xmm15, xmm8
1257
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1258
+ paddd xmm8, xmm12
1259
+ paddd xmm9, xmm13
1260
+ paddd xmm10, xmm14
1261
+ paddd xmm11, xmm15
1262
+ pxor xmm4, xmm8
1263
+ pxor xmm5, xmm9
1264
+ pxor xmm6, xmm10
1265
+ pxor xmm7, xmm11
1266
+ movdqa xmmword ptr [rsp+0x100], xmm8
1267
+ movdqa xmm8, xmm4
1268
+ psrld xmm8, 7
1269
+ pslld xmm4, 25
1270
+ por xmm4, xmm8
1271
+ movdqa xmm8, xmm5
1272
+ psrld xmm8, 7
1273
+ pslld xmm5, 25
1274
+ por xmm5, xmm8
1275
+ movdqa xmm8, xmm6
1276
+ psrld xmm8, 7
1277
+ pslld xmm6, 25
1278
+ por xmm6, xmm8
1279
+ movdqa xmm8, xmm7
1280
+ psrld xmm8, 7
1281
+ pslld xmm7, 25
1282
+ por xmm7, xmm8
1283
+ paddd xmm0, xmmword ptr [rsp+0xD0]
1284
+ paddd xmm1, xmmword ptr [rsp]
1285
+ paddd xmm2, xmmword ptr [rsp+0x20]
1286
+ paddd xmm3, xmmword ptr [rsp+0x40]
1287
+ paddd xmm0, xmm5
1288
+ paddd xmm1, xmm6
1289
+ paddd xmm2, xmm7
1290
+ paddd xmm3, xmm4
1291
+ pxor xmm15, xmm0
1292
+ pxor xmm12, xmm1
1293
+ pxor xmm13, xmm2
1294
+ pxor xmm14, xmm3
1295
+ pshuflw xmm15, xmm15, 0xB1
1296
+ pshufhw xmm15, xmm15, 0xB1
1297
+ pshuflw xmm12, xmm12, 0xB1
1298
+ pshufhw xmm12, xmm12, 0xB1
1299
+ pshuflw xmm13, xmm13, 0xB1
1300
+ pshufhw xmm13, xmm13, 0xB1
1301
+ pshuflw xmm14, xmm14, 0xB1
1302
+ pshufhw xmm14, xmm14, 0xB1
1303
+ paddd xmm10, xmm15
1304
+ paddd xmm11, xmm12
1305
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1306
+ paddd xmm8, xmm13
1307
+ paddd xmm9, xmm14
1308
+ pxor xmm5, xmm10
1309
+ pxor xmm6, xmm11
1310
+ pxor xmm7, xmm8
1311
+ pxor xmm4, xmm9
1312
+ movdqa xmmword ptr [rsp+0x100], xmm8
1313
+ movdqa xmm8, xmm5
1314
+ psrld xmm8, 12
1315
+ pslld xmm5, 20
1316
+ por xmm5, xmm8
1317
+ movdqa xmm8, xmm6
1318
+ psrld xmm8, 12
1319
+ pslld xmm6, 20
1320
+ por xmm6, xmm8
1321
+ movdqa xmm8, xmm7
1322
+ psrld xmm8, 12
1323
+ pslld xmm7, 20
1324
+ por xmm7, xmm8
1325
+ movdqa xmm8, xmm4
1326
+ psrld xmm8, 12
1327
+ pslld xmm4, 20
1328
+ por xmm4, xmm8
1329
+ paddd xmm0, xmmword ptr [rsp+0x30]
1330
+ paddd xmm1, xmmword ptr [rsp+0xA0]
1331
+ paddd xmm2, xmmword ptr [rsp+0x60]
1332
+ paddd xmm3, xmmword ptr [rsp+0x70]
1333
+ paddd xmm0, xmm5
1334
+ paddd xmm1, xmm6
1335
+ paddd xmm2, xmm7
1336
+ paddd xmm3, xmm4
1337
+ pxor xmm15, xmm0
1338
+ pxor xmm12, xmm1
1339
+ pxor xmm13, xmm2
1340
+ pxor xmm14, xmm3
1341
+ movdqa xmm8, xmm15
1342
+ psrld xmm15, 8
1343
+ pslld xmm8, 24
1344
+ pxor xmm15, xmm8
1345
+ movdqa xmm8, xmm12
1346
+ psrld xmm12, 8
1347
+ pslld xmm8, 24
1348
+ pxor xmm12, xmm8
1349
+ movdqa xmm8, xmm13
1350
+ psrld xmm13, 8
1351
+ pslld xmm8, 24
1352
+ pxor xmm13, xmm8
1353
+ movdqa xmm8, xmm14
1354
+ psrld xmm14, 8
1355
+ pslld xmm8, 24
1356
+ pxor xmm14, xmm8
1357
+ paddd xmm10, xmm15
1358
+ paddd xmm11, xmm12
1359
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1360
+ paddd xmm8, xmm13
1361
+ paddd xmm9, xmm14
1362
+ pxor xmm5, xmm10
1363
+ pxor xmm6, xmm11
1364
+ pxor xmm7, xmm8
1365
+ pxor xmm4, xmm9
1366
+ movdqa xmmword ptr [rsp+0x100], xmm8
1367
+ movdqa xmm8, xmm5
1368
+ psrld xmm8, 7
1369
+ pslld xmm5, 25
1370
+ por xmm5, xmm8
1371
+ movdqa xmm8, xmm6
1372
+ psrld xmm8, 7
1373
+ pslld xmm6, 25
1374
+ por xmm6, xmm8
1375
+ movdqa xmm8, xmm7
1376
+ psrld xmm8, 7
1377
+ pslld xmm7, 25
1378
+ por xmm7, xmm8
1379
+ movdqa xmm8, xmm4
1380
+ psrld xmm8, 7
1381
+ pslld xmm4, 25
1382
+ por xmm4, xmm8
1383
+ paddd xmm0, xmmword ptr [rsp+0xB0]
1384
+ paddd xmm1, xmmword ptr [rsp+0x50]
1385
+ paddd xmm2, xmmword ptr [rsp+0x10]
1386
+ paddd xmm3, xmmword ptr [rsp+0x80]
1387
+ paddd xmm0, xmm4
1388
+ paddd xmm1, xmm5
1389
+ paddd xmm2, xmm6
1390
+ paddd xmm3, xmm7
1391
+ pxor xmm12, xmm0
1392
+ pxor xmm13, xmm1
1393
+ pxor xmm14, xmm2
1394
+ pxor xmm15, xmm3
1395
+ pshuflw xmm12, xmm12, 0xB1
1396
+ pshufhw xmm12, xmm12, 0xB1
1397
+ pshuflw xmm13, xmm13, 0xB1
1398
+ pshufhw xmm13, xmm13, 0xB1
1399
+ pshuflw xmm14, xmm14, 0xB1
1400
+ pshufhw xmm14, xmm14, 0xB1
1401
+ pshuflw xmm15, xmm15, 0xB1
1402
+ pshufhw xmm15, xmm15, 0xB1
1403
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1404
+ paddd xmm8, xmm12
1405
+ paddd xmm9, xmm13
1406
+ paddd xmm10, xmm14
1407
+ paddd xmm11, xmm15
1408
+ pxor xmm4, xmm8
1409
+ pxor xmm5, xmm9
1410
+ pxor xmm6, xmm10
1411
+ pxor xmm7, xmm11
1412
+ movdqa xmmword ptr [rsp+0x100], xmm8
1413
+ movdqa xmm8, xmm4
1414
+ psrld xmm8, 12
1415
+ pslld xmm4, 20
1416
+ por xmm4, xmm8
1417
+ movdqa xmm8, xmm5
1418
+ psrld xmm8, 12
1419
+ pslld xmm5, 20
1420
+ por xmm5, xmm8
1421
+ movdqa xmm8, xmm6
1422
+ psrld xmm8, 12
1423
+ pslld xmm6, 20
1424
+ por xmm6, xmm8
1425
+ movdqa xmm8, xmm7
1426
+ psrld xmm8, 12
1427
+ pslld xmm7, 20
1428
+ por xmm7, xmm8
1429
+ paddd xmm0, xmmword ptr [rsp+0xF0]
1430
+ paddd xmm1, xmmword ptr [rsp]
1431
+ paddd xmm2, xmmword ptr [rsp+0x90]
1432
+ paddd xmm3, xmmword ptr [rsp+0x60]
1433
+ paddd xmm0, xmm4
1434
+ paddd xmm1, xmm5
1435
+ paddd xmm2, xmm6
1436
+ paddd xmm3, xmm7
1437
+ pxor xmm12, xmm0
1438
+ pxor xmm13, xmm1
1439
+ pxor xmm14, xmm2
1440
+ pxor xmm15, xmm3
1441
+ movdqa xmm8, xmm12
1442
+ psrld xmm12, 8
1443
+ pslld xmm8, 24
1444
+ pxor xmm12, xmm8
1445
+ movdqa xmm8, xmm13
1446
+ psrld xmm13, 8
1447
+ pslld xmm8, 24
1448
+ pxor xmm13, xmm8
1449
+ movdqa xmm8, xmm14
1450
+ psrld xmm14, 8
1451
+ pslld xmm8, 24
1452
+ pxor xmm14, xmm8
1453
+ movdqa xmm8, xmm15
1454
+ psrld xmm15, 8
1455
+ pslld xmm8, 24
1456
+ pxor xmm15, xmm8
1457
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1458
+ paddd xmm8, xmm12
1459
+ paddd xmm9, xmm13
1460
+ paddd xmm10, xmm14
1461
+ paddd xmm11, xmm15
1462
+ pxor xmm4, xmm8
1463
+ pxor xmm5, xmm9
1464
+ pxor xmm6, xmm10
1465
+ pxor xmm7, xmm11
1466
+ movdqa xmmword ptr [rsp+0x100], xmm8
1467
+ movdqa xmm8, xmm4
1468
+ psrld xmm8, 7
1469
+ pslld xmm4, 25
1470
+ por xmm4, xmm8
1471
+ movdqa xmm8, xmm5
1472
+ psrld xmm8, 7
1473
+ pslld xmm5, 25
1474
+ por xmm5, xmm8
1475
+ movdqa xmm8, xmm6
1476
+ psrld xmm8, 7
1477
+ pslld xmm6, 25
1478
+ por xmm6, xmm8
1479
+ movdqa xmm8, xmm7
1480
+ psrld xmm8, 7
1481
+ pslld xmm7, 25
1482
+ por xmm7, xmm8
1483
+ paddd xmm0, xmmword ptr [rsp+0xE0]
1484
+ paddd xmm1, xmmword ptr [rsp+0x20]
1485
+ paddd xmm2, xmmword ptr [rsp+0x30]
1486
+ paddd xmm3, xmmword ptr [rsp+0x70]
1487
+ paddd xmm0, xmm5
1488
+ paddd xmm1, xmm6
1489
+ paddd xmm2, xmm7
1490
+ paddd xmm3, xmm4
1491
+ pxor xmm15, xmm0
1492
+ pxor xmm12, xmm1
1493
+ pxor xmm13, xmm2
1494
+ pxor xmm14, xmm3
1495
+ pshuflw xmm15, xmm15, 0xB1
1496
+ pshufhw xmm15, xmm15, 0xB1
1497
+ pshuflw xmm12, xmm12, 0xB1
1498
+ pshufhw xmm12, xmm12, 0xB1
1499
+ pshuflw xmm13, xmm13, 0xB1
1500
+ pshufhw xmm13, xmm13, 0xB1
1501
+ pshuflw xmm14, xmm14, 0xB1
1502
+ pshufhw xmm14, xmm14, 0xB1
1503
+ paddd xmm10, xmm15
1504
+ paddd xmm11, xmm12
1505
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1506
+ paddd xmm8, xmm13
1507
+ paddd xmm9, xmm14
1508
+ pxor xmm5, xmm10
1509
+ pxor xmm6, xmm11
1510
+ pxor xmm7, xmm8
1511
+ pxor xmm4, xmm9
1512
+ movdqa xmmword ptr [rsp+0x100], xmm8
1513
+ movdqa xmm8, xmm5
1514
+ psrld xmm8, 12
1515
+ pslld xmm5, 20
1516
+ por xmm5, xmm8
1517
+ movdqa xmm8, xmm6
1518
+ psrld xmm8, 12
1519
+ pslld xmm6, 20
1520
+ por xmm6, xmm8
1521
+ movdqa xmm8, xmm7
1522
+ psrld xmm8, 12
1523
+ pslld xmm7, 20
1524
+ por xmm7, xmm8
1525
+ movdqa xmm8, xmm4
1526
+ psrld xmm8, 12
1527
+ pslld xmm4, 20
1528
+ por xmm4, xmm8
1529
+ paddd xmm0, xmmword ptr [rsp+0xA0]
1530
+ paddd xmm1, xmmword ptr [rsp+0xC0]
1531
+ paddd xmm2, xmmword ptr [rsp+0x40]
1532
+ paddd xmm3, xmmword ptr [rsp+0xD0]
1533
+ paddd xmm0, xmm5
1534
+ paddd xmm1, xmm6
1535
+ paddd xmm2, xmm7
1536
+ paddd xmm3, xmm4
1537
+ pxor xmm15, xmm0
1538
+ pxor xmm12, xmm1
1539
+ pxor xmm13, xmm2
1540
+ pxor xmm14, xmm3
1541
+ movdqa xmm8, xmm15
1542
+ psrld xmm15, 8
1543
+ pslld xmm8, 24
1544
+ pxor xmm15, xmm8
1545
+ movdqa xmm8, xmm12
1546
+ psrld xmm12, 8
1547
+ pslld xmm8, 24
1548
+ pxor xmm12, xmm8
1549
+ movdqa xmm8, xmm13
1550
+ psrld xmm13, 8
1551
+ pslld xmm8, 24
1552
+ pxor xmm13, xmm8
1553
+ movdqa xmm8, xmm14
1554
+ psrld xmm14, 8
1555
+ pslld xmm8, 24
1556
+ pxor xmm14, xmm8
1557
+ paddd xmm10, xmm15
1558
+ paddd xmm11, xmm12
1559
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1560
+ paddd xmm8, xmm13
1561
+ paddd xmm9, xmm14
1562
+ pxor xmm5, xmm10
1563
+ pxor xmm6, xmm11
1564
+ pxor xmm7, xmm8
1565
+ pxor xmm4, xmm9
1566
+ pxor xmm0, xmm8
1567
+ pxor xmm1, xmm9
1568
+ pxor xmm2, xmm10
1569
+ pxor xmm3, xmm11
1570
+ movdqa xmm8, xmm5
1571
+ psrld xmm8, 7
1572
+ pslld xmm5, 25
1573
+ por xmm5, xmm8
1574
+ movdqa xmm8, xmm6
1575
+ psrld xmm8, 7
1576
+ pslld xmm6, 25
1577
+ por xmm6, xmm8
1578
+ movdqa xmm8, xmm7
1579
+ psrld xmm8, 7
1580
+ pslld xmm7, 25
1581
+ por xmm7, xmm8
1582
+ movdqa xmm8, xmm4
1583
+ psrld xmm8, 7
1584
+ pslld xmm4, 25
1585
+ por xmm4, xmm8
1586
+ pxor xmm4, xmm12
1587
+ pxor xmm5, xmm13
1588
+ pxor xmm6, xmm14
1589
+ pxor xmm7, xmm15
1590
+ mov eax, r13d
1591
+ jne 9b
1592
+ movdqa xmm9, xmm0
1593
+ punpckldq xmm0, xmm1
1594
+ punpckhdq xmm9, xmm1
1595
+ movdqa xmm11, xmm2
1596
+ punpckldq xmm2, xmm3
1597
+ punpckhdq xmm11, xmm3
1598
+ movdqa xmm1, xmm0
1599
+ punpcklqdq xmm0, xmm2
1600
+ punpckhqdq xmm1, xmm2
1601
+ movdqa xmm3, xmm9
1602
+ punpcklqdq xmm9, xmm11
1603
+ punpckhqdq xmm3, xmm11
1604
+ movdqu xmmword ptr [rbx], xmm0
1605
+ movdqu xmmword ptr [rbx+0x20], xmm1
1606
+ movdqu xmmword ptr [rbx+0x40], xmm9
1607
+ movdqu xmmword ptr [rbx+0x60], xmm3
1608
+ movdqa xmm9, xmm4
1609
+ punpckldq xmm4, xmm5
1610
+ punpckhdq xmm9, xmm5
1611
+ movdqa xmm11, xmm6
1612
+ punpckldq xmm6, xmm7
1613
+ punpckhdq xmm11, xmm7
1614
+ movdqa xmm5, xmm4
1615
+ punpcklqdq xmm4, xmm6
1616
+ punpckhqdq xmm5, xmm6
1617
+ movdqa xmm7, xmm9
1618
+ punpcklqdq xmm9, xmm11
1619
+ punpckhqdq xmm7, xmm11
1620
+ movdqu xmmword ptr [rbx+0x10], xmm4
1621
+ movdqu xmmword ptr [rbx+0x30], xmm5
1622
+ movdqu xmmword ptr [rbx+0x50], xmm9
1623
+ movdqu xmmword ptr [rbx+0x70], xmm7
1624
+ movdqa xmm1, xmmword ptr [rsp+0x110]
1625
+ movdqa xmm0, xmm1
1626
+ paddd xmm1, xmmword ptr [rsp+0x150]
1627
+ movdqa xmmword ptr [rsp+0x110], xmm1
1628
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1629
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1630
+ pcmpgtd xmm0, xmm1
1631
+ movdqa xmm1, xmmword ptr [rsp+0x120]
1632
+ psubd xmm1, xmm0
1633
+ movdqa xmmword ptr [rsp+0x120], xmm1
1634
+ add rbx, 128
1635
+ add rdi, 32
1636
+ sub rsi, 4
1637
+ cmp rsi, 4
1638
+ jnc 2b
1639
+ test rsi, rsi
1640
+ jnz 3f
1641
+ 4:
1642
+ mov rsp, rbp
1643
+ pop rbp
1644
+ pop rbx
1645
+ pop r12
1646
+ pop r13
1647
+ pop r14
1648
+ pop r15
1649
+ ret
1650
+ .p2align 5
1651
+ 3:
1652
+ test esi, 0x2
1653
+ je 3f
1654
+ movups xmm0, xmmword ptr [rcx]
1655
+ movups xmm1, xmmword ptr [rcx+0x10]
1656
+ movaps xmm8, xmm0
1657
+ movaps xmm9, xmm1
1658
+ movd xmm13, dword ptr [rsp+0x110]
1659
+ movd xmm14, dword ptr [rsp+0x120]
1660
+ punpckldq xmm13, xmm14
1661
+ movaps xmmword ptr [rsp], xmm13
1662
+ movd xmm14, dword ptr [rsp+0x114]
1663
+ movd xmm13, dword ptr [rsp+0x124]
1664
+ punpckldq xmm14, xmm13
1665
+ movaps xmmword ptr [rsp+0x10], xmm14
1666
+ mov r8, qword ptr [rdi]
1667
+ mov r9, qword ptr [rdi+0x8]
1668
+ movzx eax, byte ptr [rbp+0x40]
1669
+ or eax, r13d
1670
+ xor edx, edx
1671
+ 2:
1672
+ mov r14d, eax
1673
+ or eax, r12d
1674
+ add rdx, 64
1675
+ cmp rdx, r15
1676
+ cmovne eax, r14d
1677
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1678
+ movaps xmm10, xmm2
1679
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
1680
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
1681
+ movaps xmm3, xmm4
1682
+ shufps xmm4, xmm5, 136
1683
+ shufps xmm3, xmm5, 221
1684
+ movaps xmm5, xmm3
1685
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
1686
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
1687
+ movaps xmm3, xmm6
1688
+ shufps xmm6, xmm7, 136
1689
+ pshufd xmm6, xmm6, 0x93
1690
+ shufps xmm3, xmm7, 221
1691
+ pshufd xmm7, xmm3, 0x93
1692
+ movups xmm12, xmmword ptr [r9+rdx-0x40]
1693
+ movups xmm13, xmmword ptr [r9+rdx-0x30]
1694
+ movaps xmm11, xmm12
1695
+ shufps xmm12, xmm13, 136
1696
+ shufps xmm11, xmm13, 221
1697
+ movaps xmm13, xmm11
1698
+ movups xmm14, xmmword ptr [r9+rdx-0x20]
1699
+ movups xmm15, xmmword ptr [r9+rdx-0x10]
1700
+ movaps xmm11, xmm14
1701
+ shufps xmm14, xmm15, 136
1702
+ pshufd xmm14, xmm14, 0x93
1703
+ shufps xmm11, xmm15, 221
1704
+ pshufd xmm15, xmm11, 0x93
1705
+ shl rax, 0x20
1706
+ or rax, 0x40
1707
+ movd xmm3, rax
1708
+ movdqa xmmword ptr [rsp+0x20], xmm3
1709
+ movaps xmm3, xmmword ptr [rsp]
1710
+ movaps xmm11, xmmword ptr [rsp+0x10]
1711
+ punpcklqdq xmm3, xmmword ptr [rsp+0x20]
1712
+ punpcklqdq xmm11, xmmword ptr [rsp+0x20]
1713
+ mov al, 7
1714
+ 9:
1715
+ paddd xmm0, xmm4
1716
+ paddd xmm8, xmm12
1717
+ movaps xmmword ptr [rsp+0x20], xmm4
1718
+ movaps xmmword ptr [rsp+0x30], xmm12
1719
+ paddd xmm0, xmm1
1720
+ paddd xmm8, xmm9
1721
+ pxor xmm3, xmm0
1722
+ pxor xmm11, xmm8
1723
+ pshuflw xmm3, xmm3, 0xB1
1724
+ pshufhw xmm3, xmm3, 0xB1
1725
+ pshuflw xmm11, xmm11, 0xB1
1726
+ pshufhw xmm11, xmm11, 0xB1
1727
+ paddd xmm2, xmm3
1728
+ paddd xmm10, xmm11
1729
+ pxor xmm1, xmm2
1730
+ pxor xmm9, xmm10
1731
+ movdqa xmm4, xmm1
1732
+ pslld xmm1, 20
1733
+ psrld xmm4, 12
1734
+ por xmm1, xmm4
1735
+ movdqa xmm4, xmm9
1736
+ pslld xmm9, 20
1737
+ psrld xmm4, 12
1738
+ por xmm9, xmm4
1739
+ paddd xmm0, xmm5
1740
+ paddd xmm8, xmm13
1741
+ movaps xmmword ptr [rsp+0x40], xmm5
1742
+ movaps xmmword ptr [rsp+0x50], xmm13
1743
+ paddd xmm0, xmm1
1744
+ paddd xmm8, xmm9
1745
+ pxor xmm3, xmm0
1746
+ pxor xmm11, xmm8
1747
+ movdqa xmm13, xmm3
1748
+ psrld xmm3, 8
1749
+ pslld xmm13, 24
1750
+ pxor xmm3, xmm13
1751
+ movdqa xmm13, xmm11
1752
+ psrld xmm11, 8
1753
+ pslld xmm13, 24
1754
+ pxor xmm11, xmm13
1755
+ paddd xmm2, xmm3
1756
+ paddd xmm10, xmm11
1757
+ pxor xmm1, xmm2
1758
+ pxor xmm9, xmm10
1759
+ movdqa xmm4, xmm1
1760
+ pslld xmm1, 25
1761
+ psrld xmm4, 7
1762
+ por xmm1, xmm4
1763
+ movdqa xmm4, xmm9
1764
+ pslld xmm9, 25
1765
+ psrld xmm4, 7
1766
+ por xmm9, xmm4
1767
+ pshufd xmm0, xmm0, 0x93
1768
+ pshufd xmm8, xmm8, 0x93
1769
+ pshufd xmm3, xmm3, 0x4E
1770
+ pshufd xmm11, xmm11, 0x4E
1771
+ pshufd xmm2, xmm2, 0x39
1772
+ pshufd xmm10, xmm10, 0x39
1773
+ paddd xmm0, xmm6
1774
+ paddd xmm8, xmm14
1775
+ paddd xmm0, xmm1
1776
+ paddd xmm8, xmm9
1777
+ pxor xmm3, xmm0
1778
+ pxor xmm11, xmm8
1779
+ pshuflw xmm3, xmm3, 0xB1
1780
+ pshufhw xmm3, xmm3, 0xB1
1781
+ pshuflw xmm11, xmm11, 0xB1
1782
+ pshufhw xmm11, xmm11, 0xB1
1783
+ paddd xmm2, xmm3
1784
+ paddd xmm10, xmm11
1785
+ pxor xmm1, xmm2
1786
+ pxor xmm9, xmm10
1787
+ movdqa xmm4, xmm1
1788
+ pslld xmm1, 20
1789
+ psrld xmm4, 12
1790
+ por xmm1, xmm4
1791
+ movdqa xmm4, xmm9
1792
+ pslld xmm9, 20
1793
+ psrld xmm4, 12
1794
+ por xmm9, xmm4
1795
+ paddd xmm0, xmm7
1796
+ paddd xmm8, xmm15
1797
+ paddd xmm0, xmm1
1798
+ paddd xmm8, xmm9
1799
+ pxor xmm3, xmm0
1800
+ pxor xmm11, xmm8
1801
+ movdqa xmm13, xmm3
1802
+ psrld xmm3, 8
1803
+ pslld xmm13, 24
1804
+ pxor xmm3, xmm13
1805
+ movdqa xmm13, xmm11
1806
+ psrld xmm11, 8
1807
+ pslld xmm13, 24
1808
+ pxor xmm11, xmm13
1809
+ paddd xmm2, xmm3
1810
+ paddd xmm10, xmm11
1811
+ pxor xmm1, xmm2
1812
+ pxor xmm9, xmm10
1813
+ movdqa xmm4, xmm1
1814
+ pslld xmm1, 25
1815
+ psrld xmm4, 7
1816
+ por xmm1, xmm4
1817
+ movdqa xmm4, xmm9
1818
+ pslld xmm9, 25
1819
+ psrld xmm4, 7
1820
+ por xmm9, xmm4
1821
+ pshufd xmm0, xmm0, 0x39
1822
+ pshufd xmm8, xmm8, 0x39
1823
+ pshufd xmm3, xmm3, 0x4E
1824
+ pshufd xmm11, xmm11, 0x4E
1825
+ pshufd xmm2, xmm2, 0x93
1826
+ pshufd xmm10, xmm10, 0x93
1827
+ dec al
1828
+ je 9f
1829
+ movdqa xmm12, xmmword ptr [rsp+0x20]
1830
+ movdqa xmm5, xmmword ptr [rsp+0x40]
1831
+ pshufd xmm13, xmm12, 0x0F
1832
+ shufps xmm12, xmm5, 214
1833
+ pshufd xmm4, xmm12, 0x39
1834
+ movdqa xmm12, xmm6
1835
+ shufps xmm12, xmm7, 250
1836
+ pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
1837
+ pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
1838
+ por xmm13, xmm12
1839
+ movdqa xmmword ptr [rsp+0x20], xmm13
1840
+ movdqa xmm12, xmm7
1841
+ punpcklqdq xmm12, xmm5
1842
+ movdqa xmm13, xmm6
1843
+ pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
1844
+ pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
1845
+ por xmm12, xmm13
1846
+ pshufd xmm12, xmm12, 0x78
1847
+ punpckhdq xmm5, xmm7
1848
+ punpckldq xmm6, xmm5
1849
+ pshufd xmm7, xmm6, 0x1E
1850
+ movdqa xmmword ptr [rsp+0x40], xmm12
1851
+ movdqa xmm5, xmmword ptr [rsp+0x30]
1852
+ movdqa xmm13, xmmword ptr [rsp+0x50]
1853
+ pshufd xmm6, xmm5, 0x0F
1854
+ shufps xmm5, xmm13, 214
1855
+ pshufd xmm12, xmm5, 0x39
1856
+ movdqa xmm5, xmm14
1857
+ shufps xmm5, xmm15, 250
1858
+ pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
1859
+ pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
1860
+ por xmm6, xmm5
1861
+ movdqa xmm5, xmm15
1862
+ punpcklqdq xmm5, xmm13
1863
+ movdqa xmmword ptr [rsp+0x30], xmm2
1864
+ movdqa xmm2, xmm14
1865
+ pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
1866
+ pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
1867
+ por xmm5, xmm2
1868
+ movdqa xmm2, xmmword ptr [rsp+0x30]
1869
+ pshufd xmm5, xmm5, 0x78
1870
+ punpckhdq xmm13, xmm15
1871
+ punpckldq xmm14, xmm13
1872
+ pshufd xmm15, xmm14, 0x1E
1873
+ movdqa xmm13, xmm6
1874
+ movdqa xmm14, xmm5
1875
+ movdqa xmm5, xmmword ptr [rsp+0x20]
1876
+ movdqa xmm6, xmmword ptr [rsp+0x40]
1877
+ jmp 9b
1878
+ 9:
1879
+ pxor xmm0, xmm2
1880
+ pxor xmm1, xmm3
1881
+ pxor xmm8, xmm10
1882
+ pxor xmm9, xmm11
1883
+ mov eax, r13d
1884
+ cmp rdx, r15
1885
+ jne 2b
1886
+ movups xmmword ptr [rbx], xmm0
1887
+ movups xmmword ptr [rbx+0x10], xmm1
1888
+ movups xmmword ptr [rbx+0x20], xmm8
1889
+ movups xmmword ptr [rbx+0x30], xmm9
1890
+ mov eax, dword ptr [rsp+0x130]
1891
+ neg eax
1892
+ mov r10d, dword ptr [rsp+0x110+8*rax]
1893
+ mov r11d, dword ptr [rsp+0x120+8*rax]
1894
+ mov dword ptr [rsp+0x110], r10d
1895
+ mov dword ptr [rsp+0x120], r11d
1896
+ add rdi, 16
1897
+ add rbx, 64
1898
+ sub rsi, 2
1899
+ 3:
1900
+ test esi, 0x1
1901
+ je 4b
1902
+ movups xmm0, xmmword ptr [rcx]
1903
+ movups xmm1, xmmword ptr [rcx+0x10]
1904
+ movd xmm13, dword ptr [rsp+0x110]
1905
+ movd xmm14, dword ptr [rsp+0x120]
1906
+ punpckldq xmm13, xmm14
1907
+ mov r8, qword ptr [rdi]
1908
+ movzx eax, byte ptr [rbp+0x40]
1909
+ or eax, r13d
1910
+ xor edx, edx
1911
+ 2:
1912
+ mov r14d, eax
1913
+ or eax, r12d
1914
+ add rdx, 64
1915
+ cmp rdx, r15
1916
+ cmovne eax, r14d
1917
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1918
+ shl rax, 32
1919
+ or rax, 64
1920
+ movd xmm12, rax
1921
+ movdqa xmm3, xmm13
1922
+ punpcklqdq xmm3, xmm12
1923
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
1924
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
1925
+ movaps xmm8, xmm4
1926
+ shufps xmm4, xmm5, 136
1927
+ shufps xmm8, xmm5, 221
1928
+ movaps xmm5, xmm8
1929
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
1930
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
1931
+ movaps xmm8, xmm6
1932
+ shufps xmm6, xmm7, 136
1933
+ pshufd xmm6, xmm6, 0x93
1934
+ shufps xmm8, xmm7, 221
1935
+ pshufd xmm7, xmm8, 0x93
1936
+ mov al, 7
1937
+ 9:
1938
+ paddd xmm0, xmm4
1939
+ paddd xmm0, xmm1
1940
+ pxor xmm3, xmm0
1941
+ pshuflw xmm3, xmm3, 0xB1
1942
+ pshufhw xmm3, xmm3, 0xB1
1943
+ paddd xmm2, xmm3
1944
+ pxor xmm1, xmm2
1945
+ movdqa xmm11, xmm1
1946
+ pslld xmm1, 20
1947
+ psrld xmm11, 12
1948
+ por xmm1, xmm11
1949
+ paddd xmm0, xmm5
1950
+ paddd xmm0, xmm1
1951
+ pxor xmm3, xmm0
1952
+ movdqa xmm14, xmm3
1953
+ psrld xmm3, 8
1954
+ pslld xmm14, 24
1955
+ pxor xmm3, xmm14
1956
+ paddd xmm2, xmm3
1957
+ pxor xmm1, xmm2
1958
+ movdqa xmm11, xmm1
1959
+ pslld xmm1, 25
1960
+ psrld xmm11, 7
1961
+ por xmm1, xmm11
1962
+ pshufd xmm0, xmm0, 0x93
1963
+ pshufd xmm3, xmm3, 0x4E
1964
+ pshufd xmm2, xmm2, 0x39
1965
+ paddd xmm0, xmm6
1966
+ paddd xmm0, xmm1
1967
+ pxor xmm3, xmm0
1968
+ pshuflw xmm3, xmm3, 0xB1
1969
+ pshufhw xmm3, xmm3, 0xB1
1970
+ paddd xmm2, xmm3
1971
+ pxor xmm1, xmm2
1972
+ movdqa xmm11, xmm1
1973
+ pslld xmm1, 20
1974
+ psrld xmm11, 12
1975
+ por xmm1, xmm11
1976
+ paddd xmm0, xmm7
1977
+ paddd xmm0, xmm1
1978
+ pxor xmm3, xmm0
1979
+ movdqa xmm14, xmm3
1980
+ psrld xmm3, 8
1981
+ pslld xmm14, 24
1982
+ pxor xmm3, xmm14
1983
+ paddd xmm2, xmm3
1984
+ pxor xmm1, xmm2
1985
+ movdqa xmm11, xmm1
1986
+ pslld xmm1, 25
1987
+ psrld xmm11, 7
1988
+ por xmm1, xmm11
1989
+ pshufd xmm0, xmm0, 0x39
1990
+ pshufd xmm3, xmm3, 0x4E
1991
+ pshufd xmm2, xmm2, 0x93
1992
+ dec al
1993
+ jz 9f
1994
+ movdqa xmm8, xmm4
1995
+ shufps xmm8, xmm5, 214
1996
+ pshufd xmm9, xmm4, 0x0F
1997
+ pshufd xmm4, xmm8, 0x39
1998
+ movdqa xmm8, xmm6
1999
+ shufps xmm8, xmm7, 250
2000
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
2001
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
2002
+ por xmm9, xmm8
2003
+ movdqa xmm8, xmm7
2004
+ punpcklqdq xmm8, xmm5
2005
+ movdqa xmm10, xmm6
2006
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
2007
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2008
+ por xmm8, xmm10
2009
+ pshufd xmm8, xmm8, 0x78
2010
+ punpckhdq xmm5, xmm7
2011
+ punpckldq xmm6, xmm5
2012
+ pshufd xmm7, xmm6, 0x1E
2013
+ movdqa xmm5, xmm9
2014
+ movdqa xmm6, xmm8
2015
+ jmp 9b
2016
+ 9:
2017
+ pxor xmm0, xmm2
2018
+ pxor xmm1, xmm3
2019
+ mov eax, r13d
2020
+ cmp rdx, r15
2021
+ jne 2b
2022
+ movups xmmword ptr [rbx], xmm0
2023
+ movups xmmword ptr [rbx+0x10], xmm1
2024
+ jmp 4b
2025
+
2026
+ .p2align 6  /* align the entry point to a 64-byte (2^6) boundary */
2027
+ blake3_compress_in_place_sse2:  /* single-block compression: reads 32 bytes at [rdi] and 64 bytes at [rsi], writes 32 bytes back to [rdi] */
2028
+ _blake3_compress_in_place_sse2:  /* underscore alias of the same entry; inputs rdi, rsi, rdx, rcx, r8 - presumably (cv, block, block_len, counter, flags), TODO confirm against the C prototype */
2029
+ _CET_ENDBR  /* macro is defined empty at the top of the file unless <cet.h> supplied it; writes below: xmm0-xmm11, xmm14, al, rdx, r8, flags */
2030
+ movups xmm0, xmmword ptr [rdi]  /* row0 = bytes 0-15 at [rdi] (unaligned load) */
2031
+ movups xmm1, xmmword ptr [rdi+0x10]  /* row1 = bytes 16-31 at [rdi] */
2032
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]  /* row2 = the four BLAKE3_IV dwords (aligned load) */
2033
+ shl r8, 32  /* move the low 32 bits of r8 into bits 32-63 */
2034
+ add rdx, r8  /* rdx = rdx + (r8 << 32); NOTE(review): no movzx of dl/r8b here, unlike blake3_compress_xof_sse2 - confirm callers pass zero-extended values */
2035
+ movq xmm3, rcx  /* low qword of row3 = rcx */
2036
+ movq xmm4, rdx  /* xmm4 = combined rdx value (temporary) */
2037
+ punpcklqdq xmm3, xmm4  /* row3 = {rcx (low qword), rdx (high qword)} */
2038
+ movups xmm4, xmmword ptr [rsi]  /* message dwords m0-m3 */
2039
+ movups xmm5, xmmword ptr [rsi+0x10]  /* message dwords m4-m7 */
2040
+ movaps xmm8, xmm4  /* keep a copy of m0-m3 for the odd-index gather */
2041
+ shufps xmm4, xmm5, 136  /* xmm4 = {m0, m2, m4, m6} */
2042
+ shufps xmm8, xmm5, 221  /* xmm8 = {m1, m3, m5, m7} */
2043
+ movaps xmm5, xmm8  /* xmm5 = {m1, m3, m5, m7} */
2044
+ movups xmm6, xmmword ptr [rsi+0x20]  /* message dwords m8-m11 */
2045
+ movups xmm7, xmmword ptr [rsi+0x30]  /* message dwords m12-m15 */
2046
+ movaps xmm8, xmm6  /* keep a copy of m8-m11 for the odd-index gather */
2047
+ shufps xmm6, xmm7, 136  /* xmm6 = {m8, m10, m12, m14} */
2048
+ pshufd xmm6, xmm6, 0x93  /* xmm6 = {m14, m8, m10, m12} */
2049
+ shufps xmm8, xmm7, 221  /* xmm8 = {m9, m11, m13, m15} */
2050
+ pshufd xmm7, xmm8, 0x93  /* xmm7 = {m15, m9, m11, m13} */
2051
+ mov al, 7  /* al = number of rounds still to run */
2052
+ 9:  /* top of the round loop */
2053
+ paddd xmm0, xmm4  /* row0 += message vector xmm4 */
2054
+ paddd xmm0, xmm1  /* row0 += row1 */
2055
+ pxor xmm3, xmm0  /* row3 ^= row0 */
2056
+ pshuflw xmm3, xmm3, 0xB1  /* swap the 16-bit halves of each dword in the low qword ... */
2057
+ pshufhw xmm3, xmm3, 0xB1  /* ... and in the high qword: every dword of row3 is rotated by 16 bits */
2058
+ paddd xmm2, xmm3  /* row2 += row3 */
2059
+ pxor xmm1, xmm2  /* row1 ^= row2 */
2060
+ movdqa xmm11, xmm1  /* xmm11 = scratch copy for the rotate */
2061
+ pslld xmm1, 20
2062
+ psrld xmm11, 12
2063
+ por xmm1, xmm11  /* row1 = each dword rotated right by 12 bits */
2064
+ paddd xmm0, xmm5  /* row0 += message vector xmm5 */
2065
+ paddd xmm0, xmm1  /* row0 += row1 */
2066
+ pxor xmm3, xmm0  /* row3 ^= row0 */
2067
+ movdqa xmm14, xmm3  /* xmm14 = scratch copy for the rotate */
2068
+ psrld xmm3, 8
2069
+ pslld xmm14, 24
2070
+ pxor xmm3, xmm14  /* row3 = each dword rotated right by 8 bits (the two parts share no bits, so xor acts as or) */
2071
+ paddd xmm2, xmm3  /* row2 += row3 */
2072
+ pxor xmm1, xmm2  /* row1 ^= row2 */
2073
+ movdqa xmm11, xmm1
2074
+ pslld xmm1, 25
2075
+ psrld xmm11, 7
2076
+ por xmm1, xmm11  /* row1 = each dword rotated right by 7 bits */
2077
+ pshufd xmm0, xmm0, 0x93  /* lane rotate: row0 = {r0[3], r0[0], r0[1], r0[2]} */
2078
+ pshufd xmm3, xmm3, 0x4E  /* lane rotate: row3 = {r3[2], r3[3], r3[0], r3[1]} */
2079
+ pshufd xmm2, xmm2, 0x39  /* lane rotate: row2 = {r2[1], r2[2], r2[3], r2[0]} */
2080
+ paddd xmm0, xmm6  /* second half of the round: same add/xor/rotate sequence, now with message vector xmm6 */
2081
+ paddd xmm0, xmm1
2082
+ pxor xmm3, xmm0
2083
+ pshuflw xmm3, xmm3, 0xB1
2084
+ pshufhw xmm3, xmm3, 0xB1  /* row3 rotated by 16 bits */
2085
+ paddd xmm2, xmm3
2086
+ pxor xmm1, xmm2
2087
+ movdqa xmm11, xmm1
2088
+ pslld xmm1, 20
2089
+ psrld xmm11, 12
2090
+ por xmm1, xmm11  /* row1 rotated right by 12 bits */
2091
+ paddd xmm0, xmm7  /* row0 += message vector xmm7 */
2092
+ paddd xmm0, xmm1
2093
+ pxor xmm3, xmm0
2094
+ movdqa xmm14, xmm3
2095
+ psrld xmm3, 8
2096
+ pslld xmm14, 24
2097
+ pxor xmm3, xmm14  /* row3 rotated right by 8 bits */
2098
+ paddd xmm2, xmm3
2099
+ pxor xmm1, xmm2
2100
+ movdqa xmm11, xmm1
2101
+ pslld xmm1, 25
2102
+ psrld xmm11, 7
2103
+ por xmm1, xmm11  /* row1 rotated right by 7 bits */
2104
+ pshufd xmm0, xmm0, 0x39  /* undo the earlier 0x93 lane rotation of row0 */
2105
+ pshufd xmm3, xmm3, 0x4E  /* undo the row3 swap (0x4E is its own inverse) */
2106
+ pshufd xmm2, xmm2, 0x93  /* undo the earlier 0x39 lane rotation of row2 */
2107
+ dec al  /* one round finished */
2108
+ jz 9f  /* after the 7th round skip the message shuffle and finish */
2109
+ movdqa xmm8, xmm4  /* --- rebuild xmm4-xmm7 (message word order) for the next round --- */
2110
+ shufps xmm8, xmm5, 214  /* xmm8 = {xmm4[2], xmm4[1], xmm5[1], xmm5[3]} */
2111
+ pshufd xmm9, xmm4, 0x0F  /* xmm9 = {xmm4[3], xmm4[3], xmm4[0], xmm4[0]} */
2112
+ pshufd xmm4, xmm8, 0x39  /* next xmm4 = xmm8 with lanes rotated by one */
2113
+ movdqa xmm8, xmm6
2114
+ shufps xmm8, xmm7, 250  /* xmm8 = {xmm6[2], xmm6[2], xmm7[3], xmm7[3]} */
2115
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]  /* keep dwords 0 and 2 of xmm9 */
2116
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]  /* keep dwords 1 and 3 of xmm8 */
2117
+ por xmm9, xmm8  /* xmm9 = blend of the two; becomes xmm5 below */
2118
+ movdqa xmm8, xmm7
2119
+ punpcklqdq xmm8, xmm5  /* xmm8 = {xmm7[0], xmm7[1], xmm5[0], xmm5[1]} */
2120
+ movdqa xmm10, xmm6
2121
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]  /* keep dwords 0-2 of xmm8 */
2122
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]  /* keep dword 3 of the xmm6 copy */
2123
+ por xmm8, xmm10  /* blend */
2124
+ pshufd xmm8, xmm8, 0x78  /* reorder the blended dwords; becomes xmm6 below */
2125
+ punpckhdq xmm5, xmm7  /* xmm5 = {xmm5[2], xmm7[2], xmm5[3], xmm7[3]} */
2126
+ punpckldq xmm6, xmm5  /* xmm6 = {xmm6[0], xmm5[0], xmm6[1], xmm5[1]} */
2127
+ pshufd xmm7, xmm6, 0x1E  /* next xmm7 */
2128
+ movdqa xmm5, xmm9  /* next xmm5 */
2129
+ movdqa xmm6, xmm8  /* next xmm6 */
2130
+ jmp 9b  /* run the next round */
2131
+ 9:  /* all rounds done */
2132
+ pxor xmm0, xmm2  /* row0 ^= row2 */
2133
+ pxor xmm1, xmm3  /* row1 ^= row3 */
2134
+ movups xmmword ptr [rdi], xmm0  /* overwrite bytes 0-15 at [rdi] with the result */
2135
+ movups xmmword ptr [rdi+0x10], xmm1  /* overwrite bytes 16-31 at [rdi] */
2136
+ ret
2137
+
2138
+ .p2align 6  /* align the entry point to a 64-byte (2^6) boundary */
2139
+ blake3_compress_xof_sse2:  /* single-block compression with 64-byte output: reads 32 bytes at [rdi] and 64 bytes at [rsi], writes 64 bytes to [r9]; [rdi] is not modified */
2140
+ _blake3_compress_xof_sse2:  /* underscore alias of the same entry; inputs rdi, rsi, dl, rcx, r8b, r9 - presumably (cv, block, block_len, counter, flags, out), TODO confirm against the C prototype */
2141
+ _CET_ENDBR  /* macro is defined empty at the top of the file unless <cet.h> supplied it; writes below: xmm0-xmm11, xmm14, rax, rdx, flags */
2142
+ movups xmm0, xmmword ptr [rdi]  /* row0 = bytes 0-15 at [rdi] (unaligned load) */
2143
+ movups xmm1, xmmword ptr [rdi+0x10]  /* row1 = bytes 16-31 at [rdi] */
2144
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]  /* row2 = the four BLAKE3_IV dwords (aligned load) */
2145
+ movzx eax, r8b  /* rax = low byte of r8, zero-extended */
2146
+ movzx edx, dl  /* rdx = low byte of rdx, zero-extended */
2147
+ shl rax, 32  /* move the r8b value into bits 32-39 */
2148
+ add rdx, rax  /* rdx = dl + (r8b << 32) */
2149
+ movq xmm3, rcx  /* low qword of row3 = rcx */
2150
+ movq xmm4, rdx  /* xmm4 = combined rdx value (temporary) */
2151
+ punpcklqdq xmm3, xmm4  /* row3 = {rcx (low qword), rdx (high qword)} */
2152
+ movups xmm4, xmmword ptr [rsi]  /* message dwords m0-m3 */
2153
+ movups xmm5, xmmword ptr [rsi+0x10]  /* message dwords m4-m7 */
2154
+ movaps xmm8, xmm4  /* keep a copy of m0-m3 for the odd-index gather */
2155
+ shufps xmm4, xmm5, 136  /* xmm4 = {m0, m2, m4, m6} */
2156
+ shufps xmm8, xmm5, 221  /* xmm8 = {m1, m3, m5, m7} */
2157
+ movaps xmm5, xmm8  /* xmm5 = {m1, m3, m5, m7} */
2158
+ movups xmm6, xmmword ptr [rsi+0x20]  /* message dwords m8-m11 */
2159
+ movups xmm7, xmmword ptr [rsi+0x30]  /* message dwords m12-m15 */
2160
+ movaps xmm8, xmm6  /* keep a copy of m8-m11 for the odd-index gather */
2161
+ shufps xmm6, xmm7, 136  /* xmm6 = {m8, m10, m12, m14} */
2162
+ pshufd xmm6, xmm6, 0x93  /* xmm6 = {m14, m8, m10, m12} */
2163
+ shufps xmm8, xmm7, 221  /* xmm8 = {m9, m11, m13, m15} */
2164
+ pshufd xmm7, xmm8, 0x93  /* xmm7 = {m15, m9, m11, m13} */
2165
+ mov al, 7  /* al = number of rounds still to run (rax was already consumed by the add above) */
2166
+ 9:  /* top of the round loop */
2167
+ paddd xmm0, xmm4  /* row0 += message vector xmm4 */
2168
+ paddd xmm0, xmm1  /* row0 += row1 */
2169
+ pxor xmm3, xmm0  /* row3 ^= row0 */
2170
+ pshuflw xmm3, xmm3, 0xB1  /* swap the 16-bit halves of each dword in the low qword ... */
2171
+ pshufhw xmm3, xmm3, 0xB1  /* ... and in the high qword: every dword of row3 is rotated by 16 bits */
2172
+ paddd xmm2, xmm3  /* row2 += row3 */
2173
+ pxor xmm1, xmm2  /* row1 ^= row2 */
2174
+ movdqa xmm11, xmm1  /* xmm11 = scratch copy for the rotate */
2175
+ pslld xmm1, 20
2176
+ psrld xmm11, 12
2177
+ por xmm1, xmm11  /* row1 = each dword rotated right by 12 bits */
2178
+ paddd xmm0, xmm5  /* row0 += message vector xmm5 */
2179
+ paddd xmm0, xmm1  /* row0 += row1 */
2180
+ pxor xmm3, xmm0  /* row3 ^= row0 */
2181
+ movdqa xmm14, xmm3  /* xmm14 = scratch copy for the rotate */
2182
+ psrld xmm3, 8
2183
+ pslld xmm14, 24
2184
+ pxor xmm3, xmm14  /* row3 = each dword rotated right by 8 bits (the two parts share no bits, so xor acts as or) */
2185
+ paddd xmm2, xmm3  /* row2 += row3 */
2186
+ pxor xmm1, xmm2  /* row1 ^= row2 */
2187
+ movdqa xmm11, xmm1
2188
+ pslld xmm1, 25
2189
+ psrld xmm11, 7
2190
+ por xmm1, xmm11  /* row1 = each dword rotated right by 7 bits */
2191
+ pshufd xmm0, xmm0, 0x93  /* lane rotate: row0 = {r0[3], r0[0], r0[1], r0[2]} */
2192
+ pshufd xmm3, xmm3, 0x4E  /* lane rotate: row3 = {r3[2], r3[3], r3[0], r3[1]} */
2193
+ pshufd xmm2, xmm2, 0x39  /* lane rotate: row2 = {r2[1], r2[2], r2[3], r2[0]} */
2194
+ paddd xmm0, xmm6  /* second half of the round: same add/xor/rotate sequence, now with message vector xmm6 */
2195
+ paddd xmm0, xmm1
2196
+ pxor xmm3, xmm0
2197
+ pshuflw xmm3, xmm3, 0xB1
2198
+ pshufhw xmm3, xmm3, 0xB1  /* row3 rotated by 16 bits */
2199
+ paddd xmm2, xmm3
2200
+ pxor xmm1, xmm2
2201
+ movdqa xmm11, xmm1
2202
+ pslld xmm1, 20
2203
+ psrld xmm11, 12
2204
+ por xmm1, xmm11  /* row1 rotated right by 12 bits */
2205
+ paddd xmm0, xmm7  /* row0 += message vector xmm7 */
2206
+ paddd xmm0, xmm1
2207
+ pxor xmm3, xmm0
2208
+ movdqa xmm14, xmm3
2209
+ psrld xmm3, 8
2210
+ pslld xmm14, 24
2211
+ pxor xmm3, xmm14  /* row3 rotated right by 8 bits */
2212
+ paddd xmm2, xmm3
2213
+ pxor xmm1, xmm2
2214
+ movdqa xmm11, xmm1
2215
+ pslld xmm1, 25
2216
+ psrld xmm11, 7
2217
+ por xmm1, xmm11  /* row1 rotated right by 7 bits */
2218
+ pshufd xmm0, xmm0, 0x39  /* undo the earlier 0x93 lane rotation of row0 */
2219
+ pshufd xmm3, xmm3, 0x4E  /* undo the row3 swap (0x4E is its own inverse) */
2220
+ pshufd xmm2, xmm2, 0x93  /* undo the earlier 0x39 lane rotation of row2 */
2221
+ dec al  /* one round finished */
2222
+ jz 9f  /* after the 7th round skip the message shuffle and finish */
2223
+ movdqa xmm8, xmm4  /* --- rebuild xmm4-xmm7 (message word order) for the next round --- */
2224
+ shufps xmm8, xmm5, 214  /* xmm8 = {xmm4[2], xmm4[1], xmm5[1], xmm5[3]} */
2225
+ pshufd xmm9, xmm4, 0x0F  /* xmm9 = {xmm4[3], xmm4[3], xmm4[0], xmm4[0]} */
2226
+ pshufd xmm4, xmm8, 0x39  /* next xmm4 = xmm8 with lanes rotated by one */
2227
+ movdqa xmm8, xmm6
2228
+ shufps xmm8, xmm7, 250  /* xmm8 = {xmm6[2], xmm6[2], xmm7[3], xmm7[3]} */
2229
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]  /* keep dwords 0 and 2 of xmm9 */
2230
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]  /* keep dwords 1 and 3 of xmm8 */
2231
+ por xmm9, xmm8  /* xmm9 = blend of the two; becomes xmm5 below */
2232
+ movdqa xmm8, xmm7
2233
+ punpcklqdq xmm8, xmm5  /* xmm8 = {xmm7[0], xmm7[1], xmm5[0], xmm5[1]} */
2234
+ movdqa xmm10, xmm6
2235
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]  /* keep dwords 0-2 of xmm8 */
2236
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]  /* keep dword 3 of the xmm6 copy */
2237
+ por xmm8, xmm10  /* blend */
2238
+ pshufd xmm8, xmm8, 0x78  /* reorder the blended dwords; becomes xmm6 below */
2239
+ punpckhdq xmm5, xmm7  /* xmm5 = {xmm5[2], xmm7[2], xmm5[3], xmm7[3]} */
2240
+ punpckldq xmm6, xmm5  /* xmm6 = {xmm6[0], xmm5[0], xmm6[1], xmm5[1]} */
2241
+ pshufd xmm7, xmm6, 0x1E  /* next xmm7 */
2242
+ movdqa xmm5, xmm9  /* next xmm5 */
2243
+ movdqa xmm6, xmm8  /* next xmm6 */
2244
+ jmp 9b  /* run the next round */
2245
+ 9:  /* all rounds done; the message registers are free, so xmm4/xmm5 are reused */
2246
+ movdqu xmm4, xmmword ptr [rdi]  /* reload the original bytes 0-15 at [rdi] */
2247
+ movdqu xmm5, xmmword ptr [rdi+0x10]  /* reload the original bytes 16-31 at [rdi] */
2248
+ pxor xmm0, xmm2  /* out[0..15] = row0 ^ row2 */
2249
+ pxor xmm1, xmm3  /* out[16..31] = row1 ^ row3 */
2250
+ pxor xmm2, xmm4  /* out[32..47] = row2 ^ original bytes 0-15 of [rdi] */
2251
+ pxor xmm3, xmm5  /* out[48..63] = row3 ^ original bytes 16-31 of [rdi] */
2252
+ movups xmmword ptr [r9], xmm0  /* store the 64-byte result at [r9] (unaligned stores) */
2253
+ movups xmmword ptr [r9+0x10], xmm1
2254
+ movups xmmword ptr [r9+0x20], xmm2
2255
+ movups xmmword ptr [r9+0x30], xmm3
2256
+ ret
2257
+
2258
+
2259
+ #ifdef __APPLE__
2260
+ .static_data  /* data section used when __APPLE__ is defined */
2261
+ #else
2262
+ .section .rodata  /* otherwise the constants go into .rodata */
2263
+ #endif
2264
+ .p2align 6  /* 64-byte alignment; every table below is exactly 16 bytes, so each label stays 16-byte aligned for the movaps/pand/pxor memory operands that read them */
2265
+ BLAKE3_IV:  /* four dwords loaded as one 16-byte vector by the 1-way and 2-way paths and by both compress routines */
2266
+ .long 0x6A09E667, 0xBB67AE85
2267
+ .long 0x3C6EF372, 0xA54FF53A
2268
+ ADD0:  /* lane indices 0..3; blake3_hash_many_sse2 ANDs them with a broadcast mask and adds the result to the broadcast low counter word */
2269
+ .long 0, 1, 2, 3
2270
+ ADD1:  /* 4 in every lane; ANDed with the same mask, kept at [rsp+0x150] and added to the low counter words after each 4-way batch */
2271
+ .long 4, 4, 4, 4
2272
+ BLAKE3_IV_0:  /* first BLAKE3_IV dword repeated in all four lanes (not referenced in the visible part of the file; presumably used by the 4-way path) */
2273
+ .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
2274
+ BLAKE3_IV_1:  /* second BLAKE3_IV dword repeated in all four lanes */
2275
+ .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
2276
+ BLAKE3_IV_2:  /* third BLAKE3_IV dword repeated in all four lanes */
2277
+ .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
2278
+ BLAKE3_IV_3:  /* fourth BLAKE3_IV dword repeated in all four lanes */
2279
+ .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
2280
+ BLAKE3_BLOCK_LEN:  /* the value 64 in all four lanes (not referenced in the visible part of the file; presumably the block length operand of the 4-way path) */
2281
+ .long 64, 64, 64, 64
2282
+ CMP_MSB_MASK:  /* sign bit of every dword; XORed into both operands before pcmpgtd so the signed compare orders them as unsigned values (counter carry detection) */
2283
+ .long 0x80000000, 0x80000000, 0x80000000, 0x80000000
2284
+ PBLENDW_0x33_MASK:  /* pand mask keeping dwords 0 and 2; complement of PBLENDW_0xCC_MASK (pand/pand/por blend, presumably standing in for pblendw 0x33) */
2285
+ .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
2286
+ PBLENDW_0xCC_MASK:  /* pand mask keeping dwords 1 and 3 */
2287
+ .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
2288
+ PBLENDW_0x3F_MASK:  /* pand mask keeping dwords 0-2; complement of PBLENDW_0xC0_MASK */
2289
+ .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
2290
+ PBLENDW_0xC0_MASK:  /* pand mask keeping dword 3 only */
2291
+ .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF