digest-blake3 0.22.1 → 1.2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2291 @@
1
+ #if defined(__ELF__) && defined(__linux__) /* only when assembling an ELF object for Linux */
2
+ .section .note.GNU-stack,"",%progbits /* empty note section with no flags; by GNU toolchain convention this marks the object as not requiring an executable stack */
3
+ #endif
4
+
5
+ #if defined(__ELF__) && defined(__CET__) && defined(__has_include) /* ELF target, CET enabled, and the preprocessor provides __has_include */
6
+ #if __has_include(<cet.h>) /* include the header only if it is actually available */
7
+ #include <cet.h> /* presumably supplies _CET_ENDBR; the !defined(_CET_ENDBR) guard that follows covers the case where it does not */
8
+ #endif
9
+ #endif
10
+
11
+ #if !defined(_CET_ENDBR) /* nothing earlier defined the macro */
12
+ #define _CET_ENDBR /* fallback: expands to nothing, so the _CET_ENDBR placed at function entry emits no instruction */
13
+ #endif
14
+
15
+ .intel_syntax noprefix /* whole file uses Intel operand order (dst, src) with bare register names */
16
+ .global blake3_hash_many_sse2 /* each entry point is exported twice: plain name ... */
17
+ .global _blake3_hash_many_sse2 /* ... and with a leading underscore, presumably for platforms whose C symbols carry one (e.g. Mach-O) */
18
+ .global blake3_compress_in_place_sse2
19
+ .global _blake3_compress_in_place_sse2
20
+ .global blake3_compress_xof_sse2
21
+ .global _blake3_compress_xof_sse2
22
+ #ifdef __APPLE__ /* Apple toolchain: select the text section with the short directive */
23
+ .text
24
+ #else /* every other target names the section explicitly */
25
+ .section .text
26
+ #endif
27
+ .p2align 6 /* align the next label to 2^6 = 64 bytes */
28
+ _blake3_hash_many_sse2:
29
+ blake3_hash_many_sse2:
30
+ _CET_ENDBR
31
+ push r15
32
+ push r14
33
+ push r13
34
+ push r12
35
+ push rbx
36
+ push rbp
37
+ mov rbp, rsp
38
+ sub rsp, 360
39
+ and rsp, 0xFFFFFFFFFFFFFFC0
40
+ neg r9d
41
+ movd xmm0, r9d
42
+ pshufd xmm0, xmm0, 0x00
43
+ movdqa xmmword ptr [rsp+0x130], xmm0
44
+ movdqa xmm1, xmm0
45
+ pand xmm1, xmmword ptr [ADD0+rip]
46
+ pand xmm0, xmmword ptr [ADD1+rip]
47
+ movdqa xmmword ptr [rsp+0x150], xmm0
48
+ movd xmm0, r8d
49
+ pshufd xmm0, xmm0, 0x00
50
+ paddd xmm0, xmm1
51
+ movdqa xmmword ptr [rsp+0x110], xmm0
52
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
53
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
54
+ pcmpgtd xmm1, xmm0
55
+ shr r8, 32
56
+ movd xmm2, r8d
57
+ pshufd xmm2, xmm2, 0x00
58
+ psubd xmm2, xmm1
59
+ movdqa xmmword ptr [rsp+0x120], xmm2
60
+ mov rbx, qword ptr [rbp+0x50]
61
+ mov r15, rdx
62
+ shl r15, 6
63
+ movzx r13d, byte ptr [rbp+0x38]
64
+ movzx r12d, byte ptr [rbp+0x48]
65
+ cmp rsi, 4
66
+ jc 3f
67
+ 2:
68
+ movdqu xmm3, xmmword ptr [rcx]
69
+ pshufd xmm0, xmm3, 0x00
70
+ pshufd xmm1, xmm3, 0x55
71
+ pshufd xmm2, xmm3, 0xAA
72
+ pshufd xmm3, xmm3, 0xFF
73
+ movdqu xmm7, xmmword ptr [rcx+0x10]
74
+ pshufd xmm4, xmm7, 0x00
75
+ pshufd xmm5, xmm7, 0x55
76
+ pshufd xmm6, xmm7, 0xAA
77
+ pshufd xmm7, xmm7, 0xFF
78
+ mov r8, qword ptr [rdi]
79
+ mov r9, qword ptr [rdi+0x8]
80
+ mov r10, qword ptr [rdi+0x10]
81
+ mov r11, qword ptr [rdi+0x18]
82
+ movzx eax, byte ptr [rbp+0x40]
83
+ or eax, r13d
84
+ xor edx, edx
85
+ 9:
86
+ mov r14d, eax
87
+ or eax, r12d
88
+ add rdx, 64
89
+ cmp rdx, r15
90
+ cmovne eax, r14d
91
+ movdqu xmm8, xmmword ptr [r8+rdx-0x40]
92
+ movdqu xmm9, xmmword ptr [r9+rdx-0x40]
93
+ movdqu xmm10, xmmword ptr [r10+rdx-0x40]
94
+ movdqu xmm11, xmmword ptr [r11+rdx-0x40]
95
+ movdqa xmm12, xmm8
96
+ punpckldq xmm8, xmm9
97
+ punpckhdq xmm12, xmm9
98
+ movdqa xmm14, xmm10
99
+ punpckldq xmm10, xmm11
100
+ punpckhdq xmm14, xmm11
101
+ movdqa xmm9, xmm8
102
+ punpcklqdq xmm8, xmm10
103
+ punpckhqdq xmm9, xmm10
104
+ movdqa xmm13, xmm12
105
+ punpcklqdq xmm12, xmm14
106
+ punpckhqdq xmm13, xmm14
107
+ movdqa xmmword ptr [rsp], xmm8
108
+ movdqa xmmword ptr [rsp+0x10], xmm9
109
+ movdqa xmmword ptr [rsp+0x20], xmm12
110
+ movdqa xmmword ptr [rsp+0x30], xmm13
111
+ movdqu xmm8, xmmword ptr [r8+rdx-0x30]
112
+ movdqu xmm9, xmmword ptr [r9+rdx-0x30]
113
+ movdqu xmm10, xmmword ptr [r10+rdx-0x30]
114
+ movdqu xmm11, xmmword ptr [r11+rdx-0x30]
115
+ movdqa xmm12, xmm8
116
+ punpckldq xmm8, xmm9
117
+ punpckhdq xmm12, xmm9
118
+ movdqa xmm14, xmm10
119
+ punpckldq xmm10, xmm11
120
+ punpckhdq xmm14, xmm11
121
+ movdqa xmm9, xmm8
122
+ punpcklqdq xmm8, xmm10
123
+ punpckhqdq xmm9, xmm10
124
+ movdqa xmm13, xmm12
125
+ punpcklqdq xmm12, xmm14
126
+ punpckhqdq xmm13, xmm14
127
+ movdqa xmmword ptr [rsp+0x40], xmm8
128
+ movdqa xmmword ptr [rsp+0x50], xmm9
129
+ movdqa xmmword ptr [rsp+0x60], xmm12
130
+ movdqa xmmword ptr [rsp+0x70], xmm13
131
+ movdqu xmm8, xmmword ptr [r8+rdx-0x20]
132
+ movdqu xmm9, xmmword ptr [r9+rdx-0x20]
133
+ movdqu xmm10, xmmword ptr [r10+rdx-0x20]
134
+ movdqu xmm11, xmmword ptr [r11+rdx-0x20]
135
+ movdqa xmm12, xmm8
136
+ punpckldq xmm8, xmm9
137
+ punpckhdq xmm12, xmm9
138
+ movdqa xmm14, xmm10
139
+ punpckldq xmm10, xmm11
140
+ punpckhdq xmm14, xmm11
141
+ movdqa xmm9, xmm8
142
+ punpcklqdq xmm8, xmm10
143
+ punpckhqdq xmm9, xmm10
144
+ movdqa xmm13, xmm12
145
+ punpcklqdq xmm12, xmm14
146
+ punpckhqdq xmm13, xmm14
147
+ movdqa xmmword ptr [rsp+0x80], xmm8
148
+ movdqa xmmword ptr [rsp+0x90], xmm9
149
+ movdqa xmmword ptr [rsp+0xA0], xmm12
150
+ movdqa xmmword ptr [rsp+0xB0], xmm13
151
+ movdqu xmm8, xmmword ptr [r8+rdx-0x10]
152
+ movdqu xmm9, xmmword ptr [r9+rdx-0x10]
153
+ movdqu xmm10, xmmword ptr [r10+rdx-0x10]
154
+ movdqu xmm11, xmmword ptr [r11+rdx-0x10]
155
+ movdqa xmm12, xmm8
156
+ punpckldq xmm8, xmm9
157
+ punpckhdq xmm12, xmm9
158
+ movdqa xmm14, xmm10
159
+ punpckldq xmm10, xmm11
160
+ punpckhdq xmm14, xmm11
161
+ movdqa xmm9, xmm8
162
+ punpcklqdq xmm8, xmm10
163
+ punpckhqdq xmm9, xmm10
164
+ movdqa xmm13, xmm12
165
+ punpcklqdq xmm12, xmm14
166
+ punpckhqdq xmm13, xmm14
167
+ movdqa xmmword ptr [rsp+0xC0], xmm8
168
+ movdqa xmmword ptr [rsp+0xD0], xmm9
169
+ movdqa xmmword ptr [rsp+0xE0], xmm12
170
+ movdqa xmmword ptr [rsp+0xF0], xmm13
171
+ movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
172
+ movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
173
+ movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
174
+ movdqa xmm12, xmmword ptr [rsp+0x110]
175
+ movdqa xmm13, xmmword ptr [rsp+0x120]
176
+ movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
177
+ movd xmm15, eax
178
+ pshufd xmm15, xmm15, 0x00
179
+ prefetcht0 [r8+rdx+0x80]
180
+ prefetcht0 [r9+rdx+0x80]
181
+ prefetcht0 [r10+rdx+0x80]
182
+ prefetcht0 [r11+rdx+0x80]
183
+ paddd xmm0, xmmword ptr [rsp]
184
+ paddd xmm1, xmmword ptr [rsp+0x20]
185
+ paddd xmm2, xmmword ptr [rsp+0x40]
186
+ paddd xmm3, xmmword ptr [rsp+0x60]
187
+ paddd xmm0, xmm4
188
+ paddd xmm1, xmm5
189
+ paddd xmm2, xmm6
190
+ paddd xmm3, xmm7
191
+ pxor xmm12, xmm0
192
+ pxor xmm13, xmm1
193
+ pxor xmm14, xmm2
194
+ pxor xmm15, xmm3
195
+ pshuflw xmm12, xmm12, 0xB1
196
+ pshufhw xmm12, xmm12, 0xB1
197
+ pshuflw xmm13, xmm13, 0xB1
198
+ pshufhw xmm13, xmm13, 0xB1
199
+ pshuflw xmm14, xmm14, 0xB1
200
+ pshufhw xmm14, xmm14, 0xB1
201
+ pshuflw xmm15, xmm15, 0xB1
202
+ pshufhw xmm15, xmm15, 0xB1
203
+ movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
204
+ paddd xmm8, xmm12
205
+ paddd xmm9, xmm13
206
+ paddd xmm10, xmm14
207
+ paddd xmm11, xmm15
208
+ pxor xmm4, xmm8
209
+ pxor xmm5, xmm9
210
+ pxor xmm6, xmm10
211
+ pxor xmm7, xmm11
212
+ movdqa xmmword ptr [rsp+0x100], xmm8
213
+ movdqa xmm8, xmm4
214
+ psrld xmm8, 12
215
+ pslld xmm4, 20
216
+ por xmm4, xmm8
217
+ movdqa xmm8, xmm5
218
+ psrld xmm8, 12
219
+ pslld xmm5, 20
220
+ por xmm5, xmm8
221
+ movdqa xmm8, xmm6
222
+ psrld xmm8, 12
223
+ pslld xmm6, 20
224
+ por xmm6, xmm8
225
+ movdqa xmm8, xmm7
226
+ psrld xmm8, 12
227
+ pslld xmm7, 20
228
+ por xmm7, xmm8
229
+ paddd xmm0, xmmword ptr [rsp+0x10]
230
+ paddd xmm1, xmmword ptr [rsp+0x30]
231
+ paddd xmm2, xmmword ptr [rsp+0x50]
232
+ paddd xmm3, xmmword ptr [rsp+0x70]
233
+ paddd xmm0, xmm4
234
+ paddd xmm1, xmm5
235
+ paddd xmm2, xmm6
236
+ paddd xmm3, xmm7
237
+ pxor xmm12, xmm0
238
+ pxor xmm13, xmm1
239
+ pxor xmm14, xmm2
240
+ pxor xmm15, xmm3
241
+ movdqa xmm8, xmm12
242
+ psrld xmm12, 8
243
+ pslld xmm8, 24
244
+ pxor xmm12, xmm8
245
+ movdqa xmm8, xmm13
246
+ psrld xmm13, 8
247
+ pslld xmm8, 24
248
+ pxor xmm13, xmm8
249
+ movdqa xmm8, xmm14
250
+ psrld xmm14, 8
251
+ pslld xmm8, 24
252
+ pxor xmm14, xmm8
253
+ movdqa xmm8, xmm15
254
+ psrld xmm15, 8
255
+ pslld xmm8, 24
256
+ pxor xmm15, xmm8
257
+ movdqa xmm8, xmmword ptr [rsp+0x100]
258
+ paddd xmm8, xmm12
259
+ paddd xmm9, xmm13
260
+ paddd xmm10, xmm14
261
+ paddd xmm11, xmm15
262
+ pxor xmm4, xmm8
263
+ pxor xmm5, xmm9
264
+ pxor xmm6, xmm10
265
+ pxor xmm7, xmm11
266
+ movdqa xmmword ptr [rsp+0x100], xmm8
267
+ movdqa xmm8, xmm4
268
+ psrld xmm8, 7
269
+ pslld xmm4, 25
270
+ por xmm4, xmm8
271
+ movdqa xmm8, xmm5
272
+ psrld xmm8, 7
273
+ pslld xmm5, 25
274
+ por xmm5, xmm8
275
+ movdqa xmm8, xmm6
276
+ psrld xmm8, 7
277
+ pslld xmm6, 25
278
+ por xmm6, xmm8
279
+ movdqa xmm8, xmm7
280
+ psrld xmm8, 7
281
+ pslld xmm7, 25
282
+ por xmm7, xmm8
283
+ paddd xmm0, xmmword ptr [rsp+0x80]
284
+ paddd xmm1, xmmword ptr [rsp+0xA0]
285
+ paddd xmm2, xmmword ptr [rsp+0xC0]
286
+ paddd xmm3, xmmword ptr [rsp+0xE0]
287
+ paddd xmm0, xmm5
288
+ paddd xmm1, xmm6
289
+ paddd xmm2, xmm7
290
+ paddd xmm3, xmm4
291
+ pxor xmm15, xmm0
292
+ pxor xmm12, xmm1
293
+ pxor xmm13, xmm2
294
+ pxor xmm14, xmm3
295
+ pshuflw xmm15, xmm15, 0xB1
296
+ pshufhw xmm15, xmm15, 0xB1
297
+ pshuflw xmm12, xmm12, 0xB1
298
+ pshufhw xmm12, xmm12, 0xB1
299
+ pshuflw xmm13, xmm13, 0xB1
300
+ pshufhw xmm13, xmm13, 0xB1
301
+ pshuflw xmm14, xmm14, 0xB1
302
+ pshufhw xmm14, xmm14, 0xB1
303
+ paddd xmm10, xmm15
304
+ paddd xmm11, xmm12
305
+ movdqa xmm8, xmmword ptr [rsp+0x100]
306
+ paddd xmm8, xmm13
307
+ paddd xmm9, xmm14
308
+ pxor xmm5, xmm10
309
+ pxor xmm6, xmm11
310
+ pxor xmm7, xmm8
311
+ pxor xmm4, xmm9
312
+ movdqa xmmword ptr [rsp+0x100], xmm8
313
+ movdqa xmm8, xmm5
314
+ psrld xmm8, 12
315
+ pslld xmm5, 20
316
+ por xmm5, xmm8
317
+ movdqa xmm8, xmm6
318
+ psrld xmm8, 12
319
+ pslld xmm6, 20
320
+ por xmm6, xmm8
321
+ movdqa xmm8, xmm7
322
+ psrld xmm8, 12
323
+ pslld xmm7, 20
324
+ por xmm7, xmm8
325
+ movdqa xmm8, xmm4
326
+ psrld xmm8, 12
327
+ pslld xmm4, 20
328
+ por xmm4, xmm8
329
+ paddd xmm0, xmmword ptr [rsp+0x90]
330
+ paddd xmm1, xmmword ptr [rsp+0xB0]
331
+ paddd xmm2, xmmword ptr [rsp+0xD0]
332
+ paddd xmm3, xmmword ptr [rsp+0xF0]
333
+ paddd xmm0, xmm5
334
+ paddd xmm1, xmm6
335
+ paddd xmm2, xmm7
336
+ paddd xmm3, xmm4
337
+ pxor xmm15, xmm0
338
+ pxor xmm12, xmm1
339
+ pxor xmm13, xmm2
340
+ pxor xmm14, xmm3
341
+ movdqa xmm8, xmm15
342
+ psrld xmm15, 8
343
+ pslld xmm8, 24
344
+ pxor xmm15, xmm8
345
+ movdqa xmm8, xmm12
346
+ psrld xmm12, 8
347
+ pslld xmm8, 24
348
+ pxor xmm12, xmm8
349
+ movdqa xmm8, xmm13
350
+ psrld xmm13, 8
351
+ pslld xmm8, 24
352
+ pxor xmm13, xmm8
353
+ movdqa xmm8, xmm14
354
+ psrld xmm14, 8
355
+ pslld xmm8, 24
356
+ pxor xmm14, xmm8
357
+ paddd xmm10, xmm15
358
+ paddd xmm11, xmm12
359
+ movdqa xmm8, xmmword ptr [rsp+0x100]
360
+ paddd xmm8, xmm13
361
+ paddd xmm9, xmm14
362
+ pxor xmm5, xmm10
363
+ pxor xmm6, xmm11
364
+ pxor xmm7, xmm8
365
+ pxor xmm4, xmm9
366
+ movdqa xmmword ptr [rsp+0x100], xmm8
367
+ movdqa xmm8, xmm5
368
+ psrld xmm8, 7
369
+ pslld xmm5, 25
370
+ por xmm5, xmm8
371
+ movdqa xmm8, xmm6
372
+ psrld xmm8, 7
373
+ pslld xmm6, 25
374
+ por xmm6, xmm8
375
+ movdqa xmm8, xmm7
376
+ psrld xmm8, 7
377
+ pslld xmm7, 25
378
+ por xmm7, xmm8
379
+ movdqa xmm8, xmm4
380
+ psrld xmm8, 7
381
+ pslld xmm4, 25
382
+ por xmm4, xmm8
383
+ paddd xmm0, xmmword ptr [rsp+0x20]
384
+ paddd xmm1, xmmword ptr [rsp+0x30]
385
+ paddd xmm2, xmmword ptr [rsp+0x70]
386
+ paddd xmm3, xmmword ptr [rsp+0x40]
387
+ paddd xmm0, xmm4
388
+ paddd xmm1, xmm5
389
+ paddd xmm2, xmm6
390
+ paddd xmm3, xmm7
391
+ pxor xmm12, xmm0
392
+ pxor xmm13, xmm1
393
+ pxor xmm14, xmm2
394
+ pxor xmm15, xmm3
395
+ pshuflw xmm12, xmm12, 0xB1
396
+ pshufhw xmm12, xmm12, 0xB1
397
+ pshuflw xmm13, xmm13, 0xB1
398
+ pshufhw xmm13, xmm13, 0xB1
399
+ pshuflw xmm14, xmm14, 0xB1
400
+ pshufhw xmm14, xmm14, 0xB1
401
+ pshuflw xmm15, xmm15, 0xB1
402
+ pshufhw xmm15, xmm15, 0xB1
403
+ movdqa xmm8, xmmword ptr [rsp+0x100]
404
+ paddd xmm8, xmm12
405
+ paddd xmm9, xmm13
406
+ paddd xmm10, xmm14
407
+ paddd xmm11, xmm15
408
+ pxor xmm4, xmm8
409
+ pxor xmm5, xmm9
410
+ pxor xmm6, xmm10
411
+ pxor xmm7, xmm11
412
+ movdqa xmmword ptr [rsp+0x100], xmm8
413
+ movdqa xmm8, xmm4
414
+ psrld xmm8, 12
415
+ pslld xmm4, 20
416
+ por xmm4, xmm8
417
+ movdqa xmm8, xmm5
418
+ psrld xmm8, 12
419
+ pslld xmm5, 20
420
+ por xmm5, xmm8
421
+ movdqa xmm8, xmm6
422
+ psrld xmm8, 12
423
+ pslld xmm6, 20
424
+ por xmm6, xmm8
425
+ movdqa xmm8, xmm7
426
+ psrld xmm8, 12
427
+ pslld xmm7, 20
428
+ por xmm7, xmm8
429
+ paddd xmm0, xmmword ptr [rsp+0x60]
430
+ paddd xmm1, xmmword ptr [rsp+0xA0]
431
+ paddd xmm2, xmmword ptr [rsp]
432
+ paddd xmm3, xmmword ptr [rsp+0xD0]
433
+ paddd xmm0, xmm4
434
+ paddd xmm1, xmm5
435
+ paddd xmm2, xmm6
436
+ paddd xmm3, xmm7
437
+ pxor xmm12, xmm0
438
+ pxor xmm13, xmm1
439
+ pxor xmm14, xmm2
440
+ pxor xmm15, xmm3
441
+ movdqa xmm8, xmm12
442
+ psrld xmm12, 8
443
+ pslld xmm8, 24
444
+ pxor xmm12, xmm8
445
+ movdqa xmm8, xmm13
446
+ psrld xmm13, 8
447
+ pslld xmm8, 24
448
+ pxor xmm13, xmm8
449
+ movdqa xmm8, xmm14
450
+ psrld xmm14, 8
451
+ pslld xmm8, 24
452
+ pxor xmm14, xmm8
453
+ movdqa xmm8, xmm15
454
+ psrld xmm15, 8
455
+ pslld xmm8, 24
456
+ pxor xmm15, xmm8
457
+ movdqa xmm8, xmmword ptr [rsp+0x100]
458
+ paddd xmm8, xmm12
459
+ paddd xmm9, xmm13
460
+ paddd xmm10, xmm14
461
+ paddd xmm11, xmm15
462
+ pxor xmm4, xmm8
463
+ pxor xmm5, xmm9
464
+ pxor xmm6, xmm10
465
+ pxor xmm7, xmm11
466
+ movdqa xmmword ptr [rsp+0x100], xmm8
467
+ movdqa xmm8, xmm4
468
+ psrld xmm8, 7
469
+ pslld xmm4, 25
470
+ por xmm4, xmm8
471
+ movdqa xmm8, xmm5
472
+ psrld xmm8, 7
473
+ pslld xmm5, 25
474
+ por xmm5, xmm8
475
+ movdqa xmm8, xmm6
476
+ psrld xmm8, 7
477
+ pslld xmm6, 25
478
+ por xmm6, xmm8
479
+ movdqa xmm8, xmm7
480
+ psrld xmm8, 7
481
+ pslld xmm7, 25
482
+ por xmm7, xmm8
483
+ paddd xmm0, xmmword ptr [rsp+0x10]
484
+ paddd xmm1, xmmword ptr [rsp+0xC0]
485
+ paddd xmm2, xmmword ptr [rsp+0x90]
486
+ paddd xmm3, xmmword ptr [rsp+0xF0]
487
+ paddd xmm0, xmm5
488
+ paddd xmm1, xmm6
489
+ paddd xmm2, xmm7
490
+ paddd xmm3, xmm4
491
+ pxor xmm15, xmm0
492
+ pxor xmm12, xmm1
493
+ pxor xmm13, xmm2
494
+ pxor xmm14, xmm3
495
+ pshuflw xmm15, xmm15, 0xB1
496
+ pshufhw xmm15, xmm15, 0xB1
497
+ pshuflw xmm12, xmm12, 0xB1
498
+ pshufhw xmm12, xmm12, 0xB1
499
+ pshuflw xmm13, xmm13, 0xB1
500
+ pshufhw xmm13, xmm13, 0xB1
501
+ pshuflw xmm14, xmm14, 0xB1
502
+ pshufhw xmm14, xmm14, 0xB1
503
+ paddd xmm10, xmm15
504
+ paddd xmm11, xmm12
505
+ movdqa xmm8, xmmword ptr [rsp+0x100]
506
+ paddd xmm8, xmm13
507
+ paddd xmm9, xmm14
508
+ pxor xmm5, xmm10
509
+ pxor xmm6, xmm11
510
+ pxor xmm7, xmm8
511
+ pxor xmm4, xmm9
512
+ movdqa xmmword ptr [rsp+0x100], xmm8
513
+ movdqa xmm8, xmm5
514
+ psrld xmm8, 12
515
+ pslld xmm5, 20
516
+ por xmm5, xmm8
517
+ movdqa xmm8, xmm6
518
+ psrld xmm8, 12
519
+ pslld xmm6, 20
520
+ por xmm6, xmm8
521
+ movdqa xmm8, xmm7
522
+ psrld xmm8, 12
523
+ pslld xmm7, 20
524
+ por xmm7, xmm8
525
+ movdqa xmm8, xmm4
526
+ psrld xmm8, 12
527
+ pslld xmm4, 20
528
+ por xmm4, xmm8
529
+ paddd xmm0, xmmword ptr [rsp+0xB0]
530
+ paddd xmm1, xmmword ptr [rsp+0x50]
531
+ paddd xmm2, xmmword ptr [rsp+0xE0]
532
+ paddd xmm3, xmmword ptr [rsp+0x80]
533
+ paddd xmm0, xmm5
534
+ paddd xmm1, xmm6
535
+ paddd xmm2, xmm7
536
+ paddd xmm3, xmm4
537
+ pxor xmm15, xmm0
538
+ pxor xmm12, xmm1
539
+ pxor xmm13, xmm2
540
+ pxor xmm14, xmm3
541
+ movdqa xmm8, xmm15
542
+ psrld xmm15, 8
543
+ pslld xmm8, 24
544
+ pxor xmm15, xmm8
545
+ movdqa xmm8, xmm12
546
+ psrld xmm12, 8
547
+ pslld xmm8, 24
548
+ pxor xmm12, xmm8
549
+ movdqa xmm8, xmm13
550
+ psrld xmm13, 8
551
+ pslld xmm8, 24
552
+ pxor xmm13, xmm8
553
+ movdqa xmm8, xmm14
554
+ psrld xmm14, 8
555
+ pslld xmm8, 24
556
+ pxor xmm14, xmm8
557
+ paddd xmm10, xmm15
558
+ paddd xmm11, xmm12
559
+ movdqa xmm8, xmmword ptr [rsp+0x100]
560
+ paddd xmm8, xmm13
561
+ paddd xmm9, xmm14
562
+ pxor xmm5, xmm10
563
+ pxor xmm6, xmm11
564
+ pxor xmm7, xmm8
565
+ pxor xmm4, xmm9
566
+ movdqa xmmword ptr [rsp+0x100], xmm8
567
+ movdqa xmm8, xmm5
568
+ psrld xmm8, 7
569
+ pslld xmm5, 25
570
+ por xmm5, xmm8
571
+ movdqa xmm8, xmm6
572
+ psrld xmm8, 7
573
+ pslld xmm6, 25
574
+ por xmm6, xmm8
575
+ movdqa xmm8, xmm7
576
+ psrld xmm8, 7
577
+ pslld xmm7, 25
578
+ por xmm7, xmm8
579
+ movdqa xmm8, xmm4
580
+ psrld xmm8, 7
581
+ pslld xmm4, 25
582
+ por xmm4, xmm8
583
+ paddd xmm0, xmmword ptr [rsp+0x30]
584
+ paddd xmm1, xmmword ptr [rsp+0xA0]
585
+ paddd xmm2, xmmword ptr [rsp+0xD0]
586
+ paddd xmm3, xmmword ptr [rsp+0x70]
587
+ paddd xmm0, xmm4
588
+ paddd xmm1, xmm5
589
+ paddd xmm2, xmm6
590
+ paddd xmm3, xmm7
591
+ pxor xmm12, xmm0
592
+ pxor xmm13, xmm1
593
+ pxor xmm14, xmm2
594
+ pxor xmm15, xmm3
595
+ pshuflw xmm12, xmm12, 0xB1
596
+ pshufhw xmm12, xmm12, 0xB1
597
+ pshuflw xmm13, xmm13, 0xB1
598
+ pshufhw xmm13, xmm13, 0xB1
599
+ pshuflw xmm14, xmm14, 0xB1
600
+ pshufhw xmm14, xmm14, 0xB1
601
+ pshuflw xmm15, xmm15, 0xB1
602
+ pshufhw xmm15, xmm15, 0xB1
603
+ movdqa xmm8, xmmword ptr [rsp+0x100]
604
+ paddd xmm8, xmm12
605
+ paddd xmm9, xmm13
606
+ paddd xmm10, xmm14
607
+ paddd xmm11, xmm15
608
+ pxor xmm4, xmm8
609
+ pxor xmm5, xmm9
610
+ pxor xmm6, xmm10
611
+ pxor xmm7, xmm11
612
+ movdqa xmmword ptr [rsp+0x100], xmm8
613
+ movdqa xmm8, xmm4
614
+ psrld xmm8, 12
615
+ pslld xmm4, 20
616
+ por xmm4, xmm8
617
+ movdqa xmm8, xmm5
618
+ psrld xmm8, 12
619
+ pslld xmm5, 20
620
+ por xmm5, xmm8
621
+ movdqa xmm8, xmm6
622
+ psrld xmm8, 12
623
+ pslld xmm6, 20
624
+ por xmm6, xmm8
625
+ movdqa xmm8, xmm7
626
+ psrld xmm8, 12
627
+ pslld xmm7, 20
628
+ por xmm7, xmm8
629
+ paddd xmm0, xmmword ptr [rsp+0x40]
630
+ paddd xmm1, xmmword ptr [rsp+0xC0]
631
+ paddd xmm2, xmmword ptr [rsp+0x20]
632
+ paddd xmm3, xmmword ptr [rsp+0xE0]
633
+ paddd xmm0, xmm4
634
+ paddd xmm1, xmm5
635
+ paddd xmm2, xmm6
636
+ paddd xmm3, xmm7
637
+ pxor xmm12, xmm0
638
+ pxor xmm13, xmm1
639
+ pxor xmm14, xmm2
640
+ pxor xmm15, xmm3
641
+ movdqa xmm8, xmm12
642
+ psrld xmm12, 8
643
+ pslld xmm8, 24
644
+ pxor xmm12, xmm8
645
+ movdqa xmm8, xmm13
646
+ psrld xmm13, 8
647
+ pslld xmm8, 24
648
+ pxor xmm13, xmm8
649
+ movdqa xmm8, xmm14
650
+ psrld xmm14, 8
651
+ pslld xmm8, 24
652
+ pxor xmm14, xmm8
653
+ movdqa xmm8, xmm15
654
+ psrld xmm15, 8
655
+ pslld xmm8, 24
656
+ pxor xmm15, xmm8
657
+ movdqa xmm8, xmmword ptr [rsp+0x100]
658
+ paddd xmm8, xmm12
659
+ paddd xmm9, xmm13
660
+ paddd xmm10, xmm14
661
+ paddd xmm11, xmm15
662
+ pxor xmm4, xmm8
663
+ pxor xmm5, xmm9
664
+ pxor xmm6, xmm10
665
+ pxor xmm7, xmm11
666
+ movdqa xmmword ptr [rsp+0x100], xmm8
667
+ movdqa xmm8, xmm4
668
+ psrld xmm8, 7
669
+ pslld xmm4, 25
670
+ por xmm4, xmm8
671
+ movdqa xmm8, xmm5
672
+ psrld xmm8, 7
673
+ pslld xmm5, 25
674
+ por xmm5, xmm8
675
+ movdqa xmm8, xmm6
676
+ psrld xmm8, 7
677
+ pslld xmm6, 25
678
+ por xmm6, xmm8
679
+ movdqa xmm8, xmm7
680
+ psrld xmm8, 7
681
+ pslld xmm7, 25
682
+ por xmm7, xmm8
683
+ paddd xmm0, xmmword ptr [rsp+0x60]
684
+ paddd xmm1, xmmword ptr [rsp+0x90]
685
+ paddd xmm2, xmmword ptr [rsp+0xB0]
686
+ paddd xmm3, xmmword ptr [rsp+0x80]
687
+ paddd xmm0, xmm5
688
+ paddd xmm1, xmm6
689
+ paddd xmm2, xmm7
690
+ paddd xmm3, xmm4
691
+ pxor xmm15, xmm0
692
+ pxor xmm12, xmm1
693
+ pxor xmm13, xmm2
694
+ pxor xmm14, xmm3
695
+ pshuflw xmm15, xmm15, 0xB1
696
+ pshufhw xmm15, xmm15, 0xB1
697
+ pshuflw xmm12, xmm12, 0xB1
698
+ pshufhw xmm12, xmm12, 0xB1
699
+ pshuflw xmm13, xmm13, 0xB1
700
+ pshufhw xmm13, xmm13, 0xB1
701
+ pshuflw xmm14, xmm14, 0xB1
702
+ pshufhw xmm14, xmm14, 0xB1
703
+ paddd xmm10, xmm15
704
+ paddd xmm11, xmm12
705
+ movdqa xmm8, xmmword ptr [rsp+0x100]
706
+ paddd xmm8, xmm13
707
+ paddd xmm9, xmm14
708
+ pxor xmm5, xmm10
709
+ pxor xmm6, xmm11
710
+ pxor xmm7, xmm8
711
+ pxor xmm4, xmm9
712
+ movdqa xmmword ptr [rsp+0x100], xmm8
713
+ movdqa xmm8, xmm5
714
+ psrld xmm8, 12
715
+ pslld xmm5, 20
716
+ por xmm5, xmm8
717
+ movdqa xmm8, xmm6
718
+ psrld xmm8, 12
719
+ pslld xmm6, 20
720
+ por xmm6, xmm8
721
+ movdqa xmm8, xmm7
722
+ psrld xmm8, 12
723
+ pslld xmm7, 20
724
+ por xmm7, xmm8
725
+ movdqa xmm8, xmm4
726
+ psrld xmm8, 12
727
+ pslld xmm4, 20
728
+ por xmm4, xmm8
729
+ paddd xmm0, xmmword ptr [rsp+0x50]
730
+ paddd xmm1, xmmword ptr [rsp]
731
+ paddd xmm2, xmmword ptr [rsp+0xF0]
732
+ paddd xmm3, xmmword ptr [rsp+0x10]
733
+ paddd xmm0, xmm5
734
+ paddd xmm1, xmm6
735
+ paddd xmm2, xmm7
736
+ paddd xmm3, xmm4
737
+ pxor xmm15, xmm0
738
+ pxor xmm12, xmm1
739
+ pxor xmm13, xmm2
740
+ pxor xmm14, xmm3
741
+ movdqa xmm8, xmm15
742
+ psrld xmm15, 8
743
+ pslld xmm8, 24
744
+ pxor xmm15, xmm8
745
+ movdqa xmm8, xmm12
746
+ psrld xmm12, 8
747
+ pslld xmm8, 24
748
+ pxor xmm12, xmm8
749
+ movdqa xmm8, xmm13
750
+ psrld xmm13, 8
751
+ pslld xmm8, 24
752
+ pxor xmm13, xmm8
753
+ movdqa xmm8, xmm14
754
+ psrld xmm14, 8
755
+ pslld xmm8, 24
756
+ pxor xmm14, xmm8
757
+ paddd xmm10, xmm15
758
+ paddd xmm11, xmm12
759
+ movdqa xmm8, xmmword ptr [rsp+0x100]
760
+ paddd xmm8, xmm13
761
+ paddd xmm9, xmm14
762
+ pxor xmm5, xmm10
763
+ pxor xmm6, xmm11
764
+ pxor xmm7, xmm8
765
+ pxor xmm4, xmm9
766
+ movdqa xmmword ptr [rsp+0x100], xmm8
767
+ movdqa xmm8, xmm5
768
+ psrld xmm8, 7
769
+ pslld xmm5, 25
770
+ por xmm5, xmm8
771
+ movdqa xmm8, xmm6
772
+ psrld xmm8, 7
773
+ pslld xmm6, 25
774
+ por xmm6, xmm8
775
+ movdqa xmm8, xmm7
776
+ psrld xmm8, 7
777
+ pslld xmm7, 25
778
+ por xmm7, xmm8
779
+ movdqa xmm8, xmm4
780
+ psrld xmm8, 7
781
+ pslld xmm4, 25
782
+ por xmm4, xmm8
783
+ paddd xmm0, xmmword ptr [rsp+0xA0]
784
+ paddd xmm1, xmmword ptr [rsp+0xC0]
785
+ paddd xmm2, xmmword ptr [rsp+0xE0]
786
+ paddd xmm3, xmmword ptr [rsp+0xD0]
787
+ paddd xmm0, xmm4
788
+ paddd xmm1, xmm5
789
+ paddd xmm2, xmm6
790
+ paddd xmm3, xmm7
791
+ pxor xmm12, xmm0
792
+ pxor xmm13, xmm1
793
+ pxor xmm14, xmm2
794
+ pxor xmm15, xmm3
795
+ pshuflw xmm12, xmm12, 0xB1
796
+ pshufhw xmm12, xmm12, 0xB1
797
+ pshuflw xmm13, xmm13, 0xB1
798
+ pshufhw xmm13, xmm13, 0xB1
799
+ pshuflw xmm14, xmm14, 0xB1
800
+ pshufhw xmm14, xmm14, 0xB1
801
+ pshuflw xmm15, xmm15, 0xB1
802
+ pshufhw xmm15, xmm15, 0xB1
803
+ movdqa xmm8, xmmword ptr [rsp+0x100]
804
+ paddd xmm8, xmm12
805
+ paddd xmm9, xmm13
806
+ paddd xmm10, xmm14
807
+ paddd xmm11, xmm15
808
+ pxor xmm4, xmm8
809
+ pxor xmm5, xmm9
810
+ pxor xmm6, xmm10
811
+ pxor xmm7, xmm11
812
+ movdqa xmmword ptr [rsp+0x100], xmm8
813
+ movdqa xmm8, xmm4
814
+ psrld xmm8, 12
815
+ pslld xmm4, 20
816
+ por xmm4, xmm8
817
+ movdqa xmm8, xmm5
818
+ psrld xmm8, 12
819
+ pslld xmm5, 20
820
+ por xmm5, xmm8
821
+ movdqa xmm8, xmm6
822
+ psrld xmm8, 12
823
+ pslld xmm6, 20
824
+ por xmm6, xmm8
825
+ movdqa xmm8, xmm7
826
+ psrld xmm8, 12
827
+ pslld xmm7, 20
828
+ por xmm7, xmm8
829
+ paddd xmm0, xmmword ptr [rsp+0x70]
830
+ paddd xmm1, xmmword ptr [rsp+0x90]
831
+ paddd xmm2, xmmword ptr [rsp+0x30]
832
+ paddd xmm3, xmmword ptr [rsp+0xF0]
833
+ paddd xmm0, xmm4
834
+ paddd xmm1, xmm5
835
+ paddd xmm2, xmm6
836
+ paddd xmm3, xmm7
837
+ pxor xmm12, xmm0
838
+ pxor xmm13, xmm1
839
+ pxor xmm14, xmm2
840
+ pxor xmm15, xmm3
841
+ movdqa xmm8, xmm12
842
+ psrld xmm12, 8
843
+ pslld xmm8, 24
844
+ pxor xmm12, xmm8
845
+ movdqa xmm8, xmm13
846
+ psrld xmm13, 8
847
+ pslld xmm8, 24
848
+ pxor xmm13, xmm8
849
+ movdqa xmm8, xmm14
850
+ psrld xmm14, 8
851
+ pslld xmm8, 24
852
+ pxor xmm14, xmm8
853
+ movdqa xmm8, xmm15
854
+ psrld xmm15, 8
855
+ pslld xmm8, 24
856
+ pxor xmm15, xmm8
857
+ movdqa xmm8, xmmword ptr [rsp+0x100]
858
+ paddd xmm8, xmm12
859
+ paddd xmm9, xmm13
860
+ paddd xmm10, xmm14
861
+ paddd xmm11, xmm15
862
+ pxor xmm4, xmm8
863
+ pxor xmm5, xmm9
864
+ pxor xmm6, xmm10
865
+ pxor xmm7, xmm11
866
+ movdqa xmmword ptr [rsp+0x100], xmm8
867
+ movdqa xmm8, xmm4
868
+ psrld xmm8, 7
869
+ pslld xmm4, 25
870
+ por xmm4, xmm8
871
+ movdqa xmm8, xmm5
872
+ psrld xmm8, 7
873
+ pslld xmm5, 25
874
+ por xmm5, xmm8
875
+ movdqa xmm8, xmm6
876
+ psrld xmm8, 7
877
+ pslld xmm6, 25
878
+ por xmm6, xmm8
879
+ movdqa xmm8, xmm7
880
+ psrld xmm8, 7
881
+ pslld xmm7, 25
882
+ por xmm7, xmm8
883
+ paddd xmm0, xmmword ptr [rsp+0x40]
884
+ paddd xmm1, xmmword ptr [rsp+0xB0]
885
+ paddd xmm2, xmmword ptr [rsp+0x50]
886
+ paddd xmm3, xmmword ptr [rsp+0x10]
887
+ paddd xmm0, xmm5
888
+ paddd xmm1, xmm6
889
+ paddd xmm2, xmm7
890
+ paddd xmm3, xmm4
891
+ pxor xmm15, xmm0
892
+ pxor xmm12, xmm1
893
+ pxor xmm13, xmm2
894
+ pxor xmm14, xmm3
895
+ pshuflw xmm15, xmm15, 0xB1
896
+ pshufhw xmm15, xmm15, 0xB1
897
+ pshuflw xmm12, xmm12, 0xB1
898
+ pshufhw xmm12, xmm12, 0xB1
899
+ pshuflw xmm13, xmm13, 0xB1
900
+ pshufhw xmm13, xmm13, 0xB1
901
+ pshuflw xmm14, xmm14, 0xB1
902
+ pshufhw xmm14, xmm14, 0xB1
903
+ paddd xmm10, xmm15
904
+ paddd xmm11, xmm12
905
+ movdqa xmm8, xmmword ptr [rsp+0x100]
906
+ paddd xmm8, xmm13
907
+ paddd xmm9, xmm14
908
+ pxor xmm5, xmm10
909
+ pxor xmm6, xmm11
910
+ pxor xmm7, xmm8
911
+ pxor xmm4, xmm9
912
+ movdqa xmmword ptr [rsp+0x100], xmm8
913
+ movdqa xmm8, xmm5
914
+ psrld xmm8, 12
915
+ pslld xmm5, 20
916
+ por xmm5, xmm8
917
+ movdqa xmm8, xmm6
918
+ psrld xmm8, 12
919
+ pslld xmm6, 20
920
+ por xmm6, xmm8
921
+ movdqa xmm8, xmm7
922
+ psrld xmm8, 12
923
+ pslld xmm7, 20
924
+ por xmm7, xmm8
925
+ movdqa xmm8, xmm4
926
+ psrld xmm8, 12
927
+ pslld xmm4, 20
928
+ por xmm4, xmm8
929
+ paddd xmm0, xmmword ptr [rsp]
930
+ paddd xmm1, xmmword ptr [rsp+0x20]
931
+ paddd xmm2, xmmword ptr [rsp+0x80]
932
+ paddd xmm3, xmmword ptr [rsp+0x60]
933
+ paddd xmm0, xmm5
934
+ paddd xmm1, xmm6
935
+ paddd xmm2, xmm7
936
+ paddd xmm3, xmm4
937
+ pxor xmm15, xmm0
938
+ pxor xmm12, xmm1
939
+ pxor xmm13, xmm2
940
+ pxor xmm14, xmm3
941
+ movdqa xmm8, xmm15
942
+ psrld xmm15, 8
943
+ pslld xmm8, 24
944
+ pxor xmm15, xmm8
945
+ movdqa xmm8, xmm12
946
+ psrld xmm12, 8
947
+ pslld xmm8, 24
948
+ pxor xmm12, xmm8
949
+ movdqa xmm8, xmm13
950
+ psrld xmm13, 8
951
+ pslld xmm8, 24
952
+ pxor xmm13, xmm8
953
+ movdqa xmm8, xmm14
954
+ psrld xmm14, 8
955
+ pslld xmm8, 24
956
+ pxor xmm14, xmm8
957
+ paddd xmm10, xmm15
958
+ paddd xmm11, xmm12
959
+ movdqa xmm8, xmmword ptr [rsp+0x100]
960
+ paddd xmm8, xmm13
961
+ paddd xmm9, xmm14
962
+ pxor xmm5, xmm10
963
+ pxor xmm6, xmm11
964
+ pxor xmm7, xmm8
965
+ pxor xmm4, xmm9
966
+ movdqa xmmword ptr [rsp+0x100], xmm8
967
+ movdqa xmm8, xmm5
968
+ psrld xmm8, 7
969
+ pslld xmm5, 25
970
+ por xmm5, xmm8
971
+ movdqa xmm8, xmm6
972
+ psrld xmm8, 7
973
+ pslld xmm6, 25
974
+ por xmm6, xmm8
975
+ movdqa xmm8, xmm7
976
+ psrld xmm8, 7
977
+ pslld xmm7, 25
978
+ por xmm7, xmm8
979
+ movdqa xmm8, xmm4
980
+ psrld xmm8, 7
981
+ pslld xmm4, 25
982
+ por xmm4, xmm8
983
+ paddd xmm0, xmmword ptr [rsp+0xC0]
984
+ paddd xmm1, xmmword ptr [rsp+0x90]
985
+ paddd xmm2, xmmword ptr [rsp+0xF0]
986
+ paddd xmm3, xmmword ptr [rsp+0xE0]
987
+ paddd xmm0, xmm4
988
+ paddd xmm1, xmm5
989
+ paddd xmm2, xmm6
990
+ paddd xmm3, xmm7
991
+ pxor xmm12, xmm0
992
+ pxor xmm13, xmm1
993
+ pxor xmm14, xmm2
994
+ pxor xmm15, xmm3
995
+ pshuflw xmm12, xmm12, 0xB1
996
+ pshufhw xmm12, xmm12, 0xB1
997
+ pshuflw xmm13, xmm13, 0xB1
998
+ pshufhw xmm13, xmm13, 0xB1
999
+ pshuflw xmm14, xmm14, 0xB1
1000
+ pshufhw xmm14, xmm14, 0xB1
1001
+ pshuflw xmm15, xmm15, 0xB1
1002
+ pshufhw xmm15, xmm15, 0xB1
1003
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1004
+ paddd xmm8, xmm12
1005
+ paddd xmm9, xmm13
1006
+ paddd xmm10, xmm14
1007
+ paddd xmm11, xmm15
1008
+ pxor xmm4, xmm8
1009
+ pxor xmm5, xmm9
1010
+ pxor xmm6, xmm10
1011
+ pxor xmm7, xmm11
1012
+ movdqa xmmword ptr [rsp+0x100], xmm8
1013
+ movdqa xmm8, xmm4
1014
+ psrld xmm8, 12
1015
+ pslld xmm4, 20
1016
+ por xmm4, xmm8
1017
+ movdqa xmm8, xmm5
1018
+ psrld xmm8, 12
1019
+ pslld xmm5, 20
1020
+ por xmm5, xmm8
1021
+ movdqa xmm8, xmm6
1022
+ psrld xmm8, 12
1023
+ pslld xmm6, 20
1024
+ por xmm6, xmm8
1025
+ movdqa xmm8, xmm7
1026
+ psrld xmm8, 12
1027
+ pslld xmm7, 20
1028
+ por xmm7, xmm8
1029
+ paddd xmm0, xmmword ptr [rsp+0xD0]
1030
+ paddd xmm1, xmmword ptr [rsp+0xB0]
1031
+ paddd xmm2, xmmword ptr [rsp+0xA0]
1032
+ paddd xmm3, xmmword ptr [rsp+0x80]
1033
+ paddd xmm0, xmm4
1034
+ paddd xmm1, xmm5
1035
+ paddd xmm2, xmm6
1036
+ paddd xmm3, xmm7
1037
+ pxor xmm12, xmm0
1038
+ pxor xmm13, xmm1
1039
+ pxor xmm14, xmm2
1040
+ pxor xmm15, xmm3
1041
+ movdqa xmm8, xmm12
1042
+ psrld xmm12, 8
1043
+ pslld xmm8, 24
1044
+ pxor xmm12, xmm8
1045
+ movdqa xmm8, xmm13
1046
+ psrld xmm13, 8
1047
+ pslld xmm8, 24
1048
+ pxor xmm13, xmm8
1049
+ movdqa xmm8, xmm14
1050
+ psrld xmm14, 8
1051
+ pslld xmm8, 24
1052
+ pxor xmm14, xmm8
1053
+ movdqa xmm8, xmm15
1054
+ psrld xmm15, 8
1055
+ pslld xmm8, 24
1056
+ pxor xmm15, xmm8
1057
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1058
+ paddd xmm8, xmm12
1059
+ paddd xmm9, xmm13
1060
+ paddd xmm10, xmm14
1061
+ paddd xmm11, xmm15
1062
+ pxor xmm4, xmm8
1063
+ pxor xmm5, xmm9
1064
+ pxor xmm6, xmm10
1065
+ pxor xmm7, xmm11
1066
+ movdqa xmmword ptr [rsp+0x100], xmm8
1067
+ movdqa xmm8, xmm4
1068
+ psrld xmm8, 7
1069
+ pslld xmm4, 25
1070
+ por xmm4, xmm8
1071
+ movdqa xmm8, xmm5
1072
+ psrld xmm8, 7
1073
+ pslld xmm5, 25
1074
+ por xmm5, xmm8
1075
+ movdqa xmm8, xmm6
1076
+ psrld xmm8, 7
1077
+ pslld xmm6, 25
1078
+ por xmm6, xmm8
1079
+ movdqa xmm8, xmm7
1080
+ psrld xmm8, 7
1081
+ pslld xmm7, 25
1082
+ por xmm7, xmm8
1083
+ paddd xmm0, xmmword ptr [rsp+0x70]
1084
+ paddd xmm1, xmmword ptr [rsp+0x50]
1085
+ paddd xmm2, xmmword ptr [rsp]
1086
+ paddd xmm3, xmmword ptr [rsp+0x60]
1087
+ paddd xmm0, xmm5
1088
+ paddd xmm1, xmm6
1089
+ paddd xmm2, xmm7
1090
+ paddd xmm3, xmm4
1091
+ pxor xmm15, xmm0
1092
+ pxor xmm12, xmm1
1093
+ pxor xmm13, xmm2
1094
+ pxor xmm14, xmm3
1095
+ pshuflw xmm15, xmm15, 0xB1
1096
+ pshufhw xmm15, xmm15, 0xB1
1097
+ pshuflw xmm12, xmm12, 0xB1
1098
+ pshufhw xmm12, xmm12, 0xB1
1099
+ pshuflw xmm13, xmm13, 0xB1
1100
+ pshufhw xmm13, xmm13, 0xB1
1101
+ pshuflw xmm14, xmm14, 0xB1
1102
+ pshufhw xmm14, xmm14, 0xB1
1103
+ paddd xmm10, xmm15
1104
+ paddd xmm11, xmm12
1105
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1106
+ paddd xmm8, xmm13
1107
+ paddd xmm9, xmm14
1108
+ pxor xmm5, xmm10
1109
+ pxor xmm6, xmm11
1110
+ pxor xmm7, xmm8
1111
+ pxor xmm4, xmm9
1112
+ movdqa xmmword ptr [rsp+0x100], xmm8
1113
+ movdqa xmm8, xmm5
1114
+ psrld xmm8, 12
1115
+ pslld xmm5, 20
1116
+ por xmm5, xmm8
1117
+ movdqa xmm8, xmm6
1118
+ psrld xmm8, 12
1119
+ pslld xmm6, 20
1120
+ por xmm6, xmm8
1121
+ movdqa xmm8, xmm7
1122
+ psrld xmm8, 12
1123
+ pslld xmm7, 20
1124
+ por xmm7, xmm8
1125
+ movdqa xmm8, xmm4
1126
+ psrld xmm8, 12
1127
+ pslld xmm4, 20
1128
+ por xmm4, xmm8
1129
+ paddd xmm0, xmmword ptr [rsp+0x20]
1130
+ paddd xmm1, xmmword ptr [rsp+0x30]
1131
+ paddd xmm2, xmmword ptr [rsp+0x10]
1132
+ paddd xmm3, xmmword ptr [rsp+0x40]
1133
+ paddd xmm0, xmm5
1134
+ paddd xmm1, xmm6
1135
+ paddd xmm2, xmm7
1136
+ paddd xmm3, xmm4
1137
+ pxor xmm15, xmm0
1138
+ pxor xmm12, xmm1
1139
+ pxor xmm13, xmm2
1140
+ pxor xmm14, xmm3
1141
+ movdqa xmm8, xmm15
1142
+ psrld xmm15, 8
1143
+ pslld xmm8, 24
1144
+ pxor xmm15, xmm8
1145
+ movdqa xmm8, xmm12
1146
+ psrld xmm12, 8
1147
+ pslld xmm8, 24
1148
+ pxor xmm12, xmm8
1149
+ movdqa xmm8, xmm13
1150
+ psrld xmm13, 8
1151
+ pslld xmm8, 24
1152
+ pxor xmm13, xmm8
1153
+ movdqa xmm8, xmm14
1154
+ psrld xmm14, 8
1155
+ pslld xmm8, 24
1156
+ pxor xmm14, xmm8
1157
+ paddd xmm10, xmm15
1158
+ paddd xmm11, xmm12
1159
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1160
+ paddd xmm8, xmm13
1161
+ paddd xmm9, xmm14
1162
+ pxor xmm5, xmm10
1163
+ pxor xmm6, xmm11
1164
+ pxor xmm7, xmm8
1165
+ pxor xmm4, xmm9
1166
+ movdqa xmmword ptr [rsp+0x100], xmm8
1167
+ movdqa xmm8, xmm5
1168
+ psrld xmm8, 7
1169
+ pslld xmm5, 25
1170
+ por xmm5, xmm8
1171
+ movdqa xmm8, xmm6
1172
+ psrld xmm8, 7
1173
+ pslld xmm6, 25
1174
+ por xmm6, xmm8
1175
+ movdqa xmm8, xmm7
1176
+ psrld xmm8, 7
1177
+ pslld xmm7, 25
1178
+ por xmm7, xmm8
1179
+ movdqa xmm8, xmm4
1180
+ psrld xmm8, 7
1181
+ pslld xmm4, 25
1182
+ por xmm4, xmm8
1183
+ paddd xmm0, xmmword ptr [rsp+0x90]
1184
+ paddd xmm1, xmmword ptr [rsp+0xB0]
1185
+ paddd xmm2, xmmword ptr [rsp+0x80]
1186
+ paddd xmm3, xmmword ptr [rsp+0xF0]
1187
+ paddd xmm0, xmm4
1188
+ paddd xmm1, xmm5
1189
+ paddd xmm2, xmm6
1190
+ paddd xmm3, xmm7
1191
+ pxor xmm12, xmm0
1192
+ pxor xmm13, xmm1
1193
+ pxor xmm14, xmm2
1194
+ pxor xmm15, xmm3
1195
+ pshuflw xmm12, xmm12, 0xB1
1196
+ pshufhw xmm12, xmm12, 0xB1
1197
+ pshuflw xmm13, xmm13, 0xB1
1198
+ pshufhw xmm13, xmm13, 0xB1
1199
+ pshuflw xmm14, xmm14, 0xB1
1200
+ pshufhw xmm14, xmm14, 0xB1
1201
+ pshuflw xmm15, xmm15, 0xB1
1202
+ pshufhw xmm15, xmm15, 0xB1
1203
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1204
+ paddd xmm8, xmm12
1205
+ paddd xmm9, xmm13
1206
+ paddd xmm10, xmm14
1207
+ paddd xmm11, xmm15
1208
+ pxor xmm4, xmm8
1209
+ pxor xmm5, xmm9
1210
+ pxor xmm6, xmm10
1211
+ pxor xmm7, xmm11
1212
+ movdqa xmmword ptr [rsp+0x100], xmm8
1213
+ movdqa xmm8, xmm4
1214
+ psrld xmm8, 12
1215
+ pslld xmm4, 20
1216
+ por xmm4, xmm8
1217
+ movdqa xmm8, xmm5
1218
+ psrld xmm8, 12
1219
+ pslld xmm5, 20
1220
+ por xmm5, xmm8
1221
+ movdqa xmm8, xmm6
1222
+ psrld xmm8, 12
1223
+ pslld xmm6, 20
1224
+ por xmm6, xmm8
1225
+ movdqa xmm8, xmm7
1226
+ psrld xmm8, 12
1227
+ pslld xmm7, 20
1228
+ por xmm7, xmm8
1229
+ paddd xmm0, xmmword ptr [rsp+0xE0]
1230
+ paddd xmm1, xmmword ptr [rsp+0x50]
1231
+ paddd xmm2, xmmword ptr [rsp+0xC0]
1232
+ paddd xmm3, xmmword ptr [rsp+0x10]
1233
+ paddd xmm0, xmm4
1234
+ paddd xmm1, xmm5
1235
+ paddd xmm2, xmm6
1236
+ paddd xmm3, xmm7
1237
+ pxor xmm12, xmm0
1238
+ pxor xmm13, xmm1
1239
+ pxor xmm14, xmm2
1240
+ pxor xmm15, xmm3
1241
+ movdqa xmm8, xmm12
1242
+ psrld xmm12, 8
1243
+ pslld xmm8, 24
1244
+ pxor xmm12, xmm8
1245
+ movdqa xmm8, xmm13
1246
+ psrld xmm13, 8
1247
+ pslld xmm8, 24
1248
+ pxor xmm13, xmm8
1249
+ movdqa xmm8, xmm14
1250
+ psrld xmm14, 8
1251
+ pslld xmm8, 24
1252
+ pxor xmm14, xmm8
1253
+ movdqa xmm8, xmm15
1254
+ psrld xmm15, 8
1255
+ pslld xmm8, 24
1256
+ pxor xmm15, xmm8
1257
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1258
+ paddd xmm8, xmm12
1259
+ paddd xmm9, xmm13
1260
+ paddd xmm10, xmm14
1261
+ paddd xmm11, xmm15
1262
+ pxor xmm4, xmm8
1263
+ pxor xmm5, xmm9
1264
+ pxor xmm6, xmm10
1265
+ pxor xmm7, xmm11
1266
+ movdqa xmmword ptr [rsp+0x100], xmm8
1267
+ movdqa xmm8, xmm4
1268
+ psrld xmm8, 7
1269
+ pslld xmm4, 25
1270
+ por xmm4, xmm8
1271
+ movdqa xmm8, xmm5
1272
+ psrld xmm8, 7
1273
+ pslld xmm5, 25
1274
+ por xmm5, xmm8
1275
+ movdqa xmm8, xmm6
1276
+ psrld xmm8, 7
1277
+ pslld xmm6, 25
1278
+ por xmm6, xmm8
1279
+ movdqa xmm8, xmm7
1280
+ psrld xmm8, 7
1281
+ pslld xmm7, 25
1282
+ por xmm7, xmm8
1283
+ paddd xmm0, xmmword ptr [rsp+0xD0]
1284
+ paddd xmm1, xmmword ptr [rsp]
1285
+ paddd xmm2, xmmword ptr [rsp+0x20]
1286
+ paddd xmm3, xmmword ptr [rsp+0x40]
1287
+ paddd xmm0, xmm5
1288
+ paddd xmm1, xmm6
1289
+ paddd xmm2, xmm7
1290
+ paddd xmm3, xmm4
1291
+ pxor xmm15, xmm0
1292
+ pxor xmm12, xmm1
1293
+ pxor xmm13, xmm2
1294
+ pxor xmm14, xmm3
1295
+ pshuflw xmm15, xmm15, 0xB1
1296
+ pshufhw xmm15, xmm15, 0xB1
1297
+ pshuflw xmm12, xmm12, 0xB1
1298
+ pshufhw xmm12, xmm12, 0xB1
1299
+ pshuflw xmm13, xmm13, 0xB1
1300
+ pshufhw xmm13, xmm13, 0xB1
1301
+ pshuflw xmm14, xmm14, 0xB1
1302
+ pshufhw xmm14, xmm14, 0xB1
1303
+ paddd xmm10, xmm15
1304
+ paddd xmm11, xmm12
1305
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1306
+ paddd xmm8, xmm13
1307
+ paddd xmm9, xmm14
1308
+ pxor xmm5, xmm10
1309
+ pxor xmm6, xmm11
1310
+ pxor xmm7, xmm8
1311
+ pxor xmm4, xmm9
1312
+ movdqa xmmword ptr [rsp+0x100], xmm8
1313
+ movdqa xmm8, xmm5
1314
+ psrld xmm8, 12
1315
+ pslld xmm5, 20
1316
+ por xmm5, xmm8
1317
+ movdqa xmm8, xmm6
1318
+ psrld xmm8, 12
1319
+ pslld xmm6, 20
1320
+ por xmm6, xmm8
1321
+ movdqa xmm8, xmm7
1322
+ psrld xmm8, 12
1323
+ pslld xmm7, 20
1324
+ por xmm7, xmm8
1325
+ movdqa xmm8, xmm4
1326
+ psrld xmm8, 12
1327
+ pslld xmm4, 20
1328
+ por xmm4, xmm8
1329
+ paddd xmm0, xmmword ptr [rsp+0x30]
1330
+ paddd xmm1, xmmword ptr [rsp+0xA0]
1331
+ paddd xmm2, xmmword ptr [rsp+0x60]
1332
+ paddd xmm3, xmmword ptr [rsp+0x70]
1333
+ paddd xmm0, xmm5
1334
+ paddd xmm1, xmm6
1335
+ paddd xmm2, xmm7
1336
+ paddd xmm3, xmm4
1337
+ pxor xmm15, xmm0
1338
+ pxor xmm12, xmm1
1339
+ pxor xmm13, xmm2
1340
+ pxor xmm14, xmm3
1341
+ movdqa xmm8, xmm15
1342
+ psrld xmm15, 8
1343
+ pslld xmm8, 24
1344
+ pxor xmm15, xmm8
1345
+ movdqa xmm8, xmm12
1346
+ psrld xmm12, 8
1347
+ pslld xmm8, 24
1348
+ pxor xmm12, xmm8
1349
+ movdqa xmm8, xmm13
1350
+ psrld xmm13, 8
1351
+ pslld xmm8, 24
1352
+ pxor xmm13, xmm8
1353
+ movdqa xmm8, xmm14
1354
+ psrld xmm14, 8
1355
+ pslld xmm8, 24
1356
+ pxor xmm14, xmm8
1357
+ paddd xmm10, xmm15
1358
+ paddd xmm11, xmm12
1359
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1360
+ paddd xmm8, xmm13
1361
+ paddd xmm9, xmm14
1362
+ pxor xmm5, xmm10
1363
+ pxor xmm6, xmm11
1364
+ pxor xmm7, xmm8
1365
+ pxor xmm4, xmm9
1366
+ movdqa xmmword ptr [rsp+0x100], xmm8
1367
+ movdqa xmm8, xmm5
1368
+ psrld xmm8, 7
1369
+ pslld xmm5, 25
1370
+ por xmm5, xmm8
1371
+ movdqa xmm8, xmm6
1372
+ psrld xmm8, 7
1373
+ pslld xmm6, 25
1374
+ por xmm6, xmm8
1375
+ movdqa xmm8, xmm7
1376
+ psrld xmm8, 7
1377
+ pslld xmm7, 25
1378
+ por xmm7, xmm8
1379
+ movdqa xmm8, xmm4
1380
+ psrld xmm8, 7
1381
+ pslld xmm4, 25
1382
+ por xmm4, xmm8
1383
+ paddd xmm0, xmmword ptr [rsp+0xB0]
1384
+ paddd xmm1, xmmword ptr [rsp+0x50]
1385
+ paddd xmm2, xmmword ptr [rsp+0x10]
1386
+ paddd xmm3, xmmword ptr [rsp+0x80]
1387
+ paddd xmm0, xmm4
1388
+ paddd xmm1, xmm5
1389
+ paddd xmm2, xmm6
1390
+ paddd xmm3, xmm7
1391
+ pxor xmm12, xmm0
1392
+ pxor xmm13, xmm1
1393
+ pxor xmm14, xmm2
1394
+ pxor xmm15, xmm3
1395
+ pshuflw xmm12, xmm12, 0xB1
1396
+ pshufhw xmm12, xmm12, 0xB1
1397
+ pshuflw xmm13, xmm13, 0xB1
1398
+ pshufhw xmm13, xmm13, 0xB1
1399
+ pshuflw xmm14, xmm14, 0xB1
1400
+ pshufhw xmm14, xmm14, 0xB1
1401
+ pshuflw xmm15, xmm15, 0xB1
1402
+ pshufhw xmm15, xmm15, 0xB1
1403
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1404
+ paddd xmm8, xmm12
1405
+ paddd xmm9, xmm13
1406
+ paddd xmm10, xmm14
1407
+ paddd xmm11, xmm15
1408
+ pxor xmm4, xmm8
1409
+ pxor xmm5, xmm9
1410
+ pxor xmm6, xmm10
1411
+ pxor xmm7, xmm11
1412
+ movdqa xmmword ptr [rsp+0x100], xmm8
1413
+ movdqa xmm8, xmm4
1414
+ psrld xmm8, 12
1415
+ pslld xmm4, 20
1416
+ por xmm4, xmm8
1417
+ movdqa xmm8, xmm5
1418
+ psrld xmm8, 12
1419
+ pslld xmm5, 20
1420
+ por xmm5, xmm8
1421
+ movdqa xmm8, xmm6
1422
+ psrld xmm8, 12
1423
+ pslld xmm6, 20
1424
+ por xmm6, xmm8
1425
+ movdqa xmm8, xmm7
1426
+ psrld xmm8, 12
1427
+ pslld xmm7, 20
1428
+ por xmm7, xmm8
1429
+ paddd xmm0, xmmword ptr [rsp+0xF0]
1430
+ paddd xmm1, xmmword ptr [rsp]
1431
+ paddd xmm2, xmmword ptr [rsp+0x90]
1432
+ paddd xmm3, xmmword ptr [rsp+0x60]
1433
+ paddd xmm0, xmm4
1434
+ paddd xmm1, xmm5
1435
+ paddd xmm2, xmm6
1436
+ paddd xmm3, xmm7
1437
+ pxor xmm12, xmm0
1438
+ pxor xmm13, xmm1
1439
+ pxor xmm14, xmm2
1440
+ pxor xmm15, xmm3
1441
+ movdqa xmm8, xmm12
1442
+ psrld xmm12, 8
1443
+ pslld xmm8, 24
1444
+ pxor xmm12, xmm8
1445
+ movdqa xmm8, xmm13
1446
+ psrld xmm13, 8
1447
+ pslld xmm8, 24
1448
+ pxor xmm13, xmm8
1449
+ movdqa xmm8, xmm14
1450
+ psrld xmm14, 8
1451
+ pslld xmm8, 24
1452
+ pxor xmm14, xmm8
1453
+ movdqa xmm8, xmm15
1454
+ psrld xmm15, 8
1455
+ pslld xmm8, 24
1456
+ pxor xmm15, xmm8
1457
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1458
+ paddd xmm8, xmm12
1459
+ paddd xmm9, xmm13
1460
+ paddd xmm10, xmm14
1461
+ paddd xmm11, xmm15
1462
+ pxor xmm4, xmm8
1463
+ pxor xmm5, xmm9
1464
+ pxor xmm6, xmm10
1465
+ pxor xmm7, xmm11
1466
+ movdqa xmmword ptr [rsp+0x100], xmm8
1467
+ movdqa xmm8, xmm4
1468
+ psrld xmm8, 7
1469
+ pslld xmm4, 25
1470
+ por xmm4, xmm8
1471
+ movdqa xmm8, xmm5
1472
+ psrld xmm8, 7
1473
+ pslld xmm5, 25
1474
+ por xmm5, xmm8
1475
+ movdqa xmm8, xmm6
1476
+ psrld xmm8, 7
1477
+ pslld xmm6, 25
1478
+ por xmm6, xmm8
1479
+ movdqa xmm8, xmm7
1480
+ psrld xmm8, 7
1481
+ pslld xmm7, 25
1482
+ por xmm7, xmm8
1483
+ paddd xmm0, xmmword ptr [rsp+0xE0]
1484
+ paddd xmm1, xmmword ptr [rsp+0x20]
1485
+ paddd xmm2, xmmword ptr [rsp+0x30]
1486
+ paddd xmm3, xmmword ptr [rsp+0x70]
1487
+ paddd xmm0, xmm5
1488
+ paddd xmm1, xmm6
1489
+ paddd xmm2, xmm7
1490
+ paddd xmm3, xmm4
1491
+ pxor xmm15, xmm0
1492
+ pxor xmm12, xmm1
1493
+ pxor xmm13, xmm2
1494
+ pxor xmm14, xmm3
1495
+ pshuflw xmm15, xmm15, 0xB1
1496
+ pshufhw xmm15, xmm15, 0xB1
1497
+ pshuflw xmm12, xmm12, 0xB1
1498
+ pshufhw xmm12, xmm12, 0xB1
1499
+ pshuflw xmm13, xmm13, 0xB1
1500
+ pshufhw xmm13, xmm13, 0xB1
1501
+ pshuflw xmm14, xmm14, 0xB1
1502
+ pshufhw xmm14, xmm14, 0xB1
1503
+ paddd xmm10, xmm15
1504
+ paddd xmm11, xmm12
1505
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1506
+ paddd xmm8, xmm13
1507
+ paddd xmm9, xmm14
1508
+ pxor xmm5, xmm10
1509
+ pxor xmm6, xmm11
1510
+ pxor xmm7, xmm8
1511
+ pxor xmm4, xmm9
1512
+ movdqa xmmword ptr [rsp+0x100], xmm8
1513
+ movdqa xmm8, xmm5
1514
+ psrld xmm8, 12
1515
+ pslld xmm5, 20
1516
+ por xmm5, xmm8
1517
+ movdqa xmm8, xmm6
1518
+ psrld xmm8, 12
1519
+ pslld xmm6, 20
1520
+ por xmm6, xmm8
1521
+ movdqa xmm8, xmm7
1522
+ psrld xmm8, 12
1523
+ pslld xmm7, 20
1524
+ por xmm7, xmm8
1525
+ movdqa xmm8, xmm4
1526
+ psrld xmm8, 12
1527
+ pslld xmm4, 20
1528
+ por xmm4, xmm8
1529
+ paddd xmm0, xmmword ptr [rsp+0xA0]
1530
+ paddd xmm1, xmmword ptr [rsp+0xC0]
1531
+ paddd xmm2, xmmword ptr [rsp+0x40]
1532
+ paddd xmm3, xmmword ptr [rsp+0xD0]
1533
+ paddd xmm0, xmm5
1534
+ paddd xmm1, xmm6
1535
+ paddd xmm2, xmm7
1536
+ paddd xmm3, xmm4
1537
+ pxor xmm15, xmm0
1538
+ pxor xmm12, xmm1
1539
+ pxor xmm13, xmm2
1540
+ pxor xmm14, xmm3
1541
+ movdqa xmm8, xmm15
1542
+ psrld xmm15, 8
1543
+ pslld xmm8, 24
1544
+ pxor xmm15, xmm8
1545
+ movdqa xmm8, xmm12
1546
+ psrld xmm12, 8
1547
+ pslld xmm8, 24
1548
+ pxor xmm12, xmm8
1549
+ movdqa xmm8, xmm13
1550
+ psrld xmm13, 8
1551
+ pslld xmm8, 24
1552
+ pxor xmm13, xmm8
1553
+ movdqa xmm8, xmm14
1554
+ psrld xmm14, 8
1555
+ pslld xmm8, 24
1556
+ pxor xmm14, xmm8
1557
+ paddd xmm10, xmm15
1558
+ paddd xmm11, xmm12
1559
+ movdqa xmm8, xmmword ptr [rsp+0x100]
1560
+ paddd xmm8, xmm13
1561
+ paddd xmm9, xmm14
1562
+ pxor xmm5, xmm10
1563
+ pxor xmm6, xmm11
1564
+ pxor xmm7, xmm8
1565
+ pxor xmm4, xmm9
1566
+ pxor xmm0, xmm8
1567
+ pxor xmm1, xmm9
1568
+ pxor xmm2, xmm10
1569
+ pxor xmm3, xmm11
1570
+ movdqa xmm8, xmm5
1571
+ psrld xmm8, 7
1572
+ pslld xmm5, 25
1573
+ por xmm5, xmm8
1574
+ movdqa xmm8, xmm6
1575
+ psrld xmm8, 7
1576
+ pslld xmm6, 25
1577
+ por xmm6, xmm8
1578
+ movdqa xmm8, xmm7
1579
+ psrld xmm8, 7
1580
+ pslld xmm7, 25
1581
+ por xmm7, xmm8
1582
+ movdqa xmm8, xmm4
1583
+ psrld xmm8, 7
1584
+ pslld xmm4, 25
1585
+ por xmm4, xmm8
1586
+ pxor xmm4, xmm12
1587
+ pxor xmm5, xmm13
1588
+ pxor xmm6, xmm14
1589
+ pxor xmm7, xmm15
1590
+ mov eax, r13d
1591
+ jne 9b
1592
+ movdqa xmm9, xmm0
1593
+ punpckldq xmm0, xmm1
1594
+ punpckhdq xmm9, xmm1
1595
+ movdqa xmm11, xmm2
1596
+ punpckldq xmm2, xmm3
1597
+ punpckhdq xmm11, xmm3
1598
+ movdqa xmm1, xmm0
1599
+ punpcklqdq xmm0, xmm2
1600
+ punpckhqdq xmm1, xmm2
1601
+ movdqa xmm3, xmm9
1602
+ punpcklqdq xmm9, xmm11
1603
+ punpckhqdq xmm3, xmm11
1604
+ movdqu xmmword ptr [rbx], xmm0
1605
+ movdqu xmmword ptr [rbx+0x20], xmm1
1606
+ movdqu xmmword ptr [rbx+0x40], xmm9
1607
+ movdqu xmmword ptr [rbx+0x60], xmm3
1608
+ movdqa xmm9, xmm4
1609
+ punpckldq xmm4, xmm5
1610
+ punpckhdq xmm9, xmm5
1611
+ movdqa xmm11, xmm6
1612
+ punpckldq xmm6, xmm7
1613
+ punpckhdq xmm11, xmm7
1614
+ movdqa xmm5, xmm4
1615
+ punpcklqdq xmm4, xmm6
1616
+ punpckhqdq xmm5, xmm6
1617
+ movdqa xmm7, xmm9
1618
+ punpcklqdq xmm9, xmm11
1619
+ punpckhqdq xmm7, xmm11
1620
+ movdqu xmmword ptr [rbx+0x10], xmm4
1621
+ movdqu xmmword ptr [rbx+0x30], xmm5
1622
+ movdqu xmmword ptr [rbx+0x50], xmm9
1623
+ movdqu xmmword ptr [rbx+0x70], xmm7
1624
+ movdqa xmm1, xmmword ptr [rsp+0x110]
1625
+ movdqa xmm0, xmm1
1626
+ paddd xmm1, xmmword ptr [rsp+0x150]
1627
+ movdqa xmmword ptr [rsp+0x110], xmm1
1628
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
1629
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
1630
+ pcmpgtd xmm0, xmm1
1631
+ movdqa xmm1, xmmword ptr [rsp+0x120]
1632
+ psubd xmm1, xmm0
1633
+ movdqa xmmword ptr [rsp+0x120], xmm1
1634
+ add rbx, 128
1635
+ add rdi, 32
1636
+ sub rsi, 4
1637
+ cmp rsi, 4
1638
+ jnc 2b
1639
+ test rsi, rsi
1640
+ jnz 3f
1641
+ 4:
1642
+ mov rsp, rbp
1643
+ pop rbp
1644
+ pop rbx
1645
+ pop r12
1646
+ pop r13
1647
+ pop r14
1648
+ pop r15
1649
+ ret
1650
+ .p2align 5
1651
+ 3:
1652
+ test esi, 0x2
1653
+ je 3f
1654
+ movups xmm0, xmmword ptr [rcx]
1655
+ movups xmm1, xmmword ptr [rcx+0x10]
1656
+ movaps xmm8, xmm0
1657
+ movaps xmm9, xmm1
1658
+ movd xmm13, dword ptr [rsp+0x110]
1659
+ movd xmm14, dword ptr [rsp+0x120]
1660
+ punpckldq xmm13, xmm14
1661
+ movaps xmmword ptr [rsp], xmm13
1662
+ movd xmm14, dword ptr [rsp+0x114]
1663
+ movd xmm13, dword ptr [rsp+0x124]
1664
+ punpckldq xmm14, xmm13
1665
+ movaps xmmword ptr [rsp+0x10], xmm14
1666
+ mov r8, qword ptr [rdi]
1667
+ mov r9, qword ptr [rdi+0x8]
1668
+ movzx eax, byte ptr [rbp+0x40]
1669
+ or eax, r13d
1670
+ xor edx, edx
1671
+ 2:
1672
+ mov r14d, eax
1673
+ or eax, r12d
1674
+ add rdx, 64
1675
+ cmp rdx, r15
1676
+ cmovne eax, r14d
1677
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1678
+ movaps xmm10, xmm2
1679
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
1680
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
1681
+ movaps xmm3, xmm4
1682
+ shufps xmm4, xmm5, 136
1683
+ shufps xmm3, xmm5, 221
1684
+ movaps xmm5, xmm3
1685
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
1686
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
1687
+ movaps xmm3, xmm6
1688
+ shufps xmm6, xmm7, 136
1689
+ pshufd xmm6, xmm6, 0x93
1690
+ shufps xmm3, xmm7, 221
1691
+ pshufd xmm7, xmm3, 0x93
1692
+ movups xmm12, xmmword ptr [r9+rdx-0x40]
1693
+ movups xmm13, xmmword ptr [r9+rdx-0x30]
1694
+ movaps xmm11, xmm12
1695
+ shufps xmm12, xmm13, 136
1696
+ shufps xmm11, xmm13, 221
1697
+ movaps xmm13, xmm11
1698
+ movups xmm14, xmmword ptr [r9+rdx-0x20]
1699
+ movups xmm15, xmmword ptr [r9+rdx-0x10]
1700
+ movaps xmm11, xmm14
1701
+ shufps xmm14, xmm15, 136
1702
+ pshufd xmm14, xmm14, 0x93
1703
+ shufps xmm11, xmm15, 221
1704
+ pshufd xmm15, xmm11, 0x93
1705
+ shl rax, 0x20
1706
+ or rax, 0x40
1707
+ movq xmm3, rax
1708
+ movdqa xmmword ptr [rsp+0x20], xmm3
1709
+ movaps xmm3, xmmword ptr [rsp]
1710
+ movaps xmm11, xmmword ptr [rsp+0x10]
1711
+ punpcklqdq xmm3, xmmword ptr [rsp+0x20]
1712
+ punpcklqdq xmm11, xmmword ptr [rsp+0x20]
1713
+ mov al, 7
1714
+ 9:
1715
+ paddd xmm0, xmm4
1716
+ paddd xmm8, xmm12
1717
+ movaps xmmword ptr [rsp+0x20], xmm4
1718
+ movaps xmmword ptr [rsp+0x30], xmm12
1719
+ paddd xmm0, xmm1
1720
+ paddd xmm8, xmm9
1721
+ pxor xmm3, xmm0
1722
+ pxor xmm11, xmm8
1723
+ pshuflw xmm3, xmm3, 0xB1
1724
+ pshufhw xmm3, xmm3, 0xB1
1725
+ pshuflw xmm11, xmm11, 0xB1
1726
+ pshufhw xmm11, xmm11, 0xB1
1727
+ paddd xmm2, xmm3
1728
+ paddd xmm10, xmm11
1729
+ pxor xmm1, xmm2
1730
+ pxor xmm9, xmm10
1731
+ movdqa xmm4, xmm1
1732
+ pslld xmm1, 20
1733
+ psrld xmm4, 12
1734
+ por xmm1, xmm4
1735
+ movdqa xmm4, xmm9
1736
+ pslld xmm9, 20
1737
+ psrld xmm4, 12
1738
+ por xmm9, xmm4
1739
+ paddd xmm0, xmm5
1740
+ paddd xmm8, xmm13
1741
+ movaps xmmword ptr [rsp+0x40], xmm5
1742
+ movaps xmmword ptr [rsp+0x50], xmm13
1743
+ paddd xmm0, xmm1
1744
+ paddd xmm8, xmm9
1745
+ pxor xmm3, xmm0
1746
+ pxor xmm11, xmm8
1747
+ movdqa xmm13, xmm3
1748
+ psrld xmm3, 8
1749
+ pslld xmm13, 24
1750
+ pxor xmm3, xmm13
1751
+ movdqa xmm13, xmm11
1752
+ psrld xmm11, 8
1753
+ pslld xmm13, 24
1754
+ pxor xmm11, xmm13
1755
+ paddd xmm2, xmm3
1756
+ paddd xmm10, xmm11
1757
+ pxor xmm1, xmm2
1758
+ pxor xmm9, xmm10
1759
+ movdqa xmm4, xmm1
1760
+ pslld xmm1, 25
1761
+ psrld xmm4, 7
1762
+ por xmm1, xmm4
1763
+ movdqa xmm4, xmm9
1764
+ pslld xmm9, 25
1765
+ psrld xmm4, 7
1766
+ por xmm9, xmm4
1767
+ pshufd xmm0, xmm0, 0x93
1768
+ pshufd xmm8, xmm8, 0x93
1769
+ pshufd xmm3, xmm3, 0x4E
1770
+ pshufd xmm11, xmm11, 0x4E
1771
+ pshufd xmm2, xmm2, 0x39
1772
+ pshufd xmm10, xmm10, 0x39
1773
+ paddd xmm0, xmm6
1774
+ paddd xmm8, xmm14
1775
+ paddd xmm0, xmm1
1776
+ paddd xmm8, xmm9
1777
+ pxor xmm3, xmm0
1778
+ pxor xmm11, xmm8
1779
+ pshuflw xmm3, xmm3, 0xB1
1780
+ pshufhw xmm3, xmm3, 0xB1
1781
+ pshuflw xmm11, xmm11, 0xB1
1782
+ pshufhw xmm11, xmm11, 0xB1
1783
+ paddd xmm2, xmm3
1784
+ paddd xmm10, xmm11
1785
+ pxor xmm1, xmm2
1786
+ pxor xmm9, xmm10
1787
+ movdqa xmm4, xmm1
1788
+ pslld xmm1, 20
1789
+ psrld xmm4, 12
1790
+ por xmm1, xmm4
1791
+ movdqa xmm4, xmm9
1792
+ pslld xmm9, 20
1793
+ psrld xmm4, 12
1794
+ por xmm9, xmm4
1795
+ paddd xmm0, xmm7
1796
+ paddd xmm8, xmm15
1797
+ paddd xmm0, xmm1
1798
+ paddd xmm8, xmm9
1799
+ pxor xmm3, xmm0
1800
+ pxor xmm11, xmm8
1801
+ movdqa xmm13, xmm3
1802
+ psrld xmm3, 8
1803
+ pslld xmm13, 24
1804
+ pxor xmm3, xmm13
1805
+ movdqa xmm13, xmm11
1806
+ psrld xmm11, 8
1807
+ pslld xmm13, 24
1808
+ pxor xmm11, xmm13
1809
+ paddd xmm2, xmm3
1810
+ paddd xmm10, xmm11
1811
+ pxor xmm1, xmm2
1812
+ pxor xmm9, xmm10
1813
+ movdqa xmm4, xmm1
1814
+ pslld xmm1, 25
1815
+ psrld xmm4, 7
1816
+ por xmm1, xmm4
1817
+ movdqa xmm4, xmm9
1818
+ pslld xmm9, 25
1819
+ psrld xmm4, 7
1820
+ por xmm9, xmm4
1821
+ pshufd xmm0, xmm0, 0x39
1822
+ pshufd xmm8, xmm8, 0x39
1823
+ pshufd xmm3, xmm3, 0x4E
1824
+ pshufd xmm11, xmm11, 0x4E
1825
+ pshufd xmm2, xmm2, 0x93
1826
+ pshufd xmm10, xmm10, 0x93
1827
+ dec al
1828
+ je 9f
1829
+ movdqa xmm12, xmmword ptr [rsp+0x20]
1830
+ movdqa xmm5, xmmword ptr [rsp+0x40]
1831
+ pshufd xmm13, xmm12, 0x0F
1832
+ shufps xmm12, xmm5, 214
1833
+ pshufd xmm4, xmm12, 0x39
1834
+ movdqa xmm12, xmm6
1835
+ shufps xmm12, xmm7, 250
1836
+ pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
1837
+ pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
1838
+ por xmm13, xmm12
1839
+ movdqa xmmword ptr [rsp+0x20], xmm13
1840
+ movdqa xmm12, xmm7
1841
+ punpcklqdq xmm12, xmm5
1842
+ movdqa xmm13, xmm6
1843
+ pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
1844
+ pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
1845
+ por xmm12, xmm13
1846
+ pshufd xmm12, xmm12, 0x78
1847
+ punpckhdq xmm5, xmm7
1848
+ punpckldq xmm6, xmm5
1849
+ pshufd xmm7, xmm6, 0x1E
1850
+ movdqa xmmword ptr [rsp+0x40], xmm12
1851
+ movdqa xmm5, xmmword ptr [rsp+0x30]
1852
+ movdqa xmm13, xmmword ptr [rsp+0x50]
1853
+ pshufd xmm6, xmm5, 0x0F
1854
+ shufps xmm5, xmm13, 214
1855
+ pshufd xmm12, xmm5, 0x39
1856
+ movdqa xmm5, xmm14
1857
+ shufps xmm5, xmm15, 250
1858
+ pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
1859
+ pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
1860
+ por xmm6, xmm5
1861
+ movdqa xmm5, xmm15
1862
+ punpcklqdq xmm5, xmm13
1863
+ movdqa xmmword ptr [rsp+0x30], xmm2
1864
+ movdqa xmm2, xmm14
1865
+ pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
1866
+ pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
1867
+ por xmm5, xmm2
1868
+ movdqa xmm2, xmmword ptr [rsp+0x30]
1869
+ pshufd xmm5, xmm5, 0x78
1870
+ punpckhdq xmm13, xmm15
1871
+ punpckldq xmm14, xmm13
1872
+ pshufd xmm15, xmm14, 0x1E
1873
+ movdqa xmm13, xmm6
1874
+ movdqa xmm14, xmm5
1875
+ movdqa xmm5, xmmword ptr [rsp+0x20]
1876
+ movdqa xmm6, xmmword ptr [rsp+0x40]
1877
+ jmp 9b
1878
+ 9:
1879
+ pxor xmm0, xmm2
1880
+ pxor xmm1, xmm3
1881
+ pxor xmm8, xmm10
1882
+ pxor xmm9, xmm11
1883
+ mov eax, r13d
1884
+ cmp rdx, r15
1885
+ jne 2b
1886
+ movups xmmword ptr [rbx], xmm0
1887
+ movups xmmword ptr [rbx+0x10], xmm1
1888
+ movups xmmword ptr [rbx+0x20], xmm8
1889
+ movups xmmword ptr [rbx+0x30], xmm9
1890
+ mov eax, dword ptr [rsp+0x130]
1891
+ neg eax
1892
+ mov r10d, dword ptr [rsp+0x110+8*rax]
1893
+ mov r11d, dword ptr [rsp+0x120+8*rax]
1894
+ mov dword ptr [rsp+0x110], r10d
1895
+ mov dword ptr [rsp+0x120], r11d
1896
+ add rdi, 16
1897
+ add rbx, 64
1898
+ sub rsi, 2
1899
+ 3:
1900
+ test esi, 0x1
1901
+ je 4b
1902
+ movups xmm0, xmmword ptr [rcx]
1903
+ movups xmm1, xmmword ptr [rcx+0x10]
1904
+ movd xmm13, dword ptr [rsp+0x110]
1905
+ movd xmm14, dword ptr [rsp+0x120]
1906
+ punpckldq xmm13, xmm14
1907
+ mov r8, qword ptr [rdi]
1908
+ movzx eax, byte ptr [rbp+0x40]
1909
+ or eax, r13d
1910
+ xor edx, edx
1911
+ 2:
1912
+ mov r14d, eax
1913
+ or eax, r12d
1914
+ add rdx, 64
1915
+ cmp rdx, r15
1916
+ cmovne eax, r14d
1917
+ movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
1918
+ shl rax, 32
1919
+ or rax, 64
1920
+ movq xmm12, rax
1921
+ movdqa xmm3, xmm13
1922
+ punpcklqdq xmm3, xmm12
1923
+ movups xmm4, xmmword ptr [r8+rdx-0x40]
1924
+ movups xmm5, xmmword ptr [r8+rdx-0x30]
1925
+ movaps xmm8, xmm4
1926
+ shufps xmm4, xmm5, 136
1927
+ shufps xmm8, xmm5, 221
1928
+ movaps xmm5, xmm8
1929
+ movups xmm6, xmmword ptr [r8+rdx-0x20]
1930
+ movups xmm7, xmmword ptr [r8+rdx-0x10]
1931
+ movaps xmm8, xmm6
1932
+ shufps xmm6, xmm7, 136
1933
+ pshufd xmm6, xmm6, 0x93
1934
+ shufps xmm8, xmm7, 221
1935
+ pshufd xmm7, xmm8, 0x93
1936
+ mov al, 7
1937
+ 9:
1938
+ paddd xmm0, xmm4
1939
+ paddd xmm0, xmm1
1940
+ pxor xmm3, xmm0
1941
+ pshuflw xmm3, xmm3, 0xB1
1942
+ pshufhw xmm3, xmm3, 0xB1
1943
+ paddd xmm2, xmm3
1944
+ pxor xmm1, xmm2
1945
+ movdqa xmm11, xmm1
1946
+ pslld xmm1, 20
1947
+ psrld xmm11, 12
1948
+ por xmm1, xmm11
1949
+ paddd xmm0, xmm5
1950
+ paddd xmm0, xmm1
1951
+ pxor xmm3, xmm0
1952
+ movdqa xmm14, xmm3
1953
+ psrld xmm3, 8
1954
+ pslld xmm14, 24
1955
+ pxor xmm3, xmm14
1956
+ paddd xmm2, xmm3
1957
+ pxor xmm1, xmm2
1958
+ movdqa xmm11, xmm1
1959
+ pslld xmm1, 25
1960
+ psrld xmm11, 7
1961
+ por xmm1, xmm11
1962
+ pshufd xmm0, xmm0, 0x93
1963
+ pshufd xmm3, xmm3, 0x4E
1964
+ pshufd xmm2, xmm2, 0x39
1965
+ paddd xmm0, xmm6
1966
+ paddd xmm0, xmm1
1967
+ pxor xmm3, xmm0
1968
+ pshuflw xmm3, xmm3, 0xB1
1969
+ pshufhw xmm3, xmm3, 0xB1
1970
+ paddd xmm2, xmm3
1971
+ pxor xmm1, xmm2
1972
+ movdqa xmm11, xmm1
1973
+ pslld xmm1, 20
1974
+ psrld xmm11, 12
1975
+ por xmm1, xmm11
1976
+ paddd xmm0, xmm7
1977
+ paddd xmm0, xmm1
1978
+ pxor xmm3, xmm0
1979
+ movdqa xmm14, xmm3
1980
+ psrld xmm3, 8
1981
+ pslld xmm14, 24
1982
+ pxor xmm3, xmm14
1983
+ paddd xmm2, xmm3
1984
+ pxor xmm1, xmm2
1985
+ movdqa xmm11, xmm1
1986
+ pslld xmm1, 25
1987
+ psrld xmm11, 7
1988
+ por xmm1, xmm11
1989
+ pshufd xmm0, xmm0, 0x39
1990
+ pshufd xmm3, xmm3, 0x4E
1991
+ pshufd xmm2, xmm2, 0x93
1992
+ dec al
1993
+ jz 9f
1994
+ movdqa xmm8, xmm4
1995
+ shufps xmm8, xmm5, 214
1996
+ pshufd xmm9, xmm4, 0x0F
1997
+ pshufd xmm4, xmm8, 0x39
1998
+ movdqa xmm8, xmm6
1999
+ shufps xmm8, xmm7, 250
2000
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
2001
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
2002
+ por xmm9, xmm8
2003
+ movdqa xmm8, xmm7
2004
+ punpcklqdq xmm8, xmm5
2005
+ movdqa xmm10, xmm6
2006
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
2007
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
2008
+ por xmm8, xmm10
2009
+ pshufd xmm8, xmm8, 0x78
2010
+ punpckhdq xmm5, xmm7
2011
+ punpckldq xmm6, xmm5
2012
+ pshufd xmm7, xmm6, 0x1E
2013
+ movdqa xmm5, xmm9
2014
+ movdqa xmm6, xmm8
2015
+ jmp 9b
2016
+ 9:
2017
+ pxor xmm0, xmm2
2018
+ pxor xmm1, xmm3
2019
+ mov eax, r13d
2020
+ cmp rdx, r15
2021
+ jne 2b
2022
+ movups xmmword ptr [rbx], xmm0
2023
+ movups xmmword ptr [rbx+0x10], xmm1
2024
+ jmp 4b
2025
+
2026
+ .p2align 6
2027
/*
 * blake3_compress_in_place_sse2
 *
 * Runs seven add/xor/rotate rounds over a 4x4 matrix of 32-bit words and
 * writes the folded 32-byte result back over the state at [rdi].
 *
 * Inputs, as consumed below:
 *   rdi  32-byte state; loaded into rows 0-1 on entry, overwritten on exit
 *   rsi  64-byte message block (read with unaligned loads)
 *   dl   byte that becomes dword 2 of row 3
 *   rcx  64-bit value that becomes dwords 0-1 of row 3
 *   r8b  byte that becomes dword 3 of row 3
 * NOTE(review): dl / rcx / r8b are presumably block_len / counter / flags
 * of the C prototype; confirm against the declaring header.
 *
 * Row registers: xmm0 = row 0, xmm1 = row 1, xmm2 = row 2, xmm3 = row 3.
 * Message vectors: xmm4-xmm7.  Scratch: xmm8-xmm11, xmm14.
 * Clobbers: rax, rdx, xmm0-xmm11, xmm14, flags.  No stack is used.
 */
blake3_compress_in_place_sse2:
_blake3_compress_in_place_sse2:
        _CET_ENDBR
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]

        /* Keep only the low byte of each narrow argument, the same way
           blake3_compress_xof_sse2 does, so stale upper register bits can
           never be added into row 3. */
        movzx   eax, r8b
        movzx   edx, dl
        shl     rax, 32
        add     rdx, rax                        /* rdx = { dl, r8b } as two dwords */
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4                   /* row 3 = { rcx, rdx } */

        /* Split the block into even-indexed and odd-indexed dwords. */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 0x88                /* xmm4 = words 0, 2, 4, 6 */
        shufps  xmm8, xmm5, 0xDD
        movaps  xmm5, xmm8                      /* xmm5 = words 1, 3, 5, 7 */
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 0x88
        pshufd  xmm6, xmm6, 0x93                /* xmm6 = words 14, 8, 10, 12 */
        shufps  xmm8, xmm7, 0xDD
        pshufd  xmm7, xmm8, 0x93                /* xmm7 = words 15, 9, 11, 13 */

        mov     al, 7                           /* round counter */
9:
        /* First half of the round: rows as loaded. */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1                /* swap 16-bit halves: rotate by 16 */
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11                     /* rotate right by 12 */
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14                     /* rotate right by 8 */
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11                     /* rotate right by 7 */

        /* Rotate the lanes of rows 0, 3 and 2 (row 1 stays put) so the
           second half mixes a different set of four words per lane. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39

        /* Second half of the round: same sequence with xmm6 / xmm7. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11

        /* Undo the lane rotation. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93

        dec     al
        jz      9f                              /* no reshuffle after the last round */

        /* Rearrange xmm4-xmm7 into the message vectors of the next round.
           Each pand/pand/por triple merges lanes of two vectors under a
           pair of complementary masks. */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 0xD6
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 0xFA
        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
        por     xmm9, xmm8
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa  xmm10, xmm6
        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
        por     xmm8, xmm10
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /* Fold the lower rows into the upper rows and store in place. */
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        movups  xmmword ptr [rdi], xmm0
        movups  xmmword ptr [rdi+0x10], xmm1
        ret
2137
+
2138
+ .p2align 6
2139
/*
 * blake3_compress_xof_sse2
 *
 * Runs the same seven-round mixing as blake3_compress_in_place_sse2, but
 * leaves the state at [rdi] untouched and writes a 64-byte result to [r9].
 *
 * Inputs, as consumed below:
 *   rdi  32-byte state (read on entry and again at the end; never written)
 *   rsi  64-byte message block (read with unaligned loads)
 *   dl   byte that becomes dword 2 of row 3 (zero-extended here)
 *   rcx  64-bit value that becomes dwords 0-1 of row 3
 *   r8b  byte that becomes dword 3 of row 3 (zero-extended here)
 *   r9   destination for 64 output bytes (unaligned stores)
 * NOTE(review): dl / rcx / r8b are presumably block_len / counter / flags
 * of the C prototype; confirm against the declaring header.
 *
 * Row registers: xmm0 = row 0, xmm1 = row 1, xmm2 = row 2, xmm3 = row 3.
 * Message vectors: xmm4-xmm7.  Scratch: xmm8-xmm11, xmm14.
 * Clobbers: rax, rdx, xmm0-xmm11, xmm14, flags.  No stack is used.
 */
blake3_compress_xof_sse2:
_blake3_compress_xof_sse2:
        _CET_ENDBR
        movups  xmm0, xmmword ptr [rdi]
        movups  xmm1, xmmword ptr [rdi+0x10]
        movaps  xmm2, xmmword ptr [BLAKE3_IV+rip]

        /* Build row 3 = { rcx, dl, r8b } from zero-extended bytes. */
        movzx   eax, r8b
        movzx   edx, dl
        shl     rax, 32
        add     rdx, rax
        movq    xmm3, rcx
        movq    xmm4, rdx
        punpcklqdq xmm3, xmm4

        /* Split the block into even-indexed and odd-indexed dwords. */
        movups  xmm4, xmmword ptr [rsi]
        movups  xmm5, xmmword ptr [rsi+0x10]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 0x88                /* xmm4 = words 0, 2, 4, 6 */
        shufps  xmm8, xmm5, 0xDD
        movaps  xmm5, xmm8                      /* xmm5 = words 1, 3, 5, 7 */
        movups  xmm6, xmmword ptr [rsi+0x20]
        movups  xmm7, xmmword ptr [rsi+0x30]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 0x88
        pshufd  xmm6, xmm6, 0x93                /* xmm6 = words 14, 8, 10, 12 */
        shufps  xmm8, xmm7, 0xDD
        pshufd  xmm7, xmm8, 0x93                /* xmm7 = words 15, 9, 11, 13 */

        mov     al, 7                           /* round counter */
9:
        /* First half of the round: rows as loaded. */
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1                /* swap 16-bit halves: rotate by 16 */
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11                     /* rotate right by 12 */
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14                     /* rotate right by 8 */
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11                     /* rotate right by 7 */

        /* Rotate the lanes of rows 0, 3 and 2; row 1 stays put. */
        pshufd  xmm0, xmm0, 0x93
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x39

        /* Second half of the round: same sequence with xmm6 / xmm7. */
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshuflw xmm3, xmm3, 0xB1
        pshufhw xmm3, xmm3, 0xB1
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        movdqa  xmm14, xmm3
        psrld   xmm3, 8
        pslld   xmm14, 24
        pxor    xmm3, xmm14
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11

        /* Undo the lane rotation. */
        pshufd  xmm0, xmm0, 0x39
        pshufd  xmm3, xmm3, 0x4E
        pshufd  xmm2, xmm2, 0x93

        dec     al
        jz      9f                              /* no reshuffle after the last round */

        /* Rearrange xmm4-xmm7 into the message vectors of the next round.
           Each pand/pand/por triple merges lanes of two vectors under a
           pair of complementary masks. */
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 0xD6
        pshufd  xmm9, xmm4, 0x0F
        pshufd  xmm4, xmm8, 0x39
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 0xFA
        pand    xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
        pand    xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
        por     xmm9, xmm8
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        movdqa  xmm10, xmm6
        pand    xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
        pand    xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
        por     xmm8, xmm10
        pshufd  xmm8, xmm8, 0x78
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 0x1E
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     9b
9:
        /* Reload the untouched input state; the message registers are free now. */
        movdqu  xmm4, xmmword ptr [rdi]
        movdqu  xmm5, xmmword ptr [rdi+0x10]
        pxor    xmm0, xmm2                      /* bytes  0-15: row 0 ^ row 2 */
        pxor    xmm1, xmm3                      /* bytes 16-31: row 1 ^ row 3 */
        pxor    xmm2, xmm4                      /* bytes 32-47: row 2 ^ state[0..15]  */
        pxor    xmm3, xmm5                      /* bytes 48-63: row 3 ^ state[16..31] */
        movups  xmmword ptr [r9], xmm0
        movups  xmmword ptr [r9+0x10], xmm1
        movups  xmmword ptr [r9+0x20], xmm2
        movups  xmmword ptr [r9+0x30], xmm3
        ret
2257
+
2258
+
2259
/*
 * Read-only constant tables.
 *
 * The section starts on a 64-byte boundary and every table is exactly
 * 16 bytes, so each label is 16-byte aligned. The code relies on that when
 * it uses these labels as movaps / pand / pxor / paddd memory operands.
 * Keep the order and sizes intact.
 */
#ifdef __APPLE__
.static_data
#else
.section .rodata
#endif
.p2align 6
/* Four 32-bit constants loaded as state row 2 by the compress routines. */
BLAKE3_IV:
        .long   0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
/* Per-lane offsets 0..3, masked and added to a broadcast counter word by
   blake3_hash_many_sse2. */
ADD0:
        .long   0, 1, 2, 3
/* Per-iteration step of 4, masked the same way and added to the lane
   counters after each group of four inputs. */
ADD1:
        .long   4, 4, 4, 4
/* Each BLAKE3_IV word broadcast to all four lanes.
   NOTE(review): presumably used by the 4-way path of
   blake3_hash_many_sse2; no use is visible in this part of the file. */
BLAKE3_IV_0:
        .long   0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
BLAKE3_IV_1:
        .long   0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
BLAKE3_IV_2:
        .long   0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
BLAKE3_IV_3:
        .long   0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
/* The value 64 in every lane.
   NOTE(review): presumably the block length for the 4-way path; confirm. */
BLAKE3_BLOCK_LEN:
        .long   64, 64, 64, 64
/* XORed into both operands ahead of pcmpgtd: flipping the top bit makes the
   signed compare order the values as unsigned, which is how the counter
   carry is detected. */
CMP_MSB_MASK:
        .long   0x80000000, 0x80000000, 0x80000000, 0x80000000
/* Lane-select masks used in pand/pand/por triples by the message
   permutation. Each has all-ones in the dwords whose 16-bit words are
   selected by the immediate in its name (0x33 with 0xCC and 0x3F with 0xC0
   are complementary pairs). */
PBLENDW_0x33_MASK:
        .long   0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
PBLENDW_0xCC_MASK:
        .long   0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
PBLENDW_0x3F_MASK:
        .long   0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
PBLENDW_0xC0_MASK:
        .long   0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF