digest-blake3 0.34.0 → 0.37.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,2350 @@
1
; Exported entry points of the SSE2 implementation: hash_many,
; compress_in_place and compress_xof. Each routine is declared public
; under two spellings, with and without a leading underscore.
; NOTE(review): presumably both spellings exist so callers using either
; symbol-decoration convention can link; confirm against the build setup.
+ public _blake3_hash_many_sse2
2
+ public blake3_hash_many_sse2
3
+ public blake3_compress_in_place_sse2
4
+ public _blake3_compress_in_place_sse2
5
+ public blake3_compress_xof_sse2
6
+ public _blake3_compress_xof_sse2
7
+
8
+ _TEXT SEGMENT ALIGN(16) 'CODE'
9
+
10
+ ALIGN 16
11
+ blake3_hash_many_sse2 PROC
12
+ _blake3_hash_many_sse2 PROC
13
+ push r15
14
+ push r14
15
+ push r13
16
+ push r12
17
+ push rsi
18
+ push rdi
19
+ push rbx
20
+ push rbp
21
+ mov rbp, rsp
22
+ sub rsp, 528
23
+ and rsp, 0FFFFFFFFFFFFFFC0H
24
+ movdqa xmmword ptr [rsp+170H], xmm6
25
+ movdqa xmmword ptr [rsp+180H], xmm7
26
+ movdqa xmmword ptr [rsp+190H], xmm8
27
+ movdqa xmmword ptr [rsp+1A0H], xmm9
28
+ movdqa xmmword ptr [rsp+1B0H], xmm10
29
+ movdqa xmmword ptr [rsp+1C0H], xmm11
30
+ movdqa xmmword ptr [rsp+1D0H], xmm12
31
+ movdqa xmmword ptr [rsp+1E0H], xmm13
32
+ movdqa xmmword ptr [rsp+1F0H], xmm14
33
+ movdqa xmmword ptr [rsp+200H], xmm15
34
+ mov rdi, rcx
35
+ mov rsi, rdx
36
+ mov rdx, r8
37
+ mov rcx, r9
38
+ mov r8, qword ptr [rbp+68H]
39
+ movzx r9, byte ptr [rbp+70H]
40
+ neg r9d
41
+ movd xmm0, r9d
42
+ pshufd xmm0, xmm0, 00H
43
+ movdqa xmmword ptr [rsp+130H], xmm0
44
+ movdqa xmm1, xmm0
45
+ pand xmm1, xmmword ptr [ADD0]
46
+ pand xmm0, xmmword ptr [ADD1]
47
+ movdqa xmmword ptr [rsp+150H], xmm0
48
+ movd xmm0, r8d
49
+ pshufd xmm0, xmm0, 00H
50
+ paddd xmm0, xmm1
51
+ movdqa xmmword ptr [rsp+110H], xmm0
52
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK]
53
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK]
54
+ pcmpgtd xmm1, xmm0
55
+ shr r8, 32
56
+ movd xmm2, r8d
57
+ pshufd xmm2, xmm2, 00H
58
+ psubd xmm2, xmm1
59
+ movdqa xmmword ptr [rsp+120H], xmm2
60
+ mov rbx, qword ptr [rbp+90H]
61
+ mov r15, rdx
62
+ shl r15, 6
63
+ movzx r13d, byte ptr [rbp+78H]
64
+ movzx r12d, byte ptr [rbp+88H]
65
+ cmp rsi, 4
66
+ jc final3blocks
67
+ outerloop4:
68
+ movdqu xmm3, xmmword ptr [rcx]
69
+ pshufd xmm0, xmm3, 00H
70
+ pshufd xmm1, xmm3, 55H
71
+ pshufd xmm2, xmm3, 0AAH
72
+ pshufd xmm3, xmm3, 0FFH
73
+ movdqu xmm7, xmmword ptr [rcx+10H]
74
+ pshufd xmm4, xmm7, 00H
75
+ pshufd xmm5, xmm7, 55H
76
+ pshufd xmm6, xmm7, 0AAH
77
+ pshufd xmm7, xmm7, 0FFH
78
+ mov r8, qword ptr [rdi]
79
+ mov r9, qword ptr [rdi+8H]
80
+ mov r10, qword ptr [rdi+10H]
81
+ mov r11, qword ptr [rdi+18H]
82
+ movzx eax, byte ptr [rbp+80H]
83
+ or eax, r13d
84
+ xor edx, edx
85
+ innerloop4:
86
+ mov r14d, eax
87
+ or eax, r12d
88
+ add rdx, 64
89
+ cmp rdx, r15
90
+ cmovne eax, r14d
91
+ movdqu xmm8, xmmword ptr [r8+rdx-40H]
92
+ movdqu xmm9, xmmword ptr [r9+rdx-40H]
93
+ movdqu xmm10, xmmword ptr [r10+rdx-40H]
94
+ movdqu xmm11, xmmword ptr [r11+rdx-40H]
95
+ movdqa xmm12, xmm8
96
+ punpckldq xmm8, xmm9
97
+ punpckhdq xmm12, xmm9
98
+ movdqa xmm14, xmm10
99
+ punpckldq xmm10, xmm11
100
+ punpckhdq xmm14, xmm11
101
+ movdqa xmm9, xmm8
102
+ punpcklqdq xmm8, xmm10
103
+ punpckhqdq xmm9, xmm10
104
+ movdqa xmm13, xmm12
105
+ punpcklqdq xmm12, xmm14
106
+ punpckhqdq xmm13, xmm14
107
+ movdqa xmmword ptr [rsp], xmm8
108
+ movdqa xmmword ptr [rsp+10H], xmm9
109
+ movdqa xmmword ptr [rsp+20H], xmm12
110
+ movdqa xmmword ptr [rsp+30H], xmm13
111
+ movdqu xmm8, xmmword ptr [r8+rdx-30H]
112
+ movdqu xmm9, xmmword ptr [r9+rdx-30H]
113
+ movdqu xmm10, xmmword ptr [r10+rdx-30H]
114
+ movdqu xmm11, xmmword ptr [r11+rdx-30H]
115
+ movdqa xmm12, xmm8
116
+ punpckldq xmm8, xmm9
117
+ punpckhdq xmm12, xmm9
118
+ movdqa xmm14, xmm10
119
+ punpckldq xmm10, xmm11
120
+ punpckhdq xmm14, xmm11
121
+ movdqa xmm9, xmm8
122
+ punpcklqdq xmm8, xmm10
123
+ punpckhqdq xmm9, xmm10
124
+ movdqa xmm13, xmm12
125
+ punpcklqdq xmm12, xmm14
126
+ punpckhqdq xmm13, xmm14
127
+ movdqa xmmword ptr [rsp+40H], xmm8
128
+ movdqa xmmword ptr [rsp+50H], xmm9
129
+ movdqa xmmword ptr [rsp+60H], xmm12
130
+ movdqa xmmword ptr [rsp+70H], xmm13
131
+ movdqu xmm8, xmmword ptr [r8+rdx-20H]
132
+ movdqu xmm9, xmmword ptr [r9+rdx-20H]
133
+ movdqu xmm10, xmmword ptr [r10+rdx-20H]
134
+ movdqu xmm11, xmmword ptr [r11+rdx-20H]
135
+ movdqa xmm12, xmm8
136
+ punpckldq xmm8, xmm9
137
+ punpckhdq xmm12, xmm9
138
+ movdqa xmm14, xmm10
139
+ punpckldq xmm10, xmm11
140
+ punpckhdq xmm14, xmm11
141
+ movdqa xmm9, xmm8
142
+ punpcklqdq xmm8, xmm10
143
+ punpckhqdq xmm9, xmm10
144
+ movdqa xmm13, xmm12
145
+ punpcklqdq xmm12, xmm14
146
+ punpckhqdq xmm13, xmm14
147
+ movdqa xmmword ptr [rsp+80H], xmm8
148
+ movdqa xmmword ptr [rsp+90H], xmm9
149
+ movdqa xmmword ptr [rsp+0A0H], xmm12
150
+ movdqa xmmword ptr [rsp+0B0H], xmm13
151
+ movdqu xmm8, xmmword ptr [r8+rdx-10H]
152
+ movdqu xmm9, xmmword ptr [r9+rdx-10H]
153
+ movdqu xmm10, xmmword ptr [r10+rdx-10H]
154
+ movdqu xmm11, xmmword ptr [r11+rdx-10H]
155
+ movdqa xmm12, xmm8
156
+ punpckldq xmm8, xmm9
157
+ punpckhdq xmm12, xmm9
158
+ movdqa xmm14, xmm10
159
+ punpckldq xmm10, xmm11
160
+ punpckhdq xmm14, xmm11
161
+ movdqa xmm9, xmm8
162
+ punpcklqdq xmm8, xmm10
163
+ punpckhqdq xmm9, xmm10
164
+ movdqa xmm13, xmm12
165
+ punpcklqdq xmm12, xmm14
166
+ punpckhqdq xmm13, xmm14
167
+ movdqa xmmword ptr [rsp+0C0H], xmm8
168
+ movdqa xmmword ptr [rsp+0D0H], xmm9
169
+ movdqa xmmword ptr [rsp+0E0H], xmm12
170
+ movdqa xmmword ptr [rsp+0F0H], xmm13
171
+ movdqa xmm9, xmmword ptr [BLAKE3_IV_1]
172
+ movdqa xmm10, xmmword ptr [BLAKE3_IV_2]
173
+ movdqa xmm11, xmmword ptr [BLAKE3_IV_3]
174
+ movdqa xmm12, xmmword ptr [rsp+110H]
175
+ movdqa xmm13, xmmword ptr [rsp+120H]
176
+ movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN]
177
+ movd xmm15, eax
178
+ pshufd xmm15, xmm15, 00H
179
+ prefetcht0 byte ptr [r8+rdx+80H]
180
+ prefetcht0 byte ptr [r9+rdx+80H]
181
+ prefetcht0 byte ptr [r10+rdx+80H]
182
+ prefetcht0 byte ptr [r11+rdx+80H]
183
+ paddd xmm0, xmmword ptr [rsp]
184
+ paddd xmm1, xmmword ptr [rsp+20H]
185
+ paddd xmm2, xmmword ptr [rsp+40H]
186
+ paddd xmm3, xmmword ptr [rsp+60H]
187
+ paddd xmm0, xmm4
188
+ paddd xmm1, xmm5
189
+ paddd xmm2, xmm6
190
+ paddd xmm3, xmm7
191
+ pxor xmm12, xmm0
192
+ pxor xmm13, xmm1
193
+ pxor xmm14, xmm2
194
+ pxor xmm15, xmm3
195
+ pshuflw xmm12, xmm12, 0B1H
196
+ pshufhw xmm12, xmm12, 0B1H
197
+ pshuflw xmm13, xmm13, 0B1H
198
+ pshufhw xmm13, xmm13, 0B1H
199
+ pshuflw xmm14, xmm14, 0B1H
200
+ pshufhw xmm14, xmm14, 0B1H
201
+ pshuflw xmm15, xmm15, 0B1H
202
+ pshufhw xmm15, xmm15, 0B1H
203
+ movdqa xmm8, xmmword ptr [BLAKE3_IV_0]
204
+ paddd xmm8, xmm12
205
+ paddd xmm9, xmm13
206
+ paddd xmm10, xmm14
207
+ paddd xmm11, xmm15
208
+ pxor xmm4, xmm8
209
+ pxor xmm5, xmm9
210
+ pxor xmm6, xmm10
211
+ pxor xmm7, xmm11
212
+ movdqa xmmword ptr [rsp+100H], xmm8
213
+ movdqa xmm8, xmm4
214
+ psrld xmm8, 12
215
+ pslld xmm4, 20
216
+ por xmm4, xmm8
217
+ movdqa xmm8, xmm5
218
+ psrld xmm8, 12
219
+ pslld xmm5, 20
220
+ por xmm5, xmm8
221
+ movdqa xmm8, xmm6
222
+ psrld xmm8, 12
223
+ pslld xmm6, 20
224
+ por xmm6, xmm8
225
+ movdqa xmm8, xmm7
226
+ psrld xmm8, 12
227
+ pslld xmm7, 20
228
+ por xmm7, xmm8
229
+ paddd xmm0, xmmword ptr [rsp+10H]
230
+ paddd xmm1, xmmword ptr [rsp+30H]
231
+ paddd xmm2, xmmword ptr [rsp+50H]
232
+ paddd xmm3, xmmword ptr [rsp+70H]
233
+ paddd xmm0, xmm4
234
+ paddd xmm1, xmm5
235
+ paddd xmm2, xmm6
236
+ paddd xmm3, xmm7
237
+ pxor xmm12, xmm0
238
+ pxor xmm13, xmm1
239
+ pxor xmm14, xmm2
240
+ pxor xmm15, xmm3
241
+ movdqa xmm8, xmm12
242
+ psrld xmm12, 8
243
+ pslld xmm8, 24
244
+ pxor xmm12, xmm8
245
+ movdqa xmm8, xmm13
246
+ psrld xmm13, 8
247
+ pslld xmm8, 24
248
+ pxor xmm13, xmm8
249
+ movdqa xmm8, xmm14
250
+ psrld xmm14, 8
251
+ pslld xmm8, 24
252
+ pxor xmm14, xmm8
253
+ movdqa xmm8, xmm15
254
+ psrld xmm15, 8
255
+ pslld xmm8, 24
256
+ pxor xmm15, xmm8
257
+ movdqa xmm8, xmmword ptr [rsp+100H]
258
+ paddd xmm8, xmm12
259
+ paddd xmm9, xmm13
260
+ paddd xmm10, xmm14
261
+ paddd xmm11, xmm15
262
+ pxor xmm4, xmm8
263
+ pxor xmm5, xmm9
264
+ pxor xmm6, xmm10
265
+ pxor xmm7, xmm11
266
+ movdqa xmmword ptr [rsp+100H], xmm8
267
+ movdqa xmm8, xmm4
268
+ psrld xmm8, 7
269
+ pslld xmm4, 25
270
+ por xmm4, xmm8
271
+ movdqa xmm8, xmm5
272
+ psrld xmm8, 7
273
+ pslld xmm5, 25
274
+ por xmm5, xmm8
275
+ movdqa xmm8, xmm6
276
+ psrld xmm8, 7
277
+ pslld xmm6, 25
278
+ por xmm6, xmm8
279
+ movdqa xmm8, xmm7
280
+ psrld xmm8, 7
281
+ pslld xmm7, 25
282
+ por xmm7, xmm8
283
+ paddd xmm0, xmmword ptr [rsp+80H]
284
+ paddd xmm1, xmmword ptr [rsp+0A0H]
285
+ paddd xmm2, xmmword ptr [rsp+0C0H]
286
+ paddd xmm3, xmmword ptr [rsp+0E0H]
287
+ paddd xmm0, xmm5
288
+ paddd xmm1, xmm6
289
+ paddd xmm2, xmm7
290
+ paddd xmm3, xmm4
291
+ pxor xmm15, xmm0
292
+ pxor xmm12, xmm1
293
+ pxor xmm13, xmm2
294
+ pxor xmm14, xmm3
295
+ pshuflw xmm15, xmm15, 0B1H
296
+ pshufhw xmm15, xmm15, 0B1H
297
+ pshuflw xmm12, xmm12, 0B1H
298
+ pshufhw xmm12, xmm12, 0B1H
299
+ pshuflw xmm13, xmm13, 0B1H
300
+ pshufhw xmm13, xmm13, 0B1H
301
+ pshuflw xmm14, xmm14, 0B1H
302
+ pshufhw xmm14, xmm14, 0B1H
303
+ paddd xmm10, xmm15
304
+ paddd xmm11, xmm12
305
+ movdqa xmm8, xmmword ptr [rsp+100H]
306
+ paddd xmm8, xmm13
307
+ paddd xmm9, xmm14
308
+ pxor xmm5, xmm10
309
+ pxor xmm6, xmm11
310
+ pxor xmm7, xmm8
311
+ pxor xmm4, xmm9
312
+ movdqa xmmword ptr [rsp+100H], xmm8
313
+ movdqa xmm8, xmm5
314
+ psrld xmm8, 12
315
+ pslld xmm5, 20
316
+ por xmm5, xmm8
317
+ movdqa xmm8, xmm6
318
+ psrld xmm8, 12
319
+ pslld xmm6, 20
320
+ por xmm6, xmm8
321
+ movdqa xmm8, xmm7
322
+ psrld xmm8, 12
323
+ pslld xmm7, 20
324
+ por xmm7, xmm8
325
+ movdqa xmm8, xmm4
326
+ psrld xmm8, 12
327
+ pslld xmm4, 20
328
+ por xmm4, xmm8
329
+ paddd xmm0, xmmword ptr [rsp+90H]
330
+ paddd xmm1, xmmword ptr [rsp+0B0H]
331
+ paddd xmm2, xmmword ptr [rsp+0D0H]
332
+ paddd xmm3, xmmword ptr [rsp+0F0H]
333
+ paddd xmm0, xmm5
334
+ paddd xmm1, xmm6
335
+ paddd xmm2, xmm7
336
+ paddd xmm3, xmm4
337
+ pxor xmm15, xmm0
338
+ pxor xmm12, xmm1
339
+ pxor xmm13, xmm2
340
+ pxor xmm14, xmm3
341
+ movdqa xmm8, xmm15
342
+ psrld xmm15, 8
343
+ pslld xmm8, 24
344
+ pxor xmm15, xmm8
345
+ movdqa xmm8, xmm12
346
+ psrld xmm12, 8
347
+ pslld xmm8, 24
348
+ pxor xmm12, xmm8
349
+ movdqa xmm8, xmm13
350
+ psrld xmm13, 8
351
+ pslld xmm8, 24
352
+ pxor xmm13, xmm8
353
+ movdqa xmm8, xmm14
354
+ psrld xmm14, 8
355
+ pslld xmm8, 24
356
+ pxor xmm14, xmm8
357
+ paddd xmm10, xmm15
358
+ paddd xmm11, xmm12
359
+ movdqa xmm8, xmmword ptr [rsp+100H]
360
+ paddd xmm8, xmm13
361
+ paddd xmm9, xmm14
362
+ pxor xmm5, xmm10
363
+ pxor xmm6, xmm11
364
+ pxor xmm7, xmm8
365
+ pxor xmm4, xmm9
366
+ movdqa xmmword ptr [rsp+100H], xmm8
367
+ movdqa xmm8, xmm5
368
+ psrld xmm8, 7
369
+ pslld xmm5, 25
370
+ por xmm5, xmm8
371
+ movdqa xmm8, xmm6
372
+ psrld xmm8, 7
373
+ pslld xmm6, 25
374
+ por xmm6, xmm8
375
+ movdqa xmm8, xmm7
376
+ psrld xmm8, 7
377
+ pslld xmm7, 25
378
+ por xmm7, xmm8
379
+ movdqa xmm8, xmm4
380
+ psrld xmm8, 7
381
+ pslld xmm4, 25
382
+ por xmm4, xmm8
383
+ paddd xmm0, xmmword ptr [rsp+20H]
384
+ paddd xmm1, xmmword ptr [rsp+30H]
385
+ paddd xmm2, xmmword ptr [rsp+70H]
386
+ paddd xmm3, xmmword ptr [rsp+40H]
387
+ paddd xmm0, xmm4
388
+ paddd xmm1, xmm5
389
+ paddd xmm2, xmm6
390
+ paddd xmm3, xmm7
391
+ pxor xmm12, xmm0
392
+ pxor xmm13, xmm1
393
+ pxor xmm14, xmm2
394
+ pxor xmm15, xmm3
395
+ pshuflw xmm12, xmm12, 0B1H
396
+ pshufhw xmm12, xmm12, 0B1H
397
+ pshuflw xmm13, xmm13, 0B1H
398
+ pshufhw xmm13, xmm13, 0B1H
399
+ pshuflw xmm14, xmm14, 0B1H
400
+ pshufhw xmm14, xmm14, 0B1H
401
+ pshuflw xmm15, xmm15, 0B1H
402
+ pshufhw xmm15, xmm15, 0B1H
403
+ movdqa xmm8, xmmword ptr [rsp+100H]
404
+ paddd xmm8, xmm12
405
+ paddd xmm9, xmm13
406
+ paddd xmm10, xmm14
407
+ paddd xmm11, xmm15
408
+ pxor xmm4, xmm8
409
+ pxor xmm5, xmm9
410
+ pxor xmm6, xmm10
411
+ pxor xmm7, xmm11
412
+ movdqa xmmword ptr [rsp+100H], xmm8
413
+ movdqa xmm8, xmm4
414
+ psrld xmm8, 12
415
+ pslld xmm4, 20
416
+ por xmm4, xmm8
417
+ movdqa xmm8, xmm5
418
+ psrld xmm8, 12
419
+ pslld xmm5, 20
420
+ por xmm5, xmm8
421
+ movdqa xmm8, xmm6
422
+ psrld xmm8, 12
423
+ pslld xmm6, 20
424
+ por xmm6, xmm8
425
+ movdqa xmm8, xmm7
426
+ psrld xmm8, 12
427
+ pslld xmm7, 20
428
+ por xmm7, xmm8
429
+ paddd xmm0, xmmword ptr [rsp+60H]
430
+ paddd xmm1, xmmword ptr [rsp+0A0H]
431
+ paddd xmm2, xmmword ptr [rsp]
432
+ paddd xmm3, xmmword ptr [rsp+0D0H]
433
+ paddd xmm0, xmm4
434
+ paddd xmm1, xmm5
435
+ paddd xmm2, xmm6
436
+ paddd xmm3, xmm7
437
+ pxor xmm12, xmm0
438
+ pxor xmm13, xmm1
439
+ pxor xmm14, xmm2
440
+ pxor xmm15, xmm3
441
+ movdqa xmm8, xmm12
442
+ psrld xmm12, 8
443
+ pslld xmm8, 24
444
+ pxor xmm12, xmm8
445
+ movdqa xmm8, xmm13
446
+ psrld xmm13, 8
447
+ pslld xmm8, 24
448
+ pxor xmm13, xmm8
449
+ movdqa xmm8, xmm14
450
+ psrld xmm14, 8
451
+ pslld xmm8, 24
452
+ pxor xmm14, xmm8
453
+ movdqa xmm8, xmm15
454
+ psrld xmm15, 8
455
+ pslld xmm8, 24
456
+ pxor xmm15, xmm8
457
+ movdqa xmm8, xmmword ptr [rsp+100H]
458
+ paddd xmm8, xmm12
459
+ paddd xmm9, xmm13
460
+ paddd xmm10, xmm14
461
+ paddd xmm11, xmm15
462
+ pxor xmm4, xmm8
463
+ pxor xmm5, xmm9
464
+ pxor xmm6, xmm10
465
+ pxor xmm7, xmm11
466
+ movdqa xmmword ptr [rsp+100H], xmm8
467
+ movdqa xmm8, xmm4
468
+ psrld xmm8, 7
469
+ pslld xmm4, 25
470
+ por xmm4, xmm8
471
+ movdqa xmm8, xmm5
472
+ psrld xmm8, 7
473
+ pslld xmm5, 25
474
+ por xmm5, xmm8
475
+ movdqa xmm8, xmm6
476
+ psrld xmm8, 7
477
+ pslld xmm6, 25
478
+ por xmm6, xmm8
479
+ movdqa xmm8, xmm7
480
+ psrld xmm8, 7
481
+ pslld xmm7, 25
482
+ por xmm7, xmm8
483
+ paddd xmm0, xmmword ptr [rsp+10H]
484
+ paddd xmm1, xmmword ptr [rsp+0C0H]
485
+ paddd xmm2, xmmword ptr [rsp+90H]
486
+ paddd xmm3, xmmword ptr [rsp+0F0H]
487
+ paddd xmm0, xmm5
488
+ paddd xmm1, xmm6
489
+ paddd xmm2, xmm7
490
+ paddd xmm3, xmm4
491
+ pxor xmm15, xmm0
492
+ pxor xmm12, xmm1
493
+ pxor xmm13, xmm2
494
+ pxor xmm14, xmm3
495
+ pshuflw xmm15, xmm15, 0B1H
496
+ pshufhw xmm15, xmm15, 0B1H
497
+ pshuflw xmm12, xmm12, 0B1H
498
+ pshufhw xmm12, xmm12, 0B1H
499
+ pshuflw xmm13, xmm13, 0B1H
500
+ pshufhw xmm13, xmm13, 0B1H
501
+ pshuflw xmm14, xmm14, 0B1H
502
+ pshufhw xmm14, xmm14, 0B1H
503
+ paddd xmm10, xmm15
504
+ paddd xmm11, xmm12
505
+ movdqa xmm8, xmmword ptr [rsp+100H]
506
+ paddd xmm8, xmm13
507
+ paddd xmm9, xmm14
508
+ pxor xmm5, xmm10
509
+ pxor xmm6, xmm11
510
+ pxor xmm7, xmm8
511
+ pxor xmm4, xmm9
512
+ movdqa xmmword ptr [rsp+100H], xmm8
513
+ movdqa xmm8, xmm5
514
+ psrld xmm8, 12
515
+ pslld xmm5, 20
516
+ por xmm5, xmm8
517
+ movdqa xmm8, xmm6
518
+ psrld xmm8, 12
519
+ pslld xmm6, 20
520
+ por xmm6, xmm8
521
+ movdqa xmm8, xmm7
522
+ psrld xmm8, 12
523
+ pslld xmm7, 20
524
+ por xmm7, xmm8
525
+ movdqa xmm8, xmm4
526
+ psrld xmm8, 12
527
+ pslld xmm4, 20
528
+ por xmm4, xmm8
529
+ paddd xmm0, xmmword ptr [rsp+0B0H]
530
+ paddd xmm1, xmmword ptr [rsp+50H]
531
+ paddd xmm2, xmmword ptr [rsp+0E0H]
532
+ paddd xmm3, xmmword ptr [rsp+80H]
533
+ paddd xmm0, xmm5
534
+ paddd xmm1, xmm6
535
+ paddd xmm2, xmm7
536
+ paddd xmm3, xmm4
537
+ pxor xmm15, xmm0
538
+ pxor xmm12, xmm1
539
+ pxor xmm13, xmm2
540
+ pxor xmm14, xmm3
541
+ movdqa xmm8, xmm15
542
+ psrld xmm15, 8
543
+ pslld xmm8, 24
544
+ pxor xmm15, xmm8
545
+ movdqa xmm8, xmm12
546
+ psrld xmm12, 8
547
+ pslld xmm8, 24
548
+ pxor xmm12, xmm8
549
+ movdqa xmm8, xmm13
550
+ psrld xmm13, 8
551
+ pslld xmm8, 24
552
+ pxor xmm13, xmm8
553
+ movdqa xmm8, xmm14
554
+ psrld xmm14, 8
555
+ pslld xmm8, 24
556
+ pxor xmm14, xmm8
557
+ paddd xmm10, xmm15
558
+ paddd xmm11, xmm12
559
+ movdqa xmm8, xmmword ptr [rsp+100H]
560
+ paddd xmm8, xmm13
561
+ paddd xmm9, xmm14
562
+ pxor xmm5, xmm10
563
+ pxor xmm6, xmm11
564
+ pxor xmm7, xmm8
565
+ pxor xmm4, xmm9
566
+ movdqa xmmword ptr [rsp+100H], xmm8
567
+ movdqa xmm8, xmm5
568
+ psrld xmm8, 7
569
+ pslld xmm5, 25
570
+ por xmm5, xmm8
571
+ movdqa xmm8, xmm6
572
+ psrld xmm8, 7
573
+ pslld xmm6, 25
574
+ por xmm6, xmm8
575
+ movdqa xmm8, xmm7
576
+ psrld xmm8, 7
577
+ pslld xmm7, 25
578
+ por xmm7, xmm8
579
+ movdqa xmm8, xmm4
580
+ psrld xmm8, 7
581
+ pslld xmm4, 25
582
+ por xmm4, xmm8
583
+ paddd xmm0, xmmword ptr [rsp+30H]
584
+ paddd xmm1, xmmword ptr [rsp+0A0H]
585
+ paddd xmm2, xmmword ptr [rsp+0D0H]
586
+ paddd xmm3, xmmword ptr [rsp+70H]
587
+ paddd xmm0, xmm4
588
+ paddd xmm1, xmm5
589
+ paddd xmm2, xmm6
590
+ paddd xmm3, xmm7
591
+ pxor xmm12, xmm0
592
+ pxor xmm13, xmm1
593
+ pxor xmm14, xmm2
594
+ pxor xmm15, xmm3
595
+ pshuflw xmm12, xmm12, 0B1H
596
+ pshufhw xmm12, xmm12, 0B1H
597
+ pshuflw xmm13, xmm13, 0B1H
598
+ pshufhw xmm13, xmm13, 0B1H
599
+ pshuflw xmm14, xmm14, 0B1H
600
+ pshufhw xmm14, xmm14, 0B1H
601
+ pshuflw xmm15, xmm15, 0B1H
602
+ pshufhw xmm15, xmm15, 0B1H
603
+ movdqa xmm8, xmmword ptr [rsp+100H]
604
+ paddd xmm8, xmm12
605
+ paddd xmm9, xmm13
606
+ paddd xmm10, xmm14
607
+ paddd xmm11, xmm15
608
+ pxor xmm4, xmm8
609
+ pxor xmm5, xmm9
610
+ pxor xmm6, xmm10
611
+ pxor xmm7, xmm11
612
+ movdqa xmmword ptr [rsp+100H], xmm8
613
+ movdqa xmm8, xmm4
614
+ psrld xmm8, 12
615
+ pslld xmm4, 20
616
+ por xmm4, xmm8
617
+ movdqa xmm8, xmm5
618
+ psrld xmm8, 12
619
+ pslld xmm5, 20
620
+ por xmm5, xmm8
621
+ movdqa xmm8, xmm6
622
+ psrld xmm8, 12
623
+ pslld xmm6, 20
624
+ por xmm6, xmm8
625
+ movdqa xmm8, xmm7
626
+ psrld xmm8, 12
627
+ pslld xmm7, 20
628
+ por xmm7, xmm8
629
+ paddd xmm0, xmmword ptr [rsp+40H]
630
+ paddd xmm1, xmmword ptr [rsp+0C0H]
631
+ paddd xmm2, xmmword ptr [rsp+20H]
632
+ paddd xmm3, xmmword ptr [rsp+0E0H]
633
+ paddd xmm0, xmm4
634
+ paddd xmm1, xmm5
635
+ paddd xmm2, xmm6
636
+ paddd xmm3, xmm7
637
+ pxor xmm12, xmm0
638
+ pxor xmm13, xmm1
639
+ pxor xmm14, xmm2
640
+ pxor xmm15, xmm3
641
+ movdqa xmm8, xmm12
642
+ psrld xmm12, 8
643
+ pslld xmm8, 24
644
+ pxor xmm12, xmm8
645
+ movdqa xmm8, xmm13
646
+ psrld xmm13, 8
647
+ pslld xmm8, 24
648
+ pxor xmm13, xmm8
649
+ movdqa xmm8, xmm14
650
+ psrld xmm14, 8
651
+ pslld xmm8, 24
652
+ pxor xmm14, xmm8
653
+ movdqa xmm8, xmm15
654
+ psrld xmm15, 8
655
+ pslld xmm8, 24
656
+ pxor xmm15, xmm8
657
+ movdqa xmm8, xmmword ptr [rsp+100H]
658
+ paddd xmm8, xmm12
659
+ paddd xmm9, xmm13
660
+ paddd xmm10, xmm14
661
+ paddd xmm11, xmm15
662
+ pxor xmm4, xmm8
663
+ pxor xmm5, xmm9
664
+ pxor xmm6, xmm10
665
+ pxor xmm7, xmm11
666
+ movdqa xmmword ptr [rsp+100H], xmm8
667
+ movdqa xmm8, xmm4
668
+ psrld xmm8, 7
669
+ pslld xmm4, 25
670
+ por xmm4, xmm8
671
+ movdqa xmm8, xmm5
672
+ psrld xmm8, 7
673
+ pslld xmm5, 25
674
+ por xmm5, xmm8
675
+ movdqa xmm8, xmm6
676
+ psrld xmm8, 7
677
+ pslld xmm6, 25
678
+ por xmm6, xmm8
679
+ movdqa xmm8, xmm7
680
+ psrld xmm8, 7
681
+ pslld xmm7, 25
682
+ por xmm7, xmm8
683
+ paddd xmm0, xmmword ptr [rsp+60H]
684
+ paddd xmm1, xmmword ptr [rsp+90H]
685
+ paddd xmm2, xmmword ptr [rsp+0B0H]
686
+ paddd xmm3, xmmword ptr [rsp+80H]
687
+ paddd xmm0, xmm5
688
+ paddd xmm1, xmm6
689
+ paddd xmm2, xmm7
690
+ paddd xmm3, xmm4
691
+ pxor xmm15, xmm0
692
+ pxor xmm12, xmm1
693
+ pxor xmm13, xmm2
694
+ pxor xmm14, xmm3
695
+ pshuflw xmm15, xmm15, 0B1H
696
+ pshufhw xmm15, xmm15, 0B1H
697
+ pshuflw xmm12, xmm12, 0B1H
698
+ pshufhw xmm12, xmm12, 0B1H
699
+ pshuflw xmm13, xmm13, 0B1H
700
+ pshufhw xmm13, xmm13, 0B1H
701
+ pshuflw xmm14, xmm14, 0B1H
702
+ pshufhw xmm14, xmm14, 0B1H
703
+ paddd xmm10, xmm15
704
+ paddd xmm11, xmm12
705
+ movdqa xmm8, xmmword ptr [rsp+100H]
706
+ paddd xmm8, xmm13
707
+ paddd xmm9, xmm14
708
+ pxor xmm5, xmm10
709
+ pxor xmm6, xmm11
710
+ pxor xmm7, xmm8
711
+ pxor xmm4, xmm9
712
+ movdqa xmmword ptr [rsp+100H], xmm8
713
+ movdqa xmm8, xmm5
714
+ psrld xmm8, 12
715
+ pslld xmm5, 20
716
+ por xmm5, xmm8
717
+ movdqa xmm8, xmm6
718
+ psrld xmm8, 12
719
+ pslld xmm6, 20
720
+ por xmm6, xmm8
721
+ movdqa xmm8, xmm7
722
+ psrld xmm8, 12
723
+ pslld xmm7, 20
724
+ por xmm7, xmm8
725
+ movdqa xmm8, xmm4
726
+ psrld xmm8, 12
727
+ pslld xmm4, 20
728
+ por xmm4, xmm8
729
+ paddd xmm0, xmmword ptr [rsp+50H]
730
+ paddd xmm1, xmmword ptr [rsp]
731
+ paddd xmm2, xmmword ptr [rsp+0F0H]
732
+ paddd xmm3, xmmword ptr [rsp+10H]
733
+ paddd xmm0, xmm5
734
+ paddd xmm1, xmm6
735
+ paddd xmm2, xmm7
736
+ paddd xmm3, xmm4
737
+ pxor xmm15, xmm0
738
+ pxor xmm12, xmm1
739
+ pxor xmm13, xmm2
740
+ pxor xmm14, xmm3
741
+ movdqa xmm8, xmm15
742
+ psrld xmm15, 8
743
+ pslld xmm8, 24
744
+ pxor xmm15, xmm8
745
+ movdqa xmm8, xmm12
746
+ psrld xmm12, 8
747
+ pslld xmm8, 24
748
+ pxor xmm12, xmm8
749
+ movdqa xmm8, xmm13
750
+ psrld xmm13, 8
751
+ pslld xmm8, 24
752
+ pxor xmm13, xmm8
753
+ movdqa xmm8, xmm14
754
+ psrld xmm14, 8
755
+ pslld xmm8, 24
756
+ pxor xmm14, xmm8
757
+ paddd xmm10, xmm15
758
+ paddd xmm11, xmm12
759
+ movdqa xmm8, xmmword ptr [rsp+100H]
760
+ paddd xmm8, xmm13
761
+ paddd xmm9, xmm14
762
+ pxor xmm5, xmm10
763
+ pxor xmm6, xmm11
764
+ pxor xmm7, xmm8
765
+ pxor xmm4, xmm9
766
+ movdqa xmmword ptr [rsp+100H], xmm8
767
+ movdqa xmm8, xmm5
768
+ psrld xmm8, 7
769
+ pslld xmm5, 25
770
+ por xmm5, xmm8
771
+ movdqa xmm8, xmm6
772
+ psrld xmm8, 7
773
+ pslld xmm6, 25
774
+ por xmm6, xmm8
775
+ movdqa xmm8, xmm7
776
+ psrld xmm8, 7
777
+ pslld xmm7, 25
778
+ por xmm7, xmm8
779
+ movdqa xmm8, xmm4
780
+ psrld xmm8, 7
781
+ pslld xmm4, 25
782
+ por xmm4, xmm8
783
+ paddd xmm0, xmmword ptr [rsp+0A0H]
784
+ paddd xmm1, xmmword ptr [rsp+0C0H]
785
+ paddd xmm2, xmmword ptr [rsp+0E0H]
786
+ paddd xmm3, xmmword ptr [rsp+0D0H]
787
+ paddd xmm0, xmm4
788
+ paddd xmm1, xmm5
789
+ paddd xmm2, xmm6
790
+ paddd xmm3, xmm7
791
+ pxor xmm12, xmm0
792
+ pxor xmm13, xmm1
793
+ pxor xmm14, xmm2
794
+ pxor xmm15, xmm3
795
+ pshuflw xmm12, xmm12, 0B1H
796
+ pshufhw xmm12, xmm12, 0B1H
797
+ pshuflw xmm13, xmm13, 0B1H
798
+ pshufhw xmm13, xmm13, 0B1H
799
+ pshuflw xmm14, xmm14, 0B1H
800
+ pshufhw xmm14, xmm14, 0B1H
801
+ pshuflw xmm15, xmm15, 0B1H
802
+ pshufhw xmm15, xmm15, 0B1H
803
+ movdqa xmm8, xmmword ptr [rsp+100H]
804
+ paddd xmm8, xmm12
805
+ paddd xmm9, xmm13
806
+ paddd xmm10, xmm14
807
+ paddd xmm11, xmm15
808
+ pxor xmm4, xmm8
809
+ pxor xmm5, xmm9
810
+ pxor xmm6, xmm10
811
+ pxor xmm7, xmm11
812
+ movdqa xmmword ptr [rsp+100H], xmm8
813
+ movdqa xmm8, xmm4
814
+ psrld xmm8, 12
815
+ pslld xmm4, 20
816
+ por xmm4, xmm8
817
+ movdqa xmm8, xmm5
818
+ psrld xmm8, 12
819
+ pslld xmm5, 20
820
+ por xmm5, xmm8
821
+ movdqa xmm8, xmm6
822
+ psrld xmm8, 12
823
+ pslld xmm6, 20
824
+ por xmm6, xmm8
825
+ movdqa xmm8, xmm7
826
+ psrld xmm8, 12
827
+ pslld xmm7, 20
828
+ por xmm7, xmm8
829
+ paddd xmm0, xmmword ptr [rsp+70H]
830
+ paddd xmm1, xmmword ptr [rsp+90H]
831
+ paddd xmm2, xmmword ptr [rsp+30H]
832
+ paddd xmm3, xmmword ptr [rsp+0F0H]
833
+ paddd xmm0, xmm4
834
+ paddd xmm1, xmm5
835
+ paddd xmm2, xmm6
836
+ paddd xmm3, xmm7
837
+ pxor xmm12, xmm0
838
+ pxor xmm13, xmm1
839
+ pxor xmm14, xmm2
840
+ pxor xmm15, xmm3
841
+ movdqa xmm8, xmm12
842
+ psrld xmm12, 8
843
+ pslld xmm8, 24
844
+ pxor xmm12, xmm8
845
+ movdqa xmm8, xmm13
846
+ psrld xmm13, 8
847
+ pslld xmm8, 24
848
+ pxor xmm13, xmm8
849
+ movdqa xmm8, xmm14
850
+ psrld xmm14, 8
851
+ pslld xmm8, 24
852
+ pxor xmm14, xmm8
853
+ movdqa xmm8, xmm15
854
+ psrld xmm15, 8
855
+ pslld xmm8, 24
856
+ pxor xmm15, xmm8
857
+ movdqa xmm8, xmmword ptr [rsp+100H]
858
+ paddd xmm8, xmm12
859
+ paddd xmm9, xmm13
860
+ paddd xmm10, xmm14
861
+ paddd xmm11, xmm15
862
+ pxor xmm4, xmm8
863
+ pxor xmm5, xmm9
864
+ pxor xmm6, xmm10
865
+ pxor xmm7, xmm11
866
+ movdqa xmmword ptr [rsp+100H], xmm8
867
+ movdqa xmm8, xmm4
868
+ psrld xmm8, 7
869
+ pslld xmm4, 25
870
+ por xmm4, xmm8
871
+ movdqa xmm8, xmm5
872
+ psrld xmm8, 7
873
+ pslld xmm5, 25
874
+ por xmm5, xmm8
875
+ movdqa xmm8, xmm6
876
+ psrld xmm8, 7
877
+ pslld xmm6, 25
878
+ por xmm6, xmm8
879
+ movdqa xmm8, xmm7
880
+ psrld xmm8, 7
881
+ pslld xmm7, 25
882
+ por xmm7, xmm8
883
+ paddd xmm0, xmmword ptr [rsp+40H]
884
+ paddd xmm1, xmmword ptr [rsp+0B0H]
885
+ paddd xmm2, xmmword ptr [rsp+50H]
886
+ paddd xmm3, xmmword ptr [rsp+10H]
887
+ paddd xmm0, xmm5
888
+ paddd xmm1, xmm6
889
+ paddd xmm2, xmm7
890
+ paddd xmm3, xmm4
891
+ pxor xmm15, xmm0
892
+ pxor xmm12, xmm1
893
+ pxor xmm13, xmm2
894
+ pxor xmm14, xmm3
895
+ pshuflw xmm15, xmm15, 0B1H
896
+ pshufhw xmm15, xmm15, 0B1H
897
+ pshuflw xmm12, xmm12, 0B1H
898
+ pshufhw xmm12, xmm12, 0B1H
899
+ pshuflw xmm13, xmm13, 0B1H
900
+ pshufhw xmm13, xmm13, 0B1H
901
+ pshuflw xmm14, xmm14, 0B1H
902
+ pshufhw xmm14, xmm14, 0B1H
903
+ paddd xmm10, xmm15
904
+ paddd xmm11, xmm12
905
+ movdqa xmm8, xmmword ptr [rsp+100H]
906
+ paddd xmm8, xmm13
907
+ paddd xmm9, xmm14
908
+ pxor xmm5, xmm10
909
+ pxor xmm6, xmm11
910
+ pxor xmm7, xmm8
911
+ pxor xmm4, xmm9
912
+ movdqa xmmword ptr [rsp+100H], xmm8
913
+ movdqa xmm8, xmm5
914
+ psrld xmm8, 12
915
+ pslld xmm5, 20
916
+ por xmm5, xmm8
917
+ movdqa xmm8, xmm6
918
+ psrld xmm8, 12
919
+ pslld xmm6, 20
920
+ por xmm6, xmm8
921
+ movdqa xmm8, xmm7
922
+ psrld xmm8, 12
923
+ pslld xmm7, 20
924
+ por xmm7, xmm8
925
+ movdqa xmm8, xmm4
926
+ psrld xmm8, 12
927
+ pslld xmm4, 20
928
+ por xmm4, xmm8
929
+ paddd xmm0, xmmword ptr [rsp]
930
+ paddd xmm1, xmmword ptr [rsp+20H]
931
+ paddd xmm2, xmmword ptr [rsp+80H]
932
+ paddd xmm3, xmmword ptr [rsp+60H]
933
+ paddd xmm0, xmm5
934
+ paddd xmm1, xmm6
935
+ paddd xmm2, xmm7
936
+ paddd xmm3, xmm4
937
+ pxor xmm15, xmm0
938
+ pxor xmm12, xmm1
939
+ pxor xmm13, xmm2
940
+ pxor xmm14, xmm3
941
+ movdqa xmm8, xmm15
942
+ psrld xmm15, 8
943
+ pslld xmm8, 24
944
+ pxor xmm15, xmm8
945
+ movdqa xmm8, xmm12
946
+ psrld xmm12, 8
947
+ pslld xmm8, 24
948
+ pxor xmm12, xmm8
949
+ movdqa xmm8, xmm13
950
+ psrld xmm13, 8
951
+ pslld xmm8, 24
952
+ pxor xmm13, xmm8
953
+ movdqa xmm8, xmm14
954
+ psrld xmm14, 8
955
+ pslld xmm8, 24
956
+ pxor xmm14, xmm8
957
+ paddd xmm10, xmm15
958
+ paddd xmm11, xmm12
959
+ movdqa xmm8, xmmword ptr [rsp+100H]
960
+ paddd xmm8, xmm13
961
+ paddd xmm9, xmm14
962
+ pxor xmm5, xmm10
963
+ pxor xmm6, xmm11
964
+ pxor xmm7, xmm8
965
+ pxor xmm4, xmm9
966
+ movdqa xmmword ptr [rsp+100H], xmm8
967
+ movdqa xmm8, xmm5
968
+ psrld xmm8, 7
969
+ pslld xmm5, 25
970
+ por xmm5, xmm8
971
+ movdqa xmm8, xmm6
972
+ psrld xmm8, 7
973
+ pslld xmm6, 25
974
+ por xmm6, xmm8
975
+ movdqa xmm8, xmm7
976
+ psrld xmm8, 7
977
+ pslld xmm7, 25
978
+ por xmm7, xmm8
979
+ movdqa xmm8, xmm4
980
+ psrld xmm8, 7
981
+ pslld xmm4, 25
982
+ por xmm4, xmm8
983
+ paddd xmm0, xmmword ptr [rsp+0C0H]
984
+ paddd xmm1, xmmword ptr [rsp+90H]
985
+ paddd xmm2, xmmword ptr [rsp+0F0H]
986
+ paddd xmm3, xmmword ptr [rsp+0E0H]
987
+ paddd xmm0, xmm4
988
+ paddd xmm1, xmm5
989
+ paddd xmm2, xmm6
990
+ paddd xmm3, xmm7
991
+ pxor xmm12, xmm0
992
+ pxor xmm13, xmm1
993
+ pxor xmm14, xmm2
994
+ pxor xmm15, xmm3
995
+ pshuflw xmm12, xmm12, 0B1H
996
+ pshufhw xmm12, xmm12, 0B1H
997
+ pshuflw xmm13, xmm13, 0B1H
998
+ pshufhw xmm13, xmm13, 0B1H
999
+ pshuflw xmm14, xmm14, 0B1H
1000
+ pshufhw xmm14, xmm14, 0B1H
1001
+ pshuflw xmm15, xmm15, 0B1H
1002
+ pshufhw xmm15, xmm15, 0B1H
1003
+ movdqa xmm8, xmmword ptr [rsp+100H]
1004
+ paddd xmm8, xmm12
1005
+ paddd xmm9, xmm13
1006
+ paddd xmm10, xmm14
1007
+ paddd xmm11, xmm15
1008
+ pxor xmm4, xmm8
1009
+ pxor xmm5, xmm9
1010
+ pxor xmm6, xmm10
1011
+ pxor xmm7, xmm11
1012
+ movdqa xmmword ptr [rsp+100H], xmm8
1013
+ movdqa xmm8, xmm4
1014
+ psrld xmm8, 12
1015
+ pslld xmm4, 20
1016
+ por xmm4, xmm8
1017
+ movdqa xmm8, xmm5
1018
+ psrld xmm8, 12
1019
+ pslld xmm5, 20
1020
+ por xmm5, xmm8
1021
+ movdqa xmm8, xmm6
1022
+ psrld xmm8, 12
1023
+ pslld xmm6, 20
1024
+ por xmm6, xmm8
1025
+ movdqa xmm8, xmm7
1026
+ psrld xmm8, 12
1027
+ pslld xmm7, 20
1028
+ por xmm7, xmm8
1029
+ paddd xmm0, xmmword ptr [rsp+0D0H]
1030
+ paddd xmm1, xmmword ptr [rsp+0B0H]
1031
+ paddd xmm2, xmmword ptr [rsp+0A0H]
1032
+ paddd xmm3, xmmword ptr [rsp+80H]
1033
+ paddd xmm0, xmm4
1034
+ paddd xmm1, xmm5
1035
+ paddd xmm2, xmm6
1036
+ paddd xmm3, xmm7
1037
+ pxor xmm12, xmm0
1038
+ pxor xmm13, xmm1
1039
+ pxor xmm14, xmm2
1040
+ pxor xmm15, xmm3
1041
+ movdqa xmm8, xmm12
1042
+ psrld xmm12, 8
1043
+ pslld xmm8, 24
1044
+ pxor xmm12, xmm8
1045
+ movdqa xmm8, xmm13
1046
+ psrld xmm13, 8
1047
+ pslld xmm8, 24
1048
+ pxor xmm13, xmm8
1049
+ movdqa xmm8, xmm14
1050
+ psrld xmm14, 8
1051
+ pslld xmm8, 24
1052
+ pxor xmm14, xmm8
1053
+ movdqa xmm8, xmm15
1054
+ psrld xmm15, 8
1055
+ pslld xmm8, 24
1056
+ pxor xmm15, xmm8
1057
+ movdqa xmm8, xmmword ptr [rsp+100H]
1058
+ paddd xmm8, xmm12
1059
+ paddd xmm9, xmm13
1060
+ paddd xmm10, xmm14
1061
+ paddd xmm11, xmm15
1062
+ pxor xmm4, xmm8
1063
+ pxor xmm5, xmm9
1064
+ pxor xmm6, xmm10
1065
+ pxor xmm7, xmm11
1066
+ movdqa xmmword ptr [rsp+100H], xmm8
1067
+ movdqa xmm8, xmm4
1068
+ psrld xmm8, 7
1069
+ pslld xmm4, 25
1070
+ por xmm4, xmm8
1071
+ movdqa xmm8, xmm5
1072
+ psrld xmm8, 7
1073
+ pslld xmm5, 25
1074
+ por xmm5, xmm8
1075
+ movdqa xmm8, xmm6
1076
+ psrld xmm8, 7
1077
+ pslld xmm6, 25
1078
+ por xmm6, xmm8
1079
+ movdqa xmm8, xmm7
1080
+ psrld xmm8, 7
1081
+ pslld xmm7, 25
1082
+ por xmm7, xmm8
1083
+ paddd xmm0, xmmword ptr [rsp+70H]
1084
+ paddd xmm1, xmmword ptr [rsp+50H]
1085
+ paddd xmm2, xmmword ptr [rsp]
1086
+ paddd xmm3, xmmword ptr [rsp+60H]
1087
+ paddd xmm0, xmm5
1088
+ paddd xmm1, xmm6
1089
+ paddd xmm2, xmm7
1090
+ paddd xmm3, xmm4
1091
+ pxor xmm15, xmm0
1092
+ pxor xmm12, xmm1
1093
+ pxor xmm13, xmm2
1094
+ pxor xmm14, xmm3
1095
+ pshuflw xmm15, xmm15, 0B1H
1096
+ pshufhw xmm15, xmm15, 0B1H
1097
+ pshuflw xmm12, xmm12, 0B1H
1098
+ pshufhw xmm12, xmm12, 0B1H
1099
+ pshuflw xmm13, xmm13, 0B1H
1100
+ pshufhw xmm13, xmm13, 0B1H
1101
+ pshuflw xmm14, xmm14, 0B1H
1102
+ pshufhw xmm14, xmm14, 0B1H
1103
+ paddd xmm10, xmm15
1104
+ paddd xmm11, xmm12
1105
+ movdqa xmm8, xmmword ptr [rsp+100H]
1106
+ paddd xmm8, xmm13
1107
+ paddd xmm9, xmm14
1108
+ pxor xmm5, xmm10
1109
+ pxor xmm6, xmm11
1110
+ pxor xmm7, xmm8
1111
+ pxor xmm4, xmm9
1112
+ movdqa xmmword ptr [rsp+100H], xmm8
1113
+ movdqa xmm8, xmm5
1114
+ psrld xmm8, 12
1115
+ pslld xmm5, 20
1116
+ por xmm5, xmm8
1117
+ movdqa xmm8, xmm6
1118
+ psrld xmm8, 12
1119
+ pslld xmm6, 20
1120
+ por xmm6, xmm8
1121
+ movdqa xmm8, xmm7
1122
+ psrld xmm8, 12
1123
+ pslld xmm7, 20
1124
+ por xmm7, xmm8
1125
+ movdqa xmm8, xmm4
1126
+ psrld xmm8, 12
1127
+ pslld xmm4, 20
1128
+ por xmm4, xmm8
1129
+ paddd xmm0, xmmword ptr [rsp+20H]
1130
+ paddd xmm1, xmmword ptr [rsp+30H]
1131
+ paddd xmm2, xmmword ptr [rsp+10H]
1132
+ paddd xmm3, xmmword ptr [rsp+40H]
1133
+ paddd xmm0, xmm5
1134
+ paddd xmm1, xmm6
1135
+ paddd xmm2, xmm7
1136
+ paddd xmm3, xmm4
1137
+ pxor xmm15, xmm0
1138
+ pxor xmm12, xmm1
1139
+ pxor xmm13, xmm2
1140
+ pxor xmm14, xmm3
1141
+ movdqa xmm8, xmm15
1142
+ psrld xmm15, 8
1143
+ pslld xmm8, 24
1144
+ pxor xmm15, xmm8
1145
+ movdqa xmm8, xmm12
1146
+ psrld xmm12, 8
1147
+ pslld xmm8, 24
1148
+ pxor xmm12, xmm8
1149
+ movdqa xmm8, xmm13
1150
+ psrld xmm13, 8
1151
+ pslld xmm8, 24
1152
+ pxor xmm13, xmm8
1153
+ movdqa xmm8, xmm14
1154
+ psrld xmm14, 8
1155
+ pslld xmm8, 24
1156
+ pxor xmm14, xmm8
1157
+ paddd xmm10, xmm15
1158
+ paddd xmm11, xmm12
1159
+ movdqa xmm8, xmmword ptr [rsp+100H]
1160
+ paddd xmm8, xmm13
1161
+ paddd xmm9, xmm14
1162
+ pxor xmm5, xmm10
1163
+ pxor xmm6, xmm11
1164
+ pxor xmm7, xmm8
1165
+ pxor xmm4, xmm9
1166
+ movdqa xmmword ptr [rsp+100H], xmm8
1167
+ movdqa xmm8, xmm5
1168
+ psrld xmm8, 7
1169
+ pslld xmm5, 25
1170
+ por xmm5, xmm8
1171
+ movdqa xmm8, xmm6
1172
+ psrld xmm8, 7
1173
+ pslld xmm6, 25
1174
+ por xmm6, xmm8
1175
+ movdqa xmm8, xmm7
1176
+ psrld xmm8, 7
1177
+ pslld xmm7, 25
1178
+ por xmm7, xmm8
1179
+ movdqa xmm8, xmm4
1180
+ psrld xmm8, 7
1181
+ pslld xmm4, 25
1182
+ por xmm4, xmm8
1183
+ paddd xmm0, xmmword ptr [rsp+90H]
1184
+ paddd xmm1, xmmword ptr [rsp+0B0H]
1185
+ paddd xmm2, xmmword ptr [rsp+80H]
1186
+ paddd xmm3, xmmword ptr [rsp+0F0H]
1187
+ paddd xmm0, xmm4
1188
+ paddd xmm1, xmm5
1189
+ paddd xmm2, xmm6
1190
+ paddd xmm3, xmm7
1191
+ pxor xmm12, xmm0
1192
+ pxor xmm13, xmm1
1193
+ pxor xmm14, xmm2
1194
+ pxor xmm15, xmm3
1195
+ pshuflw xmm12, xmm12, 0B1H
1196
+ pshufhw xmm12, xmm12, 0B1H
1197
+ pshuflw xmm13, xmm13, 0B1H
1198
+ pshufhw xmm13, xmm13, 0B1H
1199
+ pshuflw xmm14, xmm14, 0B1H
1200
+ pshufhw xmm14, xmm14, 0B1H
1201
+ pshuflw xmm15, xmm15, 0B1H
1202
+ pshufhw xmm15, xmm15, 0B1H
1203
+ movdqa xmm8, xmmword ptr [rsp+100H]
1204
+ paddd xmm8, xmm12
1205
+ paddd xmm9, xmm13
1206
+ paddd xmm10, xmm14
1207
+ paddd xmm11, xmm15
1208
+ pxor xmm4, xmm8
1209
+ pxor xmm5, xmm9
1210
+ pxor xmm6, xmm10
1211
+ pxor xmm7, xmm11
1212
+ movdqa xmmword ptr [rsp+100H], xmm8
1213
+ movdqa xmm8, xmm4
1214
+ psrld xmm8, 12
1215
+ pslld xmm4, 20
1216
+ por xmm4, xmm8
1217
+ movdqa xmm8, xmm5
1218
+ psrld xmm8, 12
1219
+ pslld xmm5, 20
1220
+ por xmm5, xmm8
1221
+ movdqa xmm8, xmm6
1222
+ psrld xmm8, 12
1223
+ pslld xmm6, 20
1224
+ por xmm6, xmm8
1225
+ movdqa xmm8, xmm7
1226
+ psrld xmm8, 12
1227
+ pslld xmm7, 20
1228
+ por xmm7, xmm8
1229
+ paddd xmm0, xmmword ptr [rsp+0E0H]
1230
+ paddd xmm1, xmmword ptr [rsp+50H]
1231
+ paddd xmm2, xmmword ptr [rsp+0C0H]
1232
+ paddd xmm3, xmmword ptr [rsp+10H]
1233
+ paddd xmm0, xmm4
1234
+ paddd xmm1, xmm5
1235
+ paddd xmm2, xmm6
1236
+ paddd xmm3, xmm7
1237
+ pxor xmm12, xmm0
1238
+ pxor xmm13, xmm1
1239
+ pxor xmm14, xmm2
1240
+ pxor xmm15, xmm3
1241
+ movdqa xmm8, xmm12
1242
+ psrld xmm12, 8
1243
+ pslld xmm8, 24
1244
+ pxor xmm12, xmm8
1245
+ movdqa xmm8, xmm13
1246
+ psrld xmm13, 8
1247
+ pslld xmm8, 24
1248
+ pxor xmm13, xmm8
1249
+ movdqa xmm8, xmm14
1250
+ psrld xmm14, 8
1251
+ pslld xmm8, 24
1252
+ pxor xmm14, xmm8
1253
+ movdqa xmm8, xmm15
1254
+ psrld xmm15, 8
1255
+ pslld xmm8, 24
1256
+ pxor xmm15, xmm8
1257
+ movdqa xmm8, xmmword ptr [rsp+100H]
1258
+ paddd xmm8, xmm12
1259
+ paddd xmm9, xmm13
1260
+ paddd xmm10, xmm14
1261
+ paddd xmm11, xmm15
1262
+ pxor xmm4, xmm8
1263
+ pxor xmm5, xmm9
1264
+ pxor xmm6, xmm10
1265
+ pxor xmm7, xmm11
1266
+ movdqa xmmword ptr [rsp+100H], xmm8
1267
+ movdqa xmm8, xmm4
1268
+ psrld xmm8, 7
1269
+ pslld xmm4, 25
1270
+ por xmm4, xmm8
1271
+ movdqa xmm8, xmm5
1272
+ psrld xmm8, 7
1273
+ pslld xmm5, 25
1274
+ por xmm5, xmm8
1275
+ movdqa xmm8, xmm6
1276
+ psrld xmm8, 7
1277
+ pslld xmm6, 25
1278
+ por xmm6, xmm8
1279
+ movdqa xmm8, xmm7
1280
+ psrld xmm8, 7
1281
+ pslld xmm7, 25
1282
+ por xmm7, xmm8
1283
+ paddd xmm0, xmmword ptr [rsp+0D0H]
1284
+ paddd xmm1, xmmword ptr [rsp]
1285
+ paddd xmm2, xmmword ptr [rsp+20H]
1286
+ paddd xmm3, xmmword ptr [rsp+40H]
1287
+ paddd xmm0, xmm5
1288
+ paddd xmm1, xmm6
1289
+ paddd xmm2, xmm7
1290
+ paddd xmm3, xmm4
1291
+ pxor xmm15, xmm0
1292
+ pxor xmm12, xmm1
1293
+ pxor xmm13, xmm2
1294
+ pxor xmm14, xmm3
1295
+ pshuflw xmm15, xmm15, 0B1H
1296
+ pshufhw xmm15, xmm15, 0B1H
1297
+ pshuflw xmm12, xmm12, 0B1H
1298
+ pshufhw xmm12, xmm12, 0B1H
1299
+ pshuflw xmm13, xmm13, 0B1H
1300
+ pshufhw xmm13, xmm13, 0B1H
1301
+ pshuflw xmm14, xmm14, 0B1H
1302
+ pshufhw xmm14, xmm14, 0B1H
1303
+ paddd xmm10, xmm15
1304
+ paddd xmm11, xmm12
1305
+ movdqa xmm8, xmmword ptr [rsp+100H]
1306
+ paddd xmm8, xmm13
1307
+ paddd xmm9, xmm14
1308
+ pxor xmm5, xmm10
1309
+ pxor xmm6, xmm11
1310
+ pxor xmm7, xmm8
1311
+ pxor xmm4, xmm9
1312
+ movdqa xmmword ptr [rsp+100H], xmm8
1313
+ movdqa xmm8, xmm5
1314
+ psrld xmm8, 12
1315
+ pslld xmm5, 20
1316
+ por xmm5, xmm8
1317
+ movdqa xmm8, xmm6
1318
+ psrld xmm8, 12
1319
+ pslld xmm6, 20
1320
+ por xmm6, xmm8
1321
+ movdqa xmm8, xmm7
1322
+ psrld xmm8, 12
1323
+ pslld xmm7, 20
1324
+ por xmm7, xmm8
1325
+ movdqa xmm8, xmm4
1326
+ psrld xmm8, 12
1327
+ pslld xmm4, 20
1328
+ por xmm4, xmm8
1329
+ paddd xmm0, xmmword ptr [rsp+30H]
1330
+ paddd xmm1, xmmword ptr [rsp+0A0H]
1331
+ paddd xmm2, xmmword ptr [rsp+60H]
1332
+ paddd xmm3, xmmword ptr [rsp+70H]
1333
+ paddd xmm0, xmm5
1334
+ paddd xmm1, xmm6
1335
+ paddd xmm2, xmm7
1336
+ paddd xmm3, xmm4
1337
+ pxor xmm15, xmm0
1338
+ pxor xmm12, xmm1
1339
+ pxor xmm13, xmm2
1340
+ pxor xmm14, xmm3
1341
+ movdqa xmm8, xmm15
1342
+ psrld xmm15, 8
1343
+ pslld xmm8, 24
1344
+ pxor xmm15, xmm8
1345
+ movdqa xmm8, xmm12
1346
+ psrld xmm12, 8
1347
+ pslld xmm8, 24
1348
+ pxor xmm12, xmm8
1349
+ movdqa xmm8, xmm13
1350
+ psrld xmm13, 8
1351
+ pslld xmm8, 24
1352
+ pxor xmm13, xmm8
1353
+ movdqa xmm8, xmm14
1354
+ psrld xmm14, 8
1355
+ pslld xmm8, 24
1356
+ pxor xmm14, xmm8
1357
+ paddd xmm10, xmm15
1358
+ paddd xmm11, xmm12
1359
+ movdqa xmm8, xmmword ptr [rsp+100H]
1360
+ paddd xmm8, xmm13
1361
+ paddd xmm9, xmm14
1362
+ pxor xmm5, xmm10
1363
+ pxor xmm6, xmm11
1364
+ pxor xmm7, xmm8
1365
+ pxor xmm4, xmm9
1366
+ movdqa xmmword ptr [rsp+100H], xmm8
1367
+ movdqa xmm8, xmm5
1368
+ psrld xmm8, 7
1369
+ pslld xmm5, 25
1370
+ por xmm5, xmm8
1371
+ movdqa xmm8, xmm6
1372
+ psrld xmm8, 7
1373
+ pslld xmm6, 25
1374
+ por xmm6, xmm8
1375
+ movdqa xmm8, xmm7
1376
+ psrld xmm8, 7
1377
+ pslld xmm7, 25
1378
+ por xmm7, xmm8
1379
+ movdqa xmm8, xmm4
1380
+ psrld xmm8, 7
1381
+ pslld xmm4, 25
1382
+ por xmm4, xmm8
1383
+ paddd xmm0, xmmword ptr [rsp+0B0H]
1384
+ paddd xmm1, xmmword ptr [rsp+50H]
1385
+ paddd xmm2, xmmword ptr [rsp+10H]
1386
+ paddd xmm3, xmmword ptr [rsp+80H]
1387
+ paddd xmm0, xmm4
1388
+ paddd xmm1, xmm5
1389
+ paddd xmm2, xmm6
1390
+ paddd xmm3, xmm7
1391
+ pxor xmm12, xmm0
1392
+ pxor xmm13, xmm1
1393
+ pxor xmm14, xmm2
1394
+ pxor xmm15, xmm3
1395
+ pshuflw xmm12, xmm12, 0B1H
1396
+ pshufhw xmm12, xmm12, 0B1H
1397
+ pshuflw xmm13, xmm13, 0B1H
1398
+ pshufhw xmm13, xmm13, 0B1H
1399
+ pshuflw xmm14, xmm14, 0B1H
1400
+ pshufhw xmm14, xmm14, 0B1H
1401
+ pshuflw xmm15, xmm15, 0B1H
1402
+ pshufhw xmm15, xmm15, 0B1H
1403
+ movdqa xmm8, xmmword ptr [rsp+100H]
1404
+ paddd xmm8, xmm12
1405
+ paddd xmm9, xmm13
1406
+ paddd xmm10, xmm14
1407
+ paddd xmm11, xmm15
1408
+ pxor xmm4, xmm8
1409
+ pxor xmm5, xmm9
1410
+ pxor xmm6, xmm10
1411
+ pxor xmm7, xmm11
1412
+ movdqa xmmword ptr [rsp+100H], xmm8
1413
+ movdqa xmm8, xmm4
1414
+ psrld xmm8, 12
1415
+ pslld xmm4, 20
1416
+ por xmm4, xmm8
1417
+ movdqa xmm8, xmm5
1418
+ psrld xmm8, 12
1419
+ pslld xmm5, 20
1420
+ por xmm5, xmm8
1421
+ movdqa xmm8, xmm6
1422
+ psrld xmm8, 12
1423
+ pslld xmm6, 20
1424
+ por xmm6, xmm8
1425
+ movdqa xmm8, xmm7
1426
+ psrld xmm8, 12
1427
+ pslld xmm7, 20
1428
+ por xmm7, xmm8
1429
+ paddd xmm0, xmmword ptr [rsp+0F0H]
1430
+ paddd xmm1, xmmword ptr [rsp]
1431
+ paddd xmm2, xmmword ptr [rsp+90H]
1432
+ paddd xmm3, xmmword ptr [rsp+60H]
1433
+ paddd xmm0, xmm4
1434
+ paddd xmm1, xmm5
1435
+ paddd xmm2, xmm6
1436
+ paddd xmm3, xmm7
1437
+ pxor xmm12, xmm0
1438
+ pxor xmm13, xmm1
1439
+ pxor xmm14, xmm2
1440
+ pxor xmm15, xmm3
1441
+ movdqa xmm8, xmm12
1442
+ psrld xmm12, 8
1443
+ pslld xmm8, 24
1444
+ pxor xmm12, xmm8
1445
+ movdqa xmm8, xmm13
1446
+ psrld xmm13, 8
1447
+ pslld xmm8, 24
1448
+ pxor xmm13, xmm8
1449
+ movdqa xmm8, xmm14
1450
+ psrld xmm14, 8
1451
+ pslld xmm8, 24
1452
+ pxor xmm14, xmm8
1453
+ movdqa xmm8, xmm15
1454
+ psrld xmm15, 8
1455
+ pslld xmm8, 24
1456
+ pxor xmm15, xmm8
1457
+ movdqa xmm8, xmmword ptr [rsp+100H]
1458
+ paddd xmm8, xmm12
1459
+ paddd xmm9, xmm13
1460
+ paddd xmm10, xmm14
1461
+ paddd xmm11, xmm15
1462
+ pxor xmm4, xmm8
1463
+ pxor xmm5, xmm9
1464
+ pxor xmm6, xmm10
1465
+ pxor xmm7, xmm11
1466
+ movdqa xmmword ptr [rsp+100H], xmm8
1467
+ movdqa xmm8, xmm4
1468
+ psrld xmm8, 7
1469
+ pslld xmm4, 25
1470
+ por xmm4, xmm8
1471
+ movdqa xmm8, xmm5
1472
+ psrld xmm8, 7
1473
+ pslld xmm5, 25
1474
+ por xmm5, xmm8
1475
+ movdqa xmm8, xmm6
1476
+ psrld xmm8, 7
1477
+ pslld xmm6, 25
1478
+ por xmm6, xmm8
1479
+ movdqa xmm8, xmm7
1480
+ psrld xmm8, 7
1481
+ pslld xmm7, 25
1482
+ por xmm7, xmm8
1483
+ paddd xmm0, xmmword ptr [rsp+0E0H]
1484
+ paddd xmm1, xmmword ptr [rsp+20H]
1485
+ paddd xmm2, xmmword ptr [rsp+30H]
1486
+ paddd xmm3, xmmword ptr [rsp+70H]
1487
+ paddd xmm0, xmm5
1488
+ paddd xmm1, xmm6
1489
+ paddd xmm2, xmm7
1490
+ paddd xmm3, xmm4
1491
+ pxor xmm15, xmm0
1492
+ pxor xmm12, xmm1
1493
+ pxor xmm13, xmm2
1494
+ pxor xmm14, xmm3
1495
+ pshuflw xmm15, xmm15, 0B1H
1496
+ pshufhw xmm15, xmm15, 0B1H
1497
+ pshuflw xmm12, xmm12, 0B1H
1498
+ pshufhw xmm12, xmm12, 0B1H
1499
+ pshuflw xmm13, xmm13, 0B1H
1500
+ pshufhw xmm13, xmm13, 0B1H
1501
+ pshuflw xmm14, xmm14, 0B1H
1502
+ pshufhw xmm14, xmm14, 0B1H
1503
+ paddd xmm10, xmm15
1504
+ paddd xmm11, xmm12
1505
+ movdqa xmm8, xmmword ptr [rsp+100H]
1506
+ paddd xmm8, xmm13
1507
+ paddd xmm9, xmm14
1508
+ pxor xmm5, xmm10
1509
+ pxor xmm6, xmm11
1510
+ pxor xmm7, xmm8
1511
+ pxor xmm4, xmm9
1512
+ movdqa xmmword ptr [rsp+100H], xmm8
1513
+ movdqa xmm8, xmm5
1514
+ psrld xmm8, 12
1515
+ pslld xmm5, 20
1516
+ por xmm5, xmm8
1517
+ movdqa xmm8, xmm6
1518
+ psrld xmm8, 12
1519
+ pslld xmm6, 20
1520
+ por xmm6, xmm8
1521
+ movdqa xmm8, xmm7
1522
+ psrld xmm8, 12
1523
+ pslld xmm7, 20
1524
+ por xmm7, xmm8
1525
+ movdqa xmm8, xmm4
1526
+ psrld xmm8, 12
1527
+ pslld xmm4, 20
1528
+ por xmm4, xmm8
1529
+ paddd xmm0, xmmword ptr [rsp+0A0H]
1530
+ paddd xmm1, xmmword ptr [rsp+0C0H]
1531
+ paddd xmm2, xmmword ptr [rsp+40H]
1532
+ paddd xmm3, xmmword ptr [rsp+0D0H]
1533
+ paddd xmm0, xmm5
1534
+ paddd xmm1, xmm6
1535
+ paddd xmm2, xmm7
1536
+ paddd xmm3, xmm4
1537
+ pxor xmm15, xmm0
1538
+ pxor xmm12, xmm1
1539
+ pxor xmm13, xmm2
1540
+ pxor xmm14, xmm3
1541
+ movdqa xmm8, xmm15
1542
+ psrld xmm15, 8
1543
+ pslld xmm8, 24
1544
+ pxor xmm15, xmm8
1545
+ movdqa xmm8, xmm12
1546
+ psrld xmm12, 8
1547
+ pslld xmm8, 24
1548
+ pxor xmm12, xmm8
1549
+ movdqa xmm8, xmm13
1550
+ psrld xmm13, 8
1551
+ pslld xmm8, 24
1552
+ pxor xmm13, xmm8
1553
+ movdqa xmm8, xmm14
1554
+ psrld xmm14, 8
1555
+ pslld xmm8, 24
1556
+ pxor xmm14, xmm8
1557
+ paddd xmm10, xmm15
1558
+ paddd xmm11, xmm12
1559
+ movdqa xmm8, xmmword ptr [rsp+100H]
1560
+ paddd xmm8, xmm13
1561
+ paddd xmm9, xmm14
1562
+ pxor xmm5, xmm10
1563
+ pxor xmm6, xmm11
1564
+ pxor xmm7, xmm8
1565
+ pxor xmm4, xmm9
1566
+ pxor xmm0, xmm8
1567
+ pxor xmm1, xmm9
1568
+ pxor xmm2, xmm10
1569
+ pxor xmm3, xmm11
1570
+ movdqa xmm8, xmm5
1571
+ psrld xmm8, 7
1572
+ pslld xmm5, 25
1573
+ por xmm5, xmm8
1574
+ movdqa xmm8, xmm6
1575
+ psrld xmm8, 7
1576
+ pslld xmm6, 25
1577
+ por xmm6, xmm8
1578
+ movdqa xmm8, xmm7
1579
+ psrld xmm8, 7
1580
+ pslld xmm7, 25
1581
+ por xmm7, xmm8
1582
+ movdqa xmm8, xmm4
1583
+ psrld xmm8, 7
1584
+ pslld xmm4, 25
1585
+ por xmm4, xmm8
1586
+ pxor xmm4, xmm12
1587
+ pxor xmm5, xmm13
1588
+ pxor xmm6, xmm14
1589
+ pxor xmm7, xmm15
1590
+ mov eax, r13d
1591
+ jne innerloop4
1592
+ movdqa xmm9, xmm0
1593
+ punpckldq xmm0, xmm1
1594
+ punpckhdq xmm9, xmm1
1595
+ movdqa xmm11, xmm2
1596
+ punpckldq xmm2, xmm3
1597
+ punpckhdq xmm11, xmm3
1598
+ movdqa xmm1, xmm0
1599
+ punpcklqdq xmm0, xmm2
1600
+ punpckhqdq xmm1, xmm2
1601
+ movdqa xmm3, xmm9
1602
+ punpcklqdq xmm9, xmm11
1603
+ punpckhqdq xmm3, xmm11
1604
+ movdqu xmmword ptr [rbx], xmm0
1605
+ movdqu xmmword ptr [rbx+20H], xmm1
1606
+ movdqu xmmword ptr [rbx+40H], xmm9
1607
+ movdqu xmmword ptr [rbx+60H], xmm3
1608
+ movdqa xmm9, xmm4
1609
+ punpckldq xmm4, xmm5
1610
+ punpckhdq xmm9, xmm5
1611
+ movdqa xmm11, xmm6
1612
+ punpckldq xmm6, xmm7
1613
+ punpckhdq xmm11, xmm7
1614
+ movdqa xmm5, xmm4
1615
+ punpcklqdq xmm4, xmm6
1616
+ punpckhqdq xmm5, xmm6
1617
+ movdqa xmm7, xmm9
1618
+ punpcklqdq xmm9, xmm11
1619
+ punpckhqdq xmm7, xmm11
1620
+ movdqu xmmword ptr [rbx+10H], xmm4
1621
+ movdqu xmmword ptr [rbx+30H], xmm5
1622
+ movdqu xmmword ptr [rbx+50H], xmm9
1623
+ movdqu xmmword ptr [rbx+70H], xmm7
1624
+ movdqa xmm1, xmmword ptr [rsp+110H]
1625
+ movdqa xmm0, xmm1
1626
+ paddd xmm1, xmmword ptr [rsp+150H]
1627
+ movdqa xmmword ptr [rsp+110H], xmm1
1628
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK]
1629
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK]
1630
+ pcmpgtd xmm0, xmm1
1631
+ movdqa xmm1, xmmword ptr [rsp+120H]
1632
+ psubd xmm1, xmm0
1633
+ movdqa xmmword ptr [rsp+120H], xmm1
1634
+ add rbx, 128
1635
+ add rdi, 32
1636
+ sub rsi, 4
1637
+ cmp rsi, 4
1638
+ jnc outerloop4
1639
+ test rsi, rsi
1640
+ jne final3blocks
1641
+ unwind:
1642
+ movdqa xmm6, xmmword ptr [rsp+170H]
1643
+ movdqa xmm7, xmmword ptr [rsp+180H]
1644
+ movdqa xmm8, xmmword ptr [rsp+190H]
1645
+ movdqa xmm9, xmmword ptr [rsp+1A0H]
1646
+ movdqa xmm10, xmmword ptr [rsp+1B0H]
1647
+ movdqa xmm11, xmmword ptr [rsp+1C0H]
1648
+ movdqa xmm12, xmmword ptr [rsp+1D0H]
1649
+ movdqa xmm13, xmmword ptr [rsp+1E0H]
1650
+ movdqa xmm14, xmmword ptr [rsp+1F0H]
1651
+ movdqa xmm15, xmmword ptr [rsp+200H]
1652
+ mov rsp, rbp
1653
+ pop rbp
1654
+ pop rbx
1655
+ pop rdi
1656
+ pop rsi
1657
+ pop r12
1658
+ pop r13
1659
+ pop r14
1660
+ pop r15
1661
+ ret
1662
+ ALIGN 16
1663
+ final3blocks:
1664
+ test esi, 2H
1665
+ je final1block
1666
+ movups xmm0, xmmword ptr [rcx]
1667
+ movups xmm1, xmmword ptr [rcx+10H]
1668
+ movaps xmm8, xmm0
1669
+ movaps xmm9, xmm1
1670
+ movd xmm13, dword ptr [rsp+110H]
1671
+ movd xmm14, dword ptr [rsp+120H]
1672
+ punpckldq xmm13, xmm14
1673
+ movaps xmmword ptr [rsp], xmm13
1674
+ movd xmm14, dword ptr [rsp+114H]
1675
+ movd xmm13, dword ptr [rsp+124H]
1676
+ punpckldq xmm14, xmm13
1677
+ movaps xmmword ptr [rsp+10H], xmm14
1678
+ mov r8, qword ptr [rdi]
1679
+ mov r9, qword ptr [rdi+8H]
1680
+ movzx eax, byte ptr [rbp+80H]
1681
+ or eax, r13d
1682
+ xor edx, edx
1683
+ innerloop2:
1684
+ mov r14d, eax
1685
+ or eax, r12d
1686
+ add rdx, 64
1687
+ cmp rdx, r15
1688
+ cmovne eax, r14d
1689
+ movaps xmm2, xmmword ptr [BLAKE3_IV]
1690
+ movaps xmm10, xmm2
1691
+ movups xmm4, xmmword ptr [r8+rdx-40H]
1692
+ movups xmm5, xmmword ptr [r8+rdx-30H]
1693
+ movaps xmm3, xmm4
1694
+ shufps xmm4, xmm5, 136
1695
+ shufps xmm3, xmm5, 221
1696
+ movaps xmm5, xmm3
1697
+ movups xmm6, xmmword ptr [r8+rdx-20H]
1698
+ movups xmm7, xmmword ptr [r8+rdx-10H]
1699
+ movaps xmm3, xmm6
1700
+ shufps xmm6, xmm7, 136
1701
+ pshufd xmm6, xmm6, 93H
1702
+ shufps xmm3, xmm7, 221
1703
+ pshufd xmm7, xmm3, 93H
1704
+ movups xmm12, xmmword ptr [r9+rdx-40H]
1705
+ movups xmm13, xmmword ptr [r9+rdx-30H]
1706
+ movaps xmm11, xmm12
1707
+ shufps xmm12, xmm13, 136
1708
+ shufps xmm11, xmm13, 221
1709
+ movaps xmm13, xmm11
1710
+ movups xmm14, xmmword ptr [r9+rdx-20H]
1711
+ movups xmm15, xmmword ptr [r9+rdx-10H]
1712
+ movaps xmm11, xmm14
1713
+ shufps xmm14, xmm15, 136
1714
+ pshufd xmm14, xmm14, 93H
1715
+ shufps xmm11, xmm15, 221
1716
+ pshufd xmm15, xmm11, 93H
1717
+ shl rax, 20H
1718
+ or rax, 40H
1719
+ movd xmm3, rax
1720
+ movdqa xmmword ptr [rsp+20H], xmm3
1721
+ movaps xmm3, xmmword ptr [rsp]
1722
+ movaps xmm11, xmmword ptr [rsp+10H]
1723
+ punpcklqdq xmm3, xmmword ptr [rsp+20H]
1724
+ punpcklqdq xmm11, xmmword ptr [rsp+20H]
1725
+ mov al, 7
1726
+ roundloop2:
1727
+ paddd xmm0, xmm4
1728
+ paddd xmm8, xmm12
1729
+ movaps xmmword ptr [rsp+20H], xmm4
1730
+ movaps xmmword ptr [rsp+30H], xmm12
1731
+ paddd xmm0, xmm1
1732
+ paddd xmm8, xmm9
1733
+ pxor xmm3, xmm0
1734
+ pxor xmm11, xmm8
1735
+ pshuflw xmm3, xmm3, 0B1H
1736
+ pshufhw xmm3, xmm3, 0B1H
1737
+ pshuflw xmm11, xmm11, 0B1H
1738
+ pshufhw xmm11, xmm11, 0B1H
1739
+ paddd xmm2, xmm3
1740
+ paddd xmm10, xmm11
1741
+ pxor xmm1, xmm2
1742
+ pxor xmm9, xmm10
1743
+ movdqa xmm4, xmm1
1744
+ pslld xmm1, 20
1745
+ psrld xmm4, 12
1746
+ por xmm1, xmm4
1747
+ movdqa xmm4, xmm9
1748
+ pslld xmm9, 20
1749
+ psrld xmm4, 12
1750
+ por xmm9, xmm4
1751
+ paddd xmm0, xmm5
1752
+ paddd xmm8, xmm13
1753
+ movaps xmmword ptr [rsp+40H], xmm5
1754
+ movaps xmmword ptr [rsp+50H], xmm13
1755
+ paddd xmm0, xmm1
1756
+ paddd xmm8, xmm9
1757
+ pxor xmm3, xmm0
1758
+ pxor xmm11, xmm8
1759
+ movdqa xmm13, xmm3
1760
+ psrld xmm3, 8
1761
+ pslld xmm13, 24
1762
+ pxor xmm3, xmm13
1763
+ movdqa xmm13, xmm11
1764
+ psrld xmm11, 8
1765
+ pslld xmm13, 24
1766
+ pxor xmm11, xmm13
1767
+ paddd xmm2, xmm3
1768
+ paddd xmm10, xmm11
1769
+ pxor xmm1, xmm2
1770
+ pxor xmm9, xmm10
1771
+ movdqa xmm4, xmm1
1772
+ pslld xmm1, 25
1773
+ psrld xmm4, 7
1774
+ por xmm1, xmm4
1775
+ movdqa xmm4, xmm9
1776
+ pslld xmm9, 25
1777
+ psrld xmm4, 7
1778
+ por xmm9, xmm4
1779
+ pshufd xmm0, xmm0, 93H
1780
+ pshufd xmm8, xmm8, 93H
1781
+ pshufd xmm3, xmm3, 4EH
1782
+ pshufd xmm11, xmm11, 4EH
1783
+ pshufd xmm2, xmm2, 39H
1784
+ pshufd xmm10, xmm10, 39H
1785
+ paddd xmm0, xmm6
1786
+ paddd xmm8, xmm14
1787
+ paddd xmm0, xmm1
1788
+ paddd xmm8, xmm9
1789
+ pxor xmm3, xmm0
1790
+ pxor xmm11, xmm8
1791
+ pshuflw xmm3, xmm3, 0B1H
1792
+ pshufhw xmm3, xmm3, 0B1H
1793
+ pshuflw xmm11, xmm11, 0B1H
1794
+ pshufhw xmm11, xmm11, 0B1H
1795
+ paddd xmm2, xmm3
1796
+ paddd xmm10, xmm11
1797
+ pxor xmm1, xmm2
1798
+ pxor xmm9, xmm10
1799
+ movdqa xmm4, xmm1
1800
+ pslld xmm1, 20
1801
+ psrld xmm4, 12
1802
+ por xmm1, xmm4
1803
+ movdqa xmm4, xmm9
1804
+ pslld xmm9, 20
1805
+ psrld xmm4, 12
1806
+ por xmm9, xmm4
1807
+ paddd xmm0, xmm7
1808
+ paddd xmm8, xmm15
1809
+ paddd xmm0, xmm1
1810
+ paddd xmm8, xmm9
1811
+ pxor xmm3, xmm0
1812
+ pxor xmm11, xmm8
1813
+ movdqa xmm13, xmm3
1814
+ psrld xmm3, 8
1815
+ pslld xmm13, 24
1816
+ pxor xmm3, xmm13
1817
+ movdqa xmm13, xmm11
1818
+ psrld xmm11, 8
1819
+ pslld xmm13, 24
1820
+ pxor xmm11, xmm13
1821
+ paddd xmm2, xmm3
1822
+ paddd xmm10, xmm11
1823
+ pxor xmm1, xmm2
1824
+ pxor xmm9, xmm10
1825
+ movdqa xmm4, xmm1
1826
+ pslld xmm1, 25
1827
+ psrld xmm4, 7
1828
+ por xmm1, xmm4
1829
+ movdqa xmm4, xmm9
1830
+ pslld xmm9, 25
1831
+ psrld xmm4, 7
1832
+ por xmm9, xmm4
1833
+ pshufd xmm0, xmm0, 39H
1834
+ pshufd xmm8, xmm8, 39H
1835
+ pshufd xmm3, xmm3, 4EH
1836
+ pshufd xmm11, xmm11, 4EH
1837
+ pshufd xmm2, xmm2, 93H
1838
+ pshufd xmm10, xmm10, 93H
1839
+ dec al
1840
+ je endroundloop2
1841
+ movdqa xmm12, xmmword ptr [rsp+20H]
1842
+ movdqa xmm5, xmmword ptr [rsp+40H]
1843
+ pshufd xmm13, xmm12, 0FH
1844
+ shufps xmm12, xmm5, 214
1845
+ pshufd xmm4, xmm12, 39H
1846
+ movdqa xmm12, xmm6
1847
+ shufps xmm12, xmm7, 250
1848
+ pand xmm13, xmmword ptr [PBLENDW_0x33_MASK]
1849
+ pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK]
1850
+ por xmm13, xmm12
1851
+ movdqa xmmword ptr [rsp+20H], xmm13
1852
+ movdqa xmm12, xmm7
1853
+ punpcklqdq xmm12, xmm5
1854
+ movdqa xmm13, xmm6
1855
+ pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK]
1856
+ pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK]
1857
+ por xmm12, xmm13
1858
+ pshufd xmm12, xmm12, 78H
1859
+ punpckhdq xmm5, xmm7
1860
+ punpckldq xmm6, xmm5
1861
+ pshufd xmm7, xmm6, 1EH
1862
+ movdqa xmmword ptr [rsp+40H], xmm12
1863
+ movdqa xmm5, xmmword ptr [rsp+30H]
1864
+ movdqa xmm13, xmmword ptr [rsp+50H]
1865
+ pshufd xmm6, xmm5, 0FH
1866
+ shufps xmm5, xmm13, 214
1867
+ pshufd xmm12, xmm5, 39H
1868
+ movdqa xmm5, xmm14
1869
+ shufps xmm5, xmm15, 250
1870
+ pand xmm6, xmmword ptr [PBLENDW_0x33_MASK]
1871
+ pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK]
1872
+ por xmm6, xmm5
1873
+ movdqa xmm5, xmm15
1874
+ punpcklqdq xmm5, xmm13
1875
+ movdqa xmmword ptr [rsp+30H], xmm2
1876
+ movdqa xmm2, xmm14
1877
+ pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK]
1878
+ pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK]
1879
+ por xmm5, xmm2
1880
+ movdqa xmm2, xmmword ptr [rsp+30H]
1881
+ pshufd xmm5, xmm5, 78H
1882
+ punpckhdq xmm13, xmm15
1883
+ punpckldq xmm14, xmm13
1884
+ pshufd xmm15, xmm14, 1EH
1885
+ movdqa xmm13, xmm6
1886
+ movdqa xmm14, xmm5
1887
+ movdqa xmm5, xmmword ptr [rsp+20H]
1888
+ movdqa xmm6, xmmword ptr [rsp+40H]
1889
+ jmp roundloop2
1890
+ endroundloop2:
1891
+ pxor xmm0, xmm2
1892
+ pxor xmm1, xmm3
1893
+ pxor xmm8, xmm10
1894
+ pxor xmm9, xmm11
1895
+ mov eax, r13d
1896
+ cmp rdx, r15
1897
+ jne innerloop2
1898
+ movups xmmword ptr [rbx], xmm0
1899
+ movups xmmword ptr [rbx+10H], xmm1
1900
+ movups xmmword ptr [rbx+20H], xmm8
1901
+ movups xmmword ptr [rbx+30H], xmm9
1902
+ mov eax, dword ptr [rsp+130H]
1903
+ neg eax
1904
+ mov r10d, dword ptr [rsp+110H+8*rax]
1905
+ mov r11d, dword ptr [rsp+120H+8*rax]
1906
+ mov dword ptr [rsp+110H], r10d
1907
+ mov dword ptr [rsp+120H], r11d
1908
+ add rdi, 16
1909
+ add rbx, 64
1910
+ sub rsi, 2
1911
+ final1block:
1912
+ test esi, 1H
1913
+ je unwind
1914
+ movups xmm0, xmmword ptr [rcx]
1915
+ movups xmm1, xmmword ptr [rcx+10H]
1916
+ movd xmm13, dword ptr [rsp+110H]
1917
+ movd xmm14, dword ptr [rsp+120H]
1918
+ punpckldq xmm13, xmm14
1919
+ mov r8, qword ptr [rdi]
1920
+ movzx eax, byte ptr [rbp+80H]
1921
+ or eax, r13d
1922
+ xor edx, edx
1923
+ innerloop1:
1924
+ mov r14d, eax
1925
+ or eax, r12d
1926
+ add rdx, 64
1927
+ cmp rdx, r15
1928
+ cmovne eax, r14d
1929
+ movaps xmm2, xmmword ptr [BLAKE3_IV]
1930
+ shl rax, 32
1931
+ or rax, 64
1932
+ movd xmm12, rax
1933
+ movdqa xmm3, xmm13
1934
+ punpcklqdq xmm3, xmm12
1935
+ movups xmm4, xmmword ptr [r8+rdx-40H]
1936
+ movups xmm5, xmmword ptr [r8+rdx-30H]
1937
+ movaps xmm8, xmm4
1938
+ shufps xmm4, xmm5, 136
1939
+ shufps xmm8, xmm5, 221
1940
+ movaps xmm5, xmm8
1941
+ movups xmm6, xmmword ptr [r8+rdx-20H]
1942
+ movups xmm7, xmmword ptr [r8+rdx-10H]
1943
+ movaps xmm8, xmm6
1944
+ shufps xmm6, xmm7, 136
1945
+ pshufd xmm6, xmm6, 93H
1946
+ shufps xmm8, xmm7, 221
1947
+ pshufd xmm7, xmm8, 93H
1948
+ mov al, 7
1949
+ roundloop1:
1950
+ paddd xmm0, xmm4
1951
+ paddd xmm0, xmm1
1952
+ pxor xmm3, xmm0
1953
+ pshuflw xmm3, xmm3, 0B1H
1954
+ pshufhw xmm3, xmm3, 0B1H
1955
+ paddd xmm2, xmm3
1956
+ pxor xmm1, xmm2
1957
+ movdqa xmm11, xmm1
1958
+ pslld xmm1, 20
1959
+ psrld xmm11, 12
1960
+ por xmm1, xmm11
1961
+ paddd xmm0, xmm5
1962
+ paddd xmm0, xmm1
1963
+ pxor xmm3, xmm0
1964
+ movdqa xmm14, xmm3
1965
+ psrld xmm3, 8
1966
+ pslld xmm14, 24
1967
+ pxor xmm3, xmm14
1968
+ paddd xmm2, xmm3
1969
+ pxor xmm1, xmm2
1970
+ movdqa xmm11, xmm1
1971
+ pslld xmm1, 25
1972
+ psrld xmm11, 7
1973
+ por xmm1, xmm11
1974
+ pshufd xmm0, xmm0, 93H
1975
+ pshufd xmm3, xmm3, 4EH
1976
+ pshufd xmm2, xmm2, 39H
1977
+ paddd xmm0, xmm6
1978
+ paddd xmm0, xmm1
1979
+ pxor xmm3, xmm0
1980
+ pshuflw xmm3, xmm3, 0B1H
1981
+ pshufhw xmm3, xmm3, 0B1H
1982
+ paddd xmm2, xmm3
1983
+ pxor xmm1, xmm2
1984
+ movdqa xmm11, xmm1
1985
+ pslld xmm1, 20
1986
+ psrld xmm11, 12
1987
+ por xmm1, xmm11
1988
+ paddd xmm0, xmm7
1989
+ paddd xmm0, xmm1
1990
+ pxor xmm3, xmm0
1991
+ movdqa xmm14, xmm3
1992
+ psrld xmm3, 8
1993
+ pslld xmm14, 24
1994
+ pxor xmm3, xmm14
1995
+ paddd xmm2, xmm3
1996
+ pxor xmm1, xmm2
1997
+ movdqa xmm11, xmm1
1998
+ pslld xmm1, 25
1999
+ psrld xmm11, 7
2000
+ por xmm1, xmm11
2001
+ pshufd xmm0, xmm0, 39H
2002
+ pshufd xmm3, xmm3, 4EH
2003
+ pshufd xmm2, xmm2, 93H
2004
+ dec al
2005
+ jz endroundloop1
2006
+ movdqa xmm8, xmm4
2007
+ shufps xmm8, xmm5, 214
2008
+ pshufd xmm9, xmm4, 0FH
2009
+ pshufd xmm4, xmm8, 39H
2010
+ movdqa xmm8, xmm6
2011
+ shufps xmm8, xmm7, 250
2012
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK]
2013
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
2014
+ por xmm9, xmm8
2015
+ movdqa xmm8, xmm7
2016
+ punpcklqdq xmm8, xmm5
2017
+ movdqa xmm10, xmm6
2018
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK]
2019
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
2020
+ por xmm8, xmm10
2021
+ pshufd xmm8, xmm8, 78H
2022
+ punpckhdq xmm5, xmm7
2023
+ punpckldq xmm6, xmm5
2024
+ pshufd xmm7, xmm6, 1EH
2025
+ movdqa xmm5, xmm9
2026
+ movdqa xmm6, xmm8
2027
+ jmp roundloop1
2028
+ endroundloop1:
2029
+ pxor xmm0, xmm2
2030
+ pxor xmm1, xmm3
2031
+ mov eax, r13d
2032
+ cmp rdx, r15
2033
+ jne innerloop1
2034
+ movups xmmword ptr [rbx], xmm0
2035
+ movups xmmword ptr [rbx+10H], xmm1
2036
+ jmp unwind
2037
+ _blake3_hash_many_sse2 ENDP
2038
+ blake3_hash_many_sse2 ENDP
2039
+
2040
+ ; ---------------------------------------------------------------------------
+ ; blake3_compress_in_place_sse2 / _blake3_compress_in_place_sse2
+ ;
+ ; Runs 7 rounds over one 64-byte message block and writes the 32-byte result
+ ; back over the input state.
+ ;
+ ; Inputs, as read by the code below (Microsoft x64 ABI):
+ ;   rcx           pointer to 32 bytes of state; read, then overwritten
+ ;   rdx           pointer to 64 bytes of message; read with unaligned loads
+ ;   r8b           byte placed in dword 2 of state row 3 (presumably block_len)
+ ;   r9            64-bit value placed in dwords 0-1 of row 3 (presumably counter)
+ ;   5th argument  byte on the stack, placed in dword 3 of row 3 (presumably flags)
+ ;
+ ; Clobbers: rax, r8, flags, xmm0-xmm5.
+ ; Non-volatile XMM registers written here are xmm6-xmm11 and xmm14; each one
+ ; is spilled in the prologue and reloaded before returning. xmm10 is written by
+ ; the message shuffle, so it must have a save slot. xmm12, xmm13 and xmm15 are
+ ; never written and therefore need none.
+ ; ---------------------------------------------------------------------------
+ blake3_compress_in_place_sse2 PROC
2041
+ _blake3_compress_in_place_sse2 PROC
2042
+ sub rsp, 120 ; 7 save slots of 16 bytes, plus 8 so the slots are 16-byte aligned for movdqa
2043
+ movdqa xmmword ptr [rsp], xmm6
2044
+ movdqa xmmword ptr [rsp+10H], xmm7
2045
+ movdqa xmmword ptr [rsp+20H], xmm8
2046
+ movdqa xmmword ptr [rsp+30H], xmm9
2047
+ movdqa xmmword ptr [rsp+40H], xmm11
2048
+ movdqa xmmword ptr [rsp+50H], xmm14
2049
+ movdqa xmmword ptr [rsp+60H], xmm10 ; callee-saved and written by the message shuffle below
2050
+ movups xmm0, xmmword ptr [rcx] ; state row 0
2051
+ movups xmm1, xmmword ptr [rcx+10H] ; state row 1
2052
+ movaps xmm2, xmmword ptr [BLAKE3_IV] ; state row 2
2053
+ movzx eax, byte ptr [rsp+0A0H] ; 5th argument: 78H of frame + 28H above the entry rsp
2054
+ movzx r8d, r8b
2055
+ shl rax, 32
2056
+ add r8, rax ; r8 = (stack byte << 32) | r8b
2057
+ movq xmm3, r9
2058
+ movq xmm4, r8
2059
+ punpcklqdq xmm3, xmm4 ; state row 3 = { r9 low, r9 high, r8b, stack byte }
2060
+ movups xmm4, xmmword ptr [rdx] ; message dwords 0-3
2061
+ movups xmm5, xmmword ptr [rdx+10H] ; message dwords 4-7
2062
+ movaps xmm8, xmm4
2063
+ shufps xmm4, xmm5, 136 ; xmm4 = even-indexed dwords of the first 32 bytes
2064
+ shufps xmm8, xmm5, 221 ; xmm8 = odd-indexed dwords of the first 32 bytes
2065
+ movaps xmm5, xmm8
2066
+ movups xmm6, xmmword ptr [rdx+20H] ; message dwords 8-11
2067
+ movups xmm7, xmmword ptr [rdx+30H] ; message dwords 12-15
2068
+ movaps xmm8, xmm6
2069
+ shufps xmm6, xmm7, 136 ; even-indexed dwords of the second 32 bytes
2070
+ pshufd xmm6, xmm6, 93H ; rotate lanes to line up with the diagonal half-round
2071
+ shufps xmm8, xmm7, 221 ; odd-indexed dwords of the second 32 bytes
2072
+ pshufd xmm7, xmm8, 93H
2073
+ mov al, 7 ; round counter
2074
+ @@:
2075
+ paddd xmm0, xmm4 ; first half-round: row0 += message words, then += row1
2076
+ paddd xmm0, xmm1
2077
+ pxor xmm3, xmm0
2078
+ pshuflw xmm3, xmm3, 0B1H ; swapping the 16-bit halves of each dword = rotate by 16
2079
+ pshufhw xmm3, xmm3, 0B1H
2080
+ paddd xmm2, xmm3
2081
+ pxor xmm1, xmm2
2082
+ movdqa xmm11, xmm1
2083
+ pslld xmm1, 20
2084
+ psrld xmm11, 12
2085
+ por xmm1, xmm11 ; row1 rotated right by 12
2086
+ paddd xmm0, xmm5
2087
+ paddd xmm0, xmm1
2088
+ pxor xmm3, xmm0
2089
+ movdqa xmm14, xmm3
2090
+ psrld xmm3, 8
2091
+ pslld xmm14, 24
2092
+ pxor xmm3, xmm14 ; row3 rotated right by 8
2093
+ paddd xmm2, xmm3
2094
+ pxor xmm1, xmm2
2095
+ movdqa xmm11, xmm1
2096
+ pslld xmm1, 25
2097
+ psrld xmm11, 7
2098
+ por xmm1, xmm11 ; row1 rotated right by 7
2099
+ pshufd xmm0, xmm0, 93H ; rotate the lanes of rows 0, 3 and 2 before the second half-round
2100
+ pshufd xmm3, xmm3, 4EH
2101
+ pshufd xmm2, xmm2, 39H
2102
+ paddd xmm0, xmm6 ; second half-round: same sequence with xmm6 / xmm7 as message words
2103
+ paddd xmm0, xmm1
2104
+ pxor xmm3, xmm0
2105
+ pshuflw xmm3, xmm3, 0B1H
2106
+ pshufhw xmm3, xmm3, 0B1H
2107
+ paddd xmm2, xmm3
2108
+ pxor xmm1, xmm2
2109
+ movdqa xmm11, xmm1
2110
+ pslld xmm1, 20
2111
+ psrld xmm11, 12
2112
+ por xmm1, xmm11
2113
+ paddd xmm0, xmm7
2114
+ paddd xmm0, xmm1
2115
+ pxor xmm3, xmm0
2116
+ movdqa xmm14, xmm3
2117
+ psrld xmm3, 8
2118
+ pslld xmm14, 24
2119
+ pxor xmm3, xmm14
2120
+ paddd xmm2, xmm3
2121
+ pxor xmm1, xmm2
2122
+ movdqa xmm11, xmm1
2123
+ pslld xmm1, 25
2124
+ psrld xmm11, 7
2125
+ por xmm1, xmm11
2126
+ pshufd xmm0, xmm0, 39H ; undo the lane rotation applied before the second half-round
2127
+ pshufd xmm3, xmm3, 4EH
2128
+ pshufd xmm2, xmm2, 93H
2129
+ dec al
2130
+ jz @F ; all 7 rounds done: skip the message shuffle
2131
+ movdqa xmm8, xmm4 ; rearrange the message words in xmm4-xmm7 for the next round
2132
+ shufps xmm8, xmm5, 214
2133
+ pshufd xmm9, xmm4, 0FH
2134
+ pshufd xmm4, xmm8, 39H
2135
+ movdqa xmm8, xmm6
2136
+ shufps xmm8, xmm7, 250
2137
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] ; pand / pand / por merges dwords 0,2 of xmm9 with dwords 1,3 of xmm8
2138
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
2139
+ por xmm9, xmm8
2140
+ movdqa xmm8, xmm7
2141
+ punpcklqdq xmm8, xmm5
2142
+ movdqa xmm10, xmm6 ; xmm10 is scratch here; its original value is restored in the epilogue
2143
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] ; merges dwords 0-2 of xmm8 with dword 3 of xmm10
2144
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
2145
+ por xmm8, xmm10
2146
+ pshufd xmm8, xmm8, 78H
2147
+ punpckhdq xmm5, xmm7
2148
+ punpckldq xmm6, xmm5
2149
+ pshufd xmm7, xmm6, 1EH
2150
+ movdqa xmm5, xmm9
2151
+ movdqa xmm6, xmm8
2152
+ jmp @B
2153
+ @@:
2154
+ pxor xmm0, xmm2 ; result = rows 0-1 xor rows 2-3
2155
+ pxor xmm1, xmm3
2156
+ movups xmmword ptr [rcx], xmm0 ; overwrite the caller's state
2157
+ movups xmmword ptr [rcx+10H], xmm1
2158
+ movdqa xmm6, xmmword ptr [rsp]
2159
+ movdqa xmm7, xmmword ptr [rsp+10H]
2160
+ movdqa xmm8, xmmword ptr [rsp+20H]
2161
+ movdqa xmm9, xmmword ptr [rsp+30H]
2162
+ movdqa xmm11, xmmword ptr [rsp+40H]
2163
+ movdqa xmm14, xmmword ptr [rsp+50H]
2164
+ movdqa xmm10, xmmword ptr [rsp+60H]
2165
+ add rsp, 120
2166
+ ret
2167
+ _blake3_compress_in_place_sse2 ENDP
2168
+ blake3_compress_in_place_sse2 ENDP
2169
+
2170
+ ALIGN 16
2171
+ ; ---------------------------------------------------------------------------
+ ; blake3_compress_xof_sse2 / _blake3_compress_xof_sse2
+ ;
+ ; Runs 7 rounds over one 64-byte message block and writes a 64-byte result to
+ ; a separate output buffer. The input state is only read.
+ ;
+ ; Inputs, as read by the code below (Microsoft x64 ABI):
+ ;   rcx           pointer to 32 bytes of state; read only
+ ;   rdx           pointer to 64 bytes of message; read with unaligned loads
+ ;   r8b           byte placed in dword 2 of state row 3 (presumably block_len)
+ ;   r9            64-bit value placed in dwords 0-1 of row 3 (presumably counter)
+ ;   5th argument  byte on the stack, placed in dword 3 of row 3 (presumably flags)
+ ;   6th argument  pointer on the stack to the 64-byte output buffer
+ ;
+ ; Output: bytes 0-31  = rows 0-1 xor rows 2-3
+ ;         bytes 32-63 = rows 2-3 xor the 32 bytes of input state at [rcx]
+ ;
+ ; Clobbers: rax, r8, r10, flags, xmm0-xmm5.
+ ; Non-volatile XMM registers written here are xmm6-xmm11 and xmm14; each one
+ ; is spilled in the prologue and reloaded before returning. xmm10 is written by
+ ; the message shuffle, so it must have a save slot. xmm12, xmm13 and xmm15 are
+ ; never written and therefore need none.
+ ; ---------------------------------------------------------------------------
+ blake3_compress_xof_sse2 PROC
2172
+ _blake3_compress_xof_sse2 PROC
2173
+ sub rsp, 120 ; 7 save slots of 16 bytes, plus 8 so the slots are 16-byte aligned for movdqa
2174
+ movdqa xmmword ptr [rsp], xmm6
2175
+ movdqa xmmword ptr [rsp+10H], xmm7
2176
+ movdqa xmmword ptr [rsp+20H], xmm8
2177
+ movdqa xmmword ptr [rsp+30H], xmm9
2178
+ movdqa xmmword ptr [rsp+40H], xmm11
2179
+ movdqa xmmword ptr [rsp+50H], xmm14
2180
+ movdqa xmmword ptr [rsp+60H], xmm10 ; callee-saved and written by the message shuffle below
2181
+ movups xmm0, xmmword ptr [rcx] ; state row 0
2182
+ movups xmm1, xmmword ptr [rcx+10H] ; state row 1
2183
+ movaps xmm2, xmmword ptr [BLAKE3_IV] ; state row 2
2184
+ movzx eax, byte ptr [rsp+0A0H] ; 5th argument: 78H of frame + 28H above the entry rsp
2185
+ movzx r8d, r8b
2186
+ mov r10, qword ptr [rsp+0A8H] ; 6th argument: output pointer
2187
+ shl rax, 32
2188
+ add r8, rax ; r8 = (stack byte << 32) | r8b
2189
+ movq xmm3, r9
2190
+ movq xmm4, r8
2191
+ punpcklqdq xmm3, xmm4 ; state row 3 = { r9 low, r9 high, r8b, stack byte }
2192
+ movups xmm4, xmmword ptr [rdx] ; message dwords 0-3
2193
+ movups xmm5, xmmword ptr [rdx+10H] ; message dwords 4-7
2194
+ movaps xmm8, xmm4
2195
+ shufps xmm4, xmm5, 136 ; xmm4 = even-indexed dwords of the first 32 bytes
2196
+ shufps xmm8, xmm5, 221 ; xmm8 = odd-indexed dwords of the first 32 bytes
2197
+ movaps xmm5, xmm8
2198
+ movups xmm6, xmmword ptr [rdx+20H] ; message dwords 8-11
2199
+ movups xmm7, xmmword ptr [rdx+30H] ; message dwords 12-15
2200
+ movaps xmm8, xmm6
2201
+ shufps xmm6, xmm7, 136 ; even-indexed dwords of the second 32 bytes
2202
+ pshufd xmm6, xmm6, 93H ; rotate lanes to line up with the diagonal half-round
2203
+ shufps xmm8, xmm7, 221 ; odd-indexed dwords of the second 32 bytes
2204
+ pshufd xmm7, xmm8, 93H
2205
+ mov al, 7 ; round counter
2206
+ @@:
2207
+ paddd xmm0, xmm4 ; first half-round: row0 += message words, then += row1
2208
+ paddd xmm0, xmm1
2209
+ pxor xmm3, xmm0
2210
+ pshuflw xmm3, xmm3, 0B1H ; swapping the 16-bit halves of each dword = rotate by 16
2211
+ pshufhw xmm3, xmm3, 0B1H
2212
+ paddd xmm2, xmm3
2213
+ pxor xmm1, xmm2
2214
+ movdqa xmm11, xmm1
2215
+ pslld xmm1, 20
2216
+ psrld xmm11, 12
2217
+ por xmm1, xmm11 ; row1 rotated right by 12
2218
+ paddd xmm0, xmm5
2219
+ paddd xmm0, xmm1
2220
+ pxor xmm3, xmm0
2221
+ movdqa xmm14, xmm3
2222
+ psrld xmm3, 8
2223
+ pslld xmm14, 24
2224
+ pxor xmm3, xmm14 ; row3 rotated right by 8
2225
+ paddd xmm2, xmm3
2226
+ pxor xmm1, xmm2
2227
+ movdqa xmm11, xmm1
2228
+ pslld xmm1, 25
2229
+ psrld xmm11, 7
2230
+ por xmm1, xmm11 ; row1 rotated right by 7
2231
+ pshufd xmm0, xmm0, 93H ; rotate the lanes of rows 0, 3 and 2 before the second half-round
2232
+ pshufd xmm3, xmm3, 4EH
2233
+ pshufd xmm2, xmm2, 39H
2234
+ paddd xmm0, xmm6 ; second half-round: same sequence with xmm6 / xmm7 as message words
2235
+ paddd xmm0, xmm1
2236
+ pxor xmm3, xmm0
2237
+ pshuflw xmm3, xmm3, 0B1H
2238
+ pshufhw xmm3, xmm3, 0B1H
2239
+ paddd xmm2, xmm3
2240
+ pxor xmm1, xmm2
2241
+ movdqa xmm11, xmm1
2242
+ pslld xmm1, 20
2243
+ psrld xmm11, 12
2244
+ por xmm1, xmm11
2245
+ paddd xmm0, xmm7
2246
+ paddd xmm0, xmm1
2247
+ pxor xmm3, xmm0
2248
+ movdqa xmm14, xmm3
2249
+ psrld xmm3, 8
2250
+ pslld xmm14, 24
2251
+ pxor xmm3, xmm14
2252
+ paddd xmm2, xmm3
2253
+ pxor xmm1, xmm2
2254
+ movdqa xmm11, xmm1
2255
+ pslld xmm1, 25
2256
+ psrld xmm11, 7
2257
+ por xmm1, xmm11
2258
+ pshufd xmm0, xmm0, 39H ; undo the lane rotation applied before the second half-round
2259
+ pshufd xmm3, xmm3, 4EH
2260
+ pshufd xmm2, xmm2, 93H
2261
+ dec al
2262
+ jz @F ; all 7 rounds done: skip the message shuffle
2263
+ movdqa xmm8, xmm4 ; rearrange the message words in xmm4-xmm7 for the next round
2264
+ shufps xmm8, xmm5, 214
2265
+ pshufd xmm9, xmm4, 0FH
2266
+ pshufd xmm4, xmm8, 39H
2267
+ movdqa xmm8, xmm6
2268
+ shufps xmm8, xmm7, 250
2269
+ pand xmm9, xmmword ptr [PBLENDW_0x33_MASK] ; pand / pand / por merges dwords 0,2 of xmm9 with dwords 1,3 of xmm8
2270
+ pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK]
2271
+ por xmm9, xmm8
2272
+ movdqa xmm8, xmm7
2273
+ punpcklqdq xmm8, xmm5
2274
+ movdqa xmm10, xmm6 ; xmm10 is scratch here; its original value is restored in the epilogue
2275
+ pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK] ; merges dwords 0-2 of xmm8 with dword 3 of xmm10
2276
+ pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK]
2277
+ por xmm8, xmm10
2278
+ pshufd xmm8, xmm8, 78H
2279
+ punpckhdq xmm5, xmm7
2280
+ punpckldq xmm6, xmm5
2281
+ pshufd xmm7, xmm6, 1EH
2282
+ movdqa xmm5, xmm9
2283
+ movdqa xmm6, xmm8
2284
+ jmp @B
2285
+ @@:
2286
+ movdqu xmm4, xmmword ptr [rcx] ; reload the untouched input state for the second half of the output
2287
+ movdqu xmm5, xmmword ptr [rcx+10H]
2288
+ pxor xmm0, xmm2 ; output bytes 0-31 = rows 0-1 xor rows 2-3
2289
+ pxor xmm1, xmm3
2290
+ pxor xmm2, xmm4 ; output bytes 32-63 = rows 2-3 xor input state
2291
+ pxor xmm3, xmm5
2292
+ movups xmmword ptr [r10], xmm0
2293
+ movups xmmword ptr [r10+10H], xmm1
2294
+ movups xmmword ptr [r10+20H], xmm2
2295
+ movups xmmword ptr [r10+30H], xmm3
2296
+ movdqa xmm6, xmmword ptr [rsp]
2297
+ movdqa xmm7, xmmword ptr [rsp+10H]
2298
+ movdqa xmm8, xmmword ptr [rsp+20H]
2299
+ movdqa xmm9, xmmword ptr [rsp+30H]
2300
+ movdqa xmm11, xmmword ptr [rsp+40H]
2301
+ movdqa xmm14, xmmword ptr [rsp+50H]
2302
+ movdqa xmm10, xmmword ptr [rsp+60H]
2303
+ add rsp, 120
2304
+ ret
2305
+ _blake3_compress_xof_sse2 ENDP
2306
+ blake3_compress_xof_sse2 ENDP
2307
+
2308
+ _TEXT ENDS
2309
+
2310
+
2311
+ _RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST' ; read-only constants referenced by the SSE2 routines in this file
2312
+ ALIGN 64 ; every entry below is a multiple of 16 bytes long, so each label stays 16-byte aligned for movaps / pand / pxor memory operands
2313
+ BLAKE3_IV: ; four IV dwords as one 128-bit row; loaded with movaps into state row 2 by the compress routines
2314
+ dd 6A09E667H, 0BB67AE85H, 3C6EF372H, 0A54FF53AH
2315
+
2316
+ ADD0: ; per-lane offsets 0..3; masked by the increment-counter flag and added to the broadcast low counter dword in blake3_hash_many_sse2
2317
+ dd 0, 1, 2, 3
2318
+
2319
+ ADD1: ; per-lane step of 4; masked the same way, kept at [rsp+150H] and added to the low counter dwords after each 4-input batch
2320
+ dd 4 dup (4)
2321
+
2322
+ BLAKE3_IV_0: ; first IV dword repeated in all four lanes; not referenced in the part of the file shown here (presumably the 4-way path, TODO confirm)
2323
+ dd 4 dup (6A09E667H)
2324
+
2325
+ BLAKE3_IV_1: ; second IV dword repeated in all four lanes (same note as BLAKE3_IV_0)
2326
+ dd 4 dup (0BB67AE85H)
2327
+
2328
+ BLAKE3_IV_2: ; third IV dword repeated in all four lanes (same note as BLAKE3_IV_0)
2329
+ dd 4 dup (3C6EF372H)
2330
+
2331
+ BLAKE3_IV_3: ; fourth IV dword repeated in all four lanes (same note as BLAKE3_IV_0)
2332
+ dd 4 dup (0A54FF53AH)
2333
+
2334
+ BLAKE3_BLOCK_LEN: ; the value 64 repeated in all four lanes; not referenced in the part of the file shown here (presumably the 4-way path, TODO confirm)
2335
+ dd 4 dup (64)
2336
+
2337
+ CMP_MSB_MASK: ; sign-bit mask XORed into both operands before pcmpgtd, so the signed compare orders them as unsigned values (used to detect carry out of the low counter dwords)
2338
+ dd 8 dup(80000000H) ; 8 dwords are defined; the xmmword accesses visible in this file read only the first 16 bytes
2339
+
2340
+ PBLENDW_0x33_MASK: ; keeps dwords 0 and 2; paired with the 0xCC mask in a pand / pand / por merge, matching what a pblendw with immediate 33H would select
2341
+ dd 0FFFFFFFFH, 000000000H, 0FFFFFFFFH, 000000000H
2342
+ PBLENDW_0xCC_MASK: ; keeps dwords 1 and 3 (complement of the 0x33 mask)
2343
+ dd 000000000H, 0FFFFFFFFH, 000000000H, 0FFFFFFFFH
2344
+ PBLENDW_0x3F_MASK: ; keeps dwords 0-2; paired with the 0xC0 mask in the same kind of merge
2345
+ dd 0FFFFFFFFH, 0FFFFFFFFH, 0FFFFFFFFH, 000000000H
2346
+ PBLENDW_0xC0_MASK: ; keeps dword 3 (complement of the 0x3F mask)
2347
+ dd 000000000H, 000000000H, 000000000H, 0FFFFFFFFH
2348
+
2349
+ _RDATA ENDS
2350
+ END