digest-blake3 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,2077 @@
1
+ public _blake3_hash_many_sse41 ; export the underscore-prefixed name of the hash_many entry point
2
+ public blake3_hash_many_sse41 ; export the plain name; both hash_many labels are opened back-to-back by the PROC pair that follows
3
+ public blake3_compress_in_place_sse41 ; export compress_in_place (its PROC is not in this part of the file)
4
+ public _blake3_compress_in_place_sse41 ; underscore-prefixed name, presumably an alias of the same routine - TODO confirm at its PROC
5
+ public blake3_compress_xof_sse41 ; export compress_xof (its PROC is not in this part of the file)
6
+ public _blake3_compress_xof_sse41 ; underscore-prefixed name, presumably an alias of the same routine - TODO confirm at its PROC
7
+
8
+ _TEXT SEGMENT ALIGN(16) 'CODE' ; open the _TEXT segment with class CODE, aligned to 16 bytes
9
+
10
+ ALIGN 16 ; pad so that the procedure label that follows starts on a 16-byte boundary
11
+ blake3_hash_many_sse41 PROC
12
+ _blake3_hash_many_sse41 PROC
13
+ push r15
14
+ push r14
15
+ push r13
16
+ push r12
17
+ push rsi
18
+ push rdi
19
+ push rbx
20
+ push rbp
21
+ mov rbp, rsp
22
+ sub rsp, 528
23
+ and rsp, 0FFFFFFFFFFFFFFC0H
24
+ movdqa xmmword ptr [rsp+170H], xmm6
25
+ movdqa xmmword ptr [rsp+180H], xmm7
26
+ movdqa xmmword ptr [rsp+190H], xmm8
27
+ movdqa xmmword ptr [rsp+1A0H], xmm9
28
+ movdqa xmmword ptr [rsp+1B0H], xmm10
29
+ movdqa xmmword ptr [rsp+1C0H], xmm11
30
+ movdqa xmmword ptr [rsp+1D0H], xmm12
31
+ movdqa xmmword ptr [rsp+1E0H], xmm13
32
+ movdqa xmmword ptr [rsp+1F0H], xmm14
33
+ movdqa xmmword ptr [rsp+200H], xmm15
34
+ mov rdi, rcx
35
+ mov rsi, rdx
36
+ mov rdx, r8
37
+ mov rcx, r9
38
+ mov r8, qword ptr [rbp+68H]
39
+ movzx r9, byte ptr [rbp+70H]
40
+ neg r9d
41
+ movd xmm0, r9d
42
+ pshufd xmm0, xmm0, 00H
43
+ movdqa xmmword ptr [rsp+130H], xmm0
44
+ movdqa xmm1, xmm0
45
+ pand xmm1, xmmword ptr [ADD0]
46
+ pand xmm0, xmmword ptr [ADD1]
47
+ movdqa xmmword ptr [rsp+150H], xmm0
48
+ movd xmm0, r8d
49
+ pshufd xmm0, xmm0, 00H
50
+ paddd xmm0, xmm1
51
+ movdqa xmmword ptr [rsp+110H], xmm0
52
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK]
53
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK]
54
+ pcmpgtd xmm1, xmm0
55
+ shr r8, 32
56
+ movd xmm2, r8d
57
+ pshufd xmm2, xmm2, 00H
58
+ psubd xmm2, xmm1
59
+ movdqa xmmword ptr [rsp+120H], xmm2
60
+ mov rbx, qword ptr [rbp+90H]
61
+ mov r15, rdx
62
+ shl r15, 6
63
+ movzx r13d, byte ptr [rbp+78H]
64
+ movzx r12d, byte ptr [rbp+88H]
65
+ cmp rsi, 4
66
+ jc final3blocks
67
+ outerloop4:
68
+ movdqu xmm3, xmmword ptr [rcx]
69
+ pshufd xmm0, xmm3, 00H
70
+ pshufd xmm1, xmm3, 55H
71
+ pshufd xmm2, xmm3, 0AAH
72
+ pshufd xmm3, xmm3, 0FFH
73
+ movdqu xmm7, xmmword ptr [rcx+10H]
74
+ pshufd xmm4, xmm7, 00H
75
+ pshufd xmm5, xmm7, 55H
76
+ pshufd xmm6, xmm7, 0AAH
77
+ pshufd xmm7, xmm7, 0FFH
78
+ mov r8, qword ptr [rdi]
79
+ mov r9, qword ptr [rdi+8H]
80
+ mov r10, qword ptr [rdi+10H]
81
+ mov r11, qword ptr [rdi+18H]
82
+ movzx eax, byte ptr [rbp+80H]
83
+ or eax, r13d
84
+ xor edx, edx
85
+ innerloop4:
86
+ mov r14d, eax
87
+ or eax, r12d
88
+ add rdx, 64
89
+ cmp rdx, r15
90
+ cmovne eax, r14d
91
+ movdqu xmm8, xmmword ptr [r8+rdx-40H]
92
+ movdqu xmm9, xmmword ptr [r9+rdx-40H]
93
+ movdqu xmm10, xmmword ptr [r10+rdx-40H]
94
+ movdqu xmm11, xmmword ptr [r11+rdx-40H]
95
+ movdqa xmm12, xmm8
96
+ punpckldq xmm8, xmm9
97
+ punpckhdq xmm12, xmm9
98
+ movdqa xmm14, xmm10
99
+ punpckldq xmm10, xmm11
100
+ punpckhdq xmm14, xmm11
101
+ movdqa xmm9, xmm8
102
+ punpcklqdq xmm8, xmm10
103
+ punpckhqdq xmm9, xmm10
104
+ movdqa xmm13, xmm12
105
+ punpcklqdq xmm12, xmm14
106
+ punpckhqdq xmm13, xmm14
107
+ movdqa xmmword ptr [rsp], xmm8
108
+ movdqa xmmword ptr [rsp+10H], xmm9
109
+ movdqa xmmword ptr [rsp+20H], xmm12
110
+ movdqa xmmword ptr [rsp+30H], xmm13
111
+ movdqu xmm8, xmmword ptr [r8+rdx-30H]
112
+ movdqu xmm9, xmmword ptr [r9+rdx-30H]
113
+ movdqu xmm10, xmmword ptr [r10+rdx-30H]
114
+ movdqu xmm11, xmmword ptr [r11+rdx-30H]
115
+ movdqa xmm12, xmm8
116
+ punpckldq xmm8, xmm9
117
+ punpckhdq xmm12, xmm9
118
+ movdqa xmm14, xmm10
119
+ punpckldq xmm10, xmm11
120
+ punpckhdq xmm14, xmm11
121
+ movdqa xmm9, xmm8
122
+ punpcklqdq xmm8, xmm10
123
+ punpckhqdq xmm9, xmm10
124
+ movdqa xmm13, xmm12
125
+ punpcklqdq xmm12, xmm14
126
+ punpckhqdq xmm13, xmm14
127
+ movdqa xmmword ptr [rsp+40H], xmm8
128
+ movdqa xmmword ptr [rsp+50H], xmm9
129
+ movdqa xmmword ptr [rsp+60H], xmm12
130
+ movdqa xmmword ptr [rsp+70H], xmm13
131
+ movdqu xmm8, xmmword ptr [r8+rdx-20H]
132
+ movdqu xmm9, xmmword ptr [r9+rdx-20H]
133
+ movdqu xmm10, xmmword ptr [r10+rdx-20H]
134
+ movdqu xmm11, xmmword ptr [r11+rdx-20H]
135
+ movdqa xmm12, xmm8
136
+ punpckldq xmm8, xmm9
137
+ punpckhdq xmm12, xmm9
138
+ movdqa xmm14, xmm10
139
+ punpckldq xmm10, xmm11
140
+ punpckhdq xmm14, xmm11
141
+ movdqa xmm9, xmm8
142
+ punpcklqdq xmm8, xmm10
143
+ punpckhqdq xmm9, xmm10
144
+ movdqa xmm13, xmm12
145
+ punpcklqdq xmm12, xmm14
146
+ punpckhqdq xmm13, xmm14
147
+ movdqa xmmword ptr [rsp+80H], xmm8
148
+ movdqa xmmword ptr [rsp+90H], xmm9
149
+ movdqa xmmword ptr [rsp+0A0H], xmm12
150
+ movdqa xmmword ptr [rsp+0B0H], xmm13
151
+ movdqu xmm8, xmmword ptr [r8+rdx-10H]
152
+ movdqu xmm9, xmmword ptr [r9+rdx-10H]
153
+ movdqu xmm10, xmmword ptr [r10+rdx-10H]
154
+ movdqu xmm11, xmmword ptr [r11+rdx-10H]
155
+ movdqa xmm12, xmm8
156
+ punpckldq xmm8, xmm9
157
+ punpckhdq xmm12, xmm9
158
+ movdqa xmm14, xmm10
159
+ punpckldq xmm10, xmm11
160
+ punpckhdq xmm14, xmm11
161
+ movdqa xmm9, xmm8
162
+ punpcklqdq xmm8, xmm10
163
+ punpckhqdq xmm9, xmm10
164
+ movdqa xmm13, xmm12
165
+ punpcklqdq xmm12, xmm14
166
+ punpckhqdq xmm13, xmm14
167
+ movdqa xmmword ptr [rsp+0C0H], xmm8
168
+ movdqa xmmword ptr [rsp+0D0H], xmm9
169
+ movdqa xmmword ptr [rsp+0E0H], xmm12
170
+ movdqa xmmword ptr [rsp+0F0H], xmm13
171
+ movdqa xmm9, xmmword ptr [BLAKE3_IV_1]
172
+ movdqa xmm10, xmmword ptr [BLAKE3_IV_2]
173
+ movdqa xmm11, xmmword ptr [BLAKE3_IV_3]
174
+ movdqa xmm12, xmmword ptr [rsp+110H]
175
+ movdqa xmm13, xmmword ptr [rsp+120H]
176
+ movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN]
177
+ movd xmm15, eax
178
+ pshufd xmm15, xmm15, 00H
179
+ prefetcht0 byte ptr [r8+rdx+80H]
180
+ prefetcht0 byte ptr [r9+rdx+80H]
181
+ prefetcht0 byte ptr [r10+rdx+80H]
182
+ prefetcht0 byte ptr [r11+rdx+80H]
183
+ paddd xmm0, xmmword ptr [rsp]
184
+ paddd xmm1, xmmword ptr [rsp+20H]
185
+ paddd xmm2, xmmword ptr [rsp+40H]
186
+ paddd xmm3, xmmword ptr [rsp+60H]
187
+ paddd xmm0, xmm4
188
+ paddd xmm1, xmm5
189
+ paddd xmm2, xmm6
190
+ paddd xmm3, xmm7
191
+ pxor xmm12, xmm0
192
+ pxor xmm13, xmm1
193
+ pxor xmm14, xmm2
194
+ pxor xmm15, xmm3
195
+ movdqa xmm8, xmmword ptr [ROT16]
196
+ pshufb xmm12, xmm8
197
+ pshufb xmm13, xmm8
198
+ pshufb xmm14, xmm8
199
+ pshufb xmm15, xmm8
200
+ movdqa xmm8, xmmword ptr [BLAKE3_IV_0]
201
+ paddd xmm8, xmm12
202
+ paddd xmm9, xmm13
203
+ paddd xmm10, xmm14
204
+ paddd xmm11, xmm15
205
+ pxor xmm4, xmm8
206
+ pxor xmm5, xmm9
207
+ pxor xmm6, xmm10
208
+ pxor xmm7, xmm11
209
+ movdqa xmmword ptr [rsp+100H], xmm8
210
+ movdqa xmm8, xmm4
211
+ psrld xmm8, 12
212
+ pslld xmm4, 20
213
+ por xmm4, xmm8
214
+ movdqa xmm8, xmm5
215
+ psrld xmm8, 12
216
+ pslld xmm5, 20
217
+ por xmm5, xmm8
218
+ movdqa xmm8, xmm6
219
+ psrld xmm8, 12
220
+ pslld xmm6, 20
221
+ por xmm6, xmm8
222
+ movdqa xmm8, xmm7
223
+ psrld xmm8, 12
224
+ pslld xmm7, 20
225
+ por xmm7, xmm8
226
+ paddd xmm0, xmmword ptr [rsp+10H]
227
+ paddd xmm1, xmmword ptr [rsp+30H]
228
+ paddd xmm2, xmmword ptr [rsp+50H]
229
+ paddd xmm3, xmmword ptr [rsp+70H]
230
+ paddd xmm0, xmm4
231
+ paddd xmm1, xmm5
232
+ paddd xmm2, xmm6
233
+ paddd xmm3, xmm7
234
+ pxor xmm12, xmm0
235
+ pxor xmm13, xmm1
236
+ pxor xmm14, xmm2
237
+ pxor xmm15, xmm3
238
+ movdqa xmm8, xmmword ptr [ROT8]
239
+ pshufb xmm12, xmm8
240
+ pshufb xmm13, xmm8
241
+ pshufb xmm14, xmm8
242
+ pshufb xmm15, xmm8
243
+ movdqa xmm8, xmmword ptr [rsp+100H]
244
+ paddd xmm8, xmm12
245
+ paddd xmm9, xmm13
246
+ paddd xmm10, xmm14
247
+ paddd xmm11, xmm15
248
+ pxor xmm4, xmm8
249
+ pxor xmm5, xmm9
250
+ pxor xmm6, xmm10
251
+ pxor xmm7, xmm11
252
+ movdqa xmmword ptr [rsp+100H], xmm8
253
+ movdqa xmm8, xmm4
254
+ psrld xmm8, 7
255
+ pslld xmm4, 25
256
+ por xmm4, xmm8
257
+ movdqa xmm8, xmm5
258
+ psrld xmm8, 7
259
+ pslld xmm5, 25
260
+ por xmm5, xmm8
261
+ movdqa xmm8, xmm6
262
+ psrld xmm8, 7
263
+ pslld xmm6, 25
264
+ por xmm6, xmm8
265
+ movdqa xmm8, xmm7
266
+ psrld xmm8, 7
267
+ pslld xmm7, 25
268
+ por xmm7, xmm8
269
+ paddd xmm0, xmmword ptr [rsp+80H]
270
+ paddd xmm1, xmmword ptr [rsp+0A0H]
271
+ paddd xmm2, xmmword ptr [rsp+0C0H]
272
+ paddd xmm3, xmmword ptr [rsp+0E0H]
273
+ paddd xmm0, xmm5
274
+ paddd xmm1, xmm6
275
+ paddd xmm2, xmm7
276
+ paddd xmm3, xmm4
277
+ pxor xmm15, xmm0
278
+ pxor xmm12, xmm1
279
+ pxor xmm13, xmm2
280
+ pxor xmm14, xmm3
281
+ movdqa xmm8, xmmword ptr [ROT16]
282
+ pshufb xmm15, xmm8
283
+ pshufb xmm12, xmm8
284
+ pshufb xmm13, xmm8
285
+ pshufb xmm14, xmm8
286
+ paddd xmm10, xmm15
287
+ paddd xmm11, xmm12
288
+ movdqa xmm8, xmmword ptr [rsp+100H]
289
+ paddd xmm8, xmm13
290
+ paddd xmm9, xmm14
291
+ pxor xmm5, xmm10
292
+ pxor xmm6, xmm11
293
+ pxor xmm7, xmm8
294
+ pxor xmm4, xmm9
295
+ movdqa xmmword ptr [rsp+100H], xmm8
296
+ movdqa xmm8, xmm5
297
+ psrld xmm8, 12
298
+ pslld xmm5, 20
299
+ por xmm5, xmm8
300
+ movdqa xmm8, xmm6
301
+ psrld xmm8, 12
302
+ pslld xmm6, 20
303
+ por xmm6, xmm8
304
+ movdqa xmm8, xmm7
305
+ psrld xmm8, 12
306
+ pslld xmm7, 20
307
+ por xmm7, xmm8
308
+ movdqa xmm8, xmm4
309
+ psrld xmm8, 12
310
+ pslld xmm4, 20
311
+ por xmm4, xmm8
312
+ paddd xmm0, xmmword ptr [rsp+90H]
313
+ paddd xmm1, xmmword ptr [rsp+0B0H]
314
+ paddd xmm2, xmmword ptr [rsp+0D0H]
315
+ paddd xmm3, xmmword ptr [rsp+0F0H]
316
+ paddd xmm0, xmm5
317
+ paddd xmm1, xmm6
318
+ paddd xmm2, xmm7
319
+ paddd xmm3, xmm4
320
+ pxor xmm15, xmm0
321
+ pxor xmm12, xmm1
322
+ pxor xmm13, xmm2
323
+ pxor xmm14, xmm3
324
+ movdqa xmm8, xmmword ptr [ROT8]
325
+ pshufb xmm15, xmm8
326
+ pshufb xmm12, xmm8
327
+ pshufb xmm13, xmm8
328
+ pshufb xmm14, xmm8
329
+ paddd xmm10, xmm15
330
+ paddd xmm11, xmm12
331
+ movdqa xmm8, xmmword ptr [rsp+100H]
332
+ paddd xmm8, xmm13
333
+ paddd xmm9, xmm14
334
+ pxor xmm5, xmm10
335
+ pxor xmm6, xmm11
336
+ pxor xmm7, xmm8
337
+ pxor xmm4, xmm9
338
+ movdqa xmmword ptr [rsp+100H], xmm8
339
+ movdqa xmm8, xmm5
340
+ psrld xmm8, 7
341
+ pslld xmm5, 25
342
+ por xmm5, xmm8
343
+ movdqa xmm8, xmm6
344
+ psrld xmm8, 7
345
+ pslld xmm6, 25
346
+ por xmm6, xmm8
347
+ movdqa xmm8, xmm7
348
+ psrld xmm8, 7
349
+ pslld xmm7, 25
350
+ por xmm7, xmm8
351
+ movdqa xmm8, xmm4
352
+ psrld xmm8, 7
353
+ pslld xmm4, 25
354
+ por xmm4, xmm8
355
+ paddd xmm0, xmmword ptr [rsp+20H]
356
+ paddd xmm1, xmmword ptr [rsp+30H]
357
+ paddd xmm2, xmmword ptr [rsp+70H]
358
+ paddd xmm3, xmmword ptr [rsp+40H]
359
+ paddd xmm0, xmm4
360
+ paddd xmm1, xmm5
361
+ paddd xmm2, xmm6
362
+ paddd xmm3, xmm7
363
+ pxor xmm12, xmm0
364
+ pxor xmm13, xmm1
365
+ pxor xmm14, xmm2
366
+ pxor xmm15, xmm3
367
+ movdqa xmm8, xmmword ptr [ROT16]
368
+ pshufb xmm12, xmm8
369
+ pshufb xmm13, xmm8
370
+ pshufb xmm14, xmm8
371
+ pshufb xmm15, xmm8
372
+ movdqa xmm8, xmmword ptr [rsp+100H]
373
+ paddd xmm8, xmm12
374
+ paddd xmm9, xmm13
375
+ paddd xmm10, xmm14
376
+ paddd xmm11, xmm15
377
+ pxor xmm4, xmm8
378
+ pxor xmm5, xmm9
379
+ pxor xmm6, xmm10
380
+ pxor xmm7, xmm11
381
+ movdqa xmmword ptr [rsp+100H], xmm8
382
+ movdqa xmm8, xmm4
383
+ psrld xmm8, 12
384
+ pslld xmm4, 20
385
+ por xmm4, xmm8
386
+ movdqa xmm8, xmm5
387
+ psrld xmm8, 12
388
+ pslld xmm5, 20
389
+ por xmm5, xmm8
390
+ movdqa xmm8, xmm6
391
+ psrld xmm8, 12
392
+ pslld xmm6, 20
393
+ por xmm6, xmm8
394
+ movdqa xmm8, xmm7
395
+ psrld xmm8, 12
396
+ pslld xmm7, 20
397
+ por xmm7, xmm8
398
+ paddd xmm0, xmmword ptr [rsp+60H]
399
+ paddd xmm1, xmmword ptr [rsp+0A0H]
400
+ paddd xmm2, xmmword ptr [rsp]
401
+ paddd xmm3, xmmword ptr [rsp+0D0H]
402
+ paddd xmm0, xmm4
403
+ paddd xmm1, xmm5
404
+ paddd xmm2, xmm6
405
+ paddd xmm3, xmm7
406
+ pxor xmm12, xmm0
407
+ pxor xmm13, xmm1
408
+ pxor xmm14, xmm2
409
+ pxor xmm15, xmm3
410
+ movdqa xmm8, xmmword ptr [ROT8]
411
+ pshufb xmm12, xmm8
412
+ pshufb xmm13, xmm8
413
+ pshufb xmm14, xmm8
414
+ pshufb xmm15, xmm8
415
+ movdqa xmm8, xmmword ptr [rsp+100H]
416
+ paddd xmm8, xmm12
417
+ paddd xmm9, xmm13
418
+ paddd xmm10, xmm14
419
+ paddd xmm11, xmm15
420
+ pxor xmm4, xmm8
421
+ pxor xmm5, xmm9
422
+ pxor xmm6, xmm10
423
+ pxor xmm7, xmm11
424
+ movdqa xmmword ptr [rsp+100H], xmm8
425
+ movdqa xmm8, xmm4
426
+ psrld xmm8, 7
427
+ pslld xmm4, 25
428
+ por xmm4, xmm8
429
+ movdqa xmm8, xmm5
430
+ psrld xmm8, 7
431
+ pslld xmm5, 25
432
+ por xmm5, xmm8
433
+ movdqa xmm8, xmm6
434
+ psrld xmm8, 7
435
+ pslld xmm6, 25
436
+ por xmm6, xmm8
437
+ movdqa xmm8, xmm7
438
+ psrld xmm8, 7
439
+ pslld xmm7, 25
440
+ por xmm7, xmm8
441
+ paddd xmm0, xmmword ptr [rsp+10H]
442
+ paddd xmm1, xmmword ptr [rsp+0C0H]
443
+ paddd xmm2, xmmword ptr [rsp+90H]
444
+ paddd xmm3, xmmword ptr [rsp+0F0H]
445
+ paddd xmm0, xmm5
446
+ paddd xmm1, xmm6
447
+ paddd xmm2, xmm7
448
+ paddd xmm3, xmm4
449
+ pxor xmm15, xmm0
450
+ pxor xmm12, xmm1
451
+ pxor xmm13, xmm2
452
+ pxor xmm14, xmm3
453
+ movdqa xmm8, xmmword ptr [ROT16]
454
+ pshufb xmm15, xmm8
455
+ pshufb xmm12, xmm8
456
+ pshufb xmm13, xmm8
457
+ pshufb xmm14, xmm8
458
+ paddd xmm10, xmm15
459
+ paddd xmm11, xmm12
460
+ movdqa xmm8, xmmword ptr [rsp+100H]
461
+ paddd xmm8, xmm13
462
+ paddd xmm9, xmm14
463
+ pxor xmm5, xmm10
464
+ pxor xmm6, xmm11
465
+ pxor xmm7, xmm8
466
+ pxor xmm4, xmm9
467
+ movdqa xmmword ptr [rsp+100H], xmm8
468
+ movdqa xmm8, xmm5
469
+ psrld xmm8, 12
470
+ pslld xmm5, 20
471
+ por xmm5, xmm8
472
+ movdqa xmm8, xmm6
473
+ psrld xmm8, 12
474
+ pslld xmm6, 20
475
+ por xmm6, xmm8
476
+ movdqa xmm8, xmm7
477
+ psrld xmm8, 12
478
+ pslld xmm7, 20
479
+ por xmm7, xmm8
480
+ movdqa xmm8, xmm4
481
+ psrld xmm8, 12
482
+ pslld xmm4, 20
483
+ por xmm4, xmm8
484
+ paddd xmm0, xmmword ptr [rsp+0B0H]
485
+ paddd xmm1, xmmword ptr [rsp+50H]
486
+ paddd xmm2, xmmword ptr [rsp+0E0H]
487
+ paddd xmm3, xmmword ptr [rsp+80H]
488
+ paddd xmm0, xmm5
489
+ paddd xmm1, xmm6
490
+ paddd xmm2, xmm7
491
+ paddd xmm3, xmm4
492
+ pxor xmm15, xmm0
493
+ pxor xmm12, xmm1
494
+ pxor xmm13, xmm2
495
+ pxor xmm14, xmm3
496
+ movdqa xmm8, xmmword ptr [ROT8]
497
+ pshufb xmm15, xmm8
498
+ pshufb xmm12, xmm8
499
+ pshufb xmm13, xmm8
500
+ pshufb xmm14, xmm8
501
+ paddd xmm10, xmm15
502
+ paddd xmm11, xmm12
503
+ movdqa xmm8, xmmword ptr [rsp+100H]
504
+ paddd xmm8, xmm13
505
+ paddd xmm9, xmm14
506
+ pxor xmm5, xmm10
507
+ pxor xmm6, xmm11
508
+ pxor xmm7, xmm8
509
+ pxor xmm4, xmm9
510
+ movdqa xmmword ptr [rsp+100H], xmm8
511
+ movdqa xmm8, xmm5
512
+ psrld xmm8, 7
513
+ pslld xmm5, 25
514
+ por xmm5, xmm8
515
+ movdqa xmm8, xmm6
516
+ psrld xmm8, 7
517
+ pslld xmm6, 25
518
+ por xmm6, xmm8
519
+ movdqa xmm8, xmm7
520
+ psrld xmm8, 7
521
+ pslld xmm7, 25
522
+ por xmm7, xmm8
523
+ movdqa xmm8, xmm4
524
+ psrld xmm8, 7
525
+ pslld xmm4, 25
526
+ por xmm4, xmm8
527
+ paddd xmm0, xmmword ptr [rsp+30H]
528
+ paddd xmm1, xmmword ptr [rsp+0A0H]
529
+ paddd xmm2, xmmword ptr [rsp+0D0H]
530
+ paddd xmm3, xmmword ptr [rsp+70H]
531
+ paddd xmm0, xmm4
532
+ paddd xmm1, xmm5
533
+ paddd xmm2, xmm6
534
+ paddd xmm3, xmm7
535
+ pxor xmm12, xmm0
536
+ pxor xmm13, xmm1
537
+ pxor xmm14, xmm2
538
+ pxor xmm15, xmm3
539
+ movdqa xmm8, xmmword ptr [ROT16]
540
+ pshufb xmm12, xmm8
541
+ pshufb xmm13, xmm8
542
+ pshufb xmm14, xmm8
543
+ pshufb xmm15, xmm8
544
+ movdqa xmm8, xmmword ptr [rsp+100H]
545
+ paddd xmm8, xmm12
546
+ paddd xmm9, xmm13
547
+ paddd xmm10, xmm14
548
+ paddd xmm11, xmm15
549
+ pxor xmm4, xmm8
550
+ pxor xmm5, xmm9
551
+ pxor xmm6, xmm10
552
+ pxor xmm7, xmm11
553
+ movdqa xmmword ptr [rsp+100H], xmm8
554
+ movdqa xmm8, xmm4
555
+ psrld xmm8, 12
556
+ pslld xmm4, 20
557
+ por xmm4, xmm8
558
+ movdqa xmm8, xmm5
559
+ psrld xmm8, 12
560
+ pslld xmm5, 20
561
+ por xmm5, xmm8
562
+ movdqa xmm8, xmm6
563
+ psrld xmm8, 12
564
+ pslld xmm6, 20
565
+ por xmm6, xmm8
566
+ movdqa xmm8, xmm7
567
+ psrld xmm8, 12
568
+ pslld xmm7, 20
569
+ por xmm7, xmm8
570
+ paddd xmm0, xmmword ptr [rsp+40H]
571
+ paddd xmm1, xmmword ptr [rsp+0C0H]
572
+ paddd xmm2, xmmword ptr [rsp+20H]
573
+ paddd xmm3, xmmword ptr [rsp+0E0H]
574
+ paddd xmm0, xmm4
575
+ paddd xmm1, xmm5
576
+ paddd xmm2, xmm6
577
+ paddd xmm3, xmm7
578
+ pxor xmm12, xmm0
579
+ pxor xmm13, xmm1
580
+ pxor xmm14, xmm2
581
+ pxor xmm15, xmm3
582
+ movdqa xmm8, xmmword ptr [ROT8]
583
+ pshufb xmm12, xmm8
584
+ pshufb xmm13, xmm8
585
+ pshufb xmm14, xmm8
586
+ pshufb xmm15, xmm8
587
+ movdqa xmm8, xmmword ptr [rsp+100H]
588
+ paddd xmm8, xmm12
589
+ paddd xmm9, xmm13
590
+ paddd xmm10, xmm14
591
+ paddd xmm11, xmm15
592
+ pxor xmm4, xmm8
593
+ pxor xmm5, xmm9
594
+ pxor xmm6, xmm10
595
+ pxor xmm7, xmm11
596
+ movdqa xmmword ptr [rsp+100H], xmm8
597
+ movdqa xmm8, xmm4
598
+ psrld xmm8, 7
599
+ pslld xmm4, 25
600
+ por xmm4, xmm8
601
+ movdqa xmm8, xmm5
602
+ psrld xmm8, 7
603
+ pslld xmm5, 25
604
+ por xmm5, xmm8
605
+ movdqa xmm8, xmm6
606
+ psrld xmm8, 7
607
+ pslld xmm6, 25
608
+ por xmm6, xmm8
609
+ movdqa xmm8, xmm7
610
+ psrld xmm8, 7
611
+ pslld xmm7, 25
612
+ por xmm7, xmm8
613
+ paddd xmm0, xmmword ptr [rsp+60H]
614
+ paddd xmm1, xmmword ptr [rsp+90H]
615
+ paddd xmm2, xmmword ptr [rsp+0B0H]
616
+ paddd xmm3, xmmword ptr [rsp+80H]
617
+ paddd xmm0, xmm5
618
+ paddd xmm1, xmm6
619
+ paddd xmm2, xmm7
620
+ paddd xmm3, xmm4
621
+ pxor xmm15, xmm0
622
+ pxor xmm12, xmm1
623
+ pxor xmm13, xmm2
624
+ pxor xmm14, xmm3
625
+ movdqa xmm8, xmmword ptr [ROT16]
626
+ pshufb xmm15, xmm8
627
+ pshufb xmm12, xmm8
628
+ pshufb xmm13, xmm8
629
+ pshufb xmm14, xmm8
630
+ paddd xmm10, xmm15
631
+ paddd xmm11, xmm12
632
+ movdqa xmm8, xmmword ptr [rsp+100H]
633
+ paddd xmm8, xmm13
634
+ paddd xmm9, xmm14
635
+ pxor xmm5, xmm10
636
+ pxor xmm6, xmm11
637
+ pxor xmm7, xmm8
638
+ pxor xmm4, xmm9
639
+ movdqa xmmword ptr [rsp+100H], xmm8
640
+ movdqa xmm8, xmm5
641
+ psrld xmm8, 12
642
+ pslld xmm5, 20
643
+ por xmm5, xmm8
644
+ movdqa xmm8, xmm6
645
+ psrld xmm8, 12
646
+ pslld xmm6, 20
647
+ por xmm6, xmm8
648
+ movdqa xmm8, xmm7
649
+ psrld xmm8, 12
650
+ pslld xmm7, 20
651
+ por xmm7, xmm8
652
+ movdqa xmm8, xmm4
653
+ psrld xmm8, 12
654
+ pslld xmm4, 20
655
+ por xmm4, xmm8
656
+ paddd xmm0, xmmword ptr [rsp+50H]
657
+ paddd xmm1, xmmword ptr [rsp]
658
+ paddd xmm2, xmmword ptr [rsp+0F0H]
659
+ paddd xmm3, xmmword ptr [rsp+10H]
660
+ paddd xmm0, xmm5
661
+ paddd xmm1, xmm6
662
+ paddd xmm2, xmm7
663
+ paddd xmm3, xmm4
664
+ pxor xmm15, xmm0
665
+ pxor xmm12, xmm1
666
+ pxor xmm13, xmm2
667
+ pxor xmm14, xmm3
668
+ movdqa xmm8, xmmword ptr [ROT8]
669
+ pshufb xmm15, xmm8
670
+ pshufb xmm12, xmm8
671
+ pshufb xmm13, xmm8
672
+ pshufb xmm14, xmm8
673
+ paddd xmm10, xmm15
674
+ paddd xmm11, xmm12
675
+ movdqa xmm8, xmmword ptr [rsp+100H]
676
+ paddd xmm8, xmm13
677
+ paddd xmm9, xmm14
678
+ pxor xmm5, xmm10
679
+ pxor xmm6, xmm11
680
+ pxor xmm7, xmm8
681
+ pxor xmm4, xmm9
682
+ movdqa xmmword ptr [rsp+100H], xmm8
683
+ movdqa xmm8, xmm5
684
+ psrld xmm8, 7
685
+ pslld xmm5, 25
686
+ por xmm5, xmm8
687
+ movdqa xmm8, xmm6
688
+ psrld xmm8, 7
689
+ pslld xmm6, 25
690
+ por xmm6, xmm8
691
+ movdqa xmm8, xmm7
692
+ psrld xmm8, 7
693
+ pslld xmm7, 25
694
+ por xmm7, xmm8
695
+ movdqa xmm8, xmm4
696
+ psrld xmm8, 7
697
+ pslld xmm4, 25
698
+ por xmm4, xmm8
699
+ paddd xmm0, xmmword ptr [rsp+0A0H]
700
+ paddd xmm1, xmmword ptr [rsp+0C0H]
701
+ paddd xmm2, xmmword ptr [rsp+0E0H]
702
+ paddd xmm3, xmmword ptr [rsp+0D0H]
703
+ paddd xmm0, xmm4
704
+ paddd xmm1, xmm5
705
+ paddd xmm2, xmm6
706
+ paddd xmm3, xmm7
707
+ pxor xmm12, xmm0
708
+ pxor xmm13, xmm1
709
+ pxor xmm14, xmm2
710
+ pxor xmm15, xmm3
711
+ movdqa xmm8, xmmword ptr [ROT16]
712
+ pshufb xmm12, xmm8
713
+ pshufb xmm13, xmm8
714
+ pshufb xmm14, xmm8
715
+ pshufb xmm15, xmm8
716
+ movdqa xmm8, xmmword ptr [rsp+100H]
717
+ paddd xmm8, xmm12
718
+ paddd xmm9, xmm13
719
+ paddd xmm10, xmm14
720
+ paddd xmm11, xmm15
721
+ pxor xmm4, xmm8
722
+ pxor xmm5, xmm9
723
+ pxor xmm6, xmm10
724
+ pxor xmm7, xmm11
725
+ movdqa xmmword ptr [rsp+100H], xmm8
726
+ movdqa xmm8, xmm4
727
+ psrld xmm8, 12
728
+ pslld xmm4, 20
729
+ por xmm4, xmm8
730
+ movdqa xmm8, xmm5
731
+ psrld xmm8, 12
732
+ pslld xmm5, 20
733
+ por xmm5, xmm8
734
+ movdqa xmm8, xmm6
735
+ psrld xmm8, 12
736
+ pslld xmm6, 20
737
+ por xmm6, xmm8
738
+ movdqa xmm8, xmm7
739
+ psrld xmm8, 12
740
+ pslld xmm7, 20
741
+ por xmm7, xmm8
742
+ paddd xmm0, xmmword ptr [rsp+70H]
743
+ paddd xmm1, xmmword ptr [rsp+90H]
744
+ paddd xmm2, xmmword ptr [rsp+30H]
745
+ paddd xmm3, xmmword ptr [rsp+0F0H]
746
+ paddd xmm0, xmm4
747
+ paddd xmm1, xmm5
748
+ paddd xmm2, xmm6
749
+ paddd xmm3, xmm7
750
+ pxor xmm12, xmm0
751
+ pxor xmm13, xmm1
752
+ pxor xmm14, xmm2
753
+ pxor xmm15, xmm3
754
+ movdqa xmm8, xmmword ptr [ROT8]
755
+ pshufb xmm12, xmm8
756
+ pshufb xmm13, xmm8
757
+ pshufb xmm14, xmm8
758
+ pshufb xmm15, xmm8
759
+ movdqa xmm8, xmmword ptr [rsp+100H]
760
+ paddd xmm8, xmm12
761
+ paddd xmm9, xmm13
762
+ paddd xmm10, xmm14
763
+ paddd xmm11, xmm15
764
+ pxor xmm4, xmm8
765
+ pxor xmm5, xmm9
766
+ pxor xmm6, xmm10
767
+ pxor xmm7, xmm11
768
+ movdqa xmmword ptr [rsp+100H], xmm8
769
+ movdqa xmm8, xmm4
770
+ psrld xmm8, 7
771
+ pslld xmm4, 25
772
+ por xmm4, xmm8
773
+ movdqa xmm8, xmm5
774
+ psrld xmm8, 7
775
+ pslld xmm5, 25
776
+ por xmm5, xmm8
777
+ movdqa xmm8, xmm6
778
+ psrld xmm8, 7
779
+ pslld xmm6, 25
780
+ por xmm6, xmm8
781
+ movdqa xmm8, xmm7
782
+ psrld xmm8, 7
783
+ pslld xmm7, 25
784
+ por xmm7, xmm8
785
+ paddd xmm0, xmmword ptr [rsp+40H]
786
+ paddd xmm1, xmmword ptr [rsp+0B0H]
787
+ paddd xmm2, xmmword ptr [rsp+50H]
788
+ paddd xmm3, xmmword ptr [rsp+10H]
789
+ paddd xmm0, xmm5
790
+ paddd xmm1, xmm6
791
+ paddd xmm2, xmm7
792
+ paddd xmm3, xmm4
793
+ pxor xmm15, xmm0
794
+ pxor xmm12, xmm1
795
+ pxor xmm13, xmm2
796
+ pxor xmm14, xmm3
797
+ movdqa xmm8, xmmword ptr [ROT16]
798
+ pshufb xmm15, xmm8
799
+ pshufb xmm12, xmm8
800
+ pshufb xmm13, xmm8
801
+ pshufb xmm14, xmm8
802
+ paddd xmm10, xmm15
803
+ paddd xmm11, xmm12
804
+ movdqa xmm8, xmmword ptr [rsp+100H]
805
+ paddd xmm8, xmm13
806
+ paddd xmm9, xmm14
807
+ pxor xmm5, xmm10
808
+ pxor xmm6, xmm11
809
+ pxor xmm7, xmm8
810
+ pxor xmm4, xmm9
811
+ movdqa xmmword ptr [rsp+100H], xmm8
812
+ movdqa xmm8, xmm5
813
+ psrld xmm8, 12
814
+ pslld xmm5, 20
815
+ por xmm5, xmm8
816
+ movdqa xmm8, xmm6
817
+ psrld xmm8, 12
818
+ pslld xmm6, 20
819
+ por xmm6, xmm8
820
+ movdqa xmm8, xmm7
821
+ psrld xmm8, 12
822
+ pslld xmm7, 20
823
+ por xmm7, xmm8
824
+ movdqa xmm8, xmm4
825
+ psrld xmm8, 12
826
+ pslld xmm4, 20
827
+ por xmm4, xmm8
828
+ paddd xmm0, xmmword ptr [rsp]
829
+ paddd xmm1, xmmword ptr [rsp+20H]
830
+ paddd xmm2, xmmword ptr [rsp+80H]
831
+ paddd xmm3, xmmword ptr [rsp+60H]
832
+ paddd xmm0, xmm5
833
+ paddd xmm1, xmm6
834
+ paddd xmm2, xmm7
835
+ paddd xmm3, xmm4
836
+ pxor xmm15, xmm0
837
+ pxor xmm12, xmm1
838
+ pxor xmm13, xmm2
839
+ pxor xmm14, xmm3
840
+ movdqa xmm8, xmmword ptr [ROT8]
841
+ pshufb xmm15, xmm8
842
+ pshufb xmm12, xmm8
843
+ pshufb xmm13, xmm8
844
+ pshufb xmm14, xmm8
845
+ paddd xmm10, xmm15
846
+ paddd xmm11, xmm12
847
+ movdqa xmm8, xmmword ptr [rsp+100H]
848
+ paddd xmm8, xmm13
849
+ paddd xmm9, xmm14
850
+ pxor xmm5, xmm10
851
+ pxor xmm6, xmm11
852
+ pxor xmm7, xmm8
853
+ pxor xmm4, xmm9
854
+ movdqa xmmword ptr [rsp+100H], xmm8
855
+ movdqa xmm8, xmm5
856
+ psrld xmm8, 7
857
+ pslld xmm5, 25
858
+ por xmm5, xmm8
859
+ movdqa xmm8, xmm6
860
+ psrld xmm8, 7
861
+ pslld xmm6, 25
862
+ por xmm6, xmm8
863
+ movdqa xmm8, xmm7
864
+ psrld xmm8, 7
865
+ pslld xmm7, 25
866
+ por xmm7, xmm8
867
+ movdqa xmm8, xmm4
868
+ psrld xmm8, 7
869
+ pslld xmm4, 25
870
+ por xmm4, xmm8
871
+ paddd xmm0, xmmword ptr [rsp+0C0H]
872
+ paddd xmm1, xmmword ptr [rsp+90H]
873
+ paddd xmm2, xmmword ptr [rsp+0F0H]
874
+ paddd xmm3, xmmword ptr [rsp+0E0H]
875
+ paddd xmm0, xmm4
876
+ paddd xmm1, xmm5
877
+ paddd xmm2, xmm6
878
+ paddd xmm3, xmm7
879
+ pxor xmm12, xmm0
880
+ pxor xmm13, xmm1
881
+ pxor xmm14, xmm2
882
+ pxor xmm15, xmm3
883
+ movdqa xmm8, xmmword ptr [ROT16]
884
+ pshufb xmm12, xmm8
885
+ pshufb xmm13, xmm8
886
+ pshufb xmm14, xmm8
887
+ pshufb xmm15, xmm8
888
+ movdqa xmm8, xmmword ptr [rsp+100H]
889
+ paddd xmm8, xmm12
890
+ paddd xmm9, xmm13
891
+ paddd xmm10, xmm14
892
+ paddd xmm11, xmm15
893
+ pxor xmm4, xmm8
894
+ pxor xmm5, xmm9
895
+ pxor xmm6, xmm10
896
+ pxor xmm7, xmm11
897
+ movdqa xmmword ptr [rsp+100H], xmm8
898
+ movdqa xmm8, xmm4
899
+ psrld xmm8, 12
900
+ pslld xmm4, 20
901
+ por xmm4, xmm8
902
+ movdqa xmm8, xmm5
903
+ psrld xmm8, 12
904
+ pslld xmm5, 20
905
+ por xmm5, xmm8
906
+ movdqa xmm8, xmm6
907
+ psrld xmm8, 12
908
+ pslld xmm6, 20
909
+ por xmm6, xmm8
910
+ movdqa xmm8, xmm7
911
+ psrld xmm8, 12
912
+ pslld xmm7, 20
913
+ por xmm7, xmm8
914
+ paddd xmm0, xmmword ptr [rsp+0D0H]
915
+ paddd xmm1, xmmword ptr [rsp+0B0H]
916
+ paddd xmm2, xmmword ptr [rsp+0A0H]
917
+ paddd xmm3, xmmword ptr [rsp+80H]
918
+ paddd xmm0, xmm4
919
+ paddd xmm1, xmm5
920
+ paddd xmm2, xmm6
921
+ paddd xmm3, xmm7
922
+ pxor xmm12, xmm0
923
+ pxor xmm13, xmm1
924
+ pxor xmm14, xmm2
925
+ pxor xmm15, xmm3
926
+ movdqa xmm8, xmmword ptr [ROT8]
927
+ pshufb xmm12, xmm8
928
+ pshufb xmm13, xmm8
929
+ pshufb xmm14, xmm8
930
+ pshufb xmm15, xmm8
931
+ movdqa xmm8, xmmword ptr [rsp+100H]
932
+ paddd xmm8, xmm12
933
+ paddd xmm9, xmm13
934
+ paddd xmm10, xmm14
935
+ paddd xmm11, xmm15
936
+ pxor xmm4, xmm8
937
+ pxor xmm5, xmm9
938
+ pxor xmm6, xmm10
939
+ pxor xmm7, xmm11
940
+ movdqa xmmword ptr [rsp+100H], xmm8
941
+ movdqa xmm8, xmm4
942
+ psrld xmm8, 7
943
+ pslld xmm4, 25
944
+ por xmm4, xmm8
945
+ movdqa xmm8, xmm5
946
+ psrld xmm8, 7
947
+ pslld xmm5, 25
948
+ por xmm5, xmm8
949
+ movdqa xmm8, xmm6
950
+ psrld xmm8, 7
951
+ pslld xmm6, 25
952
+ por xmm6, xmm8
953
+ movdqa xmm8, xmm7
954
+ psrld xmm8, 7
955
+ pslld xmm7, 25
956
+ por xmm7, xmm8
957
+ paddd xmm0, xmmword ptr [rsp+70H]
958
+ paddd xmm1, xmmword ptr [rsp+50H]
959
+ paddd xmm2, xmmword ptr [rsp]
960
+ paddd xmm3, xmmword ptr [rsp+60H]
961
+ paddd xmm0, xmm5
962
+ paddd xmm1, xmm6
963
+ paddd xmm2, xmm7
964
+ paddd xmm3, xmm4
965
+ pxor xmm15, xmm0
966
+ pxor xmm12, xmm1
967
+ pxor xmm13, xmm2
968
+ pxor xmm14, xmm3
969
+ movdqa xmm8, xmmword ptr [ROT16]
970
+ pshufb xmm15, xmm8
971
+ pshufb xmm12, xmm8
972
+ pshufb xmm13, xmm8
973
+ pshufb xmm14, xmm8
974
+ paddd xmm10, xmm15
975
+ paddd xmm11, xmm12
976
+ movdqa xmm8, xmmword ptr [rsp+100H]
977
+ paddd xmm8, xmm13
978
+ paddd xmm9, xmm14
979
+ pxor xmm5, xmm10
980
+ pxor xmm6, xmm11
981
+ pxor xmm7, xmm8
982
+ pxor xmm4, xmm9
983
+ movdqa xmmword ptr [rsp+100H], xmm8
984
+ movdqa xmm8, xmm5
985
+ psrld xmm8, 12
986
+ pslld xmm5, 20
987
+ por xmm5, xmm8
988
+ movdqa xmm8, xmm6
989
+ psrld xmm8, 12
990
+ pslld xmm6, 20
991
+ por xmm6, xmm8
992
+ movdqa xmm8, xmm7
993
+ psrld xmm8, 12
994
+ pslld xmm7, 20
995
+ por xmm7, xmm8
996
+ movdqa xmm8, xmm4
997
+ psrld xmm8, 12
998
+ pslld xmm4, 20
999
+ por xmm4, xmm8
1000
+ paddd xmm0, xmmword ptr [rsp+20H]
1001
+ paddd xmm1, xmmword ptr [rsp+30H]
1002
+ paddd xmm2, xmmword ptr [rsp+10H]
1003
+ paddd xmm3, xmmword ptr [rsp+40H]
1004
+ paddd xmm0, xmm5
1005
+ paddd xmm1, xmm6
1006
+ paddd xmm2, xmm7
1007
+ paddd xmm3, xmm4
1008
+ pxor xmm15, xmm0
1009
+ pxor xmm12, xmm1
1010
+ pxor xmm13, xmm2
1011
+ pxor xmm14, xmm3
1012
+ movdqa xmm8, xmmword ptr [ROT8]
1013
+ pshufb xmm15, xmm8
1014
+ pshufb xmm12, xmm8
1015
+ pshufb xmm13, xmm8
1016
+ pshufb xmm14, xmm8
1017
+ paddd xmm10, xmm15
1018
+ paddd xmm11, xmm12
1019
+ movdqa xmm8, xmmword ptr [rsp+100H]
1020
+ paddd xmm8, xmm13
1021
+ paddd xmm9, xmm14
1022
+ pxor xmm5, xmm10
1023
+ pxor xmm6, xmm11
1024
+ pxor xmm7, xmm8
1025
+ pxor xmm4, xmm9
1026
+ movdqa xmmword ptr [rsp+100H], xmm8
1027
+ movdqa xmm8, xmm5
1028
+ psrld xmm8, 7
1029
+ pslld xmm5, 25
1030
+ por xmm5, xmm8
1031
+ movdqa xmm8, xmm6
1032
+ psrld xmm8, 7
1033
+ pslld xmm6, 25
1034
+ por xmm6, xmm8
1035
+ movdqa xmm8, xmm7
1036
+ psrld xmm8, 7
1037
+ pslld xmm7, 25
1038
+ por xmm7, xmm8
1039
+ movdqa xmm8, xmm4
1040
+ psrld xmm8, 7
1041
+ pslld xmm4, 25
1042
+ por xmm4, xmm8
1043
+ paddd xmm0, xmmword ptr [rsp+90H]
1044
+ paddd xmm1, xmmword ptr [rsp+0B0H]
1045
+ paddd xmm2, xmmword ptr [rsp+80H]
1046
+ paddd xmm3, xmmword ptr [rsp+0F0H]
1047
+ paddd xmm0, xmm4
1048
+ paddd xmm1, xmm5
1049
+ paddd xmm2, xmm6
1050
+ paddd xmm3, xmm7
1051
+ pxor xmm12, xmm0
1052
+ pxor xmm13, xmm1
1053
+ pxor xmm14, xmm2
1054
+ pxor xmm15, xmm3
1055
+ movdqa xmm8, xmmword ptr [ROT16]
1056
+ pshufb xmm12, xmm8
1057
+ pshufb xmm13, xmm8
1058
+ pshufb xmm14, xmm8
1059
+ pshufb xmm15, xmm8
1060
+ movdqa xmm8, xmmword ptr [rsp+100H]
1061
+ paddd xmm8, xmm12
1062
+ paddd xmm9, xmm13
1063
+ paddd xmm10, xmm14
1064
+ paddd xmm11, xmm15
1065
+ pxor xmm4, xmm8
1066
+ pxor xmm5, xmm9
1067
+ pxor xmm6, xmm10
1068
+ pxor xmm7, xmm11
1069
+ movdqa xmmword ptr [rsp+100H], xmm8
1070
+ movdqa xmm8, xmm4
1071
+ psrld xmm8, 12
1072
+ pslld xmm4, 20
1073
+ por xmm4, xmm8
1074
+ movdqa xmm8, xmm5
1075
+ psrld xmm8, 12
1076
+ pslld xmm5, 20
1077
+ por xmm5, xmm8
1078
+ movdqa xmm8, xmm6
1079
+ psrld xmm8, 12
1080
+ pslld xmm6, 20
1081
+ por xmm6, xmm8
1082
+ movdqa xmm8, xmm7
1083
+ psrld xmm8, 12
1084
+ pslld xmm7, 20
1085
+ por xmm7, xmm8
1086
+ paddd xmm0, xmmword ptr [rsp+0E0H]
1087
+ paddd xmm1, xmmword ptr [rsp+50H]
1088
+ paddd xmm2, xmmword ptr [rsp+0C0H]
1089
+ paddd xmm3, xmmword ptr [rsp+10H]
1090
+ paddd xmm0, xmm4
1091
+ paddd xmm1, xmm5
1092
+ paddd xmm2, xmm6
1093
+ paddd xmm3, xmm7
1094
+ pxor xmm12, xmm0
1095
+ pxor xmm13, xmm1
1096
+ pxor xmm14, xmm2
1097
+ pxor xmm15, xmm3
1098
+ movdqa xmm8, xmmword ptr [ROT8]
1099
+ pshufb xmm12, xmm8
1100
+ pshufb xmm13, xmm8
1101
+ pshufb xmm14, xmm8
1102
+ pshufb xmm15, xmm8
1103
+ movdqa xmm8, xmmword ptr [rsp+100H]
1104
+ paddd xmm8, xmm12
1105
+ paddd xmm9, xmm13
1106
+ paddd xmm10, xmm14
1107
+ paddd xmm11, xmm15
1108
+ pxor xmm4, xmm8
1109
+ pxor xmm5, xmm9
1110
+ pxor xmm6, xmm10
1111
+ pxor xmm7, xmm11
1112
+ movdqa xmmword ptr [rsp+100H], xmm8
1113
+ movdqa xmm8, xmm4
1114
+ psrld xmm8, 7
1115
+ pslld xmm4, 25
1116
+ por xmm4, xmm8
1117
+ movdqa xmm8, xmm5
1118
+ psrld xmm8, 7
1119
+ pslld xmm5, 25
1120
+ por xmm5, xmm8
1121
+ movdqa xmm8, xmm6
1122
+ psrld xmm8, 7
1123
+ pslld xmm6, 25
1124
+ por xmm6, xmm8
1125
+ movdqa xmm8, xmm7
1126
+ psrld xmm8, 7
1127
+ pslld xmm7, 25
1128
+ por xmm7, xmm8
1129
+ paddd xmm0, xmmword ptr [rsp+0D0H]
1130
+ paddd xmm1, xmmword ptr [rsp]
1131
+ paddd xmm2, xmmword ptr [rsp+20H]
1132
+ paddd xmm3, xmmword ptr [rsp+40H]
1133
+ paddd xmm0, xmm5
1134
+ paddd xmm1, xmm6
1135
+ paddd xmm2, xmm7
1136
+ paddd xmm3, xmm4
1137
+ pxor xmm15, xmm0
1138
+ pxor xmm12, xmm1
1139
+ pxor xmm13, xmm2
1140
+ pxor xmm14, xmm3
1141
+ movdqa xmm8, xmmword ptr [ROT16]
1142
+ pshufb xmm15, xmm8
1143
+ pshufb xmm12, xmm8
1144
+ pshufb xmm13, xmm8
1145
+ pshufb xmm14, xmm8
1146
+ paddd xmm10, xmm15
1147
+ paddd xmm11, xmm12
1148
+ movdqa xmm8, xmmword ptr [rsp+100H]
1149
+ paddd xmm8, xmm13
1150
+ paddd xmm9, xmm14
1151
+ pxor xmm5, xmm10
1152
+ pxor xmm6, xmm11
1153
+ pxor xmm7, xmm8
1154
+ pxor xmm4, xmm9
1155
+ movdqa xmmword ptr [rsp+100H], xmm8
1156
+ movdqa xmm8, xmm5
1157
+ psrld xmm8, 12
1158
+ pslld xmm5, 20
1159
+ por xmm5, xmm8
1160
+ movdqa xmm8, xmm6
1161
+ psrld xmm8, 12
1162
+ pslld xmm6, 20
1163
+ por xmm6, xmm8
1164
+ movdqa xmm8, xmm7
1165
+ psrld xmm8, 12
1166
+ pslld xmm7, 20
1167
+ por xmm7, xmm8
1168
+ movdqa xmm8, xmm4
1169
+ psrld xmm8, 12
1170
+ pslld xmm4, 20
1171
+ por xmm4, xmm8
1172
+ paddd xmm0, xmmword ptr [rsp+30H]
1173
+ paddd xmm1, xmmword ptr [rsp+0A0H]
1174
+ paddd xmm2, xmmword ptr [rsp+60H]
1175
+ paddd xmm3, xmmword ptr [rsp+70H]
1176
+ paddd xmm0, xmm5
1177
+ paddd xmm1, xmm6
1178
+ paddd xmm2, xmm7
1179
+ paddd xmm3, xmm4
1180
+ pxor xmm15, xmm0
1181
+ pxor xmm12, xmm1
1182
+ pxor xmm13, xmm2
1183
+ pxor xmm14, xmm3
1184
+ movdqa xmm8, xmmword ptr [ROT8]
1185
+ pshufb xmm15, xmm8
1186
+ pshufb xmm12, xmm8
1187
+ pshufb xmm13, xmm8
1188
+ pshufb xmm14, xmm8
1189
+ paddd xmm10, xmm15
1190
+ paddd xmm11, xmm12
1191
+ movdqa xmm8, xmmword ptr [rsp+100H]
1192
+ paddd xmm8, xmm13
1193
+ paddd xmm9, xmm14
1194
+ pxor xmm5, xmm10
1195
+ pxor xmm6, xmm11
1196
+ pxor xmm7, xmm8
1197
+ pxor xmm4, xmm9
1198
+ movdqa xmmword ptr [rsp+100H], xmm8
1199
+ movdqa xmm8, xmm5
1200
+ psrld xmm8, 7
1201
+ pslld xmm5, 25
1202
+ por xmm5, xmm8
1203
+ movdqa xmm8, xmm6
1204
+ psrld xmm8, 7
1205
+ pslld xmm6, 25
1206
+ por xmm6, xmm8
1207
+ movdqa xmm8, xmm7
1208
+ psrld xmm8, 7
1209
+ pslld xmm7, 25
1210
+ por xmm7, xmm8
1211
+ movdqa xmm8, xmm4
1212
+ psrld xmm8, 7
1213
+ pslld xmm4, 25
1214
+ por xmm4, xmm8
1215
+ paddd xmm0, xmmword ptr [rsp+0B0H]
1216
+ paddd xmm1, xmmword ptr [rsp+50H]
1217
+ paddd xmm2, xmmword ptr [rsp+10H]
1218
+ paddd xmm3, xmmword ptr [rsp+80H]
1219
+ paddd xmm0, xmm4
1220
+ paddd xmm1, xmm5
1221
+ paddd xmm2, xmm6
1222
+ paddd xmm3, xmm7
1223
+ pxor xmm12, xmm0
1224
+ pxor xmm13, xmm1
1225
+ pxor xmm14, xmm2
1226
+ pxor xmm15, xmm3
1227
+ movdqa xmm8, xmmword ptr [ROT16]
1228
+ pshufb xmm12, xmm8
1229
+ pshufb xmm13, xmm8
1230
+ pshufb xmm14, xmm8
1231
+ pshufb xmm15, xmm8
1232
+ movdqa xmm8, xmmword ptr [rsp+100H]
1233
+ paddd xmm8, xmm12
1234
+ paddd xmm9, xmm13
1235
+ paddd xmm10, xmm14
1236
+ paddd xmm11, xmm15
1237
+ pxor xmm4, xmm8
1238
+ pxor xmm5, xmm9
1239
+ pxor xmm6, xmm10
1240
+ pxor xmm7, xmm11
1241
+ movdqa xmmword ptr [rsp+100H], xmm8
1242
+ movdqa xmm8, xmm4
1243
+ psrld xmm8, 12
1244
+ pslld xmm4, 20
1245
+ por xmm4, xmm8
1246
+ movdqa xmm8, xmm5
1247
+ psrld xmm8, 12
1248
+ pslld xmm5, 20
1249
+ por xmm5, xmm8
1250
+ movdqa xmm8, xmm6
1251
+ psrld xmm8, 12
1252
+ pslld xmm6, 20
1253
+ por xmm6, xmm8
1254
+ movdqa xmm8, xmm7
1255
+ psrld xmm8, 12
1256
+ pslld xmm7, 20
1257
+ por xmm7, xmm8
1258
+ paddd xmm0, xmmword ptr [rsp+0F0H]
1259
+ paddd xmm1, xmmword ptr [rsp]
1260
+ paddd xmm2, xmmword ptr [rsp+90H]
1261
+ paddd xmm3, xmmword ptr [rsp+60H]
1262
+ paddd xmm0, xmm4
1263
+ paddd xmm1, xmm5
1264
+ paddd xmm2, xmm6
1265
+ paddd xmm3, xmm7
1266
+ pxor xmm12, xmm0
1267
+ pxor xmm13, xmm1
1268
+ pxor xmm14, xmm2
1269
+ pxor xmm15, xmm3
1270
+ movdqa xmm8, xmmword ptr [ROT8]
1271
+ pshufb xmm12, xmm8
1272
+ pshufb xmm13, xmm8
1273
+ pshufb xmm14, xmm8
1274
+ pshufb xmm15, xmm8
1275
+ movdqa xmm8, xmmword ptr [rsp+100H]
1276
+ paddd xmm8, xmm12
1277
+ paddd xmm9, xmm13
1278
+ paddd xmm10, xmm14
1279
+ paddd xmm11, xmm15
1280
+ pxor xmm4, xmm8
1281
+ pxor xmm5, xmm9
1282
+ pxor xmm6, xmm10
1283
+ pxor xmm7, xmm11
1284
+ movdqa xmmword ptr [rsp+100H], xmm8
1285
+ movdqa xmm8, xmm4
1286
+ psrld xmm8, 7
1287
+ pslld xmm4, 25
1288
+ por xmm4, xmm8
1289
+ movdqa xmm8, xmm5
1290
+ psrld xmm8, 7
1291
+ pslld xmm5, 25
1292
+ por xmm5, xmm8
1293
+ movdqa xmm8, xmm6
1294
+ psrld xmm8, 7
1295
+ pslld xmm6, 25
1296
+ por xmm6, xmm8
1297
+ movdqa xmm8, xmm7
1298
+ psrld xmm8, 7
1299
+ pslld xmm7, 25
1300
+ por xmm7, xmm8
1301
+ paddd xmm0, xmmword ptr [rsp+0E0H]
1302
+ paddd xmm1, xmmword ptr [rsp+20H]
1303
+ paddd xmm2, xmmword ptr [rsp+30H]
1304
+ paddd xmm3, xmmword ptr [rsp+70H]
1305
+ paddd xmm0, xmm5
1306
+ paddd xmm1, xmm6
1307
+ paddd xmm2, xmm7
1308
+ paddd xmm3, xmm4
1309
+ pxor xmm15, xmm0
1310
+ pxor xmm12, xmm1
1311
+ pxor xmm13, xmm2
1312
+ pxor xmm14, xmm3
1313
+ movdqa xmm8, xmmword ptr [ROT16]
1314
+ pshufb xmm15, xmm8
1315
+ pshufb xmm12, xmm8
1316
+ pshufb xmm13, xmm8
1317
+ pshufb xmm14, xmm8
1318
+ paddd xmm10, xmm15
1319
+ paddd xmm11, xmm12
1320
+ movdqa xmm8, xmmword ptr [rsp+100H]
1321
+ paddd xmm8, xmm13
1322
+ paddd xmm9, xmm14
1323
+ pxor xmm5, xmm10
1324
+ pxor xmm6, xmm11
1325
+ pxor xmm7, xmm8
1326
+ pxor xmm4, xmm9
1327
+ movdqa xmmword ptr [rsp+100H], xmm8
1328
+ movdqa xmm8, xmm5
1329
+ psrld xmm8, 12
1330
+ pslld xmm5, 20
1331
+ por xmm5, xmm8
1332
+ movdqa xmm8, xmm6
1333
+ psrld xmm8, 12
1334
+ pslld xmm6, 20
1335
+ por xmm6, xmm8
1336
+ movdqa xmm8, xmm7
1337
+ psrld xmm8, 12
1338
+ pslld xmm7, 20
1339
+ por xmm7, xmm8
1340
+ movdqa xmm8, xmm4
1341
+ psrld xmm8, 12
1342
+ pslld xmm4, 20
1343
+ por xmm4, xmm8
1344
+ paddd xmm0, xmmword ptr [rsp+0A0H]
1345
+ paddd xmm1, xmmword ptr [rsp+0C0H]
1346
+ paddd xmm2, xmmword ptr [rsp+40H]
1347
+ paddd xmm3, xmmword ptr [rsp+0D0H]
1348
+ paddd xmm0, xmm5
1349
+ paddd xmm1, xmm6
1350
+ paddd xmm2, xmm7
1351
+ paddd xmm3, xmm4
1352
+ pxor xmm15, xmm0
1353
+ pxor xmm12, xmm1
1354
+ pxor xmm13, xmm2
1355
+ pxor xmm14, xmm3
1356
+ movdqa xmm8, xmmword ptr [ROT8]
1357
+ pshufb xmm15, xmm8
1358
+ pshufb xmm12, xmm8
1359
+ pshufb xmm13, xmm8
1360
+ pshufb xmm14, xmm8
1361
+ paddd xmm10, xmm15
1362
+ paddd xmm11, xmm12
1363
+ movdqa xmm8, xmmword ptr [rsp+100H]
1364
+ paddd xmm8, xmm13
1365
+ paddd xmm9, xmm14
1366
+ pxor xmm5, xmm10
1367
+ pxor xmm6, xmm11
1368
+ pxor xmm7, xmm8
1369
+ pxor xmm4, xmm9
1370
+ pxor xmm0, xmm8
1371
+ pxor xmm1, xmm9
1372
+ pxor xmm2, xmm10
1373
+ pxor xmm3, xmm11
1374
+ movdqa xmm8, xmm5
1375
+ psrld xmm8, 7
1376
+ pslld xmm5, 25
1377
+ por xmm5, xmm8
1378
+ movdqa xmm8, xmm6
1379
+ psrld xmm8, 7
1380
+ pslld xmm6, 25
1381
+ por xmm6, xmm8
1382
+ movdqa xmm8, xmm7
1383
+ psrld xmm8, 7
1384
+ pslld xmm7, 25
1385
+ por xmm7, xmm8
1386
+ movdqa xmm8, xmm4
1387
+ psrld xmm8, 7
1388
+ pslld xmm4, 25
1389
+ por xmm4, xmm8
1390
+ pxor xmm4, xmm12
1391
+ pxor xmm5, xmm13
1392
+ pxor xmm6, xmm14
1393
+ pxor xmm7, xmm15
1394
+ mov eax, r13d
1395
+ jne innerloop4
1396
+ movdqa xmm9, xmm0
1397
+ punpckldq xmm0, xmm1
1398
+ punpckhdq xmm9, xmm1
1399
+ movdqa xmm11, xmm2
1400
+ punpckldq xmm2, xmm3
1401
+ punpckhdq xmm11, xmm3
1402
+ movdqa xmm1, xmm0
1403
+ punpcklqdq xmm0, xmm2
1404
+ punpckhqdq xmm1, xmm2
1405
+ movdqa xmm3, xmm9
1406
+ punpcklqdq xmm9, xmm11
1407
+ punpckhqdq xmm3, xmm11
1408
+ movdqu xmmword ptr [rbx], xmm0
1409
+ movdqu xmmword ptr [rbx+20H], xmm1
1410
+ movdqu xmmword ptr [rbx+40H], xmm9
1411
+ movdqu xmmword ptr [rbx+60H], xmm3
1412
+ movdqa xmm9, xmm4
1413
+ punpckldq xmm4, xmm5
1414
+ punpckhdq xmm9, xmm5
1415
+ movdqa xmm11, xmm6
1416
+ punpckldq xmm6, xmm7
1417
+ punpckhdq xmm11, xmm7
1418
+ movdqa xmm5, xmm4
1419
+ punpcklqdq xmm4, xmm6
1420
+ punpckhqdq xmm5, xmm6
1421
+ movdqa xmm7, xmm9
1422
+ punpcklqdq xmm9, xmm11
1423
+ punpckhqdq xmm7, xmm11
1424
+ movdqu xmmword ptr [rbx+10H], xmm4
1425
+ movdqu xmmword ptr [rbx+30H], xmm5
1426
+ movdqu xmmword ptr [rbx+50H], xmm9
1427
+ movdqu xmmword ptr [rbx+70H], xmm7
1428
+ movdqa xmm1, xmmword ptr [rsp+110H]
1429
+ movdqa xmm0, xmm1
1430
+ paddd xmm1, xmmword ptr [rsp+150H]
1431
+ movdqa xmmword ptr [rsp+110H], xmm1
1432
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK]
1433
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK]
1434
+ pcmpgtd xmm0, xmm1
1435
+ movdqa xmm1, xmmword ptr [rsp+120H]
1436
+ psubd xmm1, xmm0
1437
+ movdqa xmmword ptr [rsp+120H], xmm1
1438
+ add rbx, 128
1439
+ add rdi, 32
1440
+ sub rsi, 4
1441
+ cmp rsi, 4
1442
+ jnc outerloop4
1443
+ test rsi, rsi
1444
+ jne final3blocks
1445
+ unwind:
1446
+ movdqa xmm6, xmmword ptr [rsp+170H]
1447
+ movdqa xmm7, xmmword ptr [rsp+180H]
1448
+ movdqa xmm8, xmmword ptr [rsp+190H]
1449
+ movdqa xmm9, xmmword ptr [rsp+1A0H]
1450
+ movdqa xmm10, xmmword ptr [rsp+1B0H]
1451
+ movdqa xmm11, xmmword ptr [rsp+1C0H]
1452
+ movdqa xmm12, xmmword ptr [rsp+1D0H]
1453
+ movdqa xmm13, xmmword ptr [rsp+1E0H]
1454
+ movdqa xmm14, xmmword ptr [rsp+1F0H]
1455
+ movdqa xmm15, xmmword ptr [rsp+200H]
1456
+ mov rsp, rbp
1457
+ pop rbp
1458
+ pop rbx
1459
+ pop rdi
1460
+ pop rsi
1461
+ pop r12
1462
+ pop r13
1463
+ pop r14
1464
+ pop r15
1465
+ ret
1466
+ ALIGN 16
1467
+ final3blocks:
1468
+ test esi, 2H
1469
+ je final1block
1470
+ movups xmm0, xmmword ptr [rcx]
1471
+ movups xmm1, xmmword ptr [rcx+10H]
1472
+ movaps xmm8, xmm0
1473
+ movaps xmm9, xmm1
1474
+ movd xmm13, dword ptr [rsp+110H]
1475
+ pinsrd xmm13, dword ptr [rsp+120H], 1
1476
+ pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2
1477
+ movaps xmmword ptr [rsp], xmm13
1478
+ movd xmm14, dword ptr [rsp+114H]
1479
+ pinsrd xmm14, dword ptr [rsp+124H], 1
1480
+ pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2
1481
+ movaps xmmword ptr [rsp+10H], xmm14
1482
+ mov r8, qword ptr [rdi]
1483
+ mov r9, qword ptr [rdi+8H]
1484
+ movzx eax, byte ptr [rbp+80H]
1485
+ or eax, r13d
1486
+ xor edx, edx
1487
+ innerloop2:
1488
+ mov r14d, eax
1489
+ or eax, r12d
1490
+ add rdx, 64
1491
+ cmp rdx, r15
1492
+ cmovne eax, r14d
1493
+ movaps xmm2, xmmword ptr [BLAKE3_IV]
1494
+ movaps xmm10, xmm2
1495
+ movups xmm4, xmmword ptr [r8+rdx-40H]
1496
+ movups xmm5, xmmword ptr [r8+rdx-30H]
1497
+ movaps xmm3, xmm4
1498
+ shufps xmm4, xmm5, 136
1499
+ shufps xmm3, xmm5, 221
1500
+ movaps xmm5, xmm3
1501
+ movups xmm6, xmmword ptr [r8+rdx-20H]
1502
+ movups xmm7, xmmword ptr [r8+rdx-10H]
1503
+ movaps xmm3, xmm6
1504
+ shufps xmm6, xmm7, 136
1505
+ pshufd xmm6, xmm6, 93H
1506
+ shufps xmm3, xmm7, 221
1507
+ pshufd xmm7, xmm3, 93H
1508
+ movups xmm12, xmmword ptr [r9+rdx-40H]
1509
+ movups xmm13, xmmword ptr [r9+rdx-30H]
1510
+ movaps xmm11, xmm12
1511
+ shufps xmm12, xmm13, 136
1512
+ shufps xmm11, xmm13, 221
1513
+ movaps xmm13, xmm11
1514
+ movups xmm14, xmmword ptr [r9+rdx-20H]
1515
+ movups xmm15, xmmword ptr [r9+rdx-10H]
1516
+ movaps xmm11, xmm14
1517
+ shufps xmm14, xmm15, 136
1518
+ pshufd xmm14, xmm14, 93H
1519
+ shufps xmm11, xmm15, 221
1520
+ pshufd xmm15, xmm11, 93H
1521
+ movaps xmm3, xmmword ptr [rsp]
1522
+ movaps xmm11, xmmword ptr [rsp+10H]
1523
+ pinsrd xmm3, eax, 3
1524
+ pinsrd xmm11, eax, 3
1525
+ mov al, 7
1526
+ roundloop2:
1527
+ paddd xmm0, xmm4
1528
+ paddd xmm8, xmm12
1529
+ movaps xmmword ptr [rsp+20H], xmm4
1530
+ movaps xmmword ptr [rsp+30H], xmm12
1531
+ paddd xmm0, xmm1
1532
+ paddd xmm8, xmm9
1533
+ pxor xmm3, xmm0
1534
+ pxor xmm11, xmm8
1535
+ movaps xmm12, xmmword ptr [ROT16]
1536
+ pshufb xmm3, xmm12
1537
+ pshufb xmm11, xmm12
1538
+ paddd xmm2, xmm3
1539
+ paddd xmm10, xmm11
1540
+ pxor xmm1, xmm2
1541
+ pxor xmm9, xmm10
1542
+ movdqa xmm4, xmm1
1543
+ pslld xmm1, 20
1544
+ psrld xmm4, 12
1545
+ por xmm1, xmm4
1546
+ movdqa xmm4, xmm9
1547
+ pslld xmm9, 20
1548
+ psrld xmm4, 12
1549
+ por xmm9, xmm4
1550
+ paddd xmm0, xmm5
1551
+ paddd xmm8, xmm13
1552
+ movaps xmmword ptr [rsp+40H], xmm5
1553
+ movaps xmmword ptr [rsp+50H], xmm13
1554
+ paddd xmm0, xmm1
1555
+ paddd xmm8, xmm9
1556
+ pxor xmm3, xmm0
1557
+ pxor xmm11, xmm8
1558
+ movaps xmm13, xmmword ptr [ROT8]
1559
+ pshufb xmm3, xmm13
1560
+ pshufb xmm11, xmm13
1561
+ paddd xmm2, xmm3
1562
+ paddd xmm10, xmm11
1563
+ pxor xmm1, xmm2
1564
+ pxor xmm9, xmm10
1565
+ movdqa xmm4, xmm1
1566
+ pslld xmm1, 25
1567
+ psrld xmm4, 7
1568
+ por xmm1, xmm4
1569
+ movdqa xmm4, xmm9
1570
+ pslld xmm9, 25
1571
+ psrld xmm4, 7
1572
+ por xmm9, xmm4
1573
+ pshufd xmm0, xmm0, 93H
1574
+ pshufd xmm8, xmm8, 93H
1575
+ pshufd xmm3, xmm3, 4EH
1576
+ pshufd xmm11, xmm11, 4EH
1577
+ pshufd xmm2, xmm2, 39H
1578
+ pshufd xmm10, xmm10, 39H
1579
+ paddd xmm0, xmm6
1580
+ paddd xmm8, xmm14
1581
+ paddd xmm0, xmm1
1582
+ paddd xmm8, xmm9
1583
+ pxor xmm3, xmm0
1584
+ pxor xmm11, xmm8
1585
+ pshufb xmm3, xmm12
1586
+ pshufb xmm11, xmm12
1587
+ paddd xmm2, xmm3
1588
+ paddd xmm10, xmm11
1589
+ pxor xmm1, xmm2
1590
+ pxor xmm9, xmm10
1591
+ movdqa xmm4, xmm1
1592
+ pslld xmm1, 20
1593
+ psrld xmm4, 12
1594
+ por xmm1, xmm4
1595
+ movdqa xmm4, xmm9
1596
+ pslld xmm9, 20
1597
+ psrld xmm4, 12
1598
+ por xmm9, xmm4
1599
+ paddd xmm0, xmm7
1600
+ paddd xmm8, xmm15
1601
+ paddd xmm0, xmm1
1602
+ paddd xmm8, xmm9
1603
+ pxor xmm3, xmm0
1604
+ pxor xmm11, xmm8
1605
+ pshufb xmm3, xmm13
1606
+ pshufb xmm11, xmm13
1607
+ paddd xmm2, xmm3
1608
+ paddd xmm10, xmm11
1609
+ pxor xmm1, xmm2
1610
+ pxor xmm9, xmm10
1611
+ movdqa xmm4, xmm1
1612
+ pslld xmm1, 25
1613
+ psrld xmm4, 7
1614
+ por xmm1, xmm4
1615
+ movdqa xmm4, xmm9
1616
+ pslld xmm9, 25
1617
+ psrld xmm4, 7
1618
+ por xmm9, xmm4
1619
+ pshufd xmm0, xmm0, 39H
1620
+ pshufd xmm8, xmm8, 39H
1621
+ pshufd xmm3, xmm3, 4EH
1622
+ pshufd xmm11, xmm11, 4EH
1623
+ pshufd xmm2, xmm2, 93H
1624
+ pshufd xmm10, xmm10, 93H
1625
+ dec al
1626
+ je endroundloop2
1627
+ movdqa xmm12, xmmword ptr [rsp+20H]
1628
+ movdqa xmm5, xmmword ptr [rsp+40H]
1629
+ pshufd xmm13, xmm12, 0FH
1630
+ shufps xmm12, xmm5, 214
1631
+ pshufd xmm4, xmm12, 39H
1632
+ movdqa xmm12, xmm6
1633
+ shufps xmm12, xmm7, 250
1634
+ pblendw xmm13, xmm12, 0CCH
1635
+ movdqa xmm12, xmm7
1636
+ punpcklqdq xmm12, xmm5
1637
+ pblendw xmm12, xmm6, 0C0H
1638
+ pshufd xmm12, xmm12, 78H
1639
+ punpckhdq xmm5, xmm7
1640
+ punpckldq xmm6, xmm5
1641
+ pshufd xmm7, xmm6, 1EH
1642
+ movdqa xmmword ptr [rsp+20H], xmm13
1643
+ movdqa xmmword ptr [rsp+40H], xmm12
1644
+ movdqa xmm5, xmmword ptr [rsp+30H]
1645
+ movdqa xmm13, xmmword ptr [rsp+50H]
1646
+ pshufd xmm6, xmm5, 0FH
1647
+ shufps xmm5, xmm13, 214
1648
+ pshufd xmm12, xmm5, 39H
1649
+ movdqa xmm5, xmm14
1650
+ shufps xmm5, xmm15, 250
1651
+ pblendw xmm6, xmm5, 0CCH
1652
+ movdqa xmm5, xmm15
1653
+ punpcklqdq xmm5, xmm13
1654
+ pblendw xmm5, xmm14, 0C0H
1655
+ pshufd xmm5, xmm5, 78H
1656
+ punpckhdq xmm13, xmm15
1657
+ punpckldq xmm14, xmm13
1658
+ pshufd xmm15, xmm14, 1EH
1659
+ movdqa xmm13, xmm6
1660
+ movdqa xmm14, xmm5
1661
+ movdqa xmm5, xmmword ptr [rsp+20H]
1662
+ movdqa xmm6, xmmword ptr [rsp+40H]
1663
+ jmp roundloop2
1664
+ endroundloop2:
1665
+ pxor xmm0, xmm2
1666
+ pxor xmm1, xmm3
1667
+ pxor xmm8, xmm10
1668
+ pxor xmm9, xmm11
1669
+ mov eax, r13d
1670
+ cmp rdx, r15
1671
+ jne innerloop2
1672
+ movups xmmword ptr [rbx], xmm0
1673
+ movups xmmword ptr [rbx+10H], xmm1
1674
+ movups xmmword ptr [rbx+20H], xmm8
1675
+ movups xmmword ptr [rbx+30H], xmm9
1676
+ movdqa xmm0, xmmword ptr [rsp+130H]
1677
+ movdqa xmm1, xmmword ptr [rsp+110H]
1678
+ movdqa xmm2, xmmword ptr [rsp+120H]
1679
+ movdqu xmm3, xmmword ptr [rsp+118H]
1680
+ movdqu xmm4, xmmword ptr [rsp+128H]
1681
+ blendvps xmm1, xmm3, xmm0
1682
+ blendvps xmm2, xmm4, xmm0
1683
+ movdqa xmmword ptr [rsp+110H], xmm1
1684
+ movdqa xmmword ptr [rsp+120H], xmm2
1685
+ add rdi, 16
1686
+ add rbx, 64
1687
+ sub rsi, 2
1688
+ final1block:
1689
+ test esi, 1H
1690
+ je unwind
1691
+ movups xmm0, xmmword ptr [rcx]
1692
+ movups xmm1, xmmword ptr [rcx+10H]
1693
+ movd xmm13, dword ptr [rsp+110H]
1694
+ pinsrd xmm13, dword ptr [rsp+120H], 1
1695
+ pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2
1696
+ movaps xmm14, xmmword ptr [ROT8]
1697
+ movaps xmm15, xmmword ptr [ROT16]
1698
+ mov r8, qword ptr [rdi]
1699
+ movzx eax, byte ptr [rbp+80H]
1700
+ or eax, r13d
1701
+ xor edx, edx
1702
+ innerloop1:
1703
+ mov r14d, eax
1704
+ or eax, r12d
1705
+ add rdx, 64
1706
+ cmp rdx, r15
1707
+ cmovne eax, r14d
1708
+ movaps xmm2, xmmword ptr [BLAKE3_IV]
1709
+ movaps xmm3, xmm13
1710
+ pinsrd xmm3, eax, 3
1711
+ movups xmm4, xmmword ptr [r8+rdx-40H]
1712
+ movups xmm5, xmmword ptr [r8+rdx-30H]
1713
+ movaps xmm8, xmm4
1714
+ shufps xmm4, xmm5, 136
1715
+ shufps xmm8, xmm5, 221
1716
+ movaps xmm5, xmm8
1717
+ movups xmm6, xmmword ptr [r8+rdx-20H]
1718
+ movups xmm7, xmmword ptr [r8+rdx-10H]
1719
+ movaps xmm8, xmm6
1720
+ shufps xmm6, xmm7, 136
1721
+ pshufd xmm6, xmm6, 93H
1722
+ shufps xmm8, xmm7, 221
1723
+ pshufd xmm7, xmm8, 93H
1724
+ mov al, 7
1725
+ roundloop1:
1726
+ paddd xmm0, xmm4
1727
+ paddd xmm0, xmm1
1728
+ pxor xmm3, xmm0
1729
+ pshufb xmm3, xmm15
1730
+ paddd xmm2, xmm3
1731
+ pxor xmm1, xmm2
1732
+ movdqa xmm11, xmm1
1733
+ pslld xmm1, 20
1734
+ psrld xmm11, 12
1735
+ por xmm1, xmm11
1736
+ paddd xmm0, xmm5
1737
+ paddd xmm0, xmm1
1738
+ pxor xmm3, xmm0
1739
+ pshufb xmm3, xmm14
1740
+ paddd xmm2, xmm3
1741
+ pxor xmm1, xmm2
1742
+ movdqa xmm11, xmm1
1743
+ pslld xmm1, 25
1744
+ psrld xmm11, 7
1745
+ por xmm1, xmm11
1746
+ pshufd xmm0, xmm0, 93H
1747
+ pshufd xmm3, xmm3, 4EH
1748
+ pshufd xmm2, xmm2, 39H
1749
+ paddd xmm0, xmm6
1750
+ paddd xmm0, xmm1
1751
+ pxor xmm3, xmm0
1752
+ pshufb xmm3, xmm15
1753
+ paddd xmm2, xmm3
1754
+ pxor xmm1, xmm2
1755
+ movdqa xmm11, xmm1
1756
+ pslld xmm1, 20
1757
+ psrld xmm11, 12
1758
+ por xmm1, xmm11
1759
+ paddd xmm0, xmm7
1760
+ paddd xmm0, xmm1
1761
+ pxor xmm3, xmm0
1762
+ pshufb xmm3, xmm14
1763
+ paddd xmm2, xmm3
1764
+ pxor xmm1, xmm2
1765
+ movdqa xmm11, xmm1
1766
+ pslld xmm1, 25
1767
+ psrld xmm11, 7
1768
+ por xmm1, xmm11
1769
+ pshufd xmm0, xmm0, 39H
1770
+ pshufd xmm3, xmm3, 4EH
1771
+ pshufd xmm2, xmm2, 93H
1772
+ dec al
1773
+ jz endroundloop1
1774
+ movdqa xmm8, xmm4
1775
+ shufps xmm8, xmm5, 214
1776
+ pshufd xmm9, xmm4, 0FH
1777
+ pshufd xmm4, xmm8, 39H
1778
+ movdqa xmm8, xmm6
1779
+ shufps xmm8, xmm7, 250
1780
+ pblendw xmm9, xmm8, 0CCH
1781
+ movdqa xmm8, xmm7
1782
+ punpcklqdq xmm8, xmm5
1783
+ pblendw xmm8, xmm6, 0C0H
1784
+ pshufd xmm8, xmm8, 78H
1785
+ punpckhdq xmm5, xmm7
1786
+ punpckldq xmm6, xmm5
1787
+ pshufd xmm7, xmm6, 1EH
1788
+ movdqa xmm5, xmm9
1789
+ movdqa xmm6, xmm8
1790
+ jmp roundloop1
1791
+ endroundloop1:
1792
+ pxor xmm0, xmm2
1793
+ pxor xmm1, xmm3
1794
+ mov eax, r13d
1795
+ cmp rdx, r15
1796
+ jne innerloop1
1797
+ movups xmmword ptr [rbx], xmm0
1798
+ movups xmmword ptr [rbx+10H], xmm1
1799
+ jmp unwind
1800
+ _blake3_hash_many_sse41 ENDP
1801
+ blake3_hash_many_sse41 ENDP
1802
+
1803
;-----------------------------------------------------------------------
; blake3_compress_in_place_sse41  (alias: _blake3_compress_in_place_sse41)
;
; Runs the 7-round mixing loop over one 64-byte block and writes the
; 32-byte result back over the state that rcx points to.
;
; ABI:   Microsoft x64 (args in rcx, rdx, r8, r9, then stack).
; In:    rcx   = pointer to 32 bytes of state; read, then overwritten
;        rdx   = pointer to the 64-byte block (read with unaligned loads)
;        r8b   = byte placed in dword lane 2 of the fourth state row
;                (presumably the block length - confirm against the C prototype)
;        r9    = 64-bit value placed in lanes 0..1 of the fourth state row
;                (presumably the counter - confirm against the C prototype)
;        5th argument (stack) = byte placed in lane 3 of the fourth state row
;                (presumably the flags - confirm against the C prototype)
; Out:   [rcx]     = row0 xor row2
;        [rcx+10H] = row1 xor row3
; Clobb: rax, r8, xmm0-xmm5, flags.
;        xmm6-xmm9, xmm11, xmm14 and xmm15 are saved and restored.
;
; Stack frame after the prologue (120 bytes):
;        [rsp+00H..6FH]  seven 16-byte register save slots
;        [rsp+70H..77H]  padding
;        [rsp+78H]       return address
;        [rsp+80H..9FH]  caller-provided shadow space
;        [rsp+0A0H]      5th argument
;-----------------------------------------------------------------------
blake3_compress_in_place_sse41 PROC
_blake3_compress_in_place_sse41 PROC
        ; xmm11 (rotate scratch), xmm14 (ROT8 mask) and xmm15 (ROT16 mask)
        ; are overwritten below. They are nonvolatile on Win64, so they are
        ; spilled together with xmm6-xmm9. 120 = 7*16 + 8 keeps rsp 16-byte
        ; aligned for movdqa, given the ABI entry alignment (rsp mod 16 = 8).
        sub     rsp, 120
        movdqa  xmmword ptr [rsp], xmm6
        movdqa  xmmword ptr [rsp+10H], xmm7
        movdqa  xmmword ptr [rsp+20H], xmm8
        movdqa  xmmword ptr [rsp+30H], xmm9
        movdqa  xmmword ptr [rsp+40H], xmm11
        movdqa  xmmword ptr [rsp+50H], xmm14
        movdqa  xmmword ptr [rsp+60H], xmm15

        ; State rows: xmm0/xmm1 from [rcx], xmm2 from BLAKE3_IV,
        ; xmm3 = { r9 low, r9 high, r8b, 5th-arg byte }.
        movups  xmm0, xmmword ptr [rcx]
        movups  xmm1, xmmword ptr [rcx+10H]
        movaps  xmm2, xmmword ptr [BLAKE3_IV]
        movzx   eax, byte ptr [rsp+0A0H]        ; 5th argument: 120 + 8 + 32
        movzx   r8d, r8b
        shl     rax, 32
        add     r8, rax                         ; r8 = (5th-arg byte << 32) | r8b
        movq    xmm3, r9
        movq    xmm4, r8
        punpcklqdq xmm3, xmm4

        ; Load the 64-byte block and split it into even/odd dwords:
        ; xmm4 = dwords 0,2,4,6   xmm5 = dwords 1,3,5,7
        ; xmm6 / xmm7 = even / odd dwords of the upper half, lanes rotated (93H).
        movups  xmm4, xmmword ptr [rdx]
        movups  xmm5, xmmword ptr [rdx+10H]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rdx+20H]
        movups  xmm7, xmmword ptr [rdx+30H]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 93H
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 93H

        movaps  xmm14, xmmword ptr [ROT8]       ; pshufb mask: rotate each dword right by 8
        movaps  xmm15, xmmword ptr [ROT16]      ; pshufb mask: rotate each dword right by 16
        mov     al, 7                           ; round counter
@@:
        ; --- first half of the round: rows as loaded -------------------
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15                     ; row3 >>> 16
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11                     ; row1 >>> 12
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14                     ; row3 >>> 8
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11                     ; row1 >>> 7

        ; Rotate the lanes of rows 0, 3 and 2 before the second half.
        pshufd  xmm0, xmm0, 93H
        pshufd  xmm3, xmm3, 4EH
        pshufd  xmm2, xmm2, 39H

        ; --- second half of the round: lane-rotated rows ---------------
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15                     ; row3 >>> 16
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11                     ; row1 >>> 12
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14                     ; row3 >>> 8
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11                     ; row1 >>> 7

        ; Undo the lane rotation (39H is the inverse of 93H; 4EH is its own inverse).
        pshufd  xmm0, xmm0, 39H
        pshufd  xmm3, xmm3, 4EH
        pshufd  xmm2, xmm2, 93H

        dec     al
        jz      @F                              ; all 7 rounds done

        ; Rearrange the message registers xmm4-xmm7 for the next round.
        ; xmm8 and xmm9 are scratch.
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0FH
        pshufd  xmm4, xmm8, 39H
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0CCH
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0C0H
        pshufd  xmm8, xmm8, 78H
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 1EH
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     @B
@@:
        ; Fold the lower rows into the upper rows and store over the input state.
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        movups  xmmword ptr [rcx], xmm0
        movups  xmmword ptr [rcx+10H], xmm1

        ; Restore every nonvolatile XMM register touched above.
        movdqa  xmm6, xmmword ptr [rsp]
        movdqa  xmm7, xmmword ptr [rsp+10H]
        movdqa  xmm8, xmmword ptr [rsp+20H]
        movdqa  xmm9, xmmword ptr [rsp+30H]
        movdqa  xmm11, xmmword ptr [rsp+40H]
        movdqa  xmm14, xmmword ptr [rsp+50H]
        movdqa  xmm15, xmmword ptr [rsp+60H]
        add     rsp, 120
        ret
_blake3_compress_in_place_sse41 ENDP
blake3_compress_in_place_sse41 ENDP
1915
+
1916
+ ALIGN 16
1917
;-----------------------------------------------------------------------
; blake3_compress_xof_sse41  (alias: _blake3_compress_xof_sse41)
;
; Runs the 7-round mixing loop over one 64-byte block and writes a
; 64-byte result to a separate output buffer. The state at [rcx] is
; only read.
;
; ABI:   Microsoft x64 (args in rcx, rdx, r8, r9, then stack).
; In:    rcx   = pointer to 32 bytes of input state (read twice, never written)
;        rdx   = pointer to the 64-byte block (read with unaligned loads)
;        r8b   = byte placed in dword lane 2 of the fourth state row
;                (presumably the block length - confirm against the C prototype)
;        r9    = 64-bit value placed in lanes 0..1 of the fourth state row
;                (presumably the counter - confirm against the C prototype)
;        5th argument (stack) = byte placed in lane 3 of the fourth state row
;                (presumably the flags - confirm against the C prototype)
;        6th argument (stack) = pointer to the 64-byte output buffer
; Out:   [out]     = row0 xor row2
;        [out+10H] = row1 xor row3
;        [out+20H] = row2 xor input state bytes 0..15
;        [out+30H] = row3 xor input state bytes 16..31
; Clobb: rax, r8, r10, xmm0-xmm5, flags.
;        xmm6-xmm9, xmm11, xmm14 and xmm15 are saved and restored.
;
; Stack frame after the prologue (120 bytes):
;        [rsp+00H..6FH]  seven 16-byte register save slots
;        [rsp+70H..77H]  padding
;        [rsp+78H]       return address
;        [rsp+80H..9FH]  caller-provided shadow space
;        [rsp+0A0H]      5th argument
;        [rsp+0A8H]      6th argument
;-----------------------------------------------------------------------
blake3_compress_xof_sse41 PROC
_blake3_compress_xof_sse41 PROC
        ; xmm11 (rotate scratch), xmm14 (ROT8 mask) and xmm15 (ROT16 mask)
        ; are overwritten below. They are nonvolatile on Win64, so they are
        ; spilled together with xmm6-xmm9. 120 = 7*16 + 8 keeps rsp 16-byte
        ; aligned for movdqa, given the ABI entry alignment (rsp mod 16 = 8).
        sub     rsp, 120
        movdqa  xmmword ptr [rsp], xmm6
        movdqa  xmmword ptr [rsp+10H], xmm7
        movdqa  xmmword ptr [rsp+20H], xmm8
        movdqa  xmmword ptr [rsp+30H], xmm9
        movdqa  xmmword ptr [rsp+40H], xmm11
        movdqa  xmmword ptr [rsp+50H], xmm14
        movdqa  xmmword ptr [rsp+60H], xmm15

        ; State rows: xmm0/xmm1 from [rcx], xmm2 from BLAKE3_IV,
        ; xmm3 = { r9 low, r9 high, r8b, 5th-arg byte }.
        movups  xmm0, xmmword ptr [rcx]
        movups  xmm1, xmmword ptr [rcx+10H]
        movaps  xmm2, xmmword ptr [BLAKE3_IV]
        movzx   eax, byte ptr [rsp+0A0H]        ; 5th argument: 120 + 8 + 32
        movzx   r8d, r8b
        mov     r10, qword ptr [rsp+0A8H]       ; 6th argument: output pointer
        shl     rax, 32
        add     r8, rax                         ; r8 = (5th-arg byte << 32) | r8b
        movq    xmm3, r9
        movq    xmm4, r8
        punpcklqdq xmm3, xmm4

        ; Load the 64-byte block and split it into even/odd dwords:
        ; xmm4 = dwords 0,2,4,6   xmm5 = dwords 1,3,5,7
        ; xmm6 / xmm7 = even / odd dwords of the upper half, lanes rotated (93H).
        movups  xmm4, xmmword ptr [rdx]
        movups  xmm5, xmmword ptr [rdx+10H]
        movaps  xmm8, xmm4
        shufps  xmm4, xmm5, 136
        shufps  xmm8, xmm5, 221
        movaps  xmm5, xmm8
        movups  xmm6, xmmword ptr [rdx+20H]
        movups  xmm7, xmmword ptr [rdx+30H]
        movaps  xmm8, xmm6
        shufps  xmm6, xmm7, 136
        pshufd  xmm6, xmm6, 93H
        shufps  xmm8, xmm7, 221
        pshufd  xmm7, xmm8, 93H

        movaps  xmm14, xmmword ptr [ROT8]       ; pshufb mask: rotate each dword right by 8
        movaps  xmm15, xmmword ptr [ROT16]      ; pshufb mask: rotate each dword right by 16
        mov     al, 7                           ; round counter
@@:
        ; --- first half of the round: rows as loaded -------------------
        paddd   xmm0, xmm4
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15                     ; row3 >>> 16
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11                     ; row1 >>> 12
        paddd   xmm0, xmm5
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14                     ; row3 >>> 8
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11                     ; row1 >>> 7

        ; Rotate the lanes of rows 0, 3 and 2 before the second half.
        pshufd  xmm0, xmm0, 93H
        pshufd  xmm3, xmm3, 4EH
        pshufd  xmm2, xmm2, 39H

        ; --- second half of the round: lane-rotated rows ---------------
        paddd   xmm0, xmm6
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm15                     ; row3 >>> 16
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 20
        psrld   xmm11, 12
        por     xmm1, xmm11                     ; row1 >>> 12
        paddd   xmm0, xmm7
        paddd   xmm0, xmm1
        pxor    xmm3, xmm0
        pshufb  xmm3, xmm14                     ; row3 >>> 8
        paddd   xmm2, xmm3
        pxor    xmm1, xmm2
        movdqa  xmm11, xmm1
        pslld   xmm1, 25
        psrld   xmm11, 7
        por     xmm1, xmm11                     ; row1 >>> 7

        ; Undo the lane rotation (39H is the inverse of 93H; 4EH is its own inverse).
        pshufd  xmm0, xmm0, 39H
        pshufd  xmm3, xmm3, 4EH
        pshufd  xmm2, xmm2, 93H

        dec     al
        jz      @F                              ; all 7 rounds done

        ; Rearrange the message registers xmm4-xmm7 for the next round.
        ; xmm8 and xmm9 are scratch.
        movdqa  xmm8, xmm4
        shufps  xmm8, xmm5, 214
        pshufd  xmm9, xmm4, 0FH
        pshufd  xmm4, xmm8, 39H
        movdqa  xmm8, xmm6
        shufps  xmm8, xmm7, 250
        pblendw xmm9, xmm8, 0CCH
        movdqa  xmm8, xmm7
        punpcklqdq xmm8, xmm5
        pblendw xmm8, xmm6, 0C0H
        pshufd  xmm8, xmm8, 78H
        punpckhdq xmm5, xmm7
        punpckldq xmm6, xmm5
        pshufd  xmm7, xmm6, 1EH
        movdqa  xmm5, xmm9
        movdqa  xmm6, xmm8
        jmp     @B
@@:
        ; Reload the untouched input state; it is folded into the upper
        ; 32 bytes of the output.
        movdqu  xmm4, xmmword ptr [rcx]
        movdqu  xmm5, xmmword ptr [rcx+10H]
        pxor    xmm0, xmm2
        pxor    xmm1, xmm3
        pxor    xmm2, xmm4
        pxor    xmm3, xmm5
        movups  xmmword ptr [r10], xmm0
        movups  xmmword ptr [r10+10H], xmm1
        movups  xmmword ptr [r10+20H], xmm2
        movups  xmmword ptr [r10+30H], xmm3

        ; Restore every nonvolatile XMM register touched above.
        movdqa  xmm6, xmmword ptr [rsp]
        movdqa  xmm7, xmmword ptr [rsp+10H]
        movdqa  xmm8, xmmword ptr [rsp+20H]
        movdqa  xmm9, xmmword ptr [rsp+30H]
        movdqa  xmm11, xmmword ptr [rsp+40H]
        movdqa  xmm14, xmmword ptr [rsp+50H]
        movdqa  xmm15, xmmword ptr [rsp+60H]
        add     rsp, 120
        ret
_blake3_compress_xof_sse41 ENDP
blake3_compress_xof_sse41 ENDP
2036
+
2037
+ _TEXT ENDS
2038
+
2039
+
2040
;-----------------------------------------------------------------------
; Read-only constants shared by the SSE4.1 routines.
; The segment starts on a 64-byte boundary and every table up to
; CMP_MSB_MASK is 16 bytes long, so each label stays 16-byte aligned
; for the aligned xmmword loads that reference it.
;-----------------------------------------------------------------------
_RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST'
ALIGN   64

; Four dwords loaded as the third state row by the compress routines.
BLAKE3_IV:
        dd      6A09E667H
        dd      0BB67AE85H
        dd      3C6EF372H
        dd      0A54FF53AH

; Per-lane offsets 0..3, masked and added to the broadcast counter.
ADD0:
        dd      0
        dd      1
        dd      2
        dd      3

; Per-lane step of 4, masked and added to the counter after each
; group of four inputs.
ADD1:
        dd      4, 4, 4, 4

; Each BLAKE3_IV dword broadcast to all four lanes.
; NOTE(review): the code that reads these is outside the reviewed
; region - presumably the 4-way path; confirm.
BLAKE3_IV_0:
        dd      6A09E667H, 6A09E667H, 6A09E667H, 6A09E667H

BLAKE3_IV_1:
        dd      0BB67AE85H, 0BB67AE85H, 0BB67AE85H, 0BB67AE85H

BLAKE3_IV_2:
        dd      3C6EF372H, 3C6EF372H, 3C6EF372H, 3C6EF372H

BLAKE3_IV_3:
        dd      0A54FF53AH, 0A54FF53AH, 0A54FF53AH, 0A54FF53AH

; The value 64 in every lane; the 1- and 2-input paths read the first
; dword with pinsrd.
BLAKE3_BLOCK_LEN:
        dd      64, 64, 64, 64

; pshufb mask: rotates each 32-bit lane right by 16 bits.
ROT16:
        db      2, 3, 0, 1
        db      6, 7, 4, 5
        db      10, 11, 8, 9
        db      14, 15, 12, 13

; pshufb mask: rotates each 32-bit lane right by 8 bits.
ROT8:
        db      1, 2, 3, 0
        db      5, 6, 7, 4
        db      9, 10, 11, 8
        db      13, 14, 15, 12

; Sign-bit mask XORed into both operands before pcmpgtd, so the signed
; compare acts as an unsigned one when detecting counter carry.
; Eight dwords (32 bytes) are emitted.
CMP_MSB_MASK:
        dd      80000000H, 80000000H, 80000000H, 80000000H
        dd      80000000H, 80000000H, 80000000H, 80000000H

_RDATA ENDS
2076
+ END
2077
+