digest-blake3 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2077 @@
1
+ public _blake3_hash_many_sse41 ; export the underscore-prefixed name of the hash_many entry point to the linker
2
+ public blake3_hash_many_sse41 ; export the plain name; both hash_many names are declared as back-to-back PROC labels on the same entry point
3
+ public blake3_compress_in_place_sse41 ; export the plain name of the compress_in_place routine
4
+ public _blake3_compress_in_place_sse41 ; export its underscore-prefixed counterpart
5
+ public blake3_compress_xof_sse41 ; export the plain name of the compress_xof routine
6
+ public _blake3_compress_xof_sse41 ; export its underscore-prefixed counterpart; NOTE(review): the underscore duplicates are presumably for toolchains that decorate C symbols with a leading underscore - confirm
7
+
8
+ _TEXT SEGMENT ALIGN(16) 'CODE'
9
+
10
+ ALIGN 16
11
+ blake3_hash_many_sse41 PROC
12
+ _blake3_hash_many_sse41 PROC
13
+ push r15
14
+ push r14
15
+ push r13
16
+ push r12
17
+ push rsi
18
+ push rdi
19
+ push rbx
20
+ push rbp
21
+ mov rbp, rsp
22
+ sub rsp, 528
23
+ and rsp, 0FFFFFFFFFFFFFFC0H
24
+ movdqa xmmword ptr [rsp+170H], xmm6
25
+ movdqa xmmword ptr [rsp+180H], xmm7
26
+ movdqa xmmword ptr [rsp+190H], xmm8
27
+ movdqa xmmword ptr [rsp+1A0H], xmm9
28
+ movdqa xmmword ptr [rsp+1B0H], xmm10
29
+ movdqa xmmword ptr [rsp+1C0H], xmm11
30
+ movdqa xmmword ptr [rsp+1D0H], xmm12
31
+ movdqa xmmword ptr [rsp+1E0H], xmm13
32
+ movdqa xmmword ptr [rsp+1F0H], xmm14
33
+ movdqa xmmword ptr [rsp+200H], xmm15
34
+ mov rdi, rcx
35
+ mov rsi, rdx
36
+ mov rdx, r8
37
+ mov rcx, r9
38
+ mov r8, qword ptr [rbp+68H]
39
+ movzx r9, byte ptr [rbp+70H]
40
+ neg r9d
41
+ movd xmm0, r9d
42
+ pshufd xmm0, xmm0, 00H
43
+ movdqa xmmword ptr [rsp+130H], xmm0
44
+ movdqa xmm1, xmm0
45
+ pand xmm1, xmmword ptr [ADD0]
46
+ pand xmm0, xmmword ptr [ADD1]
47
+ movdqa xmmword ptr [rsp+150H], xmm0
48
+ movd xmm0, r8d
49
+ pshufd xmm0, xmm0, 00H
50
+ paddd xmm0, xmm1
51
+ movdqa xmmword ptr [rsp+110H], xmm0
52
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK]
53
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK]
54
+ pcmpgtd xmm1, xmm0
55
+ shr r8, 32
56
+ movd xmm2, r8d
57
+ pshufd xmm2, xmm2, 00H
58
+ psubd xmm2, xmm1
59
+ movdqa xmmword ptr [rsp+120H], xmm2
60
+ mov rbx, qword ptr [rbp+90H]
61
+ mov r15, rdx
62
+ shl r15, 6
63
+ movzx r13d, byte ptr [rbp+78H]
64
+ movzx r12d, byte ptr [rbp+88H]
65
+ cmp rsi, 4
66
+ jc final3blocks
67
+ outerloop4:
68
+ movdqu xmm3, xmmword ptr [rcx]
69
+ pshufd xmm0, xmm3, 00H
70
+ pshufd xmm1, xmm3, 55H
71
+ pshufd xmm2, xmm3, 0AAH
72
+ pshufd xmm3, xmm3, 0FFH
73
+ movdqu xmm7, xmmword ptr [rcx+10H]
74
+ pshufd xmm4, xmm7, 00H
75
+ pshufd xmm5, xmm7, 55H
76
+ pshufd xmm6, xmm7, 0AAH
77
+ pshufd xmm7, xmm7, 0FFH
78
+ mov r8, qword ptr [rdi]
79
+ mov r9, qword ptr [rdi+8H]
80
+ mov r10, qword ptr [rdi+10H]
81
+ mov r11, qword ptr [rdi+18H]
82
+ movzx eax, byte ptr [rbp+80H]
83
+ or eax, r13d
84
+ xor edx, edx
85
+ innerloop4:
86
+ mov r14d, eax
87
+ or eax, r12d
88
+ add rdx, 64
89
+ cmp rdx, r15
90
+ cmovne eax, r14d
91
+ movdqu xmm8, xmmword ptr [r8+rdx-40H]
92
+ movdqu xmm9, xmmword ptr [r9+rdx-40H]
93
+ movdqu xmm10, xmmword ptr [r10+rdx-40H]
94
+ movdqu xmm11, xmmword ptr [r11+rdx-40H]
95
+ movdqa xmm12, xmm8
96
+ punpckldq xmm8, xmm9
97
+ punpckhdq xmm12, xmm9
98
+ movdqa xmm14, xmm10
99
+ punpckldq xmm10, xmm11
100
+ punpckhdq xmm14, xmm11
101
+ movdqa xmm9, xmm8
102
+ punpcklqdq xmm8, xmm10
103
+ punpckhqdq xmm9, xmm10
104
+ movdqa xmm13, xmm12
105
+ punpcklqdq xmm12, xmm14
106
+ punpckhqdq xmm13, xmm14
107
+ movdqa xmmword ptr [rsp], xmm8
108
+ movdqa xmmword ptr [rsp+10H], xmm9
109
+ movdqa xmmword ptr [rsp+20H], xmm12
110
+ movdqa xmmword ptr [rsp+30H], xmm13
111
+ movdqu xmm8, xmmword ptr [r8+rdx-30H]
112
+ movdqu xmm9, xmmword ptr [r9+rdx-30H]
113
+ movdqu xmm10, xmmword ptr [r10+rdx-30H]
114
+ movdqu xmm11, xmmword ptr [r11+rdx-30H]
115
+ movdqa xmm12, xmm8
116
+ punpckldq xmm8, xmm9
117
+ punpckhdq xmm12, xmm9
118
+ movdqa xmm14, xmm10
119
+ punpckldq xmm10, xmm11
120
+ punpckhdq xmm14, xmm11
121
+ movdqa xmm9, xmm8
122
+ punpcklqdq xmm8, xmm10
123
+ punpckhqdq xmm9, xmm10
124
+ movdqa xmm13, xmm12
125
+ punpcklqdq xmm12, xmm14
126
+ punpckhqdq xmm13, xmm14
127
+ movdqa xmmword ptr [rsp+40H], xmm8
128
+ movdqa xmmword ptr [rsp+50H], xmm9
129
+ movdqa xmmword ptr [rsp+60H], xmm12
130
+ movdqa xmmword ptr [rsp+70H], xmm13
131
+ movdqu xmm8, xmmword ptr [r8+rdx-20H]
132
+ movdqu xmm9, xmmword ptr [r9+rdx-20H]
133
+ movdqu xmm10, xmmword ptr [r10+rdx-20H]
134
+ movdqu xmm11, xmmword ptr [r11+rdx-20H]
135
+ movdqa xmm12, xmm8
136
+ punpckldq xmm8, xmm9
137
+ punpckhdq xmm12, xmm9
138
+ movdqa xmm14, xmm10
139
+ punpckldq xmm10, xmm11
140
+ punpckhdq xmm14, xmm11
141
+ movdqa xmm9, xmm8
142
+ punpcklqdq xmm8, xmm10
143
+ punpckhqdq xmm9, xmm10
144
+ movdqa xmm13, xmm12
145
+ punpcklqdq xmm12, xmm14
146
+ punpckhqdq xmm13, xmm14
147
+ movdqa xmmword ptr [rsp+80H], xmm8
148
+ movdqa xmmword ptr [rsp+90H], xmm9
149
+ movdqa xmmword ptr [rsp+0A0H], xmm12
150
+ movdqa xmmword ptr [rsp+0B0H], xmm13
151
+ movdqu xmm8, xmmword ptr [r8+rdx-10H]
152
+ movdqu xmm9, xmmword ptr [r9+rdx-10H]
153
+ movdqu xmm10, xmmword ptr [r10+rdx-10H]
154
+ movdqu xmm11, xmmword ptr [r11+rdx-10H]
155
+ movdqa xmm12, xmm8
156
+ punpckldq xmm8, xmm9
157
+ punpckhdq xmm12, xmm9
158
+ movdqa xmm14, xmm10
159
+ punpckldq xmm10, xmm11
160
+ punpckhdq xmm14, xmm11
161
+ movdqa xmm9, xmm8
162
+ punpcklqdq xmm8, xmm10
163
+ punpckhqdq xmm9, xmm10
164
+ movdqa xmm13, xmm12
165
+ punpcklqdq xmm12, xmm14
166
+ punpckhqdq xmm13, xmm14
167
+ movdqa xmmword ptr [rsp+0C0H], xmm8
168
+ movdqa xmmword ptr [rsp+0D0H], xmm9
169
+ movdqa xmmword ptr [rsp+0E0H], xmm12
170
+ movdqa xmmword ptr [rsp+0F0H], xmm13
171
+ movdqa xmm9, xmmword ptr [BLAKE3_IV_1]
172
+ movdqa xmm10, xmmword ptr [BLAKE3_IV_2]
173
+ movdqa xmm11, xmmword ptr [BLAKE3_IV_3]
174
+ movdqa xmm12, xmmword ptr [rsp+110H]
175
+ movdqa xmm13, xmmword ptr [rsp+120H]
176
+ movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN]
177
+ movd xmm15, eax
178
+ pshufd xmm15, xmm15, 00H
179
+ prefetcht0 byte ptr [r8+rdx+80H]
180
+ prefetcht0 byte ptr [r9+rdx+80H]
181
+ prefetcht0 byte ptr [r10+rdx+80H]
182
+ prefetcht0 byte ptr [r11+rdx+80H]
183
+ paddd xmm0, xmmword ptr [rsp]
184
+ paddd xmm1, xmmword ptr [rsp+20H]
185
+ paddd xmm2, xmmword ptr [rsp+40H]
186
+ paddd xmm3, xmmword ptr [rsp+60H]
187
+ paddd xmm0, xmm4
188
+ paddd xmm1, xmm5
189
+ paddd xmm2, xmm6
190
+ paddd xmm3, xmm7
191
+ pxor xmm12, xmm0
192
+ pxor xmm13, xmm1
193
+ pxor xmm14, xmm2
194
+ pxor xmm15, xmm3
195
+ movdqa xmm8, xmmword ptr [ROT16]
196
+ pshufb xmm12, xmm8
197
+ pshufb xmm13, xmm8
198
+ pshufb xmm14, xmm8
199
+ pshufb xmm15, xmm8
200
+ movdqa xmm8, xmmword ptr [BLAKE3_IV_0]
201
+ paddd xmm8, xmm12
202
+ paddd xmm9, xmm13
203
+ paddd xmm10, xmm14
204
+ paddd xmm11, xmm15
205
+ pxor xmm4, xmm8
206
+ pxor xmm5, xmm9
207
+ pxor xmm6, xmm10
208
+ pxor xmm7, xmm11
209
+ movdqa xmmword ptr [rsp+100H], xmm8
210
+ movdqa xmm8, xmm4
211
+ psrld xmm8, 12
212
+ pslld xmm4, 20
213
+ por xmm4, xmm8
214
+ movdqa xmm8, xmm5
215
+ psrld xmm8, 12
216
+ pslld xmm5, 20
217
+ por xmm5, xmm8
218
+ movdqa xmm8, xmm6
219
+ psrld xmm8, 12
220
+ pslld xmm6, 20
221
+ por xmm6, xmm8
222
+ movdqa xmm8, xmm7
223
+ psrld xmm8, 12
224
+ pslld xmm7, 20
225
+ por xmm7, xmm8
226
+ paddd xmm0, xmmword ptr [rsp+10H]
227
+ paddd xmm1, xmmword ptr [rsp+30H]
228
+ paddd xmm2, xmmword ptr [rsp+50H]
229
+ paddd xmm3, xmmword ptr [rsp+70H]
230
+ paddd xmm0, xmm4
231
+ paddd xmm1, xmm5
232
+ paddd xmm2, xmm6
233
+ paddd xmm3, xmm7
234
+ pxor xmm12, xmm0
235
+ pxor xmm13, xmm1
236
+ pxor xmm14, xmm2
237
+ pxor xmm15, xmm3
238
+ movdqa xmm8, xmmword ptr [ROT8]
239
+ pshufb xmm12, xmm8
240
+ pshufb xmm13, xmm8
241
+ pshufb xmm14, xmm8
242
+ pshufb xmm15, xmm8
243
+ movdqa xmm8, xmmword ptr [rsp+100H]
244
+ paddd xmm8, xmm12
245
+ paddd xmm9, xmm13
246
+ paddd xmm10, xmm14
247
+ paddd xmm11, xmm15
248
+ pxor xmm4, xmm8
249
+ pxor xmm5, xmm9
250
+ pxor xmm6, xmm10
251
+ pxor xmm7, xmm11
252
+ movdqa xmmword ptr [rsp+100H], xmm8
253
+ movdqa xmm8, xmm4
254
+ psrld xmm8, 7
255
+ pslld xmm4, 25
256
+ por xmm4, xmm8
257
+ movdqa xmm8, xmm5
258
+ psrld xmm8, 7
259
+ pslld xmm5, 25
260
+ por xmm5, xmm8
261
+ movdqa xmm8, xmm6
262
+ psrld xmm8, 7
263
+ pslld xmm6, 25
264
+ por xmm6, xmm8
265
+ movdqa xmm8, xmm7
266
+ psrld xmm8, 7
267
+ pslld xmm7, 25
268
+ por xmm7, xmm8
269
+ paddd xmm0, xmmword ptr [rsp+80H]
270
+ paddd xmm1, xmmword ptr [rsp+0A0H]
271
+ paddd xmm2, xmmword ptr [rsp+0C0H]
272
+ paddd xmm3, xmmword ptr [rsp+0E0H]
273
+ paddd xmm0, xmm5
274
+ paddd xmm1, xmm6
275
+ paddd xmm2, xmm7
276
+ paddd xmm3, xmm4
277
+ pxor xmm15, xmm0
278
+ pxor xmm12, xmm1
279
+ pxor xmm13, xmm2
280
+ pxor xmm14, xmm3
281
+ movdqa xmm8, xmmword ptr [ROT16]
282
+ pshufb xmm15, xmm8
283
+ pshufb xmm12, xmm8
284
+ pshufb xmm13, xmm8
285
+ pshufb xmm14, xmm8
286
+ paddd xmm10, xmm15
287
+ paddd xmm11, xmm12
288
+ movdqa xmm8, xmmword ptr [rsp+100H]
289
+ paddd xmm8, xmm13
290
+ paddd xmm9, xmm14
291
+ pxor xmm5, xmm10
292
+ pxor xmm6, xmm11
293
+ pxor xmm7, xmm8
294
+ pxor xmm4, xmm9
295
+ movdqa xmmword ptr [rsp+100H], xmm8
296
+ movdqa xmm8, xmm5
297
+ psrld xmm8, 12
298
+ pslld xmm5, 20
299
+ por xmm5, xmm8
300
+ movdqa xmm8, xmm6
301
+ psrld xmm8, 12
302
+ pslld xmm6, 20
303
+ por xmm6, xmm8
304
+ movdqa xmm8, xmm7
305
+ psrld xmm8, 12
306
+ pslld xmm7, 20
307
+ por xmm7, xmm8
308
+ movdqa xmm8, xmm4
309
+ psrld xmm8, 12
310
+ pslld xmm4, 20
311
+ por xmm4, xmm8
312
+ paddd xmm0, xmmword ptr [rsp+90H]
313
+ paddd xmm1, xmmword ptr [rsp+0B0H]
314
+ paddd xmm2, xmmword ptr [rsp+0D0H]
315
+ paddd xmm3, xmmword ptr [rsp+0F0H]
316
+ paddd xmm0, xmm5
317
+ paddd xmm1, xmm6
318
+ paddd xmm2, xmm7
319
+ paddd xmm3, xmm4
320
+ pxor xmm15, xmm0
321
+ pxor xmm12, xmm1
322
+ pxor xmm13, xmm2
323
+ pxor xmm14, xmm3
324
+ movdqa xmm8, xmmword ptr [ROT8]
325
+ pshufb xmm15, xmm8
326
+ pshufb xmm12, xmm8
327
+ pshufb xmm13, xmm8
328
+ pshufb xmm14, xmm8
329
+ paddd xmm10, xmm15
330
+ paddd xmm11, xmm12
331
+ movdqa xmm8, xmmword ptr [rsp+100H]
332
+ paddd xmm8, xmm13
333
+ paddd xmm9, xmm14
334
+ pxor xmm5, xmm10
335
+ pxor xmm6, xmm11
336
+ pxor xmm7, xmm8
337
+ pxor xmm4, xmm9
338
+ movdqa xmmword ptr [rsp+100H], xmm8
339
+ movdqa xmm8, xmm5
340
+ psrld xmm8, 7
341
+ pslld xmm5, 25
342
+ por xmm5, xmm8
343
+ movdqa xmm8, xmm6
344
+ psrld xmm8, 7
345
+ pslld xmm6, 25
346
+ por xmm6, xmm8
347
+ movdqa xmm8, xmm7
348
+ psrld xmm8, 7
349
+ pslld xmm7, 25
350
+ por xmm7, xmm8
351
+ movdqa xmm8, xmm4
352
+ psrld xmm8, 7
353
+ pslld xmm4, 25
354
+ por xmm4, xmm8
355
+ paddd xmm0, xmmword ptr [rsp+20H]
356
+ paddd xmm1, xmmword ptr [rsp+30H]
357
+ paddd xmm2, xmmword ptr [rsp+70H]
358
+ paddd xmm3, xmmword ptr [rsp+40H]
359
+ paddd xmm0, xmm4
360
+ paddd xmm1, xmm5
361
+ paddd xmm2, xmm6
362
+ paddd xmm3, xmm7
363
+ pxor xmm12, xmm0
364
+ pxor xmm13, xmm1
365
+ pxor xmm14, xmm2
366
+ pxor xmm15, xmm3
367
+ movdqa xmm8, xmmword ptr [ROT16]
368
+ pshufb xmm12, xmm8
369
+ pshufb xmm13, xmm8
370
+ pshufb xmm14, xmm8
371
+ pshufb xmm15, xmm8
372
+ movdqa xmm8, xmmword ptr [rsp+100H]
373
+ paddd xmm8, xmm12
374
+ paddd xmm9, xmm13
375
+ paddd xmm10, xmm14
376
+ paddd xmm11, xmm15
377
+ pxor xmm4, xmm8
378
+ pxor xmm5, xmm9
379
+ pxor xmm6, xmm10
380
+ pxor xmm7, xmm11
381
+ movdqa xmmword ptr [rsp+100H], xmm8
382
+ movdqa xmm8, xmm4
383
+ psrld xmm8, 12
384
+ pslld xmm4, 20
385
+ por xmm4, xmm8
386
+ movdqa xmm8, xmm5
387
+ psrld xmm8, 12
388
+ pslld xmm5, 20
389
+ por xmm5, xmm8
390
+ movdqa xmm8, xmm6
391
+ psrld xmm8, 12
392
+ pslld xmm6, 20
393
+ por xmm6, xmm8
394
+ movdqa xmm8, xmm7
395
+ psrld xmm8, 12
396
+ pslld xmm7, 20
397
+ por xmm7, xmm8
398
+ paddd xmm0, xmmword ptr [rsp+60H]
399
+ paddd xmm1, xmmword ptr [rsp+0A0H]
400
+ paddd xmm2, xmmword ptr [rsp]
401
+ paddd xmm3, xmmword ptr [rsp+0D0H]
402
+ paddd xmm0, xmm4
403
+ paddd xmm1, xmm5
404
+ paddd xmm2, xmm6
405
+ paddd xmm3, xmm7
406
+ pxor xmm12, xmm0
407
+ pxor xmm13, xmm1
408
+ pxor xmm14, xmm2
409
+ pxor xmm15, xmm3
410
+ movdqa xmm8, xmmword ptr [ROT8]
411
+ pshufb xmm12, xmm8
412
+ pshufb xmm13, xmm8
413
+ pshufb xmm14, xmm8
414
+ pshufb xmm15, xmm8
415
+ movdqa xmm8, xmmword ptr [rsp+100H]
416
+ paddd xmm8, xmm12
417
+ paddd xmm9, xmm13
418
+ paddd xmm10, xmm14
419
+ paddd xmm11, xmm15
420
+ pxor xmm4, xmm8
421
+ pxor xmm5, xmm9
422
+ pxor xmm6, xmm10
423
+ pxor xmm7, xmm11
424
+ movdqa xmmword ptr [rsp+100H], xmm8
425
+ movdqa xmm8, xmm4
426
+ psrld xmm8, 7
427
+ pslld xmm4, 25
428
+ por xmm4, xmm8
429
+ movdqa xmm8, xmm5
430
+ psrld xmm8, 7
431
+ pslld xmm5, 25
432
+ por xmm5, xmm8
433
+ movdqa xmm8, xmm6
434
+ psrld xmm8, 7
435
+ pslld xmm6, 25
436
+ por xmm6, xmm8
437
+ movdqa xmm8, xmm7
438
+ psrld xmm8, 7
439
+ pslld xmm7, 25
440
+ por xmm7, xmm8
441
+ paddd xmm0, xmmword ptr [rsp+10H]
442
+ paddd xmm1, xmmword ptr [rsp+0C0H]
443
+ paddd xmm2, xmmword ptr [rsp+90H]
444
+ paddd xmm3, xmmword ptr [rsp+0F0H]
445
+ paddd xmm0, xmm5
446
+ paddd xmm1, xmm6
447
+ paddd xmm2, xmm7
448
+ paddd xmm3, xmm4
449
+ pxor xmm15, xmm0
450
+ pxor xmm12, xmm1
451
+ pxor xmm13, xmm2
452
+ pxor xmm14, xmm3
453
+ movdqa xmm8, xmmword ptr [ROT16]
454
+ pshufb xmm15, xmm8
455
+ pshufb xmm12, xmm8
456
+ pshufb xmm13, xmm8
457
+ pshufb xmm14, xmm8
458
+ paddd xmm10, xmm15
459
+ paddd xmm11, xmm12
460
+ movdqa xmm8, xmmword ptr [rsp+100H]
461
+ paddd xmm8, xmm13
462
+ paddd xmm9, xmm14
463
+ pxor xmm5, xmm10
464
+ pxor xmm6, xmm11
465
+ pxor xmm7, xmm8
466
+ pxor xmm4, xmm9
467
+ movdqa xmmword ptr [rsp+100H], xmm8
468
+ movdqa xmm8, xmm5
469
+ psrld xmm8, 12
470
+ pslld xmm5, 20
471
+ por xmm5, xmm8
472
+ movdqa xmm8, xmm6
473
+ psrld xmm8, 12
474
+ pslld xmm6, 20
475
+ por xmm6, xmm8
476
+ movdqa xmm8, xmm7
477
+ psrld xmm8, 12
478
+ pslld xmm7, 20
479
+ por xmm7, xmm8
480
+ movdqa xmm8, xmm4
481
+ psrld xmm8, 12
482
+ pslld xmm4, 20
483
+ por xmm4, xmm8
484
+ paddd xmm0, xmmword ptr [rsp+0B0H]
485
+ paddd xmm1, xmmword ptr [rsp+50H]
486
+ paddd xmm2, xmmword ptr [rsp+0E0H]
487
+ paddd xmm3, xmmword ptr [rsp+80H]
488
+ paddd xmm0, xmm5
489
+ paddd xmm1, xmm6
490
+ paddd xmm2, xmm7
491
+ paddd xmm3, xmm4
492
+ pxor xmm15, xmm0
493
+ pxor xmm12, xmm1
494
+ pxor xmm13, xmm2
495
+ pxor xmm14, xmm3
496
+ movdqa xmm8, xmmword ptr [ROT8]
497
+ pshufb xmm15, xmm8
498
+ pshufb xmm12, xmm8
499
+ pshufb xmm13, xmm8
500
+ pshufb xmm14, xmm8
501
+ paddd xmm10, xmm15
502
+ paddd xmm11, xmm12
503
+ movdqa xmm8, xmmword ptr [rsp+100H]
504
+ paddd xmm8, xmm13
505
+ paddd xmm9, xmm14
506
+ pxor xmm5, xmm10
507
+ pxor xmm6, xmm11
508
+ pxor xmm7, xmm8
509
+ pxor xmm4, xmm9
510
+ movdqa xmmword ptr [rsp+100H], xmm8
511
+ movdqa xmm8, xmm5
512
+ psrld xmm8, 7
513
+ pslld xmm5, 25
514
+ por xmm5, xmm8
515
+ movdqa xmm8, xmm6
516
+ psrld xmm8, 7
517
+ pslld xmm6, 25
518
+ por xmm6, xmm8
519
+ movdqa xmm8, xmm7
520
+ psrld xmm8, 7
521
+ pslld xmm7, 25
522
+ por xmm7, xmm8
523
+ movdqa xmm8, xmm4
524
+ psrld xmm8, 7
525
+ pslld xmm4, 25
526
+ por xmm4, xmm8
527
+ paddd xmm0, xmmword ptr [rsp+30H]
528
+ paddd xmm1, xmmword ptr [rsp+0A0H]
529
+ paddd xmm2, xmmword ptr [rsp+0D0H]
530
+ paddd xmm3, xmmword ptr [rsp+70H]
531
+ paddd xmm0, xmm4
532
+ paddd xmm1, xmm5
533
+ paddd xmm2, xmm6
534
+ paddd xmm3, xmm7
535
+ pxor xmm12, xmm0
536
+ pxor xmm13, xmm1
537
+ pxor xmm14, xmm2
538
+ pxor xmm15, xmm3
539
+ movdqa xmm8, xmmword ptr [ROT16]
540
+ pshufb xmm12, xmm8
541
+ pshufb xmm13, xmm8
542
+ pshufb xmm14, xmm8
543
+ pshufb xmm15, xmm8
544
+ movdqa xmm8, xmmword ptr [rsp+100H]
545
+ paddd xmm8, xmm12
546
+ paddd xmm9, xmm13
547
+ paddd xmm10, xmm14
548
+ paddd xmm11, xmm15
549
+ pxor xmm4, xmm8
550
+ pxor xmm5, xmm9
551
+ pxor xmm6, xmm10
552
+ pxor xmm7, xmm11
553
+ movdqa xmmword ptr [rsp+100H], xmm8
554
+ movdqa xmm8, xmm4
555
+ psrld xmm8, 12
556
+ pslld xmm4, 20
557
+ por xmm4, xmm8
558
+ movdqa xmm8, xmm5
559
+ psrld xmm8, 12
560
+ pslld xmm5, 20
561
+ por xmm5, xmm8
562
+ movdqa xmm8, xmm6
563
+ psrld xmm8, 12
564
+ pslld xmm6, 20
565
+ por xmm6, xmm8
566
+ movdqa xmm8, xmm7
567
+ psrld xmm8, 12
568
+ pslld xmm7, 20
569
+ por xmm7, xmm8
570
+ paddd xmm0, xmmword ptr [rsp+40H]
571
+ paddd xmm1, xmmword ptr [rsp+0C0H]
572
+ paddd xmm2, xmmword ptr [rsp+20H]
573
+ paddd xmm3, xmmword ptr [rsp+0E0H]
574
+ paddd xmm0, xmm4
575
+ paddd xmm1, xmm5
576
+ paddd xmm2, xmm6
577
+ paddd xmm3, xmm7
578
+ pxor xmm12, xmm0
579
+ pxor xmm13, xmm1
580
+ pxor xmm14, xmm2
581
+ pxor xmm15, xmm3
582
+ movdqa xmm8, xmmword ptr [ROT8]
583
+ pshufb xmm12, xmm8
584
+ pshufb xmm13, xmm8
585
+ pshufb xmm14, xmm8
586
+ pshufb xmm15, xmm8
587
+ movdqa xmm8, xmmword ptr [rsp+100H]
588
+ paddd xmm8, xmm12
589
+ paddd xmm9, xmm13
590
+ paddd xmm10, xmm14
591
+ paddd xmm11, xmm15
592
+ pxor xmm4, xmm8
593
+ pxor xmm5, xmm9
594
+ pxor xmm6, xmm10
595
+ pxor xmm7, xmm11
596
+ movdqa xmmword ptr [rsp+100H], xmm8
597
+ movdqa xmm8, xmm4
598
+ psrld xmm8, 7
599
+ pslld xmm4, 25
600
+ por xmm4, xmm8
601
+ movdqa xmm8, xmm5
602
+ psrld xmm8, 7
603
+ pslld xmm5, 25
604
+ por xmm5, xmm8
605
+ movdqa xmm8, xmm6
606
+ psrld xmm8, 7
607
+ pslld xmm6, 25
608
+ por xmm6, xmm8
609
+ movdqa xmm8, xmm7
610
+ psrld xmm8, 7
611
+ pslld xmm7, 25
612
+ por xmm7, xmm8
613
+ paddd xmm0, xmmword ptr [rsp+60H]
614
+ paddd xmm1, xmmword ptr [rsp+90H]
615
+ paddd xmm2, xmmword ptr [rsp+0B0H]
616
+ paddd xmm3, xmmword ptr [rsp+80H]
617
+ paddd xmm0, xmm5
618
+ paddd xmm1, xmm6
619
+ paddd xmm2, xmm7
620
+ paddd xmm3, xmm4
621
+ pxor xmm15, xmm0
622
+ pxor xmm12, xmm1
623
+ pxor xmm13, xmm2
624
+ pxor xmm14, xmm3
625
+ movdqa xmm8, xmmword ptr [ROT16]
626
+ pshufb xmm15, xmm8
627
+ pshufb xmm12, xmm8
628
+ pshufb xmm13, xmm8
629
+ pshufb xmm14, xmm8
630
+ paddd xmm10, xmm15
631
+ paddd xmm11, xmm12
632
+ movdqa xmm8, xmmword ptr [rsp+100H]
633
+ paddd xmm8, xmm13
634
+ paddd xmm9, xmm14
635
+ pxor xmm5, xmm10
636
+ pxor xmm6, xmm11
637
+ pxor xmm7, xmm8
638
+ pxor xmm4, xmm9
639
+ movdqa xmmword ptr [rsp+100H], xmm8
640
+ movdqa xmm8, xmm5
641
+ psrld xmm8, 12
642
+ pslld xmm5, 20
643
+ por xmm5, xmm8
644
+ movdqa xmm8, xmm6
645
+ psrld xmm8, 12
646
+ pslld xmm6, 20
647
+ por xmm6, xmm8
648
+ movdqa xmm8, xmm7
649
+ psrld xmm8, 12
650
+ pslld xmm7, 20
651
+ por xmm7, xmm8
652
+ movdqa xmm8, xmm4
653
+ psrld xmm8, 12
654
+ pslld xmm4, 20
655
+ por xmm4, xmm8
656
+ paddd xmm0, xmmword ptr [rsp+50H]
657
+ paddd xmm1, xmmword ptr [rsp]
658
+ paddd xmm2, xmmword ptr [rsp+0F0H]
659
+ paddd xmm3, xmmword ptr [rsp+10H]
660
+ paddd xmm0, xmm5
661
+ paddd xmm1, xmm6
662
+ paddd xmm2, xmm7
663
+ paddd xmm3, xmm4
664
+ pxor xmm15, xmm0
665
+ pxor xmm12, xmm1
666
+ pxor xmm13, xmm2
667
+ pxor xmm14, xmm3
668
+ movdqa xmm8, xmmword ptr [ROT8]
669
+ pshufb xmm15, xmm8
670
+ pshufb xmm12, xmm8
671
+ pshufb xmm13, xmm8
672
+ pshufb xmm14, xmm8
673
+ paddd xmm10, xmm15
674
+ paddd xmm11, xmm12
675
+ movdqa xmm8, xmmword ptr [rsp+100H]
676
+ paddd xmm8, xmm13
677
+ paddd xmm9, xmm14
678
+ pxor xmm5, xmm10
679
+ pxor xmm6, xmm11
680
+ pxor xmm7, xmm8
681
+ pxor xmm4, xmm9
682
+ movdqa xmmword ptr [rsp+100H], xmm8
683
+ movdqa xmm8, xmm5
684
+ psrld xmm8, 7
685
+ pslld xmm5, 25
686
+ por xmm5, xmm8
687
+ movdqa xmm8, xmm6
688
+ psrld xmm8, 7
689
+ pslld xmm6, 25
690
+ por xmm6, xmm8
691
+ movdqa xmm8, xmm7
692
+ psrld xmm8, 7
693
+ pslld xmm7, 25
694
+ por xmm7, xmm8
695
+ movdqa xmm8, xmm4
696
+ psrld xmm8, 7
697
+ pslld xmm4, 25
698
+ por xmm4, xmm8
699
+ paddd xmm0, xmmword ptr [rsp+0A0H]
700
+ paddd xmm1, xmmword ptr [rsp+0C0H]
701
+ paddd xmm2, xmmword ptr [rsp+0E0H]
702
+ paddd xmm3, xmmword ptr [rsp+0D0H]
703
+ paddd xmm0, xmm4
704
+ paddd xmm1, xmm5
705
+ paddd xmm2, xmm6
706
+ paddd xmm3, xmm7
707
+ pxor xmm12, xmm0
708
+ pxor xmm13, xmm1
709
+ pxor xmm14, xmm2
710
+ pxor xmm15, xmm3
711
+ movdqa xmm8, xmmword ptr [ROT16]
712
+ pshufb xmm12, xmm8
713
+ pshufb xmm13, xmm8
714
+ pshufb xmm14, xmm8
715
+ pshufb xmm15, xmm8
716
+ movdqa xmm8, xmmword ptr [rsp+100H]
717
+ paddd xmm8, xmm12
718
+ paddd xmm9, xmm13
719
+ paddd xmm10, xmm14
720
+ paddd xmm11, xmm15
721
+ pxor xmm4, xmm8
722
+ pxor xmm5, xmm9
723
+ pxor xmm6, xmm10
724
+ pxor xmm7, xmm11
725
+ movdqa xmmword ptr [rsp+100H], xmm8
726
+ movdqa xmm8, xmm4
727
+ psrld xmm8, 12
728
+ pslld xmm4, 20
729
+ por xmm4, xmm8
730
+ movdqa xmm8, xmm5
731
+ psrld xmm8, 12
732
+ pslld xmm5, 20
733
+ por xmm5, xmm8
734
+ movdqa xmm8, xmm6
735
+ psrld xmm8, 12
736
+ pslld xmm6, 20
737
+ por xmm6, xmm8
738
+ movdqa xmm8, xmm7
739
+ psrld xmm8, 12
740
+ pslld xmm7, 20
741
+ por xmm7, xmm8
742
+ paddd xmm0, xmmword ptr [rsp+70H]
743
+ paddd xmm1, xmmword ptr [rsp+90H]
744
+ paddd xmm2, xmmword ptr [rsp+30H]
745
+ paddd xmm3, xmmword ptr [rsp+0F0H]
746
+ paddd xmm0, xmm4
747
+ paddd xmm1, xmm5
748
+ paddd xmm2, xmm6
749
+ paddd xmm3, xmm7
750
+ pxor xmm12, xmm0
751
+ pxor xmm13, xmm1
752
+ pxor xmm14, xmm2
753
+ pxor xmm15, xmm3
754
+ movdqa xmm8, xmmword ptr [ROT8]
755
+ pshufb xmm12, xmm8
756
+ pshufb xmm13, xmm8
757
+ pshufb xmm14, xmm8
758
+ pshufb xmm15, xmm8
759
+ movdqa xmm8, xmmword ptr [rsp+100H]
760
+ paddd xmm8, xmm12
761
+ paddd xmm9, xmm13
762
+ paddd xmm10, xmm14
763
+ paddd xmm11, xmm15
764
+ pxor xmm4, xmm8
765
+ pxor xmm5, xmm9
766
+ pxor xmm6, xmm10
767
+ pxor xmm7, xmm11
768
+ movdqa xmmword ptr [rsp+100H], xmm8
769
+ movdqa xmm8, xmm4
770
+ psrld xmm8, 7
771
+ pslld xmm4, 25
772
+ por xmm4, xmm8
773
+ movdqa xmm8, xmm5
774
+ psrld xmm8, 7
775
+ pslld xmm5, 25
776
+ por xmm5, xmm8
777
+ movdqa xmm8, xmm6
778
+ psrld xmm8, 7
779
+ pslld xmm6, 25
780
+ por xmm6, xmm8
781
+ movdqa xmm8, xmm7
782
+ psrld xmm8, 7
783
+ pslld xmm7, 25
784
+ por xmm7, xmm8
785
+ paddd xmm0, xmmword ptr [rsp+40H]
786
+ paddd xmm1, xmmword ptr [rsp+0B0H]
787
+ paddd xmm2, xmmword ptr [rsp+50H]
788
+ paddd xmm3, xmmword ptr [rsp+10H]
789
+ paddd xmm0, xmm5
790
+ paddd xmm1, xmm6
791
+ paddd xmm2, xmm7
792
+ paddd xmm3, xmm4
793
+ pxor xmm15, xmm0
794
+ pxor xmm12, xmm1
795
+ pxor xmm13, xmm2
796
+ pxor xmm14, xmm3
797
+ movdqa xmm8, xmmword ptr [ROT16]
798
+ pshufb xmm15, xmm8
799
+ pshufb xmm12, xmm8
800
+ pshufb xmm13, xmm8
801
+ pshufb xmm14, xmm8
802
+ paddd xmm10, xmm15
803
+ paddd xmm11, xmm12
804
+ movdqa xmm8, xmmword ptr [rsp+100H]
805
+ paddd xmm8, xmm13
806
+ paddd xmm9, xmm14
807
+ pxor xmm5, xmm10
808
+ pxor xmm6, xmm11
809
+ pxor xmm7, xmm8
810
+ pxor xmm4, xmm9
811
+ movdqa xmmword ptr [rsp+100H], xmm8
812
+ movdqa xmm8, xmm5
813
+ psrld xmm8, 12
814
+ pslld xmm5, 20
815
+ por xmm5, xmm8
816
+ movdqa xmm8, xmm6
817
+ psrld xmm8, 12
818
+ pslld xmm6, 20
819
+ por xmm6, xmm8
820
+ movdqa xmm8, xmm7
821
+ psrld xmm8, 12
822
+ pslld xmm7, 20
823
+ por xmm7, xmm8
824
+ movdqa xmm8, xmm4
825
+ psrld xmm8, 12
826
+ pslld xmm4, 20
827
+ por xmm4, xmm8
828
+ paddd xmm0, xmmword ptr [rsp]
829
+ paddd xmm1, xmmword ptr [rsp+20H]
830
+ paddd xmm2, xmmword ptr [rsp+80H]
831
+ paddd xmm3, xmmword ptr [rsp+60H]
832
+ paddd xmm0, xmm5
833
+ paddd xmm1, xmm6
834
+ paddd xmm2, xmm7
835
+ paddd xmm3, xmm4
836
+ pxor xmm15, xmm0
837
+ pxor xmm12, xmm1
838
+ pxor xmm13, xmm2
839
+ pxor xmm14, xmm3
840
+ movdqa xmm8, xmmword ptr [ROT8]
841
+ pshufb xmm15, xmm8
842
+ pshufb xmm12, xmm8
843
+ pshufb xmm13, xmm8
844
+ pshufb xmm14, xmm8
845
+ paddd xmm10, xmm15
846
+ paddd xmm11, xmm12
847
+ movdqa xmm8, xmmword ptr [rsp+100H]
848
+ paddd xmm8, xmm13
849
+ paddd xmm9, xmm14
850
+ pxor xmm5, xmm10
851
+ pxor xmm6, xmm11
852
+ pxor xmm7, xmm8
853
+ pxor xmm4, xmm9
854
+ movdqa xmmword ptr [rsp+100H], xmm8
855
+ movdqa xmm8, xmm5
856
+ psrld xmm8, 7
857
+ pslld xmm5, 25
858
+ por xmm5, xmm8
859
+ movdqa xmm8, xmm6
860
+ psrld xmm8, 7
861
+ pslld xmm6, 25
862
+ por xmm6, xmm8
863
+ movdqa xmm8, xmm7
864
+ psrld xmm8, 7
865
+ pslld xmm7, 25
866
+ por xmm7, xmm8
867
+ movdqa xmm8, xmm4
868
+ psrld xmm8, 7
869
+ pslld xmm4, 25
870
+ por xmm4, xmm8
871
+ paddd xmm0, xmmword ptr [rsp+0C0H]
872
+ paddd xmm1, xmmword ptr [rsp+90H]
873
+ paddd xmm2, xmmword ptr [rsp+0F0H]
874
+ paddd xmm3, xmmword ptr [rsp+0E0H]
875
+ paddd xmm0, xmm4
876
+ paddd xmm1, xmm5
877
+ paddd xmm2, xmm6
878
+ paddd xmm3, xmm7
879
+ pxor xmm12, xmm0
880
+ pxor xmm13, xmm1
881
+ pxor xmm14, xmm2
882
+ pxor xmm15, xmm3
883
+ movdqa xmm8, xmmword ptr [ROT16]
884
+ pshufb xmm12, xmm8
885
+ pshufb xmm13, xmm8
886
+ pshufb xmm14, xmm8
887
+ pshufb xmm15, xmm8
888
+ movdqa xmm8, xmmword ptr [rsp+100H]
889
+ paddd xmm8, xmm12
890
+ paddd xmm9, xmm13
891
+ paddd xmm10, xmm14
892
+ paddd xmm11, xmm15
893
+ pxor xmm4, xmm8
894
+ pxor xmm5, xmm9
895
+ pxor xmm6, xmm10
896
+ pxor xmm7, xmm11
897
+ movdqa xmmword ptr [rsp+100H], xmm8
898
+ movdqa xmm8, xmm4
899
+ psrld xmm8, 12
900
+ pslld xmm4, 20
901
+ por xmm4, xmm8
902
+ movdqa xmm8, xmm5
903
+ psrld xmm8, 12
904
+ pslld xmm5, 20
905
+ por xmm5, xmm8
906
+ movdqa xmm8, xmm6
907
+ psrld xmm8, 12
908
+ pslld xmm6, 20
909
+ por xmm6, xmm8
910
+ movdqa xmm8, xmm7
911
+ psrld xmm8, 12
912
+ pslld xmm7, 20
913
+ por xmm7, xmm8
914
+ paddd xmm0, xmmword ptr [rsp+0D0H]
915
+ paddd xmm1, xmmword ptr [rsp+0B0H]
916
+ paddd xmm2, xmmword ptr [rsp+0A0H]
917
+ paddd xmm3, xmmword ptr [rsp+80H]
918
+ paddd xmm0, xmm4
919
+ paddd xmm1, xmm5
920
+ paddd xmm2, xmm6
921
+ paddd xmm3, xmm7
922
+ pxor xmm12, xmm0
923
+ pxor xmm13, xmm1
924
+ pxor xmm14, xmm2
925
+ pxor xmm15, xmm3
926
+ movdqa xmm8, xmmword ptr [ROT8]
927
+ pshufb xmm12, xmm8
928
+ pshufb xmm13, xmm8
929
+ pshufb xmm14, xmm8
930
+ pshufb xmm15, xmm8
931
+ movdqa xmm8, xmmword ptr [rsp+100H]
932
+ paddd xmm8, xmm12
933
+ paddd xmm9, xmm13
934
+ paddd xmm10, xmm14
935
+ paddd xmm11, xmm15
936
+ pxor xmm4, xmm8
937
+ pxor xmm5, xmm9
938
+ pxor xmm6, xmm10
939
+ pxor xmm7, xmm11
940
+ movdqa xmmword ptr [rsp+100H], xmm8
941
+ movdqa xmm8, xmm4
942
+ psrld xmm8, 7
943
+ pslld xmm4, 25
944
+ por xmm4, xmm8
945
+ movdqa xmm8, xmm5
946
+ psrld xmm8, 7
947
+ pslld xmm5, 25
948
+ por xmm5, xmm8
949
+ movdqa xmm8, xmm6
950
+ psrld xmm8, 7
951
+ pslld xmm6, 25
952
+ por xmm6, xmm8
953
+ movdqa xmm8, xmm7
954
+ psrld xmm8, 7
955
+ pslld xmm7, 25
956
+ por xmm7, xmm8
957
+ paddd xmm0, xmmword ptr [rsp+70H]
958
+ paddd xmm1, xmmword ptr [rsp+50H]
959
+ paddd xmm2, xmmword ptr [rsp]
960
+ paddd xmm3, xmmword ptr [rsp+60H]
961
+ paddd xmm0, xmm5
962
+ paddd xmm1, xmm6
963
+ paddd xmm2, xmm7
964
+ paddd xmm3, xmm4
965
+ pxor xmm15, xmm0
966
+ pxor xmm12, xmm1
967
+ pxor xmm13, xmm2
968
+ pxor xmm14, xmm3
969
+ movdqa xmm8, xmmword ptr [ROT16]
970
+ pshufb xmm15, xmm8
971
+ pshufb xmm12, xmm8
972
+ pshufb xmm13, xmm8
973
+ pshufb xmm14, xmm8
974
+ paddd xmm10, xmm15
975
+ paddd xmm11, xmm12
976
+ movdqa xmm8, xmmword ptr [rsp+100H]
977
+ paddd xmm8, xmm13
978
+ paddd xmm9, xmm14
979
+ pxor xmm5, xmm10
980
+ pxor xmm6, xmm11
981
+ pxor xmm7, xmm8
982
+ pxor xmm4, xmm9
983
+ movdqa xmmword ptr [rsp+100H], xmm8
984
+ movdqa xmm8, xmm5
985
+ psrld xmm8, 12
986
+ pslld xmm5, 20
987
+ por xmm5, xmm8
988
+ movdqa xmm8, xmm6
989
+ psrld xmm8, 12
990
+ pslld xmm6, 20
991
+ por xmm6, xmm8
992
+ movdqa xmm8, xmm7
993
+ psrld xmm8, 12
994
+ pslld xmm7, 20
995
+ por xmm7, xmm8
996
+ movdqa xmm8, xmm4
997
+ psrld xmm8, 12
998
+ pslld xmm4, 20
999
+ por xmm4, xmm8
1000
+ paddd xmm0, xmmword ptr [rsp+20H]
1001
+ paddd xmm1, xmmword ptr [rsp+30H]
1002
+ paddd xmm2, xmmword ptr [rsp+10H]
1003
+ paddd xmm3, xmmword ptr [rsp+40H]
1004
+ paddd xmm0, xmm5
1005
+ paddd xmm1, xmm6
1006
+ paddd xmm2, xmm7
1007
+ paddd xmm3, xmm4
1008
+ pxor xmm15, xmm0
1009
+ pxor xmm12, xmm1
1010
+ pxor xmm13, xmm2
1011
+ pxor xmm14, xmm3
1012
+ movdqa xmm8, xmmword ptr [ROT8]
1013
+ pshufb xmm15, xmm8
1014
+ pshufb xmm12, xmm8
1015
+ pshufb xmm13, xmm8
1016
+ pshufb xmm14, xmm8
1017
+ paddd xmm10, xmm15
1018
+ paddd xmm11, xmm12
1019
+ movdqa xmm8, xmmword ptr [rsp+100H]
1020
+ paddd xmm8, xmm13
1021
+ paddd xmm9, xmm14
1022
+ pxor xmm5, xmm10
1023
+ pxor xmm6, xmm11
1024
+ pxor xmm7, xmm8
1025
+ pxor xmm4, xmm9
1026
+ movdqa xmmword ptr [rsp+100H], xmm8
1027
+ movdqa xmm8, xmm5
1028
+ psrld xmm8, 7
1029
+ pslld xmm5, 25
1030
+ por xmm5, xmm8
1031
+ movdqa xmm8, xmm6
1032
+ psrld xmm8, 7
1033
+ pslld xmm6, 25
1034
+ por xmm6, xmm8
1035
+ movdqa xmm8, xmm7
1036
+ psrld xmm8, 7
1037
+ pslld xmm7, 25
1038
+ por xmm7, xmm8
1039
+ movdqa xmm8, xmm4
1040
+ psrld xmm8, 7
1041
+ pslld xmm4, 25
1042
+ por xmm4, xmm8
1043
+ paddd xmm0, xmmword ptr [rsp+90H]
1044
+ paddd xmm1, xmmword ptr [rsp+0B0H]
1045
+ paddd xmm2, xmmword ptr [rsp+80H]
1046
+ paddd xmm3, xmmword ptr [rsp+0F0H]
1047
+ paddd xmm0, xmm4
1048
+ paddd xmm1, xmm5
1049
+ paddd xmm2, xmm6
1050
+ paddd xmm3, xmm7
1051
+ pxor xmm12, xmm0
1052
+ pxor xmm13, xmm1
1053
+ pxor xmm14, xmm2
1054
+ pxor xmm15, xmm3
1055
+ movdqa xmm8, xmmword ptr [ROT16]
1056
+ pshufb xmm12, xmm8
1057
+ pshufb xmm13, xmm8
1058
+ pshufb xmm14, xmm8
1059
+ pshufb xmm15, xmm8
1060
+ movdqa xmm8, xmmword ptr [rsp+100H]
1061
+ paddd xmm8, xmm12
1062
+ paddd xmm9, xmm13
1063
+ paddd xmm10, xmm14
1064
+ paddd xmm11, xmm15
1065
+ pxor xmm4, xmm8
1066
+ pxor xmm5, xmm9
1067
+ pxor xmm6, xmm10
1068
+ pxor xmm7, xmm11
1069
+ movdqa xmmword ptr [rsp+100H], xmm8
1070
+ movdqa xmm8, xmm4
1071
+ psrld xmm8, 12
1072
+ pslld xmm4, 20
1073
+ por xmm4, xmm8
1074
+ movdqa xmm8, xmm5
1075
+ psrld xmm8, 12
1076
+ pslld xmm5, 20
1077
+ por xmm5, xmm8
1078
+ movdqa xmm8, xmm6
1079
+ psrld xmm8, 12
1080
+ pslld xmm6, 20
1081
+ por xmm6, xmm8
1082
+ movdqa xmm8, xmm7
1083
+ psrld xmm8, 12
1084
+ pslld xmm7, 20
1085
+ por xmm7, xmm8
1086
+ paddd xmm0, xmmword ptr [rsp+0E0H]
1087
+ paddd xmm1, xmmword ptr [rsp+50H]
1088
+ paddd xmm2, xmmword ptr [rsp+0C0H]
1089
+ paddd xmm3, xmmword ptr [rsp+10H]
1090
+ paddd xmm0, xmm4
1091
+ paddd xmm1, xmm5
1092
+ paddd xmm2, xmm6
1093
+ paddd xmm3, xmm7
1094
+ pxor xmm12, xmm0
1095
+ pxor xmm13, xmm1
1096
+ pxor xmm14, xmm2
1097
+ pxor xmm15, xmm3
1098
+ movdqa xmm8, xmmword ptr [ROT8]
1099
+ pshufb xmm12, xmm8
1100
+ pshufb xmm13, xmm8
1101
+ pshufb xmm14, xmm8
1102
+ pshufb xmm15, xmm8
1103
+ movdqa xmm8, xmmword ptr [rsp+100H]
1104
+ paddd xmm8, xmm12
1105
+ paddd xmm9, xmm13
1106
+ paddd xmm10, xmm14
1107
+ paddd xmm11, xmm15
1108
+ pxor xmm4, xmm8
1109
+ pxor xmm5, xmm9
1110
+ pxor xmm6, xmm10
1111
+ pxor xmm7, xmm11
1112
+ movdqa xmmword ptr [rsp+100H], xmm8
1113
+ movdqa xmm8, xmm4
1114
+ psrld xmm8, 7
1115
+ pslld xmm4, 25
1116
+ por xmm4, xmm8
1117
+ movdqa xmm8, xmm5
1118
+ psrld xmm8, 7
1119
+ pslld xmm5, 25
1120
+ por xmm5, xmm8
1121
+ movdqa xmm8, xmm6
1122
+ psrld xmm8, 7
1123
+ pslld xmm6, 25
1124
+ por xmm6, xmm8
1125
+ movdqa xmm8, xmm7
1126
+ psrld xmm8, 7
1127
+ pslld xmm7, 25
1128
+ por xmm7, xmm8
1129
+ paddd xmm0, xmmword ptr [rsp+0D0H]
1130
+ paddd xmm1, xmmword ptr [rsp]
1131
+ paddd xmm2, xmmword ptr [rsp+20H]
1132
+ paddd xmm3, xmmword ptr [rsp+40H]
1133
+ paddd xmm0, xmm5
1134
+ paddd xmm1, xmm6
1135
+ paddd xmm2, xmm7
1136
+ paddd xmm3, xmm4
1137
+ pxor xmm15, xmm0
1138
+ pxor xmm12, xmm1
1139
+ pxor xmm13, xmm2
1140
+ pxor xmm14, xmm3
1141
+ movdqa xmm8, xmmword ptr [ROT16]
1142
+ pshufb xmm15, xmm8
1143
+ pshufb xmm12, xmm8
1144
+ pshufb xmm13, xmm8
1145
+ pshufb xmm14, xmm8
1146
+ paddd xmm10, xmm15
1147
+ paddd xmm11, xmm12
1148
+ movdqa xmm8, xmmword ptr [rsp+100H]
1149
+ paddd xmm8, xmm13
1150
+ paddd xmm9, xmm14
1151
+ pxor xmm5, xmm10
1152
+ pxor xmm6, xmm11
1153
+ pxor xmm7, xmm8
1154
+ pxor xmm4, xmm9
1155
+ movdqa xmmword ptr [rsp+100H], xmm8
1156
+ movdqa xmm8, xmm5
1157
+ psrld xmm8, 12
1158
+ pslld xmm5, 20
1159
+ por xmm5, xmm8
1160
+ movdqa xmm8, xmm6
1161
+ psrld xmm8, 12
1162
+ pslld xmm6, 20
1163
+ por xmm6, xmm8
1164
+ movdqa xmm8, xmm7
1165
+ psrld xmm8, 12
1166
+ pslld xmm7, 20
1167
+ por xmm7, xmm8
1168
+ movdqa xmm8, xmm4
1169
+ psrld xmm8, 12
1170
+ pslld xmm4, 20
1171
+ por xmm4, xmm8
1172
+ paddd xmm0, xmmword ptr [rsp+30H]
1173
+ paddd xmm1, xmmword ptr [rsp+0A0H]
1174
+ paddd xmm2, xmmword ptr [rsp+60H]
1175
+ paddd xmm3, xmmword ptr [rsp+70H]
1176
+ paddd xmm0, xmm5
1177
+ paddd xmm1, xmm6
1178
+ paddd xmm2, xmm7
1179
+ paddd xmm3, xmm4
1180
+ pxor xmm15, xmm0
1181
+ pxor xmm12, xmm1
1182
+ pxor xmm13, xmm2
1183
+ pxor xmm14, xmm3
1184
+ movdqa xmm8, xmmword ptr [ROT8]
1185
+ pshufb xmm15, xmm8
1186
+ pshufb xmm12, xmm8
1187
+ pshufb xmm13, xmm8
1188
+ pshufb xmm14, xmm8
1189
+ paddd xmm10, xmm15
1190
+ paddd xmm11, xmm12
1191
+ movdqa xmm8, xmmword ptr [rsp+100H]
1192
+ paddd xmm8, xmm13
1193
+ paddd xmm9, xmm14
1194
+ pxor xmm5, xmm10
1195
+ pxor xmm6, xmm11
1196
+ pxor xmm7, xmm8
1197
+ pxor xmm4, xmm9
1198
+ movdqa xmmword ptr [rsp+100H], xmm8
1199
+ movdqa xmm8, xmm5
1200
+ psrld xmm8, 7
1201
+ pslld xmm5, 25
1202
+ por xmm5, xmm8
1203
+ movdqa xmm8, xmm6
1204
+ psrld xmm8, 7
1205
+ pslld xmm6, 25
1206
+ por xmm6, xmm8
1207
+ movdqa xmm8, xmm7
1208
+ psrld xmm8, 7
1209
+ pslld xmm7, 25
1210
+ por xmm7, xmm8
1211
+ movdqa xmm8, xmm4
1212
+ psrld xmm8, 7
1213
+ pslld xmm4, 25
1214
+ por xmm4, xmm8
1215
+ paddd xmm0, xmmword ptr [rsp+0B0H]
1216
+ paddd xmm1, xmmword ptr [rsp+50H]
1217
+ paddd xmm2, xmmword ptr [rsp+10H]
1218
+ paddd xmm3, xmmword ptr [rsp+80H]
1219
+ paddd xmm0, xmm4
1220
+ paddd xmm1, xmm5
1221
+ paddd xmm2, xmm6
1222
+ paddd xmm3, xmm7
1223
+ pxor xmm12, xmm0
1224
+ pxor xmm13, xmm1
1225
+ pxor xmm14, xmm2
1226
+ pxor xmm15, xmm3
1227
+ movdqa xmm8, xmmword ptr [ROT16]
1228
+ pshufb xmm12, xmm8
1229
+ pshufb xmm13, xmm8
1230
+ pshufb xmm14, xmm8
1231
+ pshufb xmm15, xmm8
1232
+ movdqa xmm8, xmmword ptr [rsp+100H]
1233
+ paddd xmm8, xmm12
1234
+ paddd xmm9, xmm13
1235
+ paddd xmm10, xmm14
1236
+ paddd xmm11, xmm15
1237
+ pxor xmm4, xmm8
1238
+ pxor xmm5, xmm9
1239
+ pxor xmm6, xmm10
1240
+ pxor xmm7, xmm11
1241
+ movdqa xmmword ptr [rsp+100H], xmm8
1242
+ movdqa xmm8, xmm4
1243
+ psrld xmm8, 12
1244
+ pslld xmm4, 20
1245
+ por xmm4, xmm8
1246
+ movdqa xmm8, xmm5
1247
+ psrld xmm8, 12
1248
+ pslld xmm5, 20
1249
+ por xmm5, xmm8
1250
+ movdqa xmm8, xmm6
1251
+ psrld xmm8, 12
1252
+ pslld xmm6, 20
1253
+ por xmm6, xmm8
1254
+ movdqa xmm8, xmm7
1255
+ psrld xmm8, 12
1256
+ pslld xmm7, 20
1257
+ por xmm7, xmm8
1258
+ paddd xmm0, xmmword ptr [rsp+0F0H]
1259
+ paddd xmm1, xmmword ptr [rsp]
1260
+ paddd xmm2, xmmword ptr [rsp+90H]
1261
+ paddd xmm3, xmmword ptr [rsp+60H]
1262
+ paddd xmm0, xmm4
1263
+ paddd xmm1, xmm5
1264
+ paddd xmm2, xmm6
1265
+ paddd xmm3, xmm7
1266
+ pxor xmm12, xmm0
1267
+ pxor xmm13, xmm1
1268
+ pxor xmm14, xmm2
1269
+ pxor xmm15, xmm3
1270
+ movdqa xmm8, xmmword ptr [ROT8]
1271
+ pshufb xmm12, xmm8
1272
+ pshufb xmm13, xmm8
1273
+ pshufb xmm14, xmm8
1274
+ pshufb xmm15, xmm8
1275
+ movdqa xmm8, xmmword ptr [rsp+100H]
1276
+ paddd xmm8, xmm12
1277
+ paddd xmm9, xmm13
1278
+ paddd xmm10, xmm14
1279
+ paddd xmm11, xmm15
1280
+ pxor xmm4, xmm8
1281
+ pxor xmm5, xmm9
1282
+ pxor xmm6, xmm10
1283
+ pxor xmm7, xmm11
1284
+ movdqa xmmword ptr [rsp+100H], xmm8
1285
+ movdqa xmm8, xmm4
1286
+ psrld xmm8, 7
1287
+ pslld xmm4, 25
1288
+ por xmm4, xmm8
1289
+ movdqa xmm8, xmm5
1290
+ psrld xmm8, 7
1291
+ pslld xmm5, 25
1292
+ por xmm5, xmm8
1293
+ movdqa xmm8, xmm6
1294
+ psrld xmm8, 7
1295
+ pslld xmm6, 25
1296
+ por xmm6, xmm8
1297
+ movdqa xmm8, xmm7
1298
+ psrld xmm8, 7
1299
+ pslld xmm7, 25
1300
+ por xmm7, xmm8
1301
+ paddd xmm0, xmmword ptr [rsp+0E0H]
1302
+ paddd xmm1, xmmword ptr [rsp+20H]
1303
+ paddd xmm2, xmmword ptr [rsp+30H]
1304
+ paddd xmm3, xmmword ptr [rsp+70H]
1305
+ paddd xmm0, xmm5
1306
+ paddd xmm1, xmm6
1307
+ paddd xmm2, xmm7
1308
+ paddd xmm3, xmm4
1309
+ pxor xmm15, xmm0
1310
+ pxor xmm12, xmm1
1311
+ pxor xmm13, xmm2
1312
+ pxor xmm14, xmm3
1313
+ movdqa xmm8, xmmword ptr [ROT16]
1314
+ pshufb xmm15, xmm8
1315
+ pshufb xmm12, xmm8
1316
+ pshufb xmm13, xmm8
1317
+ pshufb xmm14, xmm8
1318
+ paddd xmm10, xmm15
1319
+ paddd xmm11, xmm12
1320
+ movdqa xmm8, xmmword ptr [rsp+100H]
1321
+ paddd xmm8, xmm13
1322
+ paddd xmm9, xmm14
1323
+ pxor xmm5, xmm10
1324
+ pxor xmm6, xmm11
1325
+ pxor xmm7, xmm8
1326
+ pxor xmm4, xmm9
1327
+ movdqa xmmword ptr [rsp+100H], xmm8
1328
+ movdqa xmm8, xmm5
1329
+ psrld xmm8, 12
1330
+ pslld xmm5, 20
1331
+ por xmm5, xmm8
1332
+ movdqa xmm8, xmm6
1333
+ psrld xmm8, 12
1334
+ pslld xmm6, 20
1335
+ por xmm6, xmm8
1336
+ movdqa xmm8, xmm7
1337
+ psrld xmm8, 12
1338
+ pslld xmm7, 20
1339
+ por xmm7, xmm8
1340
+ movdqa xmm8, xmm4
1341
+ psrld xmm8, 12
1342
+ pslld xmm4, 20
1343
+ por xmm4, xmm8
1344
+ paddd xmm0, xmmword ptr [rsp+0A0H]
1345
+ paddd xmm1, xmmword ptr [rsp+0C0H]
1346
+ paddd xmm2, xmmword ptr [rsp+40H]
1347
+ paddd xmm3, xmmword ptr [rsp+0D0H]
1348
+ paddd xmm0, xmm5
1349
+ paddd xmm1, xmm6
1350
+ paddd xmm2, xmm7
1351
+ paddd xmm3, xmm4
1352
+ pxor xmm15, xmm0
1353
+ pxor xmm12, xmm1
1354
+ pxor xmm13, xmm2
1355
+ pxor xmm14, xmm3
1356
+ movdqa xmm8, xmmword ptr [ROT8]
1357
+ pshufb xmm15, xmm8
1358
+ pshufb xmm12, xmm8
1359
+ pshufb xmm13, xmm8
1360
+ pshufb xmm14, xmm8
1361
+ paddd xmm10, xmm15
1362
+ paddd xmm11, xmm12
1363
+ movdqa xmm8, xmmword ptr [rsp+100H]
1364
+ paddd xmm8, xmm13
1365
+ paddd xmm9, xmm14
1366
+ pxor xmm5, xmm10
1367
+ pxor xmm6, xmm11
1368
+ pxor xmm7, xmm8
1369
+ pxor xmm4, xmm9
1370
+ pxor xmm0, xmm8
1371
+ pxor xmm1, xmm9
1372
+ pxor xmm2, xmm10
1373
+ pxor xmm3, xmm11
1374
+ movdqa xmm8, xmm5
1375
+ psrld xmm8, 7
1376
+ pslld xmm5, 25
1377
+ por xmm5, xmm8
1378
+ movdqa xmm8, xmm6
1379
+ psrld xmm8, 7
1380
+ pslld xmm6, 25
1381
+ por xmm6, xmm8
1382
+ movdqa xmm8, xmm7
1383
+ psrld xmm8, 7
1384
+ pslld xmm7, 25
1385
+ por xmm7, xmm8
1386
+ movdqa xmm8, xmm4
1387
+ psrld xmm8, 7
1388
+ pslld xmm4, 25
1389
+ por xmm4, xmm8
1390
+ pxor xmm4, xmm12
1391
+ pxor xmm5, xmm13
1392
+ pxor xmm6, xmm14
1393
+ pxor xmm7, xmm15
1394
+ mov eax, r13d
1395
+ jne innerloop4
1396
+ movdqa xmm9, xmm0
1397
+ punpckldq xmm0, xmm1
1398
+ punpckhdq xmm9, xmm1
1399
+ movdqa xmm11, xmm2
1400
+ punpckldq xmm2, xmm3
1401
+ punpckhdq xmm11, xmm3
1402
+ movdqa xmm1, xmm0
1403
+ punpcklqdq xmm0, xmm2
1404
+ punpckhqdq xmm1, xmm2
1405
+ movdqa xmm3, xmm9
1406
+ punpcklqdq xmm9, xmm11
1407
+ punpckhqdq xmm3, xmm11
1408
+ movdqu xmmword ptr [rbx], xmm0
1409
+ movdqu xmmword ptr [rbx+20H], xmm1
1410
+ movdqu xmmword ptr [rbx+40H], xmm9
1411
+ movdqu xmmword ptr [rbx+60H], xmm3
1412
+ movdqa xmm9, xmm4
1413
+ punpckldq xmm4, xmm5
1414
+ punpckhdq xmm9, xmm5
1415
+ movdqa xmm11, xmm6
1416
+ punpckldq xmm6, xmm7
1417
+ punpckhdq xmm11, xmm7
1418
+ movdqa xmm5, xmm4
1419
+ punpcklqdq xmm4, xmm6
1420
+ punpckhqdq xmm5, xmm6
1421
+ movdqa xmm7, xmm9
1422
+ punpcklqdq xmm9, xmm11
1423
+ punpckhqdq xmm7, xmm11
1424
+ movdqu xmmword ptr [rbx+10H], xmm4
1425
+ movdqu xmmword ptr [rbx+30H], xmm5
1426
+ movdqu xmmword ptr [rbx+50H], xmm9
1427
+ movdqu xmmword ptr [rbx+70H], xmm7
1428
+ movdqa xmm1, xmmword ptr [rsp+110H]
1429
+ movdqa xmm0, xmm1
1430
+ paddd xmm1, xmmword ptr [rsp+150H]
1431
+ movdqa xmmword ptr [rsp+110H], xmm1
1432
+ pxor xmm0, xmmword ptr [CMP_MSB_MASK]
1433
+ pxor xmm1, xmmword ptr [CMP_MSB_MASK]
1434
+ pcmpgtd xmm0, xmm1
1435
+ movdqa xmm1, xmmword ptr [rsp+120H]
1436
+ psubd xmm1, xmm0
1437
+ movdqa xmmword ptr [rsp+120H], xmm1
1438
+ add rbx, 128
1439
+ add rdi, 32
1440
+ sub rsi, 4
1441
+ cmp rsi, 4
1442
+ jnc outerloop4
1443
+ test rsi, rsi
1444
+ jne final3blocks
1445
+ unwind:
1446
+ movdqa xmm6, xmmword ptr [rsp+170H]
1447
+ movdqa xmm7, xmmword ptr [rsp+180H]
1448
+ movdqa xmm8, xmmword ptr [rsp+190H]
1449
+ movdqa xmm9, xmmword ptr [rsp+1A0H]
1450
+ movdqa xmm10, xmmword ptr [rsp+1B0H]
1451
+ movdqa xmm11, xmmword ptr [rsp+1C0H]
1452
+ movdqa xmm12, xmmword ptr [rsp+1D0H]
1453
+ movdqa xmm13, xmmword ptr [rsp+1E0H]
1454
+ movdqa xmm14, xmmword ptr [rsp+1F0H]
1455
+ movdqa xmm15, xmmword ptr [rsp+200H]
1456
+ mov rsp, rbp
1457
+ pop rbp
1458
+ pop rbx
1459
+ pop rdi
1460
+ pop rsi
1461
+ pop r12
1462
+ pop r13
1463
+ pop r14
1464
+ pop r15
1465
+ ret
1466
+ ALIGN 16
1467
+ final3blocks:
1468
+ test esi, 2H
1469
+ je final1block
1470
+ movups xmm0, xmmword ptr [rcx]
1471
+ movups xmm1, xmmword ptr [rcx+10H]
1472
+ movaps xmm8, xmm0
1473
+ movaps xmm9, xmm1
1474
+ movd xmm13, dword ptr [rsp+110H]
1475
+ pinsrd xmm13, dword ptr [rsp+120H], 1
1476
+ pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2
1477
+ movaps xmmword ptr [rsp], xmm13
1478
+ movd xmm14, dword ptr [rsp+114H]
1479
+ pinsrd xmm14, dword ptr [rsp+124H], 1
1480
+ pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN], 2
1481
+ movaps xmmword ptr [rsp+10H], xmm14
1482
+ mov r8, qword ptr [rdi]
1483
+ mov r9, qword ptr [rdi+8H]
1484
+ movzx eax, byte ptr [rbp+80H]
1485
+ or eax, r13d
1486
+ xor edx, edx
1487
+ innerloop2:
1488
+ mov r14d, eax
1489
+ or eax, r12d
1490
+ add rdx, 64
1491
+ cmp rdx, r15
1492
+ cmovne eax, r14d
1493
+ movaps xmm2, xmmword ptr [BLAKE3_IV]
1494
+ movaps xmm10, xmm2
1495
+ movups xmm4, xmmword ptr [r8+rdx-40H]
1496
+ movups xmm5, xmmword ptr [r8+rdx-30H]
1497
+ movaps xmm3, xmm4
1498
+ shufps xmm4, xmm5, 136
1499
+ shufps xmm3, xmm5, 221
1500
+ movaps xmm5, xmm3
1501
+ movups xmm6, xmmword ptr [r8+rdx-20H]
1502
+ movups xmm7, xmmword ptr [r8+rdx-10H]
1503
+ movaps xmm3, xmm6
1504
+ shufps xmm6, xmm7, 136
1505
+ pshufd xmm6, xmm6, 93H
1506
+ shufps xmm3, xmm7, 221
1507
+ pshufd xmm7, xmm3, 93H
1508
+ movups xmm12, xmmword ptr [r9+rdx-40H]
1509
+ movups xmm13, xmmword ptr [r9+rdx-30H]
1510
+ movaps xmm11, xmm12
1511
+ shufps xmm12, xmm13, 136
1512
+ shufps xmm11, xmm13, 221
1513
+ movaps xmm13, xmm11
1514
+ movups xmm14, xmmword ptr [r9+rdx-20H]
1515
+ movups xmm15, xmmword ptr [r9+rdx-10H]
1516
+ movaps xmm11, xmm14
1517
+ shufps xmm14, xmm15, 136
1518
+ pshufd xmm14, xmm14, 93H
1519
+ shufps xmm11, xmm15, 221
1520
+ pshufd xmm15, xmm11, 93H
1521
+ movaps xmm3, xmmword ptr [rsp]
1522
+ movaps xmm11, xmmword ptr [rsp+10H]
1523
+ pinsrd xmm3, eax, 3
1524
+ pinsrd xmm11, eax, 3
1525
+ mov al, 7
1526
+ roundloop2:
1527
+ paddd xmm0, xmm4
1528
+ paddd xmm8, xmm12
1529
+ movaps xmmword ptr [rsp+20H], xmm4
1530
+ movaps xmmword ptr [rsp+30H], xmm12
1531
+ paddd xmm0, xmm1
1532
+ paddd xmm8, xmm9
1533
+ pxor xmm3, xmm0
1534
+ pxor xmm11, xmm8
1535
+ movaps xmm12, xmmword ptr [ROT16]
1536
+ pshufb xmm3, xmm12
1537
+ pshufb xmm11, xmm12
1538
+ paddd xmm2, xmm3
1539
+ paddd xmm10, xmm11
1540
+ pxor xmm1, xmm2
1541
+ pxor xmm9, xmm10
1542
+ movdqa xmm4, xmm1
1543
+ pslld xmm1, 20
1544
+ psrld xmm4, 12
1545
+ por xmm1, xmm4
1546
+ movdqa xmm4, xmm9
1547
+ pslld xmm9, 20
1548
+ psrld xmm4, 12
1549
+ por xmm9, xmm4
1550
+ paddd xmm0, xmm5
1551
+ paddd xmm8, xmm13
1552
+ movaps xmmword ptr [rsp+40H], xmm5
1553
+ movaps xmmword ptr [rsp+50H], xmm13
1554
+ paddd xmm0, xmm1
1555
+ paddd xmm8, xmm9
1556
+ pxor xmm3, xmm0
1557
+ pxor xmm11, xmm8
1558
+ movaps xmm13, xmmword ptr [ROT8]
1559
+ pshufb xmm3, xmm13
1560
+ pshufb xmm11, xmm13
1561
+ paddd xmm2, xmm3
1562
+ paddd xmm10, xmm11
1563
+ pxor xmm1, xmm2
1564
+ pxor xmm9, xmm10
1565
+ movdqa xmm4, xmm1
1566
+ pslld xmm1, 25
1567
+ psrld xmm4, 7
1568
+ por xmm1, xmm4
1569
+ movdqa xmm4, xmm9
1570
+ pslld xmm9, 25
1571
+ psrld xmm4, 7
1572
+ por xmm9, xmm4
1573
+ pshufd xmm0, xmm0, 93H
1574
+ pshufd xmm8, xmm8, 93H
1575
+ pshufd xmm3, xmm3, 4EH
1576
+ pshufd xmm11, xmm11, 4EH
1577
+ pshufd xmm2, xmm2, 39H
1578
+ pshufd xmm10, xmm10, 39H
1579
+ paddd xmm0, xmm6
1580
+ paddd xmm8, xmm14
1581
+ paddd xmm0, xmm1
1582
+ paddd xmm8, xmm9
1583
+ pxor xmm3, xmm0
1584
+ pxor xmm11, xmm8
1585
+ pshufb xmm3, xmm12
1586
+ pshufb xmm11, xmm12
1587
+ paddd xmm2, xmm3
1588
+ paddd xmm10, xmm11
1589
+ pxor xmm1, xmm2
1590
+ pxor xmm9, xmm10
1591
+ movdqa xmm4, xmm1
1592
+ pslld xmm1, 20
1593
+ psrld xmm4, 12
1594
+ por xmm1, xmm4
1595
+ movdqa xmm4, xmm9
1596
+ pslld xmm9, 20
1597
+ psrld xmm4, 12
1598
+ por xmm9, xmm4
1599
+ paddd xmm0, xmm7
1600
+ paddd xmm8, xmm15
1601
+ paddd xmm0, xmm1
1602
+ paddd xmm8, xmm9
1603
+ pxor xmm3, xmm0
1604
+ pxor xmm11, xmm8
1605
+ pshufb xmm3, xmm13
1606
+ pshufb xmm11, xmm13
1607
+ paddd xmm2, xmm3
1608
+ paddd xmm10, xmm11
1609
+ pxor xmm1, xmm2
1610
+ pxor xmm9, xmm10
1611
+ movdqa xmm4, xmm1
1612
+ pslld xmm1, 25
1613
+ psrld xmm4, 7
1614
+ por xmm1, xmm4
1615
+ movdqa xmm4, xmm9
1616
+ pslld xmm9, 25
1617
+ psrld xmm4, 7
1618
+ por xmm9, xmm4
1619
+ pshufd xmm0, xmm0, 39H
1620
+ pshufd xmm8, xmm8, 39H
1621
+ pshufd xmm3, xmm3, 4EH
1622
+ pshufd xmm11, xmm11, 4EH
1623
+ pshufd xmm2, xmm2, 93H
1624
+ pshufd xmm10, xmm10, 93H
1625
+ dec al
1626
+ je endroundloop2
1627
+ movdqa xmm12, xmmword ptr [rsp+20H]
1628
+ movdqa xmm5, xmmword ptr [rsp+40H]
1629
+ pshufd xmm13, xmm12, 0FH
1630
+ shufps xmm12, xmm5, 214
1631
+ pshufd xmm4, xmm12, 39H
1632
+ movdqa xmm12, xmm6
1633
+ shufps xmm12, xmm7, 250
1634
+ pblendw xmm13, xmm12, 0CCH
1635
+ movdqa xmm12, xmm7
1636
+ punpcklqdq xmm12, xmm5
1637
+ pblendw xmm12, xmm6, 0C0H
1638
+ pshufd xmm12, xmm12, 78H
1639
+ punpckhdq xmm5, xmm7
1640
+ punpckldq xmm6, xmm5
1641
+ pshufd xmm7, xmm6, 1EH
1642
+ movdqa xmmword ptr [rsp+20H], xmm13
1643
+ movdqa xmmword ptr [rsp+40H], xmm12
1644
+ movdqa xmm5, xmmword ptr [rsp+30H]
1645
+ movdqa xmm13, xmmword ptr [rsp+50H]
1646
+ pshufd xmm6, xmm5, 0FH
1647
+ shufps xmm5, xmm13, 214
1648
+ pshufd xmm12, xmm5, 39H
1649
+ movdqa xmm5, xmm14
1650
+ shufps xmm5, xmm15, 250
1651
+ pblendw xmm6, xmm5, 0CCH
1652
+ movdqa xmm5, xmm15
1653
+ punpcklqdq xmm5, xmm13
1654
+ pblendw xmm5, xmm14, 0C0H
1655
+ pshufd xmm5, xmm5, 78H
1656
+ punpckhdq xmm13, xmm15
1657
+ punpckldq xmm14, xmm13
1658
+ pshufd xmm15, xmm14, 1EH
1659
+ movdqa xmm13, xmm6
1660
+ movdqa xmm14, xmm5
1661
+ movdqa xmm5, xmmword ptr [rsp+20H]
1662
+ movdqa xmm6, xmmword ptr [rsp+40H]
1663
+ jmp roundloop2
1664
+ endroundloop2:
1665
+ pxor xmm0, xmm2
1666
+ pxor xmm1, xmm3
1667
+ pxor xmm8, xmm10
1668
+ pxor xmm9, xmm11
1669
+ mov eax, r13d
1670
+ cmp rdx, r15
1671
+ jne innerloop2
1672
+ movups xmmword ptr [rbx], xmm0
1673
+ movups xmmword ptr [rbx+10H], xmm1
1674
+ movups xmmword ptr [rbx+20H], xmm8
1675
+ movups xmmword ptr [rbx+30H], xmm9
1676
+ movdqa xmm0, xmmword ptr [rsp+130H]
1677
+ movdqa xmm1, xmmword ptr [rsp+110H]
1678
+ movdqa xmm2, xmmword ptr [rsp+120H]
1679
+ movdqu xmm3, xmmword ptr [rsp+118H]
1680
+ movdqu xmm4, xmmword ptr [rsp+128H]
1681
+ blendvps xmm1, xmm3, xmm0
1682
+ blendvps xmm2, xmm4, xmm0
1683
+ movdqa xmmword ptr [rsp+110H], xmm1
1684
+ movdqa xmmword ptr [rsp+120H], xmm2
1685
+ add rdi, 16
1686
+ add rbx, 64
1687
+ sub rsi, 2
1688
+ final1block:
1689
+ test esi, 1H
1690
+ je unwind
1691
+ movups xmm0, xmmword ptr [rcx]
1692
+ movups xmm1, xmmword ptr [rcx+10H]
1693
+ movd xmm13, dword ptr [rsp+110H]
1694
+ pinsrd xmm13, dword ptr [rsp+120H], 1
1695
+ pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN], 2
1696
+ movaps xmm14, xmmword ptr [ROT8]
1697
+ movaps xmm15, xmmword ptr [ROT16]
1698
+ mov r8, qword ptr [rdi]
1699
+ movzx eax, byte ptr [rbp+80H]
1700
+ or eax, r13d
1701
+ xor edx, edx
1702
+ innerloop1:
1703
+ mov r14d, eax
1704
+ or eax, r12d
1705
+ add rdx, 64
1706
+ cmp rdx, r15
1707
+ cmovne eax, r14d
1708
+ movaps xmm2, xmmword ptr [BLAKE3_IV]
1709
+ movaps xmm3, xmm13
1710
+ pinsrd xmm3, eax, 3
1711
+ movups xmm4, xmmword ptr [r8+rdx-40H]
1712
+ movups xmm5, xmmword ptr [r8+rdx-30H]
1713
+ movaps xmm8, xmm4
1714
+ shufps xmm4, xmm5, 136
1715
+ shufps xmm8, xmm5, 221
1716
+ movaps xmm5, xmm8
1717
+ movups xmm6, xmmword ptr [r8+rdx-20H]
1718
+ movups xmm7, xmmword ptr [r8+rdx-10H]
1719
+ movaps xmm8, xmm6
1720
+ shufps xmm6, xmm7, 136
1721
+ pshufd xmm6, xmm6, 93H
1722
+ shufps xmm8, xmm7, 221
1723
+ pshufd xmm7, xmm8, 93H
1724
+ mov al, 7
1725
+ roundloop1:
1726
+ paddd xmm0, xmm4
1727
+ paddd xmm0, xmm1
1728
+ pxor xmm3, xmm0
1729
+ pshufb xmm3, xmm15
1730
+ paddd xmm2, xmm3
1731
+ pxor xmm1, xmm2
1732
+ movdqa xmm11, xmm1
1733
+ pslld xmm1, 20
1734
+ psrld xmm11, 12
1735
+ por xmm1, xmm11
1736
+ paddd xmm0, xmm5
1737
+ paddd xmm0, xmm1
1738
+ pxor xmm3, xmm0
1739
+ pshufb xmm3, xmm14
1740
+ paddd xmm2, xmm3
1741
+ pxor xmm1, xmm2
1742
+ movdqa xmm11, xmm1
1743
+ pslld xmm1, 25
1744
+ psrld xmm11, 7
1745
+ por xmm1, xmm11
1746
+ pshufd xmm0, xmm0, 93H
1747
+ pshufd xmm3, xmm3, 4EH
1748
+ pshufd xmm2, xmm2, 39H
1749
+ paddd xmm0, xmm6
1750
+ paddd xmm0, xmm1
1751
+ pxor xmm3, xmm0
1752
+ pshufb xmm3, xmm15
1753
+ paddd xmm2, xmm3
1754
+ pxor xmm1, xmm2
1755
+ movdqa xmm11, xmm1
1756
+ pslld xmm1, 20
1757
+ psrld xmm11, 12
1758
+ por xmm1, xmm11
1759
+ paddd xmm0, xmm7
1760
+ paddd xmm0, xmm1
1761
+ pxor xmm3, xmm0
1762
+ pshufb xmm3, xmm14
1763
+ paddd xmm2, xmm3
1764
+ pxor xmm1, xmm2
1765
+ movdqa xmm11, xmm1
1766
+ pslld xmm1, 25
1767
+ psrld xmm11, 7
1768
+ por xmm1, xmm11
1769
+ pshufd xmm0, xmm0, 39H
1770
+ pshufd xmm3, xmm3, 4EH
1771
+ pshufd xmm2, xmm2, 93H
1772
+ dec al
1773
+ jz endroundloop1
1774
+ movdqa xmm8, xmm4
1775
+ shufps xmm8, xmm5, 214
1776
+ pshufd xmm9, xmm4, 0FH
1777
+ pshufd xmm4, xmm8, 39H
1778
+ movdqa xmm8, xmm6
1779
+ shufps xmm8, xmm7, 250
1780
+ pblendw xmm9, xmm8, 0CCH
1781
+ movdqa xmm8, xmm7
1782
+ punpcklqdq xmm8, xmm5
1783
+ pblendw xmm8, xmm6, 0C0H
1784
+ pshufd xmm8, xmm8, 78H
1785
+ punpckhdq xmm5, xmm7
1786
+ punpckldq xmm6, xmm5
1787
+ pshufd xmm7, xmm6, 1EH
1788
+ movdqa xmm5, xmm9
1789
+ movdqa xmm6, xmm8
1790
+ jmp roundloop1
1791
+ endroundloop1:
1792
+ pxor xmm0, xmm2
1793
+ pxor xmm1, xmm3
1794
+ mov eax, r13d
1795
+ cmp rdx, r15
1796
+ jne innerloop1
1797
+ movups xmmword ptr [rbx], xmm0
1798
+ movups xmmword ptr [rbx+10H], xmm1
1799
+ jmp unwind
1800
+ _blake3_hash_many_sse41 ENDP
1801
+ blake3_hash_many_sse41 ENDP
1802
+
1803
+ ;-----------------------------------------------------------------------
+ ; blake3_compress_in_place_sse41 (alias _blake3_compress_in_place_sse41)
+ ;
+ ; Runs 7 rounds of add/xor/rotate mixing over a 4x4 dword state held in
+ ; xmm0-xmm3 and writes the folded 32-byte result back over the input.
+ ;
+ ; ABI:   Microsoft x64
+ ; In:    rcx      -> 32 bytes (state rows 0-1); read, then overwritten
+ ;        rdx      -> 64-byte message block (read with unaligned loads)
+ ;        r8b      =  byte placed in dword 2 of state row 3
+ ;        r9       =  64-bit value placed in dwords 0-1 of state row 3
+ ;        5th arg  =  byte (on the stack) placed in dword 3 of state row 3
+ ;        NOTE(review): hash_many in this file fills the same row with its
+ ;        counter words, BLAKE3_BLOCK_LEN and a flags byte, so r8b / r9 /
+ ;        5th arg are presumably block_len / counter / flags - confirm
+ ;        against the C prototype.
+ ; Out:   [rcx]     = row0 xor row2
+ ;        [rcx+10H] = row1 xor row3
+ ; Clobb: rax, r8, xmm0-xmm5, flags
+ ; Saved: xmm6-xmm9, xmm11, xmm14, xmm15.  xmm6-xmm15 are nonvolatile in
+ ;        this ABI, so every one of them that is written below is spilled
+ ;        here and reloaded before returning.
+ ; Stack: 120 bytes.  rsp is 8 mod 16 on entry, so rsp-120 is 16-byte
+ ;        aligned, which the movdqa spills require.  The 5th argument sits
+ ;        at entry_rsp+28H (return address + 32-byte home space), i.e.
+ ;        rsp+0A0H after the adjustment.
+ ;-----------------------------------------------------------------------
+ blake3_compress_in_place_sse41 PROC
+ _blake3_compress_in_place_sse41 PROC
+         sub     rsp, 120
+         movdqa  xmmword ptr [rsp], xmm6
+         movdqa  xmmword ptr [rsp+10H], xmm7
+         movdqa  xmmword ptr [rsp+20H], xmm8
+         movdqa  xmmword ptr [rsp+30H], xmm9
+         movdqa  xmmword ptr [rsp+40H], xmm11    ; rotate scratch below
+         movdqa  xmmword ptr [rsp+50H], xmm14    ; will hold ROT8
+         movdqa  xmmword ptr [rsp+60H], xmm15    ; will hold ROT16
+
+         ; ---- build the state: xmm0..xmm3 = rows 0..3 ----
+         movups  xmm0, xmmword ptr [rcx]         ; row 0 = input dwords 0-3
+         movups  xmm1, xmmword ptr [rcx+10H]     ; row 1 = input dwords 4-7
+         movaps  xmm2, xmmword ptr [BLAKE3_IV]   ; row 2 = IV constants
+         movzx   eax, byte ptr [rsp+0A0H]        ; 5th argument (byte)
+         movzx   r8d, r8b
+         shl     rax, 32
+         add     r8, rax                         ; r8 = r8b | (arg5 << 32)
+         movq    xmm3, r9
+         movq    xmm4, r8
+         punpcklqdq xmm3, xmm4                   ; row 3 = r9 lo, r9 hi, r8b, arg5
+
+         ; ---- load the 16 message dwords, split into even/odd words ----
+         movups  xmm4, xmmword ptr [rdx]
+         movups  xmm5, xmmword ptr [rdx+10H]
+         movaps  xmm8, xmm4
+         shufps  xmm4, xmm5, 136                 ; xmm4 = m0, m2, m4, m6
+         shufps  xmm8, xmm5, 221
+         movaps  xmm5, xmm8                      ; xmm5 = m1, m3, m5, m7
+         movups  xmm6, xmmword ptr [rdx+20H]
+         movups  xmm7, xmmword ptr [rdx+30H]
+         movaps  xmm8, xmm6
+         shufps  xmm6, xmm7, 136                 ; m8, m10, m12, m14 ...
+         pshufd  xmm6, xmm6, 93H                 ; ... lanes rotated up by one
+         shufps  xmm8, xmm7, 221                 ; m9, m11, m13, m15 ...
+         pshufd  xmm7, xmm8, 93H                 ; ... lanes rotated up by one
+
+         movaps  xmm14, xmmword ptr [ROT8]       ; pshufb mask: dword ror 8
+         movaps  xmm15, xmmword ptr [ROT16]      ; pshufb mask: dword ror 16
+         mov     al, 7                           ; round counter
+ @@:
+         ; ---- first half of the round ----
+         paddd   xmm0, xmm4
+         paddd   xmm0, xmm1
+         pxor    xmm3, xmm0
+         pshufb  xmm3, xmm15                     ; row3 = ror(row3, 16)
+         paddd   xmm2, xmm3
+         pxor    xmm1, xmm2
+         movdqa  xmm11, xmm1
+         pslld   xmm1, 20
+         psrld   xmm11, 12
+         por     xmm1, xmm11                     ; row1 = ror(row1, 12)
+         paddd   xmm0, xmm5
+         paddd   xmm0, xmm1
+         pxor    xmm3, xmm0
+         pshufb  xmm3, xmm14                     ; row3 = ror(row3, 8)
+         paddd   xmm2, xmm3
+         pxor    xmm1, xmm2
+         movdqa  xmm11, xmm1
+         pslld   xmm1, 25
+         psrld   xmm11, 7
+         por     xmm1, xmm11                     ; row1 = ror(row1, 7)
+
+         ; rotate the lanes of rows 0, 3 and 2 (row 1 stays put)
+         pshufd  xmm0, xmm0, 93H
+         pshufd  xmm3, xmm3, 4EH
+         pshufd  xmm2, xmm2, 39H
+
+         ; ---- second half of the round, same mixing on the rotated rows ----
+         paddd   xmm0, xmm6
+         paddd   xmm0, xmm1
+         pxor    xmm3, xmm0
+         pshufb  xmm3, xmm15                     ; ror 16
+         paddd   xmm2, xmm3
+         pxor    xmm1, xmm2
+         movdqa  xmm11, xmm1
+         pslld   xmm1, 20
+         psrld   xmm11, 12
+         por     xmm1, xmm11                     ; ror 12
+         paddd   xmm0, xmm7
+         paddd   xmm0, xmm1
+         pxor    xmm3, xmm0
+         pshufb  xmm3, xmm14                     ; ror 8
+         paddd   xmm2, xmm3
+         pxor    xmm1, xmm2
+         movdqa  xmm11, xmm1
+         pslld   xmm1, 25
+         psrld   xmm11, 7
+         por     xmm1, xmm11                     ; ror 7
+
+         ; undo the lane rotation
+         pshufd  xmm0, xmm0, 39H
+         pshufd  xmm3, xmm3, 4EH
+         pshufd  xmm2, xmm2, 93H
+
+         dec     al
+         jz      @F                              ; 7 rounds done
+
+         ; ---- permute message vectors xmm4-xmm7 for the next round ----
+         movdqa  xmm8, xmm4
+         shufps  xmm8, xmm5, 214
+         pshufd  xmm9, xmm4, 0FH
+         pshufd  xmm4, xmm8, 39H
+         movdqa  xmm8, xmm6
+         shufps  xmm8, xmm7, 250
+         pblendw xmm9, xmm8, 0CCH
+         movdqa  xmm8, xmm7
+         punpcklqdq xmm8, xmm5
+         pblendw xmm8, xmm6, 0C0H
+         pshufd  xmm8, xmm8, 78H
+         punpckhdq xmm5, xmm7
+         punpckldq xmm6, xmm5
+         pshufd  xmm7, xmm6, 1EH
+         movdqa  xmm5, xmm9
+         movdqa  xmm6, xmm8
+         jmp     @B
+ @@:
+         ; ---- fold and store the result over the input ----
+         pxor    xmm0, xmm2
+         pxor    xmm1, xmm3
+         movups  xmmword ptr [rcx], xmm0
+         movups  xmmword ptr [rcx+10H], xmm1
+
+         ; ---- restore nonvolatile xmm registers and return ----
+         movdqa  xmm6, xmmword ptr [rsp]
+         movdqa  xmm7, xmmword ptr [rsp+10H]
+         movdqa  xmm8, xmmword ptr [rsp+20H]
+         movdqa  xmm9, xmmword ptr [rsp+30H]
+         movdqa  xmm11, xmmword ptr [rsp+40H]
+         movdqa  xmm14, xmmword ptr [rsp+50H]
+         movdqa  xmm15, xmmword ptr [rsp+60H]
+         add     rsp, 120
+         ret
+ _blake3_compress_in_place_sse41 ENDP
+ blake3_compress_in_place_sse41 ENDP
1915
+
1916
+ ALIGN 16
1917
+ ;-----------------------------------------------------------------------
+ ; blake3_compress_xof_sse41 (alias _blake3_compress_xof_sse41)
+ ;
+ ; Runs 7 rounds of add/xor/rotate mixing over a 4x4 dword state held in
+ ; xmm0-xmm3 and writes a 64-byte result to a separate output buffer.
+ ; The 32 bytes at rcx are only read.
+ ;
+ ; ABI:   Microsoft x64
+ ; In:    rcx      -> 32 bytes (state rows 0-1); read only
+ ;        rdx      -> 64-byte message block (read with unaligned loads)
+ ;        r8b      =  byte placed in dword 2 of state row 3
+ ;        r9       =  64-bit value placed in dwords 0-1 of state row 3
+ ;        5th arg  =  byte (on the stack) placed in dword 3 of state row 3
+ ;        6th arg  -> 64-byte output buffer (on the stack, loaded into r10)
+ ;        NOTE(review): hash_many in this file fills the same row with its
+ ;        counter words, BLAKE3_BLOCK_LEN and a flags byte, so r8b / r9 /
+ ;        5th arg are presumably block_len / counter / flags - confirm
+ ;        against the C prototype.
+ ; Out:   [r10]     = row0 xor row2
+ ;        [r10+10H] = row1 xor row3
+ ;        [r10+20H] = row2 xor input[0..15]
+ ;        [r10+30H] = row3 xor input[16..31]
+ ; Clobb: rax, r8, r10, xmm0-xmm5, flags
+ ; Saved: xmm6-xmm9, xmm11, xmm14, xmm15.  xmm6-xmm15 are nonvolatile in
+ ;        this ABI, so every one of them that is written below is spilled
+ ;        here and reloaded before returning.
+ ; Stack: 120 bytes.  rsp is 8 mod 16 on entry, so rsp-120 is 16-byte
+ ;        aligned, which the movdqa spills require.  Stack arguments start
+ ;        at entry_rsp+28H (return address + 32-byte home space), i.e.
+ ;        rsp+0A0H (5th) and rsp+0A8H (6th) after the adjustment.
+ ;-----------------------------------------------------------------------
+ blake3_compress_xof_sse41 PROC
+ _blake3_compress_xof_sse41 PROC
+         sub     rsp, 120
+         movdqa  xmmword ptr [rsp], xmm6
+         movdqa  xmmword ptr [rsp+10H], xmm7
+         movdqa  xmmword ptr [rsp+20H], xmm8
+         movdqa  xmmword ptr [rsp+30H], xmm9
+         movdqa  xmmword ptr [rsp+40H], xmm11    ; rotate scratch below
+         movdqa  xmmword ptr [rsp+50H], xmm14    ; will hold ROT8
+         movdqa  xmmword ptr [rsp+60H], xmm15    ; will hold ROT16
+
+         ; ---- build the state: xmm0..xmm3 = rows 0..3 ----
+         movups  xmm0, xmmword ptr [rcx]         ; row 0 = input dwords 0-3
+         movups  xmm1, xmmword ptr [rcx+10H]     ; row 1 = input dwords 4-7
+         movaps  xmm2, xmmword ptr [BLAKE3_IV]   ; row 2 = IV constants
+         movzx   eax, byte ptr [rsp+0A0H]        ; 5th argument (byte)
+         movzx   r8d, r8b
+         mov     r10, qword ptr [rsp+0A8H]       ; 6th argument: output pointer
+         shl     rax, 32
+         add     r8, rax                         ; r8 = r8b | (arg5 << 32)
+         movq    xmm3, r9
+         movq    xmm4, r8
+         punpcklqdq xmm3, xmm4                   ; row 3 = r9 lo, r9 hi, r8b, arg5
+
+         ; ---- load the 16 message dwords, split into even/odd words ----
+         movups  xmm4, xmmword ptr [rdx]
+         movups  xmm5, xmmword ptr [rdx+10H]
+         movaps  xmm8, xmm4
+         shufps  xmm4, xmm5, 136                 ; xmm4 = m0, m2, m4, m6
+         shufps  xmm8, xmm5, 221
+         movaps  xmm5, xmm8                      ; xmm5 = m1, m3, m5, m7
+         movups  xmm6, xmmword ptr [rdx+20H]
+         movups  xmm7, xmmword ptr [rdx+30H]
+         movaps  xmm8, xmm6
+         shufps  xmm6, xmm7, 136                 ; m8, m10, m12, m14 ...
+         pshufd  xmm6, xmm6, 93H                 ; ... lanes rotated up by one
+         shufps  xmm8, xmm7, 221                 ; m9, m11, m13, m15 ...
+         pshufd  xmm7, xmm8, 93H                 ; ... lanes rotated up by one
+
+         movaps  xmm14, xmmword ptr [ROT8]       ; pshufb mask: dword ror 8
+         movaps  xmm15, xmmword ptr [ROT16]      ; pshufb mask: dword ror 16
+         mov     al, 7                           ; round counter
+ @@:
+         ; ---- first half of the round ----
+         paddd   xmm0, xmm4
+         paddd   xmm0, xmm1
+         pxor    xmm3, xmm0
+         pshufb  xmm3, xmm15                     ; row3 = ror(row3, 16)
+         paddd   xmm2, xmm3
+         pxor    xmm1, xmm2
+         movdqa  xmm11, xmm1
+         pslld   xmm1, 20
+         psrld   xmm11, 12
+         por     xmm1, xmm11                     ; row1 = ror(row1, 12)
+         paddd   xmm0, xmm5
+         paddd   xmm0, xmm1
+         pxor    xmm3, xmm0
+         pshufb  xmm3, xmm14                     ; row3 = ror(row3, 8)
+         paddd   xmm2, xmm3
+         pxor    xmm1, xmm2
+         movdqa  xmm11, xmm1
+         pslld   xmm1, 25
+         psrld   xmm11, 7
+         por     xmm1, xmm11                     ; row1 = ror(row1, 7)
+
+         ; rotate the lanes of rows 0, 3 and 2 (row 1 stays put)
+         pshufd  xmm0, xmm0, 93H
+         pshufd  xmm3, xmm3, 4EH
+         pshufd  xmm2, xmm2, 39H
+
+         ; ---- second half of the round, same mixing on the rotated rows ----
+         paddd   xmm0, xmm6
+         paddd   xmm0, xmm1
+         pxor    xmm3, xmm0
+         pshufb  xmm3, xmm15                     ; ror 16
+         paddd   xmm2, xmm3
+         pxor    xmm1, xmm2
+         movdqa  xmm11, xmm1
+         pslld   xmm1, 20
+         psrld   xmm11, 12
+         por     xmm1, xmm11                     ; ror 12
+         paddd   xmm0, xmm7
+         paddd   xmm0, xmm1
+         pxor    xmm3, xmm0
+         pshufb  xmm3, xmm14                     ; ror 8
+         paddd   xmm2, xmm3
+         pxor    xmm1, xmm2
+         movdqa  xmm11, xmm1
+         pslld   xmm1, 25
+         psrld   xmm11, 7
+         por     xmm1, xmm11                     ; ror 7
+
+         ; undo the lane rotation
+         pshufd  xmm0, xmm0, 39H
+         pshufd  xmm3, xmm3, 4EH
+         pshufd  xmm2, xmm2, 93H
+
+         dec     al
+         jz      @F                              ; 7 rounds done
+
+         ; ---- permute message vectors xmm4-xmm7 for the next round ----
+         movdqa  xmm8, xmm4
+         shufps  xmm8, xmm5, 214
+         pshufd  xmm9, xmm4, 0FH
+         pshufd  xmm4, xmm8, 39H
+         movdqa  xmm8, xmm6
+         shufps  xmm8, xmm7, 250
+         pblendw xmm9, xmm8, 0CCH
+         movdqa  xmm8, xmm7
+         punpcklqdq xmm8, xmm5
+         pblendw xmm8, xmm6, 0C0H
+         pshufd  xmm8, xmm8, 78H
+         punpckhdq xmm5, xmm7
+         punpckldq xmm6, xmm5
+         pshufd  xmm7, xmm6, 1EH
+         movdqa  xmm5, xmm9
+         movdqa  xmm6, xmm8
+         jmp     @B
+ @@:
+         ; ---- fold with the original input and emit 64 bytes ----
+         movdqu  xmm4, xmmword ptr [rcx]         ; reload input dwords 0-3
+         movdqu  xmm5, xmmword ptr [rcx+10H]     ; reload input dwords 4-7
+         pxor    xmm0, xmm2
+         pxor    xmm1, xmm3
+         pxor    xmm2, xmm4
+         pxor    xmm3, xmm5
+         movups  xmmword ptr [r10], xmm0
+         movups  xmmword ptr [r10+10H], xmm1
+         movups  xmmword ptr [r10+20H], xmm2
+         movups  xmmword ptr [r10+30H], xmm3
+
+         ; ---- restore nonvolatile xmm registers and return ----
+         movdqa  xmm6, xmmword ptr [rsp]
+         movdqa  xmm7, xmmword ptr [rsp+10H]
+         movdqa  xmm8, xmmword ptr [rsp+20H]
+         movdqa  xmm9, xmmword ptr [rsp+30H]
+         movdqa  xmm11, xmmword ptr [rsp+40H]
+         movdqa  xmm14, xmmword ptr [rsp+50H]
+         movdqa  xmm15, xmmword ptr [rsp+60H]
+         add     rsp, 120
+         ret
+ _blake3_compress_xof_sse41 ENDP
+ blake3_compress_xof_sse41 ENDP
2036
+
2037
+ _TEXT ENDS
2038
+
2039
+
2040
+ ; Read-only constants for the SSE4.1 routines.  The segment starts on a
+ ; 64-byte boundary and every table below is a multiple of 16 bytes long,
+ ; so each label is 16-byte aligned for the aligned xmmword loads that
+ ; reference it.
+ _RDATA SEGMENT READONLY PAGE ALIAS(".rdata") 'CONST'
+ ALIGN 64
+
+ ; Four IV dwords in one vector (loaded as state row 2 by the compress code).
+ BLAKE3_IV:
+         dd      6A09E667H
+         dd      0BB67AE85H
+         dd      3C6EF372H
+         dd      0A54FF53AH
+
+ ; Per-lane offsets 0..3, masked and added to the broadcast counter in hash_many.
+ ADD0:
+         dd      0
+         dd      1
+         dd      2
+         dd      3
+
+ ; The value 4 in every lane, masked and used as the per-iteration counter step.
+ ADD1:
+         dd      4, 4, 4, 4
+
+ ; Each IV dword broadcast to all four lanes.
+ BLAKE3_IV_0:
+         dd      6A09E667H, 6A09E667H, 6A09E667H, 6A09E667H
+
+ BLAKE3_IV_1:
+         dd      0BB67AE85H, 0BB67AE85H, 0BB67AE85H, 0BB67AE85H
+
+ BLAKE3_IV_2:
+         dd      3C6EF372H, 3C6EF372H, 3C6EF372H, 3C6EF372H
+
+ BLAKE3_IV_3:
+         dd      0A54FF53AH, 0A54FF53AH, 0A54FF53AH, 0A54FF53AH
+
+ ; The value 64 in every lane.
+ BLAKE3_BLOCK_LEN:
+         dd      40H, 40H, 40H, 40H
+
+ ; pshufb mask: rotate every dword right by 16 bits.
+ ROT16:
+         db      2, 3, 0, 1
+         db      6, 7, 4, 5
+         db      10, 11, 8, 9
+         db      14, 15, 12, 13
+
+ ; pshufb mask: rotate every dword right by 8 bits.
+ ROT8:
+         db      1, 2, 3, 0
+         db      5, 6, 7, 4
+         db      9, 10, 11, 8
+         db      13, 14, 15, 12
+
+ ; Sign-bit mask (8 dwords); xor-ed into both operands before pcmpgtd so the
+ ; signed compare behaves as an unsigned one.
+ CMP_MSB_MASK:
+         dd      4 dup (80000000H)
+         dd      4 dup (80000000H)
+
+ _RDATA ENDS
2076
+ END
2077
+