pq_crypto 0.6.1 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/SECURITY.md +7 -0
- data/ext/pqcrypto/pqcrypto_version.h +1 -1
- data/ext/pqcrypto/vendor/.vendored +4 -4
- data/ext/pqcrypto/vendor/mldsa-native/README.md +23 -10
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/README.md +23 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native.c +114 -58
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native.h +498 -461
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native_asm.S +145 -85
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native_config.h +456 -422
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/cbmc.h +47 -25
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/common.h +26 -14
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/ct.h +56 -81
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/debug.h +17 -24
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202.c +33 -40
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202.h +67 -87
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202x4.c +19 -14
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202x4.h +13 -5
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/keccakf1600.c +84 -10
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/keccakf1600.h +10 -5
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/auto.h +6 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/fips202_native_aarch64.h +22 -15
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_scalar_aarch64_asm.S +376 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_v84a_aarch64_asm.S +204 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x2_v84a_aarch64_asm.S +259 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_aarch64_asm.S +1077 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_aarch64_asm.S +987 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccakf1600_round_constants.c +16 -10
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x1_scalar.h +2 -1
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x1_v84a.h +1 -1
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x2_v84a.h +4 -2
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x4_v8a_scalar.h +2 -2
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x4_v8a_v84a_scalar.h +1 -1
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/api.h +60 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/mve.h +48 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/fips202_native_armv81m.h +18 -1
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S +658 -582
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.c +5 -100
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/keccakf1600_round_constants.c +26 -25
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/state_extract_bytes_x4_mve.S +334 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/state_xor_bytes_x4_mve.S +355 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/auto.h +8 -3
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/{xkcp.h → keccak_f1600_x4_avx2.h} +11 -8
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/fips202_native_x86_64.h +44 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/keccak_f1600_x4_avx2_asm.S +454 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/keccakf1600_constants.c +52 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/meta.h +37 -28
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/aarch64_zetas.c +213 -196
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/arith_native_aarch64.h +248 -64
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/intt_aarch64_asm.S +753 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4_aarch64_asm.S +129 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5_aarch64_asm.S +145 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7_aarch64_asm.S +177 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/ntt_aarch64_asm.S +653 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/pointwise_montgomery_aarch64_asm.S +84 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_caddq_aarch64_asm.S +53 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_chknorm_aarch64_asm.S +55 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_32_aarch64_asm.S +86 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_88_aarch64_asm.S +86 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_32_aarch64_asm.S +103 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_88_aarch64_asm.S +111 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_17_aarch64_asm.S +75 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_19_aarch64_asm.S +72 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_table.c +23 -11
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_aarch64_asm.S +189 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta2_aarch64_asm.S +137 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta4_aarch64_asm.S +130 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta_table.c +520 -516
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_table.c +34 -33
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/api.h +202 -242
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/meta.h +25 -17
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/arith_native_x86_64.h +112 -28
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/consts.c +1 -1
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/consts.h +1 -1
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/intt_avx2_asm.S +2311 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/ntt_avx2_asm.S +2383 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/nttunpack_avx2_asm.S +238 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l4_avx2_asm.S +139 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l5_avx2_asm.S +155 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l7_avx2_asm.S +187 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_avx2_asm.S +130 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_caddq_avx2_asm.S +190 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c +6 -4
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c +6 -4
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c +9 -8
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c +10 -9
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c +8 -5
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c +8 -5
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/rej_uniform_eta2_avx2.c +6 -4
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/rej_uniform_eta4_avx2.c +6 -4
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/rej_uniform_table.c +130 -129
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/packing.c +109 -180
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/packing.h +169 -150
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly.c +56 -40
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly.h +149 -164
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly_kl.c +52 -57
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly_kl.h +132 -167
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec.c +57 -424
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec.h +167 -474
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec_lazy.c +308 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec_lazy.h +653 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/reduce.h +22 -29
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/rounding.h +37 -43
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/sign.c +511 -367
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/sign.h +456 -417
- data/lib/pq_crypto/version.rb +1 -1
- data/script/vendor_libs.rb +3 -3
- metadata +41 -35
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_scalar_asm.S +0 -376
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_v84a_asm.S +0 -204
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x2_v84a_asm.S +0 -259
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_asm.S +0 -1077
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm.S +0 -987
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.c +0 -488
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.h +0 -16
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/intt.S +0 -753
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4.S +0 -129
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5.S +0 -145
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7.S +0 -177
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/ntt.S +0 -653
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/pointwise_montgomery.S +0 -79
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_caddq_asm.S +0 -53
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_chknorm_asm.S +0 -55
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_32_asm.S +0 -85
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_88_asm.S +0 -85
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_32_asm.S +0 -102
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_88_asm.S +0 -110
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_17_asm.S +0 -72
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_19_asm.S +0 -69
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_asm.S +0 -189
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta2_asm.S +0 -135
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta4_asm.S +0 -128
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/intt.S +0 -2311
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/ntt.S +0 -2383
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/nttunpack.S +0 -239
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise.S +0 -131
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l4.S +0 -139
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l5.S +0 -155
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l7.S +0 -187
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_caddq_avx2.c +0 -61
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) The mldsa-native project authors
|
|
3
|
+
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/* References
|
|
7
|
+
* ==========
|
|
8
|
+
*
|
|
9
|
+
* - [REF_AVX2]
|
|
10
|
+
* CRYSTALS-Dilithium optimized AVX2 implementation
|
|
11
|
+
* Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
|
|
12
|
+
* https://github.com/pq-crystals/dilithium/tree/master/avx2
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
/*
|
|
16
|
+
* This file is derived from the public domain
|
|
17
|
+
* AVX2 Dilithium implementation @[REF_AVX2].
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include "../../../common.h"
|
|
21
|
+
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
|
|
22
|
+
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
|
|
23
|
+
(defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLDSA_L == 7)
|
|
24
|
+
|
|
25
|
+
/*
|
|
26
|
+
* WARNING: This file is auto-derived from the mldsa-native source file
|
|
27
|
+
* dev/x86_64/src/pointwise_acc_l7_avx2_asm.S using scripts/simpasm. Do not modify it directly.
|
|
28
|
+
*/
|
|
29
|
+
|
|
30
|
+
.text
|
|
31
|
+
.balign 4
|
|
32
|
+
.global MLD_ASM_NAMESPACE(pointwise_acc_l7_avx2_asm)
|
|
33
|
+
MLD_ASM_FN_SYMBOL(pointwise_acc_l7_avx2_asm)
|
|
34
|
+
|
|
35
|
+
.cfi_startproc
|
|
36
|
+
vmovdqa 0x20(%rcx), %ymm0
|
|
37
|
+
vmovdqa (%rcx), %ymm1
|
|
38
|
+
xorl %eax, %eax
|
|
39
|
+
|
|
40
|
+
Lpointwise_acc_l7_avx2_looptop2:
|
|
41
|
+
vmovdqa (%rsi), %ymm6
|
|
42
|
+
vmovdqa 0x20(%rsi), %ymm8
|
|
43
|
+
vmovdqa (%rdx), %ymm10
|
|
44
|
+
vmovdqa 0x20(%rdx), %ymm12
|
|
45
|
+
vpsrlq $0x20, %ymm6, %ymm7
|
|
46
|
+
vpsrlq $0x20, %ymm8, %ymm9
|
|
47
|
+
vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
|
|
48
|
+
vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
|
|
49
|
+
vpmuldq %ymm10, %ymm6, %ymm6
|
|
50
|
+
vpmuldq %ymm11, %ymm7, %ymm7
|
|
51
|
+
vpmuldq %ymm12, %ymm8, %ymm8
|
|
52
|
+
vpmuldq %ymm13, %ymm9, %ymm9
|
|
53
|
+
vmovdqa %ymm6, %ymm2
|
|
54
|
+
vmovdqa %ymm7, %ymm3
|
|
55
|
+
vmovdqa %ymm8, %ymm4
|
|
56
|
+
vmovdqa %ymm9, %ymm5
|
|
57
|
+
vmovdqa 0x400(%rsi), %ymm6
|
|
58
|
+
vmovdqa 0x420(%rsi), %ymm8
|
|
59
|
+
vmovdqa 0x400(%rdx), %ymm10
|
|
60
|
+
vmovdqa 0x420(%rdx), %ymm12
|
|
61
|
+
vpsrlq $0x20, %ymm6, %ymm7
|
|
62
|
+
vpsrlq $0x20, %ymm8, %ymm9
|
|
63
|
+
vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
|
|
64
|
+
vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
|
|
65
|
+
vpmuldq %ymm10, %ymm6, %ymm6
|
|
66
|
+
vpmuldq %ymm11, %ymm7, %ymm7
|
|
67
|
+
vpmuldq %ymm12, %ymm8, %ymm8
|
|
68
|
+
vpmuldq %ymm13, %ymm9, %ymm9
|
|
69
|
+
vpaddq %ymm2, %ymm6, %ymm2
|
|
70
|
+
vpaddq %ymm3, %ymm7, %ymm3
|
|
71
|
+
vpaddq %ymm4, %ymm8, %ymm4
|
|
72
|
+
vpaddq %ymm5, %ymm9, %ymm5
|
|
73
|
+
vmovdqa 0x800(%rsi), %ymm6
|
|
74
|
+
vmovdqa 0x820(%rsi), %ymm8
|
|
75
|
+
vmovdqa 0x800(%rdx), %ymm10
|
|
76
|
+
vmovdqa 0x820(%rdx), %ymm12
|
|
77
|
+
vpsrlq $0x20, %ymm6, %ymm7
|
|
78
|
+
vpsrlq $0x20, %ymm8, %ymm9
|
|
79
|
+
vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
|
|
80
|
+
vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
|
|
81
|
+
vpmuldq %ymm10, %ymm6, %ymm6
|
|
82
|
+
vpmuldq %ymm11, %ymm7, %ymm7
|
|
83
|
+
vpmuldq %ymm12, %ymm8, %ymm8
|
|
84
|
+
vpmuldq %ymm13, %ymm9, %ymm9
|
|
85
|
+
vpaddq %ymm2, %ymm6, %ymm2
|
|
86
|
+
vpaddq %ymm3, %ymm7, %ymm3
|
|
87
|
+
vpaddq %ymm4, %ymm8, %ymm4
|
|
88
|
+
vpaddq %ymm5, %ymm9, %ymm5
|
|
89
|
+
vmovdqa 0xc00(%rsi), %ymm6
|
|
90
|
+
vmovdqa 0xc20(%rsi), %ymm8
|
|
91
|
+
vmovdqa 0xc00(%rdx), %ymm10
|
|
92
|
+
vmovdqa 0xc20(%rdx), %ymm12
|
|
93
|
+
vpsrlq $0x20, %ymm6, %ymm7
|
|
94
|
+
vpsrlq $0x20, %ymm8, %ymm9
|
|
95
|
+
vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
|
|
96
|
+
vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
|
|
97
|
+
vpmuldq %ymm10, %ymm6, %ymm6
|
|
98
|
+
vpmuldq %ymm11, %ymm7, %ymm7
|
|
99
|
+
vpmuldq %ymm12, %ymm8, %ymm8
|
|
100
|
+
vpmuldq %ymm13, %ymm9, %ymm9
|
|
101
|
+
vpaddq %ymm2, %ymm6, %ymm2
|
|
102
|
+
vpaddq %ymm3, %ymm7, %ymm3
|
|
103
|
+
vpaddq %ymm4, %ymm8, %ymm4
|
|
104
|
+
vpaddq %ymm5, %ymm9, %ymm5
|
|
105
|
+
vmovdqa 0x1000(%rsi), %ymm6
|
|
106
|
+
vmovdqa 0x1020(%rsi), %ymm8
|
|
107
|
+
vmovdqa 0x1000(%rdx), %ymm10
|
|
108
|
+
vmovdqa 0x1020(%rdx), %ymm12
|
|
109
|
+
vpsrlq $0x20, %ymm6, %ymm7
|
|
110
|
+
vpsrlq $0x20, %ymm8, %ymm9
|
|
111
|
+
vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
|
|
112
|
+
vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
|
|
113
|
+
vpmuldq %ymm10, %ymm6, %ymm6
|
|
114
|
+
vpmuldq %ymm11, %ymm7, %ymm7
|
|
115
|
+
vpmuldq %ymm12, %ymm8, %ymm8
|
|
116
|
+
vpmuldq %ymm13, %ymm9, %ymm9
|
|
117
|
+
vpaddq %ymm2, %ymm6, %ymm2
|
|
118
|
+
vpaddq %ymm3, %ymm7, %ymm3
|
|
119
|
+
vpaddq %ymm4, %ymm8, %ymm4
|
|
120
|
+
vpaddq %ymm5, %ymm9, %ymm5
|
|
121
|
+
vmovdqa 0x1400(%rsi), %ymm6
|
|
122
|
+
vmovdqa 0x1420(%rsi), %ymm8
|
|
123
|
+
vmovdqa 0x1400(%rdx), %ymm10
|
|
124
|
+
vmovdqa 0x1420(%rdx), %ymm12
|
|
125
|
+
vpsrlq $0x20, %ymm6, %ymm7
|
|
126
|
+
vpsrlq $0x20, %ymm8, %ymm9
|
|
127
|
+
vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
|
|
128
|
+
vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
|
|
129
|
+
vpmuldq %ymm10, %ymm6, %ymm6
|
|
130
|
+
vpmuldq %ymm11, %ymm7, %ymm7
|
|
131
|
+
vpmuldq %ymm12, %ymm8, %ymm8
|
|
132
|
+
vpmuldq %ymm13, %ymm9, %ymm9
|
|
133
|
+
vpaddq %ymm2, %ymm6, %ymm2
|
|
134
|
+
vpaddq %ymm3, %ymm7, %ymm3
|
|
135
|
+
vpaddq %ymm4, %ymm8, %ymm4
|
|
136
|
+
vpaddq %ymm5, %ymm9, %ymm5
|
|
137
|
+
vmovdqa 0x1800(%rsi), %ymm6
|
|
138
|
+
vmovdqa 0x1820(%rsi), %ymm8
|
|
139
|
+
vmovdqa 0x1800(%rdx), %ymm10
|
|
140
|
+
vmovdqa 0x1820(%rdx), %ymm12
|
|
141
|
+
vpsrlq $0x20, %ymm6, %ymm7
|
|
142
|
+
vpsrlq $0x20, %ymm8, %ymm9
|
|
143
|
+
vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
|
|
144
|
+
vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
|
|
145
|
+
vpmuldq %ymm10, %ymm6, %ymm6
|
|
146
|
+
vpmuldq %ymm11, %ymm7, %ymm7
|
|
147
|
+
vpmuldq %ymm12, %ymm8, %ymm8
|
|
148
|
+
vpmuldq %ymm13, %ymm9, %ymm9
|
|
149
|
+
vpaddq %ymm2, %ymm6, %ymm2
|
|
150
|
+
vpaddq %ymm3, %ymm7, %ymm3
|
|
151
|
+
vpaddq %ymm4, %ymm8, %ymm4
|
|
152
|
+
vpaddq %ymm5, %ymm9, %ymm5
|
|
153
|
+
vpmuldq %ymm2, %ymm0, %ymm6
|
|
154
|
+
vpmuldq %ymm3, %ymm0, %ymm7
|
|
155
|
+
vpmuldq %ymm4, %ymm0, %ymm8
|
|
156
|
+
vpmuldq %ymm5, %ymm0, %ymm9
|
|
157
|
+
vpmuldq %ymm6, %ymm1, %ymm6
|
|
158
|
+
vpmuldq %ymm7, %ymm1, %ymm7
|
|
159
|
+
vpmuldq %ymm8, %ymm1, %ymm8
|
|
160
|
+
vpmuldq %ymm9, %ymm1, %ymm9
|
|
161
|
+
vpsubq %ymm6, %ymm2, %ymm2
|
|
162
|
+
vpsubq %ymm7, %ymm3, %ymm3
|
|
163
|
+
vpsubq %ymm8, %ymm4, %ymm4
|
|
164
|
+
vpsubq %ymm9, %ymm5, %ymm5
|
|
165
|
+
vpsrlq $0x20, %ymm2, %ymm2
|
|
166
|
+
vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
|
|
167
|
+
vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
|
|
168
|
+
vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7]
|
|
169
|
+
vmovdqa %ymm2, (%rdi)
|
|
170
|
+
vmovdqa %ymm4, 0x20(%rdi)
|
|
171
|
+
addq $0x40, %rsi
|
|
172
|
+
addq $0x40, %rdx
|
|
173
|
+
addq $0x40, %rdi
|
|
174
|
+
addl $0x1, %eax
|
|
175
|
+
cmpl $0x10, %eax
|
|
176
|
+
jb Lpointwise_acc_l7_avx2_looptop2
|
|
177
|
+
retq
|
|
178
|
+
.cfi_endproc
|
|
179
|
+
|
|
180
|
+
MLD_ASM_FN_SIZE(pointwise_acc_l7_avx2_asm)
|
|
181
|
+
|
|
182
|
+
#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
|
|
183
|
+
&& (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLDSA_L == 7) */
|
|
184
|
+
|
|
185
|
+
#if defined(__ELF__)
|
|
186
|
+
.section .note.GNU-stack,"",%progbits
|
|
187
|
+
#endif
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) The mldsa-native project authors
|
|
3
|
+
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/* References
|
|
7
|
+
* ==========
|
|
8
|
+
*
|
|
9
|
+
* - [REF_AVX2]
|
|
10
|
+
* CRYSTALS-Dilithium optimized AVX2 implementation
|
|
11
|
+
* Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
|
|
12
|
+
* https://github.com/pq-crystals/dilithium/tree/master/avx2
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
/*
|
|
16
|
+
* This file is derived from the public domain
|
|
17
|
+
* AVX2 Dilithium implementation @[REF_AVX2].
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include "../../../common.h"
|
|
21
|
+
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
|
|
22
|
+
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
|
|
23
|
+
|
|
24
|
+
/*
|
|
25
|
+
* WARNING: This file is auto-derived from the mldsa-native source file
|
|
26
|
+
* dev/x86_64/src/pointwise_avx2_asm.S using scripts/simpasm. Do not modify it directly.
|
|
27
|
+
*/
|
|
28
|
+
|
|
29
|
+
.text
|
|
30
|
+
.balign 4
|
|
31
|
+
.global MLD_ASM_NAMESPACE(pointwise_avx2_asm)
|
|
32
|
+
MLD_ASM_FN_SYMBOL(pointwise_avx2_asm)
|
|
33
|
+
|
|
34
|
+
.cfi_startproc
|
|
35
|
+
vmovdqa 0x20(%rdx), %ymm0
|
|
36
|
+
vmovdqa (%rdx), %ymm1
|
|
37
|
+
xorl %eax, %eax
|
|
38
|
+
|
|
39
|
+
Lpointwise_avx2_looptop1:
|
|
40
|
+
vmovdqa (%rdi), %ymm2
|
|
41
|
+
vmovdqa 0x20(%rdi), %ymm4
|
|
42
|
+
vmovdqa 0x40(%rdi), %ymm6
|
|
43
|
+
vmovdqa (%rsi), %ymm10
|
|
44
|
+
vmovdqa 0x20(%rsi), %ymm12
|
|
45
|
+
vmovdqa 0x40(%rsi), %ymm14
|
|
46
|
+
vpsrlq $0x20, %ymm2, %ymm3
|
|
47
|
+
vpsrlq $0x20, %ymm4, %ymm5
|
|
48
|
+
vmovshdup %ymm6, %ymm7 # ymm7 = ymm6[1,1,3,3,5,5,7,7]
|
|
49
|
+
vpsrlq $0x20, %ymm10, %ymm11
|
|
50
|
+
vpsrlq $0x20, %ymm12, %ymm13
|
|
51
|
+
vmovshdup %ymm14, %ymm15 # ymm15 = ymm14[1,1,3,3,5,5,7,7]
|
|
52
|
+
vpmuldq %ymm10, %ymm2, %ymm2
|
|
53
|
+
vpmuldq %ymm11, %ymm3, %ymm3
|
|
54
|
+
vpmuldq %ymm12, %ymm4, %ymm4
|
|
55
|
+
vpmuldq %ymm13, %ymm5, %ymm5
|
|
56
|
+
vpmuldq %ymm14, %ymm6, %ymm6
|
|
57
|
+
vpmuldq %ymm15, %ymm7, %ymm7
|
|
58
|
+
vpmuldq %ymm2, %ymm0, %ymm10
|
|
59
|
+
vpmuldq %ymm3, %ymm0, %ymm11
|
|
60
|
+
vpmuldq %ymm4, %ymm0, %ymm12
|
|
61
|
+
vpmuldq %ymm5, %ymm0, %ymm13
|
|
62
|
+
vpmuldq %ymm6, %ymm0, %ymm14
|
|
63
|
+
vpmuldq %ymm7, %ymm0, %ymm15
|
|
64
|
+
vpmuldq %ymm10, %ymm1, %ymm10
|
|
65
|
+
vpmuldq %ymm11, %ymm1, %ymm11
|
|
66
|
+
vpmuldq %ymm12, %ymm1, %ymm12
|
|
67
|
+
vpmuldq %ymm13, %ymm1, %ymm13
|
|
68
|
+
vpmuldq %ymm14, %ymm1, %ymm14
|
|
69
|
+
vpmuldq %ymm15, %ymm1, %ymm15
|
|
70
|
+
vpsubq %ymm10, %ymm2, %ymm2
|
|
71
|
+
vpsubq %ymm11, %ymm3, %ymm3
|
|
72
|
+
vpsubq %ymm12, %ymm4, %ymm4
|
|
73
|
+
vpsubq %ymm13, %ymm5, %ymm5
|
|
74
|
+
vpsubq %ymm14, %ymm6, %ymm6
|
|
75
|
+
vpsubq %ymm15, %ymm7, %ymm7
|
|
76
|
+
vpsrlq $0x20, %ymm2, %ymm2
|
|
77
|
+
vpsrlq $0x20, %ymm4, %ymm4
|
|
78
|
+
vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7]
|
|
79
|
+
vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
|
|
80
|
+
vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7]
|
|
81
|
+
vpblendd $0xaa, %ymm7, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7]
|
|
82
|
+
vmovdqa %ymm2, (%rdi)
|
|
83
|
+
vmovdqa %ymm4, 0x20(%rdi)
|
|
84
|
+
vmovdqa %ymm6, 0x40(%rdi)
|
|
85
|
+
addq $0x60, %rdi
|
|
86
|
+
addq $0x60, %rsi
|
|
87
|
+
addl $0x1, %eax
|
|
88
|
+
cmpl $0xa, %eax
|
|
89
|
+
jb Lpointwise_avx2_looptop1
|
|
90
|
+
vmovdqa (%rdi), %ymm2
|
|
91
|
+
vmovdqa 0x20(%rdi), %ymm4
|
|
92
|
+
vmovdqa (%rsi), %ymm10
|
|
93
|
+
vmovdqa 0x20(%rsi), %ymm12
|
|
94
|
+
vpsrlq $0x20, %ymm2, %ymm3
|
|
95
|
+
vpsrlq $0x20, %ymm4, %ymm5
|
|
96
|
+
vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
|
|
97
|
+
vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
|
|
98
|
+
vpmuldq %ymm10, %ymm2, %ymm2
|
|
99
|
+
vpmuldq %ymm11, %ymm3, %ymm3
|
|
100
|
+
vpmuldq %ymm12, %ymm4, %ymm4
|
|
101
|
+
vpmuldq %ymm13, %ymm5, %ymm5
|
|
102
|
+
vpmuldq %ymm2, %ymm0, %ymm10
|
|
103
|
+
vpmuldq %ymm3, %ymm0, %ymm11
|
|
104
|
+
vpmuldq %ymm4, %ymm0, %ymm12
|
|
105
|
+
vpmuldq %ymm5, %ymm0, %ymm13
|
|
106
|
+
vpmuldq %ymm10, %ymm1, %ymm10
|
|
107
|
+
vpmuldq %ymm11, %ymm1, %ymm11
|
|
108
|
+
vpmuldq %ymm12, %ymm1, %ymm12
|
|
109
|
+
vpmuldq %ymm13, %ymm1, %ymm13
|
|
110
|
+
vpsubq %ymm10, %ymm2, %ymm2
|
|
111
|
+
vpsubq %ymm11, %ymm3, %ymm3
|
|
112
|
+
vpsubq %ymm12, %ymm4, %ymm4
|
|
113
|
+
vpsubq %ymm13, %ymm5, %ymm5
|
|
114
|
+
vpsrlq $0x20, %ymm2, %ymm2
|
|
115
|
+
vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
|
|
116
|
+
vpblendd $0x55, %ymm2, %ymm3, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
|
|
117
|
+
vpblendd $0x55, %ymm4, %ymm5, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7]
|
|
118
|
+
vmovdqa %ymm2, (%rdi)
|
|
119
|
+
vmovdqa %ymm4, 0x20(%rdi)
|
|
120
|
+
retq
|
|
121
|
+
.cfi_endproc
|
|
122
|
+
|
|
123
|
+
MLD_ASM_FN_SIZE(pointwise_avx2_asm)
|
|
124
|
+
|
|
125
|
+
#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
|
|
126
|
+
*/
|
|
127
|
+
|
|
128
|
+
#if defined(__ELF__)
|
|
129
|
+
.section .note.GNU-stack,"",%progbits
|
|
130
|
+
#endif
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) The mldsa-native project authors
|
|
3
|
+
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/* References
|
|
7
|
+
* ==========
|
|
8
|
+
*
|
|
9
|
+
* - [REF_AVX2]
|
|
10
|
+
* CRYSTALS-Dilithium optimized AVX2 implementation
|
|
11
|
+
* Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
|
|
12
|
+
* https://github.com/pq-crystals/dilithium/tree/master/avx2
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
/*
|
|
16
|
+
* This file is derived from the public domain
|
|
17
|
+
* AVX2 Dilithium implementation @[REF_AVX2].
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
/*************************************************
|
|
22
|
+
* Name: mld_poly_caddq_avx2_asm
|
|
23
|
+
*
|
|
24
|
+
* Description: For all coefficients of in/out polynomial add Q if
|
|
25
|
+
* coefficient is negative.
|
|
26
|
+
*
|
|
27
|
+
* Arguments: - int32_t *r: pointer to input/output polynomial
|
|
28
|
+
**************************************************/
|
|
29
|
+
|
|
30
|
+
#include "../../../common.h"
|
|
31
|
+
|
|
32
|
+
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
|
|
33
|
+
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
/*
|
|
37
|
+
* WARNING: This file is auto-derived from the mldsa-native source file
|
|
38
|
+
* dev/x86_64/src/poly_caddq_avx2_asm.S using scripts/simpasm. Do not modify it directly.
|
|
39
|
+
*/
|
|
40
|
+
|
|
41
|
+
.text
|
|
42
|
+
.balign 4
|
|
43
|
+
.global MLD_ASM_NAMESPACE(poly_caddq_avx2_asm)
|
|
44
|
+
MLD_ASM_FN_SYMBOL(poly_caddq_avx2_asm)
|
|
45
|
+
|
|
46
|
+
.cfi_startproc
|
|
47
|
+
vpxor %xmm2, %xmm2, %xmm2
|
|
48
|
+
movl $0x7fe001, %eax # imm = 0x7FE001
|
|
49
|
+
vmovd %eax, %xmm1
|
|
50
|
+
vpbroadcastd %xmm1, %ymm1
|
|
51
|
+
vpcmpgtd (%rdi), %ymm2, %ymm0
|
|
52
|
+
vpand %ymm1, %ymm0, %ymm0
|
|
53
|
+
vpaddd (%rdi), %ymm0, %ymm0
|
|
54
|
+
vmovdqa %ymm0, (%rdi)
|
|
55
|
+
vpcmpgtd 0x20(%rdi), %ymm2, %ymm3
|
|
56
|
+
vpand %ymm1, %ymm3, %ymm3
|
|
57
|
+
vpaddd 0x20(%rdi), %ymm3, %ymm3
|
|
58
|
+
vmovdqa %ymm3, 0x20(%rdi)
|
|
59
|
+
vpcmpgtd 0x40(%rdi), %ymm2, %ymm4
|
|
60
|
+
vpand %ymm1, %ymm4, %ymm4
|
|
61
|
+
vpaddd 0x40(%rdi), %ymm4, %ymm4
|
|
62
|
+
vmovdqa %ymm4, 0x40(%rdi)
|
|
63
|
+
vpcmpgtd 0x60(%rdi), %ymm2, %ymm5
|
|
64
|
+
vpand %ymm1, %ymm5, %ymm5
|
|
65
|
+
vpaddd 0x60(%rdi), %ymm5, %ymm5
|
|
66
|
+
vmovdqa %ymm5, 0x60(%rdi)
|
|
67
|
+
vpcmpgtd 0x80(%rdi), %ymm2, %ymm0
|
|
68
|
+
vpand %ymm1, %ymm0, %ymm0
|
|
69
|
+
vpaddd 0x80(%rdi), %ymm0, %ymm0
|
|
70
|
+
vmovdqa %ymm0, 0x80(%rdi)
|
|
71
|
+
vpcmpgtd 0xa0(%rdi), %ymm2, %ymm3
|
|
72
|
+
vpand %ymm1, %ymm3, %ymm3
|
|
73
|
+
vpaddd 0xa0(%rdi), %ymm3, %ymm3
|
|
74
|
+
vmovdqa %ymm3, 0xa0(%rdi)
|
|
75
|
+
vpcmpgtd 0xc0(%rdi), %ymm2, %ymm4
|
|
76
|
+
vpand %ymm1, %ymm4, %ymm4
|
|
77
|
+
vpaddd 0xc0(%rdi), %ymm4, %ymm4
|
|
78
|
+
vmovdqa %ymm4, 0xc0(%rdi)
|
|
79
|
+
vpcmpgtd 0xe0(%rdi), %ymm2, %ymm5
|
|
80
|
+
vpand %ymm1, %ymm5, %ymm5
|
|
81
|
+
vpaddd 0xe0(%rdi), %ymm5, %ymm5
|
|
82
|
+
vmovdqa %ymm5, 0xe0(%rdi)
|
|
83
|
+
vpcmpgtd 0x100(%rdi), %ymm2, %ymm0
|
|
84
|
+
vpand %ymm1, %ymm0, %ymm0
|
|
85
|
+
vpaddd 0x100(%rdi), %ymm0, %ymm0
|
|
86
|
+
vmovdqa %ymm0, 0x100(%rdi)
|
|
87
|
+
vpcmpgtd 0x120(%rdi), %ymm2, %ymm3
|
|
88
|
+
vpand %ymm1, %ymm3, %ymm3
|
|
89
|
+
vpaddd 0x120(%rdi), %ymm3, %ymm3
|
|
90
|
+
vmovdqa %ymm3, 0x120(%rdi)
|
|
91
|
+
vpcmpgtd 0x140(%rdi), %ymm2, %ymm4
|
|
92
|
+
vpand %ymm1, %ymm4, %ymm4
|
|
93
|
+
vpaddd 0x140(%rdi), %ymm4, %ymm4
|
|
94
|
+
vmovdqa %ymm4, 0x140(%rdi)
|
|
95
|
+
vpcmpgtd 0x160(%rdi), %ymm2, %ymm5
|
|
96
|
+
vpand %ymm1, %ymm5, %ymm5
|
|
97
|
+
vpaddd 0x160(%rdi), %ymm5, %ymm5
|
|
98
|
+
vmovdqa %ymm5, 0x160(%rdi)
|
|
99
|
+
vpcmpgtd 0x180(%rdi), %ymm2, %ymm0
|
|
100
|
+
vpand %ymm1, %ymm0, %ymm0
|
|
101
|
+
vpaddd 0x180(%rdi), %ymm0, %ymm0
|
|
102
|
+
vmovdqa %ymm0, 0x180(%rdi)
|
|
103
|
+
vpcmpgtd 0x1a0(%rdi), %ymm2, %ymm3
|
|
104
|
+
vpand %ymm1, %ymm3, %ymm3
|
|
105
|
+
vpaddd 0x1a0(%rdi), %ymm3, %ymm3
|
|
106
|
+
vmovdqa %ymm3, 0x1a0(%rdi)
|
|
107
|
+
vpcmpgtd 0x1c0(%rdi), %ymm2, %ymm4
|
|
108
|
+
vpand %ymm1, %ymm4, %ymm4
|
|
109
|
+
vpaddd 0x1c0(%rdi), %ymm4, %ymm4
|
|
110
|
+
vmovdqa %ymm4, 0x1c0(%rdi)
|
|
111
|
+
vpcmpgtd 0x1e0(%rdi), %ymm2, %ymm5
|
|
112
|
+
vpand %ymm1, %ymm5, %ymm5
|
|
113
|
+
vpaddd 0x1e0(%rdi), %ymm5, %ymm5
|
|
114
|
+
vmovdqa %ymm5, 0x1e0(%rdi)
|
|
115
|
+
vpcmpgtd 0x200(%rdi), %ymm2, %ymm0
|
|
116
|
+
vpand %ymm1, %ymm0, %ymm0
|
|
117
|
+
vpaddd 0x200(%rdi), %ymm0, %ymm0
|
|
118
|
+
vmovdqa %ymm0, 0x200(%rdi)
|
|
119
|
+
vpcmpgtd 0x220(%rdi), %ymm2, %ymm3
|
|
120
|
+
vpand %ymm1, %ymm3, %ymm3
|
|
121
|
+
vpaddd 0x220(%rdi), %ymm3, %ymm3
|
|
122
|
+
vmovdqa %ymm3, 0x220(%rdi)
|
|
123
|
+
vpcmpgtd 0x240(%rdi), %ymm2, %ymm4
|
|
124
|
+
vpand %ymm1, %ymm4, %ymm4
|
|
125
|
+
vpaddd 0x240(%rdi), %ymm4, %ymm4
|
|
126
|
+
vmovdqa %ymm4, 0x240(%rdi)
|
|
127
|
+
vpcmpgtd 0x260(%rdi), %ymm2, %ymm5
|
|
128
|
+
vpand %ymm1, %ymm5, %ymm5
|
|
129
|
+
vpaddd 0x260(%rdi), %ymm5, %ymm5
|
|
130
|
+
vmovdqa %ymm5, 0x260(%rdi)
|
|
131
|
+
vpcmpgtd 0x280(%rdi), %ymm2, %ymm0
|
|
132
|
+
vpand %ymm1, %ymm0, %ymm0
|
|
133
|
+
vpaddd 0x280(%rdi), %ymm0, %ymm0
|
|
134
|
+
vmovdqa %ymm0, 0x280(%rdi)
|
|
135
|
+
vpcmpgtd 0x2a0(%rdi), %ymm2, %ymm3
|
|
136
|
+
vpand %ymm1, %ymm3, %ymm3
|
|
137
|
+
vpaddd 0x2a0(%rdi), %ymm3, %ymm3
|
|
138
|
+
vmovdqa %ymm3, 0x2a0(%rdi)
|
|
139
|
+
vpcmpgtd 0x2c0(%rdi), %ymm2, %ymm4
|
|
140
|
+
vpand %ymm1, %ymm4, %ymm4
|
|
141
|
+
vpaddd 0x2c0(%rdi), %ymm4, %ymm4
|
|
142
|
+
vmovdqa %ymm4, 0x2c0(%rdi)
|
|
143
|
+
vpcmpgtd 0x2e0(%rdi), %ymm2, %ymm5
|
|
144
|
+
vpand %ymm1, %ymm5, %ymm5
|
|
145
|
+
vpaddd 0x2e0(%rdi), %ymm5, %ymm5
|
|
146
|
+
vmovdqa %ymm5, 0x2e0(%rdi)
|
|
147
|
+
vpcmpgtd 0x300(%rdi), %ymm2, %ymm0
|
|
148
|
+
vpand %ymm1, %ymm0, %ymm0
|
|
149
|
+
vpaddd 0x300(%rdi), %ymm0, %ymm0
|
|
150
|
+
vmovdqa %ymm0, 0x300(%rdi)
|
|
151
|
+
vpcmpgtd 0x320(%rdi), %ymm2, %ymm3
|
|
152
|
+
vpand %ymm1, %ymm3, %ymm3
|
|
153
|
+
vpaddd 0x320(%rdi), %ymm3, %ymm3
|
|
154
|
+
vmovdqa %ymm3, 0x320(%rdi)
|
|
155
|
+
vpcmpgtd 0x340(%rdi), %ymm2, %ymm4
|
|
156
|
+
vpand %ymm1, %ymm4, %ymm4
|
|
157
|
+
vpaddd 0x340(%rdi), %ymm4, %ymm4
|
|
158
|
+
vmovdqa %ymm4, 0x340(%rdi)
|
|
159
|
+
vpcmpgtd 0x360(%rdi), %ymm2, %ymm5
|
|
160
|
+
vpand %ymm1, %ymm5, %ymm5
|
|
161
|
+
vpaddd 0x360(%rdi), %ymm5, %ymm5
|
|
162
|
+
vmovdqa %ymm5, 0x360(%rdi)
|
|
163
|
+
vpcmpgtd 0x380(%rdi), %ymm2, %ymm0
|
|
164
|
+
vpand %ymm1, %ymm0, %ymm0
|
|
165
|
+
vpaddd 0x380(%rdi), %ymm0, %ymm0
|
|
166
|
+
vmovdqa %ymm0, 0x380(%rdi)
|
|
167
|
+
vpcmpgtd 0x3a0(%rdi), %ymm2, %ymm3
|
|
168
|
+
vpand %ymm1, %ymm3, %ymm3
|
|
169
|
+
vpaddd 0x3a0(%rdi), %ymm3, %ymm3
|
|
170
|
+
vmovdqa %ymm3, 0x3a0(%rdi)
|
|
171
|
+
vpcmpgtd 0x3c0(%rdi), %ymm2, %ymm4
|
|
172
|
+
vpand %ymm1, %ymm4, %ymm4
|
|
173
|
+
vpaddd 0x3c0(%rdi), %ymm4, %ymm4
|
|
174
|
+
vmovdqa %ymm4, 0x3c0(%rdi)
|
|
175
|
+
vpcmpgtd 0x3e0(%rdi), %ymm2, %ymm5
|
|
176
|
+
vpand %ymm1, %ymm5, %ymm5
|
|
177
|
+
vpaddd 0x3e0(%rdi), %ymm5, %ymm5
|
|
178
|
+
vmovdqa %ymm5, 0x3e0(%rdi)
|
|
179
|
+
retq
|
|
180
|
+
.cfi_endproc
|
|
181
|
+
|
|
182
|
+
MLD_ASM_FN_SIZE(poly_caddq_avx2_asm)
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
|
|
186
|
+
*/
|
|
187
|
+
|
|
188
|
+
#if defined(__ELF__)
|
|
189
|
+
.section .note.GNU-stack,"",%progbits
|
|
190
|
+
#endif
|
|
@@ -24,6 +24,7 @@
|
|
|
24
24
|
#include "../../../common.h"
|
|
25
25
|
|
|
26
26
|
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
|
|
27
|
+
!defined(MLD_CONFIG_NO_SIGN_API) && \
|
|
27
28
|
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
|
|
28
29
|
(defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
|
|
29
30
|
(MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87))
|
|
@@ -143,13 +144,14 @@ void mld_poly_decompose_32_avx2(int32_t *a1, int32_t *a0)
|
|
|
143
144
|
}
|
|
144
145
|
}
|
|
145
146
|
|
|
146
|
-
#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !
|
|
147
|
-
&&
|
|
148
|
-
|
|
147
|
+
#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_SIGN_API && \
|
|
148
|
+
!MLD_CONFIG_MULTILEVEL_NO_SHARED && \
|
|
149
|
+
(MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
|
|
150
|
+
|| MLD_CONFIG_PARAMETER_SET == 87) */
|
|
149
151
|
|
|
150
152
|
MLD_EMPTY_CU(avx2_poly_decompose_32)
|
|
151
153
|
|
|
152
|
-
#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT &&
|
|
154
|
+
#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_SIGN_API && \
|
|
153
155
|
!MLD_CONFIG_MULTILEVEL_NO_SHARED && \
|
|
154
156
|
(MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
|
|
155
157
|
|| MLD_CONFIG_PARAMETER_SET == 87)) */
|
|
@@ -24,6 +24,7 @@
|
|
|
24
24
|
#include "../../../common.h"
|
|
25
25
|
|
|
26
26
|
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
|
|
27
|
+
!defined(MLD_CONFIG_NO_SIGN_API) && \
|
|
27
28
|
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
|
|
28
29
|
(defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
|
|
29
30
|
MLD_CONFIG_PARAMETER_SET == 44)
|
|
@@ -143,13 +144,14 @@ void mld_poly_decompose_88_avx2(int32_t *a1, int32_t *a0)
|
|
|
143
144
|
_mm256_store_si256((__m256i *)&a0[8 * i], f0);
|
|
144
145
|
}
|
|
145
146
|
}
|
|
146
|
-
#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !
|
|
147
|
-
&&
|
|
148
|
-
44)
|
|
147
|
+
#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_SIGN_API && \
|
|
148
|
+
!MLD_CONFIG_MULTILEVEL_NO_SHARED && \
|
|
149
|
+
(MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44) \
|
|
150
|
+
*/
|
|
149
151
|
|
|
150
152
|
MLD_EMPTY_CU(avx2_poly_decompose_88)
|
|
151
153
|
|
|
152
|
-
#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT &&
|
|
154
|
+
#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_SIGN_API && \
|
|
153
155
|
!MLD_CONFIG_MULTILEVEL_NO_SHARED && \
|
|
154
156
|
(MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
|
|
155
157
|
44)) */
|
|
@@ -20,6 +20,7 @@
|
|
|
20
20
|
#include "../../../common.h"
|
|
21
21
|
|
|
22
22
|
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
|
|
23
|
+
!defined(MLD_CONFIG_NO_VERIFY_API) && \
|
|
23
24
|
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
|
|
24
25
|
(defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
|
|
25
26
|
(MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87))
|
|
@@ -33,12 +34,11 @@
|
|
|
33
34
|
_mm256_castsi256_ps(b), \
|
|
34
35
|
_mm256_castsi256_ps(mask)))
|
|
35
36
|
|
|
36
|
-
void mld_poly_use_hint_32_avx2(int32_t *
|
|
37
|
-
const int32_t *hint)
|
|
37
|
+
void mld_poly_use_hint_32_avx2(int32_t *a, const int32_t *hint)
|
|
38
38
|
{
|
|
39
39
|
unsigned int i;
|
|
40
40
|
__m256i f, f0, f1, h, t;
|
|
41
|
-
const __m256i q_bound = _mm256_set1_epi32(
|
|
41
|
+
const __m256i q_bound = _mm256_set1_epi32(31 * ((MLDSA_Q - 1) / 32));
|
|
42
42
|
/* check-magic: 1025 == floor(2**22 / 4092) */
|
|
43
43
|
const __m256i v = _mm256_set1_epi32(1025);
|
|
44
44
|
const __m256i alpha = _mm256_set1_epi32(2 * ((MLDSA_Q - 1) / 32));
|
|
@@ -82,17 +82,18 @@ void mld_poly_use_hint_32_avx2(int32_t *b, const int32_t *a,
|
|
|
82
82
|
f1 = _mm256_add_epi32(f1, h);
|
|
83
83
|
f1 = _mm256_and_si256(f1, mask);
|
|
84
84
|
|
|
85
|
-
_mm256_store_si256((__m256i *)&
|
|
85
|
+
_mm256_store_si256((__m256i *)&a[8 * i], f1);
|
|
86
86
|
}
|
|
87
87
|
}
|
|
88
88
|
|
|
89
|
-
#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !
|
|
90
|
-
&&
|
|
91
|
-
|
|
89
|
+
#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_VERIFY_API && \
|
|
90
|
+
!MLD_CONFIG_MULTILEVEL_NO_SHARED && \
|
|
91
|
+
(MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
|
|
92
|
+
|| MLD_CONFIG_PARAMETER_SET == 87) */
|
|
92
93
|
|
|
93
94
|
MLD_EMPTY_CU(avx2_poly_use_hint_32)
|
|
94
95
|
|
|
95
|
-
#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT &&
|
|
96
|
+
#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_VERIFY_API && \
|
|
96
97
|
!MLD_CONFIG_MULTILEVEL_NO_SHARED && \
|
|
97
98
|
(MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
|
|
98
99
|
|| MLD_CONFIG_PARAMETER_SET == 87)) */
|
|
@@ -20,6 +20,7 @@
|
|
|
20
20
|
#include "../../../common.h"
|
|
21
21
|
|
|
22
22
|
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
|
|
23
|
+
!defined(MLD_CONFIG_NO_VERIFY_API) && \
|
|
23
24
|
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
|
|
24
25
|
(defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
|
|
25
26
|
MLD_CONFIG_PARAMETER_SET == 44)
|
|
@@ -33,8 +34,7 @@
|
|
|
33
34
|
_mm256_castsi256_ps(b), \
|
|
34
35
|
_mm256_castsi256_ps(mask)))
|
|
35
36
|
|
|
36
|
-
void mld_poly_use_hint_88_avx2(int32_t *
|
|
37
|
-
const int32_t *hint)
|
|
37
|
+
void mld_poly_use_hint_88_avx2(int32_t *a, const int32_t *hint)
|
|
38
38
|
{
|
|
39
39
|
unsigned int i;
|
|
40
40
|
__m256i f, f0, f1, h, t;
|
|
@@ -84,19 +84,20 @@ void mld_poly_use_hint_88_avx2(int32_t *b, const int32_t *a,
|
|
|
84
84
|
f = _mm256_cmpgt_epi32(f1, max);
|
|
85
85
|
f1 = MLD_MM256_BLENDV_EPI32(f1, zero, f);
|
|
86
86
|
|
|
87
|
-
_mm256_store_si256((__m256i *)&
|
|
87
|
+
_mm256_store_si256((__m256i *)&a[8 * i], f1);
|
|
88
88
|
}
|
|
89
89
|
}
|
|
90
90
|
|
|
91
|
-
#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !
|
|
92
|
-
&&
|
|
93
|
-
44)
|
|
91
|
+
#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_VERIFY_API && \
|
|
92
|
+
!MLD_CONFIG_MULTILEVEL_NO_SHARED && \
|
|
93
|
+
(MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44) \
|
|
94
|
+
*/
|
|
94
95
|
|
|
95
96
|
MLD_EMPTY_CU(avx2_poly_use_hint_88)
|
|
96
97
|
|
|
97
|
-
#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT &&
|
|
98
|
-
!MLD_CONFIG_MULTILEVEL_NO_SHARED &&
|
|
99
|
-
(MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET ==
|
|
98
|
+
#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_VERIFY_API && \
|
|
99
|
+
!MLD_CONFIG_MULTILEVEL_NO_SHARED && \
|
|
100
|
+
(MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
|
|
100
101
|
44)) */
|
|
101
102
|
|
|
102
103
|
/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
|
|
@@ -20,6 +20,8 @@
|
|
|
20
20
|
#include "../../../common.h"
|
|
21
21
|
|
|
22
22
|
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
|
|
23
|
+
(!defined(MLD_CONFIG_NO_SIGN_API) || \
|
|
24
|
+
!defined(MLD_CONFIG_NO_VERIFY_API)) && \
|
|
23
25
|
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
|
|
24
26
|
(defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
|
|
25
27
|
MLD_CONFIG_PARAMETER_SET == 44)
|
|
@@ -79,13 +81,14 @@ void mld_polyz_unpack_17_avx2(int32_t *r, const uint8_t *a)
|
|
|
79
81
|
_mm256_store_si256((__m256i *)&r[8 * i], f);
|
|
80
82
|
}
|
|
81
83
|
}
|
|
82
|
-
#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !
|
|
83
|
-
&&
|
|
84
|
-
44)
|
|
84
|
+
#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && (!MLD_CONFIG_NO_SIGN_API || \
|
|
85
|
+
!MLD_CONFIG_NO_VERIFY_API) && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
|
|
86
|
+
(MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44) \
|
|
87
|
+
*/
|
|
85
88
|
|
|
86
89
|
MLD_EMPTY_CU(avx2_polyz_unpack_17)
|
|
87
90
|
|
|
88
|
-
#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT &&
|
|
89
|
-
!MLD_CONFIG_MULTILEVEL_NO_SHARED &&
|
|
91
|
+
#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && (!MLD_CONFIG_NO_SIGN_API || \
|
|
92
|
+
!MLD_CONFIG_NO_VERIFY_API) && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
|
|
90
93
|
(MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
|
|
91
94
|
44)) */
|