pq_crypto 0.3.2 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ci.yml +56 -0
- data/CHANGELOG.md +62 -0
- data/GET_STARTED.md +366 -40
- data/README.md +76 -233
- data/SECURITY.md +107 -82
- data/ext/pqcrypto/extconf.rb +169 -87
- data/ext/pqcrypto/mldsa_api.h +1 -48
- data/ext/pqcrypto/mlkem_api.h +1 -18
- data/ext/pqcrypto/pq_externalmu.c +89 -204
- data/ext/pqcrypto/pqcrypto_native_api.h +129 -0
- data/ext/pqcrypto/pqcrypto_ruby_secure.c +484 -84
- data/ext/pqcrypto/pqcrypto_secure.c +203 -78
- data/ext/pqcrypto/pqcrypto_secure.h +53 -14
- data/ext/pqcrypto/pqcrypto_version.h +7 -0
- data/ext/pqcrypto/randombytes.h +9 -0
- data/ext/pqcrypto/vendor/.vendored +10 -5
- data/ext/pqcrypto/vendor/mldsa-native/BUILDING.md +105 -0
- data/ext/pqcrypto/vendor/mldsa-native/LICENSE +286 -0
- data/ext/pqcrypto/vendor/mldsa-native/META.yml +24 -0
- data/ext/pqcrypto/vendor/mldsa-native/README.md +221 -0
- data/ext/pqcrypto/vendor/mldsa-native/SECURITY.md +8 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native.c +721 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native.h +975 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native_asm.S +724 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native_config.h +723 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/cbmc.h +166 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/common.h +321 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/ct.c +21 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/ct.h +385 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/debug.c +73 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/debug.h +130 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202.c +277 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202.h +244 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202x4.c +182 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202x4.h +117 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/keccakf1600.c +438 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/keccakf1600.h +105 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/auto.h +71 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/fips202_native_aarch64.h +62 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_scalar_asm.S +376 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_v84a_asm.S +204 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x2_v84a_asm.S +259 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_asm.S +1077 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm.S +987 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccakf1600_round_constants.c +41 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x1_scalar.h +26 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x1_v84a.h +35 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x2_v84a.h +37 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x4_v8a_scalar.h +27 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x4_v8a_v84a_scalar.h +36 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/api.h +69 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/README.md +10 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/mve.h +32 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/fips202_native_armv81m.h +20 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S +638 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.c +136 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/keccakf1600_round_constants.c +52 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/auto.h +29 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.c +488 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.h +16 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/xkcp.h +31 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/meta.h +247 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/aarch64_zetas.c +231 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/arith_native_aarch64.h +150 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/intt.S +753 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4.S +129 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5.S +145 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7.S +177 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/ntt.S +653 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/pointwise_montgomery.S +79 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_caddq_asm.S +53 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_chknorm_asm.S +55 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_32_asm.S +85 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_88_asm.S +85 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_32_asm.S +102 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_88_asm.S +110 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_17_asm.S +72 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_19_asm.S +69 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_table.c +40 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_asm.S +189 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta2_asm.S +135 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta4_asm.S +128 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta_table.c +543 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_table.c +62 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/api.h +649 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/meta.h +23 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/meta.h +315 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/arith_native_x86_64.h +124 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/consts.c +157 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/consts.h +27 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/intt.S +2311 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/ntt.S +2383 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/nttunpack.S +239 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise.S +131 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l4.S +139 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l5.S +155 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l7.S +187 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_caddq_avx2.c +61 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_chknorm_avx2.c +52 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c +155 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c +155 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c +102 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c +104 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c +91 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c +93 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/rej_uniform_avx2.c +126 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/rej_uniform_eta2_avx2.c +155 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/rej_uniform_eta4_avx2.c +139 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/rej_uniform_table.c +160 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/packing.c +293 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/packing.h +224 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/params.h +77 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly.c +991 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly.h +393 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly_kl.c +946 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly_kl.h +360 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec.c +877 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec.h +725 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/randombytes.h +26 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/reduce.h +139 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/rounding.h +249 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/sign.c +1511 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/sign.h +806 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/symmetric.h +68 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/sys.h +268 -0
- data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/zetas.inc +55 -0
- data/ext/pqcrypto/vendor/mlkem-native/BUILDING.md +104 -0
- data/ext/pqcrypto/vendor/mlkem-native/LICENSE +294 -0
- data/ext/pqcrypto/vendor/mlkem-native/META.yml +30 -0
- data/ext/pqcrypto/vendor/mlkem-native/README.md +223 -0
- data/ext/pqcrypto/vendor/mlkem-native/RELEASE.md +86 -0
- data/ext/pqcrypto/vendor/mlkem-native/SECURITY.md +8 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/README.md +23 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/mlkem_native.c +660 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/mlkem_native.h +538 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/mlkem_native_asm.S +681 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/mlkem_native_config.h +709 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/cbmc.h +174 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/common.h +274 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/compress.c +717 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/compress.h +688 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/debug.c +64 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/debug.h +128 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/fips202.c +251 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/fips202.h +158 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/fips202x4.c +208 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/fips202x4.h +80 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/keccakf1600.c +463 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/keccakf1600.h +98 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/aarch64/auto.h +70 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/aarch64/src/fips202_native_aarch64.h +69 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/aarch64/src/keccak_f1600_x1_scalar_asm.S +375 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/aarch64/src/keccak_f1600_x1_v84a_asm.S +203 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/aarch64/src/keccak_f1600_x2_v84a_asm.S +258 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_asm.S +1076 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm.S +986 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/aarch64/src/keccakf1600_round_constants.c +46 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/aarch64/x1_scalar.h +25 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/aarch64/x1_v84a.h +34 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/aarch64/x2_v84a.h +35 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/aarch64/x4_v8a_scalar.h +26 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/aarch64/x4_v8a_v84a_scalar.h +35 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/api.h +117 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/armv81m/README.md +10 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/armv81m/mve.h +79 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/armv81m/src/fips202_native_armv81m.h +35 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S +667 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.c +40 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/armv81m/src/keccakf1600_round_constants.c +51 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/armv81m/src/state_extract_bytes_x4_mve.S +290 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/armv81m/src/state_xor_bytes_x4_mve.S +314 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/auto.h +28 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/x86_64/keccak_f1600_x4_avx2.h +33 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/x86_64/src/fips202_native_x86_64.h +41 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/x86_64/src/keccak_f1600_x4_avx2.S +451 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/x86_64/src/keccakf1600_constants.c +51 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/indcpa.c +622 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/indcpa.h +156 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/kem.c +446 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/kem.h +326 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/README.md +16 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/meta.h +122 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/src/aarch64_zetas.c +174 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/src/arith_native_aarch64.h +177 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/src/intt.S +628 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/src/ntt.S +562 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/src/poly_mulcache_compute_asm.S +127 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/src/poly_reduce_asm.S +150 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/src/poly_tobytes_asm.S +117 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/src/poly_tomont_asm.S +98 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S +261 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S +314 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S +368 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/src/rej_uniform_asm.S +226 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/src/rej_uniform_table.c +542 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/api.h +637 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/meta.h +25 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/riscv64/README.md +11 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/riscv64/meta.h +128 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/riscv64/src/arith_native_riscv64.h +45 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/riscv64/src/rv64v_debug.c +81 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/riscv64/src/rv64v_debug.h +145 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/riscv64/src/rv64v_izetas.inc +27 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/riscv64/src/rv64v_poly.c +805 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/riscv64/src/rv64v_zetas.inc +27 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/riscv64/src/rv64v_zetas_basemul.inc +39 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/README.md +4 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/meta.h +304 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/arith_native_x86_64.h +309 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/compress_consts.c +94 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/compress_consts.h +45 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/consts.c +102 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/consts.h +25 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/intt.S +719 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/mulcache_compute.S +90 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/ntt.S +639 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/nttfrombytes.S +193 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/ntttobytes.S +181 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/nttunpack.S +174 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/poly_compress_d10.S +382 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/poly_compress_d11.S +448 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/poly_compress_d4.S +163 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/poly_compress_d5.S +220 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/poly_decompress_d10.S +228 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/poly_decompress_d11.S +277 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/poly_decompress_d4.S +180 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/poly_decompress_d5.S +192 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S +502 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S +750 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S +998 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/reduce.S +218 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/rej_uniform_asm.S +103 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/rej_uniform_table.c +544 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/tomont.S +155 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/params.h +76 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/poly.c +572 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/poly.h +317 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/poly_k.c +502 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/poly_k.h +668 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/randombytes.h +60 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/sampling.c +362 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/sampling.h +118 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/symmetric.h +70 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/sys.h +260 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/verify.c +20 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/verify.h +464 -0
- data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/zetas.inc +30 -0
- data/lib/pq_crypto/algorithm_registry.rb +200 -0
- data/lib/pq_crypto/hybrid_kem.rb +1 -12
- data/lib/pq_crypto/kem.rb +104 -13
- data/lib/pq_crypto/pkcs8.rb +387 -0
- data/lib/pq_crypto/serialization.rb +1 -14
- data/lib/pq_crypto/signature.rb +123 -17
- data/lib/pq_crypto/spki.rb +131 -0
- data/lib/pq_crypto/version.rb +1 -1
- data/lib/pq_crypto.rb +79 -20
- data/script/vendor_libs.rb +88 -155
- metadata +241 -73
- data/ext/pqcrypto/vendor/pqclean/common/aes.c +0 -639
- data/ext/pqcrypto/vendor/pqclean/common/aes.h +0 -64
- data/ext/pqcrypto/vendor/pqclean/common/compat.h +0 -73
- data/ext/pqcrypto/vendor/pqclean/common/crypto_declassify.h +0 -7
- data/ext/pqcrypto/vendor/pqclean/common/fips202.c +0 -928
- data/ext/pqcrypto/vendor/pqclean/common/fips202.h +0 -166
- data/ext/pqcrypto/vendor/pqclean/common/keccak2x/feat.S +0 -168
- data/ext/pqcrypto/vendor/pqclean/common/keccak2x/fips202x2.c +0 -684
- data/ext/pqcrypto/vendor/pqclean/common/keccak2x/fips202x2.h +0 -60
- data/ext/pqcrypto/vendor/pqclean/common/keccak4x/KeccakP-1600-times4-SIMD256.c +0 -1028
- data/ext/pqcrypto/vendor/pqclean/common/keccak4x/KeccakP-1600-times4-SnP.h +0 -50
- data/ext/pqcrypto/vendor/pqclean/common/keccak4x/KeccakP-1600-unrolling.macros +0 -198
- data/ext/pqcrypto/vendor/pqclean/common/keccak4x/Makefile +0 -8
- data/ext/pqcrypto/vendor/pqclean/common/keccak4x/Makefile.Microsoft_nmake +0 -8
- data/ext/pqcrypto/vendor/pqclean/common/keccak4x/SIMD256-config.h +0 -3
- data/ext/pqcrypto/vendor/pqclean/common/keccak4x/align.h +0 -34
- data/ext/pqcrypto/vendor/pqclean/common/keccak4x/brg_endian.h +0 -142
- data/ext/pqcrypto/vendor/pqclean/common/nistseedexpander.c +0 -101
- data/ext/pqcrypto/vendor/pqclean/common/nistseedexpander.h +0 -39
- data/ext/pqcrypto/vendor/pqclean/common/randombytes.c +0 -355
- data/ext/pqcrypto/vendor/pqclean/common/randombytes.h +0 -27
- data/ext/pqcrypto/vendor/pqclean/common/sha2.c +0 -769
- data/ext/pqcrypto/vendor/pqclean/common/sha2.h +0 -173
- data/ext/pqcrypto/vendor/pqclean/common/sp800-185.c +0 -156
- data/ext/pqcrypto/vendor/pqclean/common/sp800-185.h +0 -27
- data/ext/pqcrypto/vendor/pqclean/crypto_kem/ml-kem-768/clean/LICENSE +0 -5
- data/ext/pqcrypto/vendor/pqclean/crypto_kem/ml-kem-768/clean/Makefile +0 -19
- data/ext/pqcrypto/vendor/pqclean/crypto_kem/ml-kem-768/clean/Makefile.Microsoft_nmake +0 -23
- data/ext/pqcrypto/vendor/pqclean/crypto_kem/ml-kem-768/clean/api.h +0 -18
- data/ext/pqcrypto/vendor/pqclean/crypto_kem/ml-kem-768/clean/cbd.c +0 -83
- data/ext/pqcrypto/vendor/pqclean/crypto_kem/ml-kem-768/clean/cbd.h +0 -11
- data/ext/pqcrypto/vendor/pqclean/crypto_kem/ml-kem-768/clean/indcpa.c +0 -327
- data/ext/pqcrypto/vendor/pqclean/crypto_kem/ml-kem-768/clean/indcpa.h +0 -22
- data/ext/pqcrypto/vendor/pqclean/crypto_kem/ml-kem-768/clean/kem.c +0 -164
- data/ext/pqcrypto/vendor/pqclean/crypto_kem/ml-kem-768/clean/kem.h +0 -23
- data/ext/pqcrypto/vendor/pqclean/crypto_kem/ml-kem-768/clean/ntt.c +0 -146
- data/ext/pqcrypto/vendor/pqclean/crypto_kem/ml-kem-768/clean/ntt.h +0 -14
- data/ext/pqcrypto/vendor/pqclean/crypto_kem/ml-kem-768/clean/params.h +0 -36
- data/ext/pqcrypto/vendor/pqclean/crypto_kem/ml-kem-768/clean/poly.c +0 -299
- data/ext/pqcrypto/vendor/pqclean/crypto_kem/ml-kem-768/clean/poly.h +0 -37
- data/ext/pqcrypto/vendor/pqclean/crypto_kem/ml-kem-768/clean/polyvec.c +0 -188
- data/ext/pqcrypto/vendor/pqclean/crypto_kem/ml-kem-768/clean/polyvec.h +0 -26
- data/ext/pqcrypto/vendor/pqclean/crypto_kem/ml-kem-768/clean/reduce.c +0 -41
- data/ext/pqcrypto/vendor/pqclean/crypto_kem/ml-kem-768/clean/reduce.h +0 -13
- data/ext/pqcrypto/vendor/pqclean/crypto_kem/ml-kem-768/clean/symmetric-shake.c +0 -71
- data/ext/pqcrypto/vendor/pqclean/crypto_kem/ml-kem-768/clean/symmetric.h +0 -30
- data/ext/pqcrypto/vendor/pqclean/crypto_kem/ml-kem-768/clean/verify.c +0 -67
- data/ext/pqcrypto/vendor/pqclean/crypto_kem/ml-kem-768/clean/verify.h +0 -13
- data/ext/pqcrypto/vendor/pqclean/crypto_sign/ml-dsa-65/clean/LICENSE +0 -5
- data/ext/pqcrypto/vendor/pqclean/crypto_sign/ml-dsa-65/clean/Makefile +0 -19
- data/ext/pqcrypto/vendor/pqclean/crypto_sign/ml-dsa-65/clean/Makefile.Microsoft_nmake +0 -23
- data/ext/pqcrypto/vendor/pqclean/crypto_sign/ml-dsa-65/clean/api.h +0 -50
- data/ext/pqcrypto/vendor/pqclean/crypto_sign/ml-dsa-65/clean/ntt.c +0 -98
- data/ext/pqcrypto/vendor/pqclean/crypto_sign/ml-dsa-65/clean/ntt.h +0 -10
- data/ext/pqcrypto/vendor/pqclean/crypto_sign/ml-dsa-65/clean/packing.c +0 -261
- data/ext/pqcrypto/vendor/pqclean/crypto_sign/ml-dsa-65/clean/packing.h +0 -31
- data/ext/pqcrypto/vendor/pqclean/crypto_sign/ml-dsa-65/clean/params.h +0 -44
- data/ext/pqcrypto/vendor/pqclean/crypto_sign/ml-dsa-65/clean/poly.c +0 -799
- data/ext/pqcrypto/vendor/pqclean/crypto_sign/ml-dsa-65/clean/poly.h +0 -52
- data/ext/pqcrypto/vendor/pqclean/crypto_sign/ml-dsa-65/clean/polyvec.c +0 -415
- data/ext/pqcrypto/vendor/pqclean/crypto_sign/ml-dsa-65/clean/polyvec.h +0 -65
- data/ext/pqcrypto/vendor/pqclean/crypto_sign/ml-dsa-65/clean/reduce.c +0 -69
- data/ext/pqcrypto/vendor/pqclean/crypto_sign/ml-dsa-65/clean/reduce.h +0 -17
- data/ext/pqcrypto/vendor/pqclean/crypto_sign/ml-dsa-65/clean/rounding.c +0 -92
- data/ext/pqcrypto/vendor/pqclean/crypto_sign/ml-dsa-65/clean/rounding.h +0 -14
- data/ext/pqcrypto/vendor/pqclean/crypto_sign/ml-dsa-65/clean/sign.c +0 -407
- data/ext/pqcrypto/vendor/pqclean/crypto_sign/ml-dsa-65/clean/sign.h +0 -47
- data/ext/pqcrypto/vendor/pqclean/crypto_sign/ml-dsa-65/clean/symmetric-shake.c +0 -26
- data/ext/pqcrypto/vendor/pqclean/crypto_sign/ml-dsa-65/clean/symmetric.h +0 -34
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) The mldsa-native project authors
|
|
3
|
+
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/* References
|
|
7
|
+
* ==========
|
|
8
|
+
*
|
|
9
|
+
* - [REF_AVX2]
|
|
10
|
+
* CRYSTALS-Dilithium optimized AVX2 implementation
|
|
11
|
+
* Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
|
|
12
|
+
* https://github.com/pq-crystals/dilithium/tree/master/avx2
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
/*
|
|
16
|
+
* This file is derived from the public domain
|
|
17
|
+
* AVX2 Dilithium implementation @[REF_AVX2].
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include "../../../common.h"
|
|
21
|
+
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
|
|
22
|
+
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
|
|
23
|
+
(defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLDSA_L == 7)
|
|
24
|
+
|
|
25
|
+
/*
|
|
26
|
+
* WARNING: This file is auto-derived from the mldsa-native source file
|
|
27
|
+
* dev/x86_64/src/pointwise_acc_l7.S using scripts/simpasm. Do not modify it directly.
|
|
28
|
+
*/
|
|
29
|
+
|
|
30
|
+
#if defined(__ELF__)
|
|
31
|
+
.section .note.GNU-stack,"",@progbits
|
|
32
|
+
#endif
|
|
33
|
+
|
|
34
|
+
.text
|
|
35
|
+
.balign 4
|
|
36
|
+
/*
 * pointwise_acc_l7_avx2
 *
 * For every output dword position, multiplies the matching signed 32-bit
 * values from the two operand buffers in seven rows spaced 0x400 bytes
 * apart, sums the seven 64-bit products, runs the two-constant reduction
 * sequence near the end of the loop, and stores the upper 32 bits of each
 * reduced sum to the destination.
 *
 * Registers used as inputs:
 *   %rdi - destination; 0x40 bytes are written per iteration (0x400 total)
 *   %rsi - first operand; seven rows of 0x400 bytes are read (0x1c00 total)
 *   %rdx - second operand; read at the same offsets as %rsi
 *   %rcx - constant table; 32-byte vectors are loaded from +0x0 and +0x20
 *
 * Every memory access is a vmovdqa on a 32-byte vector, so all four
 * pointers must be 32-byte aligned.
 *
 * Registers written: %eax, %ymm0-%ymm13, flags; %rdi, %rsi and %rdx are
 * each advanced by 0x400 by the time the routine returns.
 *
 * NOTE(review): the multiply / multiply / subtract / keep-upper-half
 * sequence has the shape of a Montgomery reduction, which would make
 * %ymm0 = q^-1 mod 2^32 and %ymm1 = q. The table contents are not visible
 * here - confirm against consts.h.
 */
.global MLD_ASM_NAMESPACE(pointwise_acc_l7_avx2)
|
|
37
|
+
MLD_ASM_FN_SYMBOL(pointwise_acc_l7_avx2)
|
|
38
|
+
|
|
39
|
+
.cfi_startproc
|
|
40
|
+
vmovdqa 0x20(%rcx), %ymm0 /* ymm0 = table vector at +0x20; first multiplier in the reduction */
|
|
41
|
+
vmovdqa (%rcx), %ymm1 /* ymm1 = table vector at +0x0; second multiplier in the reduction */
|
|
42
|
+
xorl %eax, %eax /* eax = iteration counter, starts at 0 */
|
|
43
|
+
|
|
44
|
+
Lpointwise_acc_l7_avx2_looptop2:
|
|
45
|
+
vmovdqa (%rsi), %ymm6 /* row 0: load 16 dwords (two vectors) from each operand */
|
|
46
|
+
vmovdqa 0x20(%rsi), %ymm8
|
|
47
|
+
vmovdqa (%rdx), %ymm10
|
|
48
|
+
vmovdqa 0x20(%rdx), %ymm12
|
|
49
|
+
vpsrlq $0x20, %ymm6, %ymm7 /* odd dwords of the first operand moved into the low half of each qword */
|
|
50
|
+
vpsrlq $0x20, %ymm8, %ymm9
|
|
51
|
+
vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
|
|
52
|
+
vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
|
|
53
|
+
vpmuldq %ymm10, %ymm6, %ymm6 /* signed 32x32->64 products of the even dwords */
|
|
54
|
+
vpmuldq %ymm11, %ymm7, %ymm7 /* signed 32x32->64 products of the odd dwords */
|
|
55
|
+
vpmuldq %ymm12, %ymm8, %ymm8
|
|
56
|
+
vpmuldq %ymm13, %ymm9, %ymm9
|
|
57
|
+
vmovdqa %ymm6, %ymm2 /* row 0 products initialise the 64-bit accumulators ymm2..ymm5 */
|
|
58
|
+
vmovdqa %ymm7, %ymm3
|
|
59
|
+
vmovdqa %ymm8, %ymm4
|
|
60
|
+
vmovdqa %ymm9, %ymm5
|
|
61
|
+
vmovdqa 0x400(%rsi), %ymm6 /* row 1: same product pattern, 0x400 bytes further on */
|
|
62
|
+
vmovdqa 0x420(%rsi), %ymm8
|
|
63
|
+
vmovdqa 0x400(%rdx), %ymm10
|
|
64
|
+
vmovdqa 0x420(%rdx), %ymm12
|
|
65
|
+
vpsrlq $0x20, %ymm6, %ymm7
|
|
66
|
+
vpsrlq $0x20, %ymm8, %ymm9
|
|
67
|
+
vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
|
|
68
|
+
vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
|
|
69
|
+
vpmuldq %ymm10, %ymm6, %ymm6
|
|
70
|
+
vpmuldq %ymm11, %ymm7, %ymm7
|
|
71
|
+
vpmuldq %ymm12, %ymm8, %ymm8
|
|
72
|
+
vpmuldq %ymm13, %ymm9, %ymm9
|
|
73
|
+
vpaddq %ymm2, %ymm6, %ymm2 /* 64-bit add of the row products into the accumulators */
|
|
74
|
+
vpaddq %ymm3, %ymm7, %ymm3
|
|
75
|
+
vpaddq %ymm4, %ymm8, %ymm4
|
|
76
|
+
vpaddq %ymm5, %ymm9, %ymm5
|
|
77
|
+
vmovdqa 0x800(%rsi), %ymm6 /* row 2 */
|
|
78
|
+
vmovdqa 0x820(%rsi), %ymm8
|
|
79
|
+
vmovdqa 0x800(%rdx), %ymm10
|
|
80
|
+
vmovdqa 0x820(%rdx), %ymm12
|
|
81
|
+
vpsrlq $0x20, %ymm6, %ymm7
|
|
82
|
+
vpsrlq $0x20, %ymm8, %ymm9
|
|
83
|
+
vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
|
|
84
|
+
vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
|
|
85
|
+
vpmuldq %ymm10, %ymm6, %ymm6
|
|
86
|
+
vpmuldq %ymm11, %ymm7, %ymm7
|
|
87
|
+
vpmuldq %ymm12, %ymm8, %ymm8
|
|
88
|
+
vpmuldq %ymm13, %ymm9, %ymm9
|
|
89
|
+
vpaddq %ymm2, %ymm6, %ymm2
|
|
90
|
+
vpaddq %ymm3, %ymm7, %ymm3
|
|
91
|
+
vpaddq %ymm4, %ymm8, %ymm4
|
|
92
|
+
vpaddq %ymm5, %ymm9, %ymm5
|
|
93
|
+
vmovdqa 0xc00(%rsi), %ymm6 /* row 3 */
|
|
94
|
+
vmovdqa 0xc20(%rsi), %ymm8
|
|
95
|
+
vmovdqa 0xc00(%rdx), %ymm10
|
|
96
|
+
vmovdqa 0xc20(%rdx), %ymm12
|
|
97
|
+
vpsrlq $0x20, %ymm6, %ymm7
|
|
98
|
+
vpsrlq $0x20, %ymm8, %ymm9
|
|
99
|
+
vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
|
|
100
|
+
vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
|
|
101
|
+
vpmuldq %ymm10, %ymm6, %ymm6
|
|
102
|
+
vpmuldq %ymm11, %ymm7, %ymm7
|
|
103
|
+
vpmuldq %ymm12, %ymm8, %ymm8
|
|
104
|
+
vpmuldq %ymm13, %ymm9, %ymm9
|
|
105
|
+
vpaddq %ymm2, %ymm6, %ymm2
|
|
106
|
+
vpaddq %ymm3, %ymm7, %ymm3
|
|
107
|
+
vpaddq %ymm4, %ymm8, %ymm4
|
|
108
|
+
vpaddq %ymm5, %ymm9, %ymm5
|
|
109
|
+
vmovdqa 0x1000(%rsi), %ymm6 /* row 4 */
|
|
110
|
+
vmovdqa 0x1020(%rsi), %ymm8
|
|
111
|
+
vmovdqa 0x1000(%rdx), %ymm10
|
|
112
|
+
vmovdqa 0x1020(%rdx), %ymm12
|
|
113
|
+
vpsrlq $0x20, %ymm6, %ymm7
|
|
114
|
+
vpsrlq $0x20, %ymm8, %ymm9
|
|
115
|
+
vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
|
|
116
|
+
vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
|
|
117
|
+
vpmuldq %ymm10, %ymm6, %ymm6
|
|
118
|
+
vpmuldq %ymm11, %ymm7, %ymm7
|
|
119
|
+
vpmuldq %ymm12, %ymm8, %ymm8
|
|
120
|
+
vpmuldq %ymm13, %ymm9, %ymm9
|
|
121
|
+
vpaddq %ymm2, %ymm6, %ymm2
|
|
122
|
+
vpaddq %ymm3, %ymm7, %ymm3
|
|
123
|
+
vpaddq %ymm4, %ymm8, %ymm4
|
|
124
|
+
vpaddq %ymm5, %ymm9, %ymm5
|
|
125
|
+
vmovdqa 0x1400(%rsi), %ymm6 /* row 5 */
|
|
126
|
+
vmovdqa 0x1420(%rsi), %ymm8
|
|
127
|
+
vmovdqa 0x1400(%rdx), %ymm10
|
|
128
|
+
vmovdqa 0x1420(%rdx), %ymm12
|
|
129
|
+
vpsrlq $0x20, %ymm6, %ymm7
|
|
130
|
+
vpsrlq $0x20, %ymm8, %ymm9
|
|
131
|
+
vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
|
|
132
|
+
vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
|
|
133
|
+
vpmuldq %ymm10, %ymm6, %ymm6
|
|
134
|
+
vpmuldq %ymm11, %ymm7, %ymm7
|
|
135
|
+
vpmuldq %ymm12, %ymm8, %ymm8
|
|
136
|
+
vpmuldq %ymm13, %ymm9, %ymm9
|
|
137
|
+
vpaddq %ymm2, %ymm6, %ymm2
|
|
138
|
+
vpaddq %ymm3, %ymm7, %ymm3
|
|
139
|
+
vpaddq %ymm4, %ymm8, %ymm4
|
|
140
|
+
vpaddq %ymm5, %ymm9, %ymm5
|
|
141
|
+
vmovdqa 0x1800(%rsi), %ymm6 /* row 6 (seventh and last row) */
|
|
142
|
+
vmovdqa 0x1820(%rsi), %ymm8
|
|
143
|
+
vmovdqa 0x1800(%rdx), %ymm10
|
|
144
|
+
vmovdqa 0x1820(%rdx), %ymm12
|
|
145
|
+
vpsrlq $0x20, %ymm6, %ymm7
|
|
146
|
+
vpsrlq $0x20, %ymm8, %ymm9
|
|
147
|
+
vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
|
|
148
|
+
vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
|
|
149
|
+
vpmuldq %ymm10, %ymm6, %ymm6
|
|
150
|
+
vpmuldq %ymm11, %ymm7, %ymm7
|
|
151
|
+
vpmuldq %ymm12, %ymm8, %ymm8
|
|
152
|
+
vpmuldq %ymm13, %ymm9, %ymm9
|
|
153
|
+
vpaddq %ymm2, %ymm6, %ymm2
|
|
154
|
+
vpaddq %ymm3, %ymm7, %ymm3
|
|
155
|
+
vpaddq %ymm4, %ymm8, %ymm4
|
|
156
|
+
vpaddq %ymm5, %ymm9, %ymm5
|
|
157
|
+
vpmuldq %ymm2, %ymm0, %ymm6 /* reduction step 1: t = low32(acc) * low32(ymm0), per qword */
|
|
158
|
+
vpmuldq %ymm3, %ymm0, %ymm7
|
|
159
|
+
vpmuldq %ymm4, %ymm0, %ymm8
|
|
160
|
+
vpmuldq %ymm5, %ymm0, %ymm9
|
|
161
|
+
vpmuldq %ymm6, %ymm1, %ymm6 /* reduction step 2: t = low32(t) * low32(ymm1) */
|
|
162
|
+
vpmuldq %ymm7, %ymm1, %ymm7
|
|
163
|
+
vpmuldq %ymm8, %ymm1, %ymm8
|
|
164
|
+
vpmuldq %ymm9, %ymm1, %ymm9
|
|
165
|
+
vpsubq %ymm6, %ymm2, %ymm2 /* reduction step 3: acc -= t; only the upper dword of each qword is kept below */
|
|
166
|
+
vpsubq %ymm7, %ymm3, %ymm3
|
|
167
|
+
vpsubq %ymm8, %ymm4, %ymm4
|
|
168
|
+
vpsubq %ymm9, %ymm5, %ymm5
|
|
169
|
+
vpsrlq $0x20, %ymm2, %ymm2 /* even-position results: upper dwords moved down into even dword slots */
|
|
170
|
+
vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
|
|
171
|
+
vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
|
|
172
|
+
vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7]
|
|
173
|
+
vmovdqa %ymm2, (%rdi) /* store the 16 interleaved result dwords */
|
|
174
|
+
vmovdqa %ymm4, 0x20(%rdi)
|
|
175
|
+
addq $0x40, %rsi /* advance both operands and the destination by 0x40 bytes */
|
|
176
|
+
addq $0x40, %rdx
|
|
177
|
+
addq $0x40, %rdi
|
|
178
|
+
addl $0x1, %eax
|
|
179
|
+
cmpl $0x10, %eax /* 0x10 iterations x 0x40 bytes = 0x400 bytes, i.e. one full row */
|
|
180
|
+
jb Lpointwise_acc_l7_avx2_looptop2
|
|
181
|
+
retq
|
|
182
|
+
.cfi_endproc
|
|
183
|
+
|
|
184
|
+
MLD_ASM_FN_SIZE(pointwise_acc_l7_avx2)
|
|
185
|
+
|
|
186
|
+
#endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
|
|
187
|
+
&& (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLDSA_L == 7) */
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) The mldsa-native project authors
|
|
3
|
+
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/* References
|
|
7
|
+
* ==========
|
|
8
|
+
*
|
|
9
|
+
* - [REF_AVX2]
|
|
10
|
+
* CRYSTALS-Dilithium optimized AVX2 implementation
|
|
11
|
+
* Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
|
|
12
|
+
* https://github.com/pq-crystals/dilithium/tree/master/avx2
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
/*
|
|
16
|
+
* This file is derived from the public domain
|
|
17
|
+
* AVX2 Dilithium implementation @[REF_AVX2].
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include "../../../common.h"
|
|
21
|
+
|
|
22
|
+
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
|
|
23
|
+
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
|
|
24
|
+
|
|
25
|
+
#include <immintrin.h>
|
|
26
|
+
#include "arith_native_x86_64.h"
|
|
27
|
+
#include "consts.h"
|
|
28
|
+
|
|
29
|
+
/*************************************************
|
|
30
|
+
* Name: mld_poly_caddq_avx2
|
|
31
|
+
*
|
|
32
|
+
* Description: For all coefficients of in/out polynomial add Q if
|
|
33
|
+
* coefficient is negative.
|
|
34
|
+
*
|
|
35
|
+
* Arguments: - int32_t *r: pointer to input/output polynomial
|
|
36
|
+
**************************************************/
|
|
37
|
+
void mld_poly_caddq_avx2(int32_t *r)
|
|
38
|
+
{
|
|
39
|
+
unsigned int i;
|
|
40
|
+
__m256i f, g;
|
|
41
|
+
const __m256i q = _mm256_set1_epi32(MLDSA_Q);
|
|
42
|
+
const __m256i zero = _mm256_setzero_si256();
|
|
43
|
+
__m256i *rr = (__m256i *)r;
|
|
44
|
+
|
|
45
|
+
for (i = 0; i < MLDSA_N / 8; i++)
|
|
46
|
+
{
|
|
47
|
+
f = _mm256_load_si256(&rr[i]);
|
|
48
|
+
g = _mm256_cmpgt_epi32(zero, f);
|
|
49
|
+
g = _mm256_and_si256(g, q);
|
|
50
|
+
f = _mm256_add_epi32(f, g);
|
|
51
|
+
_mm256_store_si256(&rr[i], f);
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
|
|
56
|
+
*/
|
|
57
|
+
|
|
58
|
+
MLD_EMPTY_CU(avx2_reduce)
|
|
59
|
+
|
|
60
|
+
#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \
|
|
61
|
+
!MLD_CONFIG_MULTILEVEL_NO_SHARED) */
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) The mldsa-native project authors
|
|
3
|
+
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/* References
|
|
7
|
+
* ==========
|
|
8
|
+
*
|
|
9
|
+
* - [REF_AVX2]
|
|
10
|
+
* CRYSTALS-Dilithium optimized AVX2 implementation
|
|
11
|
+
* Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
|
|
12
|
+
* https://github.com/pq-crystals/dilithium/tree/master/avx2
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
/*
|
|
16
|
+
* This file is derived from the public domain
|
|
17
|
+
* AVX2 Dilithium implementation @[REF_AVX2].
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
#include "../../../common.h"
|
|
21
|
+
|
|
22
|
+
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
|
|
23
|
+
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
|
|
24
|
+
|
|
25
|
+
#include <immintrin.h>
|
|
26
|
+
#include "arith_native_x86_64.h"
|
|
27
|
+
|
|
28
|
+
int mld_poly_chknorm_avx2(const int32_t *a, int32_t B)
|
|
29
|
+
{
|
|
30
|
+
unsigned int i;
|
|
31
|
+
__m256i f, t;
|
|
32
|
+
const __m256i bound = _mm256_set1_epi32(B - 1);
|
|
33
|
+
|
|
34
|
+
t = _mm256_setzero_si256();
|
|
35
|
+
for (i = 0; i < MLDSA_N / 8; i++)
|
|
36
|
+
{
|
|
37
|
+
f = _mm256_load_si256((const __m256i *)&a[8 * i]);
|
|
38
|
+
f = _mm256_abs_epi32(f);
|
|
39
|
+
f = _mm256_cmpgt_epi32(f, bound);
|
|
40
|
+
t = _mm256_or_si256(t, f);
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
return 1 - _mm256_testz_si256(t, t);
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
|
|
47
|
+
*/
|
|
48
|
+
|
|
49
|
+
MLD_EMPTY_CU(avx2_poly_chknorm)
|
|
50
|
+
|
|
51
|
+
#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \
|
|
52
|
+
!MLD_CONFIG_MULTILEVEL_NO_SHARED) */
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) The mldsa-native project authors
|
|
3
|
+
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/* References
|
|
7
|
+
* ==========
|
|
8
|
+
*
|
|
9
|
+
* - [REF_AVX2]
|
|
10
|
+
* CRYSTALS-Dilithium optimized AVX2 implementation
|
|
11
|
+
* Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
|
|
12
|
+
* https://github.com/pq-crystals/dilithium/tree/master/avx2
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
/*
|
|
16
|
+
* This file is derived from the public domain
|
|
17
|
+
* AVX2 Dilithium implementation @[REF_AVX2].
|
|
18
|
+
*
|
|
19
|
+
* The algorithm for Decompose(r) (more specifically the handling for the
|
|
20
|
+
* wrap-around cases) is modified. See the "Reference" section in the comments
|
|
21
|
+
* below for a more detailed comparison.
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
#include "../../../common.h"
|
|
25
|
+
|
|
26
|
+
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
|
|
27
|
+
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
|
|
28
|
+
(defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
|
|
29
|
+
(MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87))
|
|
30
|
+
|
|
31
|
+
#include <immintrin.h>
|
|
32
|
+
#include "arith_native_x86_64.h"
|
|
33
|
+
#include "consts.h"
|
|
34
|
+
|
|
35
|
+
/*
|
|
36
|
+
* Reference: The reference implementation has the input polynomial as a
|
|
37
|
+
* separate argument that may be aliased with either of the outputs.
|
|
38
|
+
* Removing the aliasing eases CBMC proofs.
|
|
39
|
+
*/
|
|
40
|
+
void mld_poly_decompose_32_avx2(int32_t *a1, int32_t *a0)
|
|
41
|
+
{
|
|
42
|
+
unsigned int i;
|
|
43
|
+
__m256i f, f0, f1, t;
|
|
44
|
+
const __m256i q_bound = _mm256_set1_epi32(31 * ((MLDSA_Q - 1) / 32));
|
|
45
|
+
/* check-magic: 1025 == floor(2**22 / 4092) */
|
|
46
|
+
const __m256i v = _mm256_set1_epi32(1025);
|
|
47
|
+
const __m256i alpha = _mm256_set1_epi32(2 * ((MLDSA_Q - 1) / 32));
|
|
48
|
+
const __m256i off = _mm256_set1_epi32(127);
|
|
49
|
+
const __m256i shift = _mm256_set1_epi32(512);
|
|
50
|
+
|
|
51
|
+
for (i = 0; i < MLDSA_N / 8; i++)
|
|
52
|
+
{
|
|
53
|
+
f = _mm256_load_si256((__m256i *)&a0[8 * i]);
|
|
54
|
+
|
|
55
|
+
/* check-magic: 4092 == intdiv(2 * intdiv(MLDSA_Q - 1, 32), 128) */
|
|
56
|
+
/*
|
|
57
|
+
* Compute f1 = round-(f / (2*GAMMA2)) as round-(f / (128B)) =
|
|
58
|
+
* round-(ceil(f / 128) / B) where B = 2*GAMMA2 / 128 = 4092. See
|
|
59
|
+
* mld_decompose() in mldsa/src/rounding.h for more details.
|
|
60
|
+
*
|
|
61
|
+
* range: 0 <= f <= Q-1 = 32*GAMMA2 = 16*128*B
|
|
62
|
+
*/
|
|
63
|
+
|
|
64
|
+
/* Compute f1' = ceil(f / 128) as floor((f + 127) / 2^7) */
|
|
65
|
+
f1 = _mm256_add_epi32(f, off);
|
|
66
|
+
f1 = _mm256_srli_epi32(f1, 7);
|
|
67
|
+
/*
|
|
68
|
+
* range: 0 <= f1' <= (Q-1)/128 = 16B
|
|
69
|
+
*
|
|
70
|
+
* Also, f1' <= (Q-1)/128 = 2^16 - 2^6 < 2^16 ensures that the odd-index
|
|
71
|
+
* 16-bit lanes are all 0, so no bits will be dropped in the input of the
|
|
72
|
+
* _mm256_mulhi_epu16() below.
|
|
73
|
+
*/
|
|
74
|
+
|
|
75
|
+
/*
|
|
76
|
+
* Compute f1 = round-(f1' / B) ≈ round(f1' * 1025 / 2^22). This is exact
|
|
77
|
+
* for 0 <= f1' < 2^16. See mld_decompose() in mldsa/src/rounding.h for the
|
|
78
|
+
* proof, and proofs/isabelle/compress for a formalization of the argument.
|
|
79
|
+
*
|
|
80
|
+
* round(f1' * 1025 / 2^22) is in turn computed in 2 steps as
|
|
81
|
+
* round(floor(f1' * 1025 / 2^16) / 2^6). The mulhi computes f1'' =
|
|
82
|
+
* floor(f1' * 1025 / 2^16). As for the next step f1 = round(f1'' / 2^6),
|
|
83
|
+
* because AVX2 doesn't have rounding right-shift (e.g. urshr in Neon), we
|
|
84
|
+
* simulate it using mulhrs with a power of 2, in this case mulhrs(f1'',
|
|
85
|
+
* 2^9) = round(f1'' * 2^9 / 2^15). (Note that the denominator is 2^15,
|
|
86
|
+
* not 2^16 as in mulhi.)
|
|
87
|
+
*/
|
|
88
|
+
f1 = _mm256_mulhi_epu16(f1, v);
|
|
89
|
+
/*
|
|
90
|
+
* range: 0 <= f1'' = floor(f1' * 1025 / 2^16)
|
|
91
|
+
* <= f1' * 1025 / 2^16
|
|
92
|
+
* < 2^16 * 1025 / 2^16 = 1025
|
|
93
|
+
*
|
|
94
|
+
* Because 0 <= f1'' < 2^15, the multiplication in mulhrs is unsigned, that
|
|
95
|
+
* is, no erroneous sign-extension occurs.
|
|
96
|
+
*/
|
|
97
|
+
f1 = _mm256_mulhrs_epi16(f1, shift);
|
|
98
|
+
/*
|
|
99
|
+
* range: 0 <= f1 = round-(f1' / B) <= round-(16B / B) = 16
|
|
100
|
+
*
|
|
101
|
+
* Note that the odd-index 16-bit lanes are still all 0 right now, so
|
|
102
|
+
* reinterpreting f1 as 8 lanes of int32_t (as done in the following) does
|
|
103
|
+
* not affect its value.
|
|
104
|
+
*/
|
|
105
|
+
|
|
106
|
+
/*
|
|
107
|
+
* If f1 = 16, i.e. f > 31*GAMMA2, proceed as if f' = f - Q was given
|
|
108
|
+
* instead. (For f = 31*GAMMA2 + 1 thus f' = -GAMMA2, we still round it to 0
|
|
109
|
+
* like other "wrapped around" cases.)
|
|
110
|
+
*
|
|
111
|
+
* Reference: They handle wrap-around in a somewhat convoluted way. Most
|
|
112
|
+
* notably, they compute remainder f0 with quotient f1 that's
|
|
113
|
+
* already wrapped around, so is off by q (instead of by 1) from
|
|
114
|
+
* what it should be ultimately. They detect the need for
|
|
115
|
+
* correction by checking if f0 is abnormally large.
|
|
116
|
+
*
|
|
117
|
+
* Our approach is closer to Algorithm 36 in the specification,
|
|
118
|
+
* in that we compute f0 normally and correct f1, f0 in the way
|
|
119
|
+
* they prescribed. The only real difference is that we check for
|
|
120
|
+
* wrap-around by examining f directly, instead of some other
|
|
121
|
+
* intermediates computed from it.
|
|
122
|
+
*/
|
|
123
|
+
|
|
124
|
+
/* Check for wrap-around */
|
|
125
|
+
t = _mm256_cmpgt_epi32(f, q_bound);
|
|
126
|
+
|
|
127
|
+
/* Compute remainder f0 */
|
|
128
|
+
f0 = _mm256_mullo_epi32(f1, alpha);
|
|
129
|
+
f0 = _mm256_sub_epi32(f, f0);
|
|
130
|
+
/*
|
|
131
|
+
* range: -GAMMA2 < f0 <= GAMMA2
|
|
132
|
+
*
|
|
133
|
+
* This holds since f1 = round-(f / (2*GAMMA2)) was computed exactly.
|
|
134
|
+
*/
|
|
135
|
+
|
|
136
|
+
/* If wrap-around is required, set f1 = 0 and f0 -= 1 */
|
|
137
|
+
f1 = _mm256_andnot_si256(t, f1);
|
|
138
|
+
f0 = _mm256_add_epi32(f0, t);
|
|
139
|
+
/* range: 0 <= f1 <= 15, -GAMMA2 <= f0 <= GAMMA2 */
|
|
140
|
+
|
|
141
|
+
_mm256_store_si256((__m256i *)&a1[8 * i], f1);
|
|
142
|
+
_mm256_store_si256((__m256i *)&a0[8 * i], f0);
|
|
143
|
+
}
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
|
|
147
|
+
&& (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
|
|
148
|
+
65 || MLD_CONFIG_PARAMETER_SET == 87) */
|
|
149
|
+
|
|
150
|
+
MLD_EMPTY_CU(avx2_poly_decompose_32)
|
|
151
|
+
|
|
152
|
+
#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \
|
|
153
|
+
!MLD_CONFIG_MULTILEVEL_NO_SHARED && \
|
|
154
|
+
(MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
|
|
155
|
+
|| MLD_CONFIG_PARAMETER_SET == 87)) */
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (c) The mldsa-native project authors
|
|
3
|
+
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
/* References
|
|
7
|
+
* ==========
|
|
8
|
+
*
|
|
9
|
+
* - [REF_AVX2]
|
|
10
|
+
* CRYSTALS-Dilithium optimized AVX2 implementation
|
|
11
|
+
* Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
|
|
12
|
+
* https://github.com/pq-crystals/dilithium/tree/master/avx2
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
/*
|
|
16
|
+
* This file is derived from the public domain
|
|
17
|
+
* AVX2 Dilithium implementation @[REF_AVX2].
|
|
18
|
+
*
|
|
19
|
+
* The algorithm for Decompose(r) (more specifically the handling for the
|
|
20
|
+
* wrap-around cases) is modified. See the "Reference" section in the comments
|
|
21
|
+
* below for a more detailed comparison.
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
#include "../../../common.h"
|
|
25
|
+
|
|
26
|
+
#if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
|
|
27
|
+
!defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
|
|
28
|
+
(defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
|
|
29
|
+
MLD_CONFIG_PARAMETER_SET == 44)
|
|
30
|
+
|
|
31
|
+
#include <immintrin.h>
|
|
32
|
+
#include "arith_native_x86_64.h"
|
|
33
|
+
#include "consts.h"
|
|
34
|
+
|
|
35
|
+
/*
|
|
36
|
+
* Reference: The reference implementation has the input polynomial as a
|
|
37
|
+
* separate argument that may be aliased with either of the outputs.
|
|
38
|
+
* Removing the aliasing eases CBMC proofs.
|
|
39
|
+
*/
|
|
40
|
+
|
|
41
|
+
void mld_poly_decompose_88_avx2(int32_t *a1, int32_t *a0)
|
|
42
|
+
{
|
|
43
|
+
unsigned int i;
|
|
44
|
+
__m256i f, f0, f1, t;
|
|
45
|
+
const __m256i q_bound = _mm256_set1_epi32(87 * ((MLDSA_Q - 1) / 88));
|
|
46
|
+
/* check-magic: 11275 == floor(2**24 / 1488) */
|
|
47
|
+
const __m256i v = _mm256_set1_epi32(11275);
|
|
48
|
+
const __m256i alpha = _mm256_set1_epi32(2 * ((MLDSA_Q - 1) / 88));
|
|
49
|
+
const __m256i off = _mm256_set1_epi32(127);
|
|
50
|
+
const __m256i shift = _mm256_set1_epi32(128);
|
|
51
|
+
|
|
52
|
+
for (i = 0; i < MLDSA_N / 8; i++)
|
|
53
|
+
{
|
|
54
|
+
f = _mm256_load_si256((__m256i *)&a0[8 * i]);
|
|
55
|
+
|
|
56
|
+
/* check-magic: 1488 == intdiv(2 * intdiv(MLDSA_Q - 1, 88), 128) */
|
|
57
|
+
/*
|
|
58
|
+
* Compute f1 = round-(f / (2*GAMMA2)) as round-(f / (128B)) =
|
|
59
|
+
* round-(ceil(f / 128) / B) where B = 2*GAMMA2 / 128 = 1488. See
|
|
60
|
+
* mld_decompose() in mldsa/src/rounding.h for more details.
|
|
61
|
+
*
|
|
62
|
+
* range: 0 <= f <= Q-1 = 88*GAMMA2 = 44*128*B
|
|
63
|
+
*/
|
|
64
|
+
|
|
65
|
+
/* Compute f1' = ceil(f / 128) as floor((f + 127) / 2^7) */
|
|
66
|
+
f1 = _mm256_add_epi32(f, off);
|
|
67
|
+
f1 = _mm256_srli_epi32(f1, 7);
|
|
68
|
+
/*
|
|
69
|
+
* range: 0 <= f1' <= (Q-1)/128 = 44B
|
|
70
|
+
*
|
|
71
|
+
* Also, f1' <= (Q-1)/128 = 2^16 - 2^6 < 2^16 ensures that the odd-index
|
|
72
|
+
* 16-bit lanes are all 0, so no bits will be dropped in the input of the
|
|
73
|
+
* _mm256_mulhi_epu16() below.
|
|
74
|
+
*/
|
|
75
|
+
|
|
76
|
+
/*
|
|
77
|
+
* Compute f1 = round-(f1' / B) ≈ round(f1' * 11275 / 2^24). This is exact
|
|
78
|
+
* for 0 <= f1' < 2^16. See mld_decompose() in mldsa/src/rounding.h for the
|
|
79
|
+
* proof, and proofs/isabelle/compress for a formalization of the argument.
|
|
80
|
+
*
|
|
81
|
+
* round(f1' * 11275 / 2^24) is in turn computed in 2 steps as
|
|
82
|
+
* round(floor(f1' * 11275 / 2^16) / 2^8). The mulhi computes f1'' =
|
|
83
|
+
* floor(f1' * 11275 / 2^16). As for the next step f1 = round(f1'' / 2^8),
|
|
84
|
+
* because AVX2 doesn't have rounding right-shift (e.g. urshr in Neon), we
|
|
85
|
+
* simulate it using mulhrs with a power of 2, in this case mulhrs(f1'',
|
|
86
|
+
* 2^7) = round(f1'' * 2^7 / 2^15). (Note that the denominator is 2^15,
|
|
87
|
+
* not 2^16 as in mulhi.)
|
|
88
|
+
*/
|
|
89
|
+
f1 = _mm256_mulhi_epu16(f1, v);
|
|
90
|
+
/*
|
|
91
|
+
* range: 0 <= f1'' = floor(f1' * 11275 / 2^16)
|
|
92
|
+
* <= f1' * 11275 / 2^16
|
|
93
|
+
* < 2^16 * 11275 / 2^16 = 11275
|
|
94
|
+
*
|
|
95
|
+
* Because 0 <= f1'' < 2^15, the multiplication in mulhrs is unsigned, that
|
|
96
|
+
* is, no erroneous sign-extension occurs.
|
|
97
|
+
*/
|
|
98
|
+
f1 = _mm256_mulhrs_epi16(f1, shift);
|
|
99
|
+
/*
|
|
100
|
+
* range: 0 <= f1 = round-(f1' / B) <= round-(44B / B) = 44
|
|
101
|
+
*
|
|
102
|
+
* Note that the odd-index 16-bit lanes are still all 0 right now, so
|
|
103
|
+
* reinterpreting f1 as 8 lanes of int32_t (as done in the following) does
|
|
104
|
+
* not affect its value.
|
|
105
|
+
*/
|
|
106
|
+
|
|
107
|
+
/*
|
|
108
|
+
* If f1 = 44, i.e. f > 87*GAMMA2, proceed as if f' = f - Q was given
|
|
109
|
+
* instead. (For f = 87*GAMMA2 + 1 thus f' = -GAMMA2, we still round it to 0
|
|
110
|
+
* like other "wrapped around" cases.)
|
|
111
|
+
*
|
|
112
|
+
* Reference: They handle wrap-around in a somewhat convoluted way. Most
|
|
113
|
+
* notably, they compute remainder f0 with quotient f1 that's
|
|
114
|
+
* already wrapped around, so is off by q (instead of by 1) from
|
|
115
|
+
* what it should be ultimately. They detect the need for
|
|
116
|
+
* correction by checking if f0 is abnormally large.
|
|
117
|
+
*
|
|
118
|
+
* Our approach is closer to Algorithm 36 in the specification,
|
|
119
|
+
* in that we compute f0 normally and correct f1, f0 in the way
|
|
120
|
+
* they prescribed. The only real difference is that we check for
|
|
121
|
+
* wrap-around by examining f directly, instead of some other
|
|
122
|
+
* intermediates computed from it.
|
|
123
|
+
*/
|
|
124
|
+
|
|
125
|
+
/* Check for wrap-around */
|
|
126
|
+
t = _mm256_cmpgt_epi32(f, q_bound);
|
|
127
|
+
|
|
128
|
+
/* Compute remainder f0 */
|
|
129
|
+
f0 = _mm256_mullo_epi32(f1, alpha);
|
|
130
|
+
f0 = _mm256_sub_epi32(f, f0);
|
|
131
|
+
/*
|
|
132
|
+
* range: -GAMMA2 < f0 <= GAMMA2
|
|
133
|
+
*
|
|
134
|
+
* This holds since f1 = round-(f / (2*GAMMA2)) was computed exactly.
|
|
135
|
+
*/
|
|
136
|
+
|
|
137
|
+
/* If wrap-around is required, set f1 = 0 and f0 -= 1 */
|
|
138
|
+
f1 = _mm256_andnot_si256(t, f1);
|
|
139
|
+
f0 = _mm256_add_epi32(f0, t);
|
|
140
|
+
/* range: 0 <= f1 <= 43, -GAMMA2 <= f0 <= GAMMA2 */
|
|
141
|
+
|
|
142
|
+
_mm256_store_si256((__m256i *)&a1[8 * i], f1);
|
|
143
|
+
_mm256_store_si256((__m256i *)&a0[8 * i], f0);
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
#else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
|
|
147
|
+
&& (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
|
|
148
|
+
44) */
|
|
149
|
+
|
|
150
|
+
MLD_EMPTY_CU(avx2_poly_decompose_88)
|
|
151
|
+
|
|
152
|
+
#endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \
|
|
153
|
+
!MLD_CONFIG_MULTILEVEL_NO_SHARED && \
|
|
154
|
+
(MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
|
|
155
|
+
44)) */
|