pq_crypto 0.6.1 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/SECURITY.md +7 -0
  4. data/ext/pqcrypto/pqcrypto_version.h +1 -1
  5. data/ext/pqcrypto/vendor/.vendored +4 -4
  6. data/ext/pqcrypto/vendor/mldsa-native/README.md +23 -10
  7. data/ext/pqcrypto/vendor/mldsa-native/mldsa/README.md +23 -0
  8. data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native.c +114 -58
  9. data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native.h +498 -461
  10. data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native_asm.S +145 -85
  11. data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native_config.h +456 -422
  12. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/cbmc.h +47 -25
  13. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/common.h +26 -14
  14. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/ct.h +56 -81
  15. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/debug.h +17 -24
  16. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202.c +33 -40
  17. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202.h +67 -87
  18. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202x4.c +19 -14
  19. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202x4.h +13 -5
  20. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/keccakf1600.c +84 -10
  21. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/keccakf1600.h +10 -5
  22. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/auto.h +6 -0
  23. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/fips202_native_aarch64.h +22 -15
  24. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_scalar_aarch64_asm.S +376 -0
  25. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_v84a_aarch64_asm.S +204 -0
  26. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x2_v84a_aarch64_asm.S +259 -0
  27. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_aarch64_asm.S +1077 -0
  28. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_aarch64_asm.S +987 -0
  29. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccakf1600_round_constants.c +16 -10
  30. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x1_scalar.h +2 -1
  31. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x1_v84a.h +1 -1
  32. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x2_v84a.h +4 -2
  33. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x4_v8a_scalar.h +2 -2
  34. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x4_v8a_v84a_scalar.h +1 -1
  35. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/api.h +60 -0
  36. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/mve.h +48 -0
  37. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/fips202_native_armv81m.h +18 -1
  38. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S +658 -582
  39. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.c +5 -100
  40. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/keccakf1600_round_constants.c +26 -25
  41. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/state_extract_bytes_x4_mve.S +334 -0
  42. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/state_xor_bytes_x4_mve.S +355 -0
  43. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/auto.h +8 -3
  44. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/{xkcp.h → keccak_f1600_x4_avx2.h} +11 -8
  45. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/fips202_native_x86_64.h +44 -0
  46. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/keccak_f1600_x4_avx2_asm.S +454 -0
  47. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/keccakf1600_constants.c +52 -0
  48. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/meta.h +37 -28
  49. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/aarch64_zetas.c +213 -196
  50. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/arith_native_aarch64.h +248 -64
  51. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/intt_aarch64_asm.S +753 -0
  52. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4_aarch64_asm.S +129 -0
  53. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5_aarch64_asm.S +145 -0
  54. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7_aarch64_asm.S +177 -0
  55. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/ntt_aarch64_asm.S +653 -0
  56. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/pointwise_montgomery_aarch64_asm.S +84 -0
  57. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_caddq_aarch64_asm.S +53 -0
  58. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_chknorm_aarch64_asm.S +55 -0
  59. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_32_aarch64_asm.S +86 -0
  60. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_88_aarch64_asm.S +86 -0
  61. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_32_aarch64_asm.S +103 -0
  62. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_88_aarch64_asm.S +111 -0
  63. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_17_aarch64_asm.S +75 -0
  64. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_19_aarch64_asm.S +72 -0
  65. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_table.c +23 -11
  66. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_aarch64_asm.S +189 -0
  67. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta2_aarch64_asm.S +137 -0
  68. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta4_aarch64_asm.S +130 -0
  69. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta_table.c +520 -516
  70. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_table.c +34 -33
  71. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/api.h +202 -242
  72. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/meta.h +25 -17
  73. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/arith_native_x86_64.h +112 -28
  74. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/consts.c +1 -1
  75. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/consts.h +1 -1
  76. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/intt_avx2_asm.S +2311 -0
  77. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/ntt_avx2_asm.S +2383 -0
  78. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/nttunpack_avx2_asm.S +238 -0
  79. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l4_avx2_asm.S +139 -0
  80. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l5_avx2_asm.S +155 -0
  81. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l7_avx2_asm.S +187 -0
  82. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_avx2_asm.S +130 -0
  83. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_caddq_avx2_asm.S +190 -0
  84. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c +6 -4
  85. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c +6 -4
  86. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c +9 -8
  87. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c +10 -9
  88. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c +8 -5
  89. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c +8 -5
  90. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/rej_uniform_eta2_avx2.c +6 -4
  91. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/rej_uniform_eta4_avx2.c +6 -4
  92. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/rej_uniform_table.c +130 -129
  93. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/packing.c +109 -180
  94. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/packing.h +169 -150
  95. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly.c +56 -40
  96. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly.h +149 -164
  97. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly_kl.c +52 -57
  98. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly_kl.h +132 -167
  99. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec.c +57 -424
  100. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec.h +167 -474
  101. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec_lazy.c +308 -0
  102. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec_lazy.h +653 -0
  103. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/reduce.h +22 -29
  104. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/rounding.h +37 -43
  105. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/sign.c +511 -367
  106. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/sign.h +456 -417
  107. data/lib/pq_crypto/version.rb +1 -1
  108. data/script/vendor_libs.rb +3 -3
  109. metadata +41 -35
  110. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_scalar_asm.S +0 -376
  111. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_v84a_asm.S +0 -204
  112. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x2_v84a_asm.S +0 -259
  113. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_asm.S +0 -1077
  114. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm.S +0 -987
  115. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.c +0 -488
  116. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.h +0 -16
  117. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/intt.S +0 -753
  118. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4.S +0 -129
  119. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5.S +0 -145
  120. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7.S +0 -177
  121. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/ntt.S +0 -653
  122. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/pointwise_montgomery.S +0 -79
  123. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_caddq_asm.S +0 -53
  124. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_chknorm_asm.S +0 -55
  125. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_32_asm.S +0 -85
  126. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_88_asm.S +0 -85
  127. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_32_asm.S +0 -102
  128. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_88_asm.S +0 -110
  129. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_17_asm.S +0 -72
  130. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_19_asm.S +0 -69
  131. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_asm.S +0 -189
  132. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta2_asm.S +0 -135
  133. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta4_asm.S +0 -128
  134. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/intt.S +0 -2311
  135. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/ntt.S +0 -2383
  136. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/nttunpack.S +0 -239
  137. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise.S +0 -131
  138. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l4.S +0 -139
  139. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l5.S +0 -155
  140. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l7.S +0 -187
  141. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_caddq_avx2.c +0 -61
@@ -0,0 +1,187 @@
1
+ /*
2
+ * Copyright (c) The mldsa-native project authors
3
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
4
+ */
5
+
6
+ /* References
7
+ * ==========
8
+ *
9
+ * - [REF_AVX2]
10
+ * CRYSTALS-Dilithium optimized AVX2 implementation
11
+ * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
12
+ * https://github.com/pq-crystals/dilithium/tree/master/avx2
13
+ */
14
+
15
+ /*
16
+ * This file is derived from the public domain
17
+ * AVX2 Dilithium implementation @[REF_AVX2].
18
+ */
19
+
20
+ #include "../../../common.h"
21
+ #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
22
+ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
23
+ (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLDSA_L == 7)
24
+
25
+ /*
26
+ * WARNING: This file is auto-derived from the mldsa-native source file
27
+ * dev/x86_64/src/pointwise_acc_l7_avx2_asm.S using scripts/simpasm. Do not modify it directly.
28
+ */
29
+
30
+ .text
31
+ .balign 4
32
+ .global MLD_ASM_NAMESPACE(pointwise_acc_l7_avx2_asm)
33
+ MLD_ASM_FN_SYMBOL(pointwise_acc_l7_avx2_asm)
34
+ /* Sums seven lane-wise signed 32x32->64-bit products per dword lane, reduces each sum to 32 bits and stores it. Pointers: rdi = output, rsi and rdx = the two inputs, rcx = constants (presumably the four C arguments in System V order; confirm against the C prototype). All four are accessed with vmovdqa and must therefore be 32-byte aligned. */
35
+ .cfi_startproc
36
+ vmovdqa 0x20(%rcx), %ymm0 /* first reduction multiplier, from constants offset 0x20 (presumably q^-1 mod 2^32; confirm against the constants table) */
37
+ vmovdqa (%rcx), %ymm1 /* second reduction multiplier, from constants offset 0 (presumably the modulus q; confirm) */
38
+ xorl %eax, %eax /* eax = iteration counter, starting at 0 */
39
+ /* Each iteration consumes 0x40 bytes (16 dwords) at seven offsets of both inputs and produces 0x40 bytes of output. */
40
+ Lpointwise_acc_l7_avx2_looptop2:
41
+ vmovdqa (%rsi), %ymm6 /* term 0: both inputs at byte offset 0 */
42
+ vmovdqa 0x20(%rsi), %ymm8
43
+ vmovdqa (%rdx), %ymm10
44
+ vmovdqa 0x20(%rdx), %ymm12
45
+ vpsrlq $0x20, %ymm6, %ymm7 /* odd dwords of the rsi operand moved to the low half of each qword, where vpmuldq reads */
46
+ vpsrlq $0x20, %ymm8, %ymm9
47
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
48
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
49
+ vpmuldq %ymm10, %ymm6, %ymm6 /* signed 32x32->64 products of the even dwords */
50
+ vpmuldq %ymm11, %ymm7, %ymm7 /* signed 32x32->64 products of the odd dwords */
51
+ vpmuldq %ymm12, %ymm8, %ymm8
52
+ vpmuldq %ymm13, %ymm9, %ymm9
53
+ vmovdqa %ymm6, %ymm2 /* ymm2..ymm5 become the four 64-bit accumulators, seeded with term 0 */
54
+ vmovdqa %ymm7, %ymm3
55
+ vmovdqa %ymm8, %ymm4
56
+ vmovdqa %ymm9, %ymm5
57
+ vmovdqa 0x400(%rsi), %ymm6 /* term 1: both inputs at byte offset 0x400; same product steps as term 0 */
58
+ vmovdqa 0x420(%rsi), %ymm8
59
+ vmovdqa 0x400(%rdx), %ymm10
60
+ vmovdqa 0x420(%rdx), %ymm12
61
+ vpsrlq $0x20, %ymm6, %ymm7
62
+ vpsrlq $0x20, %ymm8, %ymm9
63
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
64
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
65
+ vpmuldq %ymm10, %ymm6, %ymm6
66
+ vpmuldq %ymm11, %ymm7, %ymm7
67
+ vpmuldq %ymm12, %ymm8, %ymm8
68
+ vpmuldq %ymm13, %ymm9, %ymm9
69
+ vpaddq %ymm2, %ymm6, %ymm2 /* add the term's 64-bit products into the accumulators */
70
+ vpaddq %ymm3, %ymm7, %ymm3
71
+ vpaddq %ymm4, %ymm8, %ymm4
72
+ vpaddq %ymm5, %ymm9, %ymm5
73
+ vmovdqa 0x800(%rsi), %ymm6 /* term 2: byte offset 0x800 */
74
+ vmovdqa 0x820(%rsi), %ymm8
75
+ vmovdqa 0x800(%rdx), %ymm10
76
+ vmovdqa 0x820(%rdx), %ymm12
77
+ vpsrlq $0x20, %ymm6, %ymm7
78
+ vpsrlq $0x20, %ymm8, %ymm9
79
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
80
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
81
+ vpmuldq %ymm10, %ymm6, %ymm6
82
+ vpmuldq %ymm11, %ymm7, %ymm7
83
+ vpmuldq %ymm12, %ymm8, %ymm8
84
+ vpmuldq %ymm13, %ymm9, %ymm9
85
+ vpaddq %ymm2, %ymm6, %ymm2
86
+ vpaddq %ymm3, %ymm7, %ymm3
87
+ vpaddq %ymm4, %ymm8, %ymm4
88
+ vpaddq %ymm5, %ymm9, %ymm5
89
+ vmovdqa 0xc00(%rsi), %ymm6 /* term 3: byte offset 0xc00 */
90
+ vmovdqa 0xc20(%rsi), %ymm8
91
+ vmovdqa 0xc00(%rdx), %ymm10
92
+ vmovdqa 0xc20(%rdx), %ymm12
93
+ vpsrlq $0x20, %ymm6, %ymm7
94
+ vpsrlq $0x20, %ymm8, %ymm9
95
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
96
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
97
+ vpmuldq %ymm10, %ymm6, %ymm6
98
+ vpmuldq %ymm11, %ymm7, %ymm7
99
+ vpmuldq %ymm12, %ymm8, %ymm8
100
+ vpmuldq %ymm13, %ymm9, %ymm9
101
+ vpaddq %ymm2, %ymm6, %ymm2
102
+ vpaddq %ymm3, %ymm7, %ymm3
103
+ vpaddq %ymm4, %ymm8, %ymm4
104
+ vpaddq %ymm5, %ymm9, %ymm5
105
+ vmovdqa 0x1000(%rsi), %ymm6 /* term 4: byte offset 0x1000 */
106
+ vmovdqa 0x1020(%rsi), %ymm8
107
+ vmovdqa 0x1000(%rdx), %ymm10
108
+ vmovdqa 0x1020(%rdx), %ymm12
109
+ vpsrlq $0x20, %ymm6, %ymm7
110
+ vpsrlq $0x20, %ymm8, %ymm9
111
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
112
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
113
+ vpmuldq %ymm10, %ymm6, %ymm6
114
+ vpmuldq %ymm11, %ymm7, %ymm7
115
+ vpmuldq %ymm12, %ymm8, %ymm8
116
+ vpmuldq %ymm13, %ymm9, %ymm9
117
+ vpaddq %ymm2, %ymm6, %ymm2
118
+ vpaddq %ymm3, %ymm7, %ymm3
119
+ vpaddq %ymm4, %ymm8, %ymm4
120
+ vpaddq %ymm5, %ymm9, %ymm5
121
+ vmovdqa 0x1400(%rsi), %ymm6 /* term 5: byte offset 0x1400 */
122
+ vmovdqa 0x1420(%rsi), %ymm8
123
+ vmovdqa 0x1400(%rdx), %ymm10
124
+ vmovdqa 0x1420(%rdx), %ymm12
125
+ vpsrlq $0x20, %ymm6, %ymm7
126
+ vpsrlq $0x20, %ymm8, %ymm9
127
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
128
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
129
+ vpmuldq %ymm10, %ymm6, %ymm6
130
+ vpmuldq %ymm11, %ymm7, %ymm7
131
+ vpmuldq %ymm12, %ymm8, %ymm8
132
+ vpmuldq %ymm13, %ymm9, %ymm9
133
+ vpaddq %ymm2, %ymm6, %ymm2
134
+ vpaddq %ymm3, %ymm7, %ymm3
135
+ vpaddq %ymm4, %ymm8, %ymm4
136
+ vpaddq %ymm5, %ymm9, %ymm5
137
+ vmovdqa 0x1800(%rsi), %ymm6 /* term 6 (last): byte offset 0x1800 */
138
+ vmovdqa 0x1820(%rsi), %ymm8
139
+ vmovdqa 0x1800(%rdx), %ymm10
140
+ vmovdqa 0x1820(%rdx), %ymm12
141
+ vpsrlq $0x20, %ymm6, %ymm7
142
+ vpsrlq $0x20, %ymm8, %ymm9
143
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
144
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
145
+ vpmuldq %ymm10, %ymm6, %ymm6
146
+ vpmuldq %ymm11, %ymm7, %ymm7
147
+ vpmuldq %ymm12, %ymm8, %ymm8
148
+ vpmuldq %ymm13, %ymm9, %ymm9
149
+ vpaddq %ymm2, %ymm6, %ymm2
150
+ vpaddq %ymm3, %ymm7, %ymm3
151
+ vpaddq %ymm4, %ymm8, %ymm4
152
+ vpaddq %ymm5, %ymm9, %ymm5
153
+ vpmuldq %ymm2, %ymm0, %ymm6 /* reduction step 1: t = low32(acc) * low32(ymm0), per qword */
154
+ vpmuldq %ymm3, %ymm0, %ymm7
155
+ vpmuldq %ymm4, %ymm0, %ymm8
156
+ vpmuldq %ymm5, %ymm0, %ymm9
157
+ vpmuldq %ymm6, %ymm1, %ymm6 /* reduction step 2: t = low32(t) * low32(ymm1) */
158
+ vpmuldq %ymm7, %ymm1, %ymm7
159
+ vpmuldq %ymm8, %ymm1, %ymm8
160
+ vpmuldq %ymm9, %ymm1, %ymm9
161
+ vpsubq %ymm6, %ymm2, %ymm2 /* reduction step 3: acc -= t; only the high dword of each qword is kept below (shape of a Montgomery reduction; confirm against the C reference) */
162
+ vpsubq %ymm7, %ymm3, %ymm3
163
+ vpsubq %ymm8, %ymm4, %ymm4
164
+ vpsubq %ymm9, %ymm5, %ymm5
165
+ vpsrlq $0x20, %ymm2, %ymm2 /* even-lane results: high dword moved down into the even dword slot */
166
+ vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
167
+ vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
168
+ vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7]
169
+ vmovdqa %ymm2, (%rdi) /* store the 16 reduced dwords of this iteration */
170
+ vmovdqa %ymm4, 0x20(%rdi)
171
+ addq $0x40, %rsi /* advance both inputs and the output by the 0x40 bytes just processed */
172
+ addq $0x40, %rdx
173
+ addq $0x40, %rdi
174
+ addl $0x1, %eax
175
+ cmpl $0x10, %eax /* 0x10 iterations x 0x40 bytes = 0x400 bytes per term, matching the 0x400-byte stride between terms */
176
+ jb Lpointwise_acc_l7_avx2_looptop2
177
+ retq
178
+ .cfi_endproc
179
+
180
+ MLD_ASM_FN_SIZE(pointwise_acc_l7_avx2_asm)
181
+
182
+ #endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
183
+ && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLDSA_L == 7) */
184
+
185
+ #if defined(__ELF__)
186
+ .section .note.GNU-stack,"",%progbits
187
+ #endif
@@ -0,0 +1,130 @@
1
+ /*
2
+ * Copyright (c) The mldsa-native project authors
3
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
4
+ */
5
+
6
+ /* References
7
+ * ==========
8
+ *
9
+ * - [REF_AVX2]
10
+ * CRYSTALS-Dilithium optimized AVX2 implementation
11
+ * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
12
+ * https://github.com/pq-crystals/dilithium/tree/master/avx2
13
+ */
14
+
15
+ /*
16
+ * This file is derived from the public domain
17
+ * AVX2 Dilithium implementation @[REF_AVX2].
18
+ */
19
+
20
+ #include "../../../common.h"
21
+ #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
22
+ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
23
+
24
+ /*
25
+ * WARNING: This file is auto-derived from the mldsa-native source file
26
+ * dev/x86_64/src/pointwise_avx2_asm.S using scripts/simpasm. Do not modify it directly.
27
+ */
28
+
29
+ .text
30
+ .balign 4
31
+ .global MLD_ASM_NAMESPACE(pointwise_avx2_asm)
32
+ MLD_ASM_FN_SYMBOL(pointwise_avx2_asm)
33
+ /* Lane-wise signed 32x32->64-bit multiply of the 0x400-byte buffer at rdi by the 0x400-byte buffer at rsi, each product reduced to 32 bits; results overwrite the rdi buffer. rdx = constants (presumably the three C arguments in System V order; confirm against the C prototype). All three pointers are accessed with vmovdqa and must therefore be 32-byte aligned. */
34
+ .cfi_startproc
35
+ vmovdqa 0x20(%rdx), %ymm0 /* first reduction multiplier, from constants offset 0x20 (presumably q^-1 mod 2^32; confirm against the constants table) */
36
+ vmovdqa (%rdx), %ymm1 /* second reduction multiplier, from constants offset 0 (presumably the modulus q; confirm) */
37
+ xorl %eax, %eax /* eax = iteration counter, starting at 0 */
38
+ /* Main loop: each iteration handles 0x60 bytes (24 dwords) of both buffers. */
39
+ Lpointwise_avx2_looptop1:
40
+ vmovdqa (%rdi), %ymm2 /* three 32-byte vectors of the in/out operand */
41
+ vmovdqa 0x20(%rdi), %ymm4
42
+ vmovdqa 0x40(%rdi), %ymm6
43
+ vmovdqa (%rsi), %ymm10 /* three 32-byte vectors of the second operand */
44
+ vmovdqa 0x20(%rsi), %ymm12
45
+ vmovdqa 0x40(%rsi), %ymm14
46
+ vpsrlq $0x20, %ymm2, %ymm3 /* odd dwords moved to the low half of each qword, where vpmuldq reads; vmovshdup below serves the same purpose */
47
+ vpsrlq $0x20, %ymm4, %ymm5
48
+ vmovshdup %ymm6, %ymm7 # ymm7 = ymm6[1,1,3,3,5,5,7,7]
49
+ vpsrlq $0x20, %ymm10, %ymm11
50
+ vpsrlq $0x20, %ymm12, %ymm13
51
+ vmovshdup %ymm14, %ymm15 # ymm15 = ymm14[1,1,3,3,5,5,7,7]
52
+ vpmuldq %ymm10, %ymm2, %ymm2 /* signed 32x32->64 products of the even dwords */
53
+ vpmuldq %ymm11, %ymm3, %ymm3 /* signed 32x32->64 products of the odd dwords */
54
+ vpmuldq %ymm12, %ymm4, %ymm4
55
+ vpmuldq %ymm13, %ymm5, %ymm5
56
+ vpmuldq %ymm14, %ymm6, %ymm6
57
+ vpmuldq %ymm15, %ymm7, %ymm7
58
+ vpmuldq %ymm2, %ymm0, %ymm10 /* reduction step 1: t = low32(product) * low32(ymm0), per qword */
59
+ vpmuldq %ymm3, %ymm0, %ymm11
60
+ vpmuldq %ymm4, %ymm0, %ymm12
61
+ vpmuldq %ymm5, %ymm0, %ymm13
62
+ vpmuldq %ymm6, %ymm0, %ymm14
63
+ vpmuldq %ymm7, %ymm0, %ymm15
64
+ vpmuldq %ymm10, %ymm1, %ymm10 /* reduction step 2: t = low32(t) * low32(ymm1) */
65
+ vpmuldq %ymm11, %ymm1, %ymm11
66
+ vpmuldq %ymm12, %ymm1, %ymm12
67
+ vpmuldq %ymm13, %ymm1, %ymm13
68
+ vpmuldq %ymm14, %ymm1, %ymm14
69
+ vpmuldq %ymm15, %ymm1, %ymm15
70
+ vpsubq %ymm10, %ymm2, %ymm2 /* reduction step 3: product -= t; only the high dword of each qword is kept below (shape of a Montgomery reduction; confirm against the C reference) */
71
+ vpsubq %ymm11, %ymm3, %ymm3
72
+ vpsubq %ymm12, %ymm4, %ymm4
73
+ vpsubq %ymm13, %ymm5, %ymm5
74
+ vpsubq %ymm14, %ymm6, %ymm6
75
+ vpsubq %ymm15, %ymm7, %ymm7
76
+ vpsrlq $0x20, %ymm2, %ymm2 /* even-lane results: high dword moved down into the even dword slot */
77
+ vpsrlq $0x20, %ymm4, %ymm4
78
+ vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7]
79
+ vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
80
+ vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7]
81
+ vpblendd $0xaa, %ymm7, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7]
82
+ vmovdqa %ymm2, (%rdi) /* results overwrite the first operand in place */
83
+ vmovdqa %ymm4, 0x20(%rdi)
84
+ vmovdqa %ymm6, 0x40(%rdi)
85
+ addq $0x60, %rdi /* advance both buffers by the 0x60 bytes just processed */
86
+ addq $0x60, %rsi
87
+ addl $0x1, %eax
88
+ cmpl $0xa, %eax /* 0xa iterations x 0x60 bytes = 0x3c0 bytes handled by the loop */
89
+ jb Lpointwise_avx2_looptop1
90
+ vmovdqa (%rdi), %ymm2 /* tail: the remaining 0x40 bytes (0x3c0 + 0x40 = 0x400), same multiply and reduction steps on two vectors */
91
+ vmovdqa 0x20(%rdi), %ymm4
92
+ vmovdqa (%rsi), %ymm10
93
+ vmovdqa 0x20(%rsi), %ymm12
94
+ vpsrlq $0x20, %ymm2, %ymm3
95
+ vpsrlq $0x20, %ymm4, %ymm5
96
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
97
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
98
+ vpmuldq %ymm10, %ymm2, %ymm2
99
+ vpmuldq %ymm11, %ymm3, %ymm3
100
+ vpmuldq %ymm12, %ymm4, %ymm4
101
+ vpmuldq %ymm13, %ymm5, %ymm5
102
+ vpmuldq %ymm2, %ymm0, %ymm10
103
+ vpmuldq %ymm3, %ymm0, %ymm11
104
+ vpmuldq %ymm4, %ymm0, %ymm12
105
+ vpmuldq %ymm5, %ymm0, %ymm13
106
+ vpmuldq %ymm10, %ymm1, %ymm10
107
+ vpmuldq %ymm11, %ymm1, %ymm11
108
+ vpmuldq %ymm12, %ymm1, %ymm12
109
+ vpmuldq %ymm13, %ymm1, %ymm13
110
+ vpsubq %ymm10, %ymm2, %ymm2
111
+ vpsubq %ymm11, %ymm3, %ymm3
112
+ vpsubq %ymm12, %ymm4, %ymm4
113
+ vpsubq %ymm13, %ymm5, %ymm5
114
+ vpsrlq $0x20, %ymm2, %ymm2
115
+ vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
116
+ vpblendd $0x55, %ymm2, %ymm3, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
117
+ vpblendd $0x55, %ymm4, %ymm5, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7]
118
+ vmovdqa %ymm2, (%rdi) /* store the final 16 dwords in place */
119
+ vmovdqa %ymm4, 0x20(%rdi)
120
+ retq
121
+ .cfi_endproc
122
+
123
+ MLD_ASM_FN_SIZE(pointwise_avx2_asm)
124
+
125
+ #endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
126
+ */
127
+
128
+ #if defined(__ELF__)
129
+ .section .note.GNU-stack,"",%progbits
130
+ #endif
@@ -0,0 +1,190 @@
1
+ /*
2
+ * Copyright (c) The mldsa-native project authors
3
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
4
+ */
5
+
6
+ /* References
7
+ * ==========
8
+ *
9
+ * - [REF_AVX2]
10
+ * CRYSTALS-Dilithium optimized AVX2 implementation
11
+ * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
12
+ * https://github.com/pq-crystals/dilithium/tree/master/avx2
13
+ */
14
+
15
+ /*
16
+ * This file is derived from the public domain
17
+ * AVX2 Dilithium implementation @[REF_AVX2].
18
+ */
19
+
20
+
21
+ /*************************************************
22
+ * Name: mld_poly_caddq_avx2_asm
23
+ *
24
+ * Description: For all coefficients of in/out polynomial add Q if
25
+ * coefficient is negative.
26
+ *
27
+ * Arguments: - int32_t *r: pointer to input/output polynomial
28
+ **************************************************/
29
+
30
+ #include "../../../common.h"
31
+
32
+ #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
33
+ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
34
+
35
+
36
+ /*
37
+ * WARNING: This file is auto-derived from the mldsa-native source file
38
+ * dev/x86_64/src/poly_caddq_avx2_asm.S using scripts/simpasm. Do not modify it directly.
39
+ */
40
+
41
+ .text
42
+ .balign 4
43
+ .global MLD_ASM_NAMESPACE(poly_caddq_avx2_asm)
44
+ MLD_ASM_FN_SYMBOL(poly_caddq_avx2_asm)
45
+ /* rdi = polynomial, updated in place by 32 unrolled steps of 0x20 bytes each (0x400 bytes in total). The write-back uses vmovdqa, so rdi must be 32-byte aligned. No branches depend on the coefficient values. */
46
+ .cfi_startproc
47
+ vpxor %xmm2, %xmm2, %xmm2 /* ymm2 = 0; the VEX-encoded xmm write also clears the upper 128 bits */
48
+ movl $0x7fe001, %eax # imm = 0x7FE001
49
+ vmovd %eax, %xmm1
50
+ vpbroadcastd %xmm1, %ymm1 /* ymm1 = 0x7fe001 in all eight dwords: the Q that is added to negative coefficients */
51
+ vpcmpgtd (%rdi), %ymm2, %ymm0 /* mask = all-ones in every lane where 0 > coefficient, zero elsewhere */
52
+ vpand %ymm1, %ymm0, %ymm0 /* Q in negative lanes, 0 in the others */
53
+ vpaddd (%rdi), %ymm0, %ymm0 /* coefficient + (Q or 0) */
54
+ vmovdqa %ymm0, (%rdi) /* write the eight corrected coefficients back in place */
55
+ vpcmpgtd 0x20(%rdi), %ymm2, %ymm3 /* the same four steps repeat for every following 0x20-byte group; only the offset and scratch register change */
56
+ vpand %ymm1, %ymm3, %ymm3
57
+ vpaddd 0x20(%rdi), %ymm3, %ymm3
58
+ vmovdqa %ymm3, 0x20(%rdi)
59
+ vpcmpgtd 0x40(%rdi), %ymm2, %ymm4
60
+ vpand %ymm1, %ymm4, %ymm4
61
+ vpaddd 0x40(%rdi), %ymm4, %ymm4
62
+ vmovdqa %ymm4, 0x40(%rdi)
63
+ vpcmpgtd 0x60(%rdi), %ymm2, %ymm5
64
+ vpand %ymm1, %ymm5, %ymm5
65
+ vpaddd 0x60(%rdi), %ymm5, %ymm5
66
+ vmovdqa %ymm5, 0x60(%rdi)
67
+ vpcmpgtd 0x80(%rdi), %ymm2, %ymm0
68
+ vpand %ymm1, %ymm0, %ymm0
69
+ vpaddd 0x80(%rdi), %ymm0, %ymm0
70
+ vmovdqa %ymm0, 0x80(%rdi)
71
+ vpcmpgtd 0xa0(%rdi), %ymm2, %ymm3
72
+ vpand %ymm1, %ymm3, %ymm3
73
+ vpaddd 0xa0(%rdi), %ymm3, %ymm3
74
+ vmovdqa %ymm3, 0xa0(%rdi)
75
+ vpcmpgtd 0xc0(%rdi), %ymm2, %ymm4
76
+ vpand %ymm1, %ymm4, %ymm4
77
+ vpaddd 0xc0(%rdi), %ymm4, %ymm4
78
+ vmovdqa %ymm4, 0xc0(%rdi)
79
+ vpcmpgtd 0xe0(%rdi), %ymm2, %ymm5
80
+ vpand %ymm1, %ymm5, %ymm5
81
+ vpaddd 0xe0(%rdi), %ymm5, %ymm5
82
+ vmovdqa %ymm5, 0xe0(%rdi)
83
+ vpcmpgtd 0x100(%rdi), %ymm2, %ymm0
84
+ vpand %ymm1, %ymm0, %ymm0
85
+ vpaddd 0x100(%rdi), %ymm0, %ymm0
86
+ vmovdqa %ymm0, 0x100(%rdi)
87
+ vpcmpgtd 0x120(%rdi), %ymm2, %ymm3
88
+ vpand %ymm1, %ymm3, %ymm3
89
+ vpaddd 0x120(%rdi), %ymm3, %ymm3
90
+ vmovdqa %ymm3, 0x120(%rdi)
91
+ vpcmpgtd 0x140(%rdi), %ymm2, %ymm4
92
+ vpand %ymm1, %ymm4, %ymm4
93
+ vpaddd 0x140(%rdi), %ymm4, %ymm4
94
+ vmovdqa %ymm4, 0x140(%rdi)
95
+ vpcmpgtd 0x160(%rdi), %ymm2, %ymm5
96
+ vpand %ymm1, %ymm5, %ymm5
97
+ vpaddd 0x160(%rdi), %ymm5, %ymm5
98
+ vmovdqa %ymm5, 0x160(%rdi)
99
+ vpcmpgtd 0x180(%rdi), %ymm2, %ymm0
100
+ vpand %ymm1, %ymm0, %ymm0
101
+ vpaddd 0x180(%rdi), %ymm0, %ymm0
102
+ vmovdqa %ymm0, 0x180(%rdi)
103
+ vpcmpgtd 0x1a0(%rdi), %ymm2, %ymm3
104
+ vpand %ymm1, %ymm3, %ymm3
105
+ vpaddd 0x1a0(%rdi), %ymm3, %ymm3
106
+ vmovdqa %ymm3, 0x1a0(%rdi)
107
+ vpcmpgtd 0x1c0(%rdi), %ymm2, %ymm4
108
+ vpand %ymm1, %ymm4, %ymm4
109
+ vpaddd 0x1c0(%rdi), %ymm4, %ymm4
110
+ vmovdqa %ymm4, 0x1c0(%rdi)
111
+ vpcmpgtd 0x1e0(%rdi), %ymm2, %ymm5
112
+ vpand %ymm1, %ymm5, %ymm5
113
+ vpaddd 0x1e0(%rdi), %ymm5, %ymm5
114
+ vmovdqa %ymm5, 0x1e0(%rdi)
115
+ vpcmpgtd 0x200(%rdi), %ymm2, %ymm0
116
+ vpand %ymm1, %ymm0, %ymm0
117
+ vpaddd 0x200(%rdi), %ymm0, %ymm0
118
+ vmovdqa %ymm0, 0x200(%rdi)
119
+ vpcmpgtd 0x220(%rdi), %ymm2, %ymm3
120
+ vpand %ymm1, %ymm3, %ymm3
121
+ vpaddd 0x220(%rdi), %ymm3, %ymm3
122
+ vmovdqa %ymm3, 0x220(%rdi)
123
+ vpcmpgtd 0x240(%rdi), %ymm2, %ymm4
124
+ vpand %ymm1, %ymm4, %ymm4
125
+ vpaddd 0x240(%rdi), %ymm4, %ymm4
126
+ vmovdqa %ymm4, 0x240(%rdi)
127
+ vpcmpgtd 0x260(%rdi), %ymm2, %ymm5
128
+ vpand %ymm1, %ymm5, %ymm5
129
+ vpaddd 0x260(%rdi), %ymm5, %ymm5
130
+ vmovdqa %ymm5, 0x260(%rdi)
131
+ vpcmpgtd 0x280(%rdi), %ymm2, %ymm0
132
+ vpand %ymm1, %ymm0, %ymm0
133
+ vpaddd 0x280(%rdi), %ymm0, %ymm0
134
+ vmovdqa %ymm0, 0x280(%rdi)
135
+ vpcmpgtd 0x2a0(%rdi), %ymm2, %ymm3
136
+ vpand %ymm1, %ymm3, %ymm3
137
+ vpaddd 0x2a0(%rdi), %ymm3, %ymm3
138
+ vmovdqa %ymm3, 0x2a0(%rdi)
139
+ vpcmpgtd 0x2c0(%rdi), %ymm2, %ymm4
140
+ vpand %ymm1, %ymm4, %ymm4
141
+ vpaddd 0x2c0(%rdi), %ymm4, %ymm4
142
+ vmovdqa %ymm4, 0x2c0(%rdi)
143
+ vpcmpgtd 0x2e0(%rdi), %ymm2, %ymm5
144
+ vpand %ymm1, %ymm5, %ymm5
145
+ vpaddd 0x2e0(%rdi), %ymm5, %ymm5
146
+ vmovdqa %ymm5, 0x2e0(%rdi)
147
+ vpcmpgtd 0x300(%rdi), %ymm2, %ymm0
148
+ vpand %ymm1, %ymm0, %ymm0
149
+ vpaddd 0x300(%rdi), %ymm0, %ymm0
150
+ vmovdqa %ymm0, 0x300(%rdi)
151
+ vpcmpgtd 0x320(%rdi), %ymm2, %ymm3
152
+ vpand %ymm1, %ymm3, %ymm3
153
+ vpaddd 0x320(%rdi), %ymm3, %ymm3
154
+ vmovdqa %ymm3, 0x320(%rdi)
155
+ vpcmpgtd 0x340(%rdi), %ymm2, %ymm4
156
+ vpand %ymm1, %ymm4, %ymm4
157
+ vpaddd 0x340(%rdi), %ymm4, %ymm4
158
+ vmovdqa %ymm4, 0x340(%rdi)
159
+ vpcmpgtd 0x360(%rdi), %ymm2, %ymm5
160
+ vpand %ymm1, %ymm5, %ymm5
161
+ vpaddd 0x360(%rdi), %ymm5, %ymm5
162
+ vmovdqa %ymm5, 0x360(%rdi)
163
+ vpcmpgtd 0x380(%rdi), %ymm2, %ymm0
164
+ vpand %ymm1, %ymm0, %ymm0
165
+ vpaddd 0x380(%rdi), %ymm0, %ymm0
166
+ vmovdqa %ymm0, 0x380(%rdi)
167
+ vpcmpgtd 0x3a0(%rdi), %ymm2, %ymm3
168
+ vpand %ymm1, %ymm3, %ymm3
169
+ vpaddd 0x3a0(%rdi), %ymm3, %ymm3
170
+ vmovdqa %ymm3, 0x3a0(%rdi)
171
+ vpcmpgtd 0x3c0(%rdi), %ymm2, %ymm4
172
+ vpand %ymm1, %ymm4, %ymm4
173
+ vpaddd 0x3c0(%rdi), %ymm4, %ymm4
174
+ vmovdqa %ymm4, 0x3c0(%rdi)
175
+ vpcmpgtd 0x3e0(%rdi), %ymm2, %ymm5
176
+ vpand %ymm1, %ymm5, %ymm5
177
+ vpaddd 0x3e0(%rdi), %ymm5, %ymm5
178
+ vmovdqa %ymm5, 0x3e0(%rdi) /* last group: 0x3e0 + 0x20 = 0x400 bytes processed */
179
+ retq
180
+ .cfi_endproc
181
+
182
+ MLD_ASM_FN_SIZE(poly_caddq_avx2_asm)
183
+
184
+
185
+ #endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
186
+ */
187
+
188
+ #if defined(__ELF__)
189
+ .section .note.GNU-stack,"",%progbits
190
+ #endif
@@ -24,6 +24,7 @@
24
24
  #include "../../../common.h"
25
25
 
26
26
  #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
27
+ !defined(MLD_CONFIG_NO_SIGN_API) && \
27
28
  !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
28
29
  (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
29
30
  (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87))
@@ -143,13 +144,14 @@ void mld_poly_decompose_32_avx2(int32_t *a1, int32_t *a0)
143
144
  }
144
145
  }
145
146
 
146
- #else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
147
- && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
148
- 65 || MLD_CONFIG_PARAMETER_SET == 87) */
147
+ #else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_SIGN_API && \
148
+ !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
149
+ (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
150
+ || MLD_CONFIG_PARAMETER_SET == 87) */
149
151
 
150
152
  MLD_EMPTY_CU(avx2_poly_decompose_32)
151
153
 
152
- #endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \
154
+ #endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_SIGN_API && \
153
155
  !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
154
156
  (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
155
157
  || MLD_CONFIG_PARAMETER_SET == 87)) */
@@ -24,6 +24,7 @@
24
24
  #include "../../../common.h"
25
25
 
26
26
  #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
27
+ !defined(MLD_CONFIG_NO_SIGN_API) && \
27
28
  !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
28
29
  (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
29
30
  MLD_CONFIG_PARAMETER_SET == 44)
@@ -143,13 +144,14 @@ void mld_poly_decompose_88_avx2(int32_t *a1, int32_t *a0)
143
144
  _mm256_store_si256((__m256i *)&a0[8 * i], f0);
144
145
  }
145
146
  }
146
- #else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
147
- && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
148
- 44) */
147
+ #else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_SIGN_API && \
148
+ !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
149
+ (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44) \
150
+ */
149
151
 
150
152
  MLD_EMPTY_CU(avx2_poly_decompose_88)
151
153
 
152
- #endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \
154
+ #endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_SIGN_API && \
153
155
  !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
154
156
  (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
155
157
  44)) */
@@ -20,6 +20,7 @@
20
20
  #include "../../../common.h"
21
21
 
22
22
  #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
23
+ !defined(MLD_CONFIG_NO_VERIFY_API) && \
23
24
  !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
24
25
  (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
25
26
  (MLD_CONFIG_PARAMETER_SET == 65 || MLD_CONFIG_PARAMETER_SET == 87))
@@ -33,12 +34,11 @@
33
34
  _mm256_castsi256_ps(b), \
34
35
  _mm256_castsi256_ps(mask)))
35
36
 
36
- void mld_poly_use_hint_32_avx2(int32_t *b, const int32_t *a,
37
- const int32_t *hint)
37
+ void mld_poly_use_hint_32_avx2(int32_t *a, const int32_t *hint)
38
38
  {
39
39
  unsigned int i;
40
40
  __m256i f, f0, f1, h, t;
41
- const __m256i q_bound = _mm256_set1_epi32(87 * ((MLDSA_Q - 1) / 32));
41
+ const __m256i q_bound = _mm256_set1_epi32(31 * ((MLDSA_Q - 1) / 32));
42
42
  /* check-magic: 1025 == floor(2**22 / 4092) */
43
43
  const __m256i v = _mm256_set1_epi32(1025);
44
44
  const __m256i alpha = _mm256_set1_epi32(2 * ((MLDSA_Q - 1) / 32));
@@ -82,17 +82,18 @@ void mld_poly_use_hint_32_avx2(int32_t *b, const int32_t *a,
82
82
  f1 = _mm256_add_epi32(f1, h);
83
83
  f1 = _mm256_and_si256(f1, mask);
84
84
 
85
- _mm256_store_si256((__m256i *)&b[8 * i], f1);
85
+ _mm256_store_si256((__m256i *)&a[8 * i], f1);
86
86
  }
87
87
  }
88
88
 
89
- #else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
90
- && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
91
- 65 || MLD_CONFIG_PARAMETER_SET == 87) */
89
+ #else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_VERIFY_API && \
90
+ !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
91
+ (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
92
+ || MLD_CONFIG_PARAMETER_SET == 87) */
92
93
 
93
94
  MLD_EMPTY_CU(avx2_poly_use_hint_32)
94
95
 
95
- #endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \
96
+ #endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_VERIFY_API && \
96
97
  !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
97
98
  (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 65 \
98
99
  || MLD_CONFIG_PARAMETER_SET == 87)) */
@@ -20,6 +20,7 @@
20
20
  #include "../../../common.h"
21
21
 
22
22
  #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
23
+ !defined(MLD_CONFIG_NO_VERIFY_API) && \
23
24
  !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
24
25
  (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
25
26
  MLD_CONFIG_PARAMETER_SET == 44)
@@ -33,8 +34,7 @@
33
34
  _mm256_castsi256_ps(b), \
34
35
  _mm256_castsi256_ps(mask)))
35
36
 
36
- void mld_poly_use_hint_88_avx2(int32_t *b, const int32_t *a,
37
- const int32_t *hint)
37
+ void mld_poly_use_hint_88_avx2(int32_t *a, const int32_t *hint)
38
38
  {
39
39
  unsigned int i;
40
40
  __m256i f, f0, f1, h, t;
@@ -84,19 +84,20 @@ void mld_poly_use_hint_88_avx2(int32_t *b, const int32_t *a,
84
84
  f = _mm256_cmpgt_epi32(f1, max);
85
85
  f1 = MLD_MM256_BLENDV_EPI32(f1, zero, f);
86
86
 
87
- _mm256_store_si256((__m256i *)&b[8 * i], f1);
87
+ _mm256_store_si256((__m256i *)&a[8 * i], f1);
88
88
  }
89
89
  }
90
90
 
91
- #else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
92
- && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
93
- 44) */
91
+ #else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_VERIFY_API && \
92
+ !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
93
+ (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44) \
94
+ */
94
95
 
95
96
  MLD_EMPTY_CU(avx2_poly_use_hint_88)
96
97
 
97
- #endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \
98
- !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
99
- (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
98
+ #endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_NO_VERIFY_API && \
99
+ !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
100
+ (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
100
101
  44)) */
101
102
 
102
103
  /* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
@@ -20,6 +20,8 @@
20
20
  #include "../../../common.h"
21
21
 
22
22
  #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
23
+ (!defined(MLD_CONFIG_NO_SIGN_API) || \
24
+ !defined(MLD_CONFIG_NO_VERIFY_API)) && \
23
25
  !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
24
26
  (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || \
25
27
  MLD_CONFIG_PARAMETER_SET == 44)
@@ -79,13 +81,14 @@ void mld_polyz_unpack_17_avx2(int32_t *r, const uint8_t *a)
79
81
  _mm256_store_si256((__m256i *)&r[8 * i], f);
80
82
  }
81
83
  }
82
- #else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
83
- && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
84
- 44) */
84
+ #else /* MLD_ARITH_BACKEND_X86_64_DEFAULT && (!MLD_CONFIG_NO_SIGN_API || \
85
+ !MLD_CONFIG_NO_VERIFY_API) && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
86
+ (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == 44) \
87
+ */
85
88
 
86
89
  MLD_EMPTY_CU(avx2_polyz_unpack_17)
87
90
 
88
- #endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && \
89
- !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
91
+ #endif /* !(MLD_ARITH_BACKEND_X86_64_DEFAULT && (!MLD_CONFIG_NO_SIGN_API || \
92
+ !MLD_CONFIG_NO_VERIFY_API) && !MLD_CONFIG_MULTILEVEL_NO_SHARED && \
90
93
  (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLD_CONFIG_PARAMETER_SET == \
91
94
  44)) */