pq_crypto 0.6.1 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/SECURITY.md +7 -0
  4. data/ext/pqcrypto/pqcrypto_version.h +1 -1
  5. data/ext/pqcrypto/vendor/.vendored +4 -4
  6. data/ext/pqcrypto/vendor/mldsa-native/README.md +23 -10
  7. data/ext/pqcrypto/vendor/mldsa-native/mldsa/README.md +23 -0
  8. data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native.c +114 -58
  9. data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native.h +498 -461
  10. data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native_asm.S +145 -85
  11. data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native_config.h +456 -422
  12. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/cbmc.h +47 -25
  13. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/common.h +26 -14
  14. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/ct.h +56 -81
  15. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/debug.h +17 -24
  16. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202.c +33 -40
  17. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202.h +67 -87
  18. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202x4.c +19 -14
  19. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202x4.h +13 -5
  20. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/keccakf1600.c +84 -10
  21. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/keccakf1600.h +10 -5
  22. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/auto.h +6 -0
  23. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/fips202_native_aarch64.h +22 -15
  24. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_scalar_aarch64_asm.S +376 -0
  25. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_v84a_aarch64_asm.S +204 -0
  26. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x2_v84a_aarch64_asm.S +259 -0
  27. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_aarch64_asm.S +1077 -0
  28. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_aarch64_asm.S +987 -0
  29. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccakf1600_round_constants.c +16 -10
  30. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x1_scalar.h +2 -1
  31. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x1_v84a.h +1 -1
  32. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x2_v84a.h +4 -2
  33. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x4_v8a_scalar.h +2 -2
  34. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x4_v8a_v84a_scalar.h +1 -1
  35. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/api.h +60 -0
  36. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/mve.h +48 -0
  37. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/fips202_native_armv81m.h +18 -1
  38. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S +658 -582
  39. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.c +5 -100
  40. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/keccakf1600_round_constants.c +26 -25
  41. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/state_extract_bytes_x4_mve.S +334 -0
  42. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/state_xor_bytes_x4_mve.S +355 -0
  43. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/auto.h +8 -3
  44. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/{xkcp.h → keccak_f1600_x4_avx2.h} +11 -8
  45. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/fips202_native_x86_64.h +44 -0
  46. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/keccak_f1600_x4_avx2_asm.S +454 -0
  47. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/keccakf1600_constants.c +52 -0
  48. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/meta.h +37 -28
  49. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/aarch64_zetas.c +213 -196
  50. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/arith_native_aarch64.h +248 -64
  51. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/intt_aarch64_asm.S +753 -0
  52. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4_aarch64_asm.S +129 -0
  53. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5_aarch64_asm.S +145 -0
  54. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7_aarch64_asm.S +177 -0
  55. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/ntt_aarch64_asm.S +653 -0
  56. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/pointwise_montgomery_aarch64_asm.S +84 -0
  57. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_caddq_aarch64_asm.S +53 -0
  58. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_chknorm_aarch64_asm.S +55 -0
  59. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_32_aarch64_asm.S +86 -0
  60. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_88_aarch64_asm.S +86 -0
  61. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_32_aarch64_asm.S +103 -0
  62. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_88_aarch64_asm.S +111 -0
  63. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_17_aarch64_asm.S +75 -0
  64. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_19_aarch64_asm.S +72 -0
  65. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_table.c +23 -11
  66. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_aarch64_asm.S +189 -0
  67. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta2_aarch64_asm.S +137 -0
  68. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta4_aarch64_asm.S +130 -0
  69. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta_table.c +520 -516
  70. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_table.c +34 -33
  71. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/api.h +202 -242
  72. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/meta.h +25 -17
  73. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/arith_native_x86_64.h +112 -28
  74. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/consts.c +1 -1
  75. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/consts.h +1 -1
  76. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/intt_avx2_asm.S +2311 -0
  77. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/ntt_avx2_asm.S +2383 -0
  78. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/nttunpack_avx2_asm.S +238 -0
  79. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l4_avx2_asm.S +139 -0
  80. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l5_avx2_asm.S +155 -0
  81. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l7_avx2_asm.S +187 -0
  82. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_avx2_asm.S +130 -0
  83. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_caddq_avx2_asm.S +190 -0
  84. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c +6 -4
  85. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c +6 -4
  86. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c +9 -8
  87. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c +10 -9
  88. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c +8 -5
  89. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c +8 -5
  90. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/rej_uniform_eta2_avx2.c +6 -4
  91. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/rej_uniform_eta4_avx2.c +6 -4
  92. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/rej_uniform_table.c +130 -129
  93. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/packing.c +109 -180
  94. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/packing.h +169 -150
  95. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly.c +56 -40
  96. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly.h +149 -164
  97. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly_kl.c +52 -57
  98. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly_kl.h +132 -167
  99. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec.c +57 -424
  100. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec.h +167 -474
  101. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec_lazy.c +308 -0
  102. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec_lazy.h +653 -0
  103. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/reduce.h +22 -29
  104. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/rounding.h +37 -43
  105. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/sign.c +511 -367
  106. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/sign.h +456 -417
  107. data/lib/pq_crypto/version.rb +1 -1
  108. data/script/vendor_libs.rb +3 -3
  109. metadata +41 -35
  110. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_scalar_asm.S +0 -376
  111. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_v84a_asm.S +0 -204
  112. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x2_v84a_asm.S +0 -259
  113. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_asm.S +0 -1077
  114. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm.S +0 -987
  115. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.c +0 -488
  116. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.h +0 -16
  117. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/intt.S +0 -753
  118. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4.S +0 -129
  119. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5.S +0 -145
  120. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7.S +0 -177
  121. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/ntt.S +0 -653
  122. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/pointwise_montgomery.S +0 -79
  123. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_caddq_asm.S +0 -53
  124. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_chknorm_asm.S +0 -55
  125. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_32_asm.S +0 -85
  126. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_88_asm.S +0 -85
  127. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_32_asm.S +0 -102
  128. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_88_asm.S +0 -110
  129. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_17_asm.S +0 -72
  130. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_19_asm.S +0 -69
  131. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_asm.S +0 -189
  132. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta2_asm.S +0 -135
  133. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta4_asm.S +0 -128
  134. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/intt.S +0 -2311
  135. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/ntt.S +0 -2383
  136. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/nttunpack.S +0 -239
  137. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise.S +0 -131
  138. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l4.S +0 -139
  139. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l5.S +0 -155
  140. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l7.S +0 -187
  141. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_caddq_avx2.c +0 -61
@@ -0,0 +1,238 @@
1
+ /*
2
+ * Copyright (c) The mlkem-native project authors
3
+ * Copyright (c) The mldsa-native project authors
4
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
5
+ */
6
+
7
+ /* References
8
+ * ==========
9
+ *
10
+ * - [REF_AVX2]
11
+ * CRYSTALS-Dilithium optimized AVX2 implementation
12
+ * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
13
+ * https://github.com/pq-crystals/dilithium/tree/master/avx2
14
+ */
15
+
16
+ /*
17
+ * This file is derived from the public domain
18
+ * AVX2 Dilithium implementation @[REF_AVX2].
19
+ */
20
+
21
+ #include "../../../common.h"
22
+ #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
23
+ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
24
+
25
+ /*
26
+ * WARNING: This file is auto-derived from the mldsa-native source file
27
+ * dev/x86_64/src/nttunpack_avx2_asm.S using scripts/simpasm. Do not modify it directly.
28
+ */
29
+
30
+ .text
31
+ .balign 4
32
+ .global MLD_ASM_NAMESPACE(nttunpack_avx2_asm) /* In-place shuffle of the 0x400 bytes at %rdi: each of the four 0x100-byte blocks is treated as an 8x8 matrix of 32-bit words (one row per 32-byte vector) and transposed. */
33
+ MLD_ASM_FN_SYMBOL(nttunpack_avx2_asm)
34
+
35
+ .cfi_startproc
36
+ vmovdqa (%rdi), %ymm4 # block 0: load rows 0..7 into ymm4..ymm11 (vmovdqa requires a 32-byte aligned address)
37
+ vmovdqa 0x20(%rdi), %ymm5
38
+ vmovdqa 0x40(%rdi), %ymm6
39
+ vmovdqa 0x60(%rdi), %ymm7
40
+ vmovdqa 0x80(%rdi), %ymm8
41
+ vmovdqa 0xa0(%rdi), %ymm9
42
+ vmovdqa 0xc0(%rdi), %ymm10
43
+ vmovdqa 0xe0(%rdi), %ymm11
44
+ vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
45
+ vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
46
+ vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
47
+ vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
48
+ vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
49
+ vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
50
+ vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
51
+ vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
52
+ vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
53
+ vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
54
+ vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
55
+ vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
56
+ vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
57
+ vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
58
+ vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
59
+ vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
60
+ vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
61
+ vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
62
+ vpsrlq $0x20, %ymm7, %ymm7 # bring the odd dwords of ymm7 down into the even dword slots for the next blend
63
+ vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
64
+ vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
65
+ vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
66
+ vpsrlq $0x20, %ymm5, %ymm5 # same odd-to-even dword move, now for ymm5
67
+ vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
68
+ vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
69
+ vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
70
+ vpsrlq $0x20, %ymm3, %ymm3 # same odd-to-even dword move, now for ymm3
71
+ vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
72
+ vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
73
+ vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
74
+ vpsrlq $0x20, %ymm10, %ymm10 # same odd-to-even dword move, now for ymm10
75
+ vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
76
+ vmovdqa %ymm9, (%rdi) # block 0: write back; ymm9,8,7,6,5,4,3,11 hold columns 0..7 of the loaded rows
77
+ vmovdqa %ymm8, 0x20(%rdi)
78
+ vmovdqa %ymm7, 0x40(%rdi)
79
+ vmovdqa %ymm6, 0x60(%rdi)
80
+ vmovdqa %ymm5, 0x80(%rdi)
81
+ vmovdqa %ymm4, 0xa0(%rdi)
82
+ vmovdqa %ymm3, 0xc0(%rdi)
83
+ vmovdqa %ymm11, 0xe0(%rdi)
84
+ vmovdqa 0x100(%rdi), %ymm4 # block 1 (bytes 0x100..0x1ff): same load / shuffle / store sequence as block 0
85
+ vmovdqa 0x120(%rdi), %ymm5
86
+ vmovdqa 0x140(%rdi), %ymm6
87
+ vmovdqa 0x160(%rdi), %ymm7
88
+ vmovdqa 0x180(%rdi), %ymm8
89
+ vmovdqa 0x1a0(%rdi), %ymm9
90
+ vmovdqa 0x1c0(%rdi), %ymm10
91
+ vmovdqa 0x1e0(%rdi), %ymm11
92
+ vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
93
+ vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
94
+ vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
95
+ vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
96
+ vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
97
+ vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
98
+ vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
99
+ vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
100
+ vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
101
+ vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
102
+ vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
103
+ vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
104
+ vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
105
+ vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
106
+ vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
107
+ vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
108
+ vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
109
+ vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
110
+ vpsrlq $0x20, %ymm7, %ymm7
111
+ vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
112
+ vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
113
+ vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
114
+ vpsrlq $0x20, %ymm5, %ymm5
115
+ vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
116
+ vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
117
+ vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
118
+ vpsrlq $0x20, %ymm3, %ymm3
119
+ vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
120
+ vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
121
+ vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
122
+ vpsrlq $0x20, %ymm10, %ymm10
123
+ vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
124
+ vmovdqa %ymm9, 0x100(%rdi) # block 1: write back the transposed rows
125
+ vmovdqa %ymm8, 0x120(%rdi)
126
+ vmovdqa %ymm7, 0x140(%rdi)
127
+ vmovdqa %ymm6, 0x160(%rdi)
128
+ vmovdqa %ymm5, 0x180(%rdi)
129
+ vmovdqa %ymm4, 0x1a0(%rdi)
130
+ vmovdqa %ymm3, 0x1c0(%rdi)
131
+ vmovdqa %ymm11, 0x1e0(%rdi)
132
+ vmovdqa 0x200(%rdi), %ymm4 # block 2 (bytes 0x200..0x2ff): same load / shuffle / store sequence as block 0
133
+ vmovdqa 0x220(%rdi), %ymm5
134
+ vmovdqa 0x240(%rdi), %ymm6
135
+ vmovdqa 0x260(%rdi), %ymm7
136
+ vmovdqa 0x280(%rdi), %ymm8
137
+ vmovdqa 0x2a0(%rdi), %ymm9
138
+ vmovdqa 0x2c0(%rdi), %ymm10
139
+ vmovdqa 0x2e0(%rdi), %ymm11
140
+ vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
141
+ vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
142
+ vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
143
+ vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
144
+ vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
145
+ vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
146
+ vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
147
+ vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
148
+ vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
149
+ vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
150
+ vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
151
+ vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
152
+ vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
153
+ vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
154
+ vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
155
+ vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
156
+ vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
157
+ vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
158
+ vpsrlq $0x20, %ymm7, %ymm7
159
+ vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
160
+ vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
161
+ vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
162
+ vpsrlq $0x20, %ymm5, %ymm5
163
+ vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
164
+ vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
165
+ vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
166
+ vpsrlq $0x20, %ymm3, %ymm3
167
+ vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
168
+ vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
169
+ vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
170
+ vpsrlq $0x20, %ymm10, %ymm10
171
+ vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
172
+ vmovdqa %ymm9, 0x200(%rdi) # block 2: write back the transposed rows
173
+ vmovdqa %ymm8, 0x220(%rdi)
174
+ vmovdqa %ymm7, 0x240(%rdi)
175
+ vmovdqa %ymm6, 0x260(%rdi)
176
+ vmovdqa %ymm5, 0x280(%rdi)
177
+ vmovdqa %ymm4, 0x2a0(%rdi)
178
+ vmovdqa %ymm3, 0x2c0(%rdi)
179
+ vmovdqa %ymm11, 0x2e0(%rdi)
180
+ vmovdqa 0x300(%rdi), %ymm4 # block 3 (bytes 0x300..0x3ff): same load / shuffle / store sequence as block 0
181
+ vmovdqa 0x320(%rdi), %ymm5
182
+ vmovdqa 0x340(%rdi), %ymm6
183
+ vmovdqa 0x360(%rdi), %ymm7
184
+ vmovdqa 0x380(%rdi), %ymm8
185
+ vmovdqa 0x3a0(%rdi), %ymm9
186
+ vmovdqa 0x3c0(%rdi), %ymm10
187
+ vmovdqa 0x3e0(%rdi), %ymm11
188
+ vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
189
+ vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
190
+ vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
191
+ vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
192
+ vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
193
+ vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
194
+ vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
195
+ vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
196
+ vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
197
+ vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
198
+ vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
199
+ vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
200
+ vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
201
+ vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
202
+ vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
203
+ vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
204
+ vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
205
+ vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
206
+ vpsrlq $0x20, %ymm7, %ymm7
207
+ vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
208
+ vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
209
+ vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
210
+ vpsrlq $0x20, %ymm5, %ymm5
211
+ vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
212
+ vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
213
+ vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
214
+ vpsrlq $0x20, %ymm3, %ymm3
215
+ vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
216
+ vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
217
+ vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
218
+ vpsrlq $0x20, %ymm10, %ymm10
219
+ vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
220
+ vmovdqa %ymm9, 0x300(%rdi) # block 3: write back the transposed rows
221
+ vmovdqa %ymm8, 0x320(%rdi)
222
+ vmovdqa %ymm7, 0x340(%rdi)
223
+ vmovdqa %ymm6, 0x360(%rdi)
224
+ vmovdqa %ymm5, 0x380(%rdi)
225
+ vmovdqa %ymm4, 0x3a0(%rdi)
226
+ vmovdqa %ymm3, 0x3c0(%rdi)
227
+ vmovdqa %ymm11, 0x3e0(%rdi)
228
+ retq
229
+ .cfi_endproc
230
+
231
+ MLD_ASM_FN_SIZE(nttunpack_avx2_asm)
232
+
233
+ #endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
234
+ */
235
+
236
+ #if defined(__ELF__)
237
+ .section .note.GNU-stack,"",%progbits
238
+ #endif
@@ -0,0 +1,139 @@
1
+ /*
2
+ * Copyright (c) The mldsa-native project authors
3
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
4
+ */
5
+
6
+ /* References
7
+ * ==========
8
+ *
9
+ * - [REF_AVX2]
10
+ * CRYSTALS-Dilithium optimized AVX2 implementation
11
+ * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
12
+ * https://github.com/pq-crystals/dilithium/tree/master/avx2
13
+ */
14
+
15
+ /*
16
+ * This file is derived from the public domain
17
+ * AVX2 Dilithium implementation @[REF_AVX2].
18
+ */
19
+
20
+ #include "../../../common.h"
21
+ #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
22
+ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
23
+ (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLDSA_L == 4)
24
+
25
+ /*
26
+ * WARNING: This file is auto-derived from the mldsa-native source file
27
+ * dev/x86_64/src/pointwise_acc_l4_avx2_asm.S using scripts/simpasm. Do not modify it directly.
28
+ */
29
+
30
+ .text
31
+ .balign 4
32
+ .global MLD_ASM_NAMESPACE(pointwise_acc_l4_avx2_asm) /* Each loop pass multiplies 16 dword pairs read from %rsi and %rdx at four offsets 0x400 bytes apart, sums the 64-bit products, reduces them with the two constants loaded from %rcx, and stores 16 dwords to %rdi; 16 passes in total. */
33
+ MLD_ASM_FN_SYMBOL(pointwise_acc_l4_avx2_asm)
34
+
35
+ .cfi_startproc
36
+ vmovdqa 0x20(%rcx), %ymm0 # ymm0 = 32 bytes at rcx+0x20: first multiplier of the reduction (presumably q^-1 mod 2^32 - TODO confirm against the constants table)
37
+ vmovdqa (%rcx), %ymm1 # ymm1 = 32 bytes at rcx: second multiplier of the reduction (presumably the modulus q - TODO confirm)
38
+ xorl %eax, %eax # eax = pass counter, starts at 0
39
+
40
+ Lpointwise_acc_l4_avx2_looptop2:
41
+ vmovdqa (%rsi), %ymm6 # term 0: 16 dwords from each operand (two ymm loads per pointer)
42
+ vmovdqa 0x20(%rsi), %ymm8
43
+ vmovdqa (%rdx), %ymm10
44
+ vmovdqa 0x20(%rdx), %ymm12
45
+ vpsrlq $0x20, %ymm6, %ymm7 # odd dwords of the %rsi data moved to even slots, since vpmuldq only reads even dwords
46
+ vpsrlq $0x20, %ymm8, %ymm9
47
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
48
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
49
+ vpmuldq %ymm10, %ymm6, %ymm6 # signed 32x32->64 products: ymm6/ymm8 from even dwords, ymm7/ymm9 from odd dwords
50
+ vpmuldq %ymm11, %ymm7, %ymm7
51
+ vpmuldq %ymm12, %ymm8, %ymm8
52
+ vpmuldq %ymm13, %ymm9, %ymm9
53
+ vmovdqa %ymm6, %ymm2 # ymm2..ymm5 = running 64-bit sums, seeded with the term 0 products
54
+ vmovdqa %ymm7, %ymm3
55
+ vmovdqa %ymm8, %ymm4
56
+ vmovdqa %ymm9, %ymm5
57
+ vmovdqa 0x400(%rsi), %ymm6 # term 1: same 64-byte slice, 0x400 bytes (256 dwords) further on in both operands
58
+ vmovdqa 0x420(%rsi), %ymm8
59
+ vmovdqa 0x400(%rdx), %ymm10
60
+ vmovdqa 0x420(%rdx), %ymm12
61
+ vpsrlq $0x20, %ymm6, %ymm7
62
+ vpsrlq $0x20, %ymm8, %ymm9
63
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
64
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
65
+ vpmuldq %ymm10, %ymm6, %ymm6
66
+ vpmuldq %ymm11, %ymm7, %ymm7
67
+ vpmuldq %ymm12, %ymm8, %ymm8
68
+ vpmuldq %ymm13, %ymm9, %ymm9
69
+ vpaddq %ymm2, %ymm6, %ymm2 # add the term 1 products to the running sums
70
+ vpaddq %ymm3, %ymm7, %ymm3
71
+ vpaddq %ymm4, %ymm8, %ymm4
72
+ vpaddq %ymm5, %ymm9, %ymm5
73
+ vmovdqa 0x800(%rsi), %ymm6 # term 2: offset 0x800
74
+ vmovdqa 0x820(%rsi), %ymm8
75
+ vmovdqa 0x800(%rdx), %ymm10
76
+ vmovdqa 0x820(%rdx), %ymm12
77
+ vpsrlq $0x20, %ymm6, %ymm7
78
+ vpsrlq $0x20, %ymm8, %ymm9
79
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
80
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
81
+ vpmuldq %ymm10, %ymm6, %ymm6
82
+ vpmuldq %ymm11, %ymm7, %ymm7
83
+ vpmuldq %ymm12, %ymm8, %ymm8
84
+ vpmuldq %ymm13, %ymm9, %ymm9
85
+ vpaddq %ymm2, %ymm6, %ymm2 # add the term 2 products to the running sums
86
+ vpaddq %ymm3, %ymm7, %ymm3
87
+ vpaddq %ymm4, %ymm8, %ymm4
88
+ vpaddq %ymm5, %ymm9, %ymm5
89
+ vmovdqa 0xc00(%rsi), %ymm6 # term 3: offset 0xc00 (last of the four terms)
90
+ vmovdqa 0xc20(%rsi), %ymm8
91
+ vmovdqa 0xc00(%rdx), %ymm10
92
+ vmovdqa 0xc20(%rdx), %ymm12
93
+ vpsrlq $0x20, %ymm6, %ymm7
94
+ vpsrlq $0x20, %ymm8, %ymm9
95
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
96
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
97
+ vpmuldq %ymm10, %ymm6, %ymm6
98
+ vpmuldq %ymm11, %ymm7, %ymm7
99
+ vpmuldq %ymm12, %ymm8, %ymm8
100
+ vpmuldq %ymm13, %ymm9, %ymm9
101
+ vpaddq %ymm2, %ymm6, %ymm2 # add the term 3 products to the running sums
102
+ vpaddq %ymm3, %ymm7, %ymm3
103
+ vpaddq %ymm4, %ymm8, %ymm4
104
+ vpaddq %ymm5, %ymm9, %ymm5
105
+ vpmuldq %ymm2, %ymm0, %ymm6 # reduction step 1: t = low dword of each sum * ymm0
106
+ vpmuldq %ymm3, %ymm0, %ymm7
107
+ vpmuldq %ymm4, %ymm0, %ymm8
108
+ vpmuldq %ymm5, %ymm0, %ymm9
109
+ vpmuldq %ymm6, %ymm1, %ymm6 # reduction step 2: t = low dword of t * ymm1
110
+ vpmuldq %ymm7, %ymm1, %ymm7
111
+ vpmuldq %ymm8, %ymm1, %ymm8
112
+ vpmuldq %ymm9, %ymm1, %ymm9
113
+ vpsubq %ymm6, %ymm2, %ymm2 # reduction step 3: sum -= t; only the high dword of each qword is kept below (Montgomery-style shape)
114
+ vpsubq %ymm7, %ymm3, %ymm3
115
+ vpsubq %ymm8, %ymm4, %ymm4
116
+ vpsubq %ymm9, %ymm5, %ymm5
117
+ vpsrlq $0x20, %ymm2, %ymm2 # high dwords of the even-lane results moved down into the even dword slots
118
+ vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
119
+ vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
120
+ vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7]
121
+ vmovdqa %ymm2, (%rdi) # store the 16 re-interleaved result dwords
122
+ vmovdqa %ymm4, 0x20(%rdi)
123
+ addq $0x40, %rsi # advance both inputs and the output by the 64 bytes just processed
124
+ addq $0x40, %rdx
125
+ addq $0x40, %rdi
126
+ addl $0x1, %eax
127
+ cmpl $0x10, %eax # 0x10 passes x 16 dwords = 256 dwords written to the output
128
+ jb Lpointwise_acc_l4_avx2_looptop2
129
+ retq
130
+ .cfi_endproc
131
+
132
+ MLD_ASM_FN_SIZE(pointwise_acc_l4_avx2_asm)
133
+
134
+ #endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
135
+ && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLDSA_L == 4) */
136
+
137
+ #if defined(__ELF__)
138
+ .section .note.GNU-stack,"",%progbits
139
+ #endif
@@ -0,0 +1,155 @@
1
+ /*
2
+ * Copyright (c) The mldsa-native project authors
3
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
4
+ */
5
+
6
+ /* References
7
+ * ==========
8
+ *
9
+ * - [REF_AVX2]
10
+ * CRYSTALS-Dilithium optimized AVX2 implementation
11
+ * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
12
+ * https://github.com/pq-crystals/dilithium/tree/master/avx2
13
+ */
14
+
15
+ /*
16
+ * This file is derived from the public domain
17
+ * AVX2 Dilithium implementation @[REF_AVX2].
18
+ */
19
+
20
+ #include "../../../common.h"
21
+ #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
22
+ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
23
+ (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLDSA_L == 5)
24
+
25
+ /*
26
+ * WARNING: This file is auto-derived from the mldsa-native source file
27
+ * dev/x86_64/src/pointwise_acc_l5_avx2_asm.S using scripts/simpasm. Do not modify it directly.
28
+ */
29
+
30
+ .text
31
+ .balign 4
32
+ .global MLD_ASM_NAMESPACE(pointwise_acc_l5_avx2_asm)
33
+ MLD_ASM_FN_SYMBOL(pointwise_acc_l5_avx2_asm)
34
+ /* Sums five lane-wise 32x32-bit products and stores the reduced result. rdi = output, rsi and rdx = the two inputs, rcx = constants (64 bytes read), eax = pass counter. */
35
+ .cfi_startproc
36
+ vmovdqa 0x20(%rcx), %ymm0 # ymm0 = 32 bytes at rcx+0x20: first multiplier of the reduction (presumably q^-1 mod 2^32 - TODO confirm against caller)
37
+ vmovdqa (%rcx), %ymm1 # ymm1 = 32 bytes at rcx: second multiplier of the reduction (presumably the modulus q - TODO confirm against caller)
38
+ xorl %eax, %eax # pass counter = 0
39
+
40
+ Lpointwise_acc_l5_avx2_looptop2: # each pass handles 0x40 bytes (16 dwords) of every operand
41
+ vmovdqa (%rsi), %ymm6 # term 0: operands at offset 0 (vmovdqa requires 32-byte aligned addresses)
42
+ vmovdqa 0x20(%rsi), %ymm8
43
+ vmovdqa (%rdx), %ymm10
44
+ vmovdqa 0x20(%rdx), %ymm12
45
+ vpsrlq $0x20, %ymm6, %ymm7 # odd dwords moved down to even slots, since vpmuldq reads only the low dword of each qword
46
+ vpsrlq $0x20, %ymm8, %ymm9
47
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
48
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
49
+ vpmuldq %ymm10, %ymm6, %ymm6 # signed 32x32 -> 64-bit products of the even dwords
50
+ vpmuldq %ymm11, %ymm7, %ymm7 # signed 32x32 -> 64-bit products of the odd dwords
51
+ vpmuldq %ymm12, %ymm8, %ymm8
52
+ vpmuldq %ymm13, %ymm9, %ymm9
53
+ vmovdqa %ymm6, %ymm2 # ymm2..ymm5 = 64-bit accumulators, started from the term 0 products
54
+ vmovdqa %ymm7, %ymm3
55
+ vmovdqa %ymm8, %ymm4
56
+ vmovdqa %ymm9, %ymm5
57
+ vmovdqa 0x400(%rsi), %ymm6 # term 1: operands 0x400 bytes further on
58
+ vmovdqa 0x420(%rsi), %ymm8
59
+ vmovdqa 0x400(%rdx), %ymm10
60
+ vmovdqa 0x420(%rdx), %ymm12
61
+ vpsrlq $0x20, %ymm6, %ymm7
62
+ vpsrlq $0x20, %ymm8, %ymm9
63
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
64
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
65
+ vpmuldq %ymm10, %ymm6, %ymm6
66
+ vpmuldq %ymm11, %ymm7, %ymm7
67
+ vpmuldq %ymm12, %ymm8, %ymm8
68
+ vpmuldq %ymm13, %ymm9, %ymm9
69
+ vpaddq %ymm2, %ymm6, %ymm2 # add term 1 products into the accumulators
70
+ vpaddq %ymm3, %ymm7, %ymm3
71
+ vpaddq %ymm4, %ymm8, %ymm4
72
+ vpaddq %ymm5, %ymm9, %ymm5
73
+ vmovdqa 0x800(%rsi), %ymm6 # term 2: operands at offset 0x800
74
+ vmovdqa 0x820(%rsi), %ymm8
75
+ vmovdqa 0x800(%rdx), %ymm10
76
+ vmovdqa 0x820(%rdx), %ymm12
77
+ vpsrlq $0x20, %ymm6, %ymm7
78
+ vpsrlq $0x20, %ymm8, %ymm9
79
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
80
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
81
+ vpmuldq %ymm10, %ymm6, %ymm6
82
+ vpmuldq %ymm11, %ymm7, %ymm7
83
+ vpmuldq %ymm12, %ymm8, %ymm8
84
+ vpmuldq %ymm13, %ymm9, %ymm9
85
+ vpaddq %ymm2, %ymm6, %ymm2 # add term 2 products into the accumulators
86
+ vpaddq %ymm3, %ymm7, %ymm3
87
+ vpaddq %ymm4, %ymm8, %ymm4
88
+ vpaddq %ymm5, %ymm9, %ymm5
89
+ vmovdqa 0xc00(%rsi), %ymm6 # term 3: operands at offset 0xc00
90
+ vmovdqa 0xc20(%rsi), %ymm8
91
+ vmovdqa 0xc00(%rdx), %ymm10
92
+ vmovdqa 0xc20(%rdx), %ymm12
93
+ vpsrlq $0x20, %ymm6, %ymm7
94
+ vpsrlq $0x20, %ymm8, %ymm9
95
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
96
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
97
+ vpmuldq %ymm10, %ymm6, %ymm6
98
+ vpmuldq %ymm11, %ymm7, %ymm7
99
+ vpmuldq %ymm12, %ymm8, %ymm8
100
+ vpmuldq %ymm13, %ymm9, %ymm9
101
+ vpaddq %ymm2, %ymm6, %ymm2 # add term 3 products into the accumulators
102
+ vpaddq %ymm3, %ymm7, %ymm3
103
+ vpaddq %ymm4, %ymm8, %ymm4
104
+ vpaddq %ymm5, %ymm9, %ymm5
105
+ vmovdqa 0x1000(%rsi), %ymm6 # term 4 (fifth and last): operands at offset 0x1000
106
+ vmovdqa 0x1020(%rsi), %ymm8
107
+ vmovdqa 0x1000(%rdx), %ymm10
108
+ vmovdqa 0x1020(%rdx), %ymm12
109
+ vpsrlq $0x20, %ymm6, %ymm7
110
+ vpsrlq $0x20, %ymm8, %ymm9
111
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
112
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
113
+ vpmuldq %ymm10, %ymm6, %ymm6
114
+ vpmuldq %ymm11, %ymm7, %ymm7
115
+ vpmuldq %ymm12, %ymm8, %ymm8
116
+ vpmuldq %ymm13, %ymm9, %ymm9
117
+ vpaddq %ymm2, %ymm6, %ymm2 # add term 4 products into the accumulators
118
+ vpaddq %ymm3, %ymm7, %ymm3
119
+ vpaddq %ymm4, %ymm8, %ymm4
120
+ vpaddq %ymm5, %ymm9, %ymm5
121
+ vpmuldq %ymm2, %ymm0, %ymm6 # reduction step 1: t = low32(acc) * ymm0 (Montgomery-style if ymm0 and ymm1 hold q^-1 and q - TODO confirm)
122
+ vpmuldq %ymm3, %ymm0, %ymm7
123
+ vpmuldq %ymm4, %ymm0, %ymm8
124
+ vpmuldq %ymm5, %ymm0, %ymm9
125
+ vpmuldq %ymm6, %ymm1, %ymm6 # reduction step 2: t = low32(t) * ymm1
126
+ vpmuldq %ymm7, %ymm1, %ymm7
127
+ vpmuldq %ymm8, %ymm1, %ymm8
128
+ vpmuldq %ymm9, %ymm1, %ymm9
129
+ vpsubq %ymm6, %ymm2, %ymm2 # reduction step 3: acc -= t; only the high dword of each qword is kept below
130
+ vpsubq %ymm7, %ymm3, %ymm3
131
+ vpsubq %ymm8, %ymm4, %ymm4
132
+ vpsubq %ymm9, %ymm5, %ymm5
133
+ vpsrlq $0x20, %ymm2, %ymm2 # high dwords of the even-lane results moved into the even dword slots
134
+ vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
135
+ vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
136
+ vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7]
137
+ vmovdqa %ymm2, (%rdi) # store the 16 result dwords of this pass (aligned store)
138
+ vmovdqa %ymm4, 0x20(%rdi)
139
+ addq $0x40, %rsi # advance both inputs and the output by the 0x40 bytes just processed
140
+ addq $0x40, %rdx
141
+ addq $0x40, %rdi
142
+ addl $0x1, %eax
143
+ cmpl $0x10, %eax # 0x10 passes x 0x40 bytes = 0x400 bytes, matching the stride between terms
144
+ jb Lpointwise_acc_l5_avx2_looptop2
145
+ retq
146
+ .cfi_endproc
147
+
148
+ MLD_ASM_FN_SIZE(pointwise_acc_l5_avx2_asm)
149
+
150
+ #endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
151
+ && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLDSA_L == 5) */
152
+
153
+ #if defined(__ELF__)
154
+ .section .note.GNU-stack,"",%progbits
155
+ #endif