pq_crypto 0.6.0 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +12 -0
  3. data/SECURITY.md +7 -0
  4. data/ext/pqcrypto/extconf.rb +2 -0
  5. data/ext/pqcrypto/pqcrypto_ruby_secure.c +139 -0
  6. data/ext/pqcrypto/pqcrypto_secure.c +532 -0
  7. data/ext/pqcrypto/pqcrypto_secure.h +20 -0
  8. data/ext/pqcrypto/pqcrypto_version.h +1 -1
  9. data/ext/pqcrypto/vendor/.vendored +4 -4
  10. data/ext/pqcrypto/vendor/mldsa-native/README.md +23 -10
  11. data/ext/pqcrypto/vendor/mldsa-native/mldsa/README.md +23 -0
  12. data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native.c +114 -58
  13. data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native.h +498 -461
  14. data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native_asm.S +145 -85
  15. data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native_config.h +456 -422
  16. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/cbmc.h +47 -25
  17. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/common.h +26 -14
  18. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/ct.h +56 -81
  19. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/debug.h +17 -24
  20. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202.c +33 -40
  21. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202.h +67 -87
  22. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202x4.c +19 -14
  23. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202x4.h +13 -5
  24. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/keccakf1600.c +84 -10
  25. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/keccakf1600.h +10 -5
  26. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/auto.h +6 -0
  27. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/fips202_native_aarch64.h +22 -15
  28. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_scalar_aarch64_asm.S +376 -0
  29. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_v84a_aarch64_asm.S +204 -0
  30. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x2_v84a_aarch64_asm.S +259 -0
  31. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_aarch64_asm.S +1077 -0
  32. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_aarch64_asm.S +987 -0
  33. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccakf1600_round_constants.c +16 -10
  34. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x1_scalar.h +2 -1
  35. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x1_v84a.h +1 -1
  36. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x2_v84a.h +4 -2
  37. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x4_v8a_scalar.h +2 -2
  38. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x4_v8a_v84a_scalar.h +1 -1
  39. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/api.h +60 -0
  40. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/mve.h +48 -0
  41. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/fips202_native_armv81m.h +18 -1
  42. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S +658 -582
  43. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.c +5 -100
  44. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/keccakf1600_round_constants.c +26 -25
  45. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/state_extract_bytes_x4_mve.S +334 -0
  46. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/state_xor_bytes_x4_mve.S +355 -0
  47. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/auto.h +8 -3
  48. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/{xkcp.h → keccak_f1600_x4_avx2.h} +11 -8
  49. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/fips202_native_x86_64.h +44 -0
  50. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/keccak_f1600_x4_avx2_asm.S +454 -0
  51. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/keccakf1600_constants.c +52 -0
  52. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/meta.h +37 -28
  53. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/aarch64_zetas.c +213 -196
  54. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/arith_native_aarch64.h +248 -64
  55. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/intt_aarch64_asm.S +753 -0
  56. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4_aarch64_asm.S +129 -0
  57. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5_aarch64_asm.S +145 -0
  58. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7_aarch64_asm.S +177 -0
  59. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/ntt_aarch64_asm.S +653 -0
  60. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/pointwise_montgomery_aarch64_asm.S +84 -0
  61. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_caddq_aarch64_asm.S +53 -0
  62. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_chknorm_aarch64_asm.S +55 -0
  63. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_32_aarch64_asm.S +86 -0
  64. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_88_aarch64_asm.S +86 -0
  65. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_32_aarch64_asm.S +103 -0
  66. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_88_aarch64_asm.S +111 -0
  67. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_17_aarch64_asm.S +75 -0
  68. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_19_aarch64_asm.S +72 -0
  69. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_table.c +23 -11
  70. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_aarch64_asm.S +189 -0
  71. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta2_aarch64_asm.S +137 -0
  72. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta4_aarch64_asm.S +130 -0
  73. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta_table.c +520 -516
  74. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_table.c +34 -33
  75. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/api.h +202 -242
  76. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/meta.h +25 -17
  77. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/arith_native_x86_64.h +112 -28
  78. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/consts.c +1 -1
  79. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/consts.h +1 -1
  80. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/intt_avx2_asm.S +2311 -0
  81. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/ntt_avx2_asm.S +2383 -0
  82. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/nttunpack_avx2_asm.S +238 -0
  83. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l4_avx2_asm.S +139 -0
  84. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l5_avx2_asm.S +155 -0
  85. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l7_avx2_asm.S +187 -0
  86. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_avx2_asm.S +130 -0
  87. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_caddq_avx2_asm.S +190 -0
  88. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c +6 -4
  89. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c +6 -4
  90. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c +9 -8
  91. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c +10 -9
  92. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c +8 -5
  93. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c +8 -5
  94. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/rej_uniform_eta2_avx2.c +6 -4
  95. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/rej_uniform_eta4_avx2.c +6 -4
  96. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/rej_uniform_table.c +130 -129
  97. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/packing.c +109 -180
  98. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/packing.h +169 -150
  99. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly.c +56 -40
  100. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly.h +149 -164
  101. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly_kl.c +52 -57
  102. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly_kl.h +132 -167
  103. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec.c +57 -424
  104. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec.h +167 -474
  105. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec_lazy.c +308 -0
  106. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec_lazy.h +653 -0
  107. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/reduce.h +22 -29
  108. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/rounding.h +37 -43
  109. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/sign.c +511 -367
  110. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/sign.h +456 -417
  111. data/lib/pq_crypto/hybrid_kem.rb +1 -1
  112. data/lib/pq_crypto/internal.rb +23 -0
  113. data/lib/pq_crypto/kem.rb +27 -34
  114. data/lib/pq_crypto/pkcs8/der.rb +68 -0
  115. data/lib/pq_crypto/pkcs8/private_key_choice.rb +186 -0
  116. data/lib/pq_crypto/pkcs8.rb +51 -468
  117. data/lib/pq_crypto/serialization.rb +19 -29
  118. data/lib/pq_crypto/signature.rb +28 -35
  119. data/lib/pq_crypto/version.rb +1 -1
  120. data/lib/pq_crypto.rb +10 -0
  121. data/script/vendor_libs.rb +3 -3
  122. metadata +44 -35
  123. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_scalar_asm.S +0 -376
  124. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_v84a_asm.S +0 -204
  125. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x2_v84a_asm.S +0 -259
  126. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_asm.S +0 -1077
  127. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm.S +0 -987
  128. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.c +0 -488
  129. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.h +0 -16
  130. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/intt.S +0 -753
  131. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4.S +0 -129
  132. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5.S +0 -145
  133. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7.S +0 -177
  134. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/ntt.S +0 -653
  135. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/pointwise_montgomery.S +0 -79
  136. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_caddq_asm.S +0 -53
  137. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_chknorm_asm.S +0 -55
  138. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_32_asm.S +0 -85
  139. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_88_asm.S +0 -85
  140. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_32_asm.S +0 -102
  141. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_88_asm.S +0 -110
  142. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_17_asm.S +0 -72
  143. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_19_asm.S +0 -69
  144. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_asm.S +0 -189
  145. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta2_asm.S +0 -135
  146. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta4_asm.S +0 -128
  147. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/intt.S +0 -2311
  148. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/ntt.S +0 -2383
  149. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/nttunpack.S +0 -239
  150. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise.S +0 -131
  151. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l4.S +0 -139
  152. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l5.S +0 -155
  153. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l7.S +0 -187
  154. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_caddq_avx2.c +0 -61
@@ -0,0 +1,238 @@
1
+ /*
2
+ * Copyright (c) The mlkem-native project authors
3
+ * Copyright (c) The mldsa-native project authors
4
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
5
+ */
6
+
7
+ /* References
8
+ * ==========
9
+ *
10
+ * - [REF_AVX2]
11
+ * CRYSTALS-Dilithium optimized AVX2 implementation
12
+ * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
13
+ * https://github.com/pq-crystals/dilithium/tree/master/avx2
14
+ */
15
+
16
+ /*
17
+ * This file is derived from the public domain
18
+ * AVX2 Dilithium implementation @[REF_AVX2].
19
+ */
20
+
21
+ #include "../../../common.h"
22
+ #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
23
+ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
24
+
25
+ /*
26
+ * WARNING: This file is auto-derived from the mldsa-native source file
27
+ * dev/x86_64/src/nttunpack_avx2_asm.S using scripts/simpasm. Do not modify it directly.
28
+ */
29
+
30
+ .text
31
+ .balign 4
32
+ .global MLD_ASM_NAMESPACE(nttunpack_avx2_asm)
33
+ MLD_ASM_FN_SYMBOL(nttunpack_avx2_asm)
34
+
35
+ .cfi_startproc
36
+ vmovdqa (%rdi), %ymm4
37
+ vmovdqa 0x20(%rdi), %ymm5
38
+ vmovdqa 0x40(%rdi), %ymm6
39
+ vmovdqa 0x60(%rdi), %ymm7
40
+ vmovdqa 0x80(%rdi), %ymm8
41
+ vmovdqa 0xa0(%rdi), %ymm9
42
+ vmovdqa 0xc0(%rdi), %ymm10
43
+ vmovdqa 0xe0(%rdi), %ymm11
44
+ vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
45
+ vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
46
+ vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
47
+ vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
48
+ vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
49
+ vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
50
+ vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
51
+ vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
52
+ vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
53
+ vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
54
+ vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
55
+ vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
56
+ vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
57
+ vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
58
+ vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
59
+ vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
60
+ vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
61
+ vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
62
+ vpsrlq $0x20, %ymm7, %ymm7
63
+ vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
64
+ vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
65
+ vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
66
+ vpsrlq $0x20, %ymm5, %ymm5
67
+ vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
68
+ vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
69
+ vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
70
+ vpsrlq $0x20, %ymm3, %ymm3
71
+ vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
72
+ vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
73
+ vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
74
+ vpsrlq $0x20, %ymm10, %ymm10
75
+ vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
76
+ vmovdqa %ymm9, (%rdi)
77
+ vmovdqa %ymm8, 0x20(%rdi)
78
+ vmovdqa %ymm7, 0x40(%rdi)
79
+ vmovdqa %ymm6, 0x60(%rdi)
80
+ vmovdqa %ymm5, 0x80(%rdi)
81
+ vmovdqa %ymm4, 0xa0(%rdi)
82
+ vmovdqa %ymm3, 0xc0(%rdi)
83
+ vmovdqa %ymm11, 0xe0(%rdi)
84
+ vmovdqa 0x100(%rdi), %ymm4
85
+ vmovdqa 0x120(%rdi), %ymm5
86
+ vmovdqa 0x140(%rdi), %ymm6
87
+ vmovdqa 0x160(%rdi), %ymm7
88
+ vmovdqa 0x180(%rdi), %ymm8
89
+ vmovdqa 0x1a0(%rdi), %ymm9
90
+ vmovdqa 0x1c0(%rdi), %ymm10
91
+ vmovdqa 0x1e0(%rdi), %ymm11
92
+ vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
93
+ vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
94
+ vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
95
+ vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
96
+ vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
97
+ vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
98
+ vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
99
+ vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
100
+ vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
101
+ vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
102
+ vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
103
+ vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
104
+ vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
105
+ vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
106
+ vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
107
+ vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
108
+ vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
109
+ vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
110
+ vpsrlq $0x20, %ymm7, %ymm7
111
+ vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
112
+ vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
113
+ vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
114
+ vpsrlq $0x20, %ymm5, %ymm5
115
+ vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
116
+ vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
117
+ vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
118
+ vpsrlq $0x20, %ymm3, %ymm3
119
+ vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
120
+ vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
121
+ vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
122
+ vpsrlq $0x20, %ymm10, %ymm10
123
+ vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
124
+ vmovdqa %ymm9, 0x100(%rdi)
125
+ vmovdqa %ymm8, 0x120(%rdi)
126
+ vmovdqa %ymm7, 0x140(%rdi)
127
+ vmovdqa %ymm6, 0x160(%rdi)
128
+ vmovdqa %ymm5, 0x180(%rdi)
129
+ vmovdqa %ymm4, 0x1a0(%rdi)
130
+ vmovdqa %ymm3, 0x1c0(%rdi)
131
+ vmovdqa %ymm11, 0x1e0(%rdi)
132
+ vmovdqa 0x200(%rdi), %ymm4
133
+ vmovdqa 0x220(%rdi), %ymm5
134
+ vmovdqa 0x240(%rdi), %ymm6
135
+ vmovdqa 0x260(%rdi), %ymm7
136
+ vmovdqa 0x280(%rdi), %ymm8
137
+ vmovdqa 0x2a0(%rdi), %ymm9
138
+ vmovdqa 0x2c0(%rdi), %ymm10
139
+ vmovdqa 0x2e0(%rdi), %ymm11
140
+ vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
141
+ vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
142
+ vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
143
+ vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
144
+ vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
145
+ vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
146
+ vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
147
+ vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
148
+ vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
149
+ vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
150
+ vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
151
+ vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
152
+ vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
153
+ vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
154
+ vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
155
+ vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
156
+ vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
157
+ vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
158
+ vpsrlq $0x20, %ymm7, %ymm7
159
+ vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
160
+ vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
161
+ vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
162
+ vpsrlq $0x20, %ymm5, %ymm5
163
+ vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
164
+ vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
165
+ vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
166
+ vpsrlq $0x20, %ymm3, %ymm3
167
+ vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
168
+ vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
169
+ vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
170
+ vpsrlq $0x20, %ymm10, %ymm10
171
+ vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
172
+ vmovdqa %ymm9, 0x200(%rdi)
173
+ vmovdqa %ymm8, 0x220(%rdi)
174
+ vmovdqa %ymm7, 0x240(%rdi)
175
+ vmovdqa %ymm6, 0x260(%rdi)
176
+ vmovdqa %ymm5, 0x280(%rdi)
177
+ vmovdqa %ymm4, 0x2a0(%rdi)
178
+ vmovdqa %ymm3, 0x2c0(%rdi)
179
+ vmovdqa %ymm11, 0x2e0(%rdi)
180
+ vmovdqa 0x300(%rdi), %ymm4
181
+ vmovdqa 0x320(%rdi), %ymm5
182
+ vmovdqa 0x340(%rdi), %ymm6
183
+ vmovdqa 0x360(%rdi), %ymm7
184
+ vmovdqa 0x380(%rdi), %ymm8
185
+ vmovdqa 0x3a0(%rdi), %ymm9
186
+ vmovdqa 0x3c0(%rdi), %ymm10
187
+ vmovdqa 0x3e0(%rdi), %ymm11
188
+ vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
189
+ vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
190
+ vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
191
+ vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
192
+ vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
193
+ vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
194
+ vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
195
+ vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
196
+ vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
197
+ vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
198
+ vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
199
+ vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
200
+ vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
201
+ vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
202
+ vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
203
+ vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
204
+ vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
205
+ vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
206
+ vpsrlq $0x20, %ymm7, %ymm7
207
+ vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
208
+ vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
209
+ vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
210
+ vpsrlq $0x20, %ymm5, %ymm5
211
+ vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
212
+ vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
213
+ vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
214
+ vpsrlq $0x20, %ymm3, %ymm3
215
+ vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
216
+ vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
217
+ vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
218
+ vpsrlq $0x20, %ymm10, %ymm10
219
+ vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
220
+ vmovdqa %ymm9, 0x300(%rdi)
221
+ vmovdqa %ymm8, 0x320(%rdi)
222
+ vmovdqa %ymm7, 0x340(%rdi)
223
+ vmovdqa %ymm6, 0x360(%rdi)
224
+ vmovdqa %ymm5, 0x380(%rdi)
225
+ vmovdqa %ymm4, 0x3a0(%rdi)
226
+ vmovdqa %ymm3, 0x3c0(%rdi)
227
+ vmovdqa %ymm11, 0x3e0(%rdi)
228
+ retq
229
+ .cfi_endproc
230
+
231
+ MLD_ASM_FN_SIZE(nttunpack_avx2_asm)
232
+
233
+ #endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
234
+ */
235
+
236
+ #if defined(__ELF__)
237
+ .section .note.GNU-stack,"",%progbits
238
+ #endif
@@ -0,0 +1,139 @@
1
+ /*
2
+ * Copyright (c) The mldsa-native project authors
3
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
4
+ */
5
+
6
+ /* References
7
+ * ==========
8
+ *
9
+ * - [REF_AVX2]
10
+ * CRYSTALS-Dilithium optimized AVX2 implementation
11
+ * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
12
+ * https://github.com/pq-crystals/dilithium/tree/master/avx2
13
+ */
14
+
15
+ /*
16
+ * This file is derived from the public domain
17
+ * AVX2 Dilithium implementation @[REF_AVX2].
18
+ */
19
+
20
+ #include "../../../common.h"
21
+ #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
22
+ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
23
+ (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLDSA_L == 4)
24
+
25
+ /*
26
+ * WARNING: This file is auto-derived from the mldsa-native source file
27
+ * dev/x86_64/src/pointwise_acc_l4_avx2_asm.S using scripts/simpasm. Do not modify it directly.
28
+ */
29
+
30
+ .text
31
+ .balign 4
32
+ .global MLD_ASM_NAMESPACE(pointwise_acc_l4_avx2_asm)
33
+ MLD_ASM_FN_SYMBOL(pointwise_acc_l4_avx2_asm)
34
+
35
+ .cfi_startproc
36
+ vmovdqa 0x20(%rcx), %ymm0
37
+ vmovdqa (%rcx), %ymm1
38
+ xorl %eax, %eax
39
+
40
+ Lpointwise_acc_l4_avx2_looptop2:
41
+ vmovdqa (%rsi), %ymm6
42
+ vmovdqa 0x20(%rsi), %ymm8
43
+ vmovdqa (%rdx), %ymm10
44
+ vmovdqa 0x20(%rdx), %ymm12
45
+ vpsrlq $0x20, %ymm6, %ymm7
46
+ vpsrlq $0x20, %ymm8, %ymm9
47
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
48
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
49
+ vpmuldq %ymm10, %ymm6, %ymm6
50
+ vpmuldq %ymm11, %ymm7, %ymm7
51
+ vpmuldq %ymm12, %ymm8, %ymm8
52
+ vpmuldq %ymm13, %ymm9, %ymm9
53
+ vmovdqa %ymm6, %ymm2
54
+ vmovdqa %ymm7, %ymm3
55
+ vmovdqa %ymm8, %ymm4
56
+ vmovdqa %ymm9, %ymm5
57
+ vmovdqa 0x400(%rsi), %ymm6
58
+ vmovdqa 0x420(%rsi), %ymm8
59
+ vmovdqa 0x400(%rdx), %ymm10
60
+ vmovdqa 0x420(%rdx), %ymm12
61
+ vpsrlq $0x20, %ymm6, %ymm7
62
+ vpsrlq $0x20, %ymm8, %ymm9
63
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
64
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
65
+ vpmuldq %ymm10, %ymm6, %ymm6
66
+ vpmuldq %ymm11, %ymm7, %ymm7
67
+ vpmuldq %ymm12, %ymm8, %ymm8
68
+ vpmuldq %ymm13, %ymm9, %ymm9
69
+ vpaddq %ymm2, %ymm6, %ymm2
70
+ vpaddq %ymm3, %ymm7, %ymm3
71
+ vpaddq %ymm4, %ymm8, %ymm4
72
+ vpaddq %ymm5, %ymm9, %ymm5
73
+ vmovdqa 0x800(%rsi), %ymm6
74
+ vmovdqa 0x820(%rsi), %ymm8
75
+ vmovdqa 0x800(%rdx), %ymm10
76
+ vmovdqa 0x820(%rdx), %ymm12
77
+ vpsrlq $0x20, %ymm6, %ymm7
78
+ vpsrlq $0x20, %ymm8, %ymm9
79
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
80
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
81
+ vpmuldq %ymm10, %ymm6, %ymm6
82
+ vpmuldq %ymm11, %ymm7, %ymm7
83
+ vpmuldq %ymm12, %ymm8, %ymm8
84
+ vpmuldq %ymm13, %ymm9, %ymm9
85
+ vpaddq %ymm2, %ymm6, %ymm2
86
+ vpaddq %ymm3, %ymm7, %ymm3
87
+ vpaddq %ymm4, %ymm8, %ymm4
88
+ vpaddq %ymm5, %ymm9, %ymm5
89
+ vmovdqa 0xc00(%rsi), %ymm6
90
+ vmovdqa 0xc20(%rsi), %ymm8
91
+ vmovdqa 0xc00(%rdx), %ymm10
92
+ vmovdqa 0xc20(%rdx), %ymm12
93
+ vpsrlq $0x20, %ymm6, %ymm7
94
+ vpsrlq $0x20, %ymm8, %ymm9
95
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
96
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
97
+ vpmuldq %ymm10, %ymm6, %ymm6
98
+ vpmuldq %ymm11, %ymm7, %ymm7
99
+ vpmuldq %ymm12, %ymm8, %ymm8
100
+ vpmuldq %ymm13, %ymm9, %ymm9
101
+ vpaddq %ymm2, %ymm6, %ymm2
102
+ vpaddq %ymm3, %ymm7, %ymm3
103
+ vpaddq %ymm4, %ymm8, %ymm4
104
+ vpaddq %ymm5, %ymm9, %ymm5
105
+ vpmuldq %ymm2, %ymm0, %ymm6
106
+ vpmuldq %ymm3, %ymm0, %ymm7
107
+ vpmuldq %ymm4, %ymm0, %ymm8
108
+ vpmuldq %ymm5, %ymm0, %ymm9
109
+ vpmuldq %ymm6, %ymm1, %ymm6
110
+ vpmuldq %ymm7, %ymm1, %ymm7
111
+ vpmuldq %ymm8, %ymm1, %ymm8
112
+ vpmuldq %ymm9, %ymm1, %ymm9
113
+ vpsubq %ymm6, %ymm2, %ymm2
114
+ vpsubq %ymm7, %ymm3, %ymm3
115
+ vpsubq %ymm8, %ymm4, %ymm4
116
+ vpsubq %ymm9, %ymm5, %ymm5
117
+ vpsrlq $0x20, %ymm2, %ymm2
118
+ vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
119
+ vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
120
+ vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7]
121
+ vmovdqa %ymm2, (%rdi)
122
+ vmovdqa %ymm4, 0x20(%rdi)
123
+ addq $0x40, %rsi
124
+ addq $0x40, %rdx
125
+ addq $0x40, %rdi
126
+ addl $0x1, %eax
127
+ cmpl $0x10, %eax
128
+ jb Lpointwise_acc_l4_avx2_looptop2
129
+ retq
130
+ .cfi_endproc
131
+
132
+ MLD_ASM_FN_SIZE(pointwise_acc_l4_avx2_asm)
133
+
134
+ #endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
135
+ && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLDSA_L == 4) */
136
+
137
+ #if defined(__ELF__)
138
+ .section .note.GNU-stack,"",%progbits
139
+ #endif
@@ -0,0 +1,155 @@
1
+ /*
2
+ * Copyright (c) The mldsa-native project authors
3
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
4
+ */
5
+
6
+ /* References
7
+ * ==========
8
+ *
9
+ * - [REF_AVX2]
10
+ * CRYSTALS-Dilithium optimized AVX2 implementation
11
+ * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
12
+ * https://github.com/pq-crystals/dilithium/tree/master/avx2
13
+ */
14
+
15
+ /*
16
+ * This file is derived from the public domain
17
+ * AVX2 Dilithium implementation @[REF_AVX2].
18
+ */
19
+
20
+ #include "../../../common.h"
21
+ #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
22
+ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
23
+ (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLDSA_L == 5)
24
+
25
+ /*
26
+ * WARNING: This file is auto-derived from the mldsa-native source file
27
+ * dev/x86_64/src/pointwise_acc_l5_avx2_asm.S using scripts/simpasm. Do not modify it directly.
28
+ */
29
+
30
+ .text
31
+ .balign 4
32
+ .global MLD_ASM_NAMESPACE(pointwise_acc_l5_avx2_asm)
33
+ MLD_ASM_FN_SYMBOL(pointwise_acc_l5_avx2_asm)
34
+
35
+ .cfi_startproc
36
+ vmovdqa 0x20(%rcx), %ymm0
37
+ vmovdqa (%rcx), %ymm1
38
+ xorl %eax, %eax
39
+
40
+ Lpointwise_acc_l5_avx2_looptop2:
41
+ vmovdqa (%rsi), %ymm6
42
+ vmovdqa 0x20(%rsi), %ymm8
43
+ vmovdqa (%rdx), %ymm10
44
+ vmovdqa 0x20(%rdx), %ymm12
45
+ vpsrlq $0x20, %ymm6, %ymm7
46
+ vpsrlq $0x20, %ymm8, %ymm9
47
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
48
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
49
+ vpmuldq %ymm10, %ymm6, %ymm6
50
+ vpmuldq %ymm11, %ymm7, %ymm7
51
+ vpmuldq %ymm12, %ymm8, %ymm8
52
+ vpmuldq %ymm13, %ymm9, %ymm9
53
+ vmovdqa %ymm6, %ymm2
54
+ vmovdqa %ymm7, %ymm3
55
+ vmovdqa %ymm8, %ymm4
56
+ vmovdqa %ymm9, %ymm5
57
+ vmovdqa 0x400(%rsi), %ymm6
58
+ vmovdqa 0x420(%rsi), %ymm8
59
+ vmovdqa 0x400(%rdx), %ymm10
60
+ vmovdqa 0x420(%rdx), %ymm12
61
+ vpsrlq $0x20, %ymm6, %ymm7
62
+ vpsrlq $0x20, %ymm8, %ymm9
63
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
64
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
65
+ vpmuldq %ymm10, %ymm6, %ymm6
66
+ vpmuldq %ymm11, %ymm7, %ymm7
67
+ vpmuldq %ymm12, %ymm8, %ymm8
68
+ vpmuldq %ymm13, %ymm9, %ymm9
69
+ vpaddq %ymm2, %ymm6, %ymm2
70
+ vpaddq %ymm3, %ymm7, %ymm3
71
+ vpaddq %ymm4, %ymm8, %ymm4
72
+ vpaddq %ymm5, %ymm9, %ymm5
73
+ vmovdqa 0x800(%rsi), %ymm6
74
+ vmovdqa 0x820(%rsi), %ymm8
75
+ vmovdqa 0x800(%rdx), %ymm10
76
+ vmovdqa 0x820(%rdx), %ymm12
77
+ vpsrlq $0x20, %ymm6, %ymm7
78
+ vpsrlq $0x20, %ymm8, %ymm9
79
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
80
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
81
+ vpmuldq %ymm10, %ymm6, %ymm6
82
+ vpmuldq %ymm11, %ymm7, %ymm7
83
+ vpmuldq %ymm12, %ymm8, %ymm8
84
+ vpmuldq %ymm13, %ymm9, %ymm9
85
+ vpaddq %ymm2, %ymm6, %ymm2
86
+ vpaddq %ymm3, %ymm7, %ymm3
87
+ vpaddq %ymm4, %ymm8, %ymm4
88
+ vpaddq %ymm5, %ymm9, %ymm5
89
+ vmovdqa 0xc00(%rsi), %ymm6
90
+ vmovdqa 0xc20(%rsi), %ymm8
91
+ vmovdqa 0xc00(%rdx), %ymm10
92
+ vmovdqa 0xc20(%rdx), %ymm12
93
+ vpsrlq $0x20, %ymm6, %ymm7
94
+ vpsrlq $0x20, %ymm8, %ymm9
95
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
96
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
97
+ vpmuldq %ymm10, %ymm6, %ymm6
98
+ vpmuldq %ymm11, %ymm7, %ymm7
99
+ vpmuldq %ymm12, %ymm8, %ymm8
100
+ vpmuldq %ymm13, %ymm9, %ymm9
101
+ vpaddq %ymm2, %ymm6, %ymm2
102
+ vpaddq %ymm3, %ymm7, %ymm3
103
+ vpaddq %ymm4, %ymm8, %ymm4
104
+ vpaddq %ymm5, %ymm9, %ymm5
105
+ vmovdqa 0x1000(%rsi), %ymm6
106
+ vmovdqa 0x1020(%rsi), %ymm8
107
+ vmovdqa 0x1000(%rdx), %ymm10
108
+ vmovdqa 0x1020(%rdx), %ymm12
109
+ vpsrlq $0x20, %ymm6, %ymm7
110
+ vpsrlq $0x20, %ymm8, %ymm9
111
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
112
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
113
+ vpmuldq %ymm10, %ymm6, %ymm6
114
+ vpmuldq %ymm11, %ymm7, %ymm7
115
+ vpmuldq %ymm12, %ymm8, %ymm8
116
+ vpmuldq %ymm13, %ymm9, %ymm9
117
+ vpaddq %ymm2, %ymm6, %ymm2
118
+ vpaddq %ymm3, %ymm7, %ymm3
119
+ vpaddq %ymm4, %ymm8, %ymm4
120
+ vpaddq %ymm5, %ymm9, %ymm5
121
+ vpmuldq %ymm2, %ymm0, %ymm6
122
+ vpmuldq %ymm3, %ymm0, %ymm7
123
+ vpmuldq %ymm4, %ymm0, %ymm8
124
+ vpmuldq %ymm5, %ymm0, %ymm9
125
+ vpmuldq %ymm6, %ymm1, %ymm6
126
+ vpmuldq %ymm7, %ymm1, %ymm7
127
+ vpmuldq %ymm8, %ymm1, %ymm8
128
+ vpmuldq %ymm9, %ymm1, %ymm9
129
+ vpsubq %ymm6, %ymm2, %ymm2
130
+ vpsubq %ymm7, %ymm3, %ymm3
131
+ vpsubq %ymm8, %ymm4, %ymm4
132
+ vpsubq %ymm9, %ymm5, %ymm5
133
+ vpsrlq $0x20, %ymm2, %ymm2
134
+ vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
135
+ vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
136
+ vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7]
137
+ vmovdqa %ymm2, (%rdi)
138
+ vmovdqa %ymm4, 0x20(%rdi)
139
+ addq $0x40, %rsi
140
+ addq $0x40, %rdx
141
+ addq $0x40, %rdi
142
+ addl $0x1, %eax
143
+ cmpl $0x10, %eax
144
+ jb Lpointwise_acc_l5_avx2_looptop2
145
+ retq
146
+ .cfi_endproc
147
+
148
+ MLD_ASM_FN_SIZE(pointwise_acc_l5_avx2_asm)
149
+
150
+ #endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
151
+ && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLDSA_L == 5) */
152
+
153
+ #if defined(__ELF__)
154
+ .section .note.GNU-stack,"",%progbits
155
+ #endif