pq_crypto 0.6.1 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/SECURITY.md +7 -0
  4. data/ext/pqcrypto/pqcrypto_version.h +1 -1
  5. data/ext/pqcrypto/vendor/.vendored +4 -4
  6. data/ext/pqcrypto/vendor/mldsa-native/README.md +23 -10
  7. data/ext/pqcrypto/vendor/mldsa-native/mldsa/README.md +23 -0
  8. data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native.c +114 -58
  9. data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native.h +498 -461
  10. data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native_asm.S +145 -85
  11. data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native_config.h +456 -422
  12. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/cbmc.h +47 -25
  13. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/common.h +26 -14
  14. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/ct.h +56 -81
  15. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/debug.h +17 -24
  16. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202.c +33 -40
  17. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202.h +67 -87
  18. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202x4.c +19 -14
  19. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202x4.h +13 -5
  20. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/keccakf1600.c +84 -10
  21. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/keccakf1600.h +10 -5
  22. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/auto.h +6 -0
  23. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/fips202_native_aarch64.h +22 -15
  24. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_scalar_aarch64_asm.S +376 -0
  25. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_v84a_aarch64_asm.S +204 -0
  26. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x2_v84a_aarch64_asm.S +259 -0
  27. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_aarch64_asm.S +1077 -0
  28. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_aarch64_asm.S +987 -0
  29. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccakf1600_round_constants.c +16 -10
  30. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x1_scalar.h +2 -1
  31. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x1_v84a.h +1 -1
  32. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x2_v84a.h +4 -2
  33. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x4_v8a_scalar.h +2 -2
  34. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x4_v8a_v84a_scalar.h +1 -1
  35. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/api.h +60 -0
  36. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/mve.h +48 -0
  37. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/fips202_native_armv81m.h +18 -1
  38. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S +658 -582
  39. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.c +5 -100
  40. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/keccakf1600_round_constants.c +26 -25
  41. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/state_extract_bytes_x4_mve.S +334 -0
  42. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/state_xor_bytes_x4_mve.S +355 -0
  43. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/auto.h +8 -3
  44. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/{xkcp.h → keccak_f1600_x4_avx2.h} +11 -8
  45. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/fips202_native_x86_64.h +44 -0
  46. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/keccak_f1600_x4_avx2_asm.S +454 -0
  47. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/keccakf1600_constants.c +52 -0
  48. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/meta.h +37 -28
  49. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/aarch64_zetas.c +213 -196
  50. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/arith_native_aarch64.h +248 -64
  51. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/intt_aarch64_asm.S +753 -0
  52. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4_aarch64_asm.S +129 -0
  53. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5_aarch64_asm.S +145 -0
  54. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7_aarch64_asm.S +177 -0
  55. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/ntt_aarch64_asm.S +653 -0
  56. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/pointwise_montgomery_aarch64_asm.S +84 -0
  57. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_caddq_aarch64_asm.S +53 -0
  58. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_chknorm_aarch64_asm.S +55 -0
  59. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_32_aarch64_asm.S +86 -0
  60. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_88_aarch64_asm.S +86 -0
  61. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_32_aarch64_asm.S +103 -0
  62. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_88_aarch64_asm.S +111 -0
  63. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_17_aarch64_asm.S +75 -0
  64. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_19_aarch64_asm.S +72 -0
  65. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_table.c +23 -11
  66. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_aarch64_asm.S +189 -0
  67. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta2_aarch64_asm.S +137 -0
  68. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta4_aarch64_asm.S +130 -0
  69. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta_table.c +520 -516
  70. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_table.c +34 -33
  71. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/api.h +202 -242
  72. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/meta.h +25 -17
  73. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/arith_native_x86_64.h +112 -28
  74. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/consts.c +1 -1
  75. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/consts.h +1 -1
  76. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/intt_avx2_asm.S +2311 -0
  77. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/ntt_avx2_asm.S +2383 -0
  78. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/nttunpack_avx2_asm.S +238 -0
  79. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l4_avx2_asm.S +139 -0
  80. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l5_avx2_asm.S +155 -0
  81. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l7_avx2_asm.S +187 -0
  82. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_avx2_asm.S +130 -0
  83. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_caddq_avx2_asm.S +190 -0
  84. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c +6 -4
  85. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c +6 -4
  86. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c +9 -8
  87. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c +10 -9
  88. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c +8 -5
  89. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c +8 -5
  90. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/rej_uniform_eta2_avx2.c +6 -4
  91. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/rej_uniform_eta4_avx2.c +6 -4
  92. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/rej_uniform_table.c +130 -129
  93. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/packing.c +109 -180
  94. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/packing.h +169 -150
  95. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly.c +56 -40
  96. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly.h +149 -164
  97. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly_kl.c +52 -57
  98. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly_kl.h +132 -167
  99. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec.c +57 -424
  100. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec.h +167 -474
  101. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec_lazy.c +308 -0
  102. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec_lazy.h +653 -0
  103. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/reduce.h +22 -29
  104. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/rounding.h +37 -43
  105. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/sign.c +511 -367
  106. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/sign.h +456 -417
  107. data/lib/pq_crypto/version.rb +1 -1
  108. data/script/vendor_libs.rb +3 -3
  109. metadata +41 -35
  110. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_scalar_asm.S +0 -376
  111. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_v84a_asm.S +0 -204
  112. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x2_v84a_asm.S +0 -259
  113. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_asm.S +0 -1077
  114. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm.S +0 -987
  115. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.c +0 -488
  116. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.h +0 -16
  117. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/intt.S +0 -753
  118. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4.S +0 -129
  119. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5.S +0 -145
  120. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7.S +0 -177
  121. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/ntt.S +0 -653
  122. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/pointwise_montgomery.S +0 -79
  123. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_caddq_asm.S +0 -53
  124. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_chknorm_asm.S +0 -55
  125. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_32_asm.S +0 -85
  126. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_88_asm.S +0 -85
  127. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_32_asm.S +0 -102
  128. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_88_asm.S +0 -110
  129. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_17_asm.S +0 -72
  130. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_19_asm.S +0 -69
  131. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_asm.S +0 -189
  132. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta2_asm.S +0 -135
  133. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta4_asm.S +0 -128
  134. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/intt.S +0 -2311
  135. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/ntt.S +0 -2383
  136. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/nttunpack.S +0 -239
  137. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise.S +0 -131
  138. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l4.S +0 -139
  139. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l5.S +0 -155
  140. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l7.S +0 -187
  141. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_caddq_avx2.c +0 -61
@@ -1,239 +0,0 @@
1
- /*
2
- * Copyright (c) The mlkem-native project authors
3
- * Copyright (c) The mldsa-native project authors
4
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
5
- */
6
-
7
- /* References
8
- * ==========
9
- *
10
- * - [REF_AVX2]
11
- * CRYSTALS-Dilithium optimized AVX2 implementation
12
- * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
13
- * https://github.com/pq-crystals/dilithium/tree/master/avx2
14
- */
15
-
16
- /*
17
- * This file is derived from the public domain
18
- * AVX2 Dilithium implementation @[REF_AVX2].
19
- */
20
-
21
- #include "../../../common.h"
22
-
23
- #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
24
- !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
25
-
26
- /*
27
- * WARNING: This file is auto-derived from the mldsa-native source file
28
- * dev/x86_64/src/nttunpack.S using scripts/simpasm. Do not modify it directly.
29
- */
30
-
31
- #if defined(__ELF__)
32
- .section .note.GNU-stack,"",@progbits
33
- #endif
34
-
35
- .text
36
- .balign 4
37
- .global MLD_ASM_NAMESPACE(nttunpack_avx2)
38
- MLD_ASM_FN_SYMBOL(nttunpack_avx2) /* In-place permutation of the 0x400 bytes at (%rdi), done as four independent 0x100-byte chunks. Each chunk is read as 8 rows of 8 dwords and written back transposed. NOTE(review): rdi is presumably the first SysV argument - confirm against the C prototype. */
39
-
40
- .cfi_startproc
41
- vmovdqa (%rdi), %ymm4 # chunk 0 (bytes 0x000-0x0ff): load rows 0..7 into ymm4..ymm11; vmovdqa needs 32-byte aligned addresses
42
- vmovdqa 0x20(%rdi), %ymm5
43
- vmovdqa 0x40(%rdi), %ymm6
44
- vmovdqa 0x60(%rdi), %ymm7
45
- vmovdqa 0x80(%rdi), %ymm8
46
- vmovdqa 0xa0(%rdi), %ymm9
47
- vmovdqa 0xc0(%rdi), %ymm10
48
- vmovdqa 0xe0(%rdi), %ymm11
49
- vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
50
- vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
51
- vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
52
- vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
53
- vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
54
- vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
55
- vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
56
- vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
57
- vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
58
- vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
59
- vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
60
- vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
61
- vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
62
- vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
63
- vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
64
- vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
65
- vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
66
- vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
67
- vpsrlq $0x20, %ymm7, %ymm7 # shift each qword right by 32: the odd dwords of ymm7 drop into the even positions for the next blend
68
- vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
69
- vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
70
- vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
71
- vpsrlq $0x20, %ymm5, %ymm5 # odd dwords of ymm5 -> even positions
72
- vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
73
- vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
74
- vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
75
- vpsrlq $0x20, %ymm3, %ymm3 # odd dwords of ymm3 -> even positions
76
- vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
77
- vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
78
- vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
79
- vpsrlq $0x20, %ymm10, %ymm10 # odd dwords of ymm10 -> even positions
80
- vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
81
- vmovdqa %ymm9, (%rdi) # write chunk 0 back in place; output rows come from ymm9, ymm8, ymm7, ymm6, ymm5, ymm4, ymm3, ymm11 in that order
82
- vmovdqa %ymm8, 0x20(%rdi)
83
- vmovdqa %ymm7, 0x40(%rdi)
84
- vmovdqa %ymm6, 0x60(%rdi)
85
- vmovdqa %ymm5, 0x80(%rdi)
86
- vmovdqa %ymm4, 0xa0(%rdi)
87
- vmovdqa %ymm3, 0xc0(%rdi)
88
- vmovdqa %ymm11, 0xe0(%rdi)
89
- vmovdqa 0x100(%rdi), %ymm4 # chunk 1 (bytes 0x100-0x1ff): same instruction sequence as chunk 0
90
- vmovdqa 0x120(%rdi), %ymm5
91
- vmovdqa 0x140(%rdi), %ymm6
92
- vmovdqa 0x160(%rdi), %ymm7
93
- vmovdqa 0x180(%rdi), %ymm8
94
- vmovdqa 0x1a0(%rdi), %ymm9
95
- vmovdqa 0x1c0(%rdi), %ymm10
96
- vmovdqa 0x1e0(%rdi), %ymm11
97
- vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
98
- vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
99
- vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
100
- vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
101
- vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
102
- vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
103
- vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
104
- vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
105
- vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
106
- vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
107
- vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
108
- vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
109
- vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
110
- vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
111
- vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
112
- vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
113
- vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
114
- vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
115
- vpsrlq $0x20, %ymm7, %ymm7
116
- vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
117
- vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
118
- vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
119
- vpsrlq $0x20, %ymm5, %ymm5
120
- vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
121
- vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
122
- vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
123
- vpsrlq $0x20, %ymm3, %ymm3
124
- vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
125
- vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
126
- vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
127
- vpsrlq $0x20, %ymm10, %ymm10
128
- vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
129
- vmovdqa %ymm9, 0x100(%rdi) # write chunk 1 back in place
130
- vmovdqa %ymm8, 0x120(%rdi)
131
- vmovdqa %ymm7, 0x140(%rdi)
132
- vmovdqa %ymm6, 0x160(%rdi)
133
- vmovdqa %ymm5, 0x180(%rdi)
134
- vmovdqa %ymm4, 0x1a0(%rdi)
135
- vmovdqa %ymm3, 0x1c0(%rdi)
136
- vmovdqa %ymm11, 0x1e0(%rdi)
137
- vmovdqa 0x200(%rdi), %ymm4 # chunk 2 (bytes 0x200-0x2ff): same instruction sequence as chunk 0
138
- vmovdqa 0x220(%rdi), %ymm5
139
- vmovdqa 0x240(%rdi), %ymm6
140
- vmovdqa 0x260(%rdi), %ymm7
141
- vmovdqa 0x280(%rdi), %ymm8
142
- vmovdqa 0x2a0(%rdi), %ymm9
143
- vmovdqa 0x2c0(%rdi), %ymm10
144
- vmovdqa 0x2e0(%rdi), %ymm11
145
- vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
146
- vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
147
- vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
148
- vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
149
- vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
150
- vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
151
- vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
152
- vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
153
- vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
154
- vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
155
- vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
156
- vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
157
- vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
158
- vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
159
- vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
160
- vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
161
- vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
162
- vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
163
- vpsrlq $0x20, %ymm7, %ymm7
164
- vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
165
- vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
166
- vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
167
- vpsrlq $0x20, %ymm5, %ymm5
168
- vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
169
- vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
170
- vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
171
- vpsrlq $0x20, %ymm3, %ymm3
172
- vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
173
- vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
174
- vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
175
- vpsrlq $0x20, %ymm10, %ymm10
176
- vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
177
- vmovdqa %ymm9, 0x200(%rdi) # write chunk 2 back in place
178
- vmovdqa %ymm8, 0x220(%rdi)
179
- vmovdqa %ymm7, 0x240(%rdi)
180
- vmovdqa %ymm6, 0x260(%rdi)
181
- vmovdqa %ymm5, 0x280(%rdi)
182
- vmovdqa %ymm4, 0x2a0(%rdi)
183
- vmovdqa %ymm3, 0x2c0(%rdi)
184
- vmovdqa %ymm11, 0x2e0(%rdi)
185
- vmovdqa 0x300(%rdi), %ymm4 # chunk 3 (bytes 0x300-0x3ff): same instruction sequence as chunk 0
186
- vmovdqa 0x320(%rdi), %ymm5
187
- vmovdqa 0x340(%rdi), %ymm6
188
- vmovdqa 0x360(%rdi), %ymm7
189
- vmovdqa 0x380(%rdi), %ymm8
190
- vmovdqa 0x3a0(%rdi), %ymm9
191
- vmovdqa 0x3c0(%rdi), %ymm10
192
- vmovdqa 0x3e0(%rdi), %ymm11
193
- vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
194
- vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
195
- vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
196
- vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
197
- vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
198
- vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
199
- vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
200
- vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
201
- vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
202
- vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
203
- vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
204
- vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
205
- vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
206
- vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
207
- vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
208
- vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
209
- vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
210
- vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
211
- vpsrlq $0x20, %ymm7, %ymm7
212
- vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
213
- vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
214
- vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
215
- vpsrlq $0x20, %ymm5, %ymm5
216
- vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
217
- vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
218
- vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
219
- vpsrlq $0x20, %ymm3, %ymm3
220
- vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
221
- vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
222
- vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
223
- vpsrlq $0x20, %ymm10, %ymm10
224
- vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
225
- vmovdqa %ymm9, 0x300(%rdi) # write chunk 3 back in place
226
- vmovdqa %ymm8, 0x320(%rdi)
227
- vmovdqa %ymm7, 0x340(%rdi)
228
- vmovdqa %ymm6, 0x360(%rdi)
229
- vmovdqa %ymm5, 0x380(%rdi)
230
- vmovdqa %ymm4, 0x3a0(%rdi)
231
- vmovdqa %ymm3, 0x3c0(%rdi)
232
- vmovdqa %ymm11, 0x3e0(%rdi)
233
- retq # only ymm3..ymm11 were written; no stack or general-purpose register is modified
234
- .cfi_endproc
235
-
236
- MLD_ASM_FN_SIZE(nttunpack_avx2)
237
-
238
- #endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
239
- */
@@ -1,131 +0,0 @@
1
- /*
2
- * Copyright (c) The mldsa-native project authors
3
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
4
- */
5
-
6
- /* References
7
- * ==========
8
- *
9
- * - [REF_AVX2]
10
- * CRYSTALS-Dilithium optimized AVX2 implementation
11
- * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
12
- * https://github.com/pq-crystals/dilithium/tree/master/avx2
13
- */
14
-
15
- /*
16
- * This file is derived from the public domain
17
- * AVX2 Dilithium implementation @[REF_AVX2].
18
- */
19
-
20
- #include "../../../common.h"
21
- #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
22
- !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
23
-
24
- /*
25
- * WARNING: This file is auto-derived from the mldsa-native source file
26
- * dev/x86_64/src/pointwise.S using scripts/simpasm. Do not modify it directly.
27
- */
28
-
29
- #if defined(__ELF__)
30
- .section .note.GNU-stack,"",@progbits
31
- #endif
32
-
33
- .text
34
- .balign 4
35
- .global MLD_ASM_NAMESPACE(pointwise_avx2)
36
- MLD_ASM_FN_SYMBOL(pointwise_avx2) /* Lane-wise multiply of the dwords at (%rsi) and (%rdx), each 64-bit product reduced to 32 bits using the two constant vectors at (%rcx), result stored at (%rdi). Covers 0x400 bytes per operand: 10 loop iterations of 0x60 bytes plus a 0x40-byte tail. NOTE(review): rdi/rsi/rdx/rcx are presumably the SysV arguments 1-4 - confirm against the C prototype. */
37
-
38
- .cfi_startproc
39
- vmovdqa 0x20(%rcx), %ymm0 # ymm0 = second 32-byte vector of the constant table; presumably q^-1 mod 2^32 in every lane - TODO confirm against consts.h
40
- vmovdqa (%rcx), %ymm1 # ymm1 = first 32-byte vector of the constant table; presumably the modulus q in every lane - TODO confirm
41
- xorl %eax, %eax # eax = loop counter, starts at 0
42
-
43
- Lpointwise_avx2_looptop1: # main loop: each pass handles three ymm vectors (0x60 bytes, 24 dwords) per operand
44
- vmovdqa (%rsi), %ymm2 # ymm2/ymm4/ymm6 = three vectors of the rsi operand
45
- vmovdqa 0x20(%rsi), %ymm4
46
- vmovdqa 0x40(%rsi), %ymm6
47
- vmovdqa (%rdx), %ymm10 # ymm10/ymm12/ymm14 = three vectors of the rdx operand
48
- vmovdqa 0x20(%rdx), %ymm12
49
- vmovdqa 0x40(%rdx), %ymm14
50
- vpsrlq $0x20, %ymm2, %ymm3 # odd dwords moved into even positions, because vpmuldq only reads the low dword of each qword
51
- vpsrlq $0x20, %ymm4, %ymm5
52
- vmovshdup %ymm6, %ymm7 # ymm7 = ymm6[1,1,3,3,5,5,7,7]
53
- vpsrlq $0x20, %ymm10, %ymm11
54
- vpsrlq $0x20, %ymm12, %ymm13
55
- vmovshdup %ymm14, %ymm15 # ymm15 = ymm14[1,1,3,3,5,5,7,7]
56
- vpmuldq %ymm10, %ymm2, %ymm2 # signed 32x32->64 products: even lanes in ymm2/4/6, odd lanes in ymm3/5/7
57
- vpmuldq %ymm11, %ymm3, %ymm3
58
- vpmuldq %ymm12, %ymm4, %ymm4
59
- vpmuldq %ymm13, %ymm5, %ymm5
60
- vpmuldq %ymm14, %ymm6, %ymm6
61
- vpmuldq %ymm15, %ymm7, %ymm7
62
- vpmuldq %ymm2, %ymm0, %ymm10 # low dword of each product times ymm0
63
- vpmuldq %ymm3, %ymm0, %ymm11
64
- vpmuldq %ymm4, %ymm0, %ymm12
65
- vpmuldq %ymm5, %ymm0, %ymm13
66
- vpmuldq %ymm6, %ymm0, %ymm14
67
- vpmuldq %ymm7, %ymm0, %ymm15
68
- vpmuldq %ymm10, %ymm1, %ymm10 # low dword of that times ymm1
69
- vpmuldq %ymm11, %ymm1, %ymm11
70
- vpmuldq %ymm12, %ymm1, %ymm12
71
- vpmuldq %ymm13, %ymm1, %ymm13
72
- vpmuldq %ymm14, %ymm1, %ymm14
73
- vpmuldq %ymm15, %ymm1, %ymm15
74
- vpsubq %ymm10, %ymm2, %ymm2 # 64-bit subtract; the reduced value is the high dword of each qword (has the shape of a Montgomery reduction if ymm0 = q^-1 and ymm1 = q - confirm)
75
- vpsubq %ymm11, %ymm3, %ymm3
76
- vpsubq %ymm12, %ymm4, %ymm4
77
- vpsubq %ymm13, %ymm5, %ymm5
78
- vpsubq %ymm14, %ymm6, %ymm6
79
- vpsubq %ymm15, %ymm7, %ymm7
80
- vpsrlq $0x20, %ymm2, %ymm2 # even-lane results: bring the high dword down to the even dword position
81
- vpsrlq $0x20, %ymm4, %ymm4
82
- vmovshdup %ymm6, %ymm6 # ymm6 = ymm6[1,1,3,3,5,5,7,7]
83
- vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
84
- vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7]
85
- vpblendd $0xaa, %ymm7, %ymm6, %ymm6 # ymm6 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7]
86
- vmovdqa %ymm2, (%rdi) # store three result vectors
87
- vmovdqa %ymm4, 0x20(%rdi)
88
- vmovdqa %ymm6, 0x40(%rdi)
89
- addq $0x60, %rdi # advance all three pointers by the 0x60 bytes just processed
90
- addq $0x60, %rsi
91
- addq $0x60, %rdx
92
- addl $0x1, %eax
93
- cmpl $0xa, %eax # unsigned compare: repeat while eax < 10
94
- jb Lpointwise_avx2_looptop1
95
- vmovdqa (%rsi), %ymm2 # tail: the remaining two vectors (0x40 bytes), since 10 * 0x60 + 0x40 = 0x400
96
- vmovdqa 0x20(%rsi), %ymm4
97
- vmovdqa (%rdx), %ymm10
98
- vmovdqa 0x20(%rdx), %ymm12
99
- vpsrlq $0x20, %ymm2, %ymm3 # odd dwords into even positions, as in the loop
100
- vpsrlq $0x20, %ymm4, %ymm5
101
- vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
102
- vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
103
- vpmuldq %ymm10, %ymm2, %ymm2 # 64-bit products, even lanes in ymm2/4 and odd lanes in ymm3/5
104
- vpmuldq %ymm11, %ymm3, %ymm3
105
- vpmuldq %ymm12, %ymm4, %ymm4
106
- vpmuldq %ymm13, %ymm5, %ymm5
107
- vpmuldq %ymm2, %ymm0, %ymm10 # same three-step reduction as in the loop: times ymm0, times ymm1, subtract
108
- vpmuldq %ymm3, %ymm0, %ymm11
109
- vpmuldq %ymm4, %ymm0, %ymm12
110
- vpmuldq %ymm5, %ymm0, %ymm13
111
- vpmuldq %ymm10, %ymm1, %ymm10
112
- vpmuldq %ymm11, %ymm1, %ymm11
113
- vpmuldq %ymm12, %ymm1, %ymm12
114
- vpmuldq %ymm13, %ymm1, %ymm13
115
- vpsubq %ymm10, %ymm2, %ymm2
116
- vpsubq %ymm11, %ymm3, %ymm3
117
- vpsubq %ymm12, %ymm4, %ymm4
118
- vpsubq %ymm13, %ymm5, %ymm5
119
- vpsrlq $0x20, %ymm2, %ymm2 # high dword of the even-lane results down to the even position
120
- vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
121
- vpblendd $0x55, %ymm2, %ymm3, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
122
- vpblendd $0x55, %ymm4, %ymm5, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7]
123
- vmovdqa %ymm2, (%rdi) # store the last two result vectors
124
- vmovdqa %ymm4, 0x20(%rdi)
125
- retq # rdi, rsi, rdx and eax have been modified; no stack is used
126
- .cfi_endproc
127
-
128
- MLD_ASM_FN_SIZE(pointwise_avx2)
129
-
130
- #endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
131
- */
@@ -1,139 +0,0 @@
1
- /*
2
- * Copyright (c) The mldsa-native project authors
3
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
4
- */
5
-
6
- /* References
7
- * ==========
8
- *
9
- * - [REF_AVX2]
10
- * CRYSTALS-Dilithium optimized AVX2 implementation
11
- * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
12
- * https://github.com/pq-crystals/dilithium/tree/master/avx2
13
- */
14
-
15
- /*
16
- * This file is derived from the public domain
17
- * AVX2 Dilithium implementation @[REF_AVX2].
18
- */
19
-
20
- #include "../../../common.h"
21
- #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
22
- !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
23
- (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLDSA_L == 4)
24
-
25
- /*
26
- * WARNING: This file is auto-derived from the mldsa-native source file
27
- * dev/x86_64/src/pointwise_acc_l4.S using scripts/simpasm. Do not modify it directly.
28
- */
29
-
30
- #if defined(__ELF__)
31
- .section .note.GNU-stack,"",@progbits
32
- #endif
33
-
34
- .text
35
- .balign 4
36
- .global MLD_ASM_NAMESPACE(pointwise_acc_l4_avx2)
37
- MLD_ASM_FN_SYMBOL(pointwise_acc_l4_avx2)
38
- /* Multiplies four operand pairs read from %rsi and %rdx (spaced 0x400 bytes apart) dword by dword, sums the 64-bit products, reduces each sum with the two constants loaded from %rcx and stores 32-bit results at %rdi. All accesses use vmovdqa, so every address must be 32-byte aligned. NOTE(review): register roles look like SysV args (out, a, b, constants) - confirm against the C prototype. */
39
- .cfi_startproc
40
- vmovdqa 0x20(%rcx), %ymm0 # ymm0 = 32-byte constant at rcx+0x20 (presumably 8 x q^-1 - TODO confirm)
41
- vmovdqa (%rcx), %ymm1 # ymm1 = 32-byte constant at rcx (presumably 8 x q - TODO confirm)
42
- xorl %eax, %eax # eax = iteration counter, starts at 0
43
- /* Each pass handles 0x40 bytes (16 dwords) of every operand; 16 passes cover 0x400 bytes per operand. */
44
- Lpointwise_acc_l4_avx2_looptop2:
45
- vmovdqa (%rsi), %ymm6 # first operand pair (offset 0)
46
- vmovdqa 0x20(%rsi), %ymm8
47
- vmovdqa (%rdx), %ymm10
48
- vmovdqa 0x20(%rdx), %ymm12
49
- vpsrlq $0x20, %ymm6, %ymm7 # ymm7 = odd dwords of ymm6 moved into the even dword lanes
50
- vpsrlq $0x20, %ymm8, %ymm9
51
- vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
52
- vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
53
- vpmuldq %ymm10, %ymm6, %ymm6 # signed 32x32->64 products of the even dwords
54
- vpmuldq %ymm11, %ymm7, %ymm7 # signed 32x32->64 products of the odd dwords
55
- vpmuldq %ymm12, %ymm8, %ymm8
56
- vpmuldq %ymm13, %ymm9, %ymm9
57
- vmovdqa %ymm6, %ymm2 # ymm2..ymm5 = running 64-bit sums, seeded with the first products
58
- vmovdqa %ymm7, %ymm3
59
- vmovdqa %ymm8, %ymm4
60
- vmovdqa %ymm9, %ymm5
61
- vmovdqa 0x400(%rsi), %ymm6 # second operand pair (offset 0x400)
62
- vmovdqa 0x420(%rsi), %ymm8
63
- vmovdqa 0x400(%rdx), %ymm10
64
- vmovdqa 0x420(%rdx), %ymm12
65
- vpsrlq $0x20, %ymm6, %ymm7
66
- vpsrlq $0x20, %ymm8, %ymm9
67
- vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
68
- vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
69
- vpmuldq %ymm10, %ymm6, %ymm6
70
- vpmuldq %ymm11, %ymm7, %ymm7
71
- vpmuldq %ymm12, %ymm8, %ymm8
72
- vpmuldq %ymm13, %ymm9, %ymm9
73
- vpaddq %ymm2, %ymm6, %ymm2 # add the new products to the running sums
74
- vpaddq %ymm3, %ymm7, %ymm3
75
- vpaddq %ymm4, %ymm8, %ymm4
76
- vpaddq %ymm5, %ymm9, %ymm5
77
- vmovdqa 0x800(%rsi), %ymm6 # third operand pair (offset 0x800)
78
- vmovdqa 0x820(%rsi), %ymm8
79
- vmovdqa 0x800(%rdx), %ymm10
80
- vmovdqa 0x820(%rdx), %ymm12
81
- vpsrlq $0x20, %ymm6, %ymm7
82
- vpsrlq $0x20, %ymm8, %ymm9
83
- vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
84
- vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
85
- vpmuldq %ymm10, %ymm6, %ymm6
86
- vpmuldq %ymm11, %ymm7, %ymm7
87
- vpmuldq %ymm12, %ymm8, %ymm8
88
- vpmuldq %ymm13, %ymm9, %ymm9
89
- vpaddq %ymm2, %ymm6, %ymm2
90
- vpaddq %ymm3, %ymm7, %ymm3
91
- vpaddq %ymm4, %ymm8, %ymm4
92
- vpaddq %ymm5, %ymm9, %ymm5
93
- vmovdqa 0xc00(%rsi), %ymm6 # fourth and last operand pair (offset 0xc00)
94
- vmovdqa 0xc20(%rsi), %ymm8
95
- vmovdqa 0xc00(%rdx), %ymm10
96
- vmovdqa 0xc20(%rdx), %ymm12
97
- vpsrlq $0x20, %ymm6, %ymm7
98
- vpsrlq $0x20, %ymm8, %ymm9
99
- vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
100
- vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
101
- vpmuldq %ymm10, %ymm6, %ymm6
102
- vpmuldq %ymm11, %ymm7, %ymm7
103
- vpmuldq %ymm12, %ymm8, %ymm8
104
- vpmuldq %ymm13, %ymm9, %ymm9
105
- vpaddq %ymm2, %ymm6, %ymm2
106
- vpaddq %ymm3, %ymm7, %ymm3
107
- vpaddq %ymm4, %ymm8, %ymm4
108
- vpaddq %ymm5, %ymm9, %ymm5
109
- vpmuldq %ymm2, %ymm0, %ymm6 # t = low dword of each sum times the ymm0 constant
110
- vpmuldq %ymm3, %ymm0, %ymm7
111
- vpmuldq %ymm4, %ymm0, %ymm8
112
- vpmuldq %ymm5, %ymm0, %ymm9
113
- vpmuldq %ymm6, %ymm1, %ymm6 # t = low dword of t times the ymm1 constant
114
- vpmuldq %ymm7, %ymm1, %ymm7
115
- vpmuldq %ymm8, %ymm1, %ymm8
116
- vpmuldq %ymm9, %ymm1, %ymm9
117
- vpsubq %ymm6, %ymm2, %ymm2 # sum - t; only the high dword of each qword is kept below (looks like Montgomery reduction - TODO confirm)
118
- vpsubq %ymm7, %ymm3, %ymm3
119
- vpsubq %ymm8, %ymm4, %ymm4
120
- vpsubq %ymm9, %ymm5, %ymm5
121
- vpsrlq $0x20, %ymm2, %ymm2 # bring the high dwords of the even-lane results down into the even dword lanes
122
- vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
123
- vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
124
- vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7]
125
- vmovdqa %ymm2, (%rdi) # store 16 result dwords (two 32-byte vectors)
126
- vmovdqa %ymm4, 0x20(%rdi)
127
- addq $0x40, %rsi # advance both inputs and the output by 0x40 bytes
128
- addq $0x40, %rdx
129
- addq $0x40, %rdi
130
- addl $0x1, %eax
131
- cmpl $0x10, %eax # loop until 16 passes have run
132
- jb Lpointwise_acc_l4_avx2_looptop2
133
- retq
134
- .cfi_endproc
135
-
136
- MLD_ASM_FN_SIZE(pointwise_acc_l4_avx2)
137
-
138
- #endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
139
- && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLDSA_L == 4) */
@@ -1,155 +0,0 @@
1
- /*
2
- * Copyright (c) The mldsa-native project authors
3
- * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
4
- */
5
-
6
- /* References
7
- * ==========
8
- *
9
- * - [REF_AVX2]
10
- * CRYSTALS-Dilithium optimized AVX2 implementation
11
- * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
12
- * https://github.com/pq-crystals/dilithium/tree/master/avx2
13
- */
14
-
15
- /*
16
- * This file is derived from the public domain
17
- * AVX2 Dilithium implementation @[REF_AVX2].
18
- */
19
-
20
- #include "../../../common.h"
21
- #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
22
- !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
23
- (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLDSA_L == 5)
24
-
25
- /*
26
- * WARNING: This file is auto-derived from the mldsa-native source file
27
- * dev/x86_64/src/pointwise_acc_l5.S using scripts/simpasm. Do not modify it directly.
28
- */
29
-
30
- #if defined(__ELF__)
31
- .section .note.GNU-stack,"",@progbits
32
- #endif
33
-
34
- .text
35
- .balign 4
36
- .global MLD_ASM_NAMESPACE(pointwise_acc_l5_avx2)
37
- MLD_ASM_FN_SYMBOL(pointwise_acc_l5_avx2)
38
- /* Multiplies five operand pairs read from %rsi and %rdx (spaced 0x400 bytes apart) dword by dword, sums the 64-bit products, reduces each sum with the two constants loaded from %rcx and stores 32-bit results at %rdi. All accesses use vmovdqa, so every address must be 32-byte aligned. NOTE(review): register roles look like SysV args (out, a, b, constants) - confirm against the C prototype. */
39
- .cfi_startproc
40
- vmovdqa 0x20(%rcx), %ymm0 # ymm0 = 32-byte constant at rcx+0x20 (presumably 8 x q^-1 - TODO confirm)
41
- vmovdqa (%rcx), %ymm1 # ymm1 = 32-byte constant at rcx (presumably 8 x q - TODO confirm)
42
- xorl %eax, %eax # eax = iteration counter, starts at 0
43
- /* Each pass handles 0x40 bytes (16 dwords) of every operand; 16 passes cover 0x400 bytes per operand. */
44
- Lpointwise_acc_l5_avx2_looptop2:
45
- vmovdqa (%rsi), %ymm6 # first operand pair (offset 0)
46
- vmovdqa 0x20(%rsi), %ymm8
47
- vmovdqa (%rdx), %ymm10
48
- vmovdqa 0x20(%rdx), %ymm12
49
- vpsrlq $0x20, %ymm6, %ymm7 # ymm7 = odd dwords of ymm6 moved into the even dword lanes
50
- vpsrlq $0x20, %ymm8, %ymm9
51
- vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
52
- vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
53
- vpmuldq %ymm10, %ymm6, %ymm6 # signed 32x32->64 products of the even dwords
54
- vpmuldq %ymm11, %ymm7, %ymm7 # signed 32x32->64 products of the odd dwords
55
- vpmuldq %ymm12, %ymm8, %ymm8
56
- vpmuldq %ymm13, %ymm9, %ymm9
57
- vmovdqa %ymm6, %ymm2 # ymm2..ymm5 = running 64-bit sums, seeded with the first products
58
- vmovdqa %ymm7, %ymm3
59
- vmovdqa %ymm8, %ymm4
60
- vmovdqa %ymm9, %ymm5
61
- vmovdqa 0x400(%rsi), %ymm6 # second operand pair (offset 0x400)
62
- vmovdqa 0x420(%rsi), %ymm8
63
- vmovdqa 0x400(%rdx), %ymm10
64
- vmovdqa 0x420(%rdx), %ymm12
65
- vpsrlq $0x20, %ymm6, %ymm7
66
- vpsrlq $0x20, %ymm8, %ymm9
67
- vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
68
- vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
69
- vpmuldq %ymm10, %ymm6, %ymm6
70
- vpmuldq %ymm11, %ymm7, %ymm7
71
- vpmuldq %ymm12, %ymm8, %ymm8
72
- vpmuldq %ymm13, %ymm9, %ymm9
73
- vpaddq %ymm2, %ymm6, %ymm2 # add the new products to the running sums
74
- vpaddq %ymm3, %ymm7, %ymm3
75
- vpaddq %ymm4, %ymm8, %ymm4
76
- vpaddq %ymm5, %ymm9, %ymm5
77
- vmovdqa 0x800(%rsi), %ymm6 # third operand pair (offset 0x800)
78
- vmovdqa 0x820(%rsi), %ymm8
79
- vmovdqa 0x800(%rdx), %ymm10
80
- vmovdqa 0x820(%rdx), %ymm12
81
- vpsrlq $0x20, %ymm6, %ymm7
82
- vpsrlq $0x20, %ymm8, %ymm9
83
- vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
84
- vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
85
- vpmuldq %ymm10, %ymm6, %ymm6
86
- vpmuldq %ymm11, %ymm7, %ymm7
87
- vpmuldq %ymm12, %ymm8, %ymm8
88
- vpmuldq %ymm13, %ymm9, %ymm9
89
- vpaddq %ymm2, %ymm6, %ymm2
90
- vpaddq %ymm3, %ymm7, %ymm3
91
- vpaddq %ymm4, %ymm8, %ymm4
92
- vpaddq %ymm5, %ymm9, %ymm5
93
- vmovdqa 0xc00(%rsi), %ymm6 # fourth operand pair (offset 0xc00)
94
- vmovdqa 0xc20(%rsi), %ymm8
95
- vmovdqa 0xc00(%rdx), %ymm10
96
- vmovdqa 0xc20(%rdx), %ymm12
97
- vpsrlq $0x20, %ymm6, %ymm7
98
- vpsrlq $0x20, %ymm8, %ymm9
99
- vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
100
- vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
101
- vpmuldq %ymm10, %ymm6, %ymm6
102
- vpmuldq %ymm11, %ymm7, %ymm7
103
- vpmuldq %ymm12, %ymm8, %ymm8
104
- vpmuldq %ymm13, %ymm9, %ymm9
105
- vpaddq %ymm2, %ymm6, %ymm2
106
- vpaddq %ymm3, %ymm7, %ymm3
107
- vpaddq %ymm4, %ymm8, %ymm4
108
- vpaddq %ymm5, %ymm9, %ymm5
109
- vmovdqa 0x1000(%rsi), %ymm6 # fifth and last operand pair (offset 0x1000)
110
- vmovdqa 0x1020(%rsi), %ymm8
111
- vmovdqa 0x1000(%rdx), %ymm10
112
- vmovdqa 0x1020(%rdx), %ymm12
113
- vpsrlq $0x20, %ymm6, %ymm7
114
- vpsrlq $0x20, %ymm8, %ymm9
115
- vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
116
- vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
117
- vpmuldq %ymm10, %ymm6, %ymm6
118
- vpmuldq %ymm11, %ymm7, %ymm7
119
- vpmuldq %ymm12, %ymm8, %ymm8
120
- vpmuldq %ymm13, %ymm9, %ymm9
121
- vpaddq %ymm2, %ymm6, %ymm2
122
- vpaddq %ymm3, %ymm7, %ymm3
123
- vpaddq %ymm4, %ymm8, %ymm4
124
- vpaddq %ymm5, %ymm9, %ymm5
125
- vpmuldq %ymm2, %ymm0, %ymm6 # t = low dword of each sum times the ymm0 constant
126
- vpmuldq %ymm3, %ymm0, %ymm7
127
- vpmuldq %ymm4, %ymm0, %ymm8
128
- vpmuldq %ymm5, %ymm0, %ymm9
129
- vpmuldq %ymm6, %ymm1, %ymm6 # t = low dword of t times the ymm1 constant
130
- vpmuldq %ymm7, %ymm1, %ymm7
131
- vpmuldq %ymm8, %ymm1, %ymm8
132
- vpmuldq %ymm9, %ymm1, %ymm9
133
- vpsubq %ymm6, %ymm2, %ymm2 # sum - t; only the high dword of each qword is kept below (looks like Montgomery reduction - TODO confirm)
134
- vpsubq %ymm7, %ymm3, %ymm3
135
- vpsubq %ymm8, %ymm4, %ymm4
136
- vpsubq %ymm9, %ymm5, %ymm5
137
- vpsrlq $0x20, %ymm2, %ymm2 # bring the high dwords of the even-lane results down into the even dword lanes
138
- vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
139
- vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
140
- vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7]
141
- vmovdqa %ymm2, (%rdi) # store 16 result dwords (two 32-byte vectors)
142
- vmovdqa %ymm4, 0x20(%rdi)
143
- addq $0x40, %rsi # advance both inputs and the output by 0x40 bytes
144
- addq $0x40, %rdx
145
- addq $0x40, %rdi
146
- addl $0x1, %eax
147
- cmpl $0x10, %eax # loop until 16 passes have run
148
- jb Lpointwise_acc_l5_avx2_looptop2
149
- retq
150
- .cfi_endproc
151
-
152
- MLD_ASM_FN_SIZE(pointwise_acc_l5_avx2)
153
-
154
- #endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
155
- && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLDSA_L == 5) */