pq_crypto 0.6.1 → 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (247) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +10 -0
  3. data/SECURITY.md +7 -0
  4. data/ext/pqcrypto/pqcrypto_version.h +1 -1
  5. data/ext/pqcrypto/vendor/.vendored +7 -7
  6. data/ext/pqcrypto/vendor/mldsa-native/README.md +23 -10
  7. data/ext/pqcrypto/vendor/mldsa-native/mldsa/README.md +23 -0
  8. data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native.c +114 -58
  9. data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native.h +498 -461
  10. data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native_asm.S +145 -85
  11. data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native_config.h +456 -422
  12. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/cbmc.h +47 -25
  13. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/common.h +26 -14
  14. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/ct.h +56 -81
  15. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/debug.h +17 -24
  16. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202.c +33 -40
  17. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202.h +67 -87
  18. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202x4.c +19 -14
  19. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202x4.h +13 -5
  20. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/keccakf1600.c +84 -10
  21. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/keccakf1600.h +10 -5
  22. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/auto.h +6 -0
  23. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/fips202_native_aarch64.h +22 -15
  24. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_scalar_aarch64_asm.S +376 -0
  25. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_v84a_aarch64_asm.S +204 -0
  26. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x2_v84a_aarch64_asm.S +259 -0
  27. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_aarch64_asm.S +1077 -0
  28. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_aarch64_asm.S +987 -0
  29. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccakf1600_round_constants.c +16 -10
  30. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x1_scalar.h +2 -1
  31. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x1_v84a.h +1 -1
  32. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x2_v84a.h +4 -2
  33. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x4_v8a_scalar.h +2 -2
  34. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x4_v8a_v84a_scalar.h +1 -1
  35. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/api.h +60 -0
  36. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/mve.h +48 -0
  37. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/fips202_native_armv81m.h +18 -1
  38. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S +658 -582
  39. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.c +5 -100
  40. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/keccakf1600_round_constants.c +26 -25
  41. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/state_extract_bytes_x4_mve.S +334 -0
  42. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/state_xor_bytes_x4_mve.S +355 -0
  43. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/auto.h +8 -3
  44. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/{xkcp.h → keccak_f1600_x4_avx2.h} +11 -8
  45. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/fips202_native_x86_64.h +44 -0
  46. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/keccak_f1600_x4_avx2_asm.S +454 -0
  47. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/keccakf1600_constants.c +52 -0
  48. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/meta.h +37 -28
  49. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/aarch64_zetas.c +213 -196
  50. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/arith_native_aarch64.h +248 -64
  51. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/intt_aarch64_asm.S +753 -0
  52. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4_aarch64_asm.S +129 -0
  53. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5_aarch64_asm.S +145 -0
  54. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7_aarch64_asm.S +177 -0
  55. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/ntt_aarch64_asm.S +653 -0
  56. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/pointwise_montgomery_aarch64_asm.S +84 -0
  57. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_caddq_aarch64_asm.S +53 -0
  58. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_chknorm_aarch64_asm.S +55 -0
  59. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_32_aarch64_asm.S +86 -0
  60. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_88_aarch64_asm.S +86 -0
  61. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_32_aarch64_asm.S +103 -0
  62. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_88_aarch64_asm.S +111 -0
  63. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_17_aarch64_asm.S +75 -0
  64. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_19_aarch64_asm.S +72 -0
  65. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_table.c +23 -11
  66. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_aarch64_asm.S +189 -0
  67. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta2_aarch64_asm.S +137 -0
  68. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta4_aarch64_asm.S +130 -0
  69. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta_table.c +520 -516
  70. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_table.c +34 -33
  71. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/api.h +202 -242
  72. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/meta.h +25 -17
  73. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/arith_native_x86_64.h +112 -28
  74. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/consts.c +1 -1
  75. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/consts.h +1 -1
  76. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/intt_avx2_asm.S +2311 -0
  77. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/ntt_avx2_asm.S +2383 -0
  78. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/nttunpack_avx2_asm.S +238 -0
  79. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l4_avx2_asm.S +139 -0
  80. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l5_avx2_asm.S +155 -0
  81. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l7_avx2_asm.S +187 -0
  82. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_avx2_asm.S +130 -0
  83. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_caddq_avx2_asm.S +190 -0
  84. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c +6 -4
  85. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c +6 -4
  86. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c +9 -8
  87. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c +10 -9
  88. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c +8 -5
  89. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c +8 -5
  90. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/rej_uniform_eta2_avx2.c +6 -4
  91. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/rej_uniform_eta4_avx2.c +6 -4
  92. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/rej_uniform_table.c +130 -129
  93. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/packing.c +109 -180
  94. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/packing.h +169 -150
  95. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly.c +56 -40
  96. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly.h +149 -164
  97. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly_kl.c +52 -57
  98. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly_kl.h +132 -167
  99. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec.c +57 -424
  100. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec.h +167 -474
  101. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec_lazy.c +308 -0
  102. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec_lazy.h +653 -0
  103. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/reduce.h +22 -29
  104. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/rounding.h +37 -43
  105. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/sign.c +511 -367
  106. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/sign.h +456 -417
  107. data/ext/pqcrypto/vendor/mlkem-native/README.md +6 -3
  108. data/ext/pqcrypto/vendor/mlkem-native/RELEASE.md +22 -0
  109. data/ext/pqcrypto/vendor/mlkem-native/mlkem/mlkem_native.c +77 -36
  110. data/ext/pqcrypto/vendor/mlkem-native/mlkem/mlkem_native.h +135 -146
  111. data/ext/pqcrypto/vendor/mlkem-native/mlkem/mlkem_native_asm.S +116 -72
  112. data/ext/pqcrypto/vendor/mlkem-native/mlkem/mlkem_native_config.h +351 -415
  113. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/cbmc.h +43 -20
  114. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/common.h +16 -8
  115. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/compress.c +57 -31
  116. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/compress.h +260 -349
  117. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/debug.h +17 -24
  118. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/fips202.c +35 -37
  119. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/fips202.h +43 -57
  120. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/fips202x4.c +14 -15
  121. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/fips202x4.h +5 -4
  122. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/keccakf1600.c +42 -6
  123. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/aarch64/src/fips202_native_aarch64.h +31 -20
  124. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/aarch64/src/{keccak_f1600_x1_scalar_asm.S → keccak_f1600_x1_scalar_aarch64_asm.S} +10 -10
  125. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/aarch64/src/{keccak_f1600_x1_v84a_asm.S → keccak_f1600_x1_v84a_aarch64_asm.S} +10 -10
  126. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/aarch64/src/{keccak_f1600_x2_v84a_asm.S → keccak_f1600_x2_v84a_aarch64_asm.S} +10 -10
  127. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/aarch64/src/{keccak_f1600_x4_v8a_scalar_hybrid_asm.S → keccak_f1600_x4_v8a_scalar_hybrid_aarch64_asm.S} +10 -10
  128. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/aarch64/src/{keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm.S → keccak_f1600_x4_v8a_v84a_scalar_hybrid_aarch64_asm.S} +10 -10
  129. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/aarch64/src/keccakf1600_round_constants.c +10 -9
  130. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/aarch64/x1_scalar.h +2 -1
  131. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/aarch64/x1_v84a.h +1 -1
  132. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/aarch64/x2_v84a.h +4 -2
  133. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/aarch64/x4_v8a_scalar.h +2 -2
  134. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/aarch64/x4_v8a_v84a_scalar.h +1 -1
  135. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/armv81m/src/fips202_native_armv81m.h +2 -1
  136. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S +55 -9
  137. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/armv81m/src/keccakf1600_round_constants.c +26 -25
  138. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/armv81m/src/state_extract_bytes_x4_mve.S +58 -14
  139. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/armv81m/src/state_xor_bytes_x4_mve.S +57 -16
  140. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/auto.h +2 -1
  141. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/x86_64/keccak_f1600_x4_avx2.h +2 -2
  142. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/x86_64/src/fips202_native_x86_64.h +10 -7
  143. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/x86_64/src/{keccak_f1600_x4_avx2.S → keccak_f1600_x4_avx2_asm.S} +13 -11
  144. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/fips202/native/x86_64/src/keccakf1600_constants.c +12 -11
  145. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/indcpa.c +167 -136
  146. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/indcpa.h +75 -68
  147. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/kem.h +135 -157
  148. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/meta.h +15 -13
  149. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/src/aarch64_zetas.c +143 -135
  150. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/src/arith_native_aarch64.h +52 -46
  151. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/src/{intt.S → intt_aarch64_asm.S} +10 -10
  152. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/src/{ntt.S → ntt_aarch64_asm.S} +10 -10
  153. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/src/{poly_mulcache_compute_asm.S → poly_mulcache_compute_aarch64_asm.S} +10 -10
  154. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/src/{poly_reduce_asm.S → poly_reduce_aarch64_asm.S} +10 -10
  155. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/src/{poly_tobytes_asm.S → poly_tobytes_aarch64_asm.S} +10 -10
  156. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/src/{poly_tomont_asm.S → poly_tomont_aarch64_asm.S} +10 -12
  157. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/src/{polyvec_basemul_acc_montgomery_cached_asm_k2.S → polyvec_basemul_acc_montgomery_cached_k2_aarch64_asm.S} +10 -10
  158. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/src/{polyvec_basemul_acc_montgomery_cached_asm_k3.S → polyvec_basemul_acc_montgomery_cached_k3_aarch64_asm.S} +10 -10
  159. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/src/{polyvec_basemul_acc_montgomery_cached_asm_k4.S → polyvec_basemul_acc_montgomery_cached_k4_aarch64_asm.S} +10 -10
  160. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/src/{rej_uniform_asm.S → rej_uniform_aarch64_asm.S} +12 -12
  161. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/aarch64/src/rej_uniform_table.c +514 -513
  162. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/api.h +254 -253
  163. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/meta.h +6 -1
  164. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/ppc64le/README.md +6 -0
  165. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/ppc64le/meta.h +77 -0
  166. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/ppc64le/src/arith_native_ppc64le.h +24 -0
  167. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/ppc64le/src/consts.c +299 -0
  168. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/ppc64le/src/consts.h +34 -0
  169. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/ppc64le/src/intt_ppc_asm.S +3222 -0
  170. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/ppc64le/src/ntt_ppc_asm.S +1651 -0
  171. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/ppc64le/src/poly_tomont_ppc_asm.S +294 -0
  172. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/ppc64le/src/reduce_ppc_asm.S +710 -0
  173. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/riscv64/meta.h +5 -0
  174. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/riscv64/src/rv64v_debug.c +18 -16
  175. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/riscv64/src/rv64v_debug.h +19 -24
  176. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/riscv64/src/rv64v_poly.c +53 -65
  177. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/meta.h +20 -20
  178. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/arith_native_x86_64.h +106 -88
  179. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/compress_consts.c +45 -35
  180. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/compress_consts.h +8 -8
  181. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/consts.c +1 -1
  182. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/consts.h +1 -1
  183. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/{intt.S → intt_avx2_asm.S} +8 -8
  184. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/{ntt.S → ntt_avx2_asm.S} +8 -8
  185. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/{nttfrombytes.S → nttfrombytes_avx2_asm.S} +8 -8
  186. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/{ntttobytes.S → ntttobytes_avx2_asm.S} +8 -8
  187. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/{nttunpack.S → nttunpack_avx2_asm.S} +8 -8
  188. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/{poly_compress_d10.S → poly_compress_d10_avx2_asm.S} +9 -9
  189. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/{poly_compress_d11.S → poly_compress_d11_avx2_asm.S} +9 -9
  190. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/{poly_compress_d4.S → poly_compress_d4_avx2_asm.S} +9 -9
  191. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/{poly_compress_d5.S → poly_compress_d5_avx2_asm.S} +9 -9
  192. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/{poly_decompress_d10.S → poly_decompress_d10_avx2_asm.S} +9 -9
  193. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/{poly_decompress_d11.S → poly_decompress_d11_avx2_asm.S} +9 -9
  194. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/{poly_decompress_d4.S → poly_decompress_d4_avx2_asm.S} +9 -9
  195. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/{poly_decompress_d5.S → poly_decompress_d5_avx2_asm.S} +9 -9
  196. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/{mulcache_compute.S → poly_mulcache_compute_avx2_asm.S} +8 -8
  197. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/{polyvec_basemul_acc_montgomery_cached_asm_k2.S → polyvec_basemul_acc_montgomery_cached_k2_avx2_asm.S} +8 -8
  198. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/{polyvec_basemul_acc_montgomery_cached_asm_k3.S → polyvec_basemul_acc_montgomery_cached_k3_avx2_asm.S} +8 -8
  199. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/{polyvec_basemul_acc_montgomery_cached_asm_k4.S → polyvec_basemul_acc_montgomery_cached_k4_avx2_asm.S} +8 -8
  200. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/{reduce.S → reduce_avx2_asm.S} +8 -8
  201. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/{rej_uniform_asm.S → rej_uniform_avx2_asm.S} +9 -9
  202. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/rej_uniform_table.c +514 -513
  203. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/native/x86_64/src/{tomont.S → tomont_avx2_asm.S} +8 -8
  204. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/poly.c +61 -57
  205. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/poly.h +89 -116
  206. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/poly_k.c +31 -32
  207. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/poly_k.h +226 -301
  208. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/randombytes.h +21 -29
  209. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/sampling.c +68 -63
  210. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/sampling.h +37 -48
  211. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/sys.h +44 -2
  212. data/ext/pqcrypto/vendor/mlkem-native/mlkem/src/verify.h +141 -159
  213. data/lib/pq_crypto/version.rb +1 -1
  214. data/script/vendor_libs.rb +6 -6
  215. metadata +86 -71
  216. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_scalar_asm.S +0 -376
  217. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_v84a_asm.S +0 -204
  218. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x2_v84a_asm.S +0 -259
  219. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_asm.S +0 -1077
  220. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm.S +0 -987
  221. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.c +0 -488
  222. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.h +0 -16
  223. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/intt.S +0 -753
  224. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4.S +0 -129
  225. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5.S +0 -145
  226. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7.S +0 -177
  227. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/ntt.S +0 -653
  228. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/pointwise_montgomery.S +0 -79
  229. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_caddq_asm.S +0 -53
  230. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_chknorm_asm.S +0 -55
  231. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_32_asm.S +0 -85
  232. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_88_asm.S +0 -85
  233. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_32_asm.S +0 -102
  234. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_88_asm.S +0 -110
  235. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_17_asm.S +0 -72
  236. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_19_asm.S +0 -69
  237. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_asm.S +0 -189
  238. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta2_asm.S +0 -135
  239. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta4_asm.S +0 -128
  240. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/intt.S +0 -2311
  241. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/ntt.S +0 -2383
  242. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/nttunpack.S +0 -239
  243. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise.S +0 -131
  244. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l4.S +0 -139
  245. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l5.S +0 -155
  246. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l7.S +0 -187
  247. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_caddq_avx2.c +0 -61
@@ -0,0 +1,238 @@
1
+ /*
2
+ * Copyright (c) The mlkem-native project authors
3
+ * Copyright (c) The mldsa-native project authors
4
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
5
+ */
6
+
7
+ /* References
8
+ * ==========
9
+ *
10
+ * - [REF_AVX2]
11
+ * CRYSTALS-Dilithium optimized AVX2 implementation
12
+ * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
13
+ * https://github.com/pq-crystals/dilithium/tree/master/avx2
14
+ */
15
+
16
+ /*
17
+ * This file is derived from the public domain
18
+ * AVX2 Dilithium implementation @[REF_AVX2].
19
+ */
20
+
21
+ #include "../../../common.h"
22
+ #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
23
+ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
24
+
25
+ /*
26
+ * WARNING: This file is auto-derived from the mldsa-native source file
27
+ * dev/x86_64/src/nttunpack_avx2_asm.S using scripts/simpasm. Do not modify it directly.
28
+ */
29
+
30
+ .text
31
+ .balign 4
32
+ .global MLD_ASM_NAMESPACE(nttunpack_avx2_asm)
33
+ MLD_ASM_FN_SYMBOL(nttunpack_avx2_asm)
34
+
35
+ .cfi_startproc
36
+ vmovdqa (%rdi), %ymm4
37
+ vmovdqa 0x20(%rdi), %ymm5
38
+ vmovdqa 0x40(%rdi), %ymm6
39
+ vmovdqa 0x60(%rdi), %ymm7
40
+ vmovdqa 0x80(%rdi), %ymm8
41
+ vmovdqa 0xa0(%rdi), %ymm9
42
+ vmovdqa 0xc0(%rdi), %ymm10
43
+ vmovdqa 0xe0(%rdi), %ymm11
44
+ vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
45
+ vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
46
+ vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
47
+ vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
48
+ vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
49
+ vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
50
+ vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
51
+ vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
52
+ vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
53
+ vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
54
+ vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
55
+ vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
56
+ vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
57
+ vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
58
+ vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
59
+ vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
60
+ vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
61
+ vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
62
+ vpsrlq $0x20, %ymm7, %ymm7
63
+ vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
64
+ vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
65
+ vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
66
+ vpsrlq $0x20, %ymm5, %ymm5
67
+ vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
68
+ vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
69
+ vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
70
+ vpsrlq $0x20, %ymm3, %ymm3
71
+ vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
72
+ vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
73
+ vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
74
+ vpsrlq $0x20, %ymm10, %ymm10
75
+ vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
76
+ vmovdqa %ymm9, (%rdi)
77
+ vmovdqa %ymm8, 0x20(%rdi)
78
+ vmovdqa %ymm7, 0x40(%rdi)
79
+ vmovdqa %ymm6, 0x60(%rdi)
80
+ vmovdqa %ymm5, 0x80(%rdi)
81
+ vmovdqa %ymm4, 0xa0(%rdi)
82
+ vmovdqa %ymm3, 0xc0(%rdi)
83
+ vmovdqa %ymm11, 0xe0(%rdi)
84
+ vmovdqa 0x100(%rdi), %ymm4
85
+ vmovdqa 0x120(%rdi), %ymm5
86
+ vmovdqa 0x140(%rdi), %ymm6
87
+ vmovdqa 0x160(%rdi), %ymm7
88
+ vmovdqa 0x180(%rdi), %ymm8
89
+ vmovdqa 0x1a0(%rdi), %ymm9
90
+ vmovdqa 0x1c0(%rdi), %ymm10
91
+ vmovdqa 0x1e0(%rdi), %ymm11
92
+ vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
93
+ vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
94
+ vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
95
+ vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
96
+ vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
97
+ vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
98
+ vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
99
+ vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
100
+ vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
101
+ vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
102
+ vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
103
+ vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
104
+ vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
105
+ vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
106
+ vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
107
+ vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
108
+ vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
109
+ vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
110
+ vpsrlq $0x20, %ymm7, %ymm7
111
+ vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
112
+ vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
113
+ vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
114
+ vpsrlq $0x20, %ymm5, %ymm5
115
+ vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
116
+ vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
117
+ vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
118
+ vpsrlq $0x20, %ymm3, %ymm3
119
+ vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
120
+ vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
121
+ vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
122
+ vpsrlq $0x20, %ymm10, %ymm10
123
+ vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
124
+ vmovdqa %ymm9, 0x100(%rdi)
125
+ vmovdqa %ymm8, 0x120(%rdi)
126
+ vmovdqa %ymm7, 0x140(%rdi)
127
+ vmovdqa %ymm6, 0x160(%rdi)
128
+ vmovdqa %ymm5, 0x180(%rdi)
129
+ vmovdqa %ymm4, 0x1a0(%rdi)
130
+ vmovdqa %ymm3, 0x1c0(%rdi)
131
+ vmovdqa %ymm11, 0x1e0(%rdi)
132
+ vmovdqa 0x200(%rdi), %ymm4
133
+ vmovdqa 0x220(%rdi), %ymm5
134
+ vmovdqa 0x240(%rdi), %ymm6
135
+ vmovdqa 0x260(%rdi), %ymm7
136
+ vmovdqa 0x280(%rdi), %ymm8
137
+ vmovdqa 0x2a0(%rdi), %ymm9
138
+ vmovdqa 0x2c0(%rdi), %ymm10
139
+ vmovdqa 0x2e0(%rdi), %ymm11
140
+ vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
141
+ vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
142
+ vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
143
+ vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
144
+ vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
145
+ vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
146
+ vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
147
+ vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
148
+ vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
149
+ vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
150
+ vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
151
+ vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
152
+ vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
153
+ vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
154
+ vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
155
+ vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
156
+ vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
157
+ vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
158
+ vpsrlq $0x20, %ymm7, %ymm7
159
+ vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
160
+ vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
161
+ vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
162
+ vpsrlq $0x20, %ymm5, %ymm5
163
+ vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
164
+ vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
165
+ vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
166
+ vpsrlq $0x20, %ymm3, %ymm3
167
+ vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
168
+ vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
169
+ vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
170
+ vpsrlq $0x20, %ymm10, %ymm10
171
+ vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
172
+ vmovdqa %ymm9, 0x200(%rdi)
173
+ vmovdqa %ymm8, 0x220(%rdi)
174
+ vmovdqa %ymm7, 0x240(%rdi)
175
+ vmovdqa %ymm6, 0x260(%rdi)
176
+ vmovdqa %ymm5, 0x280(%rdi)
177
+ vmovdqa %ymm4, 0x2a0(%rdi)
178
+ vmovdqa %ymm3, 0x2c0(%rdi)
179
+ vmovdqa %ymm11, 0x2e0(%rdi)
180
+ vmovdqa 0x300(%rdi), %ymm4
181
+ vmovdqa 0x320(%rdi), %ymm5
182
+ vmovdqa 0x340(%rdi), %ymm6
183
+ vmovdqa 0x360(%rdi), %ymm7
184
+ vmovdqa 0x380(%rdi), %ymm8
185
+ vmovdqa 0x3a0(%rdi), %ymm9
186
+ vmovdqa 0x3c0(%rdi), %ymm10
187
+ vmovdqa 0x3e0(%rdi), %ymm11
188
+ vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
189
+ vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
190
+ vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
191
+ vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
192
+ vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
193
+ vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
194
+ vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
195
+ vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
196
+ vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
197
+ vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
198
+ vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
199
+ vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
200
+ vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
201
+ vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
202
+ vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
203
+ vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
204
+ vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
205
+ vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
206
+ vpsrlq $0x20, %ymm7, %ymm7
207
+ vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
208
+ vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
209
+ vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
210
+ vpsrlq $0x20, %ymm5, %ymm5
211
+ vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
212
+ vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
213
+ vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
214
+ vpsrlq $0x20, %ymm3, %ymm3
215
+ vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
216
+ vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
217
+ vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
218
+ vpsrlq $0x20, %ymm10, %ymm10
219
+ vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
220
+ vmovdqa %ymm9, 0x300(%rdi)
221
+ vmovdqa %ymm8, 0x320(%rdi)
222
+ vmovdqa %ymm7, 0x340(%rdi)
223
+ vmovdqa %ymm6, 0x360(%rdi)
224
+ vmovdqa %ymm5, 0x380(%rdi)
225
+ vmovdqa %ymm4, 0x3a0(%rdi)
226
+ vmovdqa %ymm3, 0x3c0(%rdi)
227
+ vmovdqa %ymm11, 0x3e0(%rdi)
228
+ retq
229
+ .cfi_endproc
230
+
231
+ MLD_ASM_FN_SIZE(nttunpack_avx2_asm)
232
+
233
+ #endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
234
+ */
235
+
236
+ #if defined(__ELF__)
237
+ .section .note.GNU-stack,"",%progbits
238
+ #endif
@@ -0,0 +1,139 @@
1
+ /*
2
+ * Copyright (c) The mldsa-native project authors
3
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
4
+ */
5
+
6
+ /* References
7
+ * ==========
8
+ *
9
+ * - [REF_AVX2]
10
+ * CRYSTALS-Dilithium optimized AVX2 implementation
11
+ * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
12
+ * https://github.com/pq-crystals/dilithium/tree/master/avx2
13
+ */
14
+
15
+ /*
16
+ * This file is derived from the public domain
17
+ * AVX2 Dilithium implementation @[REF_AVX2].
18
+ */
19
+
20
+ #include "../../../common.h"
21
+ #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
22
+ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
23
+ (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLDSA_L == 4)
24
+
25
+ /*
26
+ * WARNING: This file is auto-derived from the mldsa-native source file
27
+ * dev/x86_64/src/pointwise_acc_l4_avx2_asm.S using scripts/simpasm. Do not modify it directly.
28
+ */
29
+
30
+ .text
31
+ .balign 4
32
+ .global MLD_ASM_NAMESPACE(pointwise_acc_l4_avx2_asm)
33
+ MLD_ASM_FN_SYMBOL(pointwise_acc_l4_avx2_asm)
34
+
35
+ .cfi_startproc
36
+ vmovdqa 0x20(%rcx), %ymm0
37
+ vmovdqa (%rcx), %ymm1
38
+ xorl %eax, %eax
39
+
40
+ Lpointwise_acc_l4_avx2_looptop2:
41
+ vmovdqa (%rsi), %ymm6
42
+ vmovdqa 0x20(%rsi), %ymm8
43
+ vmovdqa (%rdx), %ymm10
44
+ vmovdqa 0x20(%rdx), %ymm12
45
+ vpsrlq $0x20, %ymm6, %ymm7
46
+ vpsrlq $0x20, %ymm8, %ymm9
47
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
48
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
49
+ vpmuldq %ymm10, %ymm6, %ymm6
50
+ vpmuldq %ymm11, %ymm7, %ymm7
51
+ vpmuldq %ymm12, %ymm8, %ymm8
52
+ vpmuldq %ymm13, %ymm9, %ymm9
53
+ vmovdqa %ymm6, %ymm2
54
+ vmovdqa %ymm7, %ymm3
55
+ vmovdqa %ymm8, %ymm4
56
+ vmovdqa %ymm9, %ymm5
57
+ vmovdqa 0x400(%rsi), %ymm6
58
+ vmovdqa 0x420(%rsi), %ymm8
59
+ vmovdqa 0x400(%rdx), %ymm10
60
+ vmovdqa 0x420(%rdx), %ymm12
61
+ vpsrlq $0x20, %ymm6, %ymm7
62
+ vpsrlq $0x20, %ymm8, %ymm9
63
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
64
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
65
+ vpmuldq %ymm10, %ymm6, %ymm6
66
+ vpmuldq %ymm11, %ymm7, %ymm7
67
+ vpmuldq %ymm12, %ymm8, %ymm8
68
+ vpmuldq %ymm13, %ymm9, %ymm9
69
+ vpaddq %ymm2, %ymm6, %ymm2
70
+ vpaddq %ymm3, %ymm7, %ymm3
71
+ vpaddq %ymm4, %ymm8, %ymm4
72
+ vpaddq %ymm5, %ymm9, %ymm5
73
+ vmovdqa 0x800(%rsi), %ymm6
74
+ vmovdqa 0x820(%rsi), %ymm8
75
+ vmovdqa 0x800(%rdx), %ymm10
76
+ vmovdqa 0x820(%rdx), %ymm12
77
+ vpsrlq $0x20, %ymm6, %ymm7
78
+ vpsrlq $0x20, %ymm8, %ymm9
79
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
80
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
81
+ vpmuldq %ymm10, %ymm6, %ymm6
82
+ vpmuldq %ymm11, %ymm7, %ymm7
83
+ vpmuldq %ymm12, %ymm8, %ymm8
84
+ vpmuldq %ymm13, %ymm9, %ymm9
85
+ vpaddq %ymm2, %ymm6, %ymm2
86
+ vpaddq %ymm3, %ymm7, %ymm3
87
+ vpaddq %ymm4, %ymm8, %ymm4
88
+ vpaddq %ymm5, %ymm9, %ymm5
89
+ vmovdqa 0xc00(%rsi), %ymm6
90
+ vmovdqa 0xc20(%rsi), %ymm8
91
+ vmovdqa 0xc00(%rdx), %ymm10
92
+ vmovdqa 0xc20(%rdx), %ymm12
93
+ vpsrlq $0x20, %ymm6, %ymm7
94
+ vpsrlq $0x20, %ymm8, %ymm9
95
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
96
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
97
+ vpmuldq %ymm10, %ymm6, %ymm6
98
+ vpmuldq %ymm11, %ymm7, %ymm7
99
+ vpmuldq %ymm12, %ymm8, %ymm8
100
+ vpmuldq %ymm13, %ymm9, %ymm9
101
+ vpaddq %ymm2, %ymm6, %ymm2
102
+ vpaddq %ymm3, %ymm7, %ymm3
103
+ vpaddq %ymm4, %ymm8, %ymm4
104
+ vpaddq %ymm5, %ymm9, %ymm5
105
+ vpmuldq %ymm2, %ymm0, %ymm6
106
+ vpmuldq %ymm3, %ymm0, %ymm7
107
+ vpmuldq %ymm4, %ymm0, %ymm8
108
+ vpmuldq %ymm5, %ymm0, %ymm9
109
+ vpmuldq %ymm6, %ymm1, %ymm6
110
+ vpmuldq %ymm7, %ymm1, %ymm7
111
+ vpmuldq %ymm8, %ymm1, %ymm8
112
+ vpmuldq %ymm9, %ymm1, %ymm9
113
+ vpsubq %ymm6, %ymm2, %ymm2
114
+ vpsubq %ymm7, %ymm3, %ymm3
115
+ vpsubq %ymm8, %ymm4, %ymm4
116
+ vpsubq %ymm9, %ymm5, %ymm5
117
+ vpsrlq $0x20, %ymm2, %ymm2
118
+ vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
119
+ vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
120
+ vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7]
121
+ vmovdqa %ymm2, (%rdi)
122
+ vmovdqa %ymm4, 0x20(%rdi)
123
+ addq $0x40, %rsi
124
+ addq $0x40, %rdx
125
+ addq $0x40, %rdi
126
+ addl $0x1, %eax
127
+ cmpl $0x10, %eax
128
+ jb Lpointwise_acc_l4_avx2_looptop2
129
+ retq
130
+ .cfi_endproc
131
+
132
+ MLD_ASM_FN_SIZE(pointwise_acc_l4_avx2_asm)
133
+
134
+ #endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
135
+ && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLDSA_L == 4) */
136
+
137
+ #if defined(__ELF__)
138
+ .section .note.GNU-stack,"",%progbits
139
+ #endif
@@ -0,0 +1,155 @@
1
+ /*
2
+ * Copyright (c) The mldsa-native project authors
3
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
4
+ */
5
+
6
+ /* References
7
+ * ==========
8
+ *
9
+ * - [REF_AVX2]
10
+ * CRYSTALS-Dilithium optimized AVX2 implementation
11
+ * Bai, Ducas, Kiltz, Lepoint, Lyubashevsky, Schwabe, Seiler, Stehlé
12
+ * https://github.com/pq-crystals/dilithium/tree/master/avx2
13
+ */
14
+
15
+ /*
16
+ * This file is derived from the public domain
17
+ * AVX2 Dilithium implementation @[REF_AVX2].
18
+ */
19
+
20
+ #include "../../../common.h"
21
+ #if defined(MLD_ARITH_BACKEND_X86_64_DEFAULT) && \
22
+ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED) && \
23
+ (defined(MLD_CONFIG_MULTILEVEL_WITH_SHARED) || MLDSA_L == 5)
24
+
25
+ /*
26
+ * WARNING: This file is auto-derived from the mldsa-native source file
27
+ * dev/x86_64/src/pointwise_acc_l5_avx2_asm.S using scripts/simpasm. Do not modify it directly.
28
+ */
29
+
30
+ .text
31
+ .balign 4
32
+ .global MLD_ASM_NAMESPACE(pointwise_acc_l5_avx2_asm)
33
+ MLD_ASM_FN_SYMBOL(pointwise_acc_l5_avx2_asm)
34
+
35
+ .cfi_startproc
36
+ vmovdqa 0x20(%rcx), %ymm0
37
+ vmovdqa (%rcx), %ymm1
38
+ xorl %eax, %eax
39
+
40
+ Lpointwise_acc_l5_avx2_looptop2:
41
+ vmovdqa (%rsi), %ymm6
42
+ vmovdqa 0x20(%rsi), %ymm8
43
+ vmovdqa (%rdx), %ymm10
44
+ vmovdqa 0x20(%rdx), %ymm12
45
+ vpsrlq $0x20, %ymm6, %ymm7
46
+ vpsrlq $0x20, %ymm8, %ymm9
47
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
48
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
49
+ vpmuldq %ymm10, %ymm6, %ymm6
50
+ vpmuldq %ymm11, %ymm7, %ymm7
51
+ vpmuldq %ymm12, %ymm8, %ymm8
52
+ vpmuldq %ymm13, %ymm9, %ymm9
53
+ vmovdqa %ymm6, %ymm2
54
+ vmovdqa %ymm7, %ymm3
55
+ vmovdqa %ymm8, %ymm4
56
+ vmovdqa %ymm9, %ymm5
57
+ vmovdqa 0x400(%rsi), %ymm6
58
+ vmovdqa 0x420(%rsi), %ymm8
59
+ vmovdqa 0x400(%rdx), %ymm10
60
+ vmovdqa 0x420(%rdx), %ymm12
61
+ vpsrlq $0x20, %ymm6, %ymm7
62
+ vpsrlq $0x20, %ymm8, %ymm9
63
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
64
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
65
+ vpmuldq %ymm10, %ymm6, %ymm6
66
+ vpmuldq %ymm11, %ymm7, %ymm7
67
+ vpmuldq %ymm12, %ymm8, %ymm8
68
+ vpmuldq %ymm13, %ymm9, %ymm9
69
+ vpaddq %ymm2, %ymm6, %ymm2
70
+ vpaddq %ymm3, %ymm7, %ymm3
71
+ vpaddq %ymm4, %ymm8, %ymm4
72
+ vpaddq %ymm5, %ymm9, %ymm5
73
+ vmovdqa 0x800(%rsi), %ymm6
74
+ vmovdqa 0x820(%rsi), %ymm8
75
+ vmovdqa 0x800(%rdx), %ymm10
76
+ vmovdqa 0x820(%rdx), %ymm12
77
+ vpsrlq $0x20, %ymm6, %ymm7
78
+ vpsrlq $0x20, %ymm8, %ymm9
79
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
80
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
81
+ vpmuldq %ymm10, %ymm6, %ymm6
82
+ vpmuldq %ymm11, %ymm7, %ymm7
83
+ vpmuldq %ymm12, %ymm8, %ymm8
84
+ vpmuldq %ymm13, %ymm9, %ymm9
85
+ vpaddq %ymm2, %ymm6, %ymm2
86
+ vpaddq %ymm3, %ymm7, %ymm3
87
+ vpaddq %ymm4, %ymm8, %ymm4
88
+ vpaddq %ymm5, %ymm9, %ymm5
89
+ vmovdqa 0xc00(%rsi), %ymm6
90
+ vmovdqa 0xc20(%rsi), %ymm8
91
+ vmovdqa 0xc00(%rdx), %ymm10
92
+ vmovdqa 0xc20(%rdx), %ymm12
93
+ vpsrlq $0x20, %ymm6, %ymm7
94
+ vpsrlq $0x20, %ymm8, %ymm9
95
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
96
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
97
+ vpmuldq %ymm10, %ymm6, %ymm6
98
+ vpmuldq %ymm11, %ymm7, %ymm7
99
+ vpmuldq %ymm12, %ymm8, %ymm8
100
+ vpmuldq %ymm13, %ymm9, %ymm9
101
+ vpaddq %ymm2, %ymm6, %ymm2
102
+ vpaddq %ymm3, %ymm7, %ymm3
103
+ vpaddq %ymm4, %ymm8, %ymm4
104
+ vpaddq %ymm5, %ymm9, %ymm5
105
+ vmovdqa 0x1000(%rsi), %ymm6
106
+ vmovdqa 0x1020(%rsi), %ymm8
107
+ vmovdqa 0x1000(%rdx), %ymm10
108
+ vmovdqa 0x1020(%rdx), %ymm12
109
+ vpsrlq $0x20, %ymm6, %ymm7
110
+ vpsrlq $0x20, %ymm8, %ymm9
111
+ vmovshdup %ymm10, %ymm11 # ymm11 = ymm10[1,1,3,3,5,5,7,7]
112
+ vmovshdup %ymm12, %ymm13 # ymm13 = ymm12[1,1,3,3,5,5,7,7]
113
+ vpmuldq %ymm10, %ymm6, %ymm6
114
+ vpmuldq %ymm11, %ymm7, %ymm7
115
+ vpmuldq %ymm12, %ymm8, %ymm8
116
+ vpmuldq %ymm13, %ymm9, %ymm9
117
+ vpaddq %ymm2, %ymm6, %ymm2
118
+ vpaddq %ymm3, %ymm7, %ymm3
119
+ vpaddq %ymm4, %ymm8, %ymm4
120
+ vpaddq %ymm5, %ymm9, %ymm5
121
+ vpmuldq %ymm2, %ymm0, %ymm6
122
+ vpmuldq %ymm3, %ymm0, %ymm7
123
+ vpmuldq %ymm4, %ymm0, %ymm8
124
+ vpmuldq %ymm5, %ymm0, %ymm9
125
+ vpmuldq %ymm6, %ymm1, %ymm6
126
+ vpmuldq %ymm7, %ymm1, %ymm7
127
+ vpmuldq %ymm8, %ymm1, %ymm8
128
+ vpmuldq %ymm9, %ymm1, %ymm9
129
+ vpsubq %ymm6, %ymm2, %ymm2
130
+ vpsubq %ymm7, %ymm3, %ymm3
131
+ vpsubq %ymm8, %ymm4, %ymm4
132
+ vpsubq %ymm9, %ymm5, %ymm5
133
+ vpsrlq $0x20, %ymm2, %ymm2
134
+ vmovshdup %ymm4, %ymm4 # ymm4 = ymm4[1,1,3,3,5,5,7,7]
135
+ vpblendd $0xaa, %ymm3, %ymm2, %ymm2 # ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7]
136
+ vpblendd $0xaa, %ymm5, %ymm4, %ymm4 # ymm4 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7]
137
+ vmovdqa %ymm2, (%rdi)
138
+ vmovdqa %ymm4, 0x20(%rdi)
139
+ addq $0x40, %rsi
140
+ addq $0x40, %rdx
141
+ addq $0x40, %rdi
142
+ addl $0x1, %eax
143
+ cmpl $0x10, %eax
144
+ jb Lpointwise_acc_l5_avx2_looptop2
145
+ retq
146
+ .cfi_endproc
147
+
148
+ MLD_ASM_FN_SIZE(pointwise_acc_l5_avx2_asm)
149
+
150
+ #endif /* MLD_ARITH_BACKEND_X86_64_DEFAULT && !MLD_CONFIG_MULTILEVEL_NO_SHARED \
151
+ && (MLD_CONFIG_MULTILEVEL_WITH_SHARED || MLDSA_L == 5) */
152
+
153
+ #if defined(__ELF__)
154
+ .section .note.GNU-stack,"",%progbits
155
+ #endif