pq_crypto 0.6.1 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +5 -0
  3. data/SECURITY.md +7 -0
  4. data/ext/pqcrypto/pqcrypto_version.h +1 -1
  5. data/ext/pqcrypto/vendor/.vendored +4 -4
  6. data/ext/pqcrypto/vendor/mldsa-native/README.md +23 -10
  7. data/ext/pqcrypto/vendor/mldsa-native/mldsa/README.md +23 -0
  8. data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native.c +114 -58
  9. data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native.h +498 -461
  10. data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native_asm.S +145 -85
  11. data/ext/pqcrypto/vendor/mldsa-native/mldsa/mldsa_native_config.h +456 -422
  12. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/cbmc.h +47 -25
  13. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/common.h +26 -14
  14. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/ct.h +56 -81
  15. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/debug.h +17 -24
  16. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202.c +33 -40
  17. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202.h +67 -87
  18. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202x4.c +19 -14
  19. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/fips202x4.h +13 -5
  20. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/keccakf1600.c +84 -10
  21. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/keccakf1600.h +10 -5
  22. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/auto.h +6 -0
  23. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/fips202_native_aarch64.h +22 -15
  24. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_scalar_aarch64_asm.S +376 -0
  25. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_v84a_aarch64_asm.S +204 -0
  26. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x2_v84a_aarch64_asm.S +259 -0
  27. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_aarch64_asm.S +1077 -0
  28. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_aarch64_asm.S +987 -0
  29. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccakf1600_round_constants.c +16 -10
  30. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x1_scalar.h +2 -1
  31. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x1_v84a.h +1 -1
  32. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x2_v84a.h +4 -2
  33. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x4_v8a_scalar.h +2 -2
  34. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/x4_v8a_v84a_scalar.h +1 -1
  35. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/api.h +60 -0
  36. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/mve.h +48 -0
  37. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/fips202_native_armv81m.h +18 -1
  38. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.S +658 -582
  39. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/keccak_f1600_x4_mve.c +5 -100
  40. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/keccakf1600_round_constants.c +26 -25
  41. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/state_extract_bytes_x4_mve.S +334 -0
  42. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/armv81m/src/state_xor_bytes_x4_mve.S +355 -0
  43. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/auto.h +8 -3
  44. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/{xkcp.h → keccak_f1600_x4_avx2.h} +11 -8
  45. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/fips202_native_x86_64.h +44 -0
  46. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/keccak_f1600_x4_avx2_asm.S +454 -0
  47. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/keccakf1600_constants.c +52 -0
  48. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/meta.h +37 -28
  49. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/aarch64_zetas.c +213 -196
  50. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/arith_native_aarch64.h +248 -64
  51. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/intt_aarch64_asm.S +753 -0
  52. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4_aarch64_asm.S +129 -0
  53. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5_aarch64_asm.S +145 -0
  54. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7_aarch64_asm.S +177 -0
  55. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/ntt_aarch64_asm.S +653 -0
  56. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/pointwise_montgomery_aarch64_asm.S +84 -0
  57. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_caddq_aarch64_asm.S +53 -0
  58. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_chknorm_aarch64_asm.S +55 -0
  59. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_32_aarch64_asm.S +86 -0
  60. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_88_aarch64_asm.S +86 -0
  61. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_32_aarch64_asm.S +103 -0
  62. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_88_aarch64_asm.S +111 -0
  63. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_17_aarch64_asm.S +75 -0
  64. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_19_aarch64_asm.S +72 -0
  65. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_table.c +23 -11
  66. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_aarch64_asm.S +189 -0
  67. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta2_aarch64_asm.S +137 -0
  68. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta4_aarch64_asm.S +130 -0
  69. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta_table.c +520 -516
  70. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_table.c +34 -33
  71. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/api.h +202 -242
  72. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/meta.h +25 -17
  73. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/arith_native_x86_64.h +112 -28
  74. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/consts.c +1 -1
  75. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/consts.h +1 -1
  76. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/intt_avx2_asm.S +2311 -0
  77. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/ntt_avx2_asm.S +2383 -0
  78. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/nttunpack_avx2_asm.S +238 -0
  79. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l4_avx2_asm.S +139 -0
  80. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l5_avx2_asm.S +155 -0
  81. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l7_avx2_asm.S +187 -0
  82. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_avx2_asm.S +130 -0
  83. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_caddq_avx2_asm.S +190 -0
  84. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c +6 -4
  85. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c +6 -4
  86. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_use_hint_32_avx2.c +9 -8
  87. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_use_hint_88_avx2.c +10 -9
  88. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/polyz_unpack_17_avx2.c +8 -5
  89. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/polyz_unpack_19_avx2.c +8 -5
  90. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/rej_uniform_eta2_avx2.c +6 -4
  91. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/rej_uniform_eta4_avx2.c +6 -4
  92. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/rej_uniform_table.c +130 -129
  93. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/packing.c +109 -180
  94. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/packing.h +169 -150
  95. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly.c +56 -40
  96. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly.h +149 -164
  97. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly_kl.c +52 -57
  98. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/poly_kl.h +132 -167
  99. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec.c +57 -424
  100. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec.h +167 -474
  101. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec_lazy.c +308 -0
  102. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/polyvec_lazy.h +653 -0
  103. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/reduce.h +22 -29
  104. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/rounding.h +37 -43
  105. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/sign.c +511 -367
  106. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/sign.h +456 -417
  107. data/lib/pq_crypto/version.rb +1 -1
  108. data/script/vendor_libs.rb +3 -3
  109. metadata +41 -35
  110. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_scalar_asm.S +0 -376
  111. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x1_v84a_asm.S +0 -204
  112. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x2_v84a_asm.S +0 -259
  113. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_scalar_hybrid_asm.S +0 -1077
  114. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/aarch64/src/keccak_f1600_x4_v8a_v84a_scalar_hybrid_asm.S +0 -987
  115. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.c +0 -488
  116. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.h +0 -16
  117. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/intt.S +0 -753
  118. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l4.S +0 -129
  119. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l5.S +0 -145
  120. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/mld_polyvecl_pointwise_acc_montgomery_l7.S +0 -177
  121. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/ntt.S +0 -653
  122. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/pointwise_montgomery.S +0 -79
  123. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_caddq_asm.S +0 -53
  124. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_chknorm_asm.S +0 -55
  125. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_32_asm.S +0 -85
  126. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_decompose_88_asm.S +0 -85
  127. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_32_asm.S +0 -102
  128. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/poly_use_hint_88_asm.S +0 -110
  129. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_17_asm.S +0 -72
  130. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/polyz_unpack_19_asm.S +0 -69
  131. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_asm.S +0 -189
  132. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta2_asm.S +0 -135
  133. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/aarch64/src/rej_uniform_eta4_asm.S +0 -128
  134. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/intt.S +0 -2311
  135. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/ntt.S +0 -2383
  136. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/nttunpack.S +0 -239
  137. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise.S +0 -131
  138. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l4.S +0 -139
  139. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l5.S +0 -155
  140. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/pointwise_acc_l7.S +0 -187
  141. data/ext/pqcrypto/vendor/mldsa-native/mldsa/src/native/x86_64/src/poly_caddq_avx2.c +0 -61
@@ -12,114 +12,19 @@
12
12
 
13
13
  #include "fips202_native_armv81m.h"
14
14
 
15
- /*
16
- * TEMPORARY: Bit-interleaving using efficient shift-and-mask operations.
17
- * TODO: Replace with optimized MVE assembly implementations
18
- * (as a part of XORBytes and ExtractBytes)
19
- */
20
-
21
- /* Extract even-indexed bits from 64-bit value into lower 32 bits */
22
- static uint32_t bitinterleave_even(uint64_t x)
23
- {
24
- uint64_t t;
25
- t = x & 0x5555555555555555ULL;
26
- t = (t | (t >> 1)) & 0x3333333333333333ULL;
27
- t = (t | (t >> 2)) & 0x0f0f0f0f0f0f0f0fULL;
28
- t = (t | (t >> 4)) & 0x00ff00ff00ff00ffULL;
29
- t = (t | (t >> 8)) & 0x0000ffff0000ffffULL;
30
- t = (t | (t >> 16)) & 0x00000000ffffffffULL;
31
- return (uint32_t)t;
32
- }
33
-
34
- /* Extract odd-indexed bits from 64-bit value into lower 32 bits */
35
- static uint32_t bitinterleave_odd(uint64_t x)
36
- {
37
- return bitinterleave_even(x >> 1);
38
- }
39
-
40
- /* Spread 32-bit value across even bit positions of 64-bit result */
41
- static uint64_t spread_even(uint32_t x)
42
- {
43
- uint64_t t = x;
44
- t = (t | (t << 16)) & 0x0000ffff0000ffffULL;
45
- t = (t | (t << 8)) & 0x00ff00ff00ff00ffULL;
46
- t = (t | (t << 4)) & 0x0f0f0f0f0f0f0f0fULL;
47
- t = (t | (t << 2)) & 0x3333333333333333ULL;
48
- t = (t | (t << 1)) & 0x5555555555555555ULL;
49
- return t;
50
- }
51
-
52
- /* Combine even and odd 32-bit halves into interleaved 64-bit value */
53
- static uint64_t bitdeinterleave(uint32_t even, uint32_t odd)
54
- {
55
- return spread_even(even) | (spread_even(odd) << 1);
56
- }
57
15
 
58
16
  /*
59
- * TEMPORARY: Naive C interleaving functions.
60
- * These will be replaced with optimized MVE assembly implementations.
17
+ * Keccak-f1600 x4 permutation (on bit-interleaved state)
18
+ * State is expected to already be in bit-interleaved format.
61
19
  */
62
- static void interleave_4fold(uint64_t *state_4x, const uint64_t *state0,
63
- const uint64_t *state1, const uint64_t *state2,
64
- const uint64_t *state3)
65
- {
66
- uint32_t *state_4xl = (uint32_t *)state_4x;
67
- uint32_t *state_4xh = (uint32_t *)state_4x + 100;
68
-
69
- for (size_t i = 0; i < 25; i++)
70
- {
71
- state_4xl[i * 4 + 0] = bitinterleave_even(state0[i]);
72
- state_4xl[i * 4 + 1] = bitinterleave_even(state1[i]);
73
- state_4xl[i * 4 + 2] = bitinterleave_even(state2[i]);
74
- state_4xl[i * 4 + 3] = bitinterleave_even(state3[i]);
75
-
76
- state_4xh[i * 4 + 0] = bitinterleave_odd(state0[i]);
77
- state_4xh[i * 4 + 1] = bitinterleave_odd(state1[i]);
78
- state_4xh[i * 4 + 2] = bitinterleave_odd(state2[i]);
79
- state_4xh[i * 4 + 3] = bitinterleave_odd(state3[i]);
80
- }
81
- }
82
-
83
- static void deinterleave_4fold(uint64_t *state_4x, uint64_t *state0,
84
- uint64_t *state1, uint64_t *state2,
85
- uint64_t *state3)
86
- {
87
- uint32_t *state_4xl = (uint32_t *)state_4x;
88
- uint32_t *state_4xh = (uint32_t *)state_4x + 100;
89
-
90
- for (size_t i = 0; i < 25; i++)
91
- {
92
- state0[i] = bitdeinterleave(state_4xl[i * 4 + 0], state_4xh[i * 4 + 0]);
93
- state1[i] = bitdeinterleave(state_4xl[i * 4 + 1], state_4xh[i * 4 + 1]);
94
- state2[i] = bitdeinterleave(state_4xl[i * 4 + 2], state_4xh[i * 4 + 2]);
95
- state3[i] = bitdeinterleave(state_4xl[i * 4 + 3], state_4xh[i * 4 + 3]);
96
- }
97
- }
98
-
99
20
  #define mld_keccak_f1600_x4_native_impl \
100
21
  MLD_NAMESPACE(keccak_f1600_x4_native_impl)
101
22
  int mld_keccak_f1600_x4_native_impl(uint64_t *state)
102
23
  {
103
- /*
104
- * TEMPORARY: Bit-interleaving using efficient shift-and-mask operations.
105
- * TODO: Replace with optimized MVE assembly implementations
106
- * (as a part of XORBytes and ExtractBytes)
107
- */
108
- MLD_ALIGN uint64_t state_4x[100];
109
- MLD_ALIGN uint64_t state_4x_tmp[100];
110
-
111
- /* Interleave the 4 states into bit-interleaved format */
112
- interleave_4fold(state_4x, &state[0], &state[25], &state[50], &state[75]);
113
-
114
- /* Run the permutation */
115
- mld_keccak_f1600_x4_mve_asm(state_4x, state_4x_tmp,
24
+ MLD_ALIGN uint64_t state_tmp[100];
25
+ mld_keccak_f1600_x4_mve_asm(state, state_tmp,
116
26
  mld_keccakf1600_round_constants);
117
-
118
- /* Deinterleave back to 4 separate states */
119
- deinterleave_4fold(state_4x, &state[0], &state[25], &state[50], &state[75]);
120
-
121
- mld_zeroize(state_4x, sizeof(state_4x));
122
- mld_zeroize(state_4x_tmp, sizeof(state_4x_tmp));
27
+ mld_zeroize(state_tmp, sizeof(state_tmp));
123
28
  return MLD_NATIVE_FUNC_SUCCESS;
124
29
  }
125
30
 
@@ -17,31 +17,32 @@
17
17
  * - low word contains even-indexed bits
18
18
  * - high word contains odd-indexed bits
19
19
  */
20
- MLD_ALIGN const uint32_t mld_keccakf1600_round_constants[48] = {
21
- 0x00000001, 0x00000000, /* RC0 */
22
- 0x00000000, 0x00000089, /* RC1 */
23
- 0x00000000, 0x8000008b, /* RC2 */
24
- 0x00000000, 0x80008080, /* RC3 */
25
- 0x00000001, 0x0000008b, /* RC4 */
26
- 0x00000001, 0x00008000, /* RC5 */
27
- 0x00000001, 0x80008088, /* RC6 */
28
- 0x00000001, 0x80000082, /* RC7 */
29
- 0x00000000, 0x0000000b, /* RC8 */
30
- 0x00000000, 0x0000000a, /* RC9 */
31
- 0x00000001, 0x00008082, /* RC10 */
32
- 0x00000000, 0x00008003, /* RC11 */
33
- 0x00000001, 0x0000808b, /* RC12 */
34
- 0x00000001, 0x8000000b, /* RC13 */
35
- 0x00000001, 0x8000008a, /* RC14 */
36
- 0x00000001, 0x80000081, /* RC15 */
37
- 0x00000000, 0x80000081, /* RC16 */
38
- 0x00000000, 0x80000008, /* RC17 */
39
- 0x00000000, 0x00000083, /* RC18 */
40
- 0x00000000, 0x80008003, /* RC19 */
41
- 0x00000001, 0x80008088, /* RC20 */
42
- 0x00000000, 0x80000088, /* RC21 */
43
- 0x00000001, 0x00008000, /* RC22 */
44
- 0x00000000, 0x80008082, /* RC23 */
20
+ MLD_ALIGN MLD_INTERNAL_DATA_DEFINITION const uint32_t
21
+ mld_keccakf1600_round_constants[48] = {
22
+ 0x00000001, 0x00000000, /* RC0 */
23
+ 0x00000000, 0x00000089, /* RC1 */
24
+ 0x00000000, 0x8000008b, /* RC2 */
25
+ 0x00000000, 0x80008080, /* RC3 */
26
+ 0x00000001, 0x0000008b, /* RC4 */
27
+ 0x00000001, 0x00008000, /* RC5 */
28
+ 0x00000001, 0x80008088, /* RC6 */
29
+ 0x00000001, 0x80000082, /* RC7 */
30
+ 0x00000000, 0x0000000b, /* RC8 */
31
+ 0x00000000, 0x0000000a, /* RC9 */
32
+ 0x00000001, 0x00008082, /* RC10 */
33
+ 0x00000000, 0x00008003, /* RC11 */
34
+ 0x00000001, 0x0000808b, /* RC12 */
35
+ 0x00000001, 0x8000000b, /* RC13 */
36
+ 0x00000001, 0x8000008a, /* RC14 */
37
+ 0x00000001, 0x80000081, /* RC15 */
38
+ 0x00000000, 0x80000081, /* RC16 */
39
+ 0x00000000, 0x80000008, /* RC17 */
40
+ 0x00000000, 0x00000083, /* RC18 */
41
+ 0x00000000, 0x80008003, /* RC19 */
42
+ 0x00000001, 0x80008088, /* RC20 */
43
+ 0x00000000, 0x80000088, /* RC21 */
44
+ 0x00000001, 0x00008000, /* RC22 */
45
+ 0x00000000, 0x80008082, /* RC23 */
45
46
  };
46
47
 
47
48
  #else /* MLD_FIPS202_ARMV81M_NEED_X4 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */
@@ -0,0 +1,334 @@
1
+ /*
2
+ * Copyright (c) The mlkem-native project authors
3
+ * Copyright (c) The mldsa-native project authors
4
+ * Copyright (c) 2026 Arm Limited
5
+ * SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
6
+ */
7
+
8
+ // ---------------------------------------------------------------------------
9
+ // Overview
10
+ // ---------------------------------------------------------------------------
11
+ // MVE/Helium implementation of KeccakF1600x4_StateExtractBytes
12
+ // (inverse of state_xor_bytes_x4_mve.S).
13
+ //
14
+ // void KeccakF1600x4_StateExtractBytes(state, d0, d1, d2, d3, offset, length)
15
+ //
16
+ // Reads 'length' bytes from the bit-interleaved Keccak state starting at
17
+ // byte 'offset', recombines the even and odd halves of each lane back
18
+ // into plain bytes, and writes them to four output buffers (d0..d3).
19
+ //
20
+ // ---------------------------------------------------------------------------
21
+ // Bit-interleaving background
22
+ // ---------------------------------------------------------------------------
23
+ // Each 64-bit Keccak lane is stored as two 32-bit words:
24
+ // even half -- bits 0, 2, 4, ..., 62 of the lane
25
+ // odd half -- bits 1, 3, 5, ..., 63 of the lane
26
+ // This representation allows 64-bit lane rotations (used in the Keccak
27
+ // round function) to be implemented as pairs of 32-bit rotations.
28
+ //
29
+ // Batched (x4) processing:
30
+ // Four Keccak instances are processed as a batch. Their states are
31
+ // stored interleaved in a single 800-byte buffer: first the even
32
+ // halves of all 25 lanes (400 bytes), then the odd halves (400 bytes).
33
+ // Within each 16-byte row, the four u32 words correspond to
34
+ // instances 0..3 of the same lane, enabling SIMD-parallel operations
35
+ // across all four instances.
36
+ //
37
+ // State memory layout (25 lanes x 4 instances x 2 halves):
38
+ // S[i][l]_even/odd = even/odd half of lane l, instance i (u32)
39
+ // Each row is 16 bytes (one Q-register).
40
+ // Offset Contents
41
+ // 0 S[0][ 0]_even, S[1][ 0]_even, S[2][ 0]_even, S[3][ 0]_even
42
+ // 16 S[0][ 1]_even, S[1][ 1]_even, S[2][ 1]_even, S[3][ 1]_even
43
+ // ...
44
+ // 384 S[0][24]_even, S[1][24]_even, S[2][24]_even, S[3][24]_even
45
+ // 400 S[0][ 0]_odd, S[1][ 0]_odd, S[2][ 0]_odd, S[3][ 0]_odd
46
+ // 416 S[0][ 1]_odd, S[1][ 1]_odd, S[2][ 1]_odd, S[3][ 1]_odd
47
+ // ...
48
+ // 784 S[0][24]_odd, S[1][24]_odd, S[2][24]_odd, S[3][24]_odd
49
+ //
50
+ // ---------------------------------------------------------------------------
51
+ // Three-phase structure
52
+ // ---------------------------------------------------------------------------
53
+ // Prologue -- if offset is not 8-byte aligned, extract
54
+ // min(length, 8-(offset%8)) bytes via predicated byte stores.
55
+ // Main -- process full 8-byte groups: load even/odd lane pair,
56
+ // de-interleave, scatter-store to output buffers.
57
+ // Tail -- extract remaining <8 bytes via predicated byte stores.
58
+
59
+ #include "../../../../common.h"
60
+ #if defined(MLD_FIPS202_ARMV81M_NEED_X4) && \
61
+ !defined(MLD_CONFIG_MULTILEVEL_NO_SHARED)
62
+
63
+ /*
64
+ * WARNING: This file is auto-derived from the mldsa-native source file
65
+ * dev/fips202/armv81m/src/state_extract_bytes_x4_mve.S using scripts/simpasm. Do not modify it directly.
66
+ */
67
+
68
+ .thumb
69
+ .syntax unified
70
+
71
+ .text
72
+ .balign 4
73
+ .global MLD_ASM_NAMESPACE(keccak_f1600_x4_state_extract_bytes_asm)
74
+ MLD_ASM_FN_SYMBOL(keccak_f1600_x4_state_extract_bytes_asm)
75
+
76
+ .cfi_startproc
77
+ push.w {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
78
+ .cfi_adjust_cfa_offset 0x28
79
+ .cfi_rel_offset r4, 0x0
80
+ .cfi_rel_offset r5, 0x4
81
+ .cfi_rel_offset r6, 0x8
82
+ .cfi_rel_offset r7, 0xc
83
+ .cfi_rel_offset r8, 0x10
84
+ .cfi_rel_offset r9, 0x14
85
+ .cfi_rel_offset r10, 0x18
86
+ .cfi_rel_offset r11, 0x1c
87
+ .cfi_rel_offset lr, 0x24
88
+ vpush {d8, d9, d10, d11, d12, d13, d14, d15}
89
+ .cfi_adjust_cfa_offset 0x40
90
+ .cfi_rel_offset d8, 0x0
91
+ .cfi_rel_offset d9, 0x8
92
+ .cfi_rel_offset d10, 0x10
93
+ .cfi_rel_offset d11, 0x18
94
+ .cfi_rel_offset d12, 0x20
95
+ .cfi_rel_offset d13, 0x28
96
+ .cfi_rel_offset d14, 0x30
97
+ .cfi_rel_offset d15, 0x38
98
+ ldr r4, [sp, #0x68]
99
+ ldr.w r10, [sp, #0x6c]
100
+ ldr r6, [sp, #0x70]
101
+ cmp r6, #0x0
102
+ beq.w Lkeccak_f1600_x4_state_extract_bytes_asm_exit @ imm = #0x2ea
103
+ and r5, r10, #0x7
104
+ bic r9, r10, #0x7
105
+ add.w r8, r0, r9, lsl #1
106
+ add.w r7, r8, #0x190
107
+ cmp r5, #0x0
108
+ beq.w Lkeccak_f1600_x4_state_extract_bytes_asm_pre_main @ imm = #0x112
109
+ vldrw.u32 q0, [r8], #16
110
+ vldrw.u32 q1, [r7], #16
111
+ vrev32.16 q2, q0
112
+ vrev32.16 q3, q1
113
+ vsli.32 q0, q0, #0x8
114
+ vsli.16 q0, q0, #0x4
115
+ vsli.8 q0, q0, #0x1
116
+ vshr.u8 q4, q0, #0x3
117
+ vsli.8 q0, q4, #0x4
118
+ vshr.u8 q4, q0, #0x5
119
+ vsli.8 q0, q4, #0x6
120
+ vsli.32 q1, q1, #0x8
121
+ vsli.16 q1, q1, #0x4
122
+ vsli.8 q1, q1, #0x1
123
+ vshr.u8 q4, q1, #0x3
124
+ vsli.8 q1, q4, #0x4
125
+ vshr.u8 q4, q1, #0x5
126
+ vsli.8 q1, q4, #0x6
127
+ mov.w r0, #0x55
128
+ vdup.8 q4, r0
129
+ vand q0, q0, q4
130
+ vand q1, q1, q4
131
+ vshl.i32 q1, q1, #0x1
132
+ vorr q0, q0, q1
133
+ vsli.32 q2, q2, #0x8
134
+ vsli.16 q2, q2, #0x4
135
+ vsli.8 q2, q2, #0x1
136
+ vshr.u8 q1, q2, #0x3
137
+ vsli.8 q2, q1, #0x4
138
+ vshr.u8 q1, q2, #0x5
139
+ vsli.8 q2, q1, #0x6
140
+ vsli.32 q3, q3, #0x8
141
+ vsli.16 q3, q3, #0x4
142
+ vsli.8 q3, q3, #0x1
143
+ vshr.u8 q1, q3, #0x3
144
+ vsli.8 q3, q1, #0x4
145
+ vshr.u8 q1, q3, #0x5
146
+ vsli.8 q3, q1, #0x6
147
+ vand q1, q2, q4
148
+ vand q3, q3, q4
149
+ vshl.i32 q3, q3, #0x1
150
+ vorr q1, q1, q3
151
+ vrev64.32 q2, q0
152
+ vrev64.32 q3, q1
153
+ movw r0, #0xf0f
154
+ vmsr p0, r0
155
+ vpsel q0, q0, q3
156
+ vpsel q1, q2, q1
157
+ vmov.f64 d4, d1
158
+ vmov.f64 d6, d3
159
+ rsb.w lr, r5, #0x8
160
+ cmp r6, lr
161
+ it ls
162
+ movls lr, r6
163
+ vctp.8 lr
164
+ vmrs r11, p0
165
+ lsl.w r11, r11, r5
166
+ vmsr p0, r11
167
+ subs r1, r1, r5
168
+ subs r2, r2, r5
169
+ subs r3, r3, r5
170
+ subs r4, r4, r5
171
+ vpstttt
172
+ vstrbt.8 q0, [r1], #4
173
+ vstrbt.8 q1, [r2], #4
174
+ vstrbt.8 q2, [r3], #4
175
+ vstrbt.8 q3, [r4], #4
176
+ subs.w r6, r6, lr
177
+ cmp r6, #0x0
178
+ beq.w Lkeccak_f1600_x4_state_extract_bytes_asm_exit @ imm = #0x1cc
179
+ vmov q7[2], q7[0], r1, r3
180
+ vmov q7[3], q7[1], r2, r4
181
+ b Lkeccak_f1600_x4_state_extract_bytes_asm_main_body @ imm = #0xe
182
+
183
+ Lkeccak_f1600_x4_state_extract_bytes_asm_pre_main:
184
+ vmov q7[2], q7[0], r1, r3
185
+ vmov q7[3], q7[1], r2, r4
186
+ mov.w r12, #0x4
187
+ vsub.i32 q7, q7, r12
188
+
189
+ Lkeccak_f1600_x4_state_extract_bytes_asm_main_body:
190
+ lsr.w lr, r6, #0x3
191
+ wls lr, lr, Lkeccak_f1600_x4_state_extract_bytes_asm_main_loop_end @ imm = #0xb4
192
+
193
+ Lkeccak_f1600_x4_state_extract_bytes_asm_main_loop_start:
194
+ vldrw.u32 q0, [r8], #16
195
+ vldrw.u32 q1, [r7], #16
196
+ vrev32.16 q2, q0
197
+ vrev32.16 q3, q1
198
+ vsli.32 q0, q0, #0x8
199
+ vsli.16 q0, q0, #0x4
200
+ vsli.8 q0, q0, #0x1
201
+ vshr.u8 q4, q0, #0x3
202
+ vsli.8 q0, q4, #0x4
203
+ vshr.u8 q4, q0, #0x5
204
+ vsli.8 q0, q4, #0x6
205
+ vsli.32 q1, q1, #0x8
206
+ vsli.16 q1, q1, #0x4
207
+ vsli.8 q1, q1, #0x1
208
+ vshr.u8 q4, q1, #0x3
209
+ vsli.8 q1, q4, #0x4
210
+ vshr.u8 q4, q1, #0x5
211
+ vsli.8 q1, q4, #0x6
212
+ mov.w r0, #0x55
213
+ vdup.8 q4, r0
214
+ vand q0, q0, q4
215
+ vand q1, q1, q4
216
+ vshl.i32 q1, q1, #0x1
217
+ vorr q0, q0, q1
218
+ vsli.32 q2, q2, #0x8
219
+ vsli.16 q2, q2, #0x4
220
+ vsli.8 q2, q2, #0x1
221
+ vshr.u8 q1, q2, #0x3
222
+ vsli.8 q2, q1, #0x4
223
+ vshr.u8 q1, q2, #0x5
224
+ vsli.8 q2, q1, #0x6
225
+ vsli.32 q3, q3, #0x8
226
+ vsli.16 q3, q3, #0x4
227
+ vsli.8 q3, q3, #0x1
228
+ vshr.u8 q1, q3, #0x3
229
+ vsli.8 q3, q1, #0x4
230
+ vshr.u8 q1, q3, #0x5
231
+ vsli.8 q3, q1, #0x6
232
+ vand q1, q2, q4
233
+ vand q3, q3, q4
234
+ vshl.i32 q3, q3, #0x1
235
+ vorr q1, q1, q3
236
+ vstrw.32 q0, [q7, #4]!
237
+ vstrw.32 q1, [q7, #4]!
238
+ le lr, Lkeccak_f1600_x4_state_extract_bytes_asm_main_loop_start @ imm = #-0xb4
239
+
240
+ Lkeccak_f1600_x4_state_extract_bytes_asm_main_loop_end:
241
+ ands r6, r6, #0x7
242
+ beq Lkeccak_f1600_x4_state_extract_bytes_asm_exit @ imm = #0xee
243
+ mov.w r12, #0x4
244
+ vadd.i32 q7, q7, r12
245
+ vmov r1, r3, q7[2], q7[0]
246
+ vmov r2, r4, q7[3], q7[1]
247
+ vldrw.u32 q0, [r8], #16
248
+ vldrw.u32 q1, [r7], #16
249
+ vrev32.16 q2, q0
250
+ vrev32.16 q3, q1
251
+ vsli.32 q0, q0, #0x8
252
+ vsli.16 q0, q0, #0x4
253
+ vsli.8 q0, q0, #0x1
254
+ vshr.u8 q4, q0, #0x3
255
+ vsli.8 q0, q4, #0x4
256
+ vshr.u8 q4, q0, #0x5
257
+ vsli.8 q0, q4, #0x6
258
+ vsli.32 q1, q1, #0x8
259
+ vsli.16 q1, q1, #0x4
260
+ vsli.8 q1, q1, #0x1
261
+ vshr.u8 q4, q1, #0x3
262
+ vsli.8 q1, q4, #0x4
263
+ vshr.u8 q4, q1, #0x5
264
+ vsli.8 q1, q4, #0x6
265
+ mov.w r0, #0x55
266
+ vdup.8 q4, r0
267
+ vand q0, q0, q4
268
+ vand q1, q1, q4
269
+ vshl.i32 q1, q1, #0x1
270
+ vorr q0, q0, q1
271
+ vsli.32 q2, q2, #0x8
272
+ vsli.16 q2, q2, #0x4
273
+ vsli.8 q2, q2, #0x1
274
+ vshr.u8 q1, q2, #0x3
275
+ vsli.8 q2, q1, #0x4
276
+ vshr.u8 q1, q2, #0x5
277
+ vsli.8 q2, q1, #0x6
278
+ vsli.32 q3, q3, #0x8
279
+ vsli.16 q3, q3, #0x4
280
+ vsli.8 q3, q3, #0x1
281
+ vshr.u8 q1, q3, #0x3
282
+ vsli.8 q3, q1, #0x4
283
+ vshr.u8 q1, q3, #0x5
284
+ vsli.8 q3, q1, #0x6
285
+ vand q1, q2, q4
286
+ vand q3, q3, q4
287
+ vshl.i32 q3, q3, #0x1
288
+ vorr q1, q1, q3
289
+ vrev64.32 q2, q0
290
+ vrev64.32 q3, q1
291
+ movw r0, #0xf0f
292
+ vmsr p0, r0
293
+ vpsel q0, q0, q3
294
+ vpsel q1, q2, q1
295
+ vmov.f64 d4, d1
296
+ vmov.f64 d6, d3
297
+ vctp.8 r6
298
+ vpstttt
299
+ vstrbt.8 q0, [r1], #4
300
+ vstrbt.8 q1, [r2], #4
301
+ vstrbt.8 q2, [r3], #4
302
+ vstrbt.8 q3, [r4], #4
303
+
304
+ Lkeccak_f1600_x4_state_extract_bytes_asm_exit:
305
+ vpop {d8, d9, d10, d11, d12, d13, d14, d15}
306
+ .cfi_restore d8
307
+ .cfi_restore d9
308
+ .cfi_restore d10
309
+ .cfi_restore d11
310
+ .cfi_restore d12
311
+ .cfi_restore d13
312
+ .cfi_restore d14
313
+ .cfi_restore d15
314
+ .cfi_adjust_cfa_offset -0x40
315
+ pop.w {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
316
+ .cfi_restore r4
317
+ .cfi_restore r5
318
+ .cfi_restore r6
319
+ .cfi_restore r7
320
+ .cfi_restore r8
321
+ .cfi_restore r9
322
+ .cfi_restore r10
323
+ .cfi_restore r11
324
+ .cfi_restore lr
325
+ .cfi_adjust_cfa_offset -0x28
326
+ .cfi_endproc
327
+
328
+ MLD_ASM_FN_SIZE(keccak_f1600_x4_state_extract_bytes_asm)
329
+
330
+ #endif /* MLD_FIPS202_ARMV81M_NEED_X4 && !MLD_CONFIG_MULTILEVEL_NO_SHARED */
331
+
332
+ #if defined(__ELF__)
333
+ .section .note.GNU-stack,"",%progbits
334
+ #endif