@pinkparrot/qsafe-mayo-wasm 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. package/.gitmodules +3 -0
  2. package/.vscode/launch.json +12 -0
  3. package/LICENSE +201 -0
  4. package/bridge/mayo1_bridge.c +26 -0
  5. package/bridge/mayo2_bridge.c +26 -0
  6. package/bridge/randombytes_inject.c +44 -0
  7. package/build_mayo1.ps1 +36 -0
  8. package/build_mayo2.ps1 +36 -0
  9. package/dist/mayo.browser.min.js +216 -0
  10. package/dist/mayo1.js +0 -0
  11. package/dist/mayo2.js +0 -0
  12. package/dist/mayo_api.js +139 -0
  13. package/dist/package.json +1 -0
  14. package/gitignore +2 -0
  15. package/index.mjs +1 -0
  16. package/mayo-c/.astylerc +16 -0
  17. package/mayo-c/.cmake/flags.cmake +45 -0
  18. package/mayo-c/.cmake/sanitizers.cmake +81 -0
  19. package/mayo-c/.cmake/target.cmake +71 -0
  20. package/mayo-c/.github/workflows/ci_clang.yml +61 -0
  21. package/mayo-c/.github/workflows/ci_gcc.yml +60 -0
  22. package/mayo-c/.github/workflows/cmake.yml +160 -0
  23. package/mayo-c/.github/workflows/macos_m1.yml +68 -0
  24. package/mayo-c/CMakeLists.txt +35 -0
  25. package/mayo-c/KAT/PQCsignKAT_24_MAYO_1.req +900 -0
  26. package/mayo-c/KAT/PQCsignKAT_24_MAYO_1.rsp +902 -0
  27. package/mayo-c/KAT/PQCsignKAT_24_MAYO_2.req +900 -0
  28. package/mayo-c/KAT/PQCsignKAT_24_MAYO_2.rsp +902 -0
  29. package/mayo-c/KAT/PQCsignKAT_32_MAYO_3.req +900 -0
  30. package/mayo-c/KAT/PQCsignKAT_32_MAYO_3.rsp +902 -0
  31. package/mayo-c/KAT/PQCsignKAT_40_MAYO_5.req +900 -0
  32. package/mayo-c/KAT/PQCsignKAT_40_MAYO_5.rsp +902 -0
  33. package/mayo-c/LICENSE +202 -0
  34. package/mayo-c/META/MAYO-1_META.yml +52 -0
  35. package/mayo-c/META/MAYO-2_META.yml +52 -0
  36. package/mayo-c/META/MAYO-3_META.yml +52 -0
  37. package/mayo-c/META/MAYO-5_META.yml +52 -0
  38. package/mayo-c/NOTICE +13 -0
  39. package/mayo-c/README.md +183 -0
  40. package/mayo-c/apps/CMakeLists.txt +31 -0
  41. package/mayo-c/apps/PQCgenKAT_sign.c +281 -0
  42. package/mayo-c/apps/example.c +151 -0
  43. package/mayo-c/apps/example_nistapi.c +124 -0
  44. package/mayo-c/include/mayo.h +442 -0
  45. package/mayo-c/include/mem.h +25 -0
  46. package/mayo-c/include/randombytes.h +31 -0
  47. package/mayo-c/scripts/contstants.py +141 -0
  48. package/mayo-c/scripts/find_irred_poly.sage +39 -0
  49. package/mayo-c/src/AVX2/arithmetic_common.h +159 -0
  50. package/mayo-c/src/AVX2/echelon_form.h +91 -0
  51. package/mayo-c/src/AVX2/echelon_form_loop.h +58 -0
  52. package/mayo-c/src/AVX2/shuffle_arithmetic.h +442 -0
  53. package/mayo-c/src/CMakeLists.txt +98 -0
  54. package/mayo-c/src/arithmetic.c +128 -0
  55. package/mayo-c/src/arithmetic.h +124 -0
  56. package/mayo-c/src/common/aes128ctr.c +293 -0
  57. package/mayo-c/src/common/aes_c.c +741 -0
  58. package/mayo-c/src/common/aes_ctr.h +32 -0
  59. package/mayo-c/src/common/aes_neon.c +201 -0
  60. package/mayo-c/src/common/debug_bench_tools.h +69 -0
  61. package/mayo-c/src/common/fips202.c +1093 -0
  62. package/mayo-c/src/common/fips202.h +12 -0
  63. package/mayo-c/src/common/mem.c +19 -0
  64. package/mayo-c/src/common/randombytes_ctrdrbg.c +141 -0
  65. package/mayo-c/src/common/randombytes_system.c +399 -0
  66. package/mayo-c/src/generic/arithmetic_dynamic.h +68 -0
  67. package/mayo-c/src/generic/arithmetic_fixed.h +84 -0
  68. package/mayo-c/src/generic/echelon_form.h +152 -0
  69. package/mayo-c/src/generic/ef_inner_loop.h +56 -0
  70. package/mayo-c/src/generic/generic_arithmetic.h +294 -0
  71. package/mayo-c/src/mayo.c +675 -0
  72. package/mayo-c/src/mayo_1/api.c +46 -0
  73. package/mayo-c/src/mayo_1/api.h +43 -0
  74. package/mayo-c/src/mayo_2/api.c +46 -0
  75. package/mayo-c/src/mayo_2/api.h +43 -0
  76. package/mayo-c/src/mayo_3/api.c +46 -0
  77. package/mayo-c/src/mayo_3/api.h +43 -0
  78. package/mayo-c/src/mayo_5/api.c +46 -0
  79. package/mayo-c/src/mayo_5/api.h +43 -0
  80. package/mayo-c/src/neon/arithmetic_common.h +132 -0
  81. package/mayo-c/src/neon/echelon_form.h +55 -0
  82. package/mayo-c/src/neon/echelon_form_loop.h +58 -0
  83. package/mayo-c/src/neon/shuffle_arithmetic.h +462 -0
  84. package/mayo-c/src/params.c +42 -0
  85. package/mayo-c/src/simple_arithmetic.h +138 -0
  86. package/mayo-c/test/CMakeLists.txt +51 -0
  87. package/mayo-c/test/bench.c +166 -0
  88. package/mayo-c/test/m1cycles.c +155 -0
  89. package/mayo-c/test/m1cycles.h +13 -0
  90. package/mayo-c/test/test_kat.c +271 -0
  91. package/mayo-c/test/test_mayo.c +139 -0
  92. package/mayo-c/test/test_sample_solution.c +75 -0
  93. package/mayo-c/test/test_various.c +680 -0
  94. package/package.json +39 -0
  95. package/publish.bat +22 -0
  96. package/readme.md +80 -0
  97. package/test/test.mjs +42 -0
@@ -0,0 +1,462 @@
1
+ // SPDX-License-Identifier: Apache-2.0
2
+
3
+ #ifndef SHUFFLE_ARITHMETIC_96_H
4
+ #define SHUFFLE_ARITHMETIC_96_H
5
+
6
+ #include <arm_neon.h>
7
+ #include <stdint.h>
8
+ #include <mayo.h>
9
+ #include <arithmetic_common.h>
10
+ #include <arithmetic_fixed.h>
11
+
12
+ #define O_NEON_ROUND_UP ((O_MAX + 1)/2*2) // O_MAX rounded up to the next even number; the multiply macros below consume scalars two at a time
13
+
14
+ #define NEON_REGISTER_PER_M_VEC ((M_MAX + 31)/32) // ceil(M_MAX / 32); presumably the number of 16-byte NEON registers per m_vec at 32 nibbles each - TODO confirm
15
+
16
+ #if M_MAX <= 32
17
+ NOT IMPLEMENTED
18
+ #elif M_MAX <= 64
19
+ // (M_MAX <= 64 variant) Multiply the m_vec at `vec` with a vec of "Len" scalars via the vqtbl1q_u8 tables in `multabs`, and XOR the products into temp. Not hygienic: reads `low_nibble_mask` and `temp` from the calling scope and declares in_odd*/in_even* there (in_oddN = low nibble of each byte, in_evenN = high nibble). One table is used per pair of scalars (multabs[k / 2], k += 2), and each pair fills 4 consecutive temp slots starting at temp[2 * k]. The second 16-byte load starts at vec + M_VEC_LIMBS_MAX - 2.
20
+ #define MULTIPLY_ACCUMULATE_M_VEC(vec, multabs, Len) \
21
+ uint8x16_t in_odd0 = *(uint8x16_t *)(vec); \
22
+ uint8x16_t in_even0 = in_odd0 >> 4; \
23
+ uint8x16_t in_odd1 = *(uint8x16_t *)(vec + M_VEC_LIMBS_MAX - 2); \
24
+ uint8x16_t in_even1 = in_odd1 >> 4; \
25
+ in_odd0 &= low_nibble_mask; \
26
+ in_odd1 &= low_nibble_mask; \
27
+ for (size_t k = 0; k < Len; k += 2) { \
28
+ temp[2 * k] ^= vqtbl1q_u8((multabs)[k / 2], in_odd0); \
29
+ temp[2 * k + 1] ^= vqtbl1q_u8((multabs)[k / 2], in_even0); \
30
+ temp[2 * k + 2] ^= vqtbl1q_u8((multabs)[k / 2], in_odd1); \
31
+ temp[2 * k + 3] ^= vqtbl1q_u8((multabs)[k / 2], in_even1); \
32
+ }
33
+
34
+ // (M_MAX <= 64 variant) Convert the products gathered in `temp` (read from the calling scope) to normal packed format and add (XOR) them to "Len" m_vecs of `acc`, spaced `stride` limbs apart. vsliq_n_u8 / vsriq_n_u8 recombine the two temp halves of a pair into the result for scalar k and scalar k + 1 respectively. The tail window at M_VEC_LIMBS_MAX - 2 is loaded into acc0/acc1 before any store and written with '=' instead of '^=', presumably because that window may overlap the previous one for some parameter sets - TODO confirm. An odd Len gets a final single-vector step.
35
+ #define DISENTANGLE_AND_ACCUMULATE_M_VECS(Len, acc, stride) \
36
+ for (size_t k = 0; k + 1 < Len; k += 2) { \
37
+ uint8x16_t acc0 = *(uint8x16_t *)(acc + k * stride + M_VEC_LIMBS_MAX - 2); \
38
+ uint8x16_t acc1 = *(uint8x16_t *)(acc + (k + 1) * stride + M_VEC_LIMBS_MAX - 2); \
39
+ *(uint8x16_t *)(acc + k * stride) ^= vsliq_n_u8(temp[2 * k], temp[2 * k + 1], 4); \
40
+ *(uint8x16_t *)(acc + k * stride + M_VEC_LIMBS_MAX - 2) = acc0 ^ vsliq_n_u8(temp[2 * k + 2], temp[2 * k + 3], 4); \
41
+ *(uint8x16_t *)(acc + (k + 1) * stride + 0) ^= vsriq_n_u8(temp[2 * k + 1], temp[2 * k], 4); \
42
+ *(uint8x16_t *)(acc + (k + 1) * stride + M_VEC_LIMBS_MAX - 2) = acc1 ^ vsriq_n_u8(temp[2 * k + 3], temp[2 * k + 2], 4); \
43
+ } \
44
+ if (Len % 2 == 1) { \
45
+ const size_t k = Len - 1; \
46
+ uint8x16_t acc0 = *(uint8x16_t *)(acc + k * stride + M_VEC_LIMBS_MAX - 2); \
47
+ *(uint8x16_t *)(acc + k * stride) ^= vsliq_n_u8(temp[2 * k], temp[2 * k + 1], 4); \
48
+ *(uint8x16_t *)(acc + k * stride + M_VEC_LIMBS_MAX - 2) = acc0 ^ vsliq_n_u8(temp[2 * k + 2], temp[2 * k + 3], 4); \
49
+ }
50
+ #elif M_MAX <= 96
51
+ // (M_MAX <= 96 variant) Multiply the m_vec at `vec` with a vec of "Len" scalars via the vqtbl1q_u8 tables in `multabs`, and XOR the products into temp. Not hygienic: reads `low_nibble_mask` and `temp` from the calling scope and declares in_odd*/in_even* there (in_oddN = low nibble of each byte, in_evenN = high nibble). One table is used per pair of scalars (multabs[k / 2], k += 2), and each pair fills 6 consecutive temp slots starting at temp[3 * k]. Loads start at vec, vec + 2 and vec + M_VEC_LIMBS_MAX - 2.
52
+ #define MULTIPLY_ACCUMULATE_M_VEC(vec, multabs, Len) \
53
+ uint8x16_t in_odd0 = *(uint8x16_t *)(vec); \
54
+ uint8x16_t in_even0 = in_odd0 >> 4; \
55
+ uint8x16_t in_odd1 = *(uint8x16_t *)(vec + 2); \
56
+ uint8x16_t in_even1 = in_odd1 >> 4; \
57
+ uint8x16_t in_odd2 = *(uint8x16_t *)(vec + M_VEC_LIMBS_MAX - 2); \
58
+ uint8x16_t in_even2 = in_odd2 >> 4; \
59
+ in_odd0 &= low_nibble_mask; \
60
+ in_odd1 &= low_nibble_mask; \
61
+ in_odd2 &= low_nibble_mask; \
62
+ for (size_t k = 0; k < Len; k += 2) { \
63
+ temp[3 * k] ^= vqtbl1q_u8((multabs)[k / 2], in_odd0); \
64
+ temp[3 * k + 1] ^= vqtbl1q_u8((multabs)[k / 2], in_even0); \
65
+ temp[3 * k + 2] ^= vqtbl1q_u8((multabs)[k / 2], in_odd1); \
66
+ temp[3 * k + 3] ^= vqtbl1q_u8((multabs)[k / 2], in_even1); \
67
+ temp[3 * k + 4] ^= vqtbl1q_u8((multabs)[k / 2], in_odd2); \
68
+ temp[3 * k + 5] ^= vqtbl1q_u8((multabs)[k / 2], in_even2); \
69
+ }
70
+
71
+ // (M_MAX <= 96 variant) Convert the products gathered in `temp` (read from the calling scope) to normal packed format and add (XOR) them to "Len" m_vecs of `acc`, spaced `stride` limbs apart. vsliq_n_u8 / vsriq_n_u8 recombine the two temp halves of a pair into the result for scalar k and scalar k + 1 respectively. The tail window at M_VEC_LIMBS_MAX - 2 is loaded into acc0/acc1 before any store and written with '=' instead of '^=', presumably because that window may overlap the window at offset 2 - TODO confirm. An odd Len gets a final single-vector step.
72
+ #define DISENTANGLE_AND_ACCUMULATE_M_VECS(Len, acc, stride) \
73
+ for (size_t k = 0; k + 1 < Len; k += 2) { \
74
+ uint8x16_t acc0 = *(uint8x16_t *)(acc + k * stride + M_VEC_LIMBS_MAX - 2); \
75
+ uint8x16_t acc1 = *(uint8x16_t *)(acc + (k + 1) * stride + M_VEC_LIMBS_MAX - 2); \
76
+ *(uint8x16_t *)(acc + k * stride) ^= vsliq_n_u8(temp[3 * k], temp[3 * k + 1], 4); \
77
+ *(uint8x16_t *)(acc + k * stride + 2) ^= vsliq_n_u8(temp[3 * k + 2], temp[3 * k + 3], 4); \
78
+ *(uint8x16_t *)(acc + k * stride + M_VEC_LIMBS_MAX - 2) = acc0 ^ vsliq_n_u8(temp[3 * k + 4], temp[3 * k + 5], 4); \
79
+ *(uint8x16_t *)(acc + (k + 1) * stride + 0) ^= vsriq_n_u8(temp[3 * k + 1], temp[3 * k], 4); \
80
+ *(uint8x16_t *)(acc + (k + 1) * stride + 2) ^= vsriq_n_u8(temp[3 * k + 3], temp[3 * k + 2], 4); \
81
+ *(uint8x16_t *)(acc + (k + 1) * stride + M_VEC_LIMBS_MAX - 2) = acc1 ^ vsriq_n_u8(temp[3 * k + 5], temp[3 * k + 4], 4); \
82
+ } \
83
+ if (Len % 2 == 1) { \
84
+ const size_t k = Len - 1; \
85
+ uint8x16_t acc0 = *(uint8x16_t *)(acc + k * stride + M_VEC_LIMBS_MAX - 2); \
86
+ *(uint8x16_t *)(acc + k * stride) ^= vsliq_n_u8(temp[3 * k], temp[3 * k + 1], 4); \
87
+ *(uint8x16_t *)(acc + k * stride + 2) ^= vsliq_n_u8(temp[3 * k + 2], temp[3 * k + 3], 4); \
88
+ *(uint8x16_t *)(acc + k * stride + M_VEC_LIMBS_MAX - 2) = acc0 ^ vsliq_n_u8(temp[3 * k + 4], temp[3 * k + 5], 4); \
89
+ }
90
+ #elif M_MAX <= 128
91
+ // (M_MAX <= 128 variant) Multiply the m_vec at `vec` with a vec of "Len" scalars via the vqtbl1q_u8 tables in `multabs`, and XOR the products into temp. Not hygienic: reads `low_nibble_mask` and `temp` from the calling scope and declares in_odd*/in_even* there (in_oddN = low nibble of each byte, in_evenN = high nibble). One table is used per pair of scalars (multabs[k / 2], k += 2), and each pair fills 8 consecutive temp slots starting at temp[4 * k]. Loads start at vec, vec + 2, vec + 4 and vec + M_VEC_LIMBS_MAX - 2.
92
+ #define MULTIPLY_ACCUMULATE_M_VEC(vec, multabs, Len) \
93
+ uint8x16_t in_odd0 = *(uint8x16_t *)(vec); \
94
+ uint8x16_t in_even0 = in_odd0 >> 4; \
95
+ uint8x16_t in_odd1 = *(uint8x16_t *)(vec + 2); \
96
+ uint8x16_t in_even1 = in_odd1 >> 4; \
97
+ uint8x16_t in_odd2 = *(uint8x16_t *)(vec + 4); \
98
+ uint8x16_t in_even2 = in_odd2 >> 4; \
99
+ uint8x16_t in_odd3 = *(uint8x16_t *)(vec + M_VEC_LIMBS_MAX - 2); \
100
+ uint8x16_t in_even3 = in_odd3 >> 4; \
101
+ in_odd0 &= low_nibble_mask; \
102
+ in_odd1 &= low_nibble_mask; \
103
+ in_odd2 &= low_nibble_mask; \
104
+ in_odd3 &= low_nibble_mask; \
105
+ for (size_t k = 0; k < Len; k += 2) { \
106
+ temp[4 * k] ^= vqtbl1q_u8((multabs)[k / 2], in_odd0); \
107
+ temp[4 * k + 1] ^= vqtbl1q_u8((multabs)[k / 2], in_even0); \
108
+ temp[4 * k + 2] ^= vqtbl1q_u8((multabs)[k / 2], in_odd1); \
109
+ temp[4 * k + 3] ^= vqtbl1q_u8((multabs)[k / 2], in_even1); \
110
+ temp[4 * k + 4] ^= vqtbl1q_u8((multabs)[k / 2], in_odd2); \
111
+ temp[4 * k + 5] ^= vqtbl1q_u8((multabs)[k / 2], in_even2); \
112
+ temp[4 * k + 6] ^= vqtbl1q_u8((multabs)[k / 2], in_odd3); \
113
+ temp[4 * k + 7] ^= vqtbl1q_u8((multabs)[k / 2], in_even3); \
114
+ }
115
+
116
+ // (M_MAX <= 128 variant) Convert the products gathered in `temp` (read from the calling scope) to normal packed format and add (XOR) them to "Len" m_vecs of `acc`, spaced `stride` limbs apart. vsliq_n_u8 / vsriq_n_u8 recombine the two temp halves of a pair into the result for scalar k and scalar k + 1 respectively. The tail window at M_VEC_LIMBS_MAX - 2 is loaded into acc0/acc1 before any store and written with '=' instead of '^=', presumably because that window may overlap the window at offset 4 - TODO confirm. An odd Len gets a final single-vector step.
117
+ #define DISENTANGLE_AND_ACCUMULATE_M_VECS(Len, acc, stride) \
118
+ for (size_t k = 0; k + 1 < Len; k += 2) { \
119
+ uint8x16_t acc0 = *(uint8x16_t *)(acc + k * stride + M_VEC_LIMBS_MAX - 2); \
120
+ uint8x16_t acc1 = *(uint8x16_t *)(acc + (k + 1) * stride + M_VEC_LIMBS_MAX - 2); \
121
+ *(uint8x16_t *)(acc + k * stride) ^= vsliq_n_u8(temp[4 * k], temp[4 * k + 1], 4); \
122
+ *(uint8x16_t *)(acc + k * stride + 2) ^= vsliq_n_u8(temp[4 * k + 2], temp[4 * k + 3], 4); \
123
+ *(uint8x16_t *)(acc + k * stride + 4) ^= vsliq_n_u8(temp[4 * k + 4], temp[4 * k + 5], 4); \
124
+ *(uint8x16_t *)(acc + k * stride + M_VEC_LIMBS_MAX - 2) = acc0 ^ vsliq_n_u8(temp[4 * k + 6], temp[4 * k + 7], 4); \
125
+ *(uint8x16_t *)(acc + (k + 1) * stride + 0) ^= vsriq_n_u8(temp[4 * k + 1], temp[4 * k], 4); \
126
+ *(uint8x16_t *)(acc + (k + 1) * stride + 2) ^= vsriq_n_u8(temp[4 * k + 3], temp[4 * k + 2], 4); \
127
+ *(uint8x16_t *)(acc + (k + 1) * stride + 4) ^= vsriq_n_u8(temp[4 * k + 5], temp[4 * k + 4], 4); \
128
+ *(uint8x16_t *)(acc + (k + 1) * stride + M_VEC_LIMBS_MAX - 2) = acc1 ^ vsriq_n_u8(temp[4 * k + 7], temp[4 * k + 6], 4); \
129
+ } \
130
+ if (Len % 2 == 1) { \
131
+ const size_t k = Len - 1; \
132
+ uint8x16_t acc0 = *(uint8x16_t *)(acc + k * stride + M_VEC_LIMBS_MAX - 2); \
133
+ *(uint8x16_t *)(acc + k * stride) ^= vsliq_n_u8(temp[4 * k], temp[4 * k + 1], 4); \
134
+ *(uint8x16_t *)(acc + k * stride + 2) ^= vsliq_n_u8(temp[4 * k + 2], temp[4 * k + 3], 4); \
135
+ *(uint8x16_t *)(acc + k * stride + 4) ^= vsliq_n_u8(temp[4 * k + 4], temp[4 * k + 5], 4); \
136
+ *(uint8x16_t *)(acc + k * stride + M_VEC_LIMBS_MAX - 2) = acc0 ^ vsliq_n_u8(temp[4 * k + 6], temp[4 * k + 7], 4); \
137
+ }
138
+ #elif M_MAX <= 160
139
+ // (M_MAX <= 160 variant) Multiply the m_vec at `vec` with a vec of "Len" scalars via the vqtbl1q_u8 tables in `multabs`, and XOR the products into temp. Not hygienic: reads `low_nibble_mask` and `temp` from the calling scope and declares in_odd*/in_even* there (in_oddN = low nibble of each byte, in_evenN = high nibble). One table is used per pair of scalars (multabs[k / 2], k += 2), and each pair fills 10 consecutive temp slots starting at temp[5 * k]. Loads start at vec, vec + 2, vec + 4, vec + 6 and vec + M_VEC_LIMBS_MAX - 2.
140
+ #define MULTIPLY_ACCUMULATE_M_VEC(vec, multabs, Len) \
141
+ uint8x16_t in_odd0 = *(uint8x16_t *)(vec); \
142
+ uint8x16_t in_even0 = in_odd0 >> 4; \
143
+ uint8x16_t in_odd1 = *(uint8x16_t *)(vec + 2); \
144
+ uint8x16_t in_even1 = in_odd1 >> 4; \
145
+ uint8x16_t in_odd2 = *(uint8x16_t *)(vec + 4); \
146
+ uint8x16_t in_even2 = in_odd2 >> 4; \
147
+ uint8x16_t in_odd3 = *(uint8x16_t *)(vec + 6); \
148
+ uint8x16_t in_even3 = in_odd3 >> 4; \
149
+ uint8x16_t in_odd4 = *(uint8x16_t *)(vec + M_VEC_LIMBS_MAX - 2); \
150
+ uint8x16_t in_even4 = in_odd4 >> 4; \
151
+ in_odd0 &= low_nibble_mask; \
152
+ in_odd1 &= low_nibble_mask; \
153
+ in_odd2 &= low_nibble_mask; \
154
+ in_odd3 &= low_nibble_mask; \
155
+ in_odd4 &= low_nibble_mask; \
156
+ for (size_t k = 0; k < Len; k += 2) { \
157
+ temp[5 * k] ^= vqtbl1q_u8((multabs)[k / 2], in_odd0); \
158
+ temp[5 * k + 1] ^= vqtbl1q_u8((multabs)[k / 2], in_even0); \
159
+ temp[5 * k + 2] ^= vqtbl1q_u8((multabs)[k / 2], in_odd1); \
160
+ temp[5 * k + 3] ^= vqtbl1q_u8((multabs)[k / 2], in_even1); \
161
+ temp[5 * k + 4] ^= vqtbl1q_u8((multabs)[k / 2], in_odd2); \
162
+ temp[5 * k + 5] ^= vqtbl1q_u8((multabs)[k / 2], in_even2); \
163
+ temp[5 * k + 6] ^= vqtbl1q_u8((multabs)[k / 2], in_odd3); \
164
+ temp[5 * k + 7] ^= vqtbl1q_u8((multabs)[k / 2], in_even3); \
165
+ temp[5 * k + 8] ^= vqtbl1q_u8((multabs)[k / 2], in_odd4); \
166
+ temp[5 * k + 9] ^= vqtbl1q_u8((multabs)[k / 2], in_even4); \
167
+ }
168
+
169
+ // (M_MAX <= 160 variant) Convert the products gathered in `temp` (read from the calling scope) to normal packed format and add (XOR) them to "Len" m_vecs of `acc`, spaced `stride` limbs apart. vsliq_n_u8 / vsriq_n_u8 recombine the two temp halves of a pair into the result for scalar k and scalar k + 1 respectively. The tail window at M_VEC_LIMBS_MAX - 2 is loaded into acc0/acc1 before any store and written with '=' instead of '^=', presumably because that window may overlap the window at offset 6 - TODO confirm. An odd Len gets a final single-vector step.
170
+ #define DISENTANGLE_AND_ACCUMULATE_M_VECS(Len, acc, stride) \
171
+ for (size_t k = 0; k + 1 < Len; k += 2) { \
172
+ uint8x16_t acc0 = *(uint8x16_t *)(acc + k * stride + M_VEC_LIMBS_MAX - 2); \
173
+ uint8x16_t acc1 = *(uint8x16_t *)(acc + (k + 1) * stride + M_VEC_LIMBS_MAX - 2); \
174
+ *(uint8x16_t *)(acc + k * stride) ^= vsliq_n_u8(temp[5 * k], temp[5 * k + 1], 4); \
175
+ *(uint8x16_t *)(acc + k * stride + 2) ^= vsliq_n_u8(temp[5 * k + 2], temp[5 * k + 3], 4); \
176
+ *(uint8x16_t *)(acc + k * stride + 4) ^= vsliq_n_u8(temp[5 * k + 4], temp[5 * k + 5], 4); \
177
+ *(uint8x16_t *)(acc + k * stride + 6) ^= vsliq_n_u8(temp[5 * k + 6], temp[5 * k + 7], 4); \
178
+ *(uint8x16_t *)(acc + k * stride + M_VEC_LIMBS_MAX - 2) = acc0 ^ vsliq_n_u8(temp[5 * k + 8], temp[5 * k + 9], 4); \
179
+ *(uint8x16_t *)(acc + (k + 1) * stride + 0) ^= vsriq_n_u8(temp[5 * k + 1], temp[5 * k], 4); \
180
+ *(uint8x16_t *)(acc + (k + 1) * stride + 2) ^= vsriq_n_u8(temp[5 * k + 3], temp[5 * k + 2], 4); \
181
+ *(uint8x16_t *)(acc + (k + 1) * stride + 4) ^= vsriq_n_u8(temp[5 * k + 5], temp[5 * k + 4], 4); \
182
+ *(uint8x16_t *)(acc + (k + 1) * stride + 6) ^= vsriq_n_u8(temp[5 * k + 7], temp[5 * k + 6], 4); \
183
+ *(uint8x16_t *)(acc + (k + 1) * stride + M_VEC_LIMBS_MAX - 2) = acc1 ^ vsriq_n_u8(temp[5 * k + 9], temp[5 * k + 8], 4); \
184
+ } \
185
+ if (Len % 2 == 1) { \
186
+ const size_t k = Len - 1; \
187
+ uint8x16_t acc0 = *(uint8x16_t *)(acc + k * stride + M_VEC_LIMBS_MAX - 2); \
188
+ *(uint8x16_t *)(acc + k * stride) ^= vsliq_n_u8(temp[5 * k], temp[5 * k + 1], 4); \
189
+ *(uint8x16_t *)(acc + k * stride + 2) ^= vsliq_n_u8(temp[5 * k + 2], temp[5 * k + 3], 4); \
190
+ *(uint8x16_t *)(acc + k * stride + 4) ^= vsliq_n_u8(temp[5 * k + 4], temp[5 * k + 5], 4); \
191
+ *(uint8x16_t *)(acc + k * stride + 6) ^= vsliq_n_u8(temp[5 * k + 6], temp[5 * k + 7], 4); \
192
+ *(uint8x16_t *)(acc + k * stride + M_VEC_LIMBS_MAX - 2) = acc0 ^ vsliq_n_u8(temp[5 * k + 8], temp[5 * k + 9], 4); \
193
+ }
194
+ #else
195
+ NOT IMPLEMENTED
196
+ #endif
197
+
198
+ // P1*0 -> P1: v x v, O: v x o
199
+ static
200
+ inline void P1_times_O(const uint64_t *P1, uint8x16_t *O_multabs, uint64_t *acc){
201
+ const uint8x16_t low_nibble_mask = vdupq_n_u8( 0xf );
202
+ size_t limbs_used = 0;
203
+ for (size_t r = 0; r < V_MAX; r++)
204
+ {
205
+ // do multiplications for one row and accumulate results in temporary format
206
+ uint8x16_t temp[O_NEON_ROUND_UP*NEON_REGISTER_PER_M_VEC] = {0};
207
+ for (size_t c = r; c < V_MAX; c++)
208
+ {
209
+ MULTIPLY_ACCUMULATE_M_VEC(P1 + limbs_used, O_multabs + O_NEON_ROUND_UP/2*c, O_MAX);
210
+ limbs_used += M_VEC_LIMBS_MAX;
211
+ }
212
+ DISENTANGLE_AND_ACCUMULATE_M_VECS(O_MAX , acc + r*O_MAX*M_VEC_LIMBS_MAX, M_VEC_LIMBS_MAX);
213
+ }
214
+ }
215
+
216
+ static
217
+ inline void Ot_times_P1O_P2(const uint64_t *P1O_P2, uint8x16_t *O_multabs, uint64_t *acc){
218
+ const uint8x16_t low_nibble_mask = vdupq_n_u8( 0xf );
219
+ for (size_t c = 0; c < O_MAX; c++)
220
+ {
221
+ // do multiplications for one row and accumulate results in temporary format
222
+ uint8x16_t temp[O_NEON_ROUND_UP*NEON_REGISTER_PER_M_VEC] = {0};
223
+ for (size_t r = 0; r < V_MAX; r++)
224
+ {
225
+ MULTIPLY_ACCUMULATE_M_VEC(P1O_P2 + (r*O_MAX + c)*M_VEC_LIMBS_MAX, O_multabs + O_NEON_ROUND_UP/2*r, O_MAX);
226
+ }
227
+ DISENTANGLE_AND_ACCUMULATE_M_VECS(O_MAX , acc + c*M_VEC_LIMBS_MAX, O_MAX*M_VEC_LIMBS_MAX);
228
+ }
229
+ }
230
+
231
+ static
232
+ inline void P1P1t_times_O(const mayo_params_t* p, const uint64_t *P1, const unsigned char *O, uint64_t *acc){
233
+ (void) p;
234
+ const uint8x16_t low_nibble_mask = vdupq_n_u8( 0xf );
235
+
236
+ uint8x16_t O_multabs[O_NEON_ROUND_UP/2*V_MAX];
237
+ mayo_O_multabs(O, O_multabs);
238
+
239
+ size_t cols_used = 0;
240
+ for (size_t r = 0; r < V_MAX; r++)
241
+ {
242
+ // do multiplications for one row and accumulate results in temporary format
243
+ uint8x16_t temp[O_NEON_ROUND_UP*NEON_REGISTER_PER_M_VEC] = {0};
244
+ cols_used += 1;
245
+ size_t pos = r;
246
+ for (size_t c = 0; c < r; c++)
247
+ {
248
+ MULTIPLY_ACCUMULATE_M_VEC(P1 + pos * M_VEC_LIMBS_MAX, O_multabs + O_NEON_ROUND_UP/2*c, O_MAX);
249
+ pos += (V_MAX -c - 1);
250
+ }
251
+
252
+ for (size_t c = r+1; c < V_MAX; c++)
253
+ {
254
+ MULTIPLY_ACCUMULATE_M_VEC(P1 + cols_used * M_VEC_LIMBS_MAX, O_multabs + O_NEON_ROUND_UP/2*c, O_MAX);
255
+ cols_used ++;
256
+ }
257
+ DISENTANGLE_AND_ACCUMULATE_M_VECS(O_MAX, acc + r*O_MAX*M_VEC_LIMBS_MAX, M_VEC_LIMBS_MAX);
258
+ }
259
+ }
260
+
261
+ static
262
+ inline void Vt_times_L(const uint64_t *L, const uint8x16_t *V_multabs, uint64_t *acc){
263
+ const uint8x16_t low_nibble_mask = vdupq_n_u8( 0xf );
264
+
265
+ for (size_t c = 0; c < O_MAX; c++)
266
+ {
267
+ // do multiplications for one row and accumulate results in temporary format
268
+ uint8x16_t temp[K_OVER_2*2*NEON_REGISTER_PER_M_VEC] = {0};
269
+ for (size_t r = 0; r < V_MAX; r++)
270
+ {
271
+ MULTIPLY_ACCUMULATE_M_VEC(L + (r*O_MAX + c) * M_VEC_LIMBS_MAX, V_multabs + K_OVER_2*r, K_MAX);
272
+ }
273
+ DISENTANGLE_AND_ACCUMULATE_M_VECS(K_MAX , acc + c*M_VEC_LIMBS_MAX, O_MAX*M_VEC_LIMBS_MAX);
274
+ }
275
+ }
276
+
277
+ static
278
+ inline void Vt_times_Pv(const uint64_t *Pv, const uint8x16_t *V_multabs, uint64_t *acc){
279
+ const uint8x16_t low_nibble_mask = vdupq_n_u8( 0xf );
280
+
281
+ for (size_t c = 0; c < K_MAX; c++)
282
+ {
283
+ // do multiplications for one row and accumulate results in temporary format
284
+ uint8x16_t temp[K_OVER_2*2*NEON_REGISTER_PER_M_VEC] = {0};
285
+ for (size_t r = 0; r < V_MAX; r++)
286
+ {
287
+ MULTIPLY_ACCUMULATE_M_VEC(Pv + (r*K_MAX + c) * M_VEC_LIMBS_MAX, V_multabs + K_OVER_2*r, K_MAX);
288
+ }
289
+ DISENTANGLE_AND_ACCUMULATE_M_VECS(K_MAX , acc + c*M_VEC_LIMBS_MAX, K_MAX*M_VEC_LIMBS_MAX);
290
+ }
291
+ }
292
+
293
+ static
294
+ inline void P1_times_Vt(const uint64_t *P1, uint8x16_t *V_multabs, uint64_t *acc){
295
+ const uint8x16_t low_nibble_mask = vdupq_n_u8( 0xf );
296
+
297
+ size_t cols_used = 0;
298
+ for (size_t r = 0; r < V_MAX; r++)
299
+ {
300
+ // do multiplications for one row and accumulate results in temporary format
301
+ uint8x16_t temp[K_OVER_2*2*NEON_REGISTER_PER_M_VEC] = {0};
302
+
303
+ for (size_t c=r; c < V_MAX; c++)
304
+ {
305
+ MULTIPLY_ACCUMULATE_M_VEC(P1 + cols_used, V_multabs + K_OVER_2*c, K_MAX);
306
+ cols_used += M_VEC_LIMBS_MAX;
307
+ }
308
+ DISENTANGLE_AND_ACCUMULATE_M_VECS(K_MAX , acc + r*K_MAX*M_VEC_LIMBS_MAX, M_VEC_LIMBS_MAX);
309
+ }
310
+ }
311
+
312
+ // P1*S1 -> P1: v x v, S1: v x k // P1 upper triangular
313
+ // same as mayo_12_P1_times_Vt
314
+ static
315
+ inline void P1_times_S1(const uint64_t *_P1, uint8x16_t *S1_multabs, uint64_t *_acc){
316
+ P1_times_Vt(_P1, S1_multabs, _acc);
317
+ }
318
+
319
+ static
320
+ inline void S1t_times_PS1(const uint64_t *_PS1, uint8x16_t *S1_multabs, uint64_t *_acc){
321
+ Vt_times_Pv(_PS1, S1_multabs, _acc);
322
+ }
323
+
324
+ static
325
+ inline void S2t_times_PS2(const uint64_t *PS2, uint8x16_t *S2_multabs, uint64_t *acc){
326
+ const uint8x16_t low_nibble_mask = vdupq_n_u8( 0xf );
327
+
328
+ for (size_t c = 0; c < K_MAX; c++)
329
+ {
330
+ // do multiplications for one row and accumulate results in temporary format
331
+ uint8x16_t temp[K_OVER_2*2*NEON_REGISTER_PER_M_VEC] = {0};
332
+ for (size_t r = 0; r < O_MAX; r++)
333
+ {
334
+ MULTIPLY_ACCUMULATE_M_VEC(PS2 + (r*K_MAX + c)*M_VEC_LIMBS_MAX, S2_multabs + K_OVER_2*r, K_MAX);
335
+ }
336
+ DISENTANGLE_AND_ACCUMULATE_M_VECS(K_MAX , acc + c*M_VEC_LIMBS_MAX, K_MAX*M_VEC_LIMBS_MAX);
337
+ }
338
+ }
339
+
340
+
341
+ // P2*S2 -> P2: v x o, S2: o x k
342
+ static
343
+ inline void P1_times_S1_plus_P2_times_S2(const uint64_t *P1, const uint64_t *P2, uint8x16_t *S1_multabs, uint8x16_t *S2_multabs, uint64_t *acc){
344
+ const uint8x16_t low_nibble_mask = vdupq_n_u8( 0xf );
345
+
346
+ size_t P1_cols_used = 0;
347
+ for (size_t r = 0; r < V_MAX; r++)
348
+ {
349
+ // do multiplications for one row and accumulate results in temporary format
350
+ uint8x16_t temp[K_OVER_2*2*NEON_REGISTER_PER_M_VEC] = {0};
351
+
352
+ // P1 * S1
353
+ for (size_t c = r; c < V_MAX; c++)
354
+ {
355
+ MULTIPLY_ACCUMULATE_M_VEC(P1 + P1_cols_used, S1_multabs + K_OVER_2*c, K_MAX);
356
+ P1_cols_used += M_VEC_LIMBS_MAX;
357
+ }
358
+
359
+ // P2 * S2
360
+ for (size_t c = 0; c < O_MAX; c++)
361
+ {
362
+ MULTIPLY_ACCUMULATE_M_VEC(P2 + (r*O_MAX + c)*M_VEC_LIMBS_MAX, S2_multabs + K_OVER_2*c, K_MAX);
363
+ }
364
+
365
+ DISENTANGLE_AND_ACCUMULATE_M_VECS(K_MAX , acc + r*K_MAX*M_VEC_LIMBS_MAX, M_VEC_LIMBS_MAX);
366
+ }
367
+ }
368
+
369
+
370
+ // P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular
371
+ static
372
+ inline void P3_times_S2(const uint64_t *P3, uint8x16_t *S2_multabs, uint64_t *acc){
373
+ const uint8x16_t low_nibble_mask = vdupq_n_u8( 0xf );
374
+
375
+ size_t cols_used = 0;
376
+ for (size_t r = 0; r < O_MAX; r++)
377
+ {
378
+ // do multiplications for one row and accumulate results in temporary format
379
+ uint8x16_t temp[K_OVER_2*2*NEON_REGISTER_PER_M_VEC] = {0};
380
+ for (size_t c = r; c < O_MAX; c++)
381
+ {
382
+ MULTIPLY_ACCUMULATE_M_VEC(P3 + cols_used, S2_multabs + K_OVER_2*c, K_MAX);
383
+ cols_used += M_VEC_LIMBS_MAX;
384
+ }
385
+
386
+ DISENTANGLE_AND_ACCUMULATE_M_VECS(K_MAX , acc + r*K_MAX*M_VEC_LIMBS_MAX, M_VEC_LIMBS_MAX);
387
+ }
388
+ }
389
+
390
+ static inline
391
+ void compute_M_and_VPV(const mayo_params_t* p, const unsigned char* Vdec, const uint64_t *L, const uint64_t *P1, uint64_t *VL, uint64_t *VP1V){
392
+ (void) p;
393
+ uint8x16_t V_multabs[(K_MAX+1)/2*V_MAX];
394
+ mayo_V_multabs(Vdec, V_multabs);
395
+
396
+ // M
397
+ Vt_times_L(L, V_multabs, VL);
398
+
399
+ // VP1V
400
+ uint64_t Pv[V_MAX * K_MAX * M_VEC_LIMBS_MAX] = {0};
401
+ P1_times_Vt(P1, V_multabs, Pv);
402
+ Vt_times_Pv(Pv, V_multabs, VP1V);
403
+ }
404
+
405
+ static inline
406
+ void compute_P3(const mayo_params_t* p, const uint64_t* P1, uint64_t *P2, const unsigned char *O, uint64_t *P3){
407
+ (void) p;
408
+ uint8x16_t O_multabs[O_NEON_ROUND_UP/2*V_MAX];
409
+ mayo_O_multabs(O, O_multabs);
410
+ P1_times_O(P1, O_multabs, P2);
411
+ Ot_times_P1O_P2(P2, O_multabs, P3);
412
+ }
413
+
414
+ // compute P * S^t = [ P1 P2 ] * [S1] = [P1*S1 + P2*S2]
415
+ // [ 0 P3 ] [S2] [ P3*S2]
416
+ // compute S * PS = [ S1 S2 ] * [ P1*S1 + P2*S2 = P1 ] = [ S1*P1 + S2*P2 ]
417
+ // [ P3*S2 = P2 ]
418
+ static inline void m_calculate_PS_SPS(const mayo_params_t *p, const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
419
+ uint64_t *SPS) {
420
+ (void) p;
421
+ const int o = PARAM_NAME(o);
422
+ const int v = PARAM_NAME(v);
423
+ const int k = PARAM_NAME(k);
424
+ const int n = o + v;
425
+ /* Old approach which is constant time but doesn't have to be */
426
+ unsigned char S1[V_MAX*K_MAX]; // == N-O, K
427
+ unsigned char S2[O_MAX*K_MAX]; // == O, K
428
+ unsigned char *s1_write = S1;
429
+ unsigned char *s2_write = S2;
430
+
431
+ for (int r=0; r < k; r++)
432
+ {
433
+ for (int c = 0; c < n; c++)
434
+ {
435
+ if(c < v){
436
+ *(s1_write++) = S[r*n + c];
437
+ } else {
438
+ *(s2_write++) = S[r*n + c];
439
+ }
440
+ }
441
+ }
442
+
443
+ uint64_t PS[N_MAX * K_MAX * M_VEC_LIMBS_MAX] = { 0 };
444
+ (void) PS;
445
+
446
+ uint8x16_t S1_multabs[(K_MAX+1)/2*V_MAX];
447
+ uint8x16_t S2_multabs[(K_MAX+1)/2*O_MAX];
448
+ mayo_S1_multabs(S1, S1_multabs);
449
+ mayo_S2_multabs(S2, S2_multabs);
450
+
451
+ P1_times_S1_plus_P2_times_S2(P1, P2, S1_multabs, S2_multabs, PS);
452
+ P3_times_S2(P3, S2_multabs, PS + V_MAX*K_MAX*M_VEC_LIMBS_MAX); // upper triangular
453
+
454
+ // S^T * PS = S1^t*PS1 + S2^t*PS2
455
+ S1t_times_PS1(PS, S1_multabs, SPS);
456
+ S2t_times_PS2(PS + V_MAX*K_MAX*M_VEC_LIMBS_MAX, S2_multabs, SPS);
457
+ }
458
+
459
+
460
+ #undef K_OVER_2
461
+ #endif
462
+
@@ -0,0 +1,42 @@
1
+ // SPDX-License-Identifier: Apache-2.0
2
+
3
+ #include <mayo.h>
4
+
5
+ #ifdef ENABLE_PARAMS_DYNAMIC // everything down to the matching #endif is compiled only when ENABLE_PARAMS_DYNAMIC is defined
6
+ static const unsigned char f_tail_64[] = F_TAIL_64; // the F_TAIL_* initializers are not defined in this file (presumably they come from mayo.h - TODO confirm)
7
+ static const unsigned char f_tail_78[] = F_TAIL_78;
8
+ static const unsigned char f_tail_108[] = F_TAIL_108;
9
+ static const unsigned char f_tail_142[] = F_TAIL_142;
10
+
11
+ #define MAYO_GEN_PARAMS(nm) /* defines a const mayo_params_t named nm; every field is filled from the constant PARAM_JOIN2(nm, field) and .name is the stringified nm */ \
12
+ const mayo_params_t nm = { \
13
+ .m = PARAM_JOIN2(nm, m), \
14
+ .n = PARAM_JOIN2(nm, n), \
15
+ .o = PARAM_JOIN2(nm, o), \
16
+ .k = PARAM_JOIN2(nm, k), \
17
+ .q = PARAM_JOIN2(nm, q), \
18
+ .f_tail = PARAM_JOIN2(nm, f_tail_arr), \
19
+ .m_bytes = PARAM_JOIN2(nm, m_bytes), \
20
+ .O_bytes = PARAM_JOIN2(nm, O_bytes), \
21
+ .v_bytes = PARAM_JOIN2(nm, v_bytes), \
22
+ .r_bytes = PARAM_JOIN2(nm, r_bytes), \
23
+ .P1_bytes = PARAM_JOIN2(nm, P1_bytes), \
24
+ .P2_bytes = PARAM_JOIN2(nm, P2_bytes), \
25
+ .P3_bytes = PARAM_JOIN2(nm, P3_bytes), \
26
+ .csk_bytes = PARAM_JOIN2(nm, csk_bytes), \
27
+ .cpk_bytes = PARAM_JOIN2(nm, cpk_bytes), \
28
+ .sig_bytes = PARAM_JOIN2(nm, sig_bytes), \
29
+ .salt_bytes = PARAM_JOIN2(nm, salt_bytes), \
30
+ .sk_seed_bytes = PARAM_JOIN2(nm, sk_seed_bytes), \
31
+ .digest_bytes = PARAM_JOIN2(nm, digest_bytes), \
32
+ .pk_seed_bytes = PARAM_JOIN2(nm, pk_seed_bytes), \
33
+ .m_vec_limbs = PARAM_JOIN2(nm, m_vec_limbs), \
34
+ .name = #nm \
35
+ };
36
+
37
+ MAYO_GEN_PARAMS(MAYO_1); // one non-static definition per parameter set: MAYO_1, MAYO_2, MAYO_3, MAYO_5
38
+ MAYO_GEN_PARAMS(MAYO_2);
39
+ MAYO_GEN_PARAMS(MAYO_3);
40
+ MAYO_GEN_PARAMS(MAYO_5);
41
+ #endif
42
+
@@ -0,0 +1,138 @@
1
+ // SPDX-License-Identifier: Apache-2.0
2
+
3
+ #ifndef SIMPLE_ARITHMETIC_H
4
+ #define SIMPLE_ARITHMETIC_H
5
+ #include <mem.h>
6
+
/*
 * GF(16) multiplication mod x^4 + x + 1.
 *
 * a and b are field elements held in the low nibble of a byte. The product is
 * formed as a carry-less (XOR) multiplication, which leaves up to 7 bits, and
 * the bits above the low nibble are then folded back using x^4 = x + 1.
 * Returns the reduced product in the low nibble.
 */
static inline unsigned char mul_f(unsigned char a, unsigned char b) {
#if !(((defined(__clang__) && __clang_major__ < 15) || (!defined(__clang__) && defined(__GNUC__) && __GNUC__ <= 12)) && (defined(__x86_64__) || defined(_M_X64)))
    /* unsigned_char_blocker comes from mem.h; presumably an opaque zero that
       keeps the compiler from specialising on the value of a - TODO confirm */
    a ^= unsigned_char_blocker;
#endif

    /* carry-less multiply: XOR together b shifted by every set bit of a */
    const unsigned char product = (unsigned char)(((a & 1) * b)
                                                  ^ ((a & 2) * b)
                                                  ^ ((a & 4) * b)
                                                  ^ ((a & 8) * b));

    /* reduce mod x^4 + x + 1: each overflow bit x^(4+i) contributes x^(i+1) + x^i */
    const unsigned char overflow = product & 0xf0;
    return (unsigned char)((product ^ (overflow >> 4) ^ (overflow >> 3)) & 0x0f);
}
26
+
/*
 * Multiplies eight GF(16) elements at once by the scalar a.
 *
 * b carries one field element in the low nibble of each of its 8 bytes; the
 * high nibbles give the carry-less products room to grow before reduction.
 * Returns the eight reduced products in the same one-element-per-byte layout.
 */
static inline uint64_t mul_fx8(unsigned char a, uint64_t b) {
    /* carry-less multiply, byte lanes in parallel */
    const uint64_t product = ((a & 1) * b)
                             ^ ((a & 2) * b)
                             ^ ((a & 4) * b)
                             ^ ((a & 8) * b);

    /* reduce every lane mod x^4 + x + 1 */
    const uint64_t overflow = product & 0xf0f0f0f0f0f0f0f0;
    return (product ^ (overflow >> 4) ^ (overflow >> 3)) & 0x0f0f0f0f0f0f0f0f;
}
40
+
/* GF(16) addition: in characteristic 2 this is the bitwise XOR of the operands. */
static inline unsigned char add_f(unsigned char a, unsigned char b) {
    const unsigned char sum = a ^ b;
    return sum;
}
45
+
/* GF(16) subtraction: identical to addition in characteristic 2, i.e. a XOR b. */
static inline unsigned char sub_f(unsigned char a, unsigned char b) {
    const unsigned char difference = a ^ b;
    return difference;
}
50
+
/* GF(16) negation: every element is its own additive inverse, so a is returned unchanged. */
static inline unsigned char neg_f(unsigned char a) {
    const unsigned char negated = a;
    return negated;
}
55
+
/*
 * GF(16) inversion computed as a^14 with five mul_f calls and no table lookup.
 *
 * The nonzero elements form a group of order 15, so a^14 = a^-1 for a != 0.
 * The equivalent lookup table would be
 *   {0, 1, 9, 14, 13, 11, 7, 6, 15, 2, 12, 5, 10, 4, 3, 8}[a & 15].
 */
static inline unsigned char inverse_f(unsigned char a) {
    const unsigned char pow2 = mul_f(a, a);
    const unsigned char pow4 = mul_f(pow2, pow2);
    const unsigned char pow8 = mul_f(pow4, pow4);

    /* a^14 = a^8 * (a^2 * a^4) */
    return mul_f(pow8, mul_f(pow2, pow4));
}
68
+
/*
 * Dot product over GF(16) of n consecutive entries of a with n entries of b
 * taken m bytes apart (b[0], b[m], b[2m], ...).
 * Returns 0 when n <= 0.
 */
static inline unsigned char lincomb(const unsigned char *a,
                                    const unsigned char *b, int n, int m) {
    unsigned char sum = 0;
    const unsigned char *strided = b;

    for (int i = 0; i < n; ++i) {
        sum = add_f(mul_f(a[i], *strided), sum);
        strided += m;
    }
    return sum;
}
77
+
/*
 * Row-major matrix product over GF(16): c = a * b, where a is
 * row_a x colrow_ab, b is colrow_ab x col_b and c is row_a x col_b.
 * Each output entry is one lincomb of a row of a with a column of b.
 */
static inline void mat_mul(const unsigned char *a, const unsigned char *b,
                           unsigned char *c, int colrow_ab, int row_a, int col_b) {
    const unsigned char *a_row = a;
    unsigned char *out = c;

    for (int i = 0; i < row_a; ++i) {
        for (int j = 0; j < col_b; ++j) {
            /* column j of b starts at b + j and steps by col_b */
            *out = lincomb(a_row, b + j, colrow_ab, col_b);
            ++out;
        }
        a_row += colrow_ab;
    }
}
86
+
/*
 * Entry-wise sum over GF(16) of two row-major m x n matrices: c = a + b.
 */
static inline void mat_add(const unsigned char *a, const unsigned char *b,
                           unsigned char *c, int m, int n) {
    for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
            const int idx = i * n + j;
            c[idx] = add_f(a[idx], b[idx]);
        }
    }
}
95
+
/*
 * Multiplies 16 nibble-packed GF(16) elements (the 64-bit word a) by the
 * scalar b, reducing mod x^4 + x + 1.
 *
 * The word is repeatedly multiplied by x (three times) and each multiple is
 * XORed into the result when the corresponding bit of b is set; the selection
 * is done by multiplying with the 0/1 bit rather than by branching.
 */
static inline uint64_t gf16v_mul_u64( uint64_t a, uint8_t b ) {
    const uint64_t msb_mask = 0x8888888888888888ULL;   /* top bit of every nibble */
    uint64_t multiple = a;                              /* a * x^i, starting with i = 0 */
#if !(((defined(__clang__) && __clang_major__ < 15) || (!defined(__clang__) && defined(__GNUC__) && __GNUC__ <= 12)) && (defined(__x86_64__) || defined(_M_X64)))
    /* unsigned_char_blocker comes from mem.h; presumably an opaque zero that
       keeps the compiler from specialising on the value of b - TODO confirm */
    const uint64_t scalar = b ^ unsigned_char_blocker;
#else
    const uint64_t scalar = b;
#endif
    uint64_t result = multiple * (scalar & 1);

    for (unsigned bit = 1; bit < 4; ++bit) {
        /* multiply every nibble by x: shift left, and fold a dropped x^3 term
           back in as x + 1 (= 3) */
        const uint64_t top = multiple & msb_mask;
        multiple ^= top;
        multiple = (multiple << 1) ^ ((top >> 3) * 3);

        result ^= multiple * ((scalar >> bit) & 1);
    }

    return result;
}
124
+
125
+ // This implements arithmetic for nibble-packed vectors of m field elements in Z_2[x]/(x^4+x+1)
126
+ // gf16 := gf2[x]/(x^4+x+1)
127
+
/*
 * Packs the four products b*1, b*2, b*4 and b*8 in gf16 into one 32-bit word,
 * one product per byte (byte 0 = b*1 ... byte 3 = b*8).
 *
 * Only the low nibble of each byte holds the reduced product; the high
 * nibbles are not cleared and still contain the pre-reduction overflow bits.
 */
static inline uint32_t mul_table(uint8_t b){
    /* place b, b<<1, b<<2, b<<3 into the four bytes */
    const uint32_t spread = ((uint32_t) b) * 0x08040201;

    /* fold the overflow bits of every byte back using x^4 = x + 1 */
    const uint32_t overflow = spread & 0xf0f0f0f0;
    return spread ^ (overflow >> 4) ^ (overflow >> 3);
}
136
+
137
+ #endif
138
+