ed25519_blake2b 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61) — hide / show
  1. checksums.yaml +7 -0
  2. data/.gitignore +9 -0
  3. data/CODE_OF_CONDUCT.md +74 -0
  4. data/Gemfile +6 -0
  5. data/Gemfile.lock +23 -0
  6. data/LICENSE +21 -0
  7. data/README.md +39 -0
  8. data/Rakefile +13 -0
  9. data/bin/console +14 -0
  10. data/bin/setup +8 -0
  11. data/ed25519_blake2b.gemspec +31 -0
  12. data/ext/ed25519_blake2b/blake2-config.h +72 -0
  13. data/ext/ed25519_blake2b/blake2-impl.h +160 -0
  14. data/ext/ed25519_blake2b/blake2.h +195 -0
  15. data/ext/ed25519_blake2b/blake2b-load-sse2.h +68 -0
  16. data/ext/ed25519_blake2b/blake2b-load-sse41.h +402 -0
  17. data/ext/ed25519_blake2b/blake2b-ref.c +373 -0
  18. data/ext/ed25519_blake2b/blake2b-round.h +157 -0
  19. data/ext/ed25519_blake2b/curve25519-donna-32bit.h +579 -0
  20. data/ext/ed25519_blake2b/curve25519-donna-64bit.h +413 -0
  21. data/ext/ed25519_blake2b/curve25519-donna-helpers.h +67 -0
  22. data/ext/ed25519_blake2b/curve25519-donna-sse2.h +1112 -0
  23. data/ext/ed25519_blake2b/ed25519-donna-32bit-sse2.h +513 -0
  24. data/ext/ed25519_blake2b/ed25519-donna-32bit-tables.h +61 -0
  25. data/ext/ed25519_blake2b/ed25519-donna-64bit-sse2.h +436 -0
  26. data/ext/ed25519_blake2b/ed25519-donna-64bit-tables.h +53 -0
  27. data/ext/ed25519_blake2b/ed25519-donna-64bit-x86-32bit.h +435 -0
  28. data/ext/ed25519_blake2b/ed25519-donna-64bit-x86.h +351 -0
  29. data/ext/ed25519_blake2b/ed25519-donna-basepoint-table.h +259 -0
  30. data/ext/ed25519_blake2b/ed25519-donna-batchverify.h +275 -0
  31. data/ext/ed25519_blake2b/ed25519-donna-impl-base.h +364 -0
  32. data/ext/ed25519_blake2b/ed25519-donna-impl-sse2.h +390 -0
  33. data/ext/ed25519_blake2b/ed25519-donna-portable-identify.h +103 -0
  34. data/ext/ed25519_blake2b/ed25519-donna-portable.h +135 -0
  35. data/ext/ed25519_blake2b/ed25519-donna.h +115 -0
  36. data/ext/ed25519_blake2b/ed25519-hash-custom.c +28 -0
  37. data/ext/ed25519_blake2b/ed25519-hash-custom.h +30 -0
  38. data/ext/ed25519_blake2b/ed25519-hash.h +219 -0
  39. data/ext/ed25519_blake2b/ed25519-randombytes-custom.h +10 -0
  40. data/ext/ed25519_blake2b/ed25519-randombytes.h +91 -0
  41. data/ext/ed25519_blake2b/ed25519.c +150 -0
  42. data/ext/ed25519_blake2b/ed25519.h +30 -0
  43. data/ext/ed25519_blake2b/extconf.rb +3 -0
  44. data/ext/ed25519_blake2b/fuzz/README.md +173 -0
  45. data/ext/ed25519_blake2b/fuzz/build-nix.php +134 -0
  46. data/ext/ed25519_blake2b/fuzz/curve25519-ref10.c +1272 -0
  47. data/ext/ed25519_blake2b/fuzz/curve25519-ref10.h +8 -0
  48. data/ext/ed25519_blake2b/fuzz/ed25519-donna-sse2.c +3 -0
  49. data/ext/ed25519_blake2b/fuzz/ed25519-donna.c +1 -0
  50. data/ext/ed25519_blake2b/fuzz/ed25519-donna.h +34 -0
  51. data/ext/ed25519_blake2b/fuzz/ed25519-ref10.c +4647 -0
  52. data/ext/ed25519_blake2b/fuzz/ed25519-ref10.h +9 -0
  53. data/ext/ed25519_blake2b/fuzz/fuzz-curve25519.c +172 -0
  54. data/ext/ed25519_blake2b/fuzz/fuzz-ed25519.c +219 -0
  55. data/ext/ed25519_blake2b/modm-donna-32bit.h +469 -0
  56. data/ext/ed25519_blake2b/modm-donna-64bit.h +361 -0
  57. data/ext/ed25519_blake2b/rbext.c +25 -0
  58. data/ext/ed25519_blake2b/regression.h +1024 -0
  59. data/lib/ed25519_blake2b/ed25519_blake2b.rb +4 -0
  60. data/lib/ed25519_blake2b/version.rb +3 -0
  61. metadata +147 -0
@@ -0,0 +1,1112 @@
1
/*
	Public domain by Andrew M. <liquidsun@gmail.com>
	See: https://github.com/floodyberry/curve25519-donna

	SSE2 curve25519 implementation
*/

#include <emmintrin.h>
/* Shorthand for one 128-bit SSE2 register. */
typedef __m128i xmmi;

/* 16 bytes viewed either as raw bytes or as one SSE2 register. */
typedef union packedelem8_t {
	unsigned char u[16];
	xmmi v;
} packedelem8;

/* Four 32-bit lanes viewed either as uint32_t[4] or as one SSE2 register. */
typedef union packedelem32_t {
	uint32_t u[4];
	xmmi v;
} packedelem32;

/* Two 64-bit lanes viewed either as uint64_t[2] or as one SSE2 register. */
typedef union packedelem64_t {
	uint64_t u[2];
	xmmi v;
} packedelem64;

/* 10 elements + an extra 2 to fit in 3 xmm registers */
typedef uint32_t bignum25519[12];
typedef packedelem32 packed32bignum25519[5];
typedef packedelem64 packed64bignum25519[10];

/* Lane-selection masks: keep the low/high 32 bits of each 64-bit lane,
   or the high/low 64-bit lane of the register. */
static const packedelem32 bot32bitmask = {{0xffffffff, 0x00000000, 0xffffffff, 0x00000000}};
static const packedelem32 top32bitmask = {{0x00000000, 0xffffffff, 0x00000000, 0xffffffff}};
static const packedelem32 top64bitmask = {{0x00000000, 0x00000000, 0xffffffff, 0xffffffff}};
static const packedelem32 bot64bitmask = {{0xffffffff, 0xffffffff, 0x00000000, 0x00000000}};

/* reduction masks: limbs alternate 26-bit (even) and 25-bit (odd) widths */
static const packedelem64 packedmask26 = {{0x03ffffff, 0x03ffffff}};
static const packedelem64 packedmask25 = {{0x01ffffff, 0x01ffffff}};
static const packedelem32 packedmask2625 = {{0x3ffffff,0,0x1ffffff,0}};
static const packedelem32 packedmask26262626 = {{0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff}};
static const packedelem32 packedmask25252525 = {{0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}};

/* multipliers: 19 = 2^255 mod p, used to fold top-limb carries back in */
static const packedelem64 packednineteen = {{19, 19}};
static const packedelem64 packednineteenone = {{19, 1}};
static const packedelem64 packedthirtyeight = {{38, 38}};
static const packedelem64 packed3819 = {{19*2,19}};
static const packedelem64 packed9638 = {{19*4,19*2}};

/* 121666,121665 */
static const packedelem64 packed121666121665 = {{121666, 121665}};

/* 2*(2^255 - 19) = 0 mod p; added before subtraction to keep limbs non-negative */
static const packedelem32 packed2p0 = {{0x7ffffda,0x3fffffe,0x7fffffe,0x3fffffe}};
static const packedelem32 packed2p1 = {{0x7fffffe,0x3fffffe,0x7fffffe,0x3fffffe}};
static const packedelem32 packed2p2 = {{0x7fffffe,0x3fffffe,0x0000000,0x0000000}};

static const packedelem32 packed32packed2p0 = {{0x7ffffda,0x7ffffda,0x3fffffe,0x3fffffe}};
static const packedelem32 packed32packed2p1 = {{0x7fffffe,0x7fffffe,0x3fffffe,0x3fffffe}};

/* 4*(2^255 - 19) = 0 mod p */
static const packedelem32 packed4p0 = {{0xfffffb4,0x7fffffc,0xffffffc,0x7fffffc}};
static const packedelem32 packed4p1 = {{0xffffffc,0x7fffffc,0xffffffc,0x7fffffc}};
static const packedelem32 packed4p2 = {{0xffffffc,0x7fffffc,0x0000000,0x0000000}};

static const packedelem32 packed32packed4p0 = {{0xfffffb4,0xfffffb4,0x7fffffc,0x7fffffc}};
static const packedelem32 packed32packed4p1 = {{0xffffffc,0xffffffc,0x7fffffc,0x7fffffc}};
69
+ /* out = in */
70
+ DONNA_INLINE static void
71
+ curve25519_copy(bignum25519 out, const bignum25519 in) {
72
+ xmmi x0,x1,x2;
73
+ x0 = _mm_load_si128((xmmi*)in + 0);
74
+ x1 = _mm_load_si128((xmmi*)in + 1);
75
+ x2 = _mm_load_si128((xmmi*)in + 2);
76
+ _mm_store_si128((xmmi*)out + 0, x0);
77
+ _mm_store_si128((xmmi*)out + 1, x1);
78
+ _mm_store_si128((xmmi*)out + 2, x2);
79
+ }
80
+
81
+ /* out = a + b */
82
+ DONNA_INLINE static void
83
+ curve25519_add(bignum25519 out, const bignum25519 a, const bignum25519 b) {
84
+ xmmi a0,a1,a2,b0,b1,b2;
85
+ a0 = _mm_load_si128((xmmi*)a + 0);
86
+ a1 = _mm_load_si128((xmmi*)a + 1);
87
+ a2 = _mm_load_si128((xmmi*)a + 2);
88
+ b0 = _mm_load_si128((xmmi*)b + 0);
89
+ b1 = _mm_load_si128((xmmi*)b + 1);
90
+ b2 = _mm_load_si128((xmmi*)b + 2);
91
+ a0 = _mm_add_epi32(a0, b0);
92
+ a1 = _mm_add_epi32(a1, b1);
93
+ a2 = _mm_add_epi32(a2, b2);
94
+ _mm_store_si128((xmmi*)out + 0, a0);
95
+ _mm_store_si128((xmmi*)out + 1, a1);
96
+ _mm_store_si128((xmmi*)out + 2, a2);
97
+ }
98
+
99
/* out = a + b, then a partial carry-chain reduction so limbs fit 26/25 bits. */
#define curve25519_add_after_basic curve25519_add_reduce
DONNA_INLINE static void
curve25519_add_reduce(bignum25519 out, const bignum25519 a, const bignum25519 b) {
	xmmi a0,a1,a2,b0,b1,b2;
	xmmi c1,c2,c3;
	xmmi r0,r1,r2,r3,r4,r5;

	a0 = _mm_load_si128((xmmi*)a + 0);
	a1 = _mm_load_si128((xmmi*)a + 1);
	a2 = _mm_load_si128((xmmi*)a + 2);
	b0 = _mm_load_si128((xmmi*)b + 0);
	b1 = _mm_load_si128((xmmi*)b + 1);
	b2 = _mm_load_si128((xmmi*)b + 2);
	/* lane-wise limb addition */
	a0 = _mm_add_epi32(a0, b0);
	a1 = _mm_add_epi32(a1, b1);
	a2 = _mm_add_epi32(a2, b2);

	/* de-interleave: split even (26-bit) and odd (25-bit) limbs into
	   64-bit lanes so carries can be taken with 64-bit shifts */
	r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
	r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
	r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
	r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
	r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
	r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);

	/* carry chain on two limb pairs at a time; the carry out of the top
	   limb is folded back into limb 0 multiplied by 19 (2^255 = 19 mod p).
	   Statement order is load-bearing: do not reorder. */
	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
	c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
	c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
	c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

	/* re-interleave the reduced limbs back into the packed layout */
	_mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
}
133
+
134
/* out = a - b, with a light partial carry on the low limbs.
   2*p is added to `a` first so every limb difference stays non-negative. */
DONNA_INLINE static void
curve25519_sub(bignum25519 out, const bignum25519 a, const bignum25519 b) {
	xmmi a0,a1,a2,b0,b1,b2;
	xmmi c1,c2;
	xmmi r0,r1;

	a0 = _mm_load_si128((xmmi*)a + 0);
	a1 = _mm_load_si128((xmmi*)a + 1);
	a2 = _mm_load_si128((xmmi*)a + 2);
	/* bias by 2*(2^255-19) = 0 mod p to avoid underflow */
	a0 = _mm_add_epi32(a0, packed2p0.v);
	a1 = _mm_add_epi32(a1, packed2p1.v);
	a2 = _mm_add_epi32(a2, packed2p2.v);
	b0 = _mm_load_si128((xmmi*)b + 0);
	b1 = _mm_load_si128((xmmi*)b + 1);
	b2 = _mm_load_si128((xmmi*)b + 2);
	a0 = _mm_sub_epi32(a0, b0);
	a1 = _mm_sub_epi32(a1, b1);
	a2 = _mm_sub_epi32(a2, b2);

	/* split limbs 0..3 into even/odd lanes for the partial carry */
	r0 = _mm_and_si128(_mm_shuffle_epi32(a0, _MM_SHUFFLE(2,2,0,0)), bot32bitmask.v);
	r1 = _mm_and_si128(_mm_shuffle_epi32(a0, _MM_SHUFFLE(3,3,1,1)), bot32bitmask.v);

	/* one carry step on limbs 0..3 only; the carry out of limb 3 (the
	   high half of c2) is pushed into a1 below.  This leaves the result
	   only partially reduced — callers rely on that headroom. */
	c1 = _mm_srli_epi32(r0, 26);
	c2 = _mm_srli_epi32(r1, 25);
	r0 = _mm_and_si128(r0, packedmask26.v);
	r1 = _mm_and_si128(r1, packedmask25.v);
	r0 = _mm_add_epi32(r0, _mm_slli_si128(c2, 8));
	r1 = _mm_add_epi32(r1, c1);

	a0 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpackhi_epi32(r0, r1));
	a1 = _mm_add_epi32(a1, _mm_srli_si128(c2, 8));

	_mm_store_si128((xmmi*)out + 0, a0);
	_mm_store_si128((xmmi*)out + 1, a1);
	_mm_store_si128((xmmi*)out + 2, a2);
}
170
+
171
/* out = a - b where `a` may hold unreduced sums (e.g. straight from
   curve25519_add): biases by 4*p instead of 2*p, then runs the full
   partial carry-chain reduction. */
DONNA_INLINE static void
curve25519_sub_after_basic(bignum25519 out, const bignum25519 a, const bignum25519 b) {
	xmmi a0,a1,a2,b0,b1,b2;
	xmmi c1,c2,c3;
	xmmi r0,r1,r2,r3,r4,r5;

	a0 = _mm_load_si128((xmmi*)a + 0);
	a1 = _mm_load_si128((xmmi*)a + 1);
	a2 = _mm_load_si128((xmmi*)a + 2);
	/* bias by 4*(2^255-19) = 0 mod p; double headroom for unreduced a */
	a0 = _mm_add_epi32(a0, packed4p0.v);
	a1 = _mm_add_epi32(a1, packed4p1.v);
	a2 = _mm_add_epi32(a2, packed4p2.v);
	b0 = _mm_load_si128((xmmi*)b + 0);
	b1 = _mm_load_si128((xmmi*)b + 1);
	b2 = _mm_load_si128((xmmi*)b + 2);
	a0 = _mm_sub_epi32(a0, b0);
	a1 = _mm_sub_epi32(a1, b1);
	a2 = _mm_sub_epi32(a2, b2);

	/* de-interleave even/odd limbs into 64-bit lanes */
	r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
	r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
	r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
	r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
	r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
	r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);

	/* carry chain; top-limb carry folds back via *19.  Order matters. */
	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
	c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
	c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
	c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

	/* re-interleave into the packed output layout */
	_mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
}
207
+
208
/* out = a - b with full partial reduction: 2*p bias, subtract, then the
   standard carry chain so the result's limbs fit 26/25 bits. */
DONNA_INLINE static void
curve25519_sub_reduce(bignum25519 out, const bignum25519 a, const bignum25519 b) {
	xmmi a0,a1,a2,b0,b1,b2;
	xmmi c1,c2,c3;
	xmmi r0,r1,r2,r3,r4,r5;

	a0 = _mm_load_si128((xmmi*)a + 0);
	a1 = _mm_load_si128((xmmi*)a + 1);
	a2 = _mm_load_si128((xmmi*)a + 2);
	/* bias by 2*(2^255-19) = 0 mod p to avoid underflow */
	a0 = _mm_add_epi32(a0, packed2p0.v);
	a1 = _mm_add_epi32(a1, packed2p1.v);
	a2 = _mm_add_epi32(a2, packed2p2.v);
	b0 = _mm_load_si128((xmmi*)b + 0);
	b1 = _mm_load_si128((xmmi*)b + 1);
	b2 = _mm_load_si128((xmmi*)b + 2);
	a0 = _mm_sub_epi32(a0, b0);
	a1 = _mm_sub_epi32(a1, b1);
	a2 = _mm_sub_epi32(a2, b2);

	/* de-interleave even/odd limbs into 64-bit lanes */
	r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
	r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
	r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
	r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
	r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
	r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);

	/* carry chain; top-limb carry folds back via *19.  Order matters. */
	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
	c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
	c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
	c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

	/* re-interleave into the packed output layout */
	_mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
}
244
+
245
+
246
/* out = -b mod p: computed as (2*p) - b followed by the carry-chain
   reduction (same skeleton as curve25519_sub_reduce with a = 0). */
DONNA_INLINE static void
curve25519_neg(bignum25519 out, const bignum25519 b) {
	xmmi a0,a1,a2,b0,b1,b2;
	xmmi c1,c2,c3;
	xmmi r0,r1,r2,r3,r4,r5;

	/* start from 2*(2^255-19) = 0 mod p so the subtraction cannot underflow */
	a0 = packed2p0.v;
	a1 = packed2p1.v;
	a2 = packed2p2.v;
	b0 = _mm_load_si128((xmmi*)b + 0);
	b1 = _mm_load_si128((xmmi*)b + 1);
	b2 = _mm_load_si128((xmmi*)b + 2);
	a0 = _mm_sub_epi32(a0, b0);
	a1 = _mm_sub_epi32(a1, b1);
	a2 = _mm_sub_epi32(a2, b2);

	/* de-interleave even/odd limbs into 64-bit lanes */
	r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
	r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
	r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
	r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
	r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
	r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);

	/* carry chain; top-limb carry folds back via *19.  Order matters. */
	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
	c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
	c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
	c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

	/* re-interleave into the packed output layout */
	_mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
}
279
+
280
+
281
/* Multiply two numbers: out = in2 * in
   Schoolbook 10x10 limb product, two limb columns per xmm register.
   Cross terms whose limb index exceeds 9 wrap around multiplied by 19
   (the rN19 values), since 2^255 = 19 mod p.  The accumulation order is
   chosen so 64-bit lanes never overflow — do not reorder. */
static void
curve25519_mul(bignum25519 out, const bignum25519 r, const bignum25519 s) {
	xmmi m01,m23,m45,m67,m89;
	xmmi m0123,m4567;
	xmmi s0123,s4567;
	xmmi s01,s23,s45,s67,s89;
	xmmi s12,s34,s56,s78,s9;
	xmmi r0,r2,r4,r6,r8;
	xmmi r1,r3,r5,r7,r9;
	xmmi r119,r219,r319,r419,r519,r619,r719,r819,r919;
	xmmi c1,c2,c3;

	/* spread s's limbs into pairwise registers: sNM holds limbs N and M
	   in the low 32 bits of each 64-bit lane (for _mm_mul_epu32) */
	s0123 = _mm_load_si128((xmmi*)s + 0);
	s01 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,1,2,0));
	s12 = _mm_shuffle_epi32(s0123, _MM_SHUFFLE(2,2,1,1));
	s23 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,3,2,2));
	s4567 = _mm_load_si128((xmmi*)s + 1);
	s34 = _mm_unpacklo_epi64(_mm_srli_si128(s0123,12),s4567);
	s45 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,1,2,0));
	s56 = _mm_shuffle_epi32(s4567, _MM_SHUFFLE(2,2,1,1));
	s67 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,3,2,2));
	s89 = _mm_load_si128((xmmi*)s + 2);
	s78 = _mm_unpacklo_epi64(_mm_srli_si128(s4567,12),s89);
	s89 = _mm_shuffle_epi32(s89,_MM_SHUFFLE(3,1,2,0));
	s9 = _mm_shuffle_epi32(s89, _MM_SHUFFLE(3,3,2,2));

	/* broadcast each limb of r; odd limbs (25-bit) are pre-doubled in the
	   high lane (add the top-64-masked copy) to account for the mixed
	   26/25-bit radix when they multiply even-position limbs of s */
	r0 = _mm_load_si128((xmmi*)r + 0);
	r1 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(1,1,1,1));
	r1 = _mm_add_epi64(r1, _mm_and_si128(r1, top64bitmask.v));
	r2 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(2,2,2,2));
	r3 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(3,3,3,3));
	r3 = _mm_add_epi64(r3, _mm_and_si128(r3, top64bitmask.v));
	r0 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(0,0,0,0));
	r4 = _mm_load_si128((xmmi*)r + 1);
	r5 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(1,1,1,1));
	r5 = _mm_add_epi64(r5, _mm_and_si128(r5, top64bitmask.v));
	r6 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(2,2,2,2));
	r7 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(3,3,3,3));
	r7 = _mm_add_epi64(r7, _mm_and_si128(r7, top64bitmask.v));
	r4 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(0,0,0,0));
	r8 = _mm_load_si128((xmmi*)r + 2);
	r9 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,1,3,1));
	r9 = _mm_add_epi64(r9, _mm_and_si128(r9, top64bitmask.v));
	r8 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,0,3,0));

	/* odd-limb-of-r partial products (these land one limb position high) */
	m01 = _mm_mul_epu32(r1,s01);
	m23 = _mm_mul_epu32(r1,s23);
	m45 = _mm_mul_epu32(r1,s45);
	m67 = _mm_mul_epu32(r1,s67);
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r3,s01));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r3,s23));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r3,s45));
	m89 = _mm_mul_epu32(r1,s89);
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r5,s01));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r5,s23));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r3,s67));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r7,s01));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r5,s45));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r7,s23));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r9,s01));

	/* shift up: realign the odd-limb products by one limb position */
	m89 = _mm_unpackhi_epi64(m67,_mm_slli_si128(m89,8));
	m67 = _mm_unpackhi_epi64(m45,_mm_slli_si128(m67,8));
	m45 = _mm_unpackhi_epi64(m23,_mm_slli_si128(m45,8));
	m23 = _mm_unpackhi_epi64(m01,_mm_slli_si128(m23,8));
	m01 = _mm_unpackhi_epi64(_mm_setzero_si128(),_mm_slli_si128(m01,8));

	/* even-limb-of-r partial products */
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r0,s01));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r0,s23));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r0,s45));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r0,s67));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r2,s01));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r2,s23));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r4,s23));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r0,s89));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r4,s01));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r2,s45));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r2,s67));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r6,s01));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r4,s45));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r6,s23));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r8,s01));

	/* wrap-around multipliers: limb products past position 9 fold back *19 */
	r219 = _mm_mul_epu32(r2, packednineteen.v);
	r419 = _mm_mul_epu32(r4, packednineteen.v);
	r619 = _mm_mul_epu32(r6, packednineteen.v);
	r819 = _mm_mul_epu32(r8, packednineteen.v);
	r119 = _mm_shuffle_epi32(r1,_MM_SHUFFLE(0,0,2,2)); r119 = _mm_mul_epu32(r119, packednineteen.v);
	r319 = _mm_shuffle_epi32(r3,_MM_SHUFFLE(0,0,2,2)); r319 = _mm_mul_epu32(r319, packednineteen.v);
	r519 = _mm_shuffle_epi32(r5,_MM_SHUFFLE(0,0,2,2)); r519 = _mm_mul_epu32(r519, packednineteen.v);
	r719 = _mm_shuffle_epi32(r7,_MM_SHUFFLE(0,0,2,2)); r719 = _mm_mul_epu32(r719, packednineteen.v);
	r919 = _mm_shuffle_epi32(r9,_MM_SHUFFLE(0,0,2,2)); r919 = _mm_mul_epu32(r919, packednineteen.v);

	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r919,s12));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r919,s34));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r919,s56));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r919,s78));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r719,s34));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r719,s56));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r719,s78));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r719,s9));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r519,s56));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r519,s78));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r519,s9));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r819,s89));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r319,s78));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r319,s9));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r619,s89));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r919,s9));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r819,s23));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r819,s45));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r819,s67));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r619,s45));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r619,s67));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r419,s67));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r419,s89));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r219,s89));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r119,s9));

	/* regroup 64-bit accumulators into carry-chain order */
	r0 = _mm_unpacklo_epi64(m01, m45);
	r1 = _mm_unpackhi_epi64(m01, m45);
	r2 = _mm_unpacklo_epi64(m23, m67);
	r3 = _mm_unpackhi_epi64(m23, m67);
	r4 = _mm_unpacklo_epi64(m89, m89);
	r5 = _mm_unpackhi_epi64(m89, m89);

	/* standard partial reduction; top carry folds back via *19 */
	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
	c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
	c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
	c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

	/* pack reduced limbs into the output layout */
	m0123 = _mm_unpacklo_epi32(r0, r1);
	m4567 = _mm_unpackhi_epi32(r0, r1);
	m0123 = _mm_unpacklo_epi64(m0123, _mm_unpacklo_epi32(r2, r3));
	m4567 = _mm_unpacklo_epi64(m4567, _mm_unpackhi_epi32(r2, r3));
	m89 = _mm_unpackhi_epi32(r4, r5);

	_mm_store_si128((xmmi*)out + 0, m0123);
	_mm_store_si128((xmmi*)out + 1, m4567);
	_mm_store_si128((xmmi*)out + 2, m89);
}
425
+
426
+ DONNA_NOINLINE static void
427
+ curve25519_mul_noinline(bignum25519 out, const bignum25519 r, const bignum25519 s) {
428
+ curve25519_mul(out, r, s);
429
+ }
430
+
431
/* r = in^(2^count): repeated squaring, kept in shuffled register form
   between iterations; only the final result is stored to memory. */
#define curve25519_square(r, n) curve25519_square_times(r, n, 1)
static void
curve25519_square_times(bignum25519 r, const bignum25519 in, int count) {
	xmmi m01,m23,m45,m67,m89;
	xmmi r0,r1,r2,r3,r4,r5,r6,r7,r8,r9;
	xmmi r0a,r1a,r2a,r3a,r7a,r9a;
	xmmi r0123,r4567;
	xmmi r01,r23,r45,r67,r6x,r89,r8x;
	xmmi r12,r34,r56,r78,r9x;
	xmmi r5619;
	xmmi c1,c2,c3;

	/* unpack input into limb-pair registers (rNM = limbs N and M) */
	r0123 = _mm_load_si128((xmmi*)in + 0);
	r01 = _mm_shuffle_epi32(r0123,_MM_SHUFFLE(3,1,2,0));
	r23 = _mm_shuffle_epi32(r0123,_MM_SHUFFLE(3,3,2,2));
	r4567 = _mm_load_si128((xmmi*)in + 1);
	r45 = _mm_shuffle_epi32(r4567,_MM_SHUFFLE(3,1,2,0));
	r67 = _mm_shuffle_epi32(r4567,_MM_SHUFFLE(3,3,2,2));
	r89 = _mm_load_si128((xmmi*)in + 2);
	r89 = _mm_shuffle_epi32(r89,_MM_SHUFFLE(3,1,2,0));

	/* count is assumed >= 1 (do/while) — callers pass at least 1 */
	do {
		/* broadcast limbs; some copies are pre-doubled or pre-scaled by
		   19 / 38 for the wrap-around and cross terms of the square */
		r12 = _mm_unpackhi_epi64(r01, _mm_slli_si128(r23, 8));
		r0 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(0,0,0,0));
		r0 = _mm_add_epi64(r0, _mm_and_si128(r0, top64bitmask.v));
		r0a = _mm_shuffle_epi32(r0,_MM_SHUFFLE(3,2,1,2));
		r1 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(2,2,2,2));
		r2 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(0,0,0,0));
		r2 = _mm_add_epi64(r2, _mm_and_si128(r2, top64bitmask.v));
		r2a = _mm_shuffle_epi32(r2,_MM_SHUFFLE(3,2,1,2));
		r3 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,2,2,2));
		r34 = _mm_unpackhi_epi64(r23, _mm_slli_si128(r45, 8));
		r4 = _mm_shuffle_epi32(r45, _MM_SHUFFLE(0,0,0,0));
		r4 = _mm_add_epi64(r4, _mm_and_si128(r4, top64bitmask.v));
		r56 = _mm_unpackhi_epi64(r45, _mm_slli_si128(r67, 8));
		r5619 = _mm_mul_epu32(r56, packednineteen.v);
		r5 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(1,1,1,0));
		r6 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(3,2,3,2));
		r78 = _mm_unpackhi_epi64(r67, _mm_slli_si128(r89, 8));
		r6x = _mm_unpacklo_epi64(r67, _mm_setzero_si128());
		r7 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,2,2,2));
		r7 = _mm_mul_epu32(r7, packed3819.v);
		r7a = _mm_shuffle_epi32(r7, _MM_SHUFFLE(3,3,3,2));
		r8x = _mm_unpacklo_epi64(r89, _mm_setzero_si128());
		r8 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(0,0,0,0));
		r8 = _mm_mul_epu32(r8, packednineteen.v);
		r9 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(2,2,2,2));
		r9x = _mm_slli_epi32(_mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,3,2)), 1);
		r9 = _mm_mul_epu32(r9, packed3819.v);
		r9a = _mm_shuffle_epi32(r9, _MM_SHUFFLE(2,2,2,2));

		/* diagonal and low cross products; limb-pair registers are
		   doubled in place once consumed (cross terms appear twice) */
		m01 = _mm_mul_epu32(r01, r0);
		m23 = _mm_mul_epu32(r23, r0a);
		m45 = _mm_mul_epu32(r45, r0a);
		m45 = _mm_add_epi64(m45, _mm_mul_epu32(r23, r2));
		r23 = _mm_slli_epi32(r23, 1);
		m67 = _mm_mul_epu32(r67, r0a);
		m67 = _mm_add_epi64(m67, _mm_mul_epu32(r45, r2a));
		m89 = _mm_mul_epu32(r89, r0a);
		m89 = _mm_add_epi64(m89, _mm_mul_epu32(r67, r2a));
		r67 = _mm_slli_epi32(r67, 1);
		m89 = _mm_add_epi64(m89, _mm_mul_epu32(r45, r4));
		r45 = _mm_slli_epi32(r45, 1);

		r1 = _mm_slli_epi32(r1, 1);
		r3 = _mm_slli_epi32(r3, 1);
		r1a = _mm_add_epi64(r1, _mm_and_si128(r1, bot64bitmask.v));
		r3a = _mm_add_epi64(r3, _mm_and_si128(r3, bot64bitmask.v));

		m23 = _mm_add_epi64(m23, _mm_mul_epu32(r12, r1));
		m45 = _mm_add_epi64(m45, _mm_mul_epu32(r34, r1a));
		m67 = _mm_add_epi64(m67, _mm_mul_epu32(r56, r1a));
		m67 = _mm_add_epi64(m67, _mm_mul_epu32(r34, r3));
		r34 = _mm_slli_epi32(r34, 1);
		m89 = _mm_add_epi64(m89, _mm_mul_epu32(r78, r1a));
		r78 = _mm_slli_epi32(r78, 1);
		m89 = _mm_add_epi64(m89, _mm_mul_epu32(r56, r3a));
		r56 = _mm_slli_epi32(r56, 1);

		/* high cross products folded back with the *19 / *38 scalings */
		m01 = _mm_add_epi64(m01, _mm_mul_epu32(_mm_slli_epi32(r12, 1), r9));
		m01 = _mm_add_epi64(m01, _mm_mul_epu32(r34, r7));
		m23 = _mm_add_epi64(m23, _mm_mul_epu32(r34, r9));
		m01 = _mm_add_epi64(m01, _mm_mul_epu32(r56, r5));
		m23 = _mm_add_epi64(m23, _mm_mul_epu32(r56, r7));
		m45 = _mm_add_epi64(m45, _mm_mul_epu32(r56, r9));
		m01 = _mm_add_epi64(m01, _mm_mul_epu32(r23, r8));
		m01 = _mm_add_epi64(m01, _mm_mul_epu32(r45, r6));
		m23 = _mm_add_epi64(m23, _mm_mul_epu32(r45, r8));
		m23 = _mm_add_epi64(m23, _mm_mul_epu32(r6x, r6));
		m45 = _mm_add_epi64(m45, _mm_mul_epu32(r78, r7a));
		m67 = _mm_add_epi64(m67, _mm_mul_epu32(r78, r9));
		m45 = _mm_add_epi64(m45, _mm_mul_epu32(r67, r8));
		m67 = _mm_add_epi64(m67, _mm_mul_epu32(r8x, r8));
		m89 = _mm_add_epi64(m89, _mm_mul_epu32(r9x, r9a));

		/* regroup accumulators into carry-chain order */
		r0 = _mm_unpacklo_epi64(m01, m45);
		r1 = _mm_unpackhi_epi64(m01, m45);
		r2 = _mm_unpacklo_epi64(m23, m67);
		r3 = _mm_unpackhi_epi64(m23, m67);
		r4 = _mm_unpacklo_epi64(m89, m89);
		r5 = _mm_unpackhi_epi64(m89, m89);

		/* standard partial reduction; top carry folds back via *19 */
		c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
		c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
		c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
		c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
		c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

		/* keep the result in limb-pair form for the next squaring */
		r01 = _mm_unpacklo_epi64(r0, r1);
		r45 = _mm_unpackhi_epi64(r0, r1);
		r23 = _mm_unpacklo_epi64(r2, r3);
		r67 = _mm_unpackhi_epi64(r2, r3);
		r89 = _mm_unpackhi_epi64(r4, r5);
	} while (--count);

	/* repack limb pairs into the interleaved memory layout and store */
	r0123 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,0,3,3));
	r4567 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,0,3,3));
	r0123 = _mm_or_si128(r0123, _mm_shuffle_epi32(r01, _MM_SHUFFLE(3,3,2,0)));
	r4567 = _mm_or_si128(r4567, _mm_shuffle_epi32(r45, _MM_SHUFFLE(3,3,2,0)));
	r89 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,2,0));

	_mm_store_si128((xmmi*)r + 0, r0123);
	_mm_store_si128((xmmi*)r + 1, r4567);
	_mm_store_si128((xmmi*)r + 2, r89);
}
556
+
557
+ DONNA_INLINE static void
558
+ curve25519_tangle32(packedelem32 *out, const bignum25519 x, const bignum25519 z) {
559
+ xmmi x0,x1,x2,z0,z1,z2;
560
+
561
+ x0 = _mm_load_si128((xmmi *)(x + 0));
562
+ x1 = _mm_load_si128((xmmi *)(x + 4));
563
+ x2 = _mm_load_si128((xmmi *)(x + 8));
564
+ z0 = _mm_load_si128((xmmi *)(z + 0));
565
+ z1 = _mm_load_si128((xmmi *)(z + 4));
566
+ z2 = _mm_load_si128((xmmi *)(z + 8));
567
+
568
+ out[0].v = _mm_unpacklo_epi32(x0, z0);
569
+ out[1].v = _mm_unpackhi_epi32(x0, z0);
570
+ out[2].v = _mm_unpacklo_epi32(x1, z1);
571
+ out[3].v = _mm_unpackhi_epi32(x1, z1);
572
+ out[4].v = _mm_unpacklo_epi32(x2, z2);
573
+ }
574
+
575
+ DONNA_INLINE static void
576
+ curve25519_untangle32(bignum25519 x, bignum25519 z, const packedelem32 *in) {
577
+ xmmi t0,t1,t2,t3,t4,zero;
578
+
579
+ t0 = _mm_shuffle_epi32(in[0].v, _MM_SHUFFLE(3,1,2,0));
580
+ t1 = _mm_shuffle_epi32(in[1].v, _MM_SHUFFLE(3,1,2,0));
581
+ t2 = _mm_shuffle_epi32(in[2].v, _MM_SHUFFLE(3,1,2,0));
582
+ t3 = _mm_shuffle_epi32(in[3].v, _MM_SHUFFLE(3,1,2,0));
583
+ t4 = _mm_shuffle_epi32(in[4].v, _MM_SHUFFLE(3,1,2,0));
584
+ zero = _mm_setzero_si128();
585
+ _mm_store_si128((xmmi *)x + 0, _mm_unpacklo_epi64(t0, t1));
586
+ _mm_store_si128((xmmi *)x + 1, _mm_unpacklo_epi64(t2, t3));
587
+ _mm_store_si128((xmmi *)x + 2, _mm_unpacklo_epi64(t4, zero));
588
+ _mm_store_si128((xmmi *)z + 0, _mm_unpackhi_epi64(t0, t1));
589
+ _mm_store_si128((xmmi *)z + 1, _mm_unpackhi_epi64(t2, t3));
590
+ _mm_store_si128((xmmi *)z + 2, _mm_unpackhi_epi64(t4, zero));
591
+ }
592
+
593
/* Pairwise add two field-element pairs held in packed 32-bit form and
 * run a partial carry reduction so each limb is back under its
 * alternating 26/25-bit bound. */
DONNA_INLINE static void
curve25519_add_reduce_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
	xmmi r0,r1,r2,r3,r4;
	xmmi s0,s1,s2,s3,s4,s5;
	xmmi c1,c2;

	/* lane-wise add of the five packed limb pairs */
	r0 = _mm_add_epi32(r[0].v, s[0].v);
	r1 = _mm_add_epi32(r[1].v, s[1].v);
	r2 = _mm_add_epi32(r[2].v, s[2].v);
	r3 = _mm_add_epi32(r[3].v, s[3].v);
	r4 = _mm_add_epi32(r[4].v, s[4].v);

	/* regroup so two carry chains can run in parallel (comments show
	   which limb pairs each register holds) */
	s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */
	s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */
	s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */
	s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */
	s4 = _mm_unpacklo_epi64(_mm_setzero_si128(), r4); /* 00 88 */
	s5 = _mm_unpackhi_epi64(_mm_setzero_si128(), r4); /* 00 99 */

	/* interleaved carry chain; the carry out of limb 9 wraps to limb 0
	   times 19, computed as (c2<<4) + (c2<<1) + c2 = 19*c2 */
	c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
	c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); s4 = _mm_add_epi32(s4, _mm_unpackhi_epi64(_mm_setzero_si128(), c2)); s0 = _mm_add_epi32(s0, _mm_unpacklo_epi64(_mm_setzero_si128(), c2));
	c1 = _mm_srli_epi32(s2, 26); c2 = _mm_srli_epi32(s4, 26); s2 = _mm_and_si128(s2, packedmask26262626.v); s4 = _mm_and_si128(s4, packedmask26262626.v); s3 = _mm_add_epi32(s3, c1); s5 = _mm_add_epi32(s5, c2);
	c1 = _mm_srli_epi32(s3, 25); c2 = _mm_srli_epi32(s5, 25); s3 = _mm_and_si128(s3, packedmask25252525.v); s5 = _mm_and_si128(s5, packedmask25252525.v); s4 = _mm_add_epi32(s4, c1); s0 = _mm_add_epi32(s0, _mm_or_si128(_mm_slli_si128(c1, 8), _mm_srli_si128(_mm_add_epi32(_mm_add_epi32(_mm_slli_epi32(c2, 4), _mm_slli_epi32(c2, 1)), c2), 8)));
	c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);

	/* repack into the pairwise 00 11 / 22 33 / ... output layout */
	out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */
	out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */
	out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */
	out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */
	out[4].v = _mm_unpackhi_epi64(s4, s5); /* 88 99 */
}
624
+
625
+ DONNA_INLINE static void
626
+ curve25519_add_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
627
+ out[0].v = _mm_add_epi32(r[0].v, s[0].v);
628
+ out[1].v = _mm_add_epi32(r[1].v, s[1].v);
629
+ out[2].v = _mm_add_epi32(r[2].v, s[2].v);
630
+ out[3].v = _mm_add_epi32(r[3].v, s[3].v);
631
+ out[4].v = _mm_add_epi32(r[4].v, s[4].v);
632
+ }
633
+
634
/* Pairwise subtract (r - s) in packed 32-bit form.  A bias constant is
 * added first (packed32packed2p*, by its name a multiple of the prime —
 * presumably 2*p; verify against the constant's definition) so every
 * limb difference stays non-negative, then a short carry pass tightens
 * limbs 0-7; limbs 8/9 keep any residual carry for a later reduction. */
DONNA_INLINE static void
curve25519_sub_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
	xmmi r0,r1,r2,r3,r4;
	xmmi s0,s1,s2,s3;
	xmmi c1,c2;

	/* bias, then subtract */
	r0 = _mm_add_epi32(r[0].v, packed32packed2p0.v);
	r1 = _mm_add_epi32(r[1].v, packed32packed2p1.v);
	r2 = _mm_add_epi32(r[2].v, packed32packed2p1.v);
	r3 = _mm_add_epi32(r[3].v, packed32packed2p1.v);
	r4 = _mm_add_epi32(r[4].v, packed32packed2p1.v);
	r0 = _mm_sub_epi32(r0, s[0].v); /* 00 11 */
	r1 = _mm_sub_epi32(r1, s[1].v); /* 22 33 */
	r2 = _mm_sub_epi32(r2, s[2].v); /* 44 55 */
	r3 = _mm_sub_epi32(r3, s[3].v); /* 66 77 */
	r4 = _mm_sub_epi32(r4, s[4].v); /* 88 99 */

	/* regroup for two parallel carry chains */
	s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */
	s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */
	s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */
	s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */

	/* partial carry: limb 7's carry goes to limb 8 (held in r4),
	   limb 3's carry goes to limb 4 (low half of s0's partner) */
	c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
	c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); r4 = _mm_add_epi32(r4, _mm_srli_si128(c2, 8)); s0 = _mm_add_epi32(s0, _mm_slli_si128(c2, 8));

	/* repack to pairwise layout; limbs 8/9 pass through from r4 */
	out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */
	out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */
	out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */
	out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */
	out[4].v = r4;
}
665
+
666
/* Pairwise subtract (r - s) where r may hold larger (unreduced) limbs,
 * e.g. the result of a prior add.  A larger bias is used
 * (packed32packed4p*, by its name presumably 4*p — verify against the
 * constant's definition) and a full carry reduction is run, so the
 * output limbs are back under their 26/25-bit bounds. */
DONNA_INLINE static void
curve25519_sub_after_basic_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
	xmmi r0,r1,r2,r3,r4;
	xmmi s0,s1,s2,s3,s4,s5;
	xmmi c1,c2;

	/* bias, then subtract */
	r0 = _mm_add_epi32(r[0].v, packed32packed4p0.v);
	r1 = _mm_add_epi32(r[1].v, packed32packed4p1.v);
	r2 = _mm_add_epi32(r[2].v, packed32packed4p1.v);
	r3 = _mm_add_epi32(r[3].v, packed32packed4p1.v);
	r4 = _mm_add_epi32(r[4].v, packed32packed4p1.v);
	r0 = _mm_sub_epi32(r0, s[0].v); /* 00 11 */
	r1 = _mm_sub_epi32(r1, s[1].v); /* 22 33 */
	r2 = _mm_sub_epi32(r2, s[2].v); /* 44 55 */
	r3 = _mm_sub_epi32(r3, s[3].v); /* 66 77 */
	r4 = _mm_sub_epi32(r4, s[4].v); /* 88 99 */

	/* regroup for two parallel carry chains */
	s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */
	s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */
	s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */
	s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */
	s4 = _mm_unpacklo_epi64(_mm_setzero_si128(), r4); /* 00 88 */
	s5 = _mm_unpackhi_epi64(_mm_setzero_si128(), r4); /* 00 99 */

	/* full interleaved carry chain; limb 9's carry wraps to limb 0
	   times 19, computed as (c2<<4) + (c2<<1) + c2 = 19*c2 */
	c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
	c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); s4 = _mm_add_epi32(s4, _mm_unpackhi_epi64(_mm_setzero_si128(), c2)); s0 = _mm_add_epi32(s0, _mm_unpacklo_epi64(_mm_setzero_si128(), c2));
	c1 = _mm_srli_epi32(s2, 26); c2 = _mm_srli_epi32(s4, 26); s2 = _mm_and_si128(s2, packedmask26262626.v); s4 = _mm_and_si128(s4, packedmask26262626.v); s3 = _mm_add_epi32(s3, c1); s5 = _mm_add_epi32(s5, c2);
	c1 = _mm_srli_epi32(s3, 25); c2 = _mm_srli_epi32(s5, 25); s3 = _mm_and_si128(s3, packedmask25252525.v); s5 = _mm_and_si128(s5, packedmask25252525.v); s4 = _mm_add_epi32(s4, c1); s0 = _mm_add_epi32(s0, _mm_or_si128(_mm_slli_si128(c1, 8), _mm_srli_si128(_mm_add_epi32(_mm_add_epi32(_mm_slli_epi32(c2, 4), _mm_slli_epi32(c2, 1)), c2), 8)));
	c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);

	/* repack to the pairwise output layout */
	out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */
	out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */
	out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */
	out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */
	out[4].v = _mm_unpackhi_epi64(s4, s5); /* 88 99 */
}
702
+
703
/* Convert two packed 32-bit element pairs (c and d) into two packed
 * 64-bit sets (a and b): a gets the lower half of each pair and b the
 * upper half, with each 32-bit limb widened into its own 64-bit lane
 * (the _mm_srli_epi64(t, 32) pulls out the odd limb of each pair). */
DONNA_INLINE static void
curve25519_tangle64_from32(packedelem64 *a, packedelem64 *b, const packedelem32 *c, const packedelem32 *d) {
	xmmi c0,c1,c2,c3,c4,c5,t;
	xmmi d0,d1,d2,d3,d4,d5;
	xmmi t0,t1,t2,t3,t4,zero;

	/* limbs 0-3 of both halves */
	t0 = _mm_shuffle_epi32(c[0].v, _MM_SHUFFLE(3,1,2,0));
	t1 = _mm_shuffle_epi32(c[1].v, _MM_SHUFFLE(3,1,2,0));
	t2 = _mm_shuffle_epi32(d[0].v, _MM_SHUFFLE(3,1,2,0));
	t3 = _mm_shuffle_epi32(d[1].v, _MM_SHUFFLE(3,1,2,0));
	c0 = _mm_unpacklo_epi64(t0, t1);
	c3 = _mm_unpackhi_epi64(t0, t1);
	d0 = _mm_unpacklo_epi64(t2, t3);
	d3 = _mm_unpackhi_epi64(t2, t3);
	t = _mm_unpacklo_epi64(c0, d0); a[0].v = t; a[1].v = _mm_srli_epi64(t, 32);
	t = _mm_unpackhi_epi64(c0, d0); a[2].v = t; a[3].v = _mm_srli_epi64(t, 32);
	t = _mm_unpacklo_epi64(c3, d3); b[0].v = t; b[1].v = _mm_srli_epi64(t, 32);
	t = _mm_unpackhi_epi64(c3, d3); b[2].v = t; b[3].v = _mm_srli_epi64(t, 32);

	/* limbs 4-7 */
	t0 = _mm_shuffle_epi32(c[2].v, _MM_SHUFFLE(3,1,2,0));
	t1 = _mm_shuffle_epi32(c[3].v, _MM_SHUFFLE(3,1,2,0));
	t2 = _mm_shuffle_epi32(d[2].v, _MM_SHUFFLE(3,1,2,0));
	t3 = _mm_shuffle_epi32(d[3].v, _MM_SHUFFLE(3,1,2,0));
	c1 = _mm_unpacklo_epi64(t0, t1);
	c4 = _mm_unpackhi_epi64(t0, t1);
	d1 = _mm_unpacklo_epi64(t2, t3);
	d4 = _mm_unpackhi_epi64(t2, t3);
	t = _mm_unpacklo_epi64(c1, d1); a[4].v = t; a[5].v = _mm_srli_epi64(t, 32);
	t = _mm_unpackhi_epi64(c1, d1); a[6].v = t; a[7].v = _mm_srli_epi64(t, 32);
	t = _mm_unpacklo_epi64(c4, d4); b[4].v = t; b[5].v = _mm_srli_epi64(t, 32);
	t = _mm_unpackhi_epi64(c4, d4); b[6].v = t; b[7].v = _mm_srli_epi64(t, 32);

	/* limbs 8-9 (pair 4 only holds one limb pair per element) */
	t4 = _mm_shuffle_epi32(c[4].v, _MM_SHUFFLE(3,1,2,0));
	zero = _mm_setzero_si128();
	c2 = _mm_unpacklo_epi64(t4, zero);
	c5 = _mm_unpackhi_epi64(t4, zero);
	t4 = _mm_shuffle_epi32(d[4].v, _MM_SHUFFLE(3,1,2,0));
	d2 = _mm_unpacklo_epi64(t4, zero);
	d5 = _mm_unpackhi_epi64(t4, zero);
	t = _mm_unpacklo_epi64(c2, d2); a[8].v = t; a[9].v = _mm_srli_epi64(t, 32);
	t = _mm_unpacklo_epi64(c5, d5); b[8].v = t; b[9].v = _mm_srli_epi64(t, 32);
}
745
+
746
+ DONNA_INLINE static void
747
+ curve25519_tangle64(packedelem64 *out, const bignum25519 x, const bignum25519 z) {
748
+ xmmi x0,x1,x2,z0,z1,z2,t;
749
+
750
+ x0 = _mm_load_si128((xmmi *)x + 0);
751
+ x1 = _mm_load_si128((xmmi *)x + 1);
752
+ x2 = _mm_load_si128((xmmi *)x + 2);
753
+ z0 = _mm_load_si128((xmmi *)z + 0);
754
+ z1 = _mm_load_si128((xmmi *)z + 1);
755
+ z2 = _mm_load_si128((xmmi *)z + 2);
756
+
757
+ t = _mm_unpacklo_epi64(x0, z0); out[0].v = t; out[1].v = _mm_srli_epi64(t, 32);
758
+ t = _mm_unpackhi_epi64(x0, z0); out[2].v = t; out[3].v = _mm_srli_epi64(t, 32);
759
+ t = _mm_unpacklo_epi64(x1, z1); out[4].v = t; out[5].v = _mm_srli_epi64(t, 32);
760
+ t = _mm_unpackhi_epi64(x1, z1); out[6].v = t; out[7].v = _mm_srli_epi64(t, 32);
761
+ t = _mm_unpacklo_epi64(x2, z2); out[8].v = t; out[9].v = _mm_srli_epi64(t, 32);
762
+ }
763
+
764
+ DONNA_INLINE static void
765
+ curve25519_tangleone64(packedelem64 *out, const bignum25519 x) {
766
+ xmmi x0,x1,x2;
767
+
768
+ x0 = _mm_load_si128((xmmi *)(x + 0));
769
+ x1 = _mm_load_si128((xmmi *)(x + 4));
770
+ x2 = _mm_load_si128((xmmi *)(x + 8));
771
+
772
+ out[0].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(0,0,0,0));
773
+ out[1].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(1,1,1,1));
774
+ out[2].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(2,2,2,2));
775
+ out[3].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(3,3,3,3));
776
+ out[4].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(0,0,0,0));
777
+ out[5].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(1,1,1,1));
778
+ out[6].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(2,2,2,2));
779
+ out[7].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(3,3,3,3));
780
+ out[8].v = _mm_shuffle_epi32(x2, _MM_SHUFFLE(0,0,0,0));
781
+ out[9].v = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1,1,1,1));
782
+ }
783
+
784
+ DONNA_INLINE static void
785
+ curve25519_swap64(packedelem64 *out) {
786
+ out[0].v = _mm_shuffle_epi32(out[0].v, _MM_SHUFFLE(1,0,3,2));
787
+ out[1].v = _mm_shuffle_epi32(out[1].v, _MM_SHUFFLE(1,0,3,2));
788
+ out[2].v = _mm_shuffle_epi32(out[2].v, _MM_SHUFFLE(1,0,3,2));
789
+ out[3].v = _mm_shuffle_epi32(out[3].v, _MM_SHUFFLE(1,0,3,2));
790
+ out[4].v = _mm_shuffle_epi32(out[4].v, _MM_SHUFFLE(1,0,3,2));
791
+ out[5].v = _mm_shuffle_epi32(out[5].v, _MM_SHUFFLE(1,0,3,2));
792
+ out[6].v = _mm_shuffle_epi32(out[6].v, _MM_SHUFFLE(1,0,3,2));
793
+ out[7].v = _mm_shuffle_epi32(out[7].v, _MM_SHUFFLE(1,0,3,2));
794
+ out[8].v = _mm_shuffle_epi32(out[8].v, _MM_SHUFFLE(1,0,3,2));
795
+ out[9].v = _mm_shuffle_epi32(out[9].v, _MM_SHUFFLE(1,0,3,2));
796
+ }
797
+
798
+ DONNA_INLINE static void
799
+ curve25519_untangle64(bignum25519 x, bignum25519 z, const packedelem64 *in) {
800
+ _mm_store_si128((xmmi *)(x + 0), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[0].v, in[1].v), _mm_unpacklo_epi32(in[2].v, in[3].v)));
801
+ _mm_store_si128((xmmi *)(x + 4), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[4].v, in[5].v), _mm_unpacklo_epi32(in[6].v, in[7].v)));
802
+ _mm_store_si128((xmmi *)(x + 8), _mm_unpacklo_epi32(in[8].v, in[9].v) );
803
+ _mm_store_si128((xmmi *)(z + 0), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[0].v, in[1].v), _mm_unpackhi_epi32(in[2].v, in[3].v)));
804
+ _mm_store_si128((xmmi *)(z + 4), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[4].v, in[5].v), _mm_unpackhi_epi32(in[6].v, in[7].v)));
805
+ _mm_store_si128((xmmi *)(z + 8), _mm_unpackhi_epi32(in[8].v, in[9].v) );
806
+ }
807
+
808
/* out = r * s for field elements in packed 64-bit form (two independent
 * multiplications, one per SIMD lane).  Schoolbook 10x10 limb product:
 * the pre-doubled r terms (r1_2 etc.) compensate for the mixed
 * 26/25-bit limb radix, and the high half of the product is folded back
 * multiplied by packednineteen (19, since 2^255 = 19 mod p — hence the
 * constant's name).  Finishes with a partial carry reduction. */
DONNA_INLINE static void
curve25519_mul_packed64(packedelem64 *out, const packedelem64 *r, const packedelem64 *s) {
	xmmi r1,r2,r3,r4,r5,r6,r7,r8,r9;
	xmmi r1_2,r3_2,r5_2,r7_2,r9_2;
	xmmi c1,c2;

	/* low half of the products: out[k] = sum_{i+j=k} r[i]*s[j],
	   with odd r limbs doubled where the radix requires it */
	out[0].v = _mm_mul_epu32(r[0].v, s[0].v);
	out[1].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[1].v), _mm_mul_epu32(r[1].v, s[0].v));
	r1_2 = _mm_slli_epi32(r[1].v, 1);
	out[2].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[1].v), _mm_mul_epu32(r[2].v, s[0].v)));
	out[3].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[1].v), _mm_mul_epu32(r[3].v, s[0].v))));
	r3_2 = _mm_slli_epi32(r[3].v, 1);
	out[4].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[1].v), _mm_mul_epu32(r[4].v, s[0].v)))));
	out[5].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[1].v), _mm_mul_epu32(r[5].v, s[0].v))))));
	r5_2 = _mm_slli_epi32(r[5].v, 1);
	out[6].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[5].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r5_2 , s[1].v), _mm_mul_epu32(r[6].v, s[0].v)))))));
	out[7].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[1].v), _mm_mul_epu32(r[7].v , s[0].v))))))));
	r7_2 = _mm_slli_epi32(r[7].v, 1);
	out[8].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[7].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[5].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2 , s[1].v), _mm_mul_epu32(r[8].v, s[0].v)))))))));
	out[9].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[9].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[7].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[8].v, s[1].v), _mm_mul_epu32(r[9].v, s[0].v))))))))));

	/* upper limbs of r scaled by 19 for the wrap-around terms */
	r1 = _mm_mul_epu32(r[1].v, packednineteen.v);
	r2 = _mm_mul_epu32(r[2].v, packednineteen.v);
	r1_2 = _mm_slli_epi32(r1, 1);
	r3 = _mm_mul_epu32(r[3].v, packednineteen.v);
	r4 = _mm_mul_epu32(r[4].v, packednineteen.v);
	r3_2 = _mm_slli_epi32(r3, 1);
	r5 = _mm_mul_epu32(r[5].v, packednineteen.v);
	r6 = _mm_mul_epu32(r[6].v, packednineteen.v);
	r5_2 = _mm_slli_epi32(r5, 1);
	r7 = _mm_mul_epu32(r[7].v, packednineteen.v);
	r8 = _mm_mul_epu32(r[8].v, packednineteen.v);
	r7_2 = _mm_slli_epi32(r7, 1);
	r9 = _mm_mul_epu32(r[9].v, packednineteen.v);
	r9_2 = _mm_slli_epi32(r9, 1);

	/* high half folded back in: out[k] += 19 * sum_{i+j=k+10} r[i]*s[j] */
	out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[1].v), _mm_add_epi64(_mm_mul_epu32(r8, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r6, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r4, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r2, s[8].v), _mm_mul_epu32(r1_2, s[9].v))))))))));
	out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[2].v), _mm_add_epi64(_mm_mul_epu32(r8, s[3].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[4].v), _mm_add_epi64(_mm_mul_epu32(r6, s[5].v), _mm_add_epi64(_mm_mul_epu32(r5 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r4, s[7].v), _mm_add_epi64(_mm_mul_epu32(r3 , s[8].v), _mm_mul_epu32(r2, s[9].v)))))))));
	out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r8, s[4].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r6, s[6].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r4, s[8].v), _mm_mul_epu32(r3_2, s[9].v))))))));
	out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[4].v), _mm_add_epi64(_mm_mul_epu32(r8, s[5].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r6, s[7].v), _mm_add_epi64(_mm_mul_epu32(r5 , s[8].v), _mm_mul_epu32(r4, s[9].v)))))));
	out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r8, s[6].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r6, s[8].v), _mm_mul_epu32(r5_2, s[9].v))))));
	out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r8, s[7].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[8].v), _mm_mul_epu32(r6, s[9].v)))));
	out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r8, s[8].v), _mm_mul_epu32(r7_2, s[9].v))));
	out[7].v = _mm_add_epi64(out[7].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[8].v), _mm_mul_epu32(r8, s[9].v)));
	out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(r9_2, s[9].v));

	/* partial carry reduction, two interleaved chains (0->4 and 4->8),
	   with limb 9's carry wrapping to limb 0 times 19 */
	c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
	c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
	c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
	c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
	c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2);
	c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
	c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
}
862
+
863
/* out = r^2 for field elements in packed 64-bit form (two independent
 * squarings, one per SIMD lane).  Like curve25519_mul_packed64 but with
 * the symmetric cross terms collapsed: low limbs are progressively
 * doubled in place, and the high-half wrap-around uses 19 or 38 (2*19,
 * packedthirtyeight) depending on whether the term appears twice.
 * Finishes with the same partial carry reduction as the multiply. */
DONNA_INLINE static void
curve25519_square_packed64(packedelem64 *out, const packedelem64 *r) {
	xmmi r0,r1,r2,r3;
	xmmi r1_2,r3_2,r4_2,r5_2,r6_2,r7_2;
	xmmi d5,d6,d7,d8,d9;
	xmmi c1,c2;

	r0 = r[0].v;
	r1 = r[1].v;
	r2 = r[2].v;
	r3 = r[3].v;

	/* low half of the square; note r0/r1/r2/r3 are doubled in place as
	   the cross terms start needing the factor of 2 */
	out[0].v = _mm_mul_epu32(r0, r0);
	r0 = _mm_slli_epi32(r0, 1);
	out[1].v = _mm_mul_epu32(r0, r1);
	r1_2 = _mm_slli_epi32(r1, 1);
	out[2].v = _mm_add_epi64(_mm_mul_epu32(r0, r2 ), _mm_mul_epu32(r1, r1_2));
	r1 = r1_2;
	out[3].v = _mm_add_epi64(_mm_mul_epu32(r0, r3 ), _mm_mul_epu32(r1, r2 ));
	r3_2 = _mm_slli_epi32(r3, 1);
	out[4].v = _mm_add_epi64(_mm_mul_epu32(r0, r[4].v), _mm_add_epi64(_mm_mul_epu32(r1, r3_2 ), _mm_mul_epu32(r2, r2)));
	r2 = _mm_slli_epi32(r2, 1);
	out[5].v = _mm_add_epi64(_mm_mul_epu32(r0, r[5].v), _mm_add_epi64(_mm_mul_epu32(r1, r[4].v), _mm_mul_epu32(r2, r3)));
	r5_2 = _mm_slli_epi32(r[5].v, 1);
	out[6].v = _mm_add_epi64(_mm_mul_epu32(r0, r[6].v), _mm_add_epi64(_mm_mul_epu32(r1, r5_2 ), _mm_add_epi64(_mm_mul_epu32(r2, r[4].v), _mm_mul_epu32(r3, r3_2 ))));
	r3 = r3_2;
	out[7].v = _mm_add_epi64(_mm_mul_epu32(r0, r[7].v), _mm_add_epi64(_mm_mul_epu32(r1, r[6].v), _mm_add_epi64(_mm_mul_epu32(r2, r[5].v), _mm_mul_epu32(r3, r[4].v))));
	r7_2 = _mm_slli_epi32(r[7].v, 1);
	out[8].v = _mm_add_epi64(_mm_mul_epu32(r0, r[8].v), _mm_add_epi64(_mm_mul_epu32(r1, r7_2 ), _mm_add_epi64(_mm_mul_epu32(r2, r[6].v), _mm_add_epi64(_mm_mul_epu32(r3, r5_2 ), _mm_mul_epu32(r[4].v, r[4].v)))));
	out[9].v = _mm_add_epi64(_mm_mul_epu32(r0, r[9].v), _mm_add_epi64(_mm_mul_epu32(r1, r[8].v), _mm_add_epi64(_mm_mul_epu32(r2, r[7].v), _mm_add_epi64(_mm_mul_epu32(r3, r[6].v), _mm_mul_epu32(r[4].v, r5_2 )))));

	/* upper limbs pre-scaled by 19 (even) or 38 (odd) for wrap-around */
	d5 = _mm_mul_epu32(r[5].v, packedthirtyeight.v);
	d6 = _mm_mul_epu32(r[6].v, packednineteen.v);
	d7 = _mm_mul_epu32(r[7].v, packedthirtyeight.v);
	d8 = _mm_mul_epu32(r[8].v, packednineteen.v);
	d9 = _mm_mul_epu32(r[9].v, packedthirtyeight.v);

	/* high half folded back in; note r1/r2/r3 here are the doubled
	   values from above (hence the _mm_srli_epi32(r2, 1) to undo one
	   doubling where the term is only needed once) */
	r4_2 = _mm_slli_epi32(r[4].v, 1);
	r6_2 = _mm_slli_epi32(r[6].v, 1);
	out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(d9, r1 ), _mm_add_epi64(_mm_mul_epu32(d8, r2 ), _mm_add_epi64(_mm_mul_epu32(d7, r3 ), _mm_add_epi64(_mm_mul_epu32(d6, r4_2), _mm_mul_epu32(d5, r[5].v))))));
	out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(d9, _mm_srli_epi32(r2, 1)), _mm_add_epi64(_mm_mul_epu32(d8, r3 ), _mm_add_epi64(_mm_mul_epu32(d7, r[4].v), _mm_mul_epu32(d6, r5_2 )))));
	out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(d9, r3 ), _mm_add_epi64(_mm_mul_epu32(d8, r4_2), _mm_add_epi64(_mm_mul_epu32(d7, r5_2 ), _mm_mul_epu32(d6, r[6].v)))));
	out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(d9, r[4].v ), _mm_add_epi64(_mm_mul_epu32(d8, r5_2), _mm_mul_epu32(d7, r[6].v))));
	out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(d9, r5_2 ), _mm_add_epi64(_mm_mul_epu32(d8, r6_2), _mm_mul_epu32(d7, r[7].v))));
	out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(d9, r[6].v ), _mm_mul_epu32(d8, r7_2 )));
	out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(d9, r7_2 ), _mm_mul_epu32(d8, r[8].v)));
	out[7].v = _mm_add_epi64(out[7].v, _mm_mul_epu32(d9, r[8].v));
	out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(d9, r[9].v));

	/* partial carry reduction, identical to curve25519_mul_packed64 */
	c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
	c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
	c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
	c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
	c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2);
	c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
	c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
}
920
+
921
+
922
+ /* Take a little-endian, 32-byte number and expand it into polynomial form */
923
+ static void
924
+ curve25519_expand(bignum25519 out, const unsigned char in[32]) {
925
+ uint32_t x0,x1,x2,x3,x4,x5,x6,x7;
926
+
927
+ x0 = *(uint32_t *)(in + 0);
928
+ x1 = *(uint32_t *)(in + 4);
929
+ x2 = *(uint32_t *)(in + 8);
930
+ x3 = *(uint32_t *)(in + 12);
931
+ x4 = *(uint32_t *)(in + 16);
932
+ x5 = *(uint32_t *)(in + 20);
933
+ x6 = *(uint32_t *)(in + 24);
934
+ x7 = *(uint32_t *)(in + 28);
935
+
936
+ out[0] = ( x0 ) & 0x3ffffff;
937
+ out[1] = ((((uint64_t)x1 << 32) | x0) >> 26) & 0x1ffffff;
938
+ out[2] = ((((uint64_t)x2 << 32) | x1) >> 19) & 0x3ffffff;
939
+ out[3] = ((((uint64_t)x3 << 32) | x2) >> 13) & 0x1ffffff;
940
+ out[4] = (( x3) >> 6) & 0x3ffffff;
941
+ out[5] = ( x4 ) & 0x1ffffff;
942
+ out[6] = ((((uint64_t)x5 << 32) | x4) >> 25) & 0x3ffffff;
943
+ out[7] = ((((uint64_t)x6 << 32) | x5) >> 19) & 0x1ffffff;
944
+ out[8] = ((((uint64_t)x7 << 32) | x6) >> 12) & 0x3ffffff;
945
+ out[9] = (( x7) >> 6) & 0x1ffffff;
946
+ out[10] = 0;
947
+ out[11] = 0;
948
+ }
949
+
950
+ /* Take a fully reduced polynomial form number and contract it into a
951
+ * little-endian, 32-byte array
952
+ */
953
+ static void
954
+ curve25519_contract(unsigned char out[32], const bignum25519 in) {
955
+ bignum25519 ALIGN(16) f;
956
+ curve25519_copy(f, in);
957
+
958
+ #define carry_pass() \
959
+ f[1] += f[0] >> 26; f[0] &= 0x3ffffff; \
960
+ f[2] += f[1] >> 25; f[1] &= 0x1ffffff; \
961
+ f[3] += f[2] >> 26; f[2] &= 0x3ffffff; \
962
+ f[4] += f[3] >> 25; f[3] &= 0x1ffffff; \
963
+ f[5] += f[4] >> 26; f[4] &= 0x3ffffff; \
964
+ f[6] += f[5] >> 25; f[5] &= 0x1ffffff; \
965
+ f[7] += f[6] >> 26; f[6] &= 0x3ffffff; \
966
+ f[8] += f[7] >> 25; f[7] &= 0x1ffffff; \
967
+ f[9] += f[8] >> 26; f[8] &= 0x3ffffff;
968
+
969
+ #define carry_pass_full() \
970
+ carry_pass() \
971
+ f[0] += 19 * (f[9] >> 25); f[9] &= 0x1ffffff;
972
+
973
+ #define carry_pass_final() \
974
+ carry_pass() \
975
+ f[9] &= 0x1ffffff;
976
+
977
+ carry_pass_full()
978
+ carry_pass_full()
979
+
980
+ /* now t is between 0 and 2^255-1, properly carried. */
981
+ /* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */
982
+ f[0] += 19;
983
+ carry_pass_full()
984
+
985
+ /* now between 19 and 2^255-1 in both cases, and offset by 19. */
986
+ f[0] += (1 << 26) - 19;
987
+ f[1] += (1 << 25) - 1;
988
+ f[2] += (1 << 26) - 1;
989
+ f[3] += (1 << 25) - 1;
990
+ f[4] += (1 << 26) - 1;
991
+ f[5] += (1 << 25) - 1;
992
+ f[6] += (1 << 26) - 1;
993
+ f[7] += (1 << 25) - 1;
994
+ f[8] += (1 << 26) - 1;
995
+ f[9] += (1 << 25) - 1;
996
+
997
+ /* now between 2^255 and 2^256-20, and offset by 2^255. */
998
+ carry_pass_final()
999
+
1000
+ #undef carry_pass
1001
+ #undef carry_full
1002
+ #undef carry_final
1003
+
1004
+ f[1] <<= 2;
1005
+ f[2] <<= 3;
1006
+ f[3] <<= 5;
1007
+ f[4] <<= 6;
1008
+ f[6] <<= 1;
1009
+ f[7] <<= 3;
1010
+ f[8] <<= 4;
1011
+ f[9] <<= 6;
1012
+
1013
+ #define F(i, s) \
1014
+ out[s+0] |= (unsigned char )(f[i] & 0xff); \
1015
+ out[s+1] = (unsigned char )((f[i] >> 8) & 0xff); \
1016
+ out[s+2] = (unsigned char )((f[i] >> 16) & 0xff); \
1017
+ out[s+3] = (unsigned char )((f[i] >> 24) & 0xff);
1018
+
1019
+ out[0] = 0;
1020
+ out[16] = 0;
1021
+ F(0,0);
1022
+ F(1,3);
1023
+ F(2,6);
1024
+ F(3,9);
1025
+ F(4,12);
1026
+ F(5,16);
1027
+ F(6,19);
1028
+ F(7,22);
1029
+ F(8,25);
1030
+ F(9,28);
1031
+ #undef F
1032
+ }
1033
+
1034
/* if (iswap) swap(a, b) — constant time; iswap must be 0 or 1.
 * The swap is branch-free: swap is all-ones or all-zeros, and the
 * xor-difference of a and b is masked and applied to both. */
DONNA_INLINE static void
curve25519_swap_conditional(bignum25519 a, bignum25519 b, uint32_t iswap) {
	const uint32_t swap = (uint32_t)(-(int32_t)iswap); /* 0 or 0xffffffff */
	xmmi a0,a1,a2,b0,b1,b2,x0,x1,x2;
	xmmi mask = _mm_cvtsi32_si128(swap);
	mask = _mm_shuffle_epi32(mask, 0); /* broadcast the mask to all lanes */
	a0 = _mm_load_si128((xmmi *)a + 0);
	a1 = _mm_load_si128((xmmi *)a + 1);
	b0 = _mm_load_si128((xmmi *)b + 0);
	b1 = _mm_load_si128((xmmi *)b + 1);
	b0 = _mm_xor_si128(a0, b0); /* b now holds a^b */
	b1 = _mm_xor_si128(a1, b1);
	x0 = _mm_and_si128(b0, mask); /* (a^b) or 0 depending on iswap */
	x1 = _mm_and_si128(b1, mask);
	x0 = _mm_xor_si128(x0, a0); /* new a: a or b */
	x1 = _mm_xor_si128(x1, a1);
	a0 = _mm_xor_si128(x0, b0); /* new b: b or a */
	a1 = _mm_xor_si128(x1, b1);
	_mm_store_si128((xmmi *)a + 0, x0);
	_mm_store_si128((xmmi *)a + 1, x1);
	_mm_store_si128((xmmi *)b + 0, a0);
	_mm_store_si128((xmmi *)b + 1, a1);

	/* same dance for the third 16-byte chunk */
	a2 = _mm_load_si128((xmmi *)a + 2);
	b2 = _mm_load_si128((xmmi *)b + 2);
	b2 = _mm_xor_si128(a2, b2);
	x2 = _mm_and_si128(b2, mask);
	x2 = _mm_xor_si128(x2, a2);
	a2 = _mm_xor_si128(x2, b2);
	_mm_store_si128((xmmi *)b + 2, a2);
	_mm_store_si128((xmmi *)a + 2, x2);
}
1067
+
1068
/* Constant-time conditional move over 96 bytes; flag must be exactly
 * 0 or 1.  nb = flag - 1 is all-ones when flag == 0 and zero when
 * flag == 1, so the masks below yield out = in when flag == 1 and
 * leave out unchanged when flag == 0 — i.e. out = flag ? in : out.
 * (The original comment stated the selection the other way round;
 * the mask arithmetic here is unambiguous.) */
DONNA_INLINE static void
curve25519_move_conditional_bytes(uint8_t out[96], const uint8_t in[96], uint32_t flag) {
	xmmi a0,a1,a2,a3,a4,a5,b0,b1,b2,b3,b4,b5;
	const uint32_t nb = flag - 1; /* 0xffffffff keeps out, 0 selects in */
	xmmi masknb = _mm_shuffle_epi32(_mm_cvtsi32_si128(nb),0);
	a0 = _mm_load_si128((xmmi *)in + 0);
	a1 = _mm_load_si128((xmmi *)in + 1);
	a2 = _mm_load_si128((xmmi *)in + 2);
	b0 = _mm_load_si128((xmmi *)out + 0);
	b1 = _mm_load_si128((xmmi *)out + 1);
	b2 = _mm_load_si128((xmmi *)out + 2);
	a0 = _mm_andnot_si128(masknb, a0); /* in where flag == 1, else 0 */
	a1 = _mm_andnot_si128(masknb, a1);
	a2 = _mm_andnot_si128(masknb, a2);
	b0 = _mm_and_si128(masknb, b0); /* out where flag == 0, else 0 */
	b1 = _mm_and_si128(masknb, b1);
	b2 = _mm_and_si128(masknb, b2);
	a0 = _mm_or_si128(a0, b0);
	a1 = _mm_or_si128(a1, b1);
	a2 = _mm_or_si128(a2, b2);
	_mm_store_si128((xmmi*)out + 0, a0);
	_mm_store_si128((xmmi*)out + 1, a1);
	_mm_store_si128((xmmi*)out + 2, a2);

	/* second 48-byte half, same select */
	a3 = _mm_load_si128((xmmi *)in + 3);
	a4 = _mm_load_si128((xmmi *)in + 4);
	a5 = _mm_load_si128((xmmi *)in + 5);
	b3 = _mm_load_si128((xmmi *)out + 3);
	b4 = _mm_load_si128((xmmi *)out + 4);
	b5 = _mm_load_si128((xmmi *)out + 5);
	a3 = _mm_andnot_si128(masknb, a3);
	a4 = _mm_andnot_si128(masknb, a4);
	a5 = _mm_andnot_si128(masknb, a5);
	b3 = _mm_and_si128(masknb, b3);
	b4 = _mm_and_si128(masknb, b4);
	b5 = _mm_and_si128(masknb, b5);
	a3 = _mm_or_si128(a3, b3);
	a4 = _mm_or_si128(a4, b4);
	a5 = _mm_or_si128(a5, b5);
	_mm_store_si128((xmmi*)out + 3, a3);
	_mm_store_si128((xmmi*)out + 4, a4);
	_mm_store_si128((xmmi*)out + 5, a5);
}
1112
+