ed25519_blake2b 0.1.1
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +23 -0
- data/LICENSE +21 -0
- data/README.md +39 -0
- data/Rakefile +13 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/ed25519_blake2b.gemspec +31 -0
- data/ext/ed25519_blake2b/blake2-config.h +72 -0
- data/ext/ed25519_blake2b/blake2-impl.h +160 -0
- data/ext/ed25519_blake2b/blake2.h +195 -0
- data/ext/ed25519_blake2b/blake2b-load-sse2.h +68 -0
- data/ext/ed25519_blake2b/blake2b-load-sse41.h +402 -0
- data/ext/ed25519_blake2b/blake2b-ref.c +373 -0
- data/ext/ed25519_blake2b/blake2b-round.h +157 -0
- data/ext/ed25519_blake2b/curve25519-donna-32bit.h +579 -0
- data/ext/ed25519_blake2b/curve25519-donna-64bit.h +413 -0
- data/ext/ed25519_blake2b/curve25519-donna-helpers.h +67 -0
- data/ext/ed25519_blake2b/curve25519-donna-sse2.h +1112 -0
- data/ext/ed25519_blake2b/ed25519-donna-32bit-sse2.h +513 -0
- data/ext/ed25519_blake2b/ed25519-donna-32bit-tables.h +61 -0
- data/ext/ed25519_blake2b/ed25519-donna-64bit-sse2.h +436 -0
- data/ext/ed25519_blake2b/ed25519-donna-64bit-tables.h +53 -0
- data/ext/ed25519_blake2b/ed25519-donna-64bit-x86-32bit.h +435 -0
- data/ext/ed25519_blake2b/ed25519-donna-64bit-x86.h +351 -0
- data/ext/ed25519_blake2b/ed25519-donna-basepoint-table.h +259 -0
- data/ext/ed25519_blake2b/ed25519-donna-batchverify.h +275 -0
- data/ext/ed25519_blake2b/ed25519-donna-impl-base.h +364 -0
- data/ext/ed25519_blake2b/ed25519-donna-impl-sse2.h +390 -0
- data/ext/ed25519_blake2b/ed25519-donna-portable-identify.h +103 -0
- data/ext/ed25519_blake2b/ed25519-donna-portable.h +135 -0
- data/ext/ed25519_blake2b/ed25519-donna.h +115 -0
- data/ext/ed25519_blake2b/ed25519-hash-custom.c +28 -0
- data/ext/ed25519_blake2b/ed25519-hash-custom.h +30 -0
- data/ext/ed25519_blake2b/ed25519-hash.h +219 -0
- data/ext/ed25519_blake2b/ed25519-randombytes-custom.h +10 -0
- data/ext/ed25519_blake2b/ed25519-randombytes.h +91 -0
- data/ext/ed25519_blake2b/ed25519.c +150 -0
- data/ext/ed25519_blake2b/ed25519.h +30 -0
- data/ext/ed25519_blake2b/extconf.rb +3 -0
- data/ext/ed25519_blake2b/fuzz/README.md +173 -0
- data/ext/ed25519_blake2b/fuzz/build-nix.php +134 -0
- data/ext/ed25519_blake2b/fuzz/curve25519-ref10.c +1272 -0
- data/ext/ed25519_blake2b/fuzz/curve25519-ref10.h +8 -0
- data/ext/ed25519_blake2b/fuzz/ed25519-donna-sse2.c +3 -0
- data/ext/ed25519_blake2b/fuzz/ed25519-donna.c +1 -0
- data/ext/ed25519_blake2b/fuzz/ed25519-donna.h +34 -0
- data/ext/ed25519_blake2b/fuzz/ed25519-ref10.c +4647 -0
- data/ext/ed25519_blake2b/fuzz/ed25519-ref10.h +9 -0
- data/ext/ed25519_blake2b/fuzz/fuzz-curve25519.c +172 -0
- data/ext/ed25519_blake2b/fuzz/fuzz-ed25519.c +219 -0
- data/ext/ed25519_blake2b/modm-donna-32bit.h +469 -0
- data/ext/ed25519_blake2b/modm-donna-64bit.h +361 -0
- data/ext/ed25519_blake2b/rbext.c +25 -0
- data/ext/ed25519_blake2b/regression.h +1024 -0
- data/lib/ed25519_blake2b/ed25519_blake2b.rb +4 -0
- data/lib/ed25519_blake2b/version.rb +3 -0
- metadata +147 -0
@@ -0,0 +1,1112 @@
/*
	Public domain by Andrew M. <liquidsun@gmail.com>
	See: https://github.com/floodyberry/curve25519-donna

	SSE2 curve25519 implementation
*/

#include <emmintrin.h>
typedef __m128i xmmi;

typedef union packedelem8_t {
	unsigned char u[16];
	xmmi v;
} packedelem8;

typedef union packedelem32_t {
	uint32_t u[4];
	xmmi v;
} packedelem32;

typedef union packedelem64_t {
	uint64_t u[2];
	xmmi v;
} packedelem64;

/* 10 elements + an extra 2 to fit in 3 xmm registers */
typedef uint32_t bignum25519[12];
typedef packedelem32 packed32bignum25519[5];
typedef packedelem64 packed64bignum25519[10];

static const packedelem32 bot32bitmask = {{0xffffffff, 0x00000000, 0xffffffff, 0x00000000}};
static const packedelem32 top32bitmask = {{0x00000000, 0xffffffff, 0x00000000, 0xffffffff}};
static const packedelem32 top64bitmask = {{0x00000000, 0x00000000, 0xffffffff, 0xffffffff}};
static const packedelem32 bot64bitmask = {{0xffffffff, 0xffffffff, 0x00000000, 0x00000000}};

/* reduction masks */
static const packedelem64 packedmask26 = {{0x03ffffff, 0x03ffffff}};
static const packedelem64 packedmask25 = {{0x01ffffff, 0x01ffffff}};
static const packedelem32 packedmask2625 = {{0x3ffffff,0,0x1ffffff,0}};
static const packedelem32 packedmask26262626 = {{0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff}};
static const packedelem32 packedmask25252525 = {{0x01ffffff, 0x01ffffff, 0x01ffffff, 0x01ffffff}};

/* multipliers */
static const packedelem64 packednineteen = {{19, 19}};
static const packedelem64 packednineteenone = {{19, 1}};
static const packedelem64 packedthirtyeight = {{38, 38}};
static const packedelem64 packed3819 = {{19*2,19}};
static const packedelem64 packed9638 = {{19*4,19*2}};

/* 121666,121665 */
static const packedelem64 packed121666121665 = {{121666, 121665}};

/* 2*(2^255 - 19) = 0 mod p */
static const packedelem32 packed2p0 = {{0x7ffffda,0x3fffffe,0x7fffffe,0x3fffffe}};
static const packedelem32 packed2p1 = {{0x7fffffe,0x3fffffe,0x7fffffe,0x3fffffe}};
static const packedelem32 packed2p2 = {{0x7fffffe,0x3fffffe,0x0000000,0x0000000}};

static const packedelem32 packed32packed2p0 = {{0x7ffffda,0x7ffffda,0x3fffffe,0x3fffffe}};
static const packedelem32 packed32packed2p1 = {{0x7fffffe,0x7fffffe,0x3fffffe,0x3fffffe}};

/* 4*(2^255 - 19) = 0 mod p */
static const packedelem32 packed4p0 = {{0xfffffb4,0x7fffffc,0xffffffc,0x7fffffc}};
static const packedelem32 packed4p1 = {{0xffffffc,0x7fffffc,0xffffffc,0x7fffffc}};
static const packedelem32 packed4p2 = {{0xffffffc,0x7fffffc,0x0000000,0x0000000}};

static const packedelem32 packed32packed4p0 = {{0xfffffb4,0xfffffb4,0x7fffffc,0x7fffffc}};
static const packedelem32 packed32packed4p1 = {{0xffffffc,0xffffffc,0x7fffffc,0x7fffffc}};
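/*
	Editorial note, not part of the original source: a field element here is
	kept in radix 2^25.5 -- ten limbs alternating between 26 and 25 significant
	bits -- padded with two zero words so the twelve uint32 values fill exactly
	three xmm registers. The packed2p and packed4p constants above are 2*p and
	4*p written out limb by limb; adding them before a subtraction keeps every
	limb non-negative without changing the value mod p.
*/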
/* out = in */
DONNA_INLINE static void
curve25519_copy(bignum25519 out, const bignum25519 in) {
	xmmi x0,x1,x2;
	x0 = _mm_load_si128((xmmi*)in + 0);
	x1 = _mm_load_si128((xmmi*)in + 1);
	x2 = _mm_load_si128((xmmi*)in + 2);
	_mm_store_si128((xmmi*)out + 0, x0);
	_mm_store_si128((xmmi*)out + 1, x1);
	_mm_store_si128((xmmi*)out + 2, x2);
}

/* out = a + b */
DONNA_INLINE static void
curve25519_add(bignum25519 out, const bignum25519 a, const bignum25519 b) {
	xmmi a0,a1,a2,b0,b1,b2;
	a0 = _mm_load_si128((xmmi*)a + 0);
	a1 = _mm_load_si128((xmmi*)a + 1);
	a2 = _mm_load_si128((xmmi*)a + 2);
	b0 = _mm_load_si128((xmmi*)b + 0);
	b1 = _mm_load_si128((xmmi*)b + 1);
	b2 = _mm_load_si128((xmmi*)b + 2);
	a0 = _mm_add_epi32(a0, b0);
	a1 = _mm_add_epi32(a1, b1);
	a2 = _mm_add_epi32(a2, b2);
	_mm_store_si128((xmmi*)out + 0, a0);
	_mm_store_si128((xmmi*)out + 1, a1);
	_mm_store_si128((xmmi*)out + 2, a2);
}
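/*
	Editorial note, not part of the original source: the carry chain used by
	curve25519_add_reduce below, and repeated in the other reducing routines,
	propagates each limb's overflow into the next limb, masking back to 26 or
	25 bits alternately; the carry out of the top limb is multiplied by 19 and
	folded into limb 0, since 2^255 is congruent to 19 mod p.
*/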
#define curve25519_add_after_basic curve25519_add_reduce
DONNA_INLINE static void
curve25519_add_reduce(bignum25519 out, const bignum25519 a, const bignum25519 b) {
	xmmi a0,a1,a2,b0,b1,b2;
	xmmi c1,c2,c3;
	xmmi r0,r1,r2,r3,r4,r5;

	a0 = _mm_load_si128((xmmi*)a + 0);
	a1 = _mm_load_si128((xmmi*)a + 1);
	a2 = _mm_load_si128((xmmi*)a + 2);
	b0 = _mm_load_si128((xmmi*)b + 0);
	b1 = _mm_load_si128((xmmi*)b + 1);
	b2 = _mm_load_si128((xmmi*)b + 2);
	a0 = _mm_add_epi32(a0, b0);
	a1 = _mm_add_epi32(a1, b1);
	a2 = _mm_add_epi32(a2, b2);

	r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
	r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
	r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
	r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
	r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
	r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);

	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
	c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
	c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
	c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

	_mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
}
DONNA_INLINE static void
curve25519_sub(bignum25519 out, const bignum25519 a, const bignum25519 b) {
	xmmi a0,a1,a2,b0,b1,b2;
	xmmi c1,c2;
	xmmi r0,r1;

	a0 = _mm_load_si128((xmmi*)a + 0);
	a1 = _mm_load_si128((xmmi*)a + 1);
	a2 = _mm_load_si128((xmmi*)a + 2);
	a0 = _mm_add_epi32(a0, packed2p0.v);
	a1 = _mm_add_epi32(a1, packed2p1.v);
	a2 = _mm_add_epi32(a2, packed2p2.v);
	b0 = _mm_load_si128((xmmi*)b + 0);
	b1 = _mm_load_si128((xmmi*)b + 1);
	b2 = _mm_load_si128((xmmi*)b + 2);
	a0 = _mm_sub_epi32(a0, b0);
	a1 = _mm_sub_epi32(a1, b1);
	a2 = _mm_sub_epi32(a2, b2);

	r0 = _mm_and_si128(_mm_shuffle_epi32(a0, _MM_SHUFFLE(2,2,0,0)), bot32bitmask.v);
	r1 = _mm_and_si128(_mm_shuffle_epi32(a0, _MM_SHUFFLE(3,3,1,1)), bot32bitmask.v);

	c1 = _mm_srli_epi32(r0, 26);
	c2 = _mm_srli_epi32(r1, 25);
	r0 = _mm_and_si128(r0, packedmask26.v);
	r1 = _mm_and_si128(r1, packedmask25.v);
	r0 = _mm_add_epi32(r0, _mm_slli_si128(c2, 8));
	r1 = _mm_add_epi32(r1, c1);

	a0 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpackhi_epi32(r0, r1));
	a1 = _mm_add_epi32(a1, _mm_srli_si128(c2, 8));

	_mm_store_si128((xmmi*)out + 0, a0);
	_mm_store_si128((xmmi*)out + 1, a1);
	_mm_store_si128((xmmi*)out + 2, a2);
}
DONNA_INLINE static void
curve25519_sub_after_basic(bignum25519 out, const bignum25519 a, const bignum25519 b) {
	xmmi a0,a1,a2,b0,b1,b2;
	xmmi c1,c2,c3;
	xmmi r0,r1,r2,r3,r4,r5;

	a0 = _mm_load_si128((xmmi*)a + 0);
	a1 = _mm_load_si128((xmmi*)a + 1);
	a2 = _mm_load_si128((xmmi*)a + 2);
	a0 = _mm_add_epi32(a0, packed4p0.v);
	a1 = _mm_add_epi32(a1, packed4p1.v);
	a2 = _mm_add_epi32(a2, packed4p2.v);
	b0 = _mm_load_si128((xmmi*)b + 0);
	b1 = _mm_load_si128((xmmi*)b + 1);
	b2 = _mm_load_si128((xmmi*)b + 2);
	a0 = _mm_sub_epi32(a0, b0);
	a1 = _mm_sub_epi32(a1, b1);
	a2 = _mm_sub_epi32(a2, b2);

	r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
	r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
	r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
	r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
	r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
	r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);

	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
	c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
	c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
	c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

	_mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
}
DONNA_INLINE static void
curve25519_sub_reduce(bignum25519 out, const bignum25519 a, const bignum25519 b) {
	xmmi a0,a1,a2,b0,b1,b2;
	xmmi c1,c2,c3;
	xmmi r0,r1,r2,r3,r4,r5;

	a0 = _mm_load_si128((xmmi*)a + 0);
	a1 = _mm_load_si128((xmmi*)a + 1);
	a2 = _mm_load_si128((xmmi*)a + 2);
	a0 = _mm_add_epi32(a0, packed2p0.v);
	a1 = _mm_add_epi32(a1, packed2p1.v);
	a2 = _mm_add_epi32(a2, packed2p2.v);
	b0 = _mm_load_si128((xmmi*)b + 0);
	b1 = _mm_load_si128((xmmi*)b + 1);
	b2 = _mm_load_si128((xmmi*)b + 2);
	a0 = _mm_sub_epi32(a0, b0);
	a1 = _mm_sub_epi32(a1, b1);
	a2 = _mm_sub_epi32(a2, b2);

	r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
	r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
	r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
	r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
	r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
	r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);

	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
	c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
	c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
	c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

	_mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
}
DONNA_INLINE static void
curve25519_neg(bignum25519 out, const bignum25519 b) {
	xmmi a0,a1,a2,b0,b1,b2;
	xmmi c1,c2,c3;
	xmmi r0,r1,r2,r3,r4,r5;

	a0 = packed2p0.v;
	a1 = packed2p1.v;
	a2 = packed2p2.v;
	b0 = _mm_load_si128((xmmi*)b + 0);
	b1 = _mm_load_si128((xmmi*)b + 1);
	b2 = _mm_load_si128((xmmi*)b + 2);
	a0 = _mm_sub_epi32(a0, b0);
	a1 = _mm_sub_epi32(a1, b1);
	a2 = _mm_sub_epi32(a2, b2);

	r0 = _mm_and_si128(_mm_unpacklo_epi64(a0, a1), bot32bitmask.v);
	r1 = _mm_srli_epi64(_mm_unpacklo_epi64(a0, a1), 32);
	r2 = _mm_and_si128(_mm_unpackhi_epi64(a0, a1), bot32bitmask.v);
	r3 = _mm_srli_epi64(_mm_unpackhi_epi64(a0, a1), 32);
	r4 = _mm_and_si128(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), bot32bitmask.v);
	r5 = _mm_srli_epi64(_mm_unpacklo_epi64(_mm_setzero_si128(), a2), 32);

	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
	c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
	c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
	c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

	_mm_store_si128((xmmi*)out + 0, _mm_unpacklo_epi64(_mm_unpacklo_epi32(r0, r1), _mm_unpacklo_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 1, _mm_unpacklo_epi64(_mm_unpackhi_epi32(r0, r1), _mm_unpackhi_epi32(r2, r3)));
	_mm_store_si128((xmmi*)out + 2, _mm_unpackhi_epi32(r4, r5));
}
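/*
	Editorial note, not part of the original source: curve25519_mul below is a
	schoolbook 10x10 limb multiply, vectorized two limb products per xmm
	register. Partial products that would land above limb 9 are instead
	pre-multiplied by 19 (the r119..r919 values) and added into the low limbs,
	again using 2^255 = 19 mod p, before a final carry pass.
*/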
/* Multiply two numbers: out = in2 * in */
static void
curve25519_mul(bignum25519 out, const bignum25519 r, const bignum25519 s) {
	xmmi m01,m23,m45,m67,m89;
	xmmi m0123,m4567;
	xmmi s0123,s4567;
	xmmi s01,s23,s45,s67,s89;
	xmmi s12,s34,s56,s78,s9;
	xmmi r0,r2,r4,r6,r8;
	xmmi r1,r3,r5,r7,r9;
	xmmi r119,r219,r319,r419,r519,r619,r719,r819,r919;
	xmmi c1,c2,c3;

	s0123 = _mm_load_si128((xmmi*)s + 0);
	s01 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,1,2,0));
	s12 = _mm_shuffle_epi32(s0123, _MM_SHUFFLE(2,2,1,1));
	s23 = _mm_shuffle_epi32(s0123,_MM_SHUFFLE(3,3,2,2));
	s4567 = _mm_load_si128((xmmi*)s + 1);
	s34 = _mm_unpacklo_epi64(_mm_srli_si128(s0123,12),s4567);
	s45 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,1,2,0));
	s56 = _mm_shuffle_epi32(s4567, _MM_SHUFFLE(2,2,1,1));
	s67 = _mm_shuffle_epi32(s4567,_MM_SHUFFLE(3,3,2,2));
	s89 = _mm_load_si128((xmmi*)s + 2);
	s78 = _mm_unpacklo_epi64(_mm_srli_si128(s4567,12),s89);
	s89 = _mm_shuffle_epi32(s89,_MM_SHUFFLE(3,1,2,0));
	s9 = _mm_shuffle_epi32(s89, _MM_SHUFFLE(3,3,2,2));

	r0 = _mm_load_si128((xmmi*)r + 0);
	r1 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(1,1,1,1));
	r1 = _mm_add_epi64(r1, _mm_and_si128(r1, top64bitmask.v));
	r2 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(2,2,2,2));
	r3 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(3,3,3,3));
	r3 = _mm_add_epi64(r3, _mm_and_si128(r3, top64bitmask.v));
	r0 = _mm_shuffle_epi32(r0, _MM_SHUFFLE(0,0,0,0));
	r4 = _mm_load_si128((xmmi*)r + 1);
	r5 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(1,1,1,1));
	r5 = _mm_add_epi64(r5, _mm_and_si128(r5, top64bitmask.v));
	r6 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(2,2,2,2));
	r7 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(3,3,3,3));
	r7 = _mm_add_epi64(r7, _mm_and_si128(r7, top64bitmask.v));
	r4 = _mm_shuffle_epi32(r4, _MM_SHUFFLE(0,0,0,0));
	r8 = _mm_load_si128((xmmi*)r + 2);
	r9 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,1,3,1));
	r9 = _mm_add_epi64(r9, _mm_and_si128(r9, top64bitmask.v));
	r8 = _mm_shuffle_epi32(r8, _MM_SHUFFLE(3,0,3,0));

	m01 = _mm_mul_epu32(r1,s01);
	m23 = _mm_mul_epu32(r1,s23);
	m45 = _mm_mul_epu32(r1,s45);
	m67 = _mm_mul_epu32(r1,s67);
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r3,s01));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r3,s23));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r3,s45));
	m89 = _mm_mul_epu32(r1,s89);
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r5,s01));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r5,s23));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r3,s67));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r7,s01));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r5,s45));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r7,s23));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r9,s01));

	/* shift up */
	m89 = _mm_unpackhi_epi64(m67,_mm_slli_si128(m89,8));
	m67 = _mm_unpackhi_epi64(m45,_mm_slli_si128(m67,8));
	m45 = _mm_unpackhi_epi64(m23,_mm_slli_si128(m45,8));
	m23 = _mm_unpackhi_epi64(m01,_mm_slli_si128(m23,8));
	m01 = _mm_unpackhi_epi64(_mm_setzero_si128(),_mm_slli_si128(m01,8));

	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r0,s01));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r0,s23));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r0,s45));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r0,s67));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r2,s01));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r2,s23));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r4,s23));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r0,s89));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r4,s01));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r2,s45));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r2,s67));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r6,s01));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r4,s45));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r6,s23));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r8,s01));

	r219 = _mm_mul_epu32(r2, packednineteen.v);
	r419 = _mm_mul_epu32(r4, packednineteen.v);
	r619 = _mm_mul_epu32(r6, packednineteen.v);
	r819 = _mm_mul_epu32(r8, packednineteen.v);
	r119 = _mm_shuffle_epi32(r1,_MM_SHUFFLE(0,0,2,2)); r119 = _mm_mul_epu32(r119, packednineteen.v);
	r319 = _mm_shuffle_epi32(r3,_MM_SHUFFLE(0,0,2,2)); r319 = _mm_mul_epu32(r319, packednineteen.v);
	r519 = _mm_shuffle_epi32(r5,_MM_SHUFFLE(0,0,2,2)); r519 = _mm_mul_epu32(r519, packednineteen.v);
	r719 = _mm_shuffle_epi32(r7,_MM_SHUFFLE(0,0,2,2)); r719 = _mm_mul_epu32(r719, packednineteen.v);
	r919 = _mm_shuffle_epi32(r9,_MM_SHUFFLE(0,0,2,2)); r919 = _mm_mul_epu32(r919, packednineteen.v);

	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r919,s12));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r919,s34));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r919,s56));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r919,s78));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r719,s34));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r719,s56));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r719,s78));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r719,s9));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r519,s56));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r519,s78));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r519,s9));
	m67 = _mm_add_epi64(m67,_mm_mul_epu32(r819,s89));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r319,s78));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r319,s9));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r619,s89));
	m89 = _mm_add_epi64(m89,_mm_mul_epu32(r919,s9));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r819,s23));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r819,s45));
	m45 = _mm_add_epi64(m45,_mm_mul_epu32(r819,s67));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r619,s45));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r619,s67));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r419,s67));
	m23 = _mm_add_epi64(m23,_mm_mul_epu32(r419,s89));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r219,s89));
	m01 = _mm_add_epi64(m01,_mm_mul_epu32(r119,s9));

	r0 = _mm_unpacklo_epi64(m01, m45);
	r1 = _mm_unpackhi_epi64(m01, m45);
	r2 = _mm_unpacklo_epi64(m23, m67);
	r3 = _mm_unpackhi_epi64(m23, m67);
	r4 = _mm_unpacklo_epi64(m89, m89);
	r5 = _mm_unpackhi_epi64(m89, m89);

	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
	c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
	c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
	c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
	c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

	m0123 = _mm_unpacklo_epi32(r0, r1);
	m4567 = _mm_unpackhi_epi32(r0, r1);
	m0123 = _mm_unpacklo_epi64(m0123, _mm_unpacklo_epi32(r2, r3));
	m4567 = _mm_unpacklo_epi64(m4567, _mm_unpackhi_epi32(r2, r3));
	m89 = _mm_unpackhi_epi32(r4, r5);

	_mm_store_si128((xmmi*)out + 0, m0123);
	_mm_store_si128((xmmi*)out + 1, m4567);
	_mm_store_si128((xmmi*)out + 2, m89);
}

DONNA_NOINLINE static void
curve25519_mul_noinline(bignum25519 out, const bignum25519 r, const bignum25519 s) {
	curve25519_mul(out, r, s);
}
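/*
	Editorial note, not part of the original source: squaring is a multiply in
	which almost every cross term appears twice, so curve25519_square_times
	doubles limbs up front (and folds in the 38 = 2*19 factor via packed3819)
	to roughly halve the multiply count; the count parameter runs repeated
	squarings while the element stays in xmm registers between iterations.
*/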
#define curve25519_square(r, n) curve25519_square_times(r, n, 1)
static void
curve25519_square_times(bignum25519 r, const bignum25519 in, int count) {
	xmmi m01,m23,m45,m67,m89;
	xmmi r0,r1,r2,r3,r4,r5,r6,r7,r8,r9;
	xmmi r0a,r1a,r2a,r3a,r7a,r9a;
	xmmi r0123,r4567;
	xmmi r01,r23,r45,r67,r6x,r89,r8x;
	xmmi r12,r34,r56,r78,r9x;
	xmmi r5619;
	xmmi c1,c2,c3;

	r0123 = _mm_load_si128((xmmi*)in + 0);
	r01 = _mm_shuffle_epi32(r0123,_MM_SHUFFLE(3,1,2,0));
	r23 = _mm_shuffle_epi32(r0123,_MM_SHUFFLE(3,3,2,2));
	r4567 = _mm_load_si128((xmmi*)in + 1);
	r45 = _mm_shuffle_epi32(r4567,_MM_SHUFFLE(3,1,2,0));
	r67 = _mm_shuffle_epi32(r4567,_MM_SHUFFLE(3,3,2,2));
	r89 = _mm_load_si128((xmmi*)in + 2);
	r89 = _mm_shuffle_epi32(r89,_MM_SHUFFLE(3,1,2,0));

	do {
		r12 = _mm_unpackhi_epi64(r01, _mm_slli_si128(r23, 8));
		r0 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(0,0,0,0));
		r0 = _mm_add_epi64(r0, _mm_and_si128(r0, top64bitmask.v));
		r0a = _mm_shuffle_epi32(r0,_MM_SHUFFLE(3,2,1,2));
		r1 = _mm_shuffle_epi32(r01, _MM_SHUFFLE(2,2,2,2));
		r2 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(0,0,0,0));
		r2 = _mm_add_epi64(r2, _mm_and_si128(r2, top64bitmask.v));
		r2a = _mm_shuffle_epi32(r2,_MM_SHUFFLE(3,2,1,2));
		r3 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,2,2,2));
		r34 = _mm_unpackhi_epi64(r23, _mm_slli_si128(r45, 8));
		r4 = _mm_shuffle_epi32(r45, _MM_SHUFFLE(0,0,0,0));
		r4 = _mm_add_epi64(r4, _mm_and_si128(r4, top64bitmask.v));
		r56 = _mm_unpackhi_epi64(r45, _mm_slli_si128(r67, 8));
		r5619 = _mm_mul_epu32(r56, packednineteen.v);
		r5 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(1,1,1,0));
		r6 = _mm_shuffle_epi32(r5619, _MM_SHUFFLE(3,2,3,2));
		r78 = _mm_unpackhi_epi64(r67, _mm_slli_si128(r89, 8));
		r6x = _mm_unpacklo_epi64(r67, _mm_setzero_si128());
		r7 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,2,2,2));
		r7 = _mm_mul_epu32(r7, packed3819.v);
		r7a = _mm_shuffle_epi32(r7, _MM_SHUFFLE(3,3,3,2));
		r8x = _mm_unpacklo_epi64(r89, _mm_setzero_si128());
		r8 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(0,0,0,0));
		r8 = _mm_mul_epu32(r8, packednineteen.v);
		r9 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(2,2,2,2));
		r9x = _mm_slli_epi32(_mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,3,2)), 1);
		r9 = _mm_mul_epu32(r9, packed3819.v);
		r9a = _mm_shuffle_epi32(r9, _MM_SHUFFLE(2,2,2,2));

		m01 = _mm_mul_epu32(r01, r0);
		m23 = _mm_mul_epu32(r23, r0a);
		m45 = _mm_mul_epu32(r45, r0a);
		m45 = _mm_add_epi64(m45, _mm_mul_epu32(r23, r2));
		r23 = _mm_slli_epi32(r23, 1);
		m67 = _mm_mul_epu32(r67, r0a);
		m67 = _mm_add_epi64(m67, _mm_mul_epu32(r45, r2a));
		m89 = _mm_mul_epu32(r89, r0a);
		m89 = _mm_add_epi64(m89, _mm_mul_epu32(r67, r2a));
		r67 = _mm_slli_epi32(r67, 1);
		m89 = _mm_add_epi64(m89, _mm_mul_epu32(r45, r4));
		r45 = _mm_slli_epi32(r45, 1);

		r1 = _mm_slli_epi32(r1, 1);
		r3 = _mm_slli_epi32(r3, 1);
		r1a = _mm_add_epi64(r1, _mm_and_si128(r1, bot64bitmask.v));
		r3a = _mm_add_epi64(r3, _mm_and_si128(r3, bot64bitmask.v));

		m23 = _mm_add_epi64(m23, _mm_mul_epu32(r12, r1));
		m45 = _mm_add_epi64(m45, _mm_mul_epu32(r34, r1a));
		m67 = _mm_add_epi64(m67, _mm_mul_epu32(r56, r1a));
		m67 = _mm_add_epi64(m67, _mm_mul_epu32(r34, r3));
		r34 = _mm_slli_epi32(r34, 1);
		m89 = _mm_add_epi64(m89, _mm_mul_epu32(r78, r1a));
		r78 = _mm_slli_epi32(r78, 1);
		m89 = _mm_add_epi64(m89, _mm_mul_epu32(r56, r3a));
		r56 = _mm_slli_epi32(r56, 1);

		m01 = _mm_add_epi64(m01, _mm_mul_epu32(_mm_slli_epi32(r12, 1), r9));
		m01 = _mm_add_epi64(m01, _mm_mul_epu32(r34, r7));
		m23 = _mm_add_epi64(m23, _mm_mul_epu32(r34, r9));
		m01 = _mm_add_epi64(m01, _mm_mul_epu32(r56, r5));
		m23 = _mm_add_epi64(m23, _mm_mul_epu32(r56, r7));
		m45 = _mm_add_epi64(m45, _mm_mul_epu32(r56, r9));
		m01 = _mm_add_epi64(m01, _mm_mul_epu32(r23, r8));
		m01 = _mm_add_epi64(m01, _mm_mul_epu32(r45, r6));
		m23 = _mm_add_epi64(m23, _mm_mul_epu32(r45, r8));
		m23 = _mm_add_epi64(m23, _mm_mul_epu32(r6x, r6));
		m45 = _mm_add_epi64(m45, _mm_mul_epu32(r78, r7a));
		m67 = _mm_add_epi64(m67, _mm_mul_epu32(r78, r9));
		m45 = _mm_add_epi64(m45, _mm_mul_epu32(r67, r8));
		m67 = _mm_add_epi64(m67, _mm_mul_epu32(r8x, r8));
		m89 = _mm_add_epi64(m89, _mm_mul_epu32(r9x, r9a));

		r0 = _mm_unpacklo_epi64(m01, m45);
		r1 = _mm_unpackhi_epi64(m01, m45);
		r2 = _mm_unpacklo_epi64(m23, m67);
		r3 = _mm_unpackhi_epi64(m23, m67);
		r4 = _mm_unpacklo_epi64(m89, m89);
		r5 = _mm_unpackhi_epi64(m89, m89);

		c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);
		c1 = _mm_srli_epi64(r1, 25); c2 = _mm_srli_epi64(r3, 25); r1 = _mm_and_si128(r1, packedmask25.v); r3 = _mm_and_si128(r3, packedmask25.v); r2 = _mm_add_epi64(r2, c1); r4 = _mm_add_epi64(r4, c2); c3 = _mm_slli_si128(c2, 8);
		c1 = _mm_srli_epi64(r4, 26); r4 = _mm_and_si128(r4, packedmask26.v); r5 = _mm_add_epi64(r5, c1);
		c1 = _mm_srli_epi64(r5, 25); r5 = _mm_and_si128(r5, packedmask25.v); r0 = _mm_add_epi64(r0, _mm_unpackhi_epi64(_mm_mul_epu32(c1, packednineteen.v), c3));
		c1 = _mm_srli_epi64(r0, 26); c2 = _mm_srli_epi64(r2, 26); r0 = _mm_and_si128(r0, packedmask26.v); r2 = _mm_and_si128(r2, packedmask26.v); r1 = _mm_add_epi64(r1, c1); r3 = _mm_add_epi64(r3, c2);

		r01 = _mm_unpacklo_epi64(r0, r1);
		r45 = _mm_unpackhi_epi64(r0, r1);
		r23 = _mm_unpacklo_epi64(r2, r3);
		r67 = _mm_unpackhi_epi64(r2, r3);
		r89 = _mm_unpackhi_epi64(r4, r5);
	} while (--count);

	r0123 = _mm_shuffle_epi32(r23, _MM_SHUFFLE(2,0,3,3));
	r4567 = _mm_shuffle_epi32(r67, _MM_SHUFFLE(2,0,3,3));
	r0123 = _mm_or_si128(r0123, _mm_shuffle_epi32(r01, _MM_SHUFFLE(3,3,2,0)));
	r4567 = _mm_or_si128(r4567, _mm_shuffle_epi32(r45, _MM_SHUFFLE(3,3,2,0)));
	r89 = _mm_shuffle_epi32(r89, _MM_SHUFFLE(3,3,2,0));

	_mm_store_si128((xmmi*)r + 0, r0123);
	_mm_store_si128((xmmi*)r + 1, r4567);
	_mm_store_si128((xmmi*)r + 2, r89);
}
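/*
	Editorial note, not part of the original source: the tangle/untangle
	helpers below interleave the limbs of two field elements (typically the x
	and z coordinates of a Montgomery-ladder point) into one packed layout, so
	the packed32/packed64 arithmetic routines can operate on both elements at
	once with each vector instruction.
*/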
DONNA_INLINE static void
curve25519_tangle32(packedelem32 *out, const bignum25519 x, const bignum25519 z) {
	xmmi x0,x1,x2,z0,z1,z2;

	x0 = _mm_load_si128((xmmi *)(x + 0));
	x1 = _mm_load_si128((xmmi *)(x + 4));
	x2 = _mm_load_si128((xmmi *)(x + 8));
	z0 = _mm_load_si128((xmmi *)(z + 0));
	z1 = _mm_load_si128((xmmi *)(z + 4));
	z2 = _mm_load_si128((xmmi *)(z + 8));

	out[0].v = _mm_unpacklo_epi32(x0, z0);
	out[1].v = _mm_unpackhi_epi32(x0, z0);
	out[2].v = _mm_unpacklo_epi32(x1, z1);
	out[3].v = _mm_unpackhi_epi32(x1, z1);
	out[4].v = _mm_unpacklo_epi32(x2, z2);
}

DONNA_INLINE static void
curve25519_untangle32(bignum25519 x, bignum25519 z, const packedelem32 *in) {
	xmmi t0,t1,t2,t3,t4,zero;

	t0 = _mm_shuffle_epi32(in[0].v, _MM_SHUFFLE(3,1,2,0));
	t1 = _mm_shuffle_epi32(in[1].v, _MM_SHUFFLE(3,1,2,0));
	t2 = _mm_shuffle_epi32(in[2].v, _MM_SHUFFLE(3,1,2,0));
	t3 = _mm_shuffle_epi32(in[3].v, _MM_SHUFFLE(3,1,2,0));
	t4 = _mm_shuffle_epi32(in[4].v, _MM_SHUFFLE(3,1,2,0));
	zero = _mm_setzero_si128();
	_mm_store_si128((xmmi *)x + 0, _mm_unpacklo_epi64(t0, t1));
	_mm_store_si128((xmmi *)x + 1, _mm_unpacklo_epi64(t2, t3));
	_mm_store_si128((xmmi *)x + 2, _mm_unpacklo_epi64(t4, zero));
	_mm_store_si128((xmmi *)z + 0, _mm_unpackhi_epi64(t0, t1));
	_mm_store_si128((xmmi *)z + 1, _mm_unpackhi_epi64(t2, t3));
	_mm_store_si128((xmmi *)z + 2, _mm_unpackhi_epi64(t4, zero));
}

DONNA_INLINE static void
curve25519_add_reduce_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
	xmmi r0,r1,r2,r3,r4;
	xmmi s0,s1,s2,s3,s4,s5;
	xmmi c1,c2;

	r0 = _mm_add_epi32(r[0].v, s[0].v);
	r1 = _mm_add_epi32(r[1].v, s[1].v);
	r2 = _mm_add_epi32(r[2].v, s[2].v);
	r3 = _mm_add_epi32(r[3].v, s[3].v);
	r4 = _mm_add_epi32(r[4].v, s[4].v);

	s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */
	s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */
	s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */
	s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */
	s4 = _mm_unpacklo_epi64(_mm_setzero_si128(), r4); /* 00 88 */
	s5 = _mm_unpackhi_epi64(_mm_setzero_si128(), r4); /* 00 99 */

	c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
	c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); s4 = _mm_add_epi32(s4, _mm_unpackhi_epi64(_mm_setzero_si128(), c2)); s0 = _mm_add_epi32(s0, _mm_unpacklo_epi64(_mm_setzero_si128(), c2));
	c1 = _mm_srli_epi32(s2, 26); c2 = _mm_srli_epi32(s4, 26); s2 = _mm_and_si128(s2, packedmask26262626.v); s4 = _mm_and_si128(s4, packedmask26262626.v); s3 = _mm_add_epi32(s3, c1); s5 = _mm_add_epi32(s5, c2);
	c1 = _mm_srli_epi32(s3, 25); c2 = _mm_srli_epi32(s5, 25); s3 = _mm_and_si128(s3, packedmask25252525.v); s5 = _mm_and_si128(s5, packedmask25252525.v); s4 = _mm_add_epi32(s4, c1); s0 = _mm_add_epi32(s0, _mm_or_si128(_mm_slli_si128(c1, 8), _mm_srli_si128(_mm_add_epi32(_mm_add_epi32(_mm_slli_epi32(c2, 4), _mm_slli_epi32(c2, 1)), c2), 8)));
	c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);

	out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */
	out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */
	out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */
	out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */
	out[4].v = _mm_unpackhi_epi64(s4, s5); /* 88 99 */
}

DONNA_INLINE static void
curve25519_add_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
	out[0].v = _mm_add_epi32(r[0].v, s[0].v);
	out[1].v = _mm_add_epi32(r[1].v, s[1].v);
	out[2].v = _mm_add_epi32(r[2].v, s[2].v);
	out[3].v = _mm_add_epi32(r[3].v, s[3].v);
	out[4].v = _mm_add_epi32(r[4].v, s[4].v);
}

DONNA_INLINE static void
curve25519_sub_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
	xmmi r0,r1,r2,r3,r4;
	xmmi s0,s1,s2,s3;
	xmmi c1,c2;

	r0 = _mm_add_epi32(r[0].v, packed32packed2p0.v);
	r1 = _mm_add_epi32(r[1].v, packed32packed2p1.v);
	r2 = _mm_add_epi32(r[2].v, packed32packed2p1.v);
	r3 = _mm_add_epi32(r[3].v, packed32packed2p1.v);
	r4 = _mm_add_epi32(r[4].v, packed32packed2p1.v);
	r0 = _mm_sub_epi32(r0, s[0].v); /* 00 11 */
	r1 = _mm_sub_epi32(r1, s[1].v); /* 22 33 */
	r2 = _mm_sub_epi32(r2, s[2].v); /* 44 55 */
	r3 = _mm_sub_epi32(r3, s[3].v); /* 66 77 */
	r4 = _mm_sub_epi32(r4, s[4].v); /* 88 99 */

	s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */
	s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */
	s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */
	s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */

	c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
	c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); r4 = _mm_add_epi32(r4, _mm_srli_si128(c2, 8)); s0 = _mm_add_epi32(s0, _mm_slli_si128(c2, 8));

	out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */
	out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */
	out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */
	out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */
	out[4].v = r4;
}

DONNA_INLINE static void
curve25519_sub_after_basic_packed32(packedelem32 *out, const packedelem32 *r, const packedelem32 *s) {
	xmmi r0,r1,r2,r3,r4;
	xmmi s0,s1,s2,s3,s4,s5;
	xmmi c1,c2;

	r0 = _mm_add_epi32(r[0].v, packed32packed4p0.v);
	r1 = _mm_add_epi32(r[1].v, packed32packed4p1.v);
	r2 = _mm_add_epi32(r[2].v, packed32packed4p1.v);
	r3 = _mm_add_epi32(r[3].v, packed32packed4p1.v);
	r4 = _mm_add_epi32(r[4].v, packed32packed4p1.v);
	r0 = _mm_sub_epi32(r0, s[0].v); /* 00 11 */
	r1 = _mm_sub_epi32(r1, s[1].v); /* 22 33 */
	r2 = _mm_sub_epi32(r2, s[2].v); /* 44 55 */
	r3 = _mm_sub_epi32(r3, s[3].v); /* 66 77 */
	r4 = _mm_sub_epi32(r4, s[4].v); /* 88 99 */

	s0 = _mm_unpacklo_epi64(r0, r2); /* 00 44 */
	s1 = _mm_unpackhi_epi64(r0, r2); /* 11 55 */
	s2 = _mm_unpacklo_epi64(r1, r3); /* 22 66 */
	s3 = _mm_unpackhi_epi64(r1, r3); /* 33 77 */
	s4 = _mm_unpacklo_epi64(_mm_setzero_si128(), r4); /* 00 88 */
	s5 = _mm_unpackhi_epi64(_mm_setzero_si128(), r4); /* 00 99 */

	c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);
	c1 = _mm_srli_epi32(s1, 25); c2 = _mm_srli_epi32(s3, 25); s1 = _mm_and_si128(s1, packedmask25252525.v); s3 = _mm_and_si128(s3, packedmask25252525.v); s2 = _mm_add_epi32(s2, c1); s4 = _mm_add_epi32(s4, _mm_unpackhi_epi64(_mm_setzero_si128(), c2)); s0 = _mm_add_epi32(s0, _mm_unpacklo_epi64(_mm_setzero_si128(), c2));
	c1 = _mm_srli_epi32(s2, 26); c2 = _mm_srli_epi32(s4, 26); s2 = _mm_and_si128(s2, packedmask26262626.v); s4 = _mm_and_si128(s4, packedmask26262626.v); s3 = _mm_add_epi32(s3, c1); s5 = _mm_add_epi32(s5, c2);
	c1 = _mm_srli_epi32(s3, 25); c2 = _mm_srli_epi32(s5, 25); s3 = _mm_and_si128(s3, packedmask25252525.v); s5 = _mm_and_si128(s5, packedmask25252525.v); s4 = _mm_add_epi32(s4, c1); s0 = _mm_add_epi32(s0, _mm_or_si128(_mm_slli_si128(c1, 8), _mm_srli_si128(_mm_add_epi32(_mm_add_epi32(_mm_slli_epi32(c2, 4), _mm_slli_epi32(c2, 1)), c2), 8)));
	c1 = _mm_srli_epi32(s0, 26); c2 = _mm_srli_epi32(s2, 26); s0 = _mm_and_si128(s0, packedmask26262626.v); s2 = _mm_and_si128(s2, packedmask26262626.v); s1 = _mm_add_epi32(s1, c1); s3 = _mm_add_epi32(s3, c2);

	out[0].v = _mm_unpacklo_epi64(s0, s1); /* 00 11 */
	out[1].v = _mm_unpacklo_epi64(s2, s3); /* 22 33 */
	out[2].v = _mm_unpackhi_epi64(s0, s1); /* 44 55 */
	out[3].v = _mm_unpackhi_epi64(s2, s3); /* 66 77 */
	out[4].v = _mm_unpackhi_epi64(s4, s5); /* 88 99 */
}

DONNA_INLINE static void
curve25519_tangle64_from32(packedelem64 *a, packedelem64 *b, const packedelem32 *c, const packedelem32 *d) {
	xmmi c0,c1,c2,c3,c4,c5,t;
	xmmi d0,d1,d2,d3,d4,d5;
	xmmi t0,t1,t2,t3,t4,zero;

	t0 = _mm_shuffle_epi32(c[0].v, _MM_SHUFFLE(3,1,2,0));
	t1 = _mm_shuffle_epi32(c[1].v, _MM_SHUFFLE(3,1,2,0));
	t2 = _mm_shuffle_epi32(d[0].v, _MM_SHUFFLE(3,1,2,0));
	t3 = _mm_shuffle_epi32(d[1].v, _MM_SHUFFLE(3,1,2,0));
	c0 = _mm_unpacklo_epi64(t0, t1);
	c3 = _mm_unpackhi_epi64(t0, t1);
	d0 = _mm_unpacklo_epi64(t2, t3);
	d3 = _mm_unpackhi_epi64(t2, t3);
	t = _mm_unpacklo_epi64(c0, d0); a[0].v = t; a[1].v = _mm_srli_epi64(t, 32);
	t = _mm_unpackhi_epi64(c0, d0); a[2].v = t; a[3].v = _mm_srli_epi64(t, 32);
	t = _mm_unpacklo_epi64(c3, d3); b[0].v = t; b[1].v = _mm_srli_epi64(t, 32);
	t = _mm_unpackhi_epi64(c3, d3); b[2].v = t; b[3].v = _mm_srli_epi64(t, 32);

	t0 = _mm_shuffle_epi32(c[2].v, _MM_SHUFFLE(3,1,2,0));
	t1 = _mm_shuffle_epi32(c[3].v, _MM_SHUFFLE(3,1,2,0));
	t2 = _mm_shuffle_epi32(d[2].v, _MM_SHUFFLE(3,1,2,0));
	t3 = _mm_shuffle_epi32(d[3].v, _MM_SHUFFLE(3,1,2,0));
	c1 = _mm_unpacklo_epi64(t0, t1);
	c4 = _mm_unpackhi_epi64(t0, t1);
	d1 = _mm_unpacklo_epi64(t2, t3);
	d4 = _mm_unpackhi_epi64(t2, t3);
	t = _mm_unpacklo_epi64(c1, d1); a[4].v = t; a[5].v = _mm_srli_epi64(t, 32);
	t = _mm_unpackhi_epi64(c1, d1); a[6].v = t; a[7].v = _mm_srli_epi64(t, 32);
	t = _mm_unpacklo_epi64(c4, d4); b[4].v = t; b[5].v = _mm_srli_epi64(t, 32);
	t = _mm_unpackhi_epi64(c4, d4); b[6].v = t; b[7].v = _mm_srli_epi64(t, 32);

	t4 = _mm_shuffle_epi32(c[4].v, _MM_SHUFFLE(3,1,2,0));
	zero = _mm_setzero_si128();
	c2 = _mm_unpacklo_epi64(t4, zero);
	c5 = _mm_unpackhi_epi64(t4, zero);
	t4 = _mm_shuffle_epi32(d[4].v, _MM_SHUFFLE(3,1,2,0));
	d2 = _mm_unpacklo_epi64(t4, zero);
	d5 = _mm_unpackhi_epi64(t4, zero);
	t = _mm_unpacklo_epi64(c2, d2); a[8].v = t; a[9].v = _mm_srli_epi64(t, 32);
	t = _mm_unpacklo_epi64(c5, d5); b[8].v = t; b[9].v = _mm_srli_epi64(t, 32);
}

DONNA_INLINE static void
curve25519_tangle64(packedelem64 *out, const bignum25519 x, const bignum25519 z) {
	xmmi x0,x1,x2,z0,z1,z2,t;

	x0 = _mm_load_si128((xmmi *)x + 0);
	x1 = _mm_load_si128((xmmi *)x + 1);
	x2 = _mm_load_si128((xmmi *)x + 2);
	z0 = _mm_load_si128((xmmi *)z + 0);
	z1 = _mm_load_si128((xmmi *)z + 1);
	z2 = _mm_load_si128((xmmi *)z + 2);

	t = _mm_unpacklo_epi64(x0, z0); out[0].v = t; out[1].v = _mm_srli_epi64(t, 32);
	t = _mm_unpackhi_epi64(x0, z0); out[2].v = t; out[3].v = _mm_srli_epi64(t, 32);
	t = _mm_unpacklo_epi64(x1, z1); out[4].v = t; out[5].v = _mm_srli_epi64(t, 32);
	t = _mm_unpackhi_epi64(x1, z1); out[6].v = t; out[7].v = _mm_srli_epi64(t, 32);
	t = _mm_unpacklo_epi64(x2, z2); out[8].v = t; out[9].v = _mm_srli_epi64(t, 32);
}

DONNA_INLINE static void
curve25519_tangleone64(packedelem64 *out, const bignum25519 x) {
	xmmi x0,x1,x2;

	x0 = _mm_load_si128((xmmi *)(x + 0));
	x1 = _mm_load_si128((xmmi *)(x + 4));
	x2 = _mm_load_si128((xmmi *)(x + 8));

	out[0].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(0,0,0,0));
	out[1].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(1,1,1,1));
	out[2].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(2,2,2,2));
	out[3].v = _mm_shuffle_epi32(x0, _MM_SHUFFLE(3,3,3,3));
	out[4].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(0,0,0,0));
	out[5].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(1,1,1,1));
	out[6].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(2,2,2,2));
	out[7].v = _mm_shuffle_epi32(x1, _MM_SHUFFLE(3,3,3,3));
	out[8].v = _mm_shuffle_epi32(x2, _MM_SHUFFLE(0,0,0,0));
	out[9].v = _mm_shuffle_epi32(x2, _MM_SHUFFLE(1,1,1,1));
}

DONNA_INLINE static void
curve25519_swap64(packedelem64 *out) {
	out[0].v = _mm_shuffle_epi32(out[0].v, _MM_SHUFFLE(1,0,3,2));
	out[1].v = _mm_shuffle_epi32(out[1].v, _MM_SHUFFLE(1,0,3,2));
	out[2].v = _mm_shuffle_epi32(out[2].v, _MM_SHUFFLE(1,0,3,2));
	out[3].v = _mm_shuffle_epi32(out[3].v, _MM_SHUFFLE(1,0,3,2));
	out[4].v = _mm_shuffle_epi32(out[4].v, _MM_SHUFFLE(1,0,3,2));
	out[5].v = _mm_shuffle_epi32(out[5].v, _MM_SHUFFLE(1,0,3,2));
	out[6].v = _mm_shuffle_epi32(out[6].v, _MM_SHUFFLE(1,0,3,2));
	out[7].v = _mm_shuffle_epi32(out[7].v, _MM_SHUFFLE(1,0,3,2));
	out[8].v = _mm_shuffle_epi32(out[8].v, _MM_SHUFFLE(1,0,3,2));
	out[9].v = _mm_shuffle_epi32(out[9].v, _MM_SHUFFLE(1,0,3,2));
}

DONNA_INLINE static void
curve25519_untangle64(bignum25519 x, bignum25519 z, const packedelem64 *in) {
	_mm_store_si128((xmmi *)(x + 0), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[0].v, in[1].v), _mm_unpacklo_epi32(in[2].v, in[3].v)));
	_mm_store_si128((xmmi *)(x + 4), _mm_unpacklo_epi64(_mm_unpacklo_epi32(in[4].v, in[5].v), _mm_unpacklo_epi32(in[6].v, in[7].v)));
	_mm_store_si128((xmmi *)(x + 8), _mm_unpacklo_epi32(in[8].v, in[9].v) );
	_mm_store_si128((xmmi *)(z + 0), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[0].v, in[1].v), _mm_unpackhi_epi32(in[2].v, in[3].v)));
	_mm_store_si128((xmmi *)(z + 4), _mm_unpacklo_epi64(_mm_unpackhi_epi32(in[4].v, in[5].v), _mm_unpackhi_epi32(in[6].v, in[7].v)));
	_mm_store_si128((xmmi *)(z + 8), _mm_unpackhi_epi32(in[8].v, in[9].v) );
}
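/*
	Editorial note, not part of the original source: curve25519_mul_packed64
	and curve25519_square_packed64 below are the packed counterparts of
	curve25519_mul and curve25519_square_times: each xmm register holds one
	limb from each of two tangled elements, so a single pass computes two
	field multiplications (or squarings) in parallel.
*/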
|
808
|
+
DONNA_INLINE static void
|
809
|
+
curve25519_mul_packed64(packedelem64 *out, const packedelem64 *r, const packedelem64 *s) {
|
810
|
+
xmmi r1,r2,r3,r4,r5,r6,r7,r8,r9;
|
811
|
+
xmmi r1_2,r3_2,r5_2,r7_2,r9_2;
|
812
|
+
xmmi c1,c2;
|
813
|
+
|
814
|
+
out[0].v = _mm_mul_epu32(r[0].v, s[0].v);
|
815
|
+
out[1].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[1].v), _mm_mul_epu32(r[1].v, s[0].v));
|
816
|
+
r1_2 = _mm_slli_epi32(r[1].v, 1);
|
817
|
+
out[2].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[1].v), _mm_mul_epu32(r[2].v, s[0].v)));
|
818
|
+
out[3].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[1].v), _mm_mul_epu32(r[3].v, s[0].v))));
|
819
|
+
r3_2 = _mm_slli_epi32(r[3].v, 1);
|
820
|
+
out[4].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[1].v), _mm_mul_epu32(r[4].v, s[0].v)))));
|
821
|
+
out[5].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[1].v), _mm_mul_epu32(r[5].v, s[0].v))))));
|
822
|
+
r5_2 = _mm_slli_epi32(r[5].v, 1);
|
823
|
+
out[6].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[5].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r5_2 , s[1].v), _mm_mul_epu32(r[6].v, s[0].v)))))));
|
824
|
+
out[7].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[1].v), _mm_mul_epu32(r[7].v , s[0].v))))))));
|
825
|
+
r7_2 = _mm_slli_epi32(r[7].v, 1);
|
826
|
+
out[8].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r1_2 , s[7].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2 , s[5].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2 , s[3].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2 , s[1].v), _mm_mul_epu32(r[8].v, s[0].v)))))))));
|
827
|
+
out[9].v = _mm_add_epi64(_mm_mul_epu32(r[0].v, s[9].v), _mm_add_epi64(_mm_mul_epu32(r[1].v, s[8].v), _mm_add_epi64(_mm_mul_epu32(r[2].v, s[7].v), _mm_add_epi64(_mm_mul_epu32(r[3].v, s[6].v), _mm_add_epi64(_mm_mul_epu32(r[4].v, s[5].v), _mm_add_epi64(_mm_mul_epu32(r[5].v, s[4].v), _mm_add_epi64(_mm_mul_epu32(r[6].v, s[3].v), _mm_add_epi64(_mm_mul_epu32(r[7].v, s[2].v), _mm_add_epi64(_mm_mul_epu32(r[8].v, s[1].v), _mm_mul_epu32(r[9].v, s[0].v))))))))));
|
828
|
+
|
829
|
+
r1 = _mm_mul_epu32(r[1].v, packednineteen.v);
|
830
|
+
r2 = _mm_mul_epu32(r[2].v, packednineteen.v);
|
831
|
+
r1_2 = _mm_slli_epi32(r1, 1);
|
832
|
+
r3 = _mm_mul_epu32(r[3].v, packednineteen.v);
|
833
|
+
r4 = _mm_mul_epu32(r[4].v, packednineteen.v);
|
834
|
+
r3_2 = _mm_slli_epi32(r3, 1);
|
835
|
+
r5 = _mm_mul_epu32(r[5].v, packednineteen.v);
|
836
|
+
r6 = _mm_mul_epu32(r[6].v, packednineteen.v);
|
837
|
+
r5_2 = _mm_slli_epi32(r5, 1);
|
838
|
+
r7 = _mm_mul_epu32(r[7].v, packednineteen.v);
|
839
|
+
r8 = _mm_mul_epu32(r[8].v, packednineteen.v);
|
840
|
+
r7_2 = _mm_slli_epi32(r7, 1);
|
841
|
+
r9 = _mm_mul_epu32(r[9].v, packednineteen.v);
|
842
|
+
r9_2 = _mm_slli_epi32(r9, 1);
|
843
|
+
|
844
|
+
    out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[1].v), _mm_add_epi64(_mm_mul_epu32(r8, s[2].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r6, s[4].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r4, s[6].v), _mm_add_epi64(_mm_mul_epu32(r3_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r2, s[8].v), _mm_mul_epu32(r1_2, s[9].v))))))))));
    out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[2].v), _mm_add_epi64(_mm_mul_epu32(r8, s[3].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[4].v), _mm_add_epi64(_mm_mul_epu32(r6, s[5].v), _mm_add_epi64(_mm_mul_epu32(r5 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r4, s[7].v), _mm_add_epi64(_mm_mul_epu32(r3 , s[8].v), _mm_mul_epu32(r2, s[9].v)))))))));
    out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[3].v), _mm_add_epi64(_mm_mul_epu32(r8, s[4].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r6, s[6].v), _mm_add_epi64(_mm_mul_epu32(r5_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r4, s[8].v), _mm_mul_epu32(r3_2, s[9].v))))))));
    out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[4].v), _mm_add_epi64(_mm_mul_epu32(r8, s[5].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r6, s[7].v), _mm_add_epi64(_mm_mul_epu32(r5 , s[8].v), _mm_mul_epu32(r4, s[9].v)))))));
    out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[5].v), _mm_add_epi64(_mm_mul_epu32(r8, s[6].v), _mm_add_epi64(_mm_mul_epu32(r7_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r6, s[8].v), _mm_mul_epu32(r5_2, s[9].v))))));
    out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[6].v), _mm_add_epi64(_mm_mul_epu32(r8, s[7].v), _mm_add_epi64(_mm_mul_epu32(r7 , s[8].v), _mm_mul_epu32(r6, s[9].v)))));
    out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(r9_2, s[7].v), _mm_add_epi64(_mm_mul_epu32(r8, s[8].v), _mm_mul_epu32(r7_2, s[9].v))));
    out[7].v = _mm_add_epi64(out[7].v, _mm_add_epi64(_mm_mul_epu32(r9 , s[8].v), _mm_mul_epu32(r8, s[9].v)));
    out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(r9_2, s[9].v));

    c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
    c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
    c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
    c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
    c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2);
    c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
    c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
}
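
/* Editor's sketch of the representation used above: each packedelem64 holds
   two field elements side by side, one per 64-bit lane, so every
   _mm_mul_epu32/_mm_add_epi64 advances two independent computations at once.
   Limbs alternate 26 and 25 bits (a base-2^25.5 representation of integers
   mod 2^255-19), which is why the carry chain masks with packedmask26 and
   packedmask25 in turn; the carry out of limb 9 is folded back into limb 0
   multiplied by 19, using 2^255 = 19 (mod 2^255-19). */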
DONNA_INLINE static void
curve25519_square_packed64(packedelem64 *out, const packedelem64 *r) {
    xmmi r0,r1,r2,r3;
    xmmi r1_2,r3_2,r4_2,r5_2,r6_2,r7_2;
    xmmi d5,d6,d7,d8,d9;
    xmmi c1,c2;

    r0 = r[0].v;
    r1 = r[1].v;
    r2 = r[2].v;
    r3 = r[3].v;

    out[0].v = _mm_mul_epu32(r0, r0);
    r0 = _mm_slli_epi32(r0, 1);
    out[1].v = _mm_mul_epu32(r0, r1);
    r1_2 = _mm_slli_epi32(r1, 1);
    out[2].v = _mm_add_epi64(_mm_mul_epu32(r0, r2 ), _mm_mul_epu32(r1, r1_2));
    r1 = r1_2;
    out[3].v = _mm_add_epi64(_mm_mul_epu32(r0, r3 ), _mm_mul_epu32(r1, r2 ));
    r3_2 = _mm_slli_epi32(r3, 1);
    out[4].v = _mm_add_epi64(_mm_mul_epu32(r0, r[4].v), _mm_add_epi64(_mm_mul_epu32(r1, r3_2 ), _mm_mul_epu32(r2, r2)));
    r2 = _mm_slli_epi32(r2, 1);
    out[5].v = _mm_add_epi64(_mm_mul_epu32(r0, r[5].v), _mm_add_epi64(_mm_mul_epu32(r1, r[4].v), _mm_mul_epu32(r2, r3)));
    r5_2 = _mm_slli_epi32(r[5].v, 1);
    out[6].v = _mm_add_epi64(_mm_mul_epu32(r0, r[6].v), _mm_add_epi64(_mm_mul_epu32(r1, r5_2 ), _mm_add_epi64(_mm_mul_epu32(r2, r[4].v), _mm_mul_epu32(r3, r3_2 ))));
    r3 = r3_2;
    out[7].v = _mm_add_epi64(_mm_mul_epu32(r0, r[7].v), _mm_add_epi64(_mm_mul_epu32(r1, r[6].v), _mm_add_epi64(_mm_mul_epu32(r2, r[5].v), _mm_mul_epu32(r3, r[4].v))));
    r7_2 = _mm_slli_epi32(r[7].v, 1);
    out[8].v = _mm_add_epi64(_mm_mul_epu32(r0, r[8].v), _mm_add_epi64(_mm_mul_epu32(r1, r7_2 ), _mm_add_epi64(_mm_mul_epu32(r2, r[6].v), _mm_add_epi64(_mm_mul_epu32(r3, r5_2 ), _mm_mul_epu32(r[4].v, r[4].v)))));
    out[9].v = _mm_add_epi64(_mm_mul_epu32(r0, r[9].v), _mm_add_epi64(_mm_mul_epu32(r1, r[8].v), _mm_add_epi64(_mm_mul_epu32(r2, r[7].v), _mm_add_epi64(_mm_mul_epu32(r3, r[6].v), _mm_mul_epu32(r[4].v, r5_2 )))));
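
    /* d5..d9 pre-scale the upper limbs by 19, or by 38 = 2*19 where a
       doubling is also needed, so that the products which would land in
       limbs 10..18 of the square can be folded back into limbs 0..8,
       again using 2^255 = 19 (mod 2^255-19); packednineteen and
       packedthirtyeight are broadcast constants. */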
    d5 = _mm_mul_epu32(r[5].v, packedthirtyeight.v);
    d6 = _mm_mul_epu32(r[6].v, packednineteen.v);
    d7 = _mm_mul_epu32(r[7].v, packedthirtyeight.v);
    d8 = _mm_mul_epu32(r[8].v, packednineteen.v);
    d9 = _mm_mul_epu32(r[9].v, packedthirtyeight.v);

    r4_2 = _mm_slli_epi32(r[4].v, 1);
    r6_2 = _mm_slli_epi32(r[6].v, 1);
    out[0].v = _mm_add_epi64(out[0].v, _mm_add_epi64(_mm_mul_epu32(d9, r1 ), _mm_add_epi64(_mm_mul_epu32(d8, r2 ), _mm_add_epi64(_mm_mul_epu32(d7, r3 ), _mm_add_epi64(_mm_mul_epu32(d6, r4_2), _mm_mul_epu32(d5, r[5].v))))));
    out[1].v = _mm_add_epi64(out[1].v, _mm_add_epi64(_mm_mul_epu32(d9, _mm_srli_epi32(r2, 1)), _mm_add_epi64(_mm_mul_epu32(d8, r3 ), _mm_add_epi64(_mm_mul_epu32(d7, r[4].v), _mm_mul_epu32(d6, r5_2 )))));
    out[2].v = _mm_add_epi64(out[2].v, _mm_add_epi64(_mm_mul_epu32(d9, r3 ), _mm_add_epi64(_mm_mul_epu32(d8, r4_2), _mm_add_epi64(_mm_mul_epu32(d7, r5_2 ), _mm_mul_epu32(d6, r[6].v)))));
    out[3].v = _mm_add_epi64(out[3].v, _mm_add_epi64(_mm_mul_epu32(d9, r[4].v ), _mm_add_epi64(_mm_mul_epu32(d8, r5_2), _mm_mul_epu32(d7, r[6].v))));
    out[4].v = _mm_add_epi64(out[4].v, _mm_add_epi64(_mm_mul_epu32(d9, r5_2 ), _mm_add_epi64(_mm_mul_epu32(d8, r6_2), _mm_mul_epu32(d7, r[7].v))));
    out[5].v = _mm_add_epi64(out[5].v, _mm_add_epi64(_mm_mul_epu32(d9, r[6].v ), _mm_mul_epu32(d8, r7_2 )));
    out[6].v = _mm_add_epi64(out[6].v, _mm_add_epi64(_mm_mul_epu32(d9, r7_2 ), _mm_mul_epu32(d8, r[8].v)));
    out[7].v = _mm_add_epi64(out[7].v, _mm_mul_epu32(d9, r[8].v));
    out[8].v = _mm_add_epi64(out[8].v, _mm_mul_epu32(d9, r[9].v));

    c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
    c1 = _mm_srli_epi64(out[1].v, 25); c2 = _mm_srli_epi64(out[5].v, 25); out[1].v = _mm_and_si128(out[1].v, packedmask25.v); out[5].v = _mm_and_si128(out[5].v, packedmask25.v); out[2].v = _mm_add_epi64(out[2].v, c1); out[6].v = _mm_add_epi64(out[6].v, c2);
    c1 = _mm_srli_epi64(out[2].v, 26); c2 = _mm_srli_epi64(out[6].v, 26); out[2].v = _mm_and_si128(out[2].v, packedmask26.v); out[6].v = _mm_and_si128(out[6].v, packedmask26.v); out[3].v = _mm_add_epi64(out[3].v, c1); out[7].v = _mm_add_epi64(out[7].v, c2);
    c1 = _mm_srli_epi64(out[3].v, 25); c2 = _mm_srli_epi64(out[7].v, 25); out[3].v = _mm_and_si128(out[3].v, packedmask25.v); out[7].v = _mm_and_si128(out[7].v, packedmask25.v); out[4].v = _mm_add_epi64(out[4].v, c1); out[8].v = _mm_add_epi64(out[8].v, c2);
    c2 = _mm_srli_epi64(out[8].v, 26); out[8].v = _mm_and_si128(out[8].v, packedmask26.v); out[9].v = _mm_add_epi64(out[9].v, c2);
    c2 = _mm_srli_epi64(out[9].v, 25); out[9].v = _mm_and_si128(out[9].v, packedmask25.v); out[0].v = _mm_add_epi64(out[0].v, _mm_mul_epu32(c2, packednineteen.v));
    c1 = _mm_srli_epi64(out[0].v, 26); c2 = _mm_srli_epi64(out[4].v, 26); out[0].v = _mm_and_si128(out[0].v, packedmask26.v); out[4].v = _mm_and_si128(out[4].v, packedmask26.v); out[1].v = _mm_add_epi64(out[1].v, c1); out[5].v = _mm_add_epi64(out[5].v, c2);
}
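
/* Limb layout used by expand/contract below: limb i holds bits
   [offset, offset+width) of the 255-bit value, with widths alternating
   26,25,26,25,... and offsets 0,26,51,77,102,128,153,179,204,230. The
   shift counts in curve25519_expand (26,19,13,6,25,19,12,6) are these
   offsets reduced modulo 32. */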
/* Take a little-endian, 32-byte number and expand it into polynomial form */
static void
curve25519_expand(bignum25519 out, const unsigned char in[32]) {
    uint32_t x0,x1,x2,x3,x4,x5,x6,x7;

    x0 = *(uint32_t *)(in + 0);
    x1 = *(uint32_t *)(in + 4);
    x2 = *(uint32_t *)(in + 8);
    x3 = *(uint32_t *)(in + 12);
    x4 = *(uint32_t *)(in + 16);
    x5 = *(uint32_t *)(in + 20);
    x6 = *(uint32_t *)(in + 24);
    x7 = *(uint32_t *)(in + 28);

    out[0] = ( x0 ) & 0x3ffffff;
    out[1] = ((((uint64_t)x1 << 32) | x0) >> 26) & 0x1ffffff;
    out[2] = ((((uint64_t)x2 << 32) | x1) >> 19) & 0x3ffffff;
    out[3] = ((((uint64_t)x3 << 32) | x2) >> 13) & 0x1ffffff;
    out[4] = (( x3) >> 6) & 0x3ffffff;
    out[5] = ( x4 ) & 0x1ffffff;
    out[6] = ((((uint64_t)x5 << 32) | x4) >> 25) & 0x3ffffff;
    out[7] = ((((uint64_t)x6 << 32) | x5) >> 19) & 0x1ffffff;
    out[8] = ((((uint64_t)x7 << 32) | x6) >> 12) & 0x3ffffff;
    out[9] = (( x7) >> 6) & 0x1ffffff;
    out[10] = 0;
    out[11] = 0;
}
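
/* Limbs 10 and 11 above are zero padding: bignum25519 is 12 uint32 words
   (48 bytes) so the SSE2 routines in this file can treat it as exactly
   three 16-byte xmm words. */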

/* Take a fully reduced polynomial form number and contract it into a
 * little-endian, 32-byte array
 */
static void
curve25519_contract(unsigned char out[32], const bignum25519 in) {
    bignum25519 ALIGN(16) f;
    curve25519_copy(f, in);

#define carry_pass() \
    f[1] += f[0] >> 26; f[0] &= 0x3ffffff; \
    f[2] += f[1] >> 25; f[1] &= 0x1ffffff; \
    f[3] += f[2] >> 26; f[2] &= 0x3ffffff; \
    f[4] += f[3] >> 25; f[3] &= 0x1ffffff; \
    f[5] += f[4] >> 26; f[4] &= 0x3ffffff; \
    f[6] += f[5] >> 25; f[5] &= 0x1ffffff; \
    f[7] += f[6] >> 26; f[6] &= 0x3ffffff; \
    f[8] += f[7] >> 25; f[7] &= 0x1ffffff; \
    f[9] += f[8] >> 26; f[8] &= 0x3ffffff;

#define carry_pass_full() \
    carry_pass() \
    f[0] += 19 * (f[9] >> 25); f[9] &= 0x1ffffff;

#define carry_pass_final() \
    carry_pass() \
    f[9] &= 0x1ffffff;

    carry_pass_full()
    carry_pass_full()

    /* now f is between 0 and 2^255-1, properly carried. */
    /* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */
    f[0] += 19;
    carry_pass_full()

    /* now between 19 and 2^255-1 in both cases, and offset by 19. */
    f[0] += (1 << 26) - 19;
    f[1] += (1 << 25) - 1;
    f[2] += (1 << 26) - 1;
    f[3] += (1 << 25) - 1;
    f[4] += (1 << 26) - 1;
    f[5] += (1 << 25) - 1;
    f[6] += (1 << 26) - 1;
    f[7] += (1 << 25) - 1;
    f[8] += (1 << 26) - 1;
    f[9] += (1 << 25) - 1;

    /* now between 2^255 and 2^256-20, and offset by 2^255. */
    carry_pass_final()

#undef carry_pass
#undef carry_pass_full
#undef carry_pass_final

    f[1] <<= 2;
    f[2] <<= 3;
    f[3] <<= 5;
    f[4] <<= 6;
    f[6] <<= 1;
    f[7] <<= 3;
    f[8] <<= 4;
    f[9] <<= 6;
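
    /* Repack: each limb is shifted to its bit offset modulo 8 within the
       output bytes (offsets 26,51,77,... from the layout noted above),
       then the F() stores below merge adjacent limbs with |= where they
       share a byte; out[0] and out[16] are the only bytes cleared in
       advance. */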

#define F(i, s) \
    out[s+0] |= (unsigned char)(f[i] & 0xff); \
    out[s+1] = (unsigned char)((f[i] >> 8) & 0xff); \
    out[s+2] = (unsigned char)((f[i] >> 16) & 0xff); \
    out[s+3] = (unsigned char)((f[i] >> 24) & 0xff);

    out[0] = 0;
    out[16] = 0;
    F(0,0);
    F(1,3);
    F(2,6);
    F(3,9);
    F(4,12);
    F(5,16);
    F(6,19);
    F(7,22);
    F(8,25);
    F(9,28);
#undef F
}
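
/* The freeze above yields the canonical encoding: for example, an input
   congruent to p+1 = 2^255-18 contracts to the same 32 bytes as an input
   equal to 1. */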

/* if (iswap) swap(a, b) */
DONNA_INLINE static void
curve25519_swap_conditional(bignum25519 a, bignum25519 b, uint32_t iswap) {
    const uint32_t swap = (uint32_t)(-(int32_t)iswap);
    xmmi a0,a1,a2,b0,b1,b2,x0,x1,x2;
    xmmi mask = _mm_cvtsi32_si128(swap);
    mask = _mm_shuffle_epi32(mask, 0);
    a0 = _mm_load_si128((xmmi *)a + 0);
    a1 = _mm_load_si128((xmmi *)a + 1);
    b0 = _mm_load_si128((xmmi *)b + 0);
    b1 = _mm_load_si128((xmmi *)b + 1);
    b0 = _mm_xor_si128(a0, b0);
    b1 = _mm_xor_si128(a1, b1);
    x0 = _mm_and_si128(b0, mask);
    x1 = _mm_and_si128(b1, mask);
    x0 = _mm_xor_si128(x0, a0);
    x1 = _mm_xor_si128(x1, a1);
    a0 = _mm_xor_si128(x0, b0);
    a1 = _mm_xor_si128(x1, b1);
    _mm_store_si128((xmmi *)a + 0, x0);
    _mm_store_si128((xmmi *)a + 1, x1);
    _mm_store_si128((xmmi *)b + 0, a0);
    _mm_store_si128((xmmi *)b + 1, a1);

    a2 = _mm_load_si128((xmmi *)a + 2);
    b2 = _mm_load_si128((xmmi *)b + 2);
    b2 = _mm_xor_si128(a2, b2);
    x2 = _mm_and_si128(b2, mask);
    x2 = _mm_xor_si128(x2, a2);
    a2 = _mm_xor_si128(x2, b2);
    _mm_store_si128((xmmi *)b + 2, a2);
    _mm_store_si128((xmmi *)a + 2, x2);
}
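
/* Branch-free swap via xor-masking. The scalar equivalent for each word,
   as an editor's sketch, is:

       uint32_t mask = (uint32_t)(-(int32_t)iswap);   -- 0 or 0xffffffff
       uint32_t t = (a[i] ^ b[i]) & mask;             -- difference, or 0
       a[i] ^= t;
       b[i] ^= t;

   The SSE2 code above computes the same result per lane, keeping the
   intermediate values in registers (x plays the role of the new a). */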

/* out = (flag) ? out : in */
DONNA_INLINE static void
curve25519_move_conditional_bytes(uint8_t out[96], const uint8_t in[96], uint32_t flag) {
    xmmi a0,a1,a2,a3,a4,a5,b0,b1,b2,b3,b4,b5;
    const uint32_t nb = flag - 1;
    xmmi masknb = _mm_shuffle_epi32(_mm_cvtsi32_si128(nb),0);
    a0 = _mm_load_si128((xmmi *)in + 0);
    a1 = _mm_load_si128((xmmi *)in + 1);
    a2 = _mm_load_si128((xmmi *)in + 2);
    b0 = _mm_load_si128((xmmi *)out + 0);
    b1 = _mm_load_si128((xmmi *)out + 1);
    b2 = _mm_load_si128((xmmi *)out + 2);
    a0 = _mm_andnot_si128(masknb, a0);
    a1 = _mm_andnot_si128(masknb, a1);
    a2 = _mm_andnot_si128(masknb, a2);
    b0 = _mm_and_si128(masknb, b0);
    b1 = _mm_and_si128(masknb, b1);
    b2 = _mm_and_si128(masknb, b2);
    a0 = _mm_or_si128(a0, b0);
    a1 = _mm_or_si128(a1, b1);
    a2 = _mm_or_si128(a2, b2);
    _mm_store_si128((xmmi*)out + 0, a0);
    _mm_store_si128((xmmi*)out + 1, a1);
    _mm_store_si128((xmmi*)out + 2, a2);

    a3 = _mm_load_si128((xmmi *)in + 3);
    a4 = _mm_load_si128((xmmi *)in + 4);
    a5 = _mm_load_si128((xmmi *)in + 5);
    b3 = _mm_load_si128((xmmi *)out + 3);
    b4 = _mm_load_si128((xmmi *)out + 4);
    b5 = _mm_load_si128((xmmi *)out + 5);
    a3 = _mm_andnot_si128(masknb, a3);
    a4 = _mm_andnot_si128(masknb, a4);
    a5 = _mm_andnot_si128(masknb, a5);
    b3 = _mm_and_si128(masknb, b3);
    b4 = _mm_and_si128(masknb, b4);
    b5 = _mm_and_si128(masknb, b5);
    a3 = _mm_or_si128(a3, b3);
    a4 = _mm_or_si128(a4, b4);
    a5 = _mm_or_si128(a5, b5);
    _mm_store_si128((xmmi*)out + 3, a3);
    _mm_store_si128((xmmi*)out + 4, a4);
    _mm_store_si128((xmmi*)out + 5, a5);
}
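
/* Note the mask construction: flag is expected to be 0 or 1, so
   nb = flag - 1 is all ones when flag == 0 and all zeros when flag == 1;
   the andnot/and/or blend therefore keeps out when flag is set and copies
   in otherwise, without branching on secret data. */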