x25519 0.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,25 @@
1
+ /**
2
+ * Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>.
3
+ * Institute of Computing.
4
+ * University of Campinas, Brazil.
5
+ *
6
+ * This program is free software: you can redistribute it and/or modify
7
+ * it under the terms of the GNU Lesser General Public License as
8
+ * published by the Free Software Foundation, version 3.
9
+ *
10
+ * This program is distributed in the hope that it will be useful, but
11
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ * Lesser General Public License for more details.
14
+ *
15
+ * You should have received a copy of the GNU Lesser General Public License
16
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
17
+ */
18
#ifndef BYTES_H
#define BYTES_H

#include <stdint.h>
/* Debug helper: prints num_bytes bytes of A (presumably in hexadecimal --
 * TODO confirm against the implementation in bytes.c). */
void print_bytes(uint8_t * A, int num_bytes);
/* Compares num_bytes bytes of A and B.  NOTE(review): return convention
 * (0 == equal vs. nonzero == equal) is not visible here -- confirm in bytes.c
 * before relying on it. */
int compare_bytes(uint8_t* A, uint8_t* B,unsigned int num_bytes);

#endif /* BYTES_H */
@@ -0,0 +1,826 @@
1
+ /**
2
+ * Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>.
3
+ * Institute of Computing.
4
+ * University of Campinas, Brazil.
5
+ *
6
+ * This program is free software: you can redistribute it and/or modify
7
+ * it under the terms of the GNU Lesser General Public License as
8
+ * published by the Free Software Foundation, version 3.
9
+ *
10
+ * This program is distributed in the hope that it will be useful, but
11
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ * Lesser General Public License for more details.
14
+ *
15
+ * You should have received a copy of the GNU Lesser General Public License
16
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
17
+ */
18
+ #include "random.h"
19
+ #include "bytes.h"
20
+ #include "fp25519_x64.h"
21
+
22
+ void random_EltFp25519_1w_x64(uint64_t *A)
23
+ {
24
+ random_bytes((uint8_t*)A,SIZE_ELEMENT_BYTES);
25
+ A[3] &= ((uint64_t)1<<63)-1;
26
+ }
27
+
28
+ int compare_EltFp25519_1w_x64(uint64_t *A, uint64_t *B)
29
+ {
30
+ return compare_bytes((uint8_t*)A,(uint8_t*)B,SIZE_ELEMENT_BYTES);
31
+ }
32
+
33
+ void print_EltFp25519_1w_x64(uint64_t *A)
34
+ {
35
+ print_bytes((uint8_t*)A,SIZE_ELEMENT_BYTES);
36
+ }
37
+
38
+ /**
39
+ *
40
+ * @param c Two 512-bit products: c[0:7]=a[0:3]*b[0:3] and c[8:15]=a[4:7]*b[4:7]
41
+ * @param a Two 256-bit integers: a[0:3] and a[4:7]
42
+ * @param b Two 256-bit integers: b[0:3] and b[4:7]
43
+ */
44
+ void mul2_256x256_integer_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b)
45
+ {
46
+ #ifdef __BMI2__
47
+ #ifdef __ADX__
48
+ __asm__ __volatile__(
49
+ "movq (%1), %%rdx # A[0] \n\t"
50
+ "mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "movq %%r8, (%0) \n\t"
51
+ "mulx 8(%2), %%r10, %%r11 # A[0]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "movq %%r10, 8(%0) \n\t"
52
+ "mulx 16(%2), %%r12, %%r13 # A[0]*B[2] \n\t" "adox %%r11, %%r12 \n\t"
53
+ "mulx 24(%2), %%r14, %%rdx # A[0]*B[3] \n\t" "adox %%r13, %%r14 \n\t" " movq $0, %%rax \n\t"
54
+ "adox %%rdx, %%rax \n\t"
55
+
56
+ "movq 8(%1), %%rdx # A[1] \n\t"
57
+ "mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 8(%0), %%r8 \n\t" "movq %%r8, 8(%0) \n\t"
58
+ "mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 16(%0) \n\t"
59
+ "mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
60
+ "mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
61
+ "adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
62
+
63
+ "movq 16(%1), %%rdx # A[2] \n\t"
64
+ "mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 16(%0), %%r8 \n\t" "movq %%r8, 16(%0) \n\t"
65
+ "mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 24(%0) \n\t"
66
+ "mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
67
+ "mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
68
+ "adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
69
+
70
+ "movq 24(%1), %%rdx # A[3] \n\t"
71
+ "mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 24(%0), %%r8 \n\t" "movq %%r8, 24(%0) \n\t"
72
+ "mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 32(%0) \n\t"
73
+ "mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" "movq %%r12, 40(%0) \n\t" " movq $0, %%r8 \n\t"
74
+ "mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" "movq %%r14, 48(%0) \n\t" " movq $0, %%rax \n\t"
75
+ "adox %%rdx, %%rax \n\t" "adcx %%r8, %%rax \n\t" " movq %%rax, 56(%0) \n\t"
76
+
77
+ "movq 32(%1), %%rdx # A[0] \n\t"
78
+ "mulx 32(%2), %%r8, %%r9 # A[0]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "movq %%r8, 64(%0) \n\t"
79
+ "mulx 40(%2), %%r10, %%r11 # A[0]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "movq %%r10, 72(%0) \n\t"
80
+ "mulx 48(%2), %%r12, %%r13 # A[0]*B[2] \n\t" "adox %%r11, %%r12 \n\t"
81
+ "mulx 56(%2), %%r14, %%rdx # A[0]*B[3] \n\t" "adox %%r13, %%r14 \n\t" " movq $0, %%rax \n\t"
82
+ "adox %%rdx, %%rax \n\t"
83
+
84
+ "movq 40(%1), %%rdx # A[1] \n\t"
85
+ "mulx 32(%2), %%r8, %%r9 # A[1]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 72(%0), %%r8 \n\t" "movq %%r8, 72(%0) \n\t"
86
+ "mulx 40(%2), %%r10, %%r11 # A[1]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 80(%0) \n\t"
87
+ "mulx 48(%2), %%r12, %%r13 # A[1]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
88
+ "mulx 56(%2), %%r14, %%rdx # A[1]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
89
+ "adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
90
+
91
+ "movq 48(%1), %%rdx # A[2] \n\t"
92
+ "mulx 32(%2), %%r8, %%r9 # A[2]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 80(%0), %%r8 \n\t" "movq %%r8, 80(%0) \n\t"
93
+ "mulx 40(%2), %%r10, %%r11 # A[2]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 88(%0) \n\t"
94
+ "mulx 48(%2), %%r12, %%r13 # A[2]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
95
+ "mulx 56(%2), %%r14, %%rdx # A[2]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
96
+ "adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
97
+
98
+ "movq 56(%1), %%rdx # A[3] \n\t"
99
+ "mulx 32(%2), %%r8, %%r9 # A[3]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 88(%0), %%r8 \n\t" "movq %%r8, 88(%0) \n\t"
100
+ "mulx 40(%2), %%r10, %%r11 # A[3]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 96(%0) \n\t"
101
+ "mulx 48(%2), %%r12, %%r13 # A[3]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" "movq %%r12, 104(%0) \n\t" " movq $0, %%r8 \n\t"
102
+ "mulx 56(%2), %%r14, %%rdx # A[3]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" "movq %%r14, 112(%0) \n\t" " movq $0, %%rax \n\t"
103
+ "adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t" " movq %%rax, 120(%0) \n\t"
104
+ :
105
+ : "r" (c), "r" (a), "r" (b)
106
+ : "memory", "cc", "%rax", "%rdx",
107
+ "%r8", "%r9", "%r10", "%r11",
108
+ "%r12", "%r13", "%r14"
109
+ );
110
+ #else
111
+ __asm__ __volatile__(
112
+ "movq (%1), %%rdx # A[0] \n\t"
113
+ "mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t" "movq %%r8, (%0) \n\t"
114
+ "mulx 8(%2), %%r10, %%rax # A[0]*B[1] \n\t" "addq %%r10, %%r9 \n\t" "movq %%r9, 8(%0) \n\t"
115
+ "mulx 16(%2), %%r12, %%rbx # A[0]*B[2] \n\t" "adcq %%r12, %%rax \n\t"
116
+ "mulx 24(%2), %%r14, %%rcx # A[0]*B[3] \n\t" "adcq %%r14, %%rbx \n\t"
117
+ "adcq $0, %%rcx \n\t"
118
+
119
+ "movq 8(%1), %%rdx # A[1] \n\t"
120
+ "mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t"
121
+ "mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
122
+ "mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
123
+ "mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
124
+ "adcq $0, %%rdx \n\t"
125
+
126
+ "addq %%r8, 8(%0) \n\t"
127
+ "adcq %%rax, %%r9 \n\t" "movq %%r9, 16(%0) \n\t" "movq $0, %%rax \n\t"
128
+ "adcq %%r11, %%rbx \n\t"
129
+ "adcq %%r13, %%rcx \n\t"
130
+ "adcq %%rdx, %%rax \n\t"
131
+
132
+ "movq 16(%1), %%rdx # A[2] \n\t"
133
+ "mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t"
134
+ "mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
135
+ "mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
136
+ "mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
137
+ "adcq $0, %%rdx \n\t"
138
+
139
+ "addq %%r8, 16(%0) \n\t"
140
+ "adcq %%rbx, %%r9 \n\t" "movq %%r9, 24(%0) \n\t" "movq $0, %%rbx \n\t"
141
+ "adcq %%r11, %%rcx \n\t"
142
+ "adcq %%r13, %%rax \n\t"
143
+ "adcq %%rdx, %%rbx \n\t"
144
+
145
+ "movq 24(%1), %%rdx # A[3] \n\t"
146
+ "mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t"
147
+ "mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
148
+ "mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
149
+ "mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
150
+ "adcq $0, %%rdx \n\t"
151
+
152
+ "addq %%r8, 24(%0) \n\t"
153
+ "adcq %%rcx, %%r9 \n\t" "movq %%r9, 32(%0) \n\t" " movq $0, %%rcx \n\t"
154
+ "adcq %%r11, %%rax \n\t" "movq %%rax, 40(%0) \n\t"
155
+ "adcq %%r13, %%rbx \n\t" "movq %%rbx, 48(%0) \n\t"
156
+ "adcq %%rdx, %%rcx \n\t" "movq %%rcx, 56(%0) \n\t"
157
+
158
+ "movq 32(%1), %%rdx # A[0] \n\t"
159
+ "mulx 32(%2), %%r8, %%r9 # A[0]*B[0] \n\t" "movq %%r8, 64(%0) \n\t"
160
+ "mulx 40(%2), %%r10, %%rax # A[0]*B[1] \n\t" "addq %%r10, %%r9 \n\t" "movq %%r9, 72(%0) \n\t"
161
+ "mulx 48(%2), %%r12, %%rbx # A[0]*B[2] \n\t" "adcq %%r12, %%rax \n\t"
162
+ "mulx 56(%2), %%r14, %%rcx # A[0]*B[3] \n\t" "adcq %%r14, %%rbx \n\t"
163
+ "adcq $0, %%rcx \n\t"
164
+
165
+ "movq 40(%1), %%rdx # A[1] \n\t"
166
+ "mulx 32(%2), %%r8, %%r9 # A[1]*B[0] \n\t"
167
+ "mulx 40(%2), %%r10, %%r11 # A[1]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
168
+ "mulx 48(%2), %%r12, %%r13 # A[1]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
169
+ "mulx 56(%2), %%r14, %%rdx # A[1]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
170
+ "adcq $0, %%rdx \n\t"
171
+
172
+ "addq %%r8, 72(%0) \n\t"
173
+ "adcq %%rax, %%r9 \n\t" " movq %%r9, 80(%0) \n\t" " movq $0, %%rax \n\t"
174
+ "adcq %%r11, %%rbx \n\t"
175
+ "adcq %%r13, %%rcx \n\t"
176
+ "adcq %%rdx, %%rax \n\t"
177
+
178
+ "movq 48(%1), %%rdx # A[2] \n\t"
179
+ "mulx 32(%2), %%r8, %%r9 # A[2]*B[0] \n\t"
180
+ "mulx 40(%2), %%r10, %%r11 # A[2]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
181
+ "mulx 48(%2), %%r12, %%r13 # A[2]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
182
+ "mulx 56(%2), %%r14, %%rdx # A[2]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
183
+ "adcq $0, %%rdx \n\t"
184
+
185
+ "addq %%r8, 80(%0) \n\t"
186
+ "adcq %%rbx, %%r9 \n\t" " movq %%r9, 88(%0) \n\t" " movq $0, %%rbx \n\t"
187
+ "adcq %%r11, %%rcx \n\t"
188
+ "adcq %%r13, %%rax \n\t"
189
+ "adcq %%rdx, %%rbx \n\t"
190
+
191
+ "movq 56(%1), %%rdx # A[3] \n\t"
192
+ "mulx 32(%2), %%r8, %%r9 # A[3]*B[0] \n\t"
193
+ "mulx 40(%2), %%r10, %%r11 # A[3]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
194
+ "mulx 48(%2), %%r12, %%r13 # A[3]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
195
+ "mulx 56(%2), %%r14, %%rdx # A[3]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
196
+ "adcq $0, %%rdx \n\t"
197
+
198
+ "addq %%r8, 88(%0) \n\t"
199
+ "adcq %%rcx, %%r9 \n\t" "movq %%r9, 96(%0) \n\t" " movq $0, %%rcx \n\t"
200
+ "adcq %%r11, %%rax \n\t" "movq %%rax, 104(%0) \n\t"
201
+ "adcq %%r13, %%rbx \n\t" "movq %%rbx, 112(%0) \n\t"
202
+ "adcq %%rdx, %%rcx \n\t" "movq %%rcx, 120(%0) \n\t"
203
+ :
204
+ : "r" (c), "r" (a), "r" (b)
205
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8",
206
+ "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
207
+ );
208
+ #endif
209
+ #else /* Without BMI2 */
210
+ /**
211
+ * TODO: Multiplications using MULQ instruction.
212
+ **/
213
+ #endif
214
+ }
215
+
216
+ /**
217
+ *
218
+ * @param c
219
+ * @param a
220
+ */
221
+ void sqr2_256x256_integer_x64(uint64_t *const c, uint64_t *const a)
222
+ {
223
+ #ifdef __BMI2__
224
+ __asm__ __volatile__(
225
+ "movq (%1), %%rdx # A[0] \n\t"
226
+ "mulx %%rdx, %%r8, %%r9 # A[0]^2 \n\t"
227
+ "movq 8(%1), %%rdx # A[1] \n\t"
228
+ "mulx %%rdx, %%r10, %%r11 # A[1]^2 \n\t"
229
+ "movq %%r8, (%0) \n\t"
230
+ "movq %%r9, 8(%0) \n\t"
231
+ "movq %%r10, 16(%0) \n\t"
232
+ "movq %%r11, 24(%0) \n\t"
233
+
234
+ "movq 16(%1), %%rdx # A[2] \n\t"
235
+ "mulx %%rdx, %%r8, %%r9 # A[2]^2 \n\t"
236
+ "movq 24(%1), %%rdx # A[3] \n\t"
237
+ "mulx %%rdx, %%r10, %%r11 # A[3]^2 \n\t"
238
+ "movq %%r8, 32(%0) \n\t"
239
+ "movq %%r9, 40(%0) \n\t"
240
+ "movq %%r10, 48(%0) \n\t"
241
+ "movq %%r11, 56(%0) \n\t"
242
+
243
+ "movq 8(%1), %%rdx # A[1] \n\t"
244
+ "mulx (%1), %%r8, %%r9 # A[0]*A[1] \n\t"
245
+ "mulx 16(%1), %%r10, %%r11 # A[2]*A[1] \n\t"
246
+ "mulx 24(%1), %%rcx, %%r14 # A[3]*A[1] \n\t"
247
+
248
+ "movq 16(%1), %%rdx # A[2] \n\t"
249
+ "mulx 24(%1), %%r12, %%r13 # A[3]*A[2] \n\t"
250
+ "mulx (%1), %%rax, %%rdx # A[0]*A[2] \n\t"
251
+
252
+ "addq %%rax, %%r9 \n\t"
253
+ "adcq %%rdx, %%r10 \n\t"
254
+ "adcq %%rcx, %%r11 \n\t"
255
+ "adcq %%r14, %%r12 \n\t"
256
+ "adcq $0, %%r13 \n\t"
257
+ "movq $0, %%r14 \n\t"
258
+ "adcq $0, %%r14 \n\t"
259
+
260
+ "movq (%1), %%rdx # A[0] \n\t"
261
+ "mulx 24(%1), %%rax, %%rdx # A[0]*A[3] \n\t"
262
+
263
+ "addq %%rax, %%r10 \n\t"
264
+ "adcq %%rdx, %%r11 \n\t"
265
+ "adcq $0, %%r12 \n\t"
266
+ "adcq $0, %%r13 \n\t"
267
+ "adcq $0, %%r14 \n\t"
268
+
269
+ "shldq $1, %%r13, %%r14 \n\t"
270
+ "shldq $1, %%r12, %%r13 \n\t"
271
+ "shldq $1, %%r11, %%r12 \n\t"
272
+ "shldq $1, %%r10, %%r11 \n\t"
273
+ "shldq $1, %%r9, %%r10 \n\t"
274
+ "shldq $1, %%r8, %%r9 \n\t"
275
+ "shlq $1, %%r8 \n\t"
276
+
277
+ "addq 8(%0), %%r8 \n\t" "movq %%r8, 8(%0) \n\t"
278
+ "adcq 16(%0), %%r9 \n\t" "movq %%r9, 16(%0) \n\t"
279
+ "adcq 24(%0), %%r10 \n\t" "movq %%r10, 24(%0) \n\t"
280
+ "adcq 32(%0), %%r11 \n\t" "movq %%r11, 32(%0) \n\t"
281
+ "adcq 40(%0), %%r12 \n\t" "movq %%r12, 40(%0) \n\t"
282
+ "adcq 48(%0), %%r13 \n\t" "movq %%r13, 48(%0) \n\t"
283
+ "adcq 56(%0), %%r14 \n\t" "movq %%r14, 56(%0) \n\t"
284
+
285
+
286
+ "movq 32(%1), %%rdx # A[0] \n\t"
287
+ "mulx %%rdx, %%r8, %%r9 # A[0]^2 \n\t"
288
+ "movq 40(%1), %%rdx # A[1] \n\t"
289
+ "mulx %%rdx, %%r10, %%r11 # A[1]^2 \n\t"
290
+ "movq %%r8, 64(%0) \n\t"
291
+ "movq %%r9, 72(%0) \n\t"
292
+ "movq %%r10, 80(%0) \n\t"
293
+ "movq %%r11, 88(%0) \n\t"
294
+
295
+ "movq 48(%1), %%rdx # A[2] \n\t"
296
+ "mulx %%rdx, %%r8, %%r9 # A[2]^2 \n\t"
297
+ "movq 56(%1), %%rdx # A[3] \n\t"
298
+ "mulx %%rdx, %%r10, %%r11 # A[3]^2 \n\t"
299
+ "movq %%r8, 96(%0) \n\t"
300
+ "movq %%r9, 104(%0) \n\t"
301
+ "movq %%r10, 112(%0) \n\t"
302
+ "movq %%r11, 120(%0) \n\t"
303
+
304
+ "movq 40(%1), %%rdx # A[1] \n\t"
305
+ "mulx 32(%1), %%r8, %%r9 # A[0]*A[1] \n\t"
306
+ "mulx 48(%1), %%r10, %%r11 # A[2]*A[1] \n\t"
307
+ "mulx 56(%1), %%rcx, %%r14 # A[3]*A[1] \n\t"
308
+
309
+ "movq 48(%1), %%rdx # A[2] \n\t"
310
+ "mulx 56(%1), %%r12, %%r13 # A[3]*A[2] \n\t"
311
+ "mulx 32(%1), %%rax, %%rdx # A[0]*A[2] \n\t"
312
+
313
+ "addq %%rax, %%r9 \n\t"
314
+ "adcq %%rdx, %%r10 \n\t"
315
+ "adcq %%rcx, %%r11 \n\t"
316
+ "adcq %%r14, %%r12 \n\t"
317
+ "adcq $0, %%r13 \n\t"
318
+ "movq $0, %%r14 \n\t"
319
+ "adcq $0, %%r14 \n\t"
320
+
321
+ "movq 32(%1), %%rdx # A[0] \n\t"
322
+ "mulx 56(%1), %%rax, %%rdx # A[0]*A[3] \n\t"
323
+
324
+ "addq %%rax, %%r10 \n\t"
325
+ "adcq %%rdx, %%r11 \n\t"
326
+ "adcq $0, %%r12 \n\t"
327
+ "adcq $0, %%r13 \n\t"
328
+ "adcq $0, %%r14 \n\t"
329
+
330
+ "shldq $1, %%r13, %%r14 \n\t"
331
+ "shldq $1, %%r12, %%r13 \n\t"
332
+ "shldq $1, %%r11, %%r12 \n\t"
333
+ "shldq $1, %%r10, %%r11 \n\t"
334
+ "shldq $1, %%r9, %%r10 \n\t"
335
+ "shldq $1, %%r8, %%r9 \n\t"
336
+ "shlq $1, %%r8 \n\t"
337
+
338
+ "addq 72(%0), %%r8 \n\t" "movq %%r8, 72(%0) \n\t"
339
+ "adcq 80(%0), %%r9 \n\t" "movq %%r9, 80(%0) \n\t"
340
+ "adcq 88(%0), %%r10 \n\t" "movq %%r10, 88(%0) \n\t"
341
+ "adcq 96(%0), %%r11 \n\t" "movq %%r11, 96(%0) \n\t"
342
+ "adcq 104(%0), %%r12 \n\t" "movq %%r12, 104(%0) \n\t"
343
+ "adcq 112(%0), %%r13 \n\t" "movq %%r13, 112(%0) \n\t"
344
+ "adcq 120(%0), %%r14 \n\t" "movq %%r14, 120(%0) \n\t"
345
+ :
346
+ : "r" (c), "r" (a)
347
+ : "cc", "%rax", "%rcx", "%rdx",
348
+ "%r8", "%r9", "%r10", "%r11",
349
+ "%r12", "%r13", "%r14"
350
+ );
351
+ #else /* Without BMI2 */
352
+ /**
353
+ * TODO: Multiplications using MULQ instruction.
354
+ **/
355
+ #endif
356
+ }
357
+
358
+ /**
359
+ *
360
+ * @param c
361
+ * @param a
362
+ */
363
+ void red_EltFp25519_2w_x64(uint64_t *const c, uint64_t *const a)
364
+ {
365
+ #ifdef __BMI2__
366
+ #ifdef __ADX__
367
+ __asm__ __volatile__(
368
+ " movl $38, %%edx # 2*c = 38 = 2^256 \n\t"
369
+ " mulx 32(%1), %%r8, %%r10 # c*C[4] \n\t" " xorl %%ebx, %%ebx \n\t" " adox (%1), %%r8 \n\t"
370
+ " mulx 40(%1), %%r9, %%r11 # c*C[5] \n\t" " adcx %%r10, %%r9 \n\t" " adox 8(%1), %%r9 \n\t"
371
+ " mulx 48(%1), %%r10, %%rax # c*C[6] \n\t" " adcx %%r11, %%r10 \n\t" " adox 16(%1), %%r10 \n\t" " movq %%r10, 16(%0) \n\t"
372
+ " mulx 56(%1), %%r11, %%rcx # c*C[7] \n\t" " adcx %%rax, %%r11 \n\t" " adox 24(%1), %%r11 \n\t" " movq %%r11, 24(%0) \n\t"
373
+ " adcx %%rbx, %%rcx \n\t" " adox %%rbx, %%rcx \n\t"
374
+ " xorl %%ebx, %%ebx \n\t"
375
+ " mulx %%rcx, %%rax, %%rcx \n\t" " adcx %%rax, %%r8 \n\t" " movq %%r8, (%0) \n\t"
376
+ " adcx %%rcx, %%r9 \n\t" " movq %%r9, 8(%0) \n\t"
377
+
378
+ " mulx 96(%1), %%r8, %%r10 # c*C[4] \n\t" " xorl %%ebx, %%ebx \n\t" " adox 64(%1), %%r8 \n\t"
379
+ " mulx 104(%1), %%r9, %%r11 # c*C[5] \n\t" " adcx %%r10, %%r9 \n\t" " adox 72(%1), %%r9 \n\t"
380
+ " mulx 112(%1), %%r10, %%rax # c*C[6] \n\t" " adcx %%r11, %%r10 \n\t" " adox 80(%1), %%r10 \n\t" " movq %%r10, 48(%0) \n\t"
381
+ " mulx 120(%1), %%r11, %%rcx # c*C[7] \n\t" " adcx %%rax, %%r11 \n\t" " adox 88(%1), %%r11 \n\t" " movq %%r11, 56(%0) \n\t"
382
+ " adcx %%rbx, %%rcx \n\t" " adox %%rbx, %%rcx \n\t"
383
+ " xorl %%ebx, %%ebx \n\t"
384
+ " mulx %%rcx, %%rax, %%rcx \n\t" " adcx %%rax, %%r8 \n\t" " movq %%r8, 32(%0) \n\t"
385
+ " adcx %%rcx, %%r9 \n\t" " movq %%r9, 40(%0) \n\t"
386
+ :
387
+ : "r" (c), "r" (a)
388
+ : "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
389
+ );
390
+ #else
391
+ __asm__ __volatile__(
392
+ "movl $38, %%edx # 2*c = 38 = 2^256 \n\t"
393
+ "mulx 32(%1), %%r8, %%r9 # c*C[4] \n\t"
394
+ "mulx 40(%1), %%r10, %%r11 # c*C[5] \n\t" "addq %%r9, %%r10 \n\t"
395
+ "mulx 48(%1), %%r12, %%r13 # c*C[6] \n\t" "adcq %%r11, %%r12 \n\t"
396
+ "mulx 56(%1), %%rax, %%rcx # c*C[7] \n\t" "adcq %%r13, %%rax \n\t"
397
+ "adcq $0, %%rcx \n\t"
398
+
399
+ "addq (%1), %%r8 \n\t"
400
+ "adcq 8(%1), %%r10 \n\t"
401
+ "adcq 16(%1), %%r12 \n\t" "movq %%r12, 16(%0) \n\t"
402
+ "adcq 24(%1), %%rax \n\t" "movq %%rax, 24(%0) \n\t"
403
+ "adcq $0, %%rcx \n\t"
404
+
405
+ "mulx %%rcx, %%rax, %%rcx \n\t"
406
+ "addq %%rax, %%r8 \n\t" "movq %%r8, (%0) \n\t"
407
+ "adcq %%rcx, %%r10 \n\t" "movq %%r10, 8(%0) \n\t"
408
+
409
+ "mulx 96(%1), %%r8, %%r9 # c*C[4] \n\t"
410
+ "mulx 104(%1), %%r10, %%r11 # c*C[5] \n\t" "addq %%r9, %%r10 \n\t"
411
+ "mulx 112(%1), %%r12, %%r13 # c*C[6] \n\t" "adcq %%r11, %%r12 \n\t"
412
+ "mulx 120(%1), %%rax, %%rcx # c*C[7] \n\t" "adcq %%r13, %%rax \n\t"
413
+ "adcq $0, %%rcx \n\t"
414
+
415
+ "addq 64(%1), %%r8 \n\t"
416
+ "adcq 72(%1), %%r10 \n\t"
417
+ "adcq 80(%1), %%r12 \n\t" "movq %%r12, 48(%0) \n\t"
418
+ "adcq 88(%1), %%rax \n\t" "movq %%rax, 56(%0) \n\t"
419
+ "adcq $0, %%rcx \n\t"
420
+
421
+ "mulx %%rcx, %%rax, %%rcx \n\t"
422
+ "addq %%rax, %%r8 \n\t" " movq %%r8, 32(%0) \n\t"
423
+ "adcq %%rcx, %%r10 \n\t" " movq %%r10, 40(%0) \n\t"
424
+
425
+ :
426
+ : "r" (c), "r" (a)
427
+ : "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13"
428
+ );
429
+ #endif
430
+ #else /* Without BMI2 */
431
+ /* [TODO] */
432
+ #endif
433
+ }
434
+
435
/**
 * Computes a single 256x256 -> 512-bit product: c[0..7] = a[0..3] * b[0..3].
 * Limbs are 64-bit, little-endian.  This is the single-element counterpart
 * of mul2_256x256_integer_x64 (same row structure, first half only).
 *
 * @param c Output: 512-bit product
 * @param a Input: 256-bit integer
 * @param b Input: 256-bit integer
 */
void mul_256x256_integer_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b)
{
#ifdef __BMI2__
#ifdef __ADX__
    /* MULX with the dual ADCX/ADOX carry chains; each "xorl %%r10d, %%r10d"
     * clears CF and OF to restart both chains. */
    __asm__ __volatile__(
        " movq (%1), %%rdx # A[0] \n\t"
        " mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t" " xorl %%r10d, %%r10d \n\t" " movq %%r8, (%0) \n\t"
        " mulx 8(%2), %%r10, %%r11 # A[0]*B[1] \n\t" " adox %%r9, %%r10 \n\t" " movq %%r10, 8(%0) \n\t"
        " mulx 16(%2), %%r12, %%r13 # A[0]*B[2] \n\t" " adox %%r11, %%r12 \n\t"
        " mulx 24(%2), %%r14, %%rdx # A[0]*B[3] \n\t" " adox %%r13, %%r14 \n\t" " movq $0, %%rax \n\t"
        " adox %%rdx, %%rax \n\t"

        " movq 8(%1), %%rdx # A[1] \n\t"
        " mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t" " xorl %%r10d, %%r10d \n\t" " adcx 8(%0), %%r8 \n\t" " movq %%r8, 8(%0) \n\t"
        " mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t" " adox %%r9, %%r10 \n\t" " adcx %%r12, %%r10 \n\t" " movq %%r10, 16(%0) \n\t"
        " mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t" " adox %%r11, %%r12 \n\t" " adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
        " mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t" " adox %%r13, %%r14 \n\t" " adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
        " adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"

        " movq 16(%1), %%rdx # A[2] \n\t"
        " mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t" " xorl %%r10d, %%r10d \n\t" " adcx 16(%0), %%r8 \n\t" " movq %%r8, 16(%0) \n\t"
        " mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t" " adox %%r9, %%r10 \n\t" " adcx %%r12, %%r10 \n\t" " movq %%r10, 24(%0) \n\t"
        " mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t" " adox %%r11, %%r12 \n\t" " adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
        " mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t" " adox %%r13, %%r14 \n\t" " adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
        " adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"

        " movq 24(%1), %%rdx # A[3] \n\t"
        " mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t" " xorl %%r10d, %%r10d \n\t" " adcx 24(%0), %%r8 \n\t" " movq %%r8, 24(%0) \n\t"
        " mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t" " adox %%r9, %%r10 \n\t" " adcx %%r12, %%r10 \n\t" " movq %%r10, 32(%0) \n\t"
        " mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t" " adox %%r11, %%r12 \n\t" " adcx %%r14, %%r12 \n\t" " movq %%r12, 40(%0) \n\t" " movq $0, %%r8 \n\t"
        " mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t" " adox %%r13, %%r14 \n\t" " adcx %%rax, %%r14 \n\t" " movq %%r14, 48(%0) \n\t" " movq $0, %%rax \n\t"
        " adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t" " movq %%rax, 56(%0) \n\t"
        :
        : "r" (c), "r" (a), "r" (b)
        : "memory", "cc", "%rax", "%rdx",
          "%r8", "%r9", "%r10", "%r11",
          "%r12", "%r13", "%r14"
    );
#else
    /* BMI2 only: MULX plus a single ADD/ADC carry chain per row. */
    __asm__ __volatile__(
        " movq (%1), %%rdx # A[0] \n\t"
        " mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t" " movq %%r8, (%0) \n\t"
        " mulx 8(%2), %%r10, %%rax # A[0]*B[1] \n\t" " addq %%r10, %%r9 \n\t" " movq %%r9, 8(%0) \n\t"
        " mulx 16(%2), %%r12, %%rbx # A[0]*B[2] \n\t" " adcq %%r12, %%rax \n\t"
        " mulx 24(%2), %%r14, %%rcx # A[0]*B[3] \n\t" " adcq %%r14, %%rbx \n\t"
        " adcq $0, %%rcx \n\t"

        " movq 8(%1), %%rdx # A[1] \n\t"
        " mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t"
        " mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t" " addq %%r10, %%r9 \n\t"
        " mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t" " adcq %%r12, %%r11 \n\t"
        " mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t" " adcq %%r14, %%r13 \n\t"
        " adcq $0, %%rdx \n\t"

        " addq %%r8, 8(%0) \n\t"
        " adcq %%rax, %%r9 \n\t" " movq %%r9, 16(%0) \n\t" " movq $0, %%rax \n\t"
        " adcq %%r11, %%rbx \n\t"
        " adcq %%r13, %%rcx \n\t"
        " adcq %%rdx, %%rax \n\t"

        " movq 16(%1), %%rdx # A[2] \n\t"
        " mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t"
        " mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t" " addq %%r10, %%r9 \n\t"
        " mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t" " adcq %%r12, %%r11 \n\t"
        " mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t" " adcq %%r14, %%r13 \n\t"
        " adcq $0, %%rdx \n\t"

        " addq %%r8, 16(%0) \n\t"
        " adcq %%rbx, %%r9 \n\t" " movq %%r9, 24(%0) \n\t" " movq $0, %%rbx \n\t"
        " adcq %%r11, %%rcx \n\t"
        " adcq %%r13, %%rax \n\t"
        " adcq %%rdx, %%rbx \n\t"

        " movq 24(%1), %%rdx # A[3] \n\t"
        " mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t"
        " mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t" " addq %%r10, %%r9 \n\t"
        " mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t" " adcq %%r12, %%r11 \n\t"
        " mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t" " adcq %%r14, %%r13 \n\t"
        " adcq $0, %%rdx \n\t"

        " addq %%r8, 24(%0) \n\t"
        " adcq %%rcx, %%r9 \n\t" " movq %%r9, 32(%0) \n\t" " movq $0, %%rcx \n\t"
        " adcq %%r11, %%rax \n\t" " movq %%rax, 40(%0) \n\t"
        " adcq %%r13, %%rbx \n\t" " movq %%rbx, 48(%0) \n\t"
        " adcq %%rdx, %%rcx \n\t" " movq %%rcx, 56(%0) \n\t"
        :
        : "r" (c), "r" (a), "r" (b)
        : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8",
          "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
    );
#endif
#else /* Without BMI2 */
    /**
     * TODO: Multiplications using MULQ instruction.
     **/
#endif
}
532
+
533
/**
 * Squares one 256-bit integer: c[0..7] = a[0..3]^2 (little-endian limbs).
 * Diagonal terms A[i]^2 are stored directly; the off-diagonal cross
 * products are accumulated, doubled with a SHLD cascade, and added in.
 *
 * @param c Output: 512-bit square
 * @param a Input: 256-bit integer
 */
void sqr_256x256_integer_x64(uint64_t *const c, uint64_t *const a)
{
#ifdef __BMI2__
    __asm__ __volatile__(
        " movq (%1), %%rdx # A[0] \n\t"
        " mulx %%rdx, %%r8, %%r9 # A[0]^2 \n\t"
        " movq 8(%1), %%rdx # A[1] \n\t"
        " mulx %%rdx, %%r10, %%r11 # A[1]^2 \n\t"
        " movq %%r8, (%0) \n\t"
        " movq %%r9, 8(%0) \n\t"
        " movq %%r10, 16(%0) \n\t"
        " movq %%r11, 24(%0) \n\t"

        " movq 16(%1), %%rdx # A[2] \n\t"
        " mulx %%rdx, %%r8, %%r9 # A[2]^2 \n\t"
        " movq 24(%1), %%rdx # A[3] \n\t"
        " mulx %%rdx, %%r10, %%r11 # A[3]^2 \n\t"
        " movq %%r8, 32(%0) \n\t"
        " movq %%r9, 40(%0) \n\t"
        " movq %%r10, 48(%0) \n\t"
        " movq %%r11, 56(%0) \n\t"

        " movq 8(%1), %%rdx # A[1] \n\t"
        " mulx (%1), %%r8, %%r9 # A[0]*A[1] \n\t"
        " mulx 16(%1), %%r10, %%r11 # A[2]*A[1] \n\t"
        " mulx 24(%1), %%rcx, %%r14 # A[3]*A[1] \n\t"

        " movq 16(%1), %%rdx # A[2] \n\t"
        " mulx 24(%1), %%r12, %%r13 # A[3]*A[2] \n\t"
        " mulx (%1), %%rax, %%rdx # A[0]*A[2] \n\t"

        " addq %%rax, %%r9 \n\t"
        " adcq %%rdx, %%r10 \n\t"
        " adcq %%rcx, %%r11 \n\t"
        " adcq %%r14, %%r12 \n\t"
        " adcq $0, %%r13 \n\t"
        " movq $0, %%r14 \n\t"
        " adcq $0, %%r14 \n\t"

        " movq (%1), %%rdx # A[0] \n\t"
        " mulx 24(%1), %%rax, %%rdx # A[0]*A[3] \n\t"

        " addq %%rax, %%r10 \n\t"
        " adcq %%rdx, %%r11 \n\t"
        " adcq $0, %%r12 \n\t"
        " adcq $0, %%r13 \n\t"
        " adcq $0, %%r14 \n\t"

        /* Double the cross products (each appears twice in the square). */
        " shldq $1, %%r13, %%r14 \n\t"
        " shldq $1, %%r12, %%r13 \n\t"
        " shldq $1, %%r11, %%r12 \n\t"
        " shldq $1, %%r10, %%r11 \n\t"
        " shldq $1, %%r9, %%r10 \n\t"
        " shldq $1, %%r8, %%r9 \n\t"
        " shlq $1, %%r8 \n\t"

        " addq 8(%0), %%r8 \n\t" " movq %%r8, 8(%0) \n\t"
        " adcq 16(%0), %%r9 \n\t" " movq %%r9, 16(%0) \n\t"
        " adcq 24(%0), %%r10 \n\t" " movq %%r10, 24(%0) \n\t"
        " adcq 32(%0), %%r11 \n\t" " movq %%r11, 32(%0) \n\t"
        " adcq 40(%0), %%r12 \n\t" " movq %%r12, 40(%0) \n\t"
        " adcq 48(%0), %%r13 \n\t" " movq %%r13, 48(%0) \n\t"
        " adcq 56(%0), %%r14 \n\t" " movq %%r14, 56(%0) \n\t"
        :
        : "r" (c), "r" (a)
        : "memory", "cc", "%rax", "%rcx", "%rdx",
          "%r8", "%r9", "%r10", "%r11",
          "%r12", "%r13", "%r14"
    );
#else /* Without BMI2 */
    /**
     * TODO: Multiplications using MULQ instruction.
     **/
#endif
}
608
+
609
/**
 * Reduces one 512-bit value modulo p = 2^255 - 19 (lazy reduction):
 * c[0:3] = a[0:7] folded to 256 bits using 2^256 = 38 (mod p).
 * The result is not necessarily canonical; fred_EltFp25519_1w_x64
 * produces the canonical representative.
 *
 * @param c Output: 256-bit partially reduced value
 * @param a Input: 512-bit value (little-endian 64-bit limbs)
 */
void red_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a)
{
#ifdef __BMI2__
#ifdef __ADX__
    __asm__ __volatile__(
        " movl $38, %%edx # 2*c = 38 = 2^256 \n\t"
        " mulx 32(%1), %%r8, %%r10 # c*C[4] \n\t" " xorl %%ebx, %%ebx \n\t" " adox (%1), %%r8 \n\t"
        " mulx 40(%1), %%r9, %%r11 # c*C[5] \n\t" " adcx %%r10, %%r9 \n\t" " adox 8(%1), %%r9 \n\t"
        " mulx 48(%1), %%r10, %%rax # c*C[6] \n\t" " adcx %%r11, %%r10 \n\t" " adox 16(%1), %%r10 \n\t" " movq %%r10, 16(%0) \n\t"
        " mulx 56(%1), %%r11, %%rcx # c*C[7] \n\t" " adcx %%rax, %%r11 \n\t" " adox 24(%1), %%r11 \n\t" " movq %%r11, 24(%0) \n\t"
        " adcx %%rbx, %%rcx \n\t" " adox %%rbx, %%rcx \n\t"
        " xorl %%ebx, %%ebx \n\t"
        " mulx %%rcx, %%rax, %%rcx \n\t" " adcx %%rax, %%r8 \n\t" " movq %%r8, (%0) \n\t"
        " adcx %%rcx, %%r9 \n\t" " movq %%r9, 8(%0) \n\t"
        :
        : "r" (c), "r" (a)
        : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
    );
#else
    __asm__ __volatile__(
        " movl $38, %%edx # 2*c = 38 = 2^256 \n\t"
        " mulx 32(%1), %%r8, %%r9 # c*C[4] \n\t"
        " mulx 40(%1), %%r10, %%r11 # c*C[5] \n\t" " addq %%r9, %%r10 \n\t"
        " mulx 48(%1), %%r12, %%r13 # c*C[6] \n\t" " adcq %%r11, %%r12 \n\t"
        " mulx 56(%1), %%rax, %%rcx # c*C[7] \n\t" " adcq %%r13, %%rax \n\t"
        " adcq $0, %%rcx \n\t"

        " addq (%1), %%r8 \n\t"
        " adcq 8(%1), %%r10 \n\t"
        " adcq 16(%1), %%r12 \n\t" " movq %%r12, 16(%0) \n\t"
        " adcq 24(%1), %%rax \n\t" " movq %%rax, 24(%0) \n\t"
        " adcq $0, %%rcx \n\t"

        " mulx %%rcx, %%rax, %%rcx \n\t"
        " addq %%rax, %%r8 \n\t" " movq %%r8, (%0) \n\t"
        " adcq %%rcx, %%r10 \n\t" " movq %%r10, 8(%0) \n\t"
        :
        : "r" (c), "r" (a)
        : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13"
    );
#endif
#else /* Without BMI2 */
    /**
     * TODO: Multiplications using MULQ instruction.
     **/
#endif
}
656
+
657
/**
 * Field addition: c = a + b folded to 256 bits using 2^256 = 38 (mod p),
 * p = 2^255 - 19.  A carry out of the 256-bit sum is folded back by adding
 * 38 to the low limb (lazy reduction; output is not necessarily canonical).
 *
 * Fixes:
 *  - `#if __ADX__` replaced by `#ifdef __ADX__` for consistency with every
 *    other ADX guard in this file (and to avoid -Wundef noise);
 *  - dropped the bare C99 `inline`, which — absent a non-inline declaration
 *    in the translation unit — emits no external definition and can cause
 *    an unresolved-symbol link failure.
 *
 * @param c Output: 256-bit sum (may alias a or b; asm reads inputs first)
 * @param a Input: 256-bit operand
 * @param b Input: 256-bit operand
 */
void add_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b)
{
#ifdef __ADX__
    __asm__ __volatile__(
        "movq (%2), %%rax \n\t"
        "movq 8(%2), %%rcx \n\t"
        "movq 16(%2), %%r8 \n\t"
        "movq 24(%2), %%r9 \n\t"
        "clc \n\t"
        "adcx (%1), %%rax \n\t"
        "adcx 8(%1), %%rcx \n\t"
        "adcx 16(%1), %%r8 \n\t"
        "adcx 24(%1), %%r9 \n\t"
        "movq %%rcx, 8(%0) \n\t"
        "movq %%r8 , 16(%0) \n\t"
        "movq %%r9 , 24(%0) \n\t"
        /* Fold the final carry: rcx = CF ? 38 : 0, then add to limb 0. */
        "setc %%cl \n\t"
        "neg %%rcx \n\t"
        "andq $38, %%rcx \n\t"
        "addq %%rcx, %%rax \n\t"
        "movq %%rax, (%0) \n\t"
        :
        : "r" (c), "r" (a), "r" (b)
        : "memory","cc", "%rax", "%rcx", "%r8", "%r9"
    );
#else
    __asm__ __volatile__(
        "movq (%2), %%rax \n\t"
        "movq 8(%2), %%rcx \n\t"
        "movq 16(%2), %%r8 \n\t"
        "movq 24(%2), %%r9 \n\t"
        "add (%1), %%rax \n\t"
        "adc 8(%1), %%rcx \n\t"
        "adc 16(%1), %%r8 \n\t"
        "adc 24(%1), %%r9 \n\t"
        "movq %%rcx, 8(%0) \n\t"
        "movq %%r8 , 16(%0) \n\t"
        "movq %%r9 , 24(%0) \n\t"
        /* Fold the final carry: rcx = CF ? 38 : 0, then add to limb 0. */
        "setc %%cl \n\t"
        "neg %%rcx \n\t"
        "andq $38, %%rcx \n\t"
        "addq %%rcx, %%rax \n\t"
        "movq %%rax, (%0) \n\t"
        :
        : "r" (c), "r" (a), "r" (b)
        : "memory","cc", "%rax", "%rcx", "%r8", "%r9"
    );
#endif
}
706
+
707
/**
 * Field subtraction: c = a - b folded to 256 bits using 2^256 = 38 (mod p),
 * p = 2^255 - 19.  A borrow out of the 256-bit difference is folded back by
 * subtracting 38 from the low limb (lazy reduction; output need not be
 * canonical).  NOTE(review): a borrow produced by that final "subq" is not
 * propagated — presumably acceptable for the value ranges used by the
 * Montgomery ladder; confirm against the callers' invariants.
 * Operands are restrict-qualified: c must not alias a or b.
 *
 * @param c Output: 256-bit difference
 * @param a Input: minuend
 * @param b Input: subtrahend
 */
inline void sub_EltFp25519_1w_x64(uint64_t *const __restrict c, uint64_t *const __restrict a,
                                  uint64_t *const __restrict b)
{
    __asm__ __volatile__(
        "movq (%1), %%rax \n\t"
        "movq 8(%1), %%rcx \n\t"
        "movq 16(%1), %%r8 \n\t"
        "movq 24(%1), %%r9 \n\t"
        "subq (%2), %%rax \n\t"
        "sbbq 8(%2), %%rcx \n\t"
        "sbbq 16(%2), %%r8 \n\t"
        "sbbq 24(%2), %%r9 \n\t"
        "movq %%rcx, 8(%0) \n\t"
        "movq %%r8 , 16(%0) \n\t"
        "movq %%r9 , 24(%0) \n\t"
        /* Fold the final borrow: rcx = CF ? 38 : 0, subtract from limb 0. */
        "setc %%cl \n\t"
        "neg %%rcx \n\t"
        "andq $38, %%rcx \n\t"
        "subq %%rcx, %%rax \n\t"
        "movq %%rax, (%0) \n\t"
        :
        : "r" (c), "r" (a), "r" (b)
        : "memory","cc", "%rax", "%rcx", "%r8", "%r9"
    );
}
732
+
733
/**
 * Multiplies a field element by the curve constant a24 = (A+2)/4 = 121666
 * (Curve25519, A = 486662), folding the 64-bit overflow limbs back with
 * 2^256 = 38 (mod p).  Lazy reduction: the result fits 256 bits but is not
 * necessarily canonical.
 *
 * Fixes:
 *  - the clobber list was missing "memory" although the asm both reads (%1)
 *    and writes (%0) with read-modify-write addq/adcq — without it the
 *    compiler may cache or reorder accesses to c across the asm;
 *  - dropped the bare C99 `inline`, which — absent a non-inline declaration
 *    in the translation unit — emits no external definition (link hazard).
 *
 * @param c Output: 256-bit product a * 121666 (partially reduced)
 * @param a Input: 256-bit field element
 */
void mul_a24_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a)
{
#ifdef __BMI2__
    /**
     * a24 = (A+2)/4 = (486662+2)/4 = 121666
     **/
    const uint64_t a24 = 121666;
    __asm__ __volatile__(
        "movq %2, %%rdx \n\t"
        "mulx (%1), %%rax, %%r8 \n\t"
        "mulx 8(%1), %%rcx, %%r9 \n\t"
        "movq %%rax, (%0) \n\t"
        "movq %%rcx, 8(%0) \n\t"
        "mulx 16(%1), %%rax, %%r10 \n\t"
        "mulx 24(%1), %%rcx, %%r11 \n\t"
        "movq %%rax, 16(%0) \n\t"
        "movq %%rcx, 24(%0) \n\t"
        /* Fold the top overflow limb (r11) as r11*38 into limb 0, then
         * propagate the remaining high parts r8..r10 up the carry chain. */
        "movq $38, %%rdx \n\t"
        "mulx %%r11, %%rax, %%rcx \n\t"
        "addq %%rax, (%0) \n\t"
        "adcq %%r8, 8(%0) \n\t"
        "adcq %%r9, 16(%0) \n\t"
        "adcq %%r10, 24(%0) \n\t"
        :
        : "r" (c), "r" (a), "r" (a24)
        : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
    );
#else /* Without BMI2 */
    /**
     * TODO: Multiplications using MULQ instruction.
     **/
#endif
}
766
+
767
/**
 * Computes the multiplicative inverse pC = pA^(-1) mod p, p = 2^255 - 19,
 * via Fermat's little theorem: pA^(p-2).  The exponent p-2 = 2^255 - 21 is
 * evaluated with the standard Curve25519 addition chain (square counts
 * 1,2,1,5,10,20,10,50,100,50,5 interleaved with multiplications).
 *
 * @param pC Output: inverse of pA (partially reduced field element)
 * @param pA Input: field element to invert (must be nonzero mod p)
 */
void inv_EltFp25519_1w_x64(uint64_t *const pC, uint64_t *const pA)
{
/* Repeated in-place squaring helper; uses the enclosing `counter`. */
#define sqrn_EltFp25519_1w_x64(a,times)\
    counter = times;\
    while(counter-- > 0)\
    {\
        sqr_EltFp25519_1w_x64(a);\
    }

    /* NOTE(review): buffer_1w looks unused here, but the mul/sqr names above
     * are presumably macros that expand to references to it (scratch space
     * for the 512-bit intermediate) — confirm in fp25519_x64.h before
     * removing. */
    EltFp25519_1w_Buffer_x64 buffer_1w;
    EltFp25519_1w_x64 x0, x1, x2;
    uint64_t * T[5];
    uint64_t counter;

    /* Working set: T[1] accumulates the result in pC; T[4] is the input. */
    T[0] = x0;
    T[1] = pC; /* x^(-1) */
    T[2] = x1;
    T[3] = x2;
    T[4] = pA; /* x */

    copy_EltFp25519_1w_x64(T[1],pA);
    sqrn_EltFp25519_1w_x64(T[1],1);          /* x^2 */
    copy_EltFp25519_1w_x64(T[2],T[1]);
    sqrn_EltFp25519_1w_x64(T[2],2);          /* x^8 */
    mul_EltFp25519_1w_x64(T[0], pA, T[2]);   /* x^9 */
    mul_EltFp25519_1w_x64(T[1], T[1], T[0]); /* x^11 */
    copy_EltFp25519_1w_x64(T[2],T[1]);
    sqrn_EltFp25519_1w_x64(T[2],1);          /* x^22 */
    mul_EltFp25519_1w_x64(T[0], T[0], T[2]); /* x^(2^5 - 1) */
    copy_EltFp25519_1w_x64(T[2],T[0]);
    sqrn_EltFp25519_1w_x64(T[2],5);
    mul_EltFp25519_1w_x64(T[0], T[0], T[2]); /* x^(2^10 - 1) */
    copy_EltFp25519_1w_x64(T[2],T[0]);
    sqrn_EltFp25519_1w_x64(T[2],10);
    mul_EltFp25519_1w_x64(T[2], T[2], T[0]); /* x^(2^20 - 1) */
    copy_EltFp25519_1w_x64(T[3],T[2]);
    sqrn_EltFp25519_1w_x64(T[3],20);
    mul_EltFp25519_1w_x64(T[3], T[3], T[2]); /* x^(2^40 - 1) */
    sqrn_EltFp25519_1w_x64(T[3],10);
    mul_EltFp25519_1w_x64(T[3], T[3], T[0]); /* x^(2^50 - 1) */
    copy_EltFp25519_1w_x64(T[0],T[3]);
    sqrn_EltFp25519_1w_x64(T[0],50);
    mul_EltFp25519_1w_x64(T[0], T[0], T[3]); /* x^(2^100 - 1) */
    copy_EltFp25519_1w_x64(T[2],T[0]);
    sqrn_EltFp25519_1w_x64(T[2],100);
    mul_EltFp25519_1w_x64(T[2], T[2], T[0]); /* x^(2^200 - 1) */
    sqrn_EltFp25519_1w_x64(T[2],50);
    mul_EltFp25519_1w_x64(T[2], T[2], T[3]); /* x^(2^250 - 1) */
    sqrn_EltFp25519_1w_x64(T[2],5);
    mul_EltFp25519_1w_x64(T[1], T[1], T[2]); /* x^(2^255 - 21) = x^(p-2) */
#undef sqrn_EltFp25519_1w_x64
}
819
+
820
/**
 * Final reduction modulo p = 2^255 - 19: on exit, c holds the canonical
 * representative in [0, p).  Constant time (no secret-dependent branches).
 *
 * Fixes over the previous version:
 *  - the carry out of `c[0] += 19` was silently dropped, producing a wrong
 *    result whenever c[0] wrapped (e.g. input 2^256 - 1);
 *  - inputs in [p, 2^255) were returned unreduced (non-canonical), which
 *    breaks byte-wise equality comparisons of field elements;
 *  - `((int64_t*)c)[3]` read violated strict aliasing; c[3] is used directly.
 *  - dropped the bare C99 `inline` (no external definition without a
 *    non-inline declaration in the TU — link hazard).
 *
 * @param c In/out: 256-bit value, reduced in place to c mod p
 */
void fred_EltFp25519_1w_x64(uint64_t *const c)
{
    int i;
    uint64_t t[4];
    /* Step 1: fold bit 255 using 2^255 = 19 (mod p), with carry
     * propagation.  Afterwards c < 2^255 + 19. */
    uint64_t bit = c[3] >> 63;
    uint64_t carry = 19 & (0 - bit);
    c[3] &= ((uint64_t)1 << 63) - 1;
    for (i = 0; i < 4; i++) {
        uint64_t s = c[i] + carry;
        carry = (s < carry); /* carry out of this limb */
        c[i] = s;
    }
    /* Step 2: conditionally subtract p.  Compute t = c + 19; bit 255 of t
     * is set iff c >= p, and in that case (t mod 2^255) == c - p. */
    carry = 19;
    for (i = 0; i < 4; i++) {
        t[i] = c[i] + carry;
        carry = (t[i] < carry);
    }
    bit = t[3] >> 63;                     /* 1 iff c >= p */
    t[3] &= ((uint64_t)1 << 63) - 1;
    {
        uint64_t mask = 0 - bit;          /* all-ones iff c >= p */
        for (i = 0; i < 4; i++)
            c[i] = (c[i] & ~mask) | (t[i] & mask);
    }
}
826
+