x25519 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,25 @@
+ /**
+ * Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>.
+ * Institute of Computing.
+ * University of Campinas, Brazil.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, version 3.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+ #ifndef BYTES_H
+ #define BYTES_H
+
+ #include <stdint.h>
+ void print_bytes(uint8_t * A, int num_bytes);
+ int compare_bytes(uint8_t* A, uint8_t* B,unsigned int num_bytes);
+
+ #endif /* BYTES_H */
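
The header above (evidently bytes.h, going by its include guard) only declares the two debugging helpers; their definitions ship elsewhere in the package. For readers following along, a minimal sketch of plausible implementations; the most-significant-first print order and the 1-means-equal return convention are assumptions, not taken from the package:

    #include <stdio.h>
    #include <string.h>
    #include "bytes.h"

    /* Editorial sketch: print num_bytes bytes as one hex number,
     * most-significant byte first. */
    void print_bytes(uint8_t * A, int num_bytes)
    {
        int i;
        printf("0x");
        for (i = num_bytes - 1; i >= 0; i--)
            printf("%02x", A[i]);
        printf("\n");
    }

    /* Editorial sketch: assumed to return 1 when equal, 0 otherwise. */
    int compare_bytes(uint8_t* A, uint8_t* B, unsigned int num_bytes)
    {
        return memcmp(A, B, num_bytes) == 0;
    }
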
@@ -0,0 +1,826 @@
+ /**
+ * Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>.
+ * Institute of Computing.
+ * University of Campinas, Brazil.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation, version 3.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+ #include "random.h"
+ #include "bytes.h"
+ #include "fp25519_x64.h"
+
+ void random_EltFp25519_1w_x64(uint64_t *A)
+ {
+ random_bytes((uint8_t*)A,SIZE_ELEMENT_BYTES);
+ A[3] &= ((uint64_t)1<<63)-1;
+ }
+
+ int compare_EltFp25519_1w_x64(uint64_t *A, uint64_t *B)
+ {
+ return compare_bytes((uint8_t*)A,(uint8_t*)B,SIZE_ELEMENT_BYTES);
+ }
+
+ void print_EltFp25519_1w_x64(uint64_t *A)
+ {
+ print_bytes((uint8_t*)A,SIZE_ELEMENT_BYTES);
+ }
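
Note on random_EltFp25519_1w_x64 above: the mask ((uint64_t)1<<63)-1 clears bit 63 of the top limb, i.e. bit 255 of the 256-bit value, so sampled elements always lie in [0, 2^255) rather than [0, 2^256).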
+
+ /**
+ *
+ * @param c Two 512-bit products: c[0:7]=a[0:3]*b[0:3] and c[8:15]=a[4:7]*b[4:7]
+ * @param a Two 256-bit integers: a[0:3] and a[4:7]
+ * @param b Two 256-bit integers: b[0:3] and b[4:7]
+ */
+ void mul2_256x256_integer_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b)
+ {
+ #ifdef __BMI2__
+ #ifdef __ADX__
+ __asm__ __volatile__(
+ "movq (%1), %%rdx # A[0] \n\t"
+ "mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "movq %%r8, (%0) \n\t"
+ "mulx 8(%2), %%r10, %%r11 # A[0]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "movq %%r10, 8(%0) \n\t"
+ "mulx 16(%2), %%r12, %%r13 # A[0]*B[2] \n\t" "adox %%r11, %%r12 \n\t"
+ "mulx 24(%2), %%r14, %%rdx # A[0]*B[3] \n\t" "adox %%r13, %%r14 \n\t" " movq $0, %%rax \n\t"
+ "adox %%rdx, %%rax \n\t"
+
+ "movq 8(%1), %%rdx # A[1] \n\t"
+ "mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 8(%0), %%r8 \n\t" "movq %%r8, 8(%0) \n\t"
+ "mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 16(%0) \n\t"
+ "mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
+ "mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
+ "adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
+
+ "movq 16(%1), %%rdx # A[2] \n\t"
+ "mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 16(%0), %%r8 \n\t" "movq %%r8, 16(%0) \n\t"
+ "mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 24(%0) \n\t"
+ "mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
+ "mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
+ "adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
+
+ "movq 24(%1), %%rdx # A[3] \n\t"
+ "mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 24(%0), %%r8 \n\t" "movq %%r8, 24(%0) \n\t"
+ "mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 32(%0) \n\t"
+ "mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" "movq %%r12, 40(%0) \n\t" " movq $0, %%r8 \n\t"
+ "mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" "movq %%r14, 48(%0) \n\t" " movq $0, %%rax \n\t"
+ "adox %%rdx, %%rax \n\t" "adcx %%r8, %%rax \n\t" " movq %%rax, 56(%0) \n\t"
+
+ "movq 32(%1), %%rdx # A[4] \n\t"
+ "mulx 32(%2), %%r8, %%r9 # A[4]*B[4] \n\t" "xorl %%r10d, %%r10d \n\t" "movq %%r8, 64(%0) \n\t"
+ "mulx 40(%2), %%r10, %%r11 # A[4]*B[5] \n\t" "adox %%r9, %%r10 \n\t" "movq %%r10, 72(%0) \n\t"
+ "mulx 48(%2), %%r12, %%r13 # A[4]*B[6] \n\t" "adox %%r11, %%r12 \n\t"
+ "mulx 56(%2), %%r14, %%rdx # A[4]*B[7] \n\t" "adox %%r13, %%r14 \n\t" " movq $0, %%rax \n\t"
+ "adox %%rdx, %%rax \n\t"
+
+ "movq 40(%1), %%rdx # A[5] \n\t"
+ "mulx 32(%2), %%r8, %%r9 # A[5]*B[4] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 72(%0), %%r8 \n\t" "movq %%r8, 72(%0) \n\t"
+ "mulx 40(%2), %%r10, %%r11 # A[5]*B[5] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 80(%0) \n\t"
+ "mulx 48(%2), %%r12, %%r13 # A[5]*B[6] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
+ "mulx 56(%2), %%r14, %%rdx # A[5]*B[7] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
+ "adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
+
+ "movq 48(%1), %%rdx # A[6] \n\t"
+ "mulx 32(%2), %%r8, %%r9 # A[6]*B[4] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 80(%0), %%r8 \n\t" "movq %%r8, 80(%0) \n\t"
+ "mulx 40(%2), %%r10, %%r11 # A[6]*B[5] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 88(%0) \n\t"
+ "mulx 48(%2), %%r12, %%r13 # A[6]*B[6] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
+ "mulx 56(%2), %%r14, %%rdx # A[6]*B[7] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
+ "adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
+
+ "movq 56(%1), %%rdx # A[7] \n\t"
+ "mulx 32(%2), %%r8, %%r9 # A[7]*B[4] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 88(%0), %%r8 \n\t" "movq %%r8, 88(%0) \n\t"
+ "mulx 40(%2), %%r10, %%r11 # A[7]*B[5] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 96(%0) \n\t"
+ "mulx 48(%2), %%r12, %%r13 # A[7]*B[6] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" "movq %%r12, 104(%0) \n\t" " movq $0, %%r8 \n\t"
+ "mulx 56(%2), %%r14, %%rdx # A[7]*B[7] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" "movq %%r14, 112(%0) \n\t" " movq $0, %%rax \n\t"
+ "adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t" " movq %%rax, 120(%0) \n\t"
+ :
+ : "r" (c), "r" (a), "r" (b)
+ : "memory", "cc", "%rax", "%rdx",
+ "%r8", "%r9", "%r10", "%r11",
+ "%r12", "%r13", "%r14"
+ );
+ #else
+ __asm__ __volatile__(
+ "movq (%1), %%rdx # A[0] \n\t"
+ "mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t" "movq %%r8, (%0) \n\t"
+ "mulx 8(%2), %%r10, %%rax # A[0]*B[1] \n\t" "addq %%r10, %%r9 \n\t" "movq %%r9, 8(%0) \n\t"
+ "mulx 16(%2), %%r12, %%rbx # A[0]*B[2] \n\t" "adcq %%r12, %%rax \n\t"
+ "mulx 24(%2), %%r14, %%rcx # A[0]*B[3] \n\t" "adcq %%r14, %%rbx \n\t"
+ "adcq $0, %%rcx \n\t"
+
+ "movq 8(%1), %%rdx # A[1] \n\t"
+ "mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t"
+ "mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
+ "mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
+ "mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
+ "adcq $0, %%rdx \n\t"
+
+ "addq %%r8, 8(%0) \n\t"
+ "adcq %%rax, %%r9 \n\t" "movq %%r9, 16(%0) \n\t" "movq $0, %%rax \n\t"
+ "adcq %%r11, %%rbx \n\t"
+ "adcq %%r13, %%rcx \n\t"
+ "adcq %%rdx, %%rax \n\t"
+
+ "movq 16(%1), %%rdx # A[2] \n\t"
+ "mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t"
+ "mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
+ "mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
+ "mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
+ "adcq $0, %%rdx \n\t"
+
+ "addq %%r8, 16(%0) \n\t"
+ "adcq %%rbx, %%r9 \n\t" "movq %%r9, 24(%0) \n\t" "movq $0, %%rbx \n\t"
+ "adcq %%r11, %%rcx \n\t"
+ "adcq %%r13, %%rax \n\t"
+ "adcq %%rdx, %%rbx \n\t"
+
+ "movq 24(%1), %%rdx # A[3] \n\t"
+ "mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t"
+ "mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
+ "mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
+ "mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
+ "adcq $0, %%rdx \n\t"
+
+ "addq %%r8, 24(%0) \n\t"
+ "adcq %%rcx, %%r9 \n\t" "movq %%r9, 32(%0) \n\t" " movq $0, %%rcx \n\t"
+ "adcq %%r11, %%rax \n\t" "movq %%rax, 40(%0) \n\t"
+ "adcq %%r13, %%rbx \n\t" "movq %%rbx, 48(%0) \n\t"
+ "adcq %%rdx, %%rcx \n\t" "movq %%rcx, 56(%0) \n\t"
+
+ "movq 32(%1), %%rdx # A[4] \n\t"
+ "mulx 32(%2), %%r8, %%r9 # A[4]*B[4] \n\t" "movq %%r8, 64(%0) \n\t"
+ "mulx 40(%2), %%r10, %%rax # A[4]*B[5] \n\t" "addq %%r10, %%r9 \n\t" "movq %%r9, 72(%0) \n\t"
+ "mulx 48(%2), %%r12, %%rbx # A[4]*B[6] \n\t" "adcq %%r12, %%rax \n\t"
+ "mulx 56(%2), %%r14, %%rcx # A[4]*B[7] \n\t" "adcq %%r14, %%rbx \n\t"
+ "adcq $0, %%rcx \n\t"
+
+ "movq 40(%1), %%rdx # A[5] \n\t"
+ "mulx 32(%2), %%r8, %%r9 # A[5]*B[4] \n\t"
+ "mulx 40(%2), %%r10, %%r11 # A[5]*B[5] \n\t" "addq %%r10, %%r9 \n\t"
+ "mulx 48(%2), %%r12, %%r13 # A[5]*B[6] \n\t" "adcq %%r12, %%r11 \n\t"
+ "mulx 56(%2), %%r14, %%rdx # A[5]*B[7] \n\t" "adcq %%r14, %%r13 \n\t"
+ "adcq $0, %%rdx \n\t"
+
+ "addq %%r8, 72(%0) \n\t"
+ "adcq %%rax, %%r9 \n\t" " movq %%r9, 80(%0) \n\t" " movq $0, %%rax \n\t"
+ "adcq %%r11, %%rbx \n\t"
+ "adcq %%r13, %%rcx \n\t"
+ "adcq %%rdx, %%rax \n\t"
+
+ "movq 48(%1), %%rdx # A[6] \n\t"
+ "mulx 32(%2), %%r8, %%r9 # A[6]*B[4] \n\t"
+ "mulx 40(%2), %%r10, %%r11 # A[6]*B[5] \n\t" "addq %%r10, %%r9 \n\t"
+ "mulx 48(%2), %%r12, %%r13 # A[6]*B[6] \n\t" "adcq %%r12, %%r11 \n\t"
+ "mulx 56(%2), %%r14, %%rdx # A[6]*B[7] \n\t" "adcq %%r14, %%r13 \n\t"
+ "adcq $0, %%rdx \n\t"
+
+ "addq %%r8, 80(%0) \n\t"
+ "adcq %%rbx, %%r9 \n\t" " movq %%r9, 88(%0) \n\t" " movq $0, %%rbx \n\t"
+ "adcq %%r11, %%rcx \n\t"
+ "adcq %%r13, %%rax \n\t"
+ "adcq %%rdx, %%rbx \n\t"
+
+ "movq 56(%1), %%rdx # A[7] \n\t"
+ "mulx 32(%2), %%r8, %%r9 # A[7]*B[4] \n\t"
+ "mulx 40(%2), %%r10, %%r11 # A[7]*B[5] \n\t" "addq %%r10, %%r9 \n\t"
+ "mulx 48(%2), %%r12, %%r13 # A[7]*B[6] \n\t" "adcq %%r12, %%r11 \n\t"
+ "mulx 56(%2), %%r14, %%rdx # A[7]*B[7] \n\t" "adcq %%r14, %%r13 \n\t"
+ "adcq $0, %%rdx \n\t"
+
+ "addq %%r8, 88(%0) \n\t"
+ "adcq %%rcx, %%r9 \n\t" "movq %%r9, 96(%0) \n\t" " movq $0, %%rcx \n\t"
+ "adcq %%r11, %%rax \n\t" "movq %%rax, 104(%0) \n\t"
+ "adcq %%r13, %%rbx \n\t" "movq %%rbx, 112(%0) \n\t"
+ "adcq %%rdx, %%rcx \n\t" "movq %%rcx, 120(%0) \n\t"
+ :
+ : "r" (c), "r" (a), "r" (b)
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8",
+ "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
+ );
+ #endif
+ #else /* Without BMI2 */
+ /**
+ * TODO: Multiplications using MULQ instruction.
+ **/
+ #endif
+ }
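
Each 4x4 block above is a schoolbook 256x256-bit product; the MULX/ADOX/ADCX form merely runs two independent carry chains (CF and OF) so adjacent rows can overlap. The same computation in portable C, as an editorial sketch relying on the GCC/Clang 128-bit integer extension (the helper name is ours, not the package's):

    #include <stdint.h>

    typedef unsigned __int128 uint128_t;

    /* Sketch: c[0:7] = a[0:3] * b[0:3], schoolbook method. */
    static void mul_256x256_portable(uint64_t *c, const uint64_t *a, const uint64_t *b)
    {
        int i, j;
        for (i = 0; i < 8; i++) c[i] = 0;
        for (i = 0; i < 4; i++) {
            uint64_t carry = 0;
            for (j = 0; j < 4; j++) {
                /* 64x64->128 product plus accumulator plus carry
                 * cannot overflow 128 bits. */
                uint128_t t = (uint128_t)a[i] * b[j] + c[i + j] + carry;
                c[i + j] = (uint64_t)t;
                carry = (uint64_t)(t >> 64);
            }
            c[i + 4] = carry;
        }
    }

mul2_256x256_integer_x64 performs this twice in one call: once on a[0:3], b[0:3] into c[0:7], and once on a[4:7], b[4:7] into c[8:15].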
+
+ /**
+ *
+ * @param c Two 512-bit squares: c[0:7]=a[0:3]^2 and c[8:15]=a[4:7]^2
+ * @param a Two 256-bit integers: a[0:3] and a[4:7]
+ */
+ void sqr2_256x256_integer_x64(uint64_t *const c, uint64_t *const a)
+ {
+ #ifdef __BMI2__
+ __asm__ __volatile__(
+ "movq (%1), %%rdx # A[0] \n\t"
+ "mulx %%rdx, %%r8, %%r9 # A[0]^2 \n\t"
+ "movq 8(%1), %%rdx # A[1] \n\t"
+ "mulx %%rdx, %%r10, %%r11 # A[1]^2 \n\t"
+ "movq %%r8, (%0) \n\t"
+ "movq %%r9, 8(%0) \n\t"
+ "movq %%r10, 16(%0) \n\t"
+ "movq %%r11, 24(%0) \n\t"
+
+ "movq 16(%1), %%rdx # A[2] \n\t"
+ "mulx %%rdx, %%r8, %%r9 # A[2]^2 \n\t"
+ "movq 24(%1), %%rdx # A[3] \n\t"
+ "mulx %%rdx, %%r10, %%r11 # A[3]^2 \n\t"
+ "movq %%r8, 32(%0) \n\t"
+ "movq %%r9, 40(%0) \n\t"
+ "movq %%r10, 48(%0) \n\t"
+ "movq %%r11, 56(%0) \n\t"
+
+ "movq 8(%1), %%rdx # A[1] \n\t"
+ "mulx (%1), %%r8, %%r9 # A[0]*A[1] \n\t"
+ "mulx 16(%1), %%r10, %%r11 # A[2]*A[1] \n\t"
+ "mulx 24(%1), %%rcx, %%r14 # A[3]*A[1] \n\t"
+
+ "movq 16(%1), %%rdx # A[2] \n\t"
+ "mulx 24(%1), %%r12, %%r13 # A[3]*A[2] \n\t"
+ "mulx (%1), %%rax, %%rdx # A[0]*A[2] \n\t"
+
+ "addq %%rax, %%r9 \n\t"
+ "adcq %%rdx, %%r10 \n\t"
+ "adcq %%rcx, %%r11 \n\t"
+ "adcq %%r14, %%r12 \n\t"
+ "adcq $0, %%r13 \n\t"
+ "movq $0, %%r14 \n\t"
+ "adcq $0, %%r14 \n\t"
+
+ "movq (%1), %%rdx # A[0] \n\t"
+ "mulx 24(%1), %%rax, %%rdx # A[0]*A[3] \n\t"
+
+ "addq %%rax, %%r10 \n\t"
+ "adcq %%rdx, %%r11 \n\t"
+ "adcq $0, %%r12 \n\t"
+ "adcq $0, %%r13 \n\t"
+ "adcq $0, %%r14 \n\t"
+
+ "shldq $1, %%r13, %%r14 \n\t"
+ "shldq $1, %%r12, %%r13 \n\t"
+ "shldq $1, %%r11, %%r12 \n\t"
+ "shldq $1, %%r10, %%r11 \n\t"
+ "shldq $1, %%r9, %%r10 \n\t"
+ "shldq $1, %%r8, %%r9 \n\t"
+ "shlq $1, %%r8 \n\t"
+
+ "addq 8(%0), %%r8 \n\t" "movq %%r8, 8(%0) \n\t"
+ "adcq 16(%0), %%r9 \n\t" "movq %%r9, 16(%0) \n\t"
+ "adcq 24(%0), %%r10 \n\t" "movq %%r10, 24(%0) \n\t"
+ "adcq 32(%0), %%r11 \n\t" "movq %%r11, 32(%0) \n\t"
+ "adcq 40(%0), %%r12 \n\t" "movq %%r12, 40(%0) \n\t"
+ "adcq 48(%0), %%r13 \n\t" "movq %%r13, 48(%0) \n\t"
+ "adcq 56(%0), %%r14 \n\t" "movq %%r14, 56(%0) \n\t"
+
+
+ "movq 32(%1), %%rdx # A[4] \n\t"
+ "mulx %%rdx, %%r8, %%r9 # A[4]^2 \n\t"
+ "movq 40(%1), %%rdx # A[5] \n\t"
+ "mulx %%rdx, %%r10, %%r11 # A[5]^2 \n\t"
+ "movq %%r8, 64(%0) \n\t"
+ "movq %%r9, 72(%0) \n\t"
+ "movq %%r10, 80(%0) \n\t"
+ "movq %%r11, 88(%0) \n\t"
+
+ "movq 48(%1), %%rdx # A[6] \n\t"
+ "mulx %%rdx, %%r8, %%r9 # A[6]^2 \n\t"
+ "movq 56(%1), %%rdx # A[7] \n\t"
+ "mulx %%rdx, %%r10, %%r11 # A[7]^2 \n\t"
+ "movq %%r8, 96(%0) \n\t"
+ "movq %%r9, 104(%0) \n\t"
+ "movq %%r10, 112(%0) \n\t"
+ "movq %%r11, 120(%0) \n\t"
+
+ "movq 40(%1), %%rdx # A[5] \n\t"
+ "mulx 32(%1), %%r8, %%r9 # A[4]*A[5] \n\t"
+ "mulx 48(%1), %%r10, %%r11 # A[6]*A[5] \n\t"
+ "mulx 56(%1), %%rcx, %%r14 # A[7]*A[5] \n\t"
+
+ "movq 48(%1), %%rdx # A[6] \n\t"
+ "mulx 56(%1), %%r12, %%r13 # A[7]*A[6] \n\t"
+ "mulx 32(%1), %%rax, %%rdx # A[4]*A[6] \n\t"
+
+ "addq %%rax, %%r9 \n\t"
+ "adcq %%rdx, %%r10 \n\t"
+ "adcq %%rcx, %%r11 \n\t"
+ "adcq %%r14, %%r12 \n\t"
+ "adcq $0, %%r13 \n\t"
+ "movq $0, %%r14 \n\t"
+ "adcq $0, %%r14 \n\t"
+
+ "movq 32(%1), %%rdx # A[4] \n\t"
+ "mulx 56(%1), %%rax, %%rdx # A[4]*A[7] \n\t"
+
+ "addq %%rax, %%r10 \n\t"
+ "adcq %%rdx, %%r11 \n\t"
+ "adcq $0, %%r12 \n\t"
+ "adcq $0, %%r13 \n\t"
+ "adcq $0, %%r14 \n\t"
+
+ "shldq $1, %%r13, %%r14 \n\t"
+ "shldq $1, %%r12, %%r13 \n\t"
+ "shldq $1, %%r11, %%r12 \n\t"
+ "shldq $1, %%r10, %%r11 \n\t"
+ "shldq $1, %%r9, %%r10 \n\t"
+ "shldq $1, %%r8, %%r9 \n\t"
+ "shlq $1, %%r8 \n\t"
+
+ "addq 72(%0), %%r8 \n\t" "movq %%r8, 72(%0) \n\t"
+ "adcq 80(%0), %%r9 \n\t" "movq %%r9, 80(%0) \n\t"
+ "adcq 88(%0), %%r10 \n\t" "movq %%r10, 88(%0) \n\t"
+ "adcq 96(%0), %%r11 \n\t" "movq %%r11, 96(%0) \n\t"
+ "adcq 104(%0), %%r12 \n\t" "movq %%r12, 104(%0) \n\t"
+ "adcq 112(%0), %%r13 \n\t" "movq %%r13, 112(%0) \n\t"
+ "adcq 120(%0), %%r14 \n\t" "movq %%r14, 120(%0) \n\t"
+ :
+ : "r" (c), "r" (a)
+ : "memory", "cc", "%rax", "%rcx", "%rdx",
+ "%r8", "%r9", "%r10", "%r11",
+ "%r12", "%r13", "%r14"
+ );
+ #else /* Without BMI2 */
+ /**
+ * TODO: Multiplications using MULQ instruction.
+ **/
+ #endif
+ }
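
The squaring code rests on the identity (a0 + a1*2^64 + a2*2^128 + a3*2^192)^2 = sum_i ai^2 * 2^(128*i) + 2 * sum_{i<j} ai*aj * 2^(64*(i+j)): the four MULX self-products fill the even word positions, the six cross products ai*aj (i<j) are accumulated once, doubled in a single pass by the SHLDQ cascade, and finally added in at a one-word offset. This saves six of the sixteen multiplications a generic 4x4 product would need.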
+
+ /**
+ *
+ * @param c Two reduced 256-bit results: c[0:3] and c[4:7]
+ * @param a Two 512-bit integers: a[0:7] and a[8:15]
+ */
+ void red_EltFp25519_2w_x64(uint64_t *const c, uint64_t *const a)
+ {
+ #ifdef __BMI2__
+ #ifdef __ADX__
+ __asm__ __volatile__(
+ " movl $38, %%edx # 38 = 2^256 mod (2^255-19) \n\t"
+ " mulx 32(%1), %%r8, %%r10 # c*C[4] \n\t" " xorl %%ebx, %%ebx \n\t" " adox (%1), %%r8 \n\t"
+ " mulx 40(%1), %%r9, %%r11 # c*C[5] \n\t" " adcx %%r10, %%r9 \n\t" " adox 8(%1), %%r9 \n\t"
+ " mulx 48(%1), %%r10, %%rax # c*C[6] \n\t" " adcx %%r11, %%r10 \n\t" " adox 16(%1), %%r10 \n\t" " movq %%r10, 16(%0) \n\t"
+ " mulx 56(%1), %%r11, %%rcx # c*C[7] \n\t" " adcx %%rax, %%r11 \n\t" " adox 24(%1), %%r11 \n\t" " movq %%r11, 24(%0) \n\t"
+ " adcx %%rbx, %%rcx \n\t" " adox %%rbx, %%rcx \n\t"
+ " xorl %%ebx, %%ebx \n\t"
+ " mulx %%rcx, %%rax, %%rcx \n\t" " adcx %%rax, %%r8 \n\t" " movq %%r8, (%0) \n\t"
+ " adcx %%rcx, %%r9 \n\t" " movq %%r9, 8(%0) \n\t"
+
+ " mulx 96(%1), %%r8, %%r10 # c*C[12] \n\t" " xorl %%ebx, %%ebx \n\t" " adox 64(%1), %%r8 \n\t"
+ " mulx 104(%1), %%r9, %%r11 # c*C[13] \n\t" " adcx %%r10, %%r9 \n\t" " adox 72(%1), %%r9 \n\t"
+ " mulx 112(%1), %%r10, %%rax # c*C[14] \n\t" " adcx %%r11, %%r10 \n\t" " adox 80(%1), %%r10 \n\t" " movq %%r10, 48(%0) \n\t"
+ " mulx 120(%1), %%r11, %%rcx # c*C[15] \n\t" " adcx %%rax, %%r11 \n\t" " adox 88(%1), %%r11 \n\t" " movq %%r11, 56(%0) \n\t"
+ " adcx %%rbx, %%rcx \n\t" " adox %%rbx, %%rcx \n\t"
+ " xorl %%ebx, %%ebx \n\t"
+ " mulx %%rcx, %%rax, %%rcx \n\t" " adcx %%rax, %%r8 \n\t" " movq %%r8, 32(%0) \n\t"
+ " adcx %%rcx, %%r9 \n\t" " movq %%r9, 40(%0) \n\t"
+ :
+ : "r" (c), "r" (a)
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
+ );
+ #else
+ __asm__ __volatile__(
+ "movl $38, %%edx # 38 = 2^256 mod (2^255-19) \n\t"
+ "mulx 32(%1), %%r8, %%r9 # c*C[4] \n\t"
+ "mulx 40(%1), %%r10, %%r11 # c*C[5] \n\t" "addq %%r9, %%r10 \n\t"
+ "mulx 48(%1), %%r12, %%r13 # c*C[6] \n\t" "adcq %%r11, %%r12 \n\t"
+ "mulx 56(%1), %%rax, %%rcx # c*C[7] \n\t" "adcq %%r13, %%rax \n\t"
+ "adcq $0, %%rcx \n\t"
+
+ "addq (%1), %%r8 \n\t"
+ "adcq 8(%1), %%r10 \n\t"
+ "adcq 16(%1), %%r12 \n\t" "movq %%r12, 16(%0) \n\t"
+ "adcq 24(%1), %%rax \n\t" "movq %%rax, 24(%0) \n\t"
+ "adcq $0, %%rcx \n\t"
+
+ "mulx %%rcx, %%rax, %%rcx \n\t"
+ "addq %%rax, %%r8 \n\t" "movq %%r8, (%0) \n\t"
+ "adcq %%rcx, %%r10 \n\t" "movq %%r10, 8(%0) \n\t"
+
+ "mulx 96(%1), %%r8, %%r9 # c*C[12] \n\t"
+ "mulx 104(%1), %%r10, %%r11 # c*C[13] \n\t" "addq %%r9, %%r10 \n\t"
+ "mulx 112(%1), %%r12, %%r13 # c*C[14] \n\t" "adcq %%r11, %%r12 \n\t"
+ "mulx 120(%1), %%rax, %%rcx # c*C[15] \n\t" "adcq %%r13, %%rax \n\t"
+ "adcq $0, %%rcx \n\t"
+
+ "addq 64(%1), %%r8 \n\t"
+ "adcq 72(%1), %%r10 \n\t"
+ "adcq 80(%1), %%r12 \n\t" "movq %%r12, 48(%0) \n\t"
+ "adcq 88(%1), %%rax \n\t" "movq %%rax, 56(%0) \n\t"
+ "adcq $0, %%rcx \n\t"
+
+ "mulx %%rcx, %%rax, %%rcx \n\t"
+ "addq %%rax, %%r8 \n\t" " movq %%r8, 32(%0) \n\t"
+ "adcq %%rcx, %%r10 \n\t" " movq %%r10, 40(%0) \n\t"
+
+ :
+ : "r" (c), "r" (a)
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13"
+ );
+ #endif
+ #else /* Without BMI2 */
+ /* [TODO] */
+ #endif
+ }
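
Both reduction routines exploit 2^256 = 38 (mod 2^255-19): the high four words of a 512-bit value are multiplied by 38 and folded onto the low four, and the one word spilling out of that step is folded once more. The same idea in portable C, as an editorial sketch (again using the 128-bit extension; the helper name is ours):

    #include <stdint.h>

    typedef unsigned __int128 uint128_t;

    /* Sketch: fold a[0:7] (512 bits) to c[0:3] (256 bits) modulo
     * 2^255-19. The result is congruent but not yet canonical;
     * compare fred_EltFp25519_1w_x64 at the end of the file. */
    static void red_portable(uint64_t *c, const uint64_t *a)
    {
        uint128_t t;
        uint64_t carry = 0;
        int i;
        /* c + carry*2^256 = a[0:3] + 38*a[4:7]; carry <= 38. */
        for (i = 0; i < 4; i++) {
            t = (uint128_t)a[i + 4] * 38 + a[i] + carry;
            c[i] = (uint64_t)t;
            carry = (uint64_t)(t >> 64);
        }
        /* Fold the leftover word: carry*2^256 == carry*38. */
        t = (uint128_t)carry * 38 + c[0];
        c[0] = (uint64_t)t;
        carry = (uint64_t)(t >> 64); /* 0 or 1 */
        for (i = 1; i < 4; i++) {
            t = (uint128_t)c[i] + carry;
            c[i] = (uint64_t)t;
            carry = (uint64_t)(t >> 64);
        }
        c[0] += 38 * carry; /* only reachable in a vanishingly rare case */
    }
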
+
+ void mul_256x256_integer_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b)
+ {
+ #ifdef __BMI2__
+ #ifdef __ADX__
+ __asm__ __volatile__(
+ " movq (%1), %%rdx # A[0] \n\t"
+ " mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t" " xorl %%r10d, %%r10d \n\t" " movq %%r8, (%0) \n\t"
+ " mulx 8(%2), %%r10, %%r11 # A[0]*B[1] \n\t" " adox %%r9, %%r10 \n\t" " movq %%r10, 8(%0) \n\t"
+ " mulx 16(%2), %%r12, %%r13 # A[0]*B[2] \n\t" " adox %%r11, %%r12 \n\t"
+ " mulx 24(%2), %%r14, %%rdx # A[0]*B[3] \n\t" " adox %%r13, %%r14 \n\t" " movq $0, %%rax \n\t"
+ " adox %%rdx, %%rax \n\t"
+
+ " movq 8(%1), %%rdx # A[1] \n\t"
+ " mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t" " xorl %%r10d, %%r10d \n\t" " adcx 8(%0), %%r8 \n\t" " movq %%r8, 8(%0) \n\t"
+ " mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t" " adox %%r9, %%r10 \n\t" " adcx %%r12, %%r10 \n\t" " movq %%r10, 16(%0) \n\t"
+ " mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t" " adox %%r11, %%r12 \n\t" " adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
+ " mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t" " adox %%r13, %%r14 \n\t" " adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
+ " adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
+
+ " movq 16(%1), %%rdx # A[2] \n\t"
+ " mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t" " xorl %%r10d, %%r10d \n\t" " adcx 16(%0), %%r8 \n\t" " movq %%r8, 16(%0) \n\t"
+ " mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t" " adox %%r9, %%r10 \n\t" " adcx %%r12, %%r10 \n\t" " movq %%r10, 24(%0) \n\t"
+ " mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t" " adox %%r11, %%r12 \n\t" " adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
+ " mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t" " adox %%r13, %%r14 \n\t" " adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
+ " adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
+
+ " movq 24(%1), %%rdx # A[3] \n\t"
+ " mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t" " xorl %%r10d, %%r10d \n\t" " adcx 24(%0), %%r8 \n\t" " movq %%r8, 24(%0) \n\t"
+ " mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t" " adox %%r9, %%r10 \n\t" " adcx %%r12, %%r10 \n\t" " movq %%r10, 32(%0) \n\t"
+ " mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t" " adox %%r11, %%r12 \n\t" " adcx %%r14, %%r12 \n\t" " movq %%r12, 40(%0) \n\t" " movq $0, %%r8 \n\t"
+ " mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t" " adox %%r13, %%r14 \n\t" " adcx %%rax, %%r14 \n\t" " movq %%r14, 48(%0) \n\t" " movq $0, %%rax \n\t"
+ " adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t" " movq %%rax, 56(%0) \n\t"
+ :
+ : "r" (c), "r" (a), "r" (b)
+ : "memory", "cc", "%rax", "%rdx",
+ "%r8", "%r9", "%r10", "%r11",
+ "%r12", "%r13", "%r14"
+ );
+ #else
+ __asm__ __volatile__(
+ " movq (%1), %%rdx # A[0] \n\t"
+ " mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t" " movq %%r8, (%0) \n\t"
+ " mulx 8(%2), %%r10, %%rax # A[0]*B[1] \n\t" " addq %%r10, %%r9 \n\t" " movq %%r9, 8(%0) \n\t"
+ " mulx 16(%2), %%r12, %%rbx # A[0]*B[2] \n\t" " adcq %%r12, %%rax \n\t"
+ " mulx 24(%2), %%r14, %%rcx # A[0]*B[3] \n\t" " adcq %%r14, %%rbx \n\t"
+ " adcq $0, %%rcx \n\t"
+
+ " movq 8(%1), %%rdx # A[1] \n\t"
+ " mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t"
+ " mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t" " addq %%r10, %%r9 \n\t"
+ " mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t" " adcq %%r12, %%r11 \n\t"
+ " mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t" " adcq %%r14, %%r13 \n\t"
+ " adcq $0, %%rdx \n\t"
+
+ " addq %%r8, 8(%0) \n\t"
+ " adcq %%rax, %%r9 \n\t" " movq %%r9, 16(%0) \n\t" " movq $0, %%rax \n\t"
+ " adcq %%r11, %%rbx \n\t"
+ " adcq %%r13, %%rcx \n\t"
+ " adcq %%rdx, %%rax \n\t"
+
+ " movq 16(%1), %%rdx # A[2] \n\t"
+ " mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t"
+ " mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t" " addq %%r10, %%r9 \n\t"
+ " mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t" " adcq %%r12, %%r11 \n\t"
+ " mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t" " adcq %%r14, %%r13 \n\t"
+ " adcq $0, %%rdx \n\t"
+
+ " addq %%r8, 16(%0) \n\t"
+ " adcq %%rbx, %%r9 \n\t" " movq %%r9, 24(%0) \n\t" " movq $0, %%rbx \n\t"
+ " adcq %%r11, %%rcx \n\t"
+ " adcq %%r13, %%rax \n\t"
+ " adcq %%rdx, %%rbx \n\t"
+
+ " movq 24(%1), %%rdx # A[3] \n\t"
+ " mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t"
+ " mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t" " addq %%r10, %%r9 \n\t"
+ " mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t" " adcq %%r12, %%r11 \n\t"
+ " mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t" " adcq %%r14, %%r13 \n\t"
+ " adcq $0, %%rdx \n\t"
+
+ " addq %%r8, 24(%0) \n\t"
+ " adcq %%rcx, %%r9 \n\t" " movq %%r9, 32(%0) \n\t" " movq $0, %%rcx \n\t"
+ " adcq %%r11, %%rax \n\t" " movq %%rax, 40(%0) \n\t"
+ " adcq %%r13, %%rbx \n\t" " movq %%rbx, 48(%0) \n\t"
+ " adcq %%rdx, %%rcx \n\t" " movq %%rcx, 56(%0) \n\t"
+ :
+ : "r" (c), "r" (a), "r" (b)
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8",
+ "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
+ );
+ #endif
+ #else /* Without BMI2 */
+ /**
+ * TODO: Multiplications using MULQ instruction.
+ **/
+ #endif
+ }
+
+ void sqr_256x256_integer_x64(uint64_t *const c, uint64_t *const a)
+ {
+ #ifdef __BMI2__
+ __asm__ __volatile__(
+ " movq (%1), %%rdx # A[0] \n\t"
+ " mulx %%rdx, %%r8, %%r9 # A[0]^2 \n\t"
+ " movq 8(%1), %%rdx # A[1] \n\t"
+ " mulx %%rdx, %%r10, %%r11 # A[1]^2 \n\t"
+ " movq %%r8, (%0) \n\t"
+ " movq %%r9, 8(%0) \n\t"
+ " movq %%r10, 16(%0) \n\t"
+ " movq %%r11, 24(%0) \n\t"
+
+ " movq 16(%1), %%rdx # A[2] \n\t"
+ " mulx %%rdx, %%r8, %%r9 # A[2]^2 \n\t"
+ " movq 24(%1), %%rdx # A[3] \n\t"
+ " mulx %%rdx, %%r10, %%r11 # A[3]^2 \n\t"
+ " movq %%r8, 32(%0) \n\t"
+ " movq %%r9, 40(%0) \n\t"
+ " movq %%r10, 48(%0) \n\t"
+ " movq %%r11, 56(%0) \n\t"
+
+ " movq 8(%1), %%rdx # A[1] \n\t"
+ " mulx (%1), %%r8, %%r9 # A[0]*A[1] \n\t"
+ " mulx 16(%1), %%r10, %%r11 # A[2]*A[1] \n\t"
+ " mulx 24(%1), %%rcx, %%r14 # A[3]*A[1] \n\t"
+
+ " movq 16(%1), %%rdx # A[2] \n\t"
+ " mulx 24(%1), %%r12, %%r13 # A[3]*A[2] \n\t"
+ " mulx (%1), %%rax, %%rdx # A[0]*A[2] \n\t"
+
+ " addq %%rax, %%r9 \n\t"
+ " adcq %%rdx, %%r10 \n\t"
+ " adcq %%rcx, %%r11 \n\t"
+ " adcq %%r14, %%r12 \n\t"
+ " adcq $0, %%r13 \n\t"
+ " movq $0, %%r14 \n\t"
+ " adcq $0, %%r14 \n\t"
+
+ " movq (%1), %%rdx # A[0] \n\t"
+ " mulx 24(%1), %%rax, %%rdx # A[0]*A[3] \n\t"
+
+ " addq %%rax, %%r10 \n\t"
+ " adcq %%rdx, %%r11 \n\t"
+ " adcq $0, %%r12 \n\t"
+ " adcq $0, %%r13 \n\t"
+ " adcq $0, %%r14 \n\t"
+
+ " shldq $1, %%r13, %%r14 \n\t"
+ " shldq $1, %%r12, %%r13 \n\t"
+ " shldq $1, %%r11, %%r12 \n\t"
+ " shldq $1, %%r10, %%r11 \n\t"
+ " shldq $1, %%r9, %%r10 \n\t"
+ " shldq $1, %%r8, %%r9 \n\t"
+ " shlq $1, %%r8 \n\t"
+
+ " addq 8(%0), %%r8 \n\t" " movq %%r8, 8(%0) \n\t"
+ " adcq 16(%0), %%r9 \n\t" " movq %%r9, 16(%0) \n\t"
+ " adcq 24(%0), %%r10 \n\t" " movq %%r10, 24(%0) \n\t"
+ " adcq 32(%0), %%r11 \n\t" " movq %%r11, 32(%0) \n\t"
+ " adcq 40(%0), %%r12 \n\t" " movq %%r12, 40(%0) \n\t"
+ " adcq 48(%0), %%r13 \n\t" " movq %%r13, 48(%0) \n\t"
+ " adcq 56(%0), %%r14 \n\t" " movq %%r14, 56(%0) \n\t"
+ :
+ : "r" (c), "r" (a)
+ : "memory", "cc", "%rax", "%rcx", "%rdx",
+ "%r8", "%r9", "%r10", "%r11",
+ "%r12", "%r13", "%r14"
+ );
+ #else /* Without BMI2 */
+ /**
+ * TODO: Multiplications using MULQ instruction.
+ **/
+ #endif
+ }
+
+ void red_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a)
+ {
+ #ifdef __BMI2__
+ #ifdef __ADX__
+ __asm__ __volatile__(
+ " movl $38, %%edx # 38 = 2^256 mod (2^255-19) \n\t"
+ " mulx 32(%1), %%r8, %%r10 # c*C[4] \n\t" " xorl %%ebx, %%ebx \n\t" " adox (%1), %%r8 \n\t"
+ " mulx 40(%1), %%r9, %%r11 # c*C[5] \n\t" " adcx %%r10, %%r9 \n\t" " adox 8(%1), %%r9 \n\t"
+ " mulx 48(%1), %%r10, %%rax # c*C[6] \n\t" " adcx %%r11, %%r10 \n\t" " adox 16(%1), %%r10 \n\t" " movq %%r10, 16(%0) \n\t"
+ " mulx 56(%1), %%r11, %%rcx # c*C[7] \n\t" " adcx %%rax, %%r11 \n\t" " adox 24(%1), %%r11 \n\t" " movq %%r11, 24(%0) \n\t"
+ " adcx %%rbx, %%rcx \n\t" " adox %%rbx, %%rcx \n\t"
+ " xorl %%ebx, %%ebx \n\t"
+ " mulx %%rcx, %%rax, %%rcx \n\t" " adcx %%rax, %%r8 \n\t" " movq %%r8, (%0) \n\t"
+ " adcx %%rcx, %%r9 \n\t" " movq %%r9, 8(%0) \n\t"
+ :
+ : "r" (c), "r" (a)
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
+ );
+ #else
+ __asm__ __volatile__(
+ " movl $38, %%edx # 38 = 2^256 mod (2^255-19) \n\t"
+ " mulx 32(%1), %%r8, %%r9 # c*C[4] \n\t"
+ " mulx 40(%1), %%r10, %%r11 # c*C[5] \n\t" " addq %%r9, %%r10 \n\t"
+ " mulx 48(%1), %%r12, %%r13 # c*C[6] \n\t" " adcq %%r11, %%r12 \n\t"
+ " mulx 56(%1), %%rax, %%rcx # c*C[7] \n\t" " adcq %%r13, %%rax \n\t"
+ " adcq $0, %%rcx \n\t"
+
+ " addq (%1), %%r8 \n\t"
+ " adcq 8(%1), %%r10 \n\t"
+ " adcq 16(%1), %%r12 \n\t" " movq %%r12, 16(%0) \n\t"
+ " adcq 24(%1), %%rax \n\t" " movq %%rax, 24(%0) \n\t"
+ " adcq $0, %%rcx \n\t"
+
+ " mulx %%rcx, %%rax, %%rcx \n\t"
+ " addq %%rax, %%r8 \n\t" " movq %%r8, (%0) \n\t"
+ " adcq %%rcx, %%r10 \n\t" " movq %%r10, 8(%0) \n\t"
+ :
+ : "r" (c), "r" (a)
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13"
+ );
+ #endif
+ #else /* Without BMI2 */
+ /**
+ * TODO: Multiplications using MULQ instruction.
+ **/
+ #endif
+ }
+
+ inline void add_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b)
+ {
+ #ifdef __ADX__
+ __asm__ __volatile__(
+ "movq (%2), %%rax \n\t"
+ "movq 8(%2), %%rcx \n\t"
+ "movq 16(%2), %%r8 \n\t"
+ "movq 24(%2), %%r9 \n\t"
+ "clc \n\t"
+ "adcx (%1), %%rax \n\t"
+ "adcx 8(%1), %%rcx \n\t"
+ "adcx 16(%1), %%r8 \n\t"
+ "adcx 24(%1), %%r9 \n\t"
+ "movq %%rcx, 8(%0) \n\t"
+ "movq %%r8 , 16(%0) \n\t"
+ "movq %%r9 , 24(%0) \n\t"
+ "setc %%cl \n\t"
+ "neg %%rcx \n\t"
+ "andq $38, %%rcx \n\t"
+ "addq %%rcx, %%rax \n\t"
+ "movq %%rax, (%0) \n\t"
+ :
+ : "r" (c), "r" (a), "r" (b)
+ : "memory","cc", "%rax", "%rcx", "%r8", "%r9"
+ );
+ #else
+ __asm__ __volatile__(
+ "movq (%2), %%rax \n\t"
+ "movq 8(%2), %%rcx \n\t"
+ "movq 16(%2), %%r8 \n\t"
+ "movq 24(%2), %%r9 \n\t"
+ "add (%1), %%rax \n\t"
+ "adc 8(%1), %%rcx \n\t"
+ "adc 16(%1), %%r8 \n\t"
+ "adc 24(%1), %%r9 \n\t"
+ "movq %%rcx, 8(%0) \n\t"
+ "movq %%r8 , 16(%0) \n\t"
+ "movq %%r9 , 24(%0) \n\t"
+ "setc %%cl \n\t"
+ "neg %%rcx \n\t"
+ "andq $38, %%rcx \n\t"
+ "addq %%rcx, %%rax \n\t"
+ "movq %%rax, (%0) \n\t"
+ :
+ : "r" (c), "r" (a), "r" (b)
+ : "memory","cc", "%rax", "%rcx", "%r8", "%r9"
+ );
+ #endif
+ }
+
+ inline void sub_EltFp25519_1w_x64(uint64_t *const __restrict c, uint64_t *const __restrict a,
+ uint64_t *const __restrict b)
+ {
+ __asm__ __volatile__(
+ "movq (%1), %%rax \n\t"
+ "movq 8(%1), %%rcx \n\t"
+ "movq 16(%1), %%r8 \n\t"
+ "movq 24(%1), %%r9 \n\t"
+ "subq (%2), %%rax \n\t"
+ "sbbq 8(%2), %%rcx \n\t"
+ "sbbq 16(%2), %%r8 \n\t"
+ "sbbq 24(%2), %%r9 \n\t"
+ "movq %%rcx, 8(%0) \n\t"
+ "movq %%r8 , 16(%0) \n\t"
+ "movq %%r9 , 24(%0) \n\t"
+ "setc %%cl \n\t"
+ "neg %%rcx \n\t"
+ "andq $38, %%rcx \n\t"
+ "subq %%rcx, %%rax \n\t"
+ "movq %%rax, (%0) \n\t"
+ :
+ : "r" (c), "r" (a), "r" (b)
+ : "memory","cc", "%rax", "%rcx", "%r8", "%r9"
+ );
+ }
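
Both add_EltFp25519_1w_x64 and sub_EltFp25519_1w_x64 keep results in four words with the same branch-free trick: SETC materializes the carry (or borrow) out of bit 256, NEG turns it into a mask of all zeros or all ones, and ANDQ $38 reduces that to 0 or 38, which is folded into the low word because 2^256 = 38 (mod 2^255-19). The C equivalent of the mask computation, as an editorial sketch:

    #include <stdint.h>

    /* Sketch of the SETC/NEG/AND fold: carry is 0 or 1. */
    static inline uint64_t carry_fold_38(uint64_t carry)
    {
        uint64_t mask = 0 - carry;   /* 0x0 or 0xFFFFFFFFFFFFFFFF */
        return 38 & mask;            /* 0 or 38, no branch */
    }
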
+
+ inline void mul_a24_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a)
+ {
+ #ifdef __BMI2__
+ /**
+ * a24 = (A+2)/4 = (486662+2)/4 = 121666
+ **/
+ const uint64_t a24 = 121666;
+ __asm__ __volatile__(
+ "movq %2, %%rdx \n\t"
+ "mulx (%1), %%rax, %%r8 \n\t"
+ "mulx 8(%1), %%rcx, %%r9 \n\t"
+ "movq %%rax, (%0) \n\t"
+ "movq %%rcx, 8(%0) \n\t"
+ "mulx 16(%1), %%rax, %%r10 \n\t"
+ "mulx 24(%1), %%rcx, %%r11 \n\t"
+ "movq %%rax, 16(%0) \n\t"
+ "movq %%rcx, 24(%0) \n\t"
+ "movq $38, %%rdx \n\t"
+ "mulx %%r11, %%rax, %%rcx \n\t"
+ "addq %%rax, (%0) \n\t"
+ "adcq %%r8, 8(%0) \n\t"
+ "adcq %%r9, 16(%0) \n\t"
+ "adcq %%r10, 24(%0) \n\t"
+ :
+ : "r" (c), "r" (a), "r" (a24)
+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
+ );
+ #else /* Without BMI2 */
+ /**
+ * TODO: Multiplications using MULQ instruction.
+ **/
+ #endif
+ }
+
+ void inv_EltFp25519_1w_x64(uint64_t *const pC, uint64_t *const pA)
+ {
+ #define sqrn_EltFp25519_1w_x64(a,times)\
+ counter = times;\
+ while(counter-- > 0)\
+ {\
+ sqr_EltFp25519_1w_x64(a);\
+ }
+
+ EltFp25519_1w_Buffer_x64 buffer_1w;
+ EltFp25519_1w_x64 x0, x1, x2;
+ uint64_t * T[5];
+ uint64_t counter;
+
+ T[0] = x0;
+ T[1] = pC; /* x^(-1) */
+ T[2] = x1;
+ T[3] = x2;
+ T[4] = pA; /* x */
+
+ copy_EltFp25519_1w_x64(T[1],pA);
+ sqrn_EltFp25519_1w_x64(T[1],1);
+ copy_EltFp25519_1w_x64(T[2],T[1]);
+ sqrn_EltFp25519_1w_x64(T[2],2);
+ mul_EltFp25519_1w_x64(T[0], pA, T[2]);
+ mul_EltFp25519_1w_x64(T[1], T[1], T[0]);
+ copy_EltFp25519_1w_x64(T[2],T[1]);
+ sqrn_EltFp25519_1w_x64(T[2],1);
+ mul_EltFp25519_1w_x64(T[0], T[0], T[2]);
+ copy_EltFp25519_1w_x64(T[2],T[0]);
+ sqrn_EltFp25519_1w_x64(T[2],5);
+ mul_EltFp25519_1w_x64(T[0], T[0], T[2]);
+ copy_EltFp25519_1w_x64(T[2],T[0]);
+ sqrn_EltFp25519_1w_x64(T[2],10);
+ mul_EltFp25519_1w_x64(T[2], T[2], T[0]);
+ copy_EltFp25519_1w_x64(T[3],T[2]);
+ sqrn_EltFp25519_1w_x64(T[3],20);
+ mul_EltFp25519_1w_x64(T[3], T[3], T[2]);
+ sqrn_EltFp25519_1w_x64(T[3],10);
+ mul_EltFp25519_1w_x64(T[3], T[3], T[0]);
+ copy_EltFp25519_1w_x64(T[0],T[3]);
+ sqrn_EltFp25519_1w_x64(T[0],50);
+ mul_EltFp25519_1w_x64(T[0], T[0], T[3]);
+ copy_EltFp25519_1w_x64(T[2],T[0]);
+ sqrn_EltFp25519_1w_x64(T[2],100);
+ mul_EltFp25519_1w_x64(T[2], T[2], T[0]);
+ sqrn_EltFp25519_1w_x64(T[2],50);
+ mul_EltFp25519_1w_x64(T[2], T[2], T[3]);
+ sqrn_EltFp25519_1w_x64(T[2],5);
+ mul_EltFp25519_1w_x64(T[1], T[1], T[2]);
+ #undef sqrn_EltFp25519_1w_x64
+ }
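
inv_EltFp25519_1w_x64 is Fermat inversion: since p = 2^255-19 is prime, x^(p-2) = x^(2^255-21) is the multiplicative inverse of x modulo p. The copy/square/multiply sequence above is the standard Curve25519 addition chain for this exponent: it builds x^(2^5-1) from x^11 and x^9, then x^(2^k-1) for k = 10, 20, 40, 50, 100, 200, 250, and finishes with five squarings and one multiplication by x^11 (held in T[1] since the start), using 2^255 - 21 = (2^250 - 1)*2^5 + 11. The chain costs 254 squarings and 11 multiplications, all with public, fixed exponents, so it runs in constant time. Note that buffer_1w looks unused here, but mul_EltFp25519_1w_x64 and sqr_EltFp25519_1w_x64 are presumably macros from fp25519_x64.h that expand to the integer routines above followed by a reduction, scratching into that local buffer.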
+
+ inline void fred_EltFp25519_1w_x64(uint64_t *const c)
+ {
+ int64_t last = (((int64_t*)c)[3])>>63;
+ c[3] &= ((uint64_t)1<<63)-1;
+ c[0] += 19 & last;
+ }
+
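Finally, fred_EltFp25519_1w_x64 performs the cheap last folding step: the arithmetic right shift by 63 makes last either 0 or all ones depending on bit 255, the mask then clears that bit, and 19 & last adds 19 back exactly when the bit was set, since 2^255 = 19 (mod 2^255-19). This maps the value into [0, 2^255); producing the fully canonical representative below p = 2^255-19 would still require one further conditional subtraction of p.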