x25519 1.0.5 → 1.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ci.yml +42 -0
- data/.rubocop.yml +1 -1
- data/CHANGELOG.md +90 -0
- data/Gemfile +2 -2
- data/LICENSE +32 -165
- data/README.md +35 -18
- data/ext/x25519_precomputed/extconf.rb +1 -1
- data/ext/x25519_precomputed/fp25519_x64.c +865 -746
- data/ext/x25519_precomputed/fp25519_x64.h +89 -54
- data/ext/x25519_precomputed/table_ladder_x25519.h +534 -267
- data/ext/x25519_precomputed/x25519_precomputed.h +33 -12
- data/ext/x25519_precomputed/x25519_x64.c +237 -217
- data/lib/x25519.rb +3 -2
- data/lib/x25519/montgomery_u.rb +1 -1
- data/lib/x25519/scalar.rb +1 -0
- data/lib/x25519/version.rb +1 -1
- data/x25519.gemspec +5 -5
- metadata +15 -16
- data/.travis.yml +0 -21
- data/CHANGES.md +0 -61
- data/appveyor.yml +0 -20
@@ -1,213 +1,210 @@
|
|
1
1
|
/**
|
2
|
-
* Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>.
|
2
|
+
* Copyright (c) 2017, Armando Faz <armfazh@ic.unicamp.br>. All rights reserved.
|
3
3
|
* Institute of Computing.
|
4
4
|
* University of Campinas, Brazil.
|
5
5
|
*
|
6
|
-
*
|
7
|
-
*
|
8
|
-
* published by the Free Software Foundation, version 3.
|
6
|
+
* Copyright (C) 2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
|
7
|
+
* Copyright (C) 2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
|
9
8
|
*
|
10
|
-
*
|
11
|
-
*
|
12
|
-
*
|
13
|
-
* Lesser General Public License for more details.
|
9
|
+
* Redistribution and use in source and binary forms, with or without
|
10
|
+
* modification, are permitted provided that the following conditions
|
11
|
+
* are met:
|
14
12
|
*
|
15
|
-
*
|
16
|
-
*
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
13
|
+
* * Redistributions of source code must retain the above copyright
|
14
|
+
* notice, this list of conditions and the following disclaimer.
|
15
|
+
* * Redistributions in binary form must reproduce the above
|
16
|
+
* copyright notice, this list of conditions and the following
|
17
|
+
* disclaimer in the documentation and/or other materials provided
|
18
|
+
* with the distribution.
|
19
|
+
* * Neither the name of University of Campinas nor the names of its
|
20
|
+
* contributors may be used to endorse or promote products derived
|
21
|
+
* from this software without specific prior written permission.
|
22
|
+
*
|
23
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
24
|
+
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
25
|
+
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
26
|
+
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
27
|
+
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
28
|
+
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
29
|
+
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
30
|
+
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
31
|
+
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
32
|
+
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
33
|
+
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
34
|
+
* OF THE POSSIBILITY OF SUCH DAMAGE.
|
35
|
+
*/
|
30
36
|
|
31
|
-
|
32
|
-
{
|
33
|
-
return compare_bytes((uint8_t*)A,(uint8_t*)B,SIZE_ELEMENT_BYTES);
|
34
|
-
}
|
37
|
+
#include "fp25519_x64.h"
|
35
38
|
|
36
39
|
/**
|
37
40
|
*
|
38
|
-
* @param c Two 512-bit products:
|
39
|
-
* @param a Two 256-bit integers:
|
40
|
-
* @param b Two 256-bit integers:
|
41
|
+
* @param c Two 512-bit products: c0[0:7]=a0[0:3]*b0[0:3] and c1[8:15]=a1[4:7]*b1[4:7]
|
42
|
+
* @param a Two 256-bit integers: a0[0:3] and a1[4:7]
|
43
|
+
* @param b Two 256-bit integers: b0[0:3] and b1[4:7]
|
41
44
|
*/
|
42
|
-
void mul2_256x256_integer_x64(uint64_t *const c, uint64_t *const a,
|
43
|
-
{
|
45
|
+
void mul2_256x256_integer_x64(uint64_t *const c, uint64_t *const a,
|
46
|
+
uint64_t *const b) {
|
44
47
|
#ifdef __BMI2__
|
45
48
|
#ifdef __ADX__
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
49
|
+
__asm__ __volatile__(
|
50
|
+
"xorl %%r14d, %%r14d ;"
|
51
|
+
"movq (%1), %%rdx; " /* A[0] */
|
52
|
+
"mulx (%2), %%r8, %%r12; " /* A[0]*B[0] */ "xorl %%r10d, %%r10d ;" "movq %%r8, (%0) ;"
|
53
|
+
"mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ "adox %%r10, %%r12 ;"
|
54
|
+
"mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ "adox %%r8, %%rax ;"
|
55
|
+
"mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ "adox %%r10, %%rbx ;"
|
56
|
+
/*******************************************/ "adox %%r14, %%rcx ;"
|
57
|
+
|
58
|
+
"movq 8(%1), %%rdx; " /* A[1] */
|
59
|
+
"mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ "adox %%r12, %%r8 ;" "movq %%r8, 8(%0) ;"
|
60
|
+
"mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rax ;"
|
61
|
+
"mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%rbx ;"
|
62
|
+
"mulx 24(%2), %%r10, %%r12; " /* A[1]*B[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%rcx ;"
|
63
|
+
/*******************************************/ "adox %%r14, %%r12 ;" "adcx %%r14, %%r12 ;"
|
64
|
+
|
65
|
+
"movq 16(%1), %%rdx; " /* A[2] */ "xorl %%r10d, %%r10d ;"
|
66
|
+
"mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ "adox %%rax, %%r8 ;" "movq %%r8, 16(%0) ;"
|
67
|
+
"mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rbx ;"
|
68
|
+
"mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%rcx ;"
|
69
|
+
"mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%r12 ;"
|
70
|
+
/*******************************************/ "adox %%r14, %%rax ;" "adcx %%r14, %%rax ;"
|
71
|
+
|
72
|
+
"movq 24(%1), %%rdx; " /* A[3] */ "xorl %%r10d, %%r10d ;"
|
73
|
+
"mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ "adox %%rbx, %%r8 ;" "movq %%r8, 24(%0) ;"
|
74
|
+
"mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rcx ;" "movq %%rcx, 32(%0) ;"
|
75
|
+
"mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%r12 ;" "movq %%r12, 40(%0) ;"
|
76
|
+
"mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%rax ;" "movq %%rax, 48(%0) ;"
|
77
|
+
/*******************************************/ "adox %%r14, %%rbx ;" "adcx %%r14, %%rbx ;" "movq %%rbx, 56(%0) ;"
|
78
|
+
|
79
|
+
"movq 32(%1), %%rdx; " /* C[0] */
|
80
|
+
"mulx 32(%2), %%r8, %%r12; " /* C[0]*D[0] */ "xorl %%r10d, %%r10d ;" "movq %%r8, 64(%0);"
|
81
|
+
"mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */ "adox %%r10, %%r12 ;"
|
82
|
+
"mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */ "adox %%r8, %%rax ;"
|
83
|
+
"mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */ "adox %%r10, %%rbx ;"
|
84
|
+
/*******************************************/ "adox %%r14, %%rcx ;"
|
85
|
+
|
86
|
+
"movq 40(%1), %%rdx; " /* C[1] */ "xorl %%r10d, %%r10d ;"
|
87
|
+
"mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */ "adox %%r12, %%r8 ;" "movq %%r8, 72(%0);"
|
88
|
+
"mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rax ;"
|
89
|
+
"mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%rbx ;"
|
90
|
+
"mulx 56(%2), %%r10, %%r12; " /* C[1]*D[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%rcx ;"
|
91
|
+
/*******************************************/ "adox %%r14, %%r12 ;" "adcx %%r14, %%r12 ;"
|
92
|
+
|
93
|
+
"movq 48(%1), %%rdx; " /* C[2] */ "xorl %%r10d, %%r10d ;"
|
94
|
+
"mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */ "adox %%rax, %%r8 ;" "movq %%r8, 80(%0);"
|
95
|
+
"mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rbx ;"
|
96
|
+
"mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%rcx ;"
|
97
|
+
"mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%r12 ;"
|
98
|
+
/*******************************************/ "adox %%r14, %%rax ;" "adcx %%r14, %%rax ;"
|
99
|
+
|
100
|
+
"movq 56(%1), %%rdx; " /* C[3] */ "xorl %%r10d, %%r10d ;"
|
101
|
+
"mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */ "adox %%rbx, %%r8 ;" "movq %%r8, 88(%0);"
|
102
|
+
"mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rcx ;" "movq %%rcx, 96(%0) ;"
|
103
|
+
"mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%r12 ;" "movq %%r12, 104(%0) ;"
|
104
|
+
"mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%rax ;" "movq %%rax, 112(%0) ;"
|
105
|
+
/*******************************************/ "adox %%r14, %%rbx ;" "adcx %%r14, %%rbx ;" "movq %%rbx, 120(%0) ;"
|
106
|
+
:
|
107
|
+
: "r" (c), "r" (a), "r" (b)
|
108
|
+
: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx",
|
109
|
+
"%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
|
110
|
+
);
|
108
111
|
#else
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
"adcq %%rdx, %%rcx \n\t" "movq %%rcx, 120(%0) \n\t"
|
201
|
-
:
|
202
|
-
: "r" (c), "r" (a), "r" (b)
|
203
|
-
: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8",
|
204
|
-
"%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
|
205
|
-
);
|
112
|
+
__asm__ __volatile__(
|
113
|
+
"movq (%1), %%rdx; " /* A[0] */
|
114
|
+
"mulx (%2), %%r8, %%r12; " /* A[0]*B[0] */ "movq %%r8, (%0) ;"
|
115
|
+
"mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ "addq %%r10, %%r12 ;"
|
116
|
+
"mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ "adcq %%r8, %%rax ;"
|
117
|
+
"mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ "adcq %%r10, %%rbx ;"
|
118
|
+
/*******************************************/ "adcq $0, %%rcx ;"
|
119
|
+
|
120
|
+
"movq 8(%1), %%rdx; " /* A[1] */
|
121
|
+
"mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ "addq %%r12, %%r8 ;" "movq %%r8, 8(%0) ;"
|
122
|
+
"mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ "adcq %%r10, %%r9 ;"
|
123
|
+
"mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ "adcq %%r8, %%r11 ;"
|
124
|
+
"mulx 24(%2), %%r10, %%r12; " /* A[1]*B[3] */ "adcq %%r10, %%r13 ;"
|
125
|
+
/*******************************************/ "adcq $0, %%r12 ;"
|
126
|
+
|
127
|
+
"addq %%r9, %%rax ;"
|
128
|
+
"adcq %%r11, %%rbx ;"
|
129
|
+
"adcq %%r13, %%rcx ;"
|
130
|
+
"adcq $0, %%r12 ;"
|
131
|
+
|
132
|
+
"movq 16(%1), %%rdx; " /* A[2] */
|
133
|
+
"mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ "addq %%rax, %%r8 ;" "movq %%r8, 16(%0) ;"
|
134
|
+
"mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ "adcq %%r10, %%r9 ;"
|
135
|
+
"mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ "adcq %%r8, %%r11 ;"
|
136
|
+
"mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ "adcq %%r10, %%r13 ;"
|
137
|
+
/*******************************************/ "adcq $0, %%rax ;"
|
138
|
+
|
139
|
+
"addq %%r9, %%rbx ;"
|
140
|
+
"adcq %%r11, %%rcx ;"
|
141
|
+
"adcq %%r13, %%r12 ;"
|
142
|
+
"adcq $0, %%rax ;"
|
143
|
+
|
144
|
+
"movq 24(%1), %%rdx; " /* A[3] */
|
145
|
+
"mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ "addq %%rbx, %%r8 ;" "movq %%r8, 24(%0) ;"
|
146
|
+
"mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ "adcq %%r10, %%r9 ;"
|
147
|
+
"mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ "adcq %%r8, %%r11 ;"
|
148
|
+
"mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ "adcq %%r10, %%r13 ;"
|
149
|
+
/*******************************************/ "adcq $0, %%rbx ;"
|
150
|
+
|
151
|
+
"addq %%r9, %%rcx ;" "movq %%rcx, 32(%0) ;"
|
152
|
+
"adcq %%r11, %%r12 ;" "movq %%r12, 40(%0) ;"
|
153
|
+
"adcq %%r13, %%rax ;" "movq %%rax, 48(%0) ;"
|
154
|
+
"adcq $0, %%rbx ;" "movq %%rbx, 56(%0) ;"
|
155
|
+
|
156
|
+
"movq 32(%1), %%rdx; " /* C[0] */
|
157
|
+
"mulx 32(%2), %%r8, %%r12; " /* C[0]*D[0] */ "movq %%r8, 64(%0) ;"
|
158
|
+
"mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */ "addq %%r10, %%r12 ;"
|
159
|
+
"mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */ "adcq %%r8, %%rax ;"
|
160
|
+
"mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */ "adcq %%r10, %%rbx ;"
|
161
|
+
/*******************************************/ "adcq $0, %%rcx ;"
|
162
|
+
|
163
|
+
"movq 40(%1), %%rdx; " /* C[1] */
|
164
|
+
"mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */ "addq %%r12, %%r8 ;" "movq %%r8, 72(%0) ;"
|
165
|
+
"mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */ "adcq %%r10, %%r9 ;"
|
166
|
+
"mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */ "adcq %%r8, %%r11 ;"
|
167
|
+
"mulx 56(%2), %%r10, %%r12; " /* C[1]*D[3] */ "adcq %%r10, %%r13 ;"
|
168
|
+
/*******************************************/ "adcq $0, %%r12 ;"
|
169
|
+
|
170
|
+
"addq %%r9, %%rax ;"
|
171
|
+
"adcq %%r11, %%rbx ;"
|
172
|
+
"adcq %%r13, %%rcx ;"
|
173
|
+
"adcq $0, %%r12 ;"
|
174
|
+
|
175
|
+
"movq 48(%1), %%rdx; " /* C[2] */
|
176
|
+
"mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */ "addq %%rax, %%r8 ;" "movq %%r8, 80(%0) ;"
|
177
|
+
"mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */ "adcq %%r10, %%r9 ;"
|
178
|
+
"mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */ "adcq %%r8, %%r11 ;"
|
179
|
+
"mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */ "adcq %%r10, %%r13 ;"
|
180
|
+
/*******************************************/ "adcq $0, %%rax ;"
|
181
|
+
|
182
|
+
"addq %%r9, %%rbx ;"
|
183
|
+
"adcq %%r11, %%rcx ;"
|
184
|
+
"adcq %%r13, %%r12 ;"
|
185
|
+
"adcq $0, %%rax ;"
|
186
|
+
|
187
|
+
"movq 56(%1), %%rdx; " /* C[3] */
|
188
|
+
"mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */ "addq %%rbx, %%r8 ;" "movq %%r8, 88(%0) ;"
|
189
|
+
"mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */ "adcq %%r10, %%r9 ;"
|
190
|
+
"mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */ "adcq %%r8, %%r11 ;"
|
191
|
+
"mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */ "adcq %%r10, %%r13 ;"
|
192
|
+
/*******************************************/ "adcq $0, %%rbx ;"
|
193
|
+
|
194
|
+
"addq %%r9, %%rcx ;" "movq %%rcx, 96(%0) ;"
|
195
|
+
"adcq %%r11, %%r12 ;" "movq %%r12, 104(%0) ;"
|
196
|
+
"adcq %%r13, %%rax ;" "movq %%rax, 112(%0) ;"
|
197
|
+
"adcq $0, %%rbx ;" "movq %%rbx, 120(%0) ;"
|
198
|
+
:
|
199
|
+
: "r" (c), "r" (a), "r" (b)
|
200
|
+
: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx",
|
201
|
+
"%r8", "%r9", "%r10", "%r11", "%r12", "%r13"
|
202
|
+
);
|
206
203
|
#endif
|
207
204
|
#else /* Without BMI2 */
|
208
|
-
|
209
|
-
|
210
|
-
|
205
|
+
/**
|
206
|
+
* TODO: Multiplications using MULQ instruction.
|
207
|
+
**/
|
211
208
|
#endif
|
212
209
|
}
|
213
210
|
|
@@ -216,140 +213,186 @@ void mul2_256x256_integer_x64(uint64_t *const c, uint64_t *const a, uint64_t *co
|
|
216
213
|
* @param c
|
217
214
|
* @param a
|
218
215
|
*/
|
219
|
-
void sqr2_256x256_integer_x64(uint64_t *const c, uint64_t *const a)
|
220
|
-
{
|
216
|
+
void sqr2_256x256_integer_x64(uint64_t *const c, uint64_t *const a) {
|
221
217
|
#ifdef __BMI2__
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
218
|
+
#ifdef __ADX__
|
219
|
+
__asm__ __volatile__(
|
220
|
+
"movq (%1), %%rdx ;" /* A[0] */
|
221
|
+
"mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */ "xorl %%r15d, %%r15d;"
|
222
|
+
"mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */ "adcx %%r14, %%r9 ;"
|
223
|
+
"mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */ "adcx %%rax, %%r10 ;"
|
224
|
+
"movq 24(%1), %%rdx ;" /* A[3] */
|
225
|
+
"mulx 8(%1), %%r11, %%r12 ;" /* A[1]*A[3] */ "adcx %%rcx, %%r11 ;"
|
226
|
+
"mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */ "adcx %%rax, %%r12 ;"
|
227
|
+
"movq 8(%1), %%rdx ;" /* A[1] */ "adcx %%r15, %%r13 ;"
|
228
|
+
"mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */ "movq $0, %%r14 ;"
|
229
|
+
/*******************************************/ "adcx %%r15, %%r14 ;"
|
230
|
+
|
231
|
+
"xorl %%r15d, %%r15d;"
|
232
|
+
"adox %%rax, %%r10 ;" "adcx %%r8, %%r8 ;"
|
233
|
+
"adox %%rcx, %%r11 ;" "adcx %%r9, %%r9 ;"
|
234
|
+
"adox %%r15, %%r12 ;" "adcx %%r10, %%r10 ;"
|
235
|
+
"adox %%r15, %%r13 ;" "adcx %%r11, %%r11 ;"
|
236
|
+
"adox %%r15, %%r14 ;" "adcx %%r12, %%r12 ;"
|
237
|
+
"adcx %%r13, %%r13 ;"
|
238
|
+
"adcx %%r14, %%r14 ;"
|
239
|
+
|
240
|
+
"movq (%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
|
241
|
+
/********************/ "movq %%rax, 0(%0) ;"
|
242
|
+
"addq %%rcx, %%r8 ;" "movq %%r8, 8(%0) ;"
|
243
|
+
"movq 8(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
|
244
|
+
"adcq %%rax, %%r9 ;" "movq %%r9, 16(%0) ;"
|
245
|
+
"adcq %%rcx, %%r10 ;" "movq %%r10, 24(%0) ;"
|
246
|
+
"movq 16(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
|
247
|
+
"adcq %%rax, %%r11 ;" "movq %%r11, 32(%0) ;"
|
248
|
+
"adcq %%rcx, %%r12 ;" "movq %%r12, 40(%0) ;"
|
249
|
+
"movq 24(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
|
250
|
+
"adcq %%rax, %%r13 ;" "movq %%r13, 48(%0) ;"
|
251
|
+
"adcq %%rcx, %%r14 ;" "movq %%r14, 56(%0) ;"
|
252
|
+
|
253
|
+
|
254
|
+
"movq 32(%1), %%rdx ;" /* B[0] */
|
255
|
+
"mulx 40(%1), %%r8, %%r14 ;" /* B[1]*B[0] */ "xorl %%r15d, %%r15d;"
|
256
|
+
"mulx 48(%1), %%r9, %%r10 ;" /* B[2]*B[0] */ "adcx %%r14, %%r9 ;"
|
257
|
+
"mulx 56(%1), %%rax, %%rcx ;" /* B[3]*B[0] */ "adcx %%rax, %%r10 ;"
|
258
|
+
"movq 56(%1), %%rdx ;" /* B[3] */
|
259
|
+
"mulx 40(%1), %%r11, %%r12 ;" /* B[1]*B[3] */ "adcx %%rcx, %%r11 ;"
|
260
|
+
"mulx 48(%1), %%rax, %%r13 ;" /* B[2]*B[3] */ "adcx %%rax, %%r12 ;"
|
261
|
+
"movq 40(%1), %%rdx ;" /* B[1] */ "adcx %%r15, %%r13 ;"
|
262
|
+
"mulx 48(%1), %%rax, %%rcx ;" /* B[2]*B[1] */ "movq $0, %%r14 ;"
|
263
|
+
/*******************************************/ "adcx %%r15, %%r14 ;"
|
264
|
+
|
265
|
+
"xorl %%r15d, %%r15d;"
|
266
|
+
"adox %%rax, %%r10 ;" "adcx %%r8, %%r8 ;"
|
267
|
+
"adox %%rcx, %%r11 ;" "adcx %%r9, %%r9 ;"
|
268
|
+
"adox %%r15, %%r12 ;" "adcx %%r10, %%r10 ;"
|
269
|
+
"adox %%r15, %%r13 ;" "adcx %%r11, %%r11 ;"
|
270
|
+
"adox %%r15, %%r14 ;" "adcx %%r12, %%r12 ;"
|
271
|
+
"adcx %%r13, %%r13 ;"
|
272
|
+
"adcx %%r14, %%r14 ;"
|
273
|
+
|
274
|
+
"movq 32(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* B[0]^2 */
|
275
|
+
/********************/ "movq %%rax, 64(%0) ;"
|
276
|
+
"addq %%rcx, %%r8 ;" "movq %%r8, 72(%0) ;"
|
277
|
+
"movq 40(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* B[1]^2 */
|
278
|
+
"adcq %%rax, %%r9 ;" "movq %%r9, 80(%0) ;"
|
279
|
+
"adcq %%rcx, %%r10 ;" "movq %%r10, 88(%0) ;"
|
280
|
+
"movq 48(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* B[2]^2 */
|
281
|
+
"adcq %%rax, %%r11 ;" "movq %%r11, 96(%0) ;"
|
282
|
+
"adcq %%rcx, %%r12 ;" "movq %%r12, 104(%0) ;"
|
283
|
+
"movq 56(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* B[3]^2 */
|
284
|
+
"adcq %%rax, %%r13 ;" "movq %%r13, 112(%0) ;"
|
285
|
+
"adcq %%rcx, %%r14 ;" "movq %%r14, 120(%0) ;"
|
286
|
+
:
|
287
|
+
: "r" (c), "r" (a)
|
288
|
+
: "memory", "cc", "%rax", "%rcx", "%rdx",
|
289
|
+
"%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"
|
290
|
+
);
|
291
|
+
#else /* Without ADX */
|
292
|
+
__asm__ __volatile__(
|
293
|
+
"movq 8(%1), %%rdx ;" /* A[1] */
|
294
|
+
"mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */
|
295
|
+
"mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
|
296
|
+
"mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
|
297
|
+
|
298
|
+
"movq 16(%1), %%rdx ;" /* A[2] */
|
299
|
+
"mulx 24(%1), %%r12, %%r13 ;" /* A[3]*A[2] */
|
300
|
+
"mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
|
301
|
+
|
302
|
+
"addq %%rax, %%r9 ;"
|
303
|
+
"adcq %%rdx, %%r10 ;"
|
304
|
+
"adcq %%rcx, %%r11 ;"
|
305
|
+
"adcq %%r14, %%r12 ;"
|
306
|
+
"adcq $0, %%r13 ;"
|
307
|
+
"movq $0, %%r14 ;"
|
308
|
+
"adcq $0, %%r14 ;"
|
309
|
+
|
310
|
+
"movq (%1), %%rdx ;" /* A[0] */
|
311
|
+
"mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
|
312
|
+
|
313
|
+
"addq %%rax, %%r10 ;"
|
314
|
+
"adcq %%rcx, %%r11 ;"
|
315
|
+
"adcq $0, %%r12 ;"
|
316
|
+
"adcq $0, %%r13 ;"
|
317
|
+
"adcq $0, %%r14 ;"
|
318
|
+
|
319
|
+
"shldq $1, %%r13, %%r14 ;"
|
320
|
+
"shldq $1, %%r12, %%r13 ;"
|
321
|
+
"shldq $1, %%r11, %%r12 ;"
|
322
|
+
"shldq $1, %%r10, %%r11 ;"
|
323
|
+
"shldq $1, %%r9, %%r10 ;"
|
324
|
+
"shldq $1, %%r8, %%r9 ;"
|
325
|
+
"shlq $1, %%r8 ;"
|
326
|
+
|
327
|
+
/********************/ "mulx %%rdx, %%rax, %%rcx ; " /* A[0]^2 */
|
328
|
+
/********************/ "movq %%rax, 0(%0) ;"
|
329
|
+
"addq %%rcx, %%r8 ;" "movq %%r8, 8(%0) ;"
|
330
|
+
"movq 8(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ; " /* A[1]^2 */
|
331
|
+
"adcq %%rax, %%r9 ;" "movq %%r9, 16(%0) ;"
|
332
|
+
"adcq %%rcx, %%r10 ;" "movq %%r10, 24(%0) ;"
|
333
|
+
"movq 16(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ; " /* A[2]^2 */
|
334
|
+
"adcq %%rax, %%r11 ;" "movq %%r11, 32(%0) ;"
|
335
|
+
"adcq %%rcx, %%r12 ;" "movq %%r12, 40(%0) ;"
|
336
|
+
"movq 24(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ; " /* A[3]^2 */
|
337
|
+
"adcq %%rax, %%r13 ;" "movq %%r13, 48(%0) ;"
|
338
|
+
"adcq %%rcx, %%r14 ;" "movq %%r14, 56(%0) ;"
|
339
|
+
|
340
|
+
"movq 40(%1), %%rdx ;" /* B[1] */
|
341
|
+
"mulx 32(%1), %%r8, %%r9 ;" /* B[0]*B[1] */
|
342
|
+
"mulx 48(%1), %%r10, %%r11 ;" /* B[2]*B[1] */
|
343
|
+
"mulx 56(%1), %%rcx, %%r14 ;" /* B[3]*B[1] */
|
344
|
+
|
345
|
+
"movq 48(%1), %%rdx ;" /* B[2] */
|
346
|
+
"mulx 56(%1), %%r12, %%r13 ;" /* B[3]*B[2] */
|
347
|
+
"mulx 32(%1), %%rax, %%rdx ;" /* B[0]*B[2] */
|
348
|
+
|
349
|
+
"addq %%rax, %%r9 ;"
|
350
|
+
"adcq %%rdx, %%r10 ;"
|
351
|
+
"adcq %%rcx, %%r11 ;"
|
352
|
+
"adcq %%r14, %%r12 ;"
|
353
|
+
"adcq $0, %%r13 ;"
|
354
|
+
"movq $0, %%r14 ;"
|
355
|
+
"adcq $0, %%r14 ;"
|
356
|
+
|
357
|
+
"movq 32(%1), %%rdx ;" /* B[0] */
|
358
|
+
"mulx 56(%1), %%rax, %%rcx ;" /* B[0]*B[3] */
|
359
|
+
|
360
|
+
"addq %%rax, %%r10 ;"
|
361
|
+
"adcq %%rcx, %%r11 ;"
|
362
|
+
"adcq $0, %%r12 ;"
|
363
|
+
"adcq $0, %%r13 ;"
|
364
|
+
"adcq $0, %%r14 ;"
|
365
|
+
|
366
|
+
"shldq $1, %%r13, %%r14 ;"
|
367
|
+
"shldq $1, %%r12, %%r13 ;"
|
368
|
+
"shldq $1, %%r11, %%r12 ;"
|
369
|
+
"shldq $1, %%r10, %%r11 ;"
|
370
|
+
"shldq $1, %%r9, %%r10 ;"
|
371
|
+
"shldq $1, %%r8, %%r9 ;"
|
372
|
+
"shlq $1, %%r8 ;"
|
373
|
+
|
374
|
+
/********************/ "mulx %%rdx, %%rax, %%rcx ; " /* B[0]^2 */
|
375
|
+
/********************/ "movq %%rax, 64(%0) ;"
|
376
|
+
"addq %%rcx, %%r8 ;" "movq %%r8, 72(%0) ;"
|
377
|
+
"movq 40(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ; " /* B[1]^2 */
|
378
|
+
"adcq %%rax, %%r9 ;" "movq %%r9, 80(%0) ;"
|
379
|
+
"adcq %%rcx, %%r10 ;" "movq %%r10, 88(%0) ;"
|
380
|
+
"movq 48(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ; " /* B[2]^2 */
|
381
|
+
"adcq %%rax, %%r11 ;" "movq %%r11, 96(%0) ;"
|
382
|
+
"adcq %%rcx, %%r12 ;" "movq %%r12, 104(%0) ;"
|
383
|
+
"movq 56(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ; " /* B[3]^2 */
|
384
|
+
"adcq %%rax, %%r13 ;" "movq %%r13, 112(%0) ;"
|
385
|
+
"adcq %%rcx, %%r14 ;" "movq %%r14, 120(%0) ;"
|
386
|
+
:
|
387
|
+
: "r" (c), "r" (a)
|
388
|
+
: "memory", "cc", "%rax", "%rcx", "%rdx",
|
389
|
+
"%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
|
390
|
+
);
|
391
|
+
#endif
|
349
392
|
#else /* Without BMI2 */
|
350
|
-
|
351
|
-
|
352
|
-
|
393
|
+
/**
|
394
|
+
* TODO: Multiplications using MULQ instruction.
|
395
|
+
**/
|
353
396
|
#endif
|
354
397
|
}
|
355
398
|
|
@@ -358,467 +401,543 @@ void sqr2_256x256_integer_x64(uint64_t *const c, uint64_t *const a)
|
|
358
401
|
* @param c
|
359
402
|
* @param a
|
360
403
|
*/
|
361
|
-
void red_EltFp25519_2w_x64(uint64_t *const c, uint64_t *const a)
|
362
|
-
{
|
404
|
+
void red_EltFp25519_2w_x64(uint64_t *const c, uint64_t *const a) {
|
363
405
|
#ifdef __BMI2__
|
364
406
|
#ifdef __ADX__
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
407
|
+
__asm__ __volatile__(
|
408
|
+
"movl $38, %%edx; " /* 2*c = 38 = 2^256 */
|
409
|
+
"mulx 32(%1), %%r8, %%r10; " /* c*C[4] */ "xorl %%ebx, %%ebx ;" "adox (%1), %%r8 ;"
|
410
|
+
"mulx 40(%1), %%r9, %%r11; " /* c*C[5] */ "adcx %%r10, %%r9 ;" "adox 8(%1), %%r9 ;"
|
411
|
+
"mulx 48(%1), %%r10, %%rax; " /* c*C[6] */ "adcx %%r11, %%r10 ;" "adox 16(%1), %%r10 ;"
|
412
|
+
"mulx 56(%1), %%r11, %%rcx; " /* c*C[7] */ "adcx %%rax, %%r11 ;" "adox 24(%1), %%r11 ;"
|
413
|
+
/****************************************/ "adcx %%rbx, %%rcx ;" "adox %%rbx, %%rcx ;"
|
414
|
+
"clc ;"
|
415
|
+
"mulx %%rcx, %%rax, %%rcx ; " /* c*C[4] */
|
416
|
+
"adcx %%rax, %%r8 ;"
|
417
|
+
"adcx %%rcx, %%r9 ;" "movq %%r9, 8(%0) ;"
|
418
|
+
"adcx %%rbx, %%r10 ;" "movq %%r10, 16(%0) ;"
|
419
|
+
"adcx %%rbx, %%r11 ;" "movq %%r11, 24(%0) ;"
|
420
|
+
"mov $0, %%ecx ;"
|
421
|
+
"cmovc %%edx, %%ecx ;"
|
422
|
+
"addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
|
423
|
+
|
424
|
+
"mulx 96(%1), %%r8, %%r10; " /* c*C[4] */ "xorl %%ebx, %%ebx ;" "adox 64(%1), %%r8 ;"
|
425
|
+
"mulx 104(%1), %%r9, %%r11; " /* c*C[5] */ "adcx %%r10, %%r9 ;" "adox 72(%1), %%r9 ;"
|
426
|
+
"mulx 112(%1), %%r10, %%rax; " /* c*C[6] */ "adcx %%r11, %%r10 ;" "adox 80(%1), %%r10 ;"
|
427
|
+
"mulx 120(%1), %%r11, %%rcx; " /* c*C[7] */ "adcx %%rax, %%r11 ;" "adox 88(%1), %%r11 ;"
|
428
|
+
/*****************************************/ "adcx %%rbx, %%rcx ;" "adox %%rbx, %%rcx ;"
|
429
|
+
"clc ;"
|
430
|
+
"mulx %%rcx, %%rax, %%rcx ; " /* c*C[4] */
|
431
|
+
"adcx %%rax, %%r8 ;"
|
432
|
+
"adcx %%rcx, %%r9 ;" "movq %%r9, 40(%0) ;"
|
433
|
+
"adcx %%rbx, %%r10 ;" "movq %%r10, 48(%0) ;"
|
434
|
+
"adcx %%rbx, %%r11 ;" "movq %%r11, 56(%0) ;"
|
435
|
+
"mov $0, %%ecx ;"
|
436
|
+
"cmovc %%edx, %%ecx ;"
|
437
|
+
"addq %%rcx, %%r8 ;" "movq %%r8, 32(%0) ;"
|
438
|
+
:
|
439
|
+
: "r" (c), "r" (a)
|
440
|
+
: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
|
441
|
+
);
|
388
442
|
#else
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
443
|
+
__asm__ __volatile__(
|
444
|
+
"movl $38, %%edx ; " /* 2*c = 38 = 2^256 */
|
445
|
+
"mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
|
446
|
+
"mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ "addq %%r10, %%r9 ;"
|
447
|
+
"mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ "adcq %%r11, %%r10 ;"
|
448
|
+
"mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ "adcq %%rax, %%r11 ;"
|
449
|
+
/****************************************/ "adcq $0, %%rcx ;"
|
450
|
+
"addq (%1), %%r8 ;"
|
451
|
+
"adcq 8(%1), %%r9 ;"
|
452
|
+
"adcq 16(%1), %%r10 ;"
|
453
|
+
"adcq 24(%1), %%r11 ;"
|
454
|
+
"adcq $0, %%rcx ;"
|
455
|
+
"mulx %%rcx, %%rax, %%rcx ;" /* c*C[4] */
|
456
|
+
"addq %%rax, %%r8 ;"
|
457
|
+
"adcq %%rcx, %%r9 ;" "movq %%r9, 8(%0) ;"
|
458
|
+
"adcq $0, %%r10 ;" "movq %%r10, 16(%0) ;"
|
459
|
+
"adcq $0, %%r11 ;" "movq %%r11, 24(%0) ;"
|
460
|
+
"mov $0, %%ecx ;"
|
461
|
+
"cmovc %%edx, %%ecx ;"
|
462
|
+
"addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
|
463
|
+
|
464
|
+
"mulx 96(%1), %%r8, %%r10 ;" /* c*C[4] */
|
465
|
+
"mulx 104(%1), %%r9, %%r11 ;" /* c*C[5] */ "addq %%r10, %%r9 ;"
|
466
|
+
"mulx 112(%1), %%r10, %%rax ;" /* c*C[6] */ "adcq %%r11, %%r10 ;"
|
467
|
+
"mulx 120(%1), %%r11, %%rcx ;" /* c*C[7] */ "adcq %%rax, %%r11 ;"
|
468
|
+
/*****************************************/ "adcq $0, %%rcx ;"
|
469
|
+
"addq 64(%1), %%r8 ;"
|
470
|
+
"adcq 72(%1), %%r9 ;"
|
471
|
+
"adcq 80(%1), %%r10 ;"
|
472
|
+
"adcq 88(%1), %%r11 ;"
|
473
|
+
"adcq $0, %%rcx ;"
|
474
|
+
"mulx %%rcx, %%rax, %%rcx ;" /* c*C[4] */
|
475
|
+
"addq %%rax, %%r8 ;"
|
476
|
+
"adcq %%rcx, %%r9 ;" "movq %%r9, 40(%0) ;"
|
477
|
+
"adcq $0, %%r10 ;" "movq %%r10, 48(%0) ;"
|
478
|
+
"adcq $0, %%r11 ;" "movq %%r11, 56(%0) ;"
|
479
|
+
"mov $0, %%ecx ;"
|
480
|
+
"cmovc %%edx, %%ecx ;"
|
481
|
+
"addq %%rcx, %%r8 ;" "movq %%r8, 32(%0) ;"
|
482
|
+
:
|
483
|
+
: "r" (c), "r" (a)
|
484
|
+
: "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
|
485
|
+
);
|
427
486
|
#endif
|
428
487
|
#else /* Without BMI2 */
|
429
|
-
|
488
|
+
/* [TODO] */
|
430
489
|
#endif
|
431
490
|
}
|
432
491
|
|
433
|
-
void mul_256x256_integer_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b)
|
434
|
-
{
|
492
|
+
void mul_256x256_integer_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b) {
|
435
493
|
#ifdef __BMI2__
|
436
494
|
#ifdef __ADX__
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
);
|
495
|
+
__asm__ __volatile__(
|
496
|
+
"movq (%1), %%rdx; " /* A[0] */
|
497
|
+
"mulx (%2), %%r8, %%r9; " /* A[0]*B[0] */ "xorl %%r10d, %%r10d ;" "movq %%r8, (%0) ;"
|
498
|
+
"mulx 8(%2), %%r10, %%r11; " /* A[0]*B[1] */ "adox %%r9, %%r10 ;" "movq %%r10, 8(%0) ;"
|
499
|
+
"mulx 16(%2), %%r12, %%r13; " /* A[0]*B[2] */ "adox %%r11, %%r12 ;"
|
500
|
+
"mulx 24(%2), %%r14, %%rdx; " /* A[0]*B[3] */ "adox %%r13, %%r14 ;" "movq $0, %%rax ;"
|
501
|
+
/*******************************************/ "adox %%rdx, %%rax ;"
|
502
|
+
|
503
|
+
"movq 8(%1), %%rdx; " /* A[1] */
|
504
|
+
"mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ "xorl %%r10d, %%r10d ;" "adcx 8(%0), %%r8 ;" "movq %%r8, 8(%0) ;"
|
505
|
+
"mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ "adox %%r9, %%r10 ;" "adcx %%r12, %%r10 ;" "movq %%r10, 16(%0) ;"
|
506
|
+
"mulx 16(%2), %%r12, %%r13; " /* A[1]*B[2] */ "adox %%r11, %%r12 ;" "adcx %%r14, %%r12 ;" "movq $0, %%r8 ;"
|
507
|
+
"mulx 24(%2), %%r14, %%rdx; " /* A[1]*B[3] */ "adox %%r13, %%r14 ;" "adcx %%rax, %%r14 ;" "movq $0, %%rax ;"
|
508
|
+
/*******************************************/ "adox %%rdx, %%rax ;" "adcx %%r8, %%rax ;"
|
509
|
+
|
510
|
+
"movq 16(%1), %%rdx; " /* A[2] */
|
511
|
+
"mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ "xorl %%r10d, %%r10d ;" "adcx 16(%0), %%r8 ;" "movq %%r8, 16(%0) ;"
|
512
|
+
"mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ "adox %%r9, %%r10 ;" "adcx %%r12, %%r10 ;" "movq %%r10, 24(%0) ;"
|
513
|
+
"mulx 16(%2), %%r12, %%r13; " /* A[2]*B[2] */ "adox %%r11, %%r12 ;" "adcx %%r14, %%r12 ;" "movq $0, %%r8 ;"
|
514
|
+
"mulx 24(%2), %%r14, %%rdx; " /* A[2]*B[3] */ "adox %%r13, %%r14 ;" "adcx %%rax, %%r14 ;" "movq $0, %%rax ;"
|
515
|
+
/*******************************************/ "adox %%rdx, %%rax ;" "adcx %%r8, %%rax ;"
|
516
|
+
|
517
|
+
"movq 24(%1), %%rdx; " /* A[3] */
|
518
|
+
"mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ "xorl %%r10d, %%r10d ;" "adcx 24(%0), %%r8 ;" "movq %%r8, 24(%0) ;"
|
519
|
+
"mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ "adox %%r9, %%r10 ;" "adcx %%r12, %%r10 ;" "movq %%r10, 32(%0) ;"
|
520
|
+
"mulx 16(%2), %%r12, %%r13; " /* A[3]*B[2] */ "adox %%r11, %%r12 ;" "adcx %%r14, %%r12 ;" "movq %%r12, 40(%0) ;" "movq $0, %%r8 ;"
|
521
|
+
"mulx 24(%2), %%r14, %%rdx; " /* A[3]*B[3] */ "adox %%r13, %%r14 ;" "adcx %%rax, %%r14 ;" "movq %%r14, 48(%0) ;" "movq $0, %%rax ;"
|
522
|
+
/*******************************************/ "adox %%rdx, %%rax ;" "adcx %%r8, %%rax ;" "movq %%rax, 56(%0) ;"
|
523
|
+
:
|
524
|
+
: "r" (c), "r" (a), "r" (b)
|
525
|
+
: "memory", "cc", "%rax", "%rdx", "%r8",
|
526
|
+
"%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
|
527
|
+
);
|
471
528
|
#else
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8",
|
521
|
-
"%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
|
522
|
-
);
|
529
|
+
__asm__ __volatile__(
|
530
|
+
"movq (%1), %%rdx; " /* A[0] */
|
531
|
+
"mulx (%2), %%r8, %%r12; " /* A[0]*B[0] */ "movq %%r8, (%0) ;"
|
532
|
+
"mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ "addq %%r10, %%r12 ;"
|
533
|
+
"mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ "adcq %%r8, %%rax ;"
|
534
|
+
"mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ "adcq %%r10, %%rbx ;"
|
535
|
+
/*******************************************/ "adcq $0, %%rcx ;"
|
536
|
+
|
537
|
+
"movq 8(%1), %%rdx; " /* A[1] */
|
538
|
+
"mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ "addq %%r12, %%r8 ;" "movq %%r8, 8(%0) ;"
|
539
|
+
"mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ "adcq %%r10, %%r9 ;"
|
540
|
+
"mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ "adcq %%r8, %%r11 ;"
|
541
|
+
"mulx 24(%2), %%r10, %%r12; " /* A[1]*B[3] */ "adcq %%r10, %%r13 ;"
|
542
|
+
/*******************************************/ "adcq $0, %%r12 ;"
|
543
|
+
|
544
|
+
"addq %%r9, %%rax ;"
|
545
|
+
"adcq %%r11, %%rbx ;"
|
546
|
+
"adcq %%r13, %%rcx ;"
|
547
|
+
"adcq $0, %%r12 ;"
|
548
|
+
|
549
|
+
"movq 16(%1), %%rdx; " /* A[2] */
|
550
|
+
"mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ "addq %%rax, %%r8 ;" "movq %%r8, 16(%0) ;"
|
551
|
+
"mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ "adcq %%r10, %%r9 ;"
|
552
|
+
"mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ "adcq %%r8, %%r11 ;"
|
553
|
+
"mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ "adcq %%r10, %%r13 ;"
|
554
|
+
/*******************************************/ "adcq $0, %%rax ;"
|
555
|
+
|
556
|
+
"addq %%r9, %%rbx ;"
|
557
|
+
"adcq %%r11, %%rcx ;"
|
558
|
+
"adcq %%r13, %%r12 ;"
|
559
|
+
"adcq $0, %%rax ;"
|
560
|
+
|
561
|
+
"movq 24(%1), %%rdx; " /* A[3] */
|
562
|
+
"mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ "addq %%rbx, %%r8 ;" "movq %%r8, 24(%0) ;"
|
563
|
+
"mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ "adcq %%r10, %%r9 ;"
|
564
|
+
"mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ "adcq %%r8, %%r11 ;"
|
565
|
+
"mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ "adcq %%r10, %%r13 ;"
|
566
|
+
/*******************************************/ "adcq $0, %%rbx ;"
|
567
|
+
|
568
|
+
"addq %%r9, %%rcx ;" "movq %%rcx, 32(%0) ;"
|
569
|
+
"adcq %%r11, %%r12 ;" "movq %%r12, 40(%0) ;"
|
570
|
+
"adcq %%r13, %%rax ;" "movq %%rax, 48(%0) ;"
|
571
|
+
"adcq $0, %%rbx ;" "movq %%rbx, 56(%0) ;"
|
572
|
+
:
|
573
|
+
: "r" (c), "r" (a), "r" (b)
|
574
|
+
: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx",
|
575
|
+
"%r8", "%r9", "%r10", "%r11", "%r12", "%r13"
|
576
|
+
);
|
523
577
|
#endif
|
524
578
|
#else /* Without BMI2 */
|
525
|
-
|
526
|
-
|
527
|
-
|
579
|
+
/**
|
580
|
+
* TODO: Multiplications using MULQ instruction.
|
581
|
+
**/
|
528
582
|
#endif
|
529
583
|
}
|
530
584
|
|
531
|
-
void sqr_256x256_integer_x64(uint64_t *const c, uint64_t *const a)
|
532
|
-
{
|
585
|
+
void sqr_256x256_integer_x64(uint64_t *const c, uint64_t *const a) {
|
533
586
|
#ifdef __BMI2__
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
|
569
|
-
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
|
574
|
-
|
575
|
-
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
|
590
|
-
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
587
|
+
#ifdef __ADX__
|
588
|
+
__asm__ __volatile__(
|
589
|
+
"movq (%1), %%rdx ;" /* A[0] */
|
590
|
+
"mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */ "xorl %%r15d, %%r15d;"
|
591
|
+
"mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */ "adcx %%r14, %%r9 ;"
|
592
|
+
"mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */ "adcx %%rax, %%r10 ;"
|
593
|
+
"movq 24(%1), %%rdx ;" /* A[3] */
|
594
|
+
"mulx 8(%1), %%r11, %%r12 ;" /* A[1]*A[3] */ "adcx %%rcx, %%r11 ;"
|
595
|
+
"mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */ "adcx %%rax, %%r12 ;"
|
596
|
+
"movq 8(%1), %%rdx ;" /* A[1] */ "adcx %%r15, %%r13 ;"
|
597
|
+
"mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */ "movq $0, %%r14 ;"
|
598
|
+
/*******************************************/ "adcx %%r15, %%r14 ;"
|
599
|
+
|
600
|
+
"xorl %%r15d, %%r15d;"
|
601
|
+
"adox %%rax, %%r10 ;" "adcx %%r8, %%r8 ;"
|
602
|
+
"adox %%rcx, %%r11 ;" "adcx %%r9, %%r9 ;"
|
603
|
+
"adox %%r15, %%r12 ;" "adcx %%r10, %%r10 ;"
|
604
|
+
"adox %%r15, %%r13 ;" "adcx %%r11, %%r11 ;"
|
605
|
+
"adox %%r15, %%r14 ;" "adcx %%r12, %%r12 ;"
|
606
|
+
"adcx %%r13, %%r13 ;"
|
607
|
+
"adcx %%r14, %%r14 ;"
|
608
|
+
|
609
|
+
"movq (%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
|
610
|
+
/********************/ "movq %%rax, 0(%0) ;"
|
611
|
+
"addq %%rcx, %%r8 ;" "movq %%r8, 8(%0) ;"
|
612
|
+
"movq 8(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
|
613
|
+
"adcq %%rax, %%r9 ;" "movq %%r9, 16(%0) ;"
|
614
|
+
"adcq %%rcx, %%r10 ;" "movq %%r10, 24(%0) ;"
|
615
|
+
"movq 16(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
|
616
|
+
"adcq %%rax, %%r11 ;" "movq %%r11, 32(%0) ;"
|
617
|
+
"adcq %%rcx, %%r12 ;" "movq %%r12, 40(%0) ;"
|
618
|
+
"movq 24(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
|
619
|
+
"adcq %%rax, %%r13 ;" "movq %%r13, 48(%0) ;"
|
620
|
+
"adcq %%rcx, %%r14 ;" "movq %%r14, 56(%0) ;"
|
621
|
+
:
|
622
|
+
: "r" (c), "r" (a)
|
623
|
+
: "memory", "cc", "%rax", "%rcx", "%rdx",
|
624
|
+
"%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"
|
625
|
+
);
|
626
|
+
#else /* Without ADX */
|
627
|
+
__asm__ __volatile__(
|
628
|
+
"movq 8(%1), %%rdx ;" /* A[1] */
|
629
|
+
"mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */
|
630
|
+
"mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
|
631
|
+
"mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
|
632
|
+
|
633
|
+
"movq 16(%1), %%rdx ;" /* A[2] */
|
634
|
+
"mulx 24(%1), %%r12, %%r13 ;" /* A[3]*A[2] */
|
635
|
+
"mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
|
636
|
+
|
637
|
+
"addq %%rax, %%r9 ;"
|
638
|
+
"adcq %%rdx, %%r10 ;"
|
639
|
+
"adcq %%rcx, %%r11 ;"
|
640
|
+
"adcq %%r14, %%r12 ;"
|
641
|
+
"adcq $0, %%r13 ;"
|
642
|
+
"movq $0, %%r14 ;"
|
643
|
+
"adcq $0, %%r14 ;"
|
644
|
+
|
645
|
+
"movq (%1), %%rdx ;" /* A[0] */
|
646
|
+
"mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
|
647
|
+
|
648
|
+
"addq %%rax, %%r10 ;"
|
649
|
+
"adcq %%rcx, %%r11 ;"
|
650
|
+
"adcq $0, %%r12 ;"
|
651
|
+
"adcq $0, %%r13 ;"
|
652
|
+
"adcq $0, %%r14 ;"
|
653
|
+
|
654
|
+
"shldq $1, %%r13, %%r14 ;"
|
655
|
+
"shldq $1, %%r12, %%r13 ;"
|
656
|
+
"shldq $1, %%r11, %%r12 ;"
|
657
|
+
"shldq $1, %%r10, %%r11 ;"
|
658
|
+
"shldq $1, %%r9, %%r10 ;"
|
659
|
+
"shldq $1, %%r8, %%r9 ;"
|
660
|
+
"shlq $1, %%r8 ;"
|
661
|
+
|
662
|
+
/********************/ "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
|
663
|
+
/********************/ "movq %%rax, 0(%0) ;"
|
664
|
+
"addq %%rcx, %%r8 ;" "movq %%r8, 8(%0) ;"
|
665
|
+
"movq 8(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
|
666
|
+
"adcq %%rax, %%r9 ;" "movq %%r9, 16(%0) ;"
|
667
|
+
"adcq %%rcx, %%r10 ;" "movq %%r10, 24(%0) ;"
|
668
|
+
"movq 16(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
|
669
|
+
"adcq %%rax, %%r11 ;" "movq %%r11, 32(%0) ;"
|
670
|
+
"adcq %%rcx, %%r12 ;" "movq %%r12, 40(%0) ;"
|
671
|
+
"movq 24(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
|
672
|
+
"adcq %%rax, %%r13 ;" "movq %%r13, 48(%0) ;"
|
673
|
+
"adcq %%rcx, %%r14 ;" "movq %%r14, 56(%0) ;"
|
674
|
+
:
|
675
|
+
: "r" (c), "r" (a)
|
676
|
+
: "memory", "cc", "%rax", "%rcx", "%rdx",
|
677
|
+
"%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
|
678
|
+
);
|
679
|
+
#endif
|
600
680
|
#else /* Without BMI2 */
|
601
|
-
|
602
|
-
|
603
|
-
|
681
|
+
/**
|
682
|
+
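* TODO: Implement the squaring using the MULQ instruction. Until then, this branch is empty and the function leaves c unwritten when BMI2 is unavailable.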
|
683
|
+
**/
|
604
684
|
#endif
|
605
685
|
}
|
606
686
|
|
607
|
-
void red_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a)
|
608
|
-
{
|
687
|
+
void red_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a) {
|
609
688
|
#ifdef __BMI2__
|
610
689
|
#ifdef __ADX__
|
611
|
-
|
612
|
-
|
613
|
-
|
614
|
-
|
615
|
-
|
616
|
-
|
617
|
-
|
618
|
-
|
619
|
-
|
620
|
-
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
690
|
+
__asm__ __volatile__(
|
691
|
+
"movl $38, %%edx ;" /* 2*c = 38 = 2^256 mod 2^255-19 */
|
692
|
+
"mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */ "xorl %%ebx, %%ebx ;" "adox (%1), %%r8 ;"
|
693
|
+
"mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ "adcx %%r10, %%r9 ;" "adox 8(%1), %%r9 ;"
|
694
|
+
"mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ "adcx %%r11, %%r10 ;" "adox 16(%1), %%r10 ;"
|
695
|
+
"mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ "adcx %%rax, %%r11 ;" "adox 24(%1), %%r11 ;"
|
696
|
+
/****************************************/ "adcx %%rbx, %%rcx ;" "adox %%rbx, %%rcx ;"
|
697
|
+
"clc ;"
|
698
|
+
"mulx %%rcx, %%rax, %%rcx ;" /* c * (overflow limb left in rcx by the first pass) */
|
699
|
+
"adcx %%rax, %%r8 ;"
|
700
|
+
"adcx %%rcx, %%r9 ;" "movq %%r9, 8(%0) ;"
|
701
|
+
"adcx %%rbx, %%r10 ;" "movq %%r10, 16(%0) ;"
|
702
|
+
"adcx %%rbx, %%r11 ;" "movq %%r11, 24(%0) ;"
|
703
|
+
"mov $0, %%ecx ;"
|
704
|
+
"cmovc %%edx, %%ecx ;"
|
705
|
+
"addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
|
706
|
+
:
|
707
|
+
: "r" (c), "r" (a)
|
708
|
+
: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
|
709
|
+
);
|
625
710
|
#else
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
711
|
+
__asm__ __volatile__(
|
712
|
+
"movl $38, %%edx ;" /* 2*c = 38 = 2^256 mod 2^255-19 */
|
713
|
+
"mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
|
714
|
+
"mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ "addq %%r10, %%r9 ;"
|
715
|
+
"mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ "adcq %%r11, %%r10 ;"
|
716
|
+
"mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ "adcq %%rax, %%r11 ;"
|
717
|
+
/****************************************/ "adcq $0, %%rcx ;"
|
718
|
+
"addq (%1), %%r8 ;"
|
719
|
+
"adcq 8(%1), %%r9 ;"
|
720
|
+
"adcq 16(%1), %%r10 ;"
|
721
|
+
"adcq 24(%1), %%r11 ;"
|
722
|
+
"adcq $0, %%rcx ;"
|
723
|
+
"mulx %%rcx, %%rax, %%rcx ;" /* c * (overflow limb left in rcx by the first pass) */
|
724
|
+
"addq %%rax, %%r8 ;"
|
725
|
+
"adcq %%rcx, %%r9 ;" "movq %%r9, 8(%0) ;"
|
726
|
+
"adcq $0, %%r10 ;" "movq %%r10, 16(%0) ;"
|
727
|
+
"adcq $0, %%r11 ;" "movq %%r11, 24(%0) ;"
|
728
|
+
"mov $0, %%ecx ;"
|
729
|
+
"cmovc %%edx, %%ecx ;"
|
730
|
+
"addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
|
731
|
+
:
|
732
|
+
: "r" (c), "r" (a)
|
733
|
+
: "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
|
734
|
+
);
|
647
735
|
#endif
|
648
736
|
#else /* Without BMI2 */
|
649
|
-
|
650
|
-
|
651
|
-
|
737
|
+
/**
|
738
|
+
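* TODO: Implement the reduction using the MULQ instruction. Until then, this branch is empty and the function leaves c unwritten when BMI2 is unavailable.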
|
739
|
+
**/
|
652
740
|
#endif
|
653
741
|
}
|
654
742
|
|
655
|
-
inline void add_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b)
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
|
677
|
-
: "r" (c), "r" (a), "r" (b)
|
678
|
-
: "memory","cc", "%rax", "%rcx", "%r8", "%r9"
|
679
|
-
);
|
743
|
+
inline void add_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b) {
|
744
|
+
#ifdef __ADX__
|
745
|
+
__asm__ __volatile__(
|
746
|
+
"mov $38, %%eax ;"
|
747
|
+
"xorl %%ecx, %%ecx ;"
|
748
|
+
"movq (%2), %%r8 ;" "adcx (%1), %%r8 ;"
|
749
|
+
"movq 8(%2), %%r9 ;" "adcx 8(%1), %%r9 ;"
|
750
|
+
"movq 16(%2), %%r10 ;" "adcx 16(%1), %%r10 ;"
|
751
|
+
"movq 24(%2), %%r11 ;" "adcx 24(%1), %%r11 ;"
|
752
|
+
"cmovc %%eax, %%ecx ;"
|
753
|
+
"xorl %%eax, %%eax ;"
|
754
|
+
"adcx %%rcx, %%r8 ;"
|
755
|
+
"adcx %%rax, %%r9 ;" "movq %%r9, 8(%0) ;"
|
756
|
+
"adcx %%rax, %%r10 ;" "movq %%r10, 16(%0) ;"
|
757
|
+
"adcx %%rax, %%r11 ;" "movq %%r11, 24(%0) ;"
|
758
|
+
"mov $38, %%ecx ;"
|
759
|
+
"cmovc %%ecx, %%eax ;"
|
760
|
+
"addq %%rax, %%r8 ;" "movq %%r8, (%0) ;"
|
761
|
+
:
|
762
|
+
: "r" (c), "r" (a), "r" (b)
|
763
|
+
: "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"
|
764
|
+
);
|
680
765
|
#else
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
689
|
-
|
690
|
-
|
691
|
-
|
692
|
-
|
693
|
-
|
694
|
-
|
695
|
-
|
696
|
-
|
697
|
-
|
698
|
-
|
699
|
-
|
700
|
-
: "memory","cc", "%rax", "%rcx", "%r8", "%r9"
|
701
|
-
);
|
766
|
+
__asm__ __volatile__(
|
767
|
+
"mov $38, %%eax ;"
|
768
|
+
"movq (%2), %%r8 ;" "addq (%1), %%r8 ;"
|
769
|
+
"movq 8(%2), %%r9 ;" "adcq 8(%1), %%r9 ;"
|
770
|
+
"movq 16(%2), %%r10 ;" "adcq 16(%1), %%r10 ;"
|
771
|
+
"movq 24(%2), %%r11 ;" "adcq 24(%1), %%r11 ;"
|
772
|
+
"mov $0, %%ecx ;"
|
773
|
+
"cmovc %%eax, %%ecx ;"
|
774
|
+
"addq %%rcx, %%r8 ;"
|
775
|
+
"adcq $0, %%r9 ;" "movq %%r9, 8(%0) ;"
|
776
|
+
"adcq $0, %%r10 ;" "movq %%r10, 16(%0) ;"
|
777
|
+
"adcq $0, %%r11 ;" "movq %%r11, 24(%0) ;"
|
778
|
+
"mov $0, %%ecx ;"
|
779
|
+
"cmovc %%eax, %%ecx ;"
|
780
|
+
"addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
|
781
|
+
:
|
782
|
+
: "r" (c), "r" (a), "r" (b)
|
783
|
+
: "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"
|
784
|
+
);
|
702
785
|
#endif
|
703
786
|
}
|
704
787
|
|
705
|
-
inline void sub_EltFp25519_1w_x64(uint64_t *const
|
706
|
-
|
707
|
-
|
708
|
-
|
709
|
-
|
710
|
-
|
711
|
-
|
712
|
-
|
713
|
-
|
714
|
-
|
715
|
-
|
716
|
-
|
717
|
-
|
718
|
-
|
719
|
-
|
720
|
-
|
721
|
-
|
722
|
-
|
723
|
-
|
724
|
-
|
725
|
-
:
|
726
|
-
: "r" (c), "r" (a), "r" (b)
|
727
|
-
: "memory","cc", "%rax", "%rcx", "%r8", "%r9"
|
728
|
-
);
|
788
|
+
inline void sub_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b) {
|
789
|
+
__asm__ __volatile__(
|
790
|
+
"mov $38, %%eax ;"
|
791
|
+
"movq (%1), %%r8 ;" "subq (%2), %%r8 ;"
|
792
|
+
"movq 8(%1), %%r9 ;" "sbbq 8(%2), %%r9 ;"
|
793
|
+
"movq 16(%1), %%r10 ;" "sbbq 16(%2), %%r10 ;"
|
794
|
+
"movq 24(%1), %%r11 ;" "sbbq 24(%2), %%r11 ;"
|
795
|
+
"mov $0, %%ecx ;"
|
796
|
+
"cmovc %%eax, %%ecx ;"
|
797
|
+
"subq %%rcx, %%r8 ;"
|
798
|
+
"sbbq $0, %%r9 ;" "movq %%r9, 8(%0) ;"
|
799
|
+
"sbbq $0, %%r10 ;" "movq %%r10, 16(%0) ;"
|
800
|
+
"sbbq $0, %%r11 ;" "movq %%r11, 24(%0) ;"
|
801
|
+
"mov $0, %%ecx ;"
|
802
|
+
"cmovc %%eax, %%ecx ;"
|
803
|
+
"subq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
|
804
|
+
:
|
805
|
+
: "r" (c), "r" (a), "r" (b)
|
806
|
+
: "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"
|
807
|
+
);
|
729
808
|
}
|
730
809
|
|
731
|
-
|
732
|
-
|
810
|
+
/**
|
811
|
+
* Multiplication by a24 = (A+2)/4 = (486662+2)/4 = 121666
|
812
|
+
**/
|
813
|
+
inline void mul_a24_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a) {
|
733
814
|
#ifdef __BMI2__
|
734
|
-
|
735
|
-
|
736
|
-
|
737
|
-
|
738
|
-
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
|
745
|
-
|
746
|
-
|
747
|
-
|
748
|
-
|
749
|
-
|
750
|
-
|
751
|
-
|
752
|
-
|
753
|
-
|
754
|
-
|
755
|
-
: "r" (c), "r" (a), "r" (a24)
|
756
|
-
: "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
|
757
|
-
);
|
815
|
+
const uint64_t a24 = 121666;
|
816
|
+
__asm__ __volatile__(
|
817
|
+
"movq %2, %%rdx ;"
|
818
|
+
"mulx (%1), %%r8, %%r10 ;"
|
819
|
+
"mulx 8(%1), %%r9, %%r11 ;" "addq %%r10, %%r9 ;"
|
820
|
+
"mulx 16(%1), %%r10, %%rax ;" "adcq %%r11, %%r10 ;"
|
821
|
+
"mulx 24(%1), %%r11, %%rcx ;" "adcq %%rax, %%r11 ;"
|
822
|
+
/***************************/ "adcq $0, %%rcx ;"
|
823
|
+
"movl $38, %%edx ;" /* 2*c = 38 = 2^256 mod 2^255-19*/
|
824
|
+
"mulx %%rcx, %%rax, %%rcx ;"
|
825
|
+
"addq %%rax, %%r8 ;"
|
826
|
+
"adcq %%rcx, %%r9 ;" "movq %%r9, 8(%0) ;"
|
827
|
+
"adcq $0, %%r10 ;" "movq %%r10, 16(%0) ;"
|
828
|
+
"adcq $0, %%r11 ;" "movq %%r11, 24(%0) ;"
|
829
|
+
"mov $0, %%ecx ;"
|
830
|
+
"cmovc %%edx, %%ecx ;"
|
831
|
+
"addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
|
832
|
+
:
|
833
|
+
: "r" (c), "r" (a), "r" (a24)
|
834
|
+
: "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
|
835
|
+
);
|
758
836
|
#else /* Without BMI2 */
|
759
|
-
|
760
|
-
|
761
|
-
|
837
|
+
/**
|
838
|
+
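* TODO: Implement the multiplication by a24 using the MULQ instruction. Until then, this branch is empty and the function leaves c unwritten when BMI2 is unavailable.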
|
839
|
+
**/
|
762
840
|
#endif
|
763
841
|
}
|
764
842
|
|
765
|
-
void inv_EltFp25519_1w_x64(uint64_t *const
|
766
|
-
|
767
|
-
|
768
|
-
|
769
|
-
|
770
|
-
|
771
|
-
|
772
|
-
|
773
|
-
|
774
|
-
|
775
|
-
|
776
|
-
|
777
|
-
|
778
|
-
|
779
|
-
|
780
|
-
|
781
|
-
|
782
|
-
|
783
|
-
|
784
|
-
|
785
|
-
|
786
|
-
|
787
|
-
|
788
|
-
|
789
|
-
|
790
|
-
|
791
|
-
|
792
|
-
|
793
|
-
|
794
|
-
|
795
|
-
|
796
|
-
|
797
|
-
|
798
|
-
|
799
|
-
|
800
|
-
|
801
|
-
|
802
|
-
|
803
|
-
|
804
|
-
|
805
|
-
|
806
|
-
|
807
|
-
|
808
|
-
|
809
|
-
|
810
|
-
|
811
|
-
|
812
|
-
|
813
|
-
sqrn_EltFp25519_1w_x64(T[2],5);
|
814
|
-
mul_EltFp25519_1w_x64(T[1], T[1], T[2]);
|
843
|
+
void inv_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a) {
|
844
|
+
#define sqrn_EltFp25519_1w_x64(A, times)\
|
845
|
+
counter = times;\
|
846
|
+
while ( counter-- > 0) {\
|
847
|
+
sqr_EltFp25519_1w_x64(A);\
|
848
|
+
}
|
849
|
+
|
850
|
+
EltFp25519_1w_Buffer_x64 buffer_1w;
|
851
|
+
EltFp25519_1w_x64 x0, x1, x2;
|
852
|
+
uint64_t * T[5];
|
853
|
+
uint64_t counter;
|
854
|
+
|
855
|
+
T[0] = x0;
|
856
|
+
T[1] = c; /* x^(-1) */
|
857
|
+
T[2] = x1;
|
858
|
+
T[3] = x2;
|
859
|
+
T[4] = a; /* x */
|
860
|
+
|
861
|
+
copy_EltFp25519_1w_x64(T[1], a);
|
862
|
+
sqrn_EltFp25519_1w_x64(T[1], 1);
|
863
|
+
copy_EltFp25519_1w_x64(T[2], T[1]);
|
864
|
+
sqrn_EltFp25519_1w_x64(T[2], 2);
|
865
|
+
mul_EltFp25519_1w_x64(T[0], a, T[2]);
|
866
|
+
mul_EltFp25519_1w_x64(T[1], T[1], T[0]);
|
867
|
+
copy_EltFp25519_1w_x64(T[2], T[1]);
|
868
|
+
sqrn_EltFp25519_1w_x64(T[2], 1);
|
869
|
+
mul_EltFp25519_1w_x64(T[0], T[0], T[2]);
|
870
|
+
copy_EltFp25519_1w_x64(T[2], T[0]);
|
871
|
+
sqrn_EltFp25519_1w_x64(T[2], 5);
|
872
|
+
mul_EltFp25519_1w_x64(T[0], T[0], T[2]);
|
873
|
+
copy_EltFp25519_1w_x64(T[2], T[0]);
|
874
|
+
sqrn_EltFp25519_1w_x64(T[2], 10);
|
875
|
+
mul_EltFp25519_1w_x64(T[2], T[2], T[0]);
|
876
|
+
copy_EltFp25519_1w_x64(T[3], T[2]);
|
877
|
+
sqrn_EltFp25519_1w_x64(T[3], 20);
|
878
|
+
mul_EltFp25519_1w_x64(T[3], T[3], T[2]);
|
879
|
+
sqrn_EltFp25519_1w_x64(T[3], 10);
|
880
|
+
mul_EltFp25519_1w_x64(T[3], T[3], T[0]);
|
881
|
+
copy_EltFp25519_1w_x64(T[0], T[3]);
|
882
|
+
sqrn_EltFp25519_1w_x64(T[0], 50);
|
883
|
+
mul_EltFp25519_1w_x64(T[0], T[0], T[3]);
|
884
|
+
copy_EltFp25519_1w_x64(T[2], T[0]);
|
885
|
+
sqrn_EltFp25519_1w_x64(T[2], 100);
|
886
|
+
mul_EltFp25519_1w_x64(T[2], T[2], T[0]);
|
887
|
+
sqrn_EltFp25519_1w_x64(T[2], 50);
|
888
|
+
mul_EltFp25519_1w_x64(T[2], T[2], T[3]);
|
889
|
+
sqrn_EltFp25519_1w_x64(T[2], 5);
|
890
|
+
mul_EltFp25519_1w_x64(T[1], T[1], T[2]);
|
815
891
|
#undef sqrn_EltFp25519_1w_x64
|
816
892
|
}
|
817
893
|
|
818
|
-
|
819
|
-
|
820
|
-
|
821
|
-
|
822
|
-
|
894
|
+
/**
|
895
|
+
* Given C, a 256-bit number, fred_EltFp25519_1w_x64 updates C
|
896
|
+
* with a number such that 0 <= C < 2**255-19.
|
897
|
+
* Contributed by: Samuel Neves.
|
898
|
+
**/
|
899
|
+
inline void fred_EltFp25519_1w_x64(uint64_t *const c) {
|
900
|
+
__asm__ __volatile__ (
|
901
|
+
/* First, obtain a number less than 2^255: clear bit 255 and, if it was set, add 19 (done twice). */
|
902
|
+
"btrq $63, 24(%0) ;"
|
903
|
+
"sbbl %%ecx, %%ecx ;"
|
904
|
+
"andq $19, %%rcx ;"
|
905
|
+
"addq %%rcx, (%0) ;"
|
906
|
+
"adcq $0, 8(%0) ;"
|
907
|
+
"adcq $0, 16(%0) ;"
|
908
|
+
"adcq $0, 24(%0) ;"
|
909
|
+
|
910
|
+
"btrq $63, 24(%0) ;"
|
911
|
+
"sbbl %%ecx, %%ecx ;"
|
912
|
+
"andq $19, %%rcx ;"
|
913
|
+
"addq %%rcx, (%0) ;"
|
914
|
+
"adcq $0, 8(%0) ;"
|
915
|
+
"adcq $0, 16(%0) ;"
|
916
|
+
"adcq $0, 24(%0) ;"
|
917
|
+
|
918
|
+
/* Then, if the number falls in [2^255-19, 2^255-1], add 19 and clear bit 255 (i.e. subtract 2^255-19). */
|
919
|
+
"cmpq $-19, (%0) ;"
|
920
|
+
"setaeb %%al ;"
|
921
|
+
"cmpq $-1, 8(%0) ;"
|
922
|
+
"setzb %%bl ;"
|
923
|
+
"cmpq $-1, 16(%0) ;"
|
924
|
+
"setzb %%cl ;"
|
925
|
+
"movq 24(%0), %%rdx ;"
|
926
|
+
"addq $1, %%rdx ;"
|
927
|
+
"shrq $63, %%rdx ;"
|
928
|
+
"andb %%bl, %%al ;"
|
929
|
+
"andb %%dl, %%cl ;"
|
930
|
+
"test %%cl, %%al ;"
|
931
|
+
"movl $0, %%eax ;"
|
932
|
+
"movl $19, %%ecx ;"
|
933
|
+
"cmovnz %%rcx, %%rax ;"
|
934
|
+
"addq %%rax, (%0) ;"
|
935
|
+
"adcq $0, 8(%0) ;"
|
936
|
+
"adcq $0, 16(%0) ;"
|
937
|
+
"adcq $0, 24(%0) ;"
|
938
|
+
"btrq $63, 24(%0) ;"
|
939
|
+
:
|
940
|
+
: "r"(c)
|
941
|
+
: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx"
|
942
|
+
);
|
823
943
|
}
|
824
|
-
|