x25519 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +12 -0
- data/.rspec +5 -0
- data/.rubocop.yml +32 -0
- data/.travis.yml +12 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +11 -0
- data/LICENSE +165 -0
- data/README.md +67 -0
- data/Rakefile +11 -0
- data/ext/x25519/bytes.c +42 -0
- data/ext/x25519/bytes.h +25 -0
- data/ext/x25519/fp25519_x64.c +826 -0
- data/ext/x25519/fp25519_x64.h +91 -0
- data/ext/x25519/random.c +51 -0
- data/ext/x25519/random.h +24 -0
- data/ext/x25519/rfc7748_precompted.h +49 -0
- data/ext/x25519/rfc7748_precomputed.c +20 -0
- data/ext/x25519/table_ladder_x25519.h +277 -0
- data/ext/x25519/x25519_x64.c +244 -0
- data/lib/x25519.rb +7 -0
- data/lib/x25519/version.rb +5 -0
- data/x25519.gemspec +28 -0
- metadata +82 -0
data/ext/x25519/bytes.h
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
/**
 * Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>.
 * Institute of Computing.
 * University of Campinas, Brazil.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, version 3.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
#ifndef BYTES_H
#define BYTES_H

#include <stdint.h>
/* Byte-buffer helpers (implemented in bytes.c — not shown in this chunk). */
/* Prints num_bytes bytes of A; presumably a hex/debug dump — confirm in bytes.c. */
void print_bytes(uint8_t * A, int num_bytes);
/* Compares num_bytes bytes of A and B; NOTE(review): return-value convention
 * (0 on equal vs. boolean) not visible here — verify against bytes.c before
 * relying on it. */
int compare_bytes(uint8_t* A, uint8_t* B,unsigned int num_bytes);

#endif /* BYTES_H */
|
@@ -0,0 +1,826 @@
|
|
1
|
+
/**
|
2
|
+
* Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>.
|
3
|
+
* Institute of Computing.
|
4
|
+
* University of Campinas, Brazil.
|
5
|
+
*
|
6
|
+
* This program is free software: you can redistribute it and/or modify
|
7
|
+
* it under the terms of the GNU Lesser General Public License as
|
8
|
+
* published by the Free Software Foundation, version 3.
|
9
|
+
*
|
10
|
+
* This program is distributed in the hope that it will be useful, but
|
11
|
+
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
13
|
+
* Lesser General Public License for more details.
|
14
|
+
*
|
15
|
+
* You should have received a copy of the GNU Lesser General Public License
|
16
|
+
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
17
|
+
*/
|
18
|
+
#include "random.h"
|
19
|
+
#include "bytes.h"
|
20
|
+
#include "fp25519_x64.h"
|
21
|
+
|
22
|
+
void random_EltFp25519_1w_x64(uint64_t *A)
|
23
|
+
{
|
24
|
+
random_bytes((uint8_t*)A,SIZE_ELEMENT_BYTES);
|
25
|
+
A[3] &= ((uint64_t)1<<63)-1;
|
26
|
+
}
|
27
|
+
|
28
|
+
/* Compares two field elements limb-wise as raw byte strings by delegating
 * to compare_bytes() on SIZE_ELEMENT_BYTES bytes. NOTE(review): the return
 * convention is whatever compare_bytes uses (see bytes.c) — not visible here. */
int compare_EltFp25519_1w_x64(uint64_t *A, uint64_t *B)
{
	return compare_bytes((uint8_t*)A,(uint8_t*)B,SIZE_ELEMENT_BYTES);
}
|
32
|
+
|
33
|
+
/* Debug helper: prints the SIZE_ELEMENT_BYTES bytes of field element A
 * via print_bytes() (see bytes.c). */
void print_EltFp25519_1w_x64(uint64_t *A)
{
	print_bytes((uint8_t*)A,SIZE_ELEMENT_BYTES);
}
|
37
|
+
|
38
|
+
/**
|
39
|
+
*
|
40
|
+
* @param c Two 512-bit products: c[0:7]=a[0:3]*b[0:3] and c[8:15]=a[4:7]*b[4:7]
|
41
|
+
* @param a Two 256-bit integers: a[0:3] and a[4:7]
|
42
|
+
* @param b Two 256-bit integers: b[0:3] and b[4:7]
|
43
|
+
*/
|
44
|
+
/* Two independent 256x256 -> 512-bit schoolbook products in one call:
 *   c[0..7]  = a[0..3] * b[0..3]
 *   c[8..15] = a[4..7] * b[4..7]
 * Requires MULX (__BMI2__); uses the dual ADOX/ADCX carry chains when
 * __ADX__ is also available. NOTE(review): when __BMI2__ is not defined
 * the body is an empty stub (MULQ fallback is still a TODO), so c is
 * left untouched on such builds. */
void mul2_256x256_integer_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b)
{
#ifdef __BMI2__
#ifdef __ADX__
	/* BMI2+ADX: each row of the schoolbook product runs two carry chains
	 * (adox for the row-internal carries, adcx to fold into c). */
	__asm__ __volatile__(
		/* First product: c[0..7] = a[0..3] * b[0..3]. */
		"movq (%1), %%rdx # A[0] \n\t"
		"mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "movq %%r8, (%0) \n\t"
		"mulx 8(%2), %%r10, %%r11 # A[0]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "movq %%r10, 8(%0) \n\t"
		"mulx 16(%2), %%r12, %%r13 # A[0]*B[2] \n\t" "adox %%r11, %%r12 \n\t"
		"mulx 24(%2), %%r14, %%rdx # A[0]*B[3] \n\t" "adox %%r13, %%r14 \n\t" " movq $0, %%rax \n\t"
		"adox %%rdx, %%rax \n\t"

		"movq 8(%1), %%rdx # A[1] \n\t"
		"mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 8(%0), %%r8 \n\t" "movq %%r8, 8(%0) \n\t"
		"mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 16(%0) \n\t"
		"mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
		"mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
		"adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"

		"movq 16(%1), %%rdx # A[2] \n\t"
		"mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 16(%0), %%r8 \n\t" "movq %%r8, 16(%0) \n\t"
		"mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 24(%0) \n\t"
		"mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
		"mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
		"adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"

		"movq 24(%1), %%rdx # A[3] \n\t"
		"mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 24(%0), %%r8 \n\t" "movq %%r8, 24(%0) \n\t"
		"mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 32(%0) \n\t"
		"mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" "movq %%r12, 40(%0) \n\t" " movq $0, %%r8 \n\t"
		"mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" "movq %%r14, 48(%0) \n\t" " movq $0, %%rax \n\t"
		"adox %%rdx, %%rax \n\t" "adcx %%r8, %%rax \n\t" " movq %%rax, 56(%0) \n\t"

		/* Second product: c[8..15] = a[4..7] * b[4..7] (byte offsets 32..56 in,
		 * 64..120 out). */
		"movq 32(%1), %%rdx # A[0] \n\t"
		"mulx 32(%2), %%r8, %%r9 # A[0]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "movq %%r8, 64(%0) \n\t"
		"mulx 40(%2), %%r10, %%r11 # A[0]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "movq %%r10, 72(%0) \n\t"
		"mulx 48(%2), %%r12, %%r13 # A[0]*B[2] \n\t" "adox %%r11, %%r12 \n\t"
		"mulx 56(%2), %%r14, %%rdx # A[0]*B[3] \n\t" "adox %%r13, %%r14 \n\t" " movq $0, %%rax \n\t"
		"adox %%rdx, %%rax \n\t"

		"movq 40(%1), %%rdx # A[1] \n\t"
		"mulx 32(%2), %%r8, %%r9 # A[1]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 72(%0), %%r8 \n\t" "movq %%r8, 72(%0) \n\t"
		"mulx 40(%2), %%r10, %%r11 # A[1]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 80(%0) \n\t"
		"mulx 48(%2), %%r12, %%r13 # A[1]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
		"mulx 56(%2), %%r14, %%rdx # A[1]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
		"adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"

		"movq 48(%1), %%rdx # A[2] \n\t"
		"mulx 32(%2), %%r8, %%r9 # A[2]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 80(%0), %%r8 \n\t" "movq %%r8, 80(%0) \n\t"
		"mulx 40(%2), %%r10, %%r11 # A[2]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 88(%0) \n\t"
		"mulx 48(%2), %%r12, %%r13 # A[2]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
		"mulx 56(%2), %%r14, %%rdx # A[2]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
		"adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"

		"movq 56(%1), %%rdx # A[3] \n\t"
		"mulx 32(%2), %%r8, %%r9 # A[3]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 88(%0), %%r8 \n\t" "movq %%r8, 88(%0) \n\t"
		"mulx 40(%2), %%r10, %%r11 # A[3]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 96(%0) \n\t"
		"mulx 48(%2), %%r12, %%r13 # A[3]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" "movq %%r12, 104(%0) \n\t" " movq $0, %%r8 \n\t"
		"mulx 56(%2), %%r14, %%rdx # A[3]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" "movq %%r14, 112(%0) \n\t" " movq $0, %%rax \n\t"
		"adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t" " movq %%rax, 120(%0) \n\t"
		:
		: "r" (c), "r" (a), "r" (b)
		: "memory", "cc", "%rax", "%rdx",
		  "%r8", "%r9", "%r10", "%r11",
		  "%r12", "%r13", "%r14"
	);
#else
	/* BMI2 only: single ADD/ADC carry chain; needs rbx/rcx as extra
	 * accumulators since there is no second carry flag. */
	__asm__ __volatile__(
		"movq (%1), %%rdx # A[0] \n\t"
		"mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t" "movq %%r8, (%0) \n\t"
		"mulx 8(%2), %%r10, %%rax # A[0]*B[1] \n\t" "addq %%r10, %%r9 \n\t" "movq %%r9, 8(%0) \n\t"
		"mulx 16(%2), %%r12, %%rbx # A[0]*B[2] \n\t" "adcq %%r12, %%rax \n\t"
		"mulx 24(%2), %%r14, %%rcx # A[0]*B[3] \n\t" "adcq %%r14, %%rbx \n\t"
		"adcq $0, %%rcx \n\t"

		"movq 8(%1), %%rdx # A[1] \n\t"
		"mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t"
		"mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
		"mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
		"mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
		"adcq $0, %%rdx \n\t"

		"addq %%r8, 8(%0) \n\t"
		"adcq %%rax, %%r9 \n\t" "movq %%r9, 16(%0) \n\t" "movq $0, %%rax \n\t"
		"adcq %%r11, %%rbx \n\t"
		"adcq %%r13, %%rcx \n\t"
		"adcq %%rdx, %%rax \n\t"

		"movq 16(%1), %%rdx # A[2] \n\t"
		"mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t"
		"mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
		"mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
		"mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
		"adcq $0, %%rdx \n\t"

		"addq %%r8, 16(%0) \n\t"
		"adcq %%rbx, %%r9 \n\t" "movq %%r9, 24(%0) \n\t" "movq $0, %%rbx \n\t"
		"adcq %%r11, %%rcx \n\t"
		"adcq %%r13, %%rax \n\t"
		"adcq %%rdx, %%rbx \n\t"

		"movq 24(%1), %%rdx # A[3] \n\t"
		"mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t"
		"mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
		"mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
		"mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
		"adcq $0, %%rdx \n\t"

		"addq %%r8, 24(%0) \n\t"
		"adcq %%rcx, %%r9 \n\t" "movq %%r9, 32(%0) \n\t" " movq $0, %%rcx \n\t"
		"adcq %%r11, %%rax \n\t" "movq %%rax, 40(%0) \n\t"
		"adcq %%r13, %%rbx \n\t" "movq %%rbx, 48(%0) \n\t"
		"adcq %%rdx, %%rcx \n\t" "movq %%rcx, 56(%0) \n\t"

		/* Second product: c[8..15] = a[4..7] * b[4..7]. */
		"movq 32(%1), %%rdx # A[0] \n\t"
		"mulx 32(%2), %%r8, %%r9 # A[0]*B[0] \n\t" "movq %%r8, 64(%0) \n\t"
		"mulx 40(%2), %%r10, %%rax # A[0]*B[1] \n\t" "addq %%r10, %%r9 \n\t" "movq %%r9, 72(%0) \n\t"
		"mulx 48(%2), %%r12, %%rbx # A[0]*B[2] \n\t" "adcq %%r12, %%rax \n\t"
		"mulx 56(%2), %%r14, %%rcx # A[0]*B[3] \n\t" "adcq %%r14, %%rbx \n\t"
		"adcq $0, %%rcx \n\t"

		"movq 40(%1), %%rdx # A[1] \n\t"
		"mulx 32(%2), %%r8, %%r9 # A[1]*B[0] \n\t"
		"mulx 40(%2), %%r10, %%r11 # A[1]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
		"mulx 48(%2), %%r12, %%r13 # A[1]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
		"mulx 56(%2), %%r14, %%rdx # A[1]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
		"adcq $0, %%rdx \n\t"

		"addq %%r8, 72(%0) \n\t"
		"adcq %%rax, %%r9 \n\t" " movq %%r9, 80(%0) \n\t" " movq $0, %%rax \n\t"
		"adcq %%r11, %%rbx \n\t"
		"adcq %%r13, %%rcx \n\t"
		"adcq %%rdx, %%rax \n\t"

		"movq 48(%1), %%rdx # A[2] \n\t"
		"mulx 32(%2), %%r8, %%r9 # A[2]*B[0] \n\t"
		"mulx 40(%2), %%r10, %%r11 # A[2]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
		"mulx 48(%2), %%r12, %%r13 # A[2]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
		"mulx 56(%2), %%r14, %%rdx # A[2]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
		"adcq $0, %%rdx \n\t"

		"addq %%r8, 80(%0) \n\t"
		"adcq %%rbx, %%r9 \n\t" " movq %%r9, 88(%0) \n\t" " movq $0, %%rbx \n\t"
		"adcq %%r11, %%rcx \n\t"
		"adcq %%r13, %%rax \n\t"
		"adcq %%rdx, %%rbx \n\t"

		"movq 56(%1), %%rdx # A[3] \n\t"
		"mulx 32(%2), %%r8, %%r9 # A[3]*B[0] \n\t"
		"mulx 40(%2), %%r10, %%r11 # A[3]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
		"mulx 48(%2), %%r12, %%r13 # A[3]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
		"mulx 56(%2), %%r14, %%rdx # A[3]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
		"adcq $0, %%rdx \n\t"

		"addq %%r8, 88(%0) \n\t"
		"adcq %%rcx, %%r9 \n\t" "movq %%r9, 96(%0) \n\t" " movq $0, %%rcx \n\t"
		"adcq %%r11, %%rax \n\t" "movq %%rax, 104(%0) \n\t"
		"adcq %%r13, %%rbx \n\t" "movq %%rbx, 112(%0) \n\t"
		"adcq %%rdx, %%rcx \n\t" "movq %%rcx, 120(%0) \n\t"
		:
		: "r" (c), "r" (a), "r" (b)
		: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8",
		  "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
	);
#endif
#else /* Without BMI2 */
	/**
	 * TODO: Multiplications using MULQ instruction.
	 **/
#endif
}
|
215
|
+
|
216
|
+
/**
|
217
|
+
*
|
218
|
+
* @param c
|
219
|
+
* @param a
|
220
|
+
*/
|
221
|
+
/* Two independent 256-bit squarings in one call:
 *   c[0..7]  = a[0..3]^2
 *   c[8..15] = a[4..7]^2
 * Strategy per half: write the four diagonal limb squares into c, compute
 * the six cross products a[i]*a[j] (i<j), double them with a shld chain,
 * then add the doubled cross terms back into c with a carried add.
 * Requires MULX (__BMI2__); NOTE(review): when __BMI2__ is not defined the
 * body is an empty stub (MULQ fallback is still a TODO), leaving c untouched.
 *
 * Fix vs. original: the clobber list omitted "memory" even though the asm
 * stores through c and loads through a via non-operand addressing — without
 * it the compiler may cache/reorder accesses to *c and *a around the asm.
 */
void sqr2_256x256_integer_x64(uint64_t *const c, uint64_t *const a)
{
#ifdef __BMI2__
	__asm__ __volatile__(
		/* Diagonal squares of the first half: c[0..3] hold A[0]^2, A[1]^2. */
		"movq (%1), %%rdx # A[0] \n\t"
		"mulx %%rdx, %%r8, %%r9 # A[0]^2 \n\t"
		"movq 8(%1), %%rdx # A[1] \n\t"
		"mulx %%rdx, %%r10, %%r11 # A[1]^2 \n\t"
		"movq %%r8, (%0) \n\t"
		"movq %%r9, 8(%0) \n\t"
		"movq %%r10, 16(%0) \n\t"
		"movq %%r11, 24(%0) \n\t"

		"movq 16(%1), %%rdx # A[2] \n\t"
		"mulx %%rdx, %%r8, %%r9 # A[2]^2 \n\t"
		"movq 24(%1), %%rdx # A[3] \n\t"
		"mulx %%rdx, %%r10, %%r11 # A[3]^2 \n\t"
		"movq %%r8, 32(%0) \n\t"
		"movq %%r9, 40(%0) \n\t"
		"movq %%r10, 48(%0) \n\t"
		"movq %%r11, 56(%0) \n\t"

		/* Cross products A[i]*A[j], i<j, accumulated in r8..r14. */
		"movq 8(%1), %%rdx # A[1] \n\t"
		"mulx (%1), %%r8, %%r9 # A[0]*A[1] \n\t"
		"mulx 16(%1), %%r10, %%r11 # A[2]*A[1] \n\t"
		"mulx 24(%1), %%rcx, %%r14 # A[3]*A[1] \n\t"

		"movq 16(%1), %%rdx # A[2] \n\t"
		"mulx 24(%1), %%r12, %%r13 # A[3]*A[2] \n\t"
		"mulx (%1), %%rax, %%rdx # A[0]*A[2] \n\t"

		"addq %%rax, %%r9 \n\t"
		"adcq %%rdx, %%r10 \n\t"
		"adcq %%rcx, %%r11 \n\t"
		"adcq %%r14, %%r12 \n\t"
		"adcq $0, %%r13 \n\t"
		"movq $0, %%r14 \n\t"
		"adcq $0, %%r14 \n\t"

		"movq (%1), %%rdx # A[0] \n\t"
		"mulx 24(%1), %%rax, %%rdx # A[0]*A[3] \n\t"

		"addq %%rax, %%r10 \n\t"
		"adcq %%rdx, %%r11 \n\t"
		"adcq $0, %%r12 \n\t"
		"adcq $0, %%r13 \n\t"
		"adcq $0, %%r14 \n\t"

		/* Double the cross terms (shift the 7-limb value left by one bit). */
		"shldq $1, %%r13, %%r14 \n\t"
		"shldq $1, %%r12, %%r13 \n\t"
		"shldq $1, %%r11, %%r12 \n\t"
		"shldq $1, %%r10, %%r11 \n\t"
		"shldq $1, %%r9, %%r10 \n\t"
		"shldq $1, %%r8, %%r9 \n\t"
		"shlq $1, %%r8 \n\t"

		/* Fold doubled cross terms into the diagonal squares. */
		"addq 8(%0), %%r8 \n\t" "movq %%r8, 8(%0) \n\t"
		"adcq 16(%0), %%r9 \n\t" "movq %%r9, 16(%0) \n\t"
		"adcq 24(%0), %%r10 \n\t" "movq %%r10, 24(%0) \n\t"
		"adcq 32(%0), %%r11 \n\t" "movq %%r11, 32(%0) \n\t"
		"adcq 40(%0), %%r12 \n\t" "movq %%r12, 40(%0) \n\t"
		"adcq 48(%0), %%r13 \n\t" "movq %%r13, 48(%0) \n\t"
		"adcq 56(%0), %%r14 \n\t" "movq %%r14, 56(%0) \n\t"

		/* Second half: identical schedule on a[4..7] into c[8..15]. */
		"movq 32(%1), %%rdx # A[0] \n\t"
		"mulx %%rdx, %%r8, %%r9 # A[0]^2 \n\t"
		"movq 40(%1), %%rdx # A[1] \n\t"
		"mulx %%rdx, %%r10, %%r11 # A[1]^2 \n\t"
		"movq %%r8, 64(%0) \n\t"
		"movq %%r9, 72(%0) \n\t"
		"movq %%r10, 80(%0) \n\t"
		"movq %%r11, 88(%0) \n\t"

		"movq 48(%1), %%rdx # A[2] \n\t"
		"mulx %%rdx, %%r8, %%r9 # A[2]^2 \n\t"
		"movq 56(%1), %%rdx # A[3] \n\t"
		"mulx %%rdx, %%r10, %%r11 # A[3]^2 \n\t"
		"movq %%r8, 96(%0) \n\t"
		"movq %%r9, 104(%0) \n\t"
		"movq %%r10, 112(%0) \n\t"
		"movq %%r11, 120(%0) \n\t"

		"movq 40(%1), %%rdx # A[1] \n\t"
		"mulx 32(%1), %%r8, %%r9 # A[0]*A[1] \n\t"
		"mulx 48(%1), %%r10, %%r11 # A[2]*A[1] \n\t"
		"mulx 56(%1), %%rcx, %%r14 # A[3]*A[1] \n\t"

		"movq 48(%1), %%rdx # A[2] \n\t"
		"mulx 56(%1), %%r12, %%r13 # A[3]*A[2] \n\t"
		"mulx 32(%1), %%rax, %%rdx # A[0]*A[2] \n\t"

		"addq %%rax, %%r9 \n\t"
		"adcq %%rdx, %%r10 \n\t"
		"adcq %%rcx, %%r11 \n\t"
		"adcq %%r14, %%r12 \n\t"
		"adcq $0, %%r13 \n\t"
		"movq $0, %%r14 \n\t"
		"adcq $0, %%r14 \n\t"

		"movq 32(%1), %%rdx # A[0] \n\t"
		"mulx 56(%1), %%rax, %%rdx # A[0]*A[3] \n\t"

		"addq %%rax, %%r10 \n\t"
		"adcq %%rdx, %%r11 \n\t"
		"adcq $0, %%r12 \n\t"
		"adcq $0, %%r13 \n\t"
		"adcq $0, %%r14 \n\t"

		"shldq $1, %%r13, %%r14 \n\t"
		"shldq $1, %%r12, %%r13 \n\t"
		"shldq $1, %%r11, %%r12 \n\t"
		"shldq $1, %%r10, %%r11 \n\t"
		"shldq $1, %%r9, %%r10 \n\t"
		"shldq $1, %%r8, %%r9 \n\t"
		"shlq $1, %%r8 \n\t"

		"addq 72(%0), %%r8 \n\t" "movq %%r8, 72(%0) \n\t"
		"adcq 80(%0), %%r9 \n\t" "movq %%r9, 80(%0) \n\t"
		"adcq 88(%0), %%r10 \n\t" "movq %%r10, 88(%0) \n\t"
		"adcq 96(%0), %%r11 \n\t" "movq %%r11, 96(%0) \n\t"
		"adcq 104(%0), %%r12 \n\t" "movq %%r12, 104(%0) \n\t"
		"adcq 112(%0), %%r13 \n\t" "movq %%r13, 112(%0) \n\t"
		"adcq 120(%0), %%r14 \n\t" "movq %%r14, 120(%0) \n\t"
		:
		: "r" (c), "r" (a)
		: "memory", "cc", "%rax", "%rcx", "%rdx",
		  "%r8", "%r9", "%r10", "%r11",
		  "%r12", "%r13", "%r14"
	);
#else /* Without BMI2 */
	/**
	 * TODO: Multiplications using MULQ instruction.
	 **/
#endif
}
|
357
|
+
|
358
|
+
/**
|
359
|
+
*
|
360
|
+
* @param c
|
361
|
+
* @param a
|
362
|
+
*/
|
363
|
+
/* Reduces two independent 512-bit values modulo 2^255-19 (partial, Element
 * Fp25519 form): c[0..3] <- a[0..7] mod p and c[4..7] <- a[8..15] mod p,
 * using the identity 2^256 = 38 (mod p): fold the high 256 bits times 38
 * into the low 256 bits, then fold the final carry times 38 once more.
 * Requires MULX (__BMI2__); uses ADOX/ADCX when __ADX__ is available.
 * NOTE(review): without __BMI2__ the body is an empty stub ([TODO]).
 *
 * Fix vs. original: both variants' clobber lists omitted "memory" although
 * the asm stores to *c and loads from *a via non-operand addressing — the
 * compiler could otherwise cache/reorder those accesses around the asm.
 */
void red_EltFp25519_2w_x64(uint64_t *const c, uint64_t *const a)
{
#ifdef __BMI2__
#ifdef __ADX__
	__asm__ __volatile__(
		/* First half: fold a[4..7]*38 into a[0..3]. */
		" movl $38, %%edx # 2*c = 38 = 2^256 \n\t"
		" mulx 32(%1), %%r8, %%r10 # c*C[4] \n\t" " xorl %%ebx, %%ebx \n\t" " adox (%1), %%r8 \n\t"
		" mulx 40(%1), %%r9, %%r11 # c*C[5] \n\t" " adcx %%r10, %%r9 \n\t" " adox 8(%1), %%r9 \n\t"
		" mulx 48(%1), %%r10, %%rax # c*C[6] \n\t" " adcx %%r11, %%r10 \n\t" " adox 16(%1), %%r10 \n\t" " movq %%r10, 16(%0) \n\t"
		" mulx 56(%1), %%r11, %%rcx # c*C[7] \n\t" " adcx %%rax, %%r11 \n\t" " adox 24(%1), %%r11 \n\t" " movq %%r11, 24(%0) \n\t"
		" adcx %%rbx, %%rcx \n\t" " adox %%rbx, %%rcx \n\t"
		" xorl %%ebx, %%ebx \n\t"
		/* Fold the remaining carry * 38 back into the low limbs. */
		" mulx %%rcx, %%rax, %%rcx \n\t" " adcx %%rax, %%r8 \n\t" " movq %%r8, (%0) \n\t"
		" adcx %%rcx, %%r9 \n\t" " movq %%r9, 8(%0) \n\t"

		/* Second half: fold a[12..15]*38 into a[8..11]. */
		" mulx 96(%1), %%r8, %%r10 # c*C[4] \n\t" " xorl %%ebx, %%ebx \n\t" " adox 64(%1), %%r8 \n\t"
		" mulx 104(%1), %%r9, %%r11 # c*C[5] \n\t" " adcx %%r10, %%r9 \n\t" " adox 72(%1), %%r9 \n\t"
		" mulx 112(%1), %%r10, %%rax # c*C[6] \n\t" " adcx %%r11, %%r10 \n\t" " adox 80(%1), %%r10 \n\t" " movq %%r10, 48(%0) \n\t"
		" mulx 120(%1), %%r11, %%rcx # c*C[7] \n\t" " adcx %%rax, %%r11 \n\t" " adox 88(%1), %%r11 \n\t" " movq %%r11, 56(%0) \n\t"
		" adcx %%rbx, %%rcx \n\t" " adox %%rbx, %%rcx \n\t"
		" xorl %%ebx, %%ebx \n\t"
		" mulx %%rcx, %%rax, %%rcx \n\t" " adcx %%rax, %%r8 \n\t" " movq %%r8, 32(%0) \n\t"
		" adcx %%rcx, %%r9 \n\t" " movq %%r9, 40(%0) \n\t"
		:
		: "r" (c), "r" (a)
		: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
	);
#else
	__asm__ __volatile__(
		/* First half without ADX: single carry chain via ADD/ADC. */
		"movl $38, %%edx # 2*c = 38 = 2^256 \n\t"
		"mulx 32(%1), %%r8, %%r9 # c*C[4] \n\t"
		"mulx 40(%1), %%r10, %%r11 # c*C[5] \n\t" "addq %%r9, %%r10 \n\t"
		"mulx 48(%1), %%r12, %%r13 # c*C[6] \n\t" "adcq %%r11, %%r12 \n\t"
		"mulx 56(%1), %%rax, %%rcx # c*C[7] \n\t" "adcq %%r13, %%rax \n\t"
		"adcq $0, %%rcx \n\t"

		"addq (%1), %%r8 \n\t"
		"adcq 8(%1), %%r10 \n\t"
		"adcq 16(%1), %%r12 \n\t" "movq %%r12, 16(%0) \n\t"
		"adcq 24(%1), %%rax \n\t" "movq %%rax, 24(%0) \n\t"
		"adcq $0, %%rcx \n\t"

		"mulx %%rcx, %%rax, %%rcx \n\t"
		"addq %%rax, %%r8 \n\t" "movq %%r8, (%0) \n\t"
		"adcq %%rcx, %%r10 \n\t" "movq %%r10, 8(%0) \n\t"

		/* Second half. */
		"mulx 96(%1), %%r8, %%r9 # c*C[4] \n\t"
		"mulx 104(%1), %%r10, %%r11 # c*C[5] \n\t" "addq %%r9, %%r10 \n\t"
		"mulx 112(%1), %%r12, %%r13 # c*C[6] \n\t" "adcq %%r11, %%r12 \n\t"
		"mulx 120(%1), %%rax, %%rcx # c*C[7] \n\t" "adcq %%r13, %%rax \n\t"
		"adcq $0, %%rcx \n\t"

		"addq 64(%1), %%r8 \n\t"
		"adcq 72(%1), %%r10 \n\t"
		"adcq 80(%1), %%r12 \n\t" "movq %%r12, 48(%0) \n\t"
		"adcq 88(%1), %%rax \n\t" "movq %%rax, 56(%0) \n\t"
		"adcq $0, %%rcx \n\t"

		"mulx %%rcx, %%rax, %%rcx \n\t"
		"addq %%rax, %%r8 \n\t" " movq %%r8, 32(%0) \n\t"
		"adcq %%rcx, %%r10 \n\t" " movq %%r10, 40(%0) \n\t"

		:
		: "r" (c), "r" (a)
		: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13"
	);
#endif
#else /* Without BMI2 */
	/* [TODO] */
#endif
}
|
434
|
+
|
435
|
+
/* Single 256x256 -> 512-bit schoolbook product: c[0..7] = a[0..3] * b[0..3].
 * Same row schedule as the first half of mul2_256x256_integer_x64.
 * Requires MULX (__BMI2__); uses ADOX/ADCX dual carry chains when __ADX__
 * is available. NOTE(review): when __BMI2__ is not defined the body is an
 * empty stub (MULQ fallback is still a TODO), so c is left untouched. */
void mul_256x256_integer_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b)
{
#ifdef __BMI2__
#ifdef __ADX__
	__asm__ __volatile__(
		" movq (%1), %%rdx # A[0] \n\t"
		" mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t" " xorl %%r10d, %%r10d \n\t" " movq %%r8, (%0) \n\t"
		" mulx 8(%2), %%r10, %%r11 # A[0]*B[1] \n\t" " adox %%r9, %%r10 \n\t" " movq %%r10, 8(%0) \n\t"
		" mulx 16(%2), %%r12, %%r13 # A[0]*B[2] \n\t" " adox %%r11, %%r12 \n\t"
		" mulx 24(%2), %%r14, %%rdx # A[0]*B[3] \n\t" " adox %%r13, %%r14 \n\t" " movq $0, %%rax \n\t"
		" adox %%rdx, %%rax \n\t"

		" movq 8(%1), %%rdx # A[1] \n\t"
		" mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t" " xorl %%r10d, %%r10d \n\t" " adcx 8(%0), %%r8 \n\t" " movq %%r8, 8(%0) \n\t"
		" mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t" " adox %%r9, %%r10 \n\t" " adcx %%r12, %%r10 \n\t" " movq %%r10, 16(%0) \n\t"
		" mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t" " adox %%r11, %%r12 \n\t" " adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
		" mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t" " adox %%r13, %%r14 \n\t" " adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
		" adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"

		" movq 16(%1), %%rdx # A[2] \n\t"
		" mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t" " xorl %%r10d, %%r10d \n\t" " adcx 16(%0), %%r8 \n\t" " movq %%r8, 16(%0) \n\t"
		" mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t" " adox %%r9, %%r10 \n\t" " adcx %%r12, %%r10 \n\t" " movq %%r10, 24(%0) \n\t"
		" mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t" " adox %%r11, %%r12 \n\t" " adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
		" mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t" " adox %%r13, %%r14 \n\t" " adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
		" adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"

		" movq 24(%1), %%rdx # A[3] \n\t"
		" mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t" " xorl %%r10d, %%r10d \n\t" " adcx 24(%0), %%r8 \n\t" " movq %%r8, 24(%0) \n\t"
		" mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t" " adox %%r9, %%r10 \n\t" " adcx %%r12, %%r10 \n\t" " movq %%r10, 32(%0) \n\t"
		" mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t" " adox %%r11, %%r12 \n\t" " adcx %%r14, %%r12 \n\t" " movq %%r12, 40(%0) \n\t" " movq $0, %%r8 \n\t"
		" mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t" " adox %%r13, %%r14 \n\t" " adcx %%rax, %%r14 \n\t" " movq %%r14, 48(%0) \n\t" " movq $0, %%rax \n\t"
		" adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t" " movq %%rax, 56(%0) \n\t"
		:
		: "r" (c), "r" (a), "r" (b)
		: "memory", "cc", "%rax", "%rdx",
		  "%r8", "%r9", "%r10", "%r11",
		  "%r12", "%r13", "%r14"
	);
#else
	/* BMI2 only: single ADD/ADC carry chain using rbx/rcx as spill
	 * accumulators (no second carry flag available). */
	__asm__ __volatile__(
		" movq (%1), %%rdx # A[0] \n\t"
		" mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t" " movq %%r8, (%0) \n\t"
		" mulx 8(%2), %%r10, %%rax # A[0]*B[1] \n\t" " addq %%r10, %%r9 \n\t" " movq %%r9, 8(%0) \n\t"
		" mulx 16(%2), %%r12, %%rbx # A[0]*B[2] \n\t" " adcq %%r12, %%rax \n\t"
		" mulx 24(%2), %%r14, %%rcx # A[0]*B[3] \n\t" " adcq %%r14, %%rbx \n\t"
		" adcq $0, %%rcx \n\t"

		" movq 8(%1), %%rdx # A[1] \n\t"
		" mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t"
		" mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t" " addq %%r10, %%r9 \n\t"
		" mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t" " adcq %%r12, %%r11 \n\t"
		" mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t" " adcq %%r14, %%r13 \n\t"
		" adcq $0, %%rdx \n\t"

		" addq %%r8, 8(%0) \n\t"
		" adcq %%rax, %%r9 \n\t" " movq %%r9, 16(%0) \n\t" " movq $0, %%rax \n\t"
		" adcq %%r11, %%rbx \n\t"
		" adcq %%r13, %%rcx \n\t"
		" adcq %%rdx, %%rax \n\t"

		" movq 16(%1), %%rdx # A[2] \n\t"
		" mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t"
		" mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t" " addq %%r10, %%r9 \n\t"
		" mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t" " adcq %%r12, %%r11 \n\t"
		" mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t" " adcq %%r14, %%r13 \n\t"
		" adcq $0, %%rdx \n\t"

		" addq %%r8, 16(%0) \n\t"
		" adcq %%rbx, %%r9 \n\t" " movq %%r9, 24(%0) \n\t" " movq $0, %%rbx \n\t"
		" adcq %%r11, %%rcx \n\t"
		" adcq %%r13, %%rax \n\t"
		" adcq %%rdx, %%rbx \n\t"

		" movq 24(%1), %%rdx # A[3] \n\t"
		" mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t"
		" mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t" " addq %%r10, %%r9 \n\t"
		" mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t" " adcq %%r12, %%r11 \n\t"
		" mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t" " adcq %%r14, %%r13 \n\t"
		" adcq $0, %%rdx \n\t"

		" addq %%r8, 24(%0) \n\t"
		" adcq %%rcx, %%r9 \n\t" " movq %%r9, 32(%0) \n\t" " movq $0, %%rcx \n\t"
		" adcq %%r11, %%rax \n\t" " movq %%rax, 40(%0) \n\t"
		" adcq %%r13, %%rbx \n\t" " movq %%rbx, 48(%0) \n\t"
		" adcq %%rdx, %%rcx \n\t" " movq %%rcx, 56(%0) \n\t"
		:
		: "r" (c), "r" (a), "r" (b)
		: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8",
		  "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
	);
#endif
#else /* Without BMI2 */
	/**
	 * TODO: Multiplications using MULQ instruction.
	 **/
#endif
}
|
532
|
+
|
533
|
+
/* Single 256-bit squaring: c[0..7] = a[0..3]^2.
 * Same schedule as one half of sqr2_256x256_integer_x64: write the four
 * diagonal limb squares into c, compute the six cross products a[i]*a[j]
 * (i<j), double them with a shld chain, and add them back into c.
 * Requires MULX (__BMI2__). NOTE(review): when __BMI2__ is not defined the
 * body is an empty stub (MULQ fallback is still a TODO), so c is untouched. */
void sqr_256x256_integer_x64(uint64_t *const c, uint64_t *const a)
{
#ifdef __BMI2__
	__asm__ __volatile__(
		/* Diagonal squares A[0]^2 .. A[3]^2 into c[0..7]. */
		" movq (%1), %%rdx # A[0] \n\t"
		" mulx %%rdx, %%r8, %%r9 # A[0]^2 \n\t"
		" movq 8(%1), %%rdx # A[1] \n\t"
		" mulx %%rdx, %%r10, %%r11 # A[1]^2 \n\t"
		" movq %%r8, (%0) \n\t"
		" movq %%r9, 8(%0) \n\t"
		" movq %%r10, 16(%0) \n\t"
		" movq %%r11, 24(%0) \n\t"

		" movq 16(%1), %%rdx # A[2] \n\t"
		" mulx %%rdx, %%r8, %%r9 # A[2]^2 \n\t"
		" movq 24(%1), %%rdx # A[3] \n\t"
		" mulx %%rdx, %%r10, %%r11 # A[3]^2 \n\t"
		" movq %%r8, 32(%0) \n\t"
		" movq %%r9, 40(%0) \n\t"
		" movq %%r10, 48(%0) \n\t"
		" movq %%r11, 56(%0) \n\t"

		/* Cross products A[i]*A[j], i<j. */
		" movq 8(%1), %%rdx # A[1] \n\t"
		" mulx (%1), %%r8, %%r9 # A[0]*A[1] \n\t"
		" mulx 16(%1), %%r10, %%r11 # A[2]*A[1] \n\t"
		" mulx 24(%1), %%rcx, %%r14 # A[3]*A[1] \n\t"

		" movq 16(%1), %%rdx # A[2] \n\t"
		" mulx 24(%1), %%r12, %%r13 # A[3]*A[2] \n\t"
		" mulx (%1), %%rax, %%rdx # A[0]*A[2] \n\t"

		" addq %%rax, %%r9 \n\t"
		" adcq %%rdx, %%r10 \n\t"
		" adcq %%rcx, %%r11 \n\t"
		" adcq %%r14, %%r12 \n\t"
		" adcq $0, %%r13 \n\t"
		" movq $0, %%r14 \n\t"
		" adcq $0, %%r14 \n\t"

		" movq (%1), %%rdx # A[0] \n\t"
		" mulx 24(%1), %%rax, %%rdx # A[0]*A[3] \n\t"

		" addq %%rax, %%r10 \n\t"
		" adcq %%rdx, %%r11 \n\t"
		" adcq $0, %%r12 \n\t"
		" adcq $0, %%r13 \n\t"
		" adcq $0, %%r14 \n\t"

		/* Double the cross terms (left shift of the 7-limb value by 1). */
		" shldq $1, %%r13, %%r14 \n\t"
		" shldq $1, %%r12, %%r13 \n\t"
		" shldq $1, %%r11, %%r12 \n\t"
		" shldq $1, %%r10, %%r11 \n\t"
		" shldq $1, %%r9, %%r10 \n\t"
		" shldq $1, %%r8, %%r9 \n\t"
		" shlq $1, %%r8 \n\t"

		/* Fold doubled cross terms into the diagonal squares. */
		" addq 8(%0), %%r8 \n\t" " movq %%r8, 8(%0) \n\t"
		" adcq 16(%0), %%r9 \n\t" " movq %%r9, 16(%0) \n\t"
		" adcq 24(%0), %%r10 \n\t" " movq %%r10, 24(%0) \n\t"
		" adcq 32(%0), %%r11 \n\t" " movq %%r11, 32(%0) \n\t"
		" adcq 40(%0), %%r12 \n\t" " movq %%r12, 40(%0) \n\t"
		" adcq 48(%0), %%r13 \n\t" " movq %%r13, 48(%0) \n\t"
		" adcq 56(%0), %%r14 \n\t" " movq %%r14, 56(%0) \n\t"
		:
		: "r" (c), "r" (a)
		: "memory", "cc", "%rax", "%rcx", "%rdx",
		  "%r8", "%r9", "%r10", "%r11",
		  "%r12", "%r13", "%r14"
	);
#else /* Without BMI2 */
	/**
	 * TODO: Multiplications using MULQ instruction.
	 **/
#endif
}
|
608
|
+
|
609
|
+
/**
 * Weak (partial) reduction modulo p = 2^255-19.
 *
 * Folds the 512-bit, 8-limb input a[0..7] into a 256-bit, 4-limb output
 * c[0..3] that is congruent to a modulo p, using 2^256 == 38 (mod p):
 *   c = (a mod 2^256) + 38 * (a >> 256), then one more 38-fold of the
 * single limb of overflow.  The result is congruent but NOT guaranteed
 * to be fully reduced below p (see fred_EltFp25519_1w_x64).
 *
 * c: output, 4 limbs.  a: input, 8 limbs.
 * Requires BMI2 (MULX); uses dual ADCX/ADOX carry chains when ADX is
 * available.  NOTE(review): when __BMI2__ is not defined the body is an
 * empty TODO and c is left untouched.
 */
void red_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a)
{
#ifdef __BMI2__
#ifdef __ADX__
    __asm__ __volatile__(
        /* rdx <- 38; MULX multiplies by rdx implicitly. */
        " movl $38, %%edx # 2*c = 38 = 2^256 \n\t"
        /* Two independent carry chains: ADCX (CF) links the 38*a[4+i]
         * partial products, ADOX (OF) adds in the low limbs a[0..3]. */
        " mulx 32(%1), %%r8, %%r10 # c*C[4] \n\t" " xorl %%ebx, %%ebx \n\t" " adox (%1), %%r8 \n\t"
        " mulx 40(%1), %%r9, %%r11 # c*C[5] \n\t" " adcx %%r10, %%r9 \n\t" " adox 8(%1), %%r9 \n\t"
        " mulx 48(%1), %%r10, %%rax # c*C[6] \n\t" " adcx %%r11, %%r10 \n\t" " adox 16(%1), %%r10 \n\t" " movq %%r10, 16(%0) \n\t"
        " mulx 56(%1), %%r11, %%rcx # c*C[7] \n\t" " adcx %%rax, %%r11 \n\t" " adox 24(%1), %%r11 \n\t" " movq %%r11, 24(%0) \n\t"
        /* Collapse both carry flags into rcx = the 2^256 overflow. */
        " adcx %%rbx, %%rcx \n\t" " adox %%rbx, %%rcx \n\t"
        " xorl %%ebx, %%ebx \n\t"
        /* Fold the overflow once more: (c[0],c[1]) += 38 * overflow. */
        " mulx %%rcx, %%rax, %%rcx \n\t" " adcx %%rax, %%r8 \n\t" " movq %%r8, (%0) \n\t"
        " adcx %%rcx, %%r9 \n\t" " movq %%r9, 8(%0) \n\t"
        :
        : "r" (c), "r" (a)
        : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
    );
#else
    __asm__ __volatile__(
        " movl $38, %%edx # 2*c = 38 = 2^256 \n\t"
        /* 38 * a[4..7], chaining each product's high half into the next. */
        " mulx 32(%1), %%r8, %%r9 # c*C[4] \n\t"
        " mulx 40(%1), %%r10, %%r11 # c*C[5] \n\t" " addq %%r9, %%r10 \n\t"
        " mulx 48(%1), %%r12, %%r13 # c*C[6] \n\t" " adcq %%r11, %%r12 \n\t"
        " mulx 56(%1), %%rax, %%rcx # c*C[7] \n\t" " adcq %%r13, %%rax \n\t"
        " adcq $0, %%rcx \n\t"

        /* Add the low limbs a[0..3]; rcx accumulates the 2^256 overflow. */
        " addq (%1), %%r8 \n\t"
        " adcq 8(%1), %%r10 \n\t"
        " adcq 16(%1), %%r12 \n\t" " movq %%r12, 16(%0) \n\t"
        " adcq 24(%1), %%rax \n\t" " movq %%rax, 24(%0) \n\t"
        " adcq $0, %%rcx \n\t"

        /* Fold the overflow: (c[0],c[1]) += 38 * overflow. */
        " mulx %%rcx, %%rax, %%rcx \n\t"
        " addq %%rax, %%r8 \n\t" " movq %%r8, (%0) \n\t"
        " adcq %%rcx, %%r10 \n\t" " movq %%r10, 8(%0) \n\t"
        :
        : "r" (c), "r" (a)
        : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13"
    );
#endif
#else /* Without BMI2 */
    /**
     * TODO: Multiplications using MULQ instruction.
     **/
#endif
}
|
656
|
+
|
657
|
+
/**
 * Addition in Fp25519: c = a + b (mod 2^255-19), weakly reduced.
 *
 * Adds the 4-limb operands; a carry out of the top limb represents
 * 2^256 == 38 (mod 2^255-19), so 38 is added back into the low limb
 * (rcx = 38 * carry via the branch-free setc/neg/and sequence).
 *
 * NOTE(review): the final `addq %%rcx, %%rax` can itself carry when
 * c[0] >= 2^64-38; that carry is not propagated, so the output is only
 * weakly reduced -- the ladder tolerates this, and
 * fred_EltFp25519_1w_x64 performs the final normalisation.
 *
 * Fixes vs. the original:
 *  - `#if __ADX__` -> `#ifdef __ADX__`: consistent with the other
 *    feature guards in this file and clean under -Wundef (behaviour is
 *    unchanged: compilers define __ADX__ to 1 when ADX is enabled).
 *  - dropped `inline`: under C99/C11 inline semantics a lone `inline`
 *    definition emits no external definition, leaving cross-TU callers
 *    dependent on the header's non-inline declaration; a plain
 *    definition is unambiguous and backward-compatible.
 */
void add_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b)
{
#ifdef __ADX__
    __asm__ __volatile__(
        "movq (%2), %%rax \n\t"
        "movq 8(%2), %%rcx \n\t"
        "movq 16(%2), %%r8 \n\t"
        "movq 24(%2), %%r9 \n\t"
        "clc \n\t"
        "adcx (%1), %%rax \n\t"
        "adcx 8(%1), %%rcx \n\t"
        "adcx 16(%1), %%r8 \n\t"
        "adcx 24(%1), %%r9 \n\t"
        "movq %%rcx, 8(%0) \n\t"
        "movq %%r8 , 16(%0) \n\t"
        "movq %%r9 , 24(%0) \n\t"
        /* rcx = 38 if the addition carried out of limb 3, else 0. */
        "setc %%cl \n\t"
        "neg %%rcx \n\t"
        "andq $38, %%rcx \n\t"
        "addq %%rcx, %%rax \n\t"
        "movq %%rax, (%0) \n\t"
        :
        : "r" (c), "r" (a), "r" (b)
        : "memory","cc", "%rax", "%rcx", "%r8", "%r9"
    );
#else
    __asm__ __volatile__(
        "movq (%2), %%rax \n\t"
        "movq 8(%2), %%rcx \n\t"
        "movq 16(%2), %%r8 \n\t"
        "movq 24(%2), %%r9 \n\t"
        "add (%1), %%rax \n\t"
        "adc 8(%1), %%rcx \n\t"
        "adc 16(%1), %%r8 \n\t"
        "adc 24(%1), %%r9 \n\t"
        "movq %%rcx, 8(%0) \n\t"
        "movq %%r8 , 16(%0) \n\t"
        "movq %%r9 , 24(%0) \n\t"
        /* rcx = 38 if the addition carried out of limb 3, else 0. */
        "setc %%cl \n\t"
        "neg %%rcx \n\t"
        "andq $38, %%rcx \n\t"
        "addq %%rcx, %%rax \n\t"
        "movq %%rax, (%0) \n\t"
        :
        : "r" (c), "r" (a), "r" (b)
        : "memory","cc", "%rax", "%rcx", "%r8", "%r9"
    );
#endif
}
|
706
|
+
|
707
|
+
/**
 * Subtraction in Fp25519: c = a - b (mod 2^255-19), weakly reduced.
 *
 * Computes the 256-bit difference a - b.  If the subtraction borrows
 * (a < b as 256-bit integers) the computed value is a - b + 2^256,
 * which is congruent to a - b + 38 modulo 2^255-19, so 38 is
 * subtracted from the low limb to compensate.
 *
 * The setc/neg/and sequence builds rcx = 38 * borrow branch-free:
 * setc writes only %cl, but every set bit of 38 lies in the low byte,
 * so the stale upper bits of rcx are masked away by the andq.
 *
 * NOTE(review): the compensating `subq %%rcx, %%rax` can itself borrow
 * out of c[0]; that borrow is not propagated, so the result is only
 * weakly reduced (fred_EltFp25519_1w_x64 performs the final fix-up).
 *
 * The __restrict qualifiers promise c, a and b do not alias.
 */
inline void sub_EltFp25519_1w_x64(uint64_t *const __restrict c, uint64_t *const __restrict a,
                                  uint64_t *const __restrict b)
{
    __asm__ __volatile__(
        /* Load a[0..3]. */
        "movq (%1), %%rax \n\t"
        "movq 8(%1), %%rcx \n\t"
        "movq 16(%1), %%r8 \n\t"
        "movq 24(%1), %%r9 \n\t"
        /* Subtract b[0..3] with a borrow chain. */
        "subq (%2), %%rax \n\t"
        "sbbq 8(%2), %%rcx \n\t"
        "sbbq 16(%2), %%r8 \n\t"
        "sbbq 24(%2), %%r9 \n\t"
        "movq %%rcx, 8(%0) \n\t"
        "movq %%r8 , 16(%0) \n\t"
        "movq %%r9 , 24(%0) \n\t"
        /* rcx = 38 if the subtraction borrowed, else 0 (movq does not
         * touch CF, so the borrow flag survives the three stores). */
        "setc %%cl \n\t"
        "neg %%rcx \n\t"
        "andq $38, %%rcx \n\t"
        "subq %%rcx, %%rax \n\t"
        "movq %%rax, (%0) \n\t"
        :
        : "r" (c), "r" (a), "r" (b)
        : "memory","cc", "%rax", "%rcx", "%r8", "%r9"
    );
}
|
732
|
+
|
733
|
+
/**
 * Multiplication by the curve constant: c = a * a24 (mod 2^255-19),
 * where a24 = (A+2)/4 = (486662+2)/4 = 121666 for Curve25519.
 *
 * The 4-limb input is multiplied by the 64-bit constant with MULX; the
 * limb of overflow past 2^256 (r11) is folded back via 2^256 == 38.
 * As with the other routines here the result is weakly reduced.
 * NOTE(review): the final adcq into 24(%0) discards any carry out --
 * same weak-reduction assumption as add/sub; confirm input bounds.
 *
 * Requires BMI2 (MULX).  NOTE(review): when __BMI2__ is not defined
 * the body is an empty TODO and c is left untouched.
 *
 * Fix vs. the original: added the missing "memory" clobber.  The asm
 * reads a[0..3] and read-modify-writes c[0..3] through pointers that
 * are passed only as register operands, so without "memory" the
 * compiler was free to cache or reorder accesses to those locations
 * across the asm (every sibling routine in this file declares it).
 */
inline void mul_a24_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a)
{
#ifdef __BMI2__
    const uint64_t a24 = 121666;
    __asm__ __volatile__(
        "movq %2, %%rdx \n\t"
        /* a24 * a[i]: low halves go straight to c, highs held in r8-r11. */
        "mulx (%1), %%rax, %%r8 \n\t"
        "mulx 8(%1), %%rcx, %%r9 \n\t"
        "movq %%rax, (%0) \n\t"
        "movq %%rcx, 8(%0) \n\t"
        "mulx 16(%1), %%rax, %%r10 \n\t"
        "mulx 24(%1), %%rcx, %%r11 \n\t"
        "movq %%rax, 16(%0) \n\t"
        "movq %%rcx, 24(%0) \n\t"
        /* Fold: c += (38*r11, r8, r9, r10); r11 is the 2^256 overflow
         * and 38*r11 < 2^64 since r11 < 121666, so its high half is 0. */
        "movq $38, %%rdx \n\t"
        "mulx %%r11, %%rax, %%rcx \n\t"
        "addq %%rax, (%0) \n\t"
        "adcq %%r8, 8(%0) \n\t"
        "adcq %%r9, 16(%0) \n\t"
        "adcq %%r10, 24(%0) \n\t"
        :
        : "r" (c), "r" (a), "r" (a24)
        : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
    );
#else /* Without BMI2 */
    /**
     * TODO: Multiplications using MULQ instruction.
     **/
#endif
}
|
766
|
+
|
767
|
+
/**
 * Multiplicative inverse in Fp25519 via Fermat's little theorem:
 *   pC = pA^(p-2) mod p,  with p = 2^255-19, so p-2 = 2^255-21.
 *
 * Uses the standard curve25519 addition chain, building powers whose
 * exponents are runs of ones (x^(2^k - 1) for k = 5,10,20,40,50,100,
 * 200,250) and finishing with x^11 * (x^(2^250-1))^(2^5) = x^(2^255-21).
 *
 * pC: output, 4 limbs.  pA: input, 4 limbs.
 * NOTE(review): pC is written early in the chain (T[1]), so pC == pA
 * aliasing does not look supported -- confirm callers never alias.
 */
void inv_EltFp25519_1w_x64(uint64_t *const pC, uint64_t *const pA)
{
/* Repeated in-place squaring: a <- a^(2^times). */
#define sqrn_EltFp25519_1w_x64(a,times)\
    counter = times;\
    while(counter-- > 0)\
    {\
        sqr_EltFp25519_1w_x64(a);\
    }

    /* NOTE(review): buffer_1w appears unused here, but
     * mul/sqr_EltFp25519_1w_x64 are presumably macros that expand to
     * use it as scratch -- verify in the header before removing. */
    EltFp25519_1w_Buffer_x64 buffer_1w;
    EltFp25519_1w_x64 x0, x1, x2;
    uint64_t * T[5];
    uint64_t counter;

    T[0] = x0;
    T[1] = pC; /* x^(-1) */
    T[2] = x1;
    T[3] = x2;
    T[4] = pA; /* x (assigned for symmetry; never read below) */

    copy_EltFp25519_1w_x64(T[1],pA);
    sqrn_EltFp25519_1w_x64(T[1],1);          /* T1 = x^2 */
    copy_EltFp25519_1w_x64(T[2],T[1]);
    sqrn_EltFp25519_1w_x64(T[2],2);          /* T2 = x^8 */
    mul_EltFp25519_1w_x64(T[0], pA, T[2]);   /* T0 = x^9 */
    mul_EltFp25519_1w_x64(T[1], T[1], T[0]); /* T1 = x^11 */
    copy_EltFp25519_1w_x64(T[2],T[1]);
    sqrn_EltFp25519_1w_x64(T[2],1);          /* T2 = x^22 */
    mul_EltFp25519_1w_x64(T[0], T[0], T[2]); /* T0 = x^31 = x^(2^5-1) */
    copy_EltFp25519_1w_x64(T[2],T[0]);
    sqrn_EltFp25519_1w_x64(T[2],5);
    mul_EltFp25519_1w_x64(T[0], T[0], T[2]); /* T0 = x^(2^10-1) */
    copy_EltFp25519_1w_x64(T[2],T[0]);
    sqrn_EltFp25519_1w_x64(T[2],10);
    mul_EltFp25519_1w_x64(T[2], T[2], T[0]); /* T2 = x^(2^20-1) */
    copy_EltFp25519_1w_x64(T[3],T[2]);
    sqrn_EltFp25519_1w_x64(T[3],20);
    mul_EltFp25519_1w_x64(T[3], T[3], T[2]); /* T3 = x^(2^40-1) */
    sqrn_EltFp25519_1w_x64(T[3],10);
    mul_EltFp25519_1w_x64(T[3], T[3], T[0]); /* T3 = x^(2^50-1) */
    copy_EltFp25519_1w_x64(T[0],T[3]);
    sqrn_EltFp25519_1w_x64(T[0],50);
    mul_EltFp25519_1w_x64(T[0], T[0], T[3]); /* T0 = x^(2^100-1) */
    copy_EltFp25519_1w_x64(T[2],T[0]);
    sqrn_EltFp25519_1w_x64(T[2],100);
    mul_EltFp25519_1w_x64(T[2], T[2], T[0]); /* T2 = x^(2^200-1) */
    sqrn_EltFp25519_1w_x64(T[2],50);
    mul_EltFp25519_1w_x64(T[2], T[2], T[3]); /* T2 = x^(2^250-1) */
    sqrn_EltFp25519_1w_x64(T[2],5);
    mul_EltFp25519_1w_x64(T[1], T[1], T[2]); /* pC = x^11 * x^(2^255-2^5) = x^(p-2) */
#undef sqrn_EltFp25519_1w_x64
}
|
819
|
+
|
820
|
+
/**
 * Final reduction step: clear bit 255 and fold it back into the value.
 *
 * If bit 255 of c is set then c >= 2^255, and 2^255 == 19 (mod
 * 2^255-19), so the bit is cleared and 19 added.  After this c < 2^255.
 * NOTE(review): values in [2^255-19, 2^255) are still not canonical
 * (below the prime) -- the original had the same gap; confirm against
 * the serialisation path before tightening.
 *
 * Fixes vs. the original:
 *  - the carry out of c[0] += 19 is now propagated through c[1..3];
 *    previously it was silently dropped whenever c[0] > 2^64-20;
 *  - the bit-255 mask is derived with unsigned arithmetic instead of
 *    an implementation-defined arithmetic right shift of a negative
 *    value reached through an int64_t* cast;
 *  - dropped `inline` so the definition is external under C99/C11
 *    inline semantics even without the header declaration in view.
 */
void fred_EltFp25519_1w_x64(uint64_t *const c)
{
    /* mask = all-ones iff bit 255 (top bit of limb 3) is set. */
    const uint64_t mask = 0 - (c[3] >> 63);
    c[3] &= ((uint64_t)1 << 63) - 1;

    /* c += 19 & mask, propagating carries across all four limbs. */
    const uint64_t add19 = 19 & mask;
    uint64_t carry;
    c[0] += add19;
    carry = (c[0] < add19);   /* wrapped iff result < addend */
    c[1] += carry;
    carry = (c[1] < carry);
    c[2] += carry;
    carry = (c[2] < carry);
    c[3] += carry;            /* cannot overflow: c[3] < 2^63 here */
}
|
826
|
+
|