x25519 0.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +12 -0
- data/.rspec +5 -0
- data/.rubocop.yml +32 -0
- data/.travis.yml +12 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +11 -0
- data/LICENSE +165 -0
- data/README.md +67 -0
- data/Rakefile +11 -0
- data/ext/x25519/bytes.c +42 -0
- data/ext/x25519/bytes.h +25 -0
- data/ext/x25519/fp25519_x64.c +826 -0
- data/ext/x25519/fp25519_x64.h +91 -0
- data/ext/x25519/random.c +51 -0
- data/ext/x25519/random.h +24 -0
- data/ext/x25519/rfc7748_precomputed.h +49 -0
- data/ext/x25519/rfc7748_precomputed.c +20 -0
- data/ext/x25519/table_ladder_x25519.h +277 -0
- data/ext/x25519/x25519_x64.c +244 -0
- data/lib/x25519.rb +7 -0
- data/lib/x25519/version.rb +5 -0
- data/x25519.gemspec +28 -0
- metadata +82 -0
data/ext/x25519/bytes.h
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
/**
|
2
|
+
* Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>.
|
3
|
+
* Institute of Computing.
|
4
|
+
* University of Campinas, Brazil.
|
5
|
+
*
|
6
|
+
* This program is free software: you can redistribute it and/or modify
|
7
|
+
* it under the terms of the GNU Lesser General Public License as
|
8
|
+
* published by the Free Software Foundation, version 3.
|
9
|
+
*
|
10
|
+
* This program is distributed in the hope that it will be useful, but
|
11
|
+
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
13
|
+
* Lesser General Public License for more details.
|
14
|
+
*
|
15
|
+
* You should have received a copy of the GNU Lesser General Public License
|
16
|
+
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
17
|
+
*/
|
18
|
+
#ifndef BYTES_H
|
19
|
+
#define BYTES_H
|
20
|
+
|
21
|
+
#include <stdint.h>
|
22
|
+
void print_bytes(uint8_t * A, int num_bytes);
|
23
|
+
int compare_bytes(uint8_t* A, uint8_t* B,unsigned int num_bytes);
|
24
|
+
|
25
|
+
#endif /* BYTES_H */
|
@@ -0,0 +1,826 @@
|
|
1
|
+
/**
|
2
|
+
* Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>.
|
3
|
+
* Institute of Computing.
|
4
|
+
* University of Campinas, Brazil.
|
5
|
+
*
|
6
|
+
* This program is free software: you can redistribute it and/or modify
|
7
|
+
* it under the terms of the GNU Lesser General Public License as
|
8
|
+
* published by the Free Software Foundation, version 3.
|
9
|
+
*
|
10
|
+
* This program is distributed in the hope that it will be useful, but
|
11
|
+
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
13
|
+
* Lesser General Public License for more details.
|
14
|
+
*
|
15
|
+
* You should have received a copy of the GNU Lesser General Public License
|
16
|
+
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
17
|
+
*/
|
18
|
+
#include "random.h"
|
19
|
+
#include "bytes.h"
|
20
|
+
#include "fp25519_x64.h"
|
21
|
+
|
22
|
+
void random_EltFp25519_1w_x64(uint64_t *A)
|
23
|
+
{
|
24
|
+
random_bytes((uint8_t*)A,SIZE_ELEMENT_BYTES);
|
25
|
+
A[3] &= ((uint64_t)1<<63)-1;
|
26
|
+
}
|
27
|
+
|
28
|
+
int compare_EltFp25519_1w_x64(uint64_t *A, uint64_t *B)
|
29
|
+
{
|
30
|
+
return compare_bytes((uint8_t*)A,(uint8_t*)B,SIZE_ELEMENT_BYTES);
|
31
|
+
}
|
32
|
+
|
33
|
+
void print_EltFp25519_1w_x64(uint64_t *A)
|
34
|
+
{
|
35
|
+
print_bytes((uint8_t*)A,SIZE_ELEMENT_BYTES);
|
36
|
+
}
|
37
|
+
|
38
|
+
/**
|
39
|
+
*
|
40
|
+
* @param c Two 512-bit products: c[0:7]=a[0:3]*b[0:3] and c[8:15]=a[4:7]*b[4:7]
|
41
|
+
* @param a Two 256-bit integers: a[0:3] and a[4:7]
|
42
|
+
* @param b Two 256-bit integers: b[0:3] and b[4:7]
|
43
|
+
*/
|
44
|
+
void mul2_256x256_integer_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b)
|
45
|
+
{
|
46
|
+
#ifdef __BMI2__
|
47
|
+
#ifdef __ADX__
|
48
|
+
__asm__ __volatile__(
|
49
|
+
"movq (%1), %%rdx # A[0] \n\t"
|
50
|
+
"mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "movq %%r8, (%0) \n\t"
|
51
|
+
"mulx 8(%2), %%r10, %%r11 # A[0]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "movq %%r10, 8(%0) \n\t"
|
52
|
+
"mulx 16(%2), %%r12, %%r13 # A[0]*B[2] \n\t" "adox %%r11, %%r12 \n\t"
|
53
|
+
"mulx 24(%2), %%r14, %%rdx # A[0]*B[3] \n\t" "adox %%r13, %%r14 \n\t" " movq $0, %%rax \n\t"
|
54
|
+
"adox %%rdx, %%rax \n\t"
|
55
|
+
|
56
|
+
"movq 8(%1), %%rdx # A[1] \n\t"
|
57
|
+
"mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 8(%0), %%r8 \n\t" "movq %%r8, 8(%0) \n\t"
|
58
|
+
"mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 16(%0) \n\t"
|
59
|
+
"mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
|
60
|
+
"mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
|
61
|
+
"adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
|
62
|
+
|
63
|
+
"movq 16(%1), %%rdx # A[2] \n\t"
|
64
|
+
"mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 16(%0), %%r8 \n\t" "movq %%r8, 16(%0) \n\t"
|
65
|
+
"mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 24(%0) \n\t"
|
66
|
+
"mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
|
67
|
+
"mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
|
68
|
+
"adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
|
69
|
+
|
70
|
+
"movq 24(%1), %%rdx # A[3] \n\t"
|
71
|
+
"mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 24(%0), %%r8 \n\t" "movq %%r8, 24(%0) \n\t"
|
72
|
+
"mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 32(%0) \n\t"
|
73
|
+
"mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" "movq %%r12, 40(%0) \n\t" " movq $0, %%r8 \n\t"
|
74
|
+
"mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" "movq %%r14, 48(%0) \n\t" " movq $0, %%rax \n\t"
|
75
|
+
"adox %%rdx, %%rax \n\t" "adcx %%r8, %%rax \n\t" " movq %%rax, 56(%0) \n\t"
|
76
|
+
|
77
|
+
"movq 32(%1), %%rdx # A[0] \n\t"
|
78
|
+
"mulx 32(%2), %%r8, %%r9 # A[0]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "movq %%r8, 64(%0) \n\t"
|
79
|
+
"mulx 40(%2), %%r10, %%r11 # A[0]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "movq %%r10, 72(%0) \n\t"
|
80
|
+
"mulx 48(%2), %%r12, %%r13 # A[0]*B[2] \n\t" "adox %%r11, %%r12 \n\t"
|
81
|
+
"mulx 56(%2), %%r14, %%rdx # A[0]*B[3] \n\t" "adox %%r13, %%r14 \n\t" " movq $0, %%rax \n\t"
|
82
|
+
"adox %%rdx, %%rax \n\t"
|
83
|
+
|
84
|
+
"movq 40(%1), %%rdx # A[1] \n\t"
|
85
|
+
"mulx 32(%2), %%r8, %%r9 # A[1]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 72(%0), %%r8 \n\t" "movq %%r8, 72(%0) \n\t"
|
86
|
+
"mulx 40(%2), %%r10, %%r11 # A[1]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 80(%0) \n\t"
|
87
|
+
"mulx 48(%2), %%r12, %%r13 # A[1]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
|
88
|
+
"mulx 56(%2), %%r14, %%rdx # A[1]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
|
89
|
+
"adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
|
90
|
+
|
91
|
+
"movq 48(%1), %%rdx # A[2] \n\t"
|
92
|
+
"mulx 32(%2), %%r8, %%r9 # A[2]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 80(%0), %%r8 \n\t" "movq %%r8, 80(%0) \n\t"
|
93
|
+
"mulx 40(%2), %%r10, %%r11 # A[2]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 88(%0) \n\t"
|
94
|
+
"mulx 48(%2), %%r12, %%r13 # A[2]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
|
95
|
+
"mulx 56(%2), %%r14, %%rdx # A[2]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
|
96
|
+
"adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
|
97
|
+
|
98
|
+
"movq 56(%1), %%rdx # A[3] \n\t"
|
99
|
+
"mulx 32(%2), %%r8, %%r9 # A[3]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 88(%0), %%r8 \n\t" "movq %%r8, 88(%0) \n\t"
|
100
|
+
"mulx 40(%2), %%r10, %%r11 # A[3]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 96(%0) \n\t"
|
101
|
+
"mulx 48(%2), %%r12, %%r13 # A[3]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" "movq %%r12, 104(%0) \n\t" " movq $0, %%r8 \n\t"
|
102
|
+
"mulx 56(%2), %%r14, %%rdx # A[3]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" "movq %%r14, 112(%0) \n\t" " movq $0, %%rax \n\t"
|
103
|
+
"adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t" " movq %%rax, 120(%0) \n\t"
|
104
|
+
:
|
105
|
+
: "r" (c), "r" (a), "r" (b)
|
106
|
+
: "memory", "cc", "%rax", "%rdx",
|
107
|
+
"%r8", "%r9", "%r10", "%r11",
|
108
|
+
"%r12", "%r13", "%r14"
|
109
|
+
);
|
110
|
+
#else
|
111
|
+
__asm__ __volatile__(
|
112
|
+
"movq (%1), %%rdx # A[0] \n\t"
|
113
|
+
"mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t" "movq %%r8, (%0) \n\t"
|
114
|
+
"mulx 8(%2), %%r10, %%rax # A[0]*B[1] \n\t" "addq %%r10, %%r9 \n\t" "movq %%r9, 8(%0) \n\t"
|
115
|
+
"mulx 16(%2), %%r12, %%rbx # A[0]*B[2] \n\t" "adcq %%r12, %%rax \n\t"
|
116
|
+
"mulx 24(%2), %%r14, %%rcx # A[0]*B[3] \n\t" "adcq %%r14, %%rbx \n\t"
|
117
|
+
"adcq $0, %%rcx \n\t"
|
118
|
+
|
119
|
+
"movq 8(%1), %%rdx # A[1] \n\t"
|
120
|
+
"mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t"
|
121
|
+
"mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
|
122
|
+
"mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
|
123
|
+
"mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
|
124
|
+
"adcq $0, %%rdx \n\t"
|
125
|
+
|
126
|
+
"addq %%r8, 8(%0) \n\t"
|
127
|
+
"adcq %%rax, %%r9 \n\t" "movq %%r9, 16(%0) \n\t" "movq $0, %%rax \n\t"
|
128
|
+
"adcq %%r11, %%rbx \n\t"
|
129
|
+
"adcq %%r13, %%rcx \n\t"
|
130
|
+
"adcq %%rdx, %%rax \n\t"
|
131
|
+
|
132
|
+
"movq 16(%1), %%rdx # A[2] \n\t"
|
133
|
+
"mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t"
|
134
|
+
"mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
|
135
|
+
"mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
|
136
|
+
"mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
|
137
|
+
"adcq $0, %%rdx \n\t"
|
138
|
+
|
139
|
+
"addq %%r8, 16(%0) \n\t"
|
140
|
+
"adcq %%rbx, %%r9 \n\t" "movq %%r9, 24(%0) \n\t" "movq $0, %%rbx \n\t"
|
141
|
+
"adcq %%r11, %%rcx \n\t"
|
142
|
+
"adcq %%r13, %%rax \n\t"
|
143
|
+
"adcq %%rdx, %%rbx \n\t"
|
144
|
+
|
145
|
+
"movq 24(%1), %%rdx # A[3] \n\t"
|
146
|
+
"mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t"
|
147
|
+
"mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
|
148
|
+
"mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
|
149
|
+
"mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
|
150
|
+
"adcq $0, %%rdx \n\t"
|
151
|
+
|
152
|
+
"addq %%r8, 24(%0) \n\t"
|
153
|
+
"adcq %%rcx, %%r9 \n\t" "movq %%r9, 32(%0) \n\t" " movq $0, %%rcx \n\t"
|
154
|
+
"adcq %%r11, %%rax \n\t" "movq %%rax, 40(%0) \n\t"
|
155
|
+
"adcq %%r13, %%rbx \n\t" "movq %%rbx, 48(%0) \n\t"
|
156
|
+
"adcq %%rdx, %%rcx \n\t" "movq %%rcx, 56(%0) \n\t"
|
157
|
+
|
158
|
+
"movq 32(%1), %%rdx # A[0] \n\t"
|
159
|
+
"mulx 32(%2), %%r8, %%r9 # A[0]*B[0] \n\t" "movq %%r8, 64(%0) \n\t"
|
160
|
+
"mulx 40(%2), %%r10, %%rax # A[0]*B[1] \n\t" "addq %%r10, %%r9 \n\t" "movq %%r9, 72(%0) \n\t"
|
161
|
+
"mulx 48(%2), %%r12, %%rbx # A[0]*B[2] \n\t" "adcq %%r12, %%rax \n\t"
|
162
|
+
"mulx 56(%2), %%r14, %%rcx # A[0]*B[3] \n\t" "adcq %%r14, %%rbx \n\t"
|
163
|
+
"adcq $0, %%rcx \n\t"
|
164
|
+
|
165
|
+
"movq 40(%1), %%rdx # A[1] \n\t"
|
166
|
+
"mulx 32(%2), %%r8, %%r9 # A[1]*B[0] \n\t"
|
167
|
+
"mulx 40(%2), %%r10, %%r11 # A[1]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
|
168
|
+
"mulx 48(%2), %%r12, %%r13 # A[1]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
|
169
|
+
"mulx 56(%2), %%r14, %%rdx # A[1]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
|
170
|
+
"adcq $0, %%rdx \n\t"
|
171
|
+
|
172
|
+
"addq %%r8, 72(%0) \n\t"
|
173
|
+
"adcq %%rax, %%r9 \n\t" " movq %%r9, 80(%0) \n\t" " movq $0, %%rax \n\t"
|
174
|
+
"adcq %%r11, %%rbx \n\t"
|
175
|
+
"adcq %%r13, %%rcx \n\t"
|
176
|
+
"adcq %%rdx, %%rax \n\t"
|
177
|
+
|
178
|
+
"movq 48(%1), %%rdx # A[2] \n\t"
|
179
|
+
"mulx 32(%2), %%r8, %%r9 # A[2]*B[0] \n\t"
|
180
|
+
"mulx 40(%2), %%r10, %%r11 # A[2]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
|
181
|
+
"mulx 48(%2), %%r12, %%r13 # A[2]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
|
182
|
+
"mulx 56(%2), %%r14, %%rdx # A[2]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
|
183
|
+
"adcq $0, %%rdx \n\t"
|
184
|
+
|
185
|
+
"addq %%r8, 80(%0) \n\t"
|
186
|
+
"adcq %%rbx, %%r9 \n\t" " movq %%r9, 88(%0) \n\t" " movq $0, %%rbx \n\t"
|
187
|
+
"adcq %%r11, %%rcx \n\t"
|
188
|
+
"adcq %%r13, %%rax \n\t"
|
189
|
+
"adcq %%rdx, %%rbx \n\t"
|
190
|
+
|
191
|
+
"movq 56(%1), %%rdx # A[3] \n\t"
|
192
|
+
"mulx 32(%2), %%r8, %%r9 # A[3]*B[0] \n\t"
|
193
|
+
"mulx 40(%2), %%r10, %%r11 # A[3]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
|
194
|
+
"mulx 48(%2), %%r12, %%r13 # A[3]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
|
195
|
+
"mulx 56(%2), %%r14, %%rdx # A[3]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
|
196
|
+
"adcq $0, %%rdx \n\t"
|
197
|
+
|
198
|
+
"addq %%r8, 88(%0) \n\t"
|
199
|
+
"adcq %%rcx, %%r9 \n\t" "movq %%r9, 96(%0) \n\t" " movq $0, %%rcx \n\t"
|
200
|
+
"adcq %%r11, %%rax \n\t" "movq %%rax, 104(%0) \n\t"
|
201
|
+
"adcq %%r13, %%rbx \n\t" "movq %%rbx, 112(%0) \n\t"
|
202
|
+
"adcq %%rdx, %%rcx \n\t" "movq %%rcx, 120(%0) \n\t"
|
203
|
+
:
|
204
|
+
: "r" (c), "r" (a), "r" (b)
|
205
|
+
: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8",
|
206
|
+
"%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
|
207
|
+
);
|
208
|
+
#endif
|
209
|
+
#else /* Without BMI2 */
|
210
|
+
/**
|
211
|
+
* TODO: Multiplications using MULQ instruction.
|
212
|
+
**/
|
213
|
+
#endif
|
214
|
+
}
|
215
|
+
|
216
|
+
/**
|
217
|
+
*
|
218
|
+
* @param c
|
219
|
+
* @param a
|
220
|
+
*/
|
221
|
+
void sqr2_256x256_integer_x64(uint64_t *const c, uint64_t *const a)
|
222
|
+
{
|
223
|
+
#ifdef __BMI2__
|
224
|
+
__asm__ __volatile__(
|
225
|
+
"movq (%1), %%rdx # A[0] \n\t"
|
226
|
+
"mulx %%rdx, %%r8, %%r9 # A[0]^2 \n\t"
|
227
|
+
"movq 8(%1), %%rdx # A[1] \n\t"
|
228
|
+
"mulx %%rdx, %%r10, %%r11 # A[1]^2 \n\t"
|
229
|
+
"movq %%r8, (%0) \n\t"
|
230
|
+
"movq %%r9, 8(%0) \n\t"
|
231
|
+
"movq %%r10, 16(%0) \n\t"
|
232
|
+
"movq %%r11, 24(%0) \n\t"
|
233
|
+
|
234
|
+
"movq 16(%1), %%rdx # A[2] \n\t"
|
235
|
+
"mulx %%rdx, %%r8, %%r9 # A[2]^2 \n\t"
|
236
|
+
"movq 24(%1), %%rdx # A[3] \n\t"
|
237
|
+
"mulx %%rdx, %%r10, %%r11 # A[3]^2 \n\t"
|
238
|
+
"movq %%r8, 32(%0) \n\t"
|
239
|
+
"movq %%r9, 40(%0) \n\t"
|
240
|
+
"movq %%r10, 48(%0) \n\t"
|
241
|
+
"movq %%r11, 56(%0) \n\t"
|
242
|
+
|
243
|
+
"movq 8(%1), %%rdx # A[1] \n\t"
|
244
|
+
"mulx (%1), %%r8, %%r9 # A[0]*A[1] \n\t"
|
245
|
+
"mulx 16(%1), %%r10, %%r11 # A[2]*A[1] \n\t"
|
246
|
+
"mulx 24(%1), %%rcx, %%r14 # A[3]*A[1] \n\t"
|
247
|
+
|
248
|
+
"movq 16(%1), %%rdx # A[2] \n\t"
|
249
|
+
"mulx 24(%1), %%r12, %%r13 # A[3]*A[2] \n\t"
|
250
|
+
"mulx (%1), %%rax, %%rdx # A[0]*A[2] \n\t"
|
251
|
+
|
252
|
+
"addq %%rax, %%r9 \n\t"
|
253
|
+
"adcq %%rdx, %%r10 \n\t"
|
254
|
+
"adcq %%rcx, %%r11 \n\t"
|
255
|
+
"adcq %%r14, %%r12 \n\t"
|
256
|
+
"adcq $0, %%r13 \n\t"
|
257
|
+
"movq $0, %%r14 \n\t"
|
258
|
+
"adcq $0, %%r14 \n\t"
|
259
|
+
|
260
|
+
"movq (%1), %%rdx # A[0] \n\t"
|
261
|
+
"mulx 24(%1), %%rax, %%rdx # A[0]*A[3] \n\t"
|
262
|
+
|
263
|
+
"addq %%rax, %%r10 \n\t"
|
264
|
+
"adcq %%rdx, %%r11 \n\t"
|
265
|
+
"adcq $0, %%r12 \n\t"
|
266
|
+
"adcq $0, %%r13 \n\t"
|
267
|
+
"adcq $0, %%r14 \n\t"
|
268
|
+
|
269
|
+
"shldq $1, %%r13, %%r14 \n\t"
|
270
|
+
"shldq $1, %%r12, %%r13 \n\t"
|
271
|
+
"shldq $1, %%r11, %%r12 \n\t"
|
272
|
+
"shldq $1, %%r10, %%r11 \n\t"
|
273
|
+
"shldq $1, %%r9, %%r10 \n\t"
|
274
|
+
"shldq $1, %%r8, %%r9 \n\t"
|
275
|
+
"shlq $1, %%r8 \n\t"
|
276
|
+
|
277
|
+
"addq 8(%0), %%r8 \n\t" "movq %%r8, 8(%0) \n\t"
|
278
|
+
"adcq 16(%0), %%r9 \n\t" "movq %%r9, 16(%0) \n\t"
|
279
|
+
"adcq 24(%0), %%r10 \n\t" "movq %%r10, 24(%0) \n\t"
|
280
|
+
"adcq 32(%0), %%r11 \n\t" "movq %%r11, 32(%0) \n\t"
|
281
|
+
"adcq 40(%0), %%r12 \n\t" "movq %%r12, 40(%0) \n\t"
|
282
|
+
"adcq 48(%0), %%r13 \n\t" "movq %%r13, 48(%0) \n\t"
|
283
|
+
"adcq 56(%0), %%r14 \n\t" "movq %%r14, 56(%0) \n\t"
|
284
|
+
|
285
|
+
|
286
|
+
"movq 32(%1), %%rdx # A[0] \n\t"
|
287
|
+
"mulx %%rdx, %%r8, %%r9 # A[0]^2 \n\t"
|
288
|
+
"movq 40(%1), %%rdx # A[1] \n\t"
|
289
|
+
"mulx %%rdx, %%r10, %%r11 # A[1]^2 \n\t"
|
290
|
+
"movq %%r8, 64(%0) \n\t"
|
291
|
+
"movq %%r9, 72(%0) \n\t"
|
292
|
+
"movq %%r10, 80(%0) \n\t"
|
293
|
+
"movq %%r11, 88(%0) \n\t"
|
294
|
+
|
295
|
+
"movq 48(%1), %%rdx # A[2] \n\t"
|
296
|
+
"mulx %%rdx, %%r8, %%r9 # A[2]^2 \n\t"
|
297
|
+
"movq 56(%1), %%rdx # A[3] \n\t"
|
298
|
+
"mulx %%rdx, %%r10, %%r11 # A[3]^2 \n\t"
|
299
|
+
"movq %%r8, 96(%0) \n\t"
|
300
|
+
"movq %%r9, 104(%0) \n\t"
|
301
|
+
"movq %%r10, 112(%0) \n\t"
|
302
|
+
"movq %%r11, 120(%0) \n\t"
|
303
|
+
|
304
|
+
"movq 40(%1), %%rdx # A[1] \n\t"
|
305
|
+
"mulx 32(%1), %%r8, %%r9 # A[0]*A[1] \n\t"
|
306
|
+
"mulx 48(%1), %%r10, %%r11 # A[2]*A[1] \n\t"
|
307
|
+
"mulx 56(%1), %%rcx, %%r14 # A[3]*A[1] \n\t"
|
308
|
+
|
309
|
+
"movq 48(%1), %%rdx # A[2] \n\t"
|
310
|
+
"mulx 56(%1), %%r12, %%r13 # A[3]*A[2] \n\t"
|
311
|
+
"mulx 32(%1), %%rax, %%rdx # A[0]*A[2] \n\t"
|
312
|
+
|
313
|
+
"addq %%rax, %%r9 \n\t"
|
314
|
+
"adcq %%rdx, %%r10 \n\t"
|
315
|
+
"adcq %%rcx, %%r11 \n\t"
|
316
|
+
"adcq %%r14, %%r12 \n\t"
|
317
|
+
"adcq $0, %%r13 \n\t"
|
318
|
+
"movq $0, %%r14 \n\t"
|
319
|
+
"adcq $0, %%r14 \n\t"
|
320
|
+
|
321
|
+
"movq 32(%1), %%rdx # A[0] \n\t"
|
322
|
+
"mulx 56(%1), %%rax, %%rdx # A[0]*A[3] \n\t"
|
323
|
+
|
324
|
+
"addq %%rax, %%r10 \n\t"
|
325
|
+
"adcq %%rdx, %%r11 \n\t"
|
326
|
+
"adcq $0, %%r12 \n\t"
|
327
|
+
"adcq $0, %%r13 \n\t"
|
328
|
+
"adcq $0, %%r14 \n\t"
|
329
|
+
|
330
|
+
"shldq $1, %%r13, %%r14 \n\t"
|
331
|
+
"shldq $1, %%r12, %%r13 \n\t"
|
332
|
+
"shldq $1, %%r11, %%r12 \n\t"
|
333
|
+
"shldq $1, %%r10, %%r11 \n\t"
|
334
|
+
"shldq $1, %%r9, %%r10 \n\t"
|
335
|
+
"shldq $1, %%r8, %%r9 \n\t"
|
336
|
+
"shlq $1, %%r8 \n\t"
|
337
|
+
|
338
|
+
"addq 72(%0), %%r8 \n\t" "movq %%r8, 72(%0) \n\t"
|
339
|
+
"adcq 80(%0), %%r9 \n\t" "movq %%r9, 80(%0) \n\t"
|
340
|
+
"adcq 88(%0), %%r10 \n\t" "movq %%r10, 88(%0) \n\t"
|
341
|
+
"adcq 96(%0), %%r11 \n\t" "movq %%r11, 96(%0) \n\t"
|
342
|
+
"adcq 104(%0), %%r12 \n\t" "movq %%r12, 104(%0) \n\t"
|
343
|
+
"adcq 112(%0), %%r13 \n\t" "movq %%r13, 112(%0) \n\t"
|
344
|
+
"adcq 120(%0), %%r14 \n\t" "movq %%r14, 120(%0) \n\t"
|
345
|
+
:
|
346
|
+
: "r" (c), "r" (a)
|
347
|
+
: "cc", "%rax", "%rcx", "%rdx",
|
348
|
+
"%r8", "%r9", "%r10", "%r11",
|
349
|
+
"%r12", "%r13", "%r14"
|
350
|
+
);
|
351
|
+
#else /* Without BMI2 */
|
352
|
+
/**
|
353
|
+
* TODO: Multiplications using MULQ instruction.
|
354
|
+
**/
|
355
|
+
#endif
|
356
|
+
}
|
357
|
+
|
358
|
+
/**
|
359
|
+
*
|
360
|
+
* @param c
|
361
|
+
* @param a
|
362
|
+
*/
|
363
|
+
void red_EltFp25519_2w_x64(uint64_t *const c, uint64_t *const a)
|
364
|
+
{
|
365
|
+
#ifdef __BMI2__
|
366
|
+
#ifdef __ADX__
|
367
|
+
__asm__ __volatile__(
|
368
|
+
" movl $38, %%edx # 2*c = 38 = 2^256 \n\t"
|
369
|
+
" mulx 32(%1), %%r8, %%r10 # c*C[4] \n\t" " xorl %%ebx, %%ebx \n\t" " adox (%1), %%r8 \n\t"
|
370
|
+
" mulx 40(%1), %%r9, %%r11 # c*C[5] \n\t" " adcx %%r10, %%r9 \n\t" " adox 8(%1), %%r9 \n\t"
|
371
|
+
" mulx 48(%1), %%r10, %%rax # c*C[6] \n\t" " adcx %%r11, %%r10 \n\t" " adox 16(%1), %%r10 \n\t" " movq %%r10, 16(%0) \n\t"
|
372
|
+
" mulx 56(%1), %%r11, %%rcx # c*C[7] \n\t" " adcx %%rax, %%r11 \n\t" " adox 24(%1), %%r11 \n\t" " movq %%r11, 24(%0) \n\t"
|
373
|
+
" adcx %%rbx, %%rcx \n\t" " adox %%rbx, %%rcx \n\t"
|
374
|
+
" xorl %%ebx, %%ebx \n\t"
|
375
|
+
" mulx %%rcx, %%rax, %%rcx \n\t" " adcx %%rax, %%r8 \n\t" " movq %%r8, (%0) \n\t"
|
376
|
+
" adcx %%rcx, %%r9 \n\t" " movq %%r9, 8(%0) \n\t"
|
377
|
+
|
378
|
+
" mulx 96(%1), %%r8, %%r10 # c*C[4] \n\t" " xorl %%ebx, %%ebx \n\t" " adox 64(%1), %%r8 \n\t"
|
379
|
+
" mulx 104(%1), %%r9, %%r11 # c*C[5] \n\t" " adcx %%r10, %%r9 \n\t" " adox 72(%1), %%r9 \n\t"
|
380
|
+
" mulx 112(%1), %%r10, %%rax # c*C[6] \n\t" " adcx %%r11, %%r10 \n\t" " adox 80(%1), %%r10 \n\t" " movq %%r10, 48(%0) \n\t"
|
381
|
+
" mulx 120(%1), %%r11, %%rcx # c*C[7] \n\t" " adcx %%rax, %%r11 \n\t" " adox 88(%1), %%r11 \n\t" " movq %%r11, 56(%0) \n\t"
|
382
|
+
" adcx %%rbx, %%rcx \n\t" " adox %%rbx, %%rcx \n\t"
|
383
|
+
" xorl %%ebx, %%ebx \n\t"
|
384
|
+
" mulx %%rcx, %%rax, %%rcx \n\t" " adcx %%rax, %%r8 \n\t" " movq %%r8, 32(%0) \n\t"
|
385
|
+
" adcx %%rcx, %%r9 \n\t" " movq %%r9, 40(%0) \n\t"
|
386
|
+
:
|
387
|
+
: "r" (c), "r" (a)
|
388
|
+
: "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
|
389
|
+
);
|
390
|
+
#else
|
391
|
+
__asm__ __volatile__(
|
392
|
+
"movl $38, %%edx # 2*c = 38 = 2^256 \n\t"
|
393
|
+
"mulx 32(%1), %%r8, %%r9 # c*C[4] \n\t"
|
394
|
+
"mulx 40(%1), %%r10, %%r11 # c*C[5] \n\t" "addq %%r9, %%r10 \n\t"
|
395
|
+
"mulx 48(%1), %%r12, %%r13 # c*C[6] \n\t" "adcq %%r11, %%r12 \n\t"
|
396
|
+
"mulx 56(%1), %%rax, %%rcx # c*C[7] \n\t" "adcq %%r13, %%rax \n\t"
|
397
|
+
"adcq $0, %%rcx \n\t"
|
398
|
+
|
399
|
+
"addq (%1), %%r8 \n\t"
|
400
|
+
"adcq 8(%1), %%r10 \n\t"
|
401
|
+
"adcq 16(%1), %%r12 \n\t" "movq %%r12, 16(%0) \n\t"
|
402
|
+
"adcq 24(%1), %%rax \n\t" "movq %%rax, 24(%0) \n\t"
|
403
|
+
"adcq $0, %%rcx \n\t"
|
404
|
+
|
405
|
+
"mulx %%rcx, %%rax, %%rcx \n\t"
|
406
|
+
"addq %%rax, %%r8 \n\t" "movq %%r8, (%0) \n\t"
|
407
|
+
"adcq %%rcx, %%r10 \n\t" "movq %%r10, 8(%0) \n\t"
|
408
|
+
|
409
|
+
"mulx 96(%1), %%r8, %%r9 # c*C[4] \n\t"
|
410
|
+
"mulx 104(%1), %%r10, %%r11 # c*C[5] \n\t" "addq %%r9, %%r10 \n\t"
|
411
|
+
"mulx 112(%1), %%r12, %%r13 # c*C[6] \n\t" "adcq %%r11, %%r12 \n\t"
|
412
|
+
"mulx 120(%1), %%rax, %%rcx # c*C[7] \n\t" "adcq %%r13, %%rax \n\t"
|
413
|
+
"adcq $0, %%rcx \n\t"
|
414
|
+
|
415
|
+
"addq 64(%1), %%r8 \n\t"
|
416
|
+
"adcq 72(%1), %%r10 \n\t"
|
417
|
+
"adcq 80(%1), %%r12 \n\t" "movq %%r12, 48(%0) \n\t"
|
418
|
+
"adcq 88(%1), %%rax \n\t" "movq %%rax, 56(%0) \n\t"
|
419
|
+
"adcq $0, %%rcx \n\t"
|
420
|
+
|
421
|
+
"mulx %%rcx, %%rax, %%rcx \n\t"
|
422
|
+
"addq %%rax, %%r8 \n\t" " movq %%r8, 32(%0) \n\t"
|
423
|
+
"adcq %%rcx, %%r10 \n\t" " movq %%r10, 40(%0) \n\t"
|
424
|
+
|
425
|
+
:
|
426
|
+
: "r" (c), "r" (a)
|
427
|
+
: "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13"
|
428
|
+
);
|
429
|
+
#endif
|
430
|
+
#else /* Without BMI2 */
|
431
|
+
/* [TODO] */
|
432
|
+
#endif
|
433
|
+
}
|
434
|
+
|
435
|
+
void mul_256x256_integer_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b)
|
436
|
+
{
|
437
|
+
#ifdef __BMI2__
|
438
|
+
#ifdef __ADX__
|
439
|
+
__asm__ __volatile__(
|
440
|
+
" movq (%1), %%rdx # A[0] \n\t"
|
441
|
+
" mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t" " xorl %%r10d, %%r10d \n\t" " movq %%r8, (%0) \n\t"
|
442
|
+
" mulx 8(%2), %%r10, %%r11 # A[0]*B[1] \n\t" " adox %%r9, %%r10 \n\t" " movq %%r10, 8(%0) \n\t"
|
443
|
+
" mulx 16(%2), %%r12, %%r13 # A[0]*B[2] \n\t" " adox %%r11, %%r12 \n\t"
|
444
|
+
" mulx 24(%2), %%r14, %%rdx # A[0]*B[3] \n\t" " adox %%r13, %%r14 \n\t" " movq $0, %%rax \n\t"
|
445
|
+
" adox %%rdx, %%rax \n\t"
|
446
|
+
|
447
|
+
" movq 8(%1), %%rdx # A[1] \n\t"
|
448
|
+
" mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t" " xorl %%r10d, %%r10d \n\t" " adcx 8(%0), %%r8 \n\t" " movq %%r8, 8(%0) \n\t"
|
449
|
+
" mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t" " adox %%r9, %%r10 \n\t" " adcx %%r12, %%r10 \n\t" " movq %%r10, 16(%0) \n\t"
|
450
|
+
" mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t" " adox %%r11, %%r12 \n\t" " adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
|
451
|
+
" mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t" " adox %%r13, %%r14 \n\t" " adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
|
452
|
+
" adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
|
453
|
+
|
454
|
+
" movq 16(%1), %%rdx # A[2] \n\t"
|
455
|
+
" mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t" " xorl %%r10d, %%r10d \n\t" " adcx 16(%0), %%r8 \n\t" " movq %%r8, 16(%0) \n\t"
|
456
|
+
" mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t" " adox %%r9, %%r10 \n\t" " adcx %%r12, %%r10 \n\t" " movq %%r10, 24(%0) \n\t"
|
457
|
+
" mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t" " adox %%r11, %%r12 \n\t" " adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
|
458
|
+
" mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t" " adox %%r13, %%r14 \n\t" " adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
|
459
|
+
" adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
|
460
|
+
|
461
|
+
" movq 24(%1), %%rdx # A[3] \n\t"
|
462
|
+
" mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t" " xorl %%r10d, %%r10d \n\t" " adcx 24(%0), %%r8 \n\t" " movq %%r8, 24(%0) \n\t"
|
463
|
+
" mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t" " adox %%r9, %%r10 \n\t" " adcx %%r12, %%r10 \n\t" " movq %%r10, 32(%0) \n\t"
|
464
|
+
" mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t" " adox %%r11, %%r12 \n\t" " adcx %%r14, %%r12 \n\t" " movq %%r12, 40(%0) \n\t" " movq $0, %%r8 \n\t"
|
465
|
+
" mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t" " adox %%r13, %%r14 \n\t" " adcx %%rax, %%r14 \n\t" " movq %%r14, 48(%0) \n\t" " movq $0, %%rax \n\t"
|
466
|
+
" adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t" " movq %%rax, 56(%0) \n\t"
|
467
|
+
:
|
468
|
+
: "r" (c), "r" (a), "r" (b)
|
469
|
+
: "memory", "cc", "%rax", "%rdx",
|
470
|
+
"%r8", "%r9", "%r10", "%r11",
|
471
|
+
"%r12", "%r13", "%r14"
|
472
|
+
);
|
473
|
+
#else
|
474
|
+
__asm__ __volatile__(
|
475
|
+
" movq (%1), %%rdx # A[0] \n\t"
|
476
|
+
" mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t" " movq %%r8, (%0) \n\t"
|
477
|
+
" mulx 8(%2), %%r10, %%rax # A[0]*B[1] \n\t" " addq %%r10, %%r9 \n\t" " movq %%r9, 8(%0) \n\t"
|
478
|
+
" mulx 16(%2), %%r12, %%rbx # A[0]*B[2] \n\t" " adcq %%r12, %%rax \n\t"
|
479
|
+
" mulx 24(%2), %%r14, %%rcx # A[0]*B[3] \n\t" " adcq %%r14, %%rbx \n\t"
|
480
|
+
" adcq $0, %%rcx \n\t"
|
481
|
+
|
482
|
+
" movq 8(%1), %%rdx # A[1] \n\t"
|
483
|
+
" mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t"
|
484
|
+
" mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t" " addq %%r10, %%r9 \n\t"
|
485
|
+
" mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t" " adcq %%r12, %%r11 \n\t"
|
486
|
+
" mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t" " adcq %%r14, %%r13 \n\t"
|
487
|
+
" adcq $0, %%rdx \n\t"
|
488
|
+
|
489
|
+
" addq %%r8, 8(%0) \n\t"
|
490
|
+
" adcq %%rax, %%r9 \n\t" " movq %%r9, 16(%0) \n\t" " movq $0, %%rax \n\t"
|
491
|
+
" adcq %%r11, %%rbx \n\t"
|
492
|
+
" adcq %%r13, %%rcx \n\t"
|
493
|
+
" adcq %%rdx, %%rax \n\t"
|
494
|
+
|
495
|
+
" movq 16(%1), %%rdx # A[2] \n\t"
|
496
|
+
" mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t"
|
497
|
+
" mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t" " addq %%r10, %%r9 \n\t"
|
498
|
+
" mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t" " adcq %%r12, %%r11 \n\t"
|
499
|
+
" mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t" " adcq %%r14, %%r13 \n\t"
|
500
|
+
" adcq $0, %%rdx \n\t"
|
501
|
+
|
502
|
+
" addq %%r8, 16(%0) \n\t"
|
503
|
+
" adcq %%rbx, %%r9 \n\t" " movq %%r9, 24(%0) \n\t" " movq $0, %%rbx \n\t"
|
504
|
+
" adcq %%r11, %%rcx \n\t"
|
505
|
+
" adcq %%r13, %%rax \n\t"
|
506
|
+
" adcq %%rdx, %%rbx \n\t"
|
507
|
+
|
508
|
+
" movq 24(%1), %%rdx # A[3] \n\t"
|
509
|
+
" mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t"
|
510
|
+
" mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t" " addq %%r10, %%r9 \n\t"
|
511
|
+
" mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t" " adcq %%r12, %%r11 \n\t"
|
512
|
+
" mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t" " adcq %%r14, %%r13 \n\t"
|
513
|
+
" adcq $0, %%rdx \n\t"
|
514
|
+
|
515
|
+
" addq %%r8, 24(%0) \n\t"
|
516
|
+
" adcq %%rcx, %%r9 \n\t" " movq %%r9, 32(%0) \n\t" " movq $0, %%rcx \n\t"
|
517
|
+
" adcq %%r11, %%rax \n\t" " movq %%rax, 40(%0) \n\t"
|
518
|
+
" adcq %%r13, %%rbx \n\t" " movq %%rbx, 48(%0) \n\t"
|
519
|
+
" adcq %%rdx, %%rcx \n\t" " movq %%rcx, 56(%0) \n\t"
|
520
|
+
:
|
521
|
+
: "r" (c), "r" (a), "r" (b)
|
522
|
+
: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8",
|
523
|
+
"%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
|
524
|
+
);
|
525
|
+
#endif
|
526
|
+
#else /* Without BMI2 */
|
527
|
+
/**
|
528
|
+
* TODO: Multiplications using MULQ instruction.
|
529
|
+
**/
|
530
|
+
#endif
|
531
|
+
}
|
532
|
+
|
533
|
+
void sqr_256x256_integer_x64(uint64_t *const c, uint64_t *const a)
|
534
|
+
{
|
535
|
+
#ifdef __BMI2__
|
536
|
+
__asm__ __volatile__(
|
537
|
+
" movq (%1), %%rdx # A[0] \n\t"
|
538
|
+
" mulx %%rdx, %%r8, %%r9 # A[0]^2 \n\t"
|
539
|
+
" movq 8(%1), %%rdx # A[1] \n\t"
|
540
|
+
" mulx %%rdx, %%r10, %%r11 # A[1]^2 \n\t"
|
541
|
+
" movq %%r8, (%0) \n\t"
|
542
|
+
" movq %%r9, 8(%0) \n\t"
|
543
|
+
" movq %%r10, 16(%0) \n\t"
|
544
|
+
" movq %%r11, 24(%0) \n\t"
|
545
|
+
|
546
|
+
" movq 16(%1), %%rdx # A[2] \n\t"
|
547
|
+
" mulx %%rdx, %%r8, %%r9 # A[2]^2 \n\t"
|
548
|
+
" movq 24(%1), %%rdx # A[3] \n\t"
|
549
|
+
" mulx %%rdx, %%r10, %%r11 # A[3]^2 \n\t"
|
550
|
+
" movq %%r8, 32(%0) \n\t"
|
551
|
+
" movq %%r9, 40(%0) \n\t"
|
552
|
+
" movq %%r10, 48(%0) \n\t"
|
553
|
+
" movq %%r11, 56(%0) \n\t"
|
554
|
+
|
555
|
+
" movq 8(%1), %%rdx # A[1] \n\t"
|
556
|
+
" mulx (%1), %%r8, %%r9 # A[0]*A[1] \n\t"
|
557
|
+
" mulx 16(%1), %%r10, %%r11 # A[2]*A[1] \n\t"
|
558
|
+
" mulx 24(%1), %%rcx, %%r14 # A[3]*A[1] \n\t"
|
559
|
+
|
560
|
+
" movq 16(%1), %%rdx # A[2] \n\t"
|
561
|
+
" mulx 24(%1), %%r12, %%r13 # A[3]*A[2] \n\t"
|
562
|
+
" mulx (%1), %%rax, %%rdx # A[0]*A[2] \n\t"
|
563
|
+
|
564
|
+
" addq %%rax, %%r9 \n\t"
|
565
|
+
" adcq %%rdx, %%r10 \n\t"
|
566
|
+
" adcq %%rcx, %%r11 \n\t"
|
567
|
+
" adcq %%r14, %%r12 \n\t"
|
568
|
+
" adcq $0, %%r13 \n\t"
|
569
|
+
" movq $0, %%r14 \n\t"
|
570
|
+
" adcq $0, %%r14 \n\t"
|
571
|
+
|
572
|
+
" movq (%1), %%rdx # A[0] \n\t"
|
573
|
+
" mulx 24(%1), %%rax, %%rdx # A[0]*A[3] \n\t"
|
574
|
+
|
575
|
+
" addq %%rax, %%r10 \n\t"
|
576
|
+
" adcq %%rdx, %%r11 \n\t"
|
577
|
+
" adcq $0, %%r12 \n\t"
|
578
|
+
" adcq $0, %%r13 \n\t"
|
579
|
+
" adcq $0, %%r14 \n\t"
|
580
|
+
|
581
|
+
" shldq $1, %%r13, %%r14 \n\t"
|
582
|
+
" shldq $1, %%r12, %%r13 \n\t"
|
583
|
+
" shldq $1, %%r11, %%r12 \n\t"
|
584
|
+
" shldq $1, %%r10, %%r11 \n\t"
|
585
|
+
" shldq $1, %%r9, %%r10 \n\t"
|
586
|
+
" shldq $1, %%r8, %%r9 \n\t"
|
587
|
+
" shlq $1, %%r8 \n\t"
|
588
|
+
|
589
|
+
" addq 8(%0), %%r8 \n\t" " movq %%r8, 8(%0) \n\t"
|
590
|
+
" adcq 16(%0), %%r9 \n\t" " movq %%r9, 16(%0) \n\t"
|
591
|
+
" adcq 24(%0), %%r10 \n\t" " movq %%r10, 24(%0) \n\t"
|
592
|
+
" adcq 32(%0), %%r11 \n\t" " movq %%r11, 32(%0) \n\t"
|
593
|
+
" adcq 40(%0), %%r12 \n\t" " movq %%r12, 40(%0) \n\t"
|
594
|
+
" adcq 48(%0), %%r13 \n\t" " movq %%r13, 48(%0) \n\t"
|
595
|
+
" adcq 56(%0), %%r14 \n\t" " movq %%r14, 56(%0) \n\t"
|
596
|
+
:
|
597
|
+
: "r" (c), "r" (a)
|
598
|
+
: "memory", "cc", "%rax", "%rcx", "%rdx",
|
599
|
+
"%r8", "%r9", "%r10", "%r11",
|
600
|
+
"%r12", "%r13", "%r14"
|
601
|
+
);
|
602
|
+
#else /* Without BMI2 */
|
603
|
+
/**
|
604
|
+
* TODO: Multiplications using MULQ instruction.
|
605
|
+
**/
|
606
|
+
#endif
|
607
|
+
}
|
608
|
+
|
609
|
+
/**
 * Weak (partial) reduction modulo p = 2^255-19.
 *
 * Folds a 512-bit product stored in a[0..7] into a 256-bit result c[0..3]
 * using the congruence 2^256 == 38 (mod p) (see the "2*c = 38 = 2^256"
 * comment in the asm): the high four limbs are multiplied by 38 and added
 * to the low four, and the final carry is folded in the same way.
 * The result may still be >= p; canonical form is produced later by
 * fred_EltFp25519_1w_x64.
 *
 * c: output, 4 limbs (may alias a's low limbs only if the caller knows
 *    the asm's store order is safe — NOTE(review): no __restrict here,
 *    but stores to (%0) happen after the corresponding loads from (%1)).
 * a: input, 8 limbs; not modified.
 *
 * Requires BMI2 (mulx). Without BMI2 the function body is empty
 * (upstream TODO) and the output is NOT written.
 */
void red_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a)
{
#ifdef __BMI2__
#ifdef __ADX__
    /* ADX path: uses the two independent carry chains (adcx -> CF for the
     * 38*high partial products, adox -> OF for adding the low limbs),
     * which lets the two additions interleave without flag conflicts. */
    __asm__ __volatile__(
        " movl    $38, %%edx  # 2*c = 38 = 2^256                   \n\t"
        " mulx 32(%1),  %%r8, %%r10 # c*C[4]   \n\t" " xorl %%ebx, %%ebx \n\t" " adox   (%1),  %%r8  \n\t"
        " mulx 40(%1),  %%r9, %%r11 # c*C[5]   \n\t" " adcx %%r10,  %%r9 \n\t" " adox  8(%1),  %%r9  \n\t"
        " mulx 48(%1), %%r10, %%rax # c*C[6]   \n\t" " adcx %%r11, %%r10 \n\t" " adox 16(%1), %%r10  \n\t" " movq %%r10, 16(%0) \n\t"
        " mulx 56(%1), %%r11, %%rcx # c*C[7]   \n\t" " adcx %%rax, %%r11 \n\t" " adox 24(%1), %%r11  \n\t" " movq %%r11, 24(%0) \n\t"
        /* rcx accumulates both carry chains' final carries. */
        " adcx %%rbx, %%rcx  \n\t" " adox %%rbx, %%rcx  \n\t"
        " xorl %%ebx, %%ebx  \n\t"
        /* Fold the final carry word back in: rcx*38 added to the low limbs. */
        " mulx %%rcx, %%rax, %%rcx \n\t" " adcx %%rax,  %%r8 \n\t" " movq  %%r8,   (%0) \n\t"
        " adcx %%rcx,  %%r9 \n\t" " movq  %%r9,  8(%0) \n\t"
        :
        : "r" (c), "r" (a)
        : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
    );
#else
    /* BMI2-only path: mulx for the 38*high products, a single add/adc
     * carry chain, then the same carry-fold via 38. */
    __asm__ __volatile__(
        " movl    $38, %%edx  # 2*c = 38 = 2^256                   \n\t"
        " mulx 32(%1),  %%r8,  %%r9 # c*C[4]   \n\t"
        " mulx 40(%1), %%r10, %%r11 # c*C[5]   \n\t" " addq  %%r9, %%r10 \n\t"
        " mulx 48(%1), %%r12, %%r13 # c*C[6]   \n\t" " adcq %%r11, %%r12 \n\t"
        " mulx 56(%1), %%rax, %%rcx # c*C[7]   \n\t" " adcq %%r13, %%rax \n\t"
        " adcq    $0, %%rcx  \n\t"

        /* Add the low half a[0..3]. */
        " addq   (%1),  %%r8 \n\t"
        " adcq  8(%1), %%r10 \n\t"
        " adcq 16(%1), %%r12 \n\t" " movq %%r12, 16(%0) \n\t"
        " adcq 24(%1), %%rax \n\t" " movq %%rax, 24(%0) \n\t"
        " adcq    $0, %%rcx  \n\t"

        /* Fold the final carry word back in: rcx*38. */
        " mulx %%rcx, %%rax, %%rcx \n\t"
        " addq %%rax,  %%r8 \n\t" " movq  %%r8,   (%0) \n\t"
        " adcq %%rcx, %%r10 \n\t" " movq %%r10,  8(%0) \n\t"
        :
        : "r" (c), "r" (a)
        /* NOTE(review): %rbx is listed as clobbered but not used in this
         * path — harmless, presumably copied from the ADX variant. */
        : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13"
    );
#endif
#else /* Without BMI2 */
    /**
     * TODO: Multiplications using MULQ instruction.
     **/
#endif
}
|
656
|
+
|
657
|
+
/**
 * Addition in GF(2^255-19): c = a + b (partial reduction).
 *
 * The 256-bit carry-out of the limb addition is folded back via
 * 2^256 == 38 (mod p): setc/neg/and builds a mask that is 38 when the
 * addition carried and 0 otherwise, which is then added to the low limb.
 * The result may still be >= p; fred_EltFp25519_1w_x64 canonicalizes it.
 *
 * c: output, 4 limbs; a, b: inputs, 4 limbs (not modified).
 *
 * Changes vs. original:
 *  - '#if __ADX__' -> '#ifdef __ADX__': consistent with the other guards
 *    in this file and clean under -Wundef (same behavior: an undefined
 *    macro evaluates to 0 in '#if').
 *  - dropped 'inline': under C99/C11 semantics a plain 'inline'
 *    definition emits no external definition, which breaks linking from
 *    other translation units unless a non-inline declaration is in
 *    scope.  Callers are unaffected.
 */
void add_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b)
{
#ifdef __ADX__
    __asm__ __volatile__(
        "movq     (%2),  %%rax \n\t"
        "movq    8(%2),  %%rcx \n\t"
        "movq   16(%2),  %%r8  \n\t"
        "movq   24(%2),  %%r9  \n\t"
        "clc                   \n\t"   /* adcx uses CF; start the chain clean */
        "adcx     (%1),  %%rax \n\t"
        "adcx    8(%1),  %%rcx \n\t"
        "adcx   16(%1),  %%r8  \n\t"
        "adcx   24(%1),  %%r9  \n\t"
        "movq    %%rcx,  8(%0) \n\t"
        "movq    %%r8 , 16(%0) \n\t"
        "movq    %%r9 , 24(%0) \n\t"
        "setc    %%cl          \n\t"   /* cl = carry-out of the 256-bit add  */
        "neg     %%rcx         \n\t"   /* low byte -> 0x00 or 0xFF...        */
        "andq     $38,   %%rcx \n\t"   /* ...so rcx = 0 or 38                */
        "addq    %%rcx,  %%rax \n\t"
        "movq    %%rax,   (%0) \n\t"
        :
        : "r" (c), "r" (a), "r" (b)
        : "memory","cc", "%rax", "%rcx", "%r8", "%r9"
    );
#else
    __asm__ __volatile__(
        "movq     (%2),  %%rax \n\t"
        "movq    8(%2),  %%rcx \n\t"
        "movq   16(%2),  %%r8  \n\t"
        "movq   24(%2),  %%r9  \n\t"
        "add      (%1),  %%rax \n\t"
        "adc     8(%1),  %%rcx \n\t"
        "adc    16(%1),  %%r8  \n\t"
        "adc    24(%1),  %%r9  \n\t"
        "movq    %%rcx,  8(%0) \n\t"
        "movq    %%r8 , 16(%0) \n\t"
        "movq    %%r9 , 24(%0) \n\t"
        "setc    %%cl          \n\t"   /* cl = carry-out of the 256-bit add  */
        "neg     %%rcx         \n\t"
        "andq     $38,   %%rcx \n\t"   /* rcx = 38 iff the addition carried  */
        "addq    %%rcx,  %%rax \n\t"
        "movq    %%rax,   (%0) \n\t"
        :
        : "r" (c), "r" (a), "r" (b)
        : "memory","cc", "%rax", "%rcx", "%r8", "%r9"
    );
#endif
}
|
706
|
+
|
707
|
+
/**
 * Subtraction in GF(2^255-19): c = a - b (partial reduction).
 *
 * The 256-bit borrow-out is folded back via 2^256 == 38 (mod p):
 * setc/neg/and builds 38 when the subtraction borrowed (i.e. the raw
 * result wrapped around 2^256) and 0 otherwise, and that value is
 * subtracted from the low limb.  The result may still be non-canonical;
 * fred_EltFp25519_1w_x64 produces the canonical representative.
 *
 * c: output, 4 limbs; a, b: inputs, 4 limbs (not modified).
 * __restrict: c must not alias a or b.
 *
 * Note on the setc/neg trick: rcx's upper bytes still hold the stored
 * limb a[1]-b[1] when 'setc %%cl' runs, but 'andq $38' only inspects
 * bits 1,2,5 and two's-complement negation of the low byte depends only
 * on the low byte, so the mask is exact.
 */
inline void sub_EltFp25519_1w_x64(uint64_t *const __restrict c, uint64_t *const __restrict a,
                                  uint64_t *const __restrict b)
{
    __asm__ __volatile__(
        "movq     (%1),  %%rax \n\t"
        "movq    8(%1),  %%rcx \n\t"
        "movq   16(%1),  %%r8  \n\t"
        "movq   24(%1),  %%r9  \n\t"
        "subq     (%2),  %%rax \n\t"
        "sbbq    8(%2),  %%rcx \n\t"
        "sbbq   16(%2),  %%r8  \n\t"
        "sbbq   24(%2),  %%r9  \n\t"
        "movq    %%rcx,  8(%0) \n\t"
        "movq    %%r8 , 16(%0) \n\t"
        "movq    %%r9 , 24(%0) \n\t"
        "setc    %%cl          \n\t"   /* cl = borrow-out of the 256-bit sub */
        "neg     %%rcx         \n\t"
        "andq     $38,   %%rcx \n\t"   /* rcx = 38 iff a - b borrowed        */
        "subq    %%rcx,  %%rax \n\t"
        "movq    %%rax,   (%0) \n\t"
        :
        : "r" (c), "r" (a), "r" (b)
        : "memory","cc", "%rax", "%rcx", "%r8", "%r9"
    );
}
|
732
|
+
|
733
|
+
/**
 * Multiplication by the curve constant a24: c = a24 * a (mod 2^255-19),
 * partially reduced, where a24 = (A+2)/4 = (486662+2)/4 = 121666.
 *
 * The 64x256-bit product is computed with mulx; the single overflow limb
 * (r11) is folded back via 2^256 == 38 (mod p).
 *
 * c: output, 4 limbs; a: input, 4 limbs (not modified).
 *
 * Changes vs. original:
 *  - added "memory" to the clobber list: the asm stores through (%0)
 *    and performs read-modify-write 'adcq' on (%0); without "memory"
 *    the compiler is free to cache or reorder accesses to *c around
 *    the asm.  Every sibling asm block in this file already lists it.
 *  - dropped 'inline': under C99/C11 semantics a plain 'inline'
 *    definition emits no external definition.  Callers are unaffected.
 *
 * Requires BMI2 (mulx). Without BMI2 the function body is empty
 * (upstream TODO) and the output is NOT written.
 */
void mul_a24_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a)
{
#ifdef __BMI2__
    /**
     * a24 = (A+2)/4 = (486662+2)/4 = 121666
     **/
    const uint64_t a24 = 121666;
    __asm__ __volatile__(
        "movq     %2,    %%rdx  \n\t"
        "mulx     (%1),  %%rax, %%r8  \n\t"
        "mulx    8(%1),  %%rcx, %%r9  \n\t"
        "movq    %%rax,   (%0)  \n\t"
        "movq    %%rcx,  8(%0)  \n\t"
        "mulx   16(%1),  %%rax, %%r10 \n\t"
        "mulx   24(%1),  %%rcx, %%r11 \n\t"
        "movq    %%rax, 16(%0)  \n\t"
        "movq    %%rcx, 24(%0)  \n\t"
        /* Fold the overflow limb r11 back in: r11*38 added across c. */
        "movq     $38,   %%rdx  \n\t"
        "mulx    %%r11,  %%rax, %%rcx \n\t"
        "addq    %%rax,   (%0)  \n\t"
        "adcq    %%r8,   8(%0)  \n\t"
        "adcq    %%r9,  16(%0)  \n\t"
        "adcq    %%r10, 24(%0)  \n\t"
        :
        : "r" (c), "r" (a), "r" (a24)
        : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
    );
#else /* Without BMI2 */
    /**
     * TODO: Multiplications using MULQ instruction.
     **/
#endif
}
|
766
|
+
|
767
|
+
/**
 * Inversion in GF(2^255-19) by Fermat's little theorem:
 * pC = pA^(p-2) = pA^(2^255-21) mod p, computed with the standard
 * Curve25519 addition chain (254 squarings + 11 multiplications).
 *
 * pC: output, 4 limbs; pA: input, 4 limbs (not modified).
 *
 * sqr_/mul_/copy_EltFp25519_1w_x64 are provided elsewhere (header /
 * sibling definitions not visible here).
 */
void inv_EltFp25519_1w_x64(uint64_t *const pC, uint64_t *const pA)
{
/* Repeated squaring in place: a <- a^(2^times). */
#define sqrn_EltFp25519_1w_x64(a,times)\
    counter = times;\
    while(counter-- > 0)\
    {\
        sqr_EltFp25519_1w_x64(a);\
    }

    /* NOTE(review): buffer_1w appears unused in this body, but
     * sqr_/mul_EltFp25519_1w_x64 may be header macros that reference it
     * by name — confirm before removing. */
    EltFp25519_1w_Buffer_x64 buffer_1w;
    EltFp25519_1w_x64 x0, x1, x2;
    uint64_t * T[5];
    uint64_t counter;

    /* Working registers for the addition chain; T[1] and T[4] alias the
     * caller's output and input. */
    T[0] = x0;
    T[1] = pC; /* x^(-1) */
    T[2] = x1;
    T[3] = x2;
    T[4] = pA; /* x */

    copy_EltFp25519_1w_x64(T[1],pA);
    sqrn_EltFp25519_1w_x64(T[1],1);        /* T1 = x^2                      */
    copy_EltFp25519_1w_x64(T[2],T[1]);
    sqrn_EltFp25519_1w_x64(T[2],2);        /* T2 = x^8                      */
    mul_EltFp25519_1w_x64(T[0], pA, T[2]); /* T0 = x^9                      */
    mul_EltFp25519_1w_x64(T[1], T[1], T[0]); /* T1 = x^11                   */
    copy_EltFp25519_1w_x64(T[2],T[1]);
    sqrn_EltFp25519_1w_x64(T[2],1);        /* T2 = x^22                     */
    mul_EltFp25519_1w_x64(T[0], T[0], T[2]); /* T0 = x^(2^5-1)              */
    copy_EltFp25519_1w_x64(T[2],T[0]);
    sqrn_EltFp25519_1w_x64(T[2],5);
    mul_EltFp25519_1w_x64(T[0], T[0], T[2]); /* T0 = x^(2^10-1)             */
    copy_EltFp25519_1w_x64(T[2],T[0]);
    sqrn_EltFp25519_1w_x64(T[2],10);
    mul_EltFp25519_1w_x64(T[2], T[2], T[0]); /* T2 = x^(2^20-1)             */
    copy_EltFp25519_1w_x64(T[3],T[2]);
    sqrn_EltFp25519_1w_x64(T[3],20);
    mul_EltFp25519_1w_x64(T[3], T[3], T[2]); /* T3 = x^(2^40-1)             */
    sqrn_EltFp25519_1w_x64(T[3],10);
    mul_EltFp25519_1w_x64(T[3], T[3], T[0]); /* T3 = x^(2^50-1)             */
    copy_EltFp25519_1w_x64(T[0],T[3]);
    sqrn_EltFp25519_1w_x64(T[0],50);
    mul_EltFp25519_1w_x64(T[0], T[0], T[3]); /* T0 = x^(2^100-1)            */
    copy_EltFp25519_1w_x64(T[2],T[0]);
    sqrn_EltFp25519_1w_x64(T[2],100);
    mul_EltFp25519_1w_x64(T[2], T[2], T[0]); /* T2 = x^(2^200-1)            */
    sqrn_EltFp25519_1w_x64(T[2],50);
    mul_EltFp25519_1w_x64(T[2], T[2], T[3]); /* T2 = x^(2^250-1)            */
    sqrn_EltFp25519_1w_x64(T[2],5);          /* T2 = x^(2^255-32)           */
    mul_EltFp25519_1w_x64(T[1], T[1], T[2]); /* pC = x^(2^255-21) = x^(p-2) */
#undef sqrn_EltFp25519_1w_x64
}
|
819
|
+
|
820
|
+
/**
 * Final reduction step modulo p = 2^255-19.
 *
 * Clears bit 255 and, if it was set, adds 19 to the low limb
 * (2^255 == 19 mod p).  NOTE(review): this canonicalizes only the top
 * bit; inputs in [2^255-19, 2^255) are left unreduced, and the
 * 'c[0] += 19' does not propagate a carry — presumably callers
 * guarantee inputs for which this suffices; confirm against call sites.
 *
 * c: 4 limbs, updated in place.
 *
 * Changes vs. original:
 *  - mask is now built from an unsigned shift (0 - (c[3] >> 63)) instead
 *    of an arithmetic right shift of a negative int64_t, which is
 *    implementation-defined in C; the value is identical on every
 *    two's-complement target, and the int64_t* cast goes away.
 *  - dropped 'inline': under C99/C11 semantics a plain 'inline'
 *    definition emits no external definition.  Callers are unaffected.
 */
void fred_EltFp25519_1w_x64(uint64_t *const c)
{
    const uint64_t mask = 0ULL - (c[3] >> 63); /* all-ones iff bit 255 set */
    c[3] &= ((uint64_t)1 << 63) - 1;           /* clear bit 255            */
    c[0] += 19 & mask;                         /* fold 2^255 == 19 (mod p) */
}
|
826
|
+
|