x25519 1.0.5 → 1.0.9

Sign up to get free protection for your applications and to get access to all the features.
@@ -4,7 +4,7 @@
4
4
 
5
5
  require "mkmf"
6
6
 
7
- $CFLAGS << " -Wall -O3 -pedantic -std=c99 -mbmi -mbmi2 -march=native -mtune=native"
7
+ $CFLAGS << " -Wall -O3 -pedantic -std=c99 -mbmi -mbmi2 -march=haswell"
8
8
 
9
9
  create_makefile "x25519_precomputed"
10
10
 
@@ -1,213 +1,210 @@
1
1
  /**
2
- * Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>.
2
+ * Copyright (c) 2017, Armando Faz <armfazh@ic.unicamp.br>. All rights reserved.
3
3
  * Institute of Computing.
4
4
  * University of Campinas, Brazil.
5
5
  *
6
- * This program is free software: you can redistribute it and/or modify
7
- * it under the terms of the GNU Lesser General Public License as
8
- * published by the Free Software Foundation, version 3.
6
+ * Copyright (C) 2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
7
+ * Copyright (C) 2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
9
8
  *
10
- * This program is distributed in the hope that it will be useful, but
11
- * WITHOUT ANY WARRANTY; without even the implied warranty of
12
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
- * Lesser General Public License for more details.
9
+ * Redistribution and use in source and binary forms, with or without
10
+ * modification, are permitted provided that the following conditions
11
+ * are met:
14
12
  *
15
- * You should have received a copy of the GNU Lesser General Public License
16
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
17
- */
18
- #include "fp25519_x64.h"
19
-
20
- int compare_bytes(uint8_t* A, uint8_t* B,unsigned int num_bytes)
21
- {
22
- unsigned int i=0;
23
- uint8_t ret=0;
24
- for(i=0;i<num_bytes;i++)
25
- {
26
- ret += A[i]^B[i];
27
- }
28
- return ret;
29
- }
13
+ * * Redistributions of source code must retain the above copyright
14
+ * notice, this list of conditions and the following disclaimer.
15
+ * * Redistributions in binary form must reproduce the above
16
+ * copyright notice, this list of conditions and the following
17
+ * disclaimer in the documentation and/or other materials provided
18
+ * with the distribution.
19
+ * * Neither the name of University of Campinas nor the names of its
20
+ * contributors may be used to endorse or promote products derived
21
+ * from this software without specific prior written permission.
22
+ *
23
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
29
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
32
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
33
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
34
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
35
+ */
30
36
 
31
- int compare_EltFp25519_1w_x64(uint64_t *A, uint64_t *B)
32
- {
33
- return compare_bytes((uint8_t*)A,(uint8_t*)B,SIZE_ELEMENT_BYTES);
34
- }
37
+ #include "fp25519_x64.h"
35
38
 
36
39
  /**
37
40
  *
38
- * @param c Two 512-bit products: c[0:7]=a[0:3]*b[0:3] and c[8:15]=a[4:7]*b[4:7]
39
- * @param a Two 256-bit integers: a[0:3] and a[4:7]
40
- * @param b Two 256-bit integers: b[0:3] and b[4:7]
41
+ * @param c Two 512-bit products: c0[0:7]=a0[0:3]*b0[0:3] and c1[8:15]=a1[4:7]*b1[4:7]
42
+ * @param a Two 256-bit integers: a0[0:3] and a1[4:7]
43
+ * @param b Two 256-bit integers: b0[0:3] and b1[4:7]
41
44
  */
42
- void mul2_256x256_integer_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b)
43
- {
45
+ void mul2_256x256_integer_x64(uint64_t *const c, uint64_t *const a,
46
+ uint64_t *const b) {
44
47
  #ifdef __BMI2__
45
48
  #ifdef __ADX__
46
- __asm__ __volatile__(
47
- "movq (%1), %%rdx # A[0] \n\t"
48
- "mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "movq %%r8, (%0) \n\t"
49
- "mulx 8(%2), %%r10, %%r11 # A[0]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "movq %%r10, 8(%0) \n\t"
50
- "mulx 16(%2), %%r12, %%r13 # A[0]*B[2] \n\t" "adox %%r11, %%r12 \n\t"
51
- "mulx 24(%2), %%r14, %%rdx # A[0]*B[3] \n\t" "adox %%r13, %%r14 \n\t" " movq $0, %%rax \n\t"
52
- "adox %%rdx, %%rax \n\t"
53
-
54
- "movq 8(%1), %%rdx # A[1] \n\t"
55
- "mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 8(%0), %%r8 \n\t" "movq %%r8, 8(%0) \n\t"
56
- "mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 16(%0) \n\t"
57
- "mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
58
- "mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
59
- "adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
60
-
61
- "movq 16(%1), %%rdx # A[2] \n\t"
62
- "mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 16(%0), %%r8 \n\t" "movq %%r8, 16(%0) \n\t"
63
- "mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 24(%0) \n\t"
64
- "mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
65
- "mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
66
- "adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
67
-
68
- "movq 24(%1), %%rdx # A[3] \n\t"
69
- "mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 24(%0), %%r8 \n\t" "movq %%r8, 24(%0) \n\t"
70
- "mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 32(%0) \n\t"
71
- "mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" "movq %%r12, 40(%0) \n\t" " movq $0, %%r8 \n\t"
72
- "mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" "movq %%r14, 48(%0) \n\t" " movq $0, %%rax \n\t"
73
- "adox %%rdx, %%rax \n\t" "adcx %%r8, %%rax \n\t" " movq %%rax, 56(%0) \n\t"
74
-
75
- "movq 32(%1), %%rdx # A[0] \n\t"
76
- "mulx 32(%2), %%r8, %%r9 # A[0]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "movq %%r8, 64(%0) \n\t"
77
- "mulx 40(%2), %%r10, %%r11 # A[0]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "movq %%r10, 72(%0) \n\t"
78
- "mulx 48(%2), %%r12, %%r13 # A[0]*B[2] \n\t" "adox %%r11, %%r12 \n\t"
79
- "mulx 56(%2), %%r14, %%rdx # A[0]*B[3] \n\t" "adox %%r13, %%r14 \n\t" " movq $0, %%rax \n\t"
80
- "adox %%rdx, %%rax \n\t"
81
-
82
- "movq 40(%1), %%rdx # A[1] \n\t"
83
- "mulx 32(%2), %%r8, %%r9 # A[1]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 72(%0), %%r8 \n\t" "movq %%r8, 72(%0) \n\t"
84
- "mulx 40(%2), %%r10, %%r11 # A[1]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 80(%0) \n\t"
85
- "mulx 48(%2), %%r12, %%r13 # A[1]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
86
- "mulx 56(%2), %%r14, %%rdx # A[1]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
87
- "adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
88
-
89
- "movq 48(%1), %%rdx # A[2] \n\t"
90
- "mulx 32(%2), %%r8, %%r9 # A[2]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 80(%0), %%r8 \n\t" "movq %%r8, 80(%0) \n\t"
91
- "mulx 40(%2), %%r10, %%r11 # A[2]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 88(%0) \n\t"
92
- "mulx 48(%2), %%r12, %%r13 # A[2]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
93
- "mulx 56(%2), %%r14, %%rdx # A[2]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
94
- "adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
95
-
96
- "movq 56(%1), %%rdx # A[3] \n\t"
97
- "mulx 32(%2), %%r8, %%r9 # A[3]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 88(%0), %%r8 \n\t" "movq %%r8, 88(%0) \n\t"
98
- "mulx 40(%2), %%r10, %%r11 # A[3]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 96(%0) \n\t"
99
- "mulx 48(%2), %%r12, %%r13 # A[3]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" "movq %%r12, 104(%0) \n\t" " movq $0, %%r8 \n\t"
100
- "mulx 56(%2), %%r14, %%rdx # A[3]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" "movq %%r14, 112(%0) \n\t" " movq $0, %%rax \n\t"
101
- "adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t" " movq %%rax, 120(%0) \n\t"
102
- :
103
- : "r" (c), "r" (a), "r" (b)
104
- : "memory", "cc", "%rax", "%rdx",
105
- "%r8", "%r9", "%r10", "%r11",
106
- "%r12", "%r13", "%r14"
107
- );
49
+ __asm__ __volatile__(
50
+ "xorl %%r14d, %%r14d ;"
51
+ "movq (%1), %%rdx; " /* A[0] */
52
+ "mulx (%2), %%r8, %%r12; " /* A[0]*B[0] */ "xorl %%r10d, %%r10d ;" "movq %%r8, (%0) ;"
53
+ "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ "adox %%r10, %%r12 ;"
54
+ "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ "adox %%r8, %%rax ;"
55
+ "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ "adox %%r10, %%rbx ;"
56
+ /*******************************************/ "adox %%r14, %%rcx ;"
57
+
58
+ "movq 8(%1), %%rdx; " /* A[1] */
59
+ "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ "adox %%r12, %%r8 ;" "movq %%r8, 8(%0) ;"
60
+ "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rax ;"
61
+ "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%rbx ;"
62
+ "mulx 24(%2), %%r10, %%r12; " /* A[1]*B[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%rcx ;"
63
+ /*******************************************/ "adox %%r14, %%r12 ;" "adcx %%r14, %%r12 ;"
64
+
65
+ "movq 16(%1), %%rdx; " /* A[2] */ "xorl %%r10d, %%r10d ;"
66
+ "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ "adox %%rax, %%r8 ;" "movq %%r8, 16(%0) ;"
67
+ "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rbx ;"
68
+ "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%rcx ;"
69
+ "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%r12 ;"
70
+ /*******************************************/ "adox %%r14, %%rax ;" "adcx %%r14, %%rax ;"
71
+
72
+ "movq 24(%1), %%rdx; " /* A[3] */ "xorl %%r10d, %%r10d ;"
73
+ "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ "adox %%rbx, %%r8 ;" "movq %%r8, 24(%0) ;"
74
+ "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rcx ;" "movq %%rcx, 32(%0) ;"
75
+ "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%r12 ;" "movq %%r12, 40(%0) ;"
76
+ "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%rax ;" "movq %%rax, 48(%0) ;"
77
+ /*******************************************/ "adox %%r14, %%rbx ;" "adcx %%r14, %%rbx ;" "movq %%rbx, 56(%0) ;"
78
+
79
+ "movq 32(%1), %%rdx; " /* C[0] */
80
+ "mulx 32(%2), %%r8, %%r12; " /* C[0]*D[0] */ "xorl %%r10d, %%r10d ;" "movq %%r8, 64(%0);"
81
+ "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */ "adox %%r10, %%r12 ;"
82
+ "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */ "adox %%r8, %%rax ;"
83
+ "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */ "adox %%r10, %%rbx ;"
84
+ /*******************************************/ "adox %%r14, %%rcx ;"
85
+
86
+ "movq 40(%1), %%rdx; " /* C[1] */ "xorl %%r10d, %%r10d ;"
87
+ "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */ "adox %%r12, %%r8 ;" "movq %%r8, 72(%0);"
88
+ "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rax ;"
89
+ "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%rbx ;"
90
+ "mulx 56(%2), %%r10, %%r12; " /* C[1]*D[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%rcx ;"
91
+ /*******************************************/ "adox %%r14, %%r12 ;" "adcx %%r14, %%r12 ;"
92
+
93
+ "movq 48(%1), %%rdx; " /* C[2] */ "xorl %%r10d, %%r10d ;"
94
+ "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */ "adox %%rax, %%r8 ;" "movq %%r8, 80(%0);"
95
+ "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rbx ;"
96
+ "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%rcx ;"
97
+ "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%r12 ;"
98
+ /*******************************************/ "adox %%r14, %%rax ;" "adcx %%r14, %%rax ;"
99
+
100
+ "movq 56(%1), %%rdx; " /* C[3] */ "xorl %%r10d, %%r10d ;"
101
+ "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */ "adox %%rbx, %%r8 ;" "movq %%r8, 88(%0);"
102
+ "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rcx ;" "movq %%rcx, 96(%0) ;"
103
+ "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%r12 ;" "movq %%r12, 104(%0) ;"
104
+ "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%rax ;" "movq %%rax, 112(%0) ;"
105
+ /*******************************************/ "adox %%r14, %%rbx ;" "adcx %%r14, %%rbx ;" "movq %%rbx, 120(%0) ;"
106
+ :
107
+ : "r" (c), "r" (a), "r" (b)
108
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx",
109
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
110
+ );
108
111
  #else
109
- __asm__ __volatile__(
110
- "movq (%1), %%rdx # A[0] \n\t"
111
- "mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t" "movq %%r8, (%0) \n\t"
112
- "mulx 8(%2), %%r10, %%rax # A[0]*B[1] \n\t" "addq %%r10, %%r9 \n\t" "movq %%r9, 8(%0) \n\t"
113
- "mulx 16(%2), %%r12, %%rbx # A[0]*B[2] \n\t" "adcq %%r12, %%rax \n\t"
114
- "mulx 24(%2), %%r14, %%rcx # A[0]*B[3] \n\t" "adcq %%r14, %%rbx \n\t"
115
- "adcq $0, %%rcx \n\t"
116
-
117
- "movq 8(%1), %%rdx # A[1] \n\t"
118
- "mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t"
119
- "mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
120
- "mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
121
- "mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
122
- "adcq $0, %%rdx \n\t"
123
-
124
- "addq %%r8, 8(%0) \n\t"
125
- "adcq %%rax, %%r9 \n\t" "movq %%r9, 16(%0) \n\t" "movq $0, %%rax \n\t"
126
- "adcq %%r11, %%rbx \n\t"
127
- "adcq %%r13, %%rcx \n\t"
128
- "adcq %%rdx, %%rax \n\t"
129
-
130
- "movq 16(%1), %%rdx # A[2] \n\t"
131
- "mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t"
132
- "mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
133
- "mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
134
- "mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
135
- "adcq $0, %%rdx \n\t"
136
-
137
- "addq %%r8, 16(%0) \n\t"
138
- "adcq %%rbx, %%r9 \n\t" "movq %%r9, 24(%0) \n\t" "movq $0, %%rbx \n\t"
139
- "adcq %%r11, %%rcx \n\t"
140
- "adcq %%r13, %%rax \n\t"
141
- "adcq %%rdx, %%rbx \n\t"
142
-
143
- "movq 24(%1), %%rdx # A[3] \n\t"
144
- "mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t"
145
- "mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
146
- "mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
147
- "mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
148
- "adcq $0, %%rdx \n\t"
149
-
150
- "addq %%r8, 24(%0) \n\t"
151
- "adcq %%rcx, %%r9 \n\t" "movq %%r9, 32(%0) \n\t" " movq $0, %%rcx \n\t"
152
- "adcq %%r11, %%rax \n\t" "movq %%rax, 40(%0) \n\t"
153
- "adcq %%r13, %%rbx \n\t" "movq %%rbx, 48(%0) \n\t"
154
- "adcq %%rdx, %%rcx \n\t" "movq %%rcx, 56(%0) \n\t"
155
-
156
- "movq 32(%1), %%rdx # A[0] \n\t"
157
- "mulx 32(%2), %%r8, %%r9 # A[0]*B[0] \n\t" "movq %%r8, 64(%0) \n\t"
158
- "mulx 40(%2), %%r10, %%rax # A[0]*B[1] \n\t" "addq %%r10, %%r9 \n\t" "movq %%r9, 72(%0) \n\t"
159
- "mulx 48(%2), %%r12, %%rbx # A[0]*B[2] \n\t" "adcq %%r12, %%rax \n\t"
160
- "mulx 56(%2), %%r14, %%rcx # A[0]*B[3] \n\t" "adcq %%r14, %%rbx \n\t"
161
- "adcq $0, %%rcx \n\t"
162
-
163
- "movq 40(%1), %%rdx # A[1] \n\t"
164
- "mulx 32(%2), %%r8, %%r9 # A[1]*B[0] \n\t"
165
- "mulx 40(%2), %%r10, %%r11 # A[1]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
166
- "mulx 48(%2), %%r12, %%r13 # A[1]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
167
- "mulx 56(%2), %%r14, %%rdx # A[1]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
168
- "adcq $0, %%rdx \n\t"
169
-
170
- "addq %%r8, 72(%0) \n\t"
171
- "adcq %%rax, %%r9 \n\t" " movq %%r9, 80(%0) \n\t" " movq $0, %%rax \n\t"
172
- "adcq %%r11, %%rbx \n\t"
173
- "adcq %%r13, %%rcx \n\t"
174
- "adcq %%rdx, %%rax \n\t"
175
-
176
- "movq 48(%1), %%rdx # A[2] \n\t"
177
- "mulx 32(%2), %%r8, %%r9 # A[2]*B[0] \n\t"
178
- "mulx 40(%2), %%r10, %%r11 # A[2]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
179
- "mulx 48(%2), %%r12, %%r13 # A[2]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
180
- "mulx 56(%2), %%r14, %%rdx # A[2]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
181
- "adcq $0, %%rdx \n\t"
182
-
183
- "addq %%r8, 80(%0) \n\t"
184
- "adcq %%rbx, %%r9 \n\t" " movq %%r9, 88(%0) \n\t" " movq $0, %%rbx \n\t"
185
- "adcq %%r11, %%rcx \n\t"
186
- "adcq %%r13, %%rax \n\t"
187
- "adcq %%rdx, %%rbx \n\t"
188
-
189
- "movq 56(%1), %%rdx # A[3] \n\t"
190
- "mulx 32(%2), %%r8, %%r9 # A[3]*B[0] \n\t"
191
- "mulx 40(%2), %%r10, %%r11 # A[3]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
192
- "mulx 48(%2), %%r12, %%r13 # A[3]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
193
- "mulx 56(%2), %%r14, %%rdx # A[3]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
194
- "adcq $0, %%rdx \n\t"
195
-
196
- "addq %%r8, 88(%0) \n\t"
197
- "adcq %%rcx, %%r9 \n\t" "movq %%r9, 96(%0) \n\t" " movq $0, %%rcx \n\t"
198
- "adcq %%r11, %%rax \n\t" "movq %%rax, 104(%0) \n\t"
199
- "adcq %%r13, %%rbx \n\t" "movq %%rbx, 112(%0) \n\t"
200
- "adcq %%rdx, %%rcx \n\t" "movq %%rcx, 120(%0) \n\t"
201
- :
202
- : "r" (c), "r" (a), "r" (b)
203
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8",
204
- "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
205
- );
112
+ __asm__ __volatile__(
113
+ "movq (%1), %%rdx; " /* A[0] */
114
+ "mulx (%2), %%r8, %%r12; " /* A[0]*B[0] */ "movq %%r8, (%0) ;"
115
+ "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ "addq %%r10, %%r12 ;"
116
+ "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ "adcq %%r8, %%rax ;"
117
+ "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ "adcq %%r10, %%rbx ;"
118
+ /*******************************************/ "adcq $0, %%rcx ;"
119
+
120
+ "movq 8(%1), %%rdx; " /* A[1] */
121
+ "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ "addq %%r12, %%r8 ;" "movq %%r8, 8(%0) ;"
122
+ "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ "adcq %%r10, %%r9 ;"
123
+ "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ "adcq %%r8, %%r11 ;"
124
+ "mulx 24(%2), %%r10, %%r12; " /* A[1]*B[3] */ "adcq %%r10, %%r13 ;"
125
+ /*******************************************/ "adcq $0, %%r12 ;"
126
+
127
+ "addq %%r9, %%rax ;"
128
+ "adcq %%r11, %%rbx ;"
129
+ "adcq %%r13, %%rcx ;"
130
+ "adcq $0, %%r12 ;"
131
+
132
+ "movq 16(%1), %%rdx; " /* A[2] */
133
+ "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ "addq %%rax, %%r8 ;" "movq %%r8, 16(%0) ;"
134
+ "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ "adcq %%r10, %%r9 ;"
135
+ "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ "adcq %%r8, %%r11 ;"
136
+ "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ "adcq %%r10, %%r13 ;"
137
+ /*******************************************/ "adcq $0, %%rax ;"
138
+
139
+ "addq %%r9, %%rbx ;"
140
+ "adcq %%r11, %%rcx ;"
141
+ "adcq %%r13, %%r12 ;"
142
+ "adcq $0, %%rax ;"
143
+
144
+ "movq 24(%1), %%rdx; " /* A[3] */
145
+ "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ "addq %%rbx, %%r8 ;" "movq %%r8, 24(%0) ;"
146
+ "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ "adcq %%r10, %%r9 ;"
147
+ "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ "adcq %%r8, %%r11 ;"
148
+ "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ "adcq %%r10, %%r13 ;"
149
+ /*******************************************/ "adcq $0, %%rbx ;"
150
+
151
+ "addq %%r9, %%rcx ;" "movq %%rcx, 32(%0) ;"
152
+ "adcq %%r11, %%r12 ;" "movq %%r12, 40(%0) ;"
153
+ "adcq %%r13, %%rax ;" "movq %%rax, 48(%0) ;"
154
+ "adcq $0, %%rbx ;" "movq %%rbx, 56(%0) ;"
155
+
156
+ "movq 32(%1), %%rdx; " /* C[0] */
157
+ "mulx 32(%2), %%r8, %%r12; " /* C[0]*D[0] */ "movq %%r8, 64(%0) ;"
158
+ "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */ "addq %%r10, %%r12 ;"
159
+ "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */ "adcq %%r8, %%rax ;"
160
+ "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */ "adcq %%r10, %%rbx ;"
161
+ /*******************************************/ "adcq $0, %%rcx ;"
162
+
163
+ "movq 40(%1), %%rdx; " /* C[1] */
164
+ "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */ "addq %%r12, %%r8 ;" "movq %%r8, 72(%0) ;"
165
+ "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */ "adcq %%r10, %%r9 ;"
166
+ "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */ "adcq %%r8, %%r11 ;"
167
+ "mulx 56(%2), %%r10, %%r12; " /* C[1]*D[3] */ "adcq %%r10, %%r13 ;"
168
+ /*******************************************/ "adcq $0, %%r12 ;"
169
+
170
+ "addq %%r9, %%rax ;"
171
+ "adcq %%r11, %%rbx ;"
172
+ "adcq %%r13, %%rcx ;"
173
+ "adcq $0, %%r12 ;"
174
+
175
+ "movq 48(%1), %%rdx; " /* C[2] */
176
+ "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */ "addq %%rax, %%r8 ;" "movq %%r8, 80(%0) ;"
177
+ "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */ "adcq %%r10, %%r9 ;"
178
+ "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */ "adcq %%r8, %%r11 ;"
179
+ "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */ "adcq %%r10, %%r13 ;"
180
+ /*******************************************/ "adcq $0, %%rax ;"
181
+
182
+ "addq %%r9, %%rbx ;"
183
+ "adcq %%r11, %%rcx ;"
184
+ "adcq %%r13, %%r12 ;"
185
+ "adcq $0, %%rax ;"
186
+
187
+ "movq 56(%1), %%rdx; " /* C[3] */
188
+ "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */ "addq %%rbx, %%r8 ;" "movq %%r8, 88(%0) ;"
189
+ "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */ "adcq %%r10, %%r9 ;"
190
+ "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */ "adcq %%r8, %%r11 ;"
191
+ "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */ "adcq %%r10, %%r13 ;"
192
+ /*******************************************/ "adcq $0, %%rbx ;"
193
+
194
+ "addq %%r9, %%rcx ;" "movq %%rcx, 96(%0) ;"
195
+ "adcq %%r11, %%r12 ;" "movq %%r12, 104(%0) ;"
196
+ "adcq %%r13, %%rax ;" "movq %%rax, 112(%0) ;"
197
+ "adcq $0, %%rbx ;" "movq %%rbx, 120(%0) ;"
198
+ :
199
+ : "r" (c), "r" (a), "r" (b)
200
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx",
201
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13"
202
+ );
206
203
  #endif
207
204
  #else /* Without BMI2 */
208
- /**
209
- * TODO: Multiplications using MULQ instruction.
210
- **/
205
+ /**
206
+ * TODO: Multiplications using MULQ instruction.
207
+ **/
211
208
  #endif
212
209
  }
213
210
 
@@ -216,140 +213,186 @@ void mul2_256x256_integer_x64(uint64_t *const c, uint64_t *const a, uint64_t *co
216
213
  * @param c
217
214
  * @param a
218
215
  */
219
- void sqr2_256x256_integer_x64(uint64_t *const c, uint64_t *const a)
220
- {
216
+ void sqr2_256x256_integer_x64(uint64_t *const c, uint64_t *const a) {
221
217
  #ifdef __BMI2__
222
- __asm__ __volatile__(
223
- "movq (%1), %%rdx # A[0] \n\t"
224
- "mulx %%rdx, %%r8, %%r9 # A[0]^2 \n\t"
225
- "movq 8(%1), %%rdx # A[1] \n\t"
226
- "mulx %%rdx, %%r10, %%r11 # A[1]^2 \n\t"
227
- "movq %%r8, (%0) \n\t"
228
- "movq %%r9, 8(%0) \n\t"
229
- "movq %%r10, 16(%0) \n\t"
230
- "movq %%r11, 24(%0) \n\t"
231
-
232
- "movq 16(%1), %%rdx # A[2] \n\t"
233
- "mulx %%rdx, %%r8, %%r9 # A[2]^2 \n\t"
234
- "movq 24(%1), %%rdx # A[3] \n\t"
235
- "mulx %%rdx, %%r10, %%r11 # A[3]^2 \n\t"
236
- "movq %%r8, 32(%0) \n\t"
237
- "movq %%r9, 40(%0) \n\t"
238
- "movq %%r10, 48(%0) \n\t"
239
- "movq %%r11, 56(%0) \n\t"
240
-
241
- "movq 8(%1), %%rdx # A[1] \n\t"
242
- "mulx (%1), %%r8, %%r9 # A[0]*A[1] \n\t"
243
- "mulx 16(%1), %%r10, %%r11 # A[2]*A[1] \n\t"
244
- "mulx 24(%1), %%rcx, %%r14 # A[3]*A[1] \n\t"
245
-
246
- "movq 16(%1), %%rdx # A[2] \n\t"
247
- "mulx 24(%1), %%r12, %%r13 # A[3]*A[2] \n\t"
248
- "mulx (%1), %%rax, %%rdx # A[0]*A[2] \n\t"
249
-
250
- "addq %%rax, %%r9 \n\t"
251
- "adcq %%rdx, %%r10 \n\t"
252
- "adcq %%rcx, %%r11 \n\t"
253
- "adcq %%r14, %%r12 \n\t"
254
- "adcq $0, %%r13 \n\t"
255
- "movq $0, %%r14 \n\t"
256
- "adcq $0, %%r14 \n\t"
257
-
258
- "movq (%1), %%rdx # A[0] \n\t"
259
- "mulx 24(%1), %%rax, %%rdx # A[0]*A[3] \n\t"
260
-
261
- "addq %%rax, %%r10 \n\t"
262
- "adcq %%rdx, %%r11 \n\t"
263
- "adcq $0, %%r12 \n\t"
264
- "adcq $0, %%r13 \n\t"
265
- "adcq $0, %%r14 \n\t"
266
-
267
- "shldq $1, %%r13, %%r14 \n\t"
268
- "shldq $1, %%r12, %%r13 \n\t"
269
- "shldq $1, %%r11, %%r12 \n\t"
270
- "shldq $1, %%r10, %%r11 \n\t"
271
- "shldq $1, %%r9, %%r10 \n\t"
272
- "shldq $1, %%r8, %%r9 \n\t"
273
- "shlq $1, %%r8 \n\t"
274
-
275
- "addq 8(%0), %%r8 \n\t" "movq %%r8, 8(%0) \n\t"
276
- "adcq 16(%0), %%r9 \n\t" "movq %%r9, 16(%0) \n\t"
277
- "adcq 24(%0), %%r10 \n\t" "movq %%r10, 24(%0) \n\t"
278
- "adcq 32(%0), %%r11 \n\t" "movq %%r11, 32(%0) \n\t"
279
- "adcq 40(%0), %%r12 \n\t" "movq %%r12, 40(%0) \n\t"
280
- "adcq 48(%0), %%r13 \n\t" "movq %%r13, 48(%0) \n\t"
281
- "adcq 56(%0), %%r14 \n\t" "movq %%r14, 56(%0) \n\t"
282
-
283
-
284
- "movq 32(%1), %%rdx # A[0] \n\t"
285
- "mulx %%rdx, %%r8, %%r9 # A[0]^2 \n\t"
286
- "movq 40(%1), %%rdx # A[1] \n\t"
287
- "mulx %%rdx, %%r10, %%r11 # A[1]^2 \n\t"
288
- "movq %%r8, 64(%0) \n\t"
289
- "movq %%r9, 72(%0) \n\t"
290
- "movq %%r10, 80(%0) \n\t"
291
- "movq %%r11, 88(%0) \n\t"
292
-
293
- "movq 48(%1), %%rdx # A[2] \n\t"
294
- "mulx %%rdx, %%r8, %%r9 # A[2]^2 \n\t"
295
- "movq 56(%1), %%rdx # A[3] \n\t"
296
- "mulx %%rdx, %%r10, %%r11 # A[3]^2 \n\t"
297
- "movq %%r8, 96(%0) \n\t"
298
- "movq %%r9, 104(%0) \n\t"
299
- "movq %%r10, 112(%0) \n\t"
300
- "movq %%r11, 120(%0) \n\t"
301
-
302
- "movq 40(%1), %%rdx # A[1] \n\t"
303
- "mulx 32(%1), %%r8, %%r9 # A[0]*A[1] \n\t"
304
- "mulx 48(%1), %%r10, %%r11 # A[2]*A[1] \n\t"
305
- "mulx 56(%1), %%rcx, %%r14 # A[3]*A[1] \n\t"
306
-
307
- "movq 48(%1), %%rdx # A[2] \n\t"
308
- "mulx 56(%1), %%r12, %%r13 # A[3]*A[2] \n\t"
309
- "mulx 32(%1), %%rax, %%rdx # A[0]*A[2] \n\t"
310
-
311
- "addq %%rax, %%r9 \n\t"
312
- "adcq %%rdx, %%r10 \n\t"
313
- "adcq %%rcx, %%r11 \n\t"
314
- "adcq %%r14, %%r12 \n\t"
315
- "adcq $0, %%r13 \n\t"
316
- "movq $0, %%r14 \n\t"
317
- "adcq $0, %%r14 \n\t"
318
-
319
- "movq 32(%1), %%rdx # A[0] \n\t"
320
- "mulx 56(%1), %%rax, %%rdx # A[0]*A[3] \n\t"
321
-
322
- "addq %%rax, %%r10 \n\t"
323
- "adcq %%rdx, %%r11 \n\t"
324
- "adcq $0, %%r12 \n\t"
325
- "adcq $0, %%r13 \n\t"
326
- "adcq $0, %%r14 \n\t"
327
-
328
- "shldq $1, %%r13, %%r14 \n\t"
329
- "shldq $1, %%r12, %%r13 \n\t"
330
- "shldq $1, %%r11, %%r12 \n\t"
331
- "shldq $1, %%r10, %%r11 \n\t"
332
- "shldq $1, %%r9, %%r10 \n\t"
333
- "shldq $1, %%r8, %%r9 \n\t"
334
- "shlq $1, %%r8 \n\t"
335
-
336
- "addq 72(%0), %%r8 \n\t" "movq %%r8, 72(%0) \n\t"
337
- "adcq 80(%0), %%r9 \n\t" "movq %%r9, 80(%0) \n\t"
338
- "adcq 88(%0), %%r10 \n\t" "movq %%r10, 88(%0) \n\t"
339
- "adcq 96(%0), %%r11 \n\t" "movq %%r11, 96(%0) \n\t"
340
- "adcq 104(%0), %%r12 \n\t" "movq %%r12, 104(%0) \n\t"
341
- "adcq 112(%0), %%r13 \n\t" "movq %%r13, 112(%0) \n\t"
342
- "adcq 120(%0), %%r14 \n\t" "movq %%r14, 120(%0) \n\t"
343
- :
344
- : "r" (c), "r" (a)
345
- : "cc", "%rax", "%rcx", "%rdx",
346
- "%r8", "%r9", "%r10", "%r11",
347
- "%r12", "%r13", "%r14"
348
- );
218
+ #ifdef __ADX__
219
+ __asm__ __volatile__(
220
+ "movq (%1), %%rdx ;" /* A[0] */
221
+ "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */ "xorl %%r15d, %%r15d;"
222
+ "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */ "adcx %%r14, %%r9 ;"
223
+ "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */ "adcx %%rax, %%r10 ;"
224
+ "movq 24(%1), %%rdx ;" /* A[3] */
225
+ "mulx 8(%1), %%r11, %%r12 ;" /* A[1]*A[3] */ "adcx %%rcx, %%r11 ;"
226
+ "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */ "adcx %%rax, %%r12 ;"
227
+ "movq 8(%1), %%rdx ;" /* A[1] */ "adcx %%r15, %%r13 ;"
228
+ "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */ "movq $0, %%r14 ;"
229
+ /*******************************************/ "adcx %%r15, %%r14 ;"
230
+
231
+ "xorl %%r15d, %%r15d;"
232
+ "adox %%rax, %%r10 ;" "adcx %%r8, %%r8 ;"
233
+ "adox %%rcx, %%r11 ;" "adcx %%r9, %%r9 ;"
234
+ "adox %%r15, %%r12 ;" "adcx %%r10, %%r10 ;"
235
+ "adox %%r15, %%r13 ;" "adcx %%r11, %%r11 ;"
236
+ "adox %%r15, %%r14 ;" "adcx %%r12, %%r12 ;"
237
+ "adcx %%r13, %%r13 ;"
238
+ "adcx %%r14, %%r14 ;"
239
+
240
+ "movq (%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
241
+ /********************/ "movq %%rax, 0(%0) ;"
242
+ "addq %%rcx, %%r8 ;" "movq %%r8, 8(%0) ;"
243
+ "movq 8(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
244
+ "adcq %%rax, %%r9 ;" "movq %%r9, 16(%0) ;"
245
+ "adcq %%rcx, %%r10 ;" "movq %%r10, 24(%0) ;"
246
+ "movq 16(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
247
+ "adcq %%rax, %%r11 ;" "movq %%r11, 32(%0) ;"
248
+ "adcq %%rcx, %%r12 ;" "movq %%r12, 40(%0) ;"
249
+ "movq 24(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
250
+ "adcq %%rax, %%r13 ;" "movq %%r13, 48(%0) ;"
251
+ "adcq %%rcx, %%r14 ;" "movq %%r14, 56(%0) ;"
252
+
253
+
254
+ "movq 32(%1), %%rdx ;" /* B[0] */
255
+ "mulx 40(%1), %%r8, %%r14 ;" /* B[1]*B[0] */ "xorl %%r15d, %%r15d;"
256
+ "mulx 48(%1), %%r9, %%r10 ;" /* B[2]*B[0] */ "adcx %%r14, %%r9 ;"
257
+ "mulx 56(%1), %%rax, %%rcx ;" /* B[3]*B[0] */ "adcx %%rax, %%r10 ;"
258
+ "movq 56(%1), %%rdx ;" /* B[3] */
259
+ "mulx 40(%1), %%r11, %%r12 ;" /* B[1]*B[3] */ "adcx %%rcx, %%r11 ;"
260
+ "mulx 48(%1), %%rax, %%r13 ;" /* B[2]*B[3] */ "adcx %%rax, %%r12 ;"
261
+ "movq 40(%1), %%rdx ;" /* B[1] */ "adcx %%r15, %%r13 ;"
262
+ "mulx 48(%1), %%rax, %%rcx ;" /* B[2]*B[1] */ "movq $0, %%r14 ;"
263
+ /*******************************************/ "adcx %%r15, %%r14 ;"
264
+
265
+ "xorl %%r15d, %%r15d;"
266
+ "adox %%rax, %%r10 ;" "adcx %%r8, %%r8 ;"
267
+ "adox %%rcx, %%r11 ;" "adcx %%r9, %%r9 ;"
268
+ "adox %%r15, %%r12 ;" "adcx %%r10, %%r10 ;"
269
+ "adox %%r15, %%r13 ;" "adcx %%r11, %%r11 ;"
270
+ "adox %%r15, %%r14 ;" "adcx %%r12, %%r12 ;"
271
+ "adcx %%r13, %%r13 ;"
272
+ "adcx %%r14, %%r14 ;"
273
+
274
+ "movq 32(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* B[0]^2 */
275
+ /********************/ "movq %%rax, 64(%0) ;"
276
+ "addq %%rcx, %%r8 ;" "movq %%r8, 72(%0) ;"
277
+ "movq 40(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* B[1]^2 */
278
+ "adcq %%rax, %%r9 ;" "movq %%r9, 80(%0) ;"
279
+ "adcq %%rcx, %%r10 ;" "movq %%r10, 88(%0) ;"
280
+ "movq 48(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* B[2]^2 */
281
+ "adcq %%rax, %%r11 ;" "movq %%r11, 96(%0) ;"
282
+ "adcq %%rcx, %%r12 ;" "movq %%r12, 104(%0) ;"
283
+ "movq 56(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* B[3]^2 */
284
+ "adcq %%rax, %%r13 ;" "movq %%r13, 112(%0) ;"
285
+ "adcq %%rcx, %%r14 ;" "movq %%r14, 120(%0) ;"
286
+ :
287
+ : "r" (c), "r" (a)
288
+ : "memory", "cc", "%rax", "%rcx", "%rdx",
289
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"
290
+ );
291
+ #else /* Without ADX */
292
+ __asm__ __volatile__(
293
+ "movq 8(%1), %%rdx ;" /* A[1] */
294
+ "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */
295
+ "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
296
+ "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
297
+
298
+ "movq 16(%1), %%rdx ;" /* A[2] */
299
+ "mulx 24(%1), %%r12, %%r13 ;" /* A[3]*A[2] */
300
+ "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
301
+
302
+ "addq %%rax, %%r9 ;"
303
+ "adcq %%rdx, %%r10 ;"
304
+ "adcq %%rcx, %%r11 ;"
305
+ "adcq %%r14, %%r12 ;"
306
+ "adcq $0, %%r13 ;"
307
+ "movq $0, %%r14 ;"
308
+ "adcq $0, %%r14 ;"
309
+
310
+ "movq (%1), %%rdx ;" /* A[0] */
311
+ "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
312
+
313
+ "addq %%rax, %%r10 ;"
314
+ "adcq %%rcx, %%r11 ;"
315
+ "adcq $0, %%r12 ;"
316
+ "adcq $0, %%r13 ;"
317
+ "adcq $0, %%r14 ;"
318
+
319
+ "shldq $1, %%r13, %%r14 ;"
320
+ "shldq $1, %%r12, %%r13 ;"
321
+ "shldq $1, %%r11, %%r12 ;"
322
+ "shldq $1, %%r10, %%r11 ;"
323
+ "shldq $1, %%r9, %%r10 ;"
324
+ "shldq $1, %%r8, %%r9 ;"
325
+ "shlq $1, %%r8 ;"
326
+
327
+ /********************/ "mulx %%rdx, %%rax, %%rcx ; " /* A[0]^2 */
328
+ /********************/ "movq %%rax, 0(%0) ;"
329
+ "addq %%rcx, %%r8 ;" "movq %%r8, 8(%0) ;"
330
+ "movq 8(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ; " /* A[1]^2 */
331
+ "adcq %%rax, %%r9 ;" "movq %%r9, 16(%0) ;"
332
+ "adcq %%rcx, %%r10 ;" "movq %%r10, 24(%0) ;"
333
+ "movq 16(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ; " /* A[2]^2 */
334
+ "adcq %%rax, %%r11 ;" "movq %%r11, 32(%0) ;"
335
+ "adcq %%rcx, %%r12 ;" "movq %%r12, 40(%0) ;"
336
+ "movq 24(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ; " /* A[3]^2 */
337
+ "adcq %%rax, %%r13 ;" "movq %%r13, 48(%0) ;"
338
+ "adcq %%rcx, %%r14 ;" "movq %%r14, 56(%0) ;"
339
+
340
+ "movq 40(%1), %%rdx ;" /* B[1] */
341
+ "mulx 32(%1), %%r8, %%r9 ;" /* B[0]*B[1] */
342
+ "mulx 48(%1), %%r10, %%r11 ;" /* B[2]*B[1] */
343
+ "mulx 56(%1), %%rcx, %%r14 ;" /* B[3]*B[1] */
344
+
345
+ "movq 48(%1), %%rdx ;" /* B[2] */
346
+ "mulx 56(%1), %%r12, %%r13 ;" /* B[3]*B[2] */
347
+ "mulx 32(%1), %%rax, %%rdx ;" /* B[0]*B[2] */
348
+
349
+ "addq %%rax, %%r9 ;"
350
+ "adcq %%rdx, %%r10 ;"
351
+ "adcq %%rcx, %%r11 ;"
352
+ "adcq %%r14, %%r12 ;"
353
+ "adcq $0, %%r13 ;"
354
+ "movq $0, %%r14 ;"
355
+ "adcq $0, %%r14 ;"
356
+
357
+ "movq 32(%1), %%rdx ;" /* B[0] */
358
+ "mulx 56(%1), %%rax, %%rcx ;" /* B[0]*B[3] */
359
+
360
+ "addq %%rax, %%r10 ;"
361
+ "adcq %%rcx, %%r11 ;"
362
+ "adcq $0, %%r12 ;"
363
+ "adcq $0, %%r13 ;"
364
+ "adcq $0, %%r14 ;"
365
+
366
+ "shldq $1, %%r13, %%r14 ;"
367
+ "shldq $1, %%r12, %%r13 ;"
368
+ "shldq $1, %%r11, %%r12 ;"
369
+ "shldq $1, %%r10, %%r11 ;"
370
+ "shldq $1, %%r9, %%r10 ;"
371
+ "shldq $1, %%r8, %%r9 ;"
372
+ "shlq $1, %%r8 ;"
373
+
374
+ /********************/ "mulx %%rdx, %%rax, %%rcx ; " /* B[0]^2 */
375
+ /********************/ "movq %%rax, 64(%0) ;"
376
+ "addq %%rcx, %%r8 ;" "movq %%r8, 72(%0) ;"
377
+ "movq 40(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ; " /* B[1]^2 */
378
+ "adcq %%rax, %%r9 ;" "movq %%r9, 80(%0) ;"
379
+ "adcq %%rcx, %%r10 ;" "movq %%r10, 88(%0) ;"
380
+ "movq 48(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ; " /* B[2]^2 */
381
+ "adcq %%rax, %%r11 ;" "movq %%r11, 96(%0) ;"
382
+ "adcq %%rcx, %%r12 ;" "movq %%r12, 104(%0) ;"
383
+ "movq 56(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ; " /* B[3]^2 */
384
+ "adcq %%rax, %%r13 ;" "movq %%r13, 112(%0) ;"
385
+ "adcq %%rcx, %%r14 ;" "movq %%r14, 120(%0) ;"
386
+ :
387
+ : "r" (c), "r" (a)
388
+ : "memory", "cc", "%rax", "%rcx", "%rdx",
389
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
390
+ );
391
+ #endif
349
392
  #else /* Without BMI2 */
350
- /**
351
- * TODO: Multiplications using MULQ instruction.
352
- **/
393
+ /**
394
+ * TODO: Multiplications using MULQ instruction.
395
+ **/
353
396
  #endif
354
397
  }
355
398
 
@@ -358,467 +401,543 @@ void sqr2_256x256_integer_x64(uint64_t *const c, uint64_t *const a)
358
401
  * @param c
359
402
  * @param a
360
403
  */
361
- void red_EltFp25519_2w_x64(uint64_t *const c, uint64_t *const a)
362
- {
404
+ void red_EltFp25519_2w_x64(uint64_t *const c, uint64_t *const a) {
363
405
  #ifdef __BMI2__
364
406
  #ifdef __ADX__
365
- __asm__ __volatile__(
366
- " movl $38, %%edx # 2*c = 38 = 2^256 \n\t"
367
- " mulx 32(%1), %%r8, %%r10 # c*C[4] \n\t" " xorl %%ebx, %%ebx \n\t" " adox (%1), %%r8 \n\t"
368
- " mulx 40(%1), %%r9, %%r11 # c*C[5] \n\t" " adcx %%r10, %%r9 \n\t" " adox 8(%1), %%r9 \n\t"
369
- " mulx 48(%1), %%r10, %%rax # c*C[6] \n\t" " adcx %%r11, %%r10 \n\t" " adox 16(%1), %%r10 \n\t" " movq %%r10, 16(%0) \n\t"
370
- " mulx 56(%1), %%r11, %%rcx # c*C[7] \n\t" " adcx %%rax, %%r11 \n\t" " adox 24(%1), %%r11 \n\t" " movq %%r11, 24(%0) \n\t"
371
- " adcx %%rbx, %%rcx \n\t" " adox %%rbx, %%rcx \n\t"
372
- " xorl %%ebx, %%ebx \n\t"
373
- " mulx %%rcx, %%rax, %%rcx \n\t" " adcx %%rax, %%r8 \n\t" " movq %%r8, (%0) \n\t"
374
- " adcx %%rcx, %%r9 \n\t" " movq %%r9, 8(%0) \n\t"
375
-
376
- " mulx 96(%1), %%r8, %%r10 # c*C[4] \n\t" " xorl %%ebx, %%ebx \n\t" " adox 64(%1), %%r8 \n\t"
377
- " mulx 104(%1), %%r9, %%r11 # c*C[5] \n\t" " adcx %%r10, %%r9 \n\t" " adox 72(%1), %%r9 \n\t"
378
- " mulx 112(%1), %%r10, %%rax # c*C[6] \n\t" " adcx %%r11, %%r10 \n\t" " adox 80(%1), %%r10 \n\t" " movq %%r10, 48(%0) \n\t"
379
- " mulx 120(%1), %%r11, %%rcx # c*C[7] \n\t" " adcx %%rax, %%r11 \n\t" " adox 88(%1), %%r11 \n\t" " movq %%r11, 56(%0) \n\t"
380
- " adcx %%rbx, %%rcx \n\t" " adox %%rbx, %%rcx \n\t"
381
- " xorl %%ebx, %%ebx \n\t"
382
- " mulx %%rcx, %%rax, %%rcx \n\t" " adcx %%rax, %%r8 \n\t" " movq %%r8, 32(%0) \n\t"
383
- " adcx %%rcx, %%r9 \n\t" " movq %%r9, 40(%0) \n\t"
384
- :
385
- : "r" (c), "r" (a)
386
- : "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
387
- );
407
+ __asm__ __volatile__(
408
+ "movl $38, %%edx; " /* 2*c = 38 = 2^256 mod (2^255 - 19) */
409
+ "mulx 32(%1), %%r8, %%r10; " /* c*C[4] */ "xorl %%ebx, %%ebx ;" "adox (%1), %%r8 ;"
410
+ "mulx 40(%1), %%r9, %%r11; " /* c*C[5] */ "adcx %%r10, %%r9 ;" "adox 8(%1), %%r9 ;"
411
+ "mulx 48(%1), %%r10, %%rax; " /* c*C[6] */ "adcx %%r11, %%r10 ;" "adox 16(%1), %%r10 ;"
412
+ "mulx 56(%1), %%r11, %%rcx; " /* c*C[7] */ "adcx %%rax, %%r11 ;" "adox 24(%1), %%r11 ;"
413
+ /****************************************/ "adcx %%rbx, %%rcx ;" "adox %%rbx, %%rcx ;"
414
+ "clc ;"
415
+ "mulx %%rcx, %%rax, %%rcx ; " /* 38 * overflow word accumulated in rcx */
416
+ "adcx %%rax, %%r8 ;"
417
+ "adcx %%rcx, %%r9 ;" "movq %%r9, 8(%0) ;"
418
+ "adcx %%rbx, %%r10 ;" "movq %%r10, 16(%0) ;"
419
+ "adcx %%rbx, %%r11 ;" "movq %%r11, 24(%0) ;"
420
+ "mov $0, %%ecx ;"
421
+ "cmovc %%edx, %%ecx ;"
422
+ "addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
423
+
424
+ "mulx 96(%1), %%r8, %%r10; " /* c*C[4] */ "xorl %%ebx, %%ebx ;" "adox 64(%1), %%r8 ;"
425
+ "mulx 104(%1), %%r9, %%r11; " /* c*C[5] */ "adcx %%r10, %%r9 ;" "adox 72(%1), %%r9 ;"
426
+ "mulx 112(%1), %%r10, %%rax; " /* c*C[6] */ "adcx %%r11, %%r10 ;" "adox 80(%1), %%r10 ;"
427
+ "mulx 120(%1), %%r11, %%rcx; " /* c*C[7] */ "adcx %%rax, %%r11 ;" "adox 88(%1), %%r11 ;"
428
+ /*****************************************/ "adcx %%rbx, %%rcx ;" "adox %%rbx, %%rcx ;"
429
+ "clc ;"
430
+ "mulx %%rcx, %%rax, %%rcx ; " /* 38 * overflow word accumulated in rcx */
431
+ "adcx %%rax, %%r8 ;"
432
+ "adcx %%rcx, %%r9 ;" "movq %%r9, 40(%0) ;"
433
+ "adcx %%rbx, %%r10 ;" "movq %%r10, 48(%0) ;"
434
+ "adcx %%rbx, %%r11 ;" "movq %%r11, 56(%0) ;"
435
+ "mov $0, %%ecx ;"
436
+ "cmovc %%edx, %%ecx ;"
437
+ "addq %%rcx, %%r8 ;" "movq %%r8, 32(%0) ;"
438
+ :
439
+ : "r" (c), "r" (a)
440
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
441
+ );
388
442
  #else
389
- __asm__ __volatile__(
390
- "movl $38, %%edx # 2*c = 38 = 2^256 \n\t"
391
- "mulx 32(%1), %%r8, %%r9 # c*C[4] \n\t"
392
- "mulx 40(%1), %%r10, %%r11 # c*C[5] \n\t" "addq %%r9, %%r10 \n\t"
393
- "mulx 48(%1), %%r12, %%r13 # c*C[6] \n\t" "adcq %%r11, %%r12 \n\t"
394
- "mulx 56(%1), %%rax, %%rcx # c*C[7] \n\t" "adcq %%r13, %%rax \n\t"
395
- "adcq $0, %%rcx \n\t"
396
-
397
- "addq (%1), %%r8 \n\t"
398
- "adcq 8(%1), %%r10 \n\t"
399
- "adcq 16(%1), %%r12 \n\t" "movq %%r12, 16(%0) \n\t"
400
- "adcq 24(%1), %%rax \n\t" "movq %%rax, 24(%0) \n\t"
401
- "adcq $0, %%rcx \n\t"
402
-
403
- "mulx %%rcx, %%rax, %%rcx \n\t"
404
- "addq %%rax, %%r8 \n\t" "movq %%r8, (%0) \n\t"
405
- "adcq %%rcx, %%r10 \n\t" "movq %%r10, 8(%0) \n\t"
406
-
407
- "mulx 96(%1), %%r8, %%r9 # c*C[4] \n\t"
408
- "mulx 104(%1), %%r10, %%r11 # c*C[5] \n\t" "addq %%r9, %%r10 \n\t"
409
- "mulx 112(%1), %%r12, %%r13 # c*C[6] \n\t" "adcq %%r11, %%r12 \n\t"
410
- "mulx 120(%1), %%rax, %%rcx # c*C[7] \n\t" "adcq %%r13, %%rax \n\t"
411
- "adcq $0, %%rcx \n\t"
412
-
413
- "addq 64(%1), %%r8 \n\t"
414
- "adcq 72(%1), %%r10 \n\t"
415
- "adcq 80(%1), %%r12 \n\t" "movq %%r12, 48(%0) \n\t"
416
- "adcq 88(%1), %%rax \n\t" "movq %%rax, 56(%0) \n\t"
417
- "adcq $0, %%rcx \n\t"
418
-
419
- "mulx %%rcx, %%rax, %%rcx \n\t"
420
- "addq %%rax, %%r8 \n\t" " movq %%r8, 32(%0) \n\t"
421
- "adcq %%rcx, %%r10 \n\t" " movq %%r10, 40(%0) \n\t"
422
-
423
- :
424
- : "r" (c), "r" (a)
425
- : "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13"
426
- );
443
+ __asm__ __volatile__(
444
+ "movl $38, %%edx ; " /* 2*c = 38 = 2^256 mod (2^255 - 19) */
445
+ "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
446
+ "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ "addq %%r10, %%r9 ;"
447
+ "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ "adcq %%r11, %%r10 ;"
448
+ "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ "adcq %%rax, %%r11 ;"
449
+ /****************************************/ "adcq $0, %%rcx ;"
450
+ "addq (%1), %%r8 ;"
451
+ "adcq 8(%1), %%r9 ;"
452
+ "adcq 16(%1), %%r10 ;"
453
+ "adcq 24(%1), %%r11 ;"
454
+ "adcq $0, %%rcx ;"
455
+ "mulx %%rcx, %%rax, %%rcx ;" /* 38 * overflow word accumulated in rcx */
456
+ "addq %%rax, %%r8 ;"
457
+ "adcq %%rcx, %%r9 ;" "movq %%r9, 8(%0) ;"
458
+ "adcq $0, %%r10 ;" "movq %%r10, 16(%0) ;"
459
+ "adcq $0, %%r11 ;" "movq %%r11, 24(%0) ;"
460
+ "mov $0, %%ecx ;"
461
+ "cmovc %%edx, %%ecx ;"
462
+ "addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
463
+
464
+ "mulx 96(%1), %%r8, %%r10 ;" /* c*C[4] */
465
+ "mulx 104(%1), %%r9, %%r11 ;" /* c*C[5] */ "addq %%r10, %%r9 ;"
466
+ "mulx 112(%1), %%r10, %%rax ;" /* c*C[6] */ "adcq %%r11, %%r10 ;"
467
+ "mulx 120(%1), %%r11, %%rcx ;" /* c*C[7] */ "adcq %%rax, %%r11 ;"
468
+ /*****************************************/ "adcq $0, %%rcx ;"
469
+ "addq 64(%1), %%r8 ;"
470
+ "adcq 72(%1), %%r9 ;"
471
+ "adcq 80(%1), %%r10 ;"
472
+ "adcq 88(%1), %%r11 ;"
473
+ "adcq $0, %%rcx ;"
474
+ "mulx %%rcx, %%rax, %%rcx ;" /* 38 * overflow word accumulated in rcx */
475
+ "addq %%rax, %%r8 ;"
476
+ "adcq %%rcx, %%r9 ;" "movq %%r9, 40(%0) ;"
477
+ "adcq $0, %%r10 ;" "movq %%r10, 48(%0) ;"
478
+ "adcq $0, %%r11 ;" "movq %%r11, 56(%0) ;"
479
+ "mov $0, %%ecx ;"
480
+ "cmovc %%edx, %%ecx ;"
481
+ "addq %%rcx, %%r8 ;" "movq %%r8, 32(%0) ;"
482
+ :
483
+ : "r" (c), "r" (a)
484
+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
485
+ );
427
486
  #endif
428
487
  #else /* Without BMI2 */
429
- /* [TODO] */
488
+ /* [TODO] */
430
489
  #endif
431
490
  }
432
491
 
433
- void mul_256x256_integer_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b)
434
- {
492
+ void mul_256x256_integer_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b) {
435
493
  #ifdef __BMI2__
436
494
  #ifdef __ADX__
437
- __asm__ __volatile__(
438
- " movq (%1), %%rdx # A[0] \n\t"
439
- " mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t" " xorl %%r10d, %%r10d \n\t" " movq %%r8, (%0) \n\t"
440
- " mulx 8(%2), %%r10, %%r11 # A[0]*B[1] \n\t" " adox %%r9, %%r10 \n\t" " movq %%r10, 8(%0) \n\t"
441
- " mulx 16(%2), %%r12, %%r13 # A[0]*B[2] \n\t" " adox %%r11, %%r12 \n\t"
442
- " mulx 24(%2), %%r14, %%rdx # A[0]*B[3] \n\t" " adox %%r13, %%r14 \n\t" " movq $0, %%rax \n\t"
443
- " adox %%rdx, %%rax \n\t"
444
-
445
- " movq 8(%1), %%rdx # A[1] \n\t"
446
- " mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t" " xorl %%r10d, %%r10d \n\t" " adcx 8(%0), %%r8 \n\t" " movq %%r8, 8(%0) \n\t"
447
- " mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t" " adox %%r9, %%r10 \n\t" " adcx %%r12, %%r10 \n\t" " movq %%r10, 16(%0) \n\t"
448
- " mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t" " adox %%r11, %%r12 \n\t" " adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
449
- " mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t" " adox %%r13, %%r14 \n\t" " adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
450
- " adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
451
-
452
- " movq 16(%1), %%rdx # A[2] \n\t"
453
- " mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t" " xorl %%r10d, %%r10d \n\t" " adcx 16(%0), %%r8 \n\t" " movq %%r8, 16(%0) \n\t"
454
- " mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t" " adox %%r9, %%r10 \n\t" " adcx %%r12, %%r10 \n\t" " movq %%r10, 24(%0) \n\t"
455
- " mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t" " adox %%r11, %%r12 \n\t" " adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
456
- " mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t" " adox %%r13, %%r14 \n\t" " adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
457
- " adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
458
-
459
- " movq 24(%1), %%rdx # A[3] \n\t"
460
- " mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t" " xorl %%r10d, %%r10d \n\t" " adcx 24(%0), %%r8 \n\t" " movq %%r8, 24(%0) \n\t"
461
- " mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t" " adox %%r9, %%r10 \n\t" " adcx %%r12, %%r10 \n\t" " movq %%r10, 32(%0) \n\t"
462
- " mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t" " adox %%r11, %%r12 \n\t" " adcx %%r14, %%r12 \n\t" " movq %%r12, 40(%0) \n\t" " movq $0, %%r8 \n\t"
463
- " mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t" " adox %%r13, %%r14 \n\t" " adcx %%rax, %%r14 \n\t" " movq %%r14, 48(%0) \n\t" " movq $0, %%rax \n\t"
464
- " adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t" " movq %%rax, 56(%0) \n\t"
465
- :
466
- : "r" (c), "r" (a), "r" (b)
467
- : "memory", "cc", "%rax", "%rdx",
468
- "%r8", "%r9", "%r10", "%r11",
469
- "%r12", "%r13", "%r14"
470
- );
495
+ __asm__ __volatile__(
496
+ "movq (%1), %%rdx; " /* A[0] */
497
+ "mulx (%2), %%r8, %%r9; " /* A[0]*B[0] */ "xorl %%r10d, %%r10d ;" "movq %%r8, (%0) ;"
498
+ "mulx 8(%2), %%r10, %%r11; " /* A[0]*B[1] */ "adox %%r9, %%r10 ;" "movq %%r10, 8(%0) ;"
499
+ "mulx 16(%2), %%r12, %%r13; " /* A[0]*B[2] */ "adox %%r11, %%r12 ;"
500
+ "mulx 24(%2), %%r14, %%rdx; " /* A[0]*B[3] */ "adox %%r13, %%r14 ;" "movq $0, %%rax ;"
501
+ /*******************************************/ "adox %%rdx, %%rax ;"
502
+
503
+ "movq 8(%1), %%rdx; " /* A[1] */
504
+ "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ "xorl %%r10d, %%r10d ;" "adcx 8(%0), %%r8 ;" "movq %%r8, 8(%0) ;"
505
+ "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ "adox %%r9, %%r10 ;" "adcx %%r12, %%r10 ;" "movq %%r10, 16(%0) ;"
506
+ "mulx 16(%2), %%r12, %%r13; " /* A[1]*B[2] */ "adox %%r11, %%r12 ;" "adcx %%r14, %%r12 ;" "movq $0, %%r8 ;"
507
+ "mulx 24(%2), %%r14, %%rdx; " /* A[1]*B[3] */ "adox %%r13, %%r14 ;" "adcx %%rax, %%r14 ;" "movq $0, %%rax ;"
508
+ /*******************************************/ "adox %%rdx, %%rax ;" "adcx %%r8, %%rax ;"
509
+
510
+ "movq 16(%1), %%rdx; " /* A[2] */
511
+ "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ "xorl %%r10d, %%r10d ;" "adcx 16(%0), %%r8 ;" "movq %%r8, 16(%0) ;"
512
+ "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ "adox %%r9, %%r10 ;" "adcx %%r12, %%r10 ;" "movq %%r10, 24(%0) ;"
513
+ "mulx 16(%2), %%r12, %%r13; " /* A[2]*B[2] */ "adox %%r11, %%r12 ;" "adcx %%r14, %%r12 ;" "movq $0, %%r8 ;"
514
+ "mulx 24(%2), %%r14, %%rdx; " /* A[2]*B[3] */ "adox %%r13, %%r14 ;" "adcx %%rax, %%r14 ;" "movq $0, %%rax ;"
515
+ /*******************************************/ "adox %%rdx, %%rax ;" "adcx %%r8, %%rax ;"
516
+
517
+ "movq 24(%1), %%rdx; " /* A[3] */
518
+ "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ "xorl %%r10d, %%r10d ;" "adcx 24(%0), %%r8 ;" "movq %%r8, 24(%0) ;"
519
+ "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ "adox %%r9, %%r10 ;" "adcx %%r12, %%r10 ;" "movq %%r10, 32(%0) ;"
520
+ "mulx 16(%2), %%r12, %%r13; " /* A[3]*B[2] */ "adox %%r11, %%r12 ;" "adcx %%r14, %%r12 ;" "movq %%r12, 40(%0) ;" "movq $0, %%r8 ;"
521
+ "mulx 24(%2), %%r14, %%rdx; " /* A[3]*B[3] */ "adox %%r13, %%r14 ;" "adcx %%rax, %%r14 ;" "movq %%r14, 48(%0) ;" "movq $0, %%rax ;"
522
+ /*******************************************/ "adox %%rdx, %%rax ;" "adcx %%r8, %%rax ;" "movq %%rax, 56(%0) ;"
523
+ :
524
+ : "r" (c), "r" (a), "r" (b)
525
+ : "memory", "cc", "%rax", "%rdx", "%r8",
526
+ "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
527
+ );
471
528
  #else
472
- __asm__ __volatile__(
473
- " movq (%1), %%rdx # A[0] \n\t"
474
- " mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t" " movq %%r8, (%0) \n\t"
475
- " mulx 8(%2), %%r10, %%rax # A[0]*B[1] \n\t" " addq %%r10, %%r9 \n\t" " movq %%r9, 8(%0) \n\t"
476
- " mulx 16(%2), %%r12, %%rbx # A[0]*B[2] \n\t" " adcq %%r12, %%rax \n\t"
477
- " mulx 24(%2), %%r14, %%rcx # A[0]*B[3] \n\t" " adcq %%r14, %%rbx \n\t"
478
- " adcq $0, %%rcx \n\t"
479
-
480
- " movq 8(%1), %%rdx # A[1] \n\t"
481
- " mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t"
482
- " mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t" " addq %%r10, %%r9 \n\t"
483
- " mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t" " adcq %%r12, %%r11 \n\t"
484
- " mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t" " adcq %%r14, %%r13 \n\t"
485
- " adcq $0, %%rdx \n\t"
486
-
487
- " addq %%r8, 8(%0) \n\t"
488
- " adcq %%rax, %%r9 \n\t" " movq %%r9, 16(%0) \n\t" " movq $0, %%rax \n\t"
489
- " adcq %%r11, %%rbx \n\t"
490
- " adcq %%r13, %%rcx \n\t"
491
- " adcq %%rdx, %%rax \n\t"
492
-
493
- " movq 16(%1), %%rdx # A[2] \n\t"
494
- " mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t"
495
- " mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t" " addq %%r10, %%r9 \n\t"
496
- " mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t" " adcq %%r12, %%r11 \n\t"
497
- " mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t" " adcq %%r14, %%r13 \n\t"
498
- " adcq $0, %%rdx \n\t"
499
-
500
- " addq %%r8, 16(%0) \n\t"
501
- " adcq %%rbx, %%r9 \n\t" " movq %%r9, 24(%0) \n\t" " movq $0, %%rbx \n\t"
502
- " adcq %%r11, %%rcx \n\t"
503
- " adcq %%r13, %%rax \n\t"
504
- " adcq %%rdx, %%rbx \n\t"
505
-
506
- " movq 24(%1), %%rdx # A[3] \n\t"
507
- " mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t"
508
- " mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t" " addq %%r10, %%r9 \n\t"
509
- " mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t" " adcq %%r12, %%r11 \n\t"
510
- " mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t" " adcq %%r14, %%r13 \n\t"
511
- " adcq $0, %%rdx \n\t"
512
-
513
- " addq %%r8, 24(%0) \n\t"
514
- " adcq %%rcx, %%r9 \n\t" " movq %%r9, 32(%0) \n\t" " movq $0, %%rcx \n\t"
515
- " adcq %%r11, %%rax \n\t" " movq %%rax, 40(%0) \n\t"
516
- " adcq %%r13, %%rbx \n\t" " movq %%rbx, 48(%0) \n\t"
517
- " adcq %%rdx, %%rcx \n\t" " movq %%rcx, 56(%0) \n\t"
518
- :
519
- : "r" (c), "r" (a), "r" (b)
520
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8",
521
- "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
522
- );
529
+ __asm__ __volatile__(
530
+ "movq (%1), %%rdx; " /* A[0] */
531
+ "mulx (%2), %%r8, %%r12; " /* A[0]*B[0] */ "movq %%r8, (%0) ;"
532
+ "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ "addq %%r10, %%r12 ;"
533
+ "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ "adcq %%r8, %%rax ;"
534
+ "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ "adcq %%r10, %%rbx ;"
535
+ /*******************************************/ "adcq $0, %%rcx ;"
536
+
537
+ "movq 8(%1), %%rdx; " /* A[1] */
538
+ "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ "addq %%r12, %%r8 ;" "movq %%r8, 8(%0) ;"
539
+ "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ "adcq %%r10, %%r9 ;"
540
+ "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ "adcq %%r8, %%r11 ;"
541
+ "mulx 24(%2), %%r10, %%r12; " /* A[1]*B[3] */ "adcq %%r10, %%r13 ;"
542
+ /*******************************************/ "adcq $0, %%r12 ;"
543
+
544
+ "addq %%r9, %%rax ;"
545
+ "adcq %%r11, %%rbx ;"
546
+ "adcq %%r13, %%rcx ;"
547
+ "adcq $0, %%r12 ;"
548
+
549
+ "movq 16(%1), %%rdx; " /* A[2] */
550
+ "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ "addq %%rax, %%r8 ;" "movq %%r8, 16(%0) ;"
551
+ "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ "adcq %%r10, %%r9 ;"
552
+ "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ "adcq %%r8, %%r11 ;"
553
+ "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ "adcq %%r10, %%r13 ;"
554
+ /*******************************************/ "adcq $0, %%rax ;"
555
+
556
+ "addq %%r9, %%rbx ;"
557
+ "adcq %%r11, %%rcx ;"
558
+ "adcq %%r13, %%r12 ;"
559
+ "adcq $0, %%rax ;"
560
+
561
+ "movq 24(%1), %%rdx; " /* A[3] */
562
+ "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ "addq %%rbx, %%r8 ;" "movq %%r8, 24(%0) ;"
563
+ "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ "adcq %%r10, %%r9 ;"
564
+ "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ "adcq %%r8, %%r11 ;"
565
+ "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ "adcq %%r10, %%r13 ;"
566
+ /*******************************************/ "adcq $0, %%rbx ;"
567
+
568
+ "addq %%r9, %%rcx ;" "movq %%rcx, 32(%0) ;"
569
+ "adcq %%r11, %%r12 ;" "movq %%r12, 40(%0) ;"
570
+ "adcq %%r13, %%rax ;" "movq %%rax, 48(%0) ;"
571
+ "adcq $0, %%rbx ;" "movq %%rbx, 56(%0) ;"
572
+ :
573
+ : "r" (c), "r" (a), "r" (b)
574
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx",
575
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13"
576
+ );
523
577
  #endif
524
578
  #else /* Without BMI2 */
525
- /**
526
- * TODO: Multiplications using MULQ instruction.
527
- **/
579
+ /**
580
+ * TODO: Multiplications using MULQ instruction.
581
+ **/
528
582
  #endif
529
583
  }
530
584
 
531
- void sqr_256x256_integer_x64(uint64_t *const c, uint64_t *const a)
532
- {
585
+ void sqr_256x256_integer_x64(uint64_t *const c, uint64_t *const a) {
533
586
  #ifdef __BMI2__
534
- __asm__ __volatile__(
535
- " movq (%1), %%rdx # A[0] \n\t"
536
- " mulx %%rdx, %%r8, %%r9 # A[0]^2 \n\t"
537
- " movq 8(%1), %%rdx # A[1] \n\t"
538
- " mulx %%rdx, %%r10, %%r11 # A[1]^2 \n\t"
539
- " movq %%r8, (%0) \n\t"
540
- " movq %%r9, 8(%0) \n\t"
541
- " movq %%r10, 16(%0) \n\t"
542
- " movq %%r11, 24(%0) \n\t"
543
-
544
- " movq 16(%1), %%rdx # A[2] \n\t"
545
- " mulx %%rdx, %%r8, %%r9 # A[2]^2 \n\t"
546
- " movq 24(%1), %%rdx # A[3] \n\t"
547
- " mulx %%rdx, %%r10, %%r11 # A[3]^2 \n\t"
548
- " movq %%r8, 32(%0) \n\t"
549
- " movq %%r9, 40(%0) \n\t"
550
- " movq %%r10, 48(%0) \n\t"
551
- " movq %%r11, 56(%0) \n\t"
552
-
553
- " movq 8(%1), %%rdx # A[1] \n\t"
554
- " mulx (%1), %%r8, %%r9 # A[0]*A[1] \n\t"
555
- " mulx 16(%1), %%r10, %%r11 # A[2]*A[1] \n\t"
556
- " mulx 24(%1), %%rcx, %%r14 # A[3]*A[1] \n\t"
557
-
558
- " movq 16(%1), %%rdx # A[2] \n\t"
559
- " mulx 24(%1), %%r12, %%r13 # A[3]*A[2] \n\t"
560
- " mulx (%1), %%rax, %%rdx # A[0]*A[2] \n\t"
561
-
562
- " addq %%rax, %%r9 \n\t"
563
- " adcq %%rdx, %%r10 \n\t"
564
- " adcq %%rcx, %%r11 \n\t"
565
- " adcq %%r14, %%r12 \n\t"
566
- " adcq $0, %%r13 \n\t"
567
- " movq $0, %%r14 \n\t"
568
- " adcq $0, %%r14 \n\t"
569
-
570
- " movq (%1), %%rdx # A[0] \n\t"
571
- " mulx 24(%1), %%rax, %%rdx # A[0]*A[3] \n\t"
572
-
573
- " addq %%rax, %%r10 \n\t"
574
- " adcq %%rdx, %%r11 \n\t"
575
- " adcq $0, %%r12 \n\t"
576
- " adcq $0, %%r13 \n\t"
577
- " adcq $0, %%r14 \n\t"
578
-
579
- " shldq $1, %%r13, %%r14 \n\t"
580
- " shldq $1, %%r12, %%r13 \n\t"
581
- " shldq $1, %%r11, %%r12 \n\t"
582
- " shldq $1, %%r10, %%r11 \n\t"
583
- " shldq $1, %%r9, %%r10 \n\t"
584
- " shldq $1, %%r8, %%r9 \n\t"
585
- " shlq $1, %%r8 \n\t"
586
-
587
- " addq 8(%0), %%r8 \n\t" " movq %%r8, 8(%0) \n\t"
588
- " adcq 16(%0), %%r9 \n\t" " movq %%r9, 16(%0) \n\t"
589
- " adcq 24(%0), %%r10 \n\t" " movq %%r10, 24(%0) \n\t"
590
- " adcq 32(%0), %%r11 \n\t" " movq %%r11, 32(%0) \n\t"
591
- " adcq 40(%0), %%r12 \n\t" " movq %%r12, 40(%0) \n\t"
592
- " adcq 48(%0), %%r13 \n\t" " movq %%r13, 48(%0) \n\t"
593
- " adcq 56(%0), %%r14 \n\t" " movq %%r14, 56(%0) \n\t"
594
- :
595
- : "r" (c), "r" (a)
596
- : "memory", "cc", "%rax", "%rcx", "%rdx",
597
- "%r8", "%r9", "%r10", "%r11",
598
- "%r12", "%r13", "%r14"
599
- );
587
+ #ifdef __ADX__
588
+ __asm__ __volatile__(
589
+ "movq (%1), %%rdx ;" /* A[0] */
590
+ "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */ "xorl %%r15d, %%r15d;"
591
+ "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */ "adcx %%r14, %%r9 ;"
592
+ "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */ "adcx %%rax, %%r10 ;"
593
+ "movq 24(%1), %%rdx ;" /* A[3] */
594
+ "mulx 8(%1), %%r11, %%r12 ;" /* A[1]*A[3] */ "adcx %%rcx, %%r11 ;"
595
+ "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */ "adcx %%rax, %%r12 ;"
596
+ "movq 8(%1), %%rdx ;" /* A[1] */ "adcx %%r15, %%r13 ;"
597
+ "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */ "movq $0, %%r14 ;"
598
+ /*******************************************/ "adcx %%r15, %%r14 ;"
599
+
600
+ "xorl %%r15d, %%r15d;"
601
+ "adox %%rax, %%r10 ;" "adcx %%r8, %%r8 ;"
602
+ "adox %%rcx, %%r11 ;" "adcx %%r9, %%r9 ;"
603
+ "adox %%r15, %%r12 ;" "adcx %%r10, %%r10 ;"
604
+ "adox %%r15, %%r13 ;" "adcx %%r11, %%r11 ;"
605
+ "adox %%r15, %%r14 ;" "adcx %%r12, %%r12 ;"
606
+ "adcx %%r13, %%r13 ;"
607
+ "adcx %%r14, %%r14 ;"
608
+
609
+ "movq (%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
610
+ /********************/ "movq %%rax, 0(%0) ;"
611
+ "addq %%rcx, %%r8 ;" "movq %%r8, 8(%0) ;"
612
+ "movq 8(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
613
+ "adcq %%rax, %%r9 ;" "movq %%r9, 16(%0) ;"
614
+ "adcq %%rcx, %%r10 ;" "movq %%r10, 24(%0) ;"
615
+ "movq 16(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
616
+ "adcq %%rax, %%r11 ;" "movq %%r11, 32(%0) ;"
617
+ "adcq %%rcx, %%r12 ;" "movq %%r12, 40(%0) ;"
618
+ "movq 24(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
619
+ "adcq %%rax, %%r13 ;" "movq %%r13, 48(%0) ;"
620
+ "adcq %%rcx, %%r14 ;" "movq %%r14, 56(%0) ;"
621
+ :
622
+ : "r" (c), "r" (a)
623
+ : "memory", "cc", "%rax", "%rcx", "%rdx",
624
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"
625
+ );
626
+ #else /* Without ADX */
627
+ __asm__ __volatile__(
628
+ "movq 8(%1), %%rdx ;" /* A[1] */
629
+ "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */
630
+ "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
631
+ "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
632
+
633
+ "movq 16(%1), %%rdx ;" /* A[2] */
634
+ "mulx 24(%1), %%r12, %%r13 ;" /* A[3]*A[2] */
635
+ "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
636
+
637
+ "addq %%rax, %%r9 ;"
638
+ "adcq %%rdx, %%r10 ;"
639
+ "adcq %%rcx, %%r11 ;"
640
+ "adcq %%r14, %%r12 ;"
641
+ "adcq $0, %%r13 ;"
642
+ "movq $0, %%r14 ;"
643
+ "adcq $0, %%r14 ;"
644
+
645
+ "movq (%1), %%rdx ;" /* A[0] */
646
+ "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
647
+
648
+ "addq %%rax, %%r10 ;"
649
+ "adcq %%rcx, %%r11 ;"
650
+ "adcq $0, %%r12 ;"
651
+ "adcq $0, %%r13 ;"
652
+ "adcq $0, %%r14 ;"
653
+
654
+ "shldq $1, %%r13, %%r14 ;"
655
+ "shldq $1, %%r12, %%r13 ;"
656
+ "shldq $1, %%r11, %%r12 ;"
657
+ "shldq $1, %%r10, %%r11 ;"
658
+ "shldq $1, %%r9, %%r10 ;"
659
+ "shldq $1, %%r8, %%r9 ;"
660
+ "shlq $1, %%r8 ;"
661
+
662
+ /********************/ "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
663
+ /********************/ "movq %%rax, 0(%0) ;"
664
+ "addq %%rcx, %%r8 ;" "movq %%r8, 8(%0) ;"
665
+ "movq 8(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
666
+ "adcq %%rax, %%r9 ;" "movq %%r9, 16(%0) ;"
667
+ "adcq %%rcx, %%r10 ;" "movq %%r10, 24(%0) ;"
668
+ "movq 16(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
669
+ "adcq %%rax, %%r11 ;" "movq %%r11, 32(%0) ;"
670
+ "adcq %%rcx, %%r12 ;" "movq %%r12, 40(%0) ;"
671
+ "movq 24(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
672
+ "adcq %%rax, %%r13 ;" "movq %%r13, 48(%0) ;"
673
+ "adcq %%rcx, %%r14 ;" "movq %%r14, 56(%0) ;"
674
+ :
675
+ : "r" (c), "r" (a)
676
+ : "memory", "cc", "%rax", "%rcx", "%rdx",
677
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
678
+ );
679
+ #endif
600
680
  #else /* Without BMI2 */
601
- /**
602
- * TODO: Multiplications using MULQ instruction.
603
- **/
681
+ /**
682
+ * TODO: Multiplications using MULQ instruction.
683
+ **/
604
684
  #endif
605
685
  }
606
686
 
607
- void red_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a)
608
- {
687
+ void red_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a) {
609
688
  #ifdef __BMI2__
610
689
  #ifdef __ADX__
611
- __asm__ __volatile__(
612
- " movl $38, %%edx # 2*c = 38 = 2^256 \n\t"
613
- " mulx 32(%1), %%r8, %%r10 # c*C[4] \n\t" " xorl %%ebx, %%ebx \n\t" " adox (%1), %%r8 \n\t"
614
- " mulx 40(%1), %%r9, %%r11 # c*C[5] \n\t" " adcx %%r10, %%r9 \n\t" " adox 8(%1), %%r9 \n\t"
615
- " mulx 48(%1), %%r10, %%rax # c*C[6] \n\t" " adcx %%r11, %%r10 \n\t" " adox 16(%1), %%r10 \n\t" " movq %%r10, 16(%0) \n\t"
616
- " mulx 56(%1), %%r11, %%rcx # c*C[7] \n\t" " adcx %%rax, %%r11 \n\t" " adox 24(%1), %%r11 \n\t" " movq %%r11, 24(%0) \n\t"
617
- " adcx %%rbx, %%rcx \n\t" " adox %%rbx, %%rcx \n\t"
618
- " xorl %%ebx, %%ebx \n\t"
619
- " mulx %%rcx, %%rax, %%rcx \n\t" " adcx %%rax, %%r8 \n\t" " movq %%r8, (%0) \n\t"
620
- " adcx %%rcx, %%r9 \n\t" " movq %%r9, 8(%0) \n\t"
621
- :
622
- : "r" (c), "r" (a)
623
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
624
- );
690
+ __asm__ __volatile__(
691
+ "movl $38, %%edx ;" /* 2*c = 38 = 2^256 mod (2^255 - 19) */
692
+ "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */ "xorl %%ebx, %%ebx ;" "adox (%1), %%r8 ;"
693
+ "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ "adcx %%r10, %%r9 ;" "adox 8(%1), %%r9 ;"
694
+ "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ "adcx %%r11, %%r10 ;" "adox 16(%1), %%r10 ;"
695
+ "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ "adcx %%rax, %%r11 ;" "adox 24(%1), %%r11 ;"
696
+ /****************************************/ "adcx %%rbx, %%rcx ;" "adox %%rbx, %%rcx ;"
697
+ "clc ;"
698
+ "mulx %%rcx, %%rax, %%rcx ;" /* 38 * overflow word accumulated in rcx */
699
+ "adcx %%rax, %%r8 ;"
700
+ "adcx %%rcx, %%r9 ;" "movq %%r9, 8(%0) ;"
701
+ "adcx %%rbx, %%r10 ;" "movq %%r10, 16(%0) ;"
702
+ "adcx %%rbx, %%r11 ;" "movq %%r11, 24(%0) ;"
703
+ "mov $0, %%ecx ;"
704
+ "cmovc %%edx, %%ecx ;"
705
+ "addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
706
+ :
707
+ : "r" (c), "r" (a)
708
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
709
+ );
625
710
  #else
626
- __asm__ __volatile__(
627
- " movl $38, %%edx # 2*c = 38 = 2^256 \n\t"
628
- " mulx 32(%1), %%r8, %%r9 # c*C[4] \n\t"
629
- " mulx 40(%1), %%r10, %%r11 # c*C[5] \n\t" " addq %%r9, %%r10 \n\t"
630
- " mulx 48(%1), %%r12, %%r13 # c*C[6] \n\t" " adcq %%r11, %%r12 \n\t"
631
- " mulx 56(%1), %%rax, %%rcx # c*C[7] \n\t" " adcq %%r13, %%rax \n\t"
632
- " adcq $0, %%rcx \n\t"
633
-
634
- " addq (%1), %%r8 \n\t"
635
- " adcq 8(%1), %%r10 \n\t"
636
- " adcq 16(%1), %%r12 \n\t" " movq %%r12, 16(%0) \n\t"
637
- " adcq 24(%1), %%rax \n\t" " movq %%rax, 24(%0) \n\t"
638
- " adcq $0, %%rcx \n\t"
639
-
640
- " mulx %%rcx, %%rax, %%rcx \n\t"
641
- " addq %%rax, %%r8 \n\t" " movq %%r8, (%0) \n\t"
642
- " adcq %%rcx, %%r10 \n\t" " movq %%r10, 8(%0) \n\t"
643
- :
644
- : "r" (c), "r" (a)
645
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13"
646
- );
711
+ __asm__ __volatile__(
712
+ "movl $38, %%edx ;" /* 2*c = 38 = 2^256 mod (2^255 - 19) */
713
+ "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
714
+ "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ "addq %%r10, %%r9 ;"
715
+ "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ "adcq %%r11, %%r10 ;"
716
+ "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ "adcq %%rax, %%r11 ;"
717
+ /****************************************/ "adcq $0, %%rcx ;"
718
+ "addq (%1), %%r8 ;"
719
+ "adcq 8(%1), %%r9 ;"
720
+ "adcq 16(%1), %%r10 ;"
721
+ "adcq 24(%1), %%r11 ;"
722
+ "adcq $0, %%rcx ;"
723
+ "mulx %%rcx, %%rax, %%rcx ;" /* 38 * overflow word accumulated in rcx */
724
+ "addq %%rax, %%r8 ;"
725
+ "adcq %%rcx, %%r9 ;" "movq %%r9, 8(%0) ;"
726
+ "adcq $0, %%r10 ;" "movq %%r10, 16(%0) ;"
727
+ "adcq $0, %%r11 ;" "movq %%r11, 24(%0) ;"
728
+ "mov $0, %%ecx ;"
729
+ "cmovc %%edx, %%ecx ;"
730
+ "addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
731
+ :
732
+ : "r" (c), "r" (a)
733
+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
734
+ );
647
735
  #endif
648
736
  #else /* Without BMI2 */
649
- /**
650
- * TODO: Multiplications using MULQ instruction.
651
- **/
737
+ /**
738
+ * TODO: Multiplications using MULQ instruction.
739
+ **/
652
740
  #endif
653
741
  }
654
742
 
655
- inline void add_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b)
656
- {
657
- #if __ADX__
658
- __asm__ __volatile__(
659
- "movq (%2), %%rax \n\t"
660
- "movq 8(%2), %%rcx \n\t"
661
- "movq 16(%2), %%r8 \n\t"
662
- "movq 24(%2), %%r9 \n\t"
663
- "clc \n\t"
664
- "adcx (%1), %%rax \n\t"
665
- "adcx 8(%1), %%rcx \n\t"
666
- "adcx 16(%1), %%r8 \n\t"
667
- "adcx 24(%1), %%r9 \n\t"
668
- "movq %%rcx, 8(%0) \n\t"
669
- "movq %%r8 , 16(%0) \n\t"
670
- "movq %%r9 , 24(%0) \n\t"
671
- "setc %%cl \n\t"
672
- "neg %%rcx \n\t"
673
- "andq $38, %%rcx \n\t"
674
- "addq %%rcx, %%rax \n\t"
675
- "movq %%rax, (%0) \n\t"
676
- :
677
- : "r" (c), "r" (a), "r" (b)
678
- : "memory","cc", "%rax", "%rcx", "%r8", "%r9"
679
- );
743
+ inline void add_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b) {
744
+ #ifdef __ADX__
745
+ __asm__ __volatile__(
746
+ "mov $38, %%eax ;"
747
+ "xorl %%ecx, %%ecx ;"
748
+ "movq (%2), %%r8 ;" "adcx (%1), %%r8 ;"
749
+ "movq 8(%2), %%r9 ;" "adcx 8(%1), %%r9 ;"
750
+ "movq 16(%2), %%r10 ;" "adcx 16(%1), %%r10 ;"
751
+ "movq 24(%2), %%r11 ;" "adcx 24(%1), %%r11 ;"
752
+ "cmovc %%eax, %%ecx ;"
753
+ "xorl %%eax, %%eax ;"
754
+ "adcx %%rcx, %%r8 ;"
755
+ "adcx %%rax, %%r9 ;" "movq %%r9, 8(%0) ;"
756
+ "adcx %%rax, %%r10 ;" "movq %%r10, 16(%0) ;"
757
+ "adcx %%rax, %%r11 ;" "movq %%r11, 24(%0) ;"
758
+ "mov $38, %%ecx ;"
759
+ "cmovc %%ecx, %%eax ;"
760
+ "addq %%rax, %%r8 ;" "movq %%r8, (%0) ;"
761
+ :
762
+ : "r" (c), "r" (a), "r" (b)
763
+ : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"
764
+ );
680
765
  #else
681
- __asm__ __volatile__(
682
- "movq (%2), %%rax \n\t"
683
- "movq 8(%2), %%rcx \n\t"
684
- "movq 16(%2), %%r8 \n\t"
685
- "movq 24(%2), %%r9 \n\t"
686
- "add (%1), %%rax \n\t"
687
- "adc 8(%1), %%rcx \n\t"
688
- "adc 16(%1), %%r8 \n\t"
689
- "adc 24(%1), %%r9 \n\t"
690
- "movq %%rcx, 8(%0) \n\t"
691
- "movq %%r8 , 16(%0) \n\t"
692
- "movq %%r9 , 24(%0) \n\t"
693
- "setc %%cl \n\t"
694
- "neg %%rcx \n\t"
695
- "andq $38, %%rcx \n\t"
696
- "addq %%rcx, %%rax \n\t"
697
- "movq %%rax, (%0) \n\t"
698
- :
699
- : "r" (c), "r" (a), "r" (b)
700
- : "memory","cc", "%rax", "%rcx", "%r8", "%r9"
701
- );
766
+ __asm__ __volatile__(
767
+ "mov $38, %%eax ;"
768
+ "movq (%2), %%r8 ;" "addq (%1), %%r8 ;"
769
+ "movq 8(%2), %%r9 ;" "adcq 8(%1), %%r9 ;"
770
+ "movq 16(%2), %%r10 ;" "adcq 16(%1), %%r10 ;"
771
+ "movq 24(%2), %%r11 ;" "adcq 24(%1), %%r11 ;"
772
+ "mov $0, %%ecx ;"
773
+ "cmovc %%eax, %%ecx ;"
774
+ "addq %%rcx, %%r8 ;"
775
+ "adcq $0, %%r9 ;" "movq %%r9, 8(%0) ;"
776
+ "adcq $0, %%r10 ;" "movq %%r10, 16(%0) ;"
777
+ "adcq $0, %%r11 ;" "movq %%r11, 24(%0) ;"
778
+ "mov $0, %%ecx ;"
779
+ "cmovc %%eax, %%ecx ;"
780
+ "addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
781
+ :
782
+ : "r" (c), "r" (a), "r" (b)
783
+ : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"
784
+ );
702
785
  #endif
703
786
  }
704
787
 
705
- inline void sub_EltFp25519_1w_x64(uint64_t *const __restrict c, uint64_t *const __restrict a,
706
- uint64_t *const __restrict b)
707
- {
708
- __asm__ __volatile__(
709
- "movq (%1), %%rax \n\t"
710
- "movq 8(%1), %%rcx \n\t"
711
- "movq 16(%1), %%r8 \n\t"
712
- "movq 24(%1), %%r9 \n\t"
713
- "subq (%2), %%rax \n\t"
714
- "sbbq 8(%2), %%rcx \n\t"
715
- "sbbq 16(%2), %%r8 \n\t"
716
- "sbbq 24(%2), %%r9 \n\t"
717
- "movq %%rcx, 8(%0) \n\t"
718
- "movq %%r8 , 16(%0) \n\t"
719
- "movq %%r9 , 24(%0) \n\t"
720
- "setc %%cl \n\t"
721
- "neg %%rcx \n\t"
722
- "andq $38, %%rcx \n\t"
723
- "subq %%rcx, %%rax \n\t"
724
- "movq %%rax, (%0) \n\t"
725
- :
726
- : "r" (c), "r" (a), "r" (b)
727
- : "memory","cc", "%rax", "%rcx", "%r8", "%r9"
728
- );
788
+ inline void sub_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b) {
789
+ __asm__ __volatile__(
790
+ "mov $38, %%eax ;"
791
+ "movq (%1), %%r8 ;" "subq (%2), %%r8 ;"
792
+ "movq 8(%1), %%r9 ;" "sbbq 8(%2), %%r9 ;"
793
+ "movq 16(%1), %%r10 ;" "sbbq 16(%2), %%r10 ;"
794
+ "movq 24(%1), %%r11 ;" "sbbq 24(%2), %%r11 ;"
795
+ "mov $0, %%ecx ;"
796
+ "cmovc %%eax, %%ecx ;"
797
+ "subq %%rcx, %%r8 ;"
798
+ "sbbq $0, %%r9 ;" "movq %%r9, 8(%0) ;"
799
+ "sbbq $0, %%r10 ;" "movq %%r10, 16(%0) ;"
800
+ "sbbq $0, %%r11 ;" "movq %%r11, 24(%0) ;"
801
+ "mov $0, %%ecx ;"
802
+ "cmovc %%eax, %%ecx ;"
803
+ "subq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
804
+ :
805
+ : "r" (c), "r" (a), "r" (b)
806
+ : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"
807
+ );
729
808
  }
730
809
 
731
- inline void mul_a24_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a)
732
- {
810
+ /**
811
+ * Multiplication by a24 = (A+2)/4 = (486662+2)/4 = 121666
812
+ **/
813
+ inline void mul_a24_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a) {
733
814
  #ifdef __BMI2__
734
- /**
735
- * a24 = (A+2)/4 = (486662+2)/4 = 121666
736
- **/
737
- const uint64_t a24 = 121666;
738
- __asm__ __volatile__(
739
- "movq %2, %%rdx \n\t"
740
- "mulx (%1), %%rax, %%r8 \n\t"
741
- "mulx 8(%1), %%rcx, %%r9 \n\t"
742
- "movq %%rax, (%0) \n\t"
743
- "movq %%rcx, 8(%0) \n\t"
744
- "mulx 16(%1), %%rax, %%r10 \n\t"
745
- "mulx 24(%1), %%rcx, %%r11 \n\t"
746
- "movq %%rax, 16(%0) \n\t"
747
- "movq %%rcx, 24(%0) \n\t"
748
- "movq $38, %%rdx \n\t"
749
- "mulx %%r11, %%rax, %%rcx \n\t"
750
- "addq %%rax, (%0) \n\t"
751
- "adcq %%r8, 8(%0) \n\t"
752
- "adcq %%r9, 16(%0) \n\t"
753
- "adcq %%r10, 24(%0) \n\t"
754
- :
755
- : "r" (c), "r" (a), "r" (a24)
756
- : "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
757
- );
815
+ const uint64_t a24 = 121666;
816
+ __asm__ __volatile__(
817
+ "movq %2, %%rdx ;"
818
+ "mulx (%1), %%r8, %%r10 ;"
819
+ "mulx 8(%1), %%r9, %%r11 ;" "addq %%r10, %%r9 ;"
820
+ "mulx 16(%1), %%r10, %%rax ;" "adcq %%r11, %%r10 ;"
821
+ "mulx 24(%1), %%r11, %%rcx ;" "adcq %%rax, %%r11 ;"
822
+ /***************************/ "adcq $0, %%rcx ;"
823
+ "movl $38, %%edx ;" /* 2*c = 38 = 2^256 mod 2^255-19*/
824
+ "mulx %%rcx, %%rax, %%rcx ;"
825
+ "addq %%rax, %%r8 ;"
826
+ "adcq %%rcx, %%r9 ;" "movq %%r9, 8(%0) ;"
827
+ "adcq $0, %%r10 ;" "movq %%r10, 16(%0) ;"
828
+ "adcq $0, %%r11 ;" "movq %%r11, 24(%0) ;"
829
+ "mov $0, %%ecx ;"
830
+ "cmovc %%edx, %%ecx ;"
831
+ "addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
832
+ :
833
+ : "r" (c), "r" (a), "r" (a24)
834
+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
835
+ );
758
836
  #else /* Without BMI2 */
759
- /**
760
- * TODO: Multiplications using MULQ instruction.
761
- **/
837
+ /**
838
+ * TODO: Multiplications using MULQ instruction.
839
+ **/
762
840
  #endif
763
841
  }
764
842
 
765
- void inv_EltFp25519_1w_x64(uint64_t *const pC, uint64_t *const pA)
766
- {
767
- #define sqrn_EltFp25519_1w_x64(a,times)\
768
- counter = times;\
769
- while(counter-- > 0)\
770
- {\
771
- sqr_EltFp25519_1w_x64(a);\
772
- }
773
-
774
- EltFp25519_1w_Buffer_x64 buffer_1w;
775
- EltFp25519_1w_x64 x0, x1, x2;
776
- uint64_t * T[5];
777
- uint64_t counter;
778
-
779
- T[0] = x0;
780
- T[1] = pC; /* x^(-1) */
781
- T[2] = x1;
782
- T[3] = x2;
783
- T[4] = pA; /* x */
784
-
785
- copy_EltFp25519_1w_x64(T[1],pA);
786
- sqrn_EltFp25519_1w_x64(T[1],1);
787
- copy_EltFp25519_1w_x64(T[2],T[1]);
788
- sqrn_EltFp25519_1w_x64(T[2],2);
789
- mul_EltFp25519_1w_x64(T[0], pA, T[2]);
790
- mul_EltFp25519_1w_x64(T[1], T[1], T[0]);
791
- copy_EltFp25519_1w_x64(T[2],T[1]);
792
- sqrn_EltFp25519_1w_x64(T[2],1);
793
- mul_EltFp25519_1w_x64(T[0], T[0], T[2]);
794
- copy_EltFp25519_1w_x64(T[2],T[0]);
795
- sqrn_EltFp25519_1w_x64(T[2],5);
796
- mul_EltFp25519_1w_x64(T[0], T[0], T[2]);
797
- copy_EltFp25519_1w_x64(T[2],T[0]);
798
- sqrn_EltFp25519_1w_x64(T[2],10);
799
- mul_EltFp25519_1w_x64(T[2], T[2], T[0]);
800
- copy_EltFp25519_1w_x64(T[3],T[2]);
801
- sqrn_EltFp25519_1w_x64(T[3],20);
802
- mul_EltFp25519_1w_x64(T[3], T[3], T[2]);
803
- sqrn_EltFp25519_1w_x64(T[3],10);
804
- mul_EltFp25519_1w_x64(T[3], T[3], T[0]);
805
- copy_EltFp25519_1w_x64(T[0],T[3]);
806
- sqrn_EltFp25519_1w_x64(T[0],50);
807
- mul_EltFp25519_1w_x64(T[0], T[0], T[3]);
808
- copy_EltFp25519_1w_x64(T[2],T[0]);
809
- sqrn_EltFp25519_1w_x64(T[2],100);
810
- mul_EltFp25519_1w_x64(T[2], T[2], T[0]);
811
- sqrn_EltFp25519_1w_x64(T[2],50);
812
- mul_EltFp25519_1w_x64(T[2], T[2], T[3]);
813
- sqrn_EltFp25519_1w_x64(T[2],5);
814
- mul_EltFp25519_1w_x64(T[1], T[1], T[2]);
843
+ void inv_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a) {
844
+ #define sqrn_EltFp25519_1w_x64(A, times)\
845
+ counter = times;\
846
+ while ( counter-- > 0) {\
847
+ sqr_EltFp25519_1w_x64(A);\
848
+ }
849
+
850
+ EltFp25519_1w_Buffer_x64 buffer_1w;
851
+ EltFp25519_1w_x64 x0, x1, x2;
852
+ uint64_t * T[5];
853
+ uint64_t counter;
854
+
855
+ T[0] = x0;
856
+ T[1] = c; /* x^(-1) */
857
+ T[2] = x1;
858
+ T[3] = x2;
859
+ T[4] = a; /* x */
860
+
861
+ copy_EltFp25519_1w_x64(T[1], a);
862
+ sqrn_EltFp25519_1w_x64(T[1], 1);
863
+ copy_EltFp25519_1w_x64(T[2], T[1]);
864
+ sqrn_EltFp25519_1w_x64(T[2], 2);
865
+ mul_EltFp25519_1w_x64(T[0], a, T[2]);
866
+ mul_EltFp25519_1w_x64(T[1], T[1], T[0]);
867
+ copy_EltFp25519_1w_x64(T[2], T[1]);
868
+ sqrn_EltFp25519_1w_x64(T[2], 1);
869
+ mul_EltFp25519_1w_x64(T[0], T[0], T[2]);
870
+ copy_EltFp25519_1w_x64(T[2], T[0]);
871
+ sqrn_EltFp25519_1w_x64(T[2], 5);
872
+ mul_EltFp25519_1w_x64(T[0], T[0], T[2]);
873
+ copy_EltFp25519_1w_x64(T[2], T[0]);
874
+ sqrn_EltFp25519_1w_x64(T[2], 10);
875
+ mul_EltFp25519_1w_x64(T[2], T[2], T[0]);
876
+ copy_EltFp25519_1w_x64(T[3], T[2]);
877
+ sqrn_EltFp25519_1w_x64(T[3], 20);
878
+ mul_EltFp25519_1w_x64(T[3], T[3], T[2]);
879
+ sqrn_EltFp25519_1w_x64(T[3], 10);
880
+ mul_EltFp25519_1w_x64(T[3], T[3], T[0]);
881
+ copy_EltFp25519_1w_x64(T[0], T[3]);
882
+ sqrn_EltFp25519_1w_x64(T[0], 50);
883
+ mul_EltFp25519_1w_x64(T[0], T[0], T[3]);
884
+ copy_EltFp25519_1w_x64(T[2], T[0]);
885
+ sqrn_EltFp25519_1w_x64(T[2], 100);
886
+ mul_EltFp25519_1w_x64(T[2], T[2], T[0]);
887
+ sqrn_EltFp25519_1w_x64(T[2], 50);
888
+ mul_EltFp25519_1w_x64(T[2], T[2], T[3]);
889
+ sqrn_EltFp25519_1w_x64(T[2], 5);
890
+ mul_EltFp25519_1w_x64(T[1], T[1], T[2]);
815
891
  #undef sqrn_EltFp25519_1w_x64
816
892
  }
817
893
 
818
- inline void fred_EltFp25519_1w_x64(uint64_t *const c)
819
- {
820
- int64_t last = (((int64_t*)c)[3])>>63;
821
- c[3] &= ((uint64_t)1<<63)-1;
822
- c[0] += 19 & last;
894
+ /**
895
+ * Given C, a 256-bit number, fred_EltFp25519_1w_x64 updates C
896
+ * with a number such that 0 <= C < 2**255-19.
897
+ * Contributed by: Samuel Neves.
898
+ **/
899
+ inline void fred_EltFp25519_1w_x64(uint64_t *const c) {
900
+ __asm__ __volatile__ (
901
+ /* First, obtains a number less than 2^255. */
902
+ "btrq $63, 24(%0) ;"
903
+ "sbbl %%ecx, %%ecx ;"
904
+ "andq $19, %%rcx ;"
905
+ "addq %%rcx, (%0) ;"
906
+ "adcq $0, 8(%0) ;"
907
+ "adcq $0, 16(%0) ;"
908
+ "adcq $0, 24(%0) ;"
909
+
910
+ "btrq $63, 24(%0) ;"
911
+ "sbbl %%ecx, %%ecx ;"
912
+ "andq $19, %%rcx ;"
913
+ "addq %%rcx, (%0) ;"
914
+ "adcq $0, 8(%0) ;"
915
+ "adcq $0, 16(%0) ;"
916
+ "adcq $0, 24(%0) ;"
917
+
918
+ /* Then, in case the number fall into [2^255-19, 2^255-1] */
919
+ "cmpq $-19, (%0) ;"
920
+ "setaeb %%al ;"
921
+ "cmpq $-1, 8(%0) ;"
922
+ "setzb %%bl ;"
923
+ "cmpq $-1, 16(%0) ;"
924
+ "setzb %%cl ;"
925
+ "movq 24(%0), %%rdx ;"
926
+ "addq $1, %%rdx ;"
927
+ "shrq $63, %%rdx ;"
928
+ "andb %%bl, %%al ;"
929
+ "andb %%dl, %%cl ;"
930
+ "test %%cl, %%al ;"
931
+ "movl $0, %%eax ;"
932
+ "movl $19, %%ecx ;"
933
+ "cmovnz %%rcx, %%rax ;"
934
+ "addq %%rax, (%0) ;"
935
+ "adcq $0, 8(%0) ;"
936
+ "adcq $0, 16(%0) ;"
937
+ "adcq $0, 24(%0) ;"
938
+ "btrq $63, 24(%0) ;"
939
+ :
940
+ : "r"(c)
941
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx"
942
+ );
823
943
  }
824
-