x25519 1.0.5 → 1.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,7 +4,7 @@
4
4
 
5
5
  require "mkmf"
6
6
 
7
- $CFLAGS << " -Wall -O3 -pedantic -std=c99 -mbmi -mbmi2 -march=native -mtune=native"
7
+ $CFLAGS << " -Wall -O3 -pedantic -std=c99 -mbmi -mbmi2 -march=haswell"
8
8
 
9
9
  create_makefile "x25519_precomputed"
10
10
 
@@ -1,213 +1,210 @@
1
1
  /**
2
- * Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>.
2
+ * Copyright (c) 2017, Armando Faz <armfazh@ic.unicamp.br>. All rights reserved.
3
3
  * Institute of Computing.
4
4
  * University of Campinas, Brazil.
5
5
  *
6
- * This program is free software: you can redistribute it and/or modify
7
- * it under the terms of the GNU Lesser General Public License as
8
- * published by the Free Software Foundation, version 3.
6
+ * Copyright (C) 2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
7
+ * Copyright (C) 2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
9
8
  *
10
- * This program is distributed in the hope that it will be useful, but
11
- * WITHOUT ANY WARRANTY; without even the implied warranty of
12
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
- * Lesser General Public License for more details.
9
+ * Redistribution and use in source and binary forms, with or without
10
+ * modification, are permitted provided that the following conditions
11
+ * are met:
14
12
  *
15
- * You should have received a copy of the GNU Lesser General Public License
16
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
17
- */
18
- #include "fp25519_x64.h"
19
-
20
- int compare_bytes(uint8_t* A, uint8_t* B,unsigned int num_bytes)
21
- {
22
- unsigned int i=0;
23
- uint8_t ret=0;
24
- for(i=0;i<num_bytes;i++)
25
- {
26
- ret += A[i]^B[i];
27
- }
28
- return ret;
29
- }
13
+ * * Redistributions of source code must retain the above copyright
14
+ * notice, this list of conditions and the following disclaimer.
15
+ * * Redistributions in binary form must reproduce the above
16
+ * copyright notice, this list of conditions and the following
17
+ * disclaimer in the documentation and/or other materials provided
18
+ * with the distribution.
19
+ * * Neither the name of University of Campinas nor the names of its
20
+ * contributors may be used to endorse or promote products derived
21
+ * from this software without specific prior written permission.
22
+ *
23
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
29
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
32
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
33
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
34
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
35
+ */
30
36
 
31
- int compare_EltFp25519_1w_x64(uint64_t *A, uint64_t *B)
32
- {
33
- return compare_bytes((uint8_t*)A,(uint8_t*)B,SIZE_ELEMENT_BYTES);
34
- }
37
+ #include "fp25519_x64.h"
35
38
 
36
39
  /**
37
40
  *
38
- * @param c Two 512-bit products: c[0:7]=a[0:3]*b[0:3] and c[8:15]=a[4:7]*b[4:7]
39
- * @param a Two 256-bit integers: a[0:3] and a[4:7]
40
- * @param b Two 256-bit integers: b[0:3] and b[4:7]
41
+ * @param c Two 512-bit products: c0[0:7]=a0[0:3]*b0[0:3] and c1[8:15]=a1[4:7]*b1[4:7]
42
+ * @param a Two 256-bit integers: a0[0:3] and a1[4:7]
43
+ * @param b Two 256-bit integers: b0[0:3] and b1[4:7]
41
44
  */
42
- void mul2_256x256_integer_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b)
43
- {
45
+ void mul2_256x256_integer_x64(uint64_t *const c, uint64_t *const a,
46
+ uint64_t *const b) {
44
47
  #ifdef __BMI2__
45
48
  #ifdef __ADX__
46
- __asm__ __volatile__(
47
- "movq (%1), %%rdx # A[0] \n\t"
48
- "mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "movq %%r8, (%0) \n\t"
49
- "mulx 8(%2), %%r10, %%r11 # A[0]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "movq %%r10, 8(%0) \n\t"
50
- "mulx 16(%2), %%r12, %%r13 # A[0]*B[2] \n\t" "adox %%r11, %%r12 \n\t"
51
- "mulx 24(%2), %%r14, %%rdx # A[0]*B[3] \n\t" "adox %%r13, %%r14 \n\t" " movq $0, %%rax \n\t"
52
- "adox %%rdx, %%rax \n\t"
53
-
54
- "movq 8(%1), %%rdx # A[1] \n\t"
55
- "mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 8(%0), %%r8 \n\t" "movq %%r8, 8(%0) \n\t"
56
- "mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 16(%0) \n\t"
57
- "mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
58
- "mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
59
- "adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
60
-
61
- "movq 16(%1), %%rdx # A[2] \n\t"
62
- "mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 16(%0), %%r8 \n\t" "movq %%r8, 16(%0) \n\t"
63
- "mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 24(%0) \n\t"
64
- "mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
65
- "mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
66
- "adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
67
-
68
- "movq 24(%1), %%rdx # A[3] \n\t"
69
- "mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 24(%0), %%r8 \n\t" "movq %%r8, 24(%0) \n\t"
70
- "mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 32(%0) \n\t"
71
- "mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" "movq %%r12, 40(%0) \n\t" " movq $0, %%r8 \n\t"
72
- "mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" "movq %%r14, 48(%0) \n\t" " movq $0, %%rax \n\t"
73
- "adox %%rdx, %%rax \n\t" "adcx %%r8, %%rax \n\t" " movq %%rax, 56(%0) \n\t"
74
-
75
- "movq 32(%1), %%rdx # A[0] \n\t"
76
- "mulx 32(%2), %%r8, %%r9 # A[0]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "movq %%r8, 64(%0) \n\t"
77
- "mulx 40(%2), %%r10, %%r11 # A[0]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "movq %%r10, 72(%0) \n\t"
78
- "mulx 48(%2), %%r12, %%r13 # A[0]*B[2] \n\t" "adox %%r11, %%r12 \n\t"
79
- "mulx 56(%2), %%r14, %%rdx # A[0]*B[3] \n\t" "adox %%r13, %%r14 \n\t" " movq $0, %%rax \n\t"
80
- "adox %%rdx, %%rax \n\t"
81
-
82
- "movq 40(%1), %%rdx # A[1] \n\t"
83
- "mulx 32(%2), %%r8, %%r9 # A[1]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 72(%0), %%r8 \n\t" "movq %%r8, 72(%0) \n\t"
84
- "mulx 40(%2), %%r10, %%r11 # A[1]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 80(%0) \n\t"
85
- "mulx 48(%2), %%r12, %%r13 # A[1]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
86
- "mulx 56(%2), %%r14, %%rdx # A[1]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
87
- "adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
88
-
89
- "movq 48(%1), %%rdx # A[2] \n\t"
90
- "mulx 32(%2), %%r8, %%r9 # A[2]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 80(%0), %%r8 \n\t" "movq %%r8, 80(%0) \n\t"
91
- "mulx 40(%2), %%r10, %%r11 # A[2]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 88(%0) \n\t"
92
- "mulx 48(%2), %%r12, %%r13 # A[2]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
93
- "mulx 56(%2), %%r14, %%rdx # A[2]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
94
- "adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
95
-
96
- "movq 56(%1), %%rdx # A[3] \n\t"
97
- "mulx 32(%2), %%r8, %%r9 # A[3]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 88(%0), %%r8 \n\t" "movq %%r8, 88(%0) \n\t"
98
- "mulx 40(%2), %%r10, %%r11 # A[3]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 96(%0) \n\t"
99
- "mulx 48(%2), %%r12, %%r13 # A[3]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" "movq %%r12, 104(%0) \n\t" " movq $0, %%r8 \n\t"
100
- "mulx 56(%2), %%r14, %%rdx # A[3]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" "movq %%r14, 112(%0) \n\t" " movq $0, %%rax \n\t"
101
- "adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t" " movq %%rax, 120(%0) \n\t"
102
- :
103
- : "r" (c), "r" (a), "r" (b)
104
- : "memory", "cc", "%rax", "%rdx",
105
- "%r8", "%r9", "%r10", "%r11",
106
- "%r12", "%r13", "%r14"
107
- );
49
+ __asm__ __volatile__(
50
+ "xorl %%r14d, %%r14d ;"
51
+ "movq (%1), %%rdx; " /* A[0] */
52
+ "mulx (%2), %%r8, %%r12; " /* A[0]*B[0] */ "xorl %%r10d, %%r10d ;" "movq %%r8, (%0) ;"
53
+ "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ "adox %%r10, %%r12 ;"
54
+ "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ "adox %%r8, %%rax ;"
55
+ "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ "adox %%r10, %%rbx ;"
56
+ /*******************************************/ "adox %%r14, %%rcx ;"
57
+
58
+ "movq 8(%1), %%rdx; " /* A[1] */
59
+ "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ "adox %%r12, %%r8 ;" "movq %%r8, 8(%0) ;"
60
+ "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rax ;"
61
+ "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%rbx ;"
62
+ "mulx 24(%2), %%r10, %%r12; " /* A[1]*B[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%rcx ;"
63
+ /*******************************************/ "adox %%r14, %%r12 ;" "adcx %%r14, %%r12 ;"
64
+
65
+ "movq 16(%1), %%rdx; " /* A[2] */ "xorl %%r10d, %%r10d ;"
66
+ "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ "adox %%rax, %%r8 ;" "movq %%r8, 16(%0) ;"
67
+ "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rbx ;"
68
+ "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%rcx ;"
69
+ "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%r12 ;"
70
+ /*******************************************/ "adox %%r14, %%rax ;" "adcx %%r14, %%rax ;"
71
+
72
+ "movq 24(%1), %%rdx; " /* A[3] */ "xorl %%r10d, %%r10d ;"
73
+ "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ "adox %%rbx, %%r8 ;" "movq %%r8, 24(%0) ;"
74
+ "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rcx ;" "movq %%rcx, 32(%0) ;"
75
+ "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%r12 ;" "movq %%r12, 40(%0) ;"
76
+ "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%rax ;" "movq %%rax, 48(%0) ;"
77
+ /*******************************************/ "adox %%r14, %%rbx ;" "adcx %%r14, %%rbx ;" "movq %%rbx, 56(%0) ;"
78
+
79
+ "movq 32(%1), %%rdx; " /* C[0] */
80
+ "mulx 32(%2), %%r8, %%r12; " /* C[0]*D[0] */ "xorl %%r10d, %%r10d ;" "movq %%r8, 64(%0);"
81
+ "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */ "adox %%r10, %%r12 ;"
82
+ "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */ "adox %%r8, %%rax ;"
83
+ "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */ "adox %%r10, %%rbx ;"
84
+ /*******************************************/ "adox %%r14, %%rcx ;"
85
+
86
+ "movq 40(%1), %%rdx; " /* C[1] */ "xorl %%r10d, %%r10d ;"
87
+ "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */ "adox %%r12, %%r8 ;" "movq %%r8, 72(%0);"
88
+ "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rax ;"
89
+ "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%rbx ;"
90
+ "mulx 56(%2), %%r10, %%r12; " /* C[1]*D[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%rcx ;"
91
+ /*******************************************/ "adox %%r14, %%r12 ;" "adcx %%r14, %%r12 ;"
92
+
93
+ "movq 48(%1), %%rdx; " /* C[2] */ "xorl %%r10d, %%r10d ;"
94
+ "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */ "adox %%rax, %%r8 ;" "movq %%r8, 80(%0);"
95
+ "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rbx ;"
96
+ "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%rcx ;"
97
+ "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%r12 ;"
98
+ /*******************************************/ "adox %%r14, %%rax ;" "adcx %%r14, %%rax ;"
99
+
100
+ "movq 56(%1), %%rdx; " /* C[3] */ "xorl %%r10d, %%r10d ;"
101
+ "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */ "adox %%rbx, %%r8 ;" "movq %%r8, 88(%0);"
102
+ "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rcx ;" "movq %%rcx, 96(%0) ;"
103
+ "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%r12 ;" "movq %%r12, 104(%0) ;"
104
+ "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%rax ;" "movq %%rax, 112(%0) ;"
105
+ /*******************************************/ "adox %%r14, %%rbx ;" "adcx %%r14, %%rbx ;" "movq %%rbx, 120(%0) ;"
106
+ :
107
+ : "r" (c), "r" (a), "r" (b)
108
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx",
109
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
110
+ );
108
111
  #else
109
- __asm__ __volatile__(
110
- "movq (%1), %%rdx # A[0] \n\t"
111
- "mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t" "movq %%r8, (%0) \n\t"
112
- "mulx 8(%2), %%r10, %%rax # A[0]*B[1] \n\t" "addq %%r10, %%r9 \n\t" "movq %%r9, 8(%0) \n\t"
113
- "mulx 16(%2), %%r12, %%rbx # A[0]*B[2] \n\t" "adcq %%r12, %%rax \n\t"
114
- "mulx 24(%2), %%r14, %%rcx # A[0]*B[3] \n\t" "adcq %%r14, %%rbx \n\t"
115
- "adcq $0, %%rcx \n\t"
116
-
117
- "movq 8(%1), %%rdx # A[1] \n\t"
118
- "mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t"
119
- "mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
120
- "mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
121
- "mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
122
- "adcq $0, %%rdx \n\t"
123
-
124
- "addq %%r8, 8(%0) \n\t"
125
- "adcq %%rax, %%r9 \n\t" "movq %%r9, 16(%0) \n\t" "movq $0, %%rax \n\t"
126
- "adcq %%r11, %%rbx \n\t"
127
- "adcq %%r13, %%rcx \n\t"
128
- "adcq %%rdx, %%rax \n\t"
129
-
130
- "movq 16(%1), %%rdx # A[2] \n\t"
131
- "mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t"
132
- "mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
133
- "mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
134
- "mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
135
- "adcq $0, %%rdx \n\t"
136
-
137
- "addq %%r8, 16(%0) \n\t"
138
- "adcq %%rbx, %%r9 \n\t" "movq %%r9, 24(%0) \n\t" "movq $0, %%rbx \n\t"
139
- "adcq %%r11, %%rcx \n\t"
140
- "adcq %%r13, %%rax \n\t"
141
- "adcq %%rdx, %%rbx \n\t"
142
-
143
- "movq 24(%1), %%rdx # A[3] \n\t"
144
- "mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t"
145
- "mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
146
- "mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
147
- "mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
148
- "adcq $0, %%rdx \n\t"
149
-
150
- "addq %%r8, 24(%0) \n\t"
151
- "adcq %%rcx, %%r9 \n\t" "movq %%r9, 32(%0) \n\t" " movq $0, %%rcx \n\t"
152
- "adcq %%r11, %%rax \n\t" "movq %%rax, 40(%0) \n\t"
153
- "adcq %%r13, %%rbx \n\t" "movq %%rbx, 48(%0) \n\t"
154
- "adcq %%rdx, %%rcx \n\t" "movq %%rcx, 56(%0) \n\t"
155
-
156
- "movq 32(%1), %%rdx # A[0] \n\t"
157
- "mulx 32(%2), %%r8, %%r9 # A[0]*B[0] \n\t" "movq %%r8, 64(%0) \n\t"
158
- "mulx 40(%2), %%r10, %%rax # A[0]*B[1] \n\t" "addq %%r10, %%r9 \n\t" "movq %%r9, 72(%0) \n\t"
159
- "mulx 48(%2), %%r12, %%rbx # A[0]*B[2] \n\t" "adcq %%r12, %%rax \n\t"
160
- "mulx 56(%2), %%r14, %%rcx # A[0]*B[3] \n\t" "adcq %%r14, %%rbx \n\t"
161
- "adcq $0, %%rcx \n\t"
162
-
163
- "movq 40(%1), %%rdx # A[1] \n\t"
164
- "mulx 32(%2), %%r8, %%r9 # A[1]*B[0] \n\t"
165
- "mulx 40(%2), %%r10, %%r11 # A[1]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
166
- "mulx 48(%2), %%r12, %%r13 # A[1]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
167
- "mulx 56(%2), %%r14, %%rdx # A[1]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
168
- "adcq $0, %%rdx \n\t"
169
-
170
- "addq %%r8, 72(%0) \n\t"
171
- "adcq %%rax, %%r9 \n\t" " movq %%r9, 80(%0) \n\t" " movq $0, %%rax \n\t"
172
- "adcq %%r11, %%rbx \n\t"
173
- "adcq %%r13, %%rcx \n\t"
174
- "adcq %%rdx, %%rax \n\t"
175
-
176
- "movq 48(%1), %%rdx # A[2] \n\t"
177
- "mulx 32(%2), %%r8, %%r9 # A[2]*B[0] \n\t"
178
- "mulx 40(%2), %%r10, %%r11 # A[2]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
179
- "mulx 48(%2), %%r12, %%r13 # A[2]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
180
- "mulx 56(%2), %%r14, %%rdx # A[2]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
181
- "adcq $0, %%rdx \n\t"
182
-
183
- "addq %%r8, 80(%0) \n\t"
184
- "adcq %%rbx, %%r9 \n\t" " movq %%r9, 88(%0) \n\t" " movq $0, %%rbx \n\t"
185
- "adcq %%r11, %%rcx \n\t"
186
- "adcq %%r13, %%rax \n\t"
187
- "adcq %%rdx, %%rbx \n\t"
188
-
189
- "movq 56(%1), %%rdx # A[3] \n\t"
190
- "mulx 32(%2), %%r8, %%r9 # A[3]*B[0] \n\t"
191
- "mulx 40(%2), %%r10, %%r11 # A[3]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
192
- "mulx 48(%2), %%r12, %%r13 # A[3]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
193
- "mulx 56(%2), %%r14, %%rdx # A[3]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
194
- "adcq $0, %%rdx \n\t"
195
-
196
- "addq %%r8, 88(%0) \n\t"
197
- "adcq %%rcx, %%r9 \n\t" "movq %%r9, 96(%0) \n\t" " movq $0, %%rcx \n\t"
198
- "adcq %%r11, %%rax \n\t" "movq %%rax, 104(%0) \n\t"
199
- "adcq %%r13, %%rbx \n\t" "movq %%rbx, 112(%0) \n\t"
200
- "adcq %%rdx, %%rcx \n\t" "movq %%rcx, 120(%0) \n\t"
201
- :
202
- : "r" (c), "r" (a), "r" (b)
203
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8",
204
- "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
205
- );
112
+ __asm__ __volatile__(
113
+ "movq (%1), %%rdx; " /* A[0] */
114
+ "mulx (%2), %%r8, %%r12; " /* A[0]*B[0] */ "movq %%r8, (%0) ;"
115
+ "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ "addq %%r10, %%r12 ;"
116
+ "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ "adcq %%r8, %%rax ;"
117
+ "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ "adcq %%r10, %%rbx ;"
118
+ /*******************************************/ "adcq $0, %%rcx ;"
119
+
120
+ "movq 8(%1), %%rdx; " /* A[1] */
121
+ "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ "addq %%r12, %%r8 ;" "movq %%r8, 8(%0) ;"
122
+ "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ "adcq %%r10, %%r9 ;"
123
+ "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ "adcq %%r8, %%r11 ;"
124
+ "mulx 24(%2), %%r10, %%r12; " /* A[1]*B[3] */ "adcq %%r10, %%r13 ;"
125
+ /*******************************************/ "adcq $0, %%r12 ;"
126
+
127
+ "addq %%r9, %%rax ;"
128
+ "adcq %%r11, %%rbx ;"
129
+ "adcq %%r13, %%rcx ;"
130
+ "adcq $0, %%r12 ;"
131
+
132
+ "movq 16(%1), %%rdx; " /* A[2] */
133
+ "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ "addq %%rax, %%r8 ;" "movq %%r8, 16(%0) ;"
134
+ "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ "adcq %%r10, %%r9 ;"
135
+ "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ "adcq %%r8, %%r11 ;"
136
+ "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ "adcq %%r10, %%r13 ;"
137
+ /*******************************************/ "adcq $0, %%rax ;"
138
+
139
+ "addq %%r9, %%rbx ;"
140
+ "adcq %%r11, %%rcx ;"
141
+ "adcq %%r13, %%r12 ;"
142
+ "adcq $0, %%rax ;"
143
+
144
+ "movq 24(%1), %%rdx; " /* A[3] */
145
+ "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ "addq %%rbx, %%r8 ;" "movq %%r8, 24(%0) ;"
146
+ "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ "adcq %%r10, %%r9 ;"
147
+ "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ "adcq %%r8, %%r11 ;"
148
+ "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ "adcq %%r10, %%r13 ;"
149
+ /*******************************************/ "adcq $0, %%rbx ;"
150
+
151
+ "addq %%r9, %%rcx ;" "movq %%rcx, 32(%0) ;"
152
+ "adcq %%r11, %%r12 ;" "movq %%r12, 40(%0) ;"
153
+ "adcq %%r13, %%rax ;" "movq %%rax, 48(%0) ;"
154
+ "adcq $0, %%rbx ;" "movq %%rbx, 56(%0) ;"
155
+
156
+ "movq 32(%1), %%rdx; " /* C[0] */
157
+ "mulx 32(%2), %%r8, %%r12; " /* C[0]*D[0] */ "movq %%r8, 64(%0) ;"
158
+ "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */ "addq %%r10, %%r12 ;"
159
+ "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */ "adcq %%r8, %%rax ;"
160
+ "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */ "adcq %%r10, %%rbx ;"
161
+ /*******************************************/ "adcq $0, %%rcx ;"
162
+
163
+ "movq 40(%1), %%rdx; " /* C[1] */
164
+ "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */ "addq %%r12, %%r8 ;" "movq %%r8, 72(%0) ;"
165
+ "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */ "adcq %%r10, %%r9 ;"
166
+ "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */ "adcq %%r8, %%r11 ;"
167
+ "mulx 56(%2), %%r10, %%r12; " /* C[1]*D[3] */ "adcq %%r10, %%r13 ;"
168
+ /*******************************************/ "adcq $0, %%r12 ;"
169
+
170
+ "addq %%r9, %%rax ;"
171
+ "adcq %%r11, %%rbx ;"
172
+ "adcq %%r13, %%rcx ;"
173
+ "adcq $0, %%r12 ;"
174
+
175
+ "movq 48(%1), %%rdx; " /* C[2] */
176
+ "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */ "addq %%rax, %%r8 ;" "movq %%r8, 80(%0) ;"
177
+ "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */ "adcq %%r10, %%r9 ;"
178
+ "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */ "adcq %%r8, %%r11 ;"
179
+ "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */ "adcq %%r10, %%r13 ;"
180
+ /*******************************************/ "adcq $0, %%rax ;"
181
+
182
+ "addq %%r9, %%rbx ;"
183
+ "adcq %%r11, %%rcx ;"
184
+ "adcq %%r13, %%r12 ;"
185
+ "adcq $0, %%rax ;"
186
+
187
+ "movq 56(%1), %%rdx; " /* C[3] */
188
+ "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */ "addq %%rbx, %%r8 ;" "movq %%r8, 88(%0) ;"
189
+ "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */ "adcq %%r10, %%r9 ;"
190
+ "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */ "adcq %%r8, %%r11 ;"
191
+ "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */ "adcq %%r10, %%r13 ;"
192
+ /*******************************************/ "adcq $0, %%rbx ;"
193
+
194
+ "addq %%r9, %%rcx ;" "movq %%rcx, 96(%0) ;"
195
+ "adcq %%r11, %%r12 ;" "movq %%r12, 104(%0) ;"
196
+ "adcq %%r13, %%rax ;" "movq %%rax, 112(%0) ;"
197
+ "adcq $0, %%rbx ;" "movq %%rbx, 120(%0) ;"
198
+ :
199
+ : "r" (c), "r" (a), "r" (b)
200
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx",
201
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13"
202
+ );
206
203
  #endif
207
204
  #else /* Without BMI2 */
208
- /**
209
- * TODO: Multiplications using MULQ instruction.
210
- **/
205
+ /**
206
+ * TODO: Multiplications using MULQ instruction.
207
+ **/
211
208
  #endif
212
209
  }
213
210
 
@@ -216,140 +213,186 @@ void mul2_256x256_integer_x64(uint64_t *const c, uint64_t *const a, uint64_t *co
216
213
  * @param c
217
214
  * @param a
218
215
  */
219
- void sqr2_256x256_integer_x64(uint64_t *const c, uint64_t *const a)
220
- {
216
+ void sqr2_256x256_integer_x64(uint64_t *const c, uint64_t *const a) {
221
217
  #ifdef __BMI2__
222
- __asm__ __volatile__(
223
- "movq (%1), %%rdx # A[0] \n\t"
224
- "mulx %%rdx, %%r8, %%r9 # A[0]^2 \n\t"
225
- "movq 8(%1), %%rdx # A[1] \n\t"
226
- "mulx %%rdx, %%r10, %%r11 # A[1]^2 \n\t"
227
- "movq %%r8, (%0) \n\t"
228
- "movq %%r9, 8(%0) \n\t"
229
- "movq %%r10, 16(%0) \n\t"
230
- "movq %%r11, 24(%0) \n\t"
231
-
232
- "movq 16(%1), %%rdx # A[2] \n\t"
233
- "mulx %%rdx, %%r8, %%r9 # A[2]^2 \n\t"
234
- "movq 24(%1), %%rdx # A[3] \n\t"
235
- "mulx %%rdx, %%r10, %%r11 # A[3]^2 \n\t"
236
- "movq %%r8, 32(%0) \n\t"
237
- "movq %%r9, 40(%0) \n\t"
238
- "movq %%r10, 48(%0) \n\t"
239
- "movq %%r11, 56(%0) \n\t"
240
-
241
- "movq 8(%1), %%rdx # A[1] \n\t"
242
- "mulx (%1), %%r8, %%r9 # A[0]*A[1] \n\t"
243
- "mulx 16(%1), %%r10, %%r11 # A[2]*A[1] \n\t"
244
- "mulx 24(%1), %%rcx, %%r14 # A[3]*A[1] \n\t"
245
-
246
- "movq 16(%1), %%rdx # A[2] \n\t"
247
- "mulx 24(%1), %%r12, %%r13 # A[3]*A[2] \n\t"
248
- "mulx (%1), %%rax, %%rdx # A[0]*A[2] \n\t"
249
-
250
- "addq %%rax, %%r9 \n\t"
251
- "adcq %%rdx, %%r10 \n\t"
252
- "adcq %%rcx, %%r11 \n\t"
253
- "adcq %%r14, %%r12 \n\t"
254
- "adcq $0, %%r13 \n\t"
255
- "movq $0, %%r14 \n\t"
256
- "adcq $0, %%r14 \n\t"
257
-
258
- "movq (%1), %%rdx # A[0] \n\t"
259
- "mulx 24(%1), %%rax, %%rdx # A[0]*A[3] \n\t"
260
-
261
- "addq %%rax, %%r10 \n\t"
262
- "adcq %%rdx, %%r11 \n\t"
263
- "adcq $0, %%r12 \n\t"
264
- "adcq $0, %%r13 \n\t"
265
- "adcq $0, %%r14 \n\t"
266
-
267
- "shldq $1, %%r13, %%r14 \n\t"
268
- "shldq $1, %%r12, %%r13 \n\t"
269
- "shldq $1, %%r11, %%r12 \n\t"
270
- "shldq $1, %%r10, %%r11 \n\t"
271
- "shldq $1, %%r9, %%r10 \n\t"
272
- "shldq $1, %%r8, %%r9 \n\t"
273
- "shlq $1, %%r8 \n\t"
274
-
275
- "addq 8(%0), %%r8 \n\t" "movq %%r8, 8(%0) \n\t"
276
- "adcq 16(%0), %%r9 \n\t" "movq %%r9, 16(%0) \n\t"
277
- "adcq 24(%0), %%r10 \n\t" "movq %%r10, 24(%0) \n\t"
278
- "adcq 32(%0), %%r11 \n\t" "movq %%r11, 32(%0) \n\t"
279
- "adcq 40(%0), %%r12 \n\t" "movq %%r12, 40(%0) \n\t"
280
- "adcq 48(%0), %%r13 \n\t" "movq %%r13, 48(%0) \n\t"
281
- "adcq 56(%0), %%r14 \n\t" "movq %%r14, 56(%0) \n\t"
282
-
283
-
284
- "movq 32(%1), %%rdx # A[0] \n\t"
285
- "mulx %%rdx, %%r8, %%r9 # A[0]^2 \n\t"
286
- "movq 40(%1), %%rdx # A[1] \n\t"
287
- "mulx %%rdx, %%r10, %%r11 # A[1]^2 \n\t"
288
- "movq %%r8, 64(%0) \n\t"
289
- "movq %%r9, 72(%0) \n\t"
290
- "movq %%r10, 80(%0) \n\t"
291
- "movq %%r11, 88(%0) \n\t"
292
-
293
- "movq 48(%1), %%rdx # A[2] \n\t"
294
- "mulx %%rdx, %%r8, %%r9 # A[2]^2 \n\t"
295
- "movq 56(%1), %%rdx # A[3] \n\t"
296
- "mulx %%rdx, %%r10, %%r11 # A[3]^2 \n\t"
297
- "movq %%r8, 96(%0) \n\t"
298
- "movq %%r9, 104(%0) \n\t"
299
- "movq %%r10, 112(%0) \n\t"
300
- "movq %%r11, 120(%0) \n\t"
301
-
302
- "movq 40(%1), %%rdx # A[1] \n\t"
303
- "mulx 32(%1), %%r8, %%r9 # A[0]*A[1] \n\t"
304
- "mulx 48(%1), %%r10, %%r11 # A[2]*A[1] \n\t"
305
- "mulx 56(%1), %%rcx, %%r14 # A[3]*A[1] \n\t"
306
-
307
- "movq 48(%1), %%rdx # A[2] \n\t"
308
- "mulx 56(%1), %%r12, %%r13 # A[3]*A[2] \n\t"
309
- "mulx 32(%1), %%rax, %%rdx # A[0]*A[2] \n\t"
310
-
311
- "addq %%rax, %%r9 \n\t"
312
- "adcq %%rdx, %%r10 \n\t"
313
- "adcq %%rcx, %%r11 \n\t"
314
- "adcq %%r14, %%r12 \n\t"
315
- "adcq $0, %%r13 \n\t"
316
- "movq $0, %%r14 \n\t"
317
- "adcq $0, %%r14 \n\t"
318
-
319
- "movq 32(%1), %%rdx # A[0] \n\t"
320
- "mulx 56(%1), %%rax, %%rdx # A[0]*A[3] \n\t"
321
-
322
- "addq %%rax, %%r10 \n\t"
323
- "adcq %%rdx, %%r11 \n\t"
324
- "adcq $0, %%r12 \n\t"
325
- "adcq $0, %%r13 \n\t"
326
- "adcq $0, %%r14 \n\t"
327
-
328
- "shldq $1, %%r13, %%r14 \n\t"
329
- "shldq $1, %%r12, %%r13 \n\t"
330
- "shldq $1, %%r11, %%r12 \n\t"
331
- "shldq $1, %%r10, %%r11 \n\t"
332
- "shldq $1, %%r9, %%r10 \n\t"
333
- "shldq $1, %%r8, %%r9 \n\t"
334
- "shlq $1, %%r8 \n\t"
335
-
336
- "addq 72(%0), %%r8 \n\t" "movq %%r8, 72(%0) \n\t"
337
- "adcq 80(%0), %%r9 \n\t" "movq %%r9, 80(%0) \n\t"
338
- "adcq 88(%0), %%r10 \n\t" "movq %%r10, 88(%0) \n\t"
339
- "adcq 96(%0), %%r11 \n\t" "movq %%r11, 96(%0) \n\t"
340
- "adcq 104(%0), %%r12 \n\t" "movq %%r12, 104(%0) \n\t"
341
- "adcq 112(%0), %%r13 \n\t" "movq %%r13, 112(%0) \n\t"
342
- "adcq 120(%0), %%r14 \n\t" "movq %%r14, 120(%0) \n\t"
343
- :
344
- : "r" (c), "r" (a)
345
- : "cc", "%rax", "%rcx", "%rdx",
346
- "%r8", "%r9", "%r10", "%r11",
347
- "%r12", "%r13", "%r14"
348
- );
218
+ #ifdef __ADX__
219
+ __asm__ __volatile__(
220
+ "movq (%1), %%rdx ;" /* A[0] */
221
+ "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */ "xorl %%r15d, %%r15d;"
222
+ "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */ "adcx %%r14, %%r9 ;"
223
+ "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */ "adcx %%rax, %%r10 ;"
224
+ "movq 24(%1), %%rdx ;" /* A[3] */
225
+ "mulx 8(%1), %%r11, %%r12 ;" /* A[1]*A[3] */ "adcx %%rcx, %%r11 ;"
226
+ "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */ "adcx %%rax, %%r12 ;"
227
+ "movq 8(%1), %%rdx ;" /* A[1] */ "adcx %%r15, %%r13 ;"
228
+ "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */ "movq $0, %%r14 ;"
229
+ /*******************************************/ "adcx %%r15, %%r14 ;"
230
+
231
+ "xorl %%r15d, %%r15d;"
232
+ "adox %%rax, %%r10 ;" "adcx %%r8, %%r8 ;"
233
+ "adox %%rcx, %%r11 ;" "adcx %%r9, %%r9 ;"
234
+ "adox %%r15, %%r12 ;" "adcx %%r10, %%r10 ;"
235
+ "adox %%r15, %%r13 ;" "adcx %%r11, %%r11 ;"
236
+ "adox %%r15, %%r14 ;" "adcx %%r12, %%r12 ;"
237
+ "adcx %%r13, %%r13 ;"
238
+ "adcx %%r14, %%r14 ;"
239
+
240
+ "movq (%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
241
+ /********************/ "movq %%rax, 0(%0) ;"
242
+ "addq %%rcx, %%r8 ;" "movq %%r8, 8(%0) ;"
243
+ "movq 8(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
244
+ "adcq %%rax, %%r9 ;" "movq %%r9, 16(%0) ;"
245
+ "adcq %%rcx, %%r10 ;" "movq %%r10, 24(%0) ;"
246
+ "movq 16(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
247
+ "adcq %%rax, %%r11 ;" "movq %%r11, 32(%0) ;"
248
+ "adcq %%rcx, %%r12 ;" "movq %%r12, 40(%0) ;"
249
+ "movq 24(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
250
+ "adcq %%rax, %%r13 ;" "movq %%r13, 48(%0) ;"
251
+ "adcq %%rcx, %%r14 ;" "movq %%r14, 56(%0) ;"
252
+
253
+
254
+ "movq 32(%1), %%rdx ;" /* B[0] */
255
+ "mulx 40(%1), %%r8, %%r14 ;" /* B[1]*B[0] */ "xorl %%r15d, %%r15d;"
256
+ "mulx 48(%1), %%r9, %%r10 ;" /* B[2]*B[0] */ "adcx %%r14, %%r9 ;"
257
+ "mulx 56(%1), %%rax, %%rcx ;" /* B[3]*B[0] */ "adcx %%rax, %%r10 ;"
258
+ "movq 56(%1), %%rdx ;" /* B[3] */
259
+ "mulx 40(%1), %%r11, %%r12 ;" /* B[1]*B[3] */ "adcx %%rcx, %%r11 ;"
260
+ "mulx 48(%1), %%rax, %%r13 ;" /* B[2]*B[3] */ "adcx %%rax, %%r12 ;"
261
+ "movq 40(%1), %%rdx ;" /* B[1] */ "adcx %%r15, %%r13 ;"
262
+ "mulx 48(%1), %%rax, %%rcx ;" /* B[2]*B[1] */ "movq $0, %%r14 ;"
263
+ /*******************************************/ "adcx %%r15, %%r14 ;"
264
+
265
+ "xorl %%r15d, %%r15d;"
266
+ "adox %%rax, %%r10 ;" "adcx %%r8, %%r8 ;"
267
+ "adox %%rcx, %%r11 ;" "adcx %%r9, %%r9 ;"
268
+ "adox %%r15, %%r12 ;" "adcx %%r10, %%r10 ;"
269
+ "adox %%r15, %%r13 ;" "adcx %%r11, %%r11 ;"
270
+ "adox %%r15, %%r14 ;" "adcx %%r12, %%r12 ;"
271
+ "adcx %%r13, %%r13 ;"
272
+ "adcx %%r14, %%r14 ;"
273
+
274
+ "movq 32(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* B[0]^2 */
275
+ /********************/ "movq %%rax, 64(%0) ;"
276
+ "addq %%rcx, %%r8 ;" "movq %%r8, 72(%0) ;"
277
+ "movq 40(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* B[1]^2 */
278
+ "adcq %%rax, %%r9 ;" "movq %%r9, 80(%0) ;"
279
+ "adcq %%rcx, %%r10 ;" "movq %%r10, 88(%0) ;"
280
+ "movq 48(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* B[2]^2 */
281
+ "adcq %%rax, %%r11 ;" "movq %%r11, 96(%0) ;"
282
+ "adcq %%rcx, %%r12 ;" "movq %%r12, 104(%0) ;"
283
+ "movq 56(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* B[3]^2 */
284
+ "adcq %%rax, %%r13 ;" "movq %%r13, 112(%0) ;"
285
+ "adcq %%rcx, %%r14 ;" "movq %%r14, 120(%0) ;"
286
+ :
287
+ : "r" (c), "r" (a)
288
+ : "memory", "cc", "%rax", "%rcx", "%rdx",
289
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"
290
+ );
291
+ #else /* Without ADX */
292
+ __asm__ __volatile__(
293
+ "movq 8(%1), %%rdx ;" /* A[1] */
294
+ "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */
295
+ "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
296
+ "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
297
+
298
+ "movq 16(%1), %%rdx ;" /* A[2] */
299
+ "mulx 24(%1), %%r12, %%r13 ;" /* A[3]*A[2] */
300
+ "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
301
+
302
+ "addq %%rax, %%r9 ;"
303
+ "adcq %%rdx, %%r10 ;"
304
+ "adcq %%rcx, %%r11 ;"
305
+ "adcq %%r14, %%r12 ;"
306
+ "adcq $0, %%r13 ;"
307
+ "movq $0, %%r14 ;"
308
+ "adcq $0, %%r14 ;"
309
+
310
+ "movq (%1), %%rdx ;" /* A[0] */
311
+ "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
312
+
313
+ "addq %%rax, %%r10 ;"
314
+ "adcq %%rcx, %%r11 ;"
315
+ "adcq $0, %%r12 ;"
316
+ "adcq $0, %%r13 ;"
317
+ "adcq $0, %%r14 ;"
318
+
319
+ "shldq $1, %%r13, %%r14 ;"
320
+ "shldq $1, %%r12, %%r13 ;"
321
+ "shldq $1, %%r11, %%r12 ;"
322
+ "shldq $1, %%r10, %%r11 ;"
323
+ "shldq $1, %%r9, %%r10 ;"
324
+ "shldq $1, %%r8, %%r9 ;"
325
+ "shlq $1, %%r8 ;"
326
+
327
+ /********************/ "mulx %%rdx, %%rax, %%rcx ; " /* A[0]^2 */
328
+ /********************/ "movq %%rax, 0(%0) ;"
329
+ "addq %%rcx, %%r8 ;" "movq %%r8, 8(%0) ;"
330
+ "movq 8(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ; " /* A[1]^2 */
331
+ "adcq %%rax, %%r9 ;" "movq %%r9, 16(%0) ;"
332
+ "adcq %%rcx, %%r10 ;" "movq %%r10, 24(%0) ;"
333
+ "movq 16(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ; " /* A[2]^2 */
334
+ "adcq %%rax, %%r11 ;" "movq %%r11, 32(%0) ;"
335
+ "adcq %%rcx, %%r12 ;" "movq %%r12, 40(%0) ;"
336
+ "movq 24(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ; " /* A[3]^2 */
337
+ "adcq %%rax, %%r13 ;" "movq %%r13, 48(%0) ;"
338
+ "adcq %%rcx, %%r14 ;" "movq %%r14, 56(%0) ;"
339
+
340
+ "movq 40(%1), %%rdx ;" /* B[1] */
341
+ "mulx 32(%1), %%r8, %%r9 ;" /* B[0]*B[1] */
342
+ "mulx 48(%1), %%r10, %%r11 ;" /* B[2]*B[1] */
343
+ "mulx 56(%1), %%rcx, %%r14 ;" /* B[3]*B[1] */
344
+
345
+ "movq 48(%1), %%rdx ;" /* B[2] */
346
+ "mulx 56(%1), %%r12, %%r13 ;" /* B[3]*B[2] */
347
+ "mulx 32(%1), %%rax, %%rdx ;" /* B[0]*B[2] */
348
+
349
+ "addq %%rax, %%r9 ;"
350
+ "adcq %%rdx, %%r10 ;"
351
+ "adcq %%rcx, %%r11 ;"
352
+ "adcq %%r14, %%r12 ;"
353
+ "adcq $0, %%r13 ;"
354
+ "movq $0, %%r14 ;"
355
+ "adcq $0, %%r14 ;"
356
+
357
+ "movq 32(%1), %%rdx ;" /* B[0] */
358
+ "mulx 56(%1), %%rax, %%rcx ;" /* B[0]*B[3] */
359
+
360
+ "addq %%rax, %%r10 ;"
361
+ "adcq %%rcx, %%r11 ;"
362
+ "adcq $0, %%r12 ;"
363
+ "adcq $0, %%r13 ;"
364
+ "adcq $0, %%r14 ;"
365
+
366
+ "shldq $1, %%r13, %%r14 ;"
367
+ "shldq $1, %%r12, %%r13 ;"
368
+ "shldq $1, %%r11, %%r12 ;"
369
+ "shldq $1, %%r10, %%r11 ;"
370
+ "shldq $1, %%r9, %%r10 ;"
371
+ "shldq $1, %%r8, %%r9 ;"
372
+ "shlq $1, %%r8 ;"
373
+
374
+ /********************/ "mulx %%rdx, %%rax, %%rcx ; " /* B[0]^2 */
375
+ /********************/ "movq %%rax, 64(%0) ;"
376
+ "addq %%rcx, %%r8 ;" "movq %%r8, 72(%0) ;"
377
+ "movq 40(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ; " /* B[1]^2 */
378
+ "adcq %%rax, %%r9 ;" "movq %%r9, 80(%0) ;"
379
+ "adcq %%rcx, %%r10 ;" "movq %%r10, 88(%0) ;"
380
+ "movq 48(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ; " /* B[2]^2 */
381
+ "adcq %%rax, %%r11 ;" "movq %%r11, 96(%0) ;"
382
+ "adcq %%rcx, %%r12 ;" "movq %%r12, 104(%0) ;"
383
+ "movq 56(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ; " /* B[3]^2 */
384
+ "adcq %%rax, %%r13 ;" "movq %%r13, 112(%0) ;"
385
+ "adcq %%rcx, %%r14 ;" "movq %%r14, 120(%0) ;"
386
+ :
387
+ : "r" (c), "r" (a)
388
+ : "memory", "cc", "%rax", "%rcx", "%rdx",
389
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
390
+ );
391
+ #endif
349
392
  #else /* Without BMI2 */
350
- /**
351
- * TODO: Multiplications using MULQ instruction.
352
- **/
393
+ /**
394
+ * TODO: Multiplications using MULQ instruction.
395
+ **/
353
396
  #endif
354
397
  }
355
398
 
@@ -358,467 +401,543 @@ void sqr2_256x256_integer_x64(uint64_t *const c, uint64_t *const a)
358
401
  * @param c
359
402
  * @param a
360
403
  */
361
- void red_EltFp25519_2w_x64(uint64_t *const c, uint64_t *const a)
362
- {
404
+ void red_EltFp25519_2w_x64(uint64_t *const c, uint64_t *const a) {
363
405
  #ifdef __BMI2__
364
406
  #ifdef __ADX__
365
- __asm__ __volatile__(
366
- " movl $38, %%edx # 2*c = 38 = 2^256 \n\t"
367
- " mulx 32(%1), %%r8, %%r10 # c*C[4] \n\t" " xorl %%ebx, %%ebx \n\t" " adox (%1), %%r8 \n\t"
368
- " mulx 40(%1), %%r9, %%r11 # c*C[5] \n\t" " adcx %%r10, %%r9 \n\t" " adox 8(%1), %%r9 \n\t"
369
- " mulx 48(%1), %%r10, %%rax # c*C[6] \n\t" " adcx %%r11, %%r10 \n\t" " adox 16(%1), %%r10 \n\t" " movq %%r10, 16(%0) \n\t"
370
- " mulx 56(%1), %%r11, %%rcx # c*C[7] \n\t" " adcx %%rax, %%r11 \n\t" " adox 24(%1), %%r11 \n\t" " movq %%r11, 24(%0) \n\t"
371
- " adcx %%rbx, %%rcx \n\t" " adox %%rbx, %%rcx \n\t"
372
- " xorl %%ebx, %%ebx \n\t"
373
- " mulx %%rcx, %%rax, %%rcx \n\t" " adcx %%rax, %%r8 \n\t" " movq %%r8, (%0) \n\t"
374
- " adcx %%rcx, %%r9 \n\t" " movq %%r9, 8(%0) \n\t"
375
-
376
- " mulx 96(%1), %%r8, %%r10 # c*C[4] \n\t" " xorl %%ebx, %%ebx \n\t" " adox 64(%1), %%r8 \n\t"
377
- " mulx 104(%1), %%r9, %%r11 # c*C[5] \n\t" " adcx %%r10, %%r9 \n\t" " adox 72(%1), %%r9 \n\t"
378
- " mulx 112(%1), %%r10, %%rax # c*C[6] \n\t" " adcx %%r11, %%r10 \n\t" " adox 80(%1), %%r10 \n\t" " movq %%r10, 48(%0) \n\t"
379
- " mulx 120(%1), %%r11, %%rcx # c*C[7] \n\t" " adcx %%rax, %%r11 \n\t" " adox 88(%1), %%r11 \n\t" " movq %%r11, 56(%0) \n\t"
380
- " adcx %%rbx, %%rcx \n\t" " adox %%rbx, %%rcx \n\t"
381
- " xorl %%ebx, %%ebx \n\t"
382
- " mulx %%rcx, %%rax, %%rcx \n\t" " adcx %%rax, %%r8 \n\t" " movq %%r8, 32(%0) \n\t"
383
- " adcx %%rcx, %%r9 \n\t" " movq %%r9, 40(%0) \n\t"
384
- :
385
- : "r" (c), "r" (a)
386
- : "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
387
- );
407
+ __asm__ __volatile__(
408
+ "movl $38, %%edx; " /* 2*c = 38 = 2^256 */
409
+ "mulx 32(%1), %%r8, %%r10; " /* c*C[4] */ "xorl %%ebx, %%ebx ;" "adox (%1), %%r8 ;"
410
+ "mulx 40(%1), %%r9, %%r11; " /* c*C[5] */ "adcx %%r10, %%r9 ;" "adox 8(%1), %%r9 ;"
411
+ "mulx 48(%1), %%r10, %%rax; " /* c*C[6] */ "adcx %%r11, %%r10 ;" "adox 16(%1), %%r10 ;"
412
+ "mulx 56(%1), %%r11, %%rcx; " /* c*C[7] */ "adcx %%rax, %%r11 ;" "adox 24(%1), %%r11 ;"
413
+ /****************************************/ "adcx %%rbx, %%rcx ;" "adox %%rbx, %%rcx ;"
414
+ "clc ;"
415
+ "mulx %%rcx, %%rax, %%rcx ; " /* c*C[4] */
416
+ "adcx %%rax, %%r8 ;"
417
+ "adcx %%rcx, %%r9 ;" "movq %%r9, 8(%0) ;"
418
+ "adcx %%rbx, %%r10 ;" "movq %%r10, 16(%0) ;"
419
+ "adcx %%rbx, %%r11 ;" "movq %%r11, 24(%0) ;"
420
+ "mov $0, %%ecx ;"
421
+ "cmovc %%edx, %%ecx ;"
422
+ "addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
423
+
424
+ "mulx 96(%1), %%r8, %%r10; " /* c*C[4] */ "xorl %%ebx, %%ebx ;" "adox 64(%1), %%r8 ;"
425
+ "mulx 104(%1), %%r9, %%r11; " /* c*C[5] */ "adcx %%r10, %%r9 ;" "adox 72(%1), %%r9 ;"
426
+ "mulx 112(%1), %%r10, %%rax; " /* c*C[6] */ "adcx %%r11, %%r10 ;" "adox 80(%1), %%r10 ;"
427
+ "mulx 120(%1), %%r11, %%rcx; " /* c*C[7] */ "adcx %%rax, %%r11 ;" "adox 88(%1), %%r11 ;"
428
+ /*****************************************/ "adcx %%rbx, %%rcx ;" "adox %%rbx, %%rcx ;"
429
+ "clc ;"
430
+ "mulx %%rcx, %%rax, %%rcx ; " /* c*C[4] */
431
+ "adcx %%rax, %%r8 ;"
432
+ "adcx %%rcx, %%r9 ;" "movq %%r9, 40(%0) ;"
433
+ "adcx %%rbx, %%r10 ;" "movq %%r10, 48(%0) ;"
434
+ "adcx %%rbx, %%r11 ;" "movq %%r11, 56(%0) ;"
435
+ "mov $0, %%ecx ;"
436
+ "cmovc %%edx, %%ecx ;"
437
+ "addq %%rcx, %%r8 ;" "movq %%r8, 32(%0) ;"
438
+ :
439
+ : "r" (c), "r" (a)
440
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
441
+ );
388
442
  #else
389
- __asm__ __volatile__(
390
- "movl $38, %%edx # 2*c = 38 = 2^256 \n\t"
391
- "mulx 32(%1), %%r8, %%r9 # c*C[4] \n\t"
392
- "mulx 40(%1), %%r10, %%r11 # c*C[5] \n\t" "addq %%r9, %%r10 \n\t"
393
- "mulx 48(%1), %%r12, %%r13 # c*C[6] \n\t" "adcq %%r11, %%r12 \n\t"
394
- "mulx 56(%1), %%rax, %%rcx # c*C[7] \n\t" "adcq %%r13, %%rax \n\t"
395
- "adcq $0, %%rcx \n\t"
396
-
397
- "addq (%1), %%r8 \n\t"
398
- "adcq 8(%1), %%r10 \n\t"
399
- "adcq 16(%1), %%r12 \n\t" "movq %%r12, 16(%0) \n\t"
400
- "adcq 24(%1), %%rax \n\t" "movq %%rax, 24(%0) \n\t"
401
- "adcq $0, %%rcx \n\t"
402
-
403
- "mulx %%rcx, %%rax, %%rcx \n\t"
404
- "addq %%rax, %%r8 \n\t" "movq %%r8, (%0) \n\t"
405
- "adcq %%rcx, %%r10 \n\t" "movq %%r10, 8(%0) \n\t"
406
-
407
- "mulx 96(%1), %%r8, %%r9 # c*C[4] \n\t"
408
- "mulx 104(%1), %%r10, %%r11 # c*C[5] \n\t" "addq %%r9, %%r10 \n\t"
409
- "mulx 112(%1), %%r12, %%r13 # c*C[6] \n\t" "adcq %%r11, %%r12 \n\t"
410
- "mulx 120(%1), %%rax, %%rcx # c*C[7] \n\t" "adcq %%r13, %%rax \n\t"
411
- "adcq $0, %%rcx \n\t"
412
-
413
- "addq 64(%1), %%r8 \n\t"
414
- "adcq 72(%1), %%r10 \n\t"
415
- "adcq 80(%1), %%r12 \n\t" "movq %%r12, 48(%0) \n\t"
416
- "adcq 88(%1), %%rax \n\t" "movq %%rax, 56(%0) \n\t"
417
- "adcq $0, %%rcx \n\t"
418
-
419
- "mulx %%rcx, %%rax, %%rcx \n\t"
420
- "addq %%rax, %%r8 \n\t" " movq %%r8, 32(%0) \n\t"
421
- "adcq %%rcx, %%r10 \n\t" " movq %%r10, 40(%0) \n\t"
422
-
423
- :
424
- : "r" (c), "r" (a)
425
- : "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13"
426
- );
443
+ __asm__ __volatile__(
444
+ "movl $38, %%edx ; " /* 2*c = 38 = 2^256 */
445
+ "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
446
+ "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ "addq %%r10, %%r9 ;"
447
+ "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ "adcq %%r11, %%r10 ;"
448
+ "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ "adcq %%rax, %%r11 ;"
449
+ /****************************************/ "adcq $0, %%rcx ;"
450
+ "addq (%1), %%r8 ;"
451
+ "adcq 8(%1), %%r9 ;"
452
+ "adcq 16(%1), %%r10 ;"
453
+ "adcq 24(%1), %%r11 ;"
454
+ "adcq $0, %%rcx ;"
455
+ "mulx %%rcx, %%rax, %%rcx ;" /* c*C[4] */
456
+ "addq %%rax, %%r8 ;"
457
+ "adcq %%rcx, %%r9 ;" "movq %%r9, 8(%0) ;"
458
+ "adcq $0, %%r10 ;" "movq %%r10, 16(%0) ;"
459
+ "adcq $0, %%r11 ;" "movq %%r11, 24(%0) ;"
460
+ "mov $0, %%ecx ;"
461
+ "cmovc %%edx, %%ecx ;"
462
+ "addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
463
+
464
+ "mulx 96(%1), %%r8, %%r10 ;" /* c*C[4] */
465
+ "mulx 104(%1), %%r9, %%r11 ;" /* c*C[5] */ "addq %%r10, %%r9 ;"
466
+ "mulx 112(%1), %%r10, %%rax ;" /* c*C[6] */ "adcq %%r11, %%r10 ;"
467
+ "mulx 120(%1), %%r11, %%rcx ;" /* c*C[7] */ "adcq %%rax, %%r11 ;"
468
+ /*****************************************/ "adcq $0, %%rcx ;"
469
+ "addq 64(%1), %%r8 ;"
470
+ "adcq 72(%1), %%r9 ;"
471
+ "adcq 80(%1), %%r10 ;"
472
+ "adcq 88(%1), %%r11 ;"
473
+ "adcq $0, %%rcx ;"
474
+ "mulx %%rcx, %%rax, %%rcx ;" /* c*C[4] */
475
+ "addq %%rax, %%r8 ;"
476
+ "adcq %%rcx, %%r9 ;" "movq %%r9, 40(%0) ;"
477
+ "adcq $0, %%r10 ;" "movq %%r10, 48(%0) ;"
478
+ "adcq $0, %%r11 ;" "movq %%r11, 56(%0) ;"
479
+ "mov $0, %%ecx ;"
480
+ "cmovc %%edx, %%ecx ;"
481
+ "addq %%rcx, %%r8 ;" "movq %%r8, 32(%0) ;"
482
+ :
483
+ : "r" (c), "r" (a)
484
+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
485
+ );
427
486
  #endif
428
487
  #else /* Without BMI2 */
429
- /* [TODO] */
488
+ /* [TODO] */
430
489
  #endif
431
490
  }
432
491
 
433
- void mul_256x256_integer_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b)
434
- {
492
+ void mul_256x256_integer_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b) {
435
493
  #ifdef __BMI2__
436
494
  #ifdef __ADX__
437
- __asm__ __volatile__(
438
- " movq (%1), %%rdx # A[0] \n\t"
439
- " mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t" " xorl %%r10d, %%r10d \n\t" " movq %%r8, (%0) \n\t"
440
- " mulx 8(%2), %%r10, %%r11 # A[0]*B[1] \n\t" " adox %%r9, %%r10 \n\t" " movq %%r10, 8(%0) \n\t"
441
- " mulx 16(%2), %%r12, %%r13 # A[0]*B[2] \n\t" " adox %%r11, %%r12 \n\t"
442
- " mulx 24(%2), %%r14, %%rdx # A[0]*B[3] \n\t" " adox %%r13, %%r14 \n\t" " movq $0, %%rax \n\t"
443
- " adox %%rdx, %%rax \n\t"
444
-
445
- " movq 8(%1), %%rdx # A[1] \n\t"
446
- " mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t" " xorl %%r10d, %%r10d \n\t" " adcx 8(%0), %%r8 \n\t" " movq %%r8, 8(%0) \n\t"
447
- " mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t" " adox %%r9, %%r10 \n\t" " adcx %%r12, %%r10 \n\t" " movq %%r10, 16(%0) \n\t"
448
- " mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t" " adox %%r11, %%r12 \n\t" " adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
449
- " mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t" " adox %%r13, %%r14 \n\t" " adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
450
- " adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
451
-
452
- " movq 16(%1), %%rdx # A[2] \n\t"
453
- " mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t" " xorl %%r10d, %%r10d \n\t" " adcx 16(%0), %%r8 \n\t" " movq %%r8, 16(%0) \n\t"
454
- " mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t" " adox %%r9, %%r10 \n\t" " adcx %%r12, %%r10 \n\t" " movq %%r10, 24(%0) \n\t"
455
- " mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t" " adox %%r11, %%r12 \n\t" " adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
456
- " mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t" " adox %%r13, %%r14 \n\t" " adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
457
- " adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
458
-
459
- " movq 24(%1), %%rdx # A[3] \n\t"
460
- " mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t" " xorl %%r10d, %%r10d \n\t" " adcx 24(%0), %%r8 \n\t" " movq %%r8, 24(%0) \n\t"
461
- " mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t" " adox %%r9, %%r10 \n\t" " adcx %%r12, %%r10 \n\t" " movq %%r10, 32(%0) \n\t"
462
- " mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t" " adox %%r11, %%r12 \n\t" " adcx %%r14, %%r12 \n\t" " movq %%r12, 40(%0) \n\t" " movq $0, %%r8 \n\t"
463
- " mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t" " adox %%r13, %%r14 \n\t" " adcx %%rax, %%r14 \n\t" " movq %%r14, 48(%0) \n\t" " movq $0, %%rax \n\t"
464
- " adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t" " movq %%rax, 56(%0) \n\t"
465
- :
466
- : "r" (c), "r" (a), "r" (b)
467
- : "memory", "cc", "%rax", "%rdx",
468
- "%r8", "%r9", "%r10", "%r11",
469
- "%r12", "%r13", "%r14"
470
- );
495
+ __asm__ __volatile__(
496
+ "movq (%1), %%rdx; " /* A[0] */
497
+ "mulx (%2), %%r8, %%r9; " /* A[0]*B[0] */ "xorl %%r10d, %%r10d ;" "movq %%r8, (%0) ;"
498
+ "mulx 8(%2), %%r10, %%r11; " /* A[0]*B[1] */ "adox %%r9, %%r10 ;" "movq %%r10, 8(%0) ;"
499
+ "mulx 16(%2), %%r12, %%r13; " /* A[0]*B[2] */ "adox %%r11, %%r12 ;"
500
+ "mulx 24(%2), %%r14, %%rdx; " /* A[0]*B[3] */ "adox %%r13, %%r14 ;" "movq $0, %%rax ;"
501
+ /*******************************************/ "adox %%rdx, %%rax ;"
502
+
503
+ "movq 8(%1), %%rdx; " /* A[1] */
504
+ "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ "xorl %%r10d, %%r10d ;" "adcx 8(%0), %%r8 ;" "movq %%r8, 8(%0) ;"
505
+ "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ "adox %%r9, %%r10 ;" "adcx %%r12, %%r10 ;" "movq %%r10, 16(%0) ;"
506
+ "mulx 16(%2), %%r12, %%r13; " /* A[1]*B[2] */ "adox %%r11, %%r12 ;" "adcx %%r14, %%r12 ;" "movq $0, %%r8 ;"
507
+ "mulx 24(%2), %%r14, %%rdx; " /* A[1]*B[3] */ "adox %%r13, %%r14 ;" "adcx %%rax, %%r14 ;" "movq $0, %%rax ;"
508
+ /*******************************************/ "adox %%rdx, %%rax ;" "adcx %%r8, %%rax ;"
509
+
510
+ "movq 16(%1), %%rdx; " /* A[2] */
511
+ "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ "xorl %%r10d, %%r10d ;" "adcx 16(%0), %%r8 ;" "movq %%r8, 16(%0) ;"
512
+ "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ "adox %%r9, %%r10 ;" "adcx %%r12, %%r10 ;" "movq %%r10, 24(%0) ;"
513
+ "mulx 16(%2), %%r12, %%r13; " /* A[2]*B[2] */ "adox %%r11, %%r12 ;" "adcx %%r14, %%r12 ;" "movq $0, %%r8 ;"
514
+ "mulx 24(%2), %%r14, %%rdx; " /* A[2]*B[3] */ "adox %%r13, %%r14 ;" "adcx %%rax, %%r14 ;" "movq $0, %%rax ;"
515
+ /*******************************************/ "adox %%rdx, %%rax ;" "adcx %%r8, %%rax ;"
516
+
517
+ "movq 24(%1), %%rdx; " /* A[3] */
518
+ "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ "xorl %%r10d, %%r10d ;" "adcx 24(%0), %%r8 ;" "movq %%r8, 24(%0) ;"
519
+ "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ "adox %%r9, %%r10 ;" "adcx %%r12, %%r10 ;" "movq %%r10, 32(%0) ;"
520
+ "mulx 16(%2), %%r12, %%r13; " /* A[3]*B[2] */ "adox %%r11, %%r12 ;" "adcx %%r14, %%r12 ;" "movq %%r12, 40(%0) ;" "movq $0, %%r8 ;"
521
+ "mulx 24(%2), %%r14, %%rdx; " /* A[3]*B[3] */ "adox %%r13, %%r14 ;" "adcx %%rax, %%r14 ;" "movq %%r14, 48(%0) ;" "movq $0, %%rax ;"
522
+ /*******************************************/ "adox %%rdx, %%rax ;" "adcx %%r8, %%rax ;" "movq %%rax, 56(%0) ;"
523
+ :
524
+ : "r" (c), "r" (a), "r" (b)
525
+ : "memory", "cc", "%rax", "%rdx", "%r8",
526
+ "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
527
+ );
471
528
  #else
472
- __asm__ __volatile__(
473
- " movq (%1), %%rdx # A[0] \n\t"
474
- " mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t" " movq %%r8, (%0) \n\t"
475
- " mulx 8(%2), %%r10, %%rax # A[0]*B[1] \n\t" " addq %%r10, %%r9 \n\t" " movq %%r9, 8(%0) \n\t"
476
- " mulx 16(%2), %%r12, %%rbx # A[0]*B[2] \n\t" " adcq %%r12, %%rax \n\t"
477
- " mulx 24(%2), %%r14, %%rcx # A[0]*B[3] \n\t" " adcq %%r14, %%rbx \n\t"
478
- " adcq $0, %%rcx \n\t"
479
-
480
- " movq 8(%1), %%rdx # A[1] \n\t"
481
- " mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t"
482
- " mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t" " addq %%r10, %%r9 \n\t"
483
- " mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t" " adcq %%r12, %%r11 \n\t"
484
- " mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t" " adcq %%r14, %%r13 \n\t"
485
- " adcq $0, %%rdx \n\t"
486
-
487
- " addq %%r8, 8(%0) \n\t"
488
- " adcq %%rax, %%r9 \n\t" " movq %%r9, 16(%0) \n\t" " movq $0, %%rax \n\t"
489
- " adcq %%r11, %%rbx \n\t"
490
- " adcq %%r13, %%rcx \n\t"
491
- " adcq %%rdx, %%rax \n\t"
492
-
493
- " movq 16(%1), %%rdx # A[2] \n\t"
494
- " mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t"
495
- " mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t" " addq %%r10, %%r9 \n\t"
496
- " mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t" " adcq %%r12, %%r11 \n\t"
497
- " mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t" " adcq %%r14, %%r13 \n\t"
498
- " adcq $0, %%rdx \n\t"
499
-
500
- " addq %%r8, 16(%0) \n\t"
501
- " adcq %%rbx, %%r9 \n\t" " movq %%r9, 24(%0) \n\t" " movq $0, %%rbx \n\t"
502
- " adcq %%r11, %%rcx \n\t"
503
- " adcq %%r13, %%rax \n\t"
504
- " adcq %%rdx, %%rbx \n\t"
505
-
506
- " movq 24(%1), %%rdx # A[3] \n\t"
507
- " mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t"
508
- " mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t" " addq %%r10, %%r9 \n\t"
509
- " mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t" " adcq %%r12, %%r11 \n\t"
510
- " mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t" " adcq %%r14, %%r13 \n\t"
511
- " adcq $0, %%rdx \n\t"
512
-
513
- " addq %%r8, 24(%0) \n\t"
514
- " adcq %%rcx, %%r9 \n\t" " movq %%r9, 32(%0) \n\t" " movq $0, %%rcx \n\t"
515
- " adcq %%r11, %%rax \n\t" " movq %%rax, 40(%0) \n\t"
516
- " adcq %%r13, %%rbx \n\t" " movq %%rbx, 48(%0) \n\t"
517
- " adcq %%rdx, %%rcx \n\t" " movq %%rcx, 56(%0) \n\t"
518
- :
519
- : "r" (c), "r" (a), "r" (b)
520
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8",
521
- "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
522
- );
529
+ __asm__ __volatile__(
530
+ "movq (%1), %%rdx; " /* A[0] */
531
+ "mulx (%2), %%r8, %%r12; " /* A[0]*B[0] */ "movq %%r8, (%0) ;"
532
+ "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ "addq %%r10, %%r12 ;"
533
+ "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ "adcq %%r8, %%rax ;"
534
+ "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ "adcq %%r10, %%rbx ;"
535
+ /*******************************************/ "adcq $0, %%rcx ;"
536
+
537
+ "movq 8(%1), %%rdx; " /* A[1] */
538
+ "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ "addq %%r12, %%r8 ;" "movq %%r8, 8(%0) ;"
539
+ "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ "adcq %%r10, %%r9 ;"
540
+ "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ "adcq %%r8, %%r11 ;"
541
+ "mulx 24(%2), %%r10, %%r12; " /* A[1]*B[3] */ "adcq %%r10, %%r13 ;"
542
+ /*******************************************/ "adcq $0, %%r12 ;"
543
+
544
+ "addq %%r9, %%rax ;"
545
+ "adcq %%r11, %%rbx ;"
546
+ "adcq %%r13, %%rcx ;"
547
+ "adcq $0, %%r12 ;"
548
+
549
+ "movq 16(%1), %%rdx; " /* A[2] */
550
+ "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ "addq %%rax, %%r8 ;" "movq %%r8, 16(%0) ;"
551
+ "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ "adcq %%r10, %%r9 ;"
552
+ "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ "adcq %%r8, %%r11 ;"
553
+ "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ "adcq %%r10, %%r13 ;"
554
+ /*******************************************/ "adcq $0, %%rax ;"
555
+
556
+ "addq %%r9, %%rbx ;"
557
+ "adcq %%r11, %%rcx ;"
558
+ "adcq %%r13, %%r12 ;"
559
+ "adcq $0, %%rax ;"
560
+
561
+ "movq 24(%1), %%rdx; " /* A[3] */
562
+ "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ "addq %%rbx, %%r8 ;" "movq %%r8, 24(%0) ;"
563
+ "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ "adcq %%r10, %%r9 ;"
564
+ "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ "adcq %%r8, %%r11 ;"
565
+ "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ "adcq %%r10, %%r13 ;"
566
+ /*******************************************/ "adcq $0, %%rbx ;"
567
+
568
+ "addq %%r9, %%rcx ;" "movq %%rcx, 32(%0) ;"
569
+ "adcq %%r11, %%r12 ;" "movq %%r12, 40(%0) ;"
570
+ "adcq %%r13, %%rax ;" "movq %%rax, 48(%0) ;"
571
+ "adcq $0, %%rbx ;" "movq %%rbx, 56(%0) ;"
572
+ :
573
+ : "r" (c), "r" (a), "r" (b)
574
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx",
575
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13"
576
+ );
523
577
  #endif
524
578
  #else /* Without BMI2 */
525
- /**
526
- * TODO: Multiplications using MULQ instruction.
527
- **/
579
+ /**
580
+ * TODO: Multiplications using MULQ instruction.
581
+ **/
528
582
  #endif
529
583
  }
530
584
 
531
- void sqr_256x256_integer_x64(uint64_t *const c, uint64_t *const a)
532
- {
585
+ void sqr_256x256_integer_x64(uint64_t *const c, uint64_t *const a) {
533
586
  #ifdef __BMI2__
534
- __asm__ __volatile__(
535
- " movq (%1), %%rdx # A[0] \n\t"
536
- " mulx %%rdx, %%r8, %%r9 # A[0]^2 \n\t"
537
- " movq 8(%1), %%rdx # A[1] \n\t"
538
- " mulx %%rdx, %%r10, %%r11 # A[1]^2 \n\t"
539
- " movq %%r8, (%0) \n\t"
540
- " movq %%r9, 8(%0) \n\t"
541
- " movq %%r10, 16(%0) \n\t"
542
- " movq %%r11, 24(%0) \n\t"
543
-
544
- " movq 16(%1), %%rdx # A[2] \n\t"
545
- " mulx %%rdx, %%r8, %%r9 # A[2]^2 \n\t"
546
- " movq 24(%1), %%rdx # A[3] \n\t"
547
- " mulx %%rdx, %%r10, %%r11 # A[3]^2 \n\t"
548
- " movq %%r8, 32(%0) \n\t"
549
- " movq %%r9, 40(%0) \n\t"
550
- " movq %%r10, 48(%0) \n\t"
551
- " movq %%r11, 56(%0) \n\t"
552
-
553
- " movq 8(%1), %%rdx # A[1] \n\t"
554
- " mulx (%1), %%r8, %%r9 # A[0]*A[1] \n\t"
555
- " mulx 16(%1), %%r10, %%r11 # A[2]*A[1] \n\t"
556
- " mulx 24(%1), %%rcx, %%r14 # A[3]*A[1] \n\t"
557
-
558
- " movq 16(%1), %%rdx # A[2] \n\t"
559
- " mulx 24(%1), %%r12, %%r13 # A[3]*A[2] \n\t"
560
- " mulx (%1), %%rax, %%rdx # A[0]*A[2] \n\t"
561
-
562
- " addq %%rax, %%r9 \n\t"
563
- " adcq %%rdx, %%r10 \n\t"
564
- " adcq %%rcx, %%r11 \n\t"
565
- " adcq %%r14, %%r12 \n\t"
566
- " adcq $0, %%r13 \n\t"
567
- " movq $0, %%r14 \n\t"
568
- " adcq $0, %%r14 \n\t"
569
-
570
- " movq (%1), %%rdx # A[0] \n\t"
571
- " mulx 24(%1), %%rax, %%rdx # A[0]*A[3] \n\t"
572
-
573
- " addq %%rax, %%r10 \n\t"
574
- " adcq %%rdx, %%r11 \n\t"
575
- " adcq $0, %%r12 \n\t"
576
- " adcq $0, %%r13 \n\t"
577
- " adcq $0, %%r14 \n\t"
578
-
579
- " shldq $1, %%r13, %%r14 \n\t"
580
- " shldq $1, %%r12, %%r13 \n\t"
581
- " shldq $1, %%r11, %%r12 \n\t"
582
- " shldq $1, %%r10, %%r11 \n\t"
583
- " shldq $1, %%r9, %%r10 \n\t"
584
- " shldq $1, %%r8, %%r9 \n\t"
585
- " shlq $1, %%r8 \n\t"
586
-
587
- " addq 8(%0), %%r8 \n\t" " movq %%r8, 8(%0) \n\t"
588
- " adcq 16(%0), %%r9 \n\t" " movq %%r9, 16(%0) \n\t"
589
- " adcq 24(%0), %%r10 \n\t" " movq %%r10, 24(%0) \n\t"
590
- " adcq 32(%0), %%r11 \n\t" " movq %%r11, 32(%0) \n\t"
591
- " adcq 40(%0), %%r12 \n\t" " movq %%r12, 40(%0) \n\t"
592
- " adcq 48(%0), %%r13 \n\t" " movq %%r13, 48(%0) \n\t"
593
- " adcq 56(%0), %%r14 \n\t" " movq %%r14, 56(%0) \n\t"
594
- :
595
- : "r" (c), "r" (a)
596
- : "memory", "cc", "%rax", "%rcx", "%rdx",
597
- "%r8", "%r9", "%r10", "%r11",
598
- "%r12", "%r13", "%r14"
599
- );
587
+ #ifdef __ADX__
588
+ __asm__ __volatile__(
589
+ "movq (%1), %%rdx ;" /* A[0] */
590
+ "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */ "xorl %%r15d, %%r15d;"
591
+ "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */ "adcx %%r14, %%r9 ;"
592
+ "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */ "adcx %%rax, %%r10 ;"
593
+ "movq 24(%1), %%rdx ;" /* A[3] */
594
+ "mulx 8(%1), %%r11, %%r12 ;" /* A[1]*A[3] */ "adcx %%rcx, %%r11 ;"
595
+ "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */ "adcx %%rax, %%r12 ;"
596
+ "movq 8(%1), %%rdx ;" /* A[1] */ "adcx %%r15, %%r13 ;"
597
+ "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */ "movq $0, %%r14 ;"
598
+ /*******************************************/ "adcx %%r15, %%r14 ;"
599
+
600
+ "xorl %%r15d, %%r15d;"
601
+ "adox %%rax, %%r10 ;" "adcx %%r8, %%r8 ;"
602
+ "adox %%rcx, %%r11 ;" "adcx %%r9, %%r9 ;"
603
+ "adox %%r15, %%r12 ;" "adcx %%r10, %%r10 ;"
604
+ "adox %%r15, %%r13 ;" "adcx %%r11, %%r11 ;"
605
+ "adox %%r15, %%r14 ;" "adcx %%r12, %%r12 ;"
606
+ "adcx %%r13, %%r13 ;"
607
+ "adcx %%r14, %%r14 ;"
608
+
609
+ "movq (%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
610
+ /********************/ "movq %%rax, 0(%0) ;"
611
+ "addq %%rcx, %%r8 ;" "movq %%r8, 8(%0) ;"
612
+ "movq 8(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
613
+ "adcq %%rax, %%r9 ;" "movq %%r9, 16(%0) ;"
614
+ "adcq %%rcx, %%r10 ;" "movq %%r10, 24(%0) ;"
615
+ "movq 16(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
616
+ "adcq %%rax, %%r11 ;" "movq %%r11, 32(%0) ;"
617
+ "adcq %%rcx, %%r12 ;" "movq %%r12, 40(%0) ;"
618
+ "movq 24(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
619
+ "adcq %%rax, %%r13 ;" "movq %%r13, 48(%0) ;"
620
+ "adcq %%rcx, %%r14 ;" "movq %%r14, 56(%0) ;"
621
+ :
622
+ : "r" (c), "r" (a)
623
+ : "memory", "cc", "%rax", "%rcx", "%rdx",
624
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"
625
+ );
626
+ #else /* Without ADX */
627
+ __asm__ __volatile__(
628
+ "movq 8(%1), %%rdx ;" /* A[1] */
629
+ "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */
630
+ "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
631
+ "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
632
+
633
+ "movq 16(%1), %%rdx ;" /* A[2] */
634
+ "mulx 24(%1), %%r12, %%r13 ;" /* A[3]*A[2] */
635
+ "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
636
+
637
+ "addq %%rax, %%r9 ;"
638
+ "adcq %%rdx, %%r10 ;"
639
+ "adcq %%rcx, %%r11 ;"
640
+ "adcq %%r14, %%r12 ;"
641
+ "adcq $0, %%r13 ;"
642
+ "movq $0, %%r14 ;"
643
+ "adcq $0, %%r14 ;"
644
+
645
+ "movq (%1), %%rdx ;" /* A[0] */
646
+ "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
647
+
648
+ "addq %%rax, %%r10 ;"
649
+ "adcq %%rcx, %%r11 ;"
650
+ "adcq $0, %%r12 ;"
651
+ "adcq $0, %%r13 ;"
652
+ "adcq $0, %%r14 ;"
653
+
654
+ "shldq $1, %%r13, %%r14 ;"
655
+ "shldq $1, %%r12, %%r13 ;"
656
+ "shldq $1, %%r11, %%r12 ;"
657
+ "shldq $1, %%r10, %%r11 ;"
658
+ "shldq $1, %%r9, %%r10 ;"
659
+ "shldq $1, %%r8, %%r9 ;"
660
+ "shlq $1, %%r8 ;"
661
+
662
+ /********************/ "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
663
+ /********************/ "movq %%rax, 0(%0) ;"
664
+ "addq %%rcx, %%r8 ;" "movq %%r8, 8(%0) ;"
665
+ "movq 8(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
666
+ "adcq %%rax, %%r9 ;" "movq %%r9, 16(%0) ;"
667
+ "adcq %%rcx, %%r10 ;" "movq %%r10, 24(%0) ;"
668
+ "movq 16(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
669
+ "adcq %%rax, %%r11 ;" "movq %%r11, 32(%0) ;"
670
+ "adcq %%rcx, %%r12 ;" "movq %%r12, 40(%0) ;"
671
+ "movq 24(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
672
+ "adcq %%rax, %%r13 ;" "movq %%r13, 48(%0) ;"
673
+ "adcq %%rcx, %%r14 ;" "movq %%r14, 56(%0) ;"
674
+ :
675
+ : "r" (c), "r" (a)
676
+ : "memory", "cc", "%rax", "%rcx", "%rdx",
677
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
678
+ );
679
+ #endif
600
680
  #else /* Without BMI2 */
601
- /**
602
- * TODO: Multiplications using MULQ instruction.
603
- **/
681
+ /**
682
+ * TODO: Multiplications using MULQ instruction.
683
+ **/
604
684
  #endif
605
685
  }
606
686
 
607
- void red_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a)
608
- {
687
+ void red_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a) {
609
688
  #ifdef __BMI2__
610
689
  #ifdef __ADX__
611
- __asm__ __volatile__(
612
- " movl $38, %%edx # 2*c = 38 = 2^256 \n\t"
613
- " mulx 32(%1), %%r8, %%r10 # c*C[4] \n\t" " xorl %%ebx, %%ebx \n\t" " adox (%1), %%r8 \n\t"
614
- " mulx 40(%1), %%r9, %%r11 # c*C[5] \n\t" " adcx %%r10, %%r9 \n\t" " adox 8(%1), %%r9 \n\t"
615
- " mulx 48(%1), %%r10, %%rax # c*C[6] \n\t" " adcx %%r11, %%r10 \n\t" " adox 16(%1), %%r10 \n\t" " movq %%r10, 16(%0) \n\t"
616
- " mulx 56(%1), %%r11, %%rcx # c*C[7] \n\t" " adcx %%rax, %%r11 \n\t" " adox 24(%1), %%r11 \n\t" " movq %%r11, 24(%0) \n\t"
617
- " adcx %%rbx, %%rcx \n\t" " adox %%rbx, %%rcx \n\t"
618
- " xorl %%ebx, %%ebx \n\t"
619
- " mulx %%rcx, %%rax, %%rcx \n\t" " adcx %%rax, %%r8 \n\t" " movq %%r8, (%0) \n\t"
620
- " adcx %%rcx, %%r9 \n\t" " movq %%r9, 8(%0) \n\t"
621
- :
622
- : "r" (c), "r" (a)
623
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
624
- );
690
+ __asm__ __volatile__(
691
+ "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */
692
+ "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */ "xorl %%ebx, %%ebx ;" "adox (%1), %%r8 ;"
693
+ "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ "adcx %%r10, %%r9 ;" "adox 8(%1), %%r9 ;"
694
+ "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ "adcx %%r11, %%r10 ;" "adox 16(%1), %%r10 ;"
695
+ "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ "adcx %%rax, %%r11 ;" "adox 24(%1), %%r11 ;"
696
+ /****************************************/ "adcx %%rbx, %%rcx ;" "adox %%rbx, %%rcx ;"
697
+ "clc ;"
698
+ "mulx %%rcx, %%rax, %%rcx ;" /* c*C[4] */
699
+ "adcx %%rax, %%r8 ;"
700
+ "adcx %%rcx, %%r9 ;" "movq %%r9, 8(%0) ;"
701
+ "adcx %%rbx, %%r10 ;" "movq %%r10, 16(%0) ;"
702
+ "adcx %%rbx, %%r11 ;" "movq %%r11, 24(%0) ;"
703
+ "mov $0, %%ecx ;"
704
+ "cmovc %%edx, %%ecx ;"
705
+ "addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
706
+ :
707
+ : "r" (c), "r" (a)
708
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
709
+ );
625
710
  #else
626
- __asm__ __volatile__(
627
- " movl $38, %%edx # 2*c = 38 = 2^256 \n\t"
628
- " mulx 32(%1), %%r8, %%r9 # c*C[4] \n\t"
629
- " mulx 40(%1), %%r10, %%r11 # c*C[5] \n\t" " addq %%r9, %%r10 \n\t"
630
- " mulx 48(%1), %%r12, %%r13 # c*C[6] \n\t" " adcq %%r11, %%r12 \n\t"
631
- " mulx 56(%1), %%rax, %%rcx # c*C[7] \n\t" " adcq %%r13, %%rax \n\t"
632
- " adcq $0, %%rcx \n\t"
633
-
634
- " addq (%1), %%r8 \n\t"
635
- " adcq 8(%1), %%r10 \n\t"
636
- " adcq 16(%1), %%r12 \n\t" " movq %%r12, 16(%0) \n\t"
637
- " adcq 24(%1), %%rax \n\t" " movq %%rax, 24(%0) \n\t"
638
- " adcq $0, %%rcx \n\t"
639
-
640
- " mulx %%rcx, %%rax, %%rcx \n\t"
641
- " addq %%rax, %%r8 \n\t" " movq %%r8, (%0) \n\t"
642
- " adcq %%rcx, %%r10 \n\t" " movq %%r10, 8(%0) \n\t"
643
- :
644
- : "r" (c), "r" (a)
645
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13"
646
- );
711
+ __asm__ __volatile__(
712
+ "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */
713
+ "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
714
+ "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ "addq %%r10, %%r9 ;"
715
+ "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ "adcq %%r11, %%r10 ;"
716
+ "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ "adcq %%rax, %%r11 ;"
717
+ /****************************************/ "adcq $0, %%rcx ;"
718
+ "addq (%1), %%r8 ;"
719
+ "adcq 8(%1), %%r9 ;"
720
+ "adcq 16(%1), %%r10 ;"
721
+ "adcq 24(%1), %%r11 ;"
722
+ "adcq $0, %%rcx ;"
723
+ "mulx %%rcx, %%rax, %%rcx ;" /* c*C[4] */
724
+ "addq %%rax, %%r8 ;"
725
+ "adcq %%rcx, %%r9 ;" "movq %%r9, 8(%0) ;"
726
+ "adcq $0, %%r10 ;" "movq %%r10, 16(%0) ;"
727
+ "adcq $0, %%r11 ;" "movq %%r11, 24(%0) ;"
728
+ "mov $0, %%ecx ;"
729
+ "cmovc %%edx, %%ecx ;"
730
+ "addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
731
+ :
732
+ : "r" (c), "r" (a)
733
+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
734
+ );
647
735
  #endif
648
736
  #else /* Without BMI2 */
649
- /**
650
- * TODO: Multiplications using MULQ instruction.
651
- **/
737
+ /**
738
+ * TODO: Multiplications using MULQ instruction.
739
+ **/
652
740
  #endif
653
741
  }
654
742
 
655
- inline void add_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b)
656
- {
657
- #if __ADX__
658
- __asm__ __volatile__(
659
- "movq (%2), %%rax \n\t"
660
- "movq 8(%2), %%rcx \n\t"
661
- "movq 16(%2), %%r8 \n\t"
662
- "movq 24(%2), %%r9 \n\t"
663
- "clc \n\t"
664
- "adcx (%1), %%rax \n\t"
665
- "adcx 8(%1), %%rcx \n\t"
666
- "adcx 16(%1), %%r8 \n\t"
667
- "adcx 24(%1), %%r9 \n\t"
668
- "movq %%rcx, 8(%0) \n\t"
669
- "movq %%r8 , 16(%0) \n\t"
670
- "movq %%r9 , 24(%0) \n\t"
671
- "setc %%cl \n\t"
672
- "neg %%rcx \n\t"
673
- "andq $38, %%rcx \n\t"
674
- "addq %%rcx, %%rax \n\t"
675
- "movq %%rax, (%0) \n\t"
676
- :
677
- : "r" (c), "r" (a), "r" (b)
678
- : "memory","cc", "%rax", "%rcx", "%r8", "%r9"
679
- );
743
+ inline void add_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b) {
744
+ #ifdef __ADX__
745
+ __asm__ __volatile__(
746
+ "mov $38, %%eax ;"
747
+ "xorl %%ecx, %%ecx ;"
748
+ "movq (%2), %%r8 ;" "adcx (%1), %%r8 ;"
749
+ "movq 8(%2), %%r9 ;" "adcx 8(%1), %%r9 ;"
750
+ "movq 16(%2), %%r10 ;" "adcx 16(%1), %%r10 ;"
751
+ "movq 24(%2), %%r11 ;" "adcx 24(%1), %%r11 ;"
752
+ "cmovc %%eax, %%ecx ;"
753
+ "xorl %%eax, %%eax ;"
754
+ "adcx %%rcx, %%r8 ;"
755
+ "adcx %%rax, %%r9 ;" "movq %%r9, 8(%0) ;"
756
+ "adcx %%rax, %%r10 ;" "movq %%r10, 16(%0) ;"
757
+ "adcx %%rax, %%r11 ;" "movq %%r11, 24(%0) ;"
758
+ "mov $38, %%ecx ;"
759
+ "cmovc %%ecx, %%eax ;"
760
+ "addq %%rax, %%r8 ;" "movq %%r8, (%0) ;"
761
+ :
762
+ : "r" (c), "r" (a), "r" (b)
763
+ : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"
764
+ );
680
765
  #else
681
- __asm__ __volatile__(
682
- "movq (%2), %%rax \n\t"
683
- "movq 8(%2), %%rcx \n\t"
684
- "movq 16(%2), %%r8 \n\t"
685
- "movq 24(%2), %%r9 \n\t"
686
- "add (%1), %%rax \n\t"
687
- "adc 8(%1), %%rcx \n\t"
688
- "adc 16(%1), %%r8 \n\t"
689
- "adc 24(%1), %%r9 \n\t"
690
- "movq %%rcx, 8(%0) \n\t"
691
- "movq %%r8 , 16(%0) \n\t"
692
- "movq %%r9 , 24(%0) \n\t"
693
- "setc %%cl \n\t"
694
- "neg %%rcx \n\t"
695
- "andq $38, %%rcx \n\t"
696
- "addq %%rcx, %%rax \n\t"
697
- "movq %%rax, (%0) \n\t"
698
- :
699
- : "r" (c), "r" (a), "r" (b)
700
- : "memory","cc", "%rax", "%rcx", "%r8", "%r9"
701
- );
766
+ __asm__ __volatile__(
767
+ "mov $38, %%eax ;"
768
+ "movq (%2), %%r8 ;" "addq (%1), %%r8 ;"
769
+ "movq 8(%2), %%r9 ;" "adcq 8(%1), %%r9 ;"
770
+ "movq 16(%2), %%r10 ;" "adcq 16(%1), %%r10 ;"
771
+ "movq 24(%2), %%r11 ;" "adcq 24(%1), %%r11 ;"
772
+ "mov $0, %%ecx ;"
773
+ "cmovc %%eax, %%ecx ;"
774
+ "addq %%rcx, %%r8 ;"
775
+ "adcq $0, %%r9 ;" "movq %%r9, 8(%0) ;"
776
+ "adcq $0, %%r10 ;" "movq %%r10, 16(%0) ;"
777
+ "adcq $0, %%r11 ;" "movq %%r11, 24(%0) ;"
778
+ "mov $0, %%ecx ;"
779
+ "cmovc %%eax, %%ecx ;"
780
+ "addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
781
+ :
782
+ : "r" (c), "r" (a), "r" (b)
783
+ : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"
784
+ );
702
785
  #endif
703
786
  }
704
787
 
705
/**
 * Subtraction over GF(2^255-19): c = a - b (pseudo-reduced).
 *
 * c, a, b: pointers to 4-limb (256-bit) little-endian field elements.
 * Performs a 256-bit subtract with borrow; if the subtraction wraps,
 * folds the borrow back by subtracting 38 (since 2^256 ≡ 38 mod 2^255-19).
 * The fold is done twice so a second borrow from the first fold is also
 * absorbed; the result fits in 256 bits but is not fully reduced
 * (callers use fred_EltFp25519_1w_x64 for canonical form).
 * The cmovc-based fold is branch-free (constant-time with respect to data).
 */
inline void sub_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b) {
    __asm__ __volatile__(
        "mov $38, %%eax ;"                              /* fold constant: 2^256 mod p */
        "movq (%1), %%r8 ;" "subq (%2), %%r8 ;"        /* r11:r10:r9:r8 = a - b */
        "movq 8(%1), %%r9 ;" "sbbq 8(%2), %%r9 ;"
        "movq 16(%1), %%r10 ;" "sbbq 16(%2), %%r10 ;"
        "movq 24(%1), %%r11 ;" "sbbq 24(%2), %%r11 ;"
        "mov $0, %%ecx ;"
        "cmovc %%eax, %%ecx ;"                          /* rcx = 38 iff borrow out */
        "subq %%rcx, %%r8 ;"                            /* first fold of the borrow */
        "sbbq $0, %%r9 ;" "movq %%r9, 8(%0) ;"
        "sbbq $0, %%r10 ;" "movq %%r10, 16(%0) ;"
        "sbbq $0, %%r11 ;" "movq %%r11, 24(%0) ;"
        "mov $0, %%ecx ;"
        "cmovc %%eax, %%ecx ;"                          /* second fold, if it borrowed again */
        "subq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
        :
        : "r" (c), "r" (a), "r" (b)
        : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"
    );
}
730
809
 
731
/**
 * Multiplication by a24 = (A+2)/4 = (486662+2)/4 = 121666
 *
 * c: output 4-limb field element; a: input 4-limb field element.
 * Computes c = 121666 * a over GF(2^255-19), pseudo-reduced to 256 bits.
 * Requires BMI2 (mulx). NOTE(review): without __BMI2__ this function is
 * an empty stub (the TODO branch) and leaves *c untouched — confirm the
 * build always targets BMI2-capable CPUs (extconf forces -mbmi2).
 */
inline void mul_a24_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a) {
#ifdef __BMI2__
    const uint64_t a24 = 121666;
    __asm__ __volatile__(
        "movq %2, %%rdx ;"                                            /* rdx = 121666 (mulx implicit operand) */
        "mulx (%1), %%r8, %%r10 ;"                                    /* 320-bit product a * a24 */
        "mulx 8(%1), %%r9, %%r11 ;" "addq %%r10, %%r9 ;"
        "mulx 16(%1), %%r10, %%rax ;" "adcq %%r11, %%r10 ;"
        "mulx 24(%1), %%r11, %%rcx ;" "adcq %%rax, %%r11 ;"
        /***************************/ "adcq $0, %%rcx ;"              /* rcx = high (5th) limb */
        "movl $38, %%edx ;" /* 2*c = 38 = 2^256 mod 2^255-19*/
        "mulx %%rcx, %%rax, %%rcx ;"                                  /* fold high limb: *38 */
        "addq %%rax, %%r8 ;"
        "adcq %%rcx, %%r9 ;" "movq %%r9, 8(%0) ;"
        "adcq $0, %%r10 ;" "movq %%r10, 16(%0) ;"
        "adcq $0, %%r11 ;" "movq %%r11, 24(%0) ;"
        "mov $0, %%ecx ;"
        "cmovc %%edx, %%ecx ;"                                        /* absorb final carry branch-free */
        "addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
        :
        : "r" (c), "r" (a), "r" (a24)
        : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
    );
#else /* Without BMI2 */
    /**
     * TODO: Multiplications using MULQ instruction.
     **/
#endif
}
764
842
 
765
/**
 * Multiplicative inverse over GF(2^255-19): c = a^(-1) = a^(p-2).
 *
 * Uses the standard fixed addition chain for the exponent 2^255 - 21
 * (square counts 1,2,5,10,20,50,100,50,5 with interleaved multiplies),
 * so the sequence of operations is independent of the input value.
 */
void inv_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a) {
/* Repeated squaring helper: squares A in place `times` times. */
#define sqrn_EltFp25519_1w_x64(A, times)\
    counter = times;\
    while ( counter-- > 0) {\
        sqr_EltFp25519_1w_x64(A);\
    }

    /* NOTE(review): buffer_1w appears unused here; presumably the
     * mul/sqr operations are macros that expand to references to a
     * local `buffer_1w` — confirm before removing. */
    EltFp25519_1w_Buffer_x64 buffer_1w;
    EltFp25519_1w_x64 x0, x1, x2;
    uint64_t * T[5];
    uint64_t counter;

    T[0] = x0;
    T[1] = c; /* x^(-1) */
    T[2] = x1;
    T[3] = x2;
    T[4] = a; /* x */

    copy_EltFp25519_1w_x64(T[1], a);
    sqrn_EltFp25519_1w_x64(T[1], 1);            /* a^2 */
    copy_EltFp25519_1w_x64(T[2], T[1]);
    sqrn_EltFp25519_1w_x64(T[2], 2);            /* a^8 */
    mul_EltFp25519_1w_x64(T[0], a, T[2]);       /* a^9 */
    mul_EltFp25519_1w_x64(T[1], T[1], T[0]);    /* a^11 */
    copy_EltFp25519_1w_x64(T[2], T[1]);
    sqrn_EltFp25519_1w_x64(T[2], 1);            /* a^22 */
    mul_EltFp25519_1w_x64(T[0], T[0], T[2]);    /* a^(2^5 - 1) */
    copy_EltFp25519_1w_x64(T[2], T[0]);
    sqrn_EltFp25519_1w_x64(T[2], 5);
    mul_EltFp25519_1w_x64(T[0], T[0], T[2]);    /* a^(2^10 - 1) */
    copy_EltFp25519_1w_x64(T[2], T[0]);
    sqrn_EltFp25519_1w_x64(T[2], 10);
    mul_EltFp25519_1w_x64(T[2], T[2], T[0]);    /* a^(2^20 - 1) */
    copy_EltFp25519_1w_x64(T[3], T[2]);
    sqrn_EltFp25519_1w_x64(T[3], 20);
    mul_EltFp25519_1w_x64(T[3], T[3], T[2]);    /* a^(2^40 - 1) */
    sqrn_EltFp25519_1w_x64(T[3], 10);
    mul_EltFp25519_1w_x64(T[3], T[3], T[0]);    /* a^(2^50 - 1) */
    copy_EltFp25519_1w_x64(T[0], T[3]);
    sqrn_EltFp25519_1w_x64(T[0], 50);
    mul_EltFp25519_1w_x64(T[0], T[0], T[3]);    /* a^(2^100 - 1) */
    copy_EltFp25519_1w_x64(T[2], T[0]);
    sqrn_EltFp25519_1w_x64(T[2], 100);
    mul_EltFp25519_1w_x64(T[2], T[2], T[0]);    /* a^(2^200 - 1) */
    sqrn_EltFp25519_1w_x64(T[2], 50);
    mul_EltFp25519_1w_x64(T[2], T[2], T[3]);    /* a^(2^250 - 1) */
    sqrn_EltFp25519_1w_x64(T[2], 5);
    mul_EltFp25519_1w_x64(T[1], T[1], T[2]);    /* a^(2^255 - 21) = a^(p-2) */
#undef sqrn_EltFp25519_1w_x64
}
817
893
 
818
/**
 * Given C, a 256-bit number, fred_EltFp25519_1w_x64 updates C
 * with a number such that 0 <= C < 2**255-19 (canonical form).
 * Contributed by: Samuel Neves.
 *
 * Strategy: clear bit 255 and add 19 for each clearing (2^255 ≡ 19 mod p),
 * done twice to absorb a possible carry from the first pass, then handle
 * the residual window [2^255-19, 2^255-1] with a branch-free conditional
 * add of 19 followed by a final bit-255 clear.
 **/
inline void fred_EltFp25519_1w_x64(uint64_t *const c) {
    __asm__ __volatile__ (
        /* First, obtains a number less than 2^255. */
        "btrq $63, 24(%0) ;"        /* clear bit 255; CF = old bit */
        "sbbl %%ecx, %%ecx ;"       /* ecx = CF ? -1 : 0 */
        "andq $19, %%rcx ;"         /* rcx = CF ? 19 : 0 */
        "addq %%rcx, (%0) ;"
        "adcq $0, 8(%0) ;"
        "adcq $0, 16(%0) ;"
        "adcq $0, 24(%0) ;"

        "btrq $63, 24(%0) ;"        /* repeat: the add above may set bit 255 again */
        "sbbl %%ecx, %%ecx ;"
        "andq $19, %%rcx ;"
        "addq %%rcx, (%0) ;"
        "adcq $0, 8(%0) ;"
        "adcq $0, 16(%0) ;"
        "adcq $0, 24(%0) ;"

        /* Then, in case the number fall into [2^255-19, 2^255-1] */
        "cmpq $-19, (%0) ;"
        "setaeb %%al ;"             /* al = (limb0 >= 2^64-19) */
        "cmpq $-1, 8(%0) ;"
        "setzb %%bl ;"              /* bl = (limb1 == all-ones) */
        "cmpq $-1, 16(%0) ;"
        "setzb %%cl ;"              /* cl = (limb2 == all-ones) */
        "movq 24(%0), %%rdx ;"
        "addq $1, %%rdx ;"
        "shrq $63, %%rdx ;"         /* dl = (limb3 == 2^63-1) */
        "andb %%bl, %%al ;"
        "andb %%dl, %%cl ;"
        "test %%cl, %%al ;"         /* ZF clear iff all four limb tests hold */
        "movl $0, %%eax ;"
        "movl $19, %%ecx ;"
        "cmovnz %%rcx, %%rax ;"     /* rax = 19 iff C in [p, 2^255-1] */
        "addq %%rax, (%0) ;"
        "adcq $0, 8(%0) ;"
        "adcq $0, 16(%0) ;"
        "adcq $0, 24(%0) ;"
        "btrq $63, 24(%0) ;"        /* drop the wrap into bit 255 */
        :
        : "r"(c)
        : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx"
    );
}