x25519 1.0.6 → 1.0.7

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -7,9 +7,9 @@
7
7
  [appveyor-image]: https://ci.appveyor.com/api/projects/status/4s05bcae0mow85v1?svg=true
8
8
  [appveyor-link]: https://ci.appveyor.com/project/tarcieri/x25519
9
9
  [docs-image]: https://img.shields.io/badge/yard-docs-blue.svg
10
- [docs-link]: http://www.rubydoc.info/gems/x25519/1.0.5
11
- [license-image]: https://img.shields.io/badge/License-LGPL%20v3-blue.svg
12
- [license-link]: https://www.gnu.org/licenses/lgpl-3.0
10
+ [docs-link]: http://www.rubydoc.info/gems/x25519/1.0.6
11
+ [license-image]: https://img.shields.io/badge/License-LGPL%20v2.1-blue.svg
12
+ [license-link]: https://www.gnu.org/licenses/lgpl-2.1
13
13
  [gitter-image]: https://badges.gitter.im/badge.svg
14
14
  [gitter-link]: https://gitter.im/crypto-rb/Lobby
15
15
 
@@ -32,6 +32,36 @@ Curve25519.
32
32
  [rfc7748_precomputed]: https://github.com/armfazh/rfc7748_precomputed
33
33
  [ed25519 gem]: https://github.com/crypto-rb/ed25519
34
34
 
35
+ ### Is it any good?
36
+
37
+ [Yes.](http://news.ycombinator.com/item?id=3067434)
38
+
39
+ ### What is it useful for?
40
+
41
+ X25519 is a key exchange/agreement algorithm generally used as a low-level
42
+ building block in cryptographic protocols.
43
+
44
+ ### Can I use X25519 to encrypt things?
45
+
46
+ Please use [RbNaCl::Box] or the (experimental) [XSTREAM] library if you would
47
+ like to use X25519 for public-key encryption. Otherwise, the X25519 algorithm
48
+ is not directly useful for encryption without a higher-level encryption protocol
49
+ built on top of it.
50
+
51
+ [RbNaCl::Box]: https://github.com/crypto-rb/rbnacl/wiki/Public-Key-Encryption
52
+ [XSTREAM]: https://github.com/miscreant/xstream
53
+
54
+ ## Help and Discussion
55
+
56
+ Have questions? Want to suggest a feature or change? Join a discussion group:
57
+
58
+ * [Crypto.rb Gitter]: web-based chat about Ruby crypto projects including **x25519**.
59
+ * [Crypto.rb Google Group]: join via web or email ([crypto-rb+subscribe@googlegroups.com])
60
+
61
+ [Crypto.rb Gitter]: https://gitter.im/crypto-rb/Lobby
62
+ [Crypto.rb Google Group]: https://groups.google.com/forum/#!forum/crypto-rb
63
+ [crypto-rb+subscribe@googlegroups.com]: mailto:crypto-rb+subscribe@googlegroups.com?subject=subscribe
64
+
35
65
  ## Requirements
36
66
 
37
67
  **x25519.rb** is supported on and tested against the following platforms:
@@ -260,7 +290,7 @@ The optimized [rfc7748_precomputed] implementation was designed by:
260
290
  Copyright (c) 2017-2018 Armando Faz, Tony Arcieri
261
291
 
262
292
  This gem is available as open source under the terms of the
263
- GNU Lesser General Public License v3.0 ([LICENSE](https://www.gnu.org/licenses/lgpl-3.0.txt))
293
+ GNU Lesser General Public License v2.1 ([LICENSE](https://www.gnu.org/licenses/lgpl-2.1.txt))
264
294
 
265
295
  ## Code of Conduct
266
296
 
@@ -1,13 +1,16 @@
1
1
  /**
2
- * Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>.
2
+ * Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>. All Rights Reserved.
3
3
  * Institute of Computing.
4
4
  * University of Campinas, Brazil.
5
5
  *
6
- * This program is free software: you can redistribute it and/or modify
7
- * it under the terms of the GNU Lesser General Public License as
8
- * published by the Free Software Foundation, version 3.
6
+ * Copyright (C) 2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
7
+ * Copyright (C) 2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
8
+ *
9
+ * This program is free software: you can redistribute it and/or modify
10
+ * it under the terms of the GNU Lesser General Public License as
11
+ * published by the Free Software Foundation, version 2 or greater.
9
12
  *
10
- * This program is distributed in the hope that it will be useful, but
13
+ * This program is distributed in the hope that it will be useful, but
11
14
  * WITHOUT ANY WARRANTY; without even the implied warranty of
12
15
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
16
  * Lesser General Public License for more details.
@@ -17,197 +20,175 @@
17
20
  */
18
21
  #include "fp25519_x64.h"
19
22
 
20
- int compare_bytes(uint8_t* A, uint8_t* B,unsigned int num_bytes)
21
- {
22
- unsigned int i=0;
23
- uint8_t ret=0;
24
- for(i=0;i<num_bytes;i++)
25
- {
26
- ret += A[i]^B[i];
27
- }
28
- return ret;
29
- }
30
-
31
- int compare_EltFp25519_1w_x64(uint64_t *A, uint64_t *B)
32
- {
33
- return compare_bytes((uint8_t*)A,(uint8_t*)B,SIZE_ELEMENT_BYTES);
34
- }
35
-
36
23
  /**
37
24
  *
38
- * @param c Two 512-bit products: c[0:7]=a[0:3]*b[0:3] and c[8:15]=a[4:7]*b[4:7]
39
- * @param a Two 256-bit integers: a[0:3] and a[4:7]
40
- * @param b Two 256-bit integers: b[0:3] and b[4:7]
25
+ * @param c Two 512-bit products: c0=c[0:7]=a0*b0 and c1=c[8:15]=a1*b1 (a0=a[0:3], a1=a[4:7]; likewise for b)
26
+ * @param a Two 256-bit integers: a0=a[0:3] and a1=a[4:7]
27
+ * @param b Two 256-bit integers: b0=b[0:3] and b1=b[4:7]
41
28
  */
42
- void mul2_256x256_integer_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b)
43
- {
29
+ void mul2_256x256_integer_x64(uint64_t *const c, uint64_t *const a,
30
+ uint64_t *const b) {
44
31
  #ifdef __BMI2__
45
32
  #ifdef __ADX__
46
- __asm__ __volatile__(
47
- "movq (%1), %%rdx # A[0] \n\t"
48
- "mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "movq %%r8, (%0) \n\t"
49
- "mulx 8(%2), %%r10, %%r11 # A[0]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "movq %%r10, 8(%0) \n\t"
50
- "mulx 16(%2), %%r12, %%r13 # A[0]*B[2] \n\t" "adox %%r11, %%r12 \n\t"
51
- "mulx 24(%2), %%r14, %%rdx # A[0]*B[3] \n\t" "adox %%r13, %%r14 \n\t" " movq $0, %%rax \n\t"
52
- "adox %%rdx, %%rax \n\t"
53
-
54
- "movq 8(%1), %%rdx # A[1] \n\t"
55
- "mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 8(%0), %%r8 \n\t" "movq %%r8, 8(%0) \n\t"
56
- "mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 16(%0) \n\t"
57
- "mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
58
- "mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
59
- "adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
60
-
61
- "movq 16(%1), %%rdx # A[2] \n\t"
62
- "mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 16(%0), %%r8 \n\t" "movq %%r8, 16(%0) \n\t"
63
- "mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 24(%0) \n\t"
64
- "mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
65
- "mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
66
- "adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
67
-
68
- "movq 24(%1), %%rdx # A[3] \n\t"
69
- "mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 24(%0), %%r8 \n\t" "movq %%r8, 24(%0) \n\t"
70
- "mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 32(%0) \n\t"
71
- "mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" "movq %%r12, 40(%0) \n\t" " movq $0, %%r8 \n\t"
72
- "mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" "movq %%r14, 48(%0) \n\t" " movq $0, %%rax \n\t"
73
- "adox %%rdx, %%rax \n\t" "adcx %%r8, %%rax \n\t" " movq %%rax, 56(%0) \n\t"
74
-
75
- "movq 32(%1), %%rdx # A[0] \n\t"
76
- "mulx 32(%2), %%r8, %%r9 # A[0]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "movq %%r8, 64(%0) \n\t"
77
- "mulx 40(%2), %%r10, %%r11 # A[0]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "movq %%r10, 72(%0) \n\t"
78
- "mulx 48(%2), %%r12, %%r13 # A[0]*B[2] \n\t" "adox %%r11, %%r12 \n\t"
79
- "mulx 56(%2), %%r14, %%rdx # A[0]*B[3] \n\t" "adox %%r13, %%r14 \n\t" " movq $0, %%rax \n\t"
80
- "adox %%rdx, %%rax \n\t"
81
-
82
- "movq 40(%1), %%rdx # A[1] \n\t"
83
- "mulx 32(%2), %%r8, %%r9 # A[1]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 72(%0), %%r8 \n\t" "movq %%r8, 72(%0) \n\t"
84
- "mulx 40(%2), %%r10, %%r11 # A[1]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 80(%0) \n\t"
85
- "mulx 48(%2), %%r12, %%r13 # A[1]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
86
- "mulx 56(%2), %%r14, %%rdx # A[1]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
87
- "adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
88
-
89
- "movq 48(%1), %%rdx # A[2] \n\t"
90
- "mulx 32(%2), %%r8, %%r9 # A[2]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 80(%0), %%r8 \n\t" "movq %%r8, 80(%0) \n\t"
91
- "mulx 40(%2), %%r10, %%r11 # A[2]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 88(%0) \n\t"
92
- "mulx 48(%2), %%r12, %%r13 # A[2]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
93
- "mulx 56(%2), %%r14, %%rdx # A[2]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
94
- "adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
95
-
96
- "movq 56(%1), %%rdx # A[3] \n\t"
97
- "mulx 32(%2), %%r8, %%r9 # A[3]*B[0] \n\t" "xorl %%r10d, %%r10d \n\t" "adcx 88(%0), %%r8 \n\t" "movq %%r8, 88(%0) \n\t"
98
- "mulx 40(%2), %%r10, %%r11 # A[3]*B[1] \n\t" "adox %%r9, %%r10 \n\t" "adcx %%r12, %%r10 \n\t" "movq %%r10, 96(%0) \n\t"
99
- "mulx 48(%2), %%r12, %%r13 # A[3]*B[2] \n\t" "adox %%r11, %%r12 \n\t" "adcx %%r14, %%r12 \n\t" "movq %%r12, 104(%0) \n\t" " movq $0, %%r8 \n\t"
100
- "mulx 56(%2), %%r14, %%rdx # A[3]*B[3] \n\t" "adox %%r13, %%r14 \n\t" "adcx %%rax, %%r14 \n\t" "movq %%r14, 112(%0) \n\t" " movq $0, %%rax \n\t"
101
- "adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t" " movq %%rax, 120(%0) \n\t"
102
- :
103
- : "r" (c), "r" (a), "r" (b)
104
- : "memory", "cc", "%rax", "%rdx",
105
- "%r8", "%r9", "%r10", "%r11",
106
- "%r12", "%r13", "%r14"
107
- );
33
+ __asm__ __volatile__(
34
+ "xorl %%r14d, %%r14d ;"
35
+ "movq (%1), %%rdx; " /* A[0] */
36
+ "mulx (%2), %%r8, %%r12; " /* A[0]*B[0] */ "xorl %%r10d, %%r10d ;" "movq %%r8, (%0) ;"
37
+ "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ "adox %%r10, %%r12 ;"
38
+ "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ "adox %%r8, %%rax ;"
39
+ "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ "adox %%r10, %%rbx ;"
40
+ /*******************************************/ "adox %%r14, %%rcx ;"
41
+
42
+ "movq 8(%1), %%rdx; " /* A[1] */
43
+ "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ "adox %%r12, %%r8 ;" "movq %%r8, 8(%0) ;"
44
+ "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rax ;"
45
+ "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%rbx ;"
46
+ "mulx 24(%2), %%r10, %%r12; " /* A[1]*B[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%rcx ;"
47
+ /*******************************************/ "adox %%r14, %%r12 ;" "adcx %%r14, %%r12 ;"
48
+
49
+ "movq 16(%1), %%rdx; " /* A[2] */ "xorl %%r10d, %%r10d ;"
50
+ "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ "adox %%rax, %%r8 ;" "movq %%r8, 16(%0) ;"
51
+ "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rbx ;"
52
+ "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%rcx ;"
53
+ "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%r12 ;"
54
+ /*******************************************/ "adox %%r14, %%rax ;" "adcx %%r14, %%rax ;"
55
+
56
+ "movq 24(%1), %%rdx; " /* A[3] */ "xorl %%r10d, %%r10d ;"
57
+ "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ "adox %%rbx, %%r8 ;" "movq %%r8, 24(%0) ;"
58
+ "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rcx ;" "movq %%rcx, 32(%0) ;"
59
+ "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%r12 ;" "movq %%r12, 40(%0) ;"
60
+ "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%rax ;" "movq %%rax, 48(%0) ;"
61
+ /*******************************************/ "adox %%r14, %%rbx ;" "adcx %%r14, %%rbx ;" "movq %%rbx, 56(%0) ;"
62
+
63
+ "movq 32(%1), %%rdx; " /* C[0] */
64
+ "mulx 32(%2), %%r8, %%r12; " /* C[0]*D[0] */ "xorl %%r10d, %%r10d ;" "movq %%r8, 64(%0);"
65
+ "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */ "adox %%r10, %%r12 ;"
66
+ "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */ "adox %%r8, %%rax ;"
67
+ "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */ "adox %%r10, %%rbx ;"
68
+ /*******************************************/ "adox %%r14, %%rcx ;"
69
+
70
+ "movq 40(%1), %%rdx; " /* C[1] */ "xorl %%r10d, %%r10d ;"
71
+ "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */ "adox %%r12, %%r8 ;" "movq %%r8, 72(%0);"
72
+ "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rax ;"
73
+ "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%rbx ;"
74
+ "mulx 56(%2), %%r10, %%r12; " /* C[1]*D[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%rcx ;"
75
+ /*******************************************/ "adox %%r14, %%r12 ;" "adcx %%r14, %%r12 ;"
76
+
77
+ "movq 48(%1), %%rdx; " /* C[2] */ "xorl %%r10d, %%r10d ;"
78
+ "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */ "adox %%rax, %%r8 ;" "movq %%r8, 80(%0);"
79
+ "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rbx ;"
80
+ "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%rcx ;"
81
+ "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%r12 ;"
82
+ /*******************************************/ "adox %%r14, %%rax ;" "adcx %%r14, %%rax ;"
83
+
84
+ "movq 56(%1), %%rdx; " /* C[3] */ "xorl %%r10d, %%r10d ;"
85
+ "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */ "adox %%rbx, %%r8 ;" "movq %%r8, 88(%0);"
86
+ "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rcx ;" "movq %%rcx, 96(%0) ;"
87
+ "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%r12 ;" "movq %%r12, 104(%0) ;"
88
+ "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%rax ;" "movq %%rax, 112(%0) ;"
89
+ /*******************************************/ "adox %%r14, %%rbx ;" "adcx %%r14, %%rbx ;" "movq %%rbx, 120(%0) ;"
90
+ :
91
+ : "r" (c), "r" (a), "r" (b)
92
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx",
93
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
94
+ );
108
95
  #else
109
- __asm__ __volatile__(
110
- "movq (%1), %%rdx # A[0] \n\t"
111
- "mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t" "movq %%r8, (%0) \n\t"
112
- "mulx 8(%2), %%r10, %%rax # A[0]*B[1] \n\t" "addq %%r10, %%r9 \n\t" "movq %%r9, 8(%0) \n\t"
113
- "mulx 16(%2), %%r12, %%rbx # A[0]*B[2] \n\t" "adcq %%r12, %%rax \n\t"
114
- "mulx 24(%2), %%r14, %%rcx # A[0]*B[3] \n\t" "adcq %%r14, %%rbx \n\t"
115
- "adcq $0, %%rcx \n\t"
116
-
117
- "movq 8(%1), %%rdx # A[1] \n\t"
118
- "mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t"
119
- "mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
120
- "mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
121
- "mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
122
- "adcq $0, %%rdx \n\t"
123
-
124
- "addq %%r8, 8(%0) \n\t"
125
- "adcq %%rax, %%r9 \n\t" "movq %%r9, 16(%0) \n\t" "movq $0, %%rax \n\t"
126
- "adcq %%r11, %%rbx \n\t"
127
- "adcq %%r13, %%rcx \n\t"
128
- "adcq %%rdx, %%rax \n\t"
129
-
130
- "movq 16(%1), %%rdx # A[2] \n\t"
131
- "mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t"
132
- "mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
133
- "mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
134
- "mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
135
- "adcq $0, %%rdx \n\t"
136
-
137
- "addq %%r8, 16(%0) \n\t"
138
- "adcq %%rbx, %%r9 \n\t" "movq %%r9, 24(%0) \n\t" "movq $0, %%rbx \n\t"
139
- "adcq %%r11, %%rcx \n\t"
140
- "adcq %%r13, %%rax \n\t"
141
- "adcq %%rdx, %%rbx \n\t"
142
-
143
- "movq 24(%1), %%rdx # A[3] \n\t"
144
- "mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t"
145
- "mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
146
- "mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
147
- "mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
148
- "adcq $0, %%rdx \n\t"
149
-
150
- "addq %%r8, 24(%0) \n\t"
151
- "adcq %%rcx, %%r9 \n\t" "movq %%r9, 32(%0) \n\t" " movq $0, %%rcx \n\t"
152
- "adcq %%r11, %%rax \n\t" "movq %%rax, 40(%0) \n\t"
153
- "adcq %%r13, %%rbx \n\t" "movq %%rbx, 48(%0) \n\t"
154
- "adcq %%rdx, %%rcx \n\t" "movq %%rcx, 56(%0) \n\t"
155
-
156
- "movq 32(%1), %%rdx # A[0] \n\t"
157
- "mulx 32(%2), %%r8, %%r9 # A[0]*B[0] \n\t" "movq %%r8, 64(%0) \n\t"
158
- "mulx 40(%2), %%r10, %%rax # A[0]*B[1] \n\t" "addq %%r10, %%r9 \n\t" "movq %%r9, 72(%0) \n\t"
159
- "mulx 48(%2), %%r12, %%rbx # A[0]*B[2] \n\t" "adcq %%r12, %%rax \n\t"
160
- "mulx 56(%2), %%r14, %%rcx # A[0]*B[3] \n\t" "adcq %%r14, %%rbx \n\t"
161
- "adcq $0, %%rcx \n\t"
162
-
163
- "movq 40(%1), %%rdx # A[1] \n\t"
164
- "mulx 32(%2), %%r8, %%r9 # A[1]*B[0] \n\t"
165
- "mulx 40(%2), %%r10, %%r11 # A[1]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
166
- "mulx 48(%2), %%r12, %%r13 # A[1]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
167
- "mulx 56(%2), %%r14, %%rdx # A[1]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
168
- "adcq $0, %%rdx \n\t"
169
-
170
- "addq %%r8, 72(%0) \n\t"
171
- "adcq %%rax, %%r9 \n\t" " movq %%r9, 80(%0) \n\t" " movq $0, %%rax \n\t"
172
- "adcq %%r11, %%rbx \n\t"
173
- "adcq %%r13, %%rcx \n\t"
174
- "adcq %%rdx, %%rax \n\t"
175
-
176
- "movq 48(%1), %%rdx # A[2] \n\t"
177
- "mulx 32(%2), %%r8, %%r9 # A[2]*B[0] \n\t"
178
- "mulx 40(%2), %%r10, %%r11 # A[2]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
179
- "mulx 48(%2), %%r12, %%r13 # A[2]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
180
- "mulx 56(%2), %%r14, %%rdx # A[2]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
181
- "adcq $0, %%rdx \n\t"
182
-
183
- "addq %%r8, 80(%0) \n\t"
184
- "adcq %%rbx, %%r9 \n\t" " movq %%r9, 88(%0) \n\t" " movq $0, %%rbx \n\t"
185
- "adcq %%r11, %%rcx \n\t"
186
- "adcq %%r13, %%rax \n\t"
187
- "adcq %%rdx, %%rbx \n\t"
188
-
189
- "movq 56(%1), %%rdx # A[3] \n\t"
190
- "mulx 32(%2), %%r8, %%r9 # A[3]*B[0] \n\t"
191
- "mulx 40(%2), %%r10, %%r11 # A[3]*B[1] \n\t" "addq %%r10, %%r9 \n\t"
192
- "mulx 48(%2), %%r12, %%r13 # A[3]*B[2] \n\t" "adcq %%r12, %%r11 \n\t"
193
- "mulx 56(%2), %%r14, %%rdx # A[3]*B[3] \n\t" "adcq %%r14, %%r13 \n\t"
194
- "adcq $0, %%rdx \n\t"
195
-
196
- "addq %%r8, 88(%0) \n\t"
197
- "adcq %%rcx, %%r9 \n\t" "movq %%r9, 96(%0) \n\t" " movq $0, %%rcx \n\t"
198
- "adcq %%r11, %%rax \n\t" "movq %%rax, 104(%0) \n\t"
199
- "adcq %%r13, %%rbx \n\t" "movq %%rbx, 112(%0) \n\t"
200
- "adcq %%rdx, %%rcx \n\t" "movq %%rcx, 120(%0) \n\t"
201
- :
202
- : "r" (c), "r" (a), "r" (b)
203
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8",
204
- "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
205
- );
96
+ __asm__ __volatile__(
97
+ "movq (%1), %%rdx; " /* A[0] */
98
+ "mulx (%2), %%r8, %%r12; " /* A[0]*B[0] */ "movq %%r8, (%0) ;"
99
+ "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ "addq %%r10, %%r12 ;"
100
+ "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ "adcq %%r8, %%rax ;"
101
+ "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ "adcq %%r10, %%rbx ;"
102
+ /*******************************************/ "adcq $0, %%rcx ;"
103
+
104
+ "movq 8(%1), %%rdx; " /* A[1] */
105
+ "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ "addq %%r12, %%r8 ;" "movq %%r8, 8(%0) ;"
106
+ "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ "adcq %%r10, %%r9 ;"
107
+ "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ "adcq %%r8, %%r11 ;"
108
+ "mulx 24(%2), %%r10, %%r12; " /* A[1]*B[3] */ "adcq %%r10, %%r13 ;"
109
+ /*******************************************/ "adcq $0, %%r12 ;"
110
+
111
+ "addq %%r9, %%rax ;"
112
+ "adcq %%r11, %%rbx ;"
113
+ "adcq %%r13, %%rcx ;"
114
+ "adcq $0, %%r12 ;"
115
+
116
+ "movq 16(%1), %%rdx; " /* A[2] */
117
+ "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ "addq %%rax, %%r8 ;" "movq %%r8, 16(%0) ;"
118
+ "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ "adcq %%r10, %%r9 ;"
119
+ "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ "adcq %%r8, %%r11 ;"
120
+ "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ "adcq %%r10, %%r13 ;"
121
+ /*******************************************/ "adcq $0, %%rax ;"
122
+
123
+ "addq %%r9, %%rbx ;"
124
+ "adcq %%r11, %%rcx ;"
125
+ "adcq %%r13, %%r12 ;"
126
+ "adcq $0, %%rax ;"
127
+
128
+ "movq 24(%1), %%rdx; " /* A[3] */
129
+ "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ "addq %%rbx, %%r8 ;" "movq %%r8, 24(%0) ;"
130
+ "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ "adcq %%r10, %%r9 ;"
131
+ "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ "adcq %%r8, %%r11 ;"
132
+ "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ "adcq %%r10, %%r13 ;"
133
+ /*******************************************/ "adcq $0, %%rbx ;"
134
+
135
+ "addq %%r9, %%rcx ;" "movq %%rcx, 32(%0) ;"
136
+ "adcq %%r11, %%r12 ;" "movq %%r12, 40(%0) ;"
137
+ "adcq %%r13, %%rax ;" "movq %%rax, 48(%0) ;"
138
+ "adcq $0, %%rbx ;" "movq %%rbx, 56(%0) ;"
139
+
140
+ "movq 32(%1), %%rdx; " /* C[0] */
141
+ "mulx 32(%2), %%r8, %%r12; " /* C[0]*D[0] */ "movq %%r8, 64(%0) ;"
142
+ "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */ "addq %%r10, %%r12 ;"
143
+ "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */ "adcq %%r8, %%rax ;"
144
+ "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */ "adcq %%r10, %%rbx ;"
145
+ /*******************************************/ "adcq $0, %%rcx ;"
146
+
147
+ "movq 40(%1), %%rdx; " /* C[1] */
148
+ "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */ "addq %%r12, %%r8 ;" "movq %%r8, 72(%0) ;"
149
+ "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */ "adcq %%r10, %%r9 ;"
150
+ "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */ "adcq %%r8, %%r11 ;"
151
+ "mulx 56(%2), %%r10, %%r12; " /* C[1]*D[3] */ "adcq %%r10, %%r13 ;"
152
+ /*******************************************/ "adcq $0, %%r12 ;"
153
+
154
+ "addq %%r9, %%rax ;"
155
+ "adcq %%r11, %%rbx ;"
156
+ "adcq %%r13, %%rcx ;"
157
+ "adcq $0, %%r12 ;"
158
+
159
+ "movq 48(%1), %%rdx; " /* C[2] */
160
+ "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */ "addq %%rax, %%r8 ;" "movq %%r8, 80(%0) ;"
161
+ "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */ "adcq %%r10, %%r9 ;"
162
+ "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */ "adcq %%r8, %%r11 ;"
163
+ "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */ "adcq %%r10, %%r13 ;"
164
+ /*******************************************/ "adcq $0, %%rax ;"
165
+
166
+ "addq %%r9, %%rbx ;"
167
+ "adcq %%r11, %%rcx ;"
168
+ "adcq %%r13, %%r12 ;"
169
+ "adcq $0, %%rax ;"
170
+
171
+ "movq 56(%1), %%rdx; " /* C[3] */
172
+ "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */ "addq %%rbx, %%r8 ;" "movq %%r8, 88(%0) ;"
173
+ "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */ "adcq %%r10, %%r9 ;"
174
+ "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */ "adcq %%r8, %%r11 ;"
175
+ "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */ "adcq %%r10, %%r13 ;"
176
+ /*******************************************/ "adcq $0, %%rbx ;"
177
+
178
+ "addq %%r9, %%rcx ;" "movq %%rcx, 96(%0) ;"
179
+ "adcq %%r11, %%r12 ;" "movq %%r12, 104(%0) ;"
180
+ "adcq %%r13, %%rax ;" "movq %%rax, 112(%0) ;"
181
+ "adcq $0, %%rbx ;" "movq %%rbx, 120(%0) ;"
182
+ :
183
+ : "r" (c), "r" (a), "r" (b)
184
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx",
185
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13"
186
+ );
206
187
  #endif
207
188
  #else /* Without BMI2 */
208
- /**
209
- * TODO: Multiplications using MULQ instruction.
210
- **/
189
+ /**
190
+ * TODO: Multiplications using MULQ instruction.
191
+ **/
211
192
  #endif
212
193
  }
213
194
 
@@ -216,140 +197,186 @@ void mul2_256x256_integer_x64(uint64_t *const c, uint64_t *const a, uint64_t *co
216
197
  * @param c Two 512-bit squares: c[0:7]=a[0:3]^2 and c[8:15]=a[4:7]^2
217
198
  * @param a Two 256-bit integers: a[0:3] and a[4:7]
218
199
  */
219
- void sqr2_256x256_integer_x64(uint64_t *const c, uint64_t *const a)
220
- {
200
+ void sqr2_256x256_integer_x64(uint64_t *const c, uint64_t *const a) {
221
201
  #ifdef __BMI2__
222
- __asm__ __volatile__(
223
- "movq (%1), %%rdx # A[0] \n\t"
224
- "mulx %%rdx, %%r8, %%r9 # A[0]^2 \n\t"
225
- "movq 8(%1), %%rdx # A[1] \n\t"
226
- "mulx %%rdx, %%r10, %%r11 # A[1]^2 \n\t"
227
- "movq %%r8, (%0) \n\t"
228
- "movq %%r9, 8(%0) \n\t"
229
- "movq %%r10, 16(%0) \n\t"
230
- "movq %%r11, 24(%0) \n\t"
231
-
232
- "movq 16(%1), %%rdx # A[2] \n\t"
233
- "mulx %%rdx, %%r8, %%r9 # A[2]^2 \n\t"
234
- "movq 24(%1), %%rdx # A[3] \n\t"
235
- "mulx %%rdx, %%r10, %%r11 # A[3]^2 \n\t"
236
- "movq %%r8, 32(%0) \n\t"
237
- "movq %%r9, 40(%0) \n\t"
238
- "movq %%r10, 48(%0) \n\t"
239
- "movq %%r11, 56(%0) \n\t"
240
-
241
- "movq 8(%1), %%rdx # A[1] \n\t"
242
- "mulx (%1), %%r8, %%r9 # A[0]*A[1] \n\t"
243
- "mulx 16(%1), %%r10, %%r11 # A[2]*A[1] \n\t"
244
- "mulx 24(%1), %%rcx, %%r14 # A[3]*A[1] \n\t"
245
-
246
- "movq 16(%1), %%rdx # A[2] \n\t"
247
- "mulx 24(%1), %%r12, %%r13 # A[3]*A[2] \n\t"
248
- "mulx (%1), %%rax, %%rdx # A[0]*A[2] \n\t"
249
-
250
- "addq %%rax, %%r9 \n\t"
251
- "adcq %%rdx, %%r10 \n\t"
252
- "adcq %%rcx, %%r11 \n\t"
253
- "adcq %%r14, %%r12 \n\t"
254
- "adcq $0, %%r13 \n\t"
255
- "movq $0, %%r14 \n\t"
256
- "adcq $0, %%r14 \n\t"
257
-
258
- "movq (%1), %%rdx # A[0] \n\t"
259
- "mulx 24(%1), %%rax, %%rdx # A[0]*A[3] \n\t"
260
-
261
- "addq %%rax, %%r10 \n\t"
262
- "adcq %%rdx, %%r11 \n\t"
263
- "adcq $0, %%r12 \n\t"
264
- "adcq $0, %%r13 \n\t"
265
- "adcq $0, %%r14 \n\t"
266
-
267
- "shldq $1, %%r13, %%r14 \n\t"
268
- "shldq $1, %%r12, %%r13 \n\t"
269
- "shldq $1, %%r11, %%r12 \n\t"
270
- "shldq $1, %%r10, %%r11 \n\t"
271
- "shldq $1, %%r9, %%r10 \n\t"
272
- "shldq $1, %%r8, %%r9 \n\t"
273
- "shlq $1, %%r8 \n\t"
274
-
275
- "addq 8(%0), %%r8 \n\t" "movq %%r8, 8(%0) \n\t"
276
- "adcq 16(%0), %%r9 \n\t" "movq %%r9, 16(%0) \n\t"
277
- "adcq 24(%0), %%r10 \n\t" "movq %%r10, 24(%0) \n\t"
278
- "adcq 32(%0), %%r11 \n\t" "movq %%r11, 32(%0) \n\t"
279
- "adcq 40(%0), %%r12 \n\t" "movq %%r12, 40(%0) \n\t"
280
- "adcq 48(%0), %%r13 \n\t" "movq %%r13, 48(%0) \n\t"
281
- "adcq 56(%0), %%r14 \n\t" "movq %%r14, 56(%0) \n\t"
282
-
283
-
284
- "movq 32(%1), %%rdx # A[0] \n\t"
285
- "mulx %%rdx, %%r8, %%r9 # A[0]^2 \n\t"
286
- "movq 40(%1), %%rdx # A[1] \n\t"
287
- "mulx %%rdx, %%r10, %%r11 # A[1]^2 \n\t"
288
- "movq %%r8, 64(%0) \n\t"
289
- "movq %%r9, 72(%0) \n\t"
290
- "movq %%r10, 80(%0) \n\t"
291
- "movq %%r11, 88(%0) \n\t"
292
-
293
- "movq 48(%1), %%rdx # A[2] \n\t"
294
- "mulx %%rdx, %%r8, %%r9 # A[2]^2 \n\t"
295
- "movq 56(%1), %%rdx # A[3] \n\t"
296
- "mulx %%rdx, %%r10, %%r11 # A[3]^2 \n\t"
297
- "movq %%r8, 96(%0) \n\t"
298
- "movq %%r9, 104(%0) \n\t"
299
- "movq %%r10, 112(%0) \n\t"
300
- "movq %%r11, 120(%0) \n\t"
301
-
302
- "movq 40(%1), %%rdx # A[1] \n\t"
303
- "mulx 32(%1), %%r8, %%r9 # A[0]*A[1] \n\t"
304
- "mulx 48(%1), %%r10, %%r11 # A[2]*A[1] \n\t"
305
- "mulx 56(%1), %%rcx, %%r14 # A[3]*A[1] \n\t"
306
-
307
- "movq 48(%1), %%rdx # A[2] \n\t"
308
- "mulx 56(%1), %%r12, %%r13 # A[3]*A[2] \n\t"
309
- "mulx 32(%1), %%rax, %%rdx # A[0]*A[2] \n\t"
310
-
311
- "addq %%rax, %%r9 \n\t"
312
- "adcq %%rdx, %%r10 \n\t"
313
- "adcq %%rcx, %%r11 \n\t"
314
- "adcq %%r14, %%r12 \n\t"
315
- "adcq $0, %%r13 \n\t"
316
- "movq $0, %%r14 \n\t"
317
- "adcq $0, %%r14 \n\t"
318
-
319
- "movq 32(%1), %%rdx # A[0] \n\t"
320
- "mulx 56(%1), %%rax, %%rdx # A[0]*A[3] \n\t"
321
-
322
- "addq %%rax, %%r10 \n\t"
323
- "adcq %%rdx, %%r11 \n\t"
324
- "adcq $0, %%r12 \n\t"
325
- "adcq $0, %%r13 \n\t"
326
- "adcq $0, %%r14 \n\t"
327
-
328
- "shldq $1, %%r13, %%r14 \n\t"
329
- "shldq $1, %%r12, %%r13 \n\t"
330
- "shldq $1, %%r11, %%r12 \n\t"
331
- "shldq $1, %%r10, %%r11 \n\t"
332
- "shldq $1, %%r9, %%r10 \n\t"
333
- "shldq $1, %%r8, %%r9 \n\t"
334
- "shlq $1, %%r8 \n\t"
335
-
336
- "addq 72(%0), %%r8 \n\t" "movq %%r8, 72(%0) \n\t"
337
- "adcq 80(%0), %%r9 \n\t" "movq %%r9, 80(%0) \n\t"
338
- "adcq 88(%0), %%r10 \n\t" "movq %%r10, 88(%0) \n\t"
339
- "adcq 96(%0), %%r11 \n\t" "movq %%r11, 96(%0) \n\t"
340
- "adcq 104(%0), %%r12 \n\t" "movq %%r12, 104(%0) \n\t"
341
- "adcq 112(%0), %%r13 \n\t" "movq %%r13, 112(%0) \n\t"
342
- "adcq 120(%0), %%r14 \n\t" "movq %%r14, 120(%0) \n\t"
343
- :
344
- : "r" (c), "r" (a)
345
- : "cc", "%rax", "%rcx", "%rdx",
346
- "%r8", "%r9", "%r10", "%r11",
347
- "%r12", "%r13", "%r14"
348
- );
202
+ #ifdef __ADX__
203
+ __asm__ __volatile__(
204
+ "movq (%1), %%rdx ;" /* A[0] */
205
+ "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */ "xorl %%r15d, %%r15d;"
206
+ "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */ "adcx %%r14, %%r9 ;"
207
+ "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */ "adcx %%rax, %%r10 ;"
208
+ "movq 24(%1), %%rdx ;" /* A[3] */
209
+ "mulx 8(%1), %%r11, %%r12 ;" /* A[1]*A[3] */ "adcx %%rcx, %%r11 ;"
210
+ "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */ "adcx %%rax, %%r12 ;"
211
+ "movq 8(%1), %%rdx ;" /* A[1] */ "adcx %%r15, %%r13 ;"
212
+ "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */ "movq $0, %%r14 ;"
213
+ /*******************************************/ "adcx %%r15, %%r14 ;"
214
+
215
+ "xorl %%r15d, %%r15d;"
216
+ "adox %%rax, %%r10 ;" "adcx %%r8, %%r8 ;"
217
+ "adox %%rcx, %%r11 ;" "adcx %%r9, %%r9 ;"
218
+ "adox %%r15, %%r12 ;" "adcx %%r10, %%r10 ;"
219
+ "adox %%r15, %%r13 ;" "adcx %%r11, %%r11 ;"
220
+ "adox %%r15, %%r14 ;" "adcx %%r12, %%r12 ;"
221
+ "adcx %%r13, %%r13 ;"
222
+ "adcx %%r14, %%r14 ;"
223
+
224
+ "movq (%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
225
+ /********************/ "movq %%rax, 0(%0) ;"
226
+ "addq %%rcx, %%r8 ;" "movq %%r8, 8(%0) ;"
227
+ "movq 8(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
228
+ "adcq %%rax, %%r9 ;" "movq %%r9, 16(%0) ;"
229
+ "adcq %%rcx, %%r10 ;" "movq %%r10, 24(%0) ;"
230
+ "movq 16(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
231
+ "adcq %%rax, %%r11 ;" "movq %%r11, 32(%0) ;"
232
+ "adcq %%rcx, %%r12 ;" "movq %%r12, 40(%0) ;"
233
+ "movq 24(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
234
+ "adcq %%rax, %%r13 ;" "movq %%r13, 48(%0) ;"
235
+ "adcq %%rcx, %%r14 ;" "movq %%r14, 56(%0) ;"
236
+
237
+
238
+ "movq 32(%1), %%rdx ;" /* B[0] */
239
+ "mulx 40(%1), %%r8, %%r14 ;" /* B[1]*B[0] */ "xorl %%r15d, %%r15d;"
240
+ "mulx 48(%1), %%r9, %%r10 ;" /* B[2]*B[0] */ "adcx %%r14, %%r9 ;"
241
+ "mulx 56(%1), %%rax, %%rcx ;" /* B[3]*B[0] */ "adcx %%rax, %%r10 ;"
242
+ "movq 56(%1), %%rdx ;" /* B[3] */
243
+ "mulx 40(%1), %%r11, %%r12 ;" /* B[1]*B[3] */ "adcx %%rcx, %%r11 ;"
244
+ "mulx 48(%1), %%rax, %%r13 ;" /* B[2]*B[3] */ "adcx %%rax, %%r12 ;"
245
+ "movq 40(%1), %%rdx ;" /* B[1] */ "adcx %%r15, %%r13 ;"
246
+ "mulx 48(%1), %%rax, %%rcx ;" /* B[2]*B[1] */ "movq $0, %%r14 ;"
247
+ /*******************************************/ "adcx %%r15, %%r14 ;"
248
+
249
+ "xorl %%r15d, %%r15d;"
250
+ "adox %%rax, %%r10 ;" "adcx %%r8, %%r8 ;"
251
+ "adox %%rcx, %%r11 ;" "adcx %%r9, %%r9 ;"
252
+ "adox %%r15, %%r12 ;" "adcx %%r10, %%r10 ;"
253
+ "adox %%r15, %%r13 ;" "adcx %%r11, %%r11 ;"
254
+ "adox %%r15, %%r14 ;" "adcx %%r12, %%r12 ;"
255
+ "adcx %%r13, %%r13 ;"
256
+ "adcx %%r14, %%r14 ;"
257
+
258
+ "movq 32(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* B[0]^2 */
259
+ /********************/ "movq %%rax, 64(%0) ;"
260
+ "addq %%rcx, %%r8 ;" "movq %%r8, 72(%0) ;"
261
+ "movq 40(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* B[1]^2 */
262
+ "adcq %%rax, %%r9 ;" "movq %%r9, 80(%0) ;"
263
+ "adcq %%rcx, %%r10 ;" "movq %%r10, 88(%0) ;"
264
+ "movq 48(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* B[2]^2 */
265
+ "adcq %%rax, %%r11 ;" "movq %%r11, 96(%0) ;"
266
+ "adcq %%rcx, %%r12 ;" "movq %%r12, 104(%0) ;"
267
+ "movq 56(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* B[3]^2 */
268
+ "adcq %%rax, %%r13 ;" "movq %%r13, 112(%0) ;"
269
+ "adcq %%rcx, %%r14 ;" "movq %%r14, 120(%0) ;"
270
+ :
271
+ : "r" (c), "r" (a)
272
+ : "memory", "cc", "%rax", "%rcx", "%rdx",
273
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"
274
+ );
275
+ #else /* Without ADX */
276
+ __asm__ __volatile__(
277
+ "movq 8(%1), %%rdx ;" /* A[1] */
278
+ "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */
279
+ "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
280
+ "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
281
+
282
+ "movq 16(%1), %%rdx ;" /* A[2] */
283
+ "mulx 24(%1), %%r12, %%r13 ;" /* A[3]*A[2] */
284
+ "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
285
+
286
+ "addq %%rax, %%r9 ;"
287
+ "adcq %%rdx, %%r10 ;"
288
+ "adcq %%rcx, %%r11 ;"
289
+ "adcq %%r14, %%r12 ;"
290
+ "adcq $0, %%r13 ;"
291
+ "movq $0, %%r14 ;"
292
+ "adcq $0, %%r14 ;"
293
+
294
+ "movq (%1), %%rdx ;" /* A[0] */
295
+ "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
296
+
297
+ "addq %%rax, %%r10 ;"
298
+ "adcq %%rcx, %%r11 ;"
299
+ "adcq $0, %%r12 ;"
300
+ "adcq $0, %%r13 ;"
301
+ "adcq $0, %%r14 ;"
302
+
303
+ "shldq $1, %%r13, %%r14 ;"
304
+ "shldq $1, %%r12, %%r13 ;"
305
+ "shldq $1, %%r11, %%r12 ;"
306
+ "shldq $1, %%r10, %%r11 ;"
307
+ "shldq $1, %%r9, %%r10 ;"
308
+ "shldq $1, %%r8, %%r9 ;"
309
+ "shlq $1, %%r8 ;"
310
+
311
+ /********************/ "mulx %%rdx, %%rax, %%rcx ; " /* A[0]^2 */
312
+ /********************/ "movq %%rax, 0(%0) ;"
313
+ "addq %%rcx, %%r8 ;" "movq %%r8, 8(%0) ;"
314
+ "movq 8(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ; " /* A[1]^2 */
315
+ "adcq %%rax, %%r9 ;" "movq %%r9, 16(%0) ;"
316
+ "adcq %%rcx, %%r10 ;" "movq %%r10, 24(%0) ;"
317
+ "movq 16(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ; " /* A[2]^2 */
318
+ "adcq %%rax, %%r11 ;" "movq %%r11, 32(%0) ;"
319
+ "adcq %%rcx, %%r12 ;" "movq %%r12, 40(%0) ;"
320
+ "movq 24(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ; " /* A[3]^2 */
321
+ "adcq %%rax, %%r13 ;" "movq %%r13, 48(%0) ;"
322
+ "adcq %%rcx, %%r14 ;" "movq %%r14, 56(%0) ;"
323
+
324
+ "movq 40(%1), %%rdx ;" /* B[1] */
325
+ "mulx 32(%1), %%r8, %%r9 ;" /* B[0]*B[1] */
326
+ "mulx 48(%1), %%r10, %%r11 ;" /* B[2]*B[1] */
327
+ "mulx 56(%1), %%rcx, %%r14 ;" /* B[3]*B[1] */
328
+
329
+ "movq 48(%1), %%rdx ;" /* B[2] */
330
+ "mulx 56(%1), %%r12, %%r13 ;" /* B[3]*B[2] */
331
+ "mulx 32(%1), %%rax, %%rdx ;" /* B[0]*B[2] */
332
+
333
+ "addq %%rax, %%r9 ;"
334
+ "adcq %%rdx, %%r10 ;"
335
+ "adcq %%rcx, %%r11 ;"
336
+ "adcq %%r14, %%r12 ;"
337
+ "adcq $0, %%r13 ;"
338
+ "movq $0, %%r14 ;"
339
+ "adcq $0, %%r14 ;"
340
+
341
+ "movq 32(%1), %%rdx ;" /* B[0] */
342
+ "mulx 56(%1), %%rax, %%rcx ;" /* B[0]*B[3] */
343
+
344
+ "addq %%rax, %%r10 ;"
345
+ "adcq %%rcx, %%r11 ;"
346
+ "adcq $0, %%r12 ;"
347
+ "adcq $0, %%r13 ;"
348
+ "adcq $0, %%r14 ;"
349
+
350
+ "shldq $1, %%r13, %%r14 ;"
351
+ "shldq $1, %%r12, %%r13 ;"
352
+ "shldq $1, %%r11, %%r12 ;"
353
+ "shldq $1, %%r10, %%r11 ;"
354
+ "shldq $1, %%r9, %%r10 ;"
355
+ "shldq $1, %%r8, %%r9 ;"
356
+ "shlq $1, %%r8 ;"
357
+
358
+ /********************/ "mulx %%rdx, %%rax, %%rcx ; " /* B[0]^2 */
359
+ /********************/ "movq %%rax, 64(%0) ;"
360
+ "addq %%rcx, %%r8 ;" "movq %%r8, 72(%0) ;"
361
+ "movq 40(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ; " /* B[1]^2 */
362
+ "adcq %%rax, %%r9 ;" "movq %%r9, 80(%0) ;"
363
+ "adcq %%rcx, %%r10 ;" "movq %%r10, 88(%0) ;"
364
+ "movq 48(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ; " /* B[2]^2 */
365
+ "adcq %%rax, %%r11 ;" "movq %%r11, 96(%0) ;"
366
+ "adcq %%rcx, %%r12 ;" "movq %%r12, 104(%0) ;"
367
+ "movq 56(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ; " /* B[3]^2 */
368
+ "adcq %%rax, %%r13 ;" "movq %%r13, 112(%0) ;"
369
+ "adcq %%rcx, %%r14 ;" "movq %%r14, 120(%0) ;"
370
+ :
371
+ : "r" (c), "r" (a)
372
+ : "memory", "cc", "%rax", "%rcx", "%rdx",
373
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
374
+ );
375
+ #endif
349
376
  #else /* Without BMI2 */
350
- /**
351
- * TODO: Multiplications using MULQ instruction.
352
- **/
377
+ /**
378
+ * TODO: Multiplications using MULQ instruction.
379
+ **/
353
380
  #endif
354
381
  }
355
382
 
@@ -358,467 +385,543 @@ void sqr2_256x256_integer_x64(uint64_t *const c, uint64_t *const a)
358
385
  * @param c
359
386
  * @param a
360
387
  */
361
- void red_EltFp25519_2w_x64(uint64_t *const c, uint64_t *const a)
362
- {
388
+ void red_EltFp25519_2w_x64(uint64_t *const c, uint64_t *const a) {
363
389
  #ifdef __BMI2__
364
390
  #ifdef __ADX__
365
- __asm__ __volatile__(
366
- " movl $38, %%edx # 2*c = 38 = 2^256 \n\t"
367
- " mulx 32(%1), %%r8, %%r10 # c*C[4] \n\t" " xorl %%ebx, %%ebx \n\t" " adox (%1), %%r8 \n\t"
368
- " mulx 40(%1), %%r9, %%r11 # c*C[5] \n\t" " adcx %%r10, %%r9 \n\t" " adox 8(%1), %%r9 \n\t"
369
- " mulx 48(%1), %%r10, %%rax # c*C[6] \n\t" " adcx %%r11, %%r10 \n\t" " adox 16(%1), %%r10 \n\t" " movq %%r10, 16(%0) \n\t"
370
- " mulx 56(%1), %%r11, %%rcx # c*C[7] \n\t" " adcx %%rax, %%r11 \n\t" " adox 24(%1), %%r11 \n\t" " movq %%r11, 24(%0) \n\t"
371
- " adcx %%rbx, %%rcx \n\t" " adox %%rbx, %%rcx \n\t"
372
- " xorl %%ebx, %%ebx \n\t"
373
- " mulx %%rcx, %%rax, %%rcx \n\t" " adcx %%rax, %%r8 \n\t" " movq %%r8, (%0) \n\t"
374
- " adcx %%rcx, %%r9 \n\t" " movq %%r9, 8(%0) \n\t"
375
-
376
- " mulx 96(%1), %%r8, %%r10 # c*C[4] \n\t" " xorl %%ebx, %%ebx \n\t" " adox 64(%1), %%r8 \n\t"
377
- " mulx 104(%1), %%r9, %%r11 # c*C[5] \n\t" " adcx %%r10, %%r9 \n\t" " adox 72(%1), %%r9 \n\t"
378
- " mulx 112(%1), %%r10, %%rax # c*C[6] \n\t" " adcx %%r11, %%r10 \n\t" " adox 80(%1), %%r10 \n\t" " movq %%r10, 48(%0) \n\t"
379
- " mulx 120(%1), %%r11, %%rcx # c*C[7] \n\t" " adcx %%rax, %%r11 \n\t" " adox 88(%1), %%r11 \n\t" " movq %%r11, 56(%0) \n\t"
380
- " adcx %%rbx, %%rcx \n\t" " adox %%rbx, %%rcx \n\t"
381
- " xorl %%ebx, %%ebx \n\t"
382
- " mulx %%rcx, %%rax, %%rcx \n\t" " adcx %%rax, %%r8 \n\t" " movq %%r8, 32(%0) \n\t"
383
- " adcx %%rcx, %%r9 \n\t" " movq %%r9, 40(%0) \n\t"
384
- :
385
- : "r" (c), "r" (a)
386
- : "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
387
- );
391
+ __asm__ __volatile__(
392
+ "movl $38, %%edx; " /* 2*c = 38 = 2^256 */
393
+ "mulx 32(%1), %%r8, %%r10; " /* c*C[4] */ "xorl %%ebx, %%ebx ;" "adox (%1), %%r8 ;"
394
+ "mulx 40(%1), %%r9, %%r11; " /* c*C[5] */ "adcx %%r10, %%r9 ;" "adox 8(%1), %%r9 ;"
395
+ "mulx 48(%1), %%r10, %%rax; " /* c*C[6] */ "adcx %%r11, %%r10 ;" "adox 16(%1), %%r10 ;"
396
+ "mulx 56(%1), %%r11, %%rcx; " /* c*C[7] */ "adcx %%rax, %%r11 ;" "adox 24(%1), %%r11 ;"
397
+ /****************************************/ "adcx %%rbx, %%rcx ;" "adox %%rbx, %%rcx ;"
398
+ "clc ;"
399
+ "mulx %%rcx, %%rax, %%rcx ; " /* c*C[4] */
400
+ "adcx %%rax, %%r8 ;"
401
+ "adcx %%rcx, %%r9 ;" "movq %%r9, 8(%0) ;"
402
+ "adcx %%rbx, %%r10 ;" "movq %%r10, 16(%0) ;"
403
+ "adcx %%rbx, %%r11 ;" "movq %%r11, 24(%0) ;"
404
+ "mov $0, %%ecx ;"
405
+ "cmovc %%edx, %%ecx ;"
406
+ "addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
407
+
408
+ "mulx 96(%1), %%r8, %%r10; " /* c*C[4] */ "xorl %%ebx, %%ebx ;" "adox 64(%1), %%r8 ;"
409
+ "mulx 104(%1), %%r9, %%r11; " /* c*C[5] */ "adcx %%r10, %%r9 ;" "adox 72(%1), %%r9 ;"
410
+ "mulx 112(%1), %%r10, %%rax; " /* c*C[6] */ "adcx %%r11, %%r10 ;" "adox 80(%1), %%r10 ;"
411
+ "mulx 120(%1), %%r11, %%rcx; " /* c*C[7] */ "adcx %%rax, %%r11 ;" "adox 88(%1), %%r11 ;"
412
+ /*****************************************/ "adcx %%rbx, %%rcx ;" "adox %%rbx, %%rcx ;"
413
+ "clc ;"
414
+ "mulx %%rcx, %%rax, %%rcx ; " /* c*C[4] */
415
+ "adcx %%rax, %%r8 ;"
416
+ "adcx %%rcx, %%r9 ;" "movq %%r9, 40(%0) ;"
417
+ "adcx %%rbx, %%r10 ;" "movq %%r10, 48(%0) ;"
418
+ "adcx %%rbx, %%r11 ;" "movq %%r11, 56(%0) ;"
419
+ "mov $0, %%ecx ;"
420
+ "cmovc %%edx, %%ecx ;"
421
+ "addq %%rcx, %%r8 ;" "movq %%r8, 32(%0) ;"
422
+ :
423
+ : "r" (c), "r" (a)
424
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
425
+ );
388
426
  #else
389
- __asm__ __volatile__(
390
- "movl $38, %%edx # 2*c = 38 = 2^256 \n\t"
391
- "mulx 32(%1), %%r8, %%r9 # c*C[4] \n\t"
392
- "mulx 40(%1), %%r10, %%r11 # c*C[5] \n\t" "addq %%r9, %%r10 \n\t"
393
- "mulx 48(%1), %%r12, %%r13 # c*C[6] \n\t" "adcq %%r11, %%r12 \n\t"
394
- "mulx 56(%1), %%rax, %%rcx # c*C[7] \n\t" "adcq %%r13, %%rax \n\t"
395
- "adcq $0, %%rcx \n\t"
396
-
397
- "addq (%1), %%r8 \n\t"
398
- "adcq 8(%1), %%r10 \n\t"
399
- "adcq 16(%1), %%r12 \n\t" "movq %%r12, 16(%0) \n\t"
400
- "adcq 24(%1), %%rax \n\t" "movq %%rax, 24(%0) \n\t"
401
- "adcq $0, %%rcx \n\t"
402
-
403
- "mulx %%rcx, %%rax, %%rcx \n\t"
404
- "addq %%rax, %%r8 \n\t" "movq %%r8, (%0) \n\t"
405
- "adcq %%rcx, %%r10 \n\t" "movq %%r10, 8(%0) \n\t"
406
-
407
- "mulx 96(%1), %%r8, %%r9 # c*C[4] \n\t"
408
- "mulx 104(%1), %%r10, %%r11 # c*C[5] \n\t" "addq %%r9, %%r10 \n\t"
409
- "mulx 112(%1), %%r12, %%r13 # c*C[6] \n\t" "adcq %%r11, %%r12 \n\t"
410
- "mulx 120(%1), %%rax, %%rcx # c*C[7] \n\t" "adcq %%r13, %%rax \n\t"
411
- "adcq $0, %%rcx \n\t"
412
-
413
- "addq 64(%1), %%r8 \n\t"
414
- "adcq 72(%1), %%r10 \n\t"
415
- "adcq 80(%1), %%r12 \n\t" "movq %%r12, 48(%0) \n\t"
416
- "adcq 88(%1), %%rax \n\t" "movq %%rax, 56(%0) \n\t"
417
- "adcq $0, %%rcx \n\t"
418
-
419
- "mulx %%rcx, %%rax, %%rcx \n\t"
420
- "addq %%rax, %%r8 \n\t" " movq %%r8, 32(%0) \n\t"
421
- "adcq %%rcx, %%r10 \n\t" " movq %%r10, 40(%0) \n\t"
422
-
423
- :
424
- : "r" (c), "r" (a)
425
- : "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13"
426
- );
427
+ __asm__ __volatile__(
428
+ "movl $38, %%edx ; " /* 2*c = 38 = 2^256 */
429
+ "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
430
+ "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ "addq %%r10, %%r9 ;"
431
+ "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ "adcq %%r11, %%r10 ;"
432
+ "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ "adcq %%rax, %%r11 ;"
433
+ /****************************************/ "adcq $0, %%rcx ;"
434
+ "addq (%1), %%r8 ;"
435
+ "adcq 8(%1), %%r9 ;"
436
+ "adcq 16(%1), %%r10 ;"
437
+ "adcq 24(%1), %%r11 ;"
438
+ "adcq $0, %%rcx ;"
439
+ "mulx %%rcx, %%rax, %%rcx ;" /* c*C[4] */
440
+ "addq %%rax, %%r8 ;"
441
+ "adcq %%rcx, %%r9 ;" "movq %%r9, 8(%0) ;"
442
+ "adcq $0, %%r10 ;" "movq %%r10, 16(%0) ;"
443
+ "adcq $0, %%r11 ;" "movq %%r11, 24(%0) ;"
444
+ "mov $0, %%ecx ;"
445
+ "cmovc %%edx, %%ecx ;"
446
+ "addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
447
+
448
+ "mulx 96(%1), %%r8, %%r10 ;" /* c*C[4] */
449
+ "mulx 104(%1), %%r9, %%r11 ;" /* c*C[5] */ "addq %%r10, %%r9 ;"
450
+ "mulx 112(%1), %%r10, %%rax ;" /* c*C[6] */ "adcq %%r11, %%r10 ;"
451
+ "mulx 120(%1), %%r11, %%rcx ;" /* c*C[7] */ "adcq %%rax, %%r11 ;"
452
+ /*****************************************/ "adcq $0, %%rcx ;"
453
+ "addq 64(%1), %%r8 ;"
454
+ "adcq 72(%1), %%r9 ;"
455
+ "adcq 80(%1), %%r10 ;"
456
+ "adcq 88(%1), %%r11 ;"
457
+ "adcq $0, %%rcx ;"
458
+ "mulx %%rcx, %%rax, %%rcx ;" /* c*C[4] */
459
+ "addq %%rax, %%r8 ;"
460
+ "adcq %%rcx, %%r9 ;" "movq %%r9, 40(%0) ;"
461
+ "adcq $0, %%r10 ;" "movq %%r10, 48(%0) ;"
462
+ "adcq $0, %%r11 ;" "movq %%r11, 56(%0) ;"
463
+ "mov $0, %%ecx ;"
464
+ "cmovc %%edx, %%ecx ;"
465
+ "addq %%rcx, %%r8 ;" "movq %%r8, 32(%0) ;"
466
+ :
467
+ : "r" (c), "r" (a)
468
+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
469
+ );
427
470
  #endif
428
471
  #else /* Without BMI2 */
429
- /* [TODO] */
472
+ /* [TODO] */
430
473
  #endif
431
474
  }
432
475
 
433
- void mul_256x256_integer_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b)
434
- {
476
+ void mul_256x256_integer_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b) {
435
477
  #ifdef __BMI2__
436
478
  #ifdef __ADX__
437
- __asm__ __volatile__(
438
- " movq (%1), %%rdx # A[0] \n\t"
439
- " mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t" " xorl %%r10d, %%r10d \n\t" " movq %%r8, (%0) \n\t"
440
- " mulx 8(%2), %%r10, %%r11 # A[0]*B[1] \n\t" " adox %%r9, %%r10 \n\t" " movq %%r10, 8(%0) \n\t"
441
- " mulx 16(%2), %%r12, %%r13 # A[0]*B[2] \n\t" " adox %%r11, %%r12 \n\t"
442
- " mulx 24(%2), %%r14, %%rdx # A[0]*B[3] \n\t" " adox %%r13, %%r14 \n\t" " movq $0, %%rax \n\t"
443
- " adox %%rdx, %%rax \n\t"
444
-
445
- " movq 8(%1), %%rdx # A[1] \n\t"
446
- " mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t" " xorl %%r10d, %%r10d \n\t" " adcx 8(%0), %%r8 \n\t" " movq %%r8, 8(%0) \n\t"
447
- " mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t" " adox %%r9, %%r10 \n\t" " adcx %%r12, %%r10 \n\t" " movq %%r10, 16(%0) \n\t"
448
- " mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t" " adox %%r11, %%r12 \n\t" " adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
449
- " mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t" " adox %%r13, %%r14 \n\t" " adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
450
- " adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
451
-
452
- " movq 16(%1), %%rdx # A[2] \n\t"
453
- " mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t" " xorl %%r10d, %%r10d \n\t" " adcx 16(%0), %%r8 \n\t" " movq %%r8, 16(%0) \n\t"
454
- " mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t" " adox %%r9, %%r10 \n\t" " adcx %%r12, %%r10 \n\t" " movq %%r10, 24(%0) \n\t"
455
- " mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t" " adox %%r11, %%r12 \n\t" " adcx %%r14, %%r12 \n\t" " movq $0, %%r8 \n\t"
456
- " mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t" " adox %%r13, %%r14 \n\t" " adcx %%rax, %%r14 \n\t" " movq $0, %%rax \n\t"
457
- " adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t"
458
-
459
- " movq 24(%1), %%rdx # A[3] \n\t"
460
- " mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t" " xorl %%r10d, %%r10d \n\t" " adcx 24(%0), %%r8 \n\t" " movq %%r8, 24(%0) \n\t"
461
- " mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t" " adox %%r9, %%r10 \n\t" " adcx %%r12, %%r10 \n\t" " movq %%r10, 32(%0) \n\t"
462
- " mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t" " adox %%r11, %%r12 \n\t" " adcx %%r14, %%r12 \n\t" " movq %%r12, 40(%0) \n\t" " movq $0, %%r8 \n\t"
463
- " mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t" " adox %%r13, %%r14 \n\t" " adcx %%rax, %%r14 \n\t" " movq %%r14, 48(%0) \n\t" " movq $0, %%rax \n\t"
464
- " adox %%rdx, %%rax \n\t" " adcx %%r8, %%rax \n\t" " movq %%rax, 56(%0) \n\t"
465
- :
466
- : "r" (c), "r" (a), "r" (b)
467
- : "memory", "cc", "%rax", "%rdx",
468
- "%r8", "%r9", "%r10", "%r11",
469
- "%r12", "%r13", "%r14"
470
- );
479
+ __asm__ __volatile__(
480
+ "movq (%1), %%rdx; " /* A[0] */
481
+ "mulx (%2), %%r8, %%r9; " /* A[0]*B[0] */ "xorl %%r10d, %%r10d ;" "movq %%r8, (%0) ;"
482
+ "mulx 8(%2), %%r10, %%r11; " /* A[0]*B[1] */ "adox %%r9, %%r10 ;" "movq %%r10, 8(%0) ;"
483
+ "mulx 16(%2), %%r12, %%r13; " /* A[0]*B[2] */ "adox %%r11, %%r12 ;"
484
+ "mulx 24(%2), %%r14, %%rdx; " /* A[0]*B[3] */ "adox %%r13, %%r14 ;" "movq $0, %%rax ;"
485
+ /*******************************************/ "adox %%rdx, %%rax ;"
486
+
487
+ "movq 8(%1), %%rdx; " /* A[1] */
488
+ "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ "xorl %%r10d, %%r10d ;" "adcx 8(%0), %%r8 ;" "movq %%r8, 8(%0) ;"
489
+ "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ "adox %%r9, %%r10 ;" "adcx %%r12, %%r10 ;" "movq %%r10, 16(%0) ;"
490
+ "mulx 16(%2), %%r12, %%r13; " /* A[1]*B[2] */ "adox %%r11, %%r12 ;" "adcx %%r14, %%r12 ;" "movq $0, %%r8 ;"
491
+ "mulx 24(%2), %%r14, %%rdx; " /* A[1]*B[3] */ "adox %%r13, %%r14 ;" "adcx %%rax, %%r14 ;" "movq $0, %%rax ;"
492
+ /*******************************************/ "adox %%rdx, %%rax ;" "adcx %%r8, %%rax ;"
493
+
494
+ "movq 16(%1), %%rdx; " /* A[2] */
495
+ "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ "xorl %%r10d, %%r10d ;" "adcx 16(%0), %%r8 ;" "movq %%r8, 16(%0) ;"
496
+ "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ "adox %%r9, %%r10 ;" "adcx %%r12, %%r10 ;" "movq %%r10, 24(%0) ;"
497
+ "mulx 16(%2), %%r12, %%r13; " /* A[2]*B[2] */ "adox %%r11, %%r12 ;" "adcx %%r14, %%r12 ;" "movq $0, %%r8 ;"
498
+ "mulx 24(%2), %%r14, %%rdx; " /* A[2]*B[3] */ "adox %%r13, %%r14 ;" "adcx %%rax, %%r14 ;" "movq $0, %%rax ;"
499
+ /*******************************************/ "adox %%rdx, %%rax ;" "adcx %%r8, %%rax ;"
500
+
501
+ "movq 24(%1), %%rdx; " /* A[3] */
502
+ "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ "xorl %%r10d, %%r10d ;" "adcx 24(%0), %%r8 ;" "movq %%r8, 24(%0) ;"
503
+ "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ "adox %%r9, %%r10 ;" "adcx %%r12, %%r10 ;" "movq %%r10, 32(%0) ;"
504
+ "mulx 16(%2), %%r12, %%r13; " /* A[3]*B[2] */ "adox %%r11, %%r12 ;" "adcx %%r14, %%r12 ;" "movq %%r12, 40(%0) ;" "movq $0, %%r8 ;"
505
+ "mulx 24(%2), %%r14, %%rdx; " /* A[3]*B[3] */ "adox %%r13, %%r14 ;" "adcx %%rax, %%r14 ;" "movq %%r14, 48(%0) ;" "movq $0, %%rax ;"
506
+ /*******************************************/ "adox %%rdx, %%rax ;" "adcx %%r8, %%rax ;" "movq %%rax, 56(%0) ;"
507
+ :
508
+ : "r" (c), "r" (a), "r" (b)
509
+ : "memory", "cc", "%rax", "%rdx", "%r8",
510
+ "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
511
+ );
471
512
  #else
472
- __asm__ __volatile__(
473
- " movq (%1), %%rdx # A[0] \n\t"
474
- " mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t" " movq %%r8, (%0) \n\t"
475
- " mulx 8(%2), %%r10, %%rax # A[0]*B[1] \n\t" " addq %%r10, %%r9 \n\t" " movq %%r9, 8(%0) \n\t"
476
- " mulx 16(%2), %%r12, %%rbx # A[0]*B[2] \n\t" " adcq %%r12, %%rax \n\t"
477
- " mulx 24(%2), %%r14, %%rcx # A[0]*B[3] \n\t" " adcq %%r14, %%rbx \n\t"
478
- " adcq $0, %%rcx \n\t"
479
-
480
- " movq 8(%1), %%rdx # A[1] \n\t"
481
- " mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t"
482
- " mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t" " addq %%r10, %%r9 \n\t"
483
- " mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t" " adcq %%r12, %%r11 \n\t"
484
- " mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t" " adcq %%r14, %%r13 \n\t"
485
- " adcq $0, %%rdx \n\t"
486
-
487
- " addq %%r8, 8(%0) \n\t"
488
- " adcq %%rax, %%r9 \n\t" " movq %%r9, 16(%0) \n\t" " movq $0, %%rax \n\t"
489
- " adcq %%r11, %%rbx \n\t"
490
- " adcq %%r13, %%rcx \n\t"
491
- " adcq %%rdx, %%rax \n\t"
492
-
493
- " movq 16(%1), %%rdx # A[2] \n\t"
494
- " mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t"
495
- " mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t" " addq %%r10, %%r9 \n\t"
496
- " mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t" " adcq %%r12, %%r11 \n\t"
497
- " mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t" " adcq %%r14, %%r13 \n\t"
498
- " adcq $0, %%rdx \n\t"
499
-
500
- " addq %%r8, 16(%0) \n\t"
501
- " adcq %%rbx, %%r9 \n\t" " movq %%r9, 24(%0) \n\t" " movq $0, %%rbx \n\t"
502
- " adcq %%r11, %%rcx \n\t"
503
- " adcq %%r13, %%rax \n\t"
504
- " adcq %%rdx, %%rbx \n\t"
505
-
506
- " movq 24(%1), %%rdx # A[3] \n\t"
507
- " mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t"
508
- " mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t" " addq %%r10, %%r9 \n\t"
509
- " mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t" " adcq %%r12, %%r11 \n\t"
510
- " mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t" " adcq %%r14, %%r13 \n\t"
511
- " adcq $0, %%rdx \n\t"
512
-
513
- " addq %%r8, 24(%0) \n\t"
514
- " adcq %%rcx, %%r9 \n\t" " movq %%r9, 32(%0) \n\t" " movq $0, %%rcx \n\t"
515
- " adcq %%r11, %%rax \n\t" " movq %%rax, 40(%0) \n\t"
516
- " adcq %%r13, %%rbx \n\t" " movq %%rbx, 48(%0) \n\t"
517
- " adcq %%rdx, %%rcx \n\t" " movq %%rcx, 56(%0) \n\t"
518
- :
519
- : "r" (c), "r" (a), "r" (b)
520
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8",
521
- "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
522
- );
513
+ __asm__ __volatile__(
514
+ "movq (%1), %%rdx; " /* A[0] */
515
+ "mulx (%2), %%r8, %%r12; " /* A[0]*B[0] */ "movq %%r8, (%0) ;"
516
+ "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ "addq %%r10, %%r12 ;"
517
+ "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ "adcq %%r8, %%rax ;"
518
+ "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ "adcq %%r10, %%rbx ;"
519
+ /*******************************************/ "adcq $0, %%rcx ;"
520
+
521
+ "movq 8(%1), %%rdx; " /* A[1] */
522
+ "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ "addq %%r12, %%r8 ;" "movq %%r8, 8(%0) ;"
523
+ "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ "adcq %%r10, %%r9 ;"
524
+ "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ "adcq %%r8, %%r11 ;"
525
+ "mulx 24(%2), %%r10, %%r12; " /* A[1]*B[3] */ "adcq %%r10, %%r13 ;"
526
+ /*******************************************/ "adcq $0, %%r12 ;"
527
+
528
+ "addq %%r9, %%rax ;"
529
+ "adcq %%r11, %%rbx ;"
530
+ "adcq %%r13, %%rcx ;"
531
+ "adcq $0, %%r12 ;"
532
+
533
+ "movq 16(%1), %%rdx; " /* A[2] */
534
+ "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ "addq %%rax, %%r8 ;" "movq %%r8, 16(%0) ;"
535
+ "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ "adcq %%r10, %%r9 ;"
536
+ "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ "adcq %%r8, %%r11 ;"
537
+ "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ "adcq %%r10, %%r13 ;"
538
+ /*******************************************/ "adcq $0, %%rax ;"
539
+
540
+ "addq %%r9, %%rbx ;"
541
+ "adcq %%r11, %%rcx ;"
542
+ "adcq %%r13, %%r12 ;"
543
+ "adcq $0, %%rax ;"
544
+
545
+ "movq 24(%1), %%rdx; " /* A[3] */
546
+ "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ "addq %%rbx, %%r8 ;" "movq %%r8, 24(%0) ;"
547
+ "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ "adcq %%r10, %%r9 ;"
548
+ "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ "adcq %%r8, %%r11 ;"
549
+ "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ "adcq %%r10, %%r13 ;"
550
+ /*******************************************/ "adcq $0, %%rbx ;"
551
+
552
+ "addq %%r9, %%rcx ;" "movq %%rcx, 32(%0) ;"
553
+ "adcq %%r11, %%r12 ;" "movq %%r12, 40(%0) ;"
554
+ "adcq %%r13, %%rax ;" "movq %%rax, 48(%0) ;"
555
+ "adcq $0, %%rbx ;" "movq %%rbx, 56(%0) ;"
556
+ :
557
+ : "r" (c), "r" (a), "r" (b)
558
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx",
559
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13"
560
+ );
523
561
  #endif
524
562
  #else /* Without BMI2 */
525
- /**
526
- * TODO: Multiplications using MULQ instruction.
527
- **/
563
+ /**
564
+ * TODO: Multiplications using MULQ instruction.
565
+ **/
528
566
  #endif
529
567
  }
530
568
 
531
- void sqr_256x256_integer_x64(uint64_t *const c, uint64_t *const a)
532
- {
569
+ void sqr_256x256_integer_x64(uint64_t *const c, uint64_t *const a) {
533
570
  #ifdef __BMI2__
534
- __asm__ __volatile__(
535
- " movq (%1), %%rdx # A[0] \n\t"
536
- " mulx %%rdx, %%r8, %%r9 # A[0]^2 \n\t"
537
- " movq 8(%1), %%rdx # A[1] \n\t"
538
- " mulx %%rdx, %%r10, %%r11 # A[1]^2 \n\t"
539
- " movq %%r8, (%0) \n\t"
540
- " movq %%r9, 8(%0) \n\t"
541
- " movq %%r10, 16(%0) \n\t"
542
- " movq %%r11, 24(%0) \n\t"
543
-
544
- " movq 16(%1), %%rdx # A[2] \n\t"
545
- " mulx %%rdx, %%r8, %%r9 # A[2]^2 \n\t"
546
- " movq 24(%1), %%rdx # A[3] \n\t"
547
- " mulx %%rdx, %%r10, %%r11 # A[3]^2 \n\t"
548
- " movq %%r8, 32(%0) \n\t"
549
- " movq %%r9, 40(%0) \n\t"
550
- " movq %%r10, 48(%0) \n\t"
551
- " movq %%r11, 56(%0) \n\t"
552
-
553
- " movq 8(%1), %%rdx # A[1] \n\t"
554
- " mulx (%1), %%r8, %%r9 # A[0]*A[1] \n\t"
555
- " mulx 16(%1), %%r10, %%r11 # A[2]*A[1] \n\t"
556
- " mulx 24(%1), %%rcx, %%r14 # A[3]*A[1] \n\t"
557
-
558
- " movq 16(%1), %%rdx # A[2] \n\t"
559
- " mulx 24(%1), %%r12, %%r13 # A[3]*A[2] \n\t"
560
- " mulx (%1), %%rax, %%rdx # A[0]*A[2] \n\t"
561
-
562
- " addq %%rax, %%r9 \n\t"
563
- " adcq %%rdx, %%r10 \n\t"
564
- " adcq %%rcx, %%r11 \n\t"
565
- " adcq %%r14, %%r12 \n\t"
566
- " adcq $0, %%r13 \n\t"
567
- " movq $0, %%r14 \n\t"
568
- " adcq $0, %%r14 \n\t"
569
-
570
- " movq (%1), %%rdx # A[0] \n\t"
571
- " mulx 24(%1), %%rax, %%rdx # A[0]*A[3] \n\t"
572
-
573
- " addq %%rax, %%r10 \n\t"
574
- " adcq %%rdx, %%r11 \n\t"
575
- " adcq $0, %%r12 \n\t"
576
- " adcq $0, %%r13 \n\t"
577
- " adcq $0, %%r14 \n\t"
578
-
579
- " shldq $1, %%r13, %%r14 \n\t"
580
- " shldq $1, %%r12, %%r13 \n\t"
581
- " shldq $1, %%r11, %%r12 \n\t"
582
- " shldq $1, %%r10, %%r11 \n\t"
583
- " shldq $1, %%r9, %%r10 \n\t"
584
- " shldq $1, %%r8, %%r9 \n\t"
585
- " shlq $1, %%r8 \n\t"
586
-
587
- " addq 8(%0), %%r8 \n\t" " movq %%r8, 8(%0) \n\t"
588
- " adcq 16(%0), %%r9 \n\t" " movq %%r9, 16(%0) \n\t"
589
- " adcq 24(%0), %%r10 \n\t" " movq %%r10, 24(%0) \n\t"
590
- " adcq 32(%0), %%r11 \n\t" " movq %%r11, 32(%0) \n\t"
591
- " adcq 40(%0), %%r12 \n\t" " movq %%r12, 40(%0) \n\t"
592
- " adcq 48(%0), %%r13 \n\t" " movq %%r13, 48(%0) \n\t"
593
- " adcq 56(%0), %%r14 \n\t" " movq %%r14, 56(%0) \n\t"
594
- :
595
- : "r" (c), "r" (a)
596
- : "memory", "cc", "%rax", "%rcx", "%rdx",
597
- "%r8", "%r9", "%r10", "%r11",
598
- "%r12", "%r13", "%r14"
599
- );
571
+ #ifdef __ADX__
572
+ __asm__ __volatile__(
573
+ "movq (%1), %%rdx ;" /* A[0] */
574
+ "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */ "xorl %%r15d, %%r15d;"
575
+ "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */ "adcx %%r14, %%r9 ;"
576
+ "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */ "adcx %%rax, %%r10 ;"
577
+ "movq 24(%1), %%rdx ;" /* A[3] */
578
+ "mulx 8(%1), %%r11, %%r12 ;" /* A[1]*A[3] */ "adcx %%rcx, %%r11 ;"
579
+ "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */ "adcx %%rax, %%r12 ;"
580
+ "movq 8(%1), %%rdx ;" /* A[1] */ "adcx %%r15, %%r13 ;"
581
+ "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */ "movq $0, %%r14 ;"
582
+ /*******************************************/ "adcx %%r15, %%r14 ;"
583
+
584
+ "xorl %%r15d, %%r15d;"
585
+ "adox %%rax, %%r10 ;" "adcx %%r8, %%r8 ;"
586
+ "adox %%rcx, %%r11 ;" "adcx %%r9, %%r9 ;"
587
+ "adox %%r15, %%r12 ;" "adcx %%r10, %%r10 ;"
588
+ "adox %%r15, %%r13 ;" "adcx %%r11, %%r11 ;"
589
+ "adox %%r15, %%r14 ;" "adcx %%r12, %%r12 ;"
590
+ "adcx %%r13, %%r13 ;"
591
+ "adcx %%r14, %%r14 ;"
592
+
593
+ "movq (%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
594
+ /********************/ "movq %%rax, 0(%0) ;"
595
+ "addq %%rcx, %%r8 ;" "movq %%r8, 8(%0) ;"
596
+ "movq 8(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
597
+ "adcq %%rax, %%r9 ;" "movq %%r9, 16(%0) ;"
598
+ "adcq %%rcx, %%r10 ;" "movq %%r10, 24(%0) ;"
599
+ "movq 16(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
600
+ "adcq %%rax, %%r11 ;" "movq %%r11, 32(%0) ;"
601
+ "adcq %%rcx, %%r12 ;" "movq %%r12, 40(%0) ;"
602
+ "movq 24(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
603
+ "adcq %%rax, %%r13 ;" "movq %%r13, 48(%0) ;"
604
+ "adcq %%rcx, %%r14 ;" "movq %%r14, 56(%0) ;"
605
+ :
606
+ : "r" (c), "r" (a)
607
+ : "memory", "cc", "%rax", "%rcx", "%rdx",
608
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"
609
+ );
610
+ #else /* Without ADX */
611
+ __asm__ __volatile__(
612
+ "movq 8(%1), %%rdx ;" /* A[1] */
613
+ "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */
614
+ "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
615
+ "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
616
+
617
+ "movq 16(%1), %%rdx ;" /* A[2] */
618
+ "mulx 24(%1), %%r12, %%r13 ;" /* A[3]*A[2] */
619
+ "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
620
+
621
+ "addq %%rax, %%r9 ;"
622
+ "adcq %%rdx, %%r10 ;"
623
+ "adcq %%rcx, %%r11 ;"
624
+ "adcq %%r14, %%r12 ;"
625
+ "adcq $0, %%r13 ;"
626
+ "movq $0, %%r14 ;"
627
+ "adcq $0, %%r14 ;"
628
+
629
+ "movq (%1), %%rdx ;" /* A[0] */
630
+ "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
631
+
632
+ "addq %%rax, %%r10 ;"
633
+ "adcq %%rcx, %%r11 ;"
634
+ "adcq $0, %%r12 ;"
635
+ "adcq $0, %%r13 ;"
636
+ "adcq $0, %%r14 ;"
637
+
638
+ "shldq $1, %%r13, %%r14 ;"
639
+ "shldq $1, %%r12, %%r13 ;"
640
+ "shldq $1, %%r11, %%r12 ;"
641
+ "shldq $1, %%r10, %%r11 ;"
642
+ "shldq $1, %%r9, %%r10 ;"
643
+ "shldq $1, %%r8, %%r9 ;"
644
+ "shlq $1, %%r8 ;"
645
+
646
+ /********************/ "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
647
+ /********************/ "movq %%rax, 0(%0) ;"
648
+ "addq %%rcx, %%r8 ;" "movq %%r8, 8(%0) ;"
649
+ "movq 8(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
650
+ "adcq %%rax, %%r9 ;" "movq %%r9, 16(%0) ;"
651
+ "adcq %%rcx, %%r10 ;" "movq %%r10, 24(%0) ;"
652
+ "movq 16(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
653
+ "adcq %%rax, %%r11 ;" "movq %%r11, 32(%0) ;"
654
+ "adcq %%rcx, %%r12 ;" "movq %%r12, 40(%0) ;"
655
+ "movq 24(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
656
+ "adcq %%rax, %%r13 ;" "movq %%r13, 48(%0) ;"
657
+ "adcq %%rcx, %%r14 ;" "movq %%r14, 56(%0) ;"
658
+ :
659
+ : "r" (c), "r" (a)
660
+ : "memory", "cc", "%rax", "%rcx", "%rdx",
661
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
662
+ );
663
+ #endif
600
664
  #else /* Without BMI2 */
601
- /**
602
- * TODO: Multiplications using MULQ instruction.
603
- **/
665
+ /**
666
+ * TODO: Multiplications using MULQ instruction.
667
+ **/
604
668
  #endif
605
669
  }
606
670
 
607
- void red_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a)
608
- {
671
+ void red_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a) {
609
672
  #ifdef __BMI2__
610
673
  #ifdef __ADX__
611
- __asm__ __volatile__(
612
- " movl $38, %%edx # 2*c = 38 = 2^256 \n\t"
613
- " mulx 32(%1), %%r8, %%r10 # c*C[4] \n\t" " xorl %%ebx, %%ebx \n\t" " adox (%1), %%r8 \n\t"
614
- " mulx 40(%1), %%r9, %%r11 # c*C[5] \n\t" " adcx %%r10, %%r9 \n\t" " adox 8(%1), %%r9 \n\t"
615
- " mulx 48(%1), %%r10, %%rax # c*C[6] \n\t" " adcx %%r11, %%r10 \n\t" " adox 16(%1), %%r10 \n\t" " movq %%r10, 16(%0) \n\t"
616
- " mulx 56(%1), %%r11, %%rcx # c*C[7] \n\t" " adcx %%rax, %%r11 \n\t" " adox 24(%1), %%r11 \n\t" " movq %%r11, 24(%0) \n\t"
617
- " adcx %%rbx, %%rcx \n\t" " adox %%rbx, %%rcx \n\t"
618
- " xorl %%ebx, %%ebx \n\t"
619
- " mulx %%rcx, %%rax, %%rcx \n\t" " adcx %%rax, %%r8 \n\t" " movq %%r8, (%0) \n\t"
620
- " adcx %%rcx, %%r9 \n\t" " movq %%r9, 8(%0) \n\t"
621
- :
622
- : "r" (c), "r" (a)
623
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
624
- );
674
+ __asm__ __volatile__(
675
+ "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */
676
+ "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */ "xorl %%ebx, %%ebx ;" "adox (%1), %%r8 ;"
677
+ "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ "adcx %%r10, %%r9 ;" "adox 8(%1), %%r9 ;"
678
+ "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ "adcx %%r11, %%r10 ;" "adox 16(%1), %%r10 ;"
679
+ "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ "adcx %%rax, %%r11 ;" "adox 24(%1), %%r11 ;"
680
+ /****************************************/ "adcx %%rbx, %%rcx ;" "adox %%rbx, %%rcx ;"
681
+ "clc ;"
682
+ "mulx %%rcx, %%rax, %%rcx ;" /* c*C[4] */
683
+ "adcx %%rax, %%r8 ;"
684
+ "adcx %%rcx, %%r9 ;" "movq %%r9, 8(%0) ;"
685
+ "adcx %%rbx, %%r10 ;" "movq %%r10, 16(%0) ;"
686
+ "adcx %%rbx, %%r11 ;" "movq %%r11, 24(%0) ;"
687
+ "mov $0, %%ecx ;"
688
+ "cmovc %%edx, %%ecx ;"
689
+ "addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
690
+ :
691
+ : "r" (c), "r" (a)
692
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
693
+ );
625
694
  #else
626
- __asm__ __volatile__(
627
- " movl $38, %%edx # 2*c = 38 = 2^256 \n\t"
628
- " mulx 32(%1), %%r8, %%r9 # c*C[4] \n\t"
629
- " mulx 40(%1), %%r10, %%r11 # c*C[5] \n\t" " addq %%r9, %%r10 \n\t"
630
- " mulx 48(%1), %%r12, %%r13 # c*C[6] \n\t" " adcq %%r11, %%r12 \n\t"
631
- " mulx 56(%1), %%rax, %%rcx # c*C[7] \n\t" " adcq %%r13, %%rax \n\t"
632
- " adcq $0, %%rcx \n\t"
633
-
634
- " addq (%1), %%r8 \n\t"
635
- " adcq 8(%1), %%r10 \n\t"
636
- " adcq 16(%1), %%r12 \n\t" " movq %%r12, 16(%0) \n\t"
637
- " adcq 24(%1), %%rax \n\t" " movq %%rax, 24(%0) \n\t"
638
- " adcq $0, %%rcx \n\t"
639
-
640
- " mulx %%rcx, %%rax, %%rcx \n\t"
641
- " addq %%rax, %%r8 \n\t" " movq %%r8, (%0) \n\t"
642
- " adcq %%rcx, %%r10 \n\t" " movq %%r10, 8(%0) \n\t"
643
- :
644
- : "r" (c), "r" (a)
645
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13"
646
- );
695
+ __asm__ __volatile__(
696
+ "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */
697
+ "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
698
+ "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ "addq %%r10, %%r9 ;"
699
+ "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ "adcq %%r11, %%r10 ;"
700
+ "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ "adcq %%rax, %%r11 ;"
701
+ /****************************************/ "adcq $0, %%rcx ;"
702
+ "addq (%1), %%r8 ;"
703
+ "adcq 8(%1), %%r9 ;"
704
+ "adcq 16(%1), %%r10 ;"
705
+ "adcq 24(%1), %%r11 ;"
706
+ "adcq $0, %%rcx ;"
707
+ "mulx %%rcx, %%rax, %%rcx ;" /* c*C[4] */
708
+ "addq %%rax, %%r8 ;"
709
+ "adcq %%rcx, %%r9 ;" "movq %%r9, 8(%0) ;"
710
+ "adcq $0, %%r10 ;" "movq %%r10, 16(%0) ;"
711
+ "adcq $0, %%r11 ;" "movq %%r11, 24(%0) ;"
712
+ "mov $0, %%ecx ;"
713
+ "cmovc %%edx, %%ecx ;"
714
+ "addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
715
+ :
716
+ : "r" (c), "r" (a)
717
+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
718
+ );
647
719
  #endif
648
720
  #else /* Without BMI2 */
649
- /**
650
- * TODO: Multiplications using MULQ instruction.
651
- **/
721
+ /**
722
+ * TODO: Multiplications using MULQ instruction.
723
+ **/
652
724
  #endif
653
725
  }
654
726
 
655
- inline void add_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b)
656
- {
657
- #if __ADX__
658
- __asm__ __volatile__(
659
- "movq (%2), %%rax \n\t"
660
- "movq 8(%2), %%rcx \n\t"
661
- "movq 16(%2), %%r8 \n\t"
662
- "movq 24(%2), %%r9 \n\t"
663
- "clc \n\t"
664
- "adcx (%1), %%rax \n\t"
665
- "adcx 8(%1), %%rcx \n\t"
666
- "adcx 16(%1), %%r8 \n\t"
667
- "adcx 24(%1), %%r9 \n\t"
668
- "movq %%rcx, 8(%0) \n\t"
669
- "movq %%r8 , 16(%0) \n\t"
670
- "movq %%r9 , 24(%0) \n\t"
671
- "setc %%cl \n\t"
672
- "neg %%rcx \n\t"
673
- "andq $38, %%rcx \n\t"
674
- "addq %%rcx, %%rax \n\t"
675
- "movq %%rax, (%0) \n\t"
676
- :
677
- : "r" (c), "r" (a), "r" (b)
678
- : "memory","cc", "%rax", "%rcx", "%r8", "%r9"
679
- );
727
+ inline void add_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b) {
728
+ #ifdef __ADX__
729
+ __asm__ __volatile__(
730
+ "mov $38, %%eax ;"
731
+ "xorl %%ecx, %%ecx ;"
732
+ "movq (%2), %%r8 ;" "adcx (%1), %%r8 ;"
733
+ "movq 8(%2), %%r9 ;" "adcx 8(%1), %%r9 ;"
734
+ "movq 16(%2), %%r10 ;" "adcx 16(%1), %%r10 ;"
735
+ "movq 24(%2), %%r11 ;" "adcx 24(%1), %%r11 ;"
736
+ "cmovc %%eax, %%ecx ;"
737
+ "xorl %%eax, %%eax ;"
738
+ "adcx %%rcx, %%r8 ;"
739
+ "adcx %%rax, %%r9 ;" "movq %%r9, 8(%0) ;"
740
+ "adcx %%rax, %%r10 ;" "movq %%r10, 16(%0) ;"
741
+ "adcx %%rax, %%r11 ;" "movq %%r11, 24(%0) ;"
742
+ "mov $38, %%ecx ;"
743
+ "cmovc %%ecx, %%eax ;"
744
+ "addq %%rax, %%r8 ;" "movq %%r8, (%0) ;"
745
+ :
746
+ : "r" (c), "r" (a), "r" (b)
747
+ : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"
748
+ );
680
749
  #else
681
- __asm__ __volatile__(
682
- "movq (%2), %%rax \n\t"
683
- "movq 8(%2), %%rcx \n\t"
684
- "movq 16(%2), %%r8 \n\t"
685
- "movq 24(%2), %%r9 \n\t"
686
- "add (%1), %%rax \n\t"
687
- "adc 8(%1), %%rcx \n\t"
688
- "adc 16(%1), %%r8 \n\t"
689
- "adc 24(%1), %%r9 \n\t"
690
- "movq %%rcx, 8(%0) \n\t"
691
- "movq %%r8 , 16(%0) \n\t"
692
- "movq %%r9 , 24(%0) \n\t"
693
- "setc %%cl \n\t"
694
- "neg %%rcx \n\t"
695
- "andq $38, %%rcx \n\t"
696
- "addq %%rcx, %%rax \n\t"
697
- "movq %%rax, (%0) \n\t"
698
- :
699
- : "r" (c), "r" (a), "r" (b)
700
- : "memory","cc", "%rax", "%rcx", "%r8", "%r9"
701
- );
750
+ __asm__ __volatile__(
751
+ "mov $38, %%eax ;"
752
+ "movq (%2), %%r8 ;" "addq (%1), %%r8 ;"
753
+ "movq 8(%2), %%r9 ;" "adcq 8(%1), %%r9 ;"
754
+ "movq 16(%2), %%r10 ;" "adcq 16(%1), %%r10 ;"
755
+ "movq 24(%2), %%r11 ;" "adcq 24(%1), %%r11 ;"
756
+ "mov $0, %%ecx ;"
757
+ "cmovc %%eax, %%ecx ;"
758
+ "addq %%rcx, %%r8 ;"
759
+ "adcq $0, %%r9 ;" "movq %%r9, 8(%0) ;"
760
+ "adcq $0, %%r10 ;" "movq %%r10, 16(%0) ;"
761
+ "adcq $0, %%r11 ;" "movq %%r11, 24(%0) ;"
762
+ "mov $0, %%ecx ;"
763
+ "cmovc %%eax, %%ecx ;"
764
+ "addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
765
+ :
766
+ : "r" (c), "r" (a), "r" (b)
767
+ : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"
768
+ );
702
769
  #endif
703
770
  }
704
771
 
705
- inline void sub_EltFp25519_1w_x64(uint64_t *const __restrict c, uint64_t *const __restrict a,
706
- uint64_t *const __restrict b)
707
- {
708
- __asm__ __volatile__(
709
- "movq (%1), %%rax \n\t"
710
- "movq 8(%1), %%rcx \n\t"
711
- "movq 16(%1), %%r8 \n\t"
712
- "movq 24(%1), %%r9 \n\t"
713
- "subq (%2), %%rax \n\t"
714
- "sbbq 8(%2), %%rcx \n\t"
715
- "sbbq 16(%2), %%r8 \n\t"
716
- "sbbq 24(%2), %%r9 \n\t"
717
- "movq %%rcx, 8(%0) \n\t"
718
- "movq %%r8 , 16(%0) \n\t"
719
- "movq %%r9 , 24(%0) \n\t"
720
- "setc %%cl \n\t"
721
- "neg %%rcx \n\t"
722
- "andq $38, %%rcx \n\t"
723
- "subq %%rcx, %%rax \n\t"
724
- "movq %%rax, (%0) \n\t"
725
- :
726
- : "r" (c), "r" (a), "r" (b)
727
- : "memory","cc", "%rax", "%rcx", "%r8", "%r9"
728
- );
772
+ inline void sub_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b) {
773
+ __asm__ __volatile__(
774
+ "mov $38, %%eax ;"
775
+ "movq (%1), %%r8 ;" "subq (%2), %%r8 ;"
776
+ "movq 8(%1), %%r9 ;" "sbbq 8(%2), %%r9 ;"
777
+ "movq 16(%1), %%r10 ;" "sbbq 16(%2), %%r10 ;"
778
+ "movq 24(%1), %%r11 ;" "sbbq 24(%2), %%r11 ;"
779
+ "mov $0, %%ecx ;"
780
+ "cmovc %%eax, %%ecx ;"
781
+ "subq %%rcx, %%r8 ;"
782
+ "sbbq $0, %%r9 ;" "movq %%r9, 8(%0) ;"
783
+ "sbbq $0, %%r10 ;" "movq %%r10, 16(%0) ;"
784
+ "sbbq $0, %%r11 ;" "movq %%r11, 24(%0) ;"
785
+ "mov $0, %%ecx ;"
786
+ "cmovc %%eax, %%ecx ;"
787
+ "subq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
788
+ :
789
+ : "r" (c), "r" (a), "r" (b)
790
+ : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"
791
+ );
729
792
  }
730
793
 
731
- inline void mul_a24_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a)
732
- {
794
+ /**
795
+ * Multiplication by a24 = (A+2)/4 = (486662+2)/4 = 121666
796
+ **/
797
+ inline void mul_a24_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a) {
733
798
  #ifdef __BMI2__
734
- /**
735
- * a24 = (A+2)/4 = (486662+2)/4 = 121666
736
- **/
737
- const uint64_t a24 = 121666;
738
- __asm__ __volatile__(
739
- "movq %2, %%rdx \n\t"
740
- "mulx (%1), %%rax, %%r8 \n\t"
741
- "mulx 8(%1), %%rcx, %%r9 \n\t"
742
- "movq %%rax, (%0) \n\t"
743
- "movq %%rcx, 8(%0) \n\t"
744
- "mulx 16(%1), %%rax, %%r10 \n\t"
745
- "mulx 24(%1), %%rcx, %%r11 \n\t"
746
- "movq %%rax, 16(%0) \n\t"
747
- "movq %%rcx, 24(%0) \n\t"
748
- "movq $38, %%rdx \n\t"
749
- "mulx %%r11, %%rax, %%rcx \n\t"
750
- "addq %%rax, (%0) \n\t"
751
- "adcq %%r8, 8(%0) \n\t"
752
- "adcq %%r9, 16(%0) \n\t"
753
- "adcq %%r10, 24(%0) \n\t"
754
- :
755
- : "r" (c), "r" (a), "r" (a24)
756
- : "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
757
- );
799
+ const uint64_t a24 = 121666;
800
+ __asm__ __volatile__(
801
+ "movq %2, %%rdx ;"
802
+ "mulx (%1), %%r8, %%r10 ;"
803
+ "mulx 8(%1), %%r9, %%r11 ;" "addq %%r10, %%r9 ;"
804
+ "mulx 16(%1), %%r10, %%rax ;" "adcq %%r11, %%r10 ;"
805
+ "mulx 24(%1), %%r11, %%rcx ;" "adcq %%rax, %%r11 ;"
806
+ /***************************/ "adcq $0, %%rcx ;"
807
+ "movl $38, %%edx ;" /* 2*c = 38 = 2^256 mod 2^255-19*/
808
+ "mulx %%rcx, %%rax, %%rcx ;"
809
+ "addq %%rax, %%r8 ;"
810
+ "adcq %%rcx, %%r9 ;" "movq %%r9, 8(%0) ;"
811
+ "adcq $0, %%r10 ;" "movq %%r10, 16(%0) ;"
812
+ "adcq $0, %%r11 ;" "movq %%r11, 24(%0) ;"
813
+ "mov $0, %%ecx ;"
814
+ "cmovc %%edx, %%ecx ;"
815
+ "addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
816
+ :
817
+ : "r" (c), "r" (a), "r" (a24)
818
+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
819
+ );
758
820
  #else /* Without BMI2 */
759
- /**
760
- * TODO: Multiplications using MULQ instruction.
761
- **/
821
+ /**
822
+ * TODO: Multiplications using MULQ instruction.
823
+ **/
762
824
  #endif
763
825
  }
764
826
 
765
- void inv_EltFp25519_1w_x64(uint64_t *const pC, uint64_t *const pA)
766
- {
767
- #define sqrn_EltFp25519_1w_x64(a,times)\
768
- counter = times;\
769
- while(counter-- > 0)\
770
- {\
771
- sqr_EltFp25519_1w_x64(a);\
772
- }
773
-
774
- EltFp25519_1w_Buffer_x64 buffer_1w;
775
- EltFp25519_1w_x64 x0, x1, x2;
776
- uint64_t * T[5];
777
- uint64_t counter;
778
-
779
- T[0] = x0;
780
- T[1] = pC; /* x^(-1) */
781
- T[2] = x1;
782
- T[3] = x2;
783
- T[4] = pA; /* x */
784
-
785
- copy_EltFp25519_1w_x64(T[1],pA);
786
- sqrn_EltFp25519_1w_x64(T[1],1);
787
- copy_EltFp25519_1w_x64(T[2],T[1]);
788
- sqrn_EltFp25519_1w_x64(T[2],2);
789
- mul_EltFp25519_1w_x64(T[0], pA, T[2]);
790
- mul_EltFp25519_1w_x64(T[1], T[1], T[0]);
791
- copy_EltFp25519_1w_x64(T[2],T[1]);
792
- sqrn_EltFp25519_1w_x64(T[2],1);
793
- mul_EltFp25519_1w_x64(T[0], T[0], T[2]);
794
- copy_EltFp25519_1w_x64(T[2],T[0]);
795
- sqrn_EltFp25519_1w_x64(T[2],5);
796
- mul_EltFp25519_1w_x64(T[0], T[0], T[2]);
797
- copy_EltFp25519_1w_x64(T[2],T[0]);
798
- sqrn_EltFp25519_1w_x64(T[2],10);
799
- mul_EltFp25519_1w_x64(T[2], T[2], T[0]);
800
- copy_EltFp25519_1w_x64(T[3],T[2]);
801
- sqrn_EltFp25519_1w_x64(T[3],20);
802
- mul_EltFp25519_1w_x64(T[3], T[3], T[2]);
803
- sqrn_EltFp25519_1w_x64(T[3],10);
804
- mul_EltFp25519_1w_x64(T[3], T[3], T[0]);
805
- copy_EltFp25519_1w_x64(T[0],T[3]);
806
- sqrn_EltFp25519_1w_x64(T[0],50);
807
- mul_EltFp25519_1w_x64(T[0], T[0], T[3]);
808
- copy_EltFp25519_1w_x64(T[2],T[0]);
809
- sqrn_EltFp25519_1w_x64(T[2],100);
810
- mul_EltFp25519_1w_x64(T[2], T[2], T[0]);
811
- sqrn_EltFp25519_1w_x64(T[2],50);
812
- mul_EltFp25519_1w_x64(T[2], T[2], T[3]);
813
- sqrn_EltFp25519_1w_x64(T[2],5);
814
- mul_EltFp25519_1w_x64(T[1], T[1], T[2]);
827
+ void inv_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a) {
828
+ #define sqrn_EltFp25519_1w_x64(A, times)\
829
+ counter = times;\
830
+ while ( counter-- > 0) {\
831
+ sqr_EltFp25519_1w_x64(A);\
832
+ }
833
+
834
+ EltFp25519_1w_Buffer_x64 buffer_1w;
835
+ EltFp25519_1w_x64 x0, x1, x2;
836
+ uint64_t * T[5];
837
+ uint64_t counter;
838
+
839
+ T[0] = x0;
840
+ T[1] = c; /* x^(-1) */
841
+ T[2] = x1;
842
+ T[3] = x2;
843
+ T[4] = a; /* x */
844
+
845
+ copy_EltFp25519_1w_x64(T[1], a);
846
+ sqrn_EltFp25519_1w_x64(T[1], 1);
847
+ copy_EltFp25519_1w_x64(T[2], T[1]);
848
+ sqrn_EltFp25519_1w_x64(T[2], 2);
849
+ mul_EltFp25519_1w_x64(T[0], a, T[2]);
850
+ mul_EltFp25519_1w_x64(T[1], T[1], T[0]);
851
+ copy_EltFp25519_1w_x64(T[2], T[1]);
852
+ sqrn_EltFp25519_1w_x64(T[2], 1);
853
+ mul_EltFp25519_1w_x64(T[0], T[0], T[2]);
854
+ copy_EltFp25519_1w_x64(T[2], T[0]);
855
+ sqrn_EltFp25519_1w_x64(T[2], 5);
856
+ mul_EltFp25519_1w_x64(T[0], T[0], T[2]);
857
+ copy_EltFp25519_1w_x64(T[2], T[0]);
858
+ sqrn_EltFp25519_1w_x64(T[2], 10);
859
+ mul_EltFp25519_1w_x64(T[2], T[2], T[0]);
860
+ copy_EltFp25519_1w_x64(T[3], T[2]);
861
+ sqrn_EltFp25519_1w_x64(T[3], 20);
862
+ mul_EltFp25519_1w_x64(T[3], T[3], T[2]);
863
+ sqrn_EltFp25519_1w_x64(T[3], 10);
864
+ mul_EltFp25519_1w_x64(T[3], T[3], T[0]);
865
+ copy_EltFp25519_1w_x64(T[0], T[3]);
866
+ sqrn_EltFp25519_1w_x64(T[0], 50);
867
+ mul_EltFp25519_1w_x64(T[0], T[0], T[3]);
868
+ copy_EltFp25519_1w_x64(T[2], T[0]);
869
+ sqrn_EltFp25519_1w_x64(T[2], 100);
870
+ mul_EltFp25519_1w_x64(T[2], T[2], T[0]);
871
+ sqrn_EltFp25519_1w_x64(T[2], 50);
872
+ mul_EltFp25519_1w_x64(T[2], T[2], T[3]);
873
+ sqrn_EltFp25519_1w_x64(T[2], 5);
874
+ mul_EltFp25519_1w_x64(T[1], T[1], T[2]);
815
875
  #undef sqrn_EltFp25519_1w_x64
816
876
  }
817
877
 
818
- inline void fred_EltFp25519_1w_x64(uint64_t *const c)
819
- {
820
- int64_t last = (((int64_t*)c)[3])>>63;
821
- c[3] &= ((uint64_t)1<<63)-1;
822
- c[0] += 19 & last;
878
+ /**
879
+ * Given C, a 256-bit number, fred_EltFp25519_1w_x64 updates C
880
+ * with a number such that 0 <= C < 2**255-19.
881
+ * Contributed by: Samuel Neves.
882
+ **/
883
+ inline void fred_EltFp25519_1w_x64(uint64_t *const c) {
884
+ __asm__ __volatile__ (
885
+ /* First, obtain a number less than 2^255. */
886
+ "btrq $63, 24(%0) ;"
887
+ "sbbl %%ecx, %%ecx ;"
888
+ "andq $19, %%rcx ;"
889
+ "addq %%rcx, (%0) ;"
890
+ "adcq $0, 8(%0) ;"
891
+ "adcq $0, 16(%0) ;"
892
+ "adcq $0, 24(%0) ;"
893
+
894
+ "btrq $63, 24(%0) ;"
895
+ "sbbl %%ecx, %%ecx ;"
896
+ "andq $19, %%rcx ;"
897
+ "addq %%rcx, (%0) ;"
898
+ "adcq $0, 8(%0) ;"
899
+ "adcq $0, 16(%0) ;"
900
+ "adcq $0, 24(%0) ;"
901
+
902
+ /* Then, in case the number falls into [2^255-19, 2^255-1] */
903
+ "cmpq $-19, (%0) ;"
904
+ "setaeb %%al ;"
905
+ "cmpq $-1, 8(%0) ;"
906
+ "setzb %%bl ;"
907
+ "cmpq $-1, 16(%0) ;"
908
+ "setzb %%cl ;"
909
+ "movq 24(%0), %%rdx ;"
910
+ "addq $1, %%rdx ;"
911
+ "shrq $63, %%rdx ;"
912
+ "andb %%bl, %%al ;"
913
+ "andb %%dl, %%cl ;"
914
+ "test %%cl, %%al ;"
915
+ "movl $0, %%eax ;"
916
+ "movl $19, %%ecx ;"
917
+ "cmovnz %%rcx, %%rax ;"
918
+ "addq %%rax, (%0) ;"
919
+ "adcq $0, 8(%0) ;"
920
+ "adcq $0, 16(%0) ;"
921
+ "adcq $0, 24(%0) ;"
922
+ "btrq $63, 24(%0) ;"
923
+ :
924
+ : "r"(c)
925
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx"
926
+ );
823
927
  }
824
-