x25519-termux 1.1.0

@@ -0,0 +1,25 @@
+ # frozen_string_literal: true
+
+ require "bundler/gem_tasks"
+ require "rake/clean"
+
+ CLEAN.include("**/*.o", "**/*.so", "**/*.bundle", "pkg", "tmp")
+
+ require "rake/extensiontask"
+ Rake::ExtensionTask.new("x25519_ref10") do |ext|
+   ext.ext_dir = "ext/x25519_ref10"
+ end
+
+ # unless ARGV.include? '--disable-precomputed'
+ #   Rake::ExtensionTask.new("x25519_precomputed") do |ext|
+ #     ext.ext_dir = "ext/x25519_precomputed"
+ #   end
+ # end
+
+ require "rspec/core/rake_task"
+ RSpec::Core::RakeTask.new
+
+ require "rubocop/rake_task"
+ RuboCop::RakeTask.new
+
+ task default: %w[compile spec rubocop]
@@ -0,0 +1,20 @@
+ branches:
+   only:
+     - master
+
+ environment:
+   PATH: C:\Ruby%RUBY_VERSION%\DevKit\mingw\bin;C:\Ruby%RUBY_VERSION%\bin;C:\Ruby%RUBY_VERSION%\DevKit\bin;%PATH%
+   matrix:
+     - RUBY_VERSION: "22-x64"
+     - RUBY_VERSION: "23-x64"
+     - RUBY_VERSION: "24-x64"
+
+ build: off
+
+ test_script:
+   - SET RAKEOPT=-rdevkit
+   - ruby -v
+   - gem -v
+   - bundle -v
+   - bundle
+   - bundle exec rake compile spec
@@ -0,0 +1,20 @@
+ # frozen_string_literal: true
+
+ # rubocop:disable Style/GlobalVars
+
+ def add_cflags(flags)
+   print "checking if the C compiler accepts #{flags}... "
+   with_cflags("#{$CFLAGS} #{flags}") do
+     if test_compile
+       puts "OK"
+       true
+     else
+       puts "\rC compiler does not accept #{flags}"
+       false
+     end
+   end
+ end
+
+ def test_compile
+   try_compile "int main() {return 0;}", "", werror: true
+ end
@@ -0,0 +1,69 @@
+ /*
+   Test for 4th generation Intel Core processor family features (e.g. Haswell)
+   From https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
+ */
+
+ #include <stdint.h>
+ #if defined(_MSC_VER)
+ # include <intrin.h>
+ #endif
+
+ static void run_cpuid(uint32_t eax, uint32_t ecx, uint32_t* abcd)
+ {
+ #if defined(_MSC_VER)
+   __cpuidex(abcd, eax, ecx);
+ #else
+   uint32_t ebx = 0, edx;
+ # if defined( __i386__ ) && defined ( __PIC__ )
+   /* in case of PIC under 32-bit EBX cannot be clobbered */
+   __asm__ ( "movl %%ebx, %%edi \n\t cpuid \n\t xchgl %%ebx, %%edi" : "=D" (ebx),
+ # else
+   __asm__ ( "cpuid" : "+b" (ebx),
+ # endif
+             "+a" (eax), "+c" (ecx), "=d" (edx) );
+   abcd[0] = eax; abcd[1] = ebx; abcd[2] = ecx; abcd[3] = edx;
+ #endif
+ }
+
+ static int check_xcr0_ymm()
+ {
+   uint32_t xcr0;
+ #if defined(_MSC_VER)
+   xcr0 = (uint32_t)_xgetbv(0); /* min VS2010 SP1 compiler is required */
+ #else
+   __asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" );
+ #endif
+   return ((xcr0 & 6) == 6); /* checking if xmm and ymm state are enabled in XCR0 */
+ }
+
+ int check_4th_gen_intel_core_features()
+ {
+   uint32_t abcd[4];
+   uint32_t fma_movbe_osxsave_mask = ((1 << 12) | (1 << 22) | (1 << 27));
+   uint32_t avx2_bmi12_mask = (1 << 5) | (1 << 3) | (1 << 8);
+
+   /* CPUID.(EAX=01H, ECX=0H):ECX.FMA[bit 12]==1 &&
+      CPUID.(EAX=01H, ECX=0H):ECX.MOVBE[bit 22]==1 &&
+      CPUID.(EAX=01H, ECX=0H):ECX.OSXSAVE[bit 27]==1 */
+   run_cpuid( 1, 0, abcd );
+
+   if ( (abcd[2] & fma_movbe_osxsave_mask) != fma_movbe_osxsave_mask )
+     return 0;
+
+   if ( ! check_xcr0_ymm() )
+     return 0;
+
+   /* CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]==1 &&
+      CPUID.(EAX=07H, ECX=0H):EBX.BMI1[bit 3]==1 &&
+      CPUID.(EAX=07H, ECX=0H):EBX.BMI2[bit 8]==1 */
+   run_cpuid( 7, 0, abcd );
+   if ( (abcd[1] & avx2_bmi12_mask) != avx2_bmi12_mask )
+     return 0;
+
+   /* CPUID.(EAX=80000001H):ECX.LZCNT[bit 5]==1 */
+   run_cpuid( 0x80000001, 0, abcd );
+   if ( (abcd[2] & (1 << 5)) == 0)
+     return 0;
+
+   return 1;
+ }
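A detector like this is normally paired with a one-time, cached dispatch so the CPUID probe runs only once. A minimal sketch (not part of this gem; the wrapper follows the pattern in the Intel article cited at the top of the file):

  /* Sketch: probe once, reuse the cached answer on every later call. */
  static int can_use_intel_core_4th_gen_features(void)
  {
      static int the_4th_gen_features_available = -1;
      if (the_4th_gen_features_available < 0)
          the_4th_gen_features_available = check_4th_gen_intel_core_features();
      return the_4th_gen_features_available;
  }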
@@ -0,0 +1,19 @@
+ # frozen_string_literal: true
+
+ # rubocop:disable Style/GlobalVars
+
+ require "mkmf"
+
+ require_relative "../extconf_helpers"
+
+ if enable_config("precomputed") == false
+   add_cflags "-DDISABLE_PRECOMPUTED"
+ end
+
+ add_cflags "-Wall -O3 -pedantic -std=c99"
+ add_cflags "-mbmi -mbmi2"
+ add_cflags "-march=native -mtune=native"
+
+ create_makefile("x25519_precomputed")
+
+ # rubocop:enable Style/GlobalVars
@@ -0,0 +1,943 @@
+ /**
+  * Copyright (c) 2017, Armando Faz <armfazh@ic.unicamp.br>. All rights reserved.
+  * Institute of Computing.
+  * University of Campinas, Brazil.
+  *
+  * Copyright (C) 2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+  * Copyright (C) 2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
+  *
+  * Redistribution and use in source and binary forms, with or without
+  * modification, are permitted provided that the following conditions
+  * are met:
+  *
+  * * Redistributions of source code must retain the above copyright
+  *   notice, this list of conditions and the following disclaimer.
+  * * Redistributions in binary form must reproduce the above
+  *   copyright notice, this list of conditions and the following
+  *   disclaimer in the documentation and/or other materials provided
+  *   with the distribution.
+  * * Neither the name of University of Campinas nor the names of its
+  *   contributors may be used to endorse or promote products derived
+  *   from this software without specific prior written permission.
+  *
+  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+  * OF THE POSSIBILITY OF SUCH DAMAGE.
+  */
+
+ #include "fp25519_x64.h"
+
+ /**
+  *
+  * @param c Two 512-bit products: c0[0:7]=a0[0:3]*b0[0:3] and c1[8:15]=a1[4:7]*b1[4:7]
+  * @param a Two 256-bit integers: a0[0:3] and a1[4:7]
+  * @param b Two 256-bit integers: b0[0:3] and b1[4:7]
+  */
+ void mul2_256x256_integer_x64(uint64_t *const c, uint64_t *const a,
+                               uint64_t *const b) {
+ #ifdef __BMI2__
+ #ifdef __ADX__
+   __asm__ __volatile__(
+     "xorl %%r14d, %%r14d ;"
+     "movq (%1), %%rdx; " /* A[0] */
+     "mulx (%2), %%r8, %%r12; " /* A[0]*B[0] */ "xorl %%r10d, %%r10d ;" "movq %%r8, (%0) ;"
+     "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ "adox %%r10, %%r12 ;"
+     "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ "adox %%r8, %%rax ;"
+     "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ "adox %%r10, %%rbx ;"
+     /*******************************************/ "adox %%r14, %%rcx ;"
+
+     "movq 8(%1), %%rdx; " /* A[1] */
+     "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ "adox %%r12, %%r8 ;" "movq %%r8, 8(%0) ;"
+     "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rax ;"
+     "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%rbx ;"
+     "mulx 24(%2), %%r10, %%r12; " /* A[1]*B[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%rcx ;"
+     /*******************************************/ "adox %%r14, %%r12 ;" "adcx %%r14, %%r12 ;"
+
+     "movq 16(%1), %%rdx; " /* A[2] */ "xorl %%r10d, %%r10d ;"
+     "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ "adox %%rax, %%r8 ;" "movq %%r8, 16(%0) ;"
+     "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rbx ;"
+     "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%rcx ;"
+     "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%r12 ;"
+     /*******************************************/ "adox %%r14, %%rax ;" "adcx %%r14, %%rax ;"
+
+     "movq 24(%1), %%rdx; " /* A[3] */ "xorl %%r10d, %%r10d ;"
+     "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ "adox %%rbx, %%r8 ;" "movq %%r8, 24(%0) ;"
+     "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rcx ;" "movq %%rcx, 32(%0) ;"
+     "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%r12 ;" "movq %%r12, 40(%0) ;"
+     "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%rax ;" "movq %%rax, 48(%0) ;"
+     /*******************************************/ "adox %%r14, %%rbx ;" "adcx %%r14, %%rbx ;" "movq %%rbx, 56(%0) ;"
+
+     "movq 32(%1), %%rdx; " /* C[0] */
+     "mulx 32(%2), %%r8, %%r12; " /* C[0]*D[0] */ "xorl %%r10d, %%r10d ;" "movq %%r8, 64(%0);"
+     "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */ "adox %%r10, %%r12 ;"
+     "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */ "adox %%r8, %%rax ;"
+     "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */ "adox %%r10, %%rbx ;"
+     /*******************************************/ "adox %%r14, %%rcx ;"
+
+     "movq 40(%1), %%rdx; " /* C[1] */ "xorl %%r10d, %%r10d ;"
+     "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */ "adox %%r12, %%r8 ;" "movq %%r8, 72(%0);"
+     "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rax ;"
+     "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%rbx ;"
+     "mulx 56(%2), %%r10, %%r12; " /* C[1]*D[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%rcx ;"
+     /*******************************************/ "adox %%r14, %%r12 ;" "adcx %%r14, %%r12 ;"
+
+     "movq 48(%1), %%rdx; " /* C[2] */ "xorl %%r10d, %%r10d ;"
+     "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */ "adox %%rax, %%r8 ;" "movq %%r8, 80(%0);"
+     "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rbx ;"
+     "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%rcx ;"
+     "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%r12 ;"
+     /*******************************************/ "adox %%r14, %%rax ;" "adcx %%r14, %%rax ;"
+
+     "movq 56(%1), %%rdx; " /* C[3] */ "xorl %%r10d, %%r10d ;"
+     "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */ "adox %%rbx, %%r8 ;" "movq %%r8, 88(%0);"
+     "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */ "adox %%r10, %%r9 ;" "adcx %%r9, %%rcx ;" "movq %%rcx, 96(%0) ;"
+     "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */ "adox %%r8, %%r11 ;" "adcx %%r11, %%r12 ;" "movq %%r12, 104(%0) ;"
+     "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */ "adox %%r10, %%r13 ;" "adcx %%r13, %%rax ;" "movq %%rax, 112(%0) ;"
+     /*******************************************/ "adox %%r14, %%rbx ;" "adcx %%r14, %%rbx ;" "movq %%rbx, 120(%0) ;"
+     :
+     : "r" (c), "r" (a), "r" (b)
+     : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx",
+       "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
+   );
+ #else
+   __asm__ __volatile__(
+     "movq (%1), %%rdx; " /* A[0] */
+     "mulx (%2), %%r8, %%r12; " /* A[0]*B[0] */ "movq %%r8, (%0) ;"
+     "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ "addq %%r10, %%r12 ;"
+     "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ "adcq %%r8, %%rax ;"
+     "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ "adcq %%r10, %%rbx ;"
+     /*******************************************/ "adcq $0, %%rcx ;"
+
+     "movq 8(%1), %%rdx; " /* A[1] */
+     "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ "addq %%r12, %%r8 ;" "movq %%r8, 8(%0) ;"
+     "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ "adcq %%r10, %%r9 ;"
+     "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ "adcq %%r8, %%r11 ;"
+     "mulx 24(%2), %%r10, %%r12; " /* A[1]*B[3] */ "adcq %%r10, %%r13 ;"
+     /*******************************************/ "adcq $0, %%r12 ;"
+
+     "addq %%r9, %%rax ;"
+     "adcq %%r11, %%rbx ;"
+     "adcq %%r13, %%rcx ;"
+     "adcq $0, %%r12 ;"
+
+     "movq 16(%1), %%rdx; " /* A[2] */
+     "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ "addq %%rax, %%r8 ;" "movq %%r8, 16(%0) ;"
+     "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ "adcq %%r10, %%r9 ;"
+     "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ "adcq %%r8, %%r11 ;"
+     "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ "adcq %%r10, %%r13 ;"
+     /*******************************************/ "adcq $0, %%rax ;"
+
+     "addq %%r9, %%rbx ;"
+     "adcq %%r11, %%rcx ;"
+     "adcq %%r13, %%r12 ;"
+     "adcq $0, %%rax ;"
+
+     "movq 24(%1), %%rdx; " /* A[3] */
+     "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ "addq %%rbx, %%r8 ;" "movq %%r8, 24(%0) ;"
+     "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ "adcq %%r10, %%r9 ;"
+     "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ "adcq %%r8, %%r11 ;"
+     "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ "adcq %%r10, %%r13 ;"
+     /*******************************************/ "adcq $0, %%rbx ;"
+
+     "addq %%r9, %%rcx ;" "movq %%rcx, 32(%0) ;"
+     "adcq %%r11, %%r12 ;" "movq %%r12, 40(%0) ;"
+     "adcq %%r13, %%rax ;" "movq %%rax, 48(%0) ;"
+     "adcq $0, %%rbx ;" "movq %%rbx, 56(%0) ;"
+
+     "movq 32(%1), %%rdx; " /* C[0] */
+     "mulx 32(%2), %%r8, %%r12; " /* C[0]*D[0] */ "movq %%r8, 64(%0) ;"
+     "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */ "addq %%r10, %%r12 ;"
+     "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */ "adcq %%r8, %%rax ;"
+     "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */ "adcq %%r10, %%rbx ;"
+     /*******************************************/ "adcq $0, %%rcx ;"
+
+     "movq 40(%1), %%rdx; " /* C[1] */
+     "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */ "addq %%r12, %%r8 ;" "movq %%r8, 72(%0) ;"
+     "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */ "adcq %%r10, %%r9 ;"
+     "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */ "adcq %%r8, %%r11 ;"
+     "mulx 56(%2), %%r10, %%r12; " /* C[1]*D[3] */ "adcq %%r10, %%r13 ;"
+     /*******************************************/ "adcq $0, %%r12 ;"
+
+     "addq %%r9, %%rax ;"
+     "adcq %%r11, %%rbx ;"
+     "adcq %%r13, %%rcx ;"
+     "adcq $0, %%r12 ;"
+
+     "movq 48(%1), %%rdx; " /* C[2] */
+     "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */ "addq %%rax, %%r8 ;" "movq %%r8, 80(%0) ;"
+     "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */ "adcq %%r10, %%r9 ;"
+     "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */ "adcq %%r8, %%r11 ;"
+     "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */ "adcq %%r10, %%r13 ;"
+     /*******************************************/ "adcq $0, %%rax ;"
+
+     "addq %%r9, %%rbx ;"
+     "adcq %%r11, %%rcx ;"
+     "adcq %%r13, %%r12 ;"
+     "adcq $0, %%rax ;"
+
+     "movq 56(%1), %%rdx; " /* C[3] */
+     "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */ "addq %%rbx, %%r8 ;" "movq %%r8, 88(%0) ;"
+     "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */ "adcq %%r10, %%r9 ;"
+     "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */ "adcq %%r8, %%r11 ;"
+     "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */ "adcq %%r10, %%r13 ;"
+     /*******************************************/ "adcq $0, %%rbx ;"
+
+     "addq %%r9, %%rcx ;" "movq %%rcx, 96(%0) ;"
+     "adcq %%r11, %%r12 ;" "movq %%r12, 104(%0) ;"
+     "adcq %%r13, %%rax ;" "movq %%rax, 112(%0) ;"
+     "adcq $0, %%rbx ;" "movq %%rbx, 120(%0) ;"
+     :
+     : "r" (c), "r" (a), "r" (b)
+     : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx",
+       "%r8", "%r9", "%r10", "%r11", "%r12", "%r13"
+   );
+ #endif
+ #else /* Without BMI2 */
+   /**
+    * TODO: Multiplications using MULQ instruction.
+    **/
+ #endif
+ }
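The non-BMI2 branch is left as a TODO in the gem. For orientation, a portable sketch (not part of this file) of the same schoolbook 256x256-bit product that each half of the assembly computes; it assumes a compiler with unsigned __int128:

  /* Sketch: c[0..7] = a[0..3] * b[0..3], schoolbook with 64-bit limbs. */
  static void mul_256x256_portable(uint64_t *c, const uint64_t *a, const uint64_t *b)
  {
      int i, j;
      for (i = 0; i < 8; i++) c[i] = 0;
      for (i = 0; i < 4; i++) {
          uint64_t carry = 0;
          for (j = 0; j < 4; j++) {
              /* 64x64 -> 128-bit partial product plus accumulator and carry;
                 the sum cannot overflow 128 bits. */
              unsigned __int128 t = (unsigned __int128)a[i] * b[j] + c[i + j] + carry;
              c[i + j] = (uint64_t)t;
              carry = (uint64_t)(t >> 64);
          }
          c[i + 4] = carry;
      }
  }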
+
+ /**
+  *
+  * @param c Two 512-bit squares: c0[0:7]=a0[0:3]^2 and c1[8:15]=a1[4:7]^2
+  * @param a Two 256-bit integers: a0[0:3] and a1[4:7]
+  */
+ void sqr2_256x256_integer_x64(uint64_t *const c, uint64_t *const a) {
+ #ifdef __BMI2__
+ #ifdef __ADX__
+   __asm__ __volatile__(
+     "movq (%1), %%rdx ;" /* A[0] */
+     "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */ "xorl %%r15d, %%r15d;"
+     "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */ "adcx %%r14, %%r9 ;"
+     "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */ "adcx %%rax, %%r10 ;"
+     "movq 24(%1), %%rdx ;" /* A[3] */
+     "mulx 8(%1), %%r11, %%r12 ;" /* A[1]*A[3] */ "adcx %%rcx, %%r11 ;"
+     "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */ "adcx %%rax, %%r12 ;"
+     "movq 8(%1), %%rdx ;" /* A[1] */ "adcx %%r15, %%r13 ;"
+     "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */ "movq $0, %%r14 ;"
+     /*******************************************/ "adcx %%r15, %%r14 ;"
+
+     "xorl %%r15d, %%r15d;"
+     "adox %%rax, %%r10 ;" "adcx %%r8, %%r8 ;"
+     "adox %%rcx, %%r11 ;" "adcx %%r9, %%r9 ;"
+     "adox %%r15, %%r12 ;" "adcx %%r10, %%r10 ;"
+     "adox %%r15, %%r13 ;" "adcx %%r11, %%r11 ;"
+     "adox %%r15, %%r14 ;" "adcx %%r12, %%r12 ;"
+     "adcx %%r13, %%r13 ;"
+     "adcx %%r14, %%r14 ;"
+
+     "movq (%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
+     /********************/ "movq %%rax, 0(%0) ;"
+     "addq %%rcx, %%r8 ;" "movq %%r8, 8(%0) ;"
+     "movq 8(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
+     "adcq %%rax, %%r9 ;" "movq %%r9, 16(%0) ;"
+     "adcq %%rcx, %%r10 ;" "movq %%r10, 24(%0) ;"
+     "movq 16(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
+     "adcq %%rax, %%r11 ;" "movq %%r11, 32(%0) ;"
+     "adcq %%rcx, %%r12 ;" "movq %%r12, 40(%0) ;"
+     "movq 24(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
+     "adcq %%rax, %%r13 ;" "movq %%r13, 48(%0) ;"
+     "adcq %%rcx, %%r14 ;" "movq %%r14, 56(%0) ;"
+
+
+     "movq 32(%1), %%rdx ;" /* B[0] */
+     "mulx 40(%1), %%r8, %%r14 ;" /* B[1]*B[0] */ "xorl %%r15d, %%r15d;"
+     "mulx 48(%1), %%r9, %%r10 ;" /* B[2]*B[0] */ "adcx %%r14, %%r9 ;"
+     "mulx 56(%1), %%rax, %%rcx ;" /* B[3]*B[0] */ "adcx %%rax, %%r10 ;"
+     "movq 56(%1), %%rdx ;" /* B[3] */
+     "mulx 40(%1), %%r11, %%r12 ;" /* B[1]*B[3] */ "adcx %%rcx, %%r11 ;"
+     "mulx 48(%1), %%rax, %%r13 ;" /* B[2]*B[3] */ "adcx %%rax, %%r12 ;"
+     "movq 40(%1), %%rdx ;" /* B[1] */ "adcx %%r15, %%r13 ;"
+     "mulx 48(%1), %%rax, %%rcx ;" /* B[2]*B[1] */ "movq $0, %%r14 ;"
+     /*******************************************/ "adcx %%r15, %%r14 ;"
+
+     "xorl %%r15d, %%r15d;"
+     "adox %%rax, %%r10 ;" "adcx %%r8, %%r8 ;"
+     "adox %%rcx, %%r11 ;" "adcx %%r9, %%r9 ;"
+     "adox %%r15, %%r12 ;" "adcx %%r10, %%r10 ;"
+     "adox %%r15, %%r13 ;" "adcx %%r11, %%r11 ;"
+     "adox %%r15, %%r14 ;" "adcx %%r12, %%r12 ;"
+     "adcx %%r13, %%r13 ;"
+     "adcx %%r14, %%r14 ;"
+
+     "movq 32(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* B[0]^2 */
+     /********************/ "movq %%rax, 64(%0) ;"
+     "addq %%rcx, %%r8 ;" "movq %%r8, 72(%0) ;"
+     "movq 40(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* B[1]^2 */
+     "adcq %%rax, %%r9 ;" "movq %%r9, 80(%0) ;"
+     "adcq %%rcx, %%r10 ;" "movq %%r10, 88(%0) ;"
+     "movq 48(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* B[2]^2 */
+     "adcq %%rax, %%r11 ;" "movq %%r11, 96(%0) ;"
+     "adcq %%rcx, %%r12 ;" "movq %%r12, 104(%0) ;"
+     "movq 56(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* B[3]^2 */
+     "adcq %%rax, %%r13 ;" "movq %%r13, 112(%0) ;"
+     "adcq %%rcx, %%r14 ;" "movq %%r14, 120(%0) ;"
+     :
+     : "r" (c), "r" (a)
+     : "memory", "cc", "%rax", "%rcx", "%rdx",
+       "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"
+   );
+ #else /* Without ADX */
+   __asm__ __volatile__(
+     "movq 8(%1), %%rdx ;" /* A[1] */
+     "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */
+     "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
+     "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
+
+     "movq 16(%1), %%rdx ;" /* A[2] */
+     "mulx 24(%1), %%r12, %%r13 ;" /* A[3]*A[2] */
+     "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
+
+     "addq %%rax, %%r9 ;"
+     "adcq %%rdx, %%r10 ;"
+     "adcq %%rcx, %%r11 ;"
+     "adcq %%r14, %%r12 ;"
+     "adcq $0, %%r13 ;"
+     "movq $0, %%r14 ;"
+     "adcq $0, %%r14 ;"
+
+     "movq (%1), %%rdx ;" /* A[0] */
+     "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
+
+     "addq %%rax, %%r10 ;"
+     "adcq %%rcx, %%r11 ;"
+     "adcq $0, %%r12 ;"
+     "adcq $0, %%r13 ;"
+     "adcq $0, %%r14 ;"
+
+     "shldq $1, %%r13, %%r14 ;"
+     "shldq $1, %%r12, %%r13 ;"
+     "shldq $1, %%r11, %%r12 ;"
+     "shldq $1, %%r10, %%r11 ;"
+     "shldq $1, %%r9, %%r10 ;"
+     "shldq $1, %%r8, %%r9 ;"
+     "shlq $1, %%r8 ;"
+
+     /********************/ "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
+     /********************/ "movq %%rax, 0(%0) ;"
+     "addq %%rcx, %%r8 ;" "movq %%r8, 8(%0) ;"
+     "movq 8(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
+     "adcq %%rax, %%r9 ;" "movq %%r9, 16(%0) ;"
+     "adcq %%rcx, %%r10 ;" "movq %%r10, 24(%0) ;"
+     "movq 16(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
+     "adcq %%rax, %%r11 ;" "movq %%r11, 32(%0) ;"
+     "adcq %%rcx, %%r12 ;" "movq %%r12, 40(%0) ;"
+     "movq 24(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
+     "adcq %%rax, %%r13 ;" "movq %%r13, 48(%0) ;"
+     "adcq %%rcx, %%r14 ;" "movq %%r14, 56(%0) ;"
+
+     "movq 40(%1), %%rdx ;" /* B[1] */
+     "mulx 32(%1), %%r8, %%r9 ;" /* B[0]*B[1] */
+     "mulx 48(%1), %%r10, %%r11 ;" /* B[2]*B[1] */
+     "mulx 56(%1), %%rcx, %%r14 ;" /* B[3]*B[1] */
+
+     "movq 48(%1), %%rdx ;" /* B[2] */
+     "mulx 56(%1), %%r12, %%r13 ;" /* B[3]*B[2] */
+     "mulx 32(%1), %%rax, %%rdx ;" /* B[0]*B[2] */
+
+     "addq %%rax, %%r9 ;"
+     "adcq %%rdx, %%r10 ;"
+     "adcq %%rcx, %%r11 ;"
+     "adcq %%r14, %%r12 ;"
+     "adcq $0, %%r13 ;"
+     "movq $0, %%r14 ;"
+     "adcq $0, %%r14 ;"
+
+     "movq 32(%1), %%rdx ;" /* B[0] */
+     "mulx 56(%1), %%rax, %%rcx ;" /* B[0]*B[3] */
+
+     "addq %%rax, %%r10 ;"
+     "adcq %%rcx, %%r11 ;"
+     "adcq $0, %%r12 ;"
+     "adcq $0, %%r13 ;"
+     "adcq $0, %%r14 ;"
+
+     "shldq $1, %%r13, %%r14 ;"
+     "shldq $1, %%r12, %%r13 ;"
+     "shldq $1, %%r11, %%r12 ;"
+     "shldq $1, %%r10, %%r11 ;"
+     "shldq $1, %%r9, %%r10 ;"
+     "shldq $1, %%r8, %%r9 ;"
+     "shlq $1, %%r8 ;"
+
+     /********************/ "mulx %%rdx, %%rax, %%rcx ;" /* B[0]^2 */
+     /********************/ "movq %%rax, 64(%0) ;"
+     "addq %%rcx, %%r8 ;" "movq %%r8, 72(%0) ;"
+     "movq 40(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* B[1]^2 */
+     "adcq %%rax, %%r9 ;" "movq %%r9, 80(%0) ;"
+     "adcq %%rcx, %%r10 ;" "movq %%r10, 88(%0) ;"
+     "movq 48(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* B[2]^2 */
+     "adcq %%rax, %%r11 ;" "movq %%r11, 96(%0) ;"
+     "adcq %%rcx, %%r12 ;" "movq %%r12, 104(%0) ;"
+     "movq 56(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* B[3]^2 */
+     "adcq %%rax, %%r13 ;" "movq %%r13, 112(%0) ;"
+     "adcq %%rcx, %%r14 ;" "movq %%r14, 120(%0) ;"
+     :
+     : "r" (c), "r" (a)
+     : "memory", "cc", "%rax", "%rcx", "%rdx",
+       "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
+   );
+ #endif
+ #else /* Without BMI2 */
+   /**
+    * TODO: Multiplications using MULQ instruction.
+    **/
+ #endif
+ }
+
+ /**
+  *
+  * @param c Two 256-bit results, each congruent to the corresponding input modulo 2^255-19
+  * @param a Two 512-bit integers: a0[0:7] and a1[8:15]
+  */
+ void red_EltFp25519_2w_x64(uint64_t *const c, uint64_t *const a) {
+ #ifdef __BMI2__
+ #ifdef __ADX__
+   __asm__ __volatile__(
+     "movl $38, %%edx; " /* 2*c = 38 = 2^256 mod 2^255-19 */
+     "mulx 32(%1), %%r8, %%r10; " /* c*C[4] */ "xorl %%ebx, %%ebx ;" "adox (%1), %%r8 ;"
+     "mulx 40(%1), %%r9, %%r11; " /* c*C[5] */ "adcx %%r10, %%r9 ;" "adox 8(%1), %%r9 ;"
+     "mulx 48(%1), %%r10, %%rax; " /* c*C[6] */ "adcx %%r11, %%r10 ;" "adox 16(%1), %%r10 ;"
+     "mulx 56(%1), %%r11, %%rcx; " /* c*C[7] */ "adcx %%rax, %%r11 ;" "adox 24(%1), %%r11 ;"
+     /****************************************/ "adcx %%rbx, %%rcx ;" "adox %%rbx, %%rcx ;"
+     "clc ;"
+     "mulx %%rcx, %%rax, %%rcx ; " /* c*C[4] */
+     "adcx %%rax, %%r8 ;"
+     "adcx %%rcx, %%r9 ;" "movq %%r9, 8(%0) ;"
+     "adcx %%rbx, %%r10 ;" "movq %%r10, 16(%0) ;"
+     "adcx %%rbx, %%r11 ;" "movq %%r11, 24(%0) ;"
+     "mov $0, %%ecx ;"
+     "cmovc %%edx, %%ecx ;"
+     "addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
+
+     "mulx 96(%1), %%r8, %%r10; " /* c*C[4] */ "xorl %%ebx, %%ebx ;" "adox 64(%1), %%r8 ;"
+     "mulx 104(%1), %%r9, %%r11; " /* c*C[5] */ "adcx %%r10, %%r9 ;" "adox 72(%1), %%r9 ;"
+     "mulx 112(%1), %%r10, %%rax; " /* c*C[6] */ "adcx %%r11, %%r10 ;" "adox 80(%1), %%r10 ;"
+     "mulx 120(%1), %%r11, %%rcx; " /* c*C[7] */ "adcx %%rax, %%r11 ;" "adox 88(%1), %%r11 ;"
+     /*****************************************/ "adcx %%rbx, %%rcx ;" "adox %%rbx, %%rcx ;"
+     "clc ;"
+     "mulx %%rcx, %%rax, %%rcx ; " /* c*C[4] */
+     "adcx %%rax, %%r8 ;"
+     "adcx %%rcx, %%r9 ;" "movq %%r9, 40(%0) ;"
+     "adcx %%rbx, %%r10 ;" "movq %%r10, 48(%0) ;"
+     "adcx %%rbx, %%r11 ;" "movq %%r11, 56(%0) ;"
+     "mov $0, %%ecx ;"
+     "cmovc %%edx, %%ecx ;"
+     "addq %%rcx, %%r8 ;" "movq %%r8, 32(%0) ;"
+     :
+     : "r" (c), "r" (a)
+     : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
+   );
+ #else
+   __asm__ __volatile__(
+     "movl $38, %%edx ; " /* 2*c = 38 = 2^256 mod 2^255-19 */
+     "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
+     "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ "addq %%r10, %%r9 ;"
+     "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ "adcq %%r11, %%r10 ;"
+     "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ "adcq %%rax, %%r11 ;"
+     /****************************************/ "adcq $0, %%rcx ;"
+     "addq (%1), %%r8 ;"
+     "adcq 8(%1), %%r9 ;"
+     "adcq 16(%1), %%r10 ;"
+     "adcq 24(%1), %%r11 ;"
+     "adcq $0, %%rcx ;"
+     "mulx %%rcx, %%rax, %%rcx ;" /* c*C[4] */
+     "addq %%rax, %%r8 ;"
+     "adcq %%rcx, %%r9 ;" "movq %%r9, 8(%0) ;"
+     "adcq $0, %%r10 ;" "movq %%r10, 16(%0) ;"
+     "adcq $0, %%r11 ;" "movq %%r11, 24(%0) ;"
+     "mov $0, %%ecx ;"
+     "cmovc %%edx, %%ecx ;"
+     "addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
+
+     "mulx 96(%1), %%r8, %%r10 ;" /* c*C[4] */
+     "mulx 104(%1), %%r9, %%r11 ;" /* c*C[5] */ "addq %%r10, %%r9 ;"
+     "mulx 112(%1), %%r10, %%rax ;" /* c*C[6] */ "adcq %%r11, %%r10 ;"
+     "mulx 120(%1), %%r11, %%rcx ;" /* c*C[7] */ "adcq %%rax, %%r11 ;"
+     /*****************************************/ "adcq $0, %%rcx ;"
+     "addq 64(%1), %%r8 ;"
+     "adcq 72(%1), %%r9 ;"
+     "adcq 80(%1), %%r10 ;"
+     "adcq 88(%1), %%r11 ;"
+     "adcq $0, %%rcx ;"
+     "mulx %%rcx, %%rax, %%rcx ;" /* c*C[4] */
+     "addq %%rax, %%r8 ;"
+     "adcq %%rcx, %%r9 ;" "movq %%r9, 40(%0) ;"
+     "adcq $0, %%r10 ;" "movq %%r10, 48(%0) ;"
+     "adcq $0, %%r11 ;" "movq %%r11, 56(%0) ;"
+     "mov $0, %%ecx ;"
+     "cmovc %%edx, %%ecx ;"
+     "addq %%rcx, %%r8 ;" "movq %%r8, 32(%0) ;"
+     :
+     : "r" (c), "r" (a)
+     : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
+   );
+ #endif
+ #else /* Without BMI2 */
+   /* [TODO] */
+ #endif
+ }
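The constant 38 used throughout these reductions follows from a single congruence for p = 2^255 - 19: since 2^255 ≡ 19 (mod p), we have 2^256 ≡ 2*19 = 38 (mod p). Writing a 512-bit input as a = a_lo + 2^256 * a_hi, it therefore reduces to a_lo + 38 * a_hi (mod p), which fits back into 256 bits after one more carry fold, exactly what the mulx-by-38 plus add/adc chains above compute.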
+
+ void mul_256x256_integer_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b) {
+ #ifdef __BMI2__
+ #ifdef __ADX__
+   __asm__ __volatile__(
+     "movq (%1), %%rdx; " /* A[0] */
+     "mulx (%2), %%r8, %%r9; " /* A[0]*B[0] */ "xorl %%r10d, %%r10d ;" "movq %%r8, (%0) ;"
+     "mulx 8(%2), %%r10, %%r11; " /* A[0]*B[1] */ "adox %%r9, %%r10 ;" "movq %%r10, 8(%0) ;"
+     "mulx 16(%2), %%r12, %%r13; " /* A[0]*B[2] */ "adox %%r11, %%r12 ;"
+     "mulx 24(%2), %%r14, %%rdx; " /* A[0]*B[3] */ "adox %%r13, %%r14 ;" "movq $0, %%rax ;"
+     /*******************************************/ "adox %%rdx, %%rax ;"
+
+     "movq 8(%1), %%rdx; " /* A[1] */
+     "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ "xorl %%r10d, %%r10d ;" "adcx 8(%0), %%r8 ;" "movq %%r8, 8(%0) ;"
+     "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ "adox %%r9, %%r10 ;" "adcx %%r12, %%r10 ;" "movq %%r10, 16(%0) ;"
+     "mulx 16(%2), %%r12, %%r13; " /* A[1]*B[2] */ "adox %%r11, %%r12 ;" "adcx %%r14, %%r12 ;" "movq $0, %%r8 ;"
+     "mulx 24(%2), %%r14, %%rdx; " /* A[1]*B[3] */ "adox %%r13, %%r14 ;" "adcx %%rax, %%r14 ;" "movq $0, %%rax ;"
+     /*******************************************/ "adox %%rdx, %%rax ;" "adcx %%r8, %%rax ;"
+
+     "movq 16(%1), %%rdx; " /* A[2] */
+     "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ "xorl %%r10d, %%r10d ;" "adcx 16(%0), %%r8 ;" "movq %%r8, 16(%0) ;"
+     "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ "adox %%r9, %%r10 ;" "adcx %%r12, %%r10 ;" "movq %%r10, 24(%0) ;"
+     "mulx 16(%2), %%r12, %%r13; " /* A[2]*B[2] */ "adox %%r11, %%r12 ;" "adcx %%r14, %%r12 ;" "movq $0, %%r8 ;"
+     "mulx 24(%2), %%r14, %%rdx; " /* A[2]*B[3] */ "adox %%r13, %%r14 ;" "adcx %%rax, %%r14 ;" "movq $0, %%rax ;"
+     /*******************************************/ "adox %%rdx, %%rax ;" "adcx %%r8, %%rax ;"
+
+     "movq 24(%1), %%rdx; " /* A[3] */
+     "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ "xorl %%r10d, %%r10d ;" "adcx 24(%0), %%r8 ;" "movq %%r8, 24(%0) ;"
+     "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ "adox %%r9, %%r10 ;" "adcx %%r12, %%r10 ;" "movq %%r10, 32(%0) ;"
+     "mulx 16(%2), %%r12, %%r13; " /* A[3]*B[2] */ "adox %%r11, %%r12 ;" "adcx %%r14, %%r12 ;" "movq %%r12, 40(%0) ;" "movq $0, %%r8 ;"
+     "mulx 24(%2), %%r14, %%rdx; " /* A[3]*B[3] */ "adox %%r13, %%r14 ;" "adcx %%rax, %%r14 ;" "movq %%r14, 48(%0) ;" "movq $0, %%rax ;"
+     /*******************************************/ "adox %%rdx, %%rax ;" "adcx %%r8, %%rax ;" "movq %%rax, 56(%0) ;"
+     :
+     : "r" (c), "r" (a), "r" (b)
+     : "memory", "cc", "%rax", "%rdx", "%r8",
+       "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
+   );
+ #else
+   __asm__ __volatile__(
+     "movq (%1), %%rdx; " /* A[0] */
+     "mulx (%2), %%r8, %%r12; " /* A[0]*B[0] */ "movq %%r8, (%0) ;"
+     "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */ "addq %%r10, %%r12 ;"
+     "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */ "adcq %%r8, %%rax ;"
+     "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */ "adcq %%r10, %%rbx ;"
+     /*******************************************/ "adcq $0, %%rcx ;"
+
+     "movq 8(%1), %%rdx; " /* A[1] */
+     "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */ "addq %%r12, %%r8 ;" "movq %%r8, 8(%0) ;"
+     "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */ "adcq %%r10, %%r9 ;"
+     "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */ "adcq %%r8, %%r11 ;"
+     "mulx 24(%2), %%r10, %%r12; " /* A[1]*B[3] */ "adcq %%r10, %%r13 ;"
+     /*******************************************/ "adcq $0, %%r12 ;"
+
+     "addq %%r9, %%rax ;"
+     "adcq %%r11, %%rbx ;"
+     "adcq %%r13, %%rcx ;"
+     "adcq $0, %%r12 ;"
+
+     "movq 16(%1), %%rdx; " /* A[2] */
+     "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */ "addq %%rax, %%r8 ;" "movq %%r8, 16(%0) ;"
+     "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */ "adcq %%r10, %%r9 ;"
+     "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */ "adcq %%r8, %%r11 ;"
+     "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */ "adcq %%r10, %%r13 ;"
+     /*******************************************/ "adcq $0, %%rax ;"
+
+     "addq %%r9, %%rbx ;"
+     "adcq %%r11, %%rcx ;"
+     "adcq %%r13, %%r12 ;"
+     "adcq $0, %%rax ;"
+
+     "movq 24(%1), %%rdx; " /* A[3] */
+     "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */ "addq %%rbx, %%r8 ;" "movq %%r8, 24(%0) ;"
+     "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */ "adcq %%r10, %%r9 ;"
+     "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */ "adcq %%r8, %%r11 ;"
+     "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */ "adcq %%r10, %%r13 ;"
+     /*******************************************/ "adcq $0, %%rbx ;"
+
+     "addq %%r9, %%rcx ;" "movq %%rcx, 32(%0) ;"
+     "adcq %%r11, %%r12 ;" "movq %%r12, 40(%0) ;"
+     "adcq %%r13, %%rax ;" "movq %%rax, 48(%0) ;"
+     "adcq $0, %%rbx ;" "movq %%rbx, 56(%0) ;"
+     :
+     : "r" (c), "r" (a), "r" (b)
+     : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx",
+       "%r8", "%r9", "%r10", "%r11", "%r12", "%r13"
+   );
+ #endif
+ #else /* Without BMI2 */
+   /**
+    * TODO: Multiplications using MULQ instruction.
+    **/
+ #endif
+ }
+
+ void sqr_256x256_integer_x64(uint64_t *const c, uint64_t *const a) {
+ #ifdef __BMI2__
+ #ifdef __ADX__
+   __asm__ __volatile__(
+     "movq (%1), %%rdx ;" /* A[0] */
+     "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */ "xorl %%r15d, %%r15d;"
+     "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */ "adcx %%r14, %%r9 ;"
+     "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */ "adcx %%rax, %%r10 ;"
+     "movq 24(%1), %%rdx ;" /* A[3] */
+     "mulx 8(%1), %%r11, %%r12 ;" /* A[1]*A[3] */ "adcx %%rcx, %%r11 ;"
+     "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */ "adcx %%rax, %%r12 ;"
+     "movq 8(%1), %%rdx ;" /* A[1] */ "adcx %%r15, %%r13 ;"
+     "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */ "movq $0, %%r14 ;"
+     /*******************************************/ "adcx %%r15, %%r14 ;"
+
+     "xorl %%r15d, %%r15d;"
+     "adox %%rax, %%r10 ;" "adcx %%r8, %%r8 ;"
+     "adox %%rcx, %%r11 ;" "adcx %%r9, %%r9 ;"
+     "adox %%r15, %%r12 ;" "adcx %%r10, %%r10 ;"
+     "adox %%r15, %%r13 ;" "adcx %%r11, %%r11 ;"
+     "adox %%r15, %%r14 ;" "adcx %%r12, %%r12 ;"
+     "adcx %%r13, %%r13 ;"
+     "adcx %%r14, %%r14 ;"
+
+     "movq (%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
+     /********************/ "movq %%rax, 0(%0) ;"
+     "addq %%rcx, %%r8 ;" "movq %%r8, 8(%0) ;"
+     "movq 8(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
+     "adcq %%rax, %%r9 ;" "movq %%r9, 16(%0) ;"
+     "adcq %%rcx, %%r10 ;" "movq %%r10, 24(%0) ;"
+     "movq 16(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
+     "adcq %%rax, %%r11 ;" "movq %%r11, 32(%0) ;"
+     "adcq %%rcx, %%r12 ;" "movq %%r12, 40(%0) ;"
+     "movq 24(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
+     "adcq %%rax, %%r13 ;" "movq %%r13, 48(%0) ;"
+     "adcq %%rcx, %%r14 ;" "movq %%r14, 56(%0) ;"
+     :
+     : "r" (c), "r" (a)
+     : "memory", "cc", "%rax", "%rcx", "%rdx",
+       "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15"
+   );
+ #else /* Without ADX */
+   __asm__ __volatile__(
+     "movq 8(%1), %%rdx ;" /* A[1] */
+     "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */
+     "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
+     "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
+
+     "movq 16(%1), %%rdx ;" /* A[2] */
+     "mulx 24(%1), %%r12, %%r13 ;" /* A[3]*A[2] */
+     "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
+
+     "addq %%rax, %%r9 ;"
+     "adcq %%rdx, %%r10 ;"
+     "adcq %%rcx, %%r11 ;"
+     "adcq %%r14, %%r12 ;"
+     "adcq $0, %%r13 ;"
+     "movq $0, %%r14 ;"
+     "adcq $0, %%r14 ;"
+
+     "movq (%1), %%rdx ;" /* A[0] */
+     "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
+
+     "addq %%rax, %%r10 ;"
+     "adcq %%rcx, %%r11 ;"
+     "adcq $0, %%r12 ;"
+     "adcq $0, %%r13 ;"
+     "adcq $0, %%r14 ;"
+
+     "shldq $1, %%r13, %%r14 ;"
+     "shldq $1, %%r12, %%r13 ;"
+     "shldq $1, %%r11, %%r12 ;"
+     "shldq $1, %%r10, %%r11 ;"
+     "shldq $1, %%r9, %%r10 ;"
+     "shldq $1, %%r8, %%r9 ;"
+     "shlq $1, %%r8 ;"
+
+     /********************/ "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
+     /********************/ "movq %%rax, 0(%0) ;"
+     "addq %%rcx, %%r8 ;" "movq %%r8, 8(%0) ;"
+     "movq 8(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
+     "adcq %%rax, %%r9 ;" "movq %%r9, 16(%0) ;"
+     "adcq %%rcx, %%r10 ;" "movq %%r10, 24(%0) ;"
+     "movq 16(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
+     "adcq %%rax, %%r11 ;" "movq %%r11, 32(%0) ;"
+     "adcq %%rcx, %%r12 ;" "movq %%r12, 40(%0) ;"
+     "movq 24(%1), %%rdx ;" "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
+     "adcq %%rax, %%r13 ;" "movq %%r13, 48(%0) ;"
+     "adcq %%rcx, %%r14 ;" "movq %%r14, 56(%0) ;"
+     :
+     : "r" (c), "r" (a)
+     : "memory", "cc", "%rax", "%rcx", "%rdx",
+       "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14"
+   );
+ #endif
+ #else /* Without BMI2 */
+   /**
+    * TODO: Multiplications using MULQ instruction.
+    **/
+ #endif
+ }
+
+ void red_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a) {
+ #ifdef __BMI2__
+ #ifdef __ADX__
+   __asm__ __volatile__(
+     "movl $38, %%edx ;" /* 2*c = 38 = 2^256 mod 2^255-19 */
+     "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */ "xorl %%ebx, %%ebx ;" "adox (%1), %%r8 ;"
+     "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ "adcx %%r10, %%r9 ;" "adox 8(%1), %%r9 ;"
+     "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ "adcx %%r11, %%r10 ;" "adox 16(%1), %%r10 ;"
+     "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ "adcx %%rax, %%r11 ;" "adox 24(%1), %%r11 ;"
+     /****************************************/ "adcx %%rbx, %%rcx ;" "adox %%rbx, %%rcx ;"
+     "clc ;"
+     "mulx %%rcx, %%rax, %%rcx ;" /* c*C[4] */
+     "adcx %%rax, %%r8 ;"
+     "adcx %%rcx, %%r9 ;" "movq %%r9, 8(%0) ;"
+     "adcx %%rbx, %%r10 ;" "movq %%r10, 16(%0) ;"
+     "adcx %%rbx, %%r11 ;" "movq %%r11, 24(%0) ;"
+     "mov $0, %%ecx ;"
+     "cmovc %%edx, %%ecx ;"
+     "addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
+     :
+     : "r" (c), "r" (a)
+     : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
+   );
+ #else
+   __asm__ __volatile__(
+     "movl $38, %%edx ;" /* 2*c = 38 = 2^256 mod 2^255-19 */
+     "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
+     "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */ "addq %%r10, %%r9 ;"
+     "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */ "adcq %%r11, %%r10 ;"
+     "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */ "adcq %%rax, %%r11 ;"
+     /****************************************/ "adcq $0, %%rcx ;"
+     "addq (%1), %%r8 ;"
+     "adcq 8(%1), %%r9 ;"
+     "adcq 16(%1), %%r10 ;"
+     "adcq 24(%1), %%r11 ;"
+     "adcq $0, %%rcx ;"
+     "mulx %%rcx, %%rax, %%rcx ;" /* c*C[4] */
+     "addq %%rax, %%r8 ;"
+     "adcq %%rcx, %%r9 ;" "movq %%r9, 8(%0) ;"
+     "adcq $0, %%r10 ;" "movq %%r10, 16(%0) ;"
+     "adcq $0, %%r11 ;" "movq %%r11, 24(%0) ;"
+     "mov $0, %%ecx ;"
+     "cmovc %%edx, %%ecx ;"
+     "addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
+     :
+     : "r" (c), "r" (a)
+     : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
+   );
+ #endif
+ #else /* Without BMI2 */
+   /**
+    * TODO: Multiplications using MULQ instruction.
+    **/
+ #endif
+ }
+
+ inline void add_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b) {
+ #ifdef __ADX__
+   __asm__ __volatile__(
+     "mov $38, %%eax ;"
+     "xorl %%ecx, %%ecx ;"
+     "movq (%2), %%r8 ;" "adcx (%1), %%r8 ;"
+     "movq 8(%2), %%r9 ;" "adcx 8(%1), %%r9 ;"
+     "movq 16(%2), %%r10 ;" "adcx 16(%1), %%r10 ;"
+     "movq 24(%2), %%r11 ;" "adcx 24(%1), %%r11 ;"
+     "cmovc %%eax, %%ecx ;"
+     "xorl %%eax, %%eax ;"
+     "adcx %%rcx, %%r8 ;"
+     "adcx %%rax, %%r9 ;" "movq %%r9, 8(%0) ;"
+     "adcx %%rax, %%r10 ;" "movq %%r10, 16(%0) ;"
+     "adcx %%rax, %%r11 ;" "movq %%r11, 24(%0) ;"
+     "mov $38, %%ecx ;"
+     "cmovc %%ecx, %%eax ;"
+     "addq %%rax, %%r8 ;" "movq %%r8, (%0) ;"
+     :
+     : "r" (c), "r" (a), "r" (b)
+     : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"
+   );
+ #else
+   __asm__ __volatile__(
+     "mov $38, %%eax ;"
+     "movq (%2), %%r8 ;" "addq (%1), %%r8 ;"
+     "movq 8(%2), %%r9 ;" "adcq 8(%1), %%r9 ;"
+     "movq 16(%2), %%r10 ;" "adcq 16(%1), %%r10 ;"
+     "movq 24(%2), %%r11 ;" "adcq 24(%1), %%r11 ;"
+     "mov $0, %%ecx ;"
+     "cmovc %%eax, %%ecx ;"
+     "addq %%rcx, %%r8 ;"
+     "adcq $0, %%r9 ;" "movq %%r9, 8(%0) ;"
+     "adcq $0, %%r10 ;" "movq %%r10, 16(%0) ;"
+     "adcq $0, %%r11 ;" "movq %%r11, 24(%0) ;"
+     "mov $0, %%ecx ;"
+     "cmovc %%eax, %%ecx ;"
+     "addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
+     :
+     : "r" (c), "r" (a), "r" (b)
+     : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"
+   );
+ #endif
+ }
+
+ inline void sub_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a, uint64_t *const b) {
+   __asm__ __volatile__(
+     "mov $38, %%eax ;"
+     "movq (%1), %%r8 ;" "subq (%2), %%r8 ;"
+     "movq 8(%1), %%r9 ;" "sbbq 8(%2), %%r9 ;"
+     "movq 16(%1), %%r10 ;" "sbbq 16(%2), %%r10 ;"
+     "movq 24(%1), %%r11 ;" "sbbq 24(%2), %%r11 ;"
+     "mov $0, %%ecx ;"
+     "cmovc %%eax, %%ecx ;"
+     "subq %%rcx, %%r8 ;"
+     "sbbq $0, %%r9 ;" "movq %%r9, 8(%0) ;"
+     "sbbq $0, %%r10 ;" "movq %%r10, 16(%0) ;"
+     "sbbq $0, %%r11 ;" "movq %%r11, 24(%0) ;"
+     "mov $0, %%ecx ;"
+     "cmovc %%eax, %%ecx ;"
+     "subq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
+     :
+     : "r" (c), "r" (a), "r" (b)
+     : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11"
+   );
+ }
+
+ /**
+  * Multiplication by a24 = (A+2)/4 = (486662+2)/4 = 121666
+  **/
+ inline void mul_a24_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a) {
+ #ifdef __BMI2__
+   const uint64_t a24 = 121666;
+   __asm__ __volatile__(
+     "movq %2, %%rdx ;"
+     "mulx (%1), %%r8, %%r10 ;"
+     "mulx 8(%1), %%r9, %%r11 ;" "addq %%r10, %%r9 ;"
+     "mulx 16(%1), %%r10, %%rax ;" "adcq %%r11, %%r10 ;"
+     "mulx 24(%1), %%r11, %%rcx ;" "adcq %%rax, %%r11 ;"
+     /***************************/ "adcq $0, %%rcx ;"
+     "movl $38, %%edx ;" /* 2*c = 38 = 2^256 mod 2^255-19 */
+     "mulx %%rcx, %%rax, %%rcx ;"
+     "addq %%rax, %%r8 ;"
+     "adcq %%rcx, %%r9 ;" "movq %%r9, 8(%0) ;"
+     "adcq $0, %%r10 ;" "movq %%r10, 16(%0) ;"
+     "adcq $0, %%r11 ;" "movq %%r11, 24(%0) ;"
+     "mov $0, %%ecx ;"
+     "cmovc %%edx, %%ecx ;"
+     "addq %%rcx, %%r8 ;" "movq %%r8, (%0) ;"
+     :
+     : "r" (c), "r" (a), "r" (a24)
+     : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11"
+   );
+ #else /* Without BMI2 */
+   /**
+    * TODO: Multiplications using MULQ instruction.
+    **/
+ #endif
+ }
+
+ void inv_EltFp25519_1w_x64(uint64_t *const c, uint64_t *const a) {
+ #define sqrn_EltFp25519_1w_x64(A, times) \
+   counter = times;                       \
+   while (counter-- > 0) {                \
+     sqr_EltFp25519_1w_x64(A);            \
+   }
+
+   EltFp25519_1w_Buffer_x64 buffer_1w;
+   EltFp25519_1w_x64 x0, x1, x2;
+   uint64_t *T[5];
+   uint64_t counter;
+
+   T[0] = x0;
+   T[1] = c; /* x^(-1) */
+   T[2] = x1;
+   T[3] = x2;
+   T[4] = a; /* x */
+
+   copy_EltFp25519_1w_x64(T[1], a);
+   sqrn_EltFp25519_1w_x64(T[1], 1);
+   copy_EltFp25519_1w_x64(T[2], T[1]);
+   sqrn_EltFp25519_1w_x64(T[2], 2);
+   mul_EltFp25519_1w_x64(T[0], a, T[2]);
+   mul_EltFp25519_1w_x64(T[1], T[1], T[0]);
+   copy_EltFp25519_1w_x64(T[2], T[1]);
+   sqrn_EltFp25519_1w_x64(T[2], 1);
+   mul_EltFp25519_1w_x64(T[0], T[0], T[2]);
+   copy_EltFp25519_1w_x64(T[2], T[0]);
+   sqrn_EltFp25519_1w_x64(T[2], 5);
+   mul_EltFp25519_1w_x64(T[0], T[0], T[2]);
+   copy_EltFp25519_1w_x64(T[2], T[0]);
+   sqrn_EltFp25519_1w_x64(T[2], 10);
+   mul_EltFp25519_1w_x64(T[2], T[2], T[0]);
+   copy_EltFp25519_1w_x64(T[3], T[2]);
+   sqrn_EltFp25519_1w_x64(T[3], 20);
+   mul_EltFp25519_1w_x64(T[3], T[3], T[2]);
+   sqrn_EltFp25519_1w_x64(T[3], 10);
+   mul_EltFp25519_1w_x64(T[3], T[3], T[0]);
+   copy_EltFp25519_1w_x64(T[0], T[3]);
+   sqrn_EltFp25519_1w_x64(T[0], 50);
+   mul_EltFp25519_1w_x64(T[0], T[0], T[3]);
+   copy_EltFp25519_1w_x64(T[2], T[0]);
+   sqrn_EltFp25519_1w_x64(T[2], 100);
+   mul_EltFp25519_1w_x64(T[2], T[2], T[0]);
+   sqrn_EltFp25519_1w_x64(T[2], 50);
+   mul_EltFp25519_1w_x64(T[2], T[2], T[3]);
+   sqrn_EltFp25519_1w_x64(T[2], 5);
+   mul_EltFp25519_1w_x64(T[1], T[1], T[2]);
+ #undef sqrn_EltFp25519_1w_x64
+ }
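The inversion uses Fermat's little theorem: for the prime p = 2^255 - 19 and x not divisible by p, x^(p-2) ≡ x^(-1) (mod p). The addition chain above evaluates x^(2^255 - 21) with 254 squarings in total (1+2+1+5+10+20+10+50+100+50+5) and 11 multiplications, the standard chain for this exponent.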
+
+ /**
+  * Given C, a 256-bit number, fred_EltFp25519_1w_x64 updates C
+  * with a number such that 0 <= C < 2**255-19.
+  * Contributed by: Samuel Neves.
+  **/
+ inline void fred_EltFp25519_1w_x64(uint64_t *const c) {
+   __asm__ __volatile__(
+     /* First, obtain a number less than 2^255. */
+     "btrq $63, 24(%0) ;"
+     "sbbl %%ecx, %%ecx ;"
+     "andq $19, %%rcx ;"
+     "addq %%rcx, (%0) ;"
+     "adcq $0, 8(%0) ;"
+     "adcq $0, 16(%0) ;"
+     "adcq $0, 24(%0) ;"
+
+     "btrq $63, 24(%0) ;"
+     "sbbl %%ecx, %%ecx ;"
+     "andq $19, %%rcx ;"
+     "addq %%rcx, (%0) ;"
+     "adcq $0, 8(%0) ;"
+     "adcq $0, 16(%0) ;"
+     "adcq $0, 24(%0) ;"
+
+     /* Then, in case the number falls into [2^255-19, 2^255-1] */
+     "cmpq $-19, (%0) ;"
+     "setaeb %%al ;"
+     "cmpq $-1, 8(%0) ;"
+     "setzb %%bl ;"
+     "cmpq $-1, 16(%0) ;"
+     "setzb %%cl ;"
+     "movq 24(%0), %%rdx ;"
+     "addq $1, %%rdx ;"
+     "shrq $63, %%rdx ;"
+     "andb %%bl, %%al ;"
+     "andb %%dl, %%cl ;"
+     "test %%cl, %%al ;"
+     "movl $0, %%eax ;"
+     "movl $19, %%ecx ;"
+     "cmovnz %%rcx, %%rax ;"
+     "addq %%rax, (%0) ;"
+     "adcq $0, 8(%0) ;"
+     "adcq $0, 16(%0) ;"
+     "adcq $0, 24(%0) ;"
+     "btrq $63, 24(%0) ;"
+     :
+     : "r"(c)
+     : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx"
+   );
+ }
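The final block handles the corner case where, after bit 255 has been cleared, c still lies in [2^255 - 19, 2^255 - 1]: the limb-by-limb comparisons detect c >= p, and adding 19 followed by the last btrq is the same as subtracting p, since c - (2^255 - 19) = c + 19 - 2^255.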