x25519-termux 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,99 @@
1
+ /*
2
+ Ruby C extension providing bindings to the rfc7748_precomputed implementation of
3
+ the X25519 Diffie-Hellman algorithm
4
+ */
5
+
6
+ #include "ruby.h"
7
+ #include "x25519_precomputed.h"
8
+
9
+ static VALUE mX25519 = Qnil;
10
+ static VALUE mX25519_Provider = Qnil;
11
+ static VALUE mX25519_Provider_Precomputed = Qnil;
12
+
13
+ static VALUE mX25519_Provider_Precomputed_scalarmult(VALUE self, VALUE scalar, VALUE montgomery_u);
14
+ static VALUE mX25519_Provider_Precomputed_scalarmult_base(VALUE self, VALUE scalar);
15
+ static VALUE mX25519_is_available(VALUE self);
16
+ static VALUE mX25519_disabled(VALUE self);
17
+
18
+ /* Initialize the x25519_precomputed C extension */
19
+ void Init_x25519_precomputed()
20
+ {
21
+ mX25519 = rb_define_module("X25519");
22
+ mX25519_Provider = rb_define_module_under(mX25519, "Provider");
23
+ mX25519_Provider_Precomputed = rb_define_module_under(mX25519_Provider, "Precomputed");
24
+
25
+ #ifdef DISABLE_PRECOMPUTED
26
+ rb_define_singleton_method(mX25519_Provider_Precomputed, "available?", mX25519_disabled, 0);
27
+ #else
28
+ rb_define_singleton_method(mX25519_Provider_Precomputed, "scalarmult", mX25519_Provider_Precomputed_scalarmult, 2);
29
+ rb_define_singleton_method(mX25519_Provider_Precomputed, "scalarmult_base", mX25519_Provider_Precomputed_scalarmult_base, 1);
30
+ rb_define_singleton_method(mX25519_Provider_Precomputed, "available?", mX25519_is_available, 0);
31
+ #endif
32
+ }
33
+
34
+ /* Variable-base scalar multiplication */
35
+ static VALUE mX25519_Provider_Precomputed_scalarmult(VALUE self, VALUE scalar, VALUE montgomery_u)
36
+ {
37
+ /* X25519_KEY ensures inputs are aligned at 32-bytes */
38
+ X25519_KEY raw_scalar, raw_montgomery_u, product;
39
+
40
+ StringValue(scalar);
41
+ if(RSTRING_LEN(scalar) != X25519_KEYSIZE_BYTES) {
42
+ rb_raise(
43
+ rb_eArgError,
44
+ "expected %d-byte scalar, got %ld",
45
+ X25519_KEYSIZE_BYTES,
46
+ RSTRING_LEN(scalar)
47
+ );
48
+ }
49
+
50
+ StringValue(montgomery_u);
51
+ if(RSTRING_LEN(montgomery_u) != X25519_KEYSIZE_BYTES) {
52
+ rb_raise(
53
+ rb_eArgError,
54
+ "expected %d-byte Montgomery-u coordinate, got %ld",
55
+ X25519_KEYSIZE_BYTES,
56
+ RSTRING_LEN(montgomery_u)
57
+ );
58
+ }
59
+
60
+ memcpy(raw_scalar, RSTRING_PTR(scalar), X25519_KEYSIZE_BYTES);
61
+ memcpy(raw_montgomery_u, RSTRING_PTR(montgomery_u), X25519_KEYSIZE_BYTES);
62
+ x25519_precomputed_scalarmult(product, raw_scalar, raw_montgomery_u);
63
+
64
+ return rb_str_new((const char *)product, X25519_KEYSIZE_BYTES);
65
+ }
66
+
67
+ /* Fixed-base scalar multiplication */
68
+ static VALUE mX25519_Provider_Precomputed_scalarmult_base(VALUE self, VALUE scalar)
69
+ {
70
+ /* X25519_KEY ensures inputs are aligned at 32-bytes */
71
+ X25519_KEY raw_scalar, product;
72
+
73
+ StringValue(scalar);
74
+ if(RSTRING_LEN(scalar) != X25519_KEYSIZE_BYTES) {
75
+ rb_raise(
76
+ rb_eArgError,
77
+ "expected %d-byte scalar, got %ld",
78
+ X25519_KEYSIZE_BYTES,
79
+ RSTRING_LEN(scalar)
80
+ );
81
+ }
82
+
83
+ memcpy(raw_scalar, RSTRING_PTR(scalar), X25519_KEYSIZE_BYTES);
84
+ x25519_precomputed_scalarmult_base(product, raw_scalar);
85
+
86
+ return rb_str_new((const char *)product, X25519_KEYSIZE_BYTES);
87
+ }
88
+
89
+ /* Is the x25519_precomputed backend supported on this CPU? */
90
+ static VALUE mX25519_is_available(VALUE self)
91
+ {
92
+ return check_4th_gen_intel_core_features() ? Qtrue : Qfalse;
93
+ }
94
+
95
+ /* Set availability to return false if extension is skipped */
96
+ static VALUE mX25519_disabled(VALUE self)
97
+ {
98
+ return Qfalse;
99
+ }
@@ -0,0 +1,58 @@
1
+ /**
2
+ * Copyright (c) 2017, Armando Faz <armfazh@ic.unicamp.br>. All rights reserved.
3
+ * Institute of Computing.
4
+ * University of Campinas, Brazil.
5
+ *
6
+ * Redistribution and use in source and binary forms, with or without
7
+ * modification, are permitted provided that the following conditions
8
+ * are met:
9
+ *
10
+ * * Redistributions of source code must retain the above copyright
11
+ * notice, this list of conditions and the following disclaimer.
12
+ * * Redistributions in binary form must reproduce the above
13
+ * copyright notice, this list of conditions and the following
14
+ * disclaimer in the documentation and/or other materials provided
15
+ * with the distribution.
16
+ * * Neither the name of University of Campinas nor the names of its
17
+ * contributors may be used to endorse or promote products derived
18
+ * from this software without specific prior written permission.
19
+ *
20
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
23
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
24
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
25
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
29
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
31
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
32
+ */
33
+
34
#ifndef X25519_PRECOMPUTED_H
#define X25519_PRECOMPUTED_H

#include <stdint.h>

/* Alignment (bytes) applied to key buffers; the backend expects
   32-byte-aligned operands. */
#ifndef ALIGN_BYTES
#define ALIGN_BYTES 32
#endif

#ifndef ALIGN
#ifdef __INTEL_COMPILER
#define ALIGN __declspec(align(ALIGN_BYTES))
#else
#define ALIGN __attribute__((aligned(ALIGN_BYTES)))
#endif
#endif

/* X25519 keys, scalars, and outputs are all 32-byte strings. */
#define X25519_KEYSIZE_BYTES 32
typedef ALIGN uint8_t X25519_KEY[X25519_KEYSIZE_BYTES];

/* shared = private_key * session_key (variable-base scalar multiplication).
   All arguments are 32-byte buffers. */
void x25519_precomputed_scalarmult(uint8_t *shared, uint8_t *private_key, uint8_t *session_key);
/* session_key = private_key * base point (fixed-base scalar multiplication). */
void x25519_precomputed_scalarmult_base(uint8_t *session_key, uint8_t *private_key);
/* Non-zero if this CPU supports the 4th-gen-Intel-Core features the backend
   needs. Declared (void): an empty () would leave the prototype open. */
int check_4th_gen_intel_core_features(void);

#endif /* X25519_PRECOMPUTED_H */
@@ -0,0 +1,251 @@
1
+ /**
2
+ * Copyright (c) 2017, Armando Faz <armfazh@ic.unicamp.br>. All rights reserved.
3
+ * Institute of Computing.
4
+ * University of Campinas, Brazil.
5
+ *
6
+ * Copyright (C) 2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
7
+ *
8
+ * Redistribution and use in source and binary forms, with or without
9
+ * modification, are permitted provided that the following conditions
10
+ * are met:
11
+ *
12
+ * * Redistributions of source code must retain the above copyright
13
+ * notice, this list of conditions and the following disclaimer.
14
+ * * Redistributions in binary form must reproduce the above
15
+ * copyright notice, this list of conditions and the following
16
+ * disclaimer in the documentation and/or other materials provided
17
+ * with the distribution.
18
+ * * Neither the name of University of Campinas nor the names of its
19
+ * contributors may be used to endorse or promote products derived
20
+ * from this software without specific prior written permission.
21
+ *
22
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
28
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
29
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
31
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
32
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
33
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
34
+ */
35
+
36
+ #include <string.h>
37
+ #include "fp25519_x64.h"
38
+ #include "x25519_precomputed.h"
39
+ #include "table_ladder_x25519.h"
40
+
41
+ static inline void cswap_x64(uint64_t bit, uint64_t *const px,
42
+ uint64_t *const py) {
43
+ int i = 0;
44
+ uint64_t mask = (uint64_t)0 - bit;
45
+ for (i = 0; i < NUM_WORDS_ELTFP25519_X64; i++) {
46
+ uint64_t t = mask & (px[i] ^ py[i]);
47
+ px[i] = px[i] ^ t;
48
+ py[i] = py[i] ^ t;
49
+ }
50
+ }
51
+
52
+
53
/** Original rfc7748_precomputed name: 'x25519_shared_secret_x64' */
/*
 * Variable-base X25519 scalar multiplication (RFC 7748):
 * shared = private_key * session_key, all 32-byte little-endian buffers.
 * Inputs are first copied into ALIGNed locals, so caller buffers need no
 * particular alignment. Uses a Montgomery ladder over the u-coordinate with
 * a constant-time conditional swap per scanned bit.
 */
void x25519_precomputed_scalarmult(uint8_t *shared, uint8_t *private_key,
                                   uint8_t *session_key) {
  /* X25519_KEY ensures inputs are aligned at 32-bytes */
  ALIGN uint64_t buffer[4 * NUM_WORDS_ELTFP25519_X64];
  ALIGN uint64_t coordinates[4 * NUM_WORDS_ELTFP25519_X64];
  ALIGN uint64_t workspace[6 * NUM_WORDS_ELTFP25519_X64];
  ALIGN uint8_t session[X25519_KEYSIZE_BYTES];
  ALIGN uint8_t private[X25519_KEYSIZE_BYTES];

  int i = 0, j = 0;
  uint64_t prev = 0;
  /* View the aligned byte copies as 4x64-bit field elements. */
  uint64_t *const X1 = (uint64_t *)session;
  uint64_t *const key = (uint64_t *)private;
  /* Ladder state P=(Px:Pz), Q=(Qx:Qz); the remaining pointers are aliases
     into the same storage used at different stages of the ladder step. */
  uint64_t *const Px = coordinates + 0;
  uint64_t *const Pz = coordinates + 4;
  uint64_t *const Qx = coordinates + 8;
  uint64_t *const Qz = coordinates + 12;
  uint64_t *const X2 = Qx;
  uint64_t *const Z2 = Qz;
  uint64_t *const X3 = Px;
  uint64_t *const Z3 = Pz;
  uint64_t *const X2Z2 = Qx;
  uint64_t *const X3Z3 = Px;

  uint64_t *const A = workspace + 0;
  uint64_t *const B = workspace + 4;
  uint64_t *const D = workspace + 8;
  uint64_t *const C = workspace + 12;
  uint64_t *const DA = workspace + 16;
  uint64_t *const CB = workspace + 20;
  uint64_t *const AB = A;
  uint64_t *const DC = D;
  uint64_t *const DACB = DA;
  /* NOTE(review): buffer_1w/buffer_2w look unused here, but the
     EltFp25519_* operations are likely macros that reference them as
     scratch space — confirm before removing. */
  uint64_t *const buffer_1w = buffer;
  uint64_t *const buffer_2w = buffer;

  memcpy(private, private_key, sizeof(private));
  memcpy(session, session_key, sizeof(session));

  /* clampC function: clear the 3 low bits, clear bit 255, set bit 254
     (RFC 7748 scalar clamping). */
  private[0] = private[0] & (~(uint8_t)0x7);
  private[X25519_KEYSIZE_BYTES - 1] =
      (uint8_t)64 | (private[X25519_KEYSIZE_BYTES - 1] & (uint8_t)0x7F);

  /**
   * As in the draft:
   * When receiving such an array, implementations of curve25519
   * MUST mask the most-significant bit in the final byte. This
   * is done to preserve compatibility with point formats which
   * reserve the sign bit for use in other protocols and to
   * increase resistance to implementation fingerprinting
   **/
  session[X25519_KEYSIZE_BYTES - 1] &= (1 << (255 % 8)) - 1;

  /* Initialize the ladder: P = (X1 : 1), Q = (1 : 0) (point at infinity). */
  copy_EltFp25519_1w_x64(Px, X1);
  setzero_EltFp25519_1w_x64(Pz);
  setzero_EltFp25519_1w_x64(Qx);
  setzero_EltFp25519_1w_x64(Qz);

  Pz[0] = 1;
  Qx[0] = 1;

  /* main-loop: scan scalar bits from 254 down to 0 (bit 255 was cleared by
     clamping, so word 3 starts at j = 62). */
  prev = 0;
  j = 62;
  for (i = 3; i >= 0; i--) {
    while (j >= 0) {
      uint64_t bit = (key[i] >> j) & 0x1;
      /* Lazy swap: only swap when the bit differs from the previous one. */
      uint64_t swap = bit ^ prev;
      prev = bit;

      add_EltFp25519_1w_x64(A, X2, Z2); /* A = (X2+Z2) */
      sub_EltFp25519_1w_x64(B, X2, Z2); /* B = (X2-Z2) */
      add_EltFp25519_1w_x64(C, X3, Z3); /* C = (X3+Z3) */
      sub_EltFp25519_1w_x64(D, X3, Z3); /* D = (X3-Z3) */
      mul_EltFp25519_2w_x64(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */

      /* Constant-time swap keyed on the scanned bit transition. */
      cswap_x64(swap, A, C);
      cswap_x64(swap, B, D);

      sqr_EltFp25519_2w_x64(AB); /* [AA|BB] = [A^2|B^2] */
      add_EltFp25519_1w_x64(X3, DA, CB); /* X3 = (DA+CB) */
      sub_EltFp25519_1w_x64(Z3, DA, CB); /* Z3 = (DA-CB) */
      sqr_EltFp25519_2w_x64(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */

      copy_EltFp25519_1w_x64(X2, B); /* X2 = B^2 */
      sub_EltFp25519_1w_x64(Z2, A, B); /* Z2 = E = AA-BB */

      mul_a24_EltFp25519_1w_x64(B, Z2); /* B = a24*E */
      add_EltFp25519_1w_x64(B, B, X2); /* B = a24*E+B */
      mul_EltFp25519_2w_x64(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */
      mul_EltFp25519_1w_x64(Z3, Z3, X1); /* Z3 = Z3*X1 */
      j--;
    }
    j = 63;
  }

  /* Convert the projective result to affine: shared = Qx / Qz, then
     reduce to canonical form. */
  inv_EltFp25519_1w_x64(A, Qz);
  mul_EltFp25519_1w_x64((uint64_t *)shared, Qx, A);
  fred_EltFp25519_1w_x64((uint64_t *)shared);
}
+ }
156
+
157
+ /* Original rfc7748_precomputed name: 'x25519_keygen_precmp_x64' */
158
+ void x25519_precomputed_scalarmult_base(uint8_t *session_key, uint8_t *private_key) {
159
+ ALIGN uint64_t buffer[4 * NUM_WORDS_ELTFP25519_X64];
160
+ ALIGN uint64_t coordinates[4 * NUM_WORDS_ELTFP25519_X64];
161
+ ALIGN uint64_t workspace[4 * NUM_WORDS_ELTFP25519_X64];
162
+ ALIGN uint8_t private[X25519_KEYSIZE_BYTES];
163
+
164
+ int i = 0, j = 0, k = 0;
165
+ uint64_t *const key = (uint64_t *)private;
166
+ uint64_t *const Ur1 = coordinates + 0;
167
+ uint64_t *const Zr1 = coordinates + 4;
168
+ uint64_t *const Ur2 = coordinates + 8;
169
+ uint64_t *const Zr2 = coordinates + 12;
170
+
171
+ uint64_t *const UZr1 = coordinates + 0;
172
+ uint64_t *const ZUr2 = coordinates + 8;
173
+
174
+ uint64_t *const A = workspace + 0;
175
+ uint64_t *const B = workspace + 4;
176
+ uint64_t *const C = workspace + 8;
177
+ uint64_t *const D = workspace + 12;
178
+
179
+ uint64_t *const AB = workspace + 0;
180
+ uint64_t *const CD = workspace + 8;
181
+
182
+ uint64_t *const buffer_1w = buffer;
183
+ uint64_t *const buffer_2w = buffer;
184
+ uint64_t *P = (uint64_t *)Table_Ladder_8k;
185
+
186
+ memcpy(private, private_key, sizeof(private));
187
+
188
+ /* clampC function */
189
+ private
190
+ [0] = private[0] & (~(uint8_t)0x7);
191
+ private
192
+ [X25519_KEYSIZE_BYTES - 1] =
193
+ (uint8_t)64 | (private[X25519_KEYSIZE_BYTES - 1] & (uint8_t)0x7F);
194
+
195
+ setzero_EltFp25519_1w_x64(Ur1);
196
+ setzero_EltFp25519_1w_x64(Zr1);
197
+ setzero_EltFp25519_1w_x64(Zr2);
198
+ Ur1[0] = 1;
199
+ Zr1[0] = 1;
200
+ Zr2[0] = 1;
201
+
202
+ /* G-S */
203
+ Ur2[3] = 0x1eaecdeee27cab34;
204
+ Ur2[2] = 0xadc7a0b9235d48e2;
205
+ Ur2[1] = 0xbbf095ae14b2edf8;
206
+ Ur2[0] = 0x7e94e1fec82faabd;
207
+
208
+ /* main-loop */
209
+ const int ite[4] = {64, 64, 64, 63};
210
+ const int q = 3;
211
+ uint64_t swap = 1;
212
+
213
+ j = q;
214
+ for (i = 0; i < NUM_WORDS_ELTFP25519_X64; i++) {
215
+ while (j < ite[i]) {
216
+ k = (64 * i + j - q);
217
+ uint64_t bit = (key[i] >> j) & 0x1;
218
+ swap = swap ^ bit;
219
+ cswap_x64(swap, Ur1, Ur2);
220
+ cswap_x64(swap, Zr1, Zr2);
221
+ swap = bit;
222
+ /** Addition */
223
+ sub_EltFp25519_1w_x64(B, Ur1, Zr1); /* B = Ur1-Zr1 */
224
+ add_EltFp25519_1w_x64(A, Ur1, Zr1); /* A = Ur1+Zr1 */
225
+ mul_EltFp25519_1w_x64(C, &P[4 * k], B); /* C = M0-B */
226
+ sub_EltFp25519_1w_x64(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
227
+ add_EltFp25519_1w_x64(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
228
+ sqr_EltFp25519_2w_x64(AB); /* A = A^2 | B = B^2 */
229
+ mul_EltFp25519_2w_x64(UZr1, ZUr2, AB); /* Ur1 = Zr2*A | Zr1 = Ur2*B */
230
+ j++;
231
+ }
232
+ j = 0;
233
+ }
234
+
235
+ /** Doubling */
236
+ for (i = 0; i < q; i++) {
237
+ add_EltFp25519_1w_x64(A, Ur1, Zr1); /* A = Ur1+Zr1 */
238
+ sub_EltFp25519_1w_x64(B, Ur1, Zr1); /* B = Ur1-Zr1 */
239
+ sqr_EltFp25519_2w_x64(AB); /* A = A**2 B = B**2 */
240
+ copy_EltFp25519_1w_x64(C, B); /* C = B */
241
+ sub_EltFp25519_1w_x64(B, A, B); /* B = A-B */
242
+ mul_a24_EltFp25519_1w_x64(D, B); /* D = my_a24*B */
243
+ add_EltFp25519_1w_x64(D, D, C); /* D = D+C */
244
+ mul_EltFp25519_2w_x64(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */
245
+ }
246
+
247
+ /* Convert to affine coordinates */
248
+ inv_EltFp25519_1w_x64(A, Zr1);
249
+ mul_EltFp25519_1w_x64((uint64_t *)session_key, Ur1, A);
250
+ fred_EltFp25519_1w_x64((uint64_t *)session_key);
251
+ }
@@ -0,0 +1,2 @@
1
/* NaCl-style size constants: X25519 outputs and scalars are 32 bytes. */
#define CRYPTO_BYTES 32
#define CRYPTO_SCALARBYTES 32
@@ -0,0 +1,9 @@
1
+ #include "fe.h"
2
+ #include "x25519_ref10.h"
3
+
4
/* The X25519 base point: u = 9, in 32-byte little-endian encoding
   (remaining 31 bytes zero-initialized). */
static const uint8_t x25519_basepoint[32] = {9};

/* Fixed-base scalar multiplication: q = n * basepoint.
   Propagates x25519_ref10_scalarmult's return value. */
int x25519_ref10_scalarmult_base(uint8_t *q, const uint8_t *n)
{
  return x25519_ref10_scalarmult(q,n,x25519_basepoint);
}
@@ -0,0 +1,13 @@
1
# frozen_string_literal: true

# rubocop:disable Style/GlobalVars

# Build script for the x25519_ref10 C extension: generates the Makefile
# used by rubygems to compile the portable ref10 backend.
require "mkmf"

# Shared helpers for the gem's extensions (defines add_cflags).
require_relative '../extconf_helpers'

# Warnings on, optimized, strict C99 build.
add_cflags '-Wall -O3 -pedantic -std=c99'

create_makefile("x25519_ref10")

# rubocop:enable Style/GlobalVars
@@ -0,0 +1,912 @@
1
+ #include "fe.h"
2
+
3
+ /*
4
+ h = 0
5
+ */
6
+
7
+ void fe_0(fe h)
8
+ {
9
+ h[0] = 0;
10
+ h[1] = 0;
11
+ h[2] = 0;
12
+ h[3] = 0;
13
+ h[4] = 0;
14
+ h[5] = 0;
15
+ h[6] = 0;
16
+ h[7] = 0;
17
+ h[8] = 0;
18
+ h[9] = 0;
19
+ }
20
+
21
+ /*
22
+ h = 1
23
+ */
24
+
25
+ void fe_1(fe h)
26
+ {
27
+ h[0] = 1;
28
+ h[1] = 0;
29
+ h[2] = 0;
30
+ h[3] = 0;
31
+ h[4] = 0;
32
+ h[5] = 0;
33
+ h[6] = 0;
34
+ h[7] = 0;
35
+ h[8] = 0;
36
+ h[9] = 0;
37
+ }
38
+
39
+ /*
40
+ h = f + g
41
+ Can overlap h with f or g.
42
+
43
+ Preconditions:
44
+ |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
45
+ |g| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
46
+
47
+ Postconditions:
48
+ |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
49
+ */
50
+
51
+ void fe_add(fe h,fe f,fe g)
52
+ {
53
+ int32_t f0 = f[0];
54
+ int32_t f1 = f[1];
55
+ int32_t f2 = f[2];
56
+ int32_t f3 = f[3];
57
+ int32_t f4 = f[4];
58
+ int32_t f5 = f[5];
59
+ int32_t f6 = f[6];
60
+ int32_t f7 = f[7];
61
+ int32_t f8 = f[8];
62
+ int32_t f9 = f[9];
63
+ int32_t g0 = g[0];
64
+ int32_t g1 = g[1];
65
+ int32_t g2 = g[2];
66
+ int32_t g3 = g[3];
67
+ int32_t g4 = g[4];
68
+ int32_t g5 = g[5];
69
+ int32_t g6 = g[6];
70
+ int32_t g7 = g[7];
71
+ int32_t g8 = g[8];
72
+ int32_t g9 = g[9];
73
+ int32_t h0 = f0 + g0;
74
+ int32_t h1 = f1 + g1;
75
+ int32_t h2 = f2 + g2;
76
+ int32_t h3 = f3 + g3;
77
+ int32_t h4 = f4 + g4;
78
+ int32_t h5 = f5 + g5;
79
+ int32_t h6 = f6 + g6;
80
+ int32_t h7 = f7 + g7;
81
+ int32_t h8 = f8 + g8;
82
+ int32_t h9 = f9 + g9;
83
+ h[0] = h0;
84
+ h[1] = h1;
85
+ h[2] = h2;
86
+ h[3] = h3;
87
+ h[4] = h4;
88
+ h[5] = h5;
89
+ h[6] = h6;
90
+ h[7] = h7;
91
+ h[8] = h8;
92
+ h[9] = h9;
93
+ }
94
+
95
+ /*
96
+ h = f
97
+ */
98
+
99
+ void fe_copy(fe h,fe f)
100
+ {
101
+ int32_t f0 = f[0];
102
+ int32_t f1 = f[1];
103
+ int32_t f2 = f[2];
104
+ int32_t f3 = f[3];
105
+ int32_t f4 = f[4];
106
+ int32_t f5 = f[5];
107
+ int32_t f6 = f[6];
108
+ int32_t f7 = f[7];
109
+ int32_t f8 = f[8];
110
+ int32_t f9 = f[9];
111
+ h[0] = f0;
112
+ h[1] = f1;
113
+ h[2] = f2;
114
+ h[3] = f3;
115
+ h[4] = f4;
116
+ h[5] = f5;
117
+ h[6] = f6;
118
+ h[7] = f7;
119
+ h[8] = f8;
120
+ h[9] = f9;
121
+ }
122
+
123
+ /*
124
+ Replace (f,g) with (g,f) if b == 1;
125
+ replace (f,g) with (f,g) if b == 0.
126
+
127
+ Preconditions: b in {0,1}.
128
+ */
129
+
130
+ void fe_cswap(fe f,fe g,unsigned int b)
131
+ {
132
+ int32_t f0 = f[0];
133
+ int32_t f1 = f[1];
134
+ int32_t f2 = f[2];
135
+ int32_t f3 = f[3];
136
+ int32_t f4 = f[4];
137
+ int32_t f5 = f[5];
138
+ int32_t f6 = f[6];
139
+ int32_t f7 = f[7];
140
+ int32_t f8 = f[8];
141
+ int32_t f9 = f[9];
142
+ int32_t g0 = g[0];
143
+ int32_t g1 = g[1];
144
+ int32_t g2 = g[2];
145
+ int32_t g3 = g[3];
146
+ int32_t g4 = g[4];
147
+ int32_t g5 = g[5];
148
+ int32_t g6 = g[6];
149
+ int32_t g7 = g[7];
150
+ int32_t g8 = g[8];
151
+ int32_t g9 = g[9];
152
+ int32_t x0 = f0 ^ g0;
153
+ int32_t x1 = f1 ^ g1;
154
+ int32_t x2 = f2 ^ g2;
155
+ int32_t x3 = f3 ^ g3;
156
+ int32_t x4 = f4 ^ g4;
157
+ int32_t x5 = f5 ^ g5;
158
+ int32_t x6 = f6 ^ g6;
159
+ int32_t x7 = f7 ^ g7;
160
+ int32_t x8 = f8 ^ g8;
161
+ int32_t x9 = f9 ^ g9;
162
+ b = -b;
163
+ x0 &= b;
164
+ x1 &= b;
165
+ x2 &= b;
166
+ x3 &= b;
167
+ x4 &= b;
168
+ x5 &= b;
169
+ x6 &= b;
170
+ x7 &= b;
171
+ x8 &= b;
172
+ x9 &= b;
173
+ f[0] = f0 ^ x0;
174
+ f[1] = f1 ^ x1;
175
+ f[2] = f2 ^ x2;
176
+ f[3] = f3 ^ x3;
177
+ f[4] = f4 ^ x4;
178
+ f[5] = f5 ^ x5;
179
+ f[6] = f6 ^ x6;
180
+ f[7] = f7 ^ x7;
181
+ f[8] = f8 ^ x8;
182
+ f[9] = f9 ^ x9;
183
+ g[0] = g0 ^ x0;
184
+ g[1] = g1 ^ x1;
185
+ g[2] = g2 ^ x2;
186
+ g[3] = g3 ^ x3;
187
+ g[4] = g4 ^ x4;
188
+ g[5] = g5 ^ x5;
189
+ g[6] = g6 ^ x6;
190
+ g[7] = g7 ^ x7;
191
+ g[8] = g8 ^ x8;
192
+ g[9] = g9 ^ x9;
193
+ }
194
+
195
+ static uint64_t load_3(const unsigned char *in)
196
+ {
197
+ uint64_t result;
198
+ result = (uint64_t) in[0];
199
+ result |= ((uint64_t) in[1]) << 8;
200
+ result |= ((uint64_t) in[2]) << 16;
201
+ return result;
202
+ }
203
+
204
+ static uint64_t load_4(const unsigned char *in)
205
+ {
206
+ uint64_t result;
207
+ result = (uint64_t) in[0];
208
+ result |= ((uint64_t) in[1]) << 8;
209
+ result |= ((uint64_t) in[2]) << 16;
210
+ result |= ((uint64_t) in[3]) << 24;
211
+ return result;
212
+ }
213
+
214
+ void fe_frombytes(fe h,const unsigned char *s)
215
+ {
216
+ int64_t h0 = load_4(s);
217
+ int64_t h1 = load_3(s + 4) << 6;
218
+ int64_t h2 = load_3(s + 7) << 5;
219
+ int64_t h3 = load_3(s + 10) << 3;
220
+ int64_t h4 = load_3(s + 13) << 2;
221
+ int64_t h5 = load_4(s + 16);
222
+ int64_t h6 = load_3(s + 20) << 7;
223
+ int64_t h7 = load_3(s + 23) << 5;
224
+ int64_t h8 = load_3(s + 26) << 4;
225
+ int64_t h9 = (load_3(s + 29) & 8388607) << 2;
226
+ int64_t carry0;
227
+ int64_t carry1;
228
+ int64_t carry2;
229
+ int64_t carry3;
230
+ int64_t carry4;
231
+ int64_t carry5;
232
+ int64_t carry6;
233
+ int64_t carry7;
234
+ int64_t carry8;
235
+ int64_t carry9;
236
+
237
+ carry9 = (h9 + (int64_t) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 << 25;
238
+ carry1 = (h1 + (int64_t) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 << 25;
239
+ carry3 = (h3 + (int64_t) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 << 25;
240
+ carry5 = (h5 + (int64_t) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 << 25;
241
+ carry7 = (h7 + (int64_t) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 << 25;
242
+
243
+ carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
244
+ carry2 = (h2 + (int64_t) (1<<25)) >> 26; h3 += carry2; h2 -= carry2 << 26;
245
+ carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
246
+ carry6 = (h6 + (int64_t) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 << 26;
247
+ carry8 = (h8 + (int64_t) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 << 26;
248
+
249
+ h[0] = (int32_t)h0;
250
+ h[1] = (int32_t)h1;
251
+ h[2] = (int32_t)h2;
252
+ h[3] = (int32_t)h3;
253
+ h[4] = (int32_t)h4;
254
+ h[5] = (int32_t)h5;
255
+ h[6] = (int32_t)h6;
256
+ h[7] = (int32_t)h7;
257
+ h[8] = (int32_t)h8;
258
+ h[9] = (int32_t)h9;
259
+ }
260
+
261
+ void fe_invert(fe out,fe z)
262
+ {
263
+ fe t0;
264
+ fe t1;
265
+ fe t2;
266
+ fe t3;
267
+ int i;
268
+
269
+ #include "pow225521.h"
270
+
271
+ return;
272
+ }
273
+
274
+ /*
275
+ h = f * g
276
+ Can overlap h with f or g.
277
+
278
+ Preconditions:
279
+ |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
280
+ |g| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
281
+
282
+ Postconditions:
283
+ |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
284
+ */
285
+
286
+ /*
287
+ Notes on implementation strategy:
288
+
289
+ Using schoolbook multiplication.
290
+ Karatsuba would save a little in some cost models.
291
+
292
+ Most multiplications by 2 and 19 are 32-bit precomputations;
293
+ cheaper than 64-bit postcomputations.
294
+
295
+ There is one remaining multiplication by 19 in the carry chain;
296
+ one *19 precomputation can be merged into this,
297
+ but the resulting data flow is considerably less clean.
298
+
299
+ There are 12 carries below.
300
+ 10 of them are 2-way parallelizable and vectorizable.
301
+ Can get away with 11 carries, but then data flow is much deeper.
302
+
303
+ With tighter constraints on inputs can squeeze carries into int32.
304
+ */
305
+
306
+ void fe_mul(fe h,fe f,fe g)
307
+ {
308
+ int32_t f0 = f[0];
309
+ int32_t f1 = f[1];
310
+ int32_t f2 = f[2];
311
+ int32_t f3 = f[3];
312
+ int32_t f4 = f[4];
313
+ int32_t f5 = f[5];
314
+ int32_t f6 = f[6];
315
+ int32_t f7 = f[7];
316
+ int32_t f8 = f[8];
317
+ int32_t f9 = f[9];
318
+ int32_t g0 = g[0];
319
+ int32_t g1 = g[1];
320
+ int32_t g2 = g[2];
321
+ int32_t g3 = g[3];
322
+ int32_t g4 = g[4];
323
+ int32_t g5 = g[5];
324
+ int32_t g6 = g[6];
325
+ int32_t g7 = g[7];
326
+ int32_t g8 = g[8];
327
+ int32_t g9 = g[9];
328
+ int32_t g1_19 = 19 * g1; /* 1.4*2^29 */
329
+ int32_t g2_19 = 19 * g2; /* 1.4*2^30; still ok */
330
+ int32_t g3_19 = 19 * g3;
331
+ int32_t g4_19 = 19 * g4;
332
+ int32_t g5_19 = 19 * g5;
333
+ int32_t g6_19 = 19 * g6;
334
+ int32_t g7_19 = 19 * g7;
335
+ int32_t g8_19 = 19 * g8;
336
+ int32_t g9_19 = 19 * g9;
337
+ int32_t f1_2 = 2 * f1;
338
+ int32_t f3_2 = 2 * f3;
339
+ int32_t f5_2 = 2 * f5;
340
+ int32_t f7_2 = 2 * f7;
341
+ int32_t f9_2 = 2 * f9;
342
+ int64_t f0g0 = f0 * (int64_t) g0;
343
+ int64_t f0g1 = f0 * (int64_t) g1;
344
+ int64_t f0g2 = f0 * (int64_t) g2;
345
+ int64_t f0g3 = f0 * (int64_t) g3;
346
+ int64_t f0g4 = f0 * (int64_t) g4;
347
+ int64_t f0g5 = f0 * (int64_t) g5;
348
+ int64_t f0g6 = f0 * (int64_t) g6;
349
+ int64_t f0g7 = f0 * (int64_t) g7;
350
+ int64_t f0g8 = f0 * (int64_t) g8;
351
+ int64_t f0g9 = f0 * (int64_t) g9;
352
+ int64_t f1g0 = f1 * (int64_t) g0;
353
+ int64_t f1g1_2 = f1_2 * (int64_t) g1;
354
+ int64_t f1g2 = f1 * (int64_t) g2;
355
+ int64_t f1g3_2 = f1_2 * (int64_t) g3;
356
+ int64_t f1g4 = f1 * (int64_t) g4;
357
+ int64_t f1g5_2 = f1_2 * (int64_t) g5;
358
+ int64_t f1g6 = f1 * (int64_t) g6;
359
+ int64_t f1g7_2 = f1_2 * (int64_t) g7;
360
+ int64_t f1g8 = f1 * (int64_t) g8;
361
+ int64_t f1g9_38 = f1_2 * (int64_t) g9_19;
362
+ int64_t f2g0 = f2 * (int64_t) g0;
363
+ int64_t f2g1 = f2 * (int64_t) g1;
364
+ int64_t f2g2 = f2 * (int64_t) g2;
365
+ int64_t f2g3 = f2 * (int64_t) g3;
366
+ int64_t f2g4 = f2 * (int64_t) g4;
367
+ int64_t f2g5 = f2 * (int64_t) g5;
368
+ int64_t f2g6 = f2 * (int64_t) g6;
369
+ int64_t f2g7 = f2 * (int64_t) g7;
370
+ int64_t f2g8_19 = f2 * (int64_t) g8_19;
371
+ int64_t f2g9_19 = f2 * (int64_t) g9_19;
372
+ int64_t f3g0 = f3 * (int64_t) g0;
373
+ int64_t f3g1_2 = f3_2 * (int64_t) g1;
374
+ int64_t f3g2 = f3 * (int64_t) g2;
375
+ int64_t f3g3_2 = f3_2 * (int64_t) g3;
376
+ int64_t f3g4 = f3 * (int64_t) g4;
377
+ int64_t f3g5_2 = f3_2 * (int64_t) g5;
378
+ int64_t f3g6 = f3 * (int64_t) g6;
379
+ int64_t f3g7_38 = f3_2 * (int64_t) g7_19;
380
+ int64_t f3g8_19 = f3 * (int64_t) g8_19;
381
+ int64_t f3g9_38 = f3_2 * (int64_t) g9_19;
382
+ int64_t f4g0 = f4 * (int64_t) g0;
383
+ int64_t f4g1 = f4 * (int64_t) g1;
384
+ int64_t f4g2 = f4 * (int64_t) g2;
385
+ int64_t f4g3 = f4 * (int64_t) g3;
386
+ int64_t f4g4 = f4 * (int64_t) g4;
387
+ int64_t f4g5 = f4 * (int64_t) g5;
388
+ int64_t f4g6_19 = f4 * (int64_t) g6_19;
389
+ int64_t f4g7_19 = f4 * (int64_t) g7_19;
390
+ int64_t f4g8_19 = f4 * (int64_t) g8_19;
391
+ int64_t f4g9_19 = f4 * (int64_t) g9_19;
392
+ int64_t f5g0 = f5 * (int64_t) g0;
393
+ int64_t f5g1_2 = f5_2 * (int64_t) g1;
394
+ int64_t f5g2 = f5 * (int64_t) g2;
395
+ int64_t f5g3_2 = f5_2 * (int64_t) g3;
396
+ int64_t f5g4 = f5 * (int64_t) g4;
397
+ int64_t f5g5_38 = f5_2 * (int64_t) g5_19;
398
+ int64_t f5g6_19 = f5 * (int64_t) g6_19;
399
+ int64_t f5g7_38 = f5_2 * (int64_t) g7_19;
400
+ int64_t f5g8_19 = f5 * (int64_t) g8_19;
401
+ int64_t f5g9_38 = f5_2 * (int64_t) g9_19;
402
+ int64_t f6g0 = f6 * (int64_t) g0;
403
+ int64_t f6g1 = f6 * (int64_t) g1;
404
+ int64_t f6g2 = f6 * (int64_t) g2;
405
+ int64_t f6g3 = f6 * (int64_t) g3;
406
+ int64_t f6g4_19 = f6 * (int64_t) g4_19;
407
+ int64_t f6g5_19 = f6 * (int64_t) g5_19;
408
+ int64_t f6g6_19 = f6 * (int64_t) g6_19;
409
+ int64_t f6g7_19 = f6 * (int64_t) g7_19;
410
+ int64_t f6g8_19 = f6 * (int64_t) g8_19;
411
+ int64_t f6g9_19 = f6 * (int64_t) g9_19;
412
+ int64_t f7g0 = f7 * (int64_t) g0;
413
+ int64_t f7g1_2 = f7_2 * (int64_t) g1;
414
+ int64_t f7g2 = f7 * (int64_t) g2;
415
+ int64_t f7g3_38 = f7_2 * (int64_t) g3_19;
416
+ int64_t f7g4_19 = f7 * (int64_t) g4_19;
417
+ int64_t f7g5_38 = f7_2 * (int64_t) g5_19;
418
+ int64_t f7g6_19 = f7 * (int64_t) g6_19;
419
+ int64_t f7g7_38 = f7_2 * (int64_t) g7_19;
420
+ int64_t f7g8_19 = f7 * (int64_t) g8_19;
421
+ int64_t f7g9_38 = f7_2 * (int64_t) g9_19;
422
+ int64_t f8g0 = f8 * (int64_t) g0;
423
+ int64_t f8g1 = f8 * (int64_t) g1;
424
+ int64_t f8g2_19 = f8 * (int64_t) g2_19;
425
+ int64_t f8g3_19 = f8 * (int64_t) g3_19;
426
+ int64_t f8g4_19 = f8 * (int64_t) g4_19;
427
+ int64_t f8g5_19 = f8 * (int64_t) g5_19;
428
+ int64_t f8g6_19 = f8 * (int64_t) g6_19;
429
+ int64_t f8g7_19 = f8 * (int64_t) g7_19;
430
+ int64_t f8g8_19 = f8 * (int64_t) g8_19;
431
+ int64_t f8g9_19 = f8 * (int64_t) g9_19;
432
+ int64_t f9g0 = f9 * (int64_t) g0;
433
+ int64_t f9g1_38 = f9_2 * (int64_t) g1_19;
434
+ int64_t f9g2_19 = f9 * (int64_t) g2_19;
435
+ int64_t f9g3_38 = f9_2 * (int64_t) g3_19;
436
+ int64_t f9g4_19 = f9 * (int64_t) g4_19;
437
+ int64_t f9g5_38 = f9_2 * (int64_t) g5_19;
438
+ int64_t f9g6_19 = f9 * (int64_t) g6_19;
439
+ int64_t f9g7_38 = f9_2 * (int64_t) g7_19;
440
+ int64_t f9g8_19 = f9 * (int64_t) g8_19;
441
+ int64_t f9g9_38 = f9_2 * (int64_t) g9_19;
442
+ int64_t h0 = f0g0+f1g9_38+f2g8_19+f3g7_38+f4g6_19+f5g5_38+f6g4_19+f7g3_38+f8g2_19+f9g1_38;
443
+ int64_t h1 = f0g1+f1g0 +f2g9_19+f3g8_19+f4g7_19+f5g6_19+f6g5_19+f7g4_19+f8g3_19+f9g2_19;
444
+ int64_t h2 = f0g2+f1g1_2 +f2g0 +f3g9_38+f4g8_19+f5g7_38+f6g6_19+f7g5_38+f8g4_19+f9g3_38;
445
+ int64_t h3 = f0g3+f1g2 +f2g1 +f3g0 +f4g9_19+f5g8_19+f6g7_19+f7g6_19+f8g5_19+f9g4_19;
446
+ int64_t h4 = f0g4+f1g3_2 +f2g2 +f3g1_2 +f4g0 +f5g9_38+f6g8_19+f7g7_38+f8g6_19+f9g5_38;
447
+ int64_t h5 = f0g5+f1g4 +f2g3 +f3g2 +f4g1 +f5g0 +f6g9_19+f7g8_19+f8g7_19+f9g6_19;
448
+ int64_t h6 = f0g6+f1g5_2 +f2g4 +f3g3_2 +f4g2 +f5g1_2 +f6g0 +f7g9_38+f8g8_19+f9g7_38;
449
+ int64_t h7 = f0g7+f1g6 +f2g5 +f3g4 +f4g3 +f5g2 +f6g1 +f7g0 +f8g9_19+f9g8_19;
450
+ int64_t h8 = f0g8+f1g7_2 +f2g6 +f3g5_2 +f4g4 +f5g3_2 +f6g2 +f7g1_2 +f8g0 +f9g9_38;
451
+ int64_t h9 = f0g9+f1g8 +f2g7 +f3g6 +f4g5 +f5g4 +f6g3 +f7g2 +f8g1 +f9g0 ;
452
+ int64_t carry0;
453
+ int64_t carry1;
454
+ int64_t carry2;
455
+ int64_t carry3;
456
+ int64_t carry4;
457
+ int64_t carry5;
458
+ int64_t carry6;
459
+ int64_t carry7;
460
+ int64_t carry8;
461
+ int64_t carry9;
462
+
463
+ /*
464
+ |h0| <= (1.1*1.1*2^52*(1+19+19+19+19)+1.1*1.1*2^50*(38+38+38+38+38))
465
+ i.e. |h0| <= 1.2*2^59; narrower ranges for h2, h4, h6, h8
466
+ |h1| <= (1.1*1.1*2^51*(1+1+19+19+19+19+19+19+19+19))
467
+ i.e. |h1| <= 1.5*2^58; narrower ranges for h3, h5, h7, h9
468
+ */
469
+
470
+ carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
471
+ carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
472
+ /* |h0| <= 2^25 */
473
+ /* |h4| <= 2^25 */
474
+ /* |h1| <= 1.51*2^58 */
475
+ /* |h5| <= 1.51*2^58 */
476
+
477
+ carry1 = (h1 + (int64_t) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 << 25;
478
+ carry5 = (h5 + (int64_t) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 << 25;
479
+ /* |h1| <= 2^24; from now on fits into int32 */
480
+ /* |h5| <= 2^24; from now on fits into int32 */
481
+ /* |h2| <= 1.21*2^59 */
482
+ /* |h6| <= 1.21*2^59 */
483
+
484
+ carry2 = (h2 + (int64_t) (1<<25)) >> 26; h3 += carry2; h2 -= carry2 << 26;
485
+ carry6 = (h6 + (int64_t) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 << 26;
486
+ /* |h2| <= 2^25; from now on fits into int32 unchanged */
487
+ /* |h6| <= 2^25; from now on fits into int32 unchanged */
488
+ /* |h3| <= 1.51*2^58 */
489
+ /* |h7| <= 1.51*2^58 */
490
+
491
+ carry3 = (h3 + (int64_t) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 << 25;
492
+ carry7 = (h7 + (int64_t) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 << 25;
493
+ /* |h3| <= 2^24; from now on fits into int32 unchanged */
494
+ /* |h7| <= 2^24; from now on fits into int32 unchanged */
495
+ /* |h4| <= 1.52*2^33 */
496
+ /* |h8| <= 1.52*2^33 */
497
+
498
+ carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
499
+ carry8 = (h8 + (int64_t) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 << 26;
500
+ /* |h4| <= 2^25; from now on fits into int32 unchanged */
501
+ /* |h8| <= 2^25; from now on fits into int32 unchanged */
502
+ /* |h5| <= 1.01*2^24 */
503
+ /* |h9| <= 1.51*2^58 */
504
+
505
+ carry9 = (h9 + (int64_t) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 << 25;
506
+ /* |h9| <= 2^24; from now on fits into int32 unchanged */
507
+ /* |h0| <= 1.8*2^37 */
508
+
509
+ carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
510
+ /* |h0| <= 2^25; from now on fits into int32 unchanged */
511
+ /* |h1| <= 1.01*2^24 */
512
+
513
+ h[0] = (int32_t)h0;
514
+ h[1] = (int32_t)h1;
515
+ h[2] = (int32_t)h2;
516
+ h[3] = (int32_t)h3;
517
+ h[4] = (int32_t)h4;
518
+ h[5] = (int32_t)h5;
519
+ h[6] = (int32_t)h6;
520
+ h[7] = (int32_t)h7;
521
+ h[8] = (int32_t)h8;
522
+ h[9] = (int32_t)h9;
523
+ }
524
+
525
/*
h = f * 121666
Can overlap h with f.

Preconditions:
   |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.

Postconditions:
   |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.

NOTE(review): 121666 is presumably the curve constant (A+2)/4 = 121666 used
by the Montgomery ladder (RFC 7748) — confirm against the caller.
*/

void fe_mul121666(fe h,fe f)
{
  /* Load the ten limbs of f (alternating 26/25-bit signed limbs). */
  int32_t f0 = f[0];
  int32_t f1 = f[1];
  int32_t f2 = f[2];
  int32_t f3 = f[3];
  int32_t f4 = f[4];
  int32_t f5 = f[5];
  int32_t f6 = f[6];
  int32_t f7 = f[7];
  int32_t f8 = f[8];
  int32_t f9 = f[9];
  /* Scale each limb; 64-bit products cannot overflow for inputs within
     the precondition bounds (|f_i| * 121666 < 2^44). */
  int64_t h0 = f0 * (int64_t) 121666;
  int64_t h1 = f1 * (int64_t) 121666;
  int64_t h2 = f2 * (int64_t) 121666;
  int64_t h3 = f3 * (int64_t) 121666;
  int64_t h4 = f4 * (int64_t) 121666;
  int64_t h5 = f5 * (int64_t) 121666;
  int64_t h6 = f6 * (int64_t) 121666;
  int64_t h7 = f7 * (int64_t) 121666;
  int64_t h8 = f8 * (int64_t) 121666;
  int64_t h9 = f9 * (int64_t) 121666;
  int64_t carry0;
  int64_t carry1;
  int64_t carry2;
  int64_t carry3;
  int64_t carry4;
  int64_t carry5;
  int64_t carry6;
  int64_t carry7;
  int64_t carry8;
  int64_t carry9;

  /* One round of carry propagation restores the postcondition bounds.
     The carry out of limb 9 is folded back into limb 0 multiplied by 19,
     using 2^255 = 19 (mod p) where p = 2^255 - 19.
     NOTE(review): carries may be negative; `carry << 25` on a negative
     value is formally undefined in C99/C11 — this code (like the upstream
     reference implementation) relies on two's-complement behavior. */
  carry9 = (h9 + (int64_t) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 << 25;
  carry1 = (h1 + (int64_t) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 << 25;
  carry3 = (h3 + (int64_t) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 << 25;
  carry5 = (h5 + (int64_t) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 << 25;
  carry7 = (h7 + (int64_t) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 << 25;

  carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
  carry2 = (h2 + (int64_t) (1<<25)) >> 26; h3 += carry2; h2 -= carry2 << 26;
  carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
  carry6 = (h6 + (int64_t) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 << 26;
  carry8 = (h8 + (int64_t) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 << 26;

  /* All limbs now fit in int32 per the postcondition bounds. */
  h[0] = (int32_t)h0;
  h[1] = (int32_t)h1;
  h[2] = (int32_t)h2;
  h[3] = (int32_t)h3;
  h[4] = (int32_t)h4;
  h[5] = (int32_t)h5;
  h[6] = (int32_t)h6;
  h[7] = (int32_t)h7;
  h[8] = (int32_t)h8;
  h[9] = (int32_t)h9;
}
592
+
593
/*
h = f * f
Can overlap h with f.

Preconditions:
   |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.

Postconditions:
   |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
*/

/*
See fe_mul.c for discussion of implementation strategy.
*/

void fe_sq(fe h,fe f)
{
  /* Load the ten limbs of f (alternating 26/25-bit signed limbs). */
  int32_t f0 = f[0];
  int32_t f1 = f[1];
  int32_t f2 = f[2];
  int32_t f3 = f[3];
  int32_t f4 = f[4];
  int32_t f5 = f[5];
  int32_t f6 = f[6];
  int32_t f7 = f[7];
  int32_t f8 = f[8];
  int32_t f9 = f[9];
  /* Doubled limbs: cross terms f_i*f_j (i != j) appear twice in a square. */
  int32_t f0_2 = 2 * f0;
  int32_t f1_2 = 2 * f1;
  int32_t f2_2 = 2 * f2;
  int32_t f3_2 = 2 * f3;
  int32_t f4_2 = 2 * f4;
  int32_t f5_2 = 2 * f5;
  int32_t f6_2 = 2 * f6;
  int32_t f7_2 = 2 * f7;
  /* 19/38 multiples: products whose limb indices sum past 9 wrap around
     via 2^255 = 19 (mod p); the extra factor of 2 in 38 comes from the
     mixed 26/25-bit radix. */
  int32_t f5_38 = 38 * f5; /* 1.31*2^30 */
  int32_t f6_19 = 19 * f6; /* 1.31*2^30 */
  int32_t f7_38 = 38 * f7; /* 1.31*2^30 */
  int32_t f8_19 = 19 * f8; /* 1.31*2^30 */
  int32_t f9_38 = 38 * f9; /* 1.31*2^30 */
  /* All 55 distinct limb products, each as a full 64-bit product.
     Name suffix records the constant already folded in (_2, _4, _19, _38, _76). */
  int64_t f0f0    = f0   * (int64_t) f0;
  int64_t f0f1_2  = f0_2 * (int64_t) f1;
  int64_t f0f2_2  = f0_2 * (int64_t) f2;
  int64_t f0f3_2  = f0_2 * (int64_t) f3;
  int64_t f0f4_2  = f0_2 * (int64_t) f4;
  int64_t f0f5_2  = f0_2 * (int64_t) f5;
  int64_t f0f6_2  = f0_2 * (int64_t) f6;
  int64_t f0f7_2  = f0_2 * (int64_t) f7;
  int64_t f0f8_2  = f0_2 * (int64_t) f8;
  int64_t f0f9_2  = f0_2 * (int64_t) f9;
  int64_t f1f1_2  = f1_2 * (int64_t) f1;
  int64_t f1f2_2  = f1_2 * (int64_t) f2;
  int64_t f1f3_4  = f1_2 * (int64_t) f3_2;
  int64_t f1f4_2  = f1_2 * (int64_t) f4;
  int64_t f1f5_4  = f1_2 * (int64_t) f5_2;
  int64_t f1f6_2  = f1_2 * (int64_t) f6;
  int64_t f1f7_4  = f1_2 * (int64_t) f7_2;
  int64_t f1f8_2  = f1_2 * (int64_t) f8;
  int64_t f1f9_76 = f1_2 * (int64_t) f9_38;
  int64_t f2f2    = f2   * (int64_t) f2;
  int64_t f2f3_2  = f2_2 * (int64_t) f3;
  int64_t f2f4_2  = f2_2 * (int64_t) f4;
  int64_t f2f5_2  = f2_2 * (int64_t) f5;
  int64_t f2f6_2  = f2_2 * (int64_t) f6;
  int64_t f2f7_2  = f2_2 * (int64_t) f7;
  int64_t f2f8_38 = f2_2 * (int64_t) f8_19;
  int64_t f2f9_38 = f2   * (int64_t) f9_38;
  int64_t f3f3_2  = f3_2 * (int64_t) f3;
  int64_t f3f4_2  = f3_2 * (int64_t) f4;
  int64_t f3f5_4  = f3_2 * (int64_t) f5_2;
  int64_t f3f6_2  = f3_2 * (int64_t) f6;
  int64_t f3f7_76 = f3_2 * (int64_t) f7_38;
  int64_t f3f8_38 = f3_2 * (int64_t) f8_19;
  int64_t f3f9_76 = f3_2 * (int64_t) f9_38;
  int64_t f4f4    = f4   * (int64_t) f4;
  int64_t f4f5_2  = f4_2 * (int64_t) f5;
  int64_t f4f6_38 = f4_2 * (int64_t) f6_19;
  int64_t f4f7_38 = f4   * (int64_t) f7_38;
  int64_t f4f8_38 = f4_2 * (int64_t) f8_19;
  int64_t f4f9_38 = f4   * (int64_t) f9_38;
  int64_t f5f5_38 = f5   * (int64_t) f5_38;
  int64_t f5f6_38 = f5_2 * (int64_t) f6_19;
  int64_t f5f7_76 = f5_2 * (int64_t) f7_38;
  int64_t f5f8_38 = f5_2 * (int64_t) f8_19;
  int64_t f5f9_76 = f5_2 * (int64_t) f9_38;
  int64_t f6f6_19 = f6   * (int64_t) f6_19;
  int64_t f6f7_38 = f6   * (int64_t) f7_38;
  int64_t f6f8_38 = f6_2 * (int64_t) f8_19;
  int64_t f6f9_38 = f6   * (int64_t) f9_38;
  int64_t f7f7_38 = f7   * (int64_t) f7_38;
  int64_t f7f8_38 = f7_2 * (int64_t) f8_19;
  int64_t f7f9_76 = f7_2 * (int64_t) f9_38;
  int64_t f8f8_19 = f8   * (int64_t) f8_19;
  int64_t f8f9_38 = f8   * (int64_t) f9_38;
  int64_t f9f9_38 = f9   * (int64_t) f9_38;
  /* Column sums: h_k collects every product whose limb indices sum to k
     (mod 10), with wrapped products pre-scaled by 19/38 above. */
  int64_t h0 = f0f0  +f1f9_76+f2f8_38+f3f7_76+f4f6_38+f5f5_38;
  int64_t h1 = f0f1_2+f2f9_38+f3f8_38+f4f7_38+f5f6_38;
  int64_t h2 = f0f2_2+f1f1_2 +f3f9_76+f4f8_38+f5f7_76+f6f6_19;
  int64_t h3 = f0f3_2+f1f2_2 +f4f9_38+f5f8_38+f6f7_38;
  int64_t h4 = f0f4_2+f1f3_4 +f2f2   +f5f9_76+f6f8_38+f7f7_38;
  int64_t h5 = f0f5_2+f1f4_2 +f2f3_2 +f6f9_38+f7f8_38;
  int64_t h6 = f0f6_2+f1f5_4 +f2f4_2 +f3f3_2 +f7f9_76+f8f8_19;
  int64_t h7 = f0f7_2+f1f6_2 +f2f5_2 +f3f4_2 +f8f9_38;
  int64_t h8 = f0f8_2+f1f7_4 +f2f6_2 +f3f5_4 +f4f4   +f9f9_38;
  int64_t h9 = f0f9_2+f1f8_2 +f2f7_2 +f3f6_2 +f4f5_2;
  int64_t carry0;
  int64_t carry1;
  int64_t carry2;
  int64_t carry3;
  int64_t carry4;
  int64_t carry5;
  int64_t carry6;
  int64_t carry7;
  int64_t car8; /* NOTE: placeholder comment only — see declarations below */
  int64_t carry8;
  int64_t carry9;

  /* Interleaved carry chain (same order as the other carry routines in
     this file); the exact sequence is what keeps every intermediate within
     int64 range, so it must not be reordered. */
  carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
  carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;

  carry1 = (h1 + (int64_t) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 << 25;
  carry5 = (h5 + (int64_t) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 << 25;

  carry2 = (h2 + (int64_t) (1<<25)) >> 26; h3 += carry2; h2 -= carry2 << 26;
  carry6 = (h6 + (int64_t) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 << 26;

  carry3 = (h3 + (int64_t) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 << 25;
  carry7 = (h7 + (int64_t) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 << 25;

  carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
  carry8 = (h8 + (int64_t) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 << 26;

  /* Carry out of limb 9 wraps to limb 0 times 19 (2^255 = 19 mod p). */
  carry9 = (h9 + (int64_t) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 << 25;

  carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;

  /* All limbs now fit in int32 per the postcondition bounds. */
  h[0] = (int32_t)h0;
  h[1] = (int32_t)h1;
  h[2] = (int32_t)h2;
  h[3] = (int32_t)h3;
  h[4] = (int32_t)h4;
  h[5] = (int32_t)h5;
  h[6] = (int32_t)h6;
  h[7] = (int32_t)h7;
  h[8] = (int32_t)h8;
  h[9] = (int32_t)h9;
}
739
+
740
+ /*
741
+ h = f - g
742
+ Can overlap h with f or g.
743
+
744
+ Preconditions:
745
+ |f| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
746
+ |g| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.
747
+
748
+ Postconditions:
749
+ |h| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
750
+ */
751
+
752
+ void fe_sub(fe h,fe f,fe g)
753
+ {
754
+ int32_t f0 = f[0];
755
+ int32_t f1 = f[1];
756
+ int32_t f2 = f[2];
757
+ int32_t f3 = f[3];
758
+ int32_t f4 = f[4];
759
+ int32_t f5 = f[5];
760
+ int32_t f6 = f[6];
761
+ int32_t f7 = f[7];
762
+ int32_t f8 = f[8];
763
+ int32_t f9 = f[9];
764
+ int32_t g0 = g[0];
765
+ int32_t g1 = g[1];
766
+ int32_t g2 = g[2];
767
+ int32_t g3 = g[3];
768
+ int32_t g4 = g[4];
769
+ int32_t g5 = g[5];
770
+ int32_t g6 = g[6];
771
+ int32_t g7 = g[7];
772
+ int32_t g8 = g[8];
773
+ int32_t g9 = g[9];
774
+ int32_t h0 = f0 - g0;
775
+ int32_t h1 = f1 - g1;
776
+ int32_t h2 = f2 - g2;
777
+ int32_t h3 = f3 - g3;
778
+ int32_t h4 = f4 - g4;
779
+ int32_t h5 = f5 - g5;
780
+ int32_t h6 = f6 - g6;
781
+ int32_t h7 = f7 - g7;
782
+ int32_t h8 = f8 - g8;
783
+ int32_t h9 = f9 - g9;
784
+ h[0] = h0;
785
+ h[1] = h1;
786
+ h[2] = h2;
787
+ h[3] = h3;
788
+ h[4] = h4;
789
+ h[5] = h5;
790
+ h[6] = h6;
791
+ h[7] = h7;
792
+ h[8] = h8;
793
+ h[9] = h9;
794
+ }
795
+
796
/*
Serialize the field element h into 32 little-endian bytes s, fully reduced
modulo p = 2^255-19 (output is the unique representative in [0, p-1]).

Preconditions:
  |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc.

Write p=2^255-19; q=floor(h/p).
Basic claim: q = floor(2^(-255)(h + 19 2^(-25)h9 + 2^(-1))).

Proof:
  Have |h|<=p so |q|<=1 so |19^2 2^(-255) q|<1/4.
  Also have |h-2^230 h9|<2^230 so |19 2^(-255)(h-2^230 h9)|<1/4.

  Write y=2^(-1)-19^2 2^(-255)q-19 2^(-255)(h-2^230 h9).
  Then 0<y<1.

  Write r=h-pq.
  Have 0<=r<=p-1=2^255-20.
  Thus 0<=r+19(2^-255)r<r+19(2^-255)2^255<=2^255-1.

  Write x=r+19(2^-255)r+y.
  Then 0<x<2^255 so floor(2^(-255)x) = 0 so floor(q+2^(-255)x) = q.

  Have q+2^(-255)x = 2^(-255)(h + 19 2^(-25) h9 + 2^(-1))
  so floor(2^(-255)(h + 19 2^(-25) h9 + 2^(-1))) = q.
*/

void fe_tobytes(unsigned char *s,fe h)
{
  int32_t h0 = h[0];
  int32_t h1 = h[1];
  int32_t h2 = h[2];
  int32_t h3 = h[3];
  int32_t h4 = h[4];
  int32_t h5 = h[5];
  int32_t h6 = h[6];
  int32_t h7 = h[7];
  int32_t h8 = h[8];
  int32_t h9 = h[9];
  int32_t q;
  int32_t carry0;
  int32_t carry1;
  int32_t carry2;
  int32_t carry3;
  int32_t carry4;
  int32_t carry5;
  int32_t carry6;
  int32_t carry7;
  int32_t carry8;
  int32_t carry9;

  /* Compute q = floor(h/p) (0 or 1) by the claim above: seed with the
     wrapped contribution of h9, then ripple the provisional carry through
     all ten limbs; the final carry out of limb 9 is q. */
  q = (19 * h9 + (((int32_t) 1) << 24)) >> 25;
  q = (h0 + q) >> 26;
  q = (h1 + q) >> 25;
  q = (h2 + q) >> 26;
  q = (h3 + q) >> 25;
  q = (h4 + q) >> 26;
  q = (h5 + q) >> 25;
  q = (h6 + q) >> 26;
  q = (h7 + q) >> 25;
  q = (h8 + q) >> 26;
  q = (h9 + q) >> 25;

  /* Goal: Output h-(2^255-19)q, which is between 0 and 2^255-20. */
  h0 += 19 * q;
  /* Goal: Output h-2^255 q, which is between 0 and 2^255-20. */

  /* Full carry pass puts each limb in canonical unsigned range
     (26 or 25 bits, alternating); the carry out of limb 9 is discarded
     because it exactly cancels the 2^255 q term (see below). */
  carry0 = h0 >> 26; h1 += carry0; h0 -= carry0 << 26;
  carry1 = h1 >> 25; h2 += carry1; h1 -= carry1 << 25;
  carry2 = h2 >> 26; h3 += carry2; h2 -= carry2 << 26;
  carry3 = h3 >> 25; h4 += carry3; h3 -= carry3 << 25;
  carry4 = h4 >> 26; h5 += carry4; h4 -= carry4 << 26;
  carry5 = h5 >> 25; h6 += carry5; h5 -= carry5 << 25;
  carry6 = h6 >> 26; h7 += carry6; h6 -= carry6 << 26;
  carry7 = h7 >> 25; h8 += carry7; h7 -= carry7 << 25;
  carry8 = h8 >> 26; h9 += carry8; h8 -= carry8 << 26;
  carry9 = h9 >> 25;               h9 -= carry9 << 25;
                  /* h10 = carry9 */

  /*
  Goal: Output h0+...+2^255 h10-2^255 q, which is between 0 and 2^255-20.
  Have h0+...+2^230 h9 between 0 and 2^255-1;
  evidently 2^255 h10-2^255 q = 0.
  Goal: Output h0+...+2^230 h9.
  */

  /* Pack the ten limbs (26,25,26,25,... bits) into 32 little-endian
     bytes: limb k starts at bit 26*ceil(k/2)+25*floor(k/2) of the
     255-bit output, so bytes at limb boundaries merge two limbs. */
  s[0] = h0 >> 0;
  s[1] = h0 >> 8;
  s[2] = h0 >> 16;
  s[3] = (h0 >> 24) | (h1 << 2);
  s[4] = h1 >> 6;
  s[5] = h1 >> 14;
  s[6] = (h1 >> 22) | (h2 << 3);
  s[7] = h2 >> 5;
  s[8] = h2 >> 13;
  s[9] = (h2 >> 21) | (h3 << 5);
  s[10] = h3 >> 3;
  s[11] = h3 >> 11;
  s[12] = (h3 >> 19) | (h4 << 6);
  s[13] = h4 >> 2;
  s[14] = h4 >> 10;
  s[15] = h4 >> 18;
  s[16] = h5 >> 0;
  s[17] = h5 >> 8;
  s[18] = h5 >> 16;
  s[19] = (h5 >> 24) | (h6 << 1);
  s[20] = h6 >> 7;
  s[21] = h6 >> 15;
  s[22] = (h6 >> 23) | (h7 << 3);
  s[23] = h7 >> 5;
  s[24] = h7 >> 13;
  s[25] = (h7 >> 21) | (h8 << 4);
  s[26] = h8 >> 4;
  s[27] = h8 >> 12;
  s[28] = (h8 >> 20) | (h9 << 6);
  s[29] = h9 >> 2;
  s[30] = h9 >> 10;
  s[31] = h9 >> 18;
}