bigdecimal 4.0.1 → 4.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,24 +17,15 @@
17
17
  # include <float.h>
18
18
  #endif
19
19
 
20
- #if defined(HAVE_INT64_T) && !defined(BIGDECIMAL_USE_DECDIG_UINT16_T)
21
- # define DECDIG uint32_t
22
- # define DECDIG_DBL uint64_t
23
- # define DECDIG_DBL_SIGNED int64_t
24
- # define SIZEOF_DECDIG 4
25
- # define PRI_DECDIG_PREFIX ""
26
- # ifdef PRI_LL_PREFIX
27
- # define PRI_DECDIG_DBL_PREFIX PRI_LL_PREFIX
28
- # else
29
- # define PRI_DECDIG_DBL_PREFIX "l"
30
- # endif
20
+ #define DECDIG uint32_t
21
+ #define DECDIG_DBL uint64_t
22
+ #define DECDIG_DBL_SIGNED int64_t
23
+ #define SIZEOF_DECDIG 4
24
+ #define PRI_DECDIG_PREFIX ""
25
+ #ifdef PRI_LL_PREFIX
26
+ # define PRI_DECDIG_DBL_PREFIX PRI_LL_PREFIX
31
27
  #else
32
- # define DECDIG uint16_t
33
- # define DECDIG_DBL uint32_t
34
- # define DECDIG_DBL_SIGNED int32_t
35
- # define SIZEOF_DECDIG 2
36
- # define PRI_DECDIG_PREFIX "h"
37
- # define PRI_DECDIG_DBL_PREFIX ""
28
+ # define PRI_DECDIG_DBL_PREFIX "l"
38
29
  #endif
39
30
 
40
31
  #define PRIdDECDIG PRI_DECDIG_PREFIX"d"
@@ -51,31 +42,15 @@
51
42
  #define PRIxDECDIG_DBL PRI_DECDIG_DBL_PREFIX"x"
52
43
  #define PRIXDECDIG_DBL PRI_DECDIG_DBL_PREFIX"X"
53
44
 
54
- #if SIZEOF_DECDIG == 4
55
- # define BIGDECIMAL_BASE ((DECDIG)1000000000U)
56
- # define BIGDECIMAL_COMPONENT_FIGURES 9
45
+ #define BIGDECIMAL_BASE ((DECDIG)1000000000U)
46
+ #define BIGDECIMAL_COMPONENT_FIGURES 9
57
47
  /*
58
48
  * The number of components required for a 64-bit integer.
59
49
  *
60
50
  * INT64_MAX: 9_223372036_854775807
61
51
  * UINT64_MAX: 18_446744073_709551615
62
52
  */
63
- # define BIGDECIMAL_INT64_MAX_LENGTH 3
64
-
65
- #elif SIZEOF_DECDIG == 2
66
- # define BIGDECIMAL_BASE ((DECDIG)10000U)
67
- # define BIGDECIMAL_COMPONENT_FIGURES 4
68
- /*
69
- * The number of components required for a 64-bit integer.
70
- *
71
- * INT64_MAX: 922_3372_0368_5477_5807
72
- * UINT64_MAX: 1844_6744_0737_0955_1615
73
- */
74
- # define BIGDECIMAL_INT64_MAX_LENGTH 5
75
-
76
- #else
77
- # error Unknown size of DECDIG
78
- #endif
53
+ #define BIGDECIMAL_INT64_MAX_LENGTH 3
79
54
 
80
55
  #define BIGDECIMAL_DOUBLE_FIGURES (1+DBL_DIG)
81
56
 
@@ -188,6 +163,16 @@ typedef struct {
188
163
  DECDIG frac[FLEXIBLE_ARRAY_SIZE]; /* Array of fraction part. */
189
164
  } Real;
190
165
 
166
+ typedef struct {
167
+ VALUE bigdecimal;
168
+ Real *real;
169
+ } BDVALUE;
170
+
171
+ typedef struct {
172
+ VALUE bigdecimal_or_nil;
173
+ Real *real_or_null;
174
+ } NULLABLE_BDVALUE;
175
+
191
176
  /*
192
177
  * ------------------
193
178
  * EXPORTables.
@@ -214,7 +199,7 @@ VP_EXPORT unsigned short VpSetRoundMode(unsigned short n);
214
199
  VP_EXPORT int VpException(unsigned short f,const char *str,int always);
215
200
  VP_EXPORT size_t VpNumOfChars(Real *vp,const char *pszFmt);
216
201
  VP_EXPORT size_t VpInit(DECDIG BaseVal);
217
- VP_EXPORT Real *VpAlloc(const char *szVal, int strict_p, int exc);
202
+ VP_EXPORT NULLABLE_BDVALUE VpAlloc(const char *szVal, int strict_p, int exc);
218
203
  VP_EXPORT size_t VpAsgn(Real *c, Real *a, int isw);
219
204
  VP_EXPORT size_t VpAddSub(Real *c,Real *a,Real *b,int operation);
220
205
  VP_EXPORT size_t VpMult(Real *c,Real *a,Real *b);
@@ -232,10 +217,31 @@ VP_EXPORT int VpActiveRound(Real *y, Real *x, unsigned short f, ssize_t il);
232
217
  VP_EXPORT int VpMidRound(Real *y, unsigned short f, ssize_t nf);
233
218
  VP_EXPORT int VpLeftRound(Real *y, unsigned short f, ssize_t nf);
234
219
  VP_EXPORT void VpFrac(Real *y, Real *x);
220
+ VP_EXPORT int AddExponent(Real *a, SIGNED_VALUE n);
235
221
 
236
222
  /* VP constants */
237
223
  VP_EXPORT Real *VpOne(void);
238
224
 
225
+ /*
226
+ * **** BigDecimal part ****
227
+ */
228
+ VP_EXPORT VALUE BigDecimal_lt(VALUE self, VALUE r);
229
+ VP_EXPORT VALUE BigDecimal_ge(VALUE self, VALUE r);
230
+ VP_EXPORT VALUE BigDecimal_exponent(VALUE self);
231
+ VP_EXPORT VALUE BigDecimal_fix(VALUE self);
232
+ VP_EXPORT VALUE BigDecimal_frac(VALUE self);
233
+ VP_EXPORT VALUE BigDecimal_add(VALUE self, VALUE b);
234
+ VP_EXPORT VALUE BigDecimal_sub(VALUE self, VALUE b);
235
+ VP_EXPORT VALUE BigDecimal_mult(VALUE self, VALUE b);
236
+ VP_EXPORT VALUE BigDecimal_add2(VALUE self, VALUE b, VALUE n);
237
+ VP_EXPORT VALUE BigDecimal_sub2(VALUE self, VALUE b, VALUE n);
238
+ VP_EXPORT VALUE BigDecimal_mult2(VALUE self, VALUE b, VALUE n);
239
+ VP_EXPORT VALUE BigDecimal_split(VALUE self);
240
+ VP_EXPORT VALUE BigDecimal_decimal_shift(VALUE self, VALUE v);
241
+ VP_EXPORT inline BDVALUE GetBDValueMust(VALUE v);
242
+ VP_EXPORT inline BDVALUE rbd_allocate_struct_zero_wrap(int sign, size_t const digits);
243
+ #define NewZeroWrap rbd_allocate_struct_zero_wrap
244
+
239
245
  /*
240
246
  * ------------------
241
247
  * MACRO definitions.
@@ -0,0 +1,192 @@
1
+ // Calculate the inverse of x using the Newton-Raphson method.
2
+ static VALUE
3
+ newton_raphson_inverse(VALUE x, size_t prec) {
4
+ BDVALUE bdone = NewZeroWrap(1, 1);
5
+ VpSetOne(bdone.real);
6
+ VALUE one = bdone.bigdecimal;
7
+
8
+ // Initial approximation in 2 digits
9
+ BDVALUE bdx = GetBDValueMust(x);
10
+ BDVALUE inv0 = NewZeroWrap(1, 2 * BIGDECIMAL_COMPONENT_FIGURES);
11
+ VpSetOne(inv0.real);
12
+ DECDIG_DBL numerator = (DECDIG_DBL)BIGDECIMAL_BASE * 100;
13
+ DECDIG_DBL denominator = (DECDIG_DBL)bdx.real->frac[0] * 100 + (DECDIG_DBL)(bdx.real->Prec >= 2 ? bdx.real->frac[1] : 0) * 100 / BIGDECIMAL_BASE;
14
+ inv0.real->frac[0] = (DECDIG)(numerator / denominator);
15
+ inv0.real->frac[1] = (DECDIG)((numerator % denominator) * (BIGDECIMAL_BASE / 100) / denominator * 100);
16
+ inv0.real->Prec = 2;
17
+ inv0.real->exponent = 1 - bdx.real->exponent;
18
+ VpNmlz(inv0.real);
19
+ RB_GC_GUARD(bdx.bigdecimal);
20
+ VALUE inv = inv0.bigdecimal;
21
+
22
+ int bl = 1;
23
+ while (((size_t)1 << bl) < prec) bl++;
24
+
25
+ for (int i = bl; i >= 0; i--) {
26
+ size_t n = (prec >> i) + 2;
27
+ if (n > prec) n = prec;
28
+ // Newton-Raphson iteration: inv_next = inv + inv * (1 - x * inv)
29
+ VALUE one_minus_x_inv = BigDecimal_sub2(
30
+ one,
31
+ BigDecimal_mult(BigDecimal_mult2(x, one, SIZET2NUM(n + 1)), inv),
32
+ SIZET2NUM(SIZET2NUM(n / 2))
33
+ );
34
+ inv = BigDecimal_add2(
35
+ inv,
36
+ BigDecimal_mult(inv, one_minus_x_inv),
37
+ SIZET2NUM(n)
38
+ );
39
+ }
40
+ return inv;
41
+ }
42
+
43
+ // Calculates divmod by multiplying approximate reciprocal of y
44
+ static void
45
+ divmod_by_inv_mul(VALUE x, VALUE y, VALUE inv, VALUE *res_div, VALUE *res_mod) {
46
+ VALUE div = BigDecimal_fix(BigDecimal_mult(x, inv));
47
+ VALUE mod = BigDecimal_sub(x, BigDecimal_mult(div, y));
48
+ while (RTEST(BigDecimal_lt(mod, INT2FIX(0)))) {
49
+ mod = BigDecimal_add(mod, y);
50
+ div = BigDecimal_sub(div, INT2FIX(1));
51
+ }
52
+ while (RTEST(BigDecimal_ge(mod, y))) {
53
+ mod = BigDecimal_sub(mod, y);
54
+ div = BigDecimal_add(div, INT2FIX(1));
55
+ }
56
+ *res_div = div;
57
+ *res_mod = mod;
58
+ }
59
+
60
+ static void
61
+ slice_copy(DECDIG *dest, Real *src, size_t rshift, size_t length) {
62
+ ssize_t start = src->exponent - rshift - length;
63
+ if (start >= (ssize_t)src->Prec) return;
64
+ if (start < 0) {
65
+ dest -= start;
66
+ length += start;
67
+ start = 0;
68
+ }
69
+ size_t max_length = src->Prec - start;
70
+ memcpy(dest, src->frac + start, Min(length, max_length) * sizeof(DECDIG));
71
+ }
72
+
73
+ /* Calculates divmod using Newton-Raphson method.
74
+ * x and y must be a BigDecimal representing an integer value.
75
+ *
76
+ * To calculate with low cost, we need to split x into blocks and perform divmod for each block.
77
+ * x_digits = remaining_digits(<= y_digits) + block_digits * num_blocks
78
+ *
79
+ * Example:
80
+ * xxx_xxxxx_xxxxx_xxxxx(18 digits) / yyyyy(5 digits)
81
+ * remaining_digits = 3, block_digits = 5, num_blocks = 3
82
+ * repeating xxxxx_xxxxxx.divmod(yyyyy) calculation 3 times.
83
+ *
84
+ * In each divmod step, dividend is at most (y_digits + block_digits) digits and divisor is y_digits digits.
85
+ * Reciprocal of y needs block_digits + 1 precision.
86
+ */
87
+ static void
88
+ divmod_newton(VALUE x, VALUE y, VALUE *div_out, VALUE *mod_out) {
89
+ size_t x_digits = NUM2SIZET(BigDecimal_exponent(x));
90
+ size_t y_digits = NUM2SIZET(BigDecimal_exponent(y));
91
+ if (x_digits <= y_digits) x_digits = y_digits + 1;
92
+
93
+ size_t n = x_digits / y_digits;
94
+ size_t block_figs = (x_digits - y_digits) / n / BIGDECIMAL_COMPONENT_FIGURES + 1;
95
+ size_t block_digits = block_figs * BIGDECIMAL_COMPONENT_FIGURES;
96
+ size_t num_blocks = (x_digits - y_digits + block_digits - 1) / block_digits;
97
+ size_t y_figs = (y_digits - 1) / BIGDECIMAL_COMPONENT_FIGURES + 1;
98
+ VALUE yinv = newton_raphson_inverse(y, block_digits + 1);
99
+
100
+ BDVALUE divident = NewZeroWrap(1, BIGDECIMAL_COMPONENT_FIGURES * (y_figs + block_figs));
101
+ BDVALUE div_result = NewZeroWrap(1, BIGDECIMAL_COMPONENT_FIGURES * (num_blocks * block_figs + 1));
102
+ BDVALUE bdx = GetBDValueMust(x);
103
+
104
+ VALUE mod = BigDecimal_fix(BigDecimal_decimal_shift(x, SSIZET2NUM(-num_blocks * block_digits)));
105
+ for (ssize_t i = num_blocks - 1; i >= 0; i--) {
106
+ memset(divident.real->frac, 0, (y_figs + block_figs) * sizeof(DECDIG));
107
+
108
+ BDVALUE bdmod = GetBDValueMust(mod);
109
+ slice_copy(divident.real->frac, bdmod.real, 0, y_figs);
110
+ slice_copy(divident.real->frac + y_figs, bdx.real, i * block_figs, block_figs);
111
+ RB_GC_GUARD(bdmod.bigdecimal);
112
+
113
+ VpSetSign(divident.real, 1);
114
+ divident.real->exponent = y_figs + block_figs;
115
+ divident.real->Prec = y_figs + block_figs;
116
+ VpNmlz(divident.real);
117
+
118
+ VALUE div;
119
+ divmod_by_inv_mul(divident.bigdecimal, y, yinv, &div, &mod);
120
+ BDVALUE bddiv = GetBDValueMust(div);
121
+ slice_copy(div_result.real->frac + (num_blocks - i - 1) * block_figs, bddiv.real, 0, block_figs + 1);
122
+ RB_GC_GUARD(bddiv.bigdecimal);
123
+ }
124
+ VpSetSign(div_result.real, 1);
125
+ div_result.real->exponent = num_blocks * block_figs + 1;
126
+ div_result.real->Prec = num_blocks * block_figs + 1;
127
+ VpNmlz(div_result.real);
128
+ RB_GC_GUARD(bdx.bigdecimal);
129
+ RB_GC_GUARD(divident.bigdecimal);
130
+ RB_GC_GUARD(div_result.bigdecimal);
131
+ *div_out = div_result.bigdecimal;
132
+ *mod_out = mod;
133
+ }
134
+
135
+ static VALUE
136
+ VpDivdNewtonInner(VALUE args_ptr)
137
+ {
138
+ Real **args = (Real**)args_ptr;
139
+ Real *c = args[0], *r = args[1], *a = args[2], *b = args[3];
140
+ BDVALUE a2, b2, c2, r2;
141
+ VALUE div, mod, a2_frac = Qnil;
142
+ size_t div_prec = c->MaxPrec - 1;
143
+ size_t base_prec = b->Prec;
144
+
145
+ a2 = NewZeroWrap(1, a->Prec * BIGDECIMAL_COMPONENT_FIGURES);
146
+ b2 = NewZeroWrap(1, b->Prec * BIGDECIMAL_COMPONENT_FIGURES);
147
+ VpAsgn(a2.real, a, 1);
148
+ VpAsgn(b2.real, b, 1);
149
+ VpSetSign(a2.real, 1);
150
+ VpSetSign(b2.real, 1);
151
+ a2.real->exponent = base_prec + div_prec;
152
+ b2.real->exponent = base_prec;
153
+
154
+ if ((ssize_t)a2.real->Prec > a2.real->exponent) {
155
+ a2_frac = BigDecimal_frac(a2.bigdecimal);
156
+ VpMidRound(a2.real, VP_ROUND_DOWN, 0);
157
+ }
158
+ divmod_newton(a2.bigdecimal, b2.bigdecimal, &div, &mod);
159
+ if (a2_frac != Qnil) mod = BigDecimal_add(mod, a2_frac);
160
+
161
+ c2 = GetBDValueMust(div);
162
+ r2 = GetBDValueMust(mod);
163
+ VpAsgn(c, c2.real, VpGetSign(a) * VpGetSign(b));
164
+ VpAsgn(r, r2.real, VpGetSign(a));
165
+ AddExponent(c, a->exponent);
166
+ AddExponent(c, -b->exponent);
167
+ AddExponent(c, -div_prec);
168
+ AddExponent(r, a->exponent);
169
+ AddExponent(r, -base_prec - div_prec);
170
+ RB_GC_GUARD(a2.bigdecimal);
171
+ RB_GC_GUARD(a2.bigdecimal);
172
+ RB_GC_GUARD(c2.bigdecimal);
173
+ RB_GC_GUARD(r2.bigdecimal);
174
+ return Qnil;
175
+ }
176
+
177
+ static VALUE
178
+ ensure_restore_prec_limit(VALUE limit)
179
+ {
180
+ VpSetPrecLimit(NUM2SIZET(limit));
181
+ return Qnil;
182
+ }
183
+
184
+ static void
185
+ VpDivdNewton(Real *c, Real *r, Real *a, Real *b)
186
+ {
187
+ Real *args[4] = {c, r, a, b};
188
+ size_t pl = VpGetPrecLimit();
189
+ VpSetPrecLimit(0);
190
+ // Ensure restoring prec limit because some methods used in VpDivdNewtonInner may raise an exception
191
+ rb_ensure(VpDivdNewtonInner, (VALUE)args, ensure_restore_prec_limit, SIZET2NUM(pl));
192
+ }
@@ -1,4 +1,4 @@
1
- # frozen_string_literal: false
1
+ # frozen_string_literal: true
2
2
  require 'mkmf'
3
3
 
4
4
  def have_builtin_func(name, check_expr, opt = "", &b)
@@ -46,13 +46,16 @@ have_func("rb_opts_exception_p", "ruby.h")
46
46
  have_func("rb_category_warn", "ruby.h")
47
47
  have_const("RB_WARN_CATEGORY_DEPRECATED", "ruby.h")
48
48
 
49
+ if RUBY_ENGINE == "ruby"
50
+ have_const("RUBY_TYPED_EMBEDDABLE", "ruby.h") # RUBY_VERSION >= 3.3
51
+ end
52
+
49
53
  if File.file?(File.expand_path('../lib/bigdecimal.rb', __FILE__))
50
54
  bigdecimal_rb = "$(srcdir)/lib/bigdecimal.rb"
51
55
  else
52
56
  bigdecimal_rb = "$(srcdir)/../../lib/bigdecimal.rb"
53
57
  end
54
58
 
55
- $defs.push '-DBIGDECIMAL_USE_DECDIG_UINT16_T' if ENV['BIGDECIMAL_USE_DECDIG_UINT16_T'] == 'true'
56
59
  $defs.push '-DBIGDECIMAL_USE_VP_TEST_METHODS' if ENV['BIGDECIMAL_USE_VP_TEST_METHODS'] == 'true'
57
60
 
58
61
  create_makefile('bigdecimal') {|mf|
@@ -58,7 +58,7 @@ char *BigDecimal_dtoa(double d_, int mode, int ndigits, int *decpt, int *sign, c
58
58
 
59
59
  #ifndef HAVE_RB_COMPLEX_REAL
60
60
  static inline VALUE
61
- rb_complex_real(VALUE cmp)
61
+ rb_complex_real_fallback(VALUE cmp)
62
62
  {
63
63
  #ifdef RCOMPLEX
64
64
  return RCOMPLEX(cmp)->real;
@@ -66,11 +66,12 @@ rb_complex_real(VALUE cmp)
66
66
  return rb_funcall(cmp, rb_intern("real"), 0);
67
67
  #endif
68
68
  }
69
+ #define rb_complex_real rb_complex_real_fallback
69
70
  #endif
70
71
 
71
72
  #ifndef HAVE_RB_COMPLEX_IMAG
72
73
  static inline VALUE
73
- rb_complex_imag(VALUE cmp)
74
+ rb_complex_imag_fallback(VALUE cmp)
74
75
  {
75
76
  # ifdef RCOMPLEX
76
77
  return RCOMPLEX(cmp)->imag;
@@ -78,6 +79,7 @@ rb_complex_imag(VALUE cmp)
78
79
  return rb_funcall(cmp, rb_intern("imag"), 0);
79
80
  # endif
80
81
  }
82
+ #define rb_complex_imag rb_complex_imag_fallback
81
83
  #endif
82
84
 
83
85
  /* st */
@@ -0,0 +1,191 @@
1
+ // NTT (Number Theoretic Transform) implementation for BigDecimal multiplication
2
+
3
+ #define NTT_PRIMITIVE_ROOT 17
4
+ #define NTT_PRIME_BASE1 24
5
+ #define NTT_PRIME_BASE2 26
6
+ #define NTT_PRIME_BASE3 29
7
+ #define NTT_PRIME_SHIFT 27
8
+ #define NTT_PRIME1 (((uint32_t)NTT_PRIME_BASE1 << NTT_PRIME_SHIFT) | 1)
9
+ #define NTT_PRIME2 (((uint32_t)NTT_PRIME_BASE2 << NTT_PRIME_SHIFT) | 1)
10
+ #define NTT_PRIME3 (((uint32_t)NTT_PRIME_BASE3 << NTT_PRIME_SHIFT) | 1)
11
+ #define MAX_NTT32_BITS 27
12
+ #define NTT_DECDIG_BASE 1000000000
13
+
14
/*
 * Calculates base**ex % mod by binary exponentiation.
 *
 * Products are formed in 64 bits, so any 32-bit base and modulus are safe.
 * For ex == 0 the result is 1 and no reduction is performed.
 */
static uint32_t
mod_pow(uint32_t base, uint32_t ex, uint32_t mod) {
    uint32_t acc = 1;
    uint32_t square = base;
    uint32_t remaining = ex;

    while (remaining != 0) {
        if (remaining & 1) {
            acc = (uint32_t)(((uint64_t)acc * square) % mod);
        }
        remaining >>= 1;
        // Square only while there are still exponent bits left to consume.
        if (remaining != 0) {
            square = (uint32_t)(((uint64_t)square * square) % mod);
        }
    }
    return acc;
}
30
+
31
/*
 * Recursively performs butterfly operations of NTT.
 *
 * size_bits: log2 of the transform length
 * input:     source array, only read
 * output:    receives the result of this level
 * tmp:       scratch array; swaps roles with output on each recursion level
 * depth:     recursion level; level 0 reads directly from input
 * r:         root of unity used at this level (squared for the level below)
 * prime:     modulus
 */
static void
ntt_recursive(int size_bits, uint32_t *input, uint32_t *output, uint32_t *tmp, int depth, uint32_t r, uint32_t prime) {
    const uint32_t *src;
    if (depth > 0) {
        uint32_t r_squared = (uint32_t)(((uint64_t)r * r) % prime);
        ntt_recursive(size_bits, input, tmp, output, depth - 1, r_squared, prime);
        src = tmp;
    } else {
        src = input;
    }

    const uint32_t half = (uint32_t)1 << (size_bits - 1);
    const uint32_t stride = (uint32_t)1 << (size_bits - depth - 1);
    const uint32_t groups = half / stride;

    // twiddle = r**g, neg_twiddle = -(r**g) modulo prime
    uint32_t twiddle = 1;
    uint32_t neg_twiddle = prime - 1;

    for (uint32_t g = 0; g < groups; g++) {
        const uint32_t *even = src + g * 2 * stride;
        const uint32_t *odd = even + stride;
        uint32_t *low = output + stride * g;
        uint32_t *high = low + half;

        for (uint32_t k = 0; k < stride; k++) {
            uint64_t e = even[k];
            uint64_t o = odd[k];
            low[k] = (uint32_t)((e + twiddle * o) % prime);
            high[k] = (uint32_t)((e + neg_twiddle * o) % prime);
        }

        twiddle = (uint32_t)(((uint64_t)twiddle * r) % prime);
        neg_twiddle = (uint32_t)(((uint64_t)neg_twiddle * r) % prime);
    }
}
57
+
58
/* Perform NTT on input array.
 * base, shift: Represent the prime number as (base << shift | 1)
 * r_base: Primitive root of unity modulo prime
 * size_bits: log2 of the size of the input array. Should be less or equal to shift
 * input: input array of size (1 << size_bits)
 * output: receives the transformed array of size (1 << size_bits)
 * tmp: scratch array passed through to ntt_recursive
 * dir: negative selects the inverse transform (inverted root, result scaled by 1/size)
 */
static void
ntt(int size_bits, uint32_t *input, uint32_t *output, uint32_t *tmp, int r_base, int base, int shift, int dir) {
    const uint32_t length = (uint32_t)1 << size_bits;
    const uint32_t prime = ((uint32_t)base << shift) | 1;
    const int inverse = dir < 0;

    // root_max**(1 << shift) % prime == 1
    // root**length % prime == 1
    uint32_t root_max = mod_pow(r_base, base, prime);
    uint32_t root = mod_pow(root_max, (uint32_t)1 << (shift - size_bits), prime);

    // Inverse transform uses root**(prime - 2), the modular inverse of root.
    if (inverse) root = mod_pow(root, prime - 2, prime);

    ntt_recursive(size_bits, input, output, tmp, size_bits - 1, root, prime);
    if (!inverse) return;

    // Scale every element by the modular inverse of the length.
    const uint32_t length_inv = mod_pow(length, prime - 2, prime);
    uint32_t *const end = output + length;
    for (uint32_t *p = output; p != end; p++) {
        *p = (uint32_t)(((uint64_t)*p * length_inv) % prime);
    }
}
83
+
84
+ /* Calculate c that satisfies: c % PRIME1 == mod1 && c % PRIME2 == mod2 && c % PRIME3 == mod3
85
+ * c = (mod1 * 35002755423056150739595925972 + mod2 * 14584479687667766215746868453 + mod3 * 37919651490985126265126719818) % (PRIME1 * PRIME2 * PRIME3)
86
+ * Assume c <= 999999999**2*(1<<27)
87
+ */
88
+ static inline void
89
+ mod_restore_prime_24_26_29_shift_27(uint32_t mod1, uint32_t mod2, uint32_t mod3, uint32_t *digits) {
90
+ // Use mixed radix notation to eliminate modulo by PRIME1 * PRIME2 * PRIME3
91
+ // [DIG0, DIG1, DIG2] = DIG0 + DIG1 * PRIME1 + DIG2 * PRIME1 * PRIME2
92
+ // DIG0: 0...PRIME1, DIG1: 0...PRIME2, DIG2: 0...PRIME3
93
+ // 35002755423056150739595925972 = [1, 3489660916, 3113851359]
94
+ // 14584479687667766215746868453 = [0, 13, 1297437912]
95
+ // 37919651490985126265126719818 = [0, 0, 3373338954]
96
+ uint64_t c0 = mod1;
97
+ uint64_t c1 = (uint64_t)mod2 * 13 + (uint64_t)mod1 * 3489660916;
98
+ uint64_t c2 = (uint64_t)mod3 * 3373338954 % NTT_PRIME3 + (uint64_t)mod2 * 1297437912 % NTT_PRIME3 + (uint64_t)mod1 * 3113851359 % NTT_PRIME3;
99
+ c2 += c1 / NTT_PRIME2;
100
+ c1 %= NTT_PRIME2;
101
+ c2 %= NTT_PRIME3;
102
+ // Base conversion. c fits in 3 digits.
103
+ c1 += c2 % NTT_DECDIG_BASE * NTT_PRIME2;
104
+ c0 += c1 % NTT_DECDIG_BASE * NTT_PRIME1;
105
+ c1 /= NTT_DECDIG_BASE;
106
+ digits[0] = c0 % NTT_DECDIG_BASE;
107
+ c0 /= NTT_DECDIG_BASE;
108
+ c1 += c2 / NTT_DECDIG_BASE % NTT_DECDIG_BASE * NTT_PRIME2;
109
+ c0 += c1 % NTT_DECDIG_BASE * NTT_PRIME1;
110
+ c1 /= NTT_DECDIG_BASE;
111
+ digits[1] = c0 % NTT_DECDIG_BASE;
112
+ digits[2] = (uint32_t)(c0 / NTT_DECDIG_BASE + c1 % NTT_DECDIG_BASE * NTT_PRIME1);
113
+ }
114
+
115
+ /*
116
+ * NTT multiplication
117
+ * Uses three NTTs with mod (24 << 27 | 1), (26 << 27 | 1), and (29 << 27 | 1)
118
+ */
119
+ static void
120
+ ntt_multiply(size_t a_size, size_t b_size, uint32_t *a, uint32_t *b, uint32_t *c) {
121
+ if (a_size < b_size) {
122
+ ntt_multiply(b_size, a_size, b, a, c);
123
+ return;
124
+ }
125
+
126
+ int ntt_size_bits = bit_length(b_size - 1) + 1;
127
+ if (ntt_size_bits > MAX_NTT32_BITS) {
128
+ rb_raise(rb_eArgError, "Multiply size too large");
129
+ }
130
+
131
+ // To calculate large_a * small_b faster, split into several batches.
132
+ uint32_t ntt_size = (uint32_t)1 << ntt_size_bits;
133
+ uint32_t batch_size = ntt_size - (uint32_t)b_size;
134
+ uint32_t batch_count = (uint32_t)((a_size + batch_size - 1) / batch_size);
135
+
136
+ uint32_t *mem = ruby_xcalloc(sizeof(uint32_t), ntt_size * 9);
137
+ uint32_t *ntt1 = mem;
138
+ uint32_t *ntt2 = mem + ntt_size;
139
+ uint32_t *ntt3 = mem + ntt_size * 2;
140
+ uint32_t *tmp1 = mem + ntt_size * 3;
141
+ uint32_t *tmp2 = mem + ntt_size * 4;
142
+ uint32_t *tmp3 = mem + ntt_size * 5;
143
+ uint32_t *conv1 = mem + ntt_size * 6;
144
+ uint32_t *conv2 = mem + ntt_size * 7;
145
+ uint32_t *conv3 = mem + ntt_size * 8;
146
+
147
+ // Calculate NTT for b in three primes. Result is reused for each batch of a.
148
+ memcpy(tmp1, b, b_size * sizeof(uint32_t));
149
+ memset(tmp1 + b_size, 0, (ntt_size - b_size) * sizeof(uint32_t));
150
+ ntt(ntt_size_bits, tmp1, ntt1, tmp2, NTT_PRIMITIVE_ROOT, NTT_PRIME_BASE1, NTT_PRIME_SHIFT, +1);
151
+ ntt(ntt_size_bits, tmp1, ntt2, tmp2, NTT_PRIMITIVE_ROOT, NTT_PRIME_BASE2, NTT_PRIME_SHIFT, +1);
152
+ ntt(ntt_size_bits, tmp1, ntt3, tmp2, NTT_PRIMITIVE_ROOT, NTT_PRIME_BASE3, NTT_PRIME_SHIFT, +1);
153
+
154
+ memset(c, 0, (a_size + b_size) * sizeof(uint32_t));
155
+ for (uint32_t idx = 0; idx < batch_count; idx++) {
156
+ uint32_t len = idx == batch_count - 1 ? (uint32_t)a_size - idx * batch_size : batch_size;
157
+ memcpy(tmp1, a + idx * batch_size, len * sizeof(uint32_t));
158
+ memset(tmp1 + len, 0, (ntt_size - len) * sizeof(uint32_t));
159
+ // Calculate convolution for this batch in three primes
160
+ ntt(ntt_size_bits, tmp1, tmp2, tmp3, NTT_PRIMITIVE_ROOT, NTT_PRIME_BASE1, NTT_PRIME_SHIFT, +1);
161
+ for (uint32_t i = 0; i < ntt_size; i++) tmp2[i] = ((uint64_t)tmp2[i] * ntt1[i]) % NTT_PRIME1;
162
+ ntt(ntt_size_bits, tmp2, conv1, tmp3, NTT_PRIMITIVE_ROOT, NTT_PRIME_BASE1, NTT_PRIME_SHIFT, -1);
163
+ ntt(ntt_size_bits, tmp1, tmp2, tmp3, NTT_PRIMITIVE_ROOT, NTT_PRIME_BASE2, NTT_PRIME_SHIFT, +1);
164
+ for (uint32_t i = 0; i < ntt_size; i++) tmp2[i] = ((uint64_t)tmp2[i] * ntt2[i]) % NTT_PRIME2;
165
+ ntt(ntt_size_bits, tmp2, conv2, tmp3, NTT_PRIMITIVE_ROOT, NTT_PRIME_BASE2, NTT_PRIME_SHIFT, -1);
166
+ ntt(ntt_size_bits, tmp1, tmp2, tmp3, NTT_PRIMITIVE_ROOT, NTT_PRIME_BASE3, NTT_PRIME_SHIFT, +1);
167
+ for (uint32_t i = 0; i < ntt_size; i++) tmp2[i] = ((uint64_t)tmp2[i] * ntt3[i]) % NTT_PRIME3;
168
+ ntt(ntt_size_bits, tmp2, conv3, tmp3, NTT_PRIMITIVE_ROOT, NTT_PRIME_BASE3, NTT_PRIME_SHIFT, -1);
169
+
170
+ // Restore the original convolution value from three convolutions calculated in three primes.
171
+ // Each convolution value is maximum 999999999**2*(1<<27)/2
172
+ for (uint32_t i = 0; i < ntt_size; i++) {
173
+ uint32_t dig[3];
174
+ mod_restore_prime_24_26_29_shift_27(conv1[i], conv2[i], conv3[i], dig);
175
+ // Maximum values of dig[0], dig[1], and dig[2] are 999999999, 999999999 and 67108863 respectively
176
+ // Maximum overlapped sum (considering overlaps between 2 batches) is less than 4134217722
177
+ // so this sum doesn't overflow uint32_t.
178
+ for (int j = 0; j < 3; j++) {
179
+ // Index check: if dig[j] is non-zero, assign index is within valid range.
180
+ if (dig[j]) c[idx * batch_size + i + 1 - j] += dig[j];
181
+ }
182
+ }
183
+ }
184
+ uint32_t carry = 0;
185
+ for (int32_t i = (uint32_t)(a_size + b_size - 1); i >= 0; i--) {
186
+ uint32_t v = c[i] + carry;
187
+ c[i] = v % NTT_DECDIG_BASE;
188
+ carry = v / NTT_DECDIG_BASE;
189
+ }
190
+ ruby_xfree(mem);
191
+ }