numo-pocketfft 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2190 @@
1
+ /*
2
+ * This file is part of pocketfft.
3
+ * Licensed under a 3-clause BSD style license - see LICENSE.md
4
+ */
5
+
6
+ /*
7
+ * Main implementation file.
8
+ *
9
+ * Copyright (C) 2004-2018 Max-Planck-Society
10
+ * \author Martin Reinecke
11
+ */
12
+
13
+ #include <math.h>
14
+ #include <string.h>
15
+
16
+ #include "pocketfft.h"
17
+
18
+ #define RALLOC(type,num) \
19
+ ((type *)malloc((num)*sizeof(type)))
20
+ #define DEALLOC(ptr) \
21
+ do { free(ptr); (ptr)=NULL; } while(0)
22
+
23
+ #define SWAP(a,b,type) \
24
+ do { type tmp_=(a); (a)=(b); (b)=tmp_; } while(0)
25
+
26
+ #ifdef __GNUC__
27
+ #define NOINLINE __attribute__((noinline))
28
+ #define WARN_UNUSED_RESULT __attribute__ ((warn_unused_result))
29
+ #else
30
+ #define NOINLINE
31
+ #define WARN_UNUSED_RESULT
32
+ #endif
33
+
34
+ // adapted from https://stackoverflow.com/questions/42792939/
35
+ // CAUTION: this function only works for arguments in the range [-0.25; 0.25]!
36
+ static void my_sincosm1pi (double a, double *restrict res)
37
+ {
38
+ double s = a * a;
39
+ /* Approximate cos(pi*x)-1 for x in [-0.25,0.25] */
40
+ double r = -1.0369917389758117e-4;
41
+ r = fma (r, s, 1.9294935641298806e-3);
42
+ r = fma (r, s, -2.5806887942825395e-2);
43
+ r = fma (r, s, 2.3533063028328211e-1);
44
+ r = fma (r, s, -1.3352627688538006e+0);
45
+ r = fma (r, s, 4.0587121264167623e+0);
46
+ r = fma (r, s, -4.9348022005446790e+0);
47
+ double c = r*s;
48
+ /* Approximate sin(pi*x) for x in [-0.25,0.25] */
49
+ r = 4.6151442520157035e-4;
50
+ r = fma (r, s, -7.3700183130883555e-3);
51
+ r = fma (r, s, 8.2145868949323936e-2);
52
+ r = fma (r, s, -5.9926452893214921e-1);
53
+ r = fma (r, s, 2.5501640398732688e+0);
54
+ r = fma (r, s, -5.1677127800499516e+0);
55
+ s = s * a;
56
+ r = r * s;
57
+ s = fma (a, 3.1415926535897931e+0, r);
58
+ res[0] = c;
59
+ res[1] = s;
60
+ }
61
+
62
+ NOINLINE static void calc_first_octant(size_t den, double * restrict res)
63
+ {
64
+ size_t n = (den+4)>>3;
65
+ if (n==0) return;
66
+ res[0]=1.; res[1]=0.;
67
+ if (n==1) return;
68
+ size_t l1=(size_t)sqrt(n);
69
+ for (size_t i=1; i<l1; ++i)
70
+ my_sincosm1pi((2.*i)/den,&res[2*i]);
71
+ size_t start=l1;
72
+ while(start<n)
73
+ {
74
+ double cs[2];
75
+ my_sincosm1pi((2.*start)/den,cs);
76
+ res[2*start] = cs[0]+1.;
77
+ res[2*start+1] = cs[1];
78
+ size_t end = l1;
79
+ if (start+end>n) end = n-start;
80
+ for (size_t i=1; i<end; ++i)
81
+ {
82
+ double csx[2]={res[2*i], res[2*i+1]};
83
+ res[2*(start+i)] = ((cs[0]*csx[0] - cs[1]*csx[1] + cs[0]) + csx[0]) + 1.;
84
+ res[2*(start+i)+1] = (cs[0]*csx[1] + cs[1]*csx[0]) + cs[1] + csx[1];
85
+ }
86
+ start += l1;
87
+ }
88
+ for (size_t i=1; i<l1; ++i)
89
+ res[2*i] += 1.;
90
+ }
91
+
92
+ NOINLINE static void calc_first_quadrant(size_t n, double * restrict res)
93
+ {
94
+ double * restrict p = res+n;
95
+ calc_first_octant(n<<1, p);
96
+ size_t ndone=(n+2)>>2;
97
+ size_t i=0, idx1=0, idx2=2*ndone-2;
98
+ for (; i+1<ndone; i+=2, idx1+=2, idx2-=2)
99
+ {
100
+ res[idx1] = p[2*i];
101
+ res[idx1+1] = p[2*i+1];
102
+ res[idx2] = p[2*i+3];
103
+ res[idx2+1] = p[2*i+2];
104
+ }
105
+ if (i!=ndone)
106
+ {
107
+ res[idx1 ] = p[2*i];
108
+ res[idx1+1] = p[2*i+1];
109
+ }
110
+ }
111
+
112
+ NOINLINE static void calc_first_half(size_t n, double * restrict res)
113
+ {
114
+ int ndone=(n+1)>>1;
115
+ double * p = res+n-1;
116
+ calc_first_octant(n<<2, p);
117
+ int i4=0, in=n, i=0;
118
+ for (; i4<=in-i4; ++i, i4+=4) // octant 0
119
+ {
120
+ res[2*i] = p[2*i4]; res[2*i+1] = p[2*i4+1];
121
+ }
122
+ for (; i4-in <= 0; ++i, i4+=4) // octant 1
123
+ {
124
+ int xm = in-i4;
125
+ res[2*i] = p[2*xm+1]; res[2*i+1] = p[2*xm];
126
+ }
127
+ for (; i4<=3*in-i4; ++i, i4+=4) // octant 2
128
+ {
129
+ int xm = i4-in;
130
+ res[2*i] = -p[2*xm+1]; res[2*i+1] = p[2*xm];
131
+ }
132
+ for (; i<ndone; ++i, i4+=4) // octant 3
133
+ {
134
+ int xm = 2*in-i4;
135
+ res[2*i] = -p[2*xm]; res[2*i+1] = p[2*xm+1];
136
+ }
137
+ }
138
+
139
+ NOINLINE static void fill_first_quadrant(size_t n, double * restrict res)
140
+ {
141
+ const double hsqt2 = 0.707106781186547524400844362104849;
142
+ size_t quart = n>>2;
143
+ if ((n&7)==0)
144
+ res[quart] = res[quart+1] = hsqt2;
145
+ for (size_t i=2, j=2*quart-2; i<quart; i+=2, j-=2)
146
+ {
147
+ res[j ] = res[i+1];
148
+ res[j+1] = res[i ];
149
+ }
150
+ }
151
+
152
+ NOINLINE static void fill_first_half(size_t n, double * restrict res)
153
+ {
154
+ size_t half = n>>1;
155
+ if ((n&3)==0)
156
+ for (size_t i=0; i<half; i+=2)
157
+ {
158
+ res[i+half] = -res[i+1];
159
+ res[i+half+1] = res[i ];
160
+ }
161
+ else
162
+ for (size_t i=2, j=2*half-2; i<half; i+=2, j-=2)
163
+ {
164
+ res[j ] = -res[i ];
165
+ res[j+1] = res[i+1];
166
+ }
167
+ }
168
+
169
+ NOINLINE static void fill_second_half(size_t n, double * restrict res)
170
+ {
171
+ if ((n&1)==0)
172
+ for (size_t i=0; i<n; ++i)
173
+ res[i+n] = -res[i];
174
+ else
175
+ for (size_t i=2, j=2*n-2; i<n; i+=2, j-=2)
176
+ {
177
+ res[j ] = res[i ];
178
+ res[j+1] = -res[i+1];
179
+ }
180
+ }
181
+
182
+ NOINLINE static void sincos_2pibyn_half(size_t n, double * restrict res)
183
+ {
184
+ if ((n&3)==0)
185
+ {
186
+ calc_first_octant(n, res);
187
+ fill_first_quadrant(n, res);
188
+ fill_first_half(n, res);
189
+ }
190
+ else if ((n&1)==0)
191
+ {
192
+ calc_first_quadrant(n, res);
193
+ fill_first_half(n, res);
194
+ }
195
+ else
196
+ calc_first_half(n, res);
197
+ }
198
+
199
+ NOINLINE static void sincos_2pibyn(size_t n, double * restrict res)
200
+ {
201
+ sincos_2pibyn_half(n, res);
202
+ fill_second_half(n, res);
203
+ }
204
+
205
+ NOINLINE static size_t largest_prime_factor (size_t n)
206
+ {
207
+ size_t res=1;
208
+ size_t tmp;
209
+ while (((tmp=(n>>1))<<1)==n)
210
+ { res=2; n=tmp; }
211
+
212
+ size_t limit=(size_t)sqrt(n+0.01);
213
+ for (size_t x=3; x<=limit; x+=2)
214
+ while (((tmp=(n/x))*x)==n)
215
+ {
216
+ res=x;
217
+ n=tmp;
218
+ limit=(size_t)sqrt(n+0.01);
219
+ }
220
+ if (n>1) res=n;
221
+
222
+ return res;
223
+ }
224
+
225
+ NOINLINE static double cost_guess (size_t n)
226
+ {
227
+ const double lfp=1.1; // penalty for non-hardcoded larger factors
228
+ size_t ni=n;
229
+ double result=0.;
230
+ size_t tmp;
231
+ while (((tmp=(n>>1))<<1)==n)
232
+ { result+=2; n=tmp; }
233
+
234
+ size_t limit=(size_t)sqrt(n+0.01);
235
+ for (size_t x=3; x<=limit; x+=2)
236
+ while ((tmp=(n/x))*x==n)
237
+ {
238
+ result+= (x<=5) ? x : lfp*x; // penalize larger prime factors
239
+ n=tmp;
240
+ limit=(size_t)sqrt(n+0.01);
241
+ }
242
+ if (n>1) result+=(n<=5) ? n : lfp*n;
243
+
244
+ return result*ni;
245
+ }
246
+
247
+ /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */
248
+ NOINLINE static size_t good_size(size_t n)
249
+ {
250
+ if (n<=6) return n;
251
+
252
+ size_t bestfac=2*n;
253
+ for (size_t f2=1; f2<bestfac; f2*=2)
254
+ for (size_t f23=f2; f23<bestfac; f23*=3)
255
+ for (size_t f235=f23; f235<bestfac; f235*=5)
256
+ for (size_t f2357=f235; f2357<bestfac; f2357*=7)
257
+ for (size_t f235711=f2357; f235711<bestfac; f235711*=11)
258
+ if (f235711>=n) bestfac=f235711;
259
+ return bestfac;
260
+ }
261
+
262
+ typedef struct cmplx {
263
+ double r,i;
264
+ } cmplx;
265
+
266
+ #define NFCT 25
267
+ typedef struct cfftp_fctdata
268
+ {
269
+ size_t fct;
270
+ cmplx *tw, *tws;
271
+ } cfftp_fctdata;
272
+
273
+ typedef struct cfftp_plan_i
274
+ {
275
+ size_t length, nfct;
276
+ cmplx *mem;
277
+ cfftp_fctdata fct[NFCT];
278
+ } cfftp_plan_i;
279
+ typedef struct cfftp_plan_i * cfftp_plan;
280
+
281
+ #define PMC(a,b,c,d) { a.r=c.r+d.r; a.i=c.i+d.i; b.r=c.r-d.r; b.i=c.i-d.i; }
282
+ #define ADDC(a,b,c) { a.r=b.r+c.r; a.i=b.i+c.i; }
283
+ #define SCALEC(a,b) { a.r*=b; a.i*=b; }
284
+ #define ROT90(a) { double tmp_=a.r; a.r=-a.i; a.i=tmp_; }
285
+ #define ROTM90(a) { double tmp_=-a.r; a.r=a.i; a.i=tmp_; }
286
+ #define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
287
+ #define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
288
+ #define WA(x,i) wa[(i)-1+(x)*(ido-1)]
289
+ /* a = b*c */
290
+ #define A_EQ_B_MUL_C(a,b,c) { a.r=b.r*c.r-b.i*c.i; a.i=b.r*c.i+b.i*c.r; }
291
+ /* a = conj(b)*c*/
292
+ #define A_EQ_CB_MUL_C(a,b,c) { a.r=b.r*c.r+b.i*c.i; a.i=b.r*c.i-b.i*c.r; }
293
+
294
+ #define PMSIGNC(a,b,c,d) { a.r=c.r+sign*d.r; a.i=c.i+sign*d.i; b.r=c.r-sign*d.r; b.i=c.i-sign*d.i; }
295
+ /* a = b*c */
296
+ #define MULPMSIGNC(a,b,c) { a.r=b.r*c.r-sign*b.i*c.i; a.i=b.r*c.i+sign*b.i*c.r; }
297
+ /* a *= b */
298
+ #define MULPMSIGNCEQ(a,b) { double xtmp=a.r; a.r=b.r*a.r-sign*b.i*a.i; a.i=b.r*a.i+sign*b.i*xtmp; }
299
+
300
+ NOINLINE static void pass2b (size_t ido, size_t l1, const cmplx * restrict cc,
301
+ cmplx * restrict ch, const cmplx * restrict wa)
302
+ {
303
+ const size_t cdim=2;
304
+
305
+ if (ido==1)
306
+ for (size_t k=0; k<l1; ++k)
307
+ PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
308
+ else
309
+ for (size_t k=0; k<l1; ++k)
310
+ {
311
+ PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
312
+ for (size_t i=1; i<ido; ++i)
313
+ {
314
+ cmplx t;
315
+ PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k))
316
+ A_EQ_B_MUL_C (CH(i,k,1),WA(0,i),t)
317
+ }
318
+ }
319
+ }
320
+
321
+ NOINLINE static void pass2f (size_t ido, size_t l1, const cmplx * restrict cc,
322
+ cmplx * restrict ch, const cmplx * restrict wa)
323
+ {
324
+ const size_t cdim=2;
325
+
326
+ if (ido==1)
327
+ for (size_t k=0; k<l1; ++k)
328
+ PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
329
+ else
330
+ for (size_t k=0; k<l1; ++k)
331
+ {
332
+ PMC (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(0,1,k))
333
+ for (size_t i=1; i<ido; ++i)
334
+ {
335
+ cmplx t;
336
+ PMC (CH(i,k,0),t,CC(i,0,k),CC(i,1,k))
337
+ A_EQ_CB_MUL_C (CH(i,k,1),WA(0,i),t)
338
+ }
339
+ }
340
+ }
341
+
342
+ #define PREP3(idx) \
343
+ cmplx t0 = CC(idx,0,k), t1, t2; \
344
+ PMC (t1,t2,CC(idx,1,k),CC(idx,2,k)) \
345
+ CH(idx,k,0).r=t0.r+t1.r; \
346
+ CH(idx,k,0).i=t0.i+t1.i;
347
+ #define PARTSTEP3a(u1,u2,twr,twi) \
348
+ { \
349
+ cmplx ca,cb; \
350
+ ca.r=t0.r+twr*t1.r; \
351
+ ca.i=t0.i+twr*t1.i; \
352
+ cb.i=twi*t2.r; \
353
+ cb.r=-(twi*t2.i); \
354
+ PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) \
355
+ }
356
+
357
+ #define PARTSTEP3b(u1,u2,twr,twi) \
358
+ { \
359
+ cmplx ca,cb,da,db; \
360
+ ca.r=t0.r+twr*t1.r; \
361
+ ca.i=t0.i+twr*t1.i; \
362
+ cb.i=twi*t2.r; \
363
+ cb.r=-(twi*t2.i); \
364
+ PMC(da,db,ca,cb) \
365
+ A_EQ_B_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
366
+ A_EQ_B_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
367
+ }
368
+ NOINLINE static void pass3b (size_t ido, size_t l1, const cmplx * restrict cc,
369
+ cmplx * restrict ch, const cmplx * restrict wa)
370
+ {
371
+ const size_t cdim=3;
372
+ const double tw1r=-0.5, tw1i= 0.86602540378443864676;
373
+
374
+ if (ido==1)
375
+ for (size_t k=0; k<l1; ++k)
376
+ {
377
+ PREP3(0)
378
+ PARTSTEP3a(1,2,tw1r,tw1i)
379
+ }
380
+ else
381
+ for (size_t k=0; k<l1; ++k)
382
+ {
383
+ {
384
+ PREP3(0)
385
+ PARTSTEP3a(1,2,tw1r,tw1i)
386
+ }
387
+ for (size_t i=1; i<ido; ++i)
388
+ {
389
+ PREP3(i)
390
+ PARTSTEP3b(1,2,tw1r,tw1i)
391
+ }
392
+ }
393
+ }
394
+ #define PARTSTEP3f(u1,u2,twr,twi) \
395
+ { \
396
+ cmplx ca,cb,da,db; \
397
+ ca.r=t0.r+twr*t1.r; \
398
+ ca.i=t0.i+twr*t1.i; \
399
+ cb.i=twi*t2.r; \
400
+ cb.r=-(twi*t2.i); \
401
+ PMC(da,db,ca,cb) \
402
+ A_EQ_CB_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
403
+ A_EQ_CB_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
404
+ }
405
+ NOINLINE static void pass3f (size_t ido, size_t l1, const cmplx * restrict cc,
406
+ cmplx * restrict ch, const cmplx * restrict wa)
407
+ {
408
+ const size_t cdim=3;
409
+ const double tw1r=-0.5, tw1i= -0.86602540378443864676;
410
+
411
+ if (ido==1)
412
+ for (size_t k=0; k<l1; ++k)
413
+ {
414
+ PREP3(0)
415
+ PARTSTEP3a(1,2,tw1r,tw1i)
416
+ }
417
+ else
418
+ for (size_t k=0; k<l1; ++k)
419
+ {
420
+ {
421
+ PREP3(0)
422
+ PARTSTEP3a(1,2,tw1r,tw1i)
423
+ }
424
+ for (size_t i=1; i<ido; ++i)
425
+ {
426
+ PREP3(i)
427
+ PARTSTEP3f(1,2,tw1r,tw1i)
428
+ }
429
+ }
430
+ }
431
+
432
+ NOINLINE static void pass4b (size_t ido, size_t l1, const cmplx * restrict cc,
433
+ cmplx * restrict ch, const cmplx * restrict wa)
434
+ {
435
+ const size_t cdim=4;
436
+
437
+ if (ido==1)
438
+ for (size_t k=0; k<l1; ++k)
439
+ {
440
+ cmplx t1, t2, t3, t4;
441
+ PMC(t2,t1,CC(0,0,k),CC(0,2,k))
442
+ PMC(t3,t4,CC(0,1,k),CC(0,3,k))
443
+ ROT90(t4)
444
+ PMC(CH(0,k,0),CH(0,k,2),t2,t3)
445
+ PMC(CH(0,k,1),CH(0,k,3),t1,t4)
446
+ }
447
+ else
448
+ for (size_t k=0; k<l1; ++k)
449
+ {
450
+ {
451
+ cmplx t1, t2, t3, t4;
452
+ PMC(t2,t1,CC(0,0,k),CC(0,2,k))
453
+ PMC(t3,t4,CC(0,1,k),CC(0,3,k))
454
+ ROT90(t4)
455
+ PMC(CH(0,k,0),CH(0,k,2),t2,t3)
456
+ PMC(CH(0,k,1),CH(0,k,3),t1,t4)
457
+ }
458
+ for (size_t i=1; i<ido; ++i)
459
+ {
460
+ cmplx c2, c3, c4, t1, t2, t3, t4;
461
+ cmplx cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
462
+ PMC(t2,t1,cc0,cc2)
463
+ PMC(t3,t4,cc1,cc3)
464
+ ROT90(t4)
465
+ cmplx wa0=WA(0,i), wa1=WA(1,i),wa2=WA(2,i);
466
+ PMC(CH(i,k,0),c3,t2,t3)
467
+ PMC(c2,c4,t1,t4)
468
+ A_EQ_B_MUL_C (CH(i,k,1),wa0,c2)
469
+ A_EQ_B_MUL_C (CH(i,k,2),wa1,c3)
470
+ A_EQ_B_MUL_C (CH(i,k,3),wa2,c4)
471
+ }
472
+ }
473
+ }
474
+ NOINLINE static void pass4f (size_t ido, size_t l1, const cmplx * restrict cc,
475
+ cmplx * restrict ch, const cmplx * restrict wa)
476
+ {
477
+ const size_t cdim=4;
478
+
479
+ if (ido==1)
480
+ for (size_t k=0; k<l1; ++k)
481
+ {
482
+ cmplx t1, t2, t3, t4;
483
+ PMC(t2,t1,CC(0,0,k),CC(0,2,k))
484
+ PMC(t3,t4,CC(0,1,k),CC(0,3,k))
485
+ ROTM90(t4)
486
+ PMC(CH(0,k,0),CH(0,k,2),t2,t3)
487
+ PMC(CH(0,k,1),CH(0,k,3),t1,t4)
488
+ }
489
+ else
490
+ for (size_t k=0; k<l1; ++k)
491
+ {
492
+ {
493
+ cmplx t1, t2, t3, t4;
494
+ PMC(t2,t1,CC(0,0,k),CC(0,2,k))
495
+ PMC(t3,t4,CC(0,1,k),CC(0,3,k))
496
+ ROTM90(t4)
497
+ PMC(CH(0,k,0),CH(0,k,2),t2,t3)
498
+ PMC (CH(0,k,1),CH(0,k,3),t1,t4)
499
+ }
500
+ for (size_t i=1; i<ido; ++i)
501
+ {
502
+ cmplx c2, c3, c4, t1, t2, t3, t4;
503
+ cmplx cc0=CC(i,0,k), cc1=CC(i,1,k),cc2=CC(i,2,k),cc3=CC(i,3,k);
504
+ PMC(t2,t1,cc0,cc2)
505
+ PMC(t3,t4,cc1,cc3)
506
+ ROTM90(t4)
507
+ cmplx wa0=WA(0,i), wa1=WA(1,i),wa2=WA(2,i);
508
+ PMC(CH(i,k,0),c3,t2,t3)
509
+ PMC(c2,c4,t1,t4)
510
+ A_EQ_CB_MUL_C (CH(i,k,1),wa0,c2)
511
+ A_EQ_CB_MUL_C (CH(i,k,2),wa1,c3)
512
+ A_EQ_CB_MUL_C (CH(i,k,3),wa2,c4)
513
+ }
514
+ }
515
+ }
516
+
517
+ #define PREP5(idx) \
518
+ cmplx t0 = CC(idx,0,k), t1, t2, t3, t4; \
519
+ PMC (t1,t4,CC(idx,1,k),CC(idx,4,k)) \
520
+ PMC (t2,t3,CC(idx,2,k),CC(idx,3,k)) \
521
+ CH(idx,k,0).r=t0.r+t1.r+t2.r; \
522
+ CH(idx,k,0).i=t0.i+t1.i+t2.i;
523
+
524
+ #define PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \
525
+ { \
526
+ cmplx ca,cb; \
527
+ ca.r=t0.r+twar*t1.r+twbr*t2.r; \
528
+ ca.i=t0.i+twar*t1.i+twbr*t2.i; \
529
+ cb.i=twai*t4.r twbi*t3.r; \
530
+ cb.r=-(twai*t4.i twbi*t3.i); \
531
+ PMC(CH(0,k,u1),CH(0,k,u2),ca,cb) \
532
+ }
533
+
534
+ #define PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \
535
+ { \
536
+ cmplx ca,cb,da,db; \
537
+ ca.r=t0.r+twar*t1.r+twbr*t2.r; \
538
+ ca.i=t0.i+twar*t1.i+twbr*t2.i; \
539
+ cb.i=twai*t4.r twbi*t3.r; \
540
+ cb.r=-(twai*t4.i twbi*t3.i); \
541
+ PMC(da,db,ca,cb) \
542
+ A_EQ_B_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
543
+ A_EQ_B_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
544
+ }
545
+ NOINLINE static void pass5b (size_t ido, size_t l1, const cmplx * restrict cc,
546
+ cmplx * restrict ch, const cmplx * restrict wa)
547
+ {
548
+ const size_t cdim=5;
549
+ const double tw1r= 0.3090169943749474241,
550
+ tw1i= 0.95105651629515357212,
551
+ tw2r= -0.8090169943749474241,
552
+ tw2i= 0.58778525229247312917;
553
+
554
+ if (ido==1)
555
+ for (size_t k=0; k<l1; ++k)
556
+ {
557
+ PREP5(0)
558
+ PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
559
+ PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
560
+ }
561
+ else
562
+ for (size_t k=0; k<l1; ++k)
563
+ {
564
+ {
565
+ PREP5(0)
566
+ PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
567
+ PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
568
+ }
569
+ for (size_t i=1; i<ido; ++i)
570
+ {
571
+ PREP5(i)
572
+ PARTSTEP5b(1,4,tw1r,tw2r,+tw1i,+tw2i)
573
+ PARTSTEP5b(2,3,tw2r,tw1r,+tw2i,-tw1i)
574
+ }
575
+ }
576
+ }
577
+ #define PARTSTEP5f(u1,u2,twar,twbr,twai,twbi) \
578
+ { \
579
+ cmplx ca,cb,da,db; \
580
+ ca.r=t0.r+twar*t1.r+twbr*t2.r; \
581
+ ca.i=t0.i+twar*t1.i+twbr*t2.i; \
582
+ cb.i=twai*t4.r twbi*t3.r; \
583
+ cb.r=-(twai*t4.i twbi*t3.i); \
584
+ PMC(da,db,ca,cb) \
585
+ A_EQ_CB_MUL_C (CH(i,k,u1),WA(u1-1,i),da) \
586
+ A_EQ_CB_MUL_C (CH(i,k,u2),WA(u2-1,i),db) \
587
+ }
588
+ NOINLINE static void pass5f (size_t ido, size_t l1, const cmplx * restrict cc,
589
+ cmplx * restrict ch, const cmplx * restrict wa)
590
+ {
591
+ const size_t cdim=5;
592
+ const double tw1r= 0.3090169943749474241,
593
+ tw1i= -0.95105651629515357212,
594
+ tw2r= -0.8090169943749474241,
595
+ tw2i= -0.58778525229247312917;
596
+
597
+ if (ido==1)
598
+ for (size_t k=0; k<l1; ++k)
599
+ {
600
+ PREP5(0)
601
+ PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
602
+ PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
603
+ }
604
+ else
605
+ for (size_t k=0; k<l1; ++k)
606
+ {
607
+ {
608
+ PREP5(0)
609
+ PARTSTEP5a(1,4,tw1r,tw2r,+tw1i,+tw2i)
610
+ PARTSTEP5a(2,3,tw2r,tw1r,+tw2i,-tw1i)
611
+ }
612
+ for (size_t i=1; i<ido; ++i)
613
+ {
614
+ PREP5(i)
615
+ PARTSTEP5f(1,4,tw1r,tw2r,+tw1i,+tw2i)
616
+ PARTSTEP5f(2,3,tw2r,tw1r,+tw2i,-tw1i)
617
+ }
618
+ }
619
+ }
620
+
621
+ #define PREP7(idx) \
622
+ cmplx t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \
623
+ PMC (t2,t7,CC(idx,1,k),CC(idx,6,k)) \
624
+ PMC (t3,t6,CC(idx,2,k),CC(idx,5,k)) \
625
+ PMC (t4,t5,CC(idx,3,k),CC(idx,4,k)) \
626
+ CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \
627
+ CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i;
628
+
629
+ #define PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \
630
+ { \
631
+ cmplx ca,cb; \
632
+ ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \
633
+ ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \
634
+ cb.i=y1*t7.r y2*t6.r y3*t5.r; \
635
+ cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \
636
+ PMC(out1,out2,ca,cb) \
637
+ }
638
+ #define PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \
639
+ PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2))
640
+ #define PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \
641
+ { \
642
+ cmplx da,db; \
643
+ PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \
644
+ MULPMSIGNC (CH(i,k,u1),WA(u1-1,i),da) \
645
+ MULPMSIGNC (CH(i,k,u2),WA(u2-1,i),db) \
646
+ }
647
+
648
+ NOINLINE static void pass7(size_t ido, size_t l1, const cmplx * restrict cc,
649
+ cmplx * restrict ch, const cmplx * restrict wa, const int sign)
650
+ {
651
+ const size_t cdim=7;
652
+ const double tw1r= 0.623489801858733530525,
653
+ tw1i= sign * 0.7818314824680298087084,
654
+ tw2r= -0.222520933956314404289,
655
+ tw2i= sign * 0.9749279121818236070181,
656
+ tw3r= -0.9009688679024191262361,
657
+ tw3i= sign * 0.4338837391175581204758;
658
+
659
+ if (ido==1)
660
+ for (size_t k=0; k<l1; ++k)
661
+ {
662
+ PREP7(0)
663
+ PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
664
+ PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
665
+ PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
666
+ }
667
+ else
668
+ for (size_t k=0; k<l1; ++k)
669
+ {
670
+ {
671
+ PREP7(0)
672
+ PARTSTEP7a(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
673
+ PARTSTEP7a(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
674
+ PARTSTEP7a(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
675
+ }
676
+ for (size_t i=1; i<ido; ++i)
677
+ {
678
+ PREP7(i)
679
+ PARTSTEP7(1,6,tw1r,tw2r,tw3r,+tw1i,+tw2i,+tw3i)
680
+ PARTSTEP7(2,5,tw2r,tw3r,tw1r,+tw2i,-tw3i,-tw1i)
681
+ PARTSTEP7(3,4,tw3r,tw1r,tw2r,+tw3i,-tw1i,+tw2i)
682
+ }
683
+ }
684
+ }
685
+
686
+ #define PREP11(idx) \
687
+ cmplx t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \
688
+ PMC (t2,t11,CC(idx,1,k),CC(idx,10,k)) \
689
+ PMC (t3,t10,CC(idx,2,k),CC(idx, 9,k)) \
690
+ PMC (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)) \
691
+ PMC (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)) \
692
+ PMC (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)) \
693
+ CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \
694
+ CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i;
695
+
696
+ #define PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \
697
+ { \
698
+ cmplx ca,cb; \
699
+ ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r+x4*t5.r+x5*t6.r; \
700
+ ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i+x4*t5.i+x5*t6.i; \
701
+ cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \
702
+ cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \
703
+ PMC(out1,out2,ca,cb) \
704
+ }
705
+ #define PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
706
+ PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2))
707
+ #define PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \
708
+ { \
709
+ cmplx da,db; \
710
+ PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \
711
+ MULPMSIGNC (CH(i,k,u1),WA(u1-1,i),da) \
712
+ MULPMSIGNC (CH(i,k,u2),WA(u2-1,i),db) \
713
+ }
714
+
715
+ NOINLINE static void pass11 (size_t ido, size_t l1, const cmplx * restrict cc,
716
+ cmplx * restrict ch, const cmplx * restrict wa, const int sign)
717
+ {
718
+ const size_t cdim=11;
719
+ const double tw1r = 0.8412535328311811688618,
720
+ tw1i = sign * 0.5406408174555975821076,
721
+ tw2r = 0.4154150130018864255293,
722
+ tw2i = sign * 0.9096319953545183714117,
723
+ tw3r = -0.1423148382732851404438,
724
+ tw3i = sign * 0.9898214418809327323761,
725
+ tw4r = -0.6548607339452850640569,
726
+ tw4i = sign * 0.755749574354258283774,
727
+ tw5r = -0.9594929736144973898904,
728
+ tw5i = sign * 0.2817325568414296977114;
729
+
730
+ if (ido==1)
731
+ for (size_t k=0; k<l1; ++k)
732
+ {
733
+ PREP11(0)
734
+ PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
735
+ PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
736
+ PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
737
+ PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
738
+ PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
739
+ }
740
+ else
741
+ for (size_t k=0; k<l1; ++k)
742
+ {
743
+ {
744
+ PREP11(0)
745
+ PARTSTEP11a(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
746
+ PARTSTEP11a(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
747
+ PARTSTEP11a(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
748
+ PARTSTEP11a(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
749
+ PARTSTEP11a(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
750
+ }
751
+ for (size_t i=1; i<ido; ++i)
752
+ {
753
+ PREP11(i)
754
+ PARTSTEP11(1,10,tw1r,tw2r,tw3r,tw4r,tw5r,+tw1i,+tw2i,+tw3i,+tw4i,+tw5i)
755
+ PARTSTEP11(2, 9,tw2r,tw4r,tw5r,tw3r,tw1r,+tw2i,+tw4i,-tw5i,-tw3i,-tw1i)
756
+ PARTSTEP11(3, 8,tw3r,tw5r,tw2r,tw1r,tw4r,+tw3i,-tw5i,-tw2i,+tw1i,+tw4i)
757
+ PARTSTEP11(4, 7,tw4r,tw3r,tw1r,tw5r,tw2r,+tw4i,-tw3i,+tw1i,+tw5i,-tw2i)
758
+ PARTSTEP11(5, 6,tw5r,tw1r,tw4r,tw2r,tw3r,+tw5i,-tw1i,+tw4i,-tw2i,+tw3i)
759
+ }
760
+ }
761
+ }
762
+
763
+ #define CX(a,b,c) cc[(a)+ido*((b)+l1*(c))]
764
+ #define CX2(a,b) cc[(a)+idl1*(b)]
765
+ #define CH2(a,b) ch[(a)+idl1*(b)]
766
+
767
+ NOINLINE static int passg (size_t ido, size_t ip, size_t l1,
768
+ cmplx * restrict cc, cmplx * restrict ch, const cmplx * restrict wa,
769
+ const cmplx * restrict csarr, const int sign)
770
+ {
771
+ const size_t cdim=ip;
772
+ size_t ipph = (ip+1)/2;
773
+ size_t idl1 = ido*l1;
774
+
775
+ cmplx * restrict wal=RALLOC(cmplx,ip);
776
+ if (!wal) return -1;
777
+ wal[0]=(cmplx){1.,0.};
778
+ for (size_t i=1; i<ip; ++i)
779
+ wal[i]=(cmplx){csarr[i].r,sign*csarr[i].i};
780
+
781
+ for (size_t k=0; k<l1; ++k)
782
+ for (size_t i=0; i<ido; ++i)
783
+ CH(i,k,0) = CC(i,0,k);
784
+ for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
785
+ for (size_t k=0; k<l1; ++k)
786
+ for (size_t i=0; i<ido; ++i)
787
+ PMC(CH(i,k,j),CH(i,k,jc),CC(i,j,k),CC(i,jc,k))
788
+ for (size_t k=0; k<l1; ++k)
789
+ for (size_t i=0; i<ido; ++i)
790
+ {
791
+ cmplx tmp = CH(i,k,0);
792
+ for (size_t j=1; j<ipph; ++j)
793
+ ADDC(tmp,tmp,CH(i,k,j))
794
+ CX(i,k,0) = tmp;
795
+ }
796
+ for (size_t l=1, lc=ip-1; l<ipph; ++l, --lc)
797
+ {
798
+ // j=0
799
+ for (size_t ik=0; ik<idl1; ++ik)
800
+ {
801
+ CX2(ik,l).r = CH2(ik,0).r+wal[l].r*CH2(ik,1).r+wal[2*l].r*CH2(ik,2).r;
802
+ CX2(ik,l).i = CH2(ik,0).i+wal[l].r*CH2(ik,1).i+wal[2*l].r*CH2(ik,2).i;
803
+ CX2(ik,lc).r=-wal[l].i*CH2(ik,ip-1).i-wal[2*l].i*CH2(ik,ip-2).i;
804
+ CX2(ik,lc).i=wal[l].i*CH2(ik,ip-1).r+wal[2*l].i*CH2(ik,ip-2).r;
805
+ }
806
+
807
+ size_t iwal=2*l;
808
+ size_t j=3, jc=ip-3;
809
+ for (; j<ipph-1; j+=2, jc-=2)
810
+ {
811
+ iwal+=l; if (iwal>ip) iwal-=ip;
812
+ cmplx xwal=wal[iwal];
813
+ iwal+=l; if (iwal>ip) iwal-=ip;
814
+ cmplx xwal2=wal[iwal];
815
+ for (size_t ik=0; ik<idl1; ++ik)
816
+ {
817
+ CX2(ik,l).r += CH2(ik,j).r*xwal.r+CH2(ik,j+1).r*xwal2.r;
818
+ CX2(ik,l).i += CH2(ik,j).i*xwal.r+CH2(ik,j+1).i*xwal2.r;
819
+ CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i+CH2(ik,jc-1).i*xwal2.i;
820
+ CX2(ik,lc).i += CH2(ik,jc).r*xwal.i+CH2(ik,jc-1).r*xwal2.i;
821
+ }
822
+ }
823
+ for (; j<ipph; ++j, --jc)
824
+ {
825
+ iwal+=l; if (iwal>ip) iwal-=ip;
826
+ cmplx xwal=wal[iwal];
827
+ for (size_t ik=0; ik<idl1; ++ik)
828
+ {
829
+ CX2(ik,l).r += CH2(ik,j).r*xwal.r;
830
+ CX2(ik,l).i += CH2(ik,j).i*xwal.r;
831
+ CX2(ik,lc).r -= CH2(ik,jc).i*xwal.i;
832
+ CX2(ik,lc).i += CH2(ik,jc).r*xwal.i;
833
+ }
834
+ }
835
+ }
836
+ DEALLOC(wal);
837
+
838
+ // shuffling and twiddling
839
+ if (ido==1)
840
+ for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc)
841
+ for (size_t ik=0; ik<idl1; ++ik)
842
+ {
843
+ cmplx t1=CX2(ik,j), t2=CX2(ik,jc);
844
+ PMC(CX2(ik,j),CX2(ik,jc),t1,t2)
845
+ }
846
+ else
847
+ {
848
+ for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc)
849
+ for (size_t k=0; k<l1; ++k)
850
+ {
851
+ cmplx t1=CX(0,k,j), t2=CX(0,k,jc);
852
+ PMC(CX(0,k,j),CX(0,k,jc),t1,t2)
853
+ for (size_t i=1; i<ido; ++i)
854
+ {
855
+ cmplx x1, x2;
856
+ PMC(x1,x2,CX(i,k,j),CX(i,k,jc))
857
+ size_t idij=(j-1)*(ido-1)+i-1;
858
+ MULPMSIGNC (CX(i,k,j),wa[idij],x1)
859
+ idij=(jc-1)*(ido-1)+i-1;
860
+ MULPMSIGNC (CX(i,k,jc),wa[idij],x2)
861
+ }
862
+ }
863
+ }
864
+ return 0;
865
+ }
866
+
867
+ #undef CH2
868
+ #undef CX2
869
+ #undef CX
870
+
871
+ NOINLINE WARN_UNUSED_RESULT static int pass_all(cfftp_plan plan, cmplx c[], double fct,
872
+ const int sign)
873
+ {
874
+ if (plan->length==1) return 0;
875
+ size_t len=plan->length;
876
+ size_t l1=1, nf=plan->nfct;
877
+ cmplx *ch = RALLOC(cmplx, len);
878
+ if (!ch) return -1;
879
+ cmplx *p1=c, *p2=ch;
880
+
881
+ for(size_t k1=0; k1<nf; k1++)
882
+ {
883
+ size_t ip=plan->fct[k1].fct;
884
+ size_t l2=ip*l1;
885
+ size_t ido = len/l2;
886
+ if (ip==4)
887
+ sign>0 ? pass4b (ido, l1, p1, p2, plan->fct[k1].tw)
888
+ : pass4f (ido, l1, p1, p2, plan->fct[k1].tw);
889
+ else if(ip==2)
890
+ sign>0 ? pass2b (ido, l1, p1, p2, plan->fct[k1].tw)
891
+ : pass2f (ido, l1, p1, p2, plan->fct[k1].tw);
892
+ else if(ip==3)
893
+ sign>0 ? pass3b (ido, l1, p1, p2, plan->fct[k1].tw)
894
+ : pass3f (ido, l1, p1, p2, plan->fct[k1].tw);
895
+ else if(ip==5)
896
+ sign>0 ? pass5b (ido, l1, p1, p2, plan->fct[k1].tw)
897
+ : pass5f (ido, l1, p1, p2, plan->fct[k1].tw);
898
+ else if(ip==7) pass7 (ido, l1, p1, p2, plan->fct[k1].tw, sign);
899
+ else if(ip==11) pass11(ido, l1, p1, p2, plan->fct[k1].tw, sign);
900
+ else
901
+ {
902
+ if (passg(ido, ip, l1, p1, p2, plan->fct[k1].tw, plan->fct[k1].tws, sign))
903
+ { DEALLOC(ch); return -1; }
904
+ SWAP(p1,p2,cmplx *);
905
+ }
906
+ SWAP(p1,p2,cmplx *);
907
+ l1=l2;
908
+ }
909
+ if (p1!=c)
910
+ {
911
+ if (fct!=1.)
912
+ for (size_t i=0; i<len; ++i)
913
+ {
914
+ c[i].r = ch[i].r*fct;
915
+ c[i].i = ch[i].i*fct;
916
+ }
917
+ else
918
+ memcpy (c,p1,len*sizeof(cmplx));
919
+ }
920
+ else
921
+ if (fct!=1.)
922
+ for (size_t i=0; i<len; ++i)
923
+ {
924
+ c[i].r *= fct;
925
+ c[i].i *= fct;
926
+ }
927
+ DEALLOC(ch);
928
+ return 0;
929
+ }
930
+
931
+ #undef PMSIGNC
932
+ #undef A_EQ_B_MUL_C
933
+ #undef A_EQ_CB_MUL_C
934
+ #undef MULPMSIGNC
935
+ #undef MULPMSIGNCEQ
936
+
937
+ #undef WA
938
+ #undef CC
939
+ #undef CH
940
+ #undef ROT90
941
+ #undef SCALEC
942
+ #undef ADDC
943
+ #undef PMC
944
+
945
+ NOINLINE WARN_UNUSED_RESULT
946
+ static int cfftp_forward(cfftp_plan plan, double c[], double fct)
947
+ { return pass_all(plan,(cmplx *)c, fct, -1); }
948
+
949
+ NOINLINE WARN_UNUSED_RESULT
950
+ static int cfftp_backward(cfftp_plan plan, double c[], double fct)
951
+ { return pass_all(plan,(cmplx *)c, fct, 1); }
952
+
953
+ NOINLINE WARN_UNUSED_RESULT
954
+ static int cfftp_factorize (cfftp_plan plan)
955
+ {
956
+ size_t length=plan->length;
957
+ size_t nfct=0;
958
+ while ((length%4)==0)
959
+ { if (nfct>=NFCT) return -1; plan->fct[nfct++].fct=4; length>>=2; }
960
+ if ((length%2)==0)
961
+ {
962
+ length>>=1;
963
+ // factor 2 should be at the front of the factor list
964
+ if (nfct>=NFCT) return -1;
965
+ plan->fct[nfct++].fct=2;
966
+ SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t);
967
+ }
968
+ size_t maxl=(size_t)(sqrt((double)length))+1;
969
+ for (size_t divisor=3; (length>1)&&(divisor<maxl); divisor+=2)
970
+ if ((length%divisor)==0)
971
+ {
972
+ while ((length%divisor)==0)
973
+ {
974
+ if (nfct>=NFCT) return -1;
975
+ plan->fct[nfct++].fct=divisor;
976
+ length/=divisor;
977
+ }
978
+ maxl=(size_t)(sqrt((double)length))+1;
979
+ }
980
+ if (length>1) plan->fct[nfct++].fct=length;
981
+ plan->nfct=nfct;
982
+ return 0;
983
+ }
984
+
985
+ NOINLINE static size_t cfftp_twsize (cfftp_plan plan)
986
+ {
987
+ size_t twsize=0, l1=1;
988
+ for (size_t k=0; k<plan->nfct; ++k)
989
+ {
990
+ size_t ip=plan->fct[k].fct, ido= plan->length/(l1*ip);
991
+ twsize+=(ip-1)*(ido-1);
992
+ if (ip>11)
993
+ twsize+=ip;
994
+ l1*=ip;
995
+ }
996
+ return twsize;
997
+ }
998
+
999
+ NOINLINE WARN_UNUSED_RESULT static int cfftp_comp_twiddle (cfftp_plan plan)
1000
+ {
1001
+ size_t length=plan->length;
1002
+ double *twid = RALLOC(double, 2*length);
1003
+ if (!twid) return -1;
1004
+ sincos_2pibyn(length, twid);
1005
+ size_t l1=1;
1006
+ size_t memofs=0;
1007
+ for (size_t k=0; k<plan->nfct; ++k)
1008
+ {
1009
+ size_t ip=plan->fct[k].fct, ido= length/(l1*ip);
1010
+ plan->fct[k].tw=plan->mem+memofs;
1011
+ memofs+=(ip-1)*(ido-1);
1012
+ for (size_t j=1; j<ip; ++j)
1013
+ for (size_t i=1; i<ido; ++i)
1014
+ {
1015
+ plan->fct[k].tw[(j-1)*(ido-1)+i-1].r = twid[2*j*l1*i];
1016
+ plan->fct[k].tw[(j-1)*(ido-1)+i-1].i = twid[2*j*l1*i+1];
1017
+ }
1018
+ if (ip>11)
1019
+ {
1020
+ plan->fct[k].tws=plan->mem+memofs;
1021
+ memofs+=ip;
1022
+ for (size_t j=0; j<ip; ++j)
1023
+ {
1024
+ plan->fct[k].tws[j].r = twid[2*j*l1*ido];
1025
+ plan->fct[k].tws[j].i = twid[2*j*l1*ido+1];
1026
+ }
1027
+ }
1028
+ l1*=ip;
1029
+ }
1030
+ DEALLOC(twid);
1031
+ return 0;
1032
+ }
1033
+
1034
+ static cfftp_plan make_cfftp_plan (size_t length)
1035
+ {
1036
+ if (length==0) return NULL;
1037
+ cfftp_plan plan = RALLOC(cfftp_plan_i,1);
1038
+ if (!plan) return NULL;
1039
+ plan->length=length;
1040
+ plan->nfct=0;
1041
+ for (size_t i=0; i<NFCT; ++i)
1042
+ plan->fct[i]=(cfftp_fctdata){0,0,0};
1043
+ plan->mem=0;
1044
+ if (length==1) return plan;
1045
+ if (cfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; }
1046
+ size_t tws=cfftp_twsize(plan);
1047
+ plan->mem=RALLOC(cmplx,tws);
1048
+ if (!plan->mem) { DEALLOC(plan); return NULL; }
1049
+ if (cfftp_comp_twiddle(plan)!=0)
1050
+ { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
1051
+ return plan;
1052
+ }
1053
+
1054
+ static void destroy_cfftp_plan (cfftp_plan plan)
1055
+ {
1056
+ DEALLOC(plan->mem);
1057
+ DEALLOC(plan);
1058
+ }
1059
+
1060
+ typedef struct rfftp_fctdata
1061
+ {
1062
+ size_t fct;
1063
+ double *tw, *tws;
1064
+ } rfftp_fctdata;
1065
+
1066
+ typedef struct rfftp_plan_i
1067
+ {
1068
+ size_t length, nfct;
1069
+ double *mem;
1070
+ rfftp_fctdata fct[NFCT];
1071
+ } rfftp_plan_i;
1072
+ typedef struct rfftp_plan_i * rfftp_plan;
1073
+
1074
+ #define WA(x,i) wa[(i)+(x)*(ido-1)]
1075
+ #define PM(a,b,c,d) { a=c+d; b=c-d; }
1076
+ /* (a+ib) = conj(c+id) * (e+if) */
1077
+ #define MULPM(a,b,c,d,e,f) { a=c*e+d*f; b=c*f-d*e; }
1078
+
1079
+ #define CC(a,b,c) cc[(a)+ido*((b)+l1*(c))]
1080
+ #define CH(a,b,c) ch[(a)+ido*((b)+cdim*(c))]
1081
+
1082
+ NOINLINE static void radf2 (size_t ido, size_t l1, const double * restrict cc,
1083
+ double * restrict ch, const double * restrict wa)
1084
+ {
1085
+ const size_t cdim=2;
1086
+
1087
+ for (size_t k=0; k<l1; k++)
1088
+ PM (CH(0,0,k),CH(ido-1,1,k),CC(0,k,0),CC(0,k,1))
1089
+ if ((ido&1)==0)
1090
+ for (size_t k=0; k<l1; k++)
1091
+ {
1092
+ CH( 0,1,k) = -CC(ido-1,k,1);
1093
+ CH(ido-1,0,k) = CC(ido-1,k,0);
1094
+ }
1095
+ if (ido<=2) return;
1096
+ for (size_t k=0; k<l1; k++)
1097
+ for (size_t i=2; i<ido; i+=2)
1098
+ {
1099
+ size_t ic=ido-i;
1100
+ double tr2, ti2;
1101
+ MULPM (tr2,ti2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
1102
+ PM (CH(i-1,0,k),CH(ic-1,1,k),CC(i-1,k,0),tr2)
1103
+ PM (CH(i ,0,k),CH(ic ,1,k),ti2,CC(i ,k,0))
1104
+ }
1105
+ }
1106
+
1107
+ NOINLINE static void radf3(size_t ido, size_t l1, const double * restrict cc,
1108
+ double * restrict ch, const double * restrict wa)
1109
+ {
1110
+ const size_t cdim=3;
1111
+ static const double taur=-0.5, taui=0.86602540378443864676;
1112
+
1113
+ for (size_t k=0; k<l1; k++)
1114
+ {
1115
+ double cr2=CC(0,k,1)+CC(0,k,2);
1116
+ CH(0,0,k) = CC(0,k,0)+cr2;
1117
+ CH(0,2,k) = taui*(CC(0,k,2)-CC(0,k,1));
1118
+ CH(ido-1,1,k) = CC(0,k,0)+taur*cr2;
1119
+ }
1120
+ if (ido==1) return;
1121
+ for (size_t k=0; k<l1; k++)
1122
+ for (size_t i=2; i<ido; i+=2)
1123
+ {
1124
+ size_t ic=ido-i;
1125
+ double di2, di3, dr2, dr3;
1126
+ MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1)) // d2=conj(WA0)*CC1
1127
+ MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2)) // d3=conj(WA1)*CC2
1128
+ double cr2=dr2+dr3; // c add
1129
+ double ci2=di2+di3;
1130
+ CH(i-1,0,k) = CC(i-1,k,0)+cr2; // c add
1131
+ CH(i ,0,k) = CC(i ,k,0)+ci2;
1132
+ double tr2 = CC(i-1,k,0)+taur*cr2; // c add
1133
+ double ti2 = CC(i ,k,0)+taur*ci2;
1134
+ double tr3 = taui*(di2-di3); // t3 = taui*i*(d3-d2)?
1135
+ double ti3 = taui*(dr3-dr2);
1136
+ PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr3) // PM(i) = t2+t3
1137
+ PM(CH(i ,2,k),CH(ic ,1,k),ti3,ti2) // PM(ic) = conj(t2-t3)
1138
+ }
1139
+ }
1140
+
1141
+ NOINLINE static void radf4(size_t ido, size_t l1, const double * restrict cc,
1142
+ double * restrict ch, const double * restrict wa)
1143
+ {
1144
+ const size_t cdim=4;
1145
+ static const double hsqt2=0.70710678118654752440;
1146
+
1147
+ for (size_t k=0; k<l1; k++)
1148
+ {
1149
+ double tr1,tr2;
1150
+ PM (tr1,CH(0,2,k),CC(0,k,3),CC(0,k,1))
1151
+ PM (tr2,CH(ido-1,1,k),CC(0,k,0),CC(0,k,2))
1152
+ PM (CH(0,0,k),CH(ido-1,3,k),tr2,tr1)
1153
+ }
1154
+ if ((ido&1)==0)
1155
+ for (size_t k=0; k<l1; k++)
1156
+ {
1157
+ double ti1=-hsqt2*(CC(ido-1,k,1)+CC(ido-1,k,3));
1158
+ double tr1= hsqt2*(CC(ido-1,k,1)-CC(ido-1,k,3));
1159
+ PM (CH(ido-1,0,k),CH(ido-1,2,k),CC(ido-1,k,0),tr1)
1160
+ PM (CH( 0,3,k),CH( 0,1,k),ti1,CC(ido-1,k,2))
1161
+ }
1162
+ if (ido<=2) return;
1163
+ for (size_t k=0; k<l1; k++)
1164
+ for (size_t i=2; i<ido; i+=2)
1165
+ {
1166
+ size_t ic=ido-i;
1167
+ double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
1168
+ MULPM(cr2,ci2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
1169
+ MULPM(cr3,ci3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
1170
+ MULPM(cr4,ci4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3))
1171
+ PM(tr1,tr4,cr4,cr2)
1172
+ PM(ti1,ti4,ci2,ci4)
1173
+ PM(tr2,tr3,CC(i-1,k,0),cr3)
1174
+ PM(ti2,ti3,CC(i ,k,0),ci3)
1175
+ PM(CH(i-1,0,k),CH(ic-1,3,k),tr2,tr1)
1176
+ PM(CH(i ,0,k),CH(ic ,3,k),ti1,ti2)
1177
+ PM(CH(i-1,2,k),CH(ic-1,1,k),tr3,ti4)
1178
+ PM(CH(i ,2,k),CH(ic ,1,k),tr4,ti3)
1179
+ }
1180
+ }
1181
+
1182
+ NOINLINE static void radf5(size_t ido, size_t l1, const double * restrict cc,
1183
+ double * restrict ch, const double * restrict wa)
1184
+ {
1185
+ const size_t cdim=5;
1186
+ static const double tr11= 0.3090169943749474241, ti11=0.95105651629515357212,
1187
+ tr12=-0.8090169943749474241, ti12=0.58778525229247312917;
1188
+
1189
+ for (size_t k=0; k<l1; k++)
1190
+ {
1191
+ double cr2, cr3, ci4, ci5;
1192
+ PM (cr2,ci5,CC(0,k,4),CC(0,k,1))
1193
+ PM (cr3,ci4,CC(0,k,3),CC(0,k,2))
1194
+ CH(0,0,k)=CC(0,k,0)+cr2+cr3;
1195
+ CH(ido-1,1,k)=CC(0,k,0)+tr11*cr2+tr12*cr3;
1196
+ CH(0,2,k)=ti11*ci5+ti12*ci4;
1197
+ CH(ido-1,3,k)=CC(0,k,0)+tr12*cr2+tr11*cr3;
1198
+ CH(0,4,k)=ti12*ci5-ti11*ci4;
1199
+ }
1200
+ if (ido==1) return;
1201
+ for (size_t k=0; k<l1;++k)
1202
+ for (size_t i=2; i<ido; i+=2)
1203
+ {
1204
+ double ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3,
1205
+ dr4, dr5, cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
1206
+ size_t ic=ido-i;
1207
+ MULPM (dr2,di2,WA(0,i-2),WA(0,i-1),CC(i-1,k,1),CC(i,k,1))
1208
+ MULPM (dr3,di3,WA(1,i-2),WA(1,i-1),CC(i-1,k,2),CC(i,k,2))
1209
+ MULPM (dr4,di4,WA(2,i-2),WA(2,i-1),CC(i-1,k,3),CC(i,k,3))
1210
+ MULPM (dr5,di5,WA(3,i-2),WA(3,i-1),CC(i-1,k,4),CC(i,k,4))
1211
+ PM(cr2,ci5,dr5,dr2)
1212
+ PM(ci2,cr5,di2,di5)
1213
+ PM(cr3,ci4,dr4,dr3)
1214
+ PM(ci3,cr4,di3,di4)
1215
+ CH(i-1,0,k)=CC(i-1,k,0)+cr2+cr3;
1216
+ CH(i ,0,k)=CC(i ,k,0)+ci2+ci3;
1217
+ tr2=CC(i-1,k,0)+tr11*cr2+tr12*cr3;
1218
+ ti2=CC(i ,k,0)+tr11*ci2+tr12*ci3;
1219
+ tr3=CC(i-1,k,0)+tr12*cr2+tr11*cr3;
1220
+ ti3=CC(i ,k,0)+tr12*ci2+tr11*ci3;
1221
+ MULPM(tr5,tr4,cr5,cr4,ti11,ti12)
1222
+ MULPM(ti5,ti4,ci5,ci4,ti11,ti12)
1223
+ PM(CH(i-1,2,k),CH(ic-1,1,k),tr2,tr5)
1224
+ PM(CH(i ,2,k),CH(ic ,1,k),ti5,ti2)
1225
+ PM(CH(i-1,4,k),CH(ic-1,3,k),tr3,tr4)
1226
+ PM(CH(i ,4,k),CH(ic ,3,k),ti4,ti3)
1227
+ }
1228
+ }
1229
+
1230
+ #undef CC
1231
+ #undef CH
1232
+ #define C1(a,b,c) cc[(a)+ido*((b)+l1*(c))]
1233
+ #define C2(a,b) cc[(a)+idl1*(b)]
1234
+ #define CH2(a,b) ch[(a)+idl1*(b)]
1235
+ #define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
1236
+ #define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
1237
+ NOINLINE static void radfg(size_t ido, size_t ip, size_t l1,
1238
+ double * restrict cc, double * restrict ch, const double * restrict wa,
1239
+ const double * restrict csarr)
1240
+ {
1241
+ const size_t cdim=ip;
1242
+ size_t ipph=(ip+1)/2;
1243
+ size_t idl1 = ido*l1;
1244
+
1245
+ if (ido>1)
1246
+ {
1247
+ for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 114
1248
+ {
1249
+ size_t is=(j-1)*(ido-1),
1250
+ is2=(jc-1)*(ido-1);
1251
+ for (size_t k=0; k<l1; ++k) // 113
1252
+ {
1253
+ size_t idij=is;
1254
+ size_t idij2=is2;
1255
+ for (size_t i=1; i<=ido-2; i+=2) // 112
1256
+ {
1257
+ double t1=C1(i,k,j ), t2=C1(i+1,k,j ),
1258
+ t3=C1(i,k,jc), t4=C1(i+1,k,jc);
1259
+ double x1=wa[idij]*t1 + wa[idij+1]*t2,
1260
+ x2=wa[idij]*t2 - wa[idij+1]*t1,
1261
+ x3=wa[idij2]*t3 + wa[idij2+1]*t4,
1262
+ x4=wa[idij2]*t4 - wa[idij2+1]*t3;
1263
+ C1(i ,k,j ) = x1+x3;
1264
+ C1(i ,k,jc) = x2-x4;
1265
+ C1(i+1,k,j ) = x2+x4;
1266
+ C1(i+1,k,jc) = x3-x1;
1267
+ idij+=2;
1268
+ idij2+=2;
1269
+ }
1270
+ }
1271
+ }
1272
+ }
1273
+
1274
+ for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 123
1275
+ for (size_t k=0; k<l1; ++k) // 122
1276
+ {
1277
+ double t1=C1(0,k,j), t2=C1(0,k,jc);
1278
+ C1(0,k,j ) = t1+t2;
1279
+ C1(0,k,jc) = t2-t1;
1280
+ }
1281
+
1282
+ //everything in C
1283
+ //memset(ch,0,ip*l1*ido*sizeof(double));
1284
+
1285
+ for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc) // 127
1286
+ {
1287
+ for (size_t ik=0; ik<idl1; ++ik) // 124
1288
+ {
1289
+ CH2(ik,l ) = C2(ik,0)+csarr[2*l]*C2(ik,1)+csarr[4*l]*C2(ik,2);
1290
+ CH2(ik,lc) = csarr[2*l+1]*C2(ik,ip-1)+csarr[4*l+1]*C2(ik,ip-2);
1291
+ }
1292
+ size_t iang = 2*l;
1293
+ size_t j=3, jc=ip-3;
1294
+ for (; j<ipph-3; j+=4,jc-=4) // 126
1295
+ {
1296
+ iang+=l; if (iang>=ip) iang-=ip;
1297
+ double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1298
+ iang+=l; if (iang>=ip) iang-=ip;
1299
+ double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1300
+ iang+=l; if (iang>=ip) iang-=ip;
1301
+ double ar3=csarr[2*iang], ai3=csarr[2*iang+1];
1302
+ iang+=l; if (iang>=ip) iang-=ip;
1303
+ double ar4=csarr[2*iang], ai4=csarr[2*iang+1];
1304
+ for (size_t ik=0; ik<idl1; ++ik) // 125
1305
+ {
1306
+ CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1)
1307
+ +ar3*C2(ik,j +2)+ar4*C2(ik,j +3);
1308
+ CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1)
1309
+ +ai3*C2(ik,jc-2)+ai4*C2(ik,jc-3);
1310
+ }
1311
+ }
1312
+ for (; j<ipph-1; j+=2,jc-=2) // 126
1313
+ {
1314
+ iang+=l; if (iang>=ip) iang-=ip;
1315
+ double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1316
+ iang+=l; if (iang>=ip) iang-=ip;
1317
+ double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1318
+ for (size_t ik=0; ik<idl1; ++ik) // 125
1319
+ {
1320
+ CH2(ik,l ) += ar1*C2(ik,j )+ar2*C2(ik,j +1);
1321
+ CH2(ik,lc) += ai1*C2(ik,jc)+ai2*C2(ik,jc-1);
1322
+ }
1323
+ }
1324
+ for (; j<ipph; ++j,--jc) // 126
1325
+ {
1326
+ iang+=l; if (iang>=ip) iang-=ip;
1327
+ double ar=csarr[2*iang], ai=csarr[2*iang+1];
1328
+ for (size_t ik=0; ik<idl1; ++ik) // 125
1329
+ {
1330
+ CH2(ik,l ) += ar*C2(ik,j );
1331
+ CH2(ik,lc) += ai*C2(ik,jc);
1332
+ }
1333
+ }
1334
+ }
1335
+ for (size_t ik=0; ik<idl1; ++ik) // 101
1336
+ CH2(ik,0) = C2(ik,0);
1337
+ for (size_t j=1; j<ipph; ++j) // 129
1338
+ for (size_t ik=0; ik<idl1; ++ik) // 128
1339
+ CH2(ik,0) += C2(ik,j);
1340
+
1341
+ // everything in CH at this point!
1342
+ //memset(cc,0,ip*l1*ido*sizeof(double));
1343
+
1344
+ for (size_t k=0; k<l1; ++k) // 131
1345
+ for (size_t i=0; i<ido; ++i) // 130
1346
+ CC(i,0,k) = CH(i,k,0);
1347
+
1348
+ for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 137
1349
+ {
1350
+ size_t j2=2*j-1;
1351
+ for (size_t k=0; k<l1; ++k) // 136
1352
+ {
1353
+ CC(ido-1,j2,k) = CH(0,k,j);
1354
+ CC(0,j2+1,k) = CH(0,k,jc);
1355
+ }
1356
+ }
1357
+
1358
+ if (ido==1) return;
1359
+
1360
+ for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 140
1361
+ {
1362
+ size_t j2=2*j-1;
1363
+ for(size_t k=0; k<l1; ++k) // 139
1364
+ for(size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2) // 138
1365
+ {
1366
+ CC(i ,j2+1,k) = CH(i ,k,j )+CH(i ,k,jc);
1367
+ CC(ic ,j2 ,k) = CH(i ,k,j )-CH(i ,k,jc);
1368
+ CC(i+1 ,j2+1,k) = CH(i+1,k,j )+CH(i+1,k,jc);
1369
+ CC(ic+1,j2 ,k) = CH(i+1,k,jc)-CH(i+1,k,j );
1370
+ }
1371
+ }
1372
+ }
1373
+ #undef C1
1374
+ #undef C2
1375
+ #undef CH2
1376
+
1377
+ #undef CH
1378
+ #undef CC
1379
+ #define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
1380
+ #define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
1381
+
1382
+ NOINLINE static void radb2(size_t ido, size_t l1, const double * restrict cc,
1383
+ double * restrict ch, const double * restrict wa)
1384
+ {
1385
+ const size_t cdim=2;
1386
+
1387
+ for (size_t k=0; k<l1; k++)
1388
+ PM (CH(0,k,0),CH(0,k,1),CC(0,0,k),CC(ido-1,1,k))
1389
+ if ((ido&1)==0)
1390
+ for (size_t k=0; k<l1; k++)
1391
+ {
1392
+ CH(ido-1,k,0) = 2.*CC(ido-1,0,k);
1393
+ CH(ido-1,k,1) =-2.*CC(0 ,1,k);
1394
+ }
1395
+ if (ido<=2) return;
1396
+ for (size_t k=0; k<l1;++k)
1397
+ for (size_t i=2; i<ido; i+=2)
1398
+ {
1399
+ size_t ic=ido-i;
1400
+ double ti2, tr2;
1401
+ PM (CH(i-1,k,0),tr2,CC(i-1,0,k),CC(ic-1,1,k))
1402
+ PM (ti2,CH(i ,k,0),CC(i ,0,k),CC(ic ,1,k))
1403
+ MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ti2,tr2)
1404
+ }
1405
+ }
1406
+
1407
+ NOINLINE static void radb3(size_t ido, size_t l1, const double * restrict cc,
1408
+ double * restrict ch, const double * restrict wa)
1409
+ {
1410
+ const size_t cdim=3;
1411
+ static const double taur=-0.5, taui=0.86602540378443864676;
1412
+
1413
+ for (size_t k=0; k<l1; k++)
1414
+ {
1415
+ double tr2=2.*CC(ido-1,1,k);
1416
+ double cr2=CC(0,0,k)+taur*tr2;
1417
+ CH(0,k,0)=CC(0,0,k)+tr2;
1418
+ double ci3=2.*taui*CC(0,2,k);
1419
+ PM (CH(0,k,2),CH(0,k,1),cr2,ci3);
1420
+ }
1421
+ if (ido==1) return;
1422
+ for (size_t k=0; k<l1; k++)
1423
+ for (size_t i=2; i<ido; i+=2)
1424
+ {
1425
+ size_t ic=ido-i;
1426
+ double tr2=CC(i-1,2,k)+CC(ic-1,1,k); // t2=CC(I) + conj(CC(ic))
1427
+ double ti2=CC(i ,2,k)-CC(ic ,1,k);
1428
+ double cr2=CC(i-1,0,k)+taur*tr2; // c2=CC +taur*t2
1429
+ double ci2=CC(i ,0,k)+taur*ti2;
1430
+ CH(i-1,k,0)=CC(i-1,0,k)+tr2; // CH=CC+t2
1431
+ CH(i ,k,0)=CC(i ,0,k)+ti2;
1432
+ double cr3=taui*(CC(i-1,2,k)-CC(ic-1,1,k));// c3=taui*(CC(i)-conj(CC(ic)))
1433
+ double ci3=taui*(CC(i ,2,k)+CC(ic ,1,k));
1434
+ double di2, di3, dr2, dr3;
1435
+ PM(dr3,dr2,cr2,ci3) // d2= (cr2-ci3, ci2+cr3) = c2+i*c3
1436
+ PM(di2,di3,ci2,cr3) // d3= (cr2+ci3, ci2-cr3) = c2-i*c3
1437
+ MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2) // ch = WA*d2
1438
+ MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3)
1439
+ }
1440
+ }
1441
+
1442
+ NOINLINE static void radb4(size_t ido, size_t l1, const double * restrict cc,
1443
+ double * restrict ch, const double * restrict wa)
1444
+ {
1445
+ const size_t cdim=4;
1446
+ static const double sqrt2=1.41421356237309504880;
1447
+
1448
+ for (size_t k=0; k<l1; k++)
1449
+ {
1450
+ double tr1, tr2;
1451
+ PM (tr2,tr1,CC(0,0,k),CC(ido-1,3,k))
1452
+ double tr3=2.*CC(ido-1,1,k);
1453
+ double tr4=2.*CC(0,2,k);
1454
+ PM (CH(0,k,0),CH(0,k,2),tr2,tr3)
1455
+ PM (CH(0,k,3),CH(0,k,1),tr1,tr4)
1456
+ }
1457
+ if ((ido&1)==0)
1458
+ for (size_t k=0; k<l1; k++)
1459
+ {
1460
+ double tr1,tr2,ti1,ti2;
1461
+ PM (ti1,ti2,CC(0 ,3,k),CC(0 ,1,k))
1462
+ PM (tr2,tr1,CC(ido-1,0,k),CC(ido-1,2,k))
1463
+ CH(ido-1,k,0)=tr2+tr2;
1464
+ CH(ido-1,k,1)=sqrt2*(tr1-ti1);
1465
+ CH(ido-1,k,2)=ti2+ti2;
1466
+ CH(ido-1,k,3)=-sqrt2*(tr1+ti1);
1467
+ }
1468
+ if (ido<=2) return;
1469
+ for (size_t k=0; k<l1;++k)
1470
+ for (size_t i=2; i<ido; i+=2)
1471
+ {
1472
+ double ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
1473
+ size_t ic=ido-i;
1474
+ PM (tr2,tr1,CC(i-1,0,k),CC(ic-1,3,k))
1475
+ PM (ti1,ti2,CC(i ,0,k),CC(ic ,3,k))
1476
+ PM (tr4,ti3,CC(i ,2,k),CC(ic ,1,k))
1477
+ PM (tr3,ti4,CC(i-1,2,k),CC(ic-1,1,k))
1478
+ PM (CH(i-1,k,0),cr3,tr2,tr3)
1479
+ PM (CH(i ,k,0),ci3,ti2,ti3)
1480
+ PM (cr4,cr2,tr1,tr4)
1481
+ PM (ci2,ci4,ti1,ti4)
1482
+ MULPM (CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),ci2,cr2)
1483
+ MULPM (CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),ci3,cr3)
1484
+ MULPM (CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),ci4,cr4)
1485
+ }
1486
+ }
1487
+
1488
+ NOINLINE static void radb5(size_t ido, size_t l1, const double * restrict cc,
1489
+ double * restrict ch, const double * restrict wa)
1490
+ {
1491
+ const size_t cdim=5;
1492
+ static const double tr11= 0.3090169943749474241, ti11=0.95105651629515357212,
1493
+ tr12=-0.8090169943749474241, ti12=0.58778525229247312917;
1494
+
1495
+ for (size_t k=0; k<l1; k++)
1496
+ {
1497
+ double ti5=CC(0,2,k)+CC(0,2,k);
1498
+ double ti4=CC(0,4,k)+CC(0,4,k);
1499
+ double tr2=CC(ido-1,1,k)+CC(ido-1,1,k);
1500
+ double tr3=CC(ido-1,3,k)+CC(ido-1,3,k);
1501
+ CH(0,k,0)=CC(0,0,k)+tr2+tr3;
1502
+ double cr2=CC(0,0,k)+tr11*tr2+tr12*tr3;
1503
+ double cr3=CC(0,0,k)+tr12*tr2+tr11*tr3;
1504
+ double ci4, ci5;
1505
+ MULPM(ci5,ci4,ti5,ti4,ti11,ti12)
1506
+ PM(CH(0,k,4),CH(0,k,1),cr2,ci5)
1507
+ PM(CH(0,k,3),CH(0,k,2),cr3,ci4)
1508
+ }
1509
+ if (ido==1) return;
1510
+ for (size_t k=0; k<l1;++k)
1511
+ for (size_t i=2; i<ido; i+=2)
1512
+ {
1513
+ size_t ic=ido-i;
1514
+ double tr2, tr3, tr4, tr5, ti2, ti3, ti4, ti5;
1515
+ PM(tr2,tr5,CC(i-1,2,k),CC(ic-1,1,k))
1516
+ PM(ti5,ti2,CC(i ,2,k),CC(ic ,1,k))
1517
+ PM(tr3,tr4,CC(i-1,4,k),CC(ic-1,3,k))
1518
+ PM(ti4,ti3,CC(i ,4,k),CC(ic ,3,k))
1519
+ CH(i-1,k,0)=CC(i-1,0,k)+tr2+tr3;
1520
+ CH(i ,k,0)=CC(i ,0,k)+ti2+ti3;
1521
+ double cr2=CC(i-1,0,k)+tr11*tr2+tr12*tr3;
1522
+ double ci2=CC(i ,0,k)+tr11*ti2+tr12*ti3;
1523
+ double cr3=CC(i-1,0,k)+tr12*tr2+tr11*tr3;
1524
+ double ci3=CC(i ,0,k)+tr12*ti2+tr11*ti3;
1525
+ double ci4, ci5, cr5, cr4;
1526
+ MULPM(cr5,cr4,tr5,tr4,ti11,ti12)
1527
+ MULPM(ci5,ci4,ti5,ti4,ti11,ti12)
1528
+ double dr2, dr3, dr4, dr5, di2, di3, di4, di5;
1529
+ PM(dr4,dr3,cr3,ci4)
1530
+ PM(di3,di4,ci3,cr4)
1531
+ PM(dr5,dr2,cr2,ci5)
1532
+ PM(di2,di5,ci2,cr5)
1533
+ MULPM(CH(i,k,1),CH(i-1,k,1),WA(0,i-2),WA(0,i-1),di2,dr2)
1534
+ MULPM(CH(i,k,2),CH(i-1,k,2),WA(1,i-2),WA(1,i-1),di3,dr3)
1535
+ MULPM(CH(i,k,3),CH(i-1,k,3),WA(2,i-2),WA(2,i-1),di4,dr4)
1536
+ MULPM(CH(i,k,4),CH(i-1,k,4),WA(3,i-2),WA(3,i-1),di5,dr5)
1537
+ }
1538
+ }
1539
+
1540
+ #undef CC
1541
+ #undef CH
1542
+ #define CC(a,b,c) cc[(a)+ido*((b)+cdim*(c))]
1543
+ #define CH(a,b,c) ch[(a)+ido*((b)+l1*(c))]
1544
+ #define C1(a,b,c) cc[(a)+ido*((b)+l1*(c))]
1545
+ #define C2(a,b) cc[(a)+idl1*(b)]
1546
+ #define CH2(a,b) ch[(a)+idl1*(b)]
1547
+
1548
+ NOINLINE static void radbg(size_t ido, size_t ip, size_t l1,
1549
+ double * restrict cc, double * restrict ch, const double * restrict wa,
1550
+ const double * restrict csarr)
1551
+ {
1552
+ const size_t cdim=ip;
1553
+ size_t ipph=(ip+1)/ 2;
1554
+ size_t idl1 = ido*l1;
1555
+
1556
+ for (size_t k=0; k<l1; ++k) // 102
1557
+ for (size_t i=0; i<ido; ++i) // 101
1558
+ CH(i,k,0) = CC(i,0,k);
1559
+ for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc) // 108
1560
+ {
1561
+ size_t j2=2*j-1;
1562
+ for (size_t k=0; k<l1; ++k)
1563
+ {
1564
+ CH(0,k,j ) = 2*CC(ido-1,j2,k);
1565
+ CH(0,k,jc) = 2*CC(0,j2+1,k);
1566
+ }
1567
+ }
1568
+
1569
+ if (ido!=1)
1570
+ {
1571
+ for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 111
1572
+ {
1573
+ size_t j2=2*j-1;
1574
+ for (size_t k=0; k<l1; ++k)
1575
+ for (size_t i=1, ic=ido-i-2; i<=ido-2; i+=2, ic-=2) // 109
1576
+ {
1577
+ CH(i ,k,j ) = CC(i ,j2+1,k)+CC(ic ,j2,k);
1578
+ CH(i ,k,jc) = CC(i ,j2+1,k)-CC(ic ,j2,k);
1579
+ CH(i+1,k,j ) = CC(i+1,j2+1,k)-CC(ic+1,j2,k);
1580
+ CH(i+1,k,jc) = CC(i+1,j2+1,k)+CC(ic+1,j2,k);
1581
+ }
1582
+ }
1583
+ }
1584
+ for (size_t l=1,lc=ip-1; l<ipph; ++l,--lc)
1585
+ {
1586
+ for (size_t ik=0; ik<idl1; ++ik)
1587
+ {
1588
+ C2(ik,l ) = CH2(ik,0)+csarr[2*l]*CH2(ik,1)+csarr[4*l]*CH2(ik,2);
1589
+ C2(ik,lc) = csarr[2*l+1]*CH2(ik,ip-1)+csarr[4*l+1]*CH2(ik,ip-2);
1590
+ }
1591
+ size_t iang=2*l;
1592
+ size_t j=3,jc=ip-3;
1593
+ for(; j<ipph-3; j+=4,jc-=4)
1594
+ {
1595
+ iang+=l; if(iang>ip) iang-=ip;
1596
+ double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1597
+ iang+=l; if(iang>ip) iang-=ip;
1598
+ double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1599
+ iang+=l; if(iang>ip) iang-=ip;
1600
+ double ar3=csarr[2*iang], ai3=csarr[2*iang+1];
1601
+ iang+=l; if(iang>ip) iang-=ip;
1602
+ double ar4=csarr[2*iang], ai4=csarr[2*iang+1];
1603
+ for (size_t ik=0; ik<idl1; ++ik)
1604
+ {
1605
+ C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1)
1606
+ +ar3*CH2(ik,j +2)+ar4*CH2(ik,j +3);
1607
+ C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1)
1608
+ +ai3*CH2(ik,jc-2)+ai4*CH2(ik,jc-3);
1609
+ }
1610
+ }
1611
+ for(; j<ipph-1; j+=2,jc-=2)
1612
+ {
1613
+ iang+=l; if(iang>ip) iang-=ip;
1614
+ double ar1=csarr[2*iang], ai1=csarr[2*iang+1];
1615
+ iang+=l; if(iang>ip) iang-=ip;
1616
+ double ar2=csarr[2*iang], ai2=csarr[2*iang+1];
1617
+ for (size_t ik=0; ik<idl1; ++ik)
1618
+ {
1619
+ C2(ik,l ) += ar1*CH2(ik,j )+ar2*CH2(ik,j +1);
1620
+ C2(ik,lc) += ai1*CH2(ik,jc)+ai2*CH2(ik,jc-1);
1621
+ }
1622
+ }
1623
+ for(; j<ipph; ++j,--jc)
1624
+ {
1625
+ iang+=l; if(iang>ip) iang-=ip;
1626
+ double war=csarr[2*iang], wai=csarr[2*iang+1];
1627
+ for (size_t ik=0; ik<idl1; ++ik)
1628
+ {
1629
+ C2(ik,l ) += war*CH2(ik,j );
1630
+ C2(ik,lc) += wai*CH2(ik,jc);
1631
+ }
1632
+ }
1633
+ }
1634
+ for (size_t j=1; j<ipph; ++j)
1635
+ for (size_t ik=0; ik<idl1; ++ik)
1636
+ CH2(ik,0) += CH2(ik,j);
1637
+ for (size_t j=1, jc=ip-1; j<ipph; ++j,--jc) // 124
1638
+ for (size_t k=0; k<l1; ++k)
1639
+ {
1640
+ CH(0,k,j ) = C1(0,k,j)-C1(0,k,jc);
1641
+ CH(0,k,jc) = C1(0,k,j)+C1(0,k,jc);
1642
+ }
1643
+
1644
+ if (ido==1) return;
1645
+
1646
+ for (size_t j=1, jc=ip-1; j<ipph; ++j, --jc) // 127
1647
+ for (size_t k=0; k<l1; ++k)
1648
+ for (size_t i=1; i<=ido-2; i+=2)
1649
+ {
1650
+ CH(i ,k,j ) = C1(i ,k,j)-C1(i+1,k,jc);
1651
+ CH(i ,k,jc) = C1(i ,k,j)+C1(i+1,k,jc);
1652
+ CH(i+1,k,j ) = C1(i+1,k,j)+C1(i ,k,jc);
1653
+ CH(i+1,k,jc) = C1(i+1,k,j)-C1(i ,k,jc);
1654
+ }
1655
+
1656
+ // All in CH
1657
+
1658
+ for (size_t j=1; j<ip; ++j)
1659
+ {
1660
+ size_t is = (j-1)*(ido-1);
1661
+ for (size_t k=0; k<l1; ++k)
1662
+ {
1663
+ size_t idij = is;
1664
+ for (size_t i=1; i<=ido-2; i+=2)
1665
+ {
1666
+ double t1=CH(i,k,j), t2=CH(i+1,k,j);
1667
+ CH(i ,k,j) = wa[idij]*t1-wa[idij+1]*t2;
1668
+ CH(i+1,k,j) = wa[idij]*t2+wa[idij+1]*t1;
1669
+ idij+=2;
1670
+ }
1671
+ }
1672
+ }
1673
+ }
1674
+ #undef C1
1675
+ #undef C2
1676
+ #undef CH2
1677
+
1678
+ #undef CC
1679
+ #undef CH
1680
+ #undef PM
1681
+ #undef MULPM
1682
+ #undef WA
1683
+
1684
+ static void copy_and_norm(double *c, double *p1, size_t n, double fct)
1685
+ {
1686
+ if (p1!=c)
1687
+ {
1688
+ if (fct!=1.)
1689
+ for (size_t i=0; i<n; ++i)
1690
+ c[i] = fct*p1[i];
1691
+ else
1692
+ memcpy (c,p1,n*sizeof(double));
1693
+ }
1694
+ else
1695
+ if (fct!=1.)
1696
+ for (size_t i=0; i<n; ++i)
1697
+ c[i] *= fct;
1698
+ }
1699
+
1700
+ WARN_UNUSED_RESULT
1701
+ static int rfftp_forward(rfftp_plan plan, double c[], double fct)
1702
+ {
1703
+ if (plan->length==1) return 0;
1704
+ size_t n=plan->length;
1705
+ size_t l1=n, nf=plan->nfct;
1706
+ double *ch = RALLOC(double, n);
1707
+ if (!ch) return -1;
1708
+ double *p1=c, *p2=ch;
1709
+
1710
+ for(size_t k1=0; k1<nf;++k1)
1711
+ {
1712
+ size_t k=nf-k1-1;
1713
+ size_t ip=plan->fct[k].fct;
1714
+ size_t ido=n / l1;
1715
+ l1 /= ip;
1716
+ if(ip==4)
1717
+ radf4(ido, l1, p1, p2, plan->fct[k].tw);
1718
+ else if(ip==2)
1719
+ radf2(ido, l1, p1, p2, plan->fct[k].tw);
1720
+ else if(ip==3)
1721
+ radf3(ido, l1, p1, p2, plan->fct[k].tw);
1722
+ else if(ip==5)
1723
+ radf5(ido, l1, p1, p2, plan->fct[k].tw);
1724
+ else
1725
+ {
1726
+ radfg(ido, ip, l1, p1, p2, plan->fct[k].tw, plan->fct[k].tws);
1727
+ SWAP (p1,p2,double *);
1728
+ }
1729
+ SWAP (p1,p2,double *);
1730
+ }
1731
+ copy_and_norm(c,p1,n,fct);
1732
+ DEALLOC(ch);
1733
+ return 0;
1734
+ }
1735
+
1736
+ WARN_UNUSED_RESULT
1737
+ static int rfftp_backward(rfftp_plan plan, double c[], double fct)
1738
+ {
1739
+ if (plan->length==1) return 0;
1740
+ size_t n=plan->length;
1741
+ size_t l1=1, nf=plan->nfct;
1742
+ double *ch = RALLOC(double, n);
1743
+ if (!ch) return -1;
1744
+ double *p1=c, *p2=ch;
1745
+
1746
+ for(size_t k=0; k<nf; k++)
1747
+ {
1748
+ size_t ip = plan->fct[k].fct,
1749
+ ido= n/(ip*l1);
1750
+ if(ip==4)
1751
+ radb4(ido, l1, p1, p2, plan->fct[k].tw);
1752
+ else if(ip==2)
1753
+ radb2(ido, l1, p1, p2, plan->fct[k].tw);
1754
+ else if(ip==3)
1755
+ radb3(ido, l1, p1, p2, plan->fct[k].tw);
1756
+ else if(ip==5)
1757
+ radb5(ido, l1, p1, p2, plan->fct[k].tw);
1758
+ else
1759
+ radbg(ido, ip, l1, p1, p2, plan->fct[k].tw, plan->fct[k].tws);
1760
+ SWAP (p1,p2,double *);
1761
+ l1*=ip;
1762
+ }
1763
+ copy_and_norm(c,p1,n,fct);
1764
+ DEALLOC(ch);
1765
+ return 0;
1766
+ }
1767
+
1768
+ WARN_UNUSED_RESULT
1769
+ static int rfftp_factorize (rfftp_plan plan)
1770
+ {
1771
+ size_t length=plan->length;
1772
+ size_t nfct=0;
1773
+ while ((length%4)==0)
1774
+ { if (nfct>=NFCT) return -1; plan->fct[nfct++].fct=4; length>>=2; }
1775
+ if ((length%2)==0)
1776
+ {
1777
+ length>>=1;
1778
+ // factor 2 should be at the front of the factor list
1779
+ if (nfct>=NFCT) return -1;
1780
+ plan->fct[nfct++].fct=2;
1781
+ SWAP(plan->fct[0].fct, plan->fct[nfct-1].fct,size_t);
1782
+ }
1783
+ size_t maxl=(size_t)(sqrt((double)length))+1;
1784
+ for (size_t divisor=3; (length>1)&&(divisor<maxl); divisor+=2)
1785
+ if ((length%divisor)==0)
1786
+ {
1787
+ while ((length%divisor)==0)
1788
+ {
1789
+ if (nfct>=NFCT) return -1;
1790
+ plan->fct[nfct++].fct=divisor;
1791
+ length/=divisor;
1792
+ }
1793
+ maxl=(size_t)(sqrt((double)length))+1;
1794
+ }
1795
+ if (length>1) plan->fct[nfct++].fct=length;
1796
+ plan->nfct=nfct;
1797
+ return 0;
1798
+ }
1799
+
1800
+ static size_t rfftp_twsize(rfftp_plan plan)
1801
+ {
1802
+ size_t twsize=0, l1=1;
1803
+ for (size_t k=0; k<plan->nfct; ++k)
1804
+ {
1805
+ size_t ip=plan->fct[k].fct, ido= plan->length/(l1*ip);
1806
+ twsize+=(ip-1)*(ido-1);
1807
+ if (ip>5) twsize+=2*ip;
1808
+ l1*=ip;
1809
+ }
1810
+ return twsize;
1811
+ return 0;
1812
+ }
1813
+
1814
+ WARN_UNUSED_RESULT NOINLINE static int rfftp_comp_twiddle (rfftp_plan plan)
1815
+ {
1816
+ size_t length=plan->length;
1817
+ double *twid = RALLOC(double, 2*length);
1818
+ if (!twid) return -1;
1819
+ sincos_2pibyn_half(length, twid);
1820
+ size_t l1=1;
1821
+ double *ptr=plan->mem;
1822
+ for (size_t k=0; k<plan->nfct; ++k)
1823
+ {
1824
+ size_t ip=plan->fct[k].fct, ido=length/(l1*ip);
1825
+ if (k<plan->nfct-1) // last factor doesn't need twiddles
1826
+ {
1827
+ plan->fct[k].tw=ptr; ptr+=(ip-1)*(ido-1);
1828
+ for (size_t j=1; j<ip; ++j)
1829
+ for (size_t i=1; i<=(ido-1)/2; ++i)
1830
+ {
1831
+ plan->fct[k].tw[(j-1)*(ido-1)+2*i-2] = twid[2*j*l1*i];
1832
+ plan->fct[k].tw[(j-1)*(ido-1)+2*i-1] = twid[2*j*l1*i+1];
1833
+ }
1834
+ }
1835
+ if (ip>5) // special factors required by *g functions
1836
+ {
1837
+ plan->fct[k].tws=ptr; ptr+=2*ip;
1838
+ plan->fct[k].tws[0] = 1.;
1839
+ plan->fct[k].tws[1] = 0.;
1840
+ for (size_t i=1; i<=(ip>>1); ++i)
1841
+ {
1842
+ plan->fct[k].tws[2*i ] = twid[2*i*(length/ip)];
1843
+ plan->fct[k].tws[2*i+1] = twid[2*i*(length/ip)+1];
1844
+ plan->fct[k].tws[2*(ip-i) ] = twid[2*i*(length/ip)];
1845
+ plan->fct[k].tws[2*(ip-i)+1] = -twid[2*i*(length/ip)+1];
1846
+ }
1847
+ }
1848
+ l1*=ip;
1849
+ }
1850
+ DEALLOC(twid);
1851
+ return 0;
1852
+ }
1853
+
1854
+ NOINLINE static rfftp_plan make_rfftp_plan (size_t length)
1855
+ {
1856
+ if (length==0) return NULL;
1857
+ rfftp_plan plan = RALLOC(rfftp_plan_i,1);
1858
+ if (!plan) return NULL;
1859
+ plan->length=length;
1860
+ plan->nfct=0;
1861
+ plan->mem=NULL;
1862
+ for (size_t i=0; i<NFCT; ++i)
1863
+ plan->fct[i]=(rfftp_fctdata){0,0,0};
1864
+ if (length==1) return plan;
1865
+ if (rfftp_factorize(plan)!=0) { DEALLOC(plan); return NULL; }
1866
+ size_t tws=rfftp_twsize(plan);
1867
+ plan->mem=RALLOC(double,tws);
1868
+ if (!plan->mem) { DEALLOC(plan); return NULL; }
1869
+ if (rfftp_comp_twiddle(plan)!=0)
1870
+ { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
1871
+ return plan;
1872
+ }
1873
+
1874
+ NOINLINE static void destroy_rfftp_plan (rfftp_plan plan)
1875
+ {
1876
+ DEALLOC(plan->mem);
1877
+ DEALLOC(plan);
1878
+ }
1879
+
1880
+ typedef struct fftblue_plan_i
1881
+ {
1882
+ size_t n, n2;
1883
+ cfftp_plan plan;
1884
+ double *mem;
1885
+ double *bk, *bkf;
1886
+ } fftblue_plan_i;
1887
+ typedef struct fftblue_plan_i * fftblue_plan;
1888
+
1889
+ NOINLINE static fftblue_plan make_fftblue_plan (size_t length)
1890
+ {
1891
+ fftblue_plan plan = RALLOC(fftblue_plan_i,1);
1892
+ if (!plan) return NULL;
1893
+ plan->n = length;
1894
+ plan->n2 = good_size(plan->n*2-1);
1895
+ plan->mem = RALLOC(double, 2*plan->n+2*plan->n2);
1896
+ if (!plan->mem) { DEALLOC(plan); return NULL; }
1897
+ plan->bk = plan->mem;
1898
+ plan->bkf = plan->bk+2*plan->n;
1899
+
1900
+ /* initialize b_k */
1901
+ double *tmp = RALLOC(double,4*plan->n);
1902
+ if (!tmp) { DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
1903
+ sincos_2pibyn(2*plan->n,tmp);
1904
+ plan->bk[0] = 1;
1905
+ plan->bk[1] = 0;
1906
+
1907
+ size_t coeff=0;
1908
+ for (size_t m=1; m<plan->n; ++m)
1909
+ {
1910
+ coeff+=2*m-1;
1911
+ if (coeff>=2*plan->n) coeff-=2*plan->n;
1912
+ plan->bk[2*m ] = tmp[2*coeff ];
1913
+ plan->bk[2*m+1] = tmp[2*coeff+1];
1914
+ }
1915
+
1916
+ /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. */
1917
+ double xn2 = 1./plan->n2;
1918
+ plan->bkf[0] = plan->bk[0]*xn2;
1919
+ plan->bkf[1] = plan->bk[1]*xn2;
1920
+ for (size_t m=2; m<2*plan->n; m+=2)
1921
+ {
1922
+ plan->bkf[m] = plan->bkf[2*plan->n2-m] = plan->bk[m] *xn2;
1923
+ plan->bkf[m+1] = plan->bkf[2*plan->n2-m+1] = plan->bk[m+1] *xn2;
1924
+ }
1925
+ for (size_t m=2*plan->n;m<=(2*plan->n2-2*plan->n+1);++m)
1926
+ plan->bkf[m]=0.;
1927
+ plan->plan=make_cfftp_plan(plan->n2);
1928
+ if (!plan->plan)
1929
+ { DEALLOC(tmp); DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
1930
+ if (cfftp_forward(plan->plan,plan->bkf,1.)!=0)
1931
+ { DEALLOC(tmp); DEALLOC(plan->mem); DEALLOC(plan); return NULL; }
1932
+ DEALLOC(tmp);
1933
+
1934
+ return plan;
1935
+ }
1936
+
1937
+ NOINLINE static void destroy_fftblue_plan (fftblue_plan plan)
1938
+ {
1939
+ DEALLOC(plan->mem);
1940
+ destroy_cfftp_plan(plan->plan);
1941
+ DEALLOC(plan);
1942
+ }
1943
+
1944
+ NOINLINE WARN_UNUSED_RESULT
1945
+ static int fftblue_fft(fftblue_plan plan, double c[], int isign, double fct)
1946
+ {
1947
+ size_t n=plan->n;
1948
+ size_t n2=plan->n2;
1949
+ double *bk = plan->bk;
1950
+ double *bkf = plan->bkf;
1951
+ double *akf = RALLOC(double, 2*n2);
1952
+ if (!akf) return -1;
1953
+
1954
+ /* initialize a_k and FFT it */
1955
+ if (isign>0)
1956
+ for (size_t m=0; m<2*n; m+=2)
1957
+ {
1958
+ akf[m] = c[m]*bk[m] - c[m+1]*bk[m+1];
1959
+ akf[m+1] = c[m]*bk[m+1] + c[m+1]*bk[m];
1960
+ }
1961
+ else
1962
+ for (size_t m=0; m<2*n; m+=2)
1963
+ {
1964
+ akf[m] = c[m]*bk[m] + c[m+1]*bk[m+1];
1965
+ akf[m+1] =-c[m]*bk[m+1] + c[m+1]*bk[m];
1966
+ }
1967
+ for (size_t m=2*n; m<2*n2; ++m)
1968
+ akf[m]=0;
1969
+
1970
+ if (cfftp_forward (plan->plan,akf,fct)!=0)
1971
+ { DEALLOC(akf); return -1; }
1972
+
1973
+ /* do the convolution */
1974
+ if (isign>0)
1975
+ for (size_t m=0; m<2*n2; m+=2)
1976
+ {
1977
+ double im = -akf[m]*bkf[m+1] + akf[m+1]*bkf[m];
1978
+ akf[m ] = akf[m]*bkf[m] + akf[m+1]*bkf[m+1];
1979
+ akf[m+1] = im;
1980
+ }
1981
+ else
1982
+ for (size_t m=0; m<2*n2; m+=2)
1983
+ {
1984
+ double im = akf[m]*bkf[m+1] + akf[m+1]*bkf[m];
1985
+ akf[m ] = akf[m]*bkf[m] - akf[m+1]*bkf[m+1];
1986
+ akf[m+1] = im;
1987
+ }
1988
+
1989
+ /* inverse FFT */
1990
+ if (cfftp_backward (plan->plan,akf,1.)!=0)
1991
+ { DEALLOC(akf); return -1; }
1992
+
1993
+ /* multiply by b_k */
1994
+ if (isign>0)
1995
+ for (size_t m=0; m<2*n; m+=2)
1996
+ {
1997
+ c[m] = bk[m] *akf[m] - bk[m+1]*akf[m+1];
1998
+ c[m+1] = bk[m+1]*akf[m] + bk[m] *akf[m+1];
1999
+ }
2000
+ else
2001
+ for (size_t m=0; m<2*n; m+=2)
2002
+ {
2003
+ c[m] = bk[m] *akf[m] + bk[m+1]*akf[m+1];
2004
+ c[m+1] =-bk[m+1]*akf[m] + bk[m] *akf[m+1];
2005
+ }
2006
+ DEALLOC(akf);
2007
+ return 0;
2008
+ }
2009
+
2010
+ WARN_UNUSED_RESULT
2011
+ static int cfftblue_backward(fftblue_plan plan, double c[], double fct)
2012
+ { return fftblue_fft(plan,c,1,fct); }
2013
+
2014
+ WARN_UNUSED_RESULT
2015
+ static int cfftblue_forward(fftblue_plan plan, double c[], double fct)
2016
+ { return fftblue_fft(plan,c,-1,fct); }
2017
+
2018
+ WARN_UNUSED_RESULT
2019
+ static int rfftblue_backward(fftblue_plan plan, double c[], double fct)
2020
+ {
2021
+ size_t n=plan->n;
2022
+ double *tmp = RALLOC(double,2*n);
2023
+ if (!tmp) return -1;
2024
+ tmp[0]=c[0];
2025
+ tmp[1]=0.;
2026
+ memcpy (tmp+2,c+1, (n-1)*sizeof(double));
2027
+ if ((n&1)==0) tmp[n+1]=0.;
2028
+ for (size_t m=2; m<n; m+=2)
2029
+ {
2030
+ tmp[2*n-m]=tmp[m];
2031
+ tmp[2*n-m+1]=-tmp[m+1];
2032
+ }
2033
+ if (fftblue_fft(plan,tmp,1,fct)!=0)
2034
+ { DEALLOC(tmp); return -1; }
2035
+ for (size_t m=0; m<n; ++m)
2036
+ c[m] = tmp[2*m];
2037
+ DEALLOC(tmp);
2038
+ return 0;
2039
+ }
2040
+
2041
+ WARN_UNUSED_RESULT
2042
+ static int rfftblue_forward(fftblue_plan plan, double c[], double fct)
2043
+ {
2044
+ size_t n=plan->n;
2045
+ double *tmp = RALLOC(double,2*n);
2046
+ if (!tmp) return -1;
2047
+ for (size_t m=0; m<n; ++m)
2048
+ {
2049
+ tmp[2*m] = c[m];
2050
+ tmp[2*m+1] = 0.;
2051
+ }
2052
+ if (fftblue_fft(plan,tmp,-1,fct)!=0)
2053
+ { DEALLOC(tmp); return -1; }
2054
+ c[0] = tmp[0];
2055
+ memcpy (c+1, tmp+2, (n-1)*sizeof(double));
2056
+ DEALLOC(tmp);
2057
+ return 0;
2058
+ }
2059
+
2060
+ typedef struct cfft_plan_i
2061
+ {
2062
+ cfftp_plan packplan;
2063
+ fftblue_plan blueplan;
2064
+ } cfft_plan_i;
2065
+
2066
+ cfft_plan make_cfft_plan (size_t length)
2067
+ {
2068
+ if (length==0) return NULL;
2069
+ cfft_plan plan = RALLOC(cfft_plan_i,1);
2070
+ if (!plan) return NULL;
2071
+ plan->blueplan=0;
2072
+ plan->packplan=0;
2073
+ if ((length<50) || (largest_prime_factor(length)<=sqrt(length)))
2074
+ {
2075
+ plan->packplan=make_cfftp_plan(length);
2076
+ if (!plan->packplan) { DEALLOC(plan); return NULL; }
2077
+ return plan;
2078
+ }
2079
+ double comp1 = cost_guess(length);
2080
+ double comp2 = 2*cost_guess(good_size(2*length-1));
2081
+ comp2*=1.5; /* fudge factor that appears to give good overall performance */
2082
+ if (comp2<comp1) // use Bluestein
2083
+ {
2084
+ plan->blueplan=make_fftblue_plan(length);
2085
+ if (!plan->blueplan) { DEALLOC(plan); return NULL; }
2086
+ }
2087
+ else
2088
+ {
2089
+ plan->packplan=make_cfftp_plan(length);
2090
+ if (!plan->packplan) { DEALLOC(plan); return NULL; }
2091
+ }
2092
+ return plan;
2093
+ }
2094
+
2095
+ void destroy_cfft_plan (cfft_plan plan)
2096
+ {
2097
+ if (plan->blueplan)
2098
+ destroy_fftblue_plan(plan->blueplan);
2099
+ if (plan->packplan)
2100
+ destroy_cfftp_plan(plan->packplan);
2101
+ DEALLOC(plan);
2102
+ }
2103
+
2104
+ WARN_UNUSED_RESULT int cfft_backward(cfft_plan plan, double c[], double fct)
2105
+ {
2106
+ if (plan->packplan)
2107
+ return cfftp_backward(plan->packplan,c,fct);
2108
+ // if (plan->blueplan)
2109
+ return cfftblue_backward(plan->blueplan,c,fct);
2110
+ }
2111
+
2112
+ WARN_UNUSED_RESULT int cfft_forward(cfft_plan plan, double c[], double fct)
2113
+ {
2114
+ if (plan->packplan)
2115
+ return cfftp_forward(plan->packplan,c,fct);
2116
+ // if (plan->blueplan)
2117
+ return cfftblue_forward(plan->blueplan,c,fct);
2118
+ }
2119
+
2120
+ typedef struct rfft_plan_i
2121
+ {
2122
+ rfftp_plan packplan;
2123
+ fftblue_plan blueplan;
2124
+ } rfft_plan_i;
2125
+
2126
+ rfft_plan make_rfft_plan (size_t length)
2127
+ {
2128
+ if (length==0) return NULL;
2129
+ rfft_plan plan = RALLOC(rfft_plan_i,1);
2130
+ if (!plan) return NULL;
2131
+ plan->blueplan=0;
2132
+ plan->packplan=0;
2133
+ if ((length<50) || (largest_prime_factor(length)<=sqrt(length)))
2134
+ {
2135
+ plan->packplan=make_rfftp_plan(length);
2136
+ if (!plan->packplan) { DEALLOC(plan); return NULL; }
2137
+ return plan;
2138
+ }
2139
+ double comp1 = 0.5*cost_guess(length);
2140
+ double comp2 = 2*cost_guess(good_size(2*length-1));
2141
+ comp2*=1.5; /* fudge factor that appears to give good overall performance */
2142
+ if (comp2<comp1) // use Bluestein
2143
+ {
2144
+ plan->blueplan=make_fftblue_plan(length);
2145
+ if (!plan->blueplan) { DEALLOC(plan); return NULL; }
2146
+ }
2147
+ else
2148
+ {
2149
+ plan->packplan=make_rfftp_plan(length);
2150
+ if (!plan->packplan) { DEALLOC(plan); return NULL; }
2151
+ }
2152
+ return plan;
2153
+ }
2154
+
2155
+ void destroy_rfft_plan (rfft_plan plan)
2156
+ {
2157
+ if (plan->blueplan)
2158
+ destroy_fftblue_plan(plan->blueplan);
2159
+ if (plan->packplan)
2160
+ destroy_rfftp_plan(plan->packplan);
2161
+ DEALLOC(plan);
2162
+ }
2163
+
2164
+ size_t rfft_length(rfft_plan plan)
2165
+ {
2166
+ if (plan->packplan) return plan->packplan->length;
2167
+ return plan->blueplan->n;
2168
+ }
2169
+
2170
+ size_t cfft_length(cfft_plan plan)
2171
+ {
2172
+ if (plan->packplan) return plan->packplan->length;
2173
+ return plan->blueplan->n;
2174
+ }
2175
+
2176
+ WARN_UNUSED_RESULT int rfft_backward(rfft_plan plan, double c[], double fct)
2177
+ {
2178
+ if (plan->packplan)
2179
+ return rfftp_backward(plan->packplan,c,fct);
2180
+ else // if (plan->blueplan)
2181
+ return rfftblue_backward(plan->blueplan,c,fct);
2182
+ }
2183
+
2184
+ WARN_UNUSED_RESULT int rfft_forward(rfft_plan plan, double c[], double fct)
2185
+ {
2186
+ if (plan->packplan)
2187
+ return rfftp_forward(plan->packplan,c,fct);
2188
+ else // if (plan->blueplan)
2189
+ return rfftblue_forward(plan->blueplan,c,fct);
2190
+ }