paddlec 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,2645 @@
1
+ /* Copyright (C) 2019 Théotime Bollengier <theotime.bollengier@gmail.com>
2
+ *
3
+ * This file is part of PaddleC
4
+ *
5
+ * PaddleC is free software: you can redistribute it and/or modify
6
+ * it under the terms of the GNU General Public License as published by
7
+ * the Free Software Foundation, either version 3 of the License, or
8
+ * (at your option) any later version.
9
+ *
10
+ * PaddleC is distributed in the hope that it will be useful,
11
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ * GNU General Public License for more details.
14
+ *
15
+ * You should have received a copy of the GNU General Public License
16
+ * along with PaddleC. If not, see <https://www.gnu.org/licenses/>.
17
+ */
18
+
19
+ #include <immintrin.h>
20
+
21
+ #if !(defined __FMA__) && defined __FMA4__
22
+ #include <x86intrin.h>
23
+ #endif
24
+
25
+
26
+ void pdlc_fir_filter_inspect(pdlc_fir_filter_t* fir)
27
+ {
28
+ size_t i, j;
29
+
30
+ printf("nb_coefs: %u, state_len: %u, coef_len: %u, index_mask: 0x%x, index: %u\n",
31
+ fir->nb_coefs, fir->state_len, fir->coef_len, fir->index_mask, fir->index);
32
+ printf("state: [%.7g", fir->stater[0]);
33
+ for (i = 1; i < fir->state_len; i++)
34
+ printf(", %.7g", fir->stater[i]);
35
+ printf("]\n");
36
+ for (j = 0; j < 8; j++) {
37
+ printf("coefs: {%lu}[%.7g", j, fir->coefs[j][0]);
38
+ for (i = 1; i < fir->coef_len; i++)
39
+ printf(", %.7g", fir->coefs[j][i]);
40
+ printf("]\n");
41
+ }
42
+ }
43
+
44
+
45
+ void pdlc_fir_filter_initialize(pdlc_fir_filter_t* fir, int order)
46
+ {
47
+ int i;
48
+
49
+ if (fir->coefs) {
50
+ for (i = 0; i < 8; i++)
51
+ if (fir->coefs[i])
52
+ _mm_free(fir->coefs[i]);
53
+ free(fir->coefs);
54
+ fir->coefs = NULL;
55
+ }
56
+
57
+ if (fir->stater)
58
+ _mm_free(fir->stater);
59
+ fir->stater = NULL;
60
+
61
+ if (fir->statei)
62
+ _mm_free(fir->statei);
63
+ fir->statei = NULL;
64
+
65
+ fir->nb_coefs = 0;
66
+ fir->state_len = 0;
67
+ fir->coef_len = 0;
68
+ fir->index = 0;
69
+ fir->index_mask = 0;
70
+ fir->counter = 0;
71
+ fir->max_counter = 1;
72
+
73
+ if (order < 0)
74
+ return;
75
+
76
+ if (order > 67108863) {
77
+ fprintf(stderr, "ERROR: libpaddlec: Filter order cannot be greater than 67108864\n");
78
+ exit(EXIT_FAILURE);
79
+ }
80
+
81
+ fir->nb_coefs = (unsigned int)(order + 1);
82
+ fir->coef_len = ((fir->nb_coefs + 7 + 7) >> 3) << 3;
83
+ fir->state_len = (unsigned int)(pow(2.0, ceil(log2(fir->coef_len))));
84
+ fir->index = 0;
85
+ fir->index_mask = fir->state_len - 1;
86
+
87
+ fir->coefs = malloc(8*sizeof(float*));
88
+ if (fir->coefs == NULL) {
89
+ fprintf(stderr, "ERROR: libpaddlec: Cannot allocate %lu bytes for FIR!\n", 8 * sizeof(float*));
90
+ exit(EXIT_FAILURE);
91
+ }
92
+
93
+ for (i = 0; i < 8; i++) {
94
+ fir->coefs[i] = _mm_malloc(fir->coef_len * sizeof(float), sizeof(__m256));
95
+ if (fir->coefs[i] == NULL) {
96
+ fprintf(stderr, "ERROR: libpaddlec: Cannot allocate %lu bytes for FIR!\n", fir->coef_len * sizeof(float));
97
+ exit(EXIT_FAILURE);
98
+ }
99
+ }
100
+
101
+ fir->stater = _mm_malloc(fir->state_len * sizeof(float), sizeof(__m256));
102
+ if (fir->stater == NULL) {
103
+ fprintf(stderr, "ERROR: libpaddlec: Cannot allocate %lu bytes for FIR!\n", fir->state_len * sizeof(float));
104
+ exit(EXIT_FAILURE);
105
+ }
106
+
107
+ fir->statei = _mm_malloc(fir->state_len * sizeof(float), sizeof(__m256));
108
+ if (fir->statei == NULL) {
109
+ fprintf(stderr, "ERROR: libpaddlec: Cannot allocate %lu bytes for FIR!\n", fir->state_len * sizeof(float));
110
+ exit(EXIT_FAILURE);
111
+ }
112
+
113
+ memset(fir->stater, 0, fir->state_len * sizeof(float));
114
+ memset(fir->statei, 0, fir->state_len * sizeof(float));
115
+ for (i = 0; i < 8; i++)
116
+ memset(fir->coefs[i], 0, fir->coef_len * sizeof(float));
117
+ }
118
+
119
+
120
+ void pdlc_fir_filter_free(pdlc_fir_filter_t* fir)
121
+ {
122
+ int i;
123
+
124
+ if (!fir)
125
+ return;
126
+
127
+ if (fir->coefs) {
128
+ for (i = 0; i < 8; i++)
129
+ if (fir->coefs[i])
130
+ _mm_free(fir->coefs[i]);
131
+ free(fir->coefs);
132
+ }
133
+
134
+ if (fir->stater)
135
+ _mm_free(fir->stater);
136
+
137
+ if (fir->statei)
138
+ _mm_free(fir->statei);
139
+
140
+ free(fir);
141
+ }
142
+
143
+
144
+ size_t pdlc_fir_filter_size(pdlc_fir_filter_t* fir)
145
+ {
146
+ size_t res;
147
+
148
+ res = sizeof(pdlc_fir_filter_t);
149
+ res += sizeof(float*)* 8;
150
+ res += sizeof(float) * fir->state_len * 2;
151
+ res += sizeof(float) * fir->coef_len * 8;
152
+
153
+ return res;
154
+ }
155
+
156
+
157
+ int pdlc_fir_filter_set_coef_at(pdlc_fir_filter_t* fir, int index, float value)
158
+ {
159
+ int i;
160
+
161
+ if (index < 0 || index >= (int)fir->nb_coefs)
162
+ return -1;
163
+
164
+ for (i = 0; i < 8; i++)
165
+ fir->coefs[i][(fir->nb_coefs - 1 - index + i) % fir->coef_len] = value;
166
+
167
+ return 0;
168
+ }
169
+
170
+
171
/* Push one real sample through the FIR filter and return the filtered
 * output. If `delayed` is non-NULL, it also receives the input sample
 * delayed by the filter's group delay (the center tap of the state window;
 * for an even number of coefficients, the average of the two center taps).
 * Uses one 8-wide AVX dot product over the circular state buffer, with FMA
 * when available. */
float pdlc_fir_filter_filter_float(pdlc_fir_filter_t* fir, float sample, float *delayed)
{
	const unsigned int nb_coefs = fir->nb_coefs;
	const unsigned int flt_len = fir->state_len;
	const unsigned int mask = fir->index_mask;
	/* Position of the oldest state sample that the filter taps. */
	const unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask;
	/* Center of the tap window, used for the group-delay output. */
	const unsigned int middle_index = (start_index + nb_coefs / 2) & mask;
	const unsigned int lensimd = fir->coef_len >> 3;   /* # of 8-float SIMD words */
	const unsigned int startsimd = start_index >> 3;
	const unsigned int masksimd = mask >> 3;           /* wrap mask in SIMD words */
	unsigned int i, j;
	register __m256 acc;
#if !(defined __FMA__ || defined __FMA4__)
	register __m256 prod;
#endif
	/* The coefficient copy whose phase matches start_index's alignment,
	 * so unaligned state positions still line up with aligned loads. */
	const __m256 *coefs = (__m256*)fir->coefs[start_index & 7];
	__m256 *stater = (__m256*)fir->stater;

	/* Insert the new sample into the circular state buffer. */
	fir->stater[fir->index] = sample;
	fir->index = (fir->index + 1) & mask;

	if (delayed) {
		if (nb_coefs & 1)
			*delayed = fir->stater[middle_index];
		else
			*delayed = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
	}

	/* 8-wide multiply-accumulate over the whole coefficient array; the
	 * state index wraps with masksimd. */
	acc = _mm256_setzero_ps();
	j = startsimd;
	for (i = 0; i < lensimd; i++) {
#if defined __FMA__
		acc = _mm256_fmadd_ps(coefs[i], stater[j], acc);
#elif defined __FMA4__
		acc = _mm256_macc_ps(coefs[i], stater[j], acc);
#else
		prod = _mm256_mul_ps(coefs[i], stater[j]);
		acc = _mm256_add_ps(acc, prod);
#endif
		j = (j+1) & masksimd;
	}

	/* Horizontal sum of the 8 accumulator lanes. */
	return acc[0] + acc[1] + acc[2] + acc[3] + acc[4] + acc[5] + acc[6] + acc[7];
}
215
+
216
+
217
/* Push one complex sample through the FIR filter and return the filtered
 * output. Real and imaginary parts are filtered independently with the
 * same (real) coefficients. If `delayed` is non-NULL, it receives the
 * input delayed by the filter's group delay (center tap; average of the
 * two center taps when nb_coefs is even). */
pdlc_complex_t pdlc_fir_filter_filter_complex(pdlc_fir_filter_t* fir, pdlc_complex_t sample, pdlc_complex_t *delayed)
{
	const unsigned int nb_coefs = fir->nb_coefs;
	const unsigned int flt_len = fir->state_len;
	const unsigned int mask = fir->index_mask;
	/* Oldest tapped state sample and center of the tap window. */
	const unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask;
	const unsigned int middle_index = (start_index + nb_coefs / 2) & mask;
	const unsigned int lensimd = fir->coef_len >> 3;   /* # of 8-float SIMD words */
	const unsigned int startsimd = start_index >> 3;
	const unsigned int masksimd = mask >> 3;           /* wrap mask in SIMD words */
	unsigned int i, j;
	pdlc_complex_t res = {0.0f, 0.0f};
	register __m256 accr, acci;
#if !(defined __FMA__ || defined __FMA4__)
	register __m256 prodr, prodi;
#endif
	/* Coefficient copy phase-aligned with start_index (see filter_float). */
	const __m256 *coefs = (__m256*)fir->coefs[start_index & 7];
	__m256 *stater = (__m256*)fir->stater;
	__m256 *statei = (__m256*)fir->statei;

	/* Insert the new sample into both circular state buffers. */
	fir->stater[fir->index] = sample.real;
	fir->statei[fir->index] = sample.imag;
	fir->index = (fir->index + 1) & mask;

	/* Two parallel 8-wide multiply-accumulates (real and imaginary). */
	accr = _mm256_setzero_ps();
	acci = _mm256_setzero_ps();
	j = startsimd;
	for (i = 0; i < lensimd; i++) {
#if defined __FMA__
		accr = _mm256_fmadd_ps(coefs[i], stater[j], accr);
		acci = _mm256_fmadd_ps(coefs[i], statei[j], acci);
#elif defined __FMA4__
		accr = _mm256_macc_ps(coefs[i], stater[j], accr);
		acci = _mm256_macc_ps(coefs[i], statei[j], acci);
#else
		prodr = _mm256_mul_ps(coefs[i], stater[j]);
		prodi = _mm256_mul_ps(coefs[i], statei[j]);
		accr = _mm256_add_ps(accr, prodr);
		acci = _mm256_add_ps(acci, prodi);
#endif
		j = (j+1) & masksimd;
	}
	/* Horizontal sums of the 8 accumulator lanes. */
	res.real = accr[0] + accr[1] + accr[2] + accr[3] + accr[4] + accr[5] + accr[6] + accr[7];
	res.imag = acci[0] + acci[1] + acci[2] + acci[3] + acci[4] + acci[5] + acci[6] + acci[7];

	if (delayed) {
		if (nb_coefs & 1) {
			delayed->real = fir->stater[middle_index];
			delayed->imag = fir->statei[middle_index];
		}
		else {
			delayed->real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
			delayed->imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
		}
	}

	return res;
}
275
+
276
+
277
/* Filter a whole buffer of real samples. Writes the filtered output into
 * `ofbuf` (allocated if NULL, resized if its length differs from the
 * input's) and returns it. If `delayed` is non-NULL it is filled with the
 * group-delay-compensated input (center tap; average of the two center
 * taps when nb_coefs is even).
 *
 * Structure: three nearly identical code paths — (delayed, odd nb_coefs),
 * (delayed, even nb_coefs), (no delayed) — each with the same three-phase
 * loop: a scalar prologue that advances start_index to an 8-aligned
 * position, an 8-samples-at-a-time main loop that computes eight dot
 * products per iteration (one per coefficient phase copy) and transposes
 * the eight accumulators into one output vector, and a scalar epilogue for
 * the remaining samples. */
pdlc_buffer_t* pdlc_fir_filter_filter_float_buffer(pdlc_fir_filter_t* fir, const pdlc_buffer_t *ifbuf, pdlc_buffer_t *ofbuf, pdlc_buffer_t *delayed)
{
	const unsigned int nb_coefs = fir->nb_coefs;
	const unsigned int flt_len = fir->state_len;
	const unsigned int mask = fir->index_mask;
	const unsigned int lensimd = fir->coef_len >> 3;   /* # of 8-float SIMD words per coef array */
	const unsigned int masksimd = mask >> 3;           /* wrap mask in SIMD words */
	const size_t ibuflen = ifbuf->length;
	/* Oldest tapped state sample for the first output. */
	unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask;
	unsigned int startsimd = start_index >> 3;
	unsigned int middle_index;
	unsigned int i, j;
	size_t k;
	register __m256 acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
#if !(defined __FMA__ || defined __FMA4__)
	register __m256 prod0, prod1, prod2, prod3, prod4, prod5, prod6, prod7;
#endif
	register __m256 statereal;
	const __m256 *coefs0, *coefs1, *coefs2, *coefs3, *coefs4, *coefs5, *coefs6, *coefs7;
	__m256 *stater = (__m256*)fir->stater;

	/* Make the output buffer match the input length. */
	if (!ofbuf)
		ofbuf = pdlc_buffer_new(ibuflen);
	else if (ofbuf->length != ibuflen)
		pdlc_buffer_resize(ofbuf, ibuflen, 0);

	if (delayed) {
		if (delayed->length != ibuflen)
			pdlc_buffer_resize(delayed, ibuflen, 0);
		/* Center of the tap window, advanced in lockstep with k. */
		middle_index = (start_index + nb_coefs / 2) & mask;
		if (nb_coefs & 1) {
			/* --- delayed output, odd tap count: center tap directly --- */
			k = 0;
			/* Scalar prologue until start_index is 8-aligned. */
			while ((start_index & 7) && k < ibuflen) {
				fir->stater[fir->index] = ifbuf->data[k];
				fir->index = (fir->index + 1) & mask;
				coefs0 = (__m256*)fir->coefs[start_index & 7];
				acc0 = _mm256_setzero_ps();
				j = startsimd;
				for (i = 0; i < lensimd; i++) {
#if defined __FMA__
					acc0 = _mm256_fmadd_ps(coefs0[i], stater[j], acc0);
#elif defined __FMA4__
					acc0 = _mm256_macc_ps(coefs0[i], stater[j], acc0);
#else
					prod0 = _mm256_mul_ps(coefs0[i], stater[j]);
					acc0 = _mm256_add_ps(acc0, prod0);
#endif
					j = (j+1) & masksimd;
				}
				ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3] + acc0[4] + acc0[5] + acc0[6] + acc0[7];
				start_index = (start_index + 1) & mask;
				startsimd = start_index >> 3;
				delayed->data[k] = fir->stater[middle_index];
				middle_index = (middle_index + 1) & mask;
				k++;
			}
			/* Main loop: 8 outputs per iteration, one per phase copy. */
			while (k + 8 <= ibuflen) {
				fir->stater[(fir->index + 0) & mask] = ifbuf->data[k + 0];
				fir->stater[(fir->index + 1) & mask] = ifbuf->data[k + 1];
				fir->stater[(fir->index + 2) & mask] = ifbuf->data[k + 2];
				fir->stater[(fir->index + 3) & mask] = ifbuf->data[k + 3];
				fir->stater[(fir->index + 4) & mask] = ifbuf->data[k + 4];
				fir->stater[(fir->index + 5) & mask] = ifbuf->data[k + 5];
				fir->stater[(fir->index + 6) & mask] = ifbuf->data[k + 6];
				fir->stater[(fir->index + 7) & mask] = ifbuf->data[k + 7];
				fir->index = (fir->index + 8) & mask;
				coefs0 = (__m256*)fir->coefs[(start_index + 0) & 7];
				coefs1 = (__m256*)fir->coefs[(start_index + 1) & 7];
				coefs2 = (__m256*)fir->coefs[(start_index + 2) & 7];
				coefs3 = (__m256*)fir->coefs[(start_index + 3) & 7];
				coefs4 = (__m256*)fir->coefs[(start_index + 4) & 7];
				coefs5 = (__m256*)fir->coefs[(start_index + 5) & 7];
				coefs6 = (__m256*)fir->coefs[(start_index + 6) & 7];
				coefs7 = (__m256*)fir->coefs[(start_index + 7) & 7];
				acc0 = _mm256_setzero_ps();
				acc1 = _mm256_setzero_ps();
				acc2 = _mm256_setzero_ps();
				acc3 = _mm256_setzero_ps();
				acc4 = _mm256_setzero_ps();
				acc5 = _mm256_setzero_ps();
				acc6 = _mm256_setzero_ps();
				acc7 = _mm256_setzero_ps();
				j = startsimd;
				/* One pass over the state feeds all 8 accumulators. */
				for (i = 0; i < lensimd; i++) {
					statereal = stater[j];
#if defined __FMA__
					acc0 = _mm256_fmadd_ps(coefs0[i], statereal, acc0);
					acc1 = _mm256_fmadd_ps(coefs1[i], statereal, acc1);
					acc2 = _mm256_fmadd_ps(coefs2[i], statereal, acc2);
					acc3 = _mm256_fmadd_ps(coefs3[i], statereal, acc3);
					acc4 = _mm256_fmadd_ps(coefs4[i], statereal, acc4);
					acc5 = _mm256_fmadd_ps(coefs5[i], statereal, acc5);
					acc6 = _mm256_fmadd_ps(coefs6[i], statereal, acc6);
					acc7 = _mm256_fmadd_ps(coefs7[i], statereal, acc7);
#elif defined __FMA4__
					acc0 = _mm256_macc_ps(coefs0[i], statereal, acc0);
					acc1 = _mm256_macc_ps(coefs1[i], statereal, acc1);
					acc2 = _mm256_macc_ps(coefs2[i], statereal, acc2);
					acc3 = _mm256_macc_ps(coefs3[i], statereal, acc3);
					acc4 = _mm256_macc_ps(coefs4[i], statereal, acc4);
					acc5 = _mm256_macc_ps(coefs5[i], statereal, acc5);
					acc6 = _mm256_macc_ps(coefs6[i], statereal, acc6);
					acc7 = _mm256_macc_ps(coefs7[i], statereal, acc7);
#else
					prod0 = _mm256_mul_ps(coefs0[i], statereal);
					acc0 = _mm256_add_ps(acc0, prod0);
					prod1 = _mm256_mul_ps(coefs1[i], statereal);
					acc1 = _mm256_add_ps(acc1, prod1);
					prod2 = _mm256_mul_ps(coefs2[i], statereal);
					acc2 = _mm256_add_ps(acc2, prod2);
					prod3 = _mm256_mul_ps(coefs3[i], statereal);
					acc3 = _mm256_add_ps(acc3, prod3);
					prod4 = _mm256_mul_ps(coefs4[i], statereal);
					acc4 = _mm256_add_ps(acc4, prod4);
					prod5 = _mm256_mul_ps(coefs5[i], statereal);
					acc5 = _mm256_add_ps(acc5, prod5);
					prod6 = _mm256_mul_ps(coefs6[i], statereal);
					acc6 = _mm256_add_ps(acc6, prod6);
					prod7 = _mm256_mul_ps(coefs7[i], statereal);
					acc7 = _mm256_add_ps(acc7, prod7);
#endif
					j = (j+1) & masksimd;
				}
				/* Permute + hadd tree: horizontally reduce the eight
				 * 8-lane accumulators into one vector of 8 results. */
				register __m256 h00 = _mm256_permute2f128_ps(acc0, acc4, 0x20);
				register __m256 h01 = _mm256_permute2f128_ps(acc0, acc4, 0x31);
				register __m256 h02 = _mm256_permute2f128_ps(acc1, acc5, 0x20);
				register __m256 h03 = _mm256_permute2f128_ps(acc1, acc5, 0x31);
				register __m256 h04 = _mm256_permute2f128_ps(acc2, acc6, 0x20);
				register __m256 h05 = _mm256_permute2f128_ps(acc2, acc6, 0x31);
				register __m256 h06 = _mm256_permute2f128_ps(acc3, acc7, 0x20);
				register __m256 h07 = _mm256_permute2f128_ps(acc3, acc7, 0x31);
				register __m256 h10 = _mm256_hadd_ps(h00, h01);
				register __m256 h11 = _mm256_hadd_ps(h02, h03);
				register __m256 h12 = _mm256_hadd_ps(h04, h05);
				register __m256 h13 = _mm256_hadd_ps(h06, h07);
				register __m256 h20 = _mm256_hadd_ps(h10, h11);
				register __m256 h21 = _mm256_hadd_ps(h12, h13);
				register __m256 h30 = _mm256_hadd_ps(h20, h21);
				_mm256_storeu_ps(ofbuf->data + k, h30);
				start_index = (start_index + 8) & mask;
				startsimd = start_index >> 3;
				/* Eight delayed outputs for this block. */
				delayed->data[k] = fir->stater[middle_index];
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = fir->stater[middle_index];
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = fir->stater[middle_index];
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = fir->stater[middle_index];
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = fir->stater[middle_index];
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = fir->stater[middle_index];
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = fir->stater[middle_index];
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = fir->stater[middle_index];
				middle_index = (middle_index + 1) & mask;
				k++;
			}
			/* Scalar epilogue for the remaining (< 8) samples. */
			for (; k < ibuflen; k++) {
				fir->stater[fir->index] = ifbuf->data[k];
				fir->index = (fir->index + 1) & mask;
				coefs0 = (__m256*)fir->coefs[start_index & 7];
				acc0 = _mm256_setzero_ps();
				j = startsimd;
				for (i = 0; i < lensimd; i++) {
#if defined __FMA__
					acc0 = _mm256_fmadd_ps(coefs0[i], stater[j], acc0);
#elif defined __FMA4__
					acc0 = _mm256_macc_ps(coefs0[i], stater[j], acc0);
#else
					prod0 = _mm256_mul_ps(coefs0[i], stater[j]);
					acc0 = _mm256_add_ps(acc0, prod0);
#endif
					j = (j+1) & masksimd;
				}
				ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3] + acc0[4] + acc0[5] + acc0[6] + acc0[7];
				start_index = (start_index + 1) & mask;
				startsimd = start_index >> 3;
				delayed->data[k] = fir->stater[middle_index];
				middle_index = (middle_index + 1) & mask;
			}
		}
		else {
			/* --- delayed output, even tap count: average the two
			 *     center taps --- */
			k = 0;
			/* Scalar prologue until start_index is 8-aligned. */
			while ((start_index & 7) && k < ibuflen) {
				fir->stater[fir->index] = ifbuf->data[k];
				fir->index = (fir->index + 1) & mask;
				coefs0 = (__m256*)fir->coefs[start_index & 7];
				acc0 = _mm256_setzero_ps();
				j = startsimd;
				for (i = 0; i < lensimd; i++) {
#if defined __FMA__
					acc0 = _mm256_fmadd_ps(coefs0[i], stater[j], acc0);
#elif defined __FMA4__
					acc0 = _mm256_macc_ps(coefs0[i], stater[j], acc0);
#else
					prod0 = _mm256_mul_ps(coefs0[i], stater[j]);
					acc0 = _mm256_add_ps(acc0, prod0);
#endif
					j = (j+1) & masksimd;
				}
				ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3] + acc0[4] + acc0[5] + acc0[6] + acc0[7];
				start_index = (start_index + 1) & mask;
				startsimd = start_index >> 3;
				delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
				middle_index = (middle_index + 1) & mask;
				k++;
			}
			/* Main loop: 8 outputs per iteration. */
			while (k + 8 <= ibuflen) {
				fir->stater[(fir->index + 0) & mask] = ifbuf->data[k + 0];
				fir->stater[(fir->index + 1) & mask] = ifbuf->data[k + 1];
				fir->stater[(fir->index + 2) & mask] = ifbuf->data[k + 2];
				fir->stater[(fir->index + 3) & mask] = ifbuf->data[k + 3];
				fir->stater[(fir->index + 4) & mask] = ifbuf->data[k + 4];
				fir->stater[(fir->index + 5) & mask] = ifbuf->data[k + 5];
				fir->stater[(fir->index + 6) & mask] = ifbuf->data[k + 6];
				fir->stater[(fir->index + 7) & mask] = ifbuf->data[k + 7];
				fir->index = (fir->index + 8) & mask;
				coefs0 = (__m256*)fir->coefs[(start_index + 0) & 7];
				coefs1 = (__m256*)fir->coefs[(start_index + 1) & 7];
				coefs2 = (__m256*)fir->coefs[(start_index + 2) & 7];
				coefs3 = (__m256*)fir->coefs[(start_index + 3) & 7];
				coefs4 = (__m256*)fir->coefs[(start_index + 4) & 7];
				coefs5 = (__m256*)fir->coefs[(start_index + 5) & 7];
				coefs6 = (__m256*)fir->coefs[(start_index + 6) & 7];
				coefs7 = (__m256*)fir->coefs[(start_index + 7) & 7];
				acc0 = _mm256_setzero_ps();
				acc1 = _mm256_setzero_ps();
				acc2 = _mm256_setzero_ps();
				acc3 = _mm256_setzero_ps();
				acc4 = _mm256_setzero_ps();
				acc5 = _mm256_setzero_ps();
				acc6 = _mm256_setzero_ps();
				acc7 = _mm256_setzero_ps();
				j = startsimd;
				for (i = 0; i < lensimd; i++) {
					statereal = stater[j];
#if defined __FMA__
					acc0 = _mm256_fmadd_ps(coefs0[i], statereal, acc0);
					acc1 = _mm256_fmadd_ps(coefs1[i], statereal, acc1);
					acc2 = _mm256_fmadd_ps(coefs2[i], statereal, acc2);
					acc3 = _mm256_fmadd_ps(coefs3[i], statereal, acc3);
					acc4 = _mm256_fmadd_ps(coefs4[i], statereal, acc4);
					acc5 = _mm256_fmadd_ps(coefs5[i], statereal, acc5);
					acc6 = _mm256_fmadd_ps(coefs6[i], statereal, acc6);
					acc7 = _mm256_fmadd_ps(coefs7[i], statereal, acc7);
#elif defined __FMA4__
					acc0 = _mm256_macc_ps(coefs0[i], statereal, acc0);
					acc1 = _mm256_macc_ps(coefs1[i], statereal, acc1);
					acc2 = _mm256_macc_ps(coefs2[i], statereal, acc2);
					acc3 = _mm256_macc_ps(coefs3[i], statereal, acc3);
					acc4 = _mm256_macc_ps(coefs4[i], statereal, acc4);
					acc5 = _mm256_macc_ps(coefs5[i], statereal, acc5);
					acc6 = _mm256_macc_ps(coefs6[i], statereal, acc6);
					acc7 = _mm256_macc_ps(coefs7[i], statereal, acc7);
#else
					prod0 = _mm256_mul_ps(coefs0[i], statereal);
					acc0 = _mm256_add_ps(acc0, prod0);
					prod1 = _mm256_mul_ps(coefs1[i], statereal);
					acc1 = _mm256_add_ps(acc1, prod1);
					prod2 = _mm256_mul_ps(coefs2[i], statereal);
					acc2 = _mm256_add_ps(acc2, prod2);
					prod3 = _mm256_mul_ps(coefs3[i], statereal);
					acc3 = _mm256_add_ps(acc3, prod3);
					prod4 = _mm256_mul_ps(coefs4[i], statereal);
					acc4 = _mm256_add_ps(acc4, prod4);
					prod5 = _mm256_mul_ps(coefs5[i], statereal);
					acc5 = _mm256_add_ps(acc5, prod5);
					prod6 = _mm256_mul_ps(coefs6[i], statereal);
					acc6 = _mm256_add_ps(acc6, prod6);
					prod7 = _mm256_mul_ps(coefs7[i], statereal);
					acc7 = _mm256_add_ps(acc7, prod7);
#endif
					j = (j+1) & masksimd;
				}
				/* Same permute + hadd reduction as in the odd branch. */
				register __m256 h00 = _mm256_permute2f128_ps(acc0, acc4, 0x20);
				register __m256 h01 = _mm256_permute2f128_ps(acc0, acc4, 0x31);
				register __m256 h02 = _mm256_permute2f128_ps(acc1, acc5, 0x20);
				register __m256 h03 = _mm256_permute2f128_ps(acc1, acc5, 0x31);
				register __m256 h04 = _mm256_permute2f128_ps(acc2, acc6, 0x20);
				register __m256 h05 = _mm256_permute2f128_ps(acc2, acc6, 0x31);
				register __m256 h06 = _mm256_permute2f128_ps(acc3, acc7, 0x20);
				register __m256 h07 = _mm256_permute2f128_ps(acc3, acc7, 0x31);
				register __m256 h10 = _mm256_hadd_ps(h00, h01);
				register __m256 h11 = _mm256_hadd_ps(h02, h03);
				register __m256 h12 = _mm256_hadd_ps(h04, h05);
				register __m256 h13 = _mm256_hadd_ps(h06, h07);
				register __m256 h20 = _mm256_hadd_ps(h10, h11);
				register __m256 h21 = _mm256_hadd_ps(h12, h13);
				register __m256 h30 = _mm256_hadd_ps(h20, h21);
				_mm256_storeu_ps(ofbuf->data + k, h30);
				start_index = (start_index + 8) & mask;
				startsimd = start_index >> 3;
				delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
				middle_index = (middle_index + 1) & mask;
				k++;
			}
			/* Scalar epilogue. */
			for (; k < ibuflen; k++) {
				fir->stater[fir->index] = ifbuf->data[k];
				fir->index = (fir->index + 1) & mask;
				coefs0 = (__m256*)fir->coefs[start_index & 7];
				acc0 = _mm256_setzero_ps();
				j = startsimd;
				for (i = 0; i < lensimd; i++) {
#if defined __FMA__
					acc0 = _mm256_fmadd_ps(coefs0[i], stater[j], acc0);
#elif defined __FMA4__
					acc0 = _mm256_macc_ps(coefs0[i], stater[j], acc0);
#else
					prod0 = _mm256_mul_ps(coefs0[i], stater[j]);
					acc0 = _mm256_add_ps(acc0, prod0);
#endif
					j = (j+1) & masksimd;
				}
				ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3] + acc0[4] + acc0[5] + acc0[6] + acc0[7];
				start_index = (start_index + 1) & mask;
				startsimd = start_index >> 3;
				delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
				middle_index = (middle_index + 1) & mask;
			}
		}
	}
	else {
		/* --- no delayed output requested --- */
		k = 0;
		/* Scalar prologue until start_index is 8-aligned. */
		while ((start_index & 7) && k < ibuflen) {
			fir->stater[fir->index] = ifbuf->data[k];
			fir->index = (fir->index + 1) & mask;
			coefs0 = (__m256*)fir->coefs[start_index & 7];
			acc0 = _mm256_setzero_ps();
			j = startsimd;
			for (i = 0; i < lensimd; i++) {
#if defined __FMA__
				acc0 = _mm256_fmadd_ps(coefs0[i], stater[j], acc0);
#elif defined __FMA4__
				acc0 = _mm256_macc_ps(coefs0[i], stater[j], acc0);
#else
				prod0 = _mm256_mul_ps(coefs0[i], stater[j]);
				acc0 = _mm256_add_ps(acc0, prod0);
#endif
				j = (j+1) & masksimd;
			}
			ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3] + acc0[4] + acc0[5] + acc0[6] + acc0[7];
			start_index = (start_index + 1) & mask;
			startsimd = start_index >> 3;
			k++;
		}
		/* Main loop: 8 outputs per iteration. */
		while (k + 8 <= ibuflen) {
			fir->stater[(fir->index + 0) & mask] = ifbuf->data[k + 0];
			fir->stater[(fir->index + 1) & mask] = ifbuf->data[k + 1];
			fir->stater[(fir->index + 2) & mask] = ifbuf->data[k + 2];
			fir->stater[(fir->index + 3) & mask] = ifbuf->data[k + 3];
			fir->stater[(fir->index + 4) & mask] = ifbuf->data[k + 4];
			fir->stater[(fir->index + 5) & mask] = ifbuf->data[k + 5];
			fir->stater[(fir->index + 6) & mask] = ifbuf->data[k + 6];
			fir->stater[(fir->index + 7) & mask] = ifbuf->data[k + 7];
			fir->index = (fir->index + 8) & mask;
			coefs0 = (__m256*)fir->coefs[(start_index + 0) & 7];
			coefs1 = (__m256*)fir->coefs[(start_index + 1) & 7];
			coefs2 = (__m256*)fir->coefs[(start_index + 2) & 7];
			coefs3 = (__m256*)fir->coefs[(start_index + 3) & 7];
			coefs4 = (__m256*)fir->coefs[(start_index + 4) & 7];
			coefs5 = (__m256*)fir->coefs[(start_index + 5) & 7];
			coefs6 = (__m256*)fir->coefs[(start_index + 6) & 7];
			coefs7 = (__m256*)fir->coefs[(start_index + 7) & 7];
			acc0 = _mm256_setzero_ps();
			acc1 = _mm256_setzero_ps();
			acc2 = _mm256_setzero_ps();
			acc3 = _mm256_setzero_ps();
			acc4 = _mm256_setzero_ps();
			acc5 = _mm256_setzero_ps();
			acc6 = _mm256_setzero_ps();
			acc7 = _mm256_setzero_ps();
			j = startsimd;
			for (i = 0; i < lensimd; i++) {
				statereal = stater[j];
#if defined __FMA__
				acc0 = _mm256_fmadd_ps(coefs0[i], statereal, acc0);
				acc1 = _mm256_fmadd_ps(coefs1[i], statereal, acc1);
				acc2 = _mm256_fmadd_ps(coefs2[i], statereal, acc2);
				acc3 = _mm256_fmadd_ps(coefs3[i], statereal, acc3);
				acc4 = _mm256_fmadd_ps(coefs4[i], statereal, acc4);
				acc5 = _mm256_fmadd_ps(coefs5[i], statereal, acc5);
				acc6 = _mm256_fmadd_ps(coefs6[i], statereal, acc6);
				acc7 = _mm256_fmadd_ps(coefs7[i], statereal, acc7);
#elif defined __FMA4__
				acc0 = _mm256_macc_ps(coefs0[i], statereal, acc0);
				acc1 = _mm256_macc_ps(coefs1[i], statereal, acc1);
				acc2 = _mm256_macc_ps(coefs2[i], statereal, acc2);
				acc3 = _mm256_macc_ps(coefs3[i], statereal, acc3);
				acc4 = _mm256_macc_ps(coefs4[i], statereal, acc4);
				acc5 = _mm256_macc_ps(coefs5[i], statereal, acc5);
				acc6 = _mm256_macc_ps(coefs6[i], statereal, acc6);
				acc7 = _mm256_macc_ps(coefs7[i], statereal, acc7);
#else
				prod0 = _mm256_mul_ps(coefs0[i], statereal);
				acc0 = _mm256_add_ps(acc0, prod0);
				prod1 = _mm256_mul_ps(coefs1[i], statereal);
				acc1 = _mm256_add_ps(acc1, prod1);
				prod2 = _mm256_mul_ps(coefs2[i], statereal);
				acc2 = _mm256_add_ps(acc2, prod2);
				prod3 = _mm256_mul_ps(coefs3[i], statereal);
				acc3 = _mm256_add_ps(acc3, prod3);
				prod4 = _mm256_mul_ps(coefs4[i], statereal);
				acc4 = _mm256_add_ps(acc4, prod4);
				prod5 = _mm256_mul_ps(coefs5[i], statereal);
				acc5 = _mm256_add_ps(acc5, prod5);
				prod6 = _mm256_mul_ps(coefs6[i], statereal);
				acc6 = _mm256_add_ps(acc6, prod6);
				prod7 = _mm256_mul_ps(coefs7[i], statereal);
				acc7 = _mm256_add_ps(acc7, prod7);
#endif
				j = (j+1) & masksimd;
			}
			/* Same permute + hadd reduction as above. */
			register __m256 h00 = _mm256_permute2f128_ps(acc0, acc4, 0x20);
			register __m256 h01 = _mm256_permute2f128_ps(acc0, acc4, 0x31);
			register __m256 h02 = _mm256_permute2f128_ps(acc1, acc5, 0x20);
			register __m256 h03 = _mm256_permute2f128_ps(acc1, acc5, 0x31);
			register __m256 h04 = _mm256_permute2f128_ps(acc2, acc6, 0x20);
			register __m256 h05 = _mm256_permute2f128_ps(acc2, acc6, 0x31);
			register __m256 h06 = _mm256_permute2f128_ps(acc3, acc7, 0x20);
			register __m256 h07 = _mm256_permute2f128_ps(acc3, acc7, 0x31);
			register __m256 h10 = _mm256_hadd_ps(h00, h01);
			register __m256 h11 = _mm256_hadd_ps(h02, h03);
			register __m256 h12 = _mm256_hadd_ps(h04, h05);
			register __m256 h13 = _mm256_hadd_ps(h06, h07);
			register __m256 h20 = _mm256_hadd_ps(h10, h11);
			register __m256 h21 = _mm256_hadd_ps(h12, h13);
			register __m256 h30 = _mm256_hadd_ps(h20, h21);
			_mm256_storeu_ps(ofbuf->data + k, h30);
			start_index = (start_index + 8) & mask;
			startsimd = start_index >> 3;
			k += 8;
		}
		/* Scalar epilogue. */
		for (; k < ibuflen; k++) {
			fir->stater[fir->index] = ifbuf->data[k];
			fir->index = (fir->index + 1) & mask;
			coefs0 = (__m256*)fir->coefs[start_index & 7];
			acc0 = _mm256_setzero_ps();
			j = startsimd;
			for (i = 0; i < lensimd; i++) {
#if defined __FMA__
				acc0 = _mm256_fmadd_ps(coefs0[i], stater[j], acc0);
#elif defined __FMA4__
				acc0 = _mm256_macc_ps(coefs0[i], stater[j], acc0);
#else
				prod0 = _mm256_mul_ps(coefs0[i], stater[j]);
				acc0 = _mm256_add_ps(acc0, prod0);
#endif
				j = (j+1) & masksimd;
			}
			ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3] + acc0[4] + acc0[5] + acc0[6] + acc0[7];
			start_index = (start_index + 1) & mask;
			startsimd = start_index >> 3;
		}
	}

	return ofbuf;
}
764
+
765
+
766
+ pdlc_complex_buffer_t* pdlc_fir_filter_filter_complex_buffer(pdlc_fir_filter_t* fir, const pdlc_complex_buffer_t *icbuf, pdlc_complex_buffer_t *ocbuf, pdlc_complex_buffer_t *delayed)
767
+ {
768
+ const unsigned int nb_coefs = fir->nb_coefs;
769
+ const unsigned int flt_len = fir->state_len;
770
+ const unsigned int mask = fir->index_mask;
771
+ const unsigned int lensimd = fir->coef_len >> 3;
772
+ const unsigned int masksimd = mask >> 3;
773
+ const size_t ibuflen = icbuf->length;
774
+ unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask;
775
+ unsigned int startsimd = start_index >> 3;
776
+ unsigned int middle_index;
777
+ unsigned int i, j;
778
+ size_t k;
779
+ register __m256 acc0r, acc1r, acc2r, acc3r, acc4r, acc5r, acc6r, acc7r;
780
+ register __m256 acc0i, acc1i, acc2i, acc3i, acc4i, acc5i, acc6i, acc7i;
781
+ #if !(defined __FMA__ || defined __FMA4__)
782
+ register __m256 prod0r, prod1r, prod2r, prod3r, prod4r, prod5r, prod6r, prod7r;
783
+ register __m256 prod0i, prod1i, prod2i, prod3i, prod4i, prod5i, prod6i, prod7i;
784
+ #endif
785
+ register __m256 statereal, stateimag;
786
+ const __m256 *coefs0, *coefs1, *coefs2, *coefs3, *coefs4, *coefs5, *coefs6, *coefs7;
787
+ __m256 *stater = (__m256*)fir->stater;
788
+ __m256 *statei = (__m256*)fir->statei;
789
+
790
+ if (!ocbuf)
791
+ ocbuf = pdlc_complex_buffer_new(ibuflen);
792
+ else if (ocbuf->length != ibuflen)
793
+ pdlc_complex_buffer_resize(ocbuf, ibuflen, 0);
794
+
795
+ if (delayed) {
796
+ if (delayed->length != ibuflen)
797
+ pdlc_complex_buffer_resize(delayed, ibuflen, 0);
798
+ middle_index = (start_index + nb_coefs / 2) & mask;
799
+ if (nb_coefs & 1) {
800
+ k = 0;
801
+ while ((start_index & 7) && k < ibuflen) {
802
+ fir->stater[fir->index] = icbuf->data[k].real;
803
+ fir->statei[fir->index] = icbuf->data[k].imag;
804
+ fir->index = (fir->index + 1) & mask;
805
+ coefs0 = (__m256*)fir->coefs[start_index & 7];
806
+ acc0r = _mm256_setzero_ps();
807
+ acc0i = _mm256_setzero_ps();
808
+ j = startsimd;
809
+ for (i = 0; i < lensimd; i++) {
810
+ #if defined __FMA__
811
+ acc0r = _mm256_fmadd_ps(coefs0[i], stater[j], acc0r);
812
+ acc0i = _mm256_fmadd_ps(coefs0[i], statei[j], acc0i);
813
+ #elif defined __FMA4__
814
+ acc0r = _mm256_macc_ps(coefs0[i], stater[j], acc0r);
815
+ acc0i = _mm256_macc_ps(coefs0[i], statei[j], acc0i);
816
+ #else
817
+ prod0r = _mm256_mul_ps(coefs0[i], stater[j]);
818
+ prod0i = _mm256_mul_ps(coefs0[i], statei[j]);
819
+ acc0r = _mm256_add_ps(acc0r, prod0r);
820
+ acc0i = _mm256_add_ps(acc0i, prod0i);
821
+ #endif
822
+ j = (j+1) & masksimd;
823
+ }
824
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3] + acc0r[4] + acc0r[5] + acc0r[6] + acc0r[7];
825
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3] + acc0i[4] + acc0i[5] + acc0i[6] + acc0i[7];
826
+ start_index = (start_index + 1) & mask;
827
+ startsimd = start_index >> 3;
828
+ delayed->data[k].real = fir->stater[middle_index];
829
+ delayed->data[k].imag = fir->statei[middle_index];
830
+ middle_index = (middle_index + 1) & mask;
831
+ k++;
832
+ }
833
+ while (k + 8 <= ibuflen) {
834
+ fir->stater[ fir->index ] = icbuf->data[k + 0].real;
835
+ fir->statei[ fir->index ] = icbuf->data[k + 0].imag;
836
+ fir->stater[(fir->index + 1) & mask] = icbuf->data[k + 1].real;
837
+ fir->statei[(fir->index + 1) & mask] = icbuf->data[k + 1].imag;
838
+ fir->stater[(fir->index + 2) & mask] = icbuf->data[k + 2].real;
839
+ fir->statei[(fir->index + 2) & mask] = icbuf->data[k + 2].imag;
840
+ fir->stater[(fir->index + 3) & mask] = icbuf->data[k + 3].real;
841
+ fir->statei[(fir->index + 3) & mask] = icbuf->data[k + 3].imag;
842
+ fir->stater[(fir->index + 4) & mask] = icbuf->data[k + 4].real;
843
+ fir->statei[(fir->index + 4) & mask] = icbuf->data[k + 4].imag;
844
+ fir->stater[(fir->index + 5) & mask] = icbuf->data[k + 5].real;
845
+ fir->statei[(fir->index + 5) & mask] = icbuf->data[k + 5].imag;
846
+ fir->stater[(fir->index + 6) & mask] = icbuf->data[k + 6].real;
847
+ fir->statei[(fir->index + 6) & mask] = icbuf->data[k + 6].imag;
848
+ fir->stater[(fir->index + 7) & mask] = icbuf->data[k + 7].real;
849
+ fir->statei[(fir->index + 7) & mask] = icbuf->data[k + 7].imag;
850
+ fir->index = (fir->index + 8) & mask;
851
+ coefs0 = (__m256*)fir->coefs[(start_index + 0) & 7];
852
+ coefs1 = (__m256*)fir->coefs[(start_index + 1) & 7];
853
+ coefs2 = (__m256*)fir->coefs[(start_index + 2) & 7];
854
+ coefs3 = (__m256*)fir->coefs[(start_index + 3) & 7];
855
+ coefs4 = (__m256*)fir->coefs[(start_index + 4) & 7];
856
+ coefs5 = (__m256*)fir->coefs[(start_index + 5) & 7];
857
+ coefs6 = (__m256*)fir->coefs[(start_index + 6) & 7];
858
+ coefs7 = (__m256*)fir->coefs[(start_index + 7) & 7];
859
+ acc0r = _mm256_setzero_ps();
860
+ acc0i = _mm256_setzero_ps();
861
+ acc1r = _mm256_setzero_ps();
862
+ acc1i = _mm256_setzero_ps();
863
+ acc2r = _mm256_setzero_ps();
864
+ acc2i = _mm256_setzero_ps();
865
+ acc3r = _mm256_setzero_ps();
866
+ acc3i = _mm256_setzero_ps();
867
+ acc4r = _mm256_setzero_ps();
868
+ acc4i = _mm256_setzero_ps();
869
+ acc5r = _mm256_setzero_ps();
870
+ acc5i = _mm256_setzero_ps();
871
+ acc6r = _mm256_setzero_ps();
872
+ acc6i = _mm256_setzero_ps();
873
+ acc7r = _mm256_setzero_ps();
874
+ acc7i = _mm256_setzero_ps();
875
+ j = startsimd;
876
+ for (i = 0; i < lensimd; i++) {
877
+ statereal = stater[j];
878
+ stateimag = statei[j];
879
+ #if defined __FMA__
880
+ acc0r = _mm256_fmadd_ps(coefs0[i], statereal, acc0r);
881
+ acc1r = _mm256_fmadd_ps(coefs1[i], statereal, acc1r);
882
+ acc2r = _mm256_fmadd_ps(coefs2[i], statereal, acc2r);
883
+ acc3r = _mm256_fmadd_ps(coefs3[i], statereal, acc3r);
884
+ acc4r = _mm256_fmadd_ps(coefs4[i], statereal, acc4r);
885
+ acc5r = _mm256_fmadd_ps(coefs5[i], statereal, acc5r);
886
+ acc6r = _mm256_fmadd_ps(coefs6[i], statereal, acc6r);
887
+ acc7r = _mm256_fmadd_ps(coefs7[i], statereal, acc7r);
888
+ acc0i = _mm256_fmadd_ps(coefs0[i], stateimag, acc0i);
889
+ acc1i = _mm256_fmadd_ps(coefs1[i], stateimag, acc1i);
890
+ acc2i = _mm256_fmadd_ps(coefs2[i], stateimag, acc2i);
891
+ acc3i = _mm256_fmadd_ps(coefs3[i], stateimag, acc3i);
892
+ acc4i = _mm256_fmadd_ps(coefs4[i], stateimag, acc4i);
893
+ acc5i = _mm256_fmadd_ps(coefs5[i], stateimag, acc5i);
894
+ acc6i = _mm256_fmadd_ps(coefs6[i], stateimag, acc6i);
895
+ acc7i = _mm256_fmadd_ps(coefs7[i], stateimag, acc7i);
896
+ #elif defined __FMA4__
897
+ acc0r = _mm256_macc_ps(coefs0[i], statereal, acc0r);
898
+ acc1r = _mm256_macc_ps(coefs1[i], statereal, acc1r);
899
+ acc2r = _mm256_macc_ps(coefs2[i], statereal, acc2r);
900
+ acc3r = _mm256_macc_ps(coefs3[i], statereal, acc3r);
901
+ acc4r = _mm256_macc_ps(coefs4[i], statereal, acc4r);
902
+ acc5r = _mm256_macc_ps(coefs5[i], statereal, acc5r);
903
+ acc6r = _mm256_macc_ps(coefs6[i], statereal, acc6r);
904
+ acc7r = _mm256_macc_ps(coefs7[i], statereal, acc7r);
905
+ acc0i = _mm256_macc_ps(coefs0[i], stateimag, acc0i);
906
+ acc1i = _mm256_macc_ps(coefs1[i], stateimag, acc1i);
907
+ acc2i = _mm256_macc_ps(coefs2[i], stateimag, acc2i);
908
+ acc3i = _mm256_macc_ps(coefs3[i], stateimag, acc3i);
909
+ acc4i = _mm256_macc_ps(coefs4[i], stateimag, acc4i);
910
+ acc5i = _mm256_macc_ps(coefs5[i], stateimag, acc5i);
911
+ acc6i = _mm256_macc_ps(coefs6[i], stateimag, acc6i);
912
+ acc7i = _mm256_macc_ps(coefs7[i], stateimag, acc7i);
913
+ #else
914
+ prod0r = _mm256_mul_ps(coefs0[i], statereal);
915
+ acc0r = _mm256_add_ps(acc0r, prod0r);
916
+ prod1r = _mm256_mul_ps(coefs1[i], statereal);
917
+ acc1r = _mm256_add_ps(acc1r, prod1r);
918
+ prod2r = _mm256_mul_ps(coefs2[i], statereal);
919
+ acc2r = _mm256_add_ps(acc2r, prod2r);
920
+ prod3r = _mm256_mul_ps(coefs3[i], statereal);
921
+ acc3r = _mm256_add_ps(acc3r, prod3r);
922
+ prod4r = _mm256_mul_ps(coefs4[i], statereal);
923
+ acc4r = _mm256_add_ps(acc4r, prod4r);
924
+ prod5r = _mm256_mul_ps(coefs5[i], statereal);
925
+ acc5r = _mm256_add_ps(acc5r, prod5r);
926
+ prod6r = _mm256_mul_ps(coefs6[i], statereal);
927
+ acc6r = _mm256_add_ps(acc6r, prod6r);
928
+ prod7r = _mm256_mul_ps(coefs7[i], statereal);
929
+ acc7r = _mm256_add_ps(acc7r, prod7r);
930
+ prod0i = _mm256_mul_ps(coefs0[i], stateimag);
931
+ acc0i = _mm256_add_ps(acc0i, prod0i);
932
+ prod1i = _mm256_mul_ps(coefs1[i], stateimag);
933
+ acc1i = _mm256_add_ps(acc1i, prod1i);
934
+ prod2i = _mm256_mul_ps(coefs2[i], stateimag);
935
+ acc2i = _mm256_add_ps(acc2i, prod2i);
936
+ prod3i = _mm256_mul_ps(coefs3[i], stateimag);
937
+ acc3i = _mm256_add_ps(acc3i, prod3i);
938
+ prod4i = _mm256_mul_ps(coefs4[i], stateimag);
939
+ acc4i = _mm256_add_ps(acc4i, prod4i);
940
+ prod5i = _mm256_mul_ps(coefs5[i], stateimag);
941
+ acc5i = _mm256_add_ps(acc5i, prod5i);
942
+ prod6i = _mm256_mul_ps(coefs6[i], stateimag);
943
+ acc6i = _mm256_add_ps(acc6i, prod6i);
944
+ prod7i = _mm256_mul_ps(coefs7[i], stateimag);
945
+ acc7i = _mm256_add_ps(acc7i, prod7i);
946
+ #endif
947
+ j = (j+1) & masksimd;
948
+ }
949
+ register __m256 h00r = _mm256_permute2f128_ps(acc0r, acc4r, 0x20);
950
+ register __m256 h01r = _mm256_permute2f128_ps(acc0r, acc4r, 0x31);
951
+ register __m256 h02r = _mm256_permute2f128_ps(acc1r, acc5r, 0x20);
952
+ register __m256 h03r = _mm256_permute2f128_ps(acc1r, acc5r, 0x31);
953
+ register __m256 h04r = _mm256_permute2f128_ps(acc2r, acc6r, 0x20);
954
+ register __m256 h05r = _mm256_permute2f128_ps(acc2r, acc6r, 0x31);
955
+ register __m256 h06r = _mm256_permute2f128_ps(acc3r, acc7r, 0x20);
956
+ register __m256 h07r = _mm256_permute2f128_ps(acc3r, acc7r, 0x31);
957
+ register __m256 h10r = _mm256_hadd_ps(h00r, h01r);
958
+ register __m256 h11r = _mm256_hadd_ps(h02r, h03r);
959
+ register __m256 h12r = _mm256_hadd_ps(h04r, h05r);
960
+ register __m256 h13r = _mm256_hadd_ps(h06r, h07r);
961
+ register __m256 h20r = _mm256_hadd_ps(h10r, h11r);
962
+ register __m256 h21r = _mm256_hadd_ps(h12r, h13r);
963
+ register __m256 h30r = _mm256_hadd_ps(h20r, h21r);
964
+ register __m256 h00i = _mm256_permute2f128_ps(acc0i, acc4i, 0x20);
965
+ register __m256 h01i = _mm256_permute2f128_ps(acc0i, acc4i, 0x31);
966
+ register __m256 h02i = _mm256_permute2f128_ps(acc1i, acc5i, 0x20);
967
+ register __m256 h03i = _mm256_permute2f128_ps(acc1i, acc5i, 0x31);
968
+ register __m256 h04i = _mm256_permute2f128_ps(acc2i, acc6i, 0x20);
969
+ register __m256 h05i = _mm256_permute2f128_ps(acc2i, acc6i, 0x31);
970
+ register __m256 h06i = _mm256_permute2f128_ps(acc3i, acc7i, 0x20);
971
+ register __m256 h07i = _mm256_permute2f128_ps(acc3i, acc7i, 0x31);
972
+ register __m256 h10i = _mm256_hadd_ps(h00i, h01i);
973
+ register __m256 h11i = _mm256_hadd_ps(h02i, h03i);
974
+ register __m256 h12i = _mm256_hadd_ps(h04i, h05i);
975
+ register __m256 h13i = _mm256_hadd_ps(h06i, h07i);
976
+ register __m256 h20i = _mm256_hadd_ps(h10i, h11i);
977
+ register __m256 h21i = _mm256_hadd_ps(h12i, h13i);
978
+ register __m256 h30i = _mm256_hadd_ps(h20i, h21i);
979
+ ocbuf->data[k+0].real = h30r[0];
980
+ ocbuf->data[k+0].imag = h30i[0];
981
+ ocbuf->data[k+1].real = h30r[1];
982
+ ocbuf->data[k+1].imag = h30i[1];
983
+ ocbuf->data[k+2].real = h30r[2];
984
+ ocbuf->data[k+2].imag = h30i[2];
985
+ ocbuf->data[k+3].real = h30r[3];
986
+ ocbuf->data[k+3].imag = h30i[3];
987
+ ocbuf->data[k+4].real = h30r[4];
988
+ ocbuf->data[k+4].imag = h30i[4];
989
+ ocbuf->data[k+5].real = h30r[5];
990
+ ocbuf->data[k+5].imag = h30i[5];
991
+ ocbuf->data[k+6].real = h30r[6];
992
+ ocbuf->data[k+6].imag = h30i[6];
993
+ ocbuf->data[k+7].real = h30r[7];
994
+ ocbuf->data[k+7].imag = h30i[7];
995
+ start_index = (start_index + 8) & mask;
996
+ startsimd = start_index >> 3;
997
+ delayed->data[k].real = fir->stater[middle_index];
998
+ delayed->data[k].imag = fir->statei[middle_index];
999
+ middle_index = (middle_index + 1) & mask;
1000
+ k++;
1001
+ delayed->data[k].real = fir->stater[middle_index];
1002
+ delayed->data[k].imag = fir->statei[middle_index];
1003
+ middle_index = (middle_index + 1) & mask;
1004
+ k++;
1005
+ delayed->data[k].real = fir->stater[middle_index];
1006
+ delayed->data[k].imag = fir->statei[middle_index];
1007
+ middle_index = (middle_index + 1) & mask;
1008
+ k++;
1009
+ delayed->data[k].real = fir->stater[middle_index];
1010
+ delayed->data[k].imag = fir->statei[middle_index];
1011
+ middle_index = (middle_index + 1) & mask;
1012
+ k++;
1013
+ delayed->data[k].real = fir->stater[middle_index];
1014
+ delayed->data[k].imag = fir->statei[middle_index];
1015
+ middle_index = (middle_index + 1) & mask;
1016
+ k++;
1017
+ delayed->data[k].real = fir->stater[middle_index];
1018
+ delayed->data[k].imag = fir->statei[middle_index];
1019
+ middle_index = (middle_index + 1) & mask;
1020
+ k++;
1021
+ delayed->data[k].real = fir->stater[middle_index];
1022
+ delayed->data[k].imag = fir->statei[middle_index];
1023
+ middle_index = (middle_index + 1) & mask;
1024
+ k++;
1025
+ delayed->data[k].real = fir->stater[middle_index];
1026
+ delayed->data[k].imag = fir->statei[middle_index];
1027
+ middle_index = (middle_index + 1) & mask;
1028
+ k++;
1029
+ }
1030
+ for (; k < ibuflen; k++) {
1031
+ fir->stater[fir->index] = icbuf->data[k].real;
1032
+ fir->statei[fir->index] = icbuf->data[k].imag;
1033
+ fir->index = (fir->index + 1) & mask;
1034
+ coefs0 = (__m256*)fir->coefs[start_index & 7];
1035
+ acc0r = _mm256_setzero_ps();
1036
+ acc0i = _mm256_setzero_ps();
1037
+ j = startsimd;
1038
+ for (i = 0; i < lensimd; i++) {
1039
+ #if defined __FMA__
1040
+ acc0r = _mm256_fmadd_ps(coefs0[i], stater[j], acc0r);
1041
+ acc0i = _mm256_fmadd_ps(coefs0[i], statei[j], acc0i);
1042
+ #elif defined __FMA4__
1043
+ acc0r = _mm256_macc_ps(coefs0[i], stater[j], acc0r);
1044
+ acc0i = _mm256_macc_ps(coefs0[i], statei[j], acc0i);
1045
+ #else
1046
+ prod0r = _mm256_mul_ps(coefs0[i], stater[j]);
1047
+ prod0i = _mm256_mul_ps(coefs0[i], statei[j]);
1048
+ acc0r = _mm256_add_ps(acc0r, prod0r);
1049
+ acc0i = _mm256_add_ps(acc0i, prod0i);
1050
+ #endif
1051
+ j = (j+1) & masksimd;
1052
+ }
1053
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3] + acc0r[4] + acc0r[5] + acc0r[6] + acc0r[7];
1054
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3] + acc0i[4] + acc0i[5] + acc0i[6] + acc0i[7];
1055
+ start_index = (start_index + 1) & mask;
1056
+ startsimd = start_index >> 3;
1057
+ delayed->data[k].real = fir->stater[middle_index];
1058
+ delayed->data[k].imag = fir->statei[middle_index];
1059
+ middle_index = (middle_index + 1) & mask;
1060
+ }
1061
+ }
1062
+ else {
1063
+ k = 0;
1064
+ while ((start_index & 7) && k < ibuflen) {
1065
+ fir->stater[fir->index] = icbuf->data[k].real;
1066
+ fir->statei[fir->index] = icbuf->data[k].imag;
1067
+ fir->index = (fir->index + 1) & mask;
1068
+ coefs0 = (__m256*)fir->coefs[start_index & 7];
1069
+ acc0r = _mm256_setzero_ps();
1070
+ acc0i = _mm256_setzero_ps();
1071
+ j = startsimd;
1072
+ for (i = 0; i < lensimd; i++) {
1073
+ #if defined __FMA__
1074
+ acc0r = _mm256_fmadd_ps(coefs0[i], stater[j], acc0r);
1075
+ acc0i = _mm256_fmadd_ps(coefs0[i], statei[j], acc0i);
1076
+ #elif defined __FMA4__
1077
+ acc0r = _mm256_macc_ps(coefs0[i], stater[j], acc0r);
1078
+ acc0i = _mm256_macc_ps(coefs0[i], statei[j], acc0i);
1079
+ #else
1080
+ prod0r = _mm256_mul_ps(coefs0[i], stater[j]);
1081
+ prod0i = _mm256_mul_ps(coefs0[i], statei[j]);
1082
+ acc0r = _mm256_add_ps(acc0r, prod0r);
1083
+ acc0i = _mm256_add_ps(acc0i, prod0i);
1084
+ #endif
1085
+ j = (j+1) & masksimd;
1086
+ }
1087
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3] + acc0r[4] + acc0r[5] + acc0r[6] + acc0r[7];
1088
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3] + acc0i[4] + acc0i[5] + acc0i[6] + acc0i[7];
1089
+ start_index = (start_index + 1) & mask;
1090
+ startsimd = start_index >> 3;
1091
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1092
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
1093
+ middle_index = (middle_index + 1) & mask;
1094
+ k++;
1095
+ }
1096
+ while (k + 4 <= ibuflen) {
1097
+ fir->stater[ fir->index ] = icbuf->data[k + 0].real;
1098
+ fir->statei[ fir->index ] = icbuf->data[k + 0].imag;
1099
+ fir->stater[(fir->index + 1) & mask] = icbuf->data[k + 1].real;
1100
+ fir->statei[(fir->index + 1) & mask] = icbuf->data[k + 1].imag;
1101
+ fir->stater[(fir->index + 2) & mask] = icbuf->data[k + 2].real;
1102
+ fir->statei[(fir->index + 2) & mask] = icbuf->data[k + 2].imag;
1103
+ fir->stater[(fir->index + 3) & mask] = icbuf->data[k + 3].real;
1104
+ fir->statei[(fir->index + 3) & mask] = icbuf->data[k + 3].imag;
1105
+ fir->stater[(fir->index + 4) & mask] = icbuf->data[k + 4].real;
1106
+ fir->statei[(fir->index + 4) & mask] = icbuf->data[k + 4].imag;
1107
+ fir->stater[(fir->index + 5) & mask] = icbuf->data[k + 5].real;
1108
+ fir->statei[(fir->index + 5) & mask] = icbuf->data[k + 5].imag;
1109
+ fir->stater[(fir->index + 6) & mask] = icbuf->data[k + 6].real;
1110
+ fir->statei[(fir->index + 6) & mask] = icbuf->data[k + 6].imag;
1111
+ fir->stater[(fir->index + 7) & mask] = icbuf->data[k + 7].real;
1112
+ fir->statei[(fir->index + 7) & mask] = icbuf->data[k + 7].imag;
1113
+ fir->index = (fir->index + 8) & mask;
1114
+ coefs0 = (__m256*)fir->coefs[(start_index + 0) & 7];
1115
+ coefs1 = (__m256*)fir->coefs[(start_index + 1) & 7];
1116
+ coefs2 = (__m256*)fir->coefs[(start_index + 2) & 7];
1117
+ coefs3 = (__m256*)fir->coefs[(start_index + 3) & 7];
1118
+ coefs4 = (__m256*)fir->coefs[(start_index + 4) & 7];
1119
+ coefs5 = (__m256*)fir->coefs[(start_index + 5) & 7];
1120
+ coefs6 = (__m256*)fir->coefs[(start_index + 6) & 7];
1121
+ coefs7 = (__m256*)fir->coefs[(start_index + 7) & 7];
1122
+ acc0r = _mm256_setzero_ps();
1123
+ acc0i = _mm256_setzero_ps();
1124
+ acc1r = _mm256_setzero_ps();
1125
+ acc1i = _mm256_setzero_ps();
1126
+ acc2r = _mm256_setzero_ps();
1127
+ acc2i = _mm256_setzero_ps();
1128
+ acc3r = _mm256_setzero_ps();
1129
+ acc3i = _mm256_setzero_ps();
1130
+ acc4r = _mm256_setzero_ps();
1131
+ acc4i = _mm256_setzero_ps();
1132
+ acc5r = _mm256_setzero_ps();
1133
+ acc5i = _mm256_setzero_ps();
1134
+ acc6r = _mm256_setzero_ps();
1135
+ acc6i = _mm256_setzero_ps();
1136
+ acc7r = _mm256_setzero_ps();
1137
+ acc7i = _mm256_setzero_ps();
1138
+ j = startsimd;
1139
+ for (i = 0; i < lensimd; i++) {
1140
+ statereal = stater[j];
1141
+ stateimag = statei[j];
1142
+ #if defined __FMA__
1143
+ acc0r = _mm256_fmadd_ps(coefs0[i], statereal, acc0r);
1144
+ acc1r = _mm256_fmadd_ps(coefs1[i], statereal, acc1r);
1145
+ acc2r = _mm256_fmadd_ps(coefs2[i], statereal, acc2r);
1146
+ acc3r = _mm256_fmadd_ps(coefs3[i], statereal, acc3r);
1147
+ acc4r = _mm256_fmadd_ps(coefs4[i], statereal, acc4r);
1148
+ acc5r = _mm256_fmadd_ps(coefs5[i], statereal, acc5r);
1149
+ acc6r = _mm256_fmadd_ps(coefs6[i], statereal, acc6r);
1150
+ acc7r = _mm256_fmadd_ps(coefs7[i], statereal, acc7r);
1151
+ acc0i = _mm256_fmadd_ps(coefs0[i], stateimag, acc0i);
1152
+ acc1i = _mm256_fmadd_ps(coefs1[i], stateimag, acc1i);
1153
+ acc2i = _mm256_fmadd_ps(coefs2[i], stateimag, acc2i);
1154
+ acc3i = _mm256_fmadd_ps(coefs3[i], stateimag, acc3i);
1155
+ acc4i = _mm256_fmadd_ps(coefs4[i], stateimag, acc4i);
1156
+ acc5i = _mm256_fmadd_ps(coefs5[i], stateimag, acc5i);
1157
+ acc6i = _mm256_fmadd_ps(coefs6[i], stateimag, acc6i);
1158
+ acc7i = _mm256_fmadd_ps(coefs7[i], stateimag, acc7i);
1159
+ #elif defined __FMA4__
1160
+ acc0r = _mm256_macc_ps(coefs0[i], statereal, acc0r);
1161
+ acc1r = _mm256_macc_ps(coefs1[i], statereal, acc1r);
1162
+ acc2r = _mm256_macc_ps(coefs2[i], statereal, acc2r);
1163
+ acc3r = _mm256_macc_ps(coefs3[i], statereal, acc3r);
1164
+ acc4r = _mm256_macc_ps(coefs4[i], statereal, acc4r);
1165
+ acc5r = _mm256_macc_ps(coefs5[i], statereal, acc5r);
1166
+ acc6r = _mm256_macc_ps(coefs6[i], statereal, acc6r);
1167
+ acc7r = _mm256_macc_ps(coefs7[i], statereal, acc7r);
1168
+ acc0i = _mm256_macc_ps(coefs0[i], stateimag, acc0i);
1169
+ acc1i = _mm256_macc_ps(coefs1[i], stateimag, acc1i);
1170
+ acc2i = _mm256_macc_ps(coefs2[i], stateimag, acc2i);
1171
+ acc3i = _mm256_macc_ps(coefs3[i], stateimag, acc3i);
1172
+ acc4i = _mm256_macc_ps(coefs4[i], stateimag, acc4i);
1173
+ acc5i = _mm256_macc_ps(coefs5[i], stateimag, acc5i);
1174
+ acc6i = _mm256_macc_ps(coefs6[i], stateimag, acc6i);
1175
+ acc7i = _mm256_macc_ps(coefs7[i], stateimag, acc7i);
1176
+ #else
1177
+ prod0r = _mm256_mul_ps(coefs0[i], statereal);
1178
+ acc0r = _mm256_add_ps(acc0r, prod0r);
1179
+ prod1r = _mm256_mul_ps(coefs1[i], statereal);
1180
+ acc1r = _mm256_add_ps(acc1r, prod1r);
1181
+ prod2r = _mm256_mul_ps(coefs2[i], statereal);
1182
+ acc2r = _mm256_add_ps(acc2r, prod2r);
1183
+ prod3r = _mm256_mul_ps(coefs3[i], statereal);
1184
+ acc3r = _mm256_add_ps(acc3r, prod3r);
1185
+ prod4r = _mm256_mul_ps(coefs4[i], statereal);
1186
+ acc4r = _mm256_add_ps(acc4r, prod4r);
1187
+ prod5r = _mm256_mul_ps(coefs5[i], statereal);
1188
+ acc5r = _mm256_add_ps(acc5r, prod5r);
1189
+ prod6r = _mm256_mul_ps(coefs6[i], statereal);
1190
+ acc6r = _mm256_add_ps(acc6r, prod6r);
1191
+ prod7r = _mm256_mul_ps(coefs7[i], statereal);
1192
+ acc7r = _mm256_add_ps(acc7r, prod7r);
1193
+ prod0i = _mm256_mul_ps(coefs0[i], stateimag);
1194
+ acc0i = _mm256_add_ps(acc0i, prod0i);
1195
+ prod1i = _mm256_mul_ps(coefs1[i], stateimag);
1196
+ acc1i = _mm256_add_ps(acc1i, prod1i);
1197
+ prod2i = _mm256_mul_ps(coefs2[i], stateimag);
1198
+ acc2i = _mm256_add_ps(acc2i, prod2i);
1199
+ prod3i = _mm256_mul_ps(coefs3[i], stateimag);
1200
+ acc3i = _mm256_add_ps(acc3i, prod3i);
1201
+ prod4i = _mm256_mul_ps(coefs4[i], stateimag);
1202
+ acc4i = _mm256_add_ps(acc4i, prod4i);
1203
+ prod5i = _mm256_mul_ps(coefs5[i], stateimag);
1204
+ acc5i = _mm256_add_ps(acc5i, prod5i);
1205
+ prod6i = _mm256_mul_ps(coefs6[i], stateimag);
1206
+ acc6i = _mm256_add_ps(acc6i, prod6i);
1207
+ prod7i = _mm256_mul_ps(coefs7[i], stateimag);
1208
+ acc7i = _mm256_add_ps(acc7i, prod7i);
1209
+ #endif
1210
+ j = (j+1) & masksimd;
1211
+ }
1212
+ register __m256 h00r = _mm256_permute2f128_ps(acc0r, acc4r, 0x20);
1213
+ register __m256 h01r = _mm256_permute2f128_ps(acc0r, acc4r, 0x31);
1214
+ register __m256 h02r = _mm256_permute2f128_ps(acc1r, acc5r, 0x20);
1215
+ register __m256 h03r = _mm256_permute2f128_ps(acc1r, acc5r, 0x31);
1216
+ register __m256 h04r = _mm256_permute2f128_ps(acc2r, acc6r, 0x20);
1217
+ register __m256 h05r = _mm256_permute2f128_ps(acc2r, acc6r, 0x31);
1218
+ register __m256 h06r = _mm256_permute2f128_ps(acc3r, acc7r, 0x20);
1219
+ register __m256 h07r = _mm256_permute2f128_ps(acc3r, acc7r, 0x31);
1220
+ register __m256 h10r = _mm256_hadd_ps(h00r, h01r);
1221
+ register __m256 h11r = _mm256_hadd_ps(h02r, h03r);
1222
+ register __m256 h12r = _mm256_hadd_ps(h04r, h05r);
1223
+ register __m256 h13r = _mm256_hadd_ps(h06r, h07r);
1224
+ register __m256 h20r = _mm256_hadd_ps(h10r, h11r);
1225
+ register __m256 h21r = _mm256_hadd_ps(h12r, h13r);
1226
+ register __m256 h30r = _mm256_hadd_ps(h20r, h21r);
1227
+ register __m256 h00i = _mm256_permute2f128_ps(acc0i, acc4i, 0x20);
1228
+ register __m256 h01i = _mm256_permute2f128_ps(acc0i, acc4i, 0x31);
1229
+ register __m256 h02i = _mm256_permute2f128_ps(acc1i, acc5i, 0x20);
1230
+ register __m256 h03i = _mm256_permute2f128_ps(acc1i, acc5i, 0x31);
1231
+ register __m256 h04i = _mm256_permute2f128_ps(acc2i, acc6i, 0x20);
1232
+ register __m256 h05i = _mm256_permute2f128_ps(acc2i, acc6i, 0x31);
1233
+ register __m256 h06i = _mm256_permute2f128_ps(acc3i, acc7i, 0x20);
1234
+ register __m256 h07i = _mm256_permute2f128_ps(acc3i, acc7i, 0x31);
1235
+ register __m256 h10i = _mm256_hadd_ps(h00i, h01i);
1236
+ register __m256 h11i = _mm256_hadd_ps(h02i, h03i);
1237
+ register __m256 h12i = _mm256_hadd_ps(h04i, h05i);
1238
+ register __m256 h13i = _mm256_hadd_ps(h06i, h07i);
1239
+ register __m256 h20i = _mm256_hadd_ps(h10i, h11i);
1240
+ register __m256 h21i = _mm256_hadd_ps(h12i, h13i);
1241
+ register __m256 h30i = _mm256_hadd_ps(h20i, h21i);
1242
+ ocbuf->data[k+0].real = h30r[0];
1243
+ ocbuf->data[k+0].imag = h30i[0];
1244
+ ocbuf->data[k+1].real = h30r[1];
1245
+ ocbuf->data[k+1].imag = h30i[1];
1246
+ ocbuf->data[k+2].real = h30r[2];
1247
+ ocbuf->data[k+2].imag = h30i[2];
1248
+ ocbuf->data[k+3].real = h30r[3];
1249
+ ocbuf->data[k+3].imag = h30i[3];
1250
+ ocbuf->data[k+4].real = h30r[4];
1251
+ ocbuf->data[k+4].imag = h30i[4];
1252
+ ocbuf->data[k+5].real = h30r[5];
1253
+ ocbuf->data[k+5].imag = h30i[5];
1254
+ ocbuf->data[k+6].real = h30r[6];
1255
+ ocbuf->data[k+6].imag = h30i[6];
1256
+ ocbuf->data[k+7].real = h30r[7];
1257
+ ocbuf->data[k+7].imag = h30i[7];
1258
+ start_index = (start_index + 8) & mask;
1259
+ startsimd = start_index >> 3;
1260
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1261
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
1262
+ middle_index = (middle_index + 1) & mask;
1263
+ k++;
1264
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1265
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
1266
+ middle_index = (middle_index + 1) & mask;
1267
+ k++;
1268
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1269
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
1270
+ middle_index = (middle_index + 1) & mask;
1271
+ k++;
1272
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1273
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
1274
+ middle_index = (middle_index + 1) & mask;
1275
+ k++;
1276
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1277
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
1278
+ middle_index = (middle_index + 1) & mask;
1279
+ k++;
1280
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1281
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
1282
+ middle_index = (middle_index + 1) & mask;
1283
+ k++;
1284
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1285
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
1286
+ middle_index = (middle_index + 1) & mask;
1287
+ k++;
1288
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1289
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
1290
+ middle_index = (middle_index + 1) & mask;
1291
+ k++;
1292
+ }
1293
+ for (; k < ibuflen; k++) {
1294
+ fir->stater[fir->index] = icbuf->data[k].real;
1295
+ fir->statei[fir->index] = icbuf->data[k].imag;
1296
+ fir->index = (fir->index + 1) & mask;
1297
+ coefs0 = (__m256*)fir->coefs[start_index & 7];
1298
+ acc0r = _mm256_setzero_ps();
1299
+ acc0i = _mm256_setzero_ps();
1300
+ j = startsimd;
1301
+ for (i = 0; i < lensimd; i++) {
1302
+ #if defined __FMA__
1303
+ acc0r = _mm256_fmadd_ps(coefs0[i], stater[j], acc0r);
1304
+ acc0i = _mm256_fmadd_ps(coefs0[i], statei[j], acc0i);
1305
+ #elif defined __FMA4__
1306
+ acc0r = _mm256_macc_ps(coefs0[i], stater[j], acc0r);
1307
+ acc0i = _mm256_macc_ps(coefs0[i], statei[j], acc0i);
1308
+ #else
1309
+ prod0r = _mm256_mul_ps(coefs0[i], stater[j]);
1310
+ prod0i = _mm256_mul_ps(coefs0[i], statei[j]);
1311
+ acc0r = _mm256_add_ps(acc0r, prod0r);
1312
+ acc0i = _mm256_add_ps(acc0i, prod0i);
1313
+ #endif
1314
+ j = (j+1) & masksimd;
1315
+ }
1316
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3] + acc0r[4] + acc0r[5] + acc0r[6] + acc0r[7];
1317
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3] + acc0i[4] + acc0i[5] + acc0i[6] + acc0i[7];
1318
+ start_index = (start_index + 1) & mask;
1319
+ startsimd = start_index >> 3;
1320
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1321
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
1322
+ middle_index = (middle_index + 1) & mask;
1323
+ }
1324
+ }
1325
+ }
1326
+ else {
1327
+ k = 0;
1328
+ while ((start_index & 7) && k < ibuflen) {
1329
+ fir->stater[fir->index] = icbuf->data[k].real;
1330
+ fir->statei[fir->index] = icbuf->data[k].imag;
1331
+ fir->index = (fir->index + 1) & mask;
1332
+ coefs0 = (__m256*)fir->coefs[start_index & 7];
1333
+ acc0r = _mm256_setzero_ps();
1334
+ acc0i = _mm256_setzero_ps();
1335
+ j = startsimd;
1336
+ for (i = 0; i < lensimd; i++) {
1337
+ #if defined __FMA__
1338
+ acc0r = _mm256_fmadd_ps(coefs0[i], stater[j], acc0r);
1339
+ acc0i = _mm256_fmadd_ps(coefs0[i], statei[j], acc0i);
1340
+ #elif defined __FMA4__
1341
+ acc0r = _mm256_macc_ps(coefs0[i], stater[j], acc0r);
1342
+ acc0i = _mm256_macc_ps(coefs0[i], statei[j], acc0i);
1343
+ #else
1344
+ prod0r = _mm256_mul_ps(coefs0[i], stater[j]);
1345
+ prod0i = _mm256_mul_ps(coefs0[i], statei[j]);
1346
+ acc0r = _mm256_add_ps(acc0r, prod0r);
1347
+ acc0i = _mm256_add_ps(acc0i, prod0i);
1348
+ #endif
1349
+ j = (j+1) & masksimd;
1350
+ }
1351
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3] + acc0r[4] + acc0r[5] + acc0r[6] + acc0r[7];
1352
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3] + acc0i[4] + acc0i[5] + acc0i[6] + acc0i[7];
1353
+ start_index = (start_index + 1) & mask;
1354
+ startsimd = start_index >> 3;
1355
+ k++;
1356
+ }
1357
+ while (k + 8 <= ibuflen) {
1358
+ fir->stater[ fir->index ] = icbuf->data[k + 0].real;
1359
+ fir->statei[ fir->index ] = icbuf->data[k + 0].imag;
1360
+ fir->stater[(fir->index + 1) & mask] = icbuf->data[k + 1].real;
1361
+ fir->statei[(fir->index + 1) & mask] = icbuf->data[k + 1].imag;
1362
+ fir->stater[(fir->index + 2) & mask] = icbuf->data[k + 2].real;
1363
+ fir->statei[(fir->index + 2) & mask] = icbuf->data[k + 2].imag;
1364
+ fir->stater[(fir->index + 3) & mask] = icbuf->data[k + 3].real;
1365
+ fir->statei[(fir->index + 3) & mask] = icbuf->data[k + 3].imag;
1366
+ fir->stater[(fir->index + 4) & mask] = icbuf->data[k + 4].real;
1367
+ fir->statei[(fir->index + 4) & mask] = icbuf->data[k + 4].imag;
1368
+ fir->stater[(fir->index + 5) & mask] = icbuf->data[k + 5].real;
1369
+ fir->statei[(fir->index + 5) & mask] = icbuf->data[k + 5].imag;
1370
+ fir->stater[(fir->index + 6) & mask] = icbuf->data[k + 6].real;
1371
+ fir->statei[(fir->index + 6) & mask] = icbuf->data[k + 6].imag;
1372
+ fir->stater[(fir->index + 7) & mask] = icbuf->data[k + 7].real;
1373
+ fir->statei[(fir->index + 7) & mask] = icbuf->data[k + 7].imag;
1374
+ fir->index = (fir->index + 8) & mask;
1375
+ coefs0 = (__m256*)fir->coefs[(start_index + 0) & 7];
1376
+ coefs1 = (__m256*)fir->coefs[(start_index + 1) & 7];
1377
+ coefs2 = (__m256*)fir->coefs[(start_index + 2) & 7];
1378
+ coefs3 = (__m256*)fir->coefs[(start_index + 3) & 7];
1379
+ coefs4 = (__m256*)fir->coefs[(start_index + 4) & 7];
1380
+ coefs5 = (__m256*)fir->coefs[(start_index + 5) & 7];
1381
+ coefs6 = (__m256*)fir->coefs[(start_index + 6) & 7];
1382
+ coefs7 = (__m256*)fir->coefs[(start_index + 7) & 7];
1383
+ acc0r = _mm256_setzero_ps();
1384
+ acc0i = _mm256_setzero_ps();
1385
+ acc1r = _mm256_setzero_ps();
1386
+ acc1i = _mm256_setzero_ps();
1387
+ acc2r = _mm256_setzero_ps();
1388
+ acc2i = _mm256_setzero_ps();
1389
+ acc3r = _mm256_setzero_ps();
1390
+ acc3i = _mm256_setzero_ps();
1391
+ acc4r = _mm256_setzero_ps();
1392
+ acc4i = _mm256_setzero_ps();
1393
+ acc5r = _mm256_setzero_ps();
1394
+ acc5i = _mm256_setzero_ps();
1395
+ acc6r = _mm256_setzero_ps();
1396
+ acc6i = _mm256_setzero_ps();
1397
+ acc7r = _mm256_setzero_ps();
1398
+ acc7i = _mm256_setzero_ps();
1399
+ j = startsimd;
1400
+ for (i = 0; i < lensimd; i++) {
1401
+ statereal = stater[j];
1402
+ stateimag = statei[j];
1403
+ #if defined __FMA__
1404
+ acc0r = _mm256_fmadd_ps(coefs0[i], statereal, acc0r);
1405
+ acc1r = _mm256_fmadd_ps(coefs1[i], statereal, acc1r);
1406
+ acc2r = _mm256_fmadd_ps(coefs2[i], statereal, acc2r);
1407
+ acc3r = _mm256_fmadd_ps(coefs3[i], statereal, acc3r);
1408
+ acc4r = _mm256_fmadd_ps(coefs4[i], statereal, acc4r);
1409
+ acc5r = _mm256_fmadd_ps(coefs5[i], statereal, acc5r);
1410
+ acc6r = _mm256_fmadd_ps(coefs6[i], statereal, acc6r);
1411
+ acc7r = _mm256_fmadd_ps(coefs7[i], statereal, acc7r);
1412
+ acc0i = _mm256_fmadd_ps(coefs0[i], stateimag, acc0i);
1413
+ acc1i = _mm256_fmadd_ps(coefs1[i], stateimag, acc1i);
1414
+ acc2i = _mm256_fmadd_ps(coefs2[i], stateimag, acc2i);
1415
+ acc3i = _mm256_fmadd_ps(coefs3[i], stateimag, acc3i);
1416
+ acc4i = _mm256_fmadd_ps(coefs4[i], stateimag, acc4i);
1417
+ acc5i = _mm256_fmadd_ps(coefs5[i], stateimag, acc5i);
1418
+ acc6i = _mm256_fmadd_ps(coefs6[i], stateimag, acc6i);
1419
+ acc7i = _mm256_fmadd_ps(coefs7[i], stateimag, acc7i);
1420
+ #elif defined __FMA4__
1421
+ acc0r = _mm256_macc_ps(coefs0[i], statereal, acc0r);
1422
+ acc1r = _mm256_macc_ps(coefs1[i], statereal, acc1r);
1423
+ acc2r = _mm256_macc_ps(coefs2[i], statereal, acc2r);
1424
+ acc3r = _mm256_macc_ps(coefs3[i], statereal, acc3r);
1425
+ acc4r = _mm256_macc_ps(coefs4[i], statereal, acc4r);
1426
+ acc5r = _mm256_macc_ps(coefs5[i], statereal, acc5r);
1427
+ acc6r = _mm256_macc_ps(coefs6[i], statereal, acc6r);
1428
+ acc7r = _mm256_macc_ps(coefs7[i], statereal, acc7r);
1429
+ acc0i = _mm256_macc_ps(coefs0[i], stateimag, acc0i);
1430
+ acc1i = _mm256_macc_ps(coefs1[i], stateimag, acc1i);
1431
+ acc2i = _mm256_macc_ps(coefs2[i], stateimag, acc2i);
1432
+ acc3i = _mm256_macc_ps(coefs3[i], stateimag, acc3i);
1433
+ acc4i = _mm256_macc_ps(coefs4[i], stateimag, acc4i);
1434
+ acc5i = _mm256_macc_ps(coefs5[i], stateimag, acc5i);
1435
+ acc6i = _mm256_macc_ps(coefs6[i], stateimag, acc6i);
1436
+ acc7i = _mm256_macc_ps(coefs7[i], stateimag, acc7i);
1437
+ #else
1438
+ prod0r = _mm256_mul_ps(coefs0[i], statereal);
1439
+ acc0r = _mm256_add_ps(acc0r, prod0r);
1440
+ prod1r = _mm256_mul_ps(coefs1[i], statereal);
1441
+ acc1r = _mm256_add_ps(acc1r, prod1r);
1442
+ prod2r = _mm256_mul_ps(coefs2[i], statereal);
1443
+ acc2r = _mm256_add_ps(acc2r, prod2r);
1444
+ prod3r = _mm256_mul_ps(coefs3[i], statereal);
1445
+ acc3r = _mm256_add_ps(acc3r, prod3r);
1446
+ prod4r = _mm256_mul_ps(coefs4[i], statereal);
1447
+ acc4r = _mm256_add_ps(acc4r, prod4r);
1448
+ prod5r = _mm256_mul_ps(coefs5[i], statereal);
1449
+ acc5r = _mm256_add_ps(acc5r, prod5r);
1450
+ prod6r = _mm256_mul_ps(coefs6[i], statereal);
1451
+ acc6r = _mm256_add_ps(acc6r, prod6r);
1452
+ prod7r = _mm256_mul_ps(coefs7[i], statereal);
1453
+ acc7r = _mm256_add_ps(acc7r, prod7r);
1454
+ prod0i = _mm256_mul_ps(coefs0[i], stateimag);
1455
+ acc0i = _mm256_add_ps(acc0i, prod0i);
1456
+ prod1i = _mm256_mul_ps(coefs1[i], stateimag);
1457
+ acc1i = _mm256_add_ps(acc1i, prod1i);
1458
+ prod2i = _mm256_mul_ps(coefs2[i], stateimag);
1459
+ acc2i = _mm256_add_ps(acc2i, prod2i);
1460
+ prod3i = _mm256_mul_ps(coefs3[i], stateimag);
1461
+ acc3i = _mm256_add_ps(acc3i, prod3i);
1462
+ prod4i = _mm256_mul_ps(coefs4[i], stateimag);
1463
+ acc4i = _mm256_add_ps(acc4i, prod4i);
1464
+ prod5i = _mm256_mul_ps(coefs5[i], stateimag);
1465
+ acc5i = _mm256_add_ps(acc5i, prod5i);
1466
+ prod6i = _mm256_mul_ps(coefs6[i], stateimag);
1467
+ acc6i = _mm256_add_ps(acc6i, prod6i);
1468
+ prod7i = _mm256_mul_ps(coefs7[i], stateimag);
1469
+ acc7i = _mm256_add_ps(acc7i, prod7i);
1470
+ #endif
1471
+ j = (j+1) & masksimd;
1472
+ }
1473
+ register __m256 h00r = _mm256_permute2f128_ps(acc0r, acc4r, 0x20);
1474
+ register __m256 h01r = _mm256_permute2f128_ps(acc0r, acc4r, 0x31);
1475
+ register __m256 h02r = _mm256_permute2f128_ps(acc1r, acc5r, 0x20);
1476
+ register __m256 h03r = _mm256_permute2f128_ps(acc1r, acc5r, 0x31);
1477
+ register __m256 h04r = _mm256_permute2f128_ps(acc2r, acc6r, 0x20);
1478
+ register __m256 h05r = _mm256_permute2f128_ps(acc2r, acc6r, 0x31);
1479
+ register __m256 h06r = _mm256_permute2f128_ps(acc3r, acc7r, 0x20);
1480
+ register __m256 h07r = _mm256_permute2f128_ps(acc3r, acc7r, 0x31);
1481
+ register __m256 h10r = _mm256_hadd_ps(h00r, h01r);
1482
+ register __m256 h11r = _mm256_hadd_ps(h02r, h03r);
1483
+ register __m256 h12r = _mm256_hadd_ps(h04r, h05r);
1484
+ register __m256 h13r = _mm256_hadd_ps(h06r, h07r);
1485
+ register __m256 h20r = _mm256_hadd_ps(h10r, h11r);
1486
+ register __m256 h21r = _mm256_hadd_ps(h12r, h13r);
1487
+ register __m256 h30r = _mm256_hadd_ps(h20r, h21r);
1488
+ register __m256 h00i = _mm256_permute2f128_ps(acc0i, acc4i, 0x20);
1489
+ register __m256 h01i = _mm256_permute2f128_ps(acc0i, acc4i, 0x31);
1490
+ register __m256 h02i = _mm256_permute2f128_ps(acc1i, acc5i, 0x20);
1491
+ register __m256 h03i = _mm256_permute2f128_ps(acc1i, acc5i, 0x31);
1492
+ register __m256 h04i = _mm256_permute2f128_ps(acc2i, acc6i, 0x20);
1493
+ register __m256 h05i = _mm256_permute2f128_ps(acc2i, acc6i, 0x31);
1494
+ register __m256 h06i = _mm256_permute2f128_ps(acc3i, acc7i, 0x20);
1495
+ register __m256 h07i = _mm256_permute2f128_ps(acc3i, acc7i, 0x31);
1496
+ register __m256 h10i = _mm256_hadd_ps(h00i, h01i);
1497
+ register __m256 h11i = _mm256_hadd_ps(h02i, h03i);
1498
+ register __m256 h12i = _mm256_hadd_ps(h04i, h05i);
1499
+ register __m256 h13i = _mm256_hadd_ps(h06i, h07i);
1500
+ register __m256 h20i = _mm256_hadd_ps(h10i, h11i);
1501
+ register __m256 h21i = _mm256_hadd_ps(h12i, h13i);
1502
+ register __m256 h30i = _mm256_hadd_ps(h20i, h21i);
1503
+ ocbuf->data[k+0].real = h30r[0];
1504
+ ocbuf->data[k+0].imag = h30i[0];
1505
+ ocbuf->data[k+1].real = h30r[1];
1506
+ ocbuf->data[k+1].imag = h30i[1];
1507
+ ocbuf->data[k+2].real = h30r[2];
1508
+ ocbuf->data[k+2].imag = h30i[2];
1509
+ ocbuf->data[k+3].real = h30r[3];
1510
+ ocbuf->data[k+3].imag = h30i[3];
1511
+ ocbuf->data[k+4].real = h30r[4];
1512
+ ocbuf->data[k+4].imag = h30i[4];
1513
+ ocbuf->data[k+5].real = h30r[5];
1514
+ ocbuf->data[k+5].imag = h30i[5];
1515
+ ocbuf->data[k+6].real = h30r[6];
1516
+ ocbuf->data[k+6].imag = h30i[6];
1517
+ ocbuf->data[k+7].real = h30r[7];
1518
+ ocbuf->data[k+7].imag = h30i[7];
1519
+ start_index = (start_index + 8) & mask;
1520
+ startsimd = start_index >> 3;
1521
+ k += 8;
1522
+ }
1523
+ for (; k < ibuflen; k++) {
1524
+ fir->stater[fir->index] = icbuf->data[k].real;
1525
+ fir->statei[fir->index] = icbuf->data[k].imag;
1526
+ fir->index = (fir->index + 1) & mask;
1527
+ coefs0 = (__m256*)fir->coefs[start_index & 7];
1528
+ acc0r = _mm256_setzero_ps();
1529
+ acc0i = _mm256_setzero_ps();
1530
+ j = startsimd;
1531
+ for (i = 0; i < lensimd; i++) {
1532
+ #if defined __FMA__
1533
+ acc0r = _mm256_fmadd_ps(coefs0[i], stater[j], acc0r);
1534
+ acc0i = _mm256_fmadd_ps(coefs0[i], statei[j], acc0i);
1535
+ #elif defined __FMA4__
1536
+ acc0r = _mm256_macc_ps(coefs0[i], stater[j], acc0r);
1537
+ acc0i = _mm256_macc_ps(coefs0[i], statei[j], acc0i);
1538
+ #else
1539
+ prod0r = _mm256_mul_ps(coefs0[i], stater[j]);
1540
+ prod0i = _mm256_mul_ps(coefs0[i], statei[j]);
1541
+ acc0r = _mm256_add_ps(acc0r, prod0r);
1542
+ acc0i = _mm256_add_ps(acc0i, prod0i);
1543
+ #endif
1544
+ j = (j+1) & masksimd;
1545
+ }
1546
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3] + acc0r[4] + acc0r[5] + acc0r[6] + acc0r[7];
1547
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3] + acc0i[4] + acc0i[5] + acc0i[6] + acc0i[7];
1548
+ start_index = (start_index + 1) & mask;
1549
+ startsimd = start_index >> 3;
1550
+ }
1551
+ }
1552
+
1553
+ return ocbuf;
1554
+ }
1555
+
1556
+
1557
+ pdlc_buffer_t* pdlc_fir_filter_interpolate_float_buffer(pdlc_fir_filter_t* fir, const pdlc_buffer_t *ifbuf, pdlc_buffer_t *ofbuf)
1558
+ {
1559
+ const unsigned int nb_coefs = fir->nb_coefs;
1560
+ const unsigned int flt_len = fir->state_len;
1561
+ const unsigned int mask = fir->index_mask;
1562
+ const unsigned int lensimd = fir->coef_len >> 3;
1563
+ const unsigned int masksimd = mask >> 3;
1564
+ const size_t ibuflen = ifbuf->length;
1565
+ const size_t obuflen = ibuflen*fir->max_counter;
1566
+ const float ffactor = (float)(fir->max_counter);
1567
+ const size_t mcounter = fir->max_counter;
1568
+ unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask;
1569
+ unsigned int startsimd = start_index >> 3;
1570
+ unsigned int i, j;
1571
+ size_t k = 0, l = 0;
1572
+ register __m256 acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
1573
+ #if !(defined __FMA__ || defined __FMA4__)
1574
+ register __m256 prod0, prod1, prod2, prod3, prod4, prod5, prod6, prod7;
1575
+ #endif
1576
+ register __m256 statereal;
1577
+ const __m256 *coefs0, *coefs1, *coefs2, *coefs3, *coefs4, *coefs5, *coefs6, *coefs7;
1578
+ __m256 *stater = (__m256*)fir->stater;
1579
+
1580
+
1581
+ if (!ofbuf)
1582
+ ofbuf = pdlc_buffer_new(obuflen);
1583
+ else if (ofbuf->length != obuflen)
1584
+ pdlc_buffer_resize(ofbuf, obuflen, 0);
1585
+
1586
+
1587
+ while ((start_index & 7) && k < obuflen) {
1588
+ if ((k % mcounter) == 0)
1589
+ fir->stater[fir->index] = ifbuf->data[l++];
1590
+ else
1591
+ fir->stater[fir->index] = 0.0f;
1592
+ fir->index = (fir->index + 1) & mask;
1593
+ coefs0 = (__m256*)fir->coefs[start_index & 7];
1594
+ acc0 = _mm256_setzero_ps();
1595
+ j = startsimd;
1596
+ for (i = 0; i < lensimd; i++) {
1597
+ #if defined __FMA__
1598
+ acc0 = _mm256_fmadd_ps(coefs0[i], stater[j], acc0);
1599
+ #elif defined __FMA4__
1600
+ acc0 = _mm256_macc_ps(coefs0[i], stater[j], acc0);
1601
+ #else
1602
+ prod0 = _mm256_mul_ps(coefs0[i], stater[j]);
1603
+ acc0 = _mm256_add_ps(acc0, prod0);
1604
+ #endif
1605
+ j = (j+1) & masksimd;
1606
+ }
1607
+ ofbuf->data[k] = (acc0[0] + acc0[1] + acc0[2] + acc0[3] + acc0[4] + acc0[5] + acc0[6] + acc0[7]) * ffactor;
1608
+ start_index = (start_index + 1) & mask;
1609
+ startsimd = start_index >> 3;
1610
+ k++;
1611
+ }
1612
+ while (k + 8 <= obuflen) {
1613
+ if (((k+0) % mcounter) == 0)
1614
+ fir->stater[fir->index] = ifbuf->data[l++];
1615
+ else
1616
+ fir->stater[fir->index] = 0.0f;
1617
+ fir->index = (fir->index + 1) & mask;
1618
+ if (((k+1) % mcounter) == 0)
1619
+ fir->stater[fir->index] = ifbuf->data[l++];
1620
+ else
1621
+ fir->stater[fir->index] = 0.0f;
1622
+ fir->index = (fir->index + 1) & mask;
1623
+ if (((k+2) % mcounter) == 0)
1624
+ fir->stater[fir->index] = ifbuf->data[l++];
1625
+ else
1626
+ fir->stater[fir->index] = 0.0f;
1627
+ fir->index = (fir->index + 1) & mask;
1628
+ if (((k+3) % mcounter) == 0)
1629
+ fir->stater[fir->index] = ifbuf->data[l++];
1630
+ else
1631
+ fir->stater[fir->index] = 0.0f;
1632
+ fir->index = (fir->index + 1) & mask;
1633
+ if (((k+4) % mcounter) == 0)
1634
+ fir->stater[fir->index] = ifbuf->data[l++];
1635
+ else
1636
+ fir->stater[fir->index] = 0.0f;
1637
+ fir->index = (fir->index + 1) & mask;
1638
+ if (((k+5) % mcounter) == 0)
1639
+ fir->stater[fir->index] = ifbuf->data[l++];
1640
+ else
1641
+ fir->stater[fir->index] = 0.0f;
1642
+ fir->index = (fir->index + 1) & mask;
1643
+ if (((k+6) % mcounter) == 0)
1644
+ fir->stater[fir->index] = ifbuf->data[l++];
1645
+ else
1646
+ fir->stater[fir->index] = 0.0f;
1647
+ fir->index = (fir->index + 1) & mask;
1648
+ if (((k+7) % mcounter) == 0)
1649
+ fir->stater[fir->index] = ifbuf->data[l++];
1650
+ else
1651
+ fir->stater[fir->index] = 0.0f;
1652
+ fir->index = (fir->index + 1) & mask;
1653
+ coefs0 = (__m256*)fir->coefs[(start_index + 0) & 7];
1654
+ coefs1 = (__m256*)fir->coefs[(start_index + 1) & 7];
1655
+ coefs2 = (__m256*)fir->coefs[(start_index + 2) & 7];
1656
+ coefs3 = (__m256*)fir->coefs[(start_index + 3) & 7];
1657
+ coefs4 = (__m256*)fir->coefs[(start_index + 4) & 7];
1658
+ coefs5 = (__m256*)fir->coefs[(start_index + 5) & 7];
1659
+ coefs6 = (__m256*)fir->coefs[(start_index + 6) & 7];
1660
+ coefs7 = (__m256*)fir->coefs[(start_index + 7) & 7];
1661
+ acc0 = _mm256_setzero_ps();
1662
+ acc1 = _mm256_setzero_ps();
1663
+ acc2 = _mm256_setzero_ps();
1664
+ acc3 = _mm256_setzero_ps();
1665
+ acc4 = _mm256_setzero_ps();
1666
+ acc5 = _mm256_setzero_ps();
1667
+ acc6 = _mm256_setzero_ps();
1668
+ acc7 = _mm256_setzero_ps();
1669
+ j = startsimd;
1670
+ for (i = 0; i < lensimd; i++) {
1671
+ statereal = stater[j];
1672
+ #if defined __FMA__
1673
+ acc0 = _mm256_fmadd_ps(coefs0[i], statereal, acc0);
1674
+ acc1 = _mm256_fmadd_ps(coefs1[i], statereal, acc1);
1675
+ acc2 = _mm256_fmadd_ps(coefs2[i], statereal, acc2);
1676
+ acc3 = _mm256_fmadd_ps(coefs3[i], statereal, acc3);
1677
+ acc4 = _mm256_fmadd_ps(coefs4[i], statereal, acc4);
1678
+ acc5 = _mm256_fmadd_ps(coefs5[i], statereal, acc5);
1679
+ acc6 = _mm256_fmadd_ps(coefs6[i], statereal, acc6);
1680
+ acc7 = _mm256_fmadd_ps(coefs7[i], statereal, acc7);
1681
+ #elif defined __FMA4__
1682
+ acc0 = _mm256_macc_ps(coefs0[i], statereal, acc0);
1683
+ acc1 = _mm256_macc_ps(coefs1[i], statereal, acc1);
1684
+ acc2 = _mm256_macc_ps(coefs2[i], statereal, acc2);
1685
+ acc3 = _mm256_macc_ps(coefs3[i], statereal, acc3);
1686
+ acc4 = _mm256_macc_ps(coefs4[i], statereal, acc4);
1687
+ acc5 = _mm256_macc_ps(coefs5[i], statereal, acc5);
1688
+ acc6 = _mm256_macc_ps(coefs6[i], statereal, acc6);
1689
+ acc7 = _mm256_macc_ps(coefs7[i], statereal, acc7);
1690
+ #else
1691
+ prod0 = _mm256_mul_ps(coefs0[i], statereal);
1692
+ acc0 = _mm256_add_ps(acc0, prod0);
1693
+ prod1 = _mm256_mul_ps(coefs1[i], statereal);
1694
+ acc1 = _mm256_add_ps(acc1, prod1);
1695
+ prod2 = _mm256_mul_ps(coefs2[i], statereal);
1696
+ acc2 = _mm256_add_ps(acc2, prod2);
1697
+ prod3 = _mm256_mul_ps(coefs3[i], statereal);
1698
+ acc3 = _mm256_add_ps(acc3, prod3);
1699
+ prod4 = _mm256_mul_ps(coefs4[i], statereal);
1700
+ acc4 = _mm256_add_ps(acc4, prod4);
1701
+ prod5 = _mm256_mul_ps(coefs5[i], statereal);
1702
+ acc5 = _mm256_add_ps(acc5, prod5);
1703
+ prod6 = _mm256_mul_ps(coefs6[i], statereal);
1704
+ acc6 = _mm256_add_ps(acc6, prod6);
1705
+ prod7 = _mm256_mul_ps(coefs7[i], statereal);
1706
+ acc7 = _mm256_add_ps(acc7, prod7);
1707
+ #endif
1708
+ j = (j+1) & masksimd;
1709
+ }
1710
+ register __m256 h00 = _mm256_permute2f128_ps(acc0, acc4, 0x20);
1711
+ register __m256 h01 = _mm256_permute2f128_ps(acc0, acc4, 0x31);
1712
+ register __m256 h02 = _mm256_permute2f128_ps(acc1, acc5, 0x20);
1713
+ register __m256 h03 = _mm256_permute2f128_ps(acc1, acc5, 0x31);
1714
+ register __m256 h04 = _mm256_permute2f128_ps(acc2, acc6, 0x20);
1715
+ register __m256 h05 = _mm256_permute2f128_ps(acc2, acc6, 0x31);
1716
+ register __m256 h06 = _mm256_permute2f128_ps(acc3, acc7, 0x20);
1717
+ register __m256 h07 = _mm256_permute2f128_ps(acc3, acc7, 0x31);
1718
+ register __m256 h10 = _mm256_hadd_ps(h00, h01);
1719
+ register __m256 h11 = _mm256_hadd_ps(h02, h03);
1720
+ register __m256 h12 = _mm256_hadd_ps(h04, h05);
1721
+ register __m256 h13 = _mm256_hadd_ps(h06, h07);
1722
+ register __m256 h20 = _mm256_hadd_ps(h10, h11);
1723
+ register __m256 h21 = _mm256_hadd_ps(h12, h13);
1724
+ register __m256 h30 = _mm256_hadd_ps(h20, h21);
1725
+ h30 = _mm256_mul_ps(h30, _mm256_set1_ps(ffactor));
1726
+ _mm256_storeu_ps(ofbuf->data + k, h30);
1727
+ start_index = (start_index + 8) & mask;
1728
+ startsimd = start_index >> 3;
1729
+ k += 8;
1730
+ }
1731
+ for (; k < obuflen; k++) {
1732
+ if ((k % mcounter) == 0)
1733
+ fir->stater[fir->index] = ifbuf->data[l++];
1734
+ else
1735
+ fir->stater[fir->index] = 0.0f;
1736
+ fir->index = (fir->index + 1) & mask;
1737
+ coefs0 = (__m256*)fir->coefs[start_index & 7];
1738
+ acc0 = _mm256_setzero_ps();
1739
+ j = startsimd;
1740
+ for (i = 0; i < lensimd; i++) {
1741
+ #if defined __FMA__
1742
+ acc0 = _mm256_fmadd_ps(coefs0[i], stater[j], acc0);
1743
+ #elif defined __FMA4__
1744
+ acc0 = _mm256_macc_ps(coefs0[i], stater[j], acc0);
1745
+ #else
1746
+ prod0 = _mm256_mul_ps(coefs0[i], stater[j]);
1747
+ acc0 = _mm256_add_ps(acc0, prod0);
1748
+ #endif
1749
+ j = (j+1) & masksimd;
1750
+ }
1751
+ ofbuf->data[k] = (acc0[0] + acc0[1] + acc0[2] + acc0[3] + acc0[4] + acc0[5] + acc0[6] + acc0[7]) * ffactor;
1752
+ start_index = (start_index + 1) & mask;
1753
+ startsimd = start_index >> 3;
1754
+ }
1755
+
1756
+ return ofbuf;
1757
+ }
1758
+
1759
+
1760
/* Complex-valued counterpart of pdlc_fir_filter_interpolate_float_buffer:
 * upsample a complex buffer by fir->max_counter via zero-stuffing, filter it
 * with the real FIR coefficients, and scale by the interpolation gain.
 *
 * Real and imaginary parts are kept in two separate circular state planes
 * (fir->stater / fir->statei) and filtered independently with the same
 * coefficients; the whole 8-way structure mirrors the float version, just
 * duplicated per plane ("...r" / "...i" variables).
 *
 * fir   - filter state; fir->index, fir->stater and fir->statei are updated.
 * icbuf - complex input samples (read only).
 * ocbuf - output buffer, allocated if NULL, resized if its length is not
 *         ibuflen * max_counter; returned to the caller.
 */
pdlc_complex_buffer_t* pdlc_fir_filter_interpolate_complex_buffer(pdlc_fir_filter_t* fir, const pdlc_complex_buffer_t *icbuf, pdlc_complex_buffer_t *ocbuf)
{
	const unsigned int nb_coefs = fir->nb_coefs;
	const unsigned int flt_len = fir->state_len;
	const unsigned int mask = fir->index_mask;        /* state length is a power of two */
	const unsigned int lensimd = fir->coef_len >> 3;  /* coefficient length in __m256 units */
	const unsigned int masksimd = mask >> 3;          /* circular mask in __m256 units */
	const size_t ibuflen = icbuf->length;
	const size_t obuflen = ibuflen*fir->max_counter;
	const float ffactor = (float)(fir->max_counter);  /* gain compensation for zero-stuffing */
	const size_t mcounter = fir->max_counter;
	unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask; /* oldest tap position */
	unsigned int startsimd = start_index >> 3;
	unsigned int i, j;
	size_t k = 0, l = 0;                              /* k: output index, l: input index */
	register __m256 acc0r, acc1r, acc2r, acc3r, acc4r, acc5r, acc6r, acc7r;
	register __m256 acc0i, acc1i, acc2i, acc3i, acc4i, acc5i, acc6i, acc7i;
#if !(defined __FMA__ || defined __FMA4__)
	register __m256 prod0r, prod1r, prod2r, prod3r, prod4r, prod5r, prod6r, prod7r;
	register __m256 prod0i, prod1i, prod2i, prod3i, prod4i, prod5i, prod6i, prod7i;
#endif
	register __m256 statereal, stateimag;
	const __m256 *coefs0, *coefs1, *coefs2, *coefs3, *coefs4, *coefs5, *coefs6, *coefs7;
	__m256 *stater = (__m256*)fir->stater;            /* real-plane state as aligned __m256 */
	__m256 *statei = (__m256*)fir->statei;            /* imaginary-plane state as aligned __m256 */


	if (!ocbuf)
		ocbuf = pdlc_complex_buffer_new(obuflen);
	else if (ocbuf->length != obuflen)
		pdlc_complex_buffer_resize(ocbuf, obuflen, 0);


	/* Prologue: single outputs until start_index is 8-aligned. */
	while ((start_index & 7) && k < obuflen) {
		if ((k % mcounter) == 0) {
			fir->stater[fir->index] = icbuf->data[l].real; /* real sample */
			fir->statei[fir->index] = icbuf->data[l].imag;
			l++;
		}
		else {
			fir->stater[fir->index] = 0.0f;                /* stuffed zero */
			fir->statei[fir->index] = 0.0f;
		}
		fir->index = (fir->index + 1) & mask;
		coefs0 = (__m256*)fir->coefs[start_index & 7];     /* rotation matching current phase */
		acc0r = _mm256_setzero_ps();
		acc0i = _mm256_setzero_ps();
		j = startsimd;
		for (i = 0; i < lensimd; i++) {
#if defined __FMA__
			acc0r = _mm256_fmadd_ps(coefs0[i], stater[j], acc0r);
			acc0i = _mm256_fmadd_ps(coefs0[i], statei[j], acc0i);
#elif defined __FMA4__
			acc0r = _mm256_macc_ps(coefs0[i], stater[j], acc0r);
			acc0i = _mm256_macc_ps(coefs0[i], statei[j], acc0i);
#else
			prod0r = _mm256_mul_ps(coefs0[i], stater[j]);
			prod0i = _mm256_mul_ps(coefs0[i], statei[j]);
			acc0r = _mm256_add_ps(acc0r, prod0r);
			acc0i = _mm256_add_ps(acc0i, prod0i);
#endif
			j = (j+1) & masksimd;                          /* wrap around circular state */
		}
		/* Horizontal sums, scaled by the interpolation gain. */
		ocbuf->data[k].real = (acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3] + acc0r[4] + acc0r[5] + acc0r[6] + acc0r[7]) * ffactor;
		ocbuf->data[k].imag = (acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3] + acc0i[4] + acc0i[5] + acc0i[6] + acc0i[7]) * ffactor;
		start_index = (start_index + 1) & mask;
		startsimd = start_index >> 3;
		k++;
	}
	/* Main loop: 8 complex outputs per iteration; each state load feeds all
	 * 8 phase accumulators of both planes. */
	while (k + 8 <= obuflen) {
		/* Push 8 entries per plane: sample at multiples of the factor, zero otherwise. */
		if (((k+0) % mcounter) == 0) {
			fir->stater[fir->index] = icbuf->data[l].real;
			fir->statei[fir->index] = icbuf->data[l].imag;
			l++;
		}
		else {
			fir->stater[fir->index] = 0.0f;
			fir->statei[fir->index] = 0.0f;
		}
		fir->index = (fir->index + 1) & mask;
		if (((k+1) % mcounter) == 0) {
			fir->stater[fir->index] = icbuf->data[l].real;
			fir->statei[fir->index] = icbuf->data[l].imag;
			l++;
		}
		else {
			fir->stater[fir->index] = 0.0f;
			fir->statei[fir->index] = 0.0f;
		}
		fir->index = (fir->index + 1) & mask;
		if (((k+2) % mcounter) == 0) {
			fir->stater[fir->index] = icbuf->data[l].real;
			fir->statei[fir->index] = icbuf->data[l].imag;
			l++;
		}
		else {
			fir->stater[fir->index] = 0.0f;
			fir->statei[fir->index] = 0.0f;
		}
		fir->index = (fir->index + 1) & mask;
		if (((k+3) % mcounter) == 0) {
			fir->stater[fir->index] = icbuf->data[l].real;
			fir->statei[fir->index] = icbuf->data[l].imag;
			l++;
		}
		else {
			fir->stater[fir->index] = 0.0f;
			fir->statei[fir->index] = 0.0f;
		}
		fir->index = (fir->index + 1) & mask;
		if (((k+4) % mcounter) == 0) {
			fir->stater[fir->index] = icbuf->data[l].real;
			fir->statei[fir->index] = icbuf->data[l].imag;
			l++;
		}
		else {
			fir->stater[fir->index] = 0.0f;
			fir->statei[fir->index] = 0.0f;
		}
		fir->index = (fir->index + 1) & mask;
		if (((k+5) % mcounter) == 0) {
			fir->stater[fir->index] = icbuf->data[l].real;
			fir->statei[fir->index] = icbuf->data[l].imag;
			l++;
		}
		else {
			fir->stater[fir->index] = 0.0f;
			fir->statei[fir->index] = 0.0f;
		}
		fir->index = (fir->index + 1) & mask;
		if (((k+6) % mcounter) == 0) {
			fir->stater[fir->index] = icbuf->data[l].real;
			fir->statei[fir->index] = icbuf->data[l].imag;
			l++;
		}
		else {
			fir->stater[fir->index] = 0.0f;
			fir->statei[fir->index] = 0.0f;
		}
		fir->index = (fir->index + 1) & mask;
		if (((k+7) % mcounter) == 0) {
			fir->stater[fir->index] = icbuf->data[l].real;
			fir->statei[fir->index] = icbuf->data[l].imag;
			l++;
		}
		else {
			fir->stater[fir->index] = 0.0f;
			fir->statei[fir->index] = 0.0f;
		}
		fir->index = (fir->index + 1) & mask;
		/* One rotated coefficient copy per output phase. */
		coefs0 = (__m256*)fir->coefs[(start_index + 0) & 7];
		coefs1 = (__m256*)fir->coefs[(start_index + 1) & 7];
		coefs2 = (__m256*)fir->coefs[(start_index + 2) & 7];
		coefs3 = (__m256*)fir->coefs[(start_index + 3) & 7];
		coefs4 = (__m256*)fir->coefs[(start_index + 4) & 7];
		coefs5 = (__m256*)fir->coefs[(start_index + 5) & 7];
		coefs6 = (__m256*)fir->coefs[(start_index + 6) & 7];
		coefs7 = (__m256*)fir->coefs[(start_index + 7) & 7];
		acc0r = _mm256_setzero_ps();
		acc0i = _mm256_setzero_ps();
		acc1r = _mm256_setzero_ps();
		acc1i = _mm256_setzero_ps();
		acc2r = _mm256_setzero_ps();
		acc2i = _mm256_setzero_ps();
		acc3r = _mm256_setzero_ps();
		acc3i = _mm256_setzero_ps();
		acc4r = _mm256_setzero_ps();
		acc4i = _mm256_setzero_ps();
		acc5r = _mm256_setzero_ps();
		acc5i = _mm256_setzero_ps();
		acc6r = _mm256_setzero_ps();
		acc6i = _mm256_setzero_ps();
		acc7r = _mm256_setzero_ps();
		acc7i = _mm256_setzero_ps();
		j = startsimd;
		for (i = 0; i < lensimd; i++) {
			statereal = stater[j]; /* load each plane once per lane */
			stateimag = statei[j];
#if defined __FMA__
			acc0r = _mm256_fmadd_ps(coefs0[i], statereal, acc0r);
			acc1r = _mm256_fmadd_ps(coefs1[i], statereal, acc1r);
			acc2r = _mm256_fmadd_ps(coefs2[i], statereal, acc2r);
			acc3r = _mm256_fmadd_ps(coefs3[i], statereal, acc3r);
			acc4r = _mm256_fmadd_ps(coefs4[i], statereal, acc4r);
			acc5r = _mm256_fmadd_ps(coefs5[i], statereal, acc5r);
			acc6r = _mm256_fmadd_ps(coefs6[i], statereal, acc6r);
			acc7r = _mm256_fmadd_ps(coefs7[i], statereal, acc7r);
			acc0i = _mm256_fmadd_ps(coefs0[i], stateimag, acc0i);
			acc1i = _mm256_fmadd_ps(coefs1[i], stateimag, acc1i);
			acc2i = _mm256_fmadd_ps(coefs2[i], stateimag, acc2i);
			acc3i = _mm256_fmadd_ps(coefs3[i], stateimag, acc3i);
			acc4i = _mm256_fmadd_ps(coefs4[i], stateimag, acc4i);
			acc5i = _mm256_fmadd_ps(coefs5[i], stateimag, acc5i);
			acc6i = _mm256_fmadd_ps(coefs6[i], stateimag, acc6i);
			acc7i = _mm256_fmadd_ps(coefs7[i], stateimag, acc7i);
#elif defined __FMA4__
			acc0r = _mm256_macc_ps(coefs0[i], statereal, acc0r);
			acc1r = _mm256_macc_ps(coefs1[i], statereal, acc1r);
			acc2r = _mm256_macc_ps(coefs2[i], statereal, acc2r);
			acc3r = _mm256_macc_ps(coefs3[i], statereal, acc3r);
			acc4r = _mm256_macc_ps(coefs4[i], statereal, acc4r);
			acc5r = _mm256_macc_ps(coefs5[i], statereal, acc5r);
			acc6r = _mm256_macc_ps(coefs6[i], statereal, acc6r);
			acc7r = _mm256_macc_ps(coefs7[i], statereal, acc7r);
			acc0i = _mm256_macc_ps(coefs0[i], stateimag, acc0i);
			acc1i = _mm256_macc_ps(coefs1[i], stateimag, acc1i);
			acc2i = _mm256_macc_ps(coefs2[i], stateimag, acc2i);
			acc3i = _mm256_macc_ps(coefs3[i], stateimag, acc3i);
			acc4i = _mm256_macc_ps(coefs4[i], stateimag, acc4i);
			acc5i = _mm256_macc_ps(coefs5[i], stateimag, acc5i);
			acc6i = _mm256_macc_ps(coefs6[i], stateimag, acc6i);
			acc7i = _mm256_macc_ps(coefs7[i], stateimag, acc7i);
#else
			prod0r = _mm256_mul_ps(coefs0[i], statereal);
			acc0r = _mm256_add_ps(acc0r, prod0r);
			prod1r = _mm256_mul_ps(coefs1[i], statereal);
			acc1r = _mm256_add_ps(acc1r, prod1r);
			prod2r = _mm256_mul_ps(coefs2[i], statereal);
			acc2r = _mm256_add_ps(acc2r, prod2r);
			prod3r = _mm256_mul_ps(coefs3[i], statereal);
			acc3r = _mm256_add_ps(acc3r, prod3r);
			prod4r = _mm256_mul_ps(coefs4[i], statereal);
			acc4r = _mm256_add_ps(acc4r, prod4r);
			prod5r = _mm256_mul_ps(coefs5[i], statereal);
			acc5r = _mm256_add_ps(acc5r, prod5r);
			prod6r = _mm256_mul_ps(coefs6[i], statereal);
			acc6r = _mm256_add_ps(acc6r, prod6r);
			prod7r = _mm256_mul_ps(coefs7[i], statereal);
			acc7r = _mm256_add_ps(acc7r, prod7r);
			prod0i = _mm256_mul_ps(coefs0[i], stateimag);
			acc0i = _mm256_add_ps(acc0i, prod0i);
			prod1i = _mm256_mul_ps(coefs1[i], stateimag);
			acc1i = _mm256_add_ps(acc1i, prod1i);
			prod2i = _mm256_mul_ps(coefs2[i], stateimag);
			acc2i = _mm256_add_ps(acc2i, prod2i);
			prod3i = _mm256_mul_ps(coefs3[i], stateimag);
			acc3i = _mm256_add_ps(acc3i, prod3i);
			prod4i = _mm256_mul_ps(coefs4[i], stateimag);
			acc4i = _mm256_add_ps(acc4i, prod4i);
			prod5i = _mm256_mul_ps(coefs5[i], stateimag);
			acc5i = _mm256_add_ps(acc5i, prod5i);
			prod6i = _mm256_mul_ps(coefs6[i], stateimag);
			acc6i = _mm256_add_ps(acc6i, prod6i);
			prod7i = _mm256_mul_ps(coefs7[i], stateimag);
			acc7i = _mm256_add_ps(acc7i, prod7i);
#endif
			j = (j+1) & masksimd;
		}
		/* 8x8 transpose-and-reduce per plane: h30r[n]/h30i[n] end up as the
		 * horizontal sums of acc<n>r/acc<n>i, i.e. outputs k+n. */
		register __m256 h00r = _mm256_permute2f128_ps(acc0r, acc4r, 0x20);
		register __m256 h01r = _mm256_permute2f128_ps(acc0r, acc4r, 0x31);
		register __m256 h02r = _mm256_permute2f128_ps(acc1r, acc5r, 0x20);
		register __m256 h03r = _mm256_permute2f128_ps(acc1r, acc5r, 0x31);
		register __m256 h04r = _mm256_permute2f128_ps(acc2r, acc6r, 0x20);
		register __m256 h05r = _mm256_permute2f128_ps(acc2r, acc6r, 0x31);
		register __m256 h06r = _mm256_permute2f128_ps(acc3r, acc7r, 0x20);
		register __m256 h07r = _mm256_permute2f128_ps(acc3r, acc7r, 0x31);
		register __m256 h10r = _mm256_hadd_ps(h00r, h01r);
		register __m256 h11r = _mm256_hadd_ps(h02r, h03r);
		register __m256 h12r = _mm256_hadd_ps(h04r, h05r);
		register __m256 h13r = _mm256_hadd_ps(h06r, h07r);
		register __m256 h20r = _mm256_hadd_ps(h10r, h11r);
		register __m256 h21r = _mm256_hadd_ps(h12r, h13r);
		register __m256 h30r = _mm256_hadd_ps(h20r, h21r);
		h30r = _mm256_mul_ps(h30r, _mm256_set1_ps(ffactor)); /* interpolation gain */
		register __m256 h00i = _mm256_permute2f128_ps(acc0i, acc4i, 0x20);
		register __m256 h01i = _mm256_permute2f128_ps(acc0i, acc4i, 0x31);
		register __m256 h02i = _mm256_permute2f128_ps(acc1i, acc5i, 0x20);
		register __m256 h03i = _mm256_permute2f128_ps(acc1i, acc5i, 0x31);
		register __m256 h04i = _mm256_permute2f128_ps(acc2i, acc6i, 0x20);
		register __m256 h05i = _mm256_permute2f128_ps(acc2i, acc6i, 0x31);
		register __m256 h06i = _mm256_permute2f128_ps(acc3i, acc7i, 0x20);
		register __m256 h07i = _mm256_permute2f128_ps(acc3i, acc7i, 0x31);
		register __m256 h10i = _mm256_hadd_ps(h00i, h01i);
		register __m256 h11i = _mm256_hadd_ps(h02i, h03i);
		register __m256 h12i = _mm256_hadd_ps(h04i, h05i);
		register __m256 h13i = _mm256_hadd_ps(h06i, h07i);
		register __m256 h20i = _mm256_hadd_ps(h10i, h11i);
		register __m256 h21i = _mm256_hadd_ps(h12i, h13i);
		register __m256 h30i = _mm256_hadd_ps(h20i, h21i);
		h30i = _mm256_mul_ps(h30i, _mm256_set1_ps(ffactor));
		/* Scatter into the interleaved complex output (no vector store possible). */
		ocbuf->data[k+0].real = h30r[0];
		ocbuf->data[k+0].imag = h30i[0];
		ocbuf->data[k+1].real = h30r[1];
		ocbuf->data[k+1].imag = h30i[1];
		ocbuf->data[k+2].real = h30r[2];
		ocbuf->data[k+2].imag = h30i[2];
		ocbuf->data[k+3].real = h30r[3];
		ocbuf->data[k+3].imag = h30i[3];
		ocbuf->data[k+4].real = h30r[4];
		ocbuf->data[k+4].imag = h30i[4];
		ocbuf->data[k+5].real = h30r[5];
		ocbuf->data[k+5].imag = h30i[5];
		ocbuf->data[k+6].real = h30r[6];
		ocbuf->data[k+6].imag = h30i[6];
		ocbuf->data[k+7].real = h30r[7];
		ocbuf->data[k+7].imag = h30i[7];
		start_index = (start_index + 8) & mask;
		startsimd = start_index >> 3;
		k += 8;
	}
	/* Epilogue: remaining (< 8) outputs, one at a time. */
	for (; k < obuflen; k++) {
		if ((k % mcounter) == 0) {
			fir->stater[fir->index] = icbuf->data[l].real;
			fir->statei[fir->index] = icbuf->data[l].imag;
			l++;
		}
		else {
			fir->stater[fir->index] = 0.0f;
			fir->statei[fir->index] = 0.0f;
		}
		fir->index = (fir->index + 1) & mask;
		coefs0 = (__m256*)fir->coefs[start_index & 7];
		acc0r = _mm256_setzero_ps();
		acc0i = _mm256_setzero_ps();
		j = startsimd;
		for (i = 0; i < lensimd; i++) {
#if defined __FMA__
			acc0r = _mm256_fmadd_ps(coefs0[i], stater[j], acc0r);
			acc0i = _mm256_fmadd_ps(coefs0[i], statei[j], acc0i);
#elif defined __FMA4__
			acc0r = _mm256_macc_ps(coefs0[i], stater[j], acc0r);
			acc0i = _mm256_macc_ps(coefs0[i], statei[j], acc0i);
#else
			prod0r = _mm256_mul_ps(coefs0[i], stater[j]);
			prod0i = _mm256_mul_ps(coefs0[i], statei[j]);
			acc0r = _mm256_add_ps(acc0r, prod0r);
			acc0i = _mm256_add_ps(acc0i, prod0i);
#endif
			j = (j+1) & masksimd;
		}
		ocbuf->data[k].real = (acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3] + acc0r[4] + acc0r[5] + acc0r[6] + acc0r[7]) * ffactor;
		ocbuf->data[k].imag = (acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3] + acc0i[4] + acc0i[5] + acc0i[6] + acc0i[7]) * ffactor;
		start_index = (start_index + 1) & mask;
		startsimd = start_index >> 3;
	}

	return ocbuf;
}
2098
+
2099
+
2100
/* Decimate (downsample) a real-valued buffer by fir->max_counter: every input
 * sample is pushed into the circular filter state, but the FIR dot product is
 * evaluated only once every max_counter samples (when fir->counter wraps to 0).
 *
 * The dot product is unrolled two-wide (even/odd __m256 lanes into acc0/acc1)
 * to hide FMA latency; the two accumulators are merged and horizontally
 * summed per output.
 *
 * fir->counter persists across calls, so decimation phase is preserved from
 * one buffer to the next; obuflen accounts for the samples still needed
 * before the first output of this call.
 *
 * fir   - filter state; fir->index, fir->stater and fir->counter are updated.
 * ifbuf - input samples (read only).
 * ofbuf - output buffer, allocated if NULL, resized if its length does not
 *         match the computed output count; returned to the caller.
 */
pdlc_buffer_t* pdlc_fir_filter_decimate_float_buffer(pdlc_fir_filter_t* fir, const pdlc_buffer_t *ifbuf, pdlc_buffer_t *ofbuf)
{
	const unsigned int nb_coefs = fir->nb_coefs;
	const unsigned int flt_len = fir->state_len;
	const unsigned int mask = fir->index_mask;        /* state length is a power of two */
	const unsigned int lensimd = fir->coef_len >> 3;  /* coefficient length in __m256 units */
	const unsigned int masksimd = mask >> 3;          /* circular mask in __m256 units */
	const int mcounter = fir->max_counter;            /* decimation factor */
	const size_t ibuflen = ifbuf->length;
	/* Outputs produced this call: inputs remaining after the (mcounter - counter)
	 * samples still owed to the previous call's phase, one output per mcounter. */
	const size_t obuflen = (size_t)ceil(((double)ibuflen - (double)((mcounter - fir->counter) % mcounter)) / (double)mcounter);
	unsigned int start_index = (flt_len + fir->index + fir->counter + 1 - nb_coefs) & mask; /* oldest tap position */
	unsigned int startsimd = start_index >> 3;
	unsigned int i0, i1, j0, j1;                      /* even/odd coefficient and state lane indices */
	size_t k, l;                                      /* k: input index, l: output index */
	register __m256 acc0, acc1;
#if !(defined __FMA__ || defined __FMA4__)
	register __m256 prod0, prod1;
#endif
	const __m256 *coefs;
	__m256 *stater = (__m256*)fir->stater;            /* state viewed as aligned __m256 lanes */


	if (!ofbuf)
		ofbuf = pdlc_buffer_new(obuflen);
	else if (ofbuf->length != obuflen)
		pdlc_buffer_resize(ofbuf, obuflen, 0);


	for (k = 0, l = 0; k < ibuflen; k++) {
		/* Every input sample enters the state... */
		fir->stater[fir->index] = ifbuf->data[k];
		fir->index = (fir->index + 1) & mask;
		/* ...but the filter output is computed only once per decimation period. */
		if (fir->counter == 0) {
			coefs = (__m256*)fir->coefs[start_index & 7]; /* rotation matching current phase */
			acc0 = _mm256_setzero_ps();
			acc1 = _mm256_setzero_ps();
			j0 = startsimd;
			j1 = (startsimd+1) & masksimd;
			i0 = 0;
			i1 = 1;
			/* Two-wide unrolled MAC over even (i0/j0) and odd (i1/j1) lanes. */
			while (i1 < lensimd) {
#if defined __FMA__
				acc0 = _mm256_fmadd_ps(coefs[i0], stater[j0], acc0);
				acc1 = _mm256_fmadd_ps(coefs[i1], stater[j1], acc1);
#elif defined __FMA4__
				acc0 = _mm256_macc_ps(coefs[i0], stater[j0], acc0);
				acc1 = _mm256_macc_ps(coefs[i1], stater[j1], acc1);
#else
				prod0 = _mm256_mul_ps(coefs[i0], stater[j0]);
				acc0 = _mm256_add_ps(acc0, prod0);
				prod1 = _mm256_mul_ps(coefs[i1], stater[j1]);
				acc1 = _mm256_add_ps(acc1, prod1);
#endif
				i0 += 2;
				i1 += 2;
				j0 = (j0+2) & masksimd;
				j1 = (j1+2) & masksimd;
			}
			/* Odd lensimd leaves one trailing even lane to fold in. */
			while (i0 < lensimd) {
#if defined __FMA__
				acc0 = _mm256_fmadd_ps(coefs[i0], stater[j0], acc0);
#elif defined __FMA4__
				acc0 = _mm256_macc_ps(coefs[i0], stater[j0], acc0);
#else
				prod0 = _mm256_mul_ps(coefs[i0], stater[j0]);
				acc0 = _mm256_add_ps(acc0, prod0);
#endif
				i0 += 2;
				j0 = (j0+2) & masksimd;
			}
			/* Merge the two accumulators and reduce horizontally. */
			acc0 = _mm256_add_ps(acc0, acc1);
			ofbuf->data[l++] = acc0[0] + acc0[1] + acc0[2] + acc0[3] + acc0[4] + acc0[5] + acc0[6] + acc0[7];
			/* Next output's window starts mcounter samples later. */
			start_index = (start_index + mcounter) & mask;
			startsimd = start_index >> 3;
		}
		fir->counter = (fir->counter + 1) % mcounter;
	}

	return ofbuf;
}
2179
+
2180
+
2181
/* Complex-input variant of pdlc_fir_filter_decimate_float_buffer: runs the
 * same real-coefficient FIR over the real and imaginary state rings in
 * lockstep (coefficients are shared, loaded once per vector) and decimates
 * the output by fir->max_counter.
 *
 * fir   - filter state (updated: both state rings, index, counter).
 * icbuf - complex input samples (read only).
 * ocbuf - output buffer; allocated if NULL, resized if its length does not
 *         match the expected output length. Returned to the caller.
 *
 * NOTE(review): same preconditions as the float variant — fir->coefs holds 8
 * alignment-rotated coefficient copies and lengths are multiples of 8 floats;
 * confirm against the initialization code. */
pdlc_complex_buffer_t* pdlc_fir_filter_decimate_complex_buffer(pdlc_fir_filter_t* fir, const pdlc_complex_buffer_t *icbuf, pdlc_complex_buffer_t *ocbuf)
{
	const unsigned int nb_coefs = fir->nb_coefs;
	const unsigned int flt_len = fir->state_len;
	const unsigned int mask = fir->index_mask;        /* ring mask: state length is a power of two */
	const unsigned int lensimd = fir->coef_len >> 3;  /* coefficient length in __m256 vectors */
	const unsigned int masksimd = mask >> 3;          /* ring mask in __m256 units */
	const int mcounter = fir->max_counter;            /* decimation factor */
	const size_t ibuflen = icbuf->length;
	/* One output per input sample at which the decimation counter is 0. */
	const size_t obuflen = (size_t)ceil(((double)ibuflen - (double)((mcounter - fir->counter) % mcounter)) / (double)mcounter);
	/* Ring position of the oldest state sample used by the next output. */
	unsigned int start_index = (flt_len + fir->index + fir->counter + 1 - nb_coefs) & mask;
	unsigned int startsimd = start_index >> 3;
	unsigned int i0, j0, i1, j1;
	size_t k, l;
	register __m256 acc0r, acc0i, acc1r, acc1i;       /* 2-way unrolled, real/imag accumulator pairs */
#if !(defined __FMA__ || defined __FMA4__)
	register __m256 prod0r, prod0i, prod1r, prod1i;
#endif
	const __m256 *coefs;
	__m256 *stater = (__m256*)fir->stater;            /* real-part state ring */
	__m256 *statei = (__m256*)fir->statei;            /* imaginary-part state ring */


	if (!ocbuf)
		ocbuf = pdlc_complex_buffer_new(obuflen);
	else if (ocbuf->length != obuflen)
		pdlc_complex_buffer_resize(ocbuf, obuflen, 0);


	for (k = 0, l = 0; k < ibuflen; k++) {
		/* Push the new sample into both circular state buffers. */
		fir->stater[fir->index] = icbuf->data[k].real;
		fir->statei[fir->index] = icbuf->data[k].imag;
		fir->index = (fir->index + 1) & mask;
		if (fir->counter == 0) {
			/* Pick the coefficient copy matching the current 8-sample alignment. */
			coefs = (__m256*)fir->coefs[start_index & 7];
			acc0r = _mm256_setzero_ps();
			acc0i = _mm256_setzero_ps();
			acc1r = _mm256_setzero_ps();
			acc1i = _mm256_setzero_ps();
			j0 = startsimd;
			j1 = (startsimd+1) & masksimd;
			i0 = 0;
			i1 = 1;
			/* Main loop: each coefficient vector multiplies both the real and
			 * the imaginary state vectors. */
			while (i1 < lensimd) {
#if defined __FMA__
				acc0r = _mm256_fmadd_ps(coefs[i0], stater[j0], acc0r);
				acc0i = _mm256_fmadd_ps(coefs[i0], statei[j0], acc0i);
				acc1r = _mm256_fmadd_ps(coefs[i1], stater[j1], acc1r);
				acc1i = _mm256_fmadd_ps(coefs[i1], statei[j1], acc1i);
#elif defined __FMA4__
				acc0r = _mm256_macc_ps(coefs[i0], stater[j0], acc0r);
				acc0i = _mm256_macc_ps(coefs[i0], statei[j0], acc0i);
				acc1r = _mm256_macc_ps(coefs[i1], stater[j1], acc1r);
				acc1i = _mm256_macc_ps(coefs[i1], statei[j1], acc1i);
#else
				prod0r = _mm256_mul_ps(coefs[i0], stater[j0]);
				acc0r = _mm256_add_ps(acc0r, prod0r);
				prod0i = _mm256_mul_ps(coefs[i0], statei[j0]);
				acc0i = _mm256_add_ps(acc0i, prod0i);
				prod1r = _mm256_mul_ps(coefs[i1], stater[j1]);
				acc1r = _mm256_add_ps(acc1r, prod1r);
				prod1i = _mm256_mul_ps(coefs[i1], statei[j1]);
				acc1i = _mm256_add_ps(acc1i, prod1i);
#endif
				i0 += 2;
				i1 += 2;
				j0 = (j0+2) & masksimd;
				j1 = (j1+2) & masksimd;
			}
			/* Tail: at most one leftover vector when lensimd is odd. */
			while (i0 < lensimd) {
#if defined __FMA__
				acc0r = _mm256_fmadd_ps(coefs[i0], stater[j0], acc0r);
				acc0i = _mm256_fmadd_ps(coefs[i0], statei[j0], acc0i);
#elif defined __FMA4__
				acc0r = _mm256_macc_ps(coefs[i0], stater[j0], acc0r);
				acc0i = _mm256_macc_ps(coefs[i0], statei[j0], acc0i);
#else
				prod0r = _mm256_mul_ps(coefs[i0], stater[j0]);
				acc0r = _mm256_add_ps(acc0r, prod0r);
				prod0i = _mm256_mul_ps(coefs[i0], statei[j0]);
				acc0i = _mm256_add_ps(acc0i, prod0i);
#endif
				i0 += 2;
				j0 = (j0+2) & masksimd;
			}
			/* Horizontal sums yield the scalar real and imaginary outputs. */
			acc0r = _mm256_add_ps(acc0r, acc1r);
			acc0i = _mm256_add_ps(acc0i, acc1i);
			ocbuf->data[l].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3] + acc0r[4] + acc0r[5] + acc0r[6] + acc0r[7];
			ocbuf->data[l].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3] + acc0i[4] + acc0i[5] + acc0i[6] + acc0i[7];
			l++;
			/* The next retained output starts mcounter samples later in the ring. */
			start_index = (start_index + mcounter) & mask;
			startsimd = start_index >> 3;
		}
		fir->counter = (fir->counter + 1) % mcounter;
	}

	return ocbuf;
}
2279
+
2280
+
2281
/* Transform a real input buffer into a complex output buffer of the same
 * length: for each input sample, .imag receives the FIR convolution output
 * and .real receives the input sample delayed to the middle tap of the
 * filter (i.e. the input re-aligned with the filter's group delay).
 *
 * With an even number of coefficients the true group delay falls between two
 * samples, so .real is the average of the two state samples straddling the
 * middle. NOTE(review): this structure looks like a Hilbert transformer
 * producing an analytic signal — confirm against the coefficient generator,
 * which is outside this view.
 *
 * fir   - filter state (updated: state ring and index).
 * ifbuf - real input samples (read only).
 * ocbuf - complex output; allocated if NULL, resized to match the input
 *         length. Returned to the caller.
 *
 * Processing is in three phases (in each branch of the odd/even split):
 *   1. one sample at a time until start_index is 8-aligned,
 *   2. 8 samples at a time with 8 accumulators and a vectorized 8-way
 *      horizontal reduction,
 *   3. one sample at a time for the remainder. */
pdlc_complex_buffer_t* pdlc_fir_filter_transform(pdlc_fir_filter_t* fir, const pdlc_buffer_t *ifbuf, pdlc_complex_buffer_t *ocbuf)
{
	const unsigned int nb_coefs = fir->nb_coefs;
	const unsigned int flt_len = fir->state_len;
	const unsigned int mask = fir->index_mask;        /* ring mask: state length is a power of two */
	const unsigned int lensimd = fir->coef_len >> 3;  /* coefficient length in __m256 vectors */
	const unsigned int masksimd = mask >> 3;          /* ring mask in __m256 units */
	const size_t ibuflen = ifbuf->length;
	/* Ring position of the oldest state sample used by the next output. */
	unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask;
	unsigned int startsimd = start_index >> 3;
	/* Ring position of the filter's middle tap: the delayed input sample
	 * emitted on the .real channel. */
	unsigned int middle_index = (start_index + nb_coefs / 2) & mask;
	unsigned int i, j;
	size_t k = 0;
	register __m256 acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
#if !(defined __FMA__ || defined __FMA4__)
	register __m256 prod0, prod1, prod2, prod3, prod4, prod5, prod6, prod7;
#endif
	register __m256 statereal;
	const __m256 *coefs0, *coefs1, *coefs2, *coefs3, *coefs4, *coefs5, *coefs6, *coefs7;
	__m256 *stater = (__m256*)fir->stater;


	if (!ocbuf)
		ocbuf = pdlc_complex_buffer_new(ibuflen);
	else if (ocbuf->length != ibuflen)
		pdlc_complex_buffer_resize(ocbuf, ibuflen, 0);


	if (nb_coefs & 1) {
		/* Odd coefficient count: the group delay is a whole number of
		 * samples, so .real is a single state sample. */
		/* Phase 1: scalar outputs until start_index reaches 8-alignment. */
		while ((start_index & 7) && k < ibuflen) {
			fir->stater[fir->index] = ifbuf->data[k];
			fir->index = (fir->index + 1) & mask;
			/* Coefficient copy matching the current 8-sample alignment. */
			coefs0 = (__m256*)fir->coefs[start_index & 7];
			acc0 = _mm256_setzero_ps();
			j = startsimd;
			for (i = 0; i < lensimd; i++) {
#if defined __FMA__
				acc0 = _mm256_fmadd_ps(coefs0[i], stater[j], acc0);
#elif defined __FMA4__
				acc0 = _mm256_macc_ps(coefs0[i], stater[j], acc0);
#else
				prod0 = _mm256_mul_ps(coefs0[i], stater[j]);
				acc0 = _mm256_add_ps(acc0, prod0);
#endif
				j = (j+1) & masksimd;
			}
			ocbuf->data[k].imag = acc0[0] + acc0[1] + acc0[2] + acc0[3] + acc0[4] + acc0[5] + acc0[6] + acc0[7];
			start_index = (start_index + 1) & mask;
			startsimd = start_index >> 3;
			ocbuf->data[k].real = fir->stater[middle_index];
			middle_index = (middle_index + 1) & mask;
			k++;
		}
		/* Phase 2: 8 outputs per iteration, one accumulator per output,
		 * each using the coefficient copy for its own alignment. */
		while (k + 8 <= ibuflen) {
			fir->stater[(fir->index + 0) & mask] = ifbuf->data[k + 0];
			fir->stater[(fir->index + 1) & mask] = ifbuf->data[k + 1];
			fir->stater[(fir->index + 2) & mask] = ifbuf->data[k + 2];
			fir->stater[(fir->index + 3) & mask] = ifbuf->data[k + 3];
			fir->stater[(fir->index + 4) & mask] = ifbuf->data[k + 4];
			fir->stater[(fir->index + 5) & mask] = ifbuf->data[k + 5];
			fir->stater[(fir->index + 6) & mask] = ifbuf->data[k + 6];
			fir->stater[(fir->index + 7) & mask] = ifbuf->data[k + 7];
			fir->index = (fir->index + 8) & mask;
			coefs0 = (__m256*)fir->coefs[(start_index + 0) & 7];
			coefs1 = (__m256*)fir->coefs[(start_index + 1) & 7];
			coefs2 = (__m256*)fir->coefs[(start_index + 2) & 7];
			coefs3 = (__m256*)fir->coefs[(start_index + 3) & 7];
			coefs4 = (__m256*)fir->coefs[(start_index + 4) & 7];
			coefs5 = (__m256*)fir->coefs[(start_index + 5) & 7];
			coefs6 = (__m256*)fir->coefs[(start_index + 6) & 7];
			coefs7 = (__m256*)fir->coefs[(start_index + 7) & 7];
			acc0 = _mm256_setzero_ps();
			acc1 = _mm256_setzero_ps();
			acc2 = _mm256_setzero_ps();
			acc3 = _mm256_setzero_ps();
			acc4 = _mm256_setzero_ps();
			acc5 = _mm256_setzero_ps();
			acc6 = _mm256_setzero_ps();
			acc7 = _mm256_setzero_ps();
			j = startsimd;
			for (i = 0; i < lensimd; i++) {
				/* One state vector load feeds all 8 accumulators. */
				statereal = stater[j];
#if defined __FMA__
				acc0 = _mm256_fmadd_ps(coefs0[i], statereal, acc0);
				acc1 = _mm256_fmadd_ps(coefs1[i], statereal, acc1);
				acc2 = _mm256_fmadd_ps(coefs2[i], statereal, acc2);
				acc3 = _mm256_fmadd_ps(coefs3[i], statereal, acc3);
				acc4 = _mm256_fmadd_ps(coefs4[i], statereal, acc4);
				acc5 = _mm256_fmadd_ps(coefs5[i], statereal, acc5);
				acc6 = _mm256_fmadd_ps(coefs6[i], statereal, acc6);
				acc7 = _mm256_fmadd_ps(coefs7[i], statereal, acc7);
#elif defined __FMA4__
				acc0 = _mm256_macc_ps(coefs0[i], statereal, acc0);
				acc1 = _mm256_macc_ps(coefs1[i], statereal, acc1);
				acc2 = _mm256_macc_ps(coefs2[i], statereal, acc2);
				acc3 = _mm256_macc_ps(coefs3[i], statereal, acc3);
				acc4 = _mm256_macc_ps(coefs4[i], statereal, acc4);
				acc5 = _mm256_macc_ps(coefs5[i], statereal, acc5);
				acc6 = _mm256_macc_ps(coefs6[i], statereal, acc6);
				acc7 = _mm256_macc_ps(coefs7[i], statereal, acc7);
#else
				prod0 = _mm256_mul_ps(coefs0[i], statereal);
				acc0 = _mm256_add_ps(acc0, prod0);
				prod1 = _mm256_mul_ps(coefs1[i], statereal);
				acc1 = _mm256_add_ps(acc1, prod1);
				prod2 = _mm256_mul_ps(coefs2[i], statereal);
				acc2 = _mm256_add_ps(acc2, prod2);
				prod3 = _mm256_mul_ps(coefs3[i], statereal);
				acc3 = _mm256_add_ps(acc3, prod3);
				prod4 = _mm256_mul_ps(coefs4[i], statereal);
				acc4 = _mm256_add_ps(acc4, prod4);
				prod5 = _mm256_mul_ps(coefs5[i], statereal);
				acc5 = _mm256_add_ps(acc5, prod5);
				prod6 = _mm256_mul_ps(coefs6[i], statereal);
				acc6 = _mm256_add_ps(acc6, prod6);
				prod7 = _mm256_mul_ps(coefs7[i], statereal);
				acc7 = _mm256_add_ps(acc7, prod7);
#endif
				j = (j+1) & masksimd;
			}
			/* 8-way horizontal reduction: lane-interleave the accumulators
			 * across 128-bit halves, then three rounds of hadd, so h30 lane n
			 * holds the horizontal sum of accn. */
			register __m256 h00 = _mm256_permute2f128_ps(acc0, acc4, 0x20);
			register __m256 h01 = _mm256_permute2f128_ps(acc0, acc4, 0x31);
			register __m256 h02 = _mm256_permute2f128_ps(acc1, acc5, 0x20);
			register __m256 h03 = _mm256_permute2f128_ps(acc1, acc5, 0x31);
			register __m256 h04 = _mm256_permute2f128_ps(acc2, acc6, 0x20);
			register __m256 h05 = _mm256_permute2f128_ps(acc2, acc6, 0x31);
			register __m256 h06 = _mm256_permute2f128_ps(acc3, acc7, 0x20);
			register __m256 h07 = _mm256_permute2f128_ps(acc3, acc7, 0x31);
			register __m256 h10 = _mm256_hadd_ps(h00, h01);
			register __m256 h11 = _mm256_hadd_ps(h02, h03);
			register __m256 h12 = _mm256_hadd_ps(h04, h05);
			register __m256 h13 = _mm256_hadd_ps(h06, h07);
			register __m256 h20 = _mm256_hadd_ps(h10, h11);
			register __m256 h21 = _mm256_hadd_ps(h12, h13);
			register __m256 h30 = _mm256_hadd_ps(h20, h21);
			ocbuf->data[k+0].imag = h30[0];
			ocbuf->data[k+1].imag = h30[1];
			ocbuf->data[k+2].imag = h30[2];
			ocbuf->data[k+3].imag = h30[3];
			ocbuf->data[k+4].imag = h30[4];
			ocbuf->data[k+5].imag = h30[5];
			ocbuf->data[k+6].imag = h30[6];
			ocbuf->data[k+7].imag = h30[7];
			start_index = (start_index + 8) & mask;
			startsimd = start_index >> 3;
			/* Emit the 8 delayed input samples on the .real channel. */
			ocbuf->data[k].real = fir->stater[middle_index];
			middle_index = (middle_index + 1) & mask;
			k++;
			ocbuf->data[k].real = fir->stater[middle_index];
			middle_index = (middle_index + 1) & mask;
			k++;
			ocbuf->data[k].real = fir->stater[middle_index];
			middle_index = (middle_index + 1) & mask;
			k++;
			ocbuf->data[k].real = fir->stater[middle_index];
			middle_index = (middle_index + 1) & mask;
			k++;
			ocbuf->data[k].real = fir->stater[middle_index];
			middle_index = (middle_index + 1) & mask;
			k++;
			ocbuf->data[k].real = fir->stater[middle_index];
			middle_index = (middle_index + 1) & mask;
			k++;
			ocbuf->data[k].real = fir->stater[middle_index];
			middle_index = (middle_index + 1) & mask;
			k++;
			ocbuf->data[k].real = fir->stater[middle_index];
			middle_index = (middle_index + 1) & mask;
			k++;
		}
		/* Phase 3: scalar remainder (fewer than 8 samples left). */
		for (; k < ibuflen; k++) {
			fir->stater[fir->index] = ifbuf->data[k];
			fir->index = (fir->index + 1) & mask;
			coefs0 = (__m256*)fir->coefs[start_index & 7];
			acc0 = _mm256_setzero_ps();
			j = startsimd;
			for (i = 0; i < lensimd; i++) {
#if defined __FMA__
				acc0 = _mm256_fmadd_ps(coefs0[i], stater[j], acc0);
#elif defined __FMA4__
				acc0 = _mm256_macc_ps(coefs0[i], stater[j], acc0);
#else
				prod0 = _mm256_mul_ps(coefs0[i], stater[j]);
				acc0 = _mm256_add_ps(acc0, prod0);
#endif
				j = (j+1) & masksimd;
			}
			ocbuf->data[k].imag = acc0[0] + acc0[1] + acc0[2] + acc0[3] + acc0[4] + acc0[5] + acc0[6] + acc0[7];
			start_index = (start_index + 1) & mask;
			startsimd = start_index >> 3;
			ocbuf->data[k].real = fir->stater[middle_index];
			middle_index = (middle_index + 1) & mask;
		}
	}
	else {
		/* Even coefficient count: the group delay falls half-way between two
		 * samples, so .real averages the two state samples around the middle.
		 * Phases are otherwise identical to the odd branch. */
		while ((start_index & 7) && k < ibuflen) {
			fir->stater[fir->index] = ifbuf->data[k];
			fir->index = (fir->index + 1) & mask;
			coefs0 = (__m256*)fir->coefs[start_index & 7];
			acc0 = _mm256_setzero_ps();
			j = startsimd;
			for (i = 0; i < lensimd; i++) {
#if defined __FMA__
				acc0 = _mm256_fmadd_ps(coefs0[i], stater[j], acc0);
#elif defined __FMA4__
				acc0 = _mm256_macc_ps(coefs0[i], stater[j], acc0);
#else
				prod0 = _mm256_mul_ps(coefs0[i], stater[j]);
				acc0 = _mm256_add_ps(acc0, prod0);
#endif
				j = (j+1) & masksimd;
			}
			ocbuf->data[k].imag = acc0[0] + acc0[1] + acc0[2] + acc0[3] + acc0[4] + acc0[5] + acc0[6] + acc0[7];
			start_index = (start_index + 1) & mask;
			startsimd = start_index >> 3;
			/* Half-sample delay: average the two samples straddling the middle tap. */
			ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
			middle_index = (middle_index + 1) & mask;
			k++;
		}
		while (k + 8 <= ibuflen) {
			fir->stater[(fir->index + 0) & mask] = ifbuf->data[k + 0];
			fir->stater[(fir->index + 1) & mask] = ifbuf->data[k + 1];
			fir->stater[(fir->index + 2) & mask] = ifbuf->data[k + 2];
			fir->stater[(fir->index + 3) & mask] = ifbuf->data[k + 3];
			fir->stater[(fir->index + 4) & mask] = ifbuf->data[k + 4];
			fir->stater[(fir->index + 5) & mask] = ifbuf->data[k + 5];
			fir->stater[(fir->index + 6) & mask] = ifbuf->data[k + 6];
			fir->stater[(fir->index + 7) & mask] = ifbuf->data[k + 7];
			fir->index = (fir->index + 8) & mask;
			coefs0 = (__m256*)fir->coefs[(start_index + 0) & 7];
			coefs1 = (__m256*)fir->coefs[(start_index + 1) & 7];
			coefs2 = (__m256*)fir->coefs[(start_index + 2) & 7];
			coefs3 = (__m256*)fir->coefs[(start_index + 3) & 7];
			coefs4 = (__m256*)fir->coefs[(start_index + 4) & 7];
			coefs5 = (__m256*)fir->coefs[(start_index + 5) & 7];
			coefs6 = (__m256*)fir->coefs[(start_index + 6) & 7];
			coefs7 = (__m256*)fir->coefs[(start_index + 7) & 7];
			acc0 = _mm256_setzero_ps();
			acc1 = _mm256_setzero_ps();
			acc2 = _mm256_setzero_ps();
			acc3 = _mm256_setzero_ps();
			acc4 = _mm256_setzero_ps();
			acc5 = _mm256_setzero_ps();
			acc6 = _mm256_setzero_ps();
			acc7 = _mm256_setzero_ps();
			j = startsimd;
			for (i = 0; i < lensimd; i++) {
				/* One state vector load feeds all 8 accumulators. */
				statereal = stater[j];
#if defined __FMA__
				acc0 = _mm256_fmadd_ps(coefs0[i], statereal, acc0);
				acc1 = _mm256_fmadd_ps(coefs1[i], statereal, acc1);
				acc2 = _mm256_fmadd_ps(coefs2[i], statereal, acc2);
				acc3 = _mm256_fmadd_ps(coefs3[i], statereal, acc3);
				acc4 = _mm256_fmadd_ps(coefs4[i], statereal, acc4);
				acc5 = _mm256_fmadd_ps(coefs5[i], statereal, acc5);
				acc6 = _mm256_fmadd_ps(coefs6[i], statereal, acc6);
				acc7 = _mm256_fmadd_ps(coefs7[i], statereal, acc7);
#elif defined __FMA4__
				acc0 = _mm256_macc_ps(coefs0[i], statereal, acc0);
				acc1 = _mm256_macc_ps(coefs1[i], statereal, acc1);
				acc2 = _mm256_macc_ps(coefs2[i], statereal, acc2);
				acc3 = _mm256_macc_ps(coefs3[i], statereal, acc3);
				acc4 = _mm256_macc_ps(coefs4[i], statereal, acc4);
				acc5 = _mm256_macc_ps(coefs5[i], statereal, acc5);
				acc6 = _mm256_macc_ps(coefs6[i], statereal, acc6);
				acc7 = _mm256_macc_ps(coefs7[i], statereal, acc7);
#else
				prod0 = _mm256_mul_ps(coefs0[i], statereal);
				acc0 = _mm256_add_ps(acc0, prod0);
				prod1 = _mm256_mul_ps(coefs1[i], statereal);
				acc1 = _mm256_add_ps(acc1, prod1);
				prod2 = _mm256_mul_ps(coefs2[i], statereal);
				acc2 = _mm256_add_ps(acc2, prod2);
				prod3 = _mm256_mul_ps(coefs3[i], statereal);
				acc3 = _mm256_add_ps(acc3, prod3);
				prod4 = _mm256_mul_ps(coefs4[i], statereal);
				acc4 = _mm256_add_ps(acc4, prod4);
				prod5 = _mm256_mul_ps(coefs5[i], statereal);
				acc5 = _mm256_add_ps(acc5, prod5);
				prod6 = _mm256_mul_ps(coefs6[i], statereal);
				acc6 = _mm256_add_ps(acc6, prod6);
				prod7 = _mm256_mul_ps(coefs7[i], statereal);
				acc7 = _mm256_add_ps(acc7, prod7);
#endif
				j = (j+1) & masksimd;
			}
			/* 8-way horizontal reduction (see odd branch): h30 lane n is the
			 * horizontal sum of accn. */
			register __m256 h00 = _mm256_permute2f128_ps(acc0, acc4, 0x20);
			register __m256 h01 = _mm256_permute2f128_ps(acc0, acc4, 0x31);
			register __m256 h02 = _mm256_permute2f128_ps(acc1, acc5, 0x20);
			register __m256 h03 = _mm256_permute2f128_ps(acc1, acc5, 0x31);
			register __m256 h04 = _mm256_permute2f128_ps(acc2, acc6, 0x20);
			register __m256 h05 = _mm256_permute2f128_ps(acc2, acc6, 0x31);
			register __m256 h06 = _mm256_permute2f128_ps(acc3, acc7, 0x20);
			register __m256 h07 = _mm256_permute2f128_ps(acc3, acc7, 0x31);
			register __m256 h10 = _mm256_hadd_ps(h00, h01);
			register __m256 h11 = _mm256_hadd_ps(h02, h03);
			register __m256 h12 = _mm256_hadd_ps(h04, h05);
			register __m256 h13 = _mm256_hadd_ps(h06, h07);
			register __m256 h20 = _mm256_hadd_ps(h10, h11);
			register __m256 h21 = _mm256_hadd_ps(h12, h13);
			register __m256 h30 = _mm256_hadd_ps(h20, h21);
			ocbuf->data[k+0].imag = h30[0];
			ocbuf->data[k+1].imag = h30[1];
			ocbuf->data[k+2].imag = h30[2];
			ocbuf->data[k+3].imag = h30[3];
			ocbuf->data[k+4].imag = h30[4];
			ocbuf->data[k+5].imag = h30[5];
			ocbuf->data[k+6].imag = h30[6];
			ocbuf->data[k+7].imag = h30[7];
			start_index = (start_index + 8) & mask;
			startsimd = start_index >> 3;
			/* Emit the 8 half-sample-delayed inputs on the .real channel. */
			ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
			middle_index = (middle_index + 1) & mask;
			k++;
			ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
			middle_index = (middle_index + 1) & mask;
			k++;
			ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
			middle_index = (middle_index + 1) & mask;
			k++;
			ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
			middle_index = (middle_index + 1) & mask;
			k++;
			ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
			middle_index = (middle_index + 1) & mask;
			k++;
			ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
			middle_index = (middle_index + 1) & mask;
			k++;
			ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
			middle_index = (middle_index + 1) & mask;
			k++;
			ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
			middle_index = (middle_index + 1) & mask;
			k++;
		}
		for (; k < ibuflen; k++) {
			fir->stater[fir->index] = ifbuf->data[k];
			fir->index = (fir->index + 1) & mask;
			coefs0 = (__m256*)fir->coefs[start_index & 7];
			acc0 = _mm256_setzero_ps();
			j = startsimd;
			for (i = 0; i < lensimd; i++) {
#if defined __FMA__
				acc0 = _mm256_fmadd_ps(coefs0[i], stater[j], acc0);
#elif defined __FMA4__
				acc0 = _mm256_macc_ps(coefs0[i], stater[j], acc0);
#else
				prod0 = _mm256_mul_ps(coefs0[i], stater[j]);
				acc0 = _mm256_add_ps(acc0, prod0);
#endif
				j = (j+1) & masksimd;
			}
			ocbuf->data[k].imag = acc0[0] + acc0[1] + acc0[2] + acc0[3] + acc0[4] + acc0[5] + acc0[6] + acc0[7];
			start_index = (start_index + 1) & mask;
			startsimd = start_index >> 3;
			ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
			middle_index = (middle_index + 1) & mask;
		}
	}

	return ocbuf;
}
2644
+
2645
+