paddlec 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2645 @@
1
+ /* Copyright (C) 2019 Théotime Bollengier <theotime.bollengier@gmail.com>
2
+ *
3
+ * This file is part of PaddleC
4
+ *
5
+ * PaddleC is free software: you can redistribute it and/or modify
6
+ * it under the terms of the GNU General Public License as published by
7
+ * the Free Software Foundation, either version 3 of the License, or
8
+ * (at your option) any later version.
9
+ *
10
+ * PaddleC is distributed in the hope that it will be useful,
11
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ * GNU General Public License for more details.
14
+ *
15
+ * You should have received a copy of the GNU General Public License
16
+ * along with PaddleC. If not, see <https://www.gnu.org/licenses/>.
17
+ */
18
+
19
+ #include <immintrin.h>
20
+
21
+ #if !(defined __FMA__) && defined __FMA4__
22
+ #include <x86intrin.h>
23
+ #endif
24
+
25
+
26
+ void pdlc_fir_filter_inspect(pdlc_fir_filter_t* fir)
27
+ {
28
+ size_t i, j;
29
+
30
+ printf("nb_coefs: %u, state_len: %u, coef_len: %u, index_mask: 0x%x, index: %u\n",
31
+ fir->nb_coefs, fir->state_len, fir->coef_len, fir->index_mask, fir->index);
32
+ printf("state: [%.7g", fir->stater[0]);
33
+ for (i = 1; i < fir->state_len; i++)
34
+ printf(", %.7g", fir->stater[i]);
35
+ printf("]\n");
36
+ for (j = 0; j < 8; j++) {
37
+ printf("coefs: {%lu}[%.7g", j, fir->coefs[j][0]);
38
+ for (i = 1; i < fir->coef_len; i++)
39
+ printf(", %.7g", fir->coefs[j][i]);
40
+ printf("]\n");
41
+ }
42
+ }
43
+
44
+
45
+ void pdlc_fir_filter_initialize(pdlc_fir_filter_t* fir, int order)
46
+ {
47
+ int i;
48
+
49
+ if (fir->coefs) {
50
+ for (i = 0; i < 8; i++)
51
+ if (fir->coefs[i])
52
+ _mm_free(fir->coefs[i]);
53
+ free(fir->coefs);
54
+ fir->coefs = NULL;
55
+ }
56
+
57
+ if (fir->stater)
58
+ _mm_free(fir->stater);
59
+ fir->stater = NULL;
60
+
61
+ if (fir->statei)
62
+ _mm_free(fir->statei);
63
+ fir->statei = NULL;
64
+
65
+ fir->nb_coefs = 0;
66
+ fir->state_len = 0;
67
+ fir->coef_len = 0;
68
+ fir->index = 0;
69
+ fir->index_mask = 0;
70
+ fir->counter = 0;
71
+ fir->max_counter = 1;
72
+
73
+ if (order < 0)
74
+ return;
75
+
76
+ if (order > 67108863) {
77
+ fprintf(stderr, "ERROR: libpaddlec: Filter order cannot be greater than 67108864\n");
78
+ exit(EXIT_FAILURE);
79
+ }
80
+
81
+ fir->nb_coefs = (unsigned int)(order + 1);
82
+ fir->coef_len = ((fir->nb_coefs + 7 + 7) >> 3) << 3;
83
+ fir->state_len = (unsigned int)(pow(2.0, ceil(log2(fir->coef_len))));
84
+ fir->index = 0;
85
+ fir->index_mask = fir->state_len - 1;
86
+
87
+ fir->coefs = malloc(8*sizeof(float*));
88
+ if (fir->coefs == NULL) {
89
+ fprintf(stderr, "ERROR: libpaddlec: Cannot allocate %lu bytes for FIR!\n", 8 * sizeof(float*));
90
+ exit(EXIT_FAILURE);
91
+ }
92
+
93
+ for (i = 0; i < 8; i++) {
94
+ fir->coefs[i] = _mm_malloc(fir->coef_len * sizeof(float), sizeof(__m256));
95
+ if (fir->coefs[i] == NULL) {
96
+ fprintf(stderr, "ERROR: libpaddlec: Cannot allocate %lu bytes for FIR!\n", fir->coef_len * sizeof(float));
97
+ exit(EXIT_FAILURE);
98
+ }
99
+ }
100
+
101
+ fir->stater = _mm_malloc(fir->state_len * sizeof(float), sizeof(__m256));
102
+ if (fir->stater == NULL) {
103
+ fprintf(stderr, "ERROR: libpaddlec: Cannot allocate %lu bytes for FIR!\n", fir->state_len * sizeof(float));
104
+ exit(EXIT_FAILURE);
105
+ }
106
+
107
+ fir->statei = _mm_malloc(fir->state_len * sizeof(float), sizeof(__m256));
108
+ if (fir->statei == NULL) {
109
+ fprintf(stderr, "ERROR: libpaddlec: Cannot allocate %lu bytes for FIR!\n", fir->state_len * sizeof(float));
110
+ exit(EXIT_FAILURE);
111
+ }
112
+
113
+ memset(fir->stater, 0, fir->state_len * sizeof(float));
114
+ memset(fir->statei, 0, fir->state_len * sizeof(float));
115
+ for (i = 0; i < 8; i++)
116
+ memset(fir->coefs[i], 0, fir->coef_len * sizeof(float));
117
+ }
118
+
119
+
120
+ void pdlc_fir_filter_free(pdlc_fir_filter_t* fir)
121
+ {
122
+ int i;
123
+
124
+ if (!fir)
125
+ return;
126
+
127
+ if (fir->coefs) {
128
+ for (i = 0; i < 8; i++)
129
+ if (fir->coefs[i])
130
+ _mm_free(fir->coefs[i]);
131
+ free(fir->coefs);
132
+ }
133
+
134
+ if (fir->stater)
135
+ _mm_free(fir->stater);
136
+
137
+ if (fir->statei)
138
+ _mm_free(fir->statei);
139
+
140
+ free(fir);
141
+ }
142
+
143
+
144
+ size_t pdlc_fir_filter_size(pdlc_fir_filter_t* fir)
145
+ {
146
+ size_t res;
147
+
148
+ res = sizeof(pdlc_fir_filter_t);
149
+ res += sizeof(float*)* 8;
150
+ res += sizeof(float) * fir->state_len * 2;
151
+ res += sizeof(float) * fir->coef_len * 8;
152
+
153
+ return res;
154
+ }
155
+
156
+
157
+ int pdlc_fir_filter_set_coef_at(pdlc_fir_filter_t* fir, int index, float value)
158
+ {
159
+ int i;
160
+
161
+ if (index < 0 || index >= (int)fir->nb_coefs)
162
+ return -1;
163
+
164
+ for (i = 0; i < 8; i++)
165
+ fir->coefs[i][(fir->nb_coefs - 1 - index + i) % fir->coef_len] = value;
166
+
167
+ return 0;
168
+ }
169
+
170
+
171
/* Push one real sample through the FIR filter and return the filtered
 * output. If `delayed` is non-NULL, it receives the input delayed by
 * the filter's group delay: the middle tap for an odd tap count, or
 * the average of the two middle taps for an even one.
 *
 * The state is a power-of-two ring buffer indexed with `& mask`; the
 * dot product runs 8 floats at a time over __m256 vectors. One of the
 * 8 coefficient copies is picked by (start_index & 7) — presumably so
 * the coefficient load alignment matches the state phase (set up in
 * pdlc_fir_filter_initialize). */
float pdlc_fir_filter_filter_float(pdlc_fir_filter_t* fir, float sample, float *delayed)
{
	const unsigned int nb_coefs = fir->nb_coefs;
	const unsigned int flt_len = fir->state_len;
	const unsigned int mask = fir->index_mask;
	/* Oldest sample contributing to this output (+flt_len keeps the
	 * unsigned subtraction non-negative before masking). */
	const unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask;
	const unsigned int middle_index = (start_index + nb_coefs / 2) & mask;
	const unsigned int lensimd = fir->coef_len >> 3;   /* # of __m256 per pass */
	const unsigned int startsimd = start_index >> 3;   /* first __m256 of state */
	const unsigned int masksimd = mask >> 3;           /* ring wrap in vector units */
	unsigned int i, j;
	register __m256 acc;
#if !(defined __FMA__ || defined __FMA4__)
	register __m256 prod;
#endif
	/* Coefficient copy matching the current alignment phase. */
	const __m256 *coefs = (__m256*)fir->coefs[start_index & 7];
	__m256 *stater = (__m256*)fir->stater;

	/* Store the new sample and advance the ring index. */
	fir->stater[fir->index] = sample;
	fir->index = (fir->index + 1) & mask;

	if (delayed) {
		if (nb_coefs & 1)
			*delayed = fir->stater[middle_index];
		else
			*delayed = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
	}

	/* 8-wide multiply-accumulate over the whole coefficient span. */
	acc = _mm256_setzero_ps();
	j = startsimd;
	for (i = 0; i < lensimd; i++) {
#if defined __FMA__
		acc = _mm256_fmadd_ps(coefs[i], stater[j], acc);
#elif defined __FMA4__
		acc = _mm256_macc_ps(coefs[i], stater[j], acc);
#else
		prod = _mm256_mul_ps(coefs[i], stater[j]);
		acc = _mm256_add_ps(acc, prod);
#endif
		j = (j+1) & masksimd;
	}

	/* Horizontal sum of the 8 accumulator lanes. */
	return acc[0] + acc[1] + acc[2] + acc[3] + acc[4] + acc[5] + acc[6] + acc[7];
}
215
+
216
+
217
/* Push one complex sample through the FIR filter and return the
 * filtered output. Real and imaginary parts go through the same real
 * coefficient set, using the two state rings (stater / statei). If
 * `delayed` is non-NULL it receives the group-delay-compensated input
 * (middle tap, or average of the two middle taps when nb_coefs is
 * even). Same ring-buffer / phased-coefficient scheme as
 * pdlc_fir_filter_filter_float. */
pdlc_complex_t pdlc_fir_filter_filter_complex(pdlc_fir_filter_t* fir, pdlc_complex_t sample, pdlc_complex_t *delayed)
{
	const unsigned int nb_coefs = fir->nb_coefs;
	const unsigned int flt_len = fir->state_len;
	const unsigned int mask = fir->index_mask;
	/* Oldest contributing sample (+flt_len avoids unsigned underflow). */
	const unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask;
	const unsigned int middle_index = (start_index + nb_coefs / 2) & mask;
	const unsigned int lensimd = fir->coef_len >> 3;
	const unsigned int startsimd = start_index >> 3;
	const unsigned int masksimd = mask >> 3;
	unsigned int i, j;
	pdlc_complex_t res = {0.0f, 0.0f};
	register __m256 accr, acci;
#if !(defined __FMA__ || defined __FMA4__)
	register __m256 prodr, prodi;
#endif
	/* Coefficient copy for the current alignment phase. */
	const __m256 *coefs = (__m256*)fir->coefs[start_index & 7];
	__m256 *stater = (__m256*)fir->stater;
	__m256 *statei = (__m256*)fir->statei;

	/* Store the new sample in both rings, then advance the index. */
	fir->stater[fir->index] = sample.real;
	fir->statei[fir->index] = sample.imag;
	fir->index = (fir->index + 1) & mask;

	/* 8-wide multiply-accumulate, real and imaginary in parallel. */
	accr = _mm256_setzero_ps();
	acci = _mm256_setzero_ps();
	j = startsimd;
	for (i = 0; i < lensimd; i++) {
#if defined __FMA__
		accr = _mm256_fmadd_ps(coefs[i], stater[j], accr);
		acci = _mm256_fmadd_ps(coefs[i], statei[j], acci);
#elif defined __FMA4__
		accr = _mm256_macc_ps(coefs[i], stater[j], accr);
		acci = _mm256_macc_ps(coefs[i], statei[j], acci);
#else
		prodr = _mm256_mul_ps(coefs[i], stater[j]);
		prodi = _mm256_mul_ps(coefs[i], statei[j]);
		accr = _mm256_add_ps(accr, prodr);
		acci = _mm256_add_ps(acci, prodi);
#endif
		j = (j+1) & masksimd;
	}
	/* Horizontal sums of the 8 lanes. */
	res.real = accr[0] + accr[1] + accr[2] + accr[3] + accr[4] + accr[5] + accr[6] + accr[7];
	res.imag = acci[0] + acci[1] + acci[2] + acci[3] + acci[4] + acci[5] + acci[6] + acci[7];

	if (delayed) {
		if (nb_coefs & 1) {
			delayed->real = fir->stater[middle_index];
			delayed->imag = fir->statei[middle_index];
		}
		else {
			delayed->real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
			delayed->imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
		}
	}

	return res;
}
275
+
276
+
277
/* Filter a whole buffer of real samples.
 *
 * ifbuf   : input samples (read only).
 * ofbuf   : output buffer; allocated if NULL, resized if its length
 *           differs from the input's. Returned to the caller.
 * delayed : optional; when non-NULL it is resized to the input length
 *           and filled with the group-delay-compensated input (middle
 *           tap when nb_coefs is odd, average of the two middle taps
 *           when even).
 *
 * Strategy (three copies of the same pipeline: odd-tap-with-delayed,
 * even-tap-with-delayed, no-delayed):
 *   1. a scalar head loop runs one sample at a time until start_index
 *      is 8-aligned ((start_index & 7) == 0);
 *   2. a main loop processes 8 samples per iteration, keeping 8
 *      accumulators — one per output — each fed from the coefficient
 *      copy matching its phase, then transposes/reduces the 8
 *      accumulators with permute2f128 + three hadd rounds into one
 *      vector of 8 outputs;
 *   3. a scalar tail loop finishes the remainder.
 * State is the usual power-of-two ring indexed with `& mask`. */
pdlc_buffer_t* pdlc_fir_filter_filter_float_buffer(pdlc_fir_filter_t* fir, const pdlc_buffer_t *ifbuf, pdlc_buffer_t *ofbuf, pdlc_buffer_t *delayed)
{
	const unsigned int nb_coefs = fir->nb_coefs;
	const unsigned int flt_len = fir->state_len;
	const unsigned int mask = fir->index_mask;
	const unsigned int lensimd = fir->coef_len >> 3;   /* __m256 per dot product */
	const unsigned int masksimd = mask >> 3;           /* ring wrap in vector units */
	const size_t ibuflen = ifbuf->length;
	/* Oldest contributing sample (+flt_len avoids unsigned underflow). */
	unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask;
	unsigned int startsimd = start_index >> 3;
	unsigned int middle_index;
	unsigned int i, j;
	size_t k;
	register __m256 acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
#if !(defined __FMA__ || defined __FMA4__)
	register __m256 prod0, prod1, prod2, prod3, prod4, prod5, prod6, prod7;
#endif
	register __m256 statereal;
	const __m256 *coefs0, *coefs1, *coefs2, *coefs3, *coefs4, *coefs5, *coefs6, *coefs7;
	__m256 *stater = (__m256*)fir->stater;

	if (!ofbuf)
		ofbuf = pdlc_buffer_new(ibuflen);
	else if (ofbuf->length != ibuflen)
		pdlc_buffer_resize(ofbuf, ibuflen, 0);

	if (delayed) {
		if (delayed->length != ibuflen)
			pdlc_buffer_resize(delayed, ibuflen, 0);
		middle_index = (start_index + nb_coefs / 2) & mask;
		if (nb_coefs & 1) {
			/* --- odd tap count: delayed output is the middle tap --- */
			/* Scalar head: one sample at a time until 8-aligned. */
			k = 0;
			while ((start_index & 7) && k < ibuflen) {
				fir->stater[fir->index] = ifbuf->data[k];
				fir->index = (fir->index + 1) & mask;
				coefs0 = (__m256*)fir->coefs[start_index & 7];
				acc0 = _mm256_setzero_ps();
				j = startsimd;
				for (i = 0; i < lensimd; i++) {
#if defined __FMA__
					acc0 = _mm256_fmadd_ps(coefs0[i], stater[j], acc0);
#elif defined __FMA4__
					acc0 = _mm256_macc_ps(coefs0[i], stater[j], acc0);
#else
					prod0 = _mm256_mul_ps(coefs0[i], stater[j]);
					acc0 = _mm256_add_ps(acc0, prod0);
#endif
					j = (j+1) & masksimd;
				}
				ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3] + acc0[4] + acc0[5] + acc0[6] + acc0[7];
				start_index = (start_index + 1) & mask;
				startsimd = start_index >> 3;
				delayed->data[k] = fir->stater[middle_index];
				middle_index = (middle_index + 1) & mask;
				k++;
			}
			/* Main loop: 8 outputs per iteration, one accumulator and
			 * one phased coefficient copy per output. */
			while (k + 8 <= ibuflen) {
				fir->stater[(fir->index + 0) & mask] = ifbuf->data[k + 0];
				fir->stater[(fir->index + 1) & mask] = ifbuf->data[k + 1];
				fir->stater[(fir->index + 2) & mask] = ifbuf->data[k + 2];
				fir->stater[(fir->index + 3) & mask] = ifbuf->data[k + 3];
				fir->stater[(fir->index + 4) & mask] = ifbuf->data[k + 4];
				fir->stater[(fir->index + 5) & mask] = ifbuf->data[k + 5];
				fir->stater[(fir->index + 6) & mask] = ifbuf->data[k + 6];
				fir->stater[(fir->index + 7) & mask] = ifbuf->data[k + 7];
				fir->index = (fir->index + 8) & mask;
				coefs0 = (__m256*)fir->coefs[(start_index + 0) & 7];
				coefs1 = (__m256*)fir->coefs[(start_index + 1) & 7];
				coefs2 = (__m256*)fir->coefs[(start_index + 2) & 7];
				coefs3 = (__m256*)fir->coefs[(start_index + 3) & 7];
				coefs4 = (__m256*)fir->coefs[(start_index + 4) & 7];
				coefs5 = (__m256*)fir->coefs[(start_index + 5) & 7];
				coefs6 = (__m256*)fir->coefs[(start_index + 6) & 7];
				coefs7 = (__m256*)fir->coefs[(start_index + 7) & 7];
				acc0 = _mm256_setzero_ps();
				acc1 = _mm256_setzero_ps();
				acc2 = _mm256_setzero_ps();
				acc3 = _mm256_setzero_ps();
				acc4 = _mm256_setzero_ps();
				acc5 = _mm256_setzero_ps();
				acc6 = _mm256_setzero_ps();
				acc7 = _mm256_setzero_ps();
				j = startsimd;
				/* One shared state load feeds all 8 accumulators. */
				for (i = 0; i < lensimd; i++) {
					statereal = stater[j];
#if defined __FMA__
					acc0 = _mm256_fmadd_ps(coefs0[i], statereal, acc0);
					acc1 = _mm256_fmadd_ps(coefs1[i], statereal, acc1);
					acc2 = _mm256_fmadd_ps(coefs2[i], statereal, acc2);
					acc3 = _mm256_fmadd_ps(coefs3[i], statereal, acc3);
					acc4 = _mm256_fmadd_ps(coefs4[i], statereal, acc4);
					acc5 = _mm256_fmadd_ps(coefs5[i], statereal, acc5);
					acc6 = _mm256_fmadd_ps(coefs6[i], statereal, acc6);
					acc7 = _mm256_fmadd_ps(coefs7[i], statereal, acc7);
#elif defined __FMA4__
					acc0 = _mm256_macc_ps(coefs0[i], statereal, acc0);
					acc1 = _mm256_macc_ps(coefs1[i], statereal, acc1);
					acc2 = _mm256_macc_ps(coefs2[i], statereal, acc2);
					acc3 = _mm256_macc_ps(coefs3[i], statereal, acc3);
					acc4 = _mm256_macc_ps(coefs4[i], statereal, acc4);
					acc5 = _mm256_macc_ps(coefs5[i], statereal, acc5);
					acc6 = _mm256_macc_ps(coefs6[i], statereal, acc6);
					acc7 = _mm256_macc_ps(coefs7[i], statereal, acc7);
#else
					prod0 = _mm256_mul_ps(coefs0[i], statereal);
					acc0 = _mm256_add_ps(acc0, prod0);
					prod1 = _mm256_mul_ps(coefs1[i], statereal);
					acc1 = _mm256_add_ps(acc1, prod1);
					prod2 = _mm256_mul_ps(coefs2[i], statereal);
					acc2 = _mm256_add_ps(acc2, prod2);
					prod3 = _mm256_mul_ps(coefs3[i], statereal);
					acc3 = _mm256_add_ps(acc3, prod3);
					prod4 = _mm256_mul_ps(coefs4[i], statereal);
					acc4 = _mm256_add_ps(acc4, prod4);
					prod5 = _mm256_mul_ps(coefs5[i], statereal);
					acc5 = _mm256_add_ps(acc5, prod5);
					prod6 = _mm256_mul_ps(coefs6[i], statereal);
					acc6 = _mm256_add_ps(acc6, prod6);
					prod7 = _mm256_mul_ps(coefs7[i], statereal);
					acc7 = _mm256_add_ps(acc7, prod7);
#endif
					j = (j+1) & masksimd;
				}
				/* 8x8 lane reduction: interleave 128-bit halves, then
				 * three rounds of horizontal adds -> 8 scalar outputs. */
				register __m256 h00 = _mm256_permute2f128_ps(acc0, acc4, 0x20);
				register __m256 h01 = _mm256_permute2f128_ps(acc0, acc4, 0x31);
				register __m256 h02 = _mm256_permute2f128_ps(acc1, acc5, 0x20);
				register __m256 h03 = _mm256_permute2f128_ps(acc1, acc5, 0x31);
				register __m256 h04 = _mm256_permute2f128_ps(acc2, acc6, 0x20);
				register __m256 h05 = _mm256_permute2f128_ps(acc2, acc6, 0x31);
				register __m256 h06 = _mm256_permute2f128_ps(acc3, acc7, 0x20);
				register __m256 h07 = _mm256_permute2f128_ps(acc3, acc7, 0x31);
				register __m256 h10 = _mm256_hadd_ps(h00, h01);
				register __m256 h11 = _mm256_hadd_ps(h02, h03);
				register __m256 h12 = _mm256_hadd_ps(h04, h05);
				register __m256 h13 = _mm256_hadd_ps(h06, h07);
				register __m256 h20 = _mm256_hadd_ps(h10, h11);
				register __m256 h21 = _mm256_hadd_ps(h12, h13);
				register __m256 h30 = _mm256_hadd_ps(h20, h21);
				_mm256_storeu_ps(ofbuf->data + k, h30);
				start_index = (start_index + 8) & mask;
				startsimd = start_index >> 3;
				/* Eight delayed taps; k advances by 8 in the process. */
				delayed->data[k] = fir->stater[middle_index];
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = fir->stater[middle_index];
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = fir->stater[middle_index];
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = fir->stater[middle_index];
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = fir->stater[middle_index];
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = fir->stater[middle_index];
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = fir->stater[middle_index];
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = fir->stater[middle_index];
				middle_index = (middle_index + 1) & mask;
				k++;
			}
			/* Scalar tail: remaining (< 8) samples. */
			for (; k < ibuflen; k++) {
				fir->stater[fir->index] = ifbuf->data[k];
				fir->index = (fir->index + 1) & mask;
				coefs0 = (__m256*)fir->coefs[start_index & 7];
				acc0 = _mm256_setzero_ps();
				j = startsimd;
				for (i = 0; i < lensimd; i++) {
#if defined __FMA__
					acc0 = _mm256_fmadd_ps(coefs0[i], stater[j], acc0);
#elif defined __FMA4__
					acc0 = _mm256_macc_ps(coefs0[i], stater[j], acc0);
#else
					prod0 = _mm256_mul_ps(coefs0[i], stater[j]);
					acc0 = _mm256_add_ps(acc0, prod0);
#endif
					j = (j+1) & masksimd;
				}
				ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3] + acc0[4] + acc0[5] + acc0[6] + acc0[7];
				start_index = (start_index + 1) & mask;
				startsimd = start_index >> 3;
				delayed->data[k] = fir->stater[middle_index];
				middle_index = (middle_index + 1) & mask;
			}
		}
		else {
			/* --- even tap count: delayed output averages the two
			 * middle taps. Same head / main / tail pipeline. --- */
			k = 0;
			while ((start_index & 7) && k < ibuflen) {
				fir->stater[fir->index] = ifbuf->data[k];
				fir->index = (fir->index + 1) & mask;
				coefs0 = (__m256*)fir->coefs[start_index & 7];
				acc0 = _mm256_setzero_ps();
				j = startsimd;
				for (i = 0; i < lensimd; i++) {
#if defined __FMA__
					acc0 = _mm256_fmadd_ps(coefs0[i], stater[j], acc0);
#elif defined __FMA4__
					acc0 = _mm256_macc_ps(coefs0[i], stater[j], acc0);
#else
					prod0 = _mm256_mul_ps(coefs0[i], stater[j]);
					acc0 = _mm256_add_ps(acc0, prod0);
#endif
					j = (j+1) & masksimd;
				}
				ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3] + acc0[4] + acc0[5] + acc0[6] + acc0[7];
				start_index = (start_index + 1) & mask;
				startsimd = start_index >> 3;
				delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
				middle_index = (middle_index + 1) & mask;
				k++;
			}
			while (k + 8 <= ibuflen) {
				fir->stater[(fir->index + 0) & mask] = ifbuf->data[k + 0];
				fir->stater[(fir->index + 1) & mask] = ifbuf->data[k + 1];
				fir->stater[(fir->index + 2) & mask] = ifbuf->data[k + 2];
				fir->stater[(fir->index + 3) & mask] = ifbuf->data[k + 3];
				fir->stater[(fir->index + 4) & mask] = ifbuf->data[k + 4];
				fir->stater[(fir->index + 5) & mask] = ifbuf->data[k + 5];
				fir->stater[(fir->index + 6) & mask] = ifbuf->data[k + 6];
				fir->stater[(fir->index + 7) & mask] = ifbuf->data[k + 7];
				fir->index = (fir->index + 8) & mask;
				coefs0 = (__m256*)fir->coefs[(start_index + 0) & 7];
				coefs1 = (__m256*)fir->coefs[(start_index + 1) & 7];
				coefs2 = (__m256*)fir->coefs[(start_index + 2) & 7];
				coefs3 = (__m256*)fir->coefs[(start_index + 3) & 7];
				coefs4 = (__m256*)fir->coefs[(start_index + 4) & 7];
				coefs5 = (__m256*)fir->coefs[(start_index + 5) & 7];
				coefs6 = (__m256*)fir->coefs[(start_index + 6) & 7];
				coefs7 = (__m256*)fir->coefs[(start_index + 7) & 7];
				acc0 = _mm256_setzero_ps();
				acc1 = _mm256_setzero_ps();
				acc2 = _mm256_setzero_ps();
				acc3 = _mm256_setzero_ps();
				acc4 = _mm256_setzero_ps();
				acc5 = _mm256_setzero_ps();
				acc6 = _mm256_setzero_ps();
				acc7 = _mm256_setzero_ps();
				j = startsimd;
				for (i = 0; i < lensimd; i++) {
					statereal = stater[j];
#if defined __FMA__
					acc0 = _mm256_fmadd_ps(coefs0[i], statereal, acc0);
					acc1 = _mm256_fmadd_ps(coefs1[i], statereal, acc1);
					acc2 = _mm256_fmadd_ps(coefs2[i], statereal, acc2);
					acc3 = _mm256_fmadd_ps(coefs3[i], statereal, acc3);
					acc4 = _mm256_fmadd_ps(coefs4[i], statereal, acc4);
					acc5 = _mm256_fmadd_ps(coefs5[i], statereal, acc5);
					acc6 = _mm256_fmadd_ps(coefs6[i], statereal, acc6);
					acc7 = _mm256_fmadd_ps(coefs7[i], statereal, acc7);
#elif defined __FMA4__
					acc0 = _mm256_macc_ps(coefs0[i], statereal, acc0);
					acc1 = _mm256_macc_ps(coefs1[i], statereal, acc1);
					acc2 = _mm256_macc_ps(coefs2[i], statereal, acc2);
					acc3 = _mm256_macc_ps(coefs3[i], statereal, acc3);
					acc4 = _mm256_macc_ps(coefs4[i], statereal, acc4);
					acc5 = _mm256_macc_ps(coefs5[i], statereal, acc5);
					acc6 = _mm256_macc_ps(coefs6[i], statereal, acc6);
					acc7 = _mm256_macc_ps(coefs7[i], statereal, acc7);
#else
					prod0 = _mm256_mul_ps(coefs0[i], statereal);
					acc0 = _mm256_add_ps(acc0, prod0);
					prod1 = _mm256_mul_ps(coefs1[i], statereal);
					acc1 = _mm256_add_ps(acc1, prod1);
					prod2 = _mm256_mul_ps(coefs2[i], statereal);
					acc2 = _mm256_add_ps(acc2, prod2);
					prod3 = _mm256_mul_ps(coefs3[i], statereal);
					acc3 = _mm256_add_ps(acc3, prod3);
					prod4 = _mm256_mul_ps(coefs4[i], statereal);
					acc4 = _mm256_add_ps(acc4, prod4);
					prod5 = _mm256_mul_ps(coefs5[i], statereal);
					acc5 = _mm256_add_ps(acc5, prod5);
					prod6 = _mm256_mul_ps(coefs6[i], statereal);
					acc6 = _mm256_add_ps(acc6, prod6);
					prod7 = _mm256_mul_ps(coefs7[i], statereal);
					acc7 = _mm256_add_ps(acc7, prod7);
#endif
					j = (j+1) & masksimd;
				}
				register __m256 h00 = _mm256_permute2f128_ps(acc0, acc4, 0x20);
				register __m256 h01 = _mm256_permute2f128_ps(acc0, acc4, 0x31);
				register __m256 h02 = _mm256_permute2f128_ps(acc1, acc5, 0x20);
				register __m256 h03 = _mm256_permute2f128_ps(acc1, acc5, 0x31);
				register __m256 h04 = _mm256_permute2f128_ps(acc2, acc6, 0x20);
				register __m256 h05 = _mm256_permute2f128_ps(acc2, acc6, 0x31);
				register __m256 h06 = _mm256_permute2f128_ps(acc3, acc7, 0x20);
				register __m256 h07 = _mm256_permute2f128_ps(acc3, acc7, 0x31);
				register __m256 h10 = _mm256_hadd_ps(h00, h01);
				register __m256 h11 = _mm256_hadd_ps(h02, h03);
				register __m256 h12 = _mm256_hadd_ps(h04, h05);
				register __m256 h13 = _mm256_hadd_ps(h06, h07);
				register __m256 h20 = _mm256_hadd_ps(h10, h11);
				register __m256 h21 = _mm256_hadd_ps(h12, h13);
				register __m256 h30 = _mm256_hadd_ps(h20, h21);
				_mm256_storeu_ps(ofbuf->data + k, h30);
				start_index = (start_index + 8) & mask;
				startsimd = start_index >> 3;
				delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
				middle_index = (middle_index + 1) & mask;
				k++;
			}
			for (; k < ibuflen; k++) {
				fir->stater[fir->index] = ifbuf->data[k];
				fir->index = (fir->index + 1) & mask;
				coefs0 = (__m256*)fir->coefs[start_index & 7];
				acc0 = _mm256_setzero_ps();
				j = startsimd;
				for (i = 0; i < lensimd; i++) {
#if defined __FMA__
					acc0 = _mm256_fmadd_ps(coefs0[i], stater[j], acc0);
#elif defined __FMA4__
					acc0 = _mm256_macc_ps(coefs0[i], stater[j], acc0);
#else
					prod0 = _mm256_mul_ps(coefs0[i], stater[j]);
					acc0 = _mm256_add_ps(acc0, prod0);
#endif
					j = (j+1) & masksimd;
				}
				ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3] + acc0[4] + acc0[5] + acc0[6] + acc0[7];
				start_index = (start_index + 1) & mask;
				startsimd = start_index >> 3;
				delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
				middle_index = (middle_index + 1) & mask;
			}
		}
	}
	else {
		/* --- no delayed output requested: same pipeline, k jumps by 8
		 * in the main loop. --- */
		k = 0;
		while ((start_index & 7) && k < ibuflen) {
			fir->stater[fir->index] = ifbuf->data[k];
			fir->index = (fir->index + 1) & mask;
			coefs0 = (__m256*)fir->coefs[start_index & 7];
			acc0 = _mm256_setzero_ps();
			j = startsimd;
			for (i = 0; i < lensimd; i++) {
#if defined __FMA__
				acc0 = _mm256_fmadd_ps(coefs0[i], stater[j], acc0);
#elif defined __FMA4__
				acc0 = _mm256_macc_ps(coefs0[i], stater[j], acc0);
#else
				prod0 = _mm256_mul_ps(coefs0[i], stater[j]);
				acc0 = _mm256_add_ps(acc0, prod0);
#endif
				j = (j+1) & masksimd;
			}
			ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3] + acc0[4] + acc0[5] + acc0[6] + acc0[7];
			start_index = (start_index + 1) & mask;
			startsimd = start_index >> 3;
			k++;
		}
		while (k + 8 <= ibuflen) {
			fir->stater[(fir->index + 0) & mask] = ifbuf->data[k + 0];
			fir->stater[(fir->index + 1) & mask] = ifbuf->data[k + 1];
			fir->stater[(fir->index + 2) & mask] = ifbuf->data[k + 2];
			fir->stater[(fir->index + 3) & mask] = ifbuf->data[k + 3];
			fir->stater[(fir->index + 4) & mask] = ifbuf->data[k + 4];
			fir->stater[(fir->index + 5) & mask] = ifbuf->data[k + 5];
			fir->stater[(fir->index + 6) & mask] = ifbuf->data[k + 6];
			fir->stater[(fir->index + 7) & mask] = ifbuf->data[k + 7];
			fir->index = (fir->index + 8) & mask;
			coefs0 = (__m256*)fir->coefs[(start_index + 0) & 7];
			coefs1 = (__m256*)fir->coefs[(start_index + 1) & 7];
			coefs2 = (__m256*)fir->coefs[(start_index + 2) & 7];
			coefs3 = (__m256*)fir->coefs[(start_index + 3) & 7];
			coefs4 = (__m256*)fir->coefs[(start_index + 4) & 7];
			coefs5 = (__m256*)fir->coefs[(start_index + 5) & 7];
			coefs6 = (__m256*)fir->coefs[(start_index + 6) & 7];
			coefs7 = (__m256*)fir->coefs[(start_index + 7) & 7];
			acc0 = _mm256_setzero_ps();
			acc1 = _mm256_setzero_ps();
			acc2 = _mm256_setzero_ps();
			acc3 = _mm256_setzero_ps();
			acc4 = _mm256_setzero_ps();
			acc5 = _mm256_setzero_ps();
			acc6 = _mm256_setzero_ps();
			acc7 = _mm256_setzero_ps();
			j = startsimd;
			for (i = 0; i < lensimd; i++) {
				statereal = stater[j];
#if defined __FMA__
				acc0 = _mm256_fmadd_ps(coefs0[i], statereal, acc0);
				acc1 = _mm256_fmadd_ps(coefs1[i], statereal, acc1);
				acc2 = _mm256_fmadd_ps(coefs2[i], statereal, acc2);
				acc3 = _mm256_fmadd_ps(coefs3[i], statereal, acc3);
				acc4 = _mm256_fmadd_ps(coefs4[i], statereal, acc4);
				acc5 = _mm256_fmadd_ps(coefs5[i], statereal, acc5);
				acc6 = _mm256_fmadd_ps(coefs6[i], statereal, acc6);
				acc7 = _mm256_fmadd_ps(coefs7[i], statereal, acc7);
#elif defined __FMA4__
				acc0 = _mm256_macc_ps(coefs0[i], statereal, acc0);
				acc1 = _mm256_macc_ps(coefs1[i], statereal, acc1);
				acc2 = _mm256_macc_ps(coefs2[i], statereal, acc2);
				acc3 = _mm256_macc_ps(coefs3[i], statereal, acc3);
				acc4 = _mm256_macc_ps(coefs4[i], statereal, acc4);
				acc5 = _mm256_macc_ps(coefs5[i], statereal, acc5);
				acc6 = _mm256_macc_ps(coefs6[i], statereal, acc6);
				acc7 = _mm256_macc_ps(coefs7[i], statereal, acc7);
#else
				prod0 = _mm256_mul_ps(coefs0[i], statereal);
				acc0 = _mm256_add_ps(acc0, prod0);
				prod1 = _mm256_mul_ps(coefs1[i], statereal);
				acc1 = _mm256_add_ps(acc1, prod1);
				prod2 = _mm256_mul_ps(coefs2[i], statereal);
				acc2 = _mm256_add_ps(acc2, prod2);
				prod3 = _mm256_mul_ps(coefs3[i], statereal);
				acc3 = _mm256_add_ps(acc3, prod3);
				prod4 = _mm256_mul_ps(coefs4[i], statereal);
				acc4 = _mm256_add_ps(acc4, prod4);
				prod5 = _mm256_mul_ps(coefs5[i], statereal);
				acc5 = _mm256_add_ps(acc5, prod5);
				prod6 = _mm256_mul_ps(coefs6[i], statereal);
				acc6 = _mm256_add_ps(acc6, prod6);
				prod7 = _mm256_mul_ps(coefs7[i], statereal);
				acc7 = _mm256_add_ps(acc7, prod7);
#endif
				j = (j+1) & masksimd;
			}
			register __m256 h00 = _mm256_permute2f128_ps(acc0, acc4, 0x20);
			register __m256 h01 = _mm256_permute2f128_ps(acc0, acc4, 0x31);
			register __m256 h02 = _mm256_permute2f128_ps(acc1, acc5, 0x20);
			register __m256 h03 = _mm256_permute2f128_ps(acc1, acc5, 0x31);
			register __m256 h04 = _mm256_permute2f128_ps(acc2, acc6, 0x20);
			register __m256 h05 = _mm256_permute2f128_ps(acc2, acc6, 0x31);
			register __m256 h06 = _mm256_permute2f128_ps(acc3, acc7, 0x20);
			register __m256 h07 = _mm256_permute2f128_ps(acc3, acc7, 0x31);
			register __m256 h10 = _mm256_hadd_ps(h00, h01);
			register __m256 h11 = _mm256_hadd_ps(h02, h03);
			register __m256 h12 = _mm256_hadd_ps(h04, h05);
			register __m256 h13 = _mm256_hadd_ps(h06, h07);
			register __m256 h20 = _mm256_hadd_ps(h10, h11);
			register __m256 h21 = _mm256_hadd_ps(h12, h13);
			register __m256 h30 = _mm256_hadd_ps(h20, h21);
			_mm256_storeu_ps(ofbuf->data + k, h30);
			start_index = (start_index + 8) & mask;
			startsimd = start_index >> 3;
			k += 8;
		}
		for (; k < ibuflen; k++) {
			fir->stater[fir->index] = ifbuf->data[k];
			fir->index = (fir->index + 1) & mask;
			coefs0 = (__m256*)fir->coefs[start_index & 7];
			acc0 = _mm256_setzero_ps();
			j = startsimd;
			for (i = 0; i < lensimd; i++) {
#if defined __FMA__
				acc0 = _mm256_fmadd_ps(coefs0[i], stater[j], acc0);
#elif defined __FMA4__
				acc0 = _mm256_macc_ps(coefs0[i], stater[j], acc0);
#else
				prod0 = _mm256_mul_ps(coefs0[i], stater[j]);
				acc0 = _mm256_add_ps(acc0, prod0);
#endif
				j = (j+1) & masksimd;
			}
			ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3] + acc0[4] + acc0[5] + acc0[6] + acc0[7];
			start_index = (start_index + 1) & mask;
			startsimd = start_index >> 3;
		}
	}

	return ofbuf;
}
764
+
765
+
766
+ pdlc_complex_buffer_t* pdlc_fir_filter_filter_complex_buffer(pdlc_fir_filter_t* fir, const pdlc_complex_buffer_t *icbuf, pdlc_complex_buffer_t *ocbuf, pdlc_complex_buffer_t *delayed)
767
+ {
768
+ const unsigned int nb_coefs = fir->nb_coefs;
769
+ const unsigned int flt_len = fir->state_len;
770
+ const unsigned int mask = fir->index_mask;
771
+ const unsigned int lensimd = fir->coef_len >> 3;
772
+ const unsigned int masksimd = mask >> 3;
773
+ const size_t ibuflen = icbuf->length;
774
+ unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask;
775
+ unsigned int startsimd = start_index >> 3;
776
+ unsigned int middle_index;
777
+ unsigned int i, j;
778
+ size_t k;
779
+ register __m256 acc0r, acc1r, acc2r, acc3r, acc4r, acc5r, acc6r, acc7r;
780
+ register __m256 acc0i, acc1i, acc2i, acc3i, acc4i, acc5i, acc6i, acc7i;
781
+ #if !(defined __FMA__ || defined __FMA4__)
782
+ register __m256 prod0r, prod1r, prod2r, prod3r, prod4r, prod5r, prod6r, prod7r;
783
+ register __m256 prod0i, prod1i, prod2i, prod3i, prod4i, prod5i, prod6i, prod7i;
784
+ #endif
785
+ register __m256 statereal, stateimag;
786
+ const __m256 *coefs0, *coefs1, *coefs2, *coefs3, *coefs4, *coefs5, *coefs6, *coefs7;
787
+ __m256 *stater = (__m256*)fir->stater;
788
+ __m256 *statei = (__m256*)fir->statei;
789
+
790
+ if (!ocbuf)
791
+ ocbuf = pdlc_complex_buffer_new(ibuflen);
792
+ else if (ocbuf->length != ibuflen)
793
+ pdlc_complex_buffer_resize(ocbuf, ibuflen, 0);
794
+
795
+ if (delayed) {
796
+ if (delayed->length != ibuflen)
797
+ pdlc_complex_buffer_resize(delayed, ibuflen, 0);
798
+ middle_index = (start_index + nb_coefs / 2) & mask;
799
+ if (nb_coefs & 1) {
800
+ k = 0;
801
+ while ((start_index & 7) && k < ibuflen) {
802
+ fir->stater[fir->index] = icbuf->data[k].real;
803
+ fir->statei[fir->index] = icbuf->data[k].imag;
804
+ fir->index = (fir->index + 1) & mask;
805
+ coefs0 = (__m256*)fir->coefs[start_index & 7];
806
+ acc0r = _mm256_setzero_ps();
807
+ acc0i = _mm256_setzero_ps();
808
+ j = startsimd;
809
+ for (i = 0; i < lensimd; i++) {
810
+ #if defined __FMA__
811
+ acc0r = _mm256_fmadd_ps(coefs0[i], stater[j], acc0r);
812
+ acc0i = _mm256_fmadd_ps(coefs0[i], statei[j], acc0i);
813
+ #elif defined __FMA4__
814
+ acc0r = _mm256_macc_ps(coefs0[i], stater[j], acc0r);
815
+ acc0i = _mm256_macc_ps(coefs0[i], statei[j], acc0i);
816
+ #else
817
+ prod0r = _mm256_mul_ps(coefs0[i], stater[j]);
818
+ prod0i = _mm256_mul_ps(coefs0[i], statei[j]);
819
+ acc0r = _mm256_add_ps(acc0r, prod0r);
820
+ acc0i = _mm256_add_ps(acc0i, prod0i);
821
+ #endif
822
+ j = (j+1) & masksimd;
823
+ }
824
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3] + acc0r[4] + acc0r[5] + acc0r[6] + acc0r[7];
825
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3] + acc0i[4] + acc0i[5] + acc0i[6] + acc0i[7];
826
+ start_index = (start_index + 1) & mask;
827
+ startsimd = start_index >> 3;
828
+ delayed->data[k].real = fir->stater[middle_index];
829
+ delayed->data[k].imag = fir->statei[middle_index];
830
+ middle_index = (middle_index + 1) & mask;
831
+ k++;
832
+ }
833
+ while (k + 8 <= ibuflen) {
834
+ fir->stater[ fir->index ] = icbuf->data[k + 0].real;
835
+ fir->statei[ fir->index ] = icbuf->data[k + 0].imag;
836
+ fir->stater[(fir->index + 1) & mask] = icbuf->data[k + 1].real;
837
+ fir->statei[(fir->index + 1) & mask] = icbuf->data[k + 1].imag;
838
+ fir->stater[(fir->index + 2) & mask] = icbuf->data[k + 2].real;
839
+ fir->statei[(fir->index + 2) & mask] = icbuf->data[k + 2].imag;
840
+ fir->stater[(fir->index + 3) & mask] = icbuf->data[k + 3].real;
841
+ fir->statei[(fir->index + 3) & mask] = icbuf->data[k + 3].imag;
842
+ fir->stater[(fir->index + 4) & mask] = icbuf->data[k + 4].real;
843
+ fir->statei[(fir->index + 4) & mask] = icbuf->data[k + 4].imag;
844
+ fir->stater[(fir->index + 5) & mask] = icbuf->data[k + 5].real;
845
+ fir->statei[(fir->index + 5) & mask] = icbuf->data[k + 5].imag;
846
+ fir->stater[(fir->index + 6) & mask] = icbuf->data[k + 6].real;
847
+ fir->statei[(fir->index + 6) & mask] = icbuf->data[k + 6].imag;
848
+ fir->stater[(fir->index + 7) & mask] = icbuf->data[k + 7].real;
849
+ fir->statei[(fir->index + 7) & mask] = icbuf->data[k + 7].imag;
850
+ fir->index = (fir->index + 8) & mask;
851
+ coefs0 = (__m256*)fir->coefs[(start_index + 0) & 7];
852
+ coefs1 = (__m256*)fir->coefs[(start_index + 1) & 7];
853
+ coefs2 = (__m256*)fir->coefs[(start_index + 2) & 7];
854
+ coefs3 = (__m256*)fir->coefs[(start_index + 3) & 7];
855
+ coefs4 = (__m256*)fir->coefs[(start_index + 4) & 7];
856
+ coefs5 = (__m256*)fir->coefs[(start_index + 5) & 7];
857
+ coefs6 = (__m256*)fir->coefs[(start_index + 6) & 7];
858
+ coefs7 = (__m256*)fir->coefs[(start_index + 7) & 7];
859
+ acc0r = _mm256_setzero_ps();
860
+ acc0i = _mm256_setzero_ps();
861
+ acc1r = _mm256_setzero_ps();
862
+ acc1i = _mm256_setzero_ps();
863
+ acc2r = _mm256_setzero_ps();
864
+ acc2i = _mm256_setzero_ps();
865
+ acc3r = _mm256_setzero_ps();
866
+ acc3i = _mm256_setzero_ps();
867
+ acc4r = _mm256_setzero_ps();
868
+ acc4i = _mm256_setzero_ps();
869
+ acc5r = _mm256_setzero_ps();
870
+ acc5i = _mm256_setzero_ps();
871
+ acc6r = _mm256_setzero_ps();
872
+ acc6i = _mm256_setzero_ps();
873
+ acc7r = _mm256_setzero_ps();
874
+ acc7i = _mm256_setzero_ps();
875
+ j = startsimd;
876
+ for (i = 0; i < lensimd; i++) {
877
+ statereal = stater[j];
878
+ stateimag = statei[j];
879
+ #if defined __FMA__
880
+ acc0r = _mm256_fmadd_ps(coefs0[i], statereal, acc0r);
881
+ acc1r = _mm256_fmadd_ps(coefs1[i], statereal, acc1r);
882
+ acc2r = _mm256_fmadd_ps(coefs2[i], statereal, acc2r);
883
+ acc3r = _mm256_fmadd_ps(coefs3[i], statereal, acc3r);
884
+ acc4r = _mm256_fmadd_ps(coefs4[i], statereal, acc4r);
885
+ acc5r = _mm256_fmadd_ps(coefs5[i], statereal, acc5r);
886
+ acc6r = _mm256_fmadd_ps(coefs6[i], statereal, acc6r);
887
+ acc7r = _mm256_fmadd_ps(coefs7[i], statereal, acc7r);
888
+ acc0i = _mm256_fmadd_ps(coefs0[i], stateimag, acc0i);
889
+ acc1i = _mm256_fmadd_ps(coefs1[i], stateimag, acc1i);
890
+ acc2i = _mm256_fmadd_ps(coefs2[i], stateimag, acc2i);
891
+ acc3i = _mm256_fmadd_ps(coefs3[i], stateimag, acc3i);
892
+ acc4i = _mm256_fmadd_ps(coefs4[i], stateimag, acc4i);
893
+ acc5i = _mm256_fmadd_ps(coefs5[i], stateimag, acc5i);
894
+ acc6i = _mm256_fmadd_ps(coefs6[i], stateimag, acc6i);
895
+ acc7i = _mm256_fmadd_ps(coefs7[i], stateimag, acc7i);
896
+ #elif defined __FMA4__
897
+ acc0r = _mm256_macc_ps(coefs0[i], statereal, acc0r);
898
+ acc1r = _mm256_macc_ps(coefs1[i], statereal, acc1r);
899
+ acc2r = _mm256_macc_ps(coefs2[i], statereal, acc2r);
900
+ acc3r = _mm256_macc_ps(coefs3[i], statereal, acc3r);
901
+ acc4r = _mm256_macc_ps(coefs4[i], statereal, acc4r);
902
+ acc5r = _mm256_macc_ps(coefs5[i], statereal, acc5r);
903
+ acc6r = _mm256_macc_ps(coefs6[i], statereal, acc6r);
904
+ acc7r = _mm256_macc_ps(coefs7[i], statereal, acc7r);
905
+ acc0i = _mm256_macc_ps(coefs0[i], stateimag, acc0i);
906
+ acc1i = _mm256_macc_ps(coefs1[i], stateimag, acc1i);
907
+ acc2i = _mm256_macc_ps(coefs2[i], stateimag, acc2i);
908
+ acc3i = _mm256_macc_ps(coefs3[i], stateimag, acc3i);
909
+ acc4i = _mm256_macc_ps(coefs4[i], stateimag, acc4i);
910
+ acc5i = _mm256_macc_ps(coefs5[i], stateimag, acc5i);
911
+ acc6i = _mm256_macc_ps(coefs6[i], stateimag, acc6i);
912
+ acc7i = _mm256_macc_ps(coefs7[i], stateimag, acc7i);
913
+ #else
914
+ prod0r = _mm256_mul_ps(coefs0[i], statereal);
915
+ acc0r = _mm256_add_ps(acc0r, prod0r);
916
+ prod1r = _mm256_mul_ps(coefs1[i], statereal);
917
+ acc1r = _mm256_add_ps(acc1r, prod1r);
918
+ prod2r = _mm256_mul_ps(coefs2[i], statereal);
919
+ acc2r = _mm256_add_ps(acc2r, prod2r);
920
+ prod3r = _mm256_mul_ps(coefs3[i], statereal);
921
+ acc3r = _mm256_add_ps(acc3r, prod3r);
922
+ prod4r = _mm256_mul_ps(coefs4[i], statereal);
923
+ acc4r = _mm256_add_ps(acc4r, prod4r);
924
+ prod5r = _mm256_mul_ps(coefs5[i], statereal);
925
+ acc5r = _mm256_add_ps(acc5r, prod5r);
926
+ prod6r = _mm256_mul_ps(coefs6[i], statereal);
927
+ acc6r = _mm256_add_ps(acc6r, prod6r);
928
+ prod7r = _mm256_mul_ps(coefs7[i], statereal);
929
+ acc7r = _mm256_add_ps(acc7r, prod7r);
930
+ prod0i = _mm256_mul_ps(coefs0[i], stateimag);
931
+ acc0i = _mm256_add_ps(acc0i, prod0i);
932
+ prod1i = _mm256_mul_ps(coefs1[i], stateimag);
933
+ acc1i = _mm256_add_ps(acc1i, prod1i);
934
+ prod2i = _mm256_mul_ps(coefs2[i], stateimag);
935
+ acc2i = _mm256_add_ps(acc2i, prod2i);
936
+ prod3i = _mm256_mul_ps(coefs3[i], stateimag);
937
+ acc3i = _mm256_add_ps(acc3i, prod3i);
938
+ prod4i = _mm256_mul_ps(coefs4[i], stateimag);
939
+ acc4i = _mm256_add_ps(acc4i, prod4i);
940
+ prod5i = _mm256_mul_ps(coefs5[i], stateimag);
941
+ acc5i = _mm256_add_ps(acc5i, prod5i);
942
+ prod6i = _mm256_mul_ps(coefs6[i], stateimag);
943
+ acc6i = _mm256_add_ps(acc6i, prod6i);
944
+ prod7i = _mm256_mul_ps(coefs7[i], stateimag);
945
+ acc7i = _mm256_add_ps(acc7i, prod7i);
946
+ #endif
947
+ j = (j+1) & masksimd;
948
+ }
949
+ register __m256 h00r = _mm256_permute2f128_ps(acc0r, acc4r, 0x20);
950
+ register __m256 h01r = _mm256_permute2f128_ps(acc0r, acc4r, 0x31);
951
+ register __m256 h02r = _mm256_permute2f128_ps(acc1r, acc5r, 0x20);
952
+ register __m256 h03r = _mm256_permute2f128_ps(acc1r, acc5r, 0x31);
953
+ register __m256 h04r = _mm256_permute2f128_ps(acc2r, acc6r, 0x20);
954
+ register __m256 h05r = _mm256_permute2f128_ps(acc2r, acc6r, 0x31);
955
+ register __m256 h06r = _mm256_permute2f128_ps(acc3r, acc7r, 0x20);
956
+ register __m256 h07r = _mm256_permute2f128_ps(acc3r, acc7r, 0x31);
957
+ register __m256 h10r = _mm256_hadd_ps(h00r, h01r);
958
+ register __m256 h11r = _mm256_hadd_ps(h02r, h03r);
959
+ register __m256 h12r = _mm256_hadd_ps(h04r, h05r);
960
+ register __m256 h13r = _mm256_hadd_ps(h06r, h07r);
961
+ register __m256 h20r = _mm256_hadd_ps(h10r, h11r);
962
+ register __m256 h21r = _mm256_hadd_ps(h12r, h13r);
963
+ register __m256 h30r = _mm256_hadd_ps(h20r, h21r);
964
+ register __m256 h00i = _mm256_permute2f128_ps(acc0i, acc4i, 0x20);
965
+ register __m256 h01i = _mm256_permute2f128_ps(acc0i, acc4i, 0x31);
966
+ register __m256 h02i = _mm256_permute2f128_ps(acc1i, acc5i, 0x20);
967
+ register __m256 h03i = _mm256_permute2f128_ps(acc1i, acc5i, 0x31);
968
+ register __m256 h04i = _mm256_permute2f128_ps(acc2i, acc6i, 0x20);
969
+ register __m256 h05i = _mm256_permute2f128_ps(acc2i, acc6i, 0x31);
970
+ register __m256 h06i = _mm256_permute2f128_ps(acc3i, acc7i, 0x20);
971
+ register __m256 h07i = _mm256_permute2f128_ps(acc3i, acc7i, 0x31);
972
+ register __m256 h10i = _mm256_hadd_ps(h00i, h01i);
973
+ register __m256 h11i = _mm256_hadd_ps(h02i, h03i);
974
+ register __m256 h12i = _mm256_hadd_ps(h04i, h05i);
975
+ register __m256 h13i = _mm256_hadd_ps(h06i, h07i);
976
+ register __m256 h20i = _mm256_hadd_ps(h10i, h11i);
977
+ register __m256 h21i = _mm256_hadd_ps(h12i, h13i);
978
+ register __m256 h30i = _mm256_hadd_ps(h20i, h21i);
979
+ ocbuf->data[k+0].real = h30r[0];
980
+ ocbuf->data[k+0].imag = h30i[0];
981
+ ocbuf->data[k+1].real = h30r[1];
982
+ ocbuf->data[k+1].imag = h30i[1];
983
+ ocbuf->data[k+2].real = h30r[2];
984
+ ocbuf->data[k+2].imag = h30i[2];
985
+ ocbuf->data[k+3].real = h30r[3];
986
+ ocbuf->data[k+3].imag = h30i[3];
987
+ ocbuf->data[k+4].real = h30r[4];
988
+ ocbuf->data[k+4].imag = h30i[4];
989
+ ocbuf->data[k+5].real = h30r[5];
990
+ ocbuf->data[k+5].imag = h30i[5];
991
+ ocbuf->data[k+6].real = h30r[6];
992
+ ocbuf->data[k+6].imag = h30i[6];
993
+ ocbuf->data[k+7].real = h30r[7];
994
+ ocbuf->data[k+7].imag = h30i[7];
995
+ start_index = (start_index + 8) & mask;
996
+ startsimd = start_index >> 3;
997
+ delayed->data[k].real = fir->stater[middle_index];
998
+ delayed->data[k].imag = fir->statei[middle_index];
999
+ middle_index = (middle_index + 1) & mask;
1000
+ k++;
1001
+ delayed->data[k].real = fir->stater[middle_index];
1002
+ delayed->data[k].imag = fir->statei[middle_index];
1003
+ middle_index = (middle_index + 1) & mask;
1004
+ k++;
1005
+ delayed->data[k].real = fir->stater[middle_index];
1006
+ delayed->data[k].imag = fir->statei[middle_index];
1007
+ middle_index = (middle_index + 1) & mask;
1008
+ k++;
1009
+ delayed->data[k].real = fir->stater[middle_index];
1010
+ delayed->data[k].imag = fir->statei[middle_index];
1011
+ middle_index = (middle_index + 1) & mask;
1012
+ k++;
1013
+ delayed->data[k].real = fir->stater[middle_index];
1014
+ delayed->data[k].imag = fir->statei[middle_index];
1015
+ middle_index = (middle_index + 1) & mask;
1016
+ k++;
1017
+ delayed->data[k].real = fir->stater[middle_index];
1018
+ delayed->data[k].imag = fir->statei[middle_index];
1019
+ middle_index = (middle_index + 1) & mask;
1020
+ k++;
1021
+ delayed->data[k].real = fir->stater[middle_index];
1022
+ delayed->data[k].imag = fir->statei[middle_index];
1023
+ middle_index = (middle_index + 1) & mask;
1024
+ k++;
1025
+ delayed->data[k].real = fir->stater[middle_index];
1026
+ delayed->data[k].imag = fir->statei[middle_index];
1027
+ middle_index = (middle_index + 1) & mask;
1028
+ k++;
1029
+ }
1030
+ for (; k < ibuflen; k++) {
1031
+ fir->stater[fir->index] = icbuf->data[k].real;
1032
+ fir->statei[fir->index] = icbuf->data[k].imag;
1033
+ fir->index = (fir->index + 1) & mask;
1034
+ coefs0 = (__m256*)fir->coefs[start_index & 7];
1035
+ acc0r = _mm256_setzero_ps();
1036
+ acc0i = _mm256_setzero_ps();
1037
+ j = startsimd;
1038
+ for (i = 0; i < lensimd; i++) {
1039
+ #if defined __FMA__
1040
+ acc0r = _mm256_fmadd_ps(coefs0[i], stater[j], acc0r);
1041
+ acc0i = _mm256_fmadd_ps(coefs0[i], statei[j], acc0i);
1042
+ #elif defined __FMA4__
1043
+ acc0r = _mm256_macc_ps(coefs0[i], stater[j], acc0r);
1044
+ acc0i = _mm256_macc_ps(coefs0[i], statei[j], acc0i);
1045
+ #else
1046
+ prod0r = _mm256_mul_ps(coefs0[i], stater[j]);
1047
+ prod0i = _mm256_mul_ps(coefs0[i], statei[j]);
1048
+ acc0r = _mm256_add_ps(acc0r, prod0r);
1049
+ acc0i = _mm256_add_ps(acc0i, prod0i);
1050
+ #endif
1051
+ j = (j+1) & masksimd;
1052
+ }
1053
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3] + acc0r[4] + acc0r[5] + acc0r[6] + acc0r[7];
1054
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3] + acc0i[4] + acc0i[5] + acc0i[6] + acc0i[7];
1055
+ start_index = (start_index + 1) & mask;
1056
+ startsimd = start_index >> 3;
1057
+ delayed->data[k].real = fir->stater[middle_index];
1058
+ delayed->data[k].imag = fir->statei[middle_index];
1059
+ middle_index = (middle_index + 1) & mask;
1060
+ }
1061
+ }
1062
+ else {
1063
+ k = 0;
1064
+ while ((start_index & 7) && k < ibuflen) {
1065
+ fir->stater[fir->index] = icbuf->data[k].real;
1066
+ fir->statei[fir->index] = icbuf->data[k].imag;
1067
+ fir->index = (fir->index + 1) & mask;
1068
+ coefs0 = (__m256*)fir->coefs[start_index & 7];
1069
+ acc0r = _mm256_setzero_ps();
1070
+ acc0i = _mm256_setzero_ps();
1071
+ j = startsimd;
1072
+ for (i = 0; i < lensimd; i++) {
1073
+ #if defined __FMA__
1074
+ acc0r = _mm256_fmadd_ps(coefs0[i], stater[j], acc0r);
1075
+ acc0i = _mm256_fmadd_ps(coefs0[i], statei[j], acc0i);
1076
+ #elif defined __FMA4__
1077
+ acc0r = _mm256_macc_ps(coefs0[i], stater[j], acc0r);
1078
+ acc0i = _mm256_macc_ps(coefs0[i], statei[j], acc0i);
1079
+ #else
1080
+ prod0r = _mm256_mul_ps(coefs0[i], stater[j]);
1081
+ prod0i = _mm256_mul_ps(coefs0[i], statei[j]);
1082
+ acc0r = _mm256_add_ps(acc0r, prod0r);
1083
+ acc0i = _mm256_add_ps(acc0i, prod0i);
1084
+ #endif
1085
+ j = (j+1) & masksimd;
1086
+ }
1087
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3] + acc0r[4] + acc0r[5] + acc0r[6] + acc0r[7];
1088
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3] + acc0i[4] + acc0i[5] + acc0i[6] + acc0i[7];
1089
+ start_index = (start_index + 1) & mask;
1090
+ startsimd = start_index >> 3;
1091
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1092
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
1093
+ middle_index = (middle_index + 1) & mask;
1094
+ k++;
1095
+ }
1096
+ while (k + 4 <= ibuflen) {
1097
+ fir->stater[ fir->index ] = icbuf->data[k + 0].real;
1098
+ fir->statei[ fir->index ] = icbuf->data[k + 0].imag;
1099
+ fir->stater[(fir->index + 1) & mask] = icbuf->data[k + 1].real;
1100
+ fir->statei[(fir->index + 1) & mask] = icbuf->data[k + 1].imag;
1101
+ fir->stater[(fir->index + 2) & mask] = icbuf->data[k + 2].real;
1102
+ fir->statei[(fir->index + 2) & mask] = icbuf->data[k + 2].imag;
1103
+ fir->stater[(fir->index + 3) & mask] = icbuf->data[k + 3].real;
1104
+ fir->statei[(fir->index + 3) & mask] = icbuf->data[k + 3].imag;
1105
+ fir->stater[(fir->index + 4) & mask] = icbuf->data[k + 4].real;
1106
+ fir->statei[(fir->index + 4) & mask] = icbuf->data[k + 4].imag;
1107
+ fir->stater[(fir->index + 5) & mask] = icbuf->data[k + 5].real;
1108
+ fir->statei[(fir->index + 5) & mask] = icbuf->data[k + 5].imag;
1109
+ fir->stater[(fir->index + 6) & mask] = icbuf->data[k + 6].real;
1110
+ fir->statei[(fir->index + 6) & mask] = icbuf->data[k + 6].imag;
1111
+ fir->stater[(fir->index + 7) & mask] = icbuf->data[k + 7].real;
1112
+ fir->statei[(fir->index + 7) & mask] = icbuf->data[k + 7].imag;
1113
+ fir->index = (fir->index + 8) & mask;
1114
+ coefs0 = (__m256*)fir->coefs[(start_index + 0) & 7];
1115
+ coefs1 = (__m256*)fir->coefs[(start_index + 1) & 7];
1116
+ coefs2 = (__m256*)fir->coefs[(start_index + 2) & 7];
1117
+ coefs3 = (__m256*)fir->coefs[(start_index + 3) & 7];
1118
+ coefs4 = (__m256*)fir->coefs[(start_index + 4) & 7];
1119
+ coefs5 = (__m256*)fir->coefs[(start_index + 5) & 7];
1120
+ coefs6 = (__m256*)fir->coefs[(start_index + 6) & 7];
1121
+ coefs7 = (__m256*)fir->coefs[(start_index + 7) & 7];
1122
+ acc0r = _mm256_setzero_ps();
1123
+ acc0i = _mm256_setzero_ps();
1124
+ acc1r = _mm256_setzero_ps();
1125
+ acc1i = _mm256_setzero_ps();
1126
+ acc2r = _mm256_setzero_ps();
1127
+ acc2i = _mm256_setzero_ps();
1128
+ acc3r = _mm256_setzero_ps();
1129
+ acc3i = _mm256_setzero_ps();
1130
+ acc4r = _mm256_setzero_ps();
1131
+ acc4i = _mm256_setzero_ps();
1132
+ acc5r = _mm256_setzero_ps();
1133
+ acc5i = _mm256_setzero_ps();
1134
+ acc6r = _mm256_setzero_ps();
1135
+ acc6i = _mm256_setzero_ps();
1136
+ acc7r = _mm256_setzero_ps();
1137
+ acc7i = _mm256_setzero_ps();
1138
+ j = startsimd;
1139
+ for (i = 0; i < lensimd; i++) {
1140
+ statereal = stater[j];
1141
+ stateimag = statei[j];
1142
+ #if defined __FMA__
1143
+ acc0r = _mm256_fmadd_ps(coefs0[i], statereal, acc0r);
1144
+ acc1r = _mm256_fmadd_ps(coefs1[i], statereal, acc1r);
1145
+ acc2r = _mm256_fmadd_ps(coefs2[i], statereal, acc2r);
1146
+ acc3r = _mm256_fmadd_ps(coefs3[i], statereal, acc3r);
1147
+ acc4r = _mm256_fmadd_ps(coefs4[i], statereal, acc4r);
1148
+ acc5r = _mm256_fmadd_ps(coefs5[i], statereal, acc5r);
1149
+ acc6r = _mm256_fmadd_ps(coefs6[i], statereal, acc6r);
1150
+ acc7r = _mm256_fmadd_ps(coefs7[i], statereal, acc7r);
1151
+ acc0i = _mm256_fmadd_ps(coefs0[i], stateimag, acc0i);
1152
+ acc1i = _mm256_fmadd_ps(coefs1[i], stateimag, acc1i);
1153
+ acc2i = _mm256_fmadd_ps(coefs2[i], stateimag, acc2i);
1154
+ acc3i = _mm256_fmadd_ps(coefs3[i], stateimag, acc3i);
1155
+ acc4i = _mm256_fmadd_ps(coefs4[i], stateimag, acc4i);
1156
+ acc5i = _mm256_fmadd_ps(coefs5[i], stateimag, acc5i);
1157
+ acc6i = _mm256_fmadd_ps(coefs6[i], stateimag, acc6i);
1158
+ acc7i = _mm256_fmadd_ps(coefs7[i], stateimag, acc7i);
1159
+ #elif defined __FMA4__
1160
+ acc0r = _mm256_macc_ps(coefs0[i], statereal, acc0r);
1161
+ acc1r = _mm256_macc_ps(coefs1[i], statereal, acc1r);
1162
+ acc2r = _mm256_macc_ps(coefs2[i], statereal, acc2r);
1163
+ acc3r = _mm256_macc_ps(coefs3[i], statereal, acc3r);
1164
+ acc4r = _mm256_macc_ps(coefs4[i], statereal, acc4r);
1165
+ acc5r = _mm256_macc_ps(coefs5[i], statereal, acc5r);
1166
+ acc6r = _mm256_macc_ps(coefs6[i], statereal, acc6r);
1167
+ acc7r = _mm256_macc_ps(coefs7[i], statereal, acc7r);
1168
+ acc0i = _mm256_macc_ps(coefs0[i], stateimag, acc0i);
1169
+ acc1i = _mm256_macc_ps(coefs1[i], stateimag, acc1i);
1170
+ acc2i = _mm256_macc_ps(coefs2[i], stateimag, acc2i);
1171
+ acc3i = _mm256_macc_ps(coefs3[i], stateimag, acc3i);
1172
+ acc4i = _mm256_macc_ps(coefs4[i], stateimag, acc4i);
1173
+ acc5i = _mm256_macc_ps(coefs5[i], stateimag, acc5i);
1174
+ acc6i = _mm256_macc_ps(coefs6[i], stateimag, acc6i);
1175
+ acc7i = _mm256_macc_ps(coefs7[i], stateimag, acc7i);
1176
+ #else
1177
+ prod0r = _mm256_mul_ps(coefs0[i], statereal);
1178
+ acc0r = _mm256_add_ps(acc0r, prod0r);
1179
+ prod1r = _mm256_mul_ps(coefs1[i], statereal);
1180
+ acc1r = _mm256_add_ps(acc1r, prod1r);
1181
+ prod2r = _mm256_mul_ps(coefs2[i], statereal);
1182
+ acc2r = _mm256_add_ps(acc2r, prod2r);
1183
+ prod3r = _mm256_mul_ps(coefs3[i], statereal);
1184
+ acc3r = _mm256_add_ps(acc3r, prod3r);
1185
+ prod4r = _mm256_mul_ps(coefs4[i], statereal);
1186
+ acc4r = _mm256_add_ps(acc4r, prod4r);
1187
+ prod5r = _mm256_mul_ps(coefs5[i], statereal);
1188
+ acc5r = _mm256_add_ps(acc5r, prod5r);
1189
+ prod6r = _mm256_mul_ps(coefs6[i], statereal);
1190
+ acc6r = _mm256_add_ps(acc6r, prod6r);
1191
+ prod7r = _mm256_mul_ps(coefs7[i], statereal);
1192
+ acc7r = _mm256_add_ps(acc7r, prod7r);
1193
+ prod0i = _mm256_mul_ps(coefs0[i], stateimag);
1194
+ acc0i = _mm256_add_ps(acc0i, prod0i);
1195
+ prod1i = _mm256_mul_ps(coefs1[i], stateimag);
1196
+ acc1i = _mm256_add_ps(acc1i, prod1i);
1197
+ prod2i = _mm256_mul_ps(coefs2[i], stateimag);
1198
+ acc2i = _mm256_add_ps(acc2i, prod2i);
1199
+ prod3i = _mm256_mul_ps(coefs3[i], stateimag);
1200
+ acc3i = _mm256_add_ps(acc3i, prod3i);
1201
+ prod4i = _mm256_mul_ps(coefs4[i], stateimag);
1202
+ acc4i = _mm256_add_ps(acc4i, prod4i);
1203
+ prod5i = _mm256_mul_ps(coefs5[i], stateimag);
1204
+ acc5i = _mm256_add_ps(acc5i, prod5i);
1205
+ prod6i = _mm256_mul_ps(coefs6[i], stateimag);
1206
+ acc6i = _mm256_add_ps(acc6i, prod6i);
1207
+ prod7i = _mm256_mul_ps(coefs7[i], stateimag);
1208
+ acc7i = _mm256_add_ps(acc7i, prod7i);
1209
+ #endif
1210
+ j = (j+1) & masksimd;
1211
+ }
1212
+ register __m256 h00r = _mm256_permute2f128_ps(acc0r, acc4r, 0x20);
1213
+ register __m256 h01r = _mm256_permute2f128_ps(acc0r, acc4r, 0x31);
1214
+ register __m256 h02r = _mm256_permute2f128_ps(acc1r, acc5r, 0x20);
1215
+ register __m256 h03r = _mm256_permute2f128_ps(acc1r, acc5r, 0x31);
1216
+ register __m256 h04r = _mm256_permute2f128_ps(acc2r, acc6r, 0x20);
1217
+ register __m256 h05r = _mm256_permute2f128_ps(acc2r, acc6r, 0x31);
1218
+ register __m256 h06r = _mm256_permute2f128_ps(acc3r, acc7r, 0x20);
1219
+ register __m256 h07r = _mm256_permute2f128_ps(acc3r, acc7r, 0x31);
1220
+ register __m256 h10r = _mm256_hadd_ps(h00r, h01r);
1221
+ register __m256 h11r = _mm256_hadd_ps(h02r, h03r);
1222
+ register __m256 h12r = _mm256_hadd_ps(h04r, h05r);
1223
+ register __m256 h13r = _mm256_hadd_ps(h06r, h07r);
1224
+ register __m256 h20r = _mm256_hadd_ps(h10r, h11r);
1225
+ register __m256 h21r = _mm256_hadd_ps(h12r, h13r);
1226
+ register __m256 h30r = _mm256_hadd_ps(h20r, h21r);
1227
+ register __m256 h00i = _mm256_permute2f128_ps(acc0i, acc4i, 0x20);
1228
+ register __m256 h01i = _mm256_permute2f128_ps(acc0i, acc4i, 0x31);
1229
+ register __m256 h02i = _mm256_permute2f128_ps(acc1i, acc5i, 0x20);
1230
+ register __m256 h03i = _mm256_permute2f128_ps(acc1i, acc5i, 0x31);
1231
+ register __m256 h04i = _mm256_permute2f128_ps(acc2i, acc6i, 0x20);
1232
+ register __m256 h05i = _mm256_permute2f128_ps(acc2i, acc6i, 0x31);
1233
+ register __m256 h06i = _mm256_permute2f128_ps(acc3i, acc7i, 0x20);
1234
+ register __m256 h07i = _mm256_permute2f128_ps(acc3i, acc7i, 0x31);
1235
+ register __m256 h10i = _mm256_hadd_ps(h00i, h01i);
1236
+ register __m256 h11i = _mm256_hadd_ps(h02i, h03i);
1237
+ register __m256 h12i = _mm256_hadd_ps(h04i, h05i);
1238
+ register __m256 h13i = _mm256_hadd_ps(h06i, h07i);
1239
+ register __m256 h20i = _mm256_hadd_ps(h10i, h11i);
1240
+ register __m256 h21i = _mm256_hadd_ps(h12i, h13i);
1241
+ register __m256 h30i = _mm256_hadd_ps(h20i, h21i);
1242
+ ocbuf->data[k+0].real = h30r[0];
1243
+ ocbuf->data[k+0].imag = h30i[0];
1244
+ ocbuf->data[k+1].real = h30r[1];
1245
+ ocbuf->data[k+1].imag = h30i[1];
1246
+ ocbuf->data[k+2].real = h30r[2];
1247
+ ocbuf->data[k+2].imag = h30i[2];
1248
+ ocbuf->data[k+3].real = h30r[3];
1249
+ ocbuf->data[k+3].imag = h30i[3];
1250
+ ocbuf->data[k+4].real = h30r[4];
1251
+ ocbuf->data[k+4].imag = h30i[4];
1252
+ ocbuf->data[k+5].real = h30r[5];
1253
+ ocbuf->data[k+5].imag = h30i[5];
1254
+ ocbuf->data[k+6].real = h30r[6];
1255
+ ocbuf->data[k+6].imag = h30i[6];
1256
+ ocbuf->data[k+7].real = h30r[7];
1257
+ ocbuf->data[k+7].imag = h30i[7];
1258
+ start_index = (start_index + 8) & mask;
1259
+ startsimd = start_index >> 3;
1260
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1261
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
1262
+ middle_index = (middle_index + 1) & mask;
1263
+ k++;
1264
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1265
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
1266
+ middle_index = (middle_index + 1) & mask;
1267
+ k++;
1268
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1269
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
1270
+ middle_index = (middle_index + 1) & mask;
1271
+ k++;
1272
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1273
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
1274
+ middle_index = (middle_index + 1) & mask;
1275
+ k++;
1276
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1277
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
1278
+ middle_index = (middle_index + 1) & mask;
1279
+ k++;
1280
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1281
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
1282
+ middle_index = (middle_index + 1) & mask;
1283
+ k++;
1284
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1285
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
1286
+ middle_index = (middle_index + 1) & mask;
1287
+ k++;
1288
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1289
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
1290
+ middle_index = (middle_index + 1) & mask;
1291
+ k++;
1292
+ }
1293
+ for (; k < ibuflen; k++) {
1294
+ fir->stater[fir->index] = icbuf->data[k].real;
1295
+ fir->statei[fir->index] = icbuf->data[k].imag;
1296
+ fir->index = (fir->index + 1) & mask;
1297
+ coefs0 = (__m256*)fir->coefs[start_index & 7];
1298
+ acc0r = _mm256_setzero_ps();
1299
+ acc0i = _mm256_setzero_ps();
1300
+ j = startsimd;
1301
+ for (i = 0; i < lensimd; i++) {
1302
+ #if defined __FMA__
1303
+ acc0r = _mm256_fmadd_ps(coefs0[i], stater[j], acc0r);
1304
+ acc0i = _mm256_fmadd_ps(coefs0[i], statei[j], acc0i);
1305
+ #elif defined __FMA4__
1306
+ acc0r = _mm256_macc_ps(coefs0[i], stater[j], acc0r);
1307
+ acc0i = _mm256_macc_ps(coefs0[i], statei[j], acc0i);
1308
+ #else
1309
+ prod0r = _mm256_mul_ps(coefs0[i], stater[j]);
1310
+ prod0i = _mm256_mul_ps(coefs0[i], statei[j]);
1311
+ acc0r = _mm256_add_ps(acc0r, prod0r);
1312
+ acc0i = _mm256_add_ps(acc0i, prod0i);
1313
+ #endif
1314
+ j = (j+1) & masksimd;
1315
+ }
1316
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3] + acc0r[4] + acc0r[5] + acc0r[6] + acc0r[7];
1317
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3] + acc0i[4] + acc0i[5] + acc0i[6] + acc0i[7];
1318
+ start_index = (start_index + 1) & mask;
1319
+ startsimd = start_index >> 3;
1320
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1321
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
1322
+ middle_index = (middle_index + 1) & mask;
1323
+ }
1324
+ }
1325
+ }
1326
+ else {
1327
+ k = 0;
1328
+ while ((start_index & 7) && k < ibuflen) {
1329
+ fir->stater[fir->index] = icbuf->data[k].real;
1330
+ fir->statei[fir->index] = icbuf->data[k].imag;
1331
+ fir->index = (fir->index + 1) & mask;
1332
+ coefs0 = (__m256*)fir->coefs[start_index & 7];
1333
+ acc0r = _mm256_setzero_ps();
1334
+ acc0i = _mm256_setzero_ps();
1335
+ j = startsimd;
1336
+ for (i = 0; i < lensimd; i++) {
1337
+ #if defined __FMA__
1338
+ acc0r = _mm256_fmadd_ps(coefs0[i], stater[j], acc0r);
1339
+ acc0i = _mm256_fmadd_ps(coefs0[i], statei[j], acc0i);
1340
+ #elif defined __FMA4__
1341
+ acc0r = _mm256_macc_ps(coefs0[i], stater[j], acc0r);
1342
+ acc0i = _mm256_macc_ps(coefs0[i], statei[j], acc0i);
1343
+ #else
1344
+ prod0r = _mm256_mul_ps(coefs0[i], stater[j]);
1345
+ prod0i = _mm256_mul_ps(coefs0[i], statei[j]);
1346
+ acc0r = _mm256_add_ps(acc0r, prod0r);
1347
+ acc0i = _mm256_add_ps(acc0i, prod0i);
1348
+ #endif
1349
+ j = (j+1) & masksimd;
1350
+ }
1351
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3] + acc0r[4] + acc0r[5] + acc0r[6] + acc0r[7];
1352
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3] + acc0i[4] + acc0i[5] + acc0i[6] + acc0i[7];
1353
+ start_index = (start_index + 1) & mask;
1354
+ startsimd = start_index >> 3;
1355
+ k++;
1356
+ }
1357
+ while (k + 8 <= ibuflen) {
1358
+ fir->stater[ fir->index ] = icbuf->data[k + 0].real;
1359
+ fir->statei[ fir->index ] = icbuf->data[k + 0].imag;
1360
+ fir->stater[(fir->index + 1) & mask] = icbuf->data[k + 1].real;
1361
+ fir->statei[(fir->index + 1) & mask] = icbuf->data[k + 1].imag;
1362
+ fir->stater[(fir->index + 2) & mask] = icbuf->data[k + 2].real;
1363
+ fir->statei[(fir->index + 2) & mask] = icbuf->data[k + 2].imag;
1364
+ fir->stater[(fir->index + 3) & mask] = icbuf->data[k + 3].real;
1365
+ fir->statei[(fir->index + 3) & mask] = icbuf->data[k + 3].imag;
1366
+ fir->stater[(fir->index + 4) & mask] = icbuf->data[k + 4].real;
1367
+ fir->statei[(fir->index + 4) & mask] = icbuf->data[k + 4].imag;
1368
+ fir->stater[(fir->index + 5) & mask] = icbuf->data[k + 5].real;
1369
+ fir->statei[(fir->index + 5) & mask] = icbuf->data[k + 5].imag;
1370
+ fir->stater[(fir->index + 6) & mask] = icbuf->data[k + 6].real;
1371
+ fir->statei[(fir->index + 6) & mask] = icbuf->data[k + 6].imag;
1372
+ fir->stater[(fir->index + 7) & mask] = icbuf->data[k + 7].real;
1373
+ fir->statei[(fir->index + 7) & mask] = icbuf->data[k + 7].imag;
1374
+ fir->index = (fir->index + 8) & mask;
1375
+ coefs0 = (__m256*)fir->coefs[(start_index + 0) & 7];
1376
+ coefs1 = (__m256*)fir->coefs[(start_index + 1) & 7];
1377
+ coefs2 = (__m256*)fir->coefs[(start_index + 2) & 7];
1378
+ coefs3 = (__m256*)fir->coefs[(start_index + 3) & 7];
1379
+ coefs4 = (__m256*)fir->coefs[(start_index + 4) & 7];
1380
+ coefs5 = (__m256*)fir->coefs[(start_index + 5) & 7];
1381
+ coefs6 = (__m256*)fir->coefs[(start_index + 6) & 7];
1382
+ coefs7 = (__m256*)fir->coefs[(start_index + 7) & 7];
1383
+ acc0r = _mm256_setzero_ps();
1384
+ acc0i = _mm256_setzero_ps();
1385
+ acc1r = _mm256_setzero_ps();
1386
+ acc1i = _mm256_setzero_ps();
1387
+ acc2r = _mm256_setzero_ps();
1388
+ acc2i = _mm256_setzero_ps();
1389
+ acc3r = _mm256_setzero_ps();
1390
+ acc3i = _mm256_setzero_ps();
1391
+ acc4r = _mm256_setzero_ps();
1392
+ acc4i = _mm256_setzero_ps();
1393
+ acc5r = _mm256_setzero_ps();
1394
+ acc5i = _mm256_setzero_ps();
1395
+ acc6r = _mm256_setzero_ps();
1396
+ acc6i = _mm256_setzero_ps();
1397
+ acc7r = _mm256_setzero_ps();
1398
+ acc7i = _mm256_setzero_ps();
1399
+ j = startsimd;
1400
+ for (i = 0; i < lensimd; i++) {
1401
+ statereal = stater[j];
1402
+ stateimag = statei[j];
1403
+ #if defined __FMA__
1404
+ acc0r = _mm256_fmadd_ps(coefs0[i], statereal, acc0r);
1405
+ acc1r = _mm256_fmadd_ps(coefs1[i], statereal, acc1r);
1406
+ acc2r = _mm256_fmadd_ps(coefs2[i], statereal, acc2r);
1407
+ acc3r = _mm256_fmadd_ps(coefs3[i], statereal, acc3r);
1408
+ acc4r = _mm256_fmadd_ps(coefs4[i], statereal, acc4r);
1409
+ acc5r = _mm256_fmadd_ps(coefs5[i], statereal, acc5r);
1410
+ acc6r = _mm256_fmadd_ps(coefs6[i], statereal, acc6r);
1411
+ acc7r = _mm256_fmadd_ps(coefs7[i], statereal, acc7r);
1412
+ acc0i = _mm256_fmadd_ps(coefs0[i], stateimag, acc0i);
1413
+ acc1i = _mm256_fmadd_ps(coefs1[i], stateimag, acc1i);
1414
+ acc2i = _mm256_fmadd_ps(coefs2[i], stateimag, acc2i);
1415
+ acc3i = _mm256_fmadd_ps(coefs3[i], stateimag, acc3i);
1416
+ acc4i = _mm256_fmadd_ps(coefs4[i], stateimag, acc4i);
1417
+ acc5i = _mm256_fmadd_ps(coefs5[i], stateimag, acc5i);
1418
+ acc6i = _mm256_fmadd_ps(coefs6[i], stateimag, acc6i);
1419
+ acc7i = _mm256_fmadd_ps(coefs7[i], stateimag, acc7i);
1420
+ #elif defined __FMA4__
1421
+ acc0r = _mm256_macc_ps(coefs0[i], statereal, acc0r);
1422
+ acc1r = _mm256_macc_ps(coefs1[i], statereal, acc1r);
1423
+ acc2r = _mm256_macc_ps(coefs2[i], statereal, acc2r);
1424
+ acc3r = _mm256_macc_ps(coefs3[i], statereal, acc3r);
1425
+ acc4r = _mm256_macc_ps(coefs4[i], statereal, acc4r);
1426
+ acc5r = _mm256_macc_ps(coefs5[i], statereal, acc5r);
1427
+ acc6r = _mm256_macc_ps(coefs6[i], statereal, acc6r);
1428
+ acc7r = _mm256_macc_ps(coefs7[i], statereal, acc7r);
1429
+ acc0i = _mm256_macc_ps(coefs0[i], stateimag, acc0i);
1430
+ acc1i = _mm256_macc_ps(coefs1[i], stateimag, acc1i);
1431
+ acc2i = _mm256_macc_ps(coefs2[i], stateimag, acc2i);
1432
+ acc3i = _mm256_macc_ps(coefs3[i], stateimag, acc3i);
1433
+ acc4i = _mm256_macc_ps(coefs4[i], stateimag, acc4i);
1434
+ acc5i = _mm256_macc_ps(coefs5[i], stateimag, acc5i);
1435
+ acc6i = _mm256_macc_ps(coefs6[i], stateimag, acc6i);
1436
+ acc7i = _mm256_macc_ps(coefs7[i], stateimag, acc7i);
1437
+ #else
1438
+ prod0r = _mm256_mul_ps(coefs0[i], statereal);
1439
+ acc0r = _mm256_add_ps(acc0r, prod0r);
1440
+ prod1r = _mm256_mul_ps(coefs1[i], statereal);
1441
+ acc1r = _mm256_add_ps(acc1r, prod1r);
1442
+ prod2r = _mm256_mul_ps(coefs2[i], statereal);
1443
+ acc2r = _mm256_add_ps(acc2r, prod2r);
1444
+ prod3r = _mm256_mul_ps(coefs3[i], statereal);
1445
+ acc3r = _mm256_add_ps(acc3r, prod3r);
1446
+ prod4r = _mm256_mul_ps(coefs4[i], statereal);
1447
+ acc4r = _mm256_add_ps(acc4r, prod4r);
1448
+ prod5r = _mm256_mul_ps(coefs5[i], statereal);
1449
+ acc5r = _mm256_add_ps(acc5r, prod5r);
1450
+ prod6r = _mm256_mul_ps(coefs6[i], statereal);
1451
+ acc6r = _mm256_add_ps(acc6r, prod6r);
1452
+ prod7r = _mm256_mul_ps(coefs7[i], statereal);
1453
+ acc7r = _mm256_add_ps(acc7r, prod7r);
1454
+ prod0i = _mm256_mul_ps(coefs0[i], stateimag);
1455
+ acc0i = _mm256_add_ps(acc0i, prod0i);
1456
+ prod1i = _mm256_mul_ps(coefs1[i], stateimag);
1457
+ acc1i = _mm256_add_ps(acc1i, prod1i);
1458
+ prod2i = _mm256_mul_ps(coefs2[i], stateimag);
1459
+ acc2i = _mm256_add_ps(acc2i, prod2i);
1460
+ prod3i = _mm256_mul_ps(coefs3[i], stateimag);
1461
+ acc3i = _mm256_add_ps(acc3i, prod3i);
1462
+ prod4i = _mm256_mul_ps(coefs4[i], stateimag);
1463
+ acc4i = _mm256_add_ps(acc4i, prod4i);
1464
+ prod5i = _mm256_mul_ps(coefs5[i], stateimag);
1465
+ acc5i = _mm256_add_ps(acc5i, prod5i);
1466
+ prod6i = _mm256_mul_ps(coefs6[i], stateimag);
1467
+ acc6i = _mm256_add_ps(acc6i, prod6i);
1468
+ prod7i = _mm256_mul_ps(coefs7[i], stateimag);
1469
+ acc7i = _mm256_add_ps(acc7i, prod7i);
1470
+ #endif
1471
+ j = (j+1) & masksimd;
1472
+ }
1473
+ register __m256 h00r = _mm256_permute2f128_ps(acc0r, acc4r, 0x20);
1474
+ register __m256 h01r = _mm256_permute2f128_ps(acc0r, acc4r, 0x31);
1475
+ register __m256 h02r = _mm256_permute2f128_ps(acc1r, acc5r, 0x20);
1476
+ register __m256 h03r = _mm256_permute2f128_ps(acc1r, acc5r, 0x31);
1477
+ register __m256 h04r = _mm256_permute2f128_ps(acc2r, acc6r, 0x20);
1478
+ register __m256 h05r = _mm256_permute2f128_ps(acc2r, acc6r, 0x31);
1479
+ register __m256 h06r = _mm256_permute2f128_ps(acc3r, acc7r, 0x20);
1480
+ register __m256 h07r = _mm256_permute2f128_ps(acc3r, acc7r, 0x31);
1481
+ register __m256 h10r = _mm256_hadd_ps(h00r, h01r);
1482
+ register __m256 h11r = _mm256_hadd_ps(h02r, h03r);
1483
+ register __m256 h12r = _mm256_hadd_ps(h04r, h05r);
1484
+ register __m256 h13r = _mm256_hadd_ps(h06r, h07r);
1485
+ register __m256 h20r = _mm256_hadd_ps(h10r, h11r);
1486
+ register __m256 h21r = _mm256_hadd_ps(h12r, h13r);
1487
+ register __m256 h30r = _mm256_hadd_ps(h20r, h21r);
1488
+ register __m256 h00i = _mm256_permute2f128_ps(acc0i, acc4i, 0x20);
1489
+ register __m256 h01i = _mm256_permute2f128_ps(acc0i, acc4i, 0x31);
1490
+ register __m256 h02i = _mm256_permute2f128_ps(acc1i, acc5i, 0x20);
1491
+ register __m256 h03i = _mm256_permute2f128_ps(acc1i, acc5i, 0x31);
1492
+ register __m256 h04i = _mm256_permute2f128_ps(acc2i, acc6i, 0x20);
1493
+ register __m256 h05i = _mm256_permute2f128_ps(acc2i, acc6i, 0x31);
1494
+ register __m256 h06i = _mm256_permute2f128_ps(acc3i, acc7i, 0x20);
1495
+ register __m256 h07i = _mm256_permute2f128_ps(acc3i, acc7i, 0x31);
1496
+ register __m256 h10i = _mm256_hadd_ps(h00i, h01i);
1497
+ register __m256 h11i = _mm256_hadd_ps(h02i, h03i);
1498
+ register __m256 h12i = _mm256_hadd_ps(h04i, h05i);
1499
+ register __m256 h13i = _mm256_hadd_ps(h06i, h07i);
1500
+ register __m256 h20i = _mm256_hadd_ps(h10i, h11i);
1501
+ register __m256 h21i = _mm256_hadd_ps(h12i, h13i);
1502
+ register __m256 h30i = _mm256_hadd_ps(h20i, h21i);
1503
+ ocbuf->data[k+0].real = h30r[0];
1504
+ ocbuf->data[k+0].imag = h30i[0];
1505
+ ocbuf->data[k+1].real = h30r[1];
1506
+ ocbuf->data[k+1].imag = h30i[1];
1507
+ ocbuf->data[k+2].real = h30r[2];
1508
+ ocbuf->data[k+2].imag = h30i[2];
1509
+ ocbuf->data[k+3].real = h30r[3];
1510
+ ocbuf->data[k+3].imag = h30i[3];
1511
+ ocbuf->data[k+4].real = h30r[4];
1512
+ ocbuf->data[k+4].imag = h30i[4];
1513
+ ocbuf->data[k+5].real = h30r[5];
1514
+ ocbuf->data[k+5].imag = h30i[5];
1515
+ ocbuf->data[k+6].real = h30r[6];
1516
+ ocbuf->data[k+6].imag = h30i[6];
1517
+ ocbuf->data[k+7].real = h30r[7];
1518
+ ocbuf->data[k+7].imag = h30i[7];
1519
+ start_index = (start_index + 8) & mask;
1520
+ startsimd = start_index >> 3;
1521
+ k += 8;
1522
+ }
1523
+ for (; k < ibuflen; k++) {
1524
+ fir->stater[fir->index] = icbuf->data[k].real;
1525
+ fir->statei[fir->index] = icbuf->data[k].imag;
1526
+ fir->index = (fir->index + 1) & mask;
1527
+ coefs0 = (__m256*)fir->coefs[start_index & 7];
1528
+ acc0r = _mm256_setzero_ps();
1529
+ acc0i = _mm256_setzero_ps();
1530
+ j = startsimd;
1531
+ for (i = 0; i < lensimd; i++) {
1532
+ #if defined __FMA__
1533
+ acc0r = _mm256_fmadd_ps(coefs0[i], stater[j], acc0r);
1534
+ acc0i = _mm256_fmadd_ps(coefs0[i], statei[j], acc0i);
1535
+ #elif defined __FMA4__
1536
+ acc0r = _mm256_macc_ps(coefs0[i], stater[j], acc0r);
1537
+ acc0i = _mm256_macc_ps(coefs0[i], statei[j], acc0i);
1538
+ #else
1539
+ prod0r = _mm256_mul_ps(coefs0[i], stater[j]);
1540
+ prod0i = _mm256_mul_ps(coefs0[i], statei[j]);
1541
+ acc0r = _mm256_add_ps(acc0r, prod0r);
1542
+ acc0i = _mm256_add_ps(acc0i, prod0i);
1543
+ #endif
1544
+ j = (j+1) & masksimd;
1545
+ }
1546
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3] + acc0r[4] + acc0r[5] + acc0r[6] + acc0r[7];
1547
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3] + acc0i[4] + acc0i[5] + acc0i[6] + acc0i[7];
1548
+ start_index = (start_index + 1) & mask;
1549
+ startsimd = start_index >> 3;
1550
+ }
1551
+ }
1552
+
1553
+ return ocbuf;
1554
+ }
1555
+
1556
+
1557
+ /* Upsample a real float buffer by fir->max_counter through the FIR filter.
+  * One real input sample is pushed into the circular state every mcounter
+  * outputs, with zeros stuffed in between; each output is one dot product
+  * of the state with one of the 8 coefficient banks (fir->coefs[0..7],
+  * presumably 8 pre-shifted copies so SIMD loads stay aligned — set up in
+  * pdlc_fir_filter_initialize, not visible here), scaled by ffactor.
+  * Allocates/resizes ofbuf as needed and returns it. AVX implementation;
+  * the multiply-accumulate kernel is chosen at compile time (FMA/FMA4/mul+add). */
+ pdlc_buffer_t* pdlc_fir_filter_interpolate_float_buffer(pdlc_fir_filter_t* fir, const pdlc_buffer_t *ifbuf, pdlc_buffer_t *ofbuf)
1558
+ {
1559
+ 	const unsigned int nb_coefs = fir->nb_coefs;
1560
+ 	const unsigned int flt_len = fir->state_len;
1561
+ 	const unsigned int mask = fir->index_mask; /* state length is a power of two; wrap with & */
1562
+ 	const unsigned int lensimd = fir->coef_len >> 3; /* coefficient length in 8-float __m256 words */
1563
+ 	const unsigned int masksimd = mask >> 3; /* wrap mask for __m256-granular state indexing */
1564
+ 	const size_t ibuflen = ifbuf->length;
1565
+ 	const size_t obuflen = ibuflen*fir->max_counter; /* mcounter outputs per input sample */
1566
+ 	const float ffactor = (float)(fir->max_counter); /* gain compensation for zero stuffing */
1567
+ 	const size_t mcounter = fir->max_counter;
1568
+ 	unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask; /* oldest tap in circular state */
1569
+ 	unsigned int startsimd = start_index >> 3;
1570
+ 	unsigned int i, j;
1571
+ 	size_t k = 0, l = 0; /* k: output index, l: input index */
1572
+ 	register __m256 acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
1573
+ #if !(defined __FMA__ || defined __FMA4__)
1574
+ 	register __m256 prod0, prod1, prod2, prod3, prod4, prod5, prod6, prod7;
1575
+ #endif
1576
+ 	register __m256 statereal;
1577
+ 	const __m256 *coefs0, *coefs1, *coefs2, *coefs3, *coefs4, *coefs5, *coefs6, *coefs7;
1578
+ 	__m256 *stater = (__m256*)fir->stater; /* NOTE(review): assumes fir->stater is 32-byte aligned — confirm in allocator */
1579
+ 
1580
+ 
1581
+ 	if (!ofbuf)
1582
+ 		ofbuf = pdlc_buffer_new(obuflen);
1583
+ 	else if (ofbuf->length != obuflen)
1584
+ 		pdlc_buffer_resize(ofbuf, obuflen, 0);
1585
+ 
1586
+ 
1587
+ 	while ((start_index & 7) && k < obuflen) { /* scalar prologue until start_index is 8-aligned */
1588
+ 		if ((k % mcounter) == 0)
1589
+ 			fir->stater[fir->index] = ifbuf->data[l++]; /* real input sample */
1590
+ 		else
1591
+ 			fir->stater[fir->index] = 0.0f; /* zero stuffing between input samples */
1592
+ 		fir->index = (fir->index + 1) & mask;
1593
+ 		coefs0 = (__m256*)fir->coefs[start_index & 7]; /* coefficient bank for this phase */
1594
+ 		acc0 = _mm256_setzero_ps();
1595
+ 		j = startsimd;
1596
+ 		for (i = 0; i < lensimd; i++) {
1597
+ #if defined __FMA__
1598
+ 			acc0 = _mm256_fmadd_ps(coefs0[i], stater[j], acc0);
1599
+ #elif defined __FMA4__
1600
+ 			acc0 = _mm256_macc_ps(coefs0[i], stater[j], acc0);
1601
+ #else
1602
+ 			prod0 = _mm256_mul_ps(coefs0[i], stater[j]);
1603
+ 			acc0 = _mm256_add_ps(acc0, prod0);
1604
+ #endif
1605
+ 			j = (j+1) & masksimd;
1606
+ 		}
1607
+ 		ofbuf->data[k] = (acc0[0] + acc0[1] + acc0[2] + acc0[3] + acc0[4] + acc0[5] + acc0[6] + acc0[7]) * ffactor; /* horizontal sum (GCC/Clang vector indexing) */
1608
+ 		start_index = (start_index + 1) & mask;
1609
+ 		startsimd = start_index >> 3;
1610
+ 		k++;
1611
+ 	}
1612
+ 	while (k + 8 <= obuflen) { /* main loop: 8 output samples per iteration */
1613
+ 		if (((k+0) % mcounter) == 0)
1614
+ 			fir->stater[fir->index] = ifbuf->data[l++];
1615
+ 		else
1616
+ 			fir->stater[fir->index] = 0.0f;
1617
+ 		fir->index = (fir->index + 1) & mask;
1618
+ 		if (((k+1) % mcounter) == 0)
1619
+ 			fir->stater[fir->index] = ifbuf->data[l++];
1620
+ 		else
1621
+ 			fir->stater[fir->index] = 0.0f;
1622
+ 		fir->index = (fir->index + 1) & mask;
1623
+ 		if (((k+2) % mcounter) == 0)
1624
+ 			fir->stater[fir->index] = ifbuf->data[l++];
1625
+ 		else
1626
+ 			fir->stater[fir->index] = 0.0f;
1627
+ 		fir->index = (fir->index + 1) & mask;
1628
+ 		if (((k+3) % mcounter) == 0)
1629
+ 			fir->stater[fir->index] = ifbuf->data[l++];
1630
+ 		else
1631
+ 			fir->stater[fir->index] = 0.0f;
1632
+ 		fir->index = (fir->index + 1) & mask;
1633
+ 		if (((k+4) % mcounter) == 0)
1634
+ 			fir->stater[fir->index] = ifbuf->data[l++];
1635
+ 		else
1636
+ 			fir->stater[fir->index] = 0.0f;
1637
+ 		fir->index = (fir->index + 1) & mask;
1638
+ 		if (((k+5) % mcounter) == 0)
1639
+ 			fir->stater[fir->index] = ifbuf->data[l++];
1640
+ 		else
1641
+ 			fir->stater[fir->index] = 0.0f;
1642
+ 		fir->index = (fir->index + 1) & mask;
1643
+ 		if (((k+6) % mcounter) == 0)
1644
+ 			fir->stater[fir->index] = ifbuf->data[l++];
1645
+ 		else
1646
+ 			fir->stater[fir->index] = 0.0f;
1647
+ 		fir->index = (fir->index + 1) & mask;
1648
+ 		if (((k+7) % mcounter) == 0)
1649
+ 			fir->stater[fir->index] = ifbuf->data[l++];
1650
+ 		else
1651
+ 			fir->stater[fir->index] = 0.0f;
1652
+ 		fir->index = (fir->index + 1) & mask;
1653
+ 		coefs0 = (__m256*)fir->coefs[(start_index + 0) & 7]; /* 8 consecutive phases share one state walk */
1654
+ 		coefs1 = (__m256*)fir->coefs[(start_index + 1) & 7];
1655
+ 		coefs2 = (__m256*)fir->coefs[(start_index + 2) & 7];
1656
+ 		coefs3 = (__m256*)fir->coefs[(start_index + 3) & 7];
1657
+ 		coefs4 = (__m256*)fir->coefs[(start_index + 4) & 7];
1658
+ 		coefs5 = (__m256*)fir->coefs[(start_index + 5) & 7];
1659
+ 		coefs6 = (__m256*)fir->coefs[(start_index + 6) & 7];
1660
+ 		coefs7 = (__m256*)fir->coefs[(start_index + 7) & 7];
1661
+ 		acc0 = _mm256_setzero_ps();
1662
+ 		acc1 = _mm256_setzero_ps();
1663
+ 		acc2 = _mm256_setzero_ps();
1664
+ 		acc3 = _mm256_setzero_ps();
1665
+ 		acc4 = _mm256_setzero_ps();
1666
+ 		acc5 = _mm256_setzero_ps();
1667
+ 		acc6 = _mm256_setzero_ps();
1668
+ 		acc7 = _mm256_setzero_ps();
1669
+ 		j = startsimd;
1670
+ 		for (i = 0; i < lensimd; i++) {
1671
+ 			statereal = stater[j]; /* one state load reused by all 8 accumulators */
1672
+ #if defined __FMA__
1673
+ 			acc0 = _mm256_fmadd_ps(coefs0[i], statereal, acc0);
1674
+ 			acc1 = _mm256_fmadd_ps(coefs1[i], statereal, acc1);
1675
+ 			acc2 = _mm256_fmadd_ps(coefs2[i], statereal, acc2);
1676
+ 			acc3 = _mm256_fmadd_ps(coefs3[i], statereal, acc3);
1677
+ 			acc4 = _mm256_fmadd_ps(coefs4[i], statereal, acc4);
1678
+ 			acc5 = _mm256_fmadd_ps(coefs5[i], statereal, acc5);
1679
+ 			acc6 = _mm256_fmadd_ps(coefs6[i], statereal, acc6);
1680
+ 			acc7 = _mm256_fmadd_ps(coefs7[i], statereal, acc7);
1681
+ #elif defined __FMA4__
1682
+ 			acc0 = _mm256_macc_ps(coefs0[i], statereal, acc0);
1683
+ 			acc1 = _mm256_macc_ps(coefs1[i], statereal, acc1);
1684
+ 			acc2 = _mm256_macc_ps(coefs2[i], statereal, acc2);
1685
+ 			acc3 = _mm256_macc_ps(coefs3[i], statereal, acc3);
1686
+ 			acc4 = _mm256_macc_ps(coefs4[i], statereal, acc4);
1687
+ 			acc5 = _mm256_macc_ps(coefs5[i], statereal, acc5);
1688
+ 			acc6 = _mm256_macc_ps(coefs6[i], statereal, acc6);
1689
+ 			acc7 = _mm256_macc_ps(coefs7[i], statereal, acc7);
1690
+ #else
1691
+ 			prod0 = _mm256_mul_ps(coefs0[i], statereal);
1692
+ 			acc0 = _mm256_add_ps(acc0, prod0);
1693
+ 			prod1 = _mm256_mul_ps(coefs1[i], statereal);
1694
+ 			acc1 = _mm256_add_ps(acc1, prod1);
1695
+ 			prod2 = _mm256_mul_ps(coefs2[i], statereal);
1696
+ 			acc2 = _mm256_add_ps(acc2, prod2);
1697
+ 			prod3 = _mm256_mul_ps(coefs3[i], statereal);
1698
+ 			acc3 = _mm256_add_ps(acc3, prod3);
1699
+ 			prod4 = _mm256_mul_ps(coefs4[i], statereal);
1700
+ 			acc4 = _mm256_add_ps(acc4, prod4);
1701
+ 			prod5 = _mm256_mul_ps(coefs5[i], statereal);
1702
+ 			acc5 = _mm256_add_ps(acc5, prod5);
1703
+ 			prod6 = _mm256_mul_ps(coefs6[i], statereal);
1704
+ 			acc6 = _mm256_add_ps(acc6, prod6);
1705
+ 			prod7 = _mm256_mul_ps(coefs7[i], statereal);
1706
+ 			acc7 = _mm256_add_ps(acc7, prod7);
1707
+ #endif
1708
+ 			j = (j+1) & masksimd;
1709
+ 		}
1710
+ 		register __m256 h00 = _mm256_permute2f128_ps(acc0, acc4, 0x20); /* 128-bit-lane transpose + hadd tree: */
1711
+ 		register __m256 h01 = _mm256_permute2f128_ps(acc0, acc4, 0x31); /* reduces the 8 accumulators to one vector */
1712
+ 		register __m256 h02 = _mm256_permute2f128_ps(acc1, acc5, 0x20); /* holding the 8 horizontal sums */
1713
+ 		register __m256 h03 = _mm256_permute2f128_ps(acc1, acc5, 0x31);
1714
+ 		register __m256 h04 = _mm256_permute2f128_ps(acc2, acc6, 0x20);
1715
+ 		register __m256 h05 = _mm256_permute2f128_ps(acc2, acc6, 0x31);
1716
+ 		register __m256 h06 = _mm256_permute2f128_ps(acc3, acc7, 0x20);
1717
+ 		register __m256 h07 = _mm256_permute2f128_ps(acc3, acc7, 0x31);
1718
+ 		register __m256 h10 = _mm256_hadd_ps(h00, h01);
1719
+ 		register __m256 h11 = _mm256_hadd_ps(h02, h03);
1720
+ 		register __m256 h12 = _mm256_hadd_ps(h04, h05);
1721
+ 		register __m256 h13 = _mm256_hadd_ps(h06, h07);
1722
+ 		register __m256 h20 = _mm256_hadd_ps(h10, h11);
1723
+ 		register __m256 h21 = _mm256_hadd_ps(h12, h13);
1724
+ 		register __m256 h30 = _mm256_hadd_ps(h20, h21);
1725
+ 		h30 = _mm256_mul_ps(h30, _mm256_set1_ps(ffactor)); /* apply interpolation gain to all 8 outputs */
1726
+ 		_mm256_storeu_ps(ofbuf->data + k, h30); /* unaligned store of 8 consecutive outputs */
1727
+ 		start_index = (start_index + 8) & mask;
1728
+ 		startsimd = start_index >> 3;
1729
+ 		k += 8;
1730
+ 	}
1731
+ 	for (; k < obuflen; k++) { /* scalar epilogue: remaining outputs one at a time */
1732
+ 		if ((k % mcounter) == 0)
1733
+ 			fir->stater[fir->index] = ifbuf->data[l++];
1734
+ 		else
1735
+ 			fir->stater[fir->index] = 0.0f;
1736
+ 		fir->index = (fir->index + 1) & mask;
1737
+ 		coefs0 = (__m256*)fir->coefs[start_index & 7];
1738
+ 		acc0 = _mm256_setzero_ps();
1739
+ 		j = startsimd;
1740
+ 		for (i = 0; i < lensimd; i++) {
1741
+ #if defined __FMA__
1742
+ 			acc0 = _mm256_fmadd_ps(coefs0[i], stater[j], acc0);
1743
+ #elif defined __FMA4__
1744
+ 			acc0 = _mm256_macc_ps(coefs0[i], stater[j], acc0);
1745
+ #else
1746
+ 			prod0 = _mm256_mul_ps(coefs0[i], stater[j]);
1747
+ 			acc0 = _mm256_add_ps(acc0, prod0);
1748
+ #endif
1749
+ 			j = (j+1) & masksimd;
1750
+ 		}
1751
+ 		ofbuf->data[k] = (acc0[0] + acc0[1] + acc0[2] + acc0[3] + acc0[4] + acc0[5] + acc0[6] + acc0[7]) * ffactor;
1752
+ 		start_index = (start_index + 1) & mask;
1753
+ 		startsimd = start_index >> 3;
1754
+ 	}
1755
+ 
1756
+ 	return ofbuf;
1757
+ }
1758
+
1759
+
1760
+ /* Complex-sample counterpart of pdlc_fir_filter_interpolate_float_buffer:
+  * upsamples icbuf by fir->max_counter. Real and imaginary parts are kept in
+  * two separate circular state arrays (stater/statei) and filtered with the
+  * same real coefficients, so each coefficient load is shared by an r and an
+  * i accumulator. Zero stuffing between input samples; outputs scaled by
+  * ffactor. Allocates/resizes ocbuf as needed and returns it. */
+ pdlc_complex_buffer_t* pdlc_fir_filter_interpolate_complex_buffer(pdlc_fir_filter_t* fir, const pdlc_complex_buffer_t *icbuf, pdlc_complex_buffer_t *ocbuf)
1761
+ {
1762
+ 	const unsigned int nb_coefs = fir->nb_coefs;
1763
+ 	const unsigned int flt_len = fir->state_len;
1764
+ 	const unsigned int mask = fir->index_mask; /* power-of-two state length; wrap with & */
1765
+ 	const unsigned int lensimd = fir->coef_len >> 3; /* coefficient length in __m256 words */
1766
+ 	const unsigned int masksimd = mask >> 3;
1767
+ 	const size_t ibuflen = icbuf->length;
1768
+ 	const size_t obuflen = ibuflen*fir->max_counter; /* mcounter outputs per input */
1769
+ 	const float ffactor = (float)(fir->max_counter); /* gain compensation for zero stuffing */
1770
+ 	const size_t mcounter = fir->max_counter;
1771
+ 	unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask; /* oldest tap position */
1772
+ 	unsigned int startsimd = start_index >> 3;
1773
+ 	unsigned int i, j;
1774
+ 	size_t k = 0, l = 0; /* k: output index, l: input index */
1775
+ 	register __m256 acc0r, acc1r, acc2r, acc3r, acc4r, acc5r, acc6r, acc7r;
1776
+ 	register __m256 acc0i, acc1i, acc2i, acc3i, acc4i, acc5i, acc6i, acc7i;
1777
+ #if !(defined __FMA__ || defined __FMA4__)
1778
+ 	register __m256 prod0r, prod1r, prod2r, prod3r, prod4r, prod5r, prod6r, prod7r;
1779
+ 	register __m256 prod0i, prod1i, prod2i, prod3i, prod4i, prod5i, prod6i, prod7i;
1780
+ #endif
1781
+ 	register __m256 statereal, stateimag;
1782
+ 	const __m256 *coefs0, *coefs1, *coefs2, *coefs3, *coefs4, *coefs5, *coefs6, *coefs7;
1783
+ 	__m256 *stater = (__m256*)fir->stater; /* NOTE(review): assumes 32-byte alignment of both state arrays */
1784
+ 	__m256 *statei = (__m256*)fir->statei;
1785
+ 
1786
+ 
1787
+ 	if (!ocbuf)
1788
+ 		ocbuf = pdlc_complex_buffer_new(obuflen);
1789
+ 	else if (ocbuf->length != obuflen)
1790
+ 		pdlc_complex_buffer_resize(ocbuf, obuflen, 0);
1791
+ 
1792
+ 
1793
+ 	while ((start_index & 7) && k < obuflen) { /* scalar prologue until start_index is 8-aligned */
1794
+ 		if ((k % mcounter) == 0) {
1795
+ 			fir->stater[fir->index] = icbuf->data[l].real;
1796
+ 			fir->statei[fir->index] = icbuf->data[l].imag;
1797
+ 			l++;
1798
+ 		}
1799
+ 		else {
1800
+ 			fir->stater[fir->index] = 0.0f; /* zero stuffing */
1801
+ 			fir->statei[fir->index] = 0.0f;
1802
+ 		}
1803
+ 		fir->index = (fir->index + 1) & mask;
1804
+ 		coefs0 = (__m256*)fir->coefs[start_index & 7]; /* coefficient bank for this phase */
1805
+ 		acc0r = _mm256_setzero_ps();
1806
+ 		acc0i = _mm256_setzero_ps();
1807
+ 		j = startsimd;
1808
+ 		for (i = 0; i < lensimd; i++) {
1809
+ #if defined __FMA__
1810
+ 			acc0r = _mm256_fmadd_ps(coefs0[i], stater[j], acc0r);
1811
+ 			acc0i = _mm256_fmadd_ps(coefs0[i], statei[j], acc0i);
1812
+ #elif defined __FMA4__
1813
+ 			acc0r = _mm256_macc_ps(coefs0[i], stater[j], acc0r);
1814
+ 			acc0i = _mm256_macc_ps(coefs0[i], statei[j], acc0i);
1815
+ #else
1816
+ 			prod0r = _mm256_mul_ps(coefs0[i], stater[j]);
1817
+ 			prod0i = _mm256_mul_ps(coefs0[i], statei[j]);
1818
+ 			acc0r = _mm256_add_ps(acc0r, prod0r);
1819
+ 			acc0i = _mm256_add_ps(acc0i, prod0i);
1820
+ #endif
1821
+ 			j = (j+1) & masksimd;
1822
+ 		}
1823
+ 		ocbuf->data[k].real = (acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3] + acc0r[4] + acc0r[5] + acc0r[6] + acc0r[7]) * ffactor; /* horizontal sum */
1824
+ 		ocbuf->data[k].imag = (acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3] + acc0i[4] + acc0i[5] + acc0i[6] + acc0i[7]) * ffactor;
1825
+ 		start_index = (start_index + 1) & mask;
1826
+ 		startsimd = start_index >> 3;
1827
+ 		k++;
1828
+ 	}
1829
+ 	while (k + 8 <= obuflen) { /* main loop: 8 complex outputs per iteration */
1830
+ 		if (((k+0) % mcounter) == 0) {
1831
+ 			fir->stater[fir->index] = icbuf->data[l].real;
1832
+ 			fir->statei[fir->index] = icbuf->data[l].imag;
1833
+ 			l++;
1834
+ 		}
1835
+ 		else {
1836
+ 			fir->stater[fir->index] = 0.0f;
1837
+ 			fir->statei[fir->index] = 0.0f;
1838
+ 		}
1839
+ 		fir->index = (fir->index + 1) & mask;
1840
+ 		if (((k+1) % mcounter) == 0) {
1841
+ 			fir->stater[fir->index] = icbuf->data[l].real;
1842
+ 			fir->statei[fir->index] = icbuf->data[l].imag;
1843
+ 			l++;
1844
+ 		}
1845
+ 		else {
1846
+ 			fir->stater[fir->index] = 0.0f;
1847
+ 			fir->statei[fir->index] = 0.0f;
1848
+ 		}
1849
+ 		fir->index = (fir->index + 1) & mask;
1850
+ 		if (((k+2) % mcounter) == 0) {
1851
+ 			fir->stater[fir->index] = icbuf->data[l].real;
1852
+ 			fir->statei[fir->index] = icbuf->data[l].imag;
1853
+ 			l++;
1854
+ 		}
1855
+ 		else {
1856
+ 			fir->stater[fir->index] = 0.0f;
1857
+ 			fir->statei[fir->index] = 0.0f;
1858
+ 		}
1859
+ 		fir->index = (fir->index + 1) & mask;
1860
+ 		if (((k+3) % mcounter) == 0) {
1861
+ 			fir->stater[fir->index] = icbuf->data[l].real;
1862
+ 			fir->statei[fir->index] = icbuf->data[l].imag;
1863
+ 			l++;
1864
+ 		}
1865
+ 		else {
1866
+ 			fir->stater[fir->index] = 0.0f;
1867
+ 			fir->statei[fir->index] = 0.0f;
1868
+ 		}
1869
+ 		fir->index = (fir->index + 1) & mask;
1870
+ 		if (((k+4) % mcounter) == 0) {
1871
+ 			fir->stater[fir->index] = icbuf->data[l].real;
1872
+ 			fir->statei[fir->index] = icbuf->data[l].imag;
1873
+ 			l++;
1874
+ 		}
1875
+ 		else {
1876
+ 			fir->stater[fir->index] = 0.0f;
1877
+ 			fir->statei[fir->index] = 0.0f;
1878
+ 		}
1879
+ 		fir->index = (fir->index + 1) & mask;
1880
+ 		if (((k+5) % mcounter) == 0) {
1881
+ 			fir->stater[fir->index] = icbuf->data[l].real;
1882
+ 			fir->statei[fir->index] = icbuf->data[l].imag;
1883
+ 			l++;
1884
+ 		}
1885
+ 		else {
1886
+ 			fir->stater[fir->index] = 0.0f;
1887
+ 			fir->statei[fir->index] = 0.0f;
1888
+ 		}
1889
+ 		fir->index = (fir->index + 1) & mask;
1890
+ 		if (((k+6) % mcounter) == 0) {
1891
+ 			fir->stater[fir->index] = icbuf->data[l].real;
1892
+ 			fir->statei[fir->index] = icbuf->data[l].imag;
1893
+ 			l++;
1894
+ 		}
1895
+ 		else {
1896
+ 			fir->stater[fir->index] = 0.0f;
1897
+ 			fir->statei[fir->index] = 0.0f;
1898
+ 		}
1899
+ 		fir->index = (fir->index + 1) & mask;
1900
+ 		if (((k+7) % mcounter) == 0) {
1901
+ 			fir->stater[fir->index] = icbuf->data[l].real;
1902
+ 			fir->statei[fir->index] = icbuf->data[l].imag;
1903
+ 			l++;
1904
+ 		}
1905
+ 		else {
1906
+ 			fir->stater[fir->index] = 0.0f;
1907
+ 			fir->statei[fir->index] = 0.0f;
1908
+ 		}
1909
+ 		fir->index = (fir->index + 1) & mask;
1910
+ 		coefs0 = (__m256*)fir->coefs[(start_index + 0) & 7]; /* 8 consecutive phases share one state walk */
1911
+ 		coefs1 = (__m256*)fir->coefs[(start_index + 1) & 7];
1912
+ 		coefs2 = (__m256*)fir->coefs[(start_index + 2) & 7];
1913
+ 		coefs3 = (__m256*)fir->coefs[(start_index + 3) & 7];
1914
+ 		coefs4 = (__m256*)fir->coefs[(start_index + 4) & 7];
1915
+ 		coefs5 = (__m256*)fir->coefs[(start_index + 5) & 7];
1916
+ 		coefs6 = (__m256*)fir->coefs[(start_index + 6) & 7];
1917
+ 		coefs7 = (__m256*)fir->coefs[(start_index + 7) & 7];
1918
+ 		acc0r = _mm256_setzero_ps();
1919
+ 		acc0i = _mm256_setzero_ps();
1920
+ 		acc1r = _mm256_setzero_ps();
1921
+ 		acc1i = _mm256_setzero_ps();
1922
+ 		acc2r = _mm256_setzero_ps();
1923
+ 		acc2i = _mm256_setzero_ps();
1924
+ 		acc3r = _mm256_setzero_ps();
1925
+ 		acc3i = _mm256_setzero_ps();
1926
+ 		acc4r = _mm256_setzero_ps();
1927
+ 		acc4i = _mm256_setzero_ps();
1928
+ 		acc5r = _mm256_setzero_ps();
1929
+ 		acc5i = _mm256_setzero_ps();
1930
+ 		acc6r = _mm256_setzero_ps();
1931
+ 		acc6i = _mm256_setzero_ps();
1932
+ 		acc7r = _mm256_setzero_ps();
1933
+ 		acc7i = _mm256_setzero_ps();
1934
+ 		j = startsimd;
1935
+ 		for (i = 0; i < lensimd; i++) {
1936
+ 			statereal = stater[j]; /* one r and one i state load shared by all 16 accumulators */
1937
+ 			stateimag = statei[j];
1938
+ #if defined __FMA__
1939
+ 			acc0r = _mm256_fmadd_ps(coefs0[i], statereal, acc0r);
1940
+ 			acc1r = _mm256_fmadd_ps(coefs1[i], statereal, acc1r);
1941
+ 			acc2r = _mm256_fmadd_ps(coefs2[i], statereal, acc2r);
1942
+ 			acc3r = _mm256_fmadd_ps(coefs3[i], statereal, acc3r);
1943
+ 			acc4r = _mm256_fmadd_ps(coefs4[i], statereal, acc4r);
1944
+ 			acc5r = _mm256_fmadd_ps(coefs5[i], statereal, acc5r);
1945
+ 			acc6r = _mm256_fmadd_ps(coefs6[i], statereal, acc6r);
1946
+ 			acc7r = _mm256_fmadd_ps(coefs7[i], statereal, acc7r);
1947
+ 			acc0i = _mm256_fmadd_ps(coefs0[i], stateimag, acc0i);
1948
+ 			acc1i = _mm256_fmadd_ps(coefs1[i], stateimag, acc1i);
1949
+ 			acc2i = _mm256_fmadd_ps(coefs2[i], stateimag, acc2i);
1950
+ 			acc3i = _mm256_fmadd_ps(coefs3[i], stateimag, acc3i);
1951
+ 			acc4i = _mm256_fmadd_ps(coefs4[i], stateimag, acc4i);
1952
+ 			acc5i = _mm256_fmadd_ps(coefs5[i], stateimag, acc5i);
1953
+ 			acc6i = _mm256_fmadd_ps(coefs6[i], stateimag, acc6i);
1954
+ 			acc7i = _mm256_fmadd_ps(coefs7[i], stateimag, acc7i);
1955
+ #elif defined __FMA4__
1956
+ 			acc0r = _mm256_macc_ps(coefs0[i], statereal, acc0r);
1957
+ 			acc1r = _mm256_macc_ps(coefs1[i], statereal, acc1r);
1958
+ 			acc2r = _mm256_macc_ps(coefs2[i], statereal, acc2r);
1959
+ 			acc3r = _mm256_macc_ps(coefs3[i], statereal, acc3r);
1960
+ 			acc4r = _mm256_macc_ps(coefs4[i], statereal, acc4r);
1961
+ 			acc5r = _mm256_macc_ps(coefs5[i], statereal, acc5r);
1962
+ 			acc6r = _mm256_macc_ps(coefs6[i], statereal, acc6r);
1963
+ 			acc7r = _mm256_macc_ps(coefs7[i], statereal, acc7r);
1964
+ 			acc0i = _mm256_macc_ps(coefs0[i], stateimag, acc0i);
1965
+ 			acc1i = _mm256_macc_ps(coefs1[i], stateimag, acc1i);
1966
+ 			acc2i = _mm256_macc_ps(coefs2[i], stateimag, acc2i);
1967
+ 			acc3i = _mm256_macc_ps(coefs3[i], stateimag, acc3i);
1968
+ 			acc4i = _mm256_macc_ps(coefs4[i], stateimag, acc4i);
1969
+ 			acc5i = _mm256_macc_ps(coefs5[i], stateimag, acc5i);
1970
+ 			acc6i = _mm256_macc_ps(coefs6[i], stateimag, acc6i);
1971
+ 			acc7i = _mm256_macc_ps(coefs7[i], stateimag, acc7i);
1972
+ #else
1973
+ 			prod0r = _mm256_mul_ps(coefs0[i], statereal);
1974
+ 			acc0r = _mm256_add_ps(acc0r, prod0r);
1975
+ 			prod1r = _mm256_mul_ps(coefs1[i], statereal);
1976
+ 			acc1r = _mm256_add_ps(acc1r, prod1r);
1977
+ 			prod2r = _mm256_mul_ps(coefs2[i], statereal);
1978
+ 			acc2r = _mm256_add_ps(acc2r, prod2r);
1979
+ 			prod3r = _mm256_mul_ps(coefs3[i], statereal);
1980
+ 			acc3r = _mm256_add_ps(acc3r, prod3r);
1981
+ 			prod4r = _mm256_mul_ps(coefs4[i], statereal);
1982
+ 			acc4r = _mm256_add_ps(acc4r, prod4r);
1983
+ 			prod5r = _mm256_mul_ps(coefs5[i], statereal);
1984
+ 			acc5r = _mm256_add_ps(acc5r, prod5r);
1985
+ 			prod6r = _mm256_mul_ps(coefs6[i], statereal);
1986
+ 			acc6r = _mm256_add_ps(acc6r, prod6r);
1987
+ 			prod7r = _mm256_mul_ps(coefs7[i], statereal);
1988
+ 			acc7r = _mm256_add_ps(acc7r, prod7r);
1989
+ 			prod0i = _mm256_mul_ps(coefs0[i], stateimag);
1990
+ 			acc0i = _mm256_add_ps(acc0i, prod0i);
1991
+ 			prod1i = _mm256_mul_ps(coefs1[i], stateimag);
1992
+ 			acc1i = _mm256_add_ps(acc1i, prod1i);
1993
+ 			prod2i = _mm256_mul_ps(coefs2[i], stateimag);
1994
+ 			acc2i = _mm256_add_ps(acc2i, prod2i);
1995
+ 			prod3i = _mm256_mul_ps(coefs3[i], stateimag);
1996
+ 			acc3i = _mm256_add_ps(acc3i, prod3i);
1997
+ 			prod4i = _mm256_mul_ps(coefs4[i], stateimag);
1998
+ 			acc4i = _mm256_add_ps(acc4i, prod4i);
1999
+ 			prod5i = _mm256_mul_ps(coefs5[i], stateimag);
2000
+ 			acc5i = _mm256_add_ps(acc5i, prod5i);
2001
+ 			prod6i = _mm256_mul_ps(coefs6[i], stateimag);
2002
+ 			acc6i = _mm256_add_ps(acc6i, prod6i);
2003
+ 			prod7i = _mm256_mul_ps(coefs7[i], stateimag);
2004
+ 			acc7i = _mm256_add_ps(acc7i, prod7i);
2005
+ #endif
2006
+ 			j = (j+1) & masksimd;
2007
+ 		}
2008
+ 		register __m256 h00r = _mm256_permute2f128_ps(acc0r, acc4r, 0x20); /* lane transpose + hadd tree (real part) */
2009
+ 		register __m256 h01r = _mm256_permute2f128_ps(acc0r, acc4r, 0x31);
2010
+ 		register __m256 h02r = _mm256_permute2f128_ps(acc1r, acc5r, 0x20);
2011
+ 		register __m256 h03r = _mm256_permute2f128_ps(acc1r, acc5r, 0x31);
2012
+ 		register __m256 h04r = _mm256_permute2f128_ps(acc2r, acc6r, 0x20);
2013
+ 		register __m256 h05r = _mm256_permute2f128_ps(acc2r, acc6r, 0x31);
2014
+ 		register __m256 h06r = _mm256_permute2f128_ps(acc3r, acc7r, 0x20);
2015
+ 		register __m256 h07r = _mm256_permute2f128_ps(acc3r, acc7r, 0x31);
2016
+ 		register __m256 h10r = _mm256_hadd_ps(h00r, h01r);
2017
+ 		register __m256 h11r = _mm256_hadd_ps(h02r, h03r);
2018
+ 		register __m256 h12r = _mm256_hadd_ps(h04r, h05r);
2019
+ 		register __m256 h13r = _mm256_hadd_ps(h06r, h07r);
2020
+ 		register __m256 h20r = _mm256_hadd_ps(h10r, h11r);
2021
+ 		register __m256 h21r = _mm256_hadd_ps(h12r, h13r);
2022
+ 		register __m256 h30r = _mm256_hadd_ps(h20r, h21r);
2023
+ 		h30r = _mm256_mul_ps(h30r, _mm256_set1_ps(ffactor)); /* apply interpolation gain */
2024
+ 		register __m256 h00i = _mm256_permute2f128_ps(acc0i, acc4i, 0x20); /* same reduction for the imaginary part */
2025
+ 		register __m256 h01i = _mm256_permute2f128_ps(acc0i, acc4i, 0x31);
2026
+ 		register __m256 h02i = _mm256_permute2f128_ps(acc1i, acc5i, 0x20);
2027
+ 		register __m256 h03i = _mm256_permute2f128_ps(acc1i, acc5i, 0x31);
2028
+ 		register __m256 h04i = _mm256_permute2f128_ps(acc2i, acc6i, 0x20);
2029
+ 		register __m256 h05i = _mm256_permute2f128_ps(acc2i, acc6i, 0x31);
2030
+ 		register __m256 h06i = _mm256_permute2f128_ps(acc3i, acc7i, 0x20);
2031
+ 		register __m256 h07i = _mm256_permute2f128_ps(acc3i, acc7i, 0x31);
2032
+ 		register __m256 h10i = _mm256_hadd_ps(h00i, h01i);
2033
+ 		register __m256 h11i = _mm256_hadd_ps(h02i, h03i);
2034
+ 		register __m256 h12i = _mm256_hadd_ps(h04i, h05i);
2035
+ 		register __m256 h13i = _mm256_hadd_ps(h06i, h07i);
2036
+ 		register __m256 h20i = _mm256_hadd_ps(h10i, h11i);
2037
+ 		register __m256 h21i = _mm256_hadd_ps(h12i, h13i);
2038
+ 		register __m256 h30i = _mm256_hadd_ps(h20i, h21i);
2039
+ 		h30i = _mm256_mul_ps(h30i, _mm256_set1_ps(ffactor));
2040
+ 		ocbuf->data[k+0].real = h30r[0]; /* scatter into interleaved complex output */
2041
+ 		ocbuf->data[k+0].imag = h30i[0];
2042
+ 		ocbuf->data[k+1].real = h30r[1];
2043
+ 		ocbuf->data[k+1].imag = h30i[1];
2044
+ 		ocbuf->data[k+2].real = h30r[2];
2045
+ 		ocbuf->data[k+2].imag = h30i[2];
2046
+ 		ocbuf->data[k+3].real = h30r[3];
2047
+ 		ocbuf->data[k+3].imag = h30i[3];
2048
+ 		ocbuf->data[k+4].real = h30r[4];
2049
+ 		ocbuf->data[k+4].imag = h30i[4];
2050
+ 		ocbuf->data[k+5].real = h30r[5];
2051
+ 		ocbuf->data[k+5].imag = h30i[5];
2052
+ 		ocbuf->data[k+6].real = h30r[6];
2053
+ 		ocbuf->data[k+6].imag = h30i[6];
2054
+ 		ocbuf->data[k+7].real = h30r[7];
2055
+ 		ocbuf->data[k+7].imag = h30i[7];
2056
+ 		start_index = (start_index + 8) & mask;
2057
+ 		startsimd = start_index >> 3;
2058
+ 		k += 8;
2059
+ 	}
2060
+ 	for (; k < obuflen; k++) { /* scalar epilogue */
2061
+ 		if ((k % mcounter) == 0) {
2062
+ 			fir->stater[fir->index] = icbuf->data[l].real;
2063
+ 			fir->statei[fir->index] = icbuf->data[l].imag;
2064
+ 			l++;
2065
+ 		}
2066
+ 		else {
2067
+ 			fir->stater[fir->index] = 0.0f;
2068
+ 			fir->statei[fir->index] = 0.0f;
2069
+ 		}
2070
+ 		fir->index = (fir->index + 1) & mask;
2071
+ 		coefs0 = (__m256*)fir->coefs[start_index & 7];
2072
+ 		acc0r = _mm256_setzero_ps();
2073
+ 		acc0i = _mm256_setzero_ps();
2074
+ 		j = startsimd;
2075
+ 		for (i = 0; i < lensimd; i++) {
2076
+ #if defined __FMA__
2077
+ 			acc0r = _mm256_fmadd_ps(coefs0[i], stater[j], acc0r);
2078
+ 			acc0i = _mm256_fmadd_ps(coefs0[i], statei[j], acc0i);
2079
+ #elif defined __FMA4__
2080
+ 			acc0r = _mm256_macc_ps(coefs0[i], stater[j], acc0r);
2081
+ 			acc0i = _mm256_macc_ps(coefs0[i], statei[j], acc0i);
2082
+ #else
2083
+ 			prod0r = _mm256_mul_ps(coefs0[i], stater[j]);
2084
+ 			prod0i = _mm256_mul_ps(coefs0[i], statei[j]);
2085
+ 			acc0r = _mm256_add_ps(acc0r, prod0r);
2086
+ 			acc0i = _mm256_add_ps(acc0i, prod0i);
2087
+ #endif
2088
+ 			j = (j+1) & masksimd;
2089
+ 		}
2090
+ 		ocbuf->data[k].real = (acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3] + acc0r[4] + acc0r[5] + acc0r[6] + acc0r[7]) * ffactor;
2091
+ 		ocbuf->data[k].imag = (acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3] + acc0i[4] + acc0i[5] + acc0i[6] + acc0i[7]) * ffactor;
2092
+ 		start_index = (start_index + 1) & mask;
2093
+ 		startsimd = start_index >> 3;
2094
+ 	}
2095
+ 
2096
+ 	return ocbuf;
2097
+ }
2098
+
2099
+
2100
+ /* Decimate (downsample by fir->max_counter) a real float buffer through the
+  * FIR filter. Every input sample is pushed into the circular state, but a
+  * filtered output is computed only when fir->counter wraps to 0, so the
+  * output rate is 1/mcounter of the input rate. fir->counter persists across
+  * calls, allowing streamed buffers of arbitrary length. The dot-product
+  * loop is unrolled by two with independent accumulators (acc0/acc1) to
+  * break the FMA dependency chain. Allocates/resizes ofbuf and returns it. */
+ pdlc_buffer_t* pdlc_fir_filter_decimate_float_buffer(pdlc_fir_filter_t* fir, const pdlc_buffer_t *ifbuf, pdlc_buffer_t *ofbuf)
2101
+ {
2102
+ 	const unsigned int nb_coefs = fir->nb_coefs;
2103
+ 	const unsigned int flt_len = fir->state_len;
2104
+ 	const unsigned int mask = fir->index_mask; /* power-of-two state length; wrap with & */
2105
+ 	const unsigned int lensimd = fir->coef_len >> 3; /* coefficient length in __m256 words */
2106
+ 	const unsigned int masksimd = mask >> 3;
2107
+ 	const int mcounter = fir->max_counter;
2108
+ 	const size_t ibuflen = ifbuf->length;
2109
+ 	const size_t obuflen = (size_t)ceil(((double)ibuflen - (double)((mcounter - fir->counter) % mcounter)) / (double)mcounter); /* outputs produced this call, given the phase carried in fir->counter */
2110
+ 	unsigned int start_index = (flt_len + fir->index + fir->counter + 1 - nb_coefs) & mask; /* oldest tap at the next output instant */
2111
+ 	unsigned int startsimd = start_index >> 3;
2112
+ 	unsigned int i0, i1, j0, j1; /* even/odd coefficient and state word indices */
2113
+ 	size_t k, l; /* k: input index, l: output index */
2114
+ 	register __m256 acc0, acc1;
2115
+ #if !(defined __FMA__ || defined __FMA4__)
2116
+ 	register __m256 prod0, prod1;
2117
+ #endif
2118
+ 	const __m256 *coefs;
2119
+ 	__m256 *stater = (__m256*)fir->stater; /* NOTE(review): assumes 32-byte alignment of the state array */
2120
+ 
2121
+ 
2122
+ 	if (!ofbuf)
2123
+ 		ofbuf = pdlc_buffer_new(obuflen);
2124
+ 	else if (ofbuf->length != obuflen)
2125
+ 		pdlc_buffer_resize(ofbuf, obuflen, 0);
2126
+ 
2127
+ 
2128
+ 	for (k = 0, l = 0; k < ibuflen; k++) {
2129
+ 		fir->stater[fir->index] = ifbuf->data[k]; /* every input enters the state */
2130
+ 		fir->index = (fir->index + 1) & mask;
2131
+ 		if (fir->counter == 0) { /* ...but only every mcounter-th sample produces an output */
2132
+ 			coefs = (__m256*)fir->coefs[start_index & 7]; /* coefficient bank for this phase */
2133
+ 			acc0 = _mm256_setzero_ps();
2134
+ 			acc1 = _mm256_setzero_ps();
2135
+ 			j0 = startsimd;
2136
+ 			j1 = (startsimd+1) & masksimd;
2137
+ 			i0 = 0;
2138
+ 			i1 = 1;
2139
+ 			while (i1 < lensimd) { /* two-way unrolled MAC over the circular state */
2140
+ #if defined __FMA__
2141
+ 				acc0 = _mm256_fmadd_ps(coefs[i0], stater[j0], acc0);
2142
+ 				acc1 = _mm256_fmadd_ps(coefs[i1], stater[j1], acc1);
2143
+ #elif defined __FMA4__
2144
+ 				acc0 = _mm256_macc_ps(coefs[i0], stater[j0], acc0);
2145
+ 				acc1 = _mm256_macc_ps(coefs[i1], stater[j1], acc1);
2146
+ #else
2147
+ 				prod0 = _mm256_mul_ps(coefs[i0], stater[j0]);
2148
+ 				acc0 = _mm256_add_ps(acc0, prod0);
2149
+ 				prod1 = _mm256_mul_ps(coefs[i1], stater[j1]);
2150
+ 				acc1 = _mm256_add_ps(acc1, prod1);
2151
+ #endif
2152
+ 				i0 += 2;
2153
+ 				i1 += 2;
2154
+ 				j0 = (j0+2) & masksimd;
2155
+ 				j1 = (j1+2) & masksimd;
2156
+ 			}
2157
+ 			while (i0 < lensimd) { /* odd tail word when lensimd is odd */
2158
+ #if defined __FMA__
2159
+ 				acc0 = _mm256_fmadd_ps(coefs[i0], stater[j0], acc0);
2160
+ #elif defined __FMA4__
2161
+ 				acc0 = _mm256_macc_ps(coefs[i0], stater[j0], acc0);
2162
+ #else
2163
+ 				prod0 = _mm256_mul_ps(coefs[i0], stater[j0]);
2164
+ 				acc0 = _mm256_add_ps(acc0, prod0);
2165
+ #endif
2166
+ 				i0 += 2;
2167
+ 				j0 = (j0+2) & masksimd;
2168
+ 			}
2169
+ 			acc0 = _mm256_add_ps(acc0, acc1); /* merge the two accumulators */
2170
+ 			ofbuf->data[l++] = acc0[0] + acc0[1] + acc0[2] + acc0[3] + acc0[4] + acc0[5] + acc0[6] + acc0[7]; /* horizontal sum */
2171
+ 			start_index = (start_index + mcounter) & mask; /* jump to the next output instant */
2172
+ 			startsimd = start_index >> 3;
2173
+ 		}
2174
+ 		fir->counter = (fir->counter + 1) % mcounter;
2175
+ 	}
2176
+ 
2177
+ 	return ofbuf;
2178
+ }
2179
+
2180
+
2181
/* Complex-valued counterpart of pdlc_fir_filter_decimate_float_buffer:
 * filter the complex input buffer through the (real-coefficient) FIR filter
 * and decimate by fir->max_counter. Real and imaginary parts are kept in
 * separate state buffers (fir->stater / fir->statei) and convolved with the
 * same coefficient vectors.
 *
 * fir   - filter state; fir->index, fir->counter, fir->stater and
 *         fir->statei are updated.
 * icbuf - complex input samples (read only).
 * ocbuf - output buffer, or NULL to allocate a new one; resized to the
 *         expected number of decimated samples if its length differs.
 * Returns ocbuf (or the newly allocated buffer).
 *
 * Implementation notes: same layout tricks as the float variant — 8
 * phase-rotated coefficient copies selected by start_index & 7, a
 * 2x-unrolled __m256 dot product over the circular state (acc*r for real,
 * acc*i for imaginary), and a scalar horizontal reduction at the end.
 * NOTE(review): the (size_t)ceil(...) for obuflen can cast a negative
 * double to size_t when ibuflen is smaller than the samples still to skip
 * — undefined behavior; presumably never hit in practice, verify.
 */
pdlc_complex_buffer_t* pdlc_fir_filter_decimate_complex_buffer(pdlc_fir_filter_t* fir, const pdlc_complex_buffer_t *icbuf, pdlc_complex_buffer_t *ocbuf)
{
	const unsigned int nb_coefs = fir->nb_coefs;
	const unsigned int flt_len = fir->state_len;
	const unsigned int mask = fir->index_mask;          /* state_len is a power of two */
	const unsigned int lensimd = fir->coef_len >> 3;    /* coefficient length in __m256 units */
	const unsigned int masksimd = mask >> 3;            /* circular mask in __m256 units */
	const int mcounter = fir->max_counter;              /* decimation factor */
	const size_t ibuflen = icbuf->length;
	const size_t obuflen = (size_t)ceil(((double)ibuflen - (double)((mcounter - fir->counter) % mcounter)) / (double)mcounter);
	/* Oldest state sample taking part in the next output, modulo the ring. */
	unsigned int start_index = (flt_len + fir->index + fir->counter + 1 - nb_coefs) & mask;
	unsigned int startsimd = start_index >> 3;
	unsigned int i0, j0, i1, j1;
	size_t k, l;
	register __m256 acc0r, acc0i, acc1r, acc1i;
#if !(defined __FMA__ || defined __FMA4__)
	register __m256 prod0r, prod0i, prod1r, prod1i;
#endif
	const __m256 *coefs;
	__m256 *stater = (__m256*)fir->stater;
	__m256 *statei = (__m256*)fir->statei;


	if (!ocbuf)
		ocbuf = pdlc_complex_buffer_new(obuflen);
	else if (ocbuf->length != obuflen)
		pdlc_complex_buffer_resize(ocbuf, obuflen, 0);


	for (k = 0, l = 0; k < ibuflen; k++) {
		/* Push the new complex sample into the split circular buffers. */
		fir->stater[fir->index] = icbuf->data[k].real;
		fir->statei[fir->index] = icbuf->data[k].imag;
		fir->index = (fir->index + 1) & mask;
		if (fir->counter == 0) {
			/* Coefficient copy pre-rotated for this alignment. */
			coefs = (__m256*)fir->coefs[start_index & 7];
			acc0r = _mm256_setzero_ps();
			acc0i = _mm256_setzero_ps();
			acc1r = _mm256_setzero_ps();
			acc1i = _mm256_setzero_ps();
			j0 = startsimd;
			j1 = (startsimd+1) & masksimd;
			i0 = 0;
			i1 = 1;
			/* 2x-unrolled vector dot product, real and imag in parallel. */
			while (i1 < lensimd) {
#if defined __FMA__
				acc0r = _mm256_fmadd_ps(coefs[i0], stater[j0], acc0r);
				acc0i = _mm256_fmadd_ps(coefs[i0], statei[j0], acc0i);
				acc1r = _mm256_fmadd_ps(coefs[i1], stater[j1], acc1r);
				acc1i = _mm256_fmadd_ps(coefs[i1], statei[j1], acc1i);
#elif defined __FMA4__
				acc0r = _mm256_macc_ps(coefs[i0], stater[j0], acc0r);
				acc0i = _mm256_macc_ps(coefs[i0], statei[j0], acc0i);
				acc1r = _mm256_macc_ps(coefs[i1], stater[j1], acc1r);
				acc1i = _mm256_macc_ps(coefs[i1], statei[j1], acc1i);
#else
				prod0r = _mm256_mul_ps(coefs[i0], stater[j0]);
				acc0r = _mm256_add_ps(acc0r, prod0r);
				prod0i = _mm256_mul_ps(coefs[i0], statei[j0]);
				acc0i = _mm256_add_ps(acc0i, prod0i);
				prod1r = _mm256_mul_ps(coefs[i1], stater[j1]);
				acc1r = _mm256_add_ps(acc1r, prod1r);
				prod1i = _mm256_mul_ps(coefs[i1], statei[j1]);
				acc1i = _mm256_add_ps(acc1i, prod1i);
#endif
				i0 += 2;
				i1 += 2;
				j0 = (j0+2) & masksimd;
				j1 = (j1+2) & masksimd;
			}
			/* Leftover vector when lensimd is odd. */
			while (i0 < lensimd) {
#if defined __FMA__
				acc0r = _mm256_fmadd_ps(coefs[i0], stater[j0], acc0r);
				acc0i = _mm256_fmadd_ps(coefs[i0], statei[j0], acc0i);
#elif defined __FMA4__
				acc0r = _mm256_macc_ps(coefs[i0], stater[j0], acc0r);
				acc0i = _mm256_macc_ps(coefs[i0], statei[j0], acc0i);
#else
				prod0r = _mm256_mul_ps(coefs[i0], stater[j0]);
				acc0r = _mm256_add_ps(acc0r, prod0r);
				prod0i = _mm256_mul_ps(coefs[i0], statei[j0]);
				acc0i = _mm256_add_ps(acc0i, prod0i);
#endif
				i0 += 2;
				j0 = (j0+2) & masksimd;
			}
			/* Merge the unrolled accumulators and reduce horizontally. */
			acc0r = _mm256_add_ps(acc0r, acc1r);
			acc0i = _mm256_add_ps(acc0i, acc1i);
			ocbuf->data[l].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3] + acc0r[4] + acc0r[5] + acc0r[6] + acc0r[7];
			ocbuf->data[l].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3] + acc0i[4] + acc0i[5] + acc0i[6] + acc0i[7];
			l++;
			/* Next output starts mcounter samples later in the ring. */
			start_index = (start_index + mcounter) & mask;
			startsimd = start_index >> 3;
		}
		fir->counter = (fir->counter + 1) % mcounter;
	}

	return ocbuf;
}
2279
+
2280
+
2281
/* Produce a complex (analytic-style) signal from a real input: for each
 * input sample, .imag is the FIR-filtered output and .real is the input
 * delayed by the filter's group delay (nb_coefs/2 samples), taken straight
 * from the state buffer so the two components stay time-aligned.
 * NOTE(review): with the appropriate coefficients this looks like a
 * Hilbert-transform pair (analytic signal) generator — confirm against the
 * filter-design code.
 *
 * fir   - filter state; fir->index and fir->stater are updated.
 * ifbuf - real input samples (read only).
 * ocbuf - output buffer, or NULL to allocate one; resized to ifbuf->length
 *         if its length differs.
 * Returns ocbuf (or the newly allocated buffer).
 *
 * Structure: two symmetric branches, odd tap count (exact middle sample
 * exists, .real is read directly) and even tap count (.real is the average
 * of the two samples around the half-sample group delay). Each branch has
 * three phases:
 *   1. a scalar head loop until start_index is 8-aligned,
 *   2. an 8-samples-at-a-time main loop that computes 8 dot products with
 *      8 accumulators and reduces them with a permute2f128/hadd transpose,
 *   3. a scalar tail loop for the remaining < 8 samples.
 */
pdlc_complex_buffer_t* pdlc_fir_filter_transform(pdlc_fir_filter_t* fir, const pdlc_buffer_t *ifbuf, pdlc_complex_buffer_t *ocbuf)
{
	const unsigned int nb_coefs = fir->nb_coefs;
	const unsigned int flt_len = fir->state_len;
	const unsigned int mask = fir->index_mask;          /* state_len is a power of two */
	const unsigned int lensimd = fir->coef_len >> 3;    /* coefficient length in __m256 units */
	const unsigned int masksimd = mask >> 3;            /* circular mask in __m256 units */
	const size_t ibuflen = ifbuf->length;
	/* Oldest state sample taking part in the next output, modulo the ring. */
	unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask;
	unsigned int startsimd = start_index >> 3;
	/* State position of the group-delay-aligned sample for .real. */
	unsigned int middle_index = (start_index + nb_coefs / 2) & mask;
	unsigned int i, j;
	size_t k = 0;
	register __m256 acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
#if !(defined __FMA__ || defined __FMA4__)
	register __m256 prod0, prod1, prod2, prod3, prod4, prod5, prod6, prod7;
#endif
	register __m256 statereal;
	const __m256 *coefs0, *coefs1, *coefs2, *coefs3, *coefs4, *coefs5, *coefs6, *coefs7;
	__m256 *stater = (__m256*)fir->stater;


	if (!ocbuf)
		ocbuf = pdlc_complex_buffer_new(ibuflen);
	else if (ocbuf->length != ibuflen)
		pdlc_complex_buffer_resize(ocbuf, ibuflen, 0);


	if (nb_coefs & 1) {
		/* --- Odd tap count: .real is the exact middle state sample. --- */
		/* Phase 1: scalar samples until start_index is 8-aligned. */
		while ((start_index & 7) && k < ibuflen) {
			fir->stater[fir->index] = ifbuf->data[k];
			fir->index = (fir->index + 1) & mask;
			coefs0 = (__m256*)fir->coefs[start_index & 7];
			acc0 = _mm256_setzero_ps();
			j = startsimd;
			for (i = 0; i < lensimd; i++) {
#if defined __FMA__
				acc0 = _mm256_fmadd_ps(coefs0[i], stater[j], acc0);
#elif defined __FMA4__
				acc0 = _mm256_macc_ps(coefs0[i], stater[j], acc0);
#else
				prod0 = _mm256_mul_ps(coefs0[i], stater[j]);
				acc0 = _mm256_add_ps(acc0, prod0);
#endif
				j = (j+1) & masksimd;
			}
			ocbuf->data[k].imag = acc0[0] + acc0[1] + acc0[2] + acc0[3] + acc0[4] + acc0[5] + acc0[6] + acc0[7];
			start_index = (start_index + 1) & mask;
			startsimd = start_index >> 3;
			ocbuf->data[k].real = fir->stater[middle_index];
			middle_index = (middle_index + 1) & mask;
			k++;
		}
		/* Phase 2: 8 outputs per iteration, one accumulator per phase. */
		while (k + 8 <= ibuflen) {
			fir->stater[(fir->index + 0) & mask] = ifbuf->data[k + 0];
			fir->stater[(fir->index + 1) & mask] = ifbuf->data[k + 1];
			fir->stater[(fir->index + 2) & mask] = ifbuf->data[k + 2];
			fir->stater[(fir->index + 3) & mask] = ifbuf->data[k + 3];
			fir->stater[(fir->index + 4) & mask] = ifbuf->data[k + 4];
			fir->stater[(fir->index + 5) & mask] = ifbuf->data[k + 5];
			fir->stater[(fir->index + 6) & mask] = ifbuf->data[k + 6];
			fir->stater[(fir->index + 7) & mask] = ifbuf->data[k + 7];
			fir->index = (fir->index + 8) & mask;
			/* One pre-rotated coefficient copy per output phase. */
			coefs0 = (__m256*)fir->coefs[(start_index + 0) & 7];
			coefs1 = (__m256*)fir->coefs[(start_index + 1) & 7];
			coefs2 = (__m256*)fir->coefs[(start_index + 2) & 7];
			coefs3 = (__m256*)fir->coefs[(start_index + 3) & 7];
			coefs4 = (__m256*)fir->coefs[(start_index + 4) & 7];
			coefs5 = (__m256*)fir->coefs[(start_index + 5) & 7];
			coefs6 = (__m256*)fir->coefs[(start_index + 6) & 7];
			coefs7 = (__m256*)fir->coefs[(start_index + 7) & 7];
			acc0 = _mm256_setzero_ps();
			acc1 = _mm256_setzero_ps();
			acc2 = _mm256_setzero_ps();
			acc3 = _mm256_setzero_ps();
			acc4 = _mm256_setzero_ps();
			acc5 = _mm256_setzero_ps();
			acc6 = _mm256_setzero_ps();
			acc7 = _mm256_setzero_ps();
			j = startsimd;
			for (i = 0; i < lensimd; i++) {
				/* One state vector feeds all 8 dot products. */
				statereal = stater[j];
#if defined __FMA__
				acc0 = _mm256_fmadd_ps(coefs0[i], statereal, acc0);
				acc1 = _mm256_fmadd_ps(coefs1[i], statereal, acc1);
				acc2 = _mm256_fmadd_ps(coefs2[i], statereal, acc2);
				acc3 = _mm256_fmadd_ps(coefs3[i], statereal, acc3);
				acc4 = _mm256_fmadd_ps(coefs4[i], statereal, acc4);
				acc5 = _mm256_fmadd_ps(coefs5[i], statereal, acc5);
				acc6 = _mm256_fmadd_ps(coefs6[i], statereal, acc6);
				acc7 = _mm256_fmadd_ps(coefs7[i], statereal, acc7);
#elif defined __FMA4__
				acc0 = _mm256_macc_ps(coefs0[i], statereal, acc0);
				acc1 = _mm256_macc_ps(coefs1[i], statereal, acc1);
				acc2 = _mm256_macc_ps(coefs2[i], statereal, acc2);
				acc3 = _mm256_macc_ps(coefs3[i], statereal, acc3);
				acc4 = _mm256_macc_ps(coefs4[i], statereal, acc4);
				acc5 = _mm256_macc_ps(coefs5[i], statereal, acc5);
				acc6 = _mm256_macc_ps(coefs6[i], statereal, acc6);
				acc7 = _mm256_macc_ps(coefs7[i], statereal, acc7);
#else
				prod0 = _mm256_mul_ps(coefs0[i], statereal);
				acc0 = _mm256_add_ps(acc0, prod0);
				prod1 = _mm256_mul_ps(coefs1[i], statereal);
				acc1 = _mm256_add_ps(acc1, prod1);
				prod2 = _mm256_mul_ps(coefs2[i], statereal);
				acc2 = _mm256_add_ps(acc2, prod2);
				prod3 = _mm256_mul_ps(coefs3[i], statereal);
				acc3 = _mm256_add_ps(acc3, prod3);
				prod4 = _mm256_mul_ps(coefs4[i], statereal);
				acc4 = _mm256_add_ps(acc4, prod4);
				prod5 = _mm256_mul_ps(coefs5[i], statereal);
				acc5 = _mm256_add_ps(acc5, prod5);
				prod6 = _mm256_mul_ps(coefs6[i], statereal);
				acc6 = _mm256_add_ps(acc6, prod6);
				prod7 = _mm256_mul_ps(coefs7[i], statereal);
				acc7 = _mm256_add_ps(acc7, prod7);
#endif
				j = (j+1) & masksimd;
			}
			/* 8x8 horizontal reduction: permute 128-bit halves, then three
			 * rounds of hadd leave one full sum per lane in h30. */
			register __m256 h00 = _mm256_permute2f128_ps(acc0, acc4, 0x20);
			register __m256 h01 = _mm256_permute2f128_ps(acc0, acc4, 0x31);
			register __m256 h02 = _mm256_permute2f128_ps(acc1, acc5, 0x20);
			register __m256 h03 = _mm256_permute2f128_ps(acc1, acc5, 0x31);
			register __m256 h04 = _mm256_permute2f128_ps(acc2, acc6, 0x20);
			register __m256 h05 = _mm256_permute2f128_ps(acc2, acc6, 0x31);
			register __m256 h06 = _mm256_permute2f128_ps(acc3, acc7, 0x20);
			register __m256 h07 = _mm256_permute2f128_ps(acc3, acc7, 0x31);
			register __m256 h10 = _mm256_hadd_ps(h00, h01);
			register __m256 h11 = _mm256_hadd_ps(h02, h03);
			register __m256 h12 = _mm256_hadd_ps(h04, h05);
			register __m256 h13 = _mm256_hadd_ps(h06, h07);
			register __m256 h20 = _mm256_hadd_ps(h10, h11);
			register __m256 h21 = _mm256_hadd_ps(h12, h13);
			register __m256 h30 = _mm256_hadd_ps(h20, h21);
			ocbuf->data[k+0].imag = h30[0];
			ocbuf->data[k+1].imag = h30[1];
			ocbuf->data[k+2].imag = h30[2];
			ocbuf->data[k+3].imag = h30[3];
			ocbuf->data[k+4].imag = h30[4];
			ocbuf->data[k+5].imag = h30[5];
			ocbuf->data[k+6].imag = h30[6];
			ocbuf->data[k+7].imag = h30[7];
			start_index = (start_index + 8) & mask;
			startsimd = start_index >> 3;
			/* Delayed input for the 8 real parts just produced. */
			ocbuf->data[k].real = fir->stater[middle_index];
			middle_index = (middle_index + 1) & mask;
			k++;
			ocbuf->data[k].real = fir->stater[middle_index];
			middle_index = (middle_index + 1) & mask;
			k++;
			ocbuf->data[k].real = fir->stater[middle_index];
			middle_index = (middle_index + 1) & mask;
			k++;
			ocbuf->data[k].real = fir->stater[middle_index];
			middle_index = (middle_index + 1) & mask;
			k++;
			ocbuf->data[k].real = fir->stater[middle_index];
			middle_index = (middle_index + 1) & mask;
			k++;
			ocbuf->data[k].real = fir->stater[middle_index];
			middle_index = (middle_index + 1) & mask;
			k++;
			ocbuf->data[k].real = fir->stater[middle_index];
			middle_index = (middle_index + 1) & mask;
			k++;
			ocbuf->data[k].real = fir->stater[middle_index];
			middle_index = (middle_index + 1) & mask;
			k++;
		}
		/* Phase 3: scalar tail for the remaining < 8 samples. */
		for (; k < ibuflen; k++) {
			fir->stater[fir->index] = ifbuf->data[k];
			fir->index = (fir->index + 1) & mask;
			coefs0 = (__m256*)fir->coefs[start_index & 7];
			acc0 = _mm256_setzero_ps();
			j = startsimd;
			for (i = 0; i < lensimd; i++) {
#if defined __FMA__
				acc0 = _mm256_fmadd_ps(coefs0[i], stater[j], acc0);
#elif defined __FMA4__
				acc0 = _mm256_macc_ps(coefs0[i], stater[j], acc0);
#else
				prod0 = _mm256_mul_ps(coefs0[i], stater[j]);
				acc0 = _mm256_add_ps(acc0, prod0);
#endif
				j = (j+1) & masksimd;
			}
			ocbuf->data[k].imag = acc0[0] + acc0[1] + acc0[2] + acc0[3] + acc0[4] + acc0[5] + acc0[6] + acc0[7];
			start_index = (start_index + 1) & mask;
			startsimd = start_index >> 3;
			ocbuf->data[k].real = fir->stater[middle_index];
			middle_index = (middle_index + 1) & mask;
		}
	}
	else {
		/* --- Even tap count: the group delay falls between two samples,
		 * so .real averages the two state samples around it. Otherwise
		 * identical to the odd branch. --- */
		while ((start_index & 7) && k < ibuflen) {
			fir->stater[fir->index] = ifbuf->data[k];
			fir->index = (fir->index + 1) & mask;
			coefs0 = (__m256*)fir->coefs[start_index & 7];
			acc0 = _mm256_setzero_ps();
			j = startsimd;
			for (i = 0; i < lensimd; i++) {
#if defined __FMA__
				acc0 = _mm256_fmadd_ps(coefs0[i], stater[j], acc0);
#elif defined __FMA4__
				acc0 = _mm256_macc_ps(coefs0[i], stater[j], acc0);
#else
				prod0 = _mm256_mul_ps(coefs0[i], stater[j]);
				acc0 = _mm256_add_ps(acc0, prod0);
#endif
				j = (j+1) & masksimd;
			}
			ocbuf->data[k].imag = acc0[0] + acc0[1] + acc0[2] + acc0[3] + acc0[4] + acc0[5] + acc0[6] + acc0[7];
			start_index = (start_index + 1) & mask;
			startsimd = start_index >> 3;
			ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
			middle_index = (middle_index + 1) & mask;
			k++;
		}
		while (k + 8 <= ibuflen) {
			fir->stater[(fir->index + 0) & mask] = ifbuf->data[k + 0];
			fir->stater[(fir->index + 1) & mask] = ifbuf->data[k + 1];
			fir->stater[(fir->index + 2) & mask] = ifbuf->data[k + 2];
			fir->stater[(fir->index + 3) & mask] = ifbuf->data[k + 3];
			fir->stater[(fir->index + 4) & mask] = ifbuf->data[k + 4];
			fir->stater[(fir->index + 5) & mask] = ifbuf->data[k + 5];
			fir->stater[(fir->index + 6) & mask] = ifbuf->data[k + 6];
			fir->stater[(fir->index + 7) & mask] = ifbuf->data[k + 7];
			fir->index = (fir->index + 8) & mask;
			coefs0 = (__m256*)fir->coefs[(start_index + 0) & 7];
			coefs1 = (__m256*)fir->coefs[(start_index + 1) & 7];
			coefs2 = (__m256*)fir->coefs[(start_index + 2) & 7];
			coefs3 = (__m256*)fir->coefs[(start_index + 3) & 7];
			coefs4 = (__m256*)fir->coefs[(start_index + 4) & 7];
			coefs5 = (__m256*)fir->coefs[(start_index + 5) & 7];
			coefs6 = (__m256*)fir->coefs[(start_index + 6) & 7];
			coefs7 = (__m256*)fir->coefs[(start_index + 7) & 7];
			acc0 = _mm256_setzero_ps();
			acc1 = _mm256_setzero_ps();
			acc2 = _mm256_setzero_ps();
			acc3 = _mm256_setzero_ps();
			acc4 = _mm256_setzero_ps();
			acc5 = _mm256_setzero_ps();
			acc6 = _mm256_setzero_ps();
			acc7 = _mm256_setzero_ps();
			j = startsimd;
			for (i = 0; i < lensimd; i++) {
				statereal = stater[j];
#if defined __FMA__
				acc0 = _mm256_fmadd_ps(coefs0[i], statereal, acc0);
				acc1 = _mm256_fmadd_ps(coefs1[i], statereal, acc1);
				acc2 = _mm256_fmadd_ps(coefs2[i], statereal, acc2);
				acc3 = _mm256_fmadd_ps(coefs3[i], statereal, acc3);
				acc4 = _mm256_fmadd_ps(coefs4[i], statereal, acc4);
				acc5 = _mm256_fmadd_ps(coefs5[i], statereal, acc5);
				acc6 = _mm256_fmadd_ps(coefs6[i], statereal, acc6);
				acc7 = _mm256_fmadd_ps(coefs7[i], statereal, acc7);
#elif defined __FMA4__
				acc0 = _mm256_macc_ps(coefs0[i], statereal, acc0);
				acc1 = _mm256_macc_ps(coefs1[i], statereal, acc1);
				acc2 = _mm256_macc_ps(coefs2[i], statereal, acc2);
				acc3 = _mm256_macc_ps(coefs3[i], statereal, acc3);
				acc4 = _mm256_macc_ps(coefs4[i], statereal, acc4);
				acc5 = _mm256_macc_ps(coefs5[i], statereal, acc5);
				acc6 = _mm256_macc_ps(coefs6[i], statereal, acc6);
				acc7 = _mm256_macc_ps(coefs7[i], statereal, acc7);
#else
				prod0 = _mm256_mul_ps(coefs0[i], statereal);
				acc0 = _mm256_add_ps(acc0, prod0);
				prod1 = _mm256_mul_ps(coefs1[i], statereal);
				acc1 = _mm256_add_ps(acc1, prod1);
				prod2 = _mm256_mul_ps(coefs2[i], statereal);
				acc2 = _mm256_add_ps(acc2, prod2);
				prod3 = _mm256_mul_ps(coefs3[i], statereal);
				acc3 = _mm256_add_ps(acc3, prod3);
				prod4 = _mm256_mul_ps(coefs4[i], statereal);
				acc4 = _mm256_add_ps(acc4, prod4);
				prod5 = _mm256_mul_ps(coefs5[i], statereal);
				acc5 = _mm256_add_ps(acc5, prod5);
				prod6 = _mm256_mul_ps(coefs6[i], statereal);
				acc6 = _mm256_add_ps(acc6, prod6);
				prod7 = _mm256_mul_ps(coefs7[i], statereal);
				acc7 = _mm256_add_ps(acc7, prod7);
#endif
				j = (j+1) & masksimd;
			}
			/* 8x8 horizontal reduction (see odd branch). */
			register __m256 h00 = _mm256_permute2f128_ps(acc0, acc4, 0x20);
			register __m256 h01 = _mm256_permute2f128_ps(acc0, acc4, 0x31);
			register __m256 h02 = _mm256_permute2f128_ps(acc1, acc5, 0x20);
			register __m256 h03 = _mm256_permute2f128_ps(acc1, acc5, 0x31);
			register __m256 h04 = _mm256_permute2f128_ps(acc2, acc6, 0x20);
			register __m256 h05 = _mm256_permute2f128_ps(acc2, acc6, 0x31);
			register __m256 h06 = _mm256_permute2f128_ps(acc3, acc7, 0x20);
			register __m256 h07 = _mm256_permute2f128_ps(acc3, acc7, 0x31);
			register __m256 h10 = _mm256_hadd_ps(h00, h01);
			register __m256 h11 = _mm256_hadd_ps(h02, h03);
			register __m256 h12 = _mm256_hadd_ps(h04, h05);
			register __m256 h13 = _mm256_hadd_ps(h06, h07);
			register __m256 h20 = _mm256_hadd_ps(h10, h11);
			register __m256 h21 = _mm256_hadd_ps(h12, h13);
			register __m256 h30 = _mm256_hadd_ps(h20, h21);
			ocbuf->data[k+0].imag = h30[0];
			ocbuf->data[k+1].imag = h30[1];
			ocbuf->data[k+2].imag = h30[2];
			ocbuf->data[k+3].imag = h30[3];
			ocbuf->data[k+4].imag = h30[4];
			ocbuf->data[k+5].imag = h30[5];
			ocbuf->data[k+6].imag = h30[6];
			ocbuf->data[k+7].imag = h30[7];
			start_index = (start_index + 8) & mask;
			startsimd = start_index >> 3;
			ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
			middle_index = (middle_index + 1) & mask;
			k++;
			ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
			middle_index = (middle_index + 1) & mask;
			k++;
			ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
			middle_index = (middle_index + 1) & mask;
			k++;
			ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
			middle_index = (middle_index + 1) & mask;
			k++;
			ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
			middle_index = (middle_index + 1) & mask;
			k++;
			ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
			middle_index = (middle_index + 1) & mask;
			k++;
			ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
			middle_index = (middle_index + 1) & mask;
			k++;
			ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
			middle_index = (middle_index + 1) & mask;
			k++;
		}
		for (; k < ibuflen; k++) {
			fir->stater[fir->index] = ifbuf->data[k];
			fir->index = (fir->index + 1) & mask;
			coefs0 = (__m256*)fir->coefs[start_index & 7];
			acc0 = _mm256_setzero_ps();
			j = startsimd;
			for (i = 0; i < lensimd; i++) {
#if defined __FMA__
				acc0 = _mm256_fmadd_ps(coefs0[i], stater[j], acc0);
#elif defined __FMA4__
				acc0 = _mm256_macc_ps(coefs0[i], stater[j], acc0);
#else
				prod0 = _mm256_mul_ps(coefs0[i], stater[j]);
				acc0 = _mm256_add_ps(acc0, prod0);
#endif
				j = (j+1) & masksimd;
			}
			ocbuf->data[k].imag = acc0[0] + acc0[1] + acc0[2] + acc0[3] + acc0[4] + acc0[5] + acc0[6] + acc0[7];
			start_index = (start_index + 1) & mask;
			startsimd = start_index >> 3;
			ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
			middle_index = (middle_index + 1) & mask;
		}
	}

	return ocbuf;
}
2644
+
2645
+