paddlec 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,1767 @@
1
+ /* Copyright (C) 2019 Théotime Bollengier <theotime.bollengier@gmail.com>
2
+ *
3
+ * This file is part of PaddleC
4
+ *
5
+ * PaddleC is free software: you can redistribute it and/or modify
6
+ * it under the terms of the GNU General Public License as published by
7
+ * the Free Software Foundation, either version 3 of the License, or
8
+ * (at your option) any later version.
9
+ *
10
+ * PaddleC is distributed in the hope that it will be useful,
11
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ * GNU General Public License for more details.
14
+ *
15
+ * You should have received a copy of the GNU General Public License
16
+ * along with PaddleC. If not, see <https://www.gnu.org/licenses/>.
17
+ */
18
+
19
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <arm_neon.h>
/* NOTE(review): this file uses pdlc_* types and posix_memalign but no
 * project header is visible here — verify the build provides them. */
20
+
21
+
22
+ void pdlc_fir_filter_inspect(pdlc_fir_filter_t* fir)
23
+ {
24
+ size_t i, j;
25
+
26
+ printf("nb_coefs: %u, state_len: %u, coef_len: %u, index_mask: 0x%x, index: %u\n",
27
+ fir->nb_coefs, fir->state_len, fir->coef_len, fir->index_mask, fir->index);
28
+ printf("state: [%.7g", fir->stater[0]);
29
+ for (i = 1; i < fir->state_len; i++)
30
+ printf(", %.7g", fir->stater[i]);
31
+ printf("]\n");
32
+ for (j = 0; j < 4; j++) {
33
+ printf("coefs: {%u}[%.7g", j, fir->coefs[j][0]);
34
+ for (i = 1; i < fir->coef_len; i++)
35
+ printf(", %.7g", fir->coefs[j][i]);
36
+ printf("]\n");
37
+ }
38
+ }
39
+
40
+
41
/* (Re)initialize 'fir' for a FIR filter of the given order (order+1 taps).
 *
 * Any previously allocated buffers are released first.  A negative order
 * simply leaves the filter empty (all lengths zeroed).  On success the
 * function allocates:
 *  - four SIMD-aligned coefficient arrays (one per start-index phase),
 *  - two SIMD-aligned state buffers (real and imaginary parts),
 * all zero-filled.  The process is aborted on allocation failure.
 *
 * @param fir    filter object to (re)initialize (must be non-NULL)
 * @param order  filter order; at most 67108863, negative means "just free"
 */
void pdlc_fir_filter_initialize(pdlc_fir_filter_t* fir, int order)
{
	int i, r;

	if (fir->coefs) {
		for (i = 0; i < 4; i++)
			free(fir->coefs[i]); /* free(NULL) is a no-op */
		free(fir->coefs);
		fir->coefs = NULL;
	}

	free(fir->stater);
	fir->stater = NULL;

	free(fir->statei);
	fir->statei = NULL;

	fir->nb_coefs = 0;
	fir->state_len = 0;
	fir->coef_len = 0;
	fir->index = 0;
	fir->index_mask = 0;
	fir->counter = 0;
	fir->max_counter = 1;

	if (order < 0)
		return;

	if (order > 67108863) {
		/* Message matched to the check: order 67108864 is rejected. */
		fprintf(stderr, "ERROR: libpaddlec: Filter order cannot be greater than 67108863\n");
		exit(EXIT_FAILURE);
	}

	fir->nb_coefs = (unsigned int)(order + 1);
	/* Coefficient arrays are rounded up to whole float32x4_t vectors,
	 * with extra padding for the four phase-shifted copies. */
	fir->coef_len = ((fir->nb_coefs + 3 + 3) >> 2) << 2;
	/* State buffer length is the next power of two so that wrap-around
	 * is a simple AND with index_mask. */
	fir->state_len = (unsigned int)(pow(2.0, ceil(log2(fir->coef_len))));
	fir->index = 0;
	fir->index_mask = fir->state_len - 1;

	fir->coefs = malloc(4*sizeof(float*));
	if (fir->coefs == NULL) {
		/* %zu throughout: the sizes are size_t, "%u" was UB on LP64. */
		fprintf(stderr, "ERROR: libpaddlec: Cannot allocate %zu bytes for FIR!\n", 4 * sizeof(float*));
		exit(EXIT_FAILURE);
	}

	for (i = 0; i < 4; i++) {
		r = posix_memalign((void**)&fir->coefs[i], sizeof(float32x4_t), fir->coef_len * sizeof(float));
		if (r) {
			fprintf(stderr, "ERROR: libpaddlec: Cannot allocate %zu bytes for FIR!\n", fir->coef_len * sizeof(float));
			exit(EXIT_FAILURE);
		}
	}

	r = posix_memalign((void**)&fir->stater, sizeof(float32x4_t), fir->state_len * sizeof(float));
	if (r) {
		fprintf(stderr, "ERROR: libpaddlec: Cannot allocate %zu bytes for FIR!\n", fir->state_len * sizeof(float));
		exit(EXIT_FAILURE);
	}

	r = posix_memalign((void**)&fir->statei, sizeof(float32x4_t), fir->state_len * sizeof(float));
	if (r) {
		fprintf(stderr, "ERROR: libpaddlec: Cannot allocate %zu bytes for FIR!\n", fir->state_len * sizeof(float));
		exit(EXIT_FAILURE);
	}

	memset(fir->stater, 0, fir->state_len * sizeof(float));
	memset(fir->statei, 0, fir->state_len * sizeof(float));
	for (i = 0; i < 4; i++)
		memset(fir->coefs[i], 0, fir->coef_len * sizeof(float));
}
114
+
115
+
116
+ void pdlc_fir_filter_free(pdlc_fir_filter_t* fir)
117
+ {
118
+ int i;
119
+
120
+ if (!fir)
121
+ return;
122
+
123
+ if (fir->coefs) {
124
+ for (i = 0; i < 4; i++)
125
+ if (fir->coefs[i])
126
+ free(fir->coefs[i]);
127
+ free(fir->coefs);
128
+ }
129
+
130
+ if (fir->stater)
131
+ free(fir->stater);
132
+
133
+ if (fir->statei)
134
+ free(fir->statei);
135
+
136
+ free(fir);
137
+ }
138
+
139
+
140
+ size_t pdlc_fir_filter_size(pdlc_fir_filter_t* fir)
141
+ {
142
+ size_t res;
143
+
144
+ res = sizeof(pdlc_fir_filter_t);
145
+ res += sizeof(float*)* 4;
146
+ res += sizeof(float) * fir->state_len * 2;
147
+ res += sizeof(float) * fir->coef_len * 4;
148
+
149
+ return res;
150
+ }
151
+
152
+
153
+ int pdlc_fir_filter_set_coef_at(pdlc_fir_filter_t* fir, int index, float value)
154
+ {
155
+ int i;
156
+
157
+ if (index < 0 || index >= (int)fir->nb_coefs)
158
+ return -1;
159
+
160
+ for (i = 0; i < 4; i++)
161
+ fir->coefs[i][(fir->nb_coefs - 1 - index + i) % fir->coef_len] = value;
162
+
163
+ return 0;
164
+ }
165
+
166
+
167
/* Push one real sample through the FIR filter and return the filtered output.
 *
 * The sample is written into the circular state buffer, then the dot product
 * of the last nb_coefs samples with the coefficients is computed four floats
 * at a time with NEON.  One of the four pre-shifted coefficient copies is
 * selected by (start_index & 3) so the vector scan always starts on a
 * float32x4_t boundary.
 *
 * @param fir      initialized filter
 * @param sample   input sample
 * @param delayed  if non-NULL, receives the input delayed by nb_coefs/2 taps
 *                 (for even tap counts, the average of the two middle samples)
 * @return the filtered sample
 */
float pdlc_fir_filter_filter_float(pdlc_fir_filter_t* fir, float sample, float *delayed)
{
	const unsigned int nb_coefs = fir->nb_coefs;
	const unsigned int flt_len = fir->state_len;
	const unsigned int mask = fir->index_mask;
	/* Oldest sample taking part in the convolution (flt_len added to keep
	 * the intermediate value non-negative before masking). */
	const unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask;
	const unsigned int middle_index = (start_index + nb_coefs / 2) & mask;
	const unsigned int lensimd = fir->coef_len >> 2;   /* vectors per scan  */
	const unsigned int startsimd = start_index >> 2;   /* first state vector */
	const unsigned int masksimd = mask >> 2;           /* vector wrap mask   */
	unsigned int i, j;
	register float32x4_t acc;
#ifndef __FP_FAST_FMA
	register float32x4_t prod;
#endif
	/* Coefficient copy matching the sub-vector phase of start_index. */
	const float32x4_t *coefs = (float32x4_t*)fir->coefs[start_index & 3];
	float32x4_t *stater = (float32x4_t*)fir->stater;

	/* Insert the new sample and advance the circular write index. */
	fir->stater[fir->index] = sample;
	fir->index = (fir->index + 1) & mask;

	if (delayed) {
		if (nb_coefs & 1)
			*delayed = fir->stater[middle_index];
		else
			*delayed = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
	}

	/* Vectorized dot product over the circular state buffer;
	 * uses fused multiply-add when the target provides it. */
	acc = vmovq_n_f32(0.0f);
	j = startsimd;
	for (i = 0; i < lensimd; i++) {
#ifndef __FP_FAST_FMA
		prod = vmulq_f32(coefs[i], stater[j]);
		acc = vaddq_f32(acc, prod);
#else
		acc = vfmaq_f32(acc, coefs[i], stater[j]);
#endif
		j = (j+1) & masksimd;
	}

	/* Horizontal sum of the four accumulator lanes. */
	return acc[0] + acc[1] + acc[2] + acc[3];
}
209
+
210
+
211
/* Push one complex sample through the FIR filter and return the filtered
 * output.  Same algorithm as pdlc_fir_filter_filter_float, applied to the
 * real and imaginary state buffers in parallel with the shared (real)
 * coefficient set.
 *
 * @param fir      initialized filter
 * @param sample   input sample (real/imag)
 * @param delayed  if non-NULL, receives the input delayed by nb_coefs/2 taps
 *                 (for even tap counts, the average of the two middle samples)
 * @return the filtered complex sample
 */
pdlc_complex_t pdlc_fir_filter_filter_complex(pdlc_fir_filter_t* fir, pdlc_complex_t sample, pdlc_complex_t *delayed)
{
	const unsigned int nb_coefs = fir->nb_coefs;
	const unsigned int flt_len = fir->state_len;
	const unsigned int mask = fir->index_mask;
	/* Oldest sample taking part in the convolution. */
	const unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask;
	const unsigned int middle_index = (start_index + nb_coefs / 2) & mask;
	const unsigned int lensimd = fir->coef_len >> 2;   /* vectors per scan  */
	const unsigned int startsimd = start_index >> 2;   /* first state vector */
	const unsigned int masksimd = mask >> 2;           /* vector wrap mask   */
	unsigned int i, j;
	pdlc_complex_t res = {0.0f, 0.0f};
	register float32x4_t accr, acci;
#ifndef __FP_FAST_FMA
	register float32x4_t prodr, prodi;
#endif
	/* Coefficient copy matching the sub-vector phase of start_index. */
	const float32x4_t *coefs = (float32x4_t*)fir->coefs[start_index & 3];
	float32x4_t *stater = (float32x4_t*)fir->stater;
	float32x4_t *statei = (float32x4_t*)fir->statei;

	/* Insert the new sample and advance the circular write index. */
	fir->stater[fir->index] = sample.real;
	fir->statei[fir->index] = sample.imag;
	fir->index = (fir->index + 1) & mask;

	if (delayed) {
		if (nb_coefs & 1) {
			delayed->real = fir->stater[middle_index];
			delayed->imag = fir->statei[middle_index];
		}
		else {
			delayed->real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
			delayed->imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
		}
	}

	/* Two vectorized dot products (real and imaginary) sharing the
	 * same coefficients; FMA variant when the target provides it. */
	accr = vmovq_n_f32(0.0f);
	acci = vmovq_n_f32(0.0f);
	j = startsimd;
	for (i = 0; i < lensimd; i++) {
#ifndef __FP_FAST_FMA
		prodr = vmulq_f32(coefs[i], stater[j]);
		prodi = vmulq_f32(coefs[i], statei[j]);
		accr = vaddq_f32(accr, prodr);
		acci = vaddq_f32(acci, prodi);
#else
		accr = vfmaq_f32(accr, coefs[i], stater[j]);
		acci = vfmaq_f32(acci, coefs[i], statei[j]);
#endif
		j = (j+1) & masksimd;
	}
	/* Horizontal sums of the accumulator lanes. */
	res.real = accr[0] + accr[1] + accr[2] + accr[3];
	res.imag = acci[0] + acci[1] + acci[2] + acci[3];

	return res;
}
266
+
267
+
268
/* Filter a whole buffer of real samples.
 *
 * Equivalent to calling pdlc_fir_filter_filter_float once per input sample,
 * but with the main loop unrolled four samples at a time: four output
 * samples share one pass over the state buffer, each using the coefficient
 * copy matching its start-index phase.  Every unrolled section therefore
 * appears three times below: once with odd-tap delay output, once with
 * even-tap (averaged) delay output, once without delay output.
 *
 * @param fir      initialized filter
 * @param ifbuf    input samples
 * @param ofbuf    output buffer; allocated if NULL, resized if length differs
 * @param delayed  optional buffer receiving the input delayed by nb_coefs/2
 *                 taps (resized to the input length if needed); may be NULL
 * @return ofbuf (or the newly allocated output buffer)
 */
pdlc_buffer_t* pdlc_fir_filter_filter_float_buffer(pdlc_fir_filter_t* fir, const pdlc_buffer_t *ifbuf, pdlc_buffer_t *ofbuf, pdlc_buffer_t *delayed)
{
	const unsigned int nb_coefs = fir->nb_coefs;
	const unsigned int flt_len = fir->state_len;
	const unsigned int mask = fir->index_mask;
	const unsigned int lensimd = fir->coef_len >> 2;   /* vectors per scan  */
	const unsigned int masksimd = mask >> 2;           /* vector wrap mask  */
	const size_t ibuflen = ifbuf->length;
	/* Oldest sample taking part in the next convolution. */
	unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask;
	unsigned int startsimd = start_index >> 2;
	unsigned int middle_index;
	unsigned int i, j;
	size_t k;
	register float32x4_t acc0, acc1, acc2, acc3;
#ifndef __FP_FAST_FMA
	register float32x4_t prod0, prod1, prod2, prod3;
#endif
	register float32x4_t statereal;
	const float32x4_t *coefs0, *coefs1, *coefs2, *coefs3;
	float32x4_t *stater = (float32x4_t*)fir->stater;

	if (!ofbuf)
		ofbuf = pdlc_buffer_new(ibuflen);
	else if (ofbuf->length != ibuflen)
		pdlc_buffer_resize(ofbuf, ibuflen, 0);

	if (delayed) {
		if (delayed->length != ibuflen)
			pdlc_buffer_resize(delayed, ibuflen, 0);
		middle_index = (start_index + nb_coefs / 2) & mask;
		if (nb_coefs & 1) {
			/* Odd tap count: the delayed output is the exact middle sample. */
			k = 0;
			/* Head: one sample at a time until start_index is 4-aligned. */
			while ((start_index & 3) && k < ibuflen) {
				fir->stater[fir->index] = ifbuf->data[k];
				fir->index = (fir->index + 1) & mask;
				coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
				acc0 = vmovq_n_f32(0.0f);
				j = startsimd;
				for (i = 0; i < lensimd; i++) {
#ifndef __FP_FAST_FMA
					prod0 = vmulq_f32(coefs0[i], stater[j]);
					acc0 = vaddq_f32(acc0, prod0);
#else
					acc0 = vfmaq_f32(acc0, coefs0[i], stater[j]);
#endif
					j = (j+1) & masksimd;
				}
				ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
				start_index = (start_index + 1) & mask;
				startsimd = start_index >> 2;
				delayed->data[k] = fir->stater[middle_index];
				middle_index = (middle_index + 1) & mask;
				k++;
			}
			/* Main: four samples per pass over the state buffer. */
			while (k + 4 <= ibuflen) {
				fir->stater[ fir->index           ] = ifbuf->data[k + 0];
				fir->stater[(fir->index + 1) & mask] = ifbuf->data[k + 1];
				fir->stater[(fir->index + 2) & mask] = ifbuf->data[k + 2];
				fir->stater[(fir->index + 3) & mask] = ifbuf->data[k + 3];
				fir->index = (fir->index + 4) & mask;
				/* One phase-shifted coefficient copy per output sample. */
				coefs0 = (float32x4_t*)fir->coefs[(start_index + 0) & 3];
				coefs1 = (float32x4_t*)fir->coefs[(start_index + 1) & 3];
				coefs2 = (float32x4_t*)fir->coefs[(start_index + 2) & 3];
				coefs3 = (float32x4_t*)fir->coefs[(start_index + 3) & 3];
				acc0 = vmovq_n_f32(0.0f);
				acc1 = vmovq_n_f32(0.0f);
				acc2 = vmovq_n_f32(0.0f);
				acc3 = vmovq_n_f32(0.0f);
				j = startsimd;
				for (i = 0; i < lensimd; i++) {
					statereal = stater[j]; /* load once, use 4 times */
#ifndef __FP_FAST_FMA
					prod0 = vmulq_f32(coefs0[i], statereal);
					acc0 = vaddq_f32(acc0, prod0);
					prod1 = vmulq_f32(coefs1[i], statereal);
					acc1 = vaddq_f32(acc1, prod1);
					prod2 = vmulq_f32(coefs2[i], statereal);
					acc2 = vaddq_f32(acc2, prod2);
					prod3 = vmulq_f32(coefs3[i], statereal);
					acc3 = vaddq_f32(acc3, prod3);
#else
					acc0 = vfmaq_f32(acc0, coefs0[i], statereal);
					acc1 = vfmaq_f32(acc1, coefs1[i], statereal);
					acc2 = vfmaq_f32(acc2, coefs2[i], statereal);
					acc3 = vfmaq_f32(acc3, coefs3[i], statereal);
#endif
					j = (j+1) & masksimd;
				}
				ofbuf->data[k+0] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
				ofbuf->data[k+1] = acc1[0] + acc1[1] + acc1[2] + acc1[3];
				ofbuf->data[k+2] = acc2[0] + acc2[1] + acc2[2] + acc2[3];
				ofbuf->data[k+3] = acc3[0] + acc3[1] + acc3[2] + acc3[3];
				start_index = (start_index + 4) & mask;
				startsimd = start_index >> 2;
				delayed->data[k] = fir->stater[middle_index];
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = fir->stater[middle_index];
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = fir->stater[middle_index];
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = fir->stater[middle_index];
				middle_index = (middle_index + 1) & mask;
				k++;
			}
			/* Tail: at most 3 remaining samples, one at a time. */
			for (; k < ibuflen; k++) {
				fir->stater[fir->index] = ifbuf->data[k];
				fir->index = (fir->index + 1) & mask;
				coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
				acc0 = vmovq_n_f32(0.0f);
				j = startsimd;
				for (i = 0; i < lensimd; i++) {
#ifndef __FP_FAST_FMA
					prod0 = vmulq_f32(coefs0[i], stater[j]);
					acc0 = vaddq_f32(acc0, prod0);
#else
					acc0 = vfmaq_f32(acc0, coefs0[i], stater[j]);
#endif
					j = (j+1) & masksimd;
				}
				ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
				start_index = (start_index + 1) & mask;
				startsimd = start_index >> 2;
				delayed->data[k] = fir->stater[middle_index];
				middle_index = (middle_index + 1) & mask;
			}
		}
		else {
			/* Even tap count: the delayed output is the average of the
			 * two samples around the (fractional) middle. */
			k = 0;
			/* Head: align start_index to a multiple of 4. */
			while ((start_index & 3) && k < ibuflen) {
				fir->stater[fir->index] = ifbuf->data[k];
				fir->index = (fir->index + 1) & mask;
				coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
				acc0 = vmovq_n_f32(0.0f);
				j = startsimd;
				for (i = 0; i < lensimd; i++) {
#ifndef __FP_FAST_FMA
					prod0 = vmulq_f32(coefs0[i], stater[j]);
					acc0 = vaddq_f32(acc0, prod0);
#else
					acc0 = vfmaq_f32(acc0, coefs0[i], stater[j]);
#endif
					j = (j+1) & masksimd;
				}
				ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
				start_index = (start_index + 1) & mask;
				startsimd = start_index >> 2;
				delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
				middle_index = (middle_index + 1) & mask;
				k++;
			}
			/* Main: four samples per pass over the state buffer. */
			while (k + 4 <= ibuflen) {
				fir->stater[ fir->index           ] = ifbuf->data[k + 0];
				fir->stater[(fir->index + 1) & mask] = ifbuf->data[k + 1];
				fir->stater[(fir->index + 2) & mask] = ifbuf->data[k + 2];
				fir->stater[(fir->index + 3) & mask] = ifbuf->data[k + 3];
				fir->index = (fir->index + 4) & mask;
				coefs0 = (float32x4_t*)fir->coefs[(start_index + 0) & 3];
				coefs1 = (float32x4_t*)fir->coefs[(start_index + 1) & 3];
				coefs2 = (float32x4_t*)fir->coefs[(start_index + 2) & 3];
				coefs3 = (float32x4_t*)fir->coefs[(start_index + 3) & 3];
				acc0 = vmovq_n_f32(0.0f);
				acc1 = vmovq_n_f32(0.0f);
				acc2 = vmovq_n_f32(0.0f);
				acc3 = vmovq_n_f32(0.0f);
				j = startsimd;
				for (i = 0; i < lensimd; i++) {
					statereal = stater[j]; /* load once, use 4 times */
#ifndef __FP_FAST_FMA
					prod0 = vmulq_f32(coefs0[i], statereal);
					acc0 = vaddq_f32(acc0, prod0);
					prod1 = vmulq_f32(coefs1[i], statereal);
					acc1 = vaddq_f32(acc1, prod1);
					prod2 = vmulq_f32(coefs2[i], statereal);
					acc2 = vaddq_f32(acc2, prod2);
					prod3 = vmulq_f32(coefs3[i], statereal);
					acc3 = vaddq_f32(acc3, prod3);
#else
					acc0 = vfmaq_f32(acc0, coefs0[i], statereal);
					acc1 = vfmaq_f32(acc1, coefs1[i], statereal);
					acc2 = vfmaq_f32(acc2, coefs2[i], statereal);
					acc3 = vfmaq_f32(acc3, coefs3[i], statereal);
#endif
					j = (j+1) & masksimd;
				}
				ofbuf->data[k+0] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
				ofbuf->data[k+1] = acc1[0] + acc1[1] + acc1[2] + acc1[3];
				ofbuf->data[k+2] = acc2[0] + acc2[1] + acc2[2] + acc2[3];
				ofbuf->data[k+3] = acc3[0] + acc3[1] + acc3[2] + acc3[3];
				start_index = (start_index + 4) & mask;
				startsimd = start_index >> 2;
				delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
				middle_index = (middle_index + 1) & mask;
				k++;
				delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
				middle_index = (middle_index + 1) & mask;
				k++;
			}
			/* Tail: at most 3 remaining samples. */
			for (; k < ibuflen; k++) {
				fir->stater[fir->index] = ifbuf->data[k];
				fir->index = (fir->index + 1) & mask;
				coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
				acc0 = vmovq_n_f32(0.0f);
				j = startsimd;
				for (i = 0; i < lensimd; i++) {
#ifndef __FP_FAST_FMA
					prod0 = vmulq_f32(coefs0[i], stater[j]);
					acc0 = vaddq_f32(acc0, prod0);
#else
					acc0 = vfmaq_f32(acc0, coefs0[i], stater[j]);
#endif
					j = (j+1) & masksimd;
				}
				ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
				start_index = (start_index + 1) & mask;
				startsimd = start_index >> 2;
				delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
				middle_index = (middle_index + 1) & mask;
			}
		}
	}
	else {
		/* No delayed output requested: same three loops, filter only. */
		k = 0;
		/* Head: align start_index to a multiple of 4. */
		while ((start_index & 3) && k < ibuflen) {
			fir->stater[fir->index] = ifbuf->data[k];
			fir->index = (fir->index + 1) & mask;
			coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
			acc0 = vmovq_n_f32(0.0f);
			j = startsimd;
			for (i = 0; i < lensimd; i++) {
#ifndef __FP_FAST_FMA
				prod0 = vmulq_f32(coefs0[i], stater[j]);
				acc0 = vaddq_f32(acc0, prod0);
#else
				acc0 = vfmaq_f32(acc0, coefs0[i], stater[j]);
#endif
				j = (j+1) & masksimd;
			}
			ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
			start_index = (start_index + 1) & mask;
			startsimd = start_index >> 2;
			k++;
		}
		/* Main: four samples per pass over the state buffer. */
		while (k + 4 <= ibuflen) {
			fir->stater[ fir->index           ] = ifbuf->data[k + 0];
			fir->stater[(fir->index + 1) & mask] = ifbuf->data[k + 1];
			fir->stater[(fir->index + 2) & mask] = ifbuf->data[k + 2];
			fir->stater[(fir->index + 3) & mask] = ifbuf->data[k + 3];
			fir->index = (fir->index + 4) & mask;
			coefs0 = (float32x4_t*)fir->coefs[(start_index + 0) & 3];
			coefs1 = (float32x4_t*)fir->coefs[(start_index + 1) & 3];
			coefs2 = (float32x4_t*)fir->coefs[(start_index + 2) & 3];
			coefs3 = (float32x4_t*)fir->coefs[(start_index + 3) & 3];
			acc0 = vmovq_n_f32(0.0f);
			acc1 = vmovq_n_f32(0.0f);
			acc2 = vmovq_n_f32(0.0f);
			acc3 = vmovq_n_f32(0.0f);
			j = startsimd;
			for (i = 0; i < lensimd; i++) {
				statereal = stater[j]; /* load once, use 4 times */
#ifndef __FP_FAST_FMA
				prod0 = vmulq_f32(coefs0[i], statereal);
				acc0 = vaddq_f32(acc0, prod0);
				prod1 = vmulq_f32(coefs1[i], statereal);
				acc1 = vaddq_f32(acc1, prod1);
				prod2 = vmulq_f32(coefs2[i], statereal);
				acc2 = vaddq_f32(acc2, prod2);
				prod3 = vmulq_f32(coefs3[i], statereal);
				acc3 = vaddq_f32(acc3, prod3);
#else
				acc0 = vfmaq_f32(acc0, coefs0[i], statereal);
				acc1 = vfmaq_f32(acc1, coefs1[i], statereal);
				acc2 = vfmaq_f32(acc2, coefs2[i], statereal);
				acc3 = vfmaq_f32(acc3, coefs3[i], statereal);
#endif
				j = (j+1) & masksimd;
			}
			ofbuf->data[k+0] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
			ofbuf->data[k+1] = acc1[0] + acc1[1] + acc1[2] + acc1[3];
			ofbuf->data[k+2] = acc2[0] + acc2[1] + acc2[2] + acc2[3];
			ofbuf->data[k+3] = acc3[0] + acc3[1] + acc3[2] + acc3[3];
			start_index = (start_index + 4) & mask;
			startsimd = start_index >> 2;
			k += 4;
		}
		/* Tail: at most 3 remaining samples. */
		for (; k < ibuflen; k++) {
			fir->stater[fir->index] = ifbuf->data[k];
			fir->index = (fir->index + 1) & mask;
			coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
			acc0 = vmovq_n_f32(0.0f);
			j = startsimd;
			for (i = 0; i < lensimd; i++) {
#ifndef __FP_FAST_FMA
				prod0 = vmulq_f32(coefs0[i], stater[j]);
				acc0 = vaddq_f32(acc0, prod0);
#else
				acc0 = vfmaq_f32(acc0, coefs0[i], stater[j]);
#endif
				j = (j+1) & masksimd;
			}
			ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
			start_index = (start_index + 1) & mask;
			startsimd = start_index >> 2;
		}
	}

	return ofbuf;
}
584
+
585
+
586
+ pdlc_complex_buffer_t* pdlc_fir_filter_filter_complex_buffer(pdlc_fir_filter_t* fir, const pdlc_complex_buffer_t *icbuf, pdlc_complex_buffer_t *ocbuf, pdlc_complex_buffer_t *delayed)
587
+ {
588
+ const unsigned int nb_coefs = fir->nb_coefs;
589
+ const unsigned int flt_len = fir->state_len;
590
+ const unsigned int mask = fir->index_mask;
591
+ const unsigned int lensimd = fir->coef_len >> 2;
592
+ const unsigned int masksimd = mask >> 2;
593
+ const size_t ibuflen = icbuf->length;
594
+ unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask;
595
+ unsigned int startsimd = start_index >> 2;
596
+ unsigned int middle_index;
597
+ unsigned int i, j;
598
+ size_t k;
599
+ register float32x4_t acc0r, acc1r, acc2r, acc3r;
600
+ register float32x4_t acc0i, acc1i, acc2i, acc3i;
601
+ #ifndef __FP_FAST_FMA
602
+ register float32x4_t prod0r, prod1r, prod2r, prod3r;
603
+ register float32x4_t prod0i, prod1i, prod2i, prod3i;
604
+ #endif
605
+ register float32x4_t statereal;
606
+ register float32x4_t stateimag;
607
+ const float32x4_t *coefs0, *coefs1, *coefs2, *coefs3;
608
+ float32x4_t *stater = (float32x4_t*)fir->stater;
609
+ float32x4_t *statei = (float32x4_t*)fir->statei;
610
+
611
+ if (!ocbuf)
612
+ ocbuf = pdlc_complex_buffer_new(ibuflen);
613
+ else if (ocbuf->length != ibuflen)
614
+ pdlc_complex_buffer_resize(ocbuf, ibuflen, 0);
615
+
616
+ if (delayed) {
617
+ if (delayed->length != ibuflen)
618
+ pdlc_complex_buffer_resize(delayed, ibuflen, 0);
619
+ middle_index = (start_index + nb_coefs / 2) & mask;
620
+ if (nb_coefs & 1) {
621
+ //delayed->data[k] = fir->stater[middle_index];
622
+ //middle_index = (middle_index + 1) & mask;
623
+ k = 0;
624
+ while ((start_index & 3) && k < ibuflen) {
625
+ fir->stater[fir->index] = icbuf->data[k].real;
626
+ fir->statei[fir->index] = icbuf->data[k].imag;
627
+ fir->index = (fir->index + 1) & mask;
628
+ coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
629
+ acc0r = vmovq_n_f32(0.0f);
630
+ acc0i = vmovq_n_f32(0.0f);
631
+ j = startsimd;
632
+ for (i = 0; i < lensimd; i++) {
633
+ #ifndef __FP_FAST_FMA
634
+ prod0r = vmulq_f32(coefs0[i], stater[j]);
635
+ prod0i = vmulq_f32(coefs0[i], statei[j]);
636
+ acc0r = vaddq_f32(acc0r, prod0r);
637
+ acc0i = vaddq_f32(acc0i, prod0i);
638
+ #else
639
+ acc0r = vfmaq_f32(acc0r, coefs0[i], stater[j]);
640
+ acc0i = vfmaq_f32(acc0i, coefs0[i], statei[j]);
641
+ #endif
642
+ j = (j+1) & masksimd;
643
+ }
644
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
645
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
646
+ start_index = (start_index + 1) & mask;
647
+ startsimd = start_index >> 2;
648
+ delayed->data[k].real = fir->stater[middle_index];
649
+ delayed->data[k].imag = fir->statei[middle_index];
650
+ middle_index = (middle_index + 1) & mask;
651
+ k++;
652
+ }
653
+ while (k + 4 <= ibuflen) {
654
+ fir->stater[ fir->index ] = icbuf->data[k + 0].real;
655
+ fir->stater[(fir->index + 1) & mask] = icbuf->data[k + 1].real;
656
+ fir->stater[(fir->index + 2) & mask] = icbuf->data[k + 2].real;
657
+ fir->stater[(fir->index + 3) & mask] = icbuf->data[k + 3].real;
658
+ fir->statei[ fir->index ] = icbuf->data[k + 0].imag;
659
+ fir->statei[(fir->index + 1) & mask] = icbuf->data[k + 1].imag;
660
+ fir->statei[(fir->index + 2) & mask] = icbuf->data[k + 2].imag;
661
+ fir->statei[(fir->index + 3) & mask] = icbuf->data[k + 3].imag;
662
+ fir->index = (fir->index + 4) & mask;
663
+ coefs0 = (float32x4_t*)fir->coefs[(start_index + 0) & 3];
664
+ coefs1 = (float32x4_t*)fir->coefs[(start_index + 1) & 3];
665
+ coefs2 = (float32x4_t*)fir->coefs[(start_index + 2) & 3];
666
+ coefs3 = (float32x4_t*)fir->coefs[(start_index + 3) & 3];
667
+ acc0r = vmovq_n_f32(0.0f);
668
+ acc0i = vmovq_n_f32(0.0f);
669
+ acc1r = vmovq_n_f32(0.0f);
670
+ acc1i = vmovq_n_f32(0.0f);
671
+ acc2r = vmovq_n_f32(0.0f);
672
+ acc2i = vmovq_n_f32(0.0f);
673
+ acc3r = vmovq_n_f32(0.0f);
674
+ acc3i = vmovq_n_f32(0.0f);
675
+ j = startsimd;
676
+ for (i = 0; i < lensimd; i++) {
677
+ statereal = stater[j];
678
+ stateimag = statei[j];
679
+ #ifndef __FP_FAST_FMA
680
+ prod0r = vmulq_f32(coefs0[i], statereal);
681
+ acc0r = vaddq_f32(acc0r, prod0r);
682
+ prod1r = vmulq_f32(coefs1[i], statereal);
683
+ acc1r = vaddq_f32(acc1r, prod1r);
684
+ prod2r = vmulq_f32(coefs2[i], statereal);
685
+ acc2r = vaddq_f32(acc2r, prod2r);
686
+ prod3r = vmulq_f32(coefs3[i], statereal);
687
+ acc3r = vaddq_f32(acc3r, prod3r);
688
+ prod0i = vmulq_f32(coefs0[i], stateimag);
689
+ acc0i = vaddq_f32(acc0i, prod0i);
690
+ prod1i = vmulq_f32(coefs1[i], stateimag);
691
+ acc1i = vaddq_f32(acc1i, prod1i);
692
+ prod2i = vmulq_f32(coefs2[i], stateimag);
693
+ acc2i = vaddq_f32(acc2i, prod2i);
694
+ prod3i = vmulq_f32(coefs3[i], stateimag);
695
+ acc3i = vaddq_f32(acc3i, prod3i);
696
+ #else
697
+ acc0r = vfmaq_f32(acc0r, coefs0[i], statereal);
698
+ acc0i = vfmaq_f32(acc0i, coefs0[i], stateimag);
699
+ acc1r = vfmaq_f32(acc1r, coefs1[i], statereal);
700
+ acc1i = vfmaq_f32(acc1i, coefs1[i], stateimag);
701
+ acc2r = vfmaq_f32(acc2r, coefs2[i], statereal);
702
+ acc2i = vfmaq_f32(acc2i, coefs2[i], stateimag);
703
+ acc3r = vfmaq_f32(acc3r, coefs3[i], statereal);
704
+ acc3i = vfmaq_f32(acc3i, coefs3[i], stateimag);
705
+ #endif
706
+ j = (j+1) & masksimd;
707
+ }
708
+ ocbuf->data[k+0].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
709
+ ocbuf->data[k+0].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
710
+ ocbuf->data[k+1].real = acc1r[0] + acc1r[1] + acc1r[2] + acc1r[3];
711
+ ocbuf->data[k+1].imag = acc1i[0] + acc1i[1] + acc1i[2] + acc1i[3];
712
+ ocbuf->data[k+2].real = acc2r[0] + acc2r[1] + acc2r[2] + acc2r[3];
713
+ ocbuf->data[k+2].imag = acc2i[0] + acc2i[1] + acc2i[2] + acc2i[3];
714
+ ocbuf->data[k+3].real = acc3r[0] + acc3r[1] + acc3r[2] + acc3r[3];
715
+ ocbuf->data[k+3].imag = acc3i[0] + acc3i[1] + acc3i[2] + acc3i[3];
716
+ start_index = (start_index + 4) & mask;
717
+ startsimd = start_index >> 2;
718
+ delayed->data[k].real = fir->stater[middle_index];
719
+ delayed->data[k].imag = fir->statei[middle_index];
720
+ middle_index = (middle_index + 1) & mask;
721
+ k++;
722
+ delayed->data[k].real = fir->stater[middle_index];
723
+ delayed->data[k].imag = fir->statei[middle_index];
724
+ middle_index = (middle_index + 1) & mask;
725
+ k++;
726
+ delayed->data[k].real = fir->stater[middle_index];
727
+ delayed->data[k].imag = fir->statei[middle_index];
728
+ middle_index = (middle_index + 1) & mask;
729
+ k++;
730
+ delayed->data[k].real = fir->stater[middle_index];
731
+ delayed->data[k].imag = fir->statei[middle_index];
732
+ middle_index = (middle_index + 1) & mask;
733
+ k++;
734
+ }
735
+ for (; k < ibuflen; k++) {
736
+ fir->stater[fir->index] = icbuf->data[k].real;
737
+ fir->statei[fir->index] = icbuf->data[k].imag;
738
+ fir->index = (fir->index + 1) & mask;
739
+ coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
740
+ acc0r = vmovq_n_f32(0.0f);
741
+ acc0i = vmovq_n_f32(0.0f);
742
+ j = startsimd;
743
+ for (i = 0; i < lensimd; i++) {
744
+ #ifndef __FP_FAST_FMA
745
+ prod0r = vmulq_f32(coefs0[i], stater[j]);
746
+ acc0r = vaddq_f32(acc0r, prod0r);
747
+ prod0i = vmulq_f32(coefs0[i], statei[j]);
748
+ acc0i = vaddq_f32(acc0i, prod0i);
749
+ #else
750
+ acc0r = vfmaq_f32(acc0r, coefs0[i], stater[j]);
751
+ acc0i = vfmaq_f32(acc0i, coefs0[i], statei[j]);
752
+ #endif
753
+ j = (j+1) & masksimd;
754
+ }
755
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
756
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
757
+ start_index = (start_index + 1) & mask;
758
+ startsimd = start_index >> 2;
759
+ delayed->data[k].real = fir->stater[middle_index];
760
+ delayed->data[k].imag = fir->statei[middle_index];
761
+ middle_index = (middle_index + 1) & mask;
762
+ }
763
+ }
764
+ else {
765
+ k = 0;
766
+ while ((start_index & 3) && k < ibuflen) {
767
+ fir->stater[fir->index] = icbuf->data[k].real;
768
+ fir->statei[fir->index] = icbuf->data[k].imag;
769
+ fir->index = (fir->index + 1) & mask;
770
+ coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
771
+ acc0r = vmovq_n_f32(0.0f);
772
+ acc0i = vmovq_n_f32(0.0f);
773
+ j = startsimd;
774
+ for (i = 0; i < lensimd; i++) {
775
+ #ifndef __FP_FAST_FMA
776
+ prod0r = vmulq_f32(coefs0[i], stater[j]);
777
+ acc0r = vaddq_f32(acc0r, prod0r);
778
+ prod0i = vmulq_f32(coefs0[i], statei[j]);
779
+ acc0i = vaddq_f32(acc0i, prod0i);
780
+ #else
781
+ acc0r = vfmaq_f32(acc0r, coefs0[i], stater[j]);
782
+ acc0i = vfmaq_f32(acc0i, coefs0[i], statei[j]);
783
+ #endif
784
+ j = (j+1) & masksimd;
785
+ }
786
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
787
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
788
+ start_index = (start_index + 1) & mask;
789
+ startsimd = start_index >> 2;
790
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
791
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
792
+ middle_index = (middle_index + 1) & mask;
793
+ k++;
794
+ }
795
+ while (k + 4 <= ibuflen) {
796
+ fir->stater[ fir->index ] = icbuf->data[k + 0].real;
797
+ fir->stater[(fir->index + 1) & mask] = icbuf->data[k + 1].real;
798
+ fir->stater[(fir->index + 2) & mask] = icbuf->data[k + 2].real;
799
+ fir->stater[(fir->index + 3) & mask] = icbuf->data[k + 3].real;
800
+ fir->statei[ fir->index ] = icbuf->data[k + 0].imag;
801
+ fir->statei[(fir->index + 1) & mask] = icbuf->data[k + 1].imag;
802
+ fir->statei[(fir->index + 2) & mask] = icbuf->data[k + 2].imag;
803
+ fir->statei[(fir->index + 3) & mask] = icbuf->data[k + 3].imag;
804
+ fir->index = (fir->index + 4) & mask;
805
+ coefs0 = (float32x4_t*)fir->coefs[(start_index + 0) & 3];
806
+ coefs1 = (float32x4_t*)fir->coefs[(start_index + 1) & 3];
807
+ coefs2 = (float32x4_t*)fir->coefs[(start_index + 2) & 3];
808
+ coefs3 = (float32x4_t*)fir->coefs[(start_index + 3) & 3];
809
+ acc0r = vmovq_n_f32(0.0f);
810
+ acc0i = vmovq_n_f32(0.0f);
811
+ acc1r = vmovq_n_f32(0.0f);
812
+ acc1i = vmovq_n_f32(0.0f);
813
+ acc2r = vmovq_n_f32(0.0f);
814
+ acc2i = vmovq_n_f32(0.0f);
815
+ acc3r = vmovq_n_f32(0.0f);
816
+ acc3i = vmovq_n_f32(0.0f);
817
+ j = startsimd;
818
+ for (i = 0; i < lensimd; i++) {
819
+ statereal = stater[j];
820
+ stateimag = statei[j];
821
+ #ifndef __FP_FAST_FMA
822
+ prod0r = vmulq_f32(coefs0[i], statereal);
823
+ acc0r = vaddq_f32(acc0r, prod0r);
824
+ prod1r = vmulq_f32(coefs1[i], statereal);
825
+ acc1r = vaddq_f32(acc1r, prod1r);
826
+ prod2r = vmulq_f32(coefs2[i], statereal);
827
+ acc2r = vaddq_f32(acc2r, prod2r);
828
+ prod3r = vmulq_f32(coefs3[i], statereal);
829
+ acc3r = vaddq_f32(acc3r, prod3r);
830
+ prod0i = vmulq_f32(coefs0[i], stateimag);
831
+ acc0i = vaddq_f32(acc0i, prod0i);
832
+ prod1i = vmulq_f32(coefs1[i], stateimag);
833
+ acc1i = vaddq_f32(acc1i, prod1i);
834
+ prod2i = vmulq_f32(coefs2[i], stateimag);
835
+ acc2i = vaddq_f32(acc2i, prod2i);
836
+ prod3i = vmulq_f32(coefs3[i], stateimag);
837
+ acc3i = vaddq_f32(acc3i, prod3i);
838
+ #else
839
+ acc0r = vfmaq_f32(acc0r, coefs0[i], statereal);
840
+ acc0i = vfmaq_f32(acc0i, coefs0[i], stateimag);
841
+ acc1r = vfmaq_f32(acc1r, coefs1[i], statereal);
842
+ acc1i = vfmaq_f32(acc1i, coefs1[i], stateimag);
843
+ acc2r = vfmaq_f32(acc2r, coefs2[i], statereal);
844
+ acc2i = vfmaq_f32(acc2i, coefs2[i], stateimag);
845
+ acc3r = vfmaq_f32(acc3r, coefs3[i], statereal);
846
+ acc3i = vfmaq_f32(acc3i, coefs3[i], stateimag);
847
+ #endif
848
+ j = (j+1) & masksimd;
849
+ }
850
+ ocbuf->data[k+0].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
851
+ ocbuf->data[k+0].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
852
+ ocbuf->data[k+1].real = acc1r[0] + acc1r[1] + acc1r[2] + acc1r[3];
853
+ ocbuf->data[k+1].imag = acc1i[0] + acc1i[1] + acc1i[2] + acc1i[3];
854
+ ocbuf->data[k+2].real = acc2r[0] + acc2r[1] + acc2r[2] + acc2r[3];
855
+ ocbuf->data[k+2].imag = acc2i[0] + acc2i[1] + acc2i[2] + acc2i[3];
856
+ ocbuf->data[k+3].real = acc3r[0] + acc3r[1] + acc3r[2] + acc3r[3];
857
+ ocbuf->data[k+3].imag = acc3i[0] + acc3i[1] + acc3i[2] + acc3i[3];
858
+ start_index = (start_index + 4) & mask;
859
+ startsimd = start_index >> 2;
860
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
861
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
862
+ middle_index = (middle_index + 1) & mask;
863
+ k++;
864
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
865
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
866
+ middle_index = (middle_index + 1) & mask;
867
+ k++;
868
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
869
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
870
+ middle_index = (middle_index + 1) & mask;
871
+ k++;
872
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
873
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
874
+ middle_index = (middle_index + 1) & mask;
875
+ k++;
876
+ }
877
+ for (; k < ibuflen; k++) {
878
+ fir->stater[fir->index] = icbuf->data[k].real;
879
+ fir->statei[fir->index] = icbuf->data[k].imag;
880
+ fir->index = (fir->index + 1) & mask;
881
+ coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
882
+ acc0r = vmovq_n_f32(0.0f);
883
+ acc0i = vmovq_n_f32(0.0f);
884
+ j = startsimd;
885
+ for (i = 0; i < lensimd; i++) {
886
+ #ifndef __FP_FAST_FMA
887
+ prod0r = vmulq_f32(coefs0[i], stater[j]);
888
+ acc0r = vaddq_f32(acc0r, prod0r);
889
+ prod0i = vmulq_f32(coefs0[i], statei[j]);
890
+ acc0i = vaddq_f32(acc0i, prod0i);
891
+ #else
892
+ acc0r = vfmaq_f32(acc0r, coefs0[i], stater[j]);
893
+ acc0i = vfmaq_f32(acc0i, coefs0[i], statei[j]);
894
+ #endif
895
+ j = (j+1) & masksimd;
896
+ }
897
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
898
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
899
+ start_index = (start_index + 1) & mask;
900
+ startsimd = start_index >> 2;
901
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
902
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
903
+ middle_index = (middle_index + 1) & mask;
904
+ }
905
+ }
906
+ }
907
+ else {
908
+ k = 0;
909
+ while ((start_index & 3) && k < ibuflen) {
910
+ fir->stater[fir->index] = icbuf->data[k].real;
911
+ fir->statei[fir->index] = icbuf->data[k].imag;
912
+ fir->index = (fir->index + 1) & mask;
913
+ coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
914
+ acc0r = vmovq_n_f32(0.0f);
915
+ acc0i = vmovq_n_f32(0.0f);
916
+ j = startsimd;
917
+ for (i = 0; i < lensimd; i++) {
918
+ #ifndef __FP_FAST_FMA
919
+ prod0r = vmulq_f32(coefs0[i], stater[j]);
920
+ acc0r = vaddq_f32(acc0r, prod0r);
921
+ prod0i = vmulq_f32(coefs0[i], statei[j]);
922
+ acc0i = vaddq_f32(acc0i, prod0i);
923
+ #else
924
+ acc0r = vfmaq_f32(acc0r, coefs0[i], stater[j]);
925
+ acc0i = vfmaq_f32(acc0i, coefs0[i], statei[j]);
926
+ #endif
927
+ j = (j+1) & masksimd;
928
+ }
929
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
930
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
931
+ start_index = (start_index + 1) & mask;
932
+ startsimd = start_index >> 2;
933
+ k++;
934
+ }
935
+ while (k + 4 <= ibuflen) {
936
+ fir->stater[ fir->index ] = icbuf->data[k + 0].real;
937
+ fir->stater[(fir->index + 1) & mask] = icbuf->data[k + 1].real;
938
+ fir->stater[(fir->index + 2) & mask] = icbuf->data[k + 2].real;
939
+ fir->stater[(fir->index + 3) & mask] = icbuf->data[k + 3].real;
940
+ fir->statei[ fir->index ] = icbuf->data[k + 0].imag;
941
+ fir->statei[(fir->index + 1) & mask] = icbuf->data[k + 1].imag;
942
+ fir->statei[(fir->index + 2) & mask] = icbuf->data[k + 2].imag;
943
+ fir->statei[(fir->index + 3) & mask] = icbuf->data[k + 3].imag;
944
+ fir->index = (fir->index + 4) & mask;
945
+ coefs0 = (float32x4_t*)fir->coefs[(start_index + 0) & 3];
946
+ coefs1 = (float32x4_t*)fir->coefs[(start_index + 1) & 3];
947
+ coefs2 = (float32x4_t*)fir->coefs[(start_index + 2) & 3];
948
+ coefs3 = (float32x4_t*)fir->coefs[(start_index + 3) & 3];
949
+ acc0r = vmovq_n_f32(0.0f);
950
+ acc0i = vmovq_n_f32(0.0f);
951
+ acc1r = vmovq_n_f32(0.0f);
952
+ acc1i = vmovq_n_f32(0.0f);
953
+ acc2r = vmovq_n_f32(0.0f);
954
+ acc2i = vmovq_n_f32(0.0f);
955
+ acc3r = vmovq_n_f32(0.0f);
956
+ acc3i = vmovq_n_f32(0.0f);
957
+ j = startsimd;
958
+ for (i = 0; i < lensimd; i++) {
959
+ statereal = stater[j];
960
+ stateimag = statei[j];
961
+ #ifndef __FP_FAST_FMA
962
+ prod0r = vmulq_f32(coefs0[i], statereal);
963
+ acc0r = vaddq_f32(acc0r, prod0r);
964
+ prod1r = vmulq_f32(coefs1[i], statereal);
965
+ acc1r = vaddq_f32(acc1r, prod1r);
966
+ prod2r = vmulq_f32(coefs2[i], statereal);
967
+ acc2r = vaddq_f32(acc2r, prod2r);
968
+ prod3r = vmulq_f32(coefs3[i], statereal);
969
+ acc3r = vaddq_f32(acc3r, prod3r);
970
+ prod0i = vmulq_f32(coefs0[i], stateimag);
971
+ acc0i = vaddq_f32(acc0i, prod0i);
972
+ prod1i = vmulq_f32(coefs1[i], stateimag);
973
+ acc1i = vaddq_f32(acc1i, prod1i);
974
+ prod2i = vmulq_f32(coefs2[i], stateimag);
975
+ acc2i = vaddq_f32(acc2i, prod2i);
976
+ prod3i = vmulq_f32(coefs3[i], stateimag);
977
+ acc3i = vaddq_f32(acc3i, prod3i);
978
+ #else
979
+ acc0r = vfmaq_f32(acc0r, coefs0[i], statereal);
980
+ acc0i = vfmaq_f32(acc0i, coefs0[i], stateimag);
981
+ acc1r = vfmaq_f32(acc1r, coefs1[i], statereal);
982
+ acc1i = vfmaq_f32(acc1i, coefs1[i], stateimag);
983
+ acc2r = vfmaq_f32(acc2r, coefs2[i], statereal);
984
+ acc2i = vfmaq_f32(acc2i, coefs2[i], stateimag);
985
+ acc3r = vfmaq_f32(acc3r, coefs3[i], statereal);
986
+ acc3i = vfmaq_f32(acc3i, coefs3[i], stateimag);
987
+ #endif
988
+ j = (j+1) & masksimd;
989
+ }
990
+ ocbuf->data[k+0].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
991
+ ocbuf->data[k+0].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
992
+ ocbuf->data[k+1].real = acc1r[0] + acc1r[1] + acc1r[2] + acc1r[3];
993
+ ocbuf->data[k+1].imag = acc1i[0] + acc1i[1] + acc1i[2] + acc1i[3];
994
+ ocbuf->data[k+2].real = acc2r[0] + acc2r[1] + acc2r[2] + acc2r[3];
995
+ ocbuf->data[k+2].imag = acc2i[0] + acc2i[1] + acc2i[2] + acc2i[3];
996
+ ocbuf->data[k+3].real = acc3r[0] + acc3r[1] + acc3r[2] + acc3r[3];
997
+ ocbuf->data[k+3].imag = acc3i[0] + acc3i[1] + acc3i[2] + acc3i[3];
998
+ start_index = (start_index + 4) & mask;
999
+ startsimd = start_index >> 2;
1000
+ k += 4;
1001
+ }
1002
+ for (; k < ibuflen; k++) {
1003
+ fir->stater[fir->index] = icbuf->data[k].real;
1004
+ fir->statei[fir->index] = icbuf->data[k].imag;
1005
+ fir->index = (fir->index + 1) & mask;
1006
+ coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
1007
+ acc0r = vmovq_n_f32(0.0f);
1008
+ acc0i = vmovq_n_f32(0.0f);
1009
+ j = startsimd;
1010
+ for (i = 0; i < lensimd; i++) {
1011
+ #ifndef __FP_FAST_FMA
1012
+ prod0r = vmulq_f32(coefs0[i], stater[j]);
1013
+ acc0r = vaddq_f32(acc0r, prod0r);
1014
+ prod0i = vmulq_f32(coefs0[i], statei[j]);
1015
+ acc0i = vaddq_f32(acc0i, prod0i);
1016
+ #else
1017
+ acc0r = vfmaq_f32(acc0r, coefs0[i], stater[j]);
1018
+ acc0i = vfmaq_f32(acc0i, coefs0[i], statei[j]);
1019
+ #endif
1020
+ j = (j+1) & masksimd;
1021
+ }
1022
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
1023
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
1024
+ start_index = (start_index + 1) & mask;
1025
+ startsimd = start_index >> 2;
1026
+ }
1027
+ }
1028
+
1029
+ return ocbuf;
1030
+ }
1031
+
1032
+
1033
/* Interpolate (upsample) a real-valued buffer by a factor of fir->max_counter
 * through the FIR filter `fir` (ARM NEON implementation).
 *
 * For every output sample one value is pushed into the circular state buffer:
 * a genuine input sample once every `mcounter` outputs, a zero otherwise
 * (zero stuffing).  The state is then convolved with one of the four
 * phase-shifted coefficient copies fir->coefs[start_index & 3], and the
 * result is scaled by `ffactor` (== max_counter) to restore the amplitude
 * lost to the inserted zeros.
 *
 * fir:   filter state; fir->stater and fir->index are advanced in place.
 * ifbuf: input buffer (read only).
 * ofbuf: output buffer; allocated when NULL, resized when its length is not
 *        ibuflen * max_counter.  Returned to the caller.
 *
 * Structure: a scalar prologue until start_index is 4-aligned, a main loop
 * producing 4 outputs per shared pass over the state vector, then a scalar
 * epilogue for the remaining (< 4) outputs.
 */
pdlc_buffer_t* pdlc_fir_filter_interpolate_float_buffer(pdlc_fir_filter_t* fir, const pdlc_buffer_t *ifbuf, pdlc_buffer_t *ofbuf)
{
	const unsigned int nb_coefs = fir->nb_coefs;
	const unsigned int flt_len = fir->state_len;
	const unsigned int mask = fir->index_mask;       /* state index wraps with '&' (power-of-two length) */
	const unsigned int lensimd = fir->coef_len >> 2; /* coefficient length in 4-float NEON vectors */
	const unsigned int masksimd = mask >> 2;         /* wrap mask for vector-granular state indices */
	const size_t ibuflen = ifbuf->length;
	const size_t obuflen = ibuflen*fir->max_counter; /* max_counter outputs per input sample */
	const float ffactor = (float)(fir->max_counter); /* gain compensation for zero stuffing */
	const size_t mcounter = fir->max_counter;
	unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask; /* oldest state sample in the convolution window */
	unsigned int startsimd = start_index >> 2;
	unsigned int i, j;
	size_t k = 0, l = 0;                             /* k: output index, l: input index */
	register float32x4_t acc0, acc1, acc2, acc3;
#ifndef __FP_FAST_FMA
	register float32x4_t prod0, prod1, prod2, prod3;
#endif
	register float32x4_t statereal;
	const float32x4_t *coefs0, *coefs1, *coefs2, *coefs3;
	/* NOTE(review): this cast (and the fir->coefs casts below) assumes the
	 * arrays are 16-byte aligned -- presumably guaranteed by the allocator
	 * in pdlc_fir_filter_initialize; verify. */
	float32x4_t *stater = (float32x4_t*)fir->stater;

	if (!ofbuf)
		ofbuf = pdlc_buffer_new(obuflen);
	else if (ofbuf->length != obuflen)
		pdlc_buffer_resize(ofbuf, obuflen, 0);


	/* Scalar prologue: one output at a time until start_index is a multiple
	 * of 4, so the main loop can read the state as whole vectors. */
	while ((start_index & 3) && k < obuflen) {
		if ((k % mcounter) == 0)
			fir->stater[fir->index] = ifbuf->data[l++]; /* real sample every mcounter outputs */
		else
			fir->stater[fir->index] = 0.0f;             /* stuffed zero */
		fir->index = (fir->index + 1) & mask;
		coefs0 = (float32x4_t*)fir->coefs[start_index & 3]; /* phase-aligned coefficient copy */
		acc0 = vmovq_n_f32(0.0f);
		j = startsimd;
		for (i = 0; i < lensimd; i++) {
#ifndef __FP_FAST_FMA
			prod0 = vmulq_f32(coefs0[i], stater[j]);
			acc0 = vaddq_f32(acc0, prod0);
#else
			acc0 = vfmaq_f32(acc0, coefs0[i], stater[j]);
#endif
			j = (j+1) & masksimd;
		}
		/* Horizontal sum of the 4 lanes (GCC/Clang vector subscripting). */
		ofbuf->data[k] = (acc0[0] + acc0[1] + acc0[2] + acc0[3]) * ffactor;
		start_index = (start_index + 1) & mask;
		startsimd = start_index >> 2;
		k++;
	}
	/* Main loop: push 4 samples, then compute 4 outputs with the 4 phase
	 * coefficient sets in a single shared pass over the state vector. */
	while (k + 4 <= obuflen) {
		if (((k+0) % mcounter) == 0)
			fir->stater[fir->index] = ifbuf->data[l++];
		else
			fir->stater[fir->index] = 0.0f;
		fir->index = (fir->index + 1) & mask;
		if (((k+1) % mcounter) == 0)
			fir->stater[fir->index] = ifbuf->data[l++];
		else
			fir->stater[fir->index] = 0.0f;
		fir->index = (fir->index + 1) & mask;
		if (((k+2) % mcounter) == 0)
			fir->stater[fir->index] = ifbuf->data[l++];
		else
			fir->stater[fir->index] = 0.0f;
		fir->index = (fir->index + 1) & mask;
		if (((k+3) % mcounter) == 0)
			fir->stater[fir->index] = ifbuf->data[l++];
		else
			fir->stater[fir->index] = 0.0f;
		fir->index = (fir->index + 1) & mask;
		coefs0 = (float32x4_t*)fir->coefs[(start_index + 0) & 3];
		coefs1 = (float32x4_t*)fir->coefs[(start_index + 1) & 3];
		coefs2 = (float32x4_t*)fir->coefs[(start_index + 2) & 3];
		coefs3 = (float32x4_t*)fir->coefs[(start_index + 3) & 3];
		acc0 = vmovq_n_f32(0.0f);
		acc1 = vmovq_n_f32(0.0f);
		acc2 = vmovq_n_f32(0.0f);
		acc3 = vmovq_n_f32(0.0f);
		j = startsimd;
		for (i = 0; i < lensimd; i++) {
			statereal = stater[j]; /* load once, reuse for all 4 phases */
#ifndef __FP_FAST_FMA
			prod0 = vmulq_f32(coefs0[i], statereal);
			acc0 = vaddq_f32(acc0, prod0);
			prod1 = vmulq_f32(coefs1[i], statereal);
			acc1 = vaddq_f32(acc1, prod1);
			prod2 = vmulq_f32(coefs2[i], statereal);
			acc2 = vaddq_f32(acc2, prod2);
			prod3 = vmulq_f32(coefs3[i], statereal);
			acc3 = vaddq_f32(acc3, prod3);
#else
			acc0 = vfmaq_f32(acc0, coefs0[i], statereal);
			acc1 = vfmaq_f32(acc1, coefs1[i], statereal);
			acc2 = vfmaq_f32(acc2, coefs2[i], statereal);
			acc3 = vfmaq_f32(acc3, coefs3[i], statereal);
#endif
			j = (j+1) & masksimd;
		}
		ofbuf->data[k+0] = (acc0[0] + acc0[1] + acc0[2] + acc0[3]) * ffactor;
		ofbuf->data[k+1] = (acc1[0] + acc1[1] + acc1[2] + acc1[3]) * ffactor;
		ofbuf->data[k+2] = (acc2[0] + acc2[1] + acc2[2] + acc2[3]) * ffactor;
		ofbuf->data[k+3] = (acc3[0] + acc3[1] + acc3[2] + acc3[3]) * ffactor;
		start_index = (start_index + 4) & mask;
		startsimd = start_index >> 2;
		k += 4;
	}
	/* Scalar epilogue for the remaining (< 4) outputs. */
	for (; k < obuflen; k++) {
		if ((k % mcounter) == 0)
			fir->stater[fir->index] = ifbuf->data[l++];
		else
			fir->stater[fir->index] = 0.0f;
		fir->index = (fir->index + 1) & mask;
		coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
		acc0 = vmovq_n_f32(0.0f);
		j = startsimd;
		for (i = 0; i < lensimd; i++) {
#ifndef __FP_FAST_FMA
			prod0 = vmulq_f32(coefs0[i], stater[j]);
			acc0 = vaddq_f32(acc0, prod0);
#else
			acc0 = vfmaq_f32(acc0, coefs0[i], stater[j]);
#endif
			j = (j+1) & masksimd;
		}
		ofbuf->data[k] = (acc0[0] + acc0[1] + acc0[2] + acc0[3]) * ffactor;
		start_index = (start_index + 1) & mask;
		startsimd = start_index >> 2;
	}

	return ofbuf;
}
1167
+
1168
+
1169
+
1170
/* Interpolate (upsample) a complex-valued buffer by a factor of
 * fir->max_counter through the FIR filter `fir` (ARM NEON implementation).
 *
 * Complex counterpart of pdlc_fir_filter_interpolate_float_buffer: the real
 * and imaginary parts are kept in separate circular state buffers
 * (fir->stater / fir->statei) and filtered with the same real coefficients.
 * One genuine input sample is pushed every `mcounter` outputs, zeros in
 * between (zero stuffing); each output is scaled by `ffactor`
 * (== max_counter) to restore the stuffed-away amplitude.
 *
 * fir:   filter state; stater, statei and index are advanced in place.
 * icbuf: complex input buffer (read only).
 * ocbuf: complex output buffer; allocated when NULL, resized when its length
 *        is not ibuflen * max_counter.  Returned to the caller.
 */
pdlc_complex_buffer_t* pdlc_fir_filter_interpolate_complex_buffer(pdlc_fir_filter_t* fir, const pdlc_complex_buffer_t *icbuf, pdlc_complex_buffer_t *ocbuf)
{
	const unsigned int nb_coefs = fir->nb_coefs;
	const unsigned int flt_len = fir->state_len;
	const unsigned int mask = fir->index_mask;       /* state index wraps with '&' (power-of-two length) */
	const unsigned int lensimd = fir->coef_len >> 2; /* coefficient length in 4-float NEON vectors */
	const unsigned int masksimd = mask >> 2;         /* wrap mask for vector-granular state indices */
	const size_t ibuflen = icbuf->length;
	const size_t obuflen = ibuflen*fir->max_counter; /* max_counter outputs per input sample */
	const float ffactor = (float)(fir->max_counter); /* gain compensation for zero stuffing */
	const size_t mcounter = fir->max_counter;
	unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask; /* oldest state sample in the window */
	unsigned int startsimd = start_index >> 2;
	unsigned int i, j;
	size_t k = 0, l = 0;                             /* k: output index, l: input index */
	register float32x4_t acc0r, acc1r, acc2r, acc3r;
	register float32x4_t acc0i, acc1i, acc2i, acc3i;
#ifndef __FP_FAST_FMA
	register float32x4_t prod0r, prod1r, prod2r, prod3r;
	register float32x4_t prod0i, prod1i, prod2i, prod3i;
#endif
	register float32x4_t statereal;
	register float32x4_t stateimag;
	const float32x4_t *coefs0, *coefs1, *coefs2, *coefs3;
	/* NOTE(review): casts assume 16-byte alignment of the state and
	 * coefficient arrays -- presumably guaranteed at allocation; verify. */
	float32x4_t *stater = (float32x4_t*)fir->stater;
	float32x4_t *statei = (float32x4_t*)fir->statei;

	if (!ocbuf)
		ocbuf = pdlc_complex_buffer_new(obuflen);
	else if (ocbuf->length != obuflen)
		pdlc_complex_buffer_resize(ocbuf, obuflen, 0);


	/* Scalar prologue until start_index is 4-aligned. */
	while ((start_index & 3) && k < obuflen) {
		if ((k % mcounter) == 0) {
			fir->stater[fir->index] = icbuf->data[l].real;
			fir->statei[fir->index] = icbuf->data[l].imag;
			l++;
		}
		else {
			fir->stater[fir->index] = 0.0f; /* stuffed zero */
			fir->statei[fir->index] = 0.0f;
		}
		fir->index = (fir->index + 1) & mask;
		coefs0 = (float32x4_t*)fir->coefs[start_index & 3]; /* phase-aligned coefficient copy */
		acc0r = vmovq_n_f32(0.0f);
		acc0i = vmovq_n_f32(0.0f);
		j = startsimd;
		for (i = 0; i < lensimd; i++) {
#ifndef __FP_FAST_FMA
			prod0r = vmulq_f32(coefs0[i], stater[j]);
			acc0r = vaddq_f32(acc0r, prod0r);
			prod0i = vmulq_f32(coefs0[i], statei[j]);
			acc0i = vaddq_f32(acc0i, prod0i);
#else
			acc0r = vfmaq_f32(acc0r, coefs0[i], stater[j]);
			acc0i = vfmaq_f32(acc0i, coefs0[i], statei[j]);
#endif
			j = (j+1) & masksimd;
		}
		/* Horizontal lane sums (GCC/Clang vector subscripting). */
		ocbuf->data[k].real = (acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3]) * ffactor;
		ocbuf->data[k].imag = (acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3]) * ffactor;
		start_index = (start_index + 1) & mask;
		startsimd = start_index >> 2;
		k++;
	}
	/* Main loop: push 4 (possibly zero-stuffed) samples, then compute 4
	 * outputs with the 4 phase coefficient sets in one shared state pass. */
	while (k + 4 <= obuflen) {
		if (((k+0) % mcounter) == 0) {
			fir->stater[fir->index] = icbuf->data[l].real;
			fir->statei[fir->index] = icbuf->data[l].imag;
			l++;
		}
		else {
			fir->stater[fir->index] = 0.0f;
			fir->statei[fir->index] = 0.0f;
		}
		fir->index = (fir->index + 1) & mask;
		if (((k+1) % mcounter) == 0) {
			fir->stater[fir->index] = icbuf->data[l].real;
			fir->statei[fir->index] = icbuf->data[l].imag;
			l++;
		}
		else {
			fir->stater[fir->index] = 0.0f;
			fir->statei[fir->index] = 0.0f;
		}
		fir->index = (fir->index + 1) & mask;
		if (((k+2) % mcounter) == 0) {
			fir->stater[fir->index] = icbuf->data[l].real;
			fir->statei[fir->index] = icbuf->data[l].imag;
			l++;
		}
		else {
			fir->stater[fir->index] = 0.0f;
			fir->statei[fir->index] = 0.0f;
		}
		fir->index = (fir->index + 1) & mask;
		if (((k+3) % mcounter) == 0) {
			fir->stater[fir->index] = icbuf->data[l].real;
			fir->statei[fir->index] = icbuf->data[l].imag;
			l++;
		}
		else {
			fir->stater[fir->index] = 0.0f;
			fir->statei[fir->index] = 0.0f;
		}
		fir->index = (fir->index + 1) & mask;
		coefs0 = (float32x4_t*)fir->coefs[(start_index + 0) & 3];
		coefs1 = (float32x4_t*)fir->coefs[(start_index + 1) & 3];
		coefs2 = (float32x4_t*)fir->coefs[(start_index + 2) & 3];
		coefs3 = (float32x4_t*)fir->coefs[(start_index + 3) & 3];
		acc0r = vmovq_n_f32(0.0f);
		acc0i = vmovq_n_f32(0.0f);
		acc1r = vmovq_n_f32(0.0f);
		acc1i = vmovq_n_f32(0.0f);
		acc2r = vmovq_n_f32(0.0f);
		acc2i = vmovq_n_f32(0.0f);
		acc3r = vmovq_n_f32(0.0f);
		acc3i = vmovq_n_f32(0.0f);
		j = startsimd;
		for (i = 0; i < lensimd; i++) {
			statereal = stater[j]; /* load once, reuse for all 4 phases */
			stateimag = statei[j];
#ifndef __FP_FAST_FMA
			prod0r = vmulq_f32(coefs0[i], statereal);
			acc0r = vaddq_f32(acc0r, prod0r);
			prod1r = vmulq_f32(coefs1[i], statereal);
			acc1r = vaddq_f32(acc1r, prod1r);
			prod2r = vmulq_f32(coefs2[i], statereal);
			acc2r = vaddq_f32(acc2r, prod2r);
			prod3r = vmulq_f32(coefs3[i], statereal);
			acc3r = vaddq_f32(acc3r, prod3r);
			prod0i = vmulq_f32(coefs0[i], stateimag);
			acc0i = vaddq_f32(acc0i, prod0i);
			prod1i = vmulq_f32(coefs1[i], stateimag);
			acc1i = vaddq_f32(acc1i, prod1i);
			prod2i = vmulq_f32(coefs2[i], stateimag);
			acc2i = vaddq_f32(acc2i, prod2i);
			prod3i = vmulq_f32(coefs3[i], stateimag);
			acc3i = vaddq_f32(acc3i, prod3i);
#else
			acc0r = vfmaq_f32(acc0r, coefs0[i], statereal);
			acc0i = vfmaq_f32(acc0i, coefs0[i], stateimag);
			acc1r = vfmaq_f32(acc1r, coefs1[i], statereal);
			acc1i = vfmaq_f32(acc1i, coefs1[i], stateimag);
			acc2r = vfmaq_f32(acc2r, coefs2[i], statereal);
			acc2i = vfmaq_f32(acc2i, coefs2[i], stateimag);
			acc3r = vfmaq_f32(acc3r, coefs3[i], statereal);
			acc3i = vfmaq_f32(acc3i, coefs3[i], stateimag);
#endif
			j = (j+1) & masksimd;
		}
		ocbuf->data[k+0].real = (acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3]) * ffactor;
		ocbuf->data[k+0].imag = (acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3]) * ffactor;
		ocbuf->data[k+1].real = (acc1r[0] + acc1r[1] + acc1r[2] + acc1r[3]) * ffactor;
		ocbuf->data[k+1].imag = (acc1i[0] + acc1i[1] + acc1i[2] + acc1i[3]) * ffactor;
		ocbuf->data[k+2].real = (acc2r[0] + acc2r[1] + acc2r[2] + acc2r[3]) * ffactor;
		ocbuf->data[k+2].imag = (acc2i[0] + acc2i[1] + acc2i[2] + acc2i[3]) * ffactor;
		ocbuf->data[k+3].real = (acc3r[0] + acc3r[1] + acc3r[2] + acc3r[3]) * ffactor;
		ocbuf->data[k+3].imag = (acc3i[0] + acc3i[1] + acc3i[2] + acc3i[3]) * ffactor;
		start_index = (start_index + 4) & mask;
		startsimd = start_index >> 2;
		k += 4;
	}
	/* Scalar epilogue for the remaining (< 4) outputs. */
	for (; k < obuflen; k++) {
		if ((k % mcounter) == 0) {
			fir->stater[fir->index] = icbuf->data[l].real;
			fir->statei[fir->index] = icbuf->data[l].imag;
			l++;
		}
		else {
			fir->stater[fir->index] = 0.0f;
			fir->statei[fir->index] = 0.0f;
		}
		fir->index = (fir->index + 1) & mask;
		coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
		acc0r = vmovq_n_f32(0.0f);
		acc0i = vmovq_n_f32(0.0f);
		j = startsimd;
		for (i = 0; i < lensimd; i++) {
#ifndef __FP_FAST_FMA
			prod0r = vmulq_f32(coefs0[i], stater[j]);
			acc0r = vaddq_f32(acc0r, prod0r);
			prod0i = vmulq_f32(coefs0[i], statei[j]);
			acc0i = vaddq_f32(acc0i, prod0i);
#else
			acc0r = vfmaq_f32(acc0r, coefs0[i], stater[j]);
			acc0i = vfmaq_f32(acc0i, coefs0[i], statei[j]);
#endif
			j = (j+1) & masksimd;
		}
		ocbuf->data[k].real = (acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3]) * ffactor;
		ocbuf->data[k].imag = (acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3]) * ffactor;
		start_index = (start_index + 1) & mask;
		startsimd = start_index >> 2;
	}

	return ocbuf;
}
1369
+
1370
+
1371
/* Decimate (downsample) a real-valued buffer by a factor of fir->max_counter
 * through the FIR filter `fir` (ARM NEON implementation).
 *
 * Every input sample is pushed into the circular state buffer, but a filtered
 * output is produced only when fir->counter wraps to 0, i.e. one output per
 * `mcounter` inputs.  fir->counter persists across calls so decimation stays
 * phase-coherent over successive buffers; the obuflen formula accounts for
 * the samples still owed from the previous call.
 *
 * fir:   filter state; stater, index and counter are advanced in place.
 * ifbuf: input buffer (read only).
 * ofbuf: output buffer; allocated when NULL, resized when its length does not
 *        match the computed obuflen.  Returned to the caller.
 *
 * The convolution uses two interleaved accumulators (even/odd vector
 * indices i0/i1) to break the FMA dependency chain, then folds acc1 into
 * acc0 before the horizontal sum.
 */
pdlc_buffer_t* pdlc_fir_filter_decimate_float_buffer(pdlc_fir_filter_t* fir, const pdlc_buffer_t *ifbuf, pdlc_buffer_t *ofbuf)
{
	const unsigned int nb_coefs = fir->nb_coefs;
	const unsigned int flt_len = fir->state_len;
	const unsigned int mask = fir->index_mask;       /* state index wraps with '&' (power-of-two length) */
	const unsigned int lensimd = fir->coef_len >> 2; /* coefficient length in 4-float NEON vectors */
	const unsigned int masksimd = mask >> 2;         /* wrap mask for vector-granular state indices */
	const int mcounter = fir->max_counter;
	const size_t ibuflen = ifbuf->length;
	/* Number of counter==0 hits in this buffer, given the phase carried in
	 * from the previous call. */
	const size_t obuflen = (size_t)ceil(((double)ibuflen - (double)((mcounter - fir->counter) % mcounter)) / (double)mcounter);
	unsigned int start_index = (flt_len + fir->index + fir->counter + 1 - nb_coefs) & mask; /* oldest state sample at the next output */
	unsigned int startsimd = start_index >> 2;
	unsigned int i0, i1, j0, j1;
	size_t k, l;                                     /* k: input index, l: output index */
	register float32x4_t acc0, acc1;
#ifndef __FP_FAST_FMA
	register float32x4_t prod0, prod1;
#endif
	const float32x4_t *coefs;
	/* NOTE(review): cast assumes 16-byte alignment of fir->stater --
	 * presumably guaranteed at allocation; verify. */
	float32x4_t *stater = (float32x4_t*)fir->stater;


	if (!ofbuf)
		ofbuf = pdlc_buffer_new(obuflen);
	else if (ofbuf->length != obuflen)
		pdlc_buffer_resize(ofbuf, obuflen, 0);


	for (k = 0, l = 0; k < ibuflen; k++) {
		fir->stater[fir->index] = ifbuf->data[k];
		fir->index = (fir->index + 1) & mask;
		if (fir->counter == 0) { /* emit one output every mcounter inputs */
			coefs = (float32x4_t*)fir->coefs[start_index & 3]; /* phase-aligned coefficient copy */
			acc0 = vmovq_n_f32(0.0f);
			acc1 = vmovq_n_f32(0.0f);
			j0 = startsimd;
			j1 = (startsimd+1) & masksimd;
			i0 = 0;
			i1 = 1;
			/* Even (i0/j0) and odd (i1/j1) vector indices accumulate in
			 * parallel while both remain in range. */
			while (i1 < lensimd) {
#ifndef __FP_FAST_FMA
				prod0 = vmulq_f32(coefs[i0], stater[j0]);
				acc0 = vaddq_f32(acc0, prod0);
				prod1 = vmulq_f32(coefs[i1], stater[j1]);
				acc1 = vaddq_f32(acc1, prod1);
#else
				acc0 = vfmaq_f32(acc0, coefs[i0], stater[j0]);
				acc1 = vfmaq_f32(acc1, coefs[i1], stater[j1]);
#endif
				i0 += 2;
				i1 += 2;
				j0 = (j0+2) & masksimd;
				j1 = (j1+2) & masksimd;
			}
			/* Possible leftover even index when lensimd is odd. */
			while (i0 < lensimd) {
#ifndef __FP_FAST_FMA
				prod0 = vmulq_f32(coefs[i0], stater[j0]);
				acc0 = vaddq_f32(acc0, prod0);
#else
				acc0 = vfmaq_f32(acc0, coefs[i0], stater[j0]);
#endif
				i0 += 2;
				j0 = (j0+2) & masksimd;
			}
			acc0 = vaddq_f32(acc0, acc1);
			/* Horizontal lane sum (GCC/Clang vector subscripting). */
			ofbuf->data[l++] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
			start_index = (start_index + mcounter) & mask; /* jump to the next kept sample position */
			startsimd = start_index >> 2;
		}
		fir->counter = (fir->counter + 1) % mcounter;
	}

	return ofbuf;
}
1445
+
1446
+
1447
/* Decimate (downsample) a complex-valued buffer by a factor of
 * fir->max_counter through the FIR filter `fir` (ARM NEON implementation).
 *
 * Complex counterpart of pdlc_fir_filter_decimate_float_buffer: real and
 * imaginary parts live in separate circular state buffers (fir->stater /
 * fir->statei) and are filtered with the same real coefficients.  Every
 * input sample is pushed into the state, but an output is produced only
 * when fir->counter wraps to 0 (one output per `mcounter` inputs).
 * fir->counter persists across calls, and obuflen accounts for the phase
 * carried in from the previous call.
 *
 * fir:   filter state; stater, statei, index and counter advance in place.
 * icbuf: complex input buffer (read only).
 * ocbuf: complex output buffer; allocated when NULL, resized when its length
 *        does not match the computed obuflen.  Returned to the caller.
 */
pdlc_complex_buffer_t* pdlc_fir_filter_decimate_complex_buffer(pdlc_fir_filter_t* fir, const pdlc_complex_buffer_t *icbuf, pdlc_complex_buffer_t *ocbuf)
{
	const unsigned int nb_coefs = fir->nb_coefs;
	const unsigned int flt_len = fir->state_len;
	const unsigned int mask = fir->index_mask;       /* state index wraps with '&' (power-of-two length) */
	const unsigned int lensimd = fir->coef_len >> 2; /* coefficient length in 4-float NEON vectors */
	const unsigned int masksimd = mask >> 2;         /* wrap mask for vector-granular state indices */
	const int mcounter = fir->max_counter;
	const size_t ibuflen = icbuf->length;
	/* Number of counter==0 hits in this buffer, given the carried-in phase. */
	const size_t obuflen = (size_t)ceil(((double)ibuflen - (double)((mcounter - fir->counter) % mcounter)) / (double)mcounter);
	unsigned int start_index = (flt_len + fir->index + fir->counter + 1 - nb_coefs) & mask; /* oldest state sample at the next output */
	unsigned int startsimd = start_index >> 2;
	unsigned int i0, j0, i1, j1;
	size_t k, l;                                     /* k: input index, l: output index */
	register float32x4_t acc0r, acc0i, acc1r, acc1i;
#ifndef __FP_FAST_FMA
	register float32x4_t prod0r, prod0i, prod1r, prod1i;
#endif
	const float32x4_t *coefs;
	/* NOTE(review): casts assume 16-byte alignment of the state arrays --
	 * presumably guaranteed at allocation; verify. */
	float32x4_t *stater = (float32x4_t*)fir->stater;
	float32x4_t *statei = (float32x4_t*)fir->statei;


	if (!ocbuf)
		ocbuf = pdlc_complex_buffer_new(obuflen);
	else if (ocbuf->length != obuflen)
		pdlc_complex_buffer_resize(ocbuf, obuflen, 0);


	for (k = 0, l = 0; k < ibuflen; k++) {
		fir->stater[fir->index] = icbuf->data[k].real;
		fir->statei[fir->index] = icbuf->data[k].imag;
		fir->index = (fir->index + 1) & mask;
		if (fir->counter == 0) { /* emit one output every mcounter inputs */
			coefs = (float32x4_t*)fir->coefs[start_index & 3]; /* phase-aligned coefficient copy */
			acc0r = vmovq_n_f32(0.0f);
			acc0i = vmovq_n_f32(0.0f);
			acc1r = vmovq_n_f32(0.0f);
			acc1i = vmovq_n_f32(0.0f);
			j0 = startsimd;
			j1 = (startsimd+1) & masksimd;
			i0 = 0;
			i1 = 1;
			/* Even (i0/j0) and odd (i1/j1) vector indices accumulate in
			 * parallel to break the FMA dependency chain. */
			while (i1 < lensimd) {
#ifndef __FP_FAST_FMA
				prod0r = vmulq_f32(coefs[i0], stater[j0]);
				acc0r = vaddq_f32(acc0r, prod0r);
				prod0i = vmulq_f32(coefs[i0], statei[j0]);
				acc0i = vaddq_f32(acc0i, prod0i);
				prod1r = vmulq_f32(coefs[i1], stater[j1]);
				acc1r = vaddq_f32(acc1r, prod1r);
				prod1i = vmulq_f32(coefs[i1], statei[j1]);
				acc1i = vaddq_f32(acc1i, prod1i);
#else
				acc0r = vfmaq_f32(acc0r, coefs[i0], stater[j0]);
				acc0i = vfmaq_f32(acc0i, coefs[i0], statei[j0]);
				acc1r = vfmaq_f32(acc1r, coefs[i1], stater[j1]);
				acc1i = vfmaq_f32(acc1i, coefs[i1], statei[j1]);
#endif
				i0 += 2;
				i1 += 2;
				j0 = (j0+2) & masksimd;
				j1 = (j1+2) & masksimd;
			}
			/* Possible leftover even index when lensimd is odd. */
			while (i0 < lensimd) {
#ifndef __FP_FAST_FMA
				prod0r = vmulq_f32(coefs[i0], stater[j0]);
				acc0r = vaddq_f32(acc0r, prod0r);
				prod0i = vmulq_f32(coefs[i0], statei[j0]);
				acc0i = vaddq_f32(acc0i, prod0i);
#else
				acc0r = vfmaq_f32(acc0r, coefs[i0], stater[j0]);
				acc0i = vfmaq_f32(acc0i, coefs[i0], statei[j0]);
#endif
				i0 += 2;
				j0 = (j0+2) & masksimd;
			}
			acc0r = vaddq_f32(acc0r, acc1r);
			acc0i = vaddq_f32(acc0i, acc1i);
			/* Horizontal lane sums (GCC/Clang vector subscripting). */
			ocbuf->data[l].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
			ocbuf->data[l].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
			l++;
			start_index = (start_index + mcounter) & mask; /* jump to the next kept sample position */
			startsimd = start_index >> 2;
		}
		fir->counter = (fir->counter + 1) % mcounter;
	}

	return ocbuf;
}
1537
+
1538
+
1539
+ pdlc_complex_buffer_t* pdlc_fir_filter_transform(pdlc_fir_filter_t* fir, const pdlc_buffer_t *ifbuf, pdlc_complex_buffer_t *ocbuf)
1540
+ {
1541
+ const unsigned int nb_coefs = fir->nb_coefs;
1542
+ const unsigned int flt_len = fir->state_len;
1543
+ const unsigned int mask = fir->index_mask;
1544
+ const unsigned int lensimd = fir->coef_len >> 2;
1545
+ const unsigned int masksimd = mask >> 2;
1546
+ const size_t ibuflen = ifbuf->length;
1547
+ unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask;
1548
+ unsigned int startsimd = start_index >> 2;
1549
+ unsigned int middle_index = (start_index + nb_coefs / 2) & mask;
1550
+ unsigned int i, j;
1551
+ size_t k = 0;
1552
+ register float32x4_t acc0, acc1, acc2, acc3;
1553
+ #ifndef __FP_FAST_FMA
1554
+ register float32x4_t prod0, prod1, prod2, prod3;
1555
+ #endif
1556
+ register float32x4_t statereal;
1557
+ const float32x4_t *coefs0, *coefs1, *coefs2, *coefs3;
1558
+ float32x4_t *stater = (float32x4_t*)fir->stater;
1559
+
1560
+
1561
+ if (!ocbuf)
1562
+ ocbuf = pdlc_complex_buffer_new(ibuflen);
1563
+ else if (ocbuf->length != ibuflen)
1564
+ pdlc_complex_buffer_resize(ocbuf, ibuflen, 0);
1565
+
1566
+
1567
+ if (nb_coefs & 1) {
1568
+ while ((start_index & 3) && k < ibuflen) {
1569
+ fir->stater[fir->index] = ifbuf->data[k];
1570
+ fir->index = (fir->index + 1) & mask;
1571
+ coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
1572
+ acc0 = vmovq_n_f32(0.0f);
1573
+ j = startsimd;
1574
+ for (i = 0; i < lensimd; i++) {
1575
+ #ifndef __FP_FAST_FMA
1576
+ prod0 = vmulq_f32(coefs0[i], stater[j]);
1577
+ acc0 = vaddq_f32(acc0, prod0);
1578
+ #else
1579
+ acc0 = vfmaq_f32(acc0, coefs0[i], stater[j]);
1580
+ #endif
1581
+ j = (j+1) & masksimd;
1582
+ }
1583
+ ocbuf->data[k].imag = acc0[0] + acc0[1] + acc0[2] + acc0[3];
1584
+ start_index = (start_index + 1) & mask;
1585
+ startsimd = start_index >> 2;
1586
+ ocbuf->data[k].real = fir->stater[middle_index];
1587
+ middle_index = (middle_index + 1) & mask;
1588
+ k++;
1589
+ }
1590
+ while (k + 4 <= ibuflen) {
1591
+ fir->stater[ fir->index ] = ifbuf->data[k + 0];
1592
+ fir->stater[(fir->index + 1) & mask] = ifbuf->data[k + 1];
1593
+ fir->stater[(fir->index + 2) & mask] = ifbuf->data[k + 2];
1594
+ fir->stater[(fir->index + 3) & mask] = ifbuf->data[k + 3];
1595
+ fir->index = (fir->index + 4) & mask;
1596
+ coefs0 = (float32x4_t*)fir->coefs[(start_index + 0) & 3];
1597
+ coefs1 = (float32x4_t*)fir->coefs[(start_index + 1) & 3];
1598
+ coefs2 = (float32x4_t*)fir->coefs[(start_index + 2) & 3];
1599
+ coefs3 = (float32x4_t*)fir->coefs[(start_index + 3) & 3];
1600
+ acc0 = vmovq_n_f32(0.0f);
1601
+ acc1 = vmovq_n_f32(0.0f);
1602
+ acc2 = vmovq_n_f32(0.0f);
1603
+ acc3 = vmovq_n_f32(0.0f);
1604
+ j = startsimd;
1605
+ for (i = 0; i < lensimd; i++) {
1606
+ statereal = stater[j];
1607
+ #ifndef __FP_FAST_FMA
1608
+ prod0 = vmulq_f32(coefs0[i], statereal);
1609
+ acc0 = vaddq_f32(acc0, prod0);
1610
+ prod1 = vmulq_f32(coefs1[i], statereal);
1611
+ acc1 = vaddq_f32(acc1, prod1);
1612
+ prod2 = vmulq_f32(coefs2[i], statereal);
1613
+ acc2 = vaddq_f32(acc2, prod2);
1614
+ prod3 = vmulq_f32(coefs3[i], statereal);
1615
+ acc3 = vaddq_f32(acc3, prod3);
1616
+ #else
1617
+ acc0 = vfmaq_f32(acc0, coefs0[i], statereal);
1618
+ acc1 = vfmaq_f32(acc1, coefs1[i], statereal);
1619
+ acc2 = vfmaq_f32(acc2, coefs2[i], statereal);
1620
+ acc3 = vfmaq_f32(acc3, coefs3[i], statereal);
1621
+ #endif
1622
+ j = (j+1) & masksimd;
1623
+ }
1624
+ ocbuf->data[k+0].imag = acc0[0] + acc0[1] + acc0[2] + acc0[3];
1625
+ ocbuf->data[k+1].imag = acc1[0] + acc1[1] + acc1[2] + acc1[3];
1626
+ ocbuf->data[k+2].imag = acc2[0] + acc2[1] + acc2[2] + acc2[3];
1627
+ ocbuf->data[k+3].imag = acc3[0] + acc3[1] + acc3[2] + acc3[3];
1628
+ start_index = (start_index + 4) & mask;
1629
+ startsimd = start_index >> 2;
1630
+ ocbuf->data[k].real = fir->stater[middle_index];
1631
+ middle_index = (middle_index + 1) & mask;
1632
+ k++;
1633
+ ocbuf->data[k].real = fir->stater[middle_index];
1634
+ middle_index = (middle_index + 1) & mask;
1635
+ k++;
1636
+ ocbuf->data[k].real = fir->stater[middle_index];
1637
+ middle_index = (middle_index + 1) & mask;
1638
+ k++;
1639
+ ocbuf->data[k].real = fir->stater[middle_index];
1640
+ middle_index = (middle_index + 1) & mask;
1641
+ k++;
1642
+ }
1643
+ for (; k < ibuflen; k++) {
1644
+ fir->stater[fir->index] = ifbuf->data[k];
1645
+ fir->index = (fir->index + 1) & mask;
1646
+ coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
1647
+ acc0 = vmovq_n_f32(0.0f);
1648
+ j = startsimd;
1649
+ for (i = 0; i < lensimd; i++) {
1650
+ #ifndef __FP_FAST_FMA
1651
+ prod0 = vmulq_f32(coefs0[i], stater[j]);
1652
+ acc0 = vaddq_f32(acc0, prod0);
1653
+ #else
1654
+ acc0 = vfmaq_f32(acc0, coefs0[i], stater[j]);
1655
+ #endif
1656
+ j = (j+1) & masksimd;
1657
+ }
1658
+ ocbuf->data[k].imag = acc0[0] + acc0[1] + acc0[2] + acc0[3];
1659
+ start_index = (start_index + 1) & mask;
1660
+ startsimd = start_index >> 2;
1661
+ ocbuf->data[k].real = fir->stater[middle_index];
1662
+ middle_index = (middle_index + 1) & mask;
1663
+ }
1664
+ }
1665
+ else {
1666
+ while ((start_index & 3) && k < ibuflen) {
1667
+ fir->stater[fir->index] = ifbuf->data[k];
1668
+ fir->index = (fir->index + 1) & mask;
1669
+ coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
1670
+ acc0 = vmovq_n_f32(0.0f);
1671
+ j = startsimd;
1672
+ for (i = 0; i < lensimd; i++) {
1673
+ #ifndef __FP_FAST_FMA
1674
+ prod0 = vmulq_f32(coefs0[i], stater[j]);
1675
+ acc0 = vaddq_f32(acc0, prod0);
1676
+ #else
1677
+ acc0 = vfmaq_f32(acc0, coefs0[i], stater[j]);
1678
+ #endif
1679
+ j = (j+1) & masksimd;
1680
+ }
1681
+ ocbuf->data[k].imag = acc0[0] + acc0[1] + acc0[2] + acc0[3];
1682
+ start_index = (start_index + 1) & mask;
1683
+ startsimd = start_index >> 2;
1684
+ ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1685
+ middle_index = (middle_index + 1) & mask;
1686
+ k++;
1687
+ }
1688
+ while (k + 4 <= ibuflen) {
1689
+ fir->stater[ fir->index ] = ifbuf->data[k + 0];
1690
+ fir->stater[(fir->index + 1) & mask] = ifbuf->data[k + 1];
1691
+ fir->stater[(fir->index + 2) & mask] = ifbuf->data[k + 2];
1692
+ fir->stater[(fir->index + 3) & mask] = ifbuf->data[k + 3];
1693
+ fir->index = (fir->index + 4) & mask;
1694
+ coefs0 = (float32x4_t*)fir->coefs[(start_index + 0) & 3];
1695
+ coefs1 = (float32x4_t*)fir->coefs[(start_index + 1) & 3];
1696
+ coefs2 = (float32x4_t*)fir->coefs[(start_index + 2) & 3];
1697
+ coefs3 = (float32x4_t*)fir->coefs[(start_index + 3) & 3];
1698
+ acc0 = vmovq_n_f32(0.0f);
1699
+ acc1 = vmovq_n_f32(0.0f);
1700
+ acc2 = vmovq_n_f32(0.0f);
1701
+ acc3 = vmovq_n_f32(0.0f);
1702
+ j = startsimd;
1703
+ for (i = 0; i < lensimd; i++) {
1704
+ statereal = stater[j];
1705
+ #ifndef __FP_FAST_FMA
1706
+ prod0 = vmulq_f32(coefs0[i], statereal);
1707
+ acc0 = vaddq_f32(acc0, prod0);
1708
+ prod1 = vmulq_f32(coefs1[i], statereal);
1709
+ acc1 = vaddq_f32(acc1, prod1);
1710
+ prod2 = vmulq_f32(coefs2[i], statereal);
1711
+ acc2 = vaddq_f32(acc2, prod2);
1712
+ prod3 = vmulq_f32(coefs3[i], statereal);
1713
+ acc3 = vaddq_f32(acc3, prod3);
1714
+ #else
1715
+ acc0 = vfmaq_f32(acc0, coefs0[i], statereal);
1716
+ acc1 = vfmaq_f32(acc1, coefs1[i], statereal);
1717
+ acc2 = vfmaq_f32(acc2, coefs2[i], statereal);
1718
+ acc3 = vfmaq_f32(acc3, coefs3[i], statereal);
1719
+ #endif
1720
+ j = (j+1) & masksimd;
1721
+ }
1722
+ ocbuf->data[k+0].imag = acc0[0] + acc0[1] + acc0[2] + acc0[3];
1723
+ ocbuf->data[k+1].imag = acc1[0] + acc1[1] + acc1[2] + acc1[3];
1724
+ ocbuf->data[k+2].imag = acc2[0] + acc2[1] + acc2[2] + acc2[3];
1725
+ ocbuf->data[k+3].imag = acc3[0] + acc3[1] + acc3[2] + acc3[3];
1726
+ start_index = (start_index + 4) & mask;
1727
+ startsimd = start_index >> 2;
1728
+ ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1729
+ middle_index = (middle_index + 1) & mask;
1730
+ k++;
1731
+ ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1732
+ middle_index = (middle_index + 1) & mask;
1733
+ k++;
1734
+ ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1735
+ middle_index = (middle_index + 1) & mask;
1736
+ k++;
1737
+ ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1738
+ middle_index = (middle_index + 1) & mask;
1739
+ k++;
1740
+ }
1741
+ for (; k < ibuflen; k++) {
1742
+ fir->stater[fir->index] = ifbuf->data[k];
1743
+ fir->index = (fir->index + 1) & mask;
1744
+ coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
1745
+ acc0 = vmovq_n_f32(0.0f);
1746
+ j = startsimd;
1747
+ for (i = 0; i < lensimd; i++) {
1748
+ #ifndef __FP_FAST_FMA
1749
+ prod0 = vmulq_f32(coefs0[i], stater[j]);
1750
+ acc0 = vaddq_f32(acc0, prod0);
1751
+ #else
1752
+ acc0 = vfmaq_f32(acc0, coefs0[i], stater[j]);
1753
+ #endif
1754
+ j = (j+1) & masksimd;
1755
+ }
1756
+ ocbuf->data[k].imag = acc0[0] + acc0[1] + acc0[2] + acc0[3];
1757
+ start_index = (start_index + 1) & mask;
1758
+ startsimd = start_index >> 2;
1759
+ ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1760
+ middle_index = (middle_index + 1) & mask;
1761
+ }
1762
+ }
1763
+
1764
+ return ocbuf;
1765
+ }
1766
+
1767
+