paddlec 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1767 @@
1
+ /* Copyright (C) 2019 Théotime Bollengier <theotime.bollengier@gmail.com>
2
+ *
3
+ * This file is part of PaddleC
4
+ *
5
+ * PaddleC is free software: you can redistribute it and/or modify
6
+ * it under the terms of the GNU General Public License as published by
7
+ * the Free Software Foundation, either version 3 of the License, or
8
+ * (at your option) any later version.
9
+ *
10
+ * PaddleC is distributed in the hope that it will be useful,
11
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ * GNU General Public License for more details.
14
+ *
15
+ * You should have received a copy of the GNU General Public License
16
+ * along with PaddleC. If not, see <https://www.gnu.org/licenses/>.
17
+ */
18
+
19
+ #include <arm_neon.h>
20
+
21
+
22
+ void pdlc_fir_filter_inspect(pdlc_fir_filter_t* fir)
23
+ {
24
+ size_t i, j;
25
+
26
+ printf("nb_coefs: %u, state_len: %u, coef_len: %u, index_mask: 0x%x, index: %u\n",
27
+ fir->nb_coefs, fir->state_len, fir->coef_len, fir->index_mask, fir->index);
28
+ printf("state: [%.7g", fir->stater[0]);
29
+ for (i = 1; i < fir->state_len; i++)
30
+ printf(", %.7g", fir->stater[i]);
31
+ printf("]\n");
32
+ for (j = 0; j < 4; j++) {
33
+ printf("coefs: {%u}[%.7g", j, fir->coefs[j][0]);
34
+ for (i = 1; i < fir->coef_len; i++)
35
+ printf(", %.7g", fir->coefs[j][i]);
36
+ printf("]\n");
37
+ }
38
+ }
39
+
40
+
41
+ void pdlc_fir_filter_initialize(pdlc_fir_filter_t* fir, int order)
42
+ {
43
+ int i, r;
44
+
45
+ if (fir->coefs) {
46
+ for (i = 0; i < 4; i++)
47
+ if (fir->coefs[i])
48
+ free(fir->coefs[i]);
49
+ free(fir->coefs);
50
+ fir->coefs = NULL;
51
+ }
52
+
53
+ if (fir->stater)
54
+ free(fir->stater);
55
+ fir->stater = NULL;
56
+
57
+ if (fir->statei)
58
+ free(fir->statei);
59
+ fir->statei = NULL;
60
+
61
+ fir->nb_coefs = 0;
62
+ fir->state_len = 0;
63
+ fir->coef_len = 0;
64
+ fir->index = 0;
65
+ fir->index_mask = 0;
66
+ fir->counter = 0;
67
+ fir->max_counter = 1;
68
+
69
+ if (order < 0)
70
+ return;
71
+
72
+ if (order > 67108863) {
73
+ fprintf(stderr, "ERROR: libpaddlec: Filter order cannot be greater than 67108864\n");
74
+ exit(EXIT_FAILURE);
75
+ }
76
+
77
+ fir->nb_coefs = (unsigned int)(order + 1);
78
+ fir->coef_len = ((fir->nb_coefs + 3 + 3) >> 2) << 2;
79
+ fir->state_len = (unsigned int)(pow(2.0, ceil(log2(fir->coef_len))));
80
+ fir->index = 0;
81
+ fir->index_mask = fir->state_len - 1;
82
+
83
+ fir->coefs = malloc(4*sizeof(float*));
84
+ if (fir->coefs == NULL) {
85
+ fprintf(stderr, "ERROR: libpaddlec: Cannot allocate %u bytes for FIR!\n", 4 * sizeof(float*));
86
+ exit(EXIT_FAILURE);
87
+ }
88
+
89
+ for (i = 0; i < 4; i++) {
90
+ r = posix_memalign((void**)&fir->coefs[i], sizeof(float32x4_t), fir->coef_len * sizeof(float));
91
+ if (r) {
92
+ fprintf(stderr, "ERROR: libpaddlec: Cannot allocate %u bytes for FIR!\n", fir->coef_len * sizeof(float));
93
+ exit(EXIT_FAILURE);
94
+ }
95
+ }
96
+
97
+ r = posix_memalign((void**)&fir->stater, sizeof(float32x4_t), fir->state_len * sizeof(float));
98
+ if (r) {
99
+ fprintf(stderr, "ERROR: libpaddlec: Cannot allocate %u bytes for FIR!\n", fir->state_len * sizeof(float));
100
+ exit(EXIT_FAILURE);
101
+ }
102
+
103
+ r = posix_memalign((void**)&fir->statei, sizeof(float32x4_t), fir->state_len * sizeof(float));
104
+ if (r) {
105
+ fprintf(stderr, "ERROR: libpaddlec: Cannot allocate %u bytes for FIR!\n", fir->state_len * sizeof(float));
106
+ exit(EXIT_FAILURE);
107
+ }
108
+
109
+ memset(fir->stater, 0, fir->state_len * sizeof(float));
110
+ memset(fir->statei, 0, fir->state_len * sizeof(float));
111
+ for (i = 0; i < 4; i++)
112
+ memset(fir->coefs[i], 0, fir->coef_len * sizeof(float));
113
+ }
114
+
115
+
116
+ void pdlc_fir_filter_free(pdlc_fir_filter_t* fir)
117
+ {
118
+ int i;
119
+
120
+ if (!fir)
121
+ return;
122
+
123
+ if (fir->coefs) {
124
+ for (i = 0; i < 4; i++)
125
+ if (fir->coefs[i])
126
+ free(fir->coefs[i]);
127
+ free(fir->coefs);
128
+ }
129
+
130
+ if (fir->stater)
131
+ free(fir->stater);
132
+
133
+ if (fir->statei)
134
+ free(fir->statei);
135
+
136
+ free(fir);
137
+ }
138
+
139
+
140
+ size_t pdlc_fir_filter_size(pdlc_fir_filter_t* fir)
141
+ {
142
+ size_t res;
143
+
144
+ res = sizeof(pdlc_fir_filter_t);
145
+ res += sizeof(float*)* 4;
146
+ res += sizeof(float) * fir->state_len * 2;
147
+ res += sizeof(float) * fir->coef_len * 4;
148
+
149
+ return res;
150
+ }
151
+
152
+
153
+ int pdlc_fir_filter_set_coef_at(pdlc_fir_filter_t* fir, int index, float value)
154
+ {
155
+ int i;
156
+
157
+ if (index < 0 || index >= (int)fir->nb_coefs)
158
+ return -1;
159
+
160
+ for (i = 0; i < 4; i++)
161
+ fir->coefs[i][(fir->nb_coefs - 1 - index + i) % fir->coef_len] = value;
162
+
163
+ return 0;
164
+ }
165
+
166
+
167
+ float pdlc_fir_filter_filter_float(pdlc_fir_filter_t* fir, float sample, float *delayed)
168
+ {
169
+ const unsigned int nb_coefs = fir->nb_coefs;
170
+ const unsigned int flt_len = fir->state_len;
171
+ const unsigned int mask = fir->index_mask;
172
+ const unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask;
173
+ const unsigned int middle_index = (start_index + nb_coefs / 2) & mask;
174
+ const unsigned int lensimd = fir->coef_len >> 2;
175
+ const unsigned int startsimd = start_index >> 2;
176
+ const unsigned int masksimd = mask >> 2;
177
+ unsigned int i, j;
178
+ register float32x4_t acc;
179
+ #ifndef __FP_FAST_FMA
180
+ register float32x4_t prod;
181
+ #endif
182
+ const float32x4_t *coefs = (float32x4_t*)fir->coefs[start_index & 3];
183
+ float32x4_t *stater = (float32x4_t*)fir->stater;
184
+
185
+ fir->stater[fir->index] = sample;
186
+ fir->index = (fir->index + 1) & mask;
187
+
188
+ if (delayed) {
189
+ if (nb_coefs & 1)
190
+ *delayed = fir->stater[middle_index];
191
+ else
192
+ *delayed = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
193
+ }
194
+
195
+ acc = vmovq_n_f32(0.0f);
196
+ j = startsimd;
197
+ for (i = 0; i < lensimd; i++) {
198
+ #ifndef __FP_FAST_FMA
199
+ prod = vmulq_f32(coefs[i], stater[j]);
200
+ acc = vaddq_f32(acc, prod);
201
+ #else
202
+ acc = vfmaq_f32(acc, coefs[i], stater[j]);
203
+ #endif
204
+ j = (j+1) & masksimd;
205
+ }
206
+
207
+ return acc[0] + acc[1] + acc[2] + acc[3];
208
+ }
209
+
210
+
211
+ pdlc_complex_t pdlc_fir_filter_filter_complex(pdlc_fir_filter_t* fir, pdlc_complex_t sample, pdlc_complex_t *delayed)
212
+ {
213
+ const unsigned int nb_coefs = fir->nb_coefs;
214
+ const unsigned int flt_len = fir->state_len;
215
+ const unsigned int mask = fir->index_mask;
216
+ const unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask;
217
+ const unsigned int middle_index = (start_index + nb_coefs / 2) & mask;
218
+ const unsigned int lensimd = fir->coef_len >> 2;
219
+ const unsigned int startsimd = start_index >> 2;
220
+ const unsigned int masksimd = mask >> 2;
221
+ unsigned int i, j;
222
+ pdlc_complex_t res = {0.0f, 0.0f};
223
+ register float32x4_t accr, acci;
224
+ #ifndef __FP_FAST_FMA
225
+ register float32x4_t prodr, prodi;
226
+ #endif
227
+ const float32x4_t *coefs = (float32x4_t*)fir->coefs[start_index & 3];
228
+ float32x4_t *stater = (float32x4_t*)fir->stater;
229
+ float32x4_t *statei = (float32x4_t*)fir->statei;
230
+
231
+ fir->stater[fir->index] = sample.real;
232
+ fir->statei[fir->index] = sample.imag;
233
+ fir->index = (fir->index + 1) & mask;
234
+
235
+ if (delayed) {
236
+ if (nb_coefs & 1) {
237
+ delayed->real = fir->stater[middle_index];
238
+ delayed->imag = fir->statei[middle_index];
239
+ }
240
+ else {
241
+ delayed->real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
242
+ delayed->imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
243
+ }
244
+ }
245
+
246
+ accr = vmovq_n_f32(0.0f);
247
+ acci = vmovq_n_f32(0.0f);
248
+ j = startsimd;
249
+ for (i = 0; i < lensimd; i++) {
250
+ #ifndef __FP_FAST_FMA
251
+ prodr = vmulq_f32(coefs[i], stater[j]);
252
+ prodi = vmulq_f32(coefs[i], statei[j]);
253
+ accr = vaddq_f32(accr, prodr);
254
+ acci = vaddq_f32(acci, prodi);
255
+ #else
256
+ accr = vfmaq_f32(accr, coefs[i], stater[j]);
257
+ acci = vfmaq_f32(acci, coefs[i], statei[j]);
258
+ #endif
259
+ j = (j+1) & masksimd;
260
+ }
261
+ res.real = accr[0] + accr[1] + accr[2] + accr[3];
262
+ res.imag = acci[0] + acci[1] + acci[2] + acci[3];
263
+
264
+ return res;
265
+ }
266
+
267
+
268
+ pdlc_buffer_t* pdlc_fir_filter_filter_float_buffer(pdlc_fir_filter_t* fir, const pdlc_buffer_t *ifbuf, pdlc_buffer_t *ofbuf, pdlc_buffer_t *delayed)
269
+ {
270
+ const unsigned int nb_coefs = fir->nb_coefs;
271
+ const unsigned int flt_len = fir->state_len;
272
+ const unsigned int mask = fir->index_mask;
273
+ const unsigned int lensimd = fir->coef_len >> 2;
274
+ const unsigned int masksimd = mask >> 2;
275
+ const size_t ibuflen = ifbuf->length;
276
+ unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask;
277
+ unsigned int startsimd = start_index >> 2;
278
+ unsigned int middle_index;
279
+ unsigned int i, j;
280
+ size_t k;
281
+ register float32x4_t acc0, acc1, acc2, acc3;
282
+ #ifndef __FP_FAST_FMA
283
+ register float32x4_t prod0, prod1, prod2, prod3;
284
+ #endif
285
+ register float32x4_t statereal;
286
+ const float32x4_t *coefs0, *coefs1, *coefs2, *coefs3;
287
+ float32x4_t *stater = (float32x4_t*)fir->stater;
288
+
289
+ if (!ofbuf)
290
+ ofbuf = pdlc_buffer_new(ibuflen);
291
+ else if (ofbuf->length != ibuflen)
292
+ pdlc_buffer_resize(ofbuf, ibuflen, 0);
293
+
294
+ if (delayed) {
295
+ if (delayed->length != ibuflen)
296
+ pdlc_buffer_resize(delayed, ibuflen, 0);
297
+ middle_index = (start_index + nb_coefs / 2) & mask;
298
+ if (nb_coefs & 1) {
299
+ k = 0;
300
+ while ((start_index & 3) && k < ibuflen) {
301
+ fir->stater[fir->index] = ifbuf->data[k];
302
+ fir->index = (fir->index + 1) & mask;
303
+ coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
304
+ acc0 = vmovq_n_f32(0.0f);
305
+ j = startsimd;
306
+ for (i = 0; i < lensimd; i++) {
307
+ #ifndef __FP_FAST_FMA
308
+ prod0 = vmulq_f32(coefs0[i], stater[j]);
309
+ acc0 = vaddq_f32(acc0, prod0);
310
+ #else
311
+ acc0 = vfmaq_f32(acc0, coefs0[i], stater[j]);
312
+ #endif
313
+ j = (j+1) & masksimd;
314
+ }
315
+ ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
316
+ start_index = (start_index + 1) & mask;
317
+ startsimd = start_index >> 2;
318
+ delayed->data[k] = fir->stater[middle_index];
319
+ middle_index = (middle_index + 1) & mask;
320
+ k++;
321
+ }
322
+ while (k + 4 <= ibuflen) {
323
+ fir->stater[ fir->index ] = ifbuf->data[k + 0];
324
+ fir->stater[(fir->index + 1) & mask] = ifbuf->data[k + 1];
325
+ fir->stater[(fir->index + 2) & mask] = ifbuf->data[k + 2];
326
+ fir->stater[(fir->index + 3) & mask] = ifbuf->data[k + 3];
327
+ fir->index = (fir->index + 4) & mask;
328
+ coefs0 = (float32x4_t*)fir->coefs[(start_index + 0) & 3];
329
+ coefs1 = (float32x4_t*)fir->coefs[(start_index + 1) & 3];
330
+ coefs2 = (float32x4_t*)fir->coefs[(start_index + 2) & 3];
331
+ coefs3 = (float32x4_t*)fir->coefs[(start_index + 3) & 3];
332
+ acc0 = vmovq_n_f32(0.0f);
333
+ acc1 = vmovq_n_f32(0.0f);
334
+ acc2 = vmovq_n_f32(0.0f);
335
+ acc3 = vmovq_n_f32(0.0f);
336
+ j = startsimd;
337
+ for (i = 0; i < lensimd; i++) {
338
+ statereal = stater[j];
339
+ #ifndef __FP_FAST_FMA
340
+ prod0 = vmulq_f32(coefs0[i], statereal);
341
+ acc0 = vaddq_f32(acc0, prod0);
342
+ prod1 = vmulq_f32(coefs1[i], statereal);
343
+ acc1 = vaddq_f32(acc1, prod1);
344
+ prod2 = vmulq_f32(coefs2[i], statereal);
345
+ acc2 = vaddq_f32(acc2, prod2);
346
+ prod3 = vmulq_f32(coefs3[i], statereal);
347
+ acc3 = vaddq_f32(acc3, prod3);
348
+ #else
349
+ acc0 = vfmaq_f32(acc0, coefs0[i], statereal);
350
+ acc1 = vfmaq_f32(acc1, coefs1[i], statereal);
351
+ acc2 = vfmaq_f32(acc2, coefs2[i], statereal);
352
+ acc3 = vfmaq_f32(acc3, coefs3[i], statereal);
353
+ #endif
354
+ j = (j+1) & masksimd;
355
+ }
356
+ ofbuf->data[k+0] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
357
+ ofbuf->data[k+1] = acc1[0] + acc1[1] + acc1[2] + acc1[3];
358
+ ofbuf->data[k+2] = acc2[0] + acc2[1] + acc2[2] + acc2[3];
359
+ ofbuf->data[k+3] = acc3[0] + acc3[1] + acc3[2] + acc3[3];
360
+ start_index = (start_index + 4) & mask;
361
+ startsimd = start_index >> 2;
362
+ delayed->data[k] = fir->stater[middle_index];
363
+ middle_index = (middle_index + 1) & mask;
364
+ k++;
365
+ delayed->data[k] = fir->stater[middle_index];
366
+ middle_index = (middle_index + 1) & mask;
367
+ k++;
368
+ delayed->data[k] = fir->stater[middle_index];
369
+ middle_index = (middle_index + 1) & mask;
370
+ k++;
371
+ delayed->data[k] = fir->stater[middle_index];
372
+ middle_index = (middle_index + 1) & mask;
373
+ k++;
374
+ }
375
+ for (; k < ibuflen; k++) {
376
+ fir->stater[fir->index] = ifbuf->data[k];
377
+ fir->index = (fir->index + 1) & mask;
378
+ coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
379
+ acc0 = vmovq_n_f32(0.0f);
380
+ j = startsimd;
381
+ for (i = 0; i < lensimd; i++) {
382
+ #ifndef __FP_FAST_FMA
383
+ prod0 = vmulq_f32(coefs0[i], stater[j]);
384
+ acc0 = vaddq_f32(acc0, prod0);
385
+ #else
386
+ acc0 = vfmaq_f32(acc0, coefs0[i], stater[j]);
387
+ #endif
388
+ j = (j+1) & masksimd;
389
+ }
390
+ ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
391
+ start_index = (start_index + 1) & mask;
392
+ startsimd = start_index >> 2;
393
+ delayed->data[k] = fir->stater[middle_index];
394
+ middle_index = (middle_index + 1) & mask;
395
+ }
396
+ }
397
+ else {
398
+ k = 0;
399
+ while ((start_index & 3) && k < ibuflen) {
400
+ fir->stater[fir->index] = ifbuf->data[k];
401
+ fir->index = (fir->index + 1) & mask;
402
+ coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
403
+ acc0 = vmovq_n_f32(0.0f);
404
+ j = startsimd;
405
+ for (i = 0; i < lensimd; i++) {
406
+ #ifndef __FP_FAST_FMA
407
+ prod0 = vmulq_f32(coefs0[i], stater[j]);
408
+ acc0 = vaddq_f32(acc0, prod0);
409
+ #else
410
+ acc0 = vfmaq_f32(acc0, coefs0[i], stater[j]);
411
+ #endif
412
+ j = (j+1) & masksimd;
413
+ }
414
+ ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
415
+ start_index = (start_index + 1) & mask;
416
+ startsimd = start_index >> 2;
417
+ delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
418
+ middle_index = (middle_index + 1) & mask;
419
+ k++;
420
+ }
421
+ while (k + 4 <= ibuflen) {
422
+ fir->stater[ fir->index ] = ifbuf->data[k + 0];
423
+ fir->stater[(fir->index + 1) & mask] = ifbuf->data[k + 1];
424
+ fir->stater[(fir->index + 2) & mask] = ifbuf->data[k + 2];
425
+ fir->stater[(fir->index + 3) & mask] = ifbuf->data[k + 3];
426
+ fir->index = (fir->index + 4) & mask;
427
+ coefs0 = (float32x4_t*)fir->coefs[(start_index + 0) & 3];
428
+ coefs1 = (float32x4_t*)fir->coefs[(start_index + 1) & 3];
429
+ coefs2 = (float32x4_t*)fir->coefs[(start_index + 2) & 3];
430
+ coefs3 = (float32x4_t*)fir->coefs[(start_index + 3) & 3];
431
+ acc0 = vmovq_n_f32(0.0f);
432
+ acc1 = vmovq_n_f32(0.0f);
433
+ acc2 = vmovq_n_f32(0.0f);
434
+ acc3 = vmovq_n_f32(0.0f);
435
+ j = startsimd;
436
+ for (i = 0; i < lensimd; i++) {
437
+ statereal = stater[j];
438
+ #ifndef __FP_FAST_FMA
439
+ prod0 = vmulq_f32(coefs0[i], statereal);
440
+ acc0 = vaddq_f32(acc0, prod0);
441
+ prod1 = vmulq_f32(coefs1[i], statereal);
442
+ acc1 = vaddq_f32(acc1, prod1);
443
+ prod2 = vmulq_f32(coefs2[i], statereal);
444
+ acc2 = vaddq_f32(acc2, prod2);
445
+ prod3 = vmulq_f32(coefs3[i], statereal);
446
+ acc3 = vaddq_f32(acc3, prod3);
447
+ #else
448
+ acc0 = vfmaq_f32(acc0, coefs0[i], statereal);
449
+ acc1 = vfmaq_f32(acc1, coefs1[i], statereal);
450
+ acc2 = vfmaq_f32(acc2, coefs2[i], statereal);
451
+ acc3 = vfmaq_f32(acc3, coefs3[i], statereal);
452
+ #endif
453
+ j = (j+1) & masksimd;
454
+ }
455
+ ofbuf->data[k+0] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
456
+ ofbuf->data[k+1] = acc1[0] + acc1[1] + acc1[2] + acc1[3];
457
+ ofbuf->data[k+2] = acc2[0] + acc2[1] + acc2[2] + acc2[3];
458
+ ofbuf->data[k+3] = acc3[0] + acc3[1] + acc3[2] + acc3[3];
459
+ start_index = (start_index + 4) & mask;
460
+ startsimd = start_index >> 2;
461
+ delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
462
+ middle_index = (middle_index + 1) & mask;
463
+ k++;
464
+ delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
465
+ middle_index = (middle_index + 1) & mask;
466
+ k++;
467
+ delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
468
+ middle_index = (middle_index + 1) & mask;
469
+ k++;
470
+ delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
471
+ middle_index = (middle_index + 1) & mask;
472
+ k++;
473
+ }
474
+ for (; k < ibuflen; k++) {
475
+ fir->stater[fir->index] = ifbuf->data[k];
476
+ fir->index = (fir->index + 1) & mask;
477
+ coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
478
+ acc0 = vmovq_n_f32(0.0f);
479
+ j = startsimd;
480
+ for (i = 0; i < lensimd; i++) {
481
+ #ifndef __FP_FAST_FMA
482
+ prod0 = vmulq_f32(coefs0[i], stater[j]);
483
+ acc0 = vaddq_f32(acc0, prod0);
484
+ #else
485
+ acc0 = vfmaq_f32(acc0, coefs0[i], stater[j]);
486
+ #endif
487
+ j = (j+1) & masksimd;
488
+ }
489
+ ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
490
+ start_index = (start_index + 1) & mask;
491
+ startsimd = start_index >> 2;
492
+ delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
493
+ middle_index = (middle_index + 1) & mask;
494
+ }
495
+ }
496
+ }
497
+ else {
498
+ k = 0;
499
+ while ((start_index & 3) && k < ibuflen) {
500
+ fir->stater[fir->index] = ifbuf->data[k];
501
+ fir->index = (fir->index + 1) & mask;
502
+ coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
503
+ acc0 = vmovq_n_f32(0.0f);
504
+ j = startsimd;
505
+ for (i = 0; i < lensimd; i++) {
506
+ #ifndef __FP_FAST_FMA
507
+ prod0 = vmulq_f32(coefs0[i], stater[j]);
508
+ acc0 = vaddq_f32(acc0, prod0);
509
+ #else
510
+ acc0 = vfmaq_f32(acc0, coefs0[i], stater[j]);
511
+ #endif
512
+ j = (j+1) & masksimd;
513
+ }
514
+ ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
515
+ start_index = (start_index + 1) & mask;
516
+ startsimd = start_index >> 2;
517
+ k++;
518
+ }
519
+ while (k + 4 <= ibuflen) {
520
+ fir->stater[ fir->index ] = ifbuf->data[k + 0];
521
+ fir->stater[(fir->index + 1) & mask] = ifbuf->data[k + 1];
522
+ fir->stater[(fir->index + 2) & mask] = ifbuf->data[k + 2];
523
+ fir->stater[(fir->index + 3) & mask] = ifbuf->data[k + 3];
524
+ fir->index = (fir->index + 4) & mask;
525
+ coefs0 = (float32x4_t*)fir->coefs[(start_index + 0) & 3];
526
+ coefs1 = (float32x4_t*)fir->coefs[(start_index + 1) & 3];
527
+ coefs2 = (float32x4_t*)fir->coefs[(start_index + 2) & 3];
528
+ coefs3 = (float32x4_t*)fir->coefs[(start_index + 3) & 3];
529
+ acc0 = vmovq_n_f32(0.0f);
530
+ acc1 = vmovq_n_f32(0.0f);
531
+ acc2 = vmovq_n_f32(0.0f);
532
+ acc3 = vmovq_n_f32(0.0f);
533
+ j = startsimd;
534
+ for (i = 0; i < lensimd; i++) {
535
+ statereal = stater[j];
536
+ #ifndef __FP_FAST_FMA
537
+ prod0 = vmulq_f32(coefs0[i], statereal);
538
+ acc0 = vaddq_f32(acc0, prod0);
539
+ prod1 = vmulq_f32(coefs1[i], statereal);
540
+ acc1 = vaddq_f32(acc1, prod1);
541
+ prod2 = vmulq_f32(coefs2[i], statereal);
542
+ acc2 = vaddq_f32(acc2, prod2);
543
+ prod3 = vmulq_f32(coefs3[i], statereal);
544
+ acc3 = vaddq_f32(acc3, prod3);
545
+ #else
546
+ acc0 = vfmaq_f32(acc0, coefs0[i], statereal);
547
+ acc1 = vfmaq_f32(acc1, coefs1[i], statereal);
548
+ acc2 = vfmaq_f32(acc2, coefs2[i], statereal);
549
+ acc3 = vfmaq_f32(acc3, coefs3[i], statereal);
550
+ #endif
551
+ j = (j+1) & masksimd;
552
+ }
553
+ ofbuf->data[k+0] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
554
+ ofbuf->data[k+1] = acc1[0] + acc1[1] + acc1[2] + acc1[3];
555
+ ofbuf->data[k+2] = acc2[0] + acc2[1] + acc2[2] + acc2[3];
556
+ ofbuf->data[k+3] = acc3[0] + acc3[1] + acc3[2] + acc3[3];
557
+ start_index = (start_index + 4) & mask;
558
+ startsimd = start_index >> 2;
559
+ k += 4;
560
+ }
561
+ for (; k < ibuflen; k++) {
562
+ fir->stater[fir->index] = ifbuf->data[k];
563
+ fir->index = (fir->index + 1) & mask;
564
+ coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
565
+ acc0 = vmovq_n_f32(0.0f);
566
+ j = startsimd;
567
+ for (i = 0; i < lensimd; i++) {
568
+ #ifndef __FP_FAST_FMA
569
+ prod0 = vmulq_f32(coefs0[i], stater[j]);
570
+ acc0 = vaddq_f32(acc0, prod0);
571
+ #else
572
+ acc0 = vfmaq_f32(acc0, coefs0[i], stater[j]);
573
+ #endif
574
+ j = (j+1) & masksimd;
575
+ }
576
+ ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
577
+ start_index = (start_index + 1) & mask;
578
+ startsimd = start_index >> 2;
579
+ }
580
+ }
581
+
582
+ return ofbuf;
583
+ }
584
+
585
+
586
+ pdlc_complex_buffer_t* pdlc_fir_filter_filter_complex_buffer(pdlc_fir_filter_t* fir, const pdlc_complex_buffer_t *icbuf, pdlc_complex_buffer_t *ocbuf, pdlc_complex_buffer_t *delayed)
587
+ {
588
+ const unsigned int nb_coefs = fir->nb_coefs;
589
+ const unsigned int flt_len = fir->state_len;
590
+ const unsigned int mask = fir->index_mask;
591
+ const unsigned int lensimd = fir->coef_len >> 2;
592
+ const unsigned int masksimd = mask >> 2;
593
+ const size_t ibuflen = icbuf->length;
594
+ unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask;
595
+ unsigned int startsimd = start_index >> 2;
596
+ unsigned int middle_index;
597
+ unsigned int i, j;
598
+ size_t k;
599
+ register float32x4_t acc0r, acc1r, acc2r, acc3r;
600
+ register float32x4_t acc0i, acc1i, acc2i, acc3i;
601
+ #ifndef __FP_FAST_FMA
602
+ register float32x4_t prod0r, prod1r, prod2r, prod3r;
603
+ register float32x4_t prod0i, prod1i, prod2i, prod3i;
604
+ #endif
605
+ register float32x4_t statereal;
606
+ register float32x4_t stateimag;
607
+ const float32x4_t *coefs0, *coefs1, *coefs2, *coefs3;
608
+ float32x4_t *stater = (float32x4_t*)fir->stater;
609
+ float32x4_t *statei = (float32x4_t*)fir->statei;
610
+
611
+ if (!ocbuf)
612
+ ocbuf = pdlc_complex_buffer_new(ibuflen);
613
+ else if (ocbuf->length != ibuflen)
614
+ pdlc_complex_buffer_resize(ocbuf, ibuflen, 0);
615
+
616
+ if (delayed) {
617
+ if (delayed->length != ibuflen)
618
+ pdlc_complex_buffer_resize(delayed, ibuflen, 0);
619
+ middle_index = (start_index + nb_coefs / 2) & mask;
620
+ if (nb_coefs & 1) {
621
+ //delayed->data[k] = fir->stater[middle_index];
622
+ //middle_index = (middle_index + 1) & mask;
623
+ k = 0;
624
+ while ((start_index & 3) && k < ibuflen) {
625
+ fir->stater[fir->index] = icbuf->data[k].real;
626
+ fir->statei[fir->index] = icbuf->data[k].imag;
627
+ fir->index = (fir->index + 1) & mask;
628
+ coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
629
+ acc0r = vmovq_n_f32(0.0f);
630
+ acc0i = vmovq_n_f32(0.0f);
631
+ j = startsimd;
632
+ for (i = 0; i < lensimd; i++) {
633
+ #ifndef __FP_FAST_FMA
634
+ prod0r = vmulq_f32(coefs0[i], stater[j]);
635
+ prod0i = vmulq_f32(coefs0[i], statei[j]);
636
+ acc0r = vaddq_f32(acc0r, prod0r);
637
+ acc0i = vaddq_f32(acc0i, prod0i);
638
+ #else
639
+ acc0r = vfmaq_f32(acc0r, coefs0[i], stater[j]);
640
+ acc0i = vfmaq_f32(acc0i, coefs0[i], statei[j]);
641
+ #endif
642
+ j = (j+1) & masksimd;
643
+ }
644
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
645
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
646
+ start_index = (start_index + 1) & mask;
647
+ startsimd = start_index >> 2;
648
+ delayed->data[k].real = fir->stater[middle_index];
649
+ delayed->data[k].imag = fir->statei[middle_index];
650
+ middle_index = (middle_index + 1) & mask;
651
+ k++;
652
+ }
653
+ while (k + 4 <= ibuflen) {
654
+ fir->stater[ fir->index ] = icbuf->data[k + 0].real;
655
+ fir->stater[(fir->index + 1) & mask] = icbuf->data[k + 1].real;
656
+ fir->stater[(fir->index + 2) & mask] = icbuf->data[k + 2].real;
657
+ fir->stater[(fir->index + 3) & mask] = icbuf->data[k + 3].real;
658
+ fir->statei[ fir->index ] = icbuf->data[k + 0].imag;
659
+ fir->statei[(fir->index + 1) & mask] = icbuf->data[k + 1].imag;
660
+ fir->statei[(fir->index + 2) & mask] = icbuf->data[k + 2].imag;
661
+ fir->statei[(fir->index + 3) & mask] = icbuf->data[k + 3].imag;
662
+ fir->index = (fir->index + 4) & mask;
663
+ coefs0 = (float32x4_t*)fir->coefs[(start_index + 0) & 3];
664
+ coefs1 = (float32x4_t*)fir->coefs[(start_index + 1) & 3];
665
+ coefs2 = (float32x4_t*)fir->coefs[(start_index + 2) & 3];
666
+ coefs3 = (float32x4_t*)fir->coefs[(start_index + 3) & 3];
667
+ acc0r = vmovq_n_f32(0.0f);
668
+ acc0i = vmovq_n_f32(0.0f);
669
+ acc1r = vmovq_n_f32(0.0f);
670
+ acc1i = vmovq_n_f32(0.0f);
671
+ acc2r = vmovq_n_f32(0.0f);
672
+ acc2i = vmovq_n_f32(0.0f);
673
+ acc3r = vmovq_n_f32(0.0f);
674
+ acc3i = vmovq_n_f32(0.0f);
675
+ j = startsimd;
676
+ for (i = 0; i < lensimd; i++) {
677
+ statereal = stater[j];
678
+ stateimag = statei[j];
679
+ #ifndef __FP_FAST_FMA
680
+ prod0r = vmulq_f32(coefs0[i], statereal);
681
+ acc0r = vaddq_f32(acc0r, prod0r);
682
+ prod1r = vmulq_f32(coefs1[i], statereal);
683
+ acc1r = vaddq_f32(acc1r, prod1r);
684
+ prod2r = vmulq_f32(coefs2[i], statereal);
685
+ acc2r = vaddq_f32(acc2r, prod2r);
686
+ prod3r = vmulq_f32(coefs3[i], statereal);
687
+ acc3r = vaddq_f32(acc3r, prod3r);
688
+ prod0i = vmulq_f32(coefs0[i], stateimag);
689
+ acc0i = vaddq_f32(acc0i, prod0i);
690
+ prod1i = vmulq_f32(coefs1[i], stateimag);
691
+ acc1i = vaddq_f32(acc1i, prod1i);
692
+ prod2i = vmulq_f32(coefs2[i], stateimag);
693
+ acc2i = vaddq_f32(acc2i, prod2i);
694
+ prod3i = vmulq_f32(coefs3[i], stateimag);
695
+ acc3i = vaddq_f32(acc3i, prod3i);
696
+ #else
697
+ acc0r = vfmaq_f32(acc0r, coefs0[i], statereal);
698
+ acc0i = vfmaq_f32(acc0i, coefs0[i], stateimag);
699
+ acc1r = vfmaq_f32(acc1r, coefs1[i], statereal);
700
+ acc1i = vfmaq_f32(acc1i, coefs1[i], stateimag);
701
+ acc2r = vfmaq_f32(acc2r, coefs2[i], statereal);
702
+ acc2i = vfmaq_f32(acc2i, coefs2[i], stateimag);
703
+ acc3r = vfmaq_f32(acc3r, coefs3[i], statereal);
704
+ acc3i = vfmaq_f32(acc3i, coefs3[i], stateimag);
705
+ #endif
706
+ j = (j+1) & masksimd;
707
+ }
708
+ ocbuf->data[k+0].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
709
+ ocbuf->data[k+0].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
710
+ ocbuf->data[k+1].real = acc1r[0] + acc1r[1] + acc1r[2] + acc1r[3];
711
+ ocbuf->data[k+1].imag = acc1i[0] + acc1i[1] + acc1i[2] + acc1i[3];
712
+ ocbuf->data[k+2].real = acc2r[0] + acc2r[1] + acc2r[2] + acc2r[3];
713
+ ocbuf->data[k+2].imag = acc2i[0] + acc2i[1] + acc2i[2] + acc2i[3];
714
+ ocbuf->data[k+3].real = acc3r[0] + acc3r[1] + acc3r[2] + acc3r[3];
715
+ ocbuf->data[k+3].imag = acc3i[0] + acc3i[1] + acc3i[2] + acc3i[3];
716
+ start_index = (start_index + 4) & mask;
717
+ startsimd = start_index >> 2;
718
+ delayed->data[k].real = fir->stater[middle_index];
719
+ delayed->data[k].imag = fir->statei[middle_index];
720
+ middle_index = (middle_index + 1) & mask;
721
+ k++;
722
+ delayed->data[k].real = fir->stater[middle_index];
723
+ delayed->data[k].imag = fir->statei[middle_index];
724
+ middle_index = (middle_index + 1) & mask;
725
+ k++;
726
+ delayed->data[k].real = fir->stater[middle_index];
727
+ delayed->data[k].imag = fir->statei[middle_index];
728
+ middle_index = (middle_index + 1) & mask;
729
+ k++;
730
+ delayed->data[k].real = fir->stater[middle_index];
731
+ delayed->data[k].imag = fir->statei[middle_index];
732
+ middle_index = (middle_index + 1) & mask;
733
+ k++;
734
+ }
735
+ for (; k < ibuflen; k++) {
736
+ fir->stater[fir->index] = icbuf->data[k].real;
737
+ fir->statei[fir->index] = icbuf->data[k].imag;
738
+ fir->index = (fir->index + 1) & mask;
739
+ coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
740
+ acc0r = vmovq_n_f32(0.0f);
741
+ acc0i = vmovq_n_f32(0.0f);
742
+ j = startsimd;
743
+ for (i = 0; i < lensimd; i++) {
744
+ #ifndef __FP_FAST_FMA
745
+ prod0r = vmulq_f32(coefs0[i], stater[j]);
746
+ acc0r = vaddq_f32(acc0r, prod0r);
747
+ prod0i = vmulq_f32(coefs0[i], statei[j]);
748
+ acc0i = vaddq_f32(acc0i, prod0i);
749
+ #else
750
+ acc0r = vfmaq_f32(acc0r, coefs0[i], stater[j]);
751
+ acc0i = vfmaq_f32(acc0i, coefs0[i], statei[j]);
752
+ #endif
753
+ j = (j+1) & masksimd;
754
+ }
755
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
756
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
757
+ start_index = (start_index + 1) & mask;
758
+ startsimd = start_index >> 2;
759
+ delayed->data[k].real = fir->stater[middle_index];
760
+ delayed->data[k].imag = fir->statei[middle_index];
761
+ middle_index = (middle_index + 1) & mask;
762
+ }
763
+ }
764
+ else {
765
+ k = 0;
766
+ while ((start_index & 3) && k < ibuflen) {
767
+ fir->stater[fir->index] = icbuf->data[k].real;
768
+ fir->statei[fir->index] = icbuf->data[k].imag;
769
+ fir->index = (fir->index + 1) & mask;
770
+ coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
771
+ acc0r = vmovq_n_f32(0.0f);
772
+ acc0i = vmovq_n_f32(0.0f);
773
+ j = startsimd;
774
+ for (i = 0; i < lensimd; i++) {
775
+ #ifndef __FP_FAST_FMA
776
+ prod0r = vmulq_f32(coefs0[i], stater[j]);
777
+ acc0r = vaddq_f32(acc0r, prod0r);
778
+ prod0i = vmulq_f32(coefs0[i], statei[j]);
779
+ acc0i = vaddq_f32(acc0i, prod0i);
780
+ #else
781
+ acc0r = vfmaq_f32(acc0r, coefs0[i], stater[j]);
782
+ acc0i = vfmaq_f32(acc0i, coefs0[i], statei[j]);
783
+ #endif
784
+ j = (j+1) & masksimd;
785
+ }
786
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
787
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
788
+ start_index = (start_index + 1) & mask;
789
+ startsimd = start_index >> 2;
790
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
791
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
792
+ middle_index = (middle_index + 1) & mask;
793
+ k++;
794
+ }
795
+ while (k + 4 <= ibuflen) {
796
+ fir->stater[ fir->index ] = icbuf->data[k + 0].real;
797
+ fir->stater[(fir->index + 1) & mask] = icbuf->data[k + 1].real;
798
+ fir->stater[(fir->index + 2) & mask] = icbuf->data[k + 2].real;
799
+ fir->stater[(fir->index + 3) & mask] = icbuf->data[k + 3].real;
800
+ fir->statei[ fir->index ] = icbuf->data[k + 0].imag;
801
+ fir->statei[(fir->index + 1) & mask] = icbuf->data[k + 1].imag;
802
+ fir->statei[(fir->index + 2) & mask] = icbuf->data[k + 2].imag;
803
+ fir->statei[(fir->index + 3) & mask] = icbuf->data[k + 3].imag;
804
+ fir->index = (fir->index + 4) & mask;
805
+ coefs0 = (float32x4_t*)fir->coefs[(start_index + 0) & 3];
806
+ coefs1 = (float32x4_t*)fir->coefs[(start_index + 1) & 3];
807
+ coefs2 = (float32x4_t*)fir->coefs[(start_index + 2) & 3];
808
+ coefs3 = (float32x4_t*)fir->coefs[(start_index + 3) & 3];
809
+ acc0r = vmovq_n_f32(0.0f);
810
+ acc0i = vmovq_n_f32(0.0f);
811
+ acc1r = vmovq_n_f32(0.0f);
812
+ acc1i = vmovq_n_f32(0.0f);
813
+ acc2r = vmovq_n_f32(0.0f);
814
+ acc2i = vmovq_n_f32(0.0f);
815
+ acc3r = vmovq_n_f32(0.0f);
816
+ acc3i = vmovq_n_f32(0.0f);
817
+ j = startsimd;
818
+ for (i = 0; i < lensimd; i++) {
819
+ statereal = stater[j];
820
+ stateimag = statei[j];
821
+ #ifndef __FP_FAST_FMA
822
+ prod0r = vmulq_f32(coefs0[i], statereal);
823
+ acc0r = vaddq_f32(acc0r, prod0r);
824
+ prod1r = vmulq_f32(coefs1[i], statereal);
825
+ acc1r = vaddq_f32(acc1r, prod1r);
826
+ prod2r = vmulq_f32(coefs2[i], statereal);
827
+ acc2r = vaddq_f32(acc2r, prod2r);
828
+ prod3r = vmulq_f32(coefs3[i], statereal);
829
+ acc3r = vaddq_f32(acc3r, prod3r);
830
+ prod0i = vmulq_f32(coefs0[i], stateimag);
831
+ acc0i = vaddq_f32(acc0i, prod0i);
832
+ prod1i = vmulq_f32(coefs1[i], stateimag);
833
+ acc1i = vaddq_f32(acc1i, prod1i);
834
+ prod2i = vmulq_f32(coefs2[i], stateimag);
835
+ acc2i = vaddq_f32(acc2i, prod2i);
836
+ prod3i = vmulq_f32(coefs3[i], stateimag);
837
+ acc3i = vaddq_f32(acc3i, prod3i);
838
+ #else
839
+ acc0r = vfmaq_f32(acc0r, coefs0[i], statereal);
840
+ acc0i = vfmaq_f32(acc0i, coefs0[i], stateimag);
841
+ acc1r = vfmaq_f32(acc1r, coefs1[i], statereal);
842
+ acc1i = vfmaq_f32(acc1i, coefs1[i], stateimag);
843
+ acc2r = vfmaq_f32(acc2r, coefs2[i], statereal);
844
+ acc2i = vfmaq_f32(acc2i, coefs2[i], stateimag);
845
+ acc3r = vfmaq_f32(acc3r, coefs3[i], statereal);
846
+ acc3i = vfmaq_f32(acc3i, coefs3[i], stateimag);
847
+ #endif
848
+ j = (j+1) & masksimd;
849
+ }
850
+ ocbuf->data[k+0].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
851
+ ocbuf->data[k+0].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
852
+ ocbuf->data[k+1].real = acc1r[0] + acc1r[1] + acc1r[2] + acc1r[3];
853
+ ocbuf->data[k+1].imag = acc1i[0] + acc1i[1] + acc1i[2] + acc1i[3];
854
+ ocbuf->data[k+2].real = acc2r[0] + acc2r[1] + acc2r[2] + acc2r[3];
855
+ ocbuf->data[k+2].imag = acc2i[0] + acc2i[1] + acc2i[2] + acc2i[3];
856
+ ocbuf->data[k+3].real = acc3r[0] + acc3r[1] + acc3r[2] + acc3r[3];
857
+ ocbuf->data[k+3].imag = acc3i[0] + acc3i[1] + acc3i[2] + acc3i[3];
858
+ start_index = (start_index + 4) & mask;
859
+ startsimd = start_index >> 2;
860
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
861
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
862
+ middle_index = (middle_index + 1) & mask;
863
+ k++;
864
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
865
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
866
+ middle_index = (middle_index + 1) & mask;
867
+ k++;
868
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
869
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
870
+ middle_index = (middle_index + 1) & mask;
871
+ k++;
872
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
873
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
874
+ middle_index = (middle_index + 1) & mask;
875
+ k++;
876
+ }
877
+ for (; k < ibuflen; k++) {
878
+ fir->stater[fir->index] = icbuf->data[k].real;
879
+ fir->statei[fir->index] = icbuf->data[k].imag;
880
+ fir->index = (fir->index + 1) & mask;
881
+ coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
882
+ acc0r = vmovq_n_f32(0.0f);
883
+ acc0i = vmovq_n_f32(0.0f);
884
+ j = startsimd;
885
+ for (i = 0; i < lensimd; i++) {
886
+ #ifndef __FP_FAST_FMA
887
+ prod0r = vmulq_f32(coefs0[i], stater[j]);
888
+ acc0r = vaddq_f32(acc0r, prod0r);
889
+ prod0i = vmulq_f32(coefs0[i], statei[j]);
890
+ acc0i = vaddq_f32(acc0i, prod0i);
891
+ #else
892
+ acc0r = vfmaq_f32(acc0r, coefs0[i], stater[j]);
893
+ acc0i = vfmaq_f32(acc0i, coefs0[i], statei[j]);
894
+ #endif
895
+ j = (j+1) & masksimd;
896
+ }
897
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
898
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
899
+ start_index = (start_index + 1) & mask;
900
+ startsimd = start_index >> 2;
901
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
902
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
903
+ middle_index = (middle_index + 1) & mask;
904
+ }
905
+ }
906
+ }
907
+ else {
908
+ k = 0;
909
+ while ((start_index & 3) && k < ibuflen) {
910
+ fir->stater[fir->index] = icbuf->data[k].real;
911
+ fir->statei[fir->index] = icbuf->data[k].imag;
912
+ fir->index = (fir->index + 1) & mask;
913
+ coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
914
+ acc0r = vmovq_n_f32(0.0f);
915
+ acc0i = vmovq_n_f32(0.0f);
916
+ j = startsimd;
917
+ for (i = 0; i < lensimd; i++) {
918
+ #ifndef __FP_FAST_FMA
919
+ prod0r = vmulq_f32(coefs0[i], stater[j]);
920
+ acc0r = vaddq_f32(acc0r, prod0r);
921
+ prod0i = vmulq_f32(coefs0[i], statei[j]);
922
+ acc0i = vaddq_f32(acc0i, prod0i);
923
+ #else
924
+ acc0r = vfmaq_f32(acc0r, coefs0[i], stater[j]);
925
+ acc0i = vfmaq_f32(acc0i, coefs0[i], statei[j]);
926
+ #endif
927
+ j = (j+1) & masksimd;
928
+ }
929
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
930
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
931
+ start_index = (start_index + 1) & mask;
932
+ startsimd = start_index >> 2;
933
+ k++;
934
+ }
935
+ while (k + 4 <= ibuflen) {
936
+ fir->stater[ fir->index ] = icbuf->data[k + 0].real;
937
+ fir->stater[(fir->index + 1) & mask] = icbuf->data[k + 1].real;
938
+ fir->stater[(fir->index + 2) & mask] = icbuf->data[k + 2].real;
939
+ fir->stater[(fir->index + 3) & mask] = icbuf->data[k + 3].real;
940
+ fir->statei[ fir->index ] = icbuf->data[k + 0].imag;
941
+ fir->statei[(fir->index + 1) & mask] = icbuf->data[k + 1].imag;
942
+ fir->statei[(fir->index + 2) & mask] = icbuf->data[k + 2].imag;
943
+ fir->statei[(fir->index + 3) & mask] = icbuf->data[k + 3].imag;
944
+ fir->index = (fir->index + 4) & mask;
945
+ coefs0 = (float32x4_t*)fir->coefs[(start_index + 0) & 3];
946
+ coefs1 = (float32x4_t*)fir->coefs[(start_index + 1) & 3];
947
+ coefs2 = (float32x4_t*)fir->coefs[(start_index + 2) & 3];
948
+ coefs3 = (float32x4_t*)fir->coefs[(start_index + 3) & 3];
949
+ acc0r = vmovq_n_f32(0.0f);
950
+ acc0i = vmovq_n_f32(0.0f);
951
+ acc1r = vmovq_n_f32(0.0f);
952
+ acc1i = vmovq_n_f32(0.0f);
953
+ acc2r = vmovq_n_f32(0.0f);
954
+ acc2i = vmovq_n_f32(0.0f);
955
+ acc3r = vmovq_n_f32(0.0f);
956
+ acc3i = vmovq_n_f32(0.0f);
957
+ j = startsimd;
958
+ for (i = 0; i < lensimd; i++) {
959
+ statereal = stater[j];
960
+ stateimag = statei[j];
961
+ #ifndef __FP_FAST_FMA
962
+ prod0r = vmulq_f32(coefs0[i], statereal);
963
+ acc0r = vaddq_f32(acc0r, prod0r);
964
+ prod1r = vmulq_f32(coefs1[i], statereal);
965
+ acc1r = vaddq_f32(acc1r, prod1r);
966
+ prod2r = vmulq_f32(coefs2[i], statereal);
967
+ acc2r = vaddq_f32(acc2r, prod2r);
968
+ prod3r = vmulq_f32(coefs3[i], statereal);
969
+ acc3r = vaddq_f32(acc3r, prod3r);
970
+ prod0i = vmulq_f32(coefs0[i], stateimag);
971
+ acc0i = vaddq_f32(acc0i, prod0i);
972
+ prod1i = vmulq_f32(coefs1[i], stateimag);
973
+ acc1i = vaddq_f32(acc1i, prod1i);
974
+ prod2i = vmulq_f32(coefs2[i], stateimag);
975
+ acc2i = vaddq_f32(acc2i, prod2i);
976
+ prod3i = vmulq_f32(coefs3[i], stateimag);
977
+ acc3i = vaddq_f32(acc3i, prod3i);
978
+ #else
979
+ acc0r = vfmaq_f32(acc0r, coefs0[i], statereal);
980
+ acc0i = vfmaq_f32(acc0i, coefs0[i], stateimag);
981
+ acc1r = vfmaq_f32(acc1r, coefs1[i], statereal);
982
+ acc1i = vfmaq_f32(acc1i, coefs1[i], stateimag);
983
+ acc2r = vfmaq_f32(acc2r, coefs2[i], statereal);
984
+ acc2i = vfmaq_f32(acc2i, coefs2[i], stateimag);
985
+ acc3r = vfmaq_f32(acc3r, coefs3[i], statereal);
986
+ acc3i = vfmaq_f32(acc3i, coefs3[i], stateimag);
987
+ #endif
988
+ j = (j+1) & masksimd;
989
+ }
990
+ ocbuf->data[k+0].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
991
+ ocbuf->data[k+0].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
992
+ ocbuf->data[k+1].real = acc1r[0] + acc1r[1] + acc1r[2] + acc1r[3];
993
+ ocbuf->data[k+1].imag = acc1i[0] + acc1i[1] + acc1i[2] + acc1i[3];
994
+ ocbuf->data[k+2].real = acc2r[0] + acc2r[1] + acc2r[2] + acc2r[3];
995
+ ocbuf->data[k+2].imag = acc2i[0] + acc2i[1] + acc2i[2] + acc2i[3];
996
+ ocbuf->data[k+3].real = acc3r[0] + acc3r[1] + acc3r[2] + acc3r[3];
997
+ ocbuf->data[k+3].imag = acc3i[0] + acc3i[1] + acc3i[2] + acc3i[3];
998
+ start_index = (start_index + 4) & mask;
999
+ startsimd = start_index >> 2;
1000
+ k += 4;
1001
+ }
1002
+ for (; k < ibuflen; k++) {
1003
+ fir->stater[fir->index] = icbuf->data[k].real;
1004
+ fir->statei[fir->index] = icbuf->data[k].imag;
1005
+ fir->index = (fir->index + 1) & mask;
1006
+ coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
1007
+ acc0r = vmovq_n_f32(0.0f);
1008
+ acc0i = vmovq_n_f32(0.0f);
1009
+ j = startsimd;
1010
+ for (i = 0; i < lensimd; i++) {
1011
+ #ifndef __FP_FAST_FMA
1012
+ prod0r = vmulq_f32(coefs0[i], stater[j]);
1013
+ acc0r = vaddq_f32(acc0r, prod0r);
1014
+ prod0i = vmulq_f32(coefs0[i], statei[j]);
1015
+ acc0i = vaddq_f32(acc0i, prod0i);
1016
+ #else
1017
+ acc0r = vfmaq_f32(acc0r, coefs0[i], stater[j]);
1018
+ acc0i = vfmaq_f32(acc0i, coefs0[i], statei[j]);
1019
+ #endif
1020
+ j = (j+1) & masksimd;
1021
+ }
1022
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
1023
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
1024
+ start_index = (start_index + 1) & mask;
1025
+ startsimd = start_index >> 2;
1026
+ }
1027
+ }
1028
+
1029
+ return ocbuf;
1030
+ }
1031
+
1032
+
1033
/* Upsample a real buffer by fir->max_counter and run it through the FIR.
 *
 * Each input sample is pushed into the circular state followed by
 * (max_counter - 1) zeros, and one filtered value is produced for every
 * pushed value, so the result holds ifbuf->length * fir->max_counter
 * samples. Every result is multiplied by max_counter (presumably to make up
 * for the amplitude lost to zero stuffing).
 *
 * fir   : filter whose state (stater, index) is advanced by this call.
 * ifbuf : input samples, read only.
 * ofbuf : destination; a new buffer is created when it is NULL, and it is
 *         resized when its length differs from the output length.
 * Returns the buffer holding the interpolated samples.
 */
pdlc_buffer_t* pdlc_fir_filter_interpolate_float_buffer(pdlc_fir_filter_t* fir, const pdlc_buffer_t *ifbuf, pdlc_buffer_t *ofbuf)
{
    const unsigned int tap_count = fir->nb_coefs;
    const unsigned int ring_len  = fir->state_len;
    const unsigned int ring_mask = fir->index_mask;
    const unsigned int vec_count = fir->coef_len >> 2;   /* coefficients, counted in 4-float vectors */
    const unsigned int vec_mask  = ring_mask >> 2;       /* wrap mask for vector-wise state indexing */
    const size_t in_len  = ifbuf->length;
    const size_t out_len = in_len*fir->max_counter;
    const float  gain    = (float)(fir->max_counter);
    const size_t factor  = fir->max_counter;
    /* Oldest sample of the filter window once the next value has been pushed. */
    unsigned int win_start = (ring_len + fir->index + 1 - tap_count) & ring_mask;
    unsigned int win_vec   = win_start >> 2;
    unsigned int c, s, lane;
    size_t out_pos = 0, in_pos = 0;
    float32x4_t sum0, sum1, sum2, sum3;
    float32x4_t x;
    const float32x4_t *taps0, *taps1, *taps2, *taps3;
    float32x4_t *ring_vec = (float32x4_t*)fir->stater;

    if (ofbuf) {
        if (ofbuf->length != out_len)
            pdlc_buffer_resize(ofbuf, out_len, 0);
    }
    else {
        ofbuf = pdlc_buffer_new(out_len);
    }

    /* One output at a time until the window start falls on a vector boundary.
     * fir->coefs[] is indexed by the window start modulo 4 (presumably the
     * same taps shifted by 0..3 slots -- confirm against the initializer). */
    for (; (win_start & 3) && out_pos < out_len; out_pos++) {
        if (out_pos % factor)
            fir->stater[fir->index] = 0.0f;
        else
            fir->stater[fir->index] = ifbuf->data[in_pos++];
        fir->index = (fir->index + 1) & ring_mask;

        taps0 = (float32x4_t*)fir->coefs[win_start & 3];
        sum0 = vmovq_n_f32(0.0f);
        for (c = 0, s = win_vec; c < vec_count; c++, s = (s + 1) & vec_mask) {
#ifdef __FP_FAST_FMA
            sum0 = vfmaq_f32(sum0, taps0[c], ring_vec[s]);
#else
            sum0 = vaddq_f32(sum0, vmulq_f32(taps0[c], ring_vec[s]));
#endif
        }
        ofbuf->data[out_pos] = (sum0[0] + sum0[1] + sum0[2] + sum0[3]) * gain;

        win_start = (win_start + 1) & ring_mask;
        win_vec = win_start >> 2;
    }

    /* Four outputs per pass: push four values, then share every state vector
     * load between the four coefficient tables. */
    for (; out_pos + 4 <= out_len; out_pos += 4) {
        for (lane = 0; lane < 4; lane++) {
            if ((out_pos + lane) % factor)
                fir->stater[fir->index] = 0.0f;
            else
                fir->stater[fir->index] = ifbuf->data[in_pos++];
            fir->index = (fir->index + 1) & ring_mask;
        }

        taps0 = (float32x4_t*)fir->coefs[(win_start + 0) & 3];
        taps1 = (float32x4_t*)fir->coefs[(win_start + 1) & 3];
        taps2 = (float32x4_t*)fir->coefs[(win_start + 2) & 3];
        taps3 = (float32x4_t*)fir->coefs[(win_start + 3) & 3];
        sum0 = vmovq_n_f32(0.0f);
        sum1 = vmovq_n_f32(0.0f);
        sum2 = vmovq_n_f32(0.0f);
        sum3 = vmovq_n_f32(0.0f);
        for (c = 0, s = win_vec; c < vec_count; c++, s = (s + 1) & vec_mask) {
            x = ring_vec[s];
#ifdef __FP_FAST_FMA
            sum0 = vfmaq_f32(sum0, taps0[c], x);
            sum1 = vfmaq_f32(sum1, taps1[c], x);
            sum2 = vfmaq_f32(sum2, taps2[c], x);
            sum3 = vfmaq_f32(sum3, taps3[c], x);
#else
            sum0 = vaddq_f32(sum0, vmulq_f32(taps0[c], x));
            sum1 = vaddq_f32(sum1, vmulq_f32(taps1[c], x));
            sum2 = vaddq_f32(sum2, vmulq_f32(taps2[c], x));
            sum3 = vaddq_f32(sum3, vmulq_f32(taps3[c], x));
#endif
        }
        ofbuf->data[out_pos+0] = (sum0[0] + sum0[1] + sum0[2] + sum0[3]) * gain;
        ofbuf->data[out_pos+1] = (sum1[0] + sum1[1] + sum1[2] + sum1[3]) * gain;
        ofbuf->data[out_pos+2] = (sum2[0] + sum2[1] + sum2[2] + sum2[3]) * gain;
        ofbuf->data[out_pos+3] = (sum3[0] + sum3[1] + sum3[2] + sum3[3]) * gain;

        win_start = (win_start + 4) & ring_mask;
        win_vec = win_start >> 2;
    }

    /* Fewer than four outputs left: finish them one at a time. */
    for (; out_pos < out_len; out_pos++) {
        if (out_pos % factor)
            fir->stater[fir->index] = 0.0f;
        else
            fir->stater[fir->index] = ifbuf->data[in_pos++];
        fir->index = (fir->index + 1) & ring_mask;

        taps0 = (float32x4_t*)fir->coefs[win_start & 3];
        sum0 = vmovq_n_f32(0.0f);
        for (c = 0, s = win_vec; c < vec_count; c++, s = (s + 1) & vec_mask) {
#ifdef __FP_FAST_FMA
            sum0 = vfmaq_f32(sum0, taps0[c], ring_vec[s]);
#else
            sum0 = vaddq_f32(sum0, vmulq_f32(taps0[c], ring_vec[s]));
#endif
        }
        ofbuf->data[out_pos] = (sum0[0] + sum0[1] + sum0[2] + sum0[3]) * gain;

        win_start = (win_start + 1) & ring_mask;
        win_vec = win_start >> 2;
    }

    return ofbuf;
}
1167
+
1168
+
1169
+
1170
/* Upsample a complex buffer by fir->max_counter and run it through the FIR.
 *
 * Real and imaginary parts are kept in two separate circular states
 * (stater / statei) that share one write index and are filtered with the
 * same coefficients. Each input sample is pushed followed by
 * (max_counter - 1) zeros, and one filtered value is produced per pushed
 * value, so the result holds icbuf->length * fir->max_counter samples.
 * Every result is multiplied by max_counter (presumably to make up for the
 * amplitude lost to zero stuffing).
 *
 * fir   : filter whose state (stater, statei, index) is advanced by this call.
 * icbuf : input samples, read only.
 * ocbuf : destination; a new buffer is created when it is NULL, and it is
 *         resized when its length differs from the output length.
 * Returns the buffer holding the interpolated samples.
 */
pdlc_complex_buffer_t* pdlc_fir_filter_interpolate_complex_buffer(pdlc_fir_filter_t* fir, const pdlc_complex_buffer_t *icbuf, pdlc_complex_buffer_t *ocbuf)
{
    const unsigned int tap_count = fir->nb_coefs;
    const unsigned int ring_len  = fir->state_len;
    const unsigned int ring_mask = fir->index_mask;
    const unsigned int vec_count = fir->coef_len >> 2;   /* coefficients, counted in 4-float vectors */
    const unsigned int vec_mask  = ring_mask >> 2;       /* wrap mask for vector-wise state indexing */
    const size_t in_len  = icbuf->length;
    const size_t out_len = in_len*fir->max_counter;
    const float  gain    = (float)(fir->max_counter);
    const size_t factor  = fir->max_counter;
    /* Oldest sample of the filter window once the next value has been pushed. */
    unsigned int win_start = (ring_len + fir->index + 1 - tap_count) & ring_mask;
    unsigned int win_vec   = win_start >> 2;
    unsigned int c, s, lane;
    size_t out_pos = 0, in_pos = 0;
    float32x4_t re0, re1, re2, re3;
    float32x4_t im0, im1, im2, im3;
    float32x4_t xr, xi;
    const float32x4_t *taps0, *taps1, *taps2, *taps3;
    float32x4_t *ring_re = (float32x4_t*)fir->stater;
    float32x4_t *ring_im = (float32x4_t*)fir->statei;

    if (ocbuf) {
        if (ocbuf->length != out_len)
            pdlc_complex_buffer_resize(ocbuf, out_len, 0);
    }
    else {
        ocbuf = pdlc_complex_buffer_new(out_len);
    }

    /* One output at a time until the window start falls on a vector boundary.
     * fir->coefs[] is indexed by the window start modulo 4 (presumably the
     * same taps shifted by 0..3 slots -- confirm against the initializer). */
    for (; (win_start & 3) && out_pos < out_len; out_pos++) {
        if (out_pos % factor) {
            fir->stater[fir->index] = 0.0f;
            fir->statei[fir->index] = 0.0f;
        }
        else {
            fir->stater[fir->index] = icbuf->data[in_pos].real;
            fir->statei[fir->index] = icbuf->data[in_pos].imag;
            in_pos++;
        }
        fir->index = (fir->index + 1) & ring_mask;

        taps0 = (float32x4_t*)fir->coefs[win_start & 3];
        re0 = vmovq_n_f32(0.0f);
        im0 = vmovq_n_f32(0.0f);
        for (c = 0, s = win_vec; c < vec_count; c++, s = (s + 1) & vec_mask) {
#ifdef __FP_FAST_FMA
            re0 = vfmaq_f32(re0, taps0[c], ring_re[s]);
            im0 = vfmaq_f32(im0, taps0[c], ring_im[s]);
#else
            re0 = vaddq_f32(re0, vmulq_f32(taps0[c], ring_re[s]));
            im0 = vaddq_f32(im0, vmulq_f32(taps0[c], ring_im[s]));
#endif
        }
        ocbuf->data[out_pos].real = (re0[0] + re0[1] + re0[2] + re0[3]) * gain;
        ocbuf->data[out_pos].imag = (im0[0] + im0[1] + im0[2] + im0[3]) * gain;

        win_start = (win_start + 1) & ring_mask;
        win_vec = win_start >> 2;
    }

    /* Four outputs per pass: push four values, then share every state vector
     * load between the four coefficient tables. */
    for (; out_pos + 4 <= out_len; out_pos += 4) {
        for (lane = 0; lane < 4; lane++) {
            if ((out_pos + lane) % factor) {
                fir->stater[fir->index] = 0.0f;
                fir->statei[fir->index] = 0.0f;
            }
            else {
                fir->stater[fir->index] = icbuf->data[in_pos].real;
                fir->statei[fir->index] = icbuf->data[in_pos].imag;
                in_pos++;
            }
            fir->index = (fir->index + 1) & ring_mask;
        }

        taps0 = (float32x4_t*)fir->coefs[(win_start + 0) & 3];
        taps1 = (float32x4_t*)fir->coefs[(win_start + 1) & 3];
        taps2 = (float32x4_t*)fir->coefs[(win_start + 2) & 3];
        taps3 = (float32x4_t*)fir->coefs[(win_start + 3) & 3];
        re0 = vmovq_n_f32(0.0f);
        im0 = vmovq_n_f32(0.0f);
        re1 = vmovq_n_f32(0.0f);
        im1 = vmovq_n_f32(0.0f);
        re2 = vmovq_n_f32(0.0f);
        im2 = vmovq_n_f32(0.0f);
        re3 = vmovq_n_f32(0.0f);
        im3 = vmovq_n_f32(0.0f);
        for (c = 0, s = win_vec; c < vec_count; c++, s = (s + 1) & vec_mask) {
            xr = ring_re[s];
            xi = ring_im[s];
#ifdef __FP_FAST_FMA
            re0 = vfmaq_f32(re0, taps0[c], xr);
            im0 = vfmaq_f32(im0, taps0[c], xi);
            re1 = vfmaq_f32(re1, taps1[c], xr);
            im1 = vfmaq_f32(im1, taps1[c], xi);
            re2 = vfmaq_f32(re2, taps2[c], xr);
            im2 = vfmaq_f32(im2, taps2[c], xi);
            re3 = vfmaq_f32(re3, taps3[c], xr);
            im3 = vfmaq_f32(im3, taps3[c], xi);
#else
            re0 = vaddq_f32(re0, vmulq_f32(taps0[c], xr));
            re1 = vaddq_f32(re1, vmulq_f32(taps1[c], xr));
            re2 = vaddq_f32(re2, vmulq_f32(taps2[c], xr));
            re3 = vaddq_f32(re3, vmulq_f32(taps3[c], xr));
            im0 = vaddq_f32(im0, vmulq_f32(taps0[c], xi));
            im1 = vaddq_f32(im1, vmulq_f32(taps1[c], xi));
            im2 = vaddq_f32(im2, vmulq_f32(taps2[c], xi));
            im3 = vaddq_f32(im3, vmulq_f32(taps3[c], xi));
#endif
        }
        ocbuf->data[out_pos+0].real = (re0[0] + re0[1] + re0[2] + re0[3]) * gain;
        ocbuf->data[out_pos+0].imag = (im0[0] + im0[1] + im0[2] + im0[3]) * gain;
        ocbuf->data[out_pos+1].real = (re1[0] + re1[1] + re1[2] + re1[3]) * gain;
        ocbuf->data[out_pos+1].imag = (im1[0] + im1[1] + im1[2] + im1[3]) * gain;
        ocbuf->data[out_pos+2].real = (re2[0] + re2[1] + re2[2] + re2[3]) * gain;
        ocbuf->data[out_pos+2].imag = (im2[0] + im2[1] + im2[2] + im2[3]) * gain;
        ocbuf->data[out_pos+3].real = (re3[0] + re3[1] + re3[2] + re3[3]) * gain;
        ocbuf->data[out_pos+3].imag = (im3[0] + im3[1] + im3[2] + im3[3]) * gain;

        win_start = (win_start + 4) & ring_mask;
        win_vec = win_start >> 2;
    }

    /* Fewer than four outputs left: finish them one at a time. */
    for (; out_pos < out_len; out_pos++) {
        if (out_pos % factor) {
            fir->stater[fir->index] = 0.0f;
            fir->statei[fir->index] = 0.0f;
        }
        else {
            fir->stater[fir->index] = icbuf->data[in_pos].real;
            fir->statei[fir->index] = icbuf->data[in_pos].imag;
            in_pos++;
        }
        fir->index = (fir->index + 1) & ring_mask;

        taps0 = (float32x4_t*)fir->coefs[win_start & 3];
        re0 = vmovq_n_f32(0.0f);
        im0 = vmovq_n_f32(0.0f);
        for (c = 0, s = win_vec; c < vec_count; c++, s = (s + 1) & vec_mask) {
#ifdef __FP_FAST_FMA
            re0 = vfmaq_f32(re0, taps0[c], ring_re[s]);
            im0 = vfmaq_f32(im0, taps0[c], ring_im[s]);
#else
            re0 = vaddq_f32(re0, vmulq_f32(taps0[c], ring_re[s]));
            im0 = vaddq_f32(im0, vmulq_f32(taps0[c], ring_im[s]));
#endif
        }
        ocbuf->data[out_pos].real = (re0[0] + re0[1] + re0[2] + re0[3]) * gain;
        ocbuf->data[out_pos].imag = (im0[0] + im0[1] + im0[2] + im0[3]) * gain;

        win_start = (win_start + 1) & ring_mask;
        win_vec = win_start >> 2;
    }

    return ocbuf;
}
1369
+
1370
+
1371
/* Low-pass a real buffer with the FIR and keep one output every
 * fir->max_counter input samples.
 *
 * Every input sample is pushed into the circular state; a filtered value is
 * computed only for the samples that arrive while fir->counter is 0. The
 * counter is advanced modulo max_counter for each input and is kept in the
 * filter, so the decimation phase carries over from one call to the next.
 *
 * fir   : filter whose state (stater, index, counter) is advanced by this call.
 * ifbuf : input samples, read only.
 * ofbuf : destination; a new buffer is created when it is NULL, and it is
 *         resized when its length differs from the output length.
 * Returns the buffer holding the decimated samples.
 */
pdlc_buffer_t* pdlc_fir_filter_decimate_float_buffer(pdlc_fir_filter_t* fir, const pdlc_buffer_t *ifbuf, pdlc_buffer_t *ofbuf)
{
    const unsigned int nb_coefs = fir->nb_coefs;
    const unsigned int flt_len = fir->state_len;
    const unsigned int mask = fir->index_mask;
    const unsigned int lensimd = fir->coef_len >> 2;   /* coefficients, counted in 4-float vectors */
    const unsigned int masksimd = mask >> 2;           /* wrap mask for vector-wise state indexing */
    const int mcounter = fir->max_counter;
    const size_t ibuflen = ifbuf->length;
    /* Number of inputs that are consumed before the first output of this call. */
    const int skip = (mcounter - fir->counter) % mcounter;
    const size_t obuflen = (size_t)ceil(((double)ibuflen - (double)skip) / (double)mcounter);
    /* Window start for the FIRST output. That output is computed right after
     * the sample written at (index + skip), so the window has to be anchored
     * there; anchoring it at (index + counter) is only correct when counter
     * is 0 or max_counter / 2. */
    unsigned int start_index = (flt_len + fir->index + (unsigned int)skip + 1 - nb_coefs) & mask;
    unsigned int startsimd = start_index >> 2;
    unsigned int i0, i1, j0, j1;
    size_t k, l;
    float32x4_t acc0, acc1;
    const float32x4_t *coefs;
    float32x4_t *stater = (float32x4_t*)fir->stater;

    if (!ofbuf)
        ofbuf = pdlc_buffer_new(obuflen);
    else if (ofbuf->length != obuflen)
        pdlc_buffer_resize(ofbuf, obuflen, 0);

    for (k = 0, l = 0; k < ibuflen; k++) {
        fir->stater[fir->index] = ifbuf->data[k];
        fir->index = (fir->index + 1) & mask;

        if (fir->counter == 0) {
            /* fir->coefs[] is indexed by the window start modulo 4 (presumably
             * the same taps shifted by 0..3 slots -- confirm against the
             * initializer). */
            coefs = (float32x4_t*)fir->coefs[start_index & 3];
            acc0 = vmovq_n_f32(0.0f);
            acc1 = vmovq_n_f32(0.0f);
            j0 = startsimd;
            j1 = (startsimd+1) & masksimd;

            /* Two independent accumulators, two coefficient vectors per pass. */
            for (i0 = 0, i1 = 1; i1 < lensimd; i0 += 2, i1 += 2) {
#ifndef __FP_FAST_FMA
                acc0 = vaddq_f32(acc0, vmulq_f32(coefs[i0], stater[j0]));
                acc1 = vaddq_f32(acc1, vmulq_f32(coefs[i1], stater[j1]));
#else
                acc0 = vfmaq_f32(acc0, coefs[i0], stater[j0]);
                acc1 = vfmaq_f32(acc1, coefs[i1], stater[j1]);
#endif
                j0 = (j0+2) & masksimd;
                j1 = (j1+2) & masksimd;
            }
            /* Odd vector count: exactly one coefficient vector is left over. */
            if (i0 < lensimd) {
#ifndef __FP_FAST_FMA
                acc0 = vaddq_f32(acc0, vmulq_f32(coefs[i0], stater[j0]));
#else
                acc0 = vfmaq_f32(acc0, coefs[i0], stater[j0]);
#endif
            }

            acc0 = vaddq_f32(acc0, acc1);
            ofbuf->data[l++] = acc0[0] + acc0[1] + acc0[2] + acc0[3];

            /* The next output is max_counter input samples further on. */
            start_index = (start_index + mcounter) & mask;
            startsimd = start_index >> 2;
        }

        fir->counter = (fir->counter + 1) % mcounter;
    }

    return ofbuf;
}
1445
+
1446
+
1447
/* Low-pass a complex buffer with the FIR and keep one output every
 * fir->max_counter input samples.
 *
 * Real and imaginary parts are kept in two separate circular states
 * (stater / statei) that share one write index and are filtered with the
 * same coefficients. Every input sample is pushed into the state; a filtered
 * value is computed only for the samples that arrive while fir->counter is
 * 0. The counter is advanced modulo max_counter for each input and is kept
 * in the filter, so the decimation phase carries over between calls.
 *
 * fir   : filter whose state (stater, statei, index, counter) is advanced.
 * icbuf : input samples, read only.
 * ocbuf : destination; a new buffer is created when it is NULL, and it is
 *         resized when its length differs from the output length.
 * Returns the buffer holding the decimated samples.
 */
pdlc_complex_buffer_t* pdlc_fir_filter_decimate_complex_buffer(pdlc_fir_filter_t* fir, const pdlc_complex_buffer_t *icbuf, pdlc_complex_buffer_t *ocbuf)
{
    const unsigned int nb_coefs = fir->nb_coefs;
    const unsigned int flt_len = fir->state_len;
    const unsigned int mask = fir->index_mask;
    const unsigned int lensimd = fir->coef_len >> 2;   /* coefficients, counted in 4-float vectors */
    const unsigned int masksimd = mask >> 2;           /* wrap mask for vector-wise state indexing */
    const int mcounter = fir->max_counter;
    const size_t ibuflen = icbuf->length;
    /* Number of inputs that are consumed before the first output of this call. */
    const int skip = (mcounter - fir->counter) % mcounter;
    const size_t obuflen = (size_t)ceil(((double)ibuflen - (double)skip) / (double)mcounter);
    /* Window start for the FIRST output. That output is computed right after
     * the sample written at (index + skip), so the window has to be anchored
     * there; anchoring it at (index + counter) is only correct when counter
     * is 0 or max_counter / 2. */
    unsigned int start_index = (flt_len + fir->index + (unsigned int)skip + 1 - nb_coefs) & mask;
    unsigned int startsimd = start_index >> 2;
    unsigned int i0, j0, i1, j1;
    size_t k, l;
    float32x4_t acc0r, acc0i, acc1r, acc1i;
    const float32x4_t *coefs;
    float32x4_t *stater = (float32x4_t*)fir->stater;
    float32x4_t *statei = (float32x4_t*)fir->statei;

    if (!ocbuf)
        ocbuf = pdlc_complex_buffer_new(obuflen);
    else if (ocbuf->length != obuflen)
        pdlc_complex_buffer_resize(ocbuf, obuflen, 0);

    for (k = 0, l = 0; k < ibuflen; k++) {
        fir->stater[fir->index] = icbuf->data[k].real;
        fir->statei[fir->index] = icbuf->data[k].imag;
        fir->index = (fir->index + 1) & mask;

        if (fir->counter == 0) {
            /* fir->coefs[] is indexed by the window start modulo 4 (presumably
             * the same taps shifted by 0..3 slots -- confirm against the
             * initializer). */
            coefs = (float32x4_t*)fir->coefs[start_index & 3];
            acc0r = vmovq_n_f32(0.0f);
            acc0i = vmovq_n_f32(0.0f);
            acc1r = vmovq_n_f32(0.0f);
            acc1i = vmovq_n_f32(0.0f);
            j0 = startsimd;
            j1 = (startsimd+1) & masksimd;

            /* Two independent accumulator pairs, two coefficient vectors per pass. */
            for (i0 = 0, i1 = 1; i1 < lensimd; i0 += 2, i1 += 2) {
#ifndef __FP_FAST_FMA
                acc0r = vaddq_f32(acc0r, vmulq_f32(coefs[i0], stater[j0]));
                acc0i = vaddq_f32(acc0i, vmulq_f32(coefs[i0], statei[j0]));
                acc1r = vaddq_f32(acc1r, vmulq_f32(coefs[i1], stater[j1]));
                acc1i = vaddq_f32(acc1i, vmulq_f32(coefs[i1], statei[j1]));
#else
                acc0r = vfmaq_f32(acc0r, coefs[i0], stater[j0]);
                acc0i = vfmaq_f32(acc0i, coefs[i0], statei[j0]);
                acc1r = vfmaq_f32(acc1r, coefs[i1], stater[j1]);
                acc1i = vfmaq_f32(acc1i, coefs[i1], statei[j1]);
#endif
                j0 = (j0+2) & masksimd;
                j1 = (j1+2) & masksimd;
            }
            /* Odd vector count: exactly one coefficient vector is left over. */
            if (i0 < lensimd) {
#ifndef __FP_FAST_FMA
                acc0r = vaddq_f32(acc0r, vmulq_f32(coefs[i0], stater[j0]));
                acc0i = vaddq_f32(acc0i, vmulq_f32(coefs[i0], statei[j0]));
#else
                acc0r = vfmaq_f32(acc0r, coefs[i0], stater[j0]);
                acc0i = vfmaq_f32(acc0i, coefs[i0], statei[j0]);
#endif
            }

            acc0r = vaddq_f32(acc0r, acc1r);
            acc0i = vaddq_f32(acc0i, acc1i);
            ocbuf->data[l].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
            ocbuf->data[l].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
            l++;

            /* The next output is max_counter input samples further on. */
            start_index = (start_index + mcounter) & mask;
            startsimd = start_index >> 2;
        }

        fir->counter = (fir->counter + 1) % mcounter;
    }

    return ocbuf;
}
1537
+
1538
+
1539
+ pdlc_complex_buffer_t* pdlc_fir_filter_transform(pdlc_fir_filter_t* fir, const pdlc_buffer_t *ifbuf, pdlc_complex_buffer_t *ocbuf)
1540
+ {
1541
+ const unsigned int nb_coefs = fir->nb_coefs;
1542
+ const unsigned int flt_len = fir->state_len;
1543
+ const unsigned int mask = fir->index_mask;
1544
+ const unsigned int lensimd = fir->coef_len >> 2;
1545
+ const unsigned int masksimd = mask >> 2;
1546
+ const size_t ibuflen = ifbuf->length;
1547
+ unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask;
1548
+ unsigned int startsimd = start_index >> 2;
1549
+ unsigned int middle_index = (start_index + nb_coefs / 2) & mask;
1550
+ unsigned int i, j;
1551
+ size_t k = 0;
1552
+ register float32x4_t acc0, acc1, acc2, acc3;
1553
+ #ifndef __FP_FAST_FMA
1554
+ register float32x4_t prod0, prod1, prod2, prod3;
1555
+ #endif
1556
+ register float32x4_t statereal;
1557
+ const float32x4_t *coefs0, *coefs1, *coefs2, *coefs3;
1558
+ float32x4_t *stater = (float32x4_t*)fir->stater;
1559
+
1560
+
1561
+ if (!ocbuf)
1562
+ ocbuf = pdlc_complex_buffer_new(ibuflen);
1563
+ else if (ocbuf->length != ibuflen)
1564
+ pdlc_complex_buffer_resize(ocbuf, ibuflen, 0);
1565
+
1566
+
1567
+ if (nb_coefs & 1) {
1568
+ while ((start_index & 3) && k < ibuflen) {
1569
+ fir->stater[fir->index] = ifbuf->data[k];
1570
+ fir->index = (fir->index + 1) & mask;
1571
+ coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
1572
+ acc0 = vmovq_n_f32(0.0f);
1573
+ j = startsimd;
1574
+ for (i = 0; i < lensimd; i++) {
1575
+ #ifndef __FP_FAST_FMA
1576
+ prod0 = vmulq_f32(coefs0[i], stater[j]);
1577
+ acc0 = vaddq_f32(acc0, prod0);
1578
+ #else
1579
+ acc0 = vfmaq_f32(acc0, coefs0[i], stater[j]);
1580
+ #endif
1581
+ j = (j+1) & masksimd;
1582
+ }
1583
+ ocbuf->data[k].imag = acc0[0] + acc0[1] + acc0[2] + acc0[3];
1584
+ start_index = (start_index + 1) & mask;
1585
+ startsimd = start_index >> 2;
1586
+ ocbuf->data[k].real = fir->stater[middle_index];
1587
+ middle_index = (middle_index + 1) & mask;
1588
+ k++;
1589
+ }
1590
+ while (k + 4 <= ibuflen) {
1591
+ fir->stater[ fir->index ] = ifbuf->data[k + 0];
1592
+ fir->stater[(fir->index + 1) & mask] = ifbuf->data[k + 1];
1593
+ fir->stater[(fir->index + 2) & mask] = ifbuf->data[k + 2];
1594
+ fir->stater[(fir->index + 3) & mask] = ifbuf->data[k + 3];
1595
+ fir->index = (fir->index + 4) & mask;
1596
+ coefs0 = (float32x4_t*)fir->coefs[(start_index + 0) & 3];
1597
+ coefs1 = (float32x4_t*)fir->coefs[(start_index + 1) & 3];
1598
+ coefs2 = (float32x4_t*)fir->coefs[(start_index + 2) & 3];
1599
+ coefs3 = (float32x4_t*)fir->coefs[(start_index + 3) & 3];
1600
+ acc0 = vmovq_n_f32(0.0f);
1601
+ acc1 = vmovq_n_f32(0.0f);
1602
+ acc2 = vmovq_n_f32(0.0f);
1603
+ acc3 = vmovq_n_f32(0.0f);
1604
+ j = startsimd;
1605
+ for (i = 0; i < lensimd; i++) {
1606
+ statereal = stater[j];
1607
+ #ifndef __FP_FAST_FMA
1608
+ prod0 = vmulq_f32(coefs0[i], statereal);
1609
+ acc0 = vaddq_f32(acc0, prod0);
1610
+ prod1 = vmulq_f32(coefs1[i], statereal);
1611
+ acc1 = vaddq_f32(acc1, prod1);
1612
+ prod2 = vmulq_f32(coefs2[i], statereal);
1613
+ acc2 = vaddq_f32(acc2, prod2);
1614
+ prod3 = vmulq_f32(coefs3[i], statereal);
1615
+ acc3 = vaddq_f32(acc3, prod3);
1616
+ #else
1617
+ acc0 = vfmaq_f32(acc0, coefs0[i], statereal);
1618
+ acc1 = vfmaq_f32(acc1, coefs1[i], statereal);
1619
+ acc2 = vfmaq_f32(acc2, coefs2[i], statereal);
1620
+ acc3 = vfmaq_f32(acc3, coefs3[i], statereal);
1621
+ #endif
1622
+ j = (j+1) & masksimd;
1623
+ }
1624
+ ocbuf->data[k+0].imag = acc0[0] + acc0[1] + acc0[2] + acc0[3];
1625
+ ocbuf->data[k+1].imag = acc1[0] + acc1[1] + acc1[2] + acc1[3];
1626
+ ocbuf->data[k+2].imag = acc2[0] + acc2[1] + acc2[2] + acc2[3];
1627
+ ocbuf->data[k+3].imag = acc3[0] + acc3[1] + acc3[2] + acc3[3];
1628
+ start_index = (start_index + 4) & mask;
1629
+ startsimd = start_index >> 2;
1630
+ ocbuf->data[k].real = fir->stater[middle_index];
1631
+ middle_index = (middle_index + 1) & mask;
1632
+ k++;
1633
+ ocbuf->data[k].real = fir->stater[middle_index];
1634
+ middle_index = (middle_index + 1) & mask;
1635
+ k++;
1636
+ ocbuf->data[k].real = fir->stater[middle_index];
1637
+ middle_index = (middle_index + 1) & mask;
1638
+ k++;
1639
+ ocbuf->data[k].real = fir->stater[middle_index];
1640
+ middle_index = (middle_index + 1) & mask;
1641
+ k++;
1642
+ }
1643
+ for (; k < ibuflen; k++) {
1644
+ fir->stater[fir->index] = ifbuf->data[k];
1645
+ fir->index = (fir->index + 1) & mask;
1646
+ coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
1647
+ acc0 = vmovq_n_f32(0.0f);
1648
+ j = startsimd;
1649
+ for (i = 0; i < lensimd; i++) {
1650
+ #ifndef __FP_FAST_FMA
1651
+ prod0 = vmulq_f32(coefs0[i], stater[j]);
1652
+ acc0 = vaddq_f32(acc0, prod0);
1653
+ #else
1654
+ acc0 = vfmaq_f32(acc0, coefs0[i], stater[j]);
1655
+ #endif
1656
+ j = (j+1) & masksimd;
1657
+ }
1658
+ ocbuf->data[k].imag = acc0[0] + acc0[1] + acc0[2] + acc0[3];
1659
+ start_index = (start_index + 1) & mask;
1660
+ startsimd = start_index >> 2;
1661
+ ocbuf->data[k].real = fir->stater[middle_index];
1662
+ middle_index = (middle_index + 1) & mask;
1663
+ }
1664
+ }
1665
+ else {
1666
+ while ((start_index & 3) && k < ibuflen) {
1667
+ fir->stater[fir->index] = ifbuf->data[k];
1668
+ fir->index = (fir->index + 1) & mask;
1669
+ coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
1670
+ acc0 = vmovq_n_f32(0.0f);
1671
+ j = startsimd;
1672
+ for (i = 0; i < lensimd; i++) {
1673
+ #ifndef __FP_FAST_FMA
1674
+ prod0 = vmulq_f32(coefs0[i], stater[j]);
1675
+ acc0 = vaddq_f32(acc0, prod0);
1676
+ #else
1677
+ acc0 = vfmaq_f32(acc0, coefs0[i], stater[j]);
1678
+ #endif
1679
+ j = (j+1) & masksimd;
1680
+ }
1681
+ ocbuf->data[k].imag = acc0[0] + acc0[1] + acc0[2] + acc0[3];
1682
+ start_index = (start_index + 1) & mask;
1683
+ startsimd = start_index >> 2;
1684
+ ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1685
+ middle_index = (middle_index + 1) & mask;
1686
+ k++;
1687
+ }
1688
+ while (k + 4 <= ibuflen) {
1689
+ fir->stater[ fir->index ] = ifbuf->data[k + 0];
1690
+ fir->stater[(fir->index + 1) & mask] = ifbuf->data[k + 1];
1691
+ fir->stater[(fir->index + 2) & mask] = ifbuf->data[k + 2];
1692
+ fir->stater[(fir->index + 3) & mask] = ifbuf->data[k + 3];
1693
+ fir->index = (fir->index + 4) & mask;
1694
+ coefs0 = (float32x4_t*)fir->coefs[(start_index + 0) & 3];
1695
+ coefs1 = (float32x4_t*)fir->coefs[(start_index + 1) & 3];
1696
+ coefs2 = (float32x4_t*)fir->coefs[(start_index + 2) & 3];
1697
+ coefs3 = (float32x4_t*)fir->coefs[(start_index + 3) & 3];
1698
+ acc0 = vmovq_n_f32(0.0f);
1699
+ acc1 = vmovq_n_f32(0.0f);
1700
+ acc2 = vmovq_n_f32(0.0f);
1701
+ acc3 = vmovq_n_f32(0.0f);
1702
+ j = startsimd;
1703
+ for (i = 0; i < lensimd; i++) {
1704
+ statereal = stater[j];
1705
+ #ifndef __FP_FAST_FMA
1706
+ prod0 = vmulq_f32(coefs0[i], statereal);
1707
+ acc0 = vaddq_f32(acc0, prod0);
1708
+ prod1 = vmulq_f32(coefs1[i], statereal);
1709
+ acc1 = vaddq_f32(acc1, prod1);
1710
+ prod2 = vmulq_f32(coefs2[i], statereal);
1711
+ acc2 = vaddq_f32(acc2, prod2);
1712
+ prod3 = vmulq_f32(coefs3[i], statereal);
1713
+ acc3 = vaddq_f32(acc3, prod3);
1714
+ #else
1715
+ acc0 = vfmaq_f32(acc0, coefs0[i], statereal);
1716
+ acc1 = vfmaq_f32(acc1, coefs1[i], statereal);
1717
+ acc2 = vfmaq_f32(acc2, coefs2[i], statereal);
1718
+ acc3 = vfmaq_f32(acc3, coefs3[i], statereal);
1719
+ #endif
1720
+ j = (j+1) & masksimd;
1721
+ }
1722
+ ocbuf->data[k+0].imag = acc0[0] + acc0[1] + acc0[2] + acc0[3];
1723
+ ocbuf->data[k+1].imag = acc1[0] + acc1[1] + acc1[2] + acc1[3];
1724
+ ocbuf->data[k+2].imag = acc2[0] + acc2[1] + acc2[2] + acc2[3];
1725
+ ocbuf->data[k+3].imag = acc3[0] + acc3[1] + acc3[2] + acc3[3];
1726
+ start_index = (start_index + 4) & mask;
1727
+ startsimd = start_index >> 2;
1728
+ ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1729
+ middle_index = (middle_index + 1) & mask;
1730
+ k++;
1731
+ ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1732
+ middle_index = (middle_index + 1) & mask;
1733
+ k++;
1734
+ ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1735
+ middle_index = (middle_index + 1) & mask;
1736
+ k++;
1737
+ ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1738
+ middle_index = (middle_index + 1) & mask;
1739
+ k++;
1740
+ }
1741
+ for (; k < ibuflen; k++) {
1742
+ fir->stater[fir->index] = ifbuf->data[k];
1743
+ fir->index = (fir->index + 1) & mask;
1744
+ coefs0 = (float32x4_t*)fir->coefs[start_index & 3];
1745
+ acc0 = vmovq_n_f32(0.0f);
1746
+ j = startsimd;
1747
+ for (i = 0; i < lensimd; i++) {
1748
+ #ifndef __FP_FAST_FMA
1749
+ prod0 = vmulq_f32(coefs0[i], stater[j]);
1750
+ acc0 = vaddq_f32(acc0, prod0);
1751
+ #else
1752
+ acc0 = vfmaq_f32(acc0, coefs0[i], stater[j]);
1753
+ #endif
1754
+ j = (j+1) & masksimd;
1755
+ }
1756
+ ocbuf->data[k].imag = acc0[0] + acc0[1] + acc0[2] + acc0[3];
1757
+ start_index = (start_index + 1) & mask;
1758
+ startsimd = start_index >> 2;
1759
+ ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1760
+ middle_index = (middle_index + 1) & mask;
1761
+ }
1762
+ }
1763
+
1764
+ return ocbuf;
1765
+ }
1766
+
1767
+