paddlec 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,1677 @@
1
+ /* Copyright (C) 2019 Théotime Bollengier <theotime.bollengier@gmail.com>
2
+ *
3
+ * This file is part of PaddleC
4
+ *
5
+ * PaddleC is free software: you can redistribute it and/or modify
6
+ * it under the terms of the GNU General Public License as published by
7
+ * the Free Software Foundation, either version 3 of the License, or
8
+ * (at your option) any later version.
9
+ *
10
+ * PaddleC is distributed in the hope that it will be useful,
11
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
+ * GNU General Public License for more details.
14
+ *
15
+ * You should have received a copy of the GNU General Public License
16
+ * along with PaddleC. If not, see <https://www.gnu.org/licenses/>.
17
+ */
18
+
19
+ #include <xmmintrin.h>
20
+ #ifdef __SSE3__
21
+ #include <pmmintrin.h>
22
+ #endif
23
+
24
+
25
+ void pdlc_fir_filter_inspect(pdlc_fir_filter_t* fir)
26
+ {
27
+ size_t i, j;
28
+
29
+ printf("nb_coefs: %u, state_len: %u, coef_len: %u, index_mask: 0x%x, index: %u\n",
30
+ fir->nb_coefs, fir->state_len, fir->coef_len, fir->index_mask, fir->index);
31
+ printf("state: [%.7g", fir->stater[0]);
32
+ for (i = 1; i < fir->state_len; i++)
33
+ printf(", %.7g", fir->stater[i]);
34
+ printf("]\n");
35
+ for (j = 0; j < 4; j++) {
36
+ printf("coefs: {%lu}[%.7g", j, fir->coefs[j][0]);
37
+ for (i = 1; i < fir->coef_len; i++)
38
+ printf(", %.7g", fir->coefs[j][i]);
39
+ printf("]\n");
40
+ }
41
+ }
42
+
43
+
44
+ void pdlc_fir_filter_initialize(pdlc_fir_filter_t* fir, int order)
45
+ {
46
+ int i;
47
+
48
+ if (fir->coefs) {
49
+ for (i = 0; i < 4; i++)
50
+ if (fir->coefs[i])
51
+ _mm_free(fir->coefs[i]);
52
+ free(fir->coefs);
53
+ fir->coefs = NULL;
54
+ }
55
+
56
+ if (fir->stater)
57
+ _mm_free(fir->stater);
58
+ fir->stater = NULL;
59
+
60
+ if (fir->statei)
61
+ _mm_free(fir->statei);
62
+ fir->statei = NULL;
63
+
64
+ fir->nb_coefs = 0;
65
+ fir->state_len = 0;
66
+ fir->coef_len = 0;
67
+ fir->index = 0;
68
+ fir->index_mask = 0;
69
+ fir->counter = 0;
70
+ fir->max_counter = 1;
71
+
72
+ if (order < 0)
73
+ return;
74
+
75
+ if (order > 67108863) {
76
+ fprintf(stderr, "ERROR: libpaddlec: Filter order cannot be greater than 67108864\n");
77
+ exit(EXIT_FAILURE);
78
+ }
79
+
80
+ fir->nb_coefs = (unsigned int)(order + 1);
81
+ fir->coef_len = ((fir->nb_coefs + 3 + 3) >> 2) << 2;
82
+ fir->state_len = (unsigned int)(pow(2.0, ceil(log2(fir->coef_len))));
83
+ fir->index = 0;
84
+ fir->index_mask = fir->state_len - 1;
85
+
86
+ fir->coefs = malloc(4*sizeof(float*));
87
+ if (fir->coefs == NULL) {
88
+ fprintf(stderr, "ERROR: libpaddlec: Cannot allocate %lu bytes for FIR!\n", 4 * sizeof(float*));
89
+ exit(EXIT_FAILURE);
90
+ }
91
+
92
+ for (i = 0; i < 4; i++) {
93
+ fir->coefs[i] = _mm_malloc(fir->coef_len * sizeof(float), sizeof(__m128));
94
+ if (fir->coefs[i] == NULL) {
95
+ fprintf(stderr, "ERROR: libpaddlec: Cannot allocate %lu bytes for FIR!\n", fir->coef_len * sizeof(float));
96
+ exit(EXIT_FAILURE);
97
+ }
98
+ }
99
+
100
+ fir->stater = _mm_malloc(fir->state_len * sizeof(float), sizeof(__m128));
101
+ if (fir->stater == NULL) {
102
+ fprintf(stderr, "ERROR: libpaddlec: Cannot allocate %lu bytes for FIR!\n", fir->state_len * sizeof(float));
103
+ exit(EXIT_FAILURE);
104
+ }
105
+
106
+ fir->statei = _mm_malloc(fir->state_len * sizeof(float), sizeof(__m128));
107
+ if (fir->statei == NULL) {
108
+ fprintf(stderr, "ERROR: libpaddlec: Cannot allocate %lu bytes for FIR!\n", fir->state_len * sizeof(float));
109
+ exit(EXIT_FAILURE);
110
+ }
111
+
112
+ memset(fir->stater, 0, fir->state_len * sizeof(float));
113
+ memset(fir->statei, 0, fir->state_len * sizeof(float));
114
+ for (i = 0; i < 4; i++)
115
+ memset(fir->coefs[i], 0, fir->coef_len * sizeof(float));
116
+ }
117
+
118
+
119
+ void pdlc_fir_filter_free(pdlc_fir_filter_t* fir)
120
+ {
121
+ int i;
122
+
123
+ if (!fir)
124
+ return;
125
+
126
+ if (fir->coefs) {
127
+ for (i = 0; i < 4; i++)
128
+ if (fir->coefs[i])
129
+ _mm_free(fir->coefs[i]);
130
+ free(fir->coefs);
131
+ }
132
+
133
+ if (fir->stater)
134
+ _mm_free(fir->stater);
135
+
136
+ if (fir->statei)
137
+ _mm_free(fir->statei);
138
+
139
+ free(fir);
140
+ }
141
+
142
+
143
+ size_t pdlc_fir_filter_size(pdlc_fir_filter_t* fir)
144
+ {
145
+ size_t res;
146
+
147
+ res = sizeof(pdlc_fir_filter_t);
148
+ res += sizeof(float*)* 4;
149
+ res += sizeof(float) * fir->state_len * 2;
150
+ res += sizeof(float) * fir->coef_len * 4;
151
+
152
+ return res;
153
+ }
154
+
155
+
156
+ int pdlc_fir_filter_set_coef_at(pdlc_fir_filter_t* fir, int index, float value)
157
+ {
158
+ int i;
159
+
160
+ if (index < 0 || index >= (int)fir->nb_coefs)
161
+ return -1;
162
+
163
+ for (i = 0; i < 4; i++)
164
+ fir->coefs[i][(fir->nb_coefs - 1 - index + i) % fir->coef_len] = value;
165
+
166
+ return 0;
167
+ }
168
+
169
+
170
+ float pdlc_fir_filter_filter_float(pdlc_fir_filter_t* fir, float sample, float *delayed)
171
+ {
172
+ const unsigned int nb_coefs = fir->nb_coefs;
173
+ const unsigned int flt_len = fir->state_len;
174
+ const unsigned int mask = fir->index_mask;
175
+ const unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask;
176
+ const unsigned int middle_index = (start_index + nb_coefs / 2) & mask;
177
+ const unsigned int lensimd = fir->coef_len >> 2;
178
+ const unsigned int startsimd = start_index >> 2;
179
+ const unsigned int masksimd = mask >> 2;
180
+ unsigned int i, j;
181
+ register __m128 acc, prod;
182
+ const __m128 *coefs = (__m128*)fir->coefs[start_index & 3];
183
+ __m128 *stater = (__m128*)fir->stater;
184
+
185
+ fir->stater[fir->index] = sample;
186
+ fir->index = (fir->index + 1) & mask;
187
+
188
+ if (delayed) {
189
+ if (nb_coefs & 1)
190
+ *delayed = fir->stater[middle_index];
191
+ else
192
+ *delayed = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
193
+ }
194
+
195
+ acc = _mm_setzero_ps();
196
+ j = startsimd;
197
+ for (i = 0; i < lensimd; i++) {
198
+ prod = _mm_mul_ps(coefs[i], stater[j]);
199
+ acc = _mm_add_ps(acc, prod);
200
+ j = (j+1) & masksimd;
201
+ }
202
+
203
+ return acc[0] + acc[1] + acc[2] + acc[3];
204
+ }
205
+
206
+
207
+ pdlc_complex_t pdlc_fir_filter_filter_complex(pdlc_fir_filter_t* fir, pdlc_complex_t sample, pdlc_complex_t *delayed)
208
+ {
209
+ const unsigned int nb_coefs = fir->nb_coefs;
210
+ const unsigned int flt_len = fir->state_len;
211
+ const unsigned int mask = fir->index_mask;
212
+ const unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask;
213
+ const unsigned int middle_index = (start_index + nb_coefs / 2) & mask;
214
+ const unsigned int lensimd = fir->coef_len >> 2;
215
+ const unsigned int startsimd = start_index >> 2;
216
+ const unsigned int masksimd = mask >> 2;
217
+ unsigned int i, j;
218
+ pdlc_complex_t res = {0.0f, 0.0f};
219
+ register __m128 accr, acci, prodr, prodi;
220
+ const __m128 *coefs = (__m128*)fir->coefs[start_index & 3];
221
+ __m128 *stater = (__m128*)fir->stater;
222
+ __m128 *statei = (__m128*)fir->statei;
223
+
224
+ fir->stater[fir->index] = sample.real;
225
+ fir->statei[fir->index] = sample.imag;
226
+ fir->index = (fir->index + 1) & mask;
227
+
228
+ accr = _mm_setzero_ps();
229
+ acci = _mm_setzero_ps();
230
+ j = startsimd;
231
+ for (i = 0; i < lensimd; i++) {
232
+ prodr = _mm_mul_ps(coefs[i], stater[j]);
233
+ prodi = _mm_mul_ps(coefs[i], statei[j]);
234
+ accr = _mm_add_ps(accr, prodr);
235
+ acci = _mm_add_ps(acci, prodi);
236
+ j = (j+1) & masksimd;
237
+ }
238
+ res.real = accr[0] + accr[1] + accr[2] + accr[3];
239
+ res.imag = acci[0] + acci[1] + acci[2] + acci[3];
240
+
241
+ if (delayed) {
242
+ if (nb_coefs & 1) {
243
+ delayed->real = fir->stater[middle_index];
244
+ delayed->imag = fir->statei[middle_index];
245
+ }
246
+ else {
247
+ delayed->real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
248
+ delayed->imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
249
+ }
250
+ }
251
+
252
+ return res;
253
+ }
254
+
255
+
256
+ pdlc_buffer_t* pdlc_fir_filter_filter_float_buffer(pdlc_fir_filter_t* fir, const pdlc_buffer_t *ifbuf, pdlc_buffer_t *ofbuf, pdlc_buffer_t *delayed)
257
+ {
258
+ const unsigned int nb_coefs = fir->nb_coefs;
259
+ const unsigned int flt_len = fir->state_len;
260
+ const unsigned int mask = fir->index_mask;
261
+ const unsigned int lensimd = fir->coef_len >> 2;
262
+ const unsigned int masksimd = mask >> 2;
263
+ const size_t ibuflen = ifbuf->length;
264
+ unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask;
265
+ unsigned int startsimd = start_index >> 2;
266
+ unsigned int middle_index;
267
+ unsigned int i, j;
268
+ size_t k;
269
+ register __m128 acc0, acc1, acc2, acc3;
270
+ register __m128 prod0, prod1, prod2, prod3;
271
+ register __m128 statereal;
272
+ const __m128 *coefs0, *coefs1, *coefs2, *coefs3;
273
+ __m128 *stater = (__m128*)fir->stater;
274
+
275
+ if (!ofbuf)
276
+ ofbuf = pdlc_buffer_new(ibuflen);
277
+ else if (ofbuf->length != ibuflen)
278
+ pdlc_buffer_resize(ofbuf, ibuflen, 0);
279
+
280
+ if (delayed) {
281
+ if (delayed->length != ibuflen)
282
+ pdlc_buffer_resize(delayed, ibuflen, 0);
283
+ middle_index = (start_index + nb_coefs / 2) & mask;
284
+ if (nb_coefs & 1) {
285
+ k = 0;
286
+ while ((start_index & 3) && k < ibuflen) {
287
+ fir->stater[fir->index] = ifbuf->data[k];
288
+ fir->index = (fir->index + 1) & mask;
289
+ coefs0 = (__m128*)fir->coefs[start_index & 3];
290
+ acc0 = _mm_setzero_ps();
291
+ j = startsimd;
292
+ for (i = 0; i < lensimd; i++) {
293
+ prod0 = _mm_mul_ps(coefs0[i], stater[j]);
294
+ acc0 = _mm_add_ps(acc0, prod0);
295
+ j = (j+1) & masksimd;
296
+ }
297
+ ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
298
+ start_index = (start_index + 1) & mask;
299
+ startsimd = start_index >> 2;
300
+ delayed->data[k] = fir->stater[middle_index];
301
+ middle_index = (middle_index + 1) & mask;
302
+ k++;
303
+ }
304
+ while (k + 4 <= ibuflen) {
305
+ fir->stater[ fir->index ] = ifbuf->data[k + 0];
306
+ fir->stater[(fir->index + 1) & mask] = ifbuf->data[k + 1];
307
+ fir->stater[(fir->index + 2) & mask] = ifbuf->data[k + 2];
308
+ fir->stater[(fir->index + 3) & mask] = ifbuf->data[k + 3];
309
+ fir->index = (fir->index + 4) & mask;
310
+ coefs0 = (__m128*)fir->coefs[(start_index + 0) & 3];
311
+ coefs1 = (__m128*)fir->coefs[(start_index + 1) & 3];
312
+ coefs2 = (__m128*)fir->coefs[(start_index + 2) & 3];
313
+ coefs3 = (__m128*)fir->coefs[(start_index + 3) & 3];
314
+ acc0 = _mm_setzero_ps();
315
+ acc1 = _mm_setzero_ps();
316
+ acc2 = _mm_setzero_ps();
317
+ acc3 = _mm_setzero_ps();
318
+ j = startsimd;
319
+ for (i = 0; i < lensimd; i++) {
320
+ statereal = stater[j];
321
+ prod0 = _mm_mul_ps(coefs0[i], statereal);
322
+ acc0 = _mm_add_ps(acc0, prod0);
323
+ prod1 = _mm_mul_ps(coefs1[i], statereal);
324
+ acc1 = _mm_add_ps(acc1, prod1);
325
+ prod2 = _mm_mul_ps(coefs2[i], statereal);
326
+ acc2 = _mm_add_ps(acc2, prod2);
327
+ prod3 = _mm_mul_ps(coefs3[i], statereal);
328
+ acc3 = _mm_add_ps(acc3, prod3);
329
+ j = (j+1) & masksimd;
330
+ }
331
+ #ifdef __SSE3__
332
+ acc0 = _mm_hadd_ps(acc0, acc1);
333
+ acc2 = _mm_hadd_ps(acc2, acc3);
334
+ acc0 = _mm_hadd_ps(acc0, acc2);
335
+ ofbuf->data[k+0] = acc0[0];
336
+ ofbuf->data[k+1] = acc0[1];
337
+ ofbuf->data[k+2] = acc0[2];
338
+ ofbuf->data[k+3] = acc0[3];
339
+ #else
340
+ ofbuf->data[k+0] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
341
+ ofbuf->data[k+1] = acc1[0] + acc1[1] + acc1[2] + acc1[3];
342
+ ofbuf->data[k+2] = acc2[0] + acc2[1] + acc2[2] + acc2[3];
343
+ ofbuf->data[k+3] = acc3[0] + acc3[1] + acc3[2] + acc3[3];
344
+ #endif
345
+ start_index = (start_index + 4) & mask;
346
+ startsimd = start_index >> 2;
347
+ delayed->data[k] = fir->stater[middle_index];
348
+ middle_index = (middle_index + 1) & mask;
349
+ k++;
350
+ delayed->data[k] = fir->stater[middle_index];
351
+ middle_index = (middle_index + 1) & mask;
352
+ k++;
353
+ delayed->data[k] = fir->stater[middle_index];
354
+ middle_index = (middle_index + 1) & mask;
355
+ k++;
356
+ delayed->data[k] = fir->stater[middle_index];
357
+ middle_index = (middle_index + 1) & mask;
358
+ k++;
359
+ }
360
+ for (; k < ibuflen; k++) {
361
+ fir->stater[fir->index] = ifbuf->data[k];
362
+ fir->index = (fir->index + 1) & mask;
363
+ coefs0 = (__m128*)fir->coefs[start_index & 3];
364
+ acc0 = _mm_setzero_ps();
365
+ j = startsimd;
366
+ for (i = 0; i < lensimd; i++) {
367
+ prod0 = _mm_mul_ps(coefs0[i], stater[j]);
368
+ acc0 = _mm_add_ps(acc0, prod0);
369
+ j = (j+1) & masksimd;
370
+ }
371
+ ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
372
+ start_index = (start_index + 1) & mask;
373
+ startsimd = start_index >> 2;
374
+ delayed->data[k] = fir->stater[middle_index];
375
+ middle_index = (middle_index + 1) & mask;
376
+ }
377
+ }
378
+ else {
379
+ k = 0;
380
+ while ((start_index & 3) && k < ibuflen) {
381
+ fir->stater[fir->index] = ifbuf->data[k];
382
+ fir->index = (fir->index + 1) & mask;
383
+ coefs0 = (__m128*)fir->coefs[start_index & 3];
384
+ acc0 = _mm_setzero_ps();
385
+ j = startsimd;
386
+ for (i = 0; i < lensimd; i++) {
387
+ prod0 = _mm_mul_ps(coefs0[i], stater[j]);
388
+ acc0 = _mm_add_ps(acc0, prod0);
389
+ j = (j+1) & masksimd;
390
+ }
391
+ ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
392
+ start_index = (start_index + 1) & mask;
393
+ startsimd = start_index >> 2;
394
+ delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
395
+ middle_index = (middle_index + 1) & mask;
396
+ k++;
397
+ }
398
+ while (k + 4 <= ibuflen) {
399
+ fir->stater[ fir->index ] = ifbuf->data[k + 0];
400
+ fir->stater[(fir->index + 1) & mask] = ifbuf->data[k + 1];
401
+ fir->stater[(fir->index + 2) & mask] = ifbuf->data[k + 2];
402
+ fir->stater[(fir->index + 3) & mask] = ifbuf->data[k + 3];
403
+ fir->index = (fir->index + 4) & mask;
404
+ coefs0 = (__m128*)fir->coefs[(start_index + 0) & 3];
405
+ coefs1 = (__m128*)fir->coefs[(start_index + 1) & 3];
406
+ coefs2 = (__m128*)fir->coefs[(start_index + 2) & 3];
407
+ coefs3 = (__m128*)fir->coefs[(start_index + 3) & 3];
408
+ acc0 = _mm_setzero_ps();
409
+ acc1 = _mm_setzero_ps();
410
+ acc2 = _mm_setzero_ps();
411
+ acc3 = _mm_setzero_ps();
412
+ j = startsimd;
413
+ for (i = 0; i < lensimd; i++) {
414
+ statereal = stater[j];
415
+ prod0 = _mm_mul_ps(coefs0[i], statereal);
416
+ acc0 = _mm_add_ps(acc0, prod0);
417
+ prod1 = _mm_mul_ps(coefs1[i], statereal);
418
+ acc1 = _mm_add_ps(acc1, prod1);
419
+ prod2 = _mm_mul_ps(coefs2[i], statereal);
420
+ acc2 = _mm_add_ps(acc2, prod2);
421
+ prod3 = _mm_mul_ps(coefs3[i], statereal);
422
+ acc3 = _mm_add_ps(acc3, prod3);
423
+ j = (j+1) & masksimd;
424
+ }
425
+ #ifdef __SSE3__
426
+ acc0 = _mm_hadd_ps(acc0, acc1);
427
+ acc2 = _mm_hadd_ps(acc2, acc3);
428
+ acc0 = _mm_hadd_ps(acc0, acc2);
429
+ ofbuf->data[k+0] = acc0[0];
430
+ ofbuf->data[k+1] = acc0[1];
431
+ ofbuf->data[k+2] = acc0[2];
432
+ ofbuf->data[k+3] = acc0[3];
433
+ #else
434
+ ofbuf->data[k+0] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
435
+ ofbuf->data[k+1] = acc1[0] + acc1[1] + acc1[2] + acc1[3];
436
+ ofbuf->data[k+2] = acc2[0] + acc2[1] + acc2[2] + acc2[3];
437
+ ofbuf->data[k+3] = acc3[0] + acc3[1] + acc3[2] + acc3[3];
438
+ #endif
439
+ start_index = (start_index + 4) & mask;
440
+ startsimd = start_index >> 2;
441
+ delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
442
+ middle_index = (middle_index + 1) & mask;
443
+ k++;
444
+ delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
445
+ middle_index = (middle_index + 1) & mask;
446
+ k++;
447
+ delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
448
+ middle_index = (middle_index + 1) & mask;
449
+ k++;
450
+ delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
451
+ middle_index = (middle_index + 1) & mask;
452
+ k++;
453
+ }
454
+ for (; k < ibuflen; k++) {
455
+ fir->stater[fir->index] = ifbuf->data[k];
456
+ fir->index = (fir->index + 1) & mask;
457
+ coefs0 = (__m128*)fir->coefs[start_index & 3];
458
+ acc0 = _mm_setzero_ps();
459
+ j = startsimd;
460
+ for (i = 0; i < lensimd; i++) {
461
+ prod0 = _mm_mul_ps(coefs0[i], stater[j]);
462
+ acc0 = _mm_add_ps(acc0, prod0);
463
+ j = (j+1) & masksimd;
464
+ }
465
+ ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
466
+ start_index = (start_index + 1) & mask;
467
+ startsimd = start_index >> 2;
468
+ delayed->data[k] = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
469
+ middle_index = (middle_index + 1) & mask;
470
+ }
471
+ }
472
+ }
473
+ else {
474
+ k = 0;
475
+ while ((start_index & 3) && k < ibuflen) {
476
+ fir->stater[fir->index] = ifbuf->data[k];
477
+ fir->index = (fir->index + 1) & mask;
478
+ coefs0 = (__m128*)fir->coefs[start_index & 3];
479
+ acc0 = _mm_setzero_ps();
480
+ j = startsimd;
481
+ for (i = 0; i < lensimd; i++) {
482
+ prod0 = _mm_mul_ps(coefs0[i], stater[j]);
483
+ acc0 = _mm_add_ps(acc0, prod0);
484
+ j = (j+1) & masksimd;
485
+ }
486
+ ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
487
+ start_index = (start_index + 1) & mask;
488
+ startsimd = start_index >> 2;
489
+ k++;
490
+ }
491
+ while (k + 4 <= ibuflen) {
492
+ fir->stater[ fir->index ] = ifbuf->data[k + 0];
493
+ fir->stater[(fir->index + 1) & mask] = ifbuf->data[k + 1];
494
+ fir->stater[(fir->index + 2) & mask] = ifbuf->data[k + 2];
495
+ fir->stater[(fir->index + 3) & mask] = ifbuf->data[k + 3];
496
+ fir->index = (fir->index + 4) & mask;
497
+ coefs0 = (__m128*)fir->coefs[(start_index + 0) & 3];
498
+ coefs1 = (__m128*)fir->coefs[(start_index + 1) & 3];
499
+ coefs2 = (__m128*)fir->coefs[(start_index + 2) & 3];
500
+ coefs3 = (__m128*)fir->coefs[(start_index + 3) & 3];
501
+ acc0 = _mm_setzero_ps();
502
+ acc1 = _mm_setzero_ps();
503
+ acc2 = _mm_setzero_ps();
504
+ acc3 = _mm_setzero_ps();
505
+ j = startsimd;
506
+ for (i = 0; i < lensimd; i++) {
507
+ statereal = stater[j];
508
+ prod0 = _mm_mul_ps(coefs0[i], statereal);
509
+ acc0 = _mm_add_ps(acc0, prod0);
510
+ prod1 = _mm_mul_ps(coefs1[i], statereal);
511
+ acc1 = _mm_add_ps(acc1, prod1);
512
+ prod2 = _mm_mul_ps(coefs2[i], statereal);
513
+ acc2 = _mm_add_ps(acc2, prod2);
514
+ prod3 = _mm_mul_ps(coefs3[i], statereal);
515
+ acc3 = _mm_add_ps(acc3, prod3);
516
+ j = (j+1) & masksimd;
517
+ }
518
+ #ifdef __SSE3__
519
+ acc0 = _mm_hadd_ps(acc0, acc1);
520
+ acc2 = _mm_hadd_ps(acc2, acc3);
521
+ acc0 = _mm_hadd_ps(acc0, acc2);
522
+ ofbuf->data[k+0] = acc0[0];
523
+ ofbuf->data[k+1] = acc0[1];
524
+ ofbuf->data[k+2] = acc0[2];
525
+ ofbuf->data[k+3] = acc0[3];
526
+ #else
527
+ ofbuf->data[k+0] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
528
+ ofbuf->data[k+1] = acc1[0] + acc1[1] + acc1[2] + acc1[3];
529
+ ofbuf->data[k+2] = acc2[0] + acc2[1] + acc2[2] + acc2[3];
530
+ ofbuf->data[k+3] = acc3[0] + acc3[1] + acc3[2] + acc3[3];
531
+ #endif
532
+ start_index = (start_index + 4) & mask;
533
+ startsimd = start_index >> 2;
534
+ k += 4;
535
+ }
536
+ for (; k < ibuflen; k++) {
537
+ fir->stater[fir->index] = ifbuf->data[k];
538
+ fir->index = (fir->index + 1) & mask;
539
+ coefs0 = (__m128*)fir->coefs[start_index & 3];
540
+ acc0 = _mm_setzero_ps();
541
+ j = startsimd;
542
+ for (i = 0; i < lensimd; i++) {
543
+ prod0 = _mm_mul_ps(coefs0[i], stater[j]);
544
+ acc0 = _mm_add_ps(acc0, prod0);
545
+ j = (j+1) & masksimd;
546
+ }
547
+ ofbuf->data[k] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
548
+ start_index = (start_index + 1) & mask;
549
+ startsimd = start_index >> 2;
550
+ }
551
+ }
552
+
553
+ return ofbuf;
554
+ }
555
+
556
+
557
+ pdlc_complex_buffer_t* pdlc_fir_filter_filter_complex_buffer(pdlc_fir_filter_t* fir, const pdlc_complex_buffer_t *icbuf, pdlc_complex_buffer_t *ocbuf, pdlc_complex_buffer_t *delayed)
558
+ {
559
+ const unsigned int nb_coefs = fir->nb_coefs;
560
+ const unsigned int flt_len = fir->state_len;
561
+ const unsigned int mask = fir->index_mask;
562
+ const unsigned int lensimd = fir->coef_len >> 2;
563
+ const unsigned int masksimd = mask >> 2;
564
+ const size_t ibuflen = icbuf->length;
565
+ unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask;
566
+ unsigned int startsimd = start_index >> 2;
567
+ unsigned int middle_index;
568
+ unsigned int i, j;
569
+ size_t k;
570
+ register __m128 acc0r, acc1r, acc2r, acc3r;
571
+ register __m128 acc0i, acc1i, acc2i, acc3i;
572
+ register __m128 prod0r, prod1r, prod2r, prod3r;
573
+ register __m128 prod0i, prod1i, prod2i, prod3i;
574
+ register __m128 statereal;
575
+ register __m128 stateimag;
576
+ const __m128 *coefs0, *coefs1, *coefs2, *coefs3;
577
+ __m128 *stater = (__m128*)fir->stater;
578
+ __m128 *statei = (__m128*)fir->statei;
579
+
580
+ if (!ocbuf)
581
+ ocbuf = pdlc_complex_buffer_new(ibuflen);
582
+ else if (ocbuf->length != ibuflen)
583
+ pdlc_complex_buffer_resize(ocbuf, ibuflen, 0);
584
+
585
+ if (delayed) {
586
+ if (delayed->length != ibuflen)
587
+ pdlc_complex_buffer_resize(delayed, ibuflen, 0);
588
+ middle_index = (start_index + nb_coefs / 2) & mask;
589
+ if (nb_coefs & 1) {
590
+ //delayed->data[k] = fir->stater[middle_index];
591
+ //middle_index = (middle_index + 1) & mask;
592
+ k = 0;
593
+ while ((start_index & 3) && k < ibuflen) {
594
+ fir->stater[fir->index] = icbuf->data[k].real;
595
+ fir->statei[fir->index] = icbuf->data[k].imag;
596
+ fir->index = (fir->index + 1) & mask;
597
+ coefs0 = (__m128*)fir->coefs[start_index & 3];
598
+ acc0r = _mm_setzero_ps();
599
+ acc0i = _mm_setzero_ps();
600
+ j = startsimd;
601
+ for (i = 0; i < lensimd; i++) {
602
+ prod0r = _mm_mul_ps(coefs0[i], stater[j]);
603
+ prod0i = _mm_mul_ps(coefs0[i], statei[j]);
604
+ acc0r = _mm_add_ps(acc0r, prod0r);
605
+ acc0i = _mm_add_ps(acc0i, prod0i);
606
+ j = (j+1) & masksimd;
607
+ }
608
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
609
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
610
+ start_index = (start_index + 1) & mask;
611
+ startsimd = start_index >> 2;
612
+ delayed->data[k].real = fir->stater[middle_index];
613
+ delayed->data[k].imag = fir->statei[middle_index];
614
+ middle_index = (middle_index + 1) & mask;
615
+ k++;
616
+ }
617
+ while (k + 4 <= ibuflen) {
618
+ fir->stater[ fir->index ] = icbuf->data[k + 0].real;
619
+ fir->stater[(fir->index + 1) & mask] = icbuf->data[k + 1].real;
620
+ fir->stater[(fir->index + 2) & mask] = icbuf->data[k + 2].real;
621
+ fir->stater[(fir->index + 3) & mask] = icbuf->data[k + 3].real;
622
+ fir->statei[ fir->index ] = icbuf->data[k + 0].imag;
623
+ fir->statei[(fir->index + 1) & mask] = icbuf->data[k + 1].imag;
624
+ fir->statei[(fir->index + 2) & mask] = icbuf->data[k + 2].imag;
625
+ fir->statei[(fir->index + 3) & mask] = icbuf->data[k + 3].imag;
626
+ fir->index = (fir->index + 4) & mask;
627
+ coefs0 = (__m128*)fir->coefs[(start_index + 0) & 3];
628
+ coefs1 = (__m128*)fir->coefs[(start_index + 1) & 3];
629
+ coefs2 = (__m128*)fir->coefs[(start_index + 2) & 3];
630
+ coefs3 = (__m128*)fir->coefs[(start_index + 3) & 3];
631
+ acc0r = _mm_setzero_ps();
632
+ acc0i = _mm_setzero_ps();
633
+ acc1r = _mm_setzero_ps();
634
+ acc1i = _mm_setzero_ps();
635
+ acc2r = _mm_setzero_ps();
636
+ acc2i = _mm_setzero_ps();
637
+ acc3r = _mm_setzero_ps();
638
+ acc3i = _mm_setzero_ps();
639
+ j = startsimd;
640
+ for (i = 0; i < lensimd; i++) {
641
+ statereal = stater[j];
642
+ prod0r = _mm_mul_ps(coefs0[i], statereal);
643
+ acc0r = _mm_add_ps(acc0r, prod0r);
644
+ prod1r = _mm_mul_ps(coefs1[i], statereal);
645
+ acc1r = _mm_add_ps(acc1r, prod1r);
646
+ prod2r = _mm_mul_ps(coefs2[i], statereal);
647
+ acc2r = _mm_add_ps(acc2r, prod2r);
648
+ prod3r = _mm_mul_ps(coefs3[i], statereal);
649
+ acc3r = _mm_add_ps(acc3r, prod3r);
650
+ stateimag = statei[j];
651
+ prod0i = _mm_mul_ps(coefs0[i], stateimag);
652
+ acc0i = _mm_add_ps(acc0i, prod0i);
653
+ prod1i = _mm_mul_ps(coefs1[i], stateimag);
654
+ acc1i = _mm_add_ps(acc1i, prod1i);
655
+ prod2i = _mm_mul_ps(coefs2[i], stateimag);
656
+ acc2i = _mm_add_ps(acc2i, prod2i);
657
+ prod3i = _mm_mul_ps(coefs3[i], stateimag);
658
+ acc3i = _mm_add_ps(acc3i, prod3i);
659
+ j = (j+1) & masksimd;
660
+ }
661
+ #ifdef __SSE3__
662
+ acc0r = _mm_hadd_ps(acc0r, acc1r);
663
+ acc0i = _mm_hadd_ps(acc0i, acc1i);
664
+ acc2r = _mm_hadd_ps(acc2r, acc3r);
665
+ acc2i = _mm_hadd_ps(acc2i, acc3i);
666
+ acc0r = _mm_hadd_ps(acc0r, acc2r);
667
+ acc0i = _mm_hadd_ps(acc0i, acc2i);
668
+ ocbuf->data[k+0].real = acc0r[0];
669
+ ocbuf->data[k+0].imag = acc0i[0];
670
+ ocbuf->data[k+1].real = acc0r[1];
671
+ ocbuf->data[k+1].imag = acc0i[1];
672
+ ocbuf->data[k+2].real = acc0r[2];
673
+ ocbuf->data[k+2].imag = acc0i[2];
674
+ ocbuf->data[k+3].real = acc0r[3];
675
+ ocbuf->data[k+3].imag = acc0i[3];
676
+ #else
677
+ ocbuf->data[k+0].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
678
+ ocbuf->data[k+0].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
679
+ ocbuf->data[k+1].real = acc1r[0] + acc1r[1] + acc1r[2] + acc1r[3];
680
+ ocbuf->data[k+1].imag = acc1i[0] + acc1i[1] + acc1i[2] + acc1i[3];
681
+ ocbuf->data[k+2].real = acc2r[0] + acc2r[1] + acc2r[2] + acc2r[3];
682
+ ocbuf->data[k+2].imag = acc2i[0] + acc2i[1] + acc2i[2] + acc2i[3];
683
+ ocbuf->data[k+3].real = acc3r[0] + acc3r[1] + acc3r[2] + acc3r[3];
684
+ ocbuf->data[k+3].imag = acc3i[0] + acc3i[1] + acc3i[2] + acc3i[3];
685
+ #endif
686
+ start_index = (start_index + 4) & mask;
687
+ startsimd = start_index >> 2;
688
+ delayed->data[k].real = fir->stater[middle_index];
689
+ delayed->data[k].imag = fir->statei[middle_index];
690
+ middle_index = (middle_index + 1) & mask;
691
+ k++;
692
+ delayed->data[k].real = fir->stater[middle_index];
693
+ delayed->data[k].imag = fir->statei[middle_index];
694
+ middle_index = (middle_index + 1) & mask;
695
+ k++;
696
+ delayed->data[k].real = fir->stater[middle_index];
697
+ delayed->data[k].imag = fir->statei[middle_index];
698
+ middle_index = (middle_index + 1) & mask;
699
+ k++;
700
+ delayed->data[k].real = fir->stater[middle_index];
701
+ delayed->data[k].imag = fir->statei[middle_index];
702
+ middle_index = (middle_index + 1) & mask;
703
+ k++;
704
+ }
705
+ for (; k < ibuflen; k++) {
706
+ fir->stater[fir->index] = icbuf->data[k].real;
707
+ fir->statei[fir->index] = icbuf->data[k].imag;
708
+ fir->index = (fir->index + 1) & mask;
709
+ coefs0 = (__m128*)fir->coefs[start_index & 3];
710
+ acc0r = _mm_setzero_ps();
711
+ acc0i = _mm_setzero_ps();
712
+ j = startsimd;
713
+ for (i = 0; i < lensimd; i++) {
714
+ prod0r = _mm_mul_ps(coefs0[i], stater[j]);
715
+ acc0r = _mm_add_ps(acc0r, prod0r);
716
+ prod0i = _mm_mul_ps(coefs0[i], statei[j]);
717
+ acc0i = _mm_add_ps(acc0i, prod0i);
718
+ j = (j+1) & masksimd;
719
+ }
720
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
721
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
722
+ start_index = (start_index + 1) & mask;
723
+ startsimd = start_index >> 2;
724
+ delayed->data[k].real = fir->stater[middle_index];
725
+ delayed->data[k].imag = fir->statei[middle_index];
726
+ middle_index = (middle_index + 1) & mask;
727
+ }
728
+ }
729
+ else {
730
+ k = 0;
731
+ while ((start_index & 3) && k < ibuflen) {
732
+ fir->stater[fir->index] = icbuf->data[k].real;
733
+ fir->statei[fir->index] = icbuf->data[k].imag;
734
+ fir->index = (fir->index + 1) & mask;
735
+ coefs0 = (__m128*)fir->coefs[start_index & 3];
736
+ acc0r = _mm_setzero_ps();
737
+ acc0i = _mm_setzero_ps();
738
+ j = startsimd;
739
+ for (i = 0; i < lensimd; i++) {
740
+ prod0r = _mm_mul_ps(coefs0[i], stater[j]);
741
+ prod0i = _mm_mul_ps(coefs0[i], statei[j]);
742
+ acc0r = _mm_add_ps(acc0r, prod0r);
743
+ acc0i = _mm_add_ps(acc0i, prod0i);
744
+ j = (j+1) & masksimd;
745
+ }
746
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
747
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
748
+ start_index = (start_index + 1) & mask;
749
+ startsimd = start_index >> 2;
750
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
751
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
752
+ middle_index = (middle_index + 1) & mask;
753
+ k++;
754
+ }
755
+ while (k + 4 <= ibuflen) {
756
+ fir->stater[ fir->index ] = icbuf->data[k + 0].real;
757
+ fir->stater[(fir->index + 1) & mask] = icbuf->data[k + 1].real;
758
+ fir->stater[(fir->index + 2) & mask] = icbuf->data[k + 2].real;
759
+ fir->stater[(fir->index + 3) & mask] = icbuf->data[k + 3].real;
760
+ fir->statei[ fir->index ] = icbuf->data[k + 0].imag;
761
+ fir->statei[(fir->index + 1) & mask] = icbuf->data[k + 1].imag;
762
+ fir->statei[(fir->index + 2) & mask] = icbuf->data[k + 2].imag;
763
+ fir->statei[(fir->index + 3) & mask] = icbuf->data[k + 3].imag;
764
+ fir->index = (fir->index + 4) & mask;
765
+ coefs0 = (__m128*)fir->coefs[(start_index + 0) & 3];
766
+ coefs1 = (__m128*)fir->coefs[(start_index + 1) & 3];
767
+ coefs2 = (__m128*)fir->coefs[(start_index + 2) & 3];
768
+ coefs3 = (__m128*)fir->coefs[(start_index + 3) & 3];
769
+ acc0r = _mm_setzero_ps();
770
+ acc0i = _mm_setzero_ps();
771
+ acc1r = _mm_setzero_ps();
772
+ acc1i = _mm_setzero_ps();
773
+ acc2r = _mm_setzero_ps();
774
+ acc2i = _mm_setzero_ps();
775
+ acc3r = _mm_setzero_ps();
776
+ acc3i = _mm_setzero_ps();
777
+ j = startsimd;
778
+ for (i = 0; i < lensimd; i++) {
779
+ statereal = stater[j];
780
+ prod0r = _mm_mul_ps(coefs0[i], statereal);
781
+ acc0r = _mm_add_ps(acc0r, prod0r);
782
+ prod1r = _mm_mul_ps(coefs1[i], statereal);
783
+ acc1r = _mm_add_ps(acc1r, prod1r);
784
+ prod2r = _mm_mul_ps(coefs2[i], statereal);
785
+ acc2r = _mm_add_ps(acc2r, prod2r);
786
+ prod3r = _mm_mul_ps(coefs3[i], statereal);
787
+ acc3r = _mm_add_ps(acc3r, prod3r);
788
+ stateimag = statei[j];
789
+ prod0i = _mm_mul_ps(coefs0[i], stateimag);
790
+ acc0i = _mm_add_ps(acc0i, prod0i);
791
+ prod1i = _mm_mul_ps(coefs1[i], stateimag);
792
+ acc1i = _mm_add_ps(acc1i, prod1i);
793
+ prod2i = _mm_mul_ps(coefs2[i], stateimag);
794
+ acc2i = _mm_add_ps(acc2i, prod2i);
795
+ prod3i = _mm_mul_ps(coefs3[i], stateimag);
796
+ acc3i = _mm_add_ps(acc3i, prod3i);
797
+ j = (j+1) & masksimd;
798
+ }
799
+ #ifdef __SSE3__
800
+ acc0r = _mm_hadd_ps(acc0r, acc1r);
801
+ acc0i = _mm_hadd_ps(acc0i, acc1i);
802
+ acc2r = _mm_hadd_ps(acc2r, acc3r);
803
+ acc2i = _mm_hadd_ps(acc2i, acc3i);
804
+ acc0r = _mm_hadd_ps(acc0r, acc2r);
805
+ acc0i = _mm_hadd_ps(acc0i, acc2i);
806
+ ocbuf->data[k+0].real = acc0r[0];
807
+ ocbuf->data[k+0].imag = acc0i[0];
808
+ ocbuf->data[k+1].real = acc0r[1];
809
+ ocbuf->data[k+1].imag = acc0i[1];
810
+ ocbuf->data[k+2].real = acc0r[2];
811
+ ocbuf->data[k+2].imag = acc0i[2];
812
+ ocbuf->data[k+3].real = acc0r[3];
813
+ ocbuf->data[k+3].imag = acc0i[3];
814
+ #else
815
+ ocbuf->data[k+0].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
816
+ ocbuf->data[k+0].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
817
+ ocbuf->data[k+1].real = acc1r[0] + acc1r[1] + acc1r[2] + acc1r[3];
818
+ ocbuf->data[k+1].imag = acc1i[0] + acc1i[1] + acc1i[2] + acc1i[3];
819
+ ocbuf->data[k+2].real = acc2r[0] + acc2r[1] + acc2r[2] + acc2r[3];
820
+ ocbuf->data[k+2].imag = acc2i[0] + acc2i[1] + acc2i[2] + acc2i[3];
821
+ ocbuf->data[k+3].real = acc3r[0] + acc3r[1] + acc3r[2] + acc3r[3];
822
+ ocbuf->data[k+3].imag = acc3i[0] + acc3i[1] + acc3i[2] + acc3i[3];
823
+ #endif
824
+ start_index = (start_index + 4) & mask;
825
+ startsimd = start_index >> 2;
826
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
827
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
828
+ middle_index = (middle_index + 1) & mask;
829
+ k++;
830
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
831
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
832
+ middle_index = (middle_index + 1) & mask;
833
+ k++;
834
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
835
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
836
+ middle_index = (middle_index + 1) & mask;
837
+ k++;
838
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
839
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
840
+ middle_index = (middle_index + 1) & mask;
841
+ k++;
842
+ }
843
+ for (; k < ibuflen; k++) {
844
+ fir->stater[fir->index] = icbuf->data[k].real;
845
+ fir->statei[fir->index] = icbuf->data[k].imag;
846
+ fir->index = (fir->index + 1) & mask;
847
+ coefs0 = (__m128*)fir->coefs[start_index & 3];
848
+ acc0r = _mm_setzero_ps();
849
+ acc0i = _mm_setzero_ps();
850
+ j = startsimd;
851
+ for (i = 0; i < lensimd; i++) {
852
+ prod0r = _mm_mul_ps(coefs0[i], stater[j]);
853
+ acc0r = _mm_add_ps(acc0r, prod0r);
854
+ prod0i = _mm_mul_ps(coefs0[i], statei[j]);
855
+ acc0i = _mm_add_ps(acc0i, prod0i);
856
+ j = (j+1) & masksimd;
857
+ }
858
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
859
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
860
+ start_index = (start_index + 1) & mask;
861
+ startsimd = start_index >> 2;
862
+ delayed->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
863
+ delayed->data[k].imag = (fir->statei[middle_index] + fir->statei[(middle_index - 1) & mask]) / 2.0f;
864
+ middle_index = (middle_index + 1) & mask;
865
+ }
866
+ }
867
+ }
868
+ else {
869
+ k = 0;
870
+ while ((start_index & 3) && k < ibuflen) {
871
+ fir->stater[fir->index] = icbuf->data[k].real;
872
+ fir->statei[fir->index] = icbuf->data[k].imag;
873
+ fir->index = (fir->index + 1) & mask;
874
+ coefs0 = (__m128*)fir->coefs[start_index & 3];
875
+ acc0r = _mm_setzero_ps();
876
+ acc0i = _mm_setzero_ps();
877
+ j = startsimd;
878
+ for (i = 0; i < lensimd; i++) {
879
+ prod0r = _mm_mul_ps(coefs0[i], stater[j]);
880
+ prod0i = _mm_mul_ps(coefs0[i], statei[j]);
881
+ acc0r = _mm_add_ps(acc0r, prod0r);
882
+ acc0i = _mm_add_ps(acc0i, prod0i);
883
+ j = (j+1) & masksimd;
884
+ }
885
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
886
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
887
+ start_index = (start_index + 1) & mask;
888
+ startsimd = start_index >> 2;
889
+ k++;
890
+ }
891
+ while (k + 4 <= ibuflen) {
892
+ fir->stater[ fir->index ] = icbuf->data[k + 0].real;
893
+ fir->stater[(fir->index + 1) & mask] = icbuf->data[k + 1].real;
894
+ fir->stater[(fir->index + 2) & mask] = icbuf->data[k + 2].real;
895
+ fir->stater[(fir->index + 3) & mask] = icbuf->data[k + 3].real;
896
+ fir->statei[ fir->index ] = icbuf->data[k + 0].imag;
897
+ fir->statei[(fir->index + 1) & mask] = icbuf->data[k + 1].imag;
898
+ fir->statei[(fir->index + 2) & mask] = icbuf->data[k + 2].imag;
899
+ fir->statei[(fir->index + 3) & mask] = icbuf->data[k + 3].imag;
900
+ fir->index = (fir->index + 4) & mask;
901
+ coefs0 = (__m128*)fir->coefs[(start_index + 0) & 3];
902
+ coefs1 = (__m128*)fir->coefs[(start_index + 1) & 3];
903
+ coefs2 = (__m128*)fir->coefs[(start_index + 2) & 3];
904
+ coefs3 = (__m128*)fir->coefs[(start_index + 3) & 3];
905
+ acc0r = _mm_setzero_ps();
906
+ acc0i = _mm_setzero_ps();
907
+ acc1r = _mm_setzero_ps();
908
+ acc1i = _mm_setzero_ps();
909
+ acc2r = _mm_setzero_ps();
910
+ acc2i = _mm_setzero_ps();
911
+ acc3r = _mm_setzero_ps();
912
+ acc3i = _mm_setzero_ps();
913
+ j = startsimd;
914
+ for (i = 0; i < lensimd; i++) {
915
+ statereal = stater[j];
916
+ prod0r = _mm_mul_ps(coefs0[i], statereal);
917
+ acc0r = _mm_add_ps(acc0r, prod0r);
918
+ prod1r = _mm_mul_ps(coefs1[i], statereal);
919
+ acc1r = _mm_add_ps(acc1r, prod1r);
920
+ prod2r = _mm_mul_ps(coefs2[i], statereal);
921
+ acc2r = _mm_add_ps(acc2r, prod2r);
922
+ prod3r = _mm_mul_ps(coefs3[i], statereal);
923
+ acc3r = _mm_add_ps(acc3r, prod3r);
924
+ stateimag = statei[j];
925
+ prod0i = _mm_mul_ps(coefs0[i], stateimag);
926
+ acc0i = _mm_add_ps(acc0i, prod0i);
927
+ prod1i = _mm_mul_ps(coefs1[i], stateimag);
928
+ acc1i = _mm_add_ps(acc1i, prod1i);
929
+ prod2i = _mm_mul_ps(coefs2[i], stateimag);
930
+ acc2i = _mm_add_ps(acc2i, prod2i);
931
+ prod3i = _mm_mul_ps(coefs3[i], stateimag);
932
+ acc3i = _mm_add_ps(acc3i, prod3i);
933
+ j = (j+1) & masksimd;
934
+ }
935
+ #ifdef __SSE3__
936
+ acc0r = _mm_hadd_ps(acc0r, acc1r);
937
+ acc0i = _mm_hadd_ps(acc0i, acc1i);
938
+ acc2r = _mm_hadd_ps(acc2r, acc3r);
939
+ acc2i = _mm_hadd_ps(acc2i, acc3i);
940
+ acc0r = _mm_hadd_ps(acc0r, acc2r);
941
+ acc0i = _mm_hadd_ps(acc0i, acc2i);
942
+ ocbuf->data[k+0].real = acc0r[0];
943
+ ocbuf->data[k+0].imag = acc0i[0];
944
+ ocbuf->data[k+1].real = acc0r[1];
945
+ ocbuf->data[k+1].imag = acc0i[1];
946
+ ocbuf->data[k+2].real = acc0r[2];
947
+ ocbuf->data[k+2].imag = acc0i[2];
948
+ ocbuf->data[k+3].real = acc0r[3];
949
+ ocbuf->data[k+3].imag = acc0i[3];
950
+ #else
951
+ ocbuf->data[k+0].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
952
+ ocbuf->data[k+0].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
953
+ ocbuf->data[k+1].real = acc1r[0] + acc1r[1] + acc1r[2] + acc1r[3];
954
+ ocbuf->data[k+1].imag = acc1i[0] + acc1i[1] + acc1i[2] + acc1i[3];
955
+ ocbuf->data[k+2].real = acc2r[0] + acc2r[1] + acc2r[2] + acc2r[3];
956
+ ocbuf->data[k+2].imag = acc2i[0] + acc2i[1] + acc2i[2] + acc2i[3];
957
+ ocbuf->data[k+3].real = acc3r[0] + acc3r[1] + acc3r[2] + acc3r[3];
958
+ ocbuf->data[k+3].imag = acc3i[0] + acc3i[1] + acc3i[2] + acc3i[3];
959
+ #endif
960
+ start_index = (start_index + 4) & mask;
961
+ startsimd = start_index >> 2;
962
+ k += 4;
963
+ }
964
+ for (; k < ibuflen; k++) {
965
+ fir->stater[fir->index] = icbuf->data[k].real;
966
+ fir->statei[fir->index] = icbuf->data[k].imag;
967
+ fir->index = (fir->index + 1) & mask;
968
+ coefs0 = (__m128*)fir->coefs[start_index & 3];
969
+ acc0r = _mm_setzero_ps();
970
+ acc0i = _mm_setzero_ps();
971
+ j = startsimd;
972
+ for (i = 0; i < lensimd; i++) {
973
+ prod0r = _mm_mul_ps(coefs0[i], stater[j]);
974
+ acc0r = _mm_add_ps(acc0r, prod0r);
975
+ prod0i = _mm_mul_ps(coefs0[i], statei[j]);
976
+ acc0i = _mm_add_ps(acc0i, prod0i);
977
+ j = (j+1) & masksimd;
978
+ }
979
+ ocbuf->data[k].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
980
+ ocbuf->data[k].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
981
+ start_index = (start_index + 1) & mask;
982
+ startsimd = start_index >> 2;
983
+ }
984
+ }
985
+
986
+ return ocbuf;
987
+ }
988
+
989
+
990
+ pdlc_buffer_t* pdlc_fir_filter_interpolate_float_buffer(pdlc_fir_filter_t* fir, const pdlc_buffer_t *ifbuf, pdlc_buffer_t *ofbuf)
991
+ {
992
+ const unsigned int nb_coefs = fir->nb_coefs;
993
+ const unsigned int flt_len = fir->state_len;
994
+ const unsigned int mask = fir->index_mask;
995
+ const unsigned int lensimd = fir->coef_len >> 2;
996
+ const unsigned int masksimd = mask >> 2;
997
+ const size_t ibuflen = ifbuf->length;
998
+ const size_t obuflen = ibuflen*fir->max_counter;
999
+ const float ffactor = (float)(fir->max_counter);
1000
+ const size_t mcounter = fir->max_counter;
1001
+ unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask;
1002
+ unsigned int startsimd = start_index >> 2;
1003
+ unsigned int i, j;
1004
+ size_t k = 0, l = 0;
1005
+ register __m128 acc0, acc1, acc2, acc3;
1006
+ register __m128 prod0, prod1, prod2, prod3;
1007
+ register __m128 statereal;
1008
+ const __m128 *coefs0, *coefs1, *coefs2, *coefs3;
1009
+ __m128 *stater = (__m128*)fir->stater;
1010
+
1011
+ if (!ofbuf)
1012
+ ofbuf = pdlc_buffer_new(obuflen);
1013
+ else if (ofbuf->length != obuflen)
1014
+ pdlc_buffer_resize(ofbuf, obuflen, 0);
1015
+
1016
+
1017
+ while ((start_index & 3) && k < obuflen) {
1018
+ if ((k % mcounter) == 0)
1019
+ fir->stater[fir->index] = ifbuf->data[l++];
1020
+ else
1021
+ fir->stater[fir->index] = 0.0f;
1022
+ fir->index = (fir->index + 1) & mask;
1023
+ coefs0 = (__m128*)fir->coefs[start_index & 3];
1024
+ acc0 = _mm_setzero_ps();
1025
+ j = startsimd;
1026
+ for (i = 0; i < lensimd; i++) {
1027
+ prod0 = _mm_mul_ps(coefs0[i], stater[j]);
1028
+ acc0 = _mm_add_ps(acc0, prod0);
1029
+ j = (j+1) & masksimd;
1030
+ }
1031
+ ofbuf->data[k] = (acc0[0] + acc0[1] + acc0[2] + acc0[3]) * ffactor;
1032
+ start_index = (start_index + 1) & mask;
1033
+ startsimd = start_index >> 2;
1034
+ k++;
1035
+ }
1036
+ while (k + 4 <= obuflen) {
1037
+ if (((k+0) % mcounter) == 0)
1038
+ fir->stater[fir->index] = ifbuf->data[l++];
1039
+ else
1040
+ fir->stater[fir->index] = 0.0f;
1041
+ fir->index = (fir->index + 1) & mask;
1042
+ if (((k+1) % mcounter) == 0)
1043
+ fir->stater[fir->index] = ifbuf->data[l++];
1044
+ else
1045
+ fir->stater[fir->index] = 0.0f;
1046
+ fir->index = (fir->index + 1) & mask;
1047
+ if (((k+2) % mcounter) == 0)
1048
+ fir->stater[fir->index] = ifbuf->data[l++];
1049
+ else
1050
+ fir->stater[fir->index] = 0.0f;
1051
+ fir->index = (fir->index + 1) & mask;
1052
+ if (((k+3) % mcounter) == 0)
1053
+ fir->stater[fir->index] = ifbuf->data[l++];
1054
+ else
1055
+ fir->stater[fir->index] = 0.0f;
1056
+ fir->index = (fir->index + 1) & mask;
1057
+ coefs0 = (__m128*)fir->coefs[(start_index + 0) & 3];
1058
+ coefs1 = (__m128*)fir->coefs[(start_index + 1) & 3];
1059
+ coefs2 = (__m128*)fir->coefs[(start_index + 2) & 3];
1060
+ coefs3 = (__m128*)fir->coefs[(start_index + 3) & 3];
1061
+ acc0 = _mm_setzero_ps();
1062
+ acc1 = _mm_setzero_ps();
1063
+ acc2 = _mm_setzero_ps();
1064
+ acc3 = _mm_setzero_ps();
1065
+ j = startsimd;
1066
+ for (i = 0; i < lensimd; i++) {
1067
+ statereal = stater[j];
1068
+ prod0 = _mm_mul_ps(coefs0[i], statereal);
1069
+ acc0 = _mm_add_ps(acc0, prod0);
1070
+ prod1 = _mm_mul_ps(coefs1[i], statereal);
1071
+ acc1 = _mm_add_ps(acc1, prod1);
1072
+ prod2 = _mm_mul_ps(coefs2[i], statereal);
1073
+ acc2 = _mm_add_ps(acc2, prod2);
1074
+ prod3 = _mm_mul_ps(coefs3[i], statereal);
1075
+ acc3 = _mm_add_ps(acc3, prod3);
1076
+ j = (j+1) & masksimd;
1077
+ }
1078
+ #ifdef __SSE3__
1079
+ acc0 = _mm_hadd_ps(acc0, acc1);
1080
+ acc2 = _mm_hadd_ps(acc2, acc3);
1081
+ acc0 = _mm_hadd_ps(acc0, acc2);
1082
+ acc1 = _mm_set_ps1(ffactor);
1083
+ acc0 = _mm_mul_ps(acc0, acc1);
1084
+ ofbuf->data[k+0] = acc0[0];
1085
+ ofbuf->data[k+1] = acc0[1];
1086
+ ofbuf->data[k+2] = acc0[2];
1087
+ ofbuf->data[k+3] = acc0[3];
1088
+ #else
1089
+ ofbuf->data[k+0] = (acc0[0] + acc0[1] + acc0[2] + acc0[3]) * ffactor;
1090
+ ofbuf->data[k+1] = (acc1[0] + acc1[1] + acc1[2] + acc1[3]) * ffactor;
1091
+ ofbuf->data[k+2] = (acc2[0] + acc2[1] + acc2[2] + acc2[3]) * ffactor;
1092
+ ofbuf->data[k+3] = (acc3[0] + acc3[1] + acc3[2] + acc3[3]) * ffactor;
1093
+ #endif
1094
+ start_index = (start_index + 4) & mask;
1095
+ startsimd = start_index >> 2;
1096
+ k += 4;
1097
+ }
1098
+ for (; k < obuflen; k++) {
1099
+ if ((k % mcounter) == 0)
1100
+ fir->stater[fir->index] = ifbuf->data[l++];
1101
+ else
1102
+ fir->stater[fir->index] = 0.0f;
1103
+ fir->index = (fir->index + 1) & mask;
1104
+ coefs0 = (__m128*)fir->coefs[start_index & 3];
1105
+ acc0 = _mm_setzero_ps();
1106
+ j = startsimd;
1107
+ for (i = 0; i < lensimd; i++) {
1108
+ prod0 = _mm_mul_ps(coefs0[i], stater[j]);
1109
+ acc0 = _mm_add_ps(acc0, prod0);
1110
+ j = (j+1) & masksimd;
1111
+ }
1112
+ ofbuf->data[k] = (acc0[0] + acc0[1] + acc0[2] + acc0[3]) * ffactor;
1113
+ start_index = (start_index + 1) & mask;
1114
+ startsimd = start_index >> 2;
1115
+ }
1116
+
1117
+ return ofbuf;
1118
+ }
1119
+
1120
+
1121
+
1122
+ pdlc_complex_buffer_t* pdlc_fir_filter_interpolate_complex_buffer(pdlc_fir_filter_t* fir, const pdlc_complex_buffer_t *icbuf, pdlc_complex_buffer_t *ocbuf)
1123
+ {
1124
+ const unsigned int nb_coefs = fir->nb_coefs;
1125
+ const unsigned int flt_len = fir->state_len;
1126
+ const unsigned int mask = fir->index_mask;
1127
+ const unsigned int lensimd = fir->coef_len >> 2;
1128
+ const unsigned int masksimd = mask >> 2;
1129
+ const size_t ibuflen = icbuf->length;
1130
+ const size_t obuflen = ibuflen*fir->max_counter;
1131
+ const float ffactor = (float)(fir->max_counter);
1132
+ const size_t mcounter = fir->max_counter;
1133
+ unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask;
1134
+ unsigned int startsimd = start_index >> 2;
1135
+ unsigned int i, j;
1136
+ size_t k = 0, l = 0;
1137
+ register __m128 acc0r, acc1r, acc2r, acc3r;
1138
+ register __m128 acc0i, acc1i, acc2i, acc3i;
1139
+ register __m128 prod0r, prod1r, prod2r, prod3r;
1140
+ register __m128 prod0i, prod1i, prod2i, prod3i;
1141
+ register __m128 statereal;
1142
+ register __m128 stateimag;
1143
+ const __m128 *coefs0, *coefs1, *coefs2, *coefs3;
1144
+ __m128 *stater = (__m128*)fir->stater;
1145
+ __m128 *statei = (__m128*)fir->statei;
1146
+
1147
+ if (!ocbuf)
1148
+ ocbuf = pdlc_complex_buffer_new(obuflen);
1149
+ else if (ocbuf->length != obuflen)
1150
+ pdlc_complex_buffer_resize(ocbuf, obuflen, 0);
1151
+
1152
+
1153
+ while ((start_index & 3) && k < obuflen) {
1154
+ if ((k % mcounter) == 0) {
1155
+ fir->stater[fir->index] = icbuf->data[l].real;
1156
+ fir->statei[fir->index] = icbuf->data[l].imag;
1157
+ l++;
1158
+ }
1159
+ else {
1160
+ fir->stater[fir->index] = 0.0f;
1161
+ fir->statei[fir->index] = 0.0f;
1162
+ }
1163
+ fir->index = (fir->index + 1) & mask;
1164
+ coefs0 = (__m128*)fir->coefs[start_index & 3];
1165
+ acc0r = _mm_setzero_ps();
1166
+ acc0i = _mm_setzero_ps();
1167
+ j = startsimd;
1168
+ for (i = 0; i < lensimd; i++) {
1169
+ prod0r = _mm_mul_ps(coefs0[i], stater[j]);
1170
+ prod0i = _mm_mul_ps(coefs0[i], statei[j]);
1171
+ acc0r = _mm_add_ps(acc0r, prod0r);
1172
+ acc0i = _mm_add_ps(acc0i, prod0i);
1173
+ j = (j+1) & masksimd;
1174
+ }
1175
+ ocbuf->data[k].real = (acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3]) * ffactor;
1176
+ ocbuf->data[k].imag = (acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3]) * ffactor;
1177
+ start_index = (start_index + 1) & mask;
1178
+ startsimd = start_index >> 2;
1179
+ k++;
1180
+ }
1181
+ while (k + 4 <= obuflen) {
1182
+ if (((k+0) % mcounter) == 0) {
1183
+ fir->stater[fir->index] = icbuf->data[l].real;
1184
+ fir->statei[fir->index] = icbuf->data[l].imag;
1185
+ l++;
1186
+ }
1187
+ else {
1188
+ fir->stater[fir->index] = 0.0f;
1189
+ fir->statei[fir->index] = 0.0f;
1190
+ }
1191
+ fir->index = (fir->index + 1) & mask;
1192
+ if (((k+1) % mcounter) == 0) {
1193
+ fir->stater[fir->index] = icbuf->data[l].real;
1194
+ fir->statei[fir->index] = icbuf->data[l].imag;
1195
+ l++;
1196
+ }
1197
+ else {
1198
+ fir->stater[fir->index] = 0.0f;
1199
+ fir->statei[fir->index] = 0.0f;
1200
+ }
1201
+ fir->index = (fir->index + 1) & mask;
1202
+ if (((k+2) % mcounter) == 0) {
1203
+ fir->stater[fir->index] = icbuf->data[l].real;
1204
+ fir->statei[fir->index] = icbuf->data[l].imag;
1205
+ l++;
1206
+ }
1207
+ else {
1208
+ fir->stater[fir->index] = 0.0f;
1209
+ fir->statei[fir->index] = 0.0f;
1210
+ }
1211
+ fir->index = (fir->index + 1) & mask;
1212
+ if (((k+3) % mcounter) == 0) {
1213
+ fir->stater[fir->index] = icbuf->data[l].real;
1214
+ fir->statei[fir->index] = icbuf->data[l].imag;
1215
+ l++;
1216
+ }
1217
+ else {
1218
+ fir->stater[fir->index] = 0.0f;
1219
+ fir->statei[fir->index] = 0.0f;
1220
+ }
1221
+ fir->index = (fir->index + 1) & mask;
1222
+ coefs0 = (__m128*)fir->coefs[(start_index + 0) & 3];
1223
+ coefs1 = (__m128*)fir->coefs[(start_index + 1) & 3];
1224
+ coefs2 = (__m128*)fir->coefs[(start_index + 2) & 3];
1225
+ coefs3 = (__m128*)fir->coefs[(start_index + 3) & 3];
1226
+ acc0r = _mm_setzero_ps();
1227
+ acc0i = _mm_setzero_ps();
1228
+ acc1r = _mm_setzero_ps();
1229
+ acc1i = _mm_setzero_ps();
1230
+ acc2r = _mm_setzero_ps();
1231
+ acc2i = _mm_setzero_ps();
1232
+ acc3r = _mm_setzero_ps();
1233
+ acc3i = _mm_setzero_ps();
1234
+ j = startsimd;
1235
+ for (i = 0; i < lensimd; i++) {
1236
+ statereal = stater[j];
1237
+ prod0r = _mm_mul_ps(coefs0[i], statereal);
1238
+ acc0r = _mm_add_ps(acc0r, prod0r);
1239
+ prod1r = _mm_mul_ps(coefs1[i], statereal);
1240
+ acc1r = _mm_add_ps(acc1r, prod1r);
1241
+ prod2r = _mm_mul_ps(coefs2[i], statereal);
1242
+ acc2r = _mm_add_ps(acc2r, prod2r);
1243
+ prod3r = _mm_mul_ps(coefs3[i], statereal);
1244
+ acc3r = _mm_add_ps(acc3r, prod3r);
1245
+ stateimag = statei[j];
1246
+ prod0i = _mm_mul_ps(coefs0[i], stateimag);
1247
+ acc0i = _mm_add_ps(acc0i, prod0i);
1248
+ prod1i = _mm_mul_ps(coefs1[i], stateimag);
1249
+ acc1i = _mm_add_ps(acc1i, prod1i);
1250
+ prod2i = _mm_mul_ps(coefs2[i], stateimag);
1251
+ acc2i = _mm_add_ps(acc2i, prod2i);
1252
+ prod3i = _mm_mul_ps(coefs3[i], stateimag);
1253
+ acc3i = _mm_add_ps(acc3i, prod3i);
1254
+ j = (j+1) & masksimd;
1255
+ }
1256
+ #ifdef __SSE3__
1257
+ acc0r = _mm_hadd_ps(acc0r, acc1r);
1258
+ acc0i = _mm_hadd_ps(acc0i, acc1i);
1259
+ acc2r = _mm_hadd_ps(acc2r, acc3r);
1260
+ acc2i = _mm_hadd_ps(acc2i, acc3i);
1261
+ acc0r = _mm_hadd_ps(acc0r, acc2r);
1262
+ acc0i = _mm_hadd_ps(acc0i, acc2i);
1263
+ acc1r = _mm_set_ps1(ffactor);
1264
+ acc0r = _mm_mul_ps(acc0r, acc1r);
1265
+ acc0i = _mm_mul_ps(acc0i, acc1r);
1266
+ ocbuf->data[k+0].real = acc0r[0];
1267
+ ocbuf->data[k+0].imag = acc0i[0];
1268
+ ocbuf->data[k+1].real = acc0r[1];
1269
+ ocbuf->data[k+1].imag = acc0i[1];
1270
+ ocbuf->data[k+2].real = acc0r[2];
1271
+ ocbuf->data[k+2].imag = acc0i[2];
1272
+ ocbuf->data[k+3].real = acc0r[3];
1273
+ ocbuf->data[k+3].imag = acc0i[3];
1274
+ #else
1275
+ ocbuf->data[k+0].real = (acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3]) * ffactor;
1276
+ ocbuf->data[k+0].imag = (acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3]) * ffactor;
1277
+ ocbuf->data[k+1].real = (acc1r[0] + acc1r[1] + acc1r[2] + acc1r[3]) * ffactor;
1278
+ ocbuf->data[k+1].imag = (acc1i[0] + acc1i[1] + acc1i[2] + acc1i[3]) * ffactor;
1279
+ ocbuf->data[k+2].real = (acc2r[0] + acc2r[1] + acc2r[2] + acc2r[3]) * ffactor;
1280
+ ocbuf->data[k+2].imag = (acc2i[0] + acc2i[1] + acc2i[2] + acc2i[3]) * ffactor;
1281
+ ocbuf->data[k+3].real = (acc3r[0] + acc3r[1] + acc3r[2] + acc3r[3]) * ffactor;
1282
+ ocbuf->data[k+3].imag = (acc3i[0] + acc3i[1] + acc3i[2] + acc3i[3]) * ffactor;
1283
+ #endif
1284
+ start_index = (start_index + 4) & mask;
1285
+ startsimd = start_index >> 2;
1286
+ k += 4;
1287
+ }
1288
+ for (; k < obuflen; k++) {
1289
+ if ((k % mcounter) == 0) {
1290
+ fir->stater[fir->index] = icbuf->data[l].real;
1291
+ fir->statei[fir->index] = icbuf->data[l].imag;
1292
+ l++;
1293
+ }
1294
+ else {
1295
+ fir->stater[fir->index] = 0.0f;
1296
+ fir->statei[fir->index] = 0.0f;
1297
+ }
1298
+ fir->index = (fir->index + 1) & mask;
1299
+ coefs0 = (__m128*)fir->coefs[start_index & 3];
1300
+ acc0r = _mm_setzero_ps();
1301
+ acc0i = _mm_setzero_ps();
1302
+ j = startsimd;
1303
+ for (i = 0; i < lensimd; i++) {
1304
+ prod0r = _mm_mul_ps(coefs0[i], stater[j]);
1305
+ prod0i = _mm_mul_ps(coefs0[i], statei[j]);
1306
+ acc0r = _mm_add_ps(acc0r, prod0r);
1307
+ acc0i = _mm_add_ps(acc0i, prod0i);
1308
+ j = (j+1) & masksimd;
1309
+ }
1310
+ ocbuf->data[k].real = (acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3]) * ffactor;
1311
+ ocbuf->data[k].imag = (acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3]) * ffactor;
1312
+ start_index = (start_index + 1) & mask;
1313
+ startsimd = start_index >> 2;
1314
+ }
1315
+
1316
+ return ocbuf;
1317
+ }
1318
+
1319
+
1320
+ pdlc_buffer_t* pdlc_fir_filter_decimate_float_buffer(pdlc_fir_filter_t* fir, const pdlc_buffer_t *ifbuf, pdlc_buffer_t *ofbuf)
1321
+ {
1322
+ const unsigned int nb_coefs = fir->nb_coefs;
1323
+ const unsigned int flt_len = fir->state_len;
1324
+ const unsigned int mask = fir->index_mask;
1325
+ const unsigned int lensimd = fir->coef_len >> 2;
1326
+ const unsigned int masksimd = mask >> 2;
1327
+ const int mcounter = fir->max_counter;
1328
+ const size_t ibuflen = ifbuf->length;
1329
+ const size_t obuflen = (size_t)ceil(((double)ibuflen - (double)((mcounter - fir->counter) % mcounter)) / (double)mcounter);
1330
+ unsigned int start_index = (flt_len + fir->index + fir->counter + 1 - nb_coefs) & mask;
1331
+ unsigned int startsimd = start_index >> 2;
1332
+ unsigned int i0, i1, j0, j1;
1333
+ size_t k, l;
1334
+ register __m128 acc0, acc1;
1335
+ register __m128 prod0, prod1;
1336
+ const __m128 *coefs;
1337
+ __m128 *stater = (__m128*)fir->stater;
1338
+
1339
+
1340
+ if (!ofbuf)
1341
+ ofbuf = pdlc_buffer_new(obuflen);
1342
+ else if (ofbuf->length != obuflen)
1343
+ pdlc_buffer_resize(ofbuf, obuflen, 0);
1344
+
1345
+
1346
+ for (k = 0, l = 0; k < ibuflen; k++) {
1347
+ fir->stater[fir->index] = ifbuf->data[k];
1348
+ fir->index = (fir->index + 1) & mask;
1349
+ if (fir->counter == 0) {
1350
+ coefs = (__m128*)fir->coefs[start_index & 3];
1351
+ acc0 = _mm_setzero_ps();
1352
+ acc1 = _mm_setzero_ps();
1353
+ j0 = startsimd;
1354
+ j1 = (startsimd+1) & masksimd;
1355
+ i0 = 0;
1356
+ i1 = 1;
1357
+ while (i1 < lensimd) {
1358
+ prod0 = _mm_mul_ps(coefs[i0], stater[j0]);
1359
+ acc0 = _mm_add_ps(acc0, prod0);
1360
+ prod1 = _mm_mul_ps(coefs[i1], stater[j1]);
1361
+ acc1 = _mm_add_ps(acc1, prod1);
1362
+ i0 += 2;
1363
+ i1 += 2;
1364
+ j0 = (j0+2) & masksimd;
1365
+ j1 = (j1+2) & masksimd;
1366
+ }
1367
+ while (i0 < lensimd) {
1368
+ prod0 = _mm_mul_ps(coefs[i0], stater[j0]);
1369
+ acc0 = _mm_add_ps(acc0, prod0);
1370
+ i0 += 2;
1371
+ j0 = (j0+2) & masksimd;
1372
+ }
1373
+ acc0 = _mm_add_ps(acc0, acc1);
1374
+ ofbuf->data[l++] = acc0[0] + acc0[1] + acc0[2] + acc0[3];
1375
+ start_index = (start_index + mcounter) & mask;
1376
+ startsimd = start_index >> 2;
1377
+ }
1378
+ fir->counter = (fir->counter + 1) % mcounter;
1379
+ }
1380
+
1381
+ return ofbuf;
1382
+ }
1383
+
1384
+
1385
+ pdlc_complex_buffer_t* pdlc_fir_filter_decimate_complex_buffer(pdlc_fir_filter_t* fir, const pdlc_complex_buffer_t *icbuf, pdlc_complex_buffer_t *ocbuf)
1386
+ {
1387
+ const unsigned int nb_coefs = fir->nb_coefs;
1388
+ const unsigned int flt_len = fir->state_len;
1389
+ const unsigned int mask = fir->index_mask;
1390
+ const unsigned int lensimd = fir->coef_len >> 2;
1391
+ const unsigned int masksimd = mask >> 2;
1392
+ const int mcounter = fir->max_counter;
1393
+ const size_t ibuflen = icbuf->length;
1394
+ const size_t obuflen = (size_t)ceil(((double)ibuflen - (double)((mcounter - fir->counter) % mcounter)) / (double)mcounter);
1395
+ unsigned int start_index = (flt_len + fir->index + fir->counter + 1 - nb_coefs) & mask;
1396
+ unsigned int startsimd = start_index >> 2;
1397
+ unsigned int i0, j0, i1, j1;
1398
+ size_t k, l;
1399
+ register __m128 acc0r, acc0i, acc1r, acc1i;
1400
+ register __m128 prod0r, prod0i, prod1r, prod1i;
1401
+ const __m128 *coefs;
1402
+ __m128 *stater = (__m128*)fir->stater;
1403
+ __m128 *statei = (__m128*)fir->statei;
1404
+
1405
+
1406
+ if (!ocbuf)
1407
+ ocbuf = pdlc_complex_buffer_new(obuflen);
1408
+ else if (ocbuf->length != obuflen)
1409
+ pdlc_complex_buffer_resize(ocbuf, obuflen, 0);
1410
+
1411
+
1412
+ for (k = 0, l = 0; k < ibuflen; k++) {
1413
+ fir->stater[fir->index] = icbuf->data[k].real;
1414
+ fir->statei[fir->index] = icbuf->data[k].imag;
1415
+ fir->index = (fir->index + 1) & mask;
1416
+ if (fir->counter == 0) {
1417
+ coefs = (__m128*)fir->coefs[start_index & 3];
1418
+ acc0r = _mm_setzero_ps();
1419
+ acc0i = _mm_setzero_ps();
1420
+ acc1r = _mm_setzero_ps();
1421
+ acc1i = _mm_setzero_ps();
1422
+ j0 = startsimd;
1423
+ j1 = (startsimd+1) & masksimd;
1424
+ i0 = 0;
1425
+ i1 = 1;
1426
+ while (i1 < lensimd) {
1427
+ prod0r = _mm_mul_ps(coefs[i0], stater[j0]);
1428
+ acc0r = _mm_add_ps(acc0r, prod0r);
1429
+ prod0i = _mm_mul_ps(coefs[i0], statei[j0]);
1430
+ acc0i = _mm_add_ps(acc0i, prod0i);
1431
+ prod1r = _mm_mul_ps(coefs[i1], stater[j1]);
1432
+ acc1r = _mm_add_ps(acc1r, prod1r);
1433
+ prod1i = _mm_mul_ps(coefs[i1], statei[j1]);
1434
+ acc1i = _mm_add_ps(acc1i, prod1i);
1435
+ i0 += 2;
1436
+ i1 += 2;
1437
+ j0 = (j0+2) & masksimd;
1438
+ j1 = (j1+2) & masksimd;
1439
+ }
1440
+ while (i0 < lensimd) {
1441
+ prod0r = _mm_mul_ps(coefs[i0], stater[j0]);
1442
+ acc0r = _mm_add_ps(acc0r, prod0r);
1443
+ prod0i = _mm_mul_ps(coefs[i0], statei[j0]);
1444
+ acc0i = _mm_add_ps(acc0i, prod0i);
1445
+ i0 += 2;
1446
+ j0 = (j0+2) & masksimd;
1447
+ }
1448
+ acc0r = _mm_add_ps(acc0r, acc1r);
1449
+ acc0i = _mm_add_ps(acc0i, acc1i);
1450
+ ocbuf->data[l].real = acc0r[0] + acc0r[1] + acc0r[2] + acc0r[3];
1451
+ ocbuf->data[l].imag = acc0i[0] + acc0i[1] + acc0i[2] + acc0i[3];
1452
+ l++;
1453
+ start_index = (start_index + mcounter) & mask;
1454
+ startsimd = start_index >> 2;
1455
+ }
1456
+ fir->counter = (fir->counter + 1) % mcounter;
1457
+ }
1458
+
1459
+ return ocbuf;
1460
+ }
1461
+
1462
+
1463
+ pdlc_complex_buffer_t* pdlc_fir_filter_transform(pdlc_fir_filter_t* fir, const pdlc_buffer_t *ifbuf, pdlc_complex_buffer_t *ocbuf)
1464
+ {
1465
+ const unsigned int nb_coefs = fir->nb_coefs;
1466
+ const unsigned int flt_len = fir->state_len;
1467
+ const unsigned int mask = fir->index_mask;
1468
+ const unsigned int lensimd = fir->coef_len >> 2;
1469
+ const unsigned int masksimd = mask >> 2;
1470
+ const size_t ibuflen = ifbuf->length;
1471
+ unsigned int start_index = (flt_len + fir->index + 1 - nb_coefs) & mask;
1472
+ unsigned int startsimd = start_index >> 2;
1473
+ unsigned int middle_index = (start_index + nb_coefs / 2) & mask;
1474
+ unsigned int i, j;
1475
+ size_t k = 0;
1476
+ register __m128 acc0, acc1, acc2, acc3;
1477
+ register __m128 prod0, prod1, prod2, prod3;
1478
+ register __m128 statereal;
1479
+ const __m128 *coefs0, *coefs1, *coefs2, *coefs3;
1480
+ __m128 *stater = (__m128*)fir->stater;
1481
+
1482
+
1483
+ if (!ocbuf)
1484
+ ocbuf = pdlc_complex_buffer_new(ifbuf->length);
1485
+ else if (ocbuf->length != ifbuf->length)
1486
+ pdlc_complex_buffer_resize(ocbuf, ifbuf->length, 0);
1487
+
1488
+
1489
+ if (nb_coefs & 1) {
1490
+ while ((start_index & 3) && k < ibuflen) {
1491
+ fir->stater[fir->index] = ifbuf->data[k];
1492
+ fir->index = (fir->index + 1) & mask;
1493
+ coefs0 = (__m128*)fir->coefs[start_index & 3];
1494
+ acc0 = _mm_setzero_ps();
1495
+ j = startsimd;
1496
+ for (i = 0; i < lensimd; i++) {
1497
+ prod0 = _mm_mul_ps(coefs0[i], stater[j]);
1498
+ acc0 = _mm_add_ps(acc0, prod0);
1499
+ j = (j+1) & masksimd;
1500
+ }
1501
+ ocbuf->data[k].imag = acc0[0] + acc0[1] + acc0[2] + acc0[3];
1502
+ start_index = (start_index + 1) & mask;
1503
+ startsimd = start_index >> 2;
1504
+ ocbuf->data[k].real = fir->stater[middle_index];
1505
+ middle_index = (middle_index + 1) & mask;
1506
+ k++;
1507
+ }
1508
+ while (k + 4 <= ibuflen) {
1509
+ fir->stater[ fir->index ] = ifbuf->data[k + 0];
1510
+ fir->stater[(fir->index + 1) & mask] = ifbuf->data[k + 1];
1511
+ fir->stater[(fir->index + 2) & mask] = ifbuf->data[k + 2];
1512
+ fir->stater[(fir->index + 3) & mask] = ifbuf->data[k + 3];
1513
+ fir->index = (fir->index + 4) & mask;
1514
+ coefs0 = (__m128*)fir->coefs[(start_index + 0) & 3];
1515
+ coefs1 = (__m128*)fir->coefs[(start_index + 1) & 3];
1516
+ coefs2 = (__m128*)fir->coefs[(start_index + 2) & 3];
1517
+ coefs3 = (__m128*)fir->coefs[(start_index + 3) & 3];
1518
+ acc0 = _mm_setzero_ps();
1519
+ acc1 = _mm_setzero_ps();
1520
+ acc2 = _mm_setzero_ps();
1521
+ acc3 = _mm_setzero_ps();
1522
+ j = startsimd;
1523
+ for (i = 0; i < lensimd; i++) {
1524
+ statereal = stater[j];
1525
+ prod0 = _mm_mul_ps(coefs0[i], statereal);
1526
+ acc0 = _mm_add_ps(acc0, prod0);
1527
+ prod1 = _mm_mul_ps(coefs1[i], statereal);
1528
+ acc1 = _mm_add_ps(acc1, prod1);
1529
+ prod2 = _mm_mul_ps(coefs2[i], statereal);
1530
+ acc2 = _mm_add_ps(acc2, prod2);
1531
+ prod3 = _mm_mul_ps(coefs3[i], statereal);
1532
+ acc3 = _mm_add_ps(acc3, prod3);
1533
+ j = (j+1) & masksimd;
1534
+ }
1535
+ #ifdef __SSE3__
1536
+ acc0 = _mm_hadd_ps(acc0, acc1);
1537
+ acc2 = _mm_hadd_ps(acc2, acc3);
1538
+ acc0 = _mm_hadd_ps(acc0, acc2);
1539
+ ocbuf->data[k+0].imag = acc0[0];
1540
+ ocbuf->data[k+1].imag = acc0[1];
1541
+ ocbuf->data[k+2].imag = acc0[2];
1542
+ ocbuf->data[k+3].imag = acc0[3];
1543
+ #else
1544
+ ocbuf->data[k+0].imag = acc0[0] + acc0[1] + acc0[2] + acc0[3];
1545
+ ocbuf->data[k+1].imag = acc1[0] + acc1[1] + acc1[2] + acc1[3];
1546
+ ocbuf->data[k+2].imag = acc2[0] + acc2[1] + acc2[2] + acc2[3];
1547
+ ocbuf->data[k+3].imag = acc3[0] + acc3[1] + acc3[2] + acc3[3];
1548
+ #endif
1549
+ start_index = (start_index + 4) & mask;
1550
+ startsimd = start_index >> 2;
1551
+ ocbuf->data[k].real = fir->stater[middle_index];
1552
+ middle_index = (middle_index + 1) & mask;
1553
+ k++;
1554
+ ocbuf->data[k].real = fir->stater[middle_index];
1555
+ middle_index = (middle_index + 1) & mask;
1556
+ k++;
1557
+ ocbuf->data[k].real = fir->stater[middle_index];
1558
+ middle_index = (middle_index + 1) & mask;
1559
+ k++;
1560
+ ocbuf->data[k].real = fir->stater[middle_index];
1561
+ middle_index = (middle_index + 1) & mask;
1562
+ k++;
1563
+ }
1564
+ for (; k < ibuflen; k++) {
1565
+ fir->stater[fir->index] = ifbuf->data[k];
1566
+ fir->index = (fir->index + 1) & mask;
1567
+ coefs0 = (__m128*)fir->coefs[start_index & 3];
1568
+ acc0 = _mm_setzero_ps();
1569
+ j = startsimd;
1570
+ for (i = 0; i < lensimd; i++) {
1571
+ prod0 = _mm_mul_ps(coefs0[i], stater[j]);
1572
+ acc0 = _mm_add_ps(acc0, prod0);
1573
+ j = (j+1) & masksimd;
1574
+ }
1575
+ ocbuf->data[k].imag = acc0[0] + acc0[1] + acc0[2] + acc0[3];
1576
+ start_index = (start_index + 1) & mask;
1577
+ startsimd = start_index >> 2;
1578
+ ocbuf->data[k].real = fir->stater[middle_index];
1579
+ middle_index = (middle_index + 1) & mask;
1580
+ }
1581
+ }
1582
+ else {
1583
+ while ((start_index & 3) && k < ibuflen) {
1584
+ fir->stater[fir->index] = ifbuf->data[k];
1585
+ fir->index = (fir->index + 1) & mask;
1586
+ coefs0 = (__m128*)fir->coefs[start_index & 3];
1587
+ acc0 = _mm_setzero_ps();
1588
+ j = startsimd;
1589
+ for (i = 0; i < lensimd; i++) {
1590
+ prod0 = _mm_mul_ps(coefs0[i], stater[j]);
1591
+ acc0 = _mm_add_ps(acc0, prod0);
1592
+ j = (j+1) & masksimd;
1593
+ }
1594
+ ocbuf->data[k].imag = acc0[0] + acc0[1] + acc0[2] + acc0[3];
1595
+ start_index = (start_index + 1) & mask;
1596
+ startsimd = start_index >> 2;
1597
+ ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1598
+ middle_index = (middle_index + 1) & mask;
1599
+ k++;
1600
+ }
1601
+ while (k + 4 <= ibuflen) {
1602
+ fir->stater[ fir->index ] = ifbuf->data[k + 0];
1603
+ fir->stater[(fir->index + 1) & mask] = ifbuf->data[k + 1];
1604
+ fir->stater[(fir->index + 2) & mask] = ifbuf->data[k + 2];
1605
+ fir->stater[(fir->index + 3) & mask] = ifbuf->data[k + 3];
1606
+ fir->index = (fir->index + 4) & mask;
1607
+ coefs0 = (__m128*)fir->coefs[(start_index + 0) & 3];
1608
+ coefs1 = (__m128*)fir->coefs[(start_index + 1) & 3];
1609
+ coefs2 = (__m128*)fir->coefs[(start_index + 2) & 3];
1610
+ coefs3 = (__m128*)fir->coefs[(start_index + 3) & 3];
1611
+ acc0 = _mm_setzero_ps();
1612
+ acc1 = _mm_setzero_ps();
1613
+ acc2 = _mm_setzero_ps();
1614
+ acc3 = _mm_setzero_ps();
1615
+ j = startsimd;
1616
+ for (i = 0; i < lensimd; i++) {
1617
+ statereal = stater[j];
1618
+ prod0 = _mm_mul_ps(coefs0[i], statereal);
1619
+ acc0 = _mm_add_ps(acc0, prod0);
1620
+ prod1 = _mm_mul_ps(coefs1[i], statereal);
1621
+ acc1 = _mm_add_ps(acc1, prod1);
1622
+ prod2 = _mm_mul_ps(coefs2[i], statereal);
1623
+ acc2 = _mm_add_ps(acc2, prod2);
1624
+ prod3 = _mm_mul_ps(coefs3[i], statereal);
1625
+ acc3 = _mm_add_ps(acc3, prod3);
1626
+ j = (j+1) & masksimd;
1627
+ }
1628
+ #ifdef __SSE3__
1629
+ acc0 = _mm_hadd_ps(acc0, acc1);
1630
+ acc2 = _mm_hadd_ps(acc2, acc3);
1631
+ acc0 = _mm_hadd_ps(acc0, acc2);
1632
+ ocbuf->data[k+0].imag = acc0[0];
1633
+ ocbuf->data[k+1].imag = acc0[1];
1634
+ ocbuf->data[k+2].imag = acc0[2];
1635
+ ocbuf->data[k+3].imag = acc0[3];
1636
+ #else
1637
+ ocbuf->data[k+0].imag = acc0[0] + acc0[1] + acc0[2] + acc0[3];
1638
+ ocbuf->data[k+1].imag = acc1[0] + acc1[1] + acc1[2] + acc1[3];
1639
+ ocbuf->data[k+2].imag = acc2[0] + acc2[1] + acc2[2] + acc2[3];
1640
+ ocbuf->data[k+3].imag = acc3[0] + acc3[1] + acc3[2] + acc3[3];
1641
+ #endif
1642
+ start_index = (start_index + 4) & mask;
1643
+ startsimd = start_index >> 2;
1644
+ ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1645
+ middle_index = (middle_index + 1) & mask;
1646
+ k++;
1647
+ ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1648
+ middle_index = (middle_index + 1) & mask;
1649
+ k++;
1650
+ ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1651
+ middle_index = (middle_index + 1) & mask;
1652
+ k++;
1653
+ ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1654
+ middle_index = (middle_index + 1) & mask;
1655
+ k++;
1656
+ }
1657
+ for (; k < ibuflen; k++) {
1658
+ fir->stater[fir->index] = ifbuf->data[k];
1659
+ fir->index = (fir->index + 1) & mask;
1660
+ coefs0 = (__m128*)fir->coefs[start_index & 3];
1661
+ acc0 = _mm_setzero_ps();
1662
+ j = startsimd;
1663
+ for (i = 0; i < lensimd; i++) {
1664
+ prod0 = _mm_mul_ps(coefs0[i], stater[j]);
1665
+ acc0 = _mm_add_ps(acc0, prod0);
1666
+ j = (j+1) & masksimd;
1667
+ }
1668
+ ocbuf->data[k].imag = acc0[0] + acc0[1] + acc0[2] + acc0[3];
1669
+ start_index = (start_index + 1) & mask;
1670
+ startsimd = start_index >> 2;
1671
+ ocbuf->data[k].real = (fir->stater[middle_index] + fir->stater[(middle_index - 1) & mask]) / 2.0f;
1672
+ middle_index = (middle_index + 1) & mask;
1673
+ }
1674
+ }
1675
+
1676
+ return ocbuf;
1677
+ }