dspx 1.1.4 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +31 -0
- package/dist/bindings.d.ts +62 -25
- package/dist/bindings.d.ts.map +1 -1
- package/dist/bindings.js +96 -26
- package/dist/bindings.js.map +1 -1
- package/dist/types.d.ts +14 -0
- package/dist/types.d.ts.map +1 -1
- package/package.json +1 -1
- package/prebuilds/win32-x64/dspx.node +0 -0
- package/src/native/adapters/ConvolutionStage.h +254 -183
- package/src/native/adapters/FftStage.cc +245 -121
- package/src/native/core/Fftpack.cc +59 -17
- package/src/native/core/Fftpack.h +3 -2
|
@@ -158,6 +158,9 @@ namespace dsp
|
|
|
158
158
|
|
|
159
159
|
outputSize = numFrames * outputSizePerFrame * numChannels;
|
|
160
160
|
|
|
161
|
+
// OPTIMIZATION: Process all frames and channels with minimal memory operations
|
|
162
|
+
// Use pre-allocated member buffers to avoid repeated allocations
|
|
163
|
+
|
|
161
164
|
for (size_t frame = 0; frame < numFrames; ++frame)
|
|
162
165
|
{
|
|
163
166
|
for (size_t ch = 0; ch < static_cast<size_t>(numChannels); ++ch)
|
|
@@ -165,43 +168,134 @@ namespace dsp
|
|
|
165
168
|
const float *frameInput = inputBuffer + (frame * inputSizePerFrame * numChannels) + ch;
|
|
166
169
|
float *frameOutput = outputBuffer + (frame * outputSizePerFrame * numChannels) + ch;
|
|
167
170
|
|
|
168
|
-
//
|
|
171
|
+
// OPTIMIZATION 1: Minimize conditional branches - process by transform type
|
|
172
|
+
// OPTIMIZATION 2: Use member buffers (already allocated in constructor)
|
|
173
|
+
|
|
174
|
+
// Load input data (OPTIMIZED: Loop unrolling + better ILP)
|
|
169
175
|
if (isInverseComplex)
|
|
170
176
|
{
|
|
171
|
-
//
|
|
172
|
-
|
|
177
|
+
// Complex input: deinterleave directly into complex buffer
|
|
178
|
+
// OPTIMIZATION: Unroll by 4 for better instruction-level parallelism
|
|
179
|
+
size_t i = 0;
|
|
180
|
+
const size_t stride = numChannels;
|
|
181
|
+
const size_t stride2 = 2 * stride;
|
|
182
|
+
|
|
183
|
+
// Process 4 complex numbers at a time
|
|
184
|
+
for (; i + 3 < m_fftSize; i += 4)
|
|
185
|
+
{
|
|
186
|
+
size_t idx0 = (i * 2) * stride;
|
|
187
|
+
size_t idx1 = ((i + 1) * 2) * stride;
|
|
188
|
+
size_t idx2 = ((i + 2) * 2) * stride;
|
|
189
|
+
size_t idx3 = ((i + 3) * 2) * stride;
|
|
190
|
+
|
|
191
|
+
m_complexBuffer[i] = std::complex<float>(frameInput[idx0], frameInput[idx0 + stride]);
|
|
192
|
+
m_complexBuffer[i + 1] = std::complex<float>(frameInput[idx1], frameInput[idx1 + stride]);
|
|
193
|
+
m_complexBuffer[i + 2] = std::complex<float>(frameInput[idx2], frameInput[idx2 + stride]);
|
|
194
|
+
m_complexBuffer[i + 3] = std::complex<float>(frameInput[idx3], frameInput[idx3 + stride]);
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
// Handle remainder
|
|
198
|
+
for (; i < m_fftSize; ++i)
|
|
173
199
|
{
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
frameInput[(i * 2 + 1) * numChannels]); // imag part
|
|
200
|
+
size_t idx = (i * 2) * stride;
|
|
201
|
+
m_complexBuffer[i] = std::complex<float>(frameInput[idx], frameInput[idx + stride]);
|
|
177
202
|
}
|
|
178
203
|
}
|
|
179
204
|
else if (isInverseReal)
|
|
180
205
|
{
|
|
181
|
-
//
|
|
206
|
+
// Half-spectrum complex input: deinterleave with loop unrolling
|
|
182
207
|
size_t halfSize = m_engine->getHalfSize();
|
|
183
|
-
|
|
208
|
+
size_t i = 0;
|
|
209
|
+
const size_t stride = numChannels;
|
|
210
|
+
|
|
211
|
+
// Unroll by 4
|
|
212
|
+
for (; i + 3 < halfSize; i += 4)
|
|
184
213
|
{
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
214
|
+
size_t idx0 = (i * 2) * stride;
|
|
215
|
+
size_t idx1 = ((i + 1) * 2) * stride;
|
|
216
|
+
size_t idx2 = ((i + 2) * 2) * stride;
|
|
217
|
+
size_t idx3 = ((i + 3) * 2) * stride;
|
|
218
|
+
|
|
219
|
+
m_complexBuffer[i] = std::complex<float>(frameInput[idx0], frameInput[idx0 + stride]);
|
|
220
|
+
m_complexBuffer[i + 1] = std::complex<float>(frameInput[idx1], frameInput[idx1 + stride]);
|
|
221
|
+
m_complexBuffer[i + 2] = std::complex<float>(frameInput[idx2], frameInput[idx2 + stride]);
|
|
222
|
+
m_complexBuffer[i + 3] = std::complex<float>(frameInput[idx3], frameInput[idx3 + stride]);
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
// Handle remainder
|
|
226
|
+
for (; i < halfSize; ++i)
|
|
227
|
+
{
|
|
228
|
+
size_t idx = (i * 2) * stride;
|
|
229
|
+
m_complexBuffer[i] = std::complex<float>(frameInput[idx], frameInput[idx + stride]);
|
|
188
230
|
}
|
|
189
231
|
}
|
|
190
232
|
else
|
|
191
233
|
{
|
|
192
|
-
//
|
|
193
|
-
|
|
234
|
+
// Real input: deinterleave with loop unrolling
|
|
235
|
+
// OPTIMIZATION: Special case for numChannels == 1 (no deinterleaving needed)
|
|
236
|
+
if (numChannels == 1)
|
|
237
|
+
{
|
|
238
|
+
// Direct memcpy when no deinterleaving needed
|
|
239
|
+
std::memcpy(m_realBuffer.data(), frameInput, m_fftSize * sizeof(float));
|
|
240
|
+
}
|
|
241
|
+
else
|
|
194
242
|
{
|
|
195
|
-
|
|
243
|
+
// Deinterleave with unrolling by 8 for better ILP
|
|
244
|
+
size_t i = 0;
|
|
245
|
+
const size_t stride = numChannels;
|
|
246
|
+
|
|
247
|
+
for (; i + 7 < m_fftSize; i += 8)
|
|
248
|
+
{
|
|
249
|
+
m_realBuffer[i] = frameInput[i * stride];
|
|
250
|
+
m_realBuffer[i + 1] = frameInput[(i + 1) * stride];
|
|
251
|
+
m_realBuffer[i + 2] = frameInput[(i + 2) * stride];
|
|
252
|
+
m_realBuffer[i + 3] = frameInput[(i + 3) * stride];
|
|
253
|
+
m_realBuffer[i + 4] = frameInput[(i + 4) * stride];
|
|
254
|
+
m_realBuffer[i + 5] = frameInput[(i + 5) * stride];
|
|
255
|
+
m_realBuffer[i + 6] = frameInput[(i + 6) * stride];
|
|
256
|
+
m_realBuffer[i + 7] = frameInput[(i + 7) * stride];
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
// Handle remainder
|
|
260
|
+
for (; i < m_fftSize; ++i)
|
|
261
|
+
{
|
|
262
|
+
m_realBuffer[i] = frameInput[i * stride];
|
|
263
|
+
}
|
|
196
264
|
}
|
|
197
265
|
}
|
|
198
266
|
|
|
199
|
-
// Perform transform
|
|
200
|
-
|
|
267
|
+
// Perform transform (OPTIMIZED: Reduced switch cases, grouped by category)
|
|
268
|
+
if (isInverseComplex)
|
|
201
269
|
{
|
|
202
|
-
|
|
270
|
+
// All complex inverse transforms
|
|
271
|
+
if (m_type == TransformType::IFFT || (m_type == TransformType::FFT && !m_forward))
|
|
272
|
+
{
|
|
273
|
+
m_engine->ifft(m_complexBuffer.data(), m_complexBuffer.data());
|
|
274
|
+
}
|
|
275
|
+
else // IDFT or (DFT && !forward)
|
|
276
|
+
{
|
|
277
|
+
m_engine->idft(m_complexBuffer.data(), m_tempComplexBuffer.data());
|
|
278
|
+
// OPTIMIZATION: Use memcpy instead of std::copy for POD types
|
|
279
|
+
std::memcpy(m_complexBuffer.data(), m_tempComplexBuffer.data(),
|
|
280
|
+
m_fftSize * sizeof(std::complex<float>));
|
|
281
|
+
}
|
|
282
|
+
}
|
|
283
|
+
else if (isInverseReal)
|
|
284
|
+
{
|
|
285
|
+
// All real inverse transforms
|
|
286
|
+
if (m_type == TransformType::IRFFT || (m_type == TransformType::RFFT && !m_forward))
|
|
287
|
+
{
|
|
288
|
+
m_engine->irfft(m_complexBuffer.data(), m_realBuffer.data());
|
|
289
|
+
}
|
|
290
|
+
else // IRDFT or (RDFT && !forward)
|
|
291
|
+
{
|
|
292
|
+
m_engine->irdft(m_complexBuffer.data(), m_realBuffer.data());
|
|
293
|
+
}
|
|
294
|
+
}
|
|
295
|
+
else
|
|
203
296
|
{
|
|
204
|
-
|
|
297
|
+
// Forward transforms
|
|
298
|
+
if (m_type == TransformType::FFT)
|
|
205
299
|
{
|
|
206
300
|
// Forward FFT: real to complex
|
|
207
301
|
for (size_t i = 0; i < m_fftSize; ++i)
|
|
@@ -210,22 +304,7 @@ namespace dsp
|
|
|
210
304
|
}
|
|
211
305
|
m_engine->fft(m_complexBuffer.data(), m_complexBuffer.data());
|
|
212
306
|
}
|
|
213
|
-
else
|
|
214
|
-
{
|
|
215
|
-
// Inverse FFT: complex input already loaded
|
|
216
|
-
m_engine->ifft(m_complexBuffer.data(), m_complexBuffer.data());
|
|
217
|
-
}
|
|
218
|
-
break;
|
|
219
|
-
}
|
|
220
|
-
case TransformType::IFFT:
|
|
221
|
-
{
|
|
222
|
-
// Complex input already loaded into m_complexBuffer
|
|
223
|
-
m_engine->ifft(m_complexBuffer.data(), m_complexBuffer.data());
|
|
224
|
-
break;
|
|
225
|
-
}
|
|
226
|
-
case TransformType::DFT:
|
|
227
|
-
{
|
|
228
|
-
if (m_forward)
|
|
307
|
+
else if (m_type == TransformType::DFT)
|
|
229
308
|
{
|
|
230
309
|
// Forward DFT: real to complex
|
|
231
310
|
for (size_t i = 0; i < m_fftSize; ++i)
|
|
@@ -233,138 +312,183 @@ namespace dsp
|
|
|
233
312
|
m_complexBuffer[i] = std::complex<float>(m_realBuffer[i], 0.0f);
|
|
234
313
|
}
|
|
235
314
|
m_engine->dft(m_complexBuffer.data(), m_tempComplexBuffer.data());
|
|
236
|
-
std::copy
|
|
315
|
+
// OPTIMIZATION: Use memcpy instead of std::copy
|
|
316
|
+
std::memcpy(m_complexBuffer.data(), m_tempComplexBuffer.data(),
|
|
317
|
+
m_fftSize * sizeof(std::complex<float>));
|
|
237
318
|
}
|
|
238
|
-
else
|
|
239
|
-
{
|
|
240
|
-
// Inverse DFT: complex input already loaded
|
|
241
|
-
m_engine->idft(m_complexBuffer.data(), m_tempComplexBuffer.data());
|
|
242
|
-
std::copy(m_tempComplexBuffer.begin(), m_tempComplexBuffer.end(), m_complexBuffer.begin());
|
|
243
|
-
}
|
|
244
|
-
break;
|
|
245
|
-
}
|
|
246
|
-
case TransformType::IDFT:
|
|
247
|
-
{
|
|
248
|
-
// Complex input already loaded into m_complexBuffer
|
|
249
|
-
m_engine->idft(m_complexBuffer.data(), m_tempComplexBuffer.data());
|
|
250
|
-
std::copy(m_tempComplexBuffer.begin(), m_tempComplexBuffer.end(), m_complexBuffer.begin());
|
|
251
|
-
break;
|
|
252
|
-
}
|
|
253
|
-
case TransformType::RFFT:
|
|
254
|
-
{
|
|
255
|
-
if (m_forward)
|
|
319
|
+
else if (m_type == TransformType::RFFT)
|
|
256
320
|
{
|
|
257
321
|
m_engine->rfft(m_realBuffer.data(), m_complexBuffer.data());
|
|
258
322
|
}
|
|
259
|
-
else
|
|
260
|
-
{
|
|
261
|
-
// Inverse RFFT: half-spectrum complex input already loaded
|
|
262
|
-
m_engine->irfft(m_complexBuffer.data(), m_realBuffer.data());
|
|
263
|
-
}
|
|
264
|
-
break;
|
|
265
|
-
}
|
|
266
|
-
case TransformType::IRFFT:
|
|
267
|
-
{
|
|
268
|
-
// Half-spectrum complex input already loaded into m_complexBuffer
|
|
269
|
-
m_engine->irfft(m_complexBuffer.data(), m_realBuffer.data());
|
|
270
|
-
break;
|
|
271
|
-
}
|
|
272
|
-
case TransformType::RDFT:
|
|
273
|
-
{
|
|
274
|
-
if (m_forward)
|
|
323
|
+
else // RDFT
|
|
275
324
|
{
|
|
276
325
|
m_engine->rdft(m_realBuffer.data(), m_complexBuffer.data());
|
|
277
326
|
}
|
|
278
|
-
else
|
|
279
|
-
{
|
|
280
|
-
// Inverse RDFT: half-spectrum complex input already loaded
|
|
281
|
-
m_engine->irdft(m_complexBuffer.data(), m_realBuffer.data());
|
|
282
|
-
}
|
|
283
|
-
break;
|
|
284
|
-
}
|
|
285
|
-
case TransformType::IRDFT:
|
|
286
|
-
{
|
|
287
|
-
// Half-spectrum complex input already loaded into m_complexBuffer
|
|
288
|
-
m_engine->irdft(m_complexBuffer.data(), m_realBuffer.data());
|
|
289
|
-
break;
|
|
290
|
-
}
|
|
291
327
|
}
|
|
292
328
|
|
|
293
|
-
//
|
|
329
|
+
// Write output (OPTIMIZED: Loop unrolling for interleaving)
|
|
294
330
|
if (isInverseReal || isInverseComplex)
|
|
295
331
|
{
|
|
296
332
|
// ALL inverse transforms output real time-domain values
|
|
297
|
-
|
|
333
|
+
const float *sourceData = isInverseReal ? m_realBuffer.data() : nullptr;
|
|
334
|
+
|
|
335
|
+
// OPTIMIZATION: Special case for single channel (no interleaving)
|
|
336
|
+
if (numChannels == 1)
|
|
298
337
|
{
|
|
299
|
-
|
|
300
|
-
for (size_t i = 0; i < m_fftSize; ++i)
|
|
338
|
+
if (isInverseReal)
|
|
301
339
|
{
|
|
302
|
-
frameOutput
|
|
340
|
+
std::memcpy(frameOutput, m_realBuffer.data(), m_fftSize * sizeof(float));
|
|
341
|
+
}
|
|
342
|
+
else
|
|
343
|
+
{
|
|
344
|
+
// Extract real parts - unroll by 8
|
|
345
|
+
size_t i = 0;
|
|
346
|
+
for (; i + 7 < m_fftSize; i += 8)
|
|
347
|
+
{
|
|
348
|
+
frameOutput[i] = m_complexBuffer[i].real();
|
|
349
|
+
frameOutput[i + 1] = m_complexBuffer[i + 1].real();
|
|
350
|
+
frameOutput[i + 2] = m_complexBuffer[i + 2].real();
|
|
351
|
+
frameOutput[i + 3] = m_complexBuffer[i + 3].real();
|
|
352
|
+
frameOutput[i + 4] = m_complexBuffer[i + 4].real();
|
|
353
|
+
frameOutput[i + 5] = m_complexBuffer[i + 5].real();
|
|
354
|
+
frameOutput[i + 6] = m_complexBuffer[i + 6].real();
|
|
355
|
+
frameOutput[i + 7] = m_complexBuffer[i + 7].real();
|
|
356
|
+
}
|
|
357
|
+
for (; i < m_fftSize; ++i)
|
|
358
|
+
{
|
|
359
|
+
frameOutput[i] = m_complexBuffer[i].real();
|
|
360
|
+
}
|
|
303
361
|
}
|
|
304
362
|
}
|
|
305
363
|
else
|
|
306
364
|
{
|
|
307
|
-
//
|
|
308
|
-
|
|
365
|
+
// Interleave with loop unrolling by 4
|
|
366
|
+
size_t i = 0;
|
|
367
|
+
const size_t stride = numChannels;
|
|
368
|
+
|
|
369
|
+
for (; i + 3 < m_fftSize; i += 4)
|
|
309
370
|
{
|
|
310
|
-
|
|
371
|
+
if (isInverseReal)
|
|
372
|
+
{
|
|
373
|
+
frameOutput[i * stride] = m_realBuffer[i];
|
|
374
|
+
frameOutput[(i + 1) * stride] = m_realBuffer[i + 1];
|
|
375
|
+
frameOutput[(i + 2) * stride] = m_realBuffer[i + 2];
|
|
376
|
+
frameOutput[(i + 3) * stride] = m_realBuffer[i + 3];
|
|
377
|
+
}
|
|
378
|
+
else
|
|
379
|
+
{
|
|
380
|
+
frameOutput[i * stride] = m_complexBuffer[i].real();
|
|
381
|
+
frameOutput[(i + 1) * stride] = m_complexBuffer[i + 1].real();
|
|
382
|
+
frameOutput[(i + 2) * stride] = m_complexBuffer[i + 2].real();
|
|
383
|
+
frameOutput[(i + 3) * stride] = m_complexBuffer[i + 3].real();
|
|
384
|
+
}
|
|
385
|
+
}
|
|
386
|
+
|
|
387
|
+
// Handle remainder
|
|
388
|
+
for (; i < m_fftSize; ++i)
|
|
389
|
+
{
|
|
390
|
+
frameOutput[i * stride] = isInverseReal
|
|
391
|
+
? m_realBuffer[i]
|
|
392
|
+
: m_complexBuffer[i].real();
|
|
311
393
|
}
|
|
312
394
|
}
|
|
313
395
|
}
|
|
314
396
|
else
|
|
315
397
|
{
|
|
316
|
-
// Forward transforms
|
|
398
|
+
// Forward transforms - write output based on format
|
|
317
399
|
switch (m_format)
|
|
318
400
|
{
|
|
319
401
|
case OutputFormat::COMPLEX:
|
|
320
402
|
{
|
|
321
|
-
//
|
|
403
|
+
// OPTIMIZATION: Calculate numBins once, interleave with loop unrolling
|
|
322
404
|
size_t numBins = (m_type == TransformType::RFFT || m_type == TransformType::RDFT)
|
|
323
405
|
? m_engine->getHalfSize()
|
|
324
406
|
: m_fftSize;
|
|
325
|
-
|
|
407
|
+
|
|
408
|
+
// OPTIMIZATION: Unroll complex interleaving by 4
|
|
409
|
+
size_t i = 0;
|
|
410
|
+
const size_t stride = numChannels;
|
|
411
|
+
|
|
412
|
+
for (; i + 3 < numBins; i += 4)
|
|
326
413
|
{
|
|
327
|
-
|
|
328
|
-
|
|
414
|
+
size_t outIdx0 = (i * 2) * stride;
|
|
415
|
+
size_t outIdx1 = ((i + 1) * 2) * stride;
|
|
416
|
+
size_t outIdx2 = ((i + 2) * 2) * stride;
|
|
417
|
+
size_t outIdx3 = ((i + 3) * 2) * stride;
|
|
418
|
+
|
|
419
|
+
frameOutput[outIdx0] = m_complexBuffer[i].real();
|
|
420
|
+
frameOutput[outIdx0 + stride] = m_complexBuffer[i].imag();
|
|
421
|
+
frameOutput[outIdx1] = m_complexBuffer[i + 1].real();
|
|
422
|
+
frameOutput[outIdx1 + stride] = m_complexBuffer[i + 1].imag();
|
|
423
|
+
frameOutput[outIdx2] = m_complexBuffer[i + 2].real();
|
|
424
|
+
frameOutput[outIdx2 + stride] = m_complexBuffer[i + 2].imag();
|
|
425
|
+
frameOutput[outIdx3] = m_complexBuffer[i + 3].real();
|
|
426
|
+
frameOutput[outIdx3 + stride] = m_complexBuffer[i + 3].imag();
|
|
329
427
|
}
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
{
|
|
334
|
-
size_t numBins = (m_type == TransformType::RFFT || m_type == TransformType::RDFT)
|
|
335
|
-
? m_engine->getHalfSize()
|
|
336
|
-
: m_fftSize;
|
|
337
|
-
std::vector<float> magnitudes(numBins);
|
|
338
|
-
m_engine->getMagnitude(m_complexBuffer.data(), magnitudes.data(), numBins);
|
|
339
|
-
for (size_t i = 0; i < numBins; ++i)
|
|
428
|
+
|
|
429
|
+
// Handle remainder
|
|
430
|
+
for (; i < numBins; ++i)
|
|
340
431
|
{
|
|
341
|
-
|
|
432
|
+
size_t outIdx = (i * 2) * stride;
|
|
433
|
+
frameOutput[outIdx] = m_complexBuffer[i].real();
|
|
434
|
+
frameOutput[outIdx + stride] = m_complexBuffer[i].imag();
|
|
342
435
|
}
|
|
343
436
|
break;
|
|
344
437
|
}
|
|
438
|
+
case OutputFormat::MAGNITUDE:
|
|
345
439
|
case OutputFormat::POWER:
|
|
440
|
+
case OutputFormat::PHASE:
|
|
346
441
|
{
|
|
442
|
+
// OPTIMIZATION: Reuse a pre-allocated member buffer instead of allocating a std::vector per frame
|
|
347
443
|
size_t numBins = (m_type == TransformType::RFFT || m_type == TransformType::RDFT)
|
|
348
444
|
? m_engine->getHalfSize()
|
|
349
445
|
: m_fftSize;
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
446
|
+
|
|
447
|
+
// Use member realBuffer for temporary storage (already allocated)
|
|
448
|
+
float *tempOutput = m_realBuffer.data(); // Reuse buffer
|
|
449
|
+
|
|
450
|
+
if (m_format == OutputFormat::MAGNITUDE)
|
|
353
451
|
{
|
|
354
|
-
|
|
452
|
+
m_engine->getMagnitude(m_complexBuffer.data(), tempOutput, numBins);
|
|
355
453
|
}
|
|
356
|
-
|
|
357
|
-
}
|
|
358
|
-
case OutputFormat::PHASE:
|
|
359
|
-
{
|
|
360
|
-
size_t numBins = (m_type == TransformType::RFFT || m_type == TransformType::RDFT)
|
|
361
|
-
? m_engine->getHalfSize()
|
|
362
|
-
: m_fftSize;
|
|
363
|
-
std::vector<float> phases(numBins);
|
|
364
|
-
m_engine->getPhase(m_complexBuffer.data(), phases.data(), numBins);
|
|
365
|
-
for (size_t i = 0; i < numBins; ++i)
|
|
454
|
+
else if (m_format == OutputFormat::POWER)
|
|
366
455
|
{
|
|
367
|
-
|
|
456
|
+
m_engine->getPower(m_complexBuffer.data(), tempOutput, numBins);
|
|
457
|
+
}
|
|
458
|
+
else // PHASE
|
|
459
|
+
{
|
|
460
|
+
m_engine->getPhase(m_complexBuffer.data(), tempOutput, numBins);
|
|
461
|
+
}
|
|
462
|
+
|
|
463
|
+
// Write to output with stride (OPTIMIZED: Loop unrolling + special case)
|
|
464
|
+
if (numChannels == 1)
|
|
465
|
+
{
|
|
466
|
+
// Direct memcpy for single channel
|
|
467
|
+
std::memcpy(frameOutput, tempOutput, numBins * sizeof(float));
|
|
468
|
+
}
|
|
469
|
+
else
|
|
470
|
+
{
|
|
471
|
+
// Interleave with loop unrolling by 8
|
|
472
|
+
size_t i = 0;
|
|
473
|
+
const size_t stride = numChannels;
|
|
474
|
+
|
|
475
|
+
for (; i + 7 < numBins; i += 8)
|
|
476
|
+
{
|
|
477
|
+
frameOutput[i * stride] = tempOutput[i];
|
|
478
|
+
frameOutput[(i + 1) * stride] = tempOutput[i + 1];
|
|
479
|
+
frameOutput[(i + 2) * stride] = tempOutput[i + 2];
|
|
480
|
+
frameOutput[(i + 3) * stride] = tempOutput[i + 3];
|
|
481
|
+
frameOutput[(i + 4) * stride] = tempOutput[i + 4];
|
|
482
|
+
frameOutput[(i + 5) * stride] = tempOutput[i + 5];
|
|
483
|
+
frameOutput[(i + 6) * stride] = tempOutput[i + 6];
|
|
484
|
+
frameOutput[(i + 7) * stride] = tempOutput[i + 7];
|
|
485
|
+
}
|
|
486
|
+
|
|
487
|
+
// Handle remainder
|
|
488
|
+
for (; i < numBins; ++i)
|
|
489
|
+
{
|
|
490
|
+
frameOutput[i * stride] = tempOutput[i];
|
|
491
|
+
}
|
|
368
492
|
}
|
|
369
493
|
break;
|
|
370
494
|
}
|
|
@@ -51,7 +51,7 @@ namespace dsp
|
|
|
51
51
|
// ========== Public Transform Methods ==========
|
|
52
52
|
|
|
53
53
|
template <typename T>
|
|
54
|
-
void FftpackContext<T>::rfft(const T *input, std::complex<T> *output)
|
|
54
|
+
void FftpackContext<T>::rfft(const T *__restrict input, std::complex<T> *__restrict output)
|
|
55
55
|
{
|
|
56
56
|
if (m_n == 1)
|
|
57
57
|
{
|
|
@@ -59,15 +59,15 @@ namespace dsp
|
|
|
59
59
|
return;
|
|
60
60
|
}
|
|
61
61
|
|
|
62
|
-
//
|
|
63
|
-
std::
|
|
62
|
+
// OPTIMIZATION: Use memcpy for bulk copy (faster than std::copy for POD types)
|
|
63
|
+
std::memcpy(m_workBuffer.data(), input, m_n * sizeof(T));
|
|
64
64
|
|
|
65
65
|
// Perform forward real FFT
|
|
66
66
|
drftf1(m_n, m_workBuffer.data(), m_wsave.data(), m_wsave.data() + m_n, m_ifac.data());
|
|
67
67
|
|
|
68
|
+
// OPTIMIZATION: Improved format conversion with better cache locality
|
|
68
69
|
// Convert FFTPACK halfcomplex format to standard complex format
|
|
69
70
|
// FFTPACK stores: [DC, re1, im1, re2, im2, ..., Nyquist] (interleaved pairs; the trailing Nyquist term exists only for even N)
|
|
70
|
-
// (for even N)
|
|
71
71
|
|
|
72
72
|
size_t halfSize = (m_n / 2) + 1;
|
|
73
73
|
|
|
@@ -76,17 +76,37 @@ namespace dsp
|
|
|
76
76
|
|
|
77
77
|
if (m_n % 2 == 0)
|
|
78
78
|
{
|
|
79
|
-
// Even N
|
|
80
|
-
|
|
79
|
+
// OPTIMIZATION: Even N - process in order for better cache locality
|
|
80
|
+
size_t half = m_n / 2;
|
|
81
|
+
|
|
82
|
+
// Process middle frequencies (unrolled by 2 for better ILP)
|
|
83
|
+
size_t i = 1;
|
|
84
|
+
for (; i + 1 < half; i += 2)
|
|
85
|
+
{
|
|
86
|
+
// First pair
|
|
87
|
+
output[i] = std::complex<T>(m_workBuffer[2 * i - 1], m_workBuffer[2 * i]);
|
|
88
|
+
// Second pair
|
|
89
|
+
output[i + 1] = std::complex<T>(m_workBuffer[2 * (i + 1) - 1], m_workBuffer[2 * (i + 1)]);
|
|
90
|
+
}
|
|
91
|
+
// Handle remainder
|
|
92
|
+
for (; i < half; ++i)
|
|
81
93
|
{
|
|
82
94
|
output[i] = std::complex<T>(m_workBuffer[2 * i - 1], m_workBuffer[2 * i]);
|
|
83
95
|
}
|
|
84
|
-
|
|
96
|
+
|
|
97
|
+
// Nyquist component (real)
|
|
98
|
+
output[half] = std::complex<T>(m_workBuffer[m_n - 1], 0);
|
|
85
99
|
}
|
|
86
100
|
else
|
|
87
101
|
{
|
|
88
|
-
// Odd N
|
|
89
|
-
|
|
102
|
+
// OPTIMIZATION: Odd N - unrolled loop
|
|
103
|
+
size_t i = 1;
|
|
104
|
+
for (; i + 1 < halfSize; i += 2)
|
|
105
|
+
{
|
|
106
|
+
output[i] = std::complex<T>(m_workBuffer[2 * i - 1], m_workBuffer[2 * i]);
|
|
107
|
+
output[i + 1] = std::complex<T>(m_workBuffer[2 * (i + 1) - 1], m_workBuffer[2 * (i + 1)]);
|
|
108
|
+
}
|
|
109
|
+
for (; i < halfSize; ++i)
|
|
90
110
|
{
|
|
91
111
|
output[i] = std::complex<T>(m_workBuffer[2 * i - 1], m_workBuffer[2 * i]);
|
|
92
112
|
}
|
|
@@ -94,7 +114,7 @@ namespace dsp
|
|
|
94
114
|
}
|
|
95
115
|
|
|
96
116
|
template <typename T>
|
|
97
|
-
void FftpackContext<T>::irfft(const std::complex<T> *input, T *output)
|
|
117
|
+
void FftpackContext<T>::irfft(const std::complex<T> *__restrict input, T *__restrict output)
|
|
98
118
|
{
|
|
99
119
|
if (m_n == 1)
|
|
100
120
|
{
|
|
@@ -102,24 +122,46 @@ namespace dsp
|
|
|
102
122
|
return;
|
|
103
123
|
}
|
|
104
124
|
|
|
125
|
+
// OPTIMIZATION: Improved format conversion with loop unrolling
|
|
105
126
|
// Convert standard complex format to FFTPACK halfcomplex format
|
|
106
127
|
m_workBuffer[0] = input[0].real(); // DC
|
|
107
128
|
|
|
108
129
|
if (m_n % 2 == 0)
|
|
109
130
|
{
|
|
110
|
-
// Even N
|
|
111
|
-
|
|
131
|
+
// OPTIMIZATION: Even N - unrolled conversion
|
|
132
|
+
size_t half = m_n / 2;
|
|
133
|
+
size_t i = 1;
|
|
134
|
+
|
|
135
|
+
// Unroll by 2
|
|
136
|
+
for (; i + 1 < half; i += 2)
|
|
112
137
|
{
|
|
113
138
|
m_workBuffer[2 * i - 1] = input[i].real();
|
|
114
139
|
m_workBuffer[2 * i] = input[i].imag();
|
|
140
|
+
m_workBuffer[2 * (i + 1) - 1] = input[i + 1].real();
|
|
141
|
+
m_workBuffer[2 * (i + 1)] = input[i + 1].imag();
|
|
115
142
|
}
|
|
116
|
-
|
|
143
|
+
for (; i < half; ++i)
|
|
144
|
+
{
|
|
145
|
+
m_workBuffer[2 * i - 1] = input[i].real();
|
|
146
|
+
m_workBuffer[2 * i] = input[i].imag();
|
|
147
|
+
}
|
|
148
|
+
m_workBuffer[m_n - 1] = input[half].real(); // Nyquist
|
|
117
149
|
}
|
|
118
150
|
else
|
|
119
151
|
{
|
|
120
|
-
// Odd N
|
|
152
|
+
// OPTIMIZATION: Odd N - unrolled conversion
|
|
121
153
|
size_t halfSize = (m_n / 2) + 1;
|
|
122
|
-
|
|
154
|
+
size_t i = 1;
|
|
155
|
+
|
|
156
|
+
// Unroll by 2
|
|
157
|
+
for (; i + 1 < halfSize; i += 2)
|
|
158
|
+
{
|
|
159
|
+
m_workBuffer[2 * i - 1] = input[i].real();
|
|
160
|
+
m_workBuffer[2 * i] = input[i].imag();
|
|
161
|
+
m_workBuffer[2 * (i + 1) - 1] = input[i + 1].real();
|
|
162
|
+
m_workBuffer[2 * (i + 1)] = input[i + 1].imag();
|
|
163
|
+
}
|
|
164
|
+
for (; i < halfSize; ++i)
|
|
123
165
|
{
|
|
124
166
|
m_workBuffer[2 * i - 1] = input[i].real();
|
|
125
167
|
m_workBuffer[2 * i] = input[i].imag();
|
|
@@ -129,8 +171,8 @@ namespace dsp
|
|
|
129
171
|
// Perform inverse real FFT
|
|
130
172
|
drftb1(m_n, m_workBuffer.data(), m_wsave.data(), m_wsave.data() + m_n, m_ifac.data());
|
|
131
173
|
|
|
132
|
-
//
|
|
133
|
-
std::
|
|
174
|
+
// OPTIMIZATION: Use memcpy for bulk copy
|
|
175
|
+
std::memcpy(output, m_workBuffer.data(), m_n * sizeof(T));
|
|
134
176
|
}
|
|
135
177
|
|
|
136
178
|
// ========== FFTPACK Initialization ==========
|
|
@@ -40,8 +40,9 @@ namespace dsp
|
|
|
40
40
|
explicit FftpackContext(size_t n);
|
|
41
41
|
|
|
42
42
|
// Forward/inverse real FFT
|
|
43
|
-
|
|
44
|
-
void
|
|
43
|
+
// Note: input/output buffers must not overlap (restrict semantics)
|
|
44
|
+
void rfft(const T *__restrict input, std::complex<T> *__restrict output);
|
|
45
|
+
void irfft(const std::complex<T> *__restrict input, T *__restrict output);
|
|
45
46
|
|
|
46
47
|
size_t size() const { return m_n; }
|
|
47
48
|
size_t halfSize() const { return (m_n / 2) + 1; }
|