dspx 1.1.4 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -158,6 +158,9 @@ namespace dsp
158
158
 
159
159
  outputSize = numFrames * outputSizePerFrame * numChannels;
160
160
 
161
+ // OPTIMIZATION: Process all frames and channels with minimal memory operations
162
+ // Use pre-allocated member buffers to avoid repeated allocations
163
+
161
164
  for (size_t frame = 0; frame < numFrames; ++frame)
162
165
  {
163
166
  for (size_t ch = 0; ch < static_cast<size_t>(numChannels); ++ch)
@@ -165,43 +168,134 @@ namespace dsp
165
168
  const float *frameInput = inputBuffer + (frame * inputSizePerFrame * numChannels) + ch;
166
169
  float *frameOutput = outputBuffer + (frame * outputSizePerFrame * numChannels) + ch;
167
170
 
168
- // Load input based on transform direction
171
+ // OPTIMIZATION 1: Minimize conditional branches - process by transform type
172
+ // OPTIMIZATION 2: Use member buffers (already allocated in constructor)
173
+
174
+ // Load input data (OPTIMIZED: Loop unrolling + better ILP)
169
175
  if (isInverseComplex)
170
176
  {
171
- // Input is complex interleaved: [real0, imag0, real1, imag1, ...]
172
- for (size_t i = 0; i < m_fftSize; ++i)
177
+ // Complex input: deinterleave directly into complex buffer
178
+ // OPTIMIZATION: Unroll by 4 for better instruction-level parallelism
179
+ size_t i = 0;
180
+ const size_t stride = numChannels;
181
+ const size_t stride2 = 2 * stride;
182
+
183
+ // Process 4 complex numbers at a time
184
+ for (; i + 3 < m_fftSize; i += 4)
185
+ {
186
+ size_t idx0 = (i * 2) * stride;
187
+ size_t idx1 = ((i + 1) * 2) * stride;
188
+ size_t idx2 = ((i + 2) * 2) * stride;
189
+ size_t idx3 = ((i + 3) * 2) * stride;
190
+
191
+ m_complexBuffer[i] = std::complex<float>(frameInput[idx0], frameInput[idx0 + stride]);
192
+ m_complexBuffer[i + 1] = std::complex<float>(frameInput[idx1], frameInput[idx1 + stride]);
193
+ m_complexBuffer[i + 2] = std::complex<float>(frameInput[idx2], frameInput[idx2 + stride]);
194
+ m_complexBuffer[i + 3] = std::complex<float>(frameInput[idx3], frameInput[idx3 + stride]);
195
+ }
196
+
197
+ // Handle remainder
198
+ for (; i < m_fftSize; ++i)
173
199
  {
174
- m_complexBuffer[i] = std::complex<float>(
175
- frameInput[(i * 2) * numChannels], // real part
176
- frameInput[(i * 2 + 1) * numChannels]); // imag part
200
+ size_t idx = (i * 2) * stride;
201
+ m_complexBuffer[i] = std::complex<float>(frameInput[idx], frameInput[idx + stride]);
177
202
  }
178
203
  }
179
204
  else if (isInverseReal)
180
205
  {
181
- // Input is half-spectrum complex
206
+ // Half-spectrum complex input: deinterleave with loop unrolling
182
207
  size_t halfSize = m_engine->getHalfSize();
183
- for (size_t i = 0; i < halfSize; ++i)
208
+ size_t i = 0;
209
+ const size_t stride = numChannels;
210
+
211
+ // Unroll by 4
212
+ for (; i + 3 < halfSize; i += 4)
184
213
  {
185
- m_complexBuffer[i] = std::complex<float>(
186
- frameInput[(i * 2) * numChannels],
187
- frameInput[(i * 2 + 1) * numChannels]);
214
+ size_t idx0 = (i * 2) * stride;
215
+ size_t idx1 = ((i + 1) * 2) * stride;
216
+ size_t idx2 = ((i + 2) * 2) * stride;
217
+ size_t idx3 = ((i + 3) * 2) * stride;
218
+
219
+ m_complexBuffer[i] = std::complex<float>(frameInput[idx0], frameInput[idx0 + stride]);
220
+ m_complexBuffer[i + 1] = std::complex<float>(frameInput[idx1], frameInput[idx1 + stride]);
221
+ m_complexBuffer[i + 2] = std::complex<float>(frameInput[idx2], frameInput[idx2 + stride]);
222
+ m_complexBuffer[i + 3] = std::complex<float>(frameInput[idx3], frameInput[idx3 + stride]);
223
+ }
224
+
225
+ // Handle remainder
226
+ for (; i < halfSize; ++i)
227
+ {
228
+ size_t idx = (i * 2) * stride;
229
+ m_complexBuffer[i] = std::complex<float>(frameInput[idx], frameInput[idx + stride]);
188
230
  }
189
231
  }
190
232
  else
191
233
  {
192
- // Deinterleave real channel data
193
- for (size_t i = 0; i < m_fftSize; ++i)
234
+ // Real input: deinterleave with loop unrolling
235
+ // OPTIMIZATION: Special case for numChannels == 1 (no deinterleaving needed)
236
+ if (numChannels == 1)
237
+ {
238
+ // Direct memcpy when no deinterleaving needed
239
+ std::memcpy(m_realBuffer.data(), frameInput, m_fftSize * sizeof(float));
240
+ }
241
+ else
194
242
  {
195
- m_realBuffer[i] = frameInput[i * numChannels];
243
+ // Deinterleave with unrolling by 8 for better ILP
244
+ size_t i = 0;
245
+ const size_t stride = numChannels;
246
+
247
+ for (; i + 7 < m_fftSize; i += 8)
248
+ {
249
+ m_realBuffer[i] = frameInput[i * stride];
250
+ m_realBuffer[i + 1] = frameInput[(i + 1) * stride];
251
+ m_realBuffer[i + 2] = frameInput[(i + 2) * stride];
252
+ m_realBuffer[i + 3] = frameInput[(i + 3) * stride];
253
+ m_realBuffer[i + 4] = frameInput[(i + 4) * stride];
254
+ m_realBuffer[i + 5] = frameInput[(i + 5) * stride];
255
+ m_realBuffer[i + 6] = frameInput[(i + 6) * stride];
256
+ m_realBuffer[i + 7] = frameInput[(i + 7) * stride];
257
+ }
258
+
259
+ // Handle remainder
260
+ for (; i < m_fftSize; ++i)
261
+ {
262
+ m_realBuffer[i] = frameInput[i * stride];
263
+ }
196
264
  }
197
265
  }
198
266
 
199
- // Perform transform based on type and direction
200
- switch (m_type)
267
+ // Perform transform (OPTIMIZED: Reduced switch cases, grouped by category)
268
+ if (isInverseComplex)
201
269
  {
202
- case TransformType::FFT:
270
+ // All complex inverse transforms
271
+ if (m_type == TransformType::IFFT || (m_type == TransformType::FFT && !m_forward))
272
+ {
273
+ m_engine->ifft(m_complexBuffer.data(), m_complexBuffer.data());
274
+ }
275
+ else // IDFT or (DFT && !forward)
276
+ {
277
+ m_engine->idft(m_complexBuffer.data(), m_tempComplexBuffer.data());
278
+ // OPTIMIZATION: Use memcpy instead of std::copy (std::complex<float> is trivially copyable — not strictly POD)
279
+ std::memcpy(m_complexBuffer.data(), m_tempComplexBuffer.data(),
280
+ m_fftSize * sizeof(std::complex<float>));
281
+ }
282
+ }
283
+ else if (isInverseReal)
284
+ {
285
+ // All real inverse transforms
286
+ if (m_type == TransformType::IRFFT || (m_type == TransformType::RFFT && !m_forward))
287
+ {
288
+ m_engine->irfft(m_complexBuffer.data(), m_realBuffer.data());
289
+ }
290
+ else // IRDFT or (RDFT && !forward)
291
+ {
292
+ m_engine->irdft(m_complexBuffer.data(), m_realBuffer.data());
293
+ }
294
+ }
295
+ else
203
296
  {
204
- if (m_forward)
297
+ // Forward transforms
298
+ if (m_type == TransformType::FFT)
205
299
  {
206
300
  // Forward FFT: real to complex
207
301
  for (size_t i = 0; i < m_fftSize; ++i)
@@ -210,22 +304,7 @@ namespace dsp
210
304
  }
211
305
  m_engine->fft(m_complexBuffer.data(), m_complexBuffer.data());
212
306
  }
213
- else
214
- {
215
- // Inverse FFT: complex input already loaded
216
- m_engine->ifft(m_complexBuffer.data(), m_complexBuffer.data());
217
- }
218
- break;
219
- }
220
- case TransformType::IFFT:
221
- {
222
- // Complex input already loaded into m_complexBuffer
223
- m_engine->ifft(m_complexBuffer.data(), m_complexBuffer.data());
224
- break;
225
- }
226
- case TransformType::DFT:
227
- {
228
- if (m_forward)
307
+ else if (m_type == TransformType::DFT)
229
308
  {
230
309
  // Forward DFT: real to complex
231
310
  for (size_t i = 0; i < m_fftSize; ++i)
@@ -233,138 +312,183 @@ namespace dsp
233
312
  m_complexBuffer[i] = std::complex<float>(m_realBuffer[i], 0.0f);
234
313
  }
235
314
  m_engine->dft(m_complexBuffer.data(), m_tempComplexBuffer.data());
236
- std::copy(m_tempComplexBuffer.begin(), m_tempComplexBuffer.end(), m_complexBuffer.begin());
315
+ // OPTIMIZATION: Use memcpy instead of std::copy
316
+ std::memcpy(m_complexBuffer.data(), m_tempComplexBuffer.data(),
317
+ m_fftSize * sizeof(std::complex<float>));
237
318
  }
238
- else
239
- {
240
- // Inverse DFT: complex input already loaded
241
- m_engine->idft(m_complexBuffer.data(), m_tempComplexBuffer.data());
242
- std::copy(m_tempComplexBuffer.begin(), m_tempComplexBuffer.end(), m_complexBuffer.begin());
243
- }
244
- break;
245
- }
246
- case TransformType::IDFT:
247
- {
248
- // Complex input already loaded into m_complexBuffer
249
- m_engine->idft(m_complexBuffer.data(), m_tempComplexBuffer.data());
250
- std::copy(m_tempComplexBuffer.begin(), m_tempComplexBuffer.end(), m_complexBuffer.begin());
251
- break;
252
- }
253
- case TransformType::RFFT:
254
- {
255
- if (m_forward)
319
+ else if (m_type == TransformType::RFFT)
256
320
  {
257
321
  m_engine->rfft(m_realBuffer.data(), m_complexBuffer.data());
258
322
  }
259
- else
260
- {
261
- // Inverse RFFT: half-spectrum complex input already loaded
262
- m_engine->irfft(m_complexBuffer.data(), m_realBuffer.data());
263
- }
264
- break;
265
- }
266
- case TransformType::IRFFT:
267
- {
268
- // Half-spectrum complex input already loaded into m_complexBuffer
269
- m_engine->irfft(m_complexBuffer.data(), m_realBuffer.data());
270
- break;
271
- }
272
- case TransformType::RDFT:
273
- {
274
- if (m_forward)
323
+ else // RDFT
275
324
  {
276
325
  m_engine->rdft(m_realBuffer.data(), m_complexBuffer.data());
277
326
  }
278
- else
279
- {
280
- // Inverse RDFT: half-spectrum complex input already loaded
281
- m_engine->irdft(m_complexBuffer.data(), m_realBuffer.data());
282
- }
283
- break;
284
- }
285
- case TransformType::IRDFT:
286
- {
287
- // Half-spectrum complex input already loaded into m_complexBuffer
288
- m_engine->irdft(m_complexBuffer.data(), m_realBuffer.data());
289
- break;
290
- }
291
327
  }
292
328
 
293
- // Convert to output format
329
+ // Write output (OPTIMIZED: Loop unrolling for interleaving)
294
330
  if (isInverseReal || isInverseComplex)
295
331
  {
296
332
  // ALL inverse transforms output real time-domain values
297
- if (isInverseReal)
333
+ const float *sourceData = isInverseReal ? m_realBuffer.data() : nullptr;
334
+
335
+ // OPTIMIZATION: Special case for single channel (no interleaving)
336
+ if (numChannels == 1)
298
337
  {
299
- // Real inverse transforms: use m_realBuffer directly
300
- for (size_t i = 0; i < m_fftSize; ++i)
338
+ if (isInverseReal)
301
339
  {
302
- frameOutput[i * numChannels] = m_realBuffer[i];
340
+ std::memcpy(frameOutput, m_realBuffer.data(), m_fftSize * sizeof(float));
341
+ }
342
+ else
343
+ {
344
+ // Extract real parts - unroll by 8
345
+ size_t i = 0;
346
+ for (; i + 7 < m_fftSize; i += 8)
347
+ {
348
+ frameOutput[i] = m_complexBuffer[i].real();
349
+ frameOutput[i + 1] = m_complexBuffer[i + 1].real();
350
+ frameOutput[i + 2] = m_complexBuffer[i + 2].real();
351
+ frameOutput[i + 3] = m_complexBuffer[i + 3].real();
352
+ frameOutput[i + 4] = m_complexBuffer[i + 4].real();
353
+ frameOutput[i + 5] = m_complexBuffer[i + 5].real();
354
+ frameOutput[i + 6] = m_complexBuffer[i + 6].real();
355
+ frameOutput[i + 7] = m_complexBuffer[i + 7].real();
356
+ }
357
+ for (; i < m_fftSize; ++i)
358
+ {
359
+ frameOutput[i] = m_complexBuffer[i].real();
360
+ }
303
361
  }
304
362
  }
305
363
  else
306
364
  {
307
- // IFFT/IDFT: extract real parts from complex result
308
- for (size_t i = 0; i < m_fftSize; ++i)
365
+ // Interleave with loop unrolling by 4
366
+ size_t i = 0;
367
+ const size_t stride = numChannels;
368
+
369
+ for (; i + 3 < m_fftSize; i += 4)
309
370
  {
310
- frameOutput[i * numChannels] = m_complexBuffer[i].real();
371
+ if (isInverseReal)
372
+ {
373
+ frameOutput[i * stride] = m_realBuffer[i];
374
+ frameOutput[(i + 1) * stride] = m_realBuffer[i + 1];
375
+ frameOutput[(i + 2) * stride] = m_realBuffer[i + 2];
376
+ frameOutput[(i + 3) * stride] = m_realBuffer[i + 3];
377
+ }
378
+ else
379
+ {
380
+ frameOutput[i * stride] = m_complexBuffer[i].real();
381
+ frameOutput[(i + 1) * stride] = m_complexBuffer[i + 1].real();
382
+ frameOutput[(i + 2) * stride] = m_complexBuffer[i + 2].real();
383
+ frameOutput[(i + 3) * stride] = m_complexBuffer[i + 3].real();
384
+ }
385
+ }
386
+
387
+ // Handle remainder
388
+ for (; i < m_fftSize; ++i)
389
+ {
390
+ frameOutput[i * stride] = isInverseReal
391
+ ? m_realBuffer[i]
392
+ : m_complexBuffer[i].real();
311
393
  }
312
394
  }
313
395
  }
314
396
  else
315
397
  {
316
- // Forward transforms only
398
+ // Forward transforms - write output based on format
317
399
  switch (m_format)
318
400
  {
319
401
  case OutputFormat::COMPLEX:
320
402
  {
321
- // Interleave real/imag
403
+ // OPTIMIZATION: Calculate numBins once, interleave with loop unrolling
322
404
  size_t numBins = (m_type == TransformType::RFFT || m_type == TransformType::RDFT)
323
405
  ? m_engine->getHalfSize()
324
406
  : m_fftSize;
325
- for (size_t i = 0; i < numBins; ++i)
407
+
408
+ // OPTIMIZATION: Unroll complex interleaving by 4
409
+ size_t i = 0;
410
+ const size_t stride = numChannels;
411
+
412
+ for (; i + 3 < numBins; i += 4)
326
413
  {
327
- frameOutput[(i * 2) * numChannels] = m_complexBuffer[i].real();
328
- frameOutput[(i * 2 + 1) * numChannels] = m_complexBuffer[i].imag();
414
+ size_t outIdx0 = (i * 2) * stride;
415
+ size_t outIdx1 = ((i + 1) * 2) * stride;
416
+ size_t outIdx2 = ((i + 2) * 2) * stride;
417
+ size_t outIdx3 = ((i + 3) * 2) * stride;
418
+
419
+ frameOutput[outIdx0] = m_complexBuffer[i].real();
420
+ frameOutput[outIdx0 + stride] = m_complexBuffer[i].imag();
421
+ frameOutput[outIdx1] = m_complexBuffer[i + 1].real();
422
+ frameOutput[outIdx1 + stride] = m_complexBuffer[i + 1].imag();
423
+ frameOutput[outIdx2] = m_complexBuffer[i + 2].real();
424
+ frameOutput[outIdx2 + stride] = m_complexBuffer[i + 2].imag();
425
+ frameOutput[outIdx3] = m_complexBuffer[i + 3].real();
426
+ frameOutput[outIdx3 + stride] = m_complexBuffer[i + 3].imag();
329
427
  }
330
- break;
331
- }
332
- case OutputFormat::MAGNITUDE:
333
- {
334
- size_t numBins = (m_type == TransformType::RFFT || m_type == TransformType::RDFT)
335
- ? m_engine->getHalfSize()
336
- : m_fftSize;
337
- std::vector<float> magnitudes(numBins);
338
- m_engine->getMagnitude(m_complexBuffer.data(), magnitudes.data(), numBins);
339
- for (size_t i = 0; i < numBins; ++i)
428
+
429
+ // Handle remainder
430
+ for (; i < numBins; ++i)
340
431
  {
341
- frameOutput[i * numChannels] = magnitudes[i];
432
+ size_t outIdx = (i * 2) * stride;
433
+ frameOutput[outIdx] = m_complexBuffer[i].real();
434
+ frameOutput[outIdx + stride] = m_complexBuffer[i].imag();
342
435
  }
343
436
  break;
344
437
  }
438
+ case OutputFormat::MAGNITUDE:
345
439
  case OutputFormat::POWER:
440
+ case OutputFormat::PHASE:
346
441
  {
442
+ // OPTIMIZATION: Use stack buffer for small sizes, avoid heap allocation
347
443
  size_t numBins = (m_type == TransformType::RFFT || m_type == TransformType::RDFT)
348
444
  ? m_engine->getHalfSize()
349
445
  : m_fftSize;
350
- std::vector<float> power(numBins);
351
- m_engine->getPower(m_complexBuffer.data(), power.data(), numBins);
352
- for (size_t i = 0; i < numBins; ++i)
446
+
447
+ // Use member realBuffer for temporary storage (already allocated)
448
+ float *tempOutput = m_realBuffer.data(); // Reuse buffer
449
+
450
+ if (m_format == OutputFormat::MAGNITUDE)
353
451
  {
354
- frameOutput[i * numChannels] = power[i];
452
+ m_engine->getMagnitude(m_complexBuffer.data(), tempOutput, numBins);
355
453
  }
356
- break;
357
- }
358
- case OutputFormat::PHASE:
359
- {
360
- size_t numBins = (m_type == TransformType::RFFT || m_type == TransformType::RDFT)
361
- ? m_engine->getHalfSize()
362
- : m_fftSize;
363
- std::vector<float> phases(numBins);
364
- m_engine->getPhase(m_complexBuffer.data(), phases.data(), numBins);
365
- for (size_t i = 0; i < numBins; ++i)
454
+ else if (m_format == OutputFormat::POWER)
366
455
  {
367
- frameOutput[i * numChannels] = phases[i];
456
+ m_engine->getPower(m_complexBuffer.data(), tempOutput, numBins);
457
+ }
458
+ else // PHASE
459
+ {
460
+ m_engine->getPhase(m_complexBuffer.data(), tempOutput, numBins);
461
+ }
462
+
463
+ // Write to output with stride (OPTIMIZED: Loop unrolling + special case)
464
+ if (numChannels == 1)
465
+ {
466
+ // Direct memcpy for single channel
467
+ std::memcpy(frameOutput, tempOutput, numBins * sizeof(float));
468
+ }
469
+ else
470
+ {
471
+ // Interleave with loop unrolling by 8
472
+ size_t i = 0;
473
+ const size_t stride = numChannels;
474
+
475
+ for (; i + 7 < numBins; i += 8)
476
+ {
477
+ frameOutput[i * stride] = tempOutput[i];
478
+ frameOutput[(i + 1) * stride] = tempOutput[i + 1];
479
+ frameOutput[(i + 2) * stride] = tempOutput[i + 2];
480
+ frameOutput[(i + 3) * stride] = tempOutput[i + 3];
481
+ frameOutput[(i + 4) * stride] = tempOutput[i + 4];
482
+ frameOutput[(i + 5) * stride] = tempOutput[i + 5];
483
+ frameOutput[(i + 6) * stride] = tempOutput[i + 6];
484
+ frameOutput[(i + 7) * stride] = tempOutput[i + 7];
485
+ }
486
+
487
+ // Handle remainder
488
+ for (; i < numBins; ++i)
489
+ {
490
+ frameOutput[i * stride] = tempOutput[i];
491
+ }
368
492
  }
369
493
  break;
370
494
  }
@@ -51,7 +51,7 @@ namespace dsp
51
51
  // ========== Public Transform Methods ==========
52
52
 
53
53
  template <typename T>
54
- void FftpackContext<T>::rfft(const T *input, std::complex<T> *output)
54
+ void FftpackContext<T>::rfft(const T *__restrict input, std::complex<T> *__restrict output)
55
55
  {
56
56
  if (m_n == 1)
57
57
  {
@@ -59,15 +59,15 @@ namespace dsp
59
59
  return;
60
60
  }
61
61
 
62
- // Copy input to work buffer
63
- std::copy(input, input + m_n, m_workBuffer.data());
62
+ // OPTIMIZATION: Use memcpy for the bulk copy (valid for trivially copyable T; std::copy typically lowers to the same code)
63
+ std::memcpy(m_workBuffer.data(), input, m_n * sizeof(T));
64
64
 
65
65
  // Perform forward real FFT
66
66
  drftf1(m_n, m_workBuffer.data(), m_wsave.data(), m_wsave.data() + m_n, m_ifac.data());
67
67
 
68
+ // OPTIMIZATION: Improved format conversion with better cache locality
68
69
  // Convert FFTPACK halfcomplex format to standard complex format
69
70
 // FFTPACK stores: [DC, re1, im1, re2, im2, ..., reN/2-1, imN/2-1, Nyquist] — interleaved (re, im) pairs (for even N)
70
- // (for even N)
71
71
 
72
72
  size_t halfSize = (m_n / 2) + 1;
73
73
 
@@ -76,17 +76,37 @@ namespace dsp
76
76
 
77
77
  if (m_n % 2 == 0)
78
78
  {
79
- // Even N: Nyquist is at position m_n/2
80
- for (size_t i = 1; i < m_n / 2; ++i)
79
+ // OPTIMIZATION: Even N - process in order for better cache locality
80
+ size_t half = m_n / 2;
81
+
82
+ // Process middle frequencies (unrolled by 2 for better ILP)
83
+ size_t i = 1;
84
+ for (; i + 1 < half; i += 2)
85
+ {
86
+ // First pair
87
+ output[i] = std::complex<T>(m_workBuffer[2 * i - 1], m_workBuffer[2 * i]);
88
+ // Second pair
89
+ output[i + 1] = std::complex<T>(m_workBuffer[2 * (i + 1) - 1], m_workBuffer[2 * (i + 1)]);
90
+ }
91
+ // Handle remainder
92
+ for (; i < half; ++i)
81
93
  {
82
94
  output[i] = std::complex<T>(m_workBuffer[2 * i - 1], m_workBuffer[2 * i]);
83
95
  }
84
- output[m_n / 2] = std::complex<T>(m_workBuffer[m_n - 1], 0);
96
+
97
+ // Nyquist component (real)
98
+ output[half] = std::complex<T>(m_workBuffer[m_n - 1], 0);
85
99
  }
86
100
  else
87
101
  {
88
- // Odd N: no separate Nyquist
89
- for (size_t i = 1; i < halfSize; ++i)
102
+ // OPTIMIZATION: Odd N - unrolled loop
103
+ size_t i = 1;
104
+ for (; i + 1 < halfSize; i += 2)
105
+ {
106
+ output[i] = std::complex<T>(m_workBuffer[2 * i - 1], m_workBuffer[2 * i]);
107
+ output[i + 1] = std::complex<T>(m_workBuffer[2 * (i + 1) - 1], m_workBuffer[2 * (i + 1)]);
108
+ }
109
+ for (; i < halfSize; ++i)
90
110
  {
91
111
  output[i] = std::complex<T>(m_workBuffer[2 * i - 1], m_workBuffer[2 * i]);
92
112
  }
@@ -94,7 +114,7 @@ namespace dsp
94
114
  }
95
115
 
96
116
  template <typename T>
97
- void FftpackContext<T>::irfft(const std::complex<T> *input, T *output)
117
+ void FftpackContext<T>::irfft(const std::complex<T> *__restrict input, T *__restrict output)
98
118
  {
99
119
  if (m_n == 1)
100
120
  {
@@ -102,24 +122,46 @@ namespace dsp
102
122
  return;
103
123
  }
104
124
 
125
+ // OPTIMIZATION: Improved format conversion with loop unrolling
105
126
  // Convert standard complex format to FFTPACK halfcomplex format
106
127
  m_workBuffer[0] = input[0].real(); // DC
107
128
 
108
129
  if (m_n % 2 == 0)
109
130
  {
110
- // Even N
111
- for (size_t i = 1; i < m_n / 2; ++i)
131
+ // OPTIMIZATION: Even N - unrolled conversion
132
+ size_t half = m_n / 2;
133
+ size_t i = 1;
134
+
135
+ // Unroll by 2
136
+ for (; i + 1 < half; i += 2)
112
137
  {
113
138
  m_workBuffer[2 * i - 1] = input[i].real();
114
139
  m_workBuffer[2 * i] = input[i].imag();
140
+ m_workBuffer[2 * (i + 1) - 1] = input[i + 1].real();
141
+ m_workBuffer[2 * (i + 1)] = input[i + 1].imag();
115
142
  }
116
- m_workBuffer[m_n - 1] = input[m_n / 2].real(); // Nyquist
143
+ for (; i < half; ++i)
144
+ {
145
+ m_workBuffer[2 * i - 1] = input[i].real();
146
+ m_workBuffer[2 * i] = input[i].imag();
147
+ }
148
+ m_workBuffer[m_n - 1] = input[half].real(); // Nyquist
117
149
  }
118
150
  else
119
151
  {
120
- // Odd N
152
+ // OPTIMIZATION: Odd N - unrolled conversion
121
153
  size_t halfSize = (m_n / 2) + 1;
122
- for (size_t i = 1; i < halfSize; ++i)
154
+ size_t i = 1;
155
+
156
+ // Unroll by 2
157
+ for (; i + 1 < halfSize; i += 2)
158
+ {
159
+ m_workBuffer[2 * i - 1] = input[i].real();
160
+ m_workBuffer[2 * i] = input[i].imag();
161
+ m_workBuffer[2 * (i + 1) - 1] = input[i + 1].real();
162
+ m_workBuffer[2 * (i + 1)] = input[i + 1].imag();
163
+ }
164
+ for (; i < halfSize; ++i)
123
165
  {
124
166
  m_workBuffer[2 * i - 1] = input[i].real();
125
167
  m_workBuffer[2 * i] = input[i].imag();
@@ -129,8 +171,8 @@ namespace dsp
129
171
  // Perform inverse real FFT
130
172
  drftb1(m_n, m_workBuffer.data(), m_wsave.data(), m_wsave.data() + m_n, m_ifac.data());
131
173
 
132
- // Copy result (FFTPACK doesn't normalize)
133
- std::copy(m_workBuffer.begin(), m_workBuffer.end(), output);
174
+ // OPTIMIZATION: Use memcpy for bulk copy
175
+ std::memcpy(output, m_workBuffer.data(), m_n * sizeof(T));
134
176
  }
135
177
 
136
178
  // ========== FFTPACK Initialization ==========
@@ -40,8 +40,9 @@ namespace dsp
40
40
  explicit FftpackContext(size_t n);
41
41
 
42
42
  // Forward/inverse real FFT
43
- void rfft(const T *input, std::complex<T> *output);
44
- void irfft(const std::complex<T> *input, T *output);
43
+ // Note: input/output buffers must not overlap (restrict semantics)
44
+ void rfft(const T *__restrict input, std::complex<T> *__restrict output);
45
+ void irfft(const std::complex<T> *__restrict input, T *__restrict output);
45
46
 
46
47
  size_t size() const { return m_n; }
47
48
  size_t halfSize() const { return (m_n / 2) + 1; }