dspx 1.1.3 → 1.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -51,7 +51,7 @@ namespace dsp
51
51
  // ========== Public Transform Methods ==========
52
52
 
53
53
  template <typename T>
54
- void FftpackContext<T>::rfft(const T *input, std::complex<T> *output)
54
+ void FftpackContext<T>::rfft(const T *__restrict input, std::complex<T> *__restrict output)
55
55
  {
56
56
  if (m_n == 1)
57
57
  {
@@ -59,15 +59,15 @@ namespace dsp
59
59
  return;
60
60
  }
61
61
 
62
- // Copy input to work buffer
63
- std::copy(input, input + m_n, m_workBuffer.data());
62
+ // OPTIMIZATION: Use memcpy for bulk copy (faster than std::copy for POD types)
63
+ std::memcpy(m_workBuffer.data(), input, m_n * sizeof(T));
64
64
 
65
65
  // Perform forward real FFT
66
66
  drftf1(m_n, m_workBuffer.data(), m_wsave.data(), m_wsave.data() + m_n, m_ifac.data());
67
67
 
68
+ // OPTIMIZATION: Improved format conversion with better cache locality
68
69
  // Convert FFTPACK halfcomplex format to standard complex format
69
70
  // FFTPACK stores interleaved pairs: [DC, re1, im1, re2, im2, ..., Nyquist] (Nyquist entry only for even N)
70
- // (for even N)
71
71
 
72
72
  size_t halfSize = (m_n / 2) + 1;
73
73
 
@@ -76,17 +76,37 @@ namespace dsp
76
76
 
77
77
  if (m_n % 2 == 0)
78
78
  {
79
- // Even N: Nyquist is at position m_n/2
80
- for (size_t i = 1; i < m_n / 2; ++i)
79
+ // OPTIMIZATION: Even N - process in order for better cache locality
80
+ size_t half = m_n / 2;
81
+
82
+ // Process middle frequencies (unrolled by 2 for better ILP)
83
+ size_t i = 1;
84
+ for (; i + 1 < half; i += 2)
85
+ {
86
+ // First pair
87
+ output[i] = std::complex<T>(m_workBuffer[2 * i - 1], m_workBuffer[2 * i]);
88
+ // Second pair
89
+ output[i + 1] = std::complex<T>(m_workBuffer[2 * (i + 1) - 1], m_workBuffer[2 * (i + 1)]);
90
+ }
91
+ // Handle remainder
92
+ for (; i < half; ++i)
81
93
  {
82
94
  output[i] = std::complex<T>(m_workBuffer[2 * i - 1], m_workBuffer[2 * i]);
83
95
  }
84
- output[m_n / 2] = std::complex<T>(m_workBuffer[m_n - 1], 0);
96
+
97
+ // Nyquist component (real)
98
+ output[half] = std::complex<T>(m_workBuffer[m_n - 1], 0);
85
99
  }
86
100
  else
87
101
  {
88
- // Odd N: no separate Nyquist
89
- for (size_t i = 1; i < halfSize; ++i)
102
+ // OPTIMIZATION: Odd N - unrolled loop
103
+ size_t i = 1;
104
+ for (; i + 1 < halfSize; i += 2)
105
+ {
106
+ output[i] = std::complex<T>(m_workBuffer[2 * i - 1], m_workBuffer[2 * i]);
107
+ output[i + 1] = std::complex<T>(m_workBuffer[2 * (i + 1) - 1], m_workBuffer[2 * (i + 1)]);
108
+ }
109
+ for (; i < halfSize; ++i)
90
110
  {
91
111
  output[i] = std::complex<T>(m_workBuffer[2 * i - 1], m_workBuffer[2 * i]);
92
112
  }
@@ -94,7 +114,7 @@ namespace dsp
94
114
  }
95
115
 
96
116
  template <typename T>
97
- void FftpackContext<T>::irfft(const std::complex<T> *input, T *output)
117
+ void FftpackContext<T>::irfft(const std::complex<T> *__restrict input, T *__restrict output)
98
118
  {
99
119
  if (m_n == 1)
100
120
  {
@@ -102,24 +122,46 @@ namespace dsp
102
122
  return;
103
123
  }
104
124
 
125
+ // OPTIMIZATION: Improved format conversion with loop unrolling
105
126
  // Convert standard complex format to FFTPACK halfcomplex format
106
127
  m_workBuffer[0] = input[0].real(); // DC
107
128
 
108
129
  if (m_n % 2 == 0)
109
130
  {
110
- // Even N
111
- for (size_t i = 1; i < m_n / 2; ++i)
131
+ // OPTIMIZATION: Even N - unrolled conversion
132
+ size_t half = m_n / 2;
133
+ size_t i = 1;
134
+
135
+ // Unroll by 2
136
+ for (; i + 1 < half; i += 2)
112
137
  {
113
138
  m_workBuffer[2 * i - 1] = input[i].real();
114
139
  m_workBuffer[2 * i] = input[i].imag();
140
+ m_workBuffer[2 * (i + 1) - 1] = input[i + 1].real();
141
+ m_workBuffer[2 * (i + 1)] = input[i + 1].imag();
115
142
  }
116
- m_workBuffer[m_n - 1] = input[m_n / 2].real(); // Nyquist
143
+ for (; i < half; ++i)
144
+ {
145
+ m_workBuffer[2 * i - 1] = input[i].real();
146
+ m_workBuffer[2 * i] = input[i].imag();
147
+ }
148
+ m_workBuffer[m_n - 1] = input[half].real(); // Nyquist
117
149
  }
118
150
  else
119
151
  {
120
- // Odd N
152
+ // OPTIMIZATION: Odd N - unrolled conversion
121
153
  size_t halfSize = (m_n / 2) + 1;
122
- for (size_t i = 1; i < halfSize; ++i)
154
+ size_t i = 1;
155
+
156
+ // Unroll by 2
157
+ for (; i + 1 < halfSize; i += 2)
158
+ {
159
+ m_workBuffer[2 * i - 1] = input[i].real();
160
+ m_workBuffer[2 * i] = input[i].imag();
161
+ m_workBuffer[2 * (i + 1) - 1] = input[i + 1].real();
162
+ m_workBuffer[2 * (i + 1)] = input[i + 1].imag();
163
+ }
164
+ for (; i < halfSize; ++i)
123
165
  {
124
166
  m_workBuffer[2 * i - 1] = input[i].real();
125
167
  m_workBuffer[2 * i] = input[i].imag();
@@ -129,8 +171,8 @@ namespace dsp
129
171
  // Perform inverse real FFT
130
172
  drftb1(m_n, m_workBuffer.data(), m_wsave.data(), m_wsave.data() + m_n, m_ifac.data());
131
173
 
132
- // Copy result (FFTPACK doesn't normalize)
133
- std::copy(m_workBuffer.begin(), m_workBuffer.end(), output);
174
+ // OPTIMIZATION: Use memcpy for bulk copy
175
+ std::memcpy(output, m_workBuffer.data(), m_n * sizeof(T));
134
176
  }
135
177
 
136
178
  // ========== FFTPACK Initialization ==========
@@ -40,8 +40,9 @@ namespace dsp
40
40
  explicit FftpackContext(size_t n);
41
41
 
42
42
  // Forward/inverse real FFT
43
- void rfft(const T *input, std::complex<T> *output);
44
- void irfft(const std::complex<T> *input, T *output);
43
+ // Note: input/output buffers must not overlap (restrict semantics)
44
+ void rfft(const T *__restrict input, std::complex<T> *__restrict output);
45
+ void irfft(const std::complex<T> *__restrict input, T *__restrict output);
45
46
 
46
47
  size_t size() const { return m_n; }
47
48
  size_t halfSize() const { return (m_n / 2) + 1; }