dspx 0.1.1-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172) hide show
  1. package/.github/workflows/ci.yml +185 -0
  2. package/.vscode/c_cpp_properties.json +17 -0
  3. package/.vscode/settings.json +68 -0
  4. package/.vscode/tasks.json +28 -0
  5. package/DISCLAIMER.md +32 -0
  6. package/LICENSE +21 -0
  7. package/README.md +1803 -0
  8. package/ROADMAP.md +192 -0
  9. package/TECHNICAL_DEBT.md +165 -0
  10. package/binding.gyp +65 -0
  11. package/docs/ADVANCED_LOGGER_FEATURES.md +598 -0
  12. package/docs/AUTHENTICATION_SECURITY.md +396 -0
  13. package/docs/BACKEND_IMPROVEMENTS.md +399 -0
  14. package/docs/CHEBYSHEV_BIQUAD_EQ_IMPLEMENTATION.md +405 -0
  15. package/docs/FFT_IMPLEMENTATION.md +490 -0
  16. package/docs/FFT_IMPROVEMENTS_SUMMARY.md +387 -0
  17. package/docs/FFT_USER_GUIDE.md +494 -0
  18. package/docs/FILTERS_IMPLEMENTATION.md +260 -0
  19. package/docs/FILTER_API_GUIDE.md +418 -0
  20. package/docs/FIR_SIMD_OPTIMIZATION.md +175 -0
  21. package/docs/LOGGER_API_REFERENCE.md +350 -0
  22. package/docs/NOTCH_FILTER_QUICK_REF.md +121 -0
  23. package/docs/PHASE2_TESTS_AND_NOTCH_FILTER.md +341 -0
  24. package/docs/PHASES_5_7_SUMMARY.md +403 -0
  25. package/docs/PIPELINE_FILTER_INTEGRATION.md +446 -0
  26. package/docs/SIMD_OPTIMIZATIONS.md +211 -0
  27. package/docs/TEST_MIGRATION_SUMMARY.md +173 -0
  28. package/docs/TIMESERIES_IMPLEMENTATION_SUMMARY.md +322 -0
  29. package/docs/TIMESERIES_QUICK_REF.md +85 -0
  30. package/docs/advanced.md +559 -0
  31. package/docs/time-series-guide.md +617 -0
  32. package/docs/time-series-migration.md +376 -0
  33. package/jest.config.js +37 -0
  34. package/package.json +42 -0
  35. package/prebuilds/linux-x64/dsp-ts-redis.node +0 -0
  36. package/prebuilds/win32-x64/dsp-ts-redis.node +0 -0
  37. package/scripts/test.js +24 -0
  38. package/src/build/dsp-ts-redis.node +0 -0
  39. package/src/native/DspPipeline.cc +675 -0
  40. package/src/native/DspPipeline.h +44 -0
  41. package/src/native/FftBindings.cc +817 -0
  42. package/src/native/FilterBindings.cc +1001 -0
  43. package/src/native/IDspStage.h +53 -0
  44. package/src/native/adapters/InterpolatorStage.h +201 -0
  45. package/src/native/adapters/MeanAbsoluteValueStage.h +289 -0
  46. package/src/native/adapters/MovingAverageStage.h +306 -0
  47. package/src/native/adapters/RectifyStage.h +88 -0
  48. package/src/native/adapters/ResamplerStage.h +238 -0
  49. package/src/native/adapters/RmsStage.h +299 -0
  50. package/src/native/adapters/SscStage.h +121 -0
  51. package/src/native/adapters/VarianceStage.h +307 -0
  52. package/src/native/adapters/WampStage.h +114 -0
  53. package/src/native/adapters/WaveformLengthStage.h +115 -0
  54. package/src/native/adapters/ZScoreNormalizeStage.h +326 -0
  55. package/src/native/core/FftEngine.cc +441 -0
  56. package/src/native/core/FftEngine.h +224 -0
  57. package/src/native/core/FirFilter.cc +324 -0
  58. package/src/native/core/FirFilter.h +149 -0
  59. package/src/native/core/IirFilter.cc +576 -0
  60. package/src/native/core/IirFilter.h +210 -0
  61. package/src/native/core/MovingAbsoluteValueFilter.cc +17 -0
  62. package/src/native/core/MovingAbsoluteValueFilter.h +135 -0
  63. package/src/native/core/MovingAverageFilter.cc +18 -0
  64. package/src/native/core/MovingAverageFilter.h +135 -0
  65. package/src/native/core/MovingFftFilter.cc +291 -0
  66. package/src/native/core/MovingFftFilter.h +203 -0
  67. package/src/native/core/MovingVarianceFilter.cc +194 -0
  68. package/src/native/core/MovingVarianceFilter.h +114 -0
  69. package/src/native/core/MovingZScoreFilter.cc +215 -0
  70. package/src/native/core/MovingZScoreFilter.h +113 -0
  71. package/src/native/core/Policies.h +352 -0
  72. package/src/native/core/RmsFilter.cc +18 -0
  73. package/src/native/core/RmsFilter.h +131 -0
  74. package/src/native/core/SscFilter.cc +16 -0
  75. package/src/native/core/SscFilter.h +137 -0
  76. package/src/native/core/WampFilter.cc +16 -0
  77. package/src/native/core/WampFilter.h +101 -0
  78. package/src/native/core/WaveformLengthFilter.cc +17 -0
  79. package/src/native/core/WaveformLengthFilter.h +98 -0
  80. package/src/native/utils/CircularBufferArray.cc +336 -0
  81. package/src/native/utils/CircularBufferArray.h +62 -0
  82. package/src/native/utils/CircularBufferVector.cc +145 -0
  83. package/src/native/utils/CircularBufferVector.h +45 -0
  84. package/src/native/utils/NapiUtils.cc +53 -0
  85. package/src/native/utils/NapiUtils.h +21 -0
  86. package/src/native/utils/SimdOps.h +870 -0
  87. package/src/native/utils/SlidingWindowFilter.cc +239 -0
  88. package/src/native/utils/SlidingWindowFilter.h +159 -0
  89. package/src/native/utils/TimeSeriesBuffer.cc +205 -0
  90. package/src/native/utils/TimeSeriesBuffer.h +140 -0
  91. package/src/ts/CircularLogBuffer.ts +87 -0
  92. package/src/ts/DriftDetector.ts +331 -0
  93. package/src/ts/TopicRouter.ts +428 -0
  94. package/src/ts/__tests__/AdvancedDsp.test.ts +585 -0
  95. package/src/ts/__tests__/AuthAndEdgeCases.test.ts +241 -0
  96. package/src/ts/__tests__/Chaining.test.ts +387 -0
  97. package/src/ts/__tests__/ChebyshevBiquad.test.ts +229 -0
  98. package/src/ts/__tests__/CircularLogBuffer.test.ts +158 -0
  99. package/src/ts/__tests__/DriftDetector.test.ts +389 -0
  100. package/src/ts/__tests__/Fft.test.ts +484 -0
  101. package/src/ts/__tests__/ListState.test.ts +153 -0
  102. package/src/ts/__tests__/Logger.test.ts +208 -0
  103. package/src/ts/__tests__/LoggerAdvanced.test.ts +319 -0
  104. package/src/ts/__tests__/LoggerMinor.test.ts +247 -0
  105. package/src/ts/__tests__/MeanAbsoluteValue.test.ts +398 -0
  106. package/src/ts/__tests__/MovingAverage.test.ts +322 -0
  107. package/src/ts/__tests__/RMS.test.ts +315 -0
  108. package/src/ts/__tests__/Rectify.test.ts +272 -0
  109. package/src/ts/__tests__/Redis.test.ts +456 -0
  110. package/src/ts/__tests__/SlopeSignChange.test.ts +166 -0
  111. package/src/ts/__tests__/Tap.test.ts +164 -0
  112. package/src/ts/__tests__/TimeBasedExpiration.test.ts +124 -0
  113. package/src/ts/__tests__/TimeBasedRmsAndMav.test.ts +231 -0
  114. package/src/ts/__tests__/TimeBasedVarianceAndZScore.test.ts +284 -0
  115. package/src/ts/__tests__/TimeSeries.test.ts +254 -0
  116. package/src/ts/__tests__/TopicRouter.test.ts +332 -0
  117. package/src/ts/__tests__/TopicRouterAdvanced.test.ts +483 -0
  118. package/src/ts/__tests__/TopicRouterPriority.test.ts +487 -0
  119. package/src/ts/__tests__/Variance.test.ts +509 -0
  120. package/src/ts/__tests__/WaveformLength.test.ts +147 -0
  121. package/src/ts/__tests__/WillisonAmplitude.test.ts +197 -0
  122. package/src/ts/__tests__/ZScoreNormalize.test.ts +459 -0
  123. package/src/ts/advanced-dsp.ts +566 -0
  124. package/src/ts/backends.ts +1137 -0
  125. package/src/ts/bindings.ts +1225 -0
  126. package/src/ts/easter-egg.ts +42 -0
  127. package/src/ts/examples/MeanAbsoluteValue/test-state.ts +99 -0
  128. package/src/ts/examples/MeanAbsoluteValue/test-streaming.ts +269 -0
  129. package/src/ts/examples/MovingAverage/test-state.ts +85 -0
  130. package/src/ts/examples/MovingAverage/test-streaming.ts +188 -0
  131. package/src/ts/examples/RMS/test-state.ts +97 -0
  132. package/src/ts/examples/RMS/test-streaming.ts +253 -0
  133. package/src/ts/examples/Rectify/test-state.ts +107 -0
  134. package/src/ts/examples/Rectify/test-streaming.ts +242 -0
  135. package/src/ts/examples/Variance/test-state.ts +195 -0
  136. package/src/ts/examples/Variance/test-streaming.ts +260 -0
  137. package/src/ts/examples/ZScoreNormalize/test-state.ts +277 -0
  138. package/src/ts/examples/ZScoreNormalize/test-streaming.ts +306 -0
  139. package/src/ts/examples/advanced-dsp-examples.ts +397 -0
  140. package/src/ts/examples/callbacks/advanced-router-features.ts +326 -0
  141. package/src/ts/examples/callbacks/benchmark-circular-buffer.ts +109 -0
  142. package/src/ts/examples/callbacks/monitoring-example.ts +265 -0
  143. package/src/ts/examples/callbacks/pipeline-callbacks-example.ts +137 -0
  144. package/src/ts/examples/callbacks/pooled-callbacks-example.ts +274 -0
  145. package/src/ts/examples/callbacks/priority-routing-example.ts +277 -0
  146. package/src/ts/examples/callbacks/production-topic-router.ts +214 -0
  147. package/src/ts/examples/callbacks/topic-based-logging.ts +161 -0
  148. package/src/ts/examples/chaining/test-chaining-redis.ts +113 -0
  149. package/src/ts/examples/chaining/test-chaining.ts +52 -0
  150. package/src/ts/examples/emg-features-example.ts +284 -0
  151. package/src/ts/examples/fft-example.ts +309 -0
  152. package/src/ts/examples/fft-examples.ts +349 -0
  153. package/src/ts/examples/filter-examples.ts +320 -0
  154. package/src/ts/examples/list-state-example.ts +131 -0
  155. package/src/ts/examples/logger-example.ts +91 -0
  156. package/src/ts/examples/notch-filter-examples.ts +243 -0
  157. package/src/ts/examples/phase5/drift-detection-example.ts +290 -0
  158. package/src/ts/examples/phase6-7/production-observability.ts +476 -0
  159. package/src/ts/examples/phase6-7/redis-timeseries-integration.ts +446 -0
  160. package/src/ts/examples/redis/redis-example.ts +202 -0
  161. package/src/ts/examples/redis-example.ts +202 -0
  162. package/src/ts/examples/simd-benchmark.ts +126 -0
  163. package/src/ts/examples/tap-debugging.ts +230 -0
  164. package/src/ts/examples/timeseries/comparison-example.ts +290 -0
  165. package/src/ts/examples/timeseries/iot-sensor-example.ts +143 -0
  166. package/src/ts/examples/timeseries/redis-streaming-example.ts +233 -0
  167. package/src/ts/examples/waveform-length-example.ts +139 -0
  168. package/src/ts/fft.ts +722 -0
  169. package/src/ts/filters.ts +1078 -0
  170. package/src/ts/index.ts +120 -0
  171. package/src/ts/types.ts +589 -0
  172. package/tsconfig.json +15 -0
@@ -0,0 +1,870 @@
1
+ #pragma once
2
+
3
+ /**
4
+ * @file SimdOps.h
5
+ * @brief Cross-platform SIMD operations for DSP processing
6
+ *
7
+ * This header provides SIMD-optimized operations with automatic fallback
8
+ * to scalar implementations when SIMD is not available.
9
+ *
10
+ * Supports:
11
+ * - x86/x64: SSE2 (baseline), AVX2 (when available)
12
+ * - ARM: NEON (when available)
13
+ * - Fallback: Scalar operations with compiler auto-vectorization
14
+ */
15
+
16
+ #include <cstddef>
17
+ #include <cmath>
18
+ #include <algorithm>
19
+
20
+ // Platform detection
21
+ #if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
22
+ #define SIMD_X86
23
+ #if defined(__AVX2__)
24
+ #define SIMD_AVX2
25
+ #include <immintrin.h>
26
+ #elif defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
27
+ #define SIMD_SSE2
28
+ #include <emmintrin.h>
29
+ #endif
30
+ #elif defined(__ARM_NEON) || defined(__aarch64__)
31
+ #define SIMD_NEON
32
+ #include <arm_neon.h>
33
+ #endif
34
+
35
+ namespace dsp::simd
36
+ {
37
/**
 * @brief In-place full-wave rectification: every element becomes its absolute value.
 * @param buffer Samples to rectify; overwritten with the result
 * @param size Number of floats in buffer
 */
inline void abs_inplace(float *buffer, size_t size)
{
    // Elements [0, done) are covered by whichever vector loop is compiled in;
    // the scalar loop at the bottom finishes the rest (or does everything
    // when no SIMD path is available).
    size_t done = 0;

#if defined(SIMD_AVX2)
    // 8 lanes per step. AND with 0x7FFFFFFF clears the sign bit of each float.
    const size_t lanes = 8;
    const size_t vec_end = size - (size % lanes);
    const __m256 abs_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));

    for (; done < vec_end; done += lanes)
    {
        float *chunk = buffer + done;
        _mm256_storeu_ps(chunk, _mm256_and_ps(_mm256_loadu_ps(chunk), abs_mask));
    }
#elif defined(SIMD_SSE2)
    // 4 lanes per step, same sign-bit masking as the AVX2 path.
    const size_t lanes = 4;
    const size_t vec_end = size - (size % lanes);
    const __m128 abs_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));

    for (; done < vec_end; done += lanes)
    {
        float *chunk = buffer + done;
        _mm_storeu_ps(chunk, _mm_and_ps(_mm_loadu_ps(chunk), abs_mask));
    }
#elif defined(SIMD_NEON)
    // 4 lanes per step using the dedicated NEON absolute-value intrinsic.
    const size_t lanes = 4;
    const size_t vec_end = size - (size % lanes);

    for (; done < vec_end; done += lanes)
    {
        float *chunk = buffer + done;
        vst1q_f32(chunk, vabsq_f32(vld1q_f32(chunk)));
    }
#endif

    // Scalar tail / scalar fallback.
    for (; done < size; ++done)
    {
        buffer[done] = std::fabs(buffer[done]);
    }
}
112
+
113
/**
 * @brief In-place half-wave rectification: each element x becomes max(0, x).
 * @param buffer Samples to clamp; overwritten with the result
 * @param size Number of floats in buffer
 */
inline void max_zero_inplace(float *buffer, size_t size)
{
    // Index of the first element not yet processed by a vector loop.
    size_t done = 0;

#if defined(SIMD_AVX2)
    const size_t lanes = 8;
    const size_t vec_end = size - (size % lanes);
    const __m256 floor_v = _mm256_setzero_ps();

    for (; done < vec_end; done += lanes)
    {
        float *chunk = buffer + done;
        _mm256_storeu_ps(chunk, _mm256_max_ps(_mm256_loadu_ps(chunk), floor_v));
    }
#elif defined(SIMD_SSE2)
    const size_t lanes = 4;
    const size_t vec_end = size - (size % lanes);
    const __m128 floor_v = _mm_setzero_ps();

    for (; done < vec_end; done += lanes)
    {
        float *chunk = buffer + done;
        _mm_storeu_ps(chunk, _mm_max_ps(_mm_loadu_ps(chunk), floor_v));
    }
#elif defined(SIMD_NEON)
    const size_t lanes = 4;
    const size_t vec_end = size - (size % lanes);
    const float32x4_t floor_v = vdupq_n_f32(0.0f);

    for (; done < vec_end; done += lanes)
    {
        float *chunk = buffer + done;
        vst1q_f32(chunk, vmaxq_f32(vld1q_f32(chunk), floor_v));
    }
#endif

    // Scalar tail / scalar fallback.
    for (; done < size; ++done)
    {
        buffer[done] = std::max(0.0f, buffer[done]);
    }
}
184
+
185
/**
 * @brief Add up all elements of a float array, accumulating in double precision.
 *
 * The AVX2 and SSE2 paths widen each float to double and keep two independent
 * vector accumulators; the scalar path (also used on ARM, since there is no
 * NEON branch here) uses Kahan compensated summation.
 *
 * @param buffer Input samples (read-only)
 * @param size Number of floats in buffer
 * @return Sum of all elements as a double
 */
inline double sum(const float *buffer, size_t size)
{
#if defined(SIMD_AVX2)
    const size_t lanes = 8;
    const size_t vec_end = size - (size % lanes);

    __m256d acc_lo = _mm256_setzero_pd();
    __m256d acc_hi = _mm256_setzero_pd();

    for (size_t k = 0; k < vec_end; k += lanes)
    {
        // 8 floats -> lower 4 and upper 4, each widened to 4 doubles.
        const __m256 chunk = _mm256_loadu_ps(buffer + k);
        acc_lo = _mm256_add_pd(acc_lo, _mm256_cvtps_pd(_mm256_castps256_ps128(chunk)));
        acc_hi = _mm256_add_pd(acc_hi, _mm256_cvtps_pd(_mm256_extractf128_ps(chunk, 1)));
    }

    // Fold 2 x 4 doubles -> 4 -> 2 -> 1.
    const __m256d folded = _mm256_add_pd(acc_lo, acc_hi);
    const __m128d pair = _mm_add_pd(_mm256_castpd256_pd128(folded),
                                    _mm256_extractf128_pd(folded, 1));
    double halves[2];
    _mm_storeu_pd(halves, pair);
    double total = halves[0] + halves[1];

    // Leftover elements that did not fill a whole vector.
    for (size_t k = vec_end; k < size; ++k)
    {
        total += static_cast<double>(buffer[k]);
    }
    return total;

#elif defined(SIMD_SSE2)
    const size_t lanes = 4;
    const size_t vec_end = size - (size % lanes);

    __m128d acc_lo = _mm_setzero_pd();
    __m128d acc_hi = _mm_setzero_pd();

    for (size_t k = 0; k < vec_end; k += lanes)
    {
        // 4 floats -> lanes 0..1 and lanes 2..3, each widened to 2 doubles.
        const __m128 chunk = _mm_loadu_ps(buffer + k);
        acc_lo = _mm_add_pd(acc_lo, _mm_cvtps_pd(chunk));
        acc_hi = _mm_add_pd(acc_hi, _mm_cvtps_pd(_mm_movehl_ps(chunk, chunk)));
    }

    double halves[2];
    _mm_storeu_pd(halves, _mm_add_pd(acc_lo, acc_hi));
    double total = halves[0] + halves[1];

    for (size_t k = vec_end; k < size; ++k)
    {
        total += static_cast<double>(buffer[k]);
    }
    return total;

#else
    // Kahan summation: `lost` carries the low-order bits dropped by the
    // previous addition so they can be fed back into the next one.
    double running = 0.0;
    double lost = 0.0;

    for (size_t k = 0; k < size; ++k)
    {
        const double corrected = static_cast<double>(buffer[k]) - lost;
        const double next = running + corrected;
        lost = (next - running) - corrected;
        running = next;
    }
    return running;
#endif
}
283
+
284
/**
 * @brief Compute the sum of squared elements (e.g. for RMS calculations).
 *
 * Every path widens each sample to double BEFORE squaring. The product of two
 * floats is exactly representable in a double, so no per-term rounding occurs
 * and large inputs (|x| above roughly 1.8e19) do not overflow to +inf the way
 * a single-precision square would. This keeps the vector paths consistent
 * with the remainder loop and the scalar fallback, which already square in
 * double.
 *
 * @param buffer Input samples (read-only)
 * @param size Number of floats in buffer
 * @return Sum of buffer[i]^2 as a double
 */
inline double sum_of_squares(const float *buffer, size_t size)
{
#if defined(SIMD_AVX2)
    const size_t lanes = 8;
    const size_t vec_end = size - (size % lanes);

    __m256d acc_lo = _mm256_setzero_pd();
    __m256d acc_hi = _mm256_setzero_pd();

    for (size_t k = 0; k < vec_end; k += lanes)
    {
        const __m256 chunk = _mm256_loadu_ps(buffer + k);

        // Widen first, then square in double precision.
        const __m256d lo = _mm256_cvtps_pd(_mm256_castps256_ps128(chunk));
        const __m256d hi = _mm256_cvtps_pd(_mm256_extractf128_ps(chunk, 1));

        acc_lo = _mm256_add_pd(acc_lo, _mm256_mul_pd(lo, lo));
        acc_hi = _mm256_add_pd(acc_hi, _mm256_mul_pd(hi, hi));
    }

    // Fold 2 x 4 doubles -> 4 -> 2 -> 1.
    const __m256d folded = _mm256_add_pd(acc_lo, acc_hi);
    const __m128d pair = _mm_add_pd(_mm256_castpd256_pd128(folded),
                                    _mm256_extractf128_pd(folded, 1));
    double halves[2];
    _mm_storeu_pd(halves, pair);
    double total = halves[0] + halves[1];

    // Leftover elements that did not fill a whole vector.
    for (size_t k = vec_end; k < size; ++k)
    {
        const double val = static_cast<double>(buffer[k]);
        total += val * val;
    }
    return total;

#elif defined(SIMD_SSE2)
    const size_t lanes = 4;
    const size_t vec_end = size - (size % lanes);

    __m128d acc_lo = _mm_setzero_pd();
    __m128d acc_hi = _mm_setzero_pd();

    for (size_t k = 0; k < vec_end; k += lanes)
    {
        const __m128 chunk = _mm_loadu_ps(buffer + k);

        // Widen lanes 0..1 and 2..3 to double, then square.
        const __m128d lo = _mm_cvtps_pd(chunk);
        const __m128d hi = _mm_cvtps_pd(_mm_movehl_ps(chunk, chunk));

        acc_lo = _mm_add_pd(acc_lo, _mm_mul_pd(lo, lo));
        acc_hi = _mm_add_pd(acc_hi, _mm_mul_pd(hi, hi));
    }

    double halves[2];
    _mm_storeu_pd(halves, _mm_add_pd(acc_lo, acc_hi));
    double total = halves[0] + halves[1];

    for (size_t k = vec_end; k < size; ++k)
    {
        const double val = static_cast<double>(buffer[k]);
        total += val * val;
    }
    return total;

#else
    // Scalar path with Kahan compensated summation: `lost` carries the
    // low-order bits dropped by the previous addition.
    double running = 0.0;
    double lost = 0.0;

    for (size_t k = 0; k < size; ++k)
    {
        const double val = static_cast<double>(buffer[k]);
        const double corrected = (val * val) - lost;
        const double next = running + corrected;
        lost = (next - running) - corrected;
        running = next;
    }
    return running;
#endif
}
387
+
388
/**
 * @brief Multiply a signal by a window, element by element.
 *        output[i] = input[i] * window[i]
 * @param input Signal samples (read-only)
 * @param window Window coefficients (read-only)
 * @param output Destination for the windowed signal
 * @param size Number of elements in each of the three arrays
 */
inline void apply_window(const float *input, const float *window, float *output, size_t size)
{
    // Index of the first element not yet produced by a vector loop.
    size_t done = 0;

#if defined(SIMD_AVX2)
    const size_t lanes = 8;
    const size_t vec_end = size - (size % lanes);

    for (; done < vec_end; done += lanes)
    {
        const __m256 sig = _mm256_loadu_ps(input + done);
        const __m256 coef = _mm256_loadu_ps(window + done);
        _mm256_storeu_ps(output + done, _mm256_mul_ps(sig, coef));
    }
#elif defined(SIMD_SSE2)
    const size_t lanes = 4;
    const size_t vec_end = size - (size % lanes);

    for (; done < vec_end; done += lanes)
    {
        const __m128 sig = _mm_loadu_ps(input + done);
        const __m128 coef = _mm_loadu_ps(window + done);
        _mm_storeu_ps(output + done, _mm_mul_ps(sig, coef));
    }
#elif defined(SIMD_NEON)
    const size_t lanes = 4;
    const size_t vec_end = size - (size % lanes);

    for (; done < vec_end; done += lanes)
    {
        const float32x4_t sig = vld1q_f32(input + done);
        const float32x4_t coef = vld1q_f32(window + done);
        vst1q_f32(output + done, vmulq_f32(sig, coef));
    }
#endif

    // Scalar tail / scalar fallback.
    for (; done < size; ++done)
    {
        output[done] = input[done] * window[done];
    }
}
459
+
460
/**
 * @brief Compute the magnitude of each complex value given as split real/imag arrays.
 *        magnitude[i] = sqrt(real[i]^2 + imag[i]^2)
 *
 * The NEON vector loop is compiled only for AArch64: vsqrtq_f32 is an
 * A64-only intrinsic, so 32-bit ARM builds (where SIMD_NEON can still be
 * defined via __ARM_NEON) use the scalar loop instead of failing to compile.
 *
 * @param real Real components (read-only)
 * @param imag Imaginary components (read-only)
 * @param magnitude Destination for the magnitudes
 * @param size Number of complex values
 */
inline void complex_magnitude(const float *real, const float *imag, float *magnitude, size_t size)
{
    // Index of the first element not yet produced by a vector loop.
    size_t done = 0;

#if defined(SIMD_AVX2)
    const size_t lanes = 8;
    const size_t vec_end = size - (size % lanes);

    for (; done < vec_end; done += lanes)
    {
        const __m256 re = _mm256_loadu_ps(real + done);
        const __m256 im = _mm256_loadu_ps(imag + done);

        // |z|^2 = re^2 + im^2, then take the square root per lane.
        const __m256 mag_sq = _mm256_add_ps(_mm256_mul_ps(re, re), _mm256_mul_ps(im, im));
        _mm256_storeu_ps(magnitude + done, _mm256_sqrt_ps(mag_sq));
    }
#elif defined(SIMD_SSE2)
    const size_t lanes = 4;
    const size_t vec_end = size - (size % lanes);

    for (; done < vec_end; done += lanes)
    {
        const __m128 re = _mm_loadu_ps(real + done);
        const __m128 im = _mm_loadu_ps(imag + done);

        const __m128 mag_sq = _mm_add_ps(_mm_mul_ps(re, re), _mm_mul_ps(im, im));
        _mm_storeu_ps(magnitude + done, _mm_sqrt_ps(mag_sq));
    }
#elif defined(SIMD_NEON) && (defined(__aarch64__) || defined(_M_ARM64))
    const size_t lanes = 4;
    const size_t vec_end = size - (size % lanes);

    for (; done < vec_end; done += lanes)
    {
        const float32x4_t re = vld1q_f32(real + done);
        const float32x4_t im = vld1q_f32(imag + done);

        const float32x4_t mag_sq = vaddq_f32(vmulq_f32(re, re), vmulq_f32(im, im));

        // Per-lane square root; this intrinsic exists only on AArch64.
        vst1q_f32(magnitude + done, vsqrtq_f32(mag_sq));
    }
#endif

    // Scalar tail, and the complete path when no vector loop was compiled in
    // (including 32-bit ARM NEON targets).
    for (; done < size; ++done)
    {
        magnitude[done] = std::sqrt(real[done] * real[done] + imag[done] * imag[done]);
    }
}
551
+
552
/**
 * @brief Compute the squared magnitude (power) of each complex value given
 *        as split real/imag arrays.
 *        power[i] = real[i]^2 + imag[i]^2
 * @param real Real components (read-only)
 * @param imag Imaginary components (read-only)
 * @param power Destination for the power values
 * @param size Number of complex values
 */
inline void complex_power(const float *real, const float *imag, float *power, size_t size)
{
    // Index of the first element not yet produced by a vector loop.
    size_t done = 0;

#if defined(SIMD_AVX2)
    const size_t lanes = 8;
    const size_t vec_end = size - (size % lanes);

    for (; done < vec_end; done += lanes)
    {
        const __m256 re = _mm256_loadu_ps(real + done);
        const __m256 im = _mm256_loadu_ps(imag + done);
        _mm256_storeu_ps(power + done,
                         _mm256_add_ps(_mm256_mul_ps(re, re), _mm256_mul_ps(im, im)));
    }
#elif defined(SIMD_SSE2)
    const size_t lanes = 4;
    const size_t vec_end = size - (size % lanes);

    for (; done < vec_end; done += lanes)
    {
        const __m128 re = _mm_loadu_ps(real + done);
        const __m128 im = _mm_loadu_ps(imag + done);
        _mm_storeu_ps(power + done, _mm_add_ps(_mm_mul_ps(re, re), _mm_mul_ps(im, im)));
    }
#elif defined(SIMD_NEON)
    const size_t lanes = 4;
    const size_t vec_end = size - (size % lanes);

    for (; done < vec_end; done += lanes)
    {
        const float32x4_t re = vld1q_f32(real + done);
        const float32x4_t im = vld1q_f32(imag + done);
        vst1q_f32(power + done, vaddq_f32(vmulq_f32(re, re), vmulq_f32(im, im)));
    }
#endif

    // Scalar tail / scalar fallback.
    for (; done < size; ++done)
    {
        power[done] = real[done] * real[done] + imag[done] * imag[done];
    }
}
635
+
636
/**
 * @brief Dot product of two float arrays (the inner loop of FIR convolution).
 *        result = sum over i in [0, size) of a[i] * b[i]
 *
 * Accumulation is done in single precision on every path.
 *
 * @param a First operand array (read-only)
 * @param b Second operand array (read-only)
 * @param size Number of elements in each array
 * @return The dot product as a float
 */
inline float dot_product(const float *a, const float *b, size_t size)
{
    // `result` starts as the reduced vector accumulator when a SIMD path is
    // compiled in (0 otherwise); `done` marks where the scalar loop resumes.
    float result = 0.0f;
    size_t done = 0;

#if defined(SIMD_AVX2)
    const size_t lanes = 8;
    const size_t vec_end = size - (size % lanes);

    __m256 acc = _mm256_setzero_ps();
    for (; done < vec_end; done += lanes)
    {
        const __m256 va = _mm256_loadu_ps(a + done);
        const __m256 vb = _mm256_loadu_ps(b + done);
        acc = _mm256_add_ps(acc, _mm256_mul_ps(va, vb));
    }

    // Reduce 8 partial sums: add the two 128-bit halves, then two horizontal
    // adds collapse the remaining 4 lanes into lane 0.
    __m128 quad = _mm_add_ps(_mm256_castps256_ps128(acc), _mm256_extractf128_ps(acc, 1));
    quad = _mm_hadd_ps(quad, quad);
    quad = _mm_hadd_ps(quad, quad);
    result = _mm_cvtss_f32(quad);
#elif defined(SIMD_SSE2)
    const size_t lanes = 4;
    const size_t vec_end = size - (size % lanes);

    __m128 acc = _mm_setzero_ps();
    for (; done < vec_end; done += lanes)
    {
        const __m128 va = _mm_loadu_ps(a + done);
        const __m128 vb = _mm_loadu_ps(b + done);
        acc = _mm_add_ps(acc, _mm_mul_ps(va, vb));
    }

    // Reduce 4 partial sums: swap adjacent lanes and add to get pair sums,
    // then bring the upper pair down and add it into lane 0.
    __m128 swapped = _mm_shuffle_ps(acc, acc, _MM_SHUFFLE(2, 3, 0, 1));
    __m128 pairs = _mm_add_ps(acc, swapped);
    swapped = _mm_movehl_ps(swapped, pairs);
    pairs = _mm_add_ss(pairs, swapped);
    result = _mm_cvtss_f32(pairs);
#elif defined(SIMD_NEON)
    const size_t lanes = 4;
    const size_t vec_end = size - (size % lanes);

    float32x4_t acc = vdupq_n_f32(0.0f);
    for (; done < vec_end; done += lanes)
    {
        const float32x4_t va = vld1q_f32(a + done);
        const float32x4_t vb = vld1q_f32(b + done);
        acc = vmlaq_f32(acc, va, vb); // multiply-accumulate: acc += va * vb per lane
    }

    // Reduce 4 partial sums: low half + high half, then pairwise add.
    const float32x2_t half = vadd_f32(vget_low_f32(acc), vget_high_f32(acc));
    result = vget_lane_f32(vpadd_f32(half, half), 0);
#endif

    // Scalar tail / scalar fallback.
    for (; done < size; ++done)
    {
        result += a[done] * b[done];
    }
    return result;
}
744
+
745
/**
 * @brief Element-wise complex multiplication on split real/imag arrays.
 *        out[i] = a[i] * b[i], i.e.
 *        out_real[i] = a_real[i]*b_real[i] - a_imag[i]*b_imag[i]
 *        out_imag[i] = a_real[i]*b_imag[i] + a_imag[i]*b_real[i]
 * @param a_real Real parts of the first operand (read-only)
 * @param a_imag Imaginary parts of the first operand (read-only)
 * @param b_real Real parts of the second operand (read-only)
 * @param b_imag Imaginary parts of the second operand (read-only)
 * @param out_real Destination for the real parts of the products
 * @param out_imag Destination for the imaginary parts of the products
 * @param size Number of complex multiplications to perform
 */
inline void complex_multiply(
    const float *a_real, const float *a_imag,
    const float *b_real, const float *b_imag,
    float *out_real, float *out_imag,
    size_t size)
{
    // Index of the first element not yet produced by a vector loop.
    size_t done = 0;

#if defined(SIMD_AVX2)
    const size_t lanes = 8;
    const size_t vec_end = size - (size % lanes);

    for (; done < vec_end; done += lanes)
    {
        const __m256 xr = _mm256_loadu_ps(a_real + done);
        const __m256 xi = _mm256_loadu_ps(a_imag + done);
        const __m256 yr = _mm256_loadu_ps(b_real + done);
        const __m256 yi = _mm256_loadu_ps(b_imag + done);

        // Both results are computed before either store is issued.
        const __m256 prod_re = _mm256_sub_ps(_mm256_mul_ps(xr, yr), _mm256_mul_ps(xi, yi));
        const __m256 prod_im = _mm256_add_ps(_mm256_mul_ps(xr, yi), _mm256_mul_ps(xi, yr));

        _mm256_storeu_ps(out_real + done, prod_re);
        _mm256_storeu_ps(out_imag + done, prod_im);
    }
#elif defined(SIMD_SSE2)
    const size_t lanes = 4;
    const size_t vec_end = size - (size % lanes);

    for (; done < vec_end; done += lanes)
    {
        const __m128 xr = _mm_loadu_ps(a_real + done);
        const __m128 xi = _mm_loadu_ps(a_imag + done);
        const __m128 yr = _mm_loadu_ps(b_real + done);
        const __m128 yi = _mm_loadu_ps(b_imag + done);

        const __m128 prod_re = _mm_sub_ps(_mm_mul_ps(xr, yr), _mm_mul_ps(xi, yi));
        const __m128 prod_im = _mm_add_ps(_mm_mul_ps(xr, yi), _mm_mul_ps(xi, yr));

        _mm_storeu_ps(out_real + done, prod_re);
        _mm_storeu_ps(out_imag + done, prod_im);
    }
#elif defined(SIMD_NEON)
    const size_t lanes = 4;
    const size_t vec_end = size - (size % lanes);

    for (; done < vec_end; done += lanes)
    {
        const float32x4_t xr = vld1q_f32(a_real + done);
        const float32x4_t xi = vld1q_f32(a_imag + done);
        const float32x4_t yr = vld1q_f32(b_real + done);
        const float32x4_t yi = vld1q_f32(b_imag + done);

        const float32x4_t prod_re = vsubq_f32(vmulq_f32(xr, yr), vmulq_f32(xi, yi));
        const float32x4_t prod_im = vaddq_f32(vmulq_f32(xr, yi), vmulq_f32(xi, yr));

        vst1q_f32(out_real + done, prod_re);
        vst1q_f32(out_imag + done, prod_im);
    }
#endif

    // Scalar tail / scalar fallback. All four inputs are read into locals
    // before the outputs are written.
    for (; done < size; ++done)
    {
        const float xr = a_real[done];
        const float xi = a_imag[done];
        const float yr = b_real[done];
        const float yi = b_imag[done];
        out_real[done] = xr * yr - xi * yi;
        out_imag[done] = xr * yi + xi * yr;
    }
}
869
+
870
+ } // namespace dsp::simd