dspx 0.1.1-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/ci.yml +185 -0
- package/.vscode/c_cpp_properties.json +17 -0
- package/.vscode/settings.json +68 -0
- package/.vscode/tasks.json +28 -0
- package/DISCLAIMER.md +32 -0
- package/LICENSE +21 -0
- package/README.md +1803 -0
- package/ROADMAP.md +192 -0
- package/TECHNICAL_DEBT.md +165 -0
- package/binding.gyp +65 -0
- package/docs/ADVANCED_LOGGER_FEATURES.md +598 -0
- package/docs/AUTHENTICATION_SECURITY.md +396 -0
- package/docs/BACKEND_IMPROVEMENTS.md +399 -0
- package/docs/CHEBYSHEV_BIQUAD_EQ_IMPLEMENTATION.md +405 -0
- package/docs/FFT_IMPLEMENTATION.md +490 -0
- package/docs/FFT_IMPROVEMENTS_SUMMARY.md +387 -0
- package/docs/FFT_USER_GUIDE.md +494 -0
- package/docs/FILTERS_IMPLEMENTATION.md +260 -0
- package/docs/FILTER_API_GUIDE.md +418 -0
- package/docs/FIR_SIMD_OPTIMIZATION.md +175 -0
- package/docs/LOGGER_API_REFERENCE.md +350 -0
- package/docs/NOTCH_FILTER_QUICK_REF.md +121 -0
- package/docs/PHASE2_TESTS_AND_NOTCH_FILTER.md +341 -0
- package/docs/PHASES_5_7_SUMMARY.md +403 -0
- package/docs/PIPELINE_FILTER_INTEGRATION.md +446 -0
- package/docs/SIMD_OPTIMIZATIONS.md +211 -0
- package/docs/TEST_MIGRATION_SUMMARY.md +173 -0
- package/docs/TIMESERIES_IMPLEMENTATION_SUMMARY.md +322 -0
- package/docs/TIMESERIES_QUICK_REF.md +85 -0
- package/docs/advanced.md +559 -0
- package/docs/time-series-guide.md +617 -0
- package/docs/time-series-migration.md +376 -0
- package/jest.config.js +37 -0
- package/package.json +42 -0
- package/prebuilds/linux-x64/dsp-ts-redis.node +0 -0
- package/prebuilds/win32-x64/dsp-ts-redis.node +0 -0
- package/scripts/test.js +24 -0
- package/src/build/dsp-ts-redis.node +0 -0
- package/src/native/DspPipeline.cc +675 -0
- package/src/native/DspPipeline.h +44 -0
- package/src/native/FftBindings.cc +817 -0
- package/src/native/FilterBindings.cc +1001 -0
- package/src/native/IDspStage.h +53 -0
- package/src/native/adapters/InterpolatorStage.h +201 -0
- package/src/native/adapters/MeanAbsoluteValueStage.h +289 -0
- package/src/native/adapters/MovingAverageStage.h +306 -0
- package/src/native/adapters/RectifyStage.h +88 -0
- package/src/native/adapters/ResamplerStage.h +238 -0
- package/src/native/adapters/RmsStage.h +299 -0
- package/src/native/adapters/SscStage.h +121 -0
- package/src/native/adapters/VarianceStage.h +307 -0
- package/src/native/adapters/WampStage.h +114 -0
- package/src/native/adapters/WaveformLengthStage.h +115 -0
- package/src/native/adapters/ZScoreNormalizeStage.h +326 -0
- package/src/native/core/FftEngine.cc +441 -0
- package/src/native/core/FftEngine.h +224 -0
- package/src/native/core/FirFilter.cc +324 -0
- package/src/native/core/FirFilter.h +149 -0
- package/src/native/core/IirFilter.cc +576 -0
- package/src/native/core/IirFilter.h +210 -0
- package/src/native/core/MovingAbsoluteValueFilter.cc +17 -0
- package/src/native/core/MovingAbsoluteValueFilter.h +135 -0
- package/src/native/core/MovingAverageFilter.cc +18 -0
- package/src/native/core/MovingAverageFilter.h +135 -0
- package/src/native/core/MovingFftFilter.cc +291 -0
- package/src/native/core/MovingFftFilter.h +203 -0
- package/src/native/core/MovingVarianceFilter.cc +194 -0
- package/src/native/core/MovingVarianceFilter.h +114 -0
- package/src/native/core/MovingZScoreFilter.cc +215 -0
- package/src/native/core/MovingZScoreFilter.h +113 -0
- package/src/native/core/Policies.h +352 -0
- package/src/native/core/RmsFilter.cc +18 -0
- package/src/native/core/RmsFilter.h +131 -0
- package/src/native/core/SscFilter.cc +16 -0
- package/src/native/core/SscFilter.h +137 -0
- package/src/native/core/WampFilter.cc +16 -0
- package/src/native/core/WampFilter.h +101 -0
- package/src/native/core/WaveformLengthFilter.cc +17 -0
- package/src/native/core/WaveformLengthFilter.h +98 -0
- package/src/native/utils/CircularBufferArray.cc +336 -0
- package/src/native/utils/CircularBufferArray.h +62 -0
- package/src/native/utils/CircularBufferVector.cc +145 -0
- package/src/native/utils/CircularBufferVector.h +45 -0
- package/src/native/utils/NapiUtils.cc +53 -0
- package/src/native/utils/NapiUtils.h +21 -0
- package/src/native/utils/SimdOps.h +870 -0
- package/src/native/utils/SlidingWindowFilter.cc +239 -0
- package/src/native/utils/SlidingWindowFilter.h +159 -0
- package/src/native/utils/TimeSeriesBuffer.cc +205 -0
- package/src/native/utils/TimeSeriesBuffer.h +140 -0
- package/src/ts/CircularLogBuffer.ts +87 -0
- package/src/ts/DriftDetector.ts +331 -0
- package/src/ts/TopicRouter.ts +428 -0
- package/src/ts/__tests__/AdvancedDsp.test.ts +585 -0
- package/src/ts/__tests__/AuthAndEdgeCases.test.ts +241 -0
- package/src/ts/__tests__/Chaining.test.ts +387 -0
- package/src/ts/__tests__/ChebyshevBiquad.test.ts +229 -0
- package/src/ts/__tests__/CircularLogBuffer.test.ts +158 -0
- package/src/ts/__tests__/DriftDetector.test.ts +389 -0
- package/src/ts/__tests__/Fft.test.ts +484 -0
- package/src/ts/__tests__/ListState.test.ts +153 -0
- package/src/ts/__tests__/Logger.test.ts +208 -0
- package/src/ts/__tests__/LoggerAdvanced.test.ts +319 -0
- package/src/ts/__tests__/LoggerMinor.test.ts +247 -0
- package/src/ts/__tests__/MeanAbsoluteValue.test.ts +398 -0
- package/src/ts/__tests__/MovingAverage.test.ts +322 -0
- package/src/ts/__tests__/RMS.test.ts +315 -0
- package/src/ts/__tests__/Rectify.test.ts +272 -0
- package/src/ts/__tests__/Redis.test.ts +456 -0
- package/src/ts/__tests__/SlopeSignChange.test.ts +166 -0
- package/src/ts/__tests__/Tap.test.ts +164 -0
- package/src/ts/__tests__/TimeBasedExpiration.test.ts +124 -0
- package/src/ts/__tests__/TimeBasedRmsAndMav.test.ts +231 -0
- package/src/ts/__tests__/TimeBasedVarianceAndZScore.test.ts +284 -0
- package/src/ts/__tests__/TimeSeries.test.ts +254 -0
- package/src/ts/__tests__/TopicRouter.test.ts +332 -0
- package/src/ts/__tests__/TopicRouterAdvanced.test.ts +483 -0
- package/src/ts/__tests__/TopicRouterPriority.test.ts +487 -0
- package/src/ts/__tests__/Variance.test.ts +509 -0
- package/src/ts/__tests__/WaveformLength.test.ts +147 -0
- package/src/ts/__tests__/WillisonAmplitude.test.ts +197 -0
- package/src/ts/__tests__/ZScoreNormalize.test.ts +459 -0
- package/src/ts/advanced-dsp.ts +566 -0
- package/src/ts/backends.ts +1137 -0
- package/src/ts/bindings.ts +1225 -0
- package/src/ts/easter-egg.ts +42 -0
- package/src/ts/examples/MeanAbsoluteValue/test-state.ts +99 -0
- package/src/ts/examples/MeanAbsoluteValue/test-streaming.ts +269 -0
- package/src/ts/examples/MovingAverage/test-state.ts +85 -0
- package/src/ts/examples/MovingAverage/test-streaming.ts +188 -0
- package/src/ts/examples/RMS/test-state.ts +97 -0
- package/src/ts/examples/RMS/test-streaming.ts +253 -0
- package/src/ts/examples/Rectify/test-state.ts +107 -0
- package/src/ts/examples/Rectify/test-streaming.ts +242 -0
- package/src/ts/examples/Variance/test-state.ts +195 -0
- package/src/ts/examples/Variance/test-streaming.ts +260 -0
- package/src/ts/examples/ZScoreNormalize/test-state.ts +277 -0
- package/src/ts/examples/ZScoreNormalize/test-streaming.ts +306 -0
- package/src/ts/examples/advanced-dsp-examples.ts +397 -0
- package/src/ts/examples/callbacks/advanced-router-features.ts +326 -0
- package/src/ts/examples/callbacks/benchmark-circular-buffer.ts +109 -0
- package/src/ts/examples/callbacks/monitoring-example.ts +265 -0
- package/src/ts/examples/callbacks/pipeline-callbacks-example.ts +137 -0
- package/src/ts/examples/callbacks/pooled-callbacks-example.ts +274 -0
- package/src/ts/examples/callbacks/priority-routing-example.ts +277 -0
- package/src/ts/examples/callbacks/production-topic-router.ts +214 -0
- package/src/ts/examples/callbacks/topic-based-logging.ts +161 -0
- package/src/ts/examples/chaining/test-chaining-redis.ts +113 -0
- package/src/ts/examples/chaining/test-chaining.ts +52 -0
- package/src/ts/examples/emg-features-example.ts +284 -0
- package/src/ts/examples/fft-example.ts +309 -0
- package/src/ts/examples/fft-examples.ts +349 -0
- package/src/ts/examples/filter-examples.ts +320 -0
- package/src/ts/examples/list-state-example.ts +131 -0
- package/src/ts/examples/logger-example.ts +91 -0
- package/src/ts/examples/notch-filter-examples.ts +243 -0
- package/src/ts/examples/phase5/drift-detection-example.ts +290 -0
- package/src/ts/examples/phase6-7/production-observability.ts +476 -0
- package/src/ts/examples/phase6-7/redis-timeseries-integration.ts +446 -0
- package/src/ts/examples/redis/redis-example.ts +202 -0
- package/src/ts/examples/redis-example.ts +202 -0
- package/src/ts/examples/simd-benchmark.ts +126 -0
- package/src/ts/examples/tap-debugging.ts +230 -0
- package/src/ts/examples/timeseries/comparison-example.ts +290 -0
- package/src/ts/examples/timeseries/iot-sensor-example.ts +143 -0
- package/src/ts/examples/timeseries/redis-streaming-example.ts +233 -0
- package/src/ts/examples/waveform-length-example.ts +139 -0
- package/src/ts/fft.ts +722 -0
- package/src/ts/filters.ts +1078 -0
- package/src/ts/index.ts +120 -0
- package/src/ts/types.ts +589 -0
- package/tsconfig.json +15 -0
|
@@ -0,0 +1,870 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* @file SimdOps.h
|
|
5
|
+
* @brief Cross-platform SIMD operations for DSP processing
|
|
6
|
+
*
|
|
7
|
+
* This header provides SIMD-optimized operations with automatic fallback
|
|
8
|
+
* to scalar implementations when SIMD is not available.
|
|
9
|
+
*
|
|
10
|
+
* Supports:
|
|
11
|
+
* - x86/x64: SSE2 (baseline), AVX2 (when available)
|
|
12
|
+
* - ARM: NEON (when available)
|
|
13
|
+
* - Fallback: Scalar operations with compiler auto-vectorization
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
#include <cstddef>
|
|
17
|
+
#include <cmath>
|
|
18
|
+
#include <algorithm>
|
|
19
|
+
|
|
20
|
+
// Platform detection
|
|
21
|
+
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
|
|
22
|
+
#define SIMD_X86
|
|
23
|
+
#if defined(__AVX2__)
|
|
24
|
+
#define SIMD_AVX2
|
|
25
|
+
#include <immintrin.h>
|
|
26
|
+
#elif defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
|
|
27
|
+
#define SIMD_SSE2
|
|
28
|
+
#include <emmintrin.h>
|
|
29
|
+
#endif
|
|
30
|
+
#elif defined(__ARM_NEON) || defined(__aarch64__)
|
|
31
|
+
#define SIMD_NEON
|
|
32
|
+
#include <arm_neon.h>
|
|
33
|
+
#endif
|
|
34
|
+
|
|
35
|
+
namespace dsp::simd
|
|
36
|
+
{
|
|
37
|
+
/**
 * @brief Full-wave rectification: overwrite each sample with its absolute value.
 *
 * The x86 paths clear the sign bit of every lane with a bitwise AND, the NEON
 * path uses vabsq_f32, and any samples left after the last full vector (or
 * the whole buffer when no SIMD path is compiled in) go through std::fabs.
 *
 * @param buffer Samples to rectify, modified in place
 * @param size   Number of floats in @p buffer
 */
inline void abs_inplace(float *buffer, size_t size)
{
    size_t pos = 0;

#if defined(SIMD_AVX2)
    // Eight lanes per step; 0x7FFFFFFF keeps every bit except the sign bit.
    const size_t vec_stop = size - (size % 8);
    const __m256 keep_magnitude = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    while (pos < vec_stop)
    {
        float *chunk = buffer + pos;
        _mm256_storeu_ps(chunk, _mm256_and_ps(_mm256_loadu_ps(chunk), keep_magnitude));
        pos += 8;
    }
#elif defined(SIMD_SSE2)
    // Four lanes per step, same sign-bit masking as the AVX2 path.
    const size_t vec_stop = size - (size % 4);
    const __m128 keep_magnitude = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
    while (pos < vec_stop)
    {
        float *chunk = buffer + pos;
        _mm_storeu_ps(chunk, _mm_and_ps(_mm_loadu_ps(chunk), keep_magnitude));
        pos += 4;
    }
#elif defined(SIMD_NEON)
    // Four lanes per step using the NEON absolute-value intrinsic.
    const size_t vec_stop = size - (size % 4);
    while (pos < vec_stop)
    {
        float *chunk = buffer + pos;
        vst1q_f32(chunk, vabsq_f32(vld1q_f32(chunk)));
        pos += 4;
    }
#endif

    // Scalar tail; also the complete implementation when no SIMD macro is set.
    for (; pos < size; ++pos)
    {
        buffer[pos] = std::fabs(buffer[pos]);
    }
}
|
|
112
|
+
|
|
113
|
+
/**
 * @brief Half-wave rectification: replace each sample x with max(0, x).
 *
 * Full vectors are clamped against a zero vector with the platform's
 * packed-max instruction; remaining samples (or all of them when no SIMD
 * path is compiled in) use std::max(0.0f, x).
 *
 * @param buffer Samples to clamp, modified in place
 * @param size   Number of floats in @p buffer
 */
inline void max_zero_inplace(float *buffer, size_t size)
{
    size_t pos = 0;

#if defined(SIMD_AVX2)
    const size_t vec_stop = size - (size % 8);
    const __m256 floor_vec = _mm256_setzero_ps();
    while (pos < vec_stop)
    {
        float *chunk = buffer + pos;
        _mm256_storeu_ps(chunk, _mm256_max_ps(_mm256_loadu_ps(chunk), floor_vec));
        pos += 8;
    }
#elif defined(SIMD_SSE2)
    const size_t vec_stop = size - (size % 4);
    const __m128 floor_vec = _mm_setzero_ps();
    while (pos < vec_stop)
    {
        float *chunk = buffer + pos;
        _mm_storeu_ps(chunk, _mm_max_ps(_mm_loadu_ps(chunk), floor_vec));
        pos += 4;
    }
#elif defined(SIMD_NEON)
    const size_t vec_stop = size - (size % 4);
    const float32x4_t floor_vec = vdupq_n_f32(0.0f);
    while (pos < vec_stop)
    {
        float *chunk = buffer + pos;
        vst1q_f32(chunk, vmaxq_f32(vld1q_f32(chunk), floor_vec));
        pos += 4;
    }
#endif

    // Scalar tail; argument order (0 first) is kept so ties and unordered
    // comparisons resolve exactly as before.
    for (; pos < size; ++pos)
    {
        buffer[pos] = std::max(0.0f, buffer[pos]);
    }
}
|
|
184
|
+
|
|
185
|
+
/**
 * @brief Add up all elements of a float buffer, accumulating in double.
 *
 * On AVX2 and SSE2 builds each vector of floats is widened to doubles and
 * added into two independent accumulators that are combined at the end;
 * leftover elements are added one by one. Every other build (this function
 * has no NEON branch) uses a scalar Kahan-compensated summation.
 *
 * @param buffer Input samples (read only)
 * @param size   Number of floats in @p buffer
 * @return Sum of the elements as a double; 0.0 when @p size is 0
 */
inline double sum(const float *buffer, size_t size)
{
#if defined(SIMD_AVX2) || defined(SIMD_SSE2)
    size_t pos = 0;

#if defined(SIMD_AVX2)
    // Eight floats per step: lower four and upper four go to separate accumulators.
    const size_t vec_stop = size - (size % 8);
    __m256d lower_acc = _mm256_setzero_pd();
    __m256d upper_acc = _mm256_setzero_pd();
    for (; pos < vec_stop; pos += 8)
    {
        const __m256 chunk = _mm256_loadu_ps(buffer + pos);
        lower_acc = _mm256_add_pd(lower_acc, _mm256_cvtps_pd(_mm256_castps256_ps128(chunk)));
        upper_acc = _mm256_add_pd(upper_acc, _mm256_cvtps_pd(_mm256_extractf128_ps(chunk, 1)));
    }

    // Fold 4 + 4 doubles down to a pair.
    const __m256d merged = _mm256_add_pd(lower_acc, upper_acc);
    const __m128d pair = _mm_add_pd(_mm256_castpd256_pd128(merged), _mm256_extractf128_pd(merged, 1));
#else
    // Four floats per step: lanes 0-1 and lanes 2-3 go to separate accumulators.
    const size_t vec_stop = size - (size % 4);
    __m128d lower_acc = _mm_setzero_pd();
    __m128d upper_acc = _mm_setzero_pd();
    for (; pos < vec_stop; pos += 4)
    {
        const __m128 chunk = _mm_loadu_ps(buffer + pos);
        lower_acc = _mm_add_pd(lower_acc, _mm_cvtps_pd(chunk));
        upper_acc = _mm_add_pd(upper_acc, _mm_cvtps_pd(_mm_movehl_ps(chunk, chunk)));
    }

    const __m128d pair = _mm_add_pd(lower_acc, upper_acc);
#endif

    double lanes[2];
    _mm_storeu_pd(lanes, pair);
    double total = lanes[0] + lanes[1];

    // Elements that did not fill a whole vector.
    for (; pos < size; ++pos)
    {
        total += static_cast<double>(buffer[pos]);
    }
    return total;
#else
    // Kahan summation: `lost` carries the rounding error of the previous add.
    double running = 0.0;
    double lost = 0.0;
    for (size_t pos = 0; pos < size; ++pos)
    {
        const double adjusted = static_cast<double>(buffer[pos]) - lost;
        const double next = running + adjusted;
        lost = (next - running) - adjusted;
        running = next;
    }
    return running;
#endif
}
|
|
283
|
+
|
|
284
|
+
/**
 * @brief Compute the sum of squared elements (e.g. for RMS), in double precision.
 *
 * Every element is widened to double BEFORE it is squared, on all code
 * paths. Squaring in single precision first (as the vector paths previously
 * did) rounds each product to 24 bits and overflows to +inf once |x| exceeds
 * roughly 1.8e19, which made the result depend on whether an element landed
 * in a full vector or in the scalar remainder.
 *
 * On AVX2 and SSE2 builds the squares are added into two independent
 * accumulators that are combined at the end; every other build (this
 * function has no NEON branch) uses scalar Kahan-compensated summation.
 *
 * @param buffer Input samples (read only)
 * @param size   Number of floats in @p buffer
 * @return Sum of x*x over the buffer as a double; 0.0 when @p size is 0
 */
inline double sum_of_squares(const float *buffer, size_t size)
{
#if defined(SIMD_AVX2) || defined(SIMD_SSE2)
    size_t pos = 0;

#if defined(SIMD_AVX2)
    // Eight floats per step, widened to two vectors of four doubles.
    const size_t vec_stop = size - (size % 8);
    __m256d lower_acc = _mm256_setzero_pd();
    __m256d upper_acc = _mm256_setzero_pd();
    for (; pos < vec_stop; pos += 8)
    {
        const __m256 chunk = _mm256_loadu_ps(buffer + pos);
        const __m256d lower = _mm256_cvtps_pd(_mm256_castps256_ps128(chunk));
        const __m256d upper = _mm256_cvtps_pd(_mm256_extractf128_ps(chunk, 1));

        // Square in double so no precision or range is lost.
        lower_acc = _mm256_add_pd(lower_acc, _mm256_mul_pd(lower, lower));
        upper_acc = _mm256_add_pd(upper_acc, _mm256_mul_pd(upper, upper));
    }

    // Fold 4 + 4 doubles down to a pair.
    const __m256d merged = _mm256_add_pd(lower_acc, upper_acc);
    const __m128d pair = _mm_add_pd(_mm256_castpd256_pd128(merged), _mm256_extractf128_pd(merged, 1));
#else
    // Four floats per step, widened to two vectors of two doubles.
    const size_t vec_stop = size - (size % 4);
    __m128d lower_acc = _mm_setzero_pd();
    __m128d upper_acc = _mm_setzero_pd();
    for (; pos < vec_stop; pos += 4)
    {
        const __m128 chunk = _mm_loadu_ps(buffer + pos);
        const __m128d lower = _mm_cvtps_pd(chunk);
        const __m128d upper = _mm_cvtps_pd(_mm_movehl_ps(chunk, chunk));

        // Square in double so no precision or range is lost.
        lower_acc = _mm_add_pd(lower_acc, _mm_mul_pd(lower, lower));
        upper_acc = _mm_add_pd(upper_acc, _mm_mul_pd(upper, upper));
    }

    const __m128d pair = _mm_add_pd(lower_acc, upper_acc);
#endif

    double lanes[2];
    _mm_storeu_pd(lanes, pair);
    double total = lanes[0] + lanes[1];

    // Elements that did not fill a whole vector.
    for (; pos < size; ++pos)
    {
        const double value = static_cast<double>(buffer[pos]);
        total += value * value;
    }
    return total;
#else
    // Kahan summation: `lost` carries the rounding error of the previous add.
    double running = 0.0;
    double lost = 0.0;
    for (size_t pos = 0; pos < size; ++pos)
    {
        const double value = static_cast<double>(buffer[pos]);
        const double adjusted = (value * value) - lost;
        const double next = running + adjusted;
        lost = (next - running) - adjusted;
        running = next;
    }
    return running;
#endif
}
|
|
387
|
+
|
|
388
|
+
/**
 * @brief Multiply a signal by a window, element by element.
 *
 * Computes output[i] = input[i] * window[i] for i in [0, size). Full vectors
 * use the platform's packed multiply with unaligned loads and stores; the
 * remaining elements (or all of them when no SIMD path is compiled in) are
 * multiplied one at a time.
 *
 * @param input  Signal samples (read only)
 * @param window Window coefficients (read only)
 * @param output Destination for the windowed signal
 * @param size   Number of elements processed from each buffer
 */
inline void apply_window(const float *input, const float *window, float *output, size_t size)
{
    size_t pos = 0;

#if defined(SIMD_AVX2)
    const size_t vec_stop = size - (size % 8);
    for (; pos < vec_stop; pos += 8)
    {
        const __m256 signal = _mm256_loadu_ps(input + pos);
        const __m256 taper = _mm256_loadu_ps(window + pos);
        _mm256_storeu_ps(output + pos, _mm256_mul_ps(signal, taper));
    }
#elif defined(SIMD_SSE2)
    const size_t vec_stop = size - (size % 4);
    for (; pos < vec_stop; pos += 4)
    {
        const __m128 signal = _mm_loadu_ps(input + pos);
        const __m128 taper = _mm_loadu_ps(window + pos);
        _mm_storeu_ps(output + pos, _mm_mul_ps(signal, taper));
    }
#elif defined(SIMD_NEON)
    const size_t vec_stop = size - (size % 4);
    for (; pos < vec_stop; pos += 4)
    {
        const float32x4_t signal = vld1q_f32(input + pos);
        const float32x4_t taper = vld1q_f32(window + pos);
        vst1q_f32(output + pos, vmulq_f32(signal, taper));
    }
#endif

    // Scalar tail; also the complete implementation when no SIMD macro is set.
    for (; pos < size; ++pos)
    {
        output[pos] = input[pos] * window[pos];
    }
}
|
|
459
|
+
|
|
460
|
+
/**
 * @brief Compute the magnitude of each complex value given as split arrays.
 *
 * magnitude[i] = sqrt(real[i]^2 + imag[i]^2), evaluated in single precision.
 *
 * Full vectors use the platform's packed multiply/add/sqrt. The NEON vector
 * loop is compiled only for AArch64, because vsqrtq_f32 is not provided for
 * 32-bit ARM NEON targets; those targets (and builds with no SIMD macro set)
 * run the scalar loop over the whole buffer, so the header still compiles
 * wherever SIMD_NEON is defined.
 *
 * @param real      Real components (read only)
 * @param imag      Imaginary components (read only)
 * @param magnitude Destination for the magnitudes
 * @param size      Number of complex values
 */
inline void complex_magnitude(const float *real, const float *imag, float *magnitude, size_t size)
{
    size_t pos = 0;

#if defined(SIMD_AVX2)
    const size_t vec_stop = size - (size % 8);
    for (; pos < vec_stop; pos += 8)
    {
        const __m256 re = _mm256_loadu_ps(real + pos);
        const __m256 im = _mm256_loadu_ps(imag + pos);

        // |z|^2 = re^2 + im^2, then a packed square root.
        const __m256 norm_sq = _mm256_add_ps(_mm256_mul_ps(re, re), _mm256_mul_ps(im, im));
        _mm256_storeu_ps(magnitude + pos, _mm256_sqrt_ps(norm_sq));
    }
#elif defined(SIMD_SSE2)
    const size_t vec_stop = size - (size % 4);
    for (; pos < vec_stop; pos += 4)
    {
        const __m128 re = _mm_loadu_ps(real + pos);
        const __m128 im = _mm_loadu_ps(imag + pos);

        const __m128 norm_sq = _mm_add_ps(_mm_mul_ps(re, re), _mm_mul_ps(im, im));
        _mm_storeu_ps(magnitude + pos, _mm_sqrt_ps(norm_sq));
    }
#elif defined(SIMD_NEON) && defined(__aarch64__)
    // vsqrtq_f32 is a direct vector square root available on AArch64 only.
    const size_t vec_stop = size - (size % 4);
    for (; pos < vec_stop; pos += 4)
    {
        const float32x4_t re = vld1q_f32(real + pos);
        const float32x4_t im = vld1q_f32(imag + pos);

        const float32x4_t norm_sq = vaddq_f32(vmulq_f32(re, re), vmulq_f32(im, im));
        vst1q_f32(magnitude + pos, vsqrtq_f32(norm_sq));
    }
#endif

    // Scalar tail; the whole buffer on 32-bit NEON and non-SIMD builds.
    for (; pos < size; ++pos)
    {
        magnitude[pos] = std::sqrt(real[pos] * real[pos] + imag[pos] * imag[pos]);
    }
}
|
|
551
|
+
|
|
552
|
+
/**
 * @brief Compute the power (squared magnitude) of each complex value given
 *        as split arrays.
 *
 * power[i] = real[i]^2 + imag[i]^2, evaluated in single precision. Full
 * vectors use the platform's packed multiply and add; the remaining elements
 * (or all of them when no SIMD path is compiled in) are handled one by one.
 *
 * @param real  Real components (read only)
 * @param imag  Imaginary components (read only)
 * @param power Destination for the power values
 * @param size  Number of complex values
 */
inline void complex_power(const float *real, const float *imag, float *power, size_t size)
{
    size_t pos = 0;

#if defined(SIMD_AVX2)
    const size_t vec_stop = size - (size % 8);
    for (; pos < vec_stop; pos += 8)
    {
        const __m256 re = _mm256_loadu_ps(real + pos);
        const __m256 im = _mm256_loadu_ps(imag + pos);
        _mm256_storeu_ps(power + pos, _mm256_add_ps(_mm256_mul_ps(re, re), _mm256_mul_ps(im, im)));
    }
#elif defined(SIMD_SSE2)
    const size_t vec_stop = size - (size % 4);
    for (; pos < vec_stop; pos += 4)
    {
        const __m128 re = _mm_loadu_ps(real + pos);
        const __m128 im = _mm_loadu_ps(imag + pos);
        _mm_storeu_ps(power + pos, _mm_add_ps(_mm_mul_ps(re, re), _mm_mul_ps(im, im)));
    }
#elif defined(SIMD_NEON)
    const size_t vec_stop = size - (size % 4);
    for (; pos < vec_stop; pos += 4)
    {
        const float32x4_t re = vld1q_f32(real + pos);
        const float32x4_t im = vld1q_f32(imag + pos);
        vst1q_f32(power + pos, vaddq_f32(vmulq_f32(re, re), vmulq_f32(im, im)));
    }
#endif

    // Scalar tail; also the complete implementation when no SIMD macro is set.
    for (; pos < size; ++pos)
    {
        power[pos] = real[pos] * real[pos] + imag[pos] * imag[pos];
    }
}
|
|
635
|
+
|
|
636
|
+
/**
 * @brief Dot product of two float arrays (the FIR convolution kernel).
 *
 * Returns sum(a[i] * b[i]) for i in [0, size).
 *
 * The bulk of the input is processed in vector-width chunks when one of
 * SIMD_AVX2, SIMD_SSE2 or SIMD_NEON is defined at compile time; whatever
 * does not fill a whole vector (or everything, when no SIMD macro is set)
 * is accumulated one element at a time. The SIMD paths keep one partial
 * sum per lane and combine the lanes at the end, so their rounding can
 * differ slightly from the purely sequential scalar build.
 *
 * @param a    First array, at least @p size elements
 * @param b    Second array, at least @p size elements
 * @param size Number of elements to multiply and accumulate
 * @return The accumulated dot product (0.0f when @p size is 0)
 */
inline float dot_product(const float *a, const float *b, size_t size)
{
    // Defaults describe the pure scalar build: start from zero and let the
    // tail loop below walk the whole array. A SIMD branch overwrites both.
    float total = 0.0f;
    size_t tail_begin = 0;

#if defined(SIMD_AVX2)
    const size_t kLanes = 8;
    tail_begin = size - (size % kLanes);

    __m256 lane_sums = _mm256_setzero_ps();
    for (size_t k = 0; k < tail_begin; k += kLanes)
    {
        const __m256 products = _mm256_mul_ps(_mm256_loadu_ps(a + k),
                                              _mm256_loadu_ps(b + k));
        lane_sums = _mm256_add_ps(lane_sums, products);
    }

    // Fold the 8 partial sums: add the upper 128-bit half onto the lower
    // half, then two horizontal adds leave the grand total in lane 0.
    __m128 folded = _mm_add_ps(_mm256_castps256_ps128(lane_sums),
                               _mm256_extractf128_ps(lane_sums, 1));
    folded = _mm_hadd_ps(folded, folded);
    folded = _mm_hadd_ps(folded, folded);
    total = _mm_cvtss_f32(folded);

#elif defined(SIMD_SSE2)
    const size_t kLanes = 4;
    tail_begin = size - (size % kLanes);

    __m128 lane_sums = _mm_setzero_ps();
    for (size_t k = 0; k < tail_begin; k += kLanes)
    {
        const __m128 products = _mm_mul_ps(_mm_loadu_ps(a + k),
                                           _mm_loadu_ps(b + k));
        lane_sums = _mm_add_ps(lane_sums, products);
    }

    // Fold the 4 partial sums without SSE3: swap neighbours and add to get
    // the two pair sums, then bring the upper pair down and add it to lane 0.
    const __m128 swapped = _mm_shuffle_ps(lane_sums, lane_sums, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128 pair_sums = _mm_add_ps(lane_sums, swapped);
    const __m128 upper_pair = _mm_movehl_ps(swapped, pair_sums);
    total = _mm_cvtss_f32(_mm_add_ss(pair_sums, upper_pair));

#elif defined(SIMD_NEON)
    const size_t kLanes = 4;
    tail_begin = size - (size % kLanes);

    float32x4_t lane_sums = vdupq_n_f32(0.0f);
    for (size_t k = 0; k < tail_begin; k += kLanes)
    {
        // Per-lane multiply-accumulate: lane_sums += a[k..k+3] * b[k..k+3]
        lane_sums = vmlaq_f32(lane_sums, vld1q_f32(a + k), vld1q_f32(b + k));
    }

    // Fold the 4 partial sums: low half + high half, then a pairwise add.
    const float32x2_t halves = vadd_f32(vget_low_f32(lane_sums), vget_high_f32(lane_sums));
    total = vget_lane_f32(vpadd_f32(halves, halves), 0);
#endif

    // Scalar tail: the elements left over after the last full vector, or
    // the entire input when no SIMD path is compiled in.
    for (size_t k = tail_begin; k < size; ++k)
    {
        total += a[k] * b[k];
    }

    return total;
}
|
|
744
|
+
|
|
745
|
+
/**
 * @brief Element-wise complex multiplication on split real/imaginary arrays
 *        (e.g. for FFT butterflies).
 *
 * For every i in [0, size):
 *   out_real[i] = a_real[i] * b_real[i] - a_imag[i] * b_imag[i]
 *   out_imag[i] = a_real[i] * b_imag[i] + a_imag[i] * b_real[i]
 *
 * Full vector-width chunks are handled with AVX2, SSE2 or NEON when the
 * matching SIMD_* macro is defined at compile time; the remaining elements
 * (or all of them in a build without SIMD) go through a scalar loop.
 *
 * Each step reads all four of its operands before it writes anything, so
 * passing an input array as the corresponding output (exact in-place use)
 * works; partially overlapping arrays are not catered for.
 *
 * @param a_real   Real parts of the first operand
 * @param a_imag   Imaginary parts of the first operand
 * @param b_real   Real parts of the second operand
 * @param b_imag   Imaginary parts of the second operand
 * @param out_real Receives the real parts of the products
 * @param out_imag Receives the imaginary parts of the products
 * @param size     Number of complex values to multiply
 */
inline void complex_multiply(
    const float *a_real, const float *a_imag,
    const float *b_real, const float *b_imag,
    float *out_real, float *out_imag,
    size_t size)
{
    // First index the scalar loop has to handle. Stays 0 in a build without
    // SIMD; a SIMD branch moves it past the chunks it has already written.
    size_t tail_begin = 0;

#if defined(SIMD_AVX2)
    const size_t kLanes = 8;
    tail_begin = size - (size % kLanes);

    for (size_t k = 0; k < tail_begin; k += kLanes)
    {
        const __m256 xr = _mm256_loadu_ps(a_real + k);
        const __m256 xi = _mm256_loadu_ps(a_imag + k);
        const __m256 yr = _mm256_loadu_ps(b_real + k);
        const __m256 yi = _mm256_loadu_ps(b_imag + k);

        // (xr + j*xi) * (yr + j*yi) = (xr*yr - xi*yi) + j*(xr*yi + xi*yr)
        const __m256 re = _mm256_sub_ps(_mm256_mul_ps(xr, yr), _mm256_mul_ps(xi, yi));
        const __m256 im = _mm256_add_ps(_mm256_mul_ps(xr, yi), _mm256_mul_ps(xi, yr));

        _mm256_storeu_ps(out_real + k, re);
        _mm256_storeu_ps(out_imag + k, im);
    }

#elif defined(SIMD_SSE2)
    const size_t kLanes = 4;
    tail_begin = size - (size % kLanes);

    for (size_t k = 0; k < tail_begin; k += kLanes)
    {
        const __m128 xr = _mm_loadu_ps(a_real + k);
        const __m128 xi = _mm_loadu_ps(a_imag + k);
        const __m128 yr = _mm_loadu_ps(b_real + k);
        const __m128 yi = _mm_loadu_ps(b_imag + k);

        // (xr + j*xi) * (yr + j*yi) = (xr*yr - xi*yi) + j*(xr*yi + xi*yr)
        const __m128 re = _mm_sub_ps(_mm_mul_ps(xr, yr), _mm_mul_ps(xi, yi));
        const __m128 im = _mm_add_ps(_mm_mul_ps(xr, yi), _mm_mul_ps(xi, yr));

        _mm_storeu_ps(out_real + k, re);
        _mm_storeu_ps(out_imag + k, im);
    }

#elif defined(SIMD_NEON)
    const size_t kLanes = 4;
    tail_begin = size - (size % kLanes);

    for (size_t k = 0; k < tail_begin; k += kLanes)
    {
        const float32x4_t xr = vld1q_f32(a_real + k);
        const float32x4_t xi = vld1q_f32(a_imag + k);
        const float32x4_t yr = vld1q_f32(b_real + k);
        const float32x4_t yi = vld1q_f32(b_imag + k);

        // (xr + j*xi) * (yr + j*yi) = (xr*yr - xi*yi) + j*(xr*yi + xi*yr)
        const float32x4_t re = vsubq_f32(vmulq_f32(xr, yr), vmulq_f32(xi, yi));
        const float32x4_t im = vaddq_f32(vmulq_f32(xr, yi), vmulq_f32(xi, yr));

        vst1q_f32(out_real + k, re);
        vst1q_f32(out_imag + k, im);
    }
#endif

    // Scalar tail. Operands are copied into locals before either output is
    // written so that in-place use does not corrupt the second result.
    for (size_t k = tail_begin; k < size; ++k)
    {
        const float xr = a_real[k];
        const float xi = a_imag[k];
        const float yr = b_real[k];
        const float yi = b_imag[k];

        out_real[k] = xr * yr - xi * yi;
        out_imag[k] = xr * yi + xi * yr;
    }
}
|
|
869
|
+
|
|
870
|
+
} // namespace dsp::simd
|