dspx 0.2.0-alpha.12 → 0.2.0-alpha.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
Binary file
|
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
#include "../utils/ConvolutionPolicy.h"
|
|
6
6
|
#include "../utils/SimdOps.h"
|
|
7
7
|
#include "../core/FftEngine.h"
|
|
8
|
+
#include "../core/FirFilterNeon.h"
|
|
8
9
|
#include <vector>
|
|
9
10
|
#include <string>
|
|
10
11
|
#include <memory>
|
|
@@ -220,7 +221,9 @@ namespace dsp::adapters
|
|
|
220
221
|
// De-interleaved buffers for cache-friendly processing
|
|
221
222
|
std::vector<std::vector<float>> m_deinterleaved_buffers;
|
|
222
223
|
std::vector<float> m_temp_output_channel;
|
|
223
|
-
|
|
224
|
+
|
|
225
|
+
// High-performance FIR filters for batch convolution (one per channel)
|
|
226
|
+
std::vector<std::unique_ptr<core::FirFilterNeon>> m_batch_fir_filters;
|
|
224
227
|
|
|
225
228
|
// Reusable buffer for moving mode direct convolution (eliminates per-sample allocation)
|
|
226
229
|
mutable std::vector<float> m_moving_temp_buffer;
|
|
@@ -390,6 +393,67 @@ namespace dsp::adapters
|
|
|
390
393
|
size_t samplesPerChannel = numSamples / numChannels;
|
|
391
394
|
size_t kernelSize = m_kernel.size();
|
|
392
395
|
|
|
396
|
+
#if defined(__ARM_NEON) || defined(__aarch64__)
|
|
397
|
+
// ARM NEON path: Use FirFilterNeon for optimal performance
|
|
398
|
+
// This eliminates per-sample gather overhead with O(1) circular buffer
|
|
399
|
+
|
|
400
|
+
// (Re)build one FIR filter per channel; this runs only when the channel count changes
|
|
401
|
+
if (m_batch_fir_filters.size() != static_cast<size_t>(numChannels))
|
|
402
|
+
{
|
|
403
|
+
m_batch_fir_filters.clear();
|
|
404
|
+
for (int ch = 0; ch < numChannels; ++ch)
|
|
405
|
+
{
|
|
406
|
+
m_batch_fir_filters.push_back(
|
|
407
|
+
std::make_unique<core::FirFilterNeon>(m_kernel));
|
|
408
|
+
}
|
|
409
|
+
}
|
|
410
|
+
|
|
411
|
+
// Resize de-interleaved buffers if needed
|
|
412
|
+
if (m_deinterleaved_buffers.size() != static_cast<size_t>(numChannels))
|
|
413
|
+
{
|
|
414
|
+
m_deinterleaved_buffers.resize(numChannels);
|
|
415
|
+
for (auto &buf : m_deinterleaved_buffers)
|
|
416
|
+
{
|
|
417
|
+
buf.resize(samplesPerChannel);
|
|
418
|
+
}
|
|
419
|
+
}
|
|
420
|
+
else if (!m_deinterleaved_buffers.empty() &&
|
|
421
|
+
m_deinterleaved_buffers[0].size() != samplesPerChannel)
|
|
422
|
+
{
|
|
423
|
+
for (auto &buf : m_deinterleaved_buffers)
|
|
424
|
+
{
|
|
425
|
+
buf.resize(samplesPerChannel);
|
|
426
|
+
}
|
|
427
|
+
}
|
|
428
|
+
|
|
429
|
+
// Step 1: De-interleave and process each channel with FirFilterNeon
|
|
430
|
+
for (int ch = 0; ch < numChannels; ++ch)
|
|
431
|
+
{
|
|
432
|
+
auto &fir = m_batch_fir_filters[ch];
|
|
433
|
+
auto &output = m_deinterleaved_buffers[ch];
|
|
434
|
+
|
|
435
|
+
// Reset filter state for batch mode (stateless)
|
|
436
|
+
fir->reset();
|
|
437
|
+
|
|
438
|
+
// Extract this channel's samples from the interleaved buffer and store the filtered result in the de-interleaved buffer
|
|
439
|
+
for (size_t i = 0; i < samplesPerChannel; ++i)
|
|
440
|
+
{
|
|
441
|
+
float input = buffer[i * numChannels + ch];
|
|
442
|
+
output[i] = fir->processSample(input);
|
|
443
|
+
}
|
|
444
|
+
}
|
|
445
|
+
|
|
446
|
+
// Step 2: Re-interleave output back to buffer
|
|
447
|
+
for (int ch = 0; ch < numChannels; ++ch)
|
|
448
|
+
{
|
|
449
|
+
const auto &output = m_deinterleaved_buffers[ch];
|
|
450
|
+
for (size_t i = 0; i < samplesPerChannel; ++i)
|
|
451
|
+
{
|
|
452
|
+
buffer[i * numChannels + ch] = output[i];
|
|
453
|
+
}
|
|
454
|
+
}
|
|
455
|
+
#else
|
|
456
|
+
// Non-ARM fallback: Original scalar implementation
|
|
393
457
|
// Resize de-interleaved buffers if channel count changed
|
|
394
458
|
if (m_deinterleaved_buffers.size() != static_cast<size_t>(numChannels))
|
|
395
459
|
{
|
|
@@ -414,12 +478,6 @@ namespace dsp::adapters
|
|
|
414
478
|
m_temp_output_channel.resize(samplesPerChannel);
|
|
415
479
|
}
|
|
416
480
|
|
|
417
|
-
// Resize reversed window buffer for SIMD (only allocate once)
|
|
418
|
-
if (m_reversed_window.size() != kernelSize)
|
|
419
|
-
{
|
|
420
|
-
m_reversed_window.resize(kernelSize);
|
|
421
|
-
}
|
|
422
|
-
|
|
423
481
|
// Step 1: De-interleave input data for cache-friendly processing
|
|
424
482
|
for (int ch = 0; ch < numChannels; ++ch)
|
|
425
483
|
{
|
|
@@ -438,43 +496,16 @@ namespace dsp::adapters
|
|
|
438
496
|
float *output = m_temp_output_channel.data();
|
|
439
497
|
|
|
440
498
|
// Standard convolution: y[n] = sum(h[k] * x[n-k])
|
|
441
|
-
// For small K: simple loop (compiler auto-vectorizes)
|
|
442
|
-
// For large K: use explicit SIMD with window collection
|
|
443
499
|
for (size_t n = 0; n < samplesPerChannel; ++n)
|
|
444
500
|
{
|
|
445
501
|
float sum = 0.0f;
|
|
446
502
|
|
|
447
|
-
|
|
503
|
+
// Simple scalar loop (compiler may auto-vectorize)
|
|
504
|
+
for (size_t k = 0; k < kernelSize; ++k)
|
|
448
505
|
{
|
|
449
|
-
|
|
450
|
-
for (size_t k = 0; k < kernelSize; ++k)
|
|
451
|
-
{
|
|
452
|
-
if (n >= k)
|
|
453
|
-
{
|
|
454
|
-
sum += kernelPtr[k] * channelInput[n - k];
|
|
455
|
-
}
|
|
456
|
-
}
|
|
457
|
-
}
|
|
458
|
-
else
|
|
459
|
-
{
|
|
460
|
-
// Large kernels: collect window and use SIMD dot product
|
|
461
|
-
// This avoids the performance cliff from backward indexing
|
|
462
|
-
if (n >= kernelSize - 1)
|
|
463
|
-
{
|
|
464
|
-
// Full window available - collect in forward order for SIMD
|
|
465
|
-
for (size_t k = 0; k < kernelSize; ++k)
|
|
466
|
-
{
|
|
467
|
-
m_reversed_window[k] = channelInput[n - k];
|
|
468
|
-
}
|
|
469
|
-
sum = simd::dot_product(kernelPtr, m_reversed_window.data(), kernelSize);
|
|
470
|
-
}
|
|
471
|
-
else
|
|
506
|
+
if (n >= k)
|
|
472
507
|
{
|
|
473
|
-
|
|
474
|
-
for (size_t k = 0; k <= n; ++k)
|
|
475
|
-
{
|
|
476
|
-
sum += kernelPtr[k] * channelInput[n - k];
|
|
477
|
-
}
|
|
508
|
+
sum += kernelPtr[k] * channelInput[n - k];
|
|
478
509
|
}
|
|
479
510
|
}
|
|
480
511
|
|
|
@@ -487,6 +518,7 @@ namespace dsp::adapters
|
|
|
487
518
|
buffer[i * numChannels + ch] = output[i];
|
|
488
519
|
}
|
|
489
520
|
}
|
|
521
|
+
#endif
|
|
490
522
|
}
|
|
491
523
|
|
|
492
524
|
/**
|
|
@@ -2,19 +2,24 @@
|
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
4
|
* @file FirFilterNeon.h
|
|
5
|
-
* @brief ARM NEON-optimized FIR filter with
|
|
5
|
+
* @brief ARM NEON-optimized FIR filter with guard-zone circular buffer
|
|
6
6
|
*
|
|
7
|
-
* This implementation
|
|
8
|
-
*
|
|
9
|
-
* with simple shifts, allowing pure NEON vectorization without gather operations.
|
|
7
|
+
* This implementation keeps O(1) state updates while enabling fully contiguous
|
|
8
|
+
* NEON vectorization using a "guard zone" (mirrored buffer) technique.
|
|
10
9
|
*
|
|
11
|
-
*
|
|
10
|
+
* Key insight: Allocate a buffer of size 2*N, where N is the power-of-2 capacity and the second half is a guard zone mirroring the first.
|
|
11
|
+
* When writing sample at index i, also write it at i+N. This ensures that any
|
|
12
|
+
* NEON load of the tap window ending at 'head' can read contiguously without wrap-around logic.
|
|
13
|
+
*
|
|
14
|
+
* Performance: O(1) state update + fully vectorized O(N) convolution.
|
|
15
|
+
* Expected gain vs naive circular buffer: 3-6x for 16-128 tap filters on ARM.
|
|
12
16
|
*/
|
|
13
17
|
|
|
14
18
|
#include <vector>
|
|
15
19
|
#include <cstddef>
|
|
16
20
|
#include <cstring>
|
|
17
21
|
#include <stdexcept>
|
|
22
|
+
#include <algorithm>
|
|
18
23
|
|
|
19
24
|
#if defined(__ARM_NEON) || defined(__aarch64__)
|
|
20
25
|
#include <arm_neon.h>
|
|
@@ -23,40 +28,50 @@
|
|
|
23
28
|
namespace dsp::core
|
|
24
29
|
{
|
|
25
30
|
/**
|
|
26
|
-
* @brief High-performance NEON-optimized FIR filter
|
|
31
|
+
* @brief High-performance NEON-optimized FIR filter using guard-zone circular buffer
|
|
27
32
|
*
|
|
28
|
-
*
|
|
29
|
-
* -
|
|
30
|
-
* -
|
|
31
|
-
* -
|
|
32
|
-
* -
|
|
33
|
+
* Architecture:
|
|
34
|
+
* - Circular buffer with power-of-2 size for bitmask wrapping (O(1) update)
|
|
35
|
+
* - Guard zone (mirrored tail) to make SIMD reads always contiguous
|
|
36
|
+
* - Coefficients stored in reversed order (stored h[0] multiplies the oldest sample)
|
|
37
|
+
* - NEON kernel reads forward through the contiguous tap window ending at 'head' with no modulo in inner loop
|
|
33
38
|
*
|
|
34
|
-
* This
|
|
35
|
-
* 1.
|
|
36
|
-
* 2.
|
|
37
|
-
* 3.
|
|
39
|
+
* This gives best of both worlds:
|
|
40
|
+
* 1. O(1) state updates (increment head, write sample + guard)
|
|
41
|
+
* 2. Fully contiguous NEON loads (no gather/scatter)
|
|
42
|
+
* 3. No memmove/shift overhead (eliminated algorithmic regression)
|
|
38
43
|
*/
|
|
39
44
|
class FirFilterNeon
|
|
40
45
|
{
|
|
41
46
|
public:
|
|
42
47
|
explicit FirFilterNeon(const std::vector<float> &coefficients)
|
|
43
|
-
: m_numTaps(coefficients.size())
|
|
48
|
+
: m_numTaps(coefficients.size()),
|
|
49
|
+
m_head(0)
|
|
44
50
|
{
|
|
45
51
|
if (coefficients.empty())
|
|
46
52
|
{
|
|
47
53
|
throw std::invalid_argument("FIR coefficients cannot be empty");
|
|
48
54
|
}
|
|
49
55
|
|
|
50
|
-
//
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
for (size_t i = 0; i < m_numTaps; ++i)
|
|
56
|
+
// Round up to next power of 2 for bitmask wrapping
|
|
57
|
+
m_bufferSize = 1;
|
|
58
|
+
while (m_bufferSize < m_numTaps)
|
|
54
59
|
{
|
|
55
|
-
|
|
60
|
+
m_bufferSize <<= 1;
|
|
56
61
|
}
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
62
|
+
m_headMask = m_bufferSize - 1;
|
|
63
|
+
|
|
64
|
+
// Reverse coefficients to match memory access pattern:
|
|
65
|
+
// readStart points to oldest sample, we read forward (oldest→newest)
|
|
66
|
+
// So h[0] should multiply oldest sample = original h[numTaps-1]
|
|
67
|
+
// This way: h_rev[0]*x[oldest] + ... + h_rev[N-1]*x[newest]
|
|
68
|
+
// = h[N-1]*x[oldest] + ... + h[0]*x[newest] ✓
|
|
69
|
+
m_coefficients.resize(coefficients.size());
|
|
70
|
+
std::reverse_copy(coefficients.begin(), coefficients.end(), m_coefficients.begin());
|
|
71
|
+
|
|
72
|
+
// Allocate state buffer + guard zone
|
|
73
|
+
// Guard zone mirrors the entire circular buffer for contiguous wraparound reads
|
|
74
|
+
m_state.resize(m_bufferSize * 2, 0.0f);
|
|
60
75
|
}
|
|
61
76
|
|
|
62
77
|
/**
|
|
@@ -87,119 +102,136 @@ namespace dsp::core
|
|
|
87
102
|
}
|
|
88
103
|
|
|
89
104
|
/**
|
|
90
|
-
* @brief Reset filter state (clear
|
|
105
|
+
* @brief Reset filter state (clear circular buffer and guard zone)
|
|
91
106
|
*/
|
|
92
107
|
void reset()
|
|
93
108
|
{
|
|
94
|
-
std::fill(
|
|
109
|
+
std::fill(m_state.begin(), m_state.end(), 0.0f);
|
|
110
|
+
m_head = 0;
|
|
95
111
|
}
|
|
96
112
|
|
|
97
113
|
size_t getNumTaps() const { return m_numTaps; }
|
|
114
|
+
size_t getBufferSize() const { return m_bufferSize; }
|
|
98
115
|
|
|
99
116
|
private:
|
|
100
|
-
size_t m_numTaps;
|
|
101
|
-
|
|
102
|
-
|
|
117
|
+
size_t m_numTaps; // Number of filter taps
|
|
118
|
+
size_t m_bufferSize; // Power-of-2 buffer size (>= m_numTaps)
|
|
119
|
+
size_t m_head; // Current write position
|
|
120
|
+
size_t m_headMask; // Bitmask for wrapping (bufferSize - 1)
|
|
121
|
+
std::vector<float> m_coefficients; // Filter coefficients (reversed order: index 0 pairs with the oldest sample)
|
|
122
|
+
std::vector<float> m_state; // Circular buffer + guard zone
|
|
103
123
|
|
|
104
124
|
#if defined(__ARM_NEON) || defined(__aarch64__)
|
|
105
125
|
/**
|
|
106
|
-
* @brief NEON-optimized sample processing
|
|
126
|
+
* @brief NEON-optimized sample processing with guard-zone circular buffer
|
|
107
127
|
*
|
|
108
|
-
*
|
|
109
|
-
* 1.
|
|
110
|
-
* 2.
|
|
128
|
+
* Algorithm:
|
|
129
|
+
* 1. Write input to state[head] and state[head + bufferSize] (guard mirror)
|
|
130
|
+
* 2. Read N contiguous floats ending at state[head] (or its guard mirror), oldest to newest, using NEON
|
|
131
|
+
* 3. Compute dot product with coefficients (fully vectorized)
|
|
132
|
+
* 4. Head advance uses bitmask wrapping (O(1)); in the code it is performed first, before step 1
|
|
111
133
|
*
|
|
112
|
-
*
|
|
113
|
-
*
|
|
134
|
+
* Key: The guard zone ensures that reads from 'head' are ALWAYS contiguous,
|
|
135
|
+
* even when they logically "wrap around" the circular buffer boundary.
|
|
114
136
|
*/
|
|
115
137
|
float processSampleNeon(float input)
|
|
116
138
|
{
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
139
|
+
// Advance head FIRST (points to oldest sample position)
|
|
140
|
+
m_head = (m_head + 1) & m_headMask;
|
|
141
|
+
|
|
142
|
+
// Write input to current position AND guard zone (O(1) mirroring)
|
|
143
|
+
m_state[m_head] = input;
|
|
144
|
+
// Always mirror to guard zone - this is critical for wraparound reads!
|
|
145
|
+
m_state[m_head + m_bufferSize] = input;
|
|
146
|
+
|
|
147
|
+
// NEON convolution: read samples from oldest to newest
|
|
148
|
+
// Coefficients are stored in REVERSE order, so:
|
|
149
|
+
// h_rev[0]*x[oldest] + h_rev[1]*x[older] + ... + h_rev[N-1]*x[newest]
|
|
150
|
+
// = h[N-1]*x[oldest] + ... + h[0]*x[newest] = correct FIR formula ✓
|
|
151
|
+
// The guard zone ensures contiguous reads even across the wrap boundary
|
|
152
|
+
// Calculate start position: if m_head >= (numTaps-1), read from [m_head - numTaps + 1]
|
|
153
|
+
// Otherwise, read from guard zone: [m_head + bufferSize - numTaps + 1]
|
|
154
|
+
size_t readStart;
|
|
155
|
+
if (m_head >= m_numTaps - 1)
|
|
156
|
+
{
|
|
157
|
+
readStart = m_head - m_numTaps + 1;
|
|
158
|
+
}
|
|
159
|
+
else
|
|
160
|
+
{
|
|
161
|
+
// Wrap using guard zone (no modulo needed!)
|
|
162
|
+
readStart = m_head + m_bufferSize - m_numTaps + 1;
|
|
163
|
+
}
|
|
164
|
+
const float *x = &m_state[readStart];
|
|
165
|
+
const float *h = m_coefficients.data();
|
|
166
|
+
|
|
167
|
+
constexpr size_t simd_width = 4;
|
|
168
|
+
const size_t simd_end = (m_numTaps / simd_width) * simd_width;
|
|
120
169
|
|
|
121
170
|
float32x4_t acc = vdupq_n_f32(0.0f);
|
|
122
171
|
|
|
123
|
-
// Vectorized MAC
|
|
172
|
+
// Vectorized MAC loop (no modulo, no branches!)
|
|
124
173
|
for (size_t i = 0; i < simd_end; i += simd_width)
|
|
125
174
|
{
|
|
126
|
-
float32x4_t c = vld1q_f32(
|
|
127
|
-
float32x4_t d = vld1q_f32(
|
|
175
|
+
float32x4_t c = vld1q_f32(h + i);
|
|
176
|
+
float32x4_t d = vld1q_f32(x + i);
|
|
128
177
|
acc = vmlaq_f32(acc, c, d); // Multiply-accumulate (not necessarily fused; vfmaq_f32 is the fused form)
|
|
129
178
|
}
|
|
130
179
|
|
|
131
|
-
// Horizontal reduction
|
|
180
|
+
// Horizontal reduction
|
|
181
|
+
#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
|
|
182
|
+
// AArch64 with the FP16 vector-arithmetic feature macro: use vaddvq_f32 (the intrinsic itself exists on all AArch64 targets)
|
|
183
|
+
float output = vaddvq_f32(acc);
|
|
184
|
+
#else
|
|
185
|
+
// Fallback for all other ARM targets: manual pairwise addition
|
|
132
186
|
float32x2_t sum_lo = vget_low_f32(acc);
|
|
133
187
|
float32x2_t sum_hi = vget_high_f32(acc);
|
|
134
188
|
float32x2_t sum_pair = vadd_f32(sum_lo, sum_hi);
|
|
135
189
|
float32x2_t sum_final = vpadd_f32(sum_pair, sum_pair);
|
|
136
190
|
float output = vget_lane_f32(sum_final, 0);
|
|
191
|
+
#endif
|
|
137
192
|
|
|
138
|
-
//
|
|
193
|
+
// Scalar tail (remaining 0-3 taps)
|
|
139
194
|
for (size_t i = simd_end; i < m_numTaps; ++i)
|
|
140
195
|
{
|
|
141
|
-
output +=
|
|
142
|
-
}
|
|
143
|
-
|
|
144
|
-
// Update delay line: shift left by 1, insert new sample at end
|
|
145
|
-
// For small taps (<= 64), NEON shift is faster than memmove
|
|
146
|
-
if (m_numTaps <= 64)
|
|
147
|
-
{
|
|
148
|
-
neonShiftLeft(m_delayLine.data(), m_numTaps, input);
|
|
149
|
-
}
|
|
150
|
-
else
|
|
151
|
-
{
|
|
152
|
-
std::memmove(m_delayLine.data(), m_delayLine.data() + 1, (m_numTaps - 1) * sizeof(float));
|
|
153
|
-
m_delayLine[m_numTaps - 1] = input;
|
|
196
|
+
output += h[i] * x[i];
|
|
154
197
|
}
|
|
155
198
|
|
|
156
199
|
return output;
|
|
157
200
|
}
|
|
201
|
+
#endif
|
|
158
202
|
|
|
159
203
|
/**
|
|
160
|
-
* @brief
|
|
161
|
-
* Shifts entire array left by 1 element using vectorized loads/stores
|
|
204
|
+
* @brief Scalar fallback for non-ARM platforms
|
|
162
205
|
*/
|
|
163
|
-
|
|
206
|
+
float processSampleScalar(float input)
|
|
164
207
|
{
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
const size_t simd_end = simd_count * simd_width;
|
|
208
|
+
// Advance head FIRST
|
|
209
|
+
m_head = (m_head + 1) & m_headMask;
|
|
168
210
|
|
|
169
|
-
//
|
|
170
|
-
|
|
211
|
+
// Write to circular buffer + guard
|
|
212
|
+
m_state[m_head] = input;
|
|
213
|
+
// Always mirror to guard zone
|
|
214
|
+
m_state[m_head + m_bufferSize] = input;
|
|
215
|
+
|
|
216
|
+
// Compute output (read forward from oldest to newest; coefficients are stored reversed)
|
|
217
|
+
float output = 0.0f;
|
|
218
|
+
size_t readStart;
|
|
219
|
+
if (m_head >= m_numTaps - 1)
|
|
171
220
|
{
|
|
172
|
-
|
|
173
|
-
vst1q_f32(&data[i], vals);
|
|
221
|
+
readStart = m_head - m_numTaps + 1;
|
|
174
222
|
}
|
|
175
|
-
|
|
176
|
-
// Scalar remainder
|
|
177
|
-
for (size_t i = simd_end; i < size - 1; ++i)
|
|
223
|
+
else
|
|
178
224
|
{
|
|
179
|
-
|
|
225
|
+
readStart = m_head + m_bufferSize - m_numTaps + 1;
|
|
180
226
|
}
|
|
227
|
+
const float *x = &m_state[readStart];
|
|
228
|
+
const float *h = m_coefficients.data();
|
|
181
229
|
|
|
182
|
-
data[size - 1] = newValue;
|
|
183
|
-
}
|
|
184
|
-
#endif
|
|
185
|
-
|
|
186
|
-
/**
|
|
187
|
-
* @brief Scalar fallback for non-ARM platforms
|
|
188
|
-
*/
|
|
189
|
-
float processSampleScalar(float input)
|
|
190
|
-
{
|
|
191
|
-
float output = 0.0f;
|
|
192
|
-
|
|
193
|
-
// Compute output
|
|
194
230
|
for (size_t i = 0; i < m_numTaps; ++i)
|
|
195
231
|
{
|
|
196
|
-
output +=
|
|
232
|
+
output += h[i] * x[i];
|
|
197
233
|
}
|
|
198
234
|
|
|
199
|
-
// Update delay line
|
|
200
|
-
std::memmove(m_delayLine.data(), m_delayLine.data() + 1, (m_numTaps - 1) * sizeof(float));
|
|
201
|
-
m_delayLine[m_numTaps - 1] = input;
|
|
202
|
-
|
|
203
235
|
return output;
|
|
204
236
|
}
|
|
205
237
|
};
|
|
@@ -70,6 +70,27 @@ namespace dsp::core
|
|
|
70
70
|
*/
|
|
71
71
|
T addSample(T newValue) { return m_filter.addSample(newValue); }
|
|
72
72
|
|
|
73
|
+
/**
|
|
74
|
+
* @brief Process array of samples in batch (optimized for throughput).
|
|
75
|
+
*
|
|
76
|
+
* This is significantly faster than calling addSample() in a loop
|
|
77
|
+
* for small-to-medium input sizes, as it:
|
|
78
|
+
* 1. Avoids per-call overhead (JS→Native boundary crossing)
|
|
79
|
+
* 2. Enables better CPU cache utilization
|
|
80
|
+
* 3. Allows compiler to vectorize the loop
|
|
81
|
+
*
|
|
82
|
+
* @param input Input array of samples
|
|
83
|
+
* @param output Output array (same size as input)
|
|
84
|
+
* @param length Number of samples to process
|
|
85
|
+
*/
|
|
86
|
+
void processArray(const T *input, T *output, size_t length)
|
|
87
|
+
{
|
|
88
|
+
for (size_t i = 0; i < length; ++i)
|
|
89
|
+
{
|
|
90
|
+
output[i] = addSample(input[i]);
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
|
|
73
94
|
/**
|
|
74
95
|
* @brief Adds a new sample with timestamp (time-aware mode only).
|
|
75
96
|
* @param newValue The new sample value to add.
|