dspx 0.2.0-alpha.12 → 0.2.0-alpha.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "dspx",
3
- "version": "0.2.0-alpha.12",
3
+ "version": "0.2.0-alpha.14",
4
4
  "description": "High-performance DSP library with native C++ acceleration and Redis state persistence",
5
5
  "main": "./dist/index.js",
6
6
  "types": "./dist/index.d.ts",
Binary file
@@ -5,6 +5,7 @@
5
5
  #include "../utils/ConvolutionPolicy.h"
6
6
  #include "../utils/SimdOps.h"
7
7
  #include "../core/FftEngine.h"
8
+ #include "../core/FirFilterNeon.h"
8
9
  #include <vector>
9
10
  #include <string>
10
11
  #include <memory>
@@ -220,7 +221,9 @@ namespace dsp::adapters
220
221
  // De-interleaved buffers for cache-friendly processing
221
222
  std::vector<std::vector<float>> m_deinterleaved_buffers;
222
223
  std::vector<float> m_temp_output_channel;
223
- std::vector<float> m_reversed_window; // Reversed window for SIMD convolution
224
+
225
+ // High-performance FIR filters for batch convolution (one per channel)
226
+ std::vector<std::unique_ptr<core::FirFilterNeon>> m_batch_fir_filters;
224
227
 
225
228
  // Reusable buffer for moving mode direct convolution (eliminates per-sample allocation)
226
229
  mutable std::vector<float> m_moving_temp_buffer;
@@ -390,6 +393,67 @@ namespace dsp::adapters
390
393
  size_t samplesPerChannel = numSamples / numChannels;
391
394
  size_t kernelSize = m_kernel.size();
392
395
 
396
+ #if defined(__ARM_NEON) || defined(__aarch64__)
397
+ // ARM NEON path: Use FirFilterNeon for optimal performance
398
+ // This eliminates per-sample gather overhead with O(1) circular buffer
399
+
400
+ // Initialize FIR filters for each channel (only once)
401
+ if (m_batch_fir_filters.size() != static_cast<size_t>(numChannels))
402
+ {
403
+ m_batch_fir_filters.clear();
404
+ for (int ch = 0; ch < numChannels; ++ch)
405
+ {
406
+ m_batch_fir_filters.push_back(
407
+ std::make_unique<core::FirFilterNeon>(m_kernel));
408
+ }
409
+ }
410
+
411
+ // Resize de-interleaved buffers if needed
412
+ if (m_deinterleaved_buffers.size() != static_cast<size_t>(numChannels))
413
+ {
414
+ m_deinterleaved_buffers.resize(numChannels);
415
+ for (auto &buf : m_deinterleaved_buffers)
416
+ {
417
+ buf.resize(samplesPerChannel);
418
+ }
419
+ }
420
+ else if (!m_deinterleaved_buffers.empty() &&
421
+ m_deinterleaved_buffers[0].size() != samplesPerChannel)
422
+ {
423
+ for (auto &buf : m_deinterleaved_buffers)
424
+ {
425
+ buf.resize(samplesPerChannel);
426
+ }
427
+ }
428
+
429
+ // Step 1: De-interleave and process each channel with FirFilterNeon
430
+ for (int ch = 0; ch < numChannels; ++ch)
431
+ {
432
+ auto &fir = m_batch_fir_filters[ch];
433
+ auto &output = m_deinterleaved_buffers[ch];
434
+
435
+ // Reset filter state for batch mode (stateless)
436
+ fir->reset();
437
+
438
+ // Extract channel samples and filter them into the de-interleaved buffer
439
+ for (size_t i = 0; i < samplesPerChannel; ++i)
440
+ {
441
+ float input = buffer[i * numChannels + ch];
442
+ output[i] = fir->processSample(input);
443
+ }
444
+ }
445
+
446
+ // Step 2: Re-interleave output back to buffer
447
+ for (int ch = 0; ch < numChannels; ++ch)
448
+ {
449
+ const auto &output = m_deinterleaved_buffers[ch];
450
+ for (size_t i = 0; i < samplesPerChannel; ++i)
451
+ {
452
+ buffer[i * numChannels + ch] = output[i];
453
+ }
454
+ }
455
+ #else
456
+ // Non-ARM fallback: Original scalar implementation
393
457
  // Resize de-interleaved buffers if channel count changed
394
458
  if (m_deinterleaved_buffers.size() != static_cast<size_t>(numChannels))
395
459
  {
@@ -414,12 +478,6 @@ namespace dsp::adapters
414
478
  m_temp_output_channel.resize(samplesPerChannel);
415
479
  }
416
480
 
417
- // Resize reversed window buffer for SIMD (only allocate once)
418
- if (m_reversed_window.size() != kernelSize)
419
- {
420
- m_reversed_window.resize(kernelSize);
421
- }
422
-
423
481
  // Step 1: De-interleave input data for cache-friendly processing
424
482
  for (int ch = 0; ch < numChannels; ++ch)
425
483
  {
@@ -438,43 +496,16 @@ namespace dsp::adapters
438
496
  float *output = m_temp_output_channel.data();
439
497
 
440
498
  // Standard convolution: y[n] = sum(h[k] * x[n-k])
441
- // For small K: simple loop (compiler auto-vectorizes)
442
- // For large K: use explicit SIMD with window collection
443
499
  for (size_t n = 0; n < samplesPerChannel; ++n)
444
500
  {
445
501
  float sum = 0.0f;
446
502
 
447
- if (kernelSize <= 16)
503
+ // Simple scalar loop (compiler may auto-vectorize)
504
+ for (size_t k = 0; k < kernelSize; ++k)
448
505
  {
449
- // Small kernels: tight loop, compiler auto-vectorizes
450
- for (size_t k = 0; k < kernelSize; ++k)
451
- {
452
- if (n >= k)
453
- {
454
- sum += kernelPtr[k] * channelInput[n - k];
455
- }
456
- }
457
- }
458
- else
459
- {
460
- // Large kernels: collect window and use SIMD dot product
461
- // This avoids the performance cliff from backward indexing
462
- if (n >= kernelSize - 1)
463
- {
464
- // Full window available - collect in forward order for SIMD
465
- for (size_t k = 0; k < kernelSize; ++k)
466
- {
467
- m_reversed_window[k] = channelInput[n - k];
468
- }
469
- sum = simd::dot_product(kernelPtr, m_reversed_window.data(), kernelSize);
470
- }
471
- else
506
+ if (n >= k)
472
507
  {
473
- // Partial window at the start
474
- for (size_t k = 0; k <= n; ++k)
475
- {
476
- sum += kernelPtr[k] * channelInput[n - k];
477
- }
508
+ sum += kernelPtr[k] * channelInput[n - k];
478
509
  }
479
510
  }
480
511
 
@@ -487,6 +518,7 @@ namespace dsp::adapters
487
518
  buffer[i * numChannels + ch] = output[i];
488
519
  }
489
520
  }
521
+ #endif
490
522
  }
491
523
 
492
524
  /**
@@ -2,19 +2,24 @@
2
2
 
3
3
  /**
4
4
  * @file FirFilterNeon.h
5
- * @brief ARM NEON-optimized FIR filter with transposed direct-form II structure
5
+ * @brief ARM NEON-optimized FIR filter with guard-zone circular buffer
6
6
  *
7
- * This implementation eliminates circular buffer overhead by using a transposed
8
- * (direct-form II) structure where the delay line is updated once per output sample
9
- * with simple shifts, allowing pure NEON vectorization without gather operations.
7
+ * This implementation keeps O(1) state updates while enabling fully contiguous
8
+ * NEON vectorization using a "guard zone" (mirrored buffer) technique.
10
9
  *
11
- * Expected performance gain vs circular buffer: 3-6x for 16-128 tap filters on ARM.
10
+ * Key insight: Allocate a buffer of size 2*B, where B is the tap count rounded up to a power of two.
11
+ * When writing sample at index i, also write it at i+B (the guard mirror). This ensures that the
12
+ * N most recent samples ending at 'head' can be read contiguously without wrap-around logic.
13
+ *
14
+ * Performance: O(1) state update + fully vectorized O(N) convolution.
15
+ * Expected gain vs naive circular buffer: 3-6x for 16-128 tap filters on ARM.
12
16
  */
13
17
 
14
18
  #include <vector>
15
19
  #include <cstddef>
16
20
  #include <cstring>
17
21
  #include <stdexcept>
22
+ #include <algorithm>
18
23
 
19
24
  #if defined(__ARM_NEON) || defined(__aarch64__)
20
25
  #include <arm_neon.h>
@@ -23,40 +28,50 @@
23
28
  namespace dsp::core
24
29
  {
25
30
  /**
26
- * @brief High-performance NEON-optimized FIR filter (ARM only)
31
+ * @brief High-performance NEON-optimized FIR filter using guard-zone circular buffer
27
32
  *
28
- * Uses transposed direct-form structure:
29
- * - Delay line stored in linear buffer (NO circular indexing)
30
- * - Coefficients reversed once during construction
31
- * - Inner loop is pure NEON FMA: acc = vmlaq_f32(acc, coeff, delay)
32
- * - Per-sample update is simple memmove/NEON shift
33
+ * Architecture:
34
+ * - Circular buffer with power-of-2 size for bitmask wrapping (O(1) update)
35
+ * - Guard zone (mirrored tail) to make SIMD reads always contiguous
36
+ * - Coefficients stored in reverse order (index 0 = h[N-1], applied to the oldest sample)
37
+ * - NEON kernel reads forward from the oldest sample up to 'head' with no modulo in inner loop
33
38
  *
34
- * This architecture allows the CPU to:
35
- * 1. Stream memory linearly (no gather/scatter)
36
- * 2. Use full NEON FMA pipeline (1 cycle per 4 MACs)
37
- * 3. Prefetch ahead automatically (predictable access pattern)
39
+ * This gives best of both worlds:
40
+ * 1. O(1) state updates (increment head, write sample + guard)
41
+ * 2. Fully contiguous NEON loads (no gather/scatter)
42
+ * 3. No memmove/shift overhead (eliminated algorithmic regression)
38
43
  */
39
44
  class FirFilterNeon
40
45
  {
41
46
  public:
42
47
  explicit FirFilterNeon(const std::vector<float> &coefficients)
43
- : m_numTaps(coefficients.size())
48
+ : m_numTaps(coefficients.size()),
49
+ m_head(0)
44
50
  {
45
51
  if (coefficients.empty())
46
52
  {
47
53
  throw std::invalid_argument("FIR coefficients cannot be empty");
48
54
  }
49
55
 
50
- // Store coefficients in REVERSE order for direct convolution
51
- // (eliminates index arithmetic in inner loop)
52
- m_coefficients.resize(m_numTaps);
53
- for (size_t i = 0; i < m_numTaps; ++i)
56
+ // Round up to next power of 2 for bitmask wrapping
57
+ m_bufferSize = 1;
58
+ while (m_bufferSize < m_numTaps)
54
59
  {
55
- m_coefficients[i] = coefficients[m_numTaps - 1 - i];
60
+ m_bufferSize <<= 1;
56
61
  }
57
-
58
- // Allocate delay line (zero-initialized)
59
- m_delayLine.resize(m_numTaps, 0.0f);
62
+ m_headMask = m_bufferSize - 1;
63
+
64
+ // Reverse coefficients to match memory access pattern:
65
+ // readStart points to oldest sample, we read forward (oldest→newest)
66
+ // So h[0] should multiply oldest sample = original h[numTaps-1]
67
+ // This way: h_rev[0]*x[oldest] + ... + h_rev[N-1]*x[newest]
68
+ // = h[N-1]*x[oldest] + ... + h[0]*x[newest] ✓
69
+ m_coefficients.resize(coefficients.size());
70
+ std::reverse_copy(coefficients.begin(), coefficients.end(), m_coefficients.begin());
71
+
72
+ // Allocate state buffer + guard zone
73
+ // Guard zone mirrors the entire circular buffer for contiguous wraparound reads
74
+ m_state.resize(m_bufferSize * 2, 0.0f);
60
75
  }
61
76
 
62
77
  /**
@@ -87,119 +102,136 @@ namespace dsp::core
87
102
  }
88
103
 
89
104
  /**
90
- * @brief Reset filter state (clear delay line)
105
+ * @brief Reset filter state (clear circular buffer and guard zone)
91
106
  */
92
107
  void reset()
93
108
  {
94
- std::fill(m_delayLine.begin(), m_delayLine.end(), 0.0f);
109
+ std::fill(m_state.begin(), m_state.end(), 0.0f);
110
+ m_head = 0;
95
111
  }
96
112
 
97
113
  size_t getNumTaps() const { return m_numTaps; }
114
+ size_t getBufferSize() const { return m_bufferSize; }
98
115
 
99
116
  private:
100
- size_t m_numTaps;
101
- std::vector<float> m_coefficients; // Reversed for direct convolution
102
- std::vector<float> m_delayLine; // Linear buffer (NO circular indexing!)
117
+ size_t m_numTaps; // Number of filter taps
118
+ size_t m_bufferSize; // Power-of-2 buffer size (>= m_numTaps)
119
+ size_t m_head; // Current write position
120
+ size_t m_headMask; // Bitmask for wrapping (bufferSize - 1)
121
+ std::vector<float> m_coefficients; // Filter coefficients (reversed: index 0 multiplies the oldest sample)
122
+ std::vector<float> m_state; // Circular buffer + guard zone
103
123
 
104
124
  #if defined(__ARM_NEON) || defined(__aarch64__)
105
125
  /**
106
- * @brief NEON-optimized sample processing
126
+ * @brief NEON-optimized sample processing with guard-zone circular buffer
107
127
  *
108
- * Transposed Direct-Form II FIR:
109
- * 1. Compute output: y[n] = sum(c[i] * d[i]) using NEON FMA
110
- * 2. Update delay line: shift left, insert new sample at end
128
+ * Algorithm:
129
+ * 1. Advance head with bitmask wrapping (O(1))
130
+ * 2. Write input to state[head] and state[head + bufferSize] (guard mirror)
131
+ * 3. Read N contiguous floats ending at the newest sample (oldest → newest) using NEON
132
+ * 4. Compute dot product with the reversed coefficients (fully vectorized)
111
133
  *
112
- * This is THE key optimization: delay line is contiguous, so NEON
113
- * can stream loads/stores without address computation.
134
+ * Key: The guard zone ensures that the N-sample window ending at 'head' is ALWAYS contiguous,
135
+ * even when it logically "wraps around" the circular buffer boundary.
114
136
  */
115
137
  float processSampleNeon(float input)
116
138
  {
117
- const size_t simd_width = 4;
118
- const size_t simd_count = m_numTaps / simd_width;
119
- const size_t simd_end = simd_count * simd_width;
139
+ // Advance head FIRST (points to oldest sample position)
140
+ m_head = (m_head + 1) & m_headMask;
141
+
142
+ // Write input to current position AND guard zone (O(1) mirroring)
143
+ m_state[m_head] = input;
144
+ // Always mirror to guard zone - this is critical for wraparound reads!
145
+ m_state[m_head + m_bufferSize] = input;
146
+
147
+ // NEON convolution: read samples from oldest to newest
148
+ // Coefficients are stored in REVERSE order, so:
149
+ // h_rev[0]*x[oldest] + h_rev[1]*x[older] + ... + h_rev[N-1]*x[newest]
150
+ // = h[N-1]*x[oldest] + ... + h[0]*x[newest] = correct FIR formula ✓
151
+ // The guard zone ensures contiguous reads even across the wrap boundary
152
+ // Calculate start position: if m_head >= (numTaps-1), read from [m_head - numTaps + 1]
153
+ // Otherwise, read from guard zone: [m_head + bufferSize - numTaps + 1]
154
+ size_t readStart;
155
+ if (m_head >= m_numTaps - 1)
156
+ {
157
+ readStart = m_head - m_numTaps + 1;
158
+ }
159
+ else
160
+ {
161
+ // Wrap using guard zone (no modulo needed!)
162
+ readStart = m_head + m_bufferSize - m_numTaps + 1;
163
+ }
164
+ const float *x = &m_state[readStart];
165
+ const float *h = m_coefficients.data();
166
+
167
+ constexpr size_t simd_width = 4;
168
+ const size_t simd_end = (m_numTaps / simd_width) * simd_width;
120
169
 
121
170
  float32x4_t acc = vdupq_n_f32(0.0f);
122
171
 
123
- // Vectorized MAC: acc += coeff[i] * delay[i]
172
+ // Vectorized MAC loop (no modulo, no branches!)
124
173
  for (size_t i = 0; i < simd_end; i += simd_width)
125
174
  {
126
- float32x4_t c = vld1q_f32(&m_coefficients[i]);
127
- float32x4_t d = vld1q_f32(&m_delayLine[i]);
175
+ float32x4_t c = vld1q_f32(h + i);
176
+ float32x4_t d = vld1q_f32(x + i);
128
177
  acc = vmlaq_f32(acc, c, d); // Multiply-accumulate (vmlaq_f32 is not guaranteed fused; vfmaq_f32 is the fused form)
129
178
  }
130
179
 
131
- // Horizontal reduction (sum 4 lanes)
180
+ // Horizontal reduction
181
+ #if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
182
+ // AArch64 only: vaddvq_f32 is available on every AArch64 target (the FP16 macro above is a stricter guard than needed)
183
+ float output = vaddvq_f32(acc);
184
+ #else
185
+ // Fallback (AArch32, or guard above not satisfied): manual pairwise addition
132
186
  float32x2_t sum_lo = vget_low_f32(acc);
133
187
  float32x2_t sum_hi = vget_high_f32(acc);
134
188
  float32x2_t sum_pair = vadd_f32(sum_lo, sum_hi);
135
189
  float32x2_t sum_final = vpadd_f32(sum_pair, sum_pair);
136
190
  float output = vget_lane_f32(sum_final, 0);
191
+ #endif
137
192
 
138
- // Handle remainder (scalar)
193
+ // Scalar tail (remaining 0-3 taps)
139
194
  for (size_t i = simd_end; i < m_numTaps; ++i)
140
195
  {
141
- output += m_coefficients[i] * m_delayLine[i];
142
- }
143
-
144
- // Update delay line: shift left by 1, insert new sample at end
145
- // For small taps (<= 64), NEON shift is faster than memmove
146
- if (m_numTaps <= 64)
147
- {
148
- neonShiftLeft(m_delayLine.data(), m_numTaps, input);
149
- }
150
- else
151
- {
152
- std::memmove(m_delayLine.data(), m_delayLine.data() + 1, (m_numTaps - 1) * sizeof(float));
153
- m_delayLine[m_numTaps - 1] = input;
196
+ output += h[i] * x[i];
154
197
  }
155
198
 
156
199
  return output;
157
200
  }
201
+ #endif
158
202
 
159
203
  /**
160
- * @brief NEON-accelerated delay line shift
161
- * Shifts entire array left by 1 element using vectorized loads/stores
204
+ * @brief Scalar fallback for non-ARM platforms
162
205
  */
163
- static void neonShiftLeft(float *data, size_t size, float newValue)
206
+ float processSampleScalar(float input)
164
207
  {
165
- const size_t simd_width = 4;
166
- const size_t simd_count = (size - 1) / simd_width;
167
- const size_t simd_end = simd_count * simd_width;
208
+ // Advance head FIRST
209
+ m_head = (m_head + 1) & m_headMask;
168
210
 
169
- // Vectorized shift: data[i] = data[i+1]
170
- for (size_t i = 0; i < simd_end; i += simd_width)
211
+ // Write to circular buffer + guard
212
+ m_state[m_head] = input;
213
+ // Always mirror to guard zone
214
+ m_state[m_head + m_bufferSize] = input;
215
+
216
+ // Compute output (read forward from oldest to newest; coefficients are stored reversed)
217
+ float output = 0.0f;
218
+ size_t readStart;
219
+ if (m_head >= m_numTaps - 1)
171
220
  {
172
- float32x4_t vals = vld1q_f32(&data[i + 1]);
173
- vst1q_f32(&data[i], vals);
221
+ readStart = m_head - m_numTaps + 1;
174
222
  }
175
-
176
- // Scalar remainder
177
- for (size_t i = simd_end; i < size - 1; ++i)
223
+ else
178
224
  {
179
- data[i] = data[i + 1];
225
+ readStart = m_head + m_bufferSize - m_numTaps + 1;
180
226
  }
227
+ const float *x = &m_state[readStart];
228
+ const float *h = m_coefficients.data();
181
229
 
182
- data[size - 1] = newValue;
183
- }
184
- #endif
185
-
186
- /**
187
- * @brief Scalar fallback for non-ARM platforms
188
- */
189
- float processSampleScalar(float input)
190
- {
191
- float output = 0.0f;
192
-
193
- // Compute output
194
230
  for (size_t i = 0; i < m_numTaps; ++i)
195
231
  {
196
- output += m_coefficients[i] * m_delayLine[i];
232
+ output += h[i] * x[i];
197
233
  }
198
234
 
199
- // Update delay line
200
- std::memmove(m_delayLine.data(), m_delayLine.data() + 1, (m_numTaps - 1) * sizeof(float));
201
- m_delayLine[m_numTaps - 1] = input;
202
-
203
235
  return output;
204
236
  }
205
237
  };
@@ -70,6 +70,27 @@ namespace dsp::core
70
70
  */
71
71
  T addSample(T newValue) { return m_filter.addSample(newValue); }
72
72
 
73
+ /**
74
+ * @brief Process array of samples in batch (optimized for throughput).
75
+ *
76
+ * This is significantly faster than calling addSample() in a loop
77
+ * for small-to-medium input sizes, as it:
78
+ * 1. Avoids per-call overhead (JS→Native boundary crossing)
79
+ * 2. Enables better CPU cache utilization
80
+ * 3. Allows compiler to vectorize the loop
81
+ *
82
+ * @param input Input array of samples
83
+ * @param output Output array (same size as input)
84
+ * @param length Number of samples to process
85
+ */
86
+ void processArray(const T *input, T *output, size_t length)
87
+ {
88
+ for (size_t i = 0; i < length; ++i)
89
+ {
90
+ output[i] = addSample(input[i]);
91
+ }
92
+ }
93
+
73
94
  /**
74
95
  * @brief Adds a new sample with timestamp (time-aware mode only).
75
96
  * @param newValue The new sample value to add.