dspx 0.2.0-alpha.11 → 0.2.0-alpha.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/binding.gyp CHANGED
@@ -81,11 +81,20 @@
81
81
  }],
82
82
  # Condition for arm64 architecture (Android, iOS, M1/M2 Macs, Tensor G4, etc.)
83
83
  ['target_arch=="arm64"', {
84
- "cflags+": [ "-march=armv8-a+fp+simd" ], # Enable NEON and FP on ARMv8
84
+ # ARMv8-a baseline: NEON + FP support (compatible with all ARMv8 CPUs)
85
+ "cflags+": [ "-march=armv8-a+fp+simd" ],
85
86
  "cflags_cc+": [ "-march=armv8-a+fp+simd" ],
86
87
  'xcode_settings': {
87
88
  'OTHER_CPLUSPLUSFLAGS+': [ '-march=armv8-a+fp+simd' ]
88
89
  }
90
+ # Optional: Upgrade to ARMv8.2-a for newer CPUs (Tensor G4, Apple M2+, Graviton 3+)
91
+ # Enables FP16 arithmetic and additional optimizations
92
+ # Uncomment the lines below to enable ARMv8.2-a:
93
+ # "cflags+": [ "-march=armv8.2-a+fp16" ],
94
+ # "cflags_cc+": [ "-march=armv8.2-a+fp16" ],
95
+ # 'xcode_settings': {
96
+ # 'OTHER_CPLUSPLUSFLAGS+': [ '-march=armv8.2-a+fp16' ]
97
+ # }
89
98
  }],
90
99
  # Condition for 32-bit ARM (older Android devices)
91
100
  ['target_arch=="arm"', {
package/dist/utils.d.ts CHANGED
@@ -40,6 +40,50 @@
40
40
  * ```
41
41
  */
42
42
  export declare function dotProduct(a: Float32Array, b: Float32Array): number;
43
+ /**
44
+ * Computes the sum of array elements using SIMD-accelerated native code.
45
+ *
46
+ * This implementation uses ARM NEON (4-wide) or x86 SSE2/AVX2 (4-8 wide) SIMD
47
+ * instructions with double-precision accumulation for numerical accuracy.
48
+ *
49
+ * @param buffer - Input array (Float32Array)
50
+ * @returns The sum of all elements
51
+ * @throws {TypeError} If input is not a Float32Array
52
+ *
53
+ * @example
54
+ * ```typescript
55
+ * const data = new Float32Array([1, 2, 3, 4, 5]);
56
+ * const total = sum(data); // 15
57
+ * ```
58
+ */
59
+ export declare function sum(buffer: Float32Array): number;
60
+ /**
61
+ * Computes the sum of squared elements using SIMD-accelerated native code.
62
+ *
63
+ * This implementation uses ARM NEON vmlaq_f32 (fused multiply-add) or x86
64
+ * SSE2/AVX2 for optimal performance. Result is accumulated in double precision.
65
+ *
66
+ * Useful for computing RMS, variance, power, energy, and L2 norm.
67
+ *
68
+ * @param buffer - Input array (Float32Array)
69
+ * @returns Sum of squares: buffer[0]² + buffer[1]² + ... + buffer[n-1]²
70
+ * @throws {TypeError} If input is not a Float32Array
71
+ *
72
+ * @example
73
+ * ```typescript
74
+ * const signal = new Float32Array([3, 4]); // 3-4-5 triangle
75
+ * const energy = sumOfSquares(signal); // 9 + 16 = 25
76
+ * const rms = Math.sqrt(energy / signal.length); // 5 / sqrt(2) ≈ 3.536
77
+ * ```
78
+ *
79
+ * @example
80
+ * ```typescript
81
+ * // Compute L2 norm (Euclidean length)
82
+ * const vector = new Float32Array([1, 2, 2]);
83
+ * const norm = Math.sqrt(sumOfSquares(vector)); // sqrt(9) = 3
84
+ * ```
85
+ */
86
+ export declare function sumOfSquares(buffer: Float32Array): number;
43
87
  /**
44
88
  * Utility functions for DSP operations.
45
89
  *
@@ -51,5 +95,15 @@ export declare const DspUtils: {
51
95
  * @see {@link dotProduct} for detailed documentation
52
96
  */
53
97
  dotProduct: typeof dotProduct;
98
+ /**
99
+ * Computes the sum of array elements using SIMD-accelerated native code.
100
+ * @see {@link sum} for detailed documentation
101
+ */
102
+ sum: typeof sum;
103
+ /**
104
+ * Computes the sum of squared elements using SIMD-accelerated native code.
105
+ * @see {@link sumOfSquares} for detailed documentation
106
+ */
107
+ sumOfSquares: typeof sumOfSquares;
54
108
  };
55
109
  //# sourceMappingURL=utils.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"utils.d.ts","sourceRoot":"","sources":["../src/ts/utils.ts"],"names":[],"mappings":"AAyBA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAwCG;AACH,wBAAgB,UAAU,CAAC,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,GAAG,MAAM,CAcnE;AAED;;;;GAIG;AACH,eAAO,MAAM,QAAQ;IACnB;;;OAGG;;CAEJ,CAAC"}
1
+ {"version":3,"file":"utils.d.ts","sourceRoot":"","sources":["../src/ts/utils.ts"],"names":[],"mappings":"AAyBA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAwCG;AACH,wBAAgB,UAAU,CAAC,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,GAAG,MAAM,CAcnE;AAED;;;;;;;;;;;;;;;GAeG;AACH,wBAAgB,GAAG,CAAC,MAAM,EAAE,YAAY,GAAG,MAAM,CAKhD;AAED;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AACH,wBAAgB,YAAY,CAAC,MAAM,EAAE,YAAY,GAAG,MAAM,CAKzD;AAED;;;;GAIG;AACH,eAAO,MAAM,QAAQ;IACnB;;;OAGG;;IAGH;;;OAGG;;IAGH;;;OAGG;;CAEJ,CAAC"}
package/dist/utils.js CHANGED
@@ -73,6 +73,60 @@ export function dotProduct(a, b) {
73
73
  }
74
74
  return DspAddon.dotProduct(a, b);
75
75
  }
76
+ /**
77
+ * Computes the sum of array elements using SIMD-accelerated native code.
78
+ *
79
+ * This implementation uses ARM NEON (4-wide) or x86 SSE2/AVX2 (4-8 wide) SIMD
80
+ * instructions with double-precision accumulation for numerical accuracy.
81
+ *
82
+ * @param buffer - Input array (Float32Array)
83
+ * @returns The sum of all elements
84
+ * @throws {TypeError} If input is not a Float32Array
85
+ *
86
+ * @example
87
+ * ```typescript
88
+ * const data = new Float32Array([1, 2, 3, 4, 5]);
89
+ * const total = sum(data); // 15
90
+ * ```
91
+ */
92
+ export function sum(buffer) {
93
+ if (!(buffer instanceof Float32Array)) {
94
+ throw new TypeError("Argument must be a Float32Array");
95
+ }
96
+ return DspAddon.sum(buffer);
97
+ }
98
+ /**
99
+ * Computes the sum of squared elements using SIMD-accelerated native code.
100
+ *
101
+ * This implementation uses ARM NEON vmlaq_f32 (fused multiply-add) or x86
102
+ * SSE2/AVX2 for optimal performance. Result is accumulated in double precision.
103
+ *
104
+ * Useful for computing RMS, variance, power, energy, and L2 norm.
105
+ *
106
+ * @param buffer - Input array (Float32Array)
107
+ * @returns Sum of squares: buffer[0]² + buffer[1]² + ... + buffer[n-1]²
108
+ * @throws {TypeError} If input is not a Float32Array
109
+ *
110
+ * @example
111
+ * ```typescript
112
+ * const signal = new Float32Array([3, 4]); // 3-4-5 triangle
113
+ * const energy = sumOfSquares(signal); // 9 + 16 = 25
114
+ * const rms = Math.sqrt(energy / signal.length); // 5 / sqrt(2) ≈ 3.536
115
+ * ```
116
+ *
117
+ * @example
118
+ * ```typescript
119
+ * // Compute L2 norm (Euclidean length)
120
+ * const vector = new Float32Array([1, 2, 2]);
121
+ * const norm = Math.sqrt(sumOfSquares(vector)); // sqrt(9) = 3
122
+ * ```
123
+ */
124
+ export function sumOfSquares(buffer) {
125
+ if (!(buffer instanceof Float32Array)) {
126
+ throw new TypeError("Argument must be a Float32Array");
127
+ }
128
+ return DspAddon.sumOfSquares(buffer);
129
+ }
76
130
  /**
77
131
  * Utility functions for DSP operations.
78
132
  *
@@ -84,5 +138,15 @@ export const DspUtils = {
84
138
  * @see {@link dotProduct} for detailed documentation
85
139
  */
86
140
  dotProduct,
141
+ /**
142
+ * Computes the sum of array elements using SIMD-accelerated native code.
143
+ * @see {@link sum} for detailed documentation
144
+ */
145
+ sum,
146
+ /**
147
+ * Computes the sum of squared elements using SIMD-accelerated native code.
148
+ * @see {@link sumOfSquares} for detailed documentation
149
+ */
150
+ sumOfSquares,
87
151
  };
88
152
  //# sourceMappingURL=utils.js.map
package/dist/utils.js.map CHANGED
@@ -1 +1 @@
1
- {"version":3,"file":"utils.js","sourceRoot":"","sources":["../src/ts/utils.ts"],"names":[],"mappings":"AAAA,OAAO,YAAY,MAAM,gBAAgB,CAAC;AAC1C,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AACzC,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAE1C,wCAAwC;AACxC,MAAM,UAAU,GAAG,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AAClD,MAAM,SAAS,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;AAEtC,IAAI,QAAa,CAAC;AAClB,sCAAsC;AACtC,IAAI,CAAC;IACH,gDAAgD;IAChD,QAAQ,GAAG,YAAY,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,CAAC,CAAC,CAAC;AACjD,CAAC;AAAC,OAAO,CAAC,EAAE,CAAC;IACX,IAAI,CAAC;QACH,oEAAoE;QACpE,QAAQ,GAAG,YAAY,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,EAAE,IAAI,CAAC,CAAC,CAAC;IACvD,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,+CAA+C;QAC/C,MAAM,IAAI,KAAK,CACb,oCAAoC,GAAG,EAAE,OAAO,IAAI,MAAM,CAAC,GAAG,CAAC,EAAE,CAClE,CAAC;IACJ,CAAC;AACH,CAAC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAwCG;AACH,MAAM,UAAU,UAAU,CAAC,CAAe,EAAE,CAAe;IACzD,IAAI,CAAC,CAAC,CAAC,YAAY,YAAY,CAAC,EAAE,CAAC;QACjC,MAAM,IAAI,SAAS,CAAC,uCAAuC,CAAC,CAAC;IAC/D,CAAC;IACD,IAAI,CAAC,CAAC,CAAC,YAAY,YAAY,CAAC,EAAE,CAAC;QACjC,MAAM,IAAI,SAAS,CAAC,wCAAwC,CAAC,CAAC;IAChE,CAAC;IACD,IAAI,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC,MAAM,EAAE,CAAC;QAC1B,MAAM,IAAI,UAAU,CAClB,uCAAuC,CAAC,CAAC,MAAM,cAAc,CAAC,CAAC,MAAM,EAAE,CACxE,CAAC;IACJ,CAAC;IAED,OAAO,QAAQ,CAAC,UAAU,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;AACnC,CAAC;AAED;;;;GAIG;AACH,MAAM,CAAC,MAAM,QAAQ,GAAG;IACtB;;;OAGG;IACH,UAAU;CACX,CAAC"}
1
+ {"version":3,"file":"utils.js","sourceRoot":"","sources":["../src/ts/utils.ts"],"names":[],"mappings":"AAAA,OAAO,YAAY,MAAM,gBAAgB,CAAC;AAC1C,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AACzC,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAE1C,wCAAwC;AACxC,MAAM,UAAU,GAAG,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AAClD,MAAM,SAAS,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;AAEtC,IAAI,QAAa,CAAC;AAClB,sCAAsC;AACtC,IAAI,CAAC;IACH,gDAAgD;IAChD,QAAQ,GAAG,YAAY,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,CAAC,CAAC,CAAC;AACjD,CAAC;AAAC,OAAO,CAAC,EAAE,CAAC;IACX,IAAI,CAAC;QACH,oEAAoE;QACpE,QAAQ,GAAG,YAAY,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,EAAE,IAAI,CAAC,CAAC,CAAC;IACvD,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,+CAA+C;QAC/C,MAAM,IAAI,KAAK,CACb,oCAAoC,GAAG,EAAE,OAAO,IAAI,MAAM,CAAC,GAAG,CAAC,EAAE,CAClE,CAAC;IACJ,CAAC;AACH,CAAC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAwCG;AACH,MAAM,UAAU,UAAU,CAAC,CAAe,EAAE,CAAe;IACzD,IAAI,CAAC,CAAC,CAAC,YAAY,YAAY,CAAC,EAAE,CAAC;QACjC,MAAM,IAAI,SAAS,CAAC,uCAAuC,CAAC,CAAC;IAC/D,CAAC;IACD,IAAI,CAAC,CAAC,CAAC,YAAY,YAAY,CAAC,EAAE,CAAC;QACjC,MAAM,IAAI,SAAS,CAAC,wCAAwC,CAAC,CAAC;IAChE,CAAC;IACD,IAAI,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC,MAAM,EAAE,CAAC;QAC1B,MAAM,IAAI,UAAU,CAClB,uCAAuC,CAAC,CAAC,MAAM,cAAc,CAAC,CAAC,MAAM,EAAE,CACxE,CAAC;IACJ,CAAC;IAED,OAAO,QAAQ,CAAC,UAAU,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;AACnC,CAAC;AAED;;;;;;;;;;;;;;;GAeG;AACH,MAAM,UAAU,GAAG,CAAC,MAAoB;IACtC,IAAI,CAAC,CAAC,MAAM,YAAY,YAAY,CAAC,EAAE,CAAC;QACtC,MAAM,IAAI,SAAS,CAAC,iCAAiC,CAAC,CAAC;IACzD,CAAC;IACD,OAAO,QAAQ,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;AAC9B,CAAC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AACH,MAAM,UAAU,YAAY,CAAC,MAAoB;IAC/C,IAAI,CAAC,CAAC,MAAM,YAAY,YAAY,CAAC,EAAE,CAAC;QACtC,MAAM,IAAI,SAAS,CAAC,iCAAiC,CAAC,CAAC;IACzD,CAAC;IACD,OAAO,QAAQ,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;AACvC,CAAC;AAED;;;;GAIG;AACH,MAAM,CAAC,MAAM,QAAQ,GAAG;IACtB;;;OAGG;IACH,UAAU;IAEV;;;OAGG;IACH,GAAG;IAEH;;;OAGG;IACH,YAAY;CACb,CAAC"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "dspx",
3
- "version": "0.2.0-alpha.11",
3
+ "version": "0.2.0-alpha.13",
4
4
  "description": "High-performance DSP library with native C++ acceleration and Redis state persistence",
5
5
  "main": "./dist/index.js",
6
6
  "types": "./dist/index.d.ts",
Binary file
@@ -7,6 +7,11 @@
7
7
  #include <algorithm>
8
8
  #include "../utils/SimdOps.h"
9
9
 
10
+ // Include ARM NEON intrinsics if available
11
+ #if defined(__ARM_NEON) || defined(__aarch64__)
12
+ #include <arm_neon.h>
13
+ #endif
14
+
10
15
  namespace dsp
11
16
  {
12
17
  namespace core
@@ -270,7 +275,57 @@ namespace dsp
270
275
  // Update weights: w[n+1] = (1 - mu*lambda) * w[n] + mu * e[n] * x[n]
271
276
  // where (1 - mu*lambda) is the leakage factor for regularization
272
277
  T leakage = 1.0 - m_mu * m_lambda;
278
+ T mu_error = mu_n * error;
279
+
280
+ #if defined(__ARM_NEON) || defined(__aarch64__)
281
+ // NEON-optimized weight update for ARM processors
282
+ const size_t simd_width = 4;
283
+ const size_t simd_count = m_numTaps / simd_width;
284
+ const size_t simd_end = simd_count * simd_width;
285
+
286
+ float32x4_t leakage_vec = vdupq_n_f32(leakage);
287
+ float32x4_t mu_error_vec = vdupq_n_f32(mu_error);
288
+
289
+ // Vectorized update: weights[i] = leakage * weights[i] + mu_error * x[i]
290
+ for (size_t i = 0; i < simd_end; i += simd_width)
291
+ {
292
+ // Get indices for input buffer (circular)
293
+ size_t idx0 = (writeIdx + m_numTaps - 1 - i) % m_numTaps;
294
+ size_t idx1 = (writeIdx + m_numTaps - 2 - i) % m_numTaps;
295
+ size_t idx2 = (writeIdx + m_numTaps - 3 - i) % m_numTaps;
296
+ size_t idx3 = (writeIdx + m_numTaps - 4 - i) % m_numTaps;
297
+
298
+ // Load 4 input samples (must be done individually due to circular buffer)
299
+ float x_vals[4] = {
300
+ static_cast<float>(inputBuffer[idx0]),
301
+ static_cast<float>(inputBuffer[idx1]),
302
+ static_cast<float>(inputBuffer[idx2]),
303
+ static_cast<float>(inputBuffer[idx3])};
304
+ float32x4_t x = vld1q_f32(x_vals);
305
+
306
+ // Load 4 weights
307
+ float32x4_t w = vld1q_f32(reinterpret_cast<const float *>(&weights[i]));
308
+
309
+ // Apply leakage: w *= leakage
310
+ w = vmulq_f32(w, leakage_vec);
311
+
312
+ // Fused multiply-add: w += mu_error * x
313
+ w = vmlaq_f32(w, mu_error_vec, x);
314
+
315
+ // Store updated weights
316
+ vst1q_f32(reinterpret_cast<float *>(&weights[i]), w);
317
+ }
318
+
319
+ // Handle remainder (scalar)
320
+ for (size_t i = simd_end; i < m_numTaps; ++i)
321
+ {
322
+ size_t idx = (writeIdx + m_numTaps - 1 - i) % m_numTaps;
323
+ T x_i = inputBuffer[idx];
324
+ weights[i] = leakage * weights[i] + mu_error * x_i;
325
+ }
273
326
 
327
+ #else
328
+ // Scalar weight update for non-ARM platforms
274
329
  for (size_t i = 0; i < m_numTaps; ++i)
275
330
  {
276
331
  size_t idx = (writeIdx + m_numTaps - 1 - i) % m_numTaps;
@@ -279,6 +334,7 @@ namespace dsp
279
334
  // Apply leaky LMS update
280
335
  weights[i] = leakage * weights[i] + mu_n * error * x_i;
281
336
  }
337
+ #endif
282
338
  }
283
339
  };
284
340
 
@@ -28,6 +28,20 @@ namespace dsp
28
28
  throw std::invalid_argument("FIR filter requires at least one coefficient");
29
29
  }
30
30
 
31
+ #if defined(__ARM_NEON) || defined(__aarch64__)
32
+ // Auto-select NEON for float32 + small-medium taps (where transposed form wins)
33
+ // For large taps (>128), circular buffer's O(1) state update is better than O(N) shift
34
+ m_useNeon = false;
35
+ if constexpr (std::is_same_v<T, float>)
36
+ {
37
+ if (stateful && coefficients.size() >= 8 && coefficients.size() <= 128)
38
+ {
39
+ m_neonFilter = std::make_unique<FirFilterNeon>(coefficients);
40
+ m_useNeon = true;
41
+ }
42
+ }
43
+ #endif
44
+
31
45
  if (stateful)
32
46
  {
33
47
  // Round up to next power of 2 for efficient circular buffer (enables bitwise AND instead of modulo)
@@ -51,6 +65,14 @@ namespace dsp
51
65
  throw std::runtime_error("processSample() requires stateful mode");
52
66
  }
53
67
 
68
+ #if defined(__ARM_NEON) || defined(__aarch64__)
69
+ // Use NEON filter if available
70
+ if (m_useNeon && m_neonFilter)
71
+ {
72
+ return static_cast<T>(m_neonFilter->processSample(static_cast<float>(input)));
73
+ }
74
+ #endif
75
+
54
76
  // Store input in circular buffer
55
77
  m_state[m_stateIndex] = input;
56
78
 
@@ -86,6 +108,18 @@ namespace dsp
86
108
  template <typename T>
87
109
  void FirFilter<T>::process(const T *input, T *output, size_t length, bool stateless)
88
110
  {
111
+ #if defined(__ARM_NEON) || defined(__aarch64__)
112
+ // Use NEON batch processing for stateful mode
113
+ if (!stateless && m_stateful && m_useNeon && m_neonFilter)
114
+ {
115
+ for (size_t i = 0; i < length; ++i)
116
+ {
117
+ output[i] = static_cast<T>(m_neonFilter->processSample(static_cast<float>(input[i])));
118
+ }
119
+ return;
120
+ }
121
+ #endif
122
+
89
123
  if (stateless || !m_stateful)
90
124
  {
91
125
  // Stateless mode: each output depends only on current window
@@ -166,6 +200,13 @@ namespace dsp
166
200
  template <typename T>
167
201
  void FirFilter<T>::reset()
168
202
  {
203
+ #if defined(__ARM_NEON) || defined(__aarch64__)
204
+ if (m_useNeon && m_neonFilter)
205
+ {
206
+ m_neonFilter->reset();
207
+ }
208
+ #endif
209
+
169
210
  if (m_stateful)
170
211
  {
171
212
  std::fill(m_state.begin(), m_state.end(), T(0));
@@ -183,6 +224,23 @@ namespace dsp
183
224
 
184
225
  m_coefficients = coefficients;
185
226
 
227
+ #if defined(__ARM_NEON) || defined(__aarch64__)
228
+ // Update NEON filter if in use
229
+ m_useNeon = false;
230
+ if constexpr (std::is_same_v<T, float>)
231
+ {
232
+ if (m_stateful && coefficients.size() >= 8 && coefficients.size() <= 128)
233
+ {
234
+ m_neonFilter = std::make_unique<FirFilterNeon>(coefficients);
235
+ m_useNeon = true;
236
+ }
237
+ else
238
+ {
239
+ m_neonFilter.reset();
240
+ }
241
+ }
242
+ #endif
243
+
186
244
  if (m_stateful)
187
245
  {
188
246
  // Round up to next power of 2
@@ -20,6 +20,11 @@
20
20
  #include <memory>
21
21
  #include "../utils/CircularBufferArray.h"
22
22
 
23
+ // Include NEON-optimized filter for ARM platforms
24
+ #if defined(__ARM_NEON) || defined(__aarch64__)
25
+ #include "FirFilterNeon.h"
26
+ #endif
27
+
23
28
  namespace dsp
24
29
  {
25
30
  namespace core
@@ -138,6 +143,12 @@ namespace dsp
138
143
  size_t m_stateMask; // Bitmask for power-of-2 circular buffer (replaces modulo)
139
144
  bool m_stateful; // Whether to maintain state between calls
140
145
 
146
+ #if defined(__ARM_NEON) || defined(__aarch64__)
147
+ // NEON-optimized filter for ARM (auto-selected for small-medium taps + float32)
148
+ std::unique_ptr<FirFilterNeon> m_neonFilter;
149
+ bool m_useNeon;
150
+ #endif
151
+
141
152
  /**
142
153
  * Compute single output sample via convolution
143
154
  * @param input Current input sample
@@ -0,0 +1,233 @@
1
+ #pragma once
2
+
3
+ /**
4
+ * @file FirFilterNeon.h
5
+ * @brief ARM NEON-optimized FIR filter with guard-zone circular buffer
6
+ *
7
+ * This implementation keeps O(1) state updates while enabling fully contiguous
8
+ * NEON vectorization using a "guard zone" (mirrored buffer) technique.
9
+ *
10
+ * Key insight: Allocate buffer of size N + GUARD (where GUARD >= max SIMD width).
11
+ * When writing sample at index i, also write it at i+N. This ensures that any
12
+ * NEON load starting from 'head' can read contiguously without wrap-around logic.
13
+ *
14
+ * Performance: O(1) state update + fully vectorized O(N) convolution.
15
+ * Expected gain vs naive circular buffer: 3-6x for 16-128 tap filters on ARM.
16
+ */
17
+
18
+ #include <vector>
19
+ #include <cstddef>
20
+ #include <cstring>
21
+ #include <stdexcept>
22
+ #include <algorithm>
23
+
24
+ #if defined(__ARM_NEON) || defined(__aarch64__)
25
+ #include <arm_neon.h>
26
+ #endif
27
+
28
+ namespace dsp::core
29
+ {
30
+ /**
31
+ * @brief High-performance NEON-optimized FIR filter using guard-zone circular buffer
32
+ *
33
+ * Architecture:
34
+ * - Circular buffer with power-of-2 size for bitmask wrapping (O(1) update)
35
+ * - Guard zone (mirrored tail) to make SIMD reads always contiguous
36
+ * - Coefficients stored in forward order (newest sample = h[0])
37
+ * - NEON kernel reads forward from 'head' with no modulo in inner loop
38
+ *
39
+ * This gives best of both worlds:
40
+ * 1. O(1) state updates (increment head, write sample + guard)
41
+ * 2. Fully contiguous NEON loads (no gather/scatter)
42
+ * 3. No memmove/shift overhead (eliminated algorithmic regression)
43
+ */
44
+ class FirFilterNeon
45
+ {
46
+ public:
47
+ explicit FirFilterNeon(const std::vector<float> &coefficients)
48
+ : m_numTaps(coefficients.size()),
49
+ m_head(0)
50
+ {
51
+ if (coefficients.empty())
52
+ {
53
+ throw std::invalid_argument("FIR coefficients cannot be empty");
54
+ }
55
+
56
+ // Round up to next power of 2 for bitmask wrapping
57
+ m_bufferSize = 1;
58
+ while (m_bufferSize < m_numTaps)
59
+ {
60
+ m_bufferSize <<= 1;
61
+ }
62
+ m_headMask = m_bufferSize - 1;
63
+
64
+ // Store coefficients in FORWARD order (h[0] = newest tap)
65
+ // This matches the circular buffer access pattern
66
+ m_coefficients = coefficients;
67
+
68
+ // Allocate state buffer + guard zone
69
+ // Guard zone mirrors the entire circular buffer for contiguous wraparound reads
70
+ m_state.resize(m_bufferSize * 2, 0.0f);
71
+ }
72
+
73
+ /**
74
+ * @brief Process single sample (stateful, streaming mode)
75
+ * @param input New input sample
76
+ * @return Filtered output
77
+ */
78
+ float processSample(float input)
79
+ {
80
+ #if defined(__ARM_NEON) || defined(__aarch64__)
81
+ return processSampleNeon(input);
82
+ #else
83
+ return processSampleScalar(input);
84
+ #endif
85
+ }
86
+
87
+ /**
88
+ * @brief Process batch of samples in-place
89
+ * @param buffer Input/output buffer
90
+ * @param numSamples Number of samples to process
91
+ */
92
+ void processBatch(float *buffer, size_t numSamples)
93
+ {
94
+ for (size_t i = 0; i < numSamples; ++i)
95
+ {
96
+ buffer[i] = processSample(buffer[i]);
97
+ }
98
+ }
99
+
100
+ /**
101
+ * @brief Reset filter state (clear circular buffer and guard zone)
102
+ */
103
+ void reset()
104
+ {
105
+ std::fill(m_state.begin(), m_state.end(), 0.0f);
106
+ m_head = 0;
107
+ }
108
+
109
+ size_t getNumTaps() const { return m_numTaps; }
110
+ size_t getBufferSize() const { return m_bufferSize; }
111
+
112
+ private:
113
+ size_t m_numTaps; // Number of filter taps
114
+ size_t m_bufferSize; // Power-of-2 buffer size (>= m_numTaps)
115
+ size_t m_head; // Current write position
116
+ size_t m_headMask; // Bitmask for wrapping (bufferSize - 1)
117
+ std::vector<float> m_coefficients; // Filter coefficients (forward order)
118
+ std::vector<float> m_state; // Circular buffer + guard zone
119
+
120
+ #if defined(__ARM_NEON) || defined(__aarch64__)
121
+ /**
122
+ * @brief NEON-optimized sample processing with guard-zone circular buffer
123
+ *
124
+ * Algorithm:
125
+ * 1. Write input to state[head] and state[head + bufferSize] (guard mirror)
126
+ * 2. Read N contiguous floats starting from state[head] using NEON
127
+ * 3. Compute dot product with coefficients (fully vectorized)
128
+ * 4. Advance head with bitmask wrapping (O(1))
129
+ *
130
+ * Key: The guard zone ensures that reads from 'head' are ALWAYS contiguous,
131
+ * even when they logically "wrap around" the circular buffer boundary.
132
+ */
133
+ float processSampleNeon(float input)
134
+ {
135
+ // Advance head FIRST (points to oldest sample position)
136
+ m_head = (m_head + 1) & m_headMask;
137
+
138
+ // Write input to current position AND guard zone (O(1) mirroring)
139
+ m_state[m_head] = input;
140
+ // Always mirror to guard zone - this is critical for wraparound reads!
141
+ m_state[m_head + m_bufferSize] = input;
142
+
143
+ // NEON convolution: read BACKWARD from m_head (newest to oldest)
144
+ // m_head points to newest sample, we need to read m_numTaps samples backward
145
+ // The guard zone ensures contiguous reads even across the wrap boundary
146
+ // Calculate start position: if m_head >= (numTaps-1), read from [m_head - numTaps + 1]
147
+ // Otherwise, read from guard zone: [m_head + bufferSize - numTaps + 1]
148
+ size_t readStart;
149
+ if (m_head >= m_numTaps - 1)
150
+ {
151
+ readStart = m_head - m_numTaps + 1;
152
+ }
153
+ else
154
+ {
155
+ // Wrap using guard zone (no modulo needed!)
156
+ readStart = m_head + m_bufferSize - m_numTaps + 1;
157
+ }
158
+ const float *x = &m_state[readStart];
159
+ const float *h = m_coefficients.data();
160
+
161
+ constexpr size_t simd_width = 4;
162
+ const size_t simd_end = (m_numTaps / simd_width) * simd_width;
163
+
164
+ float32x4_t acc = vdupq_n_f32(0.0f);
165
+
166
+ // Vectorized MAC loop (no modulo, no branches!)
167
+ for (size_t i = 0; i < simd_end; i += simd_width)
168
+ {
169
+ float32x4_t c = vld1q_f32(h + i);
170
+ float32x4_t d = vld1q_f32(x + i);
171
+ acc = vmlaq_f32(acc, c, d); // Fused multiply-add
172
+ }
173
+
174
+ // Horizontal reduction
175
+ #if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
176
+ // ARMv8.1-a and later: use vaddvq_f32
177
+ float output = vaddvq_f32(acc);
178
+ #else
179
+ // ARMv8.0 fallback: manual pairwise addition
180
+ float32x2_t sum_lo = vget_low_f32(acc);
181
+ float32x2_t sum_hi = vget_high_f32(acc);
182
+ float32x2_t sum_pair = vadd_f32(sum_lo, sum_hi);
183
+ float32x2_t sum_final = vpadd_f32(sum_pair, sum_pair);
184
+ float output = vget_lane_f32(sum_final, 0);
185
+ #endif
186
+
187
+ // Scalar tail (remaining 0-3 taps)
188
+ for (size_t i = simd_end; i < m_numTaps; ++i)
189
+ {
190
+ output += h[i] * x[i];
191
+ }
192
+
193
+ return output;
194
+ }
195
+ #endif
196
+
197
+ /**
198
+ * @brief Scalar fallback for non-ARM platforms
199
+ */
200
+ float processSampleScalar(float input)
201
+ {
202
+ // Advance head FIRST
203
+ m_head = (m_head + 1) & m_headMask;
204
+
205
+ // Write to circular buffer + guard
206
+ m_state[m_head] = input;
207
+ // Always mirror to guard zone
208
+ m_state[m_head + m_bufferSize] = input;
209
+
210
+ // Compute output (read backward from newest to oldest)
211
+ float output = 0.0f;
212
+ size_t readStart;
213
+ if (m_head >= m_numTaps - 1)
214
+ {
215
+ readStart = m_head - m_numTaps + 1;
216
+ }
217
+ else
218
+ {
219
+ readStart = m_head + m_bufferSize - m_numTaps + 1;
220
+ }
221
+ const float *x = &m_state[readStart];
222
+ const float *h = m_coefficients.data();
223
+
224
+ for (size_t i = 0; i < m_numTaps; ++i)
225
+ {
226
+ output += h[i] * x[i];
227
+ }
228
+
229
+ return output;
230
+ }
231
+ };
232
+
233
+ } // namespace dsp::core
@@ -70,6 +70,27 @@ namespace dsp::core
70
70
  */
71
71
  T addSample(T newValue) { return m_filter.addSample(newValue); }
72
72
 
73
+ /**
74
+ * @brief Process array of samples in batch (optimized for throughput).
75
+ *
76
+ * This is significantly faster than calling addSample() in a loop
77
+ * for small-to-medium input sizes, as it:
78
+ * 1. Avoids per-call overhead (JS→Native boundary crossing)
79
+ * 2. Enables better CPU cache utilization
80
+ * 3. Allows compiler to vectorize the loop
81
+ *
82
+ * @param input Input array of samples
83
+ * @param output Output array (same size as input)
84
+ * @param length Number of samples to process
85
+ */
86
+ void processArray(const T *input, T *output, size_t length)
87
+ {
88
+ for (size_t i = 0; i < length; ++i)
89
+ {
90
+ output[i] = addSample(input[i]);
91
+ }
92
+ }
93
+
73
94
  /**
74
95
  * @brief Adds a new sample with timestamp (time-aware mode only).
75
96
  * @param newValue The new sample value to add.
@@ -293,6 +293,36 @@ namespace dsp::simd
293
293
 
294
294
  return total;
295
295
 
296
+ #elif defined(SIMD_NEON)
297
+ const size_t simd_width = 4;
298
+ const size_t simd_count = size / simd_width;
299
+ const size_t simd_end = simd_count * simd_width;
300
+
301
+ // ARM NEON: Accumulate in float, then convert to double for precision
302
+ float32x4_t acc = vdupq_n_f32(0.0f);
303
+
304
+ for (size_t i = 0; i < simd_end; i += simd_width)
305
+ {
306
+ float32x4_t values = vld1q_f32(&buffer[i]);
307
+ acc = vaddq_f32(acc, values);
308
+ }
309
+
310
+ // Pairwise add to get horizontal sum, then convert to double
311
+ float32x2_t sum_lo = vget_low_f32(acc);
312
+ float32x2_t sum_hi = vget_high_f32(acc);
313
+ float32x2_t sum_pair = vadd_f32(sum_lo, sum_hi);
314
+ float32x2_t sum_final = vpadd_f32(sum_pair, sum_pair);
315
+
316
+ double total = static_cast<double>(vget_lane_f32(sum_final, 0));
317
+
318
+ // Handle remainder
319
+ for (size_t i = simd_end; i < size; ++i)
320
+ {
321
+ total += static_cast<double>(buffer[i]);
322
+ }
323
+
324
+ return total;
325
+
296
326
  #else
297
327
  // Scalar with Kahan summation for precision
298
328
  double sum = 0.0;
@@ -396,6 +426,36 @@ namespace dsp::simd
396
426
 
397
427
  return total;
398
428
 
429
+ #elif defined(SIMD_NEON)
430
+ const size_t simd_width = 4;
431
+ const size_t simd_count = size / simd_width;
432
+ const size_t simd_end = simd_count * simd_width;
433
+
434
+ // ARM NEON: Accumulate squares in float, then convert to double
435
+ float32x4_t acc = vdupq_n_f32(0.0f);
436
+
437
+ for (size_t i = 0; i < simd_end; i += simd_width)
438
+ {
439
+ float32x4_t values = vld1q_f32(&buffer[i]);
440
+ // Fused multiply-add: acc += values * values
441
+ acc = vmlaq_f32(acc, values, values);
442
+ }
443
+
444
+ // Convert to double for precision
445
+ float temp[4];
446
+ vst1q_f32(temp, acc);
447
+ double total = static_cast<double>(temp[0]) + static_cast<double>(temp[1]) +
448
+ static_cast<double>(temp[2]) + static_cast<double>(temp[3]);
449
+
450
+ // Handle remainder
451
+ for (size_t i = simd_end; i < size; ++i)
452
+ {
453
+ double val = static_cast<double>(buffer[i]);
454
+ total += val * val;
455
+ }
456
+
457
+ return total;
458
+
399
459
  #else
400
460
  // Scalar with Kahan summation
401
461
  double sum = 0.0;