npm - dspx - Versions diffs - 0.2.0-alpha.11 → 0.2.0-alpha.13 - Mend

dspx 0.2.0-alpha.11 → 0.2.0-alpha.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

package/binding.gyp +10 -1
package/dist/utils.d.ts +54 -0
package/dist/utils.d.ts.map +1 -1
package/dist/utils.js +64 -0
package/dist/utils.js.map +1 -1
package/package.json +1 -1
package/prebuilds/linux-arm64/dspx.node +0 -0
package/src/native/core/DifferentiableFilter.h +56 -0
package/src/native/core/FirFilter.cc +58 -0
package/src/native/core/FirFilter.h +11 -0
package/src/native/core/FirFilterNeon.h +233 -0
package/src/native/core/MovingAverageFilter.h +21 -0
package/src/native/utils/SimdOps.h +60 -0

package/binding.gyp CHANGED Viewed

@@ -81,11 +81,20 @@
         }],
         # Condition for arm64 architecture (Android, iOS, M1/M2 Macs, Tensor G4, etc.)
         ['target_arch=="arm64"', {
-          "cflags+": [ "-march=armv8-a+fp+simd" ],  # Enable NEON and FP on ARMv8
+          # ARMv8-a baseline: NEON + FP support (compatible with all ARMv8 CPUs)
+          "cflags+": [ "-march=armv8-a+fp+simd" ],
           "cflags_cc+": [ "-march=armv8-a+fp+simd" ],
           'xcode_settings': {
             'OTHER_CPLUSPLUSFLAGS+': [ '-march=armv8-a+fp+simd' ]
           }
+          # Optional: Upgrade to ARMv8.2-a for newer CPUs (Tensor G4, Apple M2+, Graviton 3+)
+          # Enables FP16 arithmetic and additional optimizations
+          # Uncomment the lines below to enable ARMv8.2-a:
+          # "cflags+": [ "-march=armv8.2-a+fp16" ],
+          # "cflags_cc+": [ "-march=armv8.2-a+fp16" ],
+          # 'xcode_settings': {
+          #   'OTHER_CPLUSPLUSFLAGS+': [ '-march=armv8.2-a+fp16' ]
+          # }
         }],
         # Condition for 32-bit ARM (older Android devices)
         ['target_arch=="arm"', {

package/dist/utils.d.ts CHANGED Viewed

@@ -40,6 +40,50 @@
  * ```
  */
 export declare function dotProduct(a: Float32Array, b: Float32Array): number;
+/**
+ * Computes the sum of array elements using SIMD-accelerated native code.
+ *
+ * This implementation uses ARM NEON (4-wide) or x86 SSE2/AVX2 (4-8 wide) SIMD
+ * instructions with double-precision accumulation for numerical accuracy.
+ *
+ * @param buffer - Input array (Float32Array)
+ * @returns The sum of all elements
+ * @throws {TypeError} If input is not a Float32Array
+ *
+ * @example
+ * ```typescript
+ * const data = new Float32Array([1, 2, 3, 4, 5]);
+ * const total = sum(data); // 15
+ * ```
+ */
+export declare function sum(buffer: Float32Array): number;
+/**
+ * Computes the sum of squared elements using SIMD-accelerated native code.
+ *
+ * This implementation uses ARM NEON vmlaq_f32 (fused multiply-add) or x86
+ * SSE2/AVX2 for optimal performance. Result is accumulated in double precision.
+ *
+ * Useful for computing RMS, variance, power, energy, and L2 norm.
+ *
+ * @param buffer - Input array (Float32Array)
+ * @returns Sum of squares: buffer[0]² + buffer[1]² + ... + buffer[n-1]²
+ * @throws {TypeError} If input is not a Float32Array
+ *
+ * @example
+ * ```typescript
+ * const signal = new Float32Array([3, 4]); // 3-4-5 triangle
+ * const energy = sumOfSquares(signal); // 9 + 16 = 25
+ * const rms = Math.sqrt(energy / signal.length); // 5 / sqrt(2) ≈ 3.536
+ * ```
+ *
+ * @example
+ * ```typescript
+ * // Compute L2 norm (Euclidean length)
+ * const vector = new Float32Array([1, 2, 2]);
+ * const norm = Math.sqrt(sumOfSquares(vector)); // sqrt(9) = 3
+ * ```
+ */
+export declare function sumOfSquares(buffer: Float32Array): number;
 /**
  * Utility functions for DSP operations.
  *
@@ -51,5 +95,15 @@ export declare const DspUtils: {
      * @see {@link dotProduct} for detailed documentation
      */
     dotProduct: typeof dotProduct;
+    /**
+     * Computes the sum of array elements using SIMD-accelerated native code.
+     * @see {@link sum} for detailed documentation
+     */
+    sum: typeof sum;
+    /**
+     * Computes the sum of squared elements using SIMD-accelerated native code.
+     * @see {@link sumOfSquares} for detailed documentation
+     */
+    sumOfSquares: typeof sumOfSquares;
 };
 //# sourceMappingURL=utils.d.ts.map

package/dist/utils.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"utils.d.ts","sourceRoot":"","sources":["../src/ts/utils.ts"],"names":[],"mappings":"AAyBA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAwCG;AACH,wBAAgB,UAAU,CAAC,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,GAAG,MAAM,CAcnE;AAED;;;;GAIG;AACH,eAAO,MAAM,QAAQ;IACnB;;;OAGG;;CAEJ,CAAC"}
1	+ {"version":3,"file":"utils.d.ts","sourceRoot":"","sources":["../src/ts/utils.ts"],"names":[],"mappings":"AAyBA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAwCG;AACH,wBAAgB,UAAU,CAAC,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,GAAG,MAAM,CAcnE;AAED;;;;;;;;;;;;;;;GAeG;AACH,wBAAgB,GAAG,CAAC,MAAM,EAAE,YAAY,GAAG,MAAM,CAKhD;AAED;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AACH,wBAAgB,YAAY,CAAC,MAAM,EAAE,YAAY,GAAG,MAAM,CAKzD;AAED;;;;GAIG;AACH,eAAO,MAAM,QAAQ;IACnB;;;OAGG;;IAGH;;;OAGG;;IAGH;;;OAGG;;CAEJ,CAAC"}

package/dist/utils.js CHANGED Viewed

@@ -73,6 +73,60 @@ export function dotProduct(a, b) {
     }
     return DspAddon.dotProduct(a, b);
 }
+/**
+ * Computes the sum of array elements using SIMD-accelerated native code.
+ *
+ * This implementation uses ARM NEON (4-wide) or x86 SSE2/AVX2 (4-8 wide) SIMD
+ * instructions with double-precision accumulation for numerical accuracy.
+ *
+ * @param buffer - Input array (Float32Array)
+ * @returns The sum of all elements
+ * @throws {TypeError} If input is not a Float32Array
+ *
+ * @example
+ * ```typescript
+ * const data = new Float32Array([1, 2, 3, 4, 5]);
+ * const total = sum(data); // 15
+ * ```
+ */
+export function sum(buffer) {
+    if (!(buffer instanceof Float32Array)) {
+        throw new TypeError("Argument must be a Float32Array");
+    }
+    return DspAddon.sum(buffer);
+}
+/**
+ * Computes the sum of squared elements using SIMD-accelerated native code.
+ *
+ * This implementation uses ARM NEON vmlaq_f32 (fused multiply-add) or x86
+ * SSE2/AVX2 for optimal performance. Result is accumulated in double precision.
+ *
+ * Useful for computing RMS, variance, power, energy, and L2 norm.
+ *
+ * @param buffer - Input array (Float32Array)
+ * @returns Sum of squares: buffer[0]² + buffer[1]² + ... + buffer[n-1]²
+ * @throws {TypeError} If input is not a Float32Array
+ *
+ * @example
+ * ```typescript
+ * const signal = new Float32Array([3, 4]); // 3-4-5 triangle
+ * const energy = sumOfSquares(signal); // 9 + 16 = 25
+ * const rms = Math.sqrt(energy / signal.length); // 5 / sqrt(2) ≈ 3.536
+ * ```
+ *
+ * @example
+ * ```typescript
+ * // Compute L2 norm (Euclidean length)
+ * const vector = new Float32Array([1, 2, 2]);
+ * const norm = Math.sqrt(sumOfSquares(vector)); // sqrt(9) = 3
+ * ```
+ */
+export function sumOfSquares(buffer) {
+    if (!(buffer instanceof Float32Array)) {
+        throw new TypeError("Argument must be a Float32Array");
+    }
+    return DspAddon.sumOfSquares(buffer);
+}
 /**
  * Utility functions for DSP operations.
  *
@@ -84,5 +138,15 @@ export const DspUtils = {
      * @see {@link dotProduct} for detailed documentation
      */
     dotProduct,
+    /**
+     * Computes the sum of array elements using SIMD-accelerated native code.
+     * @see {@link sum} for detailed documentation
+     */
+    sum,
+    /**
+     * Computes the sum of squared elements using SIMD-accelerated native code.
+     * @see {@link sumOfSquares} for detailed documentation
+     */
+    sumOfSquares,
 };
 //# sourceMappingURL=utils.js.map

package/dist/utils.js.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"utils.js","sourceRoot":"","sources":["../src/ts/utils.ts"],"names":[],"mappings":"AAAA,OAAO,YAAY,MAAM,gBAAgB,CAAC;AAC1C,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AACzC,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAE1C,wCAAwC;AACxC,MAAM,UAAU,GAAG,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AAClD,MAAM,SAAS,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;AAEtC,IAAI,QAAa,CAAC;AAClB,sCAAsC;AACtC,IAAI,CAAC;IACH,gDAAgD;IAChD,QAAQ,GAAG,YAAY,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,CAAC,CAAC,CAAC;AACjD,CAAC;AAAC,OAAO,CAAC,EAAE,CAAC;IACX,IAAI,CAAC;QACH,oEAAoE;QACpE,QAAQ,GAAG,YAAY,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,EAAE,IAAI,CAAC,CAAC,CAAC;IACvD,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,+CAA+C;QAC/C,MAAM,IAAI,KAAK,CACb,oCAAoC,GAAG,EAAE,OAAO,IAAI,MAAM,CAAC,GAAG,CAAC,EAAE,CAClE,CAAC;IACJ,CAAC;AACH,CAAC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAwCG;AACH,MAAM,UAAU,UAAU,CAAC,CAAe,EAAE,CAAe;IACzD,IAAI,CAAC,CAAC,CAAC,YAAY,YAAY,CAAC,EAAE,CAAC;QACjC,MAAM,IAAI,SAAS,CAAC,uCAAuC,CAAC,CAAC;IAC/D,CAAC;IACD,IAAI,CAAC,CAAC,CAAC,YAAY,YAAY,CAAC,EAAE,CAAC;QACjC,MAAM,IAAI,SAAS,CAAC,wCAAwC,CAAC,CAAC;IAChE,CAAC;IACD,IAAI,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC,MAAM,EAAE,CAAC;QAC1B,MAAM,IAAI,UAAU,CAClB,uCAAuC,CAAC,CAAC,MAAM,cAAc,CAAC,CAAC,MAAM,EAAE,CACxE,CAAC;IACJ,CAAC;IAED,OAAO,QAAQ,CAAC,UAAU,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;AACnC,CAAC;AAED;;;;GAIG;AACH,MAAM,CAAC,MAAM,QAAQ,GAAG;IACtB;;;OAGG;IACH,UAAU;~~CACX~~,CAAC"}
1	+ {"version":3,"file":"utils.js","sourceRoot":"","sources":["../src/ts/utils.ts"],"names":[],"mappings":"AAAA,OAAO,YAAY,MAAM,gBAAgB,CAAC;AAC1C,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AACzC,OAAO,EAAE,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAE1C,wCAAwC;AACxC,MAAM,UAAU,GAAG,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AAClD,MAAM,SAAS,GAAG,OAAO,CAAC,UAAU,CAAC,CAAC;AAEtC,IAAI,QAAa,CAAC;AAClB,sCAAsC;AACtC,IAAI,CAAC;IACH,gDAAgD;IAChD,QAAQ,GAAG,YAAY,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,CAAC,CAAC,CAAC;AACjD,CAAC;AAAC,OAAO,CAAC,EAAE,CAAC;IACX,IAAI,CAAC;QACH,oEAAoE;QACpE,QAAQ,GAAG,YAAY,CAAC,IAAI,CAAC,SAAS,EAAE,IAAI,EAAE,IAAI,CAAC,CAAC,CAAC;IACvD,CAAC;IAAC,OAAO,GAAQ,EAAE,CAAC;QAClB,+CAA+C;QAC/C,MAAM,IAAI,KAAK,CACb,oCAAoC,GAAG,EAAE,OAAO,IAAI,MAAM,CAAC,GAAG,CAAC,EAAE,CAClE,CAAC;IACJ,CAAC;AACH,CAAC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAwCG;AACH,MAAM,UAAU,UAAU,CAAC,CAAe,EAAE,CAAe;IACzD,IAAI,CAAC,CAAC,CAAC,YAAY,YAAY,CAAC,EAAE,CAAC;QACjC,MAAM,IAAI,SAAS,CAAC,uCAAuC,CAAC,CAAC;IAC/D,CAAC;IACD,IAAI,CAAC,CAAC,CAAC,YAAY,YAAY,CAAC,EAAE,CAAC;QACjC,MAAM,IAAI,SAAS,CAAC,wCAAwC,CAAC,CAAC;IAChE,CAAC;IACD,IAAI,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC,MAAM,EAAE,CAAC;QAC1B,MAAM,IAAI,UAAU,CAClB,uCAAuC,CAAC,CAAC,MAAM,cAAc,CAAC,CAAC,MAAM,EAAE,CACxE,CAAC;IACJ,CAAC;IAED,OAAO,QAAQ,CAAC,UAAU,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;AACnC,CAAC;AAED;;;;;;;;;;;;;;;GAeG;AACH,MAAM,UAAU,GAAG,CAAC,MAAoB;IACtC,IAAI,CAAC,CAAC,MAAM,YAAY,YAAY,CAAC,EAAE,CAAC;QACtC,MAAM,IAAI,SAAS,CAAC,iCAAiC,CAAC,CAAC;IACzD,CAAC;IACD,OAAO,QAAQ,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC;AAC9B,CAAC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AACH,MAAM,UAAU,YAAY,CAAC,MAAoB;IAC/C,IAAI,CAAC,CAAC,MAAM,YAAY,YAAY,CAAC,EAAE,CAAC;QACtC,MAAM,IAAI,SAAS,CAAC,iCAAiC,CAAC,CAAC;IACzD,CAAC;IACD,OAAO,QAAQ,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;AACvC,CAAC;AAED;;;;GAIG;AACH,MAAM,CAAC,MAAM,QAAQ,GAAG;IACtB;;;OAGG;IACH,UAAU;IAEV;;;OAGG;IACH,GAAG;IAEH;;;OAGG;IACH,YAAY;CACb,CAAC"}

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "dspx",
-  "version": "0.2.0-alpha.11",
+  "version": "0.2.0-alpha.13",
   "description": "High-performance DSP library with native C++ acceleration and Redis state persistence",
   "main": "./dist/index.js",
   "types": "./dist/index.d.ts",

package/prebuilds/linux-arm64/dspx.node CHANGED Viewed

Binary file

package/src/native/core/DifferentiableFilter.h CHANGED Viewed

@@ -7,6 +7,11 @@
 #include <algorithm>
 #include "../utils/SimdOps.h"
+// Include ARM NEON intrinsics if available
+#if defined(__ARM_NEON) || defined(__aarch64__)
+#include <arm_neon.h>
+#endif
 namespace dsp
 {
     namespace core
@@ -270,7 +275,57 @@ namespace dsp
                 // Update weights: w[n+1] = (1 - mu*lambda) * w[n] + mu * e[n] * x[n]
                 // where (1 - mu*lambda) is the leakage factor for regularization
                 T leakage = 1.0 - m_mu * m_lambda;
+                T mu_error = mu_n * error;
+#if defined(__ARM_NEON) || defined(__aarch64__)
+                // NEON-optimized weight update for ARM processors
+                const size_t simd_width = 4;
+                const size_t simd_count = m_numTaps / simd_width;
+                const size_t simd_end = simd_count * simd_width;
+                float32x4_t leakage_vec = vdupq_n_f32(leakage);
+                float32x4_t mu_error_vec = vdupq_n_f32(mu_error);
+                // Vectorized update: weights[i] = leakage * weights[i] + mu_error * x[i]
+                for (size_t i = 0; i < simd_end; i += simd_width)
+                {
+                    // Get indices for input buffer (circular)
+                    size_t idx0 = (writeIdx + m_numTaps - 1 - i) % m_numTaps;
+                    size_t idx1 = (writeIdx + m_numTaps - 2 - i) % m_numTaps;
+                    size_t idx2 = (writeIdx + m_numTaps - 3 - i) % m_numTaps;
+                    size_t idx3 = (writeIdx + m_numTaps - 4 - i) % m_numTaps;
+                    // Load 4 input samples (must be done individually due to circular buffer)
+                    float x_vals[4] = {
+                        static_cast<float>(inputBuffer[idx0]),
+                        static_cast<float>(inputBuffer[idx1]),
+                        static_cast<float>(inputBuffer[idx2]),
+                        static_cast<float>(inputBuffer[idx3])};
+                    float32x4_t x = vld1q_f32(x_vals);
+                    // Load 4 weights
+                    float32x4_t w = vld1q_f32(reinterpret_cast<const float *>(&weights[i]));
+                    // Apply leakage: w *= leakage
+                    w = vmulq_f32(w, leakage_vec);
+                    // Fused multiply-add: w += mu_error * x
+                    w = vmlaq_f32(w, mu_error_vec, x);
+                    // Store updated weights
+                    vst1q_f32(reinterpret_cast<float *>(&weights[i]), w);
+                }
+                // Handle remainder (scalar)
+                for (size_t i = simd_end; i < m_numTaps; ++i)
+                {
+                    size_t idx = (writeIdx + m_numTaps - 1 - i) % m_numTaps;
+                    T x_i = inputBuffer[idx];
+                    weights[i] = leakage * weights[i] + mu_error * x_i;
+                }
+#else
+                // Scalar weight update for non-ARM platforms
                 for (size_t i = 0; i < m_numTaps; ++i)
                 {
                     size_t idx = (writeIdx + m_numTaps - 1 - i) % m_numTaps;
@@ -279,6 +334,7 @@ namespace dsp
                     // Apply leaky LMS update
                     weights[i] = leakage * weights[i] + mu_n * error * x_i;
                 }
+#endif
             }
         };

package/src/native/core/FirFilter.cc CHANGED Viewed

@@ -28,6 +28,20 @@ namespace dsp
                 throw std::invalid_argument("FIR filter requires at least one coefficient");
             }
+#if defined(__ARM_NEON) || defined(__aarch64__)
+            // Auto-select NEON for float32 + small-medium taps (where transposed form wins)
+            // For large taps (>128), circular buffer's O(1) state update is better than O(N) shift
+            m_useNeon = false;
+            if constexpr (std::is_same_v<T, float>)
+            {
+                if (stateful && coefficients.size() >= 8 && coefficients.size() <= 128)
+                {
+                    m_neonFilter = std::make_unique<FirFilterNeon>(coefficients);
+                    m_useNeon = true;
+                }
+            }
+#endif
             if (stateful)
             {
                 // Round up to next power of 2 for efficient circular buffer (enables bitwise AND instead of modulo)
@@ -51,6 +65,14 @@ namespace dsp
                 throw std::runtime_error("processSample() requires stateful mode");
             }
+#if defined(__ARM_NEON) || defined(__aarch64__)
+            // Use NEON filter if available
+            if (m_useNeon && m_neonFilter)
+            {
+                return static_cast<T>(m_neonFilter->processSample(static_cast<float>(input)));
+            }
+#endif
             // Store input in circular buffer
             m_state[m_stateIndex] = input;
@@ -86,6 +108,18 @@ namespace dsp
         template <typename T>
         void FirFilter<T>::process(const T *input, T *output, size_t length, bool stateless)
         {
+#if defined(__ARM_NEON) || defined(__aarch64__)
+            // Use NEON batch processing for stateful mode
+            if (!stateless && m_stateful && m_useNeon && m_neonFilter)
+            {
+                for (size_t i = 0; i < length; ++i)
+                {
+                    output[i] = static_cast<T>(m_neonFilter->processSample(static_cast<float>(input[i])));
+                }
+                return;
+            }
+#endif
             if (stateless || !m_stateful)
             {
                 // Stateless mode: each output depends only on current window
@@ -166,6 +200,13 @@ namespace dsp
         template <typename T>
         void FirFilter<T>::reset()
         {
+#if defined(__ARM_NEON) || defined(__aarch64__)
+            if (m_useNeon && m_neonFilter)
+            {
+                m_neonFilter->reset();
+            }
+#endif
             if (m_stateful)
             {
                 std::fill(m_state.begin(), m_state.end(), T(0));
@@ -183,6 +224,23 @@ namespace dsp
             m_coefficients = coefficients;
+#if defined(__ARM_NEON) || defined(__aarch64__)
+            // Update NEON filter if in use
+            m_useNeon = false;
+            if constexpr (std::is_same_v<T, float>)
+            {
+                if (m_stateful && coefficients.size() >= 8 && coefficients.size() <= 128)
+                {
+                    m_neonFilter = std::make_unique<FirFilterNeon>(coefficients);
+                    m_useNeon = true;
+                }
+                else
+                {
+                    m_neonFilter.reset();
+                }
+            }
+#endif
             if (m_stateful)
             {
                 // Round up to next power of 2

package/src/native/core/FirFilter.h CHANGED Viewed

@@ -20,6 +20,11 @@
 #include <memory>
 #include "../utils/CircularBufferArray.h"
+// Include NEON-optimized filter for ARM platforms
+#if defined(__ARM_NEON) || defined(__aarch64__)
+#include "FirFilterNeon.h"
+#endif
 namespace dsp
 {
     namespace core
@@ -138,6 +143,12 @@ namespace dsp
             size_t m_stateMask;            // Bitmask for power-of-2 circular buffer (replaces modulo)
             bool m_stateful;               // Whether to maintain state between calls
+#if defined(__ARM_NEON) || defined(__aarch64__)
+            // NEON-optimized filter for ARM (auto-selected for small-medium taps + float32)
+            std::unique_ptr<FirFilterNeon> m_neonFilter;
+            bool m_useNeon;
+#endif
             /**
              * Compute single output sample via convolution
              * @param input Current input sample

package/src/native/core/FirFilterNeon.h ADDED Viewed

@@ -0,0 +1,233 @@
+#pragma once
+/**
+ * @file FirFilterNeon.h
+ * @brief ARM NEON-optimized FIR filter with guard-zone circular buffer
+ *
+ * This implementation keeps O(1) state updates while enabling fully contiguous
+ * NEON vectorization using a "guard zone" (mirrored buffer) technique.
+ *
+ * Key insight: Allocate buffer of size N + GUARD (where GUARD >= max SIMD width).
+ * When writing sample at index i, also write it at i+N. This ensures that any
+ * NEON load starting from 'head' can read contiguously without wrap-around logic.
+ *
+ * Performance: O(1) state update + fully vectorized O(N) convolution.
+ * Expected gain vs naive circular buffer: 3-6x for 16-128 tap filters on ARM.
+ */
+#include <vector>
+#include <cstddef>
+#include <cstring>
+#include <stdexcept>
+#include <algorithm>
+#if defined(__ARM_NEON) || defined(__aarch64__)
+#include <arm_neon.h>
+#endif
+namespace dsp::core
+{
+    /**
+     * @brief High-performance NEON-optimized FIR filter using guard-zone circular buffer
+     *
+     * Architecture:
+     * - Circular buffer with power-of-2 size for bitmask wrapping (O(1) update)
+     * - Guard zone (mirrored tail) to make SIMD reads always contiguous
+     * - Coefficients stored in forward order (newest sample = h[0])
+     * - NEON kernel reads forward from 'head' with no modulo in inner loop
+     *
+     * This gives best of both worlds:
+     * 1. O(1) state updates (increment head, write sample + guard)
+     * 2. Fully contiguous NEON loads (no gather/scatter)
+     * 3. No memmove/shift overhead (eliminated algorithmic regression)
+     */
+    class FirFilterNeon
+    {
+    public:
+        explicit FirFilterNeon(const std::vector<float> &coefficients)
+            : m_numTaps(coefficients.size()),
+              m_head(0)
+        {
+            if (coefficients.empty())
+            {
+                throw std::invalid_argument("FIR coefficients cannot be empty");
+            }
+            // Round up to next power of 2 for bitmask wrapping
+            m_bufferSize = 1;
+            while (m_bufferSize < m_numTaps)
+            {
+                m_bufferSize <<= 1;
+            }
+            m_headMask = m_bufferSize - 1;
+            // Store coefficients in FORWARD order (h[0] = newest tap)
+            // This matches the circular buffer access pattern
+            m_coefficients = coefficients;
+            // Allocate state buffer + guard zone
+            // Guard zone mirrors the entire circular buffer for contiguous wraparound reads
+            m_state.resize(m_bufferSize * 2, 0.0f);
+        }
+        /**
+         * @brief Process single sample (stateful, streaming mode)
+         * @param input New input sample
+         * @return Filtered output
+         */
+        float processSample(float input)
+        {
+#if defined(__ARM_NEON) || defined(__aarch64__)
+            return processSampleNeon(input);
+#else
+            return processSampleScalar(input);
+#endif
+        }
+        /**
+         * @brief Process batch of samples in-place
+         * @param buffer Input/output buffer
+         * @param numSamples Number of samples to process
+         */
+        void processBatch(float *buffer, size_t numSamples)
+        {
+            for (size_t i = 0; i < numSamples; ++i)
+            {
+                buffer[i] = processSample(buffer[i]);
+            }
+        }
+        /**
+         * @brief Reset filter state (clear circular buffer and guard zone)
+         */
+        void reset()
+        {
+            std::fill(m_state.begin(), m_state.end(), 0.0f);
+            m_head = 0;
+        }
+        size_t getNumTaps() const { return m_numTaps; }
+        size_t getBufferSize() const { return m_bufferSize; }
+    private:
+        size_t m_numTaps;                  // Number of filter taps
+        size_t m_bufferSize;               // Power-of-2 buffer size (>= m_numTaps)
+        size_t m_head;                     // Current write position
+        size_t m_headMask;                 // Bitmask for wrapping (bufferSize - 1)
+        std::vector<float> m_coefficients; // Filter coefficients (forward order)
+        std::vector<float> m_state;        // Circular buffer + guard zone
+#if defined(__ARM_NEON) || defined(__aarch64__)
+        /**
+         * @brief NEON-optimized sample processing with guard-zone circular buffer
+         *
+         * Algorithm:
+         * 1. Write input to state[head] and state[head + bufferSize] (guard mirror)
+         * 2. Read N contiguous floats starting from state[head] using NEON
+         * 3. Compute dot product with coefficients (fully vectorized)
+         * 4. Advance head with bitmask wrapping (O(1))
+         *
+         * Key: The guard zone ensures that reads from 'head' are ALWAYS contiguous,
+         * even when they logically "wrap around" the circular buffer boundary.
+         */
+        float processSampleNeon(float input)
+        {
+            // Advance head FIRST (points to oldest sample position)
+            m_head = (m_head + 1) & m_headMask;
+            // Write input to current position AND guard zone (O(1) mirroring)
+            m_state[m_head] = input;
+            // Always mirror to guard zone - this is critical for wraparound reads!
+            m_state[m_head + m_bufferSize] = input;
+            // NEON convolution: read BACKWARD from m_head (newest to oldest)
+            // m_head points to newest sample, we need to read m_numTaps samples backward
+            // The guard zone ensures contiguous reads even across the wrap boundary
+            // Calculate start position: if m_head >= (numTaps-1), read from [m_head - numTaps + 1]
+            // Otherwise, read from guard zone: [m_head + bufferSize - numTaps + 1]
+            size_t readStart;
+            if (m_head >= m_numTaps - 1)
+            {
+                readStart = m_head - m_numTaps + 1;
+            }
+            else
+            {
+                // Wrap using guard zone (no modulo needed!)
+                readStart = m_head + m_bufferSize - m_numTaps + 1;
+            }
+            const float *x = &m_state[readStart];
+            const float *h = m_coefficients.data();
+            constexpr size_t simd_width = 4;
+            const size_t simd_end = (m_numTaps / simd_width) * simd_width;
+            float32x4_t acc = vdupq_n_f32(0.0f);
+            // Vectorized MAC loop (no modulo, no branches!)
+            for (size_t i = 0; i < simd_end; i += simd_width)
+            {
+                float32x4_t c = vld1q_f32(h + i);
+                float32x4_t d = vld1q_f32(x + i);
+                acc = vmlaq_f32(acc, c, d); // Fused multiply-add
+            }
+            // Horizontal reduction
+#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+            // ARMv8.1-a and later: use vaddvq_f32
+            float output = vaddvq_f32(acc);
+#else
+            // ARMv8.0 fallback: manual pairwise addition
+            float32x2_t sum_lo = vget_low_f32(acc);
+            float32x2_t sum_hi = vget_high_f32(acc);
+            float32x2_t sum_pair = vadd_f32(sum_lo, sum_hi);
+            float32x2_t sum_final = vpadd_f32(sum_pair, sum_pair);
+            float output = vget_lane_f32(sum_final, 0);
+#endif
+            // Scalar tail (remaining 0-3 taps)
+            for (size_t i = simd_end; i < m_numTaps; ++i)
+            {
+                output += h[i] * x[i];
+            }
+            return output;
+        }
+#endif
+        /**
+         * @brief Scalar fallback for non-ARM platforms
+         */
+        float processSampleScalar(float input)
+        {
+            // Advance head FIRST
+            m_head = (m_head + 1) & m_headMask;
+            // Write to circular buffer + guard
+            m_state[m_head] = input;
+            // Always mirror to guard zone
+            m_state[m_head + m_bufferSize] = input;
+            // Compute output (read backward from newest to oldest)
+            float output = 0.0f;
+            size_t readStart;
+            if (m_head >= m_numTaps - 1)
+            {
+                readStart = m_head - m_numTaps + 1;
+            }
+            else
+            {
+                readStart = m_head + m_bufferSize - m_numTaps + 1;
+            }
+            const float *x = &m_state[readStart];
+            const float *h = m_coefficients.data();
+            for (size_t i = 0; i < m_numTaps; ++i)
+            {
+                output += h[i] * x[i];
+            }
+            return output;
+        }
+    };
+} // namespace dsp::core

package/src/native/core/MovingAverageFilter.h CHANGED Viewed

@@ -70,6 +70,27 @@ namespace dsp::core
          */
         T addSample(T newValue) { return m_filter.addSample(newValue); }
+        /**
+         * @brief Process array of samples in batch (optimized for throughput).
+         *
+         * This is significantly faster than calling addSample() in a loop
+         * for small-to-medium input sizes, as it:
+         * 1. Avoids per-call overhead (JS→Native boundary crossing)
+         * 2. Enables better CPU cache utilization
+         * 3. Allows compiler to vectorize the loop
+         *
+         * @param input Input array of samples
+         * @param output Output array (same size as input)
+         * @param length Number of samples to process
+         */
+        void processArray(const T *input, T *output, size_t length)
+        {
+            for (size_t i = 0; i < length; ++i)
+            {
+                output[i] = addSample(input[i]);
+            }
+        }
         /**
          * @brief Adds a new sample with timestamp (time-aware mode only).
          * @param newValue The new sample value to add.

package/src/native/utils/SimdOps.h CHANGED Viewed

@@ -293,6 +293,36 @@ namespace dsp::simd
         return total;
+#elif defined(SIMD_NEON)
+        const size_t simd_width = 4;
+        const size_t simd_count = size / simd_width;
+        const size_t simd_end = simd_count * simd_width;
+        // ARM NEON: Accumulate in float, then convert to double for precision
+        float32x4_t acc = vdupq_n_f32(0.0f);
+        for (size_t i = 0; i < simd_end; i += simd_width)
+        {
+            float32x4_t values = vld1q_f32(&buffer[i]);
+            acc = vaddq_f32(acc, values);
+        }
+        // Pairwise add to get horizontal sum, then convert to double
+        float32x2_t sum_lo = vget_low_f32(acc);
+        float32x2_t sum_hi = vget_high_f32(acc);
+        float32x2_t sum_pair = vadd_f32(sum_lo, sum_hi);
+        float32x2_t sum_final = vpadd_f32(sum_pair, sum_pair);
+        double total = static_cast<double>(vget_lane_f32(sum_final, 0));
+        // Handle remainder
+        for (size_t i = simd_end; i < size; ++i)
+        {
+            total += static_cast<double>(buffer[i]);
+        }
+        return total;
 #else
         // Scalar with Kahan summation for precision
         double sum = 0.0;
@@ -396,6 +426,36 @@ namespace dsp::simd
         return total;
+#elif defined(SIMD_NEON)
+        const size_t simd_width = 4;
+        const size_t simd_count = size / simd_width;
+        const size_t simd_end = simd_count * simd_width;
+        // ARM NEON: Accumulate squares in float, then convert to double
+        float32x4_t acc = vdupq_n_f32(0.0f);
+        for (size_t i = 0; i < simd_end; i += simd_width)
+        {
+            float32x4_t values = vld1q_f32(&buffer[i]);
+            // Fused multiply-add: acc += values * values
+            acc = vmlaq_f32(acc, values, values);
+        }
+        // Convert to double for precision
+        float temp[4];
+        vst1q_f32(temp, acc);
+        double total = static_cast<double>(temp[0]) + static_cast<double>(temp[1]) +
+                       static_cast<double>(temp[2]) + static_cast<double>(temp[3]);
+        // Handle remainder
+        for (size_t i = simd_end; i < size; ++i)
+        {
+            double val = static_cast<double>(buffer[i]);
+            total += val * val;
+        }
+        return total;
 #else
         // Scalar with Kahan summation
         double sum = 0.0;