npm - dspx - Versions diffs - 1.4.1 → 1.4.8 - Mend

dspx 1.4.1 → 1.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/README.md +4 -2
package/binding.gyp +1 -1
package/package.json +11 -3
package/prebuilds/win32-x64/dspx.node +0 -0
package/src/native/adapters/TimeAlignmentStage.cc +191 -12
package/src/native/adapters/TimeAlignmentStage.h +5 -5
package/src/native/core/FirFilter.cc +26 -0
package/src/native/core/FirFilterNeon.h +60 -0

package/README.md CHANGED Viewed

@@ -11,6 +11,8 @@
 A modern DSP library built for Node.js backends processing real-time biosignals, audio streams, and sensor data. Features native C++ filters with full state serialization (to Redis, S3, or any storage backend), enabling seamless processing across service restarts and distributed workers.
+[View the benchmarks](https://github.com/A-KGeorge/dspx-benchmark/)
 ---
 ## ✨ Features
@@ -475,7 +477,7 @@ const clean = notch.process(noisySignal);
 import { createDspPipeline, Convolution } from "dspx";
 const pipeline = createDspPipeline();
 pipeline.addStage(
-  new Convolution({ kernel: OPTIMAL_LOWPASS_COEFFS.cutoff_0_2 })
+  new Convolution({ kernel: OPTIMAL_LOWPASS_COEFFS.cutoff_0_2 }),
 );
 // ✅ Zero Python dependency - coefficients ship with the library!
@@ -1661,7 +1663,7 @@ const saveBreaker = new CircuitBreaker(
     timeout: 2000, // Fail if >2s
     errorThresholdPercentage: 50, // Trip after 50% failures
     resetTimeout: 30000, // Try recovery after 30s
-  }
+  },
 );
 saveBreaker.fallback(() => {

package/binding.gyp CHANGED Viewed

@@ -93,7 +93,7 @@
             'OTHER_CPLUSPLUSFLAGS+': [ '-msse3', '-mavx', '-mavx2' ]
           }
         }],
-        # Condition for arm64 architecture (Android, iOS, M1/M2 Macs, Tensor G4, etc.)
+        # Condition for arm64 architecture (Android, iOS, M1/M2 Macs, etc.)
         ['target_arch=="arm64"', {
           # ARMv8-a baseline: NEON + FP support (compatible with all ARMv8 CPUs)
           "cflags+": [ "-march=armv8-a+fp+simd" ],

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "dspx",
-  "version": "1.4.1",
+  "version": "1.4.8",
   "description": "High-performance DSP library with native C++ acceleration and Redis state persistence",
   "main": "./dist/index.js",
   "types": "./dist/index.d.ts",
@@ -21,7 +21,7 @@
     "build:ts": "tsc",
     "build:native": "node-gyp rebuild",
     "build": "npm run build:native && npm run build:ts",
-    "prebuildify": "prebuildify --napi --strip --target 18.0.0 --target 20.0.0 --target 22.0.0",
+    "prebuildify": "prebuildify --napi --strip --target 18.0.0 --target 20.0.0 --target 22.0.0 --target 24.0.0",
     "changeset": "changeset",
     "version": "changeset version",
     "publish-packages": "changeset publish"
@@ -33,7 +33,11 @@
   },
   "repository": {
     "type": "git",
-    "url": "https://github.com/A-KGeorge/dspx"
+    "url": "git+https://github.com/A-KGeorge/dspx.git"
+  },
+  "publishConfig": {
+    "access": "public",
+    "provenance": true
   },
   "keywords": [
     "dsp",
@@ -47,6 +51,10 @@
   "author": "Alan Kochukalam George",
   "license": "Apache-2.0",
   "type": "module",
+  "engines": {
+    "node": ">=18.0.0",
+    "npm": ">=11.5.1"
+  },
   "dependencies": {
     "cross-env": "^7.0.3",
     "node-addon-api": "^8.5.0",

package/prebuilds/win32-x64/dspx.node CHANGED Viewed

Binary file

package/src/native/adapters/TimeAlignmentStage.cc CHANGED Viewed

@@ -18,13 +18,14 @@
 // Debug assertion macro
 #ifdef _DEBUG
-    #define ASSERT_BOUNDS(idx, maxSize, msg) \
-        if ((idx) >= (maxSize)) { \
-            std::cerr << "[BOUNDS ERROR] " << msg << ": idx=" << (idx) << ", max=" << (maxSize) << std::endl; \
-            throw std::out_of_range(msg); \
-        }
+#define ASSERT_BOUNDS(idx, maxSize, msg)                                                                  \
+    if ((idx) >= (maxSize))                                                                               \
+    {                                                                                                     \
+        std::cerr << "[BOUNDS ERROR] " << msg << ": idx=" << (idx) << ", max=" << (maxSize) << std::endl; \
+        throw std::out_of_range(msg);                                                                     \
+    }
 #else
-    #define ASSERT_BOUNDS(idx, maxSize, msg) ((void)0)
+#define ASSERT_BOUNDS(idx, maxSize, msg) ((void)0)
 #endif
 // Helper function to check debug flag
@@ -234,13 +235,58 @@ namespace dsp
                                                  std::to_string(outIdx) + ", targetTime=" + std::to_string(targetTime));
                     case GapPolicy::ZERO_FILL:
+                    {
+                        size_t writeIdx = outIdx * channels;
+                        ASSERT_BOUNDS(writeIdx + channels - 1, outputSize, "ZERO_FILL output write");
+#if defined(HAS_AVX2)
+                        // AVX2: Zero 8 floats at a time
+                        __m256 zero = _mm256_setzero_ps();
+                        int ch = 0;
+                        for (; ch + 8 <= channels; ch += 8)
+                        {
+                            _mm256_storeu_ps(&outputBuffer[writeIdx + ch], zero);
+                        }
+                        // Scalar remainder
+                        for (; ch < channels; ++ch)
+                        {
+                            outputBuffer[writeIdx + ch] = 0.0f;
+                        }
+#elif defined(HAS_SSE)
+                        // SSE: Zero 4 floats at a time
+                        __m128 zero = _mm_setzero_ps();
+                        int ch = 0;
+                        for (; ch + 4 <= channels; ch += 4)
+                        {
+                            _mm_storeu_ps(&outputBuffer[writeIdx + ch], zero);
+                        }
+                        // Scalar remainder
+                        for (; ch < channels; ++ch)
+                        {
+                            outputBuffer[writeIdx + ch] = 0.0f;
+                        }
+#elif defined(HAS_NEON)
+                        // NEON: Zero 4 floats at a time
+                        float32x4_t zero = vdupq_n_f32(0.0f);
+                        int ch = 0;
+                        for (; ch + 4 <= channels; ch += 4)
+                        {
+                            vst1q_f32(&outputBuffer[writeIdx + ch], zero);
+                        }
+                        // Scalar remainder
+                        for (; ch < channels; ++ch)
+                        {
+                            outputBuffer[writeIdx + ch] = 0.0f;
+                        }
+#else
+                        // Scalar fallback
                         for (int ch = 0; ch < channels; ++ch)
                         {
-                            size_t writeIdx = outIdx * channels + ch;
-                            ASSERT_BOUNDS(writeIdx, outputSize, "ZERO_FILL output write");
-                            outputBuffer[writeIdx] = 0.0f;
+                            outputBuffer[writeIdx + ch] = 0.0f;
                         }
-                        break;
+#endif
+                    }
+                    break;
                     case GapPolicy::HOLD:
                         // Hold last valid value before gap
@@ -260,7 +306,7 @@ namespace dsp
                             float t0 = timestamps[gapStart * channels];
                             float t1 = timestamps[gapEnd * channels];
                             float denominator = t1 - t0;
                             // Protection against division by zero
                             if (std::abs(denominator) < 1e-6f)
                             {
@@ -632,6 +678,138 @@ namespace dsp
             size_t centerIdx = findBracketingInterval(targetTime, timestamps, numSamples, channels, searchStart);
+#if defined(HAS_AVX2) || defined(HAS_SSE) || defined(HAS_NEON)
+            // SIMD-optimized path: gather values/weights first, then accumulate 8 (AVX2) or 4 (SSE/NEON) samples at a time
+            float values[windowSize] = {0};
+            float weights[windowSize] = {0};
+            int validCount = 0;
+            // Gather values and compute weights
+            for (int offset = -windowSize / 2; offset < windowSize / 2; ++offset)
+            {
+                int sampleIdx = static_cast<int>(centerIdx) + offset;
+                if (sampleIdx < 0 || sampleIdx >= static_cast<int>(numSamples))
+                    continue;
+                float t = timestamps[sampleIdx * channels];
+                float v = samples[sampleIdx * channels + channel];
+                // Sinc function: sin(π*x) / (π*x)
+                float x = (targetTime - t) * m_estimatedSampleRate / 1000.0f;
+                float sinc = (std::abs(x) < 1e-6f) ? 1.0f : std::sin(M_PI * x) / (M_PI * x);
+                // Hamming window
+                float window = 0.54f - 0.46f * std::cos(2.0f * M_PI * (offset + windowSize / 2.0f) / windowSize);
+                values[validCount] = v;
+                weights[validCount] = sinc * window;
+                validCount++;
+            }
+            // SIMD accumulation
+            float sum = 0.0f;
+            float weightSum = 0.0f;
+#if defined(HAS_AVX2)
+            __m256 vsum = _mm256_setzero_ps();
+            __m256 wsum = _mm256_setzero_ps();
+            int i = 0;
+            for (; i + 8 <= validCount; i += 8)
+            {
+                __m256 v = _mm256_loadu_ps(&values[i]);
+                __m256 w = _mm256_loadu_ps(&weights[i]);
+                vsum = _mm256_add_ps(vsum, _mm256_mul_ps(v, w)); // sum += v * w
+                wsum = _mm256_add_ps(wsum, w);
+            }
+            // Horizontal sum for AVX2
+            __m128 vsum_low = _mm256_castps256_ps128(vsum);
+            __m128 vsum_high = _mm256_extractf128_ps(vsum, 1);
+            __m128 vsum128 = _mm_add_ps(vsum_low, vsum_high);
+            __m128 wsum_low = _mm256_castps256_ps128(wsum);
+            __m128 wsum_high = _mm256_extractf128_ps(wsum, 1);
+            __m128 wsum128 = _mm_add_ps(wsum_low, wsum_high);
+            // Continue with SSE reduction
+            vsum128 = _mm_hadd_ps(vsum128, vsum128);
+            vsum128 = _mm_hadd_ps(vsum128, vsum128);
+            sum = _mm_cvtss_f32(vsum128);
+            wsum128 = _mm_hadd_ps(wsum128, wsum128);
+            wsum128 = _mm_hadd_ps(wsum128, wsum128);
+            weightSum = _mm_cvtss_f32(wsum128);
+            // Scalar remainder
+            for (; i < validCount; ++i)
+            {
+                sum += values[i] * weights[i];
+                weightSum += weights[i];
+            }
+#elif defined(HAS_SSE)
+            __m128 vsum = _mm_setzero_ps();
+            __m128 wsum = _mm_setzero_ps();
+            int i = 0;
+            for (; i + 4 <= validCount; i += 4)
+            {
+                __m128 v = _mm_loadu_ps(&values[i]);
+                __m128 w = _mm_loadu_ps(&weights[i]);
+                vsum = _mm_add_ps(vsum, _mm_mul_ps(v, w));
+                wsum = _mm_add_ps(wsum, w);
+            }
+            // Horizontal sum
+            vsum = _mm_hadd_ps(vsum, vsum);
+            vsum = _mm_hadd_ps(vsum, vsum);
+            sum = _mm_cvtss_f32(vsum);
+            wsum = _mm_hadd_ps(wsum, wsum);
+            wsum = _mm_hadd_ps(wsum, wsum);
+            weightSum = _mm_cvtss_f32(wsum);
+            // Scalar remainder
+            for (; i < validCount; ++i)
+            {
+                sum += values[i] * weights[i];
+                weightSum += weights[i];
+            }
+#elif defined(HAS_NEON)
+            float32x4_t vsum = vdupq_n_f32(0.0f);
+            float32x4_t wsum = vdupq_n_f32(0.0f);
+            int i = 0;
+            for (; i + 4 <= validCount; i += 4)
+            {
+                float32x4_t v = vld1q_f32(&values[i]);
+                float32x4_t w = vld1q_f32(&weights[i]);
+                vsum = vmlaq_f32(vsum, v, w); // vsum += v * w
+                wsum = vaddq_f32(wsum, w);
+            }
+            // Horizontal sum
+            float32x2_t vsum_low = vget_low_f32(vsum);
+            float32x2_t vsum_high = vget_high_f32(vsum);
+            float32x2_t vsum_pair = vadd_f32(vsum_low, vsum_high);
+            sum = vget_lane_f32(vpadd_f32(vsum_pair, vsum_pair), 0);
+            float32x2_t wsum_low = vget_low_f32(wsum);
+            float32x2_t wsum_high = vget_high_f32(wsum);
+            float32x2_t wsum_pair = vadd_f32(wsum_low, wsum_high);
+            weightSum = vget_lane_f32(vpadd_f32(wsum_pair, wsum_pair), 0);
+            // Scalar remainder
+            for (; i < validCount; ++i)
+            {
+                sum += values[i] * weights[i];
+                weightSum += weights[i];
+            }
+#endif
+            output = (weightSum > 0.0f) ? (sum / weightSum) : 0.0f;
+#else
+            // Scalar fallback
             float sum = 0.0f;
             float weightSum = 0.0f;
@@ -645,7 +823,7 @@ namespace dsp
                 float v = samples[sampleIdx * channels + channel];
                 // Sinc function: sin(π*x) / (π*x)
-                float x = (targetTime - t) * m_estimatedSampleRate / 1000.0f; // Normalize by sample rate
+                float x = (targetTime - t) * m_estimatedSampleRate / 1000.0f;
                 float sinc = (std::abs(x) < 1e-6f) ? 1.0f : std::sin(M_PI * x) / (M_PI * x);
                 // Hamming window
@@ -657,6 +835,7 @@ namespace dsp
             }
             output = (weightSum > 0.0f) ? (sum / weightSum) : 0.0f;
+#endif
             searchStart = centerIdx;
         }

package/src/native/adapters/TimeAlignmentStage.h CHANGED Viewed

@@ -58,11 +58,11 @@ namespace dsp
          * TimeAlignmentStage: Production-grade irregular timestamp resampling
          *
          * This stage solves the problems identified in Gemini's analysis:
-         * 1. ✅ Time-based coordinate system (not index-based)
-         * 2. ✅ Gap detection and handling policies
-         * 3. ✅ Clock drift compensation
-         * 4. ✅ Proper SIMD optimization for irregular data
-         * 5. ✅ Configurable extrapolation/error handling
+         * 1. Time-based coordinate system (not index-based)
+         * 2. Gap detection and handling policies
+         * 3. Clock drift compensation
+         * 4. Proper SIMD optimization for irregular data
+         * 5. Configurable extrapolation/error handling
          *
          * Usage:
          *   auto stage = TimeAlignmentStage(

package/src/native/core/FirFilter.cc CHANGED Viewed

@@ -318,6 +318,19 @@ namespace dsp
         template <typename T>
         std::pair<std::vector<T>, size_t> FirFilter<T>::getState() const
         {
+#if defined(__ARM_NEON) || defined(__aarch64__)
+            if (m_useNeon && m_neonFilter)
+            {
+                // Use NEON filter's linearization on ARM
+                auto [linearState, stateIndex] = m_neonFilter->exportLinearState();
+                std::vector<T> state(linearState.size());
+                for (size_t i = 0; i < linearState.size(); ++i)
+                {
+                    state[i] = static_cast<T>(linearState[i]);
+                }
+                return {state, stateIndex};
+            }
+#endif
             return {m_state, m_stateIndex};
         }
@@ -339,6 +352,19 @@ namespace dsp
                 throw std::invalid_argument("stateIndex out of range");
             }
+#if defined(__ARM_NEON) || defined(__aarch64__)
+            if (m_useNeon && m_neonFilter)
+            {
+                // Convert to float vector for NEON filter's linearization
+                std::vector<float> floatState(state.size());
+                for (size_t i = 0; i < state.size(); ++i)
+                {
+                    floatState[i] = static_cast<float>(state[i]);
+                }
+                m_neonFilter->importLinearState(floatState, stateIndex);
+                return;
+            }
+#endif
             m_state = state;
             m_stateIndex = stateIndex;
         }

package/src/native/core/FirFilterNeon.h CHANGED Viewed

@@ -227,6 +227,66 @@ namespace dsp::core
             m_samplesProcessed = 0;
         }
+        /**
+         * @brief Export state in linear format (oldest->newest) for serialization
+         *
+         * NEON builds: reads m_bufferSize entries from the buffer returned by
+         * getState(), starting (m_numTaps - 1) slots behind m_head and wrapping
+         * each index with m_headMask, and reports m_numTaps - 1 as the state index.
+         * Presumably m_headMask == m_bufferSize - 1 (power-of-two buffer), in which
+         * case the entry at m_head ends up at index m_numTaps - 1 - TODO confirm.
+         *
+         * Other builds: copies the first m_bufferSize entries unchanged and
+         * reports m_head as the state index.
+         *
+         * @return Pair of linear state vector (m_bufferSize entries) and state index
+         */
+        std::pair<std::vector<float>, size_t> exportLinearState() const
+        {
+#if defined(__ARM_NEON) || defined(__aarch64__)
+            const float *state = getState();
+            std::vector<float> linearState(m_bufferSize, 0.0f);
+            // Calculate oldest sample position in circular buffer: (m_numTaps - 1) slots behind m_head; the second arm adds m_bufferSize before subtracting so the unsigned result does not go below zero when m_head is small (assumes m_bufferSize >= m_numTaps - 1 - TODO confirm)
+            size_t oldestPos = (m_head >= m_numTaps - 1)
+                                   ? m_head - (m_numTaps - 1)
+                                   : m_head + m_bufferSize - (m_numTaps - 1);
+            // Un-rotate: copy from oldest->newest into linear array, wrapping the read index with m_headMask
+            for (size_t i = 0; i < m_bufferSize; ++i)
+            {
+                linearState[i] = state[(oldestPos + i) & m_headMask];
+            }
+            return {linearState, m_numTaps - 1};
+#else
+            // Scalar path: state is already linear, so it is copied as-is and m_head is returned unchanged
+            const float *state = getState();
+            std::vector<float> linearState(state, state + m_bufferSize);
+            return {linearState, m_head};
+#endif
+        }
+        /**
+         * @brief Import state from linear format (oldest->newest) after deserialization
+         *
+         * Copies min(m_bufferSize, linearState.size()) entries into the buffer
+         * returned by getState(), starting at index 0; entries past that count are
+         * left untouched. NEON builds additionally write each entry at offset
+         * i + m_bufferSize. Finally stateIndex is assigned to m_head.
+         *
+         * NOTE(review): stateIndex is stored without a range check or masking in
+         * this method - confirm callers validate it against the buffer size.
+         * NOTE(review): m_samplesProcessed is not modified here - verify that is
+         * intended when restoring a serialized filter.
+         *
+         * @param linearState Linear state vector (oldest->newest)
+         * @param stateIndex State index, assigned directly to m_head (exportLinearState reports m_numTaps - 1 on NEON builds)
+         */
+        void importLinearState(const std::vector<float> &linearState, size_t stateIndex)
+        {
+#if defined(__ARM_NEON) || defined(__aarch64__)
+            float *state = getState();
+            // Copy linear state into circular buffer (bounded by both m_bufferSize and the input length)
+            for (size_t i = 0; i < m_bufferSize && i < linearState.size(); ++i)
+            {
+                state[i] = linearState[i];
+                state[i + m_bufferSize] = linearState[i]; // Guard zone: same value written again at offset m_bufferSize
+            }
+            // Set head to point where newest sample will be written
+            m_head = stateIndex;
+#else
+            // Scalar path: direct copy into the first m_bufferSize slots, no second write
+            float *state = getState();
+            for (size_t i = 0; i < m_bufferSize && i < linearState.size(); ++i)
+            {
+                state[i] = linearState[i];
+            }
+            m_head = stateIndex;
+#endif
+        }
         size_t getNumTaps() const { return m_numTaps; }
         size_t getBufferSize() const { return m_bufferSize; }