dspx 1.4.1 → 1.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -11,6 +11,8 @@
11
11
 
12
12
  A modern DSP library built for Node.js backends processing real-time biosignals, audio streams, and sensor data. Features native C++ filters with full state serialization (to Redis, S3, or any storage backend), enabling seamless processing across service restarts and distributed workers.
13
13
 
14
+ [View the benchmarks](https://github.com/A-KGeorge/dspx-benchmark/)
15
+
14
16
  ---
15
17
 
16
18
  ## ✨ Features
@@ -475,7 +477,7 @@ const clean = notch.process(noisySignal);
475
477
  import { createDspPipeline, Convolution } from "dspx";
476
478
  const pipeline = createDspPipeline();
477
479
  pipeline.addStage(
478
- new Convolution({ kernel: OPTIMAL_LOWPASS_COEFFS.cutoff_0_2 })
480
+ new Convolution({ kernel: OPTIMAL_LOWPASS_COEFFS.cutoff_0_2 }),
479
481
  );
480
482
 
481
483
  // ✅ Zero Python dependency - coefficients ship with the library!
@@ -1661,7 +1663,7 @@ const saveBreaker = new CircuitBreaker(
1661
1663
  timeout: 2000, // Fail if >2s
1662
1664
  errorThresholdPercentage: 50, // Trip after 50% failures
1663
1665
  resetTimeout: 30000, // Try recovery after 30s
1664
- }
1666
+ },
1665
1667
  );
1666
1668
 
1667
1669
  saveBreaker.fallback(() => {
package/binding.gyp CHANGED
@@ -93,7 +93,7 @@
93
93
  'OTHER_CPLUSPLUSFLAGS+': [ '-msse3', '-mavx', '-mavx2' ]
94
94
  }
95
95
  }],
96
- # Condition for arm64 architecture (Android, iOS, M1/M2 Macs, Tensor G4, etc.)
96
+ # Condition for arm64 architecture (Android, iOS, M1/M2 Macs, etc.)
97
97
  ['target_arch=="arm64"', {
98
98
  # ARMv8-a baseline: NEON + FP support (compatible with all ARMv8 CPUs)
99
99
  "cflags+": [ "-march=armv8-a+fp+simd" ],
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "dspx",
3
- "version": "1.4.1",
3
+ "version": "1.4.8",
4
4
  "description": "High-performance DSP library with native C++ acceleration and Redis state persistence",
5
5
  "main": "./dist/index.js",
6
6
  "types": "./dist/index.d.ts",
@@ -21,7 +21,7 @@
21
21
  "build:ts": "tsc",
22
22
  "build:native": "node-gyp rebuild",
23
23
  "build": "npm run build:native && npm run build:ts",
24
- "prebuildify": "prebuildify --napi --strip --target 18.0.0 --target 20.0.0 --target 22.0.0",
24
+ "prebuildify": "prebuildify --napi --strip --target 18.0.0 --target 20.0.0 --target 22.0.0 --target 24.0.0",
25
25
  "changeset": "changeset",
26
26
  "version": "changeset version",
27
27
  "publish-packages": "changeset publish"
@@ -33,7 +33,11 @@
33
33
  },
34
34
  "repository": {
35
35
  "type": "git",
36
- "url": "https://github.com/A-KGeorge/dspx"
36
+ "url": "git+https://github.com/A-KGeorge/dspx.git"
37
+ },
38
+ "publishConfig": {
39
+ "access": "public",
40
+ "provenance": true
37
41
  },
38
42
  "keywords": [
39
43
  "dsp",
@@ -47,6 +51,10 @@
47
51
  "author": "Alan Kochukalam George",
48
52
  "license": "Apache-2.0",
49
53
  "type": "module",
54
+ "engines": {
55
+ "node": ">=18.0.0",
56
+ "npm": ">=11.5.1"
57
+ },
50
58
  "dependencies": {
51
59
  "cross-env": "^7.0.3",
52
60
  "node-addon-api": "^8.5.0",
Binary file
@@ -18,13 +18,14 @@
18
18
 
19
19
  // Debug assertion macro
20
20
  #ifdef _DEBUG
21
- #define ASSERT_BOUNDS(idx, maxSize, msg) \
22
- if ((idx) >= (maxSize)) { \
23
- std::cerr << "[BOUNDS ERROR] " << msg << ": idx=" << (idx) << ", max=" << (maxSize) << std::endl; \
24
- throw std::out_of_range(msg); \
25
- }
21
+ #define ASSERT_BOUNDS(idx, maxSize, msg) \
22
+ if ((idx) >= (maxSize)) \
23
+ { \
24
+ std::cerr << "[BOUNDS ERROR] " << msg << ": idx=" << (idx) << ", max=" << (maxSize) << std::endl; \
25
+ throw std::out_of_range(msg); \
26
+ }
26
27
  #else
27
- #define ASSERT_BOUNDS(idx, maxSize, msg) ((void)0)
28
+ #define ASSERT_BOUNDS(idx, maxSize, msg) ((void)0)
28
29
  #endif
29
30
 
30
31
  // Helper function to check debug flag
@@ -234,13 +235,58 @@ namespace dsp
234
235
  std::to_string(outIdx) + ", targetTime=" + std::to_string(targetTime));
235
236
 
236
237
  case GapPolicy::ZERO_FILL:
238
+ {
239
+ size_t writeIdx = outIdx * channels;
240
+ ASSERT_BOUNDS(writeIdx + channels - 1, outputSize, "ZERO_FILL output write");
241
+
242
+ #if defined(HAS_AVX2)
243
+ // AVX2: Zero 8 floats at a time
244
+ __m256 zero = _mm256_setzero_ps();
245
+ int ch = 0;
246
+ for (; ch + 8 <= channels; ch += 8)
247
+ {
248
+ _mm256_storeu_ps(&outputBuffer[writeIdx + ch], zero);
249
+ }
250
+ // Scalar remainder
251
+ for (; ch < channels; ++ch)
252
+ {
253
+ outputBuffer[writeIdx + ch] = 0.0f;
254
+ }
255
+ #elif defined(HAS_SSE)
256
+ // SSE: Zero 4 floats at a time
257
+ __m128 zero = _mm_setzero_ps();
258
+ int ch = 0;
259
+ for (; ch + 4 <= channels; ch += 4)
260
+ {
261
+ _mm_storeu_ps(&outputBuffer[writeIdx + ch], zero);
262
+ }
263
+ // Scalar remainder
264
+ for (; ch < channels; ++ch)
265
+ {
266
+ outputBuffer[writeIdx + ch] = 0.0f;
267
+ }
268
+ #elif defined(HAS_NEON)
269
+ // NEON: Zero 4 floats at a time
270
+ float32x4_t zero = vdupq_n_f32(0.0f);
271
+ int ch = 0;
272
+ for (; ch + 4 <= channels; ch += 4)
273
+ {
274
+ vst1q_f32(&outputBuffer[writeIdx + ch], zero);
275
+ }
276
+ // Scalar remainder
277
+ for (; ch < channels; ++ch)
278
+ {
279
+ outputBuffer[writeIdx + ch] = 0.0f;
280
+ }
281
+ #else
282
+ // Scalar fallback
237
283
  for (int ch = 0; ch < channels; ++ch)
238
284
  {
239
- size_t writeIdx = outIdx * channels + ch;
240
- ASSERT_BOUNDS(writeIdx, outputSize, "ZERO_FILL output write");
241
- outputBuffer[writeIdx] = 0.0f;
285
+ outputBuffer[writeIdx + ch] = 0.0f;
242
286
  }
243
- break;
287
+ #endif
288
+ }
289
+ break;
244
290
 
245
291
  case GapPolicy::HOLD:
246
292
  // Hold last valid value before gap
@@ -260,7 +306,7 @@ namespace dsp
260
306
  float t0 = timestamps[gapStart * channels];
261
307
  float t1 = timestamps[gapEnd * channels];
262
308
  float denominator = t1 - t0;
263
-
309
+
264
310
  // Protection against division by zero
265
311
  if (std::abs(denominator) < 1e-6f)
266
312
  {
@@ -632,6 +678,138 @@ namespace dsp
632
678
 
633
679
  size_t centerIdx = findBracketingInterval(targetTime, timestamps, numSamples, channels, searchStart);
634
680
 
681
+ #if defined(HAS_AVX2) || defined(HAS_SSE) || defined(HAS_NEON)
682
+ // SIMD-optimized path: Process 4 samples at a time
683
+ float values[windowSize] = {0};
684
+ float weights[windowSize] = {0};
685
+ int validCount = 0;
686
+
687
+ // Gather values and compute weights
688
+ for (int offset = -windowSize / 2; offset < windowSize / 2; ++offset)
689
+ {
690
+ int sampleIdx = static_cast<int>(centerIdx) + offset;
691
+ if (sampleIdx < 0 || sampleIdx >= static_cast<int>(numSamples))
692
+ continue;
693
+
694
+ float t = timestamps[sampleIdx * channels];
695
+ float v = samples[sampleIdx * channels + channel];
696
+
697
+ // Sinc function: sin(π*x) / (π*x)
698
+ float x = (targetTime - t) * m_estimatedSampleRate / 1000.0f;
699
+ float sinc = (std::abs(x) < 1e-6f) ? 1.0f : std::sin(M_PI * x) / (M_PI * x);
700
+
701
+ // Hamming window
702
+ float window = 0.54f - 0.46f * std::cos(2.0f * M_PI * (offset + windowSize / 2.0f) / windowSize);
703
+
704
+ values[validCount] = v;
705
+ weights[validCount] = sinc * window;
706
+ validCount++;
707
+ }
708
+
709
+ // SIMD accumulation
710
+ float sum = 0.0f;
711
+ float weightSum = 0.0f;
712
+
713
+ #if defined(HAS_AVX2)
714
+ __m256 vsum = _mm256_setzero_ps();
715
+ __m256 wsum = _mm256_setzero_ps();
716
+
717
+ int i = 0;
718
+ for (; i + 8 <= validCount; i += 8)
719
+ {
720
+ __m256 v = _mm256_loadu_ps(&values[i]);
721
+ __m256 w = _mm256_loadu_ps(&weights[i]);
722
+ vsum = _mm256_add_ps(vsum, _mm256_mul_ps(v, w)); // sum += v * w
723
+ wsum = _mm256_add_ps(wsum, w);
724
+ }
725
+
726
+ // Horizontal sum for AVX2
727
+ __m128 vsum_low = _mm256_castps256_ps128(vsum);
728
+ __m128 vsum_high = _mm256_extractf128_ps(vsum, 1);
729
+ __m128 vsum128 = _mm_add_ps(vsum_low, vsum_high);
730
+
731
+ __m128 wsum_low = _mm256_castps256_ps128(wsum);
732
+ __m128 wsum_high = _mm256_extractf128_ps(wsum, 1);
733
+ __m128 wsum128 = _mm_add_ps(wsum_low, wsum_high);
734
+
735
+ // Continue with SSE reduction
736
+ vsum128 = _mm_hadd_ps(vsum128, vsum128);
737
+ vsum128 = _mm_hadd_ps(vsum128, vsum128);
738
+ sum = _mm_cvtss_f32(vsum128);
739
+
740
+ wsum128 = _mm_hadd_ps(wsum128, wsum128);
741
+ wsum128 = _mm_hadd_ps(wsum128, wsum128);
742
+ weightSum = _mm_cvtss_f32(wsum128);
743
+
744
+ // Scalar remainder
745
+ for (; i < validCount; ++i)
746
+ {
747
+ sum += values[i] * weights[i];
748
+ weightSum += weights[i];
749
+ }
750
+ #elif defined(HAS_SSE)
751
+ __m128 vsum = _mm_setzero_ps();
752
+ __m128 wsum = _mm_setzero_ps();
753
+
754
+ int i = 0;
755
+ for (; i + 4 <= validCount; i += 4)
756
+ {
757
+ __m128 v = _mm_loadu_ps(&values[i]);
758
+ __m128 w = _mm_loadu_ps(&weights[i]);
759
+ vsum = _mm_add_ps(vsum, _mm_mul_ps(v, w));
760
+ wsum = _mm_add_ps(wsum, w);
761
+ }
762
+
763
+ // Horizontal sum
764
+ vsum = _mm_hadd_ps(vsum, vsum);
765
+ vsum = _mm_hadd_ps(vsum, vsum);
766
+ sum = _mm_cvtss_f32(vsum);
767
+
768
+ wsum = _mm_hadd_ps(wsum, wsum);
769
+ wsum = _mm_hadd_ps(wsum, wsum);
770
+ weightSum = _mm_cvtss_f32(wsum);
771
+
772
+ // Scalar remainder
773
+ for (; i < validCount; ++i)
774
+ {
775
+ sum += values[i] * weights[i];
776
+ weightSum += weights[i];
777
+ }
778
+ #elif defined(HAS_NEON)
779
+ float32x4_t vsum = vdupq_n_f32(0.0f);
780
+ float32x4_t wsum = vdupq_n_f32(0.0f);
781
+
782
+ int i = 0;
783
+ for (; i + 4 <= validCount; i += 4)
784
+ {
785
+ float32x4_t v = vld1q_f32(&values[i]);
786
+ float32x4_t w = vld1q_f32(&weights[i]);
787
+ vsum = vmlaq_f32(vsum, v, w); // vsum += v * w
788
+ wsum = vaddq_f32(wsum, w);
789
+ }
790
+
791
+ // Horizontal sum
792
+ float32x2_t vsum_low = vget_low_f32(vsum);
793
+ float32x2_t vsum_high = vget_high_f32(vsum);
794
+ float32x2_t vsum_pair = vadd_f32(vsum_low, vsum_high);
795
+ sum = vget_lane_f32(vpadd_f32(vsum_pair, vsum_pair), 0);
796
+
797
+ float32x2_t wsum_low = vget_low_f32(wsum);
798
+ float32x2_t wsum_high = vget_high_f32(wsum);
799
+ float32x2_t wsum_pair = vadd_f32(wsum_low, wsum_high);
800
+ weightSum = vget_lane_f32(vpadd_f32(wsum_pair, wsum_pair), 0);
801
+
802
+ // Scalar remainder
803
+ for (; i < validCount; ++i)
804
+ {
805
+ sum += values[i] * weights[i];
806
+ weightSum += weights[i];
807
+ }
808
+ #endif
809
+
810
+ output = (weightSum > 0.0f) ? (sum / weightSum) : 0.0f;
811
+ #else
812
+ // Scalar fallback
635
813
  float sum = 0.0f;
636
814
  float weightSum = 0.0f;
637
815
 
@@ -645,7 +823,7 @@ namespace dsp
645
823
  float v = samples[sampleIdx * channels + channel];
646
824
 
647
825
  // Sinc function: sin(π*x) / (π*x)
648
- float x = (targetTime - t) * m_estimatedSampleRate / 1000.0f; // Normalize by sample rate
826
+ float x = (targetTime - t) * m_estimatedSampleRate / 1000.0f;
649
827
  float sinc = (std::abs(x) < 1e-6f) ? 1.0f : std::sin(M_PI * x) / (M_PI * x);
650
828
 
651
829
  // Hamming window
@@ -657,6 +835,7 @@ namespace dsp
657
835
  }
658
836
 
659
837
  output = (weightSum > 0.0f) ? (sum / weightSum) : 0.0f;
838
+ #endif
660
839
  searchStart = centerIdx;
661
840
  }
662
841
 
@@ -58,11 +58,11 @@ namespace dsp
58
58
  * TimeAlignmentStage: Production-grade irregular timestamp resampling
59
59
  *
60
60
  * This stage solves the problems identified in Gemini's analysis:
61
- * 1. Time-based coordinate system (not index-based)
62
- * 2. Gap detection and handling policies
63
- * 3. Clock drift compensation
64
- * 4. Proper SIMD optimization for irregular data
65
- * 5. Configurable extrapolation/error handling
61
+ * 1. Time-based coordinate system (not index-based)
62
+ * 2. Gap detection and handling policies
63
+ * 3. Clock drift compensation
64
+ * 4. Proper SIMD optimization for irregular data
65
+ * 5. Configurable extrapolation/error handling
66
66
  *
67
67
  * Usage:
68
68
  * auto stage = TimeAlignmentStage(
@@ -318,6 +318,19 @@ namespace dsp
318
318
  template <typename T>
319
319
  std::pair<std::vector<T>, size_t> FirFilter<T>::getState() const
320
320
  {
321
+ #if defined(__ARM_NEON) || defined(__aarch64__)
322
+ if (m_useNeon && m_neonFilter)
323
+ {
324
+ // Use NEON filter's linearization on ARM
325
+ auto [linearState, stateIndex] = m_neonFilter->exportLinearState();
326
+ std::vector<T> state(linearState.size());
327
+ for (size_t i = 0; i < linearState.size(); ++i)
328
+ {
329
+ state[i] = static_cast<T>(linearState[i]);
330
+ }
331
+ return {state, stateIndex};
332
+ }
333
+ #endif
321
334
  return {m_state, m_stateIndex};
322
335
  }
323
336
 
@@ -339,6 +352,19 @@ namespace dsp
339
352
  throw std::invalid_argument("stateIndex out of range");
340
353
  }
341
354
 
355
+ #if defined(__ARM_NEON) || defined(__aarch64__)
356
+ if (m_useNeon && m_neonFilter)
357
+ {
358
+ // Convert to float vector for NEON filter's linearization
359
+ std::vector<float> floatState(state.size());
360
+ for (size_t i = 0; i < state.size(); ++i)
361
+ {
362
+ floatState[i] = static_cast<float>(state[i]);
363
+ }
364
+ m_neonFilter->importLinearState(floatState, stateIndex);
365
+ return;
366
+ }
367
+ #endif
342
368
  m_state = state;
343
369
  m_stateIndex = stateIndex;
344
370
  }
@@ -227,6 +227,66 @@ namespace dsp::core
227
227
  m_samplesProcessed = 0;
228
228
  }
229
229
 
230
+ /**
231
+ * @brief Export state in linear format (oldest->newest) for serialization
232
+ * @return Pair of linear state vector and state index
233
+ */
234
+ std::pair<std::vector<float>, size_t> exportLinearState() const
235
+ {
236
+ #if defined(__ARM_NEON) || defined(__aarch64__)
237
+ const float *state = getState();
238
+ std::vector<float> linearState(m_bufferSize, 0.0f);
239
+
240
+ // Calculate oldest sample position in circular buffer
241
+ size_t oldestPos = (m_head >= m_numTaps - 1)
242
+ ? m_head - (m_numTaps - 1)
243
+ : m_head + m_bufferSize - (m_numTaps - 1);
244
+
245
+ // Un-rotate: copy from oldest->newest into linear array
246
+ for (size_t i = 0; i < m_bufferSize; ++i)
247
+ {
248
+ linearState[i] = state[(oldestPos + i) & m_headMask];
249
+ }
250
+
251
+ return {linearState, m_numTaps - 1};
252
+ #else
253
+ // Scalar path: state is already linear
254
+ const float *state = getState();
255
+ std::vector<float> linearState(state, state + m_bufferSize);
256
+ return {linearState, m_head};
257
+ #endif
258
+ }
259
+
260
+ /**
261
+ * @brief Import state from linear format (oldest->newest) after deserialization
262
+ * @param linearState Linear state vector (oldest->newest)
263
+ * @param stateIndex State index (number of valid samples - 1)
264
+ */
265
+ void importLinearState(const std::vector<float> &linearState, size_t stateIndex)
266
+ {
267
+ #if defined(__ARM_NEON) || defined(__aarch64__)
268
+ float *state = getState();
269
+
270
+ // Copy linear state into circular buffer
271
+ for (size_t i = 0; i < m_bufferSize && i < linearState.size(); ++i)
272
+ {
273
+ state[i] = linearState[i];
274
+ state[i + m_bufferSize] = linearState[i]; // Guard zone
275
+ }
276
+
277
+ // Set head to point where newest sample will be written
278
+ m_head = stateIndex;
279
+ #else
280
+ // Scalar path: direct copy
281
+ float *state = getState();
282
+ for (size_t i = 0; i < m_bufferSize && i < linearState.size(); ++i)
283
+ {
284
+ state[i] = linearState[i];
285
+ }
286
+ m_head = stateIndex;
287
+ #endif
288
+ }
289
+
230
290
  size_t getNumTaps() const { return m_numTaps; }
231
291
  size_t getBufferSize() const { return m_bufferSize; }
232
292