dspx 1.4.1 → 1.4.2

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "dspx",
3
- "version": "1.4.1",
3
+ "version": "1.4.2",
4
4
  "description": "High-performance DSP library with native C++ acceleration and Redis state persistence",
5
5
  "main": "./dist/index.js",
6
6
  "types": "./dist/index.d.ts",
Binary file
@@ -18,13 +18,14 @@
18
18
 
19
19
  // Debug assertion macro
20
20
  #ifdef _DEBUG
21
- #define ASSERT_BOUNDS(idx, maxSize, msg) \
22
- if ((idx) >= (maxSize)) { \
23
- std::cerr << "[BOUNDS ERROR] " << msg << ": idx=" << (idx) << ", max=" << (maxSize) << std::endl; \
24
- throw std::out_of_range(msg); \
25
- }
21
+ #define ASSERT_BOUNDS(idx, maxSize, msg) \
22
+ if ((idx) >= (maxSize)) \
23
+ { \
24
+ std::cerr << "[BOUNDS ERROR] " << msg << ": idx=" << (idx) << ", max=" << (maxSize) << std::endl; \
25
+ throw std::out_of_range(msg); \
26
+ }
26
27
  #else
27
- #define ASSERT_BOUNDS(idx, maxSize, msg) ((void)0)
28
+ #define ASSERT_BOUNDS(idx, maxSize, msg) ((void)0)
28
29
  #endif
29
30
 
30
31
  // Helper function to check debug flag
@@ -234,13 +235,58 @@ namespace dsp
234
235
  std::to_string(outIdx) + ", targetTime=" + std::to_string(targetTime));
235
236
 
236
237
  case GapPolicy::ZERO_FILL:
238
+ {
239
+ size_t writeIdx = outIdx * channels;
240
+ ASSERT_BOUNDS(writeIdx + channels - 1, outputSize, "ZERO_FILL output write");
241
+
242
+ #if defined(HAS_AVX2)
243
+ // AVX2: Zero 8 floats at a time
244
+ __m256 zero = _mm256_setzero_ps();
245
+ int ch = 0;
246
+ for (; ch + 8 <= channels; ch += 8)
247
+ {
248
+ _mm256_storeu_ps(&outputBuffer[writeIdx + ch], zero);
249
+ }
250
+ // Scalar remainder
251
+ for (; ch < channels; ++ch)
252
+ {
253
+ outputBuffer[writeIdx + ch] = 0.0f;
254
+ }
255
+ #elif defined(HAS_SSE)
256
+ // SSE: Zero 4 floats at a time
257
+ __m128 zero = _mm_setzero_ps();
258
+ int ch = 0;
259
+ for (; ch + 4 <= channels; ch += 4)
260
+ {
261
+ _mm_storeu_ps(&outputBuffer[writeIdx + ch], zero);
262
+ }
263
+ // Scalar remainder
264
+ for (; ch < channels; ++ch)
265
+ {
266
+ outputBuffer[writeIdx + ch] = 0.0f;
267
+ }
268
+ #elif defined(HAS_NEON)
269
+ // NEON: Zero 4 floats at a time
270
+ float32x4_t zero = vdupq_n_f32(0.0f);
271
+ int ch = 0;
272
+ for (; ch + 4 <= channels; ch += 4)
273
+ {
274
+ vst1q_f32(&outputBuffer[writeIdx + ch], zero);
275
+ }
276
+ // Scalar remainder
277
+ for (; ch < channels; ++ch)
278
+ {
279
+ outputBuffer[writeIdx + ch] = 0.0f;
280
+ }
281
+ #else
282
+ // Scalar fallback
237
283
  for (int ch = 0; ch < channels; ++ch)
238
284
  {
239
- size_t writeIdx = outIdx * channels + ch;
240
- ASSERT_BOUNDS(writeIdx, outputSize, "ZERO_FILL output write");
241
- outputBuffer[writeIdx] = 0.0f;
285
+ outputBuffer[writeIdx + ch] = 0.0f;
242
286
  }
243
- break;
287
+ #endif
288
+ }
289
+ break;
244
290
 
245
291
  case GapPolicy::HOLD:
246
292
  // Hold last valid value before gap
@@ -260,7 +306,7 @@ namespace dsp
260
306
  float t0 = timestamps[gapStart * channels];
261
307
  float t1 = timestamps[gapEnd * channels];
262
308
  float denominator = t1 - t0;
263
-
309
+
264
310
  // Protection against division by zero
265
311
  if (std::abs(denominator) < 1e-6f)
266
312
  {
@@ -632,6 +678,138 @@ namespace dsp
632
678
 
633
679
  size_t centerIdx = findBracketingInterval(targetTime, timestamps, numSamples, channels, searchStart);
634
680
 
681
+ #if defined(HAS_AVX2) || defined(HAS_SSE) || defined(HAS_NEON)
682
+ // SIMD-optimized path: Process 4 samples at a time
683
+ float values[windowSize] = {0};
684
+ float weights[windowSize] = {0};
685
+ int validCount = 0;
686
+
687
+ // Gather values and compute weights
688
+ for (int offset = -windowSize / 2; offset < windowSize / 2; ++offset)
689
+ {
690
+ int sampleIdx = static_cast<int>(centerIdx) + offset;
691
+ if (sampleIdx < 0 || sampleIdx >= static_cast<int>(numSamples))
692
+ continue;
693
+
694
+ float t = timestamps[sampleIdx * channels];
695
+ float v = samples[sampleIdx * channels + channel];
696
+
697
+ // Sinc function: sin(π*x) / (π*x)
698
+ float x = (targetTime - t) * m_estimatedSampleRate / 1000.0f;
699
+ float sinc = (std::abs(x) < 1e-6f) ? 1.0f : std::sin(M_PI * x) / (M_PI * x);
700
+
701
+ // Hamming window
702
+ float window = 0.54f - 0.46f * std::cos(2.0f * M_PI * (offset + windowSize / 2.0f) / windowSize);
703
+
704
+ values[validCount] = v;
705
+ weights[validCount] = sinc * window;
706
+ validCount++;
707
+ }
708
+
709
+ // SIMD accumulation
710
+ float sum = 0.0f;
711
+ float weightSum = 0.0f;
712
+
713
+ #if defined(HAS_AVX2)
714
+ __m256 vsum = _mm256_setzero_ps();
715
+ __m256 wsum = _mm256_setzero_ps();
716
+
717
+ int i = 0;
718
+ for (; i + 8 <= validCount; i += 8)
719
+ {
720
+ __m256 v = _mm256_loadu_ps(&values[i]);
721
+ __m256 w = _mm256_loadu_ps(&weights[i]);
722
+ vsum = _mm256_add_ps(vsum, _mm256_mul_ps(v, w)); // sum += v * w
723
+ wsum = _mm256_add_ps(wsum, w);
724
+ }
725
+
726
+ // Horizontal sum for AVX2
727
+ __m128 vsum_low = _mm256_castps256_ps128(vsum);
728
+ __m128 vsum_high = _mm256_extractf128_ps(vsum, 1);
729
+ __m128 vsum128 = _mm_add_ps(vsum_low, vsum_high);
730
+
731
+ __m128 wsum_low = _mm256_castps256_ps128(wsum);
732
+ __m128 wsum_high = _mm256_extractf128_ps(wsum, 1);
733
+ __m128 wsum128 = _mm_add_ps(wsum_low, wsum_high);
734
+
735
+ // Continue with SSE reduction
736
+ vsum128 = _mm_hadd_ps(vsum128, vsum128);
737
+ vsum128 = _mm_hadd_ps(vsum128, vsum128);
738
+ sum = _mm_cvtss_f32(vsum128);
739
+
740
+ wsum128 = _mm_hadd_ps(wsum128, wsum128);
741
+ wsum128 = _mm_hadd_ps(wsum128, wsum128);
742
+ weightSum = _mm_cvtss_f32(wsum128);
743
+
744
+ // Scalar remainder
745
+ for (; i < validCount; ++i)
746
+ {
747
+ sum += values[i] * weights[i];
748
+ weightSum += weights[i];
749
+ }
750
+ #elif defined(HAS_SSE)
751
+ __m128 vsum = _mm_setzero_ps();
752
+ __m128 wsum = _mm_setzero_ps();
753
+
754
+ int i = 0;
755
+ for (; i + 4 <= validCount; i += 4)
756
+ {
757
+ __m128 v = _mm_loadu_ps(&values[i]);
758
+ __m128 w = _mm_loadu_ps(&weights[i]);
759
+ vsum = _mm_add_ps(vsum, _mm_mul_ps(v, w));
760
+ wsum = _mm_add_ps(wsum, w);
761
+ }
762
+
763
+ // Horizontal sum
764
+ vsum = _mm_hadd_ps(vsum, vsum);
765
+ vsum = _mm_hadd_ps(vsum, vsum);
766
+ sum = _mm_cvtss_f32(vsum);
767
+
768
+ wsum = _mm_hadd_ps(wsum, wsum);
769
+ wsum = _mm_hadd_ps(wsum, wsum);
770
+ weightSum = _mm_cvtss_f32(wsum);
771
+
772
+ // Scalar remainder
773
+ for (; i < validCount; ++i)
774
+ {
775
+ sum += values[i] * weights[i];
776
+ weightSum += weights[i];
777
+ }
778
+ #elif defined(HAS_NEON)
779
+ float32x4_t vsum = vdupq_n_f32(0.0f);
780
+ float32x4_t wsum = vdupq_n_f32(0.0f);
781
+
782
+ int i = 0;
783
+ for (; i + 4 <= validCount; i += 4)
784
+ {
785
+ float32x4_t v = vld1q_f32(&values[i]);
786
+ float32x4_t w = vld1q_f32(&weights[i]);
787
+ vsum = vmlaq_f32(vsum, v, w); // vsum += v * w
788
+ wsum = vaddq_f32(wsum, w);
789
+ }
790
+
791
+ // Horizontal sum
792
+ float32x2_t vsum_low = vget_low_f32(vsum);
793
+ float32x2_t vsum_high = vget_high_f32(vsum);
794
+ float32x2_t vsum_pair = vadd_f32(vsum_low, vsum_high);
795
+ sum = vget_lane_f32(vpadd_f32(vsum_pair, vsum_pair), 0);
796
+
797
+ float32x2_t wsum_low = vget_low_f32(wsum);
798
+ float32x2_t wsum_high = vget_high_f32(wsum);
799
+ float32x2_t wsum_pair = vadd_f32(wsum_low, wsum_high);
800
+ weightSum = vget_lane_f32(vpadd_f32(wsum_pair, wsum_pair), 0);
801
+
802
+ // Scalar remainder
803
+ for (; i < validCount; ++i)
804
+ {
805
+ sum += values[i] * weights[i];
806
+ weightSum += weights[i];
807
+ }
808
+ #endif
809
+
810
+ output = (weightSum > 0.0f) ? (sum / weightSum) : 0.0f;
811
+ #else
812
+ // Scalar fallback
635
813
  float sum = 0.0f;
636
814
  float weightSum = 0.0f;
637
815
 
@@ -645,7 +823,7 @@ namespace dsp
645
823
  float v = samples[sampleIdx * channels + channel];
646
824
 
647
825
  // Sinc function: sin(π*x) / (π*x)
648
- float x = (targetTime - t) * m_estimatedSampleRate / 1000.0f; // Normalize by sample rate
826
+ float x = (targetTime - t) * m_estimatedSampleRate / 1000.0f;
649
827
  float sinc = (std::abs(x) < 1e-6f) ? 1.0f : std::sin(M_PI * x) / (M_PI * x);
650
828
 
651
829
  // Hamming window
@@ -657,6 +835,7 @@ namespace dsp
657
835
  }
658
836
 
659
837
  output = (weightSum > 0.0f) ? (sum / weightSum) : 0.0f;
838
+ #endif
660
839
  searchStart = centerIdx;
661
840
  }
662
841
 
@@ -58,11 +58,11 @@ namespace dsp
58
58
  * TimeAlignmentStage: Production-grade irregular timestamp resampling
59
59
  *
60
60
  * This stage solves the problems identified in Gemini's analysis:
61
- * 1. Time-based coordinate system (not index-based)
62
- * 2. Gap detection and handling policies
63
- * 3. Clock drift compensation
64
- * 4. Proper SIMD optimization for irregular data
65
- * 5. Configurable extrapolation/error handling
61
+ * 1. Time-based coordinate system (not index-based)
62
+ * 2. Gap detection and handling policies
63
+ * 3. Clock drift compensation
64
+ * 4. Proper SIMD optimization for irregular data
65
+ * 5. Configurable extrapolation/error handling
66
66
  *
67
67
  * Usage:
68
68
  * auto stage = TimeAlignmentStage(