dspx 1.4.1 → 1.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
Binary file
|
|
@@ -18,13 +18,14 @@
|
|
|
18
18
|
|
|
19
19
|
// Debug assertion macro
//
// ASSERT_BOUNDS(idx, maxSize, msg)
//   Debug builds (_DEBUG defined): if idx >= maxSize, writes a
//   "[BOUNDS ERROR]" diagnostic with both values to std::cerr and throws
//   std::out_of_range(msg).
//   All other builds: expands to ((void)0); the arguments are NOT evaluated,
//   so they must not carry side effects the program depends on.
//
// The debug expansion is wrapped in do { ... } while (0) so that
// `ASSERT_BOUNDS(...);` is exactly one statement. A bare `if { ... }`
// expansion followed by the caller's `;` becomes two statements, which
// breaks (or silently re-binds) an `else` when the macro is used as the body
// of an unbraced `if`.
//
// Note: idx and maxSize are evaluated more than once on the failure path.
#ifdef _DEBUG
#define ASSERT_BOUNDS(idx, maxSize, msg)                                     \
    do                                                                       \
    {                                                                        \
        if ((idx) >= (maxSize))                                              \
        {                                                                    \
            std::cerr << "[BOUNDS ERROR] " << (msg) << ": idx=" << (idx)     \
                      << ", max=" << (maxSize) << std::endl;                 \
            throw std::out_of_range(msg);                                    \
        }                                                                    \
    } while (0)
#else
#define ASSERT_BOUNDS(idx, maxSize, msg) ((void)0)
#endif
|
|
29
30
|
|
|
30
31
|
// Helper function to check debug flag
|
|
@@ -234,13 +235,58 @@ namespace dsp
|
|
|
234
235
|
std::to_string(outIdx) + ", targetTime=" + std::to_string(targetTime));
|
|
235
236
|
|
|
236
237
|
case GapPolicy::ZERO_FILL:
|
|
238
|
+
{
|
|
239
|
+
size_t writeIdx = outIdx * channels;
|
|
240
|
+
ASSERT_BOUNDS(writeIdx + channels - 1, outputSize, "ZERO_FILL output write");
|
|
241
|
+
|
|
242
|
+
#if defined(HAS_AVX2)
|
|
243
|
+
// AVX2: Zero 8 floats at a time
|
|
244
|
+
__m256 zero = _mm256_setzero_ps();
|
|
245
|
+
int ch = 0;
|
|
246
|
+
for (; ch + 8 <= channels; ch += 8)
|
|
247
|
+
{
|
|
248
|
+
_mm256_storeu_ps(&outputBuffer[writeIdx + ch], zero);
|
|
249
|
+
}
|
|
250
|
+
// Scalar remainder
|
|
251
|
+
for (; ch < channels; ++ch)
|
|
252
|
+
{
|
|
253
|
+
outputBuffer[writeIdx + ch] = 0.0f;
|
|
254
|
+
}
|
|
255
|
+
#elif defined(HAS_SSE)
|
|
256
|
+
// SSE: Zero 4 floats at a time
|
|
257
|
+
__m128 zero = _mm_setzero_ps();
|
|
258
|
+
int ch = 0;
|
|
259
|
+
for (; ch + 4 <= channels; ch += 4)
|
|
260
|
+
{
|
|
261
|
+
_mm_storeu_ps(&outputBuffer[writeIdx + ch], zero);
|
|
262
|
+
}
|
|
263
|
+
// Scalar remainder
|
|
264
|
+
for (; ch < channels; ++ch)
|
|
265
|
+
{
|
|
266
|
+
outputBuffer[writeIdx + ch] = 0.0f;
|
|
267
|
+
}
|
|
268
|
+
#elif defined(HAS_NEON)
|
|
269
|
+
// NEON: Zero 4 floats at a time
|
|
270
|
+
float32x4_t zero = vdupq_n_f32(0.0f);
|
|
271
|
+
int ch = 0;
|
|
272
|
+
for (; ch + 4 <= channels; ch += 4)
|
|
273
|
+
{
|
|
274
|
+
vst1q_f32(&outputBuffer[writeIdx + ch], zero);
|
|
275
|
+
}
|
|
276
|
+
// Scalar remainder
|
|
277
|
+
for (; ch < channels; ++ch)
|
|
278
|
+
{
|
|
279
|
+
outputBuffer[writeIdx + ch] = 0.0f;
|
|
280
|
+
}
|
|
281
|
+
#else
|
|
282
|
+
// Scalar fallback
|
|
237
283
|
for (int ch = 0; ch < channels; ++ch)
|
|
238
284
|
{
|
|
239
|
-
|
|
240
|
-
ASSERT_BOUNDS(writeIdx, outputSize, "ZERO_FILL output write");
|
|
241
|
-
outputBuffer[writeIdx] = 0.0f;
|
|
285
|
+
outputBuffer[writeIdx + ch] = 0.0f;
|
|
242
286
|
}
|
|
243
|
-
|
|
287
|
+
#endif
|
|
288
|
+
}
|
|
289
|
+
break;
|
|
244
290
|
|
|
245
291
|
case GapPolicy::HOLD:
|
|
246
292
|
// Hold last valid value before gap
|
|
@@ -260,7 +306,7 @@ namespace dsp
|
|
|
260
306
|
float t0 = timestamps[gapStart * channels];
|
|
261
307
|
float t1 = timestamps[gapEnd * channels];
|
|
262
308
|
float denominator = t1 - t0;
|
|
263
|
-
|
|
309
|
+
|
|
264
310
|
// Protection against division by zero
|
|
265
311
|
if (std::abs(denominator) < 1e-6f)
|
|
266
312
|
{
|
|
@@ -632,6 +678,138 @@ namespace dsp
|
|
|
632
678
|
|
|
633
679
|
size_t centerIdx = findBracketingInterval(targetTime, timestamps, numSamples, channels, searchStart);
|
|
634
680
|
|
|
681
|
+
#if defined(HAS_AVX2) || defined(HAS_SSE) || defined(HAS_NEON)
|
|
682
|
+
// SIMD-optimized path: Process 4 samples at a time
|
|
683
|
+
float values[windowSize] = {0};
|
|
684
|
+
float weights[windowSize] = {0};
|
|
685
|
+
int validCount = 0;
|
|
686
|
+
|
|
687
|
+
// Gather values and compute weights
|
|
688
|
+
for (int offset = -windowSize / 2; offset < windowSize / 2; ++offset)
|
|
689
|
+
{
|
|
690
|
+
int sampleIdx = static_cast<int>(centerIdx) + offset;
|
|
691
|
+
if (sampleIdx < 0 || sampleIdx >= static_cast<int>(numSamples))
|
|
692
|
+
continue;
|
|
693
|
+
|
|
694
|
+
float t = timestamps[sampleIdx * channels];
|
|
695
|
+
float v = samples[sampleIdx * channels + channel];
|
|
696
|
+
|
|
697
|
+
// Sinc function: sin(π*x) / (π*x)
|
|
698
|
+
float x = (targetTime - t) * m_estimatedSampleRate / 1000.0f;
|
|
699
|
+
float sinc = (std::abs(x) < 1e-6f) ? 1.0f : std::sin(M_PI * x) / (M_PI * x);
|
|
700
|
+
|
|
701
|
+
// Hamming window
|
|
702
|
+
float window = 0.54f - 0.46f * std::cos(2.0f * M_PI * (offset + windowSize / 2.0f) / windowSize);
|
|
703
|
+
|
|
704
|
+
values[validCount] = v;
|
|
705
|
+
weights[validCount] = sinc * window;
|
|
706
|
+
validCount++;
|
|
707
|
+
}
|
|
708
|
+
|
|
709
|
+
// SIMD accumulation
|
|
710
|
+
float sum = 0.0f;
|
|
711
|
+
float weightSum = 0.0f;
|
|
712
|
+
|
|
713
|
+
#if defined(HAS_AVX2)
|
|
714
|
+
__m256 vsum = _mm256_setzero_ps();
|
|
715
|
+
__m256 wsum = _mm256_setzero_ps();
|
|
716
|
+
|
|
717
|
+
int i = 0;
|
|
718
|
+
for (; i + 8 <= validCount; i += 8)
|
|
719
|
+
{
|
|
720
|
+
__m256 v = _mm256_loadu_ps(&values[i]);
|
|
721
|
+
__m256 w = _mm256_loadu_ps(&weights[i]);
|
|
722
|
+
vsum = _mm256_add_ps(vsum, _mm256_mul_ps(v, w)); // sum += v * w
|
|
723
|
+
wsum = _mm256_add_ps(wsum, w);
|
|
724
|
+
}
|
|
725
|
+
|
|
726
|
+
// Horizontal sum for AVX2
|
|
727
|
+
__m128 vsum_low = _mm256_castps256_ps128(vsum);
|
|
728
|
+
__m128 vsum_high = _mm256_extractf128_ps(vsum, 1);
|
|
729
|
+
__m128 vsum128 = _mm_add_ps(vsum_low, vsum_high);
|
|
730
|
+
|
|
731
|
+
__m128 wsum_low = _mm256_castps256_ps128(wsum);
|
|
732
|
+
__m128 wsum_high = _mm256_extractf128_ps(wsum, 1);
|
|
733
|
+
__m128 wsum128 = _mm_add_ps(wsum_low, wsum_high);
|
|
734
|
+
|
|
735
|
+
// Continue with SSE reduction
|
|
736
|
+
vsum128 = _mm_hadd_ps(vsum128, vsum128);
|
|
737
|
+
vsum128 = _mm_hadd_ps(vsum128, vsum128);
|
|
738
|
+
sum = _mm_cvtss_f32(vsum128);
|
|
739
|
+
|
|
740
|
+
wsum128 = _mm_hadd_ps(wsum128, wsum128);
|
|
741
|
+
wsum128 = _mm_hadd_ps(wsum128, wsum128);
|
|
742
|
+
weightSum = _mm_cvtss_f32(wsum128);
|
|
743
|
+
|
|
744
|
+
// Scalar remainder
|
|
745
|
+
for (; i < validCount; ++i)
|
|
746
|
+
{
|
|
747
|
+
sum += values[i] * weights[i];
|
|
748
|
+
weightSum += weights[i];
|
|
749
|
+
}
|
|
750
|
+
#elif defined(HAS_SSE)
|
|
751
|
+
__m128 vsum = _mm_setzero_ps();
|
|
752
|
+
__m128 wsum = _mm_setzero_ps();
|
|
753
|
+
|
|
754
|
+
int i = 0;
|
|
755
|
+
for (; i + 4 <= validCount; i += 4)
|
|
756
|
+
{
|
|
757
|
+
__m128 v = _mm_loadu_ps(&values[i]);
|
|
758
|
+
__m128 w = _mm_loadu_ps(&weights[i]);
|
|
759
|
+
vsum = _mm_add_ps(vsum, _mm_mul_ps(v, w));
|
|
760
|
+
wsum = _mm_add_ps(wsum, w);
|
|
761
|
+
}
|
|
762
|
+
|
|
763
|
+
// Horizontal sum
|
|
764
|
+
vsum = _mm_hadd_ps(vsum, vsum);
|
|
765
|
+
vsum = _mm_hadd_ps(vsum, vsum);
|
|
766
|
+
sum = _mm_cvtss_f32(vsum);
|
|
767
|
+
|
|
768
|
+
wsum = _mm_hadd_ps(wsum, wsum);
|
|
769
|
+
wsum = _mm_hadd_ps(wsum, wsum);
|
|
770
|
+
weightSum = _mm_cvtss_f32(wsum);
|
|
771
|
+
|
|
772
|
+
// Scalar remainder
|
|
773
|
+
for (; i < validCount; ++i)
|
|
774
|
+
{
|
|
775
|
+
sum += values[i] * weights[i];
|
|
776
|
+
weightSum += weights[i];
|
|
777
|
+
}
|
|
778
|
+
#elif defined(HAS_NEON)
|
|
779
|
+
float32x4_t vsum = vdupq_n_f32(0.0f);
|
|
780
|
+
float32x4_t wsum = vdupq_n_f32(0.0f);
|
|
781
|
+
|
|
782
|
+
int i = 0;
|
|
783
|
+
for (; i + 4 <= validCount; i += 4)
|
|
784
|
+
{
|
|
785
|
+
float32x4_t v = vld1q_f32(&values[i]);
|
|
786
|
+
float32x4_t w = vld1q_f32(&weights[i]);
|
|
787
|
+
vsum = vmlaq_f32(vsum, v, w); // vsum += v * w
|
|
788
|
+
wsum = vaddq_f32(wsum, w);
|
|
789
|
+
}
|
|
790
|
+
|
|
791
|
+
// Horizontal sum
|
|
792
|
+
float32x2_t vsum_low = vget_low_f32(vsum);
|
|
793
|
+
float32x2_t vsum_high = vget_high_f32(vsum);
|
|
794
|
+
float32x2_t vsum_pair = vadd_f32(vsum_low, vsum_high);
|
|
795
|
+
sum = vget_lane_f32(vpadd_f32(vsum_pair, vsum_pair), 0);
|
|
796
|
+
|
|
797
|
+
float32x2_t wsum_low = vget_low_f32(wsum);
|
|
798
|
+
float32x2_t wsum_high = vget_high_f32(wsum);
|
|
799
|
+
float32x2_t wsum_pair = vadd_f32(wsum_low, wsum_high);
|
|
800
|
+
weightSum = vget_lane_f32(vpadd_f32(wsum_pair, wsum_pair), 0);
|
|
801
|
+
|
|
802
|
+
// Scalar remainder
|
|
803
|
+
for (; i < validCount; ++i)
|
|
804
|
+
{
|
|
805
|
+
sum += values[i] * weights[i];
|
|
806
|
+
weightSum += weights[i];
|
|
807
|
+
}
|
|
808
|
+
#endif
|
|
809
|
+
|
|
810
|
+
output = (weightSum > 0.0f) ? (sum / weightSum) : 0.0f;
|
|
811
|
+
#else
|
|
812
|
+
// Scalar fallback
|
|
635
813
|
float sum = 0.0f;
|
|
636
814
|
float weightSum = 0.0f;
|
|
637
815
|
|
|
@@ -645,7 +823,7 @@ namespace dsp
|
|
|
645
823
|
float v = samples[sampleIdx * channels + channel];
|
|
646
824
|
|
|
647
825
|
// Sinc function: sin(π*x) / (π*x)
|
|
648
|
-
float x = (targetTime - t) * m_estimatedSampleRate / 1000.0f;
|
|
826
|
+
float x = (targetTime - t) * m_estimatedSampleRate / 1000.0f;
|
|
649
827
|
float sinc = (std::abs(x) < 1e-6f) ? 1.0f : std::sin(M_PI * x) / (M_PI * x);
|
|
650
828
|
|
|
651
829
|
// Hamming window
|
|
@@ -657,6 +835,7 @@ namespace dsp
|
|
|
657
835
|
}
|
|
658
836
|
|
|
659
837
|
output = (weightSum > 0.0f) ? (sum / weightSum) : 0.0f;
|
|
838
|
+
#endif
|
|
660
839
|
searchStart = centerIdx;
|
|
661
840
|
}
|
|
662
841
|
|
|
@@ -58,11 +58,11 @@ namespace dsp
|
|
|
58
58
|
* TimeAlignmentStage: Production-grade irregular timestamp resampling
|
|
59
59
|
*
|
|
60
60
|
* This stage solves the problems identified in Gemini's analysis:
|
|
61
|
-
* 1.
|
|
62
|
-
* 2.
|
|
63
|
-
* 3.
|
|
64
|
-
* 4.
|
|
65
|
-
* 5.
|
|
61
|
+
* 1. Time-based coordinate system (not index-based)
|
|
62
|
+
* 2. Gap detection and handling policies
|
|
63
|
+
* 3. Clock drift compensation
|
|
64
|
+
* 4. Proper SIMD optimization for irregular data
|
|
65
|
+
* 5. Configurable extrapolation/error handling
|
|
66
66
|
*
|
|
67
67
|
* Usage:
|
|
68
68
|
* auto stage = TimeAlignmentStage(
|