npm - dspx - Versions diffs - 1.3.3 → 1.3.5 - Mend

dspx 1.3.3 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/package.json +1 -1
package/prebuilds/win32-x64/dspx.node +0 -0
package/src/native/DspPipeline.cc +729 -51

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "dspx",
-  "version": "1.3.3",
+  "version": "1.3.5",
   "description": "High-performance DSP library with native C++ acceleration and Redis state persistence",
   "main": "./dist/index.js",
   "types": "./dist/index.d.ts",

package/prebuilds/win32-x64/dspx.node CHANGED Viewed

Binary file

package/src/native/DspPipeline.cc CHANGED Viewed

@@ -51,6 +51,30 @@ namespace dsp
 #include <cstdlib>
 #include "utils/Toon.h"
+// SIMD optimizations for timestamp interpolation
+// Priority: AVX2 (8-wide) > SSE (4-wide) > NEON (4-wide) > Scalar
+#if defined(__AVX2__) || (defined(_MSC_VER) && defined(__AVX2__))
+#include <immintrin.h>
+#define HAS_AVX2 1
+#define HAS_SSE 0
+#define HAS_NEON 0
+#elif defined(__SSE__) || defined(__SSE2__) || (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)))
+#include <emmintrin.h> // SSE2
+#include <xmmintrin.h> // SSE
+#define HAS_AVX2 0
+#define HAS_SSE 1
+#define HAS_NEON 0
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
+#include <arm_neon.h>
+#define HAS_AVX2 0
+#define HAS_SSE 0
+#define HAS_NEON 1
+#else
+#define HAS_AVX2 0
+#define HAS_SSE 0
+#define HAS_NEON 0
+#endif
 namespace dsp
 {
@@ -84,9 +108,13 @@ namespace dsp
     /**
      * Constructs the pipeline wrapper.
      *
      * Forwards the callback info to the Napi::ObjectWrap base, creates the
      * shared busy flag in the "not busy" state, and then calls
      * InitializeStageFactories().
      *
      * @param info N-API callback info, passed straight to the base class
      */
     DspPipeline::DspPipeline(const Napi::CallbackInfo &info)
         : Napi::ObjectWrap<DspPipeline>(info)
     {
+        // std::cout << "[DEBUG] DspPipeline::Constructor - this=" << this
+        //           << ", creating pipeline" << std::endl;
         // Create the busy flag as a shared atomic<bool>, initially false.
         // AddStage() checks this flag and refuses to add a stage while it is set.
         m_isBusy = std::make_shared<std::atomic<bool>>(false);
+        // std::cout << "[DEBUG] DspPipeline::Constructor - m_isBusy=" << m_isBusy.get() << std::endl;
         InitializeStageFactories();
+        // std::cout << "[DEBUG] DspPipeline::Constructor - complete, this=" << this << std::endl;
     }
     /**
@@ -1146,22 +1174,26 @@ namespace dsp
     Napi::Value DspPipeline::AddStage(const Napi::CallbackInfo &info)
     {
         Napi::Env env = info.Env();
+        // std::cout << "[DEBUG] DspPipeline::AddStage - this=" << this << std::endl;
         // Check if pipeline is disposed
         if (m_disposed)
         {
+            // std::cout << "[DEBUG] AddStage - pipeline disposed, this=" << this << std::endl;
             Napi::Error::New(env, "Pipeline is disposed").ThrowAsJavaScriptException();
             return env.Undefined();
         }
         if (*m_isBusy)
         {
+            // std::cout << "[DEBUG] AddStage - pipeline busy, this=" << this << std::endl;
             Napi::Error::New(env, "Cannot add stage while processing").ThrowAsJavaScriptException();
             return env.Undefined();
         }
         // 1. Get arguments from TypeScript
         std::string stageName = info[0].As<Napi::String>();
+        // std::cout << "[DEBUG] AddStage - stageName=" << stageName << ", this=" << this << std::endl;
         Napi::Object params = info[1].As<Napi::Object>();
         // 2. Look up the stage factory in the map
@@ -1202,10 +1234,12 @@ namespace dsp
     Napi::Value DspPipeline::AddFilterStage(const Napi::CallbackInfo &info)
     {
         Napi::Env env = info.Env();
+        // std::cout << "[DEBUG] DspPipeline::AddFilterStage - this=" << this << std::endl;
         // Check if pipeline is disposed
         if (m_disposed)
         {
+            // std::cout << "[DEBUG] AddFilterStage - pipeline disposed, this=" << this << std::endl;
             Napi::Error::New(env, "Pipeline is disposed").ThrowAsJavaScriptException();
             return env.Undefined();
         }
@@ -1249,6 +1283,569 @@ namespace dsp
         return env.Undefined();
     }
+    /**
+     * SIMD-optimized timestamp interpolation for resizing stages
+     * Multi-platform support:
+     * - AVX2 (x86_64): 8-wide vectorization
+     * - SSE2 (x86): 4-wide vectorization
+     * - NEON (ARM): 4-wide vectorization
+     * - Scalar fallback for all other platforms
+     *
+     * @param timestamps Source timestamp array (channel-major layout)
+     * @param prevNumSamples Number of samples in source
+     * @param prevChannels Number of channels in source
+     * @param numOutputSamples Number of samples to generate
+     * @param outputChannels Number of channels in output
+     * @param timeScale Time scaling factor from stage
+     * @param output Output timestamp vector
+     */
+    inline void interpolateTimestampsSIMD(
+        const float *timestamps,
+        size_t prevNumSamples,
+        int prevChannels,
+        size_t numOutputSamples,
+        int outputChannels,
+        double timeScale,
+        std::vector<float> &output)
+    {
+#if HAS_AVX2
+        // ========================================
+        // AVX2 Implementation (8-wide)
+        // ========================================
+        // Process 8 output samples at a time with AVX2
+        const size_t simdWidth = 8;
+        const size_t simdIterations = numOutputSamples / simdWidth;
+        const size_t remainder = numOutputSamples % simdWidth;
+        // Precompute constants for SIMD
+        const __m256 vTimeScale = _mm256_set1_ps(static_cast<float>(timeScale));
+        const __m256i vPrevChannels = _mm256_set1_epi32(prevChannels);
+        const __m256 vPrevNumSamples = _mm256_set1_ps(static_cast<float>(prevNumSamples));
+        const __m256 vOne = _mm256_set1_ps(1.0f);
+        // SIMD loop: Process 8 timestamps at once
+        for (size_t iter = 0; iter < simdIterations; ++iter)
+        {
+            size_t baseIdx = iter * simdWidth;
+            // Generate indices: [baseIdx, baseIdx+1, ..., baseIdx+7]
+            __m256 vIdx = _mm256_set_ps(
+                static_cast<float>(baseIdx + 7),
+                static_cast<float>(baseIdx + 6),
+                static_cast<float>(baseIdx + 5),
+                static_cast<float>(baseIdx + 4),
+                static_cast<float>(baseIdx + 3),
+                static_cast<float>(baseIdx + 2),
+                static_cast<float>(baseIdx + 1),
+                static_cast<float>(baseIdx + 0));
+            // Calculate input time: i * timeScale
+            __m256 vInputTime = _mm256_mul_ps(vIdx, vTimeScale);
+            // Extract integer and fractional parts
+            __m256i vInputIdx = _mm256_cvttps_epi32(vInputTime);
+            __m256 vInputIdxFloat = _mm256_cvtepi32_ps(vInputIdx);
+            __m256 vFrac = _mm256_sub_ps(vInputTime, vInputIdxFloat);
+            // Process each of the 8 values (can't easily vectorize the conditional logic)
+            alignas(32) float inputTimes[8];
+            alignas(32) int inputIndices[8];
+            alignas(32) float fracs[8];
+            _mm256_store_ps(inputTimes, vInputTime);
+            _mm256_store_si256((__m256i *)inputIndices, vInputIdx);
+            _mm256_store_ps(fracs, vFrac);
+            for (size_t j = 0; j < simdWidth; ++j)
+            {
+                size_t i = baseIdx + j;
+                size_t inputIdx = inputIndices[j];
+                float frac = fracs[j];
+                float timestamp;
+                if (inputIdx >= prevNumSamples)
+                {
+                    size_t lastIdx = prevNumSamples - 1;
+                    timestamp = timestamps[lastIdx * prevChannels] +
+                                static_cast<float>((inputTimes[j] - lastIdx) * timeScale);
+                }
+                else if (inputIdx + 1 >= prevNumSamples)
+                {
+                    timestamp = timestamps[inputIdx * prevChannels];
+                }
+                else
+                {
+                    float t0 = timestamps[inputIdx * prevChannels];
+                    float t1 = timestamps[(inputIdx + 1) * prevChannels];
+                    timestamp = t0 + frac * (t1 - t0);
+                }
+                // Replicate timestamp across all output channels
+                for (int ch = 0; ch < outputChannels; ++ch)
+                {
+                    output[i * outputChannels + ch] = timestamp;
+                }
+            }
+        }
+        // Handle remainder samples with scalar code
+        for (size_t i = simdIterations * simdWidth; i < numOutputSamples; ++i)
+        {
+            double inputTime = i * timeScale;
+            size_t inputIdx = static_cast<size_t>(inputTime);
+            double frac = inputTime - inputIdx;
+            float timestamp;
+            if (inputIdx >= prevNumSamples)
+            {
+                size_t lastIdx = prevNumSamples - 1;
+                timestamp = timestamps[lastIdx * prevChannels] +
+                            static_cast<float>((inputTime - lastIdx) * timeScale);
+            }
+            else if (inputIdx + 1 >= prevNumSamples)
+            {
+                timestamp = timestamps[inputIdx * prevChannels];
+            }
+            else
+            {
+                float t0 = timestamps[inputIdx * prevChannels];
+                float t1 = timestamps[(inputIdx + 1) * prevChannels];
+                timestamp = t0 + static_cast<float>(frac) * (t1 - t0);
+            }
+            for (int ch = 0; ch < outputChannels; ++ch)
+            {
+                output[i * outputChannels + ch] = timestamp;
+            }
+        }
+#elif HAS_SSE
+        // ========================================
+        // SSE2 Implementation (4-wide)
+        // ========================================
+        const size_t simdWidth = 4;
+        const size_t simdIterations = numOutputSamples / simdWidth;
+        const __m128 vTimeScale = _mm_set1_ps(static_cast<float>(timeScale));
+        const __m128 vPrevNumSamples = _mm_set1_ps(static_cast<float>(prevNumSamples));
+        for (size_t iter = 0; iter < simdIterations; ++iter)
+        {
+            size_t baseIdx = iter * simdWidth;
+            // Generate indices [baseIdx, baseIdx+1, baseIdx+2, baseIdx+3]
+            alignas(16) float indices[4] = {
+                static_cast<float>(baseIdx),
+                static_cast<float>(baseIdx + 1),
+                static_cast<float>(baseIdx + 2),
+                static_cast<float>(baseIdx + 3)};
+            __m128 vIndices = _mm_load_ps(indices);
+            __m128 vInputTime = _mm_mul_ps(vIndices, vTimeScale);
+            // Convert to int and back to get integer part
+            __m128i vInputIdx = _mm_cvttps_epi32(vInputTime);
+            __m128 vInputIdxFloat = _mm_cvtepi32_ps(vInputIdx);
+            __m128 vFrac = _mm_sub_ps(vInputTime, vInputIdxFloat);
+            // Store for scalar processing
+            alignas(16) float inputTimes[4];
+            _mm_store_ps(inputTimes, vInputTime);
+            alignas(16) int inputIndices[4];
+            _mm_store_si128(reinterpret_cast<__m128i *>(inputIndices), vInputIdx);
+            alignas(16) float fractions[4];
+            _mm_store_ps(fractions, vFrac);
+            // Process each sample
+            for (size_t j = 0; j < simdWidth; ++j)
+            {
+                size_t i = baseIdx + j;
+                size_t inputIdx = inputIndices[j];
+                double frac = fractions[j];
+                float timestamp;
+                if (inputIdx >= prevNumSamples)
+                {
+                    size_t lastIdx = prevNumSamples - 1;
+                    timestamp = timestamps[lastIdx * prevChannels] +
+                                static_cast<float>((inputTimes[j] - lastIdx) * timeScale);
+                }
+                else if (inputIdx + 1 >= prevNumSamples)
+                {
+                    timestamp = timestamps[inputIdx * prevChannels];
+                }
+                else
+                {
+                    float t0 = timestamps[inputIdx * prevChannels];
+                    float t1 = timestamps[(inputIdx + 1) * prevChannels];
+                    timestamp = t0 + frac * (t1 - t0);
+                }
+                for (int ch = 0; ch < outputChannels; ++ch)
+                {
+                    output[i * outputChannels + ch] = timestamp;
+                }
+            }
+        }
+        // Handle remainder
+        for (size_t i = simdIterations * simdWidth; i < numOutputSamples; ++i)
+        {
+            double inputTime = i * timeScale;
+            size_t inputIdx = static_cast<size_t>(inputTime);
+            double frac = inputTime - inputIdx;
+            float timestamp;
+            if (inputIdx >= prevNumSamples)
+            {
+                size_t lastIdx = prevNumSamples - 1;
+                timestamp = timestamps[lastIdx * prevChannels] +
+                            static_cast<float>((inputTime - lastIdx) * timeScale);
+            }
+            else if (inputIdx + 1 >= prevNumSamples)
+            {
+                timestamp = timestamps[inputIdx * prevChannels];
+            }
+            else
+            {
+                float t0 = timestamps[inputIdx * prevChannels];
+                float t1 = timestamps[(inputIdx + 1) * prevChannels];
+                timestamp = t0 + static_cast<float>(frac) * (t1 - t0);
+            }
+            for (int ch = 0; ch < outputChannels; ++ch)
+            {
+                output[i * outputChannels + ch] = timestamp;
+            }
+        }
+#elif HAS_NEON
+        // ========================================
+        // ARM NEON Implementation (4-wide)
+        // ========================================
+        const size_t simdWidth = 4;
+        const size_t simdIterations = numOutputSamples / simdWidth;
+        const float32x4_t vTimeScale = vdupq_n_f32(static_cast<float>(timeScale));
+        const float32x4_t vPrevNumSamples = vdupq_n_f32(static_cast<float>(prevNumSamples));
+        for (size_t iter = 0; iter < simdIterations; ++iter)
+        {
+            size_t baseIdx = iter * simdWidth;
+            // Generate indices
+            alignas(16) float indices[4] = {
+                static_cast<float>(baseIdx),
+                static_cast<float>(baseIdx + 1),
+                static_cast<float>(baseIdx + 2),
+                static_cast<float>(baseIdx + 3)};
+            float32x4_t vIndices = vld1q_f32(indices);
+            float32x4_t vInputTime = vmulq_f32(vIndices, vTimeScale);
+            // Extract integer and fractional parts
+            int32x4_t vInputIdx = vcvtq_s32_f32(vInputTime);
+            float32x4_t vInputIdxFloat = vcvtq_f32_s32(vInputIdx);
+            float32x4_t vFrac = vsubq_f32(vInputTime, vInputIdxFloat);
+            // Store for processing
+            alignas(16) float inputTimes[4];
+            vst1q_f32(inputTimes, vInputTime);
+            alignas(16) int inputIndices[4];
+            vst1q_s32(inputIndices, vInputIdx);
+            alignas(16) float fractions[4];
+            vst1q_f32(fractions, vFrac);
+            // Process each sample
+            for (size_t j = 0; j < simdWidth; ++j)
+            {
+                size_t i = baseIdx + j;
+                size_t inputIdx = inputIndices[j];
+                double frac = fractions[j];
+                float timestamp;
+                if (inputIdx >= prevNumSamples)
+                {
+                    size_t lastIdx = prevNumSamples - 1;
+                    timestamp = timestamps[lastIdx * prevChannels] +
+                                static_cast<float>((inputTimes[j] - lastIdx) * timeScale);
+                }
+                else if (inputIdx + 1 >= prevNumSamples)
+                {
+                    timestamp = timestamps[inputIdx * prevChannels];
+                }
+                else
+                {
+                    float t0 = timestamps[inputIdx * prevChannels];
+                    float t1 = timestamps[(inputIdx + 1) * prevChannels];
+                    timestamp = t0 + frac * (t1 - t0);
+                }
+                for (int ch = 0; ch < outputChannels; ++ch)
+                {
+                    output[i * outputChannels + ch] = timestamp;
+                }
+            }
+        }
+        // Handle remainder
+        for (size_t i = simdIterations * simdWidth; i < numOutputSamples; ++i)
+        {
+            double inputTime = i * timeScale;
+            size_t inputIdx = static_cast<size_t>(inputTime);
+            double frac = inputTime - inputIdx;
+            float timestamp;
+            if (inputIdx >= prevNumSamples)
+            {
+                size_t lastIdx = prevNumSamples - 1;
+                timestamp = timestamps[lastIdx * prevChannels] +
+                            static_cast<float>((inputTime - lastIdx) * timeScale);
+            }
+            else if (inputIdx + 1 >= prevNumSamples)
+            {
+                timestamp = timestamps[inputIdx * prevChannels];
+            }
+            else
+            {
+                float t0 = timestamps[inputIdx * prevChannels];
+                float t1 = timestamps[(inputIdx + 1) * prevChannels];
+                timestamp = t0 + static_cast<float>(frac) * (t1 - t0);
+            }
+            for (int ch = 0; ch < outputChannels; ++ch)
+            {
+                output[i * outputChannels + ch] = timestamp;
+            }
+        }
+#elif HAS_SSE
+        // ========================================
+        // SSE2 Implementation (4-wide)
+        // ========================================
+        const size_t simdWidth = 4;
+        const size_t simdIterations = numOutputSamples / simdWidth;
+        const __m128 vTimeScale = _mm_set1_ps(static_cast<float>(timeScale));
+        const __m128 vPrevNumSamples = _mm_set1_ps(static_cast<float>(prevNumSamples));
+        for (size_t iter = 0; iter < simdIterations; ++iter)
+        {
+            size_t baseIdx = iter * simdWidth;
+            // Generate indices [baseIdx, baseIdx+1, baseIdx+2, baseIdx+3]
+            alignas(16) float indices[4] = {
+                static_cast<float>(baseIdx),
+                static_cast<float>(baseIdx + 1),
+                static_cast<float>(baseIdx + 2),
+                static_cast<float>(baseIdx + 3)};
+            __m128 vIndices = _mm_load_ps(indices);
+            __m128 vInputTime = _mm_mul_ps(vIndices, vTimeScale);
+            // Convert to int and back to get integer part
+            __m128i vInputIdx = _mm_cvttps_epi32(vInputTime);
+            __m128 vInputIdxFloat = _mm_cvtepi32_ps(vInputIdx);
+            __m128 vFrac = _mm_sub_ps(vInputTime, vInputIdxFloat);
+            // Store for scalar processing
+            alignas(16) float inputTimes[4];
+            _mm_store_ps(inputTimes, vInputTime);
+            alignas(16) int inputIndices[4];
+            _mm_store_si128(reinterpret_cast<__m128i *>(inputIndices), vInputIdx);
+            alignas(16) float fractions[4];
+            _mm_store_ps(fractions, vFrac);
+            // Process each sample
+            for (size_t j = 0; j < simdWidth; ++j)
+            {
+                size_t i = baseIdx + j;
+                size_t inputIdx = inputIndices[j];
+                double frac = fractions[j];
+                float timestamp;
+                if (inputIdx >= prevNumSamples)
+                {
+                    size_t lastIdx = prevNumSamples - 1;
+                    timestamp = timestamps[lastIdx * prevChannels] +
+                                static_cast<float>((inputTimes[j] - lastIdx) * timeScale);
+                }
+                else if (inputIdx + 1 >= prevNumSamples)
+                {
+                    timestamp = timestamps[inputIdx * prevChannels];
+                }
+                else
+                {
+                    float t0 = timestamps[inputIdx * prevChannels];
+                    float t1 = timestamps[(inputIdx + 1) * prevChannels];
+                    timestamp = t0 + frac * (t1 - t0);
+                }
+                for (int ch = 0; ch < outputChannels; ++ch)
+                {
+                    output[i * outputChannels + ch] = timestamp;
+                }
+            }
+        }
+        // Handle remainder
+        for (size_t i = simdIterations * simdWidth; i < numOutputSamples; ++i)
+        {
+            double inputTime = i * timeScale;
+            size_t inputIdx = static_cast<size_t>(inputTime);
+            double frac = inputTime - inputIdx;
+            float timestamp;
+            if (inputIdx >= prevNumSamples)
+            {
+                size_t lastIdx = prevNumSamples - 1;
+                timestamp = timestamps[lastIdx * prevChannels] +
+                            static_cast<float>((inputTime - lastIdx) * timeScale);
+            }
+            else if (inputIdx + 1 >= prevNumSamples)
+            {
+                timestamp = timestamps[inputIdx * prevChannels];
+            }
+            else
+            {
+                float t0 = timestamps[inputIdx * prevChannels];
+                float t1 = timestamps[(inputIdx + 1) * prevChannels];
+                timestamp = t0 + static_cast<float>(frac) * (t1 - t0);
+            }
+            for (int ch = 0; ch < outputChannels; ++ch)
+            {
+                output[i * outputChannels + ch] = timestamp;
+            }
+        }
+#elif HAS_NEON
+        // ========================================
+        // ARM NEON Implementation (4-wide)
+        // ========================================
+        const size_t simdWidth = 4;
+        const size_t simdIterations = numOutputSamples / simdWidth;
+        const float32x4_t vTimeScale = vdupq_n_f32(static_cast<float>(timeScale));
+        const float32x4_t vPrevNumSamples = vdupq_n_f32(static_cast<float>(prevNumSamples));
+        for (size_t iter = 0; iter < simdIterations; ++iter)
+        {
+            size_t baseIdx = iter * simdWidth;
+            // Generate indices
+            alignas(16) float indices[4] = {
+                static_cast<float>(baseIdx),
+                static_cast<float>(baseIdx + 1),
+                static_cast<float>(baseIdx + 2),
+                static_cast<float>(baseIdx + 3)};
+            float32x4_t vIndices = vld1q_f32(indices);
+            float32x4_t vInputTime = vmulq_f32(vIndices, vTimeScale);
+            // Extract integer and fractional parts
+            int32x4_t vInputIdx = vcvtq_s32_f32(vInputTime);
+            float32x4_t vInputIdxFloat = vcvtq_f32_s32(vInputIdx);
+            float32x4_t vFrac = vsubq_f32(vInputTime, vInputIdxFloat);
+            // Store for processing
+            alignas(16) float inputTimes[4];
+            vst1q_f32(inputTimes, vInputTime);
+            alignas(16) int inputIndices[4];
+            vst1q_s32(inputIndices, vInputIdx);
+            alignas(16) float fractions[4];
+            vst1q_f32(fractions, vFrac);
+            // Process each sample
+            for (size_t j = 0; j < simdWidth; ++j)
+            {
+                size_t i = baseIdx + j;
+                size_t inputIdx = inputIndices[j];
+                double frac = fractions[j];
+                float timestamp;
+                if (inputIdx >= prevNumSamples)
+                {
+                    size_t lastIdx = prevNumSamples - 1;
+                    timestamp = timestamps[lastIdx * prevChannels] +
+                                static_cast<float>((inputTimes[j] - lastIdx) * timeScale);
+                }
+                else if (inputIdx + 1 >= prevNumSamples)
+                {
+                    timestamp = timestamps[inputIdx * prevChannels];
+                }
+                else
+                {
+                    float t0 = timestamps[inputIdx * prevChannels];
+                    float t1 = timestamps[(inputIdx + 1) * prevChannels];
+                    timestamp = t0 + frac * (t1 - t0);
+                }
+                for (int ch = 0; ch < outputChannels; ++ch)
+                {
+                    output[i * outputChannels + ch] = timestamp;
+                }
+            }
+        }
+        // Handle remainder
+        for (size_t i = simdIterations * simdWidth; i < numOutputSamples; ++i)
+        {
+            double inputTime = i * timeScale;
+            size_t inputIdx = static_cast<size_t>(inputTime);
+            double frac = inputTime - inputIdx;
+            float timestamp;
+            if (inputIdx >= prevNumSamples)
+            {
+                size_t lastIdx = prevNumSamples - 1;
+                timestamp = timestamps[lastIdx * prevChannels] +
+                            static_cast<float>((inputTime - lastIdx) * timeScale);
+            }
+            else if (inputIdx + 1 >= prevNumSamples)
+            {
+                timestamp = timestamps[inputIdx * prevChannels];
+            }
+            else
+            {
+                float t0 = timestamps[inputIdx * prevChannels];
+                float t1 = timestamps[(inputIdx + 1) * prevChannels];
+                timestamp = t0 + static_cast<float>(frac) * (t1 - t0);
+            }
+            for (int ch = 0; ch < outputChannels; ++ch)
+            {
+                output[i * outputChannels + ch] = timestamp;
+            }
+        }
+#else
+        // ========================================
+        // Scalar Fallback (universal)
+        // ========================================
+        for (size_t i = 0; i < numOutputSamples; ++i)
+        {
+            double inputTime = i * timeScale;
+            size_t inputIdx = static_cast<size_t>(inputTime);
+            double frac = inputTime - inputIdx;
+            float timestamp;
+            if (inputIdx >= prevNumSamples)
+            {
+                size_t lastIdx = prevNumSamples - 1;
+                timestamp = timestamps[lastIdx * prevChannels] +
+                            static_cast<float>((inputTime - lastIdx) * timeScale);
+            }
+            else if (inputIdx + 1 >= prevNumSamples)
+            {
+                timestamp = timestamps[inputIdx * prevChannels];
+            }
+            else
+            {
+                float t0 = timestamps[inputIdx * prevChannels];
+                float t1 = timestamps[(inputIdx + 1) * prevChannels];
+                timestamp = t0 + static_cast<float>(frac) * (t1 - t0);
+            }
+            for (int ch = 0; ch < outputChannels; ++ch)
+            {
+                output[i * outputChannels + ch] = timestamp;
+            }
+        }
+#endif
+    }
     /**
      * AsyncWorker for processing DSP pipeline in background thread
      */
@@ -1278,34 +1875,46 @@ namespace dsp
               m_timestampRef(std::move(timestampRef)),
               m_busyLock(busyLock)
         {
+            // std::cout << "[DEBUG] ProcessWorker::ProcessWorker - this=" << this << std::endl;
+            m_stageCount = m_stages.size();
+            m_stageTypes.reserve(m_stageCount);
+            for (const auto &stage : m_stages)
+            {
+                m_stageTypes.push_back(stage->getType());
+            }
         }
     protected:
         // This runs on a worker thread (not blocking the event loop)
         void Execute() override
         {
-            // Local storage for generated timestamps (RAII - automatically freed when function exits)
+            // std::cout << "[DEBUG] ProcessWorker::Execute - START, this=" << this
+            //           << ", data=" << m_data << ", numSamples=" << m_numSamples
+            //           << ", channels=" << m_channels << std::endl;
+            // std::cout << "[WORKER-" << std::this_thread::get_id() << "] Execute START (stages="
+            //           << m_stages.size() << ")" << std::endl;
+            // CRITICAL FIX: Use a unique_ptr for timestamp ownership
             std::vector<float> generatedTimestamps;
+            std::unique_ptr<std::vector<float>> allocatedTimestamps;
             try
             {
-                // 1. Generate Timestamps if missing (Optimization)
+                // 1. Generate Timestamps if missing
                 if (m_timestamps == nullptr)
                 {
-                    generatedTimestamps.resize(m_numSamples);
+                    // std::cout << "[DEBUG] Execute - generating timestamps, sampleRate=" << m_sampleRate << std::endl;
-                    // Calculate time step (dt) in milliseconds
-                    // If sampleRate is 0 or invalid, default to 1.0 (treating indices as time)
+                    generatedTimestamps.resize(m_numSamples);
                     double dt = (m_sampleRate > 0.0) ? (1000.0 / m_sampleRate) : 1.0;
-                    // Fill timestamps linearly: t[i] = i * dt
                     for (size_t i = 0; i < m_numSamples; ++i)
                     {
                         generatedTimestamps[i] = static_cast<float>(i * dt);
                     }
-                    // Point the main processing pointer to our locally generated data
                     m_timestamps = generatedTimestamps.data();
+                    // std::cout << "[DEBUG] Execute - timestamps generated, addr=" << m_timestamps << std::endl;
                 }
                 // 2. Process the buffer through all stages
@@ -1315,79 +1924,105 @@ namespace dsp
                 bool usingTempBuffer = false;
                 const bool debugStageDumps = std::getenv("DSPX_DEBUG_STAGE_DUMPS") != nullptr;
-                for (const auto &stage : m_stages)
+                // std::cout << "[DEBUG] Execute - processing through " << m_stages.size() << " stages" << std::endl;
+                for (size_t stageIdx = 0; stageIdx < m_stages.size(); ++stageIdx)
                 {
+                    const auto &stage = m_stages[stageIdx];
+                    // std::cout << "[DEBUG] Execute - stage " << stageIdx << ", type="
+                    //           << stage->getType() << ", addr=" << stage.get()
+                    //           << ", isResizing=" << stage->isResizing() << std::endl;
                     if (stage->isResizing())
                     {
-                        // Resizing logic (same as before)
+                        // Calculate output size
                         size_t outputSize = stage->calculateOutputSize(currentSize);
                         float *outputBuffer = new float[outputSize];
+                        // std::cout << "[DEBUG] Execute - allocated output buffer, size=" << outputSize
+                        //           << ", addr=" << outputBuffer << std::endl;
+                        // CRITICAL: Save the PREVIOUS size before processResizing updates currentSize
+                        size_t prevSize = currentSize;
                         size_t actualOutputSize = 0;
                         stage->processResizing(currentBuffer, currentSize,
                                                outputBuffer, actualOutputSize,
                                                m_channels, m_timestamps);
-                        if (usingTempBuffer)
-                            delete[] currentBuffer;
+                        // std::cout << "[DEBUG] Execute - stage " << stageIdx << " resized: "
+                        //           << prevSize << " -> " << actualOutputSize // Use prevSize!
+                        //           << ", buffer=" << outputBuffer << std::endl;
+                        // Free previous temp buffer if we owned it
+                        if (usingTempBuffer && tempBuffer != nullptr)
+                        {
+                            //  std::cout << "[DEBUG] Execute - freeing previous temp buffer=" << tempBuffer << std::endl;
+                            delete[] tempBuffer;
+                        }
+                        // Update buffer tracking
+                        tempBuffer = outputBuffer;
                         currentBuffer = outputBuffer;
                         currentSize = actualOutputSize;
                         usingTempBuffer = true;
+                        // Save previous channel count BEFORE updating
+                        int prevChannels = m_channels;
+                        // Update channel count if stage changed it
                         int outputChannels = stage->getOutputChannels();
                         if (outputChannels > 0)
+                        {
+                            // std::cout << "[DEBUG] Execute - channels changed: " << m_channels
+                            //           << " -> " << outputChannels << std::endl;
                             m_channels = outputChannels;
+                        }
-                        // Re-interpolate timestamps if needed (same as before)
+                        // Re-interpolate timestamps if needed
                         if (m_timestamps != nullptr)
                         {
+                            // std::cout << "[DEBUG] Execute - reinterpolating timestamps" << std::endl;
                             double timeScale = stage->getTimeScaleFactor();
                             size_t numOutputSamples = actualOutputSize / m_channels;
-                            float *newTimestamps = new float[actualOutputSize];
-                            for (size_t i = 0; i < numOutputSamples; ++i)
-                            {
-                                double inputTime = i * timeScale;
-                                size_t inputIdx = static_cast<size_t>(inputTime);
-                                double frac = inputTime - inputIdx;
-                                float timestamp;
-                                if (inputIdx >= (currentSize / m_channels))
-                                {
-                                    size_t lastIdx = (currentSize / m_channels) - 1;
-                                    timestamp = m_timestamps[lastIdx * m_channels] +
-                                                static_cast<float>((inputTime - lastIdx) * timeScale);
-                                }
-                                else if (inputIdx + 1 >= (currentSize / m_channels))
-                                {
-                                    timestamp = m_timestamps[inputIdx * m_channels];
-                                }
-                                else
-                                {
-                                    float t0 = m_timestamps[inputIdx * m_channels];
-                                    float t1 = m_timestamps[(inputIdx + 1) * m_channels];
-                                    timestamp = t0 + static_cast<float>(frac) * (t1 - t0);
-                                }
-                                for (int ch = 0; ch < m_channels; ++ch)
-                                {
-                                    newTimestamps[i * m_channels + ch] = timestamp;
-                                }
-                            }
-                            m_timestamps = newTimestamps;
-                            m_timestampBuffer.reset(newTimestamps);
+                            // CRITICAL FIX: Use prevSize and prevChannels!
+                            size_t prevNumSamples = prevSize / prevChannels;
+                            // Create new timestamp vector
+                            auto newTimestamps = std::make_unique<std::vector<float>>(actualOutputSize);
+                            // Use SIMD-optimized interpolation
+                            interpolateTimestampsSIMD(
+                                m_timestamps,
+                                prevNumSamples,
+                                prevChannels,
+                                numOutputSamples,
+                                m_channels,
+                                timeScale,
+                                *newTimestamps);
+                            // CRITICAL FIX: Transfer ownership safely
+                            allocatedTimestamps = std::move(newTimestamps);
+                            m_timestamps = allocatedTimestamps->data();
+                            // std::cout << "[DEBUG] Execute - timestamps reinterpolated (SIMD), new addr="
+                            //           << m_timestamps << std::endl;
                         }
                     }
                     else
                     {
                         // In-place processing
+                        // std::cout << "[DEBUG] Execute - stage " << stageIdx << " in-place processing" << std::endl;
                         stage->process(currentBuffer, currentSize, m_channels, m_timestamps);
                         if (debugStageDumps)
                         {
                             const char *stype = stage->getType();
                             size_t toShow = std::min<size_t>(8, currentSize);
-                            std::cout << "[DUMP] after '" << stype << "':";
+                            // std::cout << "[DUMP] after '" << stype << "':";
                             for (size_t i = 0; i < toShow; ++i)
                             {
                                 std::cout << (i == 0 ? ' ' : ',') << currentBuffer[i];
@@ -1400,16 +2035,22 @@ namespace dsp
                 m_finalBuffer = currentBuffer;
                 m_finalSize = currentSize;
                 m_ownsBuffer = usingTempBuffer;
+                // std::cout << "[DEBUG] Execute - COMPLETE, finalBuffer=" << m_finalBuffer
+                //           << ", finalSize=" << m_finalSize << ", ownsBuffer=" << m_ownsBuffer << std::endl;
             }
             catch (const std::exception &e)
             {
+                // std::cout << "[DEBUG] Execute - EXCEPTION: " << e.what() << ", this=" << this << std::endl;
+                // std::cout << "[WORKER-" << std::this_thread::get_id() << "] EXCEPTION: " << e.what() << std::endl;
                 SetError(e.what());
             }
-        }
+        } // This runs on the main thread after Execute() completes
-        // This runs on the main thread after Execute() completes
         // Success callback: clears the busy flag, frees the worker-owned result
         // buffer when m_ownsBuffer is set, and resolves the promise with outputArray.
         // outputArray is constructed in lines elided from this diff hunk (not shown).
         // NOTE(review): presumably called by the Napi::AsyncWorker base after
         // Execute() returns without SetError() - base class not visible here; confirm.
         void OnOK() override
         {
+            // std::cout << "[DEBUG] ProcessWorker::OnOK - START, this=" << this
+            //   << ", finalBuffer=" << (void *)m_finalBuffer << ", finalSize=" << m_finalSize << std::endl;
             *m_busyLock = false; // unlock the pipeline
             Napi::Env env = Env();
@@ -1423,22 +2064,29 @@ namespace dsp
             // Clean up temporary buffer if we allocated one
             // (Execute() sets m_ownsBuffer from usingTempBuffer, i.e. a resizing stage ran)
             if (m_ownsBuffer)
             {
+                // std::cout << "[DEBUG] OnOK - deleting temp buffer=" << (void *)m_finalBuffer << std::endl;
                 delete[] m_finalBuffer;
             }
+            // std::cout << "[DEBUG] OnOK - COMPLETE, resolving promise, this=" << this << std::endl;
             // Resolve the promise with the processed buffer
             m_deferred.Resolve(outputArray);
         }
         // Failure callback: rejects the promise with the error's JS value, then
         // clears the busy flag so the pipeline can be used again.
         // NOTE(review): presumably called by the Napi::AsyncWorker base after
         // Execute() calls SetError() - base class not visible here; confirm.
         // NOTE(review): nothing is freed here, whereas OnOK() deletes m_finalBuffer
         // when m_ownsBuffer is set; Execute() assigns those members only on success.
         // Confirm that buffers allocated by resizing stages are released on failure.
         void OnError(const Napi::Error &error) override
         {
+            // std::cout << "[DEBUG] ProcessWorker::OnError - this=" << this
+            //           << ", error=" << error.Message() << std::endl;
             m_deferred.Reject(error.Value());
             *m_busyLock = false; // unlock the pipeline
+            // std::cout << "[DEBUG] OnError - COMPLETE, this=" << this << std::endl;
         }
     private:
         Napi::Promise::Deferred m_deferred;
         std::vector<std::unique_ptr<IDspStage>> &m_stages;
+        size_t m_stageCount;
+        std::vector<std::string> m_stageTypes;
         float *m_data;
         float *m_timestamps;
         double m_sampleRate;
@@ -1469,16 +2117,19 @@ namespace dsp
     Napi::Value DspPipeline::ProcessAsync(const Napi::CallbackInfo &info)
     {
         Napi::Env env = info.Env();
+        // std::cout << "[DEBUG] DspPipeline::ProcessAsync - this=" << this << std::endl;
         // Check if pipeline is disposed
         if (m_disposed)
         {
+            // std::cout << "[DEBUG] ProcessAsync - pipeline disposed, this=" << this << std::endl;
             Napi::Error::New(env, "Pipeline is disposed").ThrowAsJavaScriptException();
             return env.Undefined();
         }
         if (*m_isBusy)
         {
+            // std::cout << "[DEBUG] ProcessAsync - pipeline busy, this=" << this << std::endl;
             Napi::Error::New(env, "Pipeline is busy: Cannot call process() while another operation is running.").ThrowAsJavaScriptException();
             return env.Undefined();
         }
@@ -1547,8 +2198,13 @@ namespace dsp
         }
         *m_isBusy = true; // lock the pipeline
+        // std::cout << "[DEBUG] ProcessAsync - creating worker, data=" << (void *)data
+        //           << ", numSamples=" << numSamples << ", channels=" << channels
+        //           << ", this=" << this << std::endl;
         ProcessWorker *worker = new ProcessWorker(env, std::move(deferred), m_stages, data, timestamps, sampleRate, numSamples, channels, std::move(bufferRef), std::move(timestampRef), m_isBusy);
+        // std::cout << "[DEBUG] ProcessAsync - queuing worker=" << (void *)worker
+        //           << ", this=" << this << std::endl;
         worker->Queue();
         return promise;
@@ -1566,16 +2222,19 @@ namespace dsp
     Napi::Value DspPipeline::ProcessSync(const Napi::CallbackInfo &info)
     {
         Napi::Env env = info.Env();
+        // std::cout << "[DEBUG] DspPipeline::ProcessSync - this=" << this << std::endl;
         // Check if pipeline is disposed
         if (m_disposed)
         {
+            // std::cout << "[DEBUG] ProcessSync - pipeline disposed, this=" << this << std::endl;
             Napi::Error::New(env, "Pipeline is disposed").ThrowAsJavaScriptException();
             return env.Undefined();
         }
         if (*m_isBusy)
         {
+            // std::cout << "[DEBUG] ProcessSync - pipeline busy, this=" << this << std::endl;
             Napi::Error::New(env, "Pipeline is busy: Cannot call processSync() while an async operation is running.").ThrowAsJavaScriptException();
             return env.Undefined();
         }
@@ -1702,10 +2361,13 @@ namespace dsp
     Napi::Value DspPipeline::SaveState(const Napi::CallbackInfo &info)
     {
         Napi::Env env = info.Env();
+        // std::cout << "[DEBUG] DspPipeline::SaveState - this=" << this
+        //           << ", stages=" << m_stages.size() << std::endl;
         // Check if pipeline is disposed
         if (m_disposed)
         {
+            // std::cout << "[DEBUG] SaveState - pipeline disposed, this=" << this << std::endl;
             Napi::Error::New(env, "Pipeline is disposed").ThrowAsJavaScriptException();
             return env.Undefined();
         }
@@ -1799,10 +2461,12 @@ namespace dsp
     Napi::Value DspPipeline::LoadState(const Napi::CallbackInfo &info)
     {
         Napi::Env env = info.Env();
+        // std::cout << "[DEBUG] DspPipeline::LoadState - this=" << this
+        //           << ", current stages=" << m_stages.size() << std::endl;
         // Check if pipeline is disposed
         if (m_disposed)
         {
+            // std::cout << "[DEBUG] LoadState - pipeline disposed, this=" << this << std::endl;
             Napi::Error::New(env, "Pipeline is disposed").ThrowAsJavaScriptException();
             return env.Undefined();
         }
@@ -2046,21 +2710,27 @@ namespace dsp
     // Calls reset() on every stage in m_stages and returns undefined.
     // Throws a JS Error ("Pipeline is disposed") and returns undefined when
     // m_disposed is set. The argument list in `info` is only used to obtain the Env.
     // NOTE(review): unlike Dispose(), ProcessAsync() and ProcessSync(), this does
     // not check *m_isBusy. ProcessWorker holds a reference to m_stages, so confirm
     // that resetting stages while an async process() is in flight is safe.
     Napi::Value DspPipeline::ClearState(const Napi::CallbackInfo &info)
     {
         Napi::Env env = info.Env();
+        // std::cout << "[DEBUG] DspPipeline::ClearState - this=" << this
+        //   << ", stages=" << m_stages.size() << std::endl;
         // Check if pipeline is disposed
         if (m_disposed)
         {
+            // std::cout << "[DEBUG] ClearState - pipeline disposed, this=" << this << std::endl;
             Napi::Error::New(env, "Pipeline is disposed").ThrowAsJavaScriptException();
             return env.Undefined();
         }
         // Reset all stages
-        for (auto &stage : m_stages)
+        for (size_t i = 0; i < m_stages.size(); ++i)
         {
-            stage->reset();
+            // std::cout << "[DEBUG] ClearState - resetting stage " << i
+            //           << ", addr=" << m_stages[i].get() << std::endl;
+            m_stages[i]->reset();
         }
-        std::cout << "Pipeline state cleared (" << m_stages.size() << " stages reset)" << std::endl;
+        // std::cout << "[DEBUG] Pipeline state cleared (" << m_stages.size()
+        //           << " stages reset), this=" << this << std::endl;
         return env.Undefined();
     }
@@ -2156,21 +2826,27 @@ namespace dsp
     // Empties m_stages, clears the busy flag and marks the pipeline disposed.
     // Always returns undefined. Returns immediately when m_disposed is already
     // set; throws a JS Error when *m_isBusy is set (processing still running).
     // The argument list in `info` is only used to obtain the Env.
     Napi::Value DspPipeline::Dispose(const Napi::CallbackInfo &info)
     {
         Napi::Env env = info.Env();
+        // std::cout << "[DEBUG] DspPipeline::Dispose - this=" << this
+        //           << ", stages=" << m_stages.size() << ", disposed=" << m_disposed << std::endl;
         // Already disposed - silently succeed (idempotent behavior)
         if (m_disposed)
         {
+            // std::cout << "[DEBUG] Dispose - already disposed, this=" << this << std::endl;
             return env.Undefined();
         }
         // Cannot dispose while processing is in progress
         if (*m_isBusy)
         {
+            // std::cout << "[DEBUG] Dispose - pipeline busy, cannot dispose, this=" << this << std::endl;
             Napi::Error::New(env, "Cannot dispose pipeline: process() is still running.")
                 .ThrowAsJavaScriptException();
             return env.Undefined();
         }
+        // std::cout << "[DEBUG] Dispose - clearing " << m_stages.size()
+        //           << " stages, this=" << this << std::endl;
         // Clear all stages - triggers RAII cleanup of all stage resources
         // This will:
         // - Free all stage internal buffers
@@ -2179,12 +2855,14 @@ namespace dsp
         // - Free all detachable buffers
         // - Free timestamp and resize buffers
         m_stages.clear();
+        // std::cout << "[DEBUG] Dispose - stages cleared, this=" << this << std::endl;
         // Reset busy flag (defensive programming)
         // (the flag was already observed false by the busy check above)
         *m_isBusy = false;
         // Mark as disposed to prevent further operations
         m_disposed = true;
+        // std::cout << "[DEBUG] Dispose - complete, this=" << this << std::endl;
         return env.Undefined();
     }