dspx 1.3.4 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "dspx",
3
- "version": "1.3.4",
3
+ "version": "1.3.5",
4
4
  "description": "High-performance DSP library with native C++ acceleration and Redis state persistence",
5
5
  "main": "./dist/index.js",
6
6
  "types": "./dist/index.d.ts",
Binary file
@@ -51,6 +51,30 @@ namespace dsp
51
51
  #include <cstdlib>
52
52
  #include "utils/Toon.h"
53
53
 
54
// SIMD capability detection for timestamp interpolation.
// Priority: AVX2 (8-wide) > SSE2 (4-wide) > NEON (4-wide) > Scalar
//
// All three HAS_* macros are always defined (to 0 or 1) and at most one of
// them is 1, so they can be tested with plain `#if HAS_xxx`.
//
// The SSE tier requires SSE2, not just SSE: the code gated by HAS_SSE uses
// <emmintrin.h> integer intrinsics (_mm_cvttps_epi32, _mm_store_si128), which
// do not exist on SSE1-only targets. Such targets now fall through to the
// scalar path instead of failing to compile.
#if defined(__AVX2__) // set by GCC/Clang with -mavx2 and by MSVC with /arch:AVX2
#include <immintrin.h>
#define HAS_AVX2 1
#define HAS_SSE 0
#define HAS_NEON 0
#elif defined(__SSE2__) || \
    (defined(_MSC_VER) && (defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)))
#include <xmmintrin.h> // SSE
#include <emmintrin.h> // SSE2
#define HAS_AVX2 0
#define HAS_SSE 1
#define HAS_NEON 0
#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#define HAS_AVX2 0
#define HAS_SSE 0
#define HAS_NEON 1
#else
#define HAS_AVX2 0
#define HAS_SSE 0
#define HAS_NEON 0
#endif
77
+
54
78
  namespace dsp
55
79
  {
56
80
 
@@ -1259,6 +1283,569 @@ namespace dsp
1259
1283
  return env.Undefined();
1260
1284
  }
1261
1285
 
1286
/**
 * Timestamp interpolation for stages that change the number of samples.
 *
 * Output sample i is mapped to the fractional source position i * timeScale:
 *   - position inside the source with a right-hand neighbour: linear
 *     interpolation between the two surrounding source timestamps;
 *   - position on the last source sample: that sample's timestamp is reused;
 *   - position outside the source (past the end, negative, or NaN): the last
 *     source timestamp plus (position - lastIndex) * timeScale.
 * The resulting value is written to every channel of output sample i.
 *
 * Every position is evaluated in double precision with one code path on all
 * platforms. Earlier per-ISA variants computed the position in float32 for
 * full vector batches and in double for the remainder, so results depended on
 * the target CPU and on where a sample fell inside a batch, and indices above
 * 2^24 were no longer exact. The vector part only covered the multiply and the
 * truncation; the name is kept so existing callers continue to compile.
 *
 * NOTE(review): the extrapolation term multiplies (position - lastIndex) by
 * timeScale a second time. This is carried over unchanged from the previous
 * scalar loop - confirm the intended unit with the stage authors.
 *
 * @param timestamps       Source timestamps; element [s * prevChannels] is read
 *                         for source sample s (first channel of each frame)
 * @param prevNumSamples   Number of samples in source
 * @param prevChannels     Number of channels in source (stride between samples)
 * @param numOutputSamples Number of samples to generate
 * @param outputChannels   Number of channels in output
 * @param timeScale        Source samples advanced per output sample
 * @param output           Output timestamp vector; element
 *                         [i * outputChannels + ch] receives the timestamp of
 *                         output sample i. Grown if it is too small.
 *
 * If there is no source data (null pointer, zero samples, non-positive channel
 * count) or nothing to produce, the function returns without touching output.
 */
inline void interpolateTimestampsSIMD(
    const float *timestamps,
    size_t prevNumSamples,
    int prevChannels,
    size_t numOutputSamples,
    int outputChannels,
    double timeScale,
    std::vector<float> &output)
{
    // Without this guard `prevNumSamples - 1` would wrap around and the
    // first read would land far outside the source buffer.
    if (timestamps == nullptr || prevNumSamples == 0 || prevChannels <= 0 ||
        numOutputSamples == 0 || outputChannels <= 0)
    {
        return;
    }

    const size_t srcStride = static_cast<size_t>(prevChannels);
    const size_t dstStride = static_cast<size_t>(outputChannels);

    // Never write past the end of the caller's buffer.
    const size_t requiredSize = numOutputSamples * dstStride;
    if (output.size() < requiredSize)
    {
        output.resize(requiredSize);
    }

    const size_t lastIdx = prevNumSamples - 1;
    const float lastTimestamp = timestamps[lastIdx * srcStride];
    const double srcLength = static_cast<double>(prevNumSamples);

    for (size_t i = 0; i < numOutputSamples; ++i)
    {
        const double inputTime = static_cast<double>(i) * timeScale;
        float timestamp;

        // Classify on the floating-point value *before* converting to an
        // index: casting a negative, NaN or oversized double to size_t is
        // undefined behaviour. The negated form also routes NaN here.
        if (!(inputTime >= 0.0 && inputTime < srcLength))
        {
            timestamp = lastTimestamp +
                        static_cast<float>((inputTime - static_cast<double>(lastIdx)) * timeScale);
        }
        else
        {
            const size_t inputIdx = static_cast<size_t>(inputTime);

            if (inputIdx >= lastIdx)
            {
                // On the final source sample: no right-hand neighbour to blend with.
                timestamp = timestamps[inputIdx * srcStride];
            }
            else
            {
                const double frac = inputTime - static_cast<double>(inputIdx);
                const float t0 = timestamps[inputIdx * srcStride];
                const float t1 = timestamps[(inputIdx + 1) * srcStride];
                timestamp = t0 + static_cast<float>(frac) * (t1 - t0);
            }
        }

        // Replicate the timestamp across all channels of this output sample.
        float *frame = output.data() + i * dstStride;
        for (size_t ch = 0; ch < dstStride; ++ch)
        {
            frame[ch] = timestamp;
        }
    }
}
1848
+
1262
1849
  /**
1263
1850
  * AsyncWorker for processing DSP pipeline in background thread
1264
1851
  */
@@ -1407,41 +1994,21 @@ namespace dsp
1407
1994
  // Create new timestamp vector
1408
1995
  auto newTimestamps = std::make_unique<std::vector<float>>(actualOutputSize);
1409
1996
 
1410
- for (size_t i = 0; i < numOutputSamples; ++i)
1411
- {
1412
- double inputTime = i * timeScale;
1413
- size_t inputIdx = static_cast<size_t>(inputTime);
1414
- double frac = inputTime - inputIdx;
1415
- float timestamp;
1416
-
1417
- if (inputIdx >= prevNumSamples)
1418
- {
1419
- size_t lastIdx = prevNumSamples - 1;
1420
- timestamp = m_timestamps[lastIdx * prevChannels] +
1421
- static_cast<float>((inputTime - lastIdx) * timeScale);
1422
- }
1423
- else if (inputIdx + 1 >= prevNumSamples)
1424
- {
1425
- timestamp = m_timestamps[inputIdx * prevChannels];
1426
- }
1427
- else
1428
- {
1429
- float t0 = m_timestamps[inputIdx * prevChannels];
1430
- float t1 = m_timestamps[(inputIdx + 1) * prevChannels];
1431
- timestamp = t0 + static_cast<float>(frac) * (t1 - t0);
1432
- }
1433
-
1434
- for (int ch = 0; ch < m_channels; ++ch)
1435
- {
1436
- (*newTimestamps)[i * m_channels + ch] = timestamp;
1437
- }
1438
- }
1997
+ // Use SIMD-optimized interpolation
1998
+ interpolateTimestampsSIMD(
1999
+ m_timestamps,
2000
+ prevNumSamples,
2001
+ prevChannels,
2002
+ numOutputSamples,
2003
+ m_channels,
2004
+ timeScale,
2005
+ *newTimestamps);
1439
2006
 
1440
2007
  // CRITICAL FIX: Transfer ownership safely
1441
2008
  allocatedTimestamps = std::move(newTimestamps);
1442
2009
  m_timestamps = allocatedTimestamps->data();
1443
2010
 
1444
- // std::cout << "[DEBUG] Execute - timestamps reinterpolated, new addr="
2011
+ // std::cout << "[DEBUG] Execute - timestamps reinterpolated (SIMD), new addr="
1445
2012
  // << m_timestamps << std::endl;
1446
2013
  }
1447
2014
  }