dspx 1.3.4 → 1.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/prebuilds/win32-x64/dspx.node +0 -0
- package/src/native/DspPipeline.cc +597 -30
package/package.json
CHANGED
|
Binary file
|
|
@@ -51,6 +51,30 @@ namespace dsp
|
|
|
51
51
|
#include <cstdlib>
|
|
52
52
|
#include "utils/Toon.h"
|
|
53
53
|
|
|
54
|
+
// SIMD optimizations for timestamp interpolation
|
|
55
|
+
// Priority: AVX2 (8-wide) > SSE (4-wide) > NEON (4-wide) > Scalar
|
|
56
|
+
#if defined(__AVX2__) || (defined(_MSC_VER) && defined(__AVX2__))
|
|
57
|
+
#include <immintrin.h>
|
|
58
|
+
#define HAS_AVX2 1
|
|
59
|
+
#define HAS_SSE 0
|
|
60
|
+
#define HAS_NEON 0
|
|
61
|
+
#elif defined(__SSE__) || defined(__SSE2__) || (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)))
|
|
62
|
+
#include <emmintrin.h> // SSE2
|
|
63
|
+
#include <xmmintrin.h> // SSE
|
|
64
|
+
#define HAS_AVX2 0
|
|
65
|
+
#define HAS_SSE 1
|
|
66
|
+
#define HAS_NEON 0
|
|
67
|
+
#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
|
|
68
|
+
#include <arm_neon.h>
|
|
69
|
+
#define HAS_AVX2 0
|
|
70
|
+
#define HAS_SSE 0
|
|
71
|
+
#define HAS_NEON 1
|
|
72
|
+
#else
|
|
73
|
+
#define HAS_AVX2 0
|
|
74
|
+
#define HAS_SSE 0
|
|
75
|
+
#define HAS_NEON 0
|
|
76
|
+
#endif
|
|
77
|
+
|
|
54
78
|
namespace dsp
|
|
55
79
|
{
|
|
56
80
|
|
|
@@ -1259,6 +1283,569 @@ namespace dsp
|
|
|
1259
1283
|
return env.Undefined();
|
|
1260
1284
|
}
|
|
1261
1285
|
|
|
/**
 * SIMD-optimized timestamp interpolation for resizing stages.
 *
 * Multi-platform support:
 *   - AVX2 (x86_64): 8-wide vectorization
 *   - SSE2 (x86):    4-wide vectorization
 *   - NEON (ARM):    4-wide vectorization
 *   - Scalar fallback for all other platforms
 *
 * The vector units compute the per-output input time, its truncated index,
 * and the fractional part; the clamp/extrapolate/lerp decision is inherently
 * branchy, so it runs through a single shared scalar kernel. (The previous
 * revision duplicated the SSE and NEON branches behind unreachable second
 * `#elif HAS_SSE` / `#elif HAS_NEON` tests and copy-pasted the kernel six
 * times; both issues are fixed here, along with unused per-branch constants.)
 *
 * @param timestamps       Source timestamp array (channel-major layout; one
 *                         shared timestamp per frame is read from channel 0)
 * @param prevNumSamples   Number of samples in source
 * @param prevChannels     Number of channels in source (stride into timestamps)
 * @param numOutputSamples Number of samples to generate
 * @param outputChannels   Number of channels in output
 * @param timeScale        Time scaling factor from stage
 * @param output           Output timestamp vector; must hold at least
 *                         numOutputSamples * outputChannels floats
 */
inline void interpolateTimestampsSIMD(
    const float *timestamps,
    size_t prevNumSamples,
    int prevChannels,
    size_t numOutputSamples,
    int outputChannels,
    double timeScale,
    std::vector<float> &output)
{
    // Guard: with no source samples, `prevNumSamples - 1` below would wrap
    // around (size_t underflow) and read far out of bounds.
    if (prevNumSamples == 0 || numOutputSamples == 0)
        return;

    // Shared per-sample kernel: clamp / extrapolate / linear-interpolate one
    // output frame and replicate it across all output channels.
    auto writeSample = [&](size_t i, double inputTime, size_t inputIdx, double frac)
    {
        float timestamp;
        if (inputIdx >= prevNumSamples)
        {
            // Past the end of the source: extrapolate from the last timestamp.
            size_t lastIdx = prevNumSamples - 1;
            timestamp = timestamps[lastIdx * prevChannels] +
                        static_cast<float>((inputTime - lastIdx) * timeScale);
        }
        else if (inputIdx + 1 >= prevNumSamples)
        {
            // Last source sample: no right neighbour to interpolate toward.
            timestamp = timestamps[inputIdx * prevChannels];
        }
        else
        {
            float t0 = timestamps[inputIdx * prevChannels];
            float t1 = timestamps[(inputIdx + 1) * prevChannels];
            timestamp = t0 + static_cast<float>(frac) * (t1 - t0);
        }

        for (int ch = 0; ch < outputChannels; ++ch)
        {
            output[i * outputChannels + ch] = timestamp;
        }
    };

    // Scalar loop used for the SIMD remainder and as the universal fallback.
    auto scalarRange = [&](size_t start)
    {
        for (size_t i = start; i < numOutputSamples; ++i)
        {
            double inputTime = i * timeScale;
            size_t inputIdx = static_cast<size_t>(inputTime);
            writeSample(i, inputTime, inputIdx, inputTime - inputIdx);
        }
    };

#if HAS_AVX2
    // ========================================
    // AVX2 Implementation (8-wide)
    // ========================================
    const size_t simdWidth = 8;
    const size_t simdIterations = numOutputSamples / simdWidth;

    const __m256 vTimeScale = _mm256_set1_ps(static_cast<float>(timeScale));

    for (size_t iter = 0; iter < simdIterations; ++iter)
    {
        size_t baseIdx = iter * simdWidth;

        // Generate indices: [baseIdx, baseIdx+1, ..., baseIdx+7]
        // (_mm256_set_ps takes arguments highest lane first).
        __m256 vIdx = _mm256_set_ps(
            static_cast<float>(baseIdx + 7),
            static_cast<float>(baseIdx + 6),
            static_cast<float>(baseIdx + 5),
            static_cast<float>(baseIdx + 4),
            static_cast<float>(baseIdx + 3),
            static_cast<float>(baseIdx + 2),
            static_cast<float>(baseIdx + 1),
            static_cast<float>(baseIdx + 0));

        // inputTime = i * timeScale; split into integer index + fraction.
        __m256 vInputTime = _mm256_mul_ps(vIdx, vTimeScale);
        __m256i vInputIdx = _mm256_cvttps_epi32(vInputTime);
        __m256 vFrac = _mm256_sub_ps(vInputTime, _mm256_cvtepi32_ps(vInputIdx));

        // Spill lanes and finish with the (branchy) scalar kernel.
        alignas(32) float inputTimes[8];
        alignas(32) int inputIndices[8];
        alignas(32) float fracs[8];
        _mm256_store_ps(inputTimes, vInputTime);
        _mm256_store_si256(reinterpret_cast<__m256i *>(inputIndices), vInputIdx);
        _mm256_store_ps(fracs, vFrac);

        for (size_t j = 0; j < simdWidth; ++j)
        {
            writeSample(baseIdx + j, inputTimes[j],
                        static_cast<size_t>(inputIndices[j]), fracs[j]);
        }
    }

    // Handle remainder samples with scalar code.
    scalarRange(simdIterations * simdWidth);
#elif HAS_SSE
    // ========================================
    // SSE2 Implementation (4-wide)
    // ========================================
    const size_t simdWidth = 4;
    const size_t simdIterations = numOutputSamples / simdWidth;

    const __m128 vTimeScale = _mm_set1_ps(static_cast<float>(timeScale));

    for (size_t iter = 0; iter < simdIterations; ++iter)
    {
        size_t baseIdx = iter * simdWidth;

        // Generate indices [baseIdx, baseIdx+1, baseIdx+2, baseIdx+3]
        alignas(16) float indices[4] = {
            static_cast<float>(baseIdx),
            static_cast<float>(baseIdx + 1),
            static_cast<float>(baseIdx + 2),
            static_cast<float>(baseIdx + 3)};
        __m128 vInputTime = _mm_mul_ps(_mm_load_ps(indices), vTimeScale);

        // Split into integer index + fraction.
        __m128i vInputIdx = _mm_cvttps_epi32(vInputTime);
        __m128 vFrac = _mm_sub_ps(vInputTime, _mm_cvtepi32_ps(vInputIdx));

        // Spill lanes and finish with the scalar kernel.
        alignas(16) float inputTimes[4];
        alignas(16) int inputIndices[4];
        alignas(16) float fractions[4];
        _mm_store_ps(inputTimes, vInputTime);
        _mm_store_si128(reinterpret_cast<__m128i *>(inputIndices), vInputIdx);
        _mm_store_ps(fractions, vFrac);

        for (size_t j = 0; j < simdWidth; ++j)
        {
            writeSample(baseIdx + j, inputTimes[j],
                        static_cast<size_t>(inputIndices[j]), fractions[j]);
        }
    }

    // Handle remainder.
    scalarRange(simdIterations * simdWidth);
#elif HAS_NEON
    // ========================================
    // ARM NEON Implementation (4-wide)
    // ========================================
    const size_t simdWidth = 4;
    const size_t simdIterations = numOutputSamples / simdWidth;

    const float32x4_t vTimeScale = vdupq_n_f32(static_cast<float>(timeScale));

    for (size_t iter = 0; iter < simdIterations; ++iter)
    {
        size_t baseIdx = iter * simdWidth;

        // Generate indices
        alignas(16) float indices[4] = {
            static_cast<float>(baseIdx),
            static_cast<float>(baseIdx + 1),
            static_cast<float>(baseIdx + 2),
            static_cast<float>(baseIdx + 3)};
        float32x4_t vInputTime = vmulq_f32(vld1q_f32(indices), vTimeScale);

        // Split into integer index + fraction.
        int32x4_t vInputIdx = vcvtq_s32_f32(vInputTime);
        float32x4_t vFrac = vsubq_f32(vInputTime, vcvtq_f32_s32(vInputIdx));

        // Spill lanes and finish with the scalar kernel.
        alignas(16) float inputTimes[4];
        alignas(16) int inputIndices[4];
        alignas(16) float fractions[4];
        vst1q_f32(inputTimes, vInputTime);
        vst1q_s32(inputIndices, vInputIdx);
        vst1q_f32(fractions, vFrac);

        for (size_t j = 0; j < simdWidth; ++j)
        {
            writeSample(baseIdx + j, inputTimes[j],
                        static_cast<size_t>(inputIndices[j]), fractions[j]);
        }
    }

    // Handle remainder.
    scalarRange(simdIterations * simdWidth);
#else
    // ========================================
    // Scalar Fallback (universal)
    // ========================================
    scalarRange(0);
#endif
}
|
|
1262
1849
|
/**
|
|
1263
1850
|
* AsyncWorker for processing DSP pipeline in background thread
|
|
1264
1851
|
*/
|
|
@@ -1407,41 +1994,21 @@ namespace dsp
|
|
|
1407
1994
|
// Create new timestamp vector
|
|
1408
1995
|
auto newTimestamps = std::make_unique<std::vector<float>>(actualOutputSize);
|
|
1409
1996
|
|
|
1410
|
-
|
|
1411
|
-
|
|
1412
|
-
|
|
1413
|
-
|
|
1414
|
-
|
|
1415
|
-
|
|
1416
|
-
|
|
1417
|
-
|
|
1418
|
-
|
|
1419
|
-
size_t lastIdx = prevNumSamples - 1;
|
|
1420
|
-
timestamp = m_timestamps[lastIdx * prevChannels] +
|
|
1421
|
-
static_cast<float>((inputTime - lastIdx) * timeScale);
|
|
1422
|
-
}
|
|
1423
|
-
else if (inputIdx + 1 >= prevNumSamples)
|
|
1424
|
-
{
|
|
1425
|
-
timestamp = m_timestamps[inputIdx * prevChannels];
|
|
1426
|
-
}
|
|
1427
|
-
else
|
|
1428
|
-
{
|
|
1429
|
-
float t0 = m_timestamps[inputIdx * prevChannels];
|
|
1430
|
-
float t1 = m_timestamps[(inputIdx + 1) * prevChannels];
|
|
1431
|
-
timestamp = t0 + static_cast<float>(frac) * (t1 - t0);
|
|
1432
|
-
}
|
|
1433
|
-
|
|
1434
|
-
for (int ch = 0; ch < m_channels; ++ch)
|
|
1435
|
-
{
|
|
1436
|
-
(*newTimestamps)[i * m_channels + ch] = timestamp;
|
|
1437
|
-
}
|
|
1438
|
-
}
|
|
1997
|
+
// Use SIMD-optimized interpolation
|
|
1998
|
+
interpolateTimestampsSIMD(
|
|
1999
|
+
m_timestamps,
|
|
2000
|
+
prevNumSamples,
|
|
2001
|
+
prevChannels,
|
|
2002
|
+
numOutputSamples,
|
|
2003
|
+
m_channels,
|
|
2004
|
+
timeScale,
|
|
2005
|
+
*newTimestamps);
|
|
1439
2006
|
|
|
1440
2007
|
// CRITICAL FIX: Transfer ownership safely
|
|
1441
2008
|
allocatedTimestamps = std::move(newTimestamps);
|
|
1442
2009
|
m_timestamps = allocatedTimestamps->data();
|
|
1443
2010
|
|
|
1444
|
-
// std::cout << "[DEBUG] Execute - timestamps reinterpolated, new addr="
|
|
2011
|
+
// std::cout << "[DEBUG] Execute - timestamps reinterpolated (SIMD), new addr="
|
|
1445
2012
|
// << m_timestamps << std::endl;
|
|
1446
2013
|
}
|
|
1447
2014
|
}
|