@absolutejs/voice 0.0.22-beta.577 → 0.0.22-beta.579
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/client/htmxBootstrap.js +186 -5
- package/dist/client/index.d.ts +1 -0
- package/dist/client/index.js +187 -5
- package/dist/client/timeStretch.d.ts +5 -0
- package/dist/core/backchannel.d.ts +6 -0
- package/dist/core/types.d.ts +9 -0
- package/dist/index.d.ts +1 -1
- package/dist/index.js +94 -64
- package/dist/telephony/twilio.d.ts +3 -0
- package/dist/testing/index.js +279 -5
- package/package.json +1 -1
|
@@ -1533,12 +1533,165 @@ var createVoiceController = (path, options = {}) => {
|
|
|
1533
1533
|
};
|
|
1534
1534
|
};
|
|
1535
1535
|
|
|
1536
|
+
// src/client/timeStretch.ts
// Streaming time-domain time stretcher. Audio is cut into Hann-windowed grains
// of two hops each; grains are written at a fixed output hop while the read
// position advances by `hop * speed`, so the tempo changes without resampling.
// Before each grain is taken, a small +/- seek window is searched for the read
// offset that correlates best with the "natural" continuation of the previous
// grain (a WSOLA-style alignment).

// Output hop length in milliseconds (one grain spans two hops).
var HOP_MS = 10;
// Half-width of the alignment search window in milliseconds.
var SEEK_MS = 5;
// Keeps the correlation normalisation finite on silent input.
var ENERGY_EPSILON = 0.000001;
var HALF = 0.5;
var MS_PER_SECOND = 1000;

/**
 * Builds a periodic Hann window (`0.5 - 0.5 * cos(2*pi*i / length)`).
 *
 * @param {number} length Window length in samples.
 * @returns {Float32Array} The window weights.
 */
var makeHann = (length) => {
  const weights = new Float32Array(length);
  for (let index = 0; index < length; index += 1) {
    weights[index] = HALF - HALF * Math.cos(2 * Math.PI * index / length);
  }
  return weights;
};

/**
 * Scores how well `base[start .. start + length)` matches `ref`: the dot
 * product of the two, divided by the root energy of the `base` window.
 * Samples outside either array count as 0.
 *
 * @param {Float32Array} base Signal being searched.
 * @param {number} start Index in `base` where the candidate window begins.
 * @param {Float32Array} ref Reference window.
 * @param {number} length Number of samples to compare.
 * @returns {number} Correlation score (higher is a better match).
 */
var correlationScore = (base, start, ref, length) => {
  let dot = 0;
  let energy = 0;
  for (let index = 0; index < length; index += 1) {
    const sample = base[start + index] ?? 0;
    dot += sample * (ref[index] ?? 0);
    energy += sample * sample;
  }
  return dot / Math.sqrt(energy + ENERGY_EPSILON);
};

/**
 * Windows one grain (two hops) read from `src` at `off` and overlap-adds it.
 *
 * @param {Float32Array} src Channel samples.
 * @param {number} off Index in `src` where the grain starts.
 * @param {Float32Array} tail Second half of the previous grain (already windowed).
 * @param {Float32Array} weights Window of length `2 * hop`.
 * @param {number} hop Hop length in samples.
 * @returns {{ nextTail: Float32Array, out: Float32Array }} `out` is the finished
 *   hop (previous tail + first windowed half); `nextTail` is the second
 *   windowed half, to be added to the next grain.
 */
var overlapAddGrain = (src, off, tail, weights, hop) => {
  const out = new Float32Array(hop);
  const nextTail = new Float32Array(hop);
  for (let index = 0; index < hop; index += 1) {
    out[index] = (tail[index] ?? 0) + (src[off + index] ?? 0) * (weights[index] ?? 0);
    nextTail[index] = (src[off + hop + index] ?? 0) * (weights[hop + index] ?? 0);
  }
  return { nextTail, out };
};

/**
 * Creates a stateful, streaming time stretcher.
 *
 * @returns {{ process: Function, reset: Function }}
 *   - `process(input, speed, rate)` appends one chunk of per-channel
 *     `Float32Array` samples and returns, per channel, every whole hop of
 *     output that can be produced from the input buffered so far (possibly an
 *     empty array). `speed` greater than 1 shortens the output, less than 1
 *     lengthens it. Changing `rate` or the channel count re-initialises all
 *     state. Throws `RangeError` when `speed` is not a positive finite number.
 *   - `reset()` discards buffered input, the overlap tail and the alignment
 *     reference while keeping the current rate/channel configuration.
 */
var createTimeStretcher = () => {
  let sampleRate = 0;
  let channelCount = 0;
  let hop = 0;
  let frameLen = 0;
  let seek = 0;
  let weights = new Float32Array(0);
  // Per-channel input that has not been discarded yet; index 0 of each buffer
  // corresponds to absolute stream position `inputStart`.
  let buffers = [];
  let inputStart = 0;
  // Absolute (possibly fractional) stream position of the next grain.
  let analysisPos = 0;
  let olaTail = [];
  // Samples that would naturally follow the previous grain; null until the
  // first grain has been emitted.
  let naturalRef = null;

  const init = (rate, channels) => {
    sampleRate = rate;
    channelCount = channels;
    hop = Math.max(1, Math.round(sampleRate * HOP_MS / MS_PER_SECOND));
    frameLen = hop * 2;
    seek = Math.max(1, Math.round(sampleRate * SEEK_MS / MS_PER_SECOND));
    weights = makeHann(frameLen);
    buffers = Array.from({ length: channels }, () => new Float32Array(0));
    olaTail = Array.from({ length: channels }, () => new Float32Array(hop));
    inputStart = 0;
    // Start one seek window in so the first search never reads before 0.
    analysisPos = seek;
    naturalRef = null;
  };

  const reset = () => {
    buffers = buffers.map(() => new Float32Array(0));
    olaTail = olaTail.map(() => new Float32Array(hop));
    inputStart = 0;
    analysisPos = seek;
    naturalRef = null;
  };

  const append = (input) => {
    for (let channel = 0; channel < channelCount; channel += 1) {
      const incoming = input[channel] ?? input[0] ?? new Float32Array(0);
      const existing = buffers[channel] ?? new Float32Array(0);
      const merged = new Float32Array(existing.length + incoming.length);
      merged.set(existing, 0);
      merged.set(incoming, existing.length);
      buffers[channel] = merged;
    }
  };

  const inputEnd = () => inputStart + (buffers[0]?.length ?? 0);

  const compact = () => {
    const wanted = Math.floor(analysisPos) - seek - 1;
    // Never advance past the data actually buffered: when the read position
    // has jumped beyond the end of the input (large `speed`), the next chunk
    // still begins at `inputEnd()`, so `inputStart` must stop there to keep
    // absolute positions aligned with the stream.
    const keepFrom = Math.min(inputEnd(), Math.max(inputStart, wanted));
    if (keepFrom <= inputStart)
      return;
    const drop = keepFrom - inputStart;
    for (let channel = 0; channel < channelCount; channel += 1) {
      buffers[channel] = (buffers[channel] ?? new Float32Array(0)).slice(drop);
    }
    inputStart = keepFrom;
  };

  // Searches +/- seek samples around `center` (on channel 0 only) for the
  // offset whose window best matches the natural continuation.
  const bestOffset = (center) => {
    if (!naturalRef)
      return 0;
    const [base] = buffers;
    if (!base)
      return 0;
    let bestDelta = 0;
    let bestScore = -Infinity;
    for (let delta = -seek; delta <= seek; delta += 1) {
      const score = correlationScore(base, center + delta - inputStart, naturalRef, frameLen);
      if (score <= bestScore)
        continue;
      bestScore = score;
      bestDelta = delta;
    }
    return bestDelta;
  };

  const process = (input, speed, rate) => {
    // A zero speed would never advance `analysisPos` (endless loop below);
    // NaN or negative values would corrupt it for every later call. Reject
    // them before touching any state.
    if (!Number.isFinite(speed) || speed <= 0) {
      throw new RangeError(`createTimeStretcher: speed must be a positive finite number, received ${speed}`);
    }
    const channels = Math.max(1, input.length);
    if (sampleRate !== rate || channelCount !== channels)
      init(rate, channels);
    append(input);
    const analysisHop = hop * speed;
    const segments = Array.from({ length: channelCount }, () => []);

    const emitGrain = (pos) => {
      const off = pos - inputStart;
      for (let channel = 0; channel < channelCount; channel += 1) {
        const src = buffers[channel];
        const tail = olaTail[channel];
        if (!src || !tail)
          continue;
        const grain = overlapAddGrain(src, off, tail, weights, hop);
        olaTail[channel] = grain.nextTail;
        segments[channel]?.push(grain.out);
      }
    };

    // Remembers the frame that starts one hop after `pos`, i.e. what the
    // input itself plays next; `subarray` clamps at the buffer end, leaving
    // any missing samples at 0.
    const captureRef = (pos) => {
      const ref = new Float32Array(frameLen);
      const refOff = pos + hop - inputStart;
      const [base] = buffers;
      if (base)
        ref.set(base.subarray(refOff, refOff + frameLen));
      naturalRef = ref;
    };

    // A grain may be emitted only when the whole seek window, the grain and
    // the following reference frame lie inside the buffered input.
    const canEmit = () => Math.floor(analysisPos) - seek >= inputStart && Math.floor(analysisPos) + seek + frameLen + hop <= inputEnd();

    while (canEmit()) {
      const center = Math.round(analysisPos);
      const pos = center + bestOffset(center);
      emitGrain(pos);
      captureRef(pos);
      analysisPos += analysisHop;
    }
    compact();

    return segments.map((channelSegments) => {
      const total = channelSegments.reduce((sum, seg) => sum + seg.length, 0);
      const merged = new Float32Array(total);
      let offset = 0;
      for (const seg of channelSegments) {
        merged.set(seg, offset);
        offset += seg.length;
      }
      return merged;
    });
  };

  return { process, reset };
};
|
|
1687
|
+
|
|
1536
1688
|
// src/client/audioPlayer.ts
|
|
1537
1689
|
var DEFAULT_LOOKAHEAD_MS = 15;
|
|
1538
1690
|
var DEFAULT_VOLUME = 1;
|
|
1539
1691
|
var DEFAULT_PLAYBACK_RATE = 1;
|
|
1540
1692
|
var MIN_PLAYBACK_RATE = 0.5;
|
|
1541
1693
|
var MAX_PLAYBACK_RATE = 2;
|
|
1694
|
+
var STRETCH_BYPASS_EPSILON = 0.01;
|
|
1542
1695
|
var createInitialState3 = () => ({
|
|
1543
1696
|
activeSourceCount: 0,
|
|
1544
1697
|
error: null,
|
|
@@ -1601,6 +1754,7 @@ var createVoiceAudioPlayer = (source, options = {}) => {
|
|
|
1601
1754
|
let outputNode = null;
|
|
1602
1755
|
let volume = clampVolume(options.volume);
|
|
1603
1756
|
let playbackRate = clampPlaybackRate(options.playbackRate);
|
|
1757
|
+
let stretcher = null;
|
|
1604
1758
|
let queueEndTime = 0;
|
|
1605
1759
|
let syncPromise = Promise.resolve();
|
|
1606
1760
|
let interruptStartedAt = null;
|
|
@@ -1633,6 +1787,7 @@ var createVoiceAudioPlayer = (source, options = {}) => {
|
|
|
1633
1787
|
const resolveInterrupt = (latencyMs) => {
|
|
1634
1788
|
clearInterruptTimer();
|
|
1635
1789
|
interruptStartedAt = null;
|
|
1790
|
+
stretcher?.reset();
|
|
1636
1791
|
setState({
|
|
1637
1792
|
activeSourceCount: sourceNodes.size,
|
|
1638
1793
|
isPlaying: false,
|
|
@@ -1697,13 +1852,11 @@ var createVoiceAudioPlayer = (source, options = {}) => {
|
|
|
1697
1852
|
queueEndTime = audioContext.currentTime;
|
|
1698
1853
|
return audioContext;
|
|
1699
1854
|
};
|
|
1700
|
-
const
|
|
1701
|
-
const context = await ensureAudioContext();
|
|
1702
|
-
const buffer = decodePCM16LEChunk(context, chunk);
|
|
1855
|
+
const scheduleBuffer = (context, buffer, rate) => {
|
|
1703
1856
|
const node = context.createBufferSource();
|
|
1704
1857
|
node.buffer = buffer;
|
|
1705
1858
|
if (node.playbackRate) {
|
|
1706
|
-
node.playbackRate.value =
|
|
1859
|
+
node.playbackRate.value = rate;
|
|
1707
1860
|
}
|
|
1708
1861
|
node.connect(outputNode ?? context.destination);
|
|
1709
1862
|
node.onended = () => {
|
|
@@ -1716,7 +1869,7 @@ var createVoiceAudioPlayer = (source, options = {}) => {
|
|
|
1716
1869
|
maybeResolveInterrupt();
|
|
1717
1870
|
};
|
|
1718
1871
|
const startAt = Math.max(context.currentTime + lookaheadSeconds, queueEndTime);
|
|
1719
|
-
queueEndTime = startAt + buffer.duration /
|
|
1872
|
+
queueEndTime = startAt + buffer.duration / rate;
|
|
1720
1873
|
sourceNodes.add(node);
|
|
1721
1874
|
setState({
|
|
1722
1875
|
activeSourceCount: sourceNodes.size,
|
|
@@ -1724,6 +1877,34 @@ var createVoiceAudioPlayer = (source, options = {}) => {
|
|
|
1724
1877
|
});
|
|
1725
1878
|
node.start(startAt);
|
|
1726
1879
|
};
|
|
1880
|
+
// Decodes one PCM16LE chunk and queues it for playback. When the playback
// rate is within STRETCH_BYPASS_EPSILON of 1 the decoded buffer is scheduled
// as-is (and any stretcher state is cleared); otherwise the samples go through
// the time stretcher and the stretched result is scheduled at rate 1.
const scheduleChunk = async (chunk) => {
  const context = await ensureAudioContext();
  const buffer = decodePCM16LEChunk(context, chunk);

  const isNearUnity = Math.abs(playbackRate - 1) <= STRETCH_BYPASS_EPSILON;
  if (isNearUnity) {
    stretcher?.reset();
    scheduleBuffer(context, buffer, playbackRate);
    return;
  }

  const channels = Math.max(1, chunk.format.channels);
  const input = Array.from({ length: channels }, (_, channelIndex) => buffer.getChannelData(channelIndex));

  stretcher = stretcher ?? createTimeStretcher();
  const stretched = stretcher.process(input, playbackRate, chunk.format.sampleRateHz);

  // The stretcher buffers input internally, so a chunk may yield no output yet.
  const outLength = stretched[0]?.length ?? 0;
  if (outLength === 0) {
    return;
  }

  const outBuffer = context.createBuffer(channels, outLength, chunk.format.sampleRateHz);
  stretched.slice(0, channels).forEach((channelOut, channelIndex) => {
    if (channelOut) {
      outBuffer.getChannelData(channelIndex).set(channelOut);
    }
  });
  scheduleBuffer(context, outBuffer, 1);
};
|
|
1727
1908
|
const stopQueuedPlayback = (options2) => {
|
|
1728
1909
|
for (const node of [...sourceNodes]) {
|
|
1729
1910
|
node.stop?.();
|
package/dist/client/index.d.ts
CHANGED
|
@@ -2,6 +2,7 @@ export { bindVoiceReactiveSource, voiceSseReactiveSource, } from "./reactiveSour
|
|
|
2
2
|
export type { VoiceReactiveSource, VoiceSseReactiveSourceOptions, } from "./reactiveSource";
|
|
3
3
|
export { createVoiceConnection } from "./connection";
|
|
4
4
|
export { createVoiceAudioPlayer, decodeVoiceAudioChunk } from "./audioPlayer";
|
|
5
|
+
export { createTimeStretcher, type TimeStretcher } from "./timeStretch";
|
|
5
6
|
export { createVoiceStream } from "./createVoiceStream";
|
|
6
7
|
export { createVoiceBrowserMediaReporter } from "./browserMedia";
|
|
7
8
|
export type { VoiceBrowserMediaReporter } from "./browserMedia";
|
package/dist/client/index.js
CHANGED
|
@@ -370,12 +370,165 @@ var createVoiceConnection = (path, options = {}) => {
|
|
|
370
370
|
getSessionId: () => state.sessionId
|
|
371
371
|
};
|
|
372
372
|
};
|
|
373
|
+
// src/client/timeStretch.ts
// Streaming time-domain time stretcher. Audio is cut into Hann-windowed grains
// of two hops each; grains are written at a fixed output hop while the read
// position advances by `hop * speed`, so the tempo changes without resampling.
// Before each grain is taken, a small +/- seek window is searched for the read
// offset that correlates best with the "natural" continuation of the previous
// grain (a WSOLA-style alignment).

// Output hop length in milliseconds (one grain spans two hops).
var HOP_MS = 10;
// Half-width of the alignment search window in milliseconds.
var SEEK_MS = 5;
// Keeps the correlation normalisation finite on silent input.
var ENERGY_EPSILON = 0.000001;
var HALF = 0.5;
var MS_PER_SECOND = 1000;

/**
 * Builds a periodic Hann window (`0.5 - 0.5 * cos(2*pi*i / length)`).
 *
 * @param {number} length Window length in samples.
 * @returns {Float32Array} The window weights.
 */
var makeHann = (length) => {
  const weights = new Float32Array(length);
  for (let index = 0; index < length; index += 1) {
    weights[index] = HALF - HALF * Math.cos(2 * Math.PI * index / length);
  }
  return weights;
};

/**
 * Scores how well `base[start .. start + length)` matches `ref`: the dot
 * product of the two, divided by the root energy of the `base` window.
 * Samples outside either array count as 0.
 *
 * @param {Float32Array} base Signal being searched.
 * @param {number} start Index in `base` where the candidate window begins.
 * @param {Float32Array} ref Reference window.
 * @param {number} length Number of samples to compare.
 * @returns {number} Correlation score (higher is a better match).
 */
var correlationScore = (base, start, ref, length) => {
  let dot = 0;
  let energy = 0;
  for (let index = 0; index < length; index += 1) {
    const sample = base[start + index] ?? 0;
    dot += sample * (ref[index] ?? 0);
    energy += sample * sample;
  }
  return dot / Math.sqrt(energy + ENERGY_EPSILON);
};

/**
 * Windows one grain (two hops) read from `src` at `off` and overlap-adds it.
 *
 * @param {Float32Array} src Channel samples.
 * @param {number} off Index in `src` where the grain starts.
 * @param {Float32Array} tail Second half of the previous grain (already windowed).
 * @param {Float32Array} weights Window of length `2 * hop`.
 * @param {number} hop Hop length in samples.
 * @returns {{ nextTail: Float32Array, out: Float32Array }} `out` is the finished
 *   hop (previous tail + first windowed half); `nextTail` is the second
 *   windowed half, to be added to the next grain.
 */
var overlapAddGrain = (src, off, tail, weights, hop) => {
  const out = new Float32Array(hop);
  const nextTail = new Float32Array(hop);
  for (let index = 0; index < hop; index += 1) {
    out[index] = (tail[index] ?? 0) + (src[off + index] ?? 0) * (weights[index] ?? 0);
    nextTail[index] = (src[off + hop + index] ?? 0) * (weights[hop + index] ?? 0);
  }
  return { nextTail, out };
};

/**
 * Creates a stateful, streaming time stretcher.
 *
 * @returns {{ process: Function, reset: Function }}
 *   - `process(input, speed, rate)` appends one chunk of per-channel
 *     `Float32Array` samples and returns, per channel, every whole hop of
 *     output that can be produced from the input buffered so far (possibly an
 *     empty array). `speed` greater than 1 shortens the output, less than 1
 *     lengthens it. Changing `rate` or the channel count re-initialises all
 *     state. Throws `RangeError` when `speed` is not a positive finite number.
 *   - `reset()` discards buffered input, the overlap tail and the alignment
 *     reference while keeping the current rate/channel configuration.
 */
var createTimeStretcher = () => {
  let sampleRate = 0;
  let channelCount = 0;
  let hop = 0;
  let frameLen = 0;
  let seek = 0;
  let weights = new Float32Array(0);
  // Per-channel input that has not been discarded yet; index 0 of each buffer
  // corresponds to absolute stream position `inputStart`.
  let buffers = [];
  let inputStart = 0;
  // Absolute (possibly fractional) stream position of the next grain.
  let analysisPos = 0;
  let olaTail = [];
  // Samples that would naturally follow the previous grain; null until the
  // first grain has been emitted.
  let naturalRef = null;

  const init = (rate, channels) => {
    sampleRate = rate;
    channelCount = channels;
    hop = Math.max(1, Math.round(sampleRate * HOP_MS / MS_PER_SECOND));
    frameLen = hop * 2;
    seek = Math.max(1, Math.round(sampleRate * SEEK_MS / MS_PER_SECOND));
    weights = makeHann(frameLen);
    buffers = Array.from({ length: channels }, () => new Float32Array(0));
    olaTail = Array.from({ length: channels }, () => new Float32Array(hop));
    inputStart = 0;
    // Start one seek window in so the first search never reads before 0.
    analysisPos = seek;
    naturalRef = null;
  };

  const reset = () => {
    buffers = buffers.map(() => new Float32Array(0));
    olaTail = olaTail.map(() => new Float32Array(hop));
    inputStart = 0;
    analysisPos = seek;
    naturalRef = null;
  };

  const append = (input) => {
    for (let channel = 0; channel < channelCount; channel += 1) {
      const incoming = input[channel] ?? input[0] ?? new Float32Array(0);
      const existing = buffers[channel] ?? new Float32Array(0);
      const merged = new Float32Array(existing.length + incoming.length);
      merged.set(existing, 0);
      merged.set(incoming, existing.length);
      buffers[channel] = merged;
    }
  };

  const inputEnd = () => inputStart + (buffers[0]?.length ?? 0);

  const compact = () => {
    const wanted = Math.floor(analysisPos) - seek - 1;
    // Never advance past the data actually buffered: when the read position
    // has jumped beyond the end of the input (large `speed`), the next chunk
    // still begins at `inputEnd()`, so `inputStart` must stop there to keep
    // absolute positions aligned with the stream.
    const keepFrom = Math.min(inputEnd(), Math.max(inputStart, wanted));
    if (keepFrom <= inputStart)
      return;
    const drop = keepFrom - inputStart;
    for (let channel = 0; channel < channelCount; channel += 1) {
      buffers[channel] = (buffers[channel] ?? new Float32Array(0)).slice(drop);
    }
    inputStart = keepFrom;
  };

  // Searches +/- seek samples around `center` (on channel 0 only) for the
  // offset whose window best matches the natural continuation.
  const bestOffset = (center) => {
    if (!naturalRef)
      return 0;
    const [base] = buffers;
    if (!base)
      return 0;
    let bestDelta = 0;
    let bestScore = -Infinity;
    for (let delta = -seek; delta <= seek; delta += 1) {
      const score = correlationScore(base, center + delta - inputStart, naturalRef, frameLen);
      if (score <= bestScore)
        continue;
      bestScore = score;
      bestDelta = delta;
    }
    return bestDelta;
  };

  const process = (input, speed, rate) => {
    // A zero speed would never advance `analysisPos` (endless loop below);
    // NaN or negative values would corrupt it for every later call. Reject
    // them before touching any state.
    if (!Number.isFinite(speed) || speed <= 0) {
      throw new RangeError(`createTimeStretcher: speed must be a positive finite number, received ${speed}`);
    }
    const channels = Math.max(1, input.length);
    if (sampleRate !== rate || channelCount !== channels)
      init(rate, channels);
    append(input);
    const analysisHop = hop * speed;
    const segments = Array.from({ length: channelCount }, () => []);

    const emitGrain = (pos) => {
      const off = pos - inputStart;
      for (let channel = 0; channel < channelCount; channel += 1) {
        const src = buffers[channel];
        const tail = olaTail[channel];
        if (!src || !tail)
          continue;
        const grain = overlapAddGrain(src, off, tail, weights, hop);
        olaTail[channel] = grain.nextTail;
        segments[channel]?.push(grain.out);
      }
    };

    // Remembers the frame that starts one hop after `pos`, i.e. what the
    // input itself plays next; `subarray` clamps at the buffer end, leaving
    // any missing samples at 0.
    const captureRef = (pos) => {
      const ref = new Float32Array(frameLen);
      const refOff = pos + hop - inputStart;
      const [base] = buffers;
      if (base)
        ref.set(base.subarray(refOff, refOff + frameLen));
      naturalRef = ref;
    };

    // A grain may be emitted only when the whole seek window, the grain and
    // the following reference frame lie inside the buffered input.
    const canEmit = () => Math.floor(analysisPos) - seek >= inputStart && Math.floor(analysisPos) + seek + frameLen + hop <= inputEnd();

    while (canEmit()) {
      const center = Math.round(analysisPos);
      const pos = center + bestOffset(center);
      emitGrain(pos);
      captureRef(pos);
      analysisPos += analysisHop;
    }
    compact();

    return segments.map((channelSegments) => {
      const total = channelSegments.reduce((sum, seg) => sum + seg.length, 0);
      const merged = new Float32Array(total);
      let offset = 0;
      for (const seg of channelSegments) {
        merged.set(seg, offset);
        offset += seg.length;
      }
      return merged;
    });
  };

  return { process, reset };
};
|
|
524
|
+
|
|
373
525
|
// src/client/audioPlayer.ts
|
|
374
526
|
var DEFAULT_LOOKAHEAD_MS = 15;
|
|
375
527
|
var DEFAULT_VOLUME = 1;
|
|
376
528
|
var DEFAULT_PLAYBACK_RATE = 1;
|
|
377
529
|
var MIN_PLAYBACK_RATE = 0.5;
|
|
378
530
|
var MAX_PLAYBACK_RATE = 2;
|
|
531
|
+
var STRETCH_BYPASS_EPSILON = 0.01;
|
|
379
532
|
var createInitialState = () => ({
|
|
380
533
|
activeSourceCount: 0,
|
|
381
534
|
error: null,
|
|
@@ -438,6 +591,7 @@ var createVoiceAudioPlayer = (source, options = {}) => {
|
|
|
438
591
|
let outputNode = null;
|
|
439
592
|
let volume = clampVolume(options.volume);
|
|
440
593
|
let playbackRate = clampPlaybackRate(options.playbackRate);
|
|
594
|
+
let stretcher = null;
|
|
441
595
|
let queueEndTime = 0;
|
|
442
596
|
let syncPromise = Promise.resolve();
|
|
443
597
|
let interruptStartedAt = null;
|
|
@@ -470,6 +624,7 @@ var createVoiceAudioPlayer = (source, options = {}) => {
|
|
|
470
624
|
const resolveInterrupt = (latencyMs) => {
|
|
471
625
|
clearInterruptTimer();
|
|
472
626
|
interruptStartedAt = null;
|
|
627
|
+
stretcher?.reset();
|
|
473
628
|
setState({
|
|
474
629
|
activeSourceCount: sourceNodes.size,
|
|
475
630
|
isPlaying: false,
|
|
@@ -534,13 +689,11 @@ var createVoiceAudioPlayer = (source, options = {}) => {
|
|
|
534
689
|
queueEndTime = audioContext.currentTime;
|
|
535
690
|
return audioContext;
|
|
536
691
|
};
|
|
537
|
-
const
|
|
538
|
-
const context = await ensureAudioContext();
|
|
539
|
-
const buffer = decodePCM16LEChunk(context, chunk);
|
|
692
|
+
const scheduleBuffer = (context, buffer, rate) => {
|
|
540
693
|
const node = context.createBufferSource();
|
|
541
694
|
node.buffer = buffer;
|
|
542
695
|
if (node.playbackRate) {
|
|
543
|
-
node.playbackRate.value =
|
|
696
|
+
node.playbackRate.value = rate;
|
|
544
697
|
}
|
|
545
698
|
node.connect(outputNode ?? context.destination);
|
|
546
699
|
node.onended = () => {
|
|
@@ -553,7 +706,7 @@ var createVoiceAudioPlayer = (source, options = {}) => {
|
|
|
553
706
|
maybeResolveInterrupt();
|
|
554
707
|
};
|
|
555
708
|
const startAt = Math.max(context.currentTime + lookaheadSeconds, queueEndTime);
|
|
556
|
-
queueEndTime = startAt + buffer.duration /
|
|
709
|
+
queueEndTime = startAt + buffer.duration / rate;
|
|
557
710
|
sourceNodes.add(node);
|
|
558
711
|
setState({
|
|
559
712
|
activeSourceCount: sourceNodes.size,
|
|
@@ -561,6 +714,34 @@ var createVoiceAudioPlayer = (source, options = {}) => {
|
|
|
561
714
|
});
|
|
562
715
|
node.start(startAt);
|
|
563
716
|
};
|
|
717
|
+
// Decodes one PCM16LE chunk and queues it for playback. When the playback
// rate is within STRETCH_BYPASS_EPSILON of 1 the decoded buffer is scheduled
// as-is (and any stretcher state is cleared); otherwise the samples go through
// the time stretcher and the stretched result is scheduled at rate 1.
const scheduleChunk = async (chunk) => {
  const context = await ensureAudioContext();
  const buffer = decodePCM16LEChunk(context, chunk);

  const isNearUnity = Math.abs(playbackRate - 1) <= STRETCH_BYPASS_EPSILON;
  if (isNearUnity) {
    stretcher?.reset();
    scheduleBuffer(context, buffer, playbackRate);
    return;
  }

  const channels = Math.max(1, chunk.format.channels);
  const input = Array.from({ length: channels }, (_, channelIndex) => buffer.getChannelData(channelIndex));

  stretcher = stretcher ?? createTimeStretcher();
  const stretched = stretcher.process(input, playbackRate, chunk.format.sampleRateHz);

  // The stretcher buffers input internally, so a chunk may yield no output yet.
  const outLength = stretched[0]?.length ?? 0;
  if (outLength === 0) {
    return;
  }

  const outBuffer = context.createBuffer(channels, outLength, chunk.format.sampleRateHz);
  stretched.slice(0, channels).forEach((channelOut, channelIndex) => {
    if (channelOut) {
      outBuffer.getChannelData(channelIndex).set(channelOut);
    }
  });
  scheduleBuffer(context, outBuffer, 1);
};
|
|
564
745
|
const stopQueuedPlayback = (options2) => {
|
|
565
746
|
for (const node of [...sourceNodes]) {
|
|
566
747
|
node.stop?.();
|
|
@@ -12303,6 +12484,7 @@ export {
|
|
|
12303
12484
|
createVoiceAudioPlayer,
|
|
12304
12485
|
createVoiceAgentSquadStatusViewModel,
|
|
12305
12486
|
createVoiceAgentSquadStatusStore,
|
|
12487
|
+
createTimeStretcher,
|
|
12306
12488
|
createMicrophoneCapture,
|
|
12307
12489
|
buildVoiceAgentSquadStatusReport,
|
|
12308
12490
|
bindVoiceReactiveSource,
|
|
@@ -10,6 +10,12 @@ export type VoiceBackchannelDriverOptions = {
|
|
|
10
10
|
minSpeechMs?: number;
|
|
11
11
|
onCue: (cue: VoiceBackchannelCue) => Promise<void> | void;
|
|
12
12
|
};
|
|
13
|
+
/**
 * Session-level configuration for backchannel cues (short spoken
 * acknowledgements played while the caller is talking).
 */
export type VoiceBackchannelConfig = {
    /** Backchannel cues are only wired up when this is truthy. */
    enabled?: boolean;
    /** Cue phrases to rotate through; the built-in phrases are used when omitted. */
    cues?: readonly string[];
    /** Minimum speech duration, in milliseconds, before the first cue may fire. */
    minSpeechMs?: number;
    /** Minimum gap, in milliseconds, between two consecutive cues. */
    cueIntervalMs?: number;
};
|
|
13
19
|
export type VoiceBackchannelDriver = {
|
|
14
20
|
noteSpeech: (timestampMs?: number) => void;
|
|
15
21
|
noteSilence: (timestampMs?: number) => void;
|
package/dist/core/types.d.ts
CHANGED
|
@@ -783,6 +783,7 @@ export type VoicePluginConfig<TContext = unknown, TSession extends VoiceSessionR
|
|
|
783
783
|
userText: string;
|
|
784
784
|
}) => Promise<string | null>;
|
|
785
785
|
fillerForTimeoutMs?: number;
|
|
786
|
+
backchannel?: import("./backchannel").VoiceBackchannelConfig;
|
|
786
787
|
defaultSilentTurnAck?: string;
|
|
787
788
|
routeOnTurnTimeoutMs?: number;
|
|
788
789
|
audioConditioning?: VoiceAudioConditioningConfig;
|
|
@@ -968,6 +969,14 @@ export type CreateVoiceSessionOptions<TContext = unknown, TSession extends Voice
|
|
|
968
969
|
}) => Promise<string | null>;
|
|
969
970
|
/** Ceiling for the `fillerFor` call before we fall back to a static phrase. Default 600ms. */
|
|
970
971
|
fillerForTimeoutMs?: number;
|
|
972
|
+
/**
|
|
973
|
+
* Backchannel cues — short "mm-hm"/"right" acknowledgements played while the
|
|
974
|
+
* CALLER is mid-turn (a long answer) so they feel heard, the way a human
|
|
975
|
+
* listener interjects. Plays on the same non-turn TTS path as fillers, so it
|
|
976
|
+
* never registers as the assistant's turn or trips barge-in. Off unless
|
|
977
|
+
* `enabled` is set. Fires only while the assistant is silent.
|
|
978
|
+
*/
|
|
979
|
+
backchannel?: import("./backchannel").VoiceBackchannelConfig;
|
|
971
980
|
/**
|
|
972
981
|
* Default spoken ack if the model returns ONLY tool calls (no text) and the
|
|
973
982
|
* turn isn't ending. Without this, the caller hears total silence after
|
package/dist/index.d.ts
CHANGED
|
@@ -98,7 +98,7 @@ export type { VoiceCampaignDisposition, VoiceCampaignDispositionRetryPolicy, Voi
|
|
|
98
98
|
export { createVoiceBackchannelDriver } from "./core/backchannel";
|
|
99
99
|
export { createVoiceOAuth2TokenSource } from "./core/oauth2TokenSource";
|
|
100
100
|
export type { CreateVoiceOAuth2TokenSourceOptions, VoiceOAuth2TokenResponse, VoiceOAuth2TokenSource, } from "./core/oauth2TokenSource";
|
|
101
|
-
export type { VoiceBackchannelCue, VoiceBackchannelDriver, VoiceBackchannelDriverOptions, } from "./core/backchannel";
|
|
101
|
+
export type { VoiceBackchannelConfig, VoiceBackchannelCue, VoiceBackchannelDriver, VoiceBackchannelDriverOptions, } from "./core/backchannel";
|
|
102
102
|
export { createVoiceIVRSession, describeVoiceIVRPlan, evaluateVoiceIVRPlan, } from "./core/ivrPlan";
|
|
103
103
|
export type { VoiceIVRBranch, VoiceIVRDecision, VoiceIVRInput, VoiceIVRMatch, VoiceIVRPlan, VoiceIVRSession, } from "./core/ivrPlan";
|
|
104
104
|
export { VOICE_CALLER_MEMORY_KEY, buildVoiceCallerMemoryNamespace, createVoiceCallerMemoryNamespace, summarizeVoiceCallerTranscript, } from "./core/callerMemory";
|
package/dist/index.js
CHANGED
|
@@ -3091,6 +3091,71 @@ var toVoiceSessionSummary = (session) => ({
|
|
|
3091
3091
|
// src/core/session.ts
|
|
3092
3092
|
import { Buffer as Buffer2 } from "buffer";
|
|
3093
3093
|
|
|
3094
|
+
// src/core/backchannel.ts
// Cues used when the caller does not supply their own list.
var DEFAULT_CUES = [
  { text: "mm-hmm" },
  { text: "I see" },
  { text: "right" },
  { text: "go on" }
];

/**
 * Creates a driver that decides when to play a backchannel cue while speech
 * is being reported.
 *
 * @param {object} options
 * @param {Function} options.onCue Called with the selected cue; may return a promise.
 * @param {Array<{ text: string }>} [options.cues] Cue list (defaults to DEFAULT_CUES).
 * @param {number} [options.minSpeechMs] Speech time required before the first
 *   cue (default 2500).
 * @param {number} [options.cueIntervalMs] Minimum gap between cues (default 2500).
 * @param {Function} [options.cueIndex] Maps the number of cues fired so far to
 *   an index into `cues` (defaults to round-robin).
 * @returns {{ noteSilence: Function, noteSpeech: Function, reset: Function }}
 *   `noteSpeech`/`noteSilence` accept an optional timestamp in milliseconds
 *   (defaulting to `Date.now()`); `reset` clears all timing state.
 */
var createVoiceBackchannelDriver = (options) => {
  const cues = options.cues ?? DEFAULT_CUES;
  const minSpeechMs = options.minSpeechMs ?? 2500;
  const cueIntervalMs = options.cueIntervalMs ?? 2500;
  const cueIndexFn = options.cueIndex ?? ((index) => index % Math.max(cues.length, 1));
  let speechStartedAt;
  let lastCueAt;
  let cueCount = 0;
  // True while an `onCue` call is still pending, so cues never overlap.
  let firing = false;

  const tryFire = async (now) => {
    if (firing || cues.length === 0) {
      return;
    }
    if (speechStartedAt === undefined) {
      return;
    }
    const elapsed = now - speechStartedAt;
    if (elapsed < minSpeechMs) {
      return;
    }
    if (lastCueAt !== undefined && now - lastCueAt < cueIntervalMs) {
      return;
    }
    const cue = cues[cueIndexFn(cueCount)];
    if (!cue) {
      return;
    }
    firing = true;
    try {
      await options.onCue(cue);
    } finally {
      // Count the attempt even when `onCue` fails so a broken handler is not
      // retried on every speech notification.
      firing = false;
      lastCueAt = now;
      cueCount += 1;
    }
  };

  return {
    noteSilence: (timestampMs) => {
      const now = timestampMs ?? Date.now();
      // Speech tracking restarts only once the last cue is more than two
      // intervals old.
      if (lastCueAt !== undefined && now - lastCueAt > cueIntervalMs * 2) {
        speechStartedAt = undefined;
      }
    },
    noteSpeech: (timestampMs) => {
      const now = timestampMs ?? Date.now();
      if (speechStartedAt === undefined) {
        speechStartedAt = now;
      }
      // Fire-and-forget: this notifier returns nothing, so a rejecting or
      // throwing `onCue`/`cueIndex` would otherwise surface as an unhandled
      // promise rejection. Cues are best-effort, so the failure is dropped.
      tryFire(now).catch(() => {});
    },
    reset: () => {
      speechStartedAt = undefined;
      lastCueAt = undefined;
      cueCount = 0;
    }
  };
};
|
|
3158
|
+
|
|
3094
3159
|
// src/core/handoff.ts
|
|
3095
3160
|
var toHex3 = (bytes) => Array.from(bytes, (byte) => byte.toString(16).padStart(2, "0")).join("");
|
|
3096
3161
|
var signHandoffBody = async (input) => {
|
|
@@ -5217,6 +5282,30 @@ var createVoiceSession = (options) => {
|
|
|
5217
5282
|
});
|
|
5218
5283
|
});
|
|
5219
5284
|
};
|
|
5285
|
+
// Queues a backchannel cue on the serial task queue. Nothing is sent when the
// text is empty, no TTS adapter is configured, or an assistant TTS turn or a
// filler is active; the busy check is repeated inside the task because state
// can change before the task runs. Send failures are ignored.
const emitBackchannelCue = (text) => {
  const assistantIsBusy = () => activeTTSTurnId !== undefined || fillerActive;
  if (!text || !options.tts) {
    return;
  }
  if (assistantIsBusy()) {
    return;
  }
  runSerial("backchannel.send", async () => {
    if (assistantIsBusy()) {
      return;
    }
    const adapterSession = await ensureTTSSession();
    if (!adapterSession) {
      return;
    }
    try {
      await adapterSession.send(text);
    } catch {}
  });
};
|
|
5301
|
+
// Backchannel driver for this session, or null unless `backchannel.enabled`
// is set and a TTS adapter is configured. Only options that were actually
// provided are forwarded, so the driver's own defaults apply otherwise.
const backchannelDriver = (() => {
  const config = options.backchannel;
  if (!config?.enabled || !options.tts) {
    return null;
  }
  const driverOptions = {};
  if (config.cueIntervalMs !== undefined) {
    driverOptions.cueIntervalMs = config.cueIntervalMs;
  }
  if (config.cues) {
    // Keep only non-blank strings and wrap each one as a cue object.
    driverOptions.cues = config.cues
      .filter((cue) => typeof cue === "string" && cue.trim().length > 0)
      .map((cue) => ({ text: cue }));
  }
  if (config.minSpeechMs !== undefined) {
    driverOptions.minSpeechMs = config.minSpeechMs;
  }
  driverOptions.onCue = (cue) => emitBackchannelCue(cue.text);
  return createVoiceBackchannelDriver(driverOptions);
})();
|
|
5220
5309
|
const createTurnTTSStreamer = (turn, session) => {
|
|
5221
5310
|
let buffer = "";
|
|
5222
5311
|
let full = "";
|
|
@@ -5708,6 +5797,7 @@ var createVoiceSession = (options) => {
|
|
|
5708
5797
|
};
|
|
5709
5798
|
const commitTurnInternal = async (reason = "manual") => {
|
|
5710
5799
|
clearSilenceTimer();
|
|
5800
|
+
backchannelDriver?.reset();
|
|
5711
5801
|
amdLastTurnCommitAt = Date.now();
|
|
5712
5802
|
const session = await readSession();
|
|
5713
5803
|
if (session.status === "completed" || session.status === "failed") {
|
|
@@ -6051,7 +6141,9 @@ var createVoiceSession = (options) => {
|
|
|
6051
6141
|
speechDetected = true;
|
|
6052
6142
|
clearSilenceTimer();
|
|
6053
6143
|
kickCallSilenceWatchdog();
|
|
6144
|
+
backchannelDriver?.noteSpeech();
|
|
6054
6145
|
} else if (speechDetected) {
|
|
6146
|
+
backchannelDriver?.noteSilence();
|
|
6055
6147
|
const currentSession = await readSession();
|
|
6056
6148
|
const hasTurnText = Boolean(buildTurnText(currentSession.currentTurn.transcripts, currentSession.currentTurn.partialText, {
|
|
6057
6149
|
partialEndedAtMs: currentSession.currentTurn.partialEndedAt,
|
|
@@ -24811,6 +24903,7 @@ var createTwilioMediaStreamBridge = (socket, options) => {
|
|
|
24811
24903
|
...options.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: options.bargeInMinPartialWords } : {},
|
|
24812
24904
|
...options.fillerFor ? { fillerFor: options.fillerFor } : {},
|
|
24813
24905
|
...options.fillerForTimeoutMs !== undefined ? { fillerForTimeoutMs: options.fillerForTimeoutMs } : {},
|
|
24906
|
+
...options.backchannel ? { backchannel: options.backchannel } : {},
|
|
24814
24907
|
...options.defaultSilentTurnAck !== undefined ? { defaultSilentTurnAck: options.defaultSilentTurnAck } : {},
|
|
24815
24908
|
...options.routeOnTurnTimeoutMs !== undefined ? { routeOnTurnTimeoutMs: options.routeOnTurnTimeoutMs } : {},
|
|
24816
24909
|
trace: options.trace,
|
|
@@ -39177,6 +39270,7 @@ var voice = (config) => {
|
|
|
39177
39270
|
...config.fillerDelayMs !== undefined ? { fillerDelayMs: config.fillerDelayMs } : {},
|
|
39178
39271
|
...config.fillerFor ? { fillerFor: config.fillerFor } : {},
|
|
39179
39272
|
...config.fillerForTimeoutMs !== undefined ? { fillerForTimeoutMs: config.fillerForTimeoutMs } : {},
|
|
39273
|
+
...config.backchannel ? { backchannel: config.backchannel } : {},
|
|
39180
39274
|
...config.defaultSilentTurnAck !== undefined ? { defaultSilentTurnAck: config.defaultSilentTurnAck } : {},
|
|
39181
39275
|
...config.routeOnTurnTimeoutMs !== undefined ? { routeOnTurnTimeoutMs: config.routeOnTurnTimeoutMs } : {},
|
|
39182
39276
|
tts: config.tts,
|
|
@@ -41569,70 +41663,6 @@ var summarizeVoiceCampaignDispositions = (record) => {
|
|
|
41569
41663
|
totalRecipients: record.recipients.length
|
|
41570
41664
|
};
|
|
41571
41665
|
};
|
|
41572
|
-
// src/core/backchannel.ts
|
|
41573
|
-
// Acknowledgement phrases used when the caller does not supply `cues`.
var DEFAULT_CUES = [
  { text: "mm-hmm" },
  { text: "I see" },
  { text: "right" },
  { text: "go on" }
];
/**
 * Creates a driver that decides when a short backchannel cue should be emitted
 * while speech is being reported.
 *
 * @param {object} options
 * @param {(cue: {text: string}) => unknown} options.onCue Called with the cue to emit; may be async.
 * @param {Array<{text: string}>} [options.cues] Cue pool; defaults to DEFAULT_CUES.
 * @param {number} [options.minSpeechMs] Speech duration required before the first cue (default 2500).
 * @param {number} [options.cueIntervalMs] Minimum gap between cues (default 2500).
 * @param {(count: number) => number} [options.cueIndex] Maps the number of cues fired so far to an
 *   index into `cues`; defaults to round-robin.
 * @param {(error: unknown) => void} [options.onError] Optional hook that receives any error raised
 *   by `onCue` or `cueIndex`. Without it such errors are dropped, because cues are best-effort.
 * @returns {{noteSilence: (timestampMs?: number) => void, noteSpeech: (timestampMs?: number) => void, reset: () => void}}
 */
var createVoiceBackchannelDriver = (options) => {
  const cues = options.cues ?? DEFAULT_CUES;
  const minSpeechMs = options.minSpeechMs ?? 2500;
  const cueIntervalMs = options.cueIntervalMs ?? 2500;
  const cueIndexFn = options.cueIndex ?? ((index) => index % Math.max(cues.length, 1));
  let speechStartedAt;
  let lastCueAt;
  let cueCount = 0;
  // True while an `onCue` call is still pending, so cues never overlap.
  let firing = false;
  const tryFire = async (now) => {
    if (firing || cues.length === 0) {
      return;
    }
    if (speechStartedAt === undefined) {
      return;
    }
    const elapsed = now - speechStartedAt;
    if (elapsed < minSpeechMs) {
      return;
    }
    if (lastCueAt !== undefined && now - lastCueAt < cueIntervalMs) {
      return;
    }
    const cue = cues[cueIndexFn(cueCount)];
    if (!cue) {
      return;
    }
    firing = true;
    try {
      await options.onCue(cue);
    } finally {
      // A failed cue still counts, so a broken sink is not retried on every frame.
      firing = false;
      lastCueAt = now;
      cueCount += 1;
    }
  };
  const reportFailure = (error) => {
    try {
      options.onError?.(error);
    } catch {
      // The error hook itself must never turn into an unhandled rejection.
    }
  };
  return {
    noteSilence: (timestampMs) => {
      const now = timestampMs ?? Date.now();
      // Only after a cue has fired and a long quiet gap follows is the speech run restarted.
      if (lastCueAt !== undefined && now - lastCueAt > cueIntervalMs * 2) {
        speechStartedAt = undefined;
      }
    },
    noteSpeech: (timestampMs) => {
      const now = timestampMs ?? Date.now();
      if (speechStartedAt === undefined) {
        speechStartedAt = now;
      }
      // Fire-and-forget: this method is synchronous, so the rejection has to be
      // handled here or a failing `onCue` surfaces as an unhandled rejection.
      tryFire(now).catch(reportFailure);
    },
    reset: () => {
      speechStartedAt = undefined;
      lastCueAt = undefined;
      cueCount = 0;
    }
  };
};
|
|
41636
41666
|
// src/core/oauth2TokenSource.ts
|
|
41637
41667
|
var createVoiceOAuth2TokenSource = (options) => {
|
|
41638
41668
|
const fetchImpl = options.fetch ?? globalThis.fetch.bind(globalThis);
|
|
@@ -164,6 +164,9 @@ export type TwilioMediaStreamBridgeOptions<TContext = unknown, TSession extends
|
|
|
164
164
|
}) => Promise<string | null>;
|
|
165
165
|
/** Cap on the `fillerFor` race before falling back to a static phrase. Default 600ms. */
|
|
166
166
|
fillerForTimeoutMs?: number;
|
|
167
|
+
/** Backchannel cues played while the caller is mid-turn so they feel heard.
|
|
168
|
+
* Non-turn TTS path (no barge-in interaction). Off unless `enabled`. */
|
|
169
|
+
backchannel?: import("../core/backchannel").VoiceBackchannelConfig;
|
|
167
170
|
/**
|
|
168
171
|
* Default spoken ack if the model returns ONLY tool calls (no text) and
|
|
169
172
|
* the turn isn't ending. Without this, the caller hears silence and
|
package/dist/testing/index.js
CHANGED
|
@@ -1577,12 +1577,165 @@ var buildSessionCorrectionAudit = (raw, generic, experimental, benchmarkSeeded,
|
|
|
1577
1577
|
}
|
|
1578
1578
|
};
|
|
1579
1579
|
};
|
|
1580
|
+
// src/client/timeStretch.ts
|
|
1581
|
+
// Synthesis hop in milliseconds: every emitted grain adds this much output audio.
var HOP_MS = 10;
// Half-width, in milliseconds, of the window searched around the nominal read position.
var SEEK_MS = 5;
// Added to the energy term so the correlation denominator is never zero on silent input.
var ENERGY_EPSILON = 0.000001;
var HALF = 0.5;
var MS_PER_SECOND = 1000;
/**
 * Builds a Hann window of `length` weights.
 * The cosine is divided by `length`, so for an even length the weights at
 * `i` and `i + length / 2` sum to one, which is what the overlap-add relies on.
 *
 * @param {number} length Number of weights to generate.
 * @returns {Float32Array} The window weights.
 */
var makeHann = (length) => {
  const weights = new Float32Array(length);
  for (let index = 0; index < length; index += 1) {
    weights[index] = HALF - HALF * Math.cos(2 * Math.PI * index / length);
  }
  return weights;
};
/**
 * Scores how well `length` samples of `base`, starting at `start`, line up with `ref`.
 * The dot product is divided by the square root of the candidate's energy;
 * samples outside either array count as zero.
 *
 * @param {Float32Array} base Signal being searched.
 * @param {number} start Index of the first candidate sample in `base`.
 * @param {Float32Array} ref Reference frame to match.
 * @param {number} length Number of samples to compare.
 * @returns {number} Similarity score; larger is a better match.
 */
var correlationScore = (base, start, ref, length) => {
  let dot = 0;
  let energy = 0;
  for (let index = 0; index < length; index += 1) {
    const sample = base[start + index] ?? 0;
    dot += sample * (ref[index] ?? 0);
    energy += sample * sample;
  }
  return dot / Math.sqrt(energy + ENERGY_EPSILON);
};
/**
 * Produces one hop of output by overlap-adding a windowed grain onto the previous tail.
 *
 * @param {Float32Array} src Source samples for one channel.
 * @param {number} off Index in `src` where the grain starts.
 * @param {Float32Array} tail Windowed second half of the previous grain.
 * @param {Float32Array} weights Window of at least `2 * hop` weights.
 * @param {number} hop Number of samples to emit.
 * @returns {{nextTail: Float32Array, out: Float32Array}} The emitted hop and the tail to carry forward.
 */
var overlapAddGrain = (src, off, tail, weights, hop) => {
  const out = new Float32Array(hop);
  const nextTail = new Float32Array(hop);
  for (let index = 0; index < hop; index += 1) {
    out[index] = (tail[index] ?? 0) + (src[off + index] ?? 0) * (weights[index] ?? 0);
    nextTail[index] = (src[off + hop + index] ?? 0) * (weights[hop + index] ?? 0);
  }
  return { nextTail, out };
};
/**
 * Creates a streaming time stretcher that changes playback speed by re-spacing
 * overlapping grains instead of resampling.
 *
 * `process(input, speed, rate)` takes one Float32Array per channel, appends it to
 * an internal buffer and returns one Float32Array per channel holding whatever
 * output could be produced so far (possibly empty). Grains are read every
 * `hop * speed` input samples and written every `hop` output samples, so a
 * `speed` above 1 yields less output than input. State is rebuilt whenever the
 * sample rate or channel count changes. `reset()` discards buffered audio.
 *
 * @returns {{process: (input: Float32Array[], speed: number, rate: number) => Float32Array[], reset: () => void}}
 */
var createTimeStretcher = () => {
  let sampleRate = 0;
  let channelCount = 0;
  let hop = 0;
  let frameLen = 0;
  let seek = 0;
  let weights = new Float32Array(0);
  let buffers = [];
  // Absolute sample index of buffers[*][0]; grows as consumed input is dropped.
  let inputStart = 0;
  // Absolute, possibly fractional, position of the next grain's nominal start.
  let analysisPos = 0;
  let olaTail = [];
  // Samples that naturally followed the last emitted grain; the next grain is aligned to them.
  let naturalRef = null;
  const init = (rate, channels) => {
    sampleRate = rate;
    channelCount = channels;
    hop = Math.max(1, Math.round(sampleRate * HOP_MS / MS_PER_SECOND));
    frameLen = hop * 2;
    seek = Math.max(1, Math.round(sampleRate * SEEK_MS / MS_PER_SECOND));
    weights = makeHann(frameLen);
    buffers = Array.from({ length: channels }, () => new Float32Array(0));
    olaTail = Array.from({ length: channels }, () => new Float32Array(hop));
    inputStart = 0;
    analysisPos = seek;
    naturalRef = null;
  };
  const reset = () => {
    buffers = buffers.map(() => new Float32Array(0));
    olaTail = olaTail.map(() => new Float32Array(hop));
    inputStart = 0;
    analysisPos = seek;
    naturalRef = null;
  };
  const append = (input) => {
    for (let channel = 0; channel < channelCount; channel += 1) {
      // A missing channel falls back to channel 0 so every buffer stays the same length.
      const incoming = input[channel] ?? input[0] ?? new Float32Array(0);
      const existing = buffers[channel] ?? new Float32Array(0);
      const merged = new Float32Array(existing.length + incoming.length);
      merged.set(existing, 0);
      merged.set(incoming, existing.length);
      buffers[channel] = merged;
    }
  };
  const inputEnd = () => inputStart + (buffers[0]?.length ?? 0);
  const compact = () => {
    // Keep just enough history for the next grain's backward search.
    const keepFrom = Math.max(inputStart, Math.floor(analysisPos) - seek - 1);
    if (keepFrom <= inputStart)
      return;
    const drop = keepFrom - inputStart;
    for (let channel = 0; channel < channelCount; channel += 1) {
      buffers[channel] = (buffers[channel] ?? new Float32Array(0)).slice(drop);
    }
    inputStart = keepFrom;
  };
  const bestOffset = (center) => {
    if (!naturalRef)
      return 0;
    // The search runs on channel 0 only; the chosen offset is applied to every channel.
    const [base] = buffers;
    if (!base)
      return 0;
    let bestDelta = 0;
    let bestScore = -Infinity;
    for (let delta = -seek; delta <= seek; delta += 1) {
      const score = correlationScore(base, center + delta - inputStart, naturalRef, frameLen);
      if (score <= bestScore)
        continue;
      bestScore = score;
      bestDelta = delta;
    }
    return bestDelta;
  };
  const process2 = (input, speed, rate) => {
    // A zero speed would never advance the read position and the emit loop below
    // would spin forever; NaN or negative values would leave the position unusable.
    // Reject them before any state is touched.
    if (!Number.isFinite(speed) || speed <= 0) {
      throw new RangeError(`timeStretch: speed must be a positive finite number, received ${speed}`);
    }
    const channels = Math.max(1, input.length);
    if (sampleRate !== rate || channelCount !== channels)
      init(rate, channels);
    append(input);
    const analysisHop = hop * speed;
    const segments = Array.from({ length: channelCount }, () => []);
    const emitGrain = (pos) => {
      const off = pos - inputStart;
      for (let channel = 0; channel < channelCount; channel += 1) {
        const src = buffers[channel];
        const tail = olaTail[channel];
        if (!src || !tail)
          continue;
        const grain = overlapAddGrain(src, off, tail, weights, hop);
        olaTail[channel] = grain.nextTail;
        segments[channel]?.push(grain.out);
      }
    };
    const captureRef = (pos) => {
      const ref = new Float32Array(frameLen);
      const refOff = pos + hop - inputStart;
      const [base] = buffers;
      if (base)
        ref.set(base.subarray(refOff, refOff + frameLen));
      naturalRef = ref;
    };
    // A grain is emitted only when the whole search window plus the reference frame is buffered.
    const canEmit = () => Math.floor(analysisPos) - seek >= inputStart && Math.floor(analysisPos) + seek + frameLen + hop <= inputEnd();
    while (canEmit()) {
      const center = Math.round(analysisPos);
      const pos = center + bestOffset(center);
      emitGrain(pos);
      captureRef(pos);
      analysisPos += analysisHop;
    }
    compact();
    return segments.map((channelSegments) => {
      const total = channelSegments.reduce((sum, seg) => sum + seg.length, 0);
      const merged = new Float32Array(total);
      let offset = 0;
      for (const seg of channelSegments) {
        merged.set(seg, offset);
        offset += seg.length;
      }
      return merged;
    });
  };
  return { process: process2, reset };
};
|
|
1731
|
+
|
|
1580
1732
|
// src/client/audioPlayer.ts
|
|
1581
1733
|
var DEFAULT_LOOKAHEAD_MS = 15;
|
|
1582
1734
|
var DEFAULT_VOLUME = 1;
|
|
1583
1735
|
var DEFAULT_PLAYBACK_RATE = 1;
|
|
1584
1736
|
var MIN_PLAYBACK_RATE = 0.5;
|
|
1585
1737
|
var MAX_PLAYBACK_RATE = 2;
|
|
1738
|
+
var STRETCH_BYPASS_EPSILON = 0.01;
|
|
1586
1739
|
var createInitialState = () => ({
|
|
1587
1740
|
activeSourceCount: 0,
|
|
1588
1741
|
error: null,
|
|
@@ -1645,6 +1798,7 @@ var createVoiceAudioPlayer = (source, options = {}) => {
|
|
|
1645
1798
|
let outputNode = null;
|
|
1646
1799
|
let volume = clampVolume(options.volume);
|
|
1647
1800
|
let playbackRate = clampPlaybackRate(options.playbackRate);
|
|
1801
|
+
let stretcher = null;
|
|
1648
1802
|
let queueEndTime = 0;
|
|
1649
1803
|
let syncPromise = Promise.resolve();
|
|
1650
1804
|
let interruptStartedAt = null;
|
|
@@ -1677,6 +1831,7 @@ var createVoiceAudioPlayer = (source, options = {}) => {
|
|
|
1677
1831
|
const resolveInterrupt = (latencyMs) => {
|
|
1678
1832
|
clearInterruptTimer();
|
|
1679
1833
|
interruptStartedAt = null;
|
|
1834
|
+
stretcher?.reset();
|
|
1680
1835
|
setState({
|
|
1681
1836
|
activeSourceCount: sourceNodes.size,
|
|
1682
1837
|
isPlaying: false,
|
|
@@ -1741,13 +1896,11 @@ var createVoiceAudioPlayer = (source, options = {}) => {
|
|
|
1741
1896
|
queueEndTime = audioContext.currentTime;
|
|
1742
1897
|
return audioContext;
|
|
1743
1898
|
};
|
|
1744
|
-
const
|
|
1745
|
-
const context = await ensureAudioContext();
|
|
1746
|
-
const buffer = decodePCM16LEChunk(context, chunk);
|
|
1899
|
+
const scheduleBuffer = (context, buffer, rate) => {
|
|
1747
1900
|
const node = context.createBufferSource();
|
|
1748
1901
|
node.buffer = buffer;
|
|
1749
1902
|
if (node.playbackRate) {
|
|
1750
|
-
node.playbackRate.value =
|
|
1903
|
+
node.playbackRate.value = rate;
|
|
1751
1904
|
}
|
|
1752
1905
|
node.connect(outputNode ?? context.destination);
|
|
1753
1906
|
node.onended = () => {
|
|
@@ -1760,7 +1913,7 @@ var createVoiceAudioPlayer = (source, options = {}) => {
|
|
|
1760
1913
|
maybeResolveInterrupt();
|
|
1761
1914
|
};
|
|
1762
1915
|
const startAt = Math.max(context.currentTime + lookaheadSeconds, queueEndTime);
|
|
1763
|
-
queueEndTime = startAt + buffer.duration /
|
|
1916
|
+
queueEndTime = startAt + buffer.duration / rate;
|
|
1764
1917
|
sourceNodes.add(node);
|
|
1765
1918
|
setState({
|
|
1766
1919
|
activeSourceCount: sourceNodes.size,
|
|
@@ -1768,6 +1921,34 @@ var createVoiceAudioPlayer = (source, options = {}) => {
|
|
|
1768
1921
|
});
|
|
1769
1922
|
node.start(startAt);
|
|
1770
1923
|
};
|
|
1924
|
+
// Decodes one PCM16LE chunk and queues it for playback. When the playback rate
// is within STRETCH_BYPASS_EPSILON of 1 the decoded buffer is scheduled as-is;
// otherwise it goes through the time stretcher and the result is scheduled at rate 1.
const scheduleChunk = async (chunk) => {
  const ctx = await ensureAudioContext();
  const decoded = decodePCM16LEChunk(ctx, chunk);
  const nearNaturalRate = Math.abs(playbackRate - 1) <= STRETCH_BYPASS_EPSILON;
  if (nearNaturalRate) {
    // Drop any stretcher state so a later rate change starts from a clean buffer.
    stretcher?.reset();
    scheduleBuffer(ctx, decoded, playbackRate);
    return;
  }
  const channelTotal = Math.max(1, chunk.format.channels);
  const planes = [];
  for (let channelIndex = 0; channelIndex < channelTotal; channelIndex += 1) {
    planes.push(decoded.getChannelData(channelIndex));
  }
  // The stretcher is created lazily, the first time a non-unit rate is needed.
  stretcher ??= createTimeStretcher();
  const stretched = stretcher.process(planes, playbackRate, chunk.format.sampleRateHz);
  const frameTotal = stretched[0]?.length ?? 0;
  if (frameTotal === 0) {
    // Nothing could be emitted from the audio buffered so far.
    return;
  }
  const rendered = ctx.createBuffer(channelTotal, frameTotal, chunk.format.sampleRateHz);
  for (let channelIndex = 0; channelIndex < channelTotal; channelIndex += 1) {
    const plane = stretched[channelIndex];
    if (plane) {
      rendered.getChannelData(channelIndex).set(plane);
    }
  }
  scheduleBuffer(ctx, rendered, 1);
};
|
|
1771
1952
|
const stopQueuedPlayback = (options2) => {
|
|
1772
1953
|
for (const node of [...sourceNodes]) {
|
|
1773
1954
|
node.stop?.();
|
|
@@ -5130,6 +5311,71 @@ var createVoiceMemoryStore = () => {
|
|
|
5130
5311
|
// src/core/session.ts
|
|
5131
5312
|
import { Buffer as Buffer2 } from "buffer";
|
|
5132
5313
|
|
|
5314
|
+
// src/core/backchannel.ts
|
|
5315
|
+
// Acknowledgement phrases used when the caller does not supply `cues`.
var DEFAULT_CUES = [
  { text: "mm-hmm" },
  { text: "I see" },
  { text: "right" },
  { text: "go on" }
];
/**
 * Creates a driver that decides when a short backchannel cue should be emitted
 * while speech is being reported.
 *
 * @param {object} options
 * @param {(cue: {text: string}) => unknown} options.onCue Called with the cue to emit; may be async.
 * @param {Array<{text: string}>} [options.cues] Cue pool; defaults to DEFAULT_CUES.
 * @param {number} [options.minSpeechMs] Speech duration required before the first cue (default 2500).
 * @param {number} [options.cueIntervalMs] Minimum gap between cues (default 2500).
 * @param {(count: number) => number} [options.cueIndex] Maps the number of cues fired so far to an
 *   index into `cues`; defaults to round-robin.
 * @param {(error: unknown) => void} [options.onError] Optional hook that receives any error raised
 *   by `onCue` or `cueIndex`. Without it such errors are dropped, because cues are best-effort.
 * @returns {{noteSilence: (timestampMs?: number) => void, noteSpeech: (timestampMs?: number) => void, reset: () => void}}
 */
var createVoiceBackchannelDriver = (options) => {
  const cues = options.cues ?? DEFAULT_CUES;
  const minSpeechMs = options.minSpeechMs ?? 2500;
  const cueIntervalMs = options.cueIntervalMs ?? 2500;
  const cueIndexFn = options.cueIndex ?? ((index) => index % Math.max(cues.length, 1));
  let speechStartedAt;
  let lastCueAt;
  let cueCount = 0;
  // True while an `onCue` call is still pending, so cues never overlap.
  let firing = false;
  const tryFire = async (now) => {
    if (firing || cues.length === 0) {
      return;
    }
    if (speechStartedAt === undefined) {
      return;
    }
    const elapsed = now - speechStartedAt;
    if (elapsed < minSpeechMs) {
      return;
    }
    if (lastCueAt !== undefined && now - lastCueAt < cueIntervalMs) {
      return;
    }
    const cue = cues[cueIndexFn(cueCount)];
    if (!cue) {
      return;
    }
    firing = true;
    try {
      await options.onCue(cue);
    } finally {
      // A failed cue still counts, so a broken sink is not retried on every frame.
      firing = false;
      lastCueAt = now;
      cueCount += 1;
    }
  };
  const reportFailure = (error) => {
    try {
      options.onError?.(error);
    } catch {
      // The error hook itself must never turn into an unhandled rejection.
    }
  };
  return {
    noteSilence: (timestampMs) => {
      const now = timestampMs ?? Date.now();
      // Only after a cue has fired and a long quiet gap follows is the speech run restarted.
      if (lastCueAt !== undefined && now - lastCueAt > cueIntervalMs * 2) {
        speechStartedAt = undefined;
      }
    },
    noteSpeech: (timestampMs) => {
      const now = timestampMs ?? Date.now();
      if (speechStartedAt === undefined) {
        speechStartedAt = now;
      }
      // Fire-and-forget: this method is synchronous, so the rejection has to be
      // handled here or a failing `onCue` surfaces as an unhandled rejection.
      tryFire(now).catch(reportFailure);
    },
    reset: () => {
      speechStartedAt = undefined;
      lastCueAt = undefined;
      cueCount = 0;
    }
  };
};
|
|
5378
|
+
|
|
5133
5379
|
// src/core/handoff.ts
|
|
5134
5380
|
var toHex = (bytes) => Array.from(bytes, (byte) => byte.toString(16).padStart(2, "0")).join("");
|
|
5135
5381
|
var signHandoffBody = async (input) => {
|
|
@@ -7152,6 +7398,30 @@ var createVoiceSession = (options) => {
|
|
|
7152
7398
|
});
|
|
7153
7399
|
});
|
|
7154
7400
|
};
|
|
7401
|
+
// Queues a backchannel phrase on the TTS adapter. The cue is skipped when there
// is no text, no TTS adapter, or a turn/filler is already speaking.
const emitBackchannelCue = (text) => {
  const ttsBusy = () => activeTTSTurnId !== undefined || fillerActive;
  if (!text || !options.tts || ttsBusy()) {
    return;
  }
  runSerial("backchannel.send", async () => {
    // Re-check once the queued task actually runs: other speech may have started meanwhile.
    if (ttsBusy()) {
      return;
    }
    const adapterSession = await ensureTTSSession();
    if (adapterSession) {
      try {
        await adapterSession.send(text);
      } catch {
        // Best-effort: a failed cue is dropped on purpose.
      }
    }
  });
};
|
|
7417
|
+
// Backchannel driver for this session, or null when the feature is not enabled
// or no TTS adapter is configured. Only settings the caller actually provided
// are forwarded, so the driver's own defaults apply to everything else.
const backchannelDriver = (() => {
  const backchannel = options.backchannel;
  if (!backchannel?.enabled || !options.tts) {
    return null;
  }
  const driverOptions = {};
  if (backchannel.cueIntervalMs !== undefined) {
    driverOptions.cueIntervalMs = backchannel.cueIntervalMs;
  }
  if (backchannel.cues) {
    // Keep only non-blank strings and wrap each one as a `{ text }` cue object.
    const isSpeakable = (cue) => typeof cue === "string" && cue.trim().length > 0;
    driverOptions.cues = backchannel.cues.filter(isSpeakable).map((cue) => ({ text: cue }));
  }
  if (backchannel.minSpeechMs !== undefined) {
    driverOptions.minSpeechMs = backchannel.minSpeechMs;
  }
  driverOptions.onCue = (cue) => emitBackchannelCue(cue.text);
  return createVoiceBackchannelDriver(driverOptions);
})();
|
|
7155
7425
|
const createTurnTTSStreamer = (turn, session) => {
|
|
7156
7426
|
let buffer = "";
|
|
7157
7427
|
let full = "";
|
|
@@ -7643,6 +7913,7 @@ var createVoiceSession = (options) => {
|
|
|
7643
7913
|
};
|
|
7644
7914
|
const commitTurnInternal = async (reason = "manual") => {
|
|
7645
7915
|
clearSilenceTimer();
|
|
7916
|
+
backchannelDriver?.reset();
|
|
7646
7917
|
amdLastTurnCommitAt = Date.now();
|
|
7647
7918
|
const session = await readSession();
|
|
7648
7919
|
if (session.status === "completed" || session.status === "failed") {
|
|
@@ -7986,7 +8257,9 @@ var createVoiceSession = (options) => {
|
|
|
7986
8257
|
speechDetected = true;
|
|
7987
8258
|
clearSilenceTimer();
|
|
7988
8259
|
kickCallSilenceWatchdog();
|
|
8260
|
+
backchannelDriver?.noteSpeech();
|
|
7989
8261
|
} else if (speechDetected) {
|
|
8262
|
+
backchannelDriver?.noteSilence();
|
|
7990
8263
|
const currentSession = await readSession();
|
|
7991
8264
|
const hasTurnText = Boolean(buildTurnText(currentSession.currentTurn.transcripts, currentSession.currentTurn.partialText, {
|
|
7992
8265
|
partialEndedAtMs: currentSession.currentTurn.partialEndedAt,
|
|
@@ -13465,6 +13738,7 @@ var createTwilioMediaStreamBridge = (socket, options) => {
|
|
|
13465
13738
|
...options.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: options.bargeInMinPartialWords } : {},
|
|
13466
13739
|
...options.fillerFor ? { fillerFor: options.fillerFor } : {},
|
|
13467
13740
|
...options.fillerForTimeoutMs !== undefined ? { fillerForTimeoutMs: options.fillerForTimeoutMs } : {},
|
|
13741
|
+
...options.backchannel ? { backchannel: options.backchannel } : {},
|
|
13468
13742
|
...options.defaultSilentTurnAck !== undefined ? { defaultSilentTurnAck: options.defaultSilentTurnAck } : {},
|
|
13469
13743
|
...options.routeOnTurnTimeoutMs !== undefined ? { routeOnTurnTimeoutMs: options.routeOnTurnTimeoutMs } : {},
|
|
13470
13744
|
trace: options.trace,
|