@absolutejs/voice 0.0.22-beta.577 → 0.0.22-beta.579

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1533,12 +1533,165 @@ var createVoiceController = (path, options = {}) => {
1533
1533
  };
1534
1534
  };
1535
1535
 
1536
+ // src/client/timeStretch.ts
1537
// Tuning constants for the time-stretcher defined below.
+ var HOP_MS = 10; // grain hop in ms; init() converts it to samples (sampleRate * HOP_MS / MS_PER_SECOND)
1538
+ var SEEK_MS = 5; // search radius in ms; init() converts it to the +/- seek sample range scanned by bestOffset()
1539
+ var ENERGY_EPSILON = 0.000001; // added to the energy sum in correlationScore() so the square root is never zero
1540
+ var HALF = 0.5; // both coefficients of the Hann formula in makeHann()
1541
+ var MS_PER_SECOND = 1000; // divisor used to convert the millisecond constants to samples
1542
/**
 * Builds a Hann window using w[i] = HALF - HALF * cos(2 * PI * i / length).
 *
 * @param {number} length Number of taps in the window.
 * @returns {Float32Array} The window coefficients.
 */
var makeHann = (length) =>
  new Float32Array(length).map(
    (_, tap) => HALF - HALF * Math.cos(2 * Math.PI * tap / length)
  );
1549
/**
 * Scores how well `length` samples of `base`, starting at `start`, match `ref`.
 * The dot product is divided by the square root of the candidate's energy
 * (plus ENERGY_EPSILON); samples outside either array count as 0.
 *
 * @param {ArrayLike<number>} base Signal being searched.
 * @param {number} start Index in `base` where the candidate window begins.
 * @param {ArrayLike<number>} ref Reference frame to compare against.
 * @param {number} length Number of samples compared.
 * @returns {number} Similarity score; larger means a better match.
 */
var correlationScore = (base, start, ref, length) => {
  let crossSum = 0;
  let candidateEnergy = 0;
  let tap = 0;
  while (tap < length) {
    const value = base[start + tap] ?? 0;
    crossSum += value * (ref[tap] ?? 0);
    candidateEnergy += value * value;
    tap += 1;
  }
  return crossSum / Math.sqrt(candidateEnergy + ENERGY_EPSILON);
};
1559
/**
 * Windows one frame of `2 * hop` samples taken from `src` at `off` and
 * overlap-adds it with the carried-over half of the previous frame.
 * Samples or weights that fall outside their arrays count as 0.
 *
 * @param {ArrayLike<number>} src Input samples.
 * @param {number} off Index in `src` where the frame starts.
 * @param {ArrayLike<number>} tail Second half of the previous windowed frame.
 * @param {ArrayLike<number>} weights Window coefficients (first and second half).
 * @param {number} hop Number of output samples to produce.
 * @returns {{ nextTail: Float32Array, out: Float32Array }} `out` holds the
 *   finished samples; `nextTail` is this frame's windowed second half.
 */
var overlapAddGrain = (src, off, tail, weights, hop) => {
  const finished = new Float32Array(hop);
  const carry = new Float32Array(hop);
  let tap = 0;
  while (tap < hop) {
    const windowedHead = (src[off + tap] ?? 0) * (weights[tap] ?? 0);
    finished[tap] = (tail[tap] ?? 0) + windowedHead;
    carry[tap] = (src[off + hop + tap] ?? 0) * (weights[hop + tap] ?? 0);
    tap += 1;
  }
  return { nextTail: carry, out: finished };
};
1568
/**
 * Creates a streaming time-stretcher based on windowed overlap-add with a
 * waveform-similarity search (WSOLA-style).
 *
 * Input is buffered per channel. Every grain is a Hann-windowed frame of
 * `2 * hop` samples laid down at 50% overlap, so each grain yields `hop`
 * output samples, while the read position advances by `hop * speed`.
 * Reading faster than writing (speed > 1) therefore shortens the audio.
 *
 * Input that does not yet have enough look-ahead stays buffered between
 * calls; `reset()` discards it.
 *
 * @returns {{ process: (input: Float32Array[], speed: number, rate: number) => Float32Array[], reset: () => void }}
 */
var createTimeStretcher = () => {
  // Stream configuration, (re)derived by init() when rate or channel count changes.
  let sampleRate = 0;
  let channelCount = 0;
  let hop = 0; // output samples produced per grain
  let frameLen = 0; // windowed frame length, always 2 * hop
  let seek = 0; // half-width of the similarity search, in samples
  let weights = new Float32Array(0); // Hann window with frameLen taps
  let buffers = []; // per-channel input that has not been discarded yet
  let inputStart = 0; // absolute stream index of buffers[channel][0]
  let analysisPos = 0; // nominal (possibly fractional) absolute read position of the next grain
  let olaTail = []; // per-channel windowed second half of the previous grain
  let naturalRef = null; // channel-0 frame that directly follows the previous grain

  // Derives all sizes from the sample rate and clears every piece of stream state.
  const init = (rate, channels) => {
    sampleRate = rate;
    channelCount = channels;
    hop = Math.max(1, Math.round(sampleRate * HOP_MS / MS_PER_SECOND));
    frameLen = hop * 2;
    seek = Math.max(1, Math.round(sampleRate * SEEK_MS / MS_PER_SECOND));
    weights = makeHann(frameLen);
    buffers = Array.from({ length: channels }, () => new Float32Array(0));
    olaTail = Array.from({ length: channels }, () => new Float32Array(hop));
    inputStart = 0;
    // Start one search radius in so the first search window never reaches below index 0.
    analysisPos = seek;
    naturalRef = null;
  };

  // Drops buffered audio and overlap state but keeps the current rate/channel layout.
  const reset = () => {
    buffers = buffers.map(() => new Float32Array(0));
    olaTail = olaTail.map(() => new Float32Array(hop));
    inputStart = 0;
    analysisPos = seek;
    naturalRef = null;
  };

  // Appends one chunk per channel; a missing channel falls back to channel 0.
  const append = (input) => {
    for (let channel = 0; channel < channelCount; channel += 1) {
      const incoming = input[channel] ?? input[0] ?? new Float32Array(0);
      const existing = buffers[channel] ?? new Float32Array(0);
      const merged = new Float32Array(existing.length + incoming.length);
      merged.set(existing, 0);
      merged.set(incoming, existing.length);
      buffers[channel] = merged;
    }
  };

  // Absolute stream index one past the last buffered sample.
  const inputEnd = () => inputStart + (buffers[0]?.length ?? 0);

  // Discards input that can no longer be reached by a future search window.
  const compact = () => {
    const wanted = Math.floor(analysisPos) - seek - 1;
    // Never move past the samples actually received: at high speeds the read
    // position can jump beyond the buffered data, and advancing inputStart
    // that far would map the next chunk to the wrong stream positions.
    const keepFrom = Math.min(inputEnd(), Math.max(inputStart, wanted));
    if (keepFrom <= inputStart) {
      return;
    }
    const drop = keepFrom - inputStart;
    for (let channel = 0; channel < channelCount; channel += 1) {
      buffers[channel] = (buffers[channel] ?? new Float32Array(0)).slice(drop);
    }
    inputStart = keepFrom;
  };

  // Finds the shift in [-seek, seek] whose channel-0 frame best matches naturalRef.
  // Ties keep the first (most negative) shift; without a reference the shift is 0.
  const bestOffset = (center) => {
    if (!naturalRef) {
      return 0;
    }
    const [base] = buffers;
    if (!base) {
      return 0;
    }
    let bestDelta = 0;
    let bestScore = -Infinity;
    for (let delta = -seek; delta <= seek; delta += 1) {
      const score = correlationScore(base, center + delta - inputStart, naturalRef, frameLen);
      if (score <= bestScore) {
        continue;
      }
      bestScore = score;
      bestDelta = delta;
    }
    return bestDelta;
  };

  /**
   * Appends one chunk of audio and returns the stretched audio that is ready.
   *
   * @param {Float32Array[]} input One array per channel; the array count sets the channel count (at least 1).
   * @param {number} speed Speed factor; must be finite and greater than 0.
   * @param {number} rate Sample rate in Hz; a change re-initialises the stretcher.
   * @returns {Float32Array[]} One array per channel, each a multiple of `hop`
   *   samples long; empty arrays when no grain could be emitted yet.
   * @throws {RangeError} When `speed` is not a finite number greater than 0.
   */
  const process = (input, speed, rate) => {
    // A zero step would keep the emit loop below spinning forever, and a
    // NaN/Infinity step would poison analysisPos until the next reset().
    // Validate before touching any state so a rejected call changes nothing.
    if (!Number.isFinite(speed) || speed <= 0) {
      throw new RangeError(
        `time-stretch speed must be a finite number greater than 0, got ${String(speed)}`
      );
    }
    const channels = Math.max(1, input.length);
    if (sampleRate !== rate || channelCount !== channels) {
      init(rate, channels);
    }
    append(input);
    const analysisHop = hop * speed;
    const segments = Array.from({ length: channelCount }, () => []);

    // Overlap-adds the frame at absolute position `pos` on every channel.
    const emitGrain = (pos) => {
      const off = pos - inputStart;
      for (let channel = 0; channel < channelCount; channel += 1) {
        const src = buffers[channel];
        const tail = olaTail[channel];
        if (!src || !tail) {
          continue;
        }
        const grain = overlapAddGrain(src, off, tail, weights, hop);
        olaTail[channel] = grain.nextTail;
        segments[channel]?.push(grain.out);
      }
    };

    // Remembers the channel-0 frame one hop after `pos` as the next search target.
    const captureRef = (pos) => {
      const ref = new Float32Array(frameLen);
      const refOff = pos + hop - inputStart;
      const [base] = buffers;
      if (base) {
        ref.set(base.subarray(refOff, refOff + frameLen));
      }
      naturalRef = ref;
    };

    // Uses the same rounded centre as the emit loop. The farthest read is the
    // reference frame: (center + seek) + hop + frameLen must fit in the buffer.
    const canEmit = () => {
      const center = Math.round(analysisPos);
      return center - seek >= inputStart && center + seek + frameLen + hop <= inputEnd();
    };

    while (canEmit()) {
      const center = Math.round(analysisPos);
      const pos = center + bestOffset(center);
      emitGrain(pos);
      captureRef(pos);
      analysisPos += analysisHop;
    }
    compact();

    // Concatenate each channel's grains into one contiguous array.
    return segments.map((channelSegments) => {
      const total = channelSegments.reduce((sum, seg) => sum + seg.length, 0);
      const merged = new Float32Array(total);
      let offset = 0;
      for (const seg of channelSegments) {
        merged.set(seg, offset);
        offset += seg.length;
      }
      return merged;
    });
  };

  return { process, reset };
};
1687
+
1536
1688
  // src/client/audioPlayer.ts
1537
1689
  var DEFAULT_LOOKAHEAD_MS = 15;
1538
1690
  var DEFAULT_VOLUME = 1;
1539
1691
  var DEFAULT_PLAYBACK_RATE = 1;
1540
1692
  var MIN_PLAYBACK_RATE = 0.5;
1541
1693
  var MAX_PLAYBACK_RATE = 2;
1694
+ var STRETCH_BYPASS_EPSILON = 0.01;
1542
1695
  var createInitialState3 = () => ({
1543
1696
  activeSourceCount: 0,
1544
1697
  error: null,
@@ -1601,6 +1754,7 @@ var createVoiceAudioPlayer = (source, options = {}) => {
1601
1754
  let outputNode = null;
1602
1755
  let volume = clampVolume(options.volume);
1603
1756
  let playbackRate = clampPlaybackRate(options.playbackRate);
1757
+ let stretcher = null;
1604
1758
  let queueEndTime = 0;
1605
1759
  let syncPromise = Promise.resolve();
1606
1760
  let interruptStartedAt = null;
@@ -1633,6 +1787,7 @@ var createVoiceAudioPlayer = (source, options = {}) => {
1633
1787
  const resolveInterrupt = (latencyMs) => {
1634
1788
  clearInterruptTimer();
1635
1789
  interruptStartedAt = null;
1790
+ stretcher?.reset();
1636
1791
  setState({
1637
1792
  activeSourceCount: sourceNodes.size,
1638
1793
  isPlaying: false,
@@ -1697,13 +1852,11 @@ var createVoiceAudioPlayer = (source, options = {}) => {
1697
1852
  queueEndTime = audioContext.currentTime;
1698
1853
  return audioContext;
1699
1854
  };
1700
- const scheduleChunk = async (chunk) => {
1701
- const context = await ensureAudioContext();
1702
- const buffer = decodePCM16LEChunk(context, chunk);
1855
+ const scheduleBuffer = (context, buffer, rate) => {
1703
1856
  const node = context.createBufferSource();
1704
1857
  node.buffer = buffer;
1705
1858
  if (node.playbackRate) {
1706
- node.playbackRate.value = playbackRate;
1859
+ node.playbackRate.value = rate;
1707
1860
  }
1708
1861
  node.connect(outputNode ?? context.destination);
1709
1862
  node.onended = () => {
@@ -1716,7 +1869,7 @@ var createVoiceAudioPlayer = (source, options = {}) => {
1716
1869
  maybeResolveInterrupt();
1717
1870
  };
1718
1871
  const startAt = Math.max(context.currentTime + lookaheadSeconds, queueEndTime);
1719
- queueEndTime = startAt + buffer.duration / playbackRate;
1872
+ queueEndTime = startAt + buffer.duration / rate;
1720
1873
  sourceNodes.add(node);
1721
1874
  setState({
1722
1875
  activeSourceCount: sourceNodes.size,
@@ -1724,6 +1877,34 @@ var createVoiceAudioPlayer = (source, options = {}) => {
1724
1877
  });
1725
1878
  node.start(startAt);
1726
1879
  };
1880
// Decodes one PCM chunk and queues it for playback.
// When playbackRate is within STRETCH_BYPASS_EPSILON of 1 the decoded buffer is
// scheduled directly (and any stretcher state is reset). Otherwise the chunk is
// run through the time-stretcher and the result is scheduled at rate 1.
const scheduleChunk = async (chunk) => {
  const context = await ensureAudioContext();
  const decoded = decodePCM16LEChunk(context, chunk);
  const bypassStretch = Math.abs(playbackRate - 1) <= STRETCH_BYPASS_EPSILON;
  if (bypassStretch) {
    stretcher?.reset();
    scheduleBuffer(context, decoded, playbackRate);
    return;
  }
  const channels = Math.max(1, chunk.format.channels);
  const planes = [];
  let planeIndex = 0;
  while (planeIndex < channels) {
    planes.push(decoded.getChannelData(planeIndex));
    planeIndex += 1;
  }
  // The stretcher is created lazily, on the first chunk that needs it.
  stretcher = stretcher ?? createTimeStretcher();
  const stretched = stretcher.process(planes, playbackRate, chunk.format.sampleRateHz);
  const frames = stretched[0]?.length ?? 0;
  if (frames === 0) {
    // Nothing is ready yet; the stretcher keeps the input buffered.
    return;
  }
  const outBuffer = context.createBuffer(channels, frames, chunk.format.sampleRateHz);
  for (let target = 0; target < channels; target += 1) {
    const samples = stretched[target];
    if (samples) {
      outBuffer.getChannelData(target).set(samples);
    }
  }
  scheduleBuffer(context, outBuffer, 1);
};
1727
1908
  const stopQueuedPlayback = (options2) => {
1728
1909
  for (const node of [...sourceNodes]) {
1729
1910
  node.stop?.();
@@ -2,6 +2,7 @@ export { bindVoiceReactiveSource, voiceSseReactiveSource, } from "./reactiveSour
2
2
  export type { VoiceReactiveSource, VoiceSseReactiveSourceOptions, } from "./reactiveSource";
3
3
  export { createVoiceConnection } from "./connection";
4
4
  export { createVoiceAudioPlayer, decodeVoiceAudioChunk } from "./audioPlayer";
5
+ export { createTimeStretcher, type TimeStretcher } from "./timeStretch";
5
6
  export { createVoiceStream } from "./createVoiceStream";
6
7
  export { createVoiceBrowserMediaReporter } from "./browserMedia";
7
8
  export type { VoiceBrowserMediaReporter } from "./browserMedia";
@@ -370,12 +370,165 @@ var createVoiceConnection = (path, options = {}) => {
370
370
  getSessionId: () => state.sessionId
371
371
  };
372
372
  };
373
+ // src/client/timeStretch.ts
374
// Tuning constants for the time-stretcher defined below.
+ var HOP_MS = 10; // grain hop in ms; init() converts it to samples (sampleRate * HOP_MS / MS_PER_SECOND)
375
+ var SEEK_MS = 5; // search radius in ms; init() converts it to the +/- seek sample range scanned by bestOffset()
376
+ var ENERGY_EPSILON = 0.000001; // added to the energy sum in correlationScore() so the square root is never zero
377
+ var HALF = 0.5; // both coefficients of the Hann formula in makeHann()
378
+ var MS_PER_SECOND = 1000; // divisor used to convert the millisecond constants to samples
379
/**
 * Builds a Hann window using w[i] = HALF - HALF * cos(2 * PI * i / length).
 *
 * @param {number} length Number of taps in the window.
 * @returns {Float32Array} The window coefficients.
 */
var makeHann = (length) =>
  new Float32Array(length).map(
    (_, tap) => HALF - HALF * Math.cos(2 * Math.PI * tap / length)
  );
386
/**
 * Scores how well `length` samples of `base`, starting at `start`, match `ref`.
 * The dot product is divided by the square root of the candidate's energy
 * (plus ENERGY_EPSILON); samples outside either array count as 0.
 *
 * @param {ArrayLike<number>} base Signal being searched.
 * @param {number} start Index in `base` where the candidate window begins.
 * @param {ArrayLike<number>} ref Reference frame to compare against.
 * @param {number} length Number of samples compared.
 * @returns {number} Similarity score; larger means a better match.
 */
var correlationScore = (base, start, ref, length) => {
  let crossSum = 0;
  let candidateEnergy = 0;
  let tap = 0;
  while (tap < length) {
    const value = base[start + tap] ?? 0;
    crossSum += value * (ref[tap] ?? 0);
    candidateEnergy += value * value;
    tap += 1;
  }
  return crossSum / Math.sqrt(candidateEnergy + ENERGY_EPSILON);
};
396
/**
 * Windows one frame of `2 * hop` samples taken from `src` at `off` and
 * overlap-adds it with the carried-over half of the previous frame.
 * Samples or weights that fall outside their arrays count as 0.
 *
 * @param {ArrayLike<number>} src Input samples.
 * @param {number} off Index in `src` where the frame starts.
 * @param {ArrayLike<number>} tail Second half of the previous windowed frame.
 * @param {ArrayLike<number>} weights Window coefficients (first and second half).
 * @param {number} hop Number of output samples to produce.
 * @returns {{ nextTail: Float32Array, out: Float32Array }} `out` holds the
 *   finished samples; `nextTail` is this frame's windowed second half.
 */
var overlapAddGrain = (src, off, tail, weights, hop) => {
  const finished = new Float32Array(hop);
  const carry = new Float32Array(hop);
  let tap = 0;
  while (tap < hop) {
    const windowedHead = (src[off + tap] ?? 0) * (weights[tap] ?? 0);
    finished[tap] = (tail[tap] ?? 0) + windowedHead;
    carry[tap] = (src[off + hop + tap] ?? 0) * (weights[hop + tap] ?? 0);
    tap += 1;
  }
  return { nextTail: carry, out: finished };
};
405
/**
 * Creates a streaming time-stretcher based on windowed overlap-add with a
 * waveform-similarity search (WSOLA-style).
 *
 * Input is buffered per channel. Every grain is a Hann-windowed frame of
 * `2 * hop` samples laid down at 50% overlap, so each grain yields `hop`
 * output samples, while the read position advances by `hop * speed`.
 * Reading faster than writing (speed > 1) therefore shortens the audio.
 *
 * Input that does not yet have enough look-ahead stays buffered between
 * calls; `reset()` discards it.
 *
 * @returns {{ process: (input: Float32Array[], speed: number, rate: number) => Float32Array[], reset: () => void }}
 */
var createTimeStretcher = () => {
  // Stream configuration, (re)derived by init() when rate or channel count changes.
  let sampleRate = 0;
  let channelCount = 0;
  let hop = 0; // output samples produced per grain
  let frameLen = 0; // windowed frame length, always 2 * hop
  let seek = 0; // half-width of the similarity search, in samples
  let weights = new Float32Array(0); // Hann window with frameLen taps
  let buffers = []; // per-channel input that has not been discarded yet
  let inputStart = 0; // absolute stream index of buffers[channel][0]
  let analysisPos = 0; // nominal (possibly fractional) absolute read position of the next grain
  let olaTail = []; // per-channel windowed second half of the previous grain
  let naturalRef = null; // channel-0 frame that directly follows the previous grain

  // Derives all sizes from the sample rate and clears every piece of stream state.
  const init = (rate, channels) => {
    sampleRate = rate;
    channelCount = channels;
    hop = Math.max(1, Math.round(sampleRate * HOP_MS / MS_PER_SECOND));
    frameLen = hop * 2;
    seek = Math.max(1, Math.round(sampleRate * SEEK_MS / MS_PER_SECOND));
    weights = makeHann(frameLen);
    buffers = Array.from({ length: channels }, () => new Float32Array(0));
    olaTail = Array.from({ length: channels }, () => new Float32Array(hop));
    inputStart = 0;
    // Start one search radius in so the first search window never reaches below index 0.
    analysisPos = seek;
    naturalRef = null;
  };

  // Drops buffered audio and overlap state but keeps the current rate/channel layout.
  const reset = () => {
    buffers = buffers.map(() => new Float32Array(0));
    olaTail = olaTail.map(() => new Float32Array(hop));
    inputStart = 0;
    analysisPos = seek;
    naturalRef = null;
  };

  // Appends one chunk per channel; a missing channel falls back to channel 0.
  const append = (input) => {
    for (let channel = 0; channel < channelCount; channel += 1) {
      const incoming = input[channel] ?? input[0] ?? new Float32Array(0);
      const existing = buffers[channel] ?? new Float32Array(0);
      const merged = new Float32Array(existing.length + incoming.length);
      merged.set(existing, 0);
      merged.set(incoming, existing.length);
      buffers[channel] = merged;
    }
  };

  // Absolute stream index one past the last buffered sample.
  const inputEnd = () => inputStart + (buffers[0]?.length ?? 0);

  // Discards input that can no longer be reached by a future search window.
  const compact = () => {
    const wanted = Math.floor(analysisPos) - seek - 1;
    // Never move past the samples actually received: at high speeds the read
    // position can jump beyond the buffered data, and advancing inputStart
    // that far would map the next chunk to the wrong stream positions.
    const keepFrom = Math.min(inputEnd(), Math.max(inputStart, wanted));
    if (keepFrom <= inputStart) {
      return;
    }
    const drop = keepFrom - inputStart;
    for (let channel = 0; channel < channelCount; channel += 1) {
      buffers[channel] = (buffers[channel] ?? new Float32Array(0)).slice(drop);
    }
    inputStart = keepFrom;
  };

  // Finds the shift in [-seek, seek] whose channel-0 frame best matches naturalRef.
  // Ties keep the first (most negative) shift; without a reference the shift is 0.
  const bestOffset = (center) => {
    if (!naturalRef) {
      return 0;
    }
    const [base] = buffers;
    if (!base) {
      return 0;
    }
    let bestDelta = 0;
    let bestScore = -Infinity;
    for (let delta = -seek; delta <= seek; delta += 1) {
      const score = correlationScore(base, center + delta - inputStart, naturalRef, frameLen);
      if (score <= bestScore) {
        continue;
      }
      bestScore = score;
      bestDelta = delta;
    }
    return bestDelta;
  };

  /**
   * Appends one chunk of audio and returns the stretched audio that is ready.
   *
   * @param {Float32Array[]} input One array per channel; the array count sets the channel count (at least 1).
   * @param {number} speed Speed factor; must be finite and greater than 0.
   * @param {number} rate Sample rate in Hz; a change re-initialises the stretcher.
   * @returns {Float32Array[]} One array per channel, each a multiple of `hop`
   *   samples long; empty arrays when no grain could be emitted yet.
   * @throws {RangeError} When `speed` is not a finite number greater than 0.
   */
  const process = (input, speed, rate) => {
    // A zero step would keep the emit loop below spinning forever, and a
    // NaN/Infinity step would poison analysisPos until the next reset().
    // Validate before touching any state so a rejected call changes nothing.
    if (!Number.isFinite(speed) || speed <= 0) {
      throw new RangeError(
        `time-stretch speed must be a finite number greater than 0, got ${String(speed)}`
      );
    }
    const channels = Math.max(1, input.length);
    if (sampleRate !== rate || channelCount !== channels) {
      init(rate, channels);
    }
    append(input);
    const analysisHop = hop * speed;
    const segments = Array.from({ length: channelCount }, () => []);

    // Overlap-adds the frame at absolute position `pos` on every channel.
    const emitGrain = (pos) => {
      const off = pos - inputStart;
      for (let channel = 0; channel < channelCount; channel += 1) {
        const src = buffers[channel];
        const tail = olaTail[channel];
        if (!src || !tail) {
          continue;
        }
        const grain = overlapAddGrain(src, off, tail, weights, hop);
        olaTail[channel] = grain.nextTail;
        segments[channel]?.push(grain.out);
      }
    };

    // Remembers the channel-0 frame one hop after `pos` as the next search target.
    const captureRef = (pos) => {
      const ref = new Float32Array(frameLen);
      const refOff = pos + hop - inputStart;
      const [base] = buffers;
      if (base) {
        ref.set(base.subarray(refOff, refOff + frameLen));
      }
      naturalRef = ref;
    };

    // Uses the same rounded centre as the emit loop. The farthest read is the
    // reference frame: (center + seek) + hop + frameLen must fit in the buffer.
    const canEmit = () => {
      const center = Math.round(analysisPos);
      return center - seek >= inputStart && center + seek + frameLen + hop <= inputEnd();
    };

    while (canEmit()) {
      const center = Math.round(analysisPos);
      const pos = center + bestOffset(center);
      emitGrain(pos);
      captureRef(pos);
      analysisPos += analysisHop;
    }
    compact();

    // Concatenate each channel's grains into one contiguous array.
    return segments.map((channelSegments) => {
      const total = channelSegments.reduce((sum, seg) => sum + seg.length, 0);
      const merged = new Float32Array(total);
      let offset = 0;
      for (const seg of channelSegments) {
        merged.set(seg, offset);
        offset += seg.length;
      }
      return merged;
    });
  };

  return { process, reset };
};
524
+
373
525
  // src/client/audioPlayer.ts
374
526
  var DEFAULT_LOOKAHEAD_MS = 15;
375
527
  var DEFAULT_VOLUME = 1;
376
528
  var DEFAULT_PLAYBACK_RATE = 1;
377
529
  var MIN_PLAYBACK_RATE = 0.5;
378
530
  var MAX_PLAYBACK_RATE = 2;
531
+ var STRETCH_BYPASS_EPSILON = 0.01;
379
532
  var createInitialState = () => ({
380
533
  activeSourceCount: 0,
381
534
  error: null,
@@ -438,6 +591,7 @@ var createVoiceAudioPlayer = (source, options = {}) => {
438
591
  let outputNode = null;
439
592
  let volume = clampVolume(options.volume);
440
593
  let playbackRate = clampPlaybackRate(options.playbackRate);
594
+ let stretcher = null;
441
595
  let queueEndTime = 0;
442
596
  let syncPromise = Promise.resolve();
443
597
  let interruptStartedAt = null;
@@ -470,6 +624,7 @@ var createVoiceAudioPlayer = (source, options = {}) => {
470
624
  const resolveInterrupt = (latencyMs) => {
471
625
  clearInterruptTimer();
472
626
  interruptStartedAt = null;
627
+ stretcher?.reset();
473
628
  setState({
474
629
  activeSourceCount: sourceNodes.size,
475
630
  isPlaying: false,
@@ -534,13 +689,11 @@ var createVoiceAudioPlayer = (source, options = {}) => {
534
689
  queueEndTime = audioContext.currentTime;
535
690
  return audioContext;
536
691
  };
537
- const scheduleChunk = async (chunk) => {
538
- const context = await ensureAudioContext();
539
- const buffer = decodePCM16LEChunk(context, chunk);
692
+ const scheduleBuffer = (context, buffer, rate) => {
540
693
  const node = context.createBufferSource();
541
694
  node.buffer = buffer;
542
695
  if (node.playbackRate) {
543
- node.playbackRate.value = playbackRate;
696
+ node.playbackRate.value = rate;
544
697
  }
545
698
  node.connect(outputNode ?? context.destination);
546
699
  node.onended = () => {
@@ -553,7 +706,7 @@ var createVoiceAudioPlayer = (source, options = {}) => {
553
706
  maybeResolveInterrupt();
554
707
  };
555
708
  const startAt = Math.max(context.currentTime + lookaheadSeconds, queueEndTime);
556
- queueEndTime = startAt + buffer.duration / playbackRate;
709
+ queueEndTime = startAt + buffer.duration / rate;
557
710
  sourceNodes.add(node);
558
711
  setState({
559
712
  activeSourceCount: sourceNodes.size,
@@ -561,6 +714,34 @@ var createVoiceAudioPlayer = (source, options = {}) => {
561
714
  });
562
715
  node.start(startAt);
563
716
  };
717
// Decodes one PCM chunk and queues it for playback.
// When playbackRate is within STRETCH_BYPASS_EPSILON of 1 the decoded buffer is
// scheduled directly (and any stretcher state is reset). Otherwise the chunk is
// run through the time-stretcher and the result is scheduled at rate 1.
const scheduleChunk = async (chunk) => {
  const context = await ensureAudioContext();
  const decoded = decodePCM16LEChunk(context, chunk);
  const bypassStretch = Math.abs(playbackRate - 1) <= STRETCH_BYPASS_EPSILON;
  if (bypassStretch) {
    stretcher?.reset();
    scheduleBuffer(context, decoded, playbackRate);
    return;
  }
  const channels = Math.max(1, chunk.format.channels);
  const planes = [];
  let planeIndex = 0;
  while (planeIndex < channels) {
    planes.push(decoded.getChannelData(planeIndex));
    planeIndex += 1;
  }
  // The stretcher is created lazily, on the first chunk that needs it.
  stretcher = stretcher ?? createTimeStretcher();
  const stretched = stretcher.process(planes, playbackRate, chunk.format.sampleRateHz);
  const frames = stretched[0]?.length ?? 0;
  if (frames === 0) {
    // Nothing is ready yet; the stretcher keeps the input buffered.
    return;
  }
  const outBuffer = context.createBuffer(channels, frames, chunk.format.sampleRateHz);
  for (let target = 0; target < channels; target += 1) {
    const samples = stretched[target];
    if (samples) {
      outBuffer.getChannelData(target).set(samples);
    }
  }
  scheduleBuffer(context, outBuffer, 1);
};
564
745
  const stopQueuedPlayback = (options2) => {
565
746
  for (const node of [...sourceNodes]) {
566
747
  node.stop?.();
@@ -12303,6 +12484,7 @@ export {
12303
12484
  createVoiceAudioPlayer,
12304
12485
  createVoiceAgentSquadStatusViewModel,
12305
12486
  createVoiceAgentSquadStatusStore,
12487
+ createTimeStretcher,
12306
12488
  createMicrophoneCapture,
12307
12489
  buildVoiceAgentSquadStatusReport,
12308
12490
  bindVoiceReactiveSource,
@@ -0,0 +1,5 @@
1
/** Streaming time-stretcher object returned by createTimeStretcher(). */
+ export type TimeStretcher = {
2
+ process: (input: Float32Array[], speed: number, sampleRate: number) => Float32Array[]; // appends one chunk (one array per channel) and returns the stretched audio that is ready, one array per channel (possibly empty)
3
+ reset: () => void; // discards buffered input and overlap state
4
+ };
5
+ export declare const createTimeStretcher: () => TimeStretcher; // factory; each call returns an independent stretcher
@@ -10,6 +10,12 @@ export type VoiceBackchannelDriverOptions = {
10
10
  minSpeechMs?: number;
11
11
  onCue: (cue: VoiceBackchannelCue) => Promise<void> | void;
12
12
  };
13
/** Session-level settings for backchannel cues (short acknowledgements spoken while the caller is talking). */
+ export type VoiceBackchannelConfig = {
14
+ enabled?: boolean; // the session only creates a backchannel driver when this is truthy (and TTS is configured)
15
+ cues?: ReadonlyArray<string>; // cue phrases; blank entries are dropped by the session; the driver's built-in cues apply when omitted
16
+ minSpeechMs?: number; // minimum time since speech started before a cue may fire; the driver defaults to 2500
17
+ cueIntervalMs?: number; // minimum gap between two cues; the driver defaults to 2500
18
+ };
13
19
  export type VoiceBackchannelDriver = {
14
20
  noteSpeech: (timestampMs?: number) => void;
15
21
  noteSilence: (timestampMs?: number) => void;
@@ -783,6 +783,7 @@ export type VoicePluginConfig<TContext = unknown, TSession extends VoiceSessionR
783
783
  userText: string;
784
784
  }) => Promise<string | null>;
785
785
  fillerForTimeoutMs?: number;
786
+ backchannel?: import("./backchannel").VoiceBackchannelConfig;
786
787
  defaultSilentTurnAck?: string;
787
788
  routeOnTurnTimeoutMs?: number;
788
789
  audioConditioning?: VoiceAudioConditioningConfig;
@@ -968,6 +969,14 @@ export type CreateVoiceSessionOptions<TContext = unknown, TSession extends Voice
968
969
  }) => Promise<string | null>;
969
970
  /** Ceiling for the `fillerFor` call before we fall back to a static phrase. Default 600ms. */
970
971
  fillerForTimeoutMs?: number;
972
+ /**
973
+ * Backchannel cues — short "mm-hm"/"right" acknowledgements played while the
974
+ * CALLER is mid-turn (a long answer) so they feel heard, the way a human
975
+ * listener interjects. Plays on the same non-turn TTS path as fillers, so it
976
+ * never registers as the assistant's turn or trips barge-in. Off unless
977
+ * `enabled` is set. Fires only while the assistant is silent.
978
+ */
979
+ backchannel?: import("./backchannel").VoiceBackchannelConfig;
971
980
  /**
972
981
  * Default spoken ack if the model returns ONLY tool calls (no text) and the
973
982
  * turn isn't ending. Without this, the caller hears total silence after
package/dist/index.d.ts CHANGED
@@ -98,7 +98,7 @@ export type { VoiceCampaignDisposition, VoiceCampaignDispositionRetryPolicy, Voi
98
98
  export { createVoiceBackchannelDriver } from "./core/backchannel";
99
99
  export { createVoiceOAuth2TokenSource } from "./core/oauth2TokenSource";
100
100
  export type { CreateVoiceOAuth2TokenSourceOptions, VoiceOAuth2TokenResponse, VoiceOAuth2TokenSource, } from "./core/oauth2TokenSource";
101
- export type { VoiceBackchannelCue, VoiceBackchannelDriver, VoiceBackchannelDriverOptions, } from "./core/backchannel";
101
+ export type { VoiceBackchannelConfig, VoiceBackchannelCue, VoiceBackchannelDriver, VoiceBackchannelDriverOptions, } from "./core/backchannel";
102
102
  export { createVoiceIVRSession, describeVoiceIVRPlan, evaluateVoiceIVRPlan, } from "./core/ivrPlan";
103
103
  export type { VoiceIVRBranch, VoiceIVRDecision, VoiceIVRInput, VoiceIVRMatch, VoiceIVRPlan, VoiceIVRSession, } from "./core/ivrPlan";
104
104
  export { VOICE_CALLER_MEMORY_KEY, buildVoiceCallerMemoryNamespace, createVoiceCallerMemoryNamespace, summarizeVoiceCallerTranscript, } from "./core/callerMemory";
package/dist/index.js CHANGED
@@ -3091,6 +3091,71 @@ var toVoiceSessionSummary = (session) => ({
3091
3091
  // src/core/session.ts
3092
3092
  import { Buffer as Buffer2 } from "buffer";
3093
3093
 
3094
+ // src/core/backchannel.ts
3095
// Cues used by createVoiceBackchannelDriver when options.cues is not provided.
+ var DEFAULT_CUES = [
3096
+ { text: "mm-hmm" },
3097
+ { text: "I see" },
3098
+ { text: "right" },
3099
+ { text: "go on" }
3100
+ ];
3101
/**
 * Creates a driver that decides when to emit short listener acknowledgements
 * ("backchannel" cues) from a stream of speech / silence notifications.
 *
 * A cue fires from noteSpeech() once speech has lasted at least `minSpeechMs`,
 * at least `cueIntervalMs` has passed since the previous cue, and no cue is
 * currently in flight.
 *
 * @param {object} options
 * @param {Array<{ text: string }>} [options.cues] Cues to rotate through; defaults to DEFAULT_CUES.
 * @param {number} [options.minSpeechMs] Minimum speech duration before the first cue. Default 2500.
 * @param {number} [options.cueIntervalMs] Minimum gap between cues. Default 2500.
 * @param {(count: number) => number} [options.cueIndex] Maps the number of cues
 *   fired so far to an index into `cues`; defaults to round-robin.
 * @param {(cue: { text: string }) => Promise<void> | void} options.onCue Called with the chosen cue.
 * @returns {{ noteSilence: (timestampMs?: number) => void, noteSpeech: (timestampMs?: number) => void, reset: () => void }}
 */
var createVoiceBackchannelDriver = (options) => {
  const cues = options.cues ?? DEFAULT_CUES;
  const minSpeechMs = options.minSpeechMs ?? 2500;
  const cueIntervalMs = options.cueIntervalMs ?? 2500;
  const cueIndexFn = options.cueIndex ?? ((index) => index % Math.max(cues.length, 1));
  let speechStartedAt;
  let lastCueAt;
  let cueCount = 0;
  let firing = false;
  // Bumped by reset() so a cue that was already in flight cannot write its
  // bookkeeping back into the freshly cleared state.
  let generation = 0;

  // Fires one cue if every gate passes; resolves once onCue has settled.
  const tryFire = async (now) => {
    if (firing || cues.length === 0) {
      return;
    }
    if (speechStartedAt === undefined) {
      return;
    }
    const elapsed = now - speechStartedAt;
    if (elapsed < minSpeechMs) {
      return;
    }
    if (lastCueAt !== undefined && now - lastCueAt < cueIntervalMs) {
      return;
    }
    const cue = cues[cueIndexFn(cueCount)];
    if (!cue) {
      return;
    }
    const firedInGeneration = generation;
    firing = true;
    try {
      await options.onCue(cue);
    } finally {
      firing = false;
      // The attempt is recorded even when onCue failed, so a failing handler
      // is throttled by cueIntervalMs like a successful one.
      if (firedInGeneration === generation) {
        lastCueAt = now;
        cueCount += 1;
      }
    }
  };

  return {
    noteSilence: (timestampMs) => {
      const now = timestampMs ?? Date.now();
      // Speech tracking is only cleared once a cue has fired and more than
      // two cue intervals have passed since it.
      if (lastCueAt !== undefined && now - lastCueAt > cueIntervalMs * 2) {
        speechStartedAt = undefined;
      }
    },
    noteSpeech: (timestampMs) => {
      const now = timestampMs ?? Date.now();
      if (speechStartedAt === undefined) {
        speechStartedAt = now;
      }
      // Fire-and-forget: this method is synchronous and has no channel for
      // reporting failures. Cues are best-effort, so a throwing or rejecting
      // callback is dropped here instead of surfacing as an unhandled promise
      // rejection in the host process.
      tryFire(now).catch(() => {});
    },
    reset: () => {
      generation += 1;
      speechStartedAt = undefined;
      lastCueAt = undefined;
      cueCount = 0;
    }
  };
};
3158
+
3094
3159
  // src/core/handoff.ts
3095
3160
  var toHex3 = (bytes) => Array.from(bytes, (byte) => byte.toString(16).padStart(2, "0")).join("");
3096
3161
  var signHandoffBody = async (input) => {
@@ -5217,6 +5282,30 @@ var createVoiceSession = (options) => {
5217
5282
  });
5218
5283
  });
5219
5284
  };
5285
// Sends one backchannel cue to the TTS adapter session, but only while no
// assistant TTS turn and no filler is active. Nothing is returned.
const emitBackchannelCue = (text) => {
  const assistantIsSpeaking = () => activeTTSTurnId !== undefined || fillerActive;
  if (!text || !options.tts || assistantIsSpeaking()) {
    return;
  }
  runSerial("backchannel.send", async () => {
    // Checked again inside the task: the flags may have changed between the
    // call above and the moment this task runs.
    if (assistantIsSpeaking()) {
      return;
    }
    const ttsSession = await ensureTTSSession();
    if (!ttsSession) {
      return;
    }
    try {
      await ttsSession.send(text);
    } catch {
      // Errors from send() are discarded; the cue is simply not spoken.
    }
  });
};
5301
// Backchannel driver for this session, or null when backchannel is not enabled
// or no TTS adapter is configured. Only the settings that were actually
// provided are forwarded, so the driver's own defaults apply to the rest.
const backchannelDriver = (() => {
  if (!(options.backchannel?.enabled && options.tts)) {
    return null;
  }
  const driverOptions = {};
  if (options.backchannel.cueIntervalMs !== undefined) {
    driverOptions.cueIntervalMs = options.backchannel.cueIntervalMs;
  }
  if (options.backchannel.cues) {
    // Keep only non-blank strings and wrap each one in the driver's cue shape.
    driverOptions.cues = options.backchannel.cues
      .filter((cue) => typeof cue === "string" && cue.trim().length > 0)
      .map((cue) => ({ text: cue }));
  }
  if (options.backchannel.minSpeechMs !== undefined) {
    driverOptions.minSpeechMs = options.backchannel.minSpeechMs;
  }
  driverOptions.onCue = (cue) => emitBackchannelCue(cue.text);
  return createVoiceBackchannelDriver(driverOptions);
})();
5220
5309
  const createTurnTTSStreamer = (turn, session) => {
5221
5310
  let buffer = "";
5222
5311
  let full = "";
@@ -5708,6 +5797,7 @@ var createVoiceSession = (options) => {
5708
5797
  };
5709
5798
  const commitTurnInternal = async (reason = "manual") => {
5710
5799
  clearSilenceTimer();
5800
+ backchannelDriver?.reset();
5711
5801
  amdLastTurnCommitAt = Date.now();
5712
5802
  const session = await readSession();
5713
5803
  if (session.status === "completed" || session.status === "failed") {
@@ -6051,7 +6141,9 @@ var createVoiceSession = (options) => {
6051
6141
  speechDetected = true;
6052
6142
  clearSilenceTimer();
6053
6143
  kickCallSilenceWatchdog();
6144
+ backchannelDriver?.noteSpeech();
6054
6145
  } else if (speechDetected) {
6146
+ backchannelDriver?.noteSilence();
6055
6147
  const currentSession = await readSession();
6056
6148
  const hasTurnText = Boolean(buildTurnText(currentSession.currentTurn.transcripts, currentSession.currentTurn.partialText, {
6057
6149
  partialEndedAtMs: currentSession.currentTurn.partialEndedAt,
@@ -24811,6 +24903,7 @@ var createTwilioMediaStreamBridge = (socket, options) => {
24811
24903
  ...options.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: options.bargeInMinPartialWords } : {},
24812
24904
  ...options.fillerFor ? { fillerFor: options.fillerFor } : {},
24813
24905
  ...options.fillerForTimeoutMs !== undefined ? { fillerForTimeoutMs: options.fillerForTimeoutMs } : {},
24906
+ ...options.backchannel ? { backchannel: options.backchannel } : {},
24814
24907
  ...options.defaultSilentTurnAck !== undefined ? { defaultSilentTurnAck: options.defaultSilentTurnAck } : {},
24815
24908
  ...options.routeOnTurnTimeoutMs !== undefined ? { routeOnTurnTimeoutMs: options.routeOnTurnTimeoutMs } : {},
24816
24909
  trace: options.trace,
@@ -39177,6 +39270,7 @@ var voice = (config) => {
39177
39270
  ...config.fillerDelayMs !== undefined ? { fillerDelayMs: config.fillerDelayMs } : {},
39178
39271
  ...config.fillerFor ? { fillerFor: config.fillerFor } : {},
39179
39272
  ...config.fillerForTimeoutMs !== undefined ? { fillerForTimeoutMs: config.fillerForTimeoutMs } : {},
39273
+ ...config.backchannel ? { backchannel: config.backchannel } : {},
39180
39274
  ...config.defaultSilentTurnAck !== undefined ? { defaultSilentTurnAck: config.defaultSilentTurnAck } : {},
39181
39275
  ...config.routeOnTurnTimeoutMs !== undefined ? { routeOnTurnTimeoutMs: config.routeOnTurnTimeoutMs } : {},
39182
39276
  tts: config.tts,
@@ -41569,70 +41663,6 @@ var summarizeVoiceCampaignDispositions = (record) => {
41569
41663
  totalRecipients: record.recipients.length
41570
41664
  };
41571
41665
  };
41572
- // src/core/backchannel.ts
41573
- var DEFAULT_CUES = [
41574
- { text: "mm-hmm" },
41575
- { text: "I see" },
41576
- { text: "right" },
41577
- { text: "go on" }
41578
- ];
41579
- var createVoiceBackchannelDriver = (options) => {
41580
- const cues = options.cues ?? DEFAULT_CUES;
41581
- const minSpeechMs = options.minSpeechMs ?? 2500;
41582
- const cueIntervalMs = options.cueIntervalMs ?? 2500;
41583
- const cueIndexFn = options.cueIndex ?? ((index) => index % Math.max(cues.length, 1));
41584
- let speechStartedAt;
41585
- let lastCueAt;
41586
- let cueCount = 0;
41587
- let firing = false;
41588
- const tryFire = async (now) => {
41589
- if (firing || cues.length === 0) {
41590
- return;
41591
- }
41592
- if (speechStartedAt === undefined) {
41593
- return;
41594
- }
41595
- const elapsed = now - speechStartedAt;
41596
- if (elapsed < minSpeechMs) {
41597
- return;
41598
- }
41599
- if (lastCueAt !== undefined && now - lastCueAt < cueIntervalMs) {
41600
- return;
41601
- }
41602
- const cue = cues[cueIndexFn(cueCount)];
41603
- if (!cue) {
41604
- return;
41605
- }
41606
- firing = true;
41607
- try {
41608
- await options.onCue(cue);
41609
- } finally {
41610
- firing = false;
41611
- lastCueAt = now;
41612
- cueCount += 1;
41613
- }
41614
- };
41615
- return {
41616
- noteSilence: (timestampMs) => {
41617
- const now = timestampMs ?? Date.now();
41618
- if (lastCueAt !== undefined && now - lastCueAt > cueIntervalMs * 2) {
41619
- speechStartedAt = undefined;
41620
- }
41621
- },
41622
- noteSpeech: (timestampMs) => {
41623
- const now = timestampMs ?? Date.now();
41624
- if (speechStartedAt === undefined) {
41625
- speechStartedAt = now;
41626
- }
41627
- tryFire(now);
41628
- },
41629
- reset: () => {
41630
- speechStartedAt = undefined;
41631
- lastCueAt = undefined;
41632
- cueCount = 0;
41633
- }
41634
- };
41635
- };
41636
41666
  // src/core/oauth2TokenSource.ts
41637
41667
  var createVoiceOAuth2TokenSource = (options) => {
41638
41668
  const fetchImpl = options.fetch ?? globalThis.fetch.bind(globalThis);
@@ -164,6 +164,9 @@ export type TwilioMediaStreamBridgeOptions<TContext = unknown, TSession extends
164
164
  }) => Promise<string | null>;
165
165
  /** Cap on the `fillerFor` race before falling back to a static phrase. Default 600ms. */
166
166
  fillerForTimeoutMs?: number;
167
+ /** Backchannel cues played while the caller is mid-turn so they feel heard.
168
+ * Non-turn TTS path (no barge-in interaction). Off unless `enabled`. */
169
+ backchannel?: import("../core/backchannel").VoiceBackchannelConfig;
167
170
  /**
168
171
  * Default spoken ack if the model returns ONLY tool calls (no text) and
169
172
  * the turn isn't ending. Without this, the caller hears silence and
@@ -1577,12 +1577,165 @@ var buildSessionCorrectionAudit = (raw, generic, experimental, benchmarkSeeded,
1577
1577
  }
1578
1578
  };
1579
1579
  };
1580
+ // src/client/timeStretch.ts
1581
// Grain timing in milliseconds; converted to sample counts per stream in `init`.
var HOP_MS = 10;
var SEEK_MS = 5;
// Keeps the correlation denominator non-zero when a candidate window is silent.
var ENERGY_EPSILON = 0.000001;
var HALF = 0.5;
var MS_PER_SECOND = 1000;

/**
 * Builds a Hann window of `length` samples using the periodic form
 * (the cosine argument is divided by `length`, not `length - 1`), so two
 * windows offset by half their length sum to 1.
 *
 * @param {number} length Window size in samples.
 * @returns {Float32Array} Window weights.
 */
var makeHann = (length) => {
  const weights = new Float32Array(length);
  for (let index = 0; index < length; index += 1) {
    weights[index] = HALF - HALF * Math.cos(2 * Math.PI * index / length);
  }
  return weights;
};

/**
 * Scores how well `length` samples of `base`, starting at `start`, match `ref`.
 * The dot product is normalised by the candidate window's energy only.
 * Samples outside either array are read as 0.
 *
 * @param {Float32Array} base Signal being searched.
 * @param {number} start Index in `base` where the candidate window begins.
 * @param {Float32Array} ref Reference window to match.
 * @param {number} length Number of samples to compare.
 * @returns {number} Similarity score; larger means a closer match.
 */
var correlationScore = (base, start, ref, length) => {
  let dot = 0;
  let energy = 0;
  for (let index = 0; index < length; index += 1) {
    const sample = base[start + index] ?? 0;
    dot += sample * (ref[index] ?? 0);
    energy += sample * sample;
  }
  return dot / Math.sqrt(energy + ENERGY_EPSILON);
};

/**
 * Windows one grain of `2 * hop` samples taken from `src` at `off`.
 * The first half is added to the previous grain's tail and returned as `out`;
 * the second half is returned as `nextTail` for the following grain.
 *
 * @param {Float32Array} src Buffered input for one channel.
 * @param {number} off Grain start index within `src`.
 * @param {Float32Array} tail Windowed second half of the previous grain.
 * @param {Float32Array} weights Window of at least `2 * hop` samples.
 * @param {number} hop Output samples produced per grain.
 * @returns {{ nextTail: Float32Array, out: Float32Array }}
 */
var overlapAddGrain = (src, off, tail, weights, hop) => {
  const out = new Float32Array(hop);
  const nextTail = new Float32Array(hop);
  for (let index = 0; index < hop; index += 1) {
    out[index] = (tail[index] ?? 0) + (src[off + index] ?? 0) * (weights[index] ?? 0);
    nextTail[index] = (src[off + hop + index] ?? 0) * (weights[hop + index] ?? 0);
  }
  return { nextTail, out };
};

/**
 * Creates a streaming overlap-add time stretcher.
 *
 * Each emitted grain yields `hop` output samples while the read position in
 * the input advances by `hop * speed`, so `speed > 1` shortens the audio and
 * `speed < 1` lengthens it. Before each grain the read position is nudged by
 * up to `seek` samples towards the best match with the natural continuation
 * of the previous grain (measured on channel 0 and applied to all channels).
 *
 * There is no flush: input that has not yet filled a whole grain stays
 * buffered until a later `process` call supplies more, or `reset` discards it.
 *
 * @returns {{
 *   process: (input: Float32Array[], speed: number, rate: number) => Float32Array[],
 *   reset: () => void
 * }}
 */
var createTimeStretcher = () => {
  let sampleRate = 0;
  let channelCount = 0;
  let hop = 0;
  let frameLen = 0;
  let seek = 0;
  let weights = new Float32Array(0);
  let buffers = [];
  // Absolute stream index of buffers[*][0]; grows as consumed input is dropped.
  let inputStart = 0;
  // Absolute (possibly fractional) stream index of the next grain's nominal start.
  let analysisPos = 0;
  let olaTail = [];
  let naturalRef = null;

  // (Re)derives every size from the stream format and clears all state.
  const init = (rate, channels) => {
    sampleRate = rate;
    channelCount = channels;
    hop = Math.max(1, Math.round(sampleRate * HOP_MS / MS_PER_SECOND));
    frameLen = hop * 2;
    seek = Math.max(1, Math.round(sampleRate * SEEK_MS / MS_PER_SECOND));
    weights = makeHann(frameLen);
    buffers = Array.from({ length: channels }, () => new Float32Array(0));
    olaTail = Array.from({ length: channels }, () => new Float32Array(hop));
    inputStart = 0;
    analysisPos = seek;
    naturalRef = null;
  };

  /** Discards buffered input and overlap state while keeping the current format. */
  const reset = () => {
    buffers = buffers.map(() => new Float32Array(0));
    olaTail = olaTail.map(() => new Float32Array(hop));
    inputStart = 0;
    analysisPos = seek;
    naturalRef = null;
  };

  // Appends the new samples per channel; a missing channel falls back to channel 0.
  const append = (input) => {
    for (let channel = 0; channel < channelCount; channel += 1) {
      const incoming = input[channel] ?? input[0] ?? new Float32Array(0);
      const existing = buffers[channel] ?? new Float32Array(0);
      const merged = new Float32Array(existing.length + incoming.length);
      merged.set(existing, 0);
      merged.set(incoming, existing.length);
      buffers[channel] = merged;
    }
  };

  const inputEnd = () => inputStart + (buffers[0]?.length ?? 0);

  // Drops input that lies before anything the next grain's seek window can reach.
  const compact = () => {
    const keepFrom = Math.max(inputStart, Math.floor(analysisPos) - seek - 1);
    if (keepFrom <= inputStart) {
      return;
    }
    const drop = keepFrom - inputStart;
    for (let channel = 0; channel < channelCount; channel += 1) {
      buffers[channel] = (buffers[channel] ?? new Float32Array(0)).slice(drop);
    }
    inputStart = keepFrom;
  };

  // Returns the shift in [-seek, seek] whose window best matches `naturalRef`.
  // Ties keep the earliest candidate; with no reference yet the shift is 0.
  const bestOffset = (center) => {
    if (!naturalRef) {
      return 0;
    }
    const [base] = buffers;
    if (!base) {
      return 0;
    }
    let bestDelta = 0;
    let bestScore = -Infinity;
    for (let delta = -seek; delta <= seek; delta += 1) {
      const score = correlationScore(base, center + delta - inputStart, naturalRef, frameLen);
      if (score <= bestScore) {
        continue;
      }
      bestScore = score;
      bestDelta = delta;
    }
    return bestDelta;
  };

  /**
   * Feeds one chunk of planar audio and returns the stretched samples that
   * are ready, one Float32Array per channel (possibly empty).
   *
   * @param {Float32Array[]} input One array per channel.
   * @param {number} speed Playback speed factor; must be finite and > 0.
   * @param {number} rate Sample rate in Hz; a change re-initialises the stretcher.
   * @returns {Float32Array[]} Output samples per channel, a multiple of `hop` long.
   * @throws {RangeError} When `speed` is not a finite number greater than 0.
   */
  const process2 = (input, speed, rate) => {
    // Speed 0 would never advance analysisPos (endless loop below); NaN or a
    // negative speed would corrupt it and stall the stream until reset.
    if (!Number.isFinite(speed) || speed <= 0) {
      throw new RangeError(`time-stretch speed must be a finite number greater than 0, received ${speed}`);
    }
    const channels = Math.max(1, input.length);
    if (sampleRate !== rate || channelCount !== channels) {
      init(rate, channels);
    }
    append(input);
    const analysisHop = hop * speed;
    const segments = Array.from({ length: channelCount }, () => []);

    const emitGrain = (pos) => {
      const off = pos - inputStart;
      for (let channel = 0; channel < channelCount; channel += 1) {
        const src = buffers[channel];
        const tail = olaTail[channel];
        if (!src || !tail) {
          continue;
        }
        const grain = overlapAddGrain(src, off, tail, weights, hop);
        olaTail[channel] = grain.nextTail;
        segments[channel]?.push(grain.out);
      }
    };

    // Remembers what would naturally follow this grain (one hop later) so the
    // next grain can be aligned to it.
    const captureRef = (pos) => {
      const ref = new Float32Array(frameLen);
      const refOff = pos + hop - inputStart;
      const [base] = buffers;
      if (base) {
        ref.set(base.subarray(refOff, refOff + frameLen));
      }
      naturalRef = ref;
    };

    // Uses the same rounding as the grain placement below so that the furthest
    // read (reference capture at center + seek) always stays inside the buffer.
    const canEmit = () => {
      const center = Math.round(analysisPos);
      return center - seek >= inputStart && center + seek + frameLen + hop <= inputEnd();
    };

    while (canEmit()) {
      const center = Math.round(analysisPos);
      const pos = center + bestOffset(center);
      emitGrain(pos);
      captureRef(pos);
      analysisPos += analysisHop;
    }
    compact();

    return segments.map((channelSegments) => {
      const total = channelSegments.reduce((sum, seg) => sum + seg.length, 0);
      const merged = new Float32Array(total);
      let offset = 0;
      for (const seg of channelSegments) {
        merged.set(seg, offset);
        offset += seg.length;
      }
      return merged;
    });
  };

  return { process: process2, reset };
};
1731
+
1580
1732
  // src/client/audioPlayer.ts
1581
1733
  var DEFAULT_LOOKAHEAD_MS = 15;
1582
1734
  var DEFAULT_VOLUME = 1;
1583
1735
  var DEFAULT_PLAYBACK_RATE = 1;
1584
1736
  var MIN_PLAYBACK_RATE = 0.5;
1585
1737
  var MAX_PLAYBACK_RATE = 2;
1738
+ var STRETCH_BYPASS_EPSILON = 0.01;
1586
1739
  var createInitialState = () => ({
1587
1740
  activeSourceCount: 0,
1588
1741
  error: null,
@@ -1645,6 +1798,7 @@ var createVoiceAudioPlayer = (source, options = {}) => {
1645
1798
  let outputNode = null;
1646
1799
  let volume = clampVolume(options.volume);
1647
1800
  let playbackRate = clampPlaybackRate(options.playbackRate);
1801
+ let stretcher = null;
1648
1802
  let queueEndTime = 0;
1649
1803
  let syncPromise = Promise.resolve();
1650
1804
  let interruptStartedAt = null;
@@ -1677,6 +1831,7 @@ var createVoiceAudioPlayer = (source, options = {}) => {
1677
1831
  const resolveInterrupt = (latencyMs) => {
1678
1832
  clearInterruptTimer();
1679
1833
  interruptStartedAt = null;
1834
+ stretcher?.reset();
1680
1835
  setState({
1681
1836
  activeSourceCount: sourceNodes.size,
1682
1837
  isPlaying: false,
@@ -1741,13 +1896,11 @@ var createVoiceAudioPlayer = (source, options = {}) => {
1741
1896
  queueEndTime = audioContext.currentTime;
1742
1897
  return audioContext;
1743
1898
  };
1744
- const scheduleChunk = async (chunk) => {
1745
- const context = await ensureAudioContext();
1746
- const buffer = decodePCM16LEChunk(context, chunk);
1899
+ const scheduleBuffer = (context, buffer, rate) => {
1747
1900
  const node = context.createBufferSource();
1748
1901
  node.buffer = buffer;
1749
1902
  if (node.playbackRate) {
1750
- node.playbackRate.value = playbackRate;
1903
+ node.playbackRate.value = rate;
1751
1904
  }
1752
1905
  node.connect(outputNode ?? context.destination);
1753
1906
  node.onended = () => {
@@ -1760,7 +1913,7 @@ var createVoiceAudioPlayer = (source, options = {}) => {
1760
1913
  maybeResolveInterrupt();
1761
1914
  };
1762
1915
  const startAt = Math.max(context.currentTime + lookaheadSeconds, queueEndTime);
1763
- queueEndTime = startAt + buffer.duration / playbackRate;
1916
+ queueEndTime = startAt + buffer.duration / rate;
1764
1917
  sourceNodes.add(node);
1765
1918
  setState({
1766
1919
  activeSourceCount: sourceNodes.size,
@@ -1768,6 +1921,34 @@ var createVoiceAudioPlayer = (source, options = {}) => {
1768
1921
  });
1769
1922
  node.start(startAt);
1770
1923
  };
1924
/**
 * Decodes one PCM chunk and queues it for playback.
 *
 * When the playback rate is within STRETCH_BYPASS_EPSILON of 1 the decoded
 * buffer is scheduled directly at that rate. Otherwise the samples go through
 * the time stretcher and the result is scheduled at rate 1. Nothing is
 * scheduled when the stretcher has no output ready yet.
 */
const scheduleChunk = async (chunk) => {
  const context = await ensureAudioContext();
  const decoded = decodePCM16LEChunk(context, chunk);
  const rate = playbackRate;
  const needsStretch = Math.abs(rate - 1) > STRETCH_BYPASS_EPSILON;
  if (!needsStretch) {
    // Clear any audio the stretcher buffered while a non-unit rate was active.
    stretcher?.reset();
    scheduleBuffer(context, decoded, rate);
    return;
  }
  const channels = Math.max(1, chunk.format.channels);
  const planes = [];
  for (let index = 0; index < channels; index += 1) {
    planes.push(decoded.getChannelData(index));
  }
  stretcher ??= createTimeStretcher();
  const stretched = stretcher.process(planes, rate, chunk.format.sampleRateHz);
  const frames = stretched[0]?.length ?? 0;
  if (frames === 0) {
    return;
  }
  const target = context.createBuffer(channels, frames, chunk.format.sampleRateHz);
  for (let index = 0; index < channels; index += 1) {
    const samples = stretched[index];
    if (samples) {
      target.getChannelData(index).set(samples);
    }
  }
  scheduleBuffer(context, target, 1);
};
1771
1952
  const stopQueuedPlayback = (options2) => {
1772
1953
  for (const node of [...sourceNodes]) {
1773
1954
  node.stop?.();
@@ -5130,6 +5311,71 @@ var createVoiceMemoryStore = () => {
5130
5311
  // src/core/session.ts
5131
5312
  import { Buffer as Buffer2 } from "buffer";
5132
5313
 
5314
+ // src/core/backchannel.ts
5315
// Cues used when the caller of createVoiceBackchannelDriver supplies none.
var DEFAULT_CUES = [
  { text: "mm-hmm" },
  { text: "I see" },
  { text: "right" },
  { text: "go on" }
];

/**
 * Creates a driver that decides when to fire a backchannel cue based on
 * speech and silence notifications.
 *
 * A cue fires from `noteSpeech` once speech has lasted at least
 * `minSpeechMs`, at least `cueIntervalMs` has passed since the previous cue,
 * and no cue is still being delivered.
 *
 * @param {object} options
 * @param {(cue: { text: string }) => unknown} options.onCue Delivers a cue; may be async.
 * @param {{ text: string }[]} [options.cues] Cue list; defaults to DEFAULT_CUES.
 * @param {number} [options.minSpeechMs] Minimum speech duration before the first cue (default 2500).
 * @param {number} [options.cueIntervalMs] Minimum spacing between cues (default 2500).
 * @param {(count: number) => number} [options.cueIndex] Maps the fired-cue count to an index in `cues`;
 *   defaults to cycling through the list.
 * @param {(error: unknown) => void} [options.onError] Receives errors thrown or rejected by `onCue`.
 *   Without it such errors are dropped: cues are fire-and-forget, so there is no caller to rethrow to.
 * @returns {{
 *   noteSilence: (timestampMs?: number) => void,
 *   noteSpeech: (timestampMs?: number) => void,
 *   reset: () => void
 * }}
 */
var createVoiceBackchannelDriver = (options) => {
  const cues = options.cues ?? DEFAULT_CUES;
  const minSpeechMs = options.minSpeechMs ?? 2500;
  const cueIntervalMs = options.cueIntervalMs ?? 2500;
  const cueIndexFn = options.cueIndex ?? ((index) => index % Math.max(cues.length, 1));
  let speechStartedAt;
  let lastCueAt;
  let cueCount = 0;
  let firing = false;
  // Bumped by reset() so a cue that was in flight cannot write stale state afterwards.
  let generation = 0;

  const reportCueError = (error) => {
    try {
      options.onError?.(error);
    } catch {
      // A throwing error hook must not turn into an unhandled rejection either.
    }
  };

  const tryFire = async (now) => {
    if (firing || cues.length === 0) {
      return;
    }
    if (speechStartedAt === undefined) {
      return;
    }
    const elapsed = now - speechStartedAt;
    if (elapsed < minSpeechMs) {
      return;
    }
    if (lastCueAt !== undefined && now - lastCueAt < cueIntervalMs) {
      return;
    }
    const cue = cues[cueIndexFn(cueCount)];
    if (!cue) {
      return;
    }
    const firedGeneration = generation;
    firing = true;
    try {
      await options.onCue(cue);
    } catch (error) {
      // noteSpeech does not await this function, so an escaping error would
      // surface as an unhandled promise rejection.
      reportCueError(error);
    } finally {
      firing = false;
      // Failed cues still count towards spacing and rotation, but only if the
      // driver was not reset while the cue was being delivered.
      if (firedGeneration === generation) {
        lastCueAt = now;
        cueCount += 1;
      }
    }
  };

  return {
    /**
     * Clears the speech-start marker once more than two cue intervals have
     * passed since the last cue. Does nothing before the first cue has fired.
     */
    noteSilence: (timestampMs) => {
      const now = timestampMs ?? Date.now();
      if (lastCueAt !== undefined && now - lastCueAt > cueIntervalMs * 2) {
        speechStartedAt = undefined;
      }
    },
    /** Records speech at `timestampMs` (default: now) and fires a cue if one is due. */
    noteSpeech: (timestampMs) => {
      const now = timestampMs ?? Date.now();
      if (speechStartedAt === undefined) {
        speechStartedAt = now;
      }
      // Deliberately not awaited; tryFire handles its own errors.
      void tryFire(now);
    },
    /** Forgets speech timing, cue spacing and cue rotation. */
    reset: () => {
      generation += 1;
      speechStartedAt = undefined;
      lastCueAt = undefined;
      cueCount = 0;
    }
  };
};
5378
+
5133
5379
  // src/core/handoff.ts
5134
5380
  var toHex = (bytes) => Array.from(bytes, (byte) => byte.toString(16).padStart(2, "0")).join("");
5135
5381
  var signHandoffBody = async (input) => {
@@ -7152,6 +7398,30 @@ var createVoiceSession = (options) => {
7152
7398
  });
7153
7399
  });
7154
7400
  };
7401
/**
 * Queues a backchannel cue on the serial queue.
 *
 * Does nothing without text or a TTS adapter, or while a TTS turn or a filler
 * is active. The busy check is repeated inside the queued task because the
 * state may have changed by the time the task runs. Send failures are ignored.
 */
const emitBackchannelCue = (text) => {
  const isTTSBusy = () => activeTTSTurnId !== undefined || fillerActive;
  if (!text || !options.tts) {
    return;
  }
  if (isTTSBusy()) {
    return;
  }
  runSerial("backchannel.send", async () => {
    if (isTTSBusy()) {
      return;
    }
    const ttsSession = await ensureTTSSession();
    if (ttsSession) {
      try {
        await ttsSession.send(text);
      } catch {}
    }
  });
};
7417
// Backchannel driver for this session, or null unless `backchannel.enabled`
// is set and a TTS adapter is configured. Only options that were actually
// provided are forwarded, so the driver's own defaults apply to the rest.
const backchannelDriver = (() => {
  const config = options.backchannel;
  if (!config?.enabled || !options.tts) {
    return null;
  }
  const driverOptions = {};
  if (config.cueIntervalMs !== undefined) {
    driverOptions.cueIntervalMs = config.cueIntervalMs;
  }
  if (config.cues) {
    // Keep only non-blank strings and wrap them in the driver's cue shape.
    const usable = config.cues.filter((cue) => typeof cue === "string" && cue.trim().length > 0);
    driverOptions.cues = usable.map((cue) => ({ text: cue }));
  }
  if (config.minSpeechMs !== undefined) {
    driverOptions.minSpeechMs = config.minSpeechMs;
  }
  driverOptions.onCue = (cue) => emitBackchannelCue(cue.text);
  return createVoiceBackchannelDriver(driverOptions);
})();
7155
7425
  const createTurnTTSStreamer = (turn, session) => {
7156
7426
  let buffer = "";
7157
7427
  let full = "";
@@ -7643,6 +7913,7 @@ var createVoiceSession = (options) => {
7643
7913
  };
7644
7914
  const commitTurnInternal = async (reason = "manual") => {
7645
7915
  clearSilenceTimer();
7916
+ backchannelDriver?.reset();
7646
7917
  amdLastTurnCommitAt = Date.now();
7647
7918
  const session = await readSession();
7648
7919
  if (session.status === "completed" || session.status === "failed") {
@@ -7986,7 +8257,9 @@ var createVoiceSession = (options) => {
7986
8257
  speechDetected = true;
7987
8258
  clearSilenceTimer();
7988
8259
  kickCallSilenceWatchdog();
8260
+ backchannelDriver?.noteSpeech();
7989
8261
  } else if (speechDetected) {
8262
+ backchannelDriver?.noteSilence();
7990
8263
  const currentSession = await readSession();
7991
8264
  const hasTurnText = Boolean(buildTurnText(currentSession.currentTurn.transcripts, currentSession.currentTurn.partialText, {
7992
8265
  partialEndedAtMs: currentSession.currentTurn.partialEndedAt,
@@ -13465,6 +13738,7 @@ var createTwilioMediaStreamBridge = (socket, options) => {
13465
13738
  ...options.bargeInMinPartialWords !== undefined ? { bargeInMinPartialWords: options.bargeInMinPartialWords } : {},
13466
13739
  ...options.fillerFor ? { fillerFor: options.fillerFor } : {},
13467
13740
  ...options.fillerForTimeoutMs !== undefined ? { fillerForTimeoutMs: options.fillerForTimeoutMs } : {},
13741
+ ...options.backchannel ? { backchannel: options.backchannel } : {},
13468
13742
  ...options.defaultSilentTurnAck !== undefined ? { defaultSilentTurnAck: options.defaultSilentTurnAck } : {},
13469
13743
  ...options.routeOnTurnTimeoutMs !== undefined ? { routeOnTurnTimeoutMs: options.routeOnTurnTimeoutMs } : {},
13470
13744
  trace: options.trace,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@absolutejs/voice",
3
- "version": "0.0.22-beta.577",
3
+ "version": "0.0.22-beta.579",
4
4
  "description": "Voice primitives and Elysia plugin for AbsoluteJS",
5
5
  "repository": {
6
6
  "type": "git",