@layercode/js-sdk 1.0.21 → 1.0.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -510,6 +510,24 @@ class WavStreamPlayer {
     this.isPlaying = false;
   }

+  /**
+   * Clears interrupted track IDs to prevent memory leaks
+   * @param {string[]} [keepTrackIds] - Track IDs to keep in the interrupted list
+   */
+  clearInterruptedTracks(keepTrackIds = []) {
+    if (keepTrackIds.length === 0) {
+      this.interruptedTrackIds = {};
+    } else {
+      const newInterruptedTracks = {};
+      for (const trackId of keepTrackIds) {
+        if (this.interruptedTrackIds[trackId]) {
+          newInterruptedTracks[trackId] = true;
+        }
+      }
+      this.interruptedTrackIds = newInterruptedTracks;
+    }
+  }
+
   /**
    * Connects the audio context and enables output to speakers
    * @returns {Promise<true>}
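The new clearInterruptedTracks method prunes the interruptedTrackIds map so entries from long-finished turns stop accumulating over a session. A minimal usage sketch (the player instance and turn IDs below are illustrative, not taken from SDK docs):

    // Suppose two turns have been interrupted so far:
    // player.interruptedTrackIds === { 'turn-1': true, 'turn-2': true }
    player.clearInterruptedTracks(['turn-2']); // keep only the still-live turn
    player.clearInterruptedTracks();           // or wipe everything, e.g. on reconnect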
@@ -743,7 +761,7 @@ class WavStreamPlayer {
       this.analyser.disconnect();
     }

-    if (this.context) {
+    if (this.context && this.context.state !== 'closed') {
       this.context.close().catch((err) => console.error("Error closing audio context:", err));
     }

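The added state check avoids calling close() on an AudioContext that is already closed, which rejects with an InvalidStateError in browsers. The same guard in isolation (a sketch, not SDK code):

    // Closing is only valid while the context is 'running' or 'suspended'.
    function safeCloseContext(context) {
      if (context && context.state !== 'closed') {
        context.close().catch((err) => console.error('Error closing audio context:', err));
      }
    }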
@@ -3498,17 +3516,59 @@ class LayercodeClient {
     this.agentAudioAmplitude = 0;
     this.sessionId = options.sessionId || null;
     this.pushToTalkActive = false;
-    this.vadPausedPlayer = false;
     this.pushToTalkEnabled = false;
     this.canInterrupt = false;
     this.userIsSpeaking = false;
     this.endUserTurn = false;
     this.recorderStarted = false;
     this.readySent = false;
+    this.currentTurnId = null;
+    this.audioBuffer = [];
+    this.audioPauseTime = null;
     // Bind event handlers
     this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
     this._handleDataAvailable = this._handleDataAvailable.bind(this);
   }
+  _setupAmplitudeBasedVAD() {
+    let isSpeakingByAmplitude = false;
+    let silenceFrames = 0;
+    const AMPLITUDE_THRESHOLD = 0.01; // Adjust based on testing
+    const SILENCE_FRAMES_THRESHOLD = 30; // ~600ms at 20ms chunks
+    // Monitor amplitude changes
+    this.wavRecorder.startAmplitudeMonitoring((amplitude) => {
+      const wasSpeaking = isSpeakingByAmplitude;
+      if (amplitude > AMPLITUDE_THRESHOLD) {
+        silenceFrames = 0;
+        if (!wasSpeaking) {
+          // Speech started - pause audio if playing and track timing for interruption calculation
+          if (this.canInterrupt && this.wavPlayer.isPlaying) {
+            this.audioPauseTime = Date.now();
+            this.wavPlayer.pause();
+          }
+          isSpeakingByAmplitude = true;
+          this.userIsSpeaking = true;
+          this.options.onUserIsSpeakingChange(true);
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_start',
+          });
+        }
+      }
+      else {
+        silenceFrames++;
+        if (wasSpeaking && silenceFrames >= SILENCE_FRAMES_THRESHOLD) {
+          // Speech ended
+          isSpeakingByAmplitude = false;
+          this.userIsSpeaking = false;
+          this.options.onUserIsSpeakingChange(false);
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_end',
+          });
+        }
+      }
+    });
+  }
   _initializeVAD() {
     console.log('initializing VAD', { pushToTalkEnabled: this.pushToTalkEnabled, canInterrupt: this.canInterrupt });
     // If we're in push to talk mode, we don't need to use the VAD model
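The fallback replaces the Silero model with a plain energy gate: speech begins when amplitude crosses 0.01 and ends after 30 consecutive quiet frames (~600 ms at 20 ms per frame). startAmplitudeMonitoring is the recorder hook the diff relies on; the hysteresis logic on its own looks roughly like this (a standalone sketch, with the per-frame amplitude feed assumed):

    // onFrame(amplitude) is assumed to be called about every 20 ms.
    function makeAmplitudeGate(onStart, onEnd, threshold = 0.01, silenceLimit = 30) {
      let speaking = false;
      let quiet = 0;
      return function onFrame(amplitude) {
        if (amplitude > threshold) {
          quiet = 0;
          if (!speaking) { speaking = true; onStart(); }
        } else {
          quiet++;
          if (speaking && quiet >= silenceLimit) { speaking = false; onEnd(); }
        }
      };
    }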
@@ -3517,9 +3577,17 @@ class LayercodeClient {
     }
     const timeout = setTimeout(() => {
       console.log('silero vad model timeout');
-
-
-      this.
+      console.warn('VAD model failed to load - falling back to amplitude-based detection');
+      // Send a message to server indicating VAD failure
+      this._wsSend({
+        type: 'vad_events',
+        event: 'vad_model_failed',
+      });
+      // In automatic mode without VAD, allow the bot to speak initially
+      this.userIsSpeaking = false;
+      this.options.onUserIsSpeakingChange(false);
+      // Set up amplitude-based fallback detection
+      this._setupAmplitudeBasedVAD();
     }, 2000);
     if (!this.canInterrupt) {
       dist.MicVAD.new({
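The two-second timer turns a hung model download into a graceful degrade: tell the server the model failed, assume the user is not speaking, and switch to the amplitude gate. The success path presumably clears the timer outside this hunk. The race-against-a-deadline pattern, reduced to a sketch with hypothetical loadModel/useModel/fallback callbacks:

    function loadWithFallback(loadModel, useModel, fallback, ms = 2000) {
      let timedOut = false;
      const timer = setTimeout(() => { timedOut = true; fallback(); }, ms);
      loadModel()
        .then((model) => { clearTimeout(timer); if (!timedOut) useModel(model); })
        .catch(() => { clearTimeout(timer); if (!timedOut) fallback(); });
    }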
@@ -3528,20 +3596,30 @@ class LayercodeClient {
         positiveSpeechThreshold: 0.3,
         negativeSpeechThreshold: 0.2,
         redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
-        minSpeechFrames:
+        minSpeechFrames: 0,
         preSpeechPadFrames: 0,
         onSpeechStart: () => {
-
-
-
-
+          this.userIsSpeaking = true;
+          this.options.onUserIsSpeakingChange(true);
+          console.log('onSpeechStart: sending vad_start');
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_start',
+          });
         },
-
+        onSpeechEnd: () => {
+          console.log('onSpeechEnd: sending vad_end');
+          this.endUserTurn = true; // Set flag to indicate that the user turn has ended
+          this.audioBuffer = []; // Clear buffer on speech end
           this.userIsSpeaking = false;
           this.options.onUserIsSpeakingChange(false);
-
-
-          this.
+          console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
+          // Send vad_end immediately instead of waiting for next audio chunk
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_end',
+          });
+          this.endUserTurn = false; // Reset the flag after sending vad_end
         },
       })
       .then((vad) => {
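Worth noting: the two MicVAD configurations now diverge on minSpeechFrames. This non-interruptible path uses 0, so onVADMisfire can never fire and any detected speech counts; the interruptible path in the next hunk uses 5, so sub-threshold blips are routed to onVADMisfire instead of interrupting playback. Side by side (option names as they appear in the diff; semantics assumed from the VAD library's conventions):

    const common = { positiveSpeechThreshold: 0.3, negativeSpeechThreshold: 0.2, redemptionFrames: 25, preSpeechPadFrames: 0 };
    const nonInterruptible = { ...common, minSpeechFrames: 0 }; // every detection is speech; no misfires
    const interruptible   = { ...common, minSpeechFrames: 5 }; // <5 frames => onVADMisfire, playback resumes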
@@ -3563,41 +3641,59 @@ class LayercodeClient {
         positiveSpeechThreshold: 0.3,
         negativeSpeechThreshold: 0.2,
         redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
-        minSpeechFrames:
+        minSpeechFrames: 5,
         preSpeechPadFrames: 0,
         onSpeechStart: () => {
           // Only pause agent audio if it's currently playing
           if (this.wavPlayer.isPlaying) {
             console.log('onSpeechStart: WavPlayer is playing, pausing it.');
+            this.audioPauseTime = Date.now(); // Track when we paused
             this.wavPlayer.pause();
-            this.vadPausedPlayer = true; // VAD is responsible for this pause
           }
           else {
             console.log('onSpeechStart: WavPlayer is not playing, VAD will not pause.');
           }
-          this.userIsSpeaking = true;
-          this.options.onUserIsSpeakingChange(true);
           console.log('onSpeechStart: sending vad_start');
           this._wsSend({
             type: 'vad_events',
             event: 'vad_start',
           });
+          this.userIsSpeaking = true;
+          this.options.onUserIsSpeakingChange(true);
+          this.endUserTurn = false; // Reset endUserTurn when speech starts
+          console.log('onSpeechStart: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
         },
         onVADMisfire: () => {
           // If the speech detected was for less than minSpeechFrames, this is called instead of onSpeechEnd, and we should resume the assistant audio as it was a false interruption. We include a configurable delay so the assistant isn't too quick to start speaking again.
           this.userIsSpeaking = false;
+          this.audioBuffer = []; // Clear buffer on misfire
           this.options.onUserIsSpeakingChange(false);
-
-
-          this.wavPlayer.
-
-
-
-
-
+          // Add the missing delay before resuming to prevent race conditions
+          setTimeout(() => {
+            if (!this.wavPlayer.isPlaying) {
+              console.log('onVADMisfire: Resuming after delay');
+              this.audioPauseTime = null; // Clear pause time since we're resuming
+              this.wavPlayer.play();
+            }
+            else {
+              console.log('onVADMisfire: Not resuming - either no pause or user speaking again');
+              this.endUserTurn = true;
+            }
+          }, this.options.vadResumeDelay);
         },
         onSpeechEnd: () => {
-
+          console.log('onSpeechEnd: sending vad_end');
+          this.endUserTurn = true; // Set flag to indicate that the user turn has ended
+          this.audioBuffer = []; // Clear buffer on speech end
+          this.userIsSpeaking = false;
+          this.options.onUserIsSpeakingChange(false);
+          console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
+          // Send vad_end immediately instead of waiting for next audio chunk
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_end',
+          });
+          this.endUserTurn = false; // Reset the flag after sending vad_end
         },
       })
       .then((vad) => {
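onVADMisfire now resumes playback only after this.options.vadResumeDelay elapses, and only if nothing else has restarted playback in the meantime, so the agent doesn't snap back the instant a cough ends. The check-then-resume shape in isolation (player and state are stand-ins):

    function resumeAfterMisfire(player, state, delayMs) {
      setTimeout(() => {
        if (!player.isPlaying) {
          state.audioPauseTime = null; // the pause no longer counts toward any interruption offset
          player.play();
        }
        // If playback already restarted (or the user is speaking again), leave it alone.
      }, delayMs);
    }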
@@ -3632,13 +3728,36 @@ class LayercodeClient {
       });
   }
   async _clientInterruptAssistantReplay() {
-    await this.wavPlayer.interrupt();
-
-
-
-
-
-
+    const offsetData = await this.wavPlayer.interrupt();
+    if (offsetData && this.currentTurnId) {
+      let offsetMs = offsetData.currentTime * 1000;
+      // Calculate accurate offset by subtracting pause time if audio was paused for VAD
+      if (this.audioPauseTime) {
+        const pauseDurationMs = Date.now() - this.audioPauseTime;
+        const adjustedOffsetMs = Math.max(0, offsetMs - pauseDurationMs);
+        console.log(`Interruption detected: Raw offset ${offsetMs}ms, pause duration ${pauseDurationMs}ms, adjusted offset ${adjustedOffsetMs}ms for turn ${this.currentTurnId}`);
+        offsetMs = adjustedOffsetMs;
+        this.audioPauseTime = null; // Clear the pause time
+      }
+      else {
+        console.log(`Interruption detected: ${offsetMs}ms offset for turn ${this.currentTurnId} (no pause adjustment needed)`);
+      }
+      // Send interruption event with accurate playback offset in milliseconds
+      this._wsSend({
+        type: 'trigger.response.audio.interrupted',
+        playback_offset: offsetMs,
+        interruption_context: {
+          turn_id: this.currentTurnId,
+          playback_offset_ms: offsetMs,
+        },
+      });
+    }
+    else {
+      console.warn('Interruption requested but missing required data:', {
+        hasOffsetData: !!offsetData,
+        hasTurnId: !!this.currentTurnId,
+      });
+    }
   }
   async triggerUserTurnStarted() {
     if (!this.pushToTalkActive) {
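_clientInterruptAssistantReplay now reports where in the agent's reply the user cut in. Since VAD may have paused playback before the interrupt completes, the wall-clock pause duration is subtracted from the player's raw offset. A worked example of the arithmetic (numbers are illustrative):

    // Player reports 4.2 s of elapsed audio, but 600 ms of that window was spent paused,
    // so the user actually heard 3.6 s before interrupting.
    const rawOffsetMs = 4.2 * 1000;                                      // offsetData.currentTime * 1000
    const pauseDurationMs = 600;                                         // Date.now() - this.audioPauseTime
    const adjustedOffsetMs = Math.max(0, rawOffsetMs - pauseDurationMs); // 3600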
@@ -3669,24 +3788,38 @@ class LayercodeClient {
         // Sent from the server to this client when a new user turn is detected
         console.log('received turn.start from server');
         console.log(message);
-        if (message.role === '
+        if (message.role === 'assistant') {
+          // Start tracking new assistant turn
+          // Note: Don't reset currentTurnId here - let response.audio set it
+          // This prevents race conditions where text arrives before audio
+          console.log('Assistant turn started, will track new turn ID from audio/text');
+        }
+        else if (message.role === 'user' && !this.pushToTalkEnabled && this.canInterrupt) {
           // Interrupt any playing assistant audio if this is a turn triggered by the server (and not push to talk, which will have already called interrupt)
           console.log('interrupting assistant audio, as user turn has started and pushToTalkEnabled is false');
           await this._clientInterruptAssistantReplay();
         }
-        // if (message.role === 'assistant') {
-        //   // Clear the buffer of audio when the assistant starts a new turn, as it may have been paused previously by VAD, leaving some audio frames in the buffer.
-        //   console.log('Clearing audio buffer as assistant turn has started');
-        //   await this._clientInterruptAssistantReplay();
-        // }
         break;
       case 'response.audio':
         const audioBuffer = base64ToArrayBuffer(message.content);
         this.wavPlayer.add16BitPCM(audioBuffer, message.turn_id);
+        // Set current turn ID from first audio message, or update if different turn
+        if (!this.currentTurnId || this.currentTurnId !== message.turn_id) {
+          console.log(`Setting current turn ID to: ${message.turn_id} (was: ${this.currentTurnId})`);
+          this.currentTurnId = message.turn_id;
+          // Clean up interrupted tracks, keeping only the current turn
+          this.wavPlayer.clearInterruptedTracks(this.currentTurnId ? [this.currentTurnId] : []);
+        }
         break;
-
-
-
+      case 'response.text': {
+        // Set turn ID from first text message if not set
+        if (!this.currentTurnId) {
+          this.currentTurnId = message.turn_id;
+          console.log(`Setting current turn ID to: ${message.turn_id} from text message`);
+        }
+        // Note: We no longer track text content in the client - the pipeline handles interruption estimation
+        break;
+      }
       case 'response.data':
         console.log('received response.data', message);
         this.options.onDataMessage(message);
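Turn tracking is now driven by the server's messages: response.audio both sets currentTurnId and prunes stale interrupted-track entries, while response.text only fills it in when still unset, which avoids a race when text arrives before audio. The precedence, reduced to a sketch (state and player are stand-ins for the client and its WavStreamPlayer):

    function trackTurn(state, player, message) {
      switch (message.type) {
        case 'response.audio':
          if (state.currentTurnId !== message.turn_id) {
            state.currentTurnId = message.turn_id;            // audio always wins
            player.clearInterruptedTracks([message.turn_id]); // drop bookkeeping for older turns
          }
          break;
        case 'response.text':
          if (!state.currentTurnId) state.currentTurnId = message.turn_id; // only fills a gap
          break;
      }
    }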
@@ -3711,18 +3844,29 @@ class LayercodeClient {
     const base64 = arrayBufferToBase64(data.mono);
     const sendAudio = this.pushToTalkEnabled ? this.pushToTalkActive : this.userIsSpeaking;
     if (sendAudio) {
+      // If we have buffered audio, send it first
+      if (this.audioBuffer.length > 0) {
+        console.log(`Sending ${this.audioBuffer.length} buffered audio chunks`);
+        for (const bufferedAudio of this.audioBuffer) {
+          this._wsSend({
+            type: 'client.audio',
+            content: bufferedAudio,
+          });
+        }
+        this.audioBuffer = []; // Clear the buffer after sending
+      }
+      // Send the current audio
       this._wsSend({
         type: 'client.audio',
         content: base64,
       });
-
-
-
-
-
-
-
-      });
+    }
+    else {
+      // Buffer audio when not sending (to catch audio just before VAD triggers)
+      this.audioBuffer.push(base64);
+      // Keep buffer size reasonable (e.g., last 10 chunks ≈ 200ms at 20ms chunks)
+      if (this.audioBuffer.length > 10) {
+        this.audioBuffer.shift(); // Remove oldest chunk
       }
     }
   }
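Buffering while not sending means the syllables that tripped the VAD are not lost: the last 10 chunks (~200 ms at 20 ms per chunk) are retained and flushed ahead of the live audio once sending begins. The bounded FIFO on its own (chunk values are stand-ins):

    const buffer = [];
    const MAX_CHUNKS = 10; // ~200 ms at 20 ms per chunk
    function bufferChunk(chunk) {
      buffer.push(chunk);
      if (buffer.length > MAX_CHUNKS) buffer.shift(); // drop the oldest chunk
    }
    function flush(send) {
      for (const chunk of buffer) send(chunk); // buffered audio goes out first
      buffer.length = 0;
    }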
@@ -3779,6 +3923,8 @@ class LayercodeClient {
   async connect() {
     try {
       this._setStatus('connecting');
+      // Reset turn tracking for clean start
+      this._resetTurnTracking();
       // Get session key from server
       let authorizeSessionRequestBody = {
         pipeline_id: this.options.pipelineId,
@@ -3856,11 +4002,27 @@ class LayercodeClient {
       throw error;
     }
   }
+  _resetTurnTracking() {
+    this.currentTurnId = null;
+    console.log('Reset turn tracking state');
+  }
   async disconnect() {
-
+    // Clean up VAD if it exists
+    if (this.vad) {
+      this.vad.pause();
+      this.vad.destroy();
+      this.vad = null;
+    }
     this.wavRecorder.quit();
     this.wavPlayer.disconnect();
-
+    // Reset turn tracking
+    this._resetTurnTracking();
+    // Close websocket and ensure status is updated
+    if (this.ws) {
+      this.ws.close();
+      this._setStatus('disconnected');
+      this.options.onDisconnect();
+    }
   }
   /**
    * Gets the microphone MediaStream used by this client
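disconnect() now tears down the VAD instance and the websocket explicitly and fires the status/disconnect callbacks itself rather than waiting for a socket close event. A lifecycle sketch using only the option names visible in this diff (the exact constructor shape is assumed, not verified against the SDK docs):

    async function demo() {
      const client = new LayercodeClient({
        pipelineId: 'pl_example',                  // hypothetical ID
        onDisconnect: () => console.log('closed'), // now invoked by disconnect() itself
        onUserIsSpeakingChange: (speaking) => console.log({ speaking }),
        onDataMessage: (msg) => console.log(msg),
      });
      await client.connect();    // resets turn tracking before authorizing the session
      // ... converse ...
      await client.disconnect(); // destroys VAD, stops recorder/player, closes the socket
    }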
|