@layercode/js-sdk 1.0.21 → 1.0.22
This diff reflects the changes between publicly released versions of the package as they appear in its public registry, and is provided for informational purposes only.
@@ -510,6 +510,24 @@ class WavStreamPlayer {
     this.isPlaying = false;
   }
 
+  /**
+   * Clears interrupted track IDs to prevent memory leaks
+   * @param {string[]} [keepTrackIds] - Track IDs to keep in the interrupted list
+   */
+  clearInterruptedTracks(keepTrackIds = []) {
+    if (keepTrackIds.length === 0) {
+      this.interruptedTrackIds = {};
+    } else {
+      const newInterruptedTracks = {};
+      for (const trackId of keepTrackIds) {
+        if (this.interruptedTrackIds[trackId]) {
+          newInterruptedTracks[trackId] = true;
+        }
+      }
+      this.interruptedTrackIds = newInterruptedTracks;
+    }
+  }
+
   /**
    * Connects the audio context and enables output to speakers
    * @returns {Promise<true>}
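The new clearInterruptedTracks builds a fresh object instead of deleting keys in place, so track IDs from finished turns stop accumulating in interruptedTrackIds. A minimal usage sketch, assuming player is a WavStreamPlayer instance; the track ID is hypothetical:

    // Keep only the turn that is still live; all other interrupted IDs are dropped.
    player.clearInterruptedTracks(['turn_42']);
    // With no argument, every interrupted track ID is cleared.
    player.clearInterruptedTracks();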
@@ -743,7 +761,7 @@ class WavStreamPlayer {
       this.analyser.disconnect();
     }
 
-    if (this.context) {
+    if (this.context && this.context.state !== 'closed') {
       this.context.close().catch((err) => console.error("Error closing audio context:", err));
     }
 
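Per the Web Audio spec, AudioContext.close() rejects with an InvalidStateError when the context is already closed, so the added state check makes repeated teardown safe. A standalone sketch of the guard:

    // close() moves state to 'closed'; an unguarded second close() would reject.
    const ctx = new AudioContext();
    await ctx.close();
    if (ctx.state !== 'closed') {
      await ctx.close(); // skipped here, avoiding the InvalidStateError rejection
    }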
@@ -3498,13 +3516,15 @@ class LayercodeClient {
     this.agentAudioAmplitude = 0;
     this.sessionId = options.sessionId || null;
     this.pushToTalkActive = false;
-    this.vadPausedPlayer = false;
    this.pushToTalkEnabled = false;
    this.canInterrupt = false;
    this.userIsSpeaking = false;
    this.endUserTurn = false;
    this.recorderStarted = false;
    this.readySent = false;
+    this.currentTurnText = '';
+    this.currentTurnId = null;
+    this.audioBuffer = [];
    // Bind event handlers
    this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
    this._handleDataAvailable = this._handleDataAvailable.bind(this);
@@ -3531,17 +3551,48 @@ class LayercodeClient {
         minSpeechFrames: 15,
         preSpeechPadFrames: 0,
         onSpeechStart: () => {
-
-
-
-
+          this.userIsSpeaking = true;
+          this.options.onUserIsSpeakingChange(true);
+          console.log('onSpeechStart: sending vad_start');
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_start',
+          });
         },
         onVADMisfire: () => {
+          console.log('onVADMisfire: Short utterance detected, resuming bot');
+          this.audioBuffer = []; // Clear buffer on misfire
           this.userIsSpeaking = false;
           this.options.onUserIsSpeakingChange(false);
+          // Send vad_end to indicate the short utterance is over
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_end',
+          });
+          // End the user's turn
+          this._wsSend({
+            type: 'trigger.turn.end',
+            role: 'user',
+          });
+          // Resume bot audio if it was playing
+          if (!this.wavPlayer.isPlaying) {
+            console.log('onVADMisfire: Resuming bot audio');
+            this.wavPlayer.play();
+          }
         },
         onSpeechEnd: () => {
-
+          console.log('onSpeechEnd: sending vad_end');
+          this.endUserTurn = true; // Set flag to indicate that the user turn has ended
+          this.audioBuffer = []; // Clear buffer on speech end
+          this.userIsSpeaking = false;
+          this.options.onUserIsSpeakingChange(false);
+          console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
+          // Send vad_end immediately instead of waiting for next audio chunk
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_end',
+          });
+          this.endUserTurn = false; // Reset the flag after sending vad_end
         },
       })
       .then((vad) => {
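These handlers now report voice-activity transitions to the server the moment they happen rather than piggybacking on the next audio chunk. The payload shapes, taken directly from the hunk above:

    // The three messages this VAD wiring sends over the WebSocket:
    const vadStart = { type: 'vad_events', event: 'vad_start' };  // speech began
    const vadEnd   = { type: 'vad_events', event: 'vad_end' };    // speech (or a misfire) ended
    const turnEnd  = { type: 'trigger.turn.end', role: 'user' };  // a misfire also closes the user turn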
@@ -3563,41 +3614,59 @@ class LayercodeClient {
         positiveSpeechThreshold: 0.3,
         negativeSpeechThreshold: 0.2,
         redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
-        minSpeechFrames:
+        minSpeechFrames: 5,
         preSpeechPadFrames: 0,
         onSpeechStart: () => {
           // Only pause agent audio if it's currently playing
           if (this.wavPlayer.isPlaying) {
             console.log('onSpeechStart: WavPlayer is playing, pausing it.');
             this.wavPlayer.pause();
-            this.vadPausedPlayer = true; // VAD is responsible for this pause
           }
           else {
             console.log('onSpeechStart: WavPlayer is not playing, VAD will not pause.');
           }
-          this.userIsSpeaking = true;
-          this.options.onUserIsSpeakingChange(true);
           console.log('onSpeechStart: sending vad_start');
           this._wsSend({
             type: 'vad_events',
             event: 'vad_start',
           });
+          this.userIsSpeaking = true;
+          this.options.onUserIsSpeakingChange(true);
+          this.endUserTurn = false; // Reset endUserTurn when speech starts
+          console.log('onSpeechStart: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
         },
         onVADMisfire: () => {
           // If the speech detected was for less than minSpeechFrames, this is called instead of onSpeechEnd, and we should resume the assistant audio as it was a false interruption. We include a configurable delay so the assistant isn't too quick to start speaking again.
           this.userIsSpeaking = false;
+          this.audioBuffer = []; // Clear buffer on misfire
           this.options.onUserIsSpeakingChange(false);
-
-
-          this.wavPlayer.
-
-
-
-
-
+          // Add the missing delay before resuming to prevent race conditions
+          setTimeout(() => {
+            if (!this.wavPlayer.isPlaying) {
+              console.log('onVADMisfire: Resuming after delay');
+              this.wavPlayer.play();
+              this.userIsSpeaking = true;
+              this.options.onUserIsSpeakingChange(true);
+            }
+            else {
+              console.log('onVADMisfire: Not resuming - either no pause or user speaking again');
+              this.endUserTurn = true;
+            }
+          }, this.options.vadResumeDelay);
         },
         onSpeechEnd: () => {
-
+          console.log('onSpeechEnd: sending vad_end');
+          this.endUserTurn = true; // Set flag to indicate that the user turn has ended
+          this.audioBuffer = []; // Clear buffer on speech end
+          this.userIsSpeaking = false;
+          this.options.onUserIsSpeakingChange(false);
+          console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
+          // Send vad_end immediately instead of waiting for next audio chunk
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_end',
+          });
+          this.endUserTurn = false; // Reset the flag after sending vad_end
         },
       })
       .then((vad) => {
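In this second VAD configuration, a misfire resumes assistant playback only after this.options.vadResumeDelay elapses, re-checking the player at fire time in case playback restarted in the meantime. A minimal sketch of the pattern in isolation, assuming a player with play()/isPlaying and a delay in milliseconds:

    function resumeAfterMisfire(player, delayMs) {
      setTimeout(() => {
        // Re-check at fire time: the pause may have been a false interruption,
        // but playback may also have restarted during the delay.
        if (!player.isPlaying) {
          player.play();
        }
      }, delayMs);
    }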
@@ -3631,14 +3700,33 @@ class LayercodeClient {
       reason: 'completed',
     });
   }
+  _estimateWordsHeard(text, playbackOffsetSeconds) {
+    const words = text.split(/\s+/).filter((word) => word.length > 0);
+    const totalWords = words.length;
+    // Rough estimation: average speaking rate is ~150 words per minute (2.5 words per second)
+    const estimatedWordsPerSecond = 2.5;
+    const estimatedWordsHeard = Math.min(Math.floor(playbackOffsetSeconds * estimatedWordsPerSecond), totalWords);
+    const textHeard = words.slice(0, estimatedWordsHeard).join(' ');
+    return { wordsHeard: estimatedWordsHeard, textHeard };
+  }
   async _clientInterruptAssistantReplay() {
-    await this.wavPlayer.interrupt();
-
-
-
-
-
-
+    const offsetData = await this.wavPlayer.interrupt();
+    if (offsetData && this.currentTurnText && this.currentTurnId) {
+      const { wordsHeard, textHeard } = this._estimateWordsHeard(this.currentTurnText, offsetData.currentTime);
+      const totalWords = this.currentTurnText.split(/\s+/).filter((word) => word.length > 0).length;
+      console.log(`Interruption detected: ${wordsHeard}/${totalWords} words heard, text: "${textHeard}"`);
+      // Send interruption event with context
+      this._wsSend({
+        type: 'trigger.response.audio.interrupted',
+        playback_offset: offsetData.currentTime,
+        interruption_context: {
+          turn_id: this.currentTurnId,
+          estimated_words_heard: wordsHeard,
+          total_words: totalWords,
+          text_heard: textHeard,
+        },
+      });
+    }
   }
   async triggerUserTurnStarted() {
     if (!this.pushToTalkActive) {
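_estimateWordsHeard maps a playback offset to a word count using a fixed rate of 2.5 words per second (~150 wpm), capped at the turn's total length. A worked example, calling the underscore-private method directly purely for illustration, with a hypothetical twelve-word turn interrupted 4 seconds in:

    // floor(4.0 s * 2.5 words/s) = 10 of 12 words estimated heard.
    const text = 'one two three four five six seven eight nine ten eleven twelve';
    const { wordsHeard, textHeard } = client._estimateWordsHeard(text, 4.0);
    // wordsHeard === 10
    // textHeard === 'one two three four five six seven eight nine ten'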
@@ -3669,20 +3757,44 @@ class LayercodeClient {
         // Sent from the server to this client when a new user turn is detected
         console.log('received turn.start from server');
         console.log(message);
-        if (message.role === '
+        if (message.role === 'assistant') {
+          // Start tracking new assistant turn
+          // Note: Don't reset currentTurnId here - let response.audio set it
+          // This prevents race conditions where text arrives before audio
+          console.log('Assistant turn started, will track new turn ID from audio/text');
+        }
+        else if (message.role === 'user' && !this.pushToTalkEnabled && this.canInterrupt) {
           // Interrupt any playing assistant audio if this is a turn triggered by the server (and not push to talk, which will have already called interrupt)
           console.log('interrupting assistant audio, as user turn has started and pushToTalkEnabled is false');
           await this._clientInterruptAssistantReplay();
         }
-        // if (message.role === 'assistant') {
-        //   // Clear the buffer of audio when the assistant starts a new turn, as it may have been paused previously by VAD, leaving some audio frames in the buffer.
-        //   console.log('Clearing audio buffer as assistant turn has started');
-        //   await this._clientInterruptAssistantReplay();
-        // }
         break;
       case 'response.audio':
         const audioBuffer = base64ToArrayBuffer(message.content);
         this.wavPlayer.add16BitPCM(audioBuffer, message.turn_id);
+        // Set current turn ID from first audio message, or update if different turn
+        if (!this.currentTurnId || this.currentTurnId !== message.turn_id) {
+          console.log(`Setting current turn ID to: ${message.turn_id} (was: ${this.currentTurnId})`);
+          const oldTurnId = this.currentTurnId;
+          this.currentTurnId = message.turn_id;
+          this.currentTurnText = ''; // Reset text for new turn
+          // Clean up interrupted tracks, keeping only the current turn
+          this.wavPlayer.clearInterruptedTracks(this.currentTurnId ? [this.currentTurnId] : []);
+        }
+        break;
+      case 'response.text':
+        // Set turn ID from first text message if not set, or accumulate if matches current turn
+        if (!this.currentTurnId || message.turn_id === this.currentTurnId) {
+          if (!this.currentTurnId) {
+            console.log(`Setting current turn ID to: ${message.turn_id} from text message`);
+            this.currentTurnId = message.turn_id;
+            this.currentTurnText = '';
+          }
+          this.currentTurnText += message.content;
+        }
+        else {
+          console.log(`Ignoring text for turn ${message.turn_id}, current turn is ${this.currentTurnId}`);
+        }
         break;
       // case 'response.end':
       //   console.log('received response.end');
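The handler now threads a turn ID through audio and text events so an interruption can report which turn, and how much of it, the user actually heard. Rough shapes of the three server messages it distinguishes, inferred from the handler code (literal values are hypothetical, and other fields may be present):

    const turnStart  = { type: 'turn.start', role: 'assistant' };                       // noted; turn ID is set later
    const audioChunk = { type: 'response.audio', turn_id: 't1', content: '<base64>' };  // sets or updates currentTurnId
    const textDelta  = { type: 'response.text', turn_id: 't1', content: 'Hello ' };     // appended to currentTurnText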
@@ -3711,18 +3823,29 @@ class LayercodeClient {
     const base64 = arrayBufferToBase64(data.mono);
     const sendAudio = this.pushToTalkEnabled ? this.pushToTalkActive : this.userIsSpeaking;
     if (sendAudio) {
+      // If we have buffered audio, send it first
+      if (this.audioBuffer.length > 0) {
+        console.log(`Sending ${this.audioBuffer.length} buffered audio chunks`);
+        for (const bufferedAudio of this.audioBuffer) {
+          this._wsSend({
+            type: 'client.audio',
+            content: bufferedAudio,
+          });
+        }
+        this.audioBuffer = []; // Clear the buffer after sending
+      }
+      // Send the current audio
       this._wsSend({
         type: 'client.audio',
         content: base64,
       });
-
-
-
-
-
-
-
-      });
+    }
+    else {
+      // Buffer audio when not sending (to catch audio just before VAD triggers)
+      this.audioBuffer.push(base64);
+      // Keep buffer size reasonable (e.g., last 10 chunks ≈ 200ms at 20ms chunks)
+      if (this.audioBuffer.length > 10) {
+        this.audioBuffer.shift(); // Remove oldest chunk
       }
     }
   }
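Audio captured while sendAudio is false is no longer discarded: the last ten base64 chunks are retained and flushed ahead of the first live chunk once sending resumes, so the onset of speech just before the VAD triggers is not clipped. The bounded-buffer step in isolation, assuming ~20 ms chunks:

    const MAX_BUFFERED_CHUNKS = 10; // about 200 ms at 20 ms per chunk
    function bufferChunk(audioBuffer, base64Chunk) {
      audioBuffer.push(base64Chunk);
      if (audioBuffer.length > MAX_BUFFERED_CHUNKS) {
        audioBuffer.shift(); // drop the oldest chunk first
      }
    }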
@@ -3857,10 +3980,20 @@ class LayercodeClient {
     }
   }
   async disconnect() {
-
+    // Clean up VAD if it exists
+    if (this.vad) {
+      this.vad.pause();
+      this.vad.destroy();
+      this.vad = null;
+    }
     this.wavRecorder.quit();
     this.wavPlayer.disconnect();
-
+    // Close websocket and ensure status is updated
+    if (this.ws) {
+      this.ws.close();
+      this._setStatus('disconnected');
+      this.options.onDisconnect();
+    }
   }
   /**
    * Gets the microphone MediaStream used by this client
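disconnect() now tears down in dependency order: VAD first (so no further speech callbacks fire), then recorder and player, then the WebSocket, finally reporting the change through onDisconnect. A hypothetical caller relying on that callback; only the two option names used in this diff are shown, and the full constructor option set is assumed:

    const client = new LayercodeClient({
      onDisconnect: () => console.log('client disconnected'),          // now fired by disconnect()
      onUserIsSpeakingChange: (speaking) => console.log({ speaking }),
    });
    // ... later:
    await client.disconnect(); // VAD destroyed, recorder quit, player and socket closed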
|