@layercode/js-sdk 1.0.19 → 1.0.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -510,6 +510,24 @@ class WavStreamPlayer {
         this.isPlaying = false;
     }
 
+    /**
+     * Clears interrupted track IDs to prevent memory leaks
+     * @param {string[]} [keepTrackIds] - Track IDs to keep in the interrupted list
+     */
+    clearInterruptedTracks(keepTrackIds = []) {
+        if (keepTrackIds.length === 0) {
+            this.interruptedTrackIds = {};
+        } else {
+            const newInterruptedTracks = {};
+            for (const trackId of keepTrackIds) {
+                if (this.interruptedTrackIds[trackId]) {
+                    newInterruptedTracks[trackId] = true;
+                }
+            }
+            this.interruptedTrackIds = newInterruptedTracks;
+        }
+    }
+
     /**
      * Connects the audio context and enables output to speakers
      * @returns {Promise<true>}
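The new clearInterruptedTracks method trims the player's map of interrupted track IDs so it cannot grow without bound. A minimal usage sketch (the player variable and the 'turn_abc' ID are hypothetical; within this diff the SDK itself calls the method from its response.audio handler, keeping only the current turn):

    // Keep only the entry for the turn still in flight
    player.clearInterruptedTracks(['turn_abc']);
    // Passing no argument clears the map entirely
    player.clearInterruptedTracks();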
@@ -743,7 +761,7 @@ class WavStreamPlayer {
             this.analyser.disconnect();
         }
 
-        if (this.context) {
+        if (this.context && this.context.state !== 'closed') {
             this.context.close().catch((err) => console.error("Error closing audio context:", err));
         }
 
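The added state check prevents a double close: per the Web Audio API, calling close() on an AudioContext whose state is already 'closed' rejects with an InvalidStateError. A small illustration of the guarded pattern:

    const ctx = new AudioContext();
    await ctx.close();              // ctx.state becomes 'closed'
    if (ctx.state !== 'closed') {
        await ctx.close();          // skipped on a second disconnect, so no rejection
    }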
@@ -3482,6 +3500,7 @@ class LayercodeClient {
             onUserAmplitudeChange: options.onUserAmplitudeChange || (() => { }),
             onAgentAmplitudeChange: options.onAgentAmplitudeChange || (() => { }),
             onStatusChange: options.onStatusChange || (() => { }),
+            onUserIsSpeakingChange: options.onUserIsSpeakingChange || (() => { }),
         };
         this.AMPLITUDE_MONITORING_SAMPLE_RATE = 10;
         this._websocketUrl = 'wss://api.layercode.com/v1/pipelines/websocket';
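Consumers can now subscribe to speaking-state changes alongside the existing callbacks. A hedged wiring sketch (the micIndicator element is hypothetical, and constructor options other than those visible in this diff are assumptions):

    const client = new LayercodeClient({
        onStatusChange: (status) => console.log('status:', status),
        onUserIsSpeakingChange: (speaking) => {
            // e.g. toggle a mic indicator in the UI
            micIndicator.classList.toggle('active', speaking);
        },
    });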
@@ -3497,13 +3516,15 @@ class LayercodeClient {
         this.agentAudioAmplitude = 0;
         this.sessionId = options.sessionId || null;
         this.pushToTalkActive = false;
-        this.vadPausedPlayer = false;
         this.pushToTalkEnabled = false;
         this.canInterrupt = false;
         this.userIsSpeaking = false;
         this.endUserTurn = false;
         this.recorderStarted = false;
         this.readySent = false;
+        this.currentTurnText = '';
+        this.currentTurnId = null;
+        this.audioBuffer = [];
         // Bind event handlers
         this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
         this._handleDataAvailable = this._handleDataAvailable.bind(this);
@@ -3518,6 +3539,7 @@ class LayercodeClient {
             console.log('silero vad model timeout');
             // TODO: send message to server to indicate that the vad model timed out
             this.userIsSpeaking = true; // allow audio to be sent to the server
+            this.options.onUserIsSpeakingChange(true);
         }, 2000);
         if (!this.canInterrupt) {
             dist.MicVAD.new({
@@ -3529,15 +3551,48 @@ class LayercodeClient {
                 minSpeechFrames: 15,
                 preSpeechPadFrames: 0,
                 onSpeechStart: () => {
-
-
-
+                    this.userIsSpeaking = true;
+                    this.options.onUserIsSpeakingChange(true);
+                    console.log('onSpeechStart: sending vad_start');
+                    this._wsSend({
+                        type: 'vad_events',
+                        event: 'vad_start',
+                    });
                 },
                 onVADMisfire: () => {
+                    console.log('onVADMisfire: Short utterance detected, resuming bot');
+                    this.audioBuffer = []; // Clear buffer on misfire
                     this.userIsSpeaking = false;
+                    this.options.onUserIsSpeakingChange(false);
+                    // Send vad_end to indicate the short utterance is over
+                    this._wsSend({
+                        type: 'vad_events',
+                        event: 'vad_end',
+                    });
+                    // End the user's turn
+                    this._wsSend({
+                        type: 'trigger.turn.end',
+                        role: 'user',
+                    });
+                    // Resume bot audio if it was playing
+                    if (!this.wavPlayer.isPlaying) {
+                        console.log('onVADMisfire: Resuming bot audio');
+                        this.wavPlayer.play();
+                    }
                 },
                 onSpeechEnd: () => {
-
+                    console.log('onSpeechEnd: sending vad_end');
+                    this.endUserTurn = true; // Set flag to indicate that the user turn has ended
+                    this.audioBuffer = []; // Clear buffer on speech end
+                    this.userIsSpeaking = false;
+                    this.options.onUserIsSpeakingChange(false);
+                    console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
+                    // Send vad_end immediately instead of waiting for next audio chunk
+                    this._wsSend({
+                        type: 'vad_events',
+                        event: 'vad_end',
+                    });
+                    this.endUserTurn = false; // Reset the flag after sending vad_end
                 },
             })
                 .then((vad) => {
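On the wire, these handlers emit small JSON frames; the shapes below are taken directly from the _wsSend calls above:

    { "type": "vad_events", "event": "vad_start" }    // speech detected
    { "type": "vad_events", "event": "vad_end" }      // speech (or misfire) finished
    { "type": "trigger.turn.end", "role": "user" }    // a misfire also ends the user turn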
@@ -3559,43 +3614,59 @@ class LayercodeClient {
                 positiveSpeechThreshold: 0.3,
                 negativeSpeechThreshold: 0.2,
                 redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
-                minSpeechFrames:
+                minSpeechFrames: 5,
                 preSpeechPadFrames: 0,
                 onSpeechStart: () => {
                     // Only pause agent audio if it's currently playing
                     if (this.wavPlayer.isPlaying) {
                         console.log('onSpeechStart: WavPlayer is playing, pausing it.');
                         this.wavPlayer.pause();
-                        this.vadPausedPlayer = true; // VAD is responsible for this pause
                     }
                     else {
                         console.log('onSpeechStart: WavPlayer is not playing, VAD will not pause.');
                     }
-                    this.userIsSpeaking = true;
                     console.log('onSpeechStart: sending vad_start');
                     this._wsSend({
                         type: 'vad_events',
                         event: 'vad_start',
                     });
+                    this.userIsSpeaking = true;
+                    this.options.onUserIsSpeakingChange(true);
+                    this.endUserTurn = false; // Reset endUserTurn when speech starts
+                    console.log('onSpeechStart: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
                 },
                 onVADMisfire: () => {
                     // If the speech detected was for less than minSpeechFrames, this is called instead of onSpeechEnd, and we should resume the assistant audio as it was a false interruption. We include a configurable delay so the assistant isn't too quick to start speaking again.
                     this.userIsSpeaking = false;
-
-
-
-
-
-
-
-
+                    this.audioBuffer = []; // Clear buffer on misfire
+                    this.options.onUserIsSpeakingChange(false);
+                    // Add the missing delay before resuming to prevent race conditions
+                    setTimeout(() => {
+                        if (!this.wavPlayer.isPlaying) {
+                            console.log('onVADMisfire: Resuming after delay');
+                            this.wavPlayer.play();
+                            this.userIsSpeaking = true;
+                            this.options.onUserIsSpeakingChange(true);
+                        }
+                        else {
+                            console.log('onVADMisfire: Not resuming - either no pause or user speaking again');
+                            this.endUserTurn = true;
+                        }
+                    }, this.options.vadResumeDelay);
                 },
                 onSpeechEnd: () => {
+                    console.log('onSpeechEnd: sending vad_end');
+                    this.endUserTurn = true; // Set flag to indicate that the user turn has ended
+                    this.audioBuffer = []; // Clear buffer on speech end
                     this.userIsSpeaking = false;
+                    this.options.onUserIsSpeakingChange(false);
+                    console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
+                    // Send vad_end immediately instead of waiting for next audio chunk
                     this._wsSend({
                         type: 'vad_events',
                         event: 'vad_end',
                     });
+                    this.endUserTurn = false; // Reset the flag after sending vad_end
                 },
             })
                 .then((vad) => {
@@ -3629,14 +3700,33 @@ class LayercodeClient {
             reason: 'completed',
         });
     }
+    _estimateWordsHeard(text, playbackOffsetSeconds) {
+        const words = text.split(/\s+/).filter((word) => word.length > 0);
+        const totalWords = words.length;
+        // Rough estimation: average speaking rate is ~150 words per minute (2.5 words per second)
+        const estimatedWordsPerSecond = 2.5;
+        const estimatedWordsHeard = Math.min(Math.floor(playbackOffsetSeconds * estimatedWordsPerSecond), totalWords);
+        const textHeard = words.slice(0, estimatedWordsHeard).join(' ');
+        return { wordsHeard: estimatedWordsHeard, textHeard };
+    }
     async _clientInterruptAssistantReplay() {
-        await this.wavPlayer.interrupt();
-
-
-
-
-
-
+        const offsetData = await this.wavPlayer.interrupt();
+        if (offsetData && this.currentTurnText && this.currentTurnId) {
+            const { wordsHeard, textHeard } = this._estimateWordsHeard(this.currentTurnText, offsetData.currentTime);
+            const totalWords = this.currentTurnText.split(/\s+/).filter((word) => word.length > 0).length;
+            console.log(`Interruption detected: ${wordsHeard}/${totalWords} words heard, text: "${textHeard}"`);
+            // Send interruption event with context
+            this._wsSend({
+                type: 'trigger.response.audio.interrupted',
+                playback_offset: offsetData.currentTime,
+                interruption_context: {
+                    turn_id: this.currentTurnId,
+                    estimated_words_heard: wordsHeard,
+                    total_words: totalWords,
+                    text_heard: textHeard,
+                },
+            });
+        }
     }
     async triggerUserTurnStarted() {
         if (!this.pushToTalkActive) {
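The word estimate is straight arithmetic: at the assumed rate of ~150 words per minute (2.5 words/second), a playback offset of 3.2 seconds gives floor(3.2 × 2.5) = 8 words, capped at the turn's total word count. The same logic in isolation:

    const words = 'the quick brown fox jumps over the lazy sleeping dog'.split(/\s+/);
    const heard = Math.min(Math.floor(3.2 * 2.5), words.length); // 8 of 10 words
    console.log(words.slice(0, heard).join(' '));
    // -> the quick brown fox jumps over the lazy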
@@ -3667,20 +3757,44 @@ class LayercodeClient {
                 // Sent from the server to this client when a new user turn is detected
                 console.log('received turn.start from server');
                 console.log(message);
-                if (message.role === '
+                if (message.role === 'assistant') {
+                    // Start tracking new assistant turn
+                    // Note: Don't reset currentTurnId here - let response.audio set it
+                    // This prevents race conditions where text arrives before audio
+                    console.log('Assistant turn started, will track new turn ID from audio/text');
+                }
+                else if (message.role === 'user' && !this.pushToTalkEnabled && this.canInterrupt) {
                     // Interrupt any playing assistant audio if this is a turn trigged by the server (and not push to talk, which will have already called interrupt)
                     console.log('interrupting assistant audio, as user turn has started and pushToTalkEnabled is false');
                     await this._clientInterruptAssistantReplay();
                 }
-                // if (message.role === 'assistant') {
-                //     // Clear the buffer of audio when the assisatnt starts a new turn, as it may have been paused previously by VAD, leaving some audio frames in the buffer.
-                //     console.log('Clearing audio buffer as assistant turn has started');
-                //     await this._clientInterruptAssistantReplay();
-                // }
                 break;
             case 'response.audio':
                 const audioBuffer = base64ToArrayBuffer(message.content);
                 this.wavPlayer.add16BitPCM(audioBuffer, message.turn_id);
+                // Set current turn ID from first audio message, or update if different turn
+                if (!this.currentTurnId || this.currentTurnId !== message.turn_id) {
+                    console.log(`Setting current turn ID to: ${message.turn_id} (was: ${this.currentTurnId})`);
+                    const oldTurnId = this.currentTurnId;
+                    this.currentTurnId = message.turn_id;
+                    this.currentTurnText = ''; // Reset text for new turn
+                    // Clean up interrupted tracks, keeping only the current turn
+                    this.wavPlayer.clearInterruptedTracks(this.currentTurnId ? [this.currentTurnId] : []);
+                }
+                break;
+            case 'response.text':
+                // Set turn ID from first text message if not set, or accumulate if matches current turn
+                if (!this.currentTurnId || message.turn_id === this.currentTurnId) {
+                    if (!this.currentTurnId) {
+                        console.log(`Setting current turn ID to: ${message.turn_id} from text message`);
+                        this.currentTurnId = message.turn_id;
+                        this.currentTurnText = '';
+                    }
+                    this.currentTurnText += message.content;
+                }
+                else {
+                    console.log(`Ignoring text for turn ${message.turn_id}, current turn is ${this.currentTurnId}`);
+                }
                 break;
             // case 'response.end':
             //     console.log('received response.end');
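Taken together, a typical assistant turn now drives client state roughly as follows (message shapes follow the handler above; the turn ID value is illustrative):

    // turn.start      { role: 'assistant' }                    -> currentTurnId left untouched
    // response.audio  { turn_id: 't1', content: <base64 PCM> } -> currentTurnId = 't1', currentTurnText = ''
    // response.text   { turn_id: 't1', content: 'Hello ' }     -> currentTurnText = 'Hello '
    // response.text   { turn_id: 't1', content: 'there.' }     -> currentTurnText = 'Hello there.'
    // user interrupts -> _clientInterruptAssistantReplay() reports words heard for 't1'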
@@ -3709,17 +3823,29 @@ class LayercodeClient {
         const base64 = arrayBufferToBase64(data.mono);
         const sendAudio = this.pushToTalkEnabled ? this.pushToTalkActive : this.userIsSpeaking;
         if (sendAudio) {
+            // If we have buffered audio, send it first
+            if (this.audioBuffer.length > 0) {
+                console.log(`Sending ${this.audioBuffer.length} buffered audio chunks`);
+                for (const bufferedAudio of this.audioBuffer) {
+                    this._wsSend({
+                        type: 'client.audio',
+                        content: bufferedAudio,
+                    });
+                }
+                this.audioBuffer = []; // Clear the buffer after sending
+            }
+            // Send the current audio
             this._wsSend({
                 type: 'client.audio',
                 content: base64,
             });
-
-
-
-
-
-
-
+        }
+        else {
+            // Buffer audio when not sending (to catch audio just before VAD triggers)
+            this.audioBuffer.push(base64);
+            // Keep buffer size reasonable (e.g., last 10 chunks ≈ 200ms at 20ms chunks)
+            if (this.audioBuffer.length > 10) {
+                this.audioBuffer.shift(); // Remove oldest chunk
             }
         }
     }
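The pre-speech buffer is a bounded FIFO: while the user is silent, chunks accumulate and the oldest is dropped past ten entries, so roughly the most recent 200 ms (at the ~20 ms chunk size the comment assumes) is flushed to the server once VAD fires. The pattern in isolation:

    const audioBuffer = [];
    function bufferChunk(base64Chunk) {
        audioBuffer.push(base64Chunk);
        if (audioBuffer.length > 10) {
            audioBuffer.shift(); // keep only the newest 10 chunks
        }
    }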
@@ -3854,10 +3980,20 @@ class LayercodeClient {
         }
     }
     async disconnect() {
-
+        // Clean up VAD if it exists
+        if (this.vad) {
+            this.vad.pause();
+            this.vad.destroy();
+            this.vad = null;
+        }
         this.wavRecorder.quit();
         this.wavPlayer.disconnect();
-
+        // Close websocket and ensure status is updated
+        if (this.ws) {
+            this.ws.close();
+            this._setStatus('disconnected');
+            this.options.onDisconnect();
+        }
     }
     /**
      * Gets the microphone MediaStream used by this client
@@ -3866,6 +4002,25 @@ class LayercodeClient {
     getStream() {
         return this.wavRecorder.getStream();
     }
+    /**
+     * Switches the input device for the microphone and restarts recording
+     * @param {string} deviceId - The deviceId of the new microphone
+     */
+    async setInputDevice(deviceId) {
+        if (this.wavRecorder) {
+            try {
+                await this.wavRecorder.end();
+            }
+            catch (e) { }
+            try {
+                await this.wavRecorder.quit();
+            }
+            catch (e) { }
+        }
+        await this.wavRecorder.begin(deviceId);
+        await this.wavRecorder.record(this._handleDataAvailable, 1638);
+        this._setupAmplitudeMonitoring(this.wavRecorder, this.options.onUserAmplitudeChange, (amp) => (this.userAudioAmplitude = amp));
+    }
 }
 
 export { LayercodeClient as default };
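A hedged usage sketch for the new setInputDevice API: enumerate microphones with the standard MediaDevices API and pass a deviceId to the client (the client variable is assumed to be an already-connected LayercodeClient):

    const devices = await navigator.mediaDevices.enumerateDevices();
    const mics = devices.filter((d) => d.kind === 'audioinput');
    await client.setInputDevice(mics[0].deviceId); // recorder is torn down, restarted, and re-monitored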