@layercode/js-sdk 1.0.19 → 1.0.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
```diff
@@ -516,6 +516,24 @@ registerProcessor('stream_processor', StreamProcessor);
     this.isPlaying = false;
   }
 
+  /**
+   * Clears interrupted track IDs to prevent memory leaks
+   * @param {string[]} [keepTrackIds] - Track IDs to keep in the interrupted list
+   */
+  clearInterruptedTracks(keepTrackIds = []) {
+    if (keepTrackIds.length === 0) {
+      this.interruptedTrackIds = {};
+    } else {
+      const newInterruptedTracks = {};
+      for (const trackId of keepTrackIds) {
+        if (this.interruptedTrackIds[trackId]) {
+          newInterruptedTracks[trackId] = true;
+        }
+      }
+      this.interruptedTrackIds = newInterruptedTracks;
+    }
+  }
+
   /**
    * Connects the audio context and enables output to speakers
    * @returns {Promise<true>}
```
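The new `clearInterruptedTracks` method lets callers prune the player's `interruptedTrackIds` map, which previously only ever grew. A minimal standalone sketch of the same keep-list pattern (the function and sample data are illustrative, not SDK API):

```js
// Illustrative version of the pruning logic above. interruptedTrackIds is a
// plain object used as a set: { [trackId]: true }.
function pruneInterruptedTracks(interruptedTrackIds, keepTrackIds = []) {
  if (keepTrackIds.length === 0) return {}; // no keep-list: drop everything
  const kept = {};
  for (const trackId of keepTrackIds) {
    if (interruptedTrackIds[trackId]) kept[trackId] = true;
  }
  return kept;
}

// Keep only the active turn's track:
console.log(pruneInterruptedTracks({ t1: true, t2: true }, ['t2'])); // { t2: true }
```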
```diff
@@ -749,7 +767,7 @@ registerProcessor('stream_processor', StreamProcessor);
       this.analyser.disconnect();
     }
 
-    if (this.context) {
+    if (this.context && this.context.state !== 'closed') {
       this.context.close().catch((err) => console.error("Error closing audio context:", err));
     }
 
```
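Calling `close()` on an `AudioContext` that is already in the `'closed'` state rejects with an `InvalidStateError`, so the added `state` check prevents a spurious console error when teardown runs more than once. The same defensive pattern in isolation (a sketch, not SDK API):

```js
// Only attempt to close a context that is still open or suspended.
async function safeCloseAudioContext(context) {
  if (context && context.state !== 'closed') {
    try {
      await context.close();
    } catch (err) {
      console.error('Error closing audio context:', err);
    }
  }
}
```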
```diff
@@ -3488,6 +3506,7 @@ registerProcessor('audio_processor', AudioProcessor);
       onUserAmplitudeChange: options.onUserAmplitudeChange || (() => { }),
       onAgentAmplitudeChange: options.onAgentAmplitudeChange || (() => { }),
       onStatusChange: options.onStatusChange || (() => { }),
+      onUserIsSpeakingChange: options.onUserIsSpeakingChange || (() => { }),
     };
     this.AMPLITUDE_MONITORING_SAMPLE_RATE = 10;
     this._websocketUrl = 'wss://api.layercode.com/v1/pipelines/websocket';
```
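The options bag gains an `onUserIsSpeakingChange` callback that, like the other handlers, defaults to a no-op. A hypothetical setup showing where it plugs in (all other constructor options are elided; the exact constructor arguments are an assumption):

```js
const client = new LayercodeClient({
  // ...connection/session options elided...
  onUserIsSpeakingChange: (isSpeaking) => {
    // Fired by the VAD handlers whenever userIsSpeaking flips.
    console.log(isSpeaking ? 'user started speaking' : 'user stopped speaking');
  },
});
```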
```diff
@@ -3503,13 +3522,15 @@ registerProcessor('audio_processor', AudioProcessor);
     this.agentAudioAmplitude = 0;
     this.sessionId = options.sessionId || null;
     this.pushToTalkActive = false;
-    this.vadPausedPlayer = false;
     this.pushToTalkEnabled = false;
     this.canInterrupt = false;
     this.userIsSpeaking = false;
     this.endUserTurn = false;
     this.recorderStarted = false;
     this.readySent = false;
+    this.currentTurnText = '';
+    this.currentTurnId = null;
+    this.audioBuffer = [];
     // Bind event handlers
     this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
     this._handleDataAvailable = this._handleDataAvailable.bind(this);
```
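The removed `vadPausedPlayer` flag is no longer needed: the misfire handler below now checks `wavPlayer.isPlaying` directly. The three new fields back the features introduced in the later hunks: `currentTurnId` and `currentTurnText` track the assistant turn for interruption context, and `audioBuffer` holds microphone chunks captured just before VAD triggers.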
```diff
@@ -3524,6 +3545,7 @@ registerProcessor('audio_processor', AudioProcessor);
       console.log('silero vad model timeout');
       // TODO: send message to server to indicate that the vad model timed out
       this.userIsSpeaking = true; // allow audio to be sent to the server
+      this.options.onUserIsSpeakingChange(true);
     }, 2000);
     if (!this.canInterrupt) {
       dist.MicVAD.new({
```
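This keeps UI state consistent with the existing 2-second fallback: if the Silero VAD model fails to load in time, `userIsSpeaking` is forced to `true` so audio still reaches the server, and subscribers of the new callback are now notified of that forced transition as well.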
```diff
@@ -3535,15 +3557,48 @@ registerProcessor('audio_processor', AudioProcessor);
         minSpeechFrames: 15,
         preSpeechPadFrames: 0,
         onSpeechStart: () => {
-
-
-
+          this.userIsSpeaking = true;
+          this.options.onUserIsSpeakingChange(true);
+          console.log('onSpeechStart: sending vad_start');
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_start',
+          });
         },
         onVADMisfire: () => {
+          console.log('onVADMisfire: Short utterance detected, resuming bot');
+          this.audioBuffer = []; // Clear buffer on misfire
           this.userIsSpeaking = false;
+          this.options.onUserIsSpeakingChange(false);
+          // Send vad_end to indicate the short utterance is over
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_end',
+          });
+          // End the user's turn
+          this._wsSend({
+            type: 'trigger.turn.end',
+            role: 'user',
+          });
+          // Resume bot audio if it was playing
+          if (!this.wavPlayer.isPlaying) {
+            console.log('onVADMisfire: Resuming bot audio');
+            this.wavPlayer.play();
+          }
         },
         onSpeechEnd: () => {
-
+          console.log('onSpeechEnd: sending vad_end');
+          this.endUserTurn = true; // Set flag to indicate that the user turn has ended
+          this.audioBuffer = []; // Clear buffer on speech end
+          this.userIsSpeaking = false;
+          this.options.onUserIsSpeakingChange(false);
+          console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
+          // Send vad_end immediately instead of waiting for next audio chunk
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_end',
+          });
+          this.endUserTurn = false; // Reset the flag after sending vad_end
         },
       })
       .then((vad) => {
```
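In the non-interruptible path, the VAD callbacks now report speech boundaries to the server directly instead of leaving the handlers empty. The three wire messages sent by this hunk, with shapes taken verbatim from the diff:

```js
// Sent when speech starts:
const vadStart = { type: 'vad_events', event: 'vad_start' };
// Sent when speech (or a short misfire) ends:
const vadEnd = { type: 'vad_events', event: 'vad_end' };
// Misfire path only: explicitly close the user's turn.
const turnEnd = { type: 'trigger.turn.end', role: 'user' };
```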
```diff
@@ -3565,43 +3620,59 @@ registerProcessor('audio_processor', AudioProcessor);
         positiveSpeechThreshold: 0.3,
         negativeSpeechThreshold: 0.2,
         redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
-        minSpeechFrames:
+        minSpeechFrames: 5,
         preSpeechPadFrames: 0,
         onSpeechStart: () => {
           // Only pause agent audio if it's currently playing
           if (this.wavPlayer.isPlaying) {
             console.log('onSpeechStart: WavPlayer is playing, pausing it.');
             this.wavPlayer.pause();
-            this.vadPausedPlayer = true; // VAD is responsible for this pause
           }
           else {
             console.log('onSpeechStart: WavPlayer is not playing, VAD will not pause.');
           }
-          this.userIsSpeaking = true;
           console.log('onSpeechStart: sending vad_start');
           this._wsSend({
             type: 'vad_events',
             event: 'vad_start',
           });
+          this.userIsSpeaking = true;
+          this.options.onUserIsSpeakingChange(true);
+          this.endUserTurn = false; // Reset endUserTurn when speech starts
+          console.log('onSpeechStart: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
         },
         onVADMisfire: () => {
           // If the speech detected was for less than minSpeechFrames, this is called instead of onSpeechEnd, and we should resume the assistant audio as it was a false interruption. We include a configurable delay so the assistant isn't too quick to start speaking again.
           this.userIsSpeaking = false;
-
-
-
-
-
-
-
-
+          this.audioBuffer = []; // Clear buffer on misfire
+          this.options.onUserIsSpeakingChange(false);
+          // Add the missing delay before resuming to prevent race conditions
+          setTimeout(() => {
+            if (!this.wavPlayer.isPlaying) {
+              console.log('onVADMisfire: Resuming after delay');
+              this.wavPlayer.play();
+              this.userIsSpeaking = true;
+              this.options.onUserIsSpeakingChange(true);
+            }
+            else {
+              console.log('onVADMisfire: Not resuming - either no pause or user speaking again');
+              this.endUserTurn = true;
+            }
+          }, this.options.vadResumeDelay);
         },
         onSpeechEnd: () => {
+          console.log('onSpeechEnd: sending vad_end');
+          this.endUserTurn = true; // Set flag to indicate that the user turn has ended
+          this.audioBuffer = []; // Clear buffer on speech end
           this.userIsSpeaking = false;
+          this.options.onUserIsSpeakingChange(false);
+          console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
+          // Send vad_end immediately instead of waiting for next audio chunk
           this._wsSend({
             type: 'vad_events',
             event: 'vad_end',
           });
+          this.endUserTurn = false; // Reset the flag after sending vad_end
         },
       })
       .then((vad) => {
```
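In the interruptible path, `minSpeechFrames` drops to 5 and the misfire handler now waits `options.vadResumeDelay` before deciding whether to resume assistant audio, so a user who re-engages within the window is not talked over. The decision in isolation (an illustrative harness; only the names mirroring the diff are real):

```js
// Sketch of the delayed-resume decision, assuming a player exposing
// isPlaying and play(), and a state object mirroring the client's flags.
function onVadMisfire(player, state, vadResumeDelayMs) {
  state.userIsSpeaking = false;
  setTimeout(() => {
    if (!player.isPlaying) {
      // False interruption: resume the assistant's audio.
      player.play();
      state.userIsSpeaking = true;
    } else {
      // Playback already resumed, or the user started speaking again.
      state.endUserTurn = true;
    }
  }, vadResumeDelayMs);
}
```

Note the quirk carried over from the diff: the resume branch also flips `userIsSpeaking` back to `true` after resuming playback.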
```diff
@@ -3635,14 +3706,33 @@ registerProcessor('audio_processor', AudioProcessor);
         reason: 'completed',
       });
     }
+    _estimateWordsHeard(text, playbackOffsetSeconds) {
+      const words = text.split(/\s+/).filter((word) => word.length > 0);
+      const totalWords = words.length;
+      // Rough estimation: average speaking rate is ~150 words per minute (2.5 words per second)
+      const estimatedWordsPerSecond = 2.5;
+      const estimatedWordsHeard = Math.min(Math.floor(playbackOffsetSeconds * estimatedWordsPerSecond), totalWords);
+      const textHeard = words.slice(0, estimatedWordsHeard).join(' ');
+      return { wordsHeard: estimatedWordsHeard, textHeard };
+    }
     async _clientInterruptAssistantReplay() {
-      await this.wavPlayer.interrupt();
-
-
-
-
-
-
+      const offsetData = await this.wavPlayer.interrupt();
+      if (offsetData && this.currentTurnText && this.currentTurnId) {
+        const { wordsHeard, textHeard } = this._estimateWordsHeard(this.currentTurnText, offsetData.currentTime);
+        const totalWords = this.currentTurnText.split(/\s+/).filter((word) => word.length > 0).length;
+        console.log(`Interruption detected: ${wordsHeard}/${totalWords} words heard, text: "${textHeard}"`);
+        // Send interruption event with context
+        this._wsSend({
+          type: 'trigger.response.audio.interrupted',
+          playback_offset: offsetData.currentTime,
+          interruption_context: {
+            turn_id: this.currentTurnId,
+            estimated_words_heard: wordsHeard,
+            total_words: totalWords,
+            text_heard: textHeard,
+          },
+        });
+      }
     }
     async triggerUserTurnStarted() {
       if (!this.pushToTalkActive) {
```
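`_estimateWordsHeard` converts the playback offset at the moment of interruption into an estimate of how much of the reply the user actually heard, assuming an average speaking rate of roughly 150 words per minute (2.5 words per second); `_clientInterruptAssistantReplay` then ships that estimate to the server in the `interruption_context` payload. A standalone copy with a worked example:

```js
// Same arithmetic as the method above, extracted for illustration.
function estimateWordsHeard(text, playbackOffsetSeconds) {
  const words = text.split(/\s+/).filter((word) => word.length > 0);
  const estimatedWordsPerSecond = 2.5; // ~150 words per minute
  const wordsHeard = Math.min(
    Math.floor(playbackOffsetSeconds * estimatedWordsPerSecond),
    words.length
  );
  return { wordsHeard, textHeard: words.slice(0, wordsHeard).join(' ') };
}

// Interrupted 2 s in: floor(2 * 2.5) = 5 of 8 words heard.
console.log(estimateWordsHeard('Sure, I can help you with that today', 2));
// -> { wordsHeard: 5, textHeard: 'Sure, I can help you' }
```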
```diff
@@ -3673,20 +3763,44 @@ registerProcessor('audio_processor', AudioProcessor);
         // Sent from the server to this client when a new user turn is detected
         console.log('received turn.start from server');
         console.log(message);
-        if (message.role === '
+        if (message.role === 'assistant') {
+          // Start tracking new assistant turn
+          // Note: Don't reset currentTurnId here - let response.audio set it
+          // This prevents race conditions where text arrives before audio
+          console.log('Assistant turn started, will track new turn ID from audio/text');
+        }
+        else if (message.role === 'user' && !this.pushToTalkEnabled && this.canInterrupt) {
           // Interrupt any playing assistant audio if this is a turn trigged by the server (and not push to talk, which will have already called interrupt)
           console.log('interrupting assistant audio, as user turn has started and pushToTalkEnabled is false');
           await this._clientInterruptAssistantReplay();
         }
-        // if (message.role === 'assistant') {
-        //   // Clear the buffer of audio when the assisatnt starts a new turn, as it may have been paused previously by VAD, leaving some audio frames in the buffer.
-        //   console.log('Clearing audio buffer as assistant turn has started');
-        //   await this._clientInterruptAssistantReplay();
-        // }
         break;
       case 'response.audio':
         const audioBuffer = base64ToArrayBuffer(message.content);
         this.wavPlayer.add16BitPCM(audioBuffer, message.turn_id);
+        // Set current turn ID from first audio message, or update if different turn
+        if (!this.currentTurnId || this.currentTurnId !== message.turn_id) {
+          console.log(`Setting current turn ID to: ${message.turn_id} (was: ${this.currentTurnId})`);
+          const oldTurnId = this.currentTurnId;
+          this.currentTurnId = message.turn_id;
+          this.currentTurnText = ''; // Reset text for new turn
+          // Clean up interrupted tracks, keeping only the current turn
+          this.wavPlayer.clearInterruptedTracks(this.currentTurnId ? [this.currentTurnId] : []);
+        }
+        break;
+      case 'response.text':
+        // Set turn ID from first text message if not set, or accumulate if matches current turn
+        if (!this.currentTurnId || message.turn_id === this.currentTurnId) {
+          if (!this.currentTurnId) {
+            console.log(`Setting current turn ID to: ${message.turn_id} from text message`);
+            this.currentTurnId = message.turn_id;
+            this.currentTurnText = '';
+          }
+          this.currentTurnText += message.content;
+        }
+        else {
+          console.log(`Ignoring text for turn ${message.turn_id}, current turn is ${this.currentTurnId}`);
+        }
         break;
       // case 'response.end':
       //   console.log('received response.end');
```
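All turn tracking is now keyed off `message.turn_id`: `response.audio` establishes or switches the current turn (pruning interrupted tracks via the new `clearInterruptedTracks`), while the new `response.text` case accumulates transcript text only when it matches the current turn. Illustrative payloads for the two message types (field names from the diff; values invented):

```js
// Audio chunk for a turn: base64 PCM plus the owning turn's ID.
const audioMsg = { type: 'response.audio', turn_id: 'turn_42', content: '<base64 pcm>' };
// Text chunk for the same turn: appended to currentTurnText.
const textMsg = { type: 'response.text', turn_id: 'turn_42', content: 'Hello, how can ' };
```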
```diff
@@ -3715,17 +3829,29 @@ registerProcessor('audio_processor', AudioProcessor);
     const base64 = arrayBufferToBase64(data.mono);
     const sendAudio = this.pushToTalkEnabled ? this.pushToTalkActive : this.userIsSpeaking;
     if (sendAudio) {
+      // If we have buffered audio, send it first
+      if (this.audioBuffer.length > 0) {
+        console.log(`Sending ${this.audioBuffer.length} buffered audio chunks`);
+        for (const bufferedAudio of this.audioBuffer) {
+          this._wsSend({
+            type: 'client.audio',
+            content: bufferedAudio,
+          });
+        }
+        this.audioBuffer = []; // Clear the buffer after sending
+      }
+      // Send the current audio
       this._wsSend({
         type: 'client.audio',
         content: base64,
       });
-
-
-
-
-
-
-
+    }
+    else {
+      // Buffer audio when not sending (to catch audio just before VAD triggers)
+      this.audioBuffer.push(base64);
+      // Keep buffer size reasonable (e.g., last 10 chunks ≈ 200ms at 20ms chunks)
+      if (this.audioBuffer.length > 10) {
+        this.audioBuffer.shift(); // Remove oldest chunk
       }
     }
   }
```
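While the user is silent, microphone chunks are parked in a bounded FIFO (at most 10 chunks, about 200 ms at 20 ms per chunk) and flushed ahead of the live chunk once sending resumes, so speech onsets clipped by VAD latency still reach the server in order. The buffering pattern in isolation (illustrative harness, not SDK API):

```js
const MAX_BUFFERED_CHUNKS = 10; // ~200 ms at 20 ms per chunk

function handleAudioChunk(state, base64Chunk, sendAudio, send) {
  if (sendAudio) {
    // Flush chunks captured just before VAD triggered, oldest first...
    for (const buffered of state.audioBuffer) {
      send({ type: 'client.audio', content: buffered });
    }
    state.audioBuffer = [];
    // ...then the live chunk.
    send({ type: 'client.audio', content: base64Chunk });
  } else {
    // Park the chunk; drop the oldest once the cap is reached.
    state.audioBuffer.push(base64Chunk);
    if (state.audioBuffer.length > MAX_BUFFERED_CHUNKS) state.audioBuffer.shift();
  }
}
```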
```diff
@@ -3860,10 +3986,20 @@ registerProcessor('audio_processor', AudioProcessor);
       }
     }
     async disconnect() {
-
+      // Clean up VAD if it exists
+      if (this.vad) {
+        this.vad.pause();
+        this.vad.destroy();
+        this.vad = null;
+      }
       this.wavRecorder.quit();
       this.wavPlayer.disconnect();
-
+      // Close websocket and ensure status is updated
+      if (this.ws) {
+        this.ws.close();
+        this._setStatus('disconnected');
+        this.options.onDisconnect();
+      }
     }
     /**
      * Gets the microphone MediaStream used by this client
```
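`disconnect()` now tears down in dependency order: the VAD is paused and destroyed first so no callbacks fire into a half-dead client, then the recorder and player are shut down, and finally the websocket is closed with the status explicitly set to `'disconnected'` and `onDisconnect` invoked so callers always observe the state change.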
```diff
@@ -3872,6 +4008,25 @@ registerProcessor('audio_processor', AudioProcessor);
     getStream() {
       return this.wavRecorder.getStream();
     }
+    /**
+     * Switches the input device for the microphone and restarts recording
+     * @param {string} deviceId - The deviceId of the new microphone
+     */
+    async setInputDevice(deviceId) {
+      if (this.wavRecorder) {
+        try {
+          await this.wavRecorder.end();
+        }
+        catch (e) { }
+        try {
+          await this.wavRecorder.quit();
+        }
+        catch (e) { }
+      }
+      await this.wavRecorder.begin(deviceId);
+      await this.wavRecorder.record(this._handleDataAvailable, 1638);
+      this._setupAmplitudeMonitoring(this.wavRecorder, this.options.onUserAmplitudeChange, (amp) => (this.userAudioAmplitude = amp));
+    }
   }
 
   return LayercodeClient;
```