@layercode/js-sdk 1.0.21 → 1.0.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -516,6 +516,24 @@ registerProcessor('stream_processor', StreamProcessor);
     this.isPlaying = false;
   }

+  /**
+   * Clears interrupted track IDs to prevent memory leaks
+   * @param {string[]} [keepTrackIds] - Track IDs to keep in the interrupted list
+   */
+  clearInterruptedTracks(keepTrackIds = []) {
+    if (keepTrackIds.length === 0) {
+      this.interruptedTrackIds = {};
+    } else {
+      const newInterruptedTracks = {};
+      for (const trackId of keepTrackIds) {
+        if (this.interruptedTrackIds[trackId]) {
+          newInterruptedTracks[trackId] = true;
+        }
+      }
+      this.interruptedTrackIds = newInterruptedTracks;
+    }
+  }
+
   /**
    * Connects the audio context and enables output to speakers
    * @returns {Promise<true>}
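Note: `clearInterruptedTracks` exists to keep `interruptedTrackIds` bounded; without it the map gains one key per interrupted turn for the lifetime of the player. A minimal usage sketch (the `player` variable and the track ID are illustrative, not from the diff):

    player.clearInterruptedTracks(['turn_abc123']); // keep only the turn still in flight
    player.clearInterruptedTracks();                // or wipe the map entirely

The diff's own caller appears in the `response.audio` handler further down, which prunes the map to the current turn ID whenever a new turn's audio arrives.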
@@ -749,7 +767,7 @@ registerProcessor('stream_processor', StreamProcessor);
       this.analyser.disconnect();
     }

-    if (this.context) {
+    if (this.context && this.context.state !== 'closed') {
       this.context.close().catch((err) => console.error("Error closing audio context:", err));
     }

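Note: per the Web Audio spec, `AudioContext.close()` rejects with an `InvalidStateError` when the context is already in the `'closed'` state, so the added `state !== 'closed'` guard makes repeated disconnects idempotent. Roughly:

    const ctx = new AudioContext();
    await ctx.close(); // state becomes 'closed'
    ctx.close();       // without the guard: rejected promise, logged as an error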
@@ -3504,17 +3522,59 @@ registerProcessor('audio_processor', AudioProcessor);
     this.agentAudioAmplitude = 0;
     this.sessionId = options.sessionId || null;
     this.pushToTalkActive = false;
-    this.vadPausedPlayer = false;
     this.pushToTalkEnabled = false;
     this.canInterrupt = false;
     this.userIsSpeaking = false;
     this.endUserTurn = false;
     this.recorderStarted = false;
     this.readySent = false;
+    this.currentTurnId = null;
+    this.audioBuffer = [];
+    this.audioPauseTime = null;
     // Bind event handlers
     this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
     this._handleDataAvailable = this._handleDataAvailable.bind(this);
   }
+  _setupAmplitudeBasedVAD() {
+    let isSpeakingByAmplitude = false;
+    let silenceFrames = 0;
+    const AMPLITUDE_THRESHOLD = 0.01; // Adjust based on testing
+    const SILENCE_FRAMES_THRESHOLD = 30; // ~600ms at 20ms chunks
+    // Monitor amplitude changes
+    this.wavRecorder.startAmplitudeMonitoring((amplitude) => {
+      const wasSpeaking = isSpeakingByAmplitude;
+      if (amplitude > AMPLITUDE_THRESHOLD) {
+        silenceFrames = 0;
+        if (!wasSpeaking) {
+          // Speech started - pause audio if playing and track timing for interruption calculation
+          if (this.canInterrupt && this.wavPlayer.isPlaying) {
+            this.audioPauseTime = Date.now();
+            this.wavPlayer.pause();
+          }
+          isSpeakingByAmplitude = true;
+          this.userIsSpeaking = true;
+          this.options.onUserIsSpeakingChange(true);
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_start',
+          });
+        }
+      }
+      else {
+        silenceFrames++;
+        if (wasSpeaking && silenceFrames >= SILENCE_FRAMES_THRESHOLD) {
+          // Speech ended
+          isSpeakingByAmplitude = false;
+          this.userIsSpeaking = false;
+          this.options.onUserIsSpeakingChange(false);
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_end',
+          });
+        }
+      }
+    });
+  }
   _initializeVAD() {
     console.log('initializing VAD', { pushToTalkEnabled: this.pushToTalkEnabled, canInterrupt: this.canInterrupt });
     // If we're in push to talk mode, we don't need to use the VAD model
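Note: the amplitude fallback is plain hysteresis: a single frame above 0.01 opens the user's turn, and only 30 consecutive quiet frames (≈600ms at the 20ms chunk size the comment assumes) close it. The same logic as a standalone sketch, decoupled from the recorder:

    // Hysteresis sketch; thresholds match the diff, the function itself is illustrative.
    function makeAmplitudeVad(onChange, threshold = 0.01, endAfterFrames = 30) {
      let speaking = false;
      let silent = 0;
      return (amplitude) => {
        if (amplitude > threshold) {
          silent = 0;
          if (!speaking) { speaking = true; onChange(true); } // first loud frame starts the turn
        } else if (speaking && ++silent >= endAfterFrames) {  // ~600ms of quiet ends it
          speaking = false; onChange(false);
        }
      };
    }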
@@ -3523,9 +3583,17 @@ registerProcessor('audio_processor', AudioProcessor);
     }
     const timeout = setTimeout(() => {
       console.log('silero vad model timeout');
-
-
-      this.
+      console.warn('VAD model failed to load - falling back to amplitude-based detection');
+      // Send a message to server indicating VAD failure
+      this._wsSend({
+        type: 'vad_events',
+        event: 'vad_model_failed',
+      });
+      // In automatic mode without VAD, allow the bot to speak initially
+      this.userIsSpeaking = false;
+      this.options.onUserIsSpeakingChange(false);
+      // Set up amplitude-based fallback detection
+      this._setupAmplitudeBasedVAD();
     }, 2000);
     if (!this.canInterrupt) {
       dist.MicVAD.new({
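Note: this 2-second timer is a one-shot fallback: if the Silero model hasn't loaded in time, the client reports `vad_model_failed` upstream and degrades to the amplitude detector above. The success path shown in the hunks below is only `.then((vad) => {`; presumably it cancels the fallback, along these lines (the handler body is an assumption, not visible in the diff):

    dist.MicVAD.new({ /* options */ })
      .then((vad) => {
        clearTimeout(timeout); // assumed: stop the fallback once the model is live
        this.vad = vad;        // `this.vad` itself is real - disconnect() tears it down below
      });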
@@ -3534,20 +3602,30 @@ registerProcessor('audio_processor', AudioProcessor);
         positiveSpeechThreshold: 0.3,
         negativeSpeechThreshold: 0.2,
         redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
-        minSpeechFrames:
+        minSpeechFrames: 0,
         preSpeechPadFrames: 0,
         onSpeechStart: () => {
-
-
-
-
+          this.userIsSpeaking = true;
+          this.options.onUserIsSpeakingChange(true);
+          console.log('onSpeechStart: sending vad_start');
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_start',
+          });
         },
-
+        onSpeechEnd: () => {
+          console.log('onSpeechEnd: sending vad_end');
+          this.endUserTurn = true; // Set flag to indicate that the user turn has ended
+          this.audioBuffer = []; // Clear buffer on speech end
           this.userIsSpeaking = false;
           this.options.onUserIsSpeakingChange(false);
-
-
-          this.
+          console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
+          // Send vad_end immediately instead of waiting for next audio chunk
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_end',
+          });
+          this.endUserTurn = false; // Reset the flag after sending vad_end
         },
       })
       .then((vad) => {
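Note: both MicVAD configurations now pin `minSpeechFrames` explicitly (the old values are truncated in this diff's rendering). Here, in the non-interruptible branch, it is 0, so any detected speech fires `onSpeechStart` immediately; in the interruptible branch below it is 5, so a burst shorter than five frames routes to `onVADMisfire` and resumes the agent instead of cutting it off.

    // The contrast, other options elided:
    // non-interruptible: { minSpeechFrames: 0, ... } // any speech counts
    // interruptible:     { minSpeechFrames: 5, ... } // short blips -> onVADMisfire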
@@ -3569,41 +3647,59 @@ registerProcessor('audio_processor', AudioProcessor);
         positiveSpeechThreshold: 0.3,
         negativeSpeechThreshold: 0.2,
         redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
-        minSpeechFrames:
+        minSpeechFrames: 5,
         preSpeechPadFrames: 0,
         onSpeechStart: () => {
           // Only pause agent audio if it's currently playing
           if (this.wavPlayer.isPlaying) {
             console.log('onSpeechStart: WavPlayer is playing, pausing it.');
+            this.audioPauseTime = Date.now(); // Track when we paused
             this.wavPlayer.pause();
-            this.vadPausedPlayer = true; // VAD is responsible for this pause
           }
           else {
             console.log('onSpeechStart: WavPlayer is not playing, VAD will not pause.');
           }
-          this.userIsSpeaking = true;
-          this.options.onUserIsSpeakingChange(true);
           console.log('onSpeechStart: sending vad_start');
           this._wsSend({
             type: 'vad_events',
             event: 'vad_start',
           });
+          this.userIsSpeaking = true;
+          this.options.onUserIsSpeakingChange(true);
+          this.endUserTurn = false; // Reset endUserTurn when speech starts
+          console.log('onSpeechStart: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
         },
         onVADMisfire: () => {
           // If the speech detected was for less than minSpeechFrames, this is called instead of onSpeechEnd, and we should resume the assistant audio as it was a false interruption. We include a configurable delay so the assistant isn't too quick to start speaking again.
           this.userIsSpeaking = false;
+          this.audioBuffer = []; // Clear buffer on misfire
           this.options.onUserIsSpeakingChange(false);
-
-
-          this.wavPlayer.
-
-
-
-
-
+          // Add the missing delay before resuming to prevent race conditions
+          setTimeout(() => {
+            if (!this.wavPlayer.isPlaying) {
+              console.log('onVADMisfire: Resuming after delay');
+              this.audioPauseTime = null; // Clear pause time since we're resuming
+              this.wavPlayer.play();
+            }
+            else {
+              console.log('onVADMisfire: Not resuming - either no pause or user speaking again');
+              this.endUserTurn = true;
+            }
+          }, this.options.vadResumeDelay);
         },
         onSpeechEnd: () => {
-
+          console.log('onSpeechEnd: sending vad_end');
+          this.endUserTurn = true; // Set flag to indicate that the user turn has ended
+          this.audioBuffer = []; // Clear buffer on speech end
+          this.userIsSpeaking = false;
+          this.options.onUserIsSpeakingChange(false);
+          console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
+          // Send vad_end immediately instead of waiting for next audio chunk
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_end',
+          });
+          this.endUserTurn = false; // Reset the flag after sending vad_end
         },
       })
       .then((vad) => {
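Note: `onVADMisfire` deliberately waits `this.options.vadResumeDelay` before touching playback, so a genuine `onSpeechStart` firing during the window wins the race; by the time the timer runs, either playback has already restarted elsewhere (skip) or the pause really was a false alarm (resume). The pattern in isolation, assuming a player with `isPlaying`/`play()` as in the diff:

    function resumeAfterMisfire(player, delayMs) {
      setTimeout(() => {
        if (!player.isPlaying) {
          player.play(); // false alarm: hand the audio back to the agent
        }
        // else: something else already took over playback; leave it alone
      }, delayMs);
    }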
@@ -3638,13 +3734,36 @@ registerProcessor('audio_processor', AudioProcessor);
     });
   }
   async _clientInterruptAssistantReplay() {
-    await this.wavPlayer.interrupt();
-
-
-
-
-
-
+    const offsetData = await this.wavPlayer.interrupt();
+    if (offsetData && this.currentTurnId) {
+      let offsetMs = offsetData.currentTime * 1000;
+      // Calculate accurate offset by subtracting pause time if audio was paused for VAD
+      if (this.audioPauseTime) {
+        const pauseDurationMs = Date.now() - this.audioPauseTime;
+        const adjustedOffsetMs = Math.max(0, offsetMs - pauseDurationMs);
+        console.log(`Interruption detected: Raw offset ${offsetMs}ms, pause duration ${pauseDurationMs}ms, adjusted offset ${adjustedOffsetMs}ms for turn ${this.currentTurnId}`);
+        offsetMs = adjustedOffsetMs;
+        this.audioPauseTime = null; // Clear the pause time
+      }
+      else {
+        console.log(`Interruption detected: ${offsetMs}ms offset for turn ${this.currentTurnId} (no pause adjustment needed)`);
+      }
+      // Send interruption event with accurate playback offset in milliseconds
+      this._wsSend({
+        type: 'trigger.response.audio.interrupted',
+        playback_offset: offsetMs,
+        interruption_context: {
+          turn_id: this.currentTurnId,
+          playback_offset_ms: offsetMs,
+        },
+      });
+    }
+    else {
+      console.warn('Interruption requested but missing required data:', {
+        hasOffsetData: !!offsetData,
+        hasTurnId: !!this.currentTurnId,
+      });
+    }
   }
   async triggerUserTurnStarted() {
     if (!this.pushToTalkActive) {
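Note: `playback_offset` answers "how much of this turn did the user actually hear". `wavPlayer.interrupt()` reports a playback position, but when VAD paused the player before the interrupt was finalized, the wall-clock pause duration has to come off that figure. Worked numbers (illustrative):

    const offsetMs = 4200;        // position reported at interrupt (ms)
    const pauseDurationMs = 1500; // Date.now() - this.audioPauseTime
    Math.max(0, offsetMs - pauseDurationMs); // 2700ms is sent as playback_offset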
@@ -3675,24 +3794,38 @@ registerProcessor('audio_processor', AudioProcessor);
         // Sent from the server to this client when a new user turn is detected
         console.log('received turn.start from server');
         console.log(message);
-        if (message.role === '
+        if (message.role === 'assistant') {
+          // Start tracking new assistant turn
+          // Note: Don't reset currentTurnId here - let response.audio set it
+          // This prevents race conditions where text arrives before audio
+          console.log('Assistant turn started, will track new turn ID from audio/text');
+        }
+        else if (message.role === 'user' && !this.pushToTalkEnabled && this.canInterrupt) {
           // Interrupt any playing assistant audio if this is a turn triggered by the server (and not push to talk, which will have already called interrupt)
           console.log('interrupting assistant audio, as user turn has started and pushToTalkEnabled is false');
           await this._clientInterruptAssistantReplay();
         }
-        // if (message.role === 'assistant') {
-        //   // Clear the buffer of audio when the assistant starts a new turn, as it may have been paused previously by VAD, leaving some audio frames in the buffer.
-        //   console.log('Clearing audio buffer as assistant turn has started');
-        //   await this._clientInterruptAssistantReplay();
-        // }
         break;
       case 'response.audio':
         const audioBuffer = base64ToArrayBuffer(message.content);
         this.wavPlayer.add16BitPCM(audioBuffer, message.turn_id);
+        // Set current turn ID from first audio message, or update if different turn
+        if (!this.currentTurnId || this.currentTurnId !== message.turn_id) {
+          console.log(`Setting current turn ID to: ${message.turn_id} (was: ${this.currentTurnId})`);
+          this.currentTurnId = message.turn_id;
+          // Clean up interrupted tracks, keeping only the current turn
+          this.wavPlayer.clearInterruptedTracks(this.currentTurnId ? [this.currentTurnId] : []);
+        }
         break;
-
-
-
+      case 'response.text': {
+        // Set turn ID from first text message if not set
+        if (!this.currentTurnId) {
+          this.currentTurnId = message.turn_id;
+          console.log(`Setting current turn ID to: ${message.turn_id} from text message`);
+        }
+        // Note: We no longer track text content in the client - the pipeline handles interruption estimation
+        break;
+      }
       case 'response.data':
         console.log('received response.data', message);
         this.options.onDataMessage(message);
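Note: turn-ID ownership follows the media, not the turn envelope: `turn.start` for the assistant deliberately leaves `currentTurnId` untouched, `response.audio` always claims it (and prunes `interruptedTrackIds` to the live turn), and `response.text` only fills it in while it is still null. The precedence, condensed from the handler above as a sketch:

    switch (message.type) {
      case 'response.audio':
        if (this.currentTurnId !== message.turn_id) {
          this.currentTurnId = message.turn_id;            // audio always wins
          this.wavPlayer.clearInterruptedTracks([message.turn_id]);
        }
        break;
      case 'response.text':
        this.currentTurnId ??= message.turn_id;            // text only fills a gap
        break;
    }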
@@ -3717,18 +3850,29 @@ registerProcessor('audio_processor', AudioProcessor);
     const base64 = arrayBufferToBase64(data.mono);
     const sendAudio = this.pushToTalkEnabled ? this.pushToTalkActive : this.userIsSpeaking;
     if (sendAudio) {
+      // If we have buffered audio, send it first
+      if (this.audioBuffer.length > 0) {
+        console.log(`Sending ${this.audioBuffer.length} buffered audio chunks`);
+        for (const bufferedAudio of this.audioBuffer) {
+          this._wsSend({
+            type: 'client.audio',
+            content: bufferedAudio,
+          });
+        }
+        this.audioBuffer = []; // Clear the buffer after sending
+      }
+      // Send the current audio
       this._wsSend({
         type: 'client.audio',
         content: base64,
       });
-
-
-
-
-
-
-
-      });
+    }
+    else {
+      // Buffer audio when not sending (to catch audio just before VAD triggers)
+      this.audioBuffer.push(base64);
+      // Keep buffer size reasonable (e.g., last 10 chunks ≈ 200ms at 20ms chunks)
+      if (this.audioBuffer.length > 10) {
+        this.audioBuffer.shift(); // Remove oldest chunk
       }
     }
   }
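Note: the new `else` branch is a small ring buffer: while the user is deemed silent, the last 10 encoded chunks (≈200ms at 20ms per chunk, per the comment) are retained, and on the next speech onset they are flushed ahead of the live chunk, so the audio that triggered the VAD in the first place is not lost. The buffer discipline in isolation:

    const MAX_BUFFERED_CHUNKS = 10; // ~200ms of 20ms chunks
    function bufferChunk(buffer, chunk) {
      buffer.push(chunk);
      if (buffer.length > MAX_BUFFERED_CHUNKS) buffer.shift(); // drop the oldest
    }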
@@ -3785,6 +3929,8 @@ registerProcessor('audio_processor', AudioProcessor);
   async connect() {
     try {
       this._setStatus('connecting');
+      // Reset turn tracking for clean start
+      this._resetTurnTracking();
       // Get session key from server
       let authorizeSessionRequestBody = {
         pipeline_id: this.options.pipelineId,
@@ -3862,11 +4008,27 @@ registerProcessor('audio_processor', AudioProcessor);
       throw error;
     }
   }
+  _resetTurnTracking() {
+    this.currentTurnId = null;
+    console.log('Reset turn tracking state');
+  }
   async disconnect() {
-
+    // Clean up VAD if it exists
+    if (this.vad) {
+      this.vad.pause();
+      this.vad.destroy();
+      this.vad = null;
+    }
     this.wavRecorder.quit();
     this.wavPlayer.disconnect();
-
+    // Reset turn tracking
+    this._resetTurnTracking();
+    // Close websocket and ensure status is updated
+    if (this.ws) {
+      this.ws.close();
+      this._setStatus('disconnected');
+      this.options.onDisconnect();
+    }
   }
   /**
    * Gets the microphone MediaStream used by this client
|