@layercode/js-sdk 1.0.24 → 1.0.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3519,12 +3519,11 @@ class LayercodeClient {
         this.pushToTalkEnabled = false;
         this.canInterrupt = false;
         this.userIsSpeaking = false;
-        this.endUserTurn = false;
         this.recorderStarted = false;
         this.readySent = false;
         this.currentTurnId = null;
         this.audioBuffer = [];
-        this.audioPauseTime = null;
+        // this.audioPauseTime = null;
         // Bind event handlers
         this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
         this._handleDataAvailable = this._handleDataAvailable.bind(this);
@@ -3533,18 +3532,13 @@ class LayercodeClient {
         let isSpeakingByAmplitude = false;
         let silenceFrames = 0;
         const AMPLITUDE_THRESHOLD = 0.01; // Adjust based on testing
-        const SILENCE_FRAMES_THRESHOLD =
+        const SILENCE_FRAMES_THRESHOLD = 6.4; // 6.4 * 20ms chunks = 128ms silence. Same as Silero ((frame samples: 512 / sampleRate: 16000) * 1000 * redemptionFrames: 4) = 128 ms silence
         // Monitor amplitude changes
         this.wavRecorder.startAmplitudeMonitoring((amplitude) => {
             const wasSpeaking = isSpeakingByAmplitude;
             if (amplitude > AMPLITUDE_THRESHOLD) {
                 silenceFrames = 0;
                 if (!wasSpeaking) {
-                    // Speech started - pause audio if playing and track timing for interruption calculation
-                    if (this.canInterrupt && this.wavPlayer.isPlaying) {
-                        this.audioPauseTime = Date.now();
-                        this.wavPlayer.pause();
-                    }
                     isSpeakingByAmplitude = true;
                     this.userIsSpeaking = true;
                     this.options.onUserIsSpeakingChange(true);
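
The 6.4-frame threshold is derived so the amplitude fallback uses the same 128 ms hang-over as the Silero VAD configured further down in this diff. A minimal sketch of the arithmetic in the comment above (the 20 ms chunk size and 16 kHz sample rate are taken from that comment):

    // Silero v5 consumes 512-sample frames at 16 kHz, i.e. 32 ms per frame.
    // Four redemption frames therefore equal 128 ms of trailing silence.
    const sileroSilenceMs = (512 / 16000) * 1000 * 4; // 128 ms

    // The amplitude monitor fires once per 20 ms audio chunk, so express
    // the same window in chunks:
    const SILENCE_FRAMES_THRESHOLD = sileroSilenceMs / 20; // 6.4 frames
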
@@ -3557,7 +3551,6 @@ class LayercodeClient {
             else {
                 silenceFrames++;
                 if (wasSpeaking && silenceFrames >= SILENCE_FRAMES_THRESHOLD) {
-                    // Speech ended
                     isSpeakingByAmplitude = false;
                     this.userIsSpeaking = false;
                     this.options.onUserIsSpeakingChange(false);
@@ -3575,7 +3568,7 @@ class LayercodeClient {
         if (this.pushToTalkEnabled) {
             return;
         }
-        const timeout = setTimeout(() => {
+        const vadLoadTimeout = setTimeout(() => {
             console.log('silero vad model timeout');
             console.warn('VAD model failed to load - falling back to amplitude-based detection');
             // Send a message to server indicating VAD failure
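
The renamed vadLoadTimeout is a load-deadline fallback: if the Silero model has not initialized within 2 s, the client reports vad_model_failed and switches to amplitude-based detection, while a successful load cancels the timer (see clearTimeout(vadLoadTimeout) in the next hunk). A minimal sketch of the pattern, with loadModel and useFallback as hypothetical stand-ins:

    // Race the model load against a fixed deadline.
    const vadLoadTimeout = setTimeout(() => useFallback(), 2000); // deadline hit: fall back
    loadModel()
        .then((model) => {
            clearTimeout(vadLoadTimeout); // loaded in time: cancel the fallback path
            model.start();
        })
        .catch((error) => console.error('Error initializing VAD:', error));
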
@@ -3583,134 +3576,54 @@ class LayercodeClient {
                 type: 'vad_events',
                 event: 'vad_model_failed',
             });
-            // In automatic mode without VAD, allow the bot to speak initially
-            this.userIsSpeaking = false;
-            this.options.onUserIsSpeakingChange(false);
             // Set up amplitude-based fallback detection
             this._setupAmplitudeBasedVAD();
         }, 2000);
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        });
-        }
-        else {
-            dist.MicVAD.new({
-                stream: this.wavRecorder.getStream() || undefined,
-                model: 'v5',
-                // baseAssetPath: '/', // Use if bundling model locally
-                // onnxWASMBasePath: '/', // Use if bundling model locally
-                positiveSpeechThreshold: 0.5,
-                negativeSpeechThreshold: 0.3,
-                redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
-                minSpeechFrames: 25,
-                preSpeechPadFrames: 0,
-                onSpeechStart: () => {
-                    // Only pause agent audio if it's currently playing
-                    if (this.wavPlayer.isPlaying) {
-                        console.log('onSpeechStart: WavPlayer is playing, pausing it.');
-                        this.audioPauseTime = Date.now(); // Track when we paused
-                        this.wavPlayer.pause();
-                    }
-                    else {
-                        console.log('onSpeechStart: WavPlayer is not playing, VAD will not pause.');
-                    }
-                    console.log('onSpeechStart: sending vad_start');
-                    this._wsSend({
-                        type: 'vad_events',
-                        event: 'vad_start',
-                    });
-                    this.userIsSpeaking = true;
-                    this.options.onUserIsSpeakingChange(true);
-                    this.endUserTurn = false; // Reset endUserTurn when speech starts
-                    console.log('onSpeechStart: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
-                },
-                onVADMisfire: () => {
-                    // If the speech detected was for less than minSpeechFrames, this is called instead of onSpeechEnd, and we should resume the assistant audio as it was a false interruption. We include a configurable delay so the assistant isn't too quick to start speaking again.
-                    this.userIsSpeaking = false;
-                    this.audioBuffer = []; // Clear buffer on misfire
-                    this.options.onUserIsSpeakingChange(false);
-                    // Add the missing delay before resuming to prevent race conditions
-                    setTimeout(() => {
-                        if (!this.wavPlayer.isPlaying) {
-                            console.log('onVADMisfire: Resuming after delay');
-                            this.audioPauseTime = null; // Clear pause time since we're resuming
-                            this.wavPlayer.play();
-                        }
-                        else {
-                            console.log('onVADMisfire: Not resuming - either no pause or user speaking again');
-                            this.endUserTurn = true;
-                        }
-                    }, this.options.vadResumeDelay);
-                },
-                onSpeechEnd: () => {
-                    console.log('onSpeechEnd: sending vad_end');
-                    this.endUserTurn = true; // Set flag to indicate that the user turn has ended
-                    this.audioBuffer = []; // Clear buffer on speech end
-                    this.userIsSpeaking = false;
-                    this.options.onUserIsSpeakingChange(false);
-                    console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
-                    // Send vad_end immediately instead of waiting for next audio chunk
-                    this._wsSend({
-                        type: 'vad_events',
-                        event: 'vad_end',
-                    });
-                    this.endUserTurn = false; // Reset the flag after sending vad_end
-                },
-            })
-                .then((vad) => {
-                    clearTimeout(timeout);
-                    this.vad = vad;
-                    this.vad.start();
-                    console.log('VAD started');
-                })
-                .catch((error) => {
-                    console.error('Error initializing VAD:', error);
-                });
-        }
+        dist.MicVAD.new({
+            stream: this.wavRecorder.getStream() || undefined,
+            model: 'v5',
+            positiveSpeechThreshold: 0.15,
+            negativeSpeechThreshold: 0.05,
+            redemptionFrames: 4,
+            minSpeechFrames: 2,
+            preSpeechPadFrames: 0,
+            frameSamples: 512, // Required for v5 as per https://docs.vad.ricky0123.com/user-guide/algorithm/#configuration
+            onSpeechStart: () => {
+                console.log('onSpeechStart: sending vad_start');
+                this.userIsSpeaking = true;
+                this.options.onUserIsSpeakingChange(true);
+                this._wsSend({
+                    type: 'vad_events',
+                    event: 'vad_start',
+                });
+            },
+            onSpeechEnd: () => {
+                console.log('onSpeechEnd: sending vad_end');
+                this.userIsSpeaking = false;
+                this.options.onUserIsSpeakingChange(false);
+                this.audioBuffer = []; // Clear buffer on speech end
+                this._wsSend({
+                    type: 'vad_events',
+                    event: 'vad_end',
+                });
+            },
+            // onVADMisfire: () => {
+            //     // If the speech detected was for less than minSpeechFrames, this is called instead of onSpeechEnd.
+            // },
+        })
+            .then((vad) => {
+                clearTimeout(vadLoadTimeout);
+                this.vad = vad;
+                this.vad.start();
+                console.log('VAD started');
+            })
+            .catch((error) => {
+                console.error('Error initializing VAD:', error);
+            });
     }
     /**
      * Updates the connection status and triggers the callback
      * @param {string} status - New status value
-     * @private
      */
     _setStatus(status) {
         this.status = status;
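
At Silero's 16 kHz input rate, frameSamples: 512 means each VAD frame spans 32 ms, so the retuned settings above replace the old 25-frame windows (roughly 800 ms at the same frame size) with much faster turn detection. A worked conversion of the new values (frame length per the linked vad.ricky0123.com docs):

    const frameMs = (512 / 16000) * 1000; // 32 ms per frame
    const endOfTurnMs = 4 * frameMs;      // redemptionFrames: 4 -> 128 ms of silence ends speech
    const minSpeechMs = 2 * frameMs;      // minSpeechFrames: 2  -> 64 ms of audio counts as speech
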
@@ -3718,7 +3631,6 @@ class LayercodeClient {
     }
     /**
      * Handles when agent audio finishes playing
-     * @private
      */
     _clientResponseAudioReplayFinished() {
         console.log('clientResponseAudioReplayFinished');
@@ -3731,17 +3643,6 @@ class LayercodeClient {
         const offsetData = await this.wavPlayer.interrupt();
         if (offsetData && this.currentTurnId) {
             let offsetMs = offsetData.currentTime * 1000;
-            // Calculate accurate offset by subtracting pause time if audio was paused for VAD
-            if (this.audioPauseTime) {
-                const pauseDurationMs = Date.now() - this.audioPauseTime;
-                const adjustedOffsetMs = Math.max(0, offsetMs - pauseDurationMs);
-                console.log(`Interruption detected: Raw offset ${offsetMs}ms, pause duration ${pauseDurationMs}ms, adjusted offset ${adjustedOffsetMs}ms for turn ${this.currentTurnId}`);
-                offsetMs = adjustedOffsetMs;
-                this.audioPauseTime = null; // Clear the pause time
-            }
-            else {
-                console.log(`Interruption detected: ${offsetMs}ms offset for turn ${this.currentTurnId} (no pause adjustment needed)`);
-            }
             // Send interruption event with accurate playback offset in milliseconds
             this._wsSend({
                 type: 'trigger.response.audio.interrupted',
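
Because 1.0.26 no longer pauses assistant playback when user speech starts (audioPauseTime is commented out in the constructor), the pause compensation above is dead code in the new version, and the interrupted event now reports the raw playback offset. For reference, a sketch of what the removed branch computed:

    // Removed in 1.0.26: compensate the reported offset for time spent paused.
    const pauseDurationMs = Date.now() - this.audioPauseTime; // wall-clock pause length
    offsetMs = Math.max(0, offsetMs - pauseDurationMs);       // clamp so it never goes negative
    this.audioPauseTime = null;                               // reset for the next interruption
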
@@ -3775,7 +3676,6 @@ class LayercodeClient {
     /**
      * Handles incoming WebSocket messages
      * @param {MessageEvent} event - The WebSocket message event
-     * @private
      */
     async _handleWebSocketMessage(event) {
         try {
@@ -3790,12 +3690,10 @@ class LayercodeClient {
                     console.log(message);
                     if (message.role === 'assistant') {
                         // Start tracking new assistant turn
-                        // Note: Don't reset currentTurnId here - let response.audio set it
-                        // This prevents race conditions where text arrives before audio
                         console.log('Assistant turn started, will track new turn ID from audio/text');
                     }
-                    else if (message.role === 'user' && !this.pushToTalkEnabled
-                        // Interrupt any playing assistant audio if this is a turn
+                    else if (message.role === 'user' && !this.pushToTalkEnabled) {
+                        // Interrupt any playing assistant audio if this is a turn triggered by the server (and not push to talk, which will have already called interrupt)
                         console.log('interrupting assistant audio, as user turn has started and pushToTalkEnabled is false');
                         await this._clientInterruptAssistantReplay();
                     }
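
In the rewritten branch, a server-detected user turn interrupts any assistant audio that is still playing, while push-to-talk clients will already have interrupted before this message arrives. A minimal sketch of the resulting routing (assuming, as the surrounding hunks suggest, these branches live in the message-type switch inside _handleWebSocketMessage):

    if (message.role === 'assistant') {
        // New assistant turn: the turn ID is set later by response.audio / response.text.
    } else if (message.role === 'user' && !this.pushToTalkEnabled) {
        // Server-detected user turn: stop any assistant audio still playing.
        // Push-to-talk clients have already interrupted on key press.
        await this._clientInterruptAssistantReplay();
    }
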
@@ -3817,7 +3715,6 @@ class LayercodeClient {
                         this.currentTurnId = message.turn_id;
                         console.log(`Setting current turn ID to: ${message.turn_id} from text message`);
                     }
-                    // Note: We no longer track text content in the client - the pipeline handles interruption estimation
                     break;
                 }
                 case 'response.data':
@@ -3837,7 +3734,6 @@ class LayercodeClient {
     /**
      * Handles available client browser microphone audio data and sends it over the WebSocket
      * @param {ArrayBuffer} data - The audio data buffer
-     * @private
      */
     _handleDataAvailable(data) {
         try {
@@ -3897,7 +3793,6 @@ class LayercodeClient {
      * @param {WavRecorder | WavStreamPlayer} source - The audio source (recorder or player).
      * @param {(amplitude: number) => void} callback - The callback function to invoke on amplitude change.
      * @param {(amplitude: number) => void} updateInternalState - Function to update the internal amplitude state.
-     * @private
      */
     _setupAmplitudeMonitoring(source, callback, updateInternalState) {
         // Set up amplitude monitoring only if a callback is provided