@layercode/js-sdk 1.0.24 → 1.0.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3525,12 +3525,11 @@ registerProcessor('audio_processor', AudioProcessor);
 this.pushToTalkEnabled = false;
 this.canInterrupt = false;
 this.userIsSpeaking = false;
-this.endUserTurn = false;
 this.recorderStarted = false;
 this.readySent = false;
 this.currentTurnId = null;
 this.audioBuffer = [];
-this.audioPauseTime = null;
+// this.audioPauseTime = null;
 // Bind event handlers
 this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
 this._handleDataAvailable = this._handleDataAvailable.bind(this);
@@ -3539,18 +3538,13 @@ registerProcessor('audio_processor', AudioProcessor);
 let isSpeakingByAmplitude = false;
 let silenceFrames = 0;
 const AMPLITUDE_THRESHOLD = 0.01; // Adjust based on testing
-const SILENCE_FRAMES_THRESHOLD =
+const SILENCE_FRAMES_THRESHOLD = 6.4; // 6.4 * 20ms chunks = 128ms silence. Same as Silero ((frame samples: 512 / sampleRate: 16000) * 1000 * redemptionFrames: 4) = 128 ms silence
 // Monitor amplitude changes
 this.wavRecorder.startAmplitudeMonitoring((amplitude) => {
 const wasSpeaking = isSpeakingByAmplitude;
 if (amplitude > AMPLITUDE_THRESHOLD) {
 silenceFrames = 0;
 if (!wasSpeaking) {
-// Speech started - pause audio if playing and track timing for interruption calculation
-if (this.canInterrupt && this.wavPlayer.isPlaying) {
-this.audioPauseTime = Date.now();
-this.wavPlayer.pause();
-}
 isSpeakingByAmplitude = true;
 this.userIsSpeaking = true;
 this.options.onUserIsSpeakingChange(true);
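The arithmetic in the new comment checks out. As a standalone sanity check (not part of the package; the 20 ms chunk duration is taken from the comment above):

// Amplitude readings arrive once per 20 ms audio chunk:
console.log(6.4 * 20); // 128 -> ~128 ms of sustained silence ends a turn
// Matches the Silero v5 window configured later in this diff:
console.log((512 / 16000) * 1000 * 4); // (frameSamples / sampleRate) * 1000 * redemptionFrames = 128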
@@ -3563,7 +3557,6 @@ registerProcessor('audio_processor', AudioProcessor);
 else {
 silenceFrames++;
 if (wasSpeaking && silenceFrames >= SILENCE_FRAMES_THRESHOLD) {
-// Speech ended
 isSpeakingByAmplitude = false;
 this.userIsSpeaking = false;
 this.options.onUserIsSpeakingChange(false);
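Taken together, the two branches above form a simple hysteresis detector: any chunk above AMPLITUDE_THRESHOLD starts or sustains speech, and only an unbroken run of quiet chunks ends it. A minimal standalone sketch of the same pattern, assuming the SDK's thresholds (names here are hypothetical):

function createAmplitudeVAD(onSpeakingChange, threshold = 0.01, silenceFramesNeeded = 6.4) {
  let speaking = false;
  let silenceFrames = 0;
  // Call once per amplitude reading (one 20 ms chunk)
  return (amplitude) => {
    if (amplitude > threshold) {
      silenceFrames = 0; // any loud chunk resets the silence counter
      if (!speaking) { speaking = true; onSpeakingChange(true); }
    } else if (speaking && ++silenceFrames >= silenceFramesNeeded) {
      speaking = false; // ~128 ms of silence ends the turn
      onSpeakingChange(false);
    }
  };
}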
@@ -3581,7 +3574,7 @@ registerProcessor('audio_processor', AudioProcessor);
 if (this.pushToTalkEnabled) {
 return;
 }
-const timeout = setTimeout(() => {
+const vadLoadTimeout = setTimeout(() => {
 console.log('silero vad model timeout');
 console.warn('VAD model failed to load - falling back to amplitude-based detection');
 // Send a message to server indicating VAD failure
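The renamed vadLoadTimeout follows a common watchdog pattern: arm a fallback timer before the asynchronous model load, then disarm it once the load succeeds (the clearTimeout(vadLoadTimeout) in the .then() below). A generic sketch of the pattern, with loadModel and startFallback as hypothetical stand-ins:

const watchdog = setTimeout(() => startFallback(), 2000); // fires only if the model is too slow
loadModel()
  .then((model) => {
    clearTimeout(watchdog); // model arrived in time; cancel the fallback
    model.start();
  })
  .catch((error) => console.error('model load failed:', error));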
@@ -3589,134 +3582,54 @@ registerProcessor('audio_processor', AudioProcessor);
 type: 'vad_events',
 event: 'vad_model_failed',
 });
-// In automatic mode without VAD, allow the bot to speak initially
-this.userIsSpeaking = false;
-this.options.onUserIsSpeakingChange(false);
 // Set up amplitude-based fallback detection
 this._setupAmplitudeBasedVAD();
 }, 2000);
-});
-}
-else {
-dist.MicVAD.new({
-stream: this.wavRecorder.getStream() || undefined,
-model: 'v5',
-// baseAssetPath: '/', // Use if bundling model locally
-// onnxWASMBasePath: '/', // Use if bundling model locally
-positiveSpeechThreshold: 0.5,
-negativeSpeechThreshold: 0.3,
-redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
-minSpeechFrames: 25,
-preSpeechPadFrames: 0,
-onSpeechStart: () => {
-// Only pause agent audio if it's currently playing
-if (this.wavPlayer.isPlaying) {
-console.log('onSpeechStart: WavPlayer is playing, pausing it.');
-this.audioPauseTime = Date.now(); // Track when we paused
-this.wavPlayer.pause();
-}
-else {
-console.log('onSpeechStart: WavPlayer is not playing, VAD will not pause.');
-}
-console.log('onSpeechStart: sending vad_start');
-this._wsSend({
-type: 'vad_events',
-event: 'vad_start',
-});
-this.userIsSpeaking = true;
-this.options.onUserIsSpeakingChange(true);
-this.endUserTurn = false; // Reset endUserTurn when speech starts
-console.log('onSpeechStart: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
-},
-onVADMisfire: () => {
-// If the speech detected was for less than minSpeechFrames, this is called instead of onSpeechEnd, and we should resume the assistant audio as it was a false interruption. We include a configurable delay so the assistant isn't too quick to start speaking again.
-this.userIsSpeaking = false;
-this.audioBuffer = []; // Clear buffer on misfire
-this.options.onUserIsSpeakingChange(false);
-// Add the missing delay before resuming to prevent race conditions
-setTimeout(() => {
-if (!this.wavPlayer.isPlaying) {
-console.log('onVADMisfire: Resuming after delay');
-this.audioPauseTime = null; // Clear pause time since we're resuming
-this.wavPlayer.play();
-}
-else {
-console.log('onVADMisfire: Not resuming - either no pause or user speaking again');
-this.endUserTurn = true;
-}
-}, this.options.vadResumeDelay);
-},
-onSpeechEnd: () => {
-console.log('onSpeechEnd: sending vad_end');
-this.endUserTurn = true; // Set flag to indicate that the user turn has ended
-this.audioBuffer = []; // Clear buffer on speech end
-this.userIsSpeaking = false;
-this.options.onUserIsSpeakingChange(false);
-console.log('onSpeechEnd: State after update - endUserTurn:', this.endUserTurn, 'userIsSpeaking:', this.userIsSpeaking);
-// Send vad_end immediately instead of waiting for next audio chunk
-this._wsSend({
-type: 'vad_events',
-event: 'vad_end',
-});
-this.endUserTurn = false; // Reset the flag after sending vad_end
-},
-})
-.then((vad) => {
-clearTimeout(timeout);
-this.vad = vad;
-this.vad.start();
-console.log('VAD started');
-})
-.catch((error) => {
-console.error('Error initializing VAD:', error);
-});
-}
+dist.MicVAD.new({
+stream: this.wavRecorder.getStream() || undefined,
+model: 'v5',
+positiveSpeechThreshold: 0.15,
+negativeSpeechThreshold: 0.05,
+redemptionFrames: 4,
+minSpeechFrames: 2,
+preSpeechPadFrames: 0,
+frameSamples: 512, // Required for v5 as per https://docs.vad.ricky0123.com/user-guide/algorithm/#configuration
+onSpeechStart: () => {
+console.log('onSpeechStart: sending vad_start');
+this.userIsSpeaking = true;
+this.options.onUserIsSpeakingChange(true);
+this._wsSend({
+type: 'vad_events',
+event: 'vad_start',
+});
+},
+onSpeechEnd: () => {
+console.log('onSpeechEnd: sending vad_end');
+this.userIsSpeaking = false;
+this.options.onUserIsSpeakingChange(false);
+this.audioBuffer = []; // Clear buffer on speech end
+this._wsSend({
+type: 'vad_events',
+event: 'vad_end',
+});
+},
+// onVADMisfire: () => {
+// // If the speech detected was for less than minSpeechFrames, this is called instead of onSpeechEnd.
+// },
+})
+.then((vad) => {
+clearTimeout(vadLoadTimeout);
+this.vad = vad;
+this.vad.start();
+console.log('VAD started');
+})
+.catch((error) => {
+console.error('Error initializing VAD:', error);
+});
 }
 /**
 * Updates the connection status and triggers the callback
 * @param {string} status - New status value
-* @private
 */
 _setStatus(status) {
 this.status = status;
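The retuned Silero v5 configuration is considerably more sensitive than the removed one: the speech/silence probability thresholds drop from 0.5/0.3 to 0.15/0.05, and the frame counts shrink so turns start and end much faster. The timing the new numbers imply, as a standalone sketch (v5 uses 512-sample frames at 16 kHz per the linked docs):

const frameMs = (512 / 16000) * 1000; // 32 ms per VAD frame
console.log(frameMs * 2); // minSpeechFrames: 2  -> ~64 ms of speech triggers onSpeechStart
console.log(frameMs * 4); // redemptionFrames: 4 -> ~128 ms of silence triggers onSpeechEnd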
@@ -3724,7 +3637,6 @@ registerProcessor('audio_processor', AudioProcessor);
 }
 /**
 * Handles when agent audio finishes playing
-* @private
 */
 _clientResponseAudioReplayFinished() {
 console.log('clientResponseAudioReplayFinished');
@@ -3737,17 +3649,6 @@ registerProcessor('audio_processor', AudioProcessor);
 const offsetData = await this.wavPlayer.interrupt();
 if (offsetData && this.currentTurnId) {
 let offsetMs = offsetData.currentTime * 1000;
-// Calculate accurate offset by subtracting pause time if audio was paused for VAD
-if (this.audioPauseTime) {
-const pauseDurationMs = Date.now() - this.audioPauseTime;
-const adjustedOffsetMs = Math.max(0, offsetMs - pauseDurationMs);
-console.log(`Interruption detected: Raw offset ${offsetMs}ms, pause duration ${pauseDurationMs}ms, adjusted offset ${adjustedOffsetMs}ms for turn ${this.currentTurnId}`);
-offsetMs = adjustedOffsetMs;
-this.audioPauseTime = null; // Clear the pause time
-}
-else {
-console.log(`Interruption detected: ${offsetMs}ms offset for turn ${this.currentTurnId} (no pause adjustment needed)`);
-}
 // Send interruption event with accurate playback offset in milliseconds
 this._wsSend({
 type: 'trigger.response.audio.interrupted',
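Because this version no longer pauses playback when the user starts speaking (audioPauseTime is gone), the raw player position is already the true playback offset, so the pause-duration adjustment could be deleted. The surviving logic reduces to roughly this sketch (this.wavPlayer, this.currentTurnId, and this._wsSend are the SDK's own members; the event's remaining fields are not visible in this hunk):

const offsetData = await this.wavPlayer.interrupt();
if (offsetData && this.currentTurnId) {
  const offsetMs = offsetData.currentTime * 1000; // seconds -> milliseconds; no pause adjustment needed anymore
  this._wsSend({
    type: 'trigger.response.audio.interrupted',
    // ...turn id / offset fields as sent by the SDK (not shown in this hunk)
  });
}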
@@ -3781,7 +3682,6 @@ registerProcessor('audio_processor', AudioProcessor);
 /**
 * Handles incoming WebSocket messages
 * @param {MessageEvent} event - The WebSocket message event
-* @private
 */
 async _handleWebSocketMessage(event) {
 try {
@@ -3796,12 +3696,10 @@ registerProcessor('audio_processor', AudioProcessor);
 console.log(message);
 if (message.role === 'assistant') {
 // Start tracking new assistant turn
-// Note: Don't reset currentTurnId here - let response.audio set it
-// This prevents race conditions where text arrives before audio
 console.log('Assistant turn started, will track new turn ID from audio/text');
 }
-else if (message.role === 'user' && !this.pushToTalkEnabled) {
-// Interrupt any playing assistant audio if this is a turn
+else if (message.role === 'user' && !this.pushToTalkEnabled) {
+// Interrupt any playing assistant audio if this is a turn triggered by the server (and not push to talk, which will have already called interrupt)
 console.log('interrupting assistant audio, as user turn has started and pushToTalkEnabled is false');
 await this._clientInterruptAssistantReplay();
 }
@@ -3823,7 +3721,6 @@ registerProcessor('audio_processor', AudioProcessor);
 this.currentTurnId = message.turn_id;
 console.log(`Setting current turn ID to: ${message.turn_id} from text message`);
 }
-// Note: We no longer track text content in the client - the pipeline handles interruption estimation
 break;
 }
 case 'response.data':
@@ -3843,7 +3740,6 @@ registerProcessor('audio_processor', AudioProcessor);
 /**
 * Handles available client browser microphone audio data and sends it over the WebSocket
 * @param {ArrayBuffer} data - The audio data buffer
-* @private
 */
 _handleDataAvailable(data) {
 try {
@@ -3903,7 +3799,6 @@ registerProcessor('audio_processor', AudioProcessor);
 * @param {WavRecorder | WavStreamPlayer} source - The audio source (recorder or player).
 * @param {(amplitude: number) => void} callback - The callback function to invoke on amplitude change.
 * @param {(amplitude: number) => void} updateInternalState - Function to update the internal amplitude state.
-* @private
 */
 _setupAmplitudeMonitoring(source, callback, updateInternalState) {
 // Set up amplitude monitoring only if a callback is provided