@layercode/js-sdk 1.0.22 → 1.0.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -3528,13 +3528,53 @@ registerProcessor('audio_processor', AudioProcessor);
|
|
|
3528
3528
|
this.endUserTurn = false;
|
|
3529
3529
|
this.recorderStarted = false;
|
|
3530
3530
|
this.readySent = false;
|
|
3531
|
-
this.currentTurnText = '';
|
|
3532
3531
|
this.currentTurnId = null;
|
|
3533
3532
|
this.audioBuffer = [];
|
|
3533
|
+
this.audioPauseTime = null;
|
|
3534
3534
|
// Bind event handlers
|
|
3535
3535
|
this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
|
|
3536
3536
|
this._handleDataAvailable = this._handleDataAvailable.bind(this);
|
|
3537
3537
|
}
|
|
3538
|
+
_setupAmplitudeBasedVAD() {
|
|
3539
|
+
let isSpeakingByAmplitude = false;
|
|
3540
|
+
let silenceFrames = 0;
|
|
3541
|
+
const AMPLITUDE_THRESHOLD = 0.01; // Adjust based on testing
|
|
3542
|
+
const SILENCE_FRAMES_THRESHOLD = 30; // ~600ms at 20ms chunks
|
|
3543
|
+
// Monitor amplitude changes
|
|
3544
|
+
this.wavRecorder.startAmplitudeMonitoring((amplitude) => {
|
|
3545
|
+
const wasSpeaking = isSpeakingByAmplitude;
|
|
3546
|
+
if (amplitude > AMPLITUDE_THRESHOLD) {
|
|
3547
|
+
silenceFrames = 0;
|
|
3548
|
+
if (!wasSpeaking) {
|
|
3549
|
+
// Speech started - pause audio if playing and track timing for interruption calculation
|
|
3550
|
+
if (this.canInterrupt && this.wavPlayer.isPlaying) {
|
|
3551
|
+
this.audioPauseTime = Date.now();
|
|
3552
|
+
this.wavPlayer.pause();
|
|
3553
|
+
}
|
|
3554
|
+
isSpeakingByAmplitude = true;
|
|
3555
|
+
this.userIsSpeaking = true;
|
|
3556
|
+
this.options.onUserIsSpeakingChange(true);
|
|
3557
|
+
this._wsSend({
|
|
3558
|
+
type: 'vad_events',
|
|
3559
|
+
event: 'vad_start',
|
|
3560
|
+
});
|
|
3561
|
+
}
|
|
3562
|
+
}
|
|
3563
|
+
else {
|
|
3564
|
+
silenceFrames++;
|
|
3565
|
+
if (wasSpeaking && silenceFrames >= SILENCE_FRAMES_THRESHOLD) {
|
|
3566
|
+
// Speech ended
|
|
3567
|
+
isSpeakingByAmplitude = false;
|
|
3568
|
+
this.userIsSpeaking = false;
|
|
3569
|
+
this.options.onUserIsSpeakingChange(false);
|
|
3570
|
+
this._wsSend({
|
|
3571
|
+
type: 'vad_events',
|
|
3572
|
+
event: 'vad_end',
|
|
3573
|
+
});
|
|
3574
|
+
}
|
|
3575
|
+
}
|
|
3576
|
+
});
|
|
3577
|
+
}
|
|
3538
3578
|
_initializeVAD() {
|
|
3539
3579
|
console.log('initializing VAD', { pushToTalkEnabled: this.pushToTalkEnabled, canInterrupt: this.canInterrupt });
|
|
3540
3580
|
// If we're in push to talk mode, we don't need to use the VAD model
|
|
@@ -3543,9 +3583,17 @@ registerProcessor('audio_processor', AudioProcessor);
|
|
|
3543
3583
|
}
|
|
3544
3584
|
const timeout = setTimeout(() => {
|
|
3545
3585
|
console.log('silero vad model timeout');
|
|
3546
|
-
|
|
3547
|
-
|
|
3548
|
-
this.
|
|
3586
|
+
console.warn('VAD model failed to load - falling back to amplitude-based detection');
|
|
3587
|
+
// Send a message to server indicating VAD failure
|
|
3588
|
+
this._wsSend({
|
|
3589
|
+
type: 'vad_events',
|
|
3590
|
+
event: 'vad_model_failed',
|
|
3591
|
+
});
|
|
3592
|
+
// In automatic mode without VAD, allow the bot to speak initially
|
|
3593
|
+
this.userIsSpeaking = false;
|
|
3594
|
+
this.options.onUserIsSpeakingChange(false);
|
|
3595
|
+
// Set up amplitude-based fallback detection
|
|
3596
|
+
this._setupAmplitudeBasedVAD();
|
|
3549
3597
|
}, 2000);
|
|
3550
3598
|
if (!this.canInterrupt) {
|
|
3551
3599
|
dist.MicVAD.new({
|
|
@@ -3554,7 +3602,7 @@ registerProcessor('audio_processor', AudioProcessor);
|
|
|
3554
3602
|
positiveSpeechThreshold: 0.3,
|
|
3555
3603
|
negativeSpeechThreshold: 0.2,
|
|
3556
3604
|
redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
|
|
3557
|
-
minSpeechFrames:
|
|
3605
|
+
minSpeechFrames: 0,
|
|
3558
3606
|
preSpeechPadFrames: 0,
|
|
3559
3607
|
onSpeechStart: () => {
|
|
3560
3608
|
this.userIsSpeaking = true;
|
|
@@ -3565,27 +3613,6 @@ registerProcessor('audio_processor', AudioProcessor);
|
|
|
3565
3613
|
event: 'vad_start',
|
|
3566
3614
|
});
|
|
3567
3615
|
},
|
|
3568
|
-
onVADMisfire: () => {
|
|
3569
|
-
console.log('onVADMisfire: Short utterance detected, resuming bot');
|
|
3570
|
-
this.audioBuffer = []; // Clear buffer on misfire
|
|
3571
|
-
this.userIsSpeaking = false;
|
|
3572
|
-
this.options.onUserIsSpeakingChange(false);
|
|
3573
|
-
// Send vad_end to indicate the short utterance is over
|
|
3574
|
-
this._wsSend({
|
|
3575
|
-
type: 'vad_events',
|
|
3576
|
-
event: 'vad_end',
|
|
3577
|
-
});
|
|
3578
|
-
// End the user's turn
|
|
3579
|
-
this._wsSend({
|
|
3580
|
-
type: 'trigger.turn.end',
|
|
3581
|
-
role: 'user',
|
|
3582
|
-
});
|
|
3583
|
-
// Resume bot audio if it was playing
|
|
3584
|
-
if (!this.wavPlayer.isPlaying) {
|
|
3585
|
-
console.log('onVADMisfire: Resuming bot audio');
|
|
3586
|
-
this.wavPlayer.play();
|
|
3587
|
-
}
|
|
3588
|
-
},
|
|
3589
3616
|
onSpeechEnd: () => {
|
|
3590
3617
|
console.log('onSpeechEnd: sending vad_end');
|
|
3591
3618
|
this.endUserTurn = true; // Set flag to indicate that the user turn has ended
|
|
@@ -3626,6 +3653,7 @@ registerProcessor('audio_processor', AudioProcessor);
|
|
|
3626
3653
|
// Only pause agent audio if it's currently playing
|
|
3627
3654
|
if (this.wavPlayer.isPlaying) {
|
|
3628
3655
|
console.log('onSpeechStart: WavPlayer is playing, pausing it.');
|
|
3656
|
+
this.audioPauseTime = Date.now(); // Track when we paused
|
|
3629
3657
|
this.wavPlayer.pause();
|
|
3630
3658
|
}
|
|
3631
3659
|
else {
|
|
@@ -3650,9 +3678,8 @@ registerProcessor('audio_processor', AudioProcessor);
|
|
|
3650
3678
|
setTimeout(() => {
|
|
3651
3679
|
if (!this.wavPlayer.isPlaying) {
|
|
3652
3680
|
console.log('onVADMisfire: Resuming after delay');
|
|
3681
|
+
this.audioPauseTime = null; // Clear pause time since we're resuming
|
|
3653
3682
|
this.wavPlayer.play();
|
|
3654
|
-
this.userIsSpeaking = true;
|
|
3655
|
-
this.options.onUserIsSpeakingChange(true);
|
|
3656
3683
|
}
|
|
3657
3684
|
else {
|
|
3658
3685
|
console.log('onVADMisfire: Not resuming - either no pause or user speaking again');
|
|
@@ -3706,33 +3733,37 @@ registerProcessor('audio_processor', AudioProcessor);
|
|
|
3706
3733
|
reason: 'completed',
|
|
3707
3734
|
});
|
|
3708
3735
|
}
|
|
3709
|
-
_estimateWordsHeard(text, playbackOffsetSeconds) {
|
|
3710
|
-
const words = text.split(/\s+/).filter((word) => word.length > 0);
|
|
3711
|
-
const totalWords = words.length;
|
|
3712
|
-
// Rough estimation: average speaking rate is ~150 words per minute (2.5 words per second)
|
|
3713
|
-
const estimatedWordsPerSecond = 2.5;
|
|
3714
|
-
const estimatedWordsHeard = Math.min(Math.floor(playbackOffsetSeconds * estimatedWordsPerSecond), totalWords);
|
|
3715
|
-
const textHeard = words.slice(0, estimatedWordsHeard).join(' ');
|
|
3716
|
-
return { wordsHeard: estimatedWordsHeard, textHeard };
|
|
3717
|
-
}
|
|
3718
3736
|
async _clientInterruptAssistantReplay() {
|
|
3719
3737
|
const offsetData = await this.wavPlayer.interrupt();
|
|
3720
|
-
if (offsetData && this.
|
|
3721
|
-
|
|
3722
|
-
|
|
3723
|
-
|
|
3724
|
-
|
|
3738
|
+
if (offsetData && this.currentTurnId) {
|
|
3739
|
+
let offsetMs = offsetData.currentTime * 1000;
|
|
3740
|
+
// Calculate accurate offset by subtracting pause time if audio was paused for VAD
|
|
3741
|
+
if (this.audioPauseTime) {
|
|
3742
|
+
const pauseDurationMs = Date.now() - this.audioPauseTime;
|
|
3743
|
+
const adjustedOffsetMs = Math.max(0, offsetMs - pauseDurationMs);
|
|
3744
|
+
console.log(`Interruption detected: Raw offset ${offsetMs}ms, pause duration ${pauseDurationMs}ms, adjusted offset ${adjustedOffsetMs}ms for turn ${this.currentTurnId}`);
|
|
3745
|
+
offsetMs = adjustedOffsetMs;
|
|
3746
|
+
this.audioPauseTime = null; // Clear the pause time
|
|
3747
|
+
}
|
|
3748
|
+
else {
|
|
3749
|
+
console.log(`Interruption detected: ${offsetMs}ms offset for turn ${this.currentTurnId} (no pause adjustment needed)`);
|
|
3750
|
+
}
|
|
3751
|
+
// Send interruption event with accurate playback offset in milliseconds
|
|
3725
3752
|
this._wsSend({
|
|
3726
3753
|
type: 'trigger.response.audio.interrupted',
|
|
3727
|
-
playback_offset:
|
|
3754
|
+
playback_offset: offsetMs,
|
|
3728
3755
|
interruption_context: {
|
|
3729
3756
|
turn_id: this.currentTurnId,
|
|
3730
|
-
|
|
3731
|
-
total_words: totalWords,
|
|
3732
|
-
text_heard: textHeard,
|
|
3757
|
+
playback_offset_ms: offsetMs,
|
|
3733
3758
|
},
|
|
3734
3759
|
});
|
|
3735
3760
|
}
|
|
3761
|
+
else {
|
|
3762
|
+
console.warn('Interruption requested but missing required data:', {
|
|
3763
|
+
hasOffsetData: !!offsetData,
|
|
3764
|
+
hasTurnId: !!this.currentTurnId,
|
|
3765
|
+
});
|
|
3766
|
+
}
|
|
3736
3767
|
}
|
|
3737
3768
|
async triggerUserTurnStarted() {
|
|
3738
3769
|
if (!this.pushToTalkActive) {
|
|
@@ -3781,30 +3812,20 @@ registerProcessor('audio_processor', AudioProcessor);
|
|
|
3781
3812
|
// Set current turn ID from first audio message, or update if different turn
|
|
3782
3813
|
if (!this.currentTurnId || this.currentTurnId !== message.turn_id) {
|
|
3783
3814
|
console.log(`Setting current turn ID to: ${message.turn_id} (was: ${this.currentTurnId})`);
|
|
3784
|
-
const oldTurnId = this.currentTurnId;
|
|
3785
3815
|
this.currentTurnId = message.turn_id;
|
|
3786
|
-
this.currentTurnText = ''; // Reset text for new turn
|
|
3787
3816
|
// Clean up interrupted tracks, keeping only the current turn
|
|
3788
3817
|
this.wavPlayer.clearInterruptedTracks(this.currentTurnId ? [this.currentTurnId] : []);
|
|
3789
3818
|
}
|
|
3790
3819
|
break;
|
|
3791
|
-
case 'response.text':
|
|
3792
|
-
// Set turn ID from first text message if not set
|
|
3793
|
-
if (!this.currentTurnId
|
|
3794
|
-
|
|
3795
|
-
|
|
3796
|
-
this.currentTurnId = message.turn_id;
|
|
3797
|
-
this.currentTurnText = '';
|
|
3798
|
-
}
|
|
3799
|
-
this.currentTurnText += message.content;
|
|
3800
|
-
}
|
|
3801
|
-
else {
|
|
3802
|
-
console.log(`Ignoring text for turn ${message.turn_id}, current turn is ${this.currentTurnId}`);
|
|
3820
|
+
case 'response.text': {
|
|
3821
|
+
// Set turn ID from first text message if not set
|
|
3822
|
+
if (!this.currentTurnId) {
|
|
3823
|
+
this.currentTurnId = message.turn_id;
|
|
3824
|
+
console.log(`Setting current turn ID to: ${message.turn_id} from text message`);
|
|
3803
3825
|
}
|
|
3826
|
+
// Note: We no longer track text content in the client - the pipeline handles interruption estimation
|
|
3804
3827
|
break;
|
|
3805
|
-
|
|
3806
|
-
// console.log('received response.end');
|
|
3807
|
-
// break;
|
|
3828
|
+
}
|
|
3808
3829
|
case 'response.data':
|
|
3809
3830
|
console.log('received response.data', message);
|
|
3810
3831
|
this.options.onDataMessage(message);
|
|
@@ -3908,6 +3929,8 @@ registerProcessor('audio_processor', AudioProcessor);
|
|
|
3908
3929
|
async connect() {
|
|
3909
3930
|
try {
|
|
3910
3931
|
this._setStatus('connecting');
|
|
3932
|
+
// Reset turn tracking for clean start
|
|
3933
|
+
this._resetTurnTracking();
|
|
3911
3934
|
// Get session key from server
|
|
3912
3935
|
let authorizeSessionRequestBody = {
|
|
3913
3936
|
pipeline_id: this.options.pipelineId,
|
|
@@ -3985,6 +4008,10 @@ registerProcessor('audio_processor', AudioProcessor);
|
|
|
3985
4008
|
throw error;
|
|
3986
4009
|
}
|
|
3987
4010
|
}
|
|
4011
|
+
_resetTurnTracking() {
|
|
4012
|
+
this.currentTurnId = null;
|
|
4013
|
+
console.log('Reset turn tracking state');
|
|
4014
|
+
}
|
|
3988
4015
|
async disconnect() {
|
|
3989
4016
|
// Clean up VAD if it exists
|
|
3990
4017
|
if (this.vad) {
|
|
@@ -3994,6 +4021,8 @@ registerProcessor('audio_processor', AudioProcessor);
|
|
|
3994
4021
|
}
|
|
3995
4022
|
this.wavRecorder.quit();
|
|
3996
4023
|
this.wavPlayer.disconnect();
|
|
4024
|
+
// Reset turn tracking
|
|
4025
|
+
this._resetTurnTracking();
|
|
3997
4026
|
// Close websocket and ensure status is updated
|
|
3998
4027
|
if (this.ws) {
|
|
3999
4028
|
this.ws.close();
|