@layercode/js-sdk 1.0.22 → 1.0.23
@@ -3522,13 +3522,53 @@ class LayercodeClient {
         this.endUserTurn = false;
         this.recorderStarted = false;
         this.readySent = false;
-        this.currentTurnText = '';
         this.currentTurnId = null;
         this.audioBuffer = [];
+        this.audioPauseTime = null;
         // Bind event handlers
         this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
         this._handleDataAvailable = this._handleDataAvailable.bind(this);
     }
+    _setupAmplitudeBasedVAD() {
+        let isSpeakingByAmplitude = false;
+        let silenceFrames = 0;
+        const AMPLITUDE_THRESHOLD = 0.01; // Adjust based on testing
+        const SILENCE_FRAMES_THRESHOLD = 30; // ~600ms at 20ms chunks
+        // Monitor amplitude changes
+        this.wavRecorder.startAmplitudeMonitoring((amplitude) => {
+            const wasSpeaking = isSpeakingByAmplitude;
+            if (amplitude > AMPLITUDE_THRESHOLD) {
+                silenceFrames = 0;
+                if (!wasSpeaking) {
+                    // Speech started - pause audio if playing and track timing for interruption calculation
+                    if (this.canInterrupt && this.wavPlayer.isPlaying) {
+                        this.audioPauseTime = Date.now();
+                        this.wavPlayer.pause();
+                    }
+                    isSpeakingByAmplitude = true;
+                    this.userIsSpeaking = true;
+                    this.options.onUserIsSpeakingChange(true);
+                    this._wsSend({
+                        type: 'vad_events',
+                        event: 'vad_start',
+                    });
+                }
+            }
+            else {
+                silenceFrames++;
+                if (wasSpeaking && silenceFrames >= SILENCE_FRAMES_THRESHOLD) {
+                    // Speech ended
+                    isSpeakingByAmplitude = false;
+                    this.userIsSpeaking = false;
+                    this.options.onUserIsSpeakingChange(false);
+                    this._wsSend({
+                        type: 'vad_events',
+                        event: 'vad_end',
+                    });
+                }
+            }
+        });
+    }
     _initializeVAD() {
         console.log('initializing VAD', { pushToTalkEnabled: this.pushToTalkEnabled, canInterrupt: this.canInterrupt });
         // If we're in push to talk mode, we don't need to use the VAD model
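The fallback above leans on a wavRecorder.startAmplitudeMonitoring(callback) helper that is not part of this diff, so its exact behavior is an assumption. A minimal sketch of what such a monitor could look like on top of the Web Audio API, polling an AnalyserNode for RMS amplitude roughly every 20ms to match the "30 frames ≈ 600ms" comment:

    // Hypothetical stand-in for wavRecorder.startAmplitudeMonitoring (not from this package).
    function startAmplitudeMonitoring(mediaStream, callback) {
        const ctx = new AudioContext();
        const source = ctx.createMediaStreamSource(mediaStream);
        const analyser = ctx.createAnalyser();
        analyser.fftSize = 2048;
        source.connect(analyser);
        const samples = new Float32Array(analyser.fftSize);
        // One callback per ~20ms "frame", matching the 30-frame (~600ms) silence window above.
        const timer = setInterval(() => {
            analyser.getFloatTimeDomainData(samples);
            let sumSquares = 0;
            for (const s of samples) sumSquares += s * s;
            callback(Math.sqrt(sumSquares / samples.length)); // RMS amplitude in 0..1
        }, 20);
        return () => { clearInterval(timer); ctx.close(); }; // stop function
    }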
@@ -3537,9 +3577,17 @@ class LayercodeClient {
         }
         const timeout = setTimeout(() => {
            console.log('silero vad model timeout');
-
-
-            this.
+            console.warn('VAD model failed to load - falling back to amplitude-based detection');
+            // Send a message to server indicating VAD failure
+            this._wsSend({
+                type: 'vad_events',
+                event: 'vad_model_failed',
+            });
+            // In automatic mode without VAD, allow the bot to speak initially
+            this.userIsSpeaking = false;
+            this.options.onUserIsSpeakingChange(false);
+            // Set up amplitude-based fallback detection
+            this._setupAmplitudeBasedVAD();
         }, 2000);
         if (!this.canInterrupt) {
             dist.MicVAD.new({
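The 2-second timer only makes sense if it is cancelled when the Silero model does load in time; that clearTimeout presumably lives in the MicVAD success path outside this hunk. A sketch of the overall race, with a hypothetical fallbackToAmplitudeVAD standing in for the timeout body above:

    // Sketch only; fallbackToAmplitudeVAD is a hypothetical name for the timeout body.
    const timeout = setTimeout(() => fallbackToAmplitudeVAD(), 2000);
    dist.MicVAD.new(options)
        .then((vad) => {
            clearTimeout(timeout); // model loaded in time, cancel the fallback
            vad.start();
        })
        .catch(() => fallbackToAmplitudeVAD()); // hard failure: fall back immediately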
@@ -3548,7 +3596,7 @@ class LayercodeClient {
                 positiveSpeechThreshold: 0.3,
                 negativeSpeechThreshold: 0.2,
                 redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
-                minSpeechFrames:
+                minSpeechFrames: 0,
                 preSpeechPadFrames: 0,
                 onSpeechStart: () => {
                     this.userIsSpeaking = true;
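Setting minSpeechFrames to 0 appears to disable the misfire path entirely: in @ricky0123/vad-web (which dist.MicVAD appears to come from), a speech segment shorter than minSpeechFrames fires onVADMisfire instead of onSpeechEnd, and no segment can be shorter than zero frames. That reading is consistent with the onVADMisfire handler being deleted from this non-interrupt branch in the next hunk.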
@@ -3559,27 +3607,6 @@ class LayercodeClient {
                         event: 'vad_start',
                     });
                 },
-                onVADMisfire: () => {
-                    console.log('onVADMisfire: Short utterance detected, resuming bot');
-                    this.audioBuffer = []; // Clear buffer on misfire
-                    this.userIsSpeaking = false;
-                    this.options.onUserIsSpeakingChange(false);
-                    // Send vad_end to indicate the short utterance is over
-                    this._wsSend({
-                        type: 'vad_events',
-                        event: 'vad_end',
-                    });
-                    // End the user's turn
-                    this._wsSend({
-                        type: 'trigger.turn.end',
-                        role: 'user',
-                    });
-                    // Resume bot audio if it was playing
-                    if (!this.wavPlayer.isPlaying) {
-                        console.log('onVADMisfire: Resuming bot audio');
-                        this.wavPlayer.play();
-                    }
-                },
                 onSpeechEnd: () => {
                     console.log('onSpeechEnd: sending vad_end');
                     this.endUserTurn = true; // Set flag to indicate that the user turn has ended
@@ -3620,6 +3647,7 @@ class LayercodeClient {
                     // Only pause agent audio if it's currently playing
                     if (this.wavPlayer.isPlaying) {
                         console.log('onSpeechStart: WavPlayer is playing, pausing it.');
+                        this.audioPauseTime = Date.now(); // Track when we paused
                         this.wavPlayer.pause();
                     }
                     else {
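With this change, audioPauseTime follows a small protocol across both detection paths: it is set whenever agent audio is paused because speech started (here, and in the amplitude fallback above), cleared when playback resumes without an interruption (next hunk), and consumed and then cleared by _clientInterruptAssistantReplay once an interruption is actually reported.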
@@ -3644,9 +3672,8 @@ class LayercodeClient {
                     setTimeout(() => {
                         if (!this.wavPlayer.isPlaying) {
                             console.log('onVADMisfire: Resuming after delay');
+                            this.audioPauseTime = null; // Clear pause time since we're resuming
                             this.wavPlayer.play();
-                            this.userIsSpeaking = true;
-                            this.options.onUserIsSpeakingChange(true);
                         }
                         else {
                             console.log('onVADMisfire: Not resuming - either no pause or user speaking again');
@@ -3700,33 +3727,37 @@ class LayercodeClient {
             reason: 'completed',
         });
     }
-    _estimateWordsHeard(text, playbackOffsetSeconds) {
-        const words = text.split(/\s+/).filter((word) => word.length > 0);
-        const totalWords = words.length;
-        // Rough estimation: average speaking rate is ~150 words per minute (2.5 words per second)
-        const estimatedWordsPerSecond = 2.5;
-        const estimatedWordsHeard = Math.min(Math.floor(playbackOffsetSeconds * estimatedWordsPerSecond), totalWords);
-        const textHeard = words.slice(0, estimatedWordsHeard).join(' ');
-        return { wordsHeard: estimatedWordsHeard, textHeard };
-    }
     async _clientInterruptAssistantReplay() {
         const offsetData = await this.wavPlayer.interrupt();
-        if (offsetData && this.
-
-
-
-
+        if (offsetData && this.currentTurnId) {
+            let offsetMs = offsetData.currentTime * 1000;
+            // Calculate accurate offset by subtracting pause time if audio was paused for VAD
+            if (this.audioPauseTime) {
+                const pauseDurationMs = Date.now() - this.audioPauseTime;
+                const adjustedOffsetMs = Math.max(0, offsetMs - pauseDurationMs);
+                console.log(`Interruption detected: Raw offset ${offsetMs}ms, pause duration ${pauseDurationMs}ms, adjusted offset ${adjustedOffsetMs}ms for turn ${this.currentTurnId}`);
+                offsetMs = adjustedOffsetMs;
+                this.audioPauseTime = null; // Clear the pause time
+            }
+            else {
+                console.log(`Interruption detected: ${offsetMs}ms offset for turn ${this.currentTurnId} (no pause adjustment needed)`);
+            }
+            // Send interruption event with accurate playback offset in milliseconds
             this._wsSend({
                 type: 'trigger.response.audio.interrupted',
-                playback_offset:
+                playback_offset: offsetMs,
                 interruption_context: {
                     turn_id: this.currentTurnId,
-
-                    total_words: totalWords,
-                    text_heard: textHeard,
+                    playback_offset_ms: offsetMs,
                 },
             });
         }
+        else {
+            console.warn('Interruption requested but missing required data:', {
+                hasOffsetData: !!offsetData,
+                hasTurnId: !!this.currentTurnId,
+            });
+        }
     }
     async triggerUserTurnStarted() {
         if (!this.pushToTalkActive) {
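The offset adjustment is plain clock arithmetic: playback is paused the moment speech is detected, but the interruption is only committed when the turn actually ends, so the wall-clock time spent paused is subtracted to report only audio the user plausibly heard. A worked example with assumed numbers:

    // Assumed numbers, for illustration only.
    const rawOffsetMs = 4200;                 // player reports 4.2s into the reply
    const audioPauseTime = Date.now() - 1500; // VAD paused playback 1.5s ago
    const pauseDurationMs = Date.now() - audioPauseTime;                  // ~1500ms
    const playbackOffsetMs = Math.max(0, rawOffsetMs - pauseDurationMs);  // ~2700ms heard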
@@ -3775,30 +3806,20 @@ class LayercodeClient {
                 // Set current turn ID from first audio message, or update if different turn
                 if (!this.currentTurnId || this.currentTurnId !== message.turn_id) {
                     console.log(`Setting current turn ID to: ${message.turn_id} (was: ${this.currentTurnId})`);
-                    const oldTurnId = this.currentTurnId;
                     this.currentTurnId = message.turn_id;
-                    this.currentTurnText = ''; // Reset text for new turn
                     // Clean up interrupted tracks, keeping only the current turn
                     this.wavPlayer.clearInterruptedTracks(this.currentTurnId ? [this.currentTurnId] : []);
                 }
                 break;
-            case 'response.text':
-                // Set turn ID from first text message if not set
-                if (!this.currentTurnId
-
-
-                    this.currentTurnId = message.turn_id;
-                    this.currentTurnText = '';
-                }
-                this.currentTurnText += message.content;
-                }
-                else {
-                    console.log(`Ignoring text for turn ${message.turn_id}, current turn is ${this.currentTurnId}`);
+            case 'response.text': {
+                // Set turn ID from first text message if not set
+                if (!this.currentTurnId) {
+                    this.currentTurnId = message.turn_id;
+                    console.log(`Setting current turn ID to: ${message.turn_id} from text message`);
                 }
+                // Note: We no longer track text content in the client - the pipeline handles interruption estimation
                 break;
-
-            // console.log('received response.end');
-            // break;
+            }
             case 'response.data':
                 console.log('received response.data', message);
                 this.options.onDataMessage(message);
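With _estimateWordsHeard deleted, the interruption payload carries only playback_offset_ms, and the old ~150 words-per-minute estimate can be reproduced downstream from the offset plus the full turn text. A sketch, where fullTurnText and playbackOffsetMs are assumed inputs available to the pipeline:

    // The deleted client-side estimate, reproducible server-side from the offset.
    const words = fullTurnText.split(/\s+/).filter((w) => w.length > 0);
    const wordsHeard = Math.min(Math.floor((playbackOffsetMs / 1000) * 2.5), words.length);
    const textHeard = words.slice(0, wordsHeard).join(' ');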
@@ -3902,6 +3923,8 @@ class LayercodeClient {
     async connect() {
         try {
             this._setStatus('connecting');
+            // Reset turn tracking for clean start
+            this._resetTurnTracking();
             // Get session key from server
             let authorizeSessionRequestBody = {
                 pipeline_id: this.options.pipelineId,
@@ -3979,6 +4002,10 @@ class LayercodeClient {
             throw error;
         }
     }
+    _resetTurnTracking() {
+        this.currentTurnId = null;
+        console.log('Reset turn tracking state');
+    }
     async disconnect() {
         // Clean up VAD if it exists
         if (this.vad) {
@@ -3988,6 +4015,8 @@ class LayercodeClient {
         }
         this.wavRecorder.quit();
         this.wavPlayer.disconnect();
+        // Reset turn tracking
+        this._resetTurnTracking();
         // Close websocket and ensure status is updated
         if (this.ws) {
             this.ws.close();