@layercode/js-sdk 1.0.14 → 1.0.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3480,7 +3480,6 @@ registerProcessor('audio_processor', AudioProcessor);
       sessionId: options.sessionId || null,
       authorizeSessionEndpoint: options.authorizeSessionEndpoint,
       metadata: options.metadata || {},
-      vadEnabled: options.vadEnabled || true,
       vadResumeDelay: options.vadResumeDelay || 500,
       onConnect: options.onConnect || (() => { }),
       onDisconnect: options.onDisconnect || (() => { }),
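Note on the removed line: the vadEnabled default was inert as written, because options.vadEnabled || true evaluates to true even when the caller passes false, so VAD could never be switched off through this option. Removing it (and driving the behaviour from server config, per the later hunks) sidesteps the bug. For comparison, a minimal sketch of the || pitfall versus a nullish-coalescing default (illustrative only, not package code):

    const withOr = options.vadEnabled || true;       // false is falsy, so this is always true
    const withCoalesce = options.vadEnabled ?? true; // only null/undefined fall back; false survives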
@@ -3498,7 +3497,67 @@ registerProcessor('audio_processor', AudioProcessor);
       sampleRate: 16000, // TODO should be set my fetched pipeline config
     });
     this.vad = null;
-
+    this.ws = null;
+    this.status = 'disconnected';
+    this.userAudioAmplitude = 0;
+    this.agentAudioAmplitude = 0;
+    this.sessionId = options.sessionId || null;
+    this.pushToTalkActive = false;
+    this.vadPausedPlayer = false;
+    this.pushToTalkEnabled = false;
+    this.canInterrupt = false;
+    this.userIsSpeaking = false;
+    // Bind event handlers
+    this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
+    this._handleDataAvailable = this._handleDataAvailable.bind(this);
+  }
+  _initializeVAD() {
+    console.log('initializing VAD', { pushToTalkEnabled: this.pushToTalkEnabled, canInterrupt: this.canInterrupt });
+    // If we're in push to talk mode, we don't need to use the VAD model
+    if (this.pushToTalkEnabled) {
+      return;
+    }
+    const timeout = setTimeout(() => {
+      console.log('silero vad model timeout');
+      // TODO: send message to server to indicate that the vad model timed out
+      this.userIsSpeaking = true; // allow audio to be sent to the server
+    }, 2000);
+    if (!this.canInterrupt) {
+      dist.MicVAD.new({
+        stream: this.wavRecorder.getStream() || undefined,
+        model: 'v5',
+        positiveSpeechThreshold: 0.3,
+        negativeSpeechThreshold: 0.2,
+        redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
+        minSpeechFrames: 15,
+        preSpeechPadFrames: 0,
+        onSpeechStart: () => {
+          if (!this.wavPlayer.isPlaying) {
+            this.userIsSpeaking = true;
+          }
+        },
+        onVADMisfire: () => {
+          this.userIsSpeaking = false;
+        },
+        onSpeechEnd: () => {
+          this.userIsSpeaking = false;
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_end',
+          });
+        },
+      })
+        .then((vad) => {
+          clearTimeout(timeout);
+          this.vad = vad;
+          this.vad.start();
+          console.log('VAD started');
+        })
+        .catch((error) => {
+          console.error('Error initializing VAD:', error);
+        });
+    }
+    else {
       dist.MicVAD.new({
         stream: this.wavRecorder.getStream() || undefined,
         model: 'v5',
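The two-second setTimeout added above is a fail-open guard: if the Silero VAD model never finishes loading, userIsSpeaking is forced to true so microphone audio still reaches the server, and the timer is cancelled inside .then once dist.MicVAD.new resolves. A standalone sketch of the pattern, with illustrative names (initWithFailOpen is not an SDK API):

    // Fail-open model init: if loading stalls past the deadline, open the gate anyway.
    function initWithFailOpen(loadModel, onReady, openGate, deadlineMs = 2000) {
      const timer = setTimeout(openGate, deadlineMs); // fires only if loading is too slow
      loadModel()
        .then((model) => {
          clearTimeout(timer); // loaded in time: real VAD events drive the gate instead
          onReady(model);
        })
        .catch((err) => console.error('model init failed:', err));
    }

Note that the package clears the timer only on success, so a failed load also falls back to the open gate after two seconds.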
@@ -3519,36 +3578,35 @@ registerProcessor('audio_processor', AudioProcessor);
           else {
             console.log('onSpeechStart: WavPlayer is not playing, VAD will not pause.');
           }
+          this.userIsSpeaking = true;
+          console.log('onSpeechStart: sending vad_start');
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_start',
+          });
         },
         onVADMisfire: () => {
           // If the speech detected was for less than minSpeechFrames, this is called instead of onSpeechEnd, and we should resume the assistant audio as it was a false interruption. We include a configurable delay so the assistant isn't too quick to start speaking again.
+          this.userIsSpeaking = false;
           if (this.vadPausedPlayer) {
             console.log('onSpeechEnd: VAD paused the player, resuming');
             this.wavPlayer.play();
             this.vadPausedPlayer = false; // Reset flag
-            // Option to extend delay in the case where the transcriber takes longer to detect a new turn
-            // console.log('onVADMisfire: VAD paused the player, resuming in ' + this.options.vadResumeDelay + 'ms');
-            // // Add configurable delay before resuming playback
-            // setTimeout(() => {
-            //   this.wavPlayer.play();
-            //   this.vadPausedPlayer = false; // Reset flag
-            // }, this.options.vadResumeDelay);
           }
           else {
             console.log('onVADMisfire: VAD did not pause the player, no action taken to resume.');
           }
         },
-
-
-
-
-
-
-
-        // }
-        // },
+        onSpeechEnd: () => {
+          this.userIsSpeaking = false;
+          this._wsSend({
+            type: 'vad_events',
+            event: 'vad_end',
+          });
+        },
       })
         .then((vad) => {
+          clearTimeout(timeout);
           this.vad = vad;
           this.vad.start();
           console.log('VAD started');
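With the additions in the last two hunks, the client now reports speech boundaries to the server. The WebSocket frames, exactly as assembled by the _wsSend calls above:

    { type: 'vad_events', event: 'vad_start' } // speech onset; only sent in the can-interrupt branch
    { type: 'vad_events', event: 'vad_end' }   // speech end; sent in both automatic branches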
@@ -3557,16 +3615,6 @@ registerProcessor('audio_processor', AudioProcessor);
           console.error('Error initializing VAD:', error);
         });
     }
-    this.ws = null;
-    this.status = 'disconnected';
-    this.userAudioAmplitude = 0;
-    this.agentAudioAmplitude = 0;
-    this.sessionId = options.sessionId || null;
-    this.pushToTalkActive = false;
-    this.vadPausedPlayer = false;
-    // Bind event handlers
-    this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
-    this._handleDataAvailable = this._handleDataAvailable.bind(this);
  }
  /**
   * Updates the connection status and triggers the callback
@@ -3626,10 +3674,9 @@ registerProcessor('audio_processor', AudioProcessor);
       // Sent from the server to this client when a new user turn is detected
       console.log('received turn.start from server');
       console.log(message);
-
-      if (message.role === 'user') {
+      if (message.role === 'user' && !this.pushToTalkEnabled && this.canInterrupt) {
         // Interrupt any playing assistant audio if this is a turn trigged by the server (and not push to talk, which will have already called interrupt)
-        console.log('interrupting assistant audio, as user turn has started and
+        console.log('interrupting assistant audio, as user turn has started and pushToTalkEnabled is false');
         await this._clientInterruptAssistantReplay();
       }
       // if (message.role === 'assistant') {
@@ -3667,10 +3714,13 @@ registerProcessor('audio_processor', AudioProcessor);
  _handleDataAvailable(data) {
    try {
      const base64 = arrayBufferToBase64(data.mono);
-      this.
-
-
-
+      const sendAudio = this.pushToTalkEnabled ? this.pushToTalkActive : this.userIsSpeaking;
+      if (sendAudio) {
+        this._wsSend({
+          type: 'client.audio',
+          content: base64,
+        });
+      }
    }
    catch (error) {
      console.error('Error processing audio:', error);
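_handleDataAvailable now drops captured chunks client-side unless the current mode wants them: while the button is held in push-to-talk mode, or while the local VAD reports speech in automatic mode. The same predicate as a standalone function (the wrapper is illustrative; the three flags mirror the diff):

    // Should a captured chunk be forwarded as a client.audio message?
    function shouldSendAudio(pushToTalkEnabled, pushToTalkActive, userIsSpeaking) {
      return pushToTalkEnabled ? pushToTalkActive : userIsSpeaking;
    }
    // e.g. shouldSendAudio(true, false, true) === false: button not held, chunk dropped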
@@ -3743,6 +3793,19 @@ registerProcessor('audio_processor', AudioProcessor);
    this.ws = new WebSocket(`${this._websocketUrl}?${new URLSearchParams({
      client_session_key: authorizeSessionResponseBody.client_session_key,
    })}`);
+    const config = authorizeSessionResponseBody.config;
+    console.log('config', config);
+    if (config.transcription.trigger === 'push_to_talk') {
+      this.pushToTalkEnabled = true;
+    }
+    else if (config.transcription.trigger === 'automatic') {
+      this.pushToTalkEnabled = false;
+      this.canInterrupt = config.transcription.can_interrupt;
+    }
+    else {
+      throw new Error(`Unknown trigger: ${config.transcription.trigger}`);
+    }
+    this._initializeVAD();
    // Bind the websocket message callbacks
    this.ws.onmessage = this._handleWebSocketMessage;
    this.ws.onopen = () => {