@layercode/js-sdk 1.0.14 → 1.0.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3480,7 +3480,6 @@ registerProcessor('audio_processor', AudioProcessor);
  sessionId: options.sessionId || null,
  authorizeSessionEndpoint: options.authorizeSessionEndpoint,
  metadata: options.metadata || {},
- vadEnabled: options.vadEnabled || true,
  vadResumeDelay: options.vadResumeDelay || 500,
  onConnect: options.onConnect || (() => { }),
  onDisconnect: options.onDisconnect || (() => { }),
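Worth noting about the removed line: `options.vadEnabled || true` always evaluates to `true`, because `||` falls back whenever the left-hand side is falsy, including an explicit `false`, so the option could never actually disable VAD. In 1.0.16 the option is removed entirely and VAD setup is instead driven by the transcription config fetched during session authorization (see the final hunk). A minimal illustration of the `||` pitfall:

    // `||` ignores an explicit `false`, so the old default made the option a no-op
    const vadEnabled = false || true;   // true — the caller's choice is lost
    // a nullish-coalescing default would have respected it
    const respected = false ?? true;    // false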
@@ -3498,7 +3497,67 @@ registerProcessor('audio_processor', AudioProcessor);
  sampleRate: 16000, // TODO should be set my fetched pipeline config
  });
  this.vad = null;
- if (this.options.vadEnabled) {
+ this.ws = null;
+ this.status = 'disconnected';
+ this.userAudioAmplitude = 0;
+ this.agentAudioAmplitude = 0;
+ this.sessionId = options.sessionId || null;
+ this.pushToTalkActive = false;
+ this.vadPausedPlayer = false;
+ this.pushToTalkEnabled = false;
+ this.canInterrupt = false;
+ this.userIsSpeaking = false;
+ // Bind event handlers
+ this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
+ this._handleDataAvailable = this._handleDataAvailable.bind(this);
+ }
+ _initializeVAD() {
+ console.log('initializing VAD', { pushToTalkEnabled: this.pushToTalkEnabled, canInterrupt: this.canInterrupt });
+ // If we're in push to talk mode, we don't need to use the VAD model
+ if (this.pushToTalkEnabled) {
+ return;
+ }
+ const timeout = setTimeout(() => {
+ console.log('silero vad model timeout');
+ // TODO: send message to server to indicate that the vad model timed out
+ this.userIsSpeaking = true; // allow audio to be sent to the server
+ }, 2000);
+ if (!this.canInterrupt) {
+ dist.MicVAD.new({
+ stream: this.wavRecorder.getStream() || undefined,
+ model: 'v5',
+ positiveSpeechThreshold: 0.3,
+ negativeSpeechThreshold: 0.2,
+ redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
+ minSpeechFrames: 15,
+ preSpeechPadFrames: 0,
+ onSpeechStart: () => {
+ if (!this.wavPlayer.isPlaying) {
+ this.userIsSpeaking = true;
+ }
+ },
+ onVADMisfire: () => {
+ this.userIsSpeaking = false;
+ },
+ onSpeechEnd: () => {
+ this.userIsSpeaking = false;
+ this._wsSend({
+ type: 'vad_events',
+ event: 'vad_end',
+ });
+ },
+ })
+ .then((vad) => {
+ clearTimeout(timeout);
+ this.vad = vad;
+ this.vad.start();
+ console.log('VAD started');
+ })
+ .catch((error) => {
+ console.error('Error initializing VAD:', error);
+ });
+ }
+ else {
  dist.MicVAD.new({
  stream: this.wavRecorder.getStream() || undefined,
  model: 'v5',
@@ -3519,36 +3578,35 @@ registerProcessor('audio_processor', AudioProcessor);
  else {
  console.log('onSpeechStart: WavPlayer is not playing, VAD will not pause.');
  }
+ this.userIsSpeaking = true;
+ console.log('onSpeechStart: sending vad_start');
+ this._wsSend({
+ type: 'vad_events',
+ event: 'vad_start',
+ });
  },
  onVADMisfire: () => {
  // If the speech detected was for less than minSpeechFrames, this is called instead of onSpeechEnd, and we should resume the assistant audio as it was a false interruption. We include a configurable delay so the assistant isn't too quick to start speaking again.
+ this.userIsSpeaking = false;
  if (this.vadPausedPlayer) {
  console.log('onSpeechEnd: VAD paused the player, resuming');
  this.wavPlayer.play();
  this.vadPausedPlayer = false; // Reset flag
- // Option to extend delay in the case where the transcriber takes longer to detect a new turn
- // console.log('onVADMisfire: VAD paused the player, resuming in ' + this.options.vadResumeDelay + 'ms');
- // // Add configurable delay before resuming playback
- // setTimeout(() => {
- // this.wavPlayer.play();
- // this.vadPausedPlayer = false; // Reset flag
- // }, this.options.vadResumeDelay);
  }
  else {
  console.log('onVADMisfire: VAD did not pause the player, no action taken to resume.');
  }
  },
- // onSpeechEnd: () => {
- // if (this.vadPausedPlayer) {
- // console.log('onSpeechEnd: VAD paused the player, resuming');
- // this.wavPlayer.play();
- // this.vadPausedPlayer = false; // Reset flag
- // } else {
- // console.log('onSpeechEnd: VAD did not pause the player, not resuming.');
- // }
- // },
+ onSpeechEnd: () => {
+ this.userIsSpeaking = false;
+ this._wsSend({
+ type: 'vad_events',
+ event: 'vad_end',
+ });
+ },
  })
  .then((vad) => {
+ clearTimeout(timeout);
  this.vad = vad;
  this.vad.start();
  console.log('VAD started');
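The VAD setup has moved out of the constructor into the new `_initializeVAD()` method, which picks between two MicVAD configurations. A condensed sketch of the three resulting paths (summarised from the hunks above, not additional SDK code):

    // Behaviours selected in _initializeVAD(), condensed from the diff above:
    if (this.pushToTalkEnabled) {
      // no VAD model at all; pushToTalkActive alone gates outgoing audio
    } else if (!this.canInterrupt) {
      // VAD tracks userIsSpeaking (only while the assistant player is idle)
      // and sends vad_end; assistant playback is never paused
    } else {
      // VAD pauses the assistant player on speech start, sends vad_start/vad_end,
      // and resumes playback after a misfire (speech shorter than minSpeechFrames)
    }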
@@ -3557,16 +3615,6 @@ registerProcessor('audio_processor', AudioProcessor);
  console.error('Error initializing VAD:', error);
  });
  }
- this.ws = null;
- this.status = 'disconnected';
- this.userAudioAmplitude = 0;
- this.agentAudioAmplitude = 0;
- this.sessionId = options.sessionId || null;
- this.pushToTalkActive = false;
- this.vadPausedPlayer = false;
- // Bind event handlers
- this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
- this._handleDataAvailable = this._handleDataAvailable.bind(this);
  }
  /**
  * Updates the connection status and triggers the callback
@@ -3626,10 +3674,9 @@ registerProcessor('audio_processor', AudioProcessor);
  // Sent from the server to this client when a new user turn is detected
  console.log('received turn.start from server');
  console.log(message);
- // if (message.role === 'user' && !this.pushToTalkActive) {
- if (message.role === 'user') {
+ if (message.role === 'user' && !this.pushToTalkEnabled && this.canInterrupt) {
  // Interrupt any playing assistant audio if this is a turn trigged by the server (and not push to talk, which will have already called interrupt)
- console.log('interrupting assistant audio, as user turn has started and pushToTalkActive is false');
+ console.log('interrupting assistant audio, as user turn has started and pushToTalkEnabled is false');
  await this._clientInterruptAssistantReplay();
  }
  // if (message.role === 'assistant') {
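The user-turn handler is correspondingly narrower: in 1.0.14 any user `turn.start` interrupted assistant playback, while 1.0.16 only interrupts for automatic-trigger pipelines that allow interruption:

    // 1.0.14
    if (message.role === 'user') { await this._clientInterruptAssistantReplay(); }
    // 1.0.16
    if (message.role === 'user' && !this.pushToTalkEnabled && this.canInterrupt) {
      await this._clientInterruptAssistantReplay();
    }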
@@ -3667,10 +3714,13 @@ registerProcessor('audio_processor', AudioProcessor);
  _handleDataAvailable(data) {
  try {
  const base64 = arrayBufferToBase64(data.mono);
- this._wsSend({
- type: 'client.audio',
- content: base64,
- });
+ const sendAudio = this.pushToTalkEnabled ? this.pushToTalkActive : this.userIsSpeaking;
+ if (sendAudio) {
+ this._wsSend({
+ type: 'client.audio',
+ content: base64,
+ });
+ }
  }
  catch (error) {
  console.error('Error processing audio:', error);
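Microphone audio is no longer streamed unconditionally; each captured chunk is sent only when the active mode says the user is talking:

    // push-to-talk pipelines send while the talk control is held,
    // automatic pipelines send while the local VAD reports speech
    const sendAudio = this.pushToTalkEnabled ? this.pushToTalkActive : this.userIsSpeaking;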
@@ -3743,6 +3793,19 @@ registerProcessor('audio_processor', AudioProcessor);
  this.ws = new WebSocket(`${this._websocketUrl}?${new URLSearchParams({
  client_session_key: authorizeSessionResponseBody.client_session_key,
  })}`);
+ const config = authorizeSessionResponseBody.config;
+ console.log('config', config);
+ if (config.transcription.trigger === 'push_to_talk') {
+ this.pushToTalkEnabled = true;
+ }
+ else if (config.transcription.trigger === 'automatic') {
+ this.pushToTalkEnabled = false;
+ this.canInterrupt = config.transcription.can_interrupt;
+ }
+ else {
+ throw new Error(`Unknown trigger: ${config.transcription.trigger}`);
+ }
+ this._initializeVAD();
  // Bind the websocket message callbacks
  this.ws.onmessage = this._handleWebSocketMessage;
  this.ws.onopen = () => {
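The push-to-talk and interruption flags now come from the session-authorization response rather than from constructor options. The shape implied by the fields read above (inferred from this diff only, not an official schema) looks roughly like:

    // hypothetical example response body, reconstructed from the property accesses above
    const authorizeSessionResponseBody = {
      client_session_key: '...',
      config: {
        transcription: {
          trigger: 'automatic',   // or 'push_to_talk'; anything else throws
          can_interrupt: true,    // read only when trigger === 'automatic'
        },
      },
    };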