@layercode/js-sdk 1.0.13 → 1.0.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3480,7 +3480,7 @@ registerProcessor('audio_processor', AudioProcessor);
3480
3480
  sessionId: options.sessionId || null,
3481
3481
  authorizeSessionEndpoint: options.authorizeSessionEndpoint,
3482
3482
  metadata: options.metadata || {},
3483
- vadEnabled: options.vadEnabled || true,
3483
+ vadResumeDelay: options.vadResumeDelay || 500,
3484
3484
  onConnect: options.onConnect || (() => { }),
3485
3485
  onDisconnect: options.onDisconnect || (() => { }),
3486
3486
  onError: options.onError || (() => { }),
@@ -3497,16 +3497,31 @@ registerProcessor('audio_processor', AudioProcessor);
3497
3497
  sampleRate: 16000, // TODO should be set by fetched pipeline config
3498
3498
  });
3499
3499
  this.vad = null;
3500
- if (this.options.vadEnabled) {
3500
+ this.ws = null;
3501
+ this.status = 'disconnected';
3502
+ this.userAudioAmplitude = 0;
3503
+ this.agentAudioAmplitude = 0;
3504
+ this.sessionId = options.sessionId || null;
3505
+ this.pushToTalkActive = false;
3506
+ this.vadPausedPlayer = false;
3507
+ this.pushToTalkEnabled = false;
3508
+ this.canInterrupt = false;
3509
+ // Bind event handlers
3510
+ this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
3511
+ this._handleDataAvailable = this._handleDataAvailable.bind(this);
3512
+ }
3513
+ _initializeVAD() {
3514
+ console.log('initializing VAD', { pushToTalkEnabled: this.pushToTalkEnabled, canInterrupt: this.canInterrupt });
3515
+ if (!this.pushToTalkEnabled && this.canInterrupt) {
3501
3516
  dist.MicVAD.new({
3502
3517
  stream: this.wavRecorder.getStream() || undefined,
3503
3518
  model: 'v5',
3504
3519
  // baseAssetPath: '/', // Use if bundling model locally
3505
3520
  // onnxWASMBasePath: '/', // Use if bundling model locally
3506
- positiveSpeechThreshold: 0.4,
3507
- negativeSpeechThreshold: 0.4,
3508
- redemptionFrames: 8,
3509
- minSpeechFrames: 20,
3521
+ positiveSpeechThreshold: 0.3,
3522
+ negativeSpeechThreshold: 0.2,
3523
+ redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
3524
+ minSpeechFrames: 15,
3510
3525
  preSpeechPadFrames: 0,
3511
3526
  onSpeechStart: () => {
3512
3527
  // Only pause agent audio if it's currently playing
@@ -3517,23 +3532,35 @@ registerProcessor('audio_processor', AudioProcessor);
3517
3532
  }
3518
3533
  else {
3519
3534
  console.log('onSpeechStart: WavPlayer is not playing, VAD will not pause.');
3520
- this.vadPausedPlayer = false;
3521
3535
  }
3522
3536
  },
3523
3537
  onVADMisfire: () => {
3524
- // If the speech detected was for less than minSpeechFrames, this is called instead of onSpeechEnd, and we should resume the assistant audio as it was a false interruption
3538
+ // If the speech detected was for less than minSpeechFrames, this is called instead of onSpeechEnd, and we should resume the assistant audio as it was a false interruption. We include a configurable delay so the assistant isn't too quick to start speaking again.
3525
3539
  if (this.vadPausedPlayer) {
3526
- console.log('onVADMisfire: VAD paused the player, attempting to resume.');
3540
+ console.log('onSpeechEnd: VAD paused the player, resuming');
3527
3541
  this.wavPlayer.play();
3528
3542
  this.vadPausedPlayer = false; // Reset flag
3543
+ // Option to extend delay in the case where the transcriber takes longer to detect a new turn
3544
+ // console.log('onVADMisfire: VAD paused the player, resuming in ' + this.options.vadResumeDelay + 'ms');
3545
+ // // Add configurable delay before resuming playback
3546
+ // setTimeout(() => {
3547
+ // this.wavPlayer.play();
3548
+ // this.vadPausedPlayer = false; // Reset flag
3549
+ // }, this.options.vadResumeDelay);
3529
3550
  }
3530
3551
  else {
3531
3552
  console.log('onVADMisfire: VAD did not pause the player, no action taken to resume.');
3532
3553
  }
3533
3554
  },
3534
- onSpeechEnd: () => {
3535
- // We don't take any action here, as the user speech is for more than minSpeechFrames and is very likely to result in a new turn start from the transcriber
3536
- },
3555
+ // onSpeechEnd: () => {
3556
+ // if (this.vadPausedPlayer) {
3557
+ // console.log('onSpeechEnd: VAD paused the player, resuming');
3558
+ // this.wavPlayer.play();
3559
+ // this.vadPausedPlayer = false; // Reset flag
3560
+ // } else {
3561
+ // console.log('onSpeechEnd: VAD did not pause the player, not resuming.');
3562
+ // }
3563
+ // },
3537
3564
  })
3538
3565
  .then((vad) => {
3539
3566
  this.vad = vad;
@@ -3544,16 +3571,6 @@ registerProcessor('audio_processor', AudioProcessor);
3544
3571
  console.error('Error initializing VAD:', error);
3545
3572
  });
3546
3573
  }
3547
- this.ws = null;
3548
- this.status = 'disconnected';
3549
- this.userAudioAmplitude = 0;
3550
- this.agentAudioAmplitude = 0;
3551
- this.sessionId = options.sessionId || null;
3552
- this.pushToTalkActive = false;
3553
- this.vadPausedPlayer = false;
3554
- // Bind event handlers
3555
- this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
3556
- this._handleDataAvailable = this._handleDataAvailable.bind(this);
3557
3574
  }
3558
3575
  /**
3559
3576
  * Updates the connection status and triggers the callback
@@ -3613,10 +3630,9 @@ registerProcessor('audio_processor', AudioProcessor);
3613
3630
  // Sent from the server to this client when a new user turn is detected
3614
3631
  console.log('received turn.start from server');
3615
3632
  console.log(message);
3616
- // if (message.role === 'user' && !this.pushToTalkActive) {
3617
- if (message.role === 'user') {
3633
+ if (message.role === 'user' && !this.pushToTalkEnabled && this.canInterrupt) {
3618
3634
  // Interrupt any playing assistant audio if this is a turn triggered by the server (and not push to talk, which will have already called interrupt)
3619
- console.log('interrupting assistant audio, as user turn has started and pushToTalkActive is false');
3635
+ console.log('interrupting assistant audio, as user turn has started and pushToTalkEnabled is false');
3620
3636
  await this._clientInterruptAssistantReplay();
3621
3637
  }
3622
3638
  // if (message.role === 'assistant') {
@@ -3730,6 +3746,19 @@ registerProcessor('audio_processor', AudioProcessor);
3730
3746
  this.ws = new WebSocket(`${this._websocketUrl}?${new URLSearchParams({
3731
3747
  client_session_key: authorizeSessionResponseBody.client_session_key,
3732
3748
  })}`);
3749
+ const config = authorizeSessionResponseBody.config;
3750
+ console.log('config', config);
3751
+ if (config.transcription.trigger === 'push_to_talk') {
3752
+ this.pushToTalkEnabled = true;
3753
+ }
3754
+ else if (config.transcription.trigger === 'automatic') {
3755
+ this.pushToTalkEnabled = false;
3756
+ this.canInterrupt = config.transcription.can_interrupt;
3757
+ }
3758
+ else {
3759
+ throw new Error(`Unknown trigger: ${config.transcription.trigger}`);
3760
+ }
3761
+ this._initializeVAD();
3733
3762
  // Bind the websocket message callbacks
3734
3763
  this.ws.onmessage = this._handleWebSocketMessage;
3735
3764
  this.ws.onopen = () => {