@layercode/js-sdk 1.0.13 → 1.0.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3474,7 +3474,7 @@ class LayercodeClient {
  sessionId: options.sessionId || null,
  authorizeSessionEndpoint: options.authorizeSessionEndpoint,
  metadata: options.metadata || {},
- vadEnabled: options.vadEnabled || true,
+ vadResumeDelay: options.vadResumeDelay || 500,
  onConnect: options.onConnect || (() => { }),
  onDisconnect: options.onDisconnect || (() => { }),
  onError: options.onError || (() => { }),
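The removed vadEnabled flag was effectively dead: the `options.vadEnabled || true` default meant it could never be switched off. 1.0.15 drops it in favour of a vadResumeDelay option (milliseconds, default 500), intended for the misfire handler's delayed resume (the delayed path is present but commented out in this release, see the third hunk below). A minimal usage sketch, assuming a default export from '@layercode/js-sdk' and an example authorize endpoint; only the option names visible in this hunk are taken from the diff:

// Sketch only; the import shape is an assumption and the endpoint value is an example.
import LayercodeClient from '@layercode/js-sdk';

const client = new LayercodeClient({
  authorizeSessionEndpoint: '/api/authorize-session', // example value
  sessionId: null,                 // unchanged from 1.0.13
  metadata: {},                    // unchanged from 1.0.13
  vadResumeDelay: 750,             // new in 1.0.15; defaults to 500 ms when omitted
  onConnect: () => console.log('connected'),
  onDisconnect: () => console.log('disconnected'),
  onError: (err) => console.error(err),
});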
@@ -3491,16 +3491,31 @@ class LayercodeClient {
  sampleRate: 16000, // TODO should be set my fetched pipeline config
  });
  this.vad = null;
- if (this.options.vadEnabled) {
+ this.ws = null;
+ this.status = 'disconnected';
+ this.userAudioAmplitude = 0;
+ this.agentAudioAmplitude = 0;
+ this.sessionId = options.sessionId || null;
+ this.pushToTalkActive = false;
+ this.vadPausedPlayer = false;
+ this.pushToTalkEnabled = false;
+ this.canInterrupt = false;
+ // Bind event handlers
+ this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
+ this._handleDataAvailable = this._handleDataAvailable.bind(this);
+ }
+ _initializeVAD() {
+ console.log('initializing VAD', { pushToTalkEnabled: this.pushToTalkEnabled, canInterrupt: this.canInterrupt });
+ if (!this.pushToTalkEnabled && this.canInterrupt) {
  dist.MicVAD.new({
  stream: this.wavRecorder.getStream() || undefined,
  model: 'v5',
  // baseAssetPath: '/', // Use if bundling model locally
  // onnxWASMBasePath: '/', // Use if bundling model locally
- positiveSpeechThreshold: 0.4,
- negativeSpeechThreshold: 0.4,
- redemptionFrames: 8,
- minSpeechFrames: 20,
+ positiveSpeechThreshold: 0.3,
+ negativeSpeechThreshold: 0.2,
+ redemptionFrames: 25, // Number of frames of silence before onVADMisfire or onSpeechEnd is called. Effectively a delay before restarting.
+ minSpeechFrames: 15,
  preSpeechPadFrames: 0,
  onSpeechStart: () => {
  // Only pause agent audio if it's currently playing
@@ -3511,23 +3526,35 @@ class LayercodeClient {
  }
  else {
  console.log('onSpeechStart: WavPlayer is not playing, VAD will not pause.');
- this.vadPausedPlayer = false;
  }
  },
  onVADMisfire: () => {
- // If the speech detected was for less than minSpeechFrames, this is called instead of onSpeechEnd, and we should resume the assistant audio as it was a false interruption
+ // If the speech detected was for less than minSpeechFrames, this is called instead of onSpeechEnd, and we should resume the assistant audio as it was a false interruption. We include a configurable delay so the assistant isn't too quick to start speaking again.
  if (this.vadPausedPlayer) {
- console.log('onVADMisfire: VAD paused the player, attempting to resume.');
+ console.log('onSpeechEnd: VAD paused the player, resuming');
  this.wavPlayer.play();
  this.vadPausedPlayer = false; // Reset flag
+ // Option to extend delay in the case where the transcriber takes longer to detect a new turn
+ // console.log('onVADMisfire: VAD paused the player, resuming in ' + this.options.vadResumeDelay + 'ms');
+ // // Add configurable delay before resuming playback
+ // setTimeout(() => {
+ // this.wavPlayer.play();
+ // this.vadPausedPlayer = false; // Reset flag
+ // }, this.options.vadResumeDelay);
  }
  else {
  console.log('onVADMisfire: VAD did not pause the player, no action taken to resume.');
  }
  },
- onSpeechEnd: () => {
- // We don't take any action here, as the user speech is for more than minSpeechFrames and is very likely to result in a new turn start from the transcriber
- },
+ // onSpeechEnd: () => {
+ // if (this.vadPausedPlayer) {
+ // console.log('onSpeechEnd: VAD paused the player, resuming');
+ // this.wavPlayer.play();
+ // this.vadPausedPlayer = false; // Reset flag
+ // } else {
+ // console.log('onSpeechEnd: VAD did not pause the player, not resuming.');
+ // }
+ // },
  })
  .then((vad) => {
  this.vad = vad;
@@ -3538,16 +3565,6 @@ class LayercodeClient {
  console.error('Error initializing VAD:', error);
  });
  }
- this.ws = null;
- this.status = 'disconnected';
- this.userAudioAmplitude = 0;
- this.agentAudioAmplitude = 0;
- this.sessionId = options.sessionId || null;
- this.pushToTalkActive = false;
- this.vadPausedPlayer = false;
- // Bind event handlers
- this._handleWebSocketMessage = this._handleWebSocketMessage.bind(this);
- this._handleDataAvailable = this._handleDataAvailable.bind(this);
  }
  /**
  * Updates the connection status and triggers the callback
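For a rough sense of what the retuned frame counts mean in wall-clock time: assuming @ricky0123/vad-web's v5 model processes 512-sample frames at the 16 kHz capture rate used above (an assumption about the library, not stated in this diff), one frame covers about 32 ms:

// Back-of-the-envelope conversion of the 1.0.15 VAD settings into milliseconds.
// Assumes 512 samples per frame at 16 kHz (~32 ms/frame); adjust if the library differs.
const frameMs = (512 / 16000) * 1000;   // 32 ms per frame
const redemptionMs = 25 * frameMs;      // ~800 ms of silence before onVADMisfire/onSpeechEnd fires
const minSpeechMs = 15 * frameMs;       // ~480 ms of speech required to avoid a misfire
console.log({ frameMs, redemptionMs, minSpeechMs });

Under the same assumption, 1.0.13's values were roughly 256 ms of redemption (8 frames) and 640 ms of minimum speech (20 frames), so the new tuning waits longer before declaring an interruption over but accepts shorter utterances as genuine.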
@@ -3607,10 +3624,9 @@ class LayercodeClient {
  // Sent from the server to this client when a new user turn is detected
  console.log('received turn.start from server');
  console.log(message);
- // if (message.role === 'user' && !this.pushToTalkActive) {
- if (message.role === 'user') {
+ if (message.role === 'user' && !this.pushToTalkEnabled && this.canInterrupt) {
  // Interrupt any playing assistant audio if this is a turn trigged by the server (and not push to talk, which will have already called interrupt)
- console.log('interrupting assistant audio, as user turn has started and pushToTalkActive is false');
+ console.log('interrupting assistant audio, as user turn has started and pushToTalkEnabled is false');
  await this._clientInterruptAssistantReplay();
  }
  // if (message.role === 'assistant') {
@@ -3724,6 +3740,19 @@ class LayercodeClient {
  this.ws = new WebSocket(`${this._websocketUrl}?${new URLSearchParams({
  client_session_key: authorizeSessionResponseBody.client_session_key,
  })}`);
+ const config = authorizeSessionResponseBody.config;
+ console.log('config', config);
+ if (config.transcription.trigger === 'push_to_talk') {
+ this.pushToTalkEnabled = true;
+ }
+ else if (config.transcription.trigger === 'automatic') {
+ this.pushToTalkEnabled = false;
+ this.canInterrupt = config.transcription.can_interrupt;
+ }
+ else {
+ throw new Error(`Unknown trigger: ${config.transcription.trigger}`);
+ }
+ this._initializeVAD();
  // Bind the websocket message callbacks
  this.ws.onmessage = this._handleWebSocketMessage;
  this.ws.onopen = () => {
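The authorize-session response is now expected to carry a config object next to client_session_key. The shape below is inferred only from the fields read in this hunk; any other fields the API returns are unknown:

// Inferred response shape; placeholder values, and only the fields read in this hunk are shown.
const authorizeSessionResponseBody = {
  client_session_key: '<client_session_key>', // used to build the WebSocket URL
  config: {
    transcription: {
      trigger: 'automatic',    // or 'push_to_talk'; any other value throws "Unknown trigger"
      can_interrupt: true,     // read only when trigger is 'automatic'
    },
  },
};

_initializeVAD() is only called after this parsing, and (per the earlier hunks) only creates the MicVAD instance when push-to-talk is disabled and can_interrupt is true.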