npm - @chat21/chat21-web-widget - Versions diffs - 5.1.32-rc9 → 5.1.33-rc11 - Mend

@chat21/chat21-web-widget 5.1.32-rc9 → 5.1.33-rc11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (246) hide show

package/src/app/providers/voice/voice.service.ts CHANGED Viewed

@@ -1,9 +1,10 @@
 import { Inject, Injectable, Optional } from '@angular/core';
 import type { MicVAD } from '@ricky0123/vad-web';
 import { getDefaultRealTimeVADOptions } from '@ricky0123/vad-web';
-import { BehaviorSubject, Observable, Subject } from 'rxjs';
+import { BehaviorSubject, Observable, Subject, Subscription } from 'rxjs';
 import { LoggerInstance } from 'src/chat21-core/providers/logger/loggerInstance';
 import { LoggerService } from 'src/chat21-core/providers/abstract/logger.service';
+import { Globals } from 'src/app/utils/globals';
 import {
   DEFAULT_VOICE_MEDIA_STREAM_CONSTRAINTS,
@@ -12,12 +13,22 @@ import {
 } from './audio.types';
 import { SpeechToTextProvider } from './STT&TTS/speech-provider.abstract';
 import { VadService } from './vad.service';
+import { VoiceStreamingService } from './voice-streaming.service';
+import {
+  VoiceTtsKaraokeFrame,
+  VoiceTtsKaraokeWord,
+  VoiceStreamingSessionConfig,
+  VoiceWsControlMessage,
+} from './voice-streaming.types';
+import { TtsAudioPlaybackCoordinator } from '../tts-audio-playback-coordinator.service';
 const VOICE_RECORDING_MIME = 'audio/webm';
 /**
- * Voce: VadService (ONNX WASM) → MicVAD → MediaRecorder su ogni segmento parlato.
- * Opzionalmente STT (`SpeechToTextProvider`) arricchisce il payload con `transcript`.
+ * Due modalità:
+ * - **Ingresso WSS** (`voiceIngressStream`): microfono → proxy in streaming; niente VAD locale — silenzio/turni gestiti dal server.
+ *   Eventi `transcript` / TTS binario arrivano sulla WSS.
+ * - **Legacy**: MicVAD + segmenti WebM (upload/STT client-side) se non passi `voiceIngressStream`.
  */
 @Injectable({ providedIn: 'root' })
 export class VoiceService {
@@ -28,52 +39,234 @@ export class VoiceService {
   private sessionConstraints: MediaStreamConstraints = DEFAULT_VOICE_MEDIA_STREAM_CONSTRAINTS;
   private onRecordingComplete?: (result: VoiceSegmentPayload) => void;
   private enableTranscription = true;
+  private voiceIngressConfig?: VoiceStreamingSessionConfig | null = null;
   private readonly audioSegmentSubject = new Subject<VoiceSegmentPayload>();
-  /** Emesso a ogni fine segmento parlato: audio WebM + opzionalmente `transcript` / `transcriptionError`. */
-  readonly audioSegment$: Observable<VoiceSegmentPayload> = this.audioSegmentSubject.asObservable();
   private readonly speechStartSubject = new Subject<void>();
   /** Emesso quando il microfono intercetta parlato (VAD speech start). */
   readonly speechStart$: Observable<void> = this.speechStartSubject.asObservable();
-  // 🔊 REALTIME VOLUME STREAM
+  private readonly speechEndSubject = new Subject<void>();
+  /** Emesso quando il parlato termina (VAD speech end). */
+  readonly speechEnd$: Observable<void> = this.speechEndSubject.asObservable();
+  /** Trascrizione dall’evento WSS `transcript` (proxy). */
+  private readonly voiceTranscriptSubject = new Subject<{ text: string; isFinal: boolean }>();
+  readonly voiceTranscript$: Observable<{ text: string; isFinal: boolean }> = this.voiceTranscriptSubject.asObservable();
+  /** Testo TTS in riproduzione, emesso dall'evento WSS `speaking` (proxy). */
+  private readonly voiceTtsTextSubject = new Subject<string>();
+  /** Emette il testo del bot che sta per essere riprodotto come audio TTS. */
+  readonly voiceTtsText$: Observable<string> = this.voiceTtsTextSubject.asObservable();
+  /** Errore applicativo dal proxy (evento `error`): testo descrittivo del problema. */
+  private readonly _wsError$ = new Subject<string>();
+  readonly wsError$: Observable<string> = this._wsError$.asObservable();
   private readonly volumeSubject = new BehaviorSubject<number>(0);
   readonly volume$: Observable<number> = this.volumeSubject.asObservable();
+  /**
+   * Emits `true` while a WSS voice-proxy session is active.
+   * Used to suppress the tiledesk-server TTS playback path (audio-sync component)
+   * when the speech-proxy is already handling TTS over the WebSocket binary channel.
+   */
+  private readonly _isWssVoiceActive$ = new BehaviorSubject<boolean>(false);
+  readonly isWssVoiceActive$: Observable<boolean> = this._isWssVoiceActive$.asObservable();
+  get isWssVoiceActive(): boolean { return this._isWssVoiceActive$.getValue(); }
+  /**
+   * UIDs of TTS messages that were played by the speech-proxy during an active voice session.
+   * These messages must never be replayed by audio-sync after the session ends.
+   */
+  private readonly _proxyHandledTtsIds = new Set<string>();
+  /**
+   * Records that the TTS message identified by `uid` was played by the speech-proxy.
+   * Falsy UIDs (for example the empty string) are ignored.
+   */
+  markProxyHandled(uid: string): void {
+    if (!uid) {
+      return;
+    }
+    this._proxyHandledTtsIds.add(uid);
+  }
+  /**
+   * Tells whether `uid` was previously registered through `markProxyHandled`, i.e. the message
+   * was already played by the proxy and should not be replayed.
+   * Returns `false` for a missing or empty UID.
+   */
+  wasProxyHandled(uid: string | undefined): boolean {
+    if (!uid) {
+      return false;
+    }
+    return this._proxyHandledTtsIds.has(uid);
+  }
+  // 🎙️ TTS GATE — suppresses segment emission while TTS is playing
+  private isTTSActive = false;
+  private ttsGateSub?: Subscription;
+  private wsControlSub?: Subscription;
+  private ttsChunkSub?: Subscription;
+  // 🚫 ACQUISITION GATE — pauses VAD from speech-end until TTS response cycle completes
+  private isWaitingForResponse = false;
+  private responseTimeoutId?: ReturnType<typeof setTimeout>;
+  private readonly _isAcquisitionBlocked$ = new BehaviorSubject<boolean>(false);
+  /** Emits `true` from user speech-end until VAD resumes after TTS finishes; drives the grey orb. */
+  readonly isAcquisitionBlocked$: Observable<boolean> = this._isAcquisitionBlocked$.asObservable();
   // 🎧 AUDIO ANALYSER
   private audioContext?: AudioContext;
   private analyser?: AnalyserNode;
   /** Buffer dedicato (`ArrayBuffer`) per compatibilità con `getByteFrequencyData`. */
   private dataArray?: Uint8Array;
+  /** RAF ID for volume loop - used to cancel on cleanup */
+  private volumeRafId?: number;
+  /** Riproduzione chunk TTS binari dal proxy (Web Audio). */
+  private ttsPlayContext?: AudioContext;
+  private ttsNextPlayTime = 0;
+  // Tracks how many TTS audio sources are still decoding or playing.
+  // Incremented synchronously when a binary chunk arrives (before decodeAudioData).
+  // Decremented in src.onended (or on decode error).
+  private _activeTtsSources = 0;
+  // References to active AudioBufferSourceNodes so they can be stopped on preemption.
+  private _activeTtsSourceNodes: AudioBufferSourceNode[] = [];
+  // Monotonic counter incremented every time all in-flight TTS audio is invalidated
+  // (barge_in or a new speaking event).  playWsTtsChunk captures this at entry and
+  // checks it after the async decodeAudioData call to discard stale results.
+  private _ttsGeneration = 0;
+  // ── Ordered-scheduling state ──────────────────────────────────────────────────────────────────
+  // Chunks arrive over WebSocket and their decodeAudioData calls run concurrently.
+  // Because a smaller/later chunk can decode faster than a larger/earlier one, scheduling
+  // based solely on decode-completion order causes audio to play out of arrival order
+  // (e.g. "manuale" starts before "scrittura" even though it arrived after it).
+  // Fix: assign a monotonic sequence number on arrival, decode in parallel, but only
+  // schedule a buffer once every preceding buffer has already been scheduled.
+  private _ttsChunkSeq = 0;       // Incremented on each chunk arrival (arrival order)
+  private _ttsScheduledSeq = 0;   // Next sequence slot that is allowed to be scheduled
+  // Decoded buffers waiting for their turn to be scheduled (keyed by arrival sequence)
+  private _ttsDecodedPending = new Map<number, AudioBuffer>();
+  // ─────────────────────────────────────────────────────────────────────────────────────────────
+  // Set to true by the 'done' event; triggers acquisition unblock once all sources end.
+  private _unblockAfterTts = false;
+  private _unblockSafetyTimer: ReturnType<typeof setTimeout> | null = null;
+  // Fallback timer started after sendPlaybackComplete. If the proxy does not reply
+  // with 'listening' within the timeout window, the UI is force-unblocked so the
+  // user is not left stuck waiting indefinitely.
+  private _listeningFallbackTimer: ReturnType<typeof setTimeout> | null = null;
+  // Track when the last TTS chunk is expected to finish playing.
+  // Used to calculate a proper safety timer duration for long messages.
+  private _ttsExpectedEndTime = 0;
+  // ── WSS TTS Karaoke ──────────────────────────────────────────────────────────────────────────
+  private _kText = '';
+  private _kWords: Array<VoiceTtsKaraokeWord & { start: number; end: number }> = [];
+  private _kStartContextTime = 0;
+  private _kDuration = 0;
+  private _kRafId?: number;
+  private _kLastActiveIndex = -2;
+  private readonly _voiceTtsKaraokeSubject = new Subject<VoiceTtsKaraokeFrame>();
+  /** Emits word-state frames while WebSocket TTS audio plays; drives the karaoke highlight in bubble-message. */
+  readonly voiceTtsKaraoke$: Observable<VoiceTtsKaraokeFrame> = this._voiceTtsKaraokeSubject.asObservable();
+  // ─────────────────────────────────────────────────────────────────────────────────────────────
+  // ── Thinking / typing-indicator sound ─────────────────────────────────────────────────────────
+  // Played on loop while the bot is thinking or the first TTS chunk hasn't arrived yet.
+  // Only active during WSS voice sessions (voice-proxy mode).
+  private _keyboardSoundEl: HTMLAudioElement | null = null;
+  // ─────────────────────────────────────────────────────────────────────────────────────────────
   private readonly logger: LoggerService = LoggerInstance.getInstance();
+  private readonly bufferTime = 200000; // used as max safety timer duration for long TTS messages
   constructor(
     private readonly vadService: VadService,
+    private readonly ttsPlayback: TtsAudioPlaybackCoordinator,
+    private readonly voiceStreaming: VoiceStreamingService,
     @Optional() @Inject(SpeechToTextProvider) private readonly speechToText: SpeechToTextProvider | null,
+    private readonly globals: Globals,
   ) {}
   get isSessionActive(): boolean {
     return !!this.vad || !!this.stream;
   }
+  /**
+   * Streaming TTS endpoint of the speech-proxy: the proxy HTTP base URL followed by
+   * `/api/tts/stream`. Evaluates to `null` when `voiceStreaming.proxyHttpBaseUrl` is falsy
+   * (no proxy configured).
+   * Intended for the audio-sync component, which redirects TTS calls from the tiledesk-server
+   * to the proxy.
+   */
+  get proxyTtsStreamUrl(): string | null {
+    const proxyBase = this.voiceStreaming.proxyHttpBaseUrl;
+    if (!proxyBase) {
+      return null;
+    }
+    return `${proxyBase}/api/tts/stream`;
+  }
+  /**
+   * TTS endpoint of the speech-proxy: the proxy HTTP base URL followed by `/api/tts`.
+   * Evaluates to `null` when `voiceStreaming.proxyHttpBaseUrl` is falsy.
+   */
+  get proxyTtsUrl(): string | null {
+    const proxyBase = this.voiceStreaming.proxyHttpBaseUrl;
+    if (!proxyBase) {
+      return null;
+    }
+    return `${proxyBase}/api/tts`;
+  }
   /**
    * Richiede il microfono, avvia VAD in ascolto (inizio/fine parlato) e registra in WebM per segmento.
    */
   async startSession(options: VoiceSessionStartOptions = {}): Promise<void> {
+    const mode = options.voiceIngressStream ? 'wss-proxy' : 'legacy-vad';
+    this.logger.info('[VoiceService] startSession', { mode });
     await this.stopSession();
     this.sessionConstraints = options.constraints ?? DEFAULT_VOICE_MEDIA_STREAM_CONSTRAINTS;
     this.onRecordingComplete = options.onRecordingComplete;
     this.enableTranscription = options.enableTranscription !== false;
+    this.voiceIngressConfig = options.voiceIngressStream;
-    await this.vadService.ensureOnnxRuntimeEnv();
+    if (this.voiceIngressConfig) {
+      await this.startWssVoiceSession();
+      return;
+    }
+    await this.startLegacyVadSession(options);
+  }
+  /**
+   * Starts a proxy-driven voice session: microphone capture, volume metering and the WSS stream
+   * (microphone audio upstream; control events and binary TTS chunks downstream). No local VAD runs.
+   *
+   * If subscribing or `voiceStreaming.start()` fails, everything acquired here is released
+   * (subscriptions, microphone tracks, volume loop, analyser AudioContext) and the error is rethrown.
+   *
+   * @throws whatever `getUserMedia` or `voiceStreaming.start()` rejects with.
+   */
+  private async startWssVoiceSession(): Promise<void> {
+    this.logger.info('[VoiceService] acquiring microphone for WSS session');
     this.stream = await navigator.mediaDevices.getUserMedia(this.sessionConstraints);
+    const tracks = this.stream.getAudioTracks();
+    this.logger.info('[VoiceService] microphone acquired', {
+      tracks: tracks.length,
+      label: tracks[0]?.label ?? '(unknown)',
+    });
     // 🎧 AUDIO ANALYSER INIT
     this.initAudioAnalyser(this.stream);
+    this.startVolumeLoop();
+    try {
+      // Subscribe before start() so early events (e.g. proxy 'error') are not lost.
+      this.wsControlSub = this.voiceStreaming.wsControl$.subscribe((msg) => this.onWsControl(msg));
+      this.ttsChunkSub = this.voiceStreaming.ttsBinaryChunk$.subscribe((buf) => void this.playWsTtsChunk(buf));
+      await this.voiceStreaming.start(this.voiceIngressConfig!, { sharedMediaStream: this.stream });
+      // Signal that the voice proxy is now live — suppresses tiledesk-server TTS.
+      this._isWssVoiceActive$.next(true);
+      this.logger.info('[VoiceService] WSS voice session started (no local VAD)');
+    } catch (e) {
+      this.wsControlSub?.unsubscribe();
+      this.wsControlSub = undefined;
+      this.ttsChunkSub?.unsubscribe();
+      this.ttsChunkSub = undefined;
+      this.voiceIngressConfig = null;
+      if (this.stream) {
+        this.stream.getTracks().forEach((t) => t.stop());
+        this.stream = undefined;
+      }
+      // Same cleanup order as stopSession(): cancel the volume RAF loop (presumably armed by
+      // startVolumeLoop() above) before the analyser it reads from is torn down.
+      if (this.volumeRafId) {
+        cancelAnimationFrame(this.volumeRafId);
+        this.volumeRafId = undefined;
+      }
+      // close() returns a promise; a rejection here must not mask the original error.
+      void this.audioContext?.close().catch(() => undefined);
+      this.audioContext = undefined;
+      this.analyser = undefined;
+      this.dataArray = undefined;
+      // Do not leave a stale non-zero level on volume$ after a failed start.
+      this.volumeSubject.next(0);
+      throw e;
+    }
+  }
+  /** VAD + segmenti (nessun ingresso WSS). */
+  private async startLegacyVadSession(options: VoiceSessionStartOptions): Promise<void> {
+    await this.vadService.ensureOnnxRuntimeEnv();
+    this.stream = await navigator.mediaDevices.getUserMedia(this.sessionConstraints);
+    this.initAudioAnalyser(this.stream);
     const vadDefaults = getDefaultRealTimeVADOptions('legacy');
@@ -92,10 +285,16 @@ export class VoiceService {
       },
       onSpeechEnd: () => {
         this.logger.log('[VoiceService] speech end');
+        this.speechEndSubject.next();
         this.stopMediaRecorderSegment();
+        // Pause VAD immediately — new recordings are blocked until the TTS response cycle completes.
+        this.isWaitingForResponse = true;
+        this._isAcquisitionBlocked$.next(true);
+        this.setResponseSafetyTimeout();
+        void this.vad?.pause();
       },
       minSpeechMs: 480,
-      redemptionMs: 1920,
+      redemptionMs: 800,//1920,
       preSpeechPadMs: 960,
     });
@@ -103,13 +302,427 @@ export class VoiceService {
     // 🔁 start volume loop
     this.startVolumeLoop();
+    // 🎙️ gate segments while TTS is playing; resume VAD when TTS cycle completes
+    this.ttsGateSub = this.ttsPlayback.isTTSPlaying$.subscribe((playing) => {
+      this.isTTSActive = playing;
+      this.logger.log('[VoiceService] TTS gate', playing ? 'closed (bot speaking)' : 'open (listening)');
+      if (!playing && this.isWaitingForResponse) {
+        this.resumeVadAfterResponse();
+      }
+    });
+  }
+  /**
+   * Handles one JSON control message received from the voice proxy over the WebSocket.
+   *
+   * Events handled:
+   * - `session_started`: logged only.
+   * - `listening`: cancels the listening fallback timer, stops the keyboard sound, unblocks
+   *   acquisition and resumes recording.
+   * - `transcript`: forwarded on `voiceTranscript$` unless acquisition is currently blocked.
+   * - `thinking`: blocks acquisition, pauses recording and starts the keyboard sound.
+   * - `speaking`: blocks acquisition, cancels TTS audio of the previous turn, emits the TTS text
+   *   and starts the karaoke highlight.
+   * - `done`: arms the unblock that fires once every TTS source has ended, guarded by a safety timer.
+   * - `error`: forwarded on `wsError$`.
+   * Any other event is logged as unhandled.
+   *
+   * @param msg control message; only `event`, `requestId`, `text`, `isFinal` and `message` are read.
+   */
+  private onWsControl(msg: VoiceWsControlMessage): void {
+    this.logger.log('[VoiceService] ← ws-control', msg.event, msg);
+    switch (msg.event) {
+      case 'session_started':
+        this.logger.log('[VoiceService] session_started', { requestId: msg.requestId ?? '' });
+        break;
+      case 'listening':
+        // Proxy confirmed it is in LISTENING state — unblock the UI and resume
+        // the MediaRecorder. Recording was paused on 'thinking' and must only
+        // restart here, after TTS playback has fully completed and the proxy
+        // is confirmed ready to receive audio again.
+        this._clearListeningFallbackTimer();
+        // If TTS never arrived (edge case) the keyboard sound would still be looping — stop it.
+        this._stopKeyboardSound();
+        this._isAcquisitionBlocked$.next(false);
+        this.voiceStreaming.resumeRecording();
+        this.logger.log('[VoiceService] listening – acquisition unblocked, recording resumed');
+        break;
+      case 'transcript': {
+        const text = typeof msg.text === 'string' ? msg.text : '';
+        const isFinal = !!msg.isFinal;
+        // Guard: if the proxy has already moved to PROCESSING (thinking) or SPEAKING,
+        // this transcript is a stale in-flight STT result. Discard it so it cannot
+        // override the blocked acquisition state or reach any downstream subscriber.
+        // 'thinking' is stronger than 'transcript' — state must not regress.
+        if (this._isAcquisitionBlocked$.value) {
+          this.logger.warn('[VoiceService] transcript discarded – arrived after thinking/speaking (stale STT result)', { text, isFinal });
+          break;
+        }
+        this.logger.log('[VoiceService] transcript', { text, isFinal });
+        this.voiceTranscriptSubject.next({ text, isFinal });
+        break;
+      }
+      case 'thinking':
+        // A new turn has started: a fallback timer left over from the previous turn must not
+        // fire later and force-unblock / resume recording in the middle of this one.
+        this._clearListeningFallbackTimer();
+        // Block acquisition UI while the bot processes the utterance.
+        // Pause the MediaRecorder so no audio chunks are sent to the proxy
+        // during PROCESSING state. Recording resumes only after the proxy
+        // confirms LISTENING (i.e. after TTS playback has fully finished).
+        this._isAcquisitionBlocked$.next(true);
+        this.voiceStreaming.pauseRecording();
+        // Play keyboard typing sound to mask the silence while the bot generates its response.
+        this._startKeyboardSound();
+        this.logger.log('[VoiceService] thinking – acquisition blocked, recording paused', { activeTtsSources: this._activeTtsSources });
+        break;
+      case 'speaking': {
+        // Same reasoning as for 'thinking': drop any fallback timer armed by the previous turn.
+        this._clearListeningFallbackTimer();
+        this._isAcquisitionBlocked$.next(true);
+        // Do NOT mute the microphone. The MediaStream is captured with
+        // echoCancellation: true, so the browser's AEC filters out the bot's
+        // speaker output before it reaches the MediaRecorder. Audio keeps
+        // flowing to the proxy so Flux can fire StartOfTurn when the user
+        // speaks, enabling server-side barge-in detection.
+        this._cancelAllTtsAudio();
+        // Reset TTS scheduling so new chunks play from now, not a stale future time.
+        this.ttsNextPlayTime = this.ttsPlayContext?.currentTime ?? 0;
+        // Reset expected end time for new TTS stream
+        this._ttsExpectedEndTime = 0;
+        const preview = typeof msg.text === 'string' ? msg.text.slice(0, 80) : '';
+        this.logger.log('[VoiceService] speaking – acquisition blocked, TTS text preview', { preview });
+        // Keep keyboard sound going (or start it as a fallback if 'thinking' was missed)
+        // until the first TTS audio chunk actually starts playing.
+        this._startKeyboardSound();
+        // Emit the text being spoken so UI can display it alongside the audio.
+        if (typeof msg.text === 'string' && msg.text) {
+          this.voiceTtsTextSubject.next(msg.text);
+          this._startTtsKaraoke(msg.text);
+        }
+        break;
+      }
+      case 'done': {
+        // Do not unblock immediately — the audio binary may still be decoding/playing, and binary
+        // TTS frames can even arrive after this JSON message. Arm the unblock so that
+        // _onTtsSourceEnded() triggers _flushTtsUnblock() when the last source ends.
+        this._unblockAfterTts = true;
+        const latencyBufferMs = 5000;
+        const noSourceSafetyMs = 10000;
+        let safetyMs: number;
+        if (this._activeTtsSources > 0) {
+          // Safety timer = time until the scheduled audio should end + 5 s for network/decode
+          // latency; never below 5 s and never above `bufferTime` (200 s) for very long messages.
+          const remainingMs = Math.max(0, this._ttsExpectedEndTime - Date.now());
+          safetyMs = Math.min(this.bufferTime, Math.max(latencyBufferMs, remainingMs + latencyBufferMs));
+          this.logger.log('[VoiceService] done – TTS still pending, waiting for all sources to end', {
+            activeTtsSources: this._activeTtsSources,
+            expectedEndInMs: remainingMs,
+            safetyTimerMs: safetyMs
+          });
+        } else {
+          // No source tracked yet: the timer is only a last resort in case no chunk arrives at all.
+          safetyMs = noSourceSafetyMs;
+          this.logger.log('[VoiceService] done – no active sources yet, arming unblock for in-flight chunks');
+        }
+        if (this._unblockSafetyTimer !== null) clearTimeout(this._unblockSafetyTimer);
+        this._unblockSafetyTimer = setTimeout(() => this._flushTtsUnblock(true), safetyMs);
+        break;
+      }
+      case 'error': {
+        const errorMsg = typeof msg.message === 'string' ? msg.message : 'Voice session error';
+        this.logger.error('[VoiceService] WSS error', errorMsg);
+        this._wsError$.next(errorMsg);
+        break;
+      }
+      default:
+        this.logger.warn('[VoiceService] unhandled ws-control event', msg.event);
+        break;
+    }
   }
+  /** Cancels the pending "proxy never answered with `listening`" fallback timer, if any. */
+  private _clearListeningFallbackTimer(): void {
+    if (this._listeningFallbackTimer !== null) {
+      clearTimeout(this._listeningFallbackTimer);
+      this._listeningFallbackTimer = null;
+    }
+  }
   /**
-   * @param options.discardInProgressSegment — non inviare STT/upload per il segmento WebM corrente (es. interruzione da messaggio in arrivo).
+   * Chunk TTS: ogni buffer deve essere decodificabile da `decodeAudioData` (es. segmento WebM/Opus completo).
+   *
+   * Decode-race fix: multiple chunks decode concurrently; a smaller/later chunk can finish
+   * decoding before a larger/earlier one, which would cause the AudioBufferSourceNode to be
+   * scheduled out of arrival order (e.g. "manuale" before "scrittura").  To prevent this, each
+   * chunk is assigned a monotonic sequence number on arrival and stored in _ttsDecodedPending
+   * after decoding.  _drainTtsDecodedBuffers() only advances the schedule when the next
+   * expected sequence slot is present, guaranteeing arrival-order playback regardless of decode speed.
    */
-  async stopSession(options?: { discardInProgressSegment?: boolean }): Promise<void> {
+  private async playWsTtsChunk(buf: ArrayBuffer): Promise<void> {
+    // Arrival-order slot, taken synchronously (before any await) so that concurrent decodes
+    // cannot reorder chunks.
+    const seq = this._ttsChunkSeq++;
+    // Generation at arrival time. _cancelAllTtsAudio() bumps _ttsGeneration, which marks every
+    // chunk still decoding as stale.
+    const capturedGeneration = this._ttsGeneration;
+    // Counted synchronously so that a 'done' control message handled while this chunk is still
+    // decoding sees a non-zero count.
+    this._activeTtsSources++;
+    this.logger.log('[VoiceService] TTS chunk received', { seq, bytes: buf.byteLength, activeTtsSources: this._activeTtsSources });
+    let ctx: AudioContext | undefined;
+    let audioBuf: AudioBuffer | undefined;
+    let decodeError: unknown;
+    try {
+      if (!this.ttsPlayContext || this.ttsPlayContext.state === 'closed') {
+        this.ttsPlayContext = new AudioContext();
+        this.ttsNextPlayTime = this.ttsPlayContext.currentTime;
+        this.logger.info('[VoiceService] TTS AudioContext created');
+      }
+      ctx = this.ttsPlayContext;
+      // Decode a copy: decodeAudioData detaches the ArrayBuffer it is given.
+      audioBuf = await ctx.decodeAudioData(buf.slice(0));
+    } catch (e) {
+      decodeError = e;
+    }
+    // Stale-chunk guard, applied to successful AND failed decodes. _cancelAllTtsAudio() already
+    // reset _activeTtsSources and the sequence counters, so the current values belong to the new
+    // turn: a stale chunk must neither decrement the counter nor advance the scheduler.
+    if (this._ttsGeneration !== capturedGeneration) {
+      this.logger.log('[VoiceService] TTS chunk discarded – stale generation', { seq, capturedGeneration, currentGeneration: this._ttsGeneration });
+      return;
+    }
+    if (!audioBuf) {
+      this.logger.warn('[VoiceService] TTS chunk decode failed', { seq }, decodeError);
+    }
+    try {
+      // A failed decode still has to occupy its slot, otherwise every later chunk stays queued
+      // behind it. Fill the slot with a one-frame silent buffer so the ordered drain and the
+      // normal onended bookkeeping handle it like any other chunk.
+      const slotBuffer =
+        audioBuf ?? (ctx && ctx.state !== 'closed' ? ctx.createBuffer(1, 1, ctx.sampleRate) : undefined);
+      if (slotBuffer) {
+        this._ttsDecodedPending.set(seq, slotBuffer);
+        this._drainTtsDecodedBuffers();
+        return;
+      }
+    } catch (e) {
+      this._ttsDecodedPending.delete(seq);
+      this.logger.warn('[VoiceService] TTS chunk scheduling failed', { seq }, e);
+    }
+    // No usable AudioContext (or scheduling threw): release this chunk directly. Advance the
+    // scheduler past the slot when it is the head, then account for the chunk as ended.
+    if (seq === this._ttsScheduledSeq) {
+      this._ttsScheduledSeq++;
+      this._drainTtsDecodedBuffers();
+    }
+    this._onTtsSourceEnded();
+  }
+  /**
+   * Hands decoded TTS buffers to the audio graph strictly in arrival order.
+   * Starting at `_ttsScheduledSeq`, every consecutively available entry of `_ttsDecodedPending`
+   * is removed from the map and scheduled back-to-back on `ttsPlayContext`. The walk stops at the
+   * first sequence number that has no decoded buffer yet. Does nothing when no playback context exists.
+   */
+  private _drainTtsDecodedBuffers(): void {
+    const playCtx = this.ttsPlayContext;
+    if (!playCtx) {
+      return;
+    }
+    for (;;) {
+      const slot = this._ttsScheduledSeq;
+      const decoded = this._ttsDecodedPending.get(slot);
+      if (decoded === undefined) {
+        break;
+      }
+      this._ttsDecodedPending.delete(slot);
+      this._ttsScheduledSeq = slot + 1;
+      const node = playCtx.createBufferSource();
+      node.buffer = decoded;
+      node.connect(playCtx.destination);
+      // Start right after the previously scheduled chunk, or now if that moment has passed.
+      const startAt = Math.max(playCtx.currentTime, this.ttsNextPlayTime);
+      node.start(startAt);
+      this.ttsNextPlayTime = startAt + decoded.duration;
+      // Wall-clock estimate (ms) of when the queued audio ends; read by the 'done' safety timer.
+      const msUntilEnd = (this.ttsNextPlayTime - playCtx.currentTime) * 1000;
+      this._ttsExpectedEndTime = Date.now() + msUntilEnd;
+      const nothingTrackedYet = this._activeTtsSourceNodes.length === 0;
+      this._activeTtsSourceNodes.push(node);
+      const durationS = decoded.duration.toFixed(3);
+      const startsAtS = startAt.toFixed(3);
+      if (nothingTrackedYet) {
+        // Real audio is about to play: the keyboard typing sound is no longer needed.
+        this._stopKeyboardSound();
+        this.logger.info('[VoiceService] TTS playback started', { durationS, startsAtS });
+      }
+      this.logger.log('[VoiceService] TTS chunk scheduled', { seq: slot, durationS, startsAtS, activeTtsSources: this._activeTtsSources, expectedEndInMs: msUntilEnd.toFixed(0) });
+      node.onended = () => this._onTtsSourceEnded(node);
+    }
+  }
+  /**
+   * Bookkeeping for one TTS chunk that finished playing or could not be played.
+   * Decrements `_activeTtsSources` (never below zero), forgets `src` when given, and — once the
+   * count reaches zero with an unblock armed by the `done` event — calls `_flushTtsUnblock(false)`.
+   *
+   * @param src the source node that ended; omitted when the chunk never produced a node.
+   */
+  private _onTtsSourceEnded(src?: AudioBufferSourceNode): void {
+    this._activeTtsSources = Math.max(0, this._activeTtsSources - 1);
+    if (src) {
+      const idx = this._activeTtsSourceNodes.indexOf(src);
+      if (idx !== -1) { this._activeTtsSourceNodes.splice(idx, 1); }
+    }
+    this.logger.log('[VoiceService] TTS source ended', { activeTtsSources: this._activeTtsSources, unblockPending: this._unblockAfterTts });
+    if (this._activeTtsSources === 0) {
+      // Reported through the project logger only; no direct console output.
+      this.logger.info('[VoiceService] TTS playback ended – all sources finished');
+    }
+    if (this._unblockAfterTts && this._activeTtsSources === 0) {
+      this._flushTtsUnblock(false);
+    }
+  }
+  /**
+   * Silences every playing or scheduled TTS source and resets all TTS playback bookkeeping.
+   * Invoked from the `speaking` handler (a new bot turn must not overlap the previous one)
+   * and from `stopSession`.
+   *
+   * - `_ttsGeneration` is bumped first, so decodes still in flight can recognise their result
+   *   as stale.
+   * - Each node's `onended` handler is detached BEFORE `stop()`, so `_onTtsSourceEnded` is not
+   *   run for cancelled nodes.
+   * - Counters, the pending-unblock flag, the expected end time, the ordered-scheduling state
+   *   and the karaoke highlight are all reset.
+   */
+  private _cancelAllTtsAudio(): void {
+    this._ttsGeneration += 1;
+    const safetyTimer = this._unblockSafetyTimer;
+    if (safetyTimer !== null) {
+      clearTimeout(safetyTimer);
+      this._unblockSafetyTimer = null;
+    }
+    this._activeTtsSourceNodes.forEach((node) => {
+      node.onended = null;
+      try {
+        node.stop();
+      } catch {
+        /* already ended — ignore */
+      }
+    });
+    this._activeTtsSourceNodes = [];
+    this._activeTtsSources = 0;
+    this._unblockAfterTts = false;
+    this._ttsExpectedEndTime = 0;
+    // The next speaking turn numbers its chunks from zero again.
+    this._ttsChunkSeq = 0;
+    this._ttsScheduledSeq = 0;
+    this._ttsDecodedPending.clear();
+    this._stopTtsKaraoke(true);
+    this.logger.log('[VoiceService] TTS cancelled – all audio sources stopped');
+  }
+  /**
+   * Ends the current TTS cycle: clears the pending-unblock state, finishes the karaoke highlight
+   * and tells the proxy that playback is complete.
+   *
+   * The UI is NOT unblocked here. That happens when the proxy answers with `listening`. A
+   * 3-second fallback timer force-unblocks acquisition and resumes recording if no answer arrives.
+   *
+   * @param fromSafetyTimer `true` when invoked by the safety timer rather than by the last
+   *   source ending; only changes which log line is written.
+   */
+  private _flushTtsUnblock(fromSafetyTimer = false): void {
+    this._unblockAfterTts = false;
+    this._activeTtsSources = 0;
+    const safetyTimer = this._unblockSafetyTimer;
+    if (safetyTimer !== null) {
+      clearTimeout(safetyTimer);
+      this._unblockSafetyTimer = null;
+    }
+    if (!fromSafetyTimer) {
+      this.logger.log('[VoiceService] TTS unblock: all sources ended, sending playback complete');
+    } else {
+      this.logger.warn('[VoiceService] TTS unblock: safety timer fired – forcing playback complete');
+    }
+    this._stopTtsKaraoke(true);
+    // The proxy is expected to move to LISTENING and send 'listening' back.
+    this.voiceStreaming.sendPlaybackComplete();
+    const previousFallback = this._listeningFallbackTimer;
+    if (previousFallback !== null) {
+      clearTimeout(previousFallback);
+    }
+    // Runs only if 'listening' did not arrive in time (its handler cancels this timer).
+    const onProxySilent = (): void => {
+      this._listeningFallbackTimer = null;
+      this.logger.warn('[VoiceService] listening fallback timer fired – proxy did not respond, force-unblocking');
+      this._isAcquisitionBlocked$.next(false);
+      this.voiceStreaming.resumeRecording();
+    };
+    this._listeningFallbackTimer = setTimeout(onProxySilent, 3000);
+  }
+  // ── WSS TTS Karaoke helpers ───────────────────────────────────────────────
+  /**
+   * Prepares and starts the word-by-word highlight for `text`.
+   * Timings are an estimate, not derived from the audio: 0.43 s per whitespace-separated word
+   * (about 140 WPM) with a minimum total of 1 s, split evenly across the words.
+   * Text containing no words stores `_kText` and returns without starting the loop.
+   */
+  private _startTtsKaraoke(text: string): void {
+    this._stopTtsKaraoke(false);
+    this._kText = text;
+    const tokens = text
+      .trim()
+      .split(/\s+/)
+      .filter((token) => token.length > 0);
+    const wordCount = tokens.length;
+    if (wordCount === 0) {
+      return;
+    }
+    const totalSeconds = Math.max(1, wordCount * 0.43);
+    const secondsPerWord = totalSeconds / wordCount;
+    this._kDuration = totalSeconds;
+    this._kWords = tokens.map((token, position) => ({
+      text: token,
+      start: position * secondsPerWord,
+      end: (position + 1) * secondsPerWord,
+      state: 'future' as const,
+    }));
+    // Elapsed time is measured on the TTS AudioContext clock (0 when no context exists yet).
+    this._kStartContextTime = this.ttsPlayContext?.currentTime ?? 0;
+    // -2 can never equal a computed index, so the first frame is always emitted.
+    this._kLastActiveIndex = -2;
+    this._rafKaraokeLoop();
+  }
+  /**
+   * Stops the karaoke animation loop.
+   *
+   * @param markAllPast when `true` and words are loaded, emits one final frame with every word
+   *   in state `'past'` and `activeIndex` -1, then clears the stored words and text.
+   */
+  private _stopTtsKaraoke(markAllPast: boolean): void {
+    const pendingFrame = this._kRafId;
+    if (pendingFrame !== undefined) {
+      cancelAnimationFrame(pendingFrame);
+      this._kRafId = undefined;
+    }
+    if (!markAllPast || this._kWords.length === 0) {
+      return;
+    }
+    for (const word of this._kWords) {
+      word.state = 'past';
+    }
+    const finalWords = this._kWords.map(({ text, state }) => ({ text, state }));
+    this._voiceTtsKaraokeSubject.next({
+      text: this._kText,
+      words: finalWords,
+      activeIndex: -1,
+    });
+    this._kWords = [];
+    this._kText = '';
+  }
+  /**
+   * One animation frame of the karaoke highlight.
+   * Classifies every word as `'past'`, `'active'` or `'future'` from the time elapsed on the TTS
+   * AudioContext clock since `_kStartContextTime`, emits a frame on `voiceTtsKaraoke$` only when
+   * the active index changed, and re-schedules itself until `_kDuration` has elapsed.
+   */
+  private _rafKaraokeLoop(): void {
+    const elapsed = (this.ttsPlayContext?.currentTime ?? 0) - this._kStartContextTime;
+    let activeIndex = -1;
+    // Use the callback index directly: looking each word up with indexOf() made every frame O(n²).
+    this._kWords.forEach((w, i) => {
+      if (elapsed >= w.end) {
+        w.state = 'past';
+      } else if (elapsed >= w.start) {
+        // elapsed < w.end is already guaranteed by the branch above.
+        w.state = 'active';
+        activeIndex = i;
+      } else {
+        w.state = 'future';
+      }
+    });
+    if (activeIndex !== this._kLastActiveIndex) {
+      this._kLastActiveIndex = activeIndex;
+      this._voiceTtsKaraokeSubject.next({
+        text: this._kText,
+        words: this._kWords.map(({ text, state }) => ({ text, state })),
+        activeIndex,
+      });
+    }
+    if (elapsed < this._kDuration) {
+      this._kRafId = requestAnimationFrame(() => this._rafKaraokeLoop());
+    }
+  }
+  // ─────────────────────────────────────────────────────────────────────────
+  // ── Keyboard typing-indicator sound helpers ───────────────────────────────
+  /**
+   * Starts the looping keyboard sound that masks silence while the bot generates its response.
+   * No-op if a keyboard sound element already exists.
+   *
+   * The file comes from `globals.keyboardSoundFile` (default `'keyboard.mp3'`). An absolute
+   * http(s) URL is used as is; anything else is resolved under `<baseLocation>/assets/sounds/`.
+   * The volume comes from `globals.keyboardSoundVolume`, clamped to [0, 1]. A value that is not
+   * a finite number leaves the element at its default volume.
+   * A rejected `play()` (for example an autoplay restriction) is logged, not thrown.
+   */
+  private _startKeyboardSound(): void {
+    if (this._keyboardSoundEl) return; // already playing
+    const file = this.globals.keyboardSoundFile ?? 'keyboard.mp3';
+    const src = /^https?:\/\//i.test(file)
+      ? file
+      : `${this.globals.baseLocation}/assets/sounds/${file}`;
+    const audio = new Audio(src);
+    audio.loop = true;
+    // Assigning a non-finite number to HTMLMediaElement.volume throws, so guard against an
+    // unset or malformed setting before clamping.
+    const requestedVolume = Number(this.globals.keyboardSoundVolume);
+    if (Number.isFinite(requestedVolume)) {
+      audio.volume = Math.min(1, Math.max(0, requestedVolume));
+    }
+    audio.play().catch((e) => this.logger.warn('[VoiceService] keyboard sound play failed', e));
+    this._keyboardSoundEl = audio;
+    this.logger.log('[VoiceService] keyboard sound started', { src, volume: audio.volume });
+  }
+  /**
+   * Pauses the keyboard typing sound, rewinds it to the start and drops the reference to its
+   * audio element. Does nothing when no keyboard sound element exists.
+   */
+  private _stopKeyboardSound(): void {
+    const audio = this._keyboardSoundEl;
+    if (audio === null) {
+      return;
+    }
+    audio.pause();
+    audio.currentTime = 0;
+    this._keyboardSoundEl = null;
+    this.logger.log('[VoiceService] keyboard sound stopped');
+  }
+  // ─────────────────────────────────────────────────────────────────────────
+  async stopSession(options?: { discardInProgressSegment?: boolean}): Promise<{ voiceIngressResultUrl: string | null }> { // Tears down the voice session; resolves with the ingress result URL, or null when none was obtained.
     const discard = options?.discardInProgressSegment === true;
+    this.logger.info('[VoiceService] stopSession', { discard, isWssVoiceActive: this._isWssVoiceActive$.getValue() });
+    this.wsControlSub?.unsubscribe(); // drop the WS-control and TTS-chunk subscriptions before touching audio state
+    this.wsControlSub = undefined;
+    this.ttsChunkSub?.unsubscribe();
+    this.ttsChunkSub = undefined;
+    try { // close the TTS playback context unless its state is already 'closed'
+      if (this.ttsPlayContext && this.ttsPlayContext.state !== 'closed') {
+        await this.ttsPlayContext.close();
+      }
+    } catch {
+      /* best-effort: a failed close is deliberately swallowed so the rest of the teardown still runs */
+    }
+    this._cancelAllTtsAudio();
+    this.ttsPlayContext = undefined;
+    this.ttsNextPlayTime = 0;
+    this._stopKeyboardSound();
+    let voiceIngressResultUrl: string | null = null;
+    if (this.voiceIngressConfig) { // this branch runs only while voiceIngressConfig is set
+      try {
+        const { resultUrl } = await this.voiceStreaming.stop({discard: true, awaitServerResultUrl: true}); // NOTE(review): discard is hard-coded to true here; the local `discard` flag is not consulted — confirm this is intended
+        voiceIngressResultUrl = resultUrl ?? null;
+      } catch (e) {
+        this.logger.log('[VoiceService] stopSession voiceStreaming.stop', e); // failure is only logged; the returned URL stays null
+      }
+      this._isWssVoiceActive$.next(false); // reset outside the try, so it also happens when stop() threw
+      this.voiceIngressConfig = null;
+    }
     if (this.mediaRecorder) {
       if (discard) {
@@ -140,6 +753,10 @@ export class VoiceService {
     }
     // 🎧 cleanup audio context
+    if (this.volumeRafId) { // cancel the animation frame scheduled by startVolumeLoop, if one is pending
+      cancelAnimationFrame(this.volumeRafId);
+      this.volumeRafId = undefined;
+    }
     this.audioContext?.close();
     this.audioContext = undefined;
     this.analyser = undefined;
@@ -148,6 +765,23 @@ export class VoiceService {
     this.volumeSubject.next(0);
     this.onRecordingComplete = undefined;
+    // 🎙️ release TTS gate subscription
+    this.ttsGateSub?.unsubscribe();
+    this.ttsGateSub = undefined;
+    this.isTTSActive = false;
+    // 🚫 clear acquisition gate
+    clearTimeout(this.responseTimeoutId); // stops the 30 s safety timer armed by setResponseSafetyTimeout
+    this.responseTimeoutId = undefined;
+    this.isWaitingForResponse = false;
+    if (this._listeningFallbackTimer !== null) {
+      clearTimeout(this._listeningFallbackTimer);
+      this._listeningFallbackTimer = null;
+    }
+    this._isAcquisitionBlocked$.next(false);
+    return { voiceIngressResultUrl };
   }
   /**
@@ -155,6 +789,9 @@ export class VoiceService {
    * Lo stream resta in ascolto per il prossimo `onSpeechStart`.
    */
   discardCurrentRecordingSegment(): void {
+    if (!this.vad) { // without a VAD instance the method returns before touching the recorder or the chunk buffer
+      return;
+    }
     if (this.mediaRecorder) {
       this.mediaRecorder.onstop = null;
       this.mediaRecorder.ondataavailable = null;
@@ -164,13 +801,45 @@ export class VoiceService {
     }
     this.mediaRecorder = undefined;
     this.audioChunks = [];
-    this.logger.log('[VoiceService] discarded in-progress segment; VAD session unchanged');
+    this.logger.log('[VoiceService] discarded in-progress segment (legacy VAD)'); // reached only when a VAD instance exists (see the guard at the top)
+  }
+  /**
+   * 🔄 RESUME VAD AFTER RESPONSE
+   * Ends the "waiting for response" phase: cancels the safety timer, lowers the waiting flag,
+   * publishes `false` on the acquisition-blocked stream and, when a VAD instance exists, restarts it.
+   * Invoked when isTTSPlaying$ goes false while isWaitingForResponse is true,
+   * or by the safety timeout if no TTS response arrives within 30 s.
+   */
+  private resumeVadAfterResponse(): void {
+    const pendingTimer = this.responseTimeoutId;
+    this.responseTimeoutId = undefined;
+    clearTimeout(pendingTimer);
+    this.isWaitingForResponse = false;
+    // Subscribers are notified only after the internal state above has been reset.
+    this._isAcquisitionBlocked$.next(false);
+    const detector = this.vad;
+    if (!detector) {
+      return;
+    }
+    // A failed restart is logged, never thrown.
+    detector.start().catch((err) => {
+      this.logger.log('[VoiceService] VAD resume error', err);
+    });
+  }
+  /**
+   * ⏱️ RESPONSE WATCHDOG
+   * (Re)arms a single 30-second timer. When it fires, it logs and calls
+   * `resumeVadAfterResponse()` so acquisition is re-enabled even if no TTS response ever arrives.
+   */
+  private setResponseSafetyTimeout(): void {
+    const onExpired = (): void => {
+      this.logger.log('[VoiceService] safety timeout: re-enabling VAD acquisition');
+      this.resumeVadAfterResponse();
+    };
+    // Replace any previously armed timer so at most one is pending.
+    clearTimeout(this.responseTimeoutId);
+    this.responseTimeoutId = setTimeout(onExpired, 30_000);
   }
   /**
    * 🎧 AUDIO ANALYSER INIT
    */
   private initAudioAnalyser(stream: MediaStream): void {
+    if (!stream?.getAudioTracks?.()?.length) {
+      this.logger.log('[VoiceService] initAudioAnalyser: no audio track on stream, skipping analyser');
+      return;
+    }
     this.audioContext = new AudioContext();
     const source = this.audioContext.createMediaStreamSource(stream);
@@ -190,8 +859,7 @@ export class VoiceService {
   private startVolumeLoop(): void {
     const tick = () => {
       if (!this.analyser || !this.dataArray) {
-        requestAnimationFrame(tick);
-        return;
+        return; // analyser or data buffer is gone: end the loop here, no further frame is requested
       }
       this.analyser.getByteFrequencyData(
@@ -207,10 +875,10 @@ export class VoiceService {
       this.volumeSubject.next(volume);
-      requestAnimationFrame(tick);
+      this.volumeRafId = requestAnimationFrame(tick); // keep the handle of the next frame so it can be cancelled via cancelAnimationFrame
     };
-    tick();
+    this.volumeRafId = requestAnimationFrame(tick); // the first sample is taken on the next animation frame instead of synchronously
   }
   /**
@@ -290,10 +958,16 @@ export class VoiceService {
    * 📡 EMIT RESULT
    */
   private emitSegmentPayload(payload: VoiceSegmentPayload): void {
-    this.logger.log( '[VoiceService] segment ready', payload.transcript ?? payload.transcriptionError ?? payload.blob.size);
+    if (this.isTTSActive) { // gate: while the TTS flag is set the segment is dropped instead of published
+      this.logger.log('[VoiceService] segment suppressed — TTS is playing');
+      return; // neither the subject nor the completion callback below receives this payload
+    }
+    this.logger.log('[VoiceService] segment ready', payload.transcript ?? payload.transcriptionError ?? payload.blob.size); // logs the transcript, else the transcription error, else the blob size
     this.audioSegmentSubject.next(payload);
     this.onRecordingComplete?.(payload);
   }
 }