@chat21/chat21-web-widget 5.1.33-rc11 → 5.1.33-rc9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. package/CHANGELOG.md +0 -7
  2. package/package.json +1 -1
  3. package/playwright-report/index.html +90 -0
  4. package/src/app/component/conversation-detail/conversation/conversation.component.ts +1 -3
  5. package/src/app/component/conversation-detail/conversation-content/conversation-content.component.spec.ts +7 -0
  6. package/src/app/component/conversation-detail/conversation-content/conversation-content.component.ts +5 -7
  7. package/src/app/component/conversation-detail/conversation-footer/conversation-footer.component.html +3 -4
  8. package/src/app/component/conversation-detail/conversation-footer/conversation-footer.component.scss +18 -9
  9. package/src/app/component/conversation-detail/conversation-footer/conversation-footer.component.ts +0 -6
  10. package/src/app/component/conversation-detail/stream-audio-spectrum/stream-audio-spectrum.component.html +5 -8
  11. package/src/app/component/conversation-detail/stream-audio-spectrum/stream-audio-spectrum.component.scss +1 -5
  12. package/src/app/component/form/inputs/form-text/form-text.component.ts +3 -9
  13. package/src/app/component/message/bubble-message/bubble-message.component.scss +0 -5
  14. package/src/app/component/message/bubble-message/bubble-message.component.ts +0 -14
  15. package/src/app/component/message/json-sources/json-sources.component.scss +8 -12
  16. package/src/app/pipe/marked.pipe.ts +41 -51
  17. package/src/app/providers/global-settings.service.ts +0 -29
  18. package/src/app/providers/json-sources-parser.service.ts +32 -25
  19. package/src/app/providers/voice/voice-streaming.service.ts +19 -11
  20. package/src/app/providers/voice/voice-streaming.types.ts +1 -0
  21. package/src/app/providers/voice/voice.service.spec.ts +45 -12
  22. package/src/app/providers/voice/voice.service.ts +45 -215
  23. package/src/app/utils/globals.ts +0 -10
  24. package/src/assets/i18n/en.json +125 -106
  25. package/src/assets/i18n/es.json +0 -1
  26. package/src/assets/i18n/fr.json +0 -1
  27. package/src/assets/i18n/it.json +0 -1
  28. package/test-results/.last-run.json +4 -0
  29. package/src/assets/sounds/keyboard.mp3 +0 -0
@@ -13,11 +13,10 @@ import {
13
13
  VoiceWsControlMessage,
14
14
  } from './voice-streaming.types';
15
15
 
16
- // Flux docs recommend 80ms chunks for optimal latency; 160ms is a practical
17
- // balance for WebM containerization overhead in the browser while providing
18
- // good STT accuracy.
16
+ // Flux docs recommend 80ms chunks for optimal latency; 250ms is a practical
17
+ // balance for WebM containerization overhead in the browser.
19
18
  // Source: https://developers.deepgram.com/docs/flux/quickstart
20
- const DEFAULT_TIMESLICE_MS = 160;
19
+ const DEFAULT_TIMESLICE_MS = 250;
21
20
  const READY_TIMEOUT_MS = 10_000;
22
21
  const SESSION_STARTED_TIMEOUT_MS = 10_000;
23
22
 
@@ -259,12 +258,6 @@ export class VoiceStreamingService {
259
258
  this.mediaStream = shared
260
259
  ? shared
261
260
  : await navigator.mediaDevices.getUserMedia({ audio: true });
262
- const tracks = this.mediaStream.getAudioTracks();
263
- this.logger.info('[VoiceStreaming] microphone acquired', {
264
- shared: !!shared,
265
- tracks: tracks.length,
266
- label: tracks[0]?.label ?? '(unknown)',
267
- });
268
261
  const recorderOpts: MediaRecorderOptions = {};
269
262
  if (mime) {
270
263
  recorderOpts.mimeType = mime;
@@ -578,7 +571,7 @@ export class VoiceStreamingService {
578
571
 
579
572
  /**
580
573
  * Send `{ event: "tts_playback_complete" }` to the proxy, signalling that TTS
581
- * playback has finished and the microphone is now safe to receive user speech.
574
+ * playback has finished and the microphone is ready to receive user speech.
582
575
  */
583
576
  sendPlaybackComplete(): void {
584
577
  if (this.ws?.readyState === WebSocket.OPEN) {
@@ -587,6 +580,21 @@ export class VoiceStreamingService {
587
580
  }
588
581
  }
589
582
 
583
+ /**
584
+ * Send `{ event: "barge_in" }` to the proxy, requesting an immediate interruption
585
+ * of the ongoing TTS playback. Use when the user explicitly wants to speak while
586
+ * the bot is talking (e.g. via a UI button or a client-side VAD onset).
587
+ *
588
+ * The proxy will stop the TTS stream and transition to LISTENING; the widget should
589
+ * handle the server-sent `barge_in` and `listening` events to update local state.
590
+ */
591
+ sendBargeIn(): void {
592
+ if (this.ws?.readyState === WebSocket.OPEN) {
593
+ this.ws.send(JSON.stringify({ event: 'barge_in' }));
594
+ this.logger.info('[VoiceStreaming] barge_in sent');
595
+ }
596
+ }
597
+
590
598
  private cleanup(): void {
591
599
  this.logger.info('[VoiceStreaming] cleanup', { state: this._currentState, sessionId: this.currentSessionId });
592
600
  this.audioChunkCount = 0;
@@ -86,6 +86,7 @@ export type VoiceWsServerEventName =
86
86
  | 'thinking'
87
87
  | 'speaking'
88
88
  | 'done'
89
+ | 'barge_in'
89
90
  | 'error';
90
91
 
91
92
  /** Messaggio di controllo JSON dal proxy (`msg.event`); altri campi sono ignorati se non gestiti. */
@@ -44,7 +44,7 @@ describe('VoiceService', () => {
44
44
 
45
45
  voiceStreamingMock = jasmine.createSpyObj<VoiceStreamingService>(
46
46
  'VoiceStreamingService',
47
- ['start', 'stop', 'setAudioMuted', 'sendPlaybackComplete', 'pauseRecording', 'resumeRecording'],
47
+ ['start', 'stop', 'setAudioMuted', 'sendPlaybackComplete', 'sendBargeIn'],
48
48
  );
49
49
  voiceStreamingMock.start.and.returnValue(Promise.resolve());
50
50
  voiceStreamingMock.stop.and.returnValue(
@@ -65,8 +65,6 @@ describe('VoiceService', () => {
65
65
  ],
66
66
  });
67
67
  service = TestBed.inject(VoiceService);
68
- spyOn(service as any, '_startKeyboardSound').and.stub();
69
- spyOn(service as any, '_stopKeyboardSound').and.stub();
70
68
  });
71
69
 
72
70
  // ── Existing session lifecycle tests ──────────────────────────────────────
@@ -158,21 +156,22 @@ describe('VoiceService', () => {
158
156
  expect(voiceStreamingMock.setAudioMuted).not.toHaveBeenCalled();
159
157
  });
160
158
 
161
- it('empty-audio path: sendPlaybackComplete after flush but acquisition stays blocked until "listening"', async () => {
159
+ it('empty-audio path: sendPlaybackComplete immediately but acquisition stays blocked until "listening"', async () => {
162
160
  const blocked = await startWssSession();
163
161
  const initialLen = blocked.length;
164
162
 
165
- // done with no binary audio arms unblock; flush sends playback complete to proxy
163
+ // Simulate done arriving with NO binary audio (_activeTtsSources === 0)
166
164
  wsControl$.next({ event: 'speaking', text: 'hello' } as VoiceWsControlMessage);
167
165
  wsControl$.next({ event: 'done' } as VoiceWsControlMessage);
168
166
 
169
- expect(voiceStreamingMock.sendPlaybackComplete).not.toHaveBeenCalled();
170
- (service as any)._flushTtsUnblock(false);
167
+ // Proxy signalled immediately
171
168
  expect(voiceStreamingMock.sendPlaybackComplete).toHaveBeenCalledTimes(1);
172
169
 
170
+ // Acquisition must still be blocked — proxy hasn't confirmed LISTENING yet
173
171
  const afterDone = blocked.slice(initialLen);
174
172
  expect(afterDone.every((v) => v === true)).toBeTrue();
175
173
 
174
+ // Unblock only after proxy confirms
176
175
  wsControl$.next({ event: 'listening' } as VoiceWsControlMessage);
177
176
  expect(blocked[blocked.length - 1]).toBeFalse();
178
177
  });
@@ -190,19 +189,22 @@ describe('VoiceService', () => {
190
189
 
191
190
  // ── Audio preemption tests (SPEC-002) ────────────────────────────────────
192
191
 
193
- it('second "speaking" cancels first audio: sendPlaybackComplete only after flush for the new turn', async () => {
192
+ it('second "speaking" cancels first audio: sendPlaybackComplete called exactly once for the new turn', async () => {
194
193
  await startWssSession();
195
194
  voiceStreamingMock.sendPlaybackComplete.calls.reset();
196
195
 
196
+ // First turn: audio chunk arrives → _activeTtsSources = 1 (sync) → done sets _unblockAfterTts
197
197
  wsControl$.next({ event: 'speaking', text: 'first' } as VoiceWsControlMessage);
198
- ttsBinaryChunk$.next(new ArrayBuffer(4));
199
- wsControl$.next({ event: 'done' } as VoiceWsControlMessage);
198
+ ttsBinaryChunk$.next(new ArrayBuffer(4)); // _activeTtsSources++ synchronously
199
+ wsControl$.next({ event: 'done' } as VoiceWsControlMessage); // _unblockAfterTts = true
200
200
 
201
+ // Second turn preempts while first audio is still "playing"
201
202
  wsControl$.next({ event: 'speaking', text: 'second' } as VoiceWsControlMessage);
203
+ // _cancelAllTtsAudio() resets _activeTtsSources=0, _unblockAfterTts=false
204
+
205
+ // done with no audio → sendPlaybackComplete immediately (new turn, _activeTtsSources = 0)
202
206
  wsControl$.next({ event: 'done' } as VoiceWsControlMessage);
203
207
 
204
- expect(voiceStreamingMock.sendPlaybackComplete).not.toHaveBeenCalled();
205
- (service as any)._flushTtsUnblock(false);
206
208
  expect(voiceStreamingMock.sendPlaybackComplete).toHaveBeenCalledTimes(1);
207
209
  });
208
210
 
@@ -224,4 +226,35 @@ describe('VoiceService', () => {
224
226
  expect(voiceStreamingMock.sendPlaybackComplete).not.toHaveBeenCalled();
225
227
  });
226
228
 
229
+ // ── Barge-in ──────────────────────────────────────────────────────────────
230
+
231
+ it('barge_in event cancels TTS audio and unblocks acquisition without sending tts_playback_complete', async () => {
232
+ await startWssSession();
233
+ voiceStreamingMock.sendPlaybackComplete.calls.reset();
234
+
235
+ // Simulate bot speaking with audio in flight
236
+ wsControl$.next({ event: 'speaking', text: 'hello' } as VoiceWsControlMessage);
237
+ ttsBinaryChunk$.next(new ArrayBuffer(4)); // _activeTtsSources++ synchronously
238
+ wsControl$.next({ event: 'done' } as VoiceWsControlMessage); // _unblockAfterTts = true
239
+
240
+ // Proxy detects user speech and sends barge_in
241
+ wsControl$.next({ event: 'barge_in' } as VoiceWsControlMessage);
242
+
243
+ // tts_playback_complete must NOT be sent — it was an interruption, not a completion
244
+ expect(voiceStreamingMock.sendPlaybackComplete).not.toHaveBeenCalled();
245
+ expect(voiceStreamingMock.setAudioMuted).not.toHaveBeenCalled();
246
+ expect((service as any)._isAcquisitionBlocked$.getValue()).toBe(false);
247
+ });
248
+
249
+ it('barge_in while no TTS is active does not throw and still unblocks acquisition', async () => {
250
+ await startWssSession();
251
+ voiceStreamingMock.sendPlaybackComplete.calls.reset();
252
+
253
+ // No speaking event — mic was never muted
254
+ expect(() => {
255
+ wsControl$.next({ event: 'barge_in' } as VoiceWsControlMessage);
256
+ }).not.toThrow();
257
+
258
+ expect(voiceStreamingMock.sendPlaybackComplete).not.toHaveBeenCalled();
259
+ });
227
260
  });
@@ -4,7 +4,6 @@ import { getDefaultRealTimeVADOptions } from '@ricky0123/vad-web';
4
4
  import { BehaviorSubject, Observable, Subject, Subscription } from 'rxjs';
5
5
  import { LoggerInstance } from 'src/chat21-core/providers/logger/loggerInstance';
6
6
  import { LoggerService } from 'src/chat21-core/providers/abstract/logger.service';
7
- import { Globals } from 'src/app/utils/globals';
8
7
 
9
8
  import {
10
9
  DEFAULT_VOICE_MEDIA_STREAM_CONSTRAINTS,
@@ -110,8 +109,6 @@ export class VoiceService {
110
109
  private analyser?: AnalyserNode;
111
110
  /** Buffer dedicato (`ArrayBuffer`) per compatibilità con `getByteFrequencyData`. */
112
111
  private dataArray?: Uint8Array;
113
- /** RAF ID for volume loop - used to cancel on cleanup */
114
- private volumeRafId?: number;
115
112
 
116
113
  /** Riproduzione chunk TTS binari dal proxy (Web Audio). */
117
114
  private ttsPlayContext?: AudioContext;
@@ -127,29 +124,9 @@ export class VoiceService {
127
124
  // (barge_in or a new speaking event). playWsTtsChunk captures this at entry and
128
125
  // checks it after the async decodeAudioData call to discard stale results.
129
126
  private _ttsGeneration = 0;
130
-
131
- // ── Ordered-scheduling state ──────────────────────────────────────────────────────────────────
132
- // Chunks arrive over WebSocket and their decodeAudioData calls run concurrently.
133
- // Because a smaller/later chunk can decode faster than a larger/earlier one, scheduling
134
- // based solely on decode-completion order causes audio to play out of arrival order
135
- // (e.g. "manuale" starts before "scrittura" even though it arrived after it).
136
- // Fix: assign a monotonic sequence number on arrival, decode in parallel, but only
137
- // schedule a buffer once every preceding buffer has already been scheduled.
138
- private _ttsChunkSeq = 0; // Incremented on each chunk arrival (arrival order)
139
- private _ttsScheduledSeq = 0; // Next sequence slot that is allowed to be scheduled
140
- // Decoded buffers waiting for their turn to be scheduled (keyed by arrival sequence)
141
- private _ttsDecodedPending = new Map<number, AudioBuffer>();
142
- // ─────────────────────────────────────────────────────────────────────────────────────────────
143
127
  // Set to true by the 'done' event; triggers acquisition unblock once all sources end.
144
128
  private _unblockAfterTts = false;
145
129
  private _unblockSafetyTimer: ReturnType<typeof setTimeout> | null = null;
146
- // Fallback timer started after sendPlaybackComplete. If the proxy does not reply
147
- // with 'listening' within the timeout window, the UI is force-unblocked so the
148
- // user is not left stuck waiting indefinitely.
149
- private _listeningFallbackTimer: ReturnType<typeof setTimeout> | null = null;
150
- // Track when the last TTS chunk is expected to finish playing.
151
- // Used to calculate a proper safety timer duration for long messages.
152
- private _ttsExpectedEndTime = 0;
153
130
 
154
131
  // ── WSS TTS Karaoke ──────────────────────────────────────────────────────────────────────────
155
132
  private _kText = '';
@@ -164,22 +141,13 @@ export class VoiceService {
164
141
  readonly voiceTtsKaraoke$: Observable<VoiceTtsKaraokeFrame> = this._voiceTtsKaraokeSubject.asObservable();
165
142
  // ─────────────────────────────────────────────────────────────────────────────────────────────
166
143
 
167
- // ── Thinking / typing-indicator sound ─────────────────────────────────────────────────────────
168
- // Played on loop while the bot is thinking or the first TTS chunk hasn't arrived yet.
169
- // Only active during WSS voice sessions (voice-proxy mode).
170
- private _keyboardSoundEl: HTMLAudioElement | null = null;
171
- // ─────────────────────────────────────────────────────────────────────────────────────────────
172
-
173
144
  private readonly logger: LoggerService = LoggerInstance.getInstance();
174
145
 
175
- private readonly bufferTime = 200000; // used as max safety timer duration for long TTS messages
176
-
177
146
  constructor(
178
147
  private readonly vadService: VadService,
179
148
  private readonly ttsPlayback: TtsAudioPlaybackCoordinator,
180
149
  private readonly voiceStreaming: VoiceStreamingService,
181
150
  @Optional() @Inject(SpeechToTextProvider) private readonly speechToText: SpeechToTextProvider | null,
182
- private readonly globals: Globals,
183
151
  ) {}
184
152
 
185
153
  get isSessionActive(): boolean {
@@ -204,8 +172,6 @@ export class VoiceService {
204
172
  * Richiede il microfono, avvia VAD in ascolto (inizio/fine parlato) e registra in WebM per segmento.
205
173
  */
206
174
  async startSession(options: VoiceSessionStartOptions = {}): Promise<void> {
207
- const mode = options.voiceIngressStream ? 'wss-proxy' : 'legacy-vad';
208
- this.logger.info('[VoiceService] startSession', { mode });
209
175
  await this.stopSession();
210
176
 
211
177
  this.sessionConstraints = options.constraints ?? DEFAULT_VOICE_MEDIA_STREAM_CONSTRAINTS;
@@ -223,13 +189,7 @@ export class VoiceService {
223
189
 
224
190
  /** Sessione guidata dal proxy: solo mic + volume + WSS (mic in upload, eventi + TTS in download). */
225
191
  private async startWssVoiceSession(): Promise<void> {
226
- this.logger.info('[VoiceService] acquiring microphone for WSS session');
227
192
  this.stream = await navigator.mediaDevices.getUserMedia(this.sessionConstraints);
228
- const tracks = this.stream.getAudioTracks();
229
- this.logger.info('[VoiceService] microphone acquired', {
230
- tracks: tracks.length,
231
- label: tracks[0]?.label ?? '(unknown)',
232
- });
233
193
 
234
194
  // 🎧 AUDIO ANALYSER INIT
235
195
  this.initAudioAnalyser(this.stream);
@@ -242,7 +202,7 @@ export class VoiceService {
242
202
  await this.voiceStreaming.start(this.voiceIngressConfig!, { sharedMediaStream: this.stream });
243
203
  // Signal that the voice proxy is now live — suppresses tiledesk-server TTS.
244
204
  this._isWssVoiceActive$.next(true);
245
- this.logger.info('[VoiceService] WSS voice session started (no local VAD)');
205
+ this.logger.log('[VoiceService] sessione WSS (nessun VAD locale)');
246
206
  } catch (e) {
247
207
  this.wsControlSub?.unsubscribe();
248
208
  this.wsControlSub = undefined;
@@ -320,45 +280,25 @@ export class VoiceService {
320
280
  this.logger.log('[VoiceService] session_started', { requestId: msg.requestId ?? '' });
321
281
  break;
322
282
  case 'listening':
323
- // Proxy confirmed it is in LISTENING state — unblock the UI and resume
324
- // the MediaRecorder. Recording was paused on 'thinking' and must only
325
- // restart here, after TTS playback has fully completed and the proxy
326
- // is confirmed ready to receive audio again.
327
- if (this._listeningFallbackTimer !== null) {
328
- clearTimeout(this._listeningFallbackTimer);
329
- this._listeningFallbackTimer = null;
330
- }
331
- // If TTS never arrived (edge case) the keyboard sound would still be looping — stop it.
332
- this._stopKeyboardSound();
283
+ // Proxy confirmed it is in LISTENING state — unblock the UI.
284
+ // Audio has been flowing continuously (AEC handles echo suppression),
285
+ // so there is nothing to unmute here.
333
286
  this._isAcquisitionBlocked$.next(false);
334
- this.voiceStreaming.resumeRecording();
335
- this.logger.log('[VoiceService] listening – acquisition unblocked, recording resumed');
287
+ this.logger.log('[VoiceService] listening – acquisition unblocked');
336
288
  break;
337
289
  case 'transcript': {
338
290
  const text = typeof msg.text === 'string' ? msg.text : '';
339
291
  const isFinal = !!msg.isFinal;
340
- // Guard: if the proxy has already moved to PROCESSING (thinking) or SPEAKING,
341
- // this transcript is a stale in-flight STT result. Discard it so it cannot
342
- // override the blocked acquisition state or reach any downstream subscriber.
343
- // 'thinking' is stronger than 'transcript' — state must not regress.
344
- if (this._isAcquisitionBlocked$.value) {
345
- this.logger.warn('[VoiceService] transcript discarded – arrived after thinking/speaking (stale STT result)', { text, isFinal });
346
- break;
347
- }
348
292
  this.logger.log('[VoiceService] transcript', { text, isFinal });
349
293
  this.voiceTranscriptSubject.next({ text, isFinal });
350
294
  break;
351
295
  }
352
296
  case 'thinking':
353
297
  // Block acquisition UI while the bot processes the utterance.
354
- // Pause the MediaRecorder so no audio chunks are sent to the proxy
355
- // during PROCESSING state. Recording resumes only after the proxy
356
- // confirms LISTENING (i.e. after TTS playback has fully finished).
298
+ // Audio continues flowing to the proxy so the server can detect
299
+ // barge-in via Flux STT even during PROCESSING state.
357
300
  this._isAcquisitionBlocked$.next(true);
358
- this.voiceStreaming.pauseRecording();
359
- // Play keyboard typing sound to mask the silence while the bot generates its response.
360
- this._startKeyboardSound();
361
- this.logger.log('[VoiceService] thinking – acquisition blocked, recording paused', { activeTtsSources: this._activeTtsSources });
301
+ this.logger.log('[VoiceService] thinking – acquisition blocked', { activeTtsSources: this._activeTtsSources });
362
302
  break;
363
303
  case 'speaking': {
364
304
  this._isAcquisitionBlocked$.next(true);
@@ -370,13 +310,8 @@ export class VoiceService {
370
310
  this._cancelAllTtsAudio();
371
311
  // Reset TTS scheduling so new chunks play from now, not a stale future time.
372
312
  this.ttsNextPlayTime = this.ttsPlayContext?.currentTime ?? 0;
373
- // Reset expected end time for new TTS stream
374
- this._ttsExpectedEndTime = 0;
375
313
  const preview = typeof msg.text === 'string' ? msg.text.slice(0, 80) : '';
376
314
  this.logger.log('[VoiceService] speaking – acquisition blocked, TTS text preview', { preview });
377
- // Keep keyboard sound going (or start it as a fallback if 'thinking' was missed)
378
- // until the first TTS audio chunk actually starts playing.
379
- this._startKeyboardSound();
380
315
  // Emit the text being spoken so UI can display it alongside the audio.
381
316
  if (typeof msg.text === 'string' && msg.text) {
382
317
  this.voiceTtsTextSubject.next(msg.text);
@@ -389,31 +324,31 @@ export class VoiceService {
389
324
  // _activeTtsSources tracks pending sources; when the last one ends, acquisition unblocks.
390
325
  if (this._activeTtsSources > 0) {
391
326
  this._unblockAfterTts = true;
392
- // Calculate safety timer based on expected audio end time.
393
- // Add 5 seconds buffer for network/decode latency.
394
- // Minimum 5 seconds, maximum 300 seconds for very long messages.
395
- const remainingMs = Math.max(0, this._ttsExpectedEndTime - Date.now());
396
- const safetyMs = Math.min(this.bufferTime, Math.max(5000, remainingMs + 5000));
327
+ // Safety: force-unblock after 15 s in case onended never fires.
397
328
  if (this._unblockSafetyTimer !== null) clearTimeout(this._unblockSafetyTimer);
398
- this._unblockSafetyTimer = setTimeout(() => this._flushTtsUnblock(true), safetyMs);
399
- this.logger.log('[VoiceService] done – TTS still pending, waiting for all sources to end', {
400
- activeTtsSources: this._activeTtsSources,
401
- expectedEndInMs: remainingMs,
402
- safetyTimerMs: safetyMs
403
- });
329
+ this._unblockSafetyTimer = setTimeout(() => this._flushTtsUnblock(true), 15000);
330
+ this.logger.log('[VoiceService] done – TTS still pending, waiting for all sources to end', { activeTtsSources: this._activeTtsSources });
404
331
  } else {
405
- // No audio sources tracked yet, but binary TTS chunks may still be in-flight
406
- // (WebSocket binary frames can arrive after the JSON 'done' control message).
407
- // Set _unblockAfterTts so that _onTtsSourceEnded() triggers _flushTtsUnblock
408
- // naturally when those chunks finish playing, instead of relying solely on the
409
- // safety timer (which would delay unblock by 10 s even when audio ends sooner).
410
- this._unblockAfterTts = true;
411
- this.logger.log('[VoiceService] done no active sources yet, arming unblock for in-flight chunks');
412
- // Safety timer as last resort in case no chunks arrive at all.
413
- if (this._unblockSafetyTimer !== null) clearTimeout(this._unblockSafetyTimer);
414
- this._unblockSafetyTimer = setTimeout(() => this._flushTtsUnblock(true), 10000);
332
+ // No audio sources pending — playback was already complete (or audio was empty).
333
+ // Signal the proxy synchronously; acquisition stays blocked until the proxy confirms
334
+ // LISTENING via the 'listening' event.
335
+ this.logger.log('[VoiceService] done no pending TTS, sending playback complete immediately');
336
+ this.voiceStreaming.sendPlaybackComplete();
337
+ // Do NOT unblock acquisition here — proxy will send 'listening' which is
338
+ // the single source of truth for unblocking both UI and mic.
415
339
  }
416
340
  break;
341
+ case 'barge_in':
342
+ // Proxy's VAD detected user speech while the bot was talking — stop TTS immediately.
343
+ // Do NOT send tts_playback_complete; this is an interruption, not a normal completion.
344
+ // The proxy will follow with { event: "listening" } which authoritatively unblocks the UI.
345
+ // Audio was never muted, so there is nothing to unmute.
346
+ this._cancelAllTtsAudio();
347
+ this.ttsNextPlayTime = 0;
348
+ this._unblockAfterTts = false;
349
+ this._isAcquisitionBlocked$.next(false);
350
+ this.logger.log('[VoiceService] barge_in – TTS cancelled, acquisition unblocked');
351
+ break;
417
352
  case 'error': {
418
353
  const errorMsg = typeof msg.message === 'string' ? msg.message : 'Voice session error';
419
354
  this.logger.error('[VoiceService] WSS error', errorMsg);
@@ -426,19 +361,8 @@ export class VoiceService {
426
361
  }
427
362
  }
428
363
 
429
- /**
430
- * Chunk TTS: ogni buffer deve essere decodificabile da `decodeAudioData` (es. segmento WebM/Opus completo).
431
- *
432
- * Decode-race fix: multiple chunks decode concurrently; a smaller/later chunk can finish
433
- * decoding before a larger/earlier one, which would cause the AudioBufferSourceNode to be
434
- * scheduled out of arrival order (e.g. "manuale" before "scrittura"). To prevent this, each
435
- * chunk is assigned a monotonic sequence number on arrival and stored in _ttsDecodedPending
436
- * after decoding. _drainTtsDecodedBuffers() only advances the schedule when the next
437
- * expected sequence slot is present, guaranteeing arrival-order playback regardless of decode speed.
438
- */
364
+ /** Chunk TTS: ogni buffer deve essere decodificabile da `decodeAudioData` (es. segmento WebM/Opus completo). */
439
365
  private async playWsTtsChunk(buf: ArrayBuffer): Promise<void> {
440
- // Assign arrival-order sequence number SYNCHRONOUSLY before any await.
441
- const seq = this._ttsChunkSeq++;
442
366
  // Capture the current generation BEFORE the synchronous increment so that
443
367
  // if _cancelAllTtsAudio() fires (incrementing _ttsGeneration) while this
444
368
  // decode is in-flight, the mismatch is detected and the stale chunk is discarded.
@@ -446,12 +370,11 @@ export class VoiceService {
446
370
  // Increment SYNCHRONOUSLY before any await so the 'done' event handler (which arrives
447
371
  // on the next WebSocket message — a different event-loop tick) sees a non-zero count.
448
372
  this._activeTtsSources++;
449
- this.logger.log('[VoiceService] TTS chunk received', { seq, bytes: buf.byteLength, activeTtsSources: this._activeTtsSources });
373
+ this.logger.log('[VoiceService] TTS chunk received', { bytes: buf.byteLength, activeTtsSources: this._activeTtsSources });
450
374
  try {
451
375
  if (!this.ttsPlayContext || this.ttsPlayContext.state === 'closed') {
452
376
  this.ttsPlayContext = new AudioContext();
453
377
  this.ttsNextPlayTime = this.ttsPlayContext.currentTime;
454
- this.logger.info('[VoiceService] TTS AudioContext created');
455
378
  }
456
379
  const ctx = this.ttsPlayContext;
457
380
  const audioBuf = await ctx.decodeAudioData(buf.slice(0));
@@ -460,57 +383,21 @@ export class VoiceService {
460
383
  // for a turn that was already cancelled, and undo the counter increment.
461
384
  if (this._ttsGeneration !== capturedGeneration) {
462
385
  this._activeTtsSources = Math.max(0, this._activeTtsSources - 1);
463
- this.logger.log('[VoiceService] TTS chunk discarded – stale generation', { seq, capturedGeneration, currentGeneration: this._ttsGeneration });
386
+ this.logger.log('[VoiceService] TTS chunk discarded – stale generation', { capturedGeneration, currentGeneration: this._ttsGeneration });
464
387
  return;
465
388
  }
466
- // Store the decoded buffer under its arrival sequence number and attempt to
467
- // flush any contiguous run of decoded buffers in order.
468
- this._ttsDecodedPending.set(seq, audioBuf);
469
- this._drainTtsDecodedBuffers();
470
- } catch (e) {
471
- // Advance the scheduler past this failed slot so subsequent decoded chunks are
472
- // not blocked waiting for a slot that will never be filled.
473
- if (seq === this._ttsScheduledSeq) {
474
- this._ttsScheduledSeq++;
475
- this._drainTtsDecodedBuffers();
476
- }
477
- this._onTtsSourceEnded();
478
- this.logger.warn('[VoiceService] TTS chunk decode failed', { seq }, e);
479
- }
480
- }
481
-
482
- /**
483
- * Schedules decoded TTS buffers in strict arrival order.
484
- * Called after every successful decode. Drains the _ttsDecodedPending map
485
- * starting at _ttsScheduledSeq, stopping as soon as the next slot is missing
486
- * (i.e. that chunk is still decoding or failed).
487
- */
488
- private _drainTtsDecodedBuffers(): void {
489
- const ctx = this.ttsPlayContext;
490
- if (!ctx) return;
491
- while (this._ttsDecodedPending.has(this._ttsScheduledSeq)) {
492
- const audioBuf = this._ttsDecodedPending.get(this._ttsScheduledSeq)!;
493
- this._ttsDecodedPending.delete(this._ttsScheduledSeq);
494
- this._ttsScheduledSeq++;
495
-
496
389
  const src = ctx.createBufferSource();
497
390
  src.buffer = audioBuf;
498
391
  src.connect(ctx.destination);
499
392
  const t0 = Math.max(ctx.currentTime, this.ttsNextPlayTime);
500
393
  src.start(t0);
501
394
  this.ttsNextPlayTime = t0 + audioBuf.duration;
502
- // Track the expected end time in wall-clock time (ms) for safety timer calculation.
503
- const audioEndDelayMs = (this.ttsNextPlayTime - ctx.currentTime) * 1000;
504
- this._ttsExpectedEndTime = Date.now() + audioEndDelayMs;
505
- const isFirstChunk = this._activeTtsSourceNodes.length === 0;
506
395
  this._activeTtsSourceNodes.push(src);
507
- if (isFirstChunk) {
508
- // First real audio about to play — stop the keyboard typing sound immediately.
509
- this._stopKeyboardSound();
510
- this.logger.info('[VoiceService] TTS playback started', { durationS: audioBuf.duration.toFixed(3), startsAtS: t0.toFixed(3) });
511
- }
512
- this.logger.log('[VoiceService] TTS chunk scheduled', { seq: this._ttsScheduledSeq - 1, durationS: audioBuf.duration.toFixed(3), startsAtS: t0.toFixed(3), activeTtsSources: this._activeTtsSources, expectedEndInMs: audioEndDelayMs.toFixed(0) });
396
+ this.logger.log('[VoiceService] TTS chunk scheduled', { durationS: audioBuf.duration.toFixed(3), startsAtS: t0.toFixed(3), activeTtsSources: this._activeTtsSources });
513
397
  src.onended = () => this._onTtsSourceEnded(src);
398
+ } catch (e) {
399
+ this._onTtsSourceEnded();
400
+ this.logger.warn('[VoiceService] TTS chunk decode failed', e);
514
401
  }
515
402
  }
516
403
 
@@ -521,10 +408,6 @@ export class VoiceService {
521
408
  if (idx !== -1) { this._activeTtsSourceNodes.splice(idx, 1); }
522
409
  }
523
410
  this.logger.log('[VoiceService] TTS source ended', { activeTtsSources: this._activeTtsSources, unblockPending: this._unblockAfterTts });
524
- if (this._activeTtsSources === 0) {
525
- this.logger.info('[VoiceService] TTS playback ended – all sources finished');
526
- console.log('[VoiceService] TTS audio finished playing');
527
- }
528
411
  if (this._unblockAfterTts && this._activeTtsSources === 0) {
529
412
  this._flushTtsUnblock(false);
530
413
  }
@@ -552,11 +435,6 @@ export class VoiceService {
552
435
  this._activeTtsSourceNodes = [];
553
436
  this._activeTtsSources = 0;
554
437
  this._unblockAfterTts = false;
555
- this._ttsExpectedEndTime = 0;
556
- // Reset ordered-scheduling state so the next speaking turn starts fresh.
557
- this._ttsChunkSeq = 0;
558
- this._ttsScheduledSeq = 0;
559
- this._ttsDecodedPending.clear();
560
438
  this._stopTtsKaraoke(true);
561
439
  this.logger.log('[VoiceService] TTS cancelled – all audio sources stopped');
562
440
  }
@@ -574,21 +452,12 @@ export class VoiceService {
574
452
  this.logger.log('[VoiceService] TTS unblock: all sources ended, sending playback complete');
575
453
  }
576
454
  this._stopTtsKaraoke(true);
577
- // Signal the proxy that TTS playback is complete. The proxy will transition
578
- // to LISTENING and send a 'listening' event back; the mic resumes and the UI
579
- // unblocks only then — so the user sees 'listening' exactly when the stream
580
- // is open, not before.
581
- // Start a fallback timer: if the proxy does not respond with 'listening' within
582
- // 3 seconds (network hiccup, server race, etc.) force-unblock so the user is
583
- // never left stuck. The timer is cancelled immediately if 'listening' arrives.
455
+ // Signal the proxy that TTS playback is complete. The proxy will transition
456
+ // to LISTENING and send a 'listening' event back; the mic is unmuted there
457
+ // (not here) so it is live only when the proxy is confirmed ready.
458
+ // Do NOT call _isAcquisitionBlocked$.next(false) here — 'listening' is the
459
+ // single source of truth so that UI and mic unblock atomically.
584
460
  this.voiceStreaming.sendPlaybackComplete();
585
- if (this._listeningFallbackTimer !== null) clearTimeout(this._listeningFallbackTimer);
586
- this._listeningFallbackTimer = setTimeout(() => {
587
- this._listeningFallbackTimer = null;
588
- this.logger.warn('[VoiceService] listening fallback timer fired – proxy did not respond, force-unblocking');
589
- this._isAcquisitionBlocked$.next(false);
590
- this.voiceStreaming.resumeRecording();
591
- }, 3000);
592
461
  }
593
462
 
594
463
  // ── WSS TTS Karaoke helpers ───────────────────────────────────────────────
@@ -661,39 +530,8 @@ export class VoiceService {
661
530
 
662
531
  // ─────────────────────────────────────────────────────────────────────────
663
532
 
664
- // ── Keyboard typing-indicator sound helpers ───────────────────────────────
665
- /**
666
- * Starts the keyboard sound on loop to mask silence while the bot is
667
- * generating its response. No-op if already playing.
668
- * Only called during WSS voice sessions (voice-proxy mode).
669
- */
670
- private _startKeyboardSound(): void {
671
- if (this._keyboardSoundEl) return; // already playing
672
- const file = this.globals.keyboardSoundFile ?? 'keyboard.mp3';
673
- const src = /^https?:\/\//i.test(file)
674
- ? file
675
- : `${this.globals.baseLocation}/assets/sounds/${file}`;
676
- const audio = new Audio(src);
677
- audio.loop = true;
678
- audio.volume = Math.min(1, Math.max(0, this.globals.keyboardSoundVolume));
679
- audio.play().catch((e) => this.logger.warn('[VoiceService] keyboard sound play failed', e));
680
- this._keyboardSoundEl = audio;
681
- this.logger.log('[VoiceService] keyboard sound started', { src, volume: audio.volume });
682
- }
683
-
684
- /** Stops and discards the keyboard typing sound. No-op if not playing. */
685
- private _stopKeyboardSound(): void {
686
- if (!this._keyboardSoundEl) return;
687
- this._keyboardSoundEl.pause();
688
- this._keyboardSoundEl.currentTime = 0;
689
- this._keyboardSoundEl = null;
690
- this.logger.log('[VoiceService] keyboard sound stopped');
691
- }
692
- // ─────────────────────────────────────────────────────────────────────────
693
-
694
533
  async stopSession(options?: { discardInProgressSegment?: boolean}): Promise<{ voiceIngressResultUrl: string | null }> {
695
534
  const discard = options?.discardInProgressSegment === true;
696
- this.logger.info('[VoiceService] stopSession', { discard, isWssVoiceActive: this._isWssVoiceActive$.getValue() });
697
535
 
698
536
  this.wsControlSub?.unsubscribe();
699
537
  this.wsControlSub = undefined;
@@ -710,7 +548,6 @@ export class VoiceService {
710
548
  this._cancelAllTtsAudio();
711
549
  this.ttsPlayContext = undefined;
712
550
  this.ttsNextPlayTime = 0;
713
- this._stopKeyboardSound();
714
551
 
715
552
  let voiceIngressResultUrl: string | null = null;
716
553
  if (this.voiceIngressConfig) {
@@ -753,10 +590,6 @@ export class VoiceService {
753
590
  }
754
591
 
755
592
  // 🎧 cleanup audio context
756
- if (this.volumeRafId) {
757
- cancelAnimationFrame(this.volumeRafId);
758
- this.volumeRafId = undefined;
759
- }
760
593
  this.audioContext?.close();
761
594
  this.audioContext = undefined;
762
595
  this.analyser = undefined;
@@ -775,10 +608,6 @@ export class VoiceService {
775
608
  clearTimeout(this.responseTimeoutId);
776
609
  this.responseTimeoutId = undefined;
777
610
  this.isWaitingForResponse = false;
778
- if (this._listeningFallbackTimer !== null) {
779
- clearTimeout(this._listeningFallbackTimer);
780
- this._listeningFallbackTimer = null;
781
- }
782
611
  this._isAcquisitionBlocked$.next(false);
783
612
 
784
613
  return { voiceIngressResultUrl };
@@ -859,7 +688,8 @@ export class VoiceService {
859
688
  private startVolumeLoop(): void {
860
689
  const tick = () => {
861
690
  if (!this.analyser || !this.dataArray) {
862
- return; // Stop the loop if analyser is cleaned up
691
+ requestAnimationFrame(tick);
692
+ return;
863
693
  }
864
694
 
865
695
  this.analyser.getByteFrequencyData(
@@ -875,10 +705,10 @@ export class VoiceService {
875
705
 
876
706
  this.volumeSubject.next(volume);
877
707
 
878
- this.volumeRafId = requestAnimationFrame(tick);
708
+ requestAnimationFrame(tick);
879
709
  };
880
710
 
881
- this.volumeRafId = requestAnimationFrame(tick);
711
+ tick();
882
712
  }
883
713
 
884
714
  /**