@chat21/chat21-web-widget 5.1.33-rc11 → 5.1.33-rc9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +0 -7
- package/package.json +1 -1
- package/playwright-report/index.html +90 -0
- package/src/app/component/conversation-detail/conversation/conversation.component.ts +1 -3
- package/src/app/component/conversation-detail/conversation-content/conversation-content.component.spec.ts +7 -0
- package/src/app/component/conversation-detail/conversation-content/conversation-content.component.ts +5 -7
- package/src/app/component/conversation-detail/conversation-footer/conversation-footer.component.html +3 -4
- package/src/app/component/conversation-detail/conversation-footer/conversation-footer.component.scss +18 -9
- package/src/app/component/conversation-detail/conversation-footer/conversation-footer.component.ts +0 -6
- package/src/app/component/conversation-detail/stream-audio-spectrum/stream-audio-spectrum.component.html +5 -8
- package/src/app/component/conversation-detail/stream-audio-spectrum/stream-audio-spectrum.component.scss +1 -5
- package/src/app/component/form/inputs/form-text/form-text.component.ts +3 -9
- package/src/app/component/message/bubble-message/bubble-message.component.scss +0 -5
- package/src/app/component/message/bubble-message/bubble-message.component.ts +0 -14
- package/src/app/component/message/json-sources/json-sources.component.scss +8 -12
- package/src/app/pipe/marked.pipe.ts +41 -51
- package/src/app/providers/global-settings.service.ts +0 -29
- package/src/app/providers/json-sources-parser.service.ts +32 -25
- package/src/app/providers/voice/voice-streaming.service.ts +19 -11
- package/src/app/providers/voice/voice-streaming.types.ts +1 -0
- package/src/app/providers/voice/voice.service.spec.ts +45 -12
- package/src/app/providers/voice/voice.service.ts +45 -215
- package/src/app/utils/globals.ts +0 -10
- package/src/assets/i18n/en.json +125 -106
- package/src/assets/i18n/es.json +0 -1
- package/src/assets/i18n/fr.json +0 -1
- package/src/assets/i18n/it.json +0 -1
- package/test-results/.last-run.json +4 -0
- package/src/assets/sounds/keyboard.mp3 +0 -0
|
@@ -13,11 +13,10 @@ import {
|
|
|
13
13
|
VoiceWsControlMessage,
|
|
14
14
|
} from './voice-streaming.types';
|
|
15
15
|
|
|
16
|
-
// Flux docs recommend 80ms chunks for optimal latency;
|
|
17
|
-
// balance for WebM containerization overhead in the browser
|
|
18
|
-
// good STT accuracy.
|
|
16
|
+
// Flux docs recommend 80ms chunks for optimal latency; 250ms is a practical
|
|
17
|
+
// balance for WebM containerization overhead in the browser.
|
|
19
18
|
// Source: https://developers.deepgram.com/docs/flux/quickstart
|
|
20
|
-
const DEFAULT_TIMESLICE_MS =
|
|
19
|
+
const DEFAULT_TIMESLICE_MS = 250;
|
|
21
20
|
const READY_TIMEOUT_MS = 10_000;
|
|
22
21
|
const SESSION_STARTED_TIMEOUT_MS = 10_000;
|
|
23
22
|
|
|
@@ -259,12 +258,6 @@ export class VoiceStreamingService {
|
|
|
259
258
|
this.mediaStream = shared
|
|
260
259
|
? shared
|
|
261
260
|
: await navigator.mediaDevices.getUserMedia({ audio: true });
|
|
262
|
-
const tracks = this.mediaStream.getAudioTracks();
|
|
263
|
-
this.logger.info('[VoiceStreaming] microphone acquired', {
|
|
264
|
-
shared: !!shared,
|
|
265
|
-
tracks: tracks.length,
|
|
266
|
-
label: tracks[0]?.label ?? '(unknown)',
|
|
267
|
-
});
|
|
268
261
|
const recorderOpts: MediaRecorderOptions = {};
|
|
269
262
|
if (mime) {
|
|
270
263
|
recorderOpts.mimeType = mime;
|
|
@@ -578,7 +571,7 @@ export class VoiceStreamingService {
|
|
|
578
571
|
|
|
579
572
|
/**
|
|
580
573
|
* Send `{ event: "tts_playback_complete" }` to the proxy, signalling that TTS
|
|
581
|
-
* playback has finished and the microphone is
|
|
574
|
+
* playback has finished and the microphone is ready to receive user speech.
|
|
582
575
|
*/
|
|
583
576
|
sendPlaybackComplete(): void {
|
|
584
577
|
if (this.ws?.readyState === WebSocket.OPEN) {
|
|
@@ -587,6 +580,21 @@ export class VoiceStreamingService {
|
|
|
587
580
|
}
|
|
588
581
|
}
|
|
589
582
|
|
|
583
|
+
/**
|
|
584
|
+
* Send `{ event: "barge_in" }` to the proxy, requesting an immediate interruption
|
|
585
|
+
* of the ongoing TTS playback. Use when the user explicitly wants to speak while
|
|
586
|
+
* the bot is talking (e.g. via a UI button or a client-side VAD onset).
|
|
587
|
+
*
|
|
588
|
+
* The proxy will stop the TTS stream and transition to LISTENING; the widget should
|
|
589
|
+
* handle the server-sent `barge_in` and `listening` events to update local state.
|
|
590
|
+
*/
|
|
591
|
+
sendBargeIn(): void {
|
|
592
|
+
if (this.ws?.readyState === WebSocket.OPEN) {
|
|
593
|
+
this.ws.send(JSON.stringify({ event: 'barge_in' }));
|
|
594
|
+
this.logger.info('[VoiceStreaming] barge_in sent');
|
|
595
|
+
}
|
|
596
|
+
}
|
|
597
|
+
|
|
590
598
|
private cleanup(): void {
|
|
591
599
|
this.logger.info('[VoiceStreaming] cleanup', { state: this._currentState, sessionId: this.currentSessionId });
|
|
592
600
|
this.audioChunkCount = 0;
|
|
@@ -44,7 +44,7 @@ describe('VoiceService', () => {
|
|
|
44
44
|
|
|
45
45
|
voiceStreamingMock = jasmine.createSpyObj<VoiceStreamingService>(
|
|
46
46
|
'VoiceStreamingService',
|
|
47
|
-
['start', 'stop', 'setAudioMuted', 'sendPlaybackComplete', '
|
|
47
|
+
['start', 'stop', 'setAudioMuted', 'sendPlaybackComplete', 'sendBargeIn'],
|
|
48
48
|
);
|
|
49
49
|
voiceStreamingMock.start.and.returnValue(Promise.resolve());
|
|
50
50
|
voiceStreamingMock.stop.and.returnValue(
|
|
@@ -65,8 +65,6 @@ describe('VoiceService', () => {
|
|
|
65
65
|
],
|
|
66
66
|
});
|
|
67
67
|
service = TestBed.inject(VoiceService);
|
|
68
|
-
spyOn(service as any, '_startKeyboardSound').and.stub();
|
|
69
|
-
spyOn(service as any, '_stopKeyboardSound').and.stub();
|
|
70
68
|
});
|
|
71
69
|
|
|
72
70
|
// ── Existing session lifecycle tests ──────────────────────────────────────
|
|
@@ -158,21 +156,22 @@ describe('VoiceService', () => {
|
|
|
158
156
|
expect(voiceStreamingMock.setAudioMuted).not.toHaveBeenCalled();
|
|
159
157
|
});
|
|
160
158
|
|
|
161
|
-
it('empty-audio path: sendPlaybackComplete
|
|
159
|
+
it('empty-audio path: sendPlaybackComplete immediately but acquisition stays blocked until "listening"', async () => {
|
|
162
160
|
const blocked = await startWssSession();
|
|
163
161
|
const initialLen = blocked.length;
|
|
164
162
|
|
|
165
|
-
// done with
|
|
163
|
+
// Simulate done arriving with NO binary audio (_activeTtsSources === 0)
|
|
166
164
|
wsControl$.next({ event: 'speaking', text: 'hello' } as VoiceWsControlMessage);
|
|
167
165
|
wsControl$.next({ event: 'done' } as VoiceWsControlMessage);
|
|
168
166
|
|
|
169
|
-
|
|
170
|
-
(service as any)._flushTtsUnblock(false);
|
|
167
|
+
// Proxy signalled immediately
|
|
171
168
|
expect(voiceStreamingMock.sendPlaybackComplete).toHaveBeenCalledTimes(1);
|
|
172
169
|
|
|
170
|
+
// Acquisition must still be blocked — proxy hasn't confirmed LISTENING yet
|
|
173
171
|
const afterDone = blocked.slice(initialLen);
|
|
174
172
|
expect(afterDone.every((v) => v === true)).toBeTrue();
|
|
175
173
|
|
|
174
|
+
// Unblock only after proxy confirms
|
|
176
175
|
wsControl$.next({ event: 'listening' } as VoiceWsControlMessage);
|
|
177
176
|
expect(blocked[blocked.length - 1]).toBeFalse();
|
|
178
177
|
});
|
|
@@ -190,19 +189,22 @@ describe('VoiceService', () => {
|
|
|
190
189
|
|
|
191
190
|
// ── Audio preemption tests (SPEC-002) ────────────────────────────────────
|
|
192
191
|
|
|
193
|
-
it('second "speaking" cancels first audio: sendPlaybackComplete
|
|
192
|
+
it('second "speaking" cancels first audio: sendPlaybackComplete called exactly once for the new turn', async () => {
|
|
194
193
|
await startWssSession();
|
|
195
194
|
voiceStreamingMock.sendPlaybackComplete.calls.reset();
|
|
196
195
|
|
|
196
|
+
// First turn: audio chunk arrives → _activeTtsSources = 1 (sync) → done sets _unblockAfterTts
|
|
197
197
|
wsControl$.next({ event: 'speaking', text: 'first' } as VoiceWsControlMessage);
|
|
198
|
-
ttsBinaryChunk$.next(new ArrayBuffer(4));
|
|
199
|
-
wsControl$.next({ event: 'done' } as VoiceWsControlMessage);
|
|
198
|
+
ttsBinaryChunk$.next(new ArrayBuffer(4)); // _activeTtsSources++ synchronously
|
|
199
|
+
wsControl$.next({ event: 'done' } as VoiceWsControlMessage); // _unblockAfterTts = true
|
|
200
200
|
|
|
201
|
+
// Second turn preempts while first audio is still "playing"
|
|
201
202
|
wsControl$.next({ event: 'speaking', text: 'second' } as VoiceWsControlMessage);
|
|
203
|
+
// _cancelAllTtsAudio() resets _activeTtsSources=0, _unblockAfterTts=false
|
|
204
|
+
|
|
205
|
+
// done with no audio → sendPlaybackComplete immediately (new turn, _activeTtsSources = 0)
|
|
202
206
|
wsControl$.next({ event: 'done' } as VoiceWsControlMessage);
|
|
203
207
|
|
|
204
|
-
expect(voiceStreamingMock.sendPlaybackComplete).not.toHaveBeenCalled();
|
|
205
|
-
(service as any)._flushTtsUnblock(false);
|
|
206
208
|
expect(voiceStreamingMock.sendPlaybackComplete).toHaveBeenCalledTimes(1);
|
|
207
209
|
});
|
|
208
210
|
|
|
@@ -224,4 +226,35 @@ describe('VoiceService', () => {
|
|
|
224
226
|
expect(voiceStreamingMock.sendPlaybackComplete).not.toHaveBeenCalled();
|
|
225
227
|
});
|
|
226
228
|
|
|
229
|
+
// ── Barge-in ──────────────────────────────────────────────────────────────
|
|
230
|
+
|
|
231
|
+
it('barge_in event cancels TTS audio and unblocks acquisition without sending tts_playback_complete', async () => {
|
|
232
|
+
await startWssSession();
|
|
233
|
+
voiceStreamingMock.sendPlaybackComplete.calls.reset();
|
|
234
|
+
|
|
235
|
+
// Simulate bot speaking with audio in flight
|
|
236
|
+
wsControl$.next({ event: 'speaking', text: 'hello' } as VoiceWsControlMessage);
|
|
237
|
+
ttsBinaryChunk$.next(new ArrayBuffer(4)); // _activeTtsSources++ synchronously
|
|
238
|
+
wsControl$.next({ event: 'done' } as VoiceWsControlMessage); // _unblockAfterTts = true
|
|
239
|
+
|
|
240
|
+
// Proxy detects user speech and sends barge_in
|
|
241
|
+
wsControl$.next({ event: 'barge_in' } as VoiceWsControlMessage);
|
|
242
|
+
|
|
243
|
+
// tts_playback_complete must NOT be sent — it was an interruption, not a completion
|
|
244
|
+
expect(voiceStreamingMock.sendPlaybackComplete).not.toHaveBeenCalled();
|
|
245
|
+
expect(voiceStreamingMock.setAudioMuted).not.toHaveBeenCalled();
|
|
246
|
+
expect((service as any)._isAcquisitionBlocked$.getValue()).toBe(false);
|
|
247
|
+
});
|
|
248
|
+
|
|
249
|
+
it('barge_in while no TTS is active does not throw and still unblocks acquisition', async () => {
|
|
250
|
+
await startWssSession();
|
|
251
|
+
voiceStreamingMock.sendPlaybackComplete.calls.reset();
|
|
252
|
+
|
|
253
|
+
// No speaking event — mic was never muted
|
|
254
|
+
expect(() => {
|
|
255
|
+
wsControl$.next({ event: 'barge_in' } as VoiceWsControlMessage);
|
|
256
|
+
}).not.toThrow();
|
|
257
|
+
|
|
258
|
+
expect(voiceStreamingMock.sendPlaybackComplete).not.toHaveBeenCalled();
|
|
259
|
+
});
|
|
227
260
|
});
|
|
@@ -4,7 +4,6 @@ import { getDefaultRealTimeVADOptions } from '@ricky0123/vad-web';
|
|
|
4
4
|
import { BehaviorSubject, Observable, Subject, Subscription } from 'rxjs';
|
|
5
5
|
import { LoggerInstance } from 'src/chat21-core/providers/logger/loggerInstance';
|
|
6
6
|
import { LoggerService } from 'src/chat21-core/providers/abstract/logger.service';
|
|
7
|
-
import { Globals } from 'src/app/utils/globals';
|
|
8
7
|
|
|
9
8
|
import {
|
|
10
9
|
DEFAULT_VOICE_MEDIA_STREAM_CONSTRAINTS,
|
|
@@ -110,8 +109,6 @@ export class VoiceService {
|
|
|
110
109
|
private analyser?: AnalyserNode;
|
|
111
110
|
/** Dedicated buffer (`ArrayBuffer`) for compatibility with `getByteFrequencyData`. */
|
|
112
111
|
private dataArray?: Uint8Array;
|
|
113
|
-
/** RAF ID for volume loop - used to cancel on cleanup */
|
|
114
|
-
private volumeRafId?: number;
|
|
115
112
|
|
|
116
113
|
/** Playback of binary TTS chunks received from the proxy (Web Audio). */
|
|
117
114
|
private ttsPlayContext?: AudioContext;
|
|
@@ -127,29 +124,9 @@ export class VoiceService {
|
|
|
127
124
|
// (barge_in or a new speaking event). playWsTtsChunk captures this at entry and
|
|
128
125
|
// checks it after the async decodeAudioData call to discard stale results.
|
|
129
126
|
private _ttsGeneration = 0;
|
|
130
|
-
|
|
131
|
-
// ── Ordered-scheduling state ──────────────────────────────────────────────────────────────────
|
|
132
|
-
// Chunks arrive over WebSocket and their decodeAudioData calls run concurrently.
|
|
133
|
-
// Because a smaller/later chunk can decode faster than a larger/earlier one, scheduling
|
|
134
|
-
// based solely on decode-completion order causes audio to play out of arrival order
|
|
135
|
-
// (e.g. "manuale" starts before "scrittura" even though it arrived after it).
|
|
136
|
-
// Fix: assign a monotonic sequence number on arrival, decode in parallel, but only
|
|
137
|
-
// schedule a buffer once every preceding buffer has already been scheduled.
|
|
138
|
-
private _ttsChunkSeq = 0; // Incremented on each chunk arrival (arrival order)
|
|
139
|
-
private _ttsScheduledSeq = 0; // Next sequence slot that is allowed to be scheduled
|
|
140
|
-
// Decoded buffers waiting for their turn to be scheduled (keyed by arrival sequence)
|
|
141
|
-
private _ttsDecodedPending = new Map<number, AudioBuffer>();
|
|
142
|
-
// ─────────────────────────────────────────────────────────────────────────────────────────────
|
|
143
127
|
// Set to true by the 'done' event; triggers acquisition unblock once all sources end.
|
|
144
128
|
private _unblockAfterTts = false;
|
|
145
129
|
private _unblockSafetyTimer: ReturnType<typeof setTimeout> | null = null;
|
|
146
|
-
// Fallback timer started after sendPlaybackComplete. If the proxy does not reply
|
|
147
|
-
// with 'listening' within the timeout window, the UI is force-unblocked so the
|
|
148
|
-
// user is not left stuck waiting indefinitely.
|
|
149
|
-
private _listeningFallbackTimer: ReturnType<typeof setTimeout> | null = null;
|
|
150
|
-
// Track when the last TTS chunk is expected to finish playing.
|
|
151
|
-
// Used to calculate a proper safety timer duration for long messages.
|
|
152
|
-
private _ttsExpectedEndTime = 0;
|
|
153
130
|
|
|
154
131
|
// ── WSS TTS Karaoke ──────────────────────────────────────────────────────────────────────────
|
|
155
132
|
private _kText = '';
|
|
@@ -164,22 +141,13 @@ export class VoiceService {
|
|
|
164
141
|
readonly voiceTtsKaraoke$: Observable<VoiceTtsKaraokeFrame> = this._voiceTtsKaraokeSubject.asObservable();
|
|
165
142
|
// ─────────────────────────────────────────────────────────────────────────────────────────────
|
|
166
143
|
|
|
167
|
-
// ── Thinking / typing-indicator sound ─────────────────────────────────────────────────────────
|
|
168
|
-
// Played on loop while the bot is thinking or the first TTS chunk hasn't arrived yet.
|
|
169
|
-
// Only active during WSS voice sessions (voice-proxy mode).
|
|
170
|
-
private _keyboardSoundEl: HTMLAudioElement | null = null;
|
|
171
|
-
// ─────────────────────────────────────────────────────────────────────────────────────────────
|
|
172
|
-
|
|
173
144
|
private readonly logger: LoggerService = LoggerInstance.getInstance();
|
|
174
145
|
|
|
175
|
-
private readonly bufferTime = 200000; // used as max safety timer duration for long TTS messages
|
|
176
|
-
|
|
177
146
|
constructor(
|
|
178
147
|
private readonly vadService: VadService,
|
|
179
148
|
private readonly ttsPlayback: TtsAudioPlaybackCoordinator,
|
|
180
149
|
private readonly voiceStreaming: VoiceStreamingService,
|
|
181
150
|
@Optional() @Inject(SpeechToTextProvider) private readonly speechToText: SpeechToTextProvider | null,
|
|
182
|
-
private readonly globals: Globals,
|
|
183
151
|
) {}
|
|
184
152
|
|
|
185
153
|
get isSessionActive(): boolean {
|
|
@@ -204,8 +172,6 @@ export class VoiceService {
|
|
|
204
172
|
* Requests the microphone, starts the VAD listening for speech start/end, and records one WebM per segment.
|
|
205
173
|
*/
|
|
206
174
|
async startSession(options: VoiceSessionStartOptions = {}): Promise<void> {
|
|
207
|
-
const mode = options.voiceIngressStream ? 'wss-proxy' : 'legacy-vad';
|
|
208
|
-
this.logger.info('[VoiceService] startSession', { mode });
|
|
209
175
|
await this.stopSession();
|
|
210
176
|
|
|
211
177
|
this.sessionConstraints = options.constraints ?? DEFAULT_VOICE_MEDIA_STREAM_CONSTRAINTS;
|
|
@@ -223,13 +189,7 @@ export class VoiceService {
|
|
|
223
189
|
|
|
224
190
|
/** Proxy-driven session: mic + volume + WSS only (mic audio uploaded; events + TTS downloaded). */
|
|
225
191
|
private async startWssVoiceSession(): Promise<void> {
|
|
226
|
-
this.logger.info('[VoiceService] acquiring microphone for WSS session');
|
|
227
192
|
this.stream = await navigator.mediaDevices.getUserMedia(this.sessionConstraints);
|
|
228
|
-
const tracks = this.stream.getAudioTracks();
|
|
229
|
-
this.logger.info('[VoiceService] microphone acquired', {
|
|
230
|
-
tracks: tracks.length,
|
|
231
|
-
label: tracks[0]?.label ?? '(unknown)',
|
|
232
|
-
});
|
|
233
193
|
|
|
234
194
|
// 🎧 AUDIO ANALYSER INIT
|
|
235
195
|
this.initAudioAnalyser(this.stream);
|
|
@@ -242,7 +202,7 @@ export class VoiceService {
|
|
|
242
202
|
await this.voiceStreaming.start(this.voiceIngressConfig!, { sharedMediaStream: this.stream });
|
|
243
203
|
// Signal that the voice proxy is now live — suppresses tiledesk-server TTS.
|
|
244
204
|
this._isWssVoiceActive$.next(true);
|
|
245
|
-
this.logger.
|
|
205
|
+
this.logger.log('[VoiceService] sessione WSS (nessun VAD locale)');
|
|
246
206
|
} catch (e) {
|
|
247
207
|
this.wsControlSub?.unsubscribe();
|
|
248
208
|
this.wsControlSub = undefined;
|
|
@@ -320,45 +280,25 @@ export class VoiceService {
|
|
|
320
280
|
this.logger.log('[VoiceService] session_started', { requestId: msg.requestId ?? '' });
|
|
321
281
|
break;
|
|
322
282
|
case 'listening':
|
|
323
|
-
// Proxy confirmed it is in LISTENING state — unblock the UI
|
|
324
|
-
//
|
|
325
|
-
//
|
|
326
|
-
// is confirmed ready to receive audio again.
|
|
327
|
-
if (this._listeningFallbackTimer !== null) {
|
|
328
|
-
clearTimeout(this._listeningFallbackTimer);
|
|
329
|
-
this._listeningFallbackTimer = null;
|
|
330
|
-
}
|
|
331
|
-
// If TTS never arrived (edge case) the keyboard sound would still be looping — stop it.
|
|
332
|
-
this._stopKeyboardSound();
|
|
283
|
+
// Proxy confirmed it is in LISTENING state — unblock the UI.
|
|
284
|
+
// Audio has been flowing continuously (AEC handles echo suppression),
|
|
285
|
+
// so there is nothing to unmute here.
|
|
333
286
|
this._isAcquisitionBlocked$.next(false);
|
|
334
|
-
this.
|
|
335
|
-
this.logger.log('[VoiceService] listening – acquisition unblocked, recording resumed');
|
|
287
|
+
this.logger.log('[VoiceService] listening – acquisition unblocked');
|
|
336
288
|
break;
|
|
337
289
|
case 'transcript': {
|
|
338
290
|
const text = typeof msg.text === 'string' ? msg.text : '';
|
|
339
291
|
const isFinal = !!msg.isFinal;
|
|
340
|
-
// Guard: if the proxy has already moved to PROCESSING (thinking) or SPEAKING,
|
|
341
|
-
// this transcript is a stale in-flight STT result. Discard it so it cannot
|
|
342
|
-
// override the blocked acquisition state or reach any downstream subscriber.
|
|
343
|
-
// 'thinking' is stronger than 'transcript' — state must not regress.
|
|
344
|
-
if (this._isAcquisitionBlocked$.value) {
|
|
345
|
-
this.logger.warn('[VoiceService] transcript discarded – arrived after thinking/speaking (stale STT result)', { text, isFinal });
|
|
346
|
-
break;
|
|
347
|
-
}
|
|
348
292
|
this.logger.log('[VoiceService] transcript', { text, isFinal });
|
|
349
293
|
this.voiceTranscriptSubject.next({ text, isFinal });
|
|
350
294
|
break;
|
|
351
295
|
}
|
|
352
296
|
case 'thinking':
|
|
353
297
|
// Block acquisition UI while the bot processes the utterance.
|
|
354
|
-
//
|
|
355
|
-
//
|
|
356
|
-
// confirms LISTENING (i.e. after TTS playback has fully finished).
|
|
298
|
+
// Audio continues flowing to the proxy so the server can detect
|
|
299
|
+
// barge-in via Flux STT even during PROCESSING state.
|
|
357
300
|
this._isAcquisitionBlocked$.next(true);
|
|
358
|
-
this.
|
|
359
|
-
// Play keyboard typing sound to mask the silence while the bot generates its response.
|
|
360
|
-
this._startKeyboardSound();
|
|
361
|
-
this.logger.log('[VoiceService] thinking – acquisition blocked, recording paused', { activeTtsSources: this._activeTtsSources });
|
|
301
|
+
this.logger.log('[VoiceService] thinking – acquisition blocked', { activeTtsSources: this._activeTtsSources });
|
|
362
302
|
break;
|
|
363
303
|
case 'speaking': {
|
|
364
304
|
this._isAcquisitionBlocked$.next(true);
|
|
@@ -370,13 +310,8 @@ export class VoiceService {
|
|
|
370
310
|
this._cancelAllTtsAudio();
|
|
371
311
|
// Reset TTS scheduling so new chunks play from now, not a stale future time.
|
|
372
312
|
this.ttsNextPlayTime = this.ttsPlayContext?.currentTime ?? 0;
|
|
373
|
-
// Reset expected end time for new TTS stream
|
|
374
|
-
this._ttsExpectedEndTime = 0;
|
|
375
313
|
const preview = typeof msg.text === 'string' ? msg.text.slice(0, 80) : '';
|
|
376
314
|
this.logger.log('[VoiceService] speaking – acquisition blocked, TTS text preview', { preview });
|
|
377
|
-
// Keep keyboard sound going (or start it as a fallback if 'thinking' was missed)
|
|
378
|
-
// until the first TTS audio chunk actually starts playing.
|
|
379
|
-
this._startKeyboardSound();
|
|
380
315
|
// Emit the text being spoken so UI can display it alongside the audio.
|
|
381
316
|
if (typeof msg.text === 'string' && msg.text) {
|
|
382
317
|
this.voiceTtsTextSubject.next(msg.text);
|
|
@@ -389,31 +324,31 @@ export class VoiceService {
|
|
|
389
324
|
// _activeTtsSources tracks pending sources; when the last one ends, acquisition unblocks.
|
|
390
325
|
if (this._activeTtsSources > 0) {
|
|
391
326
|
this._unblockAfterTts = true;
|
|
392
|
-
//
|
|
393
|
-
// Add 5 seconds buffer for network/decode latency.
|
|
394
|
-
// Minimum 5 seconds, maximum 300 seconds for very long messages.
|
|
395
|
-
const remainingMs = Math.max(0, this._ttsExpectedEndTime - Date.now());
|
|
396
|
-
const safetyMs = Math.min(this.bufferTime, Math.max(5000, remainingMs + 5000));
|
|
327
|
+
// Safety: force-unblock after 15 s in case onended never fires.
|
|
397
328
|
if (this._unblockSafetyTimer !== null) clearTimeout(this._unblockSafetyTimer);
|
|
398
|
-
this._unblockSafetyTimer = setTimeout(() => this._flushTtsUnblock(true),
|
|
399
|
-
this.logger.log('[VoiceService] done – TTS still pending, waiting for all sources to end', {
|
|
400
|
-
activeTtsSources: this._activeTtsSources,
|
|
401
|
-
expectedEndInMs: remainingMs,
|
|
402
|
-
safetyTimerMs: safetyMs
|
|
403
|
-
});
|
|
329
|
+
this._unblockSafetyTimer = setTimeout(() => this._flushTtsUnblock(true), 15000);
|
|
330
|
+
this.logger.log('[VoiceService] done – TTS still pending, waiting for all sources to end', { activeTtsSources: this._activeTtsSources });
|
|
404
331
|
} else {
|
|
405
|
-
// No audio sources
|
|
406
|
-
//
|
|
407
|
-
//
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
// Safety timer as last resort in case no chunks arrive at all.
|
|
413
|
-
if (this._unblockSafetyTimer !== null) clearTimeout(this._unblockSafetyTimer);
|
|
414
|
-
this._unblockSafetyTimer = setTimeout(() => this._flushTtsUnblock(true), 10000);
|
|
332
|
+
// No audio sources pending — playback was already complete (or audio was empty).
|
|
333
|
+
// Signal the proxy synchronously; acquisition stays blocked until the proxy confirms
|
|
334
|
+
// LISTENING via the 'listening' event.
|
|
335
|
+
this.logger.log('[VoiceService] done – no pending TTS, sending playback complete immediately');
|
|
336
|
+
this.voiceStreaming.sendPlaybackComplete();
|
|
337
|
+
// Do NOT unblock acquisition here — proxy will send 'listening' which is
|
|
338
|
+
// the single source of truth for unblocking acquisition (audio is never muted).
|
|
415
339
|
}
|
|
416
340
|
break;
|
|
341
|
+
case 'barge_in':
|
|
342
|
+
// Proxy's VAD detected user speech while the bot was talking — stop TTS immediately.
|
|
343
|
+
// Do NOT send tts_playback_complete; this is an interruption, not a normal completion.
|
|
344
|
+
// The proxy will follow with { event: "listening" }; acquisition is also unblocked right here, without waiting for it.
|
|
345
|
+
// Audio was never muted, so there is nothing to unmute.
|
|
346
|
+
this._cancelAllTtsAudio();
|
|
347
|
+
this.ttsNextPlayTime = 0;
|
|
348
|
+
this._unblockAfterTts = false;
|
|
349
|
+
this._isAcquisitionBlocked$.next(false);
|
|
350
|
+
this.logger.log('[VoiceService] barge_in – TTS cancelled, acquisition unblocked');
|
|
351
|
+
break;
|
|
417
352
|
case 'error': {
|
|
418
353
|
const errorMsg = typeof msg.message === 'string' ? msg.message : 'Voice session error';
|
|
419
354
|
this.logger.error('[VoiceService] WSS error', errorMsg);
|
|
@@ -426,19 +361,8 @@ export class VoiceService {
|
|
|
426
361
|
}
|
|
427
362
|
}
|
|
428
363
|
|
|
429
|
-
/**
|
|
430
|
-
* Chunk TTS: ogni buffer deve essere decodificabile da `decodeAudioData` (es. segmento WebM/Opus completo).
|
|
431
|
-
*
|
|
432
|
-
* Decode-race fix: multiple chunks decode concurrently; a smaller/later chunk can finish
|
|
433
|
-
* decoding before a larger/earlier one, which would cause the AudioBufferSourceNode to be
|
|
434
|
-
* scheduled out of arrival order (e.g. "manuale" before "scrittura"). To prevent this, each
|
|
435
|
-
* chunk is assigned a monotonic sequence number on arrival and stored in _ttsDecodedPending
|
|
436
|
-
* after decoding. _drainTtsDecodedBuffers() only advances the schedule when the next
|
|
437
|
-
* expected sequence slot is present, guaranteeing arrival-order playback regardless of decode speed.
|
|
438
|
-
*/
|
|
364
|
+
/** TTS chunk: each buffer must be decodable by `decodeAudioData` (e.g. a complete WebM/Opus segment). */
|
|
439
365
|
private async playWsTtsChunk(buf: ArrayBuffer): Promise<void> {
|
|
440
|
-
// Assign arrival-order sequence number SYNCHRONOUSLY before any await.
|
|
441
|
-
const seq = this._ttsChunkSeq++;
|
|
442
366
|
// Capture the current generation BEFORE the synchronous increment so that
|
|
443
367
|
// if _cancelAllTtsAudio() fires (incrementing _ttsGeneration) while this
|
|
444
368
|
// decode is in-flight, the mismatch is detected and the stale chunk is discarded.
|
|
@@ -446,12 +370,11 @@ export class VoiceService {
|
|
|
446
370
|
// Increment SYNCHRONOUSLY before any await so the 'done' event handler (which arrives
|
|
447
371
|
// on the next WebSocket message — a different event-loop tick) sees a non-zero count.
|
|
448
372
|
this._activeTtsSources++;
|
|
449
|
-
this.logger.log('[VoiceService] TTS chunk received', {
|
|
373
|
+
this.logger.log('[VoiceService] TTS chunk received', { bytes: buf.byteLength, activeTtsSources: this._activeTtsSources });
|
|
450
374
|
try {
|
|
451
375
|
if (!this.ttsPlayContext || this.ttsPlayContext.state === 'closed') {
|
|
452
376
|
this.ttsPlayContext = new AudioContext();
|
|
453
377
|
this.ttsNextPlayTime = this.ttsPlayContext.currentTime;
|
|
454
|
-
this.logger.info('[VoiceService] TTS AudioContext created');
|
|
455
378
|
}
|
|
456
379
|
const ctx = this.ttsPlayContext;
|
|
457
380
|
const audioBuf = await ctx.decodeAudioData(buf.slice(0));
|
|
@@ -460,57 +383,21 @@ export class VoiceService {
|
|
|
460
383
|
// for a turn that was already cancelled, and undo the counter increment.
|
|
461
384
|
if (this._ttsGeneration !== capturedGeneration) {
|
|
462
385
|
this._activeTtsSources = Math.max(0, this._activeTtsSources - 1);
|
|
463
|
-
this.logger.log('[VoiceService] TTS chunk discarded – stale generation', {
|
|
386
|
+
this.logger.log('[VoiceService] TTS chunk discarded – stale generation', { capturedGeneration, currentGeneration: this._ttsGeneration });
|
|
464
387
|
return;
|
|
465
388
|
}
|
|
466
|
-
// Store the decoded buffer under its arrival sequence number and attempt to
|
|
467
|
-
// flush any contiguous run of decoded buffers in order.
|
|
468
|
-
this._ttsDecodedPending.set(seq, audioBuf);
|
|
469
|
-
this._drainTtsDecodedBuffers();
|
|
470
|
-
} catch (e) {
|
|
471
|
-
// Advance the scheduler past this failed slot so subsequent decoded chunks are
|
|
472
|
-
// not blocked waiting for a slot that will never be filled.
|
|
473
|
-
if (seq === this._ttsScheduledSeq) {
|
|
474
|
-
this._ttsScheduledSeq++;
|
|
475
|
-
this._drainTtsDecodedBuffers();
|
|
476
|
-
}
|
|
477
|
-
this._onTtsSourceEnded();
|
|
478
|
-
this.logger.warn('[VoiceService] TTS chunk decode failed', { seq }, e);
|
|
479
|
-
}
|
|
480
|
-
}
|
|
481
|
-
|
|
482
|
-
/**
|
|
483
|
-
* Schedules decoded TTS buffers in strict arrival order.
|
|
484
|
-
* Called after every successful decode. Drains the _ttsDecodedPending map
|
|
485
|
-
* starting at _ttsScheduledSeq, stopping as soon as the next slot is missing
|
|
486
|
-
* (i.e. that chunk is still decoding or failed).
|
|
487
|
-
*/
|
|
488
|
-
private _drainTtsDecodedBuffers(): void {
|
|
489
|
-
const ctx = this.ttsPlayContext;
|
|
490
|
-
if (!ctx) return;
|
|
491
|
-
while (this._ttsDecodedPending.has(this._ttsScheduledSeq)) {
|
|
492
|
-
const audioBuf = this._ttsDecodedPending.get(this._ttsScheduledSeq)!;
|
|
493
|
-
this._ttsDecodedPending.delete(this._ttsScheduledSeq);
|
|
494
|
-
this._ttsScheduledSeq++;
|
|
495
|
-
|
|
496
389
|
const src = ctx.createBufferSource();
|
|
497
390
|
src.buffer = audioBuf;
|
|
498
391
|
src.connect(ctx.destination);
|
|
499
392
|
const t0 = Math.max(ctx.currentTime, this.ttsNextPlayTime);
|
|
500
393
|
src.start(t0);
|
|
501
394
|
this.ttsNextPlayTime = t0 + audioBuf.duration;
|
|
502
|
-
// Track the expected end time in wall-clock time (ms) for safety timer calculation.
|
|
503
|
-
const audioEndDelayMs = (this.ttsNextPlayTime - ctx.currentTime) * 1000;
|
|
504
|
-
this._ttsExpectedEndTime = Date.now() + audioEndDelayMs;
|
|
505
|
-
const isFirstChunk = this._activeTtsSourceNodes.length === 0;
|
|
506
395
|
this._activeTtsSourceNodes.push(src);
|
|
507
|
-
|
|
508
|
-
// First real audio about to play — stop the keyboard typing sound immediately.
|
|
509
|
-
this._stopKeyboardSound();
|
|
510
|
-
this.logger.info('[VoiceService] TTS playback started', { durationS: audioBuf.duration.toFixed(3), startsAtS: t0.toFixed(3) });
|
|
511
|
-
}
|
|
512
|
-
this.logger.log('[VoiceService] TTS chunk scheduled', { seq: this._ttsScheduledSeq - 1, durationS: audioBuf.duration.toFixed(3), startsAtS: t0.toFixed(3), activeTtsSources: this._activeTtsSources, expectedEndInMs: audioEndDelayMs.toFixed(0) });
|
|
396
|
+
this.logger.log('[VoiceService] TTS chunk scheduled', { durationS: audioBuf.duration.toFixed(3), startsAtS: t0.toFixed(3), activeTtsSources: this._activeTtsSources });
|
|
513
397
|
src.onended = () => this._onTtsSourceEnded(src);
|
|
398
|
+
} catch (e) {
|
|
399
|
+
this._onTtsSourceEnded();
|
|
400
|
+
this.logger.warn('[VoiceService] TTS chunk decode failed', e);
|
|
514
401
|
}
|
|
515
402
|
}
|
|
516
403
|
|
|
@@ -521,10 +408,6 @@ export class VoiceService {
|
|
|
521
408
|
if (idx !== -1) { this._activeTtsSourceNodes.splice(idx, 1); }
|
|
522
409
|
}
|
|
523
410
|
this.logger.log('[VoiceService] TTS source ended', { activeTtsSources: this._activeTtsSources, unblockPending: this._unblockAfterTts });
|
|
524
|
-
if (this._activeTtsSources === 0) {
|
|
525
|
-
this.logger.info('[VoiceService] TTS playback ended – all sources finished');
|
|
526
|
-
console.log('[VoiceService] TTS audio finished playing');
|
|
527
|
-
}
|
|
528
411
|
if (this._unblockAfterTts && this._activeTtsSources === 0) {
|
|
529
412
|
this._flushTtsUnblock(false);
|
|
530
413
|
}
|
|
@@ -552,11 +435,6 @@ export class VoiceService {
|
|
|
552
435
|
this._activeTtsSourceNodes = [];
|
|
553
436
|
this._activeTtsSources = 0;
|
|
554
437
|
this._unblockAfterTts = false;
|
|
555
|
-
this._ttsExpectedEndTime = 0;
|
|
556
|
-
// Reset ordered-scheduling state so the next speaking turn starts fresh.
|
|
557
|
-
this._ttsChunkSeq = 0;
|
|
558
|
-
this._ttsScheduledSeq = 0;
|
|
559
|
-
this._ttsDecodedPending.clear();
|
|
560
438
|
this._stopTtsKaraoke(true);
|
|
561
439
|
this.logger.log('[VoiceService] TTS cancelled – all audio sources stopped');
|
|
562
440
|
}
|
|
@@ -574,21 +452,12 @@ export class VoiceService {
|
|
|
574
452
|
this.logger.log('[VoiceService] TTS unblock: all sources ended, sending playback complete');
|
|
575
453
|
}
|
|
576
454
|
this._stopTtsKaraoke(true);
|
|
577
|
-
// Signal the proxy that TTS playback is complete.
|
|
578
|
-
// to LISTENING and send a 'listening' event back; the mic
|
|
579
|
-
//
|
|
580
|
-
//
|
|
581
|
-
//
|
|
582
|
-
// 3 seconds (network hiccup, server race, etc.) force-unblock so the user is
|
|
583
|
-
// never left stuck. The timer is cancelled immediately if 'listening' arrives.
|
|
455
|
+
// Signal the proxy that TTS playback is complete. The proxy will transition
|
|
456
|
+
// to LISTENING and send a 'listening' event back; the mic is unmuted there
|
|
457
|
+
// (not here) so it is live only when the proxy is confirmed ready.
|
|
458
|
+
// Do NOT call _isAcquisitionBlocked$.next(false) here — 'listening' is the
|
|
459
|
+
// single source of truth so that UI and mic unblock atomically.
|
|
584
460
|
this.voiceStreaming.sendPlaybackComplete();
|
|
585
|
-
if (this._listeningFallbackTimer !== null) clearTimeout(this._listeningFallbackTimer);
|
|
586
|
-
this._listeningFallbackTimer = setTimeout(() => {
|
|
587
|
-
this._listeningFallbackTimer = null;
|
|
588
|
-
this.logger.warn('[VoiceService] listening fallback timer fired – proxy did not respond, force-unblocking');
|
|
589
|
-
this._isAcquisitionBlocked$.next(false);
|
|
590
|
-
this.voiceStreaming.resumeRecording();
|
|
591
|
-
}, 3000);
|
|
592
461
|
}
|
|
593
462
|
|
|
594
463
|
// ── WSS TTS Karaoke helpers ───────────────────────────────────────────────
|
|
@@ -661,39 +530,8 @@ export class VoiceService {
|
|
|
661
530
|
|
|
662
531
|
// ─────────────────────────────────────────────────────────────────────────
|
|
663
532
|
|
|
664
|
-
// ── Keyboard typing-indicator sound helpers ───────────────────────────────
|
|
665
|
-
/**
|
|
666
|
-
* Starts the keyboard sound on loop to mask silence while the bot is
|
|
667
|
-
* generating its response. No-op if already playing.
|
|
668
|
-
* Only called during WSS voice sessions (voice-proxy mode).
|
|
669
|
-
*/
|
|
670
|
-
private _startKeyboardSound(): void {
|
|
671
|
-
if (this._keyboardSoundEl) return; // already playing
|
|
672
|
-
const file = this.globals.keyboardSoundFile ?? 'keyboard.mp3';
|
|
673
|
-
const src = /^https?:\/\//i.test(file)
|
|
674
|
-
? file
|
|
675
|
-
: `${this.globals.baseLocation}/assets/sounds/${file}`;
|
|
676
|
-
const audio = new Audio(src);
|
|
677
|
-
audio.loop = true;
|
|
678
|
-
audio.volume = Math.min(1, Math.max(0, this.globals.keyboardSoundVolume));
|
|
679
|
-
audio.play().catch((e) => this.logger.warn('[VoiceService] keyboard sound play failed', e));
|
|
680
|
-
this._keyboardSoundEl = audio;
|
|
681
|
-
this.logger.log('[VoiceService] keyboard sound started', { src, volume: audio.volume });
|
|
682
|
-
}
|
|
683
|
-
|
|
684
|
-
/** Stops and discards the keyboard typing sound. No-op if not playing. */
|
|
685
|
-
private _stopKeyboardSound(): void {
|
|
686
|
-
if (!this._keyboardSoundEl) return;
|
|
687
|
-
this._keyboardSoundEl.pause();
|
|
688
|
-
this._keyboardSoundEl.currentTime = 0;
|
|
689
|
-
this._keyboardSoundEl = null;
|
|
690
|
-
this.logger.log('[VoiceService] keyboard sound stopped');
|
|
691
|
-
}
|
|
692
|
-
// ─────────────────────────────────────────────────────────────────────────
|
|
693
|
-
|
|
694
533
|
async stopSession(options?: { discardInProgressSegment?: boolean}): Promise<{ voiceIngressResultUrl: string | null }> {
|
|
695
534
|
const discard = options?.discardInProgressSegment === true;
|
|
696
|
-
this.logger.info('[VoiceService] stopSession', { discard, isWssVoiceActive: this._isWssVoiceActive$.getValue() });
|
|
697
535
|
|
|
698
536
|
this.wsControlSub?.unsubscribe();
|
|
699
537
|
this.wsControlSub = undefined;
|
|
@@ -710,7 +548,6 @@ export class VoiceService {
|
|
|
710
548
|
this._cancelAllTtsAudio();
|
|
711
549
|
this.ttsPlayContext = undefined;
|
|
712
550
|
this.ttsNextPlayTime = 0;
|
|
713
|
-
this._stopKeyboardSound();
|
|
714
551
|
|
|
715
552
|
let voiceIngressResultUrl: string | null = null;
|
|
716
553
|
if (this.voiceIngressConfig) {
|
|
@@ -753,10 +590,6 @@ export class VoiceService {
|
|
|
753
590
|
}
|
|
754
591
|
|
|
755
592
|
// 🎧 cleanup audio context
|
|
756
|
-
if (this.volumeRafId) {
|
|
757
|
-
cancelAnimationFrame(this.volumeRafId);
|
|
758
|
-
this.volumeRafId = undefined;
|
|
759
|
-
}
|
|
760
593
|
this.audioContext?.close();
|
|
761
594
|
this.audioContext = undefined;
|
|
762
595
|
this.analyser = undefined;
|
|
@@ -775,10 +608,6 @@ export class VoiceService {
|
|
|
775
608
|
clearTimeout(this.responseTimeoutId);
|
|
776
609
|
this.responseTimeoutId = undefined;
|
|
777
610
|
this.isWaitingForResponse = false;
|
|
778
|
-
if (this._listeningFallbackTimer !== null) {
|
|
779
|
-
clearTimeout(this._listeningFallbackTimer);
|
|
780
|
-
this._listeningFallbackTimer = null;
|
|
781
|
-
}
|
|
782
611
|
this._isAcquisitionBlocked$.next(false);
|
|
783
612
|
|
|
784
613
|
return { voiceIngressResultUrl };
|
|
@@ -859,7 +688,8 @@ export class VoiceService {
|
|
|
859
688
|
private startVolumeLoop(): void {
|
|
860
689
|
const tick = () => {
|
|
861
690
|
if (!this.analyser || !this.dataArray) {
|
|
862
|
-
|
|
691
|
+
requestAnimationFrame(tick);
|
|
692
|
+
return;
|
|
863
693
|
}
|
|
864
694
|
|
|
865
695
|
this.analyser.getByteFrequencyData(
|
|
@@ -875,10 +705,10 @@ export class VoiceService {
|
|
|
875
705
|
|
|
876
706
|
this.volumeSubject.next(volume);
|
|
877
707
|
|
|
878
|
-
|
|
708
|
+
requestAnimationFrame(tick);
|
|
879
709
|
};
|
|
880
710
|
|
|
881
|
-
|
|
711
|
+
tick();
|
|
882
712
|
}
|
|
883
713
|
|
|
884
714
|
/**
|