agentgui 1.0.529 → 1.0.530

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,415 +1,12 @@
1
1
  (function() {
2
2
  var BASE = window.__BASE_URL || '';
3
- var isRecording = false;
4
3
  var ttsEnabled = true;
5
- var voiceActive = false;
6
- var currentConversationId = null;
7
4
  var speechQueue = [];
8
5
  var isSpeaking = false;
9
6
  var currentAudio = null;
10
- var mediaStream = null;
11
- var audioContext = null;
12
- var workletNode = null;
13
- var recordedChunks = [];
14
- var TARGET_SAMPLE_RATE = 16000;
15
- var spokenChunks = new Set();
16
- var renderedSeqs = new Set();
17
- var isLoadingHistory = false;
18
- var _lastVoiceBlockText = null;
19
- var _lastVoiceBlockTime = 0;
20
- var _voiceBreakNext = false;
21
7
  var selectedVoiceId = localStorage.getItem('gmgui-voice-selection') || 'default';
22
8
  var ttsAudioCache = new Map();
23
9
  var TTS_CLIENT_CACHE_MAX = 50;
24
-
25
- function init() {
26
- setupTTSToggle();
27
- setupUI();
28
- setupStreamingListener();
29
- setupAgentSelector();
30
- setupVoiceSelector();
31
- }
32
-
33
- function setupVoiceSelector() {
34
- var selector = document.getElementById('voiceSelector');
35
- if (!selector) return;
36
- var saved = localStorage.getItem('gmgui-voice-selection');
37
- if (saved) selectedVoiceId = saved;
38
- if (window.wsManager) {
39
- window.wsManager.subscribeToVoiceList(function(voices) {
40
- if (!Array.isArray(voices)) return;
41
- selector.innerHTML = '';
42
- var builtIn = voices.filter(function(v) { return !v.isCustom; });
43
- var custom = voices.filter(function(v) { return v.isCustom; });
44
- if (builtIn.length) {
45
- var grp1 = document.createElement('optgroup');
46
- grp1.label = 'Built-in Voices';
47
- builtIn.forEach(function(voice) {
48
- var opt = document.createElement('option');
49
- opt.value = voice.id;
50
- var parts = [];
51
- if (voice.gender) parts.push(voice.gender);
52
- if (voice.accent) parts.push(voice.accent);
53
- opt.textContent = voice.name + (parts.length ? ' (' + parts.join(', ') + ')' : '');
54
- grp1.appendChild(opt);
55
- });
56
- selector.appendChild(grp1);
57
- }
58
- if (custom.length) {
59
- var grp2 = document.createElement('optgroup');
60
- grp2.label = 'Custom Voices';
61
- custom.forEach(function(voice) {
62
- var opt = document.createElement('option');
63
- opt.value = voice.id;
64
- opt.textContent = voice.name;
65
- grp2.appendChild(opt);
66
- });
67
- selector.appendChild(grp2);
68
- }
69
- if (selectedVoiceId && selector.querySelector('option[value="' + selectedVoiceId + '"]')) {
70
- selector.value = selectedVoiceId;
71
- }
72
- });
73
- return;
74
- }
75
- if (window.wsClient) {
76
- window.wsClient.rpc('voices')
77
- .then(function(data) {
78
- if (!data.ok || !Array.isArray(data.voices)) return;
79
- selector.innerHTML = '';
80
- var builtIn = data.voices.filter(function(v) { return !v.isCustom; });
81
- var custom = data.voices.filter(function(v) { return v.isCustom; });
82
- if (builtIn.length) {
83
- var grp1 = document.createElement('optgroup');
84
- grp1.label = 'Built-in Voices';
85
- builtIn.forEach(function(voice) {
86
- var opt = document.createElement('option');
87
- opt.value = voice.id;
88
- var parts = [];
89
- if (voice.gender) parts.push(voice.gender);
90
- if (voice.accent) parts.push(voice.accent);
91
- opt.textContent = voice.name + (parts.length ? ' (' + parts.join(', ') + ')' : '');
92
- grp1.appendChild(opt);
93
- });
94
- selector.appendChild(grp1);
95
- }
96
- if (custom.length) {
97
- var grp2 = document.createElement('optgroup');
98
- grp2.label = 'Custom Voices';
99
- custom.forEach(function(voice) {
100
- var opt = document.createElement('option');
101
- opt.value = voice.id;
102
- opt.textContent = voice.name;
103
- grp2.appendChild(opt);
104
- });
105
- selector.appendChild(grp2);
106
- }
107
- if (selectedVoiceId && selector.querySelector('option[value="' + selectedVoiceId + '"]')) {
108
- selector.value = selectedVoiceId;
109
- }
110
- })
111
- .catch(function(err) { console.error('[Voice] Failed to load voices:', err); });
112
- }
113
- selector.addEventListener('change', function() {
114
- selectedVoiceId = selector.value;
115
- localStorage.setItem('gmgui-voice-selection', selectedVoiceId);
116
- sendVoiceToServer();
117
- });
118
- }
119
-
120
- function syncVoiceSelectorWithRetry(maxRetries) {
121
- maxRetries = maxRetries || 20;
122
- var voiceSelector = document.querySelector('[data-voice-agent-selector]');
123
- var mainSelector = document.querySelector('[data-agent-selector]');
124
- if (!voiceSelector || !mainSelector) return;
125
- if (mainSelector.innerHTML.trim() === '' && maxRetries > 0) {
126
- setTimeout(function() { syncVoiceSelectorWithRetry(maxRetries - 1); }, 250);
127
- return;
128
- }
129
- voiceSelector.innerHTML = mainSelector.innerHTML;
130
- if (mainSelector.value) voiceSelector.value = mainSelector.value;
131
- }
132
-
133
- function syncVoiceCliSelectorWithRetry(maxRetries) {
134
- maxRetries = maxRetries || 20;
135
- var voiceCliSelector = document.querySelector('[data-voice-cli-selector]');
136
- var mainCliSelector = document.querySelector('[data-cli-selector]');
137
- if (!voiceCliSelector || !mainCliSelector) return;
138
- if (mainCliSelector.innerHTML.trim() === '' && maxRetries > 0) {
139
- setTimeout(function() { syncVoiceCliSelectorWithRetry(maxRetries - 1); }, 250);
140
- return;
141
- }
142
- voiceCliSelector.innerHTML = mainCliSelector.innerHTML;
143
- if (mainCliSelector.value) voiceCliSelector.value = mainCliSelector.value;
144
- }
145
-
146
- function syncVoiceModelSelectorWithRetry(maxRetries) {
147
- maxRetries = maxRetries || 20;
148
- var voiceModelSelector = document.querySelector('[data-voice-model-selector]');
149
- var mainModelSelector = document.querySelector('[data-model-selector]');
150
- if (!voiceModelSelector || !mainModelSelector) return;
151
- if (mainModelSelector.innerHTML.trim() === '' && maxRetries > 0) {
152
- setTimeout(function() { syncVoiceModelSelectorWithRetry(maxRetries - 1); }, 250);
153
- return;
154
- }
155
- voiceModelSelector.innerHTML = mainModelSelector.innerHTML;
156
- if (mainModelSelector.value) voiceModelSelector.value = mainModelSelector.value;
157
- }
158
-
159
- function setupAgentSelector() {
160
- var voiceSelector = document.querySelector('[data-voice-agent-selector]');
161
- if (!voiceSelector) return;
162
- var mainSelector = document.querySelector('[data-agent-selector]');
163
- if (mainSelector) {
164
- syncVoiceSelectorWithRetry();
165
- var observer = new MutationObserver(syncVoiceSelectorWithRetry);
166
- observer.observe(mainSelector, { childList: true, subtree: true });
167
- mainSelector.addEventListener('change', function() {
168
- voiceSelector.value = mainSelector.value;
169
- });
170
- voiceSelector.addEventListener('change', function() {
171
- mainSelector.value = voiceSelector.value;
172
- });
173
- }
174
- window.addEventListener('agents-loaded', syncVoiceSelectorWithRetry);
175
-
176
- var mainCliSelector = document.querySelector('[data-cli-selector]');
177
- if (mainCliSelector) {
178
- syncVoiceCliSelectorWithRetry();
179
- var cliObserver = new MutationObserver(syncVoiceCliSelectorWithRetry);
180
- cliObserver.observe(mainCliSelector, { childList: true, subtree: true });
181
- mainCliSelector.addEventListener('change', function() {
182
- var voiceCliSelector = document.querySelector('[data-voice-cli-selector]');
183
- if (voiceCliSelector) voiceCliSelector.value = mainCliSelector.value;
184
- });
185
- var voiceCliSelector = document.querySelector('[data-voice-cli-selector]');
186
- if (voiceCliSelector) {
187
- voiceCliSelector.addEventListener('change', function() {
188
- mainCliSelector.value = voiceCliSelector.value;
189
- });
190
- }
191
- }
192
-
193
- var mainModelSelector = document.querySelector('[data-model-selector]');
194
- if (mainModelSelector) {
195
- syncVoiceModelSelectorWithRetry();
196
- var modelObserver = new MutationObserver(syncVoiceModelSelectorWithRetry);
197
- modelObserver.observe(mainModelSelector, { childList: true, subtree: true });
198
- mainModelSelector.addEventListener('change', function() {
199
- var voiceModelSelector = document.querySelector('[data-voice-model-selector]');
200
- if (voiceModelSelector) voiceModelSelector.value = mainModelSelector.value;
201
- });
202
- var voiceModelSelector = document.querySelector('[data-voice-model-selector]');
203
- if (voiceModelSelector) {
204
- voiceModelSelector.addEventListener('change', function() {
205
- mainModelSelector.value = voiceModelSelector.value;
206
- });
207
- }
208
- }
209
- }
210
-
211
- function setupTTSToggle() {
212
- var toggle = document.getElementById('voiceTTSToggle');
213
- if (toggle) {
214
- var saved = localStorage.getItem('gmgui-auto-speak');
215
- if (saved !== null) {
216
- ttsEnabled = saved === 'true';
217
- toggle.checked = ttsEnabled;
218
- }
219
- toggle.addEventListener('change', function() {
220
- ttsEnabled = toggle.checked;
221
- localStorage.setItem('gmgui-auto-speak', ttsEnabled);
222
- if (!ttsEnabled) stopSpeaking();
223
- });
224
- }
225
- var stopBtn = document.getElementById('voiceStopSpeaking');
226
- if (stopBtn) {
227
- stopBtn.addEventListener('click', stopSpeaking);
228
- }
229
- }
230
-
231
- function setupUI() {
232
- var micBtn = document.getElementById('voiceMicBtn');
233
- if (micBtn) {
234
- micBtn.removeAttribute('disabled');
235
- micBtn.title = 'Hold to record';
236
- micBtn.addEventListener('mousedown', function(e) {
237
- e.preventDefault();
238
- startRecording();
239
- });
240
- micBtn.addEventListener('mouseup', function(e) {
241
- e.preventDefault();
242
- stopRecording();
243
- });
244
- micBtn.addEventListener('mouseleave', function(e) {
245
- if (isRecording) stopRecording();
246
- });
247
- micBtn.addEventListener('touchstart', function(e) {
248
- e.preventDefault();
249
- startRecording();
250
- });
251
- micBtn.addEventListener('touchend', function(e) {
252
- e.preventDefault();
253
- stopRecording();
254
- });
255
- micBtn.addEventListener('touchcancel', function(e) {
256
- if (isRecording) stopRecording();
257
- });
258
- }
259
- var sendBtn = document.getElementById('voiceSendBtn');
260
- if (sendBtn) {
261
- sendBtn.addEventListener('click', sendVoiceMessage);
262
- }
263
- var transcript = document.getElementById('voiceTranscript');
264
- if (transcript) {
265
- transcript.addEventListener('keydown', function(e) {
266
- if (e.ctrlKey && e.key === 'Enter' || e.metaKey && e.key === 'Enter') {
267
- e.preventDefault();
268
- sendVoiceMessage();
269
- }
270
- });
271
- }
272
- }
273
-
274
-
275
- async function startRecording() {
276
- if (isRecording) return;
277
- var el = document.getElementById('voiceTranscript');
278
- if (el) {
279
- if (el.value !== undefined) {
280
- el.value = '';
281
- } else {
282
- el.textContent = '';
283
- el.setAttribute('data-final', '');
284
- }
285
- }
286
- var result = await window.STTHandler.startRecording();
287
- if (result.success) {
288
- isRecording = true;
289
- var micBtn = document.getElementById('voiceMicBtn');
290
- if (micBtn) micBtn.classList.add('recording');
291
- } else {
292
- if (el) el.textContent = 'Mic access denied: ' + result.error;
293
- }
294
- }
295
-
296
- async function stopRecording() {
297
- if (!isRecording) return;
298
- isRecording = false;
299
- var micBtn = document.getElementById('voiceMicBtn');
300
- if (micBtn) micBtn.classList.remove('recording');
301
- var el = document.getElementById('voiceTranscript');
302
-
303
- if (el) {
304
- if (el.value !== undefined) {
305
- el.value = 'Transcribing...';
306
- } else {
307
- el.textContent = 'Transcribing...';
308
- }
309
- }
310
-
311
- var result = await window.STTHandler.stopRecording();
312
- if (result.success) {
313
- if (el) {
314
- if (el.value !== undefined) {
315
- el.value = result.text;
316
- } else {
317
- el.textContent = result.text;
318
- el.setAttribute('data-final', result.text);
319
- }
320
- }
321
- } else {
322
- if (el) {
323
- if (el.value !== undefined) {
324
- el.value = 'Error: ' + result.error;
325
- } else {
326
- el.textContent = 'Error: ' + result.error;
327
- }
328
- }
329
- }
330
- }
331
-
332
- function sendVoiceMessage() {
333
- var el = document.getElementById('voiceTranscript');
334
- if (!el) return;
335
- var text = (el.value || el.textContent || '').trim();
336
- if (!text || text.startsWith('Transcribing') || text.startsWith('Error')) return;
337
- addVoiceBlock(text, true);
338
- if (el.value !== undefined) {
339
- el.value = '';
340
- } else {
341
- el.textContent = '';
342
- el.setAttribute('data-final', '');
343
- }
344
- if (typeof agentGUIClient !== 'undefined' && agentGUIClient) {
345
- var input = agentGUIClient.ui.messageInput;
346
- if (input) {
347
- input.value = text;
348
- agentGUIClient.startExecution();
349
- }
350
- }
351
- }
352
-
353
- function speak(text) {
354
- if (!ttsEnabled) return;
355
- speakDirect(text);
356
- }
357
-
358
- function speakDirect(text) {
359
- var clean = text.replace(/<[^>]*>/g, '').trim();
360
- if (!clean) return;
361
- var parts = [];
362
- if (typeof agentGUIClient !== 'undefined' && agentGUIClient && typeof agentGUIClient.parseMarkdownCodeBlocks === 'function') {
363
- parts = agentGUIClient.parseMarkdownCodeBlocks(clean);
364
- } else {
365
- parts = [{ type: 'text', content: clean }];
366
- }
367
- parts.forEach(function(part) {
368
- if (part.type === 'code') return;
369
- var segment = part.content.trim();
370
- if (segment) {
371
- speechQueue.push(segment);
372
- }
373
- });
374
- processQueue();
375
- }
376
-
377
- function cacheTTSAudio(cacheKey, b64) {
378
- if (ttsAudioCache.size >= TTS_CLIENT_CACHE_MAX) {
379
- var oldest = ttsAudioCache.keys().next().value;
380
- ttsAudioCache.delete(oldest);
381
- }
382
- var binary = atob(b64);
383
- var bytes = new Uint8Array(binary.length);
384
- for (var i = 0; i < binary.length; i++) bytes[i] = binary.charCodeAt(i);
385
- ttsAudioCache.set(cacheKey, new Blob([bytes], { type: 'audio/wav' }));
386
- }
387
-
388
- function getCachedTTSBlob(text) {
389
- var key = selectedVoiceId + ':' + text;
390
- return ttsAudioCache.get(key) || null;
391
- }
392
-
393
- function splitSentences(text) {
394
- if (!text) return [text];
395
- var raw = text.match(/[^.!?]+[.!?]+[\s]?|[^.!?]+$/g);
396
- if (!raw) return [text];
397
- var sentences = raw.map(function(s) { return s.trim(); }).filter(function(s) { return s.length > 0; });
398
- var result = [];
399
- for (var i = 0; i < sentences.length; i++) {
400
- var s = sentences[i];
401
- if (result.length > 0) {
402
- var prev = result[result.length - 1];
403
- if (s.match(/^(\d+[\.\)]|\d+\s)/) || prev.match(/\d+[\.\)]$/)) {
404
- result[result.length - 1] = prev + ' ' + s;
405
- continue;
406
- }
407
- }
408
- result.push(s);
409
- }
410
- return result;
411
- }
412
-
413
10
  var audioChunkQueue = [];
414
11
  var isPlayingChunk = false;
415
12
  var streamDone = false;
@@ -417,577 +14,107 @@
417
14
  var TTS_MAX_FAILURES = 3;
418
15
  var ttsDisabledUntilReset = false;
419
16
  var streamingSupported = true;
420
- var streamingFailedAt = 0;
421
17
 
422
- var pendingVoiceUpdates = [];
423
- var MAX_PENDING_UPDATES = 100;
18
+ window.addEventListener('ws-message', function(e) {
19
+ var data = e.detail;
20
+ if (!data) return;
21
+ if (data.type === 'tts_audio' && data.audio && data.voiceId === selectedVoiceId) cacheTTSAudio(data.cacheKey, data.audio);
22
+ if (data.type === 'sync_connected') sendVoiceToServer();
23
+ });
424
24
 
425
- function optimizePromptForSpeech(text) {
426
- var optimizationInstructions = ' [Optimize for speech: Keep it short. Use simple words. Use short sentences. Focus on clarity.]';
427
- return text + optimizationInstructions;
25
+ function cacheTTSAudio(cacheKey, b64) {
26
+ if (ttsAudioCache.size >= TTS_CLIENT_CACHE_MAX) ttsAudioCache.delete(ttsAudioCache.keys().next().value);
27
+ var binary = atob(b64), bytes = new Uint8Array(binary.length);
28
+ for (var i = 0; i < binary.length; i++) bytes[i] = binary.charCodeAt(i);
29
+ ttsAudioCache.set(cacheKey, new Blob([bytes], { type: 'audio/wav' }));
30
+ }
31
+
32
+ function speakDirect(text) {
33
+ var clean = text.replace(/<[^>]*>/g, '').trim();
34
+ if (!clean) return;
35
+ var parts = (typeof agentGUIClient !== 'undefined' && agentGUIClient && typeof agentGUIClient.parseMarkdownCodeBlocks === 'function')
36
+ ? agentGUIClient.parseMarkdownCodeBlocks(clean) : [{ type: 'text', content: clean }];
37
+ parts.forEach(function(p) { if (p.type !== 'code' && p.content.trim()) speechQueue.push(p.content.trim()); });
38
+ processQueue();
428
39
  }
429
40
 
430
41
  function playNextChunk() {
431
- if (audioChunkQueue.length === 0) {
42
+ if (!audioChunkQueue.length) {
432
43
  isPlayingChunk = false;
433
- if (streamDone) {
434
- isSpeaking = false;
435
- processQueue();
436
- }
44
+ if (streamDone) { isSpeaking = false; processQueue(); }
437
45
  return;
438
46
  }
439
47
  isPlayingChunk = true;
440
- var blob = audioChunkQueue.shift();
441
- var url = URL.createObjectURL(blob);
48
+ var blob = audioChunkQueue.shift(), url = URL.createObjectURL(blob);
442
49
  currentAudio = new Audio(url);
443
- currentAudio.onended = function() {
444
- URL.revokeObjectURL(url);
445
- currentAudio = null;
446
- playNextChunk();
447
- };
448
- currentAudio.onerror = function() {
449
- URL.revokeObjectURL(url);
450
- currentAudio = null;
451
- playNextChunk();
452
- };
453
- currentAudio.play().catch(function() {
454
- URL.revokeObjectURL(url);
455
- currentAudio = null;
456
- playNextChunk();
457
- });
458
- }
459
-
460
- function preGenerateTTS(text) {
461
- if (!ttsEnabled) return;
462
- var clean = text.replace(/<[^>]*>/g, '').trim();
463
- if (!clean) return;
464
- var parts = [];
465
- if (typeof agentGUIClient !== 'undefined' && agentGUIClient && typeof agentGUIClient.parseMarkdownCodeBlocks === 'function') {
466
- parts = agentGUIClient.parseMarkdownCodeBlocks(clean);
467
- } else {
468
- parts = [{ type: 'text', content: clean }];
469
- }
470
- parts.forEach(function(part) {
471
- if (part.type === 'code') return;
472
- var segment = part.content.trim();
473
- if (!segment) return;
474
- var cacheKey = selectedVoiceId + ':' + segment;
475
- if (ttsAudioCache.has(cacheKey)) return;
476
- var optimizedText = optimizePromptForSpeech(segment);
477
- fetch(BASE + '/api/tts', {
478
- method: 'POST',
479
- headers: { 'Content-Type': 'application/json' },
480
- body: JSON.stringify({ text: optimizedText, voiceId: selectedVoiceId })
481
- }).then(function(resp) {
482
- if (!resp.ok) throw new Error('TTS pre-generation failed: ' + resp.status);
483
- return resp.arrayBuffer();
484
- }).then(function(buf) {
485
- var blob = new Blob([buf], { type: 'audio/wav' });
486
- if (ttsAudioCache.size >= TTS_CLIENT_CACHE_MAX) {
487
- var oldest = ttsAudioCache.keys().next().value;
488
- ttsAudioCache.delete(oldest);
489
- }
490
- ttsAudioCache.set(cacheKey, blob);
491
- }).catch(function(err) {
492
- console.warn('[Voice] TTS pre-generation failed:', err);
493
- });
494
- });
50
+ var next = function() { URL.revokeObjectURL(url); currentAudio = null; playNextChunk(); };
51
+ currentAudio.onended = next; currentAudio.onerror = next;
52
+ currentAudio.play().catch(next);
495
53
  }
496
54
 
497
55
  function processQueue() {
498
- if (isSpeaking || speechQueue.length === 0) return;
499
- if (ttsDisabledUntilReset) {
500
- speechQueue = [];
501
- return;
502
- }
503
- isSpeaking = true;
504
- streamDone = false;
56
+ if (isSpeaking || !speechQueue.length) return;
57
+ if (ttsDisabledUntilReset) { speechQueue = []; return; }
58
+ isSpeaking = true; streamDone = false;
505
59
  var text = speechQueue.shift();
506
- audioChunkQueue = [];
507
- isPlayingChunk = false;
508
-
509
- var cachedBlob = getCachedTTSBlob(text);
510
- if (cachedBlob) {
511
- ttsConsecutiveFailures = 0;
512
- audioChunkQueue.push(cachedBlob);
513
- streamDone = true;
514
- if (!isPlayingChunk) playNextChunk();
515
- return;
516
- }
517
-
518
- var sentences = [text];
519
- var cachedSentences = [];
520
- var uncachedText = [text];
521
-
522
- if (cachedSentences.length === sentences.length) {
523
- ttsConsecutiveFailures = 0;
524
- for (var j = 0; j < cachedSentences.length; j++) {
525
- audioChunkQueue.push(cachedSentences[j].blob);
526
- }
527
- streamDone = true;
528
- if (!isPlayingChunk) playNextChunk();
529
- return;
530
- }
531
-
532
- if (cachedSentences.length > 0) {
533
- ttsConsecutiveFailures = 0;
534
- for (var k = 0; k < cachedSentences.length; k++) {
535
- audioChunkQueue.push(cachedSentences[k].blob);
536
- }
537
- if (!isPlayingChunk) playNextChunk();
538
- }
539
-
540
- var remainingText = uncachedText.join(' ');
541
- var optimizedText = optimizePromptForSpeech(remainingText);
542
-
543
- function onTtsSuccess() {
544
- ttsConsecutiveFailures = 0;
545
- }
546
-
547
- function onTtsFailed() {
548
- ttsConsecutiveFailures++;
549
- if (ttsConsecutiveFailures >= TTS_MAX_FAILURES) {
550
- console.warn('[Voice] TTS failed ' + ttsConsecutiveFailures + ' times consecutively, disabling until reset');
551
- ttsDisabledUntilReset = true;
552
- speechQueue = [];
553
- }
554
- streamDone = true;
555
- isSpeaking = false;
556
- if (!ttsDisabledUntilReset) {
557
- processQueue();
558
- }
559
- }
560
-
561
- function tryStreaming() {
562
- if (!streamingSupported) { tryNonStreaming(optimizedText); return; }
563
- fetch(BASE + '/api/tts-stream', {
564
- method: 'POST',
565
- headers: { 'Content-Type': 'application/json' },
566
- body: JSON.stringify({ text: optimizedText, voiceId: selectedVoiceId })
567
- }).then(function(resp) {
568
- if (!resp.ok) {
569
- streamingSupported = false;
570
- streamingFailedAt = Date.now();
571
- throw new Error('TTS stream failed: ' + resp.status);
572
- }
573
- var reader = resp.body.getReader();
574
- var buffer = new Uint8Array(0);
575
-
576
- function concat(a, b) {
577
- var c = new Uint8Array(a.length + b.length);
578
- c.set(a, 0);
579
- c.set(b, a.length);
580
- return c;
581
- }
582
-
583
- function pump() {
584
- return reader.read().then(function(result) {
585
- if (result.done) {
586
- onTtsSuccess();
587
- streamDone = true;
588
- if (!isPlayingChunk && audioChunkQueue.length === 0) {
589
- isSpeaking = false;
590
- processQueue();
591
- }
592
- return;
593
- }
594
- buffer = concat(buffer, result.value);
595
- while (buffer.length >= 4) {
596
- var view = new DataView(buffer.buffer, buffer.byteOffset, 4);
597
- var chunkLen = view.getUint32(0, false);
598
- if (buffer.length < 4 + chunkLen) break;
599
- var wavData = buffer.slice(4, 4 + chunkLen);
600
- buffer = buffer.slice(4 + chunkLen);
601
- var blob = new Blob([wavData], { type: 'audio/wav' });
602
- audioChunkQueue.push(blob);
60
+ audioChunkQueue = []; isPlayingChunk = false;
61
+ var cached = ttsAudioCache.get(selectedVoiceId + ':' + text);
62
+ if (cached) { ttsConsecutiveFailures = 0; audioChunkQueue.push(cached); streamDone = true; if (!isPlayingChunk) playNextChunk(); return; }
63
+ var opt = text + ' [Optimize for speech: Keep it short. Use simple words. Use short sentences. Focus on clarity.]';
64
+ function ok() { ttsConsecutiveFailures = 0; }
65
+ function fail() {
66
+ if (++ttsConsecutiveFailures >= TTS_MAX_FAILURES) { ttsDisabledUntilReset = true; speechQueue = []; }
67
+ streamDone = true; isSpeaking = false;
68
+ if (!ttsDisabledUntilReset) processQueue();
69
+ }
70
+ function stream() {
71
+ if (!streamingSupported) { nonStream(opt); return; }
72
+ fetch(BASE + '/api/tts-stream', { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ text: opt, voiceId: selectedVoiceId }) })
73
+ .then(function(r) {
74
+ if (!r.ok) { streamingSupported = false; throw 0; }
75
+ var reader = r.body.getReader(), buf = new Uint8Array(0);
76
+ function cat(a, b) { var c = new Uint8Array(a.length + b.length); c.set(a); c.set(b, a.length); return c; }
77
+ function pump() { return reader.read().then(function(res) {
78
+ if (res.done) { ok(); streamDone = true; if (!isPlayingChunk && !audioChunkQueue.length) { isSpeaking = false; processQueue(); } return; }
79
+ buf = cat(buf, res.value);
80
+ while (buf.length >= 4) {
81
+ var len = new DataView(buf.buffer, buf.byteOffset, 4).getUint32(0, false);
82
+ if (buf.length < 4 + len) break;
83
+ audioChunkQueue.push(new Blob([buf.slice(4, 4 + len)], { type: 'audio/wav' }));
84
+ buf = buf.slice(4 + len);
603
85
  if (!isPlayingChunk) playNextChunk();
604
86
  }
605
87
  return pump();
606
- });
607
- }
608
-
609
- return pump();
610
- }).catch(function() {
611
- tryNonStreaming(remainingText);
612
- });
88
+ }); }
89
+ return pump();
90
+ }).catch(function() { nonStream(text); });
613
91
  }
614
-
615
- function tryNonStreaming(txt) {
616
- fetch(BASE + '/api/tts', {
617
- method: 'POST',
618
- headers: { 'Content-Type': 'application/json' },
619
- body: JSON.stringify({ text: txt, voiceId: selectedVoiceId })
620
- }).then(function(resp) {
621
- if (!resp.ok) throw new Error('TTS failed: ' + resp.status);
622
- return resp.arrayBuffer();
623
- }).then(function(buf) {
624
- onTtsSuccess();
625
- var blob = new Blob([buf], { type: 'audio/wav' });
626
- audioChunkQueue.push(blob);
627
- streamDone = true;
628
- if (!isPlayingChunk) playNextChunk();
629
- }).catch(function() {
630
- onTtsFailed();
631
- });
92
+ function nonStream(txt) {
93
+ fetch(BASE + '/api/tts', { method: 'POST', headers: { 'Content-Type': 'application/json' }, body: JSON.stringify({ text: txt, voiceId: selectedVoiceId }) })
94
+ .then(function(r) { if (!r.ok) throw 0; return r.arrayBuffer(); })
95
+ .then(function(b) { ok(); audioChunkQueue.push(new Blob([b], { type: 'audio/wav' })); streamDone = true; if (!isPlayingChunk) playNextChunk(); })
96
+ .catch(fail);
632
97
  }
633
-
634
- tryStreaming();
98
+ stream();
635
99
  }
636
100
 
637
101
  function stopSpeaking() {
638
- speechQueue = [];
639
- audioChunkQueue = [];
640
- isPlayingChunk = false;
641
- isSpeaking = false;
642
- ttsConsecutiveFailures = 0;
643
- ttsDisabledUntilReset = false;
644
- if (currentAudio) {
645
- currentAudio.pause();
646
- currentAudio = null;
647
- }
648
- }
649
-
650
- function stripHtml(text) {
651
- return text.replace(/<[^>]*>/g, '').replace(/[ \t]+/g, ' ').trim();
652
- }
653
-
654
- function addVoiceBlock(text, isUser) {
655
- var container = document.getElementById('voiceMessages');
656
- if (!container) return;
657
- var emptyMsg = container.querySelector('.voice-empty');
658
- if (emptyMsg) emptyMsg.remove();
659
- var lastChild = container.lastElementChild;
660
- if (!isUser && !_voiceBreakNext && !isLoadingHistory && lastChild && lastChild.classList.contains('voice-block') && !lastChild.classList.contains('voice-block-user')) {
661
- var contentSpan = lastChild.querySelector('.voice-block-content');
662
- if (contentSpan) {
663
- contentSpan.textContent += '\n' + stripHtml(text);
664
- lastChild._fullText = (lastChild._fullText || contentSpan.textContent) + '\n' + text;
665
- scrollVoiceToBottom();
666
- return lastChild;
667
- }
668
- }
669
- _voiceBreakNext = false;
670
- var div = document.createElement('div');
671
- div.className = 'voice-block' + (isUser ? ' voice-block-user' : '');
672
- if (isUser) {
673
- div.textContent = text;
674
- } else {
675
- var contentSpan = document.createElement('span');
676
- contentSpan.className = 'voice-block-content';
677
- contentSpan.textContent = stripHtml(text);
678
- div.appendChild(contentSpan);
679
- div._fullText = text;
680
- var rereadBtn = document.createElement('button');
681
- rereadBtn.className = 'voice-reread-btn';
682
- rereadBtn.title = 'Re-read aloud';
683
- rereadBtn.innerHTML = '<svg viewBox="0 0 24 24" width="16" height="16" fill="none" stroke="currentColor" stroke-width="2"><polygon points="11 5 6 9 2 9 2 15 6 15 11 19 11 5"/><path d="M19.07 4.93a10 10 0 0 1 0 14.14M15.54 8.46a5 5 0 0 1 0 7.07"/></svg>';
684
- rereadBtn.addEventListener('click', function() {
685
- speak(div._fullText || contentSpan.textContent);
686
- });
687
- div.appendChild(rereadBtn);
688
- }
689
- container.appendChild(div);
690
- scrollVoiceToBottom();
691
- return div;
692
- }
693
-
694
- function addVoiceResultBlock(block, autoSpeak) {
695
- var container = document.getElementById('voiceMessages');
696
- if (!container) return;
697
- var emptyMsg = container.querySelector('.voice-empty');
698
- if (emptyMsg) emptyMsg.remove();
699
- var div = document.createElement('div');
700
- div.className = 'voice-block';
701
- var isError = block.is_error || false;
702
- var duration = block.duration_ms ? (block.duration_ms / 1000).toFixed(1) + 's' : '';
703
- var cost = block.total_cost_usd ? '$' + block.total_cost_usd.toFixed(4) : '';
704
- var resultText = '';
705
- if (block.result) {
706
- resultText = typeof block.result === 'string' ? block.result : JSON.stringify(block.result);
707
- }
708
- var displayText = stripHtml(resultText);
709
- var html = '';
710
- if (displayText) {
711
- html += '<div>' + escapeHtml(displayText) + '</div>';
712
- }
713
- if (duration || cost) {
714
- html += '<div class="voice-result-stats">';
715
- if (duration) html += duration;
716
- if (duration && cost) html += ' | ';
717
- if (cost) html += cost;
718
- html += '</div>';
719
- }
720
- if (!html) {
721
- html = isError ? 'Execution failed' : 'Execution complete';
722
- }
723
- div.innerHTML = html;
724
- if (resultText) {
725
- var rereadBtn = document.createElement('button');
726
- rereadBtn.className = 'voice-reread-btn';
727
- rereadBtn.title = 'Re-read aloud';
728
- rereadBtn.innerHTML = '<svg viewBox="0 0 24 24" width="16" height="16" fill="none" stroke="currentColor" stroke-width="2"><polygon points="11 5 6 9 2 9 2 15 6 15 11 19 11 5"/><path d="M19.07 4.93a10 10 0 0 1 0 14.14M15.54 8.46a5 5 0 0 1 0 7.07"/></svg>';
729
- rereadBtn.addEventListener('click', function() {
730
- speak(resultText);
731
- });
732
- div.appendChild(rereadBtn);
733
- }
734
- container.appendChild(div);
735
- scrollVoiceToBottom();
736
- if (autoSpeak && ttsEnabled && resultText) {
737
- speak(resultText);
738
- }
739
- return div;
740
- }
741
-
742
- function scrollVoiceToBottom() {
743
- var scroll = document.getElementById('voiceScroll');
744
- if (scroll) {
745
- requestAnimationFrame(function() {
746
- scroll.scrollTop = scroll.scrollHeight;
747
- });
748
- }
102
+ speechQueue = []; audioChunkQueue = []; isPlayingChunk = false; isSpeaking = false;
103
+ ttsConsecutiveFailures = 0; ttsDisabledUntilReset = false;
104
+ if (currentAudio) { currentAudio.pause(); currentAudio = null; }
749
105
  }
750
106
 
751
107
  function sendVoiceToServer() {
752
- if (typeof agentGUIClient !== 'undefined' && agentGUIClient && agentGUIClient.wsManager && agentGUIClient.wsManager.isConnected) {
108
+ if (typeof agentGUIClient !== 'undefined' && agentGUIClient && agentGUIClient.wsManager && agentGUIClient.wsManager.isConnected)
753
109
  agentGUIClient.wsManager.sendMessage({ type: 'set_voice', voiceId: selectedVoiceId });
754
- }
755
- }
756
-
757
- function setupStreamingListener() {
758
- window.addEventListener('ws-message', function(e) {
759
- var data = e.detail;
760
- if (!data) return;
761
- if (data.type === 'tts_audio' && data.audio && data.voiceId === selectedVoiceId) {
762
- cacheTTSAudio(data.cacheKey, data.audio);
763
- }
764
- if (data.type === 'sync_connected') {
765
- sendVoiceToServer();
766
- }
767
- if (data.type === 'streaming_progress' || data.type === 'message_created' || data.type === 'streaming_start') {
768
- if (data.conversationId && data.conversationId !== currentConversationId) return;
769
- if (!voiceActive) {
770
- pendingVoiceUpdates.push(data);
771
- if (pendingVoiceUpdates.length > MAX_PENDING_UPDATES) {
772
- pendingVoiceUpdates.shift();
773
- }
774
- return;
775
- }
776
- }
777
- if (!voiceActive) return;
778
- if (data.type === 'streaming_progress' && data.block) {
779
- if (data.seq !== undefined && renderedSeqs.has(data.seq)) return;
780
- if (data.seq !== undefined) renderedSeqs.add(data.seq);
781
- handleVoiceBlock(data.block, true, data.blockRole);
782
- }
783
- if (data.type === 'message_created' && data.message) {
784
- var message = data.message;
785
- if (message.role === 'user' && message.content) {
786
- handleVoiceBlock({ type: 'text', text: message.content }, true, 'user');
787
- }
788
- }
789
- if (data.type === 'streaming_start') {
790
- spokenChunks = new Set();
791
- renderedSeqs = new Set();
792
- _voiceBreakNext = false;
793
- }
794
- });
795
- window.addEventListener('conversation-selected', function(e) {
796
- var newConversationId = e.detail.conversationId;
797
- if (currentConversationId && currentConversationId !== newConversationId) {
798
- unsubscribeFromConversation();
799
- pendingVoiceUpdates = [];
800
- }
801
- currentConversationId = newConversationId;
802
- stopSpeaking();
803
- spokenChunks = new Set();
804
- renderedSeqs = new Set();
805
- if (voiceActive) {
806
- subscribeToConversation(currentConversationId);
807
- loadVoiceBlocks(currentConversationId);
808
- processPendingUpdates();
809
- }
810
- });
811
- }
812
-
813
- function handleVoiceBlock(block, isNew, blockRole) {
814
- if (!block || !block.type) return;
815
- if (block.type === 'text' && block.text) {
816
- var now = Date.now();
817
- if (_lastVoiceBlockText === block.text && (now - _lastVoiceBlockTime) < 500) {
818
- return;
819
- }
820
- _lastVoiceBlockText = block.text;
821
- _lastVoiceBlockTime = now;
822
-
823
- var isUser = blockRole === 'user' || blockRole === 'tool_result';
824
- var div = addVoiceBlock(block.text, isUser);
825
- if (div && isNew && ttsEnabled && blockRole === 'assistant') {
826
- div.classList.add('speaking');
827
- preGenerateTTS(block.text);
828
- speak(block.text);
829
- setTimeout(function() { div.classList.remove('speaking'); }, 2000);
830
- }
831
- } else if (block.type === 'result') {
832
- _voiceBreakNext = true;
833
- }
834
- }
835
-
836
- function loadVoiceBlocks(conversationId) {
837
- var container = document.getElementById('voiceMessages');
838
- if (!container) return;
839
- container.innerHTML = '';
840
- _lastVoiceBlockText = null;
841
- _lastVoiceBlockTime = 0;
842
- _voiceBreakNext = false;
843
- if (!conversationId) {
844
- showVoiceEmpty(container);
845
- unsubscribeFromConversation();
846
- return;
847
- }
848
- isLoadingHistory = true;
849
- subscribeToConversation(conversationId);
850
- if (window.wsClient) {
851
- window.wsClient.rpc('conv.chunks', { id: conversationId })
852
- .then(function(data) {
853
- if (!data.ok || !Array.isArray(data.chunks) || data.chunks.length === 0) {
854
- isLoadingHistory = false;
855
- showVoiceEmpty(container);
856
- return;
857
- }
858
- var hasContent = false;
859
- _voiceBreakNext = false;
860
- data.chunks.forEach(function(chunk) {
861
- if (chunk.sequence !== undefined) renderedSeqs.add(chunk.sequence);
862
- var block = typeof chunk.data === 'string' ? JSON.parse(chunk.data) : chunk.data;
863
- if (!block) return;
864
- if (block.type === 'text' && block.text) {
865
- var isUser = chunk.type === 'user';
866
- addVoiceBlock(block.text, isUser);
867
- hasContent = true;
868
- } else if (block.type === 'result') {
869
- _voiceBreakNext = true;
870
- }
871
- });
872
- if (!hasContent) showVoiceEmpty(container);
873
- isLoadingHistory = false;
874
- })
875
- .catch(function() {
876
- isLoadingHistory = false;
877
- showVoiceEmpty(container);
878
- });
879
- } else {
880
- isLoadingHistory = false;
881
- showVoiceEmpty(container);
882
- }
883
- }
884
-
885
- function subscribeToConversation(conversationId) {
886
- if (!conversationId || typeof agentGUIClient === 'undefined' || !agentGUIClient || !agentGUIClient.wsManager) {
887
- return;
888
- }
889
- agentGUIClient.wsManager.sendMessage({ type: 'subscribe', conversationId: conversationId, timestamp: Date.now() });
890
- }
891
-
892
- function unsubscribeFromConversation() {
893
- if (typeof agentGUIClient === 'undefined' || !agentGUIClient || !agentGUIClient.wsManager || !currentConversationId) {
894
- return;
895
- }
896
- agentGUIClient.wsManager.sendMessage({ type: 'unsubscribe', conversationId: currentConversationId, timestamp: Date.now() });
897
- }
898
-
899
- function showVoiceEmpty(container) {
900
- container.innerHTML = '<div class="voice-empty"><div class="voice-empty-icon"><svg viewBox="0 0 24 24" width="64" height="64" fill="none" stroke="currentColor" stroke-width="1.5" stroke-linecap="round" stroke-linejoin="round"><path d="M12 1a3 3 0 0 0-3 3v8a3 3 0 0 0 6 0V4a3 3 0 0 0-3-3z"/><path d="M19 10v2a7 7 0 0 1-14 0v-2"/><line x1="12" y1="19" x2="12" y2="23"/><line x1="8" y1="23" x2="16" y2="23"/></svg></div><div>Hold the microphone button to record.<br>Release to transcribe. Tap Send to submit.<br>New responses will be read aloud.</div></div>';
901
- }
902
-
903
- function activate() {
904
- voiceActive = true;
905
- if (currentConversationId) {
906
- subscribeToConversation(currentConversationId);
907
- loadVoiceBlocks(currentConversationId);
908
- processPendingUpdates();
909
- } else {
910
- var container = document.getElementById('voiceMessages');
911
- if (container && !container.hasChildNodes()) {
912
- showVoiceEmpty(container);
913
- }
914
- }
915
- }
916
-
917
- function processPendingUpdates() {
918
- if (!voiceActive) return;
919
- var updates = pendingVoiceUpdates.splice(0, pendingVoiceUpdates.length);
920
- for (var i = 0; i < updates.length; i++) {
921
- var data = updates[i];
922
- if (data.type === 'streaming_progress' && data.block) {
923
- if (data.seq !== undefined && renderedSeqs.has(data.seq)) continue;
924
- if (data.seq !== undefined) renderedSeqs.add(data.seq);
925
- handleVoiceBlock(data.block, true, data.blockRole);
926
- }
927
- if (data.type === 'message_created' && data.message) {
928
- var message = data.message;
929
- if (message.role === 'user' && message.content) {
930
- handleVoiceBlock({ type: 'text', text: message.content }, true, 'user');
931
- }
932
- }
933
- if (data.type === 'streaming_start') {
934
- spokenChunks = new Set();
935
- renderedSeqs = new Set();
936
- _voiceBreakNext = false;
937
- }
938
- }
939
- }
940
-
941
- function deactivate() {
942
- voiceActive = false;
943
- stopSpeaking();
944
- unsubscribeFromConversation();
945
- pendingVoiceUpdates = [];
946
- }
947
-
948
- function escapeHtml(text) {
949
- var map = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
950
- return text.replace(/[&<>"']/g, function(c) { return map[c]; });
951
- }
952
-
953
- function getAutoSpeak() {
954
- return ttsEnabled;
955
- }
956
-
957
- function setAutoSpeak(value) {
958
- ttsEnabled = Boolean(value);
959
- localStorage.setItem('gmgui-auto-speak', ttsEnabled);
960
- var toggle = document.getElementById('voiceTTSToggle');
961
- if (toggle) toggle.checked = ttsEnabled;
962
- if (!ttsEnabled) stopSpeaking();
963
- }
964
-
965
- function getVoice() {
966
- return selectedVoiceId;
967
- }
968
-
969
- function setVoice(voiceId) {
970
- selectedVoiceId = String(voiceId);
971
- localStorage.setItem('gmgui-voice-selection', selectedVoiceId);
972
- var selector = document.getElementById('voiceSelector');
973
- if (selector) selector.value = selectedVoiceId;
974
- sendVoiceToServer();
975
110
  }
976
111
 
977
112
  window.voiceModule = {
978
- activate: activate,
979
- deactivate: deactivate,
980
- handleBlock: handleVoiceBlock,
981
- getAutoSpeak: getAutoSpeak,
982
- setAutoSpeak: setAutoSpeak,
983
- getVoice: getVoice,
984
- setVoice: setVoice,
985
- speakText: speakDirect
113
+ getAutoSpeak: function() { return ttsEnabled; },
114
+ setAutoSpeak: function(v) { ttsEnabled = Boolean(v); localStorage.setItem('gmgui-auto-speak', ttsEnabled); if (!ttsEnabled) stopSpeaking(); },
115
+ getVoice: function() { return selectedVoiceId; },
116
+ setVoice: function(id) { selectedVoiceId = String(id); localStorage.setItem('gmgui-voice-selection', selectedVoiceId); sendVoiceToServer(); },
117
+ speakText: speakDirect,
118
+ stopSpeaking: stopSpeaking
986
119
  };
987
-
988
- if (document.readyState === 'loading') {
989
- document.addEventListener('DOMContentLoaded', init);
990
- } else {
991
- init();
992
- }
993
120
  })();