agentgui 1.0.143 → 1.0.145

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/database.js CHANGED
@@ -510,6 +510,13 @@ export const queries = {
510
510
  return stmt.all(conversationId, status);
511
511
  },
512
512
 
513
+ getActiveSessions() {
514
+ const stmt = db.prepare(
515
+ "SELECT * FROM sessions WHERE status IN ('active', 'pending') ORDER BY started_at DESC"
516
+ );
517
+ return stmt.all();
518
+ },
519
+
513
520
  createEvent(type, data, conversationId = null, sessionId = null) {
514
521
  const id = generateId('evt');
515
522
  const now = Date.now();
@@ -52,6 +52,10 @@ class AgentRunner {
52
52
  const args = this.buildArgs(prompt, config);
53
53
  const proc = spawn(this.command, args, { cwd });
54
54
 
55
+ if (config.onPid) {
56
+ try { config.onPid(proc.pid); } catch (e) {}
57
+ }
58
+
55
59
  let jsonBuffer = '';
56
60
  const outputs = [];
57
61
  let timedOut = false;
@@ -150,6 +154,10 @@ class AgentRunner {
150
154
 
151
155
  const proc = spawn(cmd, args, { cwd });
152
156
 
157
+ if (config.onPid) {
158
+ try { config.onPid(proc.pid); } catch (e) {}
159
+ }
160
+
153
161
  const outputs = [];
154
162
  let timedOut = false;
155
163
  let sessionId = null;
package/lib/speech.js ADDED
@@ -0,0 +1,192 @@
1
+ import { pipeline, env } from '@huggingface/transformers';
2
+ import { createRequire } from 'module';
3
+ import fs from 'fs';
4
+ import path from 'path';
5
+ import { fileURLToPath } from 'url';
6
+
7
+ const require = createRequire(import.meta.url);
8
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
9
+ const ROOT = path.dirname(__dirname);
10
+ const DATA_DIR = path.join(ROOT, 'data');
11
+
12
+ const SPEAKER_EMBEDDINGS_URL = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin';
13
+ const SPEAKER_EMBEDDINGS_PATH = path.join(DATA_DIR, 'speaker_embeddings.bin');
14
+ const SAMPLE_RATE_TTS = 16000;
15
+ const SAMPLE_RATE_STT = 16000;
16
+
17
+ let sttPipeline = null;
18
+ let ttsPipeline = null;
19
+ let speakerEmbeddings = null;
20
+ let sttLoading = false;
21
+ let ttsLoading = false;
22
+
23
/**
 * Pick the Whisper model location to hand to the pipeline loader.
 *
 * Prefers a `models/onnx-community/whisper-base` directory sitting next to the
 * resolved `webtalk` entry point; when `webtalk` cannot be resolved or that
 * directory does not exist, the model id 'onnx-community/whisper-base' is used.
 *
 * @returns {string} A filesystem path to the bundled model, or the model id.
 */
function whisperModelPath() {
  let bundledModelDir = null;
  try {
    const webtalkRoot = path.dirname(require.resolve('webtalk'));
    bundledModelDir = path.join(webtalkRoot, 'models', 'onnx-community', 'whisper-base');
  } catch (_) {
    // webtalk is not resolvable: fall through to the model id below.
  }
  const hasBundledModel = bundledModelDir !== null && fs.existsSync(bundledModelDir);
  return hasBundledModel ? bundledModelDir : 'onnx-community/whisper-base';
}
31
+
32
/**
 * Load the speaker-embedding vector used for text-to-speech, downloading it
 * into DATA_DIR on first use and caching the decoded array in memory.
 *
 * The download is written to a temporary file and renamed into place so an
 * interrupted write can never leave a truncated file at the cache path, and
 * the cached file's size is validated before it is reinterpreted as float32.
 *
 * @returns {Promise<Float32Array>} The embedding values.
 * @throws {Error} When the download fails or the cached file is corrupt.
 */
async function ensureSpeakerEmbeddings() {
  if (speakerEmbeddings) return speakerEmbeddings;
  if (!fs.existsSync(DATA_DIR)) fs.mkdirSync(DATA_DIR, { recursive: true });

  if (!fs.existsSync(SPEAKER_EMBEDDINGS_PATH)) {
    const resp = await fetch(SPEAKER_EMBEDDINGS_URL);
    if (!resp.ok) throw new Error('Failed to download speaker embeddings');
    const payload = Buffer.from(await resp.arrayBuffer());
    // Write-then-rename: the final path only ever holds a complete file.
    const tmpPath = `${SPEAKER_EMBEDDINGS_PATH}.${process.pid}.tmp`;
    try {
      fs.writeFileSync(tmpPath, payload);
      fs.renameSync(tmpPath, SPEAKER_EMBEDDINGS_PATH);
    } catch (err) {
      fs.rmSync(tmpPath, { force: true });
      throw err;
    }
  }

  const buf = fs.readFileSync(SPEAKER_EMBEDDINGS_PATH);
  if (buf.byteLength === 0 || buf.byteLength % Float32Array.BYTES_PER_ELEMENT !== 0) {
    // Drop the bad cache entry so the next call downloads a fresh copy.
    fs.rmSync(SPEAKER_EMBEDDINGS_PATH, { force: true });
    throw new Error('Speaker embeddings file is corrupt; it will be re-downloaded on the next attempt');
  }
  // Copy into a fresh, exactly-sized ArrayBuffer before viewing it as float32.
  speakerEmbeddings = new Float32Array(new Uint8Array(buf).buffer);
  return speakerEmbeddings;
}
44
+
45
// Promise for the speech-to-text model load currently in progress, or null
// when no load is running. Concurrent getSTT() callers share it.
let sttLoadPromise = null;

/**
 * Lazily create and cache the automatic-speech-recognition pipeline.
 *
 * Callers that arrive while a load is in progress receive the same promise
 * as the caller that started it, so a failed load rejects for every waiter
 * instead of resolving them with `null`. After a failure the next call
 * starts a fresh load.
 *
 * @returns {Promise<Function>} The loaded pipeline.
 * @throws Whatever the pipeline loader rejects with.
 */
async function getSTT() {
  if (sttPipeline) return sttPipeline;
  if (sttLoadPromise) return sttLoadPromise;

  sttLoading = true;
  const load = (async () => {
    const modelPath = whisperModelPath();
    // A path that exists on disk (or has no '/') is loaded strictly from local files.
    const isLocal = !modelPath.includes('/') || fs.existsSync(modelPath);
    // NOTE(review): `env` is shared module state and getTTS() also writes
    // allowRemoteModels; overlapping first loads may interfere - confirm.
    env.allowLocalModels = true;
    env.allowRemoteModels = !isLocal;
    if (isLocal) env.localModelPath = '';
    sttPipeline = await pipeline('automatic-speech-recognition', modelPath, {
      device: 'cpu',
      local_files_only: isLocal,
    });
    return sttPipeline;
  })();
  sttLoadPromise = load;

  try {
    return await load;
  } finally {
    // Runs after the assignment above because `await` always yields first.
    sttLoading = false;
    sttLoadPromise = null;
  }
}
67
+
68
// Promise for the text-to-speech model load currently in progress, or null
// when no load is running. Concurrent getTTS() callers share it.
let ttsLoadPromise = null;

/**
 * Lazily create and cache the text-to-speech pipeline, and make sure the
 * speaker embeddings are available before reporting success.
 *
 * Callers that arrive while a load is in progress receive the same promise
 * as the caller that started it, so a failed load rejects for every waiter
 * instead of resolving them with `null`.
 *
 * @returns {Promise<Function>} The loaded pipeline.
 * @throws Whatever the pipeline loader or the embeddings download rejects with.
 */
async function getTTS() {
  if (ttsPipeline) return ttsPipeline;
  if (ttsLoadPromise) return ttsLoadPromise;

  ttsLoading = true;
  const load = (async () => {
    env.allowRemoteModels = true;
    ttsPipeline = await pipeline('text-to-speech', 'Xenova/speecht5_tts', {
      device: 'cpu',
      dtype: 'fp32',
    });
    await ensureSpeakerEmbeddings();
    return ttsPipeline;
  })();
  ttsLoadPromise = load;

  try {
    return await load;
  } finally {
    // Runs after the assignment above because `await` always yields first.
    ttsLoading = false;
    ttsLoadPromise = null;
  }
}
87
+
88
/**
 * Decode a RIFF/WAVE byte buffer into mono float samples.
 *
 * Only the first channel of each frame is kept. Supported sample encodings
 * are 8-bit unsigned, 16/24/32-bit signed integer PCM and 32-bit float.
 *
 * @param {ArrayBuffer|ArrayBufferView} buffer WAV bytes. Views (including
 *   Node Buffers) are read through their own byteOffset/byteLength, so a
 *   Buffer that is a slice of a larger allocation decodes correctly.
 * @returns {{audio: Float32Array, sampleRate: number}} Samples scaled to
 *   roughly [-1, 1] and the sample rate declared in the header.
 * @throws {Error} When the input is not RIFF, lacks a data chunk, or uses an
 *   unsupported sample size.
 */
function decodeWavToFloat32(buffer) {
  const view = ArrayBuffer.isView(buffer)
    ? new DataView(buffer.buffer, buffer.byteOffset, buffer.byteLength)
    : new DataView(buffer);
  const tagAt = (off) => String.fromCharCode(
    view.getUint8(off), view.getUint8(off + 1), view.getUint8(off + 2), view.getUint8(off + 3)
  );
  if (view.byteLength < 12 || tagAt(0) !== 'RIFF') throw new Error('Not a WAV file');

  // Walk the chunk list instead of assuming the canonical 44-byte layout.
  let fmtOffset = -1;
  let fmtSize = 0;
  let dataOffset = -1;
  let dataSize = 0;
  for (let pos = 12; pos + 8 <= view.byteLength;) {
    const id = tagAt(pos);
    const size = view.getUint32(pos + 4, true);
    if (id === 'fmt ' && fmtOffset < 0) {
      fmtOffset = pos + 8;
      fmtSize = size;
    } else if (id === 'data') {
      dataOffset = pos + 8;
      dataSize = size;
      break;
    }
    pos += 8 + size + (size & 1); // chunks are padded to an even length
  }
  if (dataOffset < 0) {
    // Chunk sizes were unusable: fall back to scanning for the 'data' tag.
    for (let i = 36; i < view.byteLength - 8; i++) {
      if (tagAt(i) === 'data') {
        dataOffset = i + 8;
        dataSize = view.getUint32(i + 4, true);
        break;
      }
    }
  }
  if (dataOffset < 0) throw new Error('Invalid WAV file: missing data chunk');
  if (fmtOffset < 0) {
    // No 'fmt ' chunk found by the walk: assume the canonical position.
    fmtOffset = 20;
    fmtSize = 16;
  }
  if (fmtOffset + 16 > view.byteLength) throw new Error('Invalid WAV file: truncated fmt chunk');

  let format = view.getUint16(fmtOffset, true);
  const numChannels = view.getUint16(fmtOffset + 2, true);
  const sampleRate = view.getUint32(fmtOffset + 4, true);
  const bitsPerSample = view.getUint16(fmtOffset + 14, true);
  if (format === 0xFFFE && fmtSize >= 26 && fmtOffset + 26 <= view.byteLength) {
    // WAVE_FORMAT_EXTENSIBLE: the real format code leads the SubFormat GUID.
    format = view.getUint16(fmtOffset + 24, true);
  }
  if (![8, 16, 24, 32].includes(bitsPerSample) || numChannels === 0) {
    throw new Error(`Unsupported WAV format: ${bitsPerSample}-bit, ${numChannels} channel(s)`);
  }

  const bytesPerSample = bitsPerSample / 8;
  const frameSize = bytesPerSample * numChannels;
  // Honour the declared data size so trailing chunks are not decoded as audio;
  // a size larger than the buffer (e.g. a streaming placeholder) is clamped.
  const dataBytes = Math.min(dataSize, view.byteLength - dataOffset);
  const numSamples = Math.floor(dataBytes / frameSize);
  // 32-bit samples are integers only when the header says plain PCM (1).
  const isInt32 = bitsPerSample === 32 && format === 1;

  const audio = new Float32Array(numSamples);
  for (let i = 0; i < numSamples; i++) {
    const offset = dataOffset + i * frameSize;
    if (bitsPerSample === 16) {
      audio[i] = view.getInt16(offset, true) / 32768;
    } else if (bitsPerSample === 32) {
      audio[i] = isInt32 ? view.getInt32(offset, true) / 2147483648 : view.getFloat32(offset, true);
    } else if (bitsPerSample === 24) {
      const sample = (view.getInt8(offset + 2) << 16) | (view.getUint8(offset + 1) << 8) | view.getUint8(offset);
      audio[i] = sample / 8388608;
    } else {
      audio[i] = (view.getUint8(offset) - 128) / 128;
    }
  }
  return { audio, sampleRate };
}
118
+
119
/**
 * Convert samples recorded at `fromRate` to SAMPLE_RATE_STT using linear
 * interpolation between neighbouring input samples.
 *
 * @param {Float32Array} audio Input samples.
 * @param {number} fromRate Sample rate of `audio` in Hz.
 * @returns {Float32Array} `audio` itself when the rate already matches,
 *   otherwise a new array holding the resampled signal.
 */
function resampleTo16k(audio, fromRate) {
  if (fromRate === SAMPLE_RATE_STT) return audio;

  const step = fromRate / SAMPLE_RATE_STT;
  const lastIndex = audio.length - 1;
  const out = new Float32Array(Math.round(audio.length / step));

  for (let k = 0; k < out.length; k += 1) {
    const position = k * step;
    const base = Math.floor(position);
    // Clamp the right-hand neighbour so the final sample interpolates with itself.
    const next = Math.min(base + 1, lastIndex);
    const weight = position - base;
    out[k] = audio[base] * (1 - weight) + audio[next] * weight;
  }
  return out;
}
133
+
134
/**
 * Serialise float samples as a mono, 16-bit PCM WAV file.
 *
 * Samples are clamped to [-1, 1] before scaling; negative values scale by
 * 32768 and non-negative values by 32767.
 *
 * @param {Float32Array} float32Audio Samples to encode.
 * @param {number} sampleRate Sample rate written into the header, in Hz.
 * @returns {Buffer} A 44-byte header followed by little-endian samples.
 */
function encodeWav(float32Audio, sampleRate) {
  const HEADER_BYTES = 44;
  const SAMPLE_BYTES = 2;
  const sampleCount = float32Audio.length;
  const payloadBytes = sampleCount * SAMPLE_BYTES;

  const raw = new ArrayBuffer(HEADER_BYTES + payloadBytes);
  const out = new DataView(raw);
  const putAscii = (at, text) => {
    for (let k = 0; k < text.length; k += 1) {
      out.setUint8(at + k, text.charCodeAt(k));
    }
  };

  // RIFF container header.
  putAscii(0, 'RIFF');
  out.setUint32(4, 36 + payloadBytes, true);
  putAscii(8, 'WAVE');

  // 'fmt ' chunk: 16-byte body, PCM (1), one channel.
  putAscii(12, 'fmt ');
  out.setUint32(16, 16, true);
  out.setUint16(20, 1, true);
  out.setUint16(22, 1, true);
  out.setUint32(24, sampleRate, true);
  out.setUint32(28, sampleRate * SAMPLE_BYTES, true);
  out.setUint16(32, SAMPLE_BYTES, true);
  out.setUint16(34, 16, true);

  // 'data' chunk.
  putAscii(36, 'data');
  out.setUint32(40, payloadBytes, true);

  let cursor = HEADER_BYTES;
  for (let k = 0; k < sampleCount; k += 1) {
    const clamped = Math.max(-1, Math.min(1, float32Audio[k]));
    const scale = clamped < 0 ? 32768 : 32767;
    out.setInt16(cursor, clamped * scale, true);
    cursor += SAMPLE_BYTES;
  }
  return Buffer.from(raw);
}
160
+
161
/**
 * Run speech-to-text over an audio payload.
 *
 * Input starting with the ASCII tag 'RIFF' is decoded as WAV and resampled
 * to the STT rate. Anything else is interpreted as raw 32-bit float samples
 * and passed through without resampling (presumably already 16 kHz mono -
 * verify against the caller).
 *
 * @param {Buffer|ArrayBuffer|Uint8Array} audioBuffer Audio bytes.
 * @returns {Promise<string>} Recognised text, or '' when the model returns none.
 */
async function transcribe(audioBuffer) {
  const stt = await getSTT();
  const buf = Buffer.isBuffer(audioBuffer) ? audioBuffer : Buffer.from(audioBuffer);
  const isWav = buf.length > 4 && buf.toString('ascii', 0, 4) === 'RIFF';

  let audio;
  if (isWav) {
    const decoded = decodeWavToFloat32(buf);
    audio = resampleTo16k(decoded.audio, decoded.sampleRate);
  } else {
    // Copy into a fresh buffer: a Float32Array view requires a 4-byte-aligned
    // offset, which a Buffer carved out of a shared allocation may not have.
    // Trailing bytes that do not form a whole sample are dropped.
    const byteCount = Math.floor(buf.byteLength / 4) * 4;
    const aligned = new Uint8Array(byteCount);
    aligned.set(buf.subarray(0, byteCount));
    audio = new Float32Array(aligned.buffer);
  }

  const result = await stt(audio);
  return result.text || '';
}
175
+
176
/**
 * Turn text into spoken audio.
 *
 * Loads the TTS pipeline and speaker embeddings (each cached after first
 * use), runs synthesis, and encodes the result as WAV. When the pipeline
 * does not report a sampling rate, SAMPLE_RATE_TTS is used.
 *
 * @param {string} text Text to speak.
 * @returns {Promise<Buffer>} WAV-encoded audio.
 */
async function synthesize(text) {
  const speak = await getTTS();
  const speakerVector = await ensureSpeakerEmbeddings();
  const output = await speak(text, { speaker_embeddings: speakerVector });
  const rate = output.sampling_rate || SAMPLE_RATE_TTS;
  return encodeWav(output.audio, rate);
}
182
+
183
/**
 * Snapshot of the speech subsystem's load state.
 *
 * @returns {{sttReady: boolean, ttsReady: boolean, sttLoading: boolean, ttsLoading: boolean}}
 *   `*Ready` is true once the corresponding pipeline has been created;
 *   `*Loading` mirrors the module's loading flags.
 */
function getStatus() {
  const sttReady = Boolean(sttPipeline);
  const ttsReady = Boolean(ttsPipeline);
  return { sttReady, ttsReady, sttLoading, ttsLoading };
}
191
+
192
+ export { transcribe, synthesize, getSTT, getTTS, getStatus };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "agentgui",
3
- "version": "1.0.143",
3
+ "version": "1.0.145",
4
4
  "description": "Multi-agent ACP client with real-time communication",
5
5
  "type": "module",
6
6
  "main": "server.js",
@@ -22,10 +22,12 @@
22
22
  },
23
23
  "dependencies": {
24
24
  "@anthropic-ai/claude-code": "^2.1.37",
25
+ "@huggingface/transformers": "^3.8.1",
25
26
  "better-sqlite3": "^12.6.2",
26
27
  "busboy": "^1.6.0",
27
28
  "express": "^5.2.1",
28
29
  "fsbrowse": "^0.2.13",
30
+ "onnxruntime-node": "^1.24.1",
29
31
  "webtalk": "github:anEntrypoint/realtime-whisper-webgpu",
30
32
  "ws": "^8.14.2"
31
33
  }
package/server.js CHANGED
@@ -8,12 +8,12 @@ import { execSync } from 'child_process';
8
8
  import { createRequire } from 'module';
9
9
  import { queries } from './database.js';
10
10
  import { runClaudeWithStreaming } from './lib/claude-runner.js';
11
+ import { transcribe, synthesize, getStatus as getSpeechStatus } from './lib/speech.js';
11
12
 
12
13
  const require = createRequire(import.meta.url);
13
14
  const express = require('express');
14
15
  const Busboy = require('busboy');
15
16
  const fsbrowse = require('fsbrowse');
16
- const { webtalk } = require('webtalk');
17
17
 
18
18
  const SYSTEM_PROMPT = `Always write your responses in ripple-ui enhanced HTML. Avoid overriding light/dark mode CSS variables. Use all the benefits of HTML to express technical details with proper semantic markup, tables, code blocks, headings, and lists. Write clean, well-structured HTML that respects the existing design system.`;
19
19
 
@@ -37,28 +37,6 @@ if (!fs.existsSync(staticDir)) fs.mkdirSync(staticDir, { recursive: true });
37
37
  // Express sub-app for fsbrowse file browser and file upload
38
38
  const expressApp = express();
39
39
 
40
- // Separate Express app for webtalk (STT/TTS) - isolated to contain COEP/COOP headers
41
- const webtalkApp = express();
42
- const webtalkInstance = webtalk(webtalkApp, { path: '/webtalk' });
43
-
44
- const webtalkSdkDir = path.dirname(require.resolve('webtalk'));
45
- const WASM_MIN_BYTES = 1000000;
46
- const webtalkCriticalFiles = [
47
- { path: path.join(webtalkSdkDir, 'assets', 'ort-wasm-simd-threaded.jsep.wasm'), minBytes: WASM_MIN_BYTES }
48
- ];
49
- for (const file of webtalkCriticalFiles) {
50
- try {
51
- if (fs.existsSync(file.path)) {
52
- const stat = fs.statSync(file.path);
53
- if (stat.size < file.minBytes) {
54
- debugLog(`Removing corrupt file ${path.basename(file.path)} (${stat.size} bytes, need ${file.minBytes}+)`);
55
- fs.unlinkSync(file.path);
56
- }
57
- }
58
- } catch (e) { debugLog(`File check error: ${e.message}`); }
59
- }
60
-
61
- webtalkInstance.init().catch(err => debugLog('Webtalk init: ' + err.message));
62
40
 
63
41
  // File upload endpoint - copies dropped files to conversation workingDirectory
64
42
  expressApp.post(BASE_URL + '/api/upload/:conversationId', (req, res) => {
@@ -165,63 +143,9 @@ const server = http.createServer(async (req, res) => {
165
143
  res.setHeader('Access-Control-Allow-Origin', '*');
166
144
  res.setHeader('Access-Control-Allow-Methods', 'GET, POST, PUT, DELETE, OPTIONS');
167
145
  res.setHeader('Access-Control-Allow-Headers', 'Content-Type, Authorization');
168
- res.setHeader('Cross-Origin-Embedder-Policy', 'credentialless');
169
- res.setHeader('Cross-Origin-Opener-Policy', 'same-origin');
170
- res.setHeader('Cross-Origin-Resource-Policy', 'cross-origin');
171
146
  if (req.method === 'OPTIONS') { res.writeHead(200); res.end(); return; }
172
147
 
173
148
  const pathOnly = req.url.split('?')[0];
174
- const webtalkPrefix = BASE_URL + '/webtalk';
175
- const isWebtalkRoute = pathOnly.startsWith(webtalkPrefix) ||
176
- pathOnly.startsWith(BASE_URL + '/api/tts-status') ||
177
- pathOnly.startsWith(BASE_URL + '/assets/') ||
178
- pathOnly.startsWith(BASE_URL + '/tts/') ||
179
- pathOnly.startsWith(BASE_URL + '/models/') ||
180
- pathOnly.startsWith('/webtalk') ||
181
- pathOnly.startsWith('/assets/') ||
182
- pathOnly.startsWith('/tts/') ||
183
- pathOnly.startsWith('/models/');
184
- if (isWebtalkRoute) {
185
- const webtalkSdkDir = path.dirname(require.resolve('webtalk'));
186
- const sdkFiles = { '/demo': 'app.html', '/sdk.js': 'sdk.js', '/stt.js': 'stt.js', '/tts.js': 'tts.js', '/tts-utils.js': 'tts-utils.js' };
187
- let stripped = pathOnly.startsWith(webtalkPrefix) ? pathOnly.slice(webtalkPrefix.length) : (pathOnly.startsWith('/webtalk') ? pathOnly.slice('/webtalk'.length) : null);
188
- if (stripped !== null && !sdkFiles[stripped] && !stripped.endsWith('.js') && sdkFiles[stripped + '.js']) stripped += '.js';
189
- if (stripped !== null && sdkFiles[stripped]) {
190
- const filePath = path.join(webtalkSdkDir, sdkFiles[stripped]);
191
- return fs.readFile(filePath, 'utf-8', (err, content) => {
192
- if (err) { res.writeHead(404); res.end('Not found'); return; }
193
- if (stripped === '/demo') {
194
- let patched = content
195
- .replace(/from\s+['"](\/webtalk\/[^'"]+)['"]/g, (_, p) => `from '${BASE_URL}${p}'`)
196
- .replace(/from\s+['"]\.\/([^'"]+)['"]/g, (_, p) => `from '${BASE_URL}/webtalk/${p}'`)
197
- .replace('<head>', `<head>\n <script>window.__WEBTALK_BASE='${BASE_URL}';</script>`);
198
- res.writeHead(200, { 'Content-Type': 'text/html; charset=utf-8', 'Cross-Origin-Embedder-Policy': 'credentialless', 'Cross-Origin-Opener-Policy': 'same-origin', 'Cross-Origin-Resource-Policy': 'cross-origin' });
199
- return res.end(patched);
200
- }
201
- let js = content;
202
- const ensureExt = (mod) => mod.endsWith('.js') ? mod : mod + '.js';
203
- if (js.includes('require(') || js.includes('module.exports')) {
204
- js = js.replace(/const\s*\{([^}]+)\}\s*=\s*require\(['"]\.\/([^'"]+)['"]\);?/g, (_, names, mod) => `import {${names}} from '${BASE_URL}/webtalk/${ensureExt(mod)}';`);
205
- js = js.replace(/const\s+(\w+)\s*=\s*require\(['"]\.\/([^'"]+)['"]\);?/g, (_, name, mod) => `import ${name} from '${BASE_URL}/webtalk/${ensureExt(mod)}';`);
206
- js = js.replace(/module\.exports\s*=\s*\{([^}]+)\};?/, (_, names) => `export {${names.trim().replace(/\s+/g, ' ')} };`);
207
- }
208
- js = js.replace(/from\s+['"]\.\/([^'"]+)['"]/g, (_, p) => `from '${BASE_URL}/webtalk/${ensureExt(p)}'`);
209
- res.writeHead(200, { 'Content-Type': 'application/javascript; charset=utf-8', 'Cross-Origin-Resource-Policy': 'cross-origin' });
210
- res.end(js);
211
- });
212
- }
213
- if (req.url.startsWith(BASE_URL)) req.url = req.url.slice(BASE_URL.length) || '/';
214
- const isModelOrAsset = pathOnly.includes('/models/') || pathOnly.includes('/assets/') || pathOnly.endsWith('.wasm') || pathOnly.endsWith('.onnx');
215
- if (isModelOrAsset) {
216
- res.setHeader('Cache-Control', 'public, max-age=604800, immutable');
217
- }
218
- const origSetHeader = res.setHeader.bind(res);
219
- res.setHeader = (name, value) => {
220
- if (name.toLowerCase() === 'cross-origin-embedder-policy') return;
221
- origSetHeader(name, value);
222
- };
223
- return webtalkApp(req, res);
224
- }
225
149
 
226
150
  // Route file upload and fsbrowse requests through Express sub-app
227
151
  if (pathOnly.startsWith(BASE_URL + '/api/upload/') || pathOnly.startsWith(BASE_URL + '/files/')) {
@@ -516,6 +440,53 @@ const server = http.createServer(async (req, res) => {
516
440
  return;
517
441
  }
518
442
 
443
+ if (routePath === '/api/stt' && req.method === 'POST') {
444
+ try {
445
+ const chunks = [];
446
+ for await (const chunk of req) chunks.push(chunk);
447
+ const audioBuffer = Buffer.concat(chunks);
448
+ if (audioBuffer.length === 0) {
449
+ res.writeHead(400, { 'Content-Type': 'application/json' });
450
+ res.end(JSON.stringify({ error: 'No audio data' }));
451
+ return;
452
+ }
453
+ const text = await transcribe(audioBuffer);
454
+ res.writeHead(200, { 'Content-Type': 'application/json' });
455
+ res.end(JSON.stringify({ text: text.trim() }));
456
+ } catch (err) {
457
+ debugLog('[STT] Error: ' + err.message);
458
+ res.writeHead(500, { 'Content-Type': 'application/json' });
459
+ res.end(JSON.stringify({ error: err.message }));
460
+ }
461
+ return;
462
+ }
463
+
464
+ if (routePath === '/api/tts' && req.method === 'POST') {
465
+ try {
466
+ const body = await parseBody(req);
467
+ const text = body.text || '';
468
+ if (!text) {
469
+ res.writeHead(400, { 'Content-Type': 'application/json' });
470
+ res.end(JSON.stringify({ error: 'No text provided' }));
471
+ return;
472
+ }
473
+ const wavBuffer = await synthesize(text);
474
+ res.writeHead(200, { 'Content-Type': 'audio/wav', 'Content-Length': wavBuffer.length });
475
+ res.end(wavBuffer);
476
+ } catch (err) {
477
+ debugLog('[TTS] Error: ' + err.message);
478
+ res.writeHead(500, { 'Content-Type': 'application/json' });
479
+ res.end(JSON.stringify({ error: err.message }));
480
+ }
481
+ return;
482
+ }
483
+
484
+ if (routePath === '/api/speech-status' && req.method === 'GET') {
485
+ res.writeHead(200, { 'Content-Type': 'application/json' });
486
+ res.end(JSON.stringify(getSpeechStatus()));
487
+ return;
488
+ }
489
+
519
490
  if (routePath === '/api/folders' && req.method === 'POST') {
520
491
  const body = await parseBody(req);
521
492
  const folderPath = body.path || STARTUP_CWD;
@@ -591,7 +562,7 @@ const server = http.createServer(async (req, res) => {
591
562
  }
592
563
  });
593
564
 
594
- const MIME_TYPES = { '.html': 'text/html; charset=utf-8', '.js': 'application/javascript; charset=utf-8', '.css': 'text/css; charset=utf-8', '.json': 'application/json', '.png': 'image/png', '.jpg': 'image/jpeg', '.svg': 'image/svg+xml', '.wasm': 'application/wasm', '.onnx': 'application/octet-stream' };
565
+ const MIME_TYPES = { '.html': 'text/html; charset=utf-8', '.js': 'application/javascript; charset=utf-8', '.css': 'text/css; charset=utf-8', '.json': 'application/json', '.png': 'image/png', '.jpg': 'image/jpeg', '.svg': 'image/svg+xml' };
595
566
 
596
567
  function serveFile(filePath, res) {
597
568
  const ext = path.extname(filePath).toLowerCase();
@@ -613,7 +584,7 @@ function serveFile(filePath, res) {
613
584
  fs.readFile(filePath, (err, data) => {
614
585
  if (err) { res.writeHead(500); res.end('Server error'); return; }
615
586
  let content = data.toString();
616
- const baseTag = `<script>window.__BASE_URL='${BASE_URL}';</script>\n <script type="importmap">{"imports":{"webtalk-sdk":"${BASE_URL}/webtalk/sdk.js"}}</script>`;
587
+ const baseTag = `<script>window.__BASE_URL='${BASE_URL}';</script>`;
617
588
  content = content.replace('<head>', '<head>\n ' + baseTag);
618
589
  if (watch) {
619
590
  content += `\n<script>(function(){const ws=new WebSocket('ws://'+location.host+'${BASE_URL}/hot-reload');ws.onmessage=e=>{if(JSON.parse(e.data).type==='reload')location.reload()};})();</script>`;
@@ -640,7 +611,7 @@ function persistChunkWithRetry(sessionId, conversationId, sequence, blockType, b
640
611
 
641
612
  async function processMessageWithStreaming(conversationId, messageId, sessionId, content, agentId) {
642
613
  const startTime = Date.now();
643
- activeExecutions.set(conversationId, true);
614
+ activeExecutions.set(conversationId, { pid: null, startTime, sessionId });
644
615
  queries.setIsStreaming(conversationId, true);
645
616
  queries.updateSession(sessionId, { status: 'active' });
646
617
 
@@ -756,7 +727,11 @@ async function processMessageWithStreaming(conversationId, messageId, sessionId,
756
727
  print: true,
757
728
  resumeSessionId,
758
729
  systemPrompt: SYSTEM_PROMPT,
759
- onEvent
730
+ onEvent,
731
+ onPid: (pid) => {
732
+ const entry = activeExecutions.get(conversationId);
733
+ if (entry) entry.pid = pid;
734
+ }
760
735
  };
761
736
 
762
737
  const { outputs, sessionId: claudeSessionId } = await runClaudeWithStreaming(content, cwd, agentId || 'claude-code', config);
@@ -1030,6 +1005,66 @@ server.on('error', (err) => {
1030
1005
  }
1031
1006
  });
1032
1007
 
1008
/**
 * Mark sessions that the database still lists as active/pending, but that
 * have no entry in `activeExecutions`, as failed.
 *
 * For each such session the session row is set to status 'error', the
 * conversation's streaming flag is cleared, and a `streaming_error` message
 * is broadcast. Any exception is logged and ends the pass.
 */
function recoverStaleSessions() {
  const REASON = 'Agent died unexpectedly (server restart)';
  try {
    let candidates = [];
    if (queries.getActiveSessions) {
      candidates = queries.getActiveSessions();
    }

    let recovered = 0;
    for (const session of candidates) {
      // Sessions with a live execution entry are genuinely running.
      if (activeExecutions.has(session.conversationId)) continue;

      queries.updateSession(session.id, {
        status: 'error',
        error: REASON,
        completed_at: Date.now()
      });
      queries.setIsStreaming(session.conversationId, false);
      broadcastSync({
        type: 'streaming_error',
        sessionId: session.id,
        conversationId: session.conversationId,
        error: REASON,
        recoverable: false,
        timestamp: Date.now()
      });
      recovered += 1;
    }

    if (recovered > 0) {
      console.log(`[RECOVERY] Recovered ${recovered} stale active session(s)`);
    }
  } catch (err) {
    console.error('[RECOVERY] Stale session recovery error:', err.message);
  }
}
1038
+
1039
/**
 * Probe every tracked agent process and clean up after those that have died.
 *
 * Each entry in `activeExecutions` with a known PID is probed with signal 0.
 * Only an ESRCH error (no such process) is treated as death; EPERM means the
 * process exists but may not be signalled, so it is left alone. For a dead
 * agent the execution entry is removed, the conversation's streaming flag is
 * cleared, the session is marked 'error', a `streaming_error` is broadcast
 * and the conversation's message queue is drained.
 *
 * Cleanup failures are logged rather than thrown so an error here cannot
 * escape the interval timer that calls this function.
 */
function performAgentHealthCheck() {
  for (const [conversationId, entry] of activeExecutions) {
    if (!entry || !entry.pid) continue;

    let isDead = false;
    try {
      process.kill(entry.pid, 0); // signal 0: existence check only
    } catch (err) {
      isDead = err.code === 'ESRCH';
    }
    if (!isDead) continue;

    debugLog(`[HEALTH] Agent PID ${entry.pid} for conv ${conversationId} is dead`);
    try {
      activeExecutions.delete(conversationId);
      queries.setIsStreaming(conversationId, false);
      if (entry.sessionId) {
        queries.updateSession(entry.sessionId, {
          status: 'error',
          error: 'Agent process died unexpectedly',
          completed_at: Date.now()
        });
      }
      broadcastSync({
        type: 'streaming_error',
        sessionId: entry.sessionId,
        conversationId,
        error: 'Agent process died unexpectedly',
        recoverable: false,
        timestamp: Date.now()
      });
      drainMessageQueue(conversationId);
    } catch (err) {
      console.error('[HEALTH] Cleanup failed for conv ' + conversationId + ':', err.message);
    }
  }
}
1067
+
1033
1068
  function onServerReady() {
1034
1069
  console.log(`GMGUI running on http://localhost:${PORT}${BASE_URL}/`);
1035
1070
  console.log(`Agents: ${discoveredAgents.map(a => a.name).join(', ') || 'none'}`);
@@ -1041,12 +1076,18 @@ function onServerReady() {
1041
1076
  console.log(`Cleaned up ${deletedCount} empty conversation(s) on startup`);
1042
1077
  }
1043
1078
 
1079
+ // Recover stale active sessions from previous run
1080
+ recoverStaleSessions();
1081
+
1044
1082
  // Run auto-import immediately
1045
1083
  performAutoImport();
1046
1084
 
1047
1085
  // Then run it every 30 seconds (constant automatic importing)
1048
1086
  setInterval(performAutoImport, 30000);
1049
1087
 
1088
+ // Agent health check every 30 seconds
1089
+ setInterval(performAgentHealthCheck, 30000);
1090
+
1050
1091
  }
1051
1092
 
1052
1093
  function performAutoImport() {
@@ -792,7 +792,13 @@ class AgentGUIClient {
792
792
  const inputStr = JSON.stringify(block.input, null, 2);
793
793
  inputHtml = `<details class="tool-input-details"><summary class="tool-input-summary">Input</summary><pre class="tool-input-pre">${this.escapeHtml(inputStr)}</pre></details>`;
794
794
  }
795
- html += `<div class="streaming-block-tool-use"><div class="tool-use-header"><span class="tool-use-icon">&#9881;</span> <span class="tool-use-name">${this.escapeHtml(block.name || 'unknown')}</span></div>${inputHtml}</div>`;
795
+ const tn = block.name || 'unknown';
796
+ const foldable = tn.startsWith('mcp__') || tn === 'Edit';
797
+ if (foldable) {
798
+ html += `<details class="streaming-block-tool-use"><summary class="tool-use-header" style="cursor:pointer;user-select:none;list-style:none;"><span class="tool-use-icon">&#9881;</span> <span class="tool-use-name">${this.escapeHtml(tn)}</span></summary>${inputHtml}</details>`;
799
+ } else {
800
+ html += `<div class="streaming-block-tool-use"><div class="tool-use-header"><span class="tool-use-icon">&#9881;</span> <span class="tool-use-name">${this.escapeHtml(tn)}</span></div>${inputHtml}</div>`;
801
+ }
796
802
  } else if (block.type === 'tool_result') {
797
803
  const content = typeof block.content === 'string' ? block.content : JSON.stringify(block.content);
798
804
  const smartHtml = typeof StreamingRenderer !== 'undefined' ? StreamingRenderer.renderSmartContentHTML(content, this.escapeHtml.bind(this)) : `<pre class="tool-result-pre">${this.escapeHtml(content.length > 2000 ? content.substring(0, 2000) + '\n... (truncated)' : content)}</pre>`;
@@ -1433,7 +1439,13 @@ class AgentGUIClient {
1433
1439
  const inputStr = JSON.stringify(block.input, null, 2);
1434
1440
  inputHtml = `<details class="tool-input-details"><summary class="tool-input-summary">Input</summary><pre class="tool-input-pre">${this.escapeHtml(inputStr)}</pre></details>`;
1435
1441
  }
1436
- contentHtml += `<div class="streaming-block-tool-use"><div class="tool-use-header"><span class="tool-use-icon">&#9881;</span> <span class="tool-use-name">${this.escapeHtml(block.name || 'unknown')}</span></div>${inputHtml}</div>`;
1442
+ const tn2 = block.name || 'unknown';
1443
+ const foldable2 = tn2.startsWith('mcp__') || tn2 === 'Edit';
1444
+ if (foldable2) {
1445
+ contentHtml += `<details class="streaming-block-tool-use"><summary class="tool-use-header" style="cursor:pointer;user-select:none;list-style:none;"><span class="tool-use-icon">&#9881;</span> <span class="tool-use-name">${this.escapeHtml(tn2)}</span></summary>${inputHtml}</details>`;
1446
+ } else {
1447
+ contentHtml += `<div class="streaming-block-tool-use"><div class="tool-use-header"><span class="tool-use-icon">&#9881;</span> <span class="tool-use-name">${this.escapeHtml(tn2)}</span></div>${inputHtml}</div>`;
1448
+ }
1437
1449
  } else if (block.type === 'tool_result') {
1438
1450
  const content = typeof block.content === 'string' ? block.content : JSON.stringify(block.content);
1439
1451
  const smartHtml = typeof StreamingRenderer !== 'undefined' ? StreamingRenderer.renderSmartContentHTML(content, this.escapeHtml.bind(this)) : `<pre class="tool-result-pre">${this.escapeHtml(content.length > 2000 ? content.substring(0, 2000) + '\n... (truncated)' : content)}</pre>`;
@@ -624,12 +624,31 @@ class StreamingRenderer {
624
624
  * Render tool use block with smart parameter display
625
625
  */
626
626
  renderBlockToolUse(block, context) {
627
- const div = document.createElement('div');
628
- div.className = 'block-tool-use';
629
-
630
627
  const toolName = block.name || 'unknown';
631
628
  const input = block.input || {};
629
+ const shouldFold = toolName.startsWith('mcp__') || toolName === 'Edit';
630
+
631
+ if (shouldFold) {
632
+ const details = document.createElement('details');
633
+ details.className = 'block-tool-use';
634
+ const summary = document.createElement('summary');
635
+ summary.className = 'tool-header';
636
+ summary.style.cssText = 'cursor:pointer;user-select:none;list-style:none;';
637
+ summary.innerHTML = `
638
+ <span class="tool-icon">${this.getToolIcon(toolName)}</span>
639
+ <span class="tool-name"><code>${this.escapeHtml(toolName)}</code></span>
640
+ `;
641
+ details.appendChild(summary);
642
+ if (Object.keys(input).length > 0) {
643
+ const paramsDiv = document.createElement('div');
644
+ paramsDiv.innerHTML = this.renderSmartParams(toolName, input);
645
+ details.appendChild(paramsDiv);
646
+ }
647
+ return details;
648
+ }
632
649
 
650
+ const div = document.createElement('div');
651
+ div.className = 'block-tool-use';
633
652
  div.innerHTML = `
634
653
  <div class="tool-header">
635
654
  <span class="tool-icon">${this.getToolIcon(toolName)}</span>
@@ -637,7 +656,6 @@ class StreamingRenderer {
637
656
  </div>
638
657
  ${Object.keys(input).length > 0 ? this.renderSmartParams(toolName, input) : ''}
639
658
  `;
640
-
641
659
  return div;
642
660
  }
643
661
 
@@ -1,167 +1,24 @@
1
1
  (function() {
2
- const BASE = window.__BASE_URL || '';
3
- let STT = null;
4
- let TTS = null;
2
+ var BASE = window.__BASE_URL || '';
3
+ var isRecording = false;
4
+ var ttsEnabled = true;
5
+ var voiceActive = false;
6
+ var lastSpokenBlockIndex = -1;
7
+ var currentConversationId = null;
8
+ var speechQueue = [];
9
+ var isSpeaking = false;
10
+ var currentAudio = null;
11
+ var mediaStream = null;
12
+ var audioContext = null;
13
+ var scriptNode = null;
14
+ var recordedChunks = [];
15
+ var TARGET_SAMPLE_RATE = 16000;
5
16
 
6
- async function loadSDK() {
7
- try {
8
- const mod = await import(BASE + '/webtalk/sdk.js');
9
- STT = mod.STT;
10
- TTS = mod.TTS;
11
- return true;
12
- } catch (e) {
13
- console.warn('Webtalk SDK load failed:', e.message);
14
- return false;
15
- }
16
- }
17
- let stt = null;
18
- let tts = null;
19
- let isRecording = false;
20
- let ttsEnabled = true;
21
- let voiceActive = false;
22
- let lastSpokenBlockIndex = -1;
23
- let currentConversationId = null;
24
- let sttReady = false;
25
- let ttsReady = false;
26
- let speechQueue = [];
27
- let isSpeaking = false;
28
-
29
- async function init() {
17
+ function init() {
30
18
  setupTTSToggle();
31
19
  setupUI();
32
20
  setupStreamingListener();
33
21
  setupAgentSelector();
34
- var sdkLoaded = await loadSDK();
35
- if (sdkLoaded) {
36
- initSTT();
37
- initTTS();
38
- } else {
39
- sttLoadPhase = 'failed';
40
- updateMicState();
41
- }
42
- }
43
-
44
- var sttLoadPhase = 'starting';
45
-
46
- async function initSTT() {
47
- try {
48
- stt = new STT({
49
- basePath: BASE + '/webtalk',
50
- onTranscript: function(text) {
51
- var el = document.getElementById('voiceTranscript');
52
- if (el) {
53
- el.textContent = text;
54
- el.setAttribute('data-final', text);
55
- }
56
- },
57
- onPartial: function(text) {
58
- var el = document.getElementById('voiceTranscript');
59
- if (el) {
60
- var existing = el.getAttribute('data-final') || '';
61
- el.textContent = existing + text;
62
- }
63
- },
64
- onStatus: function(status) {
65
- var micBtn = document.getElementById('voiceMicBtn');
66
- if (!micBtn) return;
67
- if (status === 'recording') {
68
- micBtn.classList.add('recording');
69
- } else {
70
- micBtn.classList.remove('recording');
71
- }
72
- }
73
- });
74
- var origInit = stt.init.bind(stt);
75
- var initPromise = new Promise(function(resolve, reject) {
76
- origInit().then(resolve).catch(reject);
77
- if (stt.worker) {
78
- var origHandler = stt.worker.onmessage;
79
- stt.worker.onmessage = function(e) {
80
- var msg = e.data;
81
- if (msg && msg.status) {
82
- if (msg.status === 'progress' || msg.status === 'download') {
83
- if (sttLoadPhase !== 'downloading') {
84
- sttLoadPhase = 'downloading';
85
- updateMicState();
86
- }
87
- } else if (msg.status === 'done' && msg.file && msg.file.endsWith('.onnx')) {
88
- sttLoadPhase = 'compiling';
89
- updateMicState();
90
- }
91
- }
92
- if (origHandler) origHandler.call(stt.worker, e);
93
- };
94
- }
95
- });
96
- await initPromise;
97
- sttReady = true;
98
- updateMicState();
99
- } catch (e) {
100
- console.warn('STT init failed:', e.message);
101
- sttLoadPhase = 'failed';
102
- updateMicState();
103
- }
104
- }
105
-
106
- function updateMicState() {
107
- var micBtn = document.getElementById('voiceMicBtn');
108
- if (!micBtn) return;
109
- if (sttReady) {
110
- micBtn.removeAttribute('disabled');
111
- micBtn.title = 'Click to record';
112
- micBtn.classList.remove('loading');
113
- } else if (sttLoadPhase === 'failed') {
114
- micBtn.setAttribute('disabled', 'true');
115
- micBtn.title = 'Speech recognition failed to load';
116
- micBtn.classList.remove('loading');
117
- } else {
118
- micBtn.setAttribute('disabled', 'true');
119
- micBtn.classList.add('loading');
120
- if (sttLoadPhase === 'downloading') {
121
- micBtn.title = 'Downloading speech models...';
122
- } else if (sttLoadPhase === 'compiling') {
123
- micBtn.title = 'Compiling speech models (may take a minute)...';
124
- } else {
125
- micBtn.title = 'Loading speech recognition...';
126
- }
127
- }
128
- }
129
-
130
- async function initTTS(retries) {
131
- var maxRetries = retries || 3;
132
- for (var attempt = 0; attempt < maxRetries; attempt++) {
133
- try {
134
- tts = new TTS({
135
- basePath: BASE + '/webtalk',
136
- apiBasePath: BASE,
137
- onStatus: function() {},
138
- onAudioReady: function(url) {
139
- var audio = new Audio(url);
140
- audio.onended = function() {
141
- isSpeaking = false;
142
- processQueue();
143
- };
144
- audio.onerror = function() {
145
- isSpeaking = false;
146
- processQueue();
147
- };
148
- audio.play().catch(function() {
149
- isSpeaking = false;
150
- processQueue();
151
- });
152
- }
153
- });
154
- await tts.init();
155
- ttsReady = true;
156
- return;
157
- } catch (e) {
158
- console.warn('TTS init attempt ' + (attempt + 1) + '/' + maxRetries + ' failed:', e.message);
159
- tts = null;
160
- if (attempt < maxRetries - 1) {
161
- await new Promise(function(r) { setTimeout(r, 3000 * (attempt + 1)); });
162
- }
163
- }
164
- }
165
22
  }
166
23
 
167
24
  function setupAgentSelector() {
@@ -203,6 +60,8 @@
203
60
  function setupUI() {
204
61
  var micBtn = document.getElementById('voiceMicBtn');
205
62
  if (micBtn) {
63
+ micBtn.removeAttribute('disabled');
64
+ micBtn.title = 'Click to record';
206
65
  micBtn.addEventListener('click', function(e) {
207
66
  e.preventDefault();
208
67
  if (!isRecording) {
@@ -216,43 +75,104 @@
216
75
  if (sendBtn) {
217
76
  sendBtn.addEventListener('click', sendVoiceMessage);
218
77
  }
219
- updateMicState();
78
+ }
79
+
80
+ function resampleBuffer(inputBuffer, fromRate, toRate) {
81
+ if (fromRate === toRate) return inputBuffer;
82
+ var ratio = fromRate / toRate;
83
+ var newLen = Math.round(inputBuffer.length / ratio);
84
+ var result = new Float32Array(newLen);
85
+ for (var i = 0; i < newLen; i++) {
86
+ var srcIdx = i * ratio;
87
+ var lo = Math.floor(srcIdx);
88
+ var hi = Math.min(lo + 1, inputBuffer.length - 1);
89
+ var frac = srcIdx - lo;
90
+ result[i] = inputBuffer[lo] * (1 - frac) + inputBuffer[hi] * frac;
91
+ }
92
+ return result;
220
93
  }
221
94
 
222
95
  async function startRecording() {
223
96
  if (isRecording) return;
224
97
  var el = document.getElementById('voiceTranscript');
225
- if (!stt || !sttReady) {
226
- if (el) el.textContent = 'Speech recognition still loading, please wait...';
227
- return;
228
- }
229
98
  if (el) {
230
99
  el.textContent = '';
231
100
  el.setAttribute('data-final', '');
232
101
  }
233
- isRecording = true;
234
102
  try {
235
- await stt.startRecording();
103
+ mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });
104
+ audioContext = new (window.AudioContext || window.webkitAudioContext)();
105
+ var source = audioContext.createMediaStreamSource(mediaStream);
106
+ scriptNode = audioContext.createScriptProcessor(4096, 1, 1);
107
+ recordedChunks = [];
108
+ scriptNode.onaudioprocess = function(e) {
109
+ var data = e.inputBuffer.getChannelData(0);
110
+ recordedChunks.push(new Float32Array(data));
111
+ };
112
+ source.connect(scriptNode);
113
+ scriptNode.connect(audioContext.destination);
114
+ isRecording = true;
115
+ var micBtn = document.getElementById('voiceMicBtn');
116
+ if (micBtn) micBtn.classList.add('recording');
236
117
  } catch (e) {
237
118
  isRecording = false;
238
119
  if (el) el.textContent = 'Mic access denied or unavailable: ' + e.message;
239
- console.warn('Recording start failed:', e.message);
240
120
  }
241
121
  }
242
122
 
243
123
  async function stopRecording() {
244
- if (!stt || !isRecording) return;
124
+ if (!isRecording) return;
245
125
  isRecording = false;
126
+ var micBtn = document.getElementById('voiceMicBtn');
127
+ if (micBtn) micBtn.classList.remove('recording');
128
+ var el = document.getElementById('voiceTranscript');
129
+ if (scriptNode) { scriptNode.disconnect(); scriptNode = null; }
130
+ if (mediaStream) {
131
+ mediaStream.getTracks().forEach(function(t) { t.stop(); });
132
+ mediaStream = null;
133
+ }
134
+ var sourceSampleRate = audioContext ? audioContext.sampleRate : 48000;
135
+ if (audioContext) { audioContext.close().catch(function() {}); audioContext = null; }
136
+ if (recordedChunks.length === 0) return;
137
+ var totalLen = 0;
138
+ for (var i = 0; i < recordedChunks.length; i++) totalLen += recordedChunks[i].length;
139
+ var merged = new Float32Array(totalLen);
140
+ var offset = 0;
141
+ for (var j = 0; j < recordedChunks.length; j++) {
142
+ merged.set(recordedChunks[j], offset);
143
+ offset += recordedChunks[j].length;
144
+ }
145
+ recordedChunks = [];
146
+ var resampled = resampleBuffer(merged, sourceSampleRate, TARGET_SAMPLE_RATE);
147
+ if (el) el.textContent = 'Transcribing...';
246
148
  try {
247
- await stt.stopRecording();
248
- } catch (e) {}
149
+ var pcmBuffer = resampled.buffer;
150
+ var resp = await fetch(BASE + '/api/stt', {
151
+ method: 'POST',
152
+ headers: { 'Content-Type': 'application/octet-stream' },
153
+ body: pcmBuffer
154
+ });
155
+ var data = await resp.json();
156
+ if (data.text) {
157
+ if (el) {
158
+ el.textContent = data.text;
159
+ el.setAttribute('data-final', data.text);
160
+ }
161
+ } else if (data.error) {
162
+ if (el) el.textContent = 'Error: ' + data.error;
163
+ } else {
164
+ if (el) el.textContent = '';
165
+ }
166
+ } catch (e) {
167
+ if (el) el.textContent = 'Transcription failed: ' + e.message;
168
+ }
249
169
  }
250
170
 
251
171
  function sendVoiceMessage() {
252
172
  var el = document.getElementById('voiceTranscript');
253
173
  if (!el) return;
254
174
  var text = el.textContent.trim();
255
- if (!text) return;
175
+ if (!text || text.startsWith('Transcribing') || text.startsWith('Error')) return;
256
176
  addVoiceBlock(text, true);
257
177
  el.textContent = '';
258
178
  el.setAttribute('data-final', '');
@@ -266,7 +186,7 @@
266
186
  }
267
187
 
268
188
  function speak(text) {
269
- if (!ttsEnabled || !tts || !ttsReady) return;
189
+ if (!ttsEnabled) return;
270
190
  var clean = text.replace(/<[^>]*>/g, '').trim();
271
191
  if (!clean) return;
272
192
  speechQueue.push(clean);
@@ -277,7 +197,35 @@
277
197
  if (isSpeaking || speechQueue.length === 0) return;
278
198
  isSpeaking = true;
279
199
  var text = speechQueue.shift();
280
- tts.generate(text).catch(function() {
200
+ fetch(BASE + '/api/tts', {
201
+ method: 'POST',
202
+ headers: { 'Content-Type': 'application/json' },
203
+ body: JSON.stringify({ text: text })
204
+ }).then(function(resp) {
205
+ if (!resp.ok) throw new Error('TTS failed');
206
+ return resp.blob();
207
+ }).then(function(blob) {
208
+ var url = URL.createObjectURL(blob);
209
+ currentAudio = new Audio(url);
210
+ currentAudio.onended = function() {
211
+ URL.revokeObjectURL(url);
212
+ currentAudio = null;
213
+ isSpeaking = false;
214
+ processQueue();
215
+ };
216
+ currentAudio.onerror = function() {
217
+ URL.revokeObjectURL(url);
218
+ currentAudio = null;
219
+ isSpeaking = false;
220
+ processQueue();
221
+ };
222
+ currentAudio.play().catch(function() {
223
+ URL.revokeObjectURL(url);
224
+ currentAudio = null;
225
+ isSpeaking = false;
226
+ processQueue();
227
+ });
228
+ }).catch(function() {
281
229
  isSpeaking = false;
282
230
  processQueue();
283
231
  });
@@ -286,7 +234,10 @@
286
234
  function stopSpeaking() {
287
235
  speechQueue = [];
288
236
  isSpeaking = false;
289
- if (tts) tts.stop();
237
+ if (currentAudio) {
238
+ currentAudio.pause();
239
+ currentAudio = null;
240
+ }
290
241
  }
291
242
 
292
243
  function addVoiceBlock(text, isUser) {