nodebench-mcp 2.13.0 → 2.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. package/dist/__tests__/evalHarness.test.js +1 -1
  2. package/dist/__tests__/gaiaCapabilityAudioEval.test.js +9 -14
  3. package/dist/__tests__/gaiaCapabilityAudioEval.test.js.map +1 -1
  4. package/dist/__tests__/gaiaCapabilityEval.test.js +88 -14
  5. package/dist/__tests__/gaiaCapabilityEval.test.js.map +1 -1
  6. package/dist/__tests__/gaiaCapabilityFilesEval.test.js +9 -5
  7. package/dist/__tests__/gaiaCapabilityFilesEval.test.js.map +1 -1
  8. package/dist/__tests__/gaiaCapabilityMediaEval.test.js +16 -16
  9. package/dist/__tests__/gaiaCapabilityMediaEval.test.js.map +1 -1
  10. package/dist/__tests__/helpers/answerMatch.d.ts +36 -7
  11. package/dist/__tests__/helpers/answerMatch.js +224 -35
  12. package/dist/__tests__/helpers/answerMatch.js.map +1 -1
  13. package/dist/__tests__/presetRealWorldBench.test.js +11 -0
  14. package/dist/__tests__/presetRealWorldBench.test.js.map +1 -1
  15. package/dist/__tests__/tools.test.js +16 -6
  16. package/dist/__tests__/tools.test.js.map +1 -1
  17. package/dist/__tests__/toolsetGatingEval.test.js +11 -1
  18. package/dist/__tests__/toolsetGatingEval.test.js.map +1 -1
  19. package/dist/db.js +21 -0
  20. package/dist/db.js.map +1 -1
  21. package/dist/index.js +31 -4
  22. package/dist/index.js.map +1 -1
  23. package/dist/tools/gitWorkflowTools.d.ts +11 -0
  24. package/dist/tools/gitWorkflowTools.js +580 -0
  25. package/dist/tools/gitWorkflowTools.js.map +1 -0
  26. package/dist/tools/localFileTools.js +2 -2
  27. package/dist/tools/localFileTools.js.map +1 -1
  28. package/dist/tools/metaTools.js +82 -0
  29. package/dist/tools/metaTools.js.map +1 -1
  30. package/dist/tools/parallelAgentTools.js +228 -0
  31. package/dist/tools/parallelAgentTools.js.map +1 -1
  32. package/dist/tools/patternTools.d.ts +13 -0
  33. package/dist/tools/patternTools.js +456 -0
  34. package/dist/tools/patternTools.js.map +1 -0
  35. package/dist/tools/seoTools.d.ts +16 -0
  36. package/dist/tools/seoTools.js +866 -0
  37. package/dist/tools/seoTools.js.map +1 -0
  38. package/dist/tools/toolRegistry.js +260 -0
  39. package/dist/tools/toolRegistry.js.map +1 -1
  40. package/dist/tools/toonTools.d.ts +15 -0
  41. package/dist/tools/toonTools.js +94 -0
  42. package/dist/tools/toonTools.js.map +1 -0
  43. package/dist/tools/voiceBridgeTools.d.ts +15 -0
  44. package/dist/tools/voiceBridgeTools.js +1427 -0
  45. package/dist/tools/voiceBridgeTools.js.map +1 -0
  46. package/package.json +3 -2
@@ -0,0 +1,1427 @@
1
+ /**
2
+ * Voice Bridge Tools — Knowledge-based planning tools for voice interface design.
3
+ *
4
+ * These are research/planning tools that help agents design, validate, scaffold,
5
+ * and benchmark voice pipelines. They do NOT perform actual voice processing —
6
+ * all data comes from built-in knowledge bases of STT/TTS/LLM latency and cost.
7
+ *
8
+ * 4 tools:
9
+ * - design_voice_pipeline: Recommend optimal STT/TTS/LLM stack for requirements
10
+ * - analyze_voice_config: Validate a pipeline config for compatibility and cost
11
+ * - generate_voice_scaffold: Generate starter code for a voice bridge
12
+ * - benchmark_voice_latency: Calculate theoretical latency for pipeline configs
13
+ */
14
+ const STT_ENGINES = [
15
+ {
16
+ name: "Whisper (OpenAI, local)",
17
+ key: "whisper",
18
+ latencyMs: 2000,
19
+ costPerMin: 0,
20
+ privacy: "local",
21
+ platforms: ["mac", "windows", "linux"],
22
+ quality: "high",
23
+ streaming: false,
24
+ notes: "Runs locally via whisper.cpp or Python. 1-3s latency depending on model size and hardware.",
25
+ },
26
+ {
27
+ name: "Whisper MLX (Apple Silicon)",
28
+ key: "whisper_mlx",
29
+ latencyMs: 800,
30
+ costPerMin: 0,
31
+ privacy: "local",
32
+ platforms: ["mac"],
33
+ quality: "high",
34
+ streaming: false,
35
+ notes: "Optimized for Apple M-series chips via MLX framework. Significantly faster than CPU Whisper.",
36
+ },
37
+ {
38
+ name: "Deepgram",
39
+ key: "deepgram",
40
+ latencyMs: 300,
41
+ costPerMin: 0.0043,
42
+ privacy: "cloud",
43
+ platforms: ["mac", "windows", "linux", "browser", "mobile"],
44
+ quality: "high",
45
+ streaming: true,
46
+ notes: "Cloud STT with very low latency. WebSocket streaming. $0.0043/min (Nova-2).",
47
+ },
48
+ {
49
+ name: "Google Cloud Speech-to-Text",
50
+ key: "google_speech",
51
+ latencyMs: 500,
52
+ costPerMin: 0.006,
53
+ privacy: "cloud",
54
+ platforms: ["mac", "windows", "linux", "browser", "mobile"],
55
+ quality: "high",
56
+ streaming: true,
57
+ notes: "Mature cloud STT. $0.006/min standard, $0.009/min enhanced. Good multilingual support.",
58
+ },
59
+ {
60
+ name: "Web Speech API (browser)",
61
+ key: "web_speech_api_stt",
62
+ latencyMs: 500,
63
+ costPerMin: 0,
64
+ privacy: "cloud",
65
+ platforms: ["browser"],
66
+ quality: "medium",
67
+ streaming: true,
68
+ notes: "Free browser-native STT. Uses device/OS speech engine. Chrome uses Google, Safari uses Apple. Not available in Node.js.",
69
+ },
70
+ ];
71
+ const TTS_ENGINES = [
72
+ {
73
+ name: "Edge TTS (Microsoft)",
74
+ key: "edge_tts",
75
+ latencyMs: 500,
76
+ costPerMin: 0,
77
+ privacy: "cloud",
78
+ platforms: ["mac", "windows", "linux"],
79
+ quality: "high",
80
+ streaming: true,
81
+ notes: "Free cloud TTS via edge-tts Python package. High quality voices, ~0.5s TTFB. May have rate limits.",
82
+ },
83
+ {
84
+ name: "Cartesia Sonic",
85
+ key: "cartesia",
86
+ latencyMs: 100,
87
+ costPerMin: 0.042,
88
+ privacy: "cloud",
89
+ platforms: ["mac", "windows", "linux", "browser", "mobile"],
90
+ quality: "high",
91
+ streaming: true,
92
+ notes: "Ultra-low latency streaming TTS. ~100ms TTFB. $0.042/min. Best for real-time voice agents.",
93
+ },
94
+ {
95
+ name: "ElevenLabs",
96
+ key: "elevenlabs",
97
+ latencyMs: 1000,
98
+ costPerMin: 0.18,
99
+ privacy: "cloud",
100
+ platforms: ["mac", "windows", "linux", "browser", "mobile"],
101
+ quality: "high",
102
+ streaming: true,
103
+ notes: "Highest quality voice cloning and synthesis. ~1s TTFB. Expensive ($0.18/min). Best for pre-generated content.",
104
+ },
105
+ {
106
+ name: "macOS say",
107
+ key: "macos_say",
108
+ latencyMs: 50,
109
+ costPerMin: 0,
110
+ privacy: "local",
111
+ platforms: ["mac"],
112
+ quality: "medium",
113
+ streaming: false,
114
+ notes: "Built-in macOS speech synthesis. Instant, free, fully local. Limited voice quality.",
115
+ },
116
+ {
117
+ name: "Piper TTS",
118
+ key: "piper",
119
+ latencyMs: 200,
120
+ costPerMin: 0,
121
+ privacy: "local",
122
+ platforms: ["mac", "windows", "linux"],
123
+ quality: "medium",
124
+ streaming: false,
125
+ notes: "Fast local neural TTS. ONNX-based, runs on CPU. Good quality for a local engine. ~200ms.",
126
+ },
127
+ {
128
+ name: "Browser speechSynthesis",
129
+ key: "web_speech_api_tts",
130
+ latencyMs: 100,
131
+ costPerMin: 0,
132
+ privacy: "local",
133
+ platforms: ["browser"],
134
+ quality: "low",
135
+ streaming: false,
136
+ notes: "Free browser-native TTS. Uses OS voices. Quality varies by platform. Not available in Node.js.",
137
+ },
138
+ ];
139
+ const LLM_ENGINES = [
140
+ {
141
+ name: "Claude Haiku",
142
+ key: "claude_haiku",
143
+ firstTokenMs: 300,
144
+ tokensPerSec: 120,
145
+ costPer1kTokens: 0.00025,
146
+ privacy: "cloud",
147
+ quality: "medium",
148
+ notes: "Fastest Claude model. Good for voice where speed matters. ~300ms TTFT.",
149
+ },
150
+ {
151
+ name: "Claude Sonnet",
152
+ key: "claude_sonnet",
153
+ firstTokenMs: 600,
154
+ tokensPerSec: 80,
155
+ costPer1kTokens: 0.003,
156
+ privacy: "cloud",
157
+ quality: "high",
158
+ notes: "Best balance of speed and quality. ~600ms TTFT. Good for complex voice interactions.",
159
+ },
160
+ {
161
+ name: "GPT-4o-mini",
162
+ key: "gpt4o_mini",
163
+ firstTokenMs: 400,
164
+ tokensPerSec: 100,
165
+ costPer1kTokens: 0.00015,
166
+ privacy: "cloud",
167
+ quality: "medium",
168
+ notes: "Fast and cheap. ~400ms TTFT. Good for simple voice interactions.",
169
+ },
170
+ {
171
+ name: "Local Llama (llama.cpp)",
172
+ key: "local_llama",
173
+ firstTokenMs: 1000,
174
+ tokensPerSec: 30,
175
+ costPer1kTokens: 0,
176
+ privacy: "local",
177
+ quality: "medium",
178
+ notes: "Fully private, runs on consumer hardware. 2-5s for first response depending on model/hardware. Slower but free.",
179
+ },
180
+ ];
181
+ // ─── Helper functions ────────────────────────────────────────────────────────
182
+ function findStt(key) {
183
+ return STT_ENGINES.find((e) => e.key === key || e.name.toLowerCase().includes(key.toLowerCase()));
184
+ }
185
+ function findTts(key) {
186
+ return TTS_ENGINES.find((e) => e.key === key || e.name.toLowerCase().includes(key.toLowerCase()));
187
+ }
188
+ function findLlm(key) {
189
+ return LLM_ENGINES.find((e) => e.key === key || e.name.toLowerCase().includes(key.toLowerCase()));
190
+ }
191
+ function estimateRoundTrip(stt, tts, llm, streaming) {
192
+ const avgResponseTokens = 60;
193
+ const llmCompletionMs = Math.round((avgResponseTokens / llm.tokensPerSec) * 1000);
194
+ const ttsEffective = streaming && tts.streaming ? Math.round(tts.latencyMs * 0.5) : tts.latencyMs;
195
+ const totalMs = stt.latencyMs + llm.firstTokenMs + llmCompletionMs + tts.latencyMs;
196
+ // With streaming, user hears audio before LLM finishes
197
+ const perceivedMs = stt.latencyMs + llm.firstTokenMs + ttsEffective;
198
+ return {
199
+ totalMs,
200
+ perceivedMs,
201
+ breakdown: {
202
+ sttMs: stt.latencyMs,
203
+ llmFirstTokenMs: llm.firstTokenMs,
204
+ llmCompletionMs,
205
+ ttsMs: tts.latencyMs,
206
+ },
207
+ };
208
+ }
209
+ function estimateMonthlyCost(stt, tts, llm, minutesPerDay) {
210
+ const daysPerMonth = 30;
211
+ const totalMinutes = minutesPerDay * daysPerMonth;
212
+ // Assume ~150 tokens per minute of conversation (input + output)
213
+ const tokensPerMinute = 150;
214
+ const sttCost = stt.costPerMin * totalMinutes;
215
+ const ttsCost = tts.costPerMin * totalMinutes;
216
+ const llmCost = (tokensPerMinute / 1000) * llm.costPer1kTokens * totalMinutes;
217
+ return {
218
+ monthly: Math.round((sttCost + ttsCost + llmCost) * 100) / 100,
219
+ breakdown: {
220
+ stt: Math.round(sttCost * 100) / 100,
221
+ tts: Math.round(ttsCost * 100) / 100,
222
+ llm: Math.round(llmCost * 100) / 100,
223
+ },
224
+ };
225
+ }
226
+ function rateLatency(perceivedMs) {
227
+ if (perceivedMs <= 500)
228
+ return "excellent";
229
+ if (perceivedMs <= 1000)
230
+ return "good";
231
+ if (perceivedMs <= 2000)
232
+ return "acceptable";
233
+ return "poor";
234
+ }
235
+ // ─── Scaffold templates ──────────────────────────────────────────────────────
236
+ function getScaffoldWhisperEdge(includeVAD) {
237
+ const vadCode = includeVAD
238
+ ? `
239
+ import numpy as np
240
+
241
+ def detect_voice_activity(audio_chunk: np.ndarray, threshold: float = 0.02) -> bool:
242
+ """Simple energy-based Voice Activity Detection."""
243
+ energy = np.sqrt(np.mean(audio_chunk.astype(np.float32) ** 2))
244
+ return energy > threshold
245
+ `
246
+ : "";
247
+ const vadImport = includeVAD ? "from vad import detect_voice_activity\n" : "";
248
+ const vadUsage = includeVAD
249
+ ? `
250
+ # Check for voice activity before sending to Whisper
251
+ if not detect_voice_activity(audio_data):
252
+ continue`
253
+ : "";
254
+ return {
255
+ files: [
256
+ {
257
+ path: "voice_bridge.py",
258
+ content: `"""Voice Bridge: Whisper STT + Edge TTS + WebSocket server."""
259
+ import asyncio
260
+ import json
261
+ import tempfile
262
+ import wave
263
+ import whisper
264
+ import edge_tts
265
+ import websockets
266
+ ${vadImport}
267
+ # Load Whisper model (use "tiny" or "base" for speed, "small" for accuracy)
268
+ model = whisper.load_model("base")
269
+
270
+ async def transcribe(audio_data: bytes) -> str:
271
+ """Transcribe audio bytes using Whisper."""
272
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as f:
273
+ # Write raw PCM as WAV
274
+ with wave.open(f.name, "wb") as wf:
275
+ wf.setnchannels(1)
276
+ wf.setsampwidth(2)
277
+ wf.setframerate(16000)
278
+ wf.writeframes(audio_data)
279
+ result = model.transcribe(f.name, language="en")
280
+ return result["text"].strip()
281
+
282
+ async def synthesize(text: str) -> bytes:
283
+ """Synthesize speech using Edge TTS."""
284
+ communicate = edge_tts.Communicate(text, voice="en-US-AriaNeural")
285
+ audio_chunks = []
286
+ async for chunk in communicate.stream():
287
+ if chunk["type"] == "audio":
288
+ audio_chunks.append(chunk["data"])
289
+ return b"".join(audio_chunks)
290
+
291
+ async def handle_client(websocket):
292
+ """Handle a WebSocket client connection."""
293
+ print("Client connected")
294
+ try:
295
+ async for message in websocket:
296
+ if isinstance(message, bytes):${vadUsage}
297
+ # Transcribe speech to text
298
+ transcript = await transcribe(message)
299
+ print(f"User: {transcript}")
300
+
301
+ # TODO: Send transcript to your LLM and get response
302
+ llm_response = f"Echo: {transcript}"
303
+
304
+ # Synthesize response to speech
305
+ audio = await synthesize(llm_response)
306
+ await websocket.send(audio)
307
+ else:
308
+ # Handle text/control messages
309
+ data = json.loads(message)
310
+ if data.get("type") == "ping":
311
+ await websocket.send(json.dumps({{"type": "pong"}}))
312
+ except websockets.exceptions.ConnectionClosed:
313
+ print("Client disconnected")
314
+
315
+ async def main():
316
+ print("Voice Bridge starting on ws://localhost:8765")
317
+ async with websockets.serve(handle_client, "localhost", 8765):
318
+ await asyncio.Future() # Run forever
319
+
320
+ if __name__ == "__main__":
321
+ asyncio.run(main())
322
+ `,
323
+ description: "Main voice bridge server: Whisper STT + Edge TTS over WebSocket",
324
+ },
325
+ ...(includeVAD
326
+ ? [
327
+ {
328
+ path: "vad.py",
329
+ content: vadCode.trim() + "\n",
330
+ description: "Simple energy-based Voice Activity Detection",
331
+ },
332
+ ]
333
+ : []),
334
+ {
335
+ path: "requirements.txt",
336
+ content: `openai-whisper
337
+ edge-tts
338
+ websockets
339
+ numpy
340
+ `,
341
+ description: "Python dependencies",
342
+ },
343
+ ],
344
+ setupInstructions: [
345
+ "pip install -r requirements.txt",
346
+ "python voice_bridge.py",
347
+ "Connect a WebSocket client to ws://localhost:8765",
348
+ "Send raw PCM audio (16kHz, 16-bit, mono) as binary WebSocket frames",
349
+ "Receive synthesized audio back as binary frames",
350
+ ],
351
+ dependencies: ["openai-whisper", "edge-tts", "websockets", "numpy", "ffmpeg (system)"],
352
+ };
353
+ }
354
+ function getScaffoldDeepgramCartesia(includeVAD) {
355
+ const vadBlock = includeVAD
356
+ ? `
357
+ // Simple energy-based VAD
358
+ function detectVoiceActivity(samples: Float32Array, threshold = 0.02): boolean {
359
+ let energy = 0;
360
+ for (let i = 0; i < samples.length; i++) energy += samples[i] * samples[i];
361
+ return Math.sqrt(energy / samples.length) > threshold;
362
+ }
363
+ `
364
+ : "";
365
+ return {
366
+ files: [
367
+ {
368
+ path: "src/voice-bridge.ts",
369
+ content: `/**
370
+ * Voice Bridge: Deepgram STT + Cartesia TTS (TypeScript)
371
+ *
372
+ * Requires:
373
+ * DEEPGRAM_API_KEY — from https://console.deepgram.com
374
+ * CARTESIA_API_KEY — from https://play.cartesia.ai
375
+ */
376
+ import { createClient, LiveTranscriptionEvents } from "@deepgram/sdk";
377
+ import Cartesia from "@cartesia/cartesia-js";
378
+ import { WebSocketServer } from "ws";
379
+
380
+ const deepgram = createClient(process.env.DEEPGRAM_API_KEY!);
381
+ const cartesia = new Cartesia({ apiKey: process.env.CARTESIA_API_KEY! });
382
+ ${vadBlock}
383
+ const wss = new WebSocketServer({ port: 8765 });
384
+ console.log("Voice Bridge listening on ws://localhost:8765");
385
+
386
+ wss.on("connection", (ws) => {
387
+ console.log("Client connected");
388
+
389
+ // Set up Deepgram live transcription
390
+ const dgConn = deepgram.listen.live({
391
+ model: "nova-2",
392
+ language: "en",
393
+ smart_format: true,
394
+ interim_results: false,
395
+ });
396
+
397
+ dgConn.on(LiveTranscriptionEvents.Open, () => {
398
+ console.log("Deepgram connection open");
399
+ });
400
+
401
+ dgConn.on(LiveTranscriptionEvents.Transcript, async (data) => {
402
+ const transcript = data.channel?.alternatives?.[0]?.transcript;
403
+ if (!transcript) return;
404
+
405
+ console.log(\`User: \${transcript}\`);
406
+
407
+ // TODO: Send transcript to your LLM and get response
408
+ const llmResponse = \`Echo: \${transcript}\`;
409
+
410
+ // Synthesize with Cartesia (streaming)
411
+ const ttsResponse = await cartesia.tts.sse({
412
+ modelId: "sonic-english",
413
+ transcript: llmResponse,
414
+ voice: { mode: "id", id: "a0e99841-438c-4a64-b679-ae501e7d6091" },
415
+ output_format: { container: "raw", encoding: "pcm_s16le", sample_rate: 24000 },
416
+ });
417
+
418
+ for await (const chunk of ttsResponse) {
419
+ if (ws.readyState === ws.OPEN) {
420
+ ws.send(chunk);
421
+ }
422
+ }
423
+ });
424
+
425
+ dgConn.on(LiveTranscriptionEvents.Error, (err) => {
426
+ console.error("Deepgram error:", err);
427
+ });
428
+
429
+ ws.on("message", (data: Buffer) => {${includeVAD ? "\n // VAD check\n const samples = new Float32Array(data.buffer);\n if (!detectVoiceActivity(samples)) return;\n" : ""}
430
+ dgConn.send(data);
431
+ });
432
+
433
+ ws.on("close", () => {
434
+ dgConn.finish();
435
+ console.log("Client disconnected");
436
+ });
437
+ });
438
+ `,
439
+ description: "TypeScript voice bridge: Deepgram Nova-2 STT + Cartesia Sonic streaming TTS",
440
+ },
441
+ {
442
+ path: "package.json",
443
+ content: JSON.stringify({
444
+ name: "voice-bridge-deepgram-cartesia",
445
+ type: "module",
446
+ scripts: { start: "tsx src/voice-bridge.ts", build: "tsc" },
447
+ dependencies: {
448
+ "@deepgram/sdk": "^3.0.0",
449
+ "@cartesia/cartesia-js": "^1.0.0",
450
+ ws: "^8.0.0",
451
+ },
452
+ devDependencies: { typescript: "^5.7.0", tsx: "^4.0.0", "@types/ws": "^8.0.0" },
453
+ }, null, 2),
454
+ description: "Package manifest with Deepgram and Cartesia SDKs",
455
+ },
456
+ ],
457
+ setupInstructions: [
458
+ "npm install",
459
+ "Set DEEPGRAM_API_KEY and CARTESIA_API_KEY environment variables",
460
+ "npx tsx src/voice-bridge.ts",
461
+ "Connect a WebSocket client to ws://localhost:8765",
462
+ "Send raw PCM audio as binary frames, receive synthesized audio back",
463
+ ],
464
+ dependencies: ["@deepgram/sdk", "@cartesia/cartesia-js", "ws", "tsx"],
465
+ };
466
+ }
467
+ function getScaffoldBrowserWebspeech() {
468
+ return {
469
+ files: [
470
+ {
471
+ path: "index.html",
472
+ content: `<!DOCTYPE html>
473
+ <html lang="en">
474
+ <head>
475
+ <meta charset="UTF-8">
476
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
477
+ <title>Voice Bridge — Web Speech API</title>
478
+ <style>
479
+ body { font-family: system-ui, sans-serif; max-width: 600px; margin: 2rem auto; padding: 0 1rem; }
480
+ #status { padding: 0.5rem; border-radius: 4px; margin: 1rem 0; }
481
+ .listening { background: #dcfce7; color: #166534; }
482
+ .speaking { background: #dbeafe; color: #1e40af; }
483
+ .idle { background: #f3f4f6; color: #374151; }
484
+ #transcript { white-space: pre-wrap; border: 1px solid #d1d5db; padding: 1rem; border-radius: 4px; min-height: 200px; max-height: 400px; overflow-y: auto; }
485
+ button { padding: 0.75rem 1.5rem; font-size: 1rem; border: none; border-radius: 4px; cursor: pointer; margin: 0.25rem; }
486
+ #startBtn { background: #2563eb; color: white; }
487
+ #startBtn:disabled { background: #93c5fd; cursor: not-allowed; }
488
+ #stopBtn { background: #dc2626; color: white; }
489
+ </style>
490
+ </head>
491
+ <body>
492
+ <h1>Voice Bridge</h1>
493
+ <p>Browser-native STT + TTS using the Web Speech API. No server needed.</p>
494
+ <div>
495
+ <button id="startBtn" onclick="startListening()">Start Listening</button>
496
+ <button id="stopBtn" onclick="stopListening()">Stop</button>
497
+ </div>
498
+ <div id="status" class="idle">Idle</div>
499
+ <div id="transcript"></div>
500
+
501
+ <script>
502
+ const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
503
+ if (!SpeechRecognition) {
504
+ document.getElementById("status").textContent = "Web Speech API not supported in this browser.";
505
+ }
506
+
507
+ const recognition = new SpeechRecognition();
508
+ recognition.continuous = true;
509
+ recognition.interimResults = true;
510
+ recognition.lang = "en-US";
511
+
512
+ const synth = window.speechSynthesis;
513
+ const transcriptEl = document.getElementById("transcript");
514
+ const statusEl = document.getElementById("status");
515
+ let isListening = false;
516
+
517
+ function setStatus(text, cls) {
518
+ statusEl.textContent = text;
519
+ statusEl.className = cls;
520
+ }
521
+
522
+ function appendTranscript(role, text) {
523
+ transcriptEl.textContent += role + ": " + text + "\\n";
524
+ transcriptEl.scrollTop = transcriptEl.scrollHeight;
525
+ }
526
+
527
+ recognition.onresult = (event) => {
528
+ const last = event.results[event.results.length - 1];
529
+ if (last.isFinal) {
530
+ const transcript = last[0].transcript.trim();
531
+ if (!transcript) return;
532
+
533
+ appendTranscript("You", transcript);
534
+ setStatus("Processing...", "speaking");
535
+
536
+ // TODO: Send transcript to your LLM API and get response
537
+ const llmResponse = "Echo: " + transcript;
538
+
539
+ appendTranscript("Assistant", llmResponse);
540
+
541
+ // Speak the response
542
+ const utterance = new SpeechSynthesisUtterance(llmResponse);
543
+ utterance.onstart = () => setStatus("Speaking...", "speaking");
544
+ utterance.onend = () => setStatus("Listening...", "listening");
545
+ synth.speak(utterance);
546
+ }
547
+ };
548
+
549
+ recognition.onerror = (event) => {
550
+ console.error("Speech recognition error:", event.error);
551
+ if (event.error !== "no-speech") {
552
+ setStatus("Error: " + event.error, "idle");
553
+ }
554
+ };
555
+
556
+ recognition.onend = () => {
557
+ if (isListening) recognition.start(); // Auto-restart
558
+ };
559
+
560
+ function startListening() {
561
+ isListening = true;
562
+ recognition.start();
563
+ setStatus("Listening...", "listening");
564
+ document.getElementById("startBtn").disabled = true;
565
+ }
566
+
567
+ function stopListening() {
568
+ isListening = false;
569
+ recognition.stop();
570
+ synth.cancel();
571
+ setStatus("Idle", "idle");
572
+ document.getElementById("startBtn").disabled = false;
573
+ }
574
+ </script>
575
+ </body>
576
+ </html>
577
+ `,
578
+ description: "Self-contained HTML page using Web Speech API for both STT and TTS",
579
+ },
580
+ ],
581
+ setupInstructions: [
582
+ "Open index.html in Chrome or Edge (best Web Speech API support)",
583
+ "Click 'Start Listening' and allow microphone access",
584
+ "Speak — your words will be transcribed and echoed back via TTS",
585
+ "Replace the 'Echo' logic with an LLM API call for real conversations",
586
+ ],
587
+ dependencies: [],
588
+ };
589
+ }
590
+ function getScaffoldWhisperPiper(includeVAD) {
591
+ const vadCode = includeVAD
592
+ ? `
593
+ import numpy as np
594
+
595
+ def detect_voice_activity(audio_chunk: np.ndarray, threshold: float = 0.02) -> bool:
596
+ """Simple energy-based Voice Activity Detection."""
597
+ energy = np.sqrt(np.mean(audio_chunk.astype(np.float32) ** 2))
598
+ return energy > threshold
599
+ `
600
+ : "";
601
+ const vadImport = includeVAD ? "from vad import detect_voice_activity\n" : "";
602
+ const vadUsage = includeVAD
603
+ ? `
604
+ # Check for voice activity
605
+ if not detect_voice_activity(audio_data):
606
+ continue`
607
+ : "";
608
+ return {
609
+ files: [
610
+ {
611
+ path: "voice_bridge.py",
612
+ content: `"""Fully Local Voice Bridge: Whisper STT + Piper TTS. No cloud dependencies."""
613
+ import asyncio
614
+ import json
615
+ import subprocess
616
+ import tempfile
617
+ import wave
618
+ import whisper
619
+ import websockets
620
+ ${vadImport}
621
+ # Load Whisper model
622
+ model = whisper.load_model("base")
623
+
624
+ # Piper TTS configuration
625
+ PIPER_MODEL = "en_US-lessac-medium" # Download from https://github.com/rhasspy/piper/releases
626
+ PIPER_BIN = "piper" # Ensure piper is on PATH
627
+
628
+ async def transcribe(audio_data: bytes) -> str:
629
+ """Transcribe audio bytes using Whisper."""
630
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as f:
631
+ with wave.open(f.name, "wb") as wf:
632
+ wf.setnchannels(1)
633
+ wf.setsampwidth(2)
634
+ wf.setframerate(16000)
635
+ wf.writeframes(audio_data)
636
+ result = model.transcribe(f.name, language="en")
637
+ return result["text"].strip()
638
+
639
+ async def synthesize(text: str) -> bytes:
640
+ """Synthesize speech using Piper TTS (local, ONNX-based)."""
641
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as f:
642
+ proc = await asyncio.create_subprocess_exec(
643
+ PIPER_BIN,
644
+ "--model", PIPER_MODEL,
645
+ "--output_file", f.name,
646
+ stdin=asyncio.subprocess.PIPE,
647
+ stdout=asyncio.subprocess.PIPE,
648
+ stderr=asyncio.subprocess.PIPE,
649
+ )
650
+ await proc.communicate(input=text.encode("utf-8"))
651
+ with open(f.name, "rb") as audio:
652
+ return audio.read()
653
+
654
+ async def handle_client(websocket):
655
+ """Handle a WebSocket client connection."""
656
+ print("Client connected")
657
+ try:
658
+ async for message in websocket:
659
+ if isinstance(message, bytes):${vadUsage}
660
+ transcript = await transcribe(message)
661
+ print(f"User: {transcript}")
662
+
663
+ # TODO: Send to local LLM (e.g. llama.cpp server)
664
+ llm_response = f"Echo: {transcript}"
665
+
666
+ audio = await synthesize(llm_response)
667
+ await websocket.send(audio)
668
+ else:
669
+ data = json.loads(message)
670
+ if data.get("type") == "ping":
671
+ await websocket.send(json.dumps({{"type": "pong"}}))
672
+ except websockets.exceptions.ConnectionClosed:
673
+ print("Client disconnected")
674
+
675
+ async def main():
676
+ print("Local Voice Bridge starting on ws://localhost:8765")
677
+ print("Fully offline — no cloud APIs used.")
678
+ async with websockets.serve(handle_client, "localhost", 8765):
679
+ await asyncio.Future()
680
+
681
+ if __name__ == "__main__":
682
+ asyncio.run(main())
683
+ `,
684
+ description: "Fully local voice bridge: Whisper + Piper (no cloud dependencies)",
685
+ },
686
+ ...(includeVAD
687
+ ? [
688
+ {
689
+ path: "vad.py",
690
+ content: vadCode.trim() + "\n",
691
+ description: "Simple energy-based Voice Activity Detection",
692
+ },
693
+ ]
694
+ : []),
695
+ {
696
+ path: "requirements.txt",
697
+ content: `openai-whisper
698
+ websockets
699
+ numpy
700
+ `,
701
+ description: "Python dependencies (Piper installed separately as binary)",
702
+ },
703
+ ],
704
+ setupInstructions: [
705
+ "pip install -r requirements.txt",
706
+ "Download Piper binary from https://github.com/rhasspy/piper/releases",
707
+ "Download a Piper voice model (e.g. en_US-lessac-medium.onnx)",
708
+ "Ensure 'piper' is on your PATH",
709
+ "python voice_bridge.py",
710
+ "Connect via WebSocket at ws://localhost:8765",
711
+ ],
712
+ dependencies: ["openai-whisper", "websockets", "numpy", "piper (system binary)", "ffmpeg (system)"],
713
+ };
714
+ }
715
+ function getScaffoldCustom(language, includeVAD) {
716
+ if (language === "python") {
717
+ const vadBlock = includeVAD
718
+ ? `
719
+ class SimpleVAD:
720
+ """Energy-based Voice Activity Detection."""
721
+ def __init__(self, threshold: float = 0.02):
722
+ self.threshold = threshold
723
+
724
+ def is_speech(self, audio_chunk) -> bool:
725
+ import numpy as np
726
+ energy = np.sqrt(np.mean(np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) ** 2))
727
+ return energy > self.threshold
728
+ `
729
+ : "";
730
+ return {
731
+ files: [
732
+ {
733
+ path: "voice_bridge.py",
734
+ content: `"""Custom Voice Bridge Template — Plug in any STT/TTS/LLM."""
735
+ import asyncio
736
+ import json
737
+ from abc import ABC, abstractmethod
738
+ import websockets
739
+
740
+
741
+ class STTProvider(ABC):
742
+ @abstractmethod
743
+ async def transcribe(self, audio_data: bytes) -> str: ...
744
+
745
+ class TTSProvider(ABC):
746
+ @abstractmethod
747
+ async def synthesize(self, text: str) -> bytes: ...
748
+
749
+ class LLMProvider(ABC):
750
+ @abstractmethod
751
+ async def generate(self, prompt: str, history: list[dict]) -> str: ...
752
+ ${vadBlock}
753
+
754
+ # ─── Implement your providers here ────────────────────────────────
755
+
756
+ class MySTT(STTProvider):
757
+ async def transcribe(self, audio_data: bytes) -> str:
758
+ # TODO: Replace with your STT implementation
759
+ raise NotImplementedError("Implement STT provider")
760
+
761
+ class MyTTS(TTSProvider):
762
+ async def synthesize(self, text: str) -> bytes:
763
+ # TODO: Replace with your TTS implementation
764
+ raise NotImplementedError("Implement TTS provider")
765
+
766
+ class MyLLM(LLMProvider):
767
+ async def generate(self, prompt: str, history: list[dict]) -> str:
768
+ # TODO: Replace with your LLM implementation
769
+ return f"Echo: {prompt}"
770
+
771
+
772
+ # ─── Voice bridge server ──────────────────────────────────────────
773
+
774
+ class VoiceBridge:
775
+ def __init__(self, stt: STTProvider, tts: TTSProvider, llm: LLMProvider):
776
+ self.stt = stt
777
+ self.tts = tts
778
+ self.llm = llm
779
+
780
+ async def handle_client(self, websocket):
781
+ history = []
782
+ async for message in websocket:
783
+ if isinstance(message, bytes):
784
+ transcript = await self.stt.transcribe(message)
785
+ history.append({"role": "user", "content": transcript})
786
+ response = await self.llm.generate(transcript, history)
787
+ history.append({"role": "assistant", "content": response})
788
+ audio = await self.tts.synthesize(response)
789
+ await websocket.send(audio)
790
+ else:
791
+ data = json.loads(message)
792
+ await websocket.send(json.dumps({"type": "pong"}))
793
+
794
+ async def start(self, host: str = "localhost", port: int = 8765):
795
+ print(f"Voice Bridge on ws://{host}:{port}")
796
+ async with websockets.serve(self.handle_client, host, port):
797
+ await asyncio.Future()
798
+
799
+
800
+ if __name__ == "__main__":
801
+ bridge = VoiceBridge(stt=MySTT(), tts=MyTTS(), llm=MyLLM())
802
+ asyncio.run(bridge.start())
803
+ `,
804
+ description: "Pluggable voice bridge template with abstract STT/TTS/LLM interfaces",
805
+ },
806
+ ],
807
+ setupInstructions: [
808
+ "Implement MySTT, MyTTS, and MyLLM classes with your chosen providers",
809
+ "pip install websockets (+ your provider SDKs)",
810
+ "python voice_bridge.py",
811
+ ],
812
+ dependencies: ["websockets"],
813
+ };
814
+ }
815
+ // TypeScript custom template
816
+ const vadBlock = includeVAD
817
+ ? `
818
+ /** Simple energy-based Voice Activity Detection. */
819
+ function detectVoiceActivity(samples: Float32Array, threshold = 0.02): boolean {
820
+ let energy = 0;
821
+ for (let i = 0; i < samples.length; i++) energy += samples[i] * samples[i];
822
+ return Math.sqrt(energy / samples.length) > threshold;
823
+ }
824
+ `
825
+ : "";
826
+ return {
827
+ files: [
828
+ {
829
+ path: "src/voice-bridge.ts",
830
+ content: `/**
831
+ * Custom Voice Bridge Template — Plug in any STT/TTS/LLM.
832
+ */
833
+ import { WebSocketServer } from "ws";
834
+
835
+ // ─── Provider interfaces ─────────────────────────────────────────
836
+
837
+ interface STTProvider {
838
+ transcribe(audio: Buffer): Promise<string>;
839
+ }
840
+
841
+ interface TTSProvider {
842
+ synthesize(text: string): Promise<Buffer>;
843
+ }
844
+
845
+ interface LLMProvider {
846
+ generate(prompt: string, history: Array<{ role: string; content: string }>): Promise<string>;
847
+ }
848
+ ${vadBlock}
849
+ // ─── Implement your providers here ───────────────────────────────
850
+
851
+ class MySTT implements STTProvider {
852
+ async transcribe(audio: Buffer): Promise<string> {
853
+ // TODO: Replace with your STT implementation
854
+ throw new Error("Implement STT provider");
855
+ }
856
+ }
857
+
858
+ class MyTTS implements TTSProvider {
859
+ async synthesize(text: string): Promise<Buffer> {
860
+ // TODO: Replace with your TTS implementation
861
+ throw new Error("Implement TTS provider");
862
+ }
863
+ }
864
+
865
+ class MyLLM implements LLMProvider {
866
+ async generate(prompt: string, history: Array<{ role: string; content: string }>): Promise<string> {
867
+ // TODO: Replace with your LLM implementation
868
+ return \`Echo: \${prompt}\`;
869
+ }
870
+ }
871
+
872
+ // ─── Voice bridge server ─────────────────────────────────────────
873
+
874
+ const stt = new MySTT();
875
+ const tts = new MyTTS();
876
+ const llm = new MyLLM();
877
+
878
+ const wss = new WebSocketServer({ port: 8765 });
879
+ console.log("Voice Bridge listening on ws://localhost:8765");
880
+
881
+ wss.on("connection", (ws) => {
882
+ const history: Array<{ role: string; content: string }> = [];
883
+
884
+ ws.on("message", async (data: Buffer) => {
885
+ try {
886
+ const transcript = await stt.transcribe(data);
887
+ history.push({ role: "user", content: transcript });
888
+
889
+ const response = await llm.generate(transcript, history);
890
+ history.push({ role: "assistant", content: response });
891
+
892
+ const audio = await tts.synthesize(response);
893
+ ws.send(audio);
894
+ } catch (err) {
895
+ console.error("Pipeline error:", err);
896
+ }
897
+ });
898
+ });
899
+ `,
900
+ description: "Pluggable voice bridge template with STT/TTS/LLM interfaces",
901
+ },
902
+ {
903
+ path: "package.json",
904
+ content: JSON.stringify({
905
+ name: "voice-bridge-custom",
906
+ type: "module",
907
+ scripts: { start: "tsx src/voice-bridge.ts", build: "tsc" },
908
+ dependencies: { ws: "^8.0.0" },
909
+ devDependencies: { typescript: "^5.7.0", tsx: "^4.0.0", "@types/ws": "^8.0.0" },
910
+ }, null, 2),
911
+ description: "Package manifest — add your provider SDKs as dependencies",
912
+ },
913
+ ],
914
+ setupInstructions: [
915
+ "npm install",
916
+ "Implement MySTT, MyTTS, and MyLLM classes with your chosen providers",
917
+ "Add provider SDK packages to dependencies",
918
+ "npx tsx src/voice-bridge.ts",
919
+ ],
920
+ dependencies: ["ws", "tsx"],
921
+ };
922
+ }
923
+ // ─── Tools ───────────────────────────────────────────────────────────────────
924
+ export const voiceBridgeTools = [
925
+ // ─── Tool 1: design_voice_pipeline ──────────────────────────────────────────
926
+ {
927
+ name: "design_voice_pipeline",
928
+ description: "Given requirements (latency, privacy, platform, budget), recommend an optimal STT/TTS/LLM stack. Returns top 3 configurations ranked by fit, with estimated round-trip latency, monthly cost, and architecture notes. Knowledge-based — no live measurements.",
929
+ inputSchema: {
930
+ type: "object",
931
+ properties: {
932
+ requirements: {
933
+ type: "object",
934
+ description: "Pipeline requirements to match against",
935
+ properties: {
936
+ maxLatencyMs: {
937
+ type: "number",
938
+ description: "Maximum acceptable round-trip latency in ms (default: 2000)",
939
+ },
940
+ privacy: {
941
+ type: "string",
942
+ enum: ["local", "cloud", "hybrid"],
943
+ description: "Privacy requirement: 'local' (no data leaves device), 'cloud' (OK), 'hybrid' (STT/TTS local, LLM cloud)",
944
+ },
945
+ platform: {
946
+ type: "string",
947
+ enum: ["mac", "windows", "linux", "browser", "mobile"],
948
+ description: "Target platform",
949
+ },
950
+ budget: {
951
+ type: "string",
952
+ enum: ["free", "low", "medium", "enterprise"],
953
+ description: "Budget tier: free ($0), low (<$10/mo), medium (<$100/mo), enterprise (unlimited)",
954
+ },
955
+ },
956
+ },
957
+ },
958
+ required: ["requirements"],
959
+ },
960
+ handler: async (args) => {
961
+ const reqs = args.requirements ?? {};
962
+ const maxLatency = reqs.maxLatencyMs ?? 2000;
963
+ const privacy = reqs.privacy ?? "cloud";
964
+ const platform = reqs.platform ?? "linux";
965
+ const budget = reqs.budget ?? "medium";
966
+ const budgetCaps = { free: 0, low: 10, medium: 100, enterprise: Infinity };
967
+ const monthlyCap = budgetCaps[budget] ?? 100;
968
+ // Filter compatible engines
969
+ const filteredStt = STT_ENGINES.filter((e) => {
970
+ if (platform && !e.platforms.includes(platform))
971
+ return false;
972
+ if (privacy === "local" && e.privacy !== "local")
973
+ return false;
974
+ if (budget === "free" && e.costPerMin > 0)
975
+ return false;
976
+ return true;
977
+ });
978
+ const filteredTts = TTS_ENGINES.filter((e) => {
979
+ if (platform && !e.platforms.includes(platform))
980
+ return false;
981
+ if (privacy === "local" && e.privacy !== "local")
982
+ return false;
983
+ if (budget === "free" && e.costPerMin > 0)
984
+ return false;
985
+ return true;
986
+ });
987
+ const filteredLlm = LLM_ENGINES.filter((e) => {
988
+ if (privacy === "local" && e.privacy !== "local")
989
+ return false;
990
+ if (budget === "free" && e.costPer1kTokens > 0)
991
+ return false;
992
+ return true;
993
+ });
994
+ // For hybrid privacy: allow cloud LLM but keep STT/TTS local
995
+ const hybridLlm = privacy === "hybrid"
996
+ ? LLM_ENGINES.filter((e) => budget === "free" ? e.costPer1kTokens === 0 : true)
997
+ : filteredLlm;
998
+ const effectiveLlm = privacy === "hybrid" ? hybridLlm : filteredLlm;
999
+ if (filteredStt.length === 0 || filteredTts.length === 0 || effectiveLlm.length === 0) {
1000
+ return {
1001
+ error: false,
1002
+ recommended: null,
1003
+ alternatives: [],
1004
+ message: "No compatible configuration found for the given requirements. Try relaxing privacy or budget constraints.",
1005
+ availableStt: filteredStt.map((e) => e.name),
1006
+ availableTts: filteredTts.map((e) => e.name),
1007
+ availableLlm: effectiveLlm.map((e) => e.name),
1008
+ considerations: [
1009
+ filteredStt.length === 0 ? `No STT engines match platform=${platform} + privacy=${privacy}` : null,
1010
+ filteredTts.length === 0 ? `No TTS engines match platform=${platform} + privacy=${privacy}` : null,
1011
+ effectiveLlm.length === 0 ? `No LLM engines match privacy=${privacy} + budget=${budget}` : null,
1012
+ ].filter(Boolean),
1013
+ };
1014
+ }
1015
+ // Generate all valid combinations and score them
1016
+ const candidates = [];
1017
+ for (const stt of filteredStt) {
1018
+ for (const tts of filteredTts) {
1019
+ for (const llm of effectiveLlm) {
1020
+ const latency = estimateRoundTrip(stt, tts, llm, true);
1021
+ const cost = estimateMonthlyCost(stt, tts, llm, 30); // 30 min/day estimate
1022
+ // Skip if over latency or budget
1023
+ if (latency.perceivedMs > maxLatency * 1.5)
1024
+ continue;
1025
+ if (cost.monthly > monthlyCap)
1026
+ continue;
1027
+ // Score: lower is better (latency-weighted + cost penalty)
1028
+ const latencyScore = latency.perceivedMs / maxLatency;
1029
+ const costScore = monthlyCap > 0 ? cost.monthly / monthlyCap : 0;
1030
+ const qualityBonus = (stt.quality === "high" ? -0.1 : 0) +
1031
+ (tts.quality === "high" ? -0.1 : 0) +
1032
+ (llm.quality === "high" ? -0.1 : 0);
1033
+ candidates.push({
1034
+ stt,
1035
+ tts,
1036
+ llm,
1037
+ latency,
1038
+ cost,
1039
+ score: latencyScore * 0.5 + costScore * 0.3 + qualityBonus,
1040
+ });
1041
+ }
1042
+ }
1043
+ }
1044
+ candidates.sort((a, b) => a.score - b.score);
1045
+ const top3 = candidates.slice(0, 3);
1046
+ if (top3.length === 0) {
1047
+ return {
1048
+ error: false,
1049
+ recommended: null,
1050
+ alternatives: [],
1051
+ message: "No configurations meet the latency and budget constraints simultaneously. Try increasing maxLatencyMs or budget.",
1052
+ considerations: [
1053
+ `Max latency: ${maxLatency}ms`,
1054
+ `Monthly cap: $${monthlyCap}`,
1055
+ `Platform: ${platform}`,
1056
+ `Privacy: ${privacy}`,
1057
+ ],
1058
+ };
1059
+ }
1060
+ const formatCandidate = (c) => ({
1061
+ stt: { name: c.stt.name, key: c.stt.key, latencyMs: c.stt.latencyMs, privacy: c.stt.privacy },
1062
+ tts: { name: c.tts.name, key: c.tts.key, latencyMs: c.tts.latencyMs, privacy: c.tts.privacy },
1063
+ llm: { name: c.llm.name, key: c.llm.key, firstTokenMs: c.llm.firstTokenMs, privacy: c.llm.privacy },
1064
+ estimatedLatencyMs: c.latency.perceivedMs,
1065
+ totalLatencyMs: c.latency.totalMs,
1066
+ latencyBreakdown: c.latency.breakdown,
1067
+ monthlyEstimate: `$${c.cost.monthly}`,
1068
+ costBreakdown: c.cost.breakdown,
1069
+ rating: rateLatency(c.latency.perceivedMs),
1070
+ });
1071
+ // Determine overall architecture
1072
+ const best = top3[0];
1073
+ const allLocal = best.stt.privacy === "local" && best.tts.privacy === "local" && best.llm.privacy === "local";
1074
+ const allCloud = best.stt.privacy === "cloud" && best.tts.privacy === "cloud" && best.llm.privacy === "cloud";
1075
+ const architecture = allLocal ? "local" : allCloud ? "cloud" : "hybrid";
1076
+ // Build considerations
1077
+ const considerations = [];
1078
+ if (best.latency.perceivedMs > 1500) {
1079
+ considerations.push("Perceived latency exceeds 1.5s — consider streaming TTS or a faster LLM.");
1080
+ }
1081
+ if (best.stt.privacy === "cloud") {
1082
+ considerations.push("STT sends audio to cloud — ensure compliance with data privacy requirements.");
1083
+ }
1084
+ if (best.tts.costPerMin > 0.1) {
1085
+ considerations.push("TTS cost is high — consider caching common phrases or pre-generating audio.");
1086
+ }
1087
+ if (platform === "browser" && (best.stt.key === "whisper" || best.tts.key === "piper")) {
1088
+ considerations.push("Selected engines require server-side processing — add a WebSocket bridge for browser use.");
1089
+ }
1090
+ if (best.llm.privacy === "local") {
1091
+ considerations.push("Local LLM requires adequate GPU/CPU — test on target hardware before committing.");
1092
+ }
1093
+ return {
1094
+ recommended: formatCandidate(best),
1095
+ alternatives: top3.slice(1).map(formatCandidate),
1096
+ architecture,
1097
+ considerations,
1098
+ _quickRef: {
1099
+ nextAction: "Use analyze_voice_config to validate the recommended stack, or generate_voice_scaffold to get starter code.",
1100
+ nextTools: ["analyze_voice_config", "generate_voice_scaffold", "benchmark_voice_latency"],
1101
+ },
1102
+ };
1103
+ },
1104
+ },
1105
+ // ─── Tool 2: analyze_voice_config ───────────────────────────────────────────
1106
+ {
1107
+ name: "analyze_voice_config",
1108
+ description: "Validate a voice pipeline configuration. Checks STT/TTS/LLM compatibility, estimates per-minute costs, identifies the latency bottleneck, and suggests optimizations (streaming, caching, pre-generation). Knowledge-based analysis, no live calls.",
1109
+ inputSchema: {
1110
+ type: "object",
1111
+ properties: {
1112
+ stt: {
1113
+ type: "string",
1114
+ description: "STT engine key or name (e.g. 'whisper', 'deepgram', 'web_speech_api_stt')",
1115
+ },
1116
+ tts: {
1117
+ type: "string",
1118
+ description: "TTS engine key or name (e.g. 'edge_tts', 'cartesia', 'piper', 'web_speech_api_tts')",
1119
+ },
1120
+ llm: {
1121
+ type: "string",
1122
+ description: "LLM key or name (e.g. 'claude_haiku', 'gpt4o_mini', 'local_llama')",
1123
+ },
1124
+ targetLatencyMs: {
1125
+ type: "number",
1126
+ description: "Target round-trip latency in ms (default: 1500)",
1127
+ },
1128
+ },
1129
+ required: ["stt", "tts", "llm"],
1130
+ },
1131
+ handler: async (args) => {
1132
+ const sttEngine = findStt(args.stt);
1133
+ const ttsEngine = findTts(args.tts);
1134
+ const llmEngine = findLlm(args.llm);
1135
+ const targetLatency = args.targetLatencyMs ?? 1500;
1136
+ const warnings = [];
1137
+ const compatibility = [];
1138
+ const optimizations = [];
1139
+ // Check if engines are recognized
1140
+ if (!sttEngine) {
1141
+ warnings.push(`Unknown STT engine: '${args.stt}'. Known engines: ${STT_ENGINES.map((e) => e.key).join(", ")}`);
1142
+ }
1143
+ if (!ttsEngine) {
1144
+ warnings.push(`Unknown TTS engine: '${args.tts}'. Known engines: ${TTS_ENGINES.map((e) => e.key).join(", ")}`);
1145
+ }
1146
+ if (!llmEngine) {
1147
+ warnings.push(`Unknown LLM engine: '${args.llm}'. Known engines: ${LLM_ENGINES.map((e) => e.key).join(", ")}`);
1148
+ }
1149
+ if (!sttEngine || !ttsEngine || !llmEngine) {
1150
+ return {
1151
+ valid: false,
1152
+ compatibility: [],
1153
+ estimatedLatencyMs: null,
1154
+ costPerMinute: null,
1155
+ bottleneck: null,
1156
+ optimizations: [],
1157
+ warnings,
1158
+ };
1159
+ }
1160
+ // Compatibility checks
1161
+ const sttBrowserOnly = sttEngine.key === "web_speech_api_stt";
1162
+ const ttsBrowserOnly = ttsEngine.key === "web_speech_api_tts";
1163
+ if (sttBrowserOnly && !ttsBrowserOnly) {
1164
+ compatibility.push("Web Speech API STT is browser-only — TTS engine must also work in-browser or use a server bridge.");
1165
+ }
1166
+ if (ttsBrowserOnly && !sttBrowserOnly) {
1167
+ compatibility.push("Browser speechSynthesis is browser-only — STT engine must also work in-browser or use a server bridge.");
1168
+ }
1169
+ if (sttBrowserOnly || ttsBrowserOnly) {
1170
+ compatibility.push("Web Speech API is not available in Node.js — this config requires a browser environment.");
1171
+ }
1172
+ if (ttsEngine.key === "macos_say" && sttEngine.key !== "whisper_mlx" && sttEngine.key !== "whisper") {
1173
+ compatibility.push("macOS 'say' is Mac-only — ensure STT engine also supports macOS.");
1174
+ }
1175
+ if (llmEngine.privacy === "local" && sttEngine.privacy === "cloud") {
1176
+ compatibility.push("Mixed privacy: local LLM but cloud STT — audio still leaves the device.");
1177
+ }
1178
+ if (compatibility.length === 0) {
1179
+ compatibility.push("All components are compatible.");
1180
+ }
1181
+ // Latency estimation
1182
+ const latency = estimateRoundTrip(sttEngine, ttsEngine, llmEngine, true);
1183
+ const batchLatency = estimateRoundTrip(sttEngine, ttsEngine, llmEngine, false);
1184
+ // Identify bottleneck
1185
+ const { breakdown } = latency;
1186
+ const components = [
1187
+ { name: "STT", ms: breakdown.sttMs },
1188
+ { name: "LLM (first token)", ms: breakdown.llmFirstTokenMs },
1189
+ { name: "LLM (completion)", ms: breakdown.llmCompletionMs },
1190
+ { name: "TTS", ms: breakdown.ttsMs },
1191
+ ];
1192
+ components.sort((a, b) => b.ms - a.ms);
1193
+ const bottleneck = components[0].name;
1194
+ // Cost estimation
1195
+ const costPerMin = sttEngine.costPerMin +
1196
+ ttsEngine.costPerMin +
1197
+ (150 / 1000) * llmEngine.costPer1kTokens; // ~150 tokens/min
1198
+ // Optimization suggestions
1199
+ if (!sttEngine.streaming && sttEngine.latencyMs > 500) {
1200
+ optimizations.push(`STT bottleneck: ${sttEngine.name} is ${sttEngine.latencyMs}ms. Consider Deepgram (300ms, streaming) for lower latency.`);
1201
+ }
1202
+ if (ttsEngine.streaming) {
1203
+ optimizations.push("TTS supports streaming — enable it to reduce perceived latency by starting playback before generation completes.");
1204
+ }
1205
+ else {
1206
+ optimizations.push(`TTS (${ttsEngine.name}) does not support streaming — audio is generated in full before playback. Consider Cartesia or Edge TTS for streaming.`);
1207
+ }
1208
+ if (llmEngine.firstTokenMs > 500) {
1209
+ optimizations.push(`LLM first token is ${llmEngine.firstTokenMs}ms. For lower latency, consider Claude Haiku (300ms) or GPT-4o-mini (400ms).`);
1210
+ }
1211
+ if (latency.perceivedMs > targetLatency) {
1212
+ optimizations.push(`Perceived latency (${latency.perceivedMs}ms) exceeds target (${targetLatency}ms). Focus on reducing ${bottleneck}.`);
1213
+ }
1214
+ optimizations.push("Cache frequent responses (greetings, confirmations) as pre-generated audio to skip TTS entirely.");
1215
+ optimizations.push("Use streaming LLM output piped directly to streaming TTS to overlap computation with audio generation.");
1216
+ const valid = warnings.length === 0;
1217
+ return {
1218
+ valid,
1219
+ compatibility,
1220
+ estimatedLatencyMs: latency.perceivedMs,
1221
+ totalLatencyMs: latency.totalMs,
1222
+ batchLatencyMs: batchLatency.totalMs,
1223
+ latencyBreakdown: breakdown,
1224
+ costPerMinute: Math.round(costPerMin * 10000) / 10000,
1225
+ bottleneck,
1226
+ optimizations,
1227
+ warnings,
1228
+ rating: rateLatency(latency.perceivedMs),
1229
+ _quickRef: {
1230
+ nextAction: "Use generate_voice_scaffold to get starter code, or benchmark_voice_latency to compare alternatives.",
1231
+ nextTools: ["generate_voice_scaffold", "benchmark_voice_latency", "design_voice_pipeline"],
1232
+ },
1233
+ };
1234
+ },
1235
+ },
1236
+ // ─── Tool 3: generate_voice_scaffold ────────────────────────────────────────
1237
+ {
1238
+ name: "generate_voice_scaffold",
1239
+ description: "Generate starter code for a voice bridge. Returns file contents, setup instructions, and dependency lists for the selected stack. Stacks: whisper_edge (Python, Whisper + Edge TTS), deepgram_cartesia (TypeScript, cloud), browser_webspeech (HTML/JS, no server), whisper_piper (Python, fully local), custom (pluggable interfaces).",
1240
+ inputSchema: {
1241
+ type: "object",
1242
+ properties: {
1243
+ stack: {
1244
+ type: "string",
1245
+ enum: ["whisper_edge", "deepgram_cartesia", "browser_webspeech", "whisper_piper", "custom"],
1246
+ description: "Pre-built stack template to scaffold",
1247
+ },
1248
+ language: {
1249
+ type: "string",
1250
+ enum: ["typescript", "python"],
1251
+ description: "Language for 'custom' stack (default: typescript). Ignored for pre-built stacks.",
1252
+ },
1253
+ includeVAD: {
1254
+ type: "boolean",
1255
+ description: "Include Voice Activity Detection code (simple energy-based). Default: false.",
1256
+ },
1257
+ },
1258
+ required: ["stack"],
1259
+ },
1260
+ handler: async (args) => {
1261
+ const stack = args.stack;
1262
+ const language = args.language ?? "typescript";
1263
+ const includeVAD = args.includeVAD ?? false;
1264
+ let result;
1265
+ switch (stack) {
1266
+ case "whisper_edge":
1267
+ result = getScaffoldWhisperEdge(includeVAD);
1268
+ break;
1269
+ case "deepgram_cartesia":
1270
+ result = getScaffoldDeepgramCartesia(includeVAD);
1271
+ break;
1272
+ case "browser_webspeech":
1273
+ result = getScaffoldBrowserWebspeech();
1274
+ break;
1275
+ case "whisper_piper":
1276
+ result = getScaffoldWhisperPiper(includeVAD);
1277
+ break;
1278
+ case "custom":
1279
+ result = getScaffoldCustom(language, includeVAD);
1280
+ break;
1281
+ default:
1282
+ return {
1283
+ error: true,
1284
+ message: `Unknown stack: '${stack}'. Choose from: whisper_edge, deepgram_cartesia, browser_webspeech, whisper_piper, custom.`,
1285
+ };
1286
+ }
1287
+ return {
1288
+ stack,
1289
+ language: stack === "browser_webspeech" ? "html/javascript" : stack === "custom" ? language : stack.includes("whisper") ? "python" : "typescript",
1290
+ includeVAD,
1291
+ files: result.files.map((f) => ({
1292
+ path: f.path,
1293
+ description: f.description,
1294
+ content: f.content,
1295
+ sizeBytes: Buffer.byteLength(f.content, "utf8"),
1296
+ })),
1297
+ setupInstructions: result.setupInstructions,
1298
+ dependencies: result.dependencies,
1299
+ _quickRef: {
1300
+ nextAction: "Review the generated files. Write them to disk, install dependencies, and test the pipeline.",
1301
+ nextTools: ["analyze_voice_config", "benchmark_voice_latency"],
1302
+ },
1303
+ };
1304
+ },
1305
+ },
1306
+ // ─── Tool 4: benchmark_voice_latency ────────────────────────────────────────
1307
+ {
1308
+ name: "benchmark_voice_latency",
1309
+ description: "Calculate theoretical latency for one or more voice pipeline configurations. Breaks down STT, LLM (first token + completion), and TTS latency. Shows total and user-perceived latency (with streaming optimizations). Rates each config as excellent/good/acceptable/poor. Knowledge-based, no live measurements.",
1310
+ inputSchema: {
1311
+ type: "object",
1312
+ properties: {
1313
+ configs: {
1314
+ type: "array",
1315
+ description: "Array of pipeline configurations to benchmark",
1316
+ items: {
1317
+ type: "object",
1318
+ properties: {
1319
+ name: {
1320
+ type: "string",
1321
+ description: "Human-readable name for this configuration",
1322
+ },
1323
+ stt: {
1324
+ type: "string",
1325
+ description: "STT engine key (e.g. 'whisper', 'deepgram')",
1326
+ },
1327
+ tts: {
1328
+ type: "string",
1329
+ description: "TTS engine key (e.g. 'edge_tts', 'cartesia')",
1330
+ },
1331
+ llm: {
1332
+ type: "string",
1333
+ description: "LLM key (e.g. 'claude_haiku', 'gpt4o_mini')",
1334
+ },
1335
+ streamingEnabled: {
1336
+ type: "boolean",
1337
+ description: "Whether streaming is enabled for TTS (default: true)",
1338
+ },
1339
+ },
1340
+ required: ["name", "stt", "tts", "llm"],
1341
+ },
1342
+ },
1343
+ },
1344
+ required: ["configs"],
1345
+ },
1346
+ handler: async (args) => {
1347
+ const configs = args.configs ?? [];
1348
+ if (configs.length === 0) {
1349
+ return {
1350
+ error: true,
1351
+ message: "No configurations provided. Pass an array of {name, stt, tts, llm} objects.",
1352
+ };
1353
+ }
1354
+ const results = [];
1355
+ for (const config of configs) {
1356
+ const sttEngine = findStt(config.stt);
1357
+ const ttsEngine = findTts(config.tts);
1358
+ const llmEngine = findLlm(config.llm);
1359
+ const streaming = config.streamingEnabled !== false;
1360
+ const errors = [];
1361
+ if (!sttEngine)
1362
+ errors.push(`Unknown STT: '${config.stt}'`);
1363
+ if (!ttsEngine)
1364
+ errors.push(`Unknown TTS: '${config.tts}'`);
1365
+ if (!llmEngine)
1366
+ errors.push(`Unknown LLM: '${config.llm}'`);
1367
+ if (errors.length > 0) {
1368
+ results.push({
1369
+ name: config.name,
1370
+ breakdown: { sttMs: 0, llmFirstTokenMs: 0, llmCompletionMs: 0, ttsMs: 0 },
1371
+ totalMs: 0,
1372
+ perceivedMs: 0,
1373
+ rating: "poor",
1374
+ errors,
1375
+ });
1376
+ continue;
1377
+ }
1378
+ const latency = estimateRoundTrip(sttEngine, ttsEngine, llmEngine, streaming);
1379
+ results.push({
1380
+ name: config.name,
1381
+ breakdown: latency.breakdown,
1382
+ totalMs: latency.totalMs,
1383
+ perceivedMs: latency.perceivedMs,
1384
+ rating: rateLatency(latency.perceivedMs),
1385
+ });
1386
+ }
1387
+ // Sort by perceived latency for recommendation
1388
+ const validResults = results.filter((r) => !r.errors);
1389
+ validResults.sort((a, b) => a.perceivedMs - b.perceivedMs);
1390
+ const recommendation = validResults.length > 0
1391
+ ? `Best: '${validResults[0].name}' at ${validResults[0].perceivedMs}ms perceived latency (${validResults[0].rating}).`
1392
+ : "No valid configurations to compare.";
1393
+ // Generate insights
1394
+ const insights = [];
1395
+ if (validResults.length >= 2) {
1396
+ const fastest = validResults[0];
1397
+ const slowest = validResults[validResults.length - 1];
1398
+ insights.push(`Fastest config ('${fastest.name}') is ${slowest.perceivedMs - fastest.perceivedMs}ms faster than slowest ('${slowest.name}').`);
1399
+ }
1400
+ const excellentCount = validResults.filter((r) => r.rating === "excellent").length;
1401
+ const poorCount = validResults.filter((r) => r.rating === "poor").length;
1402
+ if (excellentCount > 0) {
1403
+ insights.push(`${excellentCount} config(s) rated 'excellent' (under 500ms perceived latency).`);
1404
+ }
1405
+ if (poorCount > 0) {
1406
+ insights.push(`${poorCount} config(s) rated 'poor' (over 2000ms). Consider faster STT/TTS or streaming.`);
1407
+ }
1408
+ // Check for common bottleneck patterns
1409
+ for (const r of validResults) {
1410
+ if (r.breakdown.sttMs >= r.breakdown.llmFirstTokenMs + r.breakdown.ttsMs) {
1411
+ insights.push(`'${r.name}': STT is the dominant bottleneck (${r.breakdown.sttMs}ms). Consider cloud STT for lower latency.`);
1412
+ break;
1413
+ }
1414
+ }
1415
+ return {
1416
+ results,
1417
+ recommendation,
1418
+ insights,
1419
+ _quickRef: {
1420
+ nextAction: "Pick the best config and use generate_voice_scaffold to get starter code.",
1421
+ nextTools: ["generate_voice_scaffold", "analyze_voice_config", "design_voice_pipeline"],
1422
+ },
1423
+ };
1424
+ },
1425
+ },
1426
+ ];
1427
+ //# sourceMappingURL=voiceBridgeTools.js.map