nodebench-mcp 2.17.0 → 2.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. package/LICENSE +21 -0
  2. package/NODEBENCH_AGENTS.md +2 -2
  3. package/README.md +514 -82
  4. package/dist/__tests__/analytics.test.d.ts +11 -0
  5. package/dist/__tests__/analytics.test.js +546 -0
  6. package/dist/__tests__/analytics.test.js.map +1 -0
  7. package/dist/__tests__/dynamicLoading.test.d.ts +1 -0
  8. package/dist/__tests__/dynamicLoading.test.js +278 -0
  9. package/dist/__tests__/dynamicLoading.test.js.map +1 -0
  10. package/dist/__tests__/evalHarness.test.js +1 -1
  11. package/dist/__tests__/evalHarness.test.js.map +1 -1
  12. package/dist/__tests__/helpers/answerMatch.js +22 -22
  13. package/dist/__tests__/presetRealWorldBench.test.js +9 -0
  14. package/dist/__tests__/presetRealWorldBench.test.js.map +1 -1
  15. package/dist/__tests__/tools.test.js +1 -1
  16. package/dist/__tests__/toolsetGatingEval.test.js +9 -1
  17. package/dist/__tests__/toolsetGatingEval.test.js.map +1 -1
  18. package/dist/analytics/index.d.ts +10 -0
  19. package/dist/analytics/index.js +11 -0
  20. package/dist/analytics/index.js.map +1 -0
  21. package/dist/analytics/projectDetector.d.ts +19 -0
  22. package/dist/analytics/projectDetector.js +259 -0
  23. package/dist/analytics/projectDetector.js.map +1 -0
  24. package/dist/analytics/schema.d.ts +57 -0
  25. package/dist/analytics/schema.js +157 -0
  26. package/dist/analytics/schema.js.map +1 -0
  27. package/dist/analytics/smartPreset.d.ts +63 -0
  28. package/dist/analytics/smartPreset.js +300 -0
  29. package/dist/analytics/smartPreset.js.map +1 -0
  30. package/dist/analytics/toolTracker.d.ts +59 -0
  31. package/dist/analytics/toolTracker.js +163 -0
  32. package/dist/analytics/toolTracker.js.map +1 -0
  33. package/dist/analytics/usageStats.d.ts +64 -0
  34. package/dist/analytics/usageStats.js +252 -0
  35. package/dist/analytics/usageStats.js.map +1 -0
  36. package/dist/db.js +359 -321
  37. package/dist/db.js.map +1 -1
  38. package/dist/index.d.ts +2 -1
  39. package/dist/index.js +652 -89
  40. package/dist/index.js.map +1 -1
  41. package/dist/tools/architectTools.js +13 -13
  42. package/dist/tools/critterTools.js +14 -14
  43. package/dist/tools/parallelAgentTools.js +176 -176
  44. package/dist/tools/patternTools.js +11 -11
  45. package/dist/tools/progressiveDiscoveryTools.d.ts +5 -1
  46. package/dist/tools/progressiveDiscoveryTools.js +111 -19
  47. package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
  48. package/dist/tools/researchWritingTools.js +42 -42
  49. package/dist/tools/rssTools.js +396 -396
  50. package/dist/tools/toolRegistry.d.ts +17 -0
  51. package/dist/tools/toolRegistry.js +65 -17
  52. package/dist/tools/toolRegistry.js.map +1 -1
  53. package/dist/tools/voiceBridgeTools.js +498 -498
  54. package/dist/toolsetRegistry.d.ts +10 -0
  55. package/dist/toolsetRegistry.js +84 -0
  56. package/dist/toolsetRegistry.js.map +1 -0
  57. package/package.json +4 -4
@@ -235,90 +235,90 @@ function rateLatency(perceivedMs) {
235
235
  // ─── Scaffold templates ──────────────────────────────────────────────────────
236
236
  function getScaffoldWhisperEdge(includeVAD) {
237
237
  const vadCode = includeVAD
238
- ? `
239
- import numpy as np
240
-
241
- def detect_voice_activity(audio_chunk: np.ndarray, threshold: float = 0.02) -> bool:
242
- """Simple energy-based Voice Activity Detection."""
243
- energy = np.sqrt(np.mean(audio_chunk.astype(np.float32) ** 2))
244
- return energy > threshold
238
+ ? `
239
+ import numpy as np
240
+
241
+ def detect_voice_activity(audio_chunk: np.ndarray, threshold: float = 0.02) -> bool:
242
+ """Simple energy-based Voice Activity Detection."""
243
+ energy = np.sqrt(np.mean(audio_chunk.astype(np.float32) ** 2))
244
+ return energy > threshold
245
245
  `
246
246
  : "";
247
247
  const vadImport = includeVAD ? "from vad import detect_voice_activity\n" : "";
248
248
  const vadUsage = includeVAD
249
- ? `
250
- # Check for voice activity before sending to Whisper
251
- if not detect_voice_activity(audio_data):
249
+ ? `
250
+ # Check for voice activity before sending to Whisper
251
+ if not detect_voice_activity(audio_data):
252
252
  continue`
253
253
  : "";
254
254
  return {
255
255
  files: [
256
256
  {
257
257
  path: "voice_bridge.py",
258
- content: `"""Voice Bridge: Whisper STT + Edge TTS + WebSocket server."""
259
- import asyncio
260
- import json
261
- import tempfile
262
- import wave
263
- import whisper
264
- import edge_tts
265
- import websockets
266
- ${vadImport}
267
- # Load Whisper model (use "tiny" or "base" for speed, "small" for accuracy)
268
- model = whisper.load_model("base")
269
-
270
- async def transcribe(audio_data: bytes) -> str:
271
- """Transcribe audio bytes using Whisper."""
272
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as f:
273
- # Write raw PCM as WAV
274
- with wave.open(f.name, "wb") as wf:
275
- wf.setnchannels(1)
276
- wf.setsampwidth(2)
277
- wf.setframerate(16000)
278
- wf.writeframes(audio_data)
279
- result = model.transcribe(f.name, language="en")
280
- return result["text"].strip()
281
-
282
- async def synthesize(text: str) -> bytes:
283
- """Synthesize speech using Edge TTS."""
284
- communicate = edge_tts.Communicate(text, voice="en-US-AriaNeural")
285
- audio_chunks = []
286
- async for chunk in communicate.stream():
287
- if chunk["type"] == "audio":
288
- audio_chunks.append(chunk["data"])
289
- return b"".join(audio_chunks)
290
-
291
- async def handle_client(websocket):
292
- """Handle a WebSocket client connection."""
293
- print("Client connected")
294
- try:
295
- async for message in websocket:
296
- if isinstance(message, bytes):${vadUsage}
297
- # Transcribe speech to text
298
- transcript = await transcribe(message)
299
- print(f"User: {transcript}")
300
-
301
- # TODO: Send transcript to your LLM and get response
302
- llm_response = f"Echo: {transcript}"
303
-
304
- # Synthesize response to speech
305
- audio = await synthesize(llm_response)
306
- await websocket.send(audio)
307
- else:
308
- # Handle text/control messages
309
- data = json.loads(message)
310
- if data.get("type") == "ping":
311
- await websocket.send(json.dumps({{"type": "pong"}}))
312
- except websockets.exceptions.ConnectionClosed:
313
- print("Client disconnected")
314
-
315
- async def main():
316
- print("Voice Bridge starting on ws://localhost:8765")
317
- async with websockets.serve(handle_client, "localhost", 8765):
318
- await asyncio.Future() # Run forever
319
-
320
- if __name__ == "__main__":
321
- asyncio.run(main())
258
+ content: `"""Voice Bridge: Whisper STT + Edge TTS + WebSocket server."""
259
+ import asyncio
260
+ import json
261
+ import tempfile
262
+ import wave
263
+ import whisper
264
+ import edge_tts
265
+ import websockets
266
+ ${vadImport}
267
+ # Load Whisper model (use "tiny" or "base" for speed, "small" for accuracy)
268
+ model = whisper.load_model("base")
269
+
270
+ async def transcribe(audio_data: bytes) -> str:
271
+ """Transcribe audio bytes using Whisper."""
272
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as f:
273
+ # Write raw PCM as WAV
274
+ with wave.open(f.name, "wb") as wf:
275
+ wf.setnchannels(1)
276
+ wf.setsampwidth(2)
277
+ wf.setframerate(16000)
278
+ wf.writeframes(audio_data)
279
+ result = model.transcribe(f.name, language="en")
280
+ return result["text"].strip()
281
+
282
+ async def synthesize(text: str) -> bytes:
283
+ """Synthesize speech using Edge TTS."""
284
+ communicate = edge_tts.Communicate(text, voice="en-US-AriaNeural")
285
+ audio_chunks = []
286
+ async for chunk in communicate.stream():
287
+ if chunk["type"] == "audio":
288
+ audio_chunks.append(chunk["data"])
289
+ return b"".join(audio_chunks)
290
+
291
+ async def handle_client(websocket):
292
+ """Handle a WebSocket client connection."""
293
+ print("Client connected")
294
+ try:
295
+ async for message in websocket:
296
+ if isinstance(message, bytes):${vadUsage}
297
+ # Transcribe speech to text
298
+ transcript = await transcribe(message)
299
+ print(f"User: {transcript}")
300
+
301
+ # TODO: Send transcript to your LLM and get response
302
+ llm_response = f"Echo: {transcript}"
303
+
304
+ # Synthesize response to speech
305
+ audio = await synthesize(llm_response)
306
+ await websocket.send(audio)
307
+ else:
308
+ # Handle text/control messages
309
+ data = json.loads(message)
310
+ if data.get("type") == "ping":
311
+ await websocket.send(json.dumps({{"type": "pong"}}))
312
+ except websockets.exceptions.ConnectionClosed:
313
+ print("Client disconnected")
314
+
315
+ async def main():
316
+ print("Voice Bridge starting on ws://localhost:8765")
317
+ async with websockets.serve(handle_client, "localhost", 8765):
318
+ await asyncio.Future() # Run forever
319
+
320
+ if __name__ == "__main__":
321
+ asyncio.run(main())
322
322
  `,
323
323
  description: "Main voice bridge server: Whisper STT + Edge TTS over WebSocket",
324
324
  },
@@ -333,10 +333,10 @@ if __name__ == "__main__":
333
333
  : []),
334
334
  {
335
335
  path: "requirements.txt",
336
- content: `openai-whisper
337
- edge-tts
338
- websockets
339
- numpy
336
+ content: `openai-whisper
337
+ edge-tts
338
+ websockets
339
+ numpy
340
340
  `,
341
341
  description: "Python dependencies",
342
342
  },
@@ -353,88 +353,88 @@ numpy
353
353
  }
354
354
  function getScaffoldDeepgramCartesia(includeVAD) {
355
355
  const vadBlock = includeVAD
356
- ? `
357
- // Simple energy-based VAD
358
- function detectVoiceActivity(samples: Float32Array, threshold = 0.02): boolean {
359
- let energy = 0;
360
- for (let i = 0; i < samples.length; i++) energy += samples[i] * samples[i];
361
- return Math.sqrt(energy / samples.length) > threshold;
362
- }
356
+ ? `
357
+ // Simple energy-based VAD
358
+ function detectVoiceActivity(samples: Float32Array, threshold = 0.02): boolean {
359
+ let energy = 0;
360
+ for (let i = 0; i < samples.length; i++) energy += samples[i] * samples[i];
361
+ return Math.sqrt(energy / samples.length) > threshold;
362
+ }
363
363
  `
364
364
  : "";
365
365
  return {
366
366
  files: [
367
367
  {
368
368
  path: "src/voice-bridge.ts",
369
- content: `/**
370
- * Voice Bridge: Deepgram STT + Cartesia TTS (TypeScript)
371
- *
372
- * Requires:
373
- * DEEPGRAM_API_KEY — from https://console.deepgram.com
374
- * CARTESIA_API_KEY — from https://play.cartesia.ai
375
- */
376
- import { createClient, LiveTranscriptionEvents } from "@deepgram/sdk";
377
- import Cartesia from "@cartesia/cartesia-js";
378
- import { WebSocketServer } from "ws";
379
-
380
- const deepgram = createClient(process.env.DEEPGRAM_API_KEY!);
381
- const cartesia = new Cartesia({ apiKey: process.env.CARTESIA_API_KEY! });
382
- ${vadBlock}
383
- const wss = new WebSocketServer({ port: 8765 });
384
- console.log("Voice Bridge listening on ws://localhost:8765");
385
-
386
- wss.on("connection", (ws) => {
387
- console.log("Client connected");
388
-
389
- // Set up Deepgram live transcription
390
- const dgConn = deepgram.listen.live({
391
- model: "nova-2",
392
- language: "en",
393
- smart_format: true,
394
- interim_results: false,
395
- });
396
-
397
- dgConn.on(LiveTranscriptionEvents.Open, () => {
398
- console.log("Deepgram connection open");
399
- });
400
-
401
- dgConn.on(LiveTranscriptionEvents.Transcript, async (data) => {
402
- const transcript = data.channel?.alternatives?.[0]?.transcript;
403
- if (!transcript) return;
404
-
405
- console.log(\`User: \${transcript}\`);
406
-
407
- // TODO: Send transcript to your LLM and get response
408
- const llmResponse = \`Echo: \${transcript}\`;
409
-
410
- // Synthesize with Cartesia (streaming)
411
- const ttsResponse = await cartesia.tts.sse({
412
- modelId: "sonic-english",
413
- transcript: llmResponse,
414
- voice: { mode: "id", id: "a0e99841-438c-4a64-b679-ae501e7d6091" },
415
- output_format: { container: "raw", encoding: "pcm_s16le", sample_rate: 24000 },
416
- });
417
-
418
- for await (const chunk of ttsResponse) {
419
- if (ws.readyState === ws.OPEN) {
420
- ws.send(chunk);
421
- }
422
- }
423
- });
424
-
425
- dgConn.on(LiveTranscriptionEvents.Error, (err) => {
426
- console.error("Deepgram error:", err);
427
- });
428
-
429
- ws.on("message", (data: Buffer) => {${includeVAD ? "\n // VAD check\n const samples = new Float32Array(data.buffer);\n if (!detectVoiceActivity(samples)) return;\n" : ""}
430
- dgConn.send(data);
431
- });
432
-
433
- ws.on("close", () => {
434
- dgConn.finish();
435
- console.log("Client disconnected");
436
- });
437
- });
369
+ content: `/**
370
+ * Voice Bridge: Deepgram STT + Cartesia TTS (TypeScript)
371
+ *
372
+ * Requires:
373
+ * DEEPGRAM_API_KEY — from https://console.deepgram.com
374
+ * CARTESIA_API_KEY — from https://play.cartesia.ai
375
+ */
376
+ import { createClient, LiveTranscriptionEvents } from "@deepgram/sdk";
377
+ import Cartesia from "@cartesia/cartesia-js";
378
+ import { WebSocketServer } from "ws";
379
+
380
+ const deepgram = createClient(process.env.DEEPGRAM_API_KEY!);
381
+ const cartesia = new Cartesia({ apiKey: process.env.CARTESIA_API_KEY! });
382
+ ${vadBlock}
383
+ const wss = new WebSocketServer({ port: 8765 });
384
+ console.log("Voice Bridge listening on ws://localhost:8765");
385
+
386
+ wss.on("connection", (ws) => {
387
+ console.log("Client connected");
388
+
389
+ // Set up Deepgram live transcription
390
+ const dgConn = deepgram.listen.live({
391
+ model: "nova-2",
392
+ language: "en",
393
+ smart_format: true,
394
+ interim_results: false,
395
+ });
396
+
397
+ dgConn.on(LiveTranscriptionEvents.Open, () => {
398
+ console.log("Deepgram connection open");
399
+ });
400
+
401
+ dgConn.on(LiveTranscriptionEvents.Transcript, async (data) => {
402
+ const transcript = data.channel?.alternatives?.[0]?.transcript;
403
+ if (!transcript) return;
404
+
405
+ console.log(\`User: \${transcript}\`);
406
+
407
+ // TODO: Send transcript to your LLM and get response
408
+ const llmResponse = \`Echo: \${transcript}\`;
409
+
410
+ // Synthesize with Cartesia (streaming)
411
+ const ttsResponse = await cartesia.tts.sse({
412
+ modelId: "sonic-english",
413
+ transcript: llmResponse,
414
+ voice: { mode: "id", id: "a0e99841-438c-4a64-b679-ae501e7d6091" },
415
+ output_format: { container: "raw", encoding: "pcm_s16le", sample_rate: 24000 },
416
+ });
417
+
418
+ for await (const chunk of ttsResponse) {
419
+ if (ws.readyState === ws.OPEN) {
420
+ ws.send(chunk);
421
+ }
422
+ }
423
+ });
424
+
425
+ dgConn.on(LiveTranscriptionEvents.Error, (err) => {
426
+ console.error("Deepgram error:", err);
427
+ });
428
+
429
+ ws.on("message", (data: Buffer) => {${includeVAD ? "\n // VAD check\n const samples = new Float32Array(data.buffer);\n if (!detectVoiceActivity(samples)) return;\n" : ""}
430
+ dgConn.send(data);
431
+ });
432
+
433
+ ws.on("close", () => {
434
+ dgConn.finish();
435
+ console.log("Client disconnected");
436
+ });
437
+ });
438
438
  `,
439
439
  description: "TypeScript voice bridge: Deepgram Nova-2 STT + Cartesia Sonic streaming TTS",
440
440
  },
@@ -469,111 +469,111 @@ function getScaffoldBrowserWebspeech() {
469
469
  files: [
470
470
  {
471
471
  path: "index.html",
472
- content: `<!DOCTYPE html>
473
- <html lang="en">
474
- <head>
475
- <meta charset="UTF-8">
476
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
477
- <title>Voice Bridge — Web Speech API</title>
478
- <style>
479
- body { font-family: system-ui, sans-serif; max-width: 600px; margin: 2rem auto; padding: 0 1rem; }
480
- #status { padding: 0.5rem; border-radius: 4px; margin: 1rem 0; }
481
- .listening { background: #dcfce7; color: #166534; }
482
- .speaking { background: #dbeafe; color: #1e40af; }
483
- .idle { background: #f3f4f6; color: #374151; }
484
- #transcript { white-space: pre-wrap; border: 1px solid #d1d5db; padding: 1rem; border-radius: 4px; min-height: 200px; max-height: 400px; overflow-y: auto; }
485
- button { padding: 0.75rem 1.5rem; font-size: 1rem; border: none; border-radius: 4px; cursor: pointer; margin: 0.25rem; }
486
- #startBtn { background: #2563eb; color: white; }
487
- #startBtn:disabled { background: #93c5fd; cursor: not-allowed; }
488
- #stopBtn { background: #dc2626; color: white; }
489
- </style>
490
- </head>
491
- <body>
492
- <h1>Voice Bridge</h1>
493
- <p>Browser-native STT + TTS using the Web Speech API. No server needed.</p>
494
- <div>
495
- <button id="startBtn" onclick="startListening()">Start Listening</button>
496
- <button id="stopBtn" onclick="stopListening()">Stop</button>
497
- </div>
498
- <div id="status" class="idle">Idle</div>
499
- <div id="transcript"></div>
500
-
501
- <script>
502
- const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
503
- if (!SpeechRecognition) {
504
- document.getElementById("status").textContent = "Web Speech API not supported in this browser.";
505
- }
506
-
507
- const recognition = new SpeechRecognition();
508
- recognition.continuous = true;
509
- recognition.interimResults = true;
510
- recognition.lang = "en-US";
511
-
512
- const synth = window.speechSynthesis;
513
- const transcriptEl = document.getElementById("transcript");
514
- const statusEl = document.getElementById("status");
515
- let isListening = false;
516
-
517
- function setStatus(text, cls) {
518
- statusEl.textContent = text;
519
- statusEl.className = cls;
520
- }
521
-
522
- function appendTranscript(role, text) {
523
- transcriptEl.textContent += role + ": " + text + "\\n";
524
- transcriptEl.scrollTop = transcriptEl.scrollHeight;
525
- }
526
-
527
- recognition.onresult = (event) => {
528
- const last = event.results[event.results.length - 1];
529
- if (last.isFinal) {
530
- const transcript = last[0].transcript.trim();
531
- if (!transcript) return;
532
-
533
- appendTranscript("You", transcript);
534
- setStatus("Processing...", "speaking");
535
-
536
- // TODO: Send transcript to your LLM API and get response
537
- const llmResponse = "Echo: " + transcript;
538
-
539
- appendTranscript("Assistant", llmResponse);
540
-
541
- // Speak the response
542
- const utterance = new SpeechSynthesisUtterance(llmResponse);
543
- utterance.onstart = () => setStatus("Speaking...", "speaking");
544
- utterance.onend = () => setStatus("Listening...", "listening");
545
- synth.speak(utterance);
546
- }
547
- };
548
-
549
- recognition.onerror = (event) => {
550
- console.error("Speech recognition error:", event.error);
551
- if (event.error !== "no-speech") {
552
- setStatus("Error: " + event.error, "idle");
553
- }
554
- };
555
-
556
- recognition.onend = () => {
557
- if (isListening) recognition.start(); // Auto-restart
558
- };
559
-
560
- function startListening() {
561
- isListening = true;
562
- recognition.start();
563
- setStatus("Listening...", "listening");
564
- document.getElementById("startBtn").disabled = true;
565
- }
566
-
567
- function stopListening() {
568
- isListening = false;
569
- recognition.stop();
570
- synth.cancel();
571
- setStatus("Idle", "idle");
572
- document.getElementById("startBtn").disabled = false;
573
- }
574
- </script>
575
- </body>
576
- </html>
472
+ content: `<!DOCTYPE html>
473
+ <html lang="en">
474
+ <head>
475
+ <meta charset="UTF-8">
476
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
477
+ <title>Voice Bridge — Web Speech API</title>
478
+ <style>
479
+ body { font-family: system-ui, sans-serif; max-width: 600px; margin: 2rem auto; padding: 0 1rem; }
480
+ #status { padding: 0.5rem; border-radius: 4px; margin: 1rem 0; }
481
+ .listening { background: #dcfce7; color: #166534; }
482
+ .speaking { background: #dbeafe; color: #1e40af; }
483
+ .idle { background: #f3f4f6; color: #374151; }
484
+ #transcript { white-space: pre-wrap; border: 1px solid #d1d5db; padding: 1rem; border-radius: 4px; min-height: 200px; max-height: 400px; overflow-y: auto; }
485
+ button { padding: 0.75rem 1.5rem; font-size: 1rem; border: none; border-radius: 4px; cursor: pointer; margin: 0.25rem; }
486
+ #startBtn { background: #2563eb; color: white; }
487
+ #startBtn:disabled { background: #93c5fd; cursor: not-allowed; }
488
+ #stopBtn { background: #dc2626; color: white; }
489
+ </style>
490
+ </head>
491
+ <body>
492
+ <h1>Voice Bridge</h1>
493
+ <p>Browser-native STT + TTS using the Web Speech API. No server needed.</p>
494
+ <div>
495
+ <button id="startBtn" onclick="startListening()">Start Listening</button>
496
+ <button id="stopBtn" onclick="stopListening()">Stop</button>
497
+ </div>
498
+ <div id="status" class="idle">Idle</div>
499
+ <div id="transcript"></div>
500
+
501
+ <script>
502
+ const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
503
+ if (!SpeechRecognition) {
504
+ document.getElementById("status").textContent = "Web Speech API not supported in this browser.";
505
+ }
506
+
507
+ const recognition = new SpeechRecognition();
508
+ recognition.continuous = true;
509
+ recognition.interimResults = true;
510
+ recognition.lang = "en-US";
511
+
512
+ const synth = window.speechSynthesis;
513
+ const transcriptEl = document.getElementById("transcript");
514
+ const statusEl = document.getElementById("status");
515
+ let isListening = false;
516
+
517
+ function setStatus(text, cls) {
518
+ statusEl.textContent = text;
519
+ statusEl.className = cls;
520
+ }
521
+
522
+ function appendTranscript(role, text) {
523
+ transcriptEl.textContent += role + ": " + text + "\\n";
524
+ transcriptEl.scrollTop = transcriptEl.scrollHeight;
525
+ }
526
+
527
+ recognition.onresult = (event) => {
528
+ const last = event.results[event.results.length - 1];
529
+ if (last.isFinal) {
530
+ const transcript = last[0].transcript.trim();
531
+ if (!transcript) return;
532
+
533
+ appendTranscript("You", transcript);
534
+ setStatus("Processing...", "speaking");
535
+
536
+ // TODO: Send transcript to your LLM API and get response
537
+ const llmResponse = "Echo: " + transcript;
538
+
539
+ appendTranscript("Assistant", llmResponse);
540
+
541
+ // Speak the response
542
+ const utterance = new SpeechSynthesisUtterance(llmResponse);
543
+ utterance.onstart = () => setStatus("Speaking...", "speaking");
544
+ utterance.onend = () => setStatus("Listening...", "listening");
545
+ synth.speak(utterance);
546
+ }
547
+ };
548
+
549
+ recognition.onerror = (event) => {
550
+ console.error("Speech recognition error:", event.error);
551
+ if (event.error !== "no-speech") {
552
+ setStatus("Error: " + event.error, "idle");
553
+ }
554
+ };
555
+
556
+ recognition.onend = () => {
557
+ if (isListening) recognition.start(); // Auto-restart
558
+ };
559
+
560
+ function startListening() {
561
+ isListening = true;
562
+ recognition.start();
563
+ setStatus("Listening...", "listening");
564
+ document.getElementById("startBtn").disabled = true;
565
+ }
566
+
567
+ function stopListening() {
568
+ isListening = false;
569
+ recognition.stop();
570
+ synth.cancel();
571
+ setStatus("Idle", "idle");
572
+ document.getElementById("startBtn").disabled = false;
573
+ }
574
+ </script>
575
+ </body>
576
+ </html>
577
577
  `,
578
578
  description: "Self-contained HTML page using Web Speech API for both STT and TTS",
579
579
  },
@@ -589,97 +589,97 @@ function getScaffoldBrowserWebspeech() {
589
589
  }
590
590
  function getScaffoldWhisperPiper(includeVAD) {
591
591
  const vadCode = includeVAD
592
- ? `
593
- import numpy as np
594
-
595
- def detect_voice_activity(audio_chunk: np.ndarray, threshold: float = 0.02) -> bool:
596
- """Simple energy-based Voice Activity Detection."""
597
- energy = np.sqrt(np.mean(audio_chunk.astype(np.float32) ** 2))
598
- return energy > threshold
592
+ ? `
593
+ import numpy as np
594
+
595
+ def detect_voice_activity(audio_chunk: np.ndarray, threshold: float = 0.02) -> bool:
596
+ """Simple energy-based Voice Activity Detection."""
597
+ energy = np.sqrt(np.mean(audio_chunk.astype(np.float32) ** 2))
598
+ return energy > threshold
599
599
  `
600
600
  : "";
601
601
  const vadImport = includeVAD ? "from vad import detect_voice_activity\n" : "";
602
602
  const vadUsage = includeVAD
603
- ? `
604
- # Check for voice activity
605
- if not detect_voice_activity(audio_data):
603
+ ? `
604
+ # Check for voice activity
605
+ if not detect_voice_activity(audio_data):
606
606
  continue`
607
607
  : "";
608
608
  return {
609
609
  files: [
610
610
  {
611
611
  path: "voice_bridge.py",
612
- content: `"""Fully Local Voice Bridge: Whisper STT + Piper TTS. No cloud dependencies."""
613
- import asyncio
614
- import json
615
- import subprocess
616
- import tempfile
617
- import wave
618
- import whisper
619
- import websockets
620
- ${vadImport}
621
- # Load Whisper model
622
- model = whisper.load_model("base")
623
-
624
- # Piper TTS configuration
625
- PIPER_MODEL = "en_US-lessac-medium" # Download from https://github.com/rhasspy/piper/releases
626
- PIPER_BIN = "piper" # Ensure piper is on PATH
627
-
628
- async def transcribe(audio_data: bytes) -> str:
629
- """Transcribe audio bytes using Whisper."""
630
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as f:
631
- with wave.open(f.name, "wb") as wf:
632
- wf.setnchannels(1)
633
- wf.setsampwidth(2)
634
- wf.setframerate(16000)
635
- wf.writeframes(audio_data)
636
- result = model.transcribe(f.name, language="en")
637
- return result["text"].strip()
638
-
639
- async def synthesize(text: str) -> bytes:
640
- """Synthesize speech using Piper TTS (local, ONNX-based)."""
641
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as f:
642
- proc = await asyncio.create_subprocess_exec(
643
- PIPER_BIN,
644
- "--model", PIPER_MODEL,
645
- "--output_file", f.name,
646
- stdin=asyncio.subprocess.PIPE,
647
- stdout=asyncio.subprocess.PIPE,
648
- stderr=asyncio.subprocess.PIPE,
649
- )
650
- await proc.communicate(input=text.encode("utf-8"))
651
- with open(f.name, "rb") as audio:
652
- return audio.read()
653
-
654
- async def handle_client(websocket):
655
- """Handle a WebSocket client connection."""
656
- print("Client connected")
657
- try:
658
- async for message in websocket:
659
- if isinstance(message, bytes):${vadUsage}
660
- transcript = await transcribe(message)
661
- print(f"User: {transcript}")
662
-
663
- # TODO: Send to local LLM (e.g. llama.cpp server)
664
- llm_response = f"Echo: {transcript}"
665
-
666
- audio = await synthesize(llm_response)
667
- await websocket.send(audio)
668
- else:
669
- data = json.loads(message)
670
- if data.get("type") == "ping":
671
- await websocket.send(json.dumps({{"type": "pong"}}))
672
- except websockets.exceptions.ConnectionClosed:
673
- print("Client disconnected")
674
-
675
- async def main():
676
- print("Local Voice Bridge starting on ws://localhost:8765")
677
- print("Fully offline — no cloud APIs used.")
678
- async with websockets.serve(handle_client, "localhost", 8765):
679
- await asyncio.Future()
680
-
681
- if __name__ == "__main__":
682
- asyncio.run(main())
612
+ content: `"""Fully Local Voice Bridge: Whisper STT + Piper TTS. No cloud dependencies."""
613
+ import asyncio
614
+ import json
615
+ import subprocess
616
+ import tempfile
617
+ import wave
618
+ import whisper
619
+ import websockets
620
+ ${vadImport}
621
+ # Load Whisper model
622
+ model = whisper.load_model("base")
623
+
624
+ # Piper TTS configuration
625
+ PIPER_MODEL = "en_US-lessac-medium" # Download from https://github.com/rhasspy/piper/releases
626
+ PIPER_BIN = "piper" # Ensure piper is on PATH
627
+
628
+ async def transcribe(audio_data: bytes) -> str:
629
+ """Transcribe audio bytes using Whisper."""
630
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as f:
631
+ with wave.open(f.name, "wb") as wf:
632
+ wf.setnchannels(1)
633
+ wf.setsampwidth(2)
634
+ wf.setframerate(16000)
635
+ wf.writeframes(audio_data)
636
+ result = model.transcribe(f.name, language="en")
637
+ return result["text"].strip()
638
+
639
+ async def synthesize(text: str) -> bytes:
640
+ """Synthesize speech using Piper TTS (local, ONNX-based)."""
641
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as f:
642
+ proc = await asyncio.create_subprocess_exec(
643
+ PIPER_BIN,
644
+ "--model", PIPER_MODEL,
645
+ "--output_file", f.name,
646
+ stdin=asyncio.subprocess.PIPE,
647
+ stdout=asyncio.subprocess.PIPE,
648
+ stderr=asyncio.subprocess.PIPE,
649
+ )
650
+ await proc.communicate(input=text.encode("utf-8"))
651
+ with open(f.name, "rb") as audio:
652
+ return audio.read()
653
+
654
+ async def handle_client(websocket):
655
+ """Handle a WebSocket client connection."""
656
+ print("Client connected")
657
+ try:
658
+ async for message in websocket:
659
+ if isinstance(message, bytes):${vadUsage}
660
+ transcript = await transcribe(message)
661
+ print(f"User: {transcript}")
662
+
663
+ # TODO: Send to local LLM (e.g. llama.cpp server)
664
+ llm_response = f"Echo: {transcript}"
665
+
666
+ audio = await synthesize(llm_response)
667
+ await websocket.send(audio)
668
+ else:
669
+ data = json.loads(message)
670
+ if data.get("type") == "ping":
671
+ await websocket.send(json.dumps({{"type": "pong"}}))
672
+ except websockets.exceptions.ConnectionClosed:
673
+ print("Client disconnected")
674
+
675
+ async def main():
676
+ print("Local Voice Bridge starting on ws://localhost:8765")
677
+ print("Fully offline — no cloud APIs used.")
678
+ async with websockets.serve(handle_client, "localhost", 8765):
679
+ await asyncio.Future()
680
+
681
+ if __name__ == "__main__":
682
+ asyncio.run(main())
683
683
  `,
684
684
  description: "Fully local voice bridge: Whisper + Piper (no cloud dependencies)",
685
685
  },
@@ -694,9 +694,9 @@ if __name__ == "__main__":
694
694
  : []),
695
695
  {
696
696
  path: "requirements.txt",
697
- content: `openai-whisper
698
- websockets
699
- numpy
697
+ content: `openai-whisper
698
+ websockets
699
+ numpy
700
700
  `,
701
701
  description: "Python dependencies (Piper installed separately as binary)",
702
702
  },
@@ -715,91 +715,91 @@ numpy
715
715
  function getScaffoldCustom(language, includeVAD) {
716
716
  if (language === "python") {
717
717
  const vadBlock = includeVAD
718
- ? `
719
- class SimpleVAD:
720
- """Energy-based Voice Activity Detection."""
721
- def __init__(self, threshold: float = 0.02):
722
- self.threshold = threshold
723
-
724
- def is_speech(self, audio_chunk) -> bool:
725
- import numpy as np
726
- energy = np.sqrt(np.mean(np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) ** 2))
727
- return energy > self.threshold
718
+ ? `
719
+ class SimpleVAD:
720
+ """Energy-based Voice Activity Detection."""
721
+ def __init__(self, threshold: float = 0.02):
722
+ self.threshold = threshold
723
+
724
+ def is_speech(self, audio_chunk) -> bool:
725
+ import numpy as np
726
+ energy = np.sqrt(np.mean(np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) ** 2))
727
+ return energy > self.threshold
728
728
  `
729
729
  : "";
730
730
  return {
731
731
  files: [
732
732
  {
733
733
  path: "voice_bridge.py",
734
- content: `"""Custom Voice Bridge Template — Plug in any STT/TTS/LLM."""
735
- import asyncio
736
- import json
737
- from abc import ABC, abstractmethod
738
- import websockets
739
-
740
-
741
- class STTProvider(ABC):
742
- @abstractmethod
743
- async def transcribe(self, audio_data: bytes) -> str: ...
744
-
745
- class TTSProvider(ABC):
746
- @abstractmethod
747
- async def synthesize(self, text: str) -> bytes: ...
748
-
749
- class LLMProvider(ABC):
750
- @abstractmethod
751
- async def generate(self, prompt: str, history: list[dict]) -> str: ...
752
- ${vadBlock}
753
-
754
- # ─── Implement your providers here ────────────────────────────────
755
-
756
- class MySTT(STTProvider):
757
- async def transcribe(self, audio_data: bytes) -> str:
758
- # TODO: Replace with your STT implementation
759
- raise NotImplementedError("Implement STT provider")
760
-
761
- class MyTTS(TTSProvider):
762
- async def synthesize(self, text: str) -> bytes:
763
- # TODO: Replace with your TTS implementation
764
- raise NotImplementedError("Implement TTS provider")
765
-
766
- class MyLLM(LLMProvider):
767
- async def generate(self, prompt: str, history: list[dict]) -> str:
768
- # TODO: Replace with your LLM implementation
769
- return f"Echo: {prompt}"
770
-
771
-
772
- # ─── Voice bridge server ──────────────────────────────────────────
773
-
774
- class VoiceBridge:
775
- def __init__(self, stt: STTProvider, tts: TTSProvider, llm: LLMProvider):
776
- self.stt = stt
777
- self.tts = tts
778
- self.llm = llm
779
-
780
- async def handle_client(self, websocket):
781
- history = []
782
- async for message in websocket:
783
- if isinstance(message, bytes):
784
- transcript = await self.stt.transcribe(message)
785
- history.append({"role": "user", "content": transcript})
786
- response = await self.llm.generate(transcript, history)
787
- history.append({"role": "assistant", "content": response})
788
- audio = await self.tts.synthesize(response)
789
- await websocket.send(audio)
790
- else:
791
- data = json.loads(message)
792
- await websocket.send(json.dumps({"type": "pong"}))
793
-
794
- async def start(self, host: str = "localhost", port: int = 8765):
795
- print(f"Voice Bridge on ws://{host}:{port}")
796
- async with websockets.serve(self.handle_client, host, port):
797
- await asyncio.Future()
798
-
799
-
800
- if __name__ == "__main__":
801
- bridge = VoiceBridge(stt=MySTT(), tts=MyTTS(), llm=MyLLM())
802
- asyncio.run(bridge.start())
734
+ content: `"""Custom Voice Bridge Template — Plug in any STT/TTS/LLM."""
735
+ import asyncio
736
+ import json
737
+ from abc import ABC, abstractmethod
738
+ import websockets
739
+
740
+
741
+ class STTProvider(ABC):
742
+ @abstractmethod
743
+ async def transcribe(self, audio_data: bytes) -> str: ...
744
+
745
+ class TTSProvider(ABC):
746
+ @abstractmethod
747
+ async def synthesize(self, text: str) -> bytes: ...
748
+
749
+ class LLMProvider(ABC):
750
+ @abstractmethod
751
+ async def generate(self, prompt: str, history: list[dict]) -> str: ...
752
+ ${vadBlock}
753
+
754
+ # ─── Implement your providers here ────────────────────────────────
755
+
756
+ class MySTT(STTProvider):
757
+ async def transcribe(self, audio_data: bytes) -> str:
758
+ # TODO: Replace with your STT implementation
759
+ raise NotImplementedError("Implement STT provider")
760
+
761
+ class MyTTS(TTSProvider):
762
+ async def synthesize(self, text: str) -> bytes:
763
+ # TODO: Replace with your TTS implementation
764
+ raise NotImplementedError("Implement TTS provider")
765
+
766
+ class MyLLM(LLMProvider):
767
+ async def generate(self, prompt: str, history: list[dict]) -> str:
768
+ # TODO: Replace with your LLM implementation
769
+ return f"Echo: {prompt}"
770
+
771
+
772
+ # ─── Voice bridge server ──────────────────────────────────────────
773
+
774
+ class VoiceBridge:
775
+ def __init__(self, stt: STTProvider, tts: TTSProvider, llm: LLMProvider):
776
+ self.stt = stt
777
+ self.tts = tts
778
+ self.llm = llm
779
+
780
+ async def handle_client(self, websocket):
781
+ history = []
782
+ async for message in websocket:
783
+ if isinstance(message, bytes):
784
+ transcript = await self.stt.transcribe(message)
785
+ history.append({"role": "user", "content": transcript})
786
+ response = await self.llm.generate(transcript, history)
787
+ history.append({"role": "assistant", "content": response})
788
+ audio = await self.tts.synthesize(response)
789
+ await websocket.send(audio)
790
+ else:
791
+ data = json.loads(message)
792
+ await websocket.send(json.dumps({"type": "pong"}))
793
+
794
+ async def start(self, host: str = "localhost", port: int = 8765):
795
+ print(f"Voice Bridge on ws://{host}:{port}")
796
+ async with websockets.serve(self.handle_client, host, port):
797
+ await asyncio.Future()
798
+
799
+
800
+ if __name__ == "__main__":
801
+ bridge = VoiceBridge(stt=MySTT(), tts=MyTTS(), llm=MyLLM())
802
+ asyncio.run(bridge.start())
803
803
  `,
804
804
  description: "Pluggable voice bridge template with abstract STT/TTS/LLM interfaces",
805
805
  },
@@ -814,88 +814,88 @@ if __name__ == "__main__":
814
814
  }
815
815
  // TypeScript custom template
816
816
  const vadBlock = includeVAD
817
- ? `
818
- /** Simple energy-based Voice Activity Detection. */
819
- function detectVoiceActivity(samples: Float32Array, threshold = 0.02): boolean {
820
- let energy = 0;
821
- for (let i = 0; i < samples.length; i++) energy += samples[i] * samples[i];
822
- return Math.sqrt(energy / samples.length) > threshold;
823
- }
817
+ ? `
818
+ /** Simple energy-based Voice Activity Detection. */
819
+ function detectVoiceActivity(samples: Float32Array, threshold = 0.02): boolean {
820
+ let energy = 0;
821
+ for (let i = 0; i < samples.length; i++) energy += samples[i] * samples[i];
822
+ return Math.sqrt(energy / samples.length) > threshold;
823
+ }
824
824
  `
825
825
  : "";
826
826
  return {
827
827
  files: [
828
828
  {
829
829
  path: "src/voice-bridge.ts",
830
- content: `/**
831
- * Custom Voice Bridge Template — Plug in any STT/TTS/LLM.
832
- */
833
- import { WebSocketServer } from "ws";
834
-
835
- // ─── Provider interfaces ─────────────────────────────────────────
836
-
837
- interface STTProvider {
838
- transcribe(audio: Buffer): Promise<string>;
839
- }
840
-
841
- interface TTSProvider {
842
- synthesize(text: string): Promise<Buffer>;
843
- }
844
-
845
- interface LLMProvider {
846
- generate(prompt: string, history: Array<{ role: string; content: string }>): Promise<string>;
847
- }
848
- ${vadBlock}
849
- // ─── Implement your providers here ───────────────────────────────
850
-
851
- class MySTT implements STTProvider {
852
- async transcribe(audio: Buffer): Promise<string> {
853
- // TODO: Replace with your STT implementation
854
- throw new Error("Implement STT provider");
855
- }
856
- }
857
-
858
- class MyTTS implements TTSProvider {
859
- async synthesize(text: string): Promise<Buffer> {
860
- // TODO: Replace with your TTS implementation
861
- throw new Error("Implement TTS provider");
862
- }
863
- }
864
-
865
- class MyLLM implements LLMProvider {
866
- async generate(prompt: string, history: Array<{ role: string; content: string }>): Promise<string> {
867
- // TODO: Replace with your LLM implementation
868
- return \`Echo: \${prompt}\`;
869
- }
870
- }
871
-
872
- // ─── Voice bridge server ─────────────────────────────────────────
873
-
874
- const stt = new MySTT();
875
- const tts = new MyTTS();
876
- const llm = new MyLLM();
877
-
878
- const wss = new WebSocketServer({ port: 8765 });
879
- console.log("Voice Bridge listening on ws://localhost:8765");
880
-
881
- wss.on("connection", (ws) => {
882
- const history: Array<{ role: string; content: string }> = [];
883
-
884
- ws.on("message", async (data: Buffer) => {
885
- try {
886
- const transcript = await stt.transcribe(data);
887
- history.push({ role: "user", content: transcript });
888
-
889
- const response = await llm.generate(transcript, history);
890
- history.push({ role: "assistant", content: response });
891
-
892
- const audio = await tts.synthesize(response);
893
- ws.send(audio);
894
- } catch (err) {
895
- console.error("Pipeline error:", err);
896
- }
897
- });
898
- });
830
+ content: `/**
831
+ * Custom Voice Bridge Template — Plug in any STT/TTS/LLM.
832
+ */
833
+ import { WebSocketServer } from "ws";
834
+
835
+ // ─── Provider interfaces ─────────────────────────────────────────
836
+
837
+ interface STTProvider {
838
+ transcribe(audio: Buffer): Promise<string>;
839
+ }
840
+
841
+ interface TTSProvider {
842
+ synthesize(text: string): Promise<Buffer>;
843
+ }
844
+
845
+ interface LLMProvider {
846
+ generate(prompt: string, history: Array<{ role: string; content: string }>): Promise<string>;
847
+ }
848
+ ${vadBlock}
849
+ // ─── Implement your providers here ───────────────────────────────
850
+
851
+ class MySTT implements STTProvider {
852
+ async transcribe(audio: Buffer): Promise<string> {
853
+ // TODO: Replace with your STT implementation
854
+ throw new Error("Implement STT provider");
855
+ }
856
+ }
857
+
858
+ class MyTTS implements TTSProvider {
859
+ async synthesize(text: string): Promise<Buffer> {
860
+ // TODO: Replace with your TTS implementation
861
+ throw new Error("Implement TTS provider");
862
+ }
863
+ }
864
+
865
+ class MyLLM implements LLMProvider {
866
+ async generate(prompt: string, history: Array<{ role: string; content: string }>): Promise<string> {
867
+ // TODO: Replace with your LLM implementation
868
+ return \`Echo: \${prompt}\`;
869
+ }
870
+ }
871
+
872
+ // ─── Voice bridge server ─────────────────────────────────────────
873
+
874
+ const stt = new MySTT();
875
+ const tts = new MyTTS();
876
+ const llm = new MyLLM();
877
+
878
+ const wss = new WebSocketServer({ port: 8765 });
879
+ console.log("Voice Bridge listening on ws://localhost:8765");
880
+
881
+ wss.on("connection", (ws) => {
882
+ const history: Array<{ role: string; content: string }> = [];
883
+
884
+ ws.on("message", async (data: Buffer) => {
885
+ try {
886
+ const transcript = await stt.transcribe(data);
887
+ history.push({ role: "user", content: transcript });
888
+
889
+ const response = await llm.generate(transcript, history);
890
+ history.push({ role: "assistant", content: response });
891
+
892
+ const audio = await tts.synthesize(response);
893
+ ws.send(audio);
894
+ } catch (err) {
895
+ console.error("Pipeline error:", err);
896
+ }
897
+ });
898
+ });
899
899
  `,
900
900
  description: "Pluggable voice bridge template with STT/TTS/LLM interfaces",
901
901
  },