nodebench-mcp 2.17.0 → 2.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/NODEBENCH_AGENTS.md +2 -2
- package/README.md +514 -82
- package/dist/__tests__/analytics.test.d.ts +11 -0
- package/dist/__tests__/analytics.test.js +546 -0
- package/dist/__tests__/analytics.test.js.map +1 -0
- package/dist/__tests__/dynamicLoading.test.d.ts +1 -0
- package/dist/__tests__/dynamicLoading.test.js +278 -0
- package/dist/__tests__/dynamicLoading.test.js.map +1 -0
- package/dist/__tests__/evalHarness.test.js +1 -1
- package/dist/__tests__/evalHarness.test.js.map +1 -1
- package/dist/__tests__/helpers/answerMatch.js +22 -22
- package/dist/__tests__/presetRealWorldBench.test.js +9 -0
- package/dist/__tests__/presetRealWorldBench.test.js.map +1 -1
- package/dist/__tests__/tools.test.js +1 -1
- package/dist/__tests__/toolsetGatingEval.test.js +9 -1
- package/dist/__tests__/toolsetGatingEval.test.js.map +1 -1
- package/dist/analytics/index.d.ts +10 -0
- package/dist/analytics/index.js +11 -0
- package/dist/analytics/index.js.map +1 -0
- package/dist/analytics/projectDetector.d.ts +19 -0
- package/dist/analytics/projectDetector.js +259 -0
- package/dist/analytics/projectDetector.js.map +1 -0
- package/dist/analytics/schema.d.ts +57 -0
- package/dist/analytics/schema.js +157 -0
- package/dist/analytics/schema.js.map +1 -0
- package/dist/analytics/smartPreset.d.ts +63 -0
- package/dist/analytics/smartPreset.js +300 -0
- package/dist/analytics/smartPreset.js.map +1 -0
- package/dist/analytics/toolTracker.d.ts +59 -0
- package/dist/analytics/toolTracker.js +163 -0
- package/dist/analytics/toolTracker.js.map +1 -0
- package/dist/analytics/usageStats.d.ts +64 -0
- package/dist/analytics/usageStats.js +252 -0
- package/dist/analytics/usageStats.js.map +1 -0
- package/dist/db.js +359 -321
- package/dist/db.js.map +1 -1
- package/dist/index.d.ts +2 -1
- package/dist/index.js +652 -89
- package/dist/index.js.map +1 -1
- package/dist/tools/architectTools.js +13 -13
- package/dist/tools/critterTools.js +14 -14
- package/dist/tools/parallelAgentTools.js +176 -176
- package/dist/tools/patternTools.js +11 -11
- package/dist/tools/progressiveDiscoveryTools.d.ts +5 -1
- package/dist/tools/progressiveDiscoveryTools.js +111 -19
- package/dist/tools/progressiveDiscoveryTools.js.map +1 -1
- package/dist/tools/researchWritingTools.js +42 -42
- package/dist/tools/rssTools.js +396 -396
- package/dist/tools/toolRegistry.d.ts +17 -0
- package/dist/tools/toolRegistry.js +65 -17
- package/dist/tools/toolRegistry.js.map +1 -1
- package/dist/tools/voiceBridgeTools.js +498 -498
- package/dist/toolsetRegistry.d.ts +10 -0
- package/dist/toolsetRegistry.js +84 -0
- package/dist/toolsetRegistry.js.map +1 -0
- package/package.json +4 -4
|
@@ -235,90 +235,90 @@ function rateLatency(perceivedMs) {
|
|
|
235
235
|
// ─── Scaffold templates ──────────────────────────────────────────────────────
|
|
236
236
|
function getScaffoldWhisperEdge(includeVAD) {
|
|
237
237
|
const vadCode = includeVAD
|
|
238
|
-
? `
|
|
239
|
-
import numpy as np
|
|
240
|
-
|
|
241
|
-
def detect_voice_activity(audio_chunk: np.ndarray, threshold: float = 0.02) -> bool:
|
|
242
|
-
"""Simple energy-based Voice Activity Detection."""
|
|
243
|
-
energy = np.sqrt(np.mean(audio_chunk.astype(np.float32) ** 2))
|
|
244
|
-
return energy > threshold
|
|
238
|
+
? `
|
|
239
|
+
import numpy as np
|
|
240
|
+
|
|
241
|
+
def detect_voice_activity(audio_chunk: np.ndarray, threshold: float = 0.02) -> bool:
|
|
242
|
+
"""Simple energy-based Voice Activity Detection."""
|
|
243
|
+
energy = np.sqrt(np.mean(audio_chunk.astype(np.float32) ** 2))
|
|
244
|
+
return energy > threshold
|
|
245
245
|
`
|
|
246
246
|
: "";
|
|
247
247
|
const vadImport = includeVAD ? "from vad import detect_voice_activity\n" : "";
|
|
248
248
|
const vadUsage = includeVAD
|
|
249
|
-
? `
|
|
250
|
-
# Check for voice activity before sending to Whisper
|
|
251
|
-
if not detect_voice_activity(audio_data):
|
|
249
|
+
? `
|
|
250
|
+
# Check for voice activity before sending to Whisper
|
|
251
|
+
if not detect_voice_activity(audio_data):
|
|
252
252
|
continue`
|
|
253
253
|
: "";
|
|
254
254
|
return {
|
|
255
255
|
files: [
|
|
256
256
|
{
|
|
257
257
|
path: "voice_bridge.py",
|
|
258
|
-
content: `"""Voice Bridge: Whisper STT + Edge TTS + WebSocket server."""
|
|
259
|
-
import asyncio
|
|
260
|
-
import json
|
|
261
|
-
import tempfile
|
|
262
|
-
import wave
|
|
263
|
-
import whisper
|
|
264
|
-
import edge_tts
|
|
265
|
-
import websockets
|
|
266
|
-
${vadImport}
|
|
267
|
-
# Load Whisper model (use "tiny" or "base" for speed, "small" for accuracy)
|
|
268
|
-
model = whisper.load_model("base")
|
|
269
|
-
|
|
270
|
-
async def transcribe(audio_data: bytes) -> str:
|
|
271
|
-
"""Transcribe audio bytes using Whisper."""
|
|
272
|
-
with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as f:
|
|
273
|
-
# Write raw PCM as WAV
|
|
274
|
-
with wave.open(f.name, "wb") as wf:
|
|
275
|
-
wf.setnchannels(1)
|
|
276
|
-
wf.setsampwidth(2)
|
|
277
|
-
wf.setframerate(16000)
|
|
278
|
-
wf.writeframes(audio_data)
|
|
279
|
-
result = model.transcribe(f.name, language="en")
|
|
280
|
-
return result["text"].strip()
|
|
281
|
-
|
|
282
|
-
async def synthesize(text: str) -> bytes:
|
|
283
|
-
"""Synthesize speech using Edge TTS."""
|
|
284
|
-
communicate = edge_tts.Communicate(text, voice="en-US-AriaNeural")
|
|
285
|
-
audio_chunks = []
|
|
286
|
-
async for chunk in communicate.stream():
|
|
287
|
-
if chunk["type"] == "audio":
|
|
288
|
-
audio_chunks.append(chunk["data"])
|
|
289
|
-
return b"".join(audio_chunks)
|
|
290
|
-
|
|
291
|
-
async def handle_client(websocket):
|
|
292
|
-
"""Handle a WebSocket client connection."""
|
|
293
|
-
print("Client connected")
|
|
294
|
-
try:
|
|
295
|
-
async for message in websocket:
|
|
296
|
-
if isinstance(message, bytes):${vadUsage}
|
|
297
|
-
# Transcribe speech to text
|
|
298
|
-
transcript = await transcribe(message)
|
|
299
|
-
print(f"User: {transcript}")
|
|
300
|
-
|
|
301
|
-
# TODO: Send transcript to your LLM and get response
|
|
302
|
-
llm_response = f"Echo: {transcript}"
|
|
303
|
-
|
|
304
|
-
# Synthesize response to speech
|
|
305
|
-
audio = await synthesize(llm_response)
|
|
306
|
-
await websocket.send(audio)
|
|
307
|
-
else:
|
|
308
|
-
# Handle text/control messages
|
|
309
|
-
data = json.loads(message)
|
|
310
|
-
if data.get("type") == "ping":
|
|
311
|
-
await websocket.send(json.dumps({{"type": "pong"}}))
|
|
312
|
-
except websockets.exceptions.ConnectionClosed:
|
|
313
|
-
print("Client disconnected")
|
|
314
|
-
|
|
315
|
-
async def main():
|
|
316
|
-
print("Voice Bridge starting on ws://localhost:8765")
|
|
317
|
-
async with websockets.serve(handle_client, "localhost", 8765):
|
|
318
|
-
await asyncio.Future() # Run forever
|
|
319
|
-
|
|
320
|
-
if __name__ == "__main__":
|
|
321
|
-
asyncio.run(main())
|
|
258
|
+
content: `"""Voice Bridge: Whisper STT + Edge TTS + WebSocket server."""
|
|
259
|
+
import asyncio
|
|
260
|
+
import json
|
|
261
|
+
import tempfile
|
|
262
|
+
import wave
|
|
263
|
+
import whisper
|
|
264
|
+
import edge_tts
|
|
265
|
+
import websockets
|
|
266
|
+
${vadImport}
|
|
267
|
+
# Load Whisper model (use "tiny" or "base" for speed, "small" for accuracy)
|
|
268
|
+
model = whisper.load_model("base")
|
|
269
|
+
|
|
270
|
+
async def transcribe(audio_data: bytes) -> str:
|
|
271
|
+
"""Transcribe audio bytes using Whisper."""
|
|
272
|
+
with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as f:
|
|
273
|
+
# Write raw PCM as WAV
|
|
274
|
+
with wave.open(f.name, "wb") as wf:
|
|
275
|
+
wf.setnchannels(1)
|
|
276
|
+
wf.setsampwidth(2)
|
|
277
|
+
wf.setframerate(16000)
|
|
278
|
+
wf.writeframes(audio_data)
|
|
279
|
+
result = model.transcribe(f.name, language="en")
|
|
280
|
+
return result["text"].strip()
|
|
281
|
+
|
|
282
|
+
async def synthesize(text: str) -> bytes:
|
|
283
|
+
"""Synthesize speech using Edge TTS."""
|
|
284
|
+
communicate = edge_tts.Communicate(text, voice="en-US-AriaNeural")
|
|
285
|
+
audio_chunks = []
|
|
286
|
+
async for chunk in communicate.stream():
|
|
287
|
+
if chunk["type"] == "audio":
|
|
288
|
+
audio_chunks.append(chunk["data"])
|
|
289
|
+
return b"".join(audio_chunks)
|
|
290
|
+
|
|
291
|
+
async def handle_client(websocket):
|
|
292
|
+
"""Handle a WebSocket client connection."""
|
|
293
|
+
print("Client connected")
|
|
294
|
+
try:
|
|
295
|
+
async for message in websocket:
|
|
296
|
+
if isinstance(message, bytes):${vadUsage}
|
|
297
|
+
# Transcribe speech to text
|
|
298
|
+
transcript = await transcribe(message)
|
|
299
|
+
print(f"User: {transcript}")
|
|
300
|
+
|
|
301
|
+
# TODO: Send transcript to your LLM and get response
|
|
302
|
+
llm_response = f"Echo: {transcript}"
|
|
303
|
+
|
|
304
|
+
# Synthesize response to speech
|
|
305
|
+
audio = await synthesize(llm_response)
|
|
306
|
+
await websocket.send(audio)
|
|
307
|
+
else:
|
|
308
|
+
# Handle text/control messages
|
|
309
|
+
data = json.loads(message)
|
|
310
|
+
if data.get("type") == "ping":
|
|
311
|
+
await websocket.send(json.dumps({{"type": "pong"}}))
|
|
312
|
+
except websockets.exceptions.ConnectionClosed:
|
|
313
|
+
print("Client disconnected")
|
|
314
|
+
|
|
315
|
+
async def main():
|
|
316
|
+
print("Voice Bridge starting on ws://localhost:8765")
|
|
317
|
+
async with websockets.serve(handle_client, "localhost", 8765):
|
|
318
|
+
await asyncio.Future() # Run forever
|
|
319
|
+
|
|
320
|
+
if __name__ == "__main__":
|
|
321
|
+
asyncio.run(main())
|
|
322
322
|
`,
|
|
323
323
|
description: "Main voice bridge server: Whisper STT + Edge TTS over WebSocket",
|
|
324
324
|
},
|
|
@@ -333,10 +333,10 @@ if __name__ == "__main__":
|
|
|
333
333
|
: []),
|
|
334
334
|
{
|
|
335
335
|
path: "requirements.txt",
|
|
336
|
-
content: `openai-whisper
|
|
337
|
-
edge-tts
|
|
338
|
-
websockets
|
|
339
|
-
numpy
|
|
336
|
+
content: `openai-whisper
|
|
337
|
+
edge-tts
|
|
338
|
+
websockets
|
|
339
|
+
numpy
|
|
340
340
|
`,
|
|
341
341
|
description: "Python dependencies",
|
|
342
342
|
},
|
|
@@ -353,88 +353,88 @@ numpy
|
|
|
353
353
|
}
|
|
354
354
|
function getScaffoldDeepgramCartesia(includeVAD) {
|
|
355
355
|
const vadBlock = includeVAD
|
|
356
|
-
? `
|
|
357
|
-
// Simple energy-based VAD
|
|
358
|
-
function detectVoiceActivity(samples: Float32Array, threshold = 0.02): boolean {
|
|
359
|
-
let energy = 0;
|
|
360
|
-
for (let i = 0; i < samples.length; i++) energy += samples[i] * samples[i];
|
|
361
|
-
return Math.sqrt(energy / samples.length) > threshold;
|
|
362
|
-
}
|
|
356
|
+
? `
|
|
357
|
+
// Simple energy-based VAD
|
|
358
|
+
function detectVoiceActivity(samples: Float32Array, threshold = 0.02): boolean {
|
|
359
|
+
let energy = 0;
|
|
360
|
+
for (let i = 0; i < samples.length; i++) energy += samples[i] * samples[i];
|
|
361
|
+
return Math.sqrt(energy / samples.length) > threshold;
|
|
362
|
+
}
|
|
363
363
|
`
|
|
364
364
|
: "";
|
|
365
365
|
return {
|
|
366
366
|
files: [
|
|
367
367
|
{
|
|
368
368
|
path: "src/voice-bridge.ts",
|
|
369
|
-
content: `/**
|
|
370
|
-
* Voice Bridge: Deepgram STT + Cartesia TTS (TypeScript)
|
|
371
|
-
*
|
|
372
|
-
* Requires:
|
|
373
|
-
* DEEPGRAM_API_KEY — from https://console.deepgram.com
|
|
374
|
-
* CARTESIA_API_KEY — from https://play.cartesia.ai
|
|
375
|
-
*/
|
|
376
|
-
import { createClient, LiveTranscriptionEvents } from "@deepgram/sdk";
|
|
377
|
-
import Cartesia from "@cartesia/cartesia-js";
|
|
378
|
-
import { WebSocketServer } from "ws";
|
|
379
|
-
|
|
380
|
-
const deepgram = createClient(process.env.DEEPGRAM_API_KEY!);
|
|
381
|
-
const cartesia = new Cartesia({ apiKey: process.env.CARTESIA_API_KEY! });
|
|
382
|
-
${vadBlock}
|
|
383
|
-
const wss = new WebSocketServer({ port: 8765 });
|
|
384
|
-
console.log("Voice Bridge listening on ws://localhost:8765");
|
|
385
|
-
|
|
386
|
-
wss.on("connection", (ws) => {
|
|
387
|
-
console.log("Client connected");
|
|
388
|
-
|
|
389
|
-
// Set up Deepgram live transcription
|
|
390
|
-
const dgConn = deepgram.listen.live({
|
|
391
|
-
model: "nova-2",
|
|
392
|
-
language: "en",
|
|
393
|
-
smart_format: true,
|
|
394
|
-
interim_results: false,
|
|
395
|
-
});
|
|
396
|
-
|
|
397
|
-
dgConn.on(LiveTranscriptionEvents.Open, () => {
|
|
398
|
-
console.log("Deepgram connection open");
|
|
399
|
-
});
|
|
400
|
-
|
|
401
|
-
dgConn.on(LiveTranscriptionEvents.Transcript, async (data) => {
|
|
402
|
-
const transcript = data.channel?.alternatives?.[0]?.transcript;
|
|
403
|
-
if (!transcript) return;
|
|
404
|
-
|
|
405
|
-
console.log(\`User: \${transcript}\`);
|
|
406
|
-
|
|
407
|
-
// TODO: Send transcript to your LLM and get response
|
|
408
|
-
const llmResponse = \`Echo: \${transcript}\`;
|
|
409
|
-
|
|
410
|
-
// Synthesize with Cartesia (streaming)
|
|
411
|
-
const ttsResponse = await cartesia.tts.sse({
|
|
412
|
-
modelId: "sonic-english",
|
|
413
|
-
transcript: llmResponse,
|
|
414
|
-
voice: { mode: "id", id: "a0e99841-438c-4a64-b679-ae501e7d6091" },
|
|
415
|
-
output_format: { container: "raw", encoding: "pcm_s16le", sample_rate: 24000 },
|
|
416
|
-
});
|
|
417
|
-
|
|
418
|
-
for await (const chunk of ttsResponse) {
|
|
419
|
-
if (ws.readyState === ws.OPEN) {
|
|
420
|
-
ws.send(chunk);
|
|
421
|
-
}
|
|
422
|
-
}
|
|
423
|
-
});
|
|
424
|
-
|
|
425
|
-
dgConn.on(LiveTranscriptionEvents.Error, (err) => {
|
|
426
|
-
console.error("Deepgram error:", err);
|
|
427
|
-
});
|
|
428
|
-
|
|
429
|
-
ws.on("message", (data: Buffer) => {${includeVAD ? "\n // VAD check\n const samples = new Float32Array(data.buffer);\n if (!detectVoiceActivity(samples)) return;\n" : ""}
|
|
430
|
-
dgConn.send(data);
|
|
431
|
-
});
|
|
432
|
-
|
|
433
|
-
ws.on("close", () => {
|
|
434
|
-
dgConn.finish();
|
|
435
|
-
console.log("Client disconnected");
|
|
436
|
-
});
|
|
437
|
-
});
|
|
369
|
+
content: `/**
|
|
370
|
+
* Voice Bridge: Deepgram STT + Cartesia TTS (TypeScript)
|
|
371
|
+
*
|
|
372
|
+
* Requires:
|
|
373
|
+
* DEEPGRAM_API_KEY — from https://console.deepgram.com
|
|
374
|
+
* CARTESIA_API_KEY — from https://play.cartesia.ai
|
|
375
|
+
*/
|
|
376
|
+
import { createClient, LiveTranscriptionEvents } from "@deepgram/sdk";
|
|
377
|
+
import Cartesia from "@cartesia/cartesia-js";
|
|
378
|
+
import { WebSocketServer } from "ws";
|
|
379
|
+
|
|
380
|
+
const deepgram = createClient(process.env.DEEPGRAM_API_KEY!);
|
|
381
|
+
const cartesia = new Cartesia({ apiKey: process.env.CARTESIA_API_KEY! });
|
|
382
|
+
${vadBlock}
|
|
383
|
+
const wss = new WebSocketServer({ port: 8765 });
|
|
384
|
+
console.log("Voice Bridge listening on ws://localhost:8765");
|
|
385
|
+
|
|
386
|
+
wss.on("connection", (ws) => {
|
|
387
|
+
console.log("Client connected");
|
|
388
|
+
|
|
389
|
+
// Set up Deepgram live transcription
|
|
390
|
+
const dgConn = deepgram.listen.live({
|
|
391
|
+
model: "nova-2",
|
|
392
|
+
language: "en",
|
|
393
|
+
smart_format: true,
|
|
394
|
+
interim_results: false,
|
|
395
|
+
});
|
|
396
|
+
|
|
397
|
+
dgConn.on(LiveTranscriptionEvents.Open, () => {
|
|
398
|
+
console.log("Deepgram connection open");
|
|
399
|
+
});
|
|
400
|
+
|
|
401
|
+
dgConn.on(LiveTranscriptionEvents.Transcript, async (data) => {
|
|
402
|
+
const transcript = data.channel?.alternatives?.[0]?.transcript;
|
|
403
|
+
if (!transcript) return;
|
|
404
|
+
|
|
405
|
+
console.log(\`User: \${transcript}\`);
|
|
406
|
+
|
|
407
|
+
// TODO: Send transcript to your LLM and get response
|
|
408
|
+
const llmResponse = \`Echo: \${transcript}\`;
|
|
409
|
+
|
|
410
|
+
// Synthesize with Cartesia (streaming)
|
|
411
|
+
const ttsResponse = await cartesia.tts.sse({
|
|
412
|
+
modelId: "sonic-english",
|
|
413
|
+
transcript: llmResponse,
|
|
414
|
+
voice: { mode: "id", id: "a0e99841-438c-4a64-b679-ae501e7d6091" },
|
|
415
|
+
output_format: { container: "raw", encoding: "pcm_s16le", sample_rate: 24000 },
|
|
416
|
+
});
|
|
417
|
+
|
|
418
|
+
for await (const chunk of ttsResponse) {
|
|
419
|
+
if (ws.readyState === ws.OPEN) {
|
|
420
|
+
ws.send(chunk);
|
|
421
|
+
}
|
|
422
|
+
}
|
|
423
|
+
});
|
|
424
|
+
|
|
425
|
+
dgConn.on(LiveTranscriptionEvents.Error, (err) => {
|
|
426
|
+
console.error("Deepgram error:", err);
|
|
427
|
+
});
|
|
428
|
+
|
|
429
|
+
ws.on("message", (data: Buffer) => {${includeVAD ? "\n // VAD check\n const samples = new Float32Array(data.buffer);\n if (!detectVoiceActivity(samples)) return;\n" : ""}
|
|
430
|
+
dgConn.send(data);
|
|
431
|
+
});
|
|
432
|
+
|
|
433
|
+
ws.on("close", () => {
|
|
434
|
+
dgConn.finish();
|
|
435
|
+
console.log("Client disconnected");
|
|
436
|
+
});
|
|
437
|
+
});
|
|
438
438
|
`,
|
|
439
439
|
description: "TypeScript voice bridge: Deepgram Nova-2 STT + Cartesia Sonic streaming TTS",
|
|
440
440
|
},
|
|
@@ -469,111 +469,111 @@ function getScaffoldBrowserWebspeech() {
|
|
|
469
469
|
files: [
|
|
470
470
|
{
|
|
471
471
|
path: "index.html",
|
|
472
|
-
content: `<!DOCTYPE html>
|
|
473
|
-
<html lang="en">
|
|
474
|
-
<head>
|
|
475
|
-
<meta charset="UTF-8">
|
|
476
|
-
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
477
|
-
<title>Voice Bridge — Web Speech API</title>
|
|
478
|
-
<style>
|
|
479
|
-
body { font-family: system-ui, sans-serif; max-width: 600px; margin: 2rem auto; padding: 0 1rem; }
|
|
480
|
-
#status { padding: 0.5rem; border-radius: 4px; margin: 1rem 0; }
|
|
481
|
-
.listening { background: #dcfce7; color: #166534; }
|
|
482
|
-
.speaking { background: #dbeafe; color: #1e40af; }
|
|
483
|
-
.idle { background: #f3f4f6; color: #374151; }
|
|
484
|
-
#transcript { white-space: pre-wrap; border: 1px solid #d1d5db; padding: 1rem; border-radius: 4px; min-height: 200px; max-height: 400px; overflow-y: auto; }
|
|
485
|
-
button { padding: 0.75rem 1.5rem; font-size: 1rem; border: none; border-radius: 4px; cursor: pointer; margin: 0.25rem; }
|
|
486
|
-
#startBtn { background: #2563eb; color: white; }
|
|
487
|
-
#startBtn:disabled { background: #93c5fd; cursor: not-allowed; }
|
|
488
|
-
#stopBtn { background: #dc2626; color: white; }
|
|
489
|
-
</style>
|
|
490
|
-
</head>
|
|
491
|
-
<body>
|
|
492
|
-
<h1>Voice Bridge</h1>
|
|
493
|
-
<p>Browser-native STT + TTS using the Web Speech API. No server needed.</p>
|
|
494
|
-
<div>
|
|
495
|
-
<button id="startBtn" onclick="startListening()">Start Listening</button>
|
|
496
|
-
<button id="stopBtn" onclick="stopListening()">Stop</button>
|
|
497
|
-
</div>
|
|
498
|
-
<div id="status" class="idle">Idle</div>
|
|
499
|
-
<div id="transcript"></div>
|
|
500
|
-
|
|
501
|
-
<script>
|
|
502
|
-
const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
|
|
503
|
-
if (!SpeechRecognition) {
|
|
504
|
-
document.getElementById("status").textContent = "Web Speech API not supported in this browser.";
|
|
505
|
-
}
|
|
506
|
-
|
|
507
|
-
const recognition = new SpeechRecognition();
|
|
508
|
-
recognition.continuous = true;
|
|
509
|
-
recognition.interimResults = true;
|
|
510
|
-
recognition.lang = "en-US";
|
|
511
|
-
|
|
512
|
-
const synth = window.speechSynthesis;
|
|
513
|
-
const transcriptEl = document.getElementById("transcript");
|
|
514
|
-
const statusEl = document.getElementById("status");
|
|
515
|
-
let isListening = false;
|
|
516
|
-
|
|
517
|
-
function setStatus(text, cls) {
|
|
518
|
-
statusEl.textContent = text;
|
|
519
|
-
statusEl.className = cls;
|
|
520
|
-
}
|
|
521
|
-
|
|
522
|
-
function appendTranscript(role, text) {
|
|
523
|
-
transcriptEl.textContent += role + ": " + text + "\\n";
|
|
524
|
-
transcriptEl.scrollTop = transcriptEl.scrollHeight;
|
|
525
|
-
}
|
|
526
|
-
|
|
527
|
-
recognition.onresult = (event) => {
|
|
528
|
-
const last = event.results[event.results.length - 1];
|
|
529
|
-
if (last.isFinal) {
|
|
530
|
-
const transcript = last[0].transcript.trim();
|
|
531
|
-
if (!transcript) return;
|
|
532
|
-
|
|
533
|
-
appendTranscript("You", transcript);
|
|
534
|
-
setStatus("Processing...", "speaking");
|
|
535
|
-
|
|
536
|
-
// TODO: Send transcript to your LLM API and get response
|
|
537
|
-
const llmResponse = "Echo: " + transcript;
|
|
538
|
-
|
|
539
|
-
appendTranscript("Assistant", llmResponse);
|
|
540
|
-
|
|
541
|
-
// Speak the response
|
|
542
|
-
const utterance = new SpeechSynthesisUtterance(llmResponse);
|
|
543
|
-
utterance.onstart = () => setStatus("Speaking...", "speaking");
|
|
544
|
-
utterance.onend = () => setStatus("Listening...", "listening");
|
|
545
|
-
synth.speak(utterance);
|
|
546
|
-
}
|
|
547
|
-
};
|
|
548
|
-
|
|
549
|
-
recognition.onerror = (event) => {
|
|
550
|
-
console.error("Speech recognition error:", event.error);
|
|
551
|
-
if (event.error !== "no-speech") {
|
|
552
|
-
setStatus("Error: " + event.error, "idle");
|
|
553
|
-
}
|
|
554
|
-
};
|
|
555
|
-
|
|
556
|
-
recognition.onend = () => {
|
|
557
|
-
if (isListening) recognition.start(); // Auto-restart
|
|
558
|
-
};
|
|
559
|
-
|
|
560
|
-
function startListening() {
|
|
561
|
-
isListening = true;
|
|
562
|
-
recognition.start();
|
|
563
|
-
setStatus("Listening...", "listening");
|
|
564
|
-
document.getElementById("startBtn").disabled = true;
|
|
565
|
-
}
|
|
566
|
-
|
|
567
|
-
function stopListening() {
|
|
568
|
-
isListening = false;
|
|
569
|
-
recognition.stop();
|
|
570
|
-
synth.cancel();
|
|
571
|
-
setStatus("Idle", "idle");
|
|
572
|
-
document.getElementById("startBtn").disabled = false;
|
|
573
|
-
}
|
|
574
|
-
</script>
|
|
575
|
-
</body>
|
|
576
|
-
</html>
|
|
472
|
+
content: `<!DOCTYPE html>
|
|
473
|
+
<html lang="en">
|
|
474
|
+
<head>
|
|
475
|
+
<meta charset="UTF-8">
|
|
476
|
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
477
|
+
<title>Voice Bridge — Web Speech API</title>
|
|
478
|
+
<style>
|
|
479
|
+
body { font-family: system-ui, sans-serif; max-width: 600px; margin: 2rem auto; padding: 0 1rem; }
|
|
480
|
+
#status { padding: 0.5rem; border-radius: 4px; margin: 1rem 0; }
|
|
481
|
+
.listening { background: #dcfce7; color: #166534; }
|
|
482
|
+
.speaking { background: #dbeafe; color: #1e40af; }
|
|
483
|
+
.idle { background: #f3f4f6; color: #374151; }
|
|
484
|
+
#transcript { white-space: pre-wrap; border: 1px solid #d1d5db; padding: 1rem; border-radius: 4px; min-height: 200px; max-height: 400px; overflow-y: auto; }
|
|
485
|
+
button { padding: 0.75rem 1.5rem; font-size: 1rem; border: none; border-radius: 4px; cursor: pointer; margin: 0.25rem; }
|
|
486
|
+
#startBtn { background: #2563eb; color: white; }
|
|
487
|
+
#startBtn:disabled { background: #93c5fd; cursor: not-allowed; }
|
|
488
|
+
#stopBtn { background: #dc2626; color: white; }
|
|
489
|
+
</style>
|
|
490
|
+
</head>
|
|
491
|
+
<body>
|
|
492
|
+
<h1>Voice Bridge</h1>
|
|
493
|
+
<p>Browser-native STT + TTS using the Web Speech API. No server needed.</p>
|
|
494
|
+
<div>
|
|
495
|
+
<button id="startBtn" onclick="startListening()">Start Listening</button>
|
|
496
|
+
<button id="stopBtn" onclick="stopListening()">Stop</button>
|
|
497
|
+
</div>
|
|
498
|
+
<div id="status" class="idle">Idle</div>
|
|
499
|
+
<div id="transcript"></div>
|
|
500
|
+
|
|
501
|
+
<script>
|
|
502
|
+
const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
|
|
503
|
+
if (!SpeechRecognition) {
|
|
504
|
+
document.getElementById("status").textContent = "Web Speech API not supported in this browser.";
|
|
505
|
+
}
|
|
506
|
+
|
|
507
|
+
const recognition = new SpeechRecognition();
|
|
508
|
+
recognition.continuous = true;
|
|
509
|
+
recognition.interimResults = true;
|
|
510
|
+
recognition.lang = "en-US";
|
|
511
|
+
|
|
512
|
+
const synth = window.speechSynthesis;
|
|
513
|
+
const transcriptEl = document.getElementById("transcript");
|
|
514
|
+
const statusEl = document.getElementById("status");
|
|
515
|
+
let isListening = false;
|
|
516
|
+
|
|
517
|
+
function setStatus(text, cls) {
|
|
518
|
+
statusEl.textContent = text;
|
|
519
|
+
statusEl.className = cls;
|
|
520
|
+
}
|
|
521
|
+
|
|
522
|
+
function appendTranscript(role, text) {
|
|
523
|
+
transcriptEl.textContent += role + ": " + text + "\\n";
|
|
524
|
+
transcriptEl.scrollTop = transcriptEl.scrollHeight;
|
|
525
|
+
}
|
|
526
|
+
|
|
527
|
+
recognition.onresult = (event) => {
|
|
528
|
+
const last = event.results[event.results.length - 1];
|
|
529
|
+
if (last.isFinal) {
|
|
530
|
+
const transcript = last[0].transcript.trim();
|
|
531
|
+
if (!transcript) return;
|
|
532
|
+
|
|
533
|
+
appendTranscript("You", transcript);
|
|
534
|
+
setStatus("Processing...", "speaking");
|
|
535
|
+
|
|
536
|
+
// TODO: Send transcript to your LLM API and get response
|
|
537
|
+
const llmResponse = "Echo: " + transcript;
|
|
538
|
+
|
|
539
|
+
appendTranscript("Assistant", llmResponse);
|
|
540
|
+
|
|
541
|
+
// Speak the response
|
|
542
|
+
const utterance = new SpeechSynthesisUtterance(llmResponse);
|
|
543
|
+
utterance.onstart = () => setStatus("Speaking...", "speaking");
|
|
544
|
+
utterance.onend = () => setStatus("Listening...", "listening");
|
|
545
|
+
synth.speak(utterance);
|
|
546
|
+
}
|
|
547
|
+
};
|
|
548
|
+
|
|
549
|
+
recognition.onerror = (event) => {
|
|
550
|
+
console.error("Speech recognition error:", event.error);
|
|
551
|
+
if (event.error !== "no-speech") {
|
|
552
|
+
setStatus("Error: " + event.error, "idle");
|
|
553
|
+
}
|
|
554
|
+
};
|
|
555
|
+
|
|
556
|
+
recognition.onend = () => {
|
|
557
|
+
if (isListening) recognition.start(); // Auto-restart
|
|
558
|
+
};
|
|
559
|
+
|
|
560
|
+
function startListening() {
|
|
561
|
+
isListening = true;
|
|
562
|
+
recognition.start();
|
|
563
|
+
setStatus("Listening...", "listening");
|
|
564
|
+
document.getElementById("startBtn").disabled = true;
|
|
565
|
+
}
|
|
566
|
+
|
|
567
|
+
function stopListening() {
|
|
568
|
+
isListening = false;
|
|
569
|
+
recognition.stop();
|
|
570
|
+
synth.cancel();
|
|
571
|
+
setStatus("Idle", "idle");
|
|
572
|
+
document.getElementById("startBtn").disabled = false;
|
|
573
|
+
}
|
|
574
|
+
</script>
|
|
575
|
+
</body>
|
|
576
|
+
</html>
|
|
577
577
|
`,
|
|
578
578
|
description: "Self-contained HTML page using Web Speech API for both STT and TTS",
|
|
579
579
|
},
|
|
@@ -589,97 +589,97 @@ function getScaffoldBrowserWebspeech() {
|
|
|
589
589
|
}
|
|
590
590
|
function getScaffoldWhisperPiper(includeVAD) {
|
|
591
591
|
const vadCode = includeVAD
|
|
592
|
-
? `
|
|
593
|
-
import numpy as np
|
|
594
|
-
|
|
595
|
-
def detect_voice_activity(audio_chunk: np.ndarray, threshold: float = 0.02) -> bool:
|
|
596
|
-
"""Simple energy-based Voice Activity Detection."""
|
|
597
|
-
energy = np.sqrt(np.mean(audio_chunk.astype(np.float32) ** 2))
|
|
598
|
-
return energy > threshold
|
|
592
|
+
? `
|
|
593
|
+
import numpy as np
|
|
594
|
+
|
|
595
|
+
def detect_voice_activity(audio_chunk: np.ndarray, threshold: float = 0.02) -> bool:
|
|
596
|
+
"""Simple energy-based Voice Activity Detection."""
|
|
597
|
+
energy = np.sqrt(np.mean(audio_chunk.astype(np.float32) ** 2))
|
|
598
|
+
return energy > threshold
|
|
599
599
|
`
|
|
600
600
|
: "";
|
|
601
601
|
const vadImport = includeVAD ? "from vad import detect_voice_activity\n" : "";
|
|
602
602
|
const vadUsage = includeVAD
|
|
603
|
-
? `
|
|
604
|
-
# Check for voice activity
|
|
605
|
-
if not detect_voice_activity(audio_data):
|
|
603
|
+
? `
|
|
604
|
+
# Check for voice activity
|
|
605
|
+
if not detect_voice_activity(audio_data):
|
|
606
606
|
continue`
|
|
607
607
|
: "";
|
|
608
608
|
return {
|
|
609
609
|
files: [
|
|
610
610
|
{
|
|
611
611
|
path: "voice_bridge.py",
|
|
612
|
-
content: `"""Fully Local Voice Bridge: Whisper STT + Piper TTS. No cloud dependencies."""
|
|
613
|
-
import asyncio
|
|
614
|
-
import json
|
|
615
|
-
import subprocess
|
|
616
|
-
import tempfile
|
|
617
|
-
import wave
|
|
618
|
-
import whisper
|
|
619
|
-
import websockets
|
|
620
|
-
${vadImport}
|
|
621
|
-
# Load Whisper model
|
|
622
|
-
model = whisper.load_model("base")
|
|
623
|
-
|
|
624
|
-
# Piper TTS configuration
|
|
625
|
-
PIPER_MODEL = "en_US-lessac-medium" # Download from https://github.com/rhasspy/piper/releases
|
|
626
|
-
PIPER_BIN = "piper" # Ensure piper is on PATH
|
|
627
|
-
|
|
628
|
-
async def transcribe(audio_data: bytes) -> str:
|
|
629
|
-
"""Transcribe audio bytes using Whisper."""
|
|
630
|
-
with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as f:
|
|
631
|
-
with wave.open(f.name, "wb") as wf:
|
|
632
|
-
wf.setnchannels(1)
|
|
633
|
-
wf.setsampwidth(2)
|
|
634
|
-
wf.setframerate(16000)
|
|
635
|
-
wf.writeframes(audio_data)
|
|
636
|
-
result = model.transcribe(f.name, language="en")
|
|
637
|
-
return result["text"].strip()
|
|
638
|
-
|
|
639
|
-
async def synthesize(text: str) -> bytes:
|
|
640
|
-
"""Synthesize speech using Piper TTS (local, ONNX-based)."""
|
|
641
|
-
with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as f:
|
|
642
|
-
proc = await asyncio.create_subprocess_exec(
|
|
643
|
-
PIPER_BIN,
|
|
644
|
-
"--model", PIPER_MODEL,
|
|
645
|
-
"--output_file", f.name,
|
|
646
|
-
stdin=asyncio.subprocess.PIPE,
|
|
647
|
-
stdout=asyncio.subprocess.PIPE,
|
|
648
|
-
stderr=asyncio.subprocess.PIPE,
|
|
649
|
-
)
|
|
650
|
-
await proc.communicate(input=text.encode("utf-8"))
|
|
651
|
-
with open(f.name, "rb") as audio:
|
|
652
|
-
return audio.read()
|
|
653
|
-
|
|
654
|
-
async def handle_client(websocket):
|
|
655
|
-
"""Handle a WebSocket client connection."""
|
|
656
|
-
print("Client connected")
|
|
657
|
-
try:
|
|
658
|
-
async for message in websocket:
|
|
659
|
-
if isinstance(message, bytes):${vadUsage}
|
|
660
|
-
transcript = await transcribe(message)
|
|
661
|
-
print(f"User: {transcript}")
|
|
662
|
-
|
|
663
|
-
# TODO: Send to local LLM (e.g. llama.cpp server)
|
|
664
|
-
llm_response = f"Echo: {transcript}"
|
|
665
|
-
|
|
666
|
-
audio = await synthesize(llm_response)
|
|
667
|
-
await websocket.send(audio)
|
|
668
|
-
else:
|
|
669
|
-
data = json.loads(message)
|
|
670
|
-
if data.get("type") == "ping":
|
|
671
|
-
await websocket.send(json.dumps({{"type": "pong"}}))
|
|
672
|
-
except websockets.exceptions.ConnectionClosed:
|
|
673
|
-
print("Client disconnected")
|
|
674
|
-
|
|
675
|
-
async def main():
|
|
676
|
-
print("Local Voice Bridge starting on ws://localhost:8765")
|
|
677
|
-
print("Fully offline — no cloud APIs used.")
|
|
678
|
-
async with websockets.serve(handle_client, "localhost", 8765):
|
|
679
|
-
await asyncio.Future()
|
|
680
|
-
|
|
681
|
-
if __name__ == "__main__":
|
|
682
|
-
asyncio.run(main())
|
|
612
|
+
content: `"""Fully Local Voice Bridge: Whisper STT + Piper TTS. No cloud dependencies."""
|
|
613
|
+
import asyncio
|
|
614
|
+
import json
|
|
615
|
+
import subprocess
|
|
616
|
+
import tempfile
|
|
617
|
+
import wave
|
|
618
|
+
import whisper
|
|
619
|
+
import websockets
|
|
620
|
+
${vadImport}
|
|
621
|
+
# Load Whisper model
|
|
622
|
+
model = whisper.load_model("base")
|
|
623
|
+
|
|
624
|
+
# Piper TTS configuration
|
|
625
|
+
PIPER_MODEL = "en_US-lessac-medium" # Download from https://github.com/rhasspy/piper/releases
|
|
626
|
+
PIPER_BIN = "piper" # Ensure piper is on PATH
|
|
627
|
+
|
|
628
|
+
async def transcribe(audio_data: bytes) -> str:
|
|
629
|
+
"""Transcribe audio bytes using Whisper."""
|
|
630
|
+
with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as f:
|
|
631
|
+
with wave.open(f.name, "wb") as wf:
|
|
632
|
+
wf.setnchannels(1)
|
|
633
|
+
wf.setsampwidth(2)
|
|
634
|
+
wf.setframerate(16000)
|
|
635
|
+
wf.writeframes(audio_data)
|
|
636
|
+
result = model.transcribe(f.name, language="en")
|
|
637
|
+
return result["text"].strip()
|
|
638
|
+
|
|
639
|
+
async def synthesize(text: str) -> bytes:
|
|
640
|
+
"""Synthesize speech using Piper TTS (local, ONNX-based)."""
|
|
641
|
+
with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as f:
|
|
642
|
+
proc = await asyncio.create_subprocess_exec(
|
|
643
|
+
PIPER_BIN,
|
|
644
|
+
"--model", PIPER_MODEL,
|
|
645
|
+
"--output_file", f.name,
|
|
646
|
+
stdin=asyncio.subprocess.PIPE,
|
|
647
|
+
stdout=asyncio.subprocess.PIPE,
|
|
648
|
+
stderr=asyncio.subprocess.PIPE,
|
|
649
|
+
)
|
|
650
|
+
await proc.communicate(input=text.encode("utf-8"))
|
|
651
|
+
with open(f.name, "rb") as audio:
|
|
652
|
+
return audio.read()
|
|
653
|
+
|
|
654
|
+
async def handle_client(websocket):
|
|
655
|
+
"""Handle a WebSocket client connection."""
|
|
656
|
+
print("Client connected")
|
|
657
|
+
try:
|
|
658
|
+
async for message in websocket:
|
|
659
|
+
if isinstance(message, bytes):${vadUsage}
|
|
660
|
+
transcript = await transcribe(message)
|
|
661
|
+
print(f"User: {transcript}")
|
|
662
|
+
|
|
663
|
+
# TODO: Send to local LLM (e.g. llama.cpp server)
|
|
664
|
+
llm_response = f"Echo: {transcript}"
|
|
665
|
+
|
|
666
|
+
audio = await synthesize(llm_response)
|
|
667
|
+
await websocket.send(audio)
|
|
668
|
+
else:
|
|
669
|
+
data = json.loads(message)
|
|
670
|
+
if data.get("type") == "ping":
|
|
671
|
+
await websocket.send(json.dumps({{"type": "pong"}}))
|
|
672
|
+
except websockets.exceptions.ConnectionClosed:
|
|
673
|
+
print("Client disconnected")
|
|
674
|
+
|
|
675
|
+
async def main():
|
|
676
|
+
print("Local Voice Bridge starting on ws://localhost:8765")
|
|
677
|
+
print("Fully offline — no cloud APIs used.")
|
|
678
|
+
async with websockets.serve(handle_client, "localhost", 8765):
|
|
679
|
+
await asyncio.Future()
|
|
680
|
+
|
|
681
|
+
if __name__ == "__main__":
|
|
682
|
+
asyncio.run(main())
|
|
683
683
|
`,
|
|
684
684
|
description: "Fully local voice bridge: Whisper + Piper (no cloud dependencies)",
|
|
685
685
|
},
|
|
@@ -694,9 +694,9 @@ if __name__ == "__main__":
|
|
|
694
694
|
: []),
|
|
695
695
|
{
|
|
696
696
|
path: "requirements.txt",
|
|
697
|
-
content: `openai-whisper
|
|
698
|
-
websockets
|
|
699
|
-
numpy
|
|
697
|
+
content: `openai-whisper
|
|
698
|
+
websockets
|
|
699
|
+
numpy
|
|
700
700
|
`,
|
|
701
701
|
description: "Python dependencies (Piper installed separately as binary)",
|
|
702
702
|
},
|
|
@@ -715,91 +715,91 @@ numpy
|
|
|
715
715
|
function getScaffoldCustom(language, includeVAD) {
|
|
716
716
|
if (language === "python") {
|
|
717
717
|
const vadBlock = includeVAD
|
|
718
|
-
? `
|
|
719
|
-
class SimpleVAD:
|
|
720
|
-
"""Energy-based Voice Activity Detection."""
|
|
721
|
-
def __init__(self, threshold: float = 0.02):
|
|
722
|
-
self.threshold = threshold
|
|
723
|
-
|
|
724
|
-
def is_speech(self, audio_chunk) -> bool:
|
|
725
|
-
import numpy as np
|
|
726
|
-
energy = np.sqrt(np.mean(np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) ** 2))
|
|
727
|
-
return energy > self.threshold
|
|
718
|
+
? `
|
|
719
|
+
class SimpleVAD:
|
|
720
|
+
"""Energy-based Voice Activity Detection."""
|
|
721
|
+
def __init__(self, threshold: float = 0.02):
|
|
722
|
+
self.threshold = threshold
|
|
723
|
+
|
|
724
|
+
def is_speech(self, audio_chunk) -> bool:
|
|
725
|
+
import numpy as np
|
|
726
|
+
energy = np.sqrt(np.mean(np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) ** 2))
|
|
727
|
+
return energy > self.threshold
|
|
728
728
|
`
|
|
729
729
|
: "";
|
|
730
730
|
return {
|
|
731
731
|
files: [
|
|
732
732
|
{
|
|
733
733
|
path: "voice_bridge.py",
|
|
734
|
-
content: `"""Custom Voice Bridge Template — Plug in any STT/TTS/LLM."""
|
|
735
|
-
import asyncio
|
|
736
|
-
import json
|
|
737
|
-
from abc import ABC, abstractmethod
|
|
738
|
-
import websockets
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
class STTProvider(ABC):
|
|
742
|
-
@abstractmethod
|
|
743
|
-
async def transcribe(self, audio_data: bytes) -> str: ...
|
|
744
|
-
|
|
745
|
-
class TTSProvider(ABC):
|
|
746
|
-
@abstractmethod
|
|
747
|
-
async def synthesize(self, text: str) -> bytes: ...
|
|
748
|
-
|
|
749
|
-
class LLMProvider(ABC):
|
|
750
|
-
@abstractmethod
|
|
751
|
-
async def generate(self, prompt: str, history: list[dict]) -> str: ...
|
|
752
|
-
${vadBlock}
|
|
753
|
-
|
|
754
|
-
# ─── Implement your providers here ────────────────────────────────
|
|
755
|
-
|
|
756
|
-
class MySTT(STTProvider):
|
|
757
|
-
async def transcribe(self, audio_data: bytes) -> str:
|
|
758
|
-
# TODO: Replace with your STT implementation
|
|
759
|
-
raise NotImplementedError("Implement STT provider")
|
|
760
|
-
|
|
761
|
-
class MyTTS(TTSProvider):
|
|
762
|
-
async def synthesize(self, text: str) -> bytes:
|
|
763
|
-
# TODO: Replace with your TTS implementation
|
|
764
|
-
raise NotImplementedError("Implement TTS provider")
|
|
765
|
-
|
|
766
|
-
class MyLLM(LLMProvider):
|
|
767
|
-
async def generate(self, prompt: str, history: list[dict]) -> str:
|
|
768
|
-
# TODO: Replace with your LLM implementation
|
|
769
|
-
return f"Echo: {prompt}"
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
# ─── Voice bridge server ──────────────────────────────────────────
|
|
773
|
-
|
|
774
|
-
class VoiceBridge:
|
|
775
|
-
def __init__(self, stt: STTProvider, tts: TTSProvider, llm: LLMProvider):
|
|
776
|
-
self.stt = stt
|
|
777
|
-
self.tts = tts
|
|
778
|
-
self.llm = llm
|
|
779
|
-
|
|
780
|
-
async def handle_client(self, websocket):
|
|
781
|
-
history = []
|
|
782
|
-
async for message in websocket:
|
|
783
|
-
if isinstance(message, bytes):
|
|
784
|
-
transcript = await self.stt.transcribe(message)
|
|
785
|
-
history.append({"role": "user", "content": transcript})
|
|
786
|
-
response = await self.llm.generate(transcript, history)
|
|
787
|
-
history.append({"role": "assistant", "content": response})
|
|
788
|
-
audio = await self.tts.synthesize(response)
|
|
789
|
-
await websocket.send(audio)
|
|
790
|
-
else:
|
|
791
|
-
data = json.loads(message)
|
|
792
|
-
await websocket.send(json.dumps({"type": "pong"}))
|
|
793
|
-
|
|
794
|
-
async def start(self, host: str = "localhost", port: int = 8765):
|
|
795
|
-
print(f"Voice Bridge on ws://{host}:{port}")
|
|
796
|
-
async with websockets.serve(self.handle_client, host, port):
|
|
797
|
-
await asyncio.Future()
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
if __name__ == "__main__":
|
|
801
|
-
bridge = VoiceBridge(stt=MySTT(), tts=MyTTS(), llm=MyLLM())
|
|
802
|
-
asyncio.run(bridge.start())
|
|
734
|
+
content: `"""Custom Voice Bridge Template — Plug in any STT/TTS/LLM."""
|
|
735
|
+
import asyncio
|
|
736
|
+
import json
|
|
737
|
+
from abc import ABC, abstractmethod
|
|
738
|
+
import websockets
|
|
739
|
+
|
|
740
|
+
|
|
741
|
+
class STTProvider(ABC):
|
|
742
|
+
@abstractmethod
|
|
743
|
+
async def transcribe(self, audio_data: bytes) -> str: ...
|
|
744
|
+
|
|
745
|
+
class TTSProvider(ABC):
|
|
746
|
+
@abstractmethod
|
|
747
|
+
async def synthesize(self, text: str) -> bytes: ...
|
|
748
|
+
|
|
749
|
+
class LLMProvider(ABC):
|
|
750
|
+
@abstractmethod
|
|
751
|
+
async def generate(self, prompt: str, history: list[dict]) -> str: ...
|
|
752
|
+
${vadBlock}
|
|
753
|
+
|
|
754
|
+
# ─── Implement your providers here ────────────────────────────────
|
|
755
|
+
|
|
756
|
+
class MySTT(STTProvider):
|
|
757
|
+
async def transcribe(self, audio_data: bytes) -> str:
|
|
758
|
+
# TODO: Replace with your STT implementation
|
|
759
|
+
raise NotImplementedError("Implement STT provider")
|
|
760
|
+
|
|
761
|
+
class MyTTS(TTSProvider):
|
|
762
|
+
async def synthesize(self, text: str) -> bytes:
|
|
763
|
+
# TODO: Replace with your TTS implementation
|
|
764
|
+
raise NotImplementedError("Implement TTS provider")
|
|
765
|
+
|
|
766
|
+
class MyLLM(LLMProvider):
|
|
767
|
+
async def generate(self, prompt: str, history: list[dict]) -> str:
|
|
768
|
+
# TODO: Replace with your LLM implementation
|
|
769
|
+
return f"Echo: {prompt}"
|
|
770
|
+
|
|
771
|
+
|
|
772
|
+
# ─── Voice bridge server ──────────────────────────────────────────
|
|
773
|
+
|
|
774
|
+
class VoiceBridge:
|
|
775
|
+
def __init__(self, stt: STTProvider, tts: TTSProvider, llm: LLMProvider):
|
|
776
|
+
self.stt = stt
|
|
777
|
+
self.tts = tts
|
|
778
|
+
self.llm = llm
|
|
779
|
+
|
|
780
|
+
async def handle_client(self, websocket):
|
|
781
|
+
history = []
|
|
782
|
+
async for message in websocket:
|
|
783
|
+
if isinstance(message, bytes):
|
|
784
|
+
transcript = await self.stt.transcribe(message)
|
|
785
|
+
history.append({"role": "user", "content": transcript})
|
|
786
|
+
response = await self.llm.generate(transcript, history)
|
|
787
|
+
history.append({"role": "assistant", "content": response})
|
|
788
|
+
audio = await self.tts.synthesize(response)
|
|
789
|
+
await websocket.send(audio)
|
|
790
|
+
else:
|
|
791
|
+
data = json.loads(message)
|
|
792
|
+
await websocket.send(json.dumps({"type": "pong"}))
|
|
793
|
+
|
|
794
|
+
async def start(self, host: str = "localhost", port: int = 8765):
|
|
795
|
+
print(f"Voice Bridge on ws://{host}:{port}")
|
|
796
|
+
async with websockets.serve(self.handle_client, host, port):
|
|
797
|
+
await asyncio.Future()
|
|
798
|
+
|
|
799
|
+
|
|
800
|
+
if __name__ == "__main__":
|
|
801
|
+
bridge = VoiceBridge(stt=MySTT(), tts=MyTTS(), llm=MyLLM())
|
|
802
|
+
asyncio.run(bridge.start())
|
|
803
803
|
`,
|
|
804
804
|
description: "Pluggable voice bridge template with abstract STT/TTS/LLM interfaces",
|
|
805
805
|
},
|
|
@@ -814,88 +814,88 @@ if __name__ == "__main__":
|
|
|
814
814
|
}
|
|
815
815
|
// TypeScript custom template
|
|
816
816
|
const vadBlock = includeVAD
|
|
817
|
-
? `
|
|
818
|
-
/** Simple energy-based Voice Activity Detection. */
|
|
819
|
-
function detectVoiceActivity(samples: Float32Array, threshold = 0.02): boolean {
|
|
820
|
-
let energy = 0;
|
|
821
|
-
for (let i = 0; i < samples.length; i++) energy += samples[i] * samples[i];
|
|
822
|
-
return Math.sqrt(energy / samples.length) > threshold;
|
|
823
|
-
}
|
|
817
|
+
? `
|
|
818
|
+
/** Simple energy-based Voice Activity Detection. */
|
|
819
|
+
function detectVoiceActivity(samples: Float32Array, threshold = 0.02): boolean {
|
|
820
|
+
let energy = 0;
|
|
821
|
+
for (let i = 0; i < samples.length; i++) energy += samples[i] * samples[i];
|
|
822
|
+
return Math.sqrt(energy / samples.length) > threshold;
|
|
823
|
+
}
|
|
824
824
|
`
|
|
825
825
|
: "";
|
|
826
826
|
return {
|
|
827
827
|
files: [
|
|
828
828
|
{
|
|
829
829
|
path: "src/voice-bridge.ts",
|
|
830
|
-
content: `/**
|
|
831
|
-
* Custom Voice Bridge Template — Plug in any STT/TTS/LLM.
|
|
832
|
-
*/
|
|
833
|
-
import { WebSocketServer } from "ws";
|
|
834
|
-
|
|
835
|
-
// ─── Provider interfaces ─────────────────────────────────────────
|
|
836
|
-
|
|
837
|
-
interface STTProvider {
|
|
838
|
-
transcribe(audio: Buffer): Promise<string>;
|
|
839
|
-
}
|
|
840
|
-
|
|
841
|
-
interface TTSProvider {
|
|
842
|
-
synthesize(text: string): Promise<Buffer>;
|
|
843
|
-
}
|
|
844
|
-
|
|
845
|
-
interface LLMProvider {
|
|
846
|
-
generate(prompt: string, history: Array<{ role: string; content: string }>): Promise<string>;
|
|
847
|
-
}
|
|
848
|
-
${vadBlock}
|
|
849
|
-
// ─── Implement your providers here ───────────────────────────────
|
|
850
|
-
|
|
851
|
-
class MySTT implements STTProvider {
|
|
852
|
-
async transcribe(audio: Buffer): Promise<string> {
|
|
853
|
-
// TODO: Replace with your STT implementation
|
|
854
|
-
throw new Error("Implement STT provider");
|
|
855
|
-
}
|
|
856
|
-
}
|
|
857
|
-
|
|
858
|
-
class MyTTS implements TTSProvider {
|
|
859
|
-
async synthesize(text: string): Promise<Buffer> {
|
|
860
|
-
// TODO: Replace with your TTS implementation
|
|
861
|
-
throw new Error("Implement TTS provider");
|
|
862
|
-
}
|
|
863
|
-
}
|
|
864
|
-
|
|
865
|
-
class MyLLM implements LLMProvider {
|
|
866
|
-
async generate(prompt: string, history: Array<{ role: string; content: string }>): Promise<string> {
|
|
867
|
-
// TODO: Replace with your LLM implementation
|
|
868
|
-
return \`Echo: \${prompt}\`;
|
|
869
|
-
}
|
|
870
|
-
}
|
|
871
|
-
|
|
872
|
-
// ─── Voice bridge server ─────────────────────────────────────────
|
|
873
|
-
|
|
874
|
-
const stt = new MySTT();
|
|
875
|
-
const tts = new MyTTS();
|
|
876
|
-
const llm = new MyLLM();
|
|
877
|
-
|
|
878
|
-
const wss = new WebSocketServer({ port: 8765 });
|
|
879
|
-
console.log("Voice Bridge listening on ws://localhost:8765");
|
|
880
|
-
|
|
881
|
-
wss.on("connection", (ws) => {
|
|
882
|
-
const history: Array<{ role: string; content: string }> = [];
|
|
883
|
-
|
|
884
|
-
ws.on("message", async (data: Buffer) => {
|
|
885
|
-
try {
|
|
886
|
-
const transcript = await stt.transcribe(data);
|
|
887
|
-
history.push({ role: "user", content: transcript });
|
|
888
|
-
|
|
889
|
-
const response = await llm.generate(transcript, history);
|
|
890
|
-
history.push({ role: "assistant", content: response });
|
|
891
|
-
|
|
892
|
-
const audio = await tts.synthesize(response);
|
|
893
|
-
ws.send(audio);
|
|
894
|
-
} catch (err) {
|
|
895
|
-
console.error("Pipeline error:", err);
|
|
896
|
-
}
|
|
897
|
-
});
|
|
898
|
-
});
|
|
830
|
+
content: `/**
|
|
831
|
+
* Custom Voice Bridge Template — Plug in any STT/TTS/LLM.
|
|
832
|
+
*/
|
|
833
|
+
import { WebSocketServer } from "ws";
|
|
834
|
+
|
|
835
|
+
// ─── Provider interfaces ─────────────────────────────────────────
|
|
836
|
+
|
|
837
|
+
interface STTProvider {
|
|
838
|
+
transcribe(audio: Buffer): Promise<string>;
|
|
839
|
+
}
|
|
840
|
+
|
|
841
|
+
interface TTSProvider {
|
|
842
|
+
synthesize(text: string): Promise<Buffer>;
|
|
843
|
+
}
|
|
844
|
+
|
|
845
|
+
interface LLMProvider {
|
|
846
|
+
generate(prompt: string, history: Array<{ role: string; content: string }>): Promise<string>;
|
|
847
|
+
}
|
|
848
|
+
${vadBlock}
|
|
849
|
+
// ─── Implement your providers here ───────────────────────────────
|
|
850
|
+
|
|
851
|
+
class MySTT implements STTProvider {
|
|
852
|
+
async transcribe(audio: Buffer): Promise<string> {
|
|
853
|
+
// TODO: Replace with your STT implementation
|
|
854
|
+
throw new Error("Implement STT provider");
|
|
855
|
+
}
|
|
856
|
+
}
|
|
857
|
+
|
|
858
|
+
class MyTTS implements TTSProvider {
|
|
859
|
+
async synthesize(text: string): Promise<Buffer> {
|
|
860
|
+
// TODO: Replace with your TTS implementation
|
|
861
|
+
throw new Error("Implement TTS provider");
|
|
862
|
+
}
|
|
863
|
+
}
|
|
864
|
+
|
|
865
|
+
class MyLLM implements LLMProvider {
|
|
866
|
+
async generate(prompt: string, history: Array<{ role: string; content: string }>): Promise<string> {
|
|
867
|
+
// TODO: Replace with your LLM implementation
|
|
868
|
+
return \`Echo: \${prompt}\`;
|
|
869
|
+
}
|
|
870
|
+
}
|
|
871
|
+
|
|
872
|
+
// ─── Voice bridge server ─────────────────────────────────────────
|
|
873
|
+
|
|
874
|
+
const stt = new MySTT();
|
|
875
|
+
const tts = new MyTTS();
|
|
876
|
+
const llm = new MyLLM();
|
|
877
|
+
|
|
878
|
+
const wss = new WebSocketServer({ port: 8765 });
|
|
879
|
+
console.log("Voice Bridge listening on ws://localhost:8765");
|
|
880
|
+
|
|
881
|
+
wss.on("connection", (ws) => {
|
|
882
|
+
const history: Array<{ role: string; content: string }> = [];
|
|
883
|
+
|
|
884
|
+
ws.on("message", async (data: Buffer) => {
|
|
885
|
+
try {
|
|
886
|
+
const transcript = await stt.transcribe(data);
|
|
887
|
+
history.push({ role: "user", content: transcript });
|
|
888
|
+
|
|
889
|
+
const response = await llm.generate(transcript, history);
|
|
890
|
+
history.push({ role: "assistant", content: response });
|
|
891
|
+
|
|
892
|
+
const audio = await tts.synthesize(response);
|
|
893
|
+
ws.send(audio);
|
|
894
|
+
} catch (err) {
|
|
895
|
+
console.error("Pipeline error:", err);
|
|
896
|
+
}
|
|
897
|
+
});
|
|
898
|
+
});
|
|
899
899
|
`,
|
|
900
900
|
description: "Pluggable voice bridge template with STT/TTS/LLM interfaces",
|
|
901
901
|
},
|