voicecc 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +6 -0
- package/README.md +48 -0
- package/bin/voicecc.js +39 -0
- package/dashboard/dist/assets/index-BXemFrMp.css +1 -0
- package/dashboard/dist/assets/index-dAYfRls7.js +11 -0
- package/dashboard/dist/audio-processor.js +126 -0
- package/dashboard/dist/index.html +13 -0
- package/dashboard/routes/auth.ts +119 -0
- package/dashboard/routes/browser-call.ts +87 -0
- package/dashboard/routes/claude-md.ts +50 -0
- package/dashboard/routes/conversations.ts +203 -0
- package/dashboard/routes/integrations.ts +154 -0
- package/dashboard/routes/mcp-servers.ts +198 -0
- package/dashboard/routes/settings.ts +64 -0
- package/dashboard/routes/tunnel.ts +66 -0
- package/dashboard/routes/twilio.ts +120 -0
- package/dashboard/routes/voice.ts +48 -0
- package/dashboard/routes/webrtc.ts +85 -0
- package/dashboard/server.ts +130 -0
- package/dashboard/tsconfig.json +13 -0
- package/init/CLAUDE.md +18 -0
- package/package.json +59 -0
- package/run.ts +68 -0
- package/scripts/postinstall.js +228 -0
- package/services/browser-call-manager.ts +106 -0
- package/services/device-pairing.ts +176 -0
- package/services/env.ts +88 -0
- package/services/tunnel.ts +204 -0
- package/services/twilio-manager.ts +126 -0
- package/sidecar/assets/startup.pcm +0 -0
- package/sidecar/audio-adapter.ts +60 -0
- package/sidecar/audio-capture.ts +220 -0
- package/sidecar/browser-audio-playback.test.ts +149 -0
- package/sidecar/browser-audio.ts +147 -0
- package/sidecar/browser-server.ts +331 -0
- package/sidecar/chime.test.ts +69 -0
- package/sidecar/chime.ts +54 -0
- package/sidecar/claude-session.ts +295 -0
- package/sidecar/endpointing.ts +163 -0
- package/sidecar/index.ts +83 -0
- package/sidecar/local-audio.ts +126 -0
- package/sidecar/mic-vpio +0 -0
- package/sidecar/mic-vpio.swift +484 -0
- package/sidecar/mock-tts-server-tagged.mjs +132 -0
- package/sidecar/narration.ts +204 -0
- package/sidecar/scripts/generate-startup-audio.py +79 -0
- package/sidecar/session-lock.ts +123 -0
- package/sidecar/sherpa-onnx-node.d.ts +4 -0
- package/sidecar/stt.ts +199 -0
- package/sidecar/tts-server.py +193 -0
- package/sidecar/tts.ts +481 -0
- package/sidecar/twilio-audio.ts +338 -0
- package/sidecar/twilio-server.ts +436 -0
- package/sidecar/types.ts +210 -0
- package/sidecar/vad.ts +101 -0
- package/sidecar/voice-loop-bugs.test.ts +522 -0
- package/sidecar/voice-session.ts +523 -0
- package/skills/voice/SKILL.md +26 -0
- package/tsconfig.json +22 -0
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Processes Claude's streaming output into TTS-friendly text.
|
|
3
|
+
*
|
|
4
|
+
* Two modes of operation:
|
|
5
|
+
* - Response mode: passes text_delta content through immediately for streaming
|
|
6
|
+
* TTS. Text is buffered into sentences downstream in the TTS module.
|
|
7
|
+
* - Long-task mode: emits periodic template-based summaries during tool use
|
|
8
|
+
* (e.g. "Running Bash...", "Still working on Bash...").
|
|
9
|
+
*
|
|
10
|
+
* Responsibilities:
|
|
11
|
+
* - Pass through streaming text deltas immediately for low-latency TTS
|
|
12
|
+
* - Track tool execution and emit periodic spoken summaries
|
|
13
|
+
* - Flush remaining text on result/error events
|
|
14
|
+
*/
|
|
15
|
+
|
|
16
|
+
import type { ClaudeStreamEvent, NarrationConfig } from "./types.js";
|
|
17
|
+
|
|
18
|
+
/** Strip markdown syntax so text reads naturally when spoken. */
|
|
19
|
+
function stripMarkdown(text: string): string {
|
|
20
|
+
return text
|
|
21
|
+
.replace(/\*+/g, "") // bold/italic asterisks
|
|
22
|
+
.replace(/#+\s*/g, "") // heading markers
|
|
23
|
+
.replace(/`+/g, "") // inline code / code fences
|
|
24
|
+
.replace(/\[([^\]]*)\]\([^)]*\)/g, "$1") // [text](url) → text
|
|
25
|
+
.replace(/^-\s+/gm, "") // unordered list markers
|
|
26
|
+
.replace(/^\d+\.\s+/gm, ""); // ordered list markers
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
// ============================================================================
|
|
30
|
+
// INTERFACES
|
|
31
|
+
// ============================================================================
|
|
32
|
+
|
|
33
|
+
/**
 * Narrator instance that processes Claude stream events into speakable text.
 * Created by createNarrator; stateful across events within a single turn.
 */
export interface Narrator {
  /**
   * Process a single Claude stream event and return any text ready to be spoken.
   * @param event - The Claude stream event to process
   * @returns Array of strings to speak (often empty, sometimes 1-2 sentences)
   */
  processEvent(event: ClaudeStreamEvent): string[];

  /**
   * Flush any remaining buffered text that hasn't been emitted yet.
   * @returns Array of remaining text strings to speak
   */
  flush(): string[];

  /**
   * Reset all internal state (current tool, long-task mode, summary timer)
   * for a new conversation turn.
   */
  reset(): void;
}
|
|
55
|
+
|
|
56
|
+
// ============================================================================
|
|
57
|
+
// MAIN HANDLERS
|
|
58
|
+
// ============================================================================
|
|
59
|
+
|
|
60
|
+
/**
|
|
61
|
+
* Create a new Narrator instance that converts Claude stream events into
|
|
62
|
+
* TTS-friendly sentence chunks.
|
|
63
|
+
* @param config - Narration configuration (summaryIntervalMs controls long-task summary frequency)
|
|
64
|
+
* @returns A Narrator instance
|
|
65
|
+
*/
|
|
66
|
+
export function createNarrator(config: NarrationConfig, onEmit?: (text: string) => void): Narrator {
|
|
67
|
+
// -- internal state --
|
|
68
|
+
let currentToolName: string | null = null;
|
|
69
|
+
let summaryTimer: NodeJS.Timeout | null = null;
|
|
70
|
+
let inLongTask = false;
|
|
71
|
+
|
|
72
|
+
/**
|
|
73
|
+
* Process a single Claude stream event.
|
|
74
|
+
* @param event - The streaming event from Claude
|
|
75
|
+
* @returns Array of strings to speak
|
|
76
|
+
*/
|
|
77
|
+
function processEvent(event: ClaudeStreamEvent): string[] {
|
|
78
|
+
switch (event.type) {
|
|
79
|
+
case "text_delta":
|
|
80
|
+
return handleTextDelta(event);
|
|
81
|
+
case "tool_start":
|
|
82
|
+
return handleToolStart(event);
|
|
83
|
+
case "tool_end":
|
|
84
|
+
return handleToolEnd();
|
|
85
|
+
case "result":
|
|
86
|
+
case "error":
|
|
87
|
+
return handleTerminal();
|
|
88
|
+
default:
|
|
89
|
+
return [];
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
/**
|
|
94
|
+
* Flush any remaining text in the buffer.
|
|
95
|
+
* @returns Array of remaining text strings
|
|
96
|
+
*/
|
|
97
|
+
function flush(): string[] {
|
|
98
|
+
return [];
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
/**
|
|
102
|
+
* Reset all state for a new conversation turn.
|
|
103
|
+
*/
|
|
104
|
+
function reset(): void {
|
|
105
|
+
currentToolName = null;
|
|
106
|
+
clearSummaryTimer();
|
|
107
|
+
inLongTask = false;
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
return { processEvent, flush, reset };
|
|
111
|
+
|
|
112
|
+
// ============================================================================
|
|
113
|
+
// HELPER FUNCTIONS
|
|
114
|
+
// ============================================================================
|
|
115
|
+
|
|
116
|
+
/**
|
|
117
|
+
* Handle a text_delta event: pass through immediately, exit long-task mode.
|
|
118
|
+
* Text chunking for TTS is handled downstream by TextSplitterStream.
|
|
119
|
+
* @param event - The text_delta event
|
|
120
|
+
* @returns Array containing the delta text
|
|
121
|
+
*/
|
|
122
|
+
function handleTextDelta(event: ClaudeStreamEvent): string[] {
|
|
123
|
+
// Text arriving means Claude is responding directly -- leave long-task mode
|
|
124
|
+
if (inLongTask) {
|
|
125
|
+
clearSummaryTimer();
|
|
126
|
+
inLongTask = false;
|
|
127
|
+
currentToolName = null;
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
const results: string[] = [];
|
|
131
|
+
if (event.content) {
|
|
132
|
+
const clean = stripMarkdown(event.content);
|
|
133
|
+
if (clean) results.push(clean);
|
|
134
|
+
}
|
|
135
|
+
return results;
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
/**
|
|
139
|
+
* Handle a tool_start event: enter long-task mode, start the summary timer,
|
|
140
|
+
* and emit an initial "Running {toolName}..." message.
|
|
141
|
+
* @param event - The tool_start event (must have toolName)
|
|
142
|
+
* @returns Array containing the initial tool message
|
|
143
|
+
*/
|
|
144
|
+
function handleToolStart(event: ClaudeStreamEvent): string[] {
|
|
145
|
+
const toolName = event.toolName ?? "unknown tool";
|
|
146
|
+
currentToolName = toolName;
|
|
147
|
+
inLongTask = true;
|
|
148
|
+
|
|
149
|
+
// Clear any existing timer before starting a new one
|
|
150
|
+
clearSummaryTimer();
|
|
151
|
+
startSummaryTimer();
|
|
152
|
+
|
|
153
|
+
return [`Running ${toolName}...`];
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
/**
|
|
157
|
+
* Handle a tool_end event: clear current tool context but stay in long-task
|
|
158
|
+
* mode since more tools might follow.
|
|
159
|
+
* @returns Empty array
|
|
160
|
+
*/
|
|
161
|
+
function handleToolEnd(): string[] {
|
|
162
|
+
currentToolName = null;
|
|
163
|
+
return [];
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
/**
|
|
167
|
+
* Handle result or error events: flush all remaining text and reset state.
|
|
168
|
+
* @returns Array of any remaining text
|
|
169
|
+
*/
|
|
170
|
+
function handleTerminal(): string[] {
|
|
171
|
+
const remaining = flush();
|
|
172
|
+
|
|
173
|
+
// Full reset for next turn
|
|
174
|
+
clearSummaryTimer();
|
|
175
|
+
currentToolName = null;
|
|
176
|
+
inLongTask = false;
|
|
177
|
+
|
|
178
|
+
return remaining;
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
/**
|
|
182
|
+
* Start the periodic summary timer for long-task mode.
|
|
183
|
+
* Emits "Still working on {toolName}..." at the configured interval.
|
|
184
|
+
*/
|
|
185
|
+
function startSummaryTimer(): void {
|
|
186
|
+
summaryTimer = setInterval(() => {
|
|
187
|
+
const name = currentToolName ?? "the task";
|
|
188
|
+
const summary = `Still working on ${name}...`;
|
|
189
|
+
if (onEmit) {
|
|
190
|
+
onEmit(summary);
|
|
191
|
+
}
|
|
192
|
+
}, config.summaryIntervalMs);
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
/**
|
|
196
|
+
* Clear the summary timer if one is active.
|
|
197
|
+
*/
|
|
198
|
+
function clearSummaryTimer(): void {
|
|
199
|
+
if (summaryTimer !== null) {
|
|
200
|
+
clearInterval(summaryTimer);
|
|
201
|
+
summaryTimer = null;
|
|
202
|
+
}
|
|
203
|
+
}
|
|
204
|
+
}
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""
|
|
2
|
+
One-time script to generate the startup audio greeting.
|
|
3
|
+
|
|
4
|
+
Uses mlx_audio's Kokoro model (same API as tts-server.py) to synthesize a short
|
|
5
|
+
spoken greeting and writes it as raw 24kHz 16-bit signed mono PCM to
|
|
6
|
+
sidecar/assets/startup.pcm.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
cd sidecar
|
|
10
|
+
.venv/bin/python3 scripts/generate-startup-audio.py
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import os
|
|
14
|
+
import sys
|
|
15
|
+
import numpy as np
|
|
16
|
+
|
|
17
|
+
# ============================================================================
# CONSTANTS
# ============================================================================

# Hugging Face model id for the Kokoro TTS model (same model tts-server.py loads).
MODEL_ID = "prince-canuma/Kokoro-82M"
# Kokoro voice preset used to speak the greeting.
VOICE = "af_heart"
# The sentence synthesized into the startup audio.
STARTUP_TEXT = "Hi there! I'm Voice CC. How can I help you today?"
# Output location: sidecar/assets/startup.pcm, resolved relative to this script.
OUTPUT_DIR = os.path.join(os.path.dirname(__file__), "..", "assets")
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "startup.pcm")
|
|
26
|
+
|
|
27
|
+
# ============================================================================
|
|
28
|
+
# MAIN ENTRYPOINT
|
|
29
|
+
# ============================================================================
|
|
30
|
+
|
|
31
|
+
def main():
    """Load the Kokoro model, generate the startup greeting, and save it as
    raw 16-bit signed mono PCM (at the model's sample rate) to OUTPUT_FILE.

    Exits with status 1 if generation fails or produces no audio.
    """
    # Imported lazily so importing this module does not require mlx_audio.
    from mlx_audio.tts.utils import load_model

    print(f"Loading model: {MODEL_ID}")
    model = load_model(MODEL_ID)
    print(f"Model loaded (sample_rate={model.sample_rate})")

    print(f"Generating: \"{STARTUP_TEXT}\"")
    chunks = []
    try:
        for result in model.generate(text=STARTUP_TEXT, voice=VOICE, stream=True):
            audio = np.array(result.audio, copy=False)
            chunks.append(audio)
            print(f"  chunk {len(chunks)}: {audio.shape}")
    except Exception as e:
        print(f"ERROR during generation: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc()
        sys.exit(1)

    # Fix: np.concatenate raises ValueError on an empty list; fail with a
    # clear message if the model yielded no audio chunks.
    if not chunks:
        print("ERROR: model produced no audio chunks", file=sys.stderr)
        sys.exit(1)

    combined = np.concatenate(chunks)
    pcm = float32_to_int16_pcm(combined)

    os.makedirs(OUTPUT_DIR, exist_ok=True)
    with open(OUTPUT_FILE, "wb") as f:
        f.write(pcm)

    duration_s = len(combined) / model.sample_rate
    print(f"Wrote {len(pcm)} bytes ({duration_s:.1f}s) to {OUTPUT_FILE}")
|
|
61
|
+
|
|
62
|
+
# ============================================================================
|
|
63
|
+
# HELPER FUNCTIONS
|
|
64
|
+
# ============================================================================
|
|
65
|
+
|
|
66
|
+
def float32_to_int16_pcm(audio: np.ndarray) -> bytes:
    """Convert float32 samples in [-1.0, 1.0] to little-endian 16-bit PCM bytes.

    Out-of-range samples are clipped before scaling so the int16 cast
    cannot wrap around.

    :param audio: numpy array of float32 samples
    :return: raw bytes of int16 little-endian PCM
    """
    scaled = np.clip(audio, -1.0, 1.0) * 32767
    return scaled.astype(np.int16).tobytes()
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
# Run only when executed directly as a script (not on import).
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cross-process session limiter using PID-based lock files.
|
|
3
|
+
*
|
|
4
|
+
* Ensures the total number of active voice sessions (local mic + Twilio combined)
|
|
5
|
+
* does not exceed MAX_CONCURRENT_SESSIONS. Stale lock files from crashed processes
|
|
6
|
+
* are automatically cleaned up on every acquire.
|
|
7
|
+
*
|
|
8
|
+
* Responsibilities:
|
|
9
|
+
* - Acquire a session slot by creating a PID lock file in ~/.claude-voice-sessions/
|
|
10
|
+
* - Validate existing lock files by checking if their PIDs are still alive
|
|
11
|
+
* - Clean up stale lock files from dead processes
|
|
12
|
+
* - Release the lock file on session stop or process exit
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
import { mkdirSync, readdirSync, readFileSync, writeFileSync, unlinkSync } from "fs";
|
|
16
|
+
import { join } from "path";
|
|
17
|
+
import { homedir } from "os";
|
|
18
|
+
import { randomUUID } from "crypto";
|
|
19
|
+
|
|
20
|
+
// ============================================================================
|
|
21
|
+
// CONSTANTS
|
|
22
|
+
// ============================================================================
|
|
23
|
+
|
|
24
|
+
/** Directory where PID lock files are stored (one {uuid}.lock file per active session) */
const LOCK_DIR = join(homedir(), ".claude-voice-sessions");
|
|
26
|
+
|
|
27
|
+
// ============================================================================
|
|
28
|
+
// INTERFACES
|
|
29
|
+
// ============================================================================
|
|
30
|
+
|
|
31
|
+
/**
 * Handle returned by acquireSessionLock. Call release() to free the session slot.
 * release() is idempotent and is also invoked automatically on process exit
 * as a safety net.
 */
export interface SessionLock {
  /** Release the session lock (deletes the lock file) */
  release: () => void;
}
|
|
38
|
+
|
|
39
|
+
// ============================================================================
|
|
40
|
+
// MAIN HANDLERS
|
|
41
|
+
// ============================================================================
|
|
42
|
+
|
|
43
|
+
/**
|
|
44
|
+
* Acquire a session lock slot. Throws if the maximum number of concurrent
|
|
45
|
+
* sessions has been reached.
|
|
46
|
+
*
|
|
47
|
+
* Cleans up stale lock files (dead PIDs) on every call. Creates a new lock
|
|
48
|
+
* file containing the current PID. Registers a process.on('exit') handler
|
|
49
|
+
* as a safety net to release on shutdown.
|
|
50
|
+
*
|
|
51
|
+
* @param maxSessions - Maximum number of concurrent sessions allowed
|
|
52
|
+
* @returns A SessionLock handle with a release() method
|
|
53
|
+
* @throws Error if maxSessions has been reached
|
|
54
|
+
*/
|
|
55
|
+
export function acquireSessionLock(maxSessions: number): SessionLock {
|
|
56
|
+
// Ensure lock directory exists
|
|
57
|
+
mkdirSync(LOCK_DIR, { recursive: true });
|
|
58
|
+
|
|
59
|
+
// List existing lock files and validate their PIDs
|
|
60
|
+
const files = readdirSync(LOCK_DIR).filter((f) => f.endsWith(".lock"));
|
|
61
|
+
let activeCount = 0;
|
|
62
|
+
|
|
63
|
+
for (const file of files) {
|
|
64
|
+
const filePath = join(LOCK_DIR, file);
|
|
65
|
+
try {
|
|
66
|
+
const pid = parseInt(readFileSync(filePath, "utf-8").trim(), 10);
|
|
67
|
+
if (isNaN(pid) || !isProcessAlive(pid)) {
|
|
68
|
+
// Stale lock file -- process is dead, clean it up
|
|
69
|
+
unlinkSync(filePath);
|
|
70
|
+
} else {
|
|
71
|
+
activeCount++;
|
|
72
|
+
}
|
|
73
|
+
} catch {
|
|
74
|
+
// File disappeared between readdir and read, or parse error -- skip
|
|
75
|
+
try { unlinkSync(filePath); } catch { /* already gone */ }
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
if (activeCount >= maxSessions) {
|
|
80
|
+
throw new Error(
|
|
81
|
+
`Session limit reached (${activeCount}/${maxSessions}). ` +
|
|
82
|
+
`Cannot start another voice session.`
|
|
83
|
+
);
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
// Create a new lock file with the current PID
|
|
87
|
+
const lockFile = join(LOCK_DIR, `${randomUUID()}.lock`);
|
|
88
|
+
writeFileSync(lockFile, String(process.pid), "utf-8");
|
|
89
|
+
|
|
90
|
+
let released = false;
|
|
91
|
+
|
|
92
|
+
/** Delete the lock file if it hasn't been released yet */
|
|
93
|
+
function release(): void {
|
|
94
|
+
if (released) return;
|
|
95
|
+
released = true;
|
|
96
|
+
try { unlinkSync(lockFile); } catch { /* already gone */ }
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
// Safety net: release on process exit
|
|
100
|
+
process.on("exit", release);
|
|
101
|
+
|
|
102
|
+
return { release };
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
// ============================================================================
|
|
106
|
+
// HELPER FUNCTIONS
|
|
107
|
+
// ============================================================================
|
|
108
|
+
|
|
109
|
+
/**
|
|
110
|
+
* Check if a process with the given PID is still alive.
|
|
111
|
+
* Uses signal 0 which does not kill the process -- it only checks existence.
|
|
112
|
+
*
|
|
113
|
+
* @param pid - The process ID to check
|
|
114
|
+
* @returns true if the process is alive, false otherwise
|
|
115
|
+
*/
|
|
116
|
+
export function isProcessAlive(pid: number): boolean {
|
|
117
|
+
try {
|
|
118
|
+
process.kill(pid, 0);
|
|
119
|
+
return true;
|
|
120
|
+
} catch {
|
|
121
|
+
return false;
|
|
122
|
+
}
|
|
123
|
+
}
|
package/sidecar/stt.ts
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Local speech-to-text via sherpa-onnx with Whisper ONNX model (offline/batch).
|
|
3
|
+
*
|
|
4
|
+
* Whisper models in sherpa-onnx are offline-only (not streaming). Audio is
|
|
5
|
+
* accumulated during speech (SPEECH_START to SPEECH_END), then batch-transcribed
|
|
6
|
+
* on SPEECH_END using `createOfflineRecognizer`.
|
|
7
|
+
*
|
|
8
|
+
* Responsibilities:
|
|
9
|
+
* - Load the sherpa-onnx offline recognizer with a Whisper ONNX model
|
|
10
|
+
* - Accumulate audio samples during speech into an internal buffer
|
|
11
|
+
* - Batch-transcribe the accumulated buffer on demand
|
|
12
|
+
* - Manage buffer and recognizer lifecycle
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
import { existsSync } from "fs";
|
|
16
|
+
import type { TranscriptionResult } from "./types.js";
|
|
17
|
+
|
|
18
|
+
// ============================================================================
|
|
19
|
+
// INTERFACES
|
|
20
|
+
// ============================================================================
|
|
21
|
+
|
|
22
|
+
/** Internal interface for the STT processor returned by createStt. */
interface SttProcessor {
  /**
   * Appends audio samples to the internal buffer.
   * Call continuously during speech (between SPEECH_START and SPEECH_END).
   *
   * The array is stored by reference (no copy), so callers must not mutate
   * it after handing it over.
   *
   * @param samples - Float32Array of audio samples (16kHz, normalized -1.0 to 1.0)
   */
  accumulate(samples: Float32Array): void;

  /**
   * Batch-transcribes the accumulated audio buffer using the offline recognizer.
   * Creates an offline stream, feeds the accumulated audio, decodes, and returns
   * the result. Clears the buffer afterward (even when it was empty).
   *
   * @returns Transcription result with text, isFinal flag, and timestamp
   */
  transcribe(): Promise<TranscriptionResult>;

  /**
   * Clears the accumulated audio buffer without transcribing.
   * Use on interruption or when discarding a speech segment.
   */
  clearBuffer(): void;

  /**
   * Frees the underlying recognizer resources.
   * Call on shutdown to prevent resource leaks.
   */
  destroy(): void;
}
|
|
53
|
+
|
|
54
|
+
// ============================================================================
|
|
55
|
+
// CONSTANTS
|
|
56
|
+
// ============================================================================
|
|
57
|
+
|
|
58
|
+
/** Sample rate (Hz) expected by the Whisper model */
const SAMPLE_RATE = 16000;

/** Default model file prefix (sherpa-onnx naming convention: "small.en", "tiny.en", etc.) */
const DEFAULT_MODEL_PREFIX = "small.en";

/** Required model file suffixes within the model directory (int8-quantized ONNX + token table) */
const REQUIRED_SUFFIXES = ["-encoder.int8.onnx", "-decoder.int8.onnx", "-tokens.txt"];
|
|
66
|
+
|
|
67
|
+
// ============================================================================
|
|
68
|
+
// MAIN HANDLERS
|
|
69
|
+
// ============================================================================
|
|
70
|
+
|
|
71
|
+
/**
 * Loads the sherpa-onnx offline recognizer with the Whisper model at the given
 * path and returns an SttProcessor.
 *
 * @param modelPath - Path to the directory containing the prefixed model files
 *   (e.g. "small.en-encoder.int8.onnx", "small.en-decoder.int8.onnx",
 *   "small.en-tokens.txt" -- see DEFAULT_MODEL_PREFIX and REQUIRED_SUFFIXES)
 * @returns Promise resolving to an SttProcessor instance
 * @throws Error if any required model files are missing
 */
async function createStt(modelPath: string): Promise<SttProcessor> {
  validateModelFiles(modelPath);

  // Dynamic import to avoid ONNX runtime conflict with kokoro-js.
  // Both sherpa-onnx-node and kokoro-js bundle native ONNX runtimes that
  // crash if loaded simultaneously via static imports.
  const sherpa = (await import("sherpa-onnx-node")).default;

  const prefix = DEFAULT_MODEL_PREFIX;
  const config = {
    modelConfig: {
      whisper: {
        encoder: `${modelPath}/${prefix}-encoder.int8.onnx`,
        decoder: `${modelPath}/${prefix}-decoder.int8.onnx`,
      },
      tokens: `${modelPath}/${prefix}-tokens.txt`,
    },
  };

  const recognizer = new sherpa.OfflineRecognizer(config);

  // Buffer stored as array of chunks to avoid repeated copying during accumulation
  let audioChunks: Float32Array[] = [];

  return {
    accumulate(samples: Float32Array): void {
      audioChunks.push(samples);
    },

    async transcribe(): Promise<TranscriptionResult> {
      // Snapshot and clear the buffer up front so audio arriving during
      // decode belongs to the next segment.
      const combinedSamples = concatenateChunks(audioChunks);
      audioChunks = [];

      if (combinedSamples.length === 0) {
        return { text: "", isFinal: true, timestamp: Date.now() };
      }

      // Create a fresh stream, feed audio, decode
      const stream = recognizer.createStream();
      stream.acceptWaveform({ sampleRate: SAMPLE_RATE, samples: combinedSamples });
      recognizer.decode(stream);

      const result = recognizer.getResult(stream);
      const text = result.text.trim();

      return { text, isFinal: true, timestamp: Date.now() };
    },

    clearBuffer(): void {
      audioChunks = [];
    },

    destroy(): void {
      audioChunks = [];
      // recognizer cleanup is handled by sherpa-onnx-node garbage collection
      // NOTE(review): if sherpa-onnx-node exposes an explicit free/release
      // method, prefer calling it here -- confirm against the library's API.
    },
  };
}
|
|
137
|
+
|
|
138
|
+
// ============================================================================
|
|
139
|
+
// HELPER FUNCTIONS
|
|
140
|
+
// ============================================================================
|
|
141
|
+
|
|
142
|
+
/**
|
|
143
|
+
* Validates that all required model files exist in the given directory.
|
|
144
|
+
*
|
|
145
|
+
* @param modelPath - Path to the model directory
|
|
146
|
+
* @throws Error with details about which files are missing
|
|
147
|
+
*/
|
|
148
|
+
function validateModelFiles(modelPath: string): void {
|
|
149
|
+
if (!existsSync(modelPath)) {
|
|
150
|
+
throw new Error(
|
|
151
|
+
`STT model directory not found: ${modelPath}. ` +
|
|
152
|
+
`Download a Whisper ONNX model and place encoder.onnx, decoder.onnx, and tokens.txt in this directory.`
|
|
153
|
+
);
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
const expectedFiles = REQUIRED_SUFFIXES.map((suffix) => `${DEFAULT_MODEL_PREFIX}${suffix}`);
|
|
157
|
+
const missingFiles = expectedFiles.filter(
|
|
158
|
+
(file) => !existsSync(`${modelPath}/${file}`)
|
|
159
|
+
);
|
|
160
|
+
|
|
161
|
+
if (missingFiles.length > 0) {
|
|
162
|
+
throw new Error(
|
|
163
|
+
`Missing STT model files in ${modelPath}: ${missingFiles.join(", ")}. ` +
|
|
164
|
+
`Required files: ${expectedFiles.join(", ")}.`
|
|
165
|
+
);
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
/**
|
|
170
|
+
* Concatenates an array of Float32Array chunks into a single Float32Array.
|
|
171
|
+
* Avoids repeated copying during accumulation by deferring concatenation
|
|
172
|
+
* until transcription time.
|
|
173
|
+
*
|
|
174
|
+
* @param chunks - Array of Float32Array audio chunks
|
|
175
|
+
* @returns Single concatenated Float32Array
|
|
176
|
+
*/
|
|
177
|
+
function concatenateChunks(chunks: Float32Array[]): Float32Array {
|
|
178
|
+
if (chunks.length === 0) {
|
|
179
|
+
return new Float32Array(0);
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
if (chunks.length === 1) {
|
|
183
|
+
return chunks[0];
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
const totalLength = chunks.reduce((sum, chunk) => sum + chunk.length, 0);
|
|
187
|
+
const result = new Float32Array(totalLength);
|
|
188
|
+
|
|
189
|
+
let offset = 0;
|
|
190
|
+
for (const chunk of chunks) {
|
|
191
|
+
result.set(chunk, offset);
|
|
192
|
+
offset += chunk.length;
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
return result;
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
export { createStt };
|
|
199
|
+
export type { SttProcessor };
|