voicecc 1.1.36 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/voicecc.js +94 -1
- package/dashboard/dist/assets/index-DCeOdulF.js +28 -0
- package/dashboard/dist/index.html +1 -1
- package/dashboard/routes/agents.ts +28 -8
- package/dashboard/routes/browser-call.ts +3 -2
- package/dashboard/routes/chat.ts +75 -55
- package/dashboard/routes/providers.ts +5 -74
- package/dashboard/routes/twilio.ts +104 -5
- package/dashboard/routes/voice.ts +98 -0
- package/dashboard/server.ts +48 -1
- package/package.json +2 -3
- package/server/index.ts +96 -8
- package/server/services/twilio-manager.ts +29 -10
- package/dashboard/dist/assets/index-C62C9Gp0.js +0 -28
- package/dashboard/dist/audio-processor.js +0 -126
- package/server/services/heartbeat.ts +0 -403
- package/server/voice/assets/chime.wav +0 -0
- package/server/voice/assets/startup.pcm +0 -0
- package/server/voice/audio-adapter.ts +0 -60
- package/server/voice/audio-inactivity.test.ts +0 -108
- package/server/voice/audio-inactivity.ts +0 -91
- package/server/voice/browser-audio-playback.test.ts +0 -149
- package/server/voice/browser-audio.ts +0 -147
- package/server/voice/browser-server.ts +0 -311
- package/server/voice/chat-server.ts +0 -236
- package/server/voice/chime.test.ts +0 -69
- package/server/voice/chime.ts +0 -36
- package/server/voice/claude-session.ts +0 -293
- package/server/voice/endpointing.ts +0 -163
- package/server/voice/mic-vpio +0 -0
- package/server/voice/narration.ts +0 -204
- package/server/voice/prompt-builder.ts +0 -108
- package/server/voice/session-lock.ts +0 -123
- package/server/voice/stt-elevenlabs.ts +0 -210
- package/server/voice/stt-provider.ts +0 -106
- package/server/voice/tts-elevenlabs-hiss.test.ts +0 -183
- package/server/voice/tts-elevenlabs.ts +0 -397
- package/server/voice/tts-provider.ts +0 -155
- package/server/voice/twilio-audio.ts +0 -338
- package/server/voice/twilio-server.ts +0 -540
- package/server/voice/types.ts +0 -282
- package/server/voice/vad.ts +0 -101
- package/server/voice/voice-loop-bugs.test.ts +0 -348
- package/server/voice/voice-server.ts +0 -129
- package/server/voice/voice-session.ts +0 -539
|
@@ -1,397 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* ElevenLabs TTS provider via streaming HTTP API.
|
|
3
|
-
*
|
|
4
|
-
* Calls the ElevenLabs text-to-speech streaming endpoint to generate audio,
|
|
5
|
-
* then writes raw PCM chunks to the speaker stream for playback. No subprocess
|
|
6
|
-
* is needed -- audio is fetched over HTTP and piped directly into the pipeline.
|
|
7
|
-
*
|
|
8
|
-
* Responsibilities:
|
|
9
|
-
* - POST text to the ElevenLabs TTS streaming API and receive chunked PCM audio
|
|
10
|
-
* - Buffer streaming text deltas into sentences via shared bufferSentences utility
|
|
11
|
-
* - Write PCM audio to the speaker stream with backpressure handling
|
|
12
|
-
* - Track playback timing and wait for audio to finish before resolving
|
|
13
|
-
* - Support interruption by cancelling in-flight requests and clearing playback
|
|
14
|
-
*/
|
|
15
|
-
|
|
16
|
-
import type { Writable } from "stream";
|
|
17
|
-
import type { TtsPlayer, TextChunk } from "./types.js";
|
|
18
|
-
|
|
19
|
-
// ============================================================================
|
|
20
|
-
// CONSTANTS
|
|
21
|
-
// ============================================================================
|
|
22
|
-
|
|
23
|
-
/** Root of the ElevenLabs text-to-speech endpoint; the voice ID and `/stream` are appended per request. */
const ELEVENLABS_TTS_BASE_URL = "https://api.elevenlabs.io/v1/text-to-speech";

/** Sample rate, in Hz, of the PCM audio; it has to agree with the speaker pipeline. */
const TTS_SAMPLE_RATE = 24_000;

/** Bits per PCM sample written to the speaker. */
const SPEAKER_BIT_DEPTH = 16;

/** Number of audio channels written to the speaker (mono). */
const SPEAKER_CHANNELS = 1;

/** PCM throughput in bytes per second, derived from sample rate, sample width and channel count. */
const BYTES_PER_SECOND = SPEAKER_CHANNELS * (SPEAKER_BIT_DEPTH / 8) * TTS_SAMPLE_RATE;

/** How often, in milliseconds, the playback wait re-checks the interrupt flag. */
const INTERRUPT_CHECK_INTERVAL_MS = 50;
|
|
40
|
-
|
|
41
|
-
// ============================================================================
|
|
42
|
-
// INTERFACES
|
|
43
|
-
// ============================================================================
|
|
44
|
-
|
|
45
|
-
/**
 * Everything `createElevenlabsTts` needs: API credentials, the voice and model
 * to synthesize with, and the hooks into the speaker pipeline.
 */
export interface ElevenlabsTtsConfig {
  /** API key sent in the `xi-api-key` request header */
  apiKey: string;
  /** ID of the ElevenLabs voice used for synthesis */
  voiceId: string;
  /** ID of the ElevenLabs model, e.g. "eleven_monolingual_v1" */
  modelId: string;
  /** Destination stream that receives the raw PCM audio */
  speakerInput: Writable;
  /** Invoked on interruption so the playback buffer can be cleared */
  interruptPlayback: () => void;
  /** Invoked before the next utterance that follows an interruption */
  resumePlayback: () => void;
}
|
|
62
|
-
|
|
63
|
-
// ============================================================================
|
|
64
|
-
// MAIN HANDLERS
|
|
65
|
-
// ============================================================================
|
|
66
|
-
|
|
67
|
-
/**
 * Create a TtsPlayer that uses the ElevenLabs streaming TTS API.
 *
 * Text is POSTed to the ElevenLabs streaming endpoint with
 * `output_format=pcm_24000`, and the PCM chunks that come back are written
 * straight to `config.speakerInput` without any conversion.
 *
 * The returned player exposes:
 * - `speak(text)`        - synthesize one string; resolves once the audio has been
 *                          written to the speaker stream (it does not wait for playback)
 * - `speakStream(texts)` - synthesize sentence by sentence from a stream of text
 *                          chunks, then wait for the estimated playback to finish
 * - `interrupt()`        - stop the current utterance and clear the playback buffer
 * - `isSpeaking()`       - whether a speak/speakStream call is in progress
 * - `destroy()`          - interrupt and reject all further speak calls
 *
 * @param config - ElevenLabs TTS configuration (API key, voice, model, speaker stream)
 * @returns A TtsPlayer instance ready for playback
 */
export async function createElevenlabsTts(config: ElevenlabsTtsConfig): Promise<TtsPlayer> {
  const { apiKey, voiceId, modelId, speakerInput, interruptPlayback, resumePlayback } = config;

  let destroyed = false;
  let speaking = false;
  let interruptFlag = false;
  let wasInterrupted = false;

  /**
   * Shared preamble of speak/speakStream: refuse to run on a destroyed player,
   * clear the interrupt flag, mark the player as speaking and, when the previous
   * utterance was interrupted, tell the speaker pipeline to resume.
   *
   * @throws Error if the player has been destroyed
   */
  function beginUtterance(): void {
    if (destroyed) throw new Error("TtsPlayer has been destroyed");

    interruptFlag = false;
    speaking = true;

    if (wasInterrupted) {
      resumePlayback();
      wasInterrupted = false;
    }
  }

  /**
   * POST text to the ElevenLabs TTS streaming endpoint and stream PCM chunks
   * to the speaker. Stops early (without throwing) once the interrupt flag is set.
   *
   * @param text - The text to synthesize
   * @param onChunkWritten - Optional hook called with the byte length of every
   *                         chunk after it has been handed to the speaker stream
   * @returns Total PCM bytes written to the speaker stream
   * @throws Error if the API answers with a non-2xx status
   */
  async function streamTtsToSpeaker(
    text: string,
    onChunkWritten?: (byteCount: number) => void,
  ): Promise<number> {
    const url = `${ELEVENLABS_TTS_BASE_URL}/${voiceId}/stream?output_format=pcm_24000`;

    const response = await fetch(url, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "xi-api-key": apiKey,
      },
      body: JSON.stringify({ text, model_id: modelId }),
    });

    if (!response.ok) {
      const errorText = await response.text().catch(() => "unknown error");
      throw new Error(`ElevenLabs TTS API error ${response.status}: ${errorText}`);
    }

    // Interrupted while the request was in flight: discard the body explicitly
    // instead of leaving an unread response stream behind.
    if (interruptFlag) {
      await response.body?.cancel().catch(() => undefined);
      return 0;
    }

    let totalBytes = 0;

    for await (const chunk of readResponseChunks(response)) {
      if (interruptFlag) break;

      const pcmBuffer = Buffer.from(chunk);
      totalBytes += pcmBuffer.length;
      await writePcm(speakerInput, pcmBuffer);
      onChunkWritten?.(pcmBuffer.length);
    }

    return totalBytes;
  }

  /**
   * Wait for the estimated remaining playback time, allowing interruption to cancel.
   * Both timers are cleared as soon as either path finishes, so nothing keeps
   * ticking after the promise has settled.
   *
   * @param remainingMs - Milliseconds to wait for playback to finish
   */
  function waitForPlayback(remainingMs: number): Promise<void> {
    return new Promise<void>((resolve) => {
      const finish = (): void => {
        clearTimeout(timer);
        clearInterval(poll);
        resolve();
      };

      const timer = setTimeout(finish, remainingMs);

      // Poll the interrupt flag to allow early cancellation
      const poll = setInterval(() => {
        if (interruptFlag) finish();
      }, INTERRUPT_CHECK_INTERVAL_MS);
    });
  }

  /**
   * Generate audio for a single text string via ElevenLabs API and play it.
   *
   * @param text - The text to speak
   * @throws Error if the player has been destroyed or the API request fails
   */
  async function speak(text: string): Promise<void> {
    beginUtterance();

    try {
      await streamTtsToSpeaker(text);
    } finally {
      speaking = false;
    }
  }

  /**
   * Stream text chunks into TTS for pipelined playback.
   * Buffers text deltas into sentences, generates audio per sentence via
   * the ElevenLabs API, and writes PCM to the speaker stream. When the input is
   * exhausted, waits for the estimated end of playback unless interrupted.
   *
   * @param texts - Async iterable of text chunks from the narrator
   * @throws Error if the player has been destroyed or an API request fails
   */
  async function speakStream(texts: AsyncIterable<TextChunk>): Promise<void> {
    beginUtterance();

    const t0 = Date.now();
    let firstTextLogged = false;
    let playbackFinishAt = 0;

    // Track estimated playback end. If the speaker buffer drained during a
    // gap (e.g. tool call), new audio starts from now, not after previous audio.
    const trackPlayback = (byteCount: number): void => {
      const audioDurationMs = (byteCount / BYTES_PER_SECOND) * 1000;
      playbackFinishAt = Math.max(playbackFinishAt, Date.now()) + audioDurationMs;
    };

    try {
      for await (const sentence of bufferSentences(texts)) {
        if (interruptFlag) break;

        if (!firstTextLogged) {
          console.log(`[tts-elevenlabs] first sentence at +${Date.now() - t0}ms: "${sentence.slice(0, 50)}${sentence.length > 50 ? "..." : ""}"`);
          firstTextLogged = true;
        }

        await streamTtsToSpeaker(sentence, trackPlayback);

        if (interruptFlag) break;
      }

      // Wait for buffered audio to finish playing through the speakers
      if (!interruptFlag && playbackFinishAt > 0) {
        const remainingMs = playbackFinishAt - Date.now();
        if (remainingMs > 0) {
          console.log(`[tts-elevenlabs] waiting ${(remainingMs / 1000).toFixed(1)}s for playback to finish`);
          await waitForPlayback(remainingMs);
        }
      }
    } finally {
      speaking = false;
    }
  }

  /**
   * Interrupt current playback: sets the interrupt flag (checked by the
   * streaming loops and the playback wait) and clears the playback buffer.
   * Does nothing once the player has been destroyed.
   */
  function interrupt(): void {
    if (destroyed) return;
    interruptFlag = true;
    wasInterrupted = true;
    interruptPlayback();
  }

  /**
   * Check whether TTS is currently active.
   * @returns true if a speak/speakStream call is in progress
   */
  function checkIsSpeaking(): boolean {
    return speaking;
  }

  /**
   * Stop any ongoing utterance and prevent further usage.
   */
  function destroyPlayer(): void {
    if (destroyed) return;
    try {
      // interrupt() is a no-op on a destroyed player, so it has to run BEFORE
      // the flag is flipped; otherwise an utterance in progress keeps streaming.
      interrupt();
    } finally {
      destroyed = true;
    }
  }

  return {
    speak,
    speakStream,
    interrupt,
    isSpeaking: checkIsSpeaking,
    destroy: destroyPlayer,
  };
}
|
|
288
|
-
|
|
289
|
-
// ============================================================================
|
|
290
|
-
// HELPER FUNCTIONS
|
|
291
|
-
// ============================================================================
|
|
292
|
-
|
|
293
|
-
/**
 * Write a PCM buffer to the speaker stream, respecting backpressure.
 *
 * Resolves immediately when the stream accepts the chunk without signalling
 * backpressure; otherwise resolves on the next "drain". While waiting for
 * "drain" the promise also settles if the stream errors or closes, so a
 * destroyed speaker stream cannot leave the caller waiting forever. All
 * listeners added here are removed once the promise settles.
 *
 * @param stream - The speaker writable stream
 * @param pcmBuffer - Raw PCM bytes to write
 * @returns Promise that resolves when more data may be written
 * @throws Rejects with the write/stream error, or with an Error when the stream
 *         closes before draining
 */
function writePcm(stream: Writable, pcmBuffer: Buffer): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    let settled = false;

    const onDrain = (): void => settle();
    const onError = (err: Error): void => settle(err);
    const onClose = (): void =>
      settle(new Error("Speaker stream closed before the PCM write drained"));

    // Single exit point: detaches every listener and settles exactly once.
    function settle(err?: Error | null): void {
      if (settled) return;
      settled = true;
      stream.off("drain", onDrain);
      stream.off("error", onError);
      stream.off("close", onClose);
      if (err) reject(err);
      else resolve();
    }

    const accepted = stream.write(pcmBuffer, (err: Error | null | undefined) => {
      if (err) settle(err);
    });

    if (accepted) {
      settle();
      return;
    }

    // The write callback may already have reported a failure.
    if (settled) return;

    // Backpressure: "drain" is the normal way out, "error"/"close" cover a
    // stream that dies before it ever drains.
    stream.once("drain", onDrain);
    stream.once("error", onError);
    stream.once("close", onClose);
  });
}
|
|
310
|
-
|
|
311
|
-
/** Sentence-ending punctuation pattern: one of . ! ? followed by at least one whitespace character */
const SENTENCE_END_RE = /[.!?][\s]+/;

/** Minimum sentence length before we'll split on punctuation */
const MIN_SENTENCE_LENGTH = 20;

/**
 * Buffer streaming text deltas into complete sentences for TTS generation.
 *
 * - Plain string chunks are appended to a buffer. Once the buffer holds at least
 *   MIN_SENTENCE_LENGTH characters it is split at the first sentence end found
 *   at or after that length, so short fragments ("Hi. Ok.") stay attached to
 *   the text that follows them.
 * - Non-string chunks (objects carrying a `text` field, e.g. tool narration)
 *   flush whatever is buffered and are then yielded as-is, without waiting for
 *   punctuation. Chunks whose text is empty or whitespace-only are skipped,
 *   matching how blank buffered text is already skipped.
 * - Text left in the buffer when the input ends is yielded last.
 *
 * @param texts - Async iterable of TextChunk from the narrator
 * @yields Complete, non-blank sentences ready for TTS
 */
async function* bufferSentences(texts: AsyncIterable<TextChunk>): AsyncGenerator<string> {
  let buffer = "";

  for await (const raw of texts) {
    if (typeof raw !== "string") {
      // Flush chunk: emit pending text first so the spoken order is preserved.
      const pending = buffer.trim();
      if (pending) {
        yield pending;
        buffer = "";
      }
      // A blank chunk has nothing to speak, so do not emit it as a sentence.
      if (raw.text && raw.text.trim()) yield raw.text;
      continue;
    }

    buffer += raw;

    while (buffer.length >= MIN_SENTENCE_LENGTH) {
      // Only look for a sentence end from the minimum length onwards.
      const match = SENTENCE_END_RE.exec(buffer.slice(MIN_SENTENCE_LENGTH - 1));
      if (!match) break;

      const splitIndex = MIN_SENTENCE_LENGTH - 1 + match.index + match[0].length;
      const sentence = buffer.slice(0, splitIndex).trim();
      buffer = buffer.slice(splitIndex);

      if (sentence) yield sentence;
    }
  }

  const remaining = buffer.trim();
  if (remaining) yield remaining;
}
|
|
354
|
-
|
|
355
|
-
/**
 * Read chunks from a fetch Response body as an async iterable, ensuring each
 * yielded chunk is aligned to 16-bit sample boundaries (even byte count).
 *
 * HTTP streaming splits the byte stream at arbitrary boundaries. A chunk with
 * an odd byte count splits a 16-bit PCM sample in half, so the trailing byte of
 * an odd-length chunk is held back and prepended to the next chunk.
 *
 * If the consumer stops iterating early (e.g. `break` on interruption) or a
 * read fails, the body is cancelled before the reader lock is released, so the
 * rest of the response is not left streaming with nobody reading it.
 *
 * @param response - The fetch Response to read from
 * @yields Buffer chunks of sample-aligned raw PCM audio data; a final 1-byte
 *         chunk is only produced when the total body length is odd
 * @throws Error if the response has no body
 */
async function* readResponseChunks(response: Response): AsyncGenerator<Buffer> {
  const body = response.body;
  if (!body) throw new Error("ElevenLabs TTS response has no body");

  const reader = body.getReader();
  let leftover: Buffer | null = null;
  let exhausted = false;

  try {
    while (true) {
      const { done, value } = await reader.read();
      if (done) {
        exhausted = true;
        break;
      }
      if (!value) continue;

      let chunk: Buffer = leftover ? Buffer.concat([leftover, value]) : Buffer.from(value);
      leftover = null;

      // Hold back the last byte if odd length (split sample)
      if (chunk.byteLength % 2 !== 0) {
        leftover = Buffer.from(chunk.subarray(chunk.byteLength - 1));
        chunk = chunk.subarray(0, chunk.byteLength - 1);
      }

      if (chunk.byteLength > 0) yield chunk;
    }

    // Flush any remaining byte (only happens with malformed PCM)
    if (leftover && leftover.byteLength > 0) yield leftover;
  } finally {
    if (!exhausted) {
      // Early exit or read failure: signal loss of interest in the rest of the
      // body. Cancelling an already-errored stream rejects, which is ignored.
      await reader.cancel().catch(() => undefined);
    }
    reader.releaseLock();
  }
}
|
|
@@ -1,155 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* TTS provider factory and readiness checks.
|
|
3
|
-
*
|
|
4
|
-
* Routes TTS creation to the ElevenLabs provider implementation.
|
|
5
|
-
* Checks provider readiness (API keys) for dashboard status.
|
|
6
|
-
*
|
|
7
|
-
* Responsibilities:
|
|
8
|
-
* - Create a TtsPlayer for the configured provider
|
|
9
|
-
* - Check provider readiness (API keys set)
|
|
10
|
-
* - Provide static metadata about available TTS providers
|
|
11
|
-
*/
|
|
12
|
-
|
|
13
|
-
import { createElevenlabsTts } from "./tts-elevenlabs.js";
|
|
14
|
-
import { readEnv } from "../services/env.js";
|
|
15
|
-
|
|
16
|
-
import type { Writable } from "stream";
|
|
17
|
-
import type { TtsPlayer, TtsProviderType, TtsProviderConfig, ProviderStatus } from "./types.js";
|
|
18
|
-
|
|
19
|
-
// ============================================================================
|
|
20
|
-
// CONSTANTS
|
|
21
|
-
// ============================================================================
|
|
22
|
-
|
|
23
|
-
/** Root URL of the ElevenLabs v1 REST API; endpoint paths are appended to it. */
const ELEVENLABS_API_BASE = "https://api.elevenlabs.io/v1";
|
|
25
|
-
|
|
26
|
-
// ============================================================================
|
|
27
|
-
// INTERFACES
|
|
28
|
-
// ============================================================================
|
|
29
|
-
|
|
30
|
-
/**
 * One selectable voice, as produced by listVoicesForProvider.
 */
export interface VoiceOption {
  /** Provider-side identifier of the voice */
  id: string;
  /** Display name of the voice */
  name: string;
}
|
|
37
|
-
|
|
38
|
-
/**
 * Static, display-oriented description of a TTS provider (shown in the dashboard).
 */
export interface TtsProviderInfo {
  /** Identifier of the provider */
  type: TtsProviderType;
  /** Provider name as presented to users */
  name: string;
  /** One-line summary of what the provider is */
  description: string;
  /** Name of the environment variable holding the API key; left undefined when no key is needed */
  requiresApiKey?: string;
}
|
|
51
|
-
|
|
52
|
-
/**
 * Input of the provider factory: which provider to build and how to connect
 * it to the speaker pipeline.
 */
export interface CreateTtsOptions {
  /** Selected provider together with its per-provider settings */
  providerConfig: TtsProviderConfig;
  /** Destination stream that receives the PCM audio */
  speakerInput: Writable;
  /** Invoked on interruption so the playback buffer can be cleared */
  interruptPlayback: () => void;
  /** Invoked to resume playback once an interruption is over */
  resumePlayback: () => void;
}
|
|
65
|
-
|
|
66
|
-
// ============================================================================
|
|
67
|
-
// MAIN HANDLERS
|
|
68
|
-
// ============================================================================
|
|
69
|
-
|
|
70
|
-
/**
 * Build the TtsPlayer that matches `options.providerConfig.provider`.
 *
 * Only "elevenlabs" is known; its API key, voice ID and model ID are taken from
 * `providerConfig.elevenlabs` and forwarded, together with the speaker stream
 * and playback callbacks, to createElevenlabsTts.
 *
 * @param options - Provider config, speaker stream, and playback callbacks
 * @returns A TtsPlayer instance ready for playback
 * @throws Error if the provider is not implemented
 */
export async function createTtsForProvider(options: CreateTtsOptions): Promise<TtsPlayer> {
  const { providerConfig, speakerInput, interruptPlayback, resumePlayback } = options;

  // Guard first: anything other than the single supported provider is rejected.
  if (providerConfig.provider !== "elevenlabs") {
    throw new Error(`Unknown TTS provider: ${providerConfig.provider}`);
  }

  const { apiKey, voiceId, modelId } = providerConfig.elevenlabs;

  return createElevenlabsTts({
    apiKey,
    voiceId,
    modelId,
    speakerInput,
    interruptPlayback,
    resumePlayback,
  });
}
|
|
95
|
-
|
|
96
|
-
/**
 * Report whether a TTS provider can be used right now.
 *
 * For "elevenlabs" this reads the environment via readEnv() and considers the
 * provider ready when ELEVENLABS_API_KEY has a truthy value.
 *
 * @param providerType - The provider to check
 * @returns `{ ready: true }`, or `{ ready: false, ... }` with a reason and detail
 * @throws Error if the provider type is not known
 */
export async function getTtsProviderStatus(providerType: TtsProviderType): Promise<ProviderStatus> {
  if (providerType !== "elevenlabs") {
    throw new Error(`Unknown TTS provider: ${providerType}`);
  }

  const { ELEVENLABS_API_KEY: apiKey } = await readEnv();

  return apiKey
    ? { ready: true }
    : { ready: false, reason: "missing_api_key", detail: "ELEVENLABS_API_KEY is not set in .env" };
}
|
|
117
|
-
|
|
118
|
-
/**
 * List available voices for a TTS provider.
 *
 * For "elevenlabs" this GETs `${ELEVENLABS_API_BASE}/voices` and maps every
 * entry that has a string `voice_id` and a string `name` to a VoiceOption.
 * The response body is validated instead of being trusted: a payload without a
 * `voices` array is reported as an error, and malformed entries are skipped
 * rather than turned into options with undefined fields.
 *
 * NOTE(review): the request carries no API key header, so presumably only the
 * voices the API exposes without authentication are returned - confirm against
 * the ElevenLabs API reference.
 *
 * @param providerType - The provider to list voices for
 * @returns Array of voice options (possibly empty)
 * @throws Error on a non-2xx response, an unexpected response shape, or an
 *         unknown provider type
 */
export async function listVoicesForProvider(
  providerType: TtsProviderType,
): Promise<VoiceOption[]> {
  switch (providerType) {
    case "elevenlabs": {
      const res = await fetch(`${ELEVENLABS_API_BASE}/voices`);
      if (!res.ok) throw new Error(`ElevenLabs API error: ${res.status}`);

      const payload: unknown = await res.json();
      const rawVoices = (payload as { voices?: unknown } | null)?.voices;
      if (!Array.isArray(rawVoices)) {
        throw new Error("ElevenLabs API error: unexpected response shape from /voices");
      }

      const voices: VoiceOption[] = [];
      for (const entry of rawVoices as unknown[]) {
        if (typeof entry !== "object" || entry === null) continue;
        const { voice_id: id, name } = entry as { voice_id?: unknown; name?: unknown };
        if (typeof id === "string" && typeof name === "string") {
          voices.push({ id, name });
        }
      }
      return voices;
    }

    default:
      throw new Error(`Unknown TTS provider: ${providerType}`);
  }
}
|
|
140
|
-
|
|
141
|
-
/**
 * Describe every TTS provider this module knows about.
 *
 * A new array is built on each call, so callers may modify the result freely.
 *
 * @returns Provider metadata entries (currently only ElevenLabs)
 */
export function getAvailableTtsProviders(): TtsProviderInfo[] {
  const elevenlabs: TtsProviderInfo = {
    type: "elevenlabs",
    name: "ElevenLabs",
    description: "Cloud TTS via ElevenLabs streaming API",
    requiresApiKey: "ELEVENLABS_API_KEY",
  };

  return [elevenlabs];
}
|