@clawdbot/voice-call 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +20 -0
- package/README.md +107 -0
- package/index.ts +477 -0
- package/package.json +14 -0
- package/src/cli.ts +297 -0
- package/src/config.ts +355 -0
- package/src/core-bridge.ts +190 -0
- package/src/manager.ts +846 -0
- package/src/media-stream.ts +279 -0
- package/src/providers/base.ts +67 -0
- package/src/providers/index.ts +9 -0
- package/src/providers/mock.ts +168 -0
- package/src/providers/stt-openai-realtime.ts +303 -0
- package/src/providers/telnyx.ts +364 -0
- package/src/providers/tts-openai.ts +264 -0
- package/src/providers/twilio.ts +537 -0
- package/src/response-generator.ts +171 -0
- package/src/runtime.ts +194 -0
- package/src/tunnel.ts +330 -0
- package/src/types.ts +272 -0
- package/src/utils.ts +12 -0
- package/src/voice-mapping.ts +65 -0
- package/src/webhook-security.ts +197 -0
- package/src/webhook.ts +480 -0
package/src/providers/tts-openai.ts:
@@ -0,0 +1,264 @@
/**
 * OpenAI TTS Provider
 *
 * Generates speech audio using OpenAI's text-to-speech API.
 * Handles audio format conversion for telephony (mu-law 8kHz).
 *
 * Best practices from OpenAI docs:
 * - Use gpt-4o-mini-tts for intelligent realtime applications (supports instructions)
 * - Use tts-1 for lower latency, tts-1-hd for higher quality
 * - Use marin or cedar voices for best quality
 * - Use pcm or wav format for fastest response times
 *
 * @see https://platform.openai.com/docs/guides/text-to-speech
 */

/**
 * OpenAI TTS configuration.
 */
export interface OpenAITTSConfig {
  /** OpenAI API key (uses OPENAI_API_KEY env if not set) */
  apiKey?: string;
  /**
   * TTS model:
   * - gpt-4o-mini-tts: newest, supports instructions for tone/style control (recommended)
   * - tts-1: lower latency
   * - tts-1-hd: higher quality
   */
  model?: string;
  /**
   * Voice to use. For best quality, use marin or cedar.
   * All 13 voices: alloy, ash, ballad, coral, echo, fable, nova, onyx, sage, shimmer, verse, marin, cedar
   * Note: tts-1/tts-1-hd only support: alloy, ash, coral, echo, fable, onyx, nova, sage, shimmer
   */
  voice?: string;
  /** Speed multiplier (0.25 to 4.0) */
  speed?: number;
  /**
   * Instructions for speech style (only works with gpt-4o-mini-tts model).
   * Examples: "Speak in a cheerful tone", "Talk like a sympathetic customer service agent"
   */
  instructions?: string;
}

/**
 * Supported OpenAI TTS voices (all 13 built-in voices).
 * For best quality, use marin or cedar.
 * Note: tts-1 and tts-1-hd support a smaller set.
 */
export const OPENAI_TTS_VOICES = [
  "alloy",
  "ash",
  "ballad",
  "coral",
  "echo",
  "fable",
  "nova",
  "onyx",
  "sage",
  "shimmer",
  "verse",
  "marin",
  "cedar",
] as const;

export type OpenAITTSVoice = (typeof OPENAI_TTS_VOICES)[number];

/**
 * OpenAI TTS Provider for generating speech audio.
 */
export class OpenAITTSProvider {
  private apiKey: string;
  private model: string;
  private voice: OpenAITTSVoice;
  private speed: number;
  private instructions?: string;

  constructor(config: OpenAITTSConfig = {}) {
    this.apiKey = config.apiKey || process.env.OPENAI_API_KEY || "";
    // Default to gpt-4o-mini-tts for intelligent realtime applications
    this.model = config.model || "gpt-4o-mini-tts";
    // Default to coral - good balance of quality and natural tone
    this.voice = (config.voice as OpenAITTSVoice) || "coral";
    this.speed = config.speed || 1.0;
    this.instructions = config.instructions;

    if (!this.apiKey) {
      throw new Error(
        "OpenAI API key required (set OPENAI_API_KEY or pass apiKey)",
      );
    }
  }

  /**
   * Generate speech audio from text.
   * Returns raw PCM audio data (24kHz, mono, 16-bit).
   */
  async synthesize(text: string, instructions?: string): Promise<Buffer> {
    // Build request body
    const body: Record<string, unknown> = {
      model: this.model,
      input: text,
      voice: this.voice,
      response_format: "pcm", // Raw PCM audio (24kHz, mono, 16-bit signed LE)
      speed: this.speed,
    };

    // Add instructions if using gpt-4o-mini-tts model
    const effectiveInstructions = instructions || this.instructions;
    if (effectiveInstructions && this.model.includes("gpt-4o-mini-tts")) {
      body.instructions = effectiveInstructions;
    }

    const response = await fetch("https://api.openai.com/v1/audio/speech", {
      method: "POST",
      headers: {
        Authorization: `Bearer ${this.apiKey}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify(body),
    });

    if (!response.ok) {
      const error = await response.text();
      throw new Error(`OpenAI TTS failed: ${response.status} - ${error}`);
    }

    const arrayBuffer = await response.arrayBuffer();
    return Buffer.from(arrayBuffer);
  }

  /**
   * Generate speech and convert to mu-law format for Twilio.
   * Twilio Media Streams expect 8kHz mono mu-law audio.
   */
  async synthesizeForTwilio(text: string): Promise<Buffer> {
    // Get raw PCM from OpenAI (24kHz, 16-bit signed LE, mono)
    const pcm24k = await this.synthesize(text);

    // Resample from 24kHz to 8kHz
    const pcm8k = resample24kTo8k(pcm24k);

    // Encode to mu-law
    return pcmToMulaw(pcm8k);
  }
}

/**
 * Resample 24kHz PCM to 8kHz using linear interpolation.
 * Input/output: 16-bit signed little-endian mono.
 */
function resample24kTo8k(input: Buffer): Buffer {
  const inputSamples = input.length / 2;
  const outputSamples = Math.floor(inputSamples / 3);
  const output = Buffer.alloc(outputSamples * 2);

  for (let i = 0; i < outputSamples; i++) {
    // Calculate position in input (3:1 ratio)
    const srcPos = i * 3;
    const srcIdx = srcPos * 2;

    if (srcIdx + 3 < input.length) {
      // Linear interpolation between samples
      const s0 = input.readInt16LE(srcIdx);
      const s1 = input.readInt16LE(srcIdx + 2);
      const frac = srcPos % 1 || 0;
      const sample = Math.round(s0 + frac * (s1 - s0));
      output.writeInt16LE(clamp16(sample), i * 2);
    } else {
      // Last sample
      output.writeInt16LE(input.readInt16LE(srcIdx), i * 2);
    }
  }

  return output;
}

/**
 * Clamp value to 16-bit signed integer range.
 */
function clamp16(value: number): number {
  return Math.max(-32768, Math.min(32767, value));
}

/**
 * Convert 16-bit PCM to 8-bit mu-law.
 * Standard G.711 mu-law encoding for telephony.
 */
function pcmToMulaw(pcm: Buffer): Buffer {
  const samples = pcm.length / 2;
  const mulaw = Buffer.alloc(samples);

  for (let i = 0; i < samples; i++) {
    const sample = pcm.readInt16LE(i * 2);
    mulaw[i] = linearToMulaw(sample);
  }

  return mulaw;
}

/**
 * Convert a single 16-bit linear sample to 8-bit mu-law.
 * Implements ITU-T G.711 mu-law encoding.
 */
function linearToMulaw(sample: number): number {
  const BIAS = 132;
  const CLIP = 32635;

  // Get sign bit
  const sign = sample < 0 ? 0x80 : 0;
  if (sample < 0) sample = -sample;

  // Clip to prevent overflow
  if (sample > CLIP) sample = CLIP;

  // Add bias and find segment
  sample += BIAS;
  let exponent = 7;
  for (
    let expMask = 0x4000;
    (sample & expMask) === 0 && exponent > 0;
    exponent--, expMask >>= 1
  ) {
    // Find the segment (exponent)
  }

  // Extract mantissa bits
  const mantissa = (sample >> (exponent + 3)) & 0x0f;

  // Combine into mu-law byte (inverted for transmission)
  return ~(sign | (exponent << 4) | mantissa) & 0xff;
}

/**
 * Convert 8-bit mu-law to 16-bit linear PCM.
 * Useful for decoding incoming audio.
 */
export function mulawToLinear(mulaw: number): number {
  // mu-law is transmitted inverted
  mulaw = ~mulaw & 0xff;

  const sign = mulaw & 0x80;
  const exponent = (mulaw >> 4) & 0x07;
  const mantissa = mulaw & 0x0f;

  let sample = ((mantissa << 3) + 132) << exponent;
  sample -= 132;

  return sign ? -sample : sample;
}

/**
 * Chunk audio buffer into 20ms frames for streaming.
 * At 8kHz mono, 20ms = 160 samples = 160 bytes (mu-law).
 */
export function chunkAudio(
  audio: Buffer,
  chunkSize = 160,
): Generator<Buffer, void, unknown> {
  return (function* () {
    for (let i = 0; i < audio.length; i += chunkSize) {
      yield audio.subarray(i, Math.min(i + chunkSize, audio.length));
    }
  })();
}
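For orientation, below is a minimal, illustrative sketch of how the exports above might be driven from a Twilio Media Streams WebSocket handler. It is not part of the package: the OutboundSocket shape, the streamSid argument, the sendToTwilio helper, the import path, and the fixed 20 ms pacing loop are all assumptions made for the example; only OpenAITTSProvider, synthesizeForTwilio, and chunkAudio come from the file shown in the diff.

// Illustrative only - not part of @clawdbot/voice-call. Assumes this file sits
// next to tts-openai.ts and that OPENAI_API_KEY is set in the environment.
import { OpenAITTSProvider, chunkAudio } from "./tts-openai";

// Minimal shape of a WebSocket we can send on (e.g. a connection from the "ws" library).
interface OutboundSocket {
  send(data: string): void;
}

// Hypothetical helper: wrap one mu-law frame in Twilio's outbound "media" message.
function sendToTwilio(ws: OutboundSocket, streamSid: string, frame: Buffer): void {
  ws.send(
    JSON.stringify({
      event: "media",
      streamSid,
      media: { payload: frame.toString("base64") },
    }),
  );
}

// Synthesize `text` and stream it to the call in 20 ms (160-byte) mu-law frames.
export async function speak(
  ws: OutboundSocket,
  streamSid: string,
  text: string,
): Promise<void> {
  const tts = new OpenAITTSProvider({ voice: "coral" });
  const mulaw = await tts.synthesizeForTwilio(text); // 8kHz mono mu-law

  for (const frame of chunkAudio(mulaw)) {
    sendToTwilio(ws, streamSid, frame);
    // Naive real-time pacing: one frame every 20 ms.
    await new Promise((resolve) => setTimeout(resolve, 20));
  }
}

The fixed 20 ms sleep only approximates real-time playback of the 160-byte frames; a real transport would need proper buffering and backpressure handling.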