@clawdbot/voice-call 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,264 @@
1
+ /**
2
+ * OpenAI TTS Provider
3
+ *
4
+ * Generates speech audio using OpenAI's text-to-speech API.
5
+ * Handles audio format conversion for telephony (mu-law 8kHz).
6
+ *
7
+ * Best practices from OpenAI docs:
8
+ * - Use gpt-4o-mini-tts for intelligent realtime applications (supports instructions)
9
+ * - Use tts-1 for lower latency, tts-1-hd for higher quality
10
+ * - Use marin or cedar voices for best quality
11
+ * - Use pcm or wav format for fastest response times
12
+ *
13
+ * @see https://platform.openai.com/docs/guides/text-to-speech
14
+ */
15
+
16
/**
 * Options accepted when constructing the OpenAI text-to-speech provider.
 * Every field is optional.
 */
export interface OpenAITTSConfig {
  /** OpenAI API key; when omitted, the OPENAI_API_KEY environment variable is used. */
  apiKey?: string;

  /**
   * Name of the TTS model to call:
   * - `gpt-4o-mini-tts` — newest; accepts `instructions` for tone/style control (recommended)
   * - `tts-1` — lower latency
   * - `tts-1-hd` — higher quality
   */
  model?: string;

  /**
   * Voice name. `marin` and `cedar` give the best quality.
   *
   * The 13 built-in voices are: alloy, ash, ballad, coral, echo, fable, nova,
   * onyx, sage, shimmer, verse, marin, cedar.
   *
   * `tts-1` / `tts-1-hd` only support: alloy, ash, coral, echo, fable, onyx,
   * nova, sage, shimmer.
   */
  voice?: string;

  /** Playback speed multiplier, from 0.25 to 4.0. */
  speed?: number;

  /**
   * Free-form guidance for the speaking style; only honoured by the
   * `gpt-4o-mini-tts` model.
   *
   * Examples: "Speak in a cheerful tone",
   * "Talk like a sympathetic customer service agent".
   */
  instructions?: string;
}
43
+
44
/**
 * The 13 built-in OpenAI TTS voice names, as a readonly tuple of literals.
 *
 * `marin` and `cedar` give the best quality. The `tts-1` and `tts-1-hd`
 * models accept only a subset of these names.
 */
export const OPENAI_TTS_VOICES = [
  "alloy", "ash", "ballad", "coral", "echo",
  "fable", "nova", "onyx", "sage", "shimmer",
  "verse", "marin", "cedar",
] as const;

/** Union of the literal voice names listed in {@link OPENAI_TTS_VOICES}. */
export type OpenAITTSVoice = (typeof OPENAI_TTS_VOICES)[number];
66
+
67
/**
 * Text-to-speech provider that calls OpenAI's `/v1/audio/speech` endpoint.
 *
 * Configuration is captured once at construction time; each call to
 * {@link OpenAITTSProvider.synthesize} issues one HTTP request.
 */
export class OpenAITTSProvider {
  private readonly apiKey: string;
  private readonly model: string;
  private readonly voice: OpenAITTSVoice;
  private readonly speed: number;
  private readonly instructions?: string;

  /**
   * @param config Optional settings. Missing values fall back to:
   *   the `OPENAI_API_KEY` environment variable for the key,
   *   `gpt-4o-mini-tts` for the model, `coral` for the voice and `1.0` for
   *   the speed. A falsy speed (for example `0`) also falls back to `1.0`.
   * @throws Error when no API key is available from either source.
   */
  constructor(config: OpenAITTSConfig = {}) {
    this.apiKey = config.apiKey || process.env.OPENAI_API_KEY || "";
    // Default to gpt-4o-mini-tts for intelligent realtime applications
    this.model = config.model || "gpt-4o-mini-tts";
    // Default to coral - good balance of quality and natural tone.
    // NOTE: the voice string is not validated against OPENAI_TTS_VOICES here;
    // an unknown name is passed through to the API as-is.
    this.voice = (config.voice as OpenAITTSVoice) || "coral";
    this.speed = config.speed || 1.0;
    this.instructions = config.instructions;

    if (!this.apiKey) {
      throw new Error(
        "OpenAI API key required (set OPENAI_API_KEY or pass apiKey)",
      );
    }
  }

  /**
   * Generate speech audio from text.
   *
   * Requests the `pcm` response format, which the API delivers as raw
   * 24kHz, mono, 16-bit signed little-endian samples.
   *
   * @param text Text to speak.
   * @param instructions Optional per-call style instructions. When empty or
   *   omitted, the instructions given at construction time are used. They are
   *   only sent when the model name contains `gpt-4o-mini-tts`.
   * @returns The response body as a Buffer of raw PCM audio.
   * @throws Error when the HTTP response is not OK; the message contains the
   *   status code and the response body text.
   */
  async synthesize(text: string, instructions?: string): Promise<Buffer> {
    // Build request body
    const body: Record<string, unknown> = {
      model: this.model,
      input: text,
      voice: this.voice,
      response_format: "pcm", // Raw PCM audio (24kHz, mono, 16-bit signed LE)
      speed: this.speed,
    };

    // Add instructions if using gpt-4o-mini-tts model
    const effectiveInstructions = instructions || this.instructions;
    if (effectiveInstructions && this.model.includes("gpt-4o-mini-tts")) {
      body.instructions = effectiveInstructions;
    }

    const response = await fetch("https://api.openai.com/v1/audio/speech", {
      method: "POST",
      headers: {
        Authorization: `Bearer ${this.apiKey}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify(body),
    });

    if (!response.ok) {
      const error = await response.text();
      throw new Error(`OpenAI TTS failed: ${response.status} - ${error}`);
    }

    const arrayBuffer = await response.arrayBuffer();
    return Buffer.from(arrayBuffer);
  }

  /**
   * Generate speech and convert to mu-law format for Twilio.
   * Twilio Media Streams expect 8kHz mono mu-law audio.
   *
   * @param text Text to speak.
   * @param instructions Optional per-call style instructions, forwarded to
   *   {@link OpenAITTSProvider.synthesize} unchanged.
   * @returns One mu-law byte per 8kHz sample.
   * @throws Error under the same conditions as `synthesize`.
   */
  async synthesizeForTwilio(
    text: string,
    instructions?: string,
  ): Promise<Buffer> {
    // Get raw PCM from OpenAI (24kHz, 16-bit signed LE, mono)
    const pcm24k = await this.synthesize(text, instructions);

    // Resample from 24kHz to 8kHz
    const pcm8k = resample24kTo8k(pcm24k);

    // Encode to mu-law
    return pcmToMulaw(pcm8k);
  }
}
146
+
147
/**
 * Downsample 24kHz PCM to 8kHz (3:1).
 * Input/output: 16-bit signed little-endian mono.
 *
 * Each output sample is the rounded mean of three consecutive input samples.
 * Averaging before dropping samples acts as a simple low-pass filter, which
 * reduces aliasing compared with keeping only every third sample.
 *
 * A trailing odd byte and any trailing group of fewer than three samples are
 * ignored. An empty input produces an empty output.
 *
 * @param input Raw 24kHz PCM bytes.
 * @returns A new buffer holding `floor(sampleCount / 3)` 16-bit samples.
 */
function resample24kTo8k(input: Buffer): Buffer {
  const BYTES_PER_SAMPLE = 2;
  const DECIMATION = 3;

  // Floor so that a stray trailing byte never counts as a sample.
  const inputSamples = Math.floor(input.length / BYTES_PER_SAMPLE);
  const outputSamples = Math.floor(inputSamples / DECIMATION);
  const output = Buffer.alloc(outputSamples * BYTES_PER_SAMPLE);

  for (let i = 0; i < outputSamples; i++) {
    const srcIdx = i * DECIMATION * BYTES_PER_SAMPLE;

    // Sum the three input samples that map onto this output sample.
    let sum = 0;
    for (let k = 0; k < DECIMATION; k++) {
      sum += input.readInt16LE(srcIdx + k * BYTES_PER_SAMPLE);
    }

    const sample = Math.round(sum / DECIMATION);
    output.writeInt16LE(clamp16(sample), i * BYTES_PER_SAMPLE);
  }

  return output;
}

/**
 * Clamp value to 16-bit signed integer range.
 *
 * @param value Any number.
 * @returns `value` limited to the inclusive range [-32768, 32767].
 */
function clamp16(value: number): number {
  return Math.max(-32768, Math.min(32767, value));
}
183
+
184
/**
 * Convert 16-bit PCM to 8-bit mu-law.
 * Standard G.711 mu-law encoding for telephony.
 *
 * Each 16-bit signed little-endian sample becomes one output byte. If the
 * input has an odd number of bytes, the final stray byte is ignored rather
 * than causing an out-of-range read.
 *
 * @param pcm 16-bit signed little-endian PCM bytes.
 * @returns A new buffer with one mu-law byte per input sample.
 */
function pcmToMulaw(pcm: Buffer): Buffer {
  // Floor so a trailing half-sample is not counted.
  const samples = Math.floor(pcm.length / 2);
  const mulaw = Buffer.alloc(samples);

  for (let i = 0; i < samples; i++) {
    mulaw[i] = linearToMulaw(pcm.readInt16LE(i * 2));
  }

  return mulaw;
}
199
+
200
/**
 * Encode one 16-bit linear PCM sample as an 8-bit mu-law byte
 * (ITU-T G.711 mu-law).
 *
 * @param sample Linear sample value.
 * @returns The mu-law byte (0-255), with all bits inverted as is customary
 *   for transmission.
 */
function linearToMulaw(sample: number): number {
  const BIAS = 132;
  const CLIP = 32635;

  // Split into a sign flag and a non-negative magnitude.
  const signBit = sample < 0 ? 0x80 : 0;
  let magnitude = sample < 0 ? -sample : sample;

  // Limit the magnitude so adding the bias stays within 15 bits.
  if (magnitude > CLIP) {
    magnitude = CLIP;
  }
  magnitude += BIAS;

  // Walk down from bit 14 until a set bit is found; that bit's position
  // selects the segment (exponent). Exponent 0 is the floor.
  let exponent = 7;
  let probe = 0x4000;
  while (exponent > 0 && (magnitude & probe) === 0) {
    exponent -= 1;
    probe >>= 1;
  }

  // The four bits just below the segment bit form the mantissa.
  const mantissa = (magnitude >> (exponent + 3)) & 0x0f;

  // Pack sign/exponent/mantissa and invert every bit.
  const packed = signBit | (exponent << 4) | mantissa;
  return ~packed & 0xff;
}
232
+
233
/**
 * Decode one 8-bit mu-law byte into a 16-bit linear PCM sample.
 * Useful for decoding incoming audio.
 *
 * @param mulaw The mu-law byte; only the low 8 bits are used.
 * @returns The decoded linear sample.
 */
export function mulawToLinear(mulaw: number): number {
  const BIAS = 132;

  // The byte is stored with all bits inverted; undo that first.
  const bits = ~mulaw & 0xff;

  const negative = (bits & 0x80) !== 0;
  const exponent = (bits >> 4) & 0x07;
  const mantissa = bits & 0x0f;

  // Rebuild the biased magnitude for this segment, then remove the bias.
  const magnitude = (((mantissa << 3) + BIAS) << exponent) - BIAS;

  return negative ? -magnitude : magnitude;
}
250
+
251
/**
 * Chunk audio buffer into fixed-size frames for streaming.
 * At 8kHz mono, 20ms = 160 samples = 160 bytes (mu-law), which is the default.
 *
 * Chunks are views onto `audio` (created with `subarray`), not copies. The
 * final chunk may be shorter than `chunkSize`. An empty buffer yields nothing.
 *
 * @param audio Buffer to split.
 * @param chunkSize Bytes per chunk; must be greater than zero.
 * @returns A generator producing the chunks in order.
 * @throws RangeError immediately (before iteration starts) when `chunkSize`
 *   is zero, negative or NaN, since such a size would never advance through
 *   the buffer.
 */
export function chunkAudio(
  audio: Buffer,
  chunkSize = 160,
): Generator<Buffer, void, unknown> {
  // Written as a negated comparison so that NaN is rejected too.
  if (!(chunkSize > 0)) {
    throw new RangeError(
      `chunkSize must be a positive number, got ${chunkSize}`,
    );
  }

  return (function* () {
    for (let offset = 0; offset < audio.length; offset += chunkSize) {
      yield audio.subarray(offset, Math.min(offset + chunkSize, audio.length));
    }
  })();
}