@mastra/voice-openai-realtime 0.0.1-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs ADDED
@@ -0,0 +1,481 @@
+ 'use strict';
+
+ var voice = require('@mastra/core/voice');
+ var openaiRealtimeApi = require('openai-realtime-api');
+ var stream = require('stream');
+ var zodToJsonSchema = require('zod-to-json-schema');
+
+ // src/index.ts
+ var transformTools = (tools) => {
+   const openaiTools = [];
+   for (const [name, tool] of Object.entries(tools || {})) {
+     let parameters;
+     if ("inputSchema" in tool && tool.inputSchema) {
+       if (typeof tool.inputSchema === "object" && tool.inputSchema._def && tool.inputSchema._def.typeName === "ZodObject") {
+         parameters = zodToJsonSchema.zodToJsonSchema(tool.inputSchema);
+         delete parameters.$schema;
+       } else {
+         parameters = tool.inputSchema;
+       }
+     } else if ("parameters" in tool) {
+       if (typeof tool.parameters === "object" && tool.parameters._def && tool.parameters._def.typeName === "ZodObject") {
+         parameters = zodToJsonSchema.zodToJsonSchema(tool.parameters);
+         delete parameters.$schema;
+       } else {
+         parameters = tool.parameters;
+       }
+     } else {
+       console.warn(`Tool ${name} has neither inputSchema nor parameters, skipping`);
+       continue;
+     }
+     const openaiTool = {
+       name,
+       description: tool.description || `Tool: ${name}`,
+       parameters
+     };
+     if (tool.execute) {
+       const executeAdapter = async (args) => {
+         try {
+           if (!tool.execute) {
+             throw new Error(`Tool ${name} has no execute function`);
+           }
+           if ("inputSchema" in tool) {
+             return await tool.execute({ context: args });
+           } else {
+             const options = {
+               toolCallId: "unknown",
+               messages: []
+             };
+             return await tool.execute(args, options);
+           }
+         } catch (error) {
+           console.error(`Error executing tool ${name}:`, error);
+           throw error;
+         }
+       };
+       openaiTools.push({ openaiTool, execute: executeAdapter });
+     } else {
+       console.warn(`Tool ${name} has no execute function, skipping`);
+     }
+   }
+   return openaiTools;
+ };
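Editor's note: a minimal sketch (not part of the published file) of what transformTools accepts and returns. The `weather` tool and its fields are hypothetical; it uses the Zod `parameters` + `execute(args, options)` shape, one of the two shapes the loop above handles:

const { z } = require('zod');

// Hypothetical tool: Zod `parameters` schema plus `execute`.
const weather = {
  description: 'Get the current temperature for a city',
  parameters: z.object({ city: z.string() }),
  execute: async ({ city }) => ({ city, tempF: 72 }),
};

const [entry] = transformTools({ weather });
// entry.openaiTool   => { name: 'weather', description: '...', parameters: <JSON Schema, $schema removed> }
// entry.execute(args) => forwards to weather.execute(args, { toolCallId: 'unknown', messages: [] })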
+ var isReadableStream = (obj) => {
+   return obj && obj instanceof stream.Readable && typeof obj.read === "function" && typeof obj.pipe === "function" && obj.readable === true;
+ };
+
+ // src/index.ts
+ var DEFAULT_VOICE = "alloy";
+ var DEFAULT_MODEL = "gpt-4o-mini-realtime-preview-2024-12-17";
+ var DEFAULT_VAD_CONFIG = {
+   type: "server_vad",
+   threshold: 0.5,
+   prefix_padding_ms: 1e3,
+   silence_duration_ms: 1e3
+ };
+ var VOICES = ["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"];
+ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
+   client;
+   state;
+   events;
+   tools;
+   /**
+    * Creates a new instance of OpenAIRealtimeVoice.
+    *
+    * @param options - Configuration options for the voice instance
+    * @param options.chatModel - Configuration for the chat model
+    * @param options.chatModel.model - The model ID to use (defaults to the GPT-4o mini realtime preview model)
+    * @param options.chatModel.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
+    * @param options.chatModel.tools - Tools configuration for the model
+    * @param options.chatModel.options - Additional options for the realtime client
+    * @param options.chatModel.options.sessionConfig - Session configuration overrides
+    * @param options.chatModel.options.url - Custom WebSocket URL
+    * @param options.chatModel.options.dangerouslyAllowAPIKeyInBrowser - Whether to allow the API key in the browser
+    * @param options.chatModel.options.debug - Enable debug logging
+    * @param options.chatModel.options.tools - Additional tools configuration
+    * @param options.speaker - Voice ID to use (defaults to 'alloy')
+    *
+    * @example
+    * ```typescript
+    * const voice = new OpenAIRealtimeVoice({
+    *   chatModel: {
+    *     apiKey: 'your-api-key',
+    *     model: 'gpt-4o-mini-realtime',
+    *   },
+    *   speaker: 'alloy'
+    * });
+    * ```
+    */
+   constructor({
+     chatModel,
+     speaker
+   } = {}) {
+     super();
+     this.client = new openaiRealtimeApi.RealtimeClient({
+       apiKey: chatModel?.apiKey || process.env.OPENAI_API_KEY,
+       model: chatModel?.model || DEFAULT_MODEL,
+       ...chatModel?.options,
+       sessionConfig: {
+         voice: speaker || DEFAULT_VOICE,
+         turn_detection: DEFAULT_VAD_CONFIG,
+         ...chatModel?.options?.sessionConfig
+       }
+     });
+     this.state = "close";
+     this.events = {};
+     this.setupEventListeners();
+     if (chatModel?.tools) {
+       this.addTools(chatModel.tools);
+     }
+   }
+   /**
+    * Returns a list of available voice speakers.
+    *
+    * @returns Promise resolving to an array of voice objects, each containing at least a voiceId
+    *
+    * @example
+    * ```typescript
+    * const speakers = await voice.getSpeakers();
+    * // speakers = [{ voiceId: 'alloy' }, { voiceId: 'echo' }, ...]
+    * ```
+    */
+   getSpeakers() {
+     return Promise.resolve(VOICES.map((v) => ({ voiceId: v })));
+   }
+   /**
+    * Disconnects from the OpenAI realtime session and cleans up resources.
+    * Should be called when you're done with the voice instance.
+    *
+    * @example
+    * ```typescript
+    * voice.close(); // Disconnects and cleans up
+    * ```
+    */
+   close() {
+     if (!this.client) return;
+     this.client.disconnect();
+     this.state = "close";
+   }
+   /**
+    * Equips the voice instance with a set of tools.
+    * Tools allow the model to perform additional actions during conversations.
+    *
+    * @param tools - Tools configuration to add
+    *
+    * @example
+    * ```typescript
+    * const tools = {
+    *   search: {
+    *     description: 'Search the web for a query',
+    *     parameters: z.object({ query: z.string() }),
+    *     execute: async ({ query }) => { ... }
+    *   }
+    * };
+    * voice.addTools(tools);
+    * ```
+    */
+   addTools(tools) {
+     const transformedTools = transformTools(tools);
+     for (const tool of transformedTools) {
+       this.client.addTool(tool.openaiTool, tool.execute);
+     }
+   }
+   /**
+    * Emits a speaking event using the configured voice model.
+    * Can accept either a string or a readable stream as input.
+    *
+    * @param input - The text to convert to speech, or a readable stream containing the text
+    * @param options - Optional configuration for this specific speech request
+    * @param options.speaker - Override the voice to use for this specific request
+    *
+    * @throws {Error} If the input text is empty
+    *
+    * @example
+    * ```typescript
+    * // Simple text to speech
+    * await voice.speak('Hello world');
+    *
+    * // With custom voice
+    * await voice.speak('Hello world', { speaker: 'echo' });
+    *
+    * // Using a stream
+    * const stream = fs.createReadStream('text.txt');
+    * await voice.speak(stream);
+    * ```
+    */
+   async speak(input, options) {
+     if (typeof input !== "string") {
+       const chunks = [];
+       for await (const chunk of input) {
+         chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(String(chunk)));
+       }
+       input = Buffer.concat(chunks).toString("utf-8");
+     }
+     if (input.trim().length === 0) {
+       throw new Error("Input text is empty");
+     }
+     this.client.realtime.send("response.create", {
+       response: {
+         instructions: `Repeat the following text: ${input}`,
+         voice: options?.speaker ? options.speaker : void 0
+       }
+     });
+   }
+   /**
+    * Updates the session configuration for the voice instance.
+    * This can be used to modify voice settings, turn detection, and other parameters.
+    *
+    * @param sessionConfig - New session configuration to apply
+    *
+    * @example
+    * ```typescript
+    * voice.updateConfig({
+    *   voice: 'echo',
+    *   turn_detection: {
+    *     type: 'server_vad',
+    *     threshold: 0.5,
+    *     silence_duration_ms: 1000
+    *   }
+    * });
+    * ```
+    */
+   updateConfig(sessionConfig) {
+     this.client.updateSession(sessionConfig);
+   }
+   /**
+    * Processes audio input for speech recognition.
+    * Takes a readable stream of raw int16 PCM audio data and asks the model to
+    * transcribe it; the transcribed text is delivered through the 'writing' event.
+    * Emits an 'error' event if the audio data format is not supported.
+    *
+    * @param audioData - Readable stream containing the audio data to process
+    *
+    * @example
+    * ```typescript
+    * // Process audio from a file
+    * const audioStream = fs.createReadStream('audio.raw');
+    * await voice.listen(audioStream);
+    * ```
+    */
+   async listen(audioData) {
+     if (isReadableStream(audioData)) {
+       const chunks = [];
+       for await (const chunk of audioData) {
+         const buffer2 = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
+         chunks.push(buffer2);
+       }
+       const buffer = Buffer.concat(chunks);
+       const int16Array = new Int16Array(buffer.buffer, buffer.byteOffset ?? 0, (buffer.byteLength ?? 0) / 2);
+       const base64Audio = this.int16ArrayToBase64(int16Array);
+       this.client.realtime.send("conversation.item.create", {
+         item: {
+           type: "message",
+           role: "user",
+           content: [{ type: "input_audio", audio: base64Audio }]
+         }
+       });
+       this.client.realtime.send("response.create", {
+         response: {
+           modalities: ["text"],
+           instructions: `ONLY repeat the input and DO NOT say anything else`
+         }
+       });
+     } else {
+       this.emit("error", new Error("Unsupported audio data format"));
+     }
+   }
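Editor's note: an illustrative way to feed listen(), not from the package itself. The method expects a Node Readable of raw little-endian int16 PCM bytes, so an Int16Array already in memory can be wrapped like this (`samples` is a placeholder):

const { Readable } = require('stream');

// Placeholder PCM16 samples; in practice these come from a recorder or file.
const samples = new Int16Array([0, 256, -256, 512]);

// Wrap the backing bytes in a Readable; isReadableStream() above accepts it,
// and listen() rebuilds the Int16Array view from the concatenated buffer.
const pcmStream = Readable.from([Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength)]);
await voice.listen(pcmStream);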
+   /**
+    * Establishes a connection to the OpenAI realtime service.
+    * Must be called before using the speak, listen, or send methods.
+    *
+    * @throws {Error} If the connection fails or session creation times out
+    *
+    * @example
+    * ```typescript
+    * await voice.connect();
+    * // Now ready for voice interactions
+    * ```
+    */
+   async connect() {
+     await this.client.connect();
+     await this.client.waitForSessionCreated();
+     this.state = "open";
+   }
+   /**
+    * Streams audio data in real-time to the OpenAI service.
+    * Useful for continuous audio streaming scenarios like live microphone input.
+    * The instance must be connected (call connect() first) before using this method.
+    * Emits an 'error' event if the audio format is not supported.
+    *
+    * @param audioData - Readable stream or Int16Array of audio data to send
+    *
+    * @example
+    * ```typescript
+    * // First connect
+    * await voice.connect();
+    *
+    * // Then stream audio
+    * const micStream = getMicrophoneStream();
+    * await voice.send(micStream);
+    * ```
+    */
+   async send(audioData) {
+     if (!this.state || this.state !== "open") {
+       console.warn("Cannot send audio when not connected. Call connect() first.");
+       return;
+     }
+     if (isReadableStream(audioData)) {
+       const stream = audioData;
+       stream.on("data", (chunk) => {
+         try {
+           const buffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
+           const int16Array = new Int16Array(buffer.buffer, buffer.byteOffset, buffer.byteLength / 2);
+           this.client.appendInputAudio(int16Array);
+         } catch (err) {
+           this.emit("error", err);
+         }
+       });
+     } else if (audioData instanceof Int16Array) {
+       try {
+         this.client.appendInputAudio(audioData);
+       } catch (err) {
+         this.emit("error", err);
+       }
+     } else {
+       this.emit("error", new Error("Unsupported audio data format"));
+     }
+   }
+   /**
+    * Triggers a model response in the realtime session.
+    *
+    * @param {Object} params - The parameters object
+    * @param {Realtime.ResponseConfig} params.options - Configuration options for the response
+    * @returns {Promise<void>} A promise that resolves when the response has been sent
+    *
+    * @example
+    * // Send a simple text response
+    * await realtimeVoice.answer({
+    *   options: {
+    *     content: "Hello, how can I help you today?",
+    *     voice: "alloy"
+    *   }
+    * });
+    */
+   async answer({ options }) {
+     this.client.realtime.send("response.create", { response: options ?? {} });
+   }
+   /**
+    * Registers an event listener for voice events.
+    * Available events: 'speaking', 'writing', 'error'.
+    * OpenAI Realtime events can be listened to by prefixing them with 'openAIRealtime:',
+    * such as 'openAIRealtime:conversation.item.completed', 'openAIRealtime:conversation.updated', etc.
+    *
+    * @param event - Name of the event to listen for
+    * @param callback - Function to call when the event occurs
+    *
+    * @example
+    * ```typescript
+    * // Listen for speech events
+    * voice.on('speaking', ({ audio }: { audio: Int16Array }) => {
+    *   // Handle audio data
+    * });
+    *
+    * // Handle errors
+    * voice.on('error', (error: Error) => {
+    *   console.error('Voice error:', error);
+    * });
+    * ```
+    */
+   on(event, callback) {
+     if (!this.events[event]) {
+       this.events[event] = [];
+     }
+     this.events[event].push(callback);
+   }
+   /**
+    * Removes a previously registered event listener.
+    *
+    * @param event - Name of the event to stop listening to
+    * @param callback - The specific callback function to remove
+    *
+    * @example
+    * ```typescript
+    * // Create event handler
+    * const handleSpeech = ({ audio }: { audio: Int16Array }) => {
+    *   // Handle audio data
+    * };
+    *
+    * // Add listener
+    * voice.on('speaking', handleSpeech);
+    *
+    * // Later, remove the listener
+    * voice.off('speaking', handleSpeech);
+    * ```
+    */
+   off(event, callback) {
+     if (!this.events[event]) return;
+     const index = this.events[event].indexOf(callback);
+     if (index !== -1) {
+       this.events[event].splice(index, 1);
+     }
+   }
+   /**
+    * Emit an event with arguments
+    * @param event Event name
+    * @param args Arguments to pass to the callbacks
+    */
+   emit(event, ...args) {
+     if (!this.events[event]) return;
+     for (const callback of this.events[event]) {
+       callback(...args);
+     }
+   }
+   setupEventListeners() {
+     this.client.on("error", (error) => {
+       this.emit("error", error);
+     });
+     this.client.on("conversation.created", (conversation) => {
+       this.emit("openAIRealtime:conversation.created", conversation);
+     });
+     this.client.on("conversation.interrupted", () => {
+       this.emit("openAIRealtime:conversation.interrupted");
+     });
+     this.client.on("conversation.updated", ({ delta }) => {
+       if (delta?.audio) {
+         this.emit("speaking", { audio: delta.audio });
+       }
+     });
+     this.client.on("conversation.item.appended", (item) => {
+       this.emit("openAIRealtime:conversation.item.appended", item);
+     });
+     this.client.on("conversation.item.completed", ({ item, delta }) => {
+       if (item.formatted.transcript) {
+         this.emit("writing", { text: item.formatted.transcript, role: item.role });
+       }
+       this.emit("openAIRealtime:conversation.item.completed", { item, delta });
+     });
+   }
+   int16ArrayToBase64(int16Array) {
+     const buffer = new ArrayBuffer(int16Array.length * 2);
+     const view = new DataView(buffer);
+     for (let i = 0; i < int16Array.length; i++) {
+       view.setInt16(i * 2, int16Array[i], true);
+     }
+     const uint8Array = new Uint8Array(buffer);
+     let binary = "";
+     for (let i = 0; i < uint8Array.length; i++) {
+       binary += String.fromCharCode(uint8Array[i]);
+     }
+     return btoa(binary);
+   }
+ };
+
+ exports.OpenAIRealtimeVoice = OpenAIRealtimeVoice;
@@ -0,0 +1 @@
+ export { OpenAIRealtimeVoice } from './_tsup-dts-rollup.cjs';
@@ -0,0 +1 @@
+ export { OpenAIRealtimeVoice } from './_tsup-dts-rollup.js';
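Editor's note: for orientation, a hedged end-to-end sketch of using this build, based only on the API surface in the diff above. `getMicrophoneStream` is a placeholder, and a valid OPENAI_API_KEY is assumed to be set in the environment:

const { OpenAIRealtimeVoice } = require('@mastra/voice-openai-realtime');

async function main() {
  const voice = new OpenAIRealtimeVoice({ speaker: 'alloy' });

  // 'speaking' delivers { audio } deltas; 'writing' delivers { text, role } transcripts.
  voice.on('speaking', ({ audio }) => { /* queue Int16Array audio for playback */ });
  voice.on('writing', ({ text, role }) => console.log(`${role}: ${text}`));
  voice.on('error', (err) => console.error('Voice error:', err));

  await voice.connect();            // opens the realtime session
  await voice.speak('Hello world'); // asks the model to say the text

  // Placeholder helper: any Readable of raw int16 PCM works here.
  // await voice.send(getMicrophoneStream());

  voice.close();
}

main().catch(console.error);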