@mastra/voice-openai-realtime 0.0.5-alpha.0 → 0.1.0-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,7 +1,8 @@
  import { MastraVoice } from '@mastra/core/voice';
- import { RealtimeClient } from 'openai-realtime-api';
- import { Readable } from 'stream';
+ import { PassThrough, Readable } from 'stream';
  import { zodToJsonSchema } from 'zod-to-json-schema';
+ import { WebSocket } from 'ws';
+ import { EventEmitter } from 'events';
 
  // src/index.ts
  var transformTools = (tools) => {
@@ -27,6 +28,7 @@ var transformTools = (tools) => {
  continue;
  }
  const openaiTool = {
+ type: "function",
  name,
  description: tool.description || `Tool: ${name}`,
  parameters
@@ -61,22 +63,18 @@ var transformTools = (tools) => {
  var isReadableStream = (obj) => {
  return obj && obj instanceof Readable && typeof obj.read === "function" && typeof obj.pipe === "function" && obj.readable === true;
  };
-
- // src/index.ts
  var DEFAULT_VOICE = "alloy";
+ var DEFAULT_URL = "wss://api.openai.com/v1/realtime";
  var DEFAULT_MODEL = "gpt-4o-mini-realtime-preview-2024-12-17";
- var DEFAULT_VAD_CONFIG = {
- type: "server_vad",
- threshold: 0.5,
- prefix_padding_ms: 1e3,
- silence_duration_ms: 1e3
- };
  var VOICES = ["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"];
  var OpenAIRealtimeVoice = class extends MastraVoice {
- client;
+ ws;
  state;
+ client;
  events;
+ instructions;
  tools;
+ debug;
  /**
  * Creates a new instance of OpenAIRealtimeVoice.
  *
@@ -85,13 +83,8 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  * @param options.chatModel.model - The model ID to use (defaults to GPT-4 Mini Realtime)
  * @param options.chatModel.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
  * @param options.chatModel.tools - Tools configuration for the model
- * @param options.chatModel.options - Additional options for the realtime client
- * @param options.chatModel.options.sessionConfig - Session configuration overrides
- * @param options.chatModel.options.url - Custom WebSocket URL
- * @param options.chatModel.options.dangerouslyAllowAPIKeyInBrowser - Whether to allow API key in browser
- * @param options.chatModel.options.debug - Enable debug logging
- * @param options.chatModel.options.tools - Additional tools configuration
  * @param options.speaker - Voice ID to use (defaults to 'alloy')
+ * @param options.debug - Enable debug mode
  *
  * @example
  * ```typescript
@@ -106,25 +99,26 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  */
  constructor({
  chatModel,
- speaker
+ speaker,
+ debug = false
  } = {}) {
  super();
- this.client = new RealtimeClient({
- apiKey: chatModel?.apiKey || process.env.OPENAI_API_KEY,
- model: chatModel?.model || DEFAULT_MODEL,
- ...chatModel?.options,
- sessionConfig: {
- voice: speaker || DEFAULT_VOICE,
- turn_detection: DEFAULT_VAD_CONFIG,
- ...chatModel?.options?.sessionConfig
+ const url = `${chatModel?.url || DEFAULT_URL}?model=${chatModel?.model || DEFAULT_MODEL}`;
+ const apiKey = chatModel?.apiKey || process.env.OPENAI_API_KEY;
+ this.ws = new WebSocket(url, void 0, {
+ headers: {
+ Authorization: "Bearer " + apiKey,
+ "OpenAI-Beta": "realtime=v1"
  }
  });
+ this.client = new EventEmitter();
  this.state = "close";
  this.events = {};
+ this.tools = chatModel?.tools;
+ this.instructions = chatModel?.instructions;
+ this.speaker = speaker || DEFAULT_VOICE;
+ this.debug = debug;
  this.setupEventListeners();
- if (chatModel?.tools) {
- this.addTools(chatModel.tools);
- }
  }
  /**
  * Returns a list of available voice speakers.
@@ -150,8 +144,8 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  * ```
  */
  close() {
- if (!this.client) return;
- this.client.disconnect();
+ if (!this.ws) return;
+ this.ws.close();
  this.state = "close";
  }
  /**
@@ -171,10 +165,10 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  * ```
  */
  addTools(tools) {
- const transformedTools = transformTools(tools);
- for (const tool of transformedTools) {
- this.client.addTool(tool.openaiTool, tool.execute);
- }
+ const openaiTools = transformTools(tools);
+ this.updateConfig({
+ tools: openaiTools.map((t) => t.openaiTool)
+ });
  }
  /**
  * Emits a speaking event using the configured voice model.
@@ -210,7 +204,7 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  if (input.trim().length === 0) {
  throw new Error("Input text is empty");
  }
- this.client.realtime.send("response.create", {
+ this.sendEvent("response.create", {
  response: {
  instructions: `Repeat the following text: ${input}`,
  voice: options?.speaker ? options.speaker : void 0
@@ -236,7 +230,7 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  * ```
  */
  updateConfig(sessionConfig) {
- this.client.updateSession(sessionConfig);
+ this.sendEvent("session.update", { session: sessionConfig });
  }
  /**
  * Processes audio input for speech recognition.
@@ -271,14 +265,14 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  const buffer = Buffer.concat(chunks);
  const int16Array = new Int16Array(buffer.buffer, buffer.byteOffset ?? 0, (buffer.byteLength ?? 0) / 2);
  const base64Audio = this.int16ArrayToBase64(int16Array);
- this.client.realtime.send("conversation.item.create", {
+ this.sendEvent("conversation.item.create", {
  item: {
  type: "message",
  role: "user",
  content: [{ type: "input_audio", audio: base64Audio }]
  }
  });
- this.client.realtime.send("response.create", {
+ this.sendEvent("response.create", {
  response: {
  modalities: ["text"],
  instructions: `ONLY repeat the input and DO NOT say anything else`
@@ -288,6 +282,16 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  this.emit("error", new Error("Unsupported audio data format"));
  }
  }
+ waitForOpen() {
+ return new Promise((resolve) => {
+ this.ws.on("open", resolve);
+ });
+ }
+ waitForSessionCreated() {
+ return new Promise((resolve) => {
+ this.client.on("session.created", resolve);
+ });
+ }
  /**
  * Establishes a connection to the OpenAI realtime service.
  * Must be called before using speak, listen, or relay functions.
@@ -301,8 +305,17 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  * ```
  */
  async connect() {
- await this.client.connect();
- await this.client.waitForSessionCreated();
+ await this.waitForOpen();
+ await this.waitForSessionCreated();
+ const openaiTools = transformTools(this.tools);
+ this.updateConfig({
+ instructions: this.instructions,
+ tools: openaiTools.map((t) => t.openaiTool),
+ input_audio_transcription: {
+ model: "whisper-1"
+ },
+ voice: this.speaker
+ });
  this.state = "open";
  }
  /**
@@ -323,7 +336,7 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  * await voice.relay(micStream);
  * ```
  */
- async send(audioData) {
+ async send(audioData, eventId) {
  if (!this.state || this.state !== "open") {
  console.warn("Cannot relay audio when not open. Call open() first.");
  return;
@@ -333,15 +346,14 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  stream.on("data", (chunk) => {
  try {
  const buffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
- const int16Array = new Int16Array(buffer.buffer, buffer.byteOffset, buffer.byteLength / 2);
- this.client.appendInputAudio(int16Array);
+ this.sendEvent("input_audio_buffer.append", { audio: buffer.toString("base64"), event_id: eventId });
  } catch (err) {
  this.emit("error", err);
  }
  });
  } else if (audioData instanceof Int16Array) {
  try {
- this.client.appendInputAudio(audioData);
+ this.sendEvent("input_audio_buffer.append", { audio: audioData, event_id: eventId });
  } catch (err) {
  this.emit("error", err);
  }
@@ -368,7 +380,7 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  * });
  */
  async answer({ options }) {
- this.client.realtime.send("response.create", { response: options ?? {} });
+ this.sendEvent("response.create", { response: options ?? {} });
  }
  /**
  * Registers an event listener for voice events.
@@ -437,29 +449,105 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  }
  }
  setupEventListeners() {
- this.client.on("error", (error) => {
- this.emit("error", error);
+ const speakerStreams = /* @__PURE__ */ new Map();
+ this.ws.on("message", (message) => {
+ const data = JSON.parse(message.toString());
+ this.client.emit(data.type, data);
+ if (this.debug) {
+ const { delta, ...fields } = data;
+ console.log(data.type, fields, delta?.length < 100 ? delta : "");
+ }
  });
- this.client.on("conversation.created", (conversation) => {
- this.emit("openAIRealtime:conversation.created", conversation);
+ this.client.on("session.created", (ev) => {
+ this.emit("session.created", ev);
  });
- this.client.on("conversation.interrupted", () => {
- this.emit("openAIRealtime:conversation.interrupted");
+ this.client.on("session.updated", (ev) => {
+ this.emit("session.updated", ev);
  });
- this.client.on("conversation.updated", ({ delta }) => {
- if (delta?.audio) {
- this.emit("speaking", { audio: delta.audio });
- }
+ this.client.on("response.created", (ev) => {
+ this.emit("response.created", ev);
+ const speakerStream = new PassThrough();
+ speakerStream.id = ev.response.id;
+ speakerStreams.set(ev.response.id, speakerStream);
+ this.emit("speaker", speakerStream);
  });
- this.client.on("conversation.item.appended", (item) => {
- this.emit("openAIRealtime:conversation.item.appended", item);
+ this.client.on("conversation.item.input_audio_transcription.delta", (ev) => {
+ this.emit("transcribing", { text: ev.delta, response_id: ev.response_id, role: "user" });
  });
- this.client.on("conversation.item.completed", ({ item, delta }) => {
- if (item.formatted.transcript) {
- this.emit("writing", { text: item.formatted.transcript, role: item.role });
- }
- this.emit("openAIRealtime:conversation.item.completed", { item, delta });
+ this.client.on("conversation.item.input_audio_transcription.done", (ev) => {
+ this.emit("transcribing", { text: "\n", response_id: ev.response_id, role: "user" });
+ });
+ this.client.on("response.audio.delta", (ev) => {
+ const audio = Buffer.from(ev.delta, "base64");
+ this.emit("speaking", { audio, response_id: ev.response_id });
+ const stream = speakerStreams.get(ev.response_id);
+ stream?.write(audio);
+ });
+ this.client.on("response.audio.done", (ev) => {
+ this.emit("speaking.done", { response_id: ev.response_id });
+ const stream = speakerStreams.get(ev.response_id);
+ stream?.end();
+ });
+ this.client.on("response.audio_transcript.delta", (ev) => {
+ this.emit("writing", { text: ev.delta, response_id: ev.response_id });
  });
+ this.client.on("response.audio_transcript.done", (ev) => {
+ this.emit("writing", { text: "\n", response_id: ev.response_id });
+ });
+ this.client.on("response.text.delta", (ev) => {
+ this.emit("writing", { text: ev.delta, response_id: ev.response_id });
+ });
+ this.client.on("response.text.done", (ev) => {
+ this.emit("writing", { text: "\n", response_id: ev.response_id });
+ });
+ this.client.on("response.done", (ev) => {
+ this.handleFunctionCalls(ev);
+ this.emit("response.done", ev);
+ speakerStreams.delete(ev.response.id);
+ });
+ }
+ async handleFunctionCalls(ev) {
+ for (const output of ev.response?.output ?? []) {
+ if (output.type === "function_call") {
+ await this.handleFunctionCall(output);
+ }
+ }
+ }
+ async handleFunctionCall(output) {
+ try {
+ const context = JSON.parse(output.arguments);
+ const tool = this.tools?.[output.name];
+ if (!tool) {
+ console.warn(`Tool "${output.name}" not found`);
+ return;
+ }
+ const result = await tool?.execute?.(
+ { context },
+ {
+ toolCallId: "unknown",
+ messages: []
+ }
+ );
+ this.sendEvent("conversation.item.create", {
+ item: {
+ type: "function_call_output",
+ call_id: output.call_id,
+ output: JSON.stringify(result)
+ }
+ });
+ } catch (e) {
+ const err = e;
+ console.warn(`Error calling tool "${output.name}":`, err.message);
+ this.sendEvent("conversation.item.create", {
+ item: {
+ type: "function_call_output",
+ call_id: output.call_id,
+ output: JSON.stringify({ error: err.message })
+ }
+ });
+ } finally {
+ this.sendEvent("response.create", {});
+ }
  }
  int16ArrayToBase64(int16Array) {
  const buffer = new ArrayBuffer(int16Array.length * 2);
@@ -474,6 +562,14 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  }
  return btoa(binary);
  }
+ sendEvent(type, data) {
+ this.ws.send(
+ JSON.stringify({
+ type,
+ ...data
+ })
+ );
+ }
  };
 
  export { OpenAIRealtimeVoice };
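
The new `dist/index.js` swaps the `openai-realtime-api` client for a raw `ws` WebSocket plus an internal `EventEmitter`, and surfaces each response's audio as a `PassThrough` stream via a `speaker` event. A minimal consumer sketch, using only the constructor options, methods, and events visible in this diff (the output file name is illustrative, and `OPENAI_API_KEY` is assumed to be set in the environment):

```typescript
import { OpenAIRealtimeVoice } from '@mastra/voice-openai-realtime';
import { createWriteStream } from 'fs';

const voice = new OpenAIRealtimeVoice({ speaker: 'alloy', debug: true });

// Each model response now arrives as its own PassThrough stream of audio
// chunks via the 'speaker' event; here we simply append them to a file.
voice.on('speaker', (stream) => {
  stream.pipe(createWriteStream('response.pcm', { flags: 'a' }));
});

// Assistant transcript deltas are re-emitted as 'writing' events.
voice.on('writing', ({ text }) => process.stdout.write(text));

// connect() resolves after the socket opens and 'session.created' fires,
// then pushes a session.update carrying voice, tools, and transcription config.
await voice.connect();
await voice.speak('Hello from the realtime API');
```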
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@mastra/voice-openai-realtime",
- "version": "0.0.5-alpha.0",
+ "version": "0.1.0-alpha.1",
  "description": "Mastra OpenAI Realtime API integration",
  "type": "module",
  "main": "dist/index.js",
@@ -20,16 +20,18 @@
  },
  "dependencies": {
  "openai-realtime-api": "^1.0.7",
+ "ws": "^8.18.1",
  "zod-to-json-schema": "^3.24.1",
- "@mastra/core": "^0.6.5-alpha.0"
+ "@mastra/core": "^0.7.0-alpha.1"
  },
  "devDependencies": {
  "@microsoft/api-extractor": "^7.49.2",
  "@types/node": "^22.13.1",
- "eslint": "^9.20.1",
+ "@types/ws": "^8.18.0",
+ "eslint": "^9.23.0",
  "tsup": "^8.3.6",
  "typescript": "^5.7.3",
- "vitest": "^2.1.8",
+ "vitest": "^2.1.9",
  "@internal/lint": "0.0.1"
  },
  "scripts": {