@mastra/voice-openai-realtime 0.0.5-alpha.0 → 0.1.0-alpha.2

This diff compares the contents of two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registry.
package/dist/index.js CHANGED
@@ -1,6 +1,7 @@
+ import { EventEmitter } from 'events';
+ import { PassThrough, Readable } from 'stream';
  import { MastraVoice } from '@mastra/core/voice';
- import { RealtimeClient } from 'openai-realtime-api';
- import { Readable } from 'stream';
+ import { WebSocket } from 'ws';
  import { zodToJsonSchema } from 'zod-to-json-schema';
 
  // src/index.ts
@@ -27,6 +28,7 @@ var transformTools = (tools) => {
  continue;
  }
  const openaiTool = {
+ type: "function",
  name,
  description: tool.description || `Tool: ${name}`,
  parameters
@@ -64,19 +66,17 @@ var isReadableStream = (obj) => {
 
  // src/index.ts
  var DEFAULT_VOICE = "alloy";
+ var DEFAULT_URL = "wss://api.openai.com/v1/realtime";
  var DEFAULT_MODEL = "gpt-4o-mini-realtime-preview-2024-12-17";
- var DEFAULT_VAD_CONFIG = {
- type: "server_vad",
- threshold: 0.5,
- prefix_padding_ms: 1e3,
- silence_duration_ms: 1e3
- };
  var VOICES = ["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"];
  var OpenAIRealtimeVoice = class extends MastraVoice {
- client;
+ ws;
  state;
+ client;
  events;
+ instructions;
  tools;
+ debug;
  /**
  * Creates a new instance of OpenAIRealtimeVoice.
  *
@@ -85,13 +85,8 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  * @param options.chatModel.model - The model ID to use (defaults to GPT-4 Mini Realtime)
  * @param options.chatModel.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
  * @param options.chatModel.tools - Tools configuration for the model
- * @param options.chatModel.options - Additional options for the realtime client
- * @param options.chatModel.options.sessionConfig - Session configuration overrides
- * @param options.chatModel.options.url - Custom WebSocket URL
- * @param options.chatModel.options.dangerouslyAllowAPIKeyInBrowser - Whether to allow API key in browser
- * @param options.chatModel.options.debug - Enable debug logging
- * @param options.chatModel.options.tools - Additional tools configuration
  * @param options.speaker - Voice ID to use (defaults to 'alloy')
+ * @param options.debug - Enable debug mode
  *
  * @example
  * ```typescript
@@ -106,25 +101,26 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  */
  constructor({
  chatModel,
- speaker
+ speaker,
+ debug = false
  } = {}) {
  super();
- this.client = new RealtimeClient({
- apiKey: chatModel?.apiKey || process.env.OPENAI_API_KEY,
- model: chatModel?.model || DEFAULT_MODEL,
- ...chatModel?.options,
- sessionConfig: {
- voice: speaker || DEFAULT_VOICE,
- turn_detection: DEFAULT_VAD_CONFIG,
- ...chatModel?.options?.sessionConfig
+ const url = `${chatModel?.url || DEFAULT_URL}?model=${chatModel?.model || DEFAULT_MODEL}`;
+ const apiKey = chatModel?.apiKey || process.env.OPENAI_API_KEY;
+ this.ws = new WebSocket(url, void 0, {
+ headers: {
+ Authorization: "Bearer " + apiKey,
+ "OpenAI-Beta": "realtime=v1"
  }
  });
+ this.client = new EventEmitter();
  this.state = "close";
  this.events = {};
+ this.tools = chatModel?.tools;
+ this.instructions = chatModel?.instructions;
+ this.speaker = speaker || DEFAULT_VOICE;
+ this.debug = debug;
  this.setupEventListeners();
- if (chatModel?.tools) {
- this.addTools(chatModel.tools);
- }
  }
  /**
  * Returns a list of available voice speakers.
@@ -150,8 +146,8 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  * ```
  */
  close() {
- if (!this.client) return;
- this.client.disconnect();
+ if (!this.ws) return;
+ this.ws.close();
  this.state = "close";
  }
  /**
@@ -171,10 +167,10 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  * ```
  */
  addTools(tools) {
- const transformedTools = transformTools(tools);
- for (const tool of transformedTools) {
- this.client.addTool(tool.openaiTool, tool.execute);
- }
+ const openaiTools = transformTools(tools);
+ this.updateConfig({
+ tools: openaiTools.map((t) => t.openaiTool)
+ });
  }
  /**
  * Emits a speaking event using the configured voice model.
@@ -210,7 +206,7 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  if (input.trim().length === 0) {
  throw new Error("Input text is empty");
  }
- this.client.realtime.send("response.create", {
+ this.sendEvent("response.create", {
  response: {
  instructions: `Repeat the following text: ${input}`,
  voice: options?.speaker ? options.speaker : void 0
@@ -236,7 +232,7 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  * ```
  */
  updateConfig(sessionConfig) {
- this.client.updateSession(sessionConfig);
+ this.sendEvent("session.update", { session: sessionConfig });
  }
  /**
  * Processes audio input for speech recognition.
@@ -271,14 +267,14 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  const buffer = Buffer.concat(chunks);
  const int16Array = new Int16Array(buffer.buffer, buffer.byteOffset ?? 0, (buffer.byteLength ?? 0) / 2);
  const base64Audio = this.int16ArrayToBase64(int16Array);
- this.client.realtime.send("conversation.item.create", {
+ this.sendEvent("conversation.item.create", {
  item: {
  type: "message",
  role: "user",
  content: [{ type: "input_audio", audio: base64Audio }]
  }
  });
- this.client.realtime.send("response.create", {
+ this.sendEvent("response.create", {
  response: {
  modalities: ["text"],
  instructions: `ONLY repeat the input and DO NOT say anything else`
@@ -288,6 +284,16 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  this.emit("error", new Error("Unsupported audio data format"));
  }
  }
+ waitForOpen() {
+ return new Promise((resolve) => {
+ this.ws.on("open", resolve);
+ });
+ }
+ waitForSessionCreated() {
+ return new Promise((resolve) => {
+ this.client.on("session.created", resolve);
+ });
+ }
  /**
  * Establishes a connection to the OpenAI realtime service.
  * Must be called before using speak, listen, or relay functions.
@@ -301,8 +307,17 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  * ```
  */
  async connect() {
- await this.client.connect();
- await this.client.waitForSessionCreated();
+ await this.waitForOpen();
+ await this.waitForSessionCreated();
+ const openaiTools = transformTools(this.tools);
+ this.updateConfig({
+ instructions: this.instructions,
+ tools: openaiTools.map((t) => t.openaiTool),
+ input_audio_transcription: {
+ model: "whisper-1"
+ },
+ voice: this.speaker
+ });
  this.state = "open";
  }
  /**
@@ -323,7 +338,7 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  * await voice.relay(micStream);
  * ```
  */
- async send(audioData) {
+ async send(audioData, eventId) {
  if (!this.state || this.state !== "open") {
  console.warn("Cannot relay audio when not open. Call open() first.");
  return;
@@ -333,15 +348,14 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  stream.on("data", (chunk) => {
  try {
  const buffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
- const int16Array = new Int16Array(buffer.buffer, buffer.byteOffset, buffer.byteLength / 2);
- this.client.appendInputAudio(int16Array);
+ this.sendEvent("input_audio_buffer.append", { audio: buffer.toString("base64"), event_id: eventId });
  } catch (err) {
  this.emit("error", err);
  }
  });
  } else if (audioData instanceof Int16Array) {
  try {
- this.client.appendInputAudio(audioData);
+ this.sendEvent("input_audio_buffer.append", { audio: audioData, event_id: eventId });
  } catch (err) {
  this.emit("error", err);
  }
@@ -368,7 +382,7 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  * });
  */
  async answer({ options }) {
- this.client.realtime.send("response.create", { response: options ?? {} });
+ this.sendEvent("response.create", { response: options ?? {} });
  }
  /**
  * Registers an event listener for voice events.
@@ -437,30 +451,106 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  }
  }
  setupEventListeners() {
- this.client.on("error", (error) => {
- this.emit("error", error);
+ const speakerStreams = /* @__PURE__ */ new Map();
+ this.ws.on("message", (message) => {
+ const data = JSON.parse(message.toString());
+ this.client.emit(data.type, data);
+ if (this.debug) {
+ const { delta, ...fields } = data;
+ console.log(data.type, fields, delta?.length < 100 ? delta : "");
+ }
  });
- this.client.on("conversation.created", (conversation) => {
- this.emit("openAIRealtime:conversation.created", conversation);
+ this.client.on("session.created", (ev) => {
+ this.emit("session.created", ev);
  });
- this.client.on("conversation.interrupted", () => {
- this.emit("openAIRealtime:conversation.interrupted");
+ this.client.on("session.updated", (ev) => {
+ this.emit("session.updated", ev);
  });
- this.client.on("conversation.updated", ({ delta }) => {
- if (delta?.audio) {
- this.emit("speaking", { audio: delta.audio });
- }
+ this.client.on("response.created", (ev) => {
+ this.emit("response.created", ev);
+ const speakerStream = new PassThrough();
+ speakerStream.id = ev.response.id;
+ speakerStreams.set(ev.response.id, speakerStream);
+ this.emit("speaker", speakerStream);
  });
- this.client.on("conversation.item.appended", (item) => {
- this.emit("openAIRealtime:conversation.item.appended", item);
+ this.client.on("conversation.item.input_audio_transcription.delta", (ev) => {
+ this.emit("transcribing", { text: ev.delta, response_id: ev.response_id, role: "user" });
  });
- this.client.on("conversation.item.completed", ({ item, delta }) => {
- if (item.formatted.transcript) {
- this.emit("writing", { text: item.formatted.transcript, role: item.role });
- }
- this.emit("openAIRealtime:conversation.item.completed", { item, delta });
+ this.client.on("conversation.item.input_audio_transcription.done", (ev) => {
+ this.emit("transcribing", { text: "\n", response_id: ev.response_id, role: "user" });
+ });
+ this.client.on("response.audio.delta", (ev) => {
+ const audio = Buffer.from(ev.delta, "base64");
+ this.emit("speaking", { audio, response_id: ev.response_id });
+ const stream = speakerStreams.get(ev.response_id);
+ stream?.write(audio);
+ });
+ this.client.on("response.audio.done", (ev) => {
+ this.emit("speaking.done", { response_id: ev.response_id });
+ const stream = speakerStreams.get(ev.response_id);
+ stream?.end();
+ });
+ this.client.on("response.audio_transcript.delta", (ev) => {
+ this.emit("writing", { text: ev.delta, response_id: ev.response_id });
+ });
+ this.client.on("response.audio_transcript.done", (ev) => {
+ this.emit("writing", { text: "\n", response_id: ev.response_id });
+ });
+ this.client.on("response.text.delta", (ev) => {
+ this.emit("writing", { text: ev.delta, response_id: ev.response_id });
+ });
+ this.client.on("response.text.done", (ev) => {
+ this.emit("writing", { text: "\n", response_id: ev.response_id });
+ });
+ this.client.on("response.done", async (ev) => {
+ await this.handleFunctionCalls(ev);
+ this.emit("response.done", ev);
+ speakerStreams.delete(ev.response.id);
  });
  }
+ async handleFunctionCalls(ev) {
+ for (const output of ev.response?.output ?? []) {
+ if (output.type === "function_call") {
+ await this.handleFunctionCall(output);
+ }
+ }
+ }
+ async handleFunctionCall(output) {
+ try {
+ const context = JSON.parse(output.arguments);
+ const tool = this.tools?.[output.name];
+ if (!tool) {
+ console.warn(`Tool "${output.name}" not found`);
+ return;
+ }
+ const result = await tool?.execute?.(
+ { context },
+ {
+ toolCallId: "unknown",
+ messages: []
+ }
+ );
+ this.sendEvent("conversation.item.create", {
+ item: {
+ type: "function_call_output",
+ call_id: output.call_id,
+ output: JSON.stringify(result)
+ }
+ });
+ } catch (e) {
+ const err = e;
+ console.warn(`Error calling tool "${output.name}":`, err.message);
+ this.sendEvent("conversation.item.create", {
+ item: {
+ type: "function_call_output",
+ call_id: output.call_id,
+ output: JSON.stringify({ error: err.message })
+ }
+ });
+ } finally {
+ this.sendEvent("response.create", {});
+ }
+ }
  int16ArrayToBase64(int16Array) {
  const buffer = new ArrayBuffer(int16Array.length * 2);
  const view = new DataView(buffer);
@@ -474,6 +564,14 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
  }
  return btoa(binary);
  }
+ sendEvent(type, data) {
+ this.ws.send(
+ JSON.stringify({
+ type,
+ ...data
+ })
+ );
+ }
  };
 
  export { OpenAIRealtimeVoice };
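
For orientation, here is a minimal usage sketch of the class as it stands after this diff. The constructor options (`chatModel`, `speaker`, `debug`), the `connect()` handshake, and the `speaker`/`writing` events are taken from the code above; the audio-sink wiring is illustrative only.

```typescript
// Sketch of the post-diff API surface; not part of the package diff itself.
import { OpenAIRealtimeVoice } from '@mastra/voice-openai-realtime';

const voice = new OpenAIRealtimeVoice({
  chatModel: { apiKey: process.env.OPENAI_API_KEY },
  speaker: 'alloy',
  debug: true, // new option in this release: logs each raw realtime event
});

// connect() now resolves once the WebSocket opens and "session.created"
// arrives, then pushes instructions/tools/voice via a "session.update".
await voice.connect();

// One PassThrough stream is emitted per response; pipe it to an audio sink.
voice.on('speaker', (stream) => stream.pipe(process.stdout /* placeholder sink */));
voice.on('writing', ({ text }) => process.stdout.write(text));

await voice.speak('Hello from the realtime API');
```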
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@mastra/voice-openai-realtime",
- "version": "0.0.5-alpha.0",
+ "version": "0.1.0-alpha.2",
  "description": "Mastra OpenAI Realtime API integration",
  "type": "module",
  "main": "dist/index.js",
@@ -20,16 +20,18 @@
  },
  "dependencies": {
  "openai-realtime-api": "^1.0.7",
+ "ws": "^8.18.1",
  "zod-to-json-schema": "^3.24.1",
- "@mastra/core": "^0.6.5-alpha.0"
+ "@mastra/core": "^0.7.0-alpha.2"
  },
  "devDependencies": {
  "@microsoft/api-extractor": "^7.49.2",
  "@types/node": "^22.13.1",
- "eslint": "^9.20.1",
+ "@types/ws": "^8.18.0",
+ "eslint": "^9.23.0",
  "tsup": "^8.3.6",
  "typescript": "^5.7.3",
- "vitest": "^2.1.8",
+ "vitest": "^2.1.9",
  "@internal/lint": "0.0.1"
  },
  "scripts": {