@mastra/voice-openai-realtime 0.1.0-alpha.1 → 0.1.0-alpha.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,23 +1,23 @@
 
-> @mastra/voice-openai-realtime@0.1.0-alpha.1 build /home/runner/work/mastra/mastra/voice/openai-realtime-api
+> @mastra/voice-openai-realtime@0.1.0-alpha.3 build /home/runner/work/mastra/mastra/voice/openai-realtime-api
 > tsup src/index.ts --format esm,cjs --experimental-dts --clean --treeshake
 
 CLI Building entry: src/index.ts
 CLI Using tsconfig: tsconfig.json
 CLI tsup v8.4.0
 TSC Build start
-TSC ⚡️ Build success in 9409ms
+TSC ⚡️ Build success in 8539ms
 DTS Build start
 CLI Target: es2022
 Analysis will use the bundled TypeScript version 5.8.2
 Writing package typings: /home/runner/work/mastra/mastra/voice/openai-realtime-api/dist/_tsup-dts-rollup.d.ts
 Analysis will use the bundled TypeScript version 5.8.2
 Writing package typings: /home/runner/work/mastra/mastra/voice/openai-realtime-api/dist/_tsup-dts-rollup.d.cts
-DTS ⚡️ Build success in 10803ms
+DTS ⚡️ Build success in 12102ms
 CLI Cleaning output folder
 ESM Build start
 CJS Build start
-CJS dist/index.cjs 17.77 KB
-CJS ⚡️ Build success in 694ms
-ESM dist/index.js 17.72 KB
-ESM ⚡️ Build success in 695ms
+ESM dist/index.js 18.38 KB
+ESM ⚡️ Build success in 718ms
+CJS dist/index.cjs 18.44 KB
+CJS ⚡️ Build success in 718ms
package/CHANGELOG.md CHANGED
@@ -1,5 +1,24 @@
 # @mastra/voice-openai-realtime
 
+## 0.1.0-alpha.3
+
+### Patch Changes
+
+- a4686e8: Realtime event queue
+- Updated dependencies [b3b34f5]
+- Updated dependencies [a4686e8]
+  - @mastra/core@0.7.0-alpha.3
+
+## 0.1.0-alpha.2
+
+### Patch Changes
+
+- Updated dependencies [a838fde]
+- Updated dependencies [a8bd4cf]
+- Updated dependencies [7a3eeb0]
+- Updated dependencies [6530ad1]
+  - @mastra/core@0.7.0-alpha.2
+
 ## 0.1.0-alpha.1
 
 ### Minor Changes
package/dist/_tsup-dts-rollup.d.cts CHANGED
@@ -50,14 +50,15 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
     private instructions?;
     private tools?;
     private debug;
+    private queue;
+    private transcriber;
     /**
      * Creates a new instance of OpenAIRealtimeVoice.
      *
      * @param options - Configuration options for the voice instance
-     * @param options.chatModel - Configuration for the chat model
-     * @param options.chatModel.model - The model ID to use (defaults to GPT-4 Mini Realtime)
-     * @param options.chatModel.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
-     * @param options.chatModel.tools - Tools configuration for the model
+     * @param options.url - The base URL for the OpenAI Realtime API
+     * @param options.model - The model ID to use (defaults to GPT-4 Mini Realtime)
+     * @param options.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
      * @param options.speaker - Voice ID to use (defaults to 'alloy')
      * @param options.debug - Enable debug mode
      *
@@ -72,15 +73,12 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
      * });
      * ```
      */
-    constructor({ chatModel, speaker, debug, }?: {
-        chatModel?: {
-            model?: string;
-            apiKey?: string;
-            tools?: TTools;
-            instructions?: string;
-            url?: string;
-        };
+    constructor(options?: {
+        model?: string;
+        url?: string;
+        apiKey?: string;
         speaker?: Realtime.Voice;
+        transcriber?: Realtime.AudioTranscriptionModel;
         debug?: boolean;
     });
     /**
@@ -108,6 +106,19 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
      * ```
      */
    close(): void;
+    /**
+     * Equips the voice instance with a set of instructions.
+     * Instructions guide the model's behavior during conversations.
+     *
+     * @param instructions - Optional instructions for the model
+     * @returns Nothing; the instructions are stored on the voice instance
+     *
+     * @example
+     * ```typescript
+     * voice.addInstructions('You are a helpful assistant.');
+     * ```
+     */
+    addInstructions(instructions?: string): void;
     /**
      * Equips the voice instance with a set of tools.
      * Tools allow the model to perform additional actions during conversations.
package/dist/_tsup-dts-rollup.d.ts CHANGED
@@ -50,14 +50,15 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
     private instructions?;
     private tools?;
     private debug;
+    private queue;
+    private transcriber;
     /**
      * Creates a new instance of OpenAIRealtimeVoice.
      *
      * @param options - Configuration options for the voice instance
-     * @param options.chatModel - Configuration for the chat model
-     * @param options.chatModel.model - The model ID to use (defaults to GPT-4 Mini Realtime)
-     * @param options.chatModel.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
-     * @param options.chatModel.tools - Tools configuration for the model
+     * @param options.url - The base URL for the OpenAI Realtime API
+     * @param options.model - The model ID to use (defaults to GPT-4 Mini Realtime)
+     * @param options.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
      * @param options.speaker - Voice ID to use (defaults to 'alloy')
      * @param options.debug - Enable debug mode
      *
@@ -72,15 +73,12 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
      * });
      * ```
      */
-    constructor({ chatModel, speaker, debug, }?: {
-        chatModel?: {
-            model?: string;
-            apiKey?: string;
-            tools?: TTools;
-            instructions?: string;
-            url?: string;
-        };
+    constructor(options?: {
+        model?: string;
+        url?: string;
+        apiKey?: string;
         speaker?: Realtime.Voice;
+        transcriber?: Realtime.AudioTranscriptionModel;
         debug?: boolean;
     });
     /**
@@ -108,6 +106,19 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
      * ```
      */
    close(): void;
+    /**
+     * Equips the voice instance with a set of instructions.
+     * Instructions guide the model's behavior during conversations.
+     *
+     * @param instructions - Optional instructions for the model
+     * @returns Nothing; the instructions are stored on the voice instance
+     *
+     * @example
+     * ```typescript
+     * voice.addInstructions('You are a helpful assistant.');
+     * ```
+     */
+    addInstructions(instructions?: string): void;
     /**
      * Equips the voice instance with a set of tools.
      * Tools allow the model to perform additional actions during conversations.
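
Note on the declaration changes above: the nested `chatModel` option from 0.1.0-alpha.1 is flattened away. `model`, `url`, and `apiKey` now sit at the top level of the constructor options alongside the new `transcriber` option, while tools and instructions move to the `addTools` and `addInstructions` methods. A minimal usage sketch of the new shape (the instructions string and tool map here are illustrative placeholders):

```typescript
import { OpenAIRealtimeVoice } from '@mastra/voice-openai-realtime';

// alpha.1 shape (removed): new OpenAIRealtimeVoice({ chatModel: { model, apiKey, tools } })
// alpha.3 shape: flat options, all optional
const voice = new OpenAIRealtimeVoice({
  model: 'gpt-4o-mini-realtime-preview-2024-12-17', // default model
  apiKey: process.env.OPENAI_API_KEY, // default fallback
  speaker: 'alloy', // default voice
  transcriber: 'whisper-1', // new option; default transcription model
});

// Instructions and tools are now attached after construction:
voice.addInstructions('You are a helpful assistant.');
voice.addTools({}); // any ToolsInput map, e.g. an agent's tools
```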
package/dist/index.cjs CHANGED
@@ -1,10 +1,10 @@
 'use strict';
 
-var voice = require('@mastra/core/voice');
+var events = require('events');
 var stream = require('stream');
-var zodToJsonSchema = require('zod-to-json-schema');
+var voice = require('@mastra/core/voice');
 var ws = require('ws');
-var events = require('events');
+var zodToJsonSchema = require('zod-to-json-schema');
 
 // src/index.ts
 var transformTools = (tools) => {
@@ -65,7 +65,10 @@ var transformTools = (tools) => {
 var isReadableStream = (obj) => {
   return obj && obj instanceof stream.Readable && typeof obj.read === "function" && typeof obj.pipe === "function" && obj.readable === true;
 };
+
+// src/index.ts
 var DEFAULT_VOICE = "alloy";
+var DEFAULT_TRANSCRIBER = "whisper-1";
 var DEFAULT_URL = "wss://api.openai.com/v1/realtime";
 var DEFAULT_MODEL = "gpt-4o-mini-realtime-preview-2024-12-17";
 var VOICES = ["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"];
@@ -77,14 +80,15 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
   instructions;
   tools;
   debug;
+  queue = [];
+  transcriber;
   /**
    * Creates a new instance of OpenAIRealtimeVoice.
    *
    * @param options - Configuration options for the voice instance
-   * @param options.chatModel - Configuration for the chat model
-   * @param options.chatModel.model - The model ID to use (defaults to GPT-4 Mini Realtime)
-   * @param options.chatModel.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
-   * @param options.chatModel.tools - Tools configuration for the model
+   * @param options.url - The base URL for the OpenAI Realtime API
+   * @param options.model - The model ID to use (defaults to GPT-4 Mini Realtime)
+   * @param options.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
    * @param options.speaker - Voice ID to use (defaults to 'alloy')
    * @param options.debug - Enable debug mode
    *
@@ -99,14 +103,10 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
    * });
    * ```
    */
-  constructor({
-    chatModel,
-    speaker,
-    debug = false
-  } = {}) {
+  constructor(options = {}) {
     super();
-    const url = `${chatModel?.url || DEFAULT_URL}?model=${chatModel?.model || DEFAULT_MODEL}`;
-    const apiKey = chatModel?.apiKey || process.env.OPENAI_API_KEY;
+    const url = `${options.url || DEFAULT_URL}?model=${options.model || DEFAULT_MODEL}`;
+    const apiKey = options.apiKey || process.env.OPENAI_API_KEY;
     this.ws = new ws.WebSocket(url, void 0, {
       headers: {
         Authorization: "Bearer " + apiKey,
@@ -116,10 +116,9 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
     this.client = new events.EventEmitter();
     this.state = "close";
     this.events = {};
-    this.tools = chatModel?.tools;
-    this.instructions = chatModel?.instructions;
-    this.speaker = speaker || DEFAULT_VOICE;
-    this.debug = debug;
+    this.speaker = options.speaker || DEFAULT_VOICE;
+    this.transcriber = options.transcriber || DEFAULT_TRANSCRIBER;
+    this.debug = options.debug || false;
     this.setupEventListeners();
   }
   /**
@@ -150,6 +149,21 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
     this.ws.close();
     this.state = "close";
   }
+  /**
+   * Equips the voice instance with a set of instructions.
+   * Instructions guide the model's behavior during conversations.
+   *
+   * @param instructions - Optional instructions for the model
+   * @returns Nothing; the instructions are stored on the voice instance
+   *
+   * @example
+   * ```typescript
+   * voice.addInstructions('You are a helpful assistant.');
+   * ```
+   */
+  addInstructions(instructions) {
+    this.instructions = instructions;
+  }
   /**
    * Equips the voice instance with a set of tools.
    * Tools allow the model to perform additional actions during conversations.
@@ -167,10 +181,7 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
    * ```
    */
   addTools(tools) {
-    const openaiTools = transformTools(tools);
-    this.updateConfig({
-      tools: openaiTools.map((t) => t.openaiTool)
-    });
+    this.tools = tools || {};
   }
   /**
    * Emits a speaking event using the configured voice model.
@@ -314,7 +325,7 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
       instructions: this.instructions,
       tools: openaiTools.map((t) => t.openaiTool),
       input_audio_transcription: {
-        model: "whisper-1"
+        model: this.transcriber
       },
       voice: this.speaker
     });
@@ -462,6 +473,10 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
     });
     this.client.on("session.created", (ev) => {
       this.emit("session.created", ev);
+      const queue = this.queue.splice(0, this.queue.length);
+      for (const ev2 of queue) {
+        this.ws.send(JSON.stringify(ev2));
+      }
     });
     this.client.on("session.updated", (ev) => {
       this.emit("session.updated", ev);
@@ -474,10 +489,10 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
       this.emit("speaker", speakerStream);
     });
     this.client.on("conversation.item.input_audio_transcription.delta", (ev) => {
-      this.emit("transcribing", { text: ev.delta, response_id: ev.response_id, role: "user" });
+      this.emit("writing", { text: ev.delta, response_id: ev.response_id, role: "user" });
     });
     this.client.on("conversation.item.input_audio_transcription.done", (ev) => {
-      this.emit("transcribing", { text: "\n", response_id: ev.response_id, role: "user" });
+      this.emit("writing", { text: "\n", response_id: ev.response_id, role: "user" });
     });
     this.client.on("response.audio.delta", (ev) => {
       const audio = Buffer.from(ev.delta, "base64");
@@ -491,19 +506,19 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
       stream?.end();
     });
     this.client.on("response.audio_transcript.delta", (ev) => {
-      this.emit("writing", { text: ev.delta, response_id: ev.response_id });
+      this.emit("writing", { text: ev.delta, response_id: ev.response_id, role: "assistant" });
     });
     this.client.on("response.audio_transcript.done", (ev) => {
-      this.emit("writing", { text: "\n", response_id: ev.response_id });
+      this.emit("writing", { text: "\n", response_id: ev.response_id, role: "assistant" });
     });
     this.client.on("response.text.delta", (ev) => {
-      this.emit("writing", { text: ev.delta, response_id: ev.response_id });
+      this.emit("writing", { text: ev.delta, response_id: ev.response_id, role: "assistant" });
     });
     this.client.on("response.text.done", (ev) => {
-      this.emit("writing", { text: "\n", response_id: ev.response_id });
+      this.emit("writing", { text: "\n", response_id: ev.response_id, role: "assistant" });
     });
-    this.client.on("response.done", (ev) => {
-      this.handleFunctionCalls(ev);
+    this.client.on("response.done", async (ev) => {
+      await this.handleFunctionCalls(ev);
       this.emit("response.done", ev);
       speakerStreams.delete(ev.response.id);
     });
@@ -565,12 +580,16 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
     return btoa(binary);
   }
   sendEvent(type, data) {
-    this.ws.send(
-      JSON.stringify({
-        type,
-        ...data
-      })
-    );
+    if (this.ws.readyState !== this.ws.OPEN) {
+      this.queue.push({ type, ...data });
+    } else {
+      this.ws.send(
+        JSON.stringify({
+          type,
+          ...data
+        })
+      );
+    }
   }
 };
 
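Note: both bundles also replace the `transcribing` event with role-tagged `writing` events. User transcription deltas and assistant text/audio transcripts now arrive on the same event, distinguished by a `role` field. A hedged consumer sketch, assuming the instance exposes the `on` listener registration implied by its event map and reusing the `voice` instance from the example above:

```typescript
// One listener now covers both sides of the conversation.
voice.on('writing', ({ text, role }: { text: string; role: 'user' | 'assistant' }) => {
  // role is 'user' for input transcription and 'assistant' for model output
  process.stdout.write(`${role}: ${text}`);
});
```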
package/dist/index.js CHANGED
@@ -1,8 +1,8 @@
-import { MastraVoice } from '@mastra/core/voice';
+import { EventEmitter } from 'events';
 import { PassThrough, Readable } from 'stream';
-import { zodToJsonSchema } from 'zod-to-json-schema';
+import { MastraVoice } from '@mastra/core/voice';
 import { WebSocket } from 'ws';
-import { EventEmitter } from 'events';
+import { zodToJsonSchema } from 'zod-to-json-schema';
 
 // src/index.ts
 var transformTools = (tools) => {
@@ -63,7 +63,10 @@ var transformTools = (tools) => {
 var isReadableStream = (obj) => {
   return obj && obj instanceof Readable && typeof obj.read === "function" && typeof obj.pipe === "function" && obj.readable === true;
 };
+
+// src/index.ts
 var DEFAULT_VOICE = "alloy";
+var DEFAULT_TRANSCRIBER = "whisper-1";
 var DEFAULT_URL = "wss://api.openai.com/v1/realtime";
 var DEFAULT_MODEL = "gpt-4o-mini-realtime-preview-2024-12-17";
 var VOICES = ["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"];
@@ -75,14 +78,15 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
   instructions;
   tools;
   debug;
+  queue = [];
+  transcriber;
   /**
    * Creates a new instance of OpenAIRealtimeVoice.
    *
    * @param options - Configuration options for the voice instance
-   * @param options.chatModel - Configuration for the chat model
-   * @param options.chatModel.model - The model ID to use (defaults to GPT-4 Mini Realtime)
-   * @param options.chatModel.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
-   * @param options.chatModel.tools - Tools configuration for the model
+   * @param options.url - The base URL for the OpenAI Realtime API
+   * @param options.model - The model ID to use (defaults to GPT-4 Mini Realtime)
+   * @param options.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
    * @param options.speaker - Voice ID to use (defaults to 'alloy')
    * @param options.debug - Enable debug mode
    *
@@ -97,14 +101,10 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
    * });
    * ```
    */
-  constructor({
-    chatModel,
-    speaker,
-    debug = false
-  } = {}) {
+  constructor(options = {}) {
     super();
-    const url = `${chatModel?.url || DEFAULT_URL}?model=${chatModel?.model || DEFAULT_MODEL}`;
-    const apiKey = chatModel?.apiKey || process.env.OPENAI_API_KEY;
+    const url = `${options.url || DEFAULT_URL}?model=${options.model || DEFAULT_MODEL}`;
+    const apiKey = options.apiKey || process.env.OPENAI_API_KEY;
     this.ws = new WebSocket(url, void 0, {
       headers: {
         Authorization: "Bearer " + apiKey,
@@ -114,10 +114,9 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
     this.client = new EventEmitter();
     this.state = "close";
     this.events = {};
-    this.tools = chatModel?.tools;
-    this.instructions = chatModel?.instructions;
-    this.speaker = speaker || DEFAULT_VOICE;
-    this.debug = debug;
+    this.speaker = options.speaker || DEFAULT_VOICE;
+    this.transcriber = options.transcriber || DEFAULT_TRANSCRIBER;
+    this.debug = options.debug || false;
     this.setupEventListeners();
   }
   /**
@@ -148,6 +147,21 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
     this.ws.close();
     this.state = "close";
   }
+  /**
+   * Equips the voice instance with a set of instructions.
+   * Instructions guide the model's behavior during conversations.
+   *
+   * @param instructions - Optional instructions for the model
+   * @returns Nothing; the instructions are stored on the voice instance
+   *
+   * @example
+   * ```typescript
+   * voice.addInstructions('You are a helpful assistant.');
+   * ```
+   */
+  addInstructions(instructions) {
+    this.instructions = instructions;
+  }
   /**
    * Equips the voice instance with a set of tools.
    * Tools allow the model to perform additional actions during conversations.
@@ -165,10 +179,7 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
    * ```
    */
   addTools(tools) {
-    const openaiTools = transformTools(tools);
-    this.updateConfig({
-      tools: openaiTools.map((t) => t.openaiTool)
-    });
+    this.tools = tools || {};
   }
   /**
    * Emits a speaking event using the configured voice model.
@@ -312,7 +323,7 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
       instructions: this.instructions,
       tools: openaiTools.map((t) => t.openaiTool),
       input_audio_transcription: {
-        model: "whisper-1"
+        model: this.transcriber
       },
       voice: this.speaker
     });
@@ -460,6 +471,10 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
     });
     this.client.on("session.created", (ev) => {
       this.emit("session.created", ev);
+      const queue = this.queue.splice(0, this.queue.length);
+      for (const ev2 of queue) {
+        this.ws.send(JSON.stringify(ev2));
+      }
     });
     this.client.on("session.updated", (ev) => {
       this.emit("session.updated", ev);
@@ -472,10 +487,10 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
       this.emit("speaker", speakerStream);
     });
     this.client.on("conversation.item.input_audio_transcription.delta", (ev) => {
-      this.emit("transcribing", { text: ev.delta, response_id: ev.response_id, role: "user" });
+      this.emit("writing", { text: ev.delta, response_id: ev.response_id, role: "user" });
     });
     this.client.on("conversation.item.input_audio_transcription.done", (ev) => {
-      this.emit("transcribing", { text: "\n", response_id: ev.response_id, role: "user" });
+      this.emit("writing", { text: "\n", response_id: ev.response_id, role: "user" });
     });
     this.client.on("response.audio.delta", (ev) => {
       const audio = Buffer.from(ev.delta, "base64");
@@ -489,19 +504,19 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
       stream?.end();
     });
     this.client.on("response.audio_transcript.delta", (ev) => {
-      this.emit("writing", { text: ev.delta, response_id: ev.response_id });
+      this.emit("writing", { text: ev.delta, response_id: ev.response_id, role: "assistant" });
     });
     this.client.on("response.audio_transcript.done", (ev) => {
-      this.emit("writing", { text: "\n", response_id: ev.response_id });
+      this.emit("writing", { text: "\n", response_id: ev.response_id, role: "assistant" });
     });
     this.client.on("response.text.delta", (ev) => {
-      this.emit("writing", { text: ev.delta, response_id: ev.response_id });
+      this.emit("writing", { text: ev.delta, response_id: ev.response_id, role: "assistant" });
    });
     this.client.on("response.text.done", (ev) => {
-      this.emit("writing", { text: "\n", response_id: ev.response_id });
+      this.emit("writing", { text: "\n", response_id: ev.response_id, role: "assistant" });
     });
-    this.client.on("response.done", (ev) => {
-      this.handleFunctionCalls(ev);
+    this.client.on("response.done", async (ev) => {
+      await this.handleFunctionCalls(ev);
       this.emit("response.done", ev);
       speakerStreams.delete(ev.response.id);
     });
@@ -563,12 +578,16 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
     return btoa(binary);
   }
   sendEvent(type, data) {
-    this.ws.send(
-      JSON.stringify({
-        type,
-        ...data
-      })
-    );
+    if (this.ws.readyState !== this.ws.OPEN) {
+      this.queue.push({ type, ...data });
+    } else {
+      this.ws.send(
+        JSON.stringify({
+          type,
+          ...data
+        })
+      );
+    }
   }
 };
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@mastra/voice-openai-realtime",
-  "version": "0.1.0-alpha.1",
+  "version": "0.1.0-alpha.3",
   "description": "Mastra OpenAI Realtime API integration",
   "type": "module",
   "main": "dist/index.js",
@@ -22,7 +22,7 @@
     "openai-realtime-api": "^1.0.7",
     "ws": "^8.18.1",
     "zod-to-json-schema": "^3.24.1",
-    "@mastra/core": "^0.7.0-alpha.1"
+    "@mastra/core": "^0.7.0-alpha.3"
   },
   "devDependencies": {
     "@microsoft/api-extractor": "^7.49.2",
package/src/index.ts CHANGED
@@ -1,10 +1,10 @@
+import { EventEmitter } from 'events';
+import { PassThrough } from 'stream';
 import type { ToolsInput } from '@mastra/core/agent';
 import { MastraVoice } from '@mastra/core/voice';
-import { isReadableStream, transformTools } from './utils';
-import { WebSocket } from 'ws';
-import { EventEmitter } from 'events';
 import type { Realtime, RealtimeServerEvents } from 'openai-realtime-api';
-import { PassThrough } from 'stream';
+import { WebSocket } from 'ws';
+import { isReadableStream, transformTools } from './utils';
 
 /**
  * Event callback function type
@@ -29,6 +29,8 @@ type EventMap = {
 /** Default voice for text-to-speech. 'alloy' provides a neutral, balanced voice suitable for most use cases */
 const DEFAULT_VOICE: Realtime.Voice = 'alloy';
 
+const DEFAULT_TRANSCRIBER: Realtime.AudioTranscriptionModel = 'whisper-1';
+
 const DEFAULT_URL = 'wss://api.openai.com/v1/realtime';
 
 /**
@@ -36,21 +38,22 @@ const DEFAULT_URL = 'wss://api.openai.com/v1/realtime';
  * This model is optimized for low-latency responses while maintaining high quality output.
  */
 const DEFAULT_MODEL = 'gpt-4o-mini-realtime-preview-2024-12-17';
-/**
- * Default Voice Activity Detection (VAD) configuration.
- * These settings control how the system detects speech segments.
- *
- * @property {string} type - Uses server-side VAD for better accuracy
- * @property {number} threshold - Speech detection sensitivity (0.5 = balanced)
- * @property {number} prefix_padding_ms - Includes 1 second of audio before speech
- * @property {number} silence_duration_ms - Waits 1 second of silence before ending turn
- */
-const DEFAULT_VAD_CONFIG = {
-  type: 'server_vad',
-  threshold: 0.5,
-  prefix_padding_ms: 1000,
-  silence_duration_ms: 1000,
-} as Realtime.TurnDetection;
+
+// /**
+//  * Default Voice Activity Detection (VAD) configuration.
+//  * These settings control how the system detects speech segments.
+//  *
+//  * @property {string} type - Uses server-side VAD for better accuracy
+//  * @property {number} threshold - Speech detection sensitivity (0.5 = balanced)
+//  * @property {number} prefix_padding_ms - Includes 1 second of audio before speech
+//  * @property {number} silence_duration_ms - Waits 1 second of silence before ending turn
+//  */
+// const DEFAULT_VAD_CONFIG = {
+//   type: 'server_vad',
+//   threshold: 0.5,
+//   prefix_padding_ms: 1000,
+//   silence_duration_ms: 1000,
+// } as Realtime.TurnDetection;
 
 type TTools = ToolsInput;
 
@@ -110,15 +113,16 @@ export class OpenAIRealtimeVoice extends MastraVoice {
   private instructions?: string;
   private tools?: TTools;
   private debug: boolean;
+  private queue: unknown[] = [];
+  private transcriber: Realtime.AudioTranscriptionModel;
 
   /**
    * Creates a new instance of OpenAIRealtimeVoice.
    *
    * @param options - Configuration options for the voice instance
-   * @param options.chatModel - Configuration for the chat model
-   * @param options.chatModel.model - The model ID to use (defaults to GPT-4 Mini Realtime)
-   * @param options.chatModel.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
-   * @param options.chatModel.tools - Tools configuration for the model
+   * @param options.url - The base URL for the OpenAI Realtime API
+   * @param options.model - The model ID to use (defaults to GPT-4 Mini Realtime)
+   * @param options.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
    * @param options.speaker - Voice ID to use (defaults to 'alloy')
    * @param options.debug - Enable debug mode
    *
@@ -133,25 +137,20 @@ export class OpenAIRealtimeVoice extends MastraVoice {
    * });
    * ```
    */
-  constructor({
-    chatModel,
-    speaker,
-    debug = false,
-  }: {
-    chatModel?: {
+  constructor(
+    options: {
       model?: string;
-      apiKey?: string;
-      tools?: TTools;
-      instructions?: string;
       url?: string;
-    };
-    speaker?: Realtime.Voice;
-    debug?: boolean;
-  } = {}) {
+      apiKey?: string;
+      speaker?: Realtime.Voice;
+      transcriber?: Realtime.AudioTranscriptionModel;
+      debug?: boolean;
+    } = {},
+  ) {
     super();
 
-    const url = `${chatModel?.url || DEFAULT_URL}?model=${chatModel?.model || DEFAULT_MODEL}`;
-    const apiKey = chatModel?.apiKey || process.env.OPENAI_API_KEY;
+    const url = `${options.url || DEFAULT_URL}?model=${options.model || DEFAULT_MODEL}`;
+    const apiKey = options.apiKey || process.env.OPENAI_API_KEY;
     this.ws = new WebSocket(url, undefined, {
       headers: {
         Authorization: 'Bearer ' + apiKey,
@@ -162,10 +161,9 @@ export class OpenAIRealtimeVoice extends MastraVoice {
     this.client = new EventEmitter();
     this.state = 'close';
     this.events = {} as EventMap;
-    this.tools = chatModel?.tools;
-    this.instructions = chatModel?.instructions;
-    this.speaker = speaker || DEFAULT_VOICE;
-    this.debug = debug;
+    this.speaker = options.speaker || DEFAULT_VOICE;
+    this.transcriber = options.transcriber || DEFAULT_TRANSCRIBER;
+    this.debug = options.debug || false;
     this.setupEventListeners();
   }
 
@@ -199,6 +197,22 @@ export class OpenAIRealtimeVoice extends MastraVoice {
     this.state = 'close';
   }
 
+  /**
+   * Equips the voice instance with a set of instructions.
+   * Instructions guide the model's behavior during conversations.
+   *
+   * @param instructions - Optional instructions for the model
+   * @returns Nothing; the instructions are stored on the voice instance
+   *
+   * @example
+   * ```typescript
+   * voice.addInstructions('You are a helpful assistant.');
+   * ```
+   */
+  addInstructions(instructions?: string) {
+    this.instructions = instructions;
+  }
+
   /**
    * Equips the voice instance with a set of tools.
    * Tools allow the model to perform additional actions during conversations.
@@ -216,10 +230,7 @@ export class OpenAIRealtimeVoice extends MastraVoice {
    * ```
    */
   addTools(tools?: TTools) {
-    const openaiTools = transformTools(tools);
-    this.updateConfig({
-      tools: openaiTools.map(t => t.openaiTool),
-    });
+    this.tools = tools || {};
   }
 
   /**
@@ -375,7 +386,7 @@ export class OpenAIRealtimeVoice extends MastraVoice {
       instructions: this.instructions,
       tools: openaiTools.map(t => t.openaiTool),
       input_audio_transcription: {
-        model: 'whisper-1',
+        model: this.transcriber,
       },
       voice: this.speaker,
     });
@@ -535,6 +546,11 @@ export class OpenAIRealtimeVoice extends MastraVoice {
 
     this.client.on('session.created', ev => {
       this.emit('session.created', ev);
+
+      const queue = this.queue.splice(0, this.queue.length);
+      for (const ev of queue) {
+        this.ws.send(JSON.stringify(ev));
+      }
     });
     this.client.on('session.updated', ev => {
       this.emit('session.updated', ev);
@@ -550,10 +566,10 @@ export class OpenAIRealtimeVoice extends MastraVoice {
       this.emit('speaker', speakerStream);
     });
     this.client.on('conversation.item.input_audio_transcription.delta', ev => {
-      this.emit('transcribing', { text: ev.delta, response_id: ev.response_id, role: 'user' });
+      this.emit('writing', { text: ev.delta, response_id: ev.response_id, role: 'user' });
     });
     this.client.on('conversation.item.input_audio_transcription.done', ev => {
-      this.emit('transcribing', { text: '\n', response_id: ev.response_id, role: 'user' });
+      this.emit('writing', { text: '\n', response_id: ev.response_id, role: 'user' });
     });
     this.client.on('response.audio.delta', ev => {
       const audio = Buffer.from(ev.delta, 'base64');
@@ -569,19 +585,19 @@ export class OpenAIRealtimeVoice extends MastraVoice {
       stream?.end();
     });
     this.client.on('response.audio_transcript.delta', ev => {
-      this.emit('writing', { text: ev.delta, response_id: ev.response_id });
+      this.emit('writing', { text: ev.delta, response_id: ev.response_id, role: 'assistant' });
     });
     this.client.on('response.audio_transcript.done', ev => {
-      this.emit('writing', { text: '\n', response_id: ev.response_id });
+      this.emit('writing', { text: '\n', response_id: ev.response_id, role: 'assistant' });
     });
     this.client.on('response.text.delta', ev => {
-      this.emit('writing', { text: ev.delta, response_id: ev.response_id });
+      this.emit('writing', { text: ev.delta, response_id: ev.response_id, role: 'assistant' });
     });
     this.client.on('response.text.done', ev => {
-      this.emit('writing', { text: '\n', response_id: ev.response_id });
+      this.emit('writing', { text: '\n', response_id: ev.response_id, role: 'assistant' });
     });
-    this.client.on('response.done', ev => {
-      this.handleFunctionCalls(ev);
+    this.client.on('response.done', async ev => {
+      await this.handleFunctionCalls(ev);
       this.emit('response.done', ev);
       speakerStreams.delete(ev.response.id);
     });
@@ -647,11 +663,15 @@ export class OpenAIRealtimeVoice extends MastraVoice {
   }
 
   private sendEvent(type: string, data: any) {
-    this.ws.send(
-      JSON.stringify({
-        type: type,
-        ...data,
-      }),
-    );
+    if (this.ws.readyState !== this.ws.OPEN) {
+      this.queue.push({ type: type, ...data });
+    } else {
+      this.ws.send(
+        JSON.stringify({
+          type: type,
+          ...data,
+        }),
+      );
+    }
   }
 }
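
Note: the `sendEvent` change above is what the changelog's "Realtime event queue" entry refers to. Events fired before the WebSocket is open are buffered in `this.queue` and flushed once the server's `session.created` event arrives, instead of being dropped. A standalone sketch of the pattern under simplified assumptions (the `QueuedSocket` name is illustrative, and it flushes on the raw socket `open` event rather than on `session.created`):

```typescript
import { WebSocket } from 'ws';

// Minimal sketch of the queue-until-open pattern used by sendEvent.
class QueuedSocket {
  private queue: object[] = [];

  constructor(private ws: WebSocket) {
    // Flush everything that was buffered while the socket was still connecting.
    ws.on('open', () => {
      for (const event of this.queue.splice(0, this.queue.length)) {
        ws.send(JSON.stringify(event));
      }
    });
  }

  send(type: string, data: object) {
    if (this.ws.readyState !== WebSocket.OPEN) {
      this.queue.push({ type, ...data }); // buffer instead of dropping
    } else {
      this.ws.send(JSON.stringify({ type, ...data }));
    }
  }
}
```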