@mastra/voice-openai-realtime 0.0.4 → 0.1.0-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,23 +1,23 @@
  
- > @mastra/voice-openai-realtime@0.0.4-alpha.1 build /home/runner/work/mastra/mastra/voice/openai-realtime-api
+ > @mastra/voice-openai-realtime@0.1.0-alpha.1 build /home/runner/work/mastra/mastra/voice/openai-realtime-api
  > tsup src/index.ts --format esm,cjs --experimental-dts --clean --treeshake
  
  CLI Building entry: src/index.ts
  CLI Using tsconfig: tsconfig.json
  CLI tsup v8.4.0
  TSC Build start
- TSC ⚡️ Build success in 7656ms
+ TSC ⚡️ Build success in 9409ms
  DTS Build start
  CLI Target: es2022
  Analysis will use the bundled TypeScript version 5.8.2
  Writing package typings: /home/runner/work/mastra/mastra/voice/openai-realtime-api/dist/_tsup-dts-rollup.d.ts
  Analysis will use the bundled TypeScript version 5.8.2
  Writing package typings: /home/runner/work/mastra/mastra/voice/openai-realtime-api/dist/_tsup-dts-rollup.d.cts
- DTS ⚡️ Build success in 10136ms
+ DTS ⚡️ Build success in 10803ms
  CLI Cleaning output folder
  ESM Build start
  CJS Build start
- ESM dist/index.js 14.90 KB
- ESM ⚡️ Build success in 711ms
- CJS dist/index.cjs 14.99 KB
- CJS ⚡️ Build success in 712ms
+ CJS dist/index.cjs 17.77 KB
+ CJS ⚡️ Build success in 694ms
+ ESM dist/index.js 17.72 KB
+ ESM ⚡️ Build success in 695ms
package/CHANGELOG.md CHANGED
@@ -1,5 +1,25 @@
  # @mastra/voice-openai-realtime
  
+ ## 0.1.0-alpha.1
+ 
+ ### Minor Changes
+ 
+ - 443b118: This update removed an external dependency on an unmaintained package and implemented a native websocket connection.
+ 
+ ### Patch Changes
+ 
+ - Updated dependencies [0b54522]
+ - Updated dependencies [1af25d5]
+ - Updated dependencies [27439ad]
+   - @mastra/core@0.7.0-alpha.1
+ 
+ ## 0.0.5-alpha.0
+ 
+ ### Patch Changes
+ 
+ - Updated dependencies [b4fbc59]
+   - @mastra/core@0.6.5-alpha.0
+ 
  ## 0.0.4
  
  ### Patch Changes
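The minor change in 0.1.0-alpha.1 replaces the `openai-realtime-api` client with a direct WebSocket connection, which reshapes the constructor options: the nested `chatModel.options` block is gone, `instructions` and `url` move into `chatModel`, and `debug` becomes a top-level flag. A hedged sketch of the new surface, based on the typings diff below; the instruction string is a made-up placeholder:

```typescript
import { OpenAIRealtimeVoice } from '@mastra/voice-openai-realtime';

// Sketch of the 0.1.0-alpha.1 constructor, per dist/_tsup-dts-rollup.d.ts below.
const voice = new OpenAIRealtimeVoice({
  chatModel: {
    model: 'gpt-4o-mini-realtime-preview-2024-12-17', // default model per dist/index.cjs
    apiKey: process.env.OPENAI_API_KEY,                 // also the documented fallback
    instructions: 'You are a concise voice assistant.', // new in 0.1.0: session instructions
    url: 'wss://api.openai.com/v1/realtime',            // new in 0.1.0: overridable endpoint
  },
  speaker: 'alloy', // default voice
  debug: true,      // new top-level flag: logs each incoming event type
});

// connect() now resolves after the socket opens and `session.created` arrives.
await voice.connect();
```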
package/dist/_tsup-dts-rollup.d.ts CHANGED
@@ -43,10 +43,13 @@ export declare type OpenAIExecuteFunction = (args: any) => Promise<any>;
   * ```
   */
  export declare class OpenAIRealtimeVoice extends MastraVoice {
-     private client;
+     private ws;
      private state;
+     private client;
      private events;
-     tools?: TTools;
+     private instructions?;
+     private tools?;
+     private debug;
      /**
       * Creates a new instance of OpenAIRealtimeVoice.
       *
@@ -55,13 +58,8 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
       * @param options.chatModel.model - The model ID to use (defaults to GPT-4 Mini Realtime)
       * @param options.chatModel.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
       * @param options.chatModel.tools - Tools configuration for the model
-      * @param options.chatModel.options - Additional options for the realtime client
-      * @param options.chatModel.options.sessionConfig - Session configuration overrides
-      * @param options.chatModel.options.url - Custom WebSocket URL
-      * @param options.chatModel.options.dangerouslyAllowAPIKeyInBrowser - Whether to allow API key in browser
-      * @param options.chatModel.options.debug - Enable debug logging
-      * @param options.chatModel.options.tools - Additional tools configuration
       * @param options.speaker - Voice ID to use (defaults to 'alloy')
+      * @param options.debug - Enable debug mode
       *
       * @example
       * ```typescript
@@ -74,20 +72,16 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
       * });
       * ```
       */
-     constructor({ chatModel, speaker, }?: {
+     constructor({ chatModel, speaker, debug, }?: {
          chatModel?: {
              model?: string;
              apiKey?: string;
              tools?: TTools;
-             options?: {
-                 sessionConfig?: Realtime.SessionConfig;
-                 url?: string;
-                 dangerouslyAllowAPIKeyInBrowser?: boolean;
-                 debug?: boolean;
-                 tools?: TTools;
-             };
+             instructions?: string;
+             url?: string;
          };
          speaker?: Realtime.Voice;
+         debug?: boolean;
      });
      /**
       * Returns a list of available voice speakers.
@@ -175,7 +169,7 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
       * });
       * ```
       */
-     updateConfig(sessionConfig: Realtime.SessionConfig): void;
+     updateConfig(sessionConfig: unknown): void;
      /**
       * Processes audio input for speech recognition.
       * Takes a readable stream of audio data and emits a writing event.
@@ -200,6 +194,8 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
       * ```
       */
      listen(audioData: NodeJS.ReadableStream): Promise<void>;
+     waitForOpen(): Promise<unknown>;
+     waitForSessionCreated(): Promise<unknown>;
      /**
       * Establishes a connection to the OpenAI realtime service.
       * Must be called before using speak, listen, or relay functions.
@@ -231,7 +227,7 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
       * await voice.relay(micStream);
       * ```
       */
-     send(audioData: NodeJS.ReadableStream | Int16Array): Promise<void>;
+     send(audioData: NodeJS.ReadableStream | Int16Array, eventId?: string): Promise<void>;
      /**
       * Sends a response to the OpenAI Realtime API.
       *
@@ -304,11 +300,15 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
       */
      private emit;
      private setupEventListeners;
+     private handleFunctionCalls;
+     private handleFunctionCall;
      private int16ArrayToBase64;
+     private sendEvent;
  }
  
  export declare const transformTools: (tools?: TTools_2) => {
      openaiTool: {
+         type: string;
          name: string;
          description: string;
          parameters: {
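Note that `updateConfig` loosens its parameter from `Realtime.SessionConfig` to `unknown`: with the typed client gone, the object is forwarded verbatim as a `session.update` event (see `dist/index.cjs` below). A sketch of the resulting wire frame; the field values are illustrative only:

```typescript
// voice.updateConfig(sessionConfig) now serializes to a single JSON frame:
const frame = JSON.stringify({
  type: 'session.update',
  session: {
    voice: 'alloy',                                    // illustrative values; connect()
    input_audio_transcription: { model: 'whisper-1' }, // applies these same defaults
  },
});
// ...and writes it to the socket with ws.send(frame).
```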
package/dist/_tsup-dts-rollup.d.cts CHANGED
@@ -43,10 +43,13 @@ export declare type OpenAIExecuteFunction = (args: any) => Promise<any>;
   * ```
   */
  export declare class OpenAIRealtimeVoice extends MastraVoice {
-     private client;
+     private ws;
      private state;
+     private client;
      private events;
-     tools?: TTools;
+     private instructions?;
+     private tools?;
+     private debug;
      /**
       * Creates a new instance of OpenAIRealtimeVoice.
       *
@@ -55,13 +58,8 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
       * @param options.chatModel.model - The model ID to use (defaults to GPT-4 Mini Realtime)
       * @param options.chatModel.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
       * @param options.chatModel.tools - Tools configuration for the model
-      * @param options.chatModel.options - Additional options for the realtime client
-      * @param options.chatModel.options.sessionConfig - Session configuration overrides
-      * @param options.chatModel.options.url - Custom WebSocket URL
-      * @param options.chatModel.options.dangerouslyAllowAPIKeyInBrowser - Whether to allow API key in browser
-      * @param options.chatModel.options.debug - Enable debug logging
-      * @param options.chatModel.options.tools - Additional tools configuration
       * @param options.speaker - Voice ID to use (defaults to 'alloy')
+      * @param options.debug - Enable debug mode
       *
       * @example
       * ```typescript
@@ -74,20 +72,16 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
       * });
       * ```
       */
-     constructor({ chatModel, speaker, }?: {
+     constructor({ chatModel, speaker, debug, }?: {
          chatModel?: {
              model?: string;
              apiKey?: string;
              tools?: TTools;
-             options?: {
-                 sessionConfig?: Realtime.SessionConfig;
-                 url?: string;
-                 dangerouslyAllowAPIKeyInBrowser?: boolean;
-                 debug?: boolean;
-                 tools?: TTools;
-             };
+             instructions?: string;
+             url?: string;
          };
          speaker?: Realtime.Voice;
+         debug?: boolean;
      });
      /**
       * Returns a list of available voice speakers.
@@ -175,7 +169,7 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
       * });
       * ```
       */
-     updateConfig(sessionConfig: Realtime.SessionConfig): void;
+     updateConfig(sessionConfig: unknown): void;
      /**
       * Processes audio input for speech recognition.
       * Takes a readable stream of audio data and emits a writing event.
@@ -200,6 +194,8 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
       * ```
       */
      listen(audioData: NodeJS.ReadableStream): Promise<void>;
+     waitForOpen(): Promise<unknown>;
+     waitForSessionCreated(): Promise<unknown>;
      /**
       * Establishes a connection to the OpenAI realtime service.
       * Must be called before using speak, listen, or relay functions.
@@ -231,7 +227,7 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
       * await voice.relay(micStream);
       * ```
       */
-     send(audioData: NodeJS.ReadableStream | Int16Array): Promise<void>;
+     send(audioData: NodeJS.ReadableStream | Int16Array, eventId?: string): Promise<void>;
      /**
       * Sends a response to the OpenAI Realtime API.
       *
@@ -304,11 +300,15 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
       */
      private emit;
      private setupEventListeners;
+     private handleFunctionCalls;
+     private handleFunctionCall;
      private int16ArrayToBase64;
+     private sendEvent;
  }
  
  export declare const transformTools: (tools?: TTools_2) => {
      openaiTool: {
+         type: string;
          name: string;
          description: string;
          parameters: {
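The new declarations also expose the event-driven relay path: `send` gains an optional `eventId`, and output arrives through emitted events rather than the old client callbacks. A hedged usage sketch; `micStream` and `speakerSink` are hypothetical streams, and the event names are taken from the implementation diff below:

```typescript
import type { PassThrough } from 'stream';

// micStream: any NodeJS.ReadableStream of 16-bit PCM audio (hypothetical source).
await voice.send(micStream, 'client-evt-1'); // eventId is the new optional argument

voice.on('speaker', (audio: PassThrough) => {
  audio.pipe(speakerSink); // one PassThrough per response, ended on response.audio.done
});
voice.on('writing', ({ text }) => process.stdout.write(text));      // assistant transcript deltas
voice.on('transcribing', ({ text }) => process.stdout.write(text)); // user transcription deltas
```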
package/dist/index.cjs CHANGED
@@ -1,9 +1,10 @@
  'use strict';
  
  var voice = require('@mastra/core/voice');
- var openaiRealtimeApi = require('openai-realtime-api');
  var stream = require('stream');
  var zodToJsonSchema = require('zod-to-json-schema');
+ var ws = require('ws');
+ var events = require('events');
  
  // src/index.ts
  var transformTools = (tools) => {
@@ -29,6 +30,7 @@ var transformTools = (tools) => {
        continue;
      }
      const openaiTool = {
+       type: "function",
        name,
        description: tool.description || `Tool: ${name}`,
        parameters
@@ -63,22 +65,18 @@ var transformTools = (tools) => {
  var isReadableStream = (obj) => {
    return obj && obj instanceof stream.Readable && typeof obj.read === "function" && typeof obj.pipe === "function" && obj.readable === true;
  };
- 
- // src/index.ts
  var DEFAULT_VOICE = "alloy";
+ var DEFAULT_URL = "wss://api.openai.com/v1/realtime";
  var DEFAULT_MODEL = "gpt-4o-mini-realtime-preview-2024-12-17";
- var DEFAULT_VAD_CONFIG = {
-   type: "server_vad",
-   threshold: 0.5,
-   prefix_padding_ms: 1e3,
-   silence_duration_ms: 1e3
- };
  var VOICES = ["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"];
  var OpenAIRealtimeVoice = class extends voice.MastraVoice {
-   client;
+   ws;
    state;
+   client;
    events;
+   instructions;
    tools;
+   debug;
    /**
     * Creates a new instance of OpenAIRealtimeVoice.
     *
@@ -87,13 +85,8 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
     * @param options.chatModel.model - The model ID to use (defaults to GPT-4 Mini Realtime)
     * @param options.chatModel.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
     * @param options.chatModel.tools - Tools configuration for the model
-    * @param options.chatModel.options - Additional options for the realtime client
-    * @param options.chatModel.options.sessionConfig - Session configuration overrides
-    * @param options.chatModel.options.url - Custom WebSocket URL
-    * @param options.chatModel.options.dangerouslyAllowAPIKeyInBrowser - Whether to allow API key in browser
-    * @param options.chatModel.options.debug - Enable debug logging
-    * @param options.chatModel.options.tools - Additional tools configuration
     * @param options.speaker - Voice ID to use (defaults to 'alloy')
+    * @param options.debug - Enable debug mode
     *
     * @example
     * ```typescript
@@ -108,25 +101,26 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
     */
    constructor({
      chatModel,
-     speaker
+     speaker,
+     debug = false
    } = {}) {
      super();
-     this.client = new openaiRealtimeApi.RealtimeClient({
-       apiKey: chatModel?.apiKey || process.env.OPENAI_API_KEY,
-       model: chatModel?.model || DEFAULT_MODEL,
-       ...chatModel?.options,
-       sessionConfig: {
-         voice: speaker || DEFAULT_VOICE,
-         turn_detection: DEFAULT_VAD_CONFIG,
-         ...chatModel?.options?.sessionConfig
+     const url = `${chatModel?.url || DEFAULT_URL}?model=${chatModel?.model || DEFAULT_MODEL}`;
+     const apiKey = chatModel?.apiKey || process.env.OPENAI_API_KEY;
+     this.ws = new ws.WebSocket(url, void 0, {
+       headers: {
+         Authorization: "Bearer " + apiKey,
+         "OpenAI-Beta": "realtime=v1"
        }
      });
+     this.client = new events.EventEmitter();
      this.state = "close";
      this.events = {};
+     this.tools = chatModel?.tools;
+     this.instructions = chatModel?.instructions;
+     this.speaker = speaker || DEFAULT_VOICE;
+     this.debug = debug;
      this.setupEventListeners();
-     if (chatModel?.tools) {
-       this.addTools(chatModel.tools);
-     }
    }
    /**
     * Returns a list of available voice speakers.
@@ -152,8 +146,8 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
     * ```
     */
    close() {
-     if (!this.client) return;
-     this.client.disconnect();
+     if (!this.ws) return;
+     this.ws.close();
      this.state = "close";
    }
    /**
@@ -173,10 +167,10 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
     * ```
     */
    addTools(tools) {
-     const transformedTools = transformTools(tools);
-     for (const tool of transformedTools) {
-       this.client.addTool(tool.openaiTool, tool.execute);
-     }
+     const openaiTools = transformTools(tools);
+     this.updateConfig({
+       tools: openaiTools.map((t) => t.openaiTool)
+     });
    }
    /**
     * Emits a speaking event using the configured voice model.
@@ -212,7 +206,7 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
      if (input.trim().length === 0) {
        throw new Error("Input text is empty");
      }
-     this.client.realtime.send("response.create", {
+     this.sendEvent("response.create", {
        response: {
          instructions: `Repeat the following text: ${input}`,
          voice: options?.speaker ? options.speaker : void 0
@@ -238,7 +232,7 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
     * ```
     */
    updateConfig(sessionConfig) {
-     this.client.updateSession(sessionConfig);
+     this.sendEvent("session.update", { session: sessionConfig });
    }
    /**
     * Processes audio input for speech recognition.
@@ -273,14 +267,14 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
        const buffer = Buffer.concat(chunks);
        const int16Array = new Int16Array(buffer.buffer, buffer.byteOffset ?? 0, (buffer.byteLength ?? 0) / 2);
        const base64Audio = this.int16ArrayToBase64(int16Array);
-       this.client.realtime.send("conversation.item.create", {
+       this.sendEvent("conversation.item.create", {
          item: {
            type: "message",
            role: "user",
            content: [{ type: "input_audio", audio: base64Audio }]
          }
        });
-       this.client.realtime.send("response.create", {
+       this.sendEvent("response.create", {
          response: {
            modalities: ["text"],
            instructions: `ONLY repeat the input and DO NOT say anything else`
@@ -290,6 +284,16 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
        this.emit("error", new Error("Unsupported audio data format"));
      }
    }
+   waitForOpen() {
+     return new Promise((resolve) => {
+       this.ws.on("open", resolve);
+     });
+   }
+   waitForSessionCreated() {
+     return new Promise((resolve) => {
+       this.client.on("session.created", resolve);
+     });
+   }
    /**
     * Establishes a connection to the OpenAI realtime service.
     * Must be called before using speak, listen, or relay functions.
@@ -303,8 +307,17 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
     * ```
     */
    async connect() {
-     await this.client.connect();
-     await this.client.waitForSessionCreated();
+     await this.waitForOpen();
+     await this.waitForSessionCreated();
+     const openaiTools = transformTools(this.tools);
+     this.updateConfig({
+       instructions: this.instructions,
+       tools: openaiTools.map((t) => t.openaiTool),
+       input_audio_transcription: {
+         model: "whisper-1"
+       },
+       voice: this.speaker
+     });
      this.state = "open";
    }
    /**
@@ -325,7 +338,7 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
     * await voice.relay(micStream);
     * ```
     */
-   async send(audioData) {
+   async send(audioData, eventId) {
      if (!this.state || this.state !== "open") {
        console.warn("Cannot relay audio when not open. Call open() first.");
        return;
@@ -335,15 +348,14 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
      stream.on("data", (chunk) => {
        try {
          const buffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
-         const int16Array = new Int16Array(buffer.buffer, buffer.byteOffset, buffer.byteLength / 2);
-         this.client.appendInputAudio(int16Array);
+         this.sendEvent("input_audio_buffer.append", { audio: buffer.toString("base64"), event_id: eventId });
        } catch (err) {
          this.emit("error", err);
        }
      });
    } else if (audioData instanceof Int16Array) {
      try {
-       this.client.appendInputAudio(audioData);
+       this.sendEvent("input_audio_buffer.append", { audio: audioData, event_id: eventId });
      } catch (err) {
        this.emit("error", err);
      }
@@ -370,7 +382,7 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
     * });
     */
    async answer({ options }) {
-     this.client.realtime.send("response.create", { response: options ?? {} });
+     this.sendEvent("response.create", { response: options ?? {} });
    }
    /**
     * Registers an event listener for voice events.
@@ -439,29 +451,105 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
      }
    }
    setupEventListeners() {
-     this.client.on("error", (error) => {
-       this.emit("error", error);
+     const speakerStreams = /* @__PURE__ */ new Map();
+     this.ws.on("message", (message) => {
+       const data = JSON.parse(message.toString());
+       this.client.emit(data.type, data);
+       if (this.debug) {
+         const { delta, ...fields } = data;
+         console.log(data.type, fields, delta?.length < 100 ? delta : "");
+       }
      });
-     this.client.on("conversation.created", (conversation) => {
-       this.emit("openAIRealtime:conversation.created", conversation);
+     this.client.on("session.created", (ev) => {
+       this.emit("session.created", ev);
      });
-     this.client.on("conversation.interrupted", () => {
-       this.emit("openAIRealtime:conversation.interrupted");
+     this.client.on("session.updated", (ev) => {
+       this.emit("session.updated", ev);
      });
-     this.client.on("conversation.updated", ({ delta }) => {
-       if (delta?.audio) {
-         this.emit("speaking", { audio: delta.audio });
-       }
+     this.client.on("response.created", (ev) => {
+       this.emit("response.created", ev);
+       const speakerStream = new stream.PassThrough();
+       speakerStream.id = ev.response.id;
+       speakerStreams.set(ev.response.id, speakerStream);
+       this.emit("speaker", speakerStream);
      });
-     this.client.on("conversation.item.appended", (item) => {
-       this.emit("openAIRealtime:conversation.item.appended", item);
+     this.client.on("conversation.item.input_audio_transcription.delta", (ev) => {
+       this.emit("transcribing", { text: ev.delta, response_id: ev.response_id, role: "user" });
      });
-     this.client.on("conversation.item.completed", ({ item, delta }) => {
-       if (item.formatted.transcript) {
-         this.emit("writing", { text: item.formatted.transcript, role: item.role });
-       }
-       this.emit("openAIRealtime:conversation.item.completed", { item, delta });
+     this.client.on("conversation.item.input_audio_transcription.done", (ev) => {
+       this.emit("transcribing", { text: "\n", response_id: ev.response_id, role: "user" });
+     });
+     this.client.on("response.audio.delta", (ev) => {
+       const audio = Buffer.from(ev.delta, "base64");
+       this.emit("speaking", { audio, response_id: ev.response_id });
+       const stream = speakerStreams.get(ev.response_id);
+       stream?.write(audio);
+     });
+     this.client.on("response.audio.done", (ev) => {
+       this.emit("speaking.done", { response_id: ev.response_id });
+       const stream = speakerStreams.get(ev.response_id);
+       stream?.end();
+     });
+     this.client.on("response.audio_transcript.delta", (ev) => {
+       this.emit("writing", { text: ev.delta, response_id: ev.response_id });
      });
+     this.client.on("response.audio_transcript.done", (ev) => {
+       this.emit("writing", { text: "\n", response_id: ev.response_id });
+     });
+     this.client.on("response.text.delta", (ev) => {
+       this.emit("writing", { text: ev.delta, response_id: ev.response_id });
+     });
+     this.client.on("response.text.done", (ev) => {
+       this.emit("writing", { text: "\n", response_id: ev.response_id });
+     });
+     this.client.on("response.done", (ev) => {
+       this.handleFunctionCalls(ev);
+       this.emit("response.done", ev);
+       speakerStreams.delete(ev.response.id);
+     });
+   }
+   async handleFunctionCalls(ev) {
+     for (const output of ev.response?.output ?? []) {
+       if (output.type === "function_call") {
+         await this.handleFunctionCall(output);
+       }
+     }
+   }
+   async handleFunctionCall(output) {
+     try {
+       const context = JSON.parse(output.arguments);
+       const tool = this.tools?.[output.name];
+       if (!tool) {
+         console.warn(`Tool "${output.name}" not found`);
+         return;
+       }
+       const result = await tool?.execute?.(
+         { context },
+         {
+           toolCallId: "unknown",
+           messages: []
+         }
+       );
+       this.sendEvent("conversation.item.create", {
+         item: {
+           type: "function_call_output",
+           call_id: output.call_id,
+           output: JSON.stringify(result)
+         }
+       });
+     } catch (e) {
+       const err = e;
+       console.warn(`Error calling tool "${output.name}":`, err.message);
+       this.sendEvent("conversation.item.create", {
+         item: {
+           type: "function_call_output",
+           call_id: output.call_id,
+           output: JSON.stringify({ error: err.message })
+         }
+       });
+     } finally {
+       this.sendEvent("response.create", {});
+     }
    }
    int16ArrayToBase64(int16Array) {
      const buffer = new ArrayBuffer(int16Array.length * 2);
@@ -476,6 +564,14 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
      }
      return btoa(binary);
    }
+   sendEvent(type, data) {
+     this.ws.send(
+       JSON.stringify({
+         type,
+         ...data
+       })
+     );
+   }
  };
  
  exports.OpenAIRealtimeVoice = OpenAIRealtimeVoice;
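For reference, the native connection that replaces the removed `openai-realtime-api` dependency comes down to two pieces visible above: a `ws` socket opened with bearer auth plus the realtime beta header, and a one-line JSON envelope per event. A minimal standalone sketch under those assumptions:

```typescript
import WebSocket from 'ws';

// Mirrors the constructor in dist/index.cjs: the model rides in the query string,
// auth and the beta opt-in ride in the headers.
const socket = new WebSocket(
  'wss://api.openai.com/v1/realtime?model=gpt-4o-mini-realtime-preview-2024-12-17',
  undefined,
  {
    headers: {
      Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
      'OpenAI-Beta': 'realtime=v1',
    },
  },
);

// sendEvent(type, data) flattens to one frame: { type, ...data }.
// (The default parameter here is a convenience not present in the bundle.)
const sendEvent = (type: string, data: Record<string, unknown> = {}) =>
  socket.send(JSON.stringify({ type, ...data }));

// Every incoming frame is parsed and re-dispatched by its `type` field,
// which is exactly what setupEventListeners does via an EventEmitter.
socket.on('message', (raw) => {
  const event = JSON.parse(raw.toString());
  console.log(event.type);
});

socket.on('open', () => sendEvent('response.create'));
```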