@mastra/voice-openai-realtime 0.0.5-alpha.0 → 0.1.0-alpha.1
This diff shows the published contents of the two package versions as they appear in their public registries, and is provided for informational purposes only.
- package/.turbo/turbo-build.log +7 -7
- package/CHANGELOG.md +13 -0
- package/dist/_tsup-dts-rollup.d.cts +18 -18
- package/dist/_tsup-dts-rollup.d.ts +18 -18
- package/dist/index.cjs +158 -62
- package/dist/index.js +159 -63
- package/package.json +6 -4
- package/src/index.ts +188 -74
- package/src/utils.ts +1 -0
package/dist/index.js
CHANGED
@@ -1,7 +1,8 @@
 import { MastraVoice } from '@mastra/core/voice';
-import {
-import { Readable } from 'stream';
+import { PassThrough, Readable } from 'stream';
 import { zodToJsonSchema } from 'zod-to-json-schema';
+import { WebSocket } from 'ws';
+import { EventEmitter } from 'events';

 // src/index.ts
 var transformTools = (tools) => {
@@ -27,6 +28,7 @@ var transformTools = (tools) => {
       continue;
     }
     const openaiTool = {
+      type: "function",
       name,
       description: tool.description || `Tool: ${name}`,
       parameters
@@ -61,22 +63,18 @@ var transformTools = (tools) => {
 var isReadableStream = (obj) => {
   return obj && obj instanceof Readable && typeof obj.read === "function" && typeof obj.pipe === "function" && obj.readable === true;
 };
-
-// src/index.ts
 var DEFAULT_VOICE = "alloy";
+var DEFAULT_URL = "wss://api.openai.com/v1/realtime";
 var DEFAULT_MODEL = "gpt-4o-mini-realtime-preview-2024-12-17";
-var DEFAULT_VAD_CONFIG = {
-  type: "server_vad",
-  threshold: 0.5,
-  prefix_padding_ms: 1e3,
-  silence_duration_ms: 1e3
-};
 var VOICES = ["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"];
 var OpenAIRealtimeVoice = class extends MastraVoice {
-
+  ws;
   state;
+  client;
   events;
+  instructions;
   tools;
+  debug;
   /**
    * Creates a new instance of OpenAIRealtimeVoice.
    *
@@ -85,13 +83,8 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
    * @param options.chatModel.model - The model ID to use (defaults to GPT-4 Mini Realtime)
    * @param options.chatModel.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
    * @param options.chatModel.tools - Tools configuration for the model
-   * @param options.chatModel.options - Additional options for the realtime client
-   * @param options.chatModel.options.sessionConfig - Session configuration overrides
-   * @param options.chatModel.options.url - Custom WebSocket URL
-   * @param options.chatModel.options.dangerouslyAllowAPIKeyInBrowser - Whether to allow API key in browser
-   * @param options.chatModel.options.debug - Enable debug logging
-   * @param options.chatModel.options.tools - Additional tools configuration
    * @param options.speaker - Voice ID to use (defaults to 'alloy')
+   * @param options.debug - Enable debug mode
    *
    * @example
    * ```typescript
@@ -106,25 +99,26 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
    */
   constructor({
     chatModel,
-    speaker
+    speaker,
+    debug = false
   } = {}) {
     super();
-
-
-
-
-
-
-      turn_detection: DEFAULT_VAD_CONFIG,
-      ...chatModel?.options?.sessionConfig
+    const url = `${chatModel?.url || DEFAULT_URL}?model=${chatModel?.model || DEFAULT_MODEL}`;
+    const apiKey = chatModel?.apiKey || process.env.OPENAI_API_KEY;
+    this.ws = new WebSocket(url, void 0, {
+      headers: {
+        Authorization: "Bearer " + apiKey,
+        "OpenAI-Beta": "realtime=v1"
       }
     });
+    this.client = new EventEmitter();
     this.state = "close";
     this.events = {};
+    this.tools = chatModel?.tools;
+    this.instructions = chatModel?.instructions;
+    this.speaker = speaker || DEFAULT_VOICE;
+    this.debug = debug;
     this.setupEventListeners();
-    if (chatModel?.tools) {
-      this.addTools(chatModel.tools);
-    }
   }
   /**
    * Returns a list of available voice speakers.
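The constructor is where the rewrite is most visible: instead of delegating to the `openai-realtime-api` client, it opens the WebSocket itself (note the `OpenAI-Beta: realtime=v1` header) and fans parsed server events out through a plain `EventEmitter`. A minimal startup sketch against the new surface; the option names come from the hunks above, while the spoken text is illustrative:

```typescript
import { OpenAIRealtimeVoice } from '@mastra/voice-openai-realtime';

// Assumes OPENAI_API_KEY is set; url and model fall back to the
// DEFAULT_URL and DEFAULT_MODEL constants added in this version.
const voice = new OpenAIRealtimeVoice({
  chatModel: { model: 'gpt-4o-mini-realtime-preview-2024-12-17' },
  speaker: 'alloy',
  debug: true, // logs every incoming event type (see setupEventListeners below)
});

await voice.connect(); // resolves after 'open' and 'session.created' (see connect() below)
await voice.speak('Hello from the realtime API.');
```

Whether `speak` resolves before or after the audio finishes streaming isn't shown in this diff; the audio itself arrives via the `speaker` events covered further down.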
@@ -150,8 +144,8 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
    * ```
    */
   close() {
-    if (!this.
-    this.
+    if (!this.ws) return;
+    this.ws.close();
     this.state = "close";
   }
   /**
@@ -171,10 +165,10 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
    * ```
    */
   addTools(tools) {
-    const
-
-
-    }
+    const openaiTools = transformTools(tools);
+    this.updateConfig({
+      tools: openaiTools.map((t) => t.openaiTool)
+    });
   }
   /**
    * Emits a speaking event using the configured voice model.
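`addTools` collapses to a `transformTools` pass plus a `session.update`. Continuing the sketch above, a hedged example of registering a tool; the field names (`description`, zod `parameters`, `execute`) are the ones `transformTools` and `handleFunctionCall` read elsewhere in this diff, but the full Mastra tool type isn't visible here:

```typescript
import { z } from 'zod';

// Hypothetical weather tool. handleFunctionCall (further down) will call
// execute({ context }, { toolCallId, messages }) with the parsed arguments
// when the model invokes it.
voice.addTools({
  getWeather: {
    description: 'Look up the current weather for a city',
    parameters: z.object({ city: z.string() }),
    execute: async ({ context }: { context: { city: string } }) =>
      ({ city: context.city, tempC: 21 }),
  },
});
```

Since `addTools` now writes a `session.update` to the socket immediately, it is presumably meant to be called after `connect()`; tools passed to the constructor are applied during `connect()` instead.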
@@ -210,7 +204,7 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
     if (input.trim().length === 0) {
       throw new Error("Input text is empty");
     }
-    this.
+    this.sendEvent("response.create", {
       response: {
         instructions: `Repeat the following text: ${input}`,
         voice: options?.speaker ? options.speaker : void 0
@@ -236,7 +230,7 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
    * ```
    */
   updateConfig(sessionConfig) {
-    this.
+    this.sendEvent("session.update", { session: sessionConfig });
   }
   /**
    * Processes audio input for speech recognition.
@@ -271,14 +265,14 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
       const buffer = Buffer.concat(chunks);
       const int16Array = new Int16Array(buffer.buffer, buffer.byteOffset ?? 0, (buffer.byteLength ?? 0) / 2);
       const base64Audio = this.int16ArrayToBase64(int16Array);
-      this.
+      this.sendEvent("conversation.item.create", {
         item: {
           type: "message",
           role: "user",
           content: [{ type: "input_audio", audio: base64Audio }]
         }
       });
-      this.
+      this.sendEvent("response.create", {
         response: {
           modalities: ["text"],
           instructions: `ONLY repeat the input and DO NOT say anything else`
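The recognition path now hand-rolls the same flow: buffer the audio, base64-encode it with `int16ArrayToBase64`, attach it as a user `conversation.item.create`, and request a text-only response that merely echoes the input. Continuing the sketch, with a hypothetical raw-PCM16 recording on disk; the echoed transcript surfaces through the `writing` events wired up in `setupEventListeners` below:

```typescript
import { createReadStream } from 'fs';

// recording.pcm is assumed to contain raw 16-bit PCM audio.
voice.on('writing', ({ text }: { text: string }) => process.stdout.write(text)); // response.text.delta
await voice.listen(createReadStream('recording.pcm'));
```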
@@ -288,6 +282,16 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
       this.emit("error", new Error("Unsupported audio data format"));
     }
   }
+  waitForOpen() {
+    return new Promise((resolve) => {
+      this.ws.on("open", resolve);
+    });
+  }
+  waitForSessionCreated() {
+    return new Promise((resolve) => {
+      this.client.on("session.created", resolve);
+    });
+  }
   /**
    * Establishes a connection to the OpenAI realtime service.
    * Must be called before using speak, listen, or relay functions.
@@ -301,8 +305,17 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
    * ```
    */
   async connect() {
-    await this.
-    await this.
+    await this.waitForOpen();
+    await this.waitForSessionCreated();
+    const openaiTools = transformTools(this.tools);
+    this.updateConfig({
+      instructions: this.instructions,
+      tools: openaiTools.map((t) => t.openaiTool),
+      input_audio_transcription: {
+        model: "whisper-1"
+      },
+      voice: this.speaker
+    });
     this.state = "open";
   }
   /**
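`connect` swaps the old client's handshake for the two promise helpers added above, then pushes a single consolidated `session.update` (instructions, tools, `whisper-1` input transcription, voice). One detail worth flagging: `waitForOpen` and `waitForSessionCreated` subscribe with `on` rather than `once`, so their listeners remain attached after the promises resolve. For a default instance, the frame that `updateConfig` call produces looks roughly like this (a sketch of the wire shape, not an exhaustive schema):

```typescript
// Approximate session.update frame for a default instance with no tools
// or instructions configured; shape per updateConfig/sendEvent above.
// JSON.stringify drops the undefined instructions key on the wire.
const frame = JSON.stringify({
  type: 'session.update',
  session: {
    instructions: undefined,
    tools: [],
    input_audio_transcription: { model: 'whisper-1' },
    voice: 'alloy',
  },
});
```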
@@ -323,7 +336,7 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
    * await voice.relay(micStream);
    * ```
    */
-  async send(audioData) {
+  async send(audioData, eventId) {
     if (!this.state || this.state !== "open") {
       console.warn("Cannot relay audio when not open. Call open() first.");
       return;
@@ -333,15 +346,14 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
       stream.on("data", (chunk) => {
         try {
           const buffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
-
-          this.client.appendInputAudio(int16Array);
+          this.sendEvent("input_audio_buffer.append", { audio: buffer.toString("base64"), event_id: eventId });
         } catch (err) {
           this.emit("error", err);
         }
       });
     } else if (audioData instanceof Int16Array) {
       try {
-        this.
+        this.sendEvent("input_audio_buffer.append", { audio: audioData, event_id: eventId });
       } catch (err) {
         this.emit("error", err);
       }
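`send` (the relay path) replaces `appendInputAudio` with raw `input_audio_buffer.append` events: Buffer chunks are base64-encoded, and the new optional `eventId` parameter rides along as `event_id`. Note the `Int16Array` branch passes the array through as `audio` without base64-encoding it, unlike the stream branch. A hedged relay sketch, continuing the sketch above; the microphone helper is hypothetical and stands in for any Readable producing 16-bit PCM:

```typescript
import { Readable } from 'stream';

// Hypothetical capture helper; substitute any Readable that yields raw
// PCM16 chunks (for example, from a recorder library).
declare function getMicPcm16Stream(): Readable;

await voice.send(getMicPcm16Stream(), 'mic-1'); // 'mic-1' is sent as event_id
```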
@@ -368,7 +380,7 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
    * });
    */
   async answer({ options }) {
-    this.
+    this.sendEvent("response.create", { response: options ?? {} });
   }
   /**
    * Registers an event listener for voice events.
@@ -437,29 +449,105 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
     }
   }
   setupEventListeners() {
-
-
+    const speakerStreams = /* @__PURE__ */ new Map();
+    this.ws.on("message", (message) => {
+      const data = JSON.parse(message.toString());
+      this.client.emit(data.type, data);
+      if (this.debug) {
+        const { delta, ...fields } = data;
+        console.log(data.type, fields, delta?.length < 100 ? delta : "");
+      }
     });
-    this.client.on("
-    this.emit("
+    this.client.on("session.created", (ev) => {
+      this.emit("session.created", ev);
     });
-    this.client.on("
-    this.emit("
+    this.client.on("session.updated", (ev) => {
+      this.emit("session.updated", ev);
     });
-    this.client.on("
-
-
-
+    this.client.on("response.created", (ev) => {
+      this.emit("response.created", ev);
+      const speakerStream = new PassThrough();
+      speakerStream.id = ev.response.id;
+      speakerStreams.set(ev.response.id, speakerStream);
+      this.emit("speaker", speakerStream);
     });
-    this.client.on("conversation.item.
-    this.emit("
+    this.client.on("conversation.item.input_audio_transcription.delta", (ev) => {
+      this.emit("transcribing", { text: ev.delta, response_id: ev.response_id, role: "user" });
     });
-    this.client.on("conversation.item.
-
-
-
-
+    this.client.on("conversation.item.input_audio_transcription.done", (ev) => {
+      this.emit("transcribing", { text: "\n", response_id: ev.response_id, role: "user" });
+    });
+    this.client.on("response.audio.delta", (ev) => {
+      const audio = Buffer.from(ev.delta, "base64");
+      this.emit("speaking", { audio, response_id: ev.response_id });
+      const stream = speakerStreams.get(ev.response_id);
+      stream?.write(audio);
+    });
+    this.client.on("response.audio.done", (ev) => {
+      this.emit("speaking.done", { response_id: ev.response_id });
+      const stream = speakerStreams.get(ev.response_id);
+      stream?.end();
+    });
+    this.client.on("response.audio_transcript.delta", (ev) => {
+      this.emit("writing", { text: ev.delta, response_id: ev.response_id });
     });
+    this.client.on("response.audio_transcript.done", (ev) => {
+      this.emit("writing", { text: "\n", response_id: ev.response_id });
+    });
+    this.client.on("response.text.delta", (ev) => {
+      this.emit("writing", { text: ev.delta, response_id: ev.response_id });
+    });
+    this.client.on("response.text.done", (ev) => {
+      this.emit("writing", { text: "\n", response_id: ev.response_id });
+    });
+    this.client.on("response.done", (ev) => {
+      this.handleFunctionCalls(ev);
+      this.emit("response.done", ev);
+      speakerStreams.delete(ev.response.id);
+    });
+  }
+  async handleFunctionCalls(ev) {
+    for (const output of ev.response?.output ?? []) {
+      if (output.type === "function_call") {
+        await this.handleFunctionCall(output);
+      }
+    }
+  }
+  async handleFunctionCall(output) {
+    try {
+      const context = JSON.parse(output.arguments);
+      const tool = this.tools?.[output.name];
+      if (!tool) {
+        console.warn(`Tool "${output.name}" not found`);
+        return;
+      }
+      const result = await tool?.execute?.(
+        { context },
+        {
+          toolCallId: "unknown",
+          messages: []
+        }
+      );
+      this.sendEvent("conversation.item.create", {
+        item: {
+          type: "function_call_output",
+          call_id: output.call_id,
+          output: JSON.stringify(result)
+        }
+      });
+    } catch (e) {
+      const err = e;
+      console.warn(`Error calling tool "${output.name}":`, err.message);
+      this.sendEvent("conversation.item.create", {
+        item: {
+          type: "function_call_output",
+          call_id: output.call_id,
+          output: JSON.stringify({ error: err.message })
+        }
+      });
+    } finally {
+      this.sendEvent("response.create", {});
+    }
   }
   int16ArrayToBase64(int16Array) {
     const buffer = new ArrayBuffer(int16Array.length * 2);
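This hunk is the heart of the rewrite. A single `ws.on('message')` handler re-emits every parsed server event on the internal emitter, and named handlers translate them into Mastra voice events. Each `response.created` allocates a dedicated `PassThrough` that is announced via a `speaker` event, written on `response.audio.delta`, ended on `response.audio.done`, and dropped from the map on `response.done`; completed tool calls are answered with `function_call_output` items followed by a fresh `response.create`. A hedged consumer sketch, continuing the earlier sketches; a file stands in for a real audio sink, and the PCM format of the deltas should be confirmed before playback:

```typescript
import { createWriteStream } from 'fs';
import type { PassThrough } from 'stream';

// One PassThrough per response; stream.id is set to the response id above.
voice.on('speaker', (stream: PassThrough & { id?: string }) => {
  stream.pipe(createWriteStream(`response-${stream.id}.pcm`));
});
// Assistant transcript deltas arrive as 'writing'; user-side whisper-1
// transcription arrives as 'transcribing'.
voice.on('transcribing', ({ text }: { text: string }) => process.stdout.write(`you: ${text}`));
await voice.speak('Tell me a short joke.');
```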
@@ -474,6 +562,14 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
     }
     return btoa(binary);
   }
+  sendEvent(type, data) {
+    this.ws.send(
+      JSON.stringify({
+        type,
+        ...data
+      })
+    );
+  }
 };

 export { OpenAIRealtimeVoice };
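Every outbound message now funnels through the new `sendEvent`, which spreads the payload next to `type`; callers therefore key their objects exactly as the Realtime API expects (`{ session }`, `{ response }`, `{ audio }`). For instance, `updateConfig({ voice: 'ash' })` serializes to a single frame:

```typescript
// sendEvent("session.update", { session: { voice: "ash" } }) writes this
// one JSON frame to the class's WebSocket:
const frame = JSON.stringify({ type: 'session.update', session: { voice: 'ash' } });
// => {"type":"session.update","session":{"voice":"ash"}}
```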
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@mastra/voice-openai-realtime",
-  "version": "0.0
+  "version": "0.1.0-alpha.1",
   "description": "Mastra OpenAI Realtime API integration",
   "type": "module",
   "main": "dist/index.js",
@@ -20,16 +20,18 @@
   },
   "dependencies": {
     "openai-realtime-api": "^1.0.7",
+    "ws": "^8.18.1",
     "zod-to-json-schema": "^3.24.1",
-    "@mastra/core": "^0.
+    "@mastra/core": "^0.7.0-alpha.1"
   },
   "devDependencies": {
     "@microsoft/api-extractor": "^7.49.2",
     "@types/node": "^22.13.1",
-    "
+    "@types/ws": "^8.18.0",
+    "eslint": "^9.23.0",
     "tsup": "^8.3.6",
     "typescript": "^5.7.3",
-    "vitest": "^2.1.
+    "vitest": "^2.1.9",
     "@internal/lint": "0.0.1"
   },
   "scripts": {