@mastra/voice-openai-realtime 0.0.5-alpha.0 → 0.1.0-alpha.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +7 -7
- package/CHANGELOG.md +23 -0
- package/dist/_tsup-dts-rollup.d.cts +18 -18
- package/dist/_tsup-dts-rollup.d.ts +18 -18
- package/dist/index.cjs +159 -61
- package/dist/index.js +159 -61
- package/package.json +6 -4
- package/src/index.ts +204 -89
- package/src/utils.ts +1 -0
package/dist/index.js
CHANGED
@@ -1,6 +1,7 @@
+import { EventEmitter } from 'events';
+import { PassThrough, Readable } from 'stream';
 import { MastraVoice } from '@mastra/core/voice';
-import {
-import { Readable } from 'stream';
+import { WebSocket } from 'ws';
 import { zodToJsonSchema } from 'zod-to-json-schema';

 // src/index.ts
@@ -27,6 +28,7 @@ var transformTools = (tools) => {
       continue;
     }
     const openaiTool = {
+      type: "function",
       name,
       description: tool.description || `Tool: ${name}`,
       parameters
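transformTools now tags each converted tool with `type: "function"`, which the Realtime `session.update` payload expects. A sketch of the resulting entry for a hypothetical tool (the name and JSON schema below are illustrative; in the real code the schema comes from zodToJsonSchema):

```typescript
// One entry of the `tools` array pushed via session.update, as assembled by
// transformTools. Name and parameter schema are made up for illustration.
const openaiTool = {
  type: 'function',
  name: 'getWeather',
  description: 'Tool: getWeather', // falls back to `Tool: ${name}` when no description is set
  parameters: {
    type: 'object',
    properties: { city: { type: 'string' } },
    required: ['city'],
  },
};
```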
@@ -64,19 +66,17 @@ var isReadableStream = (obj) => {

 // src/index.ts
 var DEFAULT_VOICE = "alloy";
+var DEFAULT_URL = "wss://api.openai.com/v1/realtime";
 var DEFAULT_MODEL = "gpt-4o-mini-realtime-preview-2024-12-17";
-var DEFAULT_VAD_CONFIG = {
-  type: "server_vad",
-  threshold: 0.5,
-  prefix_padding_ms: 1e3,
-  silence_duration_ms: 1e3
-};
 var VOICES = ["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"];
 var OpenAIRealtimeVoice = class extends MastraVoice {
-
+  ws;
   state;
+  client;
   events;
+  instructions;
   tools;
+  debug;
   /**
    * Creates a new instance of OpenAIRealtimeVoice.
    *
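With the hard-coded DEFAULT_VAD_CONFIG gone, server-side voice activity detection is no longer applied implicitly. A minimal sketch of restoring the old behaviour through updateConfig, assuming the session still accepts the same turn_detection shape the removed constant carried:

```typescript
import { OpenAIRealtimeVoice } from '@mastra/voice-openai-realtime';

const voice = new OpenAIRealtimeVoice();
await voice.connect();

// Re-apply the values that used to ship as DEFAULT_VAD_CONFIG
// (1e3 in the old constant is 1000 ms).
voice.updateConfig({
  turn_detection: {
    type: 'server_vad',
    threshold: 0.5,
    prefix_padding_ms: 1000,
    silence_duration_ms: 1000,
  },
});
```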
@@ -85,13 +85,8 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
    * @param options.chatModel.model - The model ID to use (defaults to GPT-4 Mini Realtime)
    * @param options.chatModel.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
    * @param options.chatModel.tools - Tools configuration for the model
-   * @param options.chatModel.options - Additional options for the realtime client
-   * @param options.chatModel.options.sessionConfig - Session configuration overrides
-   * @param options.chatModel.options.url - Custom WebSocket URL
-   * @param options.chatModel.options.dangerouslyAllowAPIKeyInBrowser - Whether to allow API key in browser
-   * @param options.chatModel.options.debug - Enable debug logging
-   * @param options.chatModel.options.tools - Additional tools configuration
    * @param options.speaker - Voice ID to use (defaults to 'alloy')
+   * @param options.debug - Enable debug mode
    *
    * @example
    * ```typescript
@@ -106,25 +101,26 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
    */
   constructor({
     chatModel,
-    speaker
+    speaker,
+    debug = false
   } = {}) {
     super();
-
-
-
-
-
-
-      turn_detection: DEFAULT_VAD_CONFIG,
-      ...chatModel?.options?.sessionConfig
+    const url = `${chatModel?.url || DEFAULT_URL}?model=${chatModel?.model || DEFAULT_MODEL}`;
+    const apiKey = chatModel?.apiKey || process.env.OPENAI_API_KEY;
+    this.ws = new WebSocket(url, void 0, {
+      headers: {
+        Authorization: "Bearer " + apiKey,
+        "OpenAI-Beta": "realtime=v1"
       }
     });
+    this.client = new EventEmitter();
     this.state = "close";
     this.events = {};
+    this.tools = chatModel?.tools;
+    this.instructions = chatModel?.instructions;
+    this.speaker = speaker || DEFAULT_VOICE;
+    this.debug = debug;
     this.setupEventListeners();
-    if (chatModel?.tools) {
-      this.addTools(chatModel.tools);
-    }
   }
   /**
    * Returns a list of available voice speakers.
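The constructor now opens the WebSocket itself (with the Authorization and OpenAI-Beta headers) instead of delegating to a realtime client, and takes `speaker` and `debug` at the top level. A hedged construction sketch using the fields the new constructor actually reads from `chatModel`:

```typescript
import { OpenAIRealtimeVoice } from '@mastra/voice-openai-realtime';

// url and model fall back to DEFAULT_URL / DEFAULT_MODEL, apiKey to
// process.env.OPENAI_API_KEY when omitted.
const voice = new OpenAIRealtimeVoice({
  chatModel: {
    model: 'gpt-4o-mini-realtime-preview-2024-12-17',
    apiKey: process.env.OPENAI_API_KEY,
    instructions: 'You are a concise voice assistant.',
  },
  speaker: 'alloy', // must be one of the VOICES entries
  debug: true,      // logs every incoming realtime event to the console
});
```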
@@ -150,8 +146,8 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
    * ```
    */
   close() {
-    if (!this.
-    this.
+    if (!this.ws) return;
+    this.ws.close();
     this.state = "close";
   }
   /**
@@ -171,10 +167,10 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
    * ```
    */
   addTools(tools) {
-    const
-
-
-    }
+    const openaiTools = transformTools(tools);
+    this.updateConfig({
+      tools: openaiTools.map((t) => t.openaiTool)
+    });
   }
   /**
    * Emits a speaking event using the configured voice model.
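addTools no longer talks to a realtime client; it runs the tools through transformTools and pushes them in a `session.update`. A sketch of calling it with one hypothetical tool; the exact field names for the schema follow Mastra's tool conventions and are assumed here:

```typescript
import { z } from 'zod';

// Assumes `voice` is an OpenAIRealtimeVoice instance. The tool shape below
// (description, zod schema, execute) is illustrative.
voice.addTools({
  getWeather: {
    description: 'Look up the current weather for a city',
    parameters: z.object({ city: z.string() }),
    // execute receives ({ context }, { toolCallId, messages }) when the model
    // calls the function (see handleFunctionCall later in this diff).
    execute: async ({ context }: { context: { city: string } }) => {
      return { city: context.city, forecast: 'sunny' };
    },
  },
});
```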
@@ -210,7 +206,7 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
     if (input.trim().length === 0) {
       throw new Error("Input text is empty");
     }
-    this.
+    this.sendEvent("response.create", {
       response: {
         instructions: `Repeat the following text: ${input}`,
         voice: options?.speaker ? options.speaker : void 0
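speak is now a thin wrapper over a raw `response.create` event whose instructions ask the model to repeat the input, with an optional per-call voice override. A usage sketch, assuming the public speak signature matches the fields read above:

```typescript
// Assumes `voice` is a connected OpenAIRealtimeVoice instance.
await voice.speak('Hello from the realtime API', { speaker: 'shimmer' });

// Audio does not come back as a return value; it arrives through the
// "speaking" / "speaker" events wired up in setupEventListeners.
voice.on('speaking', ({ audio }) => {
  // `audio` is a Buffer of PCM data decoded from the base64 delta.
});
```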
@@ -236,7 +232,7 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
    * ```
    */
   updateConfig(sessionConfig) {
-    this.
+    this.sendEvent("session.update", { session: sessionConfig });
   }
   /**
    * Processes audio input for speech recognition.
@@ -271,14 +267,14 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
       const buffer = Buffer.concat(chunks);
       const int16Array = new Int16Array(buffer.buffer, buffer.byteOffset ?? 0, (buffer.byteLength ?? 0) / 2);
       const base64Audio = this.int16ArrayToBase64(int16Array);
-      this.
+      this.sendEvent("conversation.item.create", {
         item: {
           type: "message",
           role: "user",
           content: [{ type: "input_audio", audio: base64Audio }]
         }
       });
-      this.
+      this.sendEvent("response.create", {
         response: {
           modalities: ["text"],
           instructions: `ONLY repeat the input and DO NOT say anything else`
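listen now packages the PCM input as a base64 user message and forces a text-only response. A sketch of collecting the result through the "writing" event; the Int16Array source and the exact listen signature are assumptions:

```typescript
// Assumes `voice` is connected. One second of silence at 16 kHz as a stand-in
// for real microphone audio.
const pcmAudio = new Int16Array(16000);

let transcript = '';
voice.on('writing', ({ text }) => {
  transcript += text; // text deltas emitted while the model responds
});
voice.on('response.done', () => {
  console.log(transcript.trim());
});

await voice.listen(pcmAudio);
```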
@@ -288,6 +284,16 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
       this.emit("error", new Error("Unsupported audio data format"));
     }
   }
+  waitForOpen() {
+    return new Promise((resolve) => {
+      this.ws.on("open", resolve);
+    });
+  }
+  waitForSessionCreated() {
+    return new Promise((resolve) => {
+      this.client.on("session.created", resolve);
+    });
+  }
   /**
    * Establishes a connection to the OpenAI realtime service.
    * Must be called before using speak, listen, or relay functions.
@@ -301,8 +307,17 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
    * ```
    */
   async connect() {
-    await this.
-    await this.
+    await this.waitForOpen();
+    await this.waitForSessionCreated();
+    const openaiTools = transformTools(this.tools);
+    this.updateConfig({
+      instructions: this.instructions,
+      tools: openaiTools.map((t) => t.openaiTool),
+      input_audio_transcription: {
+        model: "whisper-1"
+      },
+      voice: this.speaker
+    });
     this.state = "open";
   }
   /**
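connect waits for the socket `open` event and the server's `session.created` before pushing one consolidated `session.update` (instructions, tools, whisper-1 input transcription, voice). Call-order sketch:

```typescript
import { OpenAIRealtimeVoice } from '@mastra/voice-openai-realtime';

const voice = new OpenAIRealtimeVoice({ speaker: 'alloy' });

// Resolves once the WebSocket is open and session.created has arrived,
// then applies instructions/tools/voice in a single session.update.
await voice.connect();

await voice.speak('Ready when you are.');

voice.close(); // closes the underlying WebSocket and resets state to "close"
```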
@@ -323,7 +338,7 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
    * await voice.relay(micStream);
    * ```
    */
-  async send(audioData) {
+  async send(audioData, eventId) {
     if (!this.state || this.state !== "open") {
       console.warn("Cannot relay audio when not open. Call open() first.");
       return;
@@ -333,15 +348,14 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
       stream.on("data", (chunk) => {
         try {
           const buffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
-
-          this.client.appendInputAudio(int16Array);
+          this.sendEvent("input_audio_buffer.append", { audio: buffer.toString("base64"), event_id: eventId });
         } catch (err) {
           this.emit("error", err);
         }
       });
     } else if (audioData instanceof Int16Array) {
       try {
-        this.
+        this.sendEvent("input_audio_buffer.append", { audio: audioData, event_id: eventId });
       } catch (err) {
         this.emit("error", err);
       }
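send streams audio straight into `input_audio_buffer.append` events, base64-encoding Buffer chunks and passing Int16Array data through, with the new optional eventId forwarded as event_id. A relay sketch with a stand-in PCM stream:

```typescript
import { Readable } from 'stream';

// Assumes `voice` is connected. 100 ms of 16-bit silence stands in for a
// real microphone stream.
const micStream = Readable.from([Buffer.alloc(3200)]);

// Every chunk becomes an input_audio_buffer.append event; the second argument
// is attached to each event as event_id.
await voice.send(micStream, 'mic-session-1');
```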
@@ -368,7 +382,7 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
    * });
    */
   async answer({ options }) {
-    this.
+    this.sendEvent("response.create", { response: options ?? {} });
   }
   /**
    * Registers an event listener for voice events.
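answer simply forwards its options as the `response` payload of a `response.create` event. A sketch; the payload fields follow the OpenAI realtime response options rather than anything package-specific:

```typescript
// Ask the model to respond to the conversation built up so far.
await voice.answer({
  options: {
    modalities: ['text', 'audio'],
    instructions: 'Answer briefly.',
  },
});
```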
@@ -437,30 +451,106 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
     }
   }
   setupEventListeners() {
-
-
+    const speakerStreams = /* @__PURE__ */ new Map();
+    this.ws.on("message", (message) => {
+      const data = JSON.parse(message.toString());
+      this.client.emit(data.type, data);
+      if (this.debug) {
+        const { delta, ...fields } = data;
+        console.log(data.type, fields, delta?.length < 100 ? delta : "");
+      }
     });
-    this.client.on("
-      this.emit("
+    this.client.on("session.created", (ev) => {
+      this.emit("session.created", ev);
     });
-    this.client.on("
-      this.emit("
+    this.client.on("session.updated", (ev) => {
+      this.emit("session.updated", ev);
     });
-    this.client.on("
-
-
-
+    this.client.on("response.created", (ev) => {
+      this.emit("response.created", ev);
+      const speakerStream = new PassThrough();
+      speakerStream.id = ev.response.id;
+      speakerStreams.set(ev.response.id, speakerStream);
+      this.emit("speaker", speakerStream);
     });
-    this.client.on("conversation.item.
-      this.emit("
+    this.client.on("conversation.item.input_audio_transcription.delta", (ev) => {
+      this.emit("transcribing", { text: ev.delta, response_id: ev.response_id, role: "user" });
     });
-    this.client.on("conversation.item.
-
-
-
-
+    this.client.on("conversation.item.input_audio_transcription.done", (ev) => {
+      this.emit("transcribing", { text: "\n", response_id: ev.response_id, role: "user" });
+    });
+    this.client.on("response.audio.delta", (ev) => {
+      const audio = Buffer.from(ev.delta, "base64");
+      this.emit("speaking", { audio, response_id: ev.response_id });
+      const stream = speakerStreams.get(ev.response_id);
+      stream?.write(audio);
+    });
+    this.client.on("response.audio.done", (ev) => {
+      this.emit("speaking.done", { response_id: ev.response_id });
+      const stream = speakerStreams.get(ev.response_id);
+      stream?.end();
+    });
+    this.client.on("response.audio_transcript.delta", (ev) => {
+      this.emit("writing", { text: ev.delta, response_id: ev.response_id });
+    });
+    this.client.on("response.audio_transcript.done", (ev) => {
+      this.emit("writing", { text: "\n", response_id: ev.response_id });
+    });
+    this.client.on("response.text.delta", (ev) => {
+      this.emit("writing", { text: ev.delta, response_id: ev.response_id });
+    });
+    this.client.on("response.text.done", (ev) => {
+      this.emit("writing", { text: "\n", response_id: ev.response_id });
+    });
+    this.client.on("response.done", async (ev) => {
+      await this.handleFunctionCalls(ev);
+      this.emit("response.done", ev);
+      speakerStreams.delete(ev.response.id);
     });
   }
+  async handleFunctionCalls(ev) {
+    for (const output of ev.response?.output ?? []) {
+      if (output.type === "function_call") {
+        await this.handleFunctionCall(output);
+      }
+    }
+  }
+  async handleFunctionCall(output) {
+    try {
+      const context = JSON.parse(output.arguments);
+      const tool = this.tools?.[output.name];
+      if (!tool) {
+        console.warn(`Tool "${output.name}" not found`);
+        return;
+      }
+      const result = await tool?.execute?.(
+        { context },
+        {
+          toolCallId: "unknown",
+          messages: []
+        }
+      );
+      this.sendEvent("conversation.item.create", {
+        item: {
+          type: "function_call_output",
+          call_id: output.call_id,
+          output: JSON.stringify(result)
+        }
+      });
+    } catch (e) {
+      const err = e;
+      console.warn(`Error calling tool "${output.name}":`, err.message);
+      this.sendEvent("conversation.item.create", {
+        item: {
+          type: "function_call_output",
+          call_id: output.call_id,
+          output: JSON.stringify({ error: err.message })
+        }
+      });
+    } finally {
+      this.sendEvent("response.create", {});
+    }
+  }
   int16ArrayToBase64(int16Array) {
     const buffer = new ArrayBuffer(int16Array.length * 2);
     const view = new DataView(buffer);
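Every raw server message is parsed, re-emitted on the internal EventEmitter, and mapped to higher-level voice events, with one PassThrough stream per response carrying the audio. A listener sketch covering the events the new code emits:

```typescript
// One PassThrough per response; pipe it wherever the PCM should go.
voice.on('speaker', (stream) => {
  stream.pipe(process.stdout); // illustrative sink, normally an audio device
});

// Assistant-side transcript deltas (audio transcript and text deltas).
voice.on('writing', ({ text }) => {
  process.stdout.write(text);
});

// User-side transcription deltas produced by whisper-1.
voice.on('transcribing', ({ text, role }) => {
  console.log(`[${role}] ${text}`);
});

// Fired after any function calls in the response have been handled.
voice.on('response.done', (ev) => {
  console.log('response finished:', ev.response.id);
});
```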
@@ -474,6 +564,14 @@ var OpenAIRealtimeVoice = class extends MastraVoice {
     }
     return btoa(binary);
   }
+  sendEvent(type, data) {
+    this.ws.send(
+      JSON.stringify({
+        type,
+        ...data
+      })
+    );
+  }
 };

 export { OpenAIRealtimeVoice };
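All outgoing traffic funnels through sendEvent, which spreads the payload next to a `type` field and serializes it onto the WebSocket. The wire shape it produces, shown for the `session.update` that updateConfig sends:

```typescript
// updateConfig({ voice: 'alloy' }) puts this frame on the socket;
// every event is the envelope { type, ...data }.
const frame = JSON.stringify({
  type: 'session.update',
  session: { voice: 'alloy' },
});
```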
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@mastra/voice-openai-realtime",
-  "version": "0.0
+  "version": "0.1.0-alpha.2",
   "description": "Mastra OpenAI Realtime API integration",
   "type": "module",
   "main": "dist/index.js",
@@ -20,16 +20,18 @@
   },
   "dependencies": {
     "openai-realtime-api": "^1.0.7",
+    "ws": "^8.18.1",
     "zod-to-json-schema": "^3.24.1",
-    "@mastra/core": "^0.
+    "@mastra/core": "^0.7.0-alpha.2"
   },
   "devDependencies": {
     "@microsoft/api-extractor": "^7.49.2",
     "@types/node": "^22.13.1",
-    "
+    "@types/ws": "^8.18.0",
+    "eslint": "^9.23.0",
     "tsup": "^8.3.6",
     "typescript": "^5.7.3",
-    "vitest": "^2.1.
+    "vitest": "^2.1.9",
     "@internal/lint": "0.0.1"
   },
   "scripts": {