@mastra/voice-openai-realtime 0.0.5-alpha.0 → 0.1.0-alpha.1
This diff reflects the publicly released contents of the two package versions as they appear in their public registry, and is provided for informational purposes only.
- package/.turbo/turbo-build.log +7 -7
- package/CHANGELOG.md +13 -0
- package/dist/_tsup-dts-rollup.d.cts +18 -18
- package/dist/_tsup-dts-rollup.d.ts +18 -18
- package/dist/index.cjs +158 -62
- package/dist/index.js +159 -63
- package/package.json +6 -4
- package/src/index.ts +188 -74
- package/src/utils.ts +1 -0
package/src/index.ts
CHANGED
@@ -1,33 +1,35 @@
 import type { ToolsInput } from '@mastra/core/agent';
 import { MastraVoice } from '@mastra/core/voice';
-import { RealtimeClient } from 'openai-realtime-api';
-import type { Realtime } from 'openai-realtime-api';
 import { isReadableStream, transformTools } from './utils';
-
-
-
-
-type VoiceEventType =
-  | 'speak' // Emitted when starting to speak
-  | 'writing' // Emitted while speaking with audio data
-  | 'error'; // Emitted when an error occurs
+import { WebSocket } from 'ws';
+import { EventEmitter } from 'events';
+import type { Realtime, RealtimeServerEvents } from 'openai-realtime-api';
+import { PassThrough } from 'stream';
 
 /**
  * Event callback function type
  */
 type EventCallback = (...args: any[]) => void;
 
+type StreamWithId = PassThrough & { id: string };
+
 /**
  * Map of event types to their callback arrays
  */
 type EventMap = {
-  [
+  transcribing: [{ text: string }];
+  writing: [{ text: string }];
+  speaking: [{ audio: string }];
+  speaker: [StreamWithId];
+  error: [Error];
 } & {
   [key: string]: EventCallback[];
 };
 
 /** Default voice for text-to-speech. 'alloy' provides a neutral, balanced voice suitable for most use cases */
-const DEFAULT_VOICE = 'alloy';
+const DEFAULT_VOICE: Realtime.Voice = 'alloy';
+
+const DEFAULT_URL = 'wss://api.openai.com/v1/realtime';
 
 /**
  * Default model for real-time voice interactions.
@@ -62,6 +64,13 @@ type TTools = ToolsInput;
  */
 const VOICES = ['alloy', 'ash', 'ballad', 'coral', 'echo', 'sage', 'shimmer', 'verse'];
 
+type RealtimeClientServerEventMap = {
+  [K in RealtimeServerEvents.EventType]: [RealtimeServerEvents.EventMap[K]];
+} & {
+  ['conversation.item.input_audio_transcription.delta']: [{ delta: string; response_id: string }];
+  ['conversation.item.input_audio_transcription.done']: [{ response_id: string }];
+};
+
 /**
  * OpenAIRealtimeVoice provides real-time voice interaction capabilities using OpenAI's
  * WebSocket-based API. It supports:
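The new `RealtimeClientServerEventMap` maps every server event name to a one-element tuple of its payload, so the internal emitter is fully typed. A minimal sketch of the pattern with a hypothetical two-event map, assuming a recent `@types/node` that supports the generic `EventEmitter<T>` used in this diff:

```ts
import { EventEmitter } from 'events';

// Hypothetical map in the same shape as RealtimeClientServerEventMap:
// each key is an event name, each value the tuple of listener arguments.
type DemoEventMap = {
  'session.created': [{ session: { id: string } }];
  'response.audio.delta': [{ delta: string; response_id: string }];
};

const client = new EventEmitter<DemoEventMap>();

// Listener payloads are inferred: `ev.delta` type-checks, a typo would not.
client.on('response.audio.delta', ev => {
  console.log(ev.response_id, ev.delta.length);
});
```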
@@ -94,10 +103,13 @@ const VOICES = ['alloy', 'ash', 'ballad', 'coral', 'echo', 'sage', 'shimmer', 'verse'];
  * ```
  */
 export class OpenAIRealtimeVoice extends MastraVoice {
-  private
+  private ws: WebSocket;
   private state: 'close' | 'open';
+  private client: EventEmitter<RealtimeClientServerEventMap>;
   private events: EventMap;
-
+  private instructions?: string;
+  private tools?: TTools;
+  private debug: boolean;
 
   /**
    * Creates a new instance of OpenAIRealtimeVoice.
@@ -107,13 +119,8 @@ export class OpenAIRealtimeVoice extends MastraVoice {
    * @param options.chatModel.model - The model ID to use (defaults to GPT-4 Mini Realtime)
    * @param options.chatModel.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
    * @param options.chatModel.tools - Tools configuration for the model
-   * @param options.chatModel.options - Additional options for the realtime client
-   * @param options.chatModel.options.sessionConfig - Session configuration overrides
-   * @param options.chatModel.options.url - Custom WebSocket URL
-   * @param options.chatModel.options.dangerouslyAllowAPIKeyInBrowser - Whether to allow API key in browser
-   * @param options.chatModel.options.debug - Enable debug logging
-   * @param options.chatModel.options.tools - Additional tools configuration
    * @param options.speaker - Voice ID to use (defaults to 'alloy')
+   * @param options.debug - Enable debug mode
    *
    * @example
    * ```typescript
@@ -129,40 +136,37 @@ export class OpenAIRealtimeVoice extends MastraVoice {
   constructor({
     chatModel,
     speaker,
+    debug = false,
   }: {
     chatModel?: {
       model?: string;
       apiKey?: string;
       tools?: TTools;
-
-
-        url?: string;
-        dangerouslyAllowAPIKeyInBrowser?: boolean;
-        debug?: boolean;
-        tools?: TTools;
-      };
+      instructions?: string;
+      url?: string;
     };
     speaker?: Realtime.Voice;
+    debug?: boolean;
   } = {}) {
     super();
-
-
-
-
-
-
-
-        ...chatModel?.options?.sessionConfig,
+
+    const url = `${chatModel?.url || DEFAULT_URL}?model=${chatModel?.model || DEFAULT_MODEL}`;
+    const apiKey = chatModel?.apiKey || process.env.OPENAI_API_KEY;
+    this.ws = new WebSocket(url, undefined, {
+      headers: {
+        Authorization: 'Bearer ' + apiKey,
+        'OpenAI-Beta': 'realtime=v1',
       },
     });
 
+    this.client = new EventEmitter();
     this.state = 'close';
     this.events = {} as EventMap;
+    this.tools = chatModel?.tools;
+    this.instructions = chatModel?.instructions;
+    this.speaker = speaker || DEFAULT_VOICE;
+    this.debug = debug;
     this.setupEventListeners();
-
-    if (chatModel?.tools) {
-      this.addTools(chatModel.tools);
-    }
   }
 
   /**
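For reference, a hypothetical instantiation under the new option shape: `url` and `instructions` move directly under `chatModel`, the nested `options` object is gone, and `debug` becomes a top-level flag. The model id shown here is an assumption for illustration, not the package's `DEFAULT_MODEL`:

```ts
import { OpenAIRealtimeVoice } from '@mastra/voice-openai-realtime';

const voice = new OpenAIRealtimeVoice({
  chatModel: {
    apiKey: process.env.OPENAI_API_KEY, // also the fallback when omitted
    model: 'gpt-4o-mini-realtime-preview', // assumed model id
    instructions: 'You are a concise voice assistant.',
  },
  speaker: 'alloy',
  debug: true, // logs each server event received on the socket
});
```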
@@ -190,8 +194,8 @@ export class OpenAIRealtimeVoice extends MastraVoice {
    * ```
    */
   close() {
-    if (!this.
-    this.
+    if (!this.ws) return;
+    this.ws.close();
     this.state = 'close';
   }
 
@@ -212,10 +216,10 @@ export class OpenAIRealtimeVoice extends MastraVoice {
    * ```
    */
   addTools(tools?: TTools) {
-    const
-
-
-    }
+    const openaiTools = transformTools(tools);
+    this.updateConfig({
+      tools: openaiTools.map(t => t.openaiTool),
+    });
   }
 
   /**
@@ -254,7 +258,7 @@ export class OpenAIRealtimeVoice extends MastraVoice {
       throw new Error('Input text is empty');
     }
 
-    this.
+    this.sendEvent('response.create', {
       response: {
         instructions: `Repeat the following text: ${input}`,
         voice: options?.speaker ? options.speaker : undefined,
@@ -280,8 +284,8 @@ export class OpenAIRealtimeVoice extends MastraVoice {
    * });
    * ```
    */
-  updateConfig(sessionConfig:
-    this.
+  updateConfig(sessionConfig: unknown): void {
+    this.sendEvent('session.update', { session: sessionConfig });
   }
 
   /**
@@ -319,7 +323,7 @@ export class OpenAIRealtimeVoice extends MastraVoice {
     const int16Array = new Int16Array(buffer.buffer, buffer.byteOffset ?? 0, (buffer.byteLength ?? 0) / 2);
     const base64Audio = this.int16ArrayToBase64(int16Array);
 
-    this.
+    this.sendEvent('conversation.item.create', {
       item: {
         type: 'message',
         role: 'user',
@@ -327,7 +331,7 @@ export class OpenAIRealtimeVoice extends MastraVoice {
       },
     });
 
-    this.
+    this.sendEvent('response.create', {
       response: {
         modalities: ['text'],
         instructions: `ONLY repeat the input and DO NOT say anything else`,
@@ -338,6 +342,18 @@ export class OpenAIRealtimeVoice extends MastraVoice {
     }
   }
 
+  waitForOpen() {
+    return new Promise(resolve => {
+      this.ws.on('open', resolve);
+    });
+  }
+
+  waitForSessionCreated() {
+    return new Promise(resolve => {
+      this.client.on('session.created', resolve);
+    });
+  }
+
   /**
    * Establishes a connection to the OpenAI realtime service.
    * Must be called before using speak, listen, or relay functions.
@@ -351,8 +367,18 @@ export class OpenAIRealtimeVoice extends MastraVoice {
    * ```
    */
   async connect() {
-    await this.
-    await this.
+    await this.waitForOpen();
+    await this.waitForSessionCreated();
+
+    const openaiTools = transformTools(this.tools);
+    this.updateConfig({
+      instructions: this.instructions,
+      tools: openaiTools.map(t => t.openaiTool),
+      input_audio_transcription: {
+        model: 'whisper-1',
+      },
+      voice: this.speaker,
+    });
     this.state = 'open';
   }
 
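Note the ordering `connect()` now enforces: wait for the WebSocket `open` event, then for the server's `session.created`, then push one `session.update` carrying instructions, tools, voice, and Whisper-based input transcription. A sketch of the resulting call sequence, reusing the hypothetical `voice` instance from above:

```ts
// connect() resolves only after the socket is open and the session exists,
// so speak()/send() are safe to call immediately afterwards.
await voice.connect();
await voice.speak('Hello from the realtime API!');
```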
@@ -374,7 +400,7 @@ export class OpenAIRealtimeVoice extends MastraVoice {
    * await voice.relay(micStream);
    * ```
    */
-  async send(audioData: NodeJS.ReadableStream | Int16Array): Promise<void> {
+  async send(audioData: NodeJS.ReadableStream | Int16Array, eventId?: string): Promise<void> {
     if (!this.state || this.state !== 'open') {
       console.warn('Cannot relay audio when not open. Call open() first.');
       return;
@@ -385,15 +411,14 @@ export class OpenAIRealtimeVoice extends MastraVoice {
       stream.on('data', chunk => {
         try {
           const buffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
-
-          this.client.appendInputAudio(int16Array);
+          this.sendEvent('input_audio_buffer.append', { audio: buffer.toString('base64'), event_id: eventId });
         } catch (err) {
           this.emit('error', err);
         }
       });
     } else if (audioData instanceof Int16Array) {
       try {
-        this.
+        this.sendEvent('input_audio_buffer.append', { audio: audioData, event_id: eventId });
       } catch (err) {
         this.emit('error', err);
       }
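A sketch of relaying audio with the reworked `send()`; the PCM16 source here is a stand-in, and the event id is a caller-chosen label:

```ts
import { PassThrough } from 'stream';

// Stand-in for a real 16-bit PCM microphone stream.
const micStream = new PassThrough();

// Each chunk is base64-encoded and forwarded as an
// 'input_audio_buffer.append' event; the optional second argument tags
// the outgoing events with an event_id of the caller's choosing.
await voice.send(micStream, 'mic-session-1');
```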
@@ -421,7 +446,7 @@ export class OpenAIRealtimeVoice extends MastraVoice {
    * });
    */
   async answer({ options }: { options?: Realtime.ResponseConfig }) {
-    this.
+    this.sendEvent('response.create', { response: options ?? {} });
   }
 
   /**
@@ -496,35 +521,115 @@ export class OpenAIRealtimeVoice extends MastraVoice {
   }
 
   private setupEventListeners(): void {
-
-
+    const speakerStreams = new Map<string, StreamWithId>();
+
+    this.ws.on('message', message => {
+      const data = JSON.parse(message.toString());
+      this.client.emit(data.type, data);
+
+      if (this.debug) {
+        const { delta, ...fields } = data;
+        console.log(data.type, fields, delta?.length < 100 ? delta : '');
+      }
     });
 
-    this.client.on('
-      this.emit('
+    this.client.on('session.created', ev => {
+      this.emit('session.created', ev);
+    });
+    this.client.on('session.updated', ev => {
+      this.emit('session.updated', ev);
     });
+    this.client.on('response.created', ev => {
+      this.emit('response.created', ev);
 
-
-
+      const speakerStream = new PassThrough() as StreamWithId;
+
+      speakerStream.id = ev.response.id;
+
+      speakerStreams.set(ev.response.id, speakerStream);
+      this.emit('speaker', speakerStream);
+    });
+    this.client.on('conversation.item.input_audio_transcription.delta', ev => {
+      this.emit('transcribing', { text: ev.delta, response_id: ev.response_id, role: 'user' });
+    });
+    this.client.on('conversation.item.input_audio_transcription.done', ev => {
+      this.emit('transcribing', { text: '\n', response_id: ev.response_id, role: 'user' });
     });
+    this.client.on('response.audio.delta', ev => {
+      const audio = Buffer.from(ev.delta, 'base64');
+      this.emit('speaking', { audio, response_id: ev.response_id });
 
-
-
-      this.emit('speaking', { audio: delta.audio });
-    }
+      const stream = speakerStreams.get(ev.response_id);
+      stream?.write(audio);
     });
+    this.client.on('response.audio.done', ev => {
+      this.emit('speaking.done', { response_id: ev.response_id });
 
-
-
+      const stream = speakerStreams.get(ev.response_id);
+      stream?.end();
+    });
+    this.client.on('response.audio_transcript.delta', ev => {
+      this.emit('writing', { text: ev.delta, response_id: ev.response_id });
+    });
+    this.client.on('response.audio_transcript.done', ev => {
+      this.emit('writing', { text: '\n', response_id: ev.response_id });
     });
+    this.client.on('response.text.delta', ev => {
+      this.emit('writing', { text: ev.delta, response_id: ev.response_id });
+    });
+    this.client.on('response.text.done', ev => {
+      this.emit('writing', { text: '\n', response_id: ev.response_id });
+    });
+    this.client.on('response.done', ev => {
+      this.handleFunctionCalls(ev);
+      this.emit('response.done', ev);
+      speakerStreams.delete(ev.response.id);
+    });
+  }
 
-
-
-
+  private async handleFunctionCalls(ev: any) {
+    for (const output of ev.response?.output ?? []) {
+      if (output.type === 'function_call') {
+        await this.handleFunctionCall(output);
       }
+    }
+  }
 
-
-
+  private async handleFunctionCall(output: any) {
+    try {
+      const context = JSON.parse(output.arguments);
+      const tool = this.tools?.[output.name];
+      if (!tool) {
+        console.warn(`Tool "${output.name}" not found`);
+        return;
+      }
+      const result = await tool?.execute?.(
+        { context },
+        {
+          toolCallId: 'unknown',
+          messages: [],
+        },
+      );
+      this.sendEvent('conversation.item.create', {
+        item: {
+          type: 'function_call_output',
+          call_id: output.call_id,
+          output: JSON.stringify(result),
+        },
+      });
+    } catch (e) {
+      const err = e as Error;
+      console.warn(`Error calling tool "${output.name}":`, err.message);
+      this.sendEvent('conversation.item.create', {
+        item: {
+          type: 'function_call_output',
+          call_id: output.call_id,
+          output: JSON.stringify({ error: err.message }),
+        },
+      });
+    } finally {
+      this.sendEvent('response.create', {});
+    }
   }
 
   private int16ArrayToBase64(int16Array: Int16Array): string {
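Each `response.created` event now yields a fresh `PassThrough` tagged with the response id; audio deltas are written to it and it ends on `response.audio.done`, so overlapping responses stay separable. A sketch of consuming these events, assuming the `on()` listener registration the voice provider exposes:

```ts
voice.on('speaker', stream => {
  console.log(`audio for response ${stream.id}`);
  stream.pipe(process.stdout); // or pipe into an audio sink such as node-speaker
});

voice.on('writing', ({ text }) => process.stdout.write(text)); // assistant transcript
voice.on('transcribing', ({ text }) => process.stdout.write(text)); // user transcript
```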
@@ -540,4 +645,13 @@ export class OpenAIRealtimeVoice extends MastraVoice {
     }
     return btoa(binary);
   }
+
+  private sendEvent(type: string, data: any) {
+    this.ws.send(
+      JSON.stringify({
+        type: type,
+        ...data,
+      }),
+    );
+  }
 }
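All client traffic now funnels through the private `sendEvent`, which spreads the payload alongside a `type` field, matching the realtime API's JSON wire format. A standalone sketch of the same framing, assuming `ws` is an already-open realtime socket:

```ts
import { WebSocket } from 'ws';

// Every client event is one JSON object: a `type` naming the event plus
// the payload fields spread alongside it.
function sendEvent(ws: WebSocket, type: string, data: Record<string, unknown> = {}) {
  ws.send(JSON.stringify({ type, ...data }));
}

// e.g. commit buffered input audio, then request a text-only response:
// sendEvent(ws, 'input_audio_buffer.commit');
// sendEvent(ws, 'response.create', { response: { modalities: ['text'] } });
```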