@mastra/voice-openai-realtime 0.0.5-alpha.0 → 0.1.0-alpha.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +7 -7
- package/CHANGELOG.md +23 -0
- package/dist/_tsup-dts-rollup.d.cts +18 -18
- package/dist/_tsup-dts-rollup.d.ts +18 -18
- package/dist/index.cjs +159 -61
- package/dist/index.js +159 -61
- package/package.json +6 -4
- package/src/index.ts +204 -89
- package/src/utils.ts +1 -0
package/.turbo/turbo-build.log
CHANGED

@@ -1,23 +1,23 @@
 
-> @mastra/voice-openai-realtime@0.0
+> @mastra/voice-openai-realtime@0.1.0-alpha.2 build /home/runner/work/mastra/mastra/voice/openai-realtime-api
 > tsup src/index.ts --format esm,cjs --experimental-dts --clean --treeshake
 
 CLI Building entry: src/index.ts
 CLI Using tsconfig: tsconfig.json
 CLI tsup v8.4.0
 TSC Build start
-TSC ⚡️ Build success in
+TSC ⚡️ Build success in 9123ms
 DTS Build start
 CLI Target: es2022
 Analysis will use the bundled TypeScript version 5.8.2
 Writing package typings: /home/runner/work/mastra/mastra/voice/openai-realtime-api/dist/_tsup-dts-rollup.d.ts
 Analysis will use the bundled TypeScript version 5.8.2
 Writing package typings: /home/runner/work/mastra/mastra/voice/openai-realtime-api/dist/_tsup-dts-rollup.d.cts
-DTS ⚡️ Build success in
+DTS ⚡️ Build success in 10151ms
 CLI Cleaning output folder
 ESM Build start
 CJS Build start
-CJS dist/index.cjs
-CJS ⚡️ Build success in
-ESM dist/index.js
-ESM ⚡️ Build success in
+CJS dist/index.cjs 17.80 KB
+CJS ⚡️ Build success in 842ms
+ESM dist/index.js 17.75 KB
+ESM ⚡️ Build success in 843ms
package/CHANGELOG.md
CHANGED

@@ -1,5 +1,28 @@
 # @mastra/voice-openai-realtime
 
+## 0.1.0-alpha.2
+
+### Patch Changes
+
+- Updated dependencies [a838fde]
+- Updated dependencies [a8bd4cf]
+- Updated dependencies [7a3eeb0]
+- Updated dependencies [6530ad1]
+  - @mastra/core@0.7.0-alpha.2
+
+## 0.1.0-alpha.1
+
+### Minor Changes
+
+- 443b118: This update removed an external dependency on an unmaintained package and implemented a native websocket connection.
+
+### Patch Changes
+
+- Updated dependencies [0b54522]
+- Updated dependencies [1af25d5]
+- Updated dependencies [27439ad]
+  - @mastra/core@0.7.0-alpha.1
+
 ## 0.0.5-alpha.0
 
 ### Patch Changes
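The 443b118 minor change is the headline of this release: instead of wrapping the unmaintained `openai-realtime-api` client, the package now opens the realtime socket itself with `ws`. A minimal sketch of that handshake, reconstructed from the compiled `dist/index.cjs` shown later in this diff (the endpoint, headers, and JSON envelope format come from that output; the session payload here is illustrative):

```ts
import WebSocket from 'ws';

// Sketch of the native connection the 0.1.0 build performs, per dist/index.cjs below.
const socket = new WebSocket(
  'wss://api.openai.com/v1/realtime?model=gpt-4o-mini-realtime-preview-2024-12-17',
  undefined, // subprotocol list; the package passes void 0 here
  {
    headers: {
      Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
      'OpenAI-Beta': 'realtime=v1',
    },
  },
);

socket.on('open', () => {
  // Every outbound message is a JSON envelope whose `type` names the event.
  socket.send(JSON.stringify({ type: 'session.update', session: { voice: 'alloy' } }));
});

socket.on('message', raw => {
  const event = JSON.parse(raw.toString());
  console.log(event.type); // e.g. 'session.created', 'response.audio.delta'
});
```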
package/dist/_tsup-dts-rollup.d.cts
CHANGED

@@ -43,10 +43,13 @@ export declare type OpenAIExecuteFunction = (args: any) => Promise<any>;
  * ```
  */
 export declare class OpenAIRealtimeVoice extends MastraVoice {
-    private
+    private ws;
     private state;
+    private client;
     private events;
-
+    private instructions?;
+    private tools?;
+    private debug;
     /**
      * Creates a new instance of OpenAIRealtimeVoice.
      *
@@ -55,13 +58,8 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
      * @param options.chatModel.model - The model ID to use (defaults to GPT-4 Mini Realtime)
      * @param options.chatModel.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
      * @param options.chatModel.tools - Tools configuration for the model
-     * @param options.chatModel.options - Additional options for the realtime client
-     * @param options.chatModel.options.sessionConfig - Session configuration overrides
-     * @param options.chatModel.options.url - Custom WebSocket URL
-     * @param options.chatModel.options.dangerouslyAllowAPIKeyInBrowser - Whether to allow API key in browser
-     * @param options.chatModel.options.debug - Enable debug logging
-     * @param options.chatModel.options.tools - Additional tools configuration
      * @param options.speaker - Voice ID to use (defaults to 'alloy')
+     * @param options.debug - Enable debug mode
      *
      * @example
      * ```typescript
@@ -74,20 +72,16 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
      * });
      * ```
      */
-    constructor({ chatModel, speaker, }?: {
+    constructor({ chatModel, speaker, debug, }?: {
         chatModel?: {
             model?: string;
             apiKey?: string;
             tools?: TTools;
-
-
-            url?: string;
-            dangerouslyAllowAPIKeyInBrowser?: boolean;
-            debug?: boolean;
-            tools?: TTools;
-        };
+            instructions?: string;
+            url?: string;
         };
         speaker?: Realtime.Voice;
+        debug?: boolean;
     });
     /**
      * Returns a list of available voice speakers.
@@ -175,7 +169,7 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
      * });
      * ```
      */
-    updateConfig(sessionConfig:
+    updateConfig(sessionConfig: unknown): void;
     /**
      * Processes audio input for speech recognition.
      * Takes a readable stream of audio data and emits a writing event.
@@ -200,6 +194,8 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
      * ```
      */
     listen(audioData: NodeJS.ReadableStream): Promise<void>;
+    waitForOpen(): Promise<unknown>;
+    waitForSessionCreated(): Promise<unknown>;
     /**
      * Establishes a connection to the OpenAI realtime service.
      * Must be called before using speak, listen, or relay functions.
@@ -231,7 +227,7 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
      * await voice.relay(micStream);
      * ```
      */
-    send(audioData: NodeJS.ReadableStream | Int16Array): Promise<void>;
+    send(audioData: NodeJS.ReadableStream | Int16Array, eventId?: string): Promise<void>;
     /**
      * Sends a response to the OpenAI Realtime API.
      *
@@ -304,11 +300,15 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
      */
     private emit;
     private setupEventListeners;
+    private handleFunctionCalls;
+    private handleFunctionCall;
     private int16ArrayToBase64;
+    private sendEvent;
 }
 
 export declare const transformTools: (tools?: TTools_2) => {
     openaiTool: {
+        type: string;
         name: string;
         description: string;
         parameters: {
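Taken together, the typings changes above amount to a small breaking change in the constructor: the `chatModel.options` bag (including `sessionConfig` and `dangerouslyAllowAPIKeyInBrowser`) is gone, `url` and the new `instructions` field sit directly on `chatModel`, and `debug` is now a top-level option. A sketch of consumer code against the new surface, assuming only what the declarations above state (the instruction string is illustrative):

```ts
import { OpenAIRealtimeVoice } from '@mastra/voice-openai-realtime';

const voice = new OpenAIRealtimeVoice({
  chatModel: {
    model: 'gpt-4o-mini-realtime-preview-2024-12-17',
    apiKey: process.env.OPENAI_API_KEY,
    instructions: 'Answer briefly.', // new in 0.1.0: set directly on chatModel
  },
  speaker: 'alloy',
  debug: true, // new in 0.1.0: top-level, replacing chatModel.options.debug
});

// connect() must still be called before speak/listen/send.
await voice.connect();
```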
package/dist/_tsup-dts-rollup.d.ts
CHANGED

@@ -43,10 +43,13 @@ export declare type OpenAIExecuteFunction = (args: any) => Promise<any>;
  * ```
  */
 export declare class OpenAIRealtimeVoice extends MastraVoice {
-    private
+    private ws;
     private state;
+    private client;
     private events;
-
+    private instructions?;
+    private tools?;
+    private debug;
     /**
      * Creates a new instance of OpenAIRealtimeVoice.
      *
@@ -55,13 +58,8 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
      * @param options.chatModel.model - The model ID to use (defaults to GPT-4 Mini Realtime)
      * @param options.chatModel.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
      * @param options.chatModel.tools - Tools configuration for the model
-     * @param options.chatModel.options - Additional options for the realtime client
-     * @param options.chatModel.options.sessionConfig - Session configuration overrides
-     * @param options.chatModel.options.url - Custom WebSocket URL
-     * @param options.chatModel.options.dangerouslyAllowAPIKeyInBrowser - Whether to allow API key in browser
-     * @param options.chatModel.options.debug - Enable debug logging
-     * @param options.chatModel.options.tools - Additional tools configuration
      * @param options.speaker - Voice ID to use (defaults to 'alloy')
+     * @param options.debug - Enable debug mode
      *
      * @example
      * ```typescript
@@ -74,20 +72,16 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
      * });
      * ```
      */
-    constructor({ chatModel, speaker, }?: {
+    constructor({ chatModel, speaker, debug, }?: {
         chatModel?: {
             model?: string;
             apiKey?: string;
             tools?: TTools;
-
-
-            url?: string;
-            dangerouslyAllowAPIKeyInBrowser?: boolean;
-            debug?: boolean;
-            tools?: TTools;
-        };
+            instructions?: string;
+            url?: string;
         };
         speaker?: Realtime.Voice;
+        debug?: boolean;
     });
     /**
      * Returns a list of available voice speakers.
@@ -175,7 +169,7 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
      * });
      * ```
      */
-    updateConfig(sessionConfig:
+    updateConfig(sessionConfig: unknown): void;
     /**
      * Processes audio input for speech recognition.
      * Takes a readable stream of audio data and emits a writing event.
@@ -200,6 +194,8 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
      * ```
      */
     listen(audioData: NodeJS.ReadableStream): Promise<void>;
+    waitForOpen(): Promise<unknown>;
+    waitForSessionCreated(): Promise<unknown>;
     /**
      * Establishes a connection to the OpenAI realtime service.
      * Must be called before using speak, listen, or relay functions.
@@ -231,7 +227,7 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
      * await voice.relay(micStream);
      * ```
      */
-    send(audioData: NodeJS.ReadableStream | Int16Array): Promise<void>;
+    send(audioData: NodeJS.ReadableStream | Int16Array, eventId?: string): Promise<void>;
     /**
      * Sends a response to the OpenAI Realtime API.
      *
@@ -304,11 +300,15 @@ export declare class OpenAIRealtimeVoice extends MastraVoice {
      */
     private emit;
     private setupEventListeners;
+    private handleFunctionCalls;
+    private handleFunctionCall;
     private int16ArrayToBase64;
+    private sendEvent;
 }
 
 export declare const transformTools: (tools?: TTools_2) => {
     openaiTool: {
+        type: string;
         name: string;
         description: string;
         parameters: {
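The same 18-line change lands in both declaration rollups. One detail worth flagging is the `type: string` field added to `transformTools`' return shape: each transformed tool now carries the `type: 'function'` discriminator that the compiled output below sets. A sketch of the resulting object under those typings (the tool name, description, and schema contents are placeholders):

```ts
// Shape emitted per tool by transformTools in 0.1.0, per the typings above.
const openaiTool = {
  type: 'function', // new discriminator field
  name: 'getWeather',
  description: 'Tool: getWeather',
  parameters: { type: 'object', properties: {} }, // JSON Schema via zod-to-json-schema
};
// Arrays of these objects are what updateConfig() sends as session.tools.
```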
package/dist/index.cjs
CHANGED

@@ -1,8 +1,9 @@
 'use strict';
 
-var
-var openaiRealtimeApi = require('openai-realtime-api');
+var events = require('events');
 var stream = require('stream');
+var voice = require('@mastra/core/voice');
+var ws = require('ws');
 var zodToJsonSchema = require('zod-to-json-schema');
 
 // src/index.ts
@@ -29,6 +30,7 @@ var transformTools = (tools) => {
       continue;
     }
     const openaiTool = {
+      type: "function",
       name,
       description: tool.description || `Tool: ${name}`,
       parameters
@@ -66,19 +68,17 @@ var isReadableStream = (obj) => {
 
 // src/index.ts
 var DEFAULT_VOICE = "alloy";
+var DEFAULT_URL = "wss://api.openai.com/v1/realtime";
 var DEFAULT_MODEL = "gpt-4o-mini-realtime-preview-2024-12-17";
-var DEFAULT_VAD_CONFIG = {
-  type: "server_vad",
-  threshold: 0.5,
-  prefix_padding_ms: 1e3,
-  silence_duration_ms: 1e3
-};
 var VOICES = ["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse"];
 var OpenAIRealtimeVoice = class extends voice.MastraVoice {
-
+  ws;
   state;
+  client;
   events;
+  instructions;
   tools;
+  debug;
   /**
    * Creates a new instance of OpenAIRealtimeVoice.
    *
@@ -87,13 +87,8 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
    * @param options.chatModel.model - The model ID to use (defaults to GPT-4 Mini Realtime)
    * @param options.chatModel.apiKey - OpenAI API key. Falls back to process.env.OPENAI_API_KEY
    * @param options.chatModel.tools - Tools configuration for the model
-   * @param options.chatModel.options - Additional options for the realtime client
-   * @param options.chatModel.options.sessionConfig - Session configuration overrides
-   * @param options.chatModel.options.url - Custom WebSocket URL
-   * @param options.chatModel.options.dangerouslyAllowAPIKeyInBrowser - Whether to allow API key in browser
-   * @param options.chatModel.options.debug - Enable debug logging
-   * @param options.chatModel.options.tools - Additional tools configuration
    * @param options.speaker - Voice ID to use (defaults to 'alloy')
+   * @param options.debug - Enable debug mode
    *
    * @example
    * ```typescript
@@ -108,25 +103,26 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
    */
   constructor({
     chatModel,
-    speaker
+    speaker,
+    debug = false
   } = {}) {
     super();
-
-
-
-
-
-
-      turn_detection: DEFAULT_VAD_CONFIG,
-      ...chatModel?.options?.sessionConfig
+    const url = `${chatModel?.url || DEFAULT_URL}?model=${chatModel?.model || DEFAULT_MODEL}`;
+    const apiKey = chatModel?.apiKey || process.env.OPENAI_API_KEY;
+    this.ws = new ws.WebSocket(url, void 0, {
+      headers: {
+        Authorization: "Bearer " + apiKey,
+        "OpenAI-Beta": "realtime=v1"
       }
     });
+    this.client = new events.EventEmitter();
     this.state = "close";
     this.events = {};
+    this.tools = chatModel?.tools;
+    this.instructions = chatModel?.instructions;
+    this.speaker = speaker || DEFAULT_VOICE;
+    this.debug = debug;
     this.setupEventListeners();
-    if (chatModel?.tools) {
-      this.addTools(chatModel.tools);
-    }
   }
   /**
    * Returns a list of available voice speakers.
@@ -152,8 +148,8 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
    * ```
    */
   close() {
-    if (!this.
-    this.
+    if (!this.ws) return;
+    this.ws.close();
     this.state = "close";
   }
   /**
@@ -173,10 +169,10 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
    * ```
    */
   addTools(tools) {
-    const
-
-
-    }
+    const openaiTools = transformTools(tools);
+    this.updateConfig({
+      tools: openaiTools.map((t) => t.openaiTool)
+    });
   }
   /**
    * Emits a speaking event using the configured voice model.
@@ -212,7 +208,7 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
     if (input.trim().length === 0) {
       throw new Error("Input text is empty");
     }
-    this.
+    this.sendEvent("response.create", {
       response: {
         instructions: `Repeat the following text: ${input}`,
         voice: options?.speaker ? options.speaker : void 0
@@ -238,7 +234,7 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
    * ```
    */
   updateConfig(sessionConfig) {
-    this.
+    this.sendEvent("session.update", { session: sessionConfig });
   }
   /**
    * Processes audio input for speech recognition.
@@ -273,14 +269,14 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
       const buffer = Buffer.concat(chunks);
       const int16Array = new Int16Array(buffer.buffer, buffer.byteOffset ?? 0, (buffer.byteLength ?? 0) / 2);
       const base64Audio = this.int16ArrayToBase64(int16Array);
-      this.
+      this.sendEvent("conversation.item.create", {
        item: {
          type: "message",
          role: "user",
          content: [{ type: "input_audio", audio: base64Audio }]
        }
      });
-      this.
+      this.sendEvent("response.create", {
        response: {
          modalities: ["text"],
          instructions: `ONLY repeat the input and DO NOT say anything else`
@@ -290,6 +286,16 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
       this.emit("error", new Error("Unsupported audio data format"));
     }
   }
+  waitForOpen() {
+    return new Promise((resolve) => {
+      this.ws.on("open", resolve);
+    });
+  }
+  waitForSessionCreated() {
+    return new Promise((resolve) => {
+      this.client.on("session.created", resolve);
+    });
+  }
   /**
    * Establishes a connection to the OpenAI realtime service.
    * Must be called before using speak, listen, or relay functions.
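`waitForOpen` and `waitForSessionCreated` are plain one-shot promises over the socket and internal emitter lifecycles, and they are public in the new typings, so a caller can sequence against the connection manually. Given a `voice` instance as in the earlier sketch:

```ts
// Manual sequencing with the now-public lifecycle helpers.
await voice.waitForOpen();           // resolves on the underlying ws 'open' event
await voice.waitForSessionCreated(); // resolves when the server sends session.created
```

Note that, as written, these promises only ever resolve; a failed connection leaves them pending rather than rejecting.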
@@ -303,8 +309,17 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
    * ```
    */
   async connect() {
-    await this.
-    await this.
+    await this.waitForOpen();
+    await this.waitForSessionCreated();
+    const openaiTools = transformTools(this.tools);
+    this.updateConfig({
+      instructions: this.instructions,
+      tools: openaiTools.map((t) => t.openaiTool),
+      input_audio_transcription: {
+        model: "whisper-1"
+      },
+      voice: this.speaker
+    });
     this.state = "open";
   }
   /**
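`connect()` now drives the whole startup sequence itself: wait for the socket, wait for `session.created`, then push a single `session.update` carrying instructions, tools, transcription config, and voice. A sketch of that initial payload, assuming constructor defaults (field names follow the code above):

```ts
// Initial session.update that connect() assembles, under constructor defaults.
const initialSessionUpdate = {
  type: 'session.update',
  session: {
    instructions: undefined, // chatModel.instructions, when provided
    tools: [],               // transformTools(this.tools) output, see above
    input_audio_transcription: { model: 'whisper-1' },
    voice: 'alloy',
  },
};
```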
@@ -325,7 +340,7 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
    * await voice.relay(micStream);
    * ```
    */
-  async send(audioData) {
+  async send(audioData, eventId) {
     if (!this.state || this.state !== "open") {
       console.warn("Cannot relay audio when not open. Call open() first.");
       return;
@@ -335,15 +350,14 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
       stream.on("data", (chunk) => {
         try {
           const buffer = Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk);
-
-          this.client.appendInputAudio(int16Array);
+          this.sendEvent("input_audio_buffer.append", { audio: buffer.toString("base64"), event_id: eventId });
         } catch (err) {
           this.emit("error", err);
         }
       });
     } else if (audioData instanceof Int16Array) {
       try {
-        this.
+        this.sendEvent("input_audio_buffer.append", { audio: audioData, event_id: eventId });
       } catch (err) {
         this.emit("error", err);
       }
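Both branches of `send()` now emit `input_audio_buffer.append` events instead of calling the removed client's `appendInputAudio`. The stream branch base64-encodes each chunk into an envelope like the sketch below; note that the `Int16Array` branch, as published, passes the typed array through untouched, which `JSON.stringify` serializes as an index-keyed object rather than base64, so the two input formats do not produce the same wire payload.

```ts
// Envelope produced by the stream branch of send(), per the code above.
// The event_id value is illustrative.
const appendEvent = {
  type: 'input_audio_buffer.append',
  audio: '<base64-encoded PCM16 chunk>',
  event_id: 'evt_123', // the new optional second parameter of send()
};
```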
@@ -370,7 +384,7 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
    * });
    */
   async answer({ options }) {
-    this.
+    this.sendEvent("response.create", { response: options ?? {} });
   }
   /**
    * Registers an event listener for voice events.
@@ -439,29 +453,105 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
     }
   }
   setupEventListeners() {
-
-
+    const speakerStreams = /* @__PURE__ */ new Map();
+    this.ws.on("message", (message) => {
+      const data = JSON.parse(message.toString());
+      this.client.emit(data.type, data);
+      if (this.debug) {
+        const { delta, ...fields } = data;
+        console.log(data.type, fields, delta?.length < 100 ? delta : "");
+      }
     });
-    this.client.on("
-      this.emit("
+    this.client.on("session.created", (ev) => {
+      this.emit("session.created", ev);
     });
-    this.client.on("
-      this.emit("
+    this.client.on("session.updated", (ev) => {
+      this.emit("session.updated", ev);
     });
-    this.client.on("
-
-
-
+    this.client.on("response.created", (ev) => {
+      this.emit("response.created", ev);
+      const speakerStream = new stream.PassThrough();
+      speakerStream.id = ev.response.id;
+      speakerStreams.set(ev.response.id, speakerStream);
+      this.emit("speaker", speakerStream);
     });
-    this.client.on("conversation.item.
-      this.emit("
+    this.client.on("conversation.item.input_audio_transcription.delta", (ev) => {
+      this.emit("transcribing", { text: ev.delta, response_id: ev.response_id, role: "user" });
     });
-    this.client.on("conversation.item.
-
-
-
-
+    this.client.on("conversation.item.input_audio_transcription.done", (ev) => {
+      this.emit("transcribing", { text: "\n", response_id: ev.response_id, role: "user" });
+    });
+    this.client.on("response.audio.delta", (ev) => {
+      const audio = Buffer.from(ev.delta, "base64");
+      this.emit("speaking", { audio, response_id: ev.response_id });
+      const stream = speakerStreams.get(ev.response_id);
+      stream?.write(audio);
+    });
+    this.client.on("response.audio.done", (ev) => {
+      this.emit("speaking.done", { response_id: ev.response_id });
+      const stream = speakerStreams.get(ev.response_id);
+      stream?.end();
+    });
+    this.client.on("response.audio_transcript.delta", (ev) => {
+      this.emit("writing", { text: ev.delta, response_id: ev.response_id });
     });
+    this.client.on("response.audio_transcript.done", (ev) => {
+      this.emit("writing", { text: "\n", response_id: ev.response_id });
+    });
+    this.client.on("response.text.delta", (ev) => {
+      this.emit("writing", { text: ev.delta, response_id: ev.response_id });
+    });
+    this.client.on("response.text.done", (ev) => {
+      this.emit("writing", { text: "\n", response_id: ev.response_id });
+    });
+    this.client.on("response.done", async (ev) => {
+      await this.handleFunctionCalls(ev);
+      this.emit("response.done", ev);
+      speakerStreams.delete(ev.response.id);
+    });
+  }
+  async handleFunctionCalls(ev) {
+    for (const output of ev.response?.output ?? []) {
+      if (output.type === "function_call") {
+        await this.handleFunctionCall(output);
+      }
+    }
+  }
+  async handleFunctionCall(output) {
+    try {
+      const context = JSON.parse(output.arguments);
+      const tool = this.tools?.[output.name];
+      if (!tool) {
+        console.warn(`Tool "${output.name}" not found`);
+        return;
+      }
+      const result = await tool?.execute?.(
+        { context },
+        {
+          toolCallId: "unknown",
+          messages: []
+        }
+      );
+      this.sendEvent("conversation.item.create", {
+        item: {
+          type: "function_call_output",
+          call_id: output.call_id,
+          output: JSON.stringify(result)
+        }
+      });
+    } catch (e) {
+      const err = e;
+      console.warn(`Error calling tool "${output.name}":`, err.message);
+      this.sendEvent("conversation.item.create", {
+        item: {
+          type: "function_call_output",
+          call_id: output.call_id,
+          output: JSON.stringify({ error: err.message })
+        }
+      });
+    } finally {
+      this.sendEvent("response.create", {});
+    }
   }
   int16ArrayToBase64(int16Array) {
     const buffer = new ArrayBuffer(int16Array.length * 2);
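The rebuilt `setupEventListeners` is the heart of the rewrite: raw socket messages are re-emitted on the internal emitter keyed by `type`, then fanned out to the package's own event names (`speaker`, `speaking`, `writing`, `transcribing`, `response.done`, ...), with one `PassThrough` audio stream per response and tool calls handled on `response.done`. A sketch of a consumer of those events, again given a `voice` instance as above (`speakerOutput` is a hypothetical writable audio sink):

```ts
voice.on('speaker', audioStream => audioStream.pipe(speakerOutput)); // one PassThrough per response
voice.on('writing', ({ text }) => process.stdout.write(text));       // assistant transcript deltas
voice.on('transcribing', ({ text, role }) => console.log(role, text)); // user audio transcription
voice.on('error', err => console.error(err));
```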
@@ -476,6 +566,14 @@ var OpenAIRealtimeVoice = class extends voice.MastraVoice {
     }
     return btoa(binary);
   }
+  sendEvent(type, data) {
+    this.ws.send(
+      JSON.stringify({
+        type,
+        ...data
+      })
+    );
+  }
 };
 
 exports.OpenAIRealtimeVoice = OpenAIRealtimeVoice;