@mastra/voice-xai-realtime 0.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +39 -0
- package/README.md +106 -0
- package/dist/docs/SKILL.md +26 -0
- package/dist/docs/assets/SOURCE_MAP.json +6 -0
- package/dist/docs/references/docs-voice-overview.md +1188 -0
- package/dist/docs/references/reference-voice-xai-realtime.md +267 -0
- package/dist/index.cjs +851 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.ts +91 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +849 -0
- package/dist/index.js.map +1 -0
- package/dist/types.d.ts +181 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/utils.d.ts +25 -0
- package/dist/utils.d.ts.map +1 -0
- package/package.json +67 -0
package/dist/index.cjs
ADDED
|
@@ -0,0 +1,851 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
var stream = require('stream');
|
|
4
|
+
var voice = require('@mastra/core/voice');
|
|
5
|
+
var ws = require('ws');
|
|
6
|
+
var schemaCompat = require('@mastra/schema-compat');
|
|
7
|
+
var zodToJson = require('@mastra/schema-compat/zod-to-json');
|
|
8
|
+
|
|
9
|
+
// src/index.ts
|
|
10
|
+
/**
 * Reports whether `obj` is a Node.js `Readable` that can still be read from.
 *
 * @param {unknown} obj - Candidate value.
 * @returns {boolean} `true` only for `stream.Readable` instances that expose
 *   `read`/`pipe` functions and whose `readable` flag is exactly `true`.
 */
var isReadableStream = (obj) => {
  if (!obj || !(obj instanceof stream.Readable)) {
    return false;
  }
  const hasStreamMethods = typeof obj.read === "function" && typeof obj.pipe === "function";
  return hasStreamMethods && obj.readable === true;
};
|
|
13
|
+
/**
 * Encodes 16-bit samples as base64 text.
 *
 * Every element is written as a little-endian signed 16-bit integer into a
 * fresh buffer, so the result does not depend on the platform byte order.
 *
 * @param {Int16Array} int16Array - Samples to encode.
 * @returns {string} Base64 encoding of the packed little-endian bytes.
 */
var int16ArrayToBase64 = (int16Array) => {
  const byteCount = int16Array.length * 2;
  const bytes = new DataView(new ArrayBuffer(byteCount));
  let offset = 0;
  while (offset < byteCount) {
    bytes.setInt16(offset, int16Array[offset / 2], true);
    offset += 2;
  }
  return Buffer.from(bytes.buffer).toString("base64");
};
|
|
21
|
+
/**
 * Drains an async-iterable stream into a single Buffer.
 *
 * @param {AsyncIterable<Buffer|string|Uint8Array>} stream - Source to consume.
 * @returns {Promise<Buffer>} Concatenation of every chunk; non-Buffer chunks
 *   are converted with `Buffer.from`.
 */
var readableToBuffer = async (stream) => {
  const parts = [];
  for await (const piece of stream) {
    const asBuffer = Buffer.isBuffer(piece) ? piece : Buffer.from(piece);
    parts.push(asBuffer);
  }
  return Buffer.concat(parts);
};
|
|
28
|
+
/**
 * Drains a stream and returns its entire content as base64 text.
 *
 * @param {AsyncIterable<Buffer|string|Uint8Array>} stream - Source to consume.
 * @returns {Promise<string>} Base64 encoding of the concatenated chunks.
 */
var readableToBase64 = async (stream) => {
  const collected = await readableToBuffer(stream);
  return collected.toString("base64");
};
|
|
31
|
+
/**
 * Converts a map of tools into xAI realtime function-tool definitions paired
 * with an executor.
 *
 * A tool is skipped, with a warning, when it has no input schema
 * (`inputSchema` or `parameters`) or no `execute` function.
 *
 * @param {Record<string, object>} [tools] - Tools keyed by name.
 * @param {{ warn: (message: string) => void }} [logger=console] - Receives skip warnings.
 * @returns {Array<{ xaiTool: object, execute: Function }>} One entry per usable tool.
 * @throws {Error} When a tool schema cannot be converted to JSON Schema; the
 *   underlying failure is attached as `cause`.
 */
var transformTools = (tools, logger = console) => {
  const xaiTools = [];
  for (const [name, tool] of Object.entries(tools || {})) {
    let parameters;
    try {
      if ("inputSchema" in tool && tool.inputSchema) {
        parameters = schemaToJsonSchema(tool.inputSchema);
      } else if ("parameters" in tool && tool.parameters) {
        parameters = schemaToJsonSchema(tool.parameters);
      } else {
        // Surface the skip so a missing schema is not mistaken for a registered tool.
        logger.warn(`Skipping xAI realtime tool "${name}" because it has neither inputSchema nor parameters.`);
        continue;
      }
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      // Keep the original error reachable for debugging.
      throw new Error(`Failed to transform xAI realtime tool "${name}" schema: ${message}`, { cause: err });
    }
    if (!tool.execute) {
      logger.warn(`Skipping xAI realtime tool "${name}" because it has no execute function.`);
      continue;
    }
    xaiTools.push({
      xaiTool: {
        type: "function",
        name,
        description: tool.description || `Tool: ${name}`,
        parameters
      },
      // `options` defaults to an empty object so callers may omit it.
      execute: async (args, options = {}) => {
        // Re-checked at call time because the tool object is caller-owned and may change.
        if (!tool.execute) {
          throw new Error(`Tool ${name} has no execute function`);
        }
        const execute = tool.execute;
        const callOptions = {
          toolCallId: options.toolCallId,
          messages: [],
          requestContext: options.requestContext
        };
        return execute(args, callOptions);
      }
    });
  }
  return xaiTools;
};
|
|
74
|
+
/**
 * Converts a tool schema into JSON Schema without the `$schema` key.
 *
 * Zod schemas go through `zodToJsonSchema`; any other schema is first wrapped
 * with `toStandardSchema` and converted for the "input" direction.
 *
 * @param {unknown} schema - Zod type or other supported schema.
 * @returns {object} JSON Schema object with `$schema` deleted.
 */
function schemaToJsonSchema(schema) {
  const jsonSchema = schemaCompat.isZodType(schema)
    ? zodToJson.zodToJsonSchema(schema)
    : schemaCompat.standardSchemaToJSONSchema(schemaCompat.toStandardSchema(schema), { io: "input" });
  delete jsonSchema.$schema;
  return jsonSchema;
}
|
|
84
|
+
|
|
85
|
+
// src/index.ts
|
|
86
|
+
// Connection defaults used when the caller does not override them.
var DEFAULT_URL = "wss://api.x.ai/v1/realtime";
var DEFAULT_MODEL = "grok-voice-think-fast-1.0";
var DEFAULT_VOICE = "eve";
var DEFAULT_TURN_DETECTION = { type: "server_vad" };

// How long to wait for function-call arguments before reporting a timeout (ms).
var FUNCTION_CALL_ARGUMENT_TIMEOUT_MS = 30_000;

// Default audio formats for both directions.
var DEFAULT_AUDIO = {
  input: {
    format: { type: "audio/pcm", rate: 24_000 }
  },
  output: {
    format: { type: "audio/pcm", rate: 24_000 }
  }
};

// Speakers returned by getSpeakers(): [voiceId, name, gender, description].
var XAI_SPEAKERS = [
  ["eve", "Eve", "female", "Energetic, upbeat default voice."],
  ["ara", "Ara", "female", "Warm, friendly conversational voice."],
  ["rex", "Rex", "male", "Confident, clear professional voice."],
  ["sal", "Sal", "neutral", "Smooth, balanced general-purpose voice."],
  ["leo", "Leo", "male", "Authoritative, strong instructional voice."]
].map(([voiceId, name, gender, description]) => ({ voiceId, name, gender, description }));
|
|
129
|
+
/**
 * Realtime voice provider that talks to the xAI realtime API over a WebSocket.
 *
 * The instance owns at most one socket at a time. Client events sent while the
 * socket is not open are queued and flushed once the connection opens. Server
 * events are re-emitted to listeners registered with `on()`, and function-call
 * events are executed against the tools registered with `addTools()`.
 */
var XAIRealtimeVoice = class _XAIRealtimeVoice extends voice.MastraVoice {
  /** Socket currently owned by this instance, if any. */
  ws;
  /** Connection state: "closed", "connecting" or "open". */
  state = "closed";
  /** Event name -> array of listener callbacks. */
  events = new Map();
  /** Client events buffered while the socket is not open. */
  queue = [];
  /** Response id -> PassThrough stream carrying that response's audio. */
  speakerStreams = new Map();
  /** Response id -> bookkeeping for that response's function calls. */
  functionResponses = new Map();
  /** Detach callbacks for audio streams attached through `send()`. */
  audioStreamCleanups = new Set();
  requestContext;
  instructions;
  tools;
  /** Cached result of `transformTools`; reset whenever tools change. */
  transformedTools;
  options;
  debug;
  /** True once `close()` was called; events sent afterwards are rejected, not queued. */
  closedByUser = false;
  /** Promise of the in-flight connection attempt, if any. */
  connectPromise;
  /** Bumped on every connect and cleanup so stale async work can detect it is outdated. */
  sessionGeneration = 0;
  fallbackResponseCounter = 0;

  /**
   * @param {object} [config] - Either a voice config (`realtimeConfig`,
   *   `speechModel` or `listeningModel` present) or flat xAI options.
   */
  constructor(config = {}) {
    const normalizedConfig = _XAIRealtimeVoice.normalizeConfig(config);
    super(normalizedConfig);
    this.options = normalizedConfig.realtimeConfig?.options || {};
    this.instructions = this.options.instructions;
    this.speaker = normalizedConfig.speaker || this.options.speaker || DEFAULT_VOICE;
    this.debug = this.options.debug || false;
  }

  /**
   * Normalizes both accepted config shapes into the shape passed to the base class.
   *
   * @param {object} config - Voice config or flat xAI options.
   * @returns {object} Config with `speaker` and `realtimeConfig` (`model`, `apiKey`, `options`).
   */
  static normalizeConfig(config) {
    if ("realtimeConfig" in config || "speechModel" in config || "listeningModel" in config) {
      const voiceConfig = config;
      const options = voiceConfig.realtimeConfig?.options || {};
      return {
        ...voiceConfig,
        speaker: voiceConfig.speaker || options.speaker || DEFAULT_VOICE,
        realtimeConfig: {
          model: voiceConfig.realtimeConfig?.model || options.model || DEFAULT_MODEL,
          apiKey: voiceConfig.realtimeConfig?.apiKey || options.apiKey,
          options
        }
      };
    }
    const xaiConfig = config;
    return {
      speaker: xaiConfig.speaker || DEFAULT_VOICE,
      realtimeConfig: {
        model: xaiConfig.model || DEFAULT_MODEL,
        apiKey: xaiConfig.apiKey,
        options: xaiConfig
      }
    };
  }

  /** @returns {Promise<Array<object>>} The built-in speaker list. */
  getSpeakers() {
    return Promise.resolve(XAI_SPEAKERS);
  }

  /** @returns {Promise<{ enabled: boolean }>} Always reports listening as enabled. */
  async getListener() {
    return { enabled: true };
  }

  /**
   * Replaces the instructions and pushes them to the server when connected.
   *
   * @param {string} [instructions] - New instructions; nullish becomes "".
   */
  addInstructions(instructions) {
    this.instructions = instructions ?? "";
    if (this.state === "open") {
      this.updateConfig({ instructions: this.instructions });
    }
  }

  /**
   * Replaces the registered tools and pushes them to the server when connected.
   *
   * @param {Record<string, object>} [tools] - Tools keyed by name.
   */
  addTools(tools) {
    this.tools = tools || {};
    // Drop the cache so the next lookup re-transforms the new tool set.
    this.transformedTools = void 0;
    if (this.state === "open") {
      this.updateConfig({ tools: this.buildSessionTools() });
    }
  }

  /**
   * Sends a `session.update` event with the given partial session config.
   *
   * @param {object} sessionConfig - Session fields to update.
   */
  updateConfig(sessionConfig) {
    this.sendEvent({
      type: "session.update",
      session: this.stripUndefined(sessionConfig)
    });
  }

  /**
   * Opens the connection unless one is already open; concurrent callers share
   * the in-flight attempt.
   *
   * @param {{ requestContext?: unknown }} [params] - Context forwarded to tool executions.
   * @returns {Promise<void>} Resolves once the socket is open and configured.
   */
  async connect({ requestContext } = {}) {
    if (this.state === "open") {
      return;
    }
    if (this.state === "connecting" && this.connectPromise) {
      return this.connectPromise;
    }
    const attempt = this.openConnection({ requestContext });
    this.connectPromise = attempt;
    try {
      await attempt;
    } finally {
      // Only clear our own attempt: close() followed by connect() may already
      // have installed a newer promise that must stay shared.
      if (this.connectPromise === attempt) {
        this.connectPromise = void 0;
      }
    }
  }

  /**
   * Creates the socket, waits for it to open, then sends the initial session
   * config and flushes queued events.
   *
   * @param {{ requestContext?: unknown }} [params] - Context forwarded to tool executions.
   * @returns {Promise<void>}
   * @throws {Error} When no credentials are configured or the socket fails to open.
   */
  async openConnection({ requestContext } = {}) {
    const apiKey = this.options.apiKey || this.realtimeConfig?.apiKey || process.env.XAI_API_KEY;
    const ephemeralToken = this.options.ephemeralToken;
    if (!apiKey && !ephemeralToken) {
      throw new Error("xAI API key is required. Set XAI_API_KEY, pass apiKey, or pass ephemeralToken.");
    }
    this.requestContext = requestContext;
    this.closedByUser = false;
    this.state = "connecting";
    this.sessionGeneration += 1;
    const url = this.buildUrl();
    // An ephemeral token travels as a subprotocol; an API key as a bearer header.
    const protocols = ephemeralToken ? [`xai-client-secret.${ephemeralToken}`] : void 0;
    const wsOptions = !ephemeralToken && apiKey ? {
      headers: {
        Authorization: `Bearer ${apiKey}`
      }
    } : void 0;
    const ws$1 = new ws.WebSocket(url, protocols, wsOptions);
    this.ws = ws$1;
    this.setupEventListeners(ws$1);
    try {
      await this.waitForOpen(ws$1);
      if (this.ws !== ws$1) {
        throw new Error("WebSocket connection was superseded before it finished opening");
      }
      this.state = "open";
      this.updateConfig(this.buildInitialSessionConfig());
      this.flushQueue();
    } catch (err) {
      // Reset shared state only if this attempt still owns it; otherwise a
      // stale failure would tear down a newer connection.
      if (this.ws === ws$1) {
        this.cleanupSessionState();
        this.state = "closed";
        this.ws = void 0;
      }
      ws$1.close();
      throw err;
    }
  }

  /** Closes the connection, discards session state and emits "close" if a socket existed. */
  close() {
    const ws = this.ws;
    this.state = "closed";
    this.closedByUser = true;
    this.connectPromise = void 0;
    // Detach first so this socket's own "close" handler ignores the event.
    this.ws = void 0;
    this.cleanupSessionState();
    ws?.close();
    if (ws) {
      this.emit("close", { code: 1e3, reason: "closed" });
    }
  }

  /** Alias for `close()`. */
  disconnect() {
    this.close();
  }

  /**
   * Sends text as a user message and requests a response.
   *
   * @param {string|AsyncIterable} input - Text, or a stream whose content is decoded as UTF-8.
   * @param {{ speaker?: string, response?: object }} [options] - Optional voice change and response overrides.
   * @returns {Promise<void>}
   * @throws {Error} When the text is empty or whitespace only.
   */
  async speak(input, options = {}) {
    const text = typeof input === "string" ? input : (await this.readInputStream(input)).toString("utf-8");
    if (text.trim().length === 0) {
      throw new Error("Input text is empty");
    }
    if (options.speaker && options.speaker !== this.speaker) {
      this.speaker = options.speaker;
      this.updateConfig({ voice: options.speaker });
    }
    this.sendEvent({
      type: "conversation.item.create",
      item: {
        type: "message",
        role: "user",
        content: [{ type: "input_text", text }]
      }
    });
    await this.answer({ response: options.response });
  }

  /**
   * Sends a whole audio stream as input, then by default commits the buffer and
   * requests a response.
   *
   * @param {import('stream').Readable} audioData - Audio to send.
   * @param {{ commit?: boolean, createResponse?: boolean, response?: object }} [options]
   * @returns {Promise<void>}
   */
  async listen(audioData, options = {}) {
    if (!isReadableStream(audioData)) {
      this.emit("error", { message: "Unsupported audio data format" });
      return;
    }
    this.appendAudio(await readableToBase64(audioData));
    if (options.commit ?? true) {
      await this.commitAudioBuffer();
    }
    if (options.createResponse ?? true) {
      await this.answer({ response: options.response });
    }
  }

  /**
   * Streams audio to the server; requires an open connection.
   *
   * Readable streams are forwarded chunk by chunk until they end, close or
   * error, or the session is cleaned up. `Int16Array` input is sent in one event.
   *
   * @param {import('stream').Readable|Int16Array} audioData - Audio source.
   * @param {string} [eventId] - Optional event id attached to each append event.
   * @returns {Promise<void>}
   */
  async send(audioData, eventId) {
    if (this.state !== "open" || !this.ws || this.ws.readyState !== ws.WebSocket.OPEN) {
      this.emit("error", { message: "Cannot send audio before connect() is open" });
      return;
    }
    if (isReadableStream(audioData)) {
      const cleanup = () => {
        audioData.removeListener("data", onData);
        audioData.removeListener("error", onError);
        audioData.removeListener("end", onEnd);
        audioData.removeListener("close", onEnd);
        this.audioStreamCleanups.delete(cleanup);
      };
      const onData = (chunk) => {
        try {
          const buffer = this.normalizeAudioChunk(chunk);
          this.appendAudio(buffer.toString("base64"), eventId);
        } catch (err) {
          this.emitError(err);
          cleanup();
        }
      };
      const onError = (err) => {
        this.emitError(err);
        cleanup();
      };
      const onEnd = () => cleanup();
      // Registered so cleanupSessionState() can detach the stream on close.
      this.audioStreamCleanups.add(cleanup);
      audioData.on("data", onData);
      audioData.on("error", onError);
      audioData.on("end", onEnd);
      audioData.on("close", onEnd);
      return;
    }
    if (audioData instanceof Int16Array) {
      this.appendAudio(int16ArrayToBase64(audioData), eventId);
      return;
    }
    this.emit("error", { message: "Unsupported audio data format" });
  }

  /** Sends `input_audio_buffer.commit`. @param {string} [eventId] */
  async commitAudioBuffer(eventId) {
    this.sendEvent({ type: "input_audio_buffer.commit", event_id: eventId });
  }

  /** Sends `input_audio_buffer.clear`. @param {string} [eventId] */
  async clearAudioBuffer(eventId) {
    this.sendEvent({ type: "input_audio_buffer.clear", event_id: eventId });
  }

  /** Sends `response.cancel`. @param {string} [responseId] @param {string} [eventId] */
  async cancelResponse(responseId, eventId) {
    this.sendEvent({ type: "response.cancel", response_id: responseId, event_id: eventId });
  }

  /**
   * Sends `response.create`, optionally with response overrides.
   *
   * @param {{ response?: object }} [options]
   */
  async answer(options = {}) {
    this.sendEvent({
      type: "response.create",
      ...options.response ? { response: options.response } : {}
    });
  }

  /**
   * Registers a listener for an event.
   *
   * @param {string} event - Event name.
   * @param {Function} callback - Listener.
   */
  on(event, callback) {
    const callbacks = this.events.get(event) || [];
    callbacks.push(callback);
    this.events.set(event, callbacks);
  }

  /**
   * Removes the first registration of `callback` for `event`, if present.
   *
   * @param {string} event - Event name.
   * @param {Function} callback - Listener to remove.
   */
  off(event, callback) {
    const callbacks = this.events.get(event);
    if (!callbacks) {
      return;
    }
    const index = callbacks.indexOf(callback);
    if (index !== -1) {
      callbacks.splice(index, 1);
    }
  }

  /** @returns {string} Endpoint URL with the `model` query parameter set. */
  buildUrl() {
    const baseUrl = this.options.url || DEFAULT_URL;
    const url = new URL(baseUrl);
    url.searchParams.set("model", this.realtimeConfig?.model || this.options.model || DEFAULT_MODEL);
    return url.toString();
  }

  /** @returns {object} Session config sent right after the socket opens. */
  buildInitialSessionConfig() {
    const session = this.options.session || {};
    return this.stripUndefined({
      ...session,
      // Explicit instructions win; otherwise keep the ones supplied in
      // `session` rather than overwriting them with undefined.
      instructions: this.instructions ?? session.instructions,
      voice: this.speaker,
      turn_detection: session.turn_detection ?? this.options.turnDetection ?? DEFAULT_TURN_DETECTION,
      audio: session.audio ?? this.options.audio ?? DEFAULT_AUDIO,
      tools: this.buildSessionTools()
    });
  }

  /** @returns {Array<object>} Configured server tools followed by function tools. */
  buildSessionTools() {
    const serverTools = [...this.options.session?.tools || [], ...this.options.serverTools || []];
    const functionTools = this.getTransformedTools().map((tool) => tool.xaiTool);
    return [...serverTools, ...functionTools];
  }

  /** @returns {Array<object>} Transformed tools, computed once per tool set. */
  getTransformedTools() {
    this.transformedTools ??= transformTools(this.tools, this.logger);
    return this.transformedTools;
  }

  /**
   * Attaches message/error/close handlers to a socket. Every handler ignores
   * the event when the socket is no longer the one owned by this instance.
   *
   * @param {object} ws - Socket to observe.
   */
  setupEventListeners(ws) {
    ws.on("message", (message) => {
      if (this.ws !== ws) {
        return;
      }
      try {
        const event = JSON.parse(message.toString());
        this.handleServerEvent(event);
      } catch (err) {
        this.emitError(err);
      }
    });
    ws.on("error", (err) => {
      if (this.ws !== ws) {
        return;
      }
      // While connecting, waitForOpen() reports the failure by rejecting.
      if (this.state === "open") {
        this.emitError(err);
      }
    });
    ws.on("close", (code, reason) => {
      if (this.ws !== ws) {
        return;
      }
      this.state = "closed";
      this.ws = void 0;
      this.connectPromise = void 0;
      this.cleanupSessionState();
      this.emit("close", { code, reason: reason?.toString?.() });
    });
  }

  /**
   * Re-emits a server event under its own type (except "error") and runs the
   * built-in handling for the event types handled below.
   *
   * @param {object} event - Parsed server event.
   */
  handleServerEvent(event) {
    if (this.debug) {
      // Log the delta length instead of the payload itself.
      const { delta, ...fields } = event;
      this.logger.debug(`[xAI realtime] ${event.type}`, { ...fields, deltaLength: delta?.length });
    }
    if (event.type !== "error") {
      this.emit(event.type, event);
    }
    switch (event.type) {
      case "session.created":
      case "session.updated":
      case "response.created":
        if (event.type === "response.created") {
          this.createSpeakerStream(this.getResponseId(event));
        }
        return;
      case "response.output_audio.delta":
      case "response.audio.delta":
        this.handleAudioDelta(event);
        return;
      case "response.output_audio.done":
      case "response.audio.done":
        this.handleAudioDone(event);
        return;
      case "response.text.delta":
      case "response.output_text.delta":
      case "response.audio_transcript.delta":
      case "response.output_audio_transcript.delta":
        this.emit("writing", { text: event.delta || "", response_id: this.getResponseId(event), role: "assistant" });
        return;
      case "response.text.done":
      case "response.output_text.done":
      case "response.audio_transcript.done":
      case "response.output_audio_transcript.done":
        this.emit("writing", { text: "\n", response_id: this.getResponseId(event), role: "assistant" });
        return;
      case "conversation.item.input_audio_transcription.completed":
      case "conversation.item.input_audio_transcription.done":
        this.emit("writing", {
          text: event.transcript || event.text || event.delta || "",
          response_id: this.getResponseId(event),
          role: "user"
        });
        return;
      case "response.function_call_arguments.done":
        this.handleFunctionCallEvent(event);
        return;
      case "response.done":
        // Not awaited; failures are reported instead of becoming unhandled rejections.
        this.handleResponseDone(event).catch((err) => this.emitError(err));
        return;
      case "error":
        this.emit("error", {
          message: event.error?.message || "xAI realtime error",
          code: event.error?.code || event.error?.type,
          details: event
        });
        return;
      default:
        return;
    }
  }

  /** Writes decoded audio to the response's speaker stream and emits "speaking". */
  handleAudioDelta(event) {
    const responseId = this.getResponseId(event);
    const audio = event.delta || "";
    const audioData = Buffer.from(audio, "base64");
    const stream = this.createSpeakerStream(responseId);
    stream.write(audioData);
    this.emit("speaking", { audio, audioData, response_id: responseId });
  }

  /** Emits "speaking.done" and ends the response's speaker stream. */
  handleAudioDone(event) {
    const responseId = this.getResponseId(event);
    this.emit("speaking.done", { response_id: responseId });
    this.endSpeakerStream(responseId);
  }

  /**
   * Starts executing a function call once per call id and tracks it so the
   * conversation can continue when every call of the response has finished.
   *
   * @param {object} event - `response.function_call_arguments.done` event.
   */
  handleFunctionCallEvent(event) {
    const call = this.normalizeFunctionCallEvent(event);
    if (!call) {
      this.emit("error", {
        message: "Invalid xAI function call event",
        details: event
      });
      return;
    }
    const state = this.getFunctionResponseState(call.responseId);
    state.hasFunctionCall = true;
    state.expectedCallIds.add(call.callId);
    if (state.startedCallIds.has(call.callId)) {
      return;
    }
    state.startedCallIds.add(call.callId);
    const pending = this.executeFunctionCall(call)
      // Nothing awaits this promise, so report failures here.
      .catch((err) => this.emitError(err))
      .finally(() => {
        state.completedCallIds.add(call.callId);
        state.pending.delete(pending);
        this.maybeContinueAfterFunctionCalls(call.responseId).catch((err) => this.emitError(err));
      });
    state.pending.add(pending);
  }

  /**
   * Ends the response's audio stream, records the function calls listed in the
   * response output and continues the conversation if they are all complete.
   *
   * @param {object} event - `response.done` event.
   * @returns {Promise<void>}
   */
  async handleResponseDone(event) {
    const responseId = this.getResponseId(event);
    this.endSpeakerStream(responseId);
    const expectedCallIds = this.getFunctionCallIds(event);
    const state = this.functionResponses.get(responseId) || (expectedCallIds.length > 0 ? this.getFunctionResponseState(responseId) : void 0);
    if (!state) {
      return;
    }
    for (const callId of expectedCallIds) {
      state.expectedCallIds.add(callId);
    }
    state.hasFunctionCall ||= expectedCallIds.length > 0;
    state.responseDone = true;
    await this.maybeContinueAfterFunctionCalls(responseId);
  }

  /** @returns {string[]} `call_id`s of the function-call outputs in the event's response. */
  getFunctionCallIds(event) {
    return event.response?.output?.filter((output) => output.type === "function_call" && typeof output.call_id === "string").map((output) => output.call_id) || [];
  }

  /**
   * Parses the call arguments, runs the tool and sends its result (or an error
   * object) back as function output. Results belonging to a session that is no
   * longer current are dropped.
   *
   * @param {object} call - Normalized function call.
   * @returns {Promise<void>}
   */
  async executeFunctionCall(call) {
    const tool = this.tools?.[call.name];
    const parsedArgs = this.parseFunctionArguments(call.arguments);
    if (!parsedArgs.ok) {
      if (!this.isCurrentSession(call.sessionGeneration)) {
        return;
      }
      const message = `Failed to parse xAI function call arguments: ${parsedArgs.error.message}`;
      this.sendFunctionOutput(call.callId, { error: message });
      this.emit("error", {
        message,
        details: {
          call_id: call.callId,
          name: call.name,
          arguments: parsedArgs.rawArguments,
          error: parsedArgs.error
        }
      });
      return;
    }
    const args = parsedArgs.value;
    try {
      if (!tool?.execute) {
        throw new Error(`Tool "${call.name}" not found`);
      }
      this.emit("tool-call-start", {
        toolCallId: call.callId,
        toolName: call.name,
        toolDescription: tool.description,
        args
      });
      const result = await this.executeTool(call.name, call.callId, args);
      if (!this.isCurrentSession(call.sessionGeneration)) {
        return;
      }
      this.emit("tool-call-result", {
        toolCallId: call.callId,
        toolName: call.name,
        toolDescription: tool.description,
        args,
        result
      });
      this.sendFunctionOutput(call.callId, result);
    } catch (err) {
      if (!this.isCurrentSession(call.sessionGeneration)) {
        return;
      }
      const message = err instanceof Error ? err.message : String(err);
      this.sendFunctionOutput(call.callId, { error: message });
      this.emit("error", {
        message,
        details: { call_id: call.callId, name: call.name }
      });
    }
  }

  /**
   * Runs a transformed tool by name.
   *
   * @param {string} name - Tool name.
   * @param {string} callId - Call id passed to the tool as `toolCallId`.
   * @param {unknown} args - Parsed arguments.
   * @returns {Promise<unknown>} The tool's result.
   * @throws {Error} When no transformed tool has that name.
   */
  async executeTool(name, callId, args) {
    const transformedTool = this.getTransformedTools().find((tool) => tool.xaiTool.name === name);
    if (!transformedTool) {
      throw new Error(`Tool "${name}" not found`);
    }
    return transformedTool.execute(args, {
      toolCallId: callId,
      requestContext: this.requestContext
    });
  }

  /** Sends a `function_call_output` item whose output is the JSON-encoded value. */
  sendFunctionOutput(callId, output) {
    this.sendEvent({
      type: "conversation.item.create",
      item: {
        type: "function_call_output",
        call_id: callId,
        output: JSON.stringify(output ?? null)
      }
    });
  }

  /** @returns {boolean} True when connected and still in the given session generation. */
  isCurrentSession(sessionGeneration) {
    return this.state === "open" && this.sessionGeneration === sessionGeneration;
  }

  /**
   * Sends a follow-up `response.create` once the response is done and every
   * expected function call has completed; otherwise schedules the
   * missing-arguments timeout when calls are still outstanding.
   *
   * @param {string} responseId - Response whose calls are tracked.
   * @returns {Promise<void>}
   */
  async maybeContinueAfterFunctionCalls(responseId) {
    const state = this.functionResponses.get(responseId);
    const hasPendingExpectedCall = state?.expectedCallIds && [...state.expectedCallIds].some((callId) => !state.completedCallIds.has(callId));
    if (state && hasPendingExpectedCall) {
      this.scheduleMissingFunctionCallTimeout(responseId, state);
    }
    if (!state || !state.hasFunctionCall || state.continuationSent || !state.responseDone || state.pending.size > 0 || hasPendingExpectedCall) {
      return;
    }
    state.continuationSent = true;
    this.clearMissingFunctionCallTimeout(state);
    this.sendEvent({ type: "response.create" });
    this.functionResponses.delete(responseId);
  }

  /**
   * Arms a one-shot timer that answers calls whose arguments never arrived with
   * an error output, so the conversation is not blocked indefinitely.
   *
   * @param {string} responseId - Response being tracked.
   * @param {object} state - Tracking state for that response.
   */
  scheduleMissingFunctionCallTimeout(responseId, state) {
    if (state.missingCallTimeout || !state.responseDone) {
      return;
    }
    state.missingCallTimeout = setTimeout(() => {
      // Ignore timers that outlived their session or tracking state.
      if (!this.isCurrentSession(state.sessionGeneration) || this.functionResponses.get(responseId) !== state) {
        return;
      }
      state.missingCallTimeout = void 0;
      const missingCallIds = [...state.expectedCallIds].filter((callId) => !state.startedCallIds.has(callId));
      if (missingCallIds.length === 0) {
        this.maybeContinueAfterFunctionCalls(responseId).catch((err) => this.emitError(err));
        return;
      }
      const message = `Timed out waiting for xAI function call arguments for ${missingCallIds.join(", ")}`;
      for (const callId of missingCallIds) {
        state.completedCallIds.add(callId);
        this.sendFunctionOutput(callId, { error: message });
      }
      this.emit("error", { message, details: { response_id: responseId, call_ids: missingCallIds } });
      this.maybeContinueAfterFunctionCalls(responseId).catch((err) => this.emitError(err));
    }, FUNCTION_CALL_ARGUMENT_TIMEOUT_MS);
  }

  /** Cancels the missing-arguments timer of a tracking state, if armed. */
  clearMissingFunctionCallTimeout(state) {
    if (state.missingCallTimeout) {
      clearTimeout(state.missingCallTimeout);
      state.missingCallTimeout = void 0;
    }
  }

  /** @returns {object} Tracking state for the response, created on first use. */
  getFunctionResponseState(responseId) {
    let state = this.functionResponses.get(responseId);
    if (!state) {
      state = {
        pending: new Set(),
        expectedCallIds: new Set(),
        startedCallIds: new Set(),
        completedCallIds: new Set(),
        sessionGeneration: this.sessionGeneration,
        responseDone: false,
        continuationSent: false,
        hasFunctionCall: false
      };
      this.functionResponses.set(responseId, state);
    }
    return state;
  }

  /**
   * @returns {object|undefined} Normalized call, or undefined when `call_id`,
   *   `name` or string `arguments` is missing.
   */
  normalizeFunctionCallEvent(event) {
    if (!event.call_id || !event.name || typeof event.arguments !== "string") {
      return void 0;
    }
    return {
      responseId: this.getResponseId(event),
      callId: event.call_id,
      name: event.name,
      arguments: event.arguments,
      sessionGeneration: this.sessionGeneration
    };
  }

  /**
   * Parses JSON arguments without throwing; an empty string parses as `{}`.
   *
   * @returns {{ ok: true, value: unknown } | { ok: false, rawArguments: string, error: unknown }}
   */
  parseFunctionArguments(args) {
    try {
      return { ok: true, value: JSON.parse(args || "{}") };
    } catch (err) {
      return { ok: false, rawArguments: args, error: err };
    }
  }

  /** Sends `input_audio_buffer.append` with base64 audio. */
  appendAudio(audio, eventId) {
    this.sendEvent({ type: "input_audio_buffer.append", audio, event_id: eventId });
  }

  /**
   * @param {Buffer|ArrayBuffer|ArrayBufferView} chunk - Audio chunk.
   * @returns {Buffer} The chunk as a Buffer.
   * @throws {TypeError} For any other chunk type.
   */
  normalizeAudioChunk(chunk) {
    if (Buffer.isBuffer(chunk)) {
      return chunk;
    }
    if (chunk instanceof ArrayBuffer) {
      return Buffer.from(chunk);
    }
    if (ArrayBuffer.isView(chunk)) {
      return Buffer.from(chunk.buffer, chunk.byteOffset, chunk.byteLength);
    }
    throw new TypeError("Audio stream chunks must be Buffer, ArrayBuffer, or TypedArray values");
  }

  /**
   * Sends an event now, or queues it when the socket is not open. After
   * `close()` the event is rejected with an "error" emission instead.
   *
   * @param {object} event - Client event.
   */
  sendEvent(event) {
    if (!this.ws || this.ws.readyState !== ws.WebSocket.OPEN || this.state !== "open") {
      if (this.closedByUser) {
        this.emit("error", { message: "Cannot send event after close()" });
        return;
      }
      this.queue.push(event);
      return;
    }
    try {
      this.ws.send(JSON.stringify(this.stripUndefined(event)));
    } catch (err) {
      this.emitError(err);
    }
  }

  /** Sends every queued event in order. */
  flushQueue() {
    const queuedEvents = this.queue.splice(0, this.queue.length);
    for (const event of queuedEvents) {
      this.sendEvent(event);
    }
  }

  /**
   * Waits until the socket opens.
   *
   * @param {object} ws$1 - Socket to wait on.
   * @returns {Promise<void>} Rejects if the socket errors or closes first.
   */
  waitForOpen(ws$1) {
    return new Promise((resolve, reject) => {
      if (ws$1.readyState === ws.WebSocket.OPEN) {
        resolve();
        return;
      }
      // A socket already closing or closed emits no further lifecycle events
      // here, so waiting would never settle.
      if (ws$1.readyState === ws.WebSocket.CLOSING || ws$1.readyState === ws.WebSocket.CLOSED) {
        reject(new Error("WebSocket closed before opening"));
        return;
      }
      const onOpen = () => {
        cleanup();
        resolve();
      };
      const onError = (err) => {
        cleanup();
        reject(err);
      };
      const onClose = () => {
        cleanup();
        reject(new Error("WebSocket closed before opening"));
      };
      const cleanup = () => {
        ws$1.off?.("open", onOpen);
        ws$1.off?.("error", onError);
        ws$1.off?.("close", onClose);
      };
      ws$1.on("open", onOpen);
      ws$1.on("error", onError);
      ws$1.on("close", onClose);
    });
  }

  /**
   * @returns {string} The event's response id, falling back to its item id and
   *   finally to a freshly generated `fallback:<generation>:<counter>` id.
   */
  getResponseId(event) {
    return event.response_id || event.response?.id || event.item_id || `fallback:${this.sessionGeneration}:${++this.fallbackResponseCounter}`;
  }

  /**
   * Returns the response's audio stream, creating it and emitting "speaker" on first use.
   *
   * @param {string} responseId - Response id.
   * @returns {import('stream').PassThrough}
   */
  createSpeakerStream(responseId) {
    const existing = this.speakerStreams.get(responseId);
    if (existing) {
      return existing;
    }
    const stream$1 = new stream.PassThrough();
    stream$1.id = responseId;
    this.speakerStreams.set(responseId, stream$1);
    this.emit("speaker", stream$1);
    return stream$1;
  }

  /** Ends and forgets every speaker stream. */
  closeSpeakerStreams() {
    for (const stream of this.speakerStreams.values()) {
      stream.end();
    }
    this.speakerStreams.clear();
  }

  /** Ends and forgets the speaker stream of one response. */
  endSpeakerStream(responseId) {
    this.speakerStreams.get(responseId)?.end();
    this.speakerStreams.delete(responseId);
  }

  /**
   * Discards per-session state: queued events, attached audio streams,
   * function-call tracking and timers, request context and speaker streams.
   */
  cleanupSessionState() {
    this.sessionGeneration += 1;
    this.queue.length = 0;
    // Iterate a copy because each cleanup removes itself from the set.
    for (const cleanup of [...this.audioStreamCleanups]) {
      cleanup();
    }
    this.audioStreamCleanups.clear();
    for (const state of this.functionResponses.values()) {
      this.clearMissingFunctionCallTimeout(state);
    }
    this.functionResponses.clear();
    this.requestContext = void 0;
    this.closeSpeakerStreams();
  }

  /**
   * Calls every listener registered for `event`.
   *
   * @param {string} event - Event name.
   * @param {...unknown} args - Arguments passed to each listener.
   */
  emit(event, ...args) {
    const callbacks = this.events.get(event);
    if (!callbacks) {
      return;
    }
    // Iterate a snapshot so a listener calling off() during dispatch cannot
    // cause the next listener to be skipped.
    for (const callback of [...callbacks]) {
      callback(...args);
    }
  }

  /** Emits "error" with the error's message and the original value as `details`. */
  emitError(err) {
    this.emit("error", {
      message: err instanceof Error ? err.message : String(err),
      details: err
    });
  }

  /**
   * Returns a deep copy of `value` without `undefined` properties and array
   * items. Binary values and objects that define `toJSON` are returned as-is;
   * a circular reference is replaced by `undefined`.
   *
   * @param {unknown} value - Value to clean.
   * @param {WeakSet<object>} [seen] - Objects on the current traversal path.
   * @param {number} [depth=0] - Current nesting depth.
   * @param {number} [maxDepth=100] - Nesting depth at which an error is thrown.
   * @returns {unknown} Cleaned value.
   * @throws {Error} When nesting reaches `maxDepth`.
   */
  stripUndefined(value, seen = new WeakSet(), depth = 0, maxDepth = 100) {
    if (!value || typeof value !== "object") {
      return value;
    }
    if (depth >= maxDepth) {
      throw new Error("Cannot serialize xAI realtime event: maximum object depth exceeded");
    }
    const objectValue = value;
    if (Buffer.isBuffer(value) || ArrayBuffer.isView(value) || value instanceof ArrayBuffer) {
      return value;
    }
    // Let JSON.stringify serialize values such as Date through their own
    // toJSON instead of flattening them into an empty object.
    if (typeof value.toJSON === "function") {
      return value;
    }
    if (seen.has(objectValue)) {
      return void 0;
    }
    seen.add(objectValue);
    if (Array.isArray(value)) {
      const result2 = value.map((item) => this.stripUndefined(item, seen, depth + 1, maxDepth)).filter((item) => item !== void 0);
      seen.delete(objectValue);
      return result2;
    }
    const result = Object.fromEntries(
      Object.entries(value).filter(([, entry]) => entry !== void 0).map(([key, entry]) => [key, this.stripUndefined(entry, seen, depth + 1, maxDepth)]).filter(([, entry]) => entry !== void 0)
    );
    seen.delete(objectValue);
    return result;
  }

  /**
   * Collects a text input stream into one Buffer.
   *
   * @param {AsyncIterable} input - Stream of Buffer, binary-view, ArrayBuffer or string chunks.
   * @returns {Promise<Buffer>} Concatenated bytes.
   */
  async readInputStream(input) {
    const chunks = [];
    for await (const chunk of input) {
      if (Buffer.isBuffer(chunk)) {
        chunks.push(chunk);
      } else if (ArrayBuffer.isView(chunk)) {
        // Copy the bytes; String(chunk) would produce "1,2,3"-style text.
        chunks.push(Buffer.from(chunk.buffer, chunk.byteOffset, chunk.byteLength));
      } else if (chunk instanceof ArrayBuffer) {
        chunks.push(Buffer.from(chunk));
      } else {
        chunks.push(Buffer.from(String(chunk)));
      }
    }
    return Buffer.concat(chunks);
  }
};
|
|
848
|
+
|
|
849
|
+
exports.XAIRealtimeVoice = XAIRealtimeVoice;
|
|
850
|
+
//# sourceMappingURL=index.cjs.map
|
|
851
|
+
//# sourceMappingURL=index.cjs.map
|