@mastra/voice-xai-realtime 0.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js ADDED
@@ -0,0 +1,849 @@
1
+ import { PassThrough, Readable } from 'stream';
2
+ import { MastraVoice } from '@mastra/core/voice';
3
+ import { WebSocket } from 'ws';
4
+ import { isZodType, standardSchemaToJSONSchema, toStandardSchema } from '@mastra/schema-compat';
5
+ import { zodToJsonSchema } from '@mastra/schema-compat/zod-to-json';
6
+
7
+ // src/index.ts
8
/**
 * Reports whether a value is a Node.js `Readable` that can still be consumed.
 *
 * @param {unknown} obj - Candidate value.
 * @returns {boolean} `true` only for `Readable` instances that expose `read`
 *   and `pipe` functions and whose `readable` flag is exactly `true`.
 */
const isReadableStream = (obj) => {
  // `instanceof` is already false for null, undefined and primitives.
  if (!(obj instanceof Readable)) {
    return false;
  }
  const hasStreamMethods = typeof obj.read === "function" && typeof obj.pipe === "function";
  return hasStreamMethods && obj.readable === true;
};
11
/**
 * Encodes 16-bit signed samples as base64.
 *
 * Each sample is written explicitly as little-endian so the output does not
 * depend on the byte order of the host platform.
 *
 * @param {Int16Array} int16Array - Samples to encode.
 * @returns {string} Base64 text of the little-endian sample bytes.
 */
const int16ArrayToBase64 = (int16Array) => {
  const BYTES_PER_SAMPLE = 2;
  const view = new DataView(new ArrayBuffer(int16Array.length * BYTES_PER_SAMPLE));
  for (let index = 0; index < int16Array.length; index += 1) {
    view.setInt16(index * BYTES_PER_SAMPLE, int16Array[index], true);
  }
  return Buffer.from(view.buffer).toString("base64");
};
19
/**
 * Drains an async-iterable stream into a single Buffer.
 *
 * @param {AsyncIterable<Buffer | string | Uint8Array>} stream - Stream to read
 *   to completion.
 * @returns {Promise<Buffer>} All chunks concatenated in arrival order.
 */
const readableToBuffer = async (stream) => {
  const collected = [];
  for await (const chunk of stream) {
    // Non-Buffer chunks (for example strings) are converted with Buffer.from.
    collected.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(chunk));
  }
  return Buffer.concat(collected);
};
26
/**
 * Drains a stream and returns its full contents as base64 text.
 *
 * @param {AsyncIterable<Buffer | string | Uint8Array>} stream - Stream to read
 *   to completion.
 * @returns {Promise<string>} Base64 encoding of everything the stream produced.
 */
const readableToBase64 = async (stream) => {
  const contents = await readableToBuffer(stream);
  return contents.toString("base64");
};
29
/**
 * Converts a map of tools into xAI realtime function-tool definitions paired
 * with an `execute` wrapper.
 *
 * A tool is skipped, with a warning, when it is not an object, has neither
 * `inputSchema` nor `parameters`, or has no `execute` function.
 *
 * @param {Record<string, object> | undefined | null} tools - Tools keyed by name.
 * @param {{ warn: (message: string) => void }} [logger=console] - Receives a
 *   warning for every skipped tool.
 * @returns {Array<{ xaiTool: object, execute: (args: unknown, options?: object) => unknown }>}
 *   One entry per usable tool, in the iteration order of `tools`.
 * @throws {Error} When a tool's schema cannot be converted to JSON Schema; the
 *   underlying error is attached as `cause`.
 */
const transformTools = (tools, logger = console) => {
  const xaiTools = [];
  for (const [name, tool] of Object.entries(tools || {})) {
    // Guard first: property lookups on null or primitives would otherwise throw.
    if (tool === null || (typeof tool !== "object" && typeof tool !== "function")) {
      logger.warn(`Skipping xAI realtime tool "${name}" because it is not a tool object.`);
      continue;
    }

    // `inputSchema` takes precedence over `parameters`.
    const schema = tool.inputSchema || tool.parameters;
    if (!schema) {
      logger.warn(`Skipping xAI realtime tool "${name}" because it has no inputSchema or parameters.`);
      continue;
    }

    let parameters;
    try {
      parameters = schemaToJsonSchema(schema);
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      throw new Error(`Failed to transform xAI realtime tool "${name}" schema: ${message}`, { cause: err });
    }

    if (!tool.execute) {
      logger.warn(`Skipping xAI realtime tool "${name}" because it has no execute function.`);
      continue;
    }

    xaiTools.push({
      xaiTool: {
        type: "function",
        name,
        description: tool.description || `Tool: ${name}`,
        parameters,
      },
      execute: async (args, options = {}) => {
        // Re-checked at call time because the tool object may have been mutated
        // after it was registered.
        if (!tool.execute) {
          throw new Error(`Tool ${name} has no execute function`);
        }
        const execute = tool.execute;
        const callOptions = {
          toolCallId: options.toolCallId,
          messages: [],
          requestContext: options.requestContext,
        };
        return execute(args, callOptions);
      },
    });
  }
  return xaiTools;
};
72
/**
 * Converts a tool schema to JSON Schema and removes the top-level `$schema` key.
 *
 * Zod schemas go through `zodToJsonSchema`; everything else is adapted with
 * `toStandardSchema` and converted with `standardSchemaToJSONSchema` using the
 * `input` side of the schema.
 *
 * @param {unknown} schema - Zod schema or other supported schema value.
 * @returns {object} A new JSON Schema object without `$schema`. The object
 *   returned by the converter is left unmodified.
 */
function schemaToJsonSchema(schema) {
  const converted = isZodType(schema)
    ? zodToJsonSchema(schema)
    : standardSchemaToJSONSchema(toStandardSchema(schema), { io: "input" });
  // Copy instead of `delete` so the converter's own object is never mutated.
  const { $schema: _discarded, ...jsonSchema } = converted;
  return jsonSchema;
}
82
+
83
+ // src/index.ts
84
/** WebSocket endpoint used when `options.url` is not provided. */
const DEFAULT_URL = "wss://api.x.ai/v1/realtime";

/** Model name sent as the `model` query parameter when none is configured. */
const DEFAULT_MODEL = "grok-voice-think-fast-1.0";

/** Voice id used when no speaker is configured. */
const DEFAULT_VOICE = "eve";

/** Turn detection sent in the initial session config when none is configured. */
const DEFAULT_TURN_DETECTION = {
  type: "server_vad",
};

/**
 * Milliseconds to wait, after a response is done, for function calls that were
 * announced in the response but whose arguments never arrived.
 */
const FUNCTION_CALL_ARGUMENT_TIMEOUT_MS = 3e4;

/** Audio format sent in the initial session config when none is configured. */
const DEFAULT_AUDIO = {
  input: { format: { type: "audio/pcm", rate: 24e3 } },
  output: { format: { type: "audio/pcm", rate: 24e3 } },
};

/** Speakers returned by `getSpeakers()`: `{ voiceId, name, gender, description }`. */
const XAI_SPEAKERS = [
  ["eve", "Eve", "female", "Energetic, upbeat default voice."],
  ["ara", "Ara", "female", "Warm, friendly conversational voice."],
  ["rex", "Rex", "male", "Confident, clear professional voice."],
  ["sal", "Sal", "neutral", "Smooth, balanced general-purpose voice."],
  ["leo", "Leo", "male", "Authoritative, strong instructional voice."],
].map(([voiceId, name, gender, description]) => ({ voiceId, name, gender, description }));
127
/**
 * Realtime voice provider that exchanges JSON events with the xAI realtime
 * endpoint over a WebSocket.
 *
 * Events sent before the socket is open are queued and flushed once the
 * connection opens. Incoming audio deltas are written to one `PassThrough`
 * stream per response and announced through the `speaker` event. Function
 * calls requested by the model are executed against the registered tools and
 * their outputs are sent back before a follow-up response is requested.
 */
const XAIRealtimeVoice = class _XAIRealtimeVoice extends MastraVoice {
  /** Active WebSocket, or `undefined` when disconnected. */
  ws;
  /** Connection state: "closed", "connecting" or "open". */
  state = "closed";
  /** Registered listeners keyed by event name. */
  events = new Map();
  /** Events waiting for the connection to open. */
  queue = [];
  /** Output audio streams keyed by response id. */
  speakerStreams = new Map();
  /** Function-call bookkeeping keyed by response id. */
  functionResponses = new Map();
  /** Detach callbacks for input streams currently attached through `send()`. */
  audioStreamCleanups = new Set();
  requestContext;
  instructions;
  tools;
  /** Cached result of `transformTools`; reset whenever tools change. */
  transformedTools;
  options;
  debug;
  /** Set by `close()`; makes `sendEvent` report an error instead of queueing. */
  closedByUser = false;
  /** In-flight connection attempt, shared by concurrent `connect()` calls. */
  connectPromise;
  /** Bumped on every open and cleanup so stale async work can be detected. */
  sessionGeneration = 0;
  fallbackResponseCounter = 0;

  /**
   * @param {object} [config={}] - Either a voice config (with `realtimeConfig`,
   *   `speechModel` or `listeningModel`) or a flat xAI options object.
   */
  constructor(config = {}) {
    const normalizedConfig = _XAIRealtimeVoice.normalizeConfig(config);
    super(normalizedConfig);
    this.options = normalizedConfig.realtimeConfig?.options || {};
    this.instructions = this.options.instructions;
    this.speaker = normalizedConfig.speaker || this.options.speaker || DEFAULT_VOICE;
    this.debug = this.options.debug || false;
  }

  /**
   * Normalizes both accepted config shapes into `{ speaker, realtimeConfig }`.
   *
   * @param {object} config - Voice config or flat xAI options.
   * @returns {object} Config with `speaker` and `realtimeConfig.{model, apiKey, options}` filled in.
   */
  static normalizeConfig(config) {
    if ("realtimeConfig" in config || "speechModel" in config || "listeningModel" in config) {
      const voiceConfig = config;
      const options = voiceConfig.realtimeConfig?.options || {};
      return {
        ...voiceConfig,
        speaker: voiceConfig.speaker || options.speaker || DEFAULT_VOICE,
        realtimeConfig: {
          model: voiceConfig.realtimeConfig?.model || options.model || DEFAULT_MODEL,
          apiKey: voiceConfig.realtimeConfig?.apiKey || options.apiKey,
          options,
        },
      };
    }
    // Flat shape: the whole object doubles as the provider options.
    const xaiConfig = config;
    return {
      speaker: xaiConfig.speaker || DEFAULT_VOICE,
      realtimeConfig: {
        model: xaiConfig.model || DEFAULT_MODEL,
        apiKey: xaiConfig.apiKey,
        options: xaiConfig,
      },
    };
  }

  /** @returns {Promise<Array<object>>} The built-in speaker list. */
  getSpeakers() {
    return Promise.resolve(XAI_SPEAKERS);
  }

  /** @returns {Promise<{ enabled: boolean }>} Always `{ enabled: true }`. */
  async getListener() {
    return { enabled: true };
  }

  /**
   * Replaces the session instructions; pushed to the server immediately when open.
   *
   * @param {string | undefined | null} instructions - New instructions; nullish becomes "".
   */
  addInstructions(instructions) {
    this.instructions = instructions ?? "";
    if (this.state === "open") {
      this.updateConfig({ instructions: this.instructions });
    }
  }

  /**
   * Replaces the registered tools; pushed to the server immediately when open.
   *
   * @param {Record<string, object> | undefined | null} tools - Tools keyed by name.
   */
  addTools(tools) {
    this.tools = tools || {};
    this.transformedTools = void 0;
    if (this.state === "open") {
      this.updateConfig({ tools: this.buildSessionTools() });
    }
  }

  /**
   * Sends a `session.update` event with the given partial session config.
   *
   * @param {object} sessionConfig - Session fields; `undefined` values are dropped.
   */
  updateConfig(sessionConfig) {
    this.sendEvent({
      type: "session.update",
      session: this.stripUndefined(sessionConfig),
    });
  }

  /**
   * Opens the connection. A no-op when already open; concurrent calls during a
   * pending attempt share that attempt.
   *
   * @param {{ requestContext?: unknown }} [params] - Context passed to tool executions.
   * @returns {Promise<void>}
   * @throws {Error} When no credentials are available or the socket fails to open.
   */
  async connect({ requestContext } = {}) {
    if (this.state === "open") {
      return;
    }
    if (this.state === "connecting" && this.connectPromise) {
      return this.connectPromise;
    }
    const attempt = this.openConnection({ requestContext });
    this.connectPromise = attempt;
    try {
      await attempt;
    } finally {
      // Do not clear a newer attempt started while this one was settling.
      if (this.connectPromise === attempt) {
        this.connectPromise = void 0;
      }
    }
  }

  /**
   * Creates the WebSocket, waits for it to open, sends the initial session
   * config and flushes queued events.
   *
   * An ephemeral token is passed as an `xai-client-secret.<token>` subprotocol;
   * otherwise the API key is sent as a Bearer `Authorization` header.
   *
   * @param {{ requestContext?: unknown }} [params]
   * @returns {Promise<void>}
   * @throws {Error} When no credentials are available or opening fails.
   */
  async openConnection({ requestContext } = {}) {
    const apiKey = this.options.apiKey || this.realtimeConfig?.apiKey || process.env.XAI_API_KEY;
    const ephemeralToken = this.options.ephemeralToken;
    if (!apiKey && !ephemeralToken) {
      throw new Error("xAI API key is required. Set XAI_API_KEY, pass apiKey, or pass ephemeralToken.");
    }
    this.requestContext = requestContext;
    this.closedByUser = false;
    this.state = "connecting";
    this.sessionGeneration += 1;
    const url = this.buildUrl();
    const protocols = ephemeralToken ? [`xai-client-secret.${ephemeralToken}`] : void 0;
    const wsOptions =
      !ephemeralToken && apiKey
        ? {
            headers: {
              Authorization: `Bearer ${apiKey}`,
            },
          }
        : void 0;
    const ws = new WebSocket(url, protocols, wsOptions);
    this.ws = ws;
    this.setupEventListeners(ws);
    try {
      await this.waitForOpen(ws);
      if (this.ws !== ws) {
        // close() or another attempt replaced this socket while it was opening.
        throw new Error("xAI realtime connection was closed before it finished opening");
      }
      this.state = "open";
      this.updateConfig(this.buildInitialSessionConfig());
      this.flushQueue();
    } catch (err) {
      // Only reset shared state while this attempt still owns the connection;
      // otherwise a newer connection would be torn down by a stale failure.
      if (this.ws === ws) {
        this.cleanupSessionState();
        this.state = "closed";
        this.ws = void 0;
      }
      ws.close();
      throw err;
    }
  }

  /**
   * Closes the connection, discards session state and emits `close` with code
   * 1000 when a socket existed.
   */
  close() {
    const ws = this.ws;
    this.state = "closed";
    this.closedByUser = true;
    this.connectPromise = void 0;
    this.ws = void 0;
    this.cleanupSessionState();
    ws?.close();
    if (ws) {
      // The socket's own close handler ignores sockets that are no longer
      // current, so the event is emitted here instead.
      this.emit("close", { code: 1e3, reason: "closed" });
    }
  }

  /** Alias for `close()`. */
  disconnect() {
    this.close();
  }

  /**
   * Sends text as a user message and requests a response.
   *
   * @param {string | AsyncIterable<unknown>} input - Text, or a stream read fully as UTF-8.
   * @param {{ speaker?: string, response?: object }} [options]
   * @returns {Promise<void>}
   * @throws {Error} When the text is empty or whitespace only.
   */
  async speak(input, options = {}) {
    const text = typeof input === "string" ? input : (await this.readInputStream(input)).toString("utf-8");
    if (text.trim().length === 0) {
      throw new Error("Input text is empty");
    }
    if (options.speaker && options.speaker !== this.speaker) {
      this.speaker = options.speaker;
      this.updateConfig({ voice: options.speaker });
    }
    this.sendEvent({
      type: "conversation.item.create",
      item: {
        type: "message",
        role: "user",
        content: [{ type: "input_text", text }],
      },
    });
    await this.answer({ response: options.response });
  }

  /**
   * Reads an audio stream to completion, appends it to the input buffer, then
   * commits it and requests a response unless disabled.
   *
   * @param {unknown} audioData - Readable stream; anything else emits `error`.
   * @param {{ commit?: boolean, createResponse?: boolean, response?: object }} [options]
   *   `commit` and `createResponse` default to `true`.
   * @returns {Promise<void>}
   */
  async listen(audioData, options = {}) {
    if (!isReadableStream(audioData)) {
      this.emit("error", { message: "Unsupported audio data format" });
      return;
    }
    this.appendAudio(await readableToBase64(audioData));
    if (options.commit ?? true) {
      await this.commitAudioBuffer();
    }
    if (options.createResponse ?? true) {
      await this.answer({ response: options.response });
    }
  }

  /**
   * Streams audio to the input buffer. Requires an open connection.
   *
   * For a readable stream, listeners are attached and every chunk is forwarded
   * as it arrives; the method returns without waiting for the stream to end.
   *
   * @param {unknown} audioData - Readable stream or `Int16Array`.
   * @param {string} [eventId] - Optional `event_id` attached to each append event.
   * @returns {Promise<void>}
   */
  async send(audioData, eventId) {
    if (this.state !== "open" || !this.ws || this.ws.readyState !== WebSocket.OPEN) {
      this.emit("error", { message: "Cannot send audio before connect() is open" });
      return;
    }
    if (isReadableStream(audioData)) {
      const cleanup = () => {
        audioData.removeListener("data", onData);
        audioData.removeListener("error", onError);
        audioData.removeListener("end", onEnd);
        audioData.removeListener("close", onEnd);
        this.audioStreamCleanups.delete(cleanup);
      };
      const onData = (chunk) => {
        try {
          const buffer = this.normalizeAudioChunk(chunk);
          this.appendAudio(buffer.toString("base64"), eventId);
        } catch (err) {
          this.emitError(err);
          cleanup();
        }
      };
      const onError = (err) => {
        this.emitError(err);
        cleanup();
      };
      const onEnd = () => cleanup();
      // Registered so session cleanup can detach the listeners.
      this.audioStreamCleanups.add(cleanup);
      audioData.on("data", onData);
      audioData.on("error", onError);
      audioData.on("end", onEnd);
      audioData.on("close", onEnd);
      return;
    }
    if (audioData instanceof Int16Array) {
      this.appendAudio(int16ArrayToBase64(audioData), eventId);
      return;
    }
    this.emit("error", { message: "Unsupported audio data format" });
  }

  /** Sends `input_audio_buffer.commit`. @param {string} [eventId] */
  async commitAudioBuffer(eventId) {
    this.sendEvent({ type: "input_audio_buffer.commit", event_id: eventId });
  }

  /** Sends `input_audio_buffer.clear`. @param {string} [eventId] */
  async clearAudioBuffer(eventId) {
    this.sendEvent({ type: "input_audio_buffer.clear", event_id: eventId });
  }

  /**
   * Sends `response.cancel`.
   *
   * @param {string} [responseId]
   * @param {string} [eventId]
   */
  async cancelResponse(responseId, eventId) {
    this.sendEvent({ type: "response.cancel", response_id: responseId, event_id: eventId });
  }

  /**
   * Sends `response.create`, including `response` only when provided.
   *
   * @param {{ response?: object }} [options]
   */
  async answer(options = {}) {
    this.sendEvent({
      type: "response.create",
      ...(options.response ? { response: options.response } : {}),
    });
  }

  /**
   * Registers a listener.
   *
   * @param {string} event
   * @param {Function} callback
   */
  on(event, callback) {
    const callbacks = this.events.get(event) || [];
    callbacks.push(callback);
    this.events.set(event, callbacks);
  }

  /**
   * Removes the first registration of a listener, if present.
   *
   * @param {string} event
   * @param {Function} callback
   */
  off(event, callback) {
    const callbacks = this.events.get(event);
    if (!callbacks) {
      return;
    }
    const index = callbacks.indexOf(callback);
    if (index !== -1) {
      callbacks.splice(index, 1);
    }
  }

  /** @returns {string} Endpoint URL with the `model` query parameter set. */
  buildUrl() {
    const baseUrl = this.options.url || DEFAULT_URL;
    const url = new URL(baseUrl);
    url.searchParams.set("model", this.realtimeConfig?.model || this.options.model || DEFAULT_MODEL);
    return url.toString();
  }

  /**
   * Builds the first `session.update` payload. Explicit `options.session`
   * fields win for turn detection and audio; instructions, voice and tools
   * always come from the instance.
   *
   * @returns {object}
   */
  buildInitialSessionConfig() {
    const session = this.options.session || {};
    return this.stripUndefined({
      ...session,
      instructions: this.instructions,
      voice: this.speaker,
      turn_detection: session.turn_detection ?? this.options.turnDetection ?? DEFAULT_TURN_DETECTION,
      audio: session.audio ?? this.options.audio ?? DEFAULT_AUDIO,
      tools: this.buildSessionTools(),
    });
  }

  /** @returns {Array<object>} Configured server tools followed by function tools. */
  buildSessionTools() {
    const serverTools = [...(this.options.session?.tools || []), ...(this.options.serverTools || [])];
    const functionTools = this.getTransformedTools().map((tool) => tool.xaiTool);
    return [...serverTools, ...functionTools];
  }

  /** @returns {Array<object>} Transformed tools, computed once per tool set. */
  getTransformedTools() {
    this.transformedTools ??= transformTools(this.tools, this.logger);
    return this.transformedTools;
  }

  /**
   * Wires message, error and close handlers. Every handler ignores events from
   * a socket that is no longer the current one.
   *
   * @param {WebSocket} ws
   */
  setupEventListeners(ws) {
    ws.on("message", (message) => {
      if (this.ws !== ws) {
        return;
      }
      try {
        const event = JSON.parse(message.toString());
        this.handleServerEvent(event);
      } catch (err) {
        this.emitError(err);
      }
    });
    ws.on("error", (err) => {
      if (this.ws !== ws) {
        return;
      }
      // Errors while connecting are reported by rejecting the connect promise.
      if (this.state === "open") {
        this.emitError(err);
      }
    });
    ws.on("close", (code, reason) => {
      if (this.ws !== ws) {
        return;
      }
      this.state = "closed";
      this.ws = void 0;
      this.connectPromise = void 0;
      this.cleanupSessionState();
      this.emit("close", { code, reason: reason?.toString?.() });
    });
  }

  /**
   * Re-emits a server event under its own type (except `error`) and dispatches
   * it to the matching internal handler.
   *
   * @param {object} event - Parsed server event.
   */
  handleServerEvent(event) {
    if (this.debug) {
      // Log the payload size instead of the payload itself.
      const { delta, ...fields } = event;
      this.logger.debug(`[xAI realtime] ${event.type}`, { ...fields, deltaLength: delta?.length });
    }
    if (event.type !== "error") {
      this.emit(event.type, event);
    }
    switch (event.type) {
      case "session.created":
      case "session.updated":
        return;
      case "response.created":
        this.createSpeakerStream(this.getResponseId(event));
        return;
      case "response.output_audio.delta":
      case "response.audio.delta":
        this.handleAudioDelta(event);
        return;
      case "response.output_audio.done":
      case "response.audio.done":
        this.handleAudioDone(event);
        return;
      case "response.text.delta":
      case "response.output_text.delta":
      case "response.audio_transcript.delta":
      case "response.output_audio_transcript.delta":
        this.emit("writing", { text: event.delta || "", response_id: this.getResponseId(event), role: "assistant" });
        return;
      case "response.text.done":
      case "response.output_text.done":
      case "response.audio_transcript.done":
      case "response.output_audio_transcript.done":
        this.emit("writing", { text: "\n", response_id: this.getResponseId(event), role: "assistant" });
        return;
      case "conversation.item.input_audio_transcription.completed":
      case "conversation.item.input_audio_transcription.done":
        this.emit("writing", {
          text: event.transcript || event.text || event.delta || "",
          response_id: this.getResponseId(event),
          role: "user",
        });
        return;
      case "response.function_call_arguments.done":
        this.handleFunctionCallEvent(event);
        return;
      case "response.done":
        // The handler is async; report failures instead of leaving an
        // unhandled rejection (for example on a malformed `response.output`).
        this.handleResponseDone(event).catch((err) => this.emitError(err));
        return;
      case "error":
        this.emit("error", {
          message: event.error?.message || "xAI realtime error",
          code: event.error?.code || event.error?.type,
          details: event,
        });
        return;
      default:
        return;
    }
  }

  /** Decodes a base64 audio delta, writes it to the response stream and emits `speaking`. */
  handleAudioDelta(event) {
    const responseId = this.getResponseId(event);
    const audio = event.delta || "";
    const audioData = Buffer.from(audio, "base64");
    const stream = this.createSpeakerStream(responseId);
    stream.write(audioData);
    this.emit("speaking", { audio, audioData, response_id: responseId });
  }

  /** Emits `speaking.done` and ends the response's audio stream. */
  handleAudioDone(event) {
    const responseId = this.getResponseId(event);
    this.emit("speaking.done", { response_id: responseId });
    this.endSpeakerStream(responseId);
  }

  /**
   * Starts executing a function call once its arguments are complete. Repeated
   * events for the same call id are ignored.
   */
  handleFunctionCallEvent(event) {
    const call = this.normalizeFunctionCallEvent(event);
    if (!call) {
      this.emit("error", {
        message: "Invalid xAI function call event",
        details: event,
      });
      return;
    }
    const state = this.getFunctionResponseState(call.responseId);
    state.hasFunctionCall = true;
    state.expectedCallIds.add(call.callId);
    if (state.startedCallIds.has(call.callId)) {
      return;
    }
    state.startedCallIds.add(call.callId);
    const pending = this.executeFunctionCall(call)
      .catch((err) => {
        // executeFunctionCall handles tool failures itself, so this only runs
        // when reporting a failure threw (for example from an event listener).
        const message = err instanceof Error ? err.message : String(err);
        this.logger.warn(`xAI realtime function call "${call.name}" handler failed: ${message}`);
      })
      .finally(() => {
        state.completedCallIds.add(call.callId);
        state.pending.delete(pending);
        void this.maybeContinueAfterFunctionCalls(call.responseId);
      });
    state.pending.add(pending);
  }

  /**
   * Ends the response's audio stream and, when the response contained function
   * calls, records them and continues once all of them have completed.
   *
   * @returns {Promise<void>}
   */
  async handleResponseDone(event) {
    const responseId = this.getResponseId(event);
    this.endSpeakerStream(responseId);
    const expectedCallIds = this.getFunctionCallIds(event);
    const state =
      this.functionResponses.get(responseId) ||
      (expectedCallIds.length > 0 ? this.getFunctionResponseState(responseId) : void 0);
    if (!state) {
      return;
    }
    for (const callId of expectedCallIds) {
      state.expectedCallIds.add(callId);
    }
    state.hasFunctionCall ||= expectedCallIds.length > 0;
    state.responseDone = true;
    await this.maybeContinueAfterFunctionCalls(responseId);
  }

  /** @returns {string[]} `call_id` of every `function_call` item in `event.response.output`. */
  getFunctionCallIds(event) {
    return (
      event.response?.output
        ?.filter((output) => output.type === "function_call" && typeof output.call_id === "string")
        .map((output) => output.call_id) || []
    );
  }

  /**
   * Parses the call arguments, runs the tool and sends its output (or an
   * `{ error }` output on failure). Results from a session that is no longer
   * current are dropped.
   *
   * @param {{ responseId: string, callId: string, name: string, arguments: string, sessionGeneration: number }} call
   * @returns {Promise<void>}
   */
  async executeFunctionCall(call) {
    const tool = this.tools?.[call.name];
    const parsedArgs = this.parseFunctionArguments(call.arguments);
    if (!parsedArgs.ok) {
      if (!this.isCurrentSession(call.sessionGeneration)) {
        return;
      }
      const message = `Failed to parse xAI function call arguments: ${parsedArgs.error.message}`;
      this.sendFunctionOutput(call.callId, { error: message });
      this.emit("error", {
        message,
        details: {
          call_id: call.callId,
          name: call.name,
          arguments: parsedArgs.rawArguments,
          error: parsedArgs.error,
        },
      });
      return;
    }
    const args = parsedArgs.value;
    try {
      if (!tool?.execute) {
        throw new Error(`Tool "${call.name}" not found`);
      }
      this.emit("tool-call-start", {
        toolCallId: call.callId,
        toolName: call.name,
        toolDescription: tool.description,
        args,
      });
      const result = await this.executeTool(call.name, call.callId, args);
      if (!this.isCurrentSession(call.sessionGeneration)) {
        return;
      }
      this.emit("tool-call-result", {
        toolCallId: call.callId,
        toolName: call.name,
        toolDescription: tool.description,
        args,
        result,
      });
      this.sendFunctionOutput(call.callId, result);
    } catch (err) {
      if (!this.isCurrentSession(call.sessionGeneration)) {
        return;
      }
      const message = err instanceof Error ? err.message : String(err);
      this.sendFunctionOutput(call.callId, { error: message });
      this.emit("error", {
        message,
        details: { call_id: call.callId, name: call.name },
      });
    }
  }

  /**
   * Runs a transformed tool by name.
   *
   * @throws {Error} When no transformed tool has that name.
   */
  async executeTool(name, callId, args) {
    const transformedTool = this.getTransformedTools().find((tool) => tool.xaiTool.name === name);
    if (!transformedTool) {
      throw new Error(`Tool "${name}" not found`);
    }
    return transformedTool.execute(args, {
      toolCallId: callId,
      requestContext: this.requestContext,
    });
  }

  /** Sends a `function_call_output` item; `output` is JSON-encoded (nullish becomes `null`). */
  sendFunctionOutput(callId, output) {
    this.sendEvent({
      type: "conversation.item.create",
      item: {
        type: "function_call_output",
        call_id: callId,
        output: JSON.stringify(output ?? null),
      },
    });
  }

  /** @returns {boolean} Whether the connection is open and still on the given generation. */
  isCurrentSession(sessionGeneration) {
    return this.state === "open" && this.sessionGeneration === sessionGeneration;
  }

  /**
   * Requests a follow-up response once the response is done and every expected
   * function call has completed. While calls are still outstanding, arms the
   * missing-arguments timeout instead.
   *
   * @returns {Promise<void>}
   */
  async maybeContinueAfterFunctionCalls(responseId) {
    const state = this.functionResponses.get(responseId);
    if (!state) {
      return;
    }
    const hasPendingExpectedCall = [...state.expectedCallIds].some((callId) => !state.completedCallIds.has(callId));
    if (hasPendingExpectedCall) {
      this.scheduleMissingFunctionCallTimeout(responseId, state);
    }
    const notReady =
      !state.hasFunctionCall ||
      state.continuationSent ||
      !state.responseDone ||
      state.pending.size > 0 ||
      hasPendingExpectedCall;
    if (notReady) {
      return;
    }
    state.continuationSent = true;
    this.clearMissingFunctionCallTimeout(state);
    this.sendEvent({ type: "response.create" });
    this.functionResponses.delete(responseId);
  }

  /**
   * Arms a one-shot timer (only after the response is done) that fails any
   * expected call whose arguments never arrived, so the conversation can continue.
   */
  scheduleMissingFunctionCallTimeout(responseId, state) {
    if (state.missingCallTimeout || !state.responseDone) {
      return;
    }
    state.missingCallTimeout = setTimeout(() => {
      if (!this.isCurrentSession(state.sessionGeneration) || this.functionResponses.get(responseId) !== state) {
        return;
      }
      state.missingCallTimeout = void 0;
      const missingCallIds = [...state.expectedCallIds].filter((callId) => !state.startedCallIds.has(callId));
      if (missingCallIds.length === 0) {
        // Everything has started; re-evaluate, which re-arms the timer while
        // started calls are still running.
        void this.maybeContinueAfterFunctionCalls(responseId);
        return;
      }
      const message = `Timed out waiting for xAI function call arguments for ${missingCallIds.join(", ")}`;
      for (const callId of missingCallIds) {
        state.completedCallIds.add(callId);
        this.sendFunctionOutput(callId, { error: message });
      }
      this.emit("error", { message, details: { response_id: responseId, call_ids: missingCallIds } });
      void this.maybeContinueAfterFunctionCalls(responseId);
    }, FUNCTION_CALL_ARGUMENT_TIMEOUT_MS);
  }

  /** Cancels the missing-arguments timer, if armed. */
  clearMissingFunctionCallTimeout(state) {
    if (state.missingCallTimeout) {
      clearTimeout(state.missingCallTimeout);
      state.missingCallTimeout = void 0;
    }
  }

  /** @returns {object} Bookkeeping state for the response, created on first use. */
  getFunctionResponseState(responseId) {
    let state = this.functionResponses.get(responseId);
    if (!state) {
      state = {
        pending: new Set(),
        expectedCallIds: new Set(),
        startedCallIds: new Set(),
        completedCallIds: new Set(),
        sessionGeneration: this.sessionGeneration,
        responseDone: false,
        continuationSent: false,
        hasFunctionCall: false,
      };
      this.functionResponses.set(responseId, state);
    }
    return state;
  }

  /**
   * @returns {object | undefined} Normalized call, or `undefined` when
   *   `call_id`, `name` or a string `arguments` field is missing.
   */
  normalizeFunctionCallEvent(event) {
    if (!event.call_id || !event.name || typeof event.arguments !== "string") {
      return void 0;
    }
    return {
      responseId: this.getResponseId(event),
      callId: event.call_id,
      name: event.name,
      arguments: event.arguments,
      sessionGeneration: this.sessionGeneration,
    };
  }

  /**
   * Parses JSON arguments; an empty string is treated as `{}`.
   *
   * @returns {{ ok: true, value: unknown } | { ok: false, rawArguments: string, error: unknown }}
   */
  parseFunctionArguments(args) {
    try {
      return { ok: true, value: JSON.parse(args || "{}") };
    } catch (err) {
      return { ok: false, rawArguments: args, error: err };
    }
  }

  /** Sends `input_audio_buffer.append` with base64 audio. */
  appendAudio(audio, eventId) {
    this.sendEvent({ type: "input_audio_buffer.append", audio, event_id: eventId });
  }

  /**
   * @returns {Buffer} The chunk as a Buffer.
   * @throws {TypeError} For chunks that are not Buffer, ArrayBuffer or a typed-array view.
   */
  normalizeAudioChunk(chunk) {
    if (Buffer.isBuffer(chunk)) {
      return chunk;
    }
    if (chunk instanceof ArrayBuffer) {
      return Buffer.from(chunk);
    }
    if (ArrayBuffer.isView(chunk)) {
      // Respect the view's window into its backing buffer.
      return Buffer.from(chunk.buffer, chunk.byteOffset, chunk.byteLength);
    }
    throw new TypeError("Audio stream chunks must be Buffer, ArrayBuffer, or TypedArray values");
  }

  /**
   * Sends an event when the socket is open. Otherwise the event is queued,
   * unless `close()` was called, in which case an `error` is emitted.
   */
  sendEvent(event) {
    if (!this.ws || this.ws.readyState !== WebSocket.OPEN || this.state !== "open") {
      if (this.closedByUser) {
        this.emit("error", { message: "Cannot send event after close()" });
        return;
      }
      this.queue.push(event);
      return;
    }
    try {
      this.ws.send(JSON.stringify(this.stripUndefined(event)));
    } catch (err) {
      this.emitError(err);
    }
  }

  /** Sends every queued event in order. */
  flushQueue() {
    // Detach the batch first so events re-queued by sendEvent are not re-read.
    const queuedEvents = this.queue.splice(0, this.queue.length);
    for (const event of queuedEvents) {
      this.sendEvent(event);
    }
  }

  /**
   * @returns {Promise<void>} Resolves on `open`; rejects on `error` or on
   *   `close` before opening.
   */
  waitForOpen(ws) {
    return new Promise((resolve, reject) => {
      if (ws.readyState === WebSocket.OPEN) {
        resolve();
        return;
      }
      const onOpen = () => {
        cleanup();
        resolve();
      };
      const onError = (err) => {
        cleanup();
        reject(err);
      };
      const onClose = () => {
        cleanup();
        reject(new Error("WebSocket closed before opening"));
      };
      const cleanup = () => {
        ws.off?.("open", onOpen);
        ws.off?.("error", onError);
        ws.off?.("close", onClose);
      };
      ws.on("open", onOpen);
      ws.on("error", onError);
      ws.on("close", onClose);
    });
  }

  /**
   * @returns {string} `response_id`, `response.id` or `item_id` from the event;
   *   otherwise a freshly generated `fallback:<generation>:<n>` id.
   */
  getResponseId(event) {
    return (
      event.response_id ||
      event.response?.id ||
      event.item_id ||
      `fallback:${this.sessionGeneration}:${++this.fallbackResponseCounter}`
    );
  }

  /** @returns {PassThrough} The response's audio stream, created and announced via `speaker` on first use. */
  createSpeakerStream(responseId) {
    const existing = this.speakerStreams.get(responseId);
    if (existing) {
      return existing;
    }
    const stream = new PassThrough();
    stream.id = responseId;
    this.speakerStreams.set(responseId, stream);
    this.emit("speaker", stream);
    return stream;
  }

  /** Ends and forgets every open audio stream. */
  closeSpeakerStreams() {
    for (const stream of this.speakerStreams.values()) {
      stream.end();
    }
    this.speakerStreams.clear();
  }

  /** Ends and forgets one response's audio stream. */
  endSpeakerStream(responseId) {
    this.speakerStreams.get(responseId)?.end();
    this.speakerStreams.delete(responseId);
  }

  /**
   * Discards all per-session state: queued events, attached input streams,
   * function-call bookkeeping and timers, request context and audio streams.
   * Bumps the generation so in-flight tool results are ignored.
   */
  cleanupSessionState() {
    this.sessionGeneration += 1;
    this.queue.length = 0;
    // Iterate a copy: each cleanup removes itself from the set.
    for (const cleanup of [...this.audioStreamCleanups]) {
      cleanup();
    }
    this.audioStreamCleanups.clear();
    for (const state of this.functionResponses.values()) {
      this.clearMissingFunctionCallTimeout(state);
    }
    this.functionResponses.clear();
    this.requestContext = void 0;
    this.closeSpeakerStreams();
  }

  /**
   * Calls every listener registered for `event` at the time of the call.
   *
   * @param {string} event
   * @param {...unknown} args
   */
  emit(event, ...args) {
    const callbacks = this.events.get(event);
    if (!callbacks) {
      return;
    }
    // Iterate a snapshot so a listener that calls off() during dispatch does
    // not cause the next listener to be skipped.
    for (const callback of [...callbacks]) {
      callback(...args);
    }
  }

  /** Emits `error` with a message derived from `err` and the raw value as `details`. */
  emitError(err) {
    this.emit("error", {
      message: err instanceof Error ? err.message : String(err),
      details: err,
    });
  }

  /**
   * Returns a deep copy of `value` without `undefined` properties or array
   * items. Binary values are returned as-is and circular references are dropped.
   *
   * @param {unknown} value
   * @param {WeakSet<object>} [seen] - Objects on the current traversal path.
   * @param {number} [depth=0]
   * @param {number} [maxDepth=100]
   * @returns {unknown}
   * @throws {Error} When nesting reaches `maxDepth`.
   */
  stripUndefined(value, seen = new WeakSet(), depth = 0, maxDepth = 100) {
    if (!value || typeof value !== "object") {
      return value;
    }
    if (depth >= maxDepth) {
      throw new Error("Cannot serialize xAI realtime event: maximum object depth exceeded");
    }
    const objectValue = value;
    if (Buffer.isBuffer(value) || ArrayBuffer.isView(value) || value instanceof ArrayBuffer) {
      return value;
    }
    if (seen.has(objectValue)) {
      // Already on the current path: a cycle.
      return void 0;
    }
    seen.add(objectValue);
    if (Array.isArray(value)) {
      const items = value
        .map((item) => this.stripUndefined(item, seen, depth + 1, maxDepth))
        .filter((item) => item !== void 0);
      // Removed after the subtree so shared (non-circular) references survive.
      seen.delete(objectValue);
      return items;
    }
    const result = Object.fromEntries(
      Object.entries(value)
        .filter(([, entry]) => entry !== void 0)
        .map(([key, entry]) => [key, this.stripUndefined(entry, seen, depth + 1, maxDepth)])
        .filter(([, entry]) => entry !== void 0),
    );
    seen.delete(objectValue);
    return result;
  }

  /**
   * Drains a text input stream into one Buffer.
   *
   * @param {AsyncIterable<unknown>} input
   * @returns {Promise<Buffer>}
   */
  async readInputStream(input) {
    const chunks = [];
    for await (const chunk of input) {
      if (Buffer.isBuffer(chunk)) {
        chunks.push(chunk);
      } else if (chunk instanceof Uint8Array) {
        // Use the bytes themselves; String() would yield "72,105,...".
        chunks.push(Buffer.from(chunk.buffer, chunk.byteOffset, chunk.byteLength));
      } else {
        chunks.push(Buffer.from(String(chunk)));
      }
    }
    return Buffer.concat(chunks);
  }
};
846
+
847
+ export { XAIRealtimeVoice };
848
+ //# sourceMappingURL=index.js.map
849
+ //# sourceMappingURL=index.js.map