@gravity-platform/openai-realtime 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. package/README.md +104 -0
  2. package/dist/Realtime/constants.d.ts +7 -0
  3. package/dist/Realtime/constants.js +10 -0
  4. package/dist/Realtime/node/executor.d.ts +16 -0
  5. package/dist/Realtime/node/executor.js +112 -0
  6. package/dist/Realtime/node/index.d.ts +7 -0
  7. package/dist/Realtime/node/index.js +146 -0
  8. package/dist/Realtime/service/core/orchestration/SessionOrchestrator.d.ts +9 -0
  9. package/dist/Realtime/service/core/orchestration/SessionOrchestrator.js +333 -0
  10. package/dist/Realtime/service/core/processing/AudioHandler.d.ts +17 -0
  11. package/dist/Realtime/service/core/processing/AudioHandler.js +57 -0
  12. package/dist/Realtime/service/core/processing/ResponseProcessor.d.ts +34 -0
  13. package/dist/Realtime/service/core/processing/ResponseProcessor.js +199 -0
  14. package/dist/Realtime/service/core/processing/TextAccumulator.d.ts +19 -0
  15. package/dist/Realtime/service/core/processing/TextAccumulator.js +75 -0
  16. package/dist/Realtime/service/core/processing/UsageStatsCollector.d.ts +14 -0
  17. package/dist/Realtime/service/core/processing/UsageStatsCollector.js +52 -0
  18. package/dist/Realtime/service/core/streaming/RealtimeSessionRegistry.d.ts +10 -0
  19. package/dist/Realtime/service/core/streaming/RealtimeSessionRegistry.js +32 -0
  20. package/dist/Realtime/service/core/streaming/SessionManager.d.ts +14 -0
  21. package/dist/Realtime/service/core/streaming/SessionManager.js +33 -0
  22. package/dist/Realtime/service/core/streaming/WsClient.d.ts +11 -0
  23. package/dist/Realtime/service/core/streaming/WsClient.js +93 -0
  24. package/dist/Realtime/service/index.d.ts +6 -0
  25. package/dist/Realtime/service/index.js +13 -0
  26. package/dist/Realtime/service/io/events/incoming/builders/AudioAppendBuilder.d.ts +4 -0
  27. package/dist/Realtime/service/io/events/incoming/builders/AudioAppendBuilder.js +15 -0
  28. package/dist/Realtime/service/io/events/incoming/builders/ConversationItemBuilder.d.ts +5 -0
  29. package/dist/Realtime/service/io/events/incoming/builders/ConversationItemBuilder.js +36 -0
  30. package/dist/Realtime/service/io/events/incoming/builders/ResponseCreateBuilder.d.ts +3 -0
  31. package/dist/Realtime/service/io/events/incoming/builders/ResponseCreateBuilder.js +9 -0
  32. package/dist/Realtime/service/io/events/incoming/builders/SessionUpdateBuilder.d.ts +4 -0
  33. package/dist/Realtime/service/io/events/incoming/builders/SessionUpdateBuilder.js +61 -0
  34. package/dist/Realtime/service/io/publishers/WebSocketAudioPublisher.d.ts +28 -0
  35. package/dist/Realtime/service/io/publishers/WebSocketAudioPublisher.js +101 -0
  36. package/dist/Realtime/service/io/websocket/RealtimeWebSocketAudioSubscriber.d.ts +13 -0
  37. package/dist/Realtime/service/io/websocket/RealtimeWebSocketAudioSubscriber.js +94 -0
  38. package/dist/credentials/index.d.ts +14 -0
  39. package/dist/credentials/index.js +19 -0
  40. package/dist/index.d.ts +2 -0
  41. package/dist/index.js +54 -0
  42. package/dist/util/types.d.ts +40 -0
  43. package/dist/util/types.js +2 -0
  44. package/package.json +58 -0
@@ -0,0 +1,333 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.SessionOrchestrator = void 0;
4
+ const plugin_base_1 = require("@gravity-platform/plugin-base");
5
+ const WsClient_1 = require("../streaming/WsClient");
6
+ const SessionManager_1 = require("../streaming/SessionManager");
7
+ const ResponseProcessor_1 = require("../processing/ResponseProcessor");
8
+ const RealtimeSessionRegistry_1 = require("../streaming/RealtimeSessionRegistry");
9
+ const SessionUpdateBuilder_1 = require("../../io/events/incoming/builders/SessionUpdateBuilder");
10
+ const ConversationItemBuilder_1 = require("../../io/events/incoming/builders/ConversationItemBuilder");
11
+ const ResponseCreateBuilder_1 = require("../../io/events/incoming/builders/ResponseCreateBuilder");
12
+ const RealtimeWebSocketAudioSubscriber_1 = require("../../io/websocket/RealtimeWebSocketAudioSubscriber");
13
+ const WebSocketAudioPublisher_1 = require("../../io/publishers/WebSocketAudioPublisher");
14
+ const constants_1 = require("../../../constants");
15
/**
 * Builds the logger for the session orchestrator.
 *
 * The platform dependencies are looked up on every call rather than once at
 * module load, so nothing platform-related runs while this file is being
 * required.
 */
function getLogger() {
    const platform = (0, plugin_base_1.getPlatformDependencies)();
    return platform.createLogger("RealtimeSessionOrchestrator");
}
18
// Zeroed usage-stats payload. handleEndCall returns a shallow copy of it,
// because END_CALL only closes the socket and produces no usage of its own.
const EMPTY_STATS = {
    estimated: true,
    total_tokens: 0,
    inputTokens: 0,
    outputTokens: 0,
    chunk_count: 0,
    textOutput: "",
    transcription: "",
    assistantResponse: "",
};
28
/**
 * Runs OpenAI Realtime voice sessions for a workflow node.
 *
 * START_CALL opens a WebSocket to the Realtime API, wires tool calling and
 * audio I/O, and then holds until that socket closes. END_CALL looks the live
 * socket up in the session registry and closes it, which is what releases the
 * pending START_CALL.
 */
class SessionOrchestrator {
    constructor() {
        this.logger = getLogger();
        this.sessionManager = new SessionManager_1.SessionManager();
    }
    /**
     * Dispatches on `config.controlSignal` (defaults to "START_CALL").
     *
     * @param config   Node configuration; `tools` and `mcpService` are filled in
     *                 on this object when MCP tools are discovered.
     * @param metadata Streaming metadata; `conversationId` (falling back to
     *                 `workflowId`, then "unknown") keys the session registry.
     * @param context  Execution context carrying `api` and `credentials`; may be absent.
     * @param emit     Optional callback used to emit node outputs.
     * @returns Usage stats for the finished session, or zeroed stats for END_CALL.
     */
    async orchestrateSession(config, metadata, context, emit) {
        const controlSignal = config.controlSignal ?? "START_CALL";
        const conversationId = metadata.conversationId || metadata.workflowId || "unknown";
        if (controlSignal === "END_CALL") {
            return this.handleEndCall(conversationId);
        }
        return this.handleStartCall(config, metadata, context, emit, conversationId);
    }
    /**
     * Opens a Realtime session and resolves only once its WebSocket has closed.
     *
     * @returns The usage stats collected by the response processor.
     * @throws If credentials are missing, the connection fails, or session setup throws.
     */
    async handleStartCall(config, metadata, context, emit, conversationId) {
        // MCP discovery (timeout after 3s to avoid blocking session startup)
        const api = context?.api;
        if (api?.callService) {
            let discoveryTimer;
            try {
                const discovery = api.callService("getSchema", {}, context);
                // If the timeout wins the race, a later rejection from the discovery call
                // must not surface as an unhandled rejection
                discovery.catch(() => { });
                const mcpSchema = await Promise.race([
                    discovery,
                    new Promise((_, reject) => {
                        discoveryTimer = setTimeout(() => reject(new Error("MCP discovery timeout")), 3000);
                    }),
                ]);
                if (mcpSchema?.methods) {
                    config.tools = Object.entries(mcpSchema.methods).map(([name, m]) => ({
                        type: "function",
                        name,
                        description: m.description ?? `Execute ${name}`,
                        parameters: m.input ?? { type: "object", properties: {} },
                    }));
                    config.mcpService = {};
                    for (const [methodName] of Object.entries(mcpSchema.methods)) {
                        config.mcpService[methodName] = (input) => api.callService(methodName, input, context);
                    }
                    this.logger.info(`MCP tools configured: ${config.tools.length}`);
                }
            }
            catch {
                // No MCP connected — continue
            }
            finally {
                if (discoveryTimer)
                    clearTimeout(discoveryTimer);
            }
        }
        // Credentials - using field signature pattern
        const credentials = this.getCredentials(context);
        const wsClient = new WsClient_1.WsClient();
        const responseProcessor = new ResponseProcessor_1.RealtimeResponseProcessor(metadata.workflowId || "unknown", metadata, emit, wsClient);
        // Track parallel tool calls — request the next response only when ALL
        // dispatched tools have completed AND the model's response.done has arrived
        // (a fast tool can finish before the response that requested it settles).
        let pendingToolCount = 0;
        let awaitingResponseDone = false;
        let needsSessionUpdate = false;
        // An unparsable, zero or negative override would make setTimeout fire
        // almost immediately and fail every tool call, so fall back to the default.
        const DEFAULT_TOOL_TIMEOUT_MS = 30000;
        const parsedToolTimeout = parseInt(process.env.REALTIME_TOOL_TIMEOUT_MS || "", 10);
        const TOOL_TIMEOUT_MS = Number.isFinite(parsedToolTimeout) && parsedToolTimeout > 0 ? parsedToolTimeout : DEFAULT_TOOL_TIMEOUT_MS;
        // Settles with the promise's outcome, or rejects once `ms` has elapsed
        const withTimeout = (promise, ms, label) => new Promise((resolve, reject) => {
            const timer = setTimeout(() => reject(new Error(`Tool ${label} timed out after ${ms}ms`)), ms);
            promise.then((v) => {
                clearTimeout(timer);
                resolve(v);
            }, (e) => {
                clearTimeout(timer);
                reject(e);
            });
        });
        // Best-effort trace of a tool call. Nothing here may throw: it is called
        // from the tool-execution path, where an escape would be reported to the
        // model as a failed tool call.
        const saveTrace = (args) => {
            if (!context?.api?.saveMCPTrace || !metadata.executionId || !metadata.nodeId)
                return;
            const onTraceFailure = (err) => this.logger.warn("Failed to save MCP trace", { error: err?.message });
            const endTime = Date.now();
            try {
                const pendingSave = context.api.saveMCPTrace({
                    executionId: metadata.executionId,
                    parentNodeId: metadata.nodeId,
                    toolName: args.toolName,
                    arguments: args.toolInput,
                    result: args.result,
                    startTime: args.startTime,
                    endTime,
                    duration: endTime - args.startTime,
                    success: args.success,
                    ...(args.error ? { error: args.error } : {}),
                });
                // Promise.resolve tolerates an implementation that returns a plain value
                Promise.resolve(pendingSave).catch(onTraceFailure);
            }
            catch (err) {
                onTraceFailure(err);
            }
        };
        // Asks the model to continue, but only once no tool is still running and
        // the response that requested the tools has reached response.done
        const maybeRequestNextResponse = () => {
            if (pendingToolCount > 0 || awaitingResponseDone)
                return;
            if (needsSessionUpdate) {
                wsClient.send(SessionUpdateBuilder_1.SessionUpdateBuilder.build(config));
                this.logger.info("Registered discovered MCPs as tools");
                needsSessionUpdate = false;
            }
            wsClient.send(ResponseCreateBuilder_1.ResponseCreateBuilder.build());
        };
        const executeToolCall = async (toolName, toolInput, callId) => {
            // The entire body is guarded: this runs fire-and-forget, so any escape
            // would be an unhandled rejection and would skip the counter decrement
            try {
                this.logger.info("Tool use", { toolName, callId });
                const startTime = Date.now();
                let result;
                let toolError;
                try {
                    if (config.mcpService?.[toolName]) {
                        result = await withTimeout(config.mcpService[toolName](toolInput), TOOL_TIMEOUT_MS, toolName);
                        emit?.({ __outputs: { mcpResult: { name: toolName, arguments: toolInput, result } } });
                        saveTrace({ toolName, toolInput, result, startTime, success: true });
                    }
                    else {
                        toolError = `No handler for tool: ${toolName}`;
                        result = { error: toolError };
                        saveTrace({ toolName, toolInput, result: null, startTime, success: false, error: toolError });
                    }
                }
                catch (err) {
                    toolError = `Tool execution failed: ${err.message}`;
                    result = { error: toolError };
                    saveTrace({ toolName, toolInput, result: null, startTime, success: false, error: toolError });
                }
                // Notify client that tool call finished
                try {
                    const completionPublisher = new WebSocketAudioPublisher_1.WebSocketAudioPublisher();
                    await completionPublisher.publishState({
                        state: "TOOL_USE_COMPLETED",
                        conversationId,
                        metadata,
                        message: toolError ? `Tool failed: ${toolName}` : `Tool completed: ${toolName}`,
                        additionalMetadata: { toolName, callId, error: toolError },
                    });
                }
                catch (err) {
                    this.logger.warn("Failed to publish TOOL_USE_COMPLETED", { error: err?.message });
                }
                // Send function output back to OpenAI
                let serialized;
                try {
                    serialized = JSON.stringify(result);
                }
                catch {
                    serialized = JSON.stringify({ error: "Tool result was not serializable" });
                }
                wsClient.send(ConversationItemBuilder_1.ConversationItemBuilder.buildFunctionCallOutput(callId, serialized));
                // Register dynamically discovered MCP tools from findIntent/discoverRelated results
                if ((toolName === "findIntent" || toolName === "discoverRelated") && Array.isArray(result) && api?.callService) {
                    config.tools = config.tools || [];
                    config.mcpService = config.mcpService || {};
                    for (const item of result) {
                        if (item?.object_type === "mcp" && item?.metadata?.schema?.methods) {
                            const methodName = Object.keys(item.metadata.schema.methods)[0];
                            if (!methodName || config.mcpService[methodName])
                                continue;
                            const methodDef = item.metadata.schema.methods[methodName];
                            config.tools.push({
                                type: "function",
                                name: methodName,
                                description: methodDef?.description || item.description || item.title,
                                parameters: methodDef?.input || { type: "object", properties: { message: { type: "string" } } },
                            });
                            config.mcpService[methodName] = (input) => api.callService(methodName, input, context);
                            needsSessionUpdate = true;
                        }
                    }
                }
            }
            catch (err) {
                this.logger.error("Tool call handling failed", { toolName, callId, error: err?.message });
                // Best effort: the model must not be left waiting on this call_id
                try {
                    wsClient.send(ConversationItemBuilder_1.ConversationItemBuilder.buildFunctionCallOutput(callId, JSON.stringify({ error: "Tool call failed" })));
                }
                catch {
                    /* socket may be closed */
                }
            }
            finally {
                pendingToolCount--;
                // Sending can fail when the call ended while the tool was still
                // running; a throw from this finally block would otherwise become
                // an unhandled rejection.
                try {
                    maybeRequestNextResponse();
                }
                catch (err) {
                    this.logger.warn("Failed to request next response after tool call", { toolName, callId, error: err?.message });
                }
            }
        };
        // Tool use handler — called SYNCHRONOUSLY per function-call event so the
        // pending count is accurate before any tool can complete
        responseProcessor.onToolUse = ({ toolName, toolInput, callId }) => {
            pendingToolCount++;
            awaitingResponseDone = true;
            void executeToolCall(toolName, toolInput, callId);
        };
        // The response that requested the tools has settled — safe to continue
        // once all dispatched tools have finished
        responseProcessor.onToolResponseDone = () => {
            awaitingResponseDone = false;
            maybeRequestNextResponse();
        };
        const session = this.sessionManager.createSession(config, metadata, responseProcessor, wsClient);
        // Flags record how far setup got, so cleanup only undoes what this call
        // actually did (another call may own the registry entry for this id)
        let connected = false;
        let registered = false;
        let wsSubscriber;
        // The session record already exists, and once connect() resolves the OpenAI
        // session is live and billed — any exit path MUST end the session record,
        // close the socket and deregister, or the session is orphaned with no way
        // to end it
        try {
            // Connect
            await wsClient.connect(credentials.apiKey);
            connected = true;
            RealtimeSessionRegistry_1.realtimeSessionRegistry.register(conversationId, wsClient);
            registered = true;
            wsSubscriber = RealtimeWebSocketAudioSubscriber_1.RealtimeWebSocketAudioSubscriber.getInstance();
            wsClient.onMessage((event) => {
                responseProcessor.processEvent(event).catch((err) => {
                    this.logger.error("Event processing error", { error: err.message });
                });
            });
            // Configure session
            wsClient.send(SessionUpdateBuilder_1.SessionUpdateBuilder.build(config));
            // Replay history
            if (config.conversationHistory?.length) {
                for (const msg of config.conversationHistory) {
                    wsClient.send(msg.role === "user"
                        ? ConversationItemBuilder_1.ConversationItemBuilder.buildUserMessage(msg.content)
                        : ConversationItemBuilder_1.ConversationItemBuilder.buildAssistantMessage(msg.content));
                }
            }
            // Send initial request first — model processing time provides a natural
            // buffer while the frontend initialises audio playback after SESSION_READY.
            if (config.initialRequest) {
                wsClient.send(ConversationItemBuilder_1.ConversationItemBuilder.buildUserMessage(config.initialRequest));
                wsClient.send(ResponseCreateBuilder_1.ResponseCreateBuilder.build());
            }
            // Register mic audio subscriber
            wsSubscriber.registerSession(conversationId, metadata.chatId || "", wsClient);
            // Publish SESSION_READY — frontend begins audio playback setup on receipt
            const publisher = new WebSocketAudioPublisher_1.WebSocketAudioPublisher();
            await publisher.publishState({
                state: "SESSION_READY",
                conversationId,
                metadata,
                message: "OpenAI Realtime audio session ready",
                additionalMetadata: { nodeId: context?.nodeId || "openairealtimevoice1", serverVad: true },
            });
            // Hold until WS closes (triggered by END_CALL)
            this.logger.info("⏳ [SESSION] Waiting for WebSocket close...", { conversationId });
            await wsClient.waitForClose();
            this.logger.info("🔚 [SESSION] WebSocket closed — session ending", { conversationId });
        }
        finally {
            if (connected)
                wsClient.close(); // no-op if already closed
            this.sessionManager.endSession(session.sessionId);
            if (registered)
                RealtimeSessionRegistry_1.realtimeSessionRegistry.remove(conversationId);
            wsSubscriber?.unregisterSession(conversationId);
        }
        responseProcessor.emitFinal();
        const result = responseProcessor.getUsageStats();
        // Save token usage
        if (result.total_tokens > 0) {
            try {
                await (0, plugin_base_1.getPlatformDependencies)().saveTokenUsage({
                    workflowId: metadata.workflowId,
                    executionId: metadata.executionId,
                    nodeId: metadata.nodeId,
                    nodeType: "OpenAIRealtimeVoice",
                    model: constants_1.REALTIME_MODEL_ID,
                    usage: {
                        total_tokens: result.total_tokens,
                        input_tokens: result.inputTokens,
                        output_tokens: result.outputTokens,
                    },
                    timestamp: new Date(),
                });
            }
            catch (err) {
                this.logger.error("Failed to save token usage", { error: err.message });
            }
        }
        return result;
    }
    /**
     * Closes the registered socket for `conversationId`, if there is one.
     *
     * @returns A copy of the zeroed stats; real usage is returned by the
     *          START_CALL invocation that owns the session.
     */
    handleEndCall(conversationId) {
        const wsClient = RealtimeSessionRegistry_1.realtimeSessionRegistry.get(conversationId);
        if (wsClient) {
            this.logger.info("END_CALL — closing Realtime WebSocket", { conversationId });
            wsClient.close();
        }
        else {
            this.logger.warn("END_CALL — no active session found", { conversationId });
        }
        return { ...EMPTY_STATS };
    }
    /**
     * Picks the API key to use for the Realtime connection.
     *
     * @param context Execution context; may be absent or carry no credentials.
     * @returns `{ apiKey }` for the chosen credential.
     * @throws Error("OpenAI credentials not configured") when no credential has an apiKey.
     */
    getCredentials(context) {
        // context is optional everywhere else in this class; a missing context must
        // produce the configuration error below, not a TypeError
        const available = context?.credentials || {};
        // Prefer the credential declared on the node definition: the platform passes
        // ALL workflow credentials here, and others (e.g. xAI) also carry an apiKey
        // field — the generic signature scan below could pick the wrong provider.
        const preferred = available.openaiCredential || available.openAICredential;
        if (preferred?.apiKey) {
            return { apiKey: preferred.apiKey };
        }
        // Fallback: field signature pattern (docs-starter/nodes/04-credentials.md)
        for (const val of Object.values(available)) {
            if (val?.apiKey) {
                return { apiKey: val.apiKey };
            }
        }
        throw new Error("OpenAI credentials not configured");
    }
}
333
+ exports.SessionOrchestrator = SessionOrchestrator;
@@ -0,0 +1,17 @@
1
+ import { StreamingMetadata } from "../../../../util/types";
2
/**
 * Publishes assistant audio and speech-state notifications for one
 * conversation through a WebSocketAudioPublisher.
 */
export declare class AudioHandler {
    private conversationId;
    private metadata;
    /** Index attached to the next published audio chunk; starts at 0. */
    private chunkIndex;
    private publisher;
    constructor(conversationId: string, metadata: StreamingMetadata);
    /** Publishes the SPEECH_STARTED state. */
    handleAudioStart(): Promise<void>;
    /**
     * Publishes one base64 audio chunk (format "pcm16") tagged with the current
     * chunk index, then advances the index.
     */
    bufferAudioChunk(base64Audio: string): Promise<void>;
    /** Publishes SPEECH_ENDED, then runs the publisher's cleanup for this conversation. */
    handleAudioEnd(): Promise<void>;
    /**
     * User barge-in: drop buffered assistant audio instead of flushing it, but
     * still send SPEECH_ENDED — the client relies on it to unmute the mic
     */
    handleInterruption(): Promise<void>;
    /** Resets the chunk index to 0. */
    cleanup(): void;
}
@@ -0,0 +1,57 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.AudioHandler = void 0;
4
+ const WebSocketAudioPublisher_1 = require("../../io/publishers/WebSocketAudioPublisher");
5
/**
 * Forwards assistant audio and speech-state changes for a single conversation
 * to the client via a WebSocketAudioPublisher.
 */
class AudioHandler {
    constructor(conversationId, metadata) {
        this.conversationId = conversationId;
        this.metadata = metadata;
        // Running index stamped onto each published chunk
        this.chunkIndex = 0;
        this.publisher = new WebSocketAudioPublisher_1.WebSocketAudioPublisher();
    }
    /** Tells the client that the assistant has begun speaking. */
    async handleAudioStart() {
        const { conversationId, metadata } = this;
        await this.publisher.publishState({
            state: "SPEECH_STARTED",
            conversationId,
            metadata,
            message: "Assistant started speaking",
        });
    }
    /** Publishes one base64 PCM16 chunk and advances the chunk index. */
    async bufferAudioChunk(base64Audio) {
        const { conversationId, metadata } = this;
        const index = this.chunkIndex;
        this.chunkIndex = index + 1;
        await this.publisher.publishAudio({
            audioData: base64Audio,
            format: "pcm16",
            sourceType: "OpenAIRealtime",
            conversationId,
            metadata,
            audioState: "SPEECH_STREAMING",
            index,
        });
    }
    /** Announces the end of assistant speech, then lets the publisher clean up. */
    async handleAudioEnd() {
        const { conversationId, metadata } = this;
        await this.publisher.publishState({
            state: "SPEECH_ENDED",
            conversationId,
            metadata,
            message: "Assistant finished speaking",
        });
        await this.publisher.cleanup(conversationId);
    }
    /**
     * Barge-in by the user: buffered assistant audio is thrown away rather than
     * flushed, yet SPEECH_ENDED still goes out because the client waits for it
     * before unmuting the microphone
     */
    async handleInterruption() {
        const { conversationId, metadata } = this;
        this.publisher.discardBuffer(conversationId);
        await this.publisher.publishState({
            state: "SPEECH_ENDED",
            conversationId,
            metadata,
            message: "Assistant interrupted by user",
        });
    }
    /** Restarts chunk numbering from zero. */
    cleanup() {
        this.chunkIndex = 0;
    }
}
57
+ exports.AudioHandler = AudioHandler;
@@ -0,0 +1,34 @@
1
+ import { StreamingMetadata, StreamUsageStats } from "../../../../util/types";
2
+ import type { WsClient } from "../streaming/WsClient";
3
/**
 * Routes Realtime API server events for one session to the audio handler,
 * text accumulator and usage-stats collector, and surfaces tool calls to the
 * owner through the `onToolUse` / `onToolResponseDone` callbacks.
 */
export declare class RealtimeResponseProcessor {
    private sessionId;
    private metadata;
    private wsClient?;
    private audioHandler;
    private textAccumulator;
    private usageStatsCollector;
    /** True between the first audio delta of a response and its audio end or interruption. */
    private audioStarted;
    private readonly logger;
    /** Set when a function call arrives; cleared when the response reaches response.done. */
    private hasToolCallsInCurrentResponse;
    /** Called synchronously per function-call event — must not be awaited so the
     * orchestrator's pending count is accurate before any tool can complete */
    onToolUse?: (toolUse: {
        toolName: string;
        toolInput: any;
        callId: string;
    }) => void;
    /** Called when a response that dispatched tool calls reaches response.done */
    onToolResponseDone?: () => void;
    /** Forwards progress text to the text accumulator. */
    emitProgress(text: string): void;
    /**
     * @param sessionId Also used as the conversation id when `metadata.conversationId` is not set.
     * @param metadata  Streaming metadata attached to every published state or audio message.
     * @param emit      Optional output callback handed to the text accumulator.
     * @param wsClient  Optional Realtime socket client; stored on the instance.
     */
    constructor(sessionId: string, metadata: StreamingMetadata, emit?: (output: any) => void, wsClient?: WsClient | undefined);
    /** Handles one server event; handler errors are logged and not rethrown. */
    processEvent(event: any): Promise<void>;
    private routeEvent;
    private handleResponseDone;
    private publishUserSpeechState;
    private handleFunctionCall;
    private finalizeUsageStats;
    /** Delegates to the text accumulator's final emit. */
    emitFinal(): void;
    /** Returns the stats gathered by the usage-stats collector. */
    getUsageStats(): StreamUsageStats;
    /** Logs a stream error and, if assistant audio is in progress, ends it. */
    handleError(error: any): Promise<void>;
    /** Resets the audio handler, the current text turn and the usage stats. */
    cleanup(): void;
}
@@ -0,0 +1,199 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.RealtimeResponseProcessor = void 0;
4
+ const plugin_base_1 = require("@gravity-platform/plugin-base");
5
+ const AudioHandler_1 = require("./AudioHandler");
6
+ const TextAccumulator_1 = require("./TextAccumulator");
7
+ const UsageStatsCollector_1 = require("./UsageStatsCollector");
8
+ const WebSocketAudioPublisher_1 = require("../../io/publishers/WebSocketAudioPublisher");
9
+ // Module-level platform calls cause startup freezes (docs-starter/nodes/CLAUDE.md
10
+ // rule 5) — resolve the logger lazily instead
11
/**
 * Builds the logger for the response processor. Platform dependencies are
 * fetched on each call, so requiring this module makes no platform call.
 */
function getLogger() {
    const platform = (0, plugin_base_1.getPlatformDependencies)();
    return platform.createLogger("RealtimeResponseProcessor");
}
14
/**
 * Consumes Realtime API server events for one session: streams assistant audio
 * to the client, accumulates transcripts, collects token usage, and hands
 * function calls to whoever set `onToolUse` / `onToolResponseDone`.
 */
class RealtimeResponseProcessor {
    constructor(sessionId, metadata, emit, wsClient) {
        this.sessionId = sessionId;
        this.metadata = metadata;
        this.wsClient = wsClient;
        this.usageStatsCollector = new UsageStatsCollector_1.UsageStatsCollector();
        // True while assistant audio for the current response is being streamed
        this.audioStarted = false;
        this.logger = getLogger();
        // True once the current response has produced at least one function call
        this.hasToolCallsInCurrentResponse = false;
        // Without an explicit conversation id the session id doubles as one
        const conversationId = metadata.conversationId || sessionId;
        this.audioHandler = new AudioHandler_1.AudioHandler(conversationId, metadata);
        this.textAccumulator = new TextAccumulator_1.TextAccumulator(sessionId, emit);
    }
    /** Passes progress text through to the text accumulator. */
    emitProgress(text) {
        this.textAccumulator.emitProgress(text);
    }
    /** Handles one server event; any handler failure is logged, never rethrown. */
    async processEvent(event) {
        try {
            await this.routeEvent(event);
        }
        catch (err) {
            const detail = err instanceof Error ? err.message : err;
            this.logger.error("Error processing event", {
                sessionId: this.sessionId,
                type: event?.type,
                error: detail,
            });
        }
    }
    /** Dispatches a server event to its handler based on `event.type`. */
    async routeEvent(event) {
        switch (event.type) {
            // --- assistant audio ---
            case "response.output_audio.delta": {
                const isFirstChunk = !this.audioStarted;
                if (isFirstChunk) {
                    this.audioStarted = true;
                    await this.audioHandler.handleAudioStart();
                }
                this.usageStatsCollector.incrementChunkCount();
                await this.audioHandler.bufferAudioChunk(event.delta);
                return;
            }
            case "response.output_audio.done": {
                await this.audioHandler.handleAudioEnd();
                this.audioStarted = false;
                return;
            }
            // --- transcripts ---
            case "response.output_audio_transcript.delta": {
                this.textAccumulator.appendAssistant(event.delta ?? "");
                return;
            }
            case "response.output_audio_transcript.done": {
                const finalTranscript = event.transcript;
                if (finalTranscript) {
                    this.textAccumulator.setAssistantText(finalTranscript);
                }
                return;
            }
            case "conversation.item.input_audio_transcription.completed": {
                this.textAccumulator.setUserTranscript(event.transcript ?? "");
                return;
            }
            // --- tool calls ---
            case "response.function_call_arguments.done": {
                await this.handleFunctionCall(event);
                return;
            }
            // --- session lifecycle ---
            case "session.created": {
                this.logger.info("Session created", { session: event.session });
                return;
            }
            case "session.updated": {
                const audioConfig = event.session?.audio;
                this.logger.debug("Session config accepted", {
                    voice: audioConfig?.output?.voice,
                    turn_detection: audioConfig?.input?.turn_detection,
                });
                return;
            }
            // --- user speech (server VAD) ---
            case "input_audio_buffer.speech_started": {
                this.logger.debug("VAD: speech started", { sessionId: this.sessionId });
                if (this.audioStarted) {
                    // The user talked over the assistant: discard its buffered audio
                    await this.audioHandler.handleInterruption();
                    this.audioStarted = false;
                }
                this.textAccumulator.resetTurn();
                await this.publishUserSpeechState("USER_SPEECH_STARTED");
                return;
            }
            case "input_audio_buffer.speech_stopped": {
                this.logger.debug("VAD: speech stopped", { sessionId: this.sessionId });
                await this.publishUserSpeechState("USER_SPEECH_ENDED");
                return;
            }
            case "input_audio_buffer.committed": {
                this.logger.debug("Audio buffer committed", { sessionId: this.sessionId });
                return;
            }
            case "response.done": {
                this.handleResponseDone(event);
                return;
            }
            // --- known events that need no action ---
            case "response.function_call_arguments.delta":
            case "response.output_item.added":
            case "response.output_item.done":
            case "response.content_part.added":
            case "response.content_part.done":
            case "response.created":
            case "conversation.item.created":
            case "conversation.item.done":
            case "conversation.item.added":
            case "rate_limits.updated":
                return;
            case "error": {
                this.logger.error("Realtime API error", { error: event.error, sessionId: this.sessionId });
                return;
            }
            default: {
                this.logger.debug("Unhandled event", { type: event.type });
                return;
            }
        }
    }
    /**
     * Records usage for a finished response, then either notifies the owner
     * (the response dispatched tools) or emits the conversation turn.
     */
    handleResponseDone(event) {
        this.finalizeUsageStats(event);
        if (!this.hasToolCallsInCurrentResponse) {
            // Plain text/audio response: publish the turn and start a fresh one
            this.textAccumulator.emitConversation();
            this.textAccumulator.resetTurn();
            return;
        }
        // Tools were dispatched: the orchestrator issues response.create once
        // every tool output is in AND this response has settled
        this.hasToolCallsInCurrentResponse = false;
        this.onToolResponseDone?.();
    }
    /** Publishes a user speech state; a publish failure is only logged. */
    async publishUserSpeechState(state) {
        const conversationId = this.metadata.conversationId || this.sessionId;
        const publisher = new WebSocketAudioPublisher_1.WebSocketAudioPublisher();
        let message;
        if (state === "USER_SPEECH_STARTED") {
            message = "User started speaking";
        }
        else {
            message = "User finished speaking";
        }
        try {
            await publisher.publishState({
                state,
                conversationId,
                metadata: this.metadata,
                message,
            });
        }
        catch (err) {
            this.logger.warn("Failed to publish user speech state", { state, error: err?.message });
        }
    }
    /**
     * Parses a completed function call, hands it to `onToolUse`, then publishes
     * the TOOL_USE state. Unparsable arguments fall back to an empty object.
     */
    async handleFunctionCall(event) {
        const { name: toolName, call_id: callId } = event;
        let toolInput = {};
        try {
            toolInput = JSON.parse(event.arguments || "{}");
        }
        catch {
            this.logger.warn("Failed to parse tool arguments", { arguments: event.arguments });
        }
        this.hasToolCallsInCurrentResponse = true;
        // Dispatch comes FIRST and stays synchronous: if the state publish were
        // awaited before the call is counted, a fast parallel tool could see zero
        // pending calls and request the next response too early
        if (this.onToolUse) {
            this.onToolUse({ toolName, toolInput, callId });
        }
        const conversationId = this.metadata.conversationId || this.sessionId;
        const publisher = new WebSocketAudioPublisher_1.WebSocketAudioPublisher();
        await publisher.publishState({
            state: "TOOL_USE",
            conversationId,
            metadata: this.metadata,
            message: `Using tool: ${toolName}`,
            additionalMetadata: { toolName, callId, toolInput },
        });
    }
    /** Adds the response's token usage (if any) and snapshots the current texts. */
    finalizeUsageStats(event) {
        const usage = event.response?.usage;
        if (usage) {
            const inputTokens = usage.input_tokens ?? 0;
            const outputTokens = usage.output_tokens ?? 0;
            this.usageStatsCollector.addUsage(inputTokens, outputTokens);
        }
        const transcription = this.textAccumulator.getTranscription();
        const assistantResponse = this.textAccumulator.getAssistantResponse();
        this.usageStatsCollector.setTextResults(transcription, assistantResponse);
    }
    /** Asks the text accumulator to emit its final output. */
    emitFinal() {
        this.textAccumulator.emitFinal();
    }
    /** Returns the usage stats collected so far. */
    getUsageStats() {
        return this.usageStatsCollector.getUsageStats();
    }
    /** Logs a stream error and closes out any assistant audio still in progress. */
    async handleError(error) {
        const detail = error.message || error;
        this.logger.error("Stream error", { sessionId: this.sessionId, error: detail });
        if (!this.audioStarted) {
            return;
        }
        await this.audioHandler.handleAudioEnd();
        this.audioStarted = false;
    }
    /** Resets the audio handler, the current text turn and the usage stats. */
    cleanup() {
        this.audioHandler.cleanup();
        this.textAccumulator.resetTurn();
        this.usageStatsCollector.reset();
    }
}
199
+ exports.RealtimeResponseProcessor = RealtimeResponseProcessor;
@@ -0,0 +1,19 @@
1
/**
 * Holds the text of the current conversation turn (user transcription and
 * assistant response) and emits it through the optional `emit` callback.
 * NOTE(review): the implementation is not part of this declaration file; the
 * member notes below are inferred from member names and call sites — confirm
 * against TextAccumulator.js.
 */
export declare class TextAccumulator {
    private sessionId;
    private emit?;
    private transcription;
    private assistantResponse;
    private allTurns;
    private progressLog;
    private readonly logger;
    constructor(sessionId: string, emit?: ((output: any) => void) | undefined);
    /** Presumably appends a streamed delta to the assistant response — TODO confirm. */
    appendAssistant(text: string): void;
    /** Presumably replaces the assistant response with the final transcript — TODO confirm. */
    setAssistantText(text: string): void;
    /** Presumably stores the completed user transcription — TODO confirm. */
    setUserTranscript(transcript: string): void;
    emitProgress(text: string): void;
    emitConversation(): void;
    emitFinal(): void;
    /** Presumably clears the per-turn text — TODO confirm. */
    resetTurn(): void;
    getTranscription(): string;
    getAssistantResponse(): string;
}