@gravity-platform/openai-realtime 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +104 -0
- package/dist/Realtime/constants.d.ts +7 -0
- package/dist/Realtime/constants.js +10 -0
- package/dist/Realtime/node/executor.d.ts +16 -0
- package/dist/Realtime/node/executor.js +112 -0
- package/dist/Realtime/node/index.d.ts +7 -0
- package/dist/Realtime/node/index.js +146 -0
- package/dist/Realtime/service/core/orchestration/SessionOrchestrator.d.ts +9 -0
- package/dist/Realtime/service/core/orchestration/SessionOrchestrator.js +333 -0
- package/dist/Realtime/service/core/processing/AudioHandler.d.ts +17 -0
- package/dist/Realtime/service/core/processing/AudioHandler.js +57 -0
- package/dist/Realtime/service/core/processing/ResponseProcessor.d.ts +34 -0
- package/dist/Realtime/service/core/processing/ResponseProcessor.js +199 -0
- package/dist/Realtime/service/core/processing/TextAccumulator.d.ts +19 -0
- package/dist/Realtime/service/core/processing/TextAccumulator.js +75 -0
- package/dist/Realtime/service/core/processing/UsageStatsCollector.d.ts +14 -0
- package/dist/Realtime/service/core/processing/UsageStatsCollector.js +52 -0
- package/dist/Realtime/service/core/streaming/RealtimeSessionRegistry.d.ts +10 -0
- package/dist/Realtime/service/core/streaming/RealtimeSessionRegistry.js +32 -0
- package/dist/Realtime/service/core/streaming/SessionManager.d.ts +14 -0
- package/dist/Realtime/service/core/streaming/SessionManager.js +33 -0
- package/dist/Realtime/service/core/streaming/WsClient.d.ts +11 -0
- package/dist/Realtime/service/core/streaming/WsClient.js +93 -0
- package/dist/Realtime/service/index.d.ts +6 -0
- package/dist/Realtime/service/index.js +13 -0
- package/dist/Realtime/service/io/events/incoming/builders/AudioAppendBuilder.d.ts +4 -0
- package/dist/Realtime/service/io/events/incoming/builders/AudioAppendBuilder.js +15 -0
- package/dist/Realtime/service/io/events/incoming/builders/ConversationItemBuilder.d.ts +5 -0
- package/dist/Realtime/service/io/events/incoming/builders/ConversationItemBuilder.js +36 -0
- package/dist/Realtime/service/io/events/incoming/builders/ResponseCreateBuilder.d.ts +3 -0
- package/dist/Realtime/service/io/events/incoming/builders/ResponseCreateBuilder.js +9 -0
- package/dist/Realtime/service/io/events/incoming/builders/SessionUpdateBuilder.d.ts +4 -0
- package/dist/Realtime/service/io/events/incoming/builders/SessionUpdateBuilder.js +61 -0
- package/dist/Realtime/service/io/publishers/WebSocketAudioPublisher.d.ts +28 -0
- package/dist/Realtime/service/io/publishers/WebSocketAudioPublisher.js +101 -0
- package/dist/Realtime/service/io/websocket/RealtimeWebSocketAudioSubscriber.d.ts +13 -0
- package/dist/Realtime/service/io/websocket/RealtimeWebSocketAudioSubscriber.js +94 -0
- package/dist/credentials/index.d.ts +14 -0
- package/dist/credentials/index.js +19 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +54 -0
- package/dist/util/types.d.ts +40 -0
- package/dist/util/types.js +2 -0
- package/package.json +58 -0
|
@@ -0,0 +1,333 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.SessionOrchestrator = void 0;
|
|
4
|
+
const plugin_base_1 = require("@gravity-platform/plugin-base");
|
|
5
|
+
const WsClient_1 = require("../streaming/WsClient");
|
|
6
|
+
const SessionManager_1 = require("../streaming/SessionManager");
|
|
7
|
+
const ResponseProcessor_1 = require("../processing/ResponseProcessor");
|
|
8
|
+
const RealtimeSessionRegistry_1 = require("../streaming/RealtimeSessionRegistry");
|
|
9
|
+
const SessionUpdateBuilder_1 = require("../../io/events/incoming/builders/SessionUpdateBuilder");
|
|
10
|
+
const ConversationItemBuilder_1 = require("../../io/events/incoming/builders/ConversationItemBuilder");
|
|
11
|
+
const ResponseCreateBuilder_1 = require("../../io/events/incoming/builders/ResponseCreateBuilder");
|
|
12
|
+
const RealtimeWebSocketAudioSubscriber_1 = require("../../io/websocket/RealtimeWebSocketAudioSubscriber");
|
|
13
|
+
const WebSocketAudioPublisher_1 = require("../../io/publishers/WebSocketAudioPublisher");
|
|
14
|
+
const constants_1 = require("../../../constants");
|
|
15
|
+
/**
 * Creates the logger used by the session orchestrator.
 *
 * The platform dependencies are looked up on every call instead of being
 * captured when the module is loaded.
 */
function getLogger() {
    const { getPlatformDependencies } = plugin_base_1;
    const platform = getPlatformDependencies();
    return platform.createLogger("RealtimeSessionOrchestrator");
}
|
|
18
|
+
/**
 * Zeroed usage-stats template. `handleEndCall` returns a shallow copy of it,
 * because an END_CALL produces no usage of its own.
 */
const EMPTY_STATS = {
    estimated: true,
    // token and chunk counters
    total_tokens: 0, inputTokens: 0, outputTokens: 0, chunk_count: 0,
    // text results
    textOutput: "", transcription: "", assistantResponse: "",
};
|
|
28
|
+
class SessionOrchestrator {
|
|
29
|
+
constructor() {
|
|
30
|
+
this.logger = getLogger();
|
|
31
|
+
this.sessionManager = new SessionManager_1.SessionManager();
|
|
32
|
+
}
|
|
33
|
+
async orchestrateSession(config, metadata, context, emit) {
|
|
34
|
+
const controlSignal = config.controlSignal ?? "START_CALL";
|
|
35
|
+
const conversationId = metadata.conversationId || metadata.workflowId || "unknown";
|
|
36
|
+
if (controlSignal === "END_CALL") {
|
|
37
|
+
return this.handleEndCall(conversationId);
|
|
38
|
+
}
|
|
39
|
+
return this.handleStartCall(config, metadata, context, emit, conversationId);
|
|
40
|
+
}
|
|
41
|
+
async handleStartCall(config, metadata, context, emit, conversationId) {
|
|
42
|
+
// MCP discovery (timeout after 3s to avoid blocking session startup)
|
|
43
|
+
const api = context?.api;
|
|
44
|
+
if (api?.callService) {
|
|
45
|
+
let discoveryTimer;
|
|
46
|
+
try {
|
|
47
|
+
const discovery = api.callService("getSchema", {}, context);
|
|
48
|
+
// If the timeout wins the race, a later rejection from the discovery call
|
|
49
|
+
// must not surface as an unhandled rejection
|
|
50
|
+
discovery.catch(() => { });
|
|
51
|
+
const mcpSchema = await Promise.race([
|
|
52
|
+
discovery,
|
|
53
|
+
new Promise((_, reject) => {
|
|
54
|
+
discoveryTimer = setTimeout(() => reject(new Error("MCP discovery timeout")), 3000);
|
|
55
|
+
}),
|
|
56
|
+
]);
|
|
57
|
+
if (mcpSchema?.methods) {
|
|
58
|
+
config.tools = Object.entries(mcpSchema.methods).map(([name, m]) => ({
|
|
59
|
+
type: "function",
|
|
60
|
+
name,
|
|
61
|
+
description: m.description ?? `Execute ${name}`,
|
|
62
|
+
parameters: m.input ?? { type: "object", properties: {} },
|
|
63
|
+
}));
|
|
64
|
+
config.mcpService = {};
|
|
65
|
+
for (const [methodName] of Object.entries(mcpSchema.methods)) {
|
|
66
|
+
config.mcpService[methodName] = (input) => api.callService(methodName, input, context);
|
|
67
|
+
}
|
|
68
|
+
this.logger.info(`MCP tools configured: ${config.tools.length}`);
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
catch {
|
|
72
|
+
// No MCP connected — continue
|
|
73
|
+
}
|
|
74
|
+
finally {
|
|
75
|
+
if (discoveryTimer)
|
|
76
|
+
clearTimeout(discoveryTimer);
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
// Credentials - using field signature pattern
|
|
80
|
+
const credentials = this.getCredentials(context);
|
|
81
|
+
const wsClient = new WsClient_1.WsClient();
|
|
82
|
+
const responseProcessor = new ResponseProcessor_1.RealtimeResponseProcessor(metadata.workflowId || "unknown", metadata, emit, wsClient);
|
|
83
|
+
// Track parallel tool calls — request the next response only when ALL
|
|
84
|
+
// dispatched tools have completed AND the model's response.done has arrived
|
|
85
|
+
// (a fast tool can finish before the response that requested it settles).
|
|
86
|
+
let pendingToolCount = 0;
|
|
87
|
+
let awaitingResponseDone = false;
|
|
88
|
+
let needsSessionUpdate = false;
|
|
89
|
+
const TOOL_TIMEOUT_MS = parseInt(process.env.REALTIME_TOOL_TIMEOUT_MS || "30000", 10);
|
|
90
|
+
const withTimeout = (promise, ms, label) => new Promise((resolve, reject) => {
|
|
91
|
+
const timer = setTimeout(() => reject(new Error(`Tool ${label} timed out after ${ms}ms`)), ms);
|
|
92
|
+
promise.then((v) => {
|
|
93
|
+
clearTimeout(timer);
|
|
94
|
+
resolve(v);
|
|
95
|
+
}, (e) => {
|
|
96
|
+
clearTimeout(timer);
|
|
97
|
+
reject(e);
|
|
98
|
+
});
|
|
99
|
+
});
|
|
100
|
+
const saveTrace = (args) => {
|
|
101
|
+
if (!context?.api?.saveMCPTrace || !metadata.executionId || !metadata.nodeId)
|
|
102
|
+
return;
|
|
103
|
+
const endTime = Date.now();
|
|
104
|
+
context.api
|
|
105
|
+
.saveMCPTrace({
|
|
106
|
+
executionId: metadata.executionId,
|
|
107
|
+
parentNodeId: metadata.nodeId,
|
|
108
|
+
toolName: args.toolName,
|
|
109
|
+
arguments: args.toolInput,
|
|
110
|
+
result: args.result,
|
|
111
|
+
startTime: args.startTime,
|
|
112
|
+
endTime,
|
|
113
|
+
duration: endTime - args.startTime,
|
|
114
|
+
success: args.success,
|
|
115
|
+
...(args.error ? { error: args.error } : {}),
|
|
116
|
+
})
|
|
117
|
+
.catch((err) => this.logger.warn("Failed to save MCP trace", { error: err?.message }));
|
|
118
|
+
};
|
|
119
|
+
const maybeRequestNextResponse = () => {
|
|
120
|
+
if (pendingToolCount > 0 || awaitingResponseDone)
|
|
121
|
+
return;
|
|
122
|
+
if (needsSessionUpdate) {
|
|
123
|
+
wsClient.send(SessionUpdateBuilder_1.SessionUpdateBuilder.build(config));
|
|
124
|
+
this.logger.info("Registered discovered MCPs as tools");
|
|
125
|
+
needsSessionUpdate = false;
|
|
126
|
+
}
|
|
127
|
+
wsClient.send(ResponseCreateBuilder_1.ResponseCreateBuilder.build());
|
|
128
|
+
};
|
|
129
|
+
const executeToolCall = async (toolName, toolInput, callId) => {
|
|
130
|
+
// The entire body is guarded: this runs fire-and-forget, so any escape
|
|
131
|
+
// would be an unhandled rejection and would skip the counter decrement
|
|
132
|
+
try {
|
|
133
|
+
this.logger.info("Tool use", { toolName, callId });
|
|
134
|
+
const startTime = Date.now();
|
|
135
|
+
let result;
|
|
136
|
+
let toolError;
|
|
137
|
+
try {
|
|
138
|
+
if (config.mcpService?.[toolName]) {
|
|
139
|
+
result = await withTimeout(config.mcpService[toolName](toolInput), TOOL_TIMEOUT_MS, toolName);
|
|
140
|
+
emit?.({ __outputs: { mcpResult: { name: toolName, arguments: toolInput, result } } });
|
|
141
|
+
saveTrace({ toolName, toolInput, result, startTime, success: true });
|
|
142
|
+
}
|
|
143
|
+
else {
|
|
144
|
+
toolError = `No handler for tool: ${toolName}`;
|
|
145
|
+
result = { error: toolError };
|
|
146
|
+
saveTrace({ toolName, toolInput, result: null, startTime, success: false, error: toolError });
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
catch (err) {
|
|
150
|
+
toolError = `Tool execution failed: ${err.message}`;
|
|
151
|
+
result = { error: toolError };
|
|
152
|
+
saveTrace({ toolName, toolInput, result: null, startTime, success: false, error: toolError });
|
|
153
|
+
}
|
|
154
|
+
// Notify client that tool call finished
|
|
155
|
+
try {
|
|
156
|
+
const completionPublisher = new WebSocketAudioPublisher_1.WebSocketAudioPublisher();
|
|
157
|
+
await completionPublisher.publishState({
|
|
158
|
+
state: "TOOL_USE_COMPLETED",
|
|
159
|
+
conversationId,
|
|
160
|
+
metadata,
|
|
161
|
+
message: toolError ? `Tool failed: ${toolName}` : `Tool completed: ${toolName}`,
|
|
162
|
+
additionalMetadata: { toolName, callId, error: toolError },
|
|
163
|
+
});
|
|
164
|
+
}
|
|
165
|
+
catch (err) {
|
|
166
|
+
this.logger.warn("Failed to publish TOOL_USE_COMPLETED", { error: err?.message });
|
|
167
|
+
}
|
|
168
|
+
// Send function output back to OpenAI
|
|
169
|
+
let serialized;
|
|
170
|
+
try {
|
|
171
|
+
serialized = JSON.stringify(result);
|
|
172
|
+
}
|
|
173
|
+
catch {
|
|
174
|
+
serialized = JSON.stringify({ error: "Tool result was not serializable" });
|
|
175
|
+
}
|
|
176
|
+
wsClient.send(ConversationItemBuilder_1.ConversationItemBuilder.buildFunctionCallOutput(callId, serialized));
|
|
177
|
+
// Register dynamically discovered MCP tools from findIntent/discoverRelated results
|
|
178
|
+
if ((toolName === "findIntent" || toolName === "discoverRelated") && Array.isArray(result) && api?.callService) {
|
|
179
|
+
config.tools = config.tools || [];
|
|
180
|
+
config.mcpService = config.mcpService || {};
|
|
181
|
+
for (const item of result) {
|
|
182
|
+
if (item?.object_type === "mcp" && item?.metadata?.schema?.methods) {
|
|
183
|
+
const methodName = Object.keys(item.metadata.schema.methods)[0];
|
|
184
|
+
if (!methodName || config.mcpService[methodName])
|
|
185
|
+
continue;
|
|
186
|
+
const methodDef = item.metadata.schema.methods[methodName];
|
|
187
|
+
config.tools.push({
|
|
188
|
+
type: "function",
|
|
189
|
+
name: methodName,
|
|
190
|
+
description: methodDef?.description || item.description || item.title,
|
|
191
|
+
parameters: methodDef?.input || { type: "object", properties: { message: { type: "string" } } },
|
|
192
|
+
});
|
|
193
|
+
config.mcpService[methodName] = (input) => api.callService(methodName, input, context);
|
|
194
|
+
needsSessionUpdate = true;
|
|
195
|
+
}
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
}
|
|
199
|
+
catch (err) {
|
|
200
|
+
this.logger.error("Tool call handling failed", { toolName, callId, error: err?.message });
|
|
201
|
+
// Best effort: the model must not be left waiting on this call_id
|
|
202
|
+
try {
|
|
203
|
+
wsClient.send(ConversationItemBuilder_1.ConversationItemBuilder.buildFunctionCallOutput(callId, JSON.stringify({ error: "Tool call failed" })));
|
|
204
|
+
}
|
|
205
|
+
catch {
|
|
206
|
+
/* socket may be closed */
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
finally {
|
|
210
|
+
pendingToolCount--;
|
|
211
|
+
maybeRequestNextResponse();
|
|
212
|
+
}
|
|
213
|
+
};
|
|
214
|
+
// Tool use handler — called SYNCHRONOUSLY per function-call event so the
|
|
215
|
+
// pending count is accurate before any tool can complete
|
|
216
|
+
responseProcessor.onToolUse = ({ toolName, toolInput, callId }) => {
|
|
217
|
+
pendingToolCount++;
|
|
218
|
+
awaitingResponseDone = true;
|
|
219
|
+
void executeToolCall(toolName, toolInput, callId);
|
|
220
|
+
};
|
|
221
|
+
// The response that requested the tools has settled — safe to continue
|
|
222
|
+
// once all dispatched tools have finished
|
|
223
|
+
responseProcessor.onToolResponseDone = () => {
|
|
224
|
+
awaitingResponseDone = false;
|
|
225
|
+
maybeRequestNextResponse();
|
|
226
|
+
};
|
|
227
|
+
const session = this.sessionManager.createSession(config, metadata, responseProcessor, wsClient);
|
|
228
|
+
// Connect
|
|
229
|
+
await wsClient.connect(credentials.apiKey);
|
|
230
|
+
RealtimeSessionRegistry_1.realtimeSessionRegistry.register(conversationId, wsClient);
|
|
231
|
+
const wsSubscriber = RealtimeWebSocketAudioSubscriber_1.RealtimeWebSocketAudioSubscriber.getInstance();
|
|
232
|
+
// From here on the OpenAI session is live and billed — any exit path MUST
|
|
233
|
+
// close the socket and deregister, or the session is orphaned with no way
|
|
234
|
+
// to end it
|
|
235
|
+
try {
|
|
236
|
+
wsClient.onMessage((event) => {
|
|
237
|
+
responseProcessor.processEvent(event).catch((err) => {
|
|
238
|
+
this.logger.error("Event processing error", { error: err.message });
|
|
239
|
+
});
|
|
240
|
+
});
|
|
241
|
+
// Configure session
|
|
242
|
+
wsClient.send(SessionUpdateBuilder_1.SessionUpdateBuilder.build(config));
|
|
243
|
+
// Replay history
|
|
244
|
+
if (config.conversationHistory?.length) {
|
|
245
|
+
for (const msg of config.conversationHistory) {
|
|
246
|
+
wsClient.send(msg.role === "user"
|
|
247
|
+
? ConversationItemBuilder_1.ConversationItemBuilder.buildUserMessage(msg.content)
|
|
248
|
+
: ConversationItemBuilder_1.ConversationItemBuilder.buildAssistantMessage(msg.content));
|
|
249
|
+
}
|
|
250
|
+
}
|
|
251
|
+
// Send initial request first — model processing time provides a natural
|
|
252
|
+
// buffer while the frontend initialises audio playback after SESSION_READY.
|
|
253
|
+
if (config.initialRequest) {
|
|
254
|
+
wsClient.send(ConversationItemBuilder_1.ConversationItemBuilder.buildUserMessage(config.initialRequest));
|
|
255
|
+
wsClient.send(ResponseCreateBuilder_1.ResponseCreateBuilder.build());
|
|
256
|
+
}
|
|
257
|
+
// Register mic audio subscriber
|
|
258
|
+
wsSubscriber.registerSession(conversationId, metadata.chatId || "", wsClient);
|
|
259
|
+
// Publish SESSION_READY — frontend begins audio playback setup on receipt
|
|
260
|
+
const publisher = new WebSocketAudioPublisher_1.WebSocketAudioPublisher();
|
|
261
|
+
await publisher.publishState({
|
|
262
|
+
state: "SESSION_READY",
|
|
263
|
+
conversationId,
|
|
264
|
+
metadata,
|
|
265
|
+
message: "OpenAI Realtime audio session ready",
|
|
266
|
+
additionalMetadata: { nodeId: context?.nodeId || "openairealtimevoice1", serverVad: true },
|
|
267
|
+
});
|
|
268
|
+
// Hold until WS closes (triggered by END_CALL)
|
|
269
|
+
this.logger.info("⏳ [SESSION] Waiting for WebSocket close...", { conversationId });
|
|
270
|
+
await wsClient.waitForClose();
|
|
271
|
+
this.logger.info("🔚 [SESSION] WebSocket closed — session ending", { conversationId });
|
|
272
|
+
}
|
|
273
|
+
finally {
|
|
274
|
+
wsClient.close(); // no-op if already closed
|
|
275
|
+
this.sessionManager.endSession(session.sessionId);
|
|
276
|
+
RealtimeSessionRegistry_1.realtimeSessionRegistry.remove(conversationId);
|
|
277
|
+
wsSubscriber.unregisterSession(conversationId);
|
|
278
|
+
}
|
|
279
|
+
responseProcessor.emitFinal();
|
|
280
|
+
const result = responseProcessor.getUsageStats();
|
|
281
|
+
// Save token usage
|
|
282
|
+
if (result.total_tokens > 0) {
|
|
283
|
+
try {
|
|
284
|
+
await (0, plugin_base_1.getPlatformDependencies)().saveTokenUsage({
|
|
285
|
+
workflowId: metadata.workflowId,
|
|
286
|
+
executionId: metadata.executionId,
|
|
287
|
+
nodeId: metadata.nodeId,
|
|
288
|
+
nodeType: "OpenAIRealtimeVoice",
|
|
289
|
+
model: constants_1.REALTIME_MODEL_ID,
|
|
290
|
+
usage: {
|
|
291
|
+
total_tokens: result.total_tokens,
|
|
292
|
+
input_tokens: result.inputTokens,
|
|
293
|
+
output_tokens: result.outputTokens,
|
|
294
|
+
},
|
|
295
|
+
timestamp: new Date(),
|
|
296
|
+
});
|
|
297
|
+
}
|
|
298
|
+
catch (err) {
|
|
299
|
+
this.logger.error("Failed to save token usage", { error: err.message });
|
|
300
|
+
}
|
|
301
|
+
}
|
|
302
|
+
return result;
|
|
303
|
+
}
|
|
304
|
+
handleEndCall(conversationId) {
|
|
305
|
+
const wsClient = RealtimeSessionRegistry_1.realtimeSessionRegistry.get(conversationId);
|
|
306
|
+
if (wsClient) {
|
|
307
|
+
this.logger.info("END_CALL — closing Realtime WebSocket", { conversationId });
|
|
308
|
+
wsClient.close();
|
|
309
|
+
}
|
|
310
|
+
else {
|
|
311
|
+
this.logger.warn("END_CALL — no active session found", { conversationId });
|
|
312
|
+
}
|
|
313
|
+
return { ...EMPTY_STATS };
|
|
314
|
+
}
|
|
315
|
+
getCredentials(context) {
|
|
316
|
+
const available = context.credentials || {};
|
|
317
|
+
// Prefer the credential declared on the node definition: the platform passes
|
|
318
|
+
// ALL workflow credentials here, and others (e.g. xAI) also carry an apiKey
|
|
319
|
+
// field — the generic signature scan below could pick the wrong provider.
|
|
320
|
+
const preferred = available.openaiCredential || available.openAICredential;
|
|
321
|
+
if (preferred?.apiKey) {
|
|
322
|
+
return { apiKey: preferred.apiKey };
|
|
323
|
+
}
|
|
324
|
+
// Fallback: field signature pattern (docs-starter/nodes/04-credentials.md)
|
|
325
|
+
for (const val of Object.values(available)) {
|
|
326
|
+
if (val?.apiKey) {
|
|
327
|
+
return { apiKey: val.apiKey };
|
|
328
|
+
}
|
|
329
|
+
}
|
|
330
|
+
throw new Error("OpenAI credentials not configured");
|
|
331
|
+
}
|
|
332
|
+
}
|
|
333
|
+
exports.SessionOrchestrator = SessionOrchestrator;
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import { StreamingMetadata } from "../../../../util/types";
|
|
2
|
+
/**
 * Publishes assistant audio chunks and speech-state notifications for one
 * conversation.
 */
export declare class AudioHandler {
    constructor(conversationId: string, metadata: StreamingMetadata);
    /** Publishes the SPEECH_STARTED state for the conversation. */
    handleAudioStart(): Promise<void>;
    /** Publishes one base64 chunk as pcm16 audio, tagged with the next chunk index. */
    bufferAudioChunk(base64Audio: string): Promise<void>;
    /** Publishes the SPEECH_ENDED state, then runs the publisher's cleanup for the conversation. */
    handleAudioEnd(): Promise<void>;
    /**
     * User barge-in: drop buffered assistant audio instead of flushing it, but
     * still send SPEECH_ENDED — the client relies on it to unmute the mic
     */
    handleInterruption(): Promise<void>;
    /** Resets the chunk index to 0. */
    cleanup(): void;
    private conversationId;
    private metadata;
    private chunkIndex;
    private publisher;
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.AudioHandler = void 0;
|
|
4
|
+
const WebSocketAudioPublisher_1 = require("../../io/publishers/WebSocketAudioPublisher");
|
|
5
|
+
class AudioHandler {
|
|
6
|
+
constructor(conversationId, metadata) {
|
|
7
|
+
this.conversationId = conversationId;
|
|
8
|
+
this.metadata = metadata;
|
|
9
|
+
this.chunkIndex = 0;
|
|
10
|
+
this.publisher = new WebSocketAudioPublisher_1.WebSocketAudioPublisher();
|
|
11
|
+
}
|
|
12
|
+
async handleAudioStart() {
|
|
13
|
+
await this.publisher.publishState({
|
|
14
|
+
state: "SPEECH_STARTED",
|
|
15
|
+
conversationId: this.conversationId,
|
|
16
|
+
metadata: this.metadata,
|
|
17
|
+
message: "Assistant started speaking",
|
|
18
|
+
});
|
|
19
|
+
}
|
|
20
|
+
async bufferAudioChunk(base64Audio) {
|
|
21
|
+
await this.publisher.publishAudio({
|
|
22
|
+
audioData: base64Audio,
|
|
23
|
+
format: "pcm16",
|
|
24
|
+
sourceType: "OpenAIRealtime",
|
|
25
|
+
conversationId: this.conversationId,
|
|
26
|
+
metadata: this.metadata,
|
|
27
|
+
audioState: "SPEECH_STREAMING",
|
|
28
|
+
index: this.chunkIndex++,
|
|
29
|
+
});
|
|
30
|
+
}
|
|
31
|
+
async handleAudioEnd() {
|
|
32
|
+
await this.publisher.publishState({
|
|
33
|
+
state: "SPEECH_ENDED",
|
|
34
|
+
conversationId: this.conversationId,
|
|
35
|
+
metadata: this.metadata,
|
|
36
|
+
message: "Assistant finished speaking",
|
|
37
|
+
});
|
|
38
|
+
await this.publisher.cleanup(this.conversationId);
|
|
39
|
+
}
|
|
40
|
+
/**
|
|
41
|
+
* User barge-in: drop buffered assistant audio instead of flushing it, but
|
|
42
|
+
* still send SPEECH_ENDED — the client relies on it to unmute the mic
|
|
43
|
+
*/
|
|
44
|
+
async handleInterruption() {
|
|
45
|
+
this.publisher.discardBuffer(this.conversationId);
|
|
46
|
+
await this.publisher.publishState({
|
|
47
|
+
state: "SPEECH_ENDED",
|
|
48
|
+
conversationId: this.conversationId,
|
|
49
|
+
metadata: this.metadata,
|
|
50
|
+
message: "Assistant interrupted by user",
|
|
51
|
+
});
|
|
52
|
+
}
|
|
53
|
+
cleanup() {
|
|
54
|
+
this.chunkIndex = 0;
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
exports.AudioHandler = AudioHandler;
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import { StreamingMetadata, StreamUsageStats } from "../../../../util/types";
|
|
2
|
+
import type { WsClient } from "../streaming/WsClient";
|
|
3
|
+
/**
 * Consumes OpenAI Realtime server events for one session and turns them into
 * audio/state publishes, transcript accumulation, tool-call callbacks and
 * usage statistics.
 */
export declare class RealtimeResponseProcessor {
    constructor(sessionId: string, metadata: StreamingMetadata, emit?: (output: any) => void, wsClient?: WsClient | undefined);
    /** Called synchronously per function-call event — must not be awaited so the
     * orchestrator's pending count is accurate before any tool can complete */
    onToolUse?: (toolUse: {
        toolName: string;
        toolInput: any;
        callId: string;
    }) => void;
    /** Called when a response that dispatched tool calls reaches response.done */
    onToolResponseDone?: () => void;
    /** Forwards a progress text to the text accumulator. */
    emitProgress(text: string): void;
    /** Handles one server event; failures are logged rather than thrown. */
    processEvent(event: any): Promise<void>;
    /** Forwards to the text accumulator's final emit. */
    emitFinal(): void;
    /** Returns the stats gathered by the usage collector. */
    getUsageStats(): StreamUsageStats;
    /** Logs a stream error and, if assistant audio is in progress, ends it. */
    handleError(error: any): Promise<void>;
    /** Resets the audio handler, the current text turn and the usage collector. */
    cleanup(): void;
    private sessionId;
    private metadata;
    private wsClient?;
    private audioHandler;
    private textAccumulator;
    private usageStatsCollector;
    private audioStarted;
    private readonly logger;
    private hasToolCallsInCurrentResponse;
    private routeEvent;
    private handleResponseDone;
    private publishUserSpeechState;
    private handleFunctionCall;
    private finalizeUsageStats;
}
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.RealtimeResponseProcessor = void 0;
|
|
4
|
+
const plugin_base_1 = require("@gravity-platform/plugin-base");
|
|
5
|
+
const AudioHandler_1 = require("./AudioHandler");
|
|
6
|
+
const TextAccumulator_1 = require("./TextAccumulator");
|
|
7
|
+
const UsageStatsCollector_1 = require("./UsageStatsCollector");
|
|
8
|
+
const WebSocketAudioPublisher_1 = require("../../io/publishers/WebSocketAudioPublisher");
|
|
9
|
+
/**
 * Creates the logger used by the response processor.
 *
 * Resolved lazily on each call: module-level platform calls cause startup
 * freezes (docs-starter/nodes/CLAUDE.md rule 5).
 */
function getLogger() {
    const { getPlatformDependencies } = plugin_base_1;
    const platform = getPlatformDependencies();
    return platform.createLogger("RealtimeResponseProcessor");
}
|
|
14
|
+
class RealtimeResponseProcessor {
|
|
15
|
+
emitProgress(text) {
|
|
16
|
+
this.textAccumulator.emitProgress(text);
|
|
17
|
+
}
|
|
18
|
+
constructor(sessionId, metadata, emit, wsClient) {
|
|
19
|
+
this.sessionId = sessionId;
|
|
20
|
+
this.metadata = metadata;
|
|
21
|
+
this.wsClient = wsClient;
|
|
22
|
+
this.usageStatsCollector = new UsageStatsCollector_1.UsageStatsCollector();
|
|
23
|
+
this.audioStarted = false;
|
|
24
|
+
this.logger = getLogger();
|
|
25
|
+
this.hasToolCallsInCurrentResponse = false;
|
|
26
|
+
const conversationId = metadata.conversationId || sessionId;
|
|
27
|
+
this.audioHandler = new AudioHandler_1.AudioHandler(conversationId, metadata);
|
|
28
|
+
this.textAccumulator = new TextAccumulator_1.TextAccumulator(sessionId, emit);
|
|
29
|
+
}
|
|
30
|
+
async processEvent(event) {
|
|
31
|
+
try {
|
|
32
|
+
await this.routeEvent(event);
|
|
33
|
+
}
|
|
34
|
+
catch (err) {
|
|
35
|
+
this.logger.error("Error processing event", {
|
|
36
|
+
sessionId: this.sessionId,
|
|
37
|
+
type: event?.type,
|
|
38
|
+
error: err instanceof Error ? err.message : err,
|
|
39
|
+
});
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
async routeEvent(event) {
|
|
43
|
+
switch (event.type) {
|
|
44
|
+
case "response.output_audio.delta":
|
|
45
|
+
if (!this.audioStarted) {
|
|
46
|
+
this.audioStarted = true;
|
|
47
|
+
await this.audioHandler.handleAudioStart();
|
|
48
|
+
}
|
|
49
|
+
this.usageStatsCollector.incrementChunkCount();
|
|
50
|
+
await this.audioHandler.bufferAudioChunk(event.delta);
|
|
51
|
+
break;
|
|
52
|
+
case "response.output_audio.done":
|
|
53
|
+
await this.audioHandler.handleAudioEnd();
|
|
54
|
+
this.audioStarted = false;
|
|
55
|
+
break;
|
|
56
|
+
case "response.output_audio_transcript.delta":
|
|
57
|
+
this.textAccumulator.appendAssistant(event.delta ?? "");
|
|
58
|
+
break;
|
|
59
|
+
case "response.output_audio_transcript.done":
|
|
60
|
+
if (event.transcript) {
|
|
61
|
+
this.textAccumulator.setAssistantText(event.transcript);
|
|
62
|
+
}
|
|
63
|
+
break;
|
|
64
|
+
case "conversation.item.input_audio_transcription.completed":
|
|
65
|
+
this.textAccumulator.setUserTranscript(event.transcript ?? "");
|
|
66
|
+
break;
|
|
67
|
+
case "response.function_call_arguments.done":
|
|
68
|
+
await this.handleFunctionCall(event);
|
|
69
|
+
break;
|
|
70
|
+
case "session.created":
|
|
71
|
+
this.logger.info("Session created", { session: event.session });
|
|
72
|
+
break;
|
|
73
|
+
case "session.updated":
|
|
74
|
+
this.logger.debug("Session config accepted", {
|
|
75
|
+
voice: event.session?.audio?.output?.voice,
|
|
76
|
+
turn_detection: event.session?.audio?.input?.turn_detection,
|
|
77
|
+
});
|
|
78
|
+
break;
|
|
79
|
+
case "input_audio_buffer.speech_started":
|
|
80
|
+
this.logger.debug("VAD: speech started", { sessionId: this.sessionId });
|
|
81
|
+
if (this.audioStarted) {
|
|
82
|
+
// Barge-in: drop buffered assistant audio rather than flushing it
|
|
83
|
+
await this.audioHandler.handleInterruption();
|
|
84
|
+
this.audioStarted = false;
|
|
85
|
+
}
|
|
86
|
+
this.textAccumulator.resetTurn();
|
|
87
|
+
await this.publishUserSpeechState("USER_SPEECH_STARTED");
|
|
88
|
+
break;
|
|
89
|
+
case "input_audio_buffer.speech_stopped":
|
|
90
|
+
this.logger.debug("VAD: speech stopped", { sessionId: this.sessionId });
|
|
91
|
+
await this.publishUserSpeechState("USER_SPEECH_ENDED");
|
|
92
|
+
break;
|
|
93
|
+
case "input_audio_buffer.committed":
|
|
94
|
+
this.logger.debug("Audio buffer committed", { sessionId: this.sessionId });
|
|
95
|
+
break;
|
|
96
|
+
case "response.done":
|
|
97
|
+
this.handleResponseDone(event);
|
|
98
|
+
break;
|
|
99
|
+
case "response.function_call_arguments.delta":
|
|
100
|
+
case "response.output_item.added":
|
|
101
|
+
case "response.output_item.done":
|
|
102
|
+
case "response.content_part.added":
|
|
103
|
+
case "response.content_part.done":
|
|
104
|
+
case "response.created":
|
|
105
|
+
case "conversation.item.created":
|
|
106
|
+
case "conversation.item.done":
|
|
107
|
+
case "conversation.item.added":
|
|
108
|
+
case "rate_limits.updated":
|
|
109
|
+
break;
|
|
110
|
+
case "error":
|
|
111
|
+
this.logger.error("Realtime API error", { error: event.error, sessionId: this.sessionId });
|
|
112
|
+
break;
|
|
113
|
+
default:
|
|
114
|
+
this.logger.debug("Unhandled event", { type: event.type });
|
|
115
|
+
break;
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
handleResponseDone(event) {
|
|
119
|
+
this.finalizeUsageStats(event);
|
|
120
|
+
if (this.hasToolCallsInCurrentResponse) {
|
|
121
|
+
// Tool calls were dispatched — the orchestrator sends response.create once
|
|
122
|
+
// all tool outputs are submitted AND this response has settled
|
|
123
|
+
this.hasToolCallsInCurrentResponse = false;
|
|
124
|
+
this.onToolResponseDone?.();
|
|
125
|
+
return;
|
|
126
|
+
}
|
|
127
|
+
// Normal text/audio response — emit conversation turn
|
|
128
|
+
this.textAccumulator.emitConversation();
|
|
129
|
+
this.textAccumulator.resetTurn();
|
|
130
|
+
}
|
|
131
|
+
async publishUserSpeechState(state) {
|
|
132
|
+
const conversationId = this.metadata.conversationId || this.sessionId;
|
|
133
|
+
const publisher = new WebSocketAudioPublisher_1.WebSocketAudioPublisher();
|
|
134
|
+
try {
|
|
135
|
+
await publisher.publishState({
|
|
136
|
+
state,
|
|
137
|
+
conversationId,
|
|
138
|
+
metadata: this.metadata,
|
|
139
|
+
message: state === "USER_SPEECH_STARTED" ? "User started speaking" : "User finished speaking",
|
|
140
|
+
});
|
|
141
|
+
}
|
|
142
|
+
catch (err) {
|
|
143
|
+
this.logger.warn("Failed to publish user speech state", { state, error: err?.message });
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
async handleFunctionCall(event) {
|
|
147
|
+
const toolName = event.name;
|
|
148
|
+
const callId = event.call_id;
|
|
149
|
+
let toolInput = {};
|
|
150
|
+
try {
|
|
151
|
+
toolInput = JSON.parse(event.arguments || "{}");
|
|
152
|
+
}
|
|
153
|
+
catch {
|
|
154
|
+
this.logger.warn("Failed to parse tool arguments", { arguments: event.arguments });
|
|
155
|
+
}
|
|
156
|
+
this.hasToolCallsInCurrentResponse = true;
|
|
157
|
+
// Dispatch FIRST, synchronously — awaiting the state publish before counting
|
|
158
|
+
// opens a race where a fast parallel tool hits zero pending and requests the
|
|
159
|
+
// next response early
|
|
160
|
+
if (this.onToolUse) {
|
|
161
|
+
this.onToolUse({ toolName, toolInput, callId });
|
|
162
|
+
}
|
|
163
|
+
const conversationId = this.metadata.conversationId || this.sessionId;
|
|
164
|
+
const publisher = new WebSocketAudioPublisher_1.WebSocketAudioPublisher();
|
|
165
|
+
await publisher.publishState({
|
|
166
|
+
state: "TOOL_USE",
|
|
167
|
+
conversationId,
|
|
168
|
+
metadata: this.metadata,
|
|
169
|
+
message: `Using tool: ${toolName}`,
|
|
170
|
+
additionalMetadata: { toolName, callId, toolInput },
|
|
171
|
+
});
|
|
172
|
+
}
|
|
173
|
+
finalizeUsageStats(event) {
|
|
174
|
+
const usage = event.response?.usage;
|
|
175
|
+
if (usage) {
|
|
176
|
+
this.usageStatsCollector.addUsage(usage.input_tokens ?? 0, usage.output_tokens ?? 0);
|
|
177
|
+
}
|
|
178
|
+
this.usageStatsCollector.setTextResults(this.textAccumulator.getTranscription(), this.textAccumulator.getAssistantResponse());
|
|
179
|
+
}
|
|
180
|
+
emitFinal() {
|
|
181
|
+
this.textAccumulator.emitFinal();
|
|
182
|
+
}
|
|
183
|
+
getUsageStats() {
|
|
184
|
+
return this.usageStatsCollector.getUsageStats();
|
|
185
|
+
}
|
|
186
|
+
async handleError(error) {
|
|
187
|
+
this.logger.error("Stream error", { sessionId: this.sessionId, error: error.message || error });
|
|
188
|
+
if (this.audioStarted) {
|
|
189
|
+
await this.audioHandler.handleAudioEnd();
|
|
190
|
+
this.audioStarted = false;
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
cleanup() {
|
|
194
|
+
this.audioHandler.cleanup();
|
|
195
|
+
this.textAccumulator.resetTurn();
|
|
196
|
+
this.usageStatsCollector.reset();
|
|
197
|
+
}
|
|
198
|
+
}
|
|
199
|
+
exports.RealtimeResponseProcessor = RealtimeResponseProcessor;
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
 * Collects the transcript text of a session for RealtimeResponseProcessor.
 *
 * NOTE(review): the implementation is not visible from this declaration; the
 * member notes below describe only how RealtimeResponseProcessor calls each
 * method — confirm the details against TextAccumulator.js.
 */
export declare class TextAccumulator {
    constructor(sessionId: string, emit?: ((output: any) => void) | undefined);
    /** Called with each assistant transcript delta. */
    appendAssistant(text: string): void;
    /** Called with the complete assistant transcript once it is available. */
    setAssistantText(text: string): void;
    /** Called with the completed transcription of the user's audio. */
    setUserTranscript(transcript: string): void;
    /** Called with a progress text. */
    emitProgress(text: string): void;
    /** Called when a response without tool calls is done, right before `resetTurn()`. */
    emitConversation(): void;
    /** Called once after the session's socket has closed. */
    emitFinal(): void;
    /** Called when the user starts speaking, after a finished turn, and on cleanup. */
    resetTurn(): void;
    /** Read when usage stats are finalised. */
    getTranscription(): string;
    /** Read when usage stats are finalised. */
    getAssistantResponse(): string;
    private sessionId;
    private emit?;
    private transcription;
    private assistantResponse;
    private allTurns;
    private progressLog;
    private readonly logger;
}
|