@gravity-platform/openai-realtime 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. package/README.md +104 -0
  2. package/dist/Realtime/constants.d.ts +7 -0
  3. package/dist/Realtime/constants.js +10 -0
  4. package/dist/Realtime/node/executor.d.ts +16 -0
  5. package/dist/Realtime/node/executor.js +112 -0
  6. package/dist/Realtime/node/index.d.ts +7 -0
  7. package/dist/Realtime/node/index.js +146 -0
  8. package/dist/Realtime/service/core/orchestration/SessionOrchestrator.d.ts +9 -0
  9. package/dist/Realtime/service/core/orchestration/SessionOrchestrator.js +333 -0
  10. package/dist/Realtime/service/core/processing/AudioHandler.d.ts +17 -0
  11. package/dist/Realtime/service/core/processing/AudioHandler.js +57 -0
  12. package/dist/Realtime/service/core/processing/ResponseProcessor.d.ts +34 -0
  13. package/dist/Realtime/service/core/processing/ResponseProcessor.js +199 -0
  14. package/dist/Realtime/service/core/processing/TextAccumulator.d.ts +19 -0
  15. package/dist/Realtime/service/core/processing/TextAccumulator.js +75 -0
  16. package/dist/Realtime/service/core/processing/UsageStatsCollector.d.ts +14 -0
  17. package/dist/Realtime/service/core/processing/UsageStatsCollector.js +52 -0
  18. package/dist/Realtime/service/core/streaming/RealtimeSessionRegistry.d.ts +10 -0
  19. package/dist/Realtime/service/core/streaming/RealtimeSessionRegistry.js +32 -0
  20. package/dist/Realtime/service/core/streaming/SessionManager.d.ts +14 -0
  21. package/dist/Realtime/service/core/streaming/SessionManager.js +33 -0
  22. package/dist/Realtime/service/core/streaming/WsClient.d.ts +11 -0
  23. package/dist/Realtime/service/core/streaming/WsClient.js +93 -0
  24. package/dist/Realtime/service/index.d.ts +6 -0
  25. package/dist/Realtime/service/index.js +13 -0
  26. package/dist/Realtime/service/io/events/incoming/builders/AudioAppendBuilder.d.ts +4 -0
  27. package/dist/Realtime/service/io/events/incoming/builders/AudioAppendBuilder.js +15 -0
  28. package/dist/Realtime/service/io/events/incoming/builders/ConversationItemBuilder.d.ts +5 -0
  29. package/dist/Realtime/service/io/events/incoming/builders/ConversationItemBuilder.js +36 -0
  30. package/dist/Realtime/service/io/events/incoming/builders/ResponseCreateBuilder.d.ts +3 -0
  31. package/dist/Realtime/service/io/events/incoming/builders/ResponseCreateBuilder.js +9 -0
  32. package/dist/Realtime/service/io/events/incoming/builders/SessionUpdateBuilder.d.ts +4 -0
  33. package/dist/Realtime/service/io/events/incoming/builders/SessionUpdateBuilder.js +61 -0
  34. package/dist/Realtime/service/io/publishers/WebSocketAudioPublisher.d.ts +28 -0
  35. package/dist/Realtime/service/io/publishers/WebSocketAudioPublisher.js +101 -0
  36. package/dist/Realtime/service/io/websocket/RealtimeWebSocketAudioSubscriber.d.ts +13 -0
  37. package/dist/Realtime/service/io/websocket/RealtimeWebSocketAudioSubscriber.js +94 -0
  38. package/dist/credentials/index.d.ts +14 -0
  39. package/dist/credentials/index.js +19 -0
  40. package/dist/index.d.ts +2 -0
  41. package/dist/index.js +54 -0
  42. package/dist/util/types.d.ts +40 -0
  43. package/dist/util/types.js +2 -0
  44. package/package.json +58 -0
package/README.md ADDED
@@ -0,0 +1,104 @@
1
+ # @gravity-platform/openai-realtime
2
+
3
+ OpenAI Realtime API integration for Gravity Platform — real-time voice conversations with gpt-realtime-2.
4
+
5
+ ## Features
6
+
7
+ - ✅ Real-time voice input/output via WebSocket
8
+ - ✅ Server-side Voice Activity Detection (VAD)
9
+ - ✅ Function calling / tool use support
10
+ - ✅ MCP service connector auto-discovery
11
+ - ✅ Token usage tracking
12
+ - ✅ Conversation history support
13
+
14
+ ## Installation
15
+
16
+ ```bash
17
+ npm install @gravity-platform/openai-realtime
18
+ ```
19
+
20
+ ## Node: OpenAI Realtime Voice
21
+
22
+ **Type**: CallbackNode
23
+ **Category**: AI
24
+ **Model**: `gpt-realtime-2`
25
+
26
+ Real-time voice conversation with OpenAI's gpt-realtime-2 model via WebSocket streaming.
27
+
28
+ ### Inputs
29
+ - `input` (ANY): Input data
30
+
31
+ ### Outputs
32
+ - `text` (OBJECT): `{ query, response }` — user transcription + assistant text
33
+ - `conversation` (OBJECT): `{ user, assistant }` — combined conversation
34
+ - `mcpResult` (OBJECT): MCP tool execution results
35
+ - `progress` (STRING): Real-time log of tool calls and turns
36
+
37
+ ### Configuration
38
+
39
+ | Field | Type | Required | Default | Description |
40
+ |-------|------|----------|---------|-------------|
41
+ | systemPrompt | string | No | "" | System instructions (supports templates) |
42
+ | conversationHistory | object | No | - | JSON array of prior messages |
43
+ | initialRequest | string | No | "" | Text sent as a user message at call start; the model responds immediately |
44
+ | voice | enum | Yes | "alloy" | Voice: alloy, ash, ballad, coral, echo, sage, shimmer, verse, marin, cedar |
45
+ | turnDetection | enum | No | "semantic_vad" | semantic_vad, server_vad, or disabled |
46
+ | maxResponseOutputTokens | number | No | 4096 | Max tokens per response |
47
+ | redisChannel | enum | Yes | AI_RESULT_CHANNEL | Redis channel for audio |
48
+
49
+ ### Environment Variables
50
+
51
+ | Variable | Default | Description |
52
+ |----------|---------|-------------|
53
+ | REALTIME_TOOL_TIMEOUT_MS | 30000 | Max time for an MCP tool call before it returns an error to the model |
54
+
55
+ ### Credentials
56
+
57
+ Requires **OpenAI API Key** (`openaiCredential`) from platform.openai.com.
58
+
59
+ ## Architecture
60
+
61
+ ```
62
+ src/
63
+ ├── credentials/ # OpenAI credential definition
64
+ ├── Realtime/
65
+ │   ├── constants.ts # Model ID, WS URL, defaults
66
+ │   ├── node/ # Node definition + executor
67
+ │   └── service/
68
+ │       ├── index.ts # RealtimeVoiceService entry point
69
+ │       └── core/
70
+ │           ├── orchestration/ # SessionOrchestrator
71
+ │           ├── streaming/ # WsClient, SessionManager, Registry
72
+ │           └── processing/ # ResponseProcessor, AudioHandler, TextAccumulator, UsageStatsCollector
73
+ └── util/types.ts # Shared TypeScript types
74
+ ```
75
+
76
+ ## Differences from xAI Grok
77
+
78
+ 1. **Credentials**: Uses field signature pattern (no deprecated `getNodeCredentials`)
79
+ 2. **Event Format**: OpenAI Realtime API event structure differs from Grok
80
+ 3. **Function Calling**: Uses `conversation.item.create` with `function_call_output` type
81
+ 4. **Turn Detection**: OpenAI uses explicit `turn_detection` config object
82
+ 5. **Audio**: OpenAI supports both text and audio modalities in the same session
83
+
84
+ ## Usage Example
85
+
86
+ ```typescript
87
+ // In a workflow:
88
+ // 1. User speaks → mic audio sent to Realtime API
89
+ // 2. Server VAD detects speech end
90
+ // 3. gpt-realtime-2 processes audio → generates text response
91
+ // 4. Text response streamed back as audio
92
+ // 5. MCP tools called if needed (e.g., search knowledge base)
93
+ ```
94
+
95
+ ## Control Signals
96
+
97
+ - `START_CALL`: Initiate new Realtime session
98
+ - `END_CALL`: Close WebSocket and end session
99
+
100
+ Pass via `input.metadata.action` or use a separate workflow step.
101
+
102
+ ## License
103
+
104
+ MIT
@@ -0,0 +1,7 @@
1
+ export declare const REALTIME_MODEL_ID = "gpt-realtime-2";
2
+ export declare const REALTIME_WS_URL = "wss://api.openai.com/v1/realtime";
3
+ export declare const DEFAULT_VOICE = "alloy";
4
+ export declare const AVAILABLE_VOICES: readonly ["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"];
5
+ export declare const DEFAULT_TURN_DETECTION: {
6
+ type: "semantic_vad";
7
+ };
@@ -0,0 +1,10 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.DEFAULT_TURN_DETECTION = exports.AVAILABLE_VOICES = exports.DEFAULT_VOICE = exports.REALTIME_WS_URL = exports.REALTIME_MODEL_ID = void 0;
4
+ exports.REALTIME_MODEL_ID = "gpt-realtime-2";
5
+ exports.REALTIME_WS_URL = "wss://api.openai.com/v1/realtime";
6
+ exports.DEFAULT_VOICE = "alloy";
7
+ exports.AVAILABLE_VOICES = ["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"];
8
+ exports.DEFAULT_TURN_DETECTION = {
9
+ type: "semantic_vad",
10
+ };
@@ -0,0 +1,16 @@
1
+ import { OpenAIRealtimeConfig } from "../../util/types";
2
+ declare const CallbackNode: any;
3
+ interface RealtimeVoiceState {
4
+ isComplete: boolean;
5
+ }
6
+ export default class RealtimeVoiceExecutor extends CallbackNode<OpenAIRealtimeConfig, RealtimeVoiceState> {
7
+ private logger;
8
+ constructor();
9
+ initializeState(_inputs: any): RealtimeVoiceState;
10
+ handleEvent(event: {
11
+ type: string;
12
+ inputs?: any;
13
+ config?: any;
14
+ }, state: RealtimeVoiceState, emit: (output: any) => void, context?: any): Promise<RealtimeVoiceState>;
15
+ }
16
+ export { RealtimeVoiceExecutor };
@@ -0,0 +1,112 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.RealtimeVoiceExecutor = void 0;
37
+ const plugin_base_1 = require("@gravity-platform/plugin-base");
38
+ const { CallbackNode } = (0, plugin_base_1.getPlatformDependencies)();
39
+ class RealtimeVoiceExecutor extends CallbackNode {
40
+ constructor() {
41
+ super("OpenAIRealtimeVoice");
42
+ this.logger = (0, plugin_base_1.getPlatformDependencies)().createLogger("RealtimeVoiceExecutor");
43
+ }
44
+ initializeState(_inputs) {
45
+ return { isComplete: false };
46
+ }
47
+ async handleEvent(event, state, emit, context) {
48
+ if (state.isComplete)
49
+ return state;
50
+ if (!context) {
51
+ this.logger.error("No execution context provided");
52
+ return { ...state, isComplete: true };
53
+ }
54
+ const { inputs, config } = event;
55
+ const startTime = Date.now();
56
+ this.logger.info("handleEvent called", { type: event.type, isComplete: state.isComplete });
57
+ try {
58
+ const pubContext = context.publishingContext || {};
59
+ const workflowVars = context.workflow?.variables || {};
60
+ const chatId = pubContext.chatId || workflowVars.chatId || "";
61
+ const conversationId = pubContext.conversationId || workflowVars.conversationId || "";
62
+ const userId = pubContext.userId || workflowVars.userId || "";
63
+ const inputObj = inputs?.input;
64
+ const firstKey = inputObj ? Object.keys(inputObj)[0] : null;
65
+ const sourceData = firstKey ? inputObj[firstKey] : null;
66
+ const inputData = sourceData?.output || sourceData;
67
+ const action = inputData?.metadata?.action;
68
+ const metadata = {
69
+ workflowId: context.workflowId || context.workflow?.id || "",
70
+ executionId: context.executionId,
71
+ nodeId: context.nodeId,
72
+ chatId,
73
+ conversationId,
74
+ userId,
75
+ providerId: "OpenAI Realtime Voice",
76
+ };
77
+ const { RealtimeVoiceService } = await Promise.resolve().then(() => __importStar(require("../service")));
78
+ const service = new RealtimeVoiceService();
79
+ const initialRequest = typeof config.initialRequest === "string"
80
+ ? config.initialRequest
81
+ : config.initialRequest?.toString?.() || "";
82
+ this.logger.info("Config resolved", {
83
+ hasInitialRequest: !!initialRequest,
84
+ initialRequestType: typeof config.initialRequest,
85
+ initialRequestValue: initialRequest?.slice(0, 50),
86
+ });
87
+ const stats = await service.generateVoiceStream({
88
+ systemPrompt: config.systemPrompt,
89
+ conversationHistory: config.conversationHistory,
90
+ initialRequest,
91
+ voice: config.voice,
92
+ turnDetection: config.turnDetection || "semantic_vad",
93
+ maxResponseOutputTokens: config.maxResponseOutputTokens,
94
+ redisChannel: config.redisChannel,
95
+ controlSignal: action === "END_CALL" ? "END_CALL" : "START_CALL",
96
+ }, metadata, context, emit);
97
+ this.logger.info(`RealtimeVoice turn complete in ${Date.now() - startTime}ms`, { action });
98
+ return { ...state, isComplete: true };
99
+ }
100
+ catch (err) {
101
+ this.logger.error("RealtimeVoice execution FAILED", {
102
+ error: err.message,
103
+ stack: err.stack?.split("\n").slice(0, 3).join(" | ")
104
+ });
105
+ // Rethrow so the engine emits NODE_ERROR — swallowing here makes a failed
106
+ // call indistinguishable from a completed one downstream
107
+ throw err;
108
+ }
109
+ }
110
+ }
111
+ exports.default = RealtimeVoiceExecutor;
112
+ exports.RealtimeVoiceExecutor = RealtimeVoiceExecutor;
@@ -0,0 +1,7 @@
1
+ import { type EnhancedNodeDefinition } from "@gravity-platform/plugin-base";
2
+ import RealtimeVoiceExecutor from "./executor";
3
+ export declare function createNodeDefinition(): EnhancedNodeDefinition;
4
+ export declare const RealtimeVoiceNode: {
5
+ readonly definition: any;
6
+ executor: typeof RealtimeVoiceExecutor;
7
+ };
@@ -0,0 +1,146 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.RealtimeVoiceNode = void 0;
7
+ exports.createNodeDefinition = createNodeDefinition;
8
+ const plugin_base_1 = require("@gravity-platform/plugin-base");
9
+ const executor_1 = __importDefault(require("./executor"));
10
+ function createNodeDefinition() {
11
+ const { NodeInputType, AI_RESULT_CHANNEL, SYSTEM_CHANNEL } = (0, plugin_base_1.getPlatformDependencies)();
12
+ return {
13
+ packageVersion: "1.0.0",
14
+ type: "OpenAIRealtimeVoice",
15
+ isService: false,
16
+ name: "OpenAI Realtime Voice",
17
+ description: "Real-time voice conversation with gpt-realtime-2 via WebSocket",
18
+ whenToUse: "Live WebSocket voice conversation with OpenAI gpt-realtime; audio chunks publish to a Redis channel and it can call MCP tools via service edges. Alternatives: XAIGrokVoice (xAI) or AWSNovaSpeech (AWS). For plain text-to-speech use ElevenLabs.",
19
+ configExample: {
20
+ systemPrompt: "You are a helpful voice assistant. Keep responses short and conversational.",
21
+ voice: "marin",
22
+ turnDetection: "semantic_vad",
23
+ maxResponseOutputTokens: 4096,
24
+ redisChannel: AI_RESULT_CHANNEL,
25
+ },
26
+ category: "AI",
27
+ color: "#10A37F",
28
+ logoUrl: "https://res.cloudinary.com/sonik/image/upload/v1749262616/gravity/icons/ChatGPT-Logo.svg.webp",
29
+ inputs: [
30
+ {
31
+ name: "input",
32
+ type: NodeInputType.ANY,
33
+ description: "Input data",
34
+ },
35
+ ],
36
+ outputs: [
37
+ {
38
+ name: "text",
39
+ type: NodeInputType.OBJECT,
40
+ description: "Conversation object with query (user transcription) and response (assistant text)",
41
+ },
42
+ {
43
+ name: "conversation",
44
+ type: NodeInputType.OBJECT,
45
+ description: "Combined conversation object with user and assistant messages",
46
+ },
47
+ {
48
+ name: "mcpResult",
49
+ type: NodeInputType.OBJECT,
50
+ description: "MCP tool results",
51
+ },
52
+ {
53
+ name: "progress",
54
+ type: NodeInputType.STRING,
55
+ description: "Real-time progress log of tool calls and conversation turns",
56
+ },
57
+ ],
58
+ configSchema: {
59
+ type: "object",
60
+ properties: {
61
+ systemPrompt: {
62
+ type: "string",
63
+ title: "System Prompt",
64
+ description: "System instructions for the model. Supports template syntax like {{signal.<sourceNodeId>.<outputHandle>.<field>}}.",
65
+ default: "",
66
+ "ui:field": "template",
67
+ },
68
+ conversationHistory: {
69
+ type: "object",
70
+ title: "Conversation History",
71
+ description: "JSON array of conversation history",
72
+ "ui:field": "template",
73
+ },
74
+ initialRequest: {
75
+ type: "object",
76
+ title: "Initial Request",
77
+ description: "Text sent as user message at call start — the model responds immediately",
78
+ "ui:field": "template",
79
+ },
80
+ voice: {
81
+ type: "string",
82
+ title: "Voice",
83
+ description: "Select the voice for speech generation",
84
+ enum: ["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"],
85
+ enumNames: ["Alloy", "Ash", "Ballad", "Coral", "Echo", "Sage", "Shimmer", "Verse", "Marin", "Cedar"],
86
+ default: "alloy",
87
+ },
88
+ turnDetection: {
89
+ type: "string",
90
+ title: "Turn Detection",
91
+ description: "How the model detects end of user speech",
92
+ enum: ["semantic_vad", "server_vad", "disabled"],
93
+ enumNames: ["Semantic VAD (recommended)", "Server VAD (threshold-based)", "Disabled"],
94
+ default: "semantic_vad",
95
+ },
96
+ maxResponseOutputTokens: {
97
+ type: "number",
98
+ title: "Max Response Tokens",
99
+ description: "Maximum tokens in assistant response (inf for unlimited)",
100
+ default: 4096,
101
+ },
102
+ redisChannel: {
103
+ type: "string",
104
+ title: "Redis Channel",
105
+ description: "Redis channel to publish audio chunks to",
106
+ enum: [AI_RESULT_CHANNEL, SYSTEM_CHANNEL],
107
+ enumNames: ["AI Results", "System Messages"],
108
+ default: AI_RESULT_CHANNEL,
109
+ },
110
+ },
111
+ required: ["voice", "redisChannel"],
112
+ "ui:order": [
113
+ "systemPrompt",
114
+ "conversationHistory",
115
+ "initialRequest",
116
+ "voice",
117
+ "turnDetection",
118
+ "maxResponseOutputTokens",
119
+ "redisChannel",
120
+ ],
121
+ },
122
+ capabilities: { isTrigger: false },
123
+ serviceConnectors: [
124
+ {
125
+ name: "mcpService",
126
+ description: "MCP service connector — automatic schema discovery",
127
+ serviceType: "mcp",
128
+ isService: false,
129
+ },
130
+ ],
131
+ credentials: [
132
+ {
133
+ name: "openaiCredential",
134
+ required: true,
135
+ displayName: "OpenAI Credentials",
136
+ description: "OpenAI API key for Realtime API",
137
+ },
138
+ ],
139
+ };
140
+ }
141
+ exports.RealtimeVoiceNode = {
142
+ get definition() {
143
+ return createNodeDefinition();
144
+ },
145
+ executor: executor_1.default,
146
+ };
@@ -0,0 +1,9 @@
1
+ import { OpenAIRealtimeConfig, StreamUsageStats, StreamingMetadata } from "../../../../util/types";
2
+ export declare class SessionOrchestrator {
3
+ private readonly logger;
4
+ private sessionManager;
5
+ orchestrateSession(config: OpenAIRealtimeConfig, metadata: StreamingMetadata, context: any, emit?: (output: any) => void): Promise<StreamUsageStats>;
6
+ private handleStartCall;
7
+ private handleEndCall;
8
+ private getCredentials;
9
+ }