npm - @gravity-platform/openai-realtime - Versions diffs - 1.0.1 - Mend

@gravity-platform/openai-realtime 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

package/README.md ADDED Viewed

@@ -0,0 +1,104 @@
+# @gravity-platform/openai-realtime
+OpenAI Realtime API integration for Gravity Platform — real-time voice conversations with gpt-realtime-2.
+## Features
+- ✅ Real-time voice input/output via WebSocket
+- ✅ Server-side Voice Activity Detection (VAD)
+- ✅ Function calling / tool use support
+- ✅ MCP service connector auto-discovery
+- ✅ Token usage tracking
+- ✅ Conversation history support
+## Installation
+```bash
+npm install @gravity-platform/openai-realtime
+```
+## Node: OpenAI Realtime Voice
+**Type**: CallbackNode
+**Category**: AI
+**Model**: `gpt-realtime-2`
+Real-time voice conversation with OpenAI's gpt-realtime-2 model via WebSocket streaming.
+### Inputs
+- `input` (ANY): Input data
+### Outputs
+- `text` (OBJECT): `{ query, response }` — user transcription + assistant text
+- `conversation` (OBJECT): `{ user, assistant }` — combined conversation
+- `mcpResult` (OBJECT): MCP tool execution results
+- `progress` (STRING): Real-time log of tool calls and turns
+### Configuration
+| Field | Type | Required | Default | Description |
+|-------|------|----------|---------|-------------|
+| systemPrompt | string | No | "" | System instructions (supports templates) |
+| conversationHistory | object | No | - | JSON array of prior messages |
+| initialRequest | string | No | "" | Text sent at call start |
+| voice | enum | Yes | "alloy" | Voice: alloy, ash, ballad, coral, echo, sage, shimmer, verse, marin, cedar |
+| turnDetection | enum | No | "semantic_vad" | semantic_vad, server_vad, or disabled |
+| maxResponseOutputTokens | number | No | 4096 | Max tokens per response |
+| redisChannel | enum | Yes | AI_RESULT_CHANNEL | Redis channel for audio |
+### Environment Variables
+| Variable | Default | Description |
+|----------|---------|-------------|
+| REALTIME_TOOL_TIMEOUT_MS | 30000 | Max time for an MCP tool call before it returns an error to the model |
+### Credentials
+Requires **OpenAI API Key** (`openaiCredential`) from platform.openai.com.
+## Architecture
+```
+src/
+├── credentials/          # OpenAI credential definition
+├── Realtime/
+│   ├── constants.ts     # Model ID, WS URL, defaults
+│   ├── node/            # Node definition + executor
+│   └── service/
+│       ├── index.ts     # RealtimeVoiceService entry point
+│       └── core/
+│           ├── orchestration/  # SessionOrchestrator
+│           ├── streaming/      # WsClient, SessionManager, Registry
+│           └── processing/     # ResponseProcessor
+└── util/types.ts        # Shared TypeScript types
+```
+## Differences from xAI Grok
+1. **Credentials**: Uses field signature pattern (no deprecated `getNodeCredentials`)
+2. **Event Format**: OpenAI Realtime API event structure differs from Grok
+3. **Function Calling**: Uses `conversation.item.create` with `function_call_output` type
+4. **Turn Detection**: OpenAI uses explicit `turn_detection` config object
+5. **Audio**: OpenAI supports both text and audio modalities in same session
+## Usage Example
+```typescript
+// In a workflow:
+// 1. User speaks → mic audio sent to Realtime API
+// 2. Turn detection (semantic VAD by default) detects speech end
+// 3. gpt-realtime-2 processes audio → generates text response
+// 4. Text response streamed back as audio
+// 5. MCP tools called if needed (e.g., search knowledge base)
+```
+## Control Signals
+- `START_CALL`: Initiate new Realtime session
+- `END_CALL`: Close WebSocket and end session
+Pass via `input.metadata.action`, or use a separate workflow step. Any value other than `END_CALL` (including no action) is treated as `START_CALL`.
+## License
+MIT

package/dist/Realtime/constants.d.ts ADDED Viewed

@@ -0,0 +1,7 @@
+export declare const REALTIME_MODEL_ID = "gpt-realtime-2"; // Model identifier string
+export declare const REALTIME_WS_URL = "wss://api.openai.com/v1/realtime"; // WebSocket endpoint URL
+export declare const DEFAULT_VOICE = "alloy"; // Matches the `voice` default in the node's config schema
+export declare const AVAILABLE_VOICES: readonly ["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"]; // Same ten values as the node's `voice` enum
+export declare const DEFAULT_TURN_DETECTION: { // Default turn-detection config object
+    type: "semantic_vad";
+};

package/dist/Realtime/constants.js ADDED Viewed

@@ -0,0 +1,10 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.DEFAULT_TURN_DETECTION = exports.AVAILABLE_VOICES = exports.DEFAULT_VOICE = exports.REALTIME_WS_URL = exports.REALTIME_MODEL_ID = void 0;
+exports.REALTIME_MODEL_ID = "gpt-realtime-2"; // Model identifier string
+exports.REALTIME_WS_URL = "wss://api.openai.com/v1/realtime"; // WebSocket endpoint URL
+exports.DEFAULT_VOICE = "alloy"; // Matches the `voice` default in the node's config schema
+exports.AVAILABLE_VOICES = ["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"]; // Same ten values as the node's `voice` enum
+exports.DEFAULT_TURN_DETECTION = { // Default turn-detection config object
+    type: "semantic_vad",
+};

package/dist/Realtime/node/executor.d.ts ADDED Viewed

@@ -0,0 +1,16 @@
+import { OpenAIRealtimeConfig } from "../../util/types";
+declare const CallbackNode: any; // Typed as `any` here; the compiled .js obtains it from getPlatformDependencies()
+interface RealtimeVoiceState { // State object threaded through handleEvent
+    isComplete: boolean; // Set to true once handleEvent has run the call or bailed out on a missing context
+}
+export default class RealtimeVoiceExecutor extends CallbackNode<OpenAIRealtimeConfig, RealtimeVoiceState> { // Executor for the "OpenAIRealtimeVoice" node
+    private logger;
+    constructor();
+    initializeState(_inputs: any): RealtimeVoiceState; // Returns { isComplete: false }; the argument is unused
+    handleEvent(event: { // Runs the voice stream for one event and resolves to the next state; rejects if the run fails
+        type: string;
+        inputs?: any;
+        config?: any; // NOTE(review): optional here, but the implementation reads config fields without a guard
+    }, state: RealtimeVoiceState, emit: (output: any) => void, context?: any): Promise<RealtimeVoiceState>;
+}
+export { RealtimeVoiceExecutor };

package/dist/Realtime/node/executor.js ADDED Viewed

@@ -0,0 +1,112 @@
+"use strict";
+var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { // Compiler-emitted helper: re-exposes property k of module m on o (as k2)
+    if (k2 === undefined) k2 = k;
+    var desc = Object.getOwnPropertyDescriptor(m, k);
+    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+      desc = { enumerable: true, get: function() { return m[k]; } };
+    }
+    Object.defineProperty(o, k2, desc);
+}) : (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    o[k2] = m[k];
+}));
+var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { // Compiler-emitted helper: sets o.default = v
+    Object.defineProperty(o, "default", { enumerable: true, value: v });
+}) : function(o, v) {
+    o["default"] = v;
+});
+var __importStar = (this && this.__importStar) || (function () { // Compiler-emitted helper: wraps a non-ES module in a namespace-like object
+    var ownKeys = function(o) {
+        ownKeys = Object.getOwnPropertyNames || function (o) {
+            var ar = [];
+            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+            return ar;
+        };
+        return ownKeys(o);
+    };
+    return function (mod) {
+        if (mod && mod.__esModule) return mod; // Already an ES module: returned unchanged
+        var result = {};
+        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
+        __setModuleDefault(result, mod);
+        return result;
+    };
+})();
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.RealtimeVoiceExecutor = void 0;
+const plugin_base_1 = require("@gravity-platform/plugin-base");
+const { CallbackNode } = (0, plugin_base_1.getPlatformDependencies)();
+class RealtimeVoiceExecutor extends CallbackNode { // Executor for the "OpenAIRealtimeVoice" node; delegates each run to RealtimeVoiceService
+    constructor() {
+        super("OpenAIRealtimeVoice"); // Same string as the node definition's `type` field
+        this.logger = (0, plugin_base_1.getPlatformDependencies)().createLogger("RealtimeVoiceExecutor");
+    }
+    initializeState(_inputs) { // Initial state: not complete; the inputs argument is unused
+        return { isComplete: false };
+    }
+    async handleEvent(event, state, emit, context) { // Resolves ids and config, runs the voice stream, returns the state marked complete; rethrows on failure
+        if (state.isComplete) // Already finished: later events are ignored
+            return state;
+        if (!context) { // Missing context: log and mark complete instead of throwing
+            this.logger.error("No execution context provided");
+            return { ...state, isComplete: true };
+        }
+        const { inputs, config } = event; // NOTE(review): config is optional in the .d.ts but its fields are read below without a guard
+        const startTime = Date.now();
+        this.logger.info("handleEvent called", { type: event.type, isComplete: state.isComplete });
+        try {
+            const pubContext = context.publishingContext || {};
+            const workflowVars = context.workflow?.variables || {};
+            const chatId = pubContext.chatId || workflowVars.chatId || ""; // Publishing context first, then workflow variables, else ""
+            const conversationId = pubContext.conversationId || workflowVars.conversationId || "";
+            const userId = pubContext.userId || workflowVars.userId || "";
+            const inputObj = inputs?.input;
+            const firstKey = inputObj ? Object.keys(inputObj)[0] : null; // Only the first key of inputs.input is consulted
+            const sourceData = firstKey ? inputObj[firstKey] : null;
+            const inputData = sourceData?.output || sourceData; // Uses `.output` when it is truthy, otherwise the value itself
+            const action = inputData?.metadata?.action; // Control signal, compared against "END_CALL" below
+            const metadata = {
+                workflowId: context.workflowId || context.workflow?.id || "",
+                executionId: context.executionId,
+                nodeId: context.nodeId,
+                chatId,
+                conversationId,
+                userId,
+                providerId: "OpenAI Realtime Voice",
+            };
+            const { RealtimeVoiceService } = await Promise.resolve().then(() => __importStar(require("../service"))); // Service module is required at call time, not at module load
+            const service = new RealtimeVoiceService();
+            const initialRequest = typeof config.initialRequest === "string" // Strings pass through; other values go through toString(); missing values become ""
+                ? config.initialRequest
+                : config.initialRequest?.toString?.() || ""; // NOTE(review): a plain object would become "[object Object]" — confirm the schema's "object" type is intended
+            this.logger.info("Config resolved", {
+                hasInitialRequest: !!initialRequest,
+                initialRequestType: typeof config.initialRequest,
+                initialRequestValue: initialRequest?.slice(0, 50), // Only the first 50 characters are logged
+            });
+            const stats = await service.generateVoiceStream({ // NOTE(review): `stats` is not read after this call
+                systemPrompt: config.systemPrompt,
+                conversationHistory: config.conversationHistory,
+                initialRequest,
+                voice: config.voice,
+                turnDetection: config.turnDetection || "semantic_vad", // Falls back to "semantic_vad" when unset
+                maxResponseOutputTokens: config.maxResponseOutputTokens,
+                redisChannel: config.redisChannel,
+                controlSignal: action === "END_CALL" ? "END_CALL" : "START_CALL", // Anything other than "END_CALL" is treated as "START_CALL"
+            }, metadata, context, emit);
+            this.logger.info(`RealtimeVoice turn complete in ${Date.now() - startTime}ms`, { action });
+            return { ...state, isComplete: true };
+        }
+        catch (err) {
+            this.logger.error("RealtimeVoice execution FAILED", {
+                error: err.message,
+                stack: err.stack?.split("\n").slice(0, 3).join(" | ") // First three stack lines, joined onto one line
+            });
+            // Rethrow so the engine emits NODE_ERROR — swallowing here makes a failed
+            // call indistinguishable from a completed one downstream
+            throw err;
+        }
+    }
+}
+exports.default = RealtimeVoiceExecutor;
+exports.RealtimeVoiceExecutor = RealtimeVoiceExecutor;

package/dist/Realtime/node/index.d.ts ADDED Viewed

@@ -0,0 +1,7 @@
+import { type EnhancedNodeDefinition } from "@gravity-platform/plugin-base";
+import RealtimeVoiceExecutor from "./executor";
+export declare function createNodeDefinition(): EnhancedNodeDefinition; // Builds the "OpenAIRealtimeVoice" node definition
+export declare const RealtimeVoiceNode: { // Pairs the node definition with its executor class
+    readonly definition: any; // Getter in the compiled .js; calls createNodeDefinition() on each access
+    executor: typeof RealtimeVoiceExecutor;
+};

package/dist/Realtime/node/index.js ADDED Viewed

@@ -0,0 +1,146 @@
+"use strict";
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.RealtimeVoiceNode = void 0;
+exports.createNodeDefinition = createNodeDefinition;
+const plugin_base_1 = require("@gravity-platform/plugin-base");
+const executor_1 = __importDefault(require("./executor"));
+function createNodeDefinition() { // Builds the node definition: metadata, ports, config schema, service connectors and credentials
+    const { NodeInputType, AI_RESULT_CHANNEL, SYSTEM_CHANNEL } = (0, plugin_base_1.getPlatformDependencies)(); // Port types and channel names are supplied by the platform at call time
+    return {
+        packageVersion: "1.0.0", // NOTE(review): this listing is for package version 1.0.1 — confirm whether this value should track it
+        type: "OpenAIRealtimeVoice", // Same string the executor passes to super()
+        isService: false,
+        name: "OpenAI Realtime Voice",
+        description: "Real-time voice conversation with gpt-realtime-2 via WebSocket",
+        whenToUse: "Live WebSocket voice conversation with OpenAI gpt-realtime; audio chunks publish to a Redis channel and it can call MCP tools via service edges. Alternatives: XAIGrokVoice (xAI) or AWSNovaSpeech (AWS). For plain text-to-speech use ElevenLabs.",
+        configExample: {
+            systemPrompt: "You are a helpful voice assistant. Keep responses short and conversational.",
+            voice: "marin",
+            turnDetection: "semantic_vad",
+            maxResponseOutputTokens: 4096,
+            redisChannel: AI_RESULT_CHANNEL,
+        },
+        category: "AI",
+        color: "#10A37F",
+        logoUrl: "https://res.cloudinary.com/sonik/image/upload/v1749262616/gravity/icons/ChatGPT-Logo.svg.webp",
+        inputs: [
+            {
+                name: "input",
+                type: NodeInputType.ANY,
+                description: "Input data",
+            },
+        ],
+        outputs: [
+            {
+                name: "text",
+                type: NodeInputType.OBJECT,
+                description: "Conversation object with query (user transcription) and response (assistant text)",
+            },
+            {
+                name: "conversation",
+                type: NodeInputType.OBJECT,
+                description: "Combined conversation object with user and assistant messages",
+            },
+            {
+                name: "mcpResult",
+                type: NodeInputType.OBJECT,
+                description: "MCP tool results",
+            },
+            {
+                name: "progress",
+                type: NodeInputType.STRING,
+                description: "Real-time progress log of tool calls and conversation turns",
+            },
+        ],
+        configSchema: {
+            type: "object",
+            properties: {
+                systemPrompt: {
+                    type: "string",
+                    title: "System Prompt",
+                    description: "System instructions for the model. Supports template syntax like {{signal.<sourceNodeId>.<outputHandle>.<field>}}.",
+                    default: "",
+                    "ui:field": "template",
+                },
+                conversationHistory: {
+                    type: "object",
+                    title: "Conversation History",
+                    description: "JSON array of conversation history",
+                    "ui:field": "template",
+                },
+                initialRequest: {
+                    type: "object", // NOTE(review): described as text and listed as a string in the README — confirm "object" is intended
+                    title: "Initial Request",
+                    description: "Text sent as user message at call start — the model responds immediately",
+                    "ui:field": "template",
+                },
+                voice: {
+                    type: "string",
+                    title: "Voice",
+                    description: "Select the voice for speech generation",
+                    enum: ["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"],
+                    enumNames: ["Alloy", "Ash", "Ballad", "Coral", "Echo", "Sage", "Shimmer", "Verse", "Marin", "Cedar"], // Display labels, in the same order as `enum`
+                    default: "alloy",
+                },
+                turnDetection: {
+                    type: "string",
+                    title: "Turn Detection",
+                    description: "How the model detects end of user speech",
+                    enum: ["semantic_vad", "server_vad", "disabled"],
+                    enumNames: ["Semantic VAD (recommended)", "Server VAD (threshold-based)", "Disabled"],
+                    default: "semantic_vad",
+                },
+                maxResponseOutputTokens: {
+                    type: "number",
+                    title: "Max Response Tokens",
+                    description: "Maximum tokens in assistant response (inf for unlimited)", // NOTE(review): the field type is "number" — confirm how "inf" is meant to be supplied
+                    default: 4096,
+                },
+                redisChannel: {
+                    type: "string",
+                    title: "Redis Channel",
+                    description: "Redis channel to publish audio chunks to",
+                    enum: [AI_RESULT_CHANNEL, SYSTEM_CHANNEL],
+                    enumNames: ["AI Results", "System Messages"],
+                    default: AI_RESULT_CHANNEL,
+                },
+            },
+            required: ["voice", "redisChannel"], // turnDetection is not required; the executor falls back to "semantic_vad"
+            "ui:order": [
+                "systemPrompt",
+                "conversationHistory",
+                "initialRequest",
+                "voice",
+                "turnDetection",
+                "maxResponseOutputTokens",
+                "redisChannel",
+            ],
+        },
+        capabilities: { isTrigger: false },
+        serviceConnectors: [
+            {
+                name: "mcpService",
+                description: "MCP service connector — automatic schema discovery",
+                serviceType: "mcp",
+                isService: false,
+            },
+        ],
+        credentials: [
+            {
+                name: "openaiCredential",
+                required: true,
+                displayName: "OpenAI Credentials",
+                description: "OpenAI API key for Realtime API",
+            },
+        ],
+    };
+}
+exports.RealtimeVoiceNode = { // Pairs the node definition with its executor class
+    get definition() { // Rebuilt on every access rather than cached, so platform dependencies are read at access time
+        return createNodeDefinition();
+    },
+    executor: executor_1.default,
+};

package/dist/Realtime/service/core/orchestration/SessionOrchestrator.d.ts ADDED Viewed

@@ -0,0 +1,9 @@
+import { OpenAIRealtimeConfig, StreamUsageStats, StreamingMetadata } from "../../../../util/types";
+export declare class SessionOrchestrator { // Declaration only; method bodies live in the compiled .js, outside this listing
+    private readonly logger;
+    private sessionManager;
+    orchestrateSession(config: OpenAIRealtimeConfig, metadata: StreamingMetadata, context: any, emit?: (output: any) => void): Promise<StreamUsageStats>; // Only public method; `emit` is optional
+    private handleStartCall; // presumably handles the START_CALL control signal — confirm against the implementation
+    private handleEndCall; // presumably handles the END_CALL control signal — confirm against the implementation
+    private getCredentials;
+}