@gravity-platform/openai-realtime 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +104 -0
- package/dist/Realtime/constants.d.ts +7 -0
- package/dist/Realtime/constants.js +10 -0
- package/dist/Realtime/node/executor.d.ts +16 -0
- package/dist/Realtime/node/executor.js +112 -0
- package/dist/Realtime/node/index.d.ts +7 -0
- package/dist/Realtime/node/index.js +146 -0
- package/dist/Realtime/service/core/orchestration/SessionOrchestrator.d.ts +9 -0
- package/dist/Realtime/service/core/orchestration/SessionOrchestrator.js +333 -0
- package/dist/Realtime/service/core/processing/AudioHandler.d.ts +17 -0
- package/dist/Realtime/service/core/processing/AudioHandler.js +57 -0
- package/dist/Realtime/service/core/processing/ResponseProcessor.d.ts +34 -0
- package/dist/Realtime/service/core/processing/ResponseProcessor.js +199 -0
- package/dist/Realtime/service/core/processing/TextAccumulator.d.ts +19 -0
- package/dist/Realtime/service/core/processing/TextAccumulator.js +75 -0
- package/dist/Realtime/service/core/processing/UsageStatsCollector.d.ts +14 -0
- package/dist/Realtime/service/core/processing/UsageStatsCollector.js +52 -0
- package/dist/Realtime/service/core/streaming/RealtimeSessionRegistry.d.ts +10 -0
- package/dist/Realtime/service/core/streaming/RealtimeSessionRegistry.js +32 -0
- package/dist/Realtime/service/core/streaming/SessionManager.d.ts +14 -0
- package/dist/Realtime/service/core/streaming/SessionManager.js +33 -0
- package/dist/Realtime/service/core/streaming/WsClient.d.ts +11 -0
- package/dist/Realtime/service/core/streaming/WsClient.js +93 -0
- package/dist/Realtime/service/index.d.ts +6 -0
- package/dist/Realtime/service/index.js +13 -0
- package/dist/Realtime/service/io/events/incoming/builders/AudioAppendBuilder.d.ts +4 -0
- package/dist/Realtime/service/io/events/incoming/builders/AudioAppendBuilder.js +15 -0
- package/dist/Realtime/service/io/events/incoming/builders/ConversationItemBuilder.d.ts +5 -0
- package/dist/Realtime/service/io/events/incoming/builders/ConversationItemBuilder.js +36 -0
- package/dist/Realtime/service/io/events/incoming/builders/ResponseCreateBuilder.d.ts +3 -0
- package/dist/Realtime/service/io/events/incoming/builders/ResponseCreateBuilder.js +9 -0
- package/dist/Realtime/service/io/events/incoming/builders/SessionUpdateBuilder.d.ts +4 -0
- package/dist/Realtime/service/io/events/incoming/builders/SessionUpdateBuilder.js +61 -0
- package/dist/Realtime/service/io/publishers/WebSocketAudioPublisher.d.ts +28 -0
- package/dist/Realtime/service/io/publishers/WebSocketAudioPublisher.js +101 -0
- package/dist/Realtime/service/io/websocket/RealtimeWebSocketAudioSubscriber.d.ts +13 -0
- package/dist/Realtime/service/io/websocket/RealtimeWebSocketAudioSubscriber.js +94 -0
- package/dist/credentials/index.d.ts +14 -0
- package/dist/credentials/index.js +19 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +54 -0
- package/dist/util/types.d.ts +40 -0
- package/dist/util/types.js +2 -0
- package/package.json +58 -0
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.TextAccumulator = void 0;
|
|
4
|
+
const plugin_base_1 = require("@gravity-platform/plugin-base");
|
|
5
|
+
// Resolve the logger on demand instead of at module load: module-level
// platform calls cause startup freezes (docs-starter/nodes/CLAUDE.md rule 5).
function getLogger() {
    const platform = (0, plugin_base_1.getPlatformDependencies)();
    return platform.createLogger("RealtimeTextAccumulator");
}
|
|
10
|
+
/**
 * Tracks the text of a realtime session turn by turn: the current user
 * transcript, the current assistant response, the list of completed turns and
 * a running "Q/A" progress log. Results are pushed to the optional `emit`
 * callback as `{ __outputs: ... }` payloads.
 */
class TextAccumulator {
    /**
     * @param sessionId Identifier attached to this accumulator's log entries.
     * @param emit Optional callback receiving progress and final payloads.
     */
    constructor(sessionId, emit) {
        this.sessionId = sessionId;
        this.emit = emit;
        this.transcription = "";
        this.assistantResponse = "";
        this.allTurns = [];
        this.progressLog = "";
        this.logger = getLogger();
    }
    /** Append a fragment to the current assistant response. */
    appendAssistant(text) {
        this.assistantResponse = this.assistantResponse + text;
    }
    /** Replace the current assistant response wholesale. */
    setAssistantText(text) {
        this.assistantResponse = text;
    }
    /** Replace the current user transcript. */
    setUserTranscript(transcript) {
        this.transcription = transcript;
    }
    /**
     * Append `text` to the progress log and, when an emitter was supplied,
     * emit the whole log accumulated so far (not just the new fragment).
     */
    emitProgress(text) {
        this.progressLog = this.progressLog + text;
        if (this.emit !== null && this.emit !== undefined) {
            this.emit({ __outputs: { progress: this.progressLog } });
        }
    }
    /**
     * Record the current turn. A turn with neither transcript nor response is
     * only logged; otherwise it is stored and its non-empty parts are added to
     * the progress log as `Q<n>:` / `A<n>:` lines. The current turn's text is
     * left in place; `resetTurn` clears it.
     */
    emitConversation() {
        const query = this.transcription;
        const response = this.assistantResponse;
        this.logger.debug("Turn complete", {
            sessionId: this.sessionId,
            queryLength: query.length,
            responseLength: response.length,
        });
        if (!query && !response) {
            return;
        }
        this.allTurns.push({ query, response });
        const turnNum = this.allTurns.length;
        const lines = [];
        if (query) {
            lines.push(`Q${turnNum}: ${query}\n`);
        }
        if (response) {
            lines.push(`A${turnNum}: ${response}\n`);
        }
        this.emitProgress(lines.join(""));
    }
    /**
     * Emit the most recently recorded turn as the `text` output. Does nothing
     * without an emitter or before any turn has been recorded.
     */
    emitFinal() {
        const turnCount = this.allTurns.length;
        if (!this.emit || turnCount === 0) {
            return;
        }
        const { query, response } = this.allTurns[turnCount - 1];
        this.logger.info("Emitting final conversation", {
            sessionId: this.sessionId,
            turns: turnCount,
        });
        this.emit({ __outputs: { text: { query, response } } });
    }
    /** Clear the current transcript and response; recorded turns are kept. */
    resetTurn() {
        this.transcription = "";
        this.assistantResponse = "";
    }
    /** @returns The current user transcript. */
    getTranscription() {
        return this.transcription;
    }
    /** @returns The current assistant response. */
    getAssistantResponse() {
        return this.assistantResponse;
    }
}
|
|
75
|
+
exports.TextAccumulator = TextAccumulator;
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import { StreamUsageStats } from "../../../../util/types";
|
|
2
|
+
/**
 * Collects token usage, a chunk counter and the text results of a realtime
 * session into a single `StreamUsageStats` record.
 */
export declare class UsageStatsCollector {
    private stats;
    /** Add one to the `chunk_count` statistic. */
    incrementChunkCount(): void;
    /**
     * Accumulate per-response usage. OpenAI bills each response with its own
     * input tokens (including context), so summing across the session's turns is
     * the accurate total — overwriting would record only the final turn.
     *
     * @param inputTokens Input tokens reported for one response.
     * @param outputTokens Output tokens reported for one response.
     */
    addUsage(inputTokens: number, outputTokens: number): void;
    /**
     * Store the user transcription and assistant response, and set
     * `textOutput` to the two strings concatenated in that order.
     */
    setTextResults(transcription: string, assistantResponse: string): void;
    /** @returns A shallow copy of the statistics collected so far. */
    getUsageStats(): StreamUsageStats;
    /** Return every statistic to its initial zero / empty value. */
    reset(): void;
}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.UsageStatsCollector = void 0;
|
|
4
|
+
/**
 * Build the zeroed statistics record shared by a new and a reset collector,
 * so the two can never drift apart.
 */
function createEmptyUsageStats() {
    return {
        estimated: false,
        total_tokens: 0,
        inputTokens: 0,
        outputTokens: 0,
        chunk_count: 0,
        textOutput: "",
        transcription: "",
        assistantResponse: "",
    };
}
/**
 * Coerce a reported token count to a number that is safe to add to a running
 * total. Missing or non-numeric values count as 0: adding `undefined` or `NaN`
 * would otherwise turn every later total into `NaN`.
 */
function toTokenCount(value) {
    const count = Number(value);
    return Number.isFinite(count) ? count : 0;
}
/**
 * Collects token usage, a chunk counter and the text results of a session.
 */
class UsageStatsCollector {
    constructor() {
        this.stats = createEmptyUsageStats();
    }
    /** Add one to the `chunk_count` statistic. */
    incrementChunkCount() {
        this.stats.chunk_count++;
    }
    /**
     * Accumulate per-response usage. OpenAI bills each response with its own
     * input tokens (including context), so summing across the session's turns is
     * the accurate total — overwriting would record only the final turn.
     *
     * Values that are missing or not finite numbers are counted as 0 so a
     * single incomplete usage report cannot corrupt the session totals.
     *
     * @param inputTokens Input tokens reported for one response.
     * @param outputTokens Output tokens reported for one response.
     */
    addUsage(inputTokens, outputTokens) {
        this.stats.inputTokens += toTokenCount(inputTokens);
        this.stats.outputTokens += toTokenCount(outputTokens);
        this.stats.total_tokens = this.stats.inputTokens + this.stats.outputTokens;
        this.stats.estimated = false;
    }
    /**
     * Store the user transcription and assistant response, and set
     * `textOutput` to the two concatenated in that order. A `null` or
     * `undefined` argument is stored as an empty string rather than being
     * stringified into the output.
     */
    setTextResults(transcription, assistantResponse) {
        const query = transcription ?? "";
        const answer = assistantResponse ?? "";
        this.stats.transcription = query;
        this.stats.assistantResponse = answer;
        this.stats.textOutput = query + answer;
    }
    /** @returns A shallow copy, so callers cannot mutate the live statistics. */
    getUsageStats() {
        return { ...this.stats };
    }
    /** Return every statistic to its initial zero / empty value. */
    reset() {
        this.stats = createEmptyUsageStats();
    }
}
|
|
52
|
+
exports.UsageStatsCollector = UsageStatsCollector;
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import { WsClient } from "./WsClient";
|
|
2
|
+
/**
 * Lookup table from conversation id to the `WsClient` serving that
 * conversation.
 */
declare class RealtimeSessionRegistry {
    private sessions;
    /**
     * Associate `wsClient` with `conversationId`. If a different client is
     * already registered under that id it is closed first, so a duplicate
     * registration does not leave the earlier connection open.
     */
    register(conversationId: string, wsClient: WsClient): void;
    /** @returns The client registered for `conversationId`, if any. */
    get(conversationId: string): WsClient | undefined;
    /** Forget the entry for `conversationId`; the client itself is not closed. */
    remove(conversationId: string): void;
    /** Forget every entry; the clients themselves are not closed. */
    clear(): void;
}
|
|
9
|
+
export declare const realtimeSessionRegistry: RealtimeSessionRegistry;
|
|
10
|
+
export {};
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.realtimeSessionRegistry = void 0;
|
|
4
|
+
/**
 * Lookup table from conversation id to the WebSocket client serving it.
 */
class RealtimeSessionRegistry {
    constructor() {
        this.sessions = new Map();
    }
    /**
     * Register `wsClient` under `conversationId`, replacing any earlier entry.
     *
     * A duplicate START_CALL must not orphan the previous session: the old
     * client would become unreachable by END_CALL and stay open (and billed),
     * so a different, previously registered client is closed before it is
     * replaced. Errors from that close are ignored (it may already be closed).
     */
    register(conversationId, wsClient) {
        const previous = this.sessions.get(conversationId);
        const isStale = Boolean(previous) && previous !== wsClient;
        if (isStale) {
            try {
                previous.close();
            }
            catch {
                /* already closed */
            }
        }
        this.sessions.set(conversationId, wsClient);
    }
    /** @returns The client registered for `conversationId`, or `undefined`. */
    get(conversationId) {
        return this.sessions.get(conversationId);
    }
    /** Drop the entry for `conversationId` without closing its client. */
    remove(conversationId) {
        this.sessions.delete(conversationId);
    }
    /** Drop every entry without closing any client. */
    clear() {
        this.sessions.clear();
    }
}
|
|
32
|
+
exports.realtimeSessionRegistry = new RealtimeSessionRegistry();
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import { OpenAIRealtimeConfig, StreamingMetadata } from "../../../../util/types";
|
|
2
|
+
import { WsClient } from "./WsClient";
|
|
3
|
+
import { RealtimeResponseProcessor } from "../processing/ResponseProcessor";
|
|
4
|
+
/** Bookkeeping record that `SessionManager` keeps for one realtime session. */
export interface RealtimeSession {
    /** Key the session is stored under in the manager. */
    sessionId: string;
    /** `true` from creation until the session is ended. */
    isActive: boolean;
    /** WebSocket client associated with this session. */
    wsClient: WsClient;
    /** Processor whose `cleanup()` is invoked when the session ends. */
    responseProcessor: RealtimeResponseProcessor;
}
|
|
10
|
+
/** Creates, tracks and ends `RealtimeSession` records. */
export declare class SessionManager {
    private sessions;
    /**
     * Create and store an active session.
     *
     * The session id is `metadata.workflowId` when that is set, otherwise a
     * generated `session_<timestamp>` value. `_config` is accepted for
     * signature compatibility but not read.
     */
    createSession(_config: OpenAIRealtimeConfig, metadata: StreamingMetadata, responseProcessor: RealtimeResponseProcessor, wsClient: WsClient): RealtimeSession;
    /**
     * Mark the session inactive, run its response processor's cleanup and stop
     * tracking it. Unknown ids are ignored.
     */
    endSession(sessionId: string): void;
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.SessionManager = void 0;
|
|
4
|
+
const plugin_base_1 = require("@gravity-platform/plugin-base");
|
|
5
|
+
// Looked up on each call so that no platform call happens at module load.
function getLogger() {
    const platform = (0, plugin_base_1.getPlatformDependencies)();
    return platform.createLogger("RealtimeSessionManager");
}
|
|
8
|
+
/** Creates, tracks and ends realtime session records. */
class SessionManager {
    constructor() {
        this.sessions = new Map();
    }
    /**
     * Create an active session record and store it under its id.
     *
     * @param _config Accepted for signature compatibility; not read.
     * @param metadata Supplies `workflowId`, used as the session id when set;
     *   otherwise a `session_<timestamp>` id is generated.
     * @param responseProcessor Processor cleaned up when the session ends.
     * @param wsClient WebSocket client associated with the session.
     * @returns The stored session record.
     */
    createSession(_config, metadata, responseProcessor, wsClient) {
        const sessionId = metadata.workflowId || `session_${Date.now()}`;
        const session = {
            sessionId,
            isActive: true,
            wsClient,
            responseProcessor,
        };
        this.sessions.set(sessionId, session);
        return session;
    }
    /**
     * End a session: mark it inactive, clean up its response processor and
     * stop tracking it. Unknown ids are ignored.
     *
     * The record is removed even when `cleanup()` throws, so a failing
     * processor cannot leave a dead session in the map; the error still
     * propagates to the caller.
     */
    endSession(sessionId) {
        const session = this.sessions.get(sessionId);
        if (!session) {
            return;
        }
        session.isActive = false;
        try {
            session.responseProcessor.cleanup();
        }
        finally {
            this.sessions.delete(sessionId);
        }
        getLogger().info("Session ended", { sessionId });
    }
}
|
|
33
|
+
exports.SessionManager = SessionManager;
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
 * Wrapper around a single WebSocket connection to the OpenAI Realtime
 * endpoint that exchanges JSON events.
 */
export declare class WsClient {
    private ws;
    private logger;
    private messageHandler;
    /**
     * Open the connection, authenticating with `apiKey` as a bearer token.
     * Resolves once the socket is open; rejects if it errors before opening.
     */
    connect(apiKey: string): Promise<void>;
    /**
     * Serialize `event` as JSON and send it. When the socket is not open the
     * event is dropped and a warning is logged; nothing is thrown for that.
     */
    send(event: any): void;
    /**
     * Set the handler invoked with each parsed incoming event. Only one
     * handler is kept; a later call replaces the earlier one.
     */
    onMessage(handler: (event: any) => void): void;
    /** Close the socket, if any, and drop the reference to it. */
    close(): void;
    /**
     * Resolves when the current socket closes, or immediately when there is no
     * socket or it is not currently open.
     */
    waitForClose(): Promise<void>;
    /** Whether a socket exists and is in the OPEN state. */
    get isOpen(): boolean;
}
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// CommonJS/ES-module interop helper: ES-module namespaces pass through
// untouched, anything else is wrapped so it can be read as `.default`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.WsClient = void 0;
|
|
7
|
+
const ws_1 = __importDefault(require("ws"));
|
|
8
|
+
const plugin_base_1 = require("@gravity-platform/plugin-base");
|
|
9
|
+
const constants_1 = require("../../../constants");
|
|
10
|
+
// Looked up on each call so that no platform call happens at module load.
function getLogger() {
    const platform = (0, plugin_base_1.getPlatformDependencies)();
    return platform.createLogger("RealtimeWsClient");
}
|
|
13
|
+
/** Extract a loggable message from whatever value was thrown or emitted. */
function describeWsError(err) {
    return err instanceof Error ? err.message : String(err);
}
/**
 * Wrapper around a single `ws` WebSocket connection to the OpenAI Realtime
 * endpoint. Outgoing events are JSON-serialized; incoming frames are parsed
 * as JSON and passed to the handler registered with `onMessage`.
 */
class WsClient {
    constructor() {
        this.ws = null;
        this.logger = getLogger();
        this.messageHandler = null;
    }
    /**
     * Open a connection authenticated with `apiKey` as a bearer token.
     *
     * Resolves once the socket emits `open`. Rejects with the first `error`
     * emitted before that, or if the socket closes without ever opening.
     * Calling `connect` again on the same instance closes the previous socket
     * first so it is not leaked.
     *
     * @param apiKey OpenAI API key sent in the `Authorization` header.
     */
    async connect(apiKey) {
        if (this.ws) {
            this.close();
        }
        const url = `${constants_1.REALTIME_WS_URL}?model=${constants_1.REALTIME_MODEL_ID}`;
        // Handlers are bound to this local so they stay attached to the socket
        // they were created for even after `this.ws` is reset by close().
        const socket = new ws_1.default(url, {
            headers: {
                Authorization: `Bearer ${apiKey}`,
            },
        });
        this.ws = socket;
        return new Promise((resolve, reject) => {
            let settled = false;
            socket.once("open", () => {
                settled = true;
                this.logger.info("OpenAI Realtime WebSocket connected");
                resolve();
            });
            // Persistent handler: an unhandled 'error' event on the socket would crash
            // the process. Rejects the connect promise only during the connection phase.
            socket.on("error", (err) => {
                this.logger.error("WebSocket error", { error: describeWsError(err) });
                if (!settled) {
                    settled = true;
                    reject(err);
                }
            });
            socket.on("message", (data) => {
                let event;
                try {
                    event = JSON.parse(data.toString());
                }
                catch (err) {
                    this.logger.error("Failed to parse WebSocket message", { error: describeWsError(err) });
                    return;
                }
                if (!this.messageHandler) {
                    return;
                }
                // Kept separate from parsing so a failing handler is reported as
                // such instead of being mislabelled as a parse failure.
                try {
                    this.messageHandler(event);
                }
                catch (err) {
                    this.logger.error("WebSocket message handler failed", { error: describeWsError(err) });
                }
            });
            socket.on("close", (code, reason) => {
                this.logger.info("OpenAI Realtime WebSocket closed", { code, reason: reason.toString() });
                // Defensive: never leave the connect promise pending if the
                // socket closes before it opened and no error was reported.
                if (!settled) {
                    settled = true;
                    reject(new Error(`WebSocket closed before opening (code ${code})`));
                }
            });
        });
    }
    /**
     * Send `event` as JSON. If the socket is missing or not open the event is
     * dropped with a warning.
     */
    send(event) {
        if (!this.ws || this.ws.readyState !== ws_1.default.OPEN) {
            this.logger.warn("Cannot send — WebSocket not open");
            return;
        }
        this.ws.send(JSON.stringify(event));
    }
    /** Set (or replace) the handler called with each parsed incoming event. */
    onMessage(handler) {
        this.messageHandler = handler;
    }
    /** Close the socket, if any, and drop the reference to it. */
    close() {
        if (this.ws) {
            this.ws.close();
            this.ws = null;
        }
    }
    /**
     * Resolve when the current socket emits `close`. Resolves immediately when
     * there is no socket or it is not currently open.
     */
    waitForClose() {
        return new Promise((resolve) => {
            if (!this.ws || !this.isOpen) {
                resolve();
                return;
            }
            this.ws.once("close", () => resolve());
        });
    }
    /** Whether a socket exists and is in the OPEN state. */
    get isOpen() {
        return this.ws?.readyState === ws_1.default.OPEN;
    }
}
|
|
93
|
+
exports.WsClient = WsClient;
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import { OpenAIRealtimeConfig, StreamUsageStats, StreamingMetadata } from "../../util/types";
|
|
2
|
+
/** Entry point of the realtime voice service. */
export declare class RealtimeVoiceService {
    private orchestrator;
    /**
     * Run one realtime voice session by delegating to the session
     * orchestrator, passing all four arguments through unchanged.
     *
     * @param emit Optional callback handed to the orchestrator for outputs.
     * @returns The usage statistics produced by the orchestrator.
     */
    generateVoiceStream(config: OpenAIRealtimeConfig, metadata: StreamingMetadata, context: any, emit?: (output: any) => void): Promise<StreamUsageStats>;
}
|
|
6
|
+
export type { OpenAIRealtimeConfig, StreamUsageStats, StreamingMetadata };
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.RealtimeVoiceService = void 0;
|
|
4
|
+
const SessionOrchestrator_1 = require("./core/orchestration/SessionOrchestrator");
|
|
5
|
+
/** Entry point of the realtime voice service; owns one session orchestrator. */
class RealtimeVoiceService {
    constructor() {
        this.orchestrator = new SessionOrchestrator_1.SessionOrchestrator();
    }
    /**
     * Run one voice session by handing every argument to the orchestrator
     * unchanged and returning whatever it produces.
     */
    async generateVoiceStream(config, metadata, context, emit) {
        const { orchestrator } = this;
        return orchestrator.orchestrateSession(config, metadata, context, emit);
    }
}
|
|
13
|
+
exports.RealtimeVoiceService = RealtimeVoiceService;
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.AudioAppendBuilder = void 0;
|
|
4
|
+
/** Builds the client events that feed the realtime input audio buffer. */
class AudioAppendBuilder {
    /**
     * @param base64Audio Audio payload, passed through as the `audio` field.
     * @returns An `input_audio_buffer.append` event.
     */
    static build(base64Audio) {
        const appendEvent = {
            type: "input_audio_buffer.append",
            audio: base64Audio,
        };
        return appendEvent;
    }
    /** @returns An `input_audio_buffer.commit` event. */
    static buildCommit() {
        const commitEvent = { type: "input_audio_buffer.commit" };
        return commitEvent;
    }
}
|
|
15
|
+
exports.AudioAppendBuilder = AudioAppendBuilder;
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
/** Builds `conversation.item.create` events. */
export declare class ConversationItemBuilder {
    /** @returns An event adding a user message with one text content part. */
    static buildUserMessage(text: string): Record<string, unknown>;
    /** @returns An event adding an assistant message with one text content part. */
    static buildAssistantMessage(text: string): Record<string, unknown>;
    /**
     * @param callId Identifier of the function call being answered.
     * @param output Output of that call, passed through as given.
     * @returns An event adding a `function_call_output` item.
     */
    static buildFunctionCallOutput(callId: string, output: string): Record<string, unknown>;
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.ConversationItemBuilder = void 0;
|
|
4
|
+
/**
 * Build a `conversation.item.create` event for a message item that carries a
 * single text content part.
 */
function buildTextMessageEvent(role, contentType, text) {
    return {
        type: "conversation.item.create",
        item: {
            type: "message",
            role,
            content: [{ type: contentType, text }],
        },
    };
}
/** Builds `conversation.item.create` events. */
class ConversationItemBuilder {
    /**
     * @param text Text of the user message.
     * @returns An event adding a user message with one `input_text` part.
     */
    static buildUserMessage(text) {
        return buildTextMessageEvent("user", "input_text", text);
    }
    /**
     * @param text Text of the assistant message.
     * @returns An event adding an assistant message with one `output_text`
     *   part. This package uses the GA session shape (`type: "realtime"`),
     *   where assistant text content is typed `output_text`; the bare `text`
     *   type belongs to the earlier beta interface.
     */
    static buildAssistantMessage(text) {
        return buildTextMessageEvent("assistant", "output_text", text);
    }
    /**
     * @param callId Identifier of the function call being answered.
     * @param output Output of that call, passed through as given.
     * @returns An event adding a `function_call_output` item.
     */
    static buildFunctionCallOutput(callId, output) {
        return {
            type: "conversation.item.create",
            item: {
                type: "function_call_output",
                call_id: callId,
                output,
            },
        };
    }
}
|
|
36
|
+
exports.ConversationItemBuilder = ConversationItemBuilder;
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.ResponseCreateBuilder = void 0;
|
|
4
|
+
/** Builds the client event that asks the server to generate a response. */
class ResponseCreateBuilder {
    /** @returns A fresh `response.create` event with no further fields. */
    static build() {
        const createEvent = { type: "response.create" };
        return createEvent;
    }
}
|
|
9
|
+
exports.ResponseCreateBuilder = ResponseCreateBuilder;
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.SessionUpdateBuilder = void 0;
|
|
4
|
+
const constants_1 = require("../../../../../constants");
|
|
5
|
+
// Voice names accepted by normalizeVoice, in lower case.
const VALID_VOICES = new Set(["alloy", "ash", "ballad", "coral", "echo", "sage", "shimmer", "verse", "marin", "cedar"]);
/**
 * Map a configured voice name to one of VALID_VOICES.
 *
 * Matching ignores case and surrounding whitespace. Anything that is not a
 * string, is empty, or is not a known voice falls back to "alloy" instead of
 * throwing, so a malformed configuration value cannot break session setup.
 *
 * @param input Voice name from configuration; may be missing or malformed.
 * @returns A lower-case member of VALID_VOICES.
 */
function normalizeVoice(input) {
    const fallback = "alloy";
    if (typeof input !== "string") {
        return fallback;
    }
    const candidate = input.trim().toLowerCase();
    return VALID_VOICES.has(candidate) ? candidate : fallback;
}
|
|
12
|
+
/** Builds the `session.update` event that configures a realtime session. */
class SessionUpdateBuilder {
    /**
     * Translate the node configuration into a `session.update` event.
     *
     * Turn detection is `null` for "disabled", a server VAD block for
     * "server_vad" and a semantic VAD block for every other value.
     * `instructions`, `max_output_tokens` and `tools` / `tool_choice` are only
     * included when the corresponding configuration value is present.
     *
     * @param config Reads `turnDetection`, `voice`, `systemPrompt`,
     *   `maxResponseOutputTokens` and `tools`.
     * @returns `{ type: "session.update", session }`.
     */
    static build(config) {
        let turnDetection;
        switch (config.turnDetection) {
            case "disabled":
                turnDetection = null;
                break;
            case "server_vad":
                turnDetection = {
                    type: "server_vad",
                    create_response: true,
                    interrupt_response: true,
                    silence_duration_ms: 700,
                    prefix_padding_ms: 300,
                    threshold: 0.7,
                };
                break;
            default:
                turnDetection = {
                    type: "semantic_vad",
                    create_response: true,
                    interrupt_response: true,
                    eagerness: "high",
                };
        }
        const inputAudio = {
            format: { type: "audio/pcm", rate: 24000 },
            noise_reduction: { type: "near_field" },
            transcription: { model: "gpt-4o-mini-transcribe" },
            turn_detection: turnDetection,
        };
        const outputAudio = {
            format: { type: "audio/pcm", rate: 24000 },
            voice: normalizeVoice(config.voice),
        };
        const session = {
            type: "realtime",
            model: constants_1.REALTIME_MODEL_ID,
            output_modalities: ["audio"],
            audio: { input: inputAudio, output: outputAudio },
        };
        if (config.systemPrompt) {
            session.instructions = config.systemPrompt;
        }
        // Only `undefined` means "not configured"; any other value is forwarded.
        if (config.maxResponseOutputTokens !== undefined) {
            session.max_output_tokens = config.maxResponseOutputTokens;
        }
        const tools = config.tools;
        if (tools && tools.length > 0) {
            session.tools = tools;
            session.tool_choice = "auto";
        }
        return { type: "session.update", session };
    }
}
|
|
61
|
+
exports.SessionUpdateBuilder = SessionUpdateBuilder;
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import { StreamingMetadata, AudioState } from "../../../../util/types";
|
|
2
|
+
/**
 * Publishes session audio and audio-state messages to the platform's audio
 * WebSocket manager, buffering audio per conversation before sending.
 */
export declare class WebSocketAudioPublisher {
    private chunkBuffers;
    /**
     * Publish one audio message for a conversation.
     *
     * When `audioState` is `SPEECH_STARTED` or `SPEECH_ENDED` a control
     * message is sent instead of audio (and buffered audio is flushed on
     * `SPEECH_ENDED`). For any other state `audioData` is base64-decoded and
     * added to the conversation's buffer. Does nothing when the platform
     * provides no audio WebSocket manager.
     */
    publishAudio(config: {
        /** Base64-encoded audio payload. */
        audioData: string;
        format: string;
        sourceType: string;
        /** Conversation whose buffer and socket are used. */
        conversationId: string;
        /** Forwarded with control messages. */
        metadata: StreamingMetadata;
        /** Selects between the control-message path and the audio path. */
        audioState: AudioState;
        index: number;
    }): Promise<void>;
    private addToBuffer;
    private flushBuffer;
    /**
     * Send an audio-state control message. Buffered audio is flushed first when
     * the state is `SPEECH_ENDED`. Does nothing without a `conversationId` or
     * without an audio WebSocket manager.
     */
    publishState(config: {
        state: AudioState | string;
        conversationId?: string;
        metadata?: StreamingMetadata;
        message?: string;
        /** Merged over `metadata`; its keys win on conflict. */
        additionalMetadata?: Record<string, unknown>;
    }): Promise<void>;
    /**
     * Drop any buffered audio without sending it — used on user barge-in so the
     * client doesn't hear a trailing fragment of the interrupted response
     */
    discardBuffer(conversationId: string): void;
    /** Flush any buffered audio for the conversation, then drop its buffer. */
    cleanup(conversationId: string): Promise<void>;
}
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.WebSocketAudioPublisher = void 0;
|
|
4
|
+
const plugin_base_1 = require("@gravity-platform/plugin-base");
|
|
5
|
+
// Looked up on each call so that no platform call happens at module load.
function getLogger() {
    const platform = (0, plugin_base_1.getPlatformDependencies)();
    return platform.createLogger("RealtimeWebSocketAudioPublisher");
}
|
|
8
|
+
// Returns the platform's audio WebSocket manager, or undefined when the
// platform does not provide a getAudioWebSocketManager function.
function getAudioWSManager() {
    const platform = (0, plugin_base_1.getPlatformDependencies)();
    return platform.getAudioWebSocketManager?.();
}
|
|
11
|
+
// Buffered audio is sent as soon as it reaches this many bytes (32 KiB).
const TARGET_CHUNK_SIZE = 32 * 1024;
// Timer delay, in milliseconds, before a smaller buffer is sent anyway.
const MAX_BUFFER_DELAY = 50;
|
|
13
|
+
/**
 * Publishes session audio and audio-state messages to the platform's audio
 * WebSocket manager. Audio is collected per conversation and sent in larger
 * combined chunks.
 */
class WebSocketAudioPublisher {
    constructor() {
        // conversationId -> { chunks: Buffer[], totalSize: number, timer? }
        this.chunkBuffers = new Map();
    }
    /**
     * Publish one audio message for a conversation.
     *
     * `SPEECH_STARTED` / `SPEECH_ENDED` are sent as control messages (with a
     * flush of buffered audio on `SPEECH_ENDED`); any other state causes
     * `audioData` to be base64-decoded and buffered. Does nothing when the
     * platform provides no audio WebSocket manager.
     */
    async publishAudio(config) {
        const audioWSManager = getAudioWSManager();
        if (!audioWSManager)
            return;
        if (config.audioState === "SPEECH_STARTED" || config.audioState === "SPEECH_ENDED") {
            // NOTE(review): here the control message goes out before the final
            // flush, whereas publishState flushes first — confirm this ordering
            // is what the client expects.
            audioWSManager.sendControl(config.conversationId, {
                type: "audioState",
                state: config.audioState,
                metadata: config.metadata,
            });
            if (config.audioState === "SPEECH_ENDED") {
                await this.flushBuffer(config.conversationId);
            }
            return;
        }
        const audioBuffer = Buffer.from(config.audioData, "base64");
        this.addToBuffer(config.conversationId, audioBuffer);
    }
    /**
     * Append a chunk to the conversation's buffer and schedule sending: at
     * once when the buffer has reached TARGET_CHUNK_SIZE, otherwise after
     * MAX_BUFFER_DELAY ms.
     *
     * NOTE(review): the timer is restarted on every chunk, so the delay is
     * measured from the most recent chunk rather than from the oldest one.
     */
    addToBuffer(conversationId, chunk) {
        let buffer = this.chunkBuffers.get(conversationId);
        if (!buffer) {
            buffer = { chunks: [], totalSize: 0 };
            this.chunkBuffers.set(conversationId, buffer);
        }
        buffer.chunks.push(chunk);
        buffer.totalSize += chunk.length;
        if (buffer.timer)
            clearTimeout(buffer.timer);
        if (buffer.totalSize >= TARGET_CHUNK_SIZE) {
            this.flushInBackground(conversationId);
        }
        else {
            buffer.timer = setTimeout(() => this.flushInBackground(conversationId), MAX_BUFFER_DELAY);
        }
    }
    /**
     * Start a flush that nobody awaits. A failure is logged here because a
     * rejected promise with no handler would otherwise surface as an unhandled
     * rejection.
     */
    flushInBackground(conversationId) {
        this.flushBuffer(conversationId).catch((err) => {
            getLogger().error("Failed to flush buffered audio", {
                conversationId,
                error: err instanceof Error ? err.message : String(err),
            });
        });
    }
    /**
     * Send everything buffered for the conversation as one combined chunk and
     * reset the buffer. Does nothing when the buffer is empty or there is no
     * audio WebSocket manager.
     */
    async flushBuffer(conversationId) {
        const buffer = this.chunkBuffers.get(conversationId);
        if (!buffer || buffer.chunks.length === 0)
            return;
        const audioWSManager = getAudioWSManager();
        if (!audioWSManager)
            return;
        const combined = Buffer.concat(buffer.chunks);
        audioWSManager.sendAudio(conversationId, combined);
        buffer.chunks = [];
        buffer.totalSize = 0;
        if (buffer.timer) {
            clearTimeout(buffer.timer);
            buffer.timer = undefined;
        }
    }
    /**
     * Send an audio-state control message. Buffered audio is flushed first when
     * the state is `SPEECH_ENDED`. Does nothing without a conversation id or
     * without an audio WebSocket manager.
     */
    async publishState(config) {
        const audioWSManager = getAudioWSManager();
        if (!audioWSManager || !config.conversationId)
            return;
        if (config.state === "SPEECH_ENDED") {
            await this.flushBuffer(config.conversationId);
        }
        // additionalMetadata is spread last, so its keys win on conflict.
        const mergedMetadata = { ...(config.metadata || {}), ...(config.additionalMetadata || {}) };
        audioWSManager.sendControl(config.conversationId, {
            type: "AUDIO_STATE",
            state: config.state,
            message: config.message,
            metadata: mergedMetadata,
        });
    }
    /**
     * Drop any buffered audio without sending it — used on user barge-in so the
     * client doesn't hear a trailing fragment of the interrupted response
     */
    discardBuffer(conversationId) {
        const buffer = this.chunkBuffers.get(conversationId);
        if (buffer?.timer)
            clearTimeout(buffer.timer);
        this.chunkBuffers.delete(conversationId);
    }
    /** Flush any buffered audio for the conversation, then drop its buffer. */
    async cleanup(conversationId) {
        await this.flushBuffer(conversationId);
        const buffer = this.chunkBuffers.get(conversationId);
        if (buffer?.timer)
            clearTimeout(buffer.timer);
        this.chunkBuffers.delete(conversationId);
    }
}
|
|
101
|
+
exports.WebSocketAudioPublisher = WebSocketAudioPublisher;
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import { WsClient } from "../../core/streaming/WsClient";
|
|
2
|
+
/**
 * Singleton (private constructor, obtained through `getInstance`) that relates
 * WebSocket sessions to `WsClient` connections.
 *
 * NOTE(review): the implementation is not part of this listing; the member
 * descriptions below are inferred from the signatures — confirm against the
 * implementation before relying on them.
 */
export declare class RealtimeWebSocketAudioSubscriber {
    private sessions;
    private static instance;
    private logger;
    private constructor();
    /** @returns The subscriber instance. */
    static getInstance(): RealtimeWebSocketAudioSubscriber;
    private setupWebSocketHandlers;
    /** Presumably associates a WebSocket session and chat with `wsClient` — TODO confirm. */
    registerSession(wsSessionId: string, chatId: string, wsClient: WsClient): void;
    /** Presumably handles audio received for `sessionId` — TODO confirm. */
    handleAudioData(sessionId: string, audioData: ArrayBuffer): Promise<void>;
    /** Presumably handles a control message received for `sessionId` — TODO confirm. */
    handleControlMessage(sessionId: string, message: any): Promise<void>;
    /** Presumably removes the registration for `sessionId` — TODO confirm. */
    unregisterSession(sessionId: string): void;
}
|