@juspay/neurolink 9.72.0 → 9.73.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/dist/browser/neurolink.min.js +51 -51
- package/dist/lib/types/livekit.d.ts +134 -0
- package/dist/lib/voice/livekit/brain.js +1 -1
- package/dist/lib/voice/livekit/config.d.ts +12 -1
- package/dist/lib/voice/livekit/config.js +54 -0
- package/dist/lib/voice/livekit/eventBridge.js +4 -4
- package/dist/lib/voice/livekit/index.d.ts +9 -2
- package/dist/lib/voice/livekit/index.js +9 -2
- package/dist/lib/voice/livekit/realtimeEventBridge.d.ts +14 -0
- package/dist/lib/voice/livekit/realtimeEventBridge.js +161 -0
- package/dist/lib/voice/livekit/realtimeMcpTools.d.ts +31 -0
- package/dist/lib/voice/livekit/realtimeMcpTools.js +194 -0
- package/dist/lib/voice/livekit/realtimeVoiceAgent.d.ts +26 -0
- package/dist/lib/voice/livekit/realtimeVoiceAgent.js +362 -0
- package/dist/lib/voice/livekit/roomContext.d.ts +23 -0
- package/dist/lib/voice/livekit/roomContext.js +57 -0
- package/dist/lib/voice/livekit/roomDispatch.d.ts +24 -0
- package/dist/lib/voice/livekit/roomDispatch.js +31 -0
- package/dist/lib/voice/livekit/schemaSanitizer.d.ts +26 -0
- package/dist/lib/voice/livekit/schemaSanitizer.js +144 -0
- package/dist/lib/voice/livekit/vertexAuth.d.ts +30 -0
- package/dist/lib/voice/livekit/vertexAuth.js +73 -0
- package/dist/lib/voice/livekit/voiceAgent.js +47 -37
- package/dist/lib/voice/livekit/voiceAgentWorker.d.ts +2 -0
- package/dist/lib/voice/livekit/voiceAgentWorker.js +64 -0
- package/dist/types/livekit.d.ts +134 -0
- package/dist/voice/livekit/brain.js +1 -1
- package/dist/voice/livekit/config.d.ts +12 -1
- package/dist/voice/livekit/config.js +54 -0
- package/dist/voice/livekit/eventBridge.js +4 -4
- package/dist/voice/livekit/index.d.ts +9 -2
- package/dist/voice/livekit/index.js +9 -2
- package/dist/voice/livekit/realtimeEventBridge.d.ts +14 -0
- package/dist/voice/livekit/realtimeEventBridge.js +160 -0
- package/dist/voice/livekit/realtimeMcpTools.d.ts +31 -0
- package/dist/voice/livekit/realtimeMcpTools.js +193 -0
- package/dist/voice/livekit/realtimeVoiceAgent.d.ts +26 -0
- package/dist/voice/livekit/realtimeVoiceAgent.js +361 -0
- package/dist/voice/livekit/roomContext.d.ts +23 -0
- package/dist/voice/livekit/roomContext.js +56 -0
- package/dist/voice/livekit/roomDispatch.d.ts +24 -0
- package/dist/voice/livekit/roomDispatch.js +30 -0
- package/dist/voice/livekit/schemaSanitizer.d.ts +26 -0
- package/dist/voice/livekit/schemaSanitizer.js +143 -0
- package/dist/voice/livekit/vertexAuth.d.ts +30 -0
- package/dist/voice/livekit/vertexAuth.js +72 -0
- package/dist/voice/livekit/voiceAgent.js +47 -37
- package/dist/voice/livekit/voiceAgentWorker.d.ts +2 -0
- package/dist/voice/livekit/voiceAgentWorker.js +64 -0
- package/package.json +2 -1
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MCP tools → Gemini function tools bridge (realtime mode).
|
|
3
|
+
*
|
|
4
|
+
* In speech-to-speech mode Gemini owns the tool loop, so an external MCP
|
|
5
|
+
* server's tools are registered as LiveKit/Gemini function tools rather than
|
|
6
|
+
* NeuroLink tools. This connects to the MCP server as the calling user
|
|
7
|
+
* (`x-auth-token` + base64 `x-context`), sanitizes each tool's schema into the
|
|
8
|
+
* Gemini-safe subset, wires WRITE-labeled (destructive) tools through a HITL
|
|
9
|
+
* confirmation, and forwards tool start/result events to the browser bridge.
|
|
10
|
+
*
|
|
11
|
+
* `@livekit/agents` and `@modelcontextprotocol/sdk` are imported dynamically so
|
|
12
|
+
* the core package does not require them unless the realtime agent is used.
|
|
13
|
+
*
|
|
14
|
+
* See docs/features/livekit-voice-agent.md.
|
|
15
|
+
*/
|
|
16
|
+
import { z } from "zod";
|
|
17
|
+
import { logger } from "../../utils/logger.js";
|
|
18
|
+
import { findSchemaIssue, sanitizeToolParameters } from "./schemaSanitizer.js";
|
|
19
|
+
/**
 * Schema for the parts of an MCP tool result that this module reads.
 *
 * The result comes from an external MCP server over the network, so it is
 * validated instead of being trusted. Only the optional error flag and the
 * optional list of content parts (each with an optional `type` and `text`)
 * are described here.
 */
const toolResultContentPartSchema = z.object({
    type: z.string().optional(),
    text: z.string().optional(),
});
const toolResultSchema = z.object({
    isError: z.boolean().optional(),
    content: z.array(toolResultContentPartSchema).optional(),
});
|
|
30
|
+
/**
 * Collapse an MCP tool result into one plain-text string for the model.
 *
 * Only parts whose `type` is `"text"` and whose `text` is non-empty are kept;
 * they are joined with newlines and trimmed.
 *
 * @param {unknown} result - Raw value returned by the MCP `callTool` request.
 * @returns {string} The joined text; `"Tool error: …"` when the result is
 *   flagged as an error; `"Tool returned no content."` when the result does
 *   not match the expected shape or carries no text.
 */
export function mcpResultToText(result) {
    const noContent = "Tool returned no content.";
    const parsed = toolResultSchema.safeParse(result);
    if (!parsed.success) {
        return noContent;
    }
    const { content = [], isError } = parsed.data;
    const pieces = [];
    for (const part of content) {
        if (part.type === "text" && part.text) {
            pieces.push(part.text);
        }
    }
    const text = pieces.join("\n").trim();
    if (isError) {
        return `Tool error: ${text || "unknown error"}`;
    }
    return text || noContent;
}
|
|
46
|
+
/**
 * Build the function-result text handed back to Gemini when the user rejects
 * a confirmation-gated (HITL) tool call.
 *
 * The wording deliberately limits the rejection to the current attempt: a
 * "never retry" phrasing made the model refuse the tool permanently, even
 * when the user asked for it again later.
 *
 * @param {string} toolName - Name of the tool the user declined.
 * @returns {string} Instruction text for the model.
 */
function buildHitlDeclineMessage(toolName) {
    const sentences = [
        `The user reviewed the request to run "${toolName}" and chose to reject it, so it was NOT executed this time.`,
        `Do not run it right now.`,
        `Briefly let the user know you didn't proceed because they declined, and ask if there's anything else.`,
        `If the user later asks for "${toolName}" again or says to go ahead, you may run it then — this rejection applies only to this request, not to all future requests.`,
    ];
    return sentences.join(" ");
}
|
|
58
|
+
/**
 * Register a tool handler under the name variants the model might call:
 *   1. the Gemini-safe full name (every character outside `[a-zA-Z0-9_]`
 *      becomes `_`; Gemini rewrites names itself, so the rewritten form must
 *      be registered or the round-trip fails)
 *   2. the bare short name — the text after the last `_` (the model often
 *      drops the server prefix)
 *
 * First registration wins, so a unique full name is never shadowed by a
 * generic short name registered later.
 *
 * "Already taken" is decided on the map's OWN properties only. A prototype
 * chain check (`in`) would treat names such as `constructor` or `toString` as
 * taken on a plain object and silently drop the tool.
 *
 * @param {Record<string, unknown>} toolContext - Tool map, mutated in place.
 * @param {string} mcpToolName - Tool name as reported by the MCP server.
 * @param {unknown} handler - Handler stored under each registered alias.
 * @returns {string[]} The aliases actually registered, in registration order.
 */
function registerToolAliases(toolContext, mcpToolName, handler) {
    const safeFullName = mcpToolName.replace(/[^a-zA-Z0-9_]/g, "_");
    // With no underscore, lastIndexOf yields -1 and slice(0) returns the full name.
    const shortName = safeFullName.slice(safeFullName.lastIndexOf("_") + 1);
    const candidates = [...new Set([safeFullName, shortName])].filter((name) => name.length > 0);
    const registered = [];
    for (const alias of candidates) {
        // Assigning to `__proto__` would replace the map's prototype instead of
        // adding a key, so that name is never registered.
        if (alias === "__proto__") {
            continue;
        }
        if (Object.hasOwn(toolContext, alias)) {
            continue; // taken by another tool — don't clobber
        }
        toolContext[alias] = handler;
        registered.push(alias);
    }
    return registered;
}
|
|
82
|
+
/**
 * Connect to the MCP server on behalf of the calling user and expose each of
 * its tools as a LiveKit function tool.
 *
 * The server is expected to scope the tool list by `x-context`, so no
 * client-side filtering is applied. Tools whose sanitized schema still has an
 * issue are skipped and logged. Tools annotated with `destructiveHint === true`
 * ask for confirmation before they run. Tool start/result events are forwarded
 * through `publishEvent`.
 *
 * @param {object} params
 * @param {string} params.mcpUrl - MCP server URL.
 * @param {string} params.authToken - Sent as the `x-auth-token` header.
 * @param {string} params.xContext - Sent as the `x-context` header.
 * @param {Function} params.publishEvent - Called as `publishEvent(type, payload)`
 *   with `"tool-start"` and `"tool-result"`.
 * @param {Function} params.requestConfirmation - Called as
 *   `requestConfirmation(toolName, args)`; a falsy resolution declines the call.
 * @returns {Promise<{ tools: object, client: object }>} The tool map and the
 *   connected client (the caller closes it on shutdown).
 * @throws Rethrows any failure; when the failure happens after the connection
 *   was established, the client is closed first.
 */
export async function buildRealtimeMcpTools(params) {
    const { mcpUrl, authToken, xContext, publishEvent, requestConfirmation } = params;
    const { llm } = await import("@livekit/agents");
    const { Client: McpClient } = await import("@modelcontextprotocol/sdk/client/index.js");
    const { StreamableHTTPClientTransport } = await import("@modelcontextprotocol/sdk/client/streamableHttp.js");
    const requestInit = {
        headers: { "x-auth-token": authToken, "x-context": xContext },
    };
    const transport = new StreamableHTTPClientTransport(new URL(mcpUrl), { requestInit });
    const clientInfo = { name: "neurolink-realtime-voice", version: "1.0.0" };
    const client = new McpClient(clientInfo, { capabilities: {} });
    logger.info("realtime.mcp.connecting", {
        mcpUrl,
        hasAuthToken: authToken !== "",
        hasContext: xContext !== "",
    });
    await client.connect(transport);

    const messageOf = (err) => (err instanceof Error ? err.message : String(err));

    // Builds the `execute` callback for one tool: optional HITL gate, then the
    // MCP call, with start/result events published around it.
    const createExecutor = (toolName, needsConfirmation) => async (args) => {
        if (needsConfirmation) {
            const approved = await requestConfirmation(toolName, args ?? {});
            if (!approved) {
                logger.info("realtime.tool.hitlRejected", { tool: toolName });
                return buildHitlDeclineMessage(toolName);
            }
            logger.info("realtime.tool.hitlApproved", { tool: toolName });
        }
        logger.info("realtime.tool.invoke", { tool: toolName, args: args ?? {} });
        publishEvent("tool-start", { name: toolName });
        const startedAt = Date.now();
        try {
            const result = await client.callTool({ name: toolName, arguments: args ?? {} });
            const text = mcpResultToText(result);
            logger.info("realtime.tool.result", {
                tool: toolName,
                isError: result.isError === true,
                chars: text.length,
                ms: Date.now() - startedAt,
            });
            // The RAW result is forwarded so the client can extract a
            // chart/UIComponent and clear the in-progress indicator.
            publishEvent("tool-result", { name: toolName, result });
            return text;
        }
        catch (error) {
            logger.error("realtime.tool.error", {
                tool: toolName,
                ms: Date.now() - startedAt,
                error: messageOf(error),
            });
            publishEvent("tool-result", { name: toolName, result: null });
            return `Tool ${toolName} failed: ${String(error)}`;
        }
    };

    try {
        const listing = await client.listTools();
        const mcpTools = listing.tools;
        logger.info("realtime.mcp.toolsListed", { count: mcpTools.length });
        const toolContext = {};
        const skipped = [];
        for (const mcpTool of mcpTools) {
            const parameters = sanitizeToolParameters(mcpTool.inputSchema);
            const issue = findSchemaIssue(parameters);
            if (issue) {
                skipped.push(mcpTool.name);
                logger.warn("realtime.mcp.toolSkipped", {
                    tool: mcpTool.name,
                    issue,
                    rawInputSchema: JSON.stringify(mcpTool.inputSchema),
                });
            }
            else {
                const requiresConfirmation = mcpTool.annotations?.destructiveHint === true;
                const handler = llm.tool({
                    description: mcpTool.description ?? mcpTool.name,
                    parameters,
                    execute: createExecutor(mcpTool.name, requiresConfirmation),
                });
                const aliases = registerToolAliases(toolContext, mcpTool.name, handler);
                logger.info("realtime.mcp.toolRegistered", {
                    tool: mcpTool.name,
                    aliases,
                    hitl: requiresConfirmation,
                });
            }
        }
        logger.info("realtime.mcp.connected", {
            registered: Object.keys(toolContext).length,
            skipped: skipped.length,
            skippedTools: skipped,
        });
        return { tools: toolContext, client };
    }
    catch (error) {
        // Release the connection before surfacing the original failure.
        try {
            await client.close();
        }
        catch (closeError) {
            logger.warn("realtime.mcp.closeFailed", {
                error: messageOf(closeError),
            });
        }
        throw error;
    }
}
|
|
194
|
+
//# sourceMappingURL=realtimeMcpTools.js.map
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LiveKit Agents agent definition — realtime (Gemini Live speech-to-speech).
|
|
3
|
+
*
|
|
4
|
+
* `defineRealtimeVoiceAgent` returns the agent object placed as the default
|
|
5
|
+
* export of a worker entry file. Unlike the cascaded `defineVoiceAgent`
|
|
6
|
+
* (Silero VAD → STT → NeuroLink → TTS), here a single realtime model (Gemini
|
|
7
|
+
* Live on Vertex) does STT + reasoning + TTS + turn detection over one connection.
|
|
8
|
+
*
|
|
9
|
+
* `@livekit/agents`, `@livekit/agents-plugin-google`, `@livekit/rtc-node`, and
|
|
10
|
+
* `@google/genai` are imported dynamically so the core package does not require
|
|
11
|
+
* them unless the realtime agent is used. Type-only imports are erased at build.
|
|
12
|
+
*
|
|
13
|
+
* See docs/features/livekit-voice-agent.md.
|
|
14
|
+
*/
|
|
15
|
+
import type { JobContext } from "@livekit/agents";
|
|
16
|
+
import type { RealtimeVoiceAgentConfig } from "../../types/index.js";
|
|
17
|
+
/**
 * Build the agent object for a realtime (Gemini Live speech-to-speech)
 * LiveKit voice agent.
 *
 * The returned object is meant to be the default export of the worker entry
 * file, launched with `startRealtimeVoiceAgentWorker`. When `config` is
 * omitted, every setting is resolved from the environment (see
 * `resolveRealtimeVoiceConfig`).
 *
 * @param config - Optional overrides for the environment-resolved settings.
 * @returns An object whose `entry` function runs one job for a `JobContext`.
 */
export declare function defineRealtimeVoiceAgent(
    config?: RealtimeVoiceAgentConfig,
): {
    entry: (ctx: JobContext) => Promise<void>;
};
|
|
@@ -0,0 +1,362 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LiveKit Agents agent definition — realtime (Gemini Live speech-to-speech).
|
|
3
|
+
*
|
|
4
|
+
* `defineRealtimeVoiceAgent` returns the agent object placed as the default
|
|
5
|
+
* export of a worker entry file. Unlike the cascaded `defineVoiceAgent`
|
|
6
|
+
* (Silero VAD → STT → NeuroLink → TTS), here a single realtime model (Gemini
|
|
7
|
+
* Live on Vertex) does STT + reasoning + TTS + turn detection over one connection.
|
|
8
|
+
*
|
|
9
|
+
* `@livekit/agents`, `@livekit/agents-plugin-google`, `@livekit/rtc-node`, and
|
|
10
|
+
* `@google/genai` are imported dynamically so the core package does not require
|
|
11
|
+
* them unless the realtime agent is used. Type-only imports are erased at build.
|
|
12
|
+
*
|
|
13
|
+
* See docs/features/livekit-voice-agent.md.
|
|
14
|
+
*/
|
|
15
|
+
import { z } from "zod";
|
|
16
|
+
import { logger } from "../../utils/logger.js";
|
|
17
|
+
import { resolveRealtimeVoiceConfig } from "./config.js";
|
|
18
|
+
import { ensureVertexAdc, clearGeminiApiKeyEnv } from "./vertexAuth.js";
|
|
19
|
+
import { readCallContextFromRoom } from "./roomContext.js";
|
|
20
|
+
import { attachRealtimeEventBridge } from "./realtimeEventBridge.js";
|
|
21
|
+
import { buildRealtimeMcpTools } from "./realtimeMcpTools.js";
|
|
22
|
+
/** Log levels accepted on a forwarded logger event. */
const realtimeLogLevels = ["debug", "info", "warn", "error"];
/**
 * Shape of a logger `log-event` payload that may be handed to a caller's
 * `onLog` sink: level, message, numeric timestamp and optional extra data.
 */
const realtimeLogEventSchema = z.object({
    level: z.enum(realtimeLogLevels),
    message: z.string(),
    timestamp: z.number(),
    data: z.unknown().optional(),
});
|
|
28
|
+
/**
 * Install the per-call job lifecycle: shut down when the caller leaves, log
 * connection transitions, run the empty-room / join-deadline watchdog, and reap
 * the worker (parent) + this job (child) on shutdown.
 *
 * In `connect` mode (`LK_REALTIME_CONNECT_MODE === "true"`) the worker does NOT
 * exit when a job shuts down, so the child must SIGTERM its parent and
 * hard-exit itself.
 *
 * @param {object} ctx - LiveKit job context; uses `room`, `shutdown` and
 *   `addShutdownCallback`.
 * @param {object} cfg - Resolved config; reads `emptyRoomGraceMs` and
 *   `joinDeadlineMs` (milliseconds).
 * @returns {Promise<void>} Resolves once all handlers and timers are installed.
 */
async function installRealtimeJobLifecycle(ctx, cfg) {
    const { RoomEvent } = await import("@livekit/rtc-node");
    const joinedAt = Date.now();
    ctx.room.on(RoomEvent.ParticipantDisconnected, () => {
        if (ctx.room.remoteParticipants.size === 0) {
            logger.info("realtime.room.participantLeft", {
                remotes: 0,
                action: "shutdown",
            });
            ctx.shutdown("participant left");
        }
    });
    // Event payloads are arbitrary SDK objects. JSON.stringify throws on
    // circular references and BigInt values, and a throw inside a room-event
    // handler would abort it — for `Disconnected`, before ctx.shutdown runs.
    const describeArgs = (args) => {
        try {
            return JSON.stringify(args).slice(0, 200);
        }
        catch {
            return "[unserializable]";
        }
    };
    const logConn = (label) => (...args) => {
        if (!logger.shouldLog("debug")) {
            return;
        }
        logger.debug("realtime.room.connection", {
            event: label,
            remotes: ctx.room.remoteParticipants.size,
            ...(args.length ? { detail: describeArgs(args) } : {}),
        });
    };
    ctx.room.on(RoomEvent.Disconnected, (...args) => {
        logConn("Disconnected")(...args);
        ctx.shutdown("room disconnected");
    });
    ctx.room.on(RoomEvent.Reconnecting, logConn("Reconnecting"));
    ctx.room.on(RoomEvent.Reconnected, logConn("Reconnected"));
    ctx.room.on(RoomEvent.ConnectionStateChanged, logConn("ConnectionStateChanged"));
    ctx.room.on(RoomEvent.ConnectionQualityChanged, logConn("ConnectionQualityChanged"));
    // Watchdog, polled every 5 s:
    //  - once a participant has been seen, shut down after the room has been
    //    empty for `emptyRoomGraceMs`;
    //  - if nobody ever joined, shut down `joinDeadlineMs` after this job joined.
    let sawParticipant = ctx.room.remoteParticipants.size > 0;
    let emptySince = null;
    const emptyRoomWatchdog = setInterval(() => {
        const remotes = ctx.room.remoteParticipants.size;
        if (remotes > 0) {
            sawParticipant = true;
            emptySince = null;
            return;
        }
        if (sawParticipant) {
            emptySince ??= Date.now();
            if (Date.now() - emptySince >= cfg.emptyRoomGraceMs) {
                logger.info("realtime.watchdog.emptyRoom", {
                    graceMs: cfg.emptyRoomGraceMs,
                    action: "shutdown",
                });
                ctx.shutdown("empty-room watchdog");
            }
        }
        else if (Date.now() - joinedAt >= cfg.joinDeadlineMs) {
            logger.info("realtime.watchdog.joinDeadline", {
                joinDeadlineMs: cfg.joinDeadlineMs,
                action: "shutdown",
            });
            ctx.shutdown("join-deadline watchdog");
        }
    }, 5000);
    // The timer alone must not keep the process alive.
    emptyRoomWatchdog.unref?.();
    ctx.addShutdownCallback(async () => {
        clearInterval(emptyRoomWatchdog);
    });
    if (process.env.LK_REALTIME_CONNECT_MODE === "true") {
        ctx.addShutdownCallback(async () => {
            const parentPid = process.ppid;
            logger.info("realtime.reap.parent", {
                mode: "connect",
                jobPid: process.pid,
                parentPid,
            });
            // Never signal pid 0/1.
            if (typeof parentPid === "number" && parentPid > 1) {
                try {
                    process.kill(parentPid, "SIGTERM");
                }
                catch {
                    logger.debug("realtime.reap.parentGone", { parentPid });
                }
            }
            // Hard-exit this job one second later.
            setTimeout(() => {
                process.exit(0);
            }, 1000);
        });
    }
}
|
|
119
|
+
/**
 * Layer caller overrides on top of the environment-resolved realtime config.
 *
 * A nullish override falls back to the environment value. `tools.enabled` and
 * `tools.mcpUrl` map onto the flat `toolsEnabled` / `mcpUrl` fields.
 *
 * @param {object} overrides - Caller-supplied config (must be an object).
 * @returns {object} The merged config.
 */
function resolveConfig(overrides) {
    const base = resolveRealtimeVoiceConfig();
    const merged = { ...base };
    const directKeys = [
        "project",
        "location",
        "model",
        "voice",
        "responseModality",
        "systemPrompt",
        "greeting",
    ];
    for (const key of directKeys) {
        merged[key] = overrides[key] ?? base[key];
    }
    merged.toolsEnabled = overrides.tools?.enabled ?? base.toolsEnabled;
    merged.mcpUrl = overrides.tools?.mcpUrl ?? base.mcpUrl;
    return merged;
}
|
|
135
|
+
/**
 * Route decoded logger `log-event` payloads to the caller's `onLog` sink, and
 * detach the emitter again when the job shuts down.
 */
function forwardLogsToSink(ctx, onLog) {
    const room = ctx.room.name ?? "unknown";
    logger.setEventEmitter({
        emit: (event, ...args) => {
            if (event !== "log-event") {
                return true;
            }
            const parsed = realtimeLogEventSchema.safeParse(args[0]);
            if (!parsed.success) {
                return true;
            }
            try {
                onLog(parsed.data, { room });
            }
            catch {
                /* a log sink must never break the call */
            }
            return true;
        },
    });
    ctx.addShutdownCallback(async () => {
        logger.clearEventEmitter();
    });
}
/**
 * Bridge the MCP tools for this call. Resolves to `undefined` when tools are
 * disabled, the room metadata carries no context, or the MCP setup fails (the
 * call then runs without tools).
 */
async function loadRealtimeTools(ctx, cfg, bridge) {
    if (!cfg.toolsEnabled) {
        return undefined;
    }
    const { authToken, xContext } = readCallContextFromRoom(ctx.room.metadata);
    if (xContext === "") {
        logger.warn("realtime.mcp.noContext", {
            reason: "room-metadata-missing-context",
            action: "running-without-tools",
        });
        return undefined;
    }
    let tools;
    try {
        const toolset = await buildRealtimeMcpTools({
            mcpUrl: cfg.mcpUrl,
            authToken,
            xContext,
            publishEvent: bridge.publishEvent,
            requestConfirmation: bridge.requestConfirmation,
        });
        tools = toolset.tools;
        logger.info("realtime.mcp.enabled", {
            mcpUrl: cfg.mcpUrl,
            toolCount: Object.keys(toolset.tools).length,
            hasAuthToken: authToken !== "",
        });
        ctx.addShutdownCallback(async () => {
            await toolset.client.close();
        });
    }
    catch (error) {
        logger.error("realtime.mcp.setupFailed", {
            mcpUrl: cfg.mcpUrl,
            error: error instanceof Error ? error.message : String(error),
            action: "running-without-tools",
        });
    }
    return tools;
}
/**
 * Assemble the realtime model options. Optional fields are included only
 * when the config provides a usable value.
 */
function buildRealtimeModelOptions(cfg, Modality) {
    const modality = Object.values(Modality).find((value) => value === cfg.responseModality);
    return {
        vertexai: true,
        model: cfg.model,
        // Emit text transcripts of BOTH sides; LiveKit forwards them to the room.
        inputAudioTranscription: {},
        outputAudioTranscription: {},
        ...(modality !== undefined ? { modalities: [modality] } : {}),
        ...(cfg.project ? { project: cfg.project } : {}),
        ...(cfg.location ? { location: cfg.location } : {}),
        ...(cfg.voice ? { voice: cfg.voice } : {}),
    };
}
/**
 * Read one branch of the assistant transcript stream and publish each
 * non-empty chunk to the browser bridge as a `text` delta. Best effort: read
 * failures end the forwarding silently.
 */
async function streamTranscriptToBridge(stream, bridge, sinceConnect) {
    const reader = stream.getReader();
    let firstChunkSeen = false;
    try {
        while (true) {
            const { done, value } = await reader.read();
            if (done) {
                break;
            }
            const chunk = typeof value === "string" ? value : (value.text ?? "");
            if (!chunk) {
                continue;
            }
            if (!firstChunkSeen) {
                firstChunkSeen = true;
                logger.debug("realtime.session.assistantFirstChunk", {
                    ms: sinceConnect(),
                    preview: chunk.slice(0, 30),
                });
            }
            bridge.publishEvent("text", { delta: chunk });
        }
    }
    catch {
        /* best-effort live streaming */
    }
}
/** Log session events and mirror them to the browser bridge. */
function bindSessionEvents(session, voice, bridge, sinceConnect) {
    const events = voice.AgentSessionEventTypes;
    session.on(events.MetricsCollected, (ev) => {
        const metrics = ev.metrics;
        if (metrics.type !== "realtime_model_metrics") {
            return;
        }
        const inAudio = metrics.inputTokenDetails.audioTokens;
        const outAudio = metrics.outputTokenDetails.audioTokens;
        logger.info("realtime.usage", {
            inputTokens: metrics.inputTokens,
            inputAudioTokens: inAudio,
            inputTextTokens: Math.max(0, metrics.inputTokens - inAudio),
            outputTokens: metrics.outputTokens,
            outputAudioTokens: outAudio,
            outputTextTokens: Math.max(0, metrics.outputTokens - outAudio),
        });
    });
    // --- Session events → browser ------------------------------------------
    session.on(events.UserInputTranscribed, (ev) => {
        logger.debug("realtime.session.userTranscript", {
            ms: sinceConnect(),
            final: ev.isFinal,
            transcript: ev.transcript,
        });
        bridge.publishEvent("user-text", {
            text: ev.transcript,
            final: ev.isFinal,
        });
    });
    session.on(events.AgentStateChanged, (ev) => {
        logger.debug("realtime.session.stateChanged", {
            ms: sinceConnect(),
            from: ev.oldState,
            to: ev.newState,
        });
        bridge.publishEvent("status", { state: ev.newState });
    });
    session.on(events.ConversationItemAdded, (ev) => {
        const { item } = ev;
        const isAssistantText = item.type === "message" && item.role === "assistant" && item.textContent;
        if (isAssistantText) {
            logger.debug("realtime.session.turnDone", { ms: sinceConnect() });
            bridge.publishEvent("done", {});
        }
    });
    session.on(events.Error, (ev) => {
        logger.error("realtime.session.error", {
            ms: sinceConnect(),
            error: ev.error,
        });
        bridge.publishEvent("status", { state: "error" });
    });
}
/**
 * Define a realtime (Gemini Live S2S) LiveKit voice agent.
 *
 * The returned object is meant to be the default export of the worker entry
 * file, launched with `startRealtimeVoiceAgentWorker`. With no `config`,
 * everything is resolved from the environment (see
 * `resolveRealtimeVoiceConfig`).
 *
 * @param {object} [config] - Optional overrides; also reads `eventsTopic`,
 *   `controlTopic` and `onLog`.
 * @returns {{ entry: (ctx: object) => Promise<void> }} The agent definition.
 */
export function defineRealtimeVoiceAgent(config = {}) {
    const cfg = resolveConfig(config);
    const { eventsTopic, controlTopic, onLog } = config;
    async function entry(ctx) {
        // The Gemini Live WS authenticates to Vertex via ADC; materialise it and
        // force ADC (not an API key) auth before any realtime connection.
        ensureVertexAdc();
        clearGeminiApiKeyEnv();
        logger.info("realtime.config", {
            model: cfg.model,
            location: cfg.location,
            project: cfg.project ?? null,
            voice: cfg.voice ?? null,
            modality: cfg.responseModality,
            toolsEnabled: cfg.toolsEnabled,
            mcpUrl: cfg.mcpUrl,
        });
        await ctx.connect();
        const connectedAt = Date.now();
        const sinceConnect = () => Date.now() - connectedAt;
        if (onLog) {
            forwardLogsToSink(ctx, onLog);
        }
        logger.info("realtime.room.joined", {
            room: ctx.room.name ?? "unknown",
            model: cfg.model,
            location: cfg.location,
            ms: sinceConnect(),
        });
        await installRealtimeJobLifecycle(ctx, cfg);
        const { voice } = await import("@livekit/agents");
        const google = await import("@livekit/agents-plugin-google");
        const { Modality } = await import("@google/genai");
        const bridgeOptions = { room: ctx.room, hitlTimeoutMs: cfg.hitlTimeoutMs };
        if (eventsTopic !== undefined) {
            bridgeOptions.eventsTopic = eventsTopic;
        }
        if (controlTopic !== undefined) {
            bridgeOptions.controlTopic = controlTopic;
        }
        const bridge = await attachRealtimeEventBridge(bridgeOptions);
        ctx.addShutdownCallback(async () => {
            bridge.dispose();
        });
        const agentTools = await loadRealtimeTools(ctx, cfg, bridge);
        const modelOptions = buildRealtimeModelOptions(cfg, Modality);
        const session = new voice.AgentSession({
            llm: new google.realtime.RealtimeModel(modelOptions),
        });
        const agent = new voice.Agent({
            instructions: cfg.systemPrompt,
            ...(agentTools ? { tools: agentTools } : {}),
        });
        // Split the assistant transcript: one branch streams to the browser,
        // the other continues downstream unchanged.
        agent.transcriptionNode = async (textStream, _modelSettings) => {
            const [forUi, forDownstream] = textStream.tee();
            void streamTranscriptToBridge(forUi, bridge, sinceConnect);
            return forDownstream;
        };
        bindSessionEvents(session, voice, bridge, sinceConnect);
        await session.start({ agent, room: ctx.room });
        logger.info("realtime.session.start", { ms: sinceConnect() });
        bridge.publishEvent("status", { state: "listening" });
        if (cfg.greeting.trim().length > 0) {
            logger.info("realtime.session.greeting", { ms: sinceConnect() });
            session.generateReply({ instructions: cfg.greeting });
        }
    }
    return { entry };
}
|
|
362
|
+
//# sourceMappingURL=realtimeVoiceAgent.js.map
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Per-call context from LiveKit room metadata.
|
|
3
|
+
*
|
|
4
|
+
* The manager (e.g. a Lighthouse `/start` endpoint) pre-creates the room with
|
|
5
|
+
* `base64(JSON({ authToken, mcpContext }))` metadata, built from the caller's
|
|
6
|
+
* session. The worker reads it on join — nothing per-call comes from worker env.
|
|
7
|
+
* Returns the MCP `x-auth-token` and the base64(JSON) `x-context` the server
|
|
8
|
+
* expects.
|
|
9
|
+
*
|
|
10
|
+
* The metadata is untrusted input, so it is decoded with a zod schema rather
|
|
11
|
+
* than a trusted `JSON.parse` cast.
|
|
12
|
+
*
|
|
13
|
+
* See docs/features/livekit-voice-agent.md.
|
|
14
|
+
*/
|
|
15
|
+
import type { LiveKitRoomCallContext } from "../../types/index.js";
|
|
16
|
+
/**
 * Read the per-call `{ authToken, xContext }` pair from a room's
 * base64(JSON) metadata.
 *
 * `authToken` may be an empty string (demo/guest, where the MCP server gates
 * on the context's `demoMode`). `xContext` is `mcpContext` re-encoded as
 * base64(JSON), or `""` when no context was supplied or the metadata is
 * invalid.
 *
 * @param roomMetadata - Raw metadata string of the room, if any.
 */
export declare function readCallContextFromRoom(
    roomMetadata: string | undefined,
): LiveKitRoomCallContext;
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Per-call context from LiveKit room metadata.
|
|
3
|
+
*
|
|
4
|
+
* The manager (e.g. a Lighthouse `/start` endpoint) pre-creates the room with
|
|
5
|
+
* `base64(JSON({ authToken, mcpContext }))` metadata, built from the caller's
|
|
6
|
+
* session. The worker reads it on join — nothing per-call comes from worker env.
|
|
7
|
+
* Returns the MCP `x-auth-token` and the base64(JSON) `x-context` the server
|
|
8
|
+
* expects.
|
|
9
|
+
*
|
|
10
|
+
* The metadata is untrusted input, so it is decoded with a zod schema rather
|
|
11
|
+
* than a trusted `JSON.parse` cast.
|
|
12
|
+
*
|
|
13
|
+
* See docs/features/livekit-voice-agent.md.
|
|
14
|
+
*/
|
|
15
|
+
import { Buffer } from "node:buffer";
|
|
16
|
+
import { z } from "zod";
|
|
17
|
+
import { logger } from "../../utils/logger.js";
|
|
18
|
+
/**
 * Shape of the decoded room metadata written by the manager. Both fields are
 * optional; `mcpContext` is treated as opaque and accepted as-is.
 */
const roomMetadataShape = {
    authToken: z.string().optional(),
    mcpContext: z.unknown().optional(),
};
const roomMetadataSchema = z.object(roomMetadataShape);
|
|
23
|
+
/**
 * Decode a base64 string and parse the resulting UTF-8 text as JSON.
 *
 * @param {string} encoded - base64(JSON) text.
 * @returns {unknown} The parsed value, or `undefined` (after logging an
 *   error) when decoding or parsing fails.
 */
function decodeBase64Json(encoded) {
    let parsed;
    try {
        const jsonText = Buffer.from(encoded, "base64").toString("utf-8");
        parsed = JSON.parse(jsonText);
    }
    catch (error) {
        logger.error(`[RealtimeVoiceAgent] room metadata is not valid base64 JSON: ${String(error)}`);
        return undefined;
    }
    return parsed;
}
|
|
33
|
+
/**
 * Read the per-call `{ authToken, xContext }` pair from a room's
 * base64(JSON) metadata.
 *
 * `authToken` may be an empty string (demo/guest, where the MCP server gates
 * on the context's `demoMode`). `xContext` is `mcpContext` re-encoded as
 * base64(JSON), or `""` when no context was supplied or the metadata is
 * invalid.
 *
 * @param {string | undefined} roomMetadata - Raw room metadata.
 * @returns {{ authToken: string, xContext: string }} Both fields are `""`
 *   when the metadata is missing or does not match the expected shape.
 */
export function readCallContextFromRoom(roomMetadata) {
    if (!roomMetadata) {
        logger.warn("[RealtimeVoiceAgent] room has no metadata — MCP auth/context unavailable.");
        return { authToken: "", xContext: "" };
    }
    const parsed = roomMetadataSchema.safeParse(decodeBase64Json(roomMetadata));
    if (!parsed.success) {
        logger.error(`[RealtimeVoiceAgent] room metadata has unexpected shape: ${parsed.error.message}`);
        return { authToken: "", xContext: "" };
    }
    const { authToken = "", mcpContext } = parsed.data;
    let xContext = "";
    if (mcpContext !== undefined && mcpContext !== null) {
        xContext = Buffer.from(JSON.stringify(mcpContext), "utf-8").toString("base64");
    }
    return { authToken, xContext };
}
|
|
57
|
+
//# sourceMappingURL=roomContext.js.map
|