@openclaw/voice-call 2026.5.2 → 2026.5.3-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/api.js +2 -0
- package/dist/call-status-CXldV5o8.js +32 -0
- package/dist/cli-metadata.js +12 -0
- package/dist/config-7w04YpHh.js +548 -0
- package/dist/config-compat-B0me39_4.js +129 -0
- package/dist/guarded-json-api-Btx5EE4w.js +591 -0
- package/dist/http-headers-BrnxBasF.js +10 -0
- package/dist/index.js +1284 -0
- package/dist/mock-CeKvfVEd.js +135 -0
- package/dist/plivo-B-a7KFoT.js +393 -0
- package/dist/realtime-handler-B63CIDP2.js +325 -0
- package/dist/realtime-transcription.runtime-B2h70y2W.js +2 -0
- package/dist/realtime-voice.runtime-Bkh4nvLn.js +2 -0
- package/dist/response-generator-BrcmwDZU.js +182 -0
- package/dist/response-model-CyF5K80p.js +12 -0
- package/dist/runtime-api.js +6 -0
- package/dist/runtime-entry-88ytYAQa.js +3119 -0
- package/dist/runtime-entry.js +2 -0
- package/dist/setup-api.js +37 -0
- package/dist/telnyx-jjBE8boz.js +260 -0
- package/dist/twilio-1OqbcXLL.js +676 -0
- package/dist/voice-mapping-BYDGdWGx.js +40 -0
- package/package.json +14 -6
- package/api.ts +0 -16
- package/cli-metadata.ts +0 -10
- package/config-api.ts +0 -12
- package/index.test.ts +0 -943
- package/index.ts +0 -794
- package/runtime-api.ts +0 -20
- package/runtime-entry.ts +0 -1
- package/setup-api.ts +0 -47
- package/src/allowlist.test.ts +0 -18
- package/src/allowlist.ts +0 -19
- package/src/cli.ts +0 -845
- package/src/config-compat.test.ts +0 -120
- package/src/config-compat.ts +0 -227
- package/src/config.test.ts +0 -479
- package/src/config.ts +0 -808
- package/src/core-bridge.ts +0 -14
- package/src/deep-merge.test.ts +0 -40
- package/src/deep-merge.ts +0 -23
- package/src/gateway-continue-operation.ts +0 -200
- package/src/http-headers.test.ts +0 -16
- package/src/http-headers.ts +0 -15
- package/src/manager/context.ts +0 -42
- package/src/manager/events.test.ts +0 -581
- package/src/manager/events.ts +0 -288
- package/src/manager/lifecycle.ts +0 -53
- package/src/manager/lookup.test.ts +0 -52
- package/src/manager/lookup.ts +0 -35
- package/src/manager/outbound.test.ts +0 -528
- package/src/manager/outbound.ts +0 -486
- package/src/manager/state.ts +0 -48
- package/src/manager/store.ts +0 -106
- package/src/manager/timers.test.ts +0 -129
- package/src/manager/timers.ts +0 -113
- package/src/manager/twiml.test.ts +0 -13
- package/src/manager/twiml.ts +0 -17
- package/src/manager.closed-loop.test.ts +0 -236
- package/src/manager.inbound-allowlist.test.ts +0 -188
- package/src/manager.notify.test.ts +0 -377
- package/src/manager.restore.test.ts +0 -183
- package/src/manager.test-harness.ts +0 -127
- package/src/manager.ts +0 -392
- package/src/media-stream.test.ts +0 -768
- package/src/media-stream.ts +0 -708
- package/src/providers/base.ts +0 -97
- package/src/providers/mock.test.ts +0 -78
- package/src/providers/mock.ts +0 -185
- package/src/providers/plivo.test.ts +0 -93
- package/src/providers/plivo.ts +0 -601
- package/src/providers/shared/call-status.test.ts +0 -24
- package/src/providers/shared/call-status.ts +0 -24
- package/src/providers/shared/guarded-json-api.test.ts +0 -106
- package/src/providers/shared/guarded-json-api.ts +0 -42
- package/src/providers/telnyx.test.ts +0 -340
- package/src/providers/telnyx.ts +0 -394
- package/src/providers/twilio/api.test.ts +0 -145
- package/src/providers/twilio/api.ts +0 -93
- package/src/providers/twilio/twiml-policy.test.ts +0 -84
- package/src/providers/twilio/twiml-policy.ts +0 -87
- package/src/providers/twilio/webhook.ts +0 -34
- package/src/providers/twilio.test.ts +0 -591
- package/src/providers/twilio.ts +0 -861
- package/src/providers/twilio.types.ts +0 -17
- package/src/realtime-defaults.ts +0 -3
- package/src/realtime-fast-context.test.ts +0 -88
- package/src/realtime-fast-context.ts +0 -165
- package/src/realtime-transcription.runtime.ts +0 -4
- package/src/realtime-voice.runtime.ts +0 -5
- package/src/response-generator.test.ts +0 -321
- package/src/response-generator.ts +0 -318
- package/src/response-model.test.ts +0 -71
- package/src/response-model.ts +0 -23
- package/src/runtime.test.ts +0 -536
- package/src/runtime.ts +0 -510
- package/src/telephony-audio.test.ts +0 -61
- package/src/telephony-audio.ts +0 -12
- package/src/telephony-tts.test.ts +0 -196
- package/src/telephony-tts.ts +0 -235
- package/src/test-fixtures.ts +0 -73
- package/src/tts-provider-voice.test.ts +0 -34
- package/src/tts-provider-voice.ts +0 -21
- package/src/tunnel.test.ts +0 -166
- package/src/tunnel.ts +0 -314
- package/src/types.ts +0 -291
- package/src/utils.test.ts +0 -17
- package/src/utils.ts +0 -14
- package/src/voice-mapping.test.ts +0 -34
- package/src/voice-mapping.ts +0 -68
- package/src/webhook/realtime-handler.test.ts +0 -598
- package/src/webhook/realtime-handler.ts +0 -485
- package/src/webhook/stale-call-reaper.test.ts +0 -88
- package/src/webhook/stale-call-reaper.ts +0 -38
- package/src/webhook/tailscale.test.ts +0 -214
- package/src/webhook/tailscale.ts +0 -129
- package/src/webhook-exposure.test.ts +0 -33
- package/src/webhook-exposure.ts +0 -84
- package/src/webhook-security.test.ts +0 -770
- package/src/webhook-security.ts +0 -994
- package/src/webhook.hangup-once.lifecycle.test.ts +0 -135
- package/src/webhook.test.ts +0 -1470
- package/src/webhook.ts +0 -908
- package/src/webhook.types.ts +0 -5
- package/src/websocket-test-support.ts +0 -72
- package/tsconfig.json +0 -16
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime";
|
|
2
|
+
import { REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME, buildRealtimeVoiceAgentConsultWorkingResponse, createRealtimeVoiceBridgeSession } from "openclaw/plugin-sdk/realtime-voice";
|
|
3
|
+
import { randomUUID } from "node:crypto";
|
|
4
|
+
import WebSocket, { WebSocketServer } from "ws";
|
|
5
|
+
//#region extensions/voice-call/src/webhook/realtime-handler.ts
|
|
6
|
+
const STREAM_TOKEN_TTL_MS = 3e4; // 30 000 ms; added to Date.now() to compute a stream token's expiry in issueStreamToken
|
|
7
|
+
const DEFAULT_HOST = "localhost:8443"; // last-resort host for the stream URL when neither a public origin nor a Host header is available
|
|
8
|
+
const MAX_REALTIME_MESSAGE_BYTES = 256 * 1024; // passed as `maxPayload` to the per-upgrade WebSocketServer
|
|
9
|
+
/**
 * Canonicalise a URL path for comparison and concatenation.
 *
 * Surrounding whitespace is removed, a leading "/" is guaranteed, and a single
 * trailing "/" is dropped unless the path is the root itself.
 *
 * @param {string} pathname Raw path, possibly empty or missing its leading slash.
 * @returns {string} Normalised path; "/" for blank input.
 */
function normalizePath(pathname) {
  const raw = pathname.trim();
  if (raw.length === 0) {
    return "/";
  }
  let result = raw;
  if (!result.startsWith("/")) {
    result = `/${result}`;
  }
  // Only one trailing slash is stripped, and never the root slash.
  if (result !== "/" && result.endsWith("/")) {
    result = result.slice(0, -1);
  }
  return result;
}
|
|
16
|
+
/**
 * Compose the instructions that make the model open the call with a greeting.
 *
 * @param {string | undefined} baseInstructions Existing instructions to keep in front, if any.
 * @param {string | undefined | null} greeting Greeting text the first reply should include.
 * @returns {string | undefined} Combined instructions, or `undefined` when the
 *   greeting is missing or blank.
 */
function buildGreetingInstructions(baseInstructions, greeting) {
  const spokenGreeting = greeting?.trim();
  if (!spokenGreeting) {
    return undefined;
  }
  const intro = "Start the call by greeting the caller naturally. Include this greeting in your first spoken reply:";
  const directive = `${intro} "${spokenGreeting}"`;
  if (!baseInstructions) {
    return directive;
  }
  return `${baseInstructions}\n\n${directive}`;
}
|
|
22
|
+
/**
 * Connects a telephony media-stream WebSocket to a realtime voice bridge session.
 *
 * Responsibilities visible in this class:
 * - builds the TwiML response that points the caller's media stream at a
 *   one-time, short-lived tokenised WebSocket URL;
 * - authenticates the WebSocket upgrade with that token;
 * - relays `start` / `media` / `mark` / `stop` stream messages to the bridge and
 *   bridge audio/marks back to the socket;
 * - mirrors call lifecycle and transcripts into the call manager;
 * - executes registered tool handlers on behalf of the realtime model.
 */
class RealtimeCallHandler {
  /**
   * @param config Realtime settings; this class reads `streamPath`, `instructions`,
   *   `tools` and `fastContext.enabled`.
   * @param manager Call manager; receives events through `processEvent` and is
   *   queried through `getCallByProviderCallId`.
   * @param provider Telephony provider; `hangupCall` is invoked when the bridge
   *   closes with reason "error".
   * @param realtimeProvider Forwarded to `createRealtimeVoiceBridgeSession` as `provider`.
   * @param providerConfig Forwarded to `createRealtimeVoiceBridgeSession` as `providerConfig`.
   * @param {string} servePath Local webhook path, used to derive the public path prefix.
   */
  constructor(config, manager, provider, realtimeProvider, providerConfig, servePath) {
    this.config = config;
    this.manager = manager;
    this.provider = provider;
    this.realtimeProvider = realtimeProvider;
    this.providerConfig = providerConfig;
    this.servePath = servePath;
    // tool name -> handler(args, callId, context)
    this.toolHandlers = new Map();
    // token -> { expiry, from?, to?, direction? }
    this.pendingStreamTokens = new Map();
    // Keyed by BOTH the manager call id and the provider call sid.
    this.activeBridgesByCallId = new Map();
    // Latest non-final user transcript per call, handed to tool handlers as context.
    this.partialUserTranscriptsByCallId = new Map();
    this.publicOrigin = null;
    this.publicPathPrefix = "";
  }

  /**
   * Record the externally reachable webhook URL.
   *
   * Stores the URL's host and whatever path segment precedes the serve path
   * (for example a reverse-proxy prefix). Both are reset when the URL cannot be
   * processed.
   *
   * @param {string} url Public webhook URL.
   */
  setPublicUrl(url) {
    try {
      const parsed = new URL(url);
      this.publicOrigin = parsed.host;
      const normalizedServePath = normalizePath(this.servePath);
      const normalizedPublicPath = normalizePath(parsed.pathname);
      const idx = normalizedPublicPath.indexOf(normalizedServePath);
      // idx === 0 means there is no prefix; idx === -1 means the serve path is absent.
      this.publicPathPrefix = idx > 0 ? normalizedPublicPath.slice(0, idx) : "";
    } catch {
      this.publicOrigin = null;
      this.publicPathPrefix = "";
    }
  }

  /**
   * @returns {string} Public path prefix followed by the normalised stream path
   *   (`config.streamPath`, defaulting to "/voice/stream/realtime").
   */
  getStreamPathPattern() {
    return `${this.publicPathPrefix}${normalizePath(this.config.streamPath ?? "/voice/stream/realtime")}`;
  }

  /**
   * Build the TwiML response that connects the call to the realtime stream.
   *
   * A fresh stream token carrying the caller metadata is issued and appended to
   * the stream URL as its last path segment.
   *
   * @param req Incoming HTTP request; only `headers.host` is read.
   * @param params Optional form parameters exposing `get(name)`; `From`, `To`
   *   and `Direction` are read.
   * @returns {{statusCode: number, headers: object, body: string}} Response payload.
   */
  buildTwiMLPayload(req, params) {
    const host = this.publicOrigin || req.headers.host || DEFAULT_HOST;
    const rawDirection = params?.get("Direction");
    const token = this.issueStreamToken({
      from: params?.get("From") ?? undefined,
      to: params?.get("To") ?? undefined,
      direction: rawDirection?.startsWith("outbound") ? "outbound" : "inbound",
    });
    // The Host header is request-controlled, so escape the URL before placing it
    // inside an XML attribute. Well-formed hosts and paths are left unchanged.
    const xmlEntities = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&apos;" };
    const streamUrl = `wss://${host}${this.getStreamPathPattern()}/${token}`.replace(
      /[&<>"']/g,
      (ch) => xmlEntities[ch],
    );
    return {
      statusCode: 200,
      headers: { "Content-Type": "text/xml" },
      body: `<?xml version="1.0" encoding="UTF-8"?>
<Response>
<Connect>
<Stream url="${streamUrl}" />
</Connect>
</Response>`,
    };
  }

  /**
   * Authenticate and accept a media-stream WebSocket upgrade.
   *
   * The last path segment of the request URL must be an unexpired, unused
   * stream token; otherwise the socket receives a 401 and is destroyed.
   *
   * @param request HTTP upgrade request.
   * @param socket Underlying network socket.
   * @param head First packet of the upgraded stream.
   */
  handleWebSocketUpgrade(request, socket, head) {
    const token = new URL(request.url ?? "/", "wss://localhost").pathname.split("/").pop() ?? null;
    const callerMeta = token ? this.consumeStreamToken(token) : null;
    if (!callerMeta) {
      socket.write("HTTP/1.1 401 Unauthorized\r\n\r\n");
      socket.destroy();
      return;
    }
    const upgradeServer = new WebSocketServer({
      noServer: true,
      maxPayload: MAX_REALTIME_MESSAGE_BYTES,
    });
    upgradeServer.handleUpgrade(request, socket, head, (ws) => {
      let bridge = null;
      let initialized = false;
      ws.on("message", (data) => {
        try {
          const msg = JSON.parse(data.toString());
          if (!initialized && msg.event === "start") {
            // Only the first "start" message is honoured.
            initialized = true;
            const startData = typeof msg.start === "object" && msg.start !== null ? msg.start : undefined;
            const streamSid = typeof startData?.streamSid === "string" ? startData.streamSid : "unknown";
            const callSid = typeof startData?.callSid === "string" ? startData.callSid : "unknown";
            const nextBridge = this.handleCall(streamSid, callSid, ws, callerMeta);
            if (!nextBridge) return;
            bridge = nextBridge;
            return;
          }
          if (!bridge) return;
          const mediaData = typeof msg.media === "object" && msg.media !== null ? msg.media : undefined;
          if (msg.event === "media" && typeof mediaData?.payload === "string") {
            bridge.sendAudio(Buffer.from(mediaData.payload, "base64"));
            const { timestamp } = mediaData;
            let mediaTimestamp = Number.NaN;
            if (typeof timestamp === "number") mediaTimestamp = timestamp;
            else if (typeof timestamp === "string") mediaTimestamp = Number.parseInt(timestamp, 10);
            // Never forward NaN/Infinity (e.g. an unparseable string) to the bridge.
            if (Number.isFinite(mediaTimestamp)) bridge.setMediaTimestamp(mediaTimestamp);
            return;
          }
          if (msg.event === "mark") {
            bridge.acknowledgeMark();
            return;
          }
          if (msg.event === "stop") bridge.close();
        } catch (error) {
          console.error("[voice-call] realtime WS parse failed:", error);
        }
      });
      ws.on("close", () => {
        bridge?.close();
      });
      ws.on("error", (error) => {
        console.error("[voice-call] realtime WS error:", error);
      });
    });
  }

  /**
   * Register (or replace) the handler invoked when the model calls tool `name`.
   *
   * @param {string} name Tool name.
   * @param {Function} fn Called as `fn(args, callId, { partialUserTranscript })`.
   */
  registerToolHandler(name, fn) {
    this.toolHandlers.set(name, fn);
  }

  /**
   * Ask the active bridge of a call to speak using the given instructions.
   *
   * @param {string} callId Manager call id or provider call sid.
   * @param {string} instructions Passed to the bridge's `triggerGreeting`.
   * @returns {{success: boolean, error?: string}} Outcome; never throws.
   */
  speak(callId, instructions) {
    const bridge = this.activeBridgesByCallId.get(callId);
    if (!bridge) {
      return {
        success: false,
        error: "No active realtime bridge for call",
      };
    }
    try {
      bridge.triggerGreeting(instructions);
      return { success: true };
    } catch (error) {
      return {
        success: false,
        error: formatErrorMessage(error),
      };
    }
  }

  /**
   * Create a single-use stream token valid for `STREAM_TOKEN_TTL_MS`.
   * Expired tokens are swept from the pending map on every call.
   *
   * @param {object} [meta] Caller metadata (`from`, `to`, `direction`) stored with the token.
   * @returns {string} The new token.
   */
  issueStreamToken(meta = {}) {
    const token = randomUUID();
    this.pendingStreamTokens.set(token, {
      expiry: Date.now() + STREAM_TOKEN_TTL_MS,
      ...meta,
    });
    for (const [candidate, entry] of this.pendingStreamTokens) {
      if (Date.now() > entry.expiry) this.pendingStreamTokens.delete(candidate);
    }
    return token;
  }

  /**
   * Redeem a stream token. The token is removed whether or not it is still valid.
   *
   * @param {string} token Token taken from the upgrade URL.
   * @returns {{from: *, to: *, direction: *} | null} Stored caller metadata, or
   *   `null` when the token is unknown or expired.
   */
  consumeStreamToken(token) {
    const entry = this.pendingStreamTokens.get(token);
    if (!entry) return null;
    this.pendingStreamTokens.delete(token);
    if (Date.now() > entry.expiry) return null;
    return {
      from: entry.from,
      to: entry.to,
      direction: entry.direction,
    };
  }

  /**
   * Register the call with the manager and start a realtime bridge for it.
   *
   * @param {string} streamSid Stream id echoed in every outbound stream message.
   * @param {string} callSid Provider call id.
   * @param ws Accepted media-stream WebSocket.
   * @param callerMeta Metadata recovered from the stream token.
   * @returns The bridge session, or `null` when the manager holds no record for
   *   the call after `call.initiated` (the socket is then closed with code 1008).
   */
  handleCall(streamSid, callSid, ws, callerMeta) {
    const registration = this.registerCallInManager(callSid, callerMeta);
    if (!registration) {
      ws.close(1008, "Caller rejected by policy");
      return null;
    }
    const { callId, initialGreetingInstructions } = registration;
    console.log(`[voice-call] Realtime bridge starting for call ${callId} (providerCallId=${callSid}, initialGreeting=${initialGreetingInstructions ? "queued" : "absent"})`);

    // call.ended must reach the manager at most once per call.
    let callEndEmitted = false;
    const emitCallEnd = (reason) => {
      if (callEndEmitted) return;
      callEndEmitted = true;
      this.endCallInManager(callSid, callId, reason);
    };

    const bridge = createRealtimeVoiceBridgeSession({
      provider: this.realtimeProvider,
      providerConfig: this.providerConfig,
      instructions: this.config.instructions,
      tools: this.config.tools,
      initialGreetingInstructions,
      triggerGreetingOnReady: Boolean(initialGreetingInstructions),
      audioSink: {
        isOpen: () => ws.readyState === WebSocket.OPEN,
        sendAudio: (muLaw) => {
          ws.send(JSON.stringify({
            event: "media",
            streamSid,
            media: { payload: muLaw.toString("base64") },
          }));
        },
        clearAudio: () => {
          ws.send(JSON.stringify({
            event: "clear",
            streamSid,
          }));
        },
        sendMark: (markName) => {
          ws.send(JSON.stringify({
            event: "mark",
            streamSid,
            mark: { name: markName },
          }));
        },
      },
      onTranscript: (role, text, isFinal) => {
        if (!isFinal) {
          // Keep only the newest non-blank partial user transcript.
          if (role === "user" && text.trim()) this.partialUserTranscriptsByCallId.set(callId, text);
          return;
        }
        if (role === "user") {
          this.partialUserTranscriptsByCallId.delete(callId);
          const event = {
            id: `realtime-speech-${callSid}-${Date.now()}`,
            type: "call.speech",
            callId,
            providerCallId: callSid,
            timestamp: Date.now(),
            transcript: text,
            isFinal: true,
          };
          this.manager.processEvent(event);
          return;
        }
        this.manager.processEvent({
          id: `realtime-bot-${callSid}-${Date.now()}`,
          type: "call.speaking",
          callId,
          providerCallId: callSid,
          timestamp: Date.now(),
          text,
        });
      },
      onToolCall: (toolEvent, session) => {
        // executeToolCall is async; attach a handler so a failure while
        // submitting the result cannot surface as an unhandled rejection.
        this.executeToolCall(session, callId, toolEvent.callId || toolEvent.itemId, toolEvent.name, toolEvent.args).catch((error) => {
          console.error(`[voice-call] realtime tool call "${toolEvent.name}" failed: ${formatErrorMessage(error)}`);
        });
      },
      onError: (error) => {
        console.error("[voice-call] realtime voice error:", error.message);
      },
      onClose: (reason) => {
        this.activeBridgesByCallId.delete(callId);
        this.activeBridgesByCallId.delete(callSid);
        this.partialUserTranscriptsByCallId.delete(callId);
        // Only an error close ends the call from here.
        if (reason !== "error") return;
        emitCallEnd("error");
        if (ws.readyState === WebSocket.OPEN) ws.close(1011, "Bridge disconnected");
        this.provider.hangupCall({
          callId,
          providerCallId: callSid,
          reason: "error",
        }).catch((error) => {
          console.warn(`[voice-call] Failed to hang up realtime call ${callSid}: ${formatErrorMessage(error)}`);
        });
      },
    });

    this.activeBridgesByCallId.set(callId, bridge);
    this.activeBridgesByCallId.set(callSid, bridge);

    // Wrap close() so the lookup maps are cleared before the bridge shuts down,
    // independent of whether onClose fires afterwards.
    const closeBridge = bridge.close.bind(bridge);
    bridge.close = () => {
      this.activeBridgesByCallId.delete(callId);
      this.activeBridgesByCallId.delete(callSid);
      this.partialUserTranscriptsByCallId.delete(callId);
      closeBridge();
    };

    bridge.connect().catch((error) => {
      console.error("[voice-call] Failed to connect realtime bridge:", error);
      bridge.close();
      emitCallEnd("error");
      ws.close(1011, "Failed to connect");
    });
    return bridge;
  }

  /**
   * Emit `call.initiated` and `call.answered` for the call and collect any
   * queued initial greeting.
   *
   * @param {string} callSid Provider call id.
   * @param {object} [callerMeta] `{ from?, to?, direction? }`; direction defaults to "inbound".
   * @returns {{callId: string, initialGreetingInstructions: (string|undefined)} | null}
   *   `null` when the manager has no record for `callSid` after `call.initiated`.
   */
  registerCallInManager(callSid, callerMeta = {}) {
    const baseFields = {
      providerCallId: callSid,
      timestamp: Date.now(),
      direction: callerMeta.direction ?? "inbound",
      ...(callerMeta.from ? { from: callerMeta.from } : {}),
      ...(callerMeta.to ? { to: callerMeta.to } : {}),
    };
    this.manager.processEvent({
      id: `realtime-initiated-${callSid}`,
      callId: callSid,
      type: "call.initiated",
      ...baseFields,
    });
    const callRecord = this.manager.getCallByProviderCallId(callSid);
    if (!callRecord) return null;
    const initialGreeting = this.extractInitialGreeting(callRecord);
    console.log(`[voice-call] Realtime call ${callRecord.callId} initial greeting ${initialGreeting ? "queued" : "absent"}`);
    // The greeting is removed from the record's metadata once it has been read.
    if (callRecord.metadata) delete callRecord.metadata.initialMessage;
    this.manager.processEvent({
      id: `realtime-answered-${callSid}`,
      callId: callSid,
      type: "call.answered",
      ...baseFields,
    });
    return {
      callId: callRecord.callId,
      initialGreetingInstructions: buildGreetingInstructions(this.config.instructions, initialGreeting),
    };
  }

  /**
   * @param call Call record.
   * @returns {string | undefined} `call.metadata.initialMessage` when it is a string.
   */
  extractInitialGreeting(call) {
    return typeof call.metadata?.initialMessage === "string" ? call.metadata.initialMessage : undefined;
  }

  /**
   * Emit a `call.ended` event to the manager.
   *
   * @param {string} callSid Provider call id.
   * @param {string} callId Manager call id.
   * @param {string} reason End reason recorded on the event.
   */
  endCallInManager(callSid, callId, reason) {
    this.manager.processEvent({
      id: `realtime-ended-${callSid}-${Date.now()}`,
      type: "call.ended",
      callId,
      providerCallId: callSid,
      timestamp: Date.now(),
      reason,
    });
  }

  /**
   * Run a tool requested by the realtime model and submit its result.
   *
   * Handlers may be synchronous or asynchronous; anything they throw or reject
   * with is converted into an `{ error }` result so the model always gets an
   * answer. For the agent-consult tool an interim "working" response is
   * submitted first when the bridge supports result continuation and fast
   * context is not enabled.
   *
   * @param bridge Bridge session that issued the tool call.
   * @param {string} callId Manager call id.
   * @param {string} bridgeCallId Identifier the result must be submitted under.
   * @param {string} name Tool name.
   * @param args Tool arguments supplied by the model.
   * @returns {Promise<void>}
   */
  async executeToolCall(bridge, callId, bridgeCallId, name, args) {
    const handler = this.toolHandlers.get(name);
    if (!handler) {
      bridge.submitToolResult(bridgeCallId, { error: `Tool "${name}" not available` });
      return;
    }
    const sendInterimResponse =
      name === REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME &&
      bridge.bridge?.supportsToolResultContinuation &&
      !this.config.fastContext?.enabled;
    if (sendInterimResponse) {
      bridge.submitToolResult(bridgeCallId, buildRealtimeVoiceAgentConsultWorkingResponse("caller"), { willContinue: true });
    }
    let result;
    try {
      // `await` accepts plain values as well as promises, and the try block
      // also captures synchronous throws from the handler.
      result = await handler(args, callId, {
        partialUserTranscript: this.partialUserTranscriptsByCallId.get(callId),
      });
    } catch (error) {
      result = { error: formatErrorMessage(error) };
    }
    bridge.submitToolResult(bridgeCallId, result);
  }
}
|
|
324
|
+
//#endregion
|
|
325
|
+
export { RealtimeCallHandler };
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
import { o as resolveVoiceCallSessionKey } from "./config-7w04YpHh.js";
|
|
2
|
+
import { t as resolveVoiceResponseModel } from "./response-model-CyF5K80p.js";
|
|
3
|
+
import { normalizeLowercaseStringOrEmpty } from "openclaw/plugin-sdk/text-runtime";
|
|
4
|
+
import crypto from "node:crypto";
|
|
5
|
+
import { applyModelOverrideToSessionEntry } from "openclaw/plugin-sdk/model-session-runtime";
|
|
6
|
+
//#region extensions/voice-call/src/response-generator.ts
|
|
7
|
+
/**
|
|
8
|
+
* Voice call response generator - uses the embedded Pi agent for tool support.
|
|
9
|
+
* Routes voice responses through the same agent infrastructure as messaging.
|
|
10
|
+
*/
|
|
11
|
+
// Output-format rules appended to the end of the system prompt in generateVoiceResponse; they ask for a {"spoken": "..."} JSON reply.
const VOICE_SPOKEN_OUTPUT_CONTRACT = [
|
|
12
|
+
"Output format requirements:",
|
|
13
|
+
"- Return only valid JSON in this exact shape: {\"spoken\":\"...\"}",
|
|
14
|
+
"- Do not include markdown, code fences, planning text, or extra keys.",
|
|
15
|
+
"- Put exactly what should be spoken to the caller into \"spoken\".",
|
|
16
|
+
"- If there is nothing to say, return {\"spoken\":\"\"}."
|
|
17
|
+
].join("\n");
|
|
18
|
+
/**
 * Collapse every run of whitespace to a single space and trim the ends.
 *
 * @param {string} value Text to normalise.
 * @returns {string | null} Normalised text, or `null` when nothing but whitespace remains.
 */
function normalizeSpokenText(value) {
  const words = value.split(/\s+/).filter((word) => word !== "");
  if (words.length === 0) {
    return null;
  }
  return words.join(" ");
}
|
|
22
|
+
/**
 * Try to recover the `spoken` field from model output that should be
 * `{"spoken":"..."}` JSON.
 *
 * Attempts, in order: the whole trimmed text, the contents of a wrapping code
 * fence, the span from the first "{" to the last "}", and finally a bare
 * `"spoken": "..."` pair found anywhere in the text.
 *
 * @param {string} text Raw model output.
 * @returns {string | null} Normalised spoken text (possibly ""), or `null`
 *   when no `spoken` string could be extracted.
 */
function tryParseSpokenJson(text) {
  const source = text.trim();
  if (source === "") {
    return null;
  }

  const attempts = [source];
  const fenceMatch = /^```(?:json)?\s*([\s\S]*?)\s*```$/i.exec(source);
  if (fenceMatch?.[1]) {
    attempts.push(fenceMatch[1]);
  }
  const openIndex = source.indexOf("{");
  const closeIndex = source.lastIndexOf("}");
  if (openIndex >= 0 && closeIndex > openIndex) {
    attempts.push(source.slice(openIndex, closeIndex + 1));
  }

  for (const attempt of attempts) {
    let decoded;
    try {
      decoded = JSON.parse(attempt);
    } catch {
      // Not valid JSON; move on to the next candidate.
      continue;
    }
    if (typeof decoded?.spoken === "string") {
      return normalizeSpokenText(decoded.spoken) ?? "";
    }
  }

  // Last resort: pull a quoted "spoken" value out of otherwise malformed output.
  const inlineMatch = /"spoken"\s*:\s*"((?:[^"\\]|\\.)*)"/i.exec(source);
  if (inlineMatch === null) {
    return null;
  }
  try {
    return normalizeSpokenText(JSON.parse(`"${inlineMatch[1] ?? ""}"`)) ?? "";
  } catch {
    return null;
  }
}
|
|
45
|
+
/**
 * Heuristically detect a paragraph that is model planning rather than speech.
 *
 * @param {string} paragraph Paragraph to inspect.
 * @returns {boolean} `true` when the lower-cased paragraph matches one of the
 *   known meta-reasoning patterns.
 */
function isLikelyMetaReasoningParagraph(paragraph) {
  const text = normalizeLowercaseStringOrEmpty(paragraph);
  if (!text) {
    return false;
  }

  const reasoningPrefixes = ["thinking process", "reasoning:", "analysis:"];
  if (reasoningPrefixes.some((prefix) => text.startsWith(prefix))) {
    return true;
  }

  // "The user ..." combined with a first-person plan.
  const planningPhrases = ["i should", "i need to", "i will"];
  if (text.startsWith("the user ") && planningPhrases.some((phrase) => text.includes(phrase))) {
    return true;
  }

  const narrationPhrases = [
    "this is a natural continuation of the conversation",
    "keep the conversation flowing",
  ];
  return narrationPhrases.some((phrase) => text.includes(phrase));
}
|
|
54
|
+
/**
 * Turn free-form model text into something safe to speak.
 *
 * Fenced code blocks are removed, leading paragraphs that look like
 * meta-reasoning are dropped (the final remaining paragraph is always kept),
 * and the rest is joined and whitespace-normalised.
 *
 * @param {string} text Raw, non-JSON model output.
 * @returns {string | null} Speakable text, or `null` when nothing is left.
 */
function sanitizePlainSpokenText(text) {
  const stripped = text.replace(/```[\s\S]*?```/g, " ").trim();
  if (stripped === "") {
    return null;
  }

  const blocks = [];
  for (const piece of stripped.split(/\n\s*\n+/)) {
    const cleaned = piece.trim();
    if (cleaned) {
      blocks.push(cleaned);
    }
  }

  // Skip leading meta paragraphs, but never the last remaining one.
  let firstSpoken = 0;
  while (blocks.length - firstSpoken > 1 && isLikelyMetaReasoningParagraph(blocks[firstSpoken])) {
    firstSpoken += 1;
  }
  return normalizeSpokenText(blocks.slice(firstSpoken).join(" "));
}
|
|
61
|
+
/**
 * Reduce agent result payloads to the single string that should be spoken.
 *
 * Error and reasoning payloads are ignored. Each remaining payload is read as
 * `{"spoken": ...}` JSON when possible and otherwise sanitised as plain text.
 *
 * @param {Iterable<{text?: string, isError?: boolean, isReasoning?: boolean}>} payloads
 *   Payloads returned by the agent run.
 * @returns {string | null} Space-joined spoken text, or `null` when no payload
 *   contributed anything.
 */
function extractSpokenTextFromPayloads(payloads) {
  const segments = [];
  for (const payload of payloads) {
    const isSkippable = payload.isError || payload.isReasoning;
    if (isSkippable) {
      continue;
    }
    const body = payload.text?.trim() ?? "";
    if (body === "") {
      continue;
    }
    const fromJson = tryParseSpokenJson(body);
    // A parsed-but-empty "spoken" value means "say nothing"; do not fall back
    // to plain-text handling in that case.
    const spoken = fromJson === null ? sanitizePlainSpokenText(body) : fromJson;
    if (spoken) {
      segments.push(spoken);
    }
  }
  if (segments.length === 0) {
    return null;
  }
  return segments.join(" ").trim();
}
|
|
77
|
+
/**
 * Produce the agent-scoped session key used for the sandbox.
 *
 * @param {string} agentId Agent the session belongs to.
 * @param {string} sessionKey Resolved voice session key.
 * @returns {string} The trimmed key unchanged when it already starts with
 *   "agent:" (case-insensitive), otherwise `agent:<agentId>:<key>`.
 */
function resolveVoiceSandboxSessionKey(agentId, sessionKey) {
  const key = sessionKey.trim();
  const alreadyScoped = key.toLowerCase().startsWith("agent:");
  if (alreadyScoped) {
    return key;
  }
  return ["agent", agentId, key].join(":");
}
|
|
82
|
+
/**
 * Produce the next spoken reply for a call by running the embedded Pi agent.
 *
 * Steps: resolve the session key and agent directories, make sure a session
 * entry exists (applying the configured response-model override when set),
 * assemble the system prompt (base prompt, transcript so far, output contract),
 * run the agent, and extract the text to speak from its payloads.
 *
 * @param {object} params
 * @param {object} params.voiceConfig Voice settings; reads `agentId`, `responseModel`,
 *   `responseSystemPrompt` and `responseTimeoutMs`.
 * @param {string} params.callId Call identifier, used for the session key and run id.
 * @param {string} [params.sessionKey] Explicit session key, if any.
 * @param {string} params.from Caller phone number (mentioned in the default prompt).
 * @param {Array<{speaker: string, text: string}>} params.transcript Conversation so far.
 * @param {string} params.userMessage Latest caller utterance, sent as the agent prompt.
 * @param {object} [params.coreConfig] Core configuration; required.
 * @param {object} params.agentRuntime Runtime facade providing session and agent helpers.
 * @returns {Promise<{text: string | null, error?: string}>} Spoken text, or
 *   `text: null` with an `error` when the config is missing, the run was
 *   aborted without output, or the run threw.
 */
async function generateVoiceResponse(params) {
  const { voiceConfig, callId, sessionKey, from, transcript, userMessage, coreConfig, agentRuntime } = params;
  if (!coreConfig) {
    return {
      text: null,
      error: "Core config unavailable for voice response",
    };
  }
  const cfg = coreConfig;

  const resolvedSessionKey = resolveVoiceCallSessionKey({
    config: voiceConfig,
    callId,
    phone: from,
    explicitSessionKey: sessionKey,
  });
  const agentId = voiceConfig.agentId ?? "main";
  const storePath = agentRuntime.session.resolveStorePath(cfg.session?.store, { agentId });
  const agentDir = agentRuntime.resolveAgentDir(cfg, agentId);
  const workspaceDir = agentRuntime.resolveAgentWorkspaceDir(cfg, agentId);
  await agentRuntime.ensureAgentWorkspace({ dir: workspaceDir });

  const sessionStore = agentRuntime.session.loadSessionStore(storePath);
  const now = Date.now();
  const { provider, model } = resolveVoiceResponseModel({
    voiceConfig,
    agentRuntime,
  });

  // Store updater: create the entry if needed, then apply the model override.
  const ensureSessionEntry = (store) => {
    let entry = store[resolvedSessionKey];
    if (!entry?.sessionId) {
      entry = {
        ...entry,
        sessionId: crypto.randomUUID(),
        updatedAt: now,
      };
      store[resolvedSessionKey] = entry;
    }
    if (voiceConfig.responseModel) {
      applyModelOverrideToSessionEntry({
        entry,
        selection: { provider, model },
        selectionSource: "auto",
      });
    }
    return entry;
  };

  let sessionEntry = sessionStore[resolvedSessionKey];
  // The store is only rewritten when an entry must be created or a model override applies.
  if (!sessionEntry?.sessionId || voiceConfig.responseModel) {
    sessionEntry = await agentRuntime.session.updateSessionStore(storePath, ensureSessionEntry);
  }

  const sessionId = sessionEntry.sessionId;
  const sessionFile = agentRuntime.session.resolveSessionFilePath(sessionId, sessionEntry, { agentId });
  const thinkLevel = agentRuntime.resolveThinkingDefault({ cfg, provider, model });
  const agentName = agentRuntime.resolveAgentIdentity(cfg, agentId)?.name?.trim() || "assistant";

  const basePrompt = voiceConfig.responseSystemPrompt ?? `You are ${agentName}, a helpful voice assistant on a phone call. Keep responses brief and conversational (1-2 sentences max). Be natural and friendly. The caller's phone number is ${from}. You have access to tools - use them when helpful.`;
  const promptSections = [basePrompt];
  if (transcript.length > 0) {
    const history = transcript
      .map((entry) => `${entry.speaker === "bot" ? "You" : "Caller"}: ${entry.text}`)
      .join("\n");
    promptSections.push(`Conversation so far:\n${history}`);
  }
  promptSections.push(VOICE_SPOKEN_OUTPUT_CONTRACT);
  const extraSystemPrompt = promptSections.join("\n\n");

  const timeoutMs = voiceConfig.responseTimeoutMs ?? agentRuntime.resolveAgentTimeoutMs({ cfg });
  const runId = `voice:${callId}:${Date.now()}`;

  try {
    const result = await agentRuntime.runEmbeddedPiAgent({
      sessionId,
      sessionKey: resolvedSessionKey,
      sandboxSessionKey: resolveVoiceSandboxSessionKey(agentId, resolvedSessionKey),
      agentId,
      messageProvider: "voice",
      sessionFile,
      workspaceDir,
      config: cfg,
      prompt: userMessage,
      provider,
      model,
      thinkLevel,
      verboseLevel: "off",
      timeoutMs,
      runId,
      lane: "voice",
      extraSystemPrompt,
      agentDir,
    });
    const text = extractSpokenTextFromPayloads(result.payloads ?? []);
    if (!text && result.meta?.aborted) {
      return {
        text: null,
        error: "Response generation was aborted",
      };
    }
    return { text };
  } catch (err) {
    console.error(`[voice-call] Response generation failed:`, err);
    return {
      text: null,
      error: String(err),
    };
  }
}
|
|
181
|
+
//#endregion
|
|
182
|
+
export { generateVoiceResponse };
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
//#region extensions/voice-call/src/response-model.ts
|
|
2
|
+
/**
 * Work out which provider/model pair should generate voice responses.
 *
 * Uses `voiceConfig.responseModel` when set, otherwise the runtime defaults
 * joined as "provider/model". The reference is split at its FIRST "/"; a
 * reference without a slash is treated as a bare model name on the default
 * provider.
 *
 * @param {{voiceConfig: object, agentRuntime: object}} params
 * @returns {{modelRef: string, provider: string, model: string}}
 */
function resolveVoiceResponseModel(params) {
  const { voiceConfig, agentRuntime } = params;
  const modelRef = voiceConfig.responseModel ?? `${agentRuntime.defaults.provider}/${agentRuntime.defaults.model}`;
  const separatorAt = modelRef.indexOf("/");
  if (separatorAt === -1) {
    return {
      modelRef,
      provider: agentRuntime.defaults.provider,
      model: modelRef,
    };
  }
  return {
    modelRef,
    provider: modelRef.slice(0, separatorAt),
    model: modelRef.slice(separatorAt + 1),
  };
}
|
|
11
|
+
//#endregion
|
|
12
|
+
export { resolveVoiceResponseModel as t };
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry";
|
|
2
|
+
import { isRequestBodyLimitError, readRequestBodyWithLimit, requestBodyErrorToText } from "openclaw/plugin-sdk/webhook-request-guards";
|
|
3
|
+
import { fetchWithSsrFGuard, isBlockedHostnameOrIp } from "openclaw/plugin-sdk/ssrf-runtime";
|
|
4
|
+
import { TtsAutoSchema, TtsConfigSchema, TtsModeSchema, TtsProviderSchema } from "openclaw/plugin-sdk/tts-runtime";
|
|
5
|
+
import { sleep } from "openclaw/plugin-sdk/runtime-env";
|
|
6
|
+
export { TtsAutoSchema, TtsConfigSchema, TtsModeSchema, TtsProviderSchema, definePluginEntry, fetchWithSsrFGuard, isBlockedHostnameOrIp, isRequestBodyLimitError, readRequestBodyWithLimit, requestBodyErrorToText, sleep };
|