@openclaw/voice-call 2026.3.13 → 2026.5.1-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. package/README.md +25 -5
  2. package/api.ts +16 -0
  3. package/cli-metadata.ts +10 -0
  4. package/config-api.ts +12 -0
  5. package/index.test.ts +866 -0
  6. package/index.ts +353 -148
  7. package/openclaw.plugin.json +336 -157
  8. package/package.json +33 -5
  9. package/runtime-api.ts +20 -0
  10. package/runtime-entry.ts +1 -0
  11. package/setup-api.ts +47 -0
  12. package/src/allowlist.test.ts +18 -0
  13. package/src/cli.ts +533 -68
  14. package/src/config-compat.test.ts +120 -0
  15. package/src/config-compat.ts +227 -0
  16. package/src/config.test.ts +160 -12
  17. package/src/config.ts +243 -74
  18. package/src/core-bridge.ts +2 -147
  19. package/src/deep-merge.test.ts +40 -0
  20. package/src/gateway-continue-operation.ts +200 -0
  21. package/src/http-headers.ts +6 -3
  22. package/src/manager/context.ts +6 -5
  23. package/src/manager/events.test.ts +179 -19
  24. package/src/manager/events.ts +48 -30
  25. package/src/manager/lifecycle.ts +53 -0
  26. package/src/manager/lookup.test.ts +52 -0
  27. package/src/manager/outbound.test.ts +464 -0
  28. package/src/manager/outbound.ts +148 -55
  29. package/src/manager/store.ts +18 -6
  30. package/src/manager/timers.test.ts +129 -0
  31. package/src/manager/timers.ts +4 -3
  32. package/src/manager/twiml.test.ts +13 -0
  33. package/src/manager/twiml.ts +8 -0
  34. package/src/manager.closed-loop.test.ts +30 -12
  35. package/src/manager.inbound-allowlist.test.ts +77 -10
  36. package/src/manager.notify.test.ts +344 -20
  37. package/src/manager.restore.test.ts +95 -8
  38. package/src/manager.test-harness.ts +8 -6
  39. package/src/manager.ts +79 -5
  40. package/src/media-stream.test.ts +578 -81
  41. package/src/media-stream.ts +235 -54
  42. package/src/providers/base.ts +19 -0
  43. package/src/providers/mock.ts +7 -1
  44. package/src/providers/plivo.test.ts +50 -6
  45. package/src/providers/plivo.ts +14 -6
  46. package/src/providers/shared/call-status.ts +2 -1
  47. package/src/providers/shared/guarded-json-api.test.ts +106 -0
  48. package/src/providers/shared/guarded-json-api.ts +1 -1
  49. package/src/providers/telnyx.test.ts +178 -6
  50. package/src/providers/telnyx.ts +40 -3
  51. package/src/providers/twilio/api.test.ts +145 -0
  52. package/src/providers/twilio/api.ts +67 -16
  53. package/src/providers/twilio/twiml-policy.ts +6 -10
  54. package/src/providers/twilio/webhook.ts +1 -1
  55. package/src/providers/twilio.test.ts +425 -25
  56. package/src/providers/twilio.ts +230 -77
  57. package/src/providers/twilio.types.ts +17 -0
  58. package/src/realtime-defaults.ts +3 -0
  59. package/src/realtime-fast-context.test.ts +88 -0
  60. package/src/realtime-fast-context.ts +165 -0
  61. package/src/realtime-transcription.runtime.ts +4 -0
  62. package/src/realtime-voice.runtime.ts +5 -0
  63. package/src/response-generator.test.ts +277 -0
  64. package/src/response-generator.ts +186 -40
  65. package/src/response-model.test.ts +71 -0
  66. package/src/response-model.ts +23 -0
  67. package/src/runtime.test.ts +351 -0
  68. package/src/runtime.ts +254 -24
  69. package/src/telephony-audio.test.ts +61 -0
  70. package/src/telephony-audio.ts +1 -79
  71. package/src/telephony-tts.test.ts +133 -12
  72. package/src/telephony-tts.ts +155 -2
  73. package/src/test-fixtures.ts +26 -7
  74. package/src/tts-provider-voice.test.ts +34 -0
  75. package/src/tts-provider-voice.ts +21 -0
  76. package/src/tunnel.test.ts +166 -0
  77. package/src/tunnel.ts +1 -1
  78. package/src/types.ts +24 -37
  79. package/src/utils.test.ts +17 -0
  80. package/src/voice-mapping.test.ts +34 -0
  81. package/src/voice-mapping.ts +3 -2
  82. package/src/webhook/realtime-handler.test.ts +598 -0
  83. package/src/webhook/realtime-handler.ts +485 -0
  84. package/src/webhook/stale-call-reaper.test.ts +88 -0
  85. package/src/webhook/stale-call-reaper.ts +5 -0
  86. package/src/webhook/tailscale.test.ts +214 -0
  87. package/src/webhook/tailscale.ts +19 -5
  88. package/src/webhook-exposure.test.ts +33 -0
  89. package/src/webhook-exposure.ts +84 -0
  90. package/src/webhook-security.test.ts +172 -21
  91. package/src/webhook-security.ts +43 -29
  92. package/src/webhook.hangup-once.lifecycle.test.ts +135 -0
  93. package/src/webhook.test.ts +1145 -27
  94. package/src/webhook.ts +513 -100
  95. package/src/webhook.types.ts +5 -0
  96. package/src/websocket-test-support.ts +72 -0
  97. package/tsconfig.json +16 -0
  98. package/CHANGELOG.md +0 -121
  99. package/src/providers/index.ts +0 -10
  100. package/src/providers/stt-openai-realtime.test.ts +0 -42
  101. package/src/providers/stt-openai-realtime.ts +0 -311
  102. package/src/providers/tts-openai.test.ts +0 -43
  103. package/src/providers/tts-openai.ts +0 -221
@@ -4,14 +4,20 @@
4
4
  */
5
5
 
6
6
  import crypto from "node:crypto";
7
+ import { applyModelOverrideToSessionEntry } from "openclaw/plugin-sdk/model-session-runtime";
8
+ import { normalizeLowercaseStringOrEmpty } from "openclaw/plugin-sdk/text-runtime";
9
+ import type { SessionEntry } from "../api.js";
7
10
  import type { VoiceCallConfig } from "./config.js";
8
- import { loadCoreAgentDeps, type CoreConfig } from "./core-bridge.js";
11
+ import type { CoreAgentDeps, CoreConfig } from "./core-bridge.js";
12
+ import { resolveVoiceResponseModel } from "./response-model.js";
9
13
 
10
14
  export type VoiceResponseParams = {
11
15
  /** Voice call config */
12
16
  voiceConfig: VoiceCallConfig;
13
17
  /** Core OpenClaw config */
14
18
  coreConfig: CoreConfig;
19
+ /** Injected host agent runtime */
20
+ agentRuntime: CoreAgentDeps;
15
21
  /** Call ID for session tracking */
16
22
  callId: string;
17
23
  /** Caller's phone number */
@@ -27,11 +33,153 @@ export type VoiceResponseResult = {
27
33
  error?: string;
28
34
  };
29
35
 
30
- type SessionEntry = {
31
- sessionId: string;
32
- updatedAt: number;
36
+ type VoiceResponsePayload = {
37
+ text?: string;
38
+ isError?: boolean;
39
+ isReasoning?: boolean;
33
40
  };
34
41
 
42
/**
 * Instruction block appended to the voice system prompt. It asks the model to
 * answer with a single JSON object of the form `{"spoken":"..."}` so the text
 * meant for the caller can be separated from everything else.
 */
const VOICE_SPOKEN_OUTPUT_CONTRACT =
  "Output format requirements:\n" +
  '- Return only valid JSON in this exact shape: {"spoken":"..."}\n' +
  "- Do not include markdown, code fences, planning text, or extra keys.\n" +
  '- Put exactly what should be spoken to the caller into "spoken".\n' +
  '- If there is nothing to say, return {"spoken":""}.';
49
+
50
/**
 * Collapse every run of whitespace into a single space and trim both ends.
 *
 * @param value - Raw text to normalize.
 * @returns The normalized text, or `null` when only whitespace was supplied.
 */
function normalizeSpokenText(value: string): string | null {
  const normalized = value.replace(/\s+/g, " ").trim();
  return normalized.length > 0 ? normalized : null;
}

/**
 * Pull the `spoken` string out of a model reply that is supposed to follow the
 * `{"spoken":"..."}` contract.
 *
 * Candidates are tried in order: the whole trimmed reply, the contents of a
 * reply that is entirely one fenced code block, and the substring from the
 * first `{` to the last `}`. If none of them parses to an object with a string
 * `spoken` property, a last-resort regex looks for an inline `"spoken": "..."`
 * pair and decodes just that string literal.
 *
 * @param text - Raw reply text.
 * @returns The whitespace-normalized spoken text (`""` when the model
 *   explicitly returned an empty string), or `null` when no `spoken` value
 *   could be recovered and the caller should treat the reply as plain text.
 */
function tryParseSpokenJson(text: string): string | null {
  const trimmed = text.trim();
  if (!trimmed) {
    return null;
  }

  const candidates: string[] = [trimmed];

  const fenced = trimmed.match(/^```(?:json)?\s*([\s\S]*?)\s*```$/i);
  if (fenced?.[1]) {
    candidates.push(fenced[1]);
  }

  const firstBrace = trimmed.indexOf("{");
  const lastBrace = trimmed.lastIndexOf("}");
  if (firstBrace >= 0 && lastBrace > firstBrace) {
    candidates.push(trimmed.slice(firstBrace, lastBrace + 1));
  }

  for (const candidate of candidates) {
    try {
      const parsed = JSON.parse(candidate) as { spoken?: unknown } | null;
      if (typeof parsed?.spoken !== "string") {
        continue;
      }
      return normalizeSpokenText(parsed.spoken) ?? "";
    } catch {
      // Not valid JSON; fall through to the next candidate.
    }
  }

  const inlineSpokenMatch = trimmed.match(/"spoken"\s*:\s*"((?:[^"\\]|\\.)*)"/i);
  if (!inlineSpokenMatch) {
    return null;
  }

  // Strict JSON forbids raw control characters (newline, tab, ...) inside a
  // string literal. A reply that contains them fails every parse above and
  // would fail here as well, so the JSON wrapper itself would end up being
  // spoken. Escape them first so the captured literal decodes cleanly.
  const literalBody = (inlineSpokenMatch[1] ?? "").replace(
    /[\u0000-\u001f]/g,
    (ch) => `\\u${ch.charCodeAt(0).toString(16).padStart(4, "0")}`,
  );

  try {
    const decoded = JSON.parse(`"${literalBody}"`) as string;
    return normalizeSpokenText(decoded) ?? "";
  } catch {
    return null;
  }
}
98
+
99
/**
 * Heuristic check for paragraphs that look like the model narrating its own
 * reasoning rather than addressing the caller.
 *
 * The paragraph is first passed through `normalizeLowercaseStringOrEmpty`
 * (presumably trims and lowercases, yielding "" for empty input; confirm
 * against the SDK) and then matched against a fixed set of prefixes and
 * phrases.
 *
 * @param paragraph - One paragraph of model output.
 * @returns `true` when the paragraph matches one of the known meta-reasoning
 *   patterns, otherwise `false`.
 */
function isLikelyMetaReasoningParagraph(paragraph: string): boolean {
  const text = normalizeLowercaseStringOrEmpty(paragraph);
  if (!text) {
    return false;
  }

  // Explicit reasoning headers.
  const reasoningPrefixes = ["thinking process", "reasoning:", "analysis:"];
  if (reasoningPrefixes.some((prefix) => text.startsWith(prefix))) {
    return true;
  }

  // Third-person description of the user combined with first-person planning.
  const planningPhrases = ["i should", "i need to", "i will"];
  if (text.startsWith("the user ") && planningPhrases.some((phrase) => text.includes(phrase))) {
    return true;
  }

  // Stock phrases observed in leaked planning text.
  const stockPhrases = [
    "this is a natural continuation of the conversation",
    "keep the conversation flowing",
  ];
  return stockPhrases.some((phrase) => text.includes(phrase));
}
126
+
127
/**
 * Clean up a free-form (non-JSON) reply so it can be spoken.
 *
 * Fenced code blocks are removed, the remainder is split into paragraphs on
 * blank lines, and leading paragraphs that look like meta reasoning are
 * dropped. The last remaining paragraph is always kept, even if it matches the
 * heuristic, and whatever survives is joined and whitespace-normalized.
 *
 * @param text - Raw reply text.
 * @returns The cleaned text, or `null` when nothing speakable remains.
 */
function sanitizePlainSpokenText(text: string): string | null {
  const stripped = text.replace(/```[\s\S]*?```/g, " ").trim();
  if (stripped.length === 0) {
    return null;
  }

  const blocks = stripped
    .split(/\n\s*\n+/)
    .map((block) => block.trim())
    .filter((block) => block.length > 0);

  // Advance past leading meta-reasoning paragraphs, never past the final one.
  let firstKept = 0;
  while (blocks.length - firstKept > 1 && isLikelyMetaReasoningParagraph(blocks[firstKept])) {
    firstKept += 1;
  }

  return normalizeSpokenText(blocks.slice(firstKept).join(" "));
}
144
+
145
/**
 * Build the text to speak from the agent's reply payloads.
 *
 * Payloads flagged as errors or reasoning, and payloads without text, are
 * ignored. Each remaining payload is parsed as a `{"spoken":"..."}` reply
 * first; when that succeeds its value is used (an empty value contributes
 * nothing). Otherwise the text goes through the plain-text sanitizer.
 *
 * @param payloads - Reply payloads in the order the agent produced them.
 * @returns The spoken segments joined by single spaces, or `null` when no
 *   payload produced any speakable text.
 */
function extractSpokenTextFromPayloads(payloads: VoiceResponsePayload[]): string | null {
  const segments = payloads.flatMap((payload): string[] => {
    if (payload.isError || payload.isReasoning) {
      return [];
    }

    const raw = payload.text?.trim() ?? "";
    if (raw.length === 0) {
      return [];
    }

    const fromJson = tryParseSpokenJson(raw);
    if (fromJson !== null) {
      // A structured reply wins even when empty: do not fall back to plain text.
      return fromJson.length > 0 ? [fromJson] : [];
    }

    const fromPlain = sanitizePlainSpokenText(raw);
    return fromPlain ? [fromPlain] : [];
  });

  if (segments.length === 0) {
    return null;
  }
  return segments.join(" ").trim();
}
174
+
175
/**
 * Produce the sandbox session key for a voice session.
 *
 * @param agentId - Agent the session belongs to.
 * @param sessionKey - Voice session key; surrounding whitespace is ignored.
 * @returns The trimmed key unchanged when it already starts with `agent:`
 *   (case-insensitive), otherwise `agent:<agentId>:<trimmed key>`.
 */
function resolveVoiceSandboxSessionKey(agentId: string, sessionKey: string): string {
  const key = sessionKey.trim();
  const alreadyScoped = key.toLowerCase().startsWith("agent:");
  return alreadyScoped ? key : `agent:${agentId}:${key}`;
}
182
+
35
183
  /**
36
184
  * Generate a voice response using the embedded Pi agent with full tool support.
37
185
  * Uses the same agent infrastructure as messaging for consistent behavior.
@@ -39,40 +187,31 @@ type SessionEntry = {
39
187
  export async function generateVoiceResponse(
40
188
  params: VoiceResponseParams,
41
189
  ): Promise<VoiceResponseResult> {
42
- const { voiceConfig, callId, from, transcript, userMessage, coreConfig } = params;
190
+ const { voiceConfig, callId, from, transcript, userMessage, coreConfig, agentRuntime } = params;
43
191
 
44
192
  if (!coreConfig) {
45
193
  return { text: null, error: "Core config unavailable for voice response" };
46
194
  }
47
-
48
- let deps: Awaited<ReturnType<typeof loadCoreAgentDeps>>;
49
- try {
50
- deps = await loadCoreAgentDeps();
51
- } catch (err) {
52
- return {
53
- text: null,
54
- error: err instanceof Error ? err.message : "Unable to load core agent dependencies",
55
- };
56
- }
57
195
  const cfg = coreConfig;
58
196
 
59
197
  // Build voice-specific session key based on phone number
60
198
  const normalizedPhone = from.replace(/\D/g, "");
61
199
  const sessionKey = `voice:${normalizedPhone}`;
62
- const agentId = "main";
200
+ const agentId = voiceConfig.agentId ?? "main";
63
201
 
64
202
  // Resolve paths
65
- const storePath = deps.resolveStorePath(cfg.session?.store, { agentId });
66
- const agentDir = deps.resolveAgentDir(cfg, agentId);
67
- const workspaceDir = deps.resolveAgentWorkspaceDir(cfg, agentId);
203
+ const storePath = agentRuntime.session.resolveStorePath(cfg.session?.store, { agentId });
204
+ const agentDir = agentRuntime.resolveAgentDir(cfg, agentId);
205
+ const workspaceDir = agentRuntime.resolveAgentWorkspaceDir(cfg, agentId);
68
206
 
69
207
  // Ensure workspace exists
70
- await deps.ensureAgentWorkspace({ dir: workspaceDir });
208
+ await agentRuntime.ensureAgentWorkspace({ dir: workspaceDir });
71
209
 
72
210
  // Load or create session entry
73
- const sessionStore = deps.loadSessionStore(storePath);
211
+ const sessionStore = agentRuntime.session.loadSessionStore(storePath);
74
212
  const now = Date.now();
75
213
  let sessionEntry = sessionStore[sessionKey] as SessionEntry | undefined;
214
+ let sessionEntryUpdated = false;
76
215
 
77
216
  if (!sessionEntry) {
78
217
  sessionEntry = {
@@ -80,25 +219,35 @@ export async function generateVoiceResponse(
80
219
  updatedAt: now,
81
220
  };
82
221
  sessionStore[sessionKey] = sessionEntry;
83
- await deps.saveSessionStore(storePath, sessionStore);
222
+ sessionEntryUpdated = true;
84
223
  }
85
224
 
86
225
  const sessionId = sessionEntry.sessionId;
87
- const sessionFile = deps.resolveSessionFilePath(sessionId, sessionEntry, {
88
- agentId,
89
- });
90
226
 
91
227
  // Resolve model from config
92
- const modelRef = voiceConfig.responseModel || `${deps.DEFAULT_PROVIDER}/${deps.DEFAULT_MODEL}`;
93
- const slashIndex = modelRef.indexOf("/");
94
- const provider = slashIndex === -1 ? deps.DEFAULT_PROVIDER : modelRef.slice(0, slashIndex);
95
- const model = slashIndex === -1 ? modelRef : modelRef.slice(slashIndex + 1);
228
+ const { provider, model } = resolveVoiceResponseModel({ voiceConfig, agentRuntime });
229
+ if (voiceConfig.responseModel) {
230
+ sessionEntryUpdated =
231
+ applyModelOverrideToSessionEntry({
232
+ entry: sessionEntry,
233
+ selection: { provider, model },
234
+ selectionSource: "auto",
235
+ }).updated || sessionEntryUpdated;
236
+ }
237
+
238
+ if (sessionEntryUpdated) {
239
+ await agentRuntime.session.saveSessionStore(storePath, sessionStore);
240
+ }
241
+
242
+ const sessionFile = agentRuntime.session.resolveSessionFilePath(sessionId, sessionEntry, {
243
+ agentId,
244
+ });
96
245
 
97
246
  // Resolve thinking level
98
- const thinkLevel = deps.resolveThinkingDefault({ cfg, provider, model });
247
+ const thinkLevel = agentRuntime.resolveThinkingDefault({ cfg, provider, model });
99
248
 
100
249
  // Resolve agent identity for personalized prompt
101
- const identity = deps.resolveAgentIdentity(cfg, agentId);
250
+ const identity = agentRuntime.resolveAgentIdentity(cfg, agentId);
102
251
  const agentName = identity?.name?.trim() || "assistant";
103
252
 
104
253
  // Build system prompt with conversation history
@@ -113,15 +262,18 @@ export async function generateVoiceResponse(
113
262
  .join("\n");
114
263
  extraSystemPrompt = `${basePrompt}\n\nConversation so far:\n${history}`;
115
264
  }
265
+ extraSystemPrompt = `${extraSystemPrompt}\n\n${VOICE_SPOKEN_OUTPUT_CONTRACT}`;
116
266
 
117
267
  // Resolve timeout
118
- const timeoutMs = voiceConfig.responseTimeoutMs ?? deps.resolveAgentTimeoutMs({ cfg });
268
+ const timeoutMs = voiceConfig.responseTimeoutMs ?? agentRuntime.resolveAgentTimeoutMs({ cfg });
119
269
  const runId = `voice:${callId}:${Date.now()}`;
120
270
 
121
271
  try {
122
- const result = await deps.runEmbeddedPiAgent({
272
+ const result = await agentRuntime.runEmbeddedPiAgent({
123
273
  sessionId,
124
274
  sessionKey,
275
+ sandboxSessionKey: resolveVoiceSandboxSessionKey(agentId, sessionKey),
276
+ agentId,
125
277
  messageProvider: "voice",
126
278
  sessionFile,
127
279
  workspaceDir,
@@ -138,13 +290,7 @@ export async function generateVoiceResponse(
138
290
  agentDir,
139
291
  });
140
292
 
141
- // Extract text from payloads
142
- const texts = (result.payloads ?? [])
143
- .filter((p) => p.text && !p.isError)
144
- .map((p) => p.text?.trim())
145
- .filter(Boolean);
146
-
147
- const text = texts.join(" ") || null;
293
+ const text = extractSpokenTextFromPayloads((result.payloads ?? []) as VoiceResponsePayload[]);
148
294
 
149
295
  if (!text && result.meta?.aborted) {
150
296
  return { text: null, error: "Response generation was aborted" };
@@ -0,0 +1,71 @@
1
+ import { describe, expect, it } from "vitest";
2
+ import { VoiceCallConfigSchema } from "./config.js";
3
+ import type { CoreAgentDeps } from "./core-bridge.js";
4
+ import { resolveVoiceResponseModel } from "./response-model.js";
5
+
6
// Minimal runtime stub: only `defaults` is provided, so the cast is needed to
// satisfy the full CoreAgentDeps type.
const agentRuntime = {
  defaults: {
    provider: "together",
    model: "Qwen/Qwen2.5-7B-Instruct-Turbo",
  },
} as unknown as CoreAgentDeps;

describe("resolveVoiceResponseModel", () => {
  it("falls back to the runtime default model", () => {
    expect(
      resolveVoiceResponseModel({
        voiceConfig: VoiceCallConfigSchema.parse({}),
        agentRuntime,
      }),
    ).toEqual({
      modelRef: "together/Qwen/Qwen2.5-7B-Instruct-Turbo",
      provider: "together",
      model: "Qwen/Qwen2.5-7B-Instruct-Turbo",
    });
  });

  it("uses an explicit provider/model ref", () => {
    expect(
      resolveVoiceResponseModel({
        voiceConfig: VoiceCallConfigSchema.parse({
          responseModel: "openai/gpt-5.4-mini",
        }),
        agentRuntime,
      }),
    ).toEqual({
      modelRef: "openai/gpt-5.4-mini",
      provider: "openai",
      model: "gpt-5.4-mini",
    });
  });

  // The ref is split on the first slash only, so an org-prefixed model id is
  // read as "<provider>/<model>" and the runtime default provider is NOT used.
  it("treats the first segment of an org-prefixed model id as the provider", () => {
    expect(
      resolveVoiceResponseModel({
        voiceConfig: VoiceCallConfigSchema.parse({
          responseModel: "meta-llama/Llama-4-Scout-17B-16E-Instruct",
        }),
        agentRuntime,
      }),
    ).toEqual({
      modelRef: "meta-llama/Llama-4-Scout-17B-16E-Instruct",
      provider: "meta-llama",
      model: "Llama-4-Scout-17B-16E-Instruct",
    });
  });

  it("keeps legacy single-segment overrides on the runtime default provider", () => {
    expect(
      resolveVoiceResponseModel({
        voiceConfig: VoiceCallConfigSchema.parse({
          responseModel: "gpt-5.4-mini",
        }),
        agentRuntime,
      }),
    ).toEqual({
      modelRef: "gpt-5.4-mini",
      provider: "together",
      model: "gpt-5.4-mini",
    });
  });
});
@@ -0,0 +1,23 @@
1
+ import type { VoiceCallConfig } from "./config.js";
2
+ import type { CoreAgentDeps } from "./core-bridge.js";
3
+
4
/**
 * Resolve which provider/model pair should generate voice responses.
 *
 * The configured `responseModel` is used when it contains anything other than
 * whitespace; otherwise the runtime default `<provider>/<model>` is used. A
 * blank override is treated as "not configured" so it can never resolve to an
 * empty model id.
 *
 * The reference is split on its first `/`: the part before it is the provider
 * and everything after it is the model. A reference without a slash is a bare
 * model id on the runtime default provider.
 *
 * @param params.voiceConfig - Voice call config carrying the optional override.
 * @param params.agentRuntime - Host runtime supplying the default provider/model.
 * @returns The effective model reference and its provider/model parts.
 */
export function resolveVoiceResponseModel(params: {
  voiceConfig: VoiceCallConfig;
  agentRuntime: CoreAgentDeps;
}): {
  modelRef: string;
  provider: string;
  model: string;
} {
  const { defaults } = params.agentRuntime;

  // `??` alone would accept "" (or whitespace) as a real override.
  const configured = params.voiceConfig.responseModel?.trim();
  const modelRef = configured ? configured : `${defaults.provider}/${defaults.model}`;
  const slashIndex = modelRef.indexOf("/");

  if (slashIndex === -1) {
    return { modelRef, provider: defaults.provider, model: modelRef };
  }

  return {
    modelRef,
    provider: modelRef.slice(0, slashIndex),
    model: modelRef.slice(slashIndex + 1),
  };
}