@openclaw/voice-call 2026.5.3 → 2026.5.4-beta.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,7 +1,7 @@
1
1
  import { definePluginEntry, sleep } from "./runtime-api.js";
2
2
  import "./api.js";
3
3
  import { i as resolveVoiceCallConfig, s as validateProviderConfig } from "./config-7w04YpHh.js";
4
- import { a as setupTailscaleExposureRoute, i as getTailscaleSelfInfo, n as resolveWebhookExposureStatus, r as cleanupTailscaleExposureRoute, s as resolveUserPath, t as createVoiceCallRuntime } from "./runtime-entry-88ytYAQa.js";
4
+ import { a as setupTailscaleExposureRoute, i as getTailscaleSelfInfo, n as resolveWebhookExposureStatus, r as cleanupTailscaleExposureRoute, s as resolveUserPath, t as createVoiceCallRuntime } from "./runtime-entry-DFzuGKLG.js";
5
5
  import { i as parseVoiceCallPluginConfig, r as normalizeVoiceCallLegacyConfigInput, t as formatVoiceCallLegacyConfigWarnings } from "./config-compat-B0me39_4.js";
6
6
  import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime";
7
7
  import { ErrorCodes, callGatewayFromCli, errorShape } from "openclaw/plugin-sdk/gateway-runtime";
@@ -610,6 +610,8 @@ function createVoiceCallContinueOperationStore(params) {
610
610
  }
611
611
  //#endregion
612
612
  //#region extensions/voice-call/index.ts
613
+ const VOICE_CALL_WRITE_METHOD_SCOPE = { scope: "operator.write" };
614
+ const VOICE_CALL_READ_METHOD_SCOPE = { scope: "operator.read" };
613
615
  const voiceCallConfigSchema = {
614
616
  parse(value) {
615
617
  const normalized = normalizeVoiceCallLegacyConfigInput(value);
@@ -988,7 +990,7 @@ var voice_call_default = definePluginEntry({
988
990
  } catch (err) {
989
991
  sendError(respond, err);
990
992
  }
991
- });
993
+ }, VOICE_CALL_WRITE_METHOD_SCOPE);
992
994
  api.registerGatewayMethod("voicecall.continue", async ({ params, respond }) => {
993
995
  try {
994
996
  await respondToCallMessageAction({
@@ -1001,7 +1003,7 @@ var voice_call_default = definePluginEntry({
1001
1003
  } catch (err) {
1002
1004
  sendError(respond, err);
1003
1005
  }
1004
- });
1006
+ }, VOICE_CALL_WRITE_METHOD_SCOPE);
1005
1007
  api.registerGatewayMethod("voicecall.continue.start", async ({ params, respond }) => {
1006
1008
  try {
1007
1009
  const request = await resolveCallMessageRequest(params);
@@ -1013,7 +1015,7 @@ var voice_call_default = definePluginEntry({
1013
1015
  } catch (err) {
1014
1016
  sendError(respond, err);
1015
1017
  }
1016
- });
1018
+ }, VOICE_CALL_WRITE_METHOD_SCOPE);
1017
1019
  api.registerGatewayMethod("voicecall.continue.result", async ({ params, respond }) => {
1018
1020
  try {
1019
1021
  const operationId = normalizeOptionalString(params?.operationId) ?? "";
@@ -1030,7 +1032,7 @@ var voice_call_default = definePluginEntry({
1030
1032
  } catch (err) {
1031
1033
  sendError(respond, err);
1032
1034
  }
1033
- });
1035
+ }, VOICE_CALL_READ_METHOD_SCOPE);
1034
1036
  api.registerGatewayMethod("voicecall.speak", async ({ params, respond }) => {
1035
1037
  try {
1036
1038
  const request = await resolveCallMessageRequest(params);
@@ -1039,10 +1041,18 @@ var voice_call_default = definePluginEntry({
1039
1041
  return;
1040
1042
  }
1041
1043
  if (request.rt.config.realtime.enabled) {
1042
- if (request.rt.webhookServer.speakRealtime(request.callId, request.message).success) {
1044
+ const realtimeResult = request.rt.webhookServer.speakRealtime(request.callId, request.message);
1045
+ if (realtimeResult.success) {
1043
1046
  respond(true, { success: true });
1044
1047
  return;
1045
1048
  }
1049
+ if (params?.allowTwimlFallback === false) {
1050
+ respond(true, {
1051
+ success: false,
1052
+ error: realtimeResult.error ?? "Realtime bridge is not active"
1053
+ });
1054
+ return;
1055
+ }
1046
1056
  }
1047
1057
  const result = await request.rt.manager.speak(request.callId, request.message);
1048
1058
  if (!result.success) {
@@ -1053,7 +1063,7 @@ var voice_call_default = definePluginEntry({
1053
1063
  } catch (err) {
1054
1064
  sendError(respond, err);
1055
1065
  }
1056
- });
1066
+ }, VOICE_CALL_WRITE_METHOD_SCOPE);
1057
1067
  api.registerGatewayMethod("voicecall.dtmf", async ({ params, respond }) => {
1058
1068
  try {
1059
1069
  const callId = normalizeOptionalString(params?.callId) ?? "";
@@ -1071,7 +1081,7 @@ var voice_call_default = definePluginEntry({
1071
1081
  } catch (err) {
1072
1082
  sendError(respond, err);
1073
1083
  }
1074
- });
1084
+ }, VOICE_CALL_WRITE_METHOD_SCOPE);
1075
1085
  api.registerGatewayMethod("voicecall.end", async ({ params, respond }) => {
1076
1086
  try {
1077
1087
  const callId = normalizeOptionalString(params?.callId) ?? "";
@@ -1088,7 +1098,7 @@ var voice_call_default = definePluginEntry({
1088
1098
  } catch (err) {
1089
1099
  sendError(respond, err);
1090
1100
  }
1091
- });
1101
+ }, VOICE_CALL_WRITE_METHOD_SCOPE);
1092
1102
  api.registerGatewayMethod("voicecall.status", async ({ params, respond }) => {
1093
1103
  try {
1094
1104
  const raw = normalizeOptionalString(params?.callId) ?? normalizeOptionalString(params?.sid) ?? "";
@@ -1112,7 +1122,7 @@ var voice_call_default = definePluginEntry({
1112
1122
  } catch (err) {
1113
1123
  sendError(respond, err);
1114
1124
  }
1115
- });
1125
+ }, VOICE_CALL_READ_METHOD_SCOPE);
1116
1126
  api.registerGatewayMethod("voicecall.start", async ({ params, respond }) => {
1117
1127
  try {
1118
1128
  const to = normalizeOptionalString(params?.to) ?? "";
@@ -1122,10 +1132,9 @@ var voice_call_default = definePluginEntry({
1122
1132
  respondError(respond, "to required", ErrorCodes.INVALID_REQUEST);
1123
1133
  return;
1124
1134
  }
1125
- const rt = await ensureRuntime();
1126
1135
  const mode = params?.mode === "notify" || params?.mode === "conversation" ? params.mode : void 0;
1127
1136
  await initiateCallAndRespond({
1128
- rt,
1137
+ rt: await ensureRuntime(),
1129
1138
  respond,
1130
1139
  to,
1131
1140
  message: message || void 0,
@@ -1135,7 +1144,7 @@ var voice_call_default = definePluginEntry({
1135
1144
  } catch (err) {
1136
1145
  sendError(respond, err);
1137
1146
  }
1138
- });
1147
+ }, VOICE_CALL_WRITE_METHOD_SCOPE);
1139
1148
  api.registerTool({
1140
1149
  name: "voice_call",
1141
1150
  label: "Voice Call",
@@ -2,10 +2,155 @@ import { formatErrorMessage } from "openclaw/plugin-sdk/error-runtime";
2
2
  import { REALTIME_VOICE_AGENT_CONSULT_TOOL_NAME, buildRealtimeVoiceAgentConsultWorkingResponse, createRealtimeVoiceBridgeSession } from "openclaw/plugin-sdk/realtime-voice";
3
3
  import { randomUUID } from "node:crypto";
4
4
  import WebSocket, { WebSocketServer } from "ws";
//#region extensions/voice-call/src/webhook/realtime-audio-pacer.ts
/** Samples per second of the telephony audio; frames below are treated as one byte per sample. */
const TELEPHONY_SAMPLE_RATE = 8e3;
/** Size of one outbound media frame in bytes (160 bytes = 20 ms at 8000 bytes/s). */
const TELEPHONY_CHUNK_BYTES = 160;
/** Fallback pacing delay, in milliseconds, for a frame that carries no duration. */
const TELEPHONY_CHUNK_MS = 20;
/** Normalized RMS (0..1) at or above which an inbound chunk counts as loud. */
const DEFAULT_SPEECH_RMS_THRESHOLD = .02;
/** Consecutive loud chunks required before a speech start is reported. */
const DEFAULT_REQUIRED_LOUD_CHUNKS = 2;
/** Consecutive quiet chunks required before the detector re-arms. */
const DEFAULT_REQUIRED_QUIET_CHUNKS = 10;
/** Default cap on queued, not-yet-sent audio: 120 seconds at 8000 bytes/s. */
const DEFAULT_MAX_QUEUED_AUDIO_BYTES = TELEPHONY_SAMPLE_RATE * 120;
/** Divisor used to normalize decoded 16-bit samples into the -1..1 range. */
const PCM16_MAX_AMPLITUDE = 32768;
/**
 * Lookup table mapping every mu-law byte (0..255) to its decoded linear sample.
 * `decodeMulawSample` is a hoisted function declaration, so it is safe to call
 * it here even though it is defined further down.
 */
const MULAW_LINEAR_SAMPLES = new Int16Array(256);
for (let i = 0; i < MULAW_LINEAR_SAMPLES.length; i += 1) {
  MULAW_LINEAR_SAMPLES[i] = decodeMulawSample(i);
}

/**
 * Queues outbound mu-law audio and marks, and releases them through
 * `params.sendJson` as `media` / `mark` / `clear` messages, one audio frame
 * per frame duration, so that queued audio can still be dropped locally by
 * `clearAudio()` before it has been sent.
 *
 * Constructor params:
 * - `streamSid`: identifier copied into every emitted message.
 * - `sendJson(message)`: sends one message; a falsy return value means the
 *   message could not be sent and makes the pacer drop its queue.
 * - `maxQueuedAudioBytes` (optional): cap on queued audio bytes; defaults to
 *   `DEFAULT_MAX_QUEUED_AUDIO_BYTES`.
 * - `onBackpressure` (optional): invoked after the pacer has closed itself
 *   because the cap would have been exceeded.
 */
class RealtimeTwilioAudioPacer {
  constructor(params) {
    this.params = params;
    this.queue = [];
    this.timer = null;
    this.queuedAudioBytes = 0;
    this.closed = false;
  }

  /**
   * Splits `muLaw` into frames of at most `TELEPHONY_CHUNK_BYTES` and queues
   * them. If queuing a frame would exceed the byte cap, the pacer closes
   * itself (dropping everything queued) and reports backpressure.
   *
   * @param {Buffer|Uint8Array} muLaw - audio bytes; empty input is ignored.
   */
  sendAudio(muLaw) {
    if (this.closed || muLaw.length === 0) return;
    const maxQueuedAudioBytes = this.params.maxQueuedAudioBytes ?? DEFAULT_MAX_QUEUED_AUDIO_BYTES;
    for (let offset = 0; offset < muLaw.length; offset += TELEPHONY_CHUNK_BYTES) {
      // Copy the slice so the queued frame does not alias the caller's buffer.
      const chunk = Buffer.from(muLaw.subarray(offset, offset + TELEPHONY_CHUNK_BYTES));
      if (this.queuedAudioBytes + chunk.length > maxQueuedAudioBytes) {
        this.failBackpressure();
        return;
      }
      this.queue.push({
        type: "audio",
        chunk,
        durationMs: Math.max(1, Math.round(chunk.length / TELEPHONY_SAMPLE_RATE * 1e3))
      });
      this.queuedAudioBytes += chunk.length;
    }
    this.ensurePump();
  }

  /**
   * Queues a mark message behind any audio that is already queued.
   *
   * @param {string} name - mark name; falsy names are ignored.
   */
  sendMark(name) {
    if (this.closed || !name) return;
    this.queue.push({
      type: "mark",
      name
    });
    this.ensurePump();
  }

  /** Drops everything queued and immediately emits a `clear` message. */
  clearAudio() {
    if (this.closed) return;
    this.clearTimer();
    this.queue = [];
    this.queuedAudioBytes = 0;
    this.params.sendJson({
      event: "clear",
      streamSid: this.params.streamSid
    });
  }

  /** Permanently stops the pacer; later send/clear calls become no-ops. */
  close() {
    this.closed = true;
    this.clearTimer();
    this.queue = [];
    this.queuedAudioBytes = 0;
  }

  /** Cancels the pending pump timer, if there is one. */
  clearTimer() {
    if (!this.timer) return;
    clearTimeout(this.timer);
    this.timer = null;
  }

  /** Starts pumping right away unless a pump is already scheduled. */
  ensurePump() {
    if (!this.timer) this.pump();
  }

  /** Closes the pacer, then notifies the optional backpressure callback. */
  failBackpressure() {
    this.close();
    this.params.onBackpressure?.();
  }

  /**
   * Sends the next queued item and schedules the following pump.
   *
   * After an audio frame the timer is always held for that frame's duration,
   * even when the queue has just drained. Without that hold, a frame enqueued
   * right after the last one would be sent immediately, so frames arriving
   * one at a time would bypass pacing altogether.
   */
  pump() {
    this.timer = null;
    if (this.closed) return;
    const item = this.queue.shift();
    if (!item) return;
    let delayMs = 0;
    let sent;
    if (item.type === "audio") {
      this.queuedAudioBytes = Math.max(0, this.queuedAudioBytes - item.chunk.length);
      sent = this.params.sendJson({
        event: "media",
        streamSid: this.params.streamSid,
        media: { payload: item.chunk.toString("base64") }
      });
      delayMs = item.durationMs || TELEPHONY_CHUNK_MS;
    } else {
      sent = this.params.sendJson({
        event: "mark",
        streamSid: this.params.streamSid,
        mark: { name: item.name }
      });
    }
    if (!sent) {
      // The transport rejected the message: nothing queued can be delivered.
      this.queue = [];
      this.queuedAudioBytes = 0;
      return;
    }
    // sendJson may have closed the pacer or re-entered it and armed a timer.
    if (this.closed || this.timer) return;
    if (delayMs > 0 || this.queue.length > 0) {
      this.timer = setTimeout(() => this.pump(), delayMs);
    }
  }
}

/**
 * Computes the root-mean-square level of a mu-law chunk, with each decoded
 * sample normalized by `PCM16_MAX_AMPLITUDE`.
 *
 * @param {Buffer|Uint8Array} muLaw - mu-law encoded bytes.
 * @returns {number} RMS level; 0 for an empty chunk.
 */
function calculateMulawRms(muLaw) {
  if (muLaw.length === 0) return 0;
  let sum = 0;
  for (let i = 0; i < muLaw.length; i += 1) {
    const normalized = (MULAW_LINEAR_SAMPLES[muLaw[i] ?? 0] ?? 0) / PCM16_MAX_AMPLITUDE;
    sum += normalized * normalized;
  }
  return Math.sqrt(sum / muLaw.length);
}

/**
 * Detects the start of speech in a stream of mu-law chunks from their RMS
 * level. A start is reported once per utterance; the detector re-arms only
 * after enough consecutive quiet chunks.
 *
 * Optional params: `rmsThreshold`, `requiredLoudChunks`,
 * `requiredQuietChunks`; each falls back to the matching `DEFAULT_*` constant.
 */
class RealtimeMulawSpeechStartDetector {
  constructor(params = {}) {
    this.params = params;
    this.loudChunks = 0;
    this.quietChunks = DEFAULT_REQUIRED_QUIET_CHUNKS;
    this.speaking = false;
  }

  /**
   * Feeds one chunk to the detector.
   *
   * @param {Buffer|Uint8Array} muLaw - mu-law encoded bytes.
   * @returns {boolean} `true` only for the chunk that completes the required
   *   run of loud chunks while not already speaking; `false` otherwise.
   */
  accept(muLaw) {
    const threshold = this.params.rmsThreshold ?? DEFAULT_SPEECH_RMS_THRESHOLD;
    if (calculateMulawRms(muLaw) >= threshold) {
      this.quietChunks = 0;
      this.loudChunks += 1;
      const requiredLoudChunks = this.params.requiredLoudChunks ?? DEFAULT_REQUIRED_LOUD_CHUNKS;
      if (!this.speaking && this.loudChunks >= requiredLoudChunks) {
        this.speaking = true;
        return true;
      }
      return false;
    }
    // Any quiet chunk breaks the current run of loud chunks.
    this.loudChunks = 0;
    this.quietChunks += 1;
    const requiredQuietChunks = this.params.requiredQuietChunks ?? DEFAULT_REQUIRED_QUIET_CHUNKS;
    if (this.quietChunks >= requiredQuietChunks) this.speaking = false;
    return false;
  }
}

/**
 * Decodes one mu-law byte into a signed linear sample.
 *
 * @param {number} value - mu-law byte (0..255).
 * @returns {number} decoded sample; with this formula the magnitude is at
 *   most 32124, so it fits in an Int16Array slot.
 */
function decodeMulawSample(value) {
  // The byte is stored bit-inverted: undo that before splitting the fields.
  const muLaw = ~value & 255;
  const sign = muLaw & 128;
  const exponent = muLaw >> 4 & 7;
  const mantissa = muLaw & 15;
  // 132 is the bias added before the shift and removed again afterwards.
  const magnitude = ((mantissa << 3) + 132 << exponent) - 132;
  return sign ? -magnitude : magnitude;
}
//#endregion
5
149
  //#region extensions/voice-call/src/webhook/realtime-handler.ts
6
150
  const STREAM_TOKEN_TTL_MS = 3e4;
7
151
  const DEFAULT_HOST = "localhost:8443";
8
152
  const MAX_REALTIME_MESSAGE_BYTES = 256 * 1024;
153
+ const MAX_REALTIME_WS_BUFFERED_BYTES = 1024 * 1024;
9
154
  function normalizePath(pathname) {
10
155
  const trimmed = pathname.trim();
11
156
  if (!trimmed) return "/";
@@ -99,7 +244,8 @@ var RealtimeCallHandler = class {
99
244
  if (!bridge) return;
100
245
  const mediaData = typeof msg.media === "object" && msg.media !== null ? msg.media : void 0;
101
246
  if (msg.event === "media" && typeof mediaData?.payload === "string") {
102
- bridge.sendAudio(Buffer.from(mediaData.payload, "base64"));
247
+ const audio = Buffer.from(mediaData.payload, "base64");
248
+ bridge.sendAudio(audio);
103
249
  if (typeof mediaData.timestamp === "number") bridge.setMediaTimestamp(mediaData.timestamp);
104
250
  else if (typeof mediaData.timestamp === "string") bridge.setMediaTimestamp(Number.parseInt(mediaData.timestamp, 10));
105
251
  return;
@@ -174,7 +320,28 @@ var RealtimeCallHandler = class {
174
320
  callEndEmitted = true;
175
321
  this.endCallInManager(callSid, callId, reason);
176
322
  };
177
- const bridge = createRealtimeVoiceBridgeSession({
323
+ const sendJson = (message) => {
324
+ if (ws.readyState !== WebSocket.OPEN) return false;
325
+ if (ws.bufferedAmount > MAX_REALTIME_WS_BUFFERED_BYTES) {
326
+ ws.close(1013, "Backpressure: send buffer exceeded");
327
+ return false;
328
+ }
329
+ ws.send(JSON.stringify(message));
330
+ if (ws.bufferedAmount > MAX_REALTIME_WS_BUFFERED_BYTES) {
331
+ ws.close(1013, "Backpressure: send buffer exceeded");
332
+ return false;
333
+ }
334
+ return true;
335
+ };
336
+ const audioPacer = new RealtimeTwilioAudioPacer({
337
+ streamSid,
338
+ sendJson,
339
+ onBackpressure: () => {
340
+ if (ws.readyState === WebSocket.OPEN) ws.close(1013, "Backpressure: paced audio queue exceeded");
341
+ }
342
+ });
343
+ const speechDetector = new RealtimeMulawSpeechStartDetector();
344
+ const session = createRealtimeVoiceBridgeSession({
178
345
  provider: this.realtimeProvider,
179
346
  providerConfig: this.providerConfig,
180
347
  instructions: this.config.instructions,
@@ -184,24 +351,13 @@ var RealtimeCallHandler = class {
184
351
  audioSink: {
185
352
  isOpen: () => ws.readyState === WebSocket.OPEN,
186
353
  sendAudio: (muLaw) => {
187
- ws.send(JSON.stringify({
188
- event: "media",
189
- streamSid,
190
- media: { payload: muLaw.toString("base64") }
191
- }));
354
+ audioPacer.sendAudio(muLaw);
192
355
  },
193
356
  clearAudio: () => {
194
- ws.send(JSON.stringify({
195
- event: "clear",
196
- streamSid
197
- }));
357
+ audioPacer.clearAudio();
198
358
  },
199
359
  sendMark: (markName) => {
200
- ws.send(JSON.stringify({
201
- event: "mark",
202
- streamSid,
203
- mark: { name: markName }
204
- }));
360
+ audioPacer.sendMark(markName);
205
361
  }
206
362
  },
207
363
  onTranscript: (role, text, isFinal) => {
@@ -242,7 +398,10 @@ var RealtimeCallHandler = class {
242
398
  this.activeBridgesByCallId.delete(callId);
243
399
  this.activeBridgesByCallId.delete(callSid);
244
400
  this.partialUserTranscriptsByCallId.delete(callId);
245
- if (reason !== "error") return;
401
+ if (reason !== "error") {
402
+ emitCallEnd("completed");
403
+ return;
404
+ }
246
405
  emitCallEnd("error");
247
406
  if (ws.readyState === WebSocket.OPEN) ws.close(1011, "Bridge disconnected");
248
407
  this.provider.hangupCall({
@@ -254,22 +413,28 @@ var RealtimeCallHandler = class {
254
413
  });
255
414
  }
256
415
  });
257
- this.activeBridgesByCallId.set(callId, bridge);
258
- this.activeBridgesByCallId.set(callSid, bridge);
259
- const closeBridge = bridge.close.bind(bridge);
260
- bridge.close = () => {
416
+ this.activeBridgesByCallId.set(callId, session);
417
+ this.activeBridgesByCallId.set(callSid, session);
418
+ const sendAudioToSession = session.sendAudio.bind(session);
419
+ session.sendAudio = (audio) => {
420
+ if (speechDetector.accept(audio)) audioPacer.clearAudio();
421
+ sendAudioToSession(audio);
422
+ };
423
+ const closeSession = session.close.bind(session);
424
+ session.close = () => {
261
425
  this.activeBridgesByCallId.delete(callId);
262
426
  this.activeBridgesByCallId.delete(callSid);
263
427
  this.partialUserTranscriptsByCallId.delete(callId);
264
- closeBridge();
428
+ audioPacer.close();
429
+ closeSession();
265
430
  };
266
- bridge.connect().catch((error) => {
431
+ session.connect().catch((error) => {
267
432
  console.error("[voice-call] Failed to connect realtime bridge:", error);
268
- bridge.close();
433
+ session.close();
269
434
  emitCallEnd("error");
270
435
  ws.close(1011, "Failed to connect");
271
436
  });
272
- return bridge;
437
+ return session;
273
438
  }
274
439
  registerCallInManager(callSid, callerMeta = {}) {
275
440
  const baseFields = {
@@ -2861,7 +2861,7 @@ function loadRealtimeVoiceRuntime() {
2861
2861
  return realtimeVoiceRuntimePromise;
2862
2862
  }
2863
2863
  function loadRealtimeHandler() {
2864
- realtimeHandlerPromise ??= import("./realtime-handler-B63CIDP2.js");
2864
+ realtimeHandlerPromise ??= import("./realtime-handler-C-SaPrny.js");
2865
2865
  return realtimeHandlerPromise;
2866
2866
  }
2867
2867
  function resolveVoiceCallConsultSessionKey(call) {
@@ -1,2 +1,2 @@
1
- import { t as createVoiceCallRuntime } from "./runtime-entry-88ytYAQa.js";
1
+ import { t as createVoiceCallRuntime } from "./runtime-entry-DFzuGKLG.js";
2
2
  export { createVoiceCallRuntime };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@openclaw/voice-call",
3
- "version": "2026.5.3",
3
+ "version": "2026.5.4-beta.2",
4
4
  "description": "OpenClaw voice-call plugin",
5
5
  "repository": {
6
6
  "type": "git",
@@ -17,7 +17,7 @@
17
17
  "openclaw": "workspace:*"
18
18
  },
19
19
  "peerDependencies": {
20
- "openclaw": ">=2026.5.3"
20
+ "openclaw": ">=2026.5.4-beta.2"
21
21
  },
22
22
  "peerDependenciesMeta": {
23
23
  "openclaw": {
@@ -34,10 +34,10 @@
34
34
  "minHostVersion": ">=2026.4.10"
35
35
  },
36
36
  "compat": {
37
- "pluginApi": ">=2026.5.3"
37
+ "pluginApi": ">=2026.5.4-beta.2"
38
38
  },
39
39
  "build": {
40
- "openclawVersion": "2026.5.3"
40
+ "openclawVersion": "2026.5.4-beta.2"
41
41
  },
42
42
  "release": {
43
43
  "publishToClawHub": true,