@openclaw/voice-call 2026.2.22 → 2026.2.23

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -175,5 +175,7 @@ Actions:
175
175
  ## Notes
176
176
 
177
177
  - Uses webhook signature verification for Twilio/Telnyx/Plivo.
178
+ - Adds replay protection for Twilio and Plivo webhooks (valid duplicate callbacks are ignored safely).
179
+ - Twilio speech turns include a per-turn token so stale/replayed callbacks cannot complete a newer turn.
178
180
  - `responseModel` / `responseSystemPrompt` control AI auto-responses.
179
181
  - Media streaming requires `ws` and OpenAI Realtime API key.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@openclaw/voice-call",
3
- "version": "2026.2.22",
3
+ "version": "2026.2.23",
4
4
  "description": "OpenClaw voice-call plugin",
5
5
  "type": "module",
6
6
  "dependencies": {
package/src/cli.ts CHANGED
@@ -81,6 +81,27 @@ function summarizeSeries(values: number[]): {
81
81
  };
82
82
  }
83
83
 
84
+ function resolveCallMode(mode?: string): "notify" | "conversation" | undefined {
85
+ return mode === "notify" || mode === "conversation" ? mode : undefined;
86
+ }
87
+
88
+ async function initiateCallAndPrintId(params: {
89
+ runtime: VoiceCallRuntime;
90
+ to: string;
91
+ message?: string;
92
+ mode?: string;
93
+ }) {
94
+ const result = await params.runtime.manager.initiateCall(params.to, undefined, {
95
+ message: params.message,
96
+ mode: resolveCallMode(params.mode),
97
+ });
98
+ if (!result.success) {
99
+ throw new Error(result.error || "initiate failed");
100
+ }
101
+ // eslint-disable-next-line no-console
102
+ console.log(JSON.stringify({ callId: result.callId }, null, 2));
103
+ }
104
+
84
105
  export function registerVoiceCallCli(params: {
85
106
  program: Command;
86
107
  config: VoiceCallConfig;
@@ -112,16 +133,12 @@ export function registerVoiceCallCli(params: {
112
133
  if (!to) {
113
134
  throw new Error("Missing --to and no toNumber configured");
114
135
  }
115
- const result = await rt.manager.initiateCall(to, undefined, {
136
+ await initiateCallAndPrintId({
137
+ runtime: rt,
138
+ to,
116
139
  message: options.message,
117
- mode:
118
- options.mode === "notify" || options.mode === "conversation" ? options.mode : undefined,
140
+ mode: options.mode,
119
141
  });
120
- if (!result.success) {
121
- throw new Error(result.error || "initiate failed");
122
- }
123
- // eslint-disable-next-line no-console
124
- console.log(JSON.stringify({ callId: result.callId }, null, 2));
125
142
  });
126
143
 
127
144
  root
@@ -136,16 +153,12 @@ export function registerVoiceCallCli(params: {
136
153
  )
137
154
  .action(async (options: { to: string; message?: string; mode?: string }) => {
138
155
  const rt = await ensureRuntime();
139
- const result = await rt.manager.initiateCall(options.to, undefined, {
156
+ await initiateCallAndPrintId({
157
+ runtime: rt,
158
+ to: options.to,
140
159
  message: options.message,
141
- mode:
142
- options.mode === "notify" || options.mode === "conversation" ? options.mode : undefined,
160
+ mode: options.mode,
143
161
  });
144
- if (!result.success) {
145
- throw new Error(result.error || "initiate failed");
146
- }
147
- // eslint-disable-next-line no-console
148
- console.log(JSON.stringify({ callId: result.callId }, null, 2));
149
162
  });
150
163
 
151
164
  root
@@ -6,6 +6,7 @@ export type TranscriptWaiter = {
6
6
  resolve: (text: string) => void;
7
7
  reject: (err: Error) => void;
8
8
  timeout: NodeJS.Timeout;
9
+ turnToken?: string;
9
10
  };
10
11
 
11
12
  export type CallManagerRuntimeState = {
@@ -71,19 +71,26 @@ function createInboundInitiatedEvent(params: {
71
71
  };
72
72
  }
73
73
 
74
+ function createRejectingInboundContext(): {
75
+ ctx: CallManagerContext;
76
+ hangupCalls: HangupCallInput[];
77
+ } {
78
+ const hangupCalls: HangupCallInput[] = [];
79
+ const provider = createProvider({
80
+ hangupCall: async (input: HangupCallInput): Promise<void> => {
81
+ hangupCalls.push(input);
82
+ },
83
+ });
84
+ const ctx = createContext({
85
+ config: createInboundDisabledConfig(),
86
+ provider,
87
+ });
88
+ return { ctx, hangupCalls };
89
+ }
90
+
74
91
  describe("processEvent (functional)", () => {
75
92
  it("calls provider hangup when rejecting inbound call", () => {
76
- const hangupCalls: HangupCallInput[] = [];
77
- const provider = createProvider({
78
- hangupCall: async (input: HangupCallInput): Promise<void> => {
79
- hangupCalls.push(input);
80
- },
81
- });
82
-
83
- const ctx = createContext({
84
- config: createInboundDisabledConfig(),
85
- provider,
86
- });
93
+ const { ctx, hangupCalls } = createRejectingInboundContext();
87
94
  const event = createInboundInitiatedEvent({
88
95
  id: "evt-1",
89
96
  providerCallId: "prov-1",
@@ -118,16 +125,7 @@ describe("processEvent (functional)", () => {
118
125
  });
119
126
 
120
127
  it("calls hangup only once for duplicate events for same rejected call", () => {
121
- const hangupCalls: HangupCallInput[] = [];
122
- const provider = createProvider({
123
- hangupCall: async (input: HangupCallInput): Promise<void> => {
124
- hangupCalls.push(input);
125
- },
126
- });
127
- const ctx = createContext({
128
- config: createInboundDisabledConfig(),
129
- provider,
130
- });
128
+ const { ctx, hangupCalls } = createRejectingInboundContext();
131
129
  const event1 = createInboundInitiatedEvent({
132
130
  id: "evt-init",
133
131
  providerCallId: "prov-dup",
@@ -236,4 +234,49 @@ describe("processEvent (functional)", () => {
236
234
  expect(() => processEvent(ctx, event)).not.toThrow();
237
235
  expect(ctx.activeCalls.size).toBe(0);
238
236
  });
237
+
238
+ it("deduplicates by dedupeKey even when event IDs differ", () => {
239
+ const now = Date.now();
240
+ const ctx = createContext();
241
+ ctx.activeCalls.set("call-dedupe", {
242
+ callId: "call-dedupe",
243
+ providerCallId: "provider-dedupe",
244
+ provider: "plivo",
245
+ direction: "outbound",
246
+ state: "answered",
247
+ from: "+15550000000",
248
+ to: "+15550000001",
249
+ startedAt: now,
250
+ transcript: [],
251
+ processedEventIds: [],
252
+ metadata: {},
253
+ });
254
+ ctx.providerCallIdMap.set("provider-dedupe", "call-dedupe");
255
+
256
+ processEvent(ctx, {
257
+ id: "evt-1",
258
+ dedupeKey: "stable-key-1",
259
+ type: "call.speech",
260
+ callId: "call-dedupe",
261
+ providerCallId: "provider-dedupe",
262
+ timestamp: now + 1,
263
+ transcript: "hello",
264
+ isFinal: true,
265
+ });
266
+
267
+ processEvent(ctx, {
268
+ id: "evt-2",
269
+ dedupeKey: "stable-key-1",
270
+ type: "call.speech",
271
+ callId: "call-dedupe",
272
+ providerCallId: "provider-dedupe",
273
+ timestamp: now + 2,
274
+ transcript: "hello",
275
+ isFinal: true,
276
+ });
277
+
278
+ const call = ctx.activeCalls.get("call-dedupe");
279
+ expect(call?.transcript).toHaveLength(1);
280
+ expect(Array.from(ctx.processedEventIds)).toEqual(["stable-key-1"]);
281
+ });
239
282
  });
@@ -92,10 +92,11 @@ function createInboundCall(params: {
92
92
  }
93
93
 
94
94
  export function processEvent(ctx: EventContext, event: NormalizedEvent): void {
95
- if (ctx.processedEventIds.has(event.id)) {
95
+ const dedupeKey = event.dedupeKey || event.id;
96
+ if (ctx.processedEventIds.has(dedupeKey)) {
96
97
  return;
97
98
  }
98
- ctx.processedEventIds.add(event.id);
99
+ ctx.processedEventIds.add(dedupeKey);
99
100
 
100
101
  let call = findCall({
101
102
  activeCalls: ctx.activeCalls,
@@ -158,7 +159,7 @@ export function processEvent(ctx: EventContext, event: NormalizedEvent): void {
158
159
  }
159
160
  }
160
161
 
161
- call.processedEventIds.push(event.id);
162
+ call.processedEventIds.push(dedupeKey);
162
163
 
163
164
  switch (event.type) {
164
165
  case "call.initiated":
@@ -192,8 +193,20 @@ export function processEvent(ctx: EventContext, event: NormalizedEvent): void {
192
193
 
193
194
  case "call.speech":
194
195
  if (event.isFinal) {
196
+ const hadWaiter = ctx.transcriptWaiters.has(call.callId);
197
+ const resolved = resolveTranscriptWaiter(
198
+ ctx,
199
+ call.callId,
200
+ event.transcript,
201
+ event.turnToken,
202
+ );
203
+ if (hadWaiter && !resolved) {
204
+ console.warn(
205
+ `[voice-call] Ignoring speech event with mismatched turn token for ${call.callId}`,
206
+ );
207
+ break;
208
+ }
195
209
  addTranscriptEntry(call, "user", event.transcript);
196
- resolveTranscriptWaiter(ctx, call.callId, event.transcript);
197
210
  }
198
211
  transitionState(call, "listening");
199
212
  break;
@@ -63,6 +63,15 @@ type ConnectedCallLookup =
63
63
  provider: NonNullable<ConnectedCallContext["provider"]>;
64
64
  };
65
65
 
66
+ type ConnectedCallResolution =
67
+ | { ok: false; error: string }
68
+ | {
69
+ ok: true;
70
+ call: CallRecord;
71
+ providerCallId: string;
72
+ provider: NonNullable<ConnectedCallContext["provider"]>;
73
+ };
74
+
66
75
  function lookupConnectedCall(ctx: ConnectedCallContext, callId: CallId): ConnectedCallLookup {
67
76
  const call = ctx.activeCalls.get(callId);
68
77
  if (!call) {
@@ -77,6 +86,22 @@ function lookupConnectedCall(ctx: ConnectedCallContext, callId: CallId): Connect
77
86
  return { kind: "ok", call, providerCallId: call.providerCallId, provider: ctx.provider };
78
87
  }
79
88
 
89
+ function requireConnectedCall(ctx: ConnectedCallContext, callId: CallId): ConnectedCallResolution {
90
+ const lookup = lookupConnectedCall(ctx, callId);
91
+ if (lookup.kind === "error") {
92
+ return { ok: false, error: lookup.error };
93
+ }
94
+ if (lookup.kind === "ended") {
95
+ return { ok: false, error: "Call has ended" };
96
+ }
97
+ return {
98
+ ok: true,
99
+ call: lookup.call,
100
+ providerCallId: lookup.providerCallId,
101
+ provider: lookup.provider,
102
+ };
103
+ }
104
+
80
105
  export async function initiateCall(
81
106
  ctx: InitiateContext,
82
107
  to: string,
@@ -175,14 +200,11 @@ export async function speak(
175
200
  callId: CallId,
176
201
  text: string,
177
202
  ): Promise<{ success: boolean; error?: string }> {
178
- const lookup = lookupConnectedCall(ctx, callId);
179
- if (lookup.kind === "error") {
180
- return { success: false, error: lookup.error };
203
+ const connected = requireConnectedCall(ctx, callId);
204
+ if (!connected.ok) {
205
+ return { success: false, error: connected.error };
181
206
  }
182
- if (lookup.kind === "ended") {
183
- return { success: false, error: "Call has ended" };
184
- }
185
- const { call, providerCallId, provider } = lookup;
207
+ const { call, providerCallId, provider } = connected;
186
208
 
187
209
  try {
188
210
  transitionState(call, "speaking");
@@ -257,14 +279,11 @@ export async function continueCall(
257
279
  callId: CallId,
258
280
  prompt: string,
259
281
  ): Promise<{ success: boolean; transcript?: string; error?: string }> {
260
- const lookup = lookupConnectedCall(ctx, callId);
261
- if (lookup.kind === "error") {
262
- return { success: false, error: lookup.error };
282
+ const connected = requireConnectedCall(ctx, callId);
283
+ if (!connected.ok) {
284
+ return { success: false, error: connected.error };
263
285
  }
264
- if (lookup.kind === "ended") {
265
- return { success: false, error: "Call has ended" };
266
- }
267
- const { call, providerCallId, provider } = lookup;
286
+ const { call, providerCallId, provider } = connected;
268
287
 
269
288
  if (ctx.activeTurnCalls.has(callId) || ctx.transcriptWaiters.has(callId)) {
270
289
  return { success: false, error: "Already waiting for transcript" };
@@ -272,6 +291,7 @@ export async function continueCall(
272
291
  ctx.activeTurnCalls.add(callId);
273
292
 
274
293
  const turnStartedAt = Date.now();
294
+ const turnToken = provider.name === "twilio" ? crypto.randomUUID() : undefined;
275
295
 
276
296
  try {
277
297
  await speak(ctx, callId, prompt);
@@ -280,9 +300,9 @@ export async function continueCall(
280
300
  persistCallRecord(ctx.storePath, call);
281
301
 
282
302
  const listenStartedAt = Date.now();
283
- await provider.startListening({ callId, providerCallId });
303
+ await provider.startListening({ callId, providerCallId, turnToken });
284
304
 
285
- const transcript = await waitForFinalTranscript(ctx, callId);
305
+ const transcript = await waitForFinalTranscript(ctx, callId, turnToken);
286
306
  const transcriptReceivedAt = Date.now();
287
307
 
288
308
  // Best-effort: stop listening after final transcript.
@@ -77,16 +77,25 @@ export function resolveTranscriptWaiter(
77
77
  ctx: TranscriptWaiterContext,
78
78
  callId: CallId,
79
79
  transcript: string,
80
- ): void {
80
+ turnToken?: string,
81
+ ): boolean {
81
82
  const waiter = ctx.transcriptWaiters.get(callId);
82
83
  if (!waiter) {
83
- return;
84
+ return false;
85
+ }
86
+ if (waiter.turnToken && waiter.turnToken !== turnToken) {
87
+ return false;
84
88
  }
85
89
  clearTranscriptWaiter(ctx, callId);
86
90
  waiter.resolve(transcript);
91
+ return true;
87
92
  }
88
93
 
89
- export function waitForFinalTranscript(ctx: TimerContext, callId: CallId): Promise<string> {
94
+ export function waitForFinalTranscript(
95
+ ctx: TimerContext,
96
+ callId: CallId,
97
+ turnToken?: string,
98
+ ): Promise<string> {
90
99
  if (ctx.transcriptWaiters.has(callId)) {
91
100
  return Promise.reject(new Error("Already waiting for transcript"));
92
101
  }
@@ -98,6 +107,6 @@ export function waitForFinalTranscript(ctx: TimerContext, callId: CallId): Promi
98
107
  reject(new Error(`Timed out waiting for transcript after ${timeoutMs}ms`));
99
108
  }, timeoutMs);
100
109
 
101
- ctx.transcriptWaiters.set(callId, { resolve, reject, timeout });
110
+ ctx.transcriptWaiters.set(callId, { resolve, reject, timeout, turnToken });
102
111
  });
103
112
  }
@@ -17,12 +17,16 @@ import type {
17
17
  } from "./types.js";
18
18
 
19
19
  class FakeProvider implements VoiceCallProvider {
20
- readonly name = "plivo" as const;
20
+ readonly name: "plivo" | "twilio";
21
21
  readonly playTtsCalls: PlayTtsInput[] = [];
22
22
  readonly hangupCalls: HangupCallInput[] = [];
23
23
  readonly startListeningCalls: StartListeningInput[] = [];
24
24
  readonly stopListeningCalls: StopListeningInput[] = [];
25
25
 
26
+ constructor(name: "plivo" | "twilio" = "plivo") {
27
+ this.name = name;
28
+ }
29
+
26
30
  verifyWebhook(_ctx: WebhookContext): WebhookVerificationResult {
27
31
  return { ok: true };
28
32
  }
@@ -319,6 +323,61 @@ describe("CallManager", () => {
319
323
  expect(provider.stopListeningCalls).toHaveLength(1);
320
324
  });
321
325
 
326
+ it("ignores speech events with mismatched turnToken while waiting for transcript", async () => {
327
+ const { manager, provider } = createManagerHarness(
328
+ {
329
+ transcriptTimeoutMs: 5000,
330
+ },
331
+ new FakeProvider("twilio"),
332
+ );
333
+
334
+ const started = await manager.initiateCall("+15550000004");
335
+ expect(started.success).toBe(true);
336
+
337
+ markCallAnswered(manager, started.callId, "evt-turn-token-answered");
338
+
339
+ const turnPromise = manager.continueCall(started.callId, "Prompt");
340
+ await new Promise((resolve) => setTimeout(resolve, 0));
341
+
342
+ const expectedTurnToken = provider.startListeningCalls[0]?.turnToken;
343
+ expect(typeof expectedTurnToken).toBe("string");
344
+
345
+ manager.processEvent({
346
+ id: "evt-turn-token-bad",
347
+ type: "call.speech",
348
+ callId: started.callId,
349
+ providerCallId: "request-uuid",
350
+ timestamp: Date.now(),
351
+ transcript: "stale replay",
352
+ isFinal: true,
353
+ turnToken: "wrong-token",
354
+ });
355
+
356
+ const pendingState = await Promise.race([
357
+ turnPromise.then(() => "resolved"),
358
+ new Promise<"pending">((resolve) => setTimeout(() => resolve("pending"), 0)),
359
+ ]);
360
+ expect(pendingState).toBe("pending");
361
+
362
+ manager.processEvent({
363
+ id: "evt-turn-token-good",
364
+ type: "call.speech",
365
+ callId: started.callId,
366
+ providerCallId: "request-uuid",
367
+ timestamp: Date.now(),
368
+ transcript: "final answer",
369
+ isFinal: true,
370
+ turnToken: expectedTurnToken,
371
+ });
372
+
373
+ const turnResult = await turnPromise;
374
+ expect(turnResult.success).toBe(true);
375
+ expect(turnResult.transcript).toBe("final answer");
376
+
377
+ const call = manager.getCall(started.callId);
378
+ expect(call?.transcript.map((entry) => entry.text)).toEqual(["Prompt", "final answer"]);
379
+ });
380
+
322
381
  it("tracks latency metadata across multiple closed-loop turns", async () => {
323
382
  const { manager, provider } = createManagerHarness({
324
383
  transcriptTimeoutMs: 5000,
@@ -30,6 +30,29 @@ export interface PlivoProviderOptions {
30
30
  type PendingSpeak = { text: string; locale?: string };
31
31
  type PendingListen = { language?: string };
32
32
 
33
+ function getHeader(
34
+ headers: Record<string, string | string[] | undefined>,
35
+ name: string,
36
+ ): string | undefined {
37
+ const value = headers[name.toLowerCase()];
38
+ if (Array.isArray(value)) {
39
+ return value[0];
40
+ }
41
+ return value;
42
+ }
43
+
44
+ function createPlivoRequestDedupeKey(ctx: WebhookContext): string {
45
+ const nonceV3 = getHeader(ctx.headers, "x-plivo-signature-v3-nonce");
46
+ if (nonceV3) {
47
+ return `plivo:v3:${nonceV3}`;
48
+ }
49
+ const nonceV2 = getHeader(ctx.headers, "x-plivo-signature-v2-nonce");
50
+ if (nonceV2) {
51
+ return `plivo:v2:${nonceV2}`;
52
+ }
53
+ return `plivo:fallback:${crypto.createHash("sha256").update(ctx.rawBody).digest("hex")}`;
54
+ }
55
+
33
56
  export class PlivoProvider implements VoiceCallProvider {
34
57
  readonly name = "plivo" as const;
35
58
 
@@ -104,7 +127,7 @@ export class PlivoProvider implements VoiceCallProvider {
104
127
  console.warn(`[plivo] Webhook verification failed: ${result.reason}`);
105
128
  }
106
129
 
107
- return { ok: result.ok, reason: result.reason };
130
+ return { ok: result.ok, reason: result.reason, isReplay: result.isReplay };
108
131
  }
109
132
 
110
133
  parseWebhookEvent(ctx: WebhookContext): ProviderWebhookParseResult {
@@ -173,7 +196,8 @@ export class PlivoProvider implements VoiceCallProvider {
173
196
 
174
197
  // Normal events.
175
198
  const callIdFromQuery = this.getCallIdFromQuery(ctx);
176
- const event = this.normalizeEvent(parsed, callIdFromQuery);
199
+ const dedupeKey = createPlivoRequestDedupeKey(ctx);
200
+ const event = this.normalizeEvent(parsed, callIdFromQuery, dedupeKey);
177
201
 
178
202
  return {
179
203
  events: event ? [event] : [],
@@ -186,7 +210,11 @@ export class PlivoProvider implements VoiceCallProvider {
186
210
  };
187
211
  }
188
212
 
189
- private normalizeEvent(params: URLSearchParams, callIdOverride?: string): NormalizedEvent | null {
213
+ private normalizeEvent(
214
+ params: URLSearchParams,
215
+ callIdOverride?: string,
216
+ dedupeKey?: string,
217
+ ): NormalizedEvent | null {
190
218
  const callUuid = params.get("CallUUID") || "";
191
219
  const requestUuid = params.get("RequestUUID") || "";
192
220
 
@@ -201,6 +229,7 @@ export class PlivoProvider implements VoiceCallProvider {
201
229
 
202
230
  const baseEvent = {
203
231
  id: crypto.randomUUID(),
232
+ dedupeKey,
204
233
  callId: callIdOverride || callUuid || requestUuid,
205
234
  providerCallId: callUuid || requestUuid || undefined,
206
235
  timestamp: Date.now(),
@@ -331,31 +360,40 @@ export class PlivoProvider implements VoiceCallProvider {
331
360
  });
332
361
  }
333
362
 
334
- async playTts(input: PlayTtsInput): Promise<void> {
335
- const callUuid = this.requestUuidToCallUuid.get(input.providerCallId) ?? input.providerCallId;
363
+ private resolveCallContext(params: {
364
+ providerCallId: string;
365
+ callId: string;
366
+ operation: string;
367
+ }): {
368
+ callUuid: string;
369
+ webhookBase: string;
370
+ } {
371
+ const callUuid = this.requestUuidToCallUuid.get(params.providerCallId) ?? params.providerCallId;
336
372
  const webhookBase =
337
- this.callUuidToWebhookUrl.get(callUuid) || this.callIdToWebhookUrl.get(input.callId);
373
+ this.callUuidToWebhookUrl.get(callUuid) || this.callIdToWebhookUrl.get(params.callId);
338
374
  if (!webhookBase) {
339
375
  throw new Error("Missing webhook URL for this call (provider state missing)");
340
376
  }
341
-
342
377
  if (!callUuid) {
343
- throw new Error("Missing Plivo CallUUID for playTts");
378
+ throw new Error(`Missing Plivo CallUUID for ${params.operation}`);
344
379
  }
380
+ return { callUuid, webhookBase };
381
+ }
345
382
 
346
- const transferUrl = new URL(webhookBase);
383
+ private async transferCallLeg(params: {
384
+ callUuid: string;
385
+ webhookBase: string;
386
+ callId: string;
387
+ flow: "xml-speak" | "xml-listen";
388
+ }): Promise<void> {
389
+ const transferUrl = new URL(params.webhookBase);
347
390
  transferUrl.searchParams.set("provider", "plivo");
348
- transferUrl.searchParams.set("flow", "xml-speak");
349
- transferUrl.searchParams.set("callId", input.callId);
350
-
351
- this.pendingSpeakByCallId.set(input.callId, {
352
- text: input.text,
353
- locale: input.locale,
354
- });
391
+ transferUrl.searchParams.set("flow", params.flow);
392
+ transferUrl.searchParams.set("callId", params.callId);
355
393
 
356
394
  await this.apiRequest({
357
395
  method: "POST",
358
- endpoint: `/Call/${callUuid}/`,
396
+ endpoint: `/Call/${params.callUuid}/`,
359
397
  body: {
360
398
  legs: "aleg",
361
399
  aleg_url: transferUrl.toString(),
@@ -364,35 +402,42 @@ export class PlivoProvider implements VoiceCallProvider {
364
402
  });
365
403
  }
366
404
 
367
- async startListening(input: StartListeningInput): Promise<void> {
368
- const callUuid = this.requestUuidToCallUuid.get(input.providerCallId) ?? input.providerCallId;
369
- const webhookBase =
370
- this.callUuidToWebhookUrl.get(callUuid) || this.callIdToWebhookUrl.get(input.callId);
371
- if (!webhookBase) {
372
- throw new Error("Missing webhook URL for this call (provider state missing)");
373
- }
405
+ async playTts(input: PlayTtsInput): Promise<void> {
406
+ const { callUuid, webhookBase } = this.resolveCallContext({
407
+ providerCallId: input.providerCallId,
408
+ callId: input.callId,
409
+ operation: "playTts",
410
+ });
374
411
 
375
- if (!callUuid) {
376
- throw new Error("Missing Plivo CallUUID for startListening");
377
- }
412
+ this.pendingSpeakByCallId.set(input.callId, {
413
+ text: input.text,
414
+ locale: input.locale,
415
+ });
378
416
 
379
- const transferUrl = new URL(webhookBase);
380
- transferUrl.searchParams.set("provider", "plivo");
381
- transferUrl.searchParams.set("flow", "xml-listen");
382
- transferUrl.searchParams.set("callId", input.callId);
417
+ await this.transferCallLeg({
418
+ callUuid,
419
+ webhookBase,
420
+ callId: input.callId,
421
+ flow: "xml-speak",
422
+ });
423
+ }
424
+
425
+ async startListening(input: StartListeningInput): Promise<void> {
426
+ const { callUuid, webhookBase } = this.resolveCallContext({
427
+ providerCallId: input.providerCallId,
428
+ callId: input.callId,
429
+ operation: "startListening",
430
+ });
383
431
 
384
432
  this.pendingListenByCallId.set(input.callId, {
385
433
  language: input.language,
386
434
  });
387
435
 
388
- await this.apiRequest({
389
- method: "POST",
390
- endpoint: `/Call/${callUuid}/`,
391
- body: {
392
- legs: "aleg",
393
- aleg_url: transferUrl.toString(),
394
- aleg_method: "POST",
395
- },
436
+ await this.transferCallLeg({
437
+ callUuid,
438
+ webhookBase,
439
+ callId: input.callId,
440
+ flow: "xml-listen",
396
441
  });
397
442
  }
398
443
 
@@ -28,5 +28,6 @@ export function verifyTwilioProviderWebhook(params: {
28
28
  return {
29
29
  ok: result.ok,
30
30
  reason: result.reason,
31
+ isReplay: result.isReplay,
31
32
  };
32
33
  }
@@ -59,4 +59,38 @@ describe("TwilioProvider", () => {
59
59
  expect(result.providerResponseBody).toContain('<Parameter name="token" value="');
60
60
  expect(result.providerResponseBody).toContain("<Connect>");
61
61
  });
62
+
63
+ it("uses a stable dedupeKey for identical request payloads", () => {
64
+ const provider = createProvider();
65
+ const rawBody = "CallSid=CA789&Direction=inbound&SpeechResult=hello";
66
+ const ctxA = {
67
+ ...createContext(rawBody, { callId: "call-1", turnToken: "turn-1" }),
68
+ headers: { "i-twilio-idempotency-token": "idem-123" },
69
+ };
70
+ const ctxB = {
71
+ ...createContext(rawBody, { callId: "call-1", turnToken: "turn-1" }),
72
+ headers: { "i-twilio-idempotency-token": "idem-123" },
73
+ };
74
+
75
+ const eventA = provider.parseWebhookEvent(ctxA).events[0];
76
+ const eventB = provider.parseWebhookEvent(ctxB).events[0];
77
+
78
+ expect(eventA).toBeDefined();
79
+ expect(eventB).toBeDefined();
80
+ expect(eventA?.id).not.toBe(eventB?.id);
81
+ expect(eventA?.dedupeKey).toBe("twilio:idempotency:idem-123");
82
+ expect(eventA?.dedupeKey).toBe(eventB?.dedupeKey);
83
+ });
84
+
85
+ it("keeps turnToken from query on speech events", () => {
86
+ const provider = createProvider();
87
+ const ctx = createContext("CallSid=CA222&Direction=inbound&SpeechResult=hello", {
88
+ callId: "call-2",
89
+ turnToken: "turn-xyz",
90
+ });
91
+
92
+ const event = provider.parseWebhookEvent(ctx).events[0];
93
+ expect(event?.type).toBe("call.speech");
94
+ expect(event?.turnToken).toBe("turn-xyz");
95
+ });
62
96
  });
@@ -20,6 +20,33 @@ import type { VoiceCallProvider } from "./base.js";
20
20
  import { twilioApiRequest } from "./twilio/api.js";
21
21
  import { verifyTwilioProviderWebhook } from "./twilio/webhook.js";
22
22
 
23
+ function getHeader(
24
+ headers: Record<string, string | string[] | undefined>,
25
+ name: string,
26
+ ): string | undefined {
27
+ const value = headers[name.toLowerCase()];
28
+ if (Array.isArray(value)) {
29
+ return value[0];
30
+ }
31
+ return value;
32
+ }
33
+
34
+ function createTwilioRequestDedupeKey(ctx: WebhookContext): string {
35
+ const idempotencyToken = getHeader(ctx.headers, "i-twilio-idempotency-token");
36
+ if (idempotencyToken) {
37
+ return `twilio:idempotency:${idempotencyToken}`;
38
+ }
39
+
40
+ const signature = getHeader(ctx.headers, "x-twilio-signature") ?? "";
41
+ const callId = typeof ctx.query?.callId === "string" ? ctx.query.callId.trim() : "";
42
+ const flow = typeof ctx.query?.flow === "string" ? ctx.query.flow.trim() : "";
43
+ const turnToken = typeof ctx.query?.turnToken === "string" ? ctx.query.turnToken.trim() : "";
44
+ return `twilio:fallback:${crypto
45
+ .createHash("sha256")
46
+ .update(`${signature}\n${callId}\n${flow}\n${turnToken}\n${ctx.rawBody}`)
47
+ .digest("hex")}`;
48
+ }
49
+
23
50
  /**
24
51
  * Twilio Voice API provider implementation.
25
52
  *
@@ -212,7 +239,16 @@ export class TwilioProvider implements VoiceCallProvider {
212
239
  typeof ctx.query?.callId === "string" && ctx.query.callId.trim()
213
240
  ? ctx.query.callId.trim()
214
241
  : undefined;
215
- const event = this.normalizeEvent(params, callIdFromQuery);
242
+ const turnTokenFromQuery =
243
+ typeof ctx.query?.turnToken === "string" && ctx.query.turnToken.trim()
244
+ ? ctx.query.turnToken.trim()
245
+ : undefined;
246
+ const dedupeKey = createTwilioRequestDedupeKey(ctx);
247
+ const event = this.normalizeEvent(params, {
248
+ callIdOverride: callIdFromQuery,
249
+ dedupeKey,
250
+ turnToken: turnTokenFromQuery,
251
+ });
216
252
 
217
253
  // For Twilio, we must return TwiML. Most actions are driven by Calls API updates,
218
254
  // so the webhook response is typically a pause to keep the call alive.
@@ -245,14 +281,24 @@ export class TwilioProvider implements VoiceCallProvider {
245
281
  /**
246
282
  * Convert Twilio webhook params to normalized event format.
247
283
  */
248
- private normalizeEvent(params: URLSearchParams, callIdOverride?: string): NormalizedEvent | null {
284
+ private normalizeEvent(
285
+ params: URLSearchParams,
286
+ options?: {
287
+ callIdOverride?: string;
288
+ dedupeKey?: string;
289
+ turnToken?: string;
290
+ },
291
+ ): NormalizedEvent | null {
249
292
  const callSid = params.get("CallSid") || "";
293
+ const callIdOverride = options?.callIdOverride;
250
294
 
251
295
  const baseEvent = {
252
296
  id: crypto.randomUUID(),
297
+ dedupeKey: options?.dedupeKey,
253
298
  callId: callIdOverride || callSid,
254
299
  providerCallId: callSid,
255
300
  timestamp: Date.now(),
301
+ turnToken: options?.turnToken,
256
302
  direction: TwilioProvider.parseDirection(params.get("Direction")),
257
303
  from: params.get("From") || undefined,
258
304
  to: params.get("To") || undefined,
@@ -603,9 +649,14 @@ export class TwilioProvider implements VoiceCallProvider {
603
649
  throw new Error("Missing webhook URL for this call (provider state not initialized)");
604
650
  }
605
651
 
652
+ const actionUrl = new URL(webhookUrl);
653
+ if (input.turnToken) {
654
+ actionUrl.searchParams.set("turnToken", input.turnToken);
655
+ }
656
+
606
657
  const twiml = `<?xml version="1.0" encoding="UTF-8"?>
607
658
  <Response>
608
- <Gather input="speech" speechTimeout="auto" language="${input.language || "en-US"}" action="${escapeXml(webhookUrl)}" method="POST">
659
+ <Gather input="speech" speechTimeout="auto" language="${input.language || "en-US"}" action="${escapeXml(actionUrl.toString())}" method="POST">
609
660
  </Gather>
610
661
  </Response>`;
611
662
 
package/src/types.ts CHANGED
@@ -74,9 +74,13 @@ export type EndReason = z.infer<typeof EndReasonSchema>;
74
74
 
75
75
  const BaseEventSchema = z.object({
76
76
  id: z.string(),
77
+ // Stable provider-derived key for idempotency/replay dedupe.
78
+ dedupeKey: z.string().optional(),
77
79
  callId: z.string(),
78
80
  providerCallId: z.string().optional(),
79
81
  timestamp: z.number(),
82
+ // Optional per-turn nonce for speech events (Twilio <Gather> replay hardening).
83
+ turnToken: z.string().optional(),
80
84
  // Optional fields for inbound call detection
81
85
  direction: z.enum(["inbound", "outbound"]).optional(),
82
86
  from: z.string().optional(),
@@ -171,6 +175,8 @@ export type CallRecord = z.infer<typeof CallRecordSchema>;
171
175
  export type WebhookVerificationResult = {
172
176
  ok: boolean;
173
177
  reason?: string;
178
+ /** Signature is valid, but request was seen before within replay window. */
179
+ isReplay?: boolean;
174
180
  };
175
181
 
176
182
  export type WebhookContext = {
@@ -226,6 +232,8 @@ export type StartListeningInput = {
226
232
  callId: CallId;
227
233
  providerCallId: ProviderCallId;
228
234
  language?: string;
235
+ /** Optional per-turn nonce for provider callbacks (replay hardening). */
236
+ turnToken?: string;
229
237
  };
230
238
 
231
239
  export type StopListeningInput = {
@@ -163,6 +163,40 @@ describe("verifyPlivoWebhook", () => {
163
163
  expect(result.ok).toBe(false);
164
164
  expect(result.reason).toMatch(/Missing Plivo signature headers/);
165
165
  });
166
+
167
+ it("marks replayed valid V3 requests as replay without failing auth", () => {
168
+ const authToken = "test-auth-token";
169
+ const nonce = "nonce-replay-v3";
170
+ const urlWithQuery = "https://example.com/voice/webhook?flow=answer&callId=abc";
171
+ const postBody = "CallUUID=uuid&CallStatus=in-progress&From=%2B15550000000";
172
+ const signature = plivoV3Signature({
173
+ authToken,
174
+ urlWithQuery,
175
+ postBody,
176
+ nonce,
177
+ });
178
+
179
+ const ctx = {
180
+ headers: {
181
+ host: "example.com",
182
+ "x-forwarded-proto": "https",
183
+ "x-plivo-signature-v3": signature,
184
+ "x-plivo-signature-v3-nonce": nonce,
185
+ },
186
+ rawBody: postBody,
187
+ url: urlWithQuery,
188
+ method: "POST" as const,
189
+ query: { flow: "answer", callId: "abc" },
190
+ };
191
+
192
+ const first = verifyPlivoWebhook(ctx, authToken);
193
+ const second = verifyPlivoWebhook(ctx, authToken);
194
+
195
+ expect(first.ok).toBe(true);
196
+ expect(first.isReplay).toBeFalsy();
197
+ expect(second.ok).toBe(true);
198
+ expect(second.isReplay).toBe(true);
199
+ });
166
200
  });
167
201
 
168
202
  describe("verifyTwilioWebhook", () => {
@@ -197,6 +231,48 @@ describe("verifyTwilioWebhook", () => {
197
231
  expect(result.ok).toBe(true);
198
232
  });
199
233
 
234
it("marks replayed valid requests as replay without failing auth", () => {
  // Fixture: one correctly signed Twilio callback for the public URL.
  const authToken = "test-auth-token";
  const publicUrl = "https://example.com/voice/webhook";
  const postBody = "CallSid=CS777&CallStatus=completed&From=%2B15550000000";
  const signature = twilioSignature({
    authToken,
    url: `${publicUrl}?callId=abc`,
    postBody,
  });
  const headers = {
    host: "example.com",
    "x-forwarded-proto": "https",
    "x-twilio-signature": signature,
    "i-twilio-idempotency-token": "idem-replay-1",
  };

  // Each call builds a fresh but identical context, like two deliveries of
  // the same HTTP request.
  const deliver = () =>
    verifyTwilioWebhook(
      {
        headers,
        rawBody: postBody,
        url: "http://local/voice/webhook?callId=abc",
        method: "POST",
        query: { callId: "abc" },
      },
      authToken,
      { publicUrl },
    );

  const initialDelivery = deliver();
  const repeatedDelivery = deliver();

  // The first delivery is a normal success; the second still authenticates
  // but is flagged as a replay.
  expect(initialDelivery.ok).toBe(true);
  expect(initialDelivery.isReplay).toBeFalsy();
  expect(repeatedDelivery.ok).toBe(true);
  expect(repeatedDelivery.isReplay).toBe(true);
});
275
+
200
276
  it("rejects invalid signatures even when attacker injects forwarded host", () => {
201
277
  const authToken = "test-auth-token";
202
278
  const postBody = "CallSid=CS123&CallStatus=completed&From=%2B15550000000";
@@ -1,6 +1,63 @@
1
1
  import crypto from "node:crypto";
2
2
  import type { WebhookContext } from "./types.js";
3
3
 
4
/** How long (ms) a verified webhook is remembered for replay detection: 10 minutes. */
const REPLAY_WINDOW_MS = 10 * 60 * 1000;
/** Upper bound on the number of replay keys kept per cache. */
const REPLAY_CACHE_MAX_ENTRIES = 10_000;
/** Expired entries are swept once every this many `markReplay` calls. */
const REPLAY_CACHE_PRUNE_INTERVAL = 64;

/** In-memory record of recently seen webhook replay keys. */
type ReplayCache = {
  /** Replay key -> epoch-ms timestamp until which the key counts as "seen". */
  seenUntil: Map<string, number>;
  /** Number of `markReplay` calls so far; drives the periodic sweep. */
  calls: number;
};

/** Replay keys for verified Twilio webhooks. */
const twilioReplayCache: ReplayCache = {
  seenUntil: new Map<string, number>(),
  calls: 0,
};

/** Replay keys for verified Plivo webhooks. */
const plivoReplayCache: ReplayCache = {
  seenUntil: new Map<string, number>(),
  calls: 0,
};

/** Return the lowercase hex SHA-256 digest of `input`. */
function sha256Hex(input: string): string {
  return crypto.createHash("sha256").update(input).digest("hex");
}

/**
 * Drop expired entries, then evict the oldest-inserted entries until the cache
 * is back within `REPLAY_CACHE_MAX_ENTRIES`.
 *
 * @param cache - Cache to prune in place.
 * @param now - Current epoch-ms time; entries with `expiresAt <= now` are removed.
 */
function pruneReplayCache(cache: ReplayCache, now: number): void {
  // Deleting the current entry while iterating a Map is well-defined.
  for (const [key, expiresAt] of cache.seenUntil) {
    if (expiresAt <= now) {
      cache.seenUntil.delete(key);
    }
  }
  // Map iteration follows insertion order, so the first key is the oldest.
  // NOTE: this evicts entries that have not expired yet; under a flood of
  // distinct valid requests the effective replay window becomes shorter.
  while (cache.seenUntil.size > REPLAY_CACHE_MAX_ENTRIES) {
    const oldest = cache.seenUntil.keys().next();
    // Test `done` rather than truthiness so an empty-string key is still evictable.
    if (oldest.done) {
      break;
    }
    cache.seenUntil.delete(oldest.value);
  }
}

/**
 * Record `replayKey` as seen and report whether it was already seen within the
 * replay window.
 *
 * @param cache - Provider-specific cache to consult and update.
 * @param replayKey - Stable key identifying one verified webhook request.
 * @returns `true` if the key is still within its replay window (the expiry is
 *   not extended); `false` if it is new or expired, in which case it is
 *   (re)recorded for `REPLAY_WINDOW_MS`.
 */
function markReplay(cache: ReplayCache, replayKey: string): boolean {
  const now = Date.now();
  cache.calls += 1;
  if (cache.calls % REPLAY_CACHE_PRUNE_INTERVAL === 0) {
    pruneReplayCache(cache, now);
  }

  const existing = cache.seenUntil.get(replayKey);
  if (existing !== undefined && existing > now) {
    return true;
  }

  // `Map.set` on an existing key keeps its original insertion position. Delete
  // first so a refreshed (previously expired) key becomes the newest entry and
  // is not the first victim of size-based eviction.
  cache.seenUntil.delete(replayKey);
  cache.seenUntil.set(replayKey, now + REPLAY_WINDOW_MS);
  if (cache.seenUntil.size > REPLAY_CACHE_MAX_ENTRIES) {
    pruneReplayCache(cache, now);
  }
  return false;
}
60
+
4
61
  /**
5
62
  * Validate Twilio webhook signature using HMAC-SHA1.
6
63
  *
@@ -328,6 +385,8 @@ export interface TwilioVerificationResult {
328
385
  verificationUrl?: string;
329
386
  /** Whether we're running behind ngrok free tier */
330
387
  isNgrokFreeTier?: boolean;
388
+ /** Request is cryptographically valid but was already processed recently. */
389
+ isReplay?: boolean;
331
390
  }
332
391
 
333
392
  export interface TelnyxVerificationResult {
@@ -335,6 +394,20 @@ export interface TelnyxVerificationResult {
335
394
  reason?: string;
336
395
  }
337
396
 
397
/**
 * Build the replay-cache key for a Twilio webhook whose signature has already
 * been validated.
 *
 * The key is derived only from the verification URL, the signature and the raw
 * body. Twilio signs the URL and POST parameters, so the signature ties the key
 * to that signed input. The `I-Twilio-Idempotency-Token` header is deliberately
 * NOT used: it is not covered by the signature, so someone replaying a captured
 * request could change or drop it to obtain a fresh key and bypass replay
 * detection.
 *
 * Consequence: two deliveries with identical URL and body inside the replay
 * window are treated as the same request, whatever their headers say.
 *
 * @param params.ctx - Webhook context; only `rawBody` is read.
 * @param params.signature - The validated `X-Twilio-Signature` value.
 * @param params.verificationUrl - The URL the signature was validated against.
 * @returns A stable key of the form `twilio:signed:<sha256 hex>`.
 */
function createTwilioReplayKey(params: {
  ctx: WebhookContext;
  signature: string;
  verificationUrl: string;
}): string {
  const { ctx, signature, verificationUrl } = params;
  // Newline-separated so field boundaries are unambiguous in the hashed input.
  const signedMaterial = `${verificationUrl}\n${signature}\n${ctx.rawBody}`;
  return `twilio:signed:${sha256Hex(signedMaterial)}`;
}
410
+
338
411
  function decodeBase64OrBase64Url(input: string): Buffer {
339
412
  // Telnyx docs say Base64; some tooling emits Base64URL. Accept both.
340
413
  const normalized = input.replace(/-/g, "+").replace(/_/g, "/");
@@ -505,7 +578,9 @@ export function verifyTwilioWebhook(
505
578
  const isValid = validateTwilioSignature(authToken, signature, verificationUrl, params);
506
579
 
507
580
  if (isValid) {
508
- return { ok: true, verificationUrl };
581
+ const replayKey = createTwilioReplayKey({ ctx, signature, verificationUrl });
582
+ const isReplay = markReplay(twilioReplayCache, replayKey);
583
+ return { ok: true, verificationUrl, isReplay };
509
584
  }
510
585
 
511
586
  // Check if this is ngrok free tier - the URL might have different format
@@ -533,6 +608,8 @@ export interface PlivoVerificationResult {
533
608
  verificationUrl?: string;
534
609
  /** Signature version used for verification */
535
610
  version?: "v3" | "v2";
611
+ /** Request is cryptographically valid but was already processed recently. */
612
+ isReplay?: boolean;
536
613
  }
537
614
 
538
615
  function normalizeSignatureBase64(input: string): string {
@@ -753,14 +830,17 @@ export function verifyPlivoWebhook(
753
830
  url: verificationUrl,
754
831
  postParams,
755
832
  });
756
- return ok
757
- ? { ok: true, version: "v3", verificationUrl }
758
- : {
759
- ok: false,
760
- version: "v3",
761
- verificationUrl,
762
- reason: "Invalid Plivo V3 signature",
763
- };
833
+ if (!ok) {
834
+ return {
835
+ ok: false,
836
+ version: "v3",
837
+ verificationUrl,
838
+ reason: "Invalid Plivo V3 signature",
839
+ };
840
+ }
841
+ const replayKey = `plivo:v3:${sha256Hex(`${verificationUrl}\n${nonceV3}`)}`;
842
+ const isReplay = markReplay(plivoReplayCache, replayKey);
843
+ return { ok: true, version: "v3", verificationUrl, isReplay };
764
844
  }
765
845
 
766
846
  if (signatureV2 && nonceV2) {
@@ -770,14 +850,17 @@ export function verifyPlivoWebhook(
770
850
  nonce: nonceV2,
771
851
  url: verificationUrl,
772
852
  });
773
- return ok
774
- ? { ok: true, version: "v2", verificationUrl }
775
- : {
776
- ok: false,
777
- version: "v2",
778
- verificationUrl,
779
- reason: "Invalid Plivo V2 signature",
780
- };
853
+ if (!ok) {
854
+ return {
855
+ ok: false,
856
+ version: "v2",
857
+ verificationUrl,
858
+ reason: "Invalid Plivo V2 signature",
859
+ };
860
+ }
861
+ const replayKey = `plivo:v2:${sha256Hex(`${verificationUrl}\n${nonceV2}`)}`;
862
+ const isReplay = markReplay(plivoReplayCache, replayKey);
863
+ return { ok: true, version: "v2", verificationUrl, isReplay };
781
864
  }
782
865
 
783
866
  return {
@@ -45,12 +45,14 @@ const createCall = (startedAt: number): CallRecord => ({
45
45
 
46
46
  const createManager = (calls: CallRecord[]) => {
47
47
  const endCall = vi.fn(async () => ({ success: true }));
48
+ const processEvent = vi.fn();
48
49
  const manager = {
49
50
  getActiveCalls: () => calls,
50
51
  endCall,
52
+ processEvent,
51
53
  } as unknown as CallManager;
52
54
 
53
- return { manager, endCall };
55
+ return { manager, endCall, processEvent };
54
56
  };
55
57
 
56
58
  describe("VoiceCallWebhookServer stale call reaper", () => {
@@ -116,3 +118,51 @@ describe("VoiceCallWebhookServer stale call reaper", () => {
116
118
  }
117
119
  });
118
120
  });
121
+
122
describe("VoiceCallWebhookServer replay handling", () => {
  it("acknowledges replayed webhook requests and skips event side effects", async () => {
    // Provider stub: every request verifies successfully but is flagged as a
    // replay, and parsing yields a single final speech event.
    const replayFlaggingProvider: VoiceCallProvider = {
      ...provider,
      verifyWebhook: () => ({ ok: true, isReplay: true }),
      parseWebhookEvent: () => ({
        events: [
          {
            id: "evt-replay",
            dedupeKey: "stable-replay",
            type: "call.speech",
            callId: "call-1",
            providerCallId: "provider-call-1",
            timestamp: Date.now(),
            transcript: "hello",
            isFinal: true,
          },
        ],
        statusCode: 200,
      }),
    };
    const { manager, processEvent } = createManager([]);
    const server = new VoiceCallWebhookServer(
      createConfig({ serve: { port: 0, bind: "127.0.0.1", path: "/voice/webhook" } }),
      manager,
      replayFlaggingProvider,
    );

    try {
      const target = new URL(await server.start());
      // The server is configured with port 0, so read the bound port from the
      // underlying listener when it is available.
      const bound = (
        server as unknown as { server?: { address?: () => unknown } }
      ).server?.address?.();
      if (bound && typeof bound === "object" && "port" in bound && bound.port) {
        target.port = String(bound.port);
      }

      const response = await fetch(target.toString(), {
        method: "POST",
        headers: { "content-type": "application/x-www-form-urlencoded" },
        body: "CallSid=CA123&SpeechResult=hello",
      });

      // The webhook is acknowledged, but the manager never sees the event.
      expect(response.status).toBe(200);
      expect(processEvent).not.toHaveBeenCalled();
    } finally {
      await server.stop();
    }
  });
});
package/src/webhook.ts CHANGED
@@ -346,11 +346,15 @@ export class VoiceCallWebhookServer {
346
346
  const result = this.provider.parseWebhookEvent(ctx);
347
347
 
348
348
  // Process each event
349
- for (const event of result.events) {
350
- try {
351
- this.manager.processEvent(event);
352
- } catch (err) {
353
- console.error(`[voice-call] Error processing event ${event.type}:`, err);
349
+ if (verification.isReplay) {
350
+ console.warn("[voice-call] Replay detected; skipping event side effects");
351
+ } else {
352
+ for (const event of result.events) {
353
+ try {
354
+ this.manager.processEvent(event);
355
+ } catch (err) {
356
+ console.error(`[voice-call] Error processing event ${event.type}:`, err);
357
+ }
354
358
  }
355
359
  }
356
360