@clawdbot/voice-call 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,537 @@
1
+ import crypto from "node:crypto";
2
+
3
+ import type { TwilioConfig } from "../config.js";
4
+ import type { MediaStreamHandler } from "../media-stream.js";
5
+ import type {
6
+ HangupCallInput,
7
+ InitiateCallInput,
8
+ InitiateCallResult,
9
+ NormalizedEvent,
10
+ PlayTtsInput,
11
+ ProviderWebhookParseResult,
12
+ StartListeningInput,
13
+ StopListeningInput,
14
+ WebhookContext,
15
+ WebhookVerificationResult,
16
+ } from "../types.js";
17
+ import { escapeXml, mapVoiceToPolly } from "../voice-mapping.js";
18
+ import { verifyTwilioWebhook } from "../webhook-security.js";
19
+ import type { VoiceCallProvider } from "./base.js";
20
+ import type { OpenAITTSProvider } from "./tts-openai.js";
21
+ import { chunkAudio } from "./tts-openai.js";
22
+
23
+ /**
24
+ * Twilio Voice API provider implementation.
25
+ *
26
+ * Uses Twilio Programmable Voice API with Media Streams for real-time
27
+ * bidirectional audio streaming.
28
+ *
29
+ * @see https://www.twilio.com/docs/voice
30
+ * @see https://www.twilio.com/docs/voice/media-streams
31
+ */
32
/**
 * Optional switches accepted by the Twilio provider constructor.
 * Every field may be omitted.
 */
export interface TwilioProviderOptions {
  /** Public URL to use for webhook signature verification instead of the request URL. */
  publicUrl?: string;
  /** Path of the media stream WebSocket endpoint, e.g. `/voice/stream`. */
  streamPath?: string;
  /** Enable the ngrok free tier compatibility mode (less secure). */
  allowNgrokFreeTier?: boolean;
  /** Skip webhook signature verification altogether. Development only. */
  skipVerification?: boolean;
}
42
+
43
/**
 * Twilio Programmable Voice provider.
 *
 * Places and controls calls through the Twilio REST API, converts Twilio
 * webhook callbacks into normalized call events, and answers webhooks with
 * TwiML that connects the call to a media stream whenever a stream URL can be
 * derived from the configured public URL and stream path.
 *
 * @see https://www.twilio.com/docs/voice
 * @see https://www.twilio.com/docs/voice/media-streams
 */
export class TwilioProvider implements VoiceCallProvider {
  readonly name = "twilio" as const;

  /** Confidence reported when Twilio sends no usable `Confidence` value. */
  private static readonly DEFAULT_SPEECH_CONFIDENCE = 0.9;

  private static readonly EMPTY_TWIML =
    '<?xml version="1.0" encoding="UTF-8"?><Response></Response>';

  private static readonly PAUSE_TWIML = `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Pause length="30"/>
</Response>`;

  private readonly accountSid: string;
  private readonly authToken: string;
  private readonly baseUrl: string;
  /** Webhook URL (including the `callId` query) for each call SID created by `initiateCall`. */
  private readonly callWebhookUrls = new Map<string, string>();
  private readonly options: TwilioProviderOptions;

  /** Current public webhook URL (set when tunnel starts or from config) */
  private currentPublicUrl: string | null = null;

  /** Optional OpenAI TTS provider for streaming TTS */
  private ttsProvider: OpenAITTSProvider | null = null;

  /** Optional media stream handler for sending audio */
  private mediaStreamHandler: MediaStreamHandler | null = null;

  /** Map of call SID to stream SID for media streams */
  private callStreamMap = new Map<string, string>();

  /**
   * @param config - Twilio credentials; `accountSid` and `authToken` are required.
   * @param options - Optional behavior switches; `publicUrl` seeds the public URL.
   * @throws Error when the account SID or the auth token is missing.
   */
  constructor(config: TwilioConfig, options: TwilioProviderOptions = {}) {
    if (!config.accountSid) {
      throw new Error("Twilio Account SID is required");
    }
    if (!config.authToken) {
      throw new Error("Twilio Auth Token is required");
    }

    this.accountSid = config.accountSid;
    this.authToken = config.authToken;
    this.baseUrl = `https://api.twilio.com/2010-04-01/Accounts/${this.accountSid}`;
    this.options = options;

    if (options.publicUrl) {
      this.currentPublicUrl = options.publicUrl;
    }
  }

  /**
   * Set the current public webhook URL (called when tunnel starts).
   */
  setPublicUrl(url: string): void {
    this.currentPublicUrl = url;
  }

  /**
   * Get the current public webhook URL, or `null` when none has been set.
   */
  getPublicUrl(): string | null {
    return this.currentPublicUrl;
  }

  /**
   * Set the OpenAI TTS provider for streaming TTS.
   * When set (together with a media stream handler and a registered stream),
   * `playTts` sends OpenAI audio over the media stream.
   */
  setTTSProvider(provider: OpenAITTSProvider): void {
    this.ttsProvider = provider;
  }

  /**
   * Set the media stream handler used for sending audio.
   */
  setMediaStreamHandler(handler: MediaStreamHandler): void {
    this.mediaStreamHandler = handler;
  }

  /**
   * Register a call's stream SID for audio routing.
   */
  registerCallStream(callSid: string, streamSid: string): void {
    this.callStreamMap.set(callSid, streamSid);
  }

  /**
   * Unregister a call's stream SID.
   */
  unregisterCallStream(callSid: string): void {
    this.callStreamMap.delete(callSid);
  }

  /**
   * Forget everything tracked for a call (webhook URL and stream SID).
   * Safe to call more than once for the same call.
   */
  private releaseCallState(providerCallId: string): void {
    this.callWebhookUrls.delete(providerCallId);
    this.callStreamMap.delete(providerCallId);
  }

  /**
   * REST path of a single call resource. The SID is percent-encoded so an
   * unexpected value cannot change the request path.
   */
  private static callResourcePath(providerCallId: string): string {
    return `/Calls/${encodeURIComponent(providerCallId)}.json`;
  }

  /**
   * Make an authenticated form-encoded POST request to the Twilio API.
   *
   * @param endpoint - Path appended to the account base URL.
   * @param params - Form fields sent as the request body.
   * @param options - `allowNotFound` turns a 404 into an `undefined` result.
   * @returns The parsed JSON body, or `undefined` for an empty body or an allowed 404.
   * @throws Error carrying the status code and response text for any other non-2xx response.
   */
  private async apiRequest<T = unknown>(
    endpoint: string,
    params: Record<string, string>,
    options?: { allowNotFound?: boolean },
  ): Promise<T> {
    const response = await fetch(`${this.baseUrl}${endpoint}`, {
      method: "POST",
      headers: {
        Authorization: `Basic ${Buffer.from(`${this.accountSid}:${this.authToken}`).toString("base64")}`,
        "Content-Type": "application/x-www-form-urlencoded",
      },
      body: new URLSearchParams(params),
    });

    if (!response.ok) {
      if (options?.allowNotFound && response.status === 404) {
        return undefined as T;
      }
      const errorText = await response.text();
      throw new Error(`Twilio API error: ${response.status} ${errorText}`);
    }

    const text = await response.text();
    return text ? (JSON.parse(text) as T) : (undefined as T);
  }

  /**
   * Verify the signature of an incoming Twilio webhook.
   *
   * Delegates to `verifyTwilioWebhook`, passing the auth token, the current
   * public URL and the verification options. Failures are logged and reported
   * as `{ ok: false, reason }`.
   *
   * @see https://www.twilio.com/docs/usage/webhooks/webhooks-security
   */
  verifyWebhook(ctx: WebhookContext): WebhookVerificationResult {
    const result = verifyTwilioWebhook(ctx, this.authToken, {
      publicUrl: this.currentPublicUrl || undefined,
      // NOTE(review): the ngrok compatibility mode is documented as less secure,
      // yet it is on unless explicitly disabled. Kept as-is for compatibility;
      // consider making it opt-in.
      allowNgrokFreeTier: this.options.allowNgrokFreeTier ?? true,
      skipVerification: this.options.skipVerification,
    });

    if (!result.ok) {
      console.warn(`[twilio] Webhook verification failed: ${result.reason}`);
      if (result.verificationUrl) {
        console.warn(`[twilio] Verification URL: ${result.verificationUrl}`);
      }
    }

    return {
      ok: result.ok,
      reason: result.reason,
    };
  }

  /**
   * Parse a Twilio webhook into normalized events plus the TwiML to reply with.
   *
   * A `callId` query parameter, when present and non-blank, overrides the call
   * ID of the produced event. When the webhook reports a terminal call status,
   * the state tracked for that call is released so it cannot accumulate.
   *
   * @returns Status 200 with TwiML, or status 400 with no events when parsing fails.
   */
  parseWebhookEvent(ctx: WebhookContext): ProviderWebhookParseResult {
    try {
      const params = new URLSearchParams(ctx.rawBody);
      const rawCallId = ctx.query?.callId;
      const callIdFromQuery =
        typeof rawCallId === "string" && rawCallId.trim()
          ? rawCallId.trim()
          : undefined;
      const event = this.normalizeEvent(params, callIdFromQuery);

      // For Twilio, we must return TwiML. Most actions are driven by Calls API updates,
      // so the webhook response is typically a pause to keep the call alive.
      const twiml = this.generateTwimlResponse(ctx);

      // The call is over: nothing can be played or gathered on it any more.
      if (event?.type === "call.ended" && event.providerCallId) {
        this.releaseCallState(event.providerCallId);
      }

      return {
        events: event ? [event] : [],
        providerResponseBody: twiml,
        providerResponseHeaders: { "Content-Type": "application/xml" },
        statusCode: 200,
      };
    } catch (err) {
      console.warn(
        `[twilio] Failed to parse webhook: ${err instanceof Error ? err.message : String(err)}`,
      );
      return { events: [], statusCode: 400 };
    }
  }

  /**
   * Parse Twilio direction to normalized format.
   * Unknown or missing directions map to `undefined`.
   */
  private static parseDirection(
    direction: string | null,
  ): "inbound" | "outbound" | undefined {
    if (direction === "inbound") return "inbound";
    if (direction === "outbound-api" || direction === "outbound-dial")
      return "outbound";
    return undefined;
  }

  /**
   * Convert Twilio webhook params to normalized event format.
   *
   * Precedence: speech result, then DTMF digits, then call status.
   *
   * @returns The event, or `null` when the params describe nothing recognized.
   */
  private normalizeEvent(
    params: URLSearchParams,
    callIdOverride?: string,
  ): NormalizedEvent | null {
    const callSid = params.get("CallSid") || "";

    const baseEvent = {
      id: crypto.randomUUID(),
      callId: callIdOverride || callSid,
      providerCallId: callSid,
      timestamp: Date.now(),
      direction: TwilioProvider.parseDirection(params.get("Direction")),
      from: params.get("From") || undefined,
      to: params.get("To") || undefined,
    };

    // Handle speech result (from <Gather>)
    const speechResult = params.get("SpeechResult");
    if (speechResult) {
      // A missing or non-numeric Confidence falls back to the default rather than NaN.
      const parsedConfidence = Number.parseFloat(params.get("Confidence") ?? "");
      return {
        ...baseEvent,
        type: "call.speech",
        transcript: speechResult,
        isFinal: true,
        confidence: Number.isFinite(parsedConfidence)
          ? parsedConfidence
          : TwilioProvider.DEFAULT_SPEECH_CONFIDENCE,
      };
    }

    // Handle DTMF
    const digits = params.get("Digits");
    if (digits) {
      return { ...baseEvent, type: "call.dtmf", digits };
    }

    // Handle call status changes
    const callStatus = params.get("CallStatus");
    switch (callStatus) {
      case "initiated":
        return { ...baseEvent, type: "call.initiated" };
      case "ringing":
        return { ...baseEvent, type: "call.ringing" };
      case "in-progress":
        return { ...baseEvent, type: "call.answered" };
      case "completed":
      case "busy":
      case "no-answer":
      case "failed":
        return { ...baseEvent, type: "call.ended", reason: callStatus };
      case "canceled":
        return { ...baseEvent, type: "call.ended", reason: "hangup-bot" };
      default:
        return null;
    }
  }

  /**
   * Generate the TwiML response for a webhook.
   *
   * Inbound calls are connected to the media stream right away; other calls
   * only once their status is `in-progress`. When a connection is wanted but no
   * stream URL is available, a 30 second pause is returned instead.
   */
  private generateTwimlResponse(ctx?: WebhookContext): string {
    if (!ctx) return TwilioProvider.EMPTY_TWIML;

    const params = new URLSearchParams(ctx.rawBody);
    const callStatus = params.get("CallStatus");
    const direction = params.get("Direction");

    console.log(
      `[voice-call] generateTwimlResponse: status=${callStatus} direction=${direction}`,
    );

    const shouldConnect =
      direction === "inbound" || callStatus === "in-progress";
    if (!shouldConnect) {
      return TwilioProvider.EMPTY_TWIML;
    }

    const streamUrl = this.getStreamUrl();
    return streamUrl
      ? this.getStreamConnectXml(streamUrl)
      : TwilioProvider.PAUSE_TWIML;
  }

  /**
   * Get the WebSocket URL for media streaming.
   * Derives from the public URL origin + stream path.
   *
   * @returns The URL, or `null` when the public URL or stream path is missing
   *   or the public URL cannot be parsed.
   */
  private getStreamUrl(): string | null {
    if (!this.currentPublicUrl || !this.options.streamPath) {
      return null;
    }

    // Extract just the origin (host) from the public URL, ignoring any path.
    // A malformed URL must not turn every webhook into a 400, so it degrades
    // to "no stream available".
    let origin: string;
    try {
      origin = new URL(this.currentPublicUrl).origin;
    } catch {
      console.warn(
        `[twilio] Invalid public URL, cannot derive media stream URL: ${this.currentPublicUrl}`,
      );
      return null;
    }

    // Convert https:// to wss:// (and http:// to ws://) for WebSocket
    const wsOrigin = origin
      .replace(/^https:\/\//, "wss://")
      .replace(/^http:\/\//, "ws://");

    // Append the stream path
    const path = this.options.streamPath.startsWith("/")
      ? this.options.streamPath
      : `/${this.options.streamPath}`;

    return `${wsOrigin}${path}`;
  }

  /**
   * Generate TwiML to connect a call to a WebSocket media stream.
   *
   * @param streamUrl - WebSocket URL (wss://...) for the media stream
   */
  getStreamConnectXml(streamUrl: string): string {
    return `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Connect>
    <Stream url="${escapeXml(streamUrl)}" />
  </Connect>
</Response>`;
  }

  /**
   * Initiate an outbound call via Twilio API.
   * If inlineTwiml is provided, uses that directly (for notify mode).
   * Otherwise, uses webhook URL for dynamic TwiML.
   *
   * @throws Error when the API call fails or the response carries no call SID.
   */
  async initiateCall(input: InitiateCallInput): Promise<InitiateCallResult> {
    const callbackUrl = new URL(input.webhookUrl);
    callbackUrl.searchParams.set("callId", input.callId);
    const callbackHref = callbackUrl.toString();

    // Build request params
    const params: Record<string, string> = {
      To: input.to,
      From: input.from,
      StatusCallback: callbackHref,
      StatusCallbackEvent: "initiated ringing answered completed",
      Timeout: "30",
    };

    // Use inline TwiML for notify mode (simpler, no webhook needed)
    if (input.inlineTwiml) {
      params.Twiml = input.inlineTwiml;
    } else {
      params.Url = callbackHref;
    }

    const result = await this.apiRequest<TwilioCallResponse>(
      "/Calls.json",
      params,
    );

    // apiRequest yields undefined for an empty body; fail with a clear message
    // instead of a TypeError on `result.sid`.
    if (!result?.sid) {
      throw new Error("Twilio API returned no call SID for the new call");
    }

    this.callWebhookUrls.set(result.sid, callbackHref);

    return {
      providerCallId: result.sid,
      status: result.status === "queued" ? "queued" : "initiated",
    };
  }

  /**
   * Hang up a call via Twilio API.
   * A call Twilio no longer knows (404) is treated as already hung up. The
   * tracked call state is released once the request has succeeded.
   */
  async hangupCall(input: HangupCallInput): Promise<void> {
    await this.apiRequest(
      TwilioProvider.callResourcePath(input.providerCallId),
      { Status: "completed" },
      { allowNotFound: true },
    );

    this.releaseCallState(input.providerCallId);
  }

  /**
   * Play TTS audio via Twilio.
   *
   * Two modes:
   * 1. OpenAI TTS + Media Streams: If a TTS provider, a media stream handler and
   *    a registered stream are available, audio is streamed through the
   *    WebSocket (preferred).
   * 2. TwiML <Say>: Falls back to Twilio's native TTS with Polly voices.
   *    Note: This may not work on all Twilio accounts.
   *
   * @throws Error when the fallback is needed but no webhook URL is known for the call.
   */
  async playTts(input: PlayTtsInput): Promise<void> {
    // Try OpenAI TTS via media stream first (if configured)
    const streamSid = this.callStreamMap.get(input.providerCallId);
    if (this.ttsProvider && this.mediaStreamHandler && streamSid) {
      try {
        await this.playTtsViaStream(input.text, streamSid);
        return;
      } catch (err) {
        console.warn(
          `[voice-call] OpenAI TTS failed, falling back to Twilio <Say>:`,
          err instanceof Error ? err.message : err,
        );
        // Fall through to TwiML <Say> fallback
      }
    }

    // Fall back to TwiML <Say> (may not work on all accounts)
    const webhookUrl = this.callWebhookUrls.get(input.providerCallId);
    if (!webhookUrl) {
      throw new Error(
        "Missing webhook URL for this call (provider state not initialized)",
      );
    }

    console.warn(
      "[voice-call] Using TwiML <Say> fallback - OpenAI TTS not configured or media stream not active",
    );

    // Every interpolated value is escaped: voice and locale come from the
    // caller of this method and must not be able to break out of the attribute.
    const voiceAttr = escapeXml(mapVoiceToPolly(input.voice));
    const languageAttr = escapeXml(input.locale || "en-US");
    const twiml = `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Say voice="${voiceAttr}" language="${languageAttr}">${escapeXml(input.text)}</Say>
  <Gather input="speech" speechTimeout="auto" action="${escapeXml(webhookUrl)}" method="POST">
    <Say>.</Say>
  </Gather>
</Response>`;

    await this.apiRequest(
      TwilioProvider.callResourcePath(input.providerCallId),
      { Twiml: twiml },
    );
  }

  /**
   * Play TTS via OpenAI and Twilio Media Streams.
   *
   * Synthesizes the text, sends it in 160-byte chunks paced at 20 ms each, then
   * sends a mark so the end of the audio can be tracked. Pacing follows a fixed
   * schedule measured from the first chunk, so timer lateness does not
   * accumulate over a long utterance.
   */
  private async playTtsViaStream(
    text: string,
    streamSid: string,
  ): Promise<void> {
    const handler = this.mediaStreamHandler;
    if (!this.ttsProvider || !handler) {
      throw new Error("TTS provider and media stream handler required");
    }

    // Generate audio with OpenAI TTS (returns mu-law at 8kHz)
    const muLawAudio = await this.ttsProvider.synthesizeForTwilio(text);

    // Stream audio in 20ms chunks (160 bytes at 8kHz mu-law)
    const CHUNK_SIZE = 160;
    const CHUNK_DELAY_MS = 20;

    const startedAt = Date.now();
    let chunksSent = 0;

    for (const chunk of chunkAudio(muLawAudio, CHUNK_SIZE)) {
      handler.sendAudio(streamSid, chunk);
      chunksSent += 1;

      // Wait until this chunk's playback slot has elapsed. When we are behind
      // schedule the wait is skipped; it is capped at one slot so a clock jump
      // cannot stall playback.
      const dueAt = startedAt + chunksSent * CHUNK_DELAY_MS;
      const waitMs = Math.min(dueAt - Date.now(), CHUNK_DELAY_MS);
      if (waitMs > 0) {
        await new Promise<void>((resolve) => setTimeout(resolve, waitMs));
      }
    }

    // Send a mark to track when audio finishes
    handler.sendMark(streamSid, `tts-${Date.now()}`);
  }

  /**
   * Start listening for speech via Twilio <Gather>.
   *
   * @throws Error when no webhook URL is known for the call.
   */
  async startListening(input: StartListeningInput): Promise<void> {
    const webhookUrl = this.callWebhookUrls.get(input.providerCallId);
    if (!webhookUrl) {
      throw new Error(
        "Missing webhook URL for this call (provider state not initialized)",
      );
    }

    // Escaped for the same reason as the action URL: it lands inside an attribute.
    const languageAttr = escapeXml(input.language || "en-US");
    const twiml = `<?xml version="1.0" encoding="UTF-8"?>
<Response>
  <Gather input="speech" speechTimeout="auto" language="${languageAttr}" action="${escapeXml(webhookUrl)}" method="POST">
  </Gather>
</Response>`;

    await this.apiRequest(
      TwilioProvider.callResourcePath(input.providerCallId),
      { Twiml: twiml },
    );
  }

  /**
   * Stop listening - for Twilio this is a no-op as <Gather> auto-ends.
   */
  async stopListening(_input: StopListeningInput): Promise<void> {
    // Twilio's <Gather> automatically stops on speech end
    // No explicit action needed
  }
}
525
+
526
+ // -----------------------------------------------------------------------------
527
+ // Twilio-specific types
528
+ // -----------------------------------------------------------------------------
529
+
530
/**
 * Fields this module expects in the JSON body Twilio returns for a call.
 * Only `sid` and `status` are read by the provider; the rest are declared
 * for completeness.
 */
interface TwilioCallResponse {
  /** Call SID, used as the provider call ID. */
  sid: string;
  /** Call status string, e.g. `"queued"`. */
  status: string;
  direction: string;
  to: string;
  from: string;
  uri: string;
}
@@ -0,0 +1,171 @@
1
+ /**
2
+ * Voice call response generator - uses the embedded Pi agent for tool support.
3
+ * Routes voice responses through the same agent infrastructure as messaging.
4
+ */
5
+
6
+ import crypto from "node:crypto";
7
+
8
+ import { loadCoreAgentDeps, type CoreConfig } from "./core-bridge.js";
9
+
10
+ import type { VoiceCallConfig } from "./config.js";
11
+
12
/** Input for generating one spoken reply during a voice call. */
export type VoiceResponseParams = {
  /** Voice call plugin configuration. */
  voiceConfig: VoiceCallConfig;
  /** Core Clawdbot configuration. */
  coreConfig: CoreConfig;
  /** ID of the call, used for session tracking. */
  callId: string;
  /** Phone number of the caller. */
  from: string;
  /** Conversation so far, one entry per utterance. */
  transcript: { speaker: "user" | "bot"; text: string }[];
  /** Most recent message from the user. */
  userMessage: string;
};
26
+
27
/** Outcome of a voice response generation attempt. */
export type VoiceResponseResult = {
  /** Reply text to speak, or `null` when none was produced. */
  text: string | null;
  /** Description of what went wrong, when something did. */
  error?: string;
};
31
+
32
/** Session store record kept per voice session key. */
type SessionEntry = {
  /** Millisecond timestamp recorded when the entry was written. */
  updatedAt: number;
  /** Identifier of the agent session. */
  sessionId: string;
};
36
+
37
+ /**
38
+ * Generate a voice response using the embedded Pi agent with full tool support.
39
+ * Uses the same agent infrastructure as messaging for consistent behavior.
40
+ */
41
+ export async function generateVoiceResponse(
42
+ params: VoiceResponseParams,
43
+ ): Promise<VoiceResponseResult> {
44
+ const { voiceConfig, callId, from, transcript, userMessage, coreConfig } =
45
+ params;
46
+
47
+ if (!coreConfig) {
48
+ return { text: null, error: "Core config unavailable for voice response" };
49
+ }
50
+
51
+ let deps: Awaited<ReturnType<typeof loadCoreAgentDeps>>;
52
+ try {
53
+ deps = await loadCoreAgentDeps();
54
+ } catch (err) {
55
+ return {
56
+ text: null,
57
+ error:
58
+ err instanceof Error
59
+ ? err.message
60
+ : "Unable to load core agent dependencies",
61
+ };
62
+ }
63
+ const cfg = coreConfig;
64
+
65
+ // Build voice-specific session key based on phone number
66
+ const normalizedPhone = from.replace(/\D/g, "");
67
+ const sessionKey = `voice:${normalizedPhone}`;
68
+ const agentId = "main";
69
+
70
+ // Resolve paths
71
+ const storePath = deps.resolveStorePath(cfg.session?.store, { agentId });
72
+ const agentDir = deps.resolveAgentDir(cfg, agentId);
73
+ const workspaceDir = deps.resolveAgentWorkspaceDir(cfg, agentId);
74
+
75
+ // Ensure workspace exists
76
+ await deps.ensureAgentWorkspace({ dir: workspaceDir });
77
+
78
+ // Load or create session entry
79
+ const sessionStore = deps.loadSessionStore(storePath);
80
+ const now = Date.now();
81
+ let sessionEntry = sessionStore[sessionKey] as SessionEntry | undefined;
82
+
83
+ if (!sessionEntry) {
84
+ sessionEntry = {
85
+ sessionId: crypto.randomUUID(),
86
+ updatedAt: now,
87
+ };
88
+ sessionStore[sessionKey] = sessionEntry;
89
+ await deps.saveSessionStore(storePath, sessionStore);
90
+ }
91
+
92
+ const sessionId = sessionEntry.sessionId;
93
+ const sessionFile = deps.resolveSessionFilePath(sessionId, sessionEntry, {
94
+ agentId,
95
+ });
96
+
97
+ // Resolve model from config
98
+ const modelRef =
99
+ voiceConfig.responseModel ||
100
+ `${deps.DEFAULT_PROVIDER}/${deps.DEFAULT_MODEL}`;
101
+ const slashIndex = modelRef.indexOf("/");
102
+ const provider =
103
+ slashIndex === -1 ? deps.DEFAULT_PROVIDER : modelRef.slice(0, slashIndex);
104
+ const model = slashIndex === -1 ? modelRef : modelRef.slice(slashIndex + 1);
105
+
106
+ // Resolve thinking level
107
+ const thinkLevel = deps.resolveThinkingDefault({ cfg, provider, model });
108
+
109
+ // Resolve agent identity for personalized prompt
110
+ const identity = deps.resolveAgentIdentity(cfg, agentId);
111
+ const agentName = identity?.name?.trim() || "assistant";
112
+
113
+ // Build system prompt with conversation history
114
+ const basePrompt =
115
+ voiceConfig.responseSystemPrompt ??
116
+ `You are ${agentName}, a helpful voice assistant on a phone call. Keep responses brief and conversational (1-2 sentences max). Be natural and friendly. The caller's phone number is ${from}. You have access to tools - use them when helpful.`;
117
+
118
+ let extraSystemPrompt = basePrompt;
119
+ if (transcript.length > 0) {
120
+ const history = transcript
121
+ .map(
122
+ (entry) =>
123
+ `${entry.speaker === "bot" ? "You" : "Caller"}: ${entry.text}`,
124
+ )
125
+ .join("\n");
126
+ extraSystemPrompt = `${basePrompt}\n\nConversation so far:\n${history}`;
127
+ }
128
+
129
+ // Resolve timeout
130
+ const timeoutMs =
131
+ voiceConfig.responseTimeoutMs ?? deps.resolveAgentTimeoutMs({ cfg });
132
+ const runId = `voice:${callId}:${Date.now()}`;
133
+
134
+ try {
135
+ const result = await deps.runEmbeddedPiAgent({
136
+ sessionId,
137
+ sessionKey,
138
+ messageProvider: "voice",
139
+ sessionFile,
140
+ workspaceDir,
141
+ config: cfg,
142
+ prompt: userMessage,
143
+ provider,
144
+ model,
145
+ thinkLevel,
146
+ verboseLevel: "off",
147
+ timeoutMs,
148
+ runId,
149
+ lane: "voice",
150
+ extraSystemPrompt,
151
+ agentDir,
152
+ });
153
+
154
+ // Extract text from payloads
155
+ const texts = (result.payloads ?? [])
156
+ .filter((p) => p.text && !p.isError)
157
+ .map((p) => p.text?.trim())
158
+ .filter(Boolean);
159
+
160
+ const text = texts.join(" ") || null;
161
+
162
+ if (!text && result.meta.aborted) {
163
+ return { text: null, error: "Response generation was aborted" };
164
+ }
165
+
166
+ return { text };
167
+ } catch (err) {
168
+ console.error(`[voice-call] Response generation failed:`, err);
169
+ return { text: null, error: String(err) };
170
+ }
171
+ }