@ouro.bot/cli 0.1.0-alpha.582 → 0.1.0-alpha.583

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/changelog.json CHANGED
@@ -1,6 +1,17 @@
1
1
  {
2
2
  "_note": "This changelog is maintained as part of the PR/version-bump workflow. Agent-curated, not auto-generated. Agents read this file directly via read_file to understand what changed between versions.",
3
3
  "versions": [
4
+ {
5
+ "version": "0.1.0-alpha.583",
6
+ "changes": [
7
+ "Outbound SIP phone calls now start the Realtime greeting immediately after answer unless Twilio has already positively identified voicemail or fax, preventing humans from hearing post-pickup silence when async AMD returns unknown.",
8
+ "Twilio phone voice now defaults outbound calls to OpenAI Realtime Media Streams when inbound calls use OpenAI SIP on a Media Stream machine, while still allowing `voice.twilioOutboundConversationEngine` overrides, so humans avoid post-pickup SIP ringback.",
9
+ "Realtime voice now resolves phone callers through the canonical friend graph, preferring existing friend ids and otherwise matching normalized phone numbers via `imessage-handle`, so trust-aware tools see the same friend context as text and mail.",
10
+ "Realtime media-stream voice now treats empty caller metadata as absent and preserves local voice friend identities, keeping outbound and provider-simulated calls attached to the intended friend instead of inventing a blank phone identity.",
11
+ "Realtime voice response creation now backs off and retries after provider active-response conflicts, holds user turns under Ouro floor-control instead of provider auto-response, and long-running voice tools can emit one tiny holding phrase instead of leaving seconds of unexplained silence.",
12
+ "Realtime voice VAD and local barge-in thresholds are less twitchy by default, reducing accidental interruption from tiny room sounds while preserving deliberate caller interruption."
13
+ ]
14
+ },
4
15
  {
5
16
  "version": "0.1.0-alpha.582",
6
17
  "changes": [
@@ -154,13 +154,37 @@ function resolveOpenAIRealtimeApiKey(options) {
154
154
  return { apiKey: compatKey, source: "integrations.openaiEmbeddingsApiKey" };
155
155
  return undefined;
156
156
  }
157
- function configuredConversationEngine(options, overrides) {
158
- return overrides.conversationEngine
159
- ?? (0, twilio_phone_1.normalizeTwilioPhoneConversationEngine)(configString(options.machineConfig, "voice.twilioConversationEngine")
160
- ?? configString(options.machineConfig, "voice.conversationEngine")
161
- ?? configString(options.runtimeConfig, "voice.twilioConversationEngine")
162
- ?? configString(options.runtimeConfig, "voice.conversationEngine")
163
- ?? "cascade");
157
+ function configuredConversationEngine(options, overrides, transportMode) {
158
+ const explicit = overrides.conversationEngine
159
+ ?? configString(options.machineConfig, "voice.twilioConversationEngine")
160
+ ?? configString(options.machineConfig, "voice.conversationEngine")
161
+ ?? configString(options.runtimeConfig, "voice.twilioConversationEngine")
162
+ ?? configString(options.runtimeConfig, "voice.conversationEngine");
163
+ const hasSipConfig = !!(configString(options.runtimeConfig, "voice.openaiSipProjectId")
164
+ || configString(options.machineConfig, "voice.openaiSipProjectId"));
165
+ const explicitEngine = explicit ? (0, twilio_phone_1.normalizeTwilioPhoneConversationEngine)(explicit) : undefined;
166
+ if (hasSipConfig && (!explicitEngine || explicitEngine === "cascade"))
167
+ return "openai-sip";
168
+ if (explicitEngine)
169
+ return explicitEngine;
170
+ const hasRealtimeConfig = !!resolveOpenAIRealtimeApiKey({ runtimeConfig: options.runtimeConfig, overrides });
171
+ if (hasRealtimeConfig && transportMode === "media-stream")
172
+ return "openai-realtime";
173
+ return "cascade";
174
+ }
175
+ function configuredOutboundConversationEngine(options, overrides, conversationEngine, transportMode) {
176
+ const defaultOutboundEngine = conversationEngine === "openai-sip" && transportMode === "media-stream"
177
+ ? "openai-realtime"
178
+ : conversationEngine;
179
+ const configured = overrides.outboundConversationEngine
180
+ ?? (0, twilio_phone_1.normalizeTwilioPhoneConversationEngine)(configString(options.machineConfig, "voice.twilioOutboundConversationEngine")
181
+ ?? configString(options.machineConfig, "voice.outboundConversationEngine")
182
+ ?? configString(options.runtimeConfig, "voice.twilioOutboundConversationEngine")
183
+ ?? configString(options.runtimeConfig, "voice.outboundConversationEngine")
184
+ ?? defaultOutboundEngine);
185
+ if (defaultOutboundEngine === "openai-realtime" && configured === "cascade")
186
+ return defaultOutboundEngine;
187
+ return configured;
164
188
  }
165
189
  function normalizeOpenAIRealtimeReasoningEffort(value) {
166
190
  const normalized = value?.trim().toLowerCase();
@@ -226,7 +250,14 @@ function resolveTwilioPhoneTransportRuntime(options) {
226
250
  ?? twilio_phone_1.TWILIO_PHONE_WEBHOOK_BASE_PATH);
227
251
  const transportMode = overrides.transportMode
228
252
  ?? (0, twilio_phone_1.normalizeTwilioPhoneTransportMode)(configString(options.machineConfig, "voice.twilioTransportMode") ?? twilio_phone_1.DEFAULT_TWILIO_PHONE_TRANSPORT_MODE);
229
- const conversationEngine = configuredConversationEngine(options, overrides);
253
+ const conversationEngine = configuredConversationEngine(options, overrides, transportMode);
254
+ const outboundConversationEngine = configuredOutboundConversationEngine(options, overrides, conversationEngine, transportMode);
255
+ const needsOpenAIRealtime = conversationEngine === "openai-realtime"
256
+ || conversationEngine === "openai-sip"
257
+ || outboundConversationEngine === "openai-realtime"
258
+ || outboundConversationEngine === "openai-sip";
259
+ const needsOpenAISip = conversationEngine === "openai-sip" || outboundConversationEngine === "openai-sip";
260
+ const needsCascade = conversationEngine === "cascade" || outboundConversationEngine === "cascade";
230
261
  let elevenLabsApiKey = configString(options.runtimeConfig, "integrations.elevenLabsApiKey") ?? "";
231
262
  let elevenLabsVoiceId = trimOptional(overrides.elevenLabsVoiceId)
232
263
  ?? configString(options.runtimeConfig, "integrations.elevenLabsVoiceId")
@@ -240,9 +271,9 @@ function resolveTwilioPhoneTransportRuntime(options) {
240
271
  ?? "";
241
272
  let openaiRealtime;
242
273
  let openaiSip;
243
- if (conversationEngine === "openai-realtime" || conversationEngine === "openai-sip") {
244
- if (conversationEngine === "openai-realtime" && transportMode !== "media-stream") {
245
- throw new Error("voice.twilioConversationEngine=openai-realtime requires voice.twilioTransportMode=media-stream");
274
+ if (needsOpenAIRealtime) {
275
+ if ((conversationEngine === "openai-realtime" || outboundConversationEngine === "openai-realtime") && transportMode !== "media-stream") {
276
+ throw new Error("voice.twilioConversationEngine/openai-realtime requires voice.twilioTransportMode=media-stream");
246
277
  }
247
278
  const key = resolveOpenAIRealtimeApiKey({ runtimeConfig: options.runtimeConfig, overrides });
248
279
  if (!key) {
@@ -300,7 +331,7 @@ function resolveTwilioPhoneTransportRuntime(options) {
300
331
  ?? normalizeOpenAIRealtimeNoiseReduction(configString(options.runtimeConfig, "voice.openaiRealtimeNoiseReduction")),
301
332
  turnDetection,
302
333
  };
303
- if (conversationEngine === "openai-sip") {
334
+ if (needsOpenAISip) {
304
335
  const projectId = trimOptional(overrides.openaiSipProjectId)
305
336
  ?? configString(options.runtimeConfig, "voice.openaiSipProjectId")
306
337
  ?? configString(options.machineConfig, "voice.openaiSipProjectId");
@@ -334,7 +365,7 @@ function resolveTwilioPhoneTransportRuntime(options) {
334
365
  };
335
366
  }
336
367
  }
337
- else {
368
+ if (needsCascade) {
338
369
  elevenLabsApiKey = required(elevenLabsApiKey || undefined, "missing integrations.elevenLabsApiKey; run 'ouro connect voice --agent <agent>' for setup guidance");
339
370
  elevenLabsVoiceId = required(elevenLabsVoiceId || undefined, "missing integrations.elevenLabsVoiceId; save the ElevenLabs voice ID before starting phone voice");
340
371
  whisperCliPath = required(whisperCliPath || undefined, "missing voice.whisperCliPath in this machine's runtime config");
@@ -379,6 +410,7 @@ function resolveTwilioPhoneTransportRuntime(options) {
379
410
  ?? (0, twilio_phone_1.normalizeTwilioPhonePlaybackMode)(configString(options.machineConfig, "voice.twilioPlaybackMode") ?? twilio_phone_1.DEFAULT_TWILIO_PHONE_PLAYBACK_MODE),
380
411
  transportMode,
381
412
  conversationEngine,
413
+ outboundConversationEngine,
382
414
  openaiRealtime,
383
415
  openaiSip,
384
416
  openaiSipWebhookUrl: openaiSip?.webhookPath ? (0, twilio_phone_1.openAISipWebhookUrl)(publicBaseUrl, openaiSip.webhookPath) : undefined,
@@ -482,7 +514,12 @@ async function startConfiguredTwilioPhoneTransport(options, deps = defaultTwilio
482
514
  meta: { agentName: settings.agentName, source: settings.openaiRealtime.apiKeySource },
483
515
  });
484
516
  }
485
- const transcriber = settings.conversationEngine === "openai-realtime" || settings.conversationEngine === "openai-sip"
517
+ const settingsNeedsOpenAIRealtime = settings.conversationEngine === "openai-realtime"
518
+ || settings.conversationEngine === "openai-sip"
519
+ || settings.outboundConversationEngine === "openai-realtime"
520
+ || settings.outboundConversationEngine === "openai-sip";
521
+ const settingsNeedsCascade = settings.conversationEngine === "cascade" || settings.outboundConversationEngine === "cascade";
522
+ const transcriber = settingsNeedsOpenAIRealtime && !settingsNeedsCascade
486
523
  ? {
487
524
  transcribe: async () => {
488
525
  throw new Error("OpenAI Realtime voice sessions do not use the cascade transcriber");
@@ -492,7 +529,7 @@ async function startConfiguredTwilioPhoneTransport(options, deps = defaultTwilio
492
529
  whisperCliPath: settings.whisperCliPath,
493
530
  modelPath: settings.whisperModelPath,
494
531
  });
495
- const tts = settings.conversationEngine === "openai-realtime" || settings.conversationEngine === "openai-sip"
532
+ const tts = settingsNeedsOpenAIRealtime && !settingsNeedsCascade
496
533
  ? {
497
534
  synthesize: async () => {
498
535
  throw new Error("OpenAI Realtime voice sessions do not use the cascade TTS service");
@@ -522,6 +559,7 @@ async function startConfiguredTwilioPhoneTransport(options, deps = defaultTwilio
522
559
  transportMode: settings.transportMode,
523
560
  playbackMode: settings.playbackMode,
524
561
  conversationEngine: settings.conversationEngine,
562
+ outboundConversationEngine: settings.outboundConversationEngine,
525
563
  openaiRealtime: settings.openaiRealtime,
526
564
  openaiSip: settings.openaiSip,
527
565
  });
@@ -538,6 +576,7 @@ async function startConfiguredTwilioPhoneTransport(options, deps = defaultTwilio
538
576
  openaiSipWebhookUrl: settings.openaiSipWebhookUrl ?? "",
539
577
  transportMode: settings.transportMode,
540
578
  conversationEngine: settings.conversationEngine,
579
+ outboundConversationEngine: settings.outboundConversationEngine,
541
580
  openaiRealtimeModel: settings.openaiRealtime?.model ?? "",
542
581
  },
543
582
  });
@@ -565,7 +604,7 @@ async function prewarmOutboundGreeting(options, deps) {
565
604
  if (options.settings.transportMode !== "media-stream")
566
605
  return undefined;
567
606
  /* v8 ignore next -- Realtime/SIP outbound tests assert no cascade prewarm is attempted @preserve */
568
- if (options.settings.conversationEngine === "openai-realtime" || options.settings.conversationEngine === "openai-sip")
607
+ if (options.settings.outboundConversationEngine === "openai-realtime" || options.settings.outboundConversationEngine === "openai-sip")
569
608
  return undefined;
570
609
  const friendId = options.friendId?.trim() || `twilio-${safeRuntimeSegment(options.to)}`;
571
610
  const sessionKey = (0, twilio_phone_1.twilioPhoneVoiceSessionKey)({
@@ -677,7 +716,7 @@ async function placeConfiguredTwilioPhoneCall(options, deps = defaultTwilioPhone
677
716
  reason: options.reason.trim(),
678
717
  ...(options.initialAudio ? { initialAudio: options.initialAudio } : {}),
679
718
  createdAt,
680
- status: settings.transportMode === "media-stream" && settings.conversationEngine !== "openai-realtime" && settings.conversationEngine !== "openai-sip"
719
+ status: settings.transportMode === "media-stream" && settings.outboundConversationEngine === "cascade"
681
720
  ? "prewarming"
682
721
  : "requested",
683
722
  });
@@ -205,6 +205,15 @@ function usesOpenAIRealtimeConversationEngine(options) {
205
205
  function usesOpenAISipConversationEngine(options) {
206
206
  return normalizeTwilioPhoneConversationEngine(options.conversationEngine) === "openai-sip";
207
207
  }
208
+ function outboundConversationEngine(options) {
209
+ return normalizeTwilioPhoneConversationEngine(options.outboundConversationEngine ?? options.conversationEngine);
210
+ }
211
+ function usesOpenAIRealtimeOutboundConversationEngine(options) {
212
+ return outboundConversationEngine(options) === "openai-realtime";
213
+ }
214
+ function usesOpenAISipOutboundConversationEngine(options) {
215
+ return outboundConversationEngine(options) === "openai-sip";
216
+ }
208
217
  function twilioPhoneWebhookUrl(publicBaseUrl, basePath = exports.TWILIO_PHONE_WEBHOOK_BASE_PATH) {
209
218
  return routeUrl(publicBaseUrl, `${normalizeTwilioPhoneBasePath(basePath)}/incoming`);
210
219
  }
@@ -253,8 +262,9 @@ function websocketRouteUrl(publicBaseUrl, route) {
253
262
  url.protocol = "wss:";
254
263
  return url.toString();
255
264
  }
256
- function mediaStreamTwiml(options, basePath, params, greetingJobId, customParams = {}) {
257
- const streamUrl = websocketRouteUrl(options.publicBaseUrl, `${basePath}/media-stream`);
265
+ function mediaStreamTwiml(options, basePath, params, greetingJobId, customParams = {}, streamEngine) {
266
+ const streamRoute = streamEngine ? `${basePath}/media-stream?engine=${encodeURIComponent(streamEngine)}` : `${basePath}/media-stream`;
267
+ const streamUrl = websocketRouteUrl(options.publicBaseUrl, streamRoute);
258
268
  const twimlParams = {
259
269
  From: params.From,
260
270
  To: params.To,
@@ -286,7 +296,7 @@ function openAISipUri(options, customHeaders = {}) {
286
296
  return `sip:${projectId}@sip.api.openai.com;transport=tls${query ? `?${query}` : ""}`;
287
297
  }
288
298
  function openAISipDialTwiml(options, customHeaders = {}) {
289
- return `<Dial answerOnBridge="true"><Sip>${escapeXml(openAISipUri(options, customHeaders))}</Sip></Dial>`;
299
+ return `<Dial><Sip>${escapeXml(openAISipUri(options, customHeaders))}</Sip></Dial>`;
290
300
  }
291
301
  /* v8 ignore stop */
292
302
  function safeSegment(input) {
@@ -346,6 +356,32 @@ function friendIdFromCaller(from, callSid) {
346
356
  function voiceFriendId(options, from, callSid) {
347
357
  return options.defaultFriendId?.trim() || friendIdFromCaller(from, callSid);
348
358
  }
359
+ async function resolveVoiceFriendContext(options, input) {
360
+ const agentRoot = options.agentRoot ?? (0, identity_1.getAgentRoot)(options.agentName);
361
+ const friendStore = new store_file_1.FileFriendStore(path.join(agentRoot, "friends"));
362
+ const explicitFriendId = input.friendId?.trim();
363
+ if (explicitFriendId) {
364
+ const existing = await friendStore.get(explicitFriendId);
365
+ if (existing) {
366
+ return {
367
+ friendId: existing.id,
368
+ friendStore,
369
+ resolved: { friend: existing, channel: (0, channel_1.getChannelCapabilities)("voice") },
370
+ };
371
+ }
372
+ }
373
+ const remotePhone = (0, phone_1.normalizeTwilioE164PhoneNumber)(input.remotePhone);
374
+ const provider = remotePhone ? "imessage-handle" : "local";
375
+ const externalId = remotePhone || explicitFriendId || voiceFriendId(options, input.remotePhone ?? "", input.callSid);
376
+ const resolver = new resolver_1.FriendResolver(friendStore, {
377
+ provider,
378
+ externalId,
379
+ displayName: remotePhone || externalId,
380
+ channel: "voice",
381
+ });
382
+ const resolved = await resolver.resolve();
383
+ return { friendId: resolved.friend.id, friendStore, resolved };
384
+ }
349
385
  function phoneIdentitySegment(input) {
350
386
  const phoneish = input.replace(/[^0-9A-Za-z]+/g, "");
351
387
  return phoneish || safeSegment(input);
@@ -1119,16 +1155,19 @@ const OPENAI_REALTIME_DEFAULT_TRANSCRIPTION_MODEL = "gpt-realtime-whisper";
1119
1155
  const OPENAI_REALTIME_BOOTSTRAP_TIMEOUT_MS = 250;
1120
1156
  const OPENAI_REALTIME_PCMS_BYTES_PER_MS = 8;
1121
1157
  const OPENAI_REALTIME_DEFAULT_NOISE_REDUCTION = "near_field";
1122
- const OPENAI_REALTIME_DEFAULT_VAD_THRESHOLD = 0.68;
1123
- const OPENAI_REALTIME_DEFAULT_VAD_PREFIX_PADDING_MS = 220;
1124
- const OPENAI_REALTIME_DEFAULT_VAD_SILENCE_DURATION_MS = 320;
1125
- const OPENAI_REALTIME_DEFAULT_VAD_IDLE_TIMEOUT_MS = 15_000;
1158
+ const OPENAI_REALTIME_DEFAULT_VAD_THRESHOLD = 0.78;
1159
+ const OPENAI_REALTIME_DEFAULT_VAD_PREFIX_PADDING_MS = 300;
1160
+ const OPENAI_REALTIME_DEFAULT_VAD_SILENCE_DURATION_MS = 650;
1161
+ const OPENAI_REALTIME_DEFAULT_VAD_IDLE_TIMEOUT_MS = 7_000;
1126
1162
  const OPENAI_REALTIME_MAX_OUTPUT_TOKENS = 220;
1127
- const OPENAI_REALTIME_BARGE_IN_MIN_SPEECH_MS = 160;
1128
- const OPENAI_REALTIME_BARGE_IN_RMS_THRESHOLD = 900;
1163
+ const OPENAI_REALTIME_BARGE_IN_MIN_SPEECH_MS = 260;
1164
+ const OPENAI_REALTIME_BARGE_IN_RMS_THRESHOLD = 1_300;
1129
1165
  const OPENAI_REALTIME_MIN_VOICE_SPEED = 0.25;
1130
1166
  const OPENAI_REALTIME_MAX_VOICE_SPEED = 1.5;
1131
1167
  const OPENAI_REALTIME_RESPONSE_CREATE_GRACE_MS = 50;
1168
+ const OPENAI_REALTIME_RESPONSE_CREATE_CONFLICT_BACKOFF_MS = 1_000;
1169
+ const OPENAI_REALTIME_TOOL_PRESENCE_DELAY_MS = 900;
1170
+ const OPENAI_REALTIME_USER_TURN_RESPONSE_DELAY_MS = 700;
1132
1171
  const OPENAI_SIP_OUTBOUND_AMD_GREETING_TIMEOUT_MS = 10_000;
1133
1172
  const OPENAI_SIP_UNSUPPORTED_TOOL_NAMES = new Set();
1134
1173
  const OPENAI_SIP_DEFAULT_API_BASE_URL = "https://api.openai.com/v1";
@@ -1337,6 +1376,18 @@ function realtimeToolsFromChatTools(tools, excludedToolNames = new Set()) {
1337
1376
  parameters: tool.function.parameters ?? { type: "object", properties: {} },
1338
1377
  }));
1339
1378
  }
1379
+ function mediaStreamRequestedConversationEngine(url) {
1380
+ if (!url)
1381
+ return undefined;
1382
+ try {
1383
+ const parsed = new URL(url, "wss://localhost");
1384
+ const engine = parsed.searchParams.get("engine") ?? undefined;
1385
+ return engine ? normalizeTwilioPhoneConversationEngine(engine) : undefined;
1386
+ }
1387
+ catch {
1388
+ return undefined;
1389
+ }
1390
+ }
1340
1391
  function parseToolArguments(raw) {
1341
1392
  if (!raw.trim())
1342
1393
  return {};
@@ -1406,12 +1457,14 @@ async function buildRealtimeVoiceInstructions(options) {
1406
1457
  return [
1407
1458
  `You are ${options.agentName} in the live Voice sense.`,
1408
1459
  "This is the same agent identity as every other Ouro surface. Voice is not a reduced or alternate self.",
1460
+ options.friend ? `Resolved voice friend: ${options.friend.name || options.friend.id} (friendId=${options.friend.id}, trust=${options.friend.trustLevel ?? "friend"}, role=${options.friend.role ?? "friend"}). Use this same friend record and trust context for relationship awareness and tool permissions across voice, text, mail, and every other sense.` : "",
1409
1461
  `Current native Realtime provider config for this call: model=${options.realtimeModel?.trim() || OPENAI_REALTIME_DEFAULT_MODEL}, voice=${options.realtimeVoice?.trim() || OPENAI_REALTIME_DEFAULT_VOICE}${options.realtimeVoiceSpeed === undefined ? "" : `, speed=${options.realtimeVoiceSpeed}`}.`,
1410
1462
  options.realtimeVoiceStyle?.trim()
1411
1463
  ? `Phone voice target: ${options.realtimeVoiceStyle.trim()}`
1412
1464
  : "",
1413
1465
  "Speak as yourself through live audio. Follow voice/style preferences from identity notes; do not say you lack identity, preferences, or agency because the provider voice is configured by the transport.",
1414
1466
  "Audio is synchronous. Default to one short sentence. Use two short sentences only when needed. Do not use markdown, lists, or long explanations unless the caller explicitly asks.",
1467
+ "Do not treat every tiny silence as your turn. Let the caller finish the thought, especially if they pause mid-sentence.",
1415
1468
  "If the caller interrupts, stop the older path and answer the newest thing first.",
1416
1469
  "If the caller says they are counting, measuring latency, testing lag, waiting, or wants you quiet, say at most 'got it' and then stay silent until they ask or say something that needs an answer.",
1417
1470
  "Use tools for outside facts or side effects. While a tool is running, give at most one tiny preamble, then summarize the result compactly when it returns.",
@@ -1491,6 +1544,8 @@ class TwilioOpenAIRealtimeMediaStreamSession {
1491
1544
  pendingAudioPayloads = [];
1492
1545
  openaiWs = null;
1493
1546
  toolContext;
1547
+ friendStore;
1548
+ resolvedContext;
1494
1549
  sessionMessages = [];
1495
1550
  playbackState;
1496
1551
  playbackMarkIndex = 0;
@@ -1498,8 +1553,12 @@ class TwilioOpenAIRealtimeMediaStreamSession {
1498
1553
  toolResponses = new Map();
1499
1554
  completedRealtimeResponseIds = new Set();
1500
1555
  activeRealtimeResponseId = null;
1556
+ realtimeResponseCreateInFlight = null;
1557
+ untrackedActiveRealtimeResponse = false;
1558
+ untrackedActiveRealtimeResponseTimer = null;
1501
1559
  pendingRealtimeResponse = null;
1502
1560
  pendingRealtimeResponseTimer = null;
1561
+ pendingUserTurnResponseTimer = null;
1503
1562
  responseCreateHoldUntilMs = 0;
1504
1563
  initialAudio;
1505
1564
  initialAudioPlayed = false;
@@ -1574,9 +1633,16 @@ class TwilioOpenAIRealtimeMediaStreamSession {
1574
1633
  this.from = customParameter(start, "From");
1575
1634
  this.to = customParameter(start, "To");
1576
1635
  }
1577
- this.friendId = explicitFriendId || voiceFriendId(this.options, this.from, this.callSid);
1636
+ const voiceContext = await resolveVoiceFriendContext(this.options, {
1637
+ friendId: explicitFriendId,
1638
+ remotePhone: this.from || undefined,
1639
+ callSid: this.callSid,
1640
+ });
1641
+ this.friendId = voiceContext.friendId;
1642
+ this.friendStore = voiceContext.friendStore;
1643
+ this.resolvedContext = voiceContext.resolved;
1578
1644
  this.sessionKey = twilioPhoneVoiceSessionKey({
1579
- defaultFriendId: explicitFriendId || this.options.defaultFriendId,
1645
+ defaultFriendId: this.friendId,
1580
1646
  from: this.from,
1581
1647
  to: this.to,
1582
1648
  callSid: this.callSid,
@@ -1699,7 +1765,7 @@ class TwilioOpenAIRealtimeMediaStreamSession {
1699
1765
  format: { type: "audio/pcmu" },
1700
1766
  noise_reduction: realtimeNoiseReductionConfig(realtime),
1701
1767
  transcription: { model: OPENAI_REALTIME_DEFAULT_TRANSCRIPTION_MODEL },
1702
- turn_detection: realtimeTurnDetectionConfig(realtime),
1768
+ turn_detection: realtimeTurnDetectionConfig(realtime, { createResponse: false, interruptResponse: false }),
1703
1769
  },
1704
1770
  output: realtimeOutputAudioConfig(realtime, { type: "audio/pcmu" }),
1705
1771
  },
@@ -1721,6 +1787,7 @@ class TwilioOpenAIRealtimeMediaStreamSession {
1721
1787
  const realtimeSystem = await buildRealtimeVoiceInstructions({
1722
1788
  agentName: this.options.agentName,
1723
1789
  agentRoot,
1790
+ friend: this.resolvedContext?.friend,
1724
1791
  priorTranscript: prior,
1725
1792
  realtimeVoice: this.options.openaiRealtime?.voice,
1726
1793
  realtimeVoiceStyle: this.options.openaiRealtime?.voiceStyle,
@@ -1743,6 +1810,8 @@ class TwilioOpenAIRealtimeMediaStreamSession {
1743
1810
  return;
1744
1811
  this.toolContext = {
1745
1812
  signin: async () => undefined,
1813
+ ...(this.resolvedContext ? { context: this.resolvedContext } : {}),
1814
+ ...(this.friendStore ? { friendStore: this.friendStore } : {}),
1746
1815
  voiceCall: {
1747
1816
  requestEnd: () => this.requestHangupFromTool(),
1748
1817
  playAudio: (request) => this.playPreparedAudio(request),
@@ -1750,16 +1819,18 @@ class TwilioOpenAIRealtimeMediaStreamSession {
1750
1819
  };
1751
1820
  }
1752
1821
  async buildRealtimeTools() {
1753
- const agentRoot = this.options.agentRoot ?? (0, identity_1.getAgentRoot)(this.options.agentName);
1754
- const friendsPath = path.join(agentRoot, "friends");
1755
- const friendStore = new store_file_1.FileFriendStore(friendsPath);
1756
- const resolver = new resolver_1.FriendResolver(friendStore, {
1757
- provider: "local",
1758
- externalId: this.friendId,
1759
- displayName: this.friendId,
1760
- channel: "voice",
1761
- });
1762
- const resolved = await resolver.resolve();
1822
+ if (!this.resolvedContext || !this.friendStore) {
1823
+ const voiceContext = await resolveVoiceFriendContext(this.options, {
1824
+ friendId: this.friendId,
1825
+ remotePhone: this.from || undefined,
1826
+ callSid: this.callSid,
1827
+ });
1828
+ this.friendId = voiceContext.friendId;
1829
+ this.friendStore = voiceContext.friendStore;
1830
+ this.resolvedContext = voiceContext.resolved;
1831
+ }
1832
+ const resolved = this.resolvedContext;
1833
+ const friendStore = this.friendStore;
1763
1834
  this.toolContext = {
1764
1835
  signin: async () => undefined,
1765
1836
  context: resolved,
@@ -1875,7 +1946,7 @@ class TwilioOpenAIRealtimeMediaStreamSession {
1875
1946
  return;
1876
1947
  }
1877
1948
  if (type === "conversation.item.input_audio_transcription.completed" && typeof event.transcript === "string") {
1878
- this.appendTranscript("user", event.transcript);
1949
+ this.handleUserTranscript(event.transcript);
1879
1950
  return;
1880
1951
  }
1881
1952
  if (type === "response.output_audio_transcript.done" && typeof event.transcript === "string") {
@@ -1896,6 +1967,7 @@ class TwilioOpenAIRealtimeMediaStreamSession {
1896
1967
  return;
1897
1968
  }
1898
1969
  if (type === "error") {
1970
+ this.handleRealtimeError(event);
1899
1971
  (0, runtime_1.emitNervesEvent)({
1900
1972
  level: "error",
1901
1973
  component: "senses",
@@ -1905,6 +1977,40 @@ class TwilioOpenAIRealtimeMediaStreamSession {
1905
1977
  });
1906
1978
  }
1907
1979
  }
1980
+ handleRealtimeError(event) {
1981
+ const error = event.error;
1982
+ if (!error || typeof error !== "object" || Array.isArray(error))
1983
+ return;
1984
+ const code = stringField(error.code);
1985
+ if (code !== "conversation_already_has_active_response")
1986
+ return;
1987
+ this.noteRealtimeResponseConflict();
1988
+ }
1989
+ handleUserTranscript(transcript) {
1990
+ const content = transcript.trim();
1991
+ if (!content)
1992
+ return;
1993
+ this.appendTranscript("user", content);
1994
+ this.scheduleUserTurnResponse();
1995
+ }
1996
+ scheduleUserTurnResponse() {
1997
+ if (this.closed)
1998
+ return;
1999
+ this.clearPendingUserTurnResponse();
2000
+ this.pendingUserTurnResponseTimer = setTimeout(() => {
2001
+ this.pendingUserTurnResponseTimer = null;
2002
+ if (this.closed)
2003
+ return;
2004
+ this.requestRealtimeResponse();
2005
+ }, OPENAI_REALTIME_USER_TURN_RESPONSE_DELAY_MS);
2006
+ this.pendingUserTurnResponseTimer.unref?.();
2007
+ }
2008
+ clearPendingUserTurnResponse() {
2009
+ if (!this.pendingUserTurnResponseTimer)
2010
+ return;
2011
+ clearTimeout(this.pendingUserTurnResponseTimer);
2012
+ this.pendingUserTurnResponseTimer = null;
2013
+ }
1908
2014
  handleOpenAIAudioDelta(event) {
1909
2015
  const payload = stringField(event.delta);
1910
2016
  if (!payload)
@@ -1926,6 +2032,7 @@ class TwilioOpenAIRealtimeMediaStreamSession {
1926
2032
  this.sendTwilioMark({ itemId, contentIndex, audioEndMs });
1927
2033
  }
1928
2034
  handleCallerSpeechStarted() {
2035
+ this.clearPendingUserTurnResponse();
1929
2036
  const playback = this.playbackState;
1930
2037
  if (!this.hasReliableCallerBargeInSpeech()) {
1931
2038
  (0, runtime_1.emitNervesEvent)({
@@ -1971,6 +2078,8 @@ class TwilioOpenAIRealtimeMediaStreamSession {
1971
2078
  responseDone: this.completedRealtimeResponseIds.has(responseId),
1972
2079
  followupRequested: false,
1973
2080
  suppressFollowup: false,
2081
+ presenceRequested: false,
2082
+ presenceTimer: null,
1974
2083
  };
1975
2084
  state.pendingCallIds.add(callId);
1976
2085
  if (!existing)
@@ -1984,6 +2093,8 @@ class TwilioOpenAIRealtimeMediaStreamSession {
1984
2093
  if (!state)
1985
2094
  return false;
1986
2095
  state.pendingCallIds.delete(callId);
2096
+ if (state.pendingCallIds.size === 0)
2097
+ this.clearRealtimeToolPresenceTimer(state);
1987
2098
  return this.maybeCreateRealtimeToolFollowup(responseId, state);
1988
2099
  }
1989
2100
  completeRealtimeToolResponse(responseId) {
@@ -2002,11 +2113,37 @@ class TwilioOpenAIRealtimeMediaStreamSession {
2002
2113
  return false;
2003
2114
  state.followupRequested = true;
2004
2115
  this.toolResponses.delete(responseId);
2116
+ this.clearRealtimeToolPresenceTimer(state);
2005
2117
  if (state.suppressFollowup)
2006
2118
  return true;
2007
2119
  this.requestRealtimeResponse();
2008
2120
  return true;
2009
2121
  }
2122
+ scheduleRealtimeToolPresence(responseId, state) {
2123
+ if (!responseId || state.presenceRequested || state.presenceTimer)
2124
+ return;
2125
+ state.presenceTimer = setTimeout(() => {
2126
+ state.presenceTimer = null;
2127
+ const current = this.toolResponses.get(responseId);
2128
+ if (this.closed || current !== state || state.pendingCallIds.size === 0 || state.suppressFollowup)
2129
+ return;
2130
+ state.presenceRequested = true;
2131
+ this.requestRealtimeResponse({
2132
+ instructions: "A tool is taking a moment. Say one very short natural holding phrase under six words, then stop speaking.",
2133
+ });
2134
+ }, OPENAI_REALTIME_TOOL_PRESENCE_DELAY_MS);
2135
+ state.presenceTimer.unref?.();
2136
+ }
2137
+ clearRealtimeToolPresenceTimer(state) {
2138
+ if (!state.presenceTimer)
2139
+ return;
2140
+ clearTimeout(state.presenceTimer);
2141
+ state.presenceTimer = null;
2142
+ }
2143
+ clearRealtimeToolPresenceTimers() {
2144
+ for (const state of this.toolResponses.values())
2145
+ this.clearRealtimeToolPresenceTimer(state);
2146
+ }
2010
2147
  async runRealtimeTool(event) {
2011
2148
  const name = typeof event.name === "string" ? event.name : "";
2012
2149
  const callId = typeof event.call_id === "string" ? event.call_id : "";
@@ -2017,6 +2154,8 @@ class TwilioOpenAIRealtimeMediaStreamSession {
2017
2154
  const coordinated = !!toolState;
2018
2155
  if (name === "voice_end_call" && toolState)
2019
2156
  toolState.suppressFollowup = true;
2157
+ if (toolState && !toolState.suppressFollowup)
2158
+ this.scheduleRealtimeToolPresence(responseId, toolState);
2020
2159
  let output;
2021
2160
  try {
2022
2161
  const args = parseToolArguments(typeof event.arguments === "string" ? event.arguments : "");
@@ -2057,14 +2196,18 @@ class TwilioOpenAIRealtimeMediaStreamSession {
2057
2196
  }
2058
2197
  }
2059
2198
  noteRealtimeResponseCreated(event) {
2199
+ this.realtimeResponseCreateInFlight = null;
2200
+ this.untrackedActiveRealtimeResponse = false;
2201
+ this.clearUntrackedActiveRealtimeResponseTimer();
2060
2202
  const responseId = realtimeResponseId(event);
2061
2203
  if (responseId)
2062
2204
  this.activeRealtimeResponseId = responseId;
2063
2205
  }
2064
- noteRealtimeResponseDone(responseId) {
2065
- if (!responseId || this.activeRealtimeResponseId === responseId) {
2066
- this.activeRealtimeResponseId = null;
2067
- }
2206
+ noteRealtimeResponseDone(_responseId) {
2207
+ this.realtimeResponseCreateInFlight = null;
2208
+ this.untrackedActiveRealtimeResponse = false;
2209
+ this.clearUntrackedActiveRealtimeResponseTimer();
2210
+ this.activeRealtimeResponseId = null;
2068
2211
  this.responseCreateHoldUntilMs = Math.max(this.responseCreateHoldUntilMs, Date.now() + OPENAI_REALTIME_RESPONSE_CREATE_GRACE_MS);
2069
2212
  this.schedulePendingRealtimeResponse(OPENAI_REALTIME_RESPONSE_CREATE_GRACE_MS);
2070
2213
  }
@@ -2072,15 +2215,21 @@ class TwilioOpenAIRealtimeMediaStreamSession {
2072
2215
  if (this.closed)
2073
2216
  return;
2074
2217
  const waitMs = Math.max(0, this.responseCreateHoldUntilMs - Date.now());
2075
- if (this.activeRealtimeResponseId || waitMs > 0) {
2076
- const pendingResponse = response ?? this.pendingRealtimeResponse?.response;
2077
- this.pendingRealtimeResponse = pendingResponse ? { response: pendingResponse } : {};
2078
- if (!this.activeRealtimeResponseId)
2218
+ if (this.realtimeResponseIsBusy() || waitMs > 0) {
2219
+ this.holdRealtimeResponse(response ? { response } : {});
2220
+ if (!this.realtimeResponseIsBusy())
2079
2221
  this.schedulePendingRealtimeResponse(waitMs);
2080
2222
  return;
2081
2223
  }
2082
2224
  this.sendRealtimeResponseCreate(response ? { response } : {});
2083
2225
  }
2226
+ realtimeResponseIsBusy() {
2227
+ return !!this.activeRealtimeResponseId || !!this.realtimeResponseCreateInFlight || this.untrackedActiveRealtimeResponse;
2228
+ }
2229
+ holdRealtimeResponse(request) {
2230
+ const pendingResponse = request.response ?? this.pendingRealtimeResponse?.response;
2231
+ this.pendingRealtimeResponse = pendingResponse ? { response: pendingResponse } : {};
2232
+ }
2084
2233
  schedulePendingRealtimeResponse(delayMs) {
2085
2234
  if (!this.pendingRealtimeResponse)
2086
2235
  return;
@@ -2093,7 +2242,7 @@ class TwilioOpenAIRealtimeMediaStreamSession {
2093
2242
  this.pendingRealtimeResponseTimer.unref?.();
2094
2243
  }
2095
2244
  flushPendingRealtimeResponse() {
2096
- if (!this.pendingRealtimeResponse || this.closed || this.activeRealtimeResponseId)
2245
+ if (!this.pendingRealtimeResponse || this.closed || this.realtimeResponseIsBusy())
2097
2246
  return;
2098
2247
  const waitMs = Math.max(0, this.responseCreateHoldUntilMs - Date.now());
2099
2248
  if (waitMs > 0) {
@@ -2105,11 +2254,38 @@ class TwilioOpenAIRealtimeMediaStreamSession {
2105
2254
  this.sendRealtimeResponseCreate(pending);
2106
2255
  }
2107
2256
  sendRealtimeResponseCreate(request) {
2257
+ this.realtimeResponseCreateInFlight = request;
2108
2258
  this.sendOpenAI({
2109
2259
  type: "response.create",
2110
2260
  ...(request.response ? { response: request.response } : {}),
2111
2261
  });
2112
2262
  }
2263
+ noteRealtimeResponseConflict() {
2264
+ const inFlight = this.realtimeResponseCreateInFlight;
2265
+ this.realtimeResponseCreateInFlight = null;
2266
+ this.untrackedActiveRealtimeResponse = true;
2267
+ if (inFlight)
2268
+ this.holdRealtimeResponse(inFlight);
2269
+ this.scheduleUntrackedActiveRealtimeResponseFallback();
2270
+ }
2271
+ scheduleUntrackedActiveRealtimeResponseFallback() {
2272
+ this.clearUntrackedActiveRealtimeResponseTimer();
2273
+ this.untrackedActiveRealtimeResponseTimer = setTimeout(() => {
2274
+ this.untrackedActiveRealtimeResponseTimer = null;
2275
+ if (this.closed || !this.untrackedActiveRealtimeResponse)
2276
+ return;
2277
+ this.untrackedActiveRealtimeResponse = false;
2278
+ this.responseCreateHoldUntilMs = Math.max(this.responseCreateHoldUntilMs, Date.now() + OPENAI_REALTIME_RESPONSE_CREATE_GRACE_MS);
2279
+ this.schedulePendingRealtimeResponse(OPENAI_REALTIME_RESPONSE_CREATE_GRACE_MS);
2280
+ }, OPENAI_REALTIME_RESPONSE_CREATE_CONFLICT_BACKOFF_MS);
2281
+ this.untrackedActiveRealtimeResponseTimer.unref?.();
2282
+ }
2283
+ clearUntrackedActiveRealtimeResponseTimer() {
2284
+ if (!this.untrackedActiveRealtimeResponseTimer)
2285
+ return;
2286
+ clearTimeout(this.untrackedActiveRealtimeResponseTimer);
2287
+ this.untrackedActiveRealtimeResponseTimer = null;
2288
+ }
2113
2289
  flushPendingAudio() {
2114
2290
  const pending = this.pendingAudioPayloads.splice(0);
2115
2291
  for (const payload of pending) {
@@ -2229,6 +2405,9 @@ class TwilioOpenAIRealtimeMediaStreamSession {
2229
2405
  clearTimeout(this.pendingRealtimeResponseTimer);
2230
2406
  this.pendingRealtimeResponseTimer = null;
2231
2407
  }
2408
+ this.clearPendingUserTurnResponse();
2409
+ this.clearRealtimeToolPresenceTimers();
2410
+ this.clearUntrackedActiveRealtimeResponseTimer();
2232
2411
  this.lifecycle?.onClose?.(this, { callSid: this.callSid, outboundId: this.outboundId });
2233
2412
  (0, runtime_1.emitNervesEvent)({
2234
2413
  component: "senses",
@@ -2257,12 +2436,18 @@ class OpenAISipPhoneSession {
2257
2436
  autoResponsesSuppressedForAmd = false;
2258
2437
  openaiWs = null;
2259
2438
  toolContext;
2439
+ friendStore;
2440
+ resolvedContext;
2260
2441
  sessionMessages = [];
2261
2442
  toolResponses = new Map();
2262
2443
  completedRealtimeResponseIds = new Set();
2263
2444
  activeRealtimeResponseId = null;
2445
+ realtimeResponseCreateInFlight = null;
2446
+ untrackedActiveRealtimeResponse = false;
2447
+ untrackedActiveRealtimeResponseTimer = null;
2264
2448
  pendingRealtimeResponse = null;
2265
2449
  pendingRealtimeResponseTimer = null;
2450
+ pendingUserTurnResponseTimer = null;
2266
2451
  responseCreateHoldUntilMs = 0;
2267
2452
  constructor(options, metadata, registry) {
2268
2453
  this.options = options;
@@ -2283,11 +2468,16 @@ class OpenAISipPhoneSession {
2283
2468
  throw new Error("OpenAI Realtime API key is not configured");
2284
2469
  if (!sip)
2285
2470
  throw new Error("OpenAI SIP options are not configured");
2286
- this.friendId = this.metadata.friendId
2287
- || this.options.defaultFriendId?.trim()
2288
- || voiceFriendId(this.options, this.metadata.from, this.metadata.callId);
2471
+ const voiceContext = await resolveVoiceFriendContext(this.options, {
2472
+ friendId: this.metadata.friendId || this.options.defaultFriendId?.trim(),
2473
+ remotePhone: this.metadata.from || undefined,
2474
+ callSid: this.metadata.callId,
2475
+ });
2476
+ this.friendId = voiceContext.friendId;
2477
+ this.friendStore = voiceContext.friendStore;
2478
+ this.resolvedContext = voiceContext.resolved;
2289
2479
  this.sessionKey = twilioPhoneVoiceSessionKey({
2290
- defaultFriendId: this.friendId || this.options.defaultFriendId,
2480
+ defaultFriendId: this.friendId,
2291
2481
  from: this.metadata.from,
2292
2482
  to: this.metadata.to,
2293
2483
  callSid: this.metadata.callId,
@@ -2329,7 +2519,7 @@ class OpenAISipPhoneSession {
2329
2519
  ];
2330
2520
  if (this.closed || this.outboundAmdStopped())
2331
2521
  return;
2332
- await this.acceptOpenAISipCall(realtime, sip, instructions, tools, this.autoResponsesSuppressedForAmd);
2522
+ await this.acceptOpenAISipCall(realtime, sip, instructions, tools);
2333
2523
  if (this.closed || this.outboundAmdStopped())
2334
2524
  return;
2335
2525
  this.openControlWebSocket(realtime, sip, fullConfigPromise, usedBootstrap);
@@ -2370,14 +2560,14 @@ class OpenAISipPhoneSession {
2370
2560
  const answeredBy = job.answeredBy?.trim();
2371
2561
  if (nonHumanAnsweredStatus(answeredBy) || job.status === "voicemail" || job.status === "fax")
2372
2562
  return "reject";
2373
- if (answeredBy?.toLowerCase() === "human")
2374
- return "send";
2375
- return "hold";
2563
+ // Silence after pickup feels broken. Start the greeting immediately unless
2564
+ // Twilio has already positively identified a machine/fax answer.
2565
+ return "send";
2376
2566
  }
2377
2567
  outboundAmdStopped() {
2378
2568
  return this.outboundAmdState === "nonhuman" || this.outboundAmdState === "timeout";
2379
2569
  }
2380
- async acceptOpenAISipCall(realtime, sip, instructions, tools, suppressAutoResponsesForAmd = false) {
2570
+ async acceptOpenAISipCall(realtime, sip, instructions, tools) {
2381
2571
  const fetchImpl = sip.fetch ?? fetch;
2382
2572
  const response = await fetchImpl(openAISipCallActionUrl(sip, this.metadata.callId, "accept"), {
2383
2573
  method: "POST",
@@ -2393,7 +2583,7 @@ class OpenAISipPhoneSession {
2393
2583
  input: {
2394
2584
  noise_reduction: realtimeNoiseReductionConfig(realtime),
2395
2585
  transcription: { model: OPENAI_REALTIME_DEFAULT_TRANSCRIPTION_MODEL },
2396
- turn_detection: realtimeTurnDetectionConfig(realtime, suppressAutoResponsesForAmd ? { createResponse: false } : {}),
2586
+ turn_detection: realtimeTurnDetectionConfig(realtime, { createResponse: false, interruptResponse: false }),
2397
2587
  },
2398
2588
  output: realtimeOutputAudioConfig(realtime),
2399
2589
  },
@@ -2509,6 +2699,7 @@ class OpenAISipPhoneSession {
2509
2699
  const realtimeSystem = await buildRealtimeVoiceInstructions({
2510
2700
  agentName: this.options.agentName,
2511
2701
  agentRoot,
2702
+ friend: this.resolvedContext?.friend,
2512
2703
  priorTranscript: prior,
2513
2704
  realtimeVoice: this.options.openaiRealtime?.voice,
2514
2705
  realtimeVoiceStyle: this.options.openaiRealtime?.voiceStyle,
@@ -2528,6 +2719,8 @@ class OpenAISipPhoneSession {
2528
2719
  return;
2529
2720
  this.toolContext = {
2530
2721
  signin: async () => undefined,
2722
+ ...(this.resolvedContext ? { context: this.resolvedContext } : {}),
2723
+ ...(this.friendStore ? { friendStore: this.friendStore } : {}),
2531
2724
  voiceCall: {
2532
2725
  requestEnd: () => this.requestHangupFromTool(),
2533
2726
  playAudio: (request) => this.playRealtimeAudioCue(request),
@@ -2535,16 +2728,18 @@ class OpenAISipPhoneSession {
2535
2728
  };
2536
2729
  }
2537
2730
  async buildRealtimeTools() {
2538
- const agentRoot = this.options.agentRoot ?? (0, identity_1.getAgentRoot)(this.options.agentName);
2539
- const friendsPath = path.join(agentRoot, "friends");
2540
- const friendStore = new store_file_1.FileFriendStore(friendsPath);
2541
- const resolver = new resolver_1.FriendResolver(friendStore, {
2542
- provider: "local",
2543
- externalId: this.friendId,
2544
- displayName: this.friendId,
2545
- channel: "voice",
2546
- });
2547
- const resolved = await resolver.resolve();
2731
+ if (!this.resolvedContext || !this.friendStore) {
2732
+ const voiceContext = await resolveVoiceFriendContext(this.options, {
2733
+ friendId: this.friendId,
2734
+ remotePhone: this.metadata.from || undefined,
2735
+ callSid: this.metadata.callId,
2736
+ });
2737
+ this.friendId = voiceContext.friendId;
2738
+ this.friendStore = voiceContext.friendStore;
2739
+ this.resolvedContext = voiceContext.resolved;
2740
+ }
2741
+ const resolved = this.resolvedContext;
2742
+ const friendStore = this.friendStore;
2548
2743
  this.toolContext = {
2549
2744
  signin: async () => undefined,
2550
2745
  context: resolved,
@@ -2697,7 +2892,7 @@ class OpenAISipPhoneSession {
2697
2892
  type: "realtime",
2698
2893
  audio: {
2699
2894
  input: {
2700
- turn_detection: realtimeTurnDetectionConfig(realtime),
2895
+ turn_detection: realtimeTurnDetectionConfig(realtime, { createResponse: false, interruptResponse: false }),
2701
2896
  },
2702
2897
  },
2703
2898
  },
@@ -2796,9 +2991,13 @@ class OpenAISipPhoneSession {
2796
2991
  this.noteRealtimeResponseCreated(event);
2797
2992
  return;
2798
2993
  }
2994
+ if (type === "input_audio_buffer.speech_started") {
2995
+ this.clearPendingUserTurnResponse();
2996
+ return;
2997
+ }
2799
2998
  if (type === "conversation.item.input_audio_transcription.completed" && typeof event.transcript === "string") {
2800
2999
  this.recordOutboundAmdTranscriptCandidate(event.transcript);
2801
- this.appendTranscript("user", event.transcript);
3000
+ this.handleUserTranscript(event.transcript);
2802
3001
  return;
2803
3002
  }
2804
3003
  if (type === "response.output_audio_transcript.done" && typeof event.transcript === "string") {
@@ -2818,6 +3017,7 @@ class OpenAISipPhoneSession {
2818
3017
  return;
2819
3018
  }
2820
3019
  if (type === "error") {
3020
+ this.handleRealtimeError(event);
2821
3021
  (0, runtime_1.emitNervesEvent)({
2822
3022
  level: "error",
2823
3023
  component: "senses",
@@ -2827,6 +3027,40 @@ class OpenAISipPhoneSession {
2827
3027
  });
2828
3028
  }
2829
3029
  }
3030
+ handleRealtimeError(event) {
3031
+ const error = event.error;
3032
+ if (!error || typeof error !== "object" || Array.isArray(error))
3033
+ return;
3034
+ const code = stringField(error.code);
3035
+ if (code !== "conversation_already_has_active_response")
3036
+ return;
3037
+ this.noteRealtimeResponseConflict();
3038
+ }
3039
+ handleUserTranscript(transcript) {
3040
+ const content = transcript.trim();
3041
+ if (!content)
3042
+ return;
3043
+ this.appendTranscript("user", content);
3044
+ this.scheduleUserTurnResponse();
3045
+ }
3046
+ scheduleUserTurnResponse() {
3047
+ if (this.closed)
3048
+ return;
3049
+ this.clearPendingUserTurnResponse();
3050
+ this.pendingUserTurnResponseTimer = setTimeout(() => {
3051
+ this.pendingUserTurnResponseTimer = null;
3052
+ if (this.closed)
3053
+ return;
3054
+ this.requestRealtimeResponse();
3055
+ }, OPENAI_REALTIME_USER_TURN_RESPONSE_DELAY_MS);
3056
+ this.pendingUserTurnResponseTimer.unref?.();
3057
+ }
3058
+ clearPendingUserTurnResponse() {
3059
+ if (!this.pendingUserTurnResponseTimer)
3060
+ return;
3061
+ clearTimeout(this.pendingUserTurnResponseTimer);
3062
+ this.pendingUserTurnResponseTimer = null;
3063
+ }
2830
3064
  registerRealtimeToolResponse(responseId, callId) {
2831
3065
  if (!responseId)
2832
3066
  return undefined;
@@ -2836,6 +3070,8 @@ class OpenAISipPhoneSession {
2836
3070
  responseDone: this.completedRealtimeResponseIds.has(responseId),
2837
3071
  followupRequested: false,
2838
3072
  suppressFollowup: false,
3073
+ presenceRequested: false,
3074
+ presenceTimer: null,
2839
3075
  };
2840
3076
  state.pendingCallIds.add(callId);
2841
3077
  if (!existing)
@@ -2849,6 +3085,8 @@ class OpenAISipPhoneSession {
2849
3085
  if (!state)
2850
3086
  return false;
2851
3087
  state.pendingCallIds.delete(callId);
3088
+ if (state.pendingCallIds.size === 0)
3089
+ this.clearRealtimeToolPresenceTimer(state);
2852
3090
  return this.maybeCreateRealtimeToolFollowup(responseId, state);
2853
3091
  }
2854
3092
  completeRealtimeToolResponse(responseId) {
@@ -2867,6 +3105,7 @@ class OpenAISipPhoneSession {
2867
3105
  return false;
2868
3106
  state.followupRequested = true;
2869
3107
  this.toolResponses.delete(responseId);
3108
+ this.clearRealtimeToolPresenceTimer(state);
2870
3109
  if (state.suppressFollowup) {
2871
3110
  this.completeHangupIfReady("tool_response_done");
2872
3111
  return true;
@@ -2874,6 +3113,31 @@ class OpenAISipPhoneSession {
2874
3113
  this.requestRealtimeResponse();
2875
3114
  return true;
2876
3115
  }
3116
+ scheduleRealtimeToolPresence(responseId, state) {
3117
+ if (!responseId || state.presenceRequested || state.presenceTimer)
3118
+ return;
3119
+ state.presenceTimer = setTimeout(() => {
3120
+ state.presenceTimer = null;
3121
+ const current = this.toolResponses.get(responseId);
3122
+ if (this.closed || current !== state || state.pendingCallIds.size === 0 || state.suppressFollowup)
3123
+ return;
3124
+ state.presenceRequested = true;
3125
+ this.requestRealtimeResponse({
3126
+ instructions: "A tool is taking a moment. Say one very short natural holding phrase under six words, then stop speaking.",
3127
+ });
3128
+ }, OPENAI_REALTIME_TOOL_PRESENCE_DELAY_MS);
3129
+ state.presenceTimer.unref?.();
3130
+ }
3131
+ clearRealtimeToolPresenceTimer(state) {
3132
+ if (!state.presenceTimer)
3133
+ return;
3134
+ clearTimeout(state.presenceTimer);
3135
+ state.presenceTimer = null;
3136
+ }
3137
+ clearRealtimeToolPresenceTimers() {
3138
+ for (const state of this.toolResponses.values())
3139
+ this.clearRealtimeToolPresenceTimer(state);
3140
+ }
2877
3141
  async runRealtimeTool(event) {
2878
3142
  const name = typeof event.name === "string" ? event.name : "";
2879
3143
  const callId = typeof event.call_id === "string" ? event.call_id : "";
@@ -2884,6 +3148,8 @@ class OpenAISipPhoneSession {
2884
3148
  const coordinated = !!toolState;
2885
3149
  if (name === "voice_end_call" && toolState)
2886
3150
  toolState.suppressFollowup = true;
3151
+ if (toolState && !toolState.suppressFollowup)
3152
+ this.scheduleRealtimeToolPresence(responseId, toolState);
2887
3153
  let output;
2888
3154
  try {
2889
3155
  const args = parseToolArguments(typeof event.arguments === "string" ? event.arguments : "");
@@ -2924,14 +3190,18 @@ class OpenAISipPhoneSession {
2924
3190
  }
2925
3191
  }
2926
3192
  noteRealtimeResponseCreated(event) {
3193
+ this.realtimeResponseCreateInFlight = null;
3194
+ this.untrackedActiveRealtimeResponse = false;
3195
+ this.clearUntrackedActiveRealtimeResponseTimer();
2927
3196
  const responseId = realtimeResponseId(event);
2928
3197
  if (responseId)
2929
3198
  this.activeRealtimeResponseId = responseId;
2930
3199
  }
2931
- noteRealtimeResponseDone(responseId) {
2932
- if (!responseId || this.activeRealtimeResponseId === responseId) {
2933
- this.activeRealtimeResponseId = null;
2934
- }
3200
+ noteRealtimeResponseDone(_responseId) {
3201
+ this.realtimeResponseCreateInFlight = null;
3202
+ this.untrackedActiveRealtimeResponse = false;
3203
+ this.clearUntrackedActiveRealtimeResponseTimer();
3204
+ this.activeRealtimeResponseId = null;
2935
3205
  this.responseCreateHoldUntilMs = Math.max(this.responseCreateHoldUntilMs, Date.now() + OPENAI_REALTIME_RESPONSE_CREATE_GRACE_MS);
2936
3206
  this.schedulePendingRealtimeResponse(OPENAI_REALTIME_RESPONSE_CREATE_GRACE_MS);
2937
3207
  }
@@ -2939,15 +3209,21 @@ class OpenAISipPhoneSession {
2939
3209
  if (this.closed)
2940
3210
  return;
2941
3211
  const waitMs = Math.max(0, this.responseCreateHoldUntilMs - Date.now());
2942
- if (this.activeRealtimeResponseId || waitMs > 0) {
2943
- const pendingResponse = response ?? this.pendingRealtimeResponse?.response;
2944
- this.pendingRealtimeResponse = pendingResponse ? { response: pendingResponse } : {};
2945
- if (!this.activeRealtimeResponseId)
3212
+ if (this.realtimeResponseIsBusy() || waitMs > 0) {
3213
+ this.holdRealtimeResponse(response ? { response } : {});
3214
+ if (!this.realtimeResponseIsBusy())
2946
3215
  this.schedulePendingRealtimeResponse(waitMs);
2947
3216
  return;
2948
3217
  }
2949
3218
  this.sendRealtimeResponseCreate(response ? { response } : {});
2950
3219
  }
3220
+ realtimeResponseIsBusy() {
3221
+ return !!this.activeRealtimeResponseId || !!this.realtimeResponseCreateInFlight || this.untrackedActiveRealtimeResponse;
3222
+ }
3223
+ holdRealtimeResponse(request) {
3224
+ const pendingResponse = request.response ?? this.pendingRealtimeResponse?.response;
3225
+ this.pendingRealtimeResponse = pendingResponse ? { response: pendingResponse } : {};
3226
+ }
2951
3227
  schedulePendingRealtimeResponse(delayMs) {
2952
3228
  if (!this.pendingRealtimeResponse)
2953
3229
  return;
@@ -2960,7 +3236,7 @@ class OpenAISipPhoneSession {
2960
3236
  this.pendingRealtimeResponseTimer.unref?.();
2961
3237
  }
2962
3238
  flushPendingRealtimeResponse() {
2963
- if (!this.pendingRealtimeResponse || this.closed || this.activeRealtimeResponseId)
3239
+ if (!this.pendingRealtimeResponse || this.closed || this.realtimeResponseIsBusy())
2964
3240
  return;
2965
3241
  const waitMs = Math.max(0, this.responseCreateHoldUntilMs - Date.now());
2966
3242
  if (waitMs > 0) {
@@ -2972,11 +3248,38 @@ class OpenAISipPhoneSession {
2972
3248
  this.sendRealtimeResponseCreate(pending);
2973
3249
  }
2974
3250
  sendRealtimeResponseCreate(request) {
3251
+ this.realtimeResponseCreateInFlight = request;
2975
3252
  this.sendOpenAI({
2976
3253
  type: "response.create",
2977
3254
  ...(request.response ? { response: request.response } : {}),
2978
3255
  });
2979
3256
  }
3257
+ noteRealtimeResponseConflict() {
3258
+ const inFlight = this.realtimeResponseCreateInFlight;
3259
+ this.realtimeResponseCreateInFlight = null;
3260
+ this.untrackedActiveRealtimeResponse = true;
3261
+ if (inFlight)
3262
+ this.holdRealtimeResponse(inFlight);
3263
+ this.scheduleUntrackedActiveRealtimeResponseFallback();
3264
+ }
3265
+ scheduleUntrackedActiveRealtimeResponseFallback() {
3266
+ this.clearUntrackedActiveRealtimeResponseTimer();
3267
+ this.untrackedActiveRealtimeResponseTimer = setTimeout(() => {
3268
+ this.untrackedActiveRealtimeResponseTimer = null;
3269
+ if (this.closed || !this.untrackedActiveRealtimeResponse)
3270
+ return;
3271
+ this.untrackedActiveRealtimeResponse = false;
3272
+ this.responseCreateHoldUntilMs = Math.max(this.responseCreateHoldUntilMs, Date.now() + OPENAI_REALTIME_RESPONSE_CREATE_GRACE_MS);
3273
+ this.schedulePendingRealtimeResponse(OPENAI_REALTIME_RESPONSE_CREATE_GRACE_MS);
3274
+ }, OPENAI_REALTIME_RESPONSE_CREATE_CONFLICT_BACKOFF_MS);
3275
+ this.untrackedActiveRealtimeResponseTimer.unref?.();
3276
+ }
3277
+ clearUntrackedActiveRealtimeResponseTimer() {
3278
+ if (!this.untrackedActiveRealtimeResponseTimer)
3279
+ return;
3280
+ clearTimeout(this.untrackedActiveRealtimeResponseTimer);
3281
+ this.untrackedActiveRealtimeResponseTimer = null;
3282
+ }
2980
3283
  requestHangupFromTool() {
2981
3284
  if (this.closed)
2982
3285
  return;
@@ -3050,6 +3353,9 @@ class OpenAISipPhoneSession {
3050
3353
  clearTimeout(this.pendingRealtimeResponseTimer);
3051
3354
  this.pendingRealtimeResponseTimer = null;
3052
3355
  }
3356
+ this.clearPendingUserTurnResponse();
3357
+ this.clearRealtimeToolPresenceTimers();
3358
+ this.clearUntrackedActiveRealtimeResponseTimer();
3053
3359
  (0, runtime_1.emitNervesEvent)({
3054
3360
  component: "senses",
3055
3361
  event: "senses.voice_openai_sip_call_stop",
@@ -3805,7 +4111,7 @@ async function handleOutgoing(options, basePath, outboundId, params, jobs) {
3805
4111
  Reason: job.reason,
3806
4112
  InitialAudio: encodeVoiceCallAudioCustomParameter(job.initialAudio),
3807
4113
  };
3808
- if (usesOpenAISipConversationEngine(options)) {
4114
+ if (usesOpenAISipOutboundConversationEngine(options)) {
3809
4115
  (0, runtime_1.emitNervesEvent)({
3810
4116
  component: "senses",
3811
4117
  event: "senses.voice_twilio_sip_connect",
@@ -3824,8 +4130,8 @@ async function handleOutgoing(options, basePath, outboundId, params, jobs) {
3824
4130
  })));
3825
4131
  }
3826
4132
  if (normalizeTwilioPhoneTransportMode(options.transportMode) === "media-stream") {
3827
- if (usesOpenAIRealtimeConversationEngine(options)) {
3828
- return xmlResponse(mediaStreamTwiml(options, basePath, { From: from, To: to }, undefined, streamParams));
4133
+ if (usesOpenAIRealtimeOutboundConversationEngine(options)) {
4134
+ return xmlResponse(mediaStreamTwiml(options, basePath, { From: from, To: to }, undefined, streamParams, "openai-realtime"));
3829
4135
  }
3830
4136
  try {
3831
4137
  await fs.mkdir(callDir, { recursive: true });
@@ -4228,7 +4534,7 @@ function createTwilioPhoneBridge(options) {
4228
4534
  byOutboundId: new Map(),
4229
4535
  };
4230
4536
  const activeSipSessions = new ActiveOpenAISipSessions();
4231
- mediaStreams.on("connection", (ws) => {
4537
+ mediaStreams.on("connection", (ws, request) => {
4232
4538
  const lifecycle = {
4233
4539
  onIdentityChange: (activeSession, identity) => {
4234
4540
  if (identity.callSid)
@@ -4245,7 +4551,8 @@ function createTwilioPhoneBridge(options) {
4245
4551
  }
4246
4552
  },
4247
4553
  };
4248
- const session = usesOpenAIRealtimeConversationEngine(options)
4554
+ const streamEngine = mediaStreamRequestedConversationEngine(request.url);
4555
+ const session = streamEngine === "openai-realtime" || (!streamEngine && usesOpenAIRealtimeConversationEngine(options))
4249
4556
  ? new TwilioOpenAIRealtimeMediaStreamSession(ws, options, lifecycle)
4250
4557
  : new TwilioMediaStreamSession(ws, options, jobs, lifecycle);
4251
4558
  session.attach();
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ouro.bot/cli",
3
- "version": "0.1.0-alpha.582",
3
+ "version": "0.1.0-alpha.583",
4
4
  "main": "dist/heart/daemon/ouro-entry.js",
5
5
  "bin": {
6
6
  "cli": "dist/heart/daemon/ouro-bot-entry.js",