@ouro.bot/cli 0.1.0-alpha.582 → 0.1.0-alpha.584

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/changelog.json CHANGED
@@ -1,6 +1,24 @@
1
1
  {
2
2
  "_note": "This changelog is maintained as part of the PR/version-bump workflow. Agent-curated, not auto-generated. Agents read this file directly via read_file to understand what changed between versions.",
3
3
  "versions": [
4
+ {
5
+ "version": "0.1.0-alpha.584",
6
+ "changes": [
7
+ "Voice now has a transport-aware Realtime eval kernel that grades deterministic call timelines for first-audio latency, user-turn response latency, tool holding phrases, barge-in clearing/truncation, friend context, transcript continuity, and hangup control before live phone testing.",
8
+ "`npm run voice:eval` runs built-in no-human voice scenarios, including a healthy path and an expected known-bad latency canary, and emits the normal nerves events expected of executable sense entrypoints so future Voice transport work can prove synchronous behavior without requiring a human to answer calls."
9
+ ]
10
+ },
11
+ {
12
+ "version": "0.1.0-alpha.583",
13
+ "changes": [
14
+ "Outbound SIP phone calls now start the Realtime greeting immediately after answer unless Twilio has already positively identified voicemail or fax, preventing humans from hearing post-pickup silence when async AMD returns unknown.",
15
+ "Twilio phone voice now defaults outbound calls to OpenAI Realtime Media Streams when inbound calls use OpenAI SIP on a Media Stream machine, while still allowing `voice.twilioOutboundConversationEngine` overrides, so humans avoid post-pickup SIP ringback.",
16
+ "Realtime voice now resolves phone callers through the canonical friend graph, preferring existing friend ids and otherwise matching normalized phone numbers via `imessage-handle`, so trust-aware tools see the same friend context as text and mail.",
17
+ "Realtime media-stream voice now treats empty caller metadata as absent and preserves local voice friend identities, keeping outbound and provider-simulated calls attached to the intended friend instead of inventing a blank phone identity.",
18
+ "Realtime voice response creation now backs off and retries after provider active-response conflicts, holds user turns under Ouro floor-control instead of provider auto-response, and long-running voice tools can emit one tiny holding phrase instead of leaving seconds of unexplained silence.",
19
+ "Realtime voice VAD and local barge-in thresholds are less twitchy by default, reducing accidental interruption from tiny room sounds while preserving deliberate caller interruption."
20
+ ]
21
+ },
4
22
  {
5
23
  "version": "0.1.0-alpha.582",
6
24
  "changes": [
@@ -26,3 +26,4 @@ __exportStar(require("./playback"), exports);
26
26
  __exportStar(require("./golden-path"), exports);
27
27
  __exportStar(require("./twilio-phone"), exports);
28
28
  __exportStar(require("./twilio-phone-runtime"), exports);
29
+ __exportStar(require("./realtime-eval"), exports);
@@ -0,0 +1,384 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.gradeVoiceRealtimeEvalTimeline = gradeVoiceRealtimeEvalTimeline;
4
+ exports.buildVoiceRealtimeEvalHappyPath = buildVoiceRealtimeEvalHappyPath;
5
+ exports.runBuiltInVoiceRealtimeEvalSuite = runBuiltInVoiceRealtimeEvalSuite;
6
+ exports.summarizeVoiceRealtimeEvalSuite = summarizeVoiceRealtimeEvalSuite;
7
+ const runtime_1 = require("../../nerves/runtime");
8
/**
 * Sanity-checks the inputs of a voice eval run before any grading happens.
 *
 * @param {string} scenarioId - Scenario label; surrounding whitespace is ignored.
 * @param {Array<object>} events - Timeline events; must contain at least one entry.
 * @param {object} expectation - Must carry the five latency budgets
 *   (`maxFirstAssistantAudioMs`, `maxUserTurnResponseMs`, `maxToolPresenceMs`,
 *   `maxBargeInClearMs`, `maxBargeInTruncateMs`) as finite numbers greater than zero.
 * @returns {string} The trimmed scenario id.
 * @throws {Error} When the id is blank, the timeline is empty, or a budget is not a positive finite number.
 */
function validateTimeline(scenarioId, events, expectation) {
    const trimmedId = scenarioId.trim();
    if (trimmedId === "") {
        throw new Error("voice eval scenario id is empty");
    }
    if (events.length === 0) {
        throw new Error("voice eval timeline is empty");
    }
    const latencyBudgets = [
        expectation.maxFirstAssistantAudioMs,
        expectation.maxUserTurnResponseMs,
        expectation.maxToolPresenceMs,
        expectation.maxBargeInClearMs,
        expectation.maxBargeInTruncateMs,
    ];
    for (const budget of latencyBudgets) {
        const isUsable = Number.isFinite(budget) && budget > 0;
        if (!isUsable) {
            throw new Error("voice eval latency budgets must be positive");
        }
    }
    return trimmedId;
}
26
/**
 * Returns a copy of the timeline ordered by ascending `atMs`.
 * The input array is left untouched.
 *
 * @param {Iterable<object>} events - Timeline events carrying a numeric `atMs`.
 * @returns {Array<object>} A new array sorted by timestamp.
 */
function sortedEvents(events) {
    const ordered = Array.from(events);
    ordered.sort((earlier, later) => earlier.atMs - later.atMs);
    return ordered;
}
29
/**
 * Looks up the earliest-positioned event of a given type.
 *
 * @param {Array<object>} events - Timeline events.
 * @param {string} type - Event type to match exactly.
 * @returns {object|undefined} The first matching event, or `undefined` when none matches.
 */
function firstEvent(events, type) {
    const matchesType = (candidate) => candidate.type === type;
    return events.find(matchesType);
}
32
/**
 * Collects every event of a given type, keeping timeline order.
 *
 * @param {Array<object>} events - Timeline events.
 * @param {string} type - Event type to match exactly.
 * @returns {Array<object>} A new array with the matching events (possibly empty).
 */
function allEvents(events, type) {
    const matchesType = (candidate) => candidate.type === type;
    return events.filter(matchesType);
}
35
/**
 * Lower-cases optional text so comparisons can ignore case.
 *
 * @param {string|null|undefined} value - Text that may be absent.
 * @returns {string} The lower-cased text, or an empty string when `value` is null/undefined.
 */
function lowerText(value) {
    if (value === null || value === undefined) {
        return "";
    }
    return value.toLowerCase() ?? "";
}
38
/**
 * Appends one finding record to the findings list, mutating the list in place.
 *
 * @param {Array<object>} findings - Accumulator shared by the graders.
 * @param {object} finding - Finding record to append.
 * @returns {void}
 */
function pushFinding(findings, finding) {
    void findings.push(finding);
}
41
/**
 * Grades time-to-first-audio: the delay between `call.connected` and the first
 * `assistant.audio.started` event at or after it.
 *
 * Audio timestamped before the connect event is ignored, so it can no longer
 * produce a negative latency that silently passes the budget. This matches the
 * other graders, which only consider follow-up events at or after their anchor.
 *
 * @param {Array<object>} events - Timeline events in ascending `atMs` order.
 * @param {object} expectation - Supplies `maxFirstAssistantAudioMs`.
 * @param {Array<object>} findings - Receives `first_audio_missing` or `first_audio_late`.
 * @returns {number|undefined} Latency in ms, or `undefined` when connect or post-connect audio is missing.
 */
function gradeFirstAudio(events, expectation, findings) {
    const connected = events.find((event) => event.type === "call.connected");
    // Without a connect event, fall back to any audio start so the finding can still carry a source.
    const firstAudio = events.find((event) => event.type === "assistant.audio.started"
        && (!connected || event.atMs >= connected.atMs));
    if (!connected || !firstAudio) {
        findings.push({
            code: "first_audio_missing",
            severity: "fail",
            message: "Voice call did not produce assistant audio after connect.",
            source: connected?.source ?? firstAudio?.source,
            atMs: connected?.atMs ?? firstAudio?.atMs,
        });
        return undefined;
    }
    const ttfaMs = firstAudio.atMs - connected.atMs;
    if (ttfaMs > expectation.maxFirstAssistantAudioMs) {
        findings.push({
            code: "first_audio_late",
            severity: "fail",
            message: `First assistant audio started after ${ttfaMs}ms, over the ${expectation.maxFirstAssistantAudioMs}ms budget.`,
            source: firstAudio.source,
            atMs: firstAudio.atMs,
        });
    }
    return ttfaMs;
}
66
/**
 * Grades how quickly a response was requested after the first completed caller transcript.
 *
 * A `response.requested` event qualifies when it is at or after the transcript and,
 * if the transcript carries a `correlationId`, shares that id.
 *
 * @param {Array<object>} events - Timeline events in ascending `atMs` order.
 * @param {object} expectation - Supplies `maxUserTurnResponseMs`.
 * @param {Array<object>} findings - Receives `user_response_missing` or `user_response_late`.
 * @returns {number|undefined} Latency in ms; `undefined` when there is no caller transcript or no qualifying response.
 */
function gradeFirstUserResponse(events, expectation, findings) {
    const transcript = firstEvent(events, "user.transcript.done");
    if (!transcript) {
        return undefined;
    }
    const answersTranscript = (candidate) => {
        if (candidate.type !== "response.requested") {
            return false;
        }
        if (!(candidate.atMs >= transcript.atMs)) {
            return false;
        }
        // An uncorrelated transcript accepts any later response request.
        return !transcript.correlationId || candidate.correlationId === transcript.correlationId;
    };
    const request = events.find(answersTranscript);
    if (!request) {
        pushFinding(findings, {
            code: "user_response_missing",
            severity: "fail",
            message: "No voice response was requested after the caller transcript completed.",
            source: transcript.source,
            atMs: transcript.atMs,
        });
        return undefined;
    }
    const waitedMs = request.atMs - transcript.atMs;
    const budgetExceeded = waitedMs > expectation.maxUserTurnResponseMs;
    if (budgetExceeded) {
        pushFinding(findings, {
            code: "user_response_late",
            severity: "fail",
            message: `Voice response was requested after ${waitedMs}ms, over the ${expectation.maxUserTurnResponseMs}ms budget.`,
            source: request.source,
            atMs: request.atMs,
        });
    }
    return waitedMs;
}
95
/**
 * Grades whether the first tool call was covered by a spoken holding phrase in time.
 *
 * A `tool.holding.started` event qualifies when it is at or after the tool call and,
 * if the tool call carries a `correlationId`, shares that id.
 *
 * @param {Array<object>} events - Timeline events in ascending `atMs` order.
 * @param {object} expectation - Supplies `maxToolPresenceMs`.
 * @param {Array<object>} findings - Receives `tool_presence_missing` or `tool_presence_late`.
 * @returns {number|undefined} Latency in ms; `undefined` when there is no tool call or no qualifying holding phrase.
 */
function gradeToolPresence(events, expectation, findings) {
    const call = firstEvent(events, "tool.call.started");
    if (!call) {
        return undefined;
    }
    const coversCall = (candidate) => {
        if (candidate.type !== "tool.holding.started") {
            return false;
        }
        if (!(candidate.atMs >= call.atMs)) {
            return false;
        }
        // An uncorrelated tool call accepts any later holding phrase.
        return !call.correlationId || candidate.correlationId === call.correlationId;
    };
    const phrase = events.find(coversCall);
    if (!phrase) {
        pushFinding(findings, {
            code: "tool_presence_missing",
            severity: "fail",
            message: "Tool call did not produce a short voice holding phrase.",
            source: call.source,
            atMs: call.atMs,
        });
        return undefined;
    }
    const silenceMs = phrase.atMs - call.atMs;
    const budgetExceeded = silenceMs > expectation.maxToolPresenceMs;
    if (budgetExceeded) {
        pushFinding(findings, {
            code: "tool_presence_late",
            severity: "fail",
            message: `Tool holding phrase started after ${silenceMs}ms, over the ${expectation.maxToolPresenceMs}ms budget.`,
            source: phrase.source,
            atMs: phrase.atMs,
        });
    }
    return silenceMs;
}
124
/**
 * Grades the reaction to the first caller barge-in: transport playback must be
 * cleared and the active response truncated, each within its own budget.
 *
 * @param {Array<object>} events - Timeline events in ascending `atMs` order.
 * @param {object} expectation - Supplies `maxBargeInClearMs` and `maxBargeInTruncateMs`.
 * @param {Array<object>} findings - Receives `barge_in_clear_*` and `barge_in_truncate_*` findings.
 * @returns {object} Metrics with `firstBargeInClearMs` and/or `firstBargeInTruncateMs`
 *   for the reactions that were found; an empty object when no barge-in occurred.
 */
function gradeBargeIn(events, expectation, findings) {
    const interruption = firstEvent(events, "barge_in.detected");
    if (!interruption) {
        return {};
    }
    // One row per required reaction; order fixes both finding order and metric key order.
    const reactions = [
        {
            eventType: "transport.playback_cleared",
            metricKey: "firstBargeInClearMs",
            budgetKey: "maxBargeInClearMs",
            missingCode: "barge_in_clear_missing",
            missingMessage: "Caller barge-in did not clear transport playback.",
            lateCode: "barge_in_clear_late",
            lateMessage: (tookMs, budgetMs) => `Barge-in playback clear took ${tookMs}ms, over the ${budgetMs}ms budget.`,
        },
        {
            eventType: "response.truncated",
            metricKey: "firstBargeInTruncateMs",
            budgetKey: "maxBargeInTruncateMs",
            missingCode: "barge_in_truncate_missing",
            missingMessage: "Caller barge-in did not truncate the active Realtime response.",
            lateCode: "barge_in_truncate_late",
            lateMessage: (tookMs, budgetMs) => `Barge-in response truncation took ${tookMs}ms, over the ${budgetMs}ms budget.`,
        },
    ];
    const metrics = {};
    for (const reaction of reactions) {
        const observed = events.find((event) => event.type === reaction.eventType && event.atMs >= interruption.atMs);
        if (!observed) {
            pushFinding(findings, {
                code: reaction.missingCode,
                severity: "fail",
                message: reaction.missingMessage,
                source: interruption.source,
                atMs: interruption.atMs,
            });
            continue;
        }
        const tookMs = observed.atMs - interruption.atMs;
        metrics[reaction.metricKey] = tookMs;
        if (tookMs > expectation[reaction.budgetKey]) {
            pushFinding(findings, {
                code: reaction.lateCode,
                severity: "fail",
                message: reaction.lateMessage(tookMs, expectation[reaction.budgetKey]),
                source: observed.source,
                atMs: observed.atMs,
            });
        }
    }
    return metrics;
}
175
/**
 * Checks that the first `session.updated` event carrying turn-detection settings
 * has both `createResponse` and `interruptResponse` set to `false`.
 *
 * @param {Array<object>} events - Timeline events in ascending `atMs` order.
 * @param {Array<object>} findings - Receives `manual_floor_control_missing` when the check fails.
 * @returns {void}
 */
function gradeManualFloorControl(events, findings) {
    const configured = allEvents(events, "session.updated").find((event) => event.session?.turnDetection);
    const turnDetection = configured?.session?.turnDetection;
    const autoResponseOff = turnDetection?.createResponse === false;
    const floorIsManual = autoResponseOff && turnDetection.interruptResponse === false;
    if (!floorIsManual) {
        pushFinding(findings, {
            code: "manual_floor_control_missing",
            severity: "fail",
            message: "Realtime session did not disable provider auto-response and provider interruption.",
            source: configured?.source,
            atMs: configured?.atMs,
        });
    }
}
189
/**
 * Checks that the first `voice.context.injected` event matches the required friend
 * id and session key, and that its text contains the required marker (case-insensitive).
 *
 * A missing context event is always reported as a finding. Previously a requirement
 * with no `friendId` combined with a missing context event compared `undefined` to
 * `undefined`, then dereferenced the absent event and threw a TypeError.
 *
 * @param {Array<object>} events - Timeline events in ascending `atMs` order.
 * @param {{friendId: string, sessionKey: string, marker: string}} requirement - Expected identity.
 * @param {Array<object>} findings - Receives `friend_context_mismatch` when the check fails.
 * @returns {void}
 */
function gradeFriendContext(events, requirement, findings) {
    const context = events.find((event) => event.type === "voice.context.injected");
    // Guard the absent-event case explicitly instead of relying on `?.` in the first comparison.
    const matches = context !== undefined
        && context.friendId === requirement.friendId
        && context.sessionKey === requirement.sessionKey
        && (context.text?.toLowerCase() ?? "").includes(requirement.marker.toLowerCase());
    if (matches) {
        return;
    }
    findings.push({
        code: "friend_context_mismatch",
        severity: "fail",
        message: "Voice context did not preserve the expected friend identity, trust marker, and stable session key.",
        source: context?.source,
        atMs: context?.atMs,
    });
}
204
/**
 * Verifies that each required phrase appears (case-insensitively) in at least one
 * completed transcript of the required role.
 *
 * @param {Array<object>} events - Timeline events.
 * @param {Array<{role: string, contains: string}>} requirements - Role "assistant" checks
 *   `assistant.transcript.done`; any other role checks `user.transcript.done`.
 * @param {Array<object>} findings - Receives one `transcript_missing` finding per unmet requirement.
 * @returns {void}
 */
function gradeTranscripts(events, requirements, findings) {
    requirements.forEach((requirement) => {
        let transcriptType = "user.transcript.done";
        if (requirement.role === "assistant") {
            transcriptType = "assistant.transcript.done";
        }
        let spoken = false;
        for (const transcript of allEvents(events, transcriptType)) {
            if (lowerText(transcript.text).includes(requirement.contains.toLowerCase())) {
                spoken = true;
                break;
            }
        }
        if (spoken) {
            return;
        }
        pushFinding(findings, {
            code: "transcript_missing",
            severity: "fail",
            message: `Missing ${requirement.role} transcript containing "${requirement.contains}".`,
        });
    });
}
219
/**
 * Checks that the agent requested a hangup no later than the call ended.
 *
 * A `call.hangup.requested` event only counts when it is at or before the first
 * `call.ended` event (or when the timeline has no `call.ended` at all). Previously
 * any hangup request passed, even one logged after the call had already ended,
 * which contradicted the finding's "before call end" contract.
 *
 * @param {Array<object>} events - Timeline events in ascending `atMs` order.
 * @param {Array<object>} findings - Receives `hangup_missing` when no qualifying request exists.
 * @returns {void}
 */
function gradeHangup(events, findings) {
    const ended = events.find((event) => event.type === "call.ended");
    const hangup = events.find((event) => event.type === "call.hangup.requested"
        && (!ended || event.atMs <= ended.atMs));
    if (hangup) {
        return;
    }
    findings.push({
        code: "hangup_missing",
        severity: "fail",
        message: "Voice eval expected an agent-controlled hangup request before call end.",
        source: ended?.source,
        atMs: ended?.atMs,
    });
}
232
/**
 * Flags the first `response.requested` event that lands strictly inside an active
 * assistant-audio window.
 *
 * A window opens at an `assistant.audio.started` event and closes at the first
 * `assistant.audio.done` event at or after it; a window with no such close stays
 * open. At most one `response_overlap` finding is recorded.
 *
 * The audio windows are computed once up front instead of being re-derived by
 * scanning the whole timeline for every response; results are unchanged.
 *
 * @param {Array<object>} events - Timeline events in ascending `atMs` order.
 * @param {Array<object>} findings - Receives at most one `response_overlap` finding.
 * @returns {void}
 */
function gradeOverlappingResponses(events, findings) {
    const requests = events.filter((event) => event.type === "response.requested");
    if (requests.length === 0) {
        return;
    }
    const audioDone = events.filter((event) => event.type === "assistant.audio.done");
    const audioWindows = events
        .filter((event) => event.type === "assistant.audio.started")
        .map((started) => ({
            startMs: started.atMs,
            done: audioDone.find((event) => event.atMs >= started.atMs),
        }));
    for (const request of requests) {
        const overlapsAudio = audioWindows.some(({ startMs, done }) => request.atMs > startMs
            && (!done || request.atMs < done.atMs));
        if (overlapsAudio) {
            findings.push({
                code: "response_overlap",
                severity: "fail",
                message: "Voice response was requested while assistant audio was still active.",
                source: request.source,
                atMs: request.atMs,
            });
            return;
        }
    }
}
250
/**
 * Lists the distinct `source.transport` values seen in the timeline, sorted
 * with the default array sort. Events without a `source` are skipped.
 *
 * @param {Array<object>} events - Timeline events.
 * @returns {Array<string>} Unique transport names in sorted order.
 */
function collectTransportSources(events) {
    const transports = new Set();
    events.forEach((event) => {
        if (event.source) {
            transports.add(event.source.transport);
        }
    });
    return Array.from(transports).sort();
}
253
/**
 * Grades one voice call timeline against an expectation and returns a report.
 *
 * The timeline is validated, copied into `atMs` order, and run through the latency
 * graders (first audio, first user response, tool presence, barge-in). The optional
 * graders run only when the matching expectation field is truthy; the overlapping-response
 * check always runs. A nerves event is emitted before and after grading.
 *
 * @param {string} scenarioId - Scenario label; trimmed before use.
 * @param {Array<object>} timeline - Unsorted timeline events; not mutated.
 * @param {object} expectation - Latency budgets plus optional `requireManualFloorControl`,
 *   `requireFriendContext`, `requiredTranscripts`, and `requireHangup`.
 * @returns {{scenarioId: string, passed: boolean, findings: Array<object>, metrics: object, transportSources: Array<string>}}
 *   `passed` is true when no finding has severity "fail".
 * @throws {Error} When validation of the id, timeline, or budgets fails.
 */
function gradeVoiceRealtimeEvalTimeline(scenarioId, timeline, expectation) {
    const cleanScenarioId = validateTimeline(scenarioId, timeline, expectation);
    const ordered = sortedEvents(timeline);
    (0, runtime_1.emitNervesEvent)({
        component: "senses",
        event: "senses.voice_realtime_eval_start",
        message: "starting Voice realtime eval timeline grading",
        meta: { scenarioId: cleanScenarioId, events: ordered.length },
    });
    const findings = [];
    // Grader order determines finding order, so the calls stay sequential.
    const ttfaMs = gradeFirstAudio(ordered, expectation, findings);
    const firstUserResponseMs = gradeFirstUserResponse(ordered, expectation, findings);
    const firstToolPresenceMs = gradeToolPresence(ordered, expectation, findings);
    const bargeInMetrics = gradeBargeIn(ordered, expectation, findings);
    const metrics = { ttfaMs, firstUserResponseMs, firstToolPresenceMs, ...bargeInMetrics };
    if (expectation.requireManualFloorControl) {
        gradeManualFloorControl(ordered, findings);
    }
    if (expectation.requireFriendContext) {
        gradeFriendContext(ordered, expectation.requireFriendContext, findings);
    }
    if (expectation.requiredTranscripts) {
        gradeTranscripts(ordered, expectation.requiredTranscripts, findings);
    }
    if (expectation.requireHangup) {
        gradeHangup(ordered, findings);
    }
    gradeOverlappingResponses(ordered, findings);
    const hasFailure = findings.some((finding) => finding.severity === "fail");
    const report = {
        scenarioId: cleanScenarioId,
        passed: !hasFailure,
        findings,
        metrics,
        transportSources: collectTransportSources(ordered),
    };
    (0, runtime_1.emitNervesEvent)({
        component: "senses",
        event: "senses.voice_realtime_eval_end",
        message: "finished Voice realtime eval timeline grading",
        meta: { scenarioId: cleanScenarioId, passed: report.passed, findings: findings.length },
    });
    return report;
}
293
/**
 * Builds the built-in "healthy call" timeline: connect, friend context injection,
 * session update, greeting, one caller turn, one tool call with a holding phrase,
 * one barge-in, and an agent-requested hangup.
 *
 * @returns {Array<object>} A fresh array of 19 events; every event owns its own `source` object.
 */
function buildVoiceRealtimeEvalHappyPath() {
    // Factories rather than shared constants so no two events alias the same source object.
    const sipLeg = () => ({ transport: "openai-sip", id: "sip-call-1" });
    const controlSocket = () => ({ transport: "openai-realtime-control", id: "ws-1" });
    const mediaStream = () => ({ transport: "twilio-media-stream", id: "stream-1" });

    const setup = [
        { type: "call.connected", atMs: 0, source: sipLeg() },
        {
            type: "voice.context.injected",
            atMs: 80,
            friendId: "friend-ari",
            sessionKey: "twilio-phone-friend-ari-via-ouro",
            text: "Resolved voice friend: Ari (friendId=friend-ari, trust=family).",
            source: { transport: "voice-eval" },
        },
        {
            type: "session.updated",
            atMs: 100,
            session: { turnDetection: { createResponse: false, interruptResponse: false } },
            source: controlSocket(),
        },
    ];
    const greeting = [
        { type: "response.requested", atMs: 120, correlationId: "greeting", source: controlSocket() },
        { type: "assistant.audio.started", atMs: 720, correlationId: "greeting", source: sipLeg() },
        { type: "assistant.audio.done", atMs: 1820, correlationId: "greeting", source: sipLeg() },
        {
            type: "assistant.transcript.done",
            atMs: 1840,
            correlationId: "greeting",
            text: "Hey Ari, I am checking the weather now.",
            source: controlSocket(),
        },
    ];
    const callerTurn = [
        {
            type: "user.transcript.done",
            atMs: 2200,
            correlationId: "user-1",
            text: "Can you check the weather and then hang up?",
            source: mediaStream(),
        },
        { type: "response.requested", atMs: 2480, correlationId: "user-1", source: controlSocket() },
        { type: "assistant.audio.started", atMs: 2540, correlationId: "user-1", source: sipLeg() },
        { type: "assistant.audio.done", atMs: 2820, correlationId: "user-1", source: sipLeg() },
    ];
    const toolUse = [
        { type: "tool.call.started", atMs: 3000, correlationId: "tool-1", toolName: "weather_lookup", source: controlSocket() },
        { type: "tool.holding.started", atMs: 3260, correlationId: "tool-1", text: "One sec, checking.", source: sipLeg() },
        { type: "tool.call.completed", atMs: 3800, correlationId: "tool-1", toolName: "weather_lookup", source: controlSocket() },
    ];
    const bargeIn = [
        { type: "barge_in.detected", atMs: 4100, source: mediaStream() },
        { type: "transport.playback_cleared", atMs: 4140, source: mediaStream() },
        { type: "response.truncated", atMs: 4170, source: controlSocket() },
    ];
    const teardown = [
        { type: "call.hangup.requested", atMs: 5000, source: controlSocket() },
        { type: "call.ended", atMs: 5100, source: sipLeg() },
    ];
    return [].concat(setup, greeting, callerTurn, toolUse, bargeIn, teardown);
}
340
/**
 * Returns the expectation used by the built-in scenarios: five latency budgets in
 * milliseconds, plus manual floor control, friend context, hangup, and transcript requirements.
 *
 * @returns {object} A fresh expectation object on every call.
 */
function builtInExpectation() {
    const latencyBudgetsMs = {
        maxFirstAssistantAudioMs: 1200,
        maxUserTurnResponseMs: 900,
        maxToolPresenceMs: 600,
        maxBargeInClearMs: 120,
        maxBargeInTruncateMs: 180,
    };
    const expectedFriend = {
        friendId: "friend-ari",
        sessionKey: "twilio-phone-friend-ari-via-ouro",
        marker: "trust=family",
    };
    const expectedTranscripts = [
        { role: "user", contains: "weather" },
        { role: "assistant", contains: "checking the weather" },
    ];
    return {
        ...latencyBudgetsMs,
        requireManualFloorControl: true,
        requireFriendContext: expectedFriend,
        requireHangup: true,
        requiredTranscripts: expectedTranscripts,
    };
}
360
/**
 * Derives the known-bad latency timeline from the happy path by delaying two events:
 * the greeting's `assistant.audio.started` moves to 1900 ms and the `user-1`
 * `response.requested` moves to 3500 ms. All other events are passed through unchanged.
 *
 * @returns {Array<object>} A new timeline array; delayed events are shallow copies.
 */
function buildKnownBadLatencyPath() {
    const delayedAtMs = (event) => {
        if (event.type === "assistant.audio.started" && event.correlationId === "greeting") {
            return 1900;
        }
        if (event.type === "response.requested" && event.correlationId === "user-1") {
            return 3500;
        }
        return undefined;
    };
    const healthy = buildVoiceRealtimeEvalHappyPath();
    return healthy.map((event) => {
        const atMs = delayedAtMs(event);
        return atMs === undefined ? event : { ...event, atMs };
    });
}
369
/**
 * Grades the two built-in scenarios against the shared built-in expectation:
 * "voice-happy-path" first, then "voice-known-bad-latency".
 *
 * @returns {Array<object>} One grading report per scenario, in that order.
 */
function runBuiltInVoiceRealtimeEvalSuite() {
    const expectation = builtInExpectation();
    const scenarios = [
        ["voice-happy-path", buildVoiceRealtimeEvalHappyPath],
        ["voice-known-bad-latency", buildKnownBadLatencyPath],
    ];
    const reports = [];
    for (const [scenarioId, buildTimeline] of scenarios) {
        reports.push(gradeVoiceRealtimeEvalTimeline(scenarioId, buildTimeline(), expectation));
    }
    return reports;
}
376
/**
 * Tallies a list of grading reports.
 *
 * @param {Array<{passed: boolean, scenarioId: string}>} reports - Reports to summarize.
 * @returns {{passed: number, failed: number, total: number, failedScenarioIds: Array<string>}}
 *   Counts of passing and failing reports, plus the ids of the failing ones in input order.
 */
function summarizeVoiceRealtimeEvalSuite(reports) {
    const failedScenarioIds = [];
    reports.forEach((report) => {
        if (!report.passed) {
            failedScenarioIds.push(report.scenarioId);
        }
    });
    const total = reports.length;
    const failed = failedScenarioIds.length;
    return {
        passed: total - failed,
        failed,
        total,
        failedScenarioIds,
    };
}
@@ -154,13 +154,37 @@ function resolveOpenAIRealtimeApiKey(options) {
154
154
  return { apiKey: compatKey, source: "integrations.openaiEmbeddingsApiKey" };
155
155
  return undefined;
156
156
  }
157
- function configuredConversationEngine(options, overrides) {
158
- return overrides.conversationEngine
159
- ?? (0, twilio_phone_1.normalizeTwilioPhoneConversationEngine)(configString(options.machineConfig, "voice.twilioConversationEngine")
160
- ?? configString(options.machineConfig, "voice.conversationEngine")
161
- ?? configString(options.runtimeConfig, "voice.twilioConversationEngine")
162
- ?? configString(options.runtimeConfig, "voice.conversationEngine")
163
- ?? "cascade");
157
/**
 * Picks the conversation engine for Twilio phone voice.
 *
 * Resolution order, as implemented below:
 *  1. An explicit choice is taken from `overrides.conversationEngine`, then machine config
 *     (`voice.twilioConversationEngine`, `voice.conversationEngine`), then runtime config
 *     (same two keys), and normalized.
 *  2. If a `voice.openaiSipProjectId` is configured (runtime or machine) and the explicit
 *     choice is absent or "cascade", the result is "openai-sip".
 *  3. Otherwise an explicit choice wins.
 *  4. Otherwise "openai-realtime" is used when a Realtime API key resolves and the
 *     transport mode is "media-stream"; else "cascade".
 *
 * @param {object} options - Carries `machineConfig` and `runtimeConfig`.
 * @param {object} overrides - Caller overrides; `conversationEngine` is read here.
 * @param {string} transportMode - Resolved Twilio transport mode.
 * @returns {string} The selected conversation engine.
 */
function configuredConversationEngine(options, overrides, transportMode) {
    // Config locations consulted, in priority order, when no override is supplied.
    const engineKeys = [
        ["machineConfig", "voice.twilioConversationEngine"],
        ["machineConfig", "voice.conversationEngine"],
        ["runtimeConfig", "voice.twilioConversationEngine"],
        ["runtimeConfig", "voice.conversationEngine"],
    ];
    let requested = overrides.conversationEngine;
    for (const [configName, key] of engineKeys) {
        if (requested !== null && requested !== undefined) {
            break;
        }
        requested = configString(options[configName], key);
    }
    const sipConfigured = Boolean(configString(options.runtimeConfig, "voice.openaiSipProjectId"))
        || Boolean(configString(options.machineConfig, "voice.openaiSipProjectId"));
    const requestedEngine = requested
        ? (0, twilio_phone_1.normalizeTwilioPhoneConversationEngine)(requested)
        : undefined;
    if (sipConfigured) {
        const cascadeOrUnset = !requestedEngine || requestedEngine === "cascade";
        if (cascadeOrUnset) {
            return "openai-sip";
        }
    }
    if (requestedEngine) {
        return requestedEngine;
    }
    const realtimeKey = resolveOpenAIRealtimeApiKey({ runtimeConfig: options.runtimeConfig, overrides });
    return realtimeKey && transportMode === "media-stream" ? "openai-realtime" : "cascade";
}
175
/**
 * Picks the conversation engine for outbound Twilio phone calls.
 *
 * The default is the inbound engine, except that an inbound "openai-sip" engine on a
 * "media-stream" transport defaults outbound calls to "openai-realtime". An
 * `overrides.outboundConversationEngine` value is used as-is; otherwise the first of the
 * machine/runtime `voice.twilioOutboundConversationEngine` / `voice.outboundConversationEngine`
 * settings (or the default) is normalized. A resulting "cascade" is replaced by the
 * default when that default is "openai-realtime".
 *
 * @param {object} options - Carries `machineConfig` and `runtimeConfig`.
 * @param {object} overrides - Caller overrides; `outboundConversationEngine` is read here.
 * @param {string} conversationEngine - The already-resolved inbound engine.
 * @param {string} transportMode - Resolved Twilio transport mode.
 * @returns {string} The selected outbound conversation engine.
 */
function configuredOutboundConversationEngine(options, overrides, conversationEngine, transportMode) {
    let fallbackEngine = conversationEngine;
    if (conversationEngine === "openai-sip" && transportMode === "media-stream") {
        fallbackEngine = "openai-realtime";
    }
    let chosen = overrides.outboundConversationEngine;
    if (chosen === null || chosen === undefined) {
        const fromConfig = configString(options.machineConfig, "voice.twilioOutboundConversationEngine")
            ?? configString(options.machineConfig, "voice.outboundConversationEngine")
            ?? configString(options.runtimeConfig, "voice.twilioOutboundConversationEngine")
            ?? configString(options.runtimeConfig, "voice.outboundConversationEngine")
            ?? fallbackEngine;
        chosen = (0, twilio_phone_1.normalizeTwilioPhoneConversationEngine)(fromConfig);
    }
    const cascadeDisplacesRealtime = fallbackEngine === "openai-realtime" && chosen === "cascade";
    return cascadeDisplacesRealtime ? fallbackEngine : chosen;
}
165
189
  function normalizeOpenAIRealtimeReasoningEffort(value) {
166
190
  const normalized = value?.trim().toLowerCase();
@@ -226,7 +250,14 @@ function resolveTwilioPhoneTransportRuntime(options) {
226
250
  ?? twilio_phone_1.TWILIO_PHONE_WEBHOOK_BASE_PATH);
227
251
  const transportMode = overrides.transportMode
228
252
  ?? (0, twilio_phone_1.normalizeTwilioPhoneTransportMode)(configString(options.machineConfig, "voice.twilioTransportMode") ?? twilio_phone_1.DEFAULT_TWILIO_PHONE_TRANSPORT_MODE);
229
- const conversationEngine = configuredConversationEngine(options, overrides);
253
+ const conversationEngine = configuredConversationEngine(options, overrides, transportMode);
254
+ const outboundConversationEngine = configuredOutboundConversationEngine(options, overrides, conversationEngine, transportMode);
255
+ const needsOpenAIRealtime = conversationEngine === "openai-realtime"
256
+ || conversationEngine === "openai-sip"
257
+ || outboundConversationEngine === "openai-realtime"
258
+ || outboundConversationEngine === "openai-sip";
259
+ const needsOpenAISip = conversationEngine === "openai-sip" || outboundConversationEngine === "openai-sip";
260
+ const needsCascade = conversationEngine === "cascade" || outboundConversationEngine === "cascade";
230
261
  let elevenLabsApiKey = configString(options.runtimeConfig, "integrations.elevenLabsApiKey") ?? "";
231
262
  let elevenLabsVoiceId = trimOptional(overrides.elevenLabsVoiceId)
232
263
  ?? configString(options.runtimeConfig, "integrations.elevenLabsVoiceId")
@@ -240,9 +271,9 @@ function resolveTwilioPhoneTransportRuntime(options) {
240
271
  ?? "";
241
272
  let openaiRealtime;
242
273
  let openaiSip;
243
- if (conversationEngine === "openai-realtime" || conversationEngine === "openai-sip") {
244
- if (conversationEngine === "openai-realtime" && transportMode !== "media-stream") {
245
- throw new Error("voice.twilioConversationEngine=openai-realtime requires voice.twilioTransportMode=media-stream");
274
+ if (needsOpenAIRealtime) {
275
+ if ((conversationEngine === "openai-realtime" || outboundConversationEngine === "openai-realtime") && transportMode !== "media-stream") {
276
+ throw new Error("voice.twilioConversationEngine/openai-realtime requires voice.twilioTransportMode=media-stream");
246
277
  }
247
278
  const key = resolveOpenAIRealtimeApiKey({ runtimeConfig: options.runtimeConfig, overrides });
248
279
  if (!key) {
@@ -300,7 +331,7 @@ function resolveTwilioPhoneTransportRuntime(options) {
300
331
  ?? normalizeOpenAIRealtimeNoiseReduction(configString(options.runtimeConfig, "voice.openaiRealtimeNoiseReduction")),
301
332
  turnDetection,
302
333
  };
303
- if (conversationEngine === "openai-sip") {
334
+ if (needsOpenAISip) {
304
335
  const projectId = trimOptional(overrides.openaiSipProjectId)
305
336
  ?? configString(options.runtimeConfig, "voice.openaiSipProjectId")
306
337
  ?? configString(options.machineConfig, "voice.openaiSipProjectId");
@@ -334,7 +365,7 @@ function resolveTwilioPhoneTransportRuntime(options) {
334
365
  };
335
366
  }
336
367
  }
337
- else {
368
+ if (needsCascade) {
338
369
  elevenLabsApiKey = required(elevenLabsApiKey || undefined, "missing integrations.elevenLabsApiKey; run 'ouro connect voice --agent <agent>' for setup guidance");
339
370
  elevenLabsVoiceId = required(elevenLabsVoiceId || undefined, "missing integrations.elevenLabsVoiceId; save the ElevenLabs voice ID before starting phone voice");
340
371
  whisperCliPath = required(whisperCliPath || undefined, "missing voice.whisperCliPath in this machine's runtime config");
@@ -379,6 +410,7 @@ function resolveTwilioPhoneTransportRuntime(options) {
379
410
  ?? (0, twilio_phone_1.normalizeTwilioPhonePlaybackMode)(configString(options.machineConfig, "voice.twilioPlaybackMode") ?? twilio_phone_1.DEFAULT_TWILIO_PHONE_PLAYBACK_MODE),
380
411
  transportMode,
381
412
  conversationEngine,
413
+ outboundConversationEngine,
382
414
  openaiRealtime,
383
415
  openaiSip,
384
416
  openaiSipWebhookUrl: openaiSip?.webhookPath ? (0, twilio_phone_1.openAISipWebhookUrl)(publicBaseUrl, openaiSip.webhookPath) : undefined,
@@ -482,7 +514,12 @@ async function startConfiguredTwilioPhoneTransport(options, deps = defaultTwilio
482
514
  meta: { agentName: settings.agentName, source: settings.openaiRealtime.apiKeySource },
483
515
  });
484
516
  }
485
- const transcriber = settings.conversationEngine === "openai-realtime" || settings.conversationEngine === "openai-sip"
517
+ const settingsNeedsOpenAIRealtime = settings.conversationEngine === "openai-realtime"
518
+ || settings.conversationEngine === "openai-sip"
519
+ || settings.outboundConversationEngine === "openai-realtime"
520
+ || settings.outboundConversationEngine === "openai-sip";
521
+ const settingsNeedsCascade = settings.conversationEngine === "cascade" || settings.outboundConversationEngine === "cascade";
522
+ const transcriber = settingsNeedsOpenAIRealtime && !settingsNeedsCascade
486
523
  ? {
487
524
  transcribe: async () => {
488
525
  throw new Error("OpenAI Realtime voice sessions do not use the cascade transcriber");
@@ -492,7 +529,7 @@ async function startConfiguredTwilioPhoneTransport(options, deps = defaultTwilio
492
529
  whisperCliPath: settings.whisperCliPath,
493
530
  modelPath: settings.whisperModelPath,
494
531
  });
495
- const tts = settings.conversationEngine === "openai-realtime" || settings.conversationEngine === "openai-sip"
532
+ const tts = settingsNeedsOpenAIRealtime && !settingsNeedsCascade
496
533
  ? {
497
534
  synthesize: async () => {
498
535
  throw new Error("OpenAI Realtime voice sessions do not use the cascade TTS service");
@@ -522,6 +559,7 @@ async function startConfiguredTwilioPhoneTransport(options, deps = defaultTwilio
522
559
  transportMode: settings.transportMode,
523
560
  playbackMode: settings.playbackMode,
524
561
  conversationEngine: settings.conversationEngine,
562
+ outboundConversationEngine: settings.outboundConversationEngine,
525
563
  openaiRealtime: settings.openaiRealtime,
526
564
  openaiSip: settings.openaiSip,
527
565
  });
@@ -538,6 +576,7 @@ async function startConfiguredTwilioPhoneTransport(options, deps = defaultTwilio
538
576
  openaiSipWebhookUrl: settings.openaiSipWebhookUrl ?? "",
539
577
  transportMode: settings.transportMode,
540
578
  conversationEngine: settings.conversationEngine,
579
+ outboundConversationEngine: settings.outboundConversationEngine,
541
580
  openaiRealtimeModel: settings.openaiRealtime?.model ?? "",
542
581
  },
543
582
  });
@@ -565,7 +604,7 @@ async function prewarmOutboundGreeting(options, deps) {
565
604
  if (options.settings.transportMode !== "media-stream")
566
605
  return undefined;
567
606
  /* v8 ignore next -- Realtime/SIP outbound tests assert no cascade prewarm is attempted @preserve */
568
- if (options.settings.conversationEngine === "openai-realtime" || options.settings.conversationEngine === "openai-sip")
607
+ if (options.settings.outboundConversationEngine === "openai-realtime" || options.settings.outboundConversationEngine === "openai-sip")
569
608
  return undefined;
570
609
  const friendId = options.friendId?.trim() || `twilio-${safeRuntimeSegment(options.to)}`;
571
610
  const sessionKey = (0, twilio_phone_1.twilioPhoneVoiceSessionKey)({
@@ -677,7 +716,7 @@ async function placeConfiguredTwilioPhoneCall(options, deps = defaultTwilioPhone
677
716
  reason: options.reason.trim(),
678
717
  ...(options.initialAudio ? { initialAudio: options.initialAudio } : {}),
679
718
  createdAt,
680
- status: settings.transportMode === "media-stream" && settings.conversationEngine !== "openai-realtime" && settings.conversationEngine !== "openai-sip"
719
+ status: settings.transportMode === "media-stream" && settings.outboundConversationEngine === "cascade"
681
720
  ? "prewarming"
682
721
  : "requested",
683
722
  });