@ouro.bot/cli 0.1.0-alpha.582 → 0.1.0-alpha.584
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/changelog.json +18 -0
- package/dist/senses/voice/index.js +1 -0
- package/dist/senses/voice/realtime-eval.js +384 -0
- package/dist/senses/voice/twilio-phone-runtime.js +56 -17
- package/dist/senses/voice/twilio-phone.js +375 -68
- package/dist/senses/voice-realtime-eval-entry.js +25 -0
- package/package.json +2 -1
package/changelog.json
CHANGED
|
@@ -1,6 +1,24 @@
|
|
|
1
1
|
{
|
|
2
2
|
"_note": "This changelog is maintained as part of the PR/version-bump workflow. Agent-curated, not auto-generated. Agents read this file directly via read_file to understand what changed between versions.",
|
|
3
3
|
"versions": [
|
|
4
|
+
{
|
|
5
|
+
"version": "0.1.0-alpha.584",
|
|
6
|
+
"changes": [
|
|
7
|
+
"Voice now has a transport-aware Realtime eval kernel that grades deterministic call timelines for first-audio latency, user-turn response latency, tool holding phrases, barge-in clearing/truncation, friend context, transcript continuity, and hangup control before live phone testing.",
|
|
8
|
+
"`npm run voice:eval` runs built-in no-human voice scenarios, including a healthy path and an expected known-bad latency canary, and emits the normal nerves events expected of executable sense entrypoints so future Voice transport work can prove synchronous behavior without requiring a human to answer calls."
|
|
9
|
+
]
|
|
10
|
+
},
|
|
11
|
+
{
|
|
12
|
+
"version": "0.1.0-alpha.583",
|
|
13
|
+
"changes": [
|
|
14
|
+
"Outbound SIP phone calls now start the Realtime greeting immediately after answer unless Twilio has already positively identified voicemail or fax, preventing humans from hearing post-pickup silence when async AMD returns unknown.",
|
|
15
|
+
"Twilio phone voice now defaults outbound calls to OpenAI Realtime Media Streams when inbound calls use OpenAI SIP on a Media Stream machine, while still allowing `voice.twilioOutboundConversationEngine` overrides, so humans avoid post-pickup SIP ringback.",
|
|
16
|
+
"Realtime voice now resolves phone callers through the canonical friend graph, preferring existing friend ids and otherwise matching normalized phone numbers via `imessage-handle`, so trust-aware tools see the same friend context as text and mail.",
|
|
17
|
+
"Realtime media-stream voice now treats empty caller metadata as absent and preserves local voice friend identities, keeping outbound and provider-simulated calls attached to the intended friend instead of inventing a blank phone identity.",
|
|
18
|
+
"Realtime voice response creation now backs off and retries after provider active-response conflicts, holds user turns under Ouro floor-control instead of provider auto-response, and long-running voice tools can emit one tiny holding phrase instead of leaving seconds of unexplained silence.",
|
|
19
|
+
"Realtime voice VAD and local barge-in thresholds are less twitchy by default, reducing accidental interruption from tiny room sounds while preserving deliberate caller interruption."
|
|
20
|
+
]
|
|
21
|
+
},
|
|
4
22
|
{
|
|
5
23
|
"version": "0.1.0-alpha.582",
|
|
6
24
|
"changes": [
|
|
@@ -0,0 +1,384 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.gradeVoiceRealtimeEvalTimeline = gradeVoiceRealtimeEvalTimeline;
|
|
4
|
+
exports.buildVoiceRealtimeEvalHappyPath = buildVoiceRealtimeEvalHappyPath;
|
|
5
|
+
exports.runBuiltInVoiceRealtimeEvalSuite = runBuiltInVoiceRealtimeEvalSuite;
|
|
6
|
+
exports.summarizeVoiceRealtimeEvalSuite = summarizeVoiceRealtimeEvalSuite;
|
|
7
|
+
const runtime_1 = require("../../nerves/runtime");
|
|
8
|
+
/**
 * Checks that a scenario can be graded before any grading work starts.
 *
 * @param {string} scenarioId - Scenario name; surrounding whitespace is ignored.
 * @param {Array<object>} events - Timeline events for the scenario.
 * @param {object} expectation - Must carry the five `max*Ms` latency budgets.
 * @returns {string} The trimmed scenario id.
 * @throws {Error} When the id is blank, the timeline has no events, or any
 *   latency budget is not a finite number greater than zero.
 */
function validateTimeline(scenarioId, events, expectation) {
    const trimmedId = scenarioId.trim();
    if (trimmedId === "") {
        throw new Error("voice eval scenario id is empty");
    }
    if (events.length === 0) {
        throw new Error("voice eval timeline is empty");
    }
    const isPositiveBudget = (value) => Number.isFinite(value) && value > 0;
    const latencyBudgets = [
        expectation.maxFirstAssistantAudioMs,
        expectation.maxUserTurnResponseMs,
        expectation.maxToolPresenceMs,
        expectation.maxBargeInClearMs,
        expectation.maxBargeInTruncateMs,
    ];
    // A missing budget is `undefined`, which is not finite, so it is rejected too.
    if (!latencyBudgets.every(isPositiveBudget)) {
        throw new Error("voice eval latency budgets must be positive");
    }
    return trimmedId;
}
|
|
26
|
+
/**
 * Returns a copy of the timeline ordered by ascending `atMs`.
 * The caller's array is left untouched.
 *
 * @param {Array<object>} events - Timeline events carrying a numeric `atMs`.
 * @returns {Array<object>} New array holding the same event objects, sorted.
 */
function sortedEvents(events) {
    const ordered = Array.from(events);
    ordered.sort((earlier, later) => earlier.atMs - later.atMs);
    return ordered;
}
|
|
29
|
+
/**
 * Finds the earliest-positioned event of a given type.
 *
 * @param {Array<object>} events - Timeline to scan, in array order.
 * @param {string} type - Event type to match exactly.
 * @returns {object|undefined} The first matching event, or `undefined` if none.
 */
function firstEvent(events, type) {
    for (const candidate of events) {
        if (candidate.type === type) {
            return candidate;
        }
    }
    return undefined;
}
|
|
32
|
+
/**
 * Collects every event of a given type, preserving timeline order.
 *
 * @param {Array<object>} events - Timeline to scan.
 * @param {string} type - Event type to match exactly.
 * @returns {Array<object>} New array of the matching events (possibly empty).
 */
function allEvents(events, type) {
    return events.reduce((matches, candidate) => {
        if (candidate.type === type) {
            matches.push(candidate);
        }
        return matches;
    }, []);
}
|
|
35
|
+
/**
 * Lower-cases a text value, treating a missing value as the empty string.
 *
 * @param {string|null|undefined} value - Text to normalise.
 * @returns {string} Lower-cased text, or `""` when `value` is null or undefined.
 */
function lowerText(value) {
    if (value === undefined || value === null) {
        return "";
    }
    return value.toLowerCase() ?? "";
}
|
|
38
|
+
/**
 * Appends one grading finding to the shared findings list.
 *
 * @param {Array<object>} findings - List that accumulates findings; mutated in place.
 * @param {object} finding - Finding record to append.
 * @returns {void}
 */
function pushFinding(findings, finding) {
    void findings.push(finding);
}
|
|
41
|
+
/**
 * Grades time-to-first-audio: the delay between `call.connected` and the first
 * `assistant.audio.started` that happens at or after the connect.
 *
 * Audio that started before the connect is ignored. Counting it would give a
 * negative latency that trivially passes the budget, while the finding text
 * promises audio "after connect".
 *
 * @param {Array<object>} events - Timeline events, expected in ascending `atMs` order.
 * @param {object} expectation - Supplies `maxFirstAssistantAudioMs`.
 * @param {Array<object>} findings - Findings list; a failure is appended when the
 *   audio is missing (`first_audio_missing`) or over budget (`first_audio_late`).
 * @returns {number|undefined} Latency in ms, or `undefined` when either the
 *   connect or a qualifying audio start is absent.
 */
function gradeFirstAudio(events, expectation, findings) {
    const connected = events.find((event) => event.type === "call.connected");
    // Without a connect there is nothing to measure from; the first audio event
    // is still picked up so the finding can point at a source and timestamp.
    const firstAudio = events.find((event) => event.type === "assistant.audio.started"
        && (!connected || event.atMs >= connected.atMs));
    if (!connected || !firstAudio) {
        findings.push({
            code: "first_audio_missing",
            severity: "fail",
            message: "Voice call did not produce assistant audio after connect.",
            source: connected?.source ?? firstAudio?.source,
            atMs: connected?.atMs ?? firstAudio?.atMs,
        });
        return undefined;
    }
    const ttfaMs = firstAudio.atMs - connected.atMs;
    if (ttfaMs > expectation.maxFirstAssistantAudioMs) {
        findings.push({
            code: "first_audio_late",
            severity: "fail",
            message: `First assistant audio started after ${ttfaMs}ms, over the ${expectation.maxFirstAssistantAudioMs}ms budget.`,
            source: firstAudio.source,
            atMs: firstAudio.atMs,
        });
    }
    return ttfaMs;
}
|
|
66
|
+
/**
 * Grades how quickly a response was requested after the first completed
 * caller transcript.
 *
 * A `response.requested` event qualifies when it is not earlier than the
 * transcript and, if the transcript has a `correlationId`, shares it.
 *
 * @param {Array<object>} events - Timeline events.
 * @param {object} expectation - Supplies `maxUserTurnResponseMs`.
 * @param {Array<object>} findings - Receives `user_response_missing` or
 *   `user_response_late` failures.
 * @returns {number|undefined} Latency in ms; `undefined` when there is no user
 *   transcript or no qualifying response request.
 */
function gradeFirstUserResponse(events, expectation, findings) {
    const transcript = firstEvent(events, "user.transcript.done");
    if (!transcript) {
        return undefined;
    }
    const answersTranscript = (candidate) => {
        if (candidate.type !== "response.requested") {
            return false;
        }
        if (!(candidate.atMs >= transcript.atMs)) {
            return false;
        }
        // An uncorrelated transcript accepts any later response request.
        return !transcript.correlationId || candidate.correlationId === transcript.correlationId;
    };
    const request = events.find(answersTranscript);
    if (!request) {
        pushFinding(findings, {
            code: "user_response_missing",
            severity: "fail",
            message: "No voice response was requested after the caller transcript completed.",
            source: transcript.source,
            atMs: transcript.atMs,
        });
        return undefined;
    }
    const elapsedMs = request.atMs - transcript.atMs;
    const budgetMs = expectation.maxUserTurnResponseMs;
    if (elapsedMs > budgetMs) {
        pushFinding(findings, {
            code: "user_response_late",
            severity: "fail",
            message: `Voice response was requested after ${elapsedMs}ms, over the ${budgetMs}ms budget.`,
            source: request.source,
            atMs: request.atMs,
        });
    }
    return elapsedMs;
}
|
|
95
|
+
/**
 * Grades whether the first tool call was covered by a spoken holding phrase,
 * and how long that phrase took to start.
 *
 * A `tool.holding.started` event qualifies when it is not earlier than the
 * tool call and, if the tool call has a `correlationId`, shares it.
 *
 * @param {Array<object>} events - Timeline events.
 * @param {object} expectation - Supplies `maxToolPresenceMs`.
 * @param {Array<object>} findings - Receives `tool_presence_missing` or
 *   `tool_presence_late` failures.
 * @returns {number|undefined} Delay in ms; `undefined` when the timeline has no
 *   tool call or no qualifying holding phrase.
 */
function gradeToolPresence(events, expectation, findings) {
    const toolStart = firstEvent(events, "tool.call.started");
    if (!toolStart) {
        return undefined;
    }
    const coversToolCall = (candidate) => {
        if (candidate.type !== "tool.holding.started") {
            return false;
        }
        if (!(candidate.atMs >= toolStart.atMs)) {
            return false;
        }
        return !toolStart.correlationId || candidate.correlationId === toolStart.correlationId;
    };
    const holdingPhrase = events.find(coversToolCall);
    if (!holdingPhrase) {
        pushFinding(findings, {
            code: "tool_presence_missing",
            severity: "fail",
            message: "Tool call did not produce a short voice holding phrase.",
            source: toolStart.source,
            atMs: toolStart.atMs,
        });
        return undefined;
    }
    const elapsedMs = holdingPhrase.atMs - toolStart.atMs;
    const budgetMs = expectation.maxToolPresenceMs;
    if (elapsedMs > budgetMs) {
        pushFinding(findings, {
            code: "tool_presence_late",
            severity: "fail",
            message: `Tool holding phrase started after ${elapsedMs}ms, over the ${budgetMs}ms budget.`,
            source: holdingPhrase.source,
            atMs: holdingPhrase.atMs,
        });
    }
    return elapsedMs;
}
|
|
124
|
+
/**
 * Grades the reaction to the first caller barge-in: transport playback must be
 * cleared and the active response truncated, each within its own budget.
 *
 * @param {Array<object>} events - Timeline events.
 * @param {object} expectation - Supplies `maxBargeInClearMs` and
 *   `maxBargeInTruncateMs`.
 * @param {Array<object>} findings - Receives `barge_in_clear_*` findings first,
 *   then `barge_in_truncate_*` findings.
 * @returns {object} `{}` when no barge-in occurred; otherwise an object holding
 *   `firstBargeInClearMs` and/or `firstBargeInTruncateMs` for the reactions
 *   that were found.
 */
function gradeBargeIn(events, expectation, findings) {
    const interruption = firstEvent(events, "barge_in.detected");
    if (!interruption) {
        return {};
    }
    const firstReaction = (type) => events.find((candidate) => candidate.type === type && candidate.atMs >= interruption.atMs);
    const playbackCleared = firstReaction("transport.playback_cleared");
    const responseTruncated = firstReaction("response.truncated");
    const measured = {};

    if (playbackCleared) {
        const clearMs = playbackCleared.atMs - interruption.atMs;
        measured.firstBargeInClearMs = clearMs;
        if (clearMs > expectation.maxBargeInClearMs) {
            pushFinding(findings, {
                code: "barge_in_clear_late",
                severity: "fail",
                message: `Barge-in playback clear took ${clearMs}ms, over the ${expectation.maxBargeInClearMs}ms budget.`,
                source: playbackCleared.source,
                atMs: playbackCleared.atMs,
            });
        }
    }
    else {
        pushFinding(findings, {
            code: "barge_in_clear_missing",
            severity: "fail",
            message: "Caller barge-in did not clear transport playback.",
            source: interruption.source,
            atMs: interruption.atMs,
        });
    }

    if (responseTruncated) {
        const truncateMs = responseTruncated.atMs - interruption.atMs;
        measured.firstBargeInTruncateMs = truncateMs;
        if (truncateMs > expectation.maxBargeInTruncateMs) {
            pushFinding(findings, {
                code: "barge_in_truncate_late",
                severity: "fail",
                message: `Barge-in response truncation took ${truncateMs}ms, over the ${expectation.maxBargeInTruncateMs}ms budget.`,
                source: responseTruncated.source,
                atMs: responseTruncated.atMs,
            });
        }
    }
    else {
        pushFinding(findings, {
            code: "barge_in_truncate_missing",
            severity: "fail",
            message: "Caller barge-in did not truncate the active Realtime response.",
            source: interruption.source,
            atMs: interruption.atMs,
        });
    }
    return measured;
}
|
|
175
|
+
/**
 * Checks that the first `session.updated` event carrying turn-detection
 * settings sets both `createResponse` and `interruptResponse` to `false`.
 *
 * @param {Array<object>} events - Timeline events.
 * @param {Array<object>} findings - Receives `manual_floor_control_missing`
 *   when no such session update exists or either flag is not strictly `false`.
 * @returns {void}
 */
function gradeManualFloorControl(events, findings) {
    const sessionUpdate = allEvents(events, "session.updated")
        .find((candidate) => candidate.session?.turnDetection);
    const turnDetection = sessionUpdate?.session?.turnDetection;
    const floorIsManual = turnDetection?.createResponse === false
        && turnDetection.interruptResponse === false;
    if (floorIsManual) {
        return;
    }
    pushFinding(findings, {
        code: "manual_floor_control_missing",
        severity: "fail",
        message: "Realtime session did not disable provider auto-response and provider interruption.",
        source: sessionUpdate?.source,
        atMs: sessionUpdate?.atMs,
    });
}
|
|
189
|
+
/**
 * Checks the first `voice.context.injected` event against the required friend
 * id, session key, and a marker substring (matched case-insensitively in the
 * event text).
 *
 * @param {Array<object>} events - Timeline events.
 * @param {object} requirement - `{ friendId, sessionKey, marker }` to match.
 * @param {Array<object>} findings - Receives `friend_context_mismatch` when any
 *   of the three checks fails.
 * @returns {void}
 */
function gradeFriendContext(events, requirement, findings) {
    const injected = firstEvent(events, "voice.context.injected");
    const contextMatches = injected?.friendId === requirement.friendId
        && injected.sessionKey === requirement.sessionKey
        && lowerText(injected.text).includes(requirement.marker.toLowerCase());
    if (!contextMatches) {
        pushFinding(findings, {
            code: "friend_context_mismatch",
            severity: "fail",
            message: "Voice context did not preserve the expected friend identity, trust marker, and stable session key.",
            source: injected?.source,
            atMs: injected?.atMs,
        });
    }
}
|
|
204
|
+
/**
 * Verifies that each required phrase appears (case-insensitively) in some
 * completed transcript for the given role.
 *
 * @param {Array<object>} events - Timeline events.
 * @param {Iterable<object>} requirements - Entries of `{ role, contains }`;
 *   role `"assistant"` selects assistant transcripts, anything else selects
 *   user transcripts.
 * @param {Array<object>} findings - Receives one `transcript_missing` failure
 *   per unmet requirement.
 * @returns {void}
 */
function gradeTranscripts(events, requirements, findings) {
    for (const wanted of requirements) {
        let transcriptType = "user.transcript.done";
        if (wanted.role === "assistant") {
            transcriptType = "assistant.transcript.done";
        }
        const mentionsPhrase = (transcript) => lowerText(transcript.text).includes(wanted.contains.toLowerCase());
        if (allEvents(events, transcriptType).some(mentionsPhrase)) {
            continue;
        }
        pushFinding(findings, {
            code: "transcript_missing",
            severity: "fail",
            message: `Missing ${wanted.role} transcript containing "${wanted.contains}".`,
        });
    }
}
|
|
219
|
+
/**
 * Checks that the agent requested the hangup before the call ended.
 *
 * A `call.hangup.requested` event only counts when it is not later than the
 * recorded `call.ended`. A request logged after the call was already over does
 * not show agent-controlled hangup. When the timeline has no `call.ended`
 * event, any hangup request is accepted.
 *
 * @param {Array<object>} events - Timeline events.
 * @param {Array<object>} findings - Receives `hangup_missing` when no
 *   qualifying hangup request exists; the finding points at `call.ended` when
 *   that event is present.
 * @returns {void}
 */
function gradeHangup(events, findings) {
    const ended = events.find((event) => event.type === "call.ended");
    const hangup = events.find((event) => event.type === "call.hangup.requested"
        && (!ended || event.atMs <= ended.atMs));
    if (hangup) {
        return;
    }
    findings.push({
        code: "hangup_missing",
        severity: "fail",
        message: "Voice eval expected an agent-controlled hangup request before call end.",
        source: ended?.source,
        atMs: ended?.atMs,
    });
}
|
|
232
|
+
/**
 * Flags the first `response.requested` that lands while assistant audio is
 * still playing.
 *
 * An audio segment spans from an `assistant.audio.started` event to the first
 * `assistant.audio.done` at or after it. A segment with no such `done` event is
 * treated as still active for every later request.
 *
 * @param {Array<object>} events - Timeline events.
 * @param {Array<object>} findings - Receives at most one `response_overlap`
 *   failure; grading stops at the first overlap.
 * @returns {void}
 */
function gradeOverlappingResponses(events, findings) {
    const audioStarts = allEvents(events, "assistant.audio.started");
    const isPlayingAt = (requestAtMs) => audioStarts.some((started) => {
        const finished = events.find((candidate) => candidate.type === "assistant.audio.done" && candidate.atMs >= started.atMs);
        if (!(requestAtMs > started.atMs)) {
            return false;
        }
        return !finished || requestAtMs < finished.atMs;
    });
    for (const request of allEvents(events, "response.requested")) {
        if (!isPlayingAt(request.atMs)) {
            continue;
        }
        pushFinding(findings, {
            code: "response_overlap",
            severity: "fail",
            message: "Voice response was requested while assistant audio was still active.",
            source: request.source,
            atMs: request.atMs,
        });
        return;
    }
}
|
|
250
|
+
/**
 * Lists the distinct transports that contributed events to a timeline.
 *
 * @param {Array<object>} events - Timeline events; those without a `source`
 *   are skipped.
 * @returns {Array<string>} Unique `source.transport` values in default sort order.
 */
function collectTransportSources(events) {
    const transports = new Set();
    events.forEach((event) => {
        if (event.source) {
            transports.add(event.source.transport);
        }
    });
    return Array.from(transports).sort();
}
|
|
253
|
+
/**
 * Grades one voice call timeline against an expectation and returns a report.
 *
 * The timeline is validated, sorted by time, and run through the latency
 * graders (first audio, user response, tool presence, barge-in). The optional
 * graders run next when the expectation enables them, followed by the
 * overlap check. A nerves event is emitted before and after grading.
 *
 * @param {string} scenarioId - Scenario name; trimmed before use.
 * @param {Array<object>} timeline - Unsorted timeline events.
 * @param {object} expectation - Latency budgets plus optional
 *   `requireManualFloorControl`, `requireFriendContext`, `requiredTranscripts`
 *   and `requireHangup`.
 * @returns {object} `{ scenarioId, passed, findings, metrics, transportSources }`;
 *   `passed` is true when no finding has severity `"fail"`.
 * @throws {Error} When validation of the id, timeline, or budgets fails.
 */
function gradeVoiceRealtimeEvalTimeline(scenarioId, timeline, expectation) {
    const normalizedScenarioId = validateTimeline(scenarioId, timeline, expectation);
    const events = sortedEvents(timeline);
    (0, runtime_1.emitNervesEvent)({
        component: "senses",
        event: "senses.voice_realtime_eval_start",
        message: "starting Voice realtime eval timeline grading",
        meta: { scenarioId: normalizedScenarioId, events: events.length },
    });

    const findings = [];
    // Grader order fixes the order of entries in `findings`.
    const ttfaMs = gradeFirstAudio(events, expectation, findings);
    const firstUserResponseMs = gradeFirstUserResponse(events, expectation, findings);
    const firstToolPresenceMs = gradeToolPresence(events, expectation, findings);
    const bargeInMetrics = gradeBargeIn(events, expectation, findings);
    const metrics = { ttfaMs, firstUserResponseMs, firstToolPresenceMs, ...bargeInMetrics };

    if (expectation.requireManualFloorControl) {
        gradeManualFloorControl(events, findings);
    }
    if (expectation.requireFriendContext) {
        gradeFriendContext(events, expectation.requireFriendContext, findings);
    }
    if (expectation.requiredTranscripts) {
        gradeTranscripts(events, expectation.requiredTranscripts, findings);
    }
    if (expectation.requireHangup) {
        gradeHangup(events, findings);
    }
    gradeOverlappingResponses(events, findings);

    const hasFailure = findings.some((finding) => finding.severity === "fail");
    const report = {
        scenarioId: normalizedScenarioId,
        passed: !hasFailure,
        findings,
        metrics,
        transportSources: collectTransportSources(events),
    };
    (0, runtime_1.emitNervesEvent)({
        component: "senses",
        event: "senses.voice_realtime_eval_end",
        message: "finished Voice realtime eval timeline grading",
        meta: { scenarioId: normalizedScenarioId, passed: report.passed, findings: findings.length },
    });
    return report;
}
|
|
293
|
+
/**
 * Builds the built-in healthy call timeline: connect, friend context, session
 * update, greeting, a user turn, a tool call with a holding phrase, a barge-in
 * with clear and truncate, then an agent hangup and call end.
 *
 * Every call returns fresh event and source objects, so callers may mutate the
 * result freely.
 *
 * @returns {Array<object>} The 19 timeline events in chronological order.
 */
function buildVoiceRealtimeEvalHappyPath() {
    // Source factories return a new object per event so no two events share one.
    const sip = () => ({ transport: "openai-sip", id: "sip-call-1" });
    const control = () => ({ transport: "openai-realtime-control", id: "ws-1" });
    const mediaStream = () => ({ transport: "twilio-media-stream", id: "stream-1" });
    const plain = (type, atMs, source) => ({ type, atMs, source });
    const correlated = (type, atMs, correlationId, extra, source) => ({ type, atMs, correlationId, ...extra, source });

    const contextInjected = {
        type: "voice.context.injected",
        atMs: 80,
        friendId: "friend-ari",
        sessionKey: "twilio-phone-friend-ari-via-ouro",
        text: "Resolved voice friend: Ari (friendId=friend-ari, trust=family).",
        source: { transport: "voice-eval" },
    };
    const sessionUpdated = {
        type: "session.updated",
        atMs: 100,
        session: { turnDetection: { createResponse: false, interruptResponse: false } },
        source: control(),
    };

    return [
        plain("call.connected", 0, sip()),
        contextInjected,
        sessionUpdated,
        correlated("response.requested", 120, "greeting", {}, control()),
        correlated("assistant.audio.started", 720, "greeting", {}, sip()),
        correlated("assistant.audio.done", 1820, "greeting", {}, sip()),
        correlated("assistant.transcript.done", 1840, "greeting", { text: "Hey Ari, I am checking the weather now." }, control()),
        correlated("user.transcript.done", 2200, "user-1", { text: "Can you check the weather and then hang up?" }, mediaStream()),
        correlated("response.requested", 2480, "user-1", {}, control()),
        correlated("assistant.audio.started", 2540, "user-1", {}, sip()),
        correlated("assistant.audio.done", 2820, "user-1", {}, sip()),
        correlated("tool.call.started", 3000, "tool-1", { toolName: "weather_lookup" }, control()),
        correlated("tool.holding.started", 3260, "tool-1", { text: "One sec, checking." }, sip()),
        correlated("tool.call.completed", 3800, "tool-1", { toolName: "weather_lookup" }, control()),
        plain("barge_in.detected", 4100, mediaStream()),
        plain("transport.playback_cleared", 4140, mediaStream()),
        plain("response.truncated", 4170, control()),
        plain("call.hangup.requested", 5000, control()),
        plain("call.ended", 5100, sip()),
    ];
}
|
|
340
|
+
/**
 * Returns the expectation used by the built-in scenarios: the five latency
 * budgets plus the floor-control, friend-context, hangup and transcript
 * requirements.
 *
 * @returns {object} A freshly built expectation object.
 */
function builtInExpectation() {
    const latencyBudgets = {
        maxFirstAssistantAudioMs: 1200,
        maxUserTurnResponseMs: 900,
        maxToolPresenceMs: 600,
        maxBargeInClearMs: 120,
        maxBargeInTruncateMs: 180,
    };
    const friend = {
        friendId: "friend-ari",
        sessionKey: "twilio-phone-friend-ari-via-ouro",
        marker: "trust=family",
    };
    const transcripts = [
        { role: "user", contains: "weather" },
        { role: "assistant", contains: "checking the weather" },
    ];
    return {
        ...latencyBudgets,
        requireManualFloorControl: true,
        requireFriendContext: friend,
        requireHangup: true,
        requiredTranscripts: transcripts,
    };
}
|
|
360
|
+
/**
 * Builds the known-bad latency canary by delaying two events of the happy
 * path: the greeting's `assistant.audio.started` moves to 1900 ms and the
 * `user-1` `response.requested` moves to 3500 ms. All other events are reused
 * unchanged.
 *
 * @returns {Array<object>} The modified timeline, in the happy path's array order.
 */
function buildKnownBadLatencyPath() {
    const delayedAtMs = (event) => {
        if (event.type === "assistant.audio.started" && event.correlationId === "greeting") {
            return 1900;
        }
        if (event.type === "response.requested" && event.correlationId === "user-1") {
            return 3500;
        }
        return undefined;
    };
    return buildVoiceRealtimeEvalHappyPath().map((event) => {
        const atMs = delayedAtMs(event);
        return atMs === undefined ? event : { ...event, atMs };
    });
}
|
|
369
|
+
/**
 * Grades the two built-in scenarios, the happy path and the known-bad latency
 * canary, against the same built-in expectation.
 *
 * @returns {Array<object>} Two grading reports: `voice-happy-path` first, then
 *   `voice-known-bad-latency`.
 */
function runBuiltInVoiceRealtimeEvalSuite() {
    const expectation = builtInExpectation();
    // Each timeline is built just before it is graded.
    const scenarios = [
        ["voice-happy-path", buildVoiceRealtimeEvalHappyPath],
        ["voice-known-bad-latency", buildKnownBadLatencyPath],
    ];
    return scenarios.map(([scenarioId, buildTimeline]) => gradeVoiceRealtimeEvalTimeline(scenarioId, buildTimeline(), expectation));
}
|
|
376
|
+
/**
 * Condenses a list of grading reports into pass/fail counts.
 *
 * @param {Array<object>} reports - Reports carrying `passed` and `scenarioId`.
 * @returns {{passed: number, failed: number, total: number, failedScenarioIds: Array<string>}}
 *   Counts plus the ids of the failed scenarios in report order.
 */
function summarizeVoiceRealtimeEvalSuite(reports) {
    const failedScenarioIds = reports.reduce((ids, report) => {
        if (!report.passed) {
            ids.push(report.scenarioId);
        }
        return ids;
    }, []);
    const total = reports.length;
    const failed = failedScenarioIds.length;
    return {
        passed: total - failed,
        failed,
        total,
        failedScenarioIds,
    };
}
|
|
@@ -154,13 +154,37 @@ function resolveOpenAIRealtimeApiKey(options) {
|
|
|
154
154
|
return { apiKey: compatKey, source: "integrations.openaiEmbeddingsApiKey" };
|
|
155
155
|
return undefined;
|
|
156
156
|
}
|
|
157
|
-
function configuredConversationEngine(options, overrides) {
|
|
158
|
-
|
|
159
|
-
??
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
157
|
+
/**
 * Chooses the conversation engine for Twilio phone calls.
 *
 * Resolution order:
 *   1. The explicit setting: `overrides.conversationEngine`, then machine
 *      config, then runtime config (`voice.twilioConversationEngine` before
 *      `voice.conversationEngine` in each).
 *   2. If `voice.openaiSipProjectId` is set in runtime or machine config and
 *      the explicit engine is absent or `"cascade"`, the result is `"openai-sip"`.
 *   3. Otherwise the explicit engine, when present.
 *   4. Otherwise `"openai-realtime"` when a Realtime API key resolves and the
 *      transport mode is `"media-stream"`, else `"cascade"`.
 *
 * @param {object} options - Carries `machineConfig` and `runtimeConfig`.
 * @param {object} overrides - Caller overrides; `conversationEngine` is read here.
 * @param {string} transportMode - Resolved Twilio transport mode.
 * @returns {string} The selected engine name.
 */
function configuredConversationEngine(options, overrides, transportMode) {
    const engineFrom = (config) => configString(config, "voice.twilioConversationEngine")
        ?? configString(config, "voice.conversationEngine");
    const explicit = overrides.conversationEngine
        ?? engineFrom(options.machineConfig)
        ?? engineFrom(options.runtimeConfig);
    const hasSipConfig = Boolean(configString(options.runtimeConfig, "voice.openaiSipProjectId")
        || configString(options.machineConfig, "voice.openaiSipProjectId"));
    let explicitEngine;
    if (explicit) {
        explicitEngine = (0, twilio_phone_1.normalizeTwilioPhoneConversationEngine)(explicit);
    }
    // SIP configuration wins over a missing or "cascade" explicit engine.
    const explicitNonCascade = explicitEngine && explicitEngine !== "cascade";
    if (hasSipConfig && !explicitNonCascade) {
        return "openai-sip";
    }
    if (explicitEngine) {
        return explicitEngine;
    }
    const realtimeKey = resolveOpenAIRealtimeApiKey({ runtimeConfig: options.runtimeConfig, overrides });
    const hasRealtimeConfig = Boolean(realtimeKey);
    return hasRealtimeConfig && transportMode === "media-stream" ? "openai-realtime" : "cascade";
}
|
|
175
|
+
/**
 * Chooses the conversation engine for outbound Twilio calls.
 *
 * The default follows the inbound engine, except that `"openai-sip"` on a
 * `"media-stream"` transport defaults outbound calls to `"openai-realtime"`.
 * `overrides.outboundConversationEngine` is used as-is when set. Otherwise the
 * first configured value is normalised, looking at machine config then runtime
 * config (`voice.twilioOutboundConversationEngine` before
 * `voice.outboundConversationEngine` in each), with the default as the last
 * resort. A configured `"cascade"` never replaces an `"openai-realtime"` default.
 *
 * @param {object} options - Carries `machineConfig` and `runtimeConfig`.
 * @param {object} overrides - Caller overrides; `outboundConversationEngine` is read here.
 * @param {string} conversationEngine - Engine already resolved for inbound calls.
 * @param {string} transportMode - Resolved Twilio transport mode.
 * @returns {string} The selected outbound engine name.
 */
function configuredOutboundConversationEngine(options, overrides, conversationEngine, transportMode) {
    let defaultOutboundEngine = conversationEngine;
    if (conversationEngine === "openai-sip" && transportMode === "media-stream") {
        defaultOutboundEngine = "openai-realtime";
    }
    const outboundFrom = (config) => configString(config, "voice.twilioOutboundConversationEngine")
        ?? configString(config, "voice.outboundConversationEngine");
    let configured = overrides.outboundConversationEngine;
    if (configured === undefined || configured === null) {
        const rawEngine = outboundFrom(options.machineConfig)
            ?? outboundFrom(options.runtimeConfig)
            ?? defaultOutboundEngine;
        configured = (0, twilio_phone_1.normalizeTwilioPhoneConversationEngine)(rawEngine);
    }
    const wouldDowngradeToCascade = defaultOutboundEngine === "openai-realtime" && configured === "cascade";
    return wouldDowngradeToCascade ? defaultOutboundEngine : configured;
}
|
|
165
189
|
function normalizeOpenAIRealtimeReasoningEffort(value) {
|
|
166
190
|
const normalized = value?.trim().toLowerCase();
|
|
@@ -226,7 +250,14 @@ function resolveTwilioPhoneTransportRuntime(options) {
|
|
|
226
250
|
?? twilio_phone_1.TWILIO_PHONE_WEBHOOK_BASE_PATH);
|
|
227
251
|
const transportMode = overrides.transportMode
|
|
228
252
|
?? (0, twilio_phone_1.normalizeTwilioPhoneTransportMode)(configString(options.machineConfig, "voice.twilioTransportMode") ?? twilio_phone_1.DEFAULT_TWILIO_PHONE_TRANSPORT_MODE);
|
|
229
|
-
const conversationEngine = configuredConversationEngine(options, overrides);
|
|
253
|
+
const conversationEngine = configuredConversationEngine(options, overrides, transportMode);
|
|
254
|
+
const outboundConversationEngine = configuredOutboundConversationEngine(options, overrides, conversationEngine, transportMode);
|
|
255
|
+
const needsOpenAIRealtime = conversationEngine === "openai-realtime"
|
|
256
|
+
|| conversationEngine === "openai-sip"
|
|
257
|
+
|| outboundConversationEngine === "openai-realtime"
|
|
258
|
+
|| outboundConversationEngine === "openai-sip";
|
|
259
|
+
const needsOpenAISip = conversationEngine === "openai-sip" || outboundConversationEngine === "openai-sip";
|
|
260
|
+
const needsCascade = conversationEngine === "cascade" || outboundConversationEngine === "cascade";
|
|
230
261
|
let elevenLabsApiKey = configString(options.runtimeConfig, "integrations.elevenLabsApiKey") ?? "";
|
|
231
262
|
let elevenLabsVoiceId = trimOptional(overrides.elevenLabsVoiceId)
|
|
232
263
|
?? configString(options.runtimeConfig, "integrations.elevenLabsVoiceId")
|
|
@@ -240,9 +271,9 @@ function resolveTwilioPhoneTransportRuntime(options) {
|
|
|
240
271
|
?? "";
|
|
241
272
|
let openaiRealtime;
|
|
242
273
|
let openaiSip;
|
|
243
|
-
if (
|
|
244
|
-
if (conversationEngine === "openai-realtime" && transportMode !== "media-stream") {
|
|
245
|
-
throw new Error("voice.twilioConversationEngine
|
|
274
|
+
if (needsOpenAIRealtime) {
|
|
275
|
+
if ((conversationEngine === "openai-realtime" || outboundConversationEngine === "openai-realtime") && transportMode !== "media-stream") {
|
|
276
|
+
throw new Error("voice.twilioConversationEngine/openai-realtime requires voice.twilioTransportMode=media-stream");
|
|
246
277
|
}
|
|
247
278
|
const key = resolveOpenAIRealtimeApiKey({ runtimeConfig: options.runtimeConfig, overrides });
|
|
248
279
|
if (!key) {
|
|
@@ -300,7 +331,7 @@ function resolveTwilioPhoneTransportRuntime(options) {
|
|
|
300
331
|
?? normalizeOpenAIRealtimeNoiseReduction(configString(options.runtimeConfig, "voice.openaiRealtimeNoiseReduction")),
|
|
301
332
|
turnDetection,
|
|
302
333
|
};
|
|
303
|
-
if (
|
|
334
|
+
if (needsOpenAISip) {
|
|
304
335
|
const projectId = trimOptional(overrides.openaiSipProjectId)
|
|
305
336
|
?? configString(options.runtimeConfig, "voice.openaiSipProjectId")
|
|
306
337
|
?? configString(options.machineConfig, "voice.openaiSipProjectId");
|
|
@@ -334,7 +365,7 @@ function resolveTwilioPhoneTransportRuntime(options) {
|
|
|
334
365
|
};
|
|
335
366
|
}
|
|
336
367
|
}
|
|
337
|
-
|
|
368
|
+
if (needsCascade) {
|
|
338
369
|
elevenLabsApiKey = required(elevenLabsApiKey || undefined, "missing integrations.elevenLabsApiKey; run 'ouro connect voice --agent <agent>' for setup guidance");
|
|
339
370
|
elevenLabsVoiceId = required(elevenLabsVoiceId || undefined, "missing integrations.elevenLabsVoiceId; save the ElevenLabs voice ID before starting phone voice");
|
|
340
371
|
whisperCliPath = required(whisperCliPath || undefined, "missing voice.whisperCliPath in this machine's runtime config");
|
|
@@ -379,6 +410,7 @@ function resolveTwilioPhoneTransportRuntime(options) {
|
|
|
379
410
|
?? (0, twilio_phone_1.normalizeTwilioPhonePlaybackMode)(configString(options.machineConfig, "voice.twilioPlaybackMode") ?? twilio_phone_1.DEFAULT_TWILIO_PHONE_PLAYBACK_MODE),
|
|
380
411
|
transportMode,
|
|
381
412
|
conversationEngine,
|
|
413
|
+
outboundConversationEngine,
|
|
382
414
|
openaiRealtime,
|
|
383
415
|
openaiSip,
|
|
384
416
|
openaiSipWebhookUrl: openaiSip?.webhookPath ? (0, twilio_phone_1.openAISipWebhookUrl)(publicBaseUrl, openaiSip.webhookPath) : undefined,
|
|
@@ -482,7 +514,12 @@ async function startConfiguredTwilioPhoneTransport(options, deps = defaultTwilio
|
|
|
482
514
|
meta: { agentName: settings.agentName, source: settings.openaiRealtime.apiKeySource },
|
|
483
515
|
});
|
|
484
516
|
}
|
|
485
|
-
const
|
|
517
|
+
const settingsNeedsOpenAIRealtime = settings.conversationEngine === "openai-realtime"
|
|
518
|
+
|| settings.conversationEngine === "openai-sip"
|
|
519
|
+
|| settings.outboundConversationEngine === "openai-realtime"
|
|
520
|
+
|| settings.outboundConversationEngine === "openai-sip";
|
|
521
|
+
const settingsNeedsCascade = settings.conversationEngine === "cascade" || settings.outboundConversationEngine === "cascade";
|
|
522
|
+
const transcriber = settingsNeedsOpenAIRealtime && !settingsNeedsCascade
|
|
486
523
|
? {
|
|
487
524
|
transcribe: async () => {
|
|
488
525
|
throw new Error("OpenAI Realtime voice sessions do not use the cascade transcriber");
|
|
@@ -492,7 +529,7 @@ async function startConfiguredTwilioPhoneTransport(options, deps = defaultTwilio
|
|
|
492
529
|
whisperCliPath: settings.whisperCliPath,
|
|
493
530
|
modelPath: settings.whisperModelPath,
|
|
494
531
|
});
|
|
495
|
-
const tts =
|
|
532
|
+
const tts = settingsNeedsOpenAIRealtime && !settingsNeedsCascade
|
|
496
533
|
? {
|
|
497
534
|
synthesize: async () => {
|
|
498
535
|
throw new Error("OpenAI Realtime voice sessions do not use the cascade TTS service");
|
|
@@ -522,6 +559,7 @@ async function startConfiguredTwilioPhoneTransport(options, deps = defaultTwilio
|
|
|
522
559
|
transportMode: settings.transportMode,
|
|
523
560
|
playbackMode: settings.playbackMode,
|
|
524
561
|
conversationEngine: settings.conversationEngine,
|
|
562
|
+
outboundConversationEngine: settings.outboundConversationEngine,
|
|
525
563
|
openaiRealtime: settings.openaiRealtime,
|
|
526
564
|
openaiSip: settings.openaiSip,
|
|
527
565
|
});
|
|
@@ -538,6 +576,7 @@ async function startConfiguredTwilioPhoneTransport(options, deps = defaultTwilio
|
|
|
538
576
|
openaiSipWebhookUrl: settings.openaiSipWebhookUrl ?? "",
|
|
539
577
|
transportMode: settings.transportMode,
|
|
540
578
|
conversationEngine: settings.conversationEngine,
|
|
579
|
+
outboundConversationEngine: settings.outboundConversationEngine,
|
|
541
580
|
openaiRealtimeModel: settings.openaiRealtime?.model ?? "",
|
|
542
581
|
},
|
|
543
582
|
});
|
|
@@ -565,7 +604,7 @@ async function prewarmOutboundGreeting(options, deps) {
|
|
|
565
604
|
if (options.settings.transportMode !== "media-stream")
|
|
566
605
|
return undefined;
|
|
567
606
|
/* v8 ignore next -- Realtime/SIP outbound tests assert no cascade prewarm is attempted @preserve */
|
|
568
|
-
if (options.settings.
|
|
607
|
+
if (options.settings.outboundConversationEngine === "openai-realtime" || options.settings.outboundConversationEngine === "openai-sip")
|
|
569
608
|
return undefined;
|
|
570
609
|
const friendId = options.friendId?.trim() || `twilio-${safeRuntimeSegment(options.to)}`;
|
|
571
610
|
const sessionKey = (0, twilio_phone_1.twilioPhoneVoiceSessionKey)({
|
|
@@ -677,7 +716,7 @@ async function placeConfiguredTwilioPhoneCall(options, deps = defaultTwilioPhone
|
|
|
677
716
|
reason: options.reason.trim(),
|
|
678
717
|
...(options.initialAudio ? { initialAudio: options.initialAudio } : {}),
|
|
679
718
|
createdAt,
|
|
680
|
-
status: settings.transportMode === "media-stream" && settings.
|
|
719
|
+
status: settings.transportMode === "media-stream" && settings.outboundConversationEngine === "cascade"
|
|
681
720
|
? "prewarming"
|
|
682
721
|
: "requested",
|
|
683
722
|
});
|