@ouro.bot/cli 0.1.0-alpha.583 → 0.1.0-alpha.585

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/changelog.json CHANGED
@@ -1,6 +1,21 @@
1
1
  {
2
2
  "_note": "This changelog is maintained as part of the PR/version-bump workflow. Agent-curated, not auto-generated. Agents read this file directly via read_file to understand what changed between versions.",
3
3
  "versions": [
4
+ {
5
+ "version": "0.1.0-alpha.585",
6
+ "changes": [
7
+ "Voice realtime evals now support replayable trace JSON artifacts that preserve transport/source metadata, normalize SIP/Twilio/Realtime event names into the shared Voice eval timeline, validate causal ordering, and keep ignored provider-noise events visible without grading them.",
8
+ "`npm run voice:eval -- --trace <file>` grades one or more trace artifacts alongside the built-in happy/known-bad suite, treating expected-fail traces as canaries while failing on unexpected trace failures or unexpected passes.",
9
+ "Voice trace replay has golden fixtures for clean calls, barge-in, tool holding phrases, mid-turn hangup, delayed audio/transcript behavior, duplicate/late provider events, and redaction-safe summaries so phone regressions can be debugged without a human live call."
10
+ ]
11
+ },
12
+ {
13
+ "version": "0.1.0-alpha.584",
14
+ "changes": [
15
+ "Voice now has a transport-aware Realtime eval kernel that grades deterministic call timelines for first-audio latency, user-turn response latency, tool holding phrases, barge-in clearing/truncation, friend context, transcript continuity, and hangup control before live phone testing.",
16
+ "`npm run voice:eval` runs built-in no-human voice scenarios, including a healthy path and an expected known-bad latency canary, and emits the normal nerves events expected of executable sense entrypoints so future Voice transport work can prove synchronous behavior without requiring a human to answer calls."
17
+ ]
18
+ },
4
19
  {
5
20
  "version": "0.1.0-alpha.583",
6
21
  "changes": [
@@ -26,3 +26,4 @@ __exportStar(require("./playback"), exports);
26
26
  __exportStar(require("./golden-path"), exports);
27
27
  __exportStar(require("./twilio-phone"), exports);
28
28
  __exportStar(require("./twilio-phone-runtime"), exports);
29
+ __exportStar(require("./realtime-eval"), exports);
@@ -0,0 +1,393 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.gradeVoiceRealtimeEvalTimeline = gradeVoiceRealtimeEvalTimeline;
4
+ exports.buildVoiceRealtimeEvalHappyPath = buildVoiceRealtimeEvalHappyPath;
5
+ exports.buildVoiceRealtimeEvalDefaultExpectation = buildVoiceRealtimeEvalDefaultExpectation;
6
+ exports.runBuiltInVoiceRealtimeEvalSuite = runBuiltInVoiceRealtimeEvalSuite;
7
+ exports.summarizeVoiceRealtimeEvalSuite = summarizeVoiceRealtimeEvalSuite;
8
+ const runtime_1 = require("../../nerves/runtime");
9
/**
 * Checks the preconditions for grading a voice eval timeline.
 *
 * @param {string} scenarioId - Scenario identifier; surrounding whitespace is trimmed.
 * @param {Array<object>} events - Timeline events; must contain at least one entry.
 * @param {object} expectation - Expectation whose five latency budgets are checked.
 * @returns {string} The trimmed scenario id.
 * @throws {Error} When the id is blank, the timeline is empty, or any latency
 *   budget is not a positive finite number.
 */
function validateTimeline(scenarioId, events, expectation) {
  const trimmedId = scenarioId.trim();
  if (trimmedId === "") {
    throw new Error("voice eval scenario id is empty");
  }
  if (events.length === 0) {
    throw new Error("voice eval timeline is empty");
  }
  const {
    maxFirstAssistantAudioMs,
    maxUserTurnResponseMs,
    maxToolPresenceMs,
    maxBargeInClearMs,
    maxBargeInTruncateMs,
  } = expectation;
  const isPositiveBudget = (budget) => Number.isFinite(budget) && budget > 0;
  const allBudgetsValid = [
    maxFirstAssistantAudioMs,
    maxUserTurnResponseMs,
    maxToolPresenceMs,
    maxBargeInClearMs,
    maxBargeInTruncateMs,
  ].every(isPositiveBudget);
  if (!allBudgetsValid) {
    throw new Error("voice eval latency budgets must be positive");
  }
  return trimmedId;
}
27
/**
 * Returns a copy of the events ordered by ascending `atMs`.
 * The input is not mutated.
 *
 * @param {Iterable<object>} events - Timeline events carrying a numeric `atMs`.
 * @returns {Array<object>} A new array sorted by `atMs`.
 */
function sortedEvents(events) {
  const copy = Array.from(events);
  copy.sort((earlier, later) => earlier.atMs - later.atMs);
  return copy;
}
30
/**
 * Finds the first event of the given type, in array order.
 *
 * @param {Array<object>} events - Timeline events.
 * @param {string} type - Event type to look for.
 * @returns {object|undefined} The first matching event, or `undefined`.
 */
function firstEvent(events, type) {
  for (const candidate of events) {
    if (candidate.type === type) {
      return candidate;
    }
  }
  return undefined;
}
33
/**
 * Collects every event of the given type, preserving array order.
 *
 * @param {Array<object>} events - Timeline events.
 * @param {string} type - Event type to select.
 * @returns {Array<object>} A new array of the matching events.
 */
function allEvents(events, type) {
  const matches = [];
  for (const candidate of events) {
    if (candidate.type === type) {
      matches.push(candidate);
    }
  }
  return matches;
}
36
/**
 * Lower-cases optional text so it can be compared case-insensitively.
 *
 * @param {string|null|undefined} value - Text to normalize.
 * @returns {string} The lower-cased text, or `""` when the value is nullish.
 */
function lowerText(value) {
  // `== null` deliberately matches both null and undefined.
  return value == null ? "" : value.toLowerCase();
}
39
/**
 * Appends one grading finding to the findings list (mutates `findings`).
 *
 * @param {Array<object>} findings - Accumulator of findings for the current run.
 * @param {object} finding - Finding to append.
 * @returns {void}
 */
function pushFinding(findings, finding) {
  findings[findings.length] = finding;
}
42
/**
 * Grades time-to-first-audio: the delay between `call.connected` and the
 * first `assistant.audio.started` event that happens at or after it.
 *
 * Audio that started before the call connected is not counted. Previously
 * such an event produced a negative latency that always passed the budget.
 *
 * @param {Array<object>} events - Timeline events, expected in ascending `atMs` order.
 * @param {object} expectation - Supplies `maxFirstAssistantAudioMs`.
 * @param {Array<object>} findings - Mutated: receives `first_audio_missing` or
 *   `first_audio_late` findings.
 * @returns {number|undefined} The latency in ms, or `undefined` when the connect
 *   event or any post-connect assistant audio is missing.
 */
function gradeFirstAudio(events, expectation, findings) {
  const connected = events.find((event) => event.type === "call.connected");
  const firstAudio = events.find((event) => event.type === "assistant.audio.started"
    // Without a connect event there is nothing to anchor on; any audio event is
    // then used only to fill in the finding's source/atMs.
    && (!connected || event.atMs >= connected.atMs));
  if (!connected || !firstAudio) {
    findings.push({
      code: "first_audio_missing",
      severity: "fail",
      message: "Voice call did not produce assistant audio after connect.",
      source: connected?.source ?? firstAudio?.source,
      atMs: connected?.atMs ?? firstAudio?.atMs,
    });
    return undefined;
  }
  const ttfaMs = firstAudio.atMs - connected.atMs;
  if (ttfaMs > expectation.maxFirstAssistantAudioMs) {
    findings.push({
      code: "first_audio_late",
      severity: "fail",
      message: `First assistant audio started after ${ttfaMs}ms, over the ${expectation.maxFirstAssistantAudioMs}ms budget.`,
      source: firstAudio.source,
      atMs: firstAudio.atMs,
    });
  }
  return ttfaMs;
}
67
/**
 * Grades how quickly a response was requested after the first completed
 * caller transcript.
 *
 * A `response.requested` event counts when it occurs at or after the
 * transcript and, if the transcript has a correlation id, shares that id.
 *
 * @param {Array<object>} events - Timeline events, expected in ascending `atMs` order.
 * @param {object} expectation - Supplies `maxUserTurnResponseMs`.
 * @param {Array<object>} findings - Mutated: receives `user_response_missing`
 *   or `user_response_late` findings.
 * @returns {number|undefined} Latency in ms, or `undefined` when there is no
 *   user transcript or no matching response request.
 */
function gradeFirstUserResponse(events, expectation, findings) {
  const transcript = events.find((candidate) => candidate.type === "user.transcript.done");
  if (!transcript) {
    return undefined;
  }
  const turnId = transcript.correlationId;
  let request;
  for (const candidate of events) {
    if (candidate.type !== "response.requested") {
      continue;
    }
    if (!(candidate.atMs >= transcript.atMs)) {
      continue;
    }
    // An uncorrelated transcript accepts any later response request.
    if (turnId && candidate.correlationId !== turnId) {
      continue;
    }
    request = candidate;
    break;
  }
  if (!request) {
    findings.push({
      code: "user_response_missing",
      severity: "fail",
      message: "No voice response was requested after the caller transcript completed.",
      source: transcript.source,
      atMs: transcript.atMs,
    });
    return undefined;
  }
  const latencyMs = request.atMs - transcript.atMs;
  const budgetMs = expectation.maxUserTurnResponseMs;
  if (latencyMs > budgetMs) {
    findings.push({
      code: "user_response_late",
      severity: "fail",
      message: `Voice response was requested after ${latencyMs}ms, over the ${budgetMs}ms budget.`,
      source: request.source,
      atMs: request.atMs,
    });
  }
  return latencyMs;
}
96
/**
 * Grades whether the first tool call was covered by a holding phrase and
 * how quickly that phrase started.
 *
 * A `tool.holding.started` event counts when it occurs at or after the tool
 * call and, if the tool call has a correlation id, shares that id.
 *
 * @param {Array<object>} events - Timeline events, expected in ascending `atMs` order.
 * @param {object} expectation - Supplies `maxToolPresenceMs`.
 * @param {Array<object>} findings - Mutated: receives `tool_presence_missing`
 *   or `tool_presence_late` findings.
 * @returns {number|undefined} Latency in ms, or `undefined` when there is no
 *   tool call or no matching holding phrase.
 */
function gradeToolPresence(events, expectation, findings) {
  const toolCall = events.find((candidate) => candidate.type === "tool.call.started");
  if (!toolCall) {
    return undefined;
  }
  const isHoldingForCall = (candidate) => {
    if (candidate.type !== "tool.holding.started") {
      return false;
    }
    if (!(candidate.atMs >= toolCall.atMs)) {
      return false;
    }
    return !toolCall.correlationId || candidate.correlationId === toolCall.correlationId;
  };
  const holding = events.find(isHoldingForCall);
  if (!holding) {
    findings.push({
      code: "tool_presence_missing",
      severity: "fail",
      message: "Tool call did not produce a short voice holding phrase.",
      source: toolCall.source,
      atMs: toolCall.atMs,
    });
    return undefined;
  }
  const latencyMs = holding.atMs - toolCall.atMs;
  const budgetMs = expectation.maxToolPresenceMs;
  if (latencyMs > budgetMs) {
    findings.push({
      code: "tool_presence_late",
      severity: "fail",
      message: `Tool holding phrase started after ${latencyMs}ms, over the ${budgetMs}ms budget.`,
      source: holding.source,
      atMs: holding.atMs,
    });
  }
  return latencyMs;
}
125
/**
 * Grades the reaction to the first caller barge-in: transport playback must
 * be cleared and the active response truncated, each within its budget.
 *
 * @param {Array<object>} events - Timeline events, expected in ascending `atMs` order.
 * @param {object} expectation - Supplies `maxBargeInClearMs` and `maxBargeInTruncateMs`.
 * @param {Array<object>} findings - Mutated: receives `barge_in_clear_*` and
 *   `barge_in_truncate_*` findings (clear findings first).
 * @returns {object} Metrics with `firstBargeInClearMs` and/or
 *   `firstBargeInTruncateMs` for the reactions that were found; `{}` when
 *   there was no barge-in.
 */
function gradeBargeIn(events, expectation, findings) {
  const bargeIn = events.find((candidate) => candidate.type === "barge_in.detected");
  if (!bargeIn) {
    return {};
  }
  const firstAtOrAfterBargeIn = (type) => events.find((candidate) => candidate.type === type && candidate.atMs >= bargeIn.atMs);
  const clear = firstAtOrAfterBargeIn("transport.playback_cleared");
  const truncate = firstAtOrAfterBargeIn("response.truncated");
  const metrics = {};

  if (clear) {
    const clearMs = clear.atMs - bargeIn.atMs;
    metrics.firstBargeInClearMs = clearMs;
    if (clearMs > expectation.maxBargeInClearMs) {
      findings.push({
        code: "barge_in_clear_late",
        severity: "fail",
        message: `Barge-in playback clear took ${clearMs}ms, over the ${expectation.maxBargeInClearMs}ms budget.`,
        source: clear.source,
        atMs: clear.atMs,
      });
    }
  } else {
    findings.push({
      code: "barge_in_clear_missing",
      severity: "fail",
      message: "Caller barge-in did not clear transport playback.",
      source: bargeIn.source,
      atMs: bargeIn.atMs,
    });
  }

  if (truncate) {
    const truncateMs = truncate.atMs - bargeIn.atMs;
    metrics.firstBargeInTruncateMs = truncateMs;
    if (truncateMs > expectation.maxBargeInTruncateMs) {
      findings.push({
        code: "barge_in_truncate_late",
        severity: "fail",
        message: `Barge-in response truncation took ${truncateMs}ms, over the ${expectation.maxBargeInTruncateMs}ms budget.`,
        source: truncate.source,
        atMs: truncate.atMs,
      });
    }
  } else {
    findings.push({
      code: "barge_in_truncate_missing",
      severity: "fail",
      message: "Caller barge-in did not truncate the active Realtime response.",
      source: bargeIn.source,
      atMs: bargeIn.atMs,
    });
  }
  return metrics;
}
176
/**
 * Verifies that the first `session.updated` event carrying turn-detection
 * settings disables both provider auto-response and provider interruption.
 *
 * @param {Array<object>} events - Timeline events.
 * @param {Array<object>} findings - Mutated: receives a
 *   `manual_floor_control_missing` finding when the check fails.
 * @returns {void}
 */
function gradeManualFloorControl(events, findings) {
  const sessionEvent = events.find((candidate) => candidate.type === "session.updated" && candidate.session?.turnDetection);
  const turnDetection = sessionEvent?.session?.turnDetection;
  const autoResponseDisabled = turnDetection?.createResponse === false;
  const interruptionDisabled = turnDetection?.interruptResponse === false;
  if (autoResponseDisabled && interruptionDisabled) {
    return;
  }
  findings.push({
    code: "manual_floor_control_missing",
    severity: "fail",
    message: "Realtime session did not disable provider auto-response and provider interruption.",
    source: sessionEvent?.source,
    atMs: sessionEvent?.atMs,
  });
}
190
/**
 * Verifies that the first `voice.context.injected` event carries the expected
 * friend id, session key, and (case-insensitively) the marker text.
 *
 * A missing context event is always reported as a mismatch. Previously a
 * missing event combined with an undefined `requirement.friendId` made the
 * optional-chained comparison true and the next property access threw.
 *
 * @param {Array<object>} events - Timeline events.
 * @param {object} requirement - Expected `friendId`, `sessionKey`, and `marker`.
 * @param {Array<object>} findings - Mutated: receives a
 *   `friend_context_mismatch` finding when the check fails.
 * @returns {void}
 */
function gradeFriendContext(events, requirement, findings) {
  const context = events.find((event) => event.type === "voice.context.injected");
  const matches = context !== undefined
    && context.friendId === requirement.friendId
    && context.sessionKey === requirement.sessionKey
    && (context.text?.toLowerCase() ?? "").includes(requirement.marker.toLowerCase());
  if (matches) {
    return;
  }
  findings.push({
    code: "friend_context_mismatch",
    severity: "fail",
    message: "Voice context did not preserve the expected friend identity, trust marker, and stable session key.",
    source: context?.source,
    atMs: context?.atMs,
  });
}
205
/**
 * Checks that every required transcript snippet appears (case-insensitively)
 * in a completed transcript of the required role.
 *
 * Role `"assistant"` is matched against `assistant.transcript.done` events;
 * any other role is matched against `user.transcript.done` events.
 *
 * @param {Array<object>} events - Timeline events.
 * @param {Array<object>} requirements - Entries with `role` and `contains`.
 * @param {Array<object>} findings - Mutated: receives one `transcript_missing`
 *   finding per unmet requirement.
 * @returns {void}
 */
function gradeTranscripts(events, requirements, findings) {
  for (const requirement of requirements) {
    let transcriptType = "user.transcript.done";
    if (requirement.role === "assistant") {
      transcriptType = "assistant.transcript.done";
    }
    const satisfied = events.some((candidate) => {
      if (candidate.type !== transcriptType) {
        return false;
      }
      const spoken = candidate.text?.toLowerCase() ?? "";
      return spoken.includes(requirement.contains.toLowerCase());
    });
    if (satisfied) {
      continue;
    }
    findings.push({
      code: "transcript_missing",
      severity: "fail",
      message: `Missing ${requirement.role} transcript containing "${requirement.contains}".`,
    });
  }
}
220
/**
 * Requires a `call.hangup.requested` event somewhere in the timeline.
 *
 * When it is absent, the finding is anchored on the first `call.ended`
 * event if there is one.
 *
 * @param {Array<object>} events - Timeline events.
 * @param {Array<object>} findings - Mutated: receives a `hangup_missing` finding.
 * @returns {void}
 */
function gradeHangup(events, findings) {
  const hangupRequested = events.some((candidate) => candidate.type === "call.hangup.requested");
  if (hangupRequested) {
    return;
  }
  const callEnded = events.find((candidate) => candidate.type === "call.ended");
  findings.push({
    code: "hangup_missing",
    severity: "fail",
    message: "Voice eval expected an agent-controlled hangup request before call end.",
    source: callEnded?.source,
    atMs: callEnded?.atMs,
  });
}
233
/**
 * Detects a response that was requested while assistant audio was still
 * playing. At most one `response_overlap` finding is recorded.
 *
 * An audio span opens at `assistant.audio.started` and closes at the first
 * event at or after it that is either a `response.truncated` /
 * `call.hangup.requested`, or an `assistant.audio.done` /
 * `assistant.transcript.done` whose correlation id matches (a missing id on
 * either side counts as a match). A span with no closing event is treated
 * as still active.
 *
 * @param {Array<object>} events - Timeline events, expected in ascending `atMs` order.
 * @param {Array<object>} findings - Mutated: receives the overlap finding.
 * @returns {void}
 */
function gradeOverlappingResponses(events, findings) {
  const closesSpan = (started, candidate) => {
    if (candidate.atMs < started.atMs) {
      return false;
    }
    switch (candidate.type) {
      case "response.truncated":
      case "call.hangup.requested":
        return true;
      case "assistant.audio.done":
      case "assistant.transcript.done":
        return !started.correlationId
          || !candidate.correlationId
          || candidate.correlationId === started.correlationId;
      default:
        return false;
    }
  };
  const audioStarts = events.filter((candidate) => candidate.type === "assistant.audio.started");
  for (const request of events) {
    if (request.type !== "response.requested") {
      continue;
    }
    const overlapsAudio = audioStarts.some((started) => {
      if (!(request.atMs > started.atMs)) {
        return false;
      }
      const closing = events.find((candidate) => closesSpan(started, candidate));
      return !closing || request.atMs < closing.atMs;
    });
    if (overlapsAudio) {
      findings.push({
        code: "response_overlap",
        severity: "fail",
        message: "Voice response was requested while assistant audio was still active.",
        source: request.source,
        atMs: request.atMs,
      });
      // Only the first overlapping request is reported.
      return;
    }
  }
}
259
/**
 * Lists the distinct transports that contributed events, sorted with the
 * default (string) ordering. Events without a `source` are skipped.
 *
 * @param {Array<object>} events - Timeline events with an optional `source.transport`.
 * @returns {Array<string>} Unique transport names in sorted order.
 */
function collectTransportSources(events) {
  const seen = new Set();
  for (const event of events) {
    if (event.source) {
      seen.add(event.source.transport);
    }
  }
  return Array.from(seen).sort();
}
262
/**
 * Grades one voice call timeline against an expectation and returns a report.
 *
 * The timeline is validated, sorted by `atMs`, and run through the latency
 * graders (first audio, first user response, tool presence, barge-in), then
 * through the optional graders enabled by the expectation, and finally the
 * overlapping-response check. A nerves event is emitted before and after
 * grading.
 *
 * @param {string} scenarioId - Scenario identifier (trimmed before use).
 * @param {Array<object>} timeline - Unsorted timeline events.
 * @param {object} expectation - Latency budgets plus optional requirements
 *   (`requireManualFloorControl`, `requireFriendContext`,
 *   `requiredTranscripts`, `requireHangup`).
 * @returns {object} Report with `scenarioId`, `passed`, `findings`, `metrics`,
 *   and `transportSources`.
 * @throws {Error} When `validateTimeline` rejects the inputs.
 */
function gradeVoiceRealtimeEvalTimeline(scenarioId, timeline, expectation) {
  const id = validateTimeline(scenarioId, timeline, expectation);
  const ordered = sortedEvents(timeline);
  // Read the emitter at each call and invoke it unbound, as the compiled original does.
  const emit = (payload) => (0, runtime_1.emitNervesEvent)(payload);

  emit({
    component: "senses",
    event: "senses.voice_realtime_eval_start",
    message: "starting Voice realtime eval timeline grading",
    meta: { scenarioId: id, events: ordered.length },
  });

  const findings = [];
  // Grader order determines the order of findings in the report.
  const ttfaMs = gradeFirstAudio(ordered, expectation, findings);
  const firstUserResponseMs = gradeFirstUserResponse(ordered, expectation, findings);
  const firstToolPresenceMs = gradeToolPresence(ordered, expectation, findings);
  const bargeInMetrics = gradeBargeIn(ordered, expectation, findings);
  const metrics = { ttfaMs, firstUserResponseMs, firstToolPresenceMs, ...bargeInMetrics };

  if (expectation.requireManualFloorControl) {
    gradeManualFloorControl(ordered, findings);
  }
  if (expectation.requireFriendContext) {
    gradeFriendContext(ordered, expectation.requireFriendContext, findings);
  }
  if (expectation.requiredTranscripts) {
    gradeTranscripts(ordered, expectation.requiredTranscripts, findings);
  }
  if (expectation.requireHangup) {
    gradeHangup(ordered, findings);
  }
  gradeOverlappingResponses(ordered, findings);

  const hasFailure = findings.some((finding) => finding.severity === "fail");
  const report = {
    scenarioId: id,
    passed: !hasFailure,
    findings,
    metrics,
    transportSources: collectTransportSources(ordered),
  };

  emit({
    component: "senses",
    event: "senses.voice_realtime_eval_end",
    message: "finished Voice realtime eval timeline grading",
    meta: { scenarioId: id, passed: report.passed, findings: findings.length },
  });
  return report;
}
302
/**
 * Builds the built-in healthy call timeline: connect, friend context,
 * session update, greeting, one user turn, a tool call with a holding
 * phrase, a barge-in, and an agent-requested hangup.
 *
 * Every call returns fresh event and source objects.
 *
 * @returns {Array<object>} The 19 timeline events in ascending `atMs` order.
 */
function buildVoiceRealtimeEvalHappyPath() {
  // Factories so that no two events share a source object.
  const sip = () => ({ transport: "openai-sip", id: "sip-call-1" });
  const control = () => ({ transport: "openai-realtime-control", id: "ws-1" });
  const stream = () => ({ transport: "twilio-media-stream", id: "stream-1" });

  const timeline = [];
  timeline.push({ type: "call.connected", atMs: 0, source: sip() });
  timeline.push({
    type: "voice.context.injected",
    atMs: 80,
    friendId: "friend-ari",
    sessionKey: "twilio-phone-friend-ari-via-ouro",
    text: "Resolved voice friend: Ari (friendId=friend-ari, trust=family).",
    source: { transport: "voice-eval" },
  });
  timeline.push({
    type: "session.updated",
    atMs: 100,
    session: { turnDetection: { createResponse: false, interruptResponse: false } },
    source: control(),
  });

  // Greeting turn.
  timeline.push({ type: "response.requested", atMs: 120, correlationId: "greeting", source: control() });
  timeline.push({ type: "assistant.audio.started", atMs: 720, correlationId: "greeting", source: sip() });
  timeline.push({ type: "assistant.audio.done", atMs: 1_820, correlationId: "greeting", source: sip() });
  timeline.push({
    type: "assistant.transcript.done",
    atMs: 1_840,
    correlationId: "greeting",
    text: "Hey Ari, I am checking the weather now.",
    source: control(),
  });

  // First user turn.
  timeline.push({
    type: "user.transcript.done",
    atMs: 2_200,
    correlationId: "user-1",
    text: "Can you check the weather and then hang up?",
    source: stream(),
  });
  timeline.push({ type: "response.requested", atMs: 2_480, correlationId: "user-1", source: control() });
  timeline.push({ type: "assistant.audio.started", atMs: 2_540, correlationId: "user-1", source: sip() });
  timeline.push({ type: "assistant.audio.done", atMs: 2_820, correlationId: "user-1", source: sip() });

  // Tool call covered by a holding phrase.
  timeline.push({ type: "tool.call.started", atMs: 3_000, correlationId: "tool-1", toolName: "weather_lookup", source: control() });
  timeline.push({ type: "tool.holding.started", atMs: 3_260, correlationId: "tool-1", text: "One sec, checking.", source: sip() });
  timeline.push({ type: "tool.call.completed", atMs: 3_800, correlationId: "tool-1", toolName: "weather_lookup", source: control() });

  // Barge-in, then hangup.
  timeline.push({ type: "barge_in.detected", atMs: 4_100, source: stream() });
  timeline.push({ type: "transport.playback_cleared", atMs: 4_140, source: stream() });
  timeline.push({ type: "response.truncated", atMs: 4_170, source: control() });
  timeline.push({ type: "call.hangup.requested", atMs: 5_000, source: control() });
  timeline.push({ type: "call.ended", atMs: 5_100, source: sip() });
  return timeline;
}
349
/**
 * Builds the default expectation used by the built-in suite and by the
 * `voice-phone-default` trace profile.
 *
 * @returns {object} A fresh expectation: five latency budgets in ms, the
 *   manual-floor-control and hangup requirements, the expected friend
 *   context, and the required transcript snippets.
 */
function buildVoiceRealtimeEvalDefaultExpectation() {
  const requireFriendContext = {
    friendId: "friend-ari",
    sessionKey: "twilio-phone-friend-ari-via-ouro",
    marker: "trust=family",
  };
  const requiredTranscripts = [
    { role: "user", contains: "weather" },
    { role: "assistant", contains: "checking the weather" },
  ];
  return {
    maxFirstAssistantAudioMs: 1_200,
    maxUserTurnResponseMs: 900,
    maxToolPresenceMs: 600,
    maxBargeInClearMs: 120,
    maxBargeInTruncateMs: 180,
    requireManualFloorControl: true,
    requireFriendContext,
    requireHangup: true,
    requiredTranscripts,
  };
}
369
/**
 * Builds the known-bad latency canary: the happy path with the greeting
 * audio start moved to 1900ms and the `user-1` response request moved to
 * 3500ms. Events are not re-sorted here.
 *
 * @returns {Array<object>} The modified timeline; untouched events are the
 *   same objects the happy-path builder produced.
 */
function buildKnownBadLatencyPath() {
  const delayedAtMs = (event) => {
    if (event.type === "assistant.audio.started" && event.correlationId === "greeting") {
      return 1_900;
    }
    if (event.type === "response.requested" && event.correlationId === "user-1") {
      return 3_500;
    }
    return undefined;
  };
  const timeline = [];
  for (const event of buildVoiceRealtimeEvalHappyPath()) {
    const atMs = delayedAtMs(event);
    timeline.push(atMs === undefined ? event : { ...event, atMs });
  }
  return timeline;
}
378
/**
 * Runs the two built-in scenarios against the default expectation: the
 * healthy path first, then the known-bad latency canary.
 *
 * @returns {Array<object>} The two grading reports, in that order.
 */
function runBuiltInVoiceRealtimeEvalSuite() {
  const expectation = buildVoiceRealtimeEvalDefaultExpectation();
  const happyReport = gradeVoiceRealtimeEvalTimeline("voice-happy-path", buildVoiceRealtimeEvalHappyPath(), expectation);
  const knownBadReport = gradeVoiceRealtimeEvalTimeline("voice-known-bad-latency", buildKnownBadLatencyPath(), expectation);
  return [happyReport, knownBadReport];
}
385
/**
 * Summarizes a list of grading reports into pass/fail counts.
 *
 * @param {Array<object>} reports - Reports with `passed` and `scenarioId`.
 * @returns {object} `passed`, `failed`, and `total` counts plus the
 *   `failedScenarioIds` in report order.
 */
function summarizeVoiceRealtimeEvalSuite(reports) {
  const failedScenarioIds = [];
  for (const report of reports) {
    if (!report.passed) {
      failedScenarioIds.push(report.scenarioId);
    }
  }
  const total = reports.length;
  const failed = failedScenarioIds.length;
  return {
    passed: total - failed,
    failed,
    total,
    failedScenarioIds,
  };
}
@@ -0,0 +1,402 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ Object.defineProperty(exports, "__esModule", { value: true });
36
+ exports.parseVoiceRealtimeEvalTraceArtifact = parseVoiceRealtimeEvalTraceArtifact;
37
+ exports.loadVoiceRealtimeEvalTraceArtifact = loadVoiceRealtimeEvalTraceArtifact;
38
+ exports.resolveVoiceRealtimeEvalTraceExpectation = resolveVoiceRealtimeEvalTraceExpectation;
39
+ exports.traceArtifactToVoiceRealtimeEvalTimeline = traceArtifactToVoiceRealtimeEvalTimeline;
40
+ exports.gradeVoiceRealtimeEvalTrace = gradeVoiceRealtimeEvalTrace;
41
+ exports.formatVoiceRealtimeEvalTraceReport = formatVoiceRealtimeEvalTraceReport;
42
+ const fs = __importStar(require("fs"));
43
+ const runtime_1 = require("../../nerves/runtime");
44
+ const realtime_eval_1 = require("./realtime-eval");
45
/** Transport names accepted in a trace event's `source.transport`. */
const transports = new Set(["browser-meeting", "openai-realtime-control", "openai-sip", "twilio-media-stream", "voice-eval"]);

/** Event names that already belong to the shared Voice eval timeline vocabulary. */
const normalizedEvents = new Set([
  "assistant.audio.done", "assistant.audio.started", "assistant.transcript.done",
  "barge_in.detected",
  "call.connected", "call.ended", "call.hangup.requested",
  "response.requested", "response.truncated",
  "session.updated",
  "tool.call.completed", "tool.call.started", "tool.holding.started",
  "transport.playback_cleared",
  "user.transcript.done",
  "voice.context.injected",
]);

/** Raw provider/transport event names mapped to their normalized timeline type. */
const rawEventMap = new Map(Object.entries({
  "openai.realtime.call.hangup.sent": "call.hangup.requested",
  "openai.realtime.conversation.item.truncate.sent": "response.truncated",
  "openai.realtime.input_audio_buffer.speech_started": "barge_in.detected",
  "openai.realtime.input_audio_transcription.completed": "user.transcript.done",
  "openai.realtime.output_audio.delta": "assistant.audio.started",
  "openai.realtime.output_audio.done": "assistant.audio.done",
  "openai.realtime.output_audio_transcript.done": "assistant.transcript.done",
  "openai.realtime.response.create.sent": "response.requested",
  "openai.realtime.response.function_call_arguments.done": "tool.call.started",
  "openai.realtime.session.updated": "session.updated",
  "openai.realtime.tool.completed": "tool.call.completed",
  "openai.sip.call.connected": "call.connected",
  "openai.sip.call.ended": "call.ended",
  "twilio.call.ended": "call.ended",
  "twilio.media.clear.sent": "transport.playback_cleared",
  "twilio.media.start": "call.connected",
  "voice.hangup.requested": "call.hangup.requested",
  "voice.tool_holding.started": "tool.holding.started",
}));

/** Values accepted for a trace artifact's `expectedOutcome`. */
const expectedOutcomes = new Set(["expected-fail", "fail", "pass"]);

/** Named expectation profiles a trace artifact may reference. */
const expectationProfiles = new Set(["voice-phone-default"]);
92
/**
 * Narrows a parsed JSON value to a plain (non-array, non-null) object.
 *
 * @param {*} value - Any value.
 * @returns {object|undefined} The same value when it is a non-null,
 *   non-array object; otherwise `undefined`.
 */
function objectRecord(value) {
  if (value === null || typeof value !== "object" || Array.isArray(value)) {
    return undefined;
  }
  return value;
}
95
/**
 * Prefixes an error detail with its source label, when one is given.
 *
 * @param {string} prefix - Source label; a falsy value means "no prefix".
 * @param {string} detail - The message detail.
 * @returns {string} `"<prefix>: <detail>"`, or just `detail`.
 */
function label(prefix, detail) {
  if (!prefix) {
    return detail;
  }
  return `${prefix}: ${detail}`;
}
98
/**
 * Validates a mandatory string field and returns it trimmed.
 *
 * @param {*} value - Raw field value.
 * @param {string} name - Field name used in the error message.
 * @param {string} sourceLabel - Label prefixed to the error message.
 * @returns {string} The trimmed string.
 * @throws {Error} When the value is not a string or is blank after trimming.
 */
function requiredString(value, name, sourceLabel) {
  const trimmed = typeof value === "string" ? value.trim() : "";
  if (trimmed === "") {
    throw new Error(label(sourceLabel, `${name} must be a non-empty string`));
  }
  return trimmed;
}
103
/**
 * Validates an optional string field. The value is returned untrimmed.
 *
 * @param {*} value - Raw field value.
 * @param {string} name - Field name used in the error message.
 * @param {string} sourceLabel - Label prefixed to the error message.
 * @returns {string|undefined} The string, or `undefined` when absent.
 * @throws {Error} When the value is present but not a string (including `null`).
 */
function optionalString(value, name, sourceLabel) {
  switch (typeof value) {
    case "undefined":
      return undefined;
    case "string":
      return value;
    default:
      throw new Error(label(sourceLabel, `${name} must be a string`));
  }
}
110
/**
 * Parses an event's optional `source` descriptor.
 *
 * @param {*} value - Raw `source` value from the trace event.
 * @param {string} sourceLabel - Label prefixed to error messages.
 * @returns {object|undefined} `{ transport }` or `{ transport, id }`;
 *   `undefined` when no source was given.
 * @throws {Error} When the source is not an object, its transport is not one
 *   of the supported transports, or its id is not a string.
 */
function parseSource(value, sourceLabel) {
  if (value === undefined) {
    return undefined;
  }
  const record = objectRecord(value);
  if (!record) {
    throw new Error(label(sourceLabel, "source must be an object"));
  }
  const { transport } = record;
  const isSupported = typeof transport === "string" && transports.has(transport);
  if (!isSupported) {
    throw new Error(label(sourceLabel, "source.transport is unsupported"));
  }
  const id = optionalString(record.id, "source.id", sourceLabel);
  const source = { transport };
  // The `id` key is omitted entirely when the trace did not supply one.
  if (id !== undefined) {
    source.id = id;
  }
  return source;
}
123
/**
 * Parses an event's optional `session` payload, keeping only the
 * turn-detection flags.
 *
 * @param {*} value - Raw `session` value from the trace event.
 * @param {string} sourceLabel - Label prefixed to error messages.
 * @returns {object|undefined} `undefined` when no session was given; `{}`
 *   when the session has no object-valued `turnDetection`; otherwise
 *   `{ turnDetection: { createResponse, interruptResponse } }`.
 * @throws {Error} When the session is not an object or a flag is present
 *   but not boolean.
 */
function parseTurnDetection(value, sourceLabel) {
  if (value === undefined) {
    return undefined;
  }
  const session = objectRecord(value);
  if (!session) {
    throw new Error(label(sourceLabel, "session must be an object"));
  }
  const turnDetection = objectRecord(session.turnDetection);
  if (!turnDetection) {
    return {};
  }
  const { createResponse, interruptResponse } = turnDetection;
  const flags = [
    ["createResponse", createResponse],
    ["interruptResponse", interruptResponse],
  ];
  for (const [field, flag] of flags) {
    if (flag !== undefined && typeof flag !== "boolean") {
      throw new Error(label(sourceLabel, `session.turnDetection.${field} must be boolean`));
    }
  }
  return { turnDetection: { createResponse, interruptResponse } };
}
142
/**
 * Checks that all five latency budgets of an expectation are positive
 * finite numbers.
 *
 * @param {object} expectation - Expectation to validate.
 * @param {string} sourceLabel - Label prefixed to the error message.
 * @returns {object} The same expectation object, unchanged.
 * @throws {Error} When any budget is missing, non-numeric, non-finite, or <= 0.
 */
function validateExpectation(expectation, sourceLabel) {
  const {
    maxFirstAssistantAudioMs,
    maxUserTurnResponseMs,
    maxToolPresenceMs,
    maxBargeInClearMs,
    maxBargeInTruncateMs,
  } = expectation;
  const isPositiveFinite = (budget) => typeof budget === "number" && Number.isFinite(budget) && budget > 0;
  const allValid = [
    maxFirstAssistantAudioMs,
    maxUserTurnResponseMs,
    maxToolPresenceMs,
    maxBargeInClearMs,
    maxBargeInTruncateMs,
  ].every(isPositiveFinite);
  if (!allValid) {
    throw new Error(label(sourceLabel, "expectation latency budgets must be positive finite numbers"));
  }
  return expectation;
}
155
/**
 * Parses an inline `expectation` object from a trace artifact.
 *
 * Besides the latency budgets, the optional `requireFriendContext` and
 * `requiredTranscripts` fields are shape-checked here, so a malformed
 * artifact is rejected with a labelled error at load time instead of
 * failing later inside the graders. Falsy values for either field are
 * left alone because the grader treats them as "not required".
 *
 * @param {*} value - Raw `expectation` value from the artifact.
 * @param {string} sourceLabel - Label prefixed to error messages.
 * @returns {object} The same expectation object, once validated.
 * @throws {Error} When the expectation is not an object, a latency budget is
 *   invalid, or an optional requirement has the wrong shape.
 */
function parseExpectation(value, sourceLabel) {
  const raw = objectRecord(value);
  if (!raw)
    throw new Error(label(sourceLabel, "expectation must be an object"));
  // Budgets are checked first so existing error precedence is preserved.
  const expectation = validateExpectation(raw, sourceLabel);
  const fail = (detail) => {
    throw new Error(label(sourceLabel, detail));
  };
  if (raw.requireFriendContext) {
    const friend = objectRecord(raw.requireFriendContext);
    if (!friend)
      fail("expectation.requireFriendContext must be an object");
    for (const field of ["friendId", "sessionKey", "marker"]) {
      if (typeof friend[field] !== "string")
        fail(`expectation.requireFriendContext.${field} must be a string`);
    }
  }
  if (raw.requiredTranscripts) {
    if (!Array.isArray(raw.requiredTranscripts))
      fail("expectation.requiredTranscripts must be an array");
    raw.requiredTranscripts.forEach((entry, index) => {
      const requirement = objectRecord(entry);
      if (!requirement)
        fail(`expectation.requiredTranscripts[${index}] must be an object`);
      if (requirement.role !== "assistant" && requirement.role !== "user")
        fail(`expectation.requiredTranscripts[${index}].role must be assistant or user`);
      if (typeof requirement.contains !== "string")
        fail(`expectation.requiredTranscripts[${index}].contains must be a string`);
    });
  }
  return expectation;
}
161
/**
 * Parses and validates one raw trace event.
 *
 * Events flagged `ignored` may use any event name but must carry an
 * `ignoreReason`; all other events must use a normalized or mapped raw name.
 *
 * @param {*} value - Raw event value from the artifact's `events` array.
 * @param {number} index - Position in the `events` array, used in error labels.
 * @param {string} sourceLabel - Artifact label used in error labels.
 * @returns {object} The parsed event. `ignored` is `true` or `undefined`
 *   (never `false`).
 * @throws {Error} On any malformed field.
 */
function parseTraceEvent(value, index, sourceLabel) {
  const eventLabel = `${sourceLabel} event[${index}]`;
  const invalid = (detail) => new Error(label(eventLabel, detail));

  const raw = objectRecord(value);
  if (!raw) {
    throw invalid("must be an object");
  }
  const event = requiredString(raw.event, "event", eventLabel);

  const { ignored } = raw;
  if (ignored !== undefined && typeof ignored !== "boolean") {
    throw invalid("ignored must be boolean");
  }
  const isKnownEvent = normalizedEvents.has(event) || rawEventMap.has(event);
  if (!ignored && !isKnownEvent) {
    throw invalid(`unknown trace event ${event}`);
  }

  const { atMs } = raw;
  if (typeof atMs !== "number" || !Number.isFinite(atMs)) {
    throw invalid("atMs must be a finite number");
  }

  let role;
  if (raw.role !== undefined) {
    const isValidRole = raw.role === "assistant" || raw.role === "user";
    if (!isValidRole) {
      throw invalid("role must be assistant or user");
    }
    role = raw.role;
  }

  // Field validation order is kept so the first reported error is unchanged.
  const source = parseSource(raw.source, eventLabel);
  const correlationId = optionalString(raw.correlationId, "correlationId", eventLabel);
  const text = optionalString(raw.text, "text", eventLabel);
  const toolName = optionalString(raw.toolName, "toolName", eventLabel);
  const friendId = optionalString(raw.friendId, "friendId", eventLabel);
  const sessionKey = optionalString(raw.sessionKey, "sessionKey", eventLabel);
  const session = parseTurnDetection(raw.session, eventLabel);
  const ignoreReason = optionalString(raw.ignoreReason, "ignoreReason", eventLabel);

  if (ignored && !ignoreReason) {
    throw invalid("ignored events require ignoreReason");
  }
  return {
    atMs,
    event,
    source,
    correlationId,
    text,
    role,
    toolName,
    friendId,
    sessionKey,
    session,
    ignored: ignored || undefined,
    ignoreReason,
  };
}
201
/**
 * Parses and validates a voice trace artifact (schema version 1).
 *
 * The artifact must declare an `expectedOutcome`, exactly one of an inline
 * `expectation` or a named `expectationProfile`, and at least one event.
 *
 * @param {*} value - Parsed JSON value of the artifact.
 * @param {string} [sourceLabel="voice trace artifact"] - Label prefixed to
 *   error messages (typically the file path).
 * @returns {object} The validated artifact. `redacted` is `true` or
 *   `undefined` (never `false`).
 * @throws {Error} On any schema violation.
 */
function parseVoiceRealtimeEvalTraceArtifact(value, sourceLabel = "voice trace artifact") {
  const invalid = (detail) => new Error(label(sourceLabel, detail));

  const raw = objectRecord(value);
  if (!raw || raw.schemaVersion !== 1) {
    throw invalid("schemaVersion must be 1");
  }

  const { expectedOutcome, expectationProfile, redacted } = raw;
  const outcomeIsKnown = typeof expectedOutcome === "string" && expectedOutcomes.has(expectedOutcome);
  if (!outcomeIsKnown) {
    throw invalid("expectedOutcome must be pass, fail, or expected-fail");
  }

  const hasProfile = expectationProfile !== undefined;
  if (hasProfile) {
    const profileIsKnown = typeof expectationProfile === "string" && expectationProfiles.has(expectationProfile);
    if (!profileIsKnown) {
      throw invalid("expectationProfile is unsupported");
    }
  }

  const hasInlineExpectation = raw.expectation !== undefined;
  // Both present or both absent is an error.
  if (hasInlineExpectation === hasProfile) {
    throw invalid("provide exactly one of expectation or expectationProfile");
  }

  const rawEvents = raw.events;
  if (!Array.isArray(rawEvents) || rawEvents.length === 0) {
    throw invalid("events must contain at least one event");
  }
  if (redacted !== undefined && typeof redacted !== "boolean") {
    throw invalid("redacted must be boolean");
  }

  const traceId = requiredString(raw.traceId, "traceId", sourceLabel);
  const scenarioId = requiredString(raw.scenarioId, "scenarioId", sourceLabel);
  const expectation = hasInlineExpectation ? parseExpectation(raw.expectation, sourceLabel) : undefined;
  const events = rawEvents.map((event, index) => parseTraceEvent(event, index, sourceLabel));

  return {
    schemaVersion: 1,
    traceId,
    scenarioId,
    expectedOutcome,
    expectation,
    expectationProfile,
    redacted: redacted || undefined,
    events,
  };
}
234
/**
 * Reads a trace artifact JSON file from disk and validates it.
 *
 * The original read and JSON errors are attached as `cause`, so callers can
 * still tell apart, for example, a missing file from a permission problem.
 * The error messages themselves are unchanged.
 *
 * @param {string} filePath - Path of the trace artifact; also used as the
 *   label in error messages.
 * @returns {object} The validated artifact.
 * @throws {Error} `<path>: failed to read trace artifact` when the file cannot
 *   be read, `<path>: invalid JSON: ...` when it is not valid JSON, or the
 *   validation error raised by `parseVoiceRealtimeEvalTraceArtifact`.
 */
function loadVoiceRealtimeEvalTraceArtifact(filePath) {
  let raw;
  try {
    raw = fs.readFileSync(filePath, "utf8");
  }
  catch (error) {
    throw new Error(`${filePath}: failed to read trace artifact`, { cause: error });
  }
  let parsed;
  try {
    parsed = JSON.parse(raw);
  }
  catch (error) {
    if (error instanceof SyntaxError)
      throw new Error(`${filePath}: invalid JSON: ${error.message}`, { cause: error });
    throw error;
  }
  // Validation errors already carry the file label and propagate unchanged.
  return parseVoiceRealtimeEvalTraceArtifact(parsed, filePath);
}
251
/**
 * Resolves the expectation a trace artifact should be graded against.
 *
 * The `voice-phone-default` profile takes precedence and yields a freshly
 * built default expectation; otherwise the inline expectation is used.
 *
 * @param {object} artifact - Parsed trace artifact.
 * @returns {object} The expectation to grade with.
 * @throws {Error} When the artifact has neither a known profile nor an
 *   inline expectation.
 */
function resolveVoiceRealtimeEvalTraceExpectation(artifact) {
  const { expectationProfile, expectation } = artifact;
  if (expectationProfile === "voice-phone-default") {
    return (0, realtime_eval_1.buildVoiceRealtimeEvalDefaultExpectation)();
  }
  if (!expectation) {
    throw new Error(`${artifact.traceId}: trace artifact has no expectation contract`);
  }
  return expectation;
}
258
/**
 * Maps a trace event's name to its normalized timeline event type.
 *
 * @param {object} event - Parsed trace event with an `event` name.
 * @returns {string} The name itself when already normalized, otherwise the
 *   type that the raw name maps to.
 * @throws {Error} When the name is neither normalized nor mapped.
 */
function eventTypeFor(event) {
  const name = event.event;
  if (normalizedEvents.has(name)) {
    return name;
  }
  const mapped = rawEventMap.get(name);
  if (mapped) {
    return mapped;
  }
  throw new Error(`unknown trace event ${name}`);
}
266
/**
 * Converts a parsed trace event into a timeline event for the grader.
 *
 * @param {object} event - Parsed trace event.
 * @param {*} redacted - When truthy, the event text is dropped.
 * @returns {object} Timeline event with the normalized `type`; trace-only
 *   fields (`event`, `ignored`, `ignoreReason`) are not copied.
 * @throws {Error} When the event name cannot be normalized.
 */
function toTimelineEvent(event, redacted) {
  const type = eventTypeFor(event);
  const { atMs, source, correlationId, role, toolName, friendId, sessionKey, session } = event;
  let text;
  if (!redacted) {
    text = event.text;
  }
  return { type, atMs, source, correlationId, text, role, toolName, friendId, sessionKey, session };
}
281
/**
 * Returns a new array of events ordered by ascending `atMs`.
 *
 * Events with equal timestamps keep their input order (the original index is
 * the tie-breaker). The input array is not modified.
 *
 * @param {object[]} events - Timeline events.
 * @returns {object[]} Sorted copy.
 */
function sortedTimeline(events) {
    const decorated = events.map((event, index) => ({ event, index }));
    decorated.sort((a, b) => {
        const delta = a.event.atMs - b.event.atMs;
        return delta || a.index - b.index;
    });
    return decorated.map((entry) => entry.event);
}
287
/**
 * Finds the earliest-positioned event of a given type.
 *
 * @param {object[]} events - Events to search, in order.
 * @param {string} type - Event type to match exactly.
 * @returns {object|undefined} The first matching event, or undefined.
 */
function findFirst(events, type) {
    for (const candidate of events) {
        if (candidate.type === type) {
            return candidate;
        }
    }
    return undefined;
}
290
/**
 * Rejects traces whose graded events are out of causal order.
 *
 * Checks, in this order:
 *  1. every non-ignored artifact event has a nonnegative finite `atMs`;
 *  2. the first assistant.audio.started is not before the first call.connected;
 *  3. each correlated response.requested is not before the user.transcript.done
 *     with the same correlationId;
 *  4. each correlated tool.call.completed is not before the tool.call.started
 *     with the same correlationId;
 *  5. when the resolved expectation has `requireHangup`, the first call.ended
 *     is not before the first call.hangup.requested.
 * A check is skipped when one side of the pair is absent from the timeline.
 *
 * @param {object} artifact - Parsed trace artifact (reads traceId and events).
 * @param {object[]} timeline - Timeline events derived from the artifact.
 * @returns {void}
 * @throws {Error} On the first violated rule, or when the artifact has no
 *   expectation contract.
 */
function validateCausalTimeline(artifact, timeline) {
    for (let index = 0; index < artifact.events.length; index += 1) {
        const event = artifact.events[index];
        if (event.ignored)
            continue;
        // A plain `< 0` comparison lets NaN, Infinity and non-numeric values
        // through, which would then corrupt the numeric ordering of the timeline.
        if (!Number.isFinite(event.atMs) || event.atMs < 0) {
            throw new Error(`${artifact.traceId} event[${index}] atMs must be a nonnegative finite number`);
        }
    }
    const connected = findFirst(timeline, "call.connected");
    const audio = findFirst(timeline, "assistant.audio.started");
    if (connected && audio && audio.atMs < connected.atMs) {
        throw new Error(`${artifact.traceId}: assistant audio started before call.connected`);
    }
    // Only events that carry a correlationId can be paired with their cause.
    for (const response of timeline.filter((event) => event.type === "response.requested" && event.correlationId)) {
        const transcript = timeline.find((event) => event.type === "user.transcript.done" && event.correlationId === response.correlationId);
        if (transcript && response.atMs < transcript.atMs) {
            throw new Error(`${artifact.traceId}: response.requested for ${response.correlationId} occurred before user.transcript.done`);
        }
    }
    for (const completed of timeline.filter((event) => event.type === "tool.call.completed" && event.correlationId)) {
        const started = timeline.find((event) => event.type === "tool.call.started" && event.correlationId === completed.correlationId);
        if (started && completed.atMs < started.atMs) {
            throw new Error(`${artifact.traceId}: tool.call.completed for ${completed.correlationId} occurred before tool.call.started`);
        }
    }
    const expectation = resolveVoiceRealtimeEvalTraceExpectation(artifact);
    if (expectation.requireHangup) {
        const ended = findFirst(timeline, "call.ended");
        const hangup = findFirst(timeline, "call.hangup.requested");
        if (ended && hangup && ended.atMs < hangup.atMs) {
            throw new Error(`${artifact.traceId}: call.ended occurred before call.hangup.requested`);
        }
    }
}
325
/**
 * Builds the graded timeline for a trace artifact.
 *
 * Ignored events are dropped, the rest are converted with toTimelineEvent,
 * ordered with sortedTimeline, and then checked by validateCausalTimeline.
 *
 * @param {object} artifact - Parsed trace artifact.
 * @returns {object[]} The sorted, validated timeline.
 * @throws {Error} When an event name is unknown or the timeline fails
 *   validation.
 */
function traceArtifactToVoiceRealtimeEvalTimeline(artifact) {
    const isRedacted = Boolean(artifact.redacted);
    const gradedEvents = artifact.events.filter((traceEvent) => !traceEvent.ignored);
    const converted = gradedEvents.map((traceEvent) => toTimelineEvent(traceEvent, isRedacted));
    const timeline = sortedTimeline(converted);
    validateCausalTimeline(artifact, timeline);
    return timeline;
}
332
/**
 * Tells whether a report's pass/fail result agrees with the declared outcome.
 *
 * @param {string} expectedOutcome - "pass" means the report should pass; any
 *   other value means it should fail.
 * @param {boolean} passed - Whether the report passed.
 * @returns {boolean} `passed` for "pass", otherwise its negation.
 */
function expectedOutcomeMatched(expectedOutcome, passed) {
    return expectedOutcome === "pass" ? passed : !passed;
}
337
/**
 * Replays one trace artifact through the Voice realtime eval grader.
 *
 * Emits a replay-start nerves event, builds and validates the timeline, grades
 * it against the resolved expectation, then emits a replay-end nerves event.
 *
 * @param {object} artifact - Parsed trace artifact.
 * @returns {object} Result holding the artifact, its ids, the expected
 *   outcome, whether the report matched it, the report, the graded timeline,
 *   and the events that were marked ignored.
 * @throws {Error} When the timeline cannot be built or validated; in that case
 *   no replay-end event is emitted.
 */
function gradeVoiceRealtimeEvalTrace(artifact) {
    const startEvent = {
        component: "senses",
        event: "senses.voice_realtime_trace_replay_start",
        message: "starting Voice realtime trace replay",
        meta: { scenarioId: artifact.scenarioId, events: artifact.events.length },
    };
    (0, runtime_1.emitNervesEvent)(startEvent);
    const timeline = traceArtifactToVoiceRealtimeEvalTimeline(artifact);
    const report = (0, realtime_eval_1.gradeVoiceRealtimeEvalTimeline)(artifact.scenarioId, timeline, resolveVoiceRealtimeEvalTraceExpectation(artifact));
    const outcomeMatched = expectedOutcomeMatched(artifact.expectedOutcome, report.passed);
    // Ignored provider events stay visible in the result even though they are not graded.
    const ignoredEvents = artifact.events.filter((traceEvent) => traceEvent.ignored);
    const result = {
        artifact,
        traceId: artifact.traceId,
        scenarioId: artifact.scenarioId,
        expectedOutcome: artifact.expectedOutcome,
        outcomeMatched,
        report,
        timeline,
        ignoredEvents,
    };
    const endEvent = {
        component: "senses",
        event: "senses.voice_realtime_trace_replay_end",
        message: "finished Voice realtime trace replay",
        meta: { scenarioId: artifact.scenarioId, passed: report.passed, findings: report.findings.length },
    };
    (0, runtime_1.emitNervesEvent)(endEvent);
    return result;
}
364
/**
 * Renders the text suffix shown after a timeline event in the report.
 *
 * @param {object} result - Trace result; only `result.artifact.redacted` is read.
 * @param {object} event - Timeline event.
 * @returns {string} " [redacted]" for transcript and context events of a
 *   redacted artifact, "" when the event has no text, otherwise the text in
 *   double quotes with a leading space.
 */
function textForSummary(result, event) {
    const sensitiveTypes = [
        "assistant.transcript.done",
        "user.transcript.done",
        "voice.context.injected",
    ];
    if (result.artifact.redacted && sensitiveTypes.includes(event.type)) {
        return " [redacted]";
    }
    return event.text === undefined ? "" : ` "${event.text}"`;
}
374
/**
 * Renders an event's transport source for the report.
 *
 * @param {object|undefined} source - Source metadata with `transport` and an
 *   optional `id`.
 * @returns {string} "" when there is no source, otherwise " <transport>" or
 *   " <transport>/<id>" when an id is present.
 */
function sourceForSummary(source) {
    if (!source) {
        return "";
    }
    const idSuffix = source.id ? `/${source.id}` : "";
    return ` ${source.transport}${idSuffix}`;
}
379
/**
 * Formats a graded trace result as a multi-line text report.
 *
 * The report lists the trace header, expected vs. actual outcome, transports,
 * metrics, any findings, every timeline event, and the ignored provider
 * events. Finding messages are replaced with "[redacted]" when the artifact is
 * redacted.
 *
 * @param {object} result - Result produced by gradeVoiceRealtimeEvalTrace.
 * @returns {string} Report lines joined with "\n".
 */
function formatVoiceRealtimeEvalTraceReport(result) {
    const { report } = result;
    const headerLines = [
        `trace ${result.traceId} scenario ${result.scenarioId}`,
        `expected: ${result.expectedOutcome}; report passed: ${report.passed}; outcome matched: ${result.outcomeMatched}`,
        `transports: ${report.transportSources.join(", ") || "none"}`,
        `metrics: ${JSON.stringify(report.metrics)}`,
    ];
    const findingLines = report.findings.length > 0
        ? [
            "findings:",
            ...report.findings.map((finding) => {
                const message = result.artifact.redacted ? "[redacted]" : finding.message;
                const when = finding.atMs === undefined ? "" : ` at ${finding.atMs}ms`;
                return `- ${finding.code}${when}: ${message}`;
            }),
        ]
        : [];
    const eventLines = result.timeline.map((event) => `- ${event.atMs}ms ${event.type}${sourceForSummary(event.source)}${textForSummary(result, event)}`);
    const ignoredLines = result.ignoredEvents.map((event) => `- ${event.atMs}ms ${event.event}${sourceForSummary(event.source)}: ${event.ignoreReason}`);
    return [
        ...headerLines,
        ...findingLines,
        "events:",
        ...eventLines,
        `ignored provider events: ${result.ignoredEvents.length}`,
        ...ignoredLines,
    ].join("\n");
}
@@ -0,0 +1,99 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.runVoiceRealtimeEvalCommand = runVoiceRealtimeEvalCommand;
4
+ const runtime_1 = require("../nerves/runtime");
5
+ const realtime_eval_1 = require("./voice/realtime-eval");
6
+ const realtime_trace_1 = require("./voice/realtime-trace");
7
+ function parseTraceArgs(argv) {
8
+ const tracePaths = [];
9
+ for (let index = 0; index < argv.length; index += 1) {
10
+ const arg = argv[index];
11
+ if (arg !== "--trace")
12
+ throw new Error(`unknown argument: ${arg}`);
13
+ const tracePath = argv[index + 1];
14
+ if (!tracePath)
15
+ throw new Error("--trace requires a file path");
16
+ tracePaths.push(tracePath);
17
+ index += 1;
18
+ }
19
+ return tracePaths;
20
+ }
21
/**
 * Runs the built-in Voice realtime eval suite and summarises it.
 *
 * @returns {object} Payload with the suite `summary`, plus two flags:
 *   `expectedKnownBadFailed` (exactly one failure and it is
 *   "voice-known-bad-latency") and `happyPathPassed` (the "voice-happy-path"
 *   report passed).
 */
function builtInPayload() {
    const reports = (0, realtime_eval_1.runBuiltInVoiceRealtimeEvalSuite)();
    const summary = (0, realtime_eval_1.summarizeVoiceRealtimeEvalSuite)(reports);
    // The known-bad canary must be the one and only failing scenario.
    const expectedKnownBadFailed = summary.failed === 1 && summary.failedScenarioIds[0] === "voice-known-bad-latency";
    const happyPathPassed = reports.some((report) => {
        if (report.scenarioId !== "voice-happy-path") {
            return false;
        }
        return report.passed;
    });
    return { summary, expectedKnownBadFailed, happyPathPassed };
}
30
/**
 * Reduces a graded trace result to the fields included in the command payload.
 *
 * The artifact and timeline are left out, and each ignored event is trimmed to
 * its time, name, source, and ignore reason.
 *
 * @param {object} result - Result produced by gradeVoiceRealtimeEvalTrace.
 * @returns {object} Payload entry for this trace.
 */
function traceResultPayload(result) {
    const { traceId, scenarioId, expectedOutcome, outcomeMatched, report } = result;
    const ignoredEvents = result.ignoredEvents.map(({ atMs, event, source, ignoreReason }) => ({
        atMs,
        event,
        source,
        ignoreReason,
    }));
    return { traceId, scenarioId, expectedOutcome, outcomeMatched, report, ignoredEvents };
}
45
/**
 * Counts how many trace results matched their expected outcome.
 *
 * @param {object[]} traces - Trace payload entries with `outcomeMatched` and
 *   `scenarioId`.
 * @returns {object} Counts of matched, mismatched, and total traces, plus the
 *   scenario ids of the mismatched ones in input order.
 */
function summarizeTraceResults(traces) {
    const mismatchedScenarioIds = [];
    for (const trace of traces) {
        if (!trace.outcomeMatched) {
            mismatchedScenarioIds.push(trace.scenarioId);
        }
    }
    const mismatched = mismatchedScenarioIds.length;
    const total = traces.length;
    return { matched: total - mismatched, mismatched, total, mismatchedScenarioIds };
}
54
/**
 * Builds the failing command result for a thrown value.
 *
 * @param {unknown} error - The value that was thrown.
 * @returns {object} Result with exit code 1 and a payload whose `error` is the
 *   stringified value with one leading "Error: " prefix removed.
 */
function errorResult(error) {
    const prefix = "Error: ";
    const text = String(error);
    const message = text.startsWith(prefix) ? text.slice(prefix.length) : text;
    return { exitCode: 1, payload: { error: message } };
}
60
/**
 * Runs the Voice realtime eval command: the built-in suite plus any traces
 * named with `--trace`.
 *
 * Emits a command-start nerves event, then either a command-end event on
 * completion or a command-error event when anything throws.
 *
 * @param {string[]} argv - Command-line arguments after the script name.
 * @returns {object} `{ exitCode, payload }`. The exit code is 0 only when the
 *   built-in checks hold and no trace outcome is mismatched; on a thrown error
 *   it is 1 and the payload carries the error message.
 */
function runVoiceRealtimeEvalCommand(argv) {
    const scenarioId = "voice-eval-command";
    (0, runtime_1.emitNervesEvent)({
        component: "senses",
        event: "senses.voice_realtime_eval_command_start",
        message: "starting Voice realtime eval command runner",
        meta: { scenarioId, events: argv.length },
    });
    try {
        const tracePaths = parseTraceArgs(argv);
        const payload = builtInPayload();
        if (tracePaths.length > 0) {
            // Grade every trace before shaping any payload entry.
            const graded = tracePaths.map((tracePath) => {
                const artifact = (0, realtime_trace_1.loadVoiceRealtimeEvalTraceArtifact)(tracePath);
                return (0, realtime_trace_1.gradeVoiceRealtimeEvalTrace)(artifact);
            });
            const traces = graded.map((entry) => traceResultPayload(entry));
            payload.traces = traces;
            payload.traceSummary = summarizeTraceResults(traces);
        }
        const builtInsPassed = Boolean(payload.expectedKnownBadFailed && payload.happyPathPassed);
        const tracesPassed = payload.traceSummary ? payload.traceSummary.mismatched === 0 : true;
        const exitCode = builtInsPassed && tracesPassed ? 0 : 1;
        const result = { exitCode, payload };
        (0, runtime_1.emitNervesEvent)({
            component: "senses",
            event: "senses.voice_realtime_eval_command_end",
            message: "finished Voice realtime eval command runner",
            meta: { scenarioId, passed: result.exitCode === 0, findings: payload.traceSummary?.mismatched ?? 0 },
        });
        return result;
    }
    catch (error) {
        const failure = errorResult(error);
        (0, runtime_1.emitNervesEvent)({
            component: "senses",
            event: "senses.voice_realtime_eval_command_error",
            message: "Voice realtime eval command runner failed",
            meta: { scenarioId, error: failure.payload.error },
            level: "error",
        });
        return failure;
    }
}
@@ -0,0 +1,21 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ const runtime_1 = require("../nerves/runtime");
4
+ const voice_realtime_eval_command_1 = require("./voice-realtime-eval-command");
5
// Entry point: announce the run, execute the eval command with the CLI
// arguments, announce the outcome, and print the payload as JSON.
(0, runtime_1.emitNervesEvent)({
    component: "senses",
    event: "senses.voice_realtime_eval_start",
    message: "starting Voice realtime eval command",
    meta: { scenarioId: "built-in-suite", events: 0 },
});
const result = (0, voice_realtime_eval_command_1.runVoiceRealtimeEvalCommand)(process.argv.slice(2));
(0, runtime_1.emitNervesEvent)({
    component: "senses",
    event: "senses.voice_realtime_eval_end",
    message: "finished Voice realtime eval command",
    // Prefer the trace mismatch count; fall back to the built-in failure count.
    meta: { scenarioId: "built-in-suite", passed: result.exitCode === 0, findings: result.payload.traceSummary?.mismatched ?? result.payload.summary?.failed ?? 0 },
});
// eslint-disable-next-line no-console -- terminal UX: eval command summary
console.log(JSON.stringify(result.payload, null, 2));
// Set the exit code instead of calling process.exit(): a forced exit can drop
// pending stdout writes and truncate the JSON payload printed above.
if (result.exitCode !== 0)
    process.exitCode = result.exitCode;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ouro.bot/cli",
3
- "version": "0.1.0-alpha.583",
3
+ "version": "0.1.0-alpha.585",
4
4
  "main": "dist/heart/daemon/ouro-entry.js",
5
5
  "bin": {
6
6
  "cli": "dist/heart/daemon/ouro-bot-entry.js",
@@ -26,6 +26,7 @@
26
26
  "ouro": "tsc && node dist/heart/daemon/ouro-entry.js",
27
27
  "teams": "tsc && node dist/senses/teams-entry.js --agent ouroboros",
28
28
  "bluebubbles": "tsc && node dist/senses/bluebubbles/entry.js --agent ouroboros",
29
+ "voice:eval": "npm run build && node dist/senses/voice-realtime-eval-entry.js",
29
30
  "test": "vitest run",
30
31
  "test:integration": "npm run build && vitest run --config vitest.integration.config.ts",
31
32
  "test:e2e:package": "npm run build && node scripts/package-e2e.cjs",