@ouro.bot/cli 0.1.0-alpha.566 → 0.1.0-alpha.568
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/changelog.json +15 -0
- package/dist/heart/daemon/cli-exec.js +2 -1
- package/dist/mind/prompt.js +1 -1
- package/dist/senses/shared-turn.js +55 -7
- package/dist/senses/voice/elevenlabs.js +13 -1
- package/dist/senses/voice/turn.js +113 -9
- package/dist/senses/voice/twilio-phone-runtime.js +3 -0
- package/dist/senses/voice/twilio-phone.js +427 -32
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -105,7 +105,7 @@ Task docs do not live in this repo anymore. Planning and doing docs live in the
|
|
|
105
105
|
- Human TTY commands share one CLI surface family: bare `ouro` opens the home deck, `ouro up` uses the boot checklist, `ouro connect`/`ouro auth verify`/`ouro repair` agree on provider and vault truth, and `ouro help`/`ouro whoami`/`ouro versions`/`ouro hatch` render through the same Ouro-branded wizard/guide language instead of raw transcript walls. Orientation commands such as root `ouro connect` may use shorter live probes, while startup and verification commands own durable readiness updates.
|
|
106
106
|
- Human-facing CLI commands that can wait on browser auth, vault IO, daemon startup, daemon restart, provider checks, or connector setup use a shared progress checklist. If a cursor may blink for more than a few seconds, the command should print or animate the current step instead of going quiet.
|
|
107
107
|
- CLI commands that mutate bundle config, such as vault setup or `ouro connect bluebubbles`, run bundle sync after the change when `sync.enabled` is true and report a compact `bundle sync:` line.
|
|
108
|
-
- Voice is transcript-first: voice sessions use the ordinary `state/sessions/<friend>/voice/<key>.json` session path and appear in Ouro Mailbox as text transcripts. ElevenLabs API credentials live in portable `runtime/config` at `integrations.elevenLabsApiKey` and `integrations.elevenLabsVoiceId`; Whisper.cpp CLI/model paths live in the machine runtime item at `voice.whisperCliPath` and `voice.whisperModelPath`. Phone calls, browser meetings, and local microphone capture are transports under the single `voice` sense, not separate senses; the Twilio phone transport uses Twilio Record -> Whisper.cpp -> voice session -> ElevenLabs -> Twilio Play.
|
|
108
|
+
- Voice is transcript-first: voice sessions use the ordinary `state/sessions/<friend>/voice/<key>.json` session path and appear in Ouro Mailbox as text transcripts. ElevenLabs API credentials live in portable `runtime/config` at `integrations.elevenLabsApiKey` and `integrations.elevenLabsVoiceId`; Whisper.cpp CLI/model paths live in the machine runtime item at `voice.whisperCliPath` and `voice.whisperModelPath`. Phone calls, browser meetings, and local microphone capture are transports under the single `voice` sense, not separate senses; the Twilio phone transport uses Twilio Record -> Whisper.cpp -> stable voice session -> tool-delivered `speak`/`settle` text -> ElevenLabs -> Twilio Play, with managed playback streaming ElevenLabs chunks to Twilio by default.
|
|
109
109
|
- The daemon discovers bundles dynamically from `~/AgentBundles`.
|
|
110
110
|
- `ouro status` reports version, last-updated time, discovered agents, senses, and workers.
|
|
111
111
|
- `bundle-meta.json` tracks the runtime version that last touched a bundle.
|
package/changelog.json
CHANGED
|
@@ -1,6 +1,21 @@
|
|
|
1
1
|
{
|
|
2
2
|
"_note": "This changelog is maintained as part of the PR/version-bump workflow. Agent-curated, not auto-generated. Agents read this file directly via read_file to understand what changed between versions.",
|
|
3
3
|
"versions": [
|
|
4
|
+
{
|
|
5
|
+
"version": "0.1.0-alpha.568",
|
|
6
|
+
"changes": [
|
|
7
|
+
"Twilio phone recordings that Whisper.cpp reports as empty speech now route through an agent-authored voice reprompt instead of failing the Twilio audio stream.",
|
|
8
|
+
"Real STT infrastructure failures still surface as bridge errors, preserving a clear distinction between silence and broken transcription."
|
|
9
|
+
]
|
|
10
|
+
},
|
|
11
|
+
{
|
|
12
|
+
"version": "0.1.0-alpha.567",
|
|
13
|
+
"changes": [
|
|
14
|
+
"Voice transports now receive outward `speak` and `settle` text through the shared sense delivery callback path, so voice audio is driven by the same tool-required delivery semantics as chat channels.",
|
|
15
|
+
"Twilio phone sessions are now keyed to the stable phone voice channel instead of CallSid, while CallSid remains the per-call artifact directory.",
|
|
16
|
+
"Managed Twilio playback now supports streaming Play URLs backed by ElevenLabs audio chunks, with buffered playback still available for compatibility testing."
|
|
17
|
+
]
|
|
18
|
+
},
|
|
4
19
|
{
|
|
5
20
|
"version": "0.1.0-alpha.566",
|
|
6
21
|
"changes": [
|
|
@@ -4289,11 +4289,12 @@ async function executeConnectVoice(agent, deps) {
|
|
|
4289
4289
|
` ouro vault config set --agent ${agent} --scope machine --key voice.twilioPublicUrl`,
|
|
4290
4290
|
` ouro vault config set --agent ${agent} --scope machine --key voice.twilioBasePath --value /voice/agents/${agentPathSegment}/twilio`,
|
|
4291
4291
|
` ouro vault config set --agent ${agent} --scope machine --key voice.twilioPort --value 18910`,
|
|
4292
|
+
` ouro vault config set --agent ${agent} --scope machine --key voice.twilioPlaybackMode --value stream`,
|
|
4292
4293
|
` ouro vault config set --agent ${agent} --scope machine --key voice.twilioDefaultFriendId --value ari`,
|
|
4293
4294
|
"Then enable agent.json: senses.voice.enabled = true and restart with `ouro up`.",
|
|
4294
4295
|
`The managed Voice entrypoint will listen at POST <public-url>/voice/agents/${agentPathSegment}/twilio/incoming.`,
|
|
4295
4296
|
`Standalone local smoke remains available with: node dist/senses/voice-twilio-entry.js --agent ${agent} --port 18910 --public-url https://<cloudflare-tunnel>.`,
|
|
4296
|
-
"Meeting links use URL intake plus BlackHole/Multi-Output readiness checks. Phone testing uses Twilio Record -> Whisper.cpp -> voice session -> ElevenLabs -> Twilio Play.",
|
|
4297
|
+
"Meeting links use URL intake plus BlackHole/Multi-Output readiness checks. Phone testing uses Twilio Record -> Whisper.cpp -> stable voice session -> tool-delivered speak/settle text -> ElevenLabs -> Twilio Play, with managed playback streaming ElevenLabs chunks by default.",
|
|
4297
4298
|
].join("\n");
|
|
4298
4299
|
deps.writeStdout(message);
|
|
4299
4300
|
return message;
|
package/dist/mind/prompt.js
CHANGED
|
@@ -506,7 +506,7 @@ function senseRuntimeGuidance(channel, preReadStatusLines) {
|
|
|
506
506
|
lines.push("mail validation diagnostics: health checks, bounded mail tools, access logs, and UI inspection can support validation, but they are evidence inside those paths, not additional paths. If asked to name golden paths, do not include diagnostic commands, tool names, or status checks in the answer.");
|
|
507
507
|
lines.push("mail diagnostic naming: `ouro doctor` is installation-wide; do not invent `ouro doctor --agent <agent>`.");
|
|
508
508
|
lines.push("mail setup boundaries: do not invent `ouro auth verify --provider mail`, HEY OAuth, HEY IMAP, `ouro mcp call mail ...`, policy flags, autonomous sending, destructive mail actions, or production MX/DNS/forwarding changes. HEY export, HEY forwarding, DNS, MX cutover, sending, and destructive actions require explicit human confirmation.");
|
|
509
|
-
lines.push("voice setup truth: voice sessions are transcript-first local sessions. ElevenLabs credentials belong in portable runtime/config at `integrations.elevenLabsApiKey` and `integrations.elevenLabsVoiceId`; Whisper.cpp CLI/model paths belong in the machine runtime item under `voice.whisperCliPath` and `voice.whisperModelPath`. Meeting links have URL intake and local BlackHole/Multi-Output readiness checks; phone testing uses Twilio Record -> Whisper.cpp -> voice session -> ElevenLabs -> Twilio Play. Live browser join/injection remains an explicit handoff edge until provider automation lands.");
|
|
509
|
+
lines.push("voice setup truth: voice sessions are transcript-first local sessions. ElevenLabs credentials belong in portable runtime/config at `integrations.elevenLabsApiKey` and `integrations.elevenLabsVoiceId`; Whisper.cpp CLI/model paths belong in the machine runtime item under `voice.whisperCliPath` and `voice.whisperModelPath`. Meeting links have URL intake and local BlackHole/Multi-Output readiness checks; phone testing uses Twilio Record -> Whisper.cpp -> stable voice session -> tool-delivered speak/settle text -> ElevenLabs -> Twilio Play, with managed playback streaming ElevenLabs chunks by default. Live browser join/injection remains an explicit handoff edge until provider automation lands.");
|
|
510
510
|
if (channel === "cli") {
|
|
511
511
|
lines.push("cli is interactive: it is available when the user opens it, not something `ouro up` daemonizes.");
|
|
512
512
|
}
|
|
@@ -235,17 +235,64 @@ async function runSenseTurn(options) {
|
|
|
235
235
|
: [{ role: "system", content: (0, prompt_1.flattenSystemPrompt)(await (0, prompt_1.buildSystem)(channel, {}, undefined)) }];
|
|
236
236
|
// Pending dir
|
|
237
237
|
const pendingDir = (0, pending_1.getPendingDir)(agentName, friendId, channel, sessionKey);
|
|
238
|
-
// Accumulate
|
|
239
|
-
|
|
238
|
+
// Accumulate outward text through the same callback boundary used by chat
|
|
239
|
+
// channels. `speak` flushes pending text immediately; `settle` is delivered
|
|
240
|
+
// once the turn completes.
|
|
241
|
+
let committedResponseText = "";
|
|
242
|
+
let pendingResponseText = "";
|
|
243
|
+
let terminalDeliveryKind = "text";
|
|
244
|
+
const deliveries = [];
|
|
245
|
+
const deliveryFailures = [];
|
|
246
|
+
const commitResponseText = (text) => {
|
|
247
|
+
const cleaned = stripThinkBlocks(text);
|
|
248
|
+
/* v8 ignore next -- deliverPending strips first; this is a defensive direct-call guard @preserve */
|
|
249
|
+
if (!cleaned)
|
|
250
|
+
return;
|
|
251
|
+
committedResponseText = committedResponseText
|
|
252
|
+
? `${committedResponseText}\n${cleaned}`
|
|
253
|
+
: cleaned;
|
|
254
|
+
};
|
|
255
|
+
const deliveryErrorMessage = (error) => error instanceof Error ? error.message : String(error);
|
|
256
|
+
const deliverPending = async (kind, optionsForDelivery) => {
|
|
257
|
+
const text = stripThinkBlocks(pendingResponseText);
|
|
258
|
+
pendingResponseText = "";
|
|
259
|
+
if (!text)
|
|
260
|
+
return;
|
|
261
|
+
const delivery = { kind, text };
|
|
262
|
+
try {
|
|
263
|
+
await options.deliverySink?.onDelivery(delivery);
|
|
264
|
+
deliveries.push(delivery);
|
|
265
|
+
commitResponseText(text);
|
|
266
|
+
}
|
|
267
|
+
catch (error) {
|
|
268
|
+
const failure = { ...delivery, error: deliveryErrorMessage(error) };
|
|
269
|
+
deliveryFailures.push(failure);
|
|
270
|
+
(0, runtime_1.emitNervesEvent)({
|
|
271
|
+
level: "error",
|
|
272
|
+
component: "senses",
|
|
273
|
+
event: "senses.shared_turn_delivery_error",
|
|
274
|
+
message: "shared turn outward delivery failed",
|
|
275
|
+
meta: { agentName, channel, sessionKey, friendId, kind, error: failure.error, textLength: text.length },
|
|
276
|
+
});
|
|
277
|
+
if (optionsForDelivery.throwOnError)
|
|
278
|
+
throw error;
|
|
279
|
+
commitResponseText(text);
|
|
280
|
+
}
|
|
281
|
+
};
|
|
240
282
|
/* v8 ignore start — no-op callback stubs; only onTextChunk does real work (covered via mock) */
|
|
241
283
|
const callbacks = {
|
|
242
284
|
onModelStart: () => { },
|
|
243
285
|
onModelStreamStart: () => { },
|
|
244
|
-
onTextChunk: (chunk) => {
|
|
286
|
+
onTextChunk: (chunk) => { pendingResponseText += chunk; },
|
|
245
287
|
onReasoningChunk: () => { },
|
|
246
288
|
onToolStart: () => { },
|
|
247
|
-
onToolEnd: () => {
|
|
289
|
+
onToolEnd: (name, _summary, success) => {
|
|
290
|
+
if (name === "settle" && success)
|
|
291
|
+
terminalDeliveryKind = "settle";
|
|
292
|
+
},
|
|
248
293
|
onError: () => { },
|
|
294
|
+
onClearText: () => { pendingResponseText = ""; },
|
|
295
|
+
flushNow: () => deliverPending("speak", { throwOnError: true }),
|
|
249
296
|
};
|
|
250
297
|
/* v8 ignore stop */
|
|
251
298
|
// Run the pipeline
|
|
@@ -285,10 +332,11 @@ async function runSenseTurn(options) {
|
|
|
285
332
|
/* v8 ignore stop */
|
|
286
333
|
accumulateFriendTokens: tokens_1.accumulateFriendTokens,
|
|
287
334
|
});
|
|
335
|
+
await deliverPending(terminalDeliveryKind, { throwOnError: false });
|
|
288
336
|
const ponderDeferred = false;
|
|
289
337
|
// Build response
|
|
290
338
|
let finalResponse;
|
|
291
|
-
if (
|
|
339
|
+
if (committedResponseText.length === 0) {
|
|
292
340
|
// Agent settled but no text came through callbacks — check session transcript for the settle answer
|
|
293
341
|
// Await deferred persist so the session file is up-to-date before readback
|
|
294
342
|
/* v8 ignore next -- persistPromise set inside v8-ignored postTurn callback; tested via pipeline integration @preserve */
|
|
@@ -304,7 +352,7 @@ async function runSenseTurn(options) {
|
|
|
304
352
|
}
|
|
305
353
|
}
|
|
306
354
|
else {
|
|
307
|
-
finalResponse =
|
|
355
|
+
finalResponse = committedResponseText;
|
|
308
356
|
}
|
|
309
357
|
// Strip MiniMax-style <think>...</think> blocks from the final response.
|
|
310
358
|
// When a reasoning-style model emits only a think block and no final answer
|
|
@@ -335,5 +383,5 @@ async function runSenseTurn(options) {
|
|
|
335
383
|
message: "shared turn runner complete",
|
|
336
384
|
meta: { agentName, channel, sessionKey, friendId, ponderDeferred, responseLength: finalResponse.length },
|
|
337
385
|
});
|
|
338
|
-
return { response: finalResponse, ponderDeferred };
|
|
386
|
+
return { response: finalResponse, ponderDeferred, deliveries, deliveryFailures };
|
|
339
387
|
}
|
|
@@ -156,7 +156,19 @@ function createElevenLabsTtsClient(options) {
|
|
|
156
156
|
try {
|
|
157
157
|
const parsed = JSON.parse(payloadText(payload));
|
|
158
158
|
if (typeof parsed.audio === "string" && parsed.audio.length > 0) {
|
|
159
|
-
|
|
159
|
+
const chunk = Buffer.from(parsed.audio, "base64");
|
|
160
|
+
chunks.push(chunk);
|
|
161
|
+
if (request.onAudioChunk) {
|
|
162
|
+
try {
|
|
163
|
+
const chunkResult = request.onAudioChunk(chunk);
|
|
164
|
+
if (chunkResult && typeof chunkResult.then === "function") {
|
|
165
|
+
void chunkResult.catch(fail);
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
catch (error) {
|
|
169
|
+
fail(error);
|
|
170
|
+
}
|
|
171
|
+
}
|
|
160
172
|
}
|
|
161
173
|
if (parsed.isFinal === true) {
|
|
162
174
|
finish();
|
|
@@ -4,6 +4,33 @@ exports.runVoiceLoopbackTurn = runVoiceLoopbackTurn;
|
|
|
4
4
|
const runtime_1 = require("../../nerves/runtime");
|
|
5
5
|
const shared_turn_1 = require("../shared-turn");
|
|
6
6
|
const transcript_1 = require("./transcript");
|
|
7
|
+
function deliveredTts(spoken) {
|
|
8
|
+
return {
|
|
9
|
+
status: "delivered",
|
|
10
|
+
audio: spoken.audio,
|
|
11
|
+
byteLength: spoken.byteLength,
|
|
12
|
+
chunkCount: spoken.chunkCount,
|
|
13
|
+
mimeType: spoken.mimeType,
|
|
14
|
+
modelId: spoken.modelId,
|
|
15
|
+
voiceId: spoken.voiceId,
|
|
16
|
+
};
|
|
17
|
+
}
|
|
18
|
+
function aggregateSegments(segments) {
|
|
19
|
+
const first = segments[0].tts;
|
|
20
|
+
const audio = Buffer.concat(segments.map((segment) => Buffer.from(segment.tts.audio)));
|
|
21
|
+
return {
|
|
22
|
+
status: "delivered",
|
|
23
|
+
audio,
|
|
24
|
+
byteLength: audio.byteLength,
|
|
25
|
+
chunkCount: segments.reduce((sum, segment) => sum + segment.tts.chunkCount, 0),
|
|
26
|
+
mimeType: first.mimeType,
|
|
27
|
+
modelId: first.modelId,
|
|
28
|
+
voiceId: first.voiceId,
|
|
29
|
+
};
|
|
30
|
+
}
|
|
31
|
+
function deliveryErrorMessage(error) {
|
|
32
|
+
return error instanceof Error ? error.message : String(error);
|
|
33
|
+
}
|
|
7
34
|
async function runVoiceLoopbackTurn(options) {
|
|
8
35
|
const runSenseTurn = options.runSenseTurn ?? shared_turn_1.runSenseTurn;
|
|
9
36
|
let userMessage;
|
|
@@ -31,30 +58,105 @@ async function runVoiceLoopbackTurn(options) {
|
|
|
31
58
|
utteranceId: options.transcript.utteranceId,
|
|
32
59
|
},
|
|
33
60
|
});
|
|
61
|
+
const speechSegments = [];
|
|
62
|
+
const speechDeliveryErrors = [];
|
|
63
|
+
let deliveryIndex = 0;
|
|
64
|
+
const synthesizeDelivery = async (delivery) => {
|
|
65
|
+
deliveryIndex += 1;
|
|
66
|
+
const segmentUtteranceId = `${options.transcript.utteranceId}-${deliveryIndex}-${delivery.kind}`;
|
|
67
|
+
try {
|
|
68
|
+
const spoken = await options.tts.synthesize({
|
|
69
|
+
utteranceId: segmentUtteranceId,
|
|
70
|
+
text: delivery.text,
|
|
71
|
+
onAudioChunk: options.onAudioChunk,
|
|
72
|
+
});
|
|
73
|
+
speechSegments.push({
|
|
74
|
+
kind: delivery.kind,
|
|
75
|
+
text: delivery.text,
|
|
76
|
+
utteranceId: segmentUtteranceId,
|
|
77
|
+
tts: deliveredTts(spoken),
|
|
78
|
+
});
|
|
79
|
+
}
|
|
80
|
+
catch (error) {
|
|
81
|
+
const failure = {
|
|
82
|
+
kind: delivery.kind,
|
|
83
|
+
text: delivery.text,
|
|
84
|
+
utteranceId: segmentUtteranceId,
|
|
85
|
+
error: deliveryErrorMessage(error),
|
|
86
|
+
};
|
|
87
|
+
speechDeliveryErrors.push(failure);
|
|
88
|
+
throw error;
|
|
89
|
+
}
|
|
90
|
+
};
|
|
34
91
|
const turn = await runSenseTurn({
|
|
35
92
|
agentName: options.agentName,
|
|
36
93
|
channel: "voice",
|
|
37
94
|
friendId: options.friendId,
|
|
38
95
|
sessionKey: options.sessionKey,
|
|
39
96
|
userMessage,
|
|
97
|
+
deliverySink: { onDelivery: synthesizeDelivery },
|
|
40
98
|
});
|
|
99
|
+
if (speechSegments.length > 0) {
|
|
100
|
+
const tts = aggregateSegments(speechSegments);
|
|
101
|
+
const result = {
|
|
102
|
+
responseText: turn.response,
|
|
103
|
+
ponderDeferred: turn.ponderDeferred,
|
|
104
|
+
tts,
|
|
105
|
+
speechSegments,
|
|
106
|
+
speechDeliveryErrors,
|
|
107
|
+
};
|
|
108
|
+
(0, runtime_1.emitNervesEvent)({
|
|
109
|
+
component: "senses",
|
|
110
|
+
event: "senses.voice_turn_end",
|
|
111
|
+
message: "voice loopback turn delivered speech",
|
|
112
|
+
meta: {
|
|
113
|
+
utteranceId: options.transcript.utteranceId,
|
|
114
|
+
responseLength: turn.response.length,
|
|
115
|
+
segmentCount: speechSegments.length,
|
|
116
|
+
byteLength: tts.byteLength,
|
|
117
|
+
},
|
|
118
|
+
});
|
|
119
|
+
return result;
|
|
120
|
+
}
|
|
121
|
+
const turnDeliveryFailures = turn.deliveryFailures ?? [];
|
|
122
|
+
if (speechDeliveryErrors.length > 0 || turnDeliveryFailures.length > 0) {
|
|
123
|
+
const firstError = speechDeliveryErrors[0]?.error ?? turnDeliveryFailures[0].error;
|
|
124
|
+
(0, runtime_1.emitNervesEvent)({
|
|
125
|
+
level: "error",
|
|
126
|
+
component: "senses",
|
|
127
|
+
event: "senses.voice_turn_tts_error",
|
|
128
|
+
message: "voice loopback TTS failed after text response",
|
|
129
|
+
meta: { utteranceId: options.transcript.utteranceId, error: firstError, responseLength: turn.response.length },
|
|
130
|
+
});
|
|
131
|
+
return {
|
|
132
|
+
responseText: turn.response,
|
|
133
|
+
ponderDeferred: turn.ponderDeferred,
|
|
134
|
+
tts: {
|
|
135
|
+
status: "failed",
|
|
136
|
+
error: firstError,
|
|
137
|
+
},
|
|
138
|
+
speechSegments,
|
|
139
|
+
speechDeliveryErrors,
|
|
140
|
+
};
|
|
141
|
+
}
|
|
41
142
|
try {
|
|
42
143
|
const spoken = await options.tts.synthesize({
|
|
43
144
|
utteranceId: options.transcript.utteranceId,
|
|
44
145
|
text: turn.response,
|
|
146
|
+
onAudioChunk: options.onAudioChunk,
|
|
45
147
|
});
|
|
148
|
+
const tts = deliveredTts(spoken);
|
|
46
149
|
const result = {
|
|
47
150
|
responseText: turn.response,
|
|
48
151
|
ponderDeferred: turn.ponderDeferred,
|
|
49
|
-
tts
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
},
|
|
152
|
+
tts,
|
|
153
|
+
speechSegments: [{
|
|
154
|
+
kind: "text",
|
|
155
|
+
text: turn.response,
|
|
156
|
+
utteranceId: options.transcript.utteranceId,
|
|
157
|
+
tts,
|
|
158
|
+
}],
|
|
159
|
+
speechDeliveryErrors,
|
|
58
160
|
};
|
|
59
161
|
(0, runtime_1.emitNervesEvent)({
|
|
60
162
|
component: "senses",
|
|
@@ -80,6 +182,8 @@ async function runVoiceLoopbackTurn(options) {
|
|
|
80
182
|
status: "failed",
|
|
81
183
|
error: message,
|
|
82
184
|
},
|
|
185
|
+
speechSegments,
|
|
186
|
+
speechDeliveryErrors,
|
|
83
187
|
};
|
|
84
188
|
}
|
|
85
189
|
}
|
|
@@ -193,6 +193,8 @@ function resolveTwilioPhoneTransportRuntime(options) {
|
|
|
193
193
|
recordMaxLengthSeconds: overrides.recordMaxLengthSeconds
|
|
194
194
|
?? configNumber(options.machineConfig, "voice.twilioRecordMaxLengthSeconds")
|
|
195
195
|
?? twilio_phone_1.DEFAULT_TWILIO_RECORD_MAX_LENGTH_SECONDS,
|
|
196
|
+
playbackMode: overrides.playbackMode
|
|
197
|
+
?? (0, twilio_phone_1.normalizeTwilioPhonePlaybackMode)(configString(options.machineConfig, "voice.twilioPlaybackMode") ?? twilio_phone_1.DEFAULT_TWILIO_PHONE_PLAYBACK_MODE),
|
|
196
198
|
};
|
|
197
199
|
return { status: "configured", settings };
|
|
198
200
|
}
|
|
@@ -259,6 +261,7 @@ async function startConfiguredTwilioPhoneTransport(options, deps = defaultTwilio
|
|
|
259
261
|
defaultFriendId: settings.defaultFriendId,
|
|
260
262
|
recordTimeoutSeconds: settings.recordTimeoutSeconds,
|
|
261
263
|
recordMaxLengthSeconds: settings.recordMaxLengthSeconds,
|
|
264
|
+
playbackMode: settings.playbackMode,
|
|
262
265
|
});
|
|
263
266
|
(0, runtime_1.emitNervesEvent)({
|
|
264
267
|
component: "senses",
|
|
@@ -33,9 +33,11 @@ var __importStar = (this && this.__importStar) || (function () {
|
|
|
33
33
|
};
|
|
34
34
|
})();
|
|
35
35
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
36
|
-
exports.TWILIO_PHONE_WEBHOOK_BASE_PATH = exports.DEFAULT_TWILIO_RECORD_MAX_LENGTH_SECONDS = exports.DEFAULT_TWILIO_RECORD_TIMEOUT_SECONDS = exports.DEFAULT_TWILIO_PHONE_PORT = void 0;
|
|
36
|
+
exports.DEFAULT_TWILIO_PHONE_PLAYBACK_MODE = exports.TWILIO_PHONE_WEBHOOK_BASE_PATH = exports.DEFAULT_TWILIO_RECORD_MAX_LENGTH_SECONDS = exports.DEFAULT_TWILIO_RECORD_TIMEOUT_SECONDS = exports.DEFAULT_TWILIO_PHONE_PORT = void 0;
|
|
37
37
|
exports.normalizeTwilioPhoneBasePath = normalizeTwilioPhoneBasePath;
|
|
38
|
+
exports.normalizeTwilioPhonePlaybackMode = normalizeTwilioPhonePlaybackMode;
|
|
38
39
|
exports.twilioPhoneWebhookUrl = twilioPhoneWebhookUrl;
|
|
40
|
+
exports.twilioPhoneVoiceSessionKey = twilioPhoneVoiceSessionKey;
|
|
39
41
|
exports.computeTwilioSignature = computeTwilioSignature;
|
|
40
42
|
exports.validateTwilioSignature = validateTwilioSignature;
|
|
41
43
|
exports.twilioRecordingMediaUrl = twilioRecordingMediaUrl;
|
|
@@ -54,6 +56,7 @@ exports.DEFAULT_TWILIO_PHONE_PORT = 18910;
|
|
|
54
56
|
exports.DEFAULT_TWILIO_RECORD_TIMEOUT_SECONDS = 2;
|
|
55
57
|
exports.DEFAULT_TWILIO_RECORD_MAX_LENGTH_SECONDS = 30;
|
|
56
58
|
exports.TWILIO_PHONE_WEBHOOK_BASE_PATH = "/voice/twilio";
|
|
59
|
+
exports.DEFAULT_TWILIO_PHONE_PLAYBACK_MODE = "stream";
|
|
57
60
|
function bodyText(body) {
|
|
58
61
|
if (body === undefined)
|
|
59
62
|
return "";
|
|
@@ -104,6 +107,21 @@ function binaryResponse(body, contentType) {
|
|
|
104
107
|
body,
|
|
105
108
|
};
|
|
106
109
|
}
|
|
110
|
+
function streamResponse(body, contentType) {
|
|
111
|
+
return {
|
|
112
|
+
statusCode: 200,
|
|
113
|
+
headers: {
|
|
114
|
+
"content-type": contentType,
|
|
115
|
+
"cache-control": "no-store",
|
|
116
|
+
},
|
|
117
|
+
body,
|
|
118
|
+
};
|
|
119
|
+
}
|
|
120
|
+
function isAsyncIterableBody(body) {
|
|
121
|
+
return typeof body === "object"
|
|
122
|
+
&& body !== null
|
|
123
|
+
&& Symbol.asyncIterator in body;
|
|
124
|
+
}
|
|
107
125
|
function escapeXml(input) {
|
|
108
126
|
return input
|
|
109
127
|
.replace(/&/g, "&amp;")
|
|
@@ -127,6 +145,12 @@ function normalizeTwilioPhoneBasePath(value = exports.TWILIO_PHONE_WEBHOOK_BASE_
|
|
|
127
145
|
}
|
|
128
146
|
return withoutTrailingSlash;
|
|
129
147
|
}
|
|
148
|
+
function normalizeTwilioPhonePlaybackMode(value) {
|
|
149
|
+
const normalized = (value ?? exports.DEFAULT_TWILIO_PHONE_PLAYBACK_MODE).trim().toLowerCase();
|
|
150
|
+
if (normalized === "stream" || normalized === "buffered")
|
|
151
|
+
return normalized;
|
|
152
|
+
throw new Error(`invalid Twilio phone playback mode: ${value}`);
|
|
153
|
+
}
|
|
130
154
|
function twilioPhoneWebhookUrl(publicBaseUrl, basePath = exports.TWILIO_PHONE_WEBHOOK_BASE_PATH) {
|
|
131
155
|
return routeUrl(publicBaseUrl, `${normalizeTwilioPhoneBasePath(basePath)}/incoming`);
|
|
132
156
|
}
|
|
@@ -179,6 +203,25 @@ function friendIdFromCaller(from, callSid) {
|
|
|
179
203
|
function voiceFriendId(options, from, callSid) {
|
|
180
204
|
return options.defaultFriendId?.trim() || friendIdFromCaller(from, callSid);
|
|
181
205
|
}
|
|
206
|
+
function phoneIdentitySegment(input) {
|
|
207
|
+
const phoneish = input.replace(/[^0-9A-Za-z]+/g, "");
|
|
208
|
+
return phoneish || safeSegment(input);
|
|
209
|
+
}
|
|
210
|
+
function twilioPhoneVoiceSessionKey(options) {
|
|
211
|
+
const friendSegment = options.defaultFriendId?.trim()
|
|
212
|
+
? safeSegment(options.defaultFriendId)
|
|
213
|
+
: options.from?.trim()
|
|
214
|
+
? phoneIdentitySegment(options.from)
|
|
215
|
+
: "";
|
|
216
|
+
const lineSegment = options.to?.trim() ? phoneIdentitySegment(options.to) : "";
|
|
217
|
+
if (friendSegment && lineSegment)
|
|
218
|
+
return `twilio-phone-${friendSegment}-via-${lineSegment}`;
|
|
219
|
+
if (friendSegment)
|
|
220
|
+
return `twilio-phone-${friendSegment}`;
|
|
221
|
+
if (lineSegment)
|
|
222
|
+
return `twilio-phone-line-${lineSegment}`;
|
|
223
|
+
return `twilio-phone-${safeSegment(options.callSid ?? "incoming")}`;
|
|
224
|
+
}
|
|
182
225
|
function callConnectedPrompt(params) {
|
|
183
226
|
const from = params.From?.trim();
|
|
184
227
|
const to = params.To?.trim();
|
|
@@ -204,6 +247,35 @@ function isNoSpeechTranscript(text) {
|
|
|
204
247
|
|| normalized === "[NO_SPEECH]"
|
|
205
248
|
|| normalized === "NO_SPEECH";
|
|
206
249
|
}
|
|
250
|
+
function isNoSpeechTranscriptionError(error) {
|
|
251
|
+
const normalized = errorMessage(error).toLowerCase();
|
|
252
|
+
return normalized.includes("empty whisper.cpp transcript")
|
|
253
|
+
|| normalized.includes("voice transcript text is empty");
|
|
254
|
+
}
|
|
255
|
+
function buildNoSpeechTranscript(utteranceId) {
|
|
256
|
+
return (0, transcript_1.buildVoiceTranscript)({
|
|
257
|
+
utteranceId: `${utteranceId}-nospeech`,
|
|
258
|
+
text: noSpeechPrompt(),
|
|
259
|
+
source: "loopback",
|
|
260
|
+
});
|
|
261
|
+
}
|
|
262
|
+
async function transcribeRecordingOrNoSpeech(options) {
|
|
263
|
+
try {
|
|
264
|
+
const transcript = await options.transcriber.transcribe({
|
|
265
|
+
utteranceId: options.utteranceId,
|
|
266
|
+
audioPath: options.inputPath,
|
|
267
|
+
});
|
|
268
|
+
return isNoSpeechTranscript(transcript.text)
|
|
269
|
+
? buildNoSpeechTranscript(options.utteranceId)
|
|
270
|
+
: transcript;
|
|
271
|
+
}
|
|
272
|
+
catch (error) {
|
|
273
|
+
if (isNoSpeechTranscriptionError(error)) {
|
|
274
|
+
return buildNoSpeechTranscript(options.utteranceId);
|
|
275
|
+
}
|
|
276
|
+
throw error;
|
|
277
|
+
}
|
|
278
|
+
}
|
|
207
279
|
function parseRecordingParams(params) {
|
|
208
280
|
const callSid = params.CallSid?.trim();
|
|
209
281
|
const recordingSid = params.RecordingSid?.trim();
|
|
@@ -215,6 +287,7 @@ function parseRecordingParams(params) {
|
|
|
215
287
|
recordingSid,
|
|
216
288
|
recordingUrl,
|
|
217
289
|
from: params.From?.trim() ?? "",
|
|
290
|
+
to: params.To?.trim() ?? "",
|
|
218
291
|
};
|
|
219
292
|
}
|
|
220
293
|
function recordAgainResponse(options, basePath, message) {
|
|
@@ -238,6 +311,180 @@ function nextInputTwiml(options, basePath, mode) {
|
|
|
238
311
|
maxLengthSeconds: options.recordMaxLengthSeconds ?? exports.DEFAULT_TWILIO_RECORD_MAX_LENGTH_SECONDS,
|
|
239
312
|
});
|
|
240
313
|
}
|
|
314
|
+
class TwilioAudioStreamJob {
|
|
315
|
+
callSid;
|
|
316
|
+
jobId;
|
|
317
|
+
mimeType;
|
|
318
|
+
chunks = [];
|
|
319
|
+
waiters = new Set();
|
|
320
|
+
status = "pending";
|
|
321
|
+
failure = null;
|
|
322
|
+
byteLength = 0;
|
|
323
|
+
constructor(callSid, jobId, mimeType) {
|
|
324
|
+
this.callSid = callSid;
|
|
325
|
+
this.jobId = jobId;
|
|
326
|
+
this.mimeType = mimeType;
|
|
327
|
+
}
|
|
328
|
+
append(chunk) {
|
|
329
|
+
/* v8 ignore next -- append is only called while pending with non-empty chunks in bridge flow @preserve */
|
|
330
|
+
if (this.status !== "pending" || chunk.byteLength === 0)
|
|
331
|
+
return;
|
|
332
|
+
const buffered = Buffer.from(chunk);
|
|
333
|
+
this.chunks.push(buffered);
|
|
334
|
+
this.byteLength += buffered.byteLength;
|
|
335
|
+
this.notify();
|
|
336
|
+
}
|
|
337
|
+
complete() {
|
|
338
|
+
/* v8 ignore next -- completion is single-shot inside startTwilioPlaybackStreamJob @preserve */
|
|
339
|
+
if (this.status !== "pending")
|
|
340
|
+
return;
|
|
341
|
+
this.status = "completed";
|
|
342
|
+
this.notify();
|
|
343
|
+
}
|
|
344
|
+
fail(error) {
|
|
345
|
+
/* v8 ignore next -- failure is single-shot inside startTwilioPlaybackStreamJob @preserve */
|
|
346
|
+
if (this.status !== "pending")
|
|
347
|
+
return;
|
|
348
|
+
this.status = "failed";
|
|
349
|
+
this.failure = errorMessage(error);
|
|
350
|
+
this.notify();
|
|
351
|
+
}
|
|
352
|
+
async *stream() {
|
|
353
|
+
let index = 0;
|
|
354
|
+
let yielded = false;
|
|
355
|
+
for (;;) {
|
|
356
|
+
while (index < this.chunks.length) {
|
|
357
|
+
yielded = true;
|
|
358
|
+
yield this.chunks[index++];
|
|
359
|
+
}
|
|
360
|
+
if (this.status === "completed")
|
|
361
|
+
return;
|
|
362
|
+
if (this.status === "failed") {
|
|
363
|
+
if (yielded)
|
|
364
|
+
return;
|
|
365
|
+
throw new Error(this.failure);
|
|
366
|
+
}
|
|
367
|
+
await new Promise((resolve) => {
|
|
368
|
+
this.waiters.add(resolve);
|
|
369
|
+
});
|
|
370
|
+
}
|
|
371
|
+
}
|
|
372
|
+
notify() {
|
|
373
|
+
const waiters = [...this.waiters];
|
|
374
|
+
this.waiters.clear();
|
|
375
|
+
for (const waiter of waiters)
|
|
376
|
+
waiter();
|
|
377
|
+
}
|
|
378
|
+
}
|
|
379
|
+
class TwilioAudioStreamJobStore {
|
|
380
|
+
jobs = new Map();
|
|
381
|
+
create(callSid, jobId, mimeType = "audio/mpeg") {
|
|
382
|
+
const key = this.key(callSid, jobId);
|
|
383
|
+
const job = new TwilioAudioStreamJob(callSid, jobId, mimeType);
|
|
384
|
+
this.jobs.set(key, job);
|
|
385
|
+
return job;
|
|
386
|
+
}
|
|
387
|
+
get(callSid, jobId) {
|
|
388
|
+
return this.jobs.get(this.key(callSid, jobId)) ?? null;
|
|
389
|
+
}
|
|
390
|
+
/* v8 ignore start -- stream job cleanup is delayed beyond request-scope tests @preserve */
|
|
391
|
+
delete(callSid, jobId) {
|
|
392
|
+
this.jobs.delete(this.key(callSid, jobId));
|
|
393
|
+
}
|
|
394
|
+
/* v8 ignore stop */
|
|
395
|
+
key(callSid, jobId) {
|
|
396
|
+
return `${callSid}/${jobId}`;
|
|
397
|
+
}
|
|
398
|
+
}
|
|
399
|
+
function deliveredSegments(turn) {
|
|
400
|
+
return turn.speechSegments.map((segment) => segment.tts);
|
|
401
|
+
}
|
|
402
|
+
async function writeVoiceTurnPlaybackArtifacts(options) {
|
|
403
|
+
const urls = [];
|
|
404
|
+
for (const segment of options.turn.speechSegments) {
|
|
405
|
+
const playback = await (0, playback_1.writeVoicePlaybackArtifact)({
|
|
406
|
+
utteranceId: segment.utteranceId,
|
|
407
|
+
delivery: segment.tts,
|
|
408
|
+
outputDir: options.callDir,
|
|
409
|
+
});
|
|
410
|
+
urls.push(routeUrl(options.bridgeOptions.publicBaseUrl, `${options.basePath}/audio/${encodeURIComponent(options.safeCallSid)}/${encodeURIComponent(path.basename(playback.audioPath))}`));
|
|
411
|
+
}
|
|
412
|
+
return urls;
|
|
413
|
+
}
|
|
414
|
+
/**
 * Concatenates the TwiML produced by `playTwiml` for each URL, in order.
 * @param urls list of audio URLs
 * @returns the joined TwiML string (empty string for an empty list)
 */
function playManyTwiml(urls) {
    // Call playTwiml with the URL only: passing it straight to map() would also
    // hand it the element index and the array as extra arguments.
    return urls.map((url) => playTwiml(url)).join("");
}
|
|
417
|
+
/**
 * Builds the public URL of the `/audio-stream/` route for a playback job.
 * The job id is exposed as a file name with an `.mp3` suffix.
 * @returns whatever `routeUrl` produces for the public base URL and the route path
 */
function streamAudioUrl(options, basePath, safeCallSid, jobId) {
    const callSegment = encodeURIComponent(safeCallSid);
    const fileSegment = encodeURIComponent(`${jobId}.mp3`);
    const route = `${basePath}/audio-stream/${callSegment}/${fileSegment}`;
    return routeUrl(options.publicBaseUrl, route);
}
|
|
420
|
+
/**
 * Arranges for a stream job to be removed from the store five minutes from now.
 * The timer is unref'd (when the handle supports it) so it does not keep the process alive.
 */
function scheduleJobCleanup(jobs, safeCallSid, jobId) {
    /* v8 ignore start -- stream job cleanup is delayed beyond request-scope tests @preserve */
    const retentionMs = 5 * 60_000;
    const timer = setTimeout(() => {
        jobs.delete(safeCallSid, jobId);
    }, retentionMs);
    timer.unref?.();
    /* v8 ignore stop */
}
|
|
428
|
+
/**
 * Creates a stream job and starts, in the background, the voice turn that feeds it.
 *
 * The job is returned immediately; the turn runs asynchronously:
 *  - audio chunks reported by `options.runTurn` are appended to the job as they arrive;
 *  - if nothing was streamed, the audio of the delivered segments is appended afterwards;
 *  - playback artifacts are then persisted (a failure there is only logged as a warning);
 *  - the job is completed, or failed when the turn throws or yields no segment;
 *  - in every case the job's removal from the store is scheduled.
 *
 * @param options reads jobs, safeCallSid, jobId, runTurn, bridgeOptions, basePath, callDir, baseUtteranceId and meta
 * @returns the created job
 */
function startTwilioPlaybackStreamJob(options) {
    const job = options.jobs.create(options.safeCallSid, options.jobId);
    const runJob = async () => {
        try {
            const turn = await options.runTurn((chunk) => job.append(chunk));
            const deliveries = deliveredSegments(turn);
            if (deliveries.length === 0) {
                /* v8 ignore next -- runVoiceLoopbackTurn cannot return delivered TTS with zero speech segments @preserve */
                if (turn.tts.status === "failed")
                    throw new Error(turn.tts.error);
                /* v8 ignore next -- runVoiceLoopbackTurn emits a speech segment whenever TTS is delivered @preserve */
                throw new Error("voice turn produced no audio");
            }
            // Nothing arrived through the chunk callback: fall back to the complete segment audio.
            if (job.byteLength === 0) {
                for (const delivery of deliveries) {
                    job.append(delivery.audio);
                }
            }
            try {
                await writeVoiceTurnPlaybackArtifacts({
                    bridgeOptions: options.bridgeOptions,
                    basePath: options.basePath,
                    callDir: options.callDir,
                    safeCallSid: options.safeCallSid,
                    baseUtteranceId: options.baseUtteranceId,
                    turn,
                });
            }
            catch (artifactError) {
                // The audio is already in the job, so a persistence failure must not fail playback.
                (0, runtime_1.emitNervesEvent)({
                    level: "warn",
                    component: "senses",
                    event: "senses.voice_twilio_stream_artifact_error",
                    message: "Twilio stream audio was delivered but artifact persistence failed",
                    meta: { ...options.meta, error: errorMessage(artifactError) },
                });
            }
            job.complete();
            (0, runtime_1.emitNervesEvent)({
                component: "senses",
                event: "senses.voice_twilio_stream_end",
                message: "finished Twilio streaming voice playback job",
                meta: { ...options.meta, byteLength: String(job.byteLength), segmentCount: String(deliveries.length) },
            });
        }
        catch (error) {
            job.fail(error);
            (0, runtime_1.emitNervesEvent)({
                level: "error",
                component: "senses",
                event: "senses.voice_twilio_stream_error",
                message: "Twilio streaming voice playback job failed",
                meta: { ...options.meta, error: errorMessage(error) },
            });
        }
        finally {
            scheduleJobCleanup(options.jobs, options.safeCallSid, options.jobId);
        }
    };
    // Deliberately not awaited: the caller responds while the turn is still running.
    void runJob();
    return job;
}
|
|
241
488
|
async function runPhonePromptTurn(options) {
|
|
242
489
|
const transcript = (0, transcript_1.buildVoiceTranscript)({
|
|
243
490
|
utteranceId: options.utteranceId,
|
|
@@ -256,13 +503,15 @@ async function runPhonePromptTurn(options) {
|
|
|
256
503
|
if (turn.tts.status !== "delivered") {
|
|
257
504
|
return xmlResponse(`${sayTwiml("voice output failed after the text response was captured.")}${after}`);
|
|
258
505
|
}
|
|
259
|
-
const
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
506
|
+
const audioUrls = await writeVoiceTurnPlaybackArtifacts({
|
|
507
|
+
bridgeOptions: options.bridgeOptions,
|
|
508
|
+
basePath: options.basePath,
|
|
509
|
+
callDir: options.callDir,
|
|
510
|
+
safeCallSid: options.safeCallSid,
|
|
511
|
+
baseUtteranceId: options.utteranceId,
|
|
512
|
+
turn,
|
|
263
513
|
});
|
|
264
|
-
|
|
265
|
-
return xmlResponse(`${playTwiml(audioUrl)}${after}`);
|
|
514
|
+
return xmlResponse(`${playManyTwiml(audioUrls)}${after}`);
|
|
266
515
|
}
|
|
267
516
|
function computeTwilioSignature(input) {
|
|
268
517
|
const payload = input.url + Object.keys(input.params)
|
|
@@ -309,27 +558,62 @@ function verifyRequest(options, request, params) {
|
|
|
309
558
|
signature: headerValue(request.headers, "x-twilio-signature"),
|
|
310
559
|
});
|
|
311
560
|
}
|
|
312
|
-
async function handleIncoming(options, basePath, params) {
|
|
561
|
+
async function handleIncoming(options, basePath, params, jobs) {
|
|
313
562
|
const callSid = params.CallSid?.trim() || "incoming";
|
|
314
563
|
const safeCallSid = safeSegment(callSid);
|
|
315
564
|
const callDir = path.join(options.outputDir, safeCallSid);
|
|
316
565
|
const utteranceId = `twilio-${safeCallSid}-connected`;
|
|
566
|
+
const friendId = voiceFriendId(options, params.From?.trim() ?? "", callSid);
|
|
567
|
+
const sessionKey = twilioPhoneVoiceSessionKey({
|
|
568
|
+
defaultFriendId: options.defaultFriendId,
|
|
569
|
+
from: params.From?.trim() ?? "",
|
|
570
|
+
to: params.To?.trim() ?? "",
|
|
571
|
+
callSid,
|
|
572
|
+
});
|
|
317
573
|
(0, runtime_1.emitNervesEvent)({
|
|
318
574
|
component: "senses",
|
|
319
575
|
event: "senses.voice_twilio_incoming",
|
|
320
576
|
message: "Twilio voice call connected",
|
|
321
|
-
meta: { agentName: options.agentName, callSid: safeCallSid },
|
|
577
|
+
meta: { agentName: options.agentName, callSid: safeCallSid, sessionKey },
|
|
322
578
|
});
|
|
323
579
|
try {
|
|
324
580
|
await fs.mkdir(callDir, { recursive: true });
|
|
581
|
+
if (normalizeTwilioPhonePlaybackMode(options.playbackMode) === "stream") {
|
|
582
|
+
const transcript = (0, transcript_1.buildVoiceTranscript)({
|
|
583
|
+
utteranceId,
|
|
584
|
+
text: callConnectedPrompt(params),
|
|
585
|
+
source: "loopback",
|
|
586
|
+
});
|
|
587
|
+
const jobId = safeSegment(utteranceId);
|
|
588
|
+
startTwilioPlaybackStreamJob({
|
|
589
|
+
jobs,
|
|
590
|
+
bridgeOptions: options,
|
|
591
|
+
basePath,
|
|
592
|
+
callDir,
|
|
593
|
+
safeCallSid,
|
|
594
|
+
jobId,
|
|
595
|
+
baseUtteranceId: utteranceId,
|
|
596
|
+
runTurn: (onAudioChunk) => (0, turn_1.runVoiceLoopbackTurn)({
|
|
597
|
+
agentName: options.agentName,
|
|
598
|
+
friendId,
|
|
599
|
+
sessionKey,
|
|
600
|
+
transcript,
|
|
601
|
+
tts: options.tts,
|
|
602
|
+
runSenseTurn: options.runSenseTurn,
|
|
603
|
+
onAudioChunk,
|
|
604
|
+
}),
|
|
605
|
+
meta: { agentName: options.agentName, callSid: safeCallSid, utteranceId },
|
|
606
|
+
});
|
|
607
|
+
return xmlResponse(`${playTwiml(streamAudioUrl(options, basePath, safeCallSid, jobId))}${nextInputTwiml(options, basePath, "record")}`);
|
|
608
|
+
}
|
|
325
609
|
return await runPhonePromptTurn({
|
|
326
610
|
bridgeOptions: options,
|
|
327
611
|
basePath,
|
|
328
612
|
callDir,
|
|
329
613
|
safeCallSid,
|
|
330
614
|
utteranceId,
|
|
331
|
-
friendId
|
|
332
|
-
sessionKey
|
|
615
|
+
friendId,
|
|
616
|
+
sessionKey,
|
|
333
617
|
promptText: callConnectedPrompt(params),
|
|
334
618
|
afterPlayback: "record",
|
|
335
619
|
});
|
|
@@ -358,7 +642,7 @@ async function handleListen(options, basePath) {
|
|
|
358
642
|
maxLengthSeconds: options.recordMaxLengthSeconds ?? exports.DEFAULT_TWILIO_RECORD_MAX_LENGTH_SECONDS,
|
|
359
643
|
}));
|
|
360
644
|
}
|
|
361
|
-
async function handleRecording(options, basePath, params) {
|
|
645
|
+
async function handleRecording(options, basePath, params, jobs) {
|
|
362
646
|
const recording = parseRecordingParams(params);
|
|
363
647
|
if (!recording) {
|
|
364
648
|
(0, runtime_1.emitNervesEvent)({
|
|
@@ -376,13 +660,58 @@ async function handleRecording(options, basePath, params) {
|
|
|
376
660
|
const inputPath = path.join(callDir, `${safeRecordingSid}.wav`);
|
|
377
661
|
const utteranceId = `twilio-${safeCallSid}-${safeRecordingSid}`;
|
|
378
662
|
const downloadRecording = options.downloadRecording ?? defaultTwilioRecordingDownloader;
|
|
663
|
+
const friendId = voiceFriendId(options, recording.from, recording.callSid);
|
|
664
|
+
const sessionKey = twilioPhoneVoiceSessionKey({
|
|
665
|
+
defaultFriendId: options.defaultFriendId,
|
|
666
|
+
from: recording.from,
|
|
667
|
+
to: recording.to,
|
|
668
|
+
callSid: recording.callSid,
|
|
669
|
+
});
|
|
379
670
|
(0, runtime_1.emitNervesEvent)({
|
|
380
671
|
component: "senses",
|
|
381
672
|
event: "senses.voice_twilio_turn_start",
|
|
382
673
|
message: "starting Twilio voice turn",
|
|
383
|
-
meta: { agentName: options.agentName, callSid: safeCallSid, recordingSid: safeRecordingSid },
|
|
674
|
+
meta: { agentName: options.agentName, callSid: safeCallSid, recordingSid: safeRecordingSid, sessionKey },
|
|
384
675
|
});
|
|
385
676
|
try {
|
|
677
|
+
if (normalizeTwilioPhonePlaybackMode(options.playbackMode) === "stream") {
|
|
678
|
+
const jobId = safeSegment(utteranceId);
|
|
679
|
+
startTwilioPlaybackStreamJob({
|
|
680
|
+
jobs,
|
|
681
|
+
bridgeOptions: options,
|
|
682
|
+
basePath,
|
|
683
|
+
callDir,
|
|
684
|
+
safeCallSid,
|
|
685
|
+
jobId,
|
|
686
|
+
baseUtteranceId: utteranceId,
|
|
687
|
+
runTurn: async (onAudioChunk) => {
|
|
688
|
+
await fs.mkdir(callDir, { recursive: true });
|
|
689
|
+
const mediaUrl = twilioRecordingMediaUrl(recording.recordingUrl);
|
|
690
|
+
const audio = await downloadRecording({
|
|
691
|
+
recordingUrl: mediaUrl,
|
|
692
|
+
accountSid: options.twilioAccountSid?.trim() || undefined,
|
|
693
|
+
authToken: options.twilioAuthToken?.trim() || undefined,
|
|
694
|
+
});
|
|
695
|
+
await fs.writeFile(inputPath, audio);
|
|
696
|
+
const turnTranscript = await transcribeRecordingOrNoSpeech({
|
|
697
|
+
transcriber: options.transcriber,
|
|
698
|
+
utteranceId,
|
|
699
|
+
inputPath,
|
|
700
|
+
});
|
|
701
|
+
return (0, turn_1.runVoiceLoopbackTurn)({
|
|
702
|
+
agentName: options.agentName,
|
|
703
|
+
friendId,
|
|
704
|
+
sessionKey,
|
|
705
|
+
transcript: turnTranscript,
|
|
706
|
+
tts: options.tts,
|
|
707
|
+
runSenseTurn: options.runSenseTurn,
|
|
708
|
+
onAudioChunk,
|
|
709
|
+
});
|
|
710
|
+
},
|
|
711
|
+
meta: { agentName: options.agentName, callSid: safeCallSid, recordingSid: safeRecordingSid, utteranceId },
|
|
712
|
+
});
|
|
713
|
+
return xmlResponse(`${playTwiml(streamAudioUrl(options, basePath, safeCallSid, jobId))}${redirectTwiml(options.publicBaseUrl, basePath)}`);
|
|
714
|
+
}
|
|
386
715
|
await fs.mkdir(callDir, { recursive: true });
|
|
387
716
|
const mediaUrl = twilioRecordingMediaUrl(recording.recordingUrl);
|
|
388
717
|
const audio = await downloadRecording({
|
|
@@ -391,27 +720,28 @@ async function handleRecording(options, basePath, params) {
|
|
|
391
720
|
authToken: options.twilioAuthToken?.trim() || undefined,
|
|
392
721
|
});
|
|
393
722
|
await fs.writeFile(inputPath, audio);
|
|
394
|
-
const transcript = await
|
|
723
|
+
const transcript = await transcribeRecordingOrNoSpeech({
|
|
724
|
+
transcriber: options.transcriber,
|
|
395
725
|
utteranceId,
|
|
396
|
-
|
|
726
|
+
inputPath,
|
|
397
727
|
});
|
|
398
|
-
if (
|
|
728
|
+
if (transcript.utteranceId === `${utteranceId}-nospeech`) {
|
|
399
729
|
return await runPhonePromptTurn({
|
|
400
730
|
bridgeOptions: options,
|
|
401
731
|
basePath,
|
|
402
732
|
callDir,
|
|
403
733
|
safeCallSid,
|
|
404
734
|
utteranceId: `${utteranceId}-nospeech`,
|
|
405
|
-
friendId
|
|
406
|
-
sessionKey
|
|
735
|
+
friendId,
|
|
736
|
+
sessionKey,
|
|
407
737
|
promptText: noSpeechPrompt(),
|
|
408
738
|
afterPlayback: "redirect",
|
|
409
739
|
});
|
|
410
740
|
}
|
|
411
741
|
const turn = await (0, turn_1.runVoiceLoopbackTurn)({
|
|
412
742
|
agentName: options.agentName,
|
|
413
|
-
friendId
|
|
414
|
-
sessionKey
|
|
743
|
+
friendId,
|
|
744
|
+
sessionKey,
|
|
415
745
|
transcript,
|
|
416
746
|
tts: options.tts,
|
|
417
747
|
runSenseTurn: options.runSenseTurn,
|
|
@@ -419,19 +749,21 @@ async function handleRecording(options, basePath, params) {
|
|
|
419
749
|
if (turn.tts.status !== "delivered") {
|
|
420
750
|
return xmlResponse(`${sayTwiml("voice output failed after the text response was captured.")}${redirectTwiml(options.publicBaseUrl, basePath)}`);
|
|
421
751
|
}
|
|
422
|
-
const
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
752
|
+
const audioUrls = await writeVoiceTurnPlaybackArtifacts({
|
|
753
|
+
bridgeOptions: options,
|
|
754
|
+
basePath,
|
|
755
|
+
callDir,
|
|
756
|
+
safeCallSid,
|
|
757
|
+
baseUtteranceId: utteranceId,
|
|
758
|
+
turn,
|
|
426
759
|
});
|
|
427
|
-
const audioUrl = routeUrl(options.publicBaseUrl, `${basePath}/audio/${encodeURIComponent(safeCallSid)}/${encodeURIComponent(path.basename(playback.audioPath))}`);
|
|
428
760
|
(0, runtime_1.emitNervesEvent)({
|
|
429
761
|
component: "senses",
|
|
430
762
|
event: "senses.voice_twilio_turn_end",
|
|
431
763
|
message: "finished Twilio voice turn",
|
|
432
|
-
meta: { agentName: options.agentName, callSid: safeCallSid, recordingSid: safeRecordingSid,
|
|
764
|
+
meta: { agentName: options.agentName, callSid: safeCallSid, recordingSid: safeRecordingSid, playbackCount: audioUrls.length },
|
|
433
765
|
});
|
|
434
|
-
return xmlResponse(`${
|
|
766
|
+
return xmlResponse(`${playManyTwiml(audioUrls)}${redirectTwiml(options.publicBaseUrl, basePath)}`);
|
|
435
767
|
}
|
|
436
768
|
catch (error) {
|
|
437
769
|
(0, runtime_1.emitNervesEvent)({
|
|
@@ -477,9 +809,34 @@ async function handleAudio(options, basePath, requestPath) {
|
|
|
477
809
|
return textResponse(404, "not found");
|
|
478
810
|
}
|
|
479
811
|
}
|
|
812
|
+
/**
 * Serves a streaming playback job for `GET {basePath}/audio-stream/{callSid}/{file}`.
 *
 * The query string is ignored. The path must hold exactly two segments after the
 * prefix, both accepted by `decodeSafeSegment`; the job id is the file name with
 * its trailing alphanumeric extension removed. Any mismatch, or an unknown job,
 * produces a 404 text response.
 *
 * @returns a stream response over the job's audio with the job's MIME type, or a 404 response
 */
async function handleAudioStream(options, basePath, requestPath, jobs) {
    const [pathOnly] = requestPath.split("?");
    const segments = pathOnly.slice(`${basePath}/audio-stream/`.length).split("/");
    if (segments.length !== 2) {
        return textResponse(404, "not found");
    }
    const callSid = decodeSafeSegment(segments[0]);
    const fileName = decodeSafeSegment(segments[1]);
    if (!(callSid && fileName)) {
        return textResponse(404, "not found");
    }
    const jobId = fileName.replace(/\.[A-Za-z0-9]+$/, "");
    const job = jobs.get(callSid, jobId);
    if (!job) {
        return textResponse(404, "not found");
    }
    (0, runtime_1.emitNervesEvent)({
        component: "senses",
        event: "senses.voice_twilio_stream_served",
        message: "served Twilio voice streaming audio job",
        meta: { agentName: options.agentName, callSid, jobId },
    });
    return streamResponse(job.stream(), job.mimeType);
}
|
|
480
836
|
function createTwilioPhoneBridge(options) {
|
|
481
837
|
new URL(options.publicBaseUrl);
|
|
482
838
|
const basePath = normalizeTwilioPhoneBasePath(options.basePath);
|
|
839
|
+
const jobs = new TwilioAudioStreamJobStore();
|
|
483
840
|
return {
|
|
484
841
|
async handle(request) {
|
|
485
842
|
const method = request.method.toUpperCase();
|
|
@@ -488,6 +845,9 @@ function createTwilioPhoneBridge(options) {
|
|
|
488
845
|
if (method === "GET" && requestPath.startsWith(`${basePath}/audio/`)) {
|
|
489
846
|
return handleAudio(options, basePath, requestPath);
|
|
490
847
|
}
|
|
848
|
+
if (method === "GET" && requestPath.startsWith(`${basePath}/audio-stream/`)) {
|
|
849
|
+
return handleAudioStream(options, basePath, requestPath, jobs);
|
|
850
|
+
}
|
|
491
851
|
if (method === "GET" && routePath === `${basePath}/health`) {
|
|
492
852
|
return textResponse(200, "ok");
|
|
493
853
|
}
|
|
@@ -505,11 +865,11 @@ function createTwilioPhoneBridge(options) {
|
|
|
505
865
|
return textResponse(403, "invalid Twilio signature");
|
|
506
866
|
}
|
|
507
867
|
if (routePath === `${basePath}/incoming`)
|
|
508
|
-
return handleIncoming(options, basePath, params);
|
|
868
|
+
return handleIncoming(options, basePath, params, jobs);
|
|
509
869
|
if (routePath === `${basePath}/listen`)
|
|
510
870
|
return handleListen(options, basePath);
|
|
511
871
|
if (routePath === `${basePath}/recording`)
|
|
512
|
-
return handleRecording(options, basePath, params);
|
|
872
|
+
return handleRecording(options, basePath, params, jobs);
|
|
513
873
|
return textResponse(404, "not found");
|
|
514
874
|
},
|
|
515
875
|
};
|
|
@@ -531,6 +891,35 @@ function readRequestBody(req, limitBytes = 1_000_000) {
|
|
|
531
891
|
req.on("error", reject);
|
|
532
892
|
});
|
|
533
893
|
}
|
|
894
|
+
/* v8 ignore start -- HTTP backpressure is platform-dependent in unit tests @preserve */
|
|
895
|
+
/**
 * Waits until a writable response signals that buffered data has drained.
 *
 * Resolves on "drain". Rejects on "error", and also when the response is
 * already destroyed or emits "close" first: a peer that hangs up mid-stream
 * produces "close" without "drain", which would otherwise leave the caller
 * waiting forever. All listeners are removed once the promise settles.
 *
 * @param res writable response exposing once/off (and optionally `destroyed`)
 * @returns promise that settles as described above
 */
function waitForDrain(res) {
    return new Promise((resolve, reject) => {
        if (res.destroyed) {
            reject(new Error("response closed before drain"));
            return;
        }
        const detach = () => {
            res.off("drain", onDrain);
            res.off("error", onError);
            res.off("close", onClose);
        };
        const onDrain = () => {
            detach();
            resolve();
        };
        const onError = (error) => {
            detach();
            reject(error);
        };
        const onClose = () => {
            detach();
            reject(new Error("response closed before drain"));
        };
        res.once("drain", onDrain);
        res.once("error", onError);
        res.once("close", onClose);
    });
}
|
|
909
|
+
/* v8 ignore stop */
|
|
910
|
+
/**
 * Writes a response body and ends the response.
 *
 * A body that is not an async iterable is sent in one `res.end(body)` call.
 * An async-iterable body is written chunk by chunk, pausing for "drain"
 * whenever `res.write` reports backpressure.
 *
 * Streaming stops as soon as the response is found destroyed (for example the
 * peer hung up): leaving the loop lets `for await` finalize the iterator, so
 * the producer is released instead of being drained into a dead response.
 *
 * @param res writable HTTP response
 * @param body plain body accepted by `res.end`, or an async iterable of chunks
 */
async function writeResponseBody(res, body) {
    if (!isAsyncIterableBody(body)) {
        res.end(body);
        return;
    }
    for await (const chunk of body) {
        if (res.destroyed) {
            return;
        }
        /* v8 ignore next -- exercised only when Node reports socket backpressure @preserve */
        if (!res.write(chunk)) {
            await waitForDrain(res);
        }
    }
    res.end();
}
|
|
534
923
|
async function startTwilioPhoneBridgeServer(options) {
|
|
535
924
|
const port = options.port ?? exports.DEFAULT_TWILIO_PHONE_PORT;
|
|
536
925
|
const host = options.host ?? "127.0.0.1";
|
|
@@ -545,7 +934,7 @@ async function startTwilioPhoneBridgeServer(options) {
|
|
|
545
934
|
body,
|
|
546
935
|
});
|
|
547
936
|
res.writeHead(response.statusCode, response.headers);
|
|
548
|
-
res
|
|
937
|
+
await writeResponseBody(res, response.body);
|
|
549
938
|
}
|
|
550
939
|
catch (error) {
|
|
551
940
|
(0, runtime_1.emitNervesEvent)({
|
|
@@ -555,8 +944,14 @@ async function startTwilioPhoneBridgeServer(options) {
|
|
|
555
944
|
message: "Twilio voice bridge server failed a request",
|
|
556
945
|
meta: { agentName: options.agentName, error: errorMessage(error) },
|
|
557
946
|
});
|
|
558
|
-
|
|
559
|
-
res.
|
|
947
|
+
/* v8 ignore next -- defensive path for async stream failures after headers @preserve */
|
|
948
|
+
if (res.headersSent) {
|
|
949
|
+
res.destroy(error instanceof Error ? error : new Error(String(error)));
|
|
950
|
+
}
|
|
951
|
+
else {
|
|
952
|
+
res.writeHead(500, { "content-type": "text/plain; charset=utf-8" });
|
|
953
|
+
res.end("internal server error");
|
|
954
|
+
}
|
|
560
955
|
}
|
|
561
956
|
});
|
|
562
957
|
await new Promise((resolve, reject) => {
|