@pimote/pimote 0.2.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +43 -16
- package/client/build/_app/immutable/assets/0.C7loWTOC.css +2 -0
- package/client/build/_app/immutable/assets/2.DwPXxSa-.css +1 -0
- package/client/build/_app/immutable/chunks/-Lc-U-GJ.js +1 -0
- package/client/build/_app/immutable/chunks/{CT6ckxpD.js → CO_BwWGt.js} +1 -1
- package/client/build/_app/immutable/chunks/CklMSqcv.js +1 -0
- package/client/build/_app/immutable/chunks/D1INvMB9.js +1 -0
- package/client/build/_app/immutable/chunks/D1vhgXpq.js +5 -0
- package/client/build/_app/immutable/entry/{app.CNzpBgAg.js → app.B-HFVtpC.js} +2 -2
- package/client/build/_app/immutable/entry/start.DJTQ8-sD.js +1 -0
- package/client/build/_app/immutable/nodes/0.CepAO4xf.js +10 -0
- package/client/build/_app/immutable/nodes/{1.B8zmHMre.js → 1.CmxFYjRm.js} +1 -1
- package/client/build/_app/immutable/nodes/2.DAtqfmki.js +54 -0
- package/client/build/_app/version.json +1 -1
- package/client/build/index.html +7 -7
- package/package.json +7 -3
- package/server/dist/auto-drain-on-abort.js +49 -0
- package/server/dist/config.js +21 -0
- package/server/dist/extension-ui-bridge.js +14 -1
- package/server/dist/index.js +36 -1
- package/server/dist/message-mapper.js +38 -6
- package/server/dist/push-notification.js +11 -0
- package/server/dist/server.js +2 -2
- package/server/dist/session-manager.js +72 -4
- package/server/dist/voice/fsm/actions.js +6 -0
- package/server/dist/voice/fsm/events.js +7 -0
- package/server/dist/voice/fsm/reducer.js +74 -0
- package/server/dist/voice/fsm/reducers/lifecycle.js +158 -0
- package/server/dist/voice/fsm/reducers/streaming.js +220 -0
- package/server/dist/voice/fsm/reducers/walkback.js +73 -0
- package/server/dist/voice/fsm/state.js +21 -0
- package/server/dist/voice/fsm/text-extractor.js +128 -0
- package/server/dist/voice/index.js +336 -0
- package/server/dist/voice/interpreter-prompt.js +115 -0
- package/server/dist/voice/speechmux-client.js +153 -0
- package/server/dist/voice/state-machine.js +14 -0
- package/server/dist/voice/wait-for-idle.js +67 -0
- package/server/dist/voice/walk-back.js +198 -0
- package/server/dist/voice-orchestrator-boot.js +90 -0
- package/server/dist/voice-orchestrator.js +91 -0
- package/server/dist/ws-handler.js +112 -7
- package/shared/dist/index.d.ts +1 -0
- package/shared/dist/index.js +2 -0
- package/shared/dist/protocol.d.ts +614 -0
- package/shared/dist/protocol.js +30 -0
- package/client/build/_app/immutable/assets/0.DBrr7n4n.css +0 -2
- package/client/build/_app/immutable/assets/2.DE6k3bQj.css +0 -1
- package/client/build/_app/immutable/chunks/5vSSf6qG.js +0 -5
- package/client/build/_app/immutable/chunks/DlJOVoUQ.js +0 -1
- package/client/build/_app/immutable/chunks/YxmLwfhj.js +0 -1
- package/client/build/_app/immutable/chunks/yWVx3W2o.js +0 -1
- package/client/build/_app/immutable/entry/start.DYkTAHh1.js +0 -1
- package/client/build/_app/immutable/nodes/0.DNlQhEb_.js +0 -10
- package/client/build/_app/immutable/nodes/2.W9yV4-x2.js +0 -54
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
// Wait-for-idle helper for the voice extension.
|
|
2
|
+
//
|
|
3
|
+
// The speechmux abort/user frame pair on barge-in arrives as two
|
|
4
|
+
// independent reducer calls. The `abort` action calls `ctx.abort()`
|
|
5
|
+
// (fire-and-forget — the actual teardown completes asynchronously). If
|
|
6
|
+
// the user frame arrives before the teardown is done, `sendUserMessage`
|
|
7
|
+
// throws ("Agent is already processing…") and the user's utterance is
|
|
8
|
+
// silently dropped.
|
|
9
|
+
//
|
|
10
|
+
// Steering doesn't help: pi-agent-core doesn't drain the steer queue on
|
|
11
|
+
// the abort exit path of `runLoop`. Pimote has a separate
|
|
12
|
+
// `autoDrainOnAbort` listener (see `auto-drain-on-abort.ts`) that
|
|
13
|
+
// rescues queued messages after an aborted run, but that only catches
|
|
14
|
+
// messages that *were* queued — it doesn't help an unqueued
|
|
15
|
+
// `sendUserMessage` that throws.
|
|
16
|
+
//
|
|
17
|
+
// So the voice extension polls `ctx.isIdle()` before calling
|
|
18
|
+
// `sendUserMessage` (without `deliverAs`), guaranteeing the SDK won't
|
|
19
|
+
// throw. Auto-drain remains a belt-and-braces safety net for any
|
|
20
|
+
// queued path that races an abort.
|
|
21
|
+
/**
 * Resolve once the agent reports idle, polling `ctx.isIdle()` with
 * exponential backoff (first sleep 5 ms, doubling, capped at 50 ms).
 *
 * The default timeout of 2 s is well above any normal abort-teardown
 * latency (tens to a few hundred ms). If a real agent doesn't reach idle
 * within 2 s, something is genuinely stuck and dropping the message is
 * preferable to hanging the executor.
 *
 * Each sleep is clamped to the time left before the deadline, so the
 * function never waits noticeably longer than `timeoutMs` and always
 * performs one last idle check right at the deadline.
 *
 * @param {{ isIdle: () => boolean }} ctx - Object exposing the idle probe.
 * @param {number} [timeoutMs=2000] - Maximum time to wait, in milliseconds.
 * @returns {Promise<boolean>} `true` as soon as `ctx.isIdle()` returns a
 *   truthy value (immediately when already idle); `false` when the deadline
 *   passes first.
 */
export async function waitForAgentIdle(ctx, timeoutMs = 2000) {
    const deadline = Date.now() + timeoutMs;
    let delay = 5;
    while (!ctx.isIdle()) {
        const remaining = deadline - Date.now();
        if (remaining <= 0)
            return false;
        // Never sleep past the deadline: the backoff step may be larger than
        // what is left of the budget.
        const sleepMs = Math.min(delay, remaining);
        await new Promise((resolve) => setTimeout(resolve, sleepMs));
        delay = Math.min(50, delay * 2);
    }
    return true;
}
|
|
45
|
+
/**
 * Make sure the agent is idle, issuing an implicit barge-in when it is not.
 *
 * Speechmux only emits `abort` while it is actively playing TTS — i.e.
 * during the harness's `token`/`end` stream. While the worker is silently
 * reasoning between a `user` frame and its first `speak()` call, speechmux
 * has no signal that the agent is busy and won't pre-empt. A new utterance
 * in that window reaches the harness as a `user` frame with no preceding
 * `abort`, so the agent is still mid-turn and `sendUserMessage` would
 * race / be dropped.
 *
 * To close that gap, a busy agent gets `ctx.abort()` from here (the original
 * notes describe this as idempotent when a real barge-in already issued
 * one), followed by the same idle polling the abort/user pair uses.
 *
 * @param {{ isIdle: () => boolean, abort: () => unknown }} ctx - Agent context.
 * @param {number} [timeoutMs=2000] - Polling budget handed to `waitForAgentIdle`.
 * @returns {Promise<boolean>} `true` once idle, `false` on timeout.
 */
export async function ensureIdleWithImplicitAbort(ctx, timeoutMs = 2000) {
    const busy = !ctx.isIdle();
    if (busy) {
        // Fire-and-forget: teardown completes asynchronously, so poll for it.
        ctx.abort();
        return waitForAgentIdle(ctx, timeoutMs);
    }
    return true;
}
|
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
// Walkback rewrite: surgical truncation of conversation history when
|
|
2
|
+
// speechmux reports the user heard only a prefix of an assistant
|
|
3
|
+
// utterance.
|
|
4
|
+
//
|
|
5
|
+
// **Identity-based design.** Walkback targets a specific `speak()` tool
|
|
6
|
+
// call by its `toolCallId`. That id is round-tripped through speechmux
|
|
7
|
+
// (every outgoing `token`/`end` frame carries it; speechmux echoes it
|
|
8
|
+
// back on `rollback`/`abort`) so we know exactly which utterance the
|
|
9
|
+
// `heardText` belongs to. The previous design used a captured snapshot
|
|
10
|
+
// of the in-flight assistant message and a string-prefix-matching
|
|
11
|
+
// algorithm — both of which broke whenever a turn contained more than
|
|
12
|
+
// one speak() or whenever the snapshot drifted out of sync with the
|
|
13
|
+
// real conversation.
|
|
14
|
+
//
|
|
15
|
+
// **Contract:** see `docs/plans/voice-mode.md` for the high-level
|
|
16
|
+
// behavioural spec. Briefly:
|
|
17
|
+
//
|
|
18
|
+
// 1. The trailing pi-synthetic empty-text aborted assistant (if any)
|
|
19
|
+
// is always stripped, even when no rollback is pending. This is
|
|
20
|
+
// pi's marker for "agent run was aborted"; we don't want it in
|
|
21
|
+
// the LLM context.
|
|
22
|
+
//
|
|
23
|
+
// 2. With a rollback pending, locate the speak block by
|
|
24
|
+
// `targetSpeakToolCallId`. If found:
|
|
25
|
+
// - If `heardText` is empty: drop the speak block entirely (and
|
|
26
|
+
// its paired tool_result if present).
|
|
27
|
+
// - If `heardText.length >= block.text.length`: keep block as-is
|
|
28
|
+
// (whole utterance was heard).
|
|
29
|
+
// - Otherwise: replace the block's text with `heardText` and
|
|
30
|
+
// drop the paired tool_result.
|
|
31
|
+
// Then drop blocks AFTER the target in the same message, and drop
|
|
32
|
+
// any subsequent assistant/tool_result messages — none of those
|
|
33
|
+
// could have been heard if the user interrupted at the target.
|
|
34
|
+
//
|
|
35
|
+
// 3. If the target is not found in messages (e.g. compacted away),
|
|
36
|
+
// walkback is a no-op beyond step 1.
|
|
37
|
+
//
|
|
38
|
+
// **Content-block shape compatibility.** The function handles both
|
|
39
|
+
// pi-agent-core's internal AgentMessage shape (`type:'toolCall'` +
|
|
40
|
+
// `arguments`) and the Anthropic API shape (`type:'tool_use'` +
|
|
41
|
+
// `input`). Earlier versions only matched the latter, which silently
|
|
42
|
+
// failed on every real captured message.
|
|
43
|
+
/**
 * Apply walkback to `input.messages` and return the rewritten history.
 *
 * The trailing aborted-empty assistant messages are always stripped. When
 * `input.rollback` is not `null`, the history is additionally truncated at
 * the speak tool call identified by `rollback.targetSpeakToolCallId`, using
 * `rollback.heardText` as the portion the user actually heard.
 *
 * Pure: always returns a new array and never mutates the input.
 *
 * @param {{ messages: object[], rollback: null | { heardText: string, targetSpeakToolCallId: string } }} input
 * @returns {object[]} The rewritten message list.
 */
export function walkBack(input) {
    const { messages, rollback } = input;
    const cleaned = stripTrailingAbortedEmpty(messages);
    return rollback === null
        ? cleaned
        : rewriteByToolCallId(cleaned, rollback.heardText, rollback.targetSpeakToolCallId);
}
|
|
54
|
+
// ---------------------------------------------------------------------------
|
|
55
|
+
/**
 * True for the synthetic assistant message pi appends to state on abort:
 * an assistant message whose `stopReason` is `'aborted'` and whose content
 * is empty or consists solely of blank text blocks.
 *
 * @param {object} msg - Message to inspect.
 * @returns {boolean}
 */
export function isAbortedEmptyAssistant(msg) {
    return isAssistantMessage(msg)
        && stopReason(msg) === 'aborted'
        && isEmptyText(contentOf(msg));
}
|
|
63
|
+
/**
 * Return a copy of `messages` without the run of aborted-empty assistant
 * messages at its end. The input array is never mutated; a fresh array is
 * returned even when nothing is stripped.
 */
function stripTrailingAbortedEmpty(messages) {
    let keep = messages.length;
    for (; keep > 0; keep -= 1) {
        if (!isAbortedEmptyAssistant(messages[keep - 1]))
            break;
    }
    return messages.slice(0, keep);
}
|
|
69
|
+
/**
 * Truncate `messages` at the speak tool call whose id is `targetId`.
 *
 * - Target not found (compacted away, or never landed in messages):
 *   `messages` is returned unchanged.
 * - `heardText` empty: the target speak block is dropped.
 * - `heardText` at least as long as the block's text: the block is kept
 *   as-is (the whole utterance was heard).
 * - Otherwise: the block's text is replaced by `heardText`.
 *
 * In every found case the blocks after the target in the same message and
 * all messages after the target message are dropped — none of them could
 * have been heard if the user interrupted at the target — and the rewritten
 * target message gets `stopReason: 'aborted'`. Paired tool results live in
 * later messages, so they are removed by that truncation; no separate
 * dropped-id bookkeeping is required.
 *
 * Never mutates `messages` or any message/block in it.
 *
 * @param {object[]} messages - Conversation history.
 * @param {string} heardText - Prefix of the utterance the user heard.
 * @param {string} targetId - Tool call id of the interrupted speak call.
 * @returns {object[]} Rewritten history (or `messages` itself when the
 *   target is absent).
 */
function rewriteByToolCallId(messages, heardText, targetId) {
    // Search from the back — toolCallIds are unique per session, so the
    // first match is the right one, and the target is normally in the
    // recent tail.
    let targetMsgIdx = -1;
    let targetBlockIdx = -1;
    for (let i = messages.length - 1; i >= 0 && targetMsgIdx === -1; i--) {
        const msg = messages[i];
        if (!isAssistantMessage(msg))
            continue;
        const blockIdx = contentOf(msg).findIndex((block) => isSpeakToolCall(block) && getToolCallId(block) === targetId);
        if (blockIdx !== -1) {
            targetMsgIdx = i;
            targetBlockIdx = blockIdx;
        }
    }
    if (targetMsgIdx === -1) {
        // Target gone. The caller has already stripped the trailing
        // aborted-empty assistant; there is nothing else to do.
        return messages;
    }
    const targetMsg = messages[targetMsgIdx];
    const targetContent = contentOf(targetMsg);
    const targetBlock = targetContent[targetBlockIdx];
    const originalText = getSpeakText(targetBlock);
    // Blocks before the target were emitted before the interrupted speak and
    // are kept; blocks after it are dropped by not copying them.
    const keptBlocks = targetContent.slice(0, targetBlockIdx);
    if (heardText.length > 0) {
        const wholeUtteranceHeard = heardText.length >= originalText.length;
        keptBlocks.push(wholeUtteranceHeard ? targetBlock : replaceSpeakText(targetBlock, heardText));
    }
    // NOTE(review): when nothing was heard and the target was the only
    // block, this yields an assistant message with empty content and
    // stopReason 'aborted' — confirm downstream consumers tolerate that.
    const rewrittenTarget = {
        ...targetMsg,
        content: keptBlocks,
        stopReason: 'aborted',
    };
    // Everything after the target message was emitted after the interrupted
    // speak (tool results, later assistant messages) — drop it.
    return [...messages.slice(0, targetMsgIdx), rewrittenTarget];
}
|
|
135
|
+
// ---------------------------------------------------------------------------
|
|
136
|
+
// Shape-tolerant accessors. pi-agent-core's runtime AgentMessage uses
|
|
137
|
+
// `toolCall`/`arguments`; the Anthropic API shape uses `tool_use`/`input`.
|
|
138
|
+
// Tests / tooling may pass either; we accept both.
|
|
139
|
+
/** True when the message's `role` is exactly `'assistant'`. */
function isAssistantMessage(msg) {
    const { role } = msg;
    return role === 'assistant';
}
|
|
142
|
+
/** Read the message's `stopReason` field (undefined when absent). */
function stopReason(msg) {
    const { stopReason: reason } = msg;
    return reason;
}
|
|
145
|
+
/**
 * Return the message's content blocks. Any non-array `content` (string,
 * missing, …) is treated as "no blocks" and yields a fresh empty array.
 */
function contentOf(msg) {
    const { content } = msg;
    if (Array.isArray(content))
        return content;
    return [];
}
|
|
149
|
+
/**
 * True when `blocks` carries no visible text: it is empty, or every block
 * is a `text` block whose `text` is a string containing only whitespace.
 * Any non-text block makes the result false.
 */
function isEmptyText(blocks) {
    const isBlankTextBlock = (block) => {
        if (block.type !== 'text')
            return false;
        const value = block.text;
        return typeof value === 'string' && value.trim() === '';
    };
    return blocks.length === 0 || !blocks.some((block) => !isBlankTextBlock(block));
}
|
|
159
|
+
/**
 * True when `block` is a tool call (`type` of `'toolCall'` or `'tool_use'`)
 * whose `name` is `'speak'`.
 */
export function isSpeakToolCall(block) {
    const isToolCall = block.type === 'toolCall' || block.type === 'tool_use';
    return isToolCall && block.name === 'speak';
}
|
|
164
|
+
/** Return the block's `id` when it is a string, otherwise `undefined`. */
function getToolCallId(block) {
    const { id } = block;
    if (typeof id === 'string')
        return id;
    return undefined;
}
|
|
168
|
+
/**
 * Extract the spoken text from a speak tool call block. `arguments.text`
 * is tried first, then `input.text`; the first one that is a string wins.
 * Returns `''` when neither container holds a string `text`.
 */
function getSpeakText(block) {
    for (const container of [block.arguments, block.input]) {
        if (!container || typeof container !== 'object')
            continue;
        const value = container.text;
        if (typeof value === 'string')
            return value;
    }
    return '';
}
|
|
184
|
+
/**
 * Return a copy of the speak `block` whose text is `text`.
 *
 * The container shape that was present is preserved — we don't normalise
 * to a single shape, since that would diverge from whatever
 * pi-agent-core/the provider expects. The container is chosen with the same
 * precedence `getSpeakText` reads with, so the text is replaced where it
 * was read from:
 *
 *   1. `arguments`, if it is an object holding a string `text`;
 *   2. `input`, if it is an object holding a string `text`;
 *   3. otherwise whichever of `arguments` / `input` is an object
 *      (`arguments` first);
 *   4. otherwise both `arguments` and `input` are set defensively.
 *
 * Neither `block` nor its containers are mutated.
 *
 * @param {object} block - Speak tool call block.
 * @param {string} text - Replacement text.
 * @returns {object} New block with the text replaced.
 */
function replaceSpeakText(block, text) {
    const args = block.arguments;
    const input = block.input;
    const argsIsObject = Boolean(args) && typeof args === 'object';
    const inputIsObject = Boolean(input) && typeof input === 'object';
    const withArgs = () => ({ ...block, arguments: { ...args, text } });
    const withInput = () => ({ ...block, input: { ...input, text } });
    // Prefer the container that currently carries the text, so a stale full
    // utterance is never left behind in the other one.
    if (argsIsObject && typeof args.text === 'string')
        return withArgs();
    if (inputIsObject && typeof input.text === 'string')
        return withInput();
    // No container carries text yet: fall back to whichever shape exists.
    if (argsIsObject)
        return withArgs();
    if (inputIsObject)
        return withInput();
    // Neither shape present — set both defensively.
    return { ...block, arguments: { text }, input: { text } };
}
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
// Wire the VoiceOrchestrator together with its runtime dependencies at
|
|
2
|
+
// server boot time. Kept separate from `index.ts` so the wiring is
|
|
3
|
+
// testable (no network / child_process side-effects at import time) and
|
|
4
|
+
// isolated from the plain HTTP/WS boot sequence.
|
|
5
|
+
import { spawn } from 'node:child_process';
|
|
6
|
+
import { VoiceOrchestrator } from './voice-orchestrator.js';
|
|
7
|
+
/**
 * Construct a VoiceOrchestrator backed by real seams:
 *   - speechmux sidecar via `child_process.spawn`
 *   - displacement = looks up the current owner via `clientRegistry` and
 *     calls its `sendDisplacedEvent(sessionId)`
 *
 * Auth on `/signal` is handled by Cloudflare Access at the edge, and
 * per-session TURN credentials are minted by speechmux and returned to the
 * PWA in its `/signal` `session` response. Pimote's orchestrator only
 * hands out the signalling URL.
 *
 * @param {{ config: object, sessionManager: object, clientRegistry: object }} args
 *   `config.voice?.speechmuxBinary` selects the sidecar binary (when unset,
 *   speechmux is assumed to be managed externally); `sessionManager.getSlot`
 *   and `clientRegistry.get` are used for lookups.
 * @returns {{ orchestrator: VoiceOrchestrator, shutdown: () => Promise<void> }}
 *   The orchestrator plus a `shutdown` that awaits `orchestrator.stop()`.
 */
export function buildVoiceOrchestrator(args) {
    const { config, sessionManager, clientRegistry } = args;
    // Handle of the sidecar this process spawned, or null when none is running.
    let speechmuxProc = null;
    const busResolver = {
        getSlot: (sessionId) => sessionManager.getSlot(sessionId),
        getEventBus: (sessionId) => sessionManager.getSlot(sessionId)?.eventBusRef.current ?? null,
    };
    const orchestrator = new VoiceOrchestrator({
        config,
        sessionManager,
        busResolver,
        startSpeechmux: async () => {
            const bin = config.voice?.speechmuxBinary;
            if (!bin) {
                console.log('[voice] speechmuxBinary not configured; assuming speechmux is externally managed (systemd, container, remote host, etc.)');
                return;
            }
            if (speechmuxProc)
                return;
            const proc = spawn(bin, [], { stdio: ['ignore', 'inherit', 'inherit'] });
            speechmuxProc = proc;
            // Only clear the handle if it still refers to THIS process, so a
            // late event from an old sidecar cannot wipe a newer one.
            const forget = () => {
                if (speechmuxProc === proc)
                    speechmuxProc = null;
            };
            proc.on('exit', (code, signal) => {
                console.warn(`[voice] speechmux exited (code=${code}, signal=${signal})`);
                forget();
            });
            // A ChildProcess with no 'error' listener turns a spawn failure
            // (e.g. ENOENT) into an uncaught exception. Listen for the whole
            // lifetime of the child and surface spawn failures as a rejection.
            await new Promise((resolve, reject) => {
                let spawned = false;
                proc.once('spawn', () => {
                    spawned = true;
                    resolve();
                });
                proc.on('error', (err) => {
                    console.warn('[voice] speechmux process error', err);
                    if (!spawned) {
                        forget();
                        reject(new Error(`Failed to start speechmux (${bin})`, { cause: err }));
                    }
                });
            });
            // NB: we do not wait for a ready marker here — speechmux emits readiness
            // to its own logs. Callers should ensure startup ordering or implement a
            // readiness probe as part of the Step 14 smoke.
        },
        stopSpeechmux: async () => {
            if (!speechmuxProc)
                return;
            const proc = speechmuxProc;
            speechmuxProc = null;
            await new Promise((resolve) => {
                // Escalate to SIGKILL if the child ignores SIGTERM for 2 s.
                const timer = setTimeout(() => {
                    try {
                        proc.kill('SIGKILL');
                    }
                    catch {
                        /* ignore */
                    }
                    resolve();
                }, 2000);
                proc.once('exit', () => {
                    clearTimeout(timer);
                    resolve();
                });
                try {
                    proc.kill('SIGTERM');
                }
                catch {
                    clearTimeout(timer);
                    resolve();
                }
            });
        },
        displaceOwner: async (sessionId, _newOwner) => {
            const slot = sessionManager.getSlot(sessionId);
            const existingClientId = slot?.connection?.connectedClientId;
            if (!existingClientId)
                return;
            const existing = clientRegistry.get(existingClientId);
            existing?.sendDisplacedEvent(sessionId);
        },
        isOwnedByVoiceCall: (sessionId) => orchestrator.isCallActive(sessionId),
    });
    return {
        orchestrator,
        shutdown: async () => {
            await orchestrator.stop();
        },
    };
}
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
// Voice orchestrator — owns the speechmux sidecar lifecycle and the per-call
|
|
2
|
+
// bind dispatch. See docs/plans/voice-mode.md → "Voice orchestrator".
|
|
3
|
+
//
|
|
4
|
+
// This file defines the interface surface + a stub implementation. The impl
|
|
5
|
+
// phase fills in start()/stop()/bindCall()/endCall() bodies.
|
|
6
|
+
/**
 * Typed error carrying the discriminable reason code used in
 * PimoteResponse.error. The message defaults to the code when no explicit
 * message is supplied.
 */
export class CallBindError extends Error {
    code;
    name = 'CallBindError';
    /**
     * @param {string} code - Machine-readable failure reason.
     * @param {string} [message] - Human-readable detail; falls back to `code`.
     */
    constructor(code, message) {
        const text = message ?? code;
        super(text);
        this.code = code;
    }
}
|
|
15
|
+
/**
 * Owns the speechmux sidecar lifecycle and the per-call bind dispatch.
 *
 * All runtime dependencies are injected through `opts`:
 *   - `startSpeechmux()` / `stopSpeechmux()` — sidecar lifecycle;
 *   - `busResolver.getSlot(sessionId)` / `busResolver.getEventBus(sessionId)`;
 *   - `isOwnedByVoiceCall(sessionId)` and `displaceOwner(sessionId, conn)`;
 *   - `config.voice?.speechmuxSignalUrl` / `config.voice?.speechmuxLlmWsUrl`.
 */
export class VoiceOrchestrator {
    opts;
    started = false;
    // Session ids that currently have a voice call bound.
    activeCalls = new Set();
    constructor(opts) {
        this.opts = opts;
    }
    /** Starts speechmux via `opts.startSpeechmux()`; rejects if that rejects. No-op when already started. */
    async start() {
        if (this.started)
            return;
        await this.opts.startSpeechmux();
        this.started = true;
    }
    /** Stops speechmux and forgets all active calls. Idempotent. */
    async stop() {
        if (!this.started)
            return;
        await this.opts.stopSpeechmux();
        this.started = false;
        this.activeCalls.clear();
    }
    /**
     * Called by ws-handler for CallBindCommand.
     *
     * @param {{ sessionId: string, clientConnection?: object, force?: boolean }} args
     * @returns {Promise<{ sessionId: string, webrtcSignalUrl: string }>}
     * @throws {CallBindError} `call_bind_failed_session_not_found` when the
     *   session has no slot, `call_bind_failed_owned` when a call is already
     *   bound and `force` is falsy, `call_bind_failed_internal` when voice is
     *   not configured or the session has no EventBus.
     */
    async bindCall(args) {
        const slot = this.opts.busResolver.getSlot(args.sessionId);
        if (!slot) {
            throw new CallBindError('call_bind_failed_session_not_found', `No session ${args.sessionId}`);
        }
        const alreadyOwned = this.opts.isOwnedByVoiceCall(args.sessionId);
        if (alreadyOwned && !args.force) {
            throw new CallBindError('call_bind_failed_owned', 'Session already bound to a voice call');
        }
        // Voice-disabled guard: if speechmux wiring isn't configured, fail the
        // bind here rather than handing the client empty URLs. Speechmux is
        // what mints the per-call TURN creds now (in the /signal `session`
        // response) and what authenticates peers (via Cloudflare Access at the
        // edge), so pimote no longer needs to mint anything.
        //
        // This check runs BEFORE displacement on purpose: a forced bind that
        // cannot succeed must not kick the current owner off the session.
        const signalUrl = this.opts.config.voice?.speechmuxSignalUrl;
        const llmWsUrl = this.opts.config.voice?.speechmuxLlmWsUrl;
        if (!signalUrl || !llmWsUrl) {
            throw new CallBindError('call_bind_failed_internal', 'voice_disabled: speechmux signal URL / llm WS URL not configured');
        }
        if (alreadyOwned) {
            // `force` is necessarily truthy here (the non-forced case threw above).
            await this.opts.displaceOwner(args.sessionId, args.clientConnection);
        }
        // Resolve the bus after displacement so we emit on the current one.
        const bus = this.opts.busResolver.getEventBus(args.sessionId);
        if (!bus) {
            throw new CallBindError('call_bind_failed_internal', 'Session has no EventBus');
        }
        const activate = {
            type: 'pimote:voice:activate',
            sessionId: args.sessionId,
            speechmuxWsUrl: llmWsUrl,
        };
        bus.emit(activate.type, activate);
        this.activeCalls.add(args.sessionId);
        return {
            sessionId: args.sessionId,
            webrtcSignalUrl: signalUrl,
        };
    }
    /**
     * Called by ws-handler for CallEndCommand, or internally on
     * displacement/error. Idempotent: does nothing when the session has no
     * active call. Emits `pimote:voice:deactivate` when the session still has
     * an EventBus.
     */
    async endCall(args) {
        if (!this.activeCalls.has(args.sessionId))
            return;
        this.activeCalls.delete(args.sessionId);
        const bus = this.opts.busResolver.getEventBus(args.sessionId);
        if (bus) {
            const deactivate = { type: 'pimote:voice:deactivate', sessionId: args.sessionId };
            bus.emit(deactivate.type, deactivate);
        }
    }
    /** True if the given session currently has an active voice call bound. */
    isCallActive(sessionId) {
        return this.activeCalls.has(sessionId);
    }
}
|
|
@@ -8,6 +8,7 @@ import { createExtensionUIBridge } from './extension-ui-bridge.js';
|
|
|
8
8
|
import { findExternalPiProcesses, killExternalPiProcesses } from './takeover.js';
|
|
9
9
|
import { mapAgentMessages, extractMessageEntryIds, applyEntryIds } from './message-mapper.js';
|
|
10
10
|
import { getGitBranch } from './git-branch.js';
|
|
11
|
+
import { CallBindError } from './voice-orchestrator.js';
|
|
11
12
|
/** Parse data-URL encoded images into the shape the pi SDK expects. */
|
|
12
13
|
function parseDataUrlImages(images) {
|
|
13
14
|
if (!images || images.length === 0)
|
|
@@ -133,16 +134,18 @@ export class WsHandler {
|
|
|
133
134
|
pushNotificationService;
|
|
134
135
|
sessionMetadataStore;
|
|
135
136
|
clientRegistry;
|
|
137
|
+
voiceOrchestrator;
|
|
136
138
|
subscribedSessions = new Set();
|
|
137
139
|
viewedSessionId = null;
|
|
138
140
|
clientId;
|
|
139
|
-
constructor(sessionManager, folderIndex, ws, pushNotificationService, sessionMetadataStore, clientId, clientRegistry) {
|
|
141
|
+
constructor(sessionManager, folderIndex, ws, pushNotificationService, sessionMetadataStore, clientId, clientRegistry, voiceOrchestrator) {
|
|
140
142
|
this.sessionManager = sessionManager;
|
|
141
143
|
this.folderIndex = folderIndex;
|
|
142
144
|
this.ws = ws;
|
|
143
145
|
this.pushNotificationService = pushNotificationService;
|
|
144
146
|
this.sessionMetadataStore = sessionMetadataStore;
|
|
145
147
|
this.clientRegistry = clientRegistry;
|
|
148
|
+
this.voiceOrchestrator = voiceOrchestrator;
|
|
146
149
|
this.clientId = clientId;
|
|
147
150
|
}
|
|
148
151
|
getViewedSessionId() {
|
|
@@ -480,6 +483,65 @@ export class WsHandler {
|
|
|
480
483
|
this.sendResponse(id, true, { sessionId: takeoverSessionId, killedProcesses: killedCount });
|
|
481
484
|
break;
|
|
482
485
|
}
|
|
486
|
+
// ---- Voice call control ----
|
|
487
|
+
case 'call_bind': {
|
|
488
|
+
if (!this.voiceOrchestrator) {
|
|
489
|
+
this.sendResponse(id, false, undefined, 'call_bind_failed_internal');
|
|
490
|
+
break;
|
|
491
|
+
}
|
|
492
|
+
const slot = this.sessionManager.getSlot(command.sessionId);
|
|
493
|
+
if (!slot) {
|
|
494
|
+
this.sendResponse(id, false, undefined, 'call_bind_failed_session_not_found');
|
|
495
|
+
break;
|
|
496
|
+
}
|
|
497
|
+
const connection = {
|
|
498
|
+
ws: this.ws,
|
|
499
|
+
connectedClientId: this.clientId,
|
|
500
|
+
onSessionReset: (s) => this.handleSessionReset(s),
|
|
501
|
+
};
|
|
502
|
+
try {
|
|
503
|
+
const data = await this.voiceOrchestrator.bindCall({
|
|
504
|
+
sessionId: command.sessionId,
|
|
505
|
+
clientConnection: connection,
|
|
506
|
+
force: command.force ?? false,
|
|
507
|
+
});
|
|
508
|
+
this.sendResponse(id, true, data);
|
|
509
|
+
this.sendEvent({ type: 'call_status', sessionId: command.sessionId, status: 'binding' });
|
|
510
|
+
}
|
|
511
|
+
catch (err) {
|
|
512
|
+
if (err instanceof CallBindError) {
|
|
513
|
+
this.sendResponse(id, false, undefined, err.code);
|
|
514
|
+
}
|
|
515
|
+
else {
|
|
516
|
+
console.warn('[voice] call_bind failed', err);
|
|
517
|
+
this.sendResponse(id, false, undefined, 'call_bind_failed_internal');
|
|
518
|
+
}
|
|
519
|
+
}
|
|
520
|
+
break;
|
|
521
|
+
}
|
|
522
|
+
case 'call_end': {
|
|
523
|
+
await this.voiceOrchestrator?.endCall({ sessionId: command.sessionId, reason: 'user_hangup' });
|
|
524
|
+
this.sendResponse(id, true);
|
|
525
|
+
this.sendEvent({ type: 'call_ended', sessionId: command.sessionId, reason: 'user_hangup' });
|
|
526
|
+
break;
|
|
527
|
+
}
|
|
528
|
+
// ---- Client diagnostic logs (voice/call tracing) ----
|
|
529
|
+
case 'client_log': {
|
|
530
|
+
// Forward to the server's logger so client-side traces interleave
|
|
531
|
+
// with the server-side voice extension logs in the same journal.
|
|
532
|
+
const clientWall = new Date(command.clientTimestampMs).toISOString();
|
|
533
|
+
const serverWall = new Date().toISOString();
|
|
534
|
+
const driftMs = Date.now() - command.clientTimestampMs;
|
|
535
|
+
const line = `[voice_trace][client/${command.tag}] ${command.message} ${JSON.stringify({ clientWall, serverWall, driftMs, ...(command.data ?? {}) })}`;
|
|
536
|
+
if (command.level === 'error')
|
|
537
|
+
console.error(line);
|
|
538
|
+
else if (command.level === 'warn')
|
|
539
|
+
console.warn(line);
|
|
540
|
+
else
|
|
541
|
+
console.log(line);
|
|
542
|
+
this.sendResponse(id, true);
|
|
543
|
+
break;
|
|
544
|
+
}
|
|
483
545
|
// ---- Extension UI ----
|
|
484
546
|
case 'extension_ui_response': {
|
|
485
547
|
const uiSlot = command.sessionId ? this.sessionManager.getSession(command.sessionId) : undefined;
|
|
@@ -896,13 +958,26 @@ export class WsHandler {
|
|
|
896
958
|
}
|
|
897
959
|
}
|
|
898
960
|
/** Notify the old owner that they've been displaced from a session.
|
|
899
|
-
* No-op if the session is unowned or owned by this client.
|
|
961
|
+
* No-op if the session is unowned or owned by this client.
|
|
962
|
+
*
|
|
963
|
+
* Voice-call tear-down on displacement lives in `sendDisplacedEvent` (the
|
|
964
|
+
* old-owner-side site that also emits `call_ended { displaced }`), so this
|
|
965
|
+
* method does not call `voiceOrchestrator.endCall` itself — see review
|
|
966
|
+
* finding 4.
|
|
967
|
+
*/
|
|
900
968
|
displaceOwner(sessionId, slot) {
|
|
901
969
|
if (slot.connection?.connectedClientId && slot.connection.connectedClientId !== this.clientId) {
|
|
902
970
|
const oldHandler = this.clientRegistry.get(slot.connection.connectedClientId);
|
|
903
971
|
if (oldHandler) {
|
|
904
972
|
oldHandler.sendDisplacedEvent(sessionId);
|
|
905
973
|
}
|
|
974
|
+
else if (this.voiceOrchestrator?.isCallActive(sessionId)) {
|
|
975
|
+
// Stale owner id with no live handler — clean up orchestrator state
|
|
976
|
+
// so the new owner doesn't inherit a phantom active call.
|
|
977
|
+
this.voiceOrchestrator.endCall({ sessionId, reason: 'displaced' }).catch((err) => {
|
|
978
|
+
console.warn('[voice] endCall on displace (stale handler) failed', err);
|
|
979
|
+
});
|
|
980
|
+
}
|
|
906
981
|
}
|
|
907
982
|
}
|
|
908
983
|
/** Bind a slot to this client — sets ownership, WebSocket routing,
|
|
@@ -914,13 +989,16 @@ export class WsHandler {
|
|
|
914
989
|
onSessionReset: (s) => this.handleSessionReset(s),
|
|
915
990
|
};
|
|
916
991
|
slot.connection = connection;
|
|
917
|
-
|
|
992
|
+
// Note: do NOT touch `idleSince` here. Idleness is an agent-level concept driven by
|
|
993
|
+
// agent_start/agent_end — a client claiming a session does not extend its idle clock.
|
|
918
994
|
this.subscribedSessions.add(sessionId);
|
|
919
995
|
// Bind extensions when needed. The bridge holds a direct reference to this
|
|
920
996
|
// ManagedSlot — on reconnect we skip rebinding, but on session reset
|
|
921
997
|
// we must rebind so the bridge points at the new session state.
|
|
922
998
|
if (!slot.sessionState.extensionsBound) {
|
|
923
|
-
const uiContext = createExtensionUIBridge(slot, this.pushNotificationService
|
|
999
|
+
const uiContext = createExtensionUIBridge(slot, this.pushNotificationService, {
|
|
1000
|
+
isVoiceModeActive: () => this.voiceOrchestrator?.isCallActive(sessionId) ?? false,
|
|
1001
|
+
});
|
|
924
1002
|
const commandContextActions = createCommandContextActions(slot);
|
|
925
1003
|
await slot.session.bindExtensions({ uiContext, commandContextActions });
|
|
926
1004
|
slot.sessionState.extensionsBound = true;
|
|
@@ -939,9 +1017,12 @@ export class WsHandler {
|
|
|
939
1017
|
return;
|
|
940
1018
|
}
|
|
941
1019
|
// Session ID changed — rebuild session state in-place on the same slot.
|
|
942
|
-
|
|
1020
|
+
// rebuildSessionState refreshes slot.folderPath from the new session's header cwd,
|
|
1021
|
+
// so capture folderPath AFTER the rebuild to pick up the new value (fork-from can
|
|
1022
|
+
// change cwd, e.g. the worktree extension).
|
|
943
1023
|
// Rebuild session state (tears down old, creates new from runtime.session)
|
|
944
1024
|
this.sessionManager.rebuildSessionState(slot);
|
|
1025
|
+
const folderPath = slot.folderPath;
|
|
945
1026
|
// Re-key the session map
|
|
946
1027
|
this.sessionManager.reKeySession(slot, oldSessionId, newSessionId);
|
|
947
1028
|
// Update handler bookkeeping
|
|
@@ -951,7 +1032,9 @@ export class WsHandler {
|
|
|
951
1032
|
this.viewedSessionId = newSessionId;
|
|
952
1033
|
}
|
|
953
1034
|
// Rebind extension UI bridge (new session state for dialog routing)
|
|
954
|
-
const uiContext = createExtensionUIBridge(slot, this.pushNotificationService
|
|
1035
|
+
const uiContext = createExtensionUIBridge(slot, this.pushNotificationService, {
|
|
1036
|
+
isVoiceModeActive: () => this.voiceOrchestrator?.isCallActive(newSessionId) ?? false,
|
|
1037
|
+
});
|
|
955
1038
|
const commandContextActions = createCommandContextActions(slot);
|
|
956
1039
|
await slot.session.bindExtensions({ uiContext, commandContextActions });
|
|
957
1040
|
slot.sessionState.extensionsBound = true;
|
|
@@ -1064,6 +1147,27 @@ export class WsHandler {
|
|
|
1064
1147
|
sessionId,
|
|
1065
1148
|
reason: 'displaced',
|
|
1066
1149
|
});
|
|
1150
|
+
// If the old owner had an active voice call on this session, tear down
|
|
1151
|
+
// orchestrator bookkeeping and surface `call_ended { reason: 'displaced' }`
|
|
1152
|
+
// so their VoiceCallStore tears down alongside the session_closed.
|
|
1153
|
+
if (this.voiceOrchestrator?.isCallActive(sessionId)) {
|
|
1154
|
+
void this.voiceOrchestrator.endCall({ sessionId, reason: 'displaced' });
|
|
1155
|
+
this.sendEvent({
|
|
1156
|
+
type: 'call_ended',
|
|
1157
|
+
sessionId,
|
|
1158
|
+
reason: 'displaced',
|
|
1159
|
+
});
|
|
1160
|
+
}
|
|
1161
|
+
}
|
|
1162
|
+
/** Broadcast a `call_ended` to this client (used by the session manager's
|
|
1163
|
+
* before-close hook so the orchestrator bookkeeping owner learns that a
|
|
1164
|
+
* server-initiated teardown happened). */
|
|
1165
|
+
sendCallEndedEvent(sessionId, reason) {
|
|
1166
|
+
this.sendEvent({
|
|
1167
|
+
type: 'call_ended',
|
|
1168
|
+
sessionId,
|
|
1169
|
+
reason,
|
|
1170
|
+
});
|
|
1067
1171
|
}
|
|
1068
1172
|
/** Send a session_closed event with reason 'killed' to this client's WebSocket.
|
|
1069
1173
|
* Also removes the session from this handler's subscribedSessions so that
|
|
@@ -1218,9 +1322,10 @@ export class WsHandler {
|
|
|
1218
1322
|
const slot = this.sessionManager.getSession(sid);
|
|
1219
1323
|
if (slot) {
|
|
1220
1324
|
slot.connection = null;
|
|
1221
|
-
slot.sessionState.lastActivity = Date.now();
|
|
1222
1325
|
// Note: pending UI responses are NOT resolved here — they survive
|
|
1223
1326
|
// for replay on reconnect. They are resolved on session close or abort.
|
|
1327
|
+
// Note: do NOT touch `idleSince`. Disconnecting does not reset idleness — if the
|
|
1328
|
+
// agent finished 10 minutes ago, a peeking client should not extend the session's life.
|
|
1224
1329
|
}
|
|
1225
1330
|
}
|
|
1226
1331
|
this.subscribedSessions.clear();
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export * from './protocol.js';
|