@pimote/pimote 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +43 -16
- package/client/build/_app/immutable/assets/0.C7loWTOC.css +2 -0
- package/client/build/_app/immutable/assets/2.D9fiCd8W.css +1 -0
- package/client/build/_app/immutable/chunks/BNqgidwO.js +5 -0
- package/client/build/_app/immutable/chunks/D26i4pYm.js +1 -0
- package/client/build/_app/immutable/chunks/D_Fpgknp.js +1 -0
- package/client/build/_app/immutable/chunks/DoVhjU85.js +1 -0
- package/client/build/_app/immutable/chunks/DzqbY2XU.js +1 -0
- package/client/build/_app/immutable/entry/{app.CNzpBgAg.js → app.DO-zgzyy.js} +2 -2
- package/client/build/_app/immutable/entry/start.BZlrOH0-.js +1 -0
- package/client/build/_app/immutable/nodes/0.BEh4bPGQ.js +10 -0
- package/client/build/_app/immutable/nodes/{1.B8zmHMre.js → 1.B2l9JGRO.js} +1 -1
- package/client/build/_app/immutable/nodes/2.ph9M0S1U.js +54 -0
- package/client/build/_app/version.json +1 -1
- package/client/build/index.html +7 -7
- package/package.json +7 -3
- package/server/dist/auto-drain-on-abort.js +49 -0
- package/server/dist/config.js +21 -0
- package/server/dist/extension-ui-bridge.js +14 -1
- package/server/dist/index.js +31 -1
- package/server/dist/message-mapper.js +38 -6
- package/server/dist/server.js +2 -2
- package/server/dist/session-manager.js +64 -2
- package/server/dist/voice/fsm/actions.js +6 -0
- package/server/dist/voice/fsm/events.js +7 -0
- package/server/dist/voice/fsm/reducer.js +74 -0
- package/server/dist/voice/fsm/reducers/lifecycle.js +146 -0
- package/server/dist/voice/fsm/reducers/streaming.js +220 -0
- package/server/dist/voice/fsm/reducers/walkback.js +73 -0
- package/server/dist/voice/fsm/state.js +21 -0
- package/server/dist/voice/fsm/text-extractor.js +128 -0
- package/server/dist/voice/index.js +319 -0
- package/server/dist/voice/interpreter-prompt.js +115 -0
- package/server/dist/voice/speechmux-client.js +153 -0
- package/server/dist/voice/state-machine.js +7 -0
- package/server/dist/voice/wait-for-idle.js +67 -0
- package/server/dist/voice/walk-back.js +198 -0
- package/server/dist/voice-orchestrator-boot.js +90 -0
- package/server/dist/voice-orchestrator.js +91 -0
- package/server/dist/ws-handler.js +108 -5
- package/shared/dist/index.d.ts +1 -0
- package/shared/dist/index.js +2 -0
- package/shared/dist/protocol.d.ts +614 -0
- package/shared/dist/protocol.js +30 -0
- package/client/build/_app/immutable/assets/0.DBrr7n4n.css +0 -2
- package/client/build/_app/immutable/assets/2.DE6k3bQj.css +0 -1
- package/client/build/_app/immutable/chunks/5vSSf6qG.js +0 -5
- package/client/build/_app/immutable/chunks/CT6ckxpD.js +0 -1
- package/client/build/_app/immutable/chunks/DlJOVoUQ.js +0 -1
- package/client/build/_app/immutable/chunks/YxmLwfhj.js +0 -1
- package/client/build/_app/immutable/chunks/yWVx3W2o.js +0 -1
- package/client/build/_app/immutable/entry/start.DYkTAHh1.js +0 -1
- package/client/build/_app/immutable/nodes/0.DNlQhEb_.js +0 -10
- package/client/build/_app/immutable/nodes/2.W9yV4-x2.js +0 -54
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
// Walkback rewrite: surgical truncation of conversation history when
|
|
2
|
+
// speechmux reports the user heard only a prefix of an assistant
|
|
3
|
+
// utterance.
|
|
4
|
+
//
|
|
5
|
+
// **Identity-based design.** Walkback targets a specific `speak()` tool
|
|
6
|
+
// call by its `toolCallId`. That id is round-tripped through speechmux
|
|
7
|
+
// (every outgoing `token`/`end` frame carries it; speechmux echoes it
|
|
8
|
+
// back on `rollback`/`abort`) so we know exactly which utterance the
|
|
9
|
+
// `heardText` belongs to. The previous design used a captured snapshot
|
|
10
|
+
// of the in-flight assistant message and a string-prefix-matching
|
|
11
|
+
// algorithm — both of which broke whenever a turn contained more than
|
|
12
|
+
// one speak() or whenever the snapshot drifted out of sync with the
|
|
13
|
+
// real conversation.
|
|
14
|
+
//
|
|
15
|
+
// **Contract:** see `docs/plans/voice-mode.md` for the high-level
|
|
16
|
+
// behavioural spec. Briefly:
|
|
17
|
+
//
|
|
18
|
+
// 1. The trailing pi-synthetic empty-text aborted assistant (if any)
|
|
19
|
+
// is always stripped, even when no rollback is pending. This is
|
|
20
|
+
// pi's marker for "agent run was aborted"; we don't want it in
|
|
21
|
+
// the LLM context.
|
|
22
|
+
//
|
|
23
|
+
// 2. With a rollback pending, locate the speak block by
|
|
24
|
+
// `targetSpeakToolCallId`. If found:
|
|
25
|
+
// - If `heardText` is empty: drop the speak block entirely (and
|
|
26
|
+
// its paired tool_result if present).
|
|
27
|
+
// - If `heardText.length >= block.text.length`: keep block as-is
|
|
28
|
+
// (whole utterance was heard).
|
|
29
|
+
// - Otherwise: replace the block's text with `heardText` and
|
|
30
|
+
// drop the paired tool_result.
|
|
31
|
+
// Then drop blocks AFTER the target in the same message, and drop
|
|
32
|
+
// any subsequent assistant/tool_result messages — none of those
|
|
33
|
+
// could have been heard if the user interrupted at the target.
|
|
34
|
+
//
|
|
35
|
+
// 3. If the target is not found in messages (e.g. compacted away),
|
|
36
|
+
// walkback is a no-op beyond step 1.
|
|
37
|
+
//
|
|
38
|
+
// **Content-block shape compatibility.** The function handles both
|
|
39
|
+
// pi-agent-core's internal AgentMessage shape (`type:'toolCall'` +
|
|
40
|
+
// `arguments`) and the Anthropic API shape (`type:'tool_use'` +
|
|
41
|
+
// `input`). Earlier versions only matched the latter, which silently
|
|
42
|
+
// failed on every real captured message.
|
|
43
|
+
/**
 * Apply walkback against `input.messages`. Pure function: returns a new
 * array and never mutates the input.
 *
 * @param {object} input
 * @param {Array<object>} input.messages - Conversation messages to rewrite.
 * @param {?{heardText: string, targetSpeakToolCallId: string}} [input.rollback]
 *   Pending rollback, or null/undefined when none is pending.
 * @returns {Array<object>} The messages with trailing aborted-empty
 *   assistants stripped and, when a rollback is pending, the targeted speak
 *   call rewritten.
 */
export function walkBack(input) {
    // Trailing aborted-empty assistants are stripped whether or not a
    // rollback is pending.
    const stripped = stripTrailingAbortedEmpty(input.messages);
    // `== null` deliberately matches both null and undefined, so a caller
    // that omits `rollback` gets the strip-only result instead of a
    // TypeError on `.heardText`.
    if (input.rollback == null) {
        return stripped;
    }
    const { heardText, targetSpeakToolCallId } = input.rollback;
    return rewriteByToolCallId(stripped, heardText, targetSpeakToolCallId);
}
|
|
54
|
+
// ---------------------------------------------------------------------------
|
|
55
|
+
/**
 * True for the synthetic assistant pi appends to state on abort: an
 * assistant message whose stop reason is 'aborted' and whose content is
 * empty or whitespace-only text.
 */
export function isAbortedEmptyAssistant(msg) {
    return isAssistantMessage(msg)
        && stopReason(msg) === 'aborted'
        && isEmptyText(contentOf(msg));
}
|
|
63
|
+
/**
 * Return a copy of `messages` without the run of aborted-empty assistant
 * messages at its tail. The input array is never mutated; a fresh array is
 * returned even when nothing is stripped.
 */
function stripTrailingAbortedEmpty(messages) {
    let keep = messages.length;
    for (; keep > 0; keep -= 1) {
        if (!isAbortedEmptyAssistant(messages[keep - 1])) {
            break;
        }
    }
    // slice(0, length) is a plain shallow copy, so one call covers both the
    // "nothing stripped" and the "tail stripped" cases.
    return messages.slice(0, keep);
}
|
|
69
|
+
/**
 * Truncate the conversation at the speak() tool call identified by
 * `targetId`, keeping only what the user heard.
 *
 * - `heardText` empty: the speak block is dropped.
 * - `heardText` at least as long as the block's text: the block is kept
 *   unchanged.
 * - otherwise: the block's text is replaced with `heardText`.
 *
 * In every matched case, blocks after the target in the same message and all
 * messages after the target message are dropped, and the target message's
 * `stopReason` becomes 'aborted'.
 *
 * @param {Array<object>} messages - Messages to rewrite; not mutated.
 * @param {string} heardText - The prefix the user actually heard.
 * @param {string} targetId - Tool call id of the interrupted speak().
 * @returns {Array<object>} `messages` itself when the target is not found,
 *   otherwise a new, truncated array.
 */
function rewriteByToolCallId(messages, heardText, targetId) {
    // Scan messages newest-first; within a message the first matching block
    // wins.
    let targetMsgIdx = -1;
    let targetBlockIdx = -1;
    for (let i = messages.length - 1; i >= 0 && targetMsgIdx === -1; i--) {
        const msg = messages[i];
        if (!isAssistantMessage(msg)) {
            continue;
        }
        const blockIdx = contentOf(msg).findIndex((block) => isSpeakToolCall(block) && getToolCallId(block) === targetId);
        if (blockIdx !== -1) {
            targetMsgIdx = i;
            targetBlockIdx = blockIdx;
        }
    }
    if (targetMsgIdx === -1) {
        // Target gone (compacted, or never landed in messages): nothing to
        // rewrite.
        return messages;
    }
    const targetMsg = messages[targetMsgIdx];
    const targetContent = contentOf(targetMsg);
    const targetBlock = targetContent[targetBlockIdx];
    // Blocks before the target are kept untouched.
    const keptBlocks = targetContent.slice(0, targetBlockIdx);
    if (heardText.length > 0) {
        const originalText = getSpeakText(targetBlock);
        // Whole utterance heard -> keep the block intact; otherwise keep a
        // copy whose text is cut down to the heard prefix.
        keptBlocks.push(heardText.length >= originalText.length
            ? targetBlock
            : replaceSpeakText(targetBlock, heardText));
    }
    // NOTE(review): when the target is the first block and nothing was heard,
    // the rewritten message ends up with empty content — confirm downstream
    // consumers accept that.
    const rewrittenTarget = {
        ...targetMsg,
        content: keptBlocks,
        stopReason: 'aborted',
    };
    // Everything after the target message (tool results, later assistant
    // turns) is dropped by slicing, so no per-id bookkeeping is needed.
    return [...messages.slice(0, targetMsgIdx), rewrittenTarget];
}
|
|
135
|
+
// ---------------------------------------------------------------------------
|
|
136
|
+
// Shape-tolerant accessors. pi-agent-core's runtime AgentMessage uses
|
|
137
|
+
// `toolCall`/`arguments`; the Anthropic API shape uses `tool_use`/`input`.
|
|
138
|
+
// Tests / tooling may pass either; we accept both.
|
|
139
|
+
/** True when the message's `role` is exactly 'assistant'. */
function isAssistantMessage(msg) {
    const { role } = msg;
    return role === 'assistant';
}
|
|
142
|
+
/** Read the message's `stopReason` field (undefined when it is absent). */
function stopReason(msg) {
    const { stopReason: reason } = msg;
    return reason;
}
|
|
145
|
+
/**
 * Return the message's `content` when it is an array; any other value
 * (string, undefined, object, ...) yields a fresh empty array.
 */
function contentOf(msg) {
    const { content } = msg;
    if (!Array.isArray(content)) {
        return [];
    }
    return content;
}
|
|
149
|
+
/**
 * True when `blocks` carries no visible text: either there are no blocks,
 * or every block is a `text` block whose string trims to ''.
 */
function isEmptyText(blocks) {
    const hasSubstance = blocks.some((block) => {
        if (block.type !== 'text') {
            return true;
        }
        const value = block.text;
        return typeof value !== 'string' || value.trim() !== '';
    });
    return !hasSubstance;
}
|
|
159
|
+
/**
 * True when `block` is a tool call (either the `toolCall` or the `tool_use`
 * shape) whose `name` is 'speak'.
 */
export function isSpeakToolCall(block) {
    const isToolCallShape = ['toolCall', 'tool_use'].includes(block.type);
    return isToolCallShape && block.name === 'speak';
}
|
|
164
|
+
/** Return the block's `id` when it is a string, otherwise undefined. */
function getToolCallId(block) {
    const { id } = block;
    if (typeof id !== 'string') {
        return undefined;
    }
    return id;
}
|
|
168
|
+
/**
 * Extract the spoken text from a speak block. `arguments.text` is checked
 * first, then `input.text`; the first one holding a string wins. Returns ''
 * when neither does.
 */
function getSpeakText(block) {
    for (const key of ['arguments', 'input']) {
        const payload = block[key];
        if (!payload || typeof payload !== 'object') {
            continue;
        }
        const candidate = payload.text;
        if (typeof candidate === 'string') {
            return candidate;
        }
    }
    return '';
}
|
|
184
|
+
/**
 * Return a copy of `block` whose speak text is `text`, leaving the original
 * untouched. Whichever payload shape is present is preserved: `arguments`
 * takes priority over `input`. When neither is an object, both are set.
 */
function replaceSpeakText(block, text) {
    const isObjectLike = (value) => Boolean(value) && typeof value === 'object';
    const { arguments: callArgs, input: callInput } = block;
    if (isObjectLike(callArgs)) {
        return { ...block, arguments: { ...callArgs, text } };
    }
    if (isObjectLike(callInput)) {
        return { ...block, input: { ...callInput, text } };
    }
    // Neither shape present — populate both so either reader finds the text.
    return { ...block, arguments: { text }, input: { text } };
}
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
// Wire the VoiceOrchestrator together with its runtime dependencies at
|
|
2
|
+
// server boot time. Kept separate from `index.ts` so the wiring is
|
|
3
|
+
// testable (no network / child_process side-effects at import time) and
|
|
4
|
+
// isolated from the plain HTTP/WS boot sequence.
|
|
5
|
+
import { spawn } from 'node:child_process';
|
|
6
|
+
import { VoiceOrchestrator } from './voice-orchestrator.js';
|
|
7
|
+
/**
 * Construct a VoiceOrchestrator backed by real seams:
 * - speechmux sidecar via `child_process.spawn`
 * - displacement = looks up current owner via clientRegistry and calls its
 *   `sendDisplacedEvent(sessionId)`
 *
 * Auth on `/signal` is handled by Cloudflare Access at the edge, and
 * per-session TURN credentials are minted by speechmux and returned to the
 * PWA in its `/signal` `session` response. Pimote's orchestrator only
 * hands out the signalling URL.
 *
 * @param {object} args
 * @param {object} args.config - Server config; `config.voice?.speechmuxBinary`
 *   selects the sidecar binary (unset = externally managed speechmux).
 * @param {object} args.sessionManager - Must expose `getSlot(sessionId)`.
 * @param {object} args.clientRegistry - Must expose `get(clientId)`.
 * @returns {{orchestrator: VoiceOrchestrator, shutdown: () => Promise<void>}}
 */
export function buildVoiceOrchestrator(args) {
    const { config, sessionManager, clientRegistry } = args;
    // Handle of the sidecar this process spawned, or null when none is live.
    let speechmuxProc = null;
    const busResolver = {
        getSlot: (sessionId) => sessionManager.getSlot(sessionId),
        getEventBus: (sessionId) => sessionManager.getSlot(sessionId)?.eventBusRef.current ?? null,
    };
    const orchestrator = new VoiceOrchestrator({
        config,
        sessionManager,
        busResolver,
        startSpeechmux: async () => {
            const bin = config.voice?.speechmuxBinary;
            if (!bin) {
                console.log('[voice] speechmuxBinary not configured; assuming speechmux is externally managed (systemd, container, remote host, etc.)');
                return;
            }
            if (speechmuxProc) {
                return;
            }
            const proc = spawn(bin, [], { stdio: ['ignore', 'inherit', 'inherit'] });
            speechmuxProc = proc;
            // Only clear the shared handle if it still points at THIS child,
            // so a late event from an old process cannot clobber a newer one.
            const forget = () => {
                if (speechmuxProc === proc) {
                    speechmuxProc = null;
                }
            };
            // Without an 'error' listener a failed spawn (e.g. ENOENT) is an
            // unhandled 'error' event and takes the whole server down.
            proc.on('error', (err) => {
                console.error('[voice] speechmux process error', err);
                // pid is undefined when the child never spawned; in that case
                // 'exit' may never fire, so drop the stale handle here. Other
                // errors (e.g. a failed kill) leave a live child we must keep.
                if (proc.pid === undefined) {
                    forget();
                }
            });
            proc.on('exit', (code, signal) => {
                console.warn(`[voice] speechmux exited (code=${code}, signal=${signal})`);
                forget();
            });
            // NB: we do not wait for a ready marker here — speechmux emits readiness
            // to its own logs. Callers should ensure startup ordering or implement a
            // readiness probe as part of the Step 14 smoke.
        },
        stopSpeechmux: async () => {
            if (!speechmuxProc) {
                return;
            }
            const proc = speechmuxProc;
            speechmuxProc = null;
            await new Promise((resolve) => {
                // Escalate to SIGKILL if SIGTERM has not produced an exit
                // within 2 seconds; resolve either way so shutdown never hangs.
                const timer = setTimeout(() => {
                    try {
                        proc.kill('SIGKILL');
                    }
                    catch {
                        /* ignore */
                    }
                    resolve();
                }, 2000);
                proc.once('exit', () => {
                    clearTimeout(timer);
                    resolve();
                });
                try {
                    proc.kill('SIGTERM');
                }
                catch {
                    // Could not even signal the child: nothing left to wait for.
                    clearTimeout(timer);
                    resolve();
                }
            });
        },
        displaceOwner: async (sessionId, _newOwner) => {
            const slot = sessionManager.getSlot(sessionId);
            const existingClientId = slot?.connection?.connectedClientId;
            if (!existingClientId) {
                return;
            }
            const existing = clientRegistry.get(existingClientId);
            existing?.sendDisplacedEvent(sessionId);
        },
        // Arrow function defers the `orchestrator` lookup to call time, after
        // the constructor has returned.
        isOwnedByVoiceCall: (sessionId) => orchestrator.isCallActive(sessionId),
    });
    return {
        orchestrator,
        shutdown: async () => {
            await orchestrator.stop();
        },
    };
}
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
// Voice orchestrator — owns the speechmux sidecar lifecycle and the per-call
|
|
2
|
+
// bind dispatch. See docs/plans/voice-mode.md → "Voice orchestrator".
|
|
3
|
+
//
|
|
4
|
+
// This file defines the interface surface + a stub implementation. The impl
|
|
5
|
+
// phase fills in start()/stop()/bindCall()/endCall() bodies.
|
|
6
|
+
/**
 * Error thrown when binding a voice call fails. `code` holds the
 * discriminable reason code used in PimoteResponse.error; when no message
 * is supplied, the code doubles as the error message.
 */
export class CallBindError extends Error {
    code;
    constructor(code, message) {
        const text = message ?? code;
        super(text);
        Object.assign(this, { code, name: 'CallBindError' });
    }
}
|
|
15
|
+
/**
 * Tracks which sessions have an active voice call and drives the speechmux
 * lifecycle through the callbacks injected via `opts`:
 * `startSpeechmux`, `stopSpeechmux`, `displaceOwner`, `isOwnedByVoiceCall`,
 * plus `busResolver` (`getSlot` / `getEventBus`) and `config`.
 */
export class VoiceOrchestrator {
    opts;
    started = false;
    activeCalls = new Set();
    constructor(opts) {
        this.opts = opts;
    }
    /**
     * Start speechmux via `opts.startSpeechmux`. No-op when already started.
     * A rejection from the callback propagates and leaves `started` false.
     */
    async start() {
        if (this.started) {
            return;
        }
        await this.opts.startSpeechmux();
        this.started = true;
    }
    /**
     * Stop speechmux via `opts.stopSpeechmux` and forget all active calls.
     * No-op when not started, so it is safe to call repeatedly.
     */
    async stop() {
        if (!this.started) {
            return;
        }
        await this.opts.stopSpeechmux();
        this.started = false;
        this.activeCalls.clear();
    }
    /**
     * Called by ws-handler for CallBindCommand. Binds a voice call to a
     * session and emits `pimote:voice:activate` on the session's EventBus.
     *
     * @param {object} args
     * @param {string} args.sessionId - Session to bind.
     * @param {boolean} [args.force] - Displace an existing voice-call owner.
     * @param {object} [args.clientConnection] - Passed through to
     *   `opts.displaceOwner` as the new owner.
     * @returns {Promise<{sessionId: string, webrtcSignalUrl: string}>}
     * @throws {CallBindError} `call_bind_failed_session_not_found`,
     *   `call_bind_failed_owned`, or `call_bind_failed_internal`.
     */
    async bindCall(args) {
        const { sessionId } = args;
        const slot = this.opts.busResolver.getSlot(sessionId);
        if (!slot) {
            throw new CallBindError('call_bind_failed_session_not_found', `No session ${sessionId}`);
        }
        const alreadyOwned = this.opts.isOwnedByVoiceCall(sessionId);
        if (alreadyOwned && !args.force) {
            throw new CallBindError('call_bind_failed_owned', 'Session already bound to a voice call');
        }
        // Voice-disabled guard: if speechmux wiring isn't configured, fail the
        // bind here rather than handing the client empty URLs. Speechmux is
        // what mints the per-call TURN creds now (in the /signal `session`
        // response) and what authenticates peers (via Cloudflare Access at the
        // edge), so pimote no longer needs to mint anything.
        const signalUrl = this.opts.config.voice?.speechmuxSignalUrl;
        const llmWsUrl = this.opts.config.voice?.speechmuxLlmWsUrl;
        if (!signalUrl || !llmWsUrl) {
            throw new CallBindError('call_bind_failed_internal', 'voice_disabled: speechmux signal URL / llm WS URL not configured');
        }
        const bus = this.opts.busResolver.getEventBus(sessionId);
        if (!bus) {
            throw new CallBindError('call_bind_failed_internal', 'Session has no EventBus');
        }
        // Displace only once every precondition has passed: a forced bind
        // that is going to fail must not kick the current owner first.
        if (alreadyOwned) {
            await this.opts.displaceOwner(sessionId, args.clientConnection);
        }
        const activate = {
            type: 'pimote:voice:activate',
            sessionId,
            speechmuxWsUrl: llmWsUrl,
        };
        bus.emit(activate.type, activate);
        this.activeCalls.add(sessionId);
        return {
            sessionId,
            webrtcSignalUrl: signalUrl,
        };
    }
    /**
     * Called by ws-handler for CallEndCommand, or internally on
     * displacement/error. Idempotent: a session with no active call is
     * ignored. Emits `pimote:voice:deactivate` when the session still has an
     * EventBus.
     *
     * @param {object} args
     * @param {string} args.sessionId - Session whose call should end.
     */
    async endCall(args) {
        // delete() reports whether the id was present, which doubles as the
        // "was this call active?" check.
        if (!this.activeCalls.delete(args.sessionId)) {
            return;
        }
        const bus = this.opts.busResolver.getEventBus(args.sessionId);
        if (bus) {
            const deactivate = { type: 'pimote:voice:deactivate', sessionId: args.sessionId };
            bus.emit(deactivate.type, deactivate);
        }
    }
    /** True if the given session currently has an active voice call bound. */
    isCallActive(sessionId) {
        return this.activeCalls.has(sessionId);
    }
}
|
|
@@ -8,6 +8,7 @@ import { createExtensionUIBridge } from './extension-ui-bridge.js';
|
|
|
8
8
|
import { findExternalPiProcesses, killExternalPiProcesses } from './takeover.js';
|
|
9
9
|
import { mapAgentMessages, extractMessageEntryIds, applyEntryIds } from './message-mapper.js';
|
|
10
10
|
import { getGitBranch } from './git-branch.js';
|
|
11
|
+
import { CallBindError } from './voice-orchestrator.js';
|
|
11
12
|
/** Parse data-URL encoded images into the shape the pi SDK expects. */
|
|
12
13
|
function parseDataUrlImages(images) {
|
|
13
14
|
if (!images || images.length === 0)
|
|
@@ -133,16 +134,18 @@ export class WsHandler {
|
|
|
133
134
|
pushNotificationService;
|
|
134
135
|
sessionMetadataStore;
|
|
135
136
|
clientRegistry;
|
|
137
|
+
voiceOrchestrator;
|
|
136
138
|
subscribedSessions = new Set();
|
|
137
139
|
viewedSessionId = null;
|
|
138
140
|
clientId;
|
|
139
|
-
constructor(sessionManager, folderIndex, ws, pushNotificationService, sessionMetadataStore, clientId, clientRegistry) {
|
|
141
|
+
constructor(sessionManager, folderIndex, ws, pushNotificationService, sessionMetadataStore, clientId, clientRegistry, voiceOrchestrator) {
|
|
140
142
|
this.sessionManager = sessionManager;
|
|
141
143
|
this.folderIndex = folderIndex;
|
|
142
144
|
this.ws = ws;
|
|
143
145
|
this.pushNotificationService = pushNotificationService;
|
|
144
146
|
this.sessionMetadataStore = sessionMetadataStore;
|
|
145
147
|
this.clientRegistry = clientRegistry;
|
|
148
|
+
this.voiceOrchestrator = voiceOrchestrator;
|
|
146
149
|
this.clientId = clientId;
|
|
147
150
|
}
|
|
148
151
|
getViewedSessionId() {
|
|
@@ -480,6 +483,65 @@ export class WsHandler {
|
|
|
480
483
|
this.sendResponse(id, true, { sessionId: takeoverSessionId, killedProcesses: killedCount });
|
|
481
484
|
break;
|
|
482
485
|
}
|
|
486
|
+
// ---- Voice call control ----
|
|
487
|
+
case 'call_bind': {
|
|
488
|
+
if (!this.voiceOrchestrator) {
|
|
489
|
+
this.sendResponse(id, false, undefined, 'call_bind_failed_internal');
|
|
490
|
+
break;
|
|
491
|
+
}
|
|
492
|
+
const slot = this.sessionManager.getSlot(command.sessionId);
|
|
493
|
+
if (!slot) {
|
|
494
|
+
this.sendResponse(id, false, undefined, 'call_bind_failed_session_not_found');
|
|
495
|
+
break;
|
|
496
|
+
}
|
|
497
|
+
const connection = {
|
|
498
|
+
ws: this.ws,
|
|
499
|
+
connectedClientId: this.clientId,
|
|
500
|
+
onSessionReset: (s) => this.handleSessionReset(s),
|
|
501
|
+
};
|
|
502
|
+
try {
|
|
503
|
+
const data = await this.voiceOrchestrator.bindCall({
|
|
504
|
+
sessionId: command.sessionId,
|
|
505
|
+
clientConnection: connection,
|
|
506
|
+
force: command.force ?? false,
|
|
507
|
+
});
|
|
508
|
+
this.sendResponse(id, true, data);
|
|
509
|
+
this.sendEvent({ type: 'call_status', sessionId: command.sessionId, status: 'binding' });
|
|
510
|
+
}
|
|
511
|
+
catch (err) {
|
|
512
|
+
if (err instanceof CallBindError) {
|
|
513
|
+
this.sendResponse(id, false, undefined, err.code);
|
|
514
|
+
}
|
|
515
|
+
else {
|
|
516
|
+
console.warn('[voice] call_bind failed', err);
|
|
517
|
+
this.sendResponse(id, false, undefined, 'call_bind_failed_internal');
|
|
518
|
+
}
|
|
519
|
+
}
|
|
520
|
+
break;
|
|
521
|
+
}
|
|
522
|
+
case 'call_end': {
|
|
523
|
+
await this.voiceOrchestrator?.endCall({ sessionId: command.sessionId, reason: 'user_hangup' });
|
|
524
|
+
this.sendResponse(id, true);
|
|
525
|
+
this.sendEvent({ type: 'call_ended', sessionId: command.sessionId, reason: 'user_hangup' });
|
|
526
|
+
break;
|
|
527
|
+
}
|
|
528
|
+
// ---- Client diagnostic logs (voice/call tracing) ----
|
|
529
|
+
case 'client_log': {
|
|
530
|
+
// Forward to the server's logger so client-side traces interleave
|
|
531
|
+
// with the server-side voice extension logs in the same journal.
|
|
532
|
+
const clientWall = new Date(command.clientTimestampMs).toISOString();
|
|
533
|
+
const serverWall = new Date().toISOString();
|
|
534
|
+
const driftMs = Date.now() - command.clientTimestampMs;
|
|
535
|
+
const line = `[voice_trace][client/${command.tag}] ${command.message} ${JSON.stringify({ clientWall, serverWall, driftMs, ...(command.data ?? {}) })}`;
|
|
536
|
+
if (command.level === 'error')
|
|
537
|
+
console.error(line);
|
|
538
|
+
else if (command.level === 'warn')
|
|
539
|
+
console.warn(line);
|
|
540
|
+
else
|
|
541
|
+
console.log(line);
|
|
542
|
+
this.sendResponse(id, true);
|
|
543
|
+
break;
|
|
544
|
+
}
|
|
483
545
|
// ---- Extension UI ----
|
|
484
546
|
case 'extension_ui_response': {
|
|
485
547
|
const uiSlot = command.sessionId ? this.sessionManager.getSession(command.sessionId) : undefined;
|
|
@@ -896,13 +958,26 @@ export class WsHandler {
|
|
|
896
958
|
}
|
|
897
959
|
}
|
|
898
960
|
/** Notify the old owner that they've been displaced from a session.
|
|
899
|
-
* No-op if the session is unowned or owned by this client.
|
|
961
|
+
* No-op if the session is unowned or owned by this client.
|
|
962
|
+
*
|
|
963
|
+
* Voice-call tear-down on displacement lives in `sendDisplacedEvent` (the
|
|
964
|
+
* old-owner-side site that also emits `call_ended { displaced }`), so this
|
|
965
|
+
* method calls `voiceOrchestrator.endCall` itself only for a stale owner — see review
|
|
966
|
+
* finding 4.
|
|
967
|
+
*/
|
|
900
968
|
displaceOwner(sessionId, slot) {
|
|
901
969
|
if (slot.connection?.connectedClientId && slot.connection.connectedClientId !== this.clientId) {
|
|
902
970
|
const oldHandler = this.clientRegistry.get(slot.connection.connectedClientId);
|
|
903
971
|
if (oldHandler) {
|
|
904
972
|
oldHandler.sendDisplacedEvent(sessionId);
|
|
905
973
|
}
|
|
974
|
+
else if (this.voiceOrchestrator?.isCallActive(sessionId)) {
|
|
975
|
+
// Stale owner id with no live handler — clean up orchestrator state
|
|
976
|
+
// so the new owner doesn't inherit a phantom active call.
|
|
977
|
+
this.voiceOrchestrator.endCall({ sessionId, reason: 'displaced' }).catch((err) => {
|
|
978
|
+
console.warn('[voice] endCall on displace (stale handler) failed', err);
|
|
979
|
+
});
|
|
980
|
+
}
|
|
906
981
|
}
|
|
907
982
|
}
|
|
908
983
|
/** Bind a slot to this client — sets ownership, WebSocket routing,
|
|
@@ -920,7 +995,9 @@ export class WsHandler {
|
|
|
920
995
|
// ManagedSlot — on reconnect we skip rebinding, but on session reset
|
|
921
996
|
// we must rebind so the bridge points at the new session state.
|
|
922
997
|
if (!slot.sessionState.extensionsBound) {
|
|
923
|
-
const uiContext = createExtensionUIBridge(slot, this.pushNotificationService
|
|
998
|
+
const uiContext = createExtensionUIBridge(slot, this.pushNotificationService, {
|
|
999
|
+
isVoiceModeActive: () => this.voiceOrchestrator?.isCallActive(sessionId) ?? false,
|
|
1000
|
+
});
|
|
924
1001
|
const commandContextActions = createCommandContextActions(slot);
|
|
925
1002
|
await slot.session.bindExtensions({ uiContext, commandContextActions });
|
|
926
1003
|
slot.sessionState.extensionsBound = true;
|
|
@@ -939,9 +1016,12 @@ export class WsHandler {
|
|
|
939
1016
|
return;
|
|
940
1017
|
}
|
|
941
1018
|
// Session ID changed — rebuild session state in-place on the same slot.
|
|
942
|
-
|
|
1019
|
+
// rebuildSessionState refreshes slot.folderPath from the new session's header cwd,
|
|
1020
|
+
// so capture folderPath AFTER the rebuild to pick up the new value (fork-from can
|
|
1021
|
+
// change cwd, e.g. the worktree extension).
|
|
943
1022
|
// Rebuild session state (tears down old, creates new from runtime.session)
|
|
944
1023
|
this.sessionManager.rebuildSessionState(slot);
|
|
1024
|
+
const folderPath = slot.folderPath;
|
|
945
1025
|
// Re-key the session map
|
|
946
1026
|
this.sessionManager.reKeySession(slot, oldSessionId, newSessionId);
|
|
947
1027
|
// Update handler bookkeeping
|
|
@@ -951,7 +1031,9 @@ export class WsHandler {
|
|
|
951
1031
|
this.viewedSessionId = newSessionId;
|
|
952
1032
|
}
|
|
953
1033
|
// Rebind extension UI bridge (new session state for dialog routing)
|
|
954
|
-
const uiContext = createExtensionUIBridge(slot, this.pushNotificationService
|
|
1034
|
+
const uiContext = createExtensionUIBridge(slot, this.pushNotificationService, {
|
|
1035
|
+
isVoiceModeActive: () => this.voiceOrchestrator?.isCallActive(newSessionId) ?? false,
|
|
1036
|
+
});
|
|
955
1037
|
const commandContextActions = createCommandContextActions(slot);
|
|
956
1038
|
await slot.session.bindExtensions({ uiContext, commandContextActions });
|
|
957
1039
|
slot.sessionState.extensionsBound = true;
|
|
@@ -1064,6 +1146,27 @@ export class WsHandler {
|
|
|
1064
1146
|
sessionId,
|
|
1065
1147
|
reason: 'displaced',
|
|
1066
1148
|
});
|
|
1149
|
+
// If the old owner had an active voice call on this session, tear down
|
|
1150
|
+
// orchestrator bookkeeping and surface `call_ended { reason: 'displaced' }`
|
|
1151
|
+
// so their VoiceCallStore tears down alongside the session_closed.
|
|
1152
|
+
if (this.voiceOrchestrator?.isCallActive(sessionId)) {
|
|
1153
|
+
void this.voiceOrchestrator.endCall({ sessionId, reason: 'displaced' });
|
|
1154
|
+
this.sendEvent({
|
|
1155
|
+
type: 'call_ended',
|
|
1156
|
+
sessionId,
|
|
1157
|
+
reason: 'displaced',
|
|
1158
|
+
});
|
|
1159
|
+
}
|
|
1160
|
+
}
|
|
1161
|
+
/** Broadcast a `call_ended` to this client (used by the session manager's
|
|
1162
|
+
* before-close hook so the orchestrator bookkeeping owner learns that a
|
|
1163
|
+
* server-initiated teardown happened). */
|
|
1164
|
+
sendCallEndedEvent(sessionId, reason) {
|
|
1165
|
+
this.sendEvent({
|
|
1166
|
+
type: 'call_ended',
|
|
1167
|
+
sessionId,
|
|
1168
|
+
reason,
|
|
1169
|
+
});
|
|
1067
1170
|
}
|
|
1068
1171
|
/** Send a session_closed event with reason 'killed' to this client's WebSocket.
|
|
1069
1172
|
* Also removes the session from this handler's subscribedSessions so that
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export * from './protocol.js';
|