@pimote/pimote 0.2.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. package/README.md +43 -16
  2. package/client/build/_app/immutable/assets/0.C7loWTOC.css +2 -0
  3. package/client/build/_app/immutable/assets/2.DwPXxSa-.css +1 -0
  4. package/client/build/_app/immutable/chunks/-Lc-U-GJ.js +1 -0
  5. package/client/build/_app/immutable/chunks/{CT6ckxpD.js → CO_BwWGt.js} +1 -1
  6. package/client/build/_app/immutable/chunks/CklMSqcv.js +1 -0
  7. package/client/build/_app/immutable/chunks/D1INvMB9.js +1 -0
  8. package/client/build/_app/immutable/chunks/D1vhgXpq.js +5 -0
  9. package/client/build/_app/immutable/entry/{app.CNzpBgAg.js → app.B-HFVtpC.js} +2 -2
  10. package/client/build/_app/immutable/entry/start.DJTQ8-sD.js +1 -0
  11. package/client/build/_app/immutable/nodes/0.CepAO4xf.js +10 -0
  12. package/client/build/_app/immutable/nodes/{1.B8zmHMre.js → 1.CmxFYjRm.js} +1 -1
  13. package/client/build/_app/immutable/nodes/2.DAtqfmki.js +54 -0
  14. package/client/build/_app/version.json +1 -1
  15. package/client/build/index.html +7 -7
  16. package/package.json +7 -3
  17. package/server/dist/auto-drain-on-abort.js +49 -0
  18. package/server/dist/config.js +21 -0
  19. package/server/dist/extension-ui-bridge.js +14 -1
  20. package/server/dist/index.js +36 -1
  21. package/server/dist/message-mapper.js +38 -6
  22. package/server/dist/push-notification.js +11 -0
  23. package/server/dist/server.js +2 -2
  24. package/server/dist/session-manager.js +72 -4
  25. package/server/dist/voice/fsm/actions.js +6 -0
  26. package/server/dist/voice/fsm/events.js +7 -0
  27. package/server/dist/voice/fsm/reducer.js +74 -0
  28. package/server/dist/voice/fsm/reducers/lifecycle.js +158 -0
  29. package/server/dist/voice/fsm/reducers/streaming.js +220 -0
  30. package/server/dist/voice/fsm/reducers/walkback.js +73 -0
  31. package/server/dist/voice/fsm/state.js +21 -0
  32. package/server/dist/voice/fsm/text-extractor.js +128 -0
  33. package/server/dist/voice/index.js +336 -0
  34. package/server/dist/voice/interpreter-prompt.js +115 -0
  35. package/server/dist/voice/speechmux-client.js +153 -0
  36. package/server/dist/voice/state-machine.js +14 -0
  37. package/server/dist/voice/wait-for-idle.js +67 -0
  38. package/server/dist/voice/walk-back.js +198 -0
  39. package/server/dist/voice-orchestrator-boot.js +90 -0
  40. package/server/dist/voice-orchestrator.js +91 -0
  41. package/server/dist/ws-handler.js +112 -7
  42. package/shared/dist/index.d.ts +1 -0
  43. package/shared/dist/index.js +2 -0
  44. package/shared/dist/protocol.d.ts +614 -0
  45. package/shared/dist/protocol.js +30 -0
  46. package/client/build/_app/immutable/assets/0.DBrr7n4n.css +0 -2
  47. package/client/build/_app/immutable/assets/2.DE6k3bQj.css +0 -1
  48. package/client/build/_app/immutable/chunks/5vSSf6qG.js +0 -5
  49. package/client/build/_app/immutable/chunks/DlJOVoUQ.js +0 -1
  50. package/client/build/_app/immutable/chunks/YxmLwfhj.js +0 -1
  51. package/client/build/_app/immutable/chunks/yWVx3W2o.js +0 -1
  52. package/client/build/_app/immutable/entry/start.DYkTAHh1.js +0 -1
  53. package/client/build/_app/immutable/nodes/0.DNlQhEb_.js +0 -10
  54. package/client/build/_app/immutable/nodes/2.W9yV4-x2.js +0 -54
@@ -0,0 +1,67 @@
1
+ // Wait-for-idle helper for the voice extension.
2
+ //
3
+ // The speechmux abort/user frame pair on barge-in arrives as two
4
+ // independent reducer calls. The `abort` action calls `ctx.abort()`
5
+ // (fire-and-forget — the actual teardown completes asynchronously). If
6
+ // the user frame arrives before the teardown is done, `sendUserMessage`
7
+ // throws ("Agent is already processing…") and the user's utterance is
8
+ // silently dropped.
9
+ //
10
+ // Steering doesn't help: pi-agent-core doesn't drain the steer queue on
11
+ // the abort exit path of `runLoop`. Pimote has a separate
12
+ // `autoDrainOnAbort` listener (see `auto-drain-on-abort.ts`) that
13
+ // rescues queued messages after an aborted run, but that only catches
14
+ // messages that *were* queued — it doesn't help an unqueued
15
+ // `sendUserMessage` that throws.
16
+ //
17
+ // So the voice extension polls `ctx.isIdle()` before calling
18
+ // `sendUserMessage` (without `deliverAs`), guaranteeing the SDK won't
19
+ // throw. Auto-drain remains a belt-and-braces safety net for any
20
+ // queued path that races an abort.
/**
 * Resolve once the agent is idle, polling with exponential backoff
 * (start 5 ms, doubling, capped at 50 ms). Returns true immediately
 * when already idle.
 *
 * Each sleep is clamped to the time left in the budget, so the final
 * idle check happens at the deadline rather than up to one back-off
 * step (as much as 50 ms) after it.
 *
 * Timeout default of 2 s is well above any normal abort-teardown
 * latency (tens to a few hundred ms). If a real agent doesn't reach
 * idle within 2 s, something is genuinely stuck and dropping the
 * message is preferable to hanging the executor.
 *
 * @param ctx - Object exposing `isIdle()`; it is polled synchronously.
 * @param {number} [timeoutMs=2000] - Maximum time to wait, in milliseconds.
 * @returns {Promise<boolean>} true once `ctx.isIdle()` reports idle,
 *   false if the budget ran out first.
 */
export async function waitForAgentIdle(ctx, timeoutMs = 2000) {
    if (ctx.isIdle())
        return true;
    const deadline = Date.now() + timeoutMs;
    let delay = 5;
    while (!ctx.isIdle()) {
        const remaining = deadline - Date.now();
        // Written as a negated `>` so a NaN budget times out instead of
        // spinning forever.
        if (!(remaining > 0))
            return false;
        const sleepMs = Math.min(delay, remaining);
        await new Promise((resolve) => setTimeout(resolve, sleepMs));
        delay = Math.min(50, delay * 2);
    }
    return true;
}
/**
 * Make sure the agent is idle before a user message is delivered,
 * issuing an implicit barge-in when it is not.
 *
 * Speechmux only emits `abort` while it is actively playing TTS — i.e.
 * during the harness's `token`/`end` stream. While the worker is
 * silently reasoning between a `user` frame and its first `speak()`
 * call, speechmux has no signal that the agent is busy and won't
 * pre-empt. A new utterance in that window reaches the harness with no
 * preceding `abort`, so the agent is still mid-turn and
 * `sendUserMessage` would race / be dropped.
 *
 * To close that gap, a busy agent is aborted here via `ctx.abort()`
 * and then polled for idle with `waitForAgentIdle`.
 *
 * @param ctx - Object exposing `isIdle()` and `abort()`.
 * @param {number} [timeoutMs=2000] - Budget handed to `waitForAgentIdle`.
 * @returns {Promise<boolean>} true once idle, false on timeout.
 */
export async function ensureIdleWithImplicitAbort(ctx, timeoutMs = 2000) {
    const busy = !ctx.isIdle();
    if (busy) {
        // Fire-and-forget: the teardown completes asynchronously, so we
        // poll for idle afterwards instead of awaiting the abort itself.
        ctx.abort();
        return waitForAgentIdle(ctx, timeoutMs);
    }
    return true;
}
@@ -0,0 +1,198 @@
1
+ // Walkback rewrite: surgical truncation of conversation history when
2
+ // speechmux reports the user heard only a prefix of an assistant
3
+ // utterance.
4
+ //
5
+ // **Identity-based design.** Walkback targets a specific `speak()` tool
6
+ // call by its `toolCallId`. That id is round-tripped through speechmux
7
+ // (every outgoing `token`/`end` frame carries it; speechmux echoes it
8
+ // back on `rollback`/`abort`) so we know exactly which utterance the
9
+ // `heardText` belongs to. The previous design used a captured snapshot
10
+ // of the in-flight assistant message and a string-prefix-matching
11
+ // algorithm — both of which broke whenever a turn contained more than
12
+ // one speak() or whenever the snapshot drifted out of sync with the
13
+ // real conversation.
14
+ //
15
+ // **Contract:** see `docs/plans/voice-mode.md` for the high-level
16
+ // behavioural spec. Briefly:
17
+ //
18
+ // 1. The trailing pi-synthetic empty-text aborted assistant (if any)
19
+ // is always stripped, even when no rollback is pending. This is
20
+ // pi's marker for "agent run was aborted"; we don't want it in
21
+ // the LLM context.
22
+ //
23
+ // 2. With a rollback pending, locate the speak block by
24
+ // `targetSpeakToolCallId`. If found:
25
+ // - If `heardText` is empty: drop the speak block entirely (and
26
+ // its paired tool_result if present).
27
+ // - If `heardText.length >= block.text.length`: keep block as-is
28
+ // (whole utterance was heard).
29
+ // - Otherwise: replace the block's text with `heardText` and
30
+ // drop the paired tool_result.
31
+ // Then drop blocks AFTER the target in the same message, and drop
32
+ // any subsequent assistant/tool_result messages — none of those
33
+ // could have been heard if the user interrupted at the target.
34
+ //
35
+ // 3. If the target is not found in messages (e.g. compacted away),
36
+ // walkback is a no-op beyond step 1.
37
+ //
38
+ // **Content-block shape compatibility.** The function handles both
39
+ // pi-agent-core's internal AgentMessage shape (`type:'toolCall'` +
40
+ // `arguments`) and the Anthropic API shape (`type:'tool_use'` +
41
+ // `input`). Earlier versions only matched the latter, which silently
42
+ // failed on every real captured message.
43
+ /**
44
+ * Apply walkback against `messages`. Pure function.
45
+ *
46
+ * Returns a new array; never mutates the input.
47
+ */
48
+ export function walkBack(input) {
49
+ const stripped = stripTrailingAbortedEmpty(input.messages);
50
+ if (input.rollback === null)
51
+ return stripped;
52
+ return rewriteByToolCallId(stripped, input.rollback.heardText, input.rollback.targetSpeakToolCallId);
53
+ }
54
+ // ---------------------------------------------------------------------------
/**
 * True for the synthetic assistant pi appends to state on abort: an
 * assistant message whose stop reason is `'aborted'` and whose content
 * holds no non-blank text (or no blocks at all).
 */
export function isAbortedEmptyAssistant(msg) {
    return isAssistantMessage(msg)
        && stopReason(msg) === 'aborted'
        && isEmptyText(contentOf(msg));
}
/**
 * Return a copy of `messages` without the run of aborted-empty
 * assistant messages at its tail. Always returns a new array, even when
 * nothing is removed.
 */
function stripTrailingAbortedEmpty(messages) {
    let keep = messages.length;
    for (; keep > 0; keep -= 1) {
        if (!isAbortedEmptyAssistant(messages[keep - 1]))
            break;
    }
    // slice(0, length) is a plain shallow copy, so one call covers both
    // the "nothing stripped" and the "tail stripped" cases.
    return messages.slice(0, keep);
}
/**
 * Rewrite history around the `speak` tool call identified by `targetId`.
 *
 * - Target not found (compacted away, or never landed): `messages` is
 *   returned unchanged.
 * - `heardText` empty: the speak block is dropped.
 * - `heardText` at least as long as the original text: block kept as-is.
 * - Otherwise: the block's text is replaced by `heardText`.
 *
 * In every found case, blocks after the target in the same message and
 * all messages after the target message are dropped — they were emitted
 * after the point where the user interrupted, so none of them was heard.
 * This also removes any tool_result paired with the target speak.
 *
 * If the rewritten message would be left with no content (or only blank
 * text), it is omitted entirely: marked `aborted`, it would be exactly
 * the aborted-empty assistant marker that walkback strips from context.
 *
 * @param messages - Conversation history; not mutated.
 * @param {string} heardText - Prefix of the utterance the user heard.
 * @param {string} targetId - Tool call id of the interrupted `speak`.
 * @returns A new array, or `messages` itself when the target is absent.
 */
function rewriteByToolCallId(messages, heardText, targetId) {
    // Search from the back — toolCallIds are unique per session, so the
    // first match is the right one, and the target is usually in the
    // recent tail.
    let targetMsgIdx = -1;
    let targetBlockIdx = -1;
    for (let i = messages.length - 1; i >= 0 && targetMsgIdx === -1; i--) {
        const msg = messages[i];
        if (!isAssistantMessage(msg))
            continue;
        const blockIdx = contentOf(msg).findIndex((block) => isSpeakToolCall(block) && getToolCallId(block) === targetId);
        if (blockIdx !== -1) {
            targetMsgIdx = i;
            targetBlockIdx = blockIdx;
        }
    }
    if (targetMsgIdx === -1) {
        // Target gone. Stripping the abort marker (done by the caller) is
        // the best we can do.
        return messages;
    }
    const targetMsg = messages[targetMsgIdx];
    const targetContent = contentOf(targetMsg);
    const targetBlock = targetContent[targetBlockIdx];
    const originalText = getSpeakText(targetBlock);
    // Everything before the target block in the same message is kept.
    const keptBlocks = targetContent.slice(0, targetBlockIdx);
    if (heardText.length === 0) {
        // Nothing was heard of this speak: the block is left out.
    }
    else if (heardText.length >= originalText.length) {
        // Entire utterance was heard. Keep block intact.
        keptBlocks.push(targetBlock);
    }
    else {
        // Partial. Keep only what the user actually heard.
        keptBlocks.push(replaceSpeakText(targetBlock, heardText));
    }
    const precedingMessages = messages.slice(0, targetMsgIdx);
    if (isEmptyText(keptBlocks)) {
        // An aborted assistant with no content is the abort marker itself;
        // do not re-introduce it at the tail of the history.
        return precedingMessages;
    }
    const rewrittenTarget = {
        ...targetMsg,
        content: keptBlocks,
        stopReason: 'aborted',
    };
    return [...precedingMessages, rewrittenTarget];
}
135
+ // ---------------------------------------------------------------------------
136
+ // Shape-tolerant accessors. pi-agent-core's runtime AgentMessage uses
137
+ // `toolCall`/`arguments`; the Anthropic API shape uses `tool_use`/`input`.
138
+ // Tests / tooling may pass either; we accept both.
/** True when the message's `role` is exactly `'assistant'`. */
function isAssistantMessage(msg) {
    const { role } = msg;
    return role === 'assistant';
}
/** Read the message's `stopReason` field (undefined when absent). */
function stopReason(msg) {
    const { stopReason: reason } = msg;
    return reason;
}
/**
 * Return the message's content blocks. Any non-array `content`
 * (missing, string, object, ...) yields an empty array.
 */
function contentOf(msg) {
    const { content } = msg;
    if (!Array.isArray(content))
        return [];
    return content;
}
/**
 * True when `blocks` carries no visible content: either it is empty, or
 * every block is a `text` block whose `text` is a string that is blank
 * after trimming. Any other block type makes the result false.
 */
function isEmptyText(blocks) {
    const hasSubstance = blocks.some((block) => {
        if (block.type !== 'text')
            return true;
        const { text } = block;
        return typeof text !== 'string' || text.trim() !== '';
    });
    return !hasSubstance;
}
159
+ export function isSpeakToolCall(block) {
160
+ if (block.type !== 'toolCall' && block.type !== 'tool_use')
161
+ return false;
162
+ return block.name === 'speak';
163
+ }
164
+ function getToolCallId(block) {
165
+ const id = block.id;
166
+ return typeof id === 'string' ? id : undefined;
167
+ }
168
+ function getSpeakText(block) {
169
+ // Try both shapes; whichever holds a string wins.
170
+ const args = block.arguments;
171
+ if (args && typeof args === 'object') {
172
+ const t = args.text;
173
+ if (typeof t === 'string')
174
+ return t;
175
+ }
176
+ const input = block.input;
177
+ if (input && typeof input === 'object') {
178
+ const t = input.text;
179
+ if (typeof t === 'string')
180
+ return t;
181
+ }
182
+ return '';
183
+ }
184
+ function replaceSpeakText(block, text) {
185
+ // Preserve whichever args/input shape was present, replacing only the
186
+ // `text` field. We don't normalise to a single shape — that would
187
+ // diverge from whatever pi-agent-core/the provider expects.
188
+ const args = block.arguments;
189
+ const input = block.input;
190
+ if (args && typeof args === 'object') {
191
+ return { ...block, arguments: { ...args, text } };
192
+ }
193
+ if (input && typeof input === 'object') {
194
+ return { ...block, input: { ...input, text } };
195
+ }
196
+ // Neither shape present — set both defensively.
197
+ return { ...block, arguments: { text }, input: { text } };
198
+ }
@@ -0,0 +1,90 @@
1
+ // Wire the VoiceOrchestrator together with its runtime dependencies at
2
+ // server boot time. Kept separate from `index.ts` so the wiring is
3
+ // testable (no network / child_process side-effects at import time) and
4
+ // isolated from the plain HTTP/WS boot sequence.
5
+ import { spawn } from 'node:child_process';
6
+ import { VoiceOrchestrator } from './voice-orchestrator.js';
7
+ /**
8
+ * Construct a VoiceOrchestrator backed by real seams:
9
+ * - speechmux sidecar via `child_process.spawn`
10
+ * - displacement = looks up current owner via clientRegistry and calls its
11
+ * `sendDisplacedEvent(sessionId)`
12
+ *
13
+ * Auth on `/signal` is handled by Cloudflare Access at the edge, and
14
+ * per-session TURN credentials are minted by speechmux and returned to the
15
+ * PWA in its `/signal` `session` response. Pimote's orchestrator only
16
+ * hands out the signalling URL.
17
+ */
18
+ export function buildVoiceOrchestrator(args) {
19
+ const { config, sessionManager, clientRegistry } = args;
20
+ let speechmuxProc = null;
21
+ const busResolver = {
22
+ getSlot: (sessionId) => sessionManager.getSlot(sessionId),
23
+ getEventBus: (sessionId) => sessionManager.getSlot(sessionId)?.eventBusRef.current ?? null,
24
+ };
25
+ const orchestrator = new VoiceOrchestrator({
26
+ config,
27
+ sessionManager,
28
+ busResolver,
29
+ startSpeechmux: async () => {
30
+ const bin = config.voice?.speechmuxBinary;
31
+ if (!bin) {
32
+ console.log('[voice] speechmuxBinary not configured; assuming speechmux is externally managed (systemd, container, remote host, etc.)');
33
+ return;
34
+ }
35
+ if (speechmuxProc)
36
+ return;
37
+ speechmuxProc = spawn(bin, [], { stdio: ['ignore', 'inherit', 'inherit'] });
38
+ speechmuxProc.on('exit', (code, signal) => {
39
+ console.warn(`[voice] speechmux exited (code=${code}, signal=${signal})`);
40
+ speechmuxProc = null;
41
+ });
42
+ // NB: we do not wait for a ready marker here — speechmux emits readiness
43
+ // to its own logs. Callers should ensure startup ordering or implement a
44
+ // readiness probe as part of the Step 14 smoke.
45
+ },
46
+ stopSpeechmux: async () => {
47
+ if (!speechmuxProc)
48
+ return;
49
+ const proc = speechmuxProc;
50
+ speechmuxProc = null;
51
+ await new Promise((resolve) => {
52
+ const timer = setTimeout(() => {
53
+ try {
54
+ proc.kill('SIGKILL');
55
+ }
56
+ catch {
57
+ /* ignore */
58
+ }
59
+ resolve();
60
+ }, 2000);
61
+ proc.once('exit', () => {
62
+ clearTimeout(timer);
63
+ resolve();
64
+ });
65
+ try {
66
+ proc.kill('SIGTERM');
67
+ }
68
+ catch {
69
+ clearTimeout(timer);
70
+ resolve();
71
+ }
72
+ });
73
+ },
74
+ displaceOwner: async (sessionId, _newOwner) => {
75
+ const slot = sessionManager.getSlot(sessionId);
76
+ const existingClientId = slot?.connection?.connectedClientId;
77
+ if (!existingClientId)
78
+ return;
79
+ const existing = clientRegistry.get(existingClientId);
80
+ existing?.sendDisplacedEvent(sessionId);
81
+ },
82
+ isOwnedByVoiceCall: (sessionId) => orchestrator.isCallActive(sessionId),
83
+ });
84
+ return {
85
+ orchestrator,
86
+ shutdown: async () => {
87
+ await orchestrator.stop();
88
+ },
89
+ };
90
+ }
@@ -0,0 +1,91 @@
1
+ // Voice orchestrator — owns the speechmux sidecar lifecycle and the per-call
2
+ // bind dispatch. See docs/plans/voice-mode.md → "Voice orchestrator".
3
+ //
4
+ // This file defines the orchestrator's interface surface and its implementation:
5
+ // start()/stop() drive the sidecar lifecycle; bindCall()/endCall() manage per-call state.
6
+ /** Typed error carrying the discriminable reason code used in PimoteResponse.error. */
7
+ export class CallBindError extends Error {
8
+ code;
9
+ constructor(code, message) {
10
+ super(message ?? code);
11
+ this.code = code;
12
+ this.name = 'CallBindError';
13
+ }
14
+ }
15
+ export class VoiceOrchestrator {
16
+ opts;
17
+ started = false;
18
+ activeCalls = new Set();
19
+ constructor(opts) {
20
+ this.opts = opts;
21
+ }
22
+ /** Spawns speechmux sidecar. Throws if it fails to start. */
23
+ async start() {
24
+ if (this.started)
25
+ return;
26
+ await this.opts.startSpeechmux();
27
+ this.started = true;
28
+ }
29
+ /** Kills speechmux. Idempotent. */
30
+ async stop() {
31
+ if (!this.started)
32
+ return;
33
+ await this.opts.stopSpeechmux();
34
+ this.started = false;
35
+ this.activeCalls.clear();
36
+ }
37
+ /** Called by ws-handler for CallBindCommand. */
38
+ async bindCall(args) {
39
+ const slot = this.opts.busResolver.getSlot(args.sessionId);
40
+ if (!slot) {
41
+ throw new CallBindError('call_bind_failed_session_not_found', `No session ${args.sessionId}`);
42
+ }
43
+ const alreadyOwned = this.opts.isOwnedByVoiceCall(args.sessionId);
44
+ if (alreadyOwned && !args.force) {
45
+ throw new CallBindError('call_bind_failed_owned', 'Session already bound to a voice call');
46
+ }
47
+ if (alreadyOwned && args.force) {
48
+ await this.opts.displaceOwner(args.sessionId, args.clientConnection);
49
+ }
50
+ // Voice-disabled guard: if speechmux wiring isn't configured, fail the
51
+ // bind here rather than handing the client empty URLs. Speechmux is
52
+ // what mints the per-call TURN creds now (in the /signal `session`
53
+ // response) and what authenticates peers (via Cloudflare Access at the
54
+ // edge), so pimote no longer needs to mint anything.
55
+ const signalUrl = this.opts.config.voice?.speechmuxSignalUrl;
56
+ const llmWsUrl = this.opts.config.voice?.speechmuxLlmWsUrl;
57
+ if (!signalUrl || !llmWsUrl) {
58
+ throw new CallBindError('call_bind_failed_internal', 'voice_disabled: speechmux signal URL / llm WS URL not configured');
59
+ }
60
+ const bus = this.opts.busResolver.getEventBus(args.sessionId);
61
+ if (!bus) {
62
+ throw new CallBindError('call_bind_failed_internal', 'Session has no EventBus');
63
+ }
64
+ const activate = {
65
+ type: 'pimote:voice:activate',
66
+ sessionId: args.sessionId,
67
+ speechmuxWsUrl: llmWsUrl,
68
+ };
69
+ bus.emit(activate.type, activate);
70
+ this.activeCalls.add(args.sessionId);
71
+ return {
72
+ sessionId: args.sessionId,
73
+ webrtcSignalUrl: signalUrl,
74
+ };
75
+ }
76
+ /** Called by ws-handler for CallEndCommand, or internally on displacement/error. Idempotent. */
77
+ async endCall(args) {
78
+ if (!this.activeCalls.has(args.sessionId))
79
+ return;
80
+ this.activeCalls.delete(args.sessionId);
81
+ const bus = this.opts.busResolver.getEventBus(args.sessionId);
82
+ if (bus) {
83
+ const deactivate = { type: 'pimote:voice:deactivate', sessionId: args.sessionId };
84
+ bus.emit(deactivate.type, deactivate);
85
+ }
86
+ }
87
+ /** True if the given session currently has an active voice call bound. */
88
+ isCallActive(sessionId) {
89
+ return this.activeCalls.has(sessionId);
90
+ }
91
+ }
@@ -8,6 +8,7 @@ import { createExtensionUIBridge } from './extension-ui-bridge.js';
8
8
  import { findExternalPiProcesses, killExternalPiProcesses } from './takeover.js';
9
9
  import { mapAgentMessages, extractMessageEntryIds, applyEntryIds } from './message-mapper.js';
10
10
  import { getGitBranch } from './git-branch.js';
11
+ import { CallBindError } from './voice-orchestrator.js';
11
12
  /** Parse data-URL encoded images into the shape the pi SDK expects. */
12
13
  function parseDataUrlImages(images) {
13
14
  if (!images || images.length === 0)
@@ -133,16 +134,18 @@ export class WsHandler {
133
134
  pushNotificationService;
134
135
  sessionMetadataStore;
135
136
  clientRegistry;
137
+ voiceOrchestrator;
136
138
  subscribedSessions = new Set();
137
139
  viewedSessionId = null;
138
140
  clientId;
139
- constructor(sessionManager, folderIndex, ws, pushNotificationService, sessionMetadataStore, clientId, clientRegistry) {
141
+ constructor(sessionManager, folderIndex, ws, pushNotificationService, sessionMetadataStore, clientId, clientRegistry, voiceOrchestrator) {
140
142
  this.sessionManager = sessionManager;
141
143
  this.folderIndex = folderIndex;
142
144
  this.ws = ws;
143
145
  this.pushNotificationService = pushNotificationService;
144
146
  this.sessionMetadataStore = sessionMetadataStore;
145
147
  this.clientRegistry = clientRegistry;
148
+ this.voiceOrchestrator = voiceOrchestrator;
146
149
  this.clientId = clientId;
147
150
  }
148
151
  getViewedSessionId() {
@@ -480,6 +483,65 @@ export class WsHandler {
480
483
  this.sendResponse(id, true, { sessionId: takeoverSessionId, killedProcesses: killedCount });
481
484
  break;
482
485
  }
486
+ // ---- Voice call control ----
487
+ case 'call_bind': {
488
+ if (!this.voiceOrchestrator) {
489
+ this.sendResponse(id, false, undefined, 'call_bind_failed_internal');
490
+ break;
491
+ }
492
+ const slot = this.sessionManager.getSlot(command.sessionId);
493
+ if (!slot) {
494
+ this.sendResponse(id, false, undefined, 'call_bind_failed_session_not_found');
495
+ break;
496
+ }
497
+ const connection = {
498
+ ws: this.ws,
499
+ connectedClientId: this.clientId,
500
+ onSessionReset: (s) => this.handleSessionReset(s),
501
+ };
502
+ try {
503
+ const data = await this.voiceOrchestrator.bindCall({
504
+ sessionId: command.sessionId,
505
+ clientConnection: connection,
506
+ force: command.force ?? false,
507
+ });
508
+ this.sendResponse(id, true, data);
509
+ this.sendEvent({ type: 'call_status', sessionId: command.sessionId, status: 'binding' });
510
+ }
511
+ catch (err) {
512
+ if (err instanceof CallBindError) {
513
+ this.sendResponse(id, false, undefined, err.code);
514
+ }
515
+ else {
516
+ console.warn('[voice] call_bind failed', err);
517
+ this.sendResponse(id, false, undefined, 'call_bind_failed_internal');
518
+ }
519
+ }
520
+ break;
521
+ }
522
+ case 'call_end': {
523
+ await this.voiceOrchestrator?.endCall({ sessionId: command.sessionId, reason: 'user_hangup' });
524
+ this.sendResponse(id, true);
525
+ this.sendEvent({ type: 'call_ended', sessionId: command.sessionId, reason: 'user_hangup' });
526
+ break;
527
+ }
528
+ // ---- Client diagnostic logs (voice/call tracing) ----
529
+ case 'client_log': {
530
+ // Forward to the server's logger so client-side traces interleave
531
+ // with the server-side voice extension logs in the same journal.
532
+ const clientWall = new Date(command.clientTimestampMs).toISOString();
533
+ const serverWall = new Date().toISOString();
534
+ const driftMs = Date.now() - command.clientTimestampMs;
535
+ const line = `[voice_trace][client/${command.tag}] ${command.message} ${JSON.stringify({ clientWall, serverWall, driftMs, ...(command.data ?? {}) })}`;
536
+ if (command.level === 'error')
537
+ console.error(line);
538
+ else if (command.level === 'warn')
539
+ console.warn(line);
540
+ else
541
+ console.log(line);
542
+ this.sendResponse(id, true);
543
+ break;
544
+ }
483
545
  // ---- Extension UI ----
484
546
  case 'extension_ui_response': {
485
547
  const uiSlot = command.sessionId ? this.sessionManager.getSession(command.sessionId) : undefined;
@@ -896,13 +958,26 @@ export class WsHandler {
896
958
  }
897
959
  }
898
960
  /** Notify the old owner that they've been displaced from a session.
899
- * No-op if the session is unowned or owned by this client. */
961
+ * No-op if the session is unowned or owned by this client.
962
+ *
963
+ * Voice-call tear-down on displacement lives in `sendDisplacedEvent` (the
964
+ * old-owner-side site that also emits `call_ended { displaced }`), so this
965
+ * method does not call `voiceOrchestrator.endCall` itself — see review
966
+ * finding 4.
967
+ */
900
968
  displaceOwner(sessionId, slot) {
901
969
  if (slot.connection?.connectedClientId && slot.connection.connectedClientId !== this.clientId) {
902
970
  const oldHandler = this.clientRegistry.get(slot.connection.connectedClientId);
903
971
  if (oldHandler) {
904
972
  oldHandler.sendDisplacedEvent(sessionId);
905
973
  }
974
+ else if (this.voiceOrchestrator?.isCallActive(sessionId)) {
975
+ // Stale owner id with no live handler — clean up orchestrator state
976
+ // so the new owner doesn't inherit a phantom active call.
977
+ this.voiceOrchestrator.endCall({ sessionId, reason: 'displaced' }).catch((err) => {
978
+ console.warn('[voice] endCall on displace (stale handler) failed', err);
979
+ });
980
+ }
906
981
  }
907
982
  }
908
983
  /** Bind a slot to this client — sets ownership, WebSocket routing,
@@ -914,13 +989,16 @@ export class WsHandler {
914
989
  onSessionReset: (s) => this.handleSessionReset(s),
915
990
  };
916
991
  slot.connection = connection;
917
- slot.sessionState.lastActivity = Date.now();
992
+ // Note: do NOT touch `idleSince` here. Idleness is an agent-level concept driven by
993
+ // agent_start/agent_end — a client claiming a session does not extend its idle clock.
918
994
  this.subscribedSessions.add(sessionId);
919
995
  // Bind extensions when needed. The bridge holds a direct reference to this
920
996
  // ManagedSlot — on reconnect we skip rebinding, but on session reset
921
997
  // we must rebind so the bridge points at the new session state.
922
998
  if (!slot.sessionState.extensionsBound) {
923
- const uiContext = createExtensionUIBridge(slot, this.pushNotificationService);
999
+ const uiContext = createExtensionUIBridge(slot, this.pushNotificationService, {
1000
+ isVoiceModeActive: () => this.voiceOrchestrator?.isCallActive(sessionId) ?? false,
1001
+ });
924
1002
  const commandContextActions = createCommandContextActions(slot);
925
1003
  await slot.session.bindExtensions({ uiContext, commandContextActions });
926
1004
  slot.sessionState.extensionsBound = true;
@@ -939,9 +1017,12 @@ export class WsHandler {
939
1017
  return;
940
1018
  }
941
1019
  // Session ID changed — rebuild session state in-place on the same slot.
942
- const folderPath = slot.folderPath;
1020
+ // rebuildSessionState refreshes slot.folderPath from the new session's header cwd,
1021
+ // so capture folderPath AFTER the rebuild to pick up the new value (fork-from can
1022
+ // change cwd, e.g. the worktree extension).
943
1023
  // Rebuild session state (tears down old, creates new from runtime.session)
944
1024
  this.sessionManager.rebuildSessionState(slot);
1025
+ const folderPath = slot.folderPath;
945
1026
  // Re-key the session map
946
1027
  this.sessionManager.reKeySession(slot, oldSessionId, newSessionId);
947
1028
  // Update handler bookkeeping
@@ -951,7 +1032,9 @@ export class WsHandler {
951
1032
  this.viewedSessionId = newSessionId;
952
1033
  }
953
1034
  // Rebind extension UI bridge (new session state for dialog routing)
954
- const uiContext = createExtensionUIBridge(slot, this.pushNotificationService);
1035
+ const uiContext = createExtensionUIBridge(slot, this.pushNotificationService, {
1036
+ isVoiceModeActive: () => this.voiceOrchestrator?.isCallActive(newSessionId) ?? false,
1037
+ });
955
1038
  const commandContextActions = createCommandContextActions(slot);
956
1039
  await slot.session.bindExtensions({ uiContext, commandContextActions });
957
1040
  slot.sessionState.extensionsBound = true;
@@ -1064,6 +1147,27 @@ export class WsHandler {
1064
1147
  sessionId,
1065
1148
  reason: 'displaced',
1066
1149
  });
1150
+ // If the old owner had an active voice call on this session, tear down
1151
+ // orchestrator bookkeeping and surface `call_ended { reason: 'displaced' }`
1152
+ // so their VoiceCallStore tears down alongside the session_closed.
1153
+ if (this.voiceOrchestrator?.isCallActive(sessionId)) {
1154
+ void this.voiceOrchestrator.endCall({ sessionId, reason: 'displaced' });
1155
+ this.sendEvent({
1156
+ type: 'call_ended',
1157
+ sessionId,
1158
+ reason: 'displaced',
1159
+ });
1160
+ }
1161
+ }
1162
+ /** Broadcast a `call_ended` to this client (used by the session manager's
1163
+ * before-close hook so the orchestrator bookkeeping owner learns that a
1164
+ * server-initiated teardown happened). */
1165
+ sendCallEndedEvent(sessionId, reason) {
1166
+ this.sendEvent({
1167
+ type: 'call_ended',
1168
+ sessionId,
1169
+ reason,
1170
+ });
1067
1171
  }
1068
1172
  /** Send a session_closed event with reason 'killed' to this client's WebSocket.
1069
1173
  * Also removes the session from this handler's subscribedSessions so that
@@ -1218,9 +1322,10 @@ export class WsHandler {
1218
1322
  const slot = this.sessionManager.getSession(sid);
1219
1323
  if (slot) {
1220
1324
  slot.connection = null;
1221
- slot.sessionState.lastActivity = Date.now();
1222
1325
  // Note: pending UI responses are NOT resolved here — they survive
1223
1326
  // for replay on reconnect. They are resolved on session close or abort.
1327
+ // Note: do NOT touch `idleSince`. Disconnecting does not reset idleness — if the
1328
+ // agent finished 10 minutes ago, a peeking client should not extend the session's life.
1224
1329
  }
1225
1330
  }
1226
1331
  this.subscribedSessions.clear();
@@ -0,0 +1 @@
1
+ export * from './protocol.js';
@@ -0,0 +1,2 @@
1
+ // @pimote/shared barrel export
2
+ export * from './protocol.js';