@pimote/pimote 0.1.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. package/README.md +46 -17
  2. package/client/build/_app/immutable/assets/0.C7loWTOC.css +2 -0
  3. package/client/build/_app/immutable/assets/2.D9fiCd8W.css +1 -0
  4. package/client/build/_app/immutable/chunks/{BTSGQ0LP.js → B8lQCytv.js} +1 -1
  5. package/client/build/_app/immutable/chunks/BNqgidwO.js +5 -0
  6. package/client/build/_app/immutable/chunks/D26i4pYm.js +1 -0
  7. package/client/build/_app/immutable/chunks/D_Fpgknp.js +1 -0
  8. package/client/build/_app/immutable/chunks/DoVhjU85.js +1 -0
  9. package/client/build/_app/immutable/chunks/DzqbY2XU.js +1 -0
  10. package/client/build/_app/immutable/chunks/{L5t1qIFa.js → uZO1iyJZ.js} +2 -2
  11. package/client/build/_app/immutable/entry/app.DO-zgzyy.js +2 -0
  12. package/client/build/_app/immutable/entry/start.BZlrOH0-.js +1 -0
  13. package/client/build/_app/immutable/nodes/0.BEh4bPGQ.js +10 -0
  14. package/client/build/_app/immutable/nodes/1.B2l9JGRO.js +1 -0
  15. package/client/build/_app/immutable/nodes/2.ph9M0S1U.js +54 -0
  16. package/client/build/_app/version.json +1 -1
  17. package/client/build/index.html +8 -8
  18. package/package.json +9 -5
  19. package/patches/{@mariozechner+pi-coding-agent+0.65.0.patch → @mariozechner+pi-coding-agent+0.67.6.patch} +4 -4
  20. package/server/dist/auto-drain-on-abort.js +49 -0
  21. package/server/dist/config.js +21 -0
  22. package/server/dist/extension-ui-bridge.js +14 -1
  23. package/server/dist/folder-index.js +8 -4
  24. package/server/dist/git-branch.js +32 -0
  25. package/server/dist/index.js +31 -1
  26. package/server/dist/message-mapper.js +99 -4
  27. package/server/dist/server.js +5 -2
  28. package/server/dist/session-manager.js +99 -6
  29. package/server/dist/voice/fsm/actions.js +6 -0
  30. package/server/dist/voice/fsm/events.js +7 -0
  31. package/server/dist/voice/fsm/reducer.js +74 -0
  32. package/server/dist/voice/fsm/reducers/lifecycle.js +146 -0
  33. package/server/dist/voice/fsm/reducers/streaming.js +220 -0
  34. package/server/dist/voice/fsm/reducers/walkback.js +73 -0
  35. package/server/dist/voice/fsm/state.js +21 -0
  36. package/server/dist/voice/fsm/text-extractor.js +128 -0
  37. package/server/dist/voice/index.js +319 -0
  38. package/server/dist/voice/interpreter-prompt.js +115 -0
  39. package/server/dist/voice/speechmux-client.js +153 -0
  40. package/server/dist/voice/state-machine.js +7 -0
  41. package/server/dist/voice/wait-for-idle.js +67 -0
  42. package/server/dist/voice/walk-back.js +198 -0
  43. package/server/dist/voice-orchestrator-boot.js +90 -0
  44. package/server/dist/voice-orchestrator.js +91 -0
  45. package/server/dist/ws-handler.js +340 -36
  46. package/shared/dist/index.d.ts +1 -0
  47. package/shared/dist/index.js +2 -0
  48. package/shared/dist/protocol.d.ts +614 -0
  49. package/shared/dist/protocol.js +30 -0
  50. package/client/build/_app/immutable/assets/0.Cj7UL9cq.css +0 -2
  51. package/client/build/_app/immutable/assets/2.CIRqqeIr.css +0 -1
  52. package/client/build/_app/immutable/chunks/BEKHoMUP.js +0 -1
  53. package/client/build/_app/immutable/chunks/CfQ6Egqh.js +0 -1
  54. package/client/build/_app/immutable/chunks/DQ-KfPq0.js +0 -1
  55. package/client/build/_app/immutable/chunks/DfA0ecbz.js +0 -1
  56. package/client/build/_app/immutable/chunks/Dnh9Emns.js +0 -5
  57. package/client/build/_app/immutable/entry/app.j0V4R67V.js +0 -2
  58. package/client/build/_app/immutable/entry/start.wkfo4Ebw.js +0 -1
  59. package/client/build/_app/immutable/nodes/0.CUipL_P7.js +0 -5
  60. package/client/build/_app/immutable/nodes/1.ex7ejMby.js +0 -1
  61. package/client/build/_app/immutable/nodes/2.165oQG9Z.js +0 -49
@@ -0,0 +1,198 @@
1
+ // Walkback rewrite: surgical truncation of conversation history when
2
+ // speechmux reports the user heard only a prefix of an assistant
3
+ // utterance.
4
+ //
5
+ // **Identity-based design.** Walkback targets a specific `speak()` tool
6
+ // call by its `toolCallId`. That id is round-tripped through speechmux
7
+ // (every outgoing `token`/`end` frame carries it; speechmux echoes it
8
+ // back on `rollback`/`abort`) so we know exactly which utterance the
9
+ // `heardText` belongs to. The previous design used a captured snapshot
10
+ // of the in-flight assistant message and a string-prefix-matching
11
+ // algorithm — both of which broke whenever a turn contained more than
12
+ // one speak() or whenever the snapshot drifted out of sync with the
13
+ // real conversation.
14
+ //
15
+ // **Contract:** see `docs/plans/voice-mode.md` for the high-level
16
+ // behavioural spec. Briefly:
17
+ //
18
+ // 1. The trailing pi-synthetic empty-text aborted assistant (if any)
19
+ // is always stripped, even when no rollback is pending. This is
20
+ // pi's marker for "agent run was aborted"; we don't want it in
21
+ // the LLM context.
22
+ //
23
+ // 2. With a rollback pending, locate the speak block by
24
+ // `targetSpeakToolCallId`. If found:
25
+ // - If `heardText` is empty: drop the speak block entirely (and
26
+ // its paired tool_result if present).
27
+ // - If `heardText.length >= block.text.length`: keep block as-is
28
+ // (whole utterance was heard).
29
+ // - Otherwise: replace the block's text with `heardText` and
30
+ // drop the paired tool_result.
31
+ // Then drop blocks AFTER the target in the same message, and drop
32
+ // any subsequent assistant/tool_result messages — none of those
33
+ // could have been heard if the user interrupted at the target.
34
+ //
35
+ // 3. If the target is not found in messages (e.g. compacted away),
36
+ // walkback is a no-op beyond step 1.
37
+ //
38
+ // **Content-block shape compatibility.** The function handles both
39
+ // pi-agent-core's internal AgentMessage shape (`type:'toolCall'` +
40
+ // `arguments`) and the Anthropic API shape (`type:'tool_use'` +
41
+ // `input`). Earlier versions only matched the latter, which silently
42
+ // failed on every real captured message.
43
/**
 * Apply walkback against `input.messages`. Pure function.
 *
 * Trailing aborted-empty assistant messages are always removed. When
 * `input.rollback` is not `null`, the history is additionally rewritten
 * around the speak() call named by `rollback.targetSpeakToolCallId`, using
 * `rollback.heardText` as the portion the user actually heard.
 *
 * Returns a new array; never mutates the input.
 */
export function walkBack(input) {
    const withoutAbortMarker = stripTrailingAbortedEmpty(input.messages);
    const pending = input.rollback;
    if (pending !== null) {
        return rewriteByToolCallId(withoutAbortMarker, pending.heardText, pending.targetSpeakToolCallId);
    }
    return withoutAbortMarker;
}
54
+ // ---------------------------------------------------------------------------
55
/**
 * True for the synthetic assistant pi appends to state on abort: an
 * assistant-role message whose stop reason is 'aborted' and whose content
 * holds no blocks or only whitespace-only text blocks.
 */
export function isAbortedEmptyAssistant(msg) {
    return (isAssistantMessage(msg) &&
        stopReason(msg) === 'aborted' &&
        isEmptyText(contentOf(msg)));
}
63
// Copy `messages` without its run of trailing aborted-empty assistant
// messages. A fresh array is returned even when nothing is removed.
function stripTrailingAbortedEmpty(messages) {
    let keep = messages.length;
    for (; keep > 0; keep -= 1) {
        if (!isAbortedEmptyAssistant(messages[keep - 1]))
            break;
    }
    return messages.slice(0, keep);
}
69
/**
 * Truncate `messages` at the speak() tool call whose id is `targetId`.
 *
 * - Target not found (compacted away, or never landed in messages):
 *   `messages` is returned unchanged.
 * - Otherwise the message holding the target is rebuilt with only the
 *   blocks that precede the target, plus:
 *     - nothing, when `heardText` is empty (the speak was not heard);
 *     - the original block, when `heardText` is at least as long as the
 *       block's text (the whole utterance was heard);
 *     - a copy of the block whose text is `heardText`, otherwise.
 *   The rebuilt message gets `stopReason: 'aborted'`, and every message
 *   after it is dropped — none of those could have been heard if the user
 *   interrupted at the target. That also removes any paired tool_result
 *   messages, so no per-id bookkeeping is needed.
 *
 * The input array and its messages are never mutated.
 */
function rewriteByToolCallId(messages, heardText, targetId) {
    // Search from the back — toolCallIds are unique per session, so the
    // first match is the right one, and the target is usually in the
    // recent tail.
    let targetMsgIdx = -1;
    let targetBlockIdx = -1;
    for (let i = messages.length - 1; i >= 0 && targetMsgIdx === -1; i--) {
        const msg = messages[i];
        if (!isAssistantMessage(msg))
            continue;
        const blockIdx = contentOf(msg).findIndex((block) => isSpeakToolCall(block) && getToolCallId(block) === targetId);
        if (blockIdx !== -1) {
            targetMsgIdx = i;
            targetBlockIdx = blockIdx;
        }
    }
    if (targetMsgIdx === -1) {
        // Target gone. The caller has already stripped the abort marker,
        // which is the best we can do.
        return messages;
    }
    const targetMsg = messages[targetMsgIdx];
    const targetContent = contentOf(targetMsg);
    const targetBlock = targetContent[targetBlockIdx];
    // Blocks before the target were emitted before the heard prefix: keep.
    // Blocks after it are left out simply by not copying them.
    const keptBlocks = targetContent.slice(0, targetBlockIdx);
    if (heardText.length > 0) {
        const heardEverything = heardText.length >= getSpeakText(targetBlock).length;
        keptBlocks.push(heardEverything ? targetBlock : replaceSpeakText(targetBlock, heardText));
    }
    const rewrittenTarget = {
        ...targetMsg,
        content: keptBlocks,
        stopReason: 'aborted',
    };
    return [...messages.slice(0, targetMsgIdx), rewrittenTarget];
}
135
+ // ---------------------------------------------------------------------------
136
+ // Shape-tolerant accessors. pi-agent-core's runtime AgentMessage uses
137
+ // `toolCall`/`arguments`; the Anthropic API shape uses `tool_use`/`input`.
138
+ // Tests / tooling may pass either; we accept both.
139
// True when the message's `role` is exactly 'assistant'.
function isAssistantMessage(msg) {
    const { role } = msg;
    return role === 'assistant';
}
142
// Read the message's `stopReason` field; `undefined` when it is absent.
function stopReason(msg) {
    const { stopReason: reason } = msg;
    return reason;
}
145
// Return the message's content blocks. Any non-array `content` (string,
// missing, etc.) is treated as having no blocks.
function contentOf(msg) {
    const { content } = msg;
    if (Array.isArray(content)) {
        return content;
    }
    return [];
}
149
// True when `blocks` carries no visible text: the list is empty, or every
// block is a `text` block whose text is a whitespace-only string. Any
// non-text block, or a text block without a string `text`, makes it false.
function isEmptyText(blocks) {
    const hasSubstance = blocks.some((block) => block.type !== 'text' ||
        typeof block.text !== 'string' ||
        block.text.trim() !== '');
    return !hasSubstance;
}
159
// True when `block` is a tool-call block (either the `toolCall` or the
// `tool_use` type tag) whose tool name is 'speak'.
export function isSpeakToolCall(block) {
    const isToolCallShape = block.type === 'toolCall' || block.type === 'tool_use';
    return isToolCallShape && block.name === 'speak';
}
164
// Return the block's `id` when it is a string; otherwise `undefined`.
function getToolCallId(block) {
    const { id } = block;
    if (typeof id !== 'string') {
        return undefined;
    }
    return id;
}
168
// Extract the spoken text from a speak block. `arguments.text` is tried
// first, then `input.text`; the first one that is a string wins. Returns
// '' when neither shape holds a string.
function getSpeakText(block) {
    for (const holder of [block.arguments, block.input]) {
        if (!holder || typeof holder !== 'object')
            continue;
        const candidate = holder.text;
        if (typeof candidate === 'string')
            return candidate;
    }
    return '';
}
184
/**
 * Return a copy of `block` whose speak text is `text`. `block` itself is
 * never mutated.
 *
 * The original args/input shape is preserved rather than normalised to a
 * single shape — that would diverge from whatever pi-agent-core/the
 * provider expects.
 *
 * Every container (`arguments` and/or `input`) that currently holds a
 * string `text` is rewritten, so a block carrying both shapes cannot keep
 * the unheard text in the other container. The fallback below produces
 * such dual-shape blocks itself. When neither container holds a string
 * `text`, the first object-valued container (`arguments`, then `input`)
 * receives it; when neither container exists, both are set defensively.
 */
function replaceSpeakText(block, text) {
    const args = block.arguments;
    const input = block.input;
    const argsIsObject = Boolean(args) && typeof args === 'object';
    const inputIsObject = Boolean(input) && typeof input === 'object';
    const argsHoldsText = argsIsObject && typeof args.text === 'string';
    const inputHoldsText = inputIsObject && typeof input.text === 'string';
    if (argsHoldsText || inputHoldsText) {
        const rewritten = { ...block };
        if (argsHoldsText)
            rewritten.arguments = { ...args, text };
        if (inputHoldsText)
            rewritten.input = { ...input, text };
        return rewritten;
    }
    if (argsIsObject) {
        return { ...block, arguments: { ...args, text } };
    }
    if (inputIsObject) {
        return { ...block, input: { ...input, text } };
    }
    // Neither shape present — set both defensively.
    return { ...block, arguments: { text }, input: { text } };
}
@@ -0,0 +1,90 @@
1
+ // Wire the VoiceOrchestrator together with its runtime dependencies at
2
+ // server boot time. Kept separate from `index.ts` so the wiring is
3
+ // testable (no network / child_process side-effects at import time) and
4
+ // isolated from the plain HTTP/WS boot sequence.
5
+ import { spawn } from 'node:child_process';
6
+ import { VoiceOrchestrator } from './voice-orchestrator.js';
7
/**
 * Construct a VoiceOrchestrator backed by real seams:
 *   - speechmux sidecar via `child_process.spawn`
 *   - displacement = looks up current owner via clientRegistry and calls its
 *     `sendDisplacedEvent(sessionId)`
 *
 * Auth on `/signal` is handled by Cloudflare Access at the edge, and
 * per-session TURN credentials are minted by speechmux and returned to the
 * PWA in its `/signal` `session` response. Pimote's orchestrator only
 * hands out the signalling URL.
 *
 * @param args `{ config, sessionManager, clientRegistry }`.
 * @returns `{ orchestrator, shutdown }`, where `shutdown()` awaits
 *   `orchestrator.stop()`.
 */
export function buildVoiceOrchestrator(args) {
    const { config, sessionManager, clientRegistry } = args;
    // Handle of the sidecar we spawned, or null when none is running or
    // speechmux is managed externally.
    let speechmuxProc = null;
    const busResolver = {
        getSlot: (sessionId) => sessionManager.getSlot(sessionId),
        getEventBus: (sessionId) => sessionManager.getSlot(sessionId)?.eventBusRef.current ?? null,
    };
    const orchestrator = new VoiceOrchestrator({
        config,
        sessionManager,
        busResolver,
        // Spawn the sidecar. Resolves once the OS reports the process as
        // spawned; rejects with the spawn error (e.g. ENOENT) otherwise.
        startSpeechmux: async () => {
            const bin = config.voice?.speechmuxBinary;
            if (!bin) {
                console.log('[voice] speechmuxBinary not configured; assuming speechmux is externally managed (systemd, container, remote host, etc.)');
                return;
            }
            if (speechmuxProc)
                return;
            const proc = spawn(bin, [], { stdio: ['ignore', 'inherit', 'inherit'] });
            speechmuxProc = proc;
            // Only clear the shared handle if it still points at THIS
            // process; a late event from an old process must not discard
            // the handle of a newer one.
            const forget = () => {
                if (speechmuxProc === proc)
                    speechmuxProc = null;
            };
            proc.on('exit', (code, signal) => {
                console.warn(`[voice] speechmux exited (code=${code}, signal=${signal})`);
                forget();
            });
            // A ChildProcess emits 'error' when it cannot be spawned. With
            // no listener that event is thrown and takes the server down,
            // so always listen and surface spawn failures as a rejection.
            await new Promise((resolve, reject) => {
                let spawned = false;
                proc.on('error', (err) => {
                    if (spawned) {
                        // Post-spawn errors (e.g. a failed kill) are logged only.
                        console.warn(`[voice] speechmux process error: ${err.message}`);
                        return;
                    }
                    forget();
                    reject(err);
                });
                proc.once('spawn', () => {
                    spawned = true;
                    resolve();
                });
            });
            // NB: 'spawn' only means the process exists. We do not wait for
            // an application-level ready marker — speechmux emits readiness
            // to its own logs. Callers should ensure startup ordering or
            // implement a readiness probe.
        },
        // SIGTERM the sidecar, escalating to SIGKILL after 2 s. No-op when
        // we hold no process handle.
        stopSpeechmux: async () => {
            if (!speechmuxProc)
                return;
            const proc = speechmuxProc;
            speechmuxProc = null;
            await new Promise((resolve) => {
                const timer = setTimeout(() => {
                    try {
                        proc.kill('SIGKILL');
                    }
                    catch {
                        /* ignore */
                    }
                    resolve();
                }, 2000);
                proc.once('exit', () => {
                    clearTimeout(timer);
                    resolve();
                });
                try {
                    proc.kill('SIGTERM');
                }
                catch {
                    clearTimeout(timer);
                    resolve();
                }
            });
        },
        // Notify the client currently connected to the session (if any)
        // that it has been displaced.
        displaceOwner: async (sessionId, _newOwner) => {
            const slot = sessionManager.getSlot(sessionId);
            const existingClientId = slot?.connection?.connectedClientId;
            if (!existingClientId)
                return;
            const existing = clientRegistry.get(existingClientId);
            existing?.sendDisplacedEvent(sessionId);
        },
        isOwnedByVoiceCall: (sessionId) => orchestrator.isCallActive(sessionId),
    });
    return {
        orchestrator,
        shutdown: async () => {
            await orchestrator.stop();
        },
    };
}
@@ -0,0 +1,91 @@
1
+ // Voice orchestrator — owns the speechmux sidecar lifecycle and the per-call
2
+ // bind dispatch. See docs/plans/voice-mode.md → "Voice orchestrator".
3
+ //
4
+ // This file defines the orchestrator's interface surface together with its
5
+ // implementation of start()/stop()/bindCall()/endCall().
6
/**
 * Typed error carrying the discriminable reason code used in
 * PimoteResponse.error. When no message is supplied (`null`/`undefined`)
 * the code doubles as the message.
 */
export class CallBindError extends Error {
    code;
    constructor(code, message) {
        const text = message ?? code;
        super(text);
        Object.assign(this, { code, name: 'CallBindError' });
    }
}
15
/**
 * Owns the speechmux lifecycle (through the injected `startSpeechmux` /
 * `stopSpeechmux` callbacks) and tracks which sessions currently have a
 * voice call bound.
 *
 * `opts` supplies: `config`, `busResolver` (`getSlot`, `getEventBus`),
 * `startSpeechmux`, `stopSpeechmux`, `displaceOwner` and
 * `isOwnedByVoiceCall`.
 */
export class VoiceOrchestrator {
    opts;
    started = false;
    // Session ids that currently have a voice call bound.
    activeCalls = new Set();
    constructor(opts) {
        this.opts = opts;
    }
    /**
     * Start speechmux via `opts.startSpeechmux()`. No-op when already
     * started. If the callback rejects, the rejection propagates and the
     * orchestrator stays un-started.
     */
    async start() {
        if (this.started)
            return;
        await this.opts.startSpeechmux();
        this.started = true;
    }
    /**
     * Stop speechmux via `opts.stopSpeechmux()` and forget all active
     * calls. No-op when not started.
     */
    async stop() {
        if (!this.started)
            return;
        await this.opts.stopSpeechmux();
        this.started = false;
        this.activeCalls.clear();
    }
    /**
     * Called by ws-handler for CallBindCommand.
     *
     * @param args `{ sessionId, force, clientConnection }`.
     * @returns `{ sessionId, webrtcSignalUrl }`.
     * @throws CallBindError with code
     *   - `call_bind_failed_session_not_found` when the session has no slot;
     *   - `call_bind_failed_owned` when a call is already bound and
     *     `args.force` is falsy;
     *   - `call_bind_failed_internal` when the speechmux URLs are not
     *     configured or the session has no EventBus.
     */
    async bindCall(args) {
        const { sessionId } = args;
        const slot = this.opts.busResolver.getSlot(sessionId);
        if (!slot) {
            throw new CallBindError('call_bind_failed_session_not_found', `No session ${sessionId}`);
        }
        const alreadyOwned = this.opts.isOwnedByVoiceCall(sessionId);
        if (alreadyOwned && !args.force) {
            throw new CallBindError('call_bind_failed_owned', 'Session already bound to a voice call');
        }
        // Voice-disabled guard: if speechmux wiring isn't configured, fail the
        // bind here rather than handing the client empty URLs. Speechmux is
        // what mints the per-call TURN creds now (in the /signal `session`
        // response) and what authenticates peers (via Cloudflare Access at the
        // edge), so pimote no longer needs to mint anything.
        const signalUrl = this.opts.config.voice?.speechmuxSignalUrl;
        const llmWsUrl = this.opts.config.voice?.speechmuxLlmWsUrl;
        if (!signalUrl || !llmWsUrl) {
            throw new CallBindError('call_bind_failed_internal', 'voice_disabled: speechmux signal URL / llm WS URL not configured');
        }
        const bus = this.opts.busResolver.getEventBus(sessionId);
        if (!bus) {
            throw new CallBindError('call_bind_failed_internal', 'Session has no EventBus');
        }
        // Displace the current owner only after every validation has
        // passed, so a forced bind that is going to fail anyway does not
        // kick the existing owner off the call.
        if (alreadyOwned) {
            await this.opts.displaceOwner(sessionId, args.clientConnection);
        }
        const activate = {
            type: 'pimote:voice:activate',
            sessionId,
            speechmuxWsUrl: llmWsUrl,
        };
        bus.emit(activate.type, activate);
        this.activeCalls.add(sessionId);
        return {
            sessionId,
            webrtcSignalUrl: signalUrl,
        };
    }
    /**
     * Called by ws-handler for CallEndCommand, or internally on
     * displacement/error. Idempotent: does nothing when the session has no
     * active call. Emits `pimote:voice:deactivate` on the session's
     * EventBus when one is available.
     */
    async endCall(args) {
        if (!this.activeCalls.has(args.sessionId))
            return;
        this.activeCalls.delete(args.sessionId);
        const bus = this.opts.busResolver.getEventBus(args.sessionId);
        if (bus) {
            const deactivate = { type: 'pimote:voice:deactivate', sessionId: args.sessionId };
            bus.emit(deactivate.type, deactivate);
        }
    }
    /** True if the given session currently has an active voice call bound. */
    isCallActive(sessionId) {
        return this.activeCalls.has(sessionId);
    }
}