@pimote/pimote 0.2.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. package/README.md +43 -16
  2. package/client/build/_app/immutable/assets/0.C7loWTOC.css +2 -0
  3. package/client/build/_app/immutable/assets/2.DwPXxSa-.css +1 -0
  4. package/client/build/_app/immutable/chunks/-Lc-U-GJ.js +1 -0
  5. package/client/build/_app/immutable/chunks/{CT6ckxpD.js → CO_BwWGt.js} +1 -1
  6. package/client/build/_app/immutable/chunks/CklMSqcv.js +1 -0
  7. package/client/build/_app/immutable/chunks/D1INvMB9.js +1 -0
  8. package/client/build/_app/immutable/chunks/D1vhgXpq.js +5 -0
  9. package/client/build/_app/immutable/entry/{app.CNzpBgAg.js → app.B-HFVtpC.js} +2 -2
  10. package/client/build/_app/immutable/entry/start.DJTQ8-sD.js +1 -0
  11. package/client/build/_app/immutable/nodes/0.CepAO4xf.js +10 -0
  12. package/client/build/_app/immutable/nodes/{1.B8zmHMre.js → 1.CmxFYjRm.js} +1 -1
  13. package/client/build/_app/immutable/nodes/2.DAtqfmki.js +54 -0
  14. package/client/build/_app/version.json +1 -1
  15. package/client/build/index.html +7 -7
  16. package/package.json +7 -3
  17. package/server/dist/auto-drain-on-abort.js +49 -0
  18. package/server/dist/config.js +21 -0
  19. package/server/dist/extension-ui-bridge.js +14 -1
  20. package/server/dist/index.js +36 -1
  21. package/server/dist/message-mapper.js +38 -6
  22. package/server/dist/push-notification.js +11 -0
  23. package/server/dist/server.js +2 -2
  24. package/server/dist/session-manager.js +72 -4
  25. package/server/dist/voice/fsm/actions.js +6 -0
  26. package/server/dist/voice/fsm/events.js +7 -0
  27. package/server/dist/voice/fsm/reducer.js +74 -0
  28. package/server/dist/voice/fsm/reducers/lifecycle.js +158 -0
  29. package/server/dist/voice/fsm/reducers/streaming.js +220 -0
  30. package/server/dist/voice/fsm/reducers/walkback.js +73 -0
  31. package/server/dist/voice/fsm/state.js +21 -0
  32. package/server/dist/voice/fsm/text-extractor.js +128 -0
  33. package/server/dist/voice/index.js +336 -0
  34. package/server/dist/voice/interpreter-prompt.js +115 -0
  35. package/server/dist/voice/speechmux-client.js +153 -0
  36. package/server/dist/voice/state-machine.js +14 -0
  37. package/server/dist/voice/wait-for-idle.js +67 -0
  38. package/server/dist/voice/walk-back.js +198 -0
  39. package/server/dist/voice-orchestrator-boot.js +90 -0
  40. package/server/dist/voice-orchestrator.js +91 -0
  41. package/server/dist/ws-handler.js +112 -7
  42. package/shared/dist/index.d.ts +1 -0
  43. package/shared/dist/index.js +2 -0
  44. package/shared/dist/protocol.d.ts +614 -0
  45. package/shared/dist/protocol.js +30 -0
  46. package/client/build/_app/immutable/assets/0.DBrr7n4n.css +0 -2
  47. package/client/build/_app/immutable/assets/2.DE6k3bQj.css +0 -1
  48. package/client/build/_app/immutable/chunks/5vSSf6qG.js +0 -5
  49. package/client/build/_app/immutable/chunks/DlJOVoUQ.js +0 -1
  50. package/client/build/_app/immutable/chunks/YxmLwfhj.js +0 -1
  51. package/client/build/_app/immutable/chunks/yWVx3W2o.js +0 -1
  52. package/client/build/_app/immutable/entry/start.DYkTAHh1.js +0 -1
  53. package/client/build/_app/immutable/nodes/0.DNlQhEb_.js +0 -10
  54. package/client/build/_app/immutable/nodes/2.W9yV4-x2.js +0 -54
@@ -0,0 +1,336 @@
1
+ // @pimote/voice — voice extension for pimote.
2
+ //
3
+ // This is the imperative shell around the pure FSM in `./fsm/`. It does
4
+ // three things:
5
+ //
6
+ // 1. Translates external stimuli (pi-coding-agent SDK hooks, EventBus
7
+ // messages, speechmux WS frames) into typed `Event` values.
8
+ // 2. Calls `reduce(state, event)` and writes back the new state.
9
+ // 3. Interprets the emitted `Action` values into actual side effects:
10
+ // pi.sendUserMessage, ctx.abort, WS open/close/send, EventBus emit.
11
+ //
12
+ // **Why the redesign.** The previous monolithic implementation conflated
13
+ // lifecycle, streaming, and walkback into a single record of orthogonal
14
+ // flags whose invariants drifted out of sync. The most visible symptom
15
+ // was per-content-block streaming state leaking across assistant
16
+ // messages because it was reset on the wrong event (a substring of
17
+ // `assistantMessageEvent` that never fires inside `message_update`). The
18
+ // FSM split + correct reset-on-message_start eliminates that bug class.
19
+ import { Type } from '@sinclair/typebox';
20
+ import { renderInterpreterPrompt } from './interpreter-prompt.js';
21
+ import { createDefaultSpeechmuxClientFactory } from './speechmux-client.js';
22
+ import { ensureIdleWithImplicitAbort } from './wait-for-idle.js';
23
+ import { initialState } from './fsm/state.js';
24
+ import { reduce } from './fsm/reducer.js';
25
+ // ---- Diagnostic helpers ---------------------------------------------------
26
/**
 * Build the compact, human-readable label used when tracing a dispatched
 * event.
 *
 * @param {object} event - FSM event; only `type` and the per-type payload
 *   fields read below are inspected.
 * @returns {string|null} A one-line description, or `null` when the bare
 *   event type is descriptive enough (the dispatcher then logs `event.type`).
 */
function traceEvent(event) {
  const { type } = event;

  if (type === 'sdk:toolcall_start') {
    const name = partialName(event.partial, event.contentIndex);
    return `sdk:toolcall_start(idx=${event.contentIndex}, partial.name=${name})`;
  }

  if (type === 'sdk:toolcall_delta') {
    // Only a short prefix of the delta is logged to keep trace lines small.
    const preview = JSON.stringify(event.delta.slice(0, 40));
    return `sdk:toolcall_delta(idx=${event.contentIndex}, deltaLen=${event.delta.length}, deltaPreview=${preview})`;
  }

  if (type === 'sdk:toolcall_end') {
    const call = event.toolCall;
    const finalText = call.arguments?.text;
    const finalTextLen = typeof finalText === 'string' ? finalText.length : 0;
    return `sdk:toolcall_end(idx=${event.contentIndex}, name=${call.name ?? null}, finalTextLen=${finalTextLen})`;
  }

  if (type === 'ws:incoming') {
    const { frame } = event;
    // Only `user` frames carry text worth sizing.
    const suffix = frame.type === 'user' ? `, textLen=${frame.text.length}` : '';
    return `ws:incoming(${frame.type}${suffix})`;
  }

  if (type === 'sdk:message_start') {
    return `sdk:message_start(role=${event.message.role})`;
  }

  if (type === 'sdk:context') {
    return `sdk:context(messages=${event.messages.length})`;
  }

  if (type === 'eb:activate') {
    return `eb:activate(${event.msg.sessionId})`;
  }

  return null;
}
48
/**
 * Read the `name` of the content block at `idx` in a partial assistant
 * message.
 *
 * @param {object} partial - Partial message; `partial.content` is inspected.
 * @param {number} idx - Index of the content block of interest.
 * @returns {string|null} The block's `name` when it is a string; `null` when
 *   `content` is not an array, the block is missing or not an object, or the
 *   name is not a string.
 */
function partialName(partial, idx) {
  const { content } = partial;
  const block = Array.isArray(content) ? content[idx] : undefined;
  // `typeof null === 'object'`, so null has to be excluded explicitly.
  const isObject = block !== null && typeof block === 'object';
  if (!isObject) {
    return null;
  }
  return typeof block.name === 'string' ? block.name : null;
}
58
/**
 * Flatten a keyed collection of streaming blocks into a plain object that
 * maps each key to its `blockKind` label, suitable for JSON trace output.
 *
 * @param {{ entries(): Iterable<[any, object]> }} blocks - Any collection
 *   exposing `entries()` (e.g. a `Map` or an array).
 * @returns {Object<string, string>} Key → block-kind label.
 */
function blockSummary(blocks) {
  return Array.from(blocks.entries()).reduce((summary, [key, block]) => {
    summary[key] = blockKind(block);
    return summary;
  }, {});
}
65
/**
 * Label a streaming block for trace output.
 *
 * @param {{ kind: string, emittedLength?: number }} b - Block state.
 * @returns {string} `b.kind`, with the emitted length appended for
 *   `speak_streaming` blocks.
 */
function blockKind(b) {
  const { kind } = b;
  return kind === 'speak_streaming' ? `speak_streaming(emitted=${b.emittedLength})` : kind;
}
70
+ // ---- Re-exports kept for back-compat with callers/tests -------------------
71
+ export { walkBack, isAbortedEmptyAssistant } from './walk-back.js';
72
+ export { VOICE_CALL_STARTED_SENTINEL } from './state-machine.js';
73
+ export { renderInterpreterPrompt, RAW_INTERPRETER_PROMPT } from './interpreter-prompt.js';
74
+ export { createDefaultSpeechmuxClientFactory } from './speechmux-client.js';
75
/**
 * Create the voice extension: the imperative shell that feeds external
 * stimuli into the FSM reducer and turns the emitted actions into side
 * effects.
 *
 * @param {object} opts
 * @param {{ provider: string, modelId: string }} opts.defaultWorkerModel -
 *   Substituted into the interpreter prompt once, at factory time.
 * @param {*} opts.defaultInterpreterModel - Passed to the reducer as
 *   `config.defaultInterpreterModel`.
 * @param {Function} [opts.speechmuxClientFactory] - Async factory
 *   `({ wsUrl }) => client`; defaults to the `ws`-backed implementation.
 * @returns {(pi: object) => void} Extension installer. Each invocation owns
 *   its own FSM state, speechmux client and hook registrations.
 */
export function createVoiceExtension(opts) {
  const interpreterPrompt = renderInterpreterPrompt({
    workerProvider: opts.defaultWorkerModel.provider,
    workerModel: opts.defaultWorkerModel.modelId,
  });
  const clientFactory = opts.speechmuxClientFactory ?? createDefaultSpeechmuxClientFactory();
  return (pi) => {
    // ---- Per-extension-instance state (per pimote session) ---------------
    let state = initialState();
    let lastCtx = null;
    let speechmuxClient = null;
    /**
     * Bumped by every `open_ws` / `close_ws`. An `open_ws` whose connect
     * resolves after the counter moved on has been superseded and must not
     * install (or leak) its socket.
     */
    let wsGeneration = 0;
    /** Slot read by the `context` hook to return rewritten messages. */
    let pendingContextRewrite = null;

    /** Best-effort close of the current speechmux client, if any. */
    const closeSpeechmuxClient = () => {
      try {
        speechmuxClient?.close();
      } catch {
        /* close is idempotent / best-effort */
      }
      speechmuxClient = null;
    };

    /** Emit the diagnostic trace line; tracing must never affect dispatch. */
    const traceDispatch = (event, lifecycleBefore, actions) => {
      try {
        const evtTrace = traceEvent(event);
        if (evtTrace || lifecycleBefore !== state.lifecycle.kind || actions.length > 0) {
          console.log(
            '[voice_trace] dispatch',
            JSON.stringify({
              event: evtTrace ?? event.type,
              lifecycle: `${lifecycleBefore}→${state.lifecycle.kind}`,
              actions: actions.map((a) => a.kind),
              blocks: blockSummary(state.message.blocks),
            }),
          );
        }
      } catch (err) {
        console.warn('[voice] dispatch trace failed', event?.type, err);
      }
    };

    // ---- Reducer driver --------------------------------------------------
    /**
     * Run one event through the reducer and execute the resulting actions in
     * order. Never rejects: most call sites are fire-and-forget
     * (`void dispatch(...)`), so a rejection would surface as an unhandled
     * promise rejection.
     */
    const dispatch = async (event) => {
      const lifecycleBefore = state.lifecycle.kind;
      let actions;
      try {
        const result = reduce(state, event, {
          config: { defaultInterpreterModel: opts.defaultInterpreterModel },
        });
        state = result.next;
        actions = result.actions;
      } catch (err) {
        // Keep the previous state; the event is dropped.
        console.warn('[voice] reducer failed', event?.type, err);
        return;
      }
      traceDispatch(event, lifecycleBefore, actions);
      // Context rewrites are stashed synchronously, before the first `await`
      // below: the `context` hook reads the slot right after calling
      // dispatch() without awaiting it, so anything applied after a
      // suspension point would be missed (and leak into the next call).
      for (const action of actions) {
        if (action.kind === 'rewrite_context') {
          pendingContextRewrite = action.messages;
        }
      }
      for (const action of actions) {
        if (action.kind === 'rewrite_context') {
          continue; // already applied above
        }
        try {
          await execute(action);
        } catch (err) {
          console.warn('[voice] action failed', action.kind, err);
        }
      }
    };

    /** Interpret a single FSM action as a side effect. */
    const execute = async (action) => {
      switch (action.kind) {
        case 'set_interpreter_model': {
          if (!lastCtx) {
            console.warn('[voice] set_interpreter_model: no ExtensionContext yet');
            return;
          }
          const model = lastCtx.modelRegistry.find(action.provider, action.modelId);
          if (!model) {
            console.warn(`[voice] set_interpreter_model: no model ${action.provider}/${action.modelId}`);
            return;
          }
          await pi.setModel(model);
          return;
        }
        case 'send_user_message': {
          // Ensure the agent is idle before sending. If it isn't, fire
          // a synthesized barge-in (ctx.abort()) and wait for teardown
          // — covers the case where the user spoke while the worker
          // was silently reasoning, so speechmux didn't issue an abort
          // (no TTS in flight to abort). See wait-for-idle.ts.
          if (lastCtx) {
            const ready = await ensureIdleWithImplicitAbort(lastCtx);
            if (!ready) {
              console.warn(`[voice] send_user_message: agent did not become idle within 2000ms after implicit abort, dropping: ${action.text.slice(0, 60)}`);
              return;
            }
          }
          pi.sendUserMessage(action.text, action.deliverAs ? { deliverAs: action.deliverAs } : undefined);
          return;
        }
        case 'inject_silent_user_message': {
          // sendMessage() with a `custom` role + triggerTurn:false appends
          // an entry that converts to a `role:"user"` message for the LLM
          // (see core/messages.ts convertToLlm) but does not start a turn
          // now. Exactly what we want for the end-of-call sentinel.
          pi.sendMessage(
            {
              customType: action.customType,
              content: action.text,
              display: true,
            },
            { triggerTurn: false },
          );
          return;
        }
        case 'open_ws': {
          const generation = ++wsGeneration;
          // Reentrancy guard: close any prior client first.
          closeSpeechmuxClient();
          let client;
          try {
            client = await clientFactory({ wsUrl: action.url });
          } catch (err) {
            console.warn('[voice] speechmux open failed', err);
            // A superseded attempt's outcome is irrelevant to the FSM.
            if (generation === wsGeneration) {
              await dispatch({ type: 'ws:open_failed', error: err });
            }
            return;
          }
          if (generation !== wsGeneration) {
            // A close_ws or newer open_ws ran while we were connecting:
            // discard this socket instead of installing/leaking it.
            try {
              client.close();
            } catch {
              /* ignore */
            }
            return;
          }
          speechmuxClient = client;
          client.onFrame((frame) => {
            void dispatch({ type: 'ws:incoming', frame });
          });
          await dispatch({ type: 'ws:opened' });
          return;
        }
        case 'close_ws': {
          // Also invalidates any connect still in flight.
          wsGeneration += 1;
          closeSpeechmuxClient();
          return;
        }
        case 'send_frame': {
          if (!speechmuxClient) {
            console.warn('[voice] send_frame with no client — dropping', action.frame.type);
            return;
          }
          const preview = action.frame.type === 'token' ? action.frame.text.slice(0, 60) : null;
          console.log('[voice_trace] send_frame', JSON.stringify({ type: action.frame.type, preview }));
          try {
            speechmuxClient.send(action.frame);
          } catch (err) {
            console.warn('[voice] speechmux send failed', action.frame.type, err);
          }
          return;
        }
        case 'abort_agent': {
          lastCtx?.abort();
          return;
        }
        case 'append_custom_entry': {
          pi.appendEntry(action.customType, action.data);
          return;
        }
        case 'emit_deactivate_request': {
          // NOTE(review): `state` is already the post-reduce state here, so
          // the id is '' if the same reduction left active/activating —
          // confirm against the lifecycle reducer.
          const sessionId = state.lifecycle.kind === 'active' || state.lifecycle.kind === 'activating' ? state.lifecycle.sessionId : '';
          const msg = {
            type: 'pimote:voice:deactivate',
            sessionId,
          };
          pi.events.emit('pimote:voice:deactivate', msg);
          return;
        }
        case 'rewrite_context': {
          // Normally applied synchronously by dispatch(); kept so execute()
          // stays total over the action kinds. Idempotent.
          pendingContextRewrite = action.messages;
          return;
        }
      }
    };

    // ---- EventBus listeners ---------------------------------------------
    pi.events.on('pimote:voice:activate', (data) => {
      void dispatch({ type: 'eb:activate', msg: data });
    });
    pi.events.on('pimote:voice:deactivate', (data) => {
      void dispatch({ type: 'eb:deactivate', msg: data });
    });

    // ---- speak() tool ---------------------------------------------------
    //
    // The streaming reducer is the sole emitter of speak `token`/`end`
    // frames. The `execute` here only returns the synthetic success
    // result so the agent loop progresses.
    pi.registerTool({
      name: 'speak',
      label: 'Speak',
      description: 'Speak text to the user via text-to-speech. This is the only way to produce audible output during a voice call. Keep messages short and TTS-friendly.',
      promptSnippet: 'speak(text) — speak text to the user (voice-mode only).',
      parameters: Type.Object({
        text: Type.String({ description: 'The text to speak to the user.' }),
      }),
      execute: async () => {
        if (state.lifecycle.kind === 'active' || state.lifecycle.kind === 'activating') {
          return { content: [{ type: 'text', text: 'ok' }], details: {} };
        }
        return {
          content: [
            {
              type: 'text',
              text: 'Voice call has ended. The user is now in text mode — do NOT call speak() again. Reply with normal assistant text. Any further speak() calls in this session will be rejected.',
            },
          ],
          details: {},
          isError: true,
        };
      },
    });

    // ---- SDK hooks ------------------------------------------------------
    pi.on('before_agent_start', (event, ctx) => {
      lastCtx = ctx;
      if (state.lifecycle.kind === 'dormant') {
        return;
      }
      return { systemPrompt: `${interpreterPrompt}\n\n${event.systemPrompt ?? ''}`.trim() };
    });
    // The `tool_call` hook is intentionally NOT registered. The streaming
    // reducer is the sole emitter of speak frames; bulk-emission via
    // tool_call was the source of the double-emit class of bugs.
    //
    // The `turn_end` safety net is also intentionally NOT registered.
    // With per-speak `end` framing driven by `toolcall_end`, it was
    // redundant and contributed to double-end emissions.
    pi.on('message_start', (event) => {
      // Only assistant messages reset the streaming state. User and
      // tool-result messages don't have content blocks we care about.
      if (event.message.role !== 'assistant') {
        return;
      }
      void dispatch({ type: 'sdk:message_start', message: event.message });
    });
    pi.on('message_update', (event, ctx) => {
      lastCtx = ctx;
      // Walkback no longer needs a captured snapshot — it operates on
      // the messages array passed to `sdk:context` directly.
      if (state.lifecycle.kind === 'dormant') {
        return;
      }
      const ame = event.assistantMessageEvent;
      if (!ame || typeof ame.contentIndex !== 'number') {
        return;
      }
      switch (ame.type) {
        case 'toolcall_start':
          void dispatch({
            type: 'sdk:toolcall_start',
            contentIndex: ame.contentIndex,
            partial: ame.partial ?? {},
          });
          return;
        case 'toolcall_delta':
          void dispatch({
            type: 'sdk:toolcall_delta',
            contentIndex: ame.contentIndex,
            delta: typeof ame.delta === 'string' ? ame.delta : '',
            partial: ame.partial ?? {},
          });
          return;
        case 'toolcall_end':
          void dispatch({
            type: 'sdk:toolcall_end',
            contentIndex: ame.contentIndex,
            toolCall: ame.toolCall ?? {},
          });
          return;
        default:
          // text_*, thinking_* — not relevant to outbound streaming.
          return;
      }
    });
    pi.on('context', (event, ctx) => {
      lastCtx = ctx;
      // The walkback reducer always runs walkBack (even when no rewrite
      // is pending — to strip aborted-empty-assistants). dispatch() runs
      // the reducer and stashes any `rewrite_context` result into
      // `pendingContextRewrite` synchronously, so the slot is already
      // populated when the un-awaited call returns.
      void dispatch({ type: 'sdk:context', messages: event.messages });
      const result = pendingContextRewrite;
      pendingContextRewrite = null;
      if (result) {
        return { messages: result };
      }
      return undefined;
    });
  };
}
@@ -0,0 +1,115 @@
1
+ // Interpreter prompt for the pimote voice extension. Adapted from voxcoder's
2
+ // interpreter prompt — see docs/plans/voice-mode.md (Step 2) and
3
+ // /home/alenna/repos/voxcoder/server/src/interpreter/prompt.ts.
4
+ //
5
+ // Multimodal placeholders from voxcoder are removed (the PWA is a separate
6
+ // text surface in v1 — it renders scrollback directly, not through the
7
+ // interpreter). The interpreter's sole audio-output path is the `speak(text)`
8
+ // pi custom tool; free-text assistant output is discarded from the audio
9
+ // channel by the extension.
10
+ //
11
+ // Placeholders `{{workerProvider}}` / `{{workerModel}}` are substituted once
12
+ // at factory time by `createVoiceExtension` so the registered string is
13
+ // static by the time pi's `before_agent_start` hook sees it.
14
+ /** Raw template — contains `{{workerProvider}}` / `{{workerModel}}` placeholders. */
15
+ export const RAW_INTERPRETER_PROMPT = `You are a voice interpreter — the conversational hub between a human user speaking over a phone-like call and a coding worker subagent that does the actual software engineering work.
16
+
17
+ <role>
18
+
19
+ You receive all user speech as user messages. You decide what to say back (via the \`speak\` tool) and when to delegate work to a worker. From the user's point of view you and the worker are one entity — use "I" when relaying what the worker is doing.
20
+
21
+ You have exactly one way to produce audio: the \`speak(text)\` tool. Any free-text assistant output you emit is discarded — the user will never hear it. If you have nothing to say and nothing to do, emit a single \`speak\` call with a brief acknowledgement (e.g. "ok") or simply end your turn.
22
+
23
+ </role>
24
+
25
+ <session_start>
26
+
27
+ When you see the sentinel user message \`<voice_call_started/>\`, the call has just connected. Greet the user proactively with a brief \`speak(...)\` — one or two sentences — and then end your turn so the user can reply. Example greetings:
28
+
29
+ - "Hey, I'm here. What are we working on?"
30
+ - "Hi — what can I help you with?"
31
+
32
+ Do not dispatch any worker task on the greeting turn. Just speak and wait.
33
+
34
+ </session_start>
35
+
36
+ <acknowledge_first>
37
+
38
+ **Every time the user speaks, your very first action in the response turn must be a \`speak(...)\` call that acknowledges what you heard and, if you're about to do something, what you're going to do about it.** Do not start a turn with a tool call, a subagent spawn, a read, or silent thinking — speak first, always.
39
+
40
+ The acknowledgement and any tool calls happen in the same turn. Emit the \`speak\` call first, then immediately follow with whatever tool calls you need (typically a worker subagent). The user hears the acknowledgement while the tools run in the background — that's the point.
41
+
42
+ Keep the ack short and concrete:
43
+
44
+ - "Okay, taking a look at the auth module now."
45
+ - "Got it — I'll check the test failures."
46
+ - "Sure, one sec while I read that file."
47
+
48
+ For purely conversational turns where no tool call is needed, the \`speak\` call alone *is* the response — same rule, just nothing after it.
49
+
50
+ The only exception is the \`<voice_call_started/>\` greeting turn, which is already a \`speak\`-first turn by definition.
51
+
52
+ </acknowledge_first>
53
+
54
+ <speaking>
55
+
56
+ All audible output goes through \`speak(text)\`:
57
+
58
+ - One or two short sentences per call. Natural spoken English.
59
+ - Never read code aloud. Describe what the code does instead.
60
+ - No markdown, backticks, bullet points, or code fences — they sound terrible as TTS.
61
+ - For long updates, break them into multiple \`speak\` calls in the same turn; each call is streamed to the user as you emit it.
62
+
63
+ You may emit multiple \`speak\` calls per turn. The user hears them concatenated in order. End your turn once you have nothing more to say on the current topic.
64
+
65
+ </speaking>
66
+
67
+ <worker_delegation>
68
+
69
+ For any real software-engineering task (reading files, editing code, running tests, investigating a bug, writing a new feature), spawn a worker via the \`my-pi\` \`subagent\` tool. The worker is a full pi coding agent — give it a clear task description and let it work.
70
+
71
+ **The worker is long-lived.** Spawn it once — either at the start of the call or the first time you need it — and then keep it alive for the rest of the call. Do **not** tear it down when it goes idle. For every subsequent task, use \`send\` to dispatch new work to the existing worker. This preserves the worker's context across the whole call so it remembers prior files, decisions, and reasoning. Only tear it down at the end of the call or if it gets into a clearly broken state.
72
+
73
+ **IMPORTANT:** When spawning a worker via \`my-pi\` \`subagent\`, always pass \`model: "{{workerModel}}"\` and \`provider: "{{workerProvider}}"\` in the agent configuration so the worker runs on the configured worker model rather than the interpreter model.
74
+
75
+ On the turn where you spawn the worker, emit the acknowledging \`speak(...)\` call as the first tool call in the list, with the \`subagent\` call right after it in the same response. Both fire in parallel — the user hears the ack while the worker is already starting up.
76
+
77
+ While the worker runs:
78
+
79
+ - When the worker reports progress or completion, summarise it briefly for the user — outcomes, not step-by-step narration.
80
+ - If the worker asks a question or flags a decision, relay it to the user and wait for their answer before forwarding it back.
81
+
82
+ For purely conversational turns (greetings, thanks, chit-chat, clarifying a previous answer) you can handle the turn with \`speak\` alone — no worker needed.
83
+
84
+ </worker_delegation>
85
+
86
+ <interruptions>
87
+
88
+ The user can interrupt you mid-sentence. When that happens, your in-flight turn is aborted and the user's new message arrives as the next user turn. Do not apologise for being interrupted or try to resume the old sentence — just respond to what the user said.
89
+
90
+ </interruptions>
91
+
92
+ <tts_guidelines>
93
+
94
+ The user is likely driving, cooking, or otherwise unable to look at a screen. Audio must be:
95
+
96
+ - Brief enough not to distract.
97
+ - Clear enough to understand without visual context.
98
+ - Natural enough not to sound robotic.
99
+
100
+ Rules of thumb:
101
+
102
+ - 1–3 sentences per \`speak\` call.
103
+ - Focus on outcomes, not internal state.
104
+ - Never read code, file paths with slashes, or long identifiers aloud verbatim — paraphrase.
105
+ - When the worker finishes, summarise the result in a sentence or two.
106
+
107
+ </tts_guidelines>
108
+ `;
109
/**
 * Substitute the `{{workerProvider}}` / `{{workerModel}}` placeholders with
 * concrete values. Called once at factory time.
 *
 * Values are inserted verbatim: a function replacer is used so `$`-sequences
 * in a value (`$&`, `$1`, `$$`, …) are not interpreted as replacement
 * patterns, and both placeholders are handled in a single pass so placeholder
 * text inside one value is never re-substituted by the other.
 *
 * @param {{ workerProvider: string, workerModel: string }} vars
 * @returns {string} The prompt with every placeholder occurrence replaced.
 */
export function renderInterpreterPrompt(vars) {
  const values = {
    workerProvider: String(vars.workerProvider),
    workerModel: String(vars.workerModel),
  };
  return RAW_INTERPRETER_PROMPT.replace(/\{\{(workerProvider|workerModel)\}\}/g, (_match, key) => values[key]);
}
@@ -0,0 +1,153 @@
1
+ // Speechmux LlmBackend WS protocol — minimal interface the voice extension
2
+ // consumes. Full protocol: speechmux/docs/llm-ws-protocol.md.
3
+ //
4
+ // The voice extension uses this as the seam between itself and speechmux so
5
+ // tests can substitute an in-memory fake without running a real WebSocket.
6
+ // ---------------------------------------------------------------------------
7
+ // Default `ws`-backed implementation.
8
+ // ---------------------------------------------------------------------------
9
/**
 * Default `SpeechmuxClient` factory backed by the `ws` package. Opens a
 * WebSocket to `wsUrl` and routes incoming JSON text frames to registered
 * listeners. The LLM-WS protocol has no hello frame — the harness simply
 * connects and exchanges `user` / `token` / `end` / `abort` / `rollback`
 * frames (see speechmux/docs/llm-ws-protocol.md).
 *
 * The returned async factory takes `{ wsUrl, connectTimeoutMs = 5000 }` and
 * resolves once the socket is open. It rejects if the socket errors, closes
 * before opening, or does not open within `connectTimeoutMs`.
 *
 * @returns {(opts: { wsUrl: string, connectTimeoutMs?: number }) => Promise<{
 *   send(frame: object): void,
 *   onFrame(listener: (frame: object) => void): () => boolean,
 *   close(): void,
 * }>}
 */
export function createDefaultSpeechmuxClientFactory() {
  // Dynamic import so consumers that never call the factory (e.g. tests)
  // don't pay the `ws` resolution cost. Cached after first load.
  let WsCtor = null;
  return async (opts) => {
    const { wsUrl } = opts;
    if (!WsCtor) {
      const mod = await import('ws');
      WsCtor = mod.WebSocket;
    }
    const ws = new WsCtor(wsUrl);
    const listeners = new Set();
    // Buffer frames that arrive after the socket opens but before the caller
    // has had a chance to attach an `onFrame` listener. Drained on the first
    // attach.
    const pending = [];
    let closed = false;
    let opened = false;

    // Persistent 'error' listener. An EventEmitter 'error' with no listener
    // is thrown, which would crash the process. That could otherwise happen
    // (a) after the connect timeout, when terminate() on a still-connecting
    // socket makes `ws` emit 'error', and (b) on any socket error after the
    // connect-time handlers below have been removed.
    ws.on('error', (err) => {
      // Pre-open failures are reported through the connect promise instead.
      if (opened) {
        console.warn('[voice] speechmux socket error', err);
      }
    });
    ws.on('close', () => {
      closed = true;
    });

    // Install the message handler before resolving so frames sent between
    // open and the caller's onFrame attach are buffered instead of dropped.
    // See review finding 5 (speechmux-client race).
    ws.on('message', (raw, isBinary) => {
      if (isBinary) {
        return;
      }
      let text;
      if (typeof raw === 'string') {
        text = raw;
      } else if (raw instanceof Buffer) {
        text = raw.toString('utf8');
      } else if (Array.isArray(raw)) {
        text = Buffer.concat(raw).toString('utf8');
      } else {
        text = Buffer.from(raw).toString('utf8');
      }
      let frame;
      try {
        frame = JSON.parse(text);
      } catch {
        return; // ignore non-JSON
      }
      if (!isIncomingFrame(frame)) {
        return;
      }
      if (listeners.size === 0) {
        pending.push(frame);
        return;
      }
      for (const listener of listeners) {
        listener(frame);
      }
    });

    const connectTimeoutMs = opts.connectTimeoutMs ?? 5000;
    await new Promise((resolve, reject) => {
      let settled = false;
      const cleanup = () => {
        clearTimeout(timer);
        ws.off('open', onOpen);
        ws.off('error', onError);
        ws.off('close', onClose);
      };
      const onOpen = () => {
        if (settled) {
          return;
        }
        settled = true;
        opened = true;
        cleanup();
        resolve();
      };
      const onError = (err) => {
        if (settled) {
          return;
        }
        settled = true;
        cleanup();
        reject(err);
      };
      // Covers a close that is not preceded by an 'error' event.
      const onClose = () => {
        if (settled) {
          return;
        }
        settled = true;
        cleanup();
        reject(new Error(`SpeechmuxClient: socket closed before opening (${wsUrl})`));
      };
      const timer = setTimeout(() => {
        if (settled) {
          return;
        }
        settled = true;
        cleanup();
        try {
          ws.terminate();
        } catch {
          /* ignore */
        }
        reject(new Error(`SpeechmuxClient: connect timeout after ${connectTimeoutMs}ms (${wsUrl})`));
      }, connectTimeoutMs);
      ws.once('open', onOpen);
      ws.once('error', onError);
      ws.once('close', onClose);
    });

    return {
      /** Send one frame as JSON text; throws if the socket is not open. */
      send(frame) {
        if (closed || ws.readyState !== ws.OPEN) {
          throw new Error('SpeechmuxClient: socket is not open');
        }
        ws.send(JSON.stringify(frame));
      },
      /** Register a frame listener; returns an unsubscribe function. */
      onFrame(listener) {
        const firstListener = listeners.size === 0;
        listeners.add(listener);
        if (firstListener && pending.length > 0) {
          // Drain any frames that arrived before the listener attached.
          const drained = pending.splice(0, pending.length);
          for (const frame of drained) {
            listener(frame);
          }
        }
        return () => listeners.delete(listener);
      },
      /** Close the socket; safe to call more than once. */
      close() {
        if (closed) {
          return;
        }
        closed = true;
        try {
          ws.close();
        } catch {
          // ignore
        }
      },
    };
  };
}
139
/**
 * Structural check for frames the extension accepts from speechmux.
 *
 * @param {unknown} value - Parsed JSON payload.
 * @returns {boolean} `true` for `{type:'user', text:string}`,
 *   `{type:'abort'}` and `{type:'rollback', heard_text:string}`; `false` for
 *   everything else.
 */
function isIncomingFrame(value) {
  const isObject = Boolean(value) && typeof value === 'object';
  if (!isObject) {
    return false;
  }
  const { type } = value;
  if (type === 'abort') {
    return true;
  }
  if (type === 'user') {
    return typeof value.text === 'string';
  }
  if (type === 'rollback') {
    return typeof value.heard_text === 'string';
  }
  return false;
}
@@ -0,0 +1,14 @@
1
+ // Sentinel messages for the voice extension's activation state machine.
2
+ //
3
+ // The state machine is defined in docs/plans/voice-mode.md — "Voice extension"
4
+ // section under "Interfaces" — and is driven by EventBus messages from the
5
+ // server-side VoiceOrchestrator. This module only exports the sentinel strings.
6
+ /** Sentinel user message appended on entry to the `active` state. */
7
+ export const VOICE_CALL_STARTED_SENTINEL = '<voice_call_started/>';
8
+ /**
9
+ * Sentinel user message appended on exit from `active`/`activating` back to
10
+ * `dormant`. Gives the agent an explicit in-history signal that the voice
11
+ * call has ended, so subsequent turns (including future text-mode pickups
12
+ * of the same session) don't keep mimicking prior `speak()` calls.
13
+ */
14
+ export const VOICE_CALL_ENDED_SENTINEL = '<voice_call_ended/>';