@pimote/pimote 0.1.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +46 -17
- package/client/build/_app/immutable/assets/0.C7loWTOC.css +2 -0
- package/client/build/_app/immutable/assets/2.D9fiCd8W.css +1 -0
- package/client/build/_app/immutable/chunks/{BTSGQ0LP.js → B8lQCytv.js} +1 -1
- package/client/build/_app/immutable/chunks/BNqgidwO.js +5 -0
- package/client/build/_app/immutable/chunks/D26i4pYm.js +1 -0
- package/client/build/_app/immutable/chunks/D_Fpgknp.js +1 -0
- package/client/build/_app/immutable/chunks/DoVhjU85.js +1 -0
- package/client/build/_app/immutable/chunks/DzqbY2XU.js +1 -0
- package/client/build/_app/immutable/chunks/{L5t1qIFa.js → uZO1iyJZ.js} +2 -2
- package/client/build/_app/immutable/entry/app.DO-zgzyy.js +2 -0
- package/client/build/_app/immutable/entry/start.BZlrOH0-.js +1 -0
- package/client/build/_app/immutable/nodes/0.BEh4bPGQ.js +10 -0
- package/client/build/_app/immutable/nodes/1.B2l9JGRO.js +1 -0
- package/client/build/_app/immutable/nodes/2.ph9M0S1U.js +54 -0
- package/client/build/_app/version.json +1 -1
- package/client/build/index.html +8 -8
- package/package.json +9 -5
- package/patches/{@mariozechner+pi-coding-agent+0.65.0.patch → @mariozechner+pi-coding-agent+0.67.6.patch} +4 -4
- package/server/dist/auto-drain-on-abort.js +49 -0
- package/server/dist/config.js +21 -0
- package/server/dist/extension-ui-bridge.js +14 -1
- package/server/dist/folder-index.js +8 -4
- package/server/dist/git-branch.js +32 -0
- package/server/dist/index.js +31 -1
- package/server/dist/message-mapper.js +99 -4
- package/server/dist/server.js +5 -2
- package/server/dist/session-manager.js +99 -6
- package/server/dist/voice/fsm/actions.js +6 -0
- package/server/dist/voice/fsm/events.js +7 -0
- package/server/dist/voice/fsm/reducer.js +74 -0
- package/server/dist/voice/fsm/reducers/lifecycle.js +146 -0
- package/server/dist/voice/fsm/reducers/streaming.js +220 -0
- package/server/dist/voice/fsm/reducers/walkback.js +73 -0
- package/server/dist/voice/fsm/state.js +21 -0
- package/server/dist/voice/fsm/text-extractor.js +128 -0
- package/server/dist/voice/index.js +319 -0
- package/server/dist/voice/interpreter-prompt.js +115 -0
- package/server/dist/voice/speechmux-client.js +153 -0
- package/server/dist/voice/state-machine.js +7 -0
- package/server/dist/voice/wait-for-idle.js +67 -0
- package/server/dist/voice/walk-back.js +198 -0
- package/server/dist/voice-orchestrator-boot.js +90 -0
- package/server/dist/voice-orchestrator.js +91 -0
- package/server/dist/ws-handler.js +340 -36
- package/shared/dist/index.d.ts +1 -0
- package/shared/dist/index.js +2 -0
- package/shared/dist/protocol.d.ts +614 -0
- package/shared/dist/protocol.js +30 -0
- package/client/build/_app/immutable/assets/0.Cj7UL9cq.css +0 -2
- package/client/build/_app/immutable/assets/2.CIRqqeIr.css +0 -1
- package/client/build/_app/immutable/chunks/BEKHoMUP.js +0 -1
- package/client/build/_app/immutable/chunks/CfQ6Egqh.js +0 -1
- package/client/build/_app/immutable/chunks/DQ-KfPq0.js +0 -1
- package/client/build/_app/immutable/chunks/DfA0ecbz.js +0 -1
- package/client/build/_app/immutable/chunks/Dnh9Emns.js +0 -5
- package/client/build/_app/immutable/entry/app.j0V4R67V.js +0 -2
- package/client/build/_app/immutable/entry/start.wkfo4Ebw.js +0 -1
- package/client/build/_app/immutable/nodes/0.CUipL_P7.js +0 -5
- package/client/build/_app/immutable/nodes/1.ex7ejMby.js +0 -1
- package/client/build/_app/immutable/nodes/2.165oQG9Z.js +0 -49
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
// @pimote/voice — voice extension for pimote.
|
|
2
|
+
//
|
|
3
|
+
// This is the imperative shell around the pure FSM in `./fsm/`. It does
|
|
4
|
+
// three things:
|
|
5
|
+
//
|
|
6
|
+
// 1. Translates external stimuli (pi-coding-agent SDK hooks, EventBus
|
|
7
|
+
// messages, speechmux WS frames) into typed `Event` values.
|
|
8
|
+
// 2. Calls `reduce(state, event)` and writes back the new state.
|
|
9
|
+
// 3. Interprets the emitted `Action` values into actual side effects:
|
|
10
|
+
// pi.sendUserMessage, ctx.abort, WS open/close/send, EventBus emit.
|
|
11
|
+
//
|
|
12
|
+
// **Why the redesign.** The previous monolithic implementation conflated
|
|
13
|
+
// lifecycle, streaming, and walkback into a single record of orthogonal
|
|
14
|
+
// flags whose invariants drifted out of sync. The most visible symptom
|
|
15
|
+
// was per-content-block streaming state leaking across assistant
|
|
16
|
+
// messages because it was reset on the wrong event (a substring of
|
|
17
|
+
// `assistantMessageEvent` that never fires inside `message_update`). The
|
|
18
|
+
// FSM split + correct reset-on-message_start eliminates that bug class.
|
|
19
|
+
import { Type } from '@sinclair/typebox';
|
|
20
|
+
import { renderInterpreterPrompt } from './interpreter-prompt.js';
|
|
21
|
+
import { createDefaultSpeechmuxClientFactory } from './speechmux-client.js';
|
|
22
|
+
import { ensureIdleWithImplicitAbort } from './wait-for-idle.js';
|
|
23
|
+
import { initialState } from './fsm/state.js';
|
|
24
|
+
import { reduce } from './fsm/reducer.js';
|
|
25
|
+
// ---- Diagnostic helpers ---------------------------------------------------
|
|
26
|
+
/**
 * Build a short, human-readable description of `event` for trace logs.
 *
 * Only event types that carry detail worth logging get a custom rendering;
 * every other type yields null so the dispatcher falls back to logging the
 * bare `event.type`.
 */
function traceEvent(event) {
    const { type } = event;
    if (type === 'sdk:toolcall_start') {
        const idx = event.contentIndex;
        return `sdk:toolcall_start(idx=${idx}, partial.name=${partialName(event.partial, idx)})`;
    }
    if (type === 'sdk:toolcall_delta') {
        const { delta } = event;
        // Only the first 40 characters of the delta are previewed.
        const preview = JSON.stringify(delta.slice(0, 40));
        return `sdk:toolcall_delta(idx=${event.contentIndex}, deltaLen=${delta.length}, deltaPreview=${preview})`;
    }
    if (type === 'sdk:toolcall_end') {
        const { toolCall } = event;
        const spoken = toolCall.arguments?.text;
        const finalTextLen = typeof spoken === 'string' ? spoken.length : 0;
        return `sdk:toolcall_end(idx=${event.contentIndex}, name=${toolCall.name ?? null}, finalTextLen=${finalTextLen})`;
    }
    if (type === 'ws:incoming') {
        const { frame } = event;
        // Only `user` frames have their text length reported.
        const suffix = frame.type === 'user' ? `, textLen=${frame.text.length}` : '';
        return `ws:incoming(${frame.type}${suffix})`;
    }
    if (type === 'sdk:message_start') {
        return `sdk:message_start(role=${event.message.role})`;
    }
    if (type === 'sdk:context') {
        return `sdk:context(messages=${event.messages.length})`;
    }
    if (type === 'eb:activate') {
        return `eb:activate(${event.msg.sessionId})`;
    }
    return null;
}
|
|
48
|
+
/**
 * Look up the `name` of the content block at position `idx` of a partial
 * assistant message.
 *
 * Yields null when `partial.content` is not an array, when the slot is empty
 * or not an object, or when the block's `name` is not a string.
 */
function partialName(partial, idx) {
    const { content } = partial;
    const block = Array.isArray(content) ? content[idx] : undefined;
    if (block && typeof block === 'object' && typeof block.name === 'string') {
        return block.name;
    }
    return null;
}
|
|
58
|
+
/**
 * Condense a keyed collection of streaming blocks into a plain object that
 * maps each key to the `blockKind` label of its block, for trace output.
 * `blocks` must expose `entries()` yielding `[key, block]` pairs.
 */
function blockSummary(blocks) {
    const summary = {};
    for (const entry of blocks.entries()) {
        summary[entry[0]] = blockKind(entry[1]);
    }
    return summary;
}
|
|
65
|
+
/**
 * Label a streaming block for trace output: `speak_streaming` blocks also
 * report their `emittedLength`; every other block is labelled by its bare
 * `kind`.
 */
function blockKind(b) {
    const { kind } = b;
    return kind === 'speak_streaming' ? `speak_streaming(emitted=${b.emittedLength})` : kind;
}
|
|
70
|
+
// ---- Re-exports kept for back-compat with callers/tests -------------------
|
|
71
|
+
export { walkBack, isAbortedEmptyAssistant } from './walk-back.js';
|
|
72
|
+
export { VOICE_CALL_STARTED_SENTINEL } from './state-machine.js';
|
|
73
|
+
export { renderInterpreterPrompt, RAW_INTERPRETER_PROMPT } from './interpreter-prompt.js';
|
|
74
|
+
export { createDefaultSpeechmuxClientFactory } from './speechmux-client.js';
|
|
75
|
+
/**
 * Create the voice extension factory: the imperative shell around the pure
 * FSM reducer.
 *
 * @param opts Extension options:
 *   - `defaultWorkerModel.provider` / `defaultWorkerModel.modelId` are
 *     substituted into the interpreter prompt once, at factory time.
 *   - `defaultInterpreterModel` is forwarded to the reducer as
 *     `config.defaultInterpreterModel` on every dispatch.
 *   - `speechmuxClientFactory` (optional) overrides the default speechmux
 *     client factory.
 * @returns A function taking the `pi` extension API. It registers the
 *   EventBus listeners, the `speak` tool and the SDK hooks, and keeps all
 *   mutable state private to that one invocation.
 */
export function createVoiceExtension(opts) {
    const interpreterPrompt = renderInterpreterPrompt({
        workerProvider: opts.defaultWorkerModel.provider,
        workerModel: opts.defaultWorkerModel.modelId,
    });
    const clientFactory = opts.speechmuxClientFactory ?? createDefaultSpeechmuxClientFactory();
    return (pi) => {
        // ---- Per-extension-instance state (per pimote session) ---------------
        let state = initialState();
        let lastCtx = null;
        let speechmuxClient = null;
        /** Slot read by the `context` hook to return rewritten messages. */
        let pendingContextRewrite = null;
        // ---- Reducer driver --------------------------------------------------
        //
        // `dispatch` never rejects. Nearly every call site is a
        // fire-and-forget `void dispatch(...)`, so a rejection would surface
        // as an unhandled promise rejection. Failures while tracing or
        // reducing are therefore logged and the event is dropped with the
        // previous state left intact.
        const dispatch = async (event) => {
            // Tracing is diagnostic only; a malformed event must not stop
            // the reducer from seeing it.
            let evtTrace = null;
            try {
                evtTrace = traceEvent(event);
            }
            catch (err) {
                console.warn('[voice] traceEvent failed', event?.type, err);
            }
            const lifecycleBefore = state.lifecycle.kind;
            let actions;
            try {
                const result = reduce(state, event, {
                    config: { defaultInterpreterModel: opts.defaultInterpreterModel },
                });
                actions = result.actions;
                state = result.next;
            }
            catch (err) {
                console.warn('[voice] reduce failed', event?.type, err);
                return;
            }
            try {
                if (evtTrace || lifecycleBefore !== state.lifecycle.kind || actions.length > 0) {
                    console.log('[voice_trace] dispatch', JSON.stringify({
                        event: evtTrace ?? event.type,
                        lifecycle: `${lifecycleBefore}→${state.lifecycle.kind}`,
                        actions: actions.map((a) => a.kind),
                        blocks: blockSummary(state.message.blocks),
                    }));
                }
            }
            catch (err) {
                console.warn('[voice] trace logging failed', event?.type, err);
            }
            // Actions run strictly in order; one failing action does not
            // prevent the remaining ones from running.
            for (const action of actions) {
                try {
                    await execute(action);
                }
                catch (err) {
                    console.warn('[voice] action failed', action.kind, err);
                }
            }
        };
        // ---- Action interpreter ----------------------------------------------
        const execute = async (action) => {
            switch (action.kind) {
                case 'set_interpreter_model': {
                    if (!lastCtx) {
                        console.warn('[voice] set_interpreter_model: no ExtensionContext yet');
                        return;
                    }
                    const model = lastCtx.modelRegistry.find(action.provider, action.modelId);
                    if (!model) {
                        console.warn(`[voice] set_interpreter_model: no model ${action.provider}/${action.modelId}`);
                        return;
                    }
                    await pi.setModel(model);
                    return;
                }
                case 'send_user_message': {
                    // Ensure the agent is idle before sending. If it isn't, fire
                    // a synthesized barge-in (ctx.abort()) and wait for teardown
                    // — covers the case where the user spoke while the worker
                    // was silently reasoning, so speechmux didn't issue an abort
                    // (no TTS in flight to abort). See wait-for-idle.ts.
                    if (lastCtx) {
                        const ready = await ensureIdleWithImplicitAbort(lastCtx);
                        if (!ready) {
                            // NOTE(review): "2000ms" is hard-coded in this message — confirm it
                            // matches the timeout ensureIdleWithImplicitAbort actually uses.
                            console.warn(`[voice] send_user_message: agent did not become idle within 2000ms after implicit abort, dropping: ${action.text.slice(0, 60)}`);
                            return;
                        }
                    }
                    pi.sendUserMessage(action.text, action.deliverAs ? { deliverAs: action.deliverAs } : undefined);
                    return;
                }
                case 'open_ws': {
                    // Reentrancy guard: close any prior client first.
                    try {
                        speechmuxClient?.close();
                    }
                    catch {
                        /* ignore */
                    }
                    speechmuxClient = null;
                    // NOTE(review): if `close_ws` (or another `open_ws`) runs while the
                    // factory is still connecting, the client created here is still
                    // installed afterwards — confirm the reducer handles a late
                    // `ws:opened` by closing it.
                    try {
                        const client = await clientFactory({ wsUrl: action.url });
                        speechmuxClient = client;
                        client.onFrame((frame) => {
                            void dispatch({ type: 'ws:incoming', frame });
                        });
                        await dispatch({ type: 'ws:opened' });
                    }
                    catch (err) {
                        console.warn('[voice] speechmux open failed', err);
                        await dispatch({ type: 'ws:open_failed', error: err });
                    }
                    return;
                }
                case 'close_ws': {
                    try {
                        speechmuxClient?.close();
                    }
                    catch {
                        /* idempotent */
                    }
                    speechmuxClient = null;
                    return;
                }
                case 'send_frame': {
                    if (!speechmuxClient) {
                        console.warn('[voice] send_frame with no client — dropping', action.frame.type);
                        return;
                    }
                    // Only `token` frames get a text preview in the trace.
                    const preview = action.frame.type === 'token' ? action.frame.text.slice(0, 60) : null;
                    console.log('[voice_trace] send_frame', JSON.stringify({ type: action.frame.type, preview }));
                    try {
                        speechmuxClient.send(action.frame);
                    }
                    catch (err) {
                        console.warn('[voice] speechmux send failed', action.frame.type, err);
                    }
                    return;
                }
                case 'abort_agent': {
                    lastCtx?.abort();
                    return;
                }
                case 'append_custom_entry': {
                    pi.appendEntry(action.customType, action.data);
                    return;
                }
                case 'emit_deactivate_request': {
                    // NOTE(review): `state` has already been advanced by the reducer when
                    // this runs, so the session id is '' if the same transition left
                    // `active`/`activating` — confirm against the lifecycle reducer.
                    const sessionId = state.lifecycle.kind === 'active' || state.lifecycle.kind === 'activating' ? state.lifecycle.sessionId : '';
                    const msg = {
                        type: 'pimote:voice:deactivate',
                        sessionId,
                    };
                    pi.events.emit('pimote:voice:deactivate', msg);
                    return;
                }
                case 'rewrite_context': {
                    // Stash; the `context` hook below reads this on its return.
                    pendingContextRewrite = action.messages;
                    return;
                }
            }
        };
        // ---- EventBus listeners ---------------------------------------------
        pi.events.on('pimote:voice:activate', (data) => {
            void dispatch({ type: 'eb:activate', msg: data });
        });
        pi.events.on('pimote:voice:deactivate', (data) => {
            void dispatch({ type: 'eb:deactivate', msg: data });
        });
        // ---- speak() tool ---------------------------------------------------
        //
        // The streaming reducer is the sole emitter of speak `token`/`end`
        // frames. The `execute` here only returns the synthetic success
        // result so the agent loop progresses.
        pi.registerTool({
            name: 'speak',
            label: 'Speak',
            description: 'Speak text to the user via text-to-speech. This is the only way to produce audible output during a voice call. Keep messages short and TTS-friendly.',
            promptSnippet: 'speak(text) — speak text to the user (voice-mode only).',
            parameters: Type.Object({
                text: Type.String({ description: 'The text to speak to the user.' }),
            }),
            execute: async () => {
                if (state.lifecycle.kind === 'active' || state.lifecycle.kind === 'activating') {
                    return { content: [{ type: 'text', text: 'ok' }], details: {} };
                }
                return {
                    content: [{ type: 'text', text: 'speak() is only available during an active voice call.' }],
                    details: {},
                    isError: true,
                };
            },
        });
        // ---- SDK hooks ------------------------------------------------------
        pi.on('before_agent_start', (event, ctx) => {
            lastCtx = ctx;
            if (state.lifecycle.kind === 'dormant')
                return;
            // Outside the dormant state, prepend the interpreter prompt to
            // whatever system prompt the event already carries.
            return { systemPrompt: `${interpreterPrompt}\n\n${event.systemPrompt ?? ''}`.trim() };
        });
        // The `tool_call` hook is intentionally NOT registered. The streaming
        // reducer is the sole emitter of speak frames; bulk-emission via
        // tool_call was the source of the double-emit class of bugs.
        //
        // The `turn_end` safety net is also intentionally NOT registered.
        // With per-speak `end` framing driven by `toolcall_end`, it was
        // redundant and contributed to double-end emissions.
        pi.on('message_start', (event) => {
            // Only assistant messages reset the streaming state. User and
            // tool-result messages don't have content blocks we care about.
            if (event.message.role !== 'assistant')
                return;
            void dispatch({ type: 'sdk:message_start', message: event.message });
        });
        pi.on('message_update', (event, ctx) => {
            lastCtx = ctx;
            // Walkback no longer needs a captured snapshot — it operates on
            // the messages array passed to `sdk:context` directly.
            if (state.lifecycle.kind === 'dormant')
                return;
            const ame = event.assistantMessageEvent;
            if (!ame || typeof ame.contentIndex !== 'number')
                return;
            switch (ame.type) {
                case 'toolcall_start':
                    void dispatch({
                        type: 'sdk:toolcall_start',
                        contentIndex: ame.contentIndex,
                        partial: (ame.partial ?? {}),
                    });
                    return;
                case 'toolcall_delta':
                    void dispatch({
                        type: 'sdk:toolcall_delta',
                        contentIndex: ame.contentIndex,
                        delta: typeof ame.delta === 'string' ? ame.delta : '',
                        partial: (ame.partial ?? {}),
                    });
                    return;
                case 'toolcall_end':
                    void dispatch({
                        type: 'sdk:toolcall_end',
                        contentIndex: ame.contentIndex,
                        toolCall: (ame.toolCall ?? {}),
                    });
                    return;
                default:
                    // text_*, thinking_* — not relevant to outbound streaming.
                    return;
            }
        });
        pi.on('context', (event, ctx) => {
            lastCtx = ctx;
            // The walkback reducer always runs walkBack (even when no rewrite
            // is pending — to strip aborted-empty-assistants). It writes the
            // result into `pendingContextRewrite` via the `rewrite_context`
            // action, which we read below.
            //
            // `dispatch` is not awaited: it runs synchronously up to its
            // first `await`, so the slot is only populated in time when
            // `rewrite_context` is the first action the reducer returns.
            void dispatch({ type: 'sdk:context', messages: event.messages });
            const result = pendingContextRewrite;
            pendingContextRewrite = null;
            if (result)
                return { messages: result };
            return undefined;
        });
    };
}
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
// Interpreter prompt for the pimote voice extension. Adapted from voxcoder's
|
|
2
|
+
// interpreter prompt — see docs/plans/voice-mode.md (Step 2) and
|
|
3
|
+
// /home/alenna/repos/voxcoder/server/src/interpreter/prompt.ts.
|
|
4
|
+
//
|
|
5
|
+
// Multimodal placeholders from voxcoder are removed (the PWA is a separate
|
|
6
|
+
// text surface in v1 — it renders scrollback directly, not through the
|
|
7
|
+
// interpreter). The interpreter's sole audio-output path is the `speak(text)`
|
|
8
|
+
// pi custom tool; free-text assistant output is discarded from the audio
|
|
9
|
+
// channel by the extension.
|
|
10
|
+
//
|
|
11
|
+
// Placeholders `{{workerProvider}}` / `{{workerModel}}` are substituted once
|
|
12
|
+
// at factory time by `createVoiceExtension` so the registered string is
|
|
13
|
+
// static by the time pi's `before_agent_start` hook sees it.
|
|
14
|
+
/** Raw template — contains `{{workerProvider}}` / `{{workerModel}}` placeholders. */
|
|
15
|
+
export const RAW_INTERPRETER_PROMPT = `You are a voice interpreter — the conversational hub between a human user speaking over a phone-like call and a coding worker subagent that does the actual software engineering work.
|
|
16
|
+
|
|
17
|
+
<role>
|
|
18
|
+
|
|
19
|
+
You receive all user speech as user messages. You decide what to say back (via the \`speak\` tool) and when to delegate work to a worker. From the user's point of view you and the worker are one entity — use "I" when relaying what the worker is doing.
|
|
20
|
+
|
|
21
|
+
You have exactly one way to produce audio: the \`speak(text)\` tool. Any free-text assistant output you emit is discarded — the user will never hear it. If you have nothing to say and nothing to do, emit a single \`speak\` call with a brief acknowledgement (e.g. "ok") or simply end your turn.
|
|
22
|
+
|
|
23
|
+
</role>
|
|
24
|
+
|
|
25
|
+
<session_start>
|
|
26
|
+
|
|
27
|
+
When you see the sentinel user message \`<voice_call_started/>\`, the call has just connected. Greet the user proactively with a brief \`speak(...)\` — one or two sentences — and then end your turn so the user can reply. Example greetings:
|
|
28
|
+
|
|
29
|
+
- "Hey, I'm here. What are we working on?"
|
|
30
|
+
- "Hi — what can I help you with?"
|
|
31
|
+
|
|
32
|
+
Do not dispatch any worker task on the greeting turn. Just speak and wait.
|
|
33
|
+
|
|
34
|
+
</session_start>
|
|
35
|
+
|
|
36
|
+
<acknowledge_first>
|
|
37
|
+
|
|
38
|
+
**Every time the user speaks, your very first action in the response turn must be a \`speak(...)\` call that acknowledges what you heard and, if you're about to do something, what you're going to do about it.** Do not start a turn with a tool call, a subagent spawn, a read, or silent thinking — speak first, always.
|
|
39
|
+
|
|
40
|
+
The acknowledgement and any tool calls happen in the same turn. Emit the \`speak\` call first, then immediately follow with whatever tool calls you need (typically a worker subagent). The user hears the acknowledgement while the tools run in the background — that's the point.
|
|
41
|
+
|
|
42
|
+
Keep the ack short and concrete:
|
|
43
|
+
|
|
44
|
+
- "Okay, taking a look at the auth module now."
|
|
45
|
+
- "Got it — I'll check the test failures."
|
|
46
|
+
- "Sure, one sec while I read that file."
|
|
47
|
+
|
|
48
|
+
For purely conversational turns where no tool call is needed, the \`speak\` call alone *is* the response — same rule, just nothing after it.
|
|
49
|
+
|
|
50
|
+
The only exception is the \`<voice_call_started/>\` greeting turn, which is already a \`speak\`-first turn by definition.
|
|
51
|
+
|
|
52
|
+
</acknowledge_first>
|
|
53
|
+
|
|
54
|
+
<speaking>
|
|
55
|
+
|
|
56
|
+
All audible output goes through \`speak(text)\`:
|
|
57
|
+
|
|
58
|
+
- One or two short sentences per call. Natural spoken English.
|
|
59
|
+
- Never read code aloud. Describe what the code does instead.
|
|
60
|
+
- No markdown, backticks, bullet points, or code fences — they sound terrible as TTS.
|
|
61
|
+
- For long updates, break them into multiple \`speak\` calls in the same turn; each call is streamed to the user as you emit it.
|
|
62
|
+
|
|
63
|
+
You may emit multiple \`speak\` calls per turn. The user hears them concatenated in order. End your turn once you have nothing more to say on the current topic.
|
|
64
|
+
|
|
65
|
+
</speaking>
|
|
66
|
+
|
|
67
|
+
<worker_delegation>
|
|
68
|
+
|
|
69
|
+
For any real software-engineering task (reading files, editing code, running tests, investigating a bug, writing a new feature), spawn a worker via the \`my-pi\` \`subagent\` tool. The worker is a full pi coding agent — give it a clear task description and let it work.
|
|
70
|
+
|
|
71
|
+
**The worker is long-lived.** Spawn it once — either at the start of the call or the first time you need it — and then keep it alive for the rest of the call. Do **not** tear it down when it goes idle. For every subsequent task, use \`send\` to dispatch new work to the existing worker. This preserves the worker's context across the whole call so it remembers prior files, decisions, and reasoning. Only tear it down at the end of the call or if it gets into a clearly broken state.
|
|
72
|
+
|
|
73
|
+
**IMPORTANT:** When spawning a worker via \`my-pi\` \`subagent\`, always pass \`model: "{{workerModel}}"\` and \`provider: "{{workerProvider}}"\` in the agent configuration so the worker runs on the configured worker model rather than the interpreter model.
|
|
74
|
+
|
|
75
|
+
On the turn where you spawn the worker, emit the acknowledging \`speak(...)\` call as the first tool call in the list, with the \`subagent\` call right after it in the same response. Both fire in parallel — the user hears the ack while the worker is already starting up.
|
|
76
|
+
|
|
77
|
+
While the worker runs:
|
|
78
|
+
|
|
79
|
+
- When the worker reports progress or completion, summarise it briefly for the user — outcomes, not step-by-step narration.
|
|
80
|
+
- If the worker asks a question or flags a decision, relay it to the user and wait for their answer before forwarding it back.
|
|
81
|
+
|
|
82
|
+
For purely conversational turns (greetings, thanks, chit-chat, clarifying a previous answer) you can handle the turn with \`speak\` alone — no worker needed.
|
|
83
|
+
|
|
84
|
+
</worker_delegation>
|
|
85
|
+
|
|
86
|
+
<interruptions>
|
|
87
|
+
|
|
88
|
+
The user can interrupt you mid-sentence. When that happens, your in-flight turn is aborted and the user's new message arrives as the next user turn. Do not apologise for being interrupted or try to resume the old sentence — just respond to what the user said.
|
|
89
|
+
|
|
90
|
+
</interruptions>
|
|
91
|
+
|
|
92
|
+
<tts_guidelines>
|
|
93
|
+
|
|
94
|
+
The user is likely driving, cooking, or otherwise unable to look at a screen. Audio must be:
|
|
95
|
+
|
|
96
|
+
- Brief enough not to distract.
|
|
97
|
+
- Clear enough to understand without visual context.
|
|
98
|
+
- Natural enough not to sound robotic.
|
|
99
|
+
|
|
100
|
+
Rules of thumb:
|
|
101
|
+
|
|
102
|
+
- 1–3 sentences per \`speak\` call.
|
|
103
|
+
- Focus on outcomes, not internal state.
|
|
104
|
+
- Never read code, file paths with slashes, or long identifiers aloud verbatim — paraphrase.
|
|
105
|
+
- When the worker finishes, summarise the result in a sentence or two.
|
|
106
|
+
|
|
107
|
+
</tts_guidelines>
|
|
108
|
+
`;
|
|
109
|
+
/**
 * Substitute the `{{workerProvider}}` / `{{workerModel}}` placeholders in
 * `RAW_INTERPRETER_PROMPT` with concrete values. Called once at factory time.
 *
 * Both placeholders are replaced in a single pass with a function replacer:
 *  - values are inserted literally, so `$&`, `$'`, `$$` and similar
 *    replacement patterns inside a provider/model name are not interpreted;
 *  - a value that itself contains placeholder text is not substituted again.
 *
 * @param vars Object with `workerProvider` and `workerModel` properties.
 * @returns The prompt with every placeholder occurrence replaced.
 */
export function renderInterpreterPrompt(vars) {
    return RAW_INTERPRETER_PROMPT.replace(/\{\{(workerProvider|workerModel)\}\}/g, (_placeholder, key) => String(vars[key]));
}
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
// Speechmux LlmBackend WS protocol — minimal interface the voice extension
|
|
2
|
+
// consumes. Full protocol: speechmux/docs/llm-ws-protocol.md.
|
|
3
|
+
//
|
|
4
|
+
// The voice extension uses this as the seam between itself and speechmux so
|
|
5
|
+
// tests can substitute an in-memory fake without running a real WebSocket.
|
|
6
|
+
// ---------------------------------------------------------------------------
|
|
7
|
+
// Default `ws`-backed implementation.
|
|
8
|
+
// ---------------------------------------------------------------------------
|
|
9
|
+
/**
 * Default `SpeechmuxClient` factory backed by the `ws` package. Opens a
 * WebSocket to `wsUrl` and routes incoming JSON text frames to registered
 * listeners. The LLM-WS protocol has no hello frame — the harness simply
 * connects and exchanges `user` / `token` / `end` / `abort` / `rollback`
 * frames (see speechmux/docs/llm-ws-protocol.md).
 *
 * The returned async factory takes `{ wsUrl, connectTimeoutMs? }`
 * (`connectTimeoutMs` defaults to 5000). It resolves once the socket is open
 * with a client exposing:
 *  - `send(frame)`: JSON-encodes and sends; throws if the socket is not open.
 *  - `onFrame(listener)`: registers a listener and returns an unsubscribe
 *    function. Frames received while no listener is attached are buffered
 *    and replayed to the first listener that attaches.
 *  - `close()`: idempotent close.
 *
 * It rejects if the socket errors, closes before opening, or does not open
 * within `connectTimeoutMs`.
 */
export function createDefaultSpeechmuxClientFactory() {
    // Dynamic import so consumers that never call the factory (e.g. tests)
    // don't pay the `ws` resolution cost. Cached after first load.
    let WsCtor = null;
    return async (opts) => {
        const { wsUrl } = opts;
        if (!WsCtor) {
            const mod = await import('ws');
            WsCtor = mod.WebSocket;
        }
        const ws = new WsCtor(wsUrl);
        const listeners = new Set();
        // Buffer frames that arrive after the socket opens but before the
        // caller has had a chance to attach an `onFrame` listener. Drained on
        // the first attach.
        const pending = [];
        let closed = false;
        let opened = false;
        // Persistent `error` listener. An EventEmitter throws when `error` is
        // emitted with no listener, and the connect-phase listener below is
        // removed once the connect promise settles. Without this, a socket
        // error after open — or the error `ws` emits after `terminate()` on a
        // still-connecting socket — would become an uncaught exception.
        ws.on('error', (err) => {
            // Connect-phase failures are reported through the rejected
            // promise; only log errors on an established connection.
            if (opened)
                console.warn('[voice] speechmux socket error', err);
        });
        ws.on('close', () => {
            closed = true;
        });
        // Install the message handler before resolving so frames sent between
        // open and the caller's onFrame attach are buffered instead of dropped.
        // See review finding 5 (speechmux-client race).
        ws.on('message', (raw, isBinary) => {
            if (isBinary)
                return;
            let text;
            if (typeof raw === 'string')
                text = raw;
            else if (raw instanceof Buffer)
                text = raw.toString('utf8');
            else if (Array.isArray(raw))
                text = Buffer.concat(raw).toString('utf8');
            else
                text = Buffer.from(raw).toString('utf8');
            let frame;
            try {
                frame = JSON.parse(text);
            }
            catch {
                return; // ignore non-JSON
            }
            if (!isIncomingFrame(frame))
                return;
            if (listeners.size === 0) {
                pending.push(frame);
                return;
            }
            for (const listener of listeners)
                listener(frame);
        });
        const connectTimeoutMs = opts.connectTimeoutMs ?? 5000;
        await new Promise((resolve, reject) => {
            let settled = false;
            const cleanup = () => {
                clearTimeout(timer);
                ws.off('open', onOpen);
                ws.off('error', onError);
                ws.off('close', onClose);
            };
            const onOpen = () => {
                if (settled)
                    return;
                settled = true;
                opened = true;
                cleanup();
                resolve();
            };
            const onError = (err) => {
                if (settled)
                    return;
                settled = true;
                cleanup();
                reject(err);
            };
            // A close that arrives before `open` without a preceding `error`
            // would otherwise only surface through the connect timeout.
            const onClose = () => {
                if (settled)
                    return;
                settled = true;
                cleanup();
                reject(new Error(`SpeechmuxClient: socket closed before opening (${wsUrl})`));
            };
            const timer = setTimeout(() => {
                if (settled)
                    return;
                settled = true;
                cleanup();
                try {
                    ws.terminate();
                }
                catch {
                    /* ignore */
                }
                reject(new Error(`SpeechmuxClient: connect timeout after ${connectTimeoutMs}ms (${wsUrl})`));
            }, connectTimeoutMs);
            ws.once('open', onOpen);
            ws.once('error', onError);
            ws.once('close', onClose);
        });
        return {
            send(frame) {
                if (closed || ws.readyState !== ws.OPEN) {
                    throw new Error('SpeechmuxClient: socket is not open');
                }
                ws.send(JSON.stringify(frame));
            },
            onFrame(listener) {
                const firstListener = listeners.size === 0;
                listeners.add(listener);
                if (firstListener && pending.length > 0) {
                    // Drain any frames that arrived before the listener attached.
                    const drained = pending.splice(0, pending.length);
                    for (const frame of drained)
                        listener(frame);
                }
                return () => listeners.delete(listener);
            },
            close() {
                if (closed)
                    return;
                closed = true;
                try {
                    ws.close();
                }
                catch {
                    // ignore
                }
            },
        };
    };
}
|
|
139
|
+
/**
 * Validate a parsed JSON value as one of the frames accepted from speechmux:
 *  - `user` with a string `text`,
 *  - `abort` with no further requirements,
 *  - `rollback` with a string `heard_text`.
 * Anything else, including non-objects and null, is rejected.
 */
function isIncomingFrame(value) {
    if (typeof value !== 'object' || !value) {
        return false;
    }
    const { type } = value;
    if (type === 'user') {
        return typeof value.text === 'string';
    }
    if (type === 'rollback') {
        return typeof value.heard_text === 'string';
    }
    return type === 'abort';
}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
// Voice extension activation state machine.
|
|
2
|
+
//
|
|
3
|
+
// Defined in docs/plans/voice-mode.md — "Voice extension" section under
|
|
4
|
+
// "Interfaces". The state machine is driven by EventBus messages from the
|
|
5
|
+
// server-side VoiceOrchestrator.
|
|
6
|
+
/** Sentinel user message appended on entry to the `active` state. */
|
|
7
|
+
// NOTE(review): the value is a self-closing tag-style marker; presumably
// consumers match it verbatim, so confirm all readers before changing it.
export const VOICE_CALL_STARTED_SENTINEL = '<voice_call_started/>';
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
// Wait-for-idle helper for the voice extension.
|
|
2
|
+
//
|
|
3
|
+
// The speechmux abort/user frame pair on barge-in arrives as two
|
|
4
|
+
// independent reducer calls. The `abort` action calls `ctx.abort()`
|
|
5
|
+
// (fire-and-forget — the actual teardown completes asynchronously). If
|
|
6
|
+
// the user frame arrives before the teardown is done, `sendUserMessage`
|
|
7
|
+
// throws ("Agent is already processing…") and the user's utterance is
|
|
8
|
+
// silently dropped.
|
|
9
|
+
//
|
|
10
|
+
// Steering doesn't help: pi-agent-core doesn't drain the steer queue on
|
|
11
|
+
// the abort exit path of `runLoop`. Pimote has a separate
|
|
12
|
+
// `autoDrainOnAbort` listener (see `auto-drain-on-abort.ts`) that
|
|
13
|
+
// rescues queued messages after an aborted run, but that only catches
|
|
14
|
+
// messages that *were* queued — it doesn't help an unqueued
|
|
15
|
+
// `sendUserMessage` that throws.
|
|
16
|
+
//
|
|
17
|
+
// So the voice extension polls `ctx.isIdle()` before calling
|
|
18
|
+
// `sendUserMessage` (without `deliverAs`), guaranteeing the SDK won't
|
|
19
|
+
// throw. Auto-drain remains a belt-and-braces safety net for any
|
|
20
|
+
// queued path that races an abort.
|
|
21
|
+
/**
 * Resolve once the agent is idle, polling with exponential backoff
 * (start 5 ms, doubling, capped at 50 ms). Returns true immediately
 * when already idle, and false if the agent does not become idle
 * within `timeoutMs`.
 *
 * Each sleep is clamped to the time left in the budget, so no sleep is
 * ever scheduled past the `timeoutMs` deadline; one last idle check is
 * still made after the final sleep. A `timeoutMs` that is not a
 * positive number (including NaN) yields false as soon as the agent is
 * found busy, instead of polling forever.
 *
 * Timeout default of 2 s is well above any normal abort-teardown
 * latency (tens to a few hundred ms). If a real agent doesn't reach
 * idle within 2 s, something is genuinely stuck and dropping the
 * message is preferable to hanging the executor.
 *
 * @param {{ isIdle(): boolean }} ctx - Context whose `isIdle()` is polled.
 * @param {number} [timeoutMs=2000] - Maximum time to wait, in milliseconds.
 * @returns {Promise<boolean>} true once idle, false on timeout.
 */
export async function waitForAgentIdle(ctx, timeoutMs = 2000) {
    if (ctx.isIdle())
        return true;
    const start = Date.now();
    let delay = 5;
    while (!ctx.isIdle()) {
        const remaining = timeoutMs - (Date.now() - start);
        // `!(x > 0)` rather than `x <= 0` so that a NaN budget also bails
        // out here instead of looping without ever timing out.
        if (!(remaining > 0))
            return false;
        // Never sleep past the deadline: cap the backoff step at what is left.
        const pause = Math.min(delay, remaining);
        await new Promise((resolve) => setTimeout(resolve, pause));
        delay = Math.min(50, delay * 2);
    }
    return true;
}
|
|
45
|
+
/**
 * Make sure the agent is idle, issuing a synthetic barge-in if it is not.
 *
 * Speechmux only emits `abort` while it is actively playing TTS — i.e.
 * during the harness's `token`/`end` stream. While the worker is
 * silently reasoning between a `user` frame and its first `speak()`
 * call, speechmux has no signal that the agent is busy and won't
 * pre-empt. A new utterance in that window reaches the harness as a
 * `user` frame with no preceding `abort`, so the agent is still
 * mid-turn and `sendUserMessage` would race / be dropped.
 *
 * To close that gap, a busy agent is aborted here via `ctx.abort()`
 * (idempotent if a real barge-in already issued one), after which idle
 * is awaited exactly as for the regular abort/user pair.
 *
 * @param ctx - Context providing `isIdle()` and `abort()`.
 * @param {number} [timeoutMs=2000] - Budget handed to `waitForAgentIdle`.
 * @returns {Promise<boolean>} true once idle, false on timeout.
 */
export async function ensureIdleWithImplicitAbort(ctx, timeoutMs = 2000) {
    const busy = !ctx.isIdle();
    if (busy) {
        // Implicit barge-in: request teardown, then wait for it to finish.
        ctx.abort();
        return waitForAgentIdle(ctx, timeoutMs);
    }
    return true;
}
|