bloby-bot 0.70.8 → 0.70.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/dist-bloby/assets/{bloby-CXmOcb1r.js → bloby-DSNB0g4w.js} +4 -4
  2. package/dist-bloby/assets/{globals-DpO5tO92.js → globals-B3cTbITX.js} +1 -1
  3. package/dist-bloby/assets/{highlighted-body-OFNGDK62-D7cU1Y-Z.js → highlighted-body-OFNGDK62-BLforpkr.js} +1 -1
  4. package/dist-bloby/assets/mermaid-GHXKKRXX-C1H_fSCU.js +1 -0
  5. package/dist-bloby/assets/{onboard-B96ELhXn.js → onboard-Dn2Ws_G2.js} +1 -1
  6. package/dist-bloby/bloby.html +2 -2
  7. package/dist-bloby/onboard.html +2 -2
  8. package/package.json +1 -1
  9. package/scripts/sync-pi-models.ts +37 -6
  10. package/supervisor/chat/OnboardWizard.tsx +4 -4
  11. package/supervisor/harnesses/pi/async-queue.ts +7 -11
  12. package/supervisor/harnesses/pi/index.ts +475 -73
  13. package/supervisor/harnesses/pi/models-catalog.generated.ts +840 -210
  14. package/supervisor/harnesses/pi/providers/humanize-error.ts +125 -0
  15. package/supervisor/harnesses/pi/providers/retry.ts +87 -0
  16. package/supervisor/harnesses/pi/providers/stream-anthropic.ts +73 -11
  17. package/supervisor/harnesses/pi/providers/stream-google.ts +15 -5
  18. package/supervisor/harnesses/pi/providers/stream-openai-completions.ts +55 -19
  19. package/supervisor/harnesses/pi/providers/types.ts +26 -1
  20. package/supervisor/harnesses/pi/session.ts +179 -73
  21. package/supervisor/harnesses/pi/sub-providers.ts +30 -1
  22. package/supervisor/harnesses/pi/test-completion.ts +8 -2
  23. package/supervisor/harnesses/pi/tools/registry.ts +25 -9
  24. package/supervisor/harnesses/pi/tools/task.ts +108 -0
  25. package/supervisor/harnesses/pi/tools/types.ts +15 -0
  26. package/supervisor/index.ts +11 -10
  27. package/supervisor/public/morphy_sad.mov +0 -0
  28. package/supervisor/public/morphy_sad.webm +0 -0
  29. package/supervisor/shell.ts +1 -1
  30. package/supervisor/workspace-guard.js +1 -1
  31. package/workspace/client/public/morphy_bounce.mov +0 -0
  32. package/workspace/client/public/morphy_bounce.webm +0 -0
  33. package/workspace/client/public/morphy_hi.mov +0 -0
  34. package/workspace/client/public/morphy_hi.webm +0 -0
  35. package/workspace/client/src/App.tsx +5 -3
  36. package/dist-bloby/assets/mermaid-GHXKKRXX-D5YxphBn.js +0 -1
  37. package/supervisor/public/what-happened.mp4 +0 -0
  38. package/supervisor/public/what-happened.webm +0 -0
@@ -6,14 +6,18 @@
6
6
  * matches the Claude harness so the dispatcher needs no provider-specific
7
7
  * code.
8
8
  *
9
- * Phase 1 scope: live conversation + one-shot text only (no tools). The
9
+ * Live conversations run the full tool loop (session.ts); one-shots are still
10
+ * tool-less (audit Phase C will route them through createPiSession). The
10
11
  * non-blocking feel — user keeps typing while the model is still answering —
11
- * comes from the same `AsyncQueue` pattern Claude uses; see `async-queue.ts`.
12
+ * comes from the same `AsyncQueue` pattern Claude uses (one message per turn);
13
+ * see `async-queue.ts` and PI-PARITY-AUDIT-2026-06-11.md.
12
14
  */
13
15
  import { log } from '../../../shared/logger.js';
14
16
  import { WORKSPACE_DIR } from '../../../shared/paths.js';
15
17
  import type { SavedFile } from '../../file-saver.js';
16
18
  import { assembleSystemPrompt } from '../../../worker/prompts/prompt-assembler.js';
19
+ import { buildAgents } from '../../agents/index.js';
20
+ import crypto from 'crypto';
17
21
  import fs from 'fs';
18
22
  import path from 'path';
19
23
  import type {
@@ -26,12 +30,13 @@ export type { RecentMessage, AgentAttachment };
26
30
 
27
31
  import { buildSkillsIndex } from '../skills.js';
28
32
  import { createAsyncQueue, type AsyncQueue } from './async-queue.js';
29
- import { createPiSession, type PiSessionEvent } from './session.js';
30
- import { getPiSubProvider } from './sub-providers.js';
33
+ import { createPiSession, type PiSessionEvent, type PiSessionAuth } from './session.js';
34
+ import { getPiSubProvider, getCatalogModel } from './sub-providers.js';
31
35
  import { readPiAuth } from './auth-storage.js';
32
36
  import { streamProvider } from './providers/stream.js';
33
37
  import type { PiMessage } from './providers/types.js';
34
38
  import { toolDefsForProvider } from './tools/registry.js';
39
+ import type { PiTaskHost } from './tools/types.js';
35
40
 
36
41
  // ── Live conversation state ────────────────────────────────────────────────
37
42
 
@@ -41,11 +46,70 @@ interface LiveConversation {
41
46
  abortController: AbortController;
42
47
  onMessage: (type: string, data: any) => void;
43
48
  busy: boolean;
49
+ /** Messages pushed but not yet completed (1 turn-complete per message) — mirrors
50
+ * claude.ts pendingCount. idle:true on turn-complete only when this hits 0, so
51
+ * the supervisor's session recycling never fires with a message still queued. */
52
+ pendingCount: number;
53
+ /** 60ms micro-batcher for bot:token — collapses per-delta WS frame floods. */
54
+ batcher: TokenBatcher;
55
+ /** Running background sub-agent tasks (Phase B). While non-empty, the
56
+ * conversation reports idle:false (recycling deferred) and counts as busy
57
+ * (backend restarts / self-updates deferred) so a task is never killed
58
+ * mid-flight by housekeeping. */
59
+ tasks: Map<string, RunningTask>;
60
+ /** Set when a completed background task used file tools — OR'd into the next
61
+ * bot:turn-complete (the continuation turn) so the backend restarts right
62
+ * after the user hears "Done!", mirroring claude's usedTools capture of
63
+ * sub-agent tool_use blocks. */
64
+ taskUsedFileTools: boolean;
44
65
  loopDone: Promise<void> | null;
45
66
  }
46
67
 
68
+ interface RunningTask {
69
+ id: string;
70
+ description: string;
71
+ subagentType: string;
72
+ abortController: AbortController;
73
+ /** True when stopped via user:stop-task or conversation teardown. */
74
+ stopped: boolean;
75
+ startedAt: number;
76
+ }
77
+
47
78
  const liveConversations = new Map<string, LiveConversation>();
48
79
 
80
+ /**
81
+ * Micro-batch streamed deltas into ~60ms bot:token frames (house standard
82
+ * from the codex parity pass — an order-of-magnitude WS frame reduction with
83
+ * no visible change in streaming feel). Callers MUST flush() before emitting
84
+ * any non-token event so ordering and the streamed-text == bot:response
85
+ * contract are preserved; discard() on teardown drops post-abort stragglers.
86
+ */
87
+ interface TokenBatcher {
88
+ add(delta: string): void;
89
+ flush(): void;
90
+ discard(): void;
91
+ }
92
+
93
+ function createTokenBatcher(emit: (text: string) => void, intervalMs = 60): TokenBatcher {
94
+ let buf = '';
95
+ let timer: NodeJS.Timeout | null = null;
96
+ const flush = () => {
97
+ if (timer) { clearTimeout(timer); timer = null; }
98
+ if (buf) { const out = buf; buf = ''; emit(out); }
99
+ };
100
+ return {
101
+ add(delta: string) {
102
+ buf += delta;
103
+ if (!timer) timer = setTimeout(flush, intervalMs);
104
+ },
105
+ flush,
106
+ discard() {
107
+ if (timer) { clearTimeout(timer); timer = null; }
108
+ buf = '';
109
+ },
110
+ };
111
+ }
112
+
49
113
  export function hasConversation(conversationId: string): boolean {
50
114
  return liveConversations.has(conversationId);
51
115
  }
@@ -100,7 +164,7 @@ You are running in a streaming chat where the user can keep typing while you wor
100
164
 
101
165
  - Before kicking off a multi-step task, say one short line acknowledging it ("On it, looking at the widget now.").
102
166
  - Between tool calls on long tasks, drop a brief progress note ("Found the file, checking the layout next.") so the user knows you're still working.
103
- - If a new user message arrives while you're mid-task, you'll see it as a fresh user-role message in the conversation history. Answer it briefly inline, mention you're still working on the main task, then continue.
167
+ - Messages the user sends while you're working are queued and delivered to you one at a time after the current task finishes — each gets its own answer, so never assume you missed one.
104
168
  - Final answers should be concise and concrete.`;
105
169
 
106
170
  async function buildSystemPrompt(
@@ -133,29 +197,232 @@ async function buildSystemPrompt(
133
197
  return systemPrompt;
134
198
  }
135
199
 
136
- /** Resolve sub-provider, base url, api key, model id from saved pi-auth.json. */
137
- function resolveAuth(): {
138
- ok: true;
139
- flavor: ReturnType<typeof getPiSubProvider> extends undefined ? never : NonNullable<ReturnType<typeof getPiSubProvider>>['flavor'];
140
- modelId: string;
141
- baseUrl: string;
142
- apiKey: string;
143
- } | { ok: false; error: string } {
144
- const auth = readPiAuth();
145
- if (!auth) return { ok: false, error: 'Bloby provider is not configured. Run the onboarding wizard.' };
146
- const sub = getPiSubProvider(auth.subProvider);
147
- if (!sub) return { ok: false, error: `Unknown sub-provider in pi-auth.json: ${auth.subProvider}` };
148
- const baseUrl = (auth.baseUrl || sub.baseUrl || '').replace(/\/+$/, '');
200
+ /**
201
+ * Resolve the full provider auth bundle from saved pi-auth.json: sub-provider
202
+ * flavor, base url, api key, model id, plus catalog metadata (per-model output
203
+ * cap, context window) and the sub-provider's max-tokens field quirk.
204
+ *
205
+ * Called at session/one-shot start AND re-called on every live provider round
206
+ * via the session's getAuth thunk — so fixing a revoked key or switching
207
+ * models in the wizard heals a live conversation on its very next round.
208
+ */
209
+ function resolveAuth(): { ok: true; auth: PiSessionAuth } | { ok: false; error: string } {
210
+ const saved = readPiAuth();
211
+ if (!saved) return { ok: false, error: 'Bloby provider is not configured. Run the onboarding wizard.' };
212
+ const sub = getPiSubProvider(saved.subProvider);
213
+ if (!sub) return { ok: false, error: `Unknown sub-provider in pi-auth.json: ${saved.subProvider}` };
214
+ const baseUrl = (saved.baseUrl || sub.baseUrl || '').replace(/\/+$/, '');
149
215
  if (!baseUrl) return { ok: false, error: `No base URL configured for ${sub.id}` };
150
- const modelId = auth.modelId || sub.defaultModel || '';
216
+ const modelId = saved.modelId || sub.defaultModel || '';
151
217
  if (!modelId) return { ok: false, error: `No model selected for ${sub.id}` };
152
- if (sub.needsApiKey && !auth.apiKey) return { ok: false, error: `Missing API key for ${sub.id}` };
218
+ if (sub.needsApiKey && !saved.apiKey) return { ok: false, error: `Missing API key for ${sub.id}` };
219
+ const catalog = getCatalogModel(sub.id, modelId);
220
+
221
+ // Effective window reported to the supervisor's recycler. Two corrections
222
+ // over the raw catalog figure (audit review F1):
223
+ // 1. Anthropic catalog windows can reflect the 1M-context beta; without the
224
+ // beta header (we don't send it) the real window is 200k.
225
+ // 2. Since every request reserves max_tokens of output budget, providers
226
+ // enforce input + max_tokens <= window — the usable INPUT ceiling is
227
+ // window - maxOutputTokens. Reporting the raw window would put the 70%
228
+ // recycle threshold ABOVE that ceiling (e.g. 140k > 200k-64k=136k on
229
+ // claude-haiku-4-5) and the recycler could never preempt the wall.
230
+ let contextWindow = catalog?.contextWindow;
231
+ if (contextWindow && sub.flavor === 'anthropic-messages') {
232
+ contextWindow = Math.min(contextWindow, 200_000);
233
+ }
234
+ if (contextWindow && catalog?.maxOutputTokens) {
235
+ contextWindow = Math.max(0, contextWindow - catalog.maxOutputTokens);
236
+ }
237
+
153
238
  return {
154
239
  ok: true,
155
- flavor: sub.flavor,
156
- modelId,
157
- baseUrl,
158
- apiKey: auth.apiKey || '',
240
+ auth: {
241
+ flavor: sub.flavor,
242
+ modelId,
243
+ baseUrl,
244
+ apiKey: saved.apiKey || '',
245
+ maxOutputTokens: catalog?.maxOutputTokens,
246
+ maxTokensField: sub.maxTokensField,
247
+ includeStreamUsage: sub.noStreamUsage ? false : undefined,
248
+ contextWindow,
249
+ },
250
+ };
251
+ }
252
+
253
+ // ── Background sub-agents (Phase B — audit D4-1) ───────────────────────────
254
+
255
+ /** Inject a system-originated message into the parent's queue (task completion).
256
+ * Mirrors the Claude SDK's self-prompted continuation turn: no routing target
257
+ * is enqueued (channelManager only wraps USER pushes), so the continuation's
258
+ * bot:response meets an empty routing FIFO and falls through to the dashboard
259
+ * broadcast — exactly claude's behavior. pendingCount/busy are maintained so
260
+ * idle stays accurate and the recycler can't fire mid-continuation. No
261
+ * bot:typing (claude's continuation turns emit none either). */
262
+ function pushSyntheticMessage(conv: LiveConversation, text: string): void {
263
+ conv.busy = true;
264
+ conv.pendingCount += 1;
265
+ conv.inputQueue.push({ role: 'user', content: [{ type: 'text', text }] });
266
+ }
267
+
268
+ /** coder.txt advertises the claude toolset ("Read, Write, Edit, Bash, Glob,
269
+ * Grep") — swap in the child's REAL pi toolset so the sub-agent never chases
270
+ * tools it doesn't have (audit D4-4). claude keeps its richer line. */
271
+ function rewriteToolAccessLine(prompt: string, toolNames: string[]): string {
272
+ return prompt.replace(/You have full tool access:[^\n]*/i, `You have full tool access: ${toolNames.join(', ')}.`);
273
+ }
274
+
275
+ /** Compact human-readable descriptor of a child tool call for bot:task-progress. */
276
+ function toolCallSummary(name: string, input: any): string {
277
+ const tail = (p: any) => (typeof p === 'string' ? p.split('/').slice(-2).join('/') : '');
278
+ switch (name.toLowerCase()) {
279
+ case 'bash': return `Bash: ${String(input?.description || input?.command || '').slice(0, 80)}`;
280
+ case 'read': return `Reading ${tail(input?.file_path)}`;
281
+ case 'write': return `Writing ${tail(input?.file_path)}`;
282
+ case 'edit': return `Editing ${tail(input?.file_path)}`;
283
+ default: return name;
284
+ }
285
+ }
286
+
287
+ /**
288
+ * Per-conversation task host: spawns an in-process child `createPiSession`
289
+ * per Task call, translates child events into the `bot:task-*` vocabulary
290
+ * (payload fields exactly as claude.ts:443-484 emits them), and injects the
291
+ * completion back into the parent's queue for the "Done!" continuation turn.
292
+ */
293
+ function createTaskHost(conv: LiveConversation, getAuth: () => PiSessionAuth): PiTaskHost {
294
+ return {
295
+ spawn(req) {
296
+ const agents = buildAgents();
297
+ const cfg = agents[req.subagentType];
298
+ if (!cfg) {
299
+ return {
300
+ ok: false,
301
+ error: `Unknown subagent_type "${req.subagentType}". Available: ${Object.keys(agents).join(', ') || 'none'}.`,
302
+ };
303
+ }
304
+
305
+ const taskId = crypto.randomUUID().slice(0, 8);
306
+ const abortController = new AbortController();
307
+ const task: RunningTask = {
308
+ id: taskId,
309
+ description: req.description,
310
+ subagentType: req.subagentType,
311
+ abortController,
312
+ stopped: false,
313
+ startedAt: Date.now(),
314
+ };
315
+ conv.tasks.set(taskId, task);
316
+
317
+ // Honor the agent config's tool restrictions (claude applies these via
318
+ // the SDK's tools/disallowedTools options — e.g. a future researcher
319
+ // agent with disallowedTools: ['Write','Edit']).
320
+ let childTools = toolDefsForProvider({ forSubagent: true });
321
+ if (Array.isArray(cfg.tools) && cfg.tools.length > 0) {
322
+ childTools = childTools.filter((t) => cfg.tools.includes(t.name));
323
+ }
324
+ if (Array.isArray(cfg.disallowedTools) && cfg.disallowedTools.length > 0) {
325
+ childTools = childTools.filter((t) => !cfg.disallowedTools.includes(t.name));
326
+ }
327
+ const systemPrompt = rewriteToolAccessLine(String(cfg.prompt || ''), childTools.map((t) => t.name));
328
+
329
+ let summaryText = '';
330
+ let errorText = '';
331
+ let usedFileTools = false;
332
+ let toolUses = 0;
333
+ let lastUsage: { inputTokens?: number; outputTokens?: number; cacheReadTokens?: number; cacheCreationTokens?: number } | undefined;
334
+
335
+ const session = createPiSession({
336
+ getAuth,
337
+ systemPrompt,
338
+ tools: childTools,
339
+ cwd: WORKSPACE_DIR,
340
+ abortController,
341
+ maxToolRounds: typeof cfg.maxTurns === 'number' ? cfg.maxTurns : 50,
342
+ onEvent: (evt: PiSessionEvent) => {
343
+ switch (evt.type) {
344
+ case 'tool_use':
345
+ toolUses += 1;
346
+ conv.batcher.flush();
347
+ conv.onMessage('bot:task-progress', {
348
+ conversationId: conv.id,
349
+ taskId,
350
+ summary: toolCallSummary(evt.name, evt.input),
351
+ lastTool: evt.name,
352
+ usage: { tool_uses: toolUses, duration_ms: Date.now() - task.startedAt },
353
+ });
354
+ break;
355
+ case 'text_end':
356
+ summaryText = evt.text;
357
+ break;
358
+ case 'error':
359
+ errorText = evt.error;
360
+ break;
361
+ case 'turn_complete':
362
+ usedFileTools = usedFileTools || evt.usedFileTools;
363
+ if (evt.usage) lastUsage = evt.usage;
364
+ break;
365
+ }
366
+ },
367
+ });
368
+
369
+ const queue = createAsyncQueue<PiMessage>();
370
+ queue.push({ role: 'user', content: [{ type: 'text', text: req.prompt }] });
371
+ queue.end();
372
+
373
+ log.info(`[pi/task] ──── SUB-AGENT STARTED ──── id=${taskId} type=${req.subagentType} "${req.description}"`);
374
+ // Task events bypass translateAndEmit, so flush the token batcher first —
375
+ // bot:task-created COMMITS the dashboard stream buffer (useBlobyChat),
376
+ // and a batch flushed after it would mis-slice committedTextLength.
377
+ conv.batcher.flush();
378
+ conv.onMessage('bot:task-created', {
379
+ conversationId: conv.id,
380
+ taskId,
381
+ description: req.description,
382
+ type: req.subagentType,
383
+ });
384
+
385
+ void (async () => {
386
+ try {
387
+ await session.run(queue);
388
+ } catch (err: any) {
389
+ errorText = errorText || err?.message || String(err);
390
+ } finally {
391
+ conv.tasks.delete(taskId);
392
+ const status = task.stopped ? 'stopped' : (errorText && !summaryText ? 'failed' : 'completed');
393
+ const summary = summaryText || errorText || '(the agent produced no output)';
394
+ const u = lastUsage;
395
+ const totalTokens = u
396
+ ? (u.inputTokens || 0) + (u.outputTokens || 0) + (u.cacheReadTokens || 0) + (u.cacheCreationTokens || 0)
397
+ : 0;
398
+ log.info(
399
+ `[pi/task] ──── SUB-AGENT ${status.toUpperCase()} ──── id=${taskId} ` +
400
+ `tools=${toolUses} ${Math.round((Date.now() - task.startedAt) / 1000)}s summary=${summary.slice(0, 160)}`,
401
+ );
402
+ conv.batcher.flush();
403
+ conv.onMessage('bot:task-done', {
404
+ conversationId: conv.id,
405
+ taskId,
406
+ status,
407
+ summary,
408
+ usage: { tool_uses: toolUses, duration_ms: Date.now() - task.startedAt, total_tokens: totalTokens },
409
+ });
410
+ if (usedFileTools) conv.taskUsedFileTools = true;
411
+
412
+ // Drive the user-facing continuation turn — unless the conversation
413
+ // itself is gone (ended/recycled), in which case the report dies with
414
+ // it (claude parity: the SDK subprocess dies too).
415
+ if (liveConversations.get(conv.id) === conv && !conv.abortController.signal.aborted) {
416
+ const note = task.stopped
417
+ ? `[System: the background task "${req.description}" was stopped by the user. Acknowledge that briefly in your own voice — never mention agents, tasks, or system messages.]`
418
+ : `[System: background task "${req.description}" ${status}.]\n\nResult summary:\n${summary}\n\nRelay the outcome to the user concisely in your own voice (never mention agents, tasks, ids, or system messages). If it failed, say what went wrong and offer a next step.`;
419
+ pushSyntheticMessage(conv, note);
420
+ }
421
+ }
422
+ })();
423
+
424
+ return { ok: true, taskId };
425
+ },
159
426
  };
160
427
  }
161
428
 
@@ -208,14 +475,14 @@ export async function startConversation(
208
475
  endConversation(conversationId);
209
476
  }
210
477
 
211
- const auth = resolveAuth();
212
- if (!auth.ok) {
213
- log.warn(`[pi/conversation] Cannot start: ${auth.error}`);
214
- onMessage('bot:error', { conversationId, error: auth.error });
478
+ const resolved = resolveAuth();
479
+ if (!resolved.ok) {
480
+ log.warn(`[pi/conversation] Cannot start: ${resolved.error}`);
481
+ onMessage('bot:error', { conversationId, error: resolved.error });
215
482
  return false;
216
483
  }
217
484
 
218
- log.info(`[pi/conversation] Sub-provider: ${auth.flavor} · model: ${auth.modelId}`);
485
+ log.info(`[pi/conversation] Sub-provider: ${resolved.auth.flavor} · model: ${resolved.auth.modelId}`);
219
486
 
220
487
  const systemPrompt = await buildSystemPrompt(names, recentMessages);
221
488
  log.info(`[pi/conversation] System prompt: ${systemPrompt.length} chars`);
@@ -229,19 +496,31 @@ export async function startConversation(
229
496
  abortController,
230
497
  onMessage,
231
498
  busy: false,
499
+ pendingCount: 0,
500
+ batcher: createTokenBatcher((text) => onMessage('bot:token', { conversationId, token: text })),
501
+ tasks: new Map(),
502
+ taskUsedFileTools: false,
232
503
  loopDone: null,
233
504
  };
234
505
  liveConversations.set(conversationId, conv);
235
506
 
507
+ // Re-resolve auth on every provider round so a key/model fix in the wizard
508
+ // applies to the next round with full history intact (audit D6-8). Falls
509
+ // back to the last good bundle if pi-auth.json turns unreadable mid-session.
510
+ let currentAuth: PiSessionAuth = resolved.auth;
511
+ const getAuth = (): PiSessionAuth => {
512
+ const fresh = resolveAuth();
513
+ if (fresh.ok) currentAuth = fresh.auth;
514
+ return currentAuth;
515
+ };
516
+
236
517
  const session = createPiSession({
237
- flavor: auth.flavor,
238
- modelId: auth.modelId,
239
- baseUrl: auth.baseUrl,
240
- apiKey: auth.apiKey,
518
+ getAuth,
241
519
  systemPrompt,
242
520
  tools: toolDefsForProvider(),
243
521
  cwd: WORKSPACE_DIR,
244
522
  abortController,
523
+ taskHost: createTaskHost(conv, getAuth),
245
524
  onEvent: (evt: PiSessionEvent) => {
246
525
  translateAndEmit(conv, evt);
247
526
  },
@@ -258,6 +537,10 @@ export async function startConversation(
258
537
  }
259
538
  } finally {
260
539
  log.info(`[pi/conversation] Cleaning up conversation ${conversationId}`);
540
+ // Drop any unflushed token stragglers — at teardown the turn is either
541
+ // complete (already flushed before turn_complete) or aborted (tokens
542
+ // from an aborted stream must not surface after the fact).
543
+ conv.batcher.discard();
261
544
  liveConversations.delete(conversationId);
262
545
  onMessage('bot:conversation-ended', { conversationId });
263
546
  }
@@ -268,28 +551,86 @@ export async function startConversation(
268
551
 
269
552
  /** Map session-level events back into bloby's `bot:*` vocabulary. */
270
553
  function translateAndEmit(conv: LiveConversation, evt: PiSessionEvent) {
554
+ if (evt.type === 'text_delta') {
555
+ conv.batcher.add(evt.delta);
556
+ return;
557
+ }
558
+ // Any non-token event flushes the batch first — ordering (tokens before the
559
+ // tool chip / final response) and the streamed-text == bot:response
560
+ // invariant both depend on it.
561
+ conv.batcher.flush();
562
+
271
563
  switch (evt.type) {
272
564
  case 'turn_started':
273
565
  // No bloby event for this — `bot:typing` is already emitted by pushMessage().
274
566
  break;
275
- case 'text_delta':
276
- conv.onMessage('bot:token', { conversationId: conv.id, token: evt.delta });
277
- break;
278
567
  case 'text_end':
279
568
  conv.onMessage('bot:response', { conversationId: conv.id, content: evt.text });
280
569
  break;
281
- case 'tool_use':
282
- conv.onMessage('bot:tool', { conversationId: conv.id, name: evt.name, input: evt.input });
570
+ case 'tool_use': {
571
+ // House vocabulary: claude's delegation tool is named Task; the pi
572
+ // prompt's 'Agent' alias resolves to the same tool — normalize the
573
+ // event so consumers see one name.
574
+ const toolName = evt.name === 'Agent' || evt.name === 'agent' ? 'Task' : evt.name;
575
+ conv.onMessage('bot:tool', { conversationId: conv.id, name: toolName, input: evt.input });
283
576
  break;
284
- case 'turn_complete':
285
- conv.busy = false;
286
- conv.onMessage('bot:turn-complete', { conversationId: conv.id, usedFileTools: evt.usedFileTools });
287
- log.info(`[pi/conversation] ──── TURN COMPLETE ──── busy=false`);
577
+ }
578
+ case 'tool_result':
579
+ // Not surfaced yet (Phase D: translate to a bot:tool progress pulse).
288
580
  break;
289
- case 'error':
581
+ case 'turn_complete': {
290
582
  conv.busy = false;
291
- conv.onMessage('bot:error', { conversationId: conv.id, error: evt.error });
583
+ // One turn-complete per pushed message (D1-1 restored that invariant);
584
+ // idle gates the supervisor's proactive recycling so it never fires with
585
+ // a message still queued OR a background task still running — recycling
586
+ // mid-task would kill the task (claude has the same teardown semantics,
587
+ // but its idle flag doesn't guard tasks; this is strictly safer).
588
+ conv.pendingCount = Math.max(0, conv.pendingCount - 1);
589
+ const idle = conv.pendingCount === 0 && conv.tasks.size === 0;
590
+ // A finished background task's file edits restart the backend on the
591
+ // very next turn boundary (the continuation turn) — claude captures
592
+ // sub-agent tool_use blocks into the parent's usedTools the same way.
593
+ const usedFileTools = evt.usedFileTools || conv.taskUsedFileTools;
594
+ conv.taskUsedFileTools = false;
595
+ // Prompt occupancy of the last provider round — input + cache reads +
596
+ // cache writes, exactly claude.ts's contextTokens math. Output tokens
597
+ // are NOT added (claude doesn't either; the recycler's 70% threshold
598
+ // absorbs the next-turn growth).
599
+ const contextTokens = evt.usage
600
+ ? (evt.usage.inputTokens || 0) + (evt.usage.cacheReadTokens || 0) + (evt.usage.cacheCreationTokens || 0)
601
+ : 0;
602
+ conv.onMessage('bot:turn-complete', {
603
+ conversationId: conv.id,
604
+ usedFileTools,
605
+ contextTokens,
606
+ contextWindow: evt.contextWindow || 0,
607
+ idle,
608
+ });
609
+ log.info(`[pi/conversation] ──── TURN COMPLETE ──── busy=false ctx=${contextTokens}/${evt.contextWindow || 'n/a'} idle=${idle} tasks=${conv.tasks.size}`);
292
610
  break;
611
+ }
612
+ case 'error': {
613
+ // busy is NOT cleared here (audit D1-9): turn_complete is the single
614
+ // busy=false site and the session guarantees it on every non-aborted
615
+ // turn; an aborted/fatal path is torn down via bot:conversation-ended.
616
+ const fatal = evt.kind === 'auth' || evt.kind === 'context-overflow';
617
+ const remedy = evt.kind === 'context-overflow'
618
+ ? ' Starting a fresh session — send your message again to continue.'
619
+ : evt.kind === 'auth'
620
+ ? ' I\'ll reconnect with the new key as soon as it\'s saved.'
621
+ : '';
622
+ conv.onMessage('bot:error', { conversationId: conv.id, error: `${evt.error}${remedy}` });
623
+ if (fatal) {
624
+ // Unrecoverable for this session (audit D6-4): an over-window history
625
+ // would re-fail on every future turn, and a dead key has no business
626
+ // keeping the loop alive. Tear down — the finally emits
627
+ // bot:conversation-ended (routes + flags clear) and the next user
628
+ // message cold-starts a fresh session with re-injected history.
629
+ log.warn(`[pi/conversation] Fatal provider error (${evt.kind}) — recycling session ${conv.id}`);
630
+ endConversation(conv.id);
631
+ }
632
+ break;
633
+ }
293
634
  }
294
635
  }
295
636
 
@@ -305,8 +646,9 @@ export function pushMessage(
305
646
  return false;
306
647
  }
307
648
 
308
- log.info(`[pi/conversation] ──── PUSH MESSAGE ──── busy=${conv.busy}`);
649
+ log.info(`[pi/conversation] ──── PUSH MESSAGE ──── busy=${conv.busy} pending=${conv.pendingCount + 1}`);
309
650
  conv.busy = true;
651
+ conv.pendingCount += 1;
310
652
  conv.inputQueue.push(buildUserMessage(content, attachments, savedFiles));
311
653
  conv.onMessage('bot:typing', { conversationId });
312
654
  return true;
@@ -317,6 +659,15 @@ export function endConversation(conversationId: string): void {
317
659
  if (!conv) return;
318
660
 
319
661
  log.info(`[pi/conversation] ──── ENDING CONVERSATION ${conversationId} ────`);
662
+ // Background tasks die with the conversation (claude parity — the SDK
663
+ // subprocess takes its tasks down too). Their finallys still emit
664
+ // bot:task-done {status:'stopped'} so dashboard task cards don't spin
665
+ // forever; the completion injection is skipped (conv gone).
666
+ for (const task of conv.tasks.values()) {
667
+ task.stopped = true;
668
+ task.abortController.abort();
669
+ }
670
+ conv.batcher.discard();
320
671
  conv.inputQueue.end();
321
672
  conv.abortController.abort();
322
673
  liveConversations.delete(conversationId);
@@ -326,16 +677,29 @@ export function isConversationBusy(conversationId: string): boolean {
326
677
  return liveConversations.get(conversationId)?.busy || false;
327
678
  }
328
679
 
329
- /** True if ANY live conversation in this harness is mid-turn. Used by the supervisor to defer
330
- * backend restarts during channel/Alexa turns (which don't set the dashboard's agentQueryActive). */
680
+ /** True if ANY live conversation in this harness is mid-turn OR has a background
681
+ * sub-agent running. Used by the supervisor to defer backend restarts and
682
+ * self-updates — a restart mid-task would kill the task's work in flight. */
331
683
  export function anyConversationBusy(): boolean {
332
- for (const c of liveConversations.values()) if (c.busy) return true;
684
+ for (const c of liveConversations.values()) {
685
+ if (c.busy || c.tasks.size > 0) return true;
686
+ }
333
687
  return false;
334
688
  }
335
689
 
336
- /** Pi has no sub-agents yet; provided for interface compatibility. */
337
- export async function stopSubAgentTask(_conversationId: string, _taskId: string): Promise<void> {
338
- // no-op for Phase 1
690
+ /** Stop a specific background sub-agent task (dashboard user:stop-task). The
691
+ * child's teardown emits bot:task-done {status:'stopped'} and injects a brief
692
+ * acknowledgement turn into the parent. */
693
+ export async function stopSubAgentTask(conversationId: string, taskId: string): Promise<void> {
694
+ const conv = liveConversations.get(conversationId);
695
+ const task = conv?.tasks.get(taskId);
696
+ if (!task) {
697
+ log.warn(`[pi/task] Cannot stop task ${taskId} — not running in conversation ${conversationId}`);
698
+ return;
699
+ }
700
+ log.info(`[pi/task] Stopping sub-agent task ${taskId}`);
701
+ task.stopped = true;
702
+ task.abortController.abort();
339
703
  }
340
704
 
341
705
  /** Pi has no pre-warm step (no subprocess), but the interface requires this. */
@@ -373,21 +737,21 @@ export async function startBlobyAgentQuery(
373
737
  supportPrompt?: string,
374
738
  _maxTurns?: number,
375
739
  ): Promise<void> {
376
- const auth = resolveAuth();
377
- if (!auth.ok) {
378
- onMessage('bot:error', { conversationId, error: auth.error });
740
+ const resolved = resolveAuth();
741
+ if (!resolved.ok) {
742
+ onMessage('bot:error', { conversationId, error: resolved.error });
743
+ // bot:done frees the caller's slot (WhatsApp activeAgents / scheduler) — without it
744
+ // each distinct customer hitting this path pins one of the 5 concurrent slots until
745
+ // supervisor restart (audit D3-2; mirrors claude.ts:620).
746
+ onMessage('bot:done', { conversationId, usedFileTools: false });
379
747
  return;
380
748
  }
749
+ const auth = resolved.auth;
381
750
 
382
- const abortController = new AbortController();
383
- activeQueries.set(conversationId, abortController);
384
- // Hard watchdog — a hung provider stream would otherwise pin this query forever (finally never
385
- // runs, bot:done never fires). Abort after 5 min; cleared in the finally on normal completion.
386
- const watchdog = setTimeout(() => {
387
- log.warn(`[pi/bloby-agent] one-shot timed out (5m) — aborting conv=${conversationId}`);
388
- abortController.abort();
389
- }, 300_000);
390
-
751
+ // Build the prompt BEFORE registering in activeQueries / arming the watchdog
752
+ // (claude.ts ordering): if anything in here ever rejected after registration,
753
+ // the entry would leak forever — anyOneShotActive() stuck true defers every
754
+ // backend restart/self-update, and the caller's slot never frees.
391
755
  let systemPrompt: string;
392
756
  if (supportPrompt) {
393
757
  systemPrompt = supportPrompt;
@@ -398,11 +762,23 @@ export async function startBlobyAgentQuery(
398
762
  const messages: PiMessage[] = recentToPiMessages(recentMessages);
399
763
  messages.push(buildUserMessage(prompt, attachments, savedFiles));
400
764
 
765
+ const abortController = new AbortController();
766
+ activeQueries.set(conversationId, abortController);
767
+ // Hard watchdog — a hung provider stream would otherwise pin this query forever (finally never
768
+ // runs, bot:done never fires). Abort after 5 min; cleared in the finally on normal completion.
769
+ const watchdog = setTimeout(() => {
770
+ log.warn(`[pi/bloby-agent] one-shot timed out (5m) — aborting conv=${conversationId}`);
771
+ abortController.abort();
772
+ }, 300_000);
773
+
401
774
  onMessage('bot:typing', { conversationId });
402
775
 
403
776
  let accumulated = '';
404
777
  const usedTools = new Set<string>();
405
- let errored = false;
778
+ // Errors are stashed, not emitted inline — at the end, partial text wins
779
+ // over the error bubble (audit D3-5/D6-2, claude.ts:730-737 precedence).
780
+ let errorMsg: string | null = null;
781
+ const batcher = createTokenBatcher((text) => onMessage('bot:token', { conversationId, token: text }));
406
782
 
407
783
  try {
408
784
  const stream = streamProvider(auth.flavor, {
@@ -411,6 +787,9 @@ export async function startBlobyAgentQuery(
411
787
  apiKey: auth.apiKey,
412
788
  systemPrompt,
413
789
  messages,
790
+ maxOutputTokens: auth.maxOutputTokens,
791
+ maxTokensField: auth.maxTokensField,
792
+ includeStreamUsage: auth.includeStreamUsage,
414
793
  signal: abortController.signal,
415
794
  });
416
795
 
@@ -419,30 +798,46 @@ export async function startBlobyAgentQuery(
419
798
  switch (evt.type) {
420
799
  case 'text_delta':
421
800
  accumulated += evt.delta;
422
- onMessage('bot:token', { conversationId, token: evt.delta });
801
+ batcher.add(evt.delta);
423
802
  break;
424
803
  case 'text_end':
804
+ batcher.flush();
425
805
  accumulated = evt.text;
426
806
  break;
427
807
  case 'tool_use':
808
+ batcher.flush();
428
809
  usedTools.add(evt.name);
429
810
  onMessage('bot:tool', { conversationId, name: evt.name, input: evt.input });
430
811
  break;
431
812
  case 'error':
432
- errored = true;
433
- onMessage('bot:error', { conversationId, error: evt.error });
813
+ batcher.flush();
814
+ errorMsg = evt.error;
434
815
  break;
435
816
  }
436
817
  }
437
- if (accumulated && !errored) {
438
- onMessage('bot:response', { conversationId, content: accumulated });
818
+ // Abort guard (audit D3-8): a watchdog-aborted run must not surface a
819
+ // truncated reply — a stopped pulse could otherwise still fire <Message>
820
+ // pushes with half-finished content.
821
+ if (!abortController.signal.aborted) {
822
+ batcher.flush();
823
+ if (accumulated) {
824
+ onMessage('bot:response', { conversationId, content: accumulated });
825
+ } else if (errorMsg) {
826
+ onMessage('bot:error', { conversationId, error: errorMsg });
827
+ }
439
828
  }
440
829
  } catch (err: any) {
441
830
  if (!abortController.signal.aborted) {
442
831
  log.warn(`[pi/bloby-agent] one-shot error: ${err?.message || err}`);
443
- onMessage('bot:error', { conversationId, error: err?.message || String(err) });
832
+ batcher.flush();
833
+ if (accumulated) {
834
+ onMessage('bot:response', { conversationId, content: accumulated });
835
+ } else {
836
+ onMessage('bot:error', { conversationId, error: err?.message || String(err) });
837
+ }
444
838
  }
445
839
  } finally {
840
+ batcher.discard();
446
841
  clearTimeout(watchdog);
447
842
  activeQueries.delete(conversationId);
448
843
  const FILE_TOOL_NAMES = ['Write', 'Edit', 'write', 'edit'];
@@ -462,8 +857,9 @@ export function stopBlobyAgentQuery(conversationId: string): void {
462
857
  // ── Workspace agent endpoint (POST /api/agent/query) ──────────────────────
463
858
 
464
859
  export async function runAgentQuery(req: AgentQueryRequest): Promise<AgentQueryResult> {
465
- const auth = resolveAuth();
466
- if (!auth.ok) return { ok: false, error: auth.error };
860
+ const resolved = resolveAuth();
861
+ if (!resolved.ok) return { ok: false, error: resolved.error };
862
+ const auth = resolved.auth;
467
863
 
468
864
  const timeout = Math.min(Math.max(req.timeout || 120_000, 5_000), 300_000);
469
865
  const abortController = new AbortController();
@@ -487,6 +883,9 @@ export async function runAgentQuery(req: AgentQueryRequest): Promise<AgentQueryR
487
883
  apiKey: auth.apiKey,
488
884
  systemPrompt,
489
885
  messages,
886
+ maxOutputTokens: auth.maxOutputTokens,
887
+ maxTokensField: auth.maxTokensField,
888
+ includeStreamUsage: auth.includeStreamUsage,
490
889
  signal: abortController.signal,
491
890
  });
492
891
 
@@ -517,7 +916,10 @@ export async function runAgentQuery(req: AgentQueryRequest): Promise<AgentQueryR
517
916
  clearTimeout(timeoutHandle);
518
917
  }
519
918
 
520
- if (errored) return { ok: false, error: errorMsg || 'Agent query failed' };
919
+ // Partial-text precedence (claude parity, audit D6-2): if the model streamed
920
+ // anything before failing, return it as a successful (truncated) response —
921
+ // claude's runAgentQuery only reports the error when nothing streamed.
922
+ if (errored && !fullText) return { ok: false, error: errorMsg || 'Agent query failed' };
521
923
 
522
924
  const usedFileTools = ['Write', 'Edit', 'write', 'edit'].some((t) => usedTools.has(t));
523
925
  return { ok: true, response: fullText, toolsUsed: Array.from(usedTools), usedFileTools };