npm - bloby-bot - Versions diffs - 0.70.8 → 0.70.10 - Mend

bloby-bot 0.70.8 → 0.70.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38)

package/dist-bloby/assets/{bloby-CXmOcb1r.js → bloby-DSNB0g4w.js} +4 -4
package/dist-bloby/assets/{globals-DpO5tO92.js → globals-B3cTbITX.js} +1 -1
package/dist-bloby/assets/{highlighted-body-OFNGDK62-D7cU1Y-Z.js → highlighted-body-OFNGDK62-BLforpkr.js} +1 -1
package/dist-bloby/assets/mermaid-GHXKKRXX-C1H_fSCU.js +1 -0
package/dist-bloby/assets/{onboard-B96ELhXn.js → onboard-Dn2Ws_G2.js} +1 -1
package/dist-bloby/bloby.html +2 -2
package/dist-bloby/onboard.html +2 -2
package/package.json +1 -1
package/scripts/sync-pi-models.ts +37 -6
package/supervisor/chat/OnboardWizard.tsx +4 -4
package/supervisor/harnesses/pi/async-queue.ts +7 -11
package/supervisor/harnesses/pi/index.ts +475 -73
package/supervisor/harnesses/pi/models-catalog.generated.ts +840 -210
package/supervisor/harnesses/pi/providers/humanize-error.ts +125 -0
package/supervisor/harnesses/pi/providers/retry.ts +87 -0
package/supervisor/harnesses/pi/providers/stream-anthropic.ts +73 -11
package/supervisor/harnesses/pi/providers/stream-google.ts +15 -5
package/supervisor/harnesses/pi/providers/stream-openai-completions.ts +55 -19
package/supervisor/harnesses/pi/providers/types.ts +26 -1
package/supervisor/harnesses/pi/session.ts +179 -73
package/supervisor/harnesses/pi/sub-providers.ts +30 -1
package/supervisor/harnesses/pi/test-completion.ts +8 -2
package/supervisor/harnesses/pi/tools/registry.ts +25 -9
package/supervisor/harnesses/pi/tools/task.ts +108 -0
package/supervisor/harnesses/pi/tools/types.ts +15 -0
package/supervisor/index.ts +11 -10
package/supervisor/public/morphy_sad.mov +0 -0
package/supervisor/public/morphy_sad.webm +0 -0
package/supervisor/shell.ts +1 -1
package/supervisor/workspace-guard.js +1 -1
package/workspace/client/public/morphy_bounce.mov +0 -0
package/workspace/client/public/morphy_bounce.webm +0 -0
package/workspace/client/public/morphy_hi.mov +0 -0
package/workspace/client/public/morphy_hi.webm +0 -0
package/workspace/client/src/App.tsx +5 -3
package/dist-bloby/assets/mermaid-GHXKKRXX-D5YxphBn.js +0 -1
package/supervisor/public/what-happened.mp4 +0 -0
package/supervisor/public/what-happened.webm +0 -0

package/supervisor/harnesses/pi/session.ts CHANGED

@@ -4,25 +4,41 @@
  * Mirrors the *shape* of the Claude harness loop in `harnesses/claude.ts`:
  *   - one long-lived session per conversation
  *   - user messages arrive via an `AsyncQueue<PiMessage>` input
- *   - the loop drains the queue one turn at a time
+ *   - the loop drains the queue ONE MESSAGE PER TURN — exactly like the Claude
+ *     SDK's input queue: each pushed message gets its own turn, its own
+ *     text_end, and its own turn_complete. (An earlier design folded mid-turn
+ *     messages into the in-flight turn; that broke the channel manager's
+ *     one-response-per-push routing FIFO — see PI-PARITY-AUDIT-2026-06-11.md
+ *     D1-1 — so queued messages now simply wait for their own turn.)
  *   - each turn streams provider events back through a single `onEvent`
  *     callback the caller hooked up
  *
- * Phase 2: each user turn is an inner loop — provider call → if the model
- * asked for tool calls, execute them and feed results back → call provider
- * again — until the model finishes without requesting more tools. Tokens
- * stream live; `text_end` only fires once at the very end of the turn so the
- * UI doesn't display half-answers between tool rounds.
+ * Each user turn is an inner loop — provider call → if the model asked for
+ * tool calls, execute them and feed results back → call provider again — until
+ * the model finishes without requesting more tools. Tokens stream live;
+ * `text_end` only fires once at the very end of the turn so the UI doesn't
+ * display half-answers between tool rounds.
  *
- * Sub-agents are NOT spawned here — Bruno will add those later.
+ * Error precedence matches claude (audit D6-2): streamed partial text is
+ * always committed via `text_end` (the consumer persists it and the routing
+ * FIFO consumes normally); the `error` event fires only when a failed turn
+ * produced no text — except fatal kinds (auth / context-overflow), which are
+ * surfaced even after partial text so the harness can tear the session down.
+ *
+ * Auth (key/model/base URL/flavor) is resolved via `getAuth()` on every
+ * provider round (audit D6-8): fixing a revoked key or switching models in the
+ * wizard applies on the very next round, with full history intact.
+ *
+ * Sub-agents are NOT spawned here — Bruno will add those later (Phase B).
  */
 import { log } from '../../../shared/logger.js';
 import type { PiApiFlavor } from './sub-providers.js';
 import { streamProvider } from './providers/stream.js';
-import type { PiMessage, PiStreamEvent, PiToolDef, PiContentBlock } from './providers/types.js';
+import type { PiMessage, PiStreamEvent, PiToolDef, PiContentBlock, PiUsage, PiErrorKind } from './providers/types.js';
+import { sleep } from './providers/retry.js';
 import type { AsyncQueue } from './async-queue.js';
 import { findTool } from './tools/registry.js';
-import type { PiTool } from './tools/types.js';
+import type { PiTool, PiTaskHost } from './tools/types.js';
 export type PiSessionEvent =
   | { type: 'turn_started' }
@@ -30,14 +46,31 @@ export type PiSessionEvent =
   | { type: 'text_end'; text: string }
   | { type: 'tool_use'; id: string; name: string; input: any }
   | { type: 'tool_result'; toolUseId: string; name: string; isError?: boolean }
-  | { type: 'turn_complete'; usedFileTools: boolean }
-  | { type: 'error'; error: string };
+  | { type: 'turn_complete'; usedFileTools: boolean; usage?: PiUsage; contextWindow?: number }
+  | { type: 'error'; error: string; kind?: PiErrorKind };
-export interface PiSessionInit {
+/** Everything the providers need that can change while a session is alive. */
+export interface PiSessionAuth {
   flavor: PiApiFlavor;
   modelId: string;
   baseUrl: string;
   apiKey: string;
+  /** Per-model output cap from the catalog; providers fall back to safe defaults. */
+  maxOutputTokens?: number;
+  /** openai-completions only: which field carries the output cap (C-2). */
+  maxTokensField?: 'max_tokens' | 'max_completion_tokens';
+  /** openai-completions only: false for strict-schema vendors that 422 on stream_options. */
+  includeStreamUsage?: boolean;
+  /** Model context window from the catalog — reported on turn_complete for the recycler. */
+  contextWindow?: number;
+}
+export interface PiSessionInit {
+  /**
+   * Resolved on EVERY provider round (not captured once) so wizard-side
+   * key/model fixes heal a live conversation on the next round.
+   */
+  getAuth: () => PiSessionAuth;
   systemPrompt: string;
   /** Pre-loaded history before the first new user turn. */
   initialMessages?: PiMessage[];
@@ -45,7 +78,17 @@ export interface PiSessionInit {
   tools?: PiToolDef[];
   /** Resolved every time a tool fires (registry → run). */
   cwd: string;
-  maxOutputTokens?: number;
+  /**
+   * Background sub-agent host (Phase B). Set only on PARENT live sessions —
+   * threaded into PiToolContext so the Task tool can spawn; child sessions
+   * leave it unset (no grandchildren, Claude SDK parity).
+   */
+  taskHost?: PiTaskHost;
+  /**
+   * Per-turn tool-round budget. Parents keep the default; sub-agent children
+   * get their agent config's maxTurns (e.g. coder: 50).
+   */
+  maxToolRounds?: number;
   /** Used to interrupt in-flight provider calls when the session ends. */
   abortController: AbortController;
   /** Caller's event sink — translated to bloby's `bot:*` events one layer up. */
@@ -61,28 +104,45 @@ export interface PiSession {
 const FILE_TOOL_NAMES = new Set(['Write', 'Edit', 'MultiEdit', 'NotebookEdit', 'write', 'edit', 'multiEdit', 'notebookEdit']);
 const MAX_TOOL_ROUNDS = 25;
+/** Transparent re-runs of a failed round that produced nothing (audit D6-1). */
+const MAX_ROUND_RETRIES = 2;
 export function createPiSession(init: PiSessionInit): PiSession {
   const messages: PiMessage[] = init.initialMessages ? [...init.initialMessages] : [];
+  // Last provider-reported usage + window, session-scoped so even an errored
+  // turn's turn_complete carries the most recent context occupancy (D2-1).
+  let lastUsage: PiUsage | undefined;
+  let lastContextWindow: number | undefined;
   /** One stream round — collect the assistant blocks the model emits this pass. */
   interface RoundResult {
     text: string;
     toolUses: { id: string; name: string; input: any; thoughtSignature?: string }[];
     errored: boolean;
+    /** Stashed, NOT emitted inline — the turn decides response-vs-error precedence (D6-2). */
+    errorMsg?: string;
+    errorKind?: PiErrorKind;
+    /** True when re-sending the identical round can plausibly succeed (429/5xx/network). */
+    retryable?: boolean;
   }
-  async function runOneRound(): Promise<RoundResult> {
+  async function runOneRound(emitSeparatorFirst: boolean): Promise<RoundResult> {
     const result: RoundResult = { text: '', toolUses: [], errored: false };
+    let firstDelta = true;
     try {
-      const stream = streamProvider(init.flavor, {
-        modelId: init.modelId,
-        baseUrl: init.baseUrl,
-        apiKey: init.apiKey,
+      const auth = init.getAuth();
+      lastContextWindow = auth.contextWindow ?? lastContextWindow;
+      const stream = streamProvider(auth.flavor, {
+        modelId: auth.modelId,
+        baseUrl: auth.baseUrl,
+        apiKey: auth.apiKey,
         systemPrompt: init.systemPrompt,
         messages,
         tools: init.tools,
-        maxOutputTokens: init.maxOutputTokens,
+        maxOutputTokens: auth.maxOutputTokens,
+        maxTokensField: auth.maxTokensField,
+        includeStreamUsage: auth.includeStreamUsage,
         signal: init.abortController.signal,
       });
@@ -90,6 +150,14 @@ export function createPiSession(init: PiSessionInit): PiSession {
         if (init.abortController.signal.aborted) break;
         switch (evt.type) {
           case 'text_delta':
+            // Round separator rides BEFORE the new round's first token —
+            // claude.ts:374-379 ordering — so the streamed bytes stay a true
+            // prefix of the final bot:response even when the dashboard commits
+            // the buffer at a tool boundary mid-turn (audit D1-5/PI-SES-1).
+            if (firstDelta && emitSeparatorFirst) {
+              init.onEvent({ type: 'text_delta', delta: '\n\n' });
+            }
+            firstDelta = false;
             result.text += evt.delta;
             init.onEvent({ type: 'text_delta', delta: evt.delta });
             break;
@@ -110,17 +178,24 @@ export function createPiSession(init: PiSessionInit): PiSession {
             break;
           case 'error':
             result.errored = true;
-            init.onEvent({ type: 'error', error: evt.error });
+            result.errorMsg = evt.error;
+            result.errorKind = evt.kind;
+            result.retryable = evt.retryable;
             break;
           case 'done':
-            // Loop control is by tool_use presence, not stop reason.
+            // Loop control is by tool_use presence, not stop reason — but the
+            // usage rides here and feeds the supervisor's session recycling.
+            if (evt.usage) lastUsage = evt.usage;
             break;
         }
       }
     } catch (err: any) {
       if (!init.abortController.signal.aborted) {
         result.errored = true;
-        init.onEvent({ type: 'error', error: err?.message || String(err) });
+        result.errorMsg = err?.message || String(err);
+        // A throw mid-iteration is a network/stream failure — transient.
+        result.errorKind = 'transient';
+        result.retryable = true;
       }
     }
     return result;
@@ -135,51 +210,87 @@ export function createPiSession(init: PiSessionInit): PiSession {
       };
     }
     try {
-      return await tool.run(call.input, { cwd: init.cwd, signal: init.abortController.signal });
+      return await tool.run(call.input, { cwd: init.cwd, signal: init.abortController.signal, tasks: init.taskHost });
     } catch (err: any) {
       return { output: `Tool ${call.name} threw: ${err?.message || err}`, isError: true };
     }
   }
-  async function runOneTurn(input: AsyncQueue<PiMessage>, firstUserMsg: PiMessage): Promise<void> {
+  async function runOneTurn(userMsg: PiMessage): Promise<void> {
     if (init.abortController.signal.aborted) return;
-    // Stack any messages that arrived in the same millisecond into one turn.
-    messages.push(firstUserMsg);
-    for (const extra of input.drainPending()) messages.push(extra);
+    // ONE message per turn — queued messages wait for their own turn so each
+    // push gets its own bot:response (routing-FIFO invariant, audit D1-1).
+    messages.push(userMsg);
     init.onEvent({ type: 'turn_started' });
     let accumulatedText = '';
     const usedTools = new Set<string>();
     let turnErrored = false;
-    let pendingInterleave = false;
+    let turnErrorMsg: string | undefined;
+    let turnErrorKind: PiErrorKind | undefined;
-    for (let round = 0; round < MAX_TOOL_ROUNDS; round++) {
+    const maxRounds = Math.max(1, init.maxToolRounds ?? MAX_TOOL_ROUNDS);
+    for (let round = 0; round < maxRounds; round++) {
       if (init.abortController.signal.aborted) break;
-      const { text, toolUses, errored } = await runOneRound();
+      // The separator condition is decided BEFORE the round so the round can
+      // emit it ahead of its first token (claude.ts ordering — see runOneRound).
+      const needsSeparator = accumulatedText.length > 0 && !accumulatedText.endsWith('\n');
+      let res = await runOneRound(needsSeparator);
+      // Transparent round retry (D6-1): a transient failure that produced
+      // NOTHING is safe to re-run — requests are stateless full-history
+      // resends. Never retry a round that already streamed text or tool calls.
+      for (
+        let attempt = 0;
+        attempt < MAX_ROUND_RETRIES &&
+        res.errored && res.retryable && !res.text && res.toolUses.length === 0 &&
+        !init.abortController.signal.aborted;
+        attempt++
+      ) {
+        log.info(`[pi/session] transient round failure — retrying (${attempt + 1}/${MAX_ROUND_RETRIES}): ${res.errorMsg?.slice(0, 160)}`);
+        try { await sleep(1000 * 2 ** attempt, init.abortController.signal); } catch { break; }
+        res = await runOneRound(needsSeparator);
+      }
+      const { text, toolUses, errored } = res;
       // Append whatever the model produced this round to history so subsequent
       // rounds (and the next user turn) see it.
       const assistantContent: PiContentBlock[] = [];
       if (text) {
-        accumulatedText += (accumulatedText && !accumulatedText.endsWith('\n') ? '\n\n' : '') + text;
+        // Matches the separator runOneRound streamed before this round's
+        // first delta — accumulatedText and the token stream stay byte-equal.
+        if (needsSeparator) accumulatedText += '\n\n';
+        accumulatedText += text;
         assistantContent.push({ type: 'text', text });
       }
-      for (const tu of toolUses) {
-        assistantContent.push({
-          type: 'tool_use',
-          id: tu.id,
-          name: tu.name,
-          input: tu.input,
-          // Forward Gemini's thoughtSignature unchanged so the next turn's
-          // request echoes it back; without it the API rejects with 400.
-          thoughtSignature: tu.thoughtSignature,
-        });
+      if (!errored) {
+        // On an errored round, keep the text but DROP the round's tool_use
+        // blocks: the turn ends before executing them, and a dangling
+        // tool_use with no tool_result poisons the history (Anthropic and
+        // Gemini reject the next request outright).
+        for (const tu of toolUses) {
+          assistantContent.push({
+            type: 'tool_use',
+            id: tu.id,
+            name: tu.name,
+            input: tu.input,
+            // Forward Gemini's thoughtSignature unchanged so the next turn's
+            // request echoes it back; without it the API rejects with 400.
+            thoughtSignature: tu.thoughtSignature,
+          });
+        }
       }
       if (assistantContent.length > 0) {
         messages.push({ role: 'assistant', content: assistantContent });
       }
-      if (errored) { turnErrored = true; break; }
+      if (errored) {
+        turnErrored = true;
+        turnErrorMsg = res.errorMsg;
+        turnErrorKind = res.errorKind;
+        break;
+      }
       // Run every tool the model asked for this round, then feed the results
       // back as a single user message Gemini accepts as a batch.
@@ -188,50 +299,45 @@ export function createPiSession(init: PiSessionInit): PiSession {
         usedTools.add(tu.name);
         if (init.abortController.signal.aborted) break;
         log.info(`[pi/session] tool call ${tu.name}(${JSON.stringify(tu.input).slice(0, 200)})`);
-        const res = await executeTool(tu);
-        init.onEvent({ type: 'tool_result', toolUseId: tu.id, name: tu.name, isError: !!res.isError });
+        const res2 = await executeTool(tu);
+        init.onEvent({ type: 'tool_result', toolUseId: tu.id, name: tu.name, isError: !!res2.isError });
         toolResultBlocks.push({
           type: 'tool_result',
           toolUseId: tu.id,
-          content: res.output,
-          isError: res.isError,
+          content: res2.output,
+          isError: res2.isError,
         });
       }
       if (toolResultBlocks.length > 0) {
         messages.push({ role: 'user', content: toolResultBlocks });
       }
-      // Fold any user messages that arrived during this round into history so
-      // the next stream pass sees them. This is what makes the conversation
-      // feel alive: while the agent is grinding on a long task, a question
-      // typed mid-stream lands in the very next request as a user-role part,
-      // and the model can answer it inline before continuing.
-      const interleaved = input.drainPending();
-      if (interleaved.length > 0) {
-        log.info(`[pi/session] interleaved ${interleaved.length} mid-turn user message(s) into history`);
-        for (const m of interleaved) messages.push(m);
-        pendingInterleave = true;
-      } else {
-        pendingInterleave = false;
-      }
-      // Exit when the model has nothing more to do AND no new user messages
-      // arrived mid-round. Either condition by itself keeps the loop alive.
-      if (toolUses.length === 0 && !pendingInterleave) break;
+      // No tool calls ⇒ the model is done with this turn.
+      if (toolUses.length === 0) break;
     }
-    // Emit text_end only on a clean turn (don't persist a half-baked answer from an errored
-    // turn). But ALWAYS emit turn_complete on a non-aborted turn — including the errored path
-    // — so the supervisor clears agentQueryActive (set on turn_started). Skipping it on error
-    // wedged the flag true: backend auto-heal stayed deferred and chat stuck in "typing" until
-    // the next successful turn. The 'error' event was already emitted by runOneRound, so the
-    // user still sees the failure. Aborted turns are torn down via bot:conversation-ended.
+    // Turn-end emission order (audit D6-2, mirrors claude.ts:394-401):
+    //   1. text_end whenever ANY text streamed — even on errored turns, so the
+    //      partial the user watched is committed, persisted, and consumes its
+    //      routing-FIFO entry (the frontend's bot:error handler would
+    //      otherwise erase it).
+    //   2. error only when the turn produced no text — EXCEPT fatal kinds
+    //      (auth / context-overflow), which must surface regardless so the
+    //      harness tears the poisoned session down.
+    //   3. turn_complete ALWAYS on a non-aborted turn — including errored
+    //      paths — so the supervisor clears agentQueryActive. Skipping it
+    //      wedged the flag true historically. Aborted turns are torn down via
+    //      bot:conversation-ended.
     if (!init.abortController.signal.aborted) {
-      if (!turnErrored && accumulatedText) {
+      if (accumulatedText) {
         init.onEvent({ type: 'text_end', text: accumulatedText });
       }
+      const fatal = turnErrorKind === 'auth' || turnErrorKind === 'context-overflow';
+      if (turnErrored && (!accumulatedText || fatal)) {
+        init.onEvent({ type: 'error', error: turnErrorMsg || 'Provider turn failed', kind: turnErrorKind });
+      }
       const usedFileTools = Array.from(usedTools).some((t) => FILE_TOOL_NAMES.has(t));
-      init.onEvent({ type: 'turn_complete', usedFileTools });
+      init.onEvent({ type: 'turn_complete', usedFileTools, usage: lastUsage, contextWindow: lastContextWindow });
     }
   }
@@ -240,7 +346,7 @@ export function createPiSession(init: PiSessionInit): PiSession {
       for await (const userMsg of input) {
         if (init.abortController.signal.aborted) break;
         try {
-          await runOneTurn(input, userMsg);
+          await runOneTurn(userMsg);
         } catch (err: any) {
           log.warn(`[pi/session] Turn failed: ${err?.message || err}`);
           init.onEvent({ type: 'error', error: err?.message || String(err) });
@@ -248,7 +354,7 @@ export function createPiSession(init: PiSessionInit): PiSession {
           // and chat aren't wedged. Skip when aborting (teardown emits conversation-ended).
           // usedFileTools=false is the safe default (it only governs whether to auto-restart now).
           if (!init.abortController.signal.aborted) {
-            init.onEvent({ type: 'turn_complete', usedFileTools: false });
+            init.onEvent({ type: 'turn_complete', usedFileTools: false, usage: lastUsage, contextWindow: lastContextWindow });
           }
         }
       }

package/supervisor/harnesses/pi/sub-providers.ts CHANGED

@@ -15,7 +15,7 @@
  * synced from upstream pi via `npm run sync:pi-models`. Sub-providers without
  * a pi mapping (Ollama, LM Studio, custom) stay `'dynamic'` — free-form ID.
  */
-import { PI_MODELS_CATALOG } from './models-catalog.generated.js';
+import { PI_MODELS_CATALOG, type PiCatalogModel } from './models-catalog.generated.js';
 export type PiApiFlavor = 'openai-completions' | 'anthropic-messages' | 'google-gemini';
@@ -24,6 +24,16 @@ export interface PiSubProviderModel {
   label: string;
 }
+/**
+ * Catalog metadata for a saved sub-provider + model pair. Drives the per-model
+ * output cap (C-5), the context-window figure the supervisor's recycler needs
+ * (D2-1), and — later — the vision gate. Returns undefined for dynamic
+ * sub-providers (OpenRouter/Ollama/LM Studio/custom) and unknown model ids.
+ */
+export function getCatalogModel(subProviderId: string, modelId: string): PiCatalogModel | undefined {
+  return PI_MODELS_CATALOG[subProviderId]?.find((m) => m.id === modelId);
+}
 export interface PiSubProvider {
   id: string;
   name: string;
@@ -41,6 +51,23 @@ export interface PiSubProvider {
   models: PiSubProviderModel[] | 'dynamic';
   /** Default model selection when the user hasn't picked one. */
   defaultModel?: string;
+  /**
+   * openai-completions flavor only: which request field carries the output
+   * cap. OpenAI's reasoning models (gpt-5.x, o-series — 31 of 37 catalog
+   * entries) reject the legacy `max_tokens` with HTTP 400;
+   * `max_completion_tokens` is accepted by ALL OpenAI models, so the
+   * openai-api entry opts in. Other vendors stay on `max_tokens`, matching
+   * their current working behavior.
+   */
+  maxTokensField?: 'max_tokens' | 'max_completion_tokens';
+  /**
+   * openai-completions flavor only: set true for vendors whose request schema
+   * rejects unknown fields — Mistral 422s ("Extra inputs are not permitted")
+   * on `stream_options`, so it must not receive the include_usage opt-in.
+   * (Mistral still sends usage in the final streamed chunk by default, so the
+   * provider's chunk.usage read keeps working without it.)
+   */
+  noStreamUsage?: boolean;
 }
 function fromCatalog(key: string): PiSubProviderModel[] | 'dynamic' {
@@ -130,6 +157,7 @@ export const PI_SUB_PROVIDERS: PiSubProvider[] = [
     apiKeyUrl: 'https://console.mistral.ai/api-keys/',
     models: fromCatalog('mistral'),
     defaultModel: defaultFor('mistral'),
+    noStreamUsage: true,
   },
   {
     id: 'openai-api',
@@ -141,6 +169,7 @@ export const PI_SUB_PROVIDERS: PiSubProvider[] = [
     apiKeyUrl: 'https://platform.openai.com/api-keys',
     models: fromCatalog('openai-api'),
     defaultModel: defaultFor('openai-api'),
+    maxTokensField: 'max_completion_tokens',
   },
   {
     id: 'anthropic-api',

package/supervisor/harnesses/pi/test-completion.ts CHANGED

@@ -74,6 +74,7 @@ export async function runPiTestCompletion(input: PiTestCompletionInput): Promise
       modelId,
       apiKey: input.apiKey?.trim() || '',
       prompt: input.prompt,
+      maxTokensField: provider.maxTokensField,
     });
     return { ok: true, text, modelId, subProvider: provider.id };
   } catch (err: any) {
@@ -92,6 +93,8 @@ interface DispatchArgs {
   modelId: string;
   apiKey: string;
   prompt: string;
+  /** openai-completions only — gpt-5.x/o-series reject the legacy max_tokens (C-2). */
+  maxTokensField?: 'max_tokens' | 'max_completion_tokens';
 }
 async function callByFlavor(flavor: PiApiFlavor, args: DispatchArgs): Promise<string> {
@@ -107,17 +110,20 @@ async function callByFlavor(flavor: PiApiFlavor, args: DispatchArgs): Promise<st
 /* ── OpenAI / OpenAI-compatible ── */
-async function callOpenAICompletions({ baseUrl, modelId, apiKey, prompt }: DispatchArgs): Promise<string> {
+async function callOpenAICompletions({ baseUrl, modelId, apiKey, prompt, maxTokensField }: DispatchArgs): Promise<string> {
   const headers: Record<string, string> = { 'content-type': 'application/json' };
   if (apiKey) headers['authorization'] = `Bearer ${apiKey}`;
+  // Reasoning models (gpt-5.x/o-series) spend the budget on hidden reasoning
+  // first — 256 would come back as an empty message, failing a valid key.
+  const capField = maxTokensField ?? 'max_tokens';
   const res = await timedFetch(`${baseUrl}/chat/completions`, {
     method: 'POST',
     headers,
     body: JSON.stringify({
       model: modelId,
       messages: [{ role: 'user', content: prompt }],
-      max_tokens: 256,
+      [capField]: capField === 'max_completion_tokens' ? 2048 : 256,
       stream: false,
     }),
   });

package/supervisor/harnesses/pi/tools/registry.ts CHANGED

@@ -1,8 +1,10 @@
 /**
  * Tool registry — the bag of tools the pi session passes to the model.
  *
- * Phase 2 ships the four core coding tools. Phase 3 or later will add Grep,
- * Glob, LS, NotebookEdit, etc. so the surface fully matches Claude SDK's.
+ * Read/Write/Edit/Bash mirror the Claude SDK tools; Task is the background
+ * sub-agent delegator (Phase B of the parity plan). Grep, Glob, LS,
+ * NotebookEdit etc. are still pending (Phase D) to fully match Claude SDK's
+ * surface.
  */
 import type { PiTool } from './types.js';
 import type { PiToolDef } from '../providers/types.js';
@@ -10,8 +12,9 @@ import { readTool } from './read.js';
 import { writeTool } from './write.js';
 import { editTool } from './edit.js';
 import { bashTool } from './bash.js';
+import { taskTool, taskToolDef } from './task.js';
-export const PI_TOOLS: PiTool[] = [readTool, writeTool, editTool, bashTool];
+export const PI_TOOLS: PiTool[] = [readTool, writeTool, editTool, bashTool, taskTool];
 const TOOL_BY_NAME = new Map<string, PiTool>();
 for (const t of PI_TOOLS) {
@@ -20,15 +23,28 @@ for (const t of PI_TOOLS) {
   // common aliases so we don't 404 a legitimate call over a casing nit.
   TOOL_BY_NAME.set(t.name.toLowerCase(), t);
 }
+// The pi system prompt calls background delegation "the Agent tool" (claude
+// heritage) — alias it so a model following the prompt verbatim still lands
+// on the Task implementation.
+TOOL_BY_NAME.set('Agent', taskTool);
+TOOL_BY_NAME.set('agent', taskTool);
 export function findTool(name: string): PiTool | undefined {
   return TOOL_BY_NAME.get(name) || TOOL_BY_NAME.get(name.toLowerCase());
 }
-export function toolDefsForProvider(): PiToolDef[] {
-  return PI_TOOLS.map((t) => ({
-    name: t.name,
-    description: t.description,
-    inputSchema: t.inputSchema,
-  }));
+export function toolDefsForProvider(opts?: { forSubagent?: boolean }): PiToolDef[] {
+  const defs: PiToolDef[] = [];
+  for (const t of PI_TOOLS) {
+    if (t.name === 'Task') {
+      // Children cannot spawn grandchildren (Claude SDK parity) — a child that
+      // hallucinates a Task call still fails gracefully (ctx.tasks is unset).
+      if (opts?.forSubagent) continue;
+      // Rebuilt fresh so agent-roster/prompt edits apply per session start.
+      defs.push(taskToolDef());
+      continue;
+    }
+    defs.push({ name: t.name, description: t.description, inputSchema: t.inputSchema });
+  }
+  return defs;
 }