npm - bloby-bot - Versions diffs - 0.70.8 → 0.70.10 - Mend

bloby-bot 0.70.8 → 0.70.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

package/dist-bloby/assets/{bloby-CXmOcb1r.js → bloby-DSNB0g4w.js} +4 -4
package/dist-bloby/assets/{globals-DpO5tO92.js → globals-B3cTbITX.js} +1 -1
package/dist-bloby/assets/{highlighted-body-OFNGDK62-D7cU1Y-Z.js → highlighted-body-OFNGDK62-BLforpkr.js} +1 -1
package/dist-bloby/assets/mermaid-GHXKKRXX-C1H_fSCU.js +1 -0
package/dist-bloby/assets/{onboard-B96ELhXn.js → onboard-Dn2Ws_G2.js} +1 -1
package/dist-bloby/bloby.html +2 -2
package/dist-bloby/onboard.html +2 -2
package/package.json +1 -1
package/scripts/sync-pi-models.ts +37 -6
package/supervisor/chat/OnboardWizard.tsx +4 -4
package/supervisor/harnesses/pi/async-queue.ts +7 -11
package/supervisor/harnesses/pi/index.ts +475 -73
package/supervisor/harnesses/pi/models-catalog.generated.ts +840 -210
package/supervisor/harnesses/pi/providers/humanize-error.ts +125 -0
package/supervisor/harnesses/pi/providers/retry.ts +87 -0
package/supervisor/harnesses/pi/providers/stream-anthropic.ts +73 -11
package/supervisor/harnesses/pi/providers/stream-google.ts +15 -5
package/supervisor/harnesses/pi/providers/stream-openai-completions.ts +55 -19
package/supervisor/harnesses/pi/providers/types.ts +26 -1
package/supervisor/harnesses/pi/session.ts +179 -73
package/supervisor/harnesses/pi/sub-providers.ts +30 -1
package/supervisor/harnesses/pi/test-completion.ts +8 -2
package/supervisor/harnesses/pi/tools/registry.ts +25 -9
package/supervisor/harnesses/pi/tools/task.ts +108 -0
package/supervisor/harnesses/pi/tools/types.ts +15 -0
package/supervisor/index.ts +11 -10
package/supervisor/public/morphy_sad.mov +0 -0
package/supervisor/public/morphy_sad.webm +0 -0
package/supervisor/shell.ts +1 -1
package/supervisor/workspace-guard.js +1 -1
package/workspace/client/public/morphy_bounce.mov +0 -0
package/workspace/client/public/morphy_bounce.webm +0 -0
package/workspace/client/public/morphy_hi.mov +0 -0
package/workspace/client/public/morphy_hi.webm +0 -0
package/workspace/client/src/App.tsx +5 -3
package/dist-bloby/assets/mermaid-GHXKKRXX-D5YxphBn.js +0 -1
package/supervisor/public/what-happened.mp4 +0 -0
package/supervisor/public/what-happened.webm +0 -0

package/supervisor/harnesses/pi/providers/humanize-error.ts ADDED Viewed

@@ -0,0 +1,125 @@
+/**
+ * Provider error classification + humanization for the pi harness.
+ *
+ * Every non-OK HTTP response and network failure from the three stream
+ * providers funnels through here so the user sees an actionable message
+ * ("update your key in the dashboard") instead of a raw JSON wall, and so the
+ * session/harness can react structurally: retry `retryable` rounds, tear the
+ * conversation down on `auth` / `context-overflow` (a poisoned history would
+ * otherwise re-fail forever).
+ *
+ * Mirrors the codex harness's codexErrorInfo mapping (house standard M4).
+ */
+import type { PiErrorKind } from './types.js';
+/** Outcome of classifying a provider failure, as returned by the classifiers in this module. */
+export interface ClassifiedPiError {
+  /** Coarse category of the failure. */
+  kind: PiErrorKind;
+  /** True when re-sending the identical request can plausibly succeed. */
+  retryable: boolean;
+  /** HTTP status, when one was passed to the classifier. */
+  status?: number;
+  /** User-facing text: a friendly explanation plus a one-line raw detail for debugging. */
+  message: string;
+}
+/**
+ * Reduce a raw error body to a short detail string of at most 300 characters.
+ *
+ * When the body is JSON, the first non-empty candidate wins:
+ * `error.message` (the nesting Google/OpenAI/Anthropic use), then a
+ * top-level `message` (some OpenAI-compat vendors), then a string-valued
+ * `error`. Anything else — non-JSON text, or JSON without a usable string —
+ * falls back to the trimmed body itself.
+ */
+function extractDetail(body: string): string {
+  const raw = (body || '').trim();
+  if (raw.length === 0) return '';
+  const fallback = raw.slice(0, 300);
+
+  let parsed: any;
+  try {
+    parsed = JSON.parse(raw);
+  } catch {
+    // Not JSON at all: show the start of the raw text.
+    return fallback;
+  }
+
+  const candidate =
+    parsed?.error?.message || parsed?.message || (typeof parsed?.error === 'string' ? parsed.error : '');
+  if (typeof candidate === 'string') {
+    const cleaned = candidate.trim();
+    if (cleaned) return cleaned.slice(0, 300);
+  }
+  return fallback;
+}
+// Case-insensitive phrases that mark a "prompt too large" rejection; tested
+// against the raw error body by classifyPiError.
+const CONTEXT_OVERFLOW_RE =
+  /context.length|context_length_exceeded|maximum context length|prompt is too long|too many tokens|input token count.*exceed|token count exceeds|exceeds the maximum number of tokens|request exceeds the.*token|exceeds? (the )?context limit|input length and .{0,3}max_tokens/i;
+// Case-insensitive phrases that mark a rejected or missing credential; tested
+// against the raw error body by classifyPiError.
+const AUTH_RE =
+  /api key not valid|invalid api key|invalid x-api-key|incorrect api key|invalid_api_key|authentication[_ ]error|permission_error|invalid bearer token|no auth credentials/i;
+// Deliberately narrow: only unambiguous out-of-credit markers. Gemini's
+// routine per-minute 429 says "check your plan and billing details" — that is
+// a RATE LIMIT (retryable), not billing; OpenAI's true quota exhaustion is
+// distinguished by the insufficient_quota code (absent from Gemini bodies).
+const BILLING_RE =
+  /insufficient_quota|credit balance is too low|payment required|purchase more credits/i;
+/**
+ * Classify a provider failure that carries an error body (a non-OK HTTP
+ * response, or an error payload) into a kind, a retryable flag and a
+ * user-facing message.
+ *
+ * The patterns are tested against the full `body`; the message only carries
+ * the shortened detail produced by extractDetail.
+ *
+ * @param providerLabel Display name interpolated into the message.
+ * @param status HTTP status, or undefined when there is none.
+ * @param statusText HTTP reason phrase; may be empty.
+ * @param body Raw error body; may be empty.
+ * @returns The classification, with `status` echoed back unchanged.
+ */
+export function classifyPiError(
+  providerLabel: string,
+  status: number | undefined,
+  statusText: string,
+  body: string,
+): ClassifiedPiError {
+  const detail = extractDetail(body);
+  const suffix = detail ? ` (${detail})` : '';
+  // Order matters: overflow and billing hide behind generic 400/429 statuses.
+  if ((status === 400 || status === 413) && CONTEXT_OVERFLOW_RE.test(body)) {
+    return {
+      kind: 'context-overflow',
+      retryable: false,
+      status,
+      message: `The conversation has outgrown ${providerLabel}'s context window.${suffix}`,
+    };
+  }
+  if (BILLING_RE.test(body) || status === 402) {
+    return {
+      kind: 'billing',
+      retryable: false,
+      status,
+      message: `${providerLabel} reports a quota/billing problem — check your plan or credits on the provider's console.${suffix}`,
+    };
+  }
+  // 401 is always auth; 403 only when the body says so — vendors also use 403
+  // for per-message moderation/guardrail blocks (e.g. OpenRouter), which must
+  // NOT be classified auth (auth is a fatal kind that recycles the session).
+  if (status === 401 || AUTH_RE.test(body)) {
+    return {
+      kind: 'auth',
+      retryable: false,
+      status,
+      message: `${providerLabel} rejected your API key. Update it from the dashboard (Bloby provider settings).${suffix}`,
+    };
+  }
+  if (status === 429) {
+    return {
+      kind: 'rate-limit',
+      retryable: true,
+      status,
+      message: `${providerLabel} rate limit reached — give it a moment and try again.${suffix}`,
+    };
+  }
+  if (status === 408 || (status !== undefined && status >= 500)) {
+    return {
+      kind: 'transient',
+      retryable: true,
+      status,
+      message: `${providerLabel} is having trouble right now (HTTP ${status}) — try again in a moment.${suffix}`,
+    };
+  }
+  // Fallback: "<label> <status> <statusText>: <detail>". Join only the parts
+  // that are actually present, so a missing status followed by a status text
+  // cannot leave a doubled space in the middle of the message.
+  const head = [providerLabel, String(status ?? ''), statusText || '']
+    .filter((part) => part !== '')
+    .join(' ')
+    .trim();
+  return {
+    kind: 'other',
+    retryable: false,
+    status,
+    message: `${head}${detail ? `: ${detail}` : ''}`,
+  };
+}
+/**
+ * Classify a failure that happened below HTTP (DNS, refused or reset
+ * connection, undici timeout). The result is always `transient` and
+ * retryable, and carries no status.
+ */
+export function classifyPiNetworkError(providerLabel: string, err: any): ClassifiedPiError {
+  const reason = err?.message || String(err);
+  // undici reports body/headers timeouts as a bare 'terminated' or
+  // 'Headers Timeout Error'; present those as a stalled stream instead.
+  const looksStalled = /terminated|timeout/i.test(reason);
+  let message: string;
+  if (looksStalled) {
+    message = `${providerLabel} stream stalled (no data from the provider). Try again in a moment. (${reason})`;
+  } else {
+    message = `Could not reach ${providerLabel}: ${reason}`;
+  }
+  return { kind: 'transient', retryable: true, message };
+}

package/supervisor/harnesses/pi/providers/retry.ts ADDED Viewed

@@ -0,0 +1,87 @@
+/**
+ * fetchWithRetry — transient-failure absorption for the pi providers.
+ *
+ * The Claude SDK retries transient provider errors inside its subprocess and
+ * codex suppresses willRetry errors; pi's hand-rolled providers previously did
+ * exactly one fetch, so a single 429 (routine on Gemini free tier) or a 5xx
+ * blip killed an entire multi-minute agentic turn. This wraps the initial
+ * request only — once a stream is open, mid-stream failures are handled by the
+ * session's round-retry (a full-history resend is stateless, so re-running a
+ * round that produced nothing is safe).
+ *
+ * Policy: up to 3 attempts on network errors and HTTP 408/429/5xx, exponential
+ * backoff 1s/2s with jitter, honoring Retry-After when it's short. A long
+ * Retry-After (> 15s) means the provider really wants us to back off — return
+ * the response and let the classifier surface a friendly rate-limit message.
+ */
+/** HTTP statuses for which fetchWithRetry makes another attempt (408, 429 and the listed 5xx codes). */
+const RETRYABLE_STATUS = new Set([408, 429, 500, 502, 503, 504, 529]);
+/** Total number of fetch attempts, the first one included. */
+const MAX_ATTEMPTS = 3;
+/** Longest Retry-After hint, in milliseconds, that is waited out; a longer hint hands the response back to the caller. */
+const MAX_HONORED_RETRY_AFTER_MS = 15_000;
+/**
+ * Read a response's Retry-After header as a delay in milliseconds.
+ *
+ * The header may be a number of seconds or a date. Returns undefined when the
+ * header is absent or neither form parses; the result is never negative.
+ */
+function retryAfterMs(res: Response): number | undefined {
+  const header = res.headers.get('retry-after');
+  if (!header) return undefined;
+
+  const asSeconds = Number(header);
+  if (Number.isFinite(asSeconds)) {
+    return Math.max(0, asSeconds * 1000);
+  }
+
+  // Not numeric: try the date form and measure the distance from now.
+  const asEpochMs = Date.parse(header);
+  return Number.isNaN(asEpochMs) ? undefined : Math.max(0, asEpochMs - Date.now());
+}
+/** Build an Error named 'AbortError' — the name fetchWithRetry checks to tell an abort from a network failure. */
+function abortError(): Error {
+  return Object.assign(new Error('This operation was aborted'), { name: 'AbortError' });
+}
+/**
+ * Wait `ms` milliseconds.
+ *
+ * If `signal` is already aborted, or aborts during the wait, the promise
+ * rejects with an AbortError right away instead of waiting out the timer.
+ */
+export function sleep(ms: number, signal?: AbortSignal): Promise<void> {
+  return new Promise<void>((resolve, reject) => {
+    if (signal?.aborted) {
+      reject(abortError());
+      return;
+    }
+    let timer: ReturnType<typeof setTimeout>;
+    const handleAbort = () => {
+      clearTimeout(timer);
+      reject(abortError());
+    };
+    timer = setTimeout(() => {
+      // The wait finished normally: stop listening for aborts.
+      signal?.removeEventListener('abort', handleAbort);
+      resolve();
+    }, ms);
+    signal?.addEventListener('abort', handleAbort, { once: true });
+  });
+}
+/**
+ * fetch() with up to MAX_ATTEMPTS tries.
+ *
+ * Retries on thrown (non-abort) errors and on statuses in RETRYABLE_STATUS,
+ * sleeping between tries: the Retry-After hint when the response has one,
+ * otherwise 1s/2s exponential backoff plus up to 250ms of jitter. A hint
+ * longer than MAX_HONORED_RETRY_AFTER_MS returns the response as-is.
+ *
+ * @returns The first response that is OK or not retryable, or the response
+ *   of the final attempt.
+ * @throws AbortError when the signal is aborted; otherwise the error of the
+ *   final failed attempt.
+ */
+export async function fetchWithRetry(
+  url: string,
+  init: RequestInit & { signal?: AbortSignal },
+): Promise<Response> {
+  const backoffMs = (n: number): number => 1000 * 2 ** n + Math.random() * 250;
+  const finalAttempt = MAX_ATTEMPTS - 1;
+  let lastErr: any;
+
+  for (let attempt = 0; attempt < MAX_ATTEMPTS; attempt++) {
+    if (init.signal?.aborted) throw abortError();
+    const isFinal = attempt === finalAttempt;
+
+    let res: Response;
+    try {
+      res = await fetch(url, init);
+    } catch (err: any) {
+      // Aborts are never retried; neither is a failure on the last attempt.
+      if (err?.name === 'AbortError' || isFinal) throw err;
+      lastErr = err;
+      await sleep(backoffMs(attempt), init.signal);
+      continue;
+    }
+
+    const worthRetrying = !res.ok && RETRYABLE_STATUS.has(res.status) && !isFinal;
+    if (!worthRetrying) return res;
+
+    const hinted = retryAfterMs(res);
+    if (hinted !== undefined && hinted > MAX_HONORED_RETRY_AFTER_MS) {
+      // Provider asked for a long back-off — surface it instead of stalling the turn.
+      return res;
+    }
+
+    // Drain/cancel the body so the connection can be reused before retrying.
+    try {
+      await res.body?.cancel();
+    } catch {}
+    await sleep(hinted ?? backoffMs(attempt), init.signal);
+  }
+
+  // Unreachable, but keeps TS happy.
+  throw lastErr ?? new Error('fetchWithRetry: exhausted attempts');
+}

package/supervisor/harnesses/pi/providers/stream-anthropic.ts CHANGED Viewed

@@ -18,7 +18,10 @@ import type {
   PiMessage,
   PiContentBlock,
   PiStopReason,
+  PiUsage,
 } from './types.js';
+import { fetchWithRetry } from './retry.js';
+import { classifyPiError, classifyPiNetworkError } from './humanize-error.js';
 /* ── SSE parser (shares the LF/CRLF-tolerant pattern from the other providers) ── */
@@ -150,12 +153,30 @@ export async function* streamAnthropic(req: PiStreamRequest): AsyncIterable<PiSt
     max_tokens: req.maxOutputTokens ?? 8192,
     stream: true,
   };
-  if (req.systemPrompt?.trim()) body.system = req.systemPrompt;
-  if (req.tools && req.tools.length > 0) body.tools = toAnthropicTools(req.tools);
+  // Prompt caching (3 of the 4 allowed breakpoints). Without these, every tool
+  // round re-prefills the full system prompt + history at full input price —
+  // up to 25x per agentic turn. The request prefix is tools → system →
+  // messages, so: last tool def caches the tool block, the system block caches
+  // tools+system as one prefix, and the last history block caches the
+  // conversation so far (Anthropic checks previous breakpoint positions for
+  // the longest cached prefix as the marker moves forward each round).
+  if (req.systemPrompt?.trim()) {
+    body.system = [{ type: 'text', text: req.systemPrompt, cache_control: { type: 'ephemeral' } }];
+  }
+  if (req.tools && req.tools.length > 0) {
+    body.tools = toAnthropicTools(req.tools);
+    body.tools[body.tools.length - 1].cache_control = { type: 'ephemeral' };
+  }
+  if (Array.isArray(body.messages) && body.messages.length > 0) {
+    const lastContent = body.messages[body.messages.length - 1].content;
+    if (Array.isArray(lastContent) && lastContent.length > 0) {
+      lastContent[lastContent.length - 1].cache_control = { type: 'ephemeral' };
+    }
+  }
   let res: Response;
   try {
-    res = await fetch(url, {
+    res = await fetchWithRetry(url, {
       method: 'POST',
       headers: {
         'content-type': 'application/json',
@@ -167,14 +188,20 @@ export async function* streamAnthropic(req: PiStreamRequest): AsyncIterable<PiSt
       signal: req.signal,
     });
   } catch (err: any) {
-    yield { type: 'error', error: err?.message || String(err) };
+    if (err?.name === 'AbortError') {
+      yield { type: 'done', stopReason: 'aborted' };
+      return;
+    }
+    const cls = classifyPiNetworkError('Anthropic', err);
+    yield { type: 'error', error: cls.message, kind: cls.kind, retryable: cls.retryable };
     return;
   }
   if (!res.ok) {
     let detail = '';
     try { detail = await res.text(); } catch {}
-    yield { type: 'error', error: `Anthropic ${res.status} ${res.statusText}${detail ? `: ${detail.slice(0, 400)}` : ''}` };
+    const cls = classifyPiError('Anthropic', res.status, res.statusText, detail);
+    yield { type: 'error', error: cls.message, status: cls.status, kind: cls.kind, retryable: cls.retryable };
     return;
   }
@@ -183,7 +210,7 @@ export async function* streamAnthropic(req: PiStreamRequest): AsyncIterable<PiSt
   const blocks = new Map<number, PartialBlock>();
   let accumulated = '';
   let lastStop: string | undefined;
-  let usage: { inputTokens?: number; outputTokens?: number } | undefined;
+  let usage: PiUsage | undefined;
   let chunkCount = 0;
   let firstChunkSummary = '';
@@ -197,7 +224,17 @@ export async function* streamAnthropic(req: PiStreamRequest): AsyncIterable<PiSt
       switch (type) {
         case 'message_start': {
           const u = evt?.message?.usage;
-          if (u) usage = { inputTokens: u.input_tokens, outputTokens: u.output_tokens };
+          if (u) {
+            usage = {
+              inputTokens: u.input_tokens,
+              outputTokens: u.output_tokens,
+              // With prompt caching on, the bulk of the prompt is cache reads —
+              // input_tokens alone would massively under-report occupancy and
+              // the supervisor's recycler would never fire.
+              cacheReadTokens: u.cache_read_input_tokens,
+              cacheCreationTokens: u.cache_creation_input_tokens,
+            };
+          }
           break;
         }
         case 'content_block_start': {
@@ -241,7 +278,19 @@ export async function* streamAnthropic(req: PiStreamRequest): AsyncIterable<PiSt
             let input: any = {};
             if (slot.toolArgsBuf) {
               try { input = JSON.parse(slot.toolArgsBuf); }
-              catch { input = { _raw: slot.toolArgsBuf }; }
+              catch {
+                // Truncated tool-call JSON (output cap hit mid-arguments).
+                // Executing a fabricated {_raw} input sends the model into an
+                // unwinnable retry loop — fail the round loudly instead.
+                yield {
+                  type: 'error',
+                  error: `The model's ${slot.toolName} call was cut off by the output-token limit (${req.maxOutputTokens ?? 8192} tokens) — the arguments did not fit. Try a smaller change, or raise the model's output budget.`,
+                  kind: 'other',
+                  retryable: false,
+                };
+                yield { type: 'done', stopReason: 'error', usage };
+                return;
+              }
             }
             yield {
               type: 'tool_use',
@@ -259,13 +308,25 @@ export async function* streamAnthropic(req: PiStreamRequest): AsyncIterable<PiSt
             usage = {
               inputTokens: u.input_tokens ?? usage?.inputTokens,
               outputTokens: u.output_tokens ?? usage?.outputTokens,
+              cacheReadTokens: u.cache_read_input_tokens ?? usage?.cacheReadTokens,
+              cacheCreationTokens: u.cache_creation_input_tokens ?? usage?.cacheCreationTokens,
             };
           }
           break;
         }
         case 'error': {
-          const msg = evt?.error?.message || evt?.message || 'Unknown error';
-          yield { type: 'error', error: `Anthropic stream error: ${msg}` };
+          // In-stream error event (e.g. overloaded_error) — classify so the
+          // session can retry transient ones and the user sees friendly text.
+          const cls = classifyPiError('Anthropic', undefined, '', JSON.stringify(evt?.error ?? evt ?? {}));
+          const isOverloaded = (evt?.error?.type || '') === 'overloaded_error';
+          yield {
+            type: 'error',
+            error: cls.kind === 'other' && !isOverloaded
+              ? `Anthropic stream error: ${evt?.error?.message || evt?.message || 'Unknown error'}`
+              : (isOverloaded ? 'Anthropic is overloaded right now — try again in a moment.' : cls.message),
+            kind: isOverloaded ? 'transient' : cls.kind,
+            retryable: isOverloaded || cls.retryable,
+          };
           yield { type: 'done', stopReason: 'error', usage };
           return;
         }
@@ -281,7 +342,8 @@ export async function* streamAnthropic(req: PiStreamRequest): AsyncIterable<PiSt
       yield { type: 'done', stopReason: 'aborted' };
       return;
     }
-    yield { type: 'error', error: err?.message || String(err) };
+    const cls = classifyPiNetworkError('Anthropic', err);
+    yield { type: 'error', error: cls.message, kind: cls.kind, retryable: cls.retryable };
     return;
   }

package/supervisor/harnesses/pi/providers/stream-google.ts CHANGED Viewed

@@ -16,7 +16,10 @@ import type {
   PiMessage,
   PiContentBlock,
   PiStopReason,
+  PiUsage,
 } from './types.js';
+import { fetchWithRetry } from './retry.js';
+import { classifyPiError, classifyPiNetworkError } from './humanize-error.js';
 /** Walk an SSE byte stream and yield each parsed JSON event. */
 async function* parseSse(res: Response, dbg: { firstBytes: string }): AsyncIterable<any> {
@@ -209,21 +212,27 @@ export async function* streamGoogle(req: PiStreamRequest): AsyncIterable<PiStrea
   let res: Response;
   try {
-    res = await fetch(url, {
+    res = await fetchWithRetry(url, {
       method: 'POST',
       headers: { 'content-type': 'application/json' },
       body: JSON.stringify(body),
       signal: req.signal,
     });
   } catch (err: any) {
-    yield { type: 'error', error: err?.message || String(err) };
+    if (err?.name === 'AbortError') {
+      yield { type: 'done', stopReason: 'aborted' };
+      return;
+    }
+    const cls = classifyPiNetworkError('Google Gemini', err);
+    yield { type: 'error', error: cls.message, kind: cls.kind, retryable: cls.retryable };
     return;
   }
   if (!res.ok) {
     let detail = '';
     try { detail = await res.text(); } catch {}
-    yield { type: 'error', error: `Google ${res.status} ${res.statusText}${detail ? `: ${detail.slice(0, 400)}` : ''}` };
+    const cls = classifyPiError('Google Gemini', res.status, res.statusText, detail);
+    yield { type: 'error', error: cls.message, status: cls.status, kind: cls.kind, retryable: cls.retryable };
     return;
   }
@@ -231,7 +240,7 @@ export async function* streamGoogle(req: PiStreamRequest): AsyncIterable<PiStrea
   let toolCallCount = 0;
   let lastFinish: string | undefined;
   let promptBlockReason: string | undefined;
-  let usage: { inputTokens?: number; outputTokens?: number } | undefined;
+  let usage: PiUsage | undefined;
   // Debug counters — drop once this stabilises.
   let chunkCount = 0;
   let thoughtPartCount = 0;
@@ -293,7 +302,8 @@ export async function* streamGoogle(req: PiStreamRequest): AsyncIterable<PiStrea
       yield { type: 'done', stopReason: 'aborted' };
       return;
     }
-    yield { type: 'error', error: err?.message || String(err) };
+    const cls = classifyPiNetworkError('Google Gemini', err);
+    yield { type: 'error', error: cls.message, kind: cls.kind, retryable: cls.retryable };
     return;
   }

package/supervisor/harnesses/pi/providers/stream-openai-completions.ts CHANGED Viewed

@@ -16,7 +16,10 @@ import type {
   PiMessage,
   PiContentBlock,
   PiStopReason,
+  PiUsage,
 } from './types.js';
+import { fetchWithRetry } from './retry.js';
+import { classifyPiError, classifyPiNetworkError } from './humanize-error.js';
 /* ── SSE parser (LF or CRLF tolerant, flushes the trailing event) ── */
@@ -187,8 +190,17 @@ export async function* streamOpenAICompletions(req: PiStreamRequest): AsyncItera
     model: req.modelId,
     messages: openaiMessages,
     stream: true,
-    max_tokens: req.maxOutputTokens ?? 8192,
+    // gpt-5.x / o-series reject the legacy `max_tokens`; the openai-api
+    // sub-provider routes the cap through `max_completion_tokens` instead.
+    [req.maxTokensField ?? 'max_tokens']: req.maxOutputTokens ?? 8192,
   };
+  // Without this opt-in, OpenAI/OpenRouter streams carry NO usage at all — and
+  // usage is what feeds the supervisor's proactive session recycling. Gated
+  // per sub-provider: Mistral's strict schema 422s on unknown fields
+  // (noStreamUsage in sub-providers.ts); everyone else tolerates or needs it.
+  if (req.includeStreamUsage !== false) {
+    body.stream_options = { include_usage: true };
+  }
   if (req.tools && req.tools.length > 0) {
     body.tools = toOpenAITools(req.tools);
     body.tool_choice = 'auto';
@@ -201,47 +213,58 @@ export async function* streamOpenAICompletions(req: PiStreamRequest): AsyncItera
       'accept': 'text/event-stream',
     };
     if (req.apiKey) headers['authorization'] = `Bearer ${req.apiKey}`;
-    res = await fetch(url, {
+    res = await fetchWithRetry(url, {
       method: 'POST',
       headers,
       body: JSON.stringify(body),
       signal: req.signal,
     });
   } catch (err: any) {
-    yield { type: 'error', error: err?.message || String(err) };
+    if (err?.name === 'AbortError') {
+      yield { type: 'done', stopReason: 'aborted' };
+      return;
+    }
+    const cls = classifyPiNetworkError('OpenAI-compat', err);
+    yield { type: 'error', error: cls.message, kind: cls.kind, retryable: cls.retryable };
     return;
   }
   if (!res.ok) {
     let detail = '';
     try { detail = await res.text(); } catch {}
-    yield { type: 'error', error: `OpenAI-compat ${res.status} ${res.statusText}${detail ? `: ${detail.slice(0, 400)}` : ''}` };
+    const cls = classifyPiError('OpenAI-compat', res.status, res.statusText, detail);
+    yield { type: 'error', error: cls.message, status: cls.status, kind: cls.kind, retryable: cls.retryable };
     return;
   }
   let accumulated = '';
   let lastFinish: string | undefined;
-  let usage: { inputTokens?: number; outputTokens?: number } | undefined;
+  let usage: PiUsage | undefined;
   const toolCallsByIndex = new Map<number, PartialToolCall>();
   let chunkCount = 0;
   let firstChunkSummary = '';
+  // Vendors disagree on where streamed usage lives: spec says a final
+  // choice-less chunk's `usage`, Groq defaults to nesting under `x_groq.usage`,
+  // Moonshot tucks it onto the choice itself. Read all three.
+  const readUsage = (u: any) => {
+    if (!u || (u.prompt_tokens === undefined && u.completion_tokens === undefined)) return;
+    usage = { inputTokens: u.prompt_tokens, outputTokens: u.completion_tokens };
+  };
   try {
     for await (const chunk of parseSse(res)) {
       chunkCount++;
       if (chunkCount === 1) {
         try { firstChunkSummary = JSON.stringify(chunk).slice(0, 600); } catch {}
       }
+      readUsage(chunk?.x_groq?.usage);
       const choice = chunk?.choices?.[0];
       if (!choice) {
-        if (chunk?.usage) {
-          usage = {
-            inputTokens: chunk.usage.prompt_tokens,
-            outputTokens: chunk.usage.completion_tokens,
-          };
-        }
+        readUsage(chunk?.usage);
         continue;
       }
+      readUsage(choice?.usage);
       const delta = choice.delta || {};
       if (typeof delta.content === 'string' && delta.content.length > 0) {
@@ -267,19 +290,15 @@ export async function* streamOpenAICompletions(req: PiStreamRequest): AsyncItera
       }
       if (choice.finish_reason) lastFinish = choice.finish_reason;
-      if (chunk?.usage) {
-        usage = {
-          inputTokens: chunk.usage.prompt_tokens,
-          outputTokens: chunk.usage.completion_tokens,
-        };
-      }
+      readUsage(chunk?.usage);
     }
   } catch (err: any) {
     if (err?.name === 'AbortError') {
       yield { type: 'done', stopReason: 'aborted' };
       return;
     }
-    yield { type: 'error', error: err?.message || String(err) };
+    const cls = classifyPiNetworkError('OpenAI-compat', err);
+    yield { type: 'error', error: cls.message, kind: cls.kind, retryable: cls.retryable };
     return;
   }
@@ -300,7 +319,24 @@ export async function* streamOpenAICompletions(req: PiStreamRequest): AsyncItera
     let input: any = {};
     if (partial.argsBuf) {
       try { input = JSON.parse(partial.argsBuf); }
-      catch { input = { _raw: partial.argsBuf }; }
+      catch {
+        // Truncated/malformed tool-call JSON — almost always the output-token
+        // cap cutting the arguments mid-stream. Executing a fabricated {_raw}
+        // input produces misleading tool errors the model retries forever;
+        // fail the round loudly instead (the session strips dangling tool_use
+        // blocks from history on errored rounds).
+        const capped = lastFinish === 'length';
+        yield {
+          type: 'error',
+          error: capped
+            ? `The model's ${partial.name} call was cut off by the output-token limit (${req.maxOutputTokens ?? 8192} tokens) — the arguments did not fit. Try a smaller change, or raise the model's output budget.`
+            : `The model emitted a malformed ${partial.name} tool call (arguments were not valid JSON).`,
+          kind: 'other',
+          retryable: !capped,
+        };
+        yield { type: 'done', stopReason: 'error', usage };
+        return;
+      }
     }
     yield {
       type: 'tool_use',

package/supervisor/harnesses/pi/providers/types.ts CHANGED Viewed

@@ -44,20 +44,45 @@ export interface PiStreamRequest {
   tools?: PiToolDef[];
   /** Hard cap on output tokens for a single turn. */
   maxOutputTokens?: number;
+  /**
+   * Which request field carries the output cap on the openai-completions
+   * flavor. OpenAI's reasoning models (gpt-5.x, o-series) reject the legacy
+   * `max_tokens` — the openai-api sub-provider sets `max_completion_tokens`
+   * (accepted by ALL OpenAI models); other vendors stay on `max_tokens`.
+   */
+  maxTokensField?: 'max_tokens' | 'max_completion_tokens';
+  /**
+   * openai-completions flavor: set false for strict-schema vendors (Mistral)
+   * that 422 on the `stream_options.include_usage` opt-in. Default true.
+   */
+  includeStreamUsage?: boolean;
   /** Optional abort signal so the session can interrupt in-flight requests. */
   signal?: AbortSignal;
 }
 export type PiStopReason = 'end_turn' | 'tool_use' | 'max_tokens' | 'error' | 'aborted';
+/**
+ * Coarse error classification so the session/harness can react without
+ * string-matching: retry transient rounds, tear down on auth/overflow, and
+ * show actionable messages instead of raw provider JSON.
+ */
+export type PiErrorKind = 'auth' | 'context-overflow' | 'rate-limit' | 'billing' | 'transient' | 'other';
 export type PiStreamEvent =
   | { type: 'text_delta'; delta: string }
   | { type: 'text_end'; text: string }
   | { type: 'tool_use'; id: string; name: string; input: any; thoughtSignature?: string }
   | { type: 'done'; stopReason: PiStopReason; usage?: PiUsage }
-  | { type: 'error'; error: string };
+  | { type: 'error'; error: string; status?: number; kind?: PiErrorKind; retryable?: boolean };
 export interface PiUsage {
+  /** Non-cached prompt tokens. NOTE: Anthropic's input_tokens EXCLUDES cache
+   *  reads/writes — prompt occupancy is input + cacheRead + cacheCreation
+   *  (Gemini's promptTokenCount and OpenAI's prompt_tokens already include
+   *  cached tokens, so their providers leave the cache fields unset). */
   inputTokens?: number;
   outputTokens?: number;
+  cacheReadTokens?: number;
+  cacheCreationTokens?: number;
 }