bloby-bot 0.70.8 → 0.70.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/dist-bloby/assets/{bloby-CXmOcb1r.js → bloby-DSNB0g4w.js} +4 -4
  2. package/dist-bloby/assets/{globals-DpO5tO92.js → globals-B3cTbITX.js} +1 -1
  3. package/dist-bloby/assets/{highlighted-body-OFNGDK62-D7cU1Y-Z.js → highlighted-body-OFNGDK62-BLforpkr.js} +1 -1
  4. package/dist-bloby/assets/mermaid-GHXKKRXX-C1H_fSCU.js +1 -0
  5. package/dist-bloby/assets/{onboard-B96ELhXn.js → onboard-Dn2Ws_G2.js} +1 -1
  6. package/dist-bloby/bloby.html +2 -2
  7. package/dist-bloby/onboard.html +2 -2
  8. package/package.json +1 -1
  9. package/scripts/sync-pi-models.ts +37 -6
  10. package/supervisor/chat/OnboardWizard.tsx +4 -4
  11. package/supervisor/harnesses/pi/async-queue.ts +7 -11
  12. package/supervisor/harnesses/pi/index.ts +475 -73
  13. package/supervisor/harnesses/pi/models-catalog.generated.ts +840 -210
  14. package/supervisor/harnesses/pi/providers/humanize-error.ts +125 -0
  15. package/supervisor/harnesses/pi/providers/retry.ts +87 -0
  16. package/supervisor/harnesses/pi/providers/stream-anthropic.ts +73 -11
  17. package/supervisor/harnesses/pi/providers/stream-google.ts +15 -5
  18. package/supervisor/harnesses/pi/providers/stream-openai-completions.ts +55 -19
  19. package/supervisor/harnesses/pi/providers/types.ts +26 -1
  20. package/supervisor/harnesses/pi/session.ts +179 -73
  21. package/supervisor/harnesses/pi/sub-providers.ts +30 -1
  22. package/supervisor/harnesses/pi/test-completion.ts +8 -2
  23. package/supervisor/harnesses/pi/tools/registry.ts +25 -9
  24. package/supervisor/harnesses/pi/tools/task.ts +108 -0
  25. package/supervisor/harnesses/pi/tools/types.ts +15 -0
  26. package/supervisor/index.ts +11 -10
  27. package/supervisor/public/morphy_sad.mov +0 -0
  28. package/supervisor/public/morphy_sad.webm +0 -0
  29. package/supervisor/shell.ts +1 -1
  30. package/supervisor/workspace-guard.js +1 -1
  31. package/workspace/client/public/morphy_bounce.mov +0 -0
  32. package/workspace/client/public/morphy_bounce.webm +0 -0
  33. package/workspace/client/public/morphy_hi.mov +0 -0
  34. package/workspace/client/public/morphy_hi.webm +0 -0
  35. package/workspace/client/src/App.tsx +5 -3
  36. package/dist-bloby/assets/mermaid-GHXKKRXX-D5YxphBn.js +0 -1
  37. package/supervisor/public/what-happened.mp4 +0 -0
  38. package/supervisor/public/what-happened.webm +0 -0
@@ -0,0 +1,125 @@
1
+ /**
2
+ * Provider error classification + humanization for the pi harness.
3
+ *
4
+ * Every non-OK HTTP response and network failure from the three stream
5
+ * providers funnels through here so the user sees an actionable message
6
+ * ("update your key in the dashboard") instead of a raw JSON wall, and so the
7
+ * session/harness can react structurally: retry `retryable` rounds, tear the
8
+ * conversation down on `auth` / `context-overflow` (a poisoned history would
9
+ * otherwise re-fail forever).
10
+ *
11
+ * Mirrors the codex harness's codexErrorInfo mapping (house standard M4).
12
+ */
13
+ import type { PiErrorKind } from './types.js';
14
+
15
/**
 * Outcome of classifying a provider failure: the text to show the user plus
 * the structured fields callers can branch on.
 */
export interface ClassifiedPiError {
  /** User-facing message — friendly, with a one-line raw detail for debugging. */
  message: string;
  /** Coarse failure category (one of the PiErrorKind values from ./types.js). */
  kind: PiErrorKind;
  /** True when re-sending the identical request can plausibly succeed. */
  retryable: boolean;
  /**
   * HTTP status handed to classifyPiError, passed through unchanged.
   * classifyPiNetworkError never sets it.
   */
  status?: number;
}
23
+
24
/**
 * Reduce a provider error body to a short detail string (at most 300 chars).
 *
 * JSON bodies are searched for `error.message`, then a top-level `message`,
 * then a string-valued `error`. Anything else — non-JSON text, or JSON without
 * a usable string message — falls back to the trimmed raw body.
 *
 * @param body Raw response body; empty/whitespace-only input yields ''.
 * @returns The trimmed detail, truncated to 300 characters.
 */
function extractDetail(body: string): string {
  const LIMIT = 300;
  const raw = (body || '').trim();
  if (raw.length === 0) return '';

  let parsed: any;
  try {
    parsed = JSON.parse(raw);
  } catch {
    // Not JSON (HTML error page, plain text, ...) — show the raw text.
    return raw.slice(0, LIMIT);
  }

  // Google/OpenAI/Anthropic all nest it under error.message; some
  // OpenAI-compat vendors use a top-level message.
  const candidate =
    parsed?.error?.message ||
    parsed?.message ||
    (typeof parsed?.error === 'string' ? parsed.error : '');

  if (typeof candidate === 'string') {
    const cleaned = candidate.trim();
    if (cleaned) return cleaned.slice(0, LIMIT);
  }
  return raw.slice(0, LIMIT);
}
37
+
38
// Phrases that classifyPiError treats as "the prompt no longer fits the
// model's context window". Only consulted for HTTP 400/413 responses.
const CONTEXT_OVERFLOW_RE =
  /context.length|context_length_exceeded|maximum context length|prompt is too long|too many tokens|input token count.*exceed|token count exceeds|exceeds the maximum number of tokens|request exceeds the.*token|exceeds? (the )?context limit|input length and .{0,3}max_tokens/i;

// Phrases that classifyPiError treats as a rejected credential, whatever the
// HTTP status (a bare 401 is classified as auth even without a match).
const AUTH_RE =
  /api key not valid|invalid api key|invalid x-api-key|incorrect api key|invalid_api_key|authentication[_ ]error|permission_error|invalid bearer token|no auth credentials/i;

// Deliberately narrow: only unambiguous out-of-credit markers. Gemini's
// routine per-minute 429 says "check your plan and billing details" — that is
// a RATE LIMIT (retryable), not billing; OpenAI's true quota exhaustion is
// distinguished by the insufficient_quota code (absent from Gemini bodies).
const BILLING_RE =
  /insufficient_quota|credit balance is too low|payment required|purchase more credits/i;
50
+
51
/**
 * Map a failed provider HTTP response onto a ClassifiedPiError.
 *
 * Checks run in a fixed order and the first match wins:
 *   1. context-overflow — status 400/413 and an overflow phrase in the body
 *   2. billing          — an out-of-credit phrase in the body, or status 402
 *   3. auth             — status 401, or an auth phrase in the body
 *   4. rate-limit       — status 429 (retryable)
 *   5. transient        — status 408 or any status >= 500 (retryable)
 *   6. other            — everything else
 *
 * @param providerLabel Provider name interpolated into the message.
 * @param status        HTTP status, or undefined when the caller has none.
 * @param statusText    HTTP status text; only used in the `other` message.
 * @param body          Raw error body; matched against the phrase tables and
 *                      summarised by extractDetail.
 * @returns The classification, with `status` passed through unchanged.
 */
export function classifyPiError(
  providerLabel: string,
  status: number | undefined,
  statusText: string,
  body: string,
): ClassifiedPiError {
  const detail = extractDetail(body);
  const tail = detail ? ` (${detail})` : '';

  // Every branch returns the same shape; only kind/retryable/message vary.
  const result = (kind: PiErrorKind, retryable: boolean, message: string): ClassifiedPiError => ({
    kind,
    retryable,
    status,
    message,
  });

  // Order matters: overflow and billing hide behind generic 400/429 statuses.
  const oversizeStatus = status === 400 || status === 413;
  if (oversizeStatus && CONTEXT_OVERFLOW_RE.test(body)) {
    return result(
      'context-overflow',
      false,
      `The conversation has outgrown ${providerLabel}'s context window.${tail}`,
    );
  }

  if (status === 402 || BILLING_RE.test(body)) {
    return result(
      'billing',
      false,
      `${providerLabel} reports a quota/billing problem — check your plan or credits on the provider's console.${tail}`,
    );
  }

  // 401 is always auth; 403 only when the body says so — vendors also use 403
  // for per-message moderation/guardrail blocks (e.g. OpenRouter), which must
  // NOT be classified auth (auth is a fatal kind that recycles the session).
  if (status === 401 || AUTH_RE.test(body)) {
    return result(
      'auth',
      false,
      `${providerLabel} rejected your API key. Update it from the dashboard (Bloby provider settings).${tail}`,
    );
  }

  if (status === 429) {
    return result(
      'rate-limit',
      true,
      `${providerLabel} rate limit reached — give it a moment and try again.${tail}`,
    );
  }

  const serverSide = status !== undefined && status >= 500;
  if (status === 408 || serverSide) {
    return result(
      'transient',
      true,
      `${providerLabel} is having trouble right now (HTTP ${status}) — try again in a moment.${tail}`,
    );
  }

  // Fallback: "<label> <status> <statusText>: <detail>", skipping absent parts.
  const headline = [providerLabel, status ?? '', statusText || ''].join(' ').trim();
  return result('other', false, headline + (detail ? `: ${detail}` : ''));
}
111
+
112
/**
 * Classify a network-level failure (DNS, refused, reset, undici timeouts).
 * These are always reported as `transient` and retryable, with no `status`.
 *
 * Node's fetch rejects with a generic `TypeError: fetch failed` and puts the
 * real reason (e.g. "Headers Timeout Error", "connect ECONNREFUSED ...") on
 * `err.cause`, so the cause's message/code is folded into the raw detail
 * before deciding whether the failure looks like a stall.
 *
 * @param providerLabel Provider name interpolated into the message.
 * @param err           Whatever fetch / the stream reader threw.
 * @returns A transient, retryable classification with a readable message.
 */
export function classifyPiNetworkError(providerLabel: string, err: any): ClassifiedPiError {
  const top = String(err?.message || err);

  // Surface the underlying cause when it adds information the top-level
  // message lacks; errors without a cause keep their message unchanged.
  const cause = err?.cause;
  const causeText = typeof cause === 'string' ? cause : String(cause?.message || cause?.code || '');
  const raw = causeText && !top.includes(causeText) ? `${top}: ${causeText}` : top;

  // undici's body/headers timeouts surface as the famously cryptic 'terminated'
  // and 'Headers Timeout Error' — translate them.
  const stalled = /terminated|timeout/i.test(raw);
  return {
    kind: 'transient',
    retryable: true,
    message: stalled
      ? `${providerLabel} stream stalled (no data from the provider). Try again in a moment. (${raw})`
      : `Could not reach ${providerLabel}: ${raw}`,
  };
}
@@ -0,0 +1,87 @@
1
+ /**
2
+ * fetchWithRetry — transient-failure absorption for the pi providers.
3
+ *
4
+ * The Claude SDK retries transient provider errors inside its subprocess and
5
+ * codex suppresses willRetry errors; pi's hand-rolled providers previously did
6
+ * exactly one fetch, so a single 429 (routine on Gemini free tier) or a 5xx
7
+ * blip killed an entire multi-minute agentic turn. This wraps the initial
8
+ * request only — once a stream is open, mid-stream failures are handled by the
9
+ * session's round-retry (a full-history resend is stateless, so re-running a
10
+ * round that produced nothing is safe).
11
+ *
12
+ * Policy: up to 3 attempts on network errors and HTTP 408/429/500/502/503/504/529, exponential
13
+ * backoff 1s/2s with jitter, honoring Retry-After when it's short. A long
14
+ * Retry-After (> 15s) means the provider really wants us to back off — return
15
+ * the response and let the classifier surface a friendly rate-limit message.
16
+ */
17
+
18
/** HTTP statuses for which fetchWithRetry makes another attempt. */
const RETRYABLE_STATUS = new Set([408, 429, 500, 502, 503, 504, 529]);
/** Total number of fetch attempts fetchWithRetry makes, the first one included. */
const MAX_ATTEMPTS = 3;
/**
 * Longest Retry-After hint (in ms) that fetchWithRetry waits out; a longer
 * hint makes it return the failed response instead of sleeping.
 */
const MAX_HONORED_RETRY_AFTER_MS = 15_000;
21
+
22
/**
 * Read the response's Retry-After header as a delay in milliseconds.
 *
 * The header may be a number of seconds or an HTTP date; a date is converted
 * to the time remaining from now. Negative results are clamped to 0.
 *
 * @param res The response whose headers are inspected.
 * @returns The delay in ms, or undefined when the header is missing or
 *          cannot be parsed either way.
 */
function retryAfterMs(res: Response): number | undefined {
  const header = res.headers.get('retry-after');
  if (!header) return undefined;

  // Delta-seconds form, e.g. "Retry-After: 2".
  const seconds = Number(header);
  if (Number.isFinite(seconds)) {
    return Math.max(0, seconds * 1000);
  }

  // HTTP-date form, e.g. "Retry-After: Wed, 21 Oct 2015 07:28:00 GMT".
  const when = Date.parse(header);
  return Number.isNaN(when) ? undefined : Math.max(0, when - Date.now());
}
31
+
32
/**
 * Build a fresh Error whose `name` is 'AbortError', so callers that check
 * `err.name === 'AbortError'` treat it as an abort.
 */
function abortError(): Error {
  return Object.assign(new Error('This operation was aborted'), { name: 'AbortError' });
}
37
+
38
/**
 * Wait for `ms` milliseconds, or until `signal` aborts — whichever is first.
 *
 * @param ms     Delay in milliseconds.
 * @param signal Optional abort signal. If it is already aborted, or aborts
 *               while waiting, the promise rejects with an AbortError and the
 *               pending timer is cleared.
 * @returns A promise that resolves with no value once the delay has elapsed.
 */
export function sleep(ms: number, signal?: AbortSignal): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    if (signal?.aborted) {
      reject(abortError());
      return;
    }

    let timer: ReturnType<typeof setTimeout> | undefined;

    function handleAbort(): void {
      clearTimeout(timer);
      reject(abortError());
    }

    timer = setTimeout(function wake(): void {
      // Normal completion: detach the abort listener before resolving.
      signal?.removeEventListener('abort', handleAbort);
      resolve();
    }, ms);

    signal?.addEventListener('abort', handleAbort, { once: true });
  });
}
53
+
54
/**
 * fetch() with bounded retries for transient failures.
 *
 * Makes up to MAX_ATTEMPTS requests. A thrown network error, or a response
 * whose status is in RETRYABLE_STATUS, triggers another attempt after a
 * backoff of 1000 * 2^attempt ms plus up to 250 ms of jitter — or after the
 * response's Retry-After hint when one is present. A hint longer than
 * MAX_HONORED_RETRY_AFTER_MS is not waited out: the response is returned.
 *
 * @param url  Request URL.
 * @param init Fetch options, reused unchanged on every attempt; `init.signal`
 *             aborts both in-flight requests and backoff sleeps.
 * @returns The first response that is OK, non-retryable, carries a long
 *          Retry-After, or belongs to the final attempt.
 * @throws An AbortError when the signal fires; otherwise the network error of
 *         the final attempt.
 */
export async function fetchWithRetry(
  url: string,
  init: RequestInit & { signal?: AbortSignal },
): Promise<Response> {
  const finalAttempt = MAX_ATTEMPTS - 1;
  const backoffMs = (n: number): number => 1000 * 2 ** n + Math.random() * 250;

  let lastErr: any;
  let attempt = 0;
  while (attempt < MAX_ATTEMPTS) {
    if (init.signal?.aborted) throw abortError();
    const isFinal = attempt === finalAttempt;

    let res: Response | undefined;
    try {
      res = await fetch(url, init);
    } catch (err: any) {
      // Aborts propagate untouched; so does the failure of the final attempt.
      if (err?.name === 'AbortError') throw err;
      lastErr = err;
      if (isFinal) throw err;
    }

    if (res === undefined) {
      // Network-level failure with attempts left: back off, then go again.
      await sleep(backoffMs(attempt), init.signal);
    } else {
      const shouldRetry = !res.ok && RETRYABLE_STATUS.has(res.status) && !isFinal;
      if (!shouldRetry) return res;

      const hinted = retryAfterMs(res);
      if (hinted !== undefined && hinted > MAX_HONORED_RETRY_AFTER_MS) {
        return res; // provider asked for a long back-off — surface it instead of stalling the turn
      }

      // Drain/cancel the body so the connection can be reused before retrying.
      try { await res.body?.cancel(); } catch {}
      await sleep(hinted ?? backoffMs(attempt), init.signal);
    }

    attempt += 1;
  }
  // Unreachable, but keeps TS happy.
  throw lastErr ?? new Error('fetchWithRetry: exhausted attempts');
}
@@ -18,7 +18,10 @@ import type {
18
18
  PiMessage,
19
19
  PiContentBlock,
20
20
  PiStopReason,
21
+ PiUsage,
21
22
  } from './types.js';
23
+ import { fetchWithRetry } from './retry.js';
24
+ import { classifyPiError, classifyPiNetworkError } from './humanize-error.js';
22
25
 
23
26
  /* ── SSE parser (shares the LF/CRLF-tolerant pattern from the other providers) ── */
24
27
 
@@ -150,12 +153,30 @@ export async function* streamAnthropic(req: PiStreamRequest): AsyncIterable<PiSt
150
153
  max_tokens: req.maxOutputTokens ?? 8192,
151
154
  stream: true,
152
155
  };
153
- if (req.systemPrompt?.trim()) body.system = req.systemPrompt;
154
- if (req.tools && req.tools.length > 0) body.tools = toAnthropicTools(req.tools);
156
+ // Prompt caching (3 of the 4 allowed breakpoints). Without these, every tool
157
+ // round re-prefills the full system prompt + history at full input price —
158
+ // up to 25x per agentic turn. The request prefix is tools → system →
159
+ // messages, so: last tool def caches the tool block, the system block caches
160
+ // tools+system as one prefix, and the last history block caches the
161
+ // conversation so far (Anthropic checks previous breakpoint positions for
162
+ // the longest cached prefix as the marker moves forward each round).
163
+ if (req.systemPrompt?.trim()) {
164
+ body.system = [{ type: 'text', text: req.systemPrompt, cache_control: { type: 'ephemeral' } }];
165
+ }
166
+ if (req.tools && req.tools.length > 0) {
167
+ body.tools = toAnthropicTools(req.tools);
168
+ body.tools[body.tools.length - 1].cache_control = { type: 'ephemeral' };
169
+ }
170
+ if (Array.isArray(body.messages) && body.messages.length > 0) {
171
+ const lastContent = body.messages[body.messages.length - 1].content;
172
+ if (Array.isArray(lastContent) && lastContent.length > 0) {
173
+ lastContent[lastContent.length - 1].cache_control = { type: 'ephemeral' };
174
+ }
175
+ }
155
176
 
156
177
  let res: Response;
157
178
  try {
158
- res = await fetch(url, {
179
+ res = await fetchWithRetry(url, {
159
180
  method: 'POST',
160
181
  headers: {
161
182
  'content-type': 'application/json',
@@ -167,14 +188,20 @@ export async function* streamAnthropic(req: PiStreamRequest): AsyncIterable<PiSt
167
188
  signal: req.signal,
168
189
  });
169
190
  } catch (err: any) {
170
- yield { type: 'error', error: err?.message || String(err) };
191
+ if (err?.name === 'AbortError') {
192
+ yield { type: 'done', stopReason: 'aborted' };
193
+ return;
194
+ }
195
+ const cls = classifyPiNetworkError('Anthropic', err);
196
+ yield { type: 'error', error: cls.message, kind: cls.kind, retryable: cls.retryable };
171
197
  return;
172
198
  }
173
199
 
174
200
  if (!res.ok) {
175
201
  let detail = '';
176
202
  try { detail = await res.text(); } catch {}
177
- yield { type: 'error', error: `Anthropic ${res.status} ${res.statusText}${detail ? `: ${detail.slice(0, 400)}` : ''}` };
203
+ const cls = classifyPiError('Anthropic', res.status, res.statusText, detail);
204
+ yield { type: 'error', error: cls.message, status: cls.status, kind: cls.kind, retryable: cls.retryable };
178
205
  return;
179
206
  }
180
207
 
@@ -183,7 +210,7 @@ export async function* streamAnthropic(req: PiStreamRequest): AsyncIterable<PiSt
183
210
  const blocks = new Map<number, PartialBlock>();
184
211
  let accumulated = '';
185
212
  let lastStop: string | undefined;
186
- let usage: { inputTokens?: number; outputTokens?: number } | undefined;
213
+ let usage: PiUsage | undefined;
187
214
  let chunkCount = 0;
188
215
  let firstChunkSummary = '';
189
216
 
@@ -197,7 +224,17 @@ export async function* streamAnthropic(req: PiStreamRequest): AsyncIterable<PiSt
197
224
  switch (type) {
198
225
  case 'message_start': {
199
226
  const u = evt?.message?.usage;
200
- if (u) usage = { inputTokens: u.input_tokens, outputTokens: u.output_tokens };
227
+ if (u) {
228
+ usage = {
229
+ inputTokens: u.input_tokens,
230
+ outputTokens: u.output_tokens,
231
+ // With prompt caching on, the bulk of the prompt is cache reads —
232
+ // input_tokens alone would massively under-report occupancy and
233
+ // the supervisor's recycler would never fire.
234
+ cacheReadTokens: u.cache_read_input_tokens,
235
+ cacheCreationTokens: u.cache_creation_input_tokens,
236
+ };
237
+ }
201
238
  break;
202
239
  }
203
240
  case 'content_block_start': {
@@ -241,7 +278,19 @@ export async function* streamAnthropic(req: PiStreamRequest): AsyncIterable<PiSt
241
278
  let input: any = {};
242
279
  if (slot.toolArgsBuf) {
243
280
  try { input = JSON.parse(slot.toolArgsBuf); }
244
- catch { input = { _raw: slot.toolArgsBuf }; }
281
+ catch {
282
+ // Truncated tool-call JSON (output cap hit mid-arguments).
283
+ // Executing a fabricated {_raw} input sends the model into an
284
+ // unwinnable retry loop — fail the round loudly instead.
285
+ yield {
286
+ type: 'error',
287
+ error: `The model's ${slot.toolName} call was cut off by the output-token limit (${req.maxOutputTokens ?? 8192} tokens) — the arguments did not fit. Try a smaller change, or raise the model's output budget.`,
288
+ kind: 'other',
289
+ retryable: false,
290
+ };
291
+ yield { type: 'done', stopReason: 'error', usage };
292
+ return;
293
+ }
245
294
  }
246
295
  yield {
247
296
  type: 'tool_use',
@@ -259,13 +308,25 @@ export async function* streamAnthropic(req: PiStreamRequest): AsyncIterable<PiSt
259
308
  usage = {
260
309
  inputTokens: u.input_tokens ?? usage?.inputTokens,
261
310
  outputTokens: u.output_tokens ?? usage?.outputTokens,
311
+ cacheReadTokens: u.cache_read_input_tokens ?? usage?.cacheReadTokens,
312
+ cacheCreationTokens: u.cache_creation_input_tokens ?? usage?.cacheCreationTokens,
262
313
  };
263
314
  }
264
315
  break;
265
316
  }
266
317
  case 'error': {
267
- const msg = evt?.error?.message || evt?.message || 'Unknown error';
268
- yield { type: 'error', error: `Anthropic stream error: ${msg}` };
318
+ // In-stream error event (e.g. overloaded_error) — classify so the
319
+ // session can retry transient ones and the user sees friendly text.
320
+ const cls = classifyPiError('Anthropic', undefined, '', JSON.stringify(evt?.error ?? evt ?? {}));
321
+ const isOverloaded = (evt?.error?.type || '') === 'overloaded_error';
322
+ yield {
323
+ type: 'error',
324
+ error: cls.kind === 'other' && !isOverloaded
325
+ ? `Anthropic stream error: ${evt?.error?.message || evt?.message || 'Unknown error'}`
326
+ : (isOverloaded ? 'Anthropic is overloaded right now — try again in a moment.' : cls.message),
327
+ kind: isOverloaded ? 'transient' : cls.kind,
328
+ retryable: isOverloaded || cls.retryable,
329
+ };
269
330
  yield { type: 'done', stopReason: 'error', usage };
270
331
  return;
271
332
  }
@@ -281,7 +342,8 @@ export async function* streamAnthropic(req: PiStreamRequest): AsyncIterable<PiSt
281
342
  yield { type: 'done', stopReason: 'aborted' };
282
343
  return;
283
344
  }
284
- yield { type: 'error', error: err?.message || String(err) };
345
+ const cls = classifyPiNetworkError('Anthropic', err);
346
+ yield { type: 'error', error: cls.message, kind: cls.kind, retryable: cls.retryable };
285
347
  return;
286
348
  }
287
349
 
@@ -16,7 +16,10 @@ import type {
16
16
  PiMessage,
17
17
  PiContentBlock,
18
18
  PiStopReason,
19
+ PiUsage,
19
20
  } from './types.js';
21
+ import { fetchWithRetry } from './retry.js';
22
+ import { classifyPiError, classifyPiNetworkError } from './humanize-error.js';
20
23
 
21
24
  /** Walk an SSE byte stream and yield each parsed JSON event. */
22
25
  async function* parseSse(res: Response, dbg: { firstBytes: string }): AsyncIterable<any> {
@@ -209,21 +212,27 @@ export async function* streamGoogle(req: PiStreamRequest): AsyncIterable<PiStrea
209
212
 
210
213
  let res: Response;
211
214
  try {
212
- res = await fetch(url, {
215
+ res = await fetchWithRetry(url, {
213
216
  method: 'POST',
214
217
  headers: { 'content-type': 'application/json' },
215
218
  body: JSON.stringify(body),
216
219
  signal: req.signal,
217
220
  });
218
221
  } catch (err: any) {
219
- yield { type: 'error', error: err?.message || String(err) };
222
+ if (err?.name === 'AbortError') {
223
+ yield { type: 'done', stopReason: 'aborted' };
224
+ return;
225
+ }
226
+ const cls = classifyPiNetworkError('Google Gemini', err);
227
+ yield { type: 'error', error: cls.message, kind: cls.kind, retryable: cls.retryable };
220
228
  return;
221
229
  }
222
230
 
223
231
  if (!res.ok) {
224
232
  let detail = '';
225
233
  try { detail = await res.text(); } catch {}
226
- yield { type: 'error', error: `Google ${res.status} ${res.statusText}${detail ? `: ${detail.slice(0, 400)}` : ''}` };
234
+ const cls = classifyPiError('Google Gemini', res.status, res.statusText, detail);
235
+ yield { type: 'error', error: cls.message, status: cls.status, kind: cls.kind, retryable: cls.retryable };
227
236
  return;
228
237
  }
229
238
 
@@ -231,7 +240,7 @@ export async function* streamGoogle(req: PiStreamRequest): AsyncIterable<PiStrea
231
240
  let toolCallCount = 0;
232
241
  let lastFinish: string | undefined;
233
242
  let promptBlockReason: string | undefined;
234
- let usage: { inputTokens?: number; outputTokens?: number } | undefined;
243
+ let usage: PiUsage | undefined;
235
244
  // Debug counters — drop once this stabilises.
236
245
  let chunkCount = 0;
237
246
  let thoughtPartCount = 0;
@@ -293,7 +302,8 @@ export async function* streamGoogle(req: PiStreamRequest): AsyncIterable<PiStrea
293
302
  yield { type: 'done', stopReason: 'aborted' };
294
303
  return;
295
304
  }
296
- yield { type: 'error', error: err?.message || String(err) };
305
+ const cls = classifyPiNetworkError('Google Gemini', err);
306
+ yield { type: 'error', error: cls.message, kind: cls.kind, retryable: cls.retryable };
297
307
  return;
298
308
  }
299
309
 
@@ -16,7 +16,10 @@ import type {
16
16
  PiMessage,
17
17
  PiContentBlock,
18
18
  PiStopReason,
19
+ PiUsage,
19
20
  } from './types.js';
21
+ import { fetchWithRetry } from './retry.js';
22
+ import { classifyPiError, classifyPiNetworkError } from './humanize-error.js';
20
23
 
21
24
  /* ── SSE parser (LF or CRLF tolerant, flushes the trailing event) ── */
22
25
 
@@ -187,8 +190,17 @@ export async function* streamOpenAICompletions(req: PiStreamRequest): AsyncItera
187
190
  model: req.modelId,
188
191
  messages: openaiMessages,
189
192
  stream: true,
190
- max_tokens: req.maxOutputTokens ?? 8192,
193
+ // gpt-5.x / o-series reject the legacy `max_tokens`; the openai-api
194
+ // sub-provider routes the cap through `max_completion_tokens` instead.
195
+ [req.maxTokensField ?? 'max_tokens']: req.maxOutputTokens ?? 8192,
191
196
  };
197
+ // Without this opt-in, OpenAI/OpenRouter streams carry NO usage at all — and
198
+ // usage is what feeds the supervisor's proactive session recycling. Gated
199
+ // per sub-provider: Mistral's strict schema 422s on unknown fields
200
+ // (noStreamUsage in sub-providers.ts); everyone else tolerates or needs it.
201
+ if (req.includeStreamUsage !== false) {
202
+ body.stream_options = { include_usage: true };
203
+ }
192
204
  if (req.tools && req.tools.length > 0) {
193
205
  body.tools = toOpenAITools(req.tools);
194
206
  body.tool_choice = 'auto';
@@ -201,47 +213,58 @@ export async function* streamOpenAICompletions(req: PiStreamRequest): AsyncItera
201
213
  'accept': 'text/event-stream',
202
214
  };
203
215
  if (req.apiKey) headers['authorization'] = `Bearer ${req.apiKey}`;
204
- res = await fetch(url, {
216
+ res = await fetchWithRetry(url, {
205
217
  method: 'POST',
206
218
  headers,
207
219
  body: JSON.stringify(body),
208
220
  signal: req.signal,
209
221
  });
210
222
  } catch (err: any) {
211
- yield { type: 'error', error: err?.message || String(err) };
223
+ if (err?.name === 'AbortError') {
224
+ yield { type: 'done', stopReason: 'aborted' };
225
+ return;
226
+ }
227
+ const cls = classifyPiNetworkError('OpenAI-compat', err);
228
+ yield { type: 'error', error: cls.message, kind: cls.kind, retryable: cls.retryable };
212
229
  return;
213
230
  }
214
231
 
215
232
  if (!res.ok) {
216
233
  let detail = '';
217
234
  try { detail = await res.text(); } catch {}
218
- yield { type: 'error', error: `OpenAI-compat ${res.status} ${res.statusText}${detail ? `: ${detail.slice(0, 400)}` : ''}` };
235
+ const cls = classifyPiError('OpenAI-compat', res.status, res.statusText, detail);
236
+ yield { type: 'error', error: cls.message, status: cls.status, kind: cls.kind, retryable: cls.retryable };
219
237
  return;
220
238
  }
221
239
 
222
240
  let accumulated = '';
223
241
  let lastFinish: string | undefined;
224
- let usage: { inputTokens?: number; outputTokens?: number } | undefined;
242
+ let usage: PiUsage | undefined;
225
243
  const toolCallsByIndex = new Map<number, PartialToolCall>();
226
244
  let chunkCount = 0;
227
245
  let firstChunkSummary = '';
228
246
 
247
+ // Vendors disagree on where streamed usage lives: spec says a final
248
+ // choice-less chunk's `usage`, Groq defaults to nesting under `x_groq.usage`,
249
+ // Moonshot tucks it onto the choice itself. Read all three.
250
+ const readUsage = (u: any) => {
251
+ if (!u || (u.prompt_tokens === undefined && u.completion_tokens === undefined)) return;
252
+ usage = { inputTokens: u.prompt_tokens, outputTokens: u.completion_tokens };
253
+ };
254
+
229
255
  try {
230
256
  for await (const chunk of parseSse(res)) {
231
257
  chunkCount++;
232
258
  if (chunkCount === 1) {
233
259
  try { firstChunkSummary = JSON.stringify(chunk).slice(0, 600); } catch {}
234
260
  }
261
+ readUsage(chunk?.x_groq?.usage);
235
262
  const choice = chunk?.choices?.[0];
236
263
  if (!choice) {
237
- if (chunk?.usage) {
238
- usage = {
239
- inputTokens: chunk.usage.prompt_tokens,
240
- outputTokens: chunk.usage.completion_tokens,
241
- };
242
- }
264
+ readUsage(chunk?.usage);
243
265
  continue;
244
266
  }
267
+ readUsage(choice?.usage);
245
268
  const delta = choice.delta || {};
246
269
 
247
270
  if (typeof delta.content === 'string' && delta.content.length > 0) {
@@ -267,19 +290,15 @@ export async function* streamOpenAICompletions(req: PiStreamRequest): AsyncItera
267
290
  }
268
291
 
269
292
  if (choice.finish_reason) lastFinish = choice.finish_reason;
270
- if (chunk?.usage) {
271
- usage = {
272
- inputTokens: chunk.usage.prompt_tokens,
273
- outputTokens: chunk.usage.completion_tokens,
274
- };
275
- }
293
+ readUsage(chunk?.usage);
276
294
  }
277
295
  } catch (err: any) {
278
296
  if (err?.name === 'AbortError') {
279
297
  yield { type: 'done', stopReason: 'aborted' };
280
298
  return;
281
299
  }
282
- yield { type: 'error', error: err?.message || String(err) };
300
+ const cls = classifyPiNetworkError('OpenAI-compat', err);
301
+ yield { type: 'error', error: cls.message, kind: cls.kind, retryable: cls.retryable };
283
302
  return;
284
303
  }
285
304
 
@@ -300,7 +319,24 @@ export async function* streamOpenAICompletions(req: PiStreamRequest): AsyncItera
300
319
  let input: any = {};
301
320
  if (partial.argsBuf) {
302
321
  try { input = JSON.parse(partial.argsBuf); }
303
- catch { input = { _raw: partial.argsBuf }; }
322
+ catch {
323
+ // Truncated/malformed tool-call JSON — almost always the output-token
324
+ // cap cutting the arguments mid-stream. Executing a fabricated {_raw}
325
+ // input produces misleading tool errors the model retries forever;
326
+ // fail the round loudly instead (the session strips dangling tool_use
327
+ // blocks from history on errored rounds).
328
+ const capped = lastFinish === 'length';
329
+ yield {
330
+ type: 'error',
331
+ error: capped
332
+ ? `The model's ${partial.name} call was cut off by the output-token limit (${req.maxOutputTokens ?? 8192} tokens) — the arguments did not fit. Try a smaller change, or raise the model's output budget.`
333
+ : `The model emitted a malformed ${partial.name} tool call (arguments were not valid JSON).`,
334
+ kind: 'other',
335
+ retryable: !capped,
336
+ };
337
+ yield { type: 'done', stopReason: 'error', usage };
338
+ return;
339
+ }
304
340
  }
305
341
  yield {
306
342
  type: 'tool_use',
@@ -44,20 +44,45 @@ export interface PiStreamRequest {
44
44
  tools?: PiToolDef[];
45
45
  /** Hard cap on output tokens for a single turn. */
46
46
  maxOutputTokens?: number;
47
+ /**
48
+ * Which request field carries the output cap on the openai-completions
49
+ * flavor. OpenAI's reasoning models (gpt-5.x, o-series) reject the legacy
50
+ * `max_tokens` — the openai-api sub-provider sets `max_completion_tokens`
51
+ * (accepted by ALL OpenAI models); other vendors stay on `max_tokens`.
52
+ */
53
+ maxTokensField?: 'max_tokens' | 'max_completion_tokens';
54
+ /**
55
+ * openai-completions flavor: set false for strict-schema vendors (Mistral)
56
+ * that 422 on the `stream_options.include_usage` opt-in. Default true.
57
+ */
58
+ includeStreamUsage?: boolean;
47
59
  /** Optional abort signal so the session can interrupt in-flight requests. */
48
60
  signal?: AbortSignal;
49
61
  }
50
62
 
51
63
  export type PiStopReason = 'end_turn' | 'tool_use' | 'max_tokens' | 'error' | 'aborted';
52
64
 
65
/**
 * Coarse error classification so the session/harness can react without
 * string-matching: retry transient rounds, tear down on auth/overflow, and
 * show actionable messages instead of raw provider JSON.
 *
 * As assigned by classifyPiError / classifyPiNetworkError:
 * - 'auth'             — HTTP 401, or a credential-rejection phrase in the body
 * - 'context-overflow' — HTTP 400/413 with a context-window phrase in the body
 * - 'rate-limit'       — HTTP 429 (marked retryable)
 * - 'billing'          — HTTP 402, or an out-of-credit phrase in the body
 * - 'transient'        — HTTP 408 / >= 500, or a network-level failure (marked retryable)
 * - 'other'            — anything that matched none of the above
 */
export type PiErrorKind = 'auth' | 'context-overflow' | 'rate-limit' | 'billing' | 'transient' | 'other';
71
+
53
72
  export type PiStreamEvent =
54
73
  | { type: 'text_delta'; delta: string }
55
74
  | { type: 'text_end'; text: string }
56
75
  | { type: 'tool_use'; id: string; name: string; input: any; thoughtSignature?: string }
57
76
  | { type: 'done'; stopReason: PiStopReason; usage?: PiUsage }
58
- | { type: 'error'; error: string };
77
+ | { type: 'error'; error: string; status?: number; kind?: PiErrorKind; retryable?: boolean };
59
78
 
60
79
  export interface PiUsage {
80
+ /** Non-cached prompt tokens. NOTE: Anthropic's input_tokens EXCLUDES cache
81
+ * reads/writes — prompt occupancy is input + cacheRead + cacheCreation
82
+ * (Gemini's promptTokenCount and OpenAI's prompt_tokens already include
83
+ * cached tokens, so their providers leave the cache fields unset). */
61
84
  inputTokens?: number;
62
85
  outputTokens?: number;
86
+ cacheReadTokens?: number;
87
+ cacheCreationTokens?: number;
63
88
  }