bloby-bot 0.70.12 → 0.71.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/bin/cli.js +234 -48
  2. package/dist-bloby/assets/{bloby-DSNB0g4w.js → bloby-es6cZJzs.js} +6 -6
  3. package/dist-bloby/assets/globals-DBqwNiJV.css +2 -0
  4. package/dist-bloby/assets/{globals-B3cTbITX.js → globals-DN3F0CQE.js} +1 -1
  5. package/dist-bloby/assets/{highlighted-body-OFNGDK62-BLforpkr.js → highlighted-body-OFNGDK62-8PiOHw9p.js} +1 -1
  6. package/dist-bloby/assets/mermaid-GHXKKRXX-BJWX8urU.js +1 -0
  7. package/dist-bloby/assets/{onboard-Dn2Ws_G2.js → onboard-BKgy17OU.js} +1 -1
  8. package/dist-bloby/bloby.html +3 -3
  9. package/dist-bloby/onboard.html +3 -3
  10. package/package.json +3 -4
  11. package/scripts/install +156 -41
  12. package/scripts/install.ps1 +146 -29
  13. package/scripts/install.sh +156 -41
  14. package/shared/config.ts +37 -2
  15. package/shared/relay.ts +3 -1
  16. package/supervisor/channels/manager.ts +84 -44
  17. package/supervisor/channels/telegram.ts +57 -16
  18. package/supervisor/channels/types.ts +4 -1
  19. package/supervisor/channels/whatsapp.ts +57 -10
  20. package/supervisor/chat/OnboardWizard.tsx +0 -15
  21. package/supervisor/chat/src/components/Chat/AudioBubble.tsx +1 -1
  22. package/supervisor/chat/src/components/Chat/AuthedImage.tsx +16 -3
  23. package/supervisor/chat/src/components/Chat/BlobyImageCard.tsx +2 -2
  24. package/supervisor/chat/src/components/Chat/ImageLightbox.tsx +25 -8
  25. package/supervisor/chat/src/components/Chat/InputBar.tsx +62 -7
  26. package/supervisor/chat/src/components/Chat/MessageBubble.tsx +37 -18
  27. package/supervisor/chat/src/components/Chat/MessageList.tsx +3 -3
  28. package/supervisor/chat/src/hooks/useChat.ts +52 -0
  29. package/supervisor/chat/src/lib/authedFile.ts +24 -12
  30. package/supervisor/file-saver.ts +92 -19
  31. package/supervisor/harnesses/attachment-policy.ts +111 -0
  32. package/supervisor/harnesses/claude.ts +62 -15
  33. package/supervisor/harnesses/codex.ts +69 -43
  34. package/supervisor/harnesses/pi/index.ts +367 -112
  35. package/supervisor/harnesses/pi/providers/humanize-error.ts +27 -2
  36. package/supervisor/harnesses/pi/providers/retry.ts +31 -0
  37. package/supervisor/harnesses/pi/providers/stream-anthropic.ts +31 -3
  38. package/supervisor/harnesses/pi/providers/stream-google.ts +26 -3
  39. package/supervisor/harnesses/pi/providers/stream-openai-completions.ts +32 -9
  40. package/supervisor/harnesses/pi/providers/types.ts +29 -1
  41. package/supervisor/harnesses/pi/session.ts +143 -3
  42. package/supervisor/harnesses/pi/test-completion.ts +56 -0
  43. package/supervisor/harnesses/pi/tools/bash.ts +198 -22
  44. package/supervisor/harnesses/pi/tools/glob.ts +79 -0
  45. package/supervisor/harnesses/pi/tools/grep.ts +0 -0
  46. package/supervisor/harnesses/pi/tools/registry.ts +18 -6
  47. package/supervisor/harnesses/pi/tools/todo-write.ts +45 -0
  48. package/supervisor/harnesses/pi/tools/web-fetch.ts +129 -0
  49. package/supervisor/index.ts +93 -18
  50. package/supervisor/widget.js +19 -5
  51. package/worker/db.ts +2 -0
  52. package/worker/index.ts +18 -1
  53. package/worker/prompts/bloby-system-prompt-codex.txt +1 -1
  54. package/worker/prompts/bloby-system-prompt-pi.txt +6 -24
  55. package/worker/prompts/bloby-system-prompt.txt +1 -1
  56. package/workspace/client/src/components/Dashboard/DashboardPage.tsx +4 -117
  57. package/workspace/client/src/components/Dashboard/deleteme_placeholders.tsx +194 -0
  58. package/workspace/client/src/components/Layout/Sidebar.tsx +52 -30
  59. package/workspace/client/src/components/deleteme_onboarding/WorkspaceTour.tsx +25 -15
  60. package/workspace/client/src/components/deleteme_onboarding/tour-theme.css +24 -0
  61. package/workspace/skills/mac/SKILL.md +13 -4
  62. package/dist-bloby/assets/globals-DyeW509Y.css +0 -2
  63. package/dist-bloby/assets/mermaid-GHXKKRXX-C1H_fSCU.js +0 -1
  64. package/supervisor/public/headphones_spritesheet.webp +0 -0
  65. package/supervisor/public/spritesheet.webp +0 -0
@@ -48,6 +48,18 @@ const AUTH_RE =
48
48
  const BILLING_RE =
49
49
  /insufficient_quota|credit balance is too low|payment required|purchase more credits/i;
50
50
 
51
+ // A text-only model rejecting an attached image. Vendors phrase it many ways:
52
+ // OpenAI "Invalid content type. image_url is only supported by certain models",
53
+ // OpenRouter "No endpoints found that support image input", others mention
54
+ // "image input" / "does not support images" / "unsupported content type".
55
+ // Only EXPLICIT image-naming phrases — the bare tokens "vision"/"multimodal"/
56
+ // "modality" were removed because the provider body routinely echoes the model id
57
+ // (e.g. "gpt-4-vision-preview", "llama-3.2-90b-vision-instruct"), which would
58
+ // mis-classify an unrelated 400 from a vision-capable model and wrongly disable
59
+ // vision for the rest of the session. Paired with a 400/415/422 status below.
60
+ const IMAGE_UNSUPPORTED_RE =
61
+ /image[_ ]?url|image input|images?(?: are| is)? not supported|does not support images?|no endpoints? .*support image|unsupported content type/i;
62
+
51
63
  export function classifyPiError(
52
64
  providerLabel: string,
53
65
  status: number | undefined,
@@ -85,6 +97,19 @@ export function classifyPiError(
85
97
  message: `${providerLabel} rejected your API key. Update it from the dashboard (Bloby provider settings).${suffix}`,
86
98
  };
87
99
  }
100
+ // A text-only model that the catalog couldn't flag up front (dynamic/unknown
101
+ // sub-providers) 400/415/422s on the attached image. The session reacts by
102
+ // disabling vision for the rest of the session and re-running the round with
103
+ // images downgraded — self-healing so a single screenshot can't permanently
104
+ // 400-poison the conversation (it rides every stateless resend otherwise).
105
+ if ((status === 400 || status === 415 || status === 422) && IMAGE_UNSUPPORTED_RE.test(body)) {
106
+ return {
107
+ kind: 'image-unsupported',
108
+ retryable: false,
109
+ status,
110
+ message: `${providerLabel} rejected the attached image — this model appears to be text-only. Retrying without the image; switch to a vision-capable model to send images.${suffix}`,
111
+ };
112
+ }
88
113
  if (status === 429) {
89
114
  return {
90
115
  kind: 'rate-limit',
@@ -113,8 +138,8 @@ export function classifyPiError(
113
138
  export function classifyPiNetworkError(providerLabel: string, err: any): ClassifiedPiError {
114
139
  const raw = err?.message || String(err);
115
140
  // undici's body/headers timeouts surface as the famously cryptic 'terminated'
116
- // and 'Headers Timeout Error' translate them.
117
- const stalled = /terminated|timeout/i.test(raw);
141
+ // and 'Headers Timeout Error'; our own SSE idle guard says 'stalled'.
142
+ const stalled = /terminated|timeout|stalled/i.test(raw);
118
143
  return {
119
144
  kind: 'transient',
120
145
  retryable: true,
@@ -51,6 +51,37 @@ export function sleep(ms: number, signal?: AbortSignal): Promise<void> {
51
51
  });
52
52
  }
53
53
 
54
/**
 * Per-chunk SSE idle guard (audit D6-7). Without it, a stalled-but-open
 * stream waits ~300s for Node's undici body timeout and then surfaces a
 * cryptic 'terminated'. 120s is generous: Anthropic pings every ~20s and
 * Gemini/OpenAI chunk every few seconds while healthy.
 */
export const SSE_IDLE_TIMEOUT_MS = 120_000;

/**
 * Performs a single `reader.read()` raced against the SSE idle timeout.
 *
 * @param reader        Anything exposing `read()` and, optionally, `cancel()`.
 * @param providerLabel Human-readable provider name used in the stall message.
 * @returns Whatever `reader.read()` resolves to.
 * @throws The read's own error, or a "stream stalled" Error when no data
 *         arrives within {@link SSE_IDLE_TIMEOUT_MS}. In both cases the reader
 *         is cancelled best-effort before the error is re-thrown.
 */
export async function readWithIdleTimeout<T>(
  reader: { read(): Promise<T>; cancel?: (reason?: any) => Promise<void> | void },
  providerLabel: string,
): Promise<T> {
  let timer: ReturnType<typeof setTimeout> | undefined;
  try {
    // Start the read INSIDE the try so a synchronous throw still reaches the
    // cancel/cleanup paths, and before the timer is armed so nothing can leak.
    const readP = reader.read();
    // Mark the losing read promise handled so a post-timeout rejection (after
    // reader.cancel) never surfaces as an unhandledRejection.
    readP.catch?.(() => {});
    const timeoutP = new Promise<never>((_, reject) => {
      timer = setTimeout(
        () => reject(new Error(`${providerLabel} stream stalled — no data received for ${SSE_IDLE_TIMEOUT_MS / 1000}s.`)),
        SSE_IDLE_TIMEOUT_MS,
      );
    });
    return await Promise.race([readP, timeoutP]);
  } catch (err) {
    try {
      // cancel() may throw synchronously OR return a rejecting promise; both
      // are swallowed — cancellation is best-effort and must not mask `err`
      // or raise an unhandledRejection of its own.
      void Promise.resolve(reader.cancel?.()).catch(() => {});
    } catch {}
    throw err;
  } finally {
    if (timer !== undefined) clearTimeout(timer);
  }
}
84
+
54
85
  export async function fetchWithRetry(
55
86
  url: string,
56
87
  init: RequestInit & { signal?: AbortSignal },
@@ -20,7 +20,7 @@ import type {
20
20
  PiStopReason,
21
21
  PiUsage,
22
22
  } from './types.js';
23
- import { fetchWithRetry } from './retry.js';
23
+ import { fetchWithRetry, readWithIdleTimeout } from './retry.js';
24
24
  import { classifyPiError, classifyPiNetworkError } from './humanize-error.js';
25
25
 
26
26
  /* ── SSE parser (shares the LF/CRLF-tolerant pattern from the other providers) ── */
@@ -32,7 +32,7 @@ async function* parseSse(res: Response): AsyncIterable<any> {
32
32
  let buffer = '';
33
33
  try {
34
34
  while (true) {
35
- const { value, done } = await reader.read();
35
+ const { value, done } = await readWithIdleTimeout(reader, 'Anthropic');
36
36
  if (done) break;
37
37
  buffer += decoder.decode(value, { stream: true });
38
38
  let idx;
@@ -79,12 +79,24 @@ function toAnthropicContent(blocks: PiContentBlock[]): any[] {
79
79
  const out: any[] = [];
80
80
  for (const b of blocks) {
81
81
  if (b.type === 'text') {
82
+ // The Messages API rejects empty/whitespace-only text blocks ("text
83
+ // content blocks must be non-empty") — drop them; an all-empty message
84
+ // is then filtered by the content-length guards in toAnthropicMessages.
85
+ if (!b.text || !b.text.trim()) continue;
82
86
  out.push({ type: 'text', text: b.text });
83
87
  } else if (b.type === 'image') {
84
88
  out.push({
85
89
  type: 'image',
86
90
  source: { type: 'base64', media_type: b.mediaType, data: b.data },
87
91
  });
92
+ } else if (b.type === 'document') {
93
+ // Native PDF document block — the Messages API renders the pages and the
94
+ // model reads them as vision. The base64 document source accepts ONLY
95
+ // application/pdf (buildUserMessage gates it on canNativeDocument).
96
+ out.push({
97
+ type: 'document',
98
+ source: { type: 'base64', media_type: b.mediaType, data: b.data },
99
+ });
88
100
  } else if (b.type === 'tool_use') {
89
101
  out.push({
90
102
  type: 'tool_use',
@@ -105,13 +117,19 @@ function toAnthropicContent(blocks: PiContentBlock[]): any[] {
105
117
  }
106
118
 
107
119
/**
 * Converts Pi history into Messages-API messages.
 *
 * Messages with no content — either before or after block conversion — are
 * dropped, every non-assistant role is sent as 'user', and any leading
 * non-user messages are removed.
 */
function toAnthropicMessages(pi: PiMessage[]): any[] {
  // Pass 1: skip messages that are already empty.
  const nonEmpty = pi.filter((m) => m.content.length > 0);

  // Pass 2: convert, then skip messages whose blocks were all dropped.
  const converted: any[] = [];
  for (const m of nonEmpty) {
    const content = toAnthropicContent(m.content);
    if (content.length === 0) continue;
    converted.push({ role: m.role === 'assistant' ? 'assistant' : 'user', content });
  }

  // The Messages API requires the first message to be user-role. Rolling
  // history windows (customer buffers) are trimmed user-first at the source
  // (channels/manager.ts trimCustomerBuffer), but defend here too — a leading
  // assistant message 400s the whole request (audit C-7).
  const firstUser = converted.findIndex((m) => m.role === 'user');
  return firstUser === -1 ? [] : converted.slice(firstUser);
}
116
134
 
117
135
  function toAnthropicTools(tools: { name: string; description: string; inputSchema: Record<string, any> }[]) {
@@ -166,6 +184,9 @@ export async function* streamAnthropic(req: PiStreamRequest): AsyncIterable<PiSt
166
184
  if (req.tools && req.tools.length > 0) {
167
185
  body.tools = toAnthropicTools(req.tools);
168
186
  body.tools[body.tools.length - 1].cache_control = { type: 'ephemeral' };
187
+ // Round-cap wrap-up: forbid further tool calls; tools stay declared so
188
+ // tool_use/tool_result blocks in history remain valid.
189
+ if (req.toolChoice === 'none') body.tool_choice = { type: 'none' };
169
190
  }
170
191
  if (Array.isArray(body.messages) && body.messages.length > 0) {
171
192
  const lastContent = body.messages[body.messages.length - 1].content;
@@ -213,6 +234,7 @@ export async function* streamAnthropic(req: PiStreamRequest): AsyncIterable<PiSt
213
234
  let usage: PiUsage | undefined;
214
235
  let chunkCount = 0;
215
236
  let firstChunkSummary = '';
237
+ let thinkingEmitted = false;
216
238
 
217
239
  try {
218
240
  for await (const evt of parseSse(res)) {
@@ -250,6 +272,12 @@ export async function* streamAnthropic(req: PiStreamRequest): AsyncIterable<PiSt
250
272
  toolArgsBuf: '',
251
273
  });
252
274
  } else {
275
+ // Extended-thinking blocks (not requested today, future-proofed):
276
+ // one liveness pulse, text never forwarded.
277
+ if (block.type === 'thinking' && !thinkingEmitted) {
278
+ thinkingEmitted = true;
279
+ yield { type: 'thinking' };
280
+ }
253
281
  blocks.set(idx, { kind: 'other' });
254
282
  }
255
283
  break;
@@ -18,7 +18,7 @@ import type {
18
18
  PiStopReason,
19
19
  PiUsage,
20
20
  } from './types.js';
21
- import { fetchWithRetry } from './retry.js';
21
+ import { fetchWithRetry, readWithIdleTimeout } from './retry.js';
22
22
  import { classifyPiError, classifyPiNetworkError } from './humanize-error.js';
23
23
 
24
24
  /** Walk an SSE byte stream and yield each parsed JSON event. */
@@ -30,7 +30,7 @@ async function* parseSse(res: Response, dbg: { firstBytes: string }): AsyncItera
30
30
  let totalBytes = 0;
31
31
  try {
32
32
  while (true) {
33
- const { value, done } = await reader.read();
33
+ const { value, done } = await readWithIdleTimeout(reader, 'Google Gemini');
34
34
  if (done) break;
35
35
  if (value) totalBytes += value.byteLength;
36
36
  buffer += decoder.decode(value, { stream: true });
@@ -102,6 +102,11 @@ function toGeminiParts(content: PiContentBlock[]): any[] {
102
102
  parts.push({ text: b.text });
103
103
  } else if (b.type === 'image') {
104
104
  parts.push({ inlineData: { mimeType: b.mediaType, data: b.data } });
105
+ } else if (b.type === 'document') {
106
+ // Gemini ingests application/pdf inline via the same inlineData shape as
107
+ // images (it OCRs/renders the document). buildUserMessage only routes a
108
+ // document block here when the flavor supports it.
109
+ parts.push({ inlineData: { mimeType: b.mediaType, data: b.data } });
105
110
  } else if (b.type === 'tool_use') {
106
111
  // Assistant turn: the model asked to invoke a tool. Thinking-capable
107
112
  // Gemini 3.x rejects (HTTP 400) any echoed functionCall whose
@@ -203,11 +208,25 @@ export async function* streamGoogle(req: PiStreamRequest): AsyncIterable<PiStrea
203
208
  maxOutputTokens: req.maxOutputTokens ?? 32768,
204
209
  },
205
210
  };
211
+ // Thinking-capable families (2.5+/3.x): ask for thought summaries so the
212
+ // harness can emit a liveness pulse — without this, Gemini 3 burns its
213
+ // output budget on invisible reasoning and the chat looks hung. Gated by
214
+ // model id; unknown/dynamic ids skip it (older models reject the field).
215
+ // The rolling aliases (gemini-flash-latest / gemini-flash-lite-latest)
216
+ // resolve to 2.5+/3.x thinking models too (review PI-D-2).
217
+ if (/gemini-(2\.5|[3-9]|flash(-lite)?-latest)/i.test(req.modelId)) {
218
+ body.generationConfig.thinkingConfig = { includeThoughts: true };
219
+ }
206
220
  if (req.systemPrompt?.trim()) {
207
221
  body.systemInstruction = { parts: [{ text: req.systemPrompt }] };
208
222
  }
209
223
  if (req.tools && req.tools.length > 0) {
210
224
  body.tools = toGeminiTools(req.tools);
225
+ // Round-cap wrap-up: forbid further function calls; tools stay declared so
226
+ // functionCall/functionResponse parts in history remain valid.
227
+ if (req.toolChoice === 'none') {
228
+ body.toolConfig = { functionCallingConfig: { mode: 'NONE' } };
229
+ }
211
230
  }
212
231
 
213
232
  let res: Response;
@@ -263,7 +282,11 @@ export async function* streamGoogle(req: PiStreamRequest): AsyncIterable<PiStrea
263
282
  for (const part of parts) {
264
283
  // Thinking models emit reasoning parts with `thought: true`. They
265
284
  // shouldn't be shown to the user as part of the visible answer.
266
- if (part?.thought) { thoughtPartCount++; continue; }
285
+ if (part?.thought) {
286
+ thoughtPartCount++;
287
+ if (thoughtPartCount === 1) yield { type: 'thinking' };
288
+ continue;
289
+ }
267
290
  if (part?.functionCall && typeof part.functionCall.name === 'string') {
268
291
  // Gemini doesn't surface a tool-call id of its own; bake the tool
269
292
  // name into the id so the session can echo it back as a
@@ -18,7 +18,7 @@ import type {
18
18
  PiStopReason,
19
19
  PiUsage,
20
20
  } from './types.js';
21
- import { fetchWithRetry } from './retry.js';
21
+ import { fetchWithRetry, readWithIdleTimeout } from './retry.js';
22
22
  import { classifyPiError, classifyPiNetworkError } from './humanize-error.js';
23
23
 
24
24
  /* ── SSE parser (LF or CRLF tolerant, flushes the trailing event) ── */
@@ -30,7 +30,7 @@ async function* parseSse(res: Response): AsyncIterable<any> {
30
30
  let buffer = '';
31
31
  try {
32
32
  while (true) {
33
- const { value, done } = await reader.read();
33
+ const { value, done } = await readWithIdleTimeout(reader, 'OpenAI-compat');
34
34
  if (done) break;
35
35
  buffer += decoder.decode(value, { stream: true });
36
36
  let idx;
@@ -119,24 +119,33 @@ function toOpenAIMessages(pi: PiMessage[]): any[] {
119
119
  out.push(msg);
120
120
  continue;
121
121
  }
122
- // role === 'user' with non-tool-result content (text + optional images)
122
+ // role === 'user' with non-tool-result content (text + optional images).
123
+ // Media parts go first; text is appended last (parity with the other
124
+ // providers and pi/index's media-first block ordering).
123
125
  const contentBlocks: any[] = [];
124
126
  let plainText = '';
125
- let hasImage = false;
127
+ let hasMedia = false;
126
128
  for (const b of m.content) {
127
129
  if (b.type === 'text') {
128
130
  plainText += (plainText ? '\n' : '') + b.text;
129
131
  } else if (b.type === 'image') {
130
- hasImage = true;
132
+ hasMedia = true;
131
133
  contentBlocks.push({
132
134
  type: 'image_url',
133
135
  image_url: { url: `data:${b.mediaType};base64,${b.data}` },
134
136
  });
137
+ } else if (b.type === 'document') {
138
+ // The Chat Completions schema has no document part — degrade to a text
139
+ // note rather than crashing. The file is also on disk (saved-files
140
+ // note), so the agent can open it with its tools. This shouldn't
141
+ // normally happen: buildUserMessage gates documents on canNativeDocument
142
+ // (false for this flavor), so a PDF here rides as the disk pointer.
143
+ plainText += (plainText ? '\n' : '') +
144
+ `[Attached document${b.name ? ` "${b.name}"` : ''} (${b.mediaType}) could not be inlined for this model — it is saved to disk; open it with your file tools.]`;
135
145
  }
136
146
  }
137
- if (hasImage) {
138
- // Mixed image+text: prepend text part to the content array.
139
- if (plainText) contentBlocks.unshift({ type: 'text', text: plainText });
147
+ if (hasMedia) {
148
+ if (plainText) contentBlocks.push({ type: 'text', text: plainText });
140
149
  out.push({ role: 'user', content: contentBlocks });
141
150
  } else {
142
151
  out.push({ role: 'user', content: plainText });
@@ -203,7 +212,10 @@ export async function* streamOpenAICompletions(req: PiStreamRequest): AsyncItera
203
212
  }
204
213
  if (req.tools && req.tools.length > 0) {
205
214
  body.tools = toOpenAITools(req.tools);
206
- body.tool_choice = 'auto';
215
+ // 'none' = the round-cap wrap-up round: the model must summarize, not
216
+ // start more work. Tools stay declared so histories containing tool calls
217
+ // remain valid.
218
+ body.tool_choice = req.toolChoice === 'none' ? 'none' : 'auto';
207
219
  }
208
220
 
209
221
  let res: Response;
@@ -243,6 +255,7 @@ export async function* streamOpenAICompletions(req: PiStreamRequest): AsyncItera
243
255
  const toolCallsByIndex = new Map<number, PartialToolCall>();
244
256
  let chunkCount = 0;
245
257
  let firstChunkSummary = '';
258
+ let thinkingEmitted = false;
246
259
 
247
260
  // Vendors disagree on where streamed usage lives: spec says a final
248
261
  // choice-less chunk's `usage`, Groq defaults to nesting under `x_groq.usage`,
@@ -267,6 +280,16 @@ export async function* streamOpenAICompletions(req: PiStreamRequest): AsyncItera
267
280
  readUsage(choice?.usage);
268
281
  const delta = choice.delta || {};
269
282
 
283
+ // Reasoning models stream hidden thinking under vendor-specific fields
284
+ // (DeepSeek/OpenRouter: reasoning_content; others: reasoning /
285
+ // reasoning_text — upstream pi's field priority). Emit ONE liveness
286
+ // pulse so the chat doesn't look hung; never forward the text itself.
287
+ const reasoningDelta = delta.reasoning_content ?? delta.reasoning ?? delta.reasoning_text;
288
+ if (!thinkingEmitted && typeof reasoningDelta === 'string' && reasoningDelta.length > 0) {
289
+ thinkingEmitted = true;
290
+ yield { type: 'thinking' };
291
+ }
292
+
270
293
  if (typeof delta.content === 'string' && delta.content.length > 0) {
271
294
  accumulated += delta.content;
272
295
  yield { type: 'text_delta', delta: delta.content };
@@ -17,6 +17,11 @@ export type PiRole = 'user' | 'assistant' | 'tool';
17
17
  export type PiContentBlock =
18
18
  | { type: 'text'; text: string }
19
19
  | { type: 'image'; mediaType: string; data: string } // base64
20
+ // Native document block (PDF). Only the flavors with native document support
21
+ // (anthropic-messages, google-gemini) ever receive one — buildUserMessage
22
+ // gates it on canNativeDocument; openai-completions degrades it to a text
23
+ // note rather than crashing if one ever reaches it.
24
+ | { type: 'document'; mediaType: string; data: string; name?: string } // base64
20
25
  // `thoughtSignature` is a Gemini 3.x thinking-model field. Pi-flavored
21
26
  // providers that emit reasoning attach it to function-call parts; the API
22
27
  // rejects the next turn with HTTP 400 if we don't echo it back verbatim.
@@ -56,6 +61,13 @@ export interface PiStreamRequest {
56
61
  * that 422 on the `stream_options.include_usage` opt-in. Default true.
57
62
  */
58
63
  includeStreamUsage?: boolean;
64
+ /**
65
+ * 'none' forbids tool calls for this request (mapped per flavor: OpenAI
66
+ * tool_choice:'none', Anthropic {type:'none'}, Gemini functionCallingConfig
67
+ * mode NONE). Used by the session's round-cap wrap-up round, where the model
68
+ * must summarize instead of starting more work.
69
+ */
70
+ toolChoice?: 'auto' | 'none';
59
71
  /** Optional abort signal so the session can interrupt in-flight requests. */
60
72
  signal?: AbortSignal;
61
73
  }
@@ -67,11 +79,27 @@ export type PiStopReason = 'end_turn' | 'tool_use' | 'max_tokens' | 'error' | 'a
67
79
  * string-matching: retry transient rounds, tear down on auth/overflow, and
68
80
  * show actionable messages instead of raw provider JSON.
69
81
  */
70
- export type PiErrorKind = 'auth' | 'context-overflow' | 'rate-limit' | 'billing' | 'transient' | 'other';
82
+ export type PiErrorKind =
83
+ | 'auth'
84
+ | 'context-overflow'
85
+ | 'rate-limit'
86
+ | 'billing'
87
+ | 'transient'
88
+ /** The model rejected an image/vision/modality block (a text-only model 400/
89
+ * 415/422s on the attached image). The session reacts by disabling vision
90
+ * for the rest of the session and re-running the round with images
91
+ * downgraded to placeholders — self-healing for dynamic/unknown models whose
92
+ * catalog can't tell us up front whether they see images. */
93
+ | 'image-unsupported'
94
+ | 'other';
71
95
 
72
96
  export type PiStreamEvent =
73
97
  | { type: 'text_delta'; delta: string }
74
98
  | { type: 'text_end'; text: string }
99
+ /** Emitted when the model starts (visibly) reasoning — a liveness pulse for
100
+ * thinking models so the chat doesn't look hung. Reasoning TEXT is never
101
+ * forwarded (it would corrupt the streamed-text == response contract). */
102
+ | { type: 'thinking' }
75
103
  | { type: 'tool_use'; id: string; name: string; input: any; thoughtSignature?: string }
76
104
  | { type: 'done'; stopReason: PiStopReason; usage?: PiUsage }
77
105
  | { type: 'error'; error: string; status?: number; kind?: PiErrorKind; retryable?: boolean };
@@ -44,6 +44,8 @@ export type PiSessionEvent =
44
44
  | { type: 'turn_started' }
45
45
  | { type: 'text_delta'; delta: string }
46
46
  | { type: 'text_end'; text: string }
47
+ /** Liveness pulse: the model is reasoning (thinking models) — no text attached. */
48
+ | { type: 'thinking' }
47
49
  | { type: 'tool_use'; id: string; name: string; input: any }
48
50
  | { type: 'tool_result'; toolUseId: string; name: string; isError?: boolean }
49
51
  | {
@@ -78,6 +80,10 @@ export interface PiSessionAuth {
78
80
  includeStreamUsage?: boolean;
79
81
  /** Model context window from the catalog — reported on turn_complete for the recycler. */
80
82
  contextWindow?: number;
83
+ /** False when the catalog says the model is text-only — image blocks are
84
+ * downgraded to placeholders on send so one screenshot can't 400-poison
85
+ * the session (audit C-8). Undefined (dynamic models) ⇒ assume vision. */
86
+ supportsImages?: boolean;
81
87
  }
82
88
 
83
89
  export interface PiSessionInit {
@@ -117,6 +123,51 @@ export interface PiSession {
117
123
  getMessages(): PiMessage[];
118
124
  }
119
125
 
126
/**
 * Transform-on-send for text-only models (audit C-8).
 *
 * Returns a view of `messages` in which every image block is replaced by a
 * text placeholder. The input is never mutated: messages without images are
 * reused by reference, messages with images are shallow-copied, and when no
 * message contains an image the original array itself is returned. The stored
 * history therefore keeps its images, so switching to a vision model later
 * restores them.
 */
function downgradeImages(messages: PiMessage[]): PiMessage[] {
  let replacedAny = false;
  const rewritten: PiMessage[] = [];

  for (const message of messages) {
    let hasImage = false;
    for (const block of message.content) {
      if (block.type === 'image') {
        hasImage = true;
        break;
      }
    }
    if (!hasImage) {
      rewritten.push(message);
      continue;
    }

    replacedAny = true;
    const content: PiContentBlock[] = [];
    for (const block of message.content) {
      if (block.type === 'image') {
        content.push({ type: 'text', text: '[An image was attached here, but the current model cannot view images. Tell the user to switch to a vision-capable model if the image matters.]' });
      } else {
        content.push(block);
      }
    }
    rewritten.push({ ...message, content });
  }

  return replacedAny ? rewritten : messages;
}
145
+
146
/**
 * Emergency in-turn context relief (audit D2-6): when occupancy crosses the
 * threshold MID-turn (recycling only acts between idle turns), stub out the
 * oldest large tool_result payloads — never user/assistant text, never the
 * protected tail (the current round's results). Cruder than real compaction,
 * but the turn finishes instead of 400ing on the context wall.
 *
 * Mutates `messages` in place: each string tool_result payload longer than
 * 2048 chars found in a user-role message is replaced by a short stub.
 *
 * @param messages    History to trim, scanned oldest first.
 * @param charsToFree Target number of characters to reclaim; scanning stops
 *                    once it is reached. Zero or negative trims nothing.
 * @param protectTail Number of trailing messages that are never touched.
 *                    Negative values are treated as 0.
 * @returns The NET number of characters reclaimed (original payload length
 *          minus the stub left behind), so callers are not told more was
 *          freed than actually was.
 */
function trimOldToolResults(messages: PiMessage[], charsToFree: number, protectTail: number): number {
  let freed = 0;
  // Clamp the tail so a negative protectTail cannot push the scan past the end.
  const limit = Math.max(0, messages.length - Math.max(0, protectTail));
  for (let i = 0; i < limit && freed < charsToFree; i++) {
    const m = messages[i];
    // Tool results are carried in user-role messages; assistant text is never trimmed.
    if (m.role !== 'user') continue;
    for (const b of m.content) {
      if (b.type !== 'tool_result' || typeof b.content !== 'string' || b.content.length <= 2048) continue;
      const originalLength = b.content.length;
      const stub = `[tool output trimmed to fit the context window — ~${Math.round(originalLength / 1024)} KB removed]`;
      b.content = stub;
      // Count only what was really reclaimed: the stub itself still occupies context.
      freed += originalLength - stub.length;
      if (freed >= charsToFree) break;
    }
  }
  return freed;
}
167
+
168
+ const ROUND_CAP_NOTICE =
169
+ '[System: the tool budget for this turn is exhausted. Stop working now. In 2-3 sentences, summarize what you completed, what remains, and the exact next step.]';
170
+
120
171
  const FILE_TOOL_NAMES = new Set(['Write', 'Edit', 'MultiEdit', 'NotebookEdit', 'write', 'edit', 'multiEdit', 'notebookEdit']);
121
172
  const MAX_TOOL_ROUNDS = 25;
122
173
  /** Transparent re-runs of a failed round that produced nothing (audit D6-1). */
@@ -130,6 +181,14 @@ export function createPiSession(init: PiSessionInit): PiSession {
130
181
  let lastUsage: PiUsage | undefined;
131
182
  let lastContextWindow: number | undefined;
132
183
 
184
+ // Self-healing vision (audit D rank 12): when a model the catalog couldn't
185
+ // classify (dynamic/unknown sub-providers ⇒ supportsImages undefined) rejects
186
+ // an image with an 'image-unsupported' error, latch this for the rest of the
187
+ // session and downgrade images on every subsequent send. The IMAGE stays in
188
+ // history (downgradeImages is transform-on-send only), so switching to a
189
+ // vision-capable model later restores it.
190
+ let visionDisabled = false;
191
+
133
192
  /** One stream round — collect the assistant blocks the model emits this pass. */
134
193
  interface RoundResult {
135
194
  text: string;
@@ -142,7 +201,7 @@ export function createPiSession(init: PiSessionInit): PiSession {
142
201
  retryable?: boolean;
143
202
  }
144
203
 
145
- async function runOneRound(emitSeparatorFirst: boolean): Promise<RoundResult> {
204
+ async function runOneRound(emitSeparatorFirst: boolean, opts?: { wrapUp?: boolean }): Promise<RoundResult> {
146
205
  const result: RoundResult = { text: '', toolUses: [], errored: false };
147
206
  let firstDelta = true;
148
207
  try {
@@ -153,8 +212,13 @@ export function createPiSession(init: PiSessionInit): PiSession {
153
212
  baseUrl: auth.baseUrl,
154
213
  apiKey: auth.apiKey,
155
214
  systemPrompt: init.systemPrompt,
156
- messages,
215
+ // Downgrade images when the catalog says text-only (supportsImages
216
+ // false) OR a prior round in THIS session learned it the hard way via
217
+ // an 'image-unsupported' error (visionDisabled). The stored history
218
+ // keeps the image so a later vision-capable model still restores it.
219
+ messages: auth.supportsImages === false || visionDisabled ? downgradeImages(messages) : messages,
157
220
  tools: init.tools,
221
+ toolChoice: opts?.wrapUp ? 'none' : undefined,
158
222
  maxOutputTokens: auth.maxOutputTokens,
159
223
  maxTokensField: auth.maxTokensField,
160
224
  includeStreamUsage: auth.includeStreamUsage,
@@ -182,6 +246,9 @@ export function createPiSession(init: PiSessionInit): PiSession {
182
246
  // at the end of the whole turn so the UI doesn't show half-answers.
183
247
  result.text = evt.text;
184
248
  break;
249
+ case 'thinking':
250
+ init.onEvent({ type: 'thinking' });
251
+ break;
185
252
  case 'tool_use':
186
253
  result.toolUses.push({
187
254
  id: evt.id,
@@ -189,7 +256,12 @@ export function createPiSession(init: PiSessionInit): PiSession {
189
256
  input: evt.input,
190
257
  thoughtSignature: evt.thoughtSignature,
191
258
  });
192
- init.onEvent({ type: 'tool_use', id: evt.id, name: evt.name, input: evt.input });
259
+ // Wrap-up rounds forbid tools (toolChoice 'none'); if a vendor
260
+ // ignores that, swallow the phantom call silently — it is never
261
+ // executed or persisted.
262
+ if (!opts?.wrapUp) {
263
+ init.onEvent({ type: 'tool_use', id: evt.id, name: evt.name, input: evt.input });
264
+ }
193
265
  break;
194
266
  case 'error':
195
267
  result.errored = true;
@@ -270,6 +342,21 @@ export function createPiSession(init: PiSessionInit): PiSession {
270
342
  res = await runOneRound(needsSeparator);
271
343
  }
272
344
 
345
+ // Self-healing vision (audit D rank 12): a model the catalog couldn't
346
+ // classify just 400/415/422'd on an attached image. Latch visionDisabled
347
+ // and re-run the round ONCE — runOneRound now downgrades images on send,
348
+ // so the resend succeeds. Guarded by !visionDisabled so it fires at most
349
+ // once per session; an image rides every stateless resend, so without
350
+ // this the whole conversation would keep re-400ing.
351
+ if (
352
+ res.errored && res.errorKind === 'image-unsupported' && !visionDisabled &&
353
+ !init.abortController.signal.aborted
354
+ ) {
355
+ log.info('[pi/session] model rejected image — disabling vision for this session and retrying without it');
356
+ visionDisabled = true;
357
+ res = await runOneRound(needsSeparator);
358
+ }
359
+
273
360
  const { text, toolUses, errored } = res;
274
361
 
275
362
  // Append whatever the model produced this round to history so subsequent
@@ -331,10 +418,63 @@ export function createPiSession(init: PiSessionInit): PiSession {
331
418
  messages.push({ role: 'user', content: toolResultBlocks });
332
419
  }
333
420
 
421
+ // Emergency in-turn context relief (audit D2-6): recycling only acts
422
+ // between idle turns, so a single heavy tool loop could cross the wall
423
+ // mid-turn. Above 85% occupancy, stub the oldest large tool outputs to
424
+ // bring the next request back toward 70%.
425
+ if (lastContextWindow && lastUsage) {
426
+ const occupancy =
427
+ (lastUsage.inputTokens || 0) + (lastUsage.cacheReadTokens || 0) + (lastUsage.cacheCreationTokens || 0);
428
+ if (occupancy > 0.85 * lastContextWindow) {
429
+ const charsToFree = (occupancy - Math.floor(0.7 * lastContextWindow)) * 4; // ~4 chars/token
430
+ const freed = trimOldToolResults(messages, charsToFree, 4);
431
+ if (freed > 0) {
432
+ log.info(`[pi/session] context at ${occupancy}/${lastContextWindow} tok mid-turn — trimmed ~${Math.round(freed / 1024)} KB of old tool output`);
433
+ }
434
+ }
435
+ }
436
+
334
437
  // No tool calls ⇒ the model is done with this turn.
335
438
  if (toolUses.length === 0) { roundCapHit = false; break; }
336
439
  }
337
440
 
441
+ // Round-cap wrap-up (audit D5-8): the budget ran out with the model still
442
+ // mid-task. Run ONE final no-tools round so the turn ends with an honest
443
+ // status summary instead of silent truncation. roundCapHit stays true on
444
+ // turn_complete — consumers still know the work is incomplete.
445
+ if (roundCapHit && !turnErrored && !init.abortController.signal.aborted) {
446
+ log.info(`[pi/session] tool-round budget (${maxRounds}) exhausted — running a no-tools wrap-up round`);
447
+ messages.push({ role: 'user', content: [{ type: 'text', text: ROUND_CAP_NOTICE }] });
448
+ const needsSeparator = accumulatedText.length > 0 && !accumulatedText.endsWith('\n');
449
+ const res = await runOneRound(needsSeparator, { wrapUp: true });
450
+ if (res.text) {
451
+ if (needsSeparator) accumulatedText += '\n\n';
452
+ accumulatedText += res.text;
453
+ messages.push({ role: 'assistant', content: [{ type: 'text', text: res.text }] });
454
+ } else {
455
+ // The notice was never answered — pop it so the NEXT turn doesn't
456
+ // open under a stale "stop working now" instruction (review PI-D-1).
457
+ const last = messages[messages.length - 1];
458
+ if (last?.role === 'user' && last.content.length === 1 &&
459
+ last.content[0].type === 'text' && last.content[0].text === ROUND_CAP_NOTICE) {
460
+ messages.pop();
461
+ }
462
+ }
463
+ // Fatal wrap-up failures (dead key / context wall) must still tear the
464
+ // session down, and a cap-hit turn with NO text at all must not end in
465
+ // total silence — claude surfaces error_max_turns and pi's one-shot
466
+ // paths guard this state too (PI-C-2). Set the turn-error fields so the
467
+ // standard emission below handles both (review PI-D-1).
468
+ if (res.errored && (res.errorKind === 'auth' || res.errorKind === 'context-overflow')) {
469
+ turnErrored = true;
470
+ turnErrorMsg = res.errorMsg;
471
+ turnErrorKind = res.errorKind;
472
+ } else if (!accumulatedText) {
473
+ turnErrored = true;
474
+ turnErrorMsg = `I hit my tool budget for this turn (${maxRounds} rounds) before finishing — say "continue" and I'll pick up where I left off.`;
475
+ }
476
+ }
477
+
338
478
  // Turn-end emission order (audit D6-2, mirrors claude.ts:394-401):
339
479
  // 1. text_end whenever ANY text streamed — even on errored turns, so the
340
480
  // partial the user watched is committed, persisted, and consumes its