bloby-bot 0.70.12 → 0.70.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44):
  1. package/bin/cli.js +11 -3
  2. package/dist-bloby/assets/{bloby-DSNB0g4w.js → bloby-CU9KhQdP.js} +4 -4
  3. package/dist-bloby/assets/globals-DlPtwiZL.css +2 -0
  4. package/dist-bloby/assets/{globals-B3cTbITX.js → globals-mGpojCOe.js} +1 -1
  5. package/dist-bloby/assets/{highlighted-body-OFNGDK62-BLforpkr.js → highlighted-body-OFNGDK62-D0Tm_wgU.js} +1 -1
  6. package/dist-bloby/assets/mermaid-GHXKKRXX-B95J3s3s.js +1 -0
  7. package/dist-bloby/assets/{onboard-Dn2Ws_G2.js → onboard-GfjHF9nm.js} +1 -1
  8. package/dist-bloby/bloby.html +3 -3
  9. package/dist-bloby/onboard.html +3 -3
  10. package/package.json +2 -2
  11. package/scripts/install +15 -7
  12. package/scripts/install.ps1 +35 -14
  13. package/scripts/install.sh +15 -7
  14. package/shared/relay.ts +3 -1
  15. package/supervisor/channels/manager.ts +16 -11
  16. package/supervisor/chat/OnboardWizard.tsx +0 -15
  17. package/supervisor/harnesses/pi/index.ts +320 -100
  18. package/supervisor/harnesses/pi/providers/humanize-error.ts +2 -2
  19. package/supervisor/harnesses/pi/providers/retry.ts +31 -0
  20. package/supervisor/harnesses/pi/providers/stream-anthropic.ts +23 -3
  21. package/supervisor/harnesses/pi/providers/stream-google.ts +21 -3
  22. package/supervisor/harnesses/pi/providers/stream-openai-completions.ts +17 -3
  23. package/supervisor/harnesses/pi/providers/types.ts +11 -0
  24. package/supervisor/harnesses/pi/session.ts +116 -3
  25. package/supervisor/harnesses/pi/test-completion.ts +56 -0
  26. package/supervisor/harnesses/pi/tools/bash.ts +198 -22
  27. package/supervisor/harnesses/pi/tools/glob.ts +79 -0
  28. package/supervisor/harnesses/pi/tools/grep.ts +0 -0
  29. package/supervisor/harnesses/pi/tools/registry.ts +18 -6
  30. package/supervisor/harnesses/pi/tools/todo-write.ts +45 -0
  31. package/supervisor/harnesses/pi/tools/web-fetch.ts +129 -0
  32. package/supervisor/index.ts +36 -2
  33. package/worker/index.ts +18 -1
  34. package/worker/prompts/bloby-system-prompt-codex.txt +1 -1
  35. package/worker/prompts/bloby-system-prompt-pi.txt +6 -24
  36. package/worker/prompts/bloby-system-prompt.txt +1 -1
  37. package/workspace/client/src/components/Dashboard/DashboardPage.tsx +4 -117
  38. package/workspace/client/src/components/Dashboard/deleteme_placeholders.tsx +194 -0
  39. package/workspace/client/src/components/Layout/Sidebar.tsx +52 -30
  40. package/workspace/client/src/components/deleteme_onboarding/WorkspaceTour.tsx +25 -15
  41. package/workspace/client/src/components/deleteme_onboarding/tour-theme.css +24 -0
  42. package/workspace/skills/mac/SKILL.md +13 -4
  43. package/dist-bloby/assets/globals-DyeW509Y.css +0 -2
  44. package/dist-bloby/assets/mermaid-GHXKKRXX-C1H_fSCU.js +0 -1
@@ -20,7 +20,7 @@ import type {
20
20
  PiStopReason,
21
21
  PiUsage,
22
22
  } from './types.js';
23
- import { fetchWithRetry } from './retry.js';
23
+ import { fetchWithRetry, readWithIdleTimeout } from './retry.js';
24
24
  import { classifyPiError, classifyPiNetworkError } from './humanize-error.js';
25
25
 
26
26
  /* ── SSE parser (shares the LF/CRLF-tolerant pattern from the other providers) ── */
@@ -32,7 +32,7 @@ async function* parseSse(res: Response): AsyncIterable<any> {
32
32
  let buffer = '';
33
33
  try {
34
34
  while (true) {
35
- const { value, done } = await reader.read();
35
+ const { value, done } = await readWithIdleTimeout(reader, 'Anthropic');
36
36
  if (done) break;
37
37
  buffer += decoder.decode(value, { stream: true });
38
38
  let idx;
@@ -79,6 +79,10 @@ function toAnthropicContent(blocks: PiContentBlock[]): any[] {
79
79
  const out: any[] = [];
80
80
  for (const b of blocks) {
81
81
  if (b.type === 'text') {
82
+ // The Messages API rejects empty/whitespace-only text blocks ("text
83
+ // content blocks must be non-empty") — drop them; an all-empty message
84
+ // is then filtered by the content-length guards in toAnthropicMessages.
85
+ if (!b.text || !b.text.trim()) continue;
82
86
  out.push({ type: 'text', text: b.text });
83
87
  } else if (b.type === 'image') {
84
88
  out.push({
@@ -105,13 +109,19 @@ function toAnthropicContent(blocks: PiContentBlock[]): any[] {
105
109
  }
106
110
 
107
111
  function toAnthropicMessages(pi: PiMessage[]): any[] {
108
- return pi
112
+ const msgs = pi
109
113
  .filter((m) => m.content.length > 0)
110
114
  .map((m) => ({
111
115
  role: m.role === 'assistant' ? 'assistant' : 'user',
112
116
  content: toAnthropicContent(m.content),
113
117
  }))
114
118
  .filter((m) => m.content.length > 0);
119
+ // The Messages API requires the first message to be user-role. Rolling
120
+ // history windows (customer buffers) are trimmed user-first at the source
121
+ // (channels/manager.ts trimCustomerBuffer), but defend here too — a leading
122
+ // assistant message 400s the whole request (audit C-7).
123
+ while (msgs.length > 0 && msgs[0].role !== 'user') msgs.shift();
124
+ return msgs;
115
125
  }
116
126
 
117
127
  function toAnthropicTools(tools: { name: string; description: string; inputSchema: Record<string, any> }[]) {
@@ -166,6 +176,9 @@ export async function* streamAnthropic(req: PiStreamRequest): AsyncIterable<PiSt
166
176
  if (req.tools && req.tools.length > 0) {
167
177
  body.tools = toAnthropicTools(req.tools);
168
178
  body.tools[body.tools.length - 1].cache_control = { type: 'ephemeral' };
179
+ // Round-cap wrap-up: forbid further tool calls; tools stay declared so
180
+ // tool_use/tool_result blocks in history remain valid.
181
+ if (req.toolChoice === 'none') body.tool_choice = { type: 'none' };
169
182
  }
170
183
  if (Array.isArray(body.messages) && body.messages.length > 0) {
171
184
  const lastContent = body.messages[body.messages.length - 1].content;
@@ -213,6 +226,7 @@ export async function* streamAnthropic(req: PiStreamRequest): AsyncIterable<PiSt
213
226
  let usage: PiUsage | undefined;
214
227
  let chunkCount = 0;
215
228
  let firstChunkSummary = '';
229
+ let thinkingEmitted = false;
216
230
 
217
231
  try {
218
232
  for await (const evt of parseSse(res)) {
@@ -250,6 +264,12 @@ export async function* streamAnthropic(req: PiStreamRequest): AsyncIterable<PiSt
250
264
  toolArgsBuf: '',
251
265
  });
252
266
  } else {
267
+ // Extended-thinking blocks (not requested today, future-proofed):
268
+ // one liveness pulse, text never forwarded.
269
+ if (block.type === 'thinking' && !thinkingEmitted) {
270
+ thinkingEmitted = true;
271
+ yield { type: 'thinking' };
272
+ }
253
273
  blocks.set(idx, { kind: 'other' });
254
274
  }
255
275
  break;
@@ -18,7 +18,7 @@ import type {
18
18
  PiStopReason,
19
19
  PiUsage,
20
20
  } from './types.js';
21
- import { fetchWithRetry } from './retry.js';
21
+ import { fetchWithRetry, readWithIdleTimeout } from './retry.js';
22
22
  import { classifyPiError, classifyPiNetworkError } from './humanize-error.js';
23
23
 
24
24
  /** Walk an SSE byte stream and yield each parsed JSON event. */
@@ -30,7 +30,7 @@ async function* parseSse(res: Response, dbg: { firstBytes: string }): AsyncItera
30
30
  let totalBytes = 0;
31
31
  try {
32
32
  while (true) {
33
- const { value, done } = await reader.read();
33
+ const { value, done } = await readWithIdleTimeout(reader, 'Google Gemini');
34
34
  if (done) break;
35
35
  if (value) totalBytes += value.byteLength;
36
36
  buffer += decoder.decode(value, { stream: true });
@@ -203,11 +203,25 @@ export async function* streamGoogle(req: PiStreamRequest): AsyncIterable<PiStrea
203
203
  maxOutputTokens: req.maxOutputTokens ?? 32768,
204
204
  },
205
205
  };
206
+ // Thinking-capable families (2.5+/3.x): ask for thought summaries so the
207
+ // harness can emit a liveness pulse — without this, Gemini 3 burns its
208
+ // output budget on invisible reasoning and the chat looks hung. Gated by
209
+ // model id; unknown/dynamic ids skip it (older models reject the field).
210
+ // The rolling aliases (gemini-flash-latest / gemini-flash-lite-latest)
211
+ // resolve to 2.5+/3.x thinking models too (review PI-D-2).
212
+ if (/gemini-(2\.5|[3-9]|flash(-lite)?-latest)/i.test(req.modelId)) {
213
+ body.generationConfig.thinkingConfig = { includeThoughts: true };
214
+ }
206
215
  if (req.systemPrompt?.trim()) {
207
216
  body.systemInstruction = { parts: [{ text: req.systemPrompt }] };
208
217
  }
209
218
  if (req.tools && req.tools.length > 0) {
210
219
  body.tools = toGeminiTools(req.tools);
220
+ // Round-cap wrap-up: forbid further function calls; tools stay declared so
221
+ // functionCall/functionResponse parts in history remain valid.
222
+ if (req.toolChoice === 'none') {
223
+ body.toolConfig = { functionCallingConfig: { mode: 'NONE' } };
224
+ }
211
225
  }
212
226
 
213
227
  let res: Response;
@@ -263,7 +277,11 @@ export async function* streamGoogle(req: PiStreamRequest): AsyncIterable<PiStrea
263
277
  for (const part of parts) {
264
278
  // Thinking models emit reasoning parts with `thought: true`. They
265
279
  // shouldn't be shown to the user as part of the visible answer.
266
- if (part?.thought) { thoughtPartCount++; continue; }
280
+ if (part?.thought) {
281
+ thoughtPartCount++;
282
+ if (thoughtPartCount === 1) yield { type: 'thinking' };
283
+ continue;
284
+ }
267
285
  if (part?.functionCall && typeof part.functionCall.name === 'string') {
268
286
  // Gemini doesn't surface a tool-call id of its own; bake the tool
269
287
  // name into the id so the session can echo it back as a
@@ -18,7 +18,7 @@ import type {
18
18
  PiStopReason,
19
19
  PiUsage,
20
20
  } from './types.js';
21
- import { fetchWithRetry } from './retry.js';
21
+ import { fetchWithRetry, readWithIdleTimeout } from './retry.js';
22
22
  import { classifyPiError, classifyPiNetworkError } from './humanize-error.js';
23
23
 
24
24
  /* ── SSE parser (LF or CRLF tolerant, flushes the trailing event) ── */
@@ -30,7 +30,7 @@ async function* parseSse(res: Response): AsyncIterable<any> {
30
30
  let buffer = '';
31
31
  try {
32
32
  while (true) {
33
- const { value, done } = await reader.read();
33
+ const { value, done } = await readWithIdleTimeout(reader, 'OpenAI-compat');
34
34
  if (done) break;
35
35
  buffer += decoder.decode(value, { stream: true });
36
36
  let idx;
@@ -203,7 +203,10 @@ export async function* streamOpenAICompletions(req: PiStreamRequest): AsyncItera
203
203
  }
204
204
  if (req.tools && req.tools.length > 0) {
205
205
  body.tools = toOpenAITools(req.tools);
206
- body.tool_choice = 'auto';
206
+ // 'none' = the round-cap wrap-up round: the model must summarize, not
207
+ // start more work. Tools stay declared so histories containing tool calls
208
+ // remain valid.
209
+ body.tool_choice = req.toolChoice === 'none' ? 'none' : 'auto';
207
210
  }
208
211
 
209
212
  let res: Response;
@@ -243,6 +246,7 @@ export async function* streamOpenAICompletions(req: PiStreamRequest): AsyncItera
243
246
  const toolCallsByIndex = new Map<number, PartialToolCall>();
244
247
  let chunkCount = 0;
245
248
  let firstChunkSummary = '';
249
+ let thinkingEmitted = false;
246
250
 
247
251
  // Vendors disagree on where streamed usage lives: spec says a final
248
252
  // choice-less chunk's `usage`, Groq defaults to nesting under `x_groq.usage`,
@@ -267,6 +271,16 @@ export async function* streamOpenAICompletions(req: PiStreamRequest): AsyncItera
267
271
  readUsage(choice?.usage);
268
272
  const delta = choice.delta || {};
269
273
 
274
+ // Reasoning models stream hidden thinking under vendor-specific fields
275
+ // (DeepSeek/OpenRouter: reasoning_content; others: reasoning /
276
+ // reasoning_text — upstream pi's field priority). Emit ONE liveness
277
+ // pulse so the chat doesn't look hung; never forward the text itself.
278
+ const reasoningDelta = delta.reasoning_content ?? delta.reasoning ?? delta.reasoning_text;
279
+ if (!thinkingEmitted && typeof reasoningDelta === 'string' && reasoningDelta.length > 0) {
280
+ thinkingEmitted = true;
281
+ yield { type: 'thinking' };
282
+ }
283
+
270
284
  if (typeof delta.content === 'string' && delta.content.length > 0) {
271
285
  accumulated += delta.content;
272
286
  yield { type: 'text_delta', delta: delta.content };
@@ -56,6 +56,13 @@ export interface PiStreamRequest {
56
56
  * that 422 on the `stream_options.include_usage` opt-in. Default true.
57
57
  */
58
58
  includeStreamUsage?: boolean;
59
+ /**
60
+ * 'none' forbids tool calls for this request (mapped per flavor: OpenAI
61
+ * tool_choice:'none', Anthropic {type:'none'}, Gemini functionCallingConfig
62
+ * mode NONE). Used by the session's round-cap wrap-up round, where the model
63
+ * must summarize instead of starting more work.
64
+ */
65
+ toolChoice?: 'auto' | 'none';
59
66
  /** Optional abort signal so the session can interrupt in-flight requests. */
60
67
  signal?: AbortSignal;
61
68
  }
@@ -72,6 +79,10 @@ export type PiErrorKind = 'auth' | 'context-overflow' | 'rate-limit' | 'billing'
72
79
  export type PiStreamEvent =
73
80
  | { type: 'text_delta'; delta: string }
74
81
  | { type: 'text_end'; text: string }
82
+ /** Emitted when the model starts (visibly) reasoning — a liveness pulse for
83
+ * thinking models so the chat doesn't look hung. Reasoning TEXT is never
84
+ * forwarded (it would corrupt the streamed-text == response contract). */
85
+ | { type: 'thinking' }
75
86
  | { type: 'tool_use'; id: string; name: string; input: any; thoughtSignature?: string }
76
87
  | { type: 'done'; stopReason: PiStopReason; usage?: PiUsage }
77
88
  | { type: 'error'; error: string; status?: number; kind?: PiErrorKind; retryable?: boolean };
@@ -44,6 +44,8 @@ export type PiSessionEvent =
44
44
  | { type: 'turn_started' }
45
45
  | { type: 'text_delta'; delta: string }
46
46
  | { type: 'text_end'; text: string }
47
+ /** Liveness pulse: the model is reasoning (thinking models) — no text attached. */
48
+ | { type: 'thinking' }
47
49
  | { type: 'tool_use'; id: string; name: string; input: any }
48
50
  | { type: 'tool_result'; toolUseId: string; name: string; isError?: boolean }
49
51
  | {
@@ -78,6 +80,10 @@ export interface PiSessionAuth {
78
80
  includeStreamUsage?: boolean;
79
81
  /** Model context window from the catalog — reported on turn_complete for the recycler. */
80
82
  contextWindow?: number;
83
+ /** False when the catalog says the model is text-only — image blocks are
84
+ * downgraded to placeholders on send so one screenshot can't 400-poison
85
+ * the session (audit C-8). Undefined (dynamic models) ⇒ assume vision. */
86
+ supportsImages?: boolean;
81
87
  }
82
88
 
83
89
  export interface PiSessionInit {
@@ -117,6 +123,51 @@ export interface PiSession {
117
123
  getMessages(): PiMessage[];
118
124
  }
119
125
 
126
+ /** Transform-on-send for text-only models (audit C-8): image blocks become
127
+ * placeholders in the REQUEST only — the stored history keeps the images, so
128
+ * switching to a vision model later restores them. */
129
+ function downgradeImages(messages: PiMessage[]): PiMessage[] {
130
+ let any = false;
131
+ const out = messages.map((m) => {
132
+ if (!m.content.some((b) => b.type === 'image')) return m;
133
+ any = true;
134
+ return {
135
+ ...m,
136
+ content: m.content.map((b): PiContentBlock =>
137
+ b.type === 'image'
138
+ ? { type: 'text', text: '[An image was attached here, but the current model cannot view images. Tell the user to switch to a vision-capable model if the image matters.]' }
139
+ : b,
140
+ ),
141
+ };
142
+ });
143
+ return any ? out : messages;
144
+ }
145
+
146
+ /** Emergency in-turn context relief (audit D2-6): when occupancy crosses the
147
+ * threshold MID-turn (recycling only acts between idle turns), stub out the
148
+ * oldest large tool_result payloads — never user/assistant text, never the
149
+ * protected tail (the current round's results). Cruder than real compaction,
150
+ * but the turn finishes instead of 400ing on the context wall. */
151
+ function trimOldToolResults(messages: PiMessage[], charsToFree: number, protectTail: number): number {
152
+ let freed = 0;
153
+ const limit = Math.max(0, messages.length - protectTail);
154
+ for (let i = 0; i < limit && freed < charsToFree; i++) {
155
+ const m = messages[i];
156
+ if (m.role !== 'user') continue;
157
+ for (const b of m.content) {
158
+ if (b.type === 'tool_result' && typeof b.content === 'string' && b.content.length > 2048) {
159
+ freed += b.content.length;
160
+ b.content = `[tool output trimmed to fit the context window — ~${Math.round(b.content.length / 1024)} KB removed]`;
161
+ if (freed >= charsToFree) break;
162
+ }
163
+ }
164
+ }
165
+ return freed;
166
+ }
167
+
168
+ const ROUND_CAP_NOTICE =
169
+ '[System: the tool budget for this turn is exhausted. Stop working now. In 2-3 sentences, summarize what you completed, what remains, and the exact next step.]';
170
+
120
171
  const FILE_TOOL_NAMES = new Set(['Write', 'Edit', 'MultiEdit', 'NotebookEdit', 'write', 'edit', 'multiEdit', 'notebookEdit']);
121
172
  const MAX_TOOL_ROUNDS = 25;
122
173
  /** Transparent re-runs of a failed round that produced nothing (audit D6-1). */
@@ -142,7 +193,7 @@ export function createPiSession(init: PiSessionInit): PiSession {
142
193
  retryable?: boolean;
143
194
  }
144
195
 
145
- async function runOneRound(emitSeparatorFirst: boolean): Promise<RoundResult> {
196
+ async function runOneRound(emitSeparatorFirst: boolean, opts?: { wrapUp?: boolean }): Promise<RoundResult> {
146
197
  const result: RoundResult = { text: '', toolUses: [], errored: false };
147
198
  let firstDelta = true;
148
199
  try {
@@ -153,8 +204,9 @@ export function createPiSession(init: PiSessionInit): PiSession {
153
204
  baseUrl: auth.baseUrl,
154
205
  apiKey: auth.apiKey,
155
206
  systemPrompt: init.systemPrompt,
156
- messages,
207
+ messages: auth.supportsImages === false ? downgradeImages(messages) : messages,
157
208
  tools: init.tools,
209
+ toolChoice: opts?.wrapUp ? 'none' : undefined,
158
210
  maxOutputTokens: auth.maxOutputTokens,
159
211
  maxTokensField: auth.maxTokensField,
160
212
  includeStreamUsage: auth.includeStreamUsage,
@@ -182,6 +234,9 @@ export function createPiSession(init: PiSessionInit): PiSession {
182
234
  // at the end of the whole turn so the UI doesn't show half-answers.
183
235
  result.text = evt.text;
184
236
  break;
237
+ case 'thinking':
238
+ init.onEvent({ type: 'thinking' });
239
+ break;
185
240
  case 'tool_use':
186
241
  result.toolUses.push({
187
242
  id: evt.id,
@@ -189,7 +244,12 @@ export function createPiSession(init: PiSessionInit): PiSession {
189
244
  input: evt.input,
190
245
  thoughtSignature: evt.thoughtSignature,
191
246
  });
192
- init.onEvent({ type: 'tool_use', id: evt.id, name: evt.name, input: evt.input });
247
+ // Wrap-up rounds forbid tools (toolChoice 'none'); if a vendor
248
+ // ignores that, swallow the phantom call silently — it is never
249
+ // executed or persisted.
250
+ if (!opts?.wrapUp) {
251
+ init.onEvent({ type: 'tool_use', id: evt.id, name: evt.name, input: evt.input });
252
+ }
193
253
  break;
194
254
  case 'error':
195
255
  result.errored = true;
@@ -331,10 +391,63 @@ export function createPiSession(init: PiSessionInit): PiSession {
331
391
  messages.push({ role: 'user', content: toolResultBlocks });
332
392
  }
333
393
 
394
+ // Emergency in-turn context relief (audit D2-6): recycling only acts
395
+ // between idle turns, so a single heavy tool loop could cross the wall
396
+ // mid-turn. Above 85% occupancy, stub the oldest large tool outputs to
397
+ // bring the next request back toward 70%.
398
+ if (lastContextWindow && lastUsage) {
399
+ const occupancy =
400
+ (lastUsage.inputTokens || 0) + (lastUsage.cacheReadTokens || 0) + (lastUsage.cacheCreationTokens || 0);
401
+ if (occupancy > 0.85 * lastContextWindow) {
402
+ const charsToFree = (occupancy - Math.floor(0.7 * lastContextWindow)) * 4; // ~4 chars/token
403
+ const freed = trimOldToolResults(messages, charsToFree, 4);
404
+ if (freed > 0) {
405
+ log.info(`[pi/session] context at ${occupancy}/${lastContextWindow} tok mid-turn — trimmed ~${Math.round(freed / 1024)} KB of old tool output`);
406
+ }
407
+ }
408
+ }
409
+
334
410
  // No tool calls ⇒ the model is done with this turn.
335
411
  if (toolUses.length === 0) { roundCapHit = false; break; }
336
412
  }
337
413
 
414
+ // Round-cap wrap-up (audit D5-8): the budget ran out with the model still
415
+ // mid-task. Run ONE final no-tools round so the turn ends with an honest
416
+ // status summary instead of silent truncation. roundCapHit stays true on
417
+ // turn_complete — consumers still know the work is incomplete.
418
+ if (roundCapHit && !turnErrored && !init.abortController.signal.aborted) {
419
+ log.info(`[pi/session] tool-round budget (${maxRounds}) exhausted — running a no-tools wrap-up round`);
420
+ messages.push({ role: 'user', content: [{ type: 'text', text: ROUND_CAP_NOTICE }] });
421
+ const needsSeparator = accumulatedText.length > 0 && !accumulatedText.endsWith('\n');
422
+ const res = await runOneRound(needsSeparator, { wrapUp: true });
423
+ if (res.text) {
424
+ if (needsSeparator) accumulatedText += '\n\n';
425
+ accumulatedText += res.text;
426
+ messages.push({ role: 'assistant', content: [{ type: 'text', text: res.text }] });
427
+ } else {
428
+ // The notice was never answered — pop it so the NEXT turn doesn't
429
+ // open under a stale "stop working now" instruction (review PI-D-1).
430
+ const last = messages[messages.length - 1];
431
+ if (last?.role === 'user' && last.content.length === 1 &&
432
+ last.content[0].type === 'text' && last.content[0].text === ROUND_CAP_NOTICE) {
433
+ messages.pop();
434
+ }
435
+ }
436
+ // Fatal wrap-up failures (dead key / context wall) must still tear the
437
+ // session down, and a cap-hit turn with NO text at all must not end in
438
+ // total silence — claude surfaces error_max_turns and pi's one-shot
439
+ // paths guard this state too (PI-C-2). Set the turn-error fields so the
440
+ // standard emission below handles both (review PI-D-1).
441
+ if (res.errored && (res.errorKind === 'auth' || res.errorKind === 'context-overflow')) {
442
+ turnErrored = true;
443
+ turnErrorMsg = res.errorMsg;
444
+ turnErrorKind = res.errorKind;
445
+ } else if (!accumulatedText) {
446
+ turnErrored = true;
447
+ turnErrorMsg = `I hit my tool budget for this turn (${maxRounds} rounds) before finishing — say "continue" and I'll pick up where I left off.`;
448
+ }
449
+ }
450
+
338
451
  // Turn-end emission order (audit D6-2, mirrors claude.ts:394-401):
339
452
  // 1. text_end whenever ANY text streamed — even on errored turns, so the
340
453
  // partial the user watched is committed, persisted, and consumes its
@@ -11,6 +11,8 @@
11
11
  * - google-gemini → POST {baseUrl}/models/{modelId}:generateContent
12
12
  */
13
13
  import { getPiSubProvider, type PiApiFlavor } from './sub-providers.js';
14
+ import { streamProvider } from './providers/stream.js';
15
+ import { toolDefsForProvider } from './tools/registry.js';
14
16
 
15
17
  export interface PiTestCompletionInput {
16
18
  subProvider: string;
@@ -88,6 +90,60 @@ export async function runPiTestCompletion(input: PiTestCompletionInput): Promise
88
90
  }
89
91
  }
90
92
 
93
+ /**
94
+ * Streaming + tools probe (audit C-4). The non-streaming, tool-less test above
95
+ * validates a contract no real turn uses — free-form model ids (Ollama, LM
96
+ * Studio, custom, OpenRouter) could pass it and then fail the first actual
97
+ * message, which streams SSE with the full tool schema attached. This probe
98
+ * exercises the REAL wire shape in one cheap request: success = any
99
+ * text/tool-call event arrives before an error does.
100
+ */
101
+ export async function runPiStreamProbe(input: PiTestCompletionInput): Promise<PiTestCompletionResult> {
102
+ const provider = getPiSubProvider(input.subProvider);
103
+ if (!provider) return { ok: false, error: `Unknown sub-provider: ${input.subProvider}` };
104
+ const baseUrl = pickBaseUrl(input);
105
+ if (!baseUrl) return { ok: false, error: 'Missing base URL' };
106
+ const modelId = pickModelId(input);
107
+ if (!modelId) return { ok: false, error: 'Missing model ID' };
108
+
109
+ const ctl = new AbortController();
110
+ const timer = setTimeout(() => ctl.abort(), REQUEST_TIMEOUT_MS);
111
+ try {
112
+ const stream = streamProvider(provider.flavor, {
113
+ modelId,
114
+ baseUrl,
115
+ apiKey: input.apiKey?.trim() || '',
116
+ systemPrompt: 'You are a connectivity probe. Reply with the single word OK.',
117
+ messages: [{ role: 'user', content: [{ type: 'text', text: input.prompt || 'Reply with the single word OK.' }] }],
118
+ // withTask: the live conversation's schema is the superset every real
119
+ // turn sends — probe with the same shape (review PI-D-4).
120
+ tools: toolDefsForProvider({ withTask: true }),
121
+ // Generous: reasoning models burn output budget on hidden thinking first.
122
+ maxOutputTokens: 2048,
123
+ maxTokensField: provider.maxTokensField,
124
+ includeStreamUsage: provider.noStreamUsage ? false : undefined,
125
+ signal: ctl.signal,
126
+ });
127
+ for await (const evt of stream) {
128
+ if (evt.type === 'text_delta' || evt.type === 'tool_use') {
129
+ return { ok: true, text: 'stream OK', modelId, subProvider: provider.id };
130
+ }
131
+ if (evt.type === 'error') {
132
+ return { ok: false, error: evt.error, modelId, subProvider: provider.id };
133
+ }
134
+ }
135
+ if (ctl.signal.aborted) {
136
+ return { ok: false, error: `Stream probe timed out after ${REQUEST_TIMEOUT_MS / 1000}s.`, modelId, subProvider: provider.id };
137
+ }
138
+ return { ok: false, error: 'The stream ended without producing any output.', modelId, subProvider: provider.id };
139
+ } catch (err: any) {
140
+ const msg = err?.name === 'AbortError' ? `Stream probe timed out after ${REQUEST_TIMEOUT_MS / 1000}s.` : err?.message || String(err);
141
+ return { ok: false, error: msg, modelId, subProvider: provider.id };
142
+ } finally {
143
+ clearTimeout(timer);
144
+ }
145
+ }
146
+
91
147
  interface DispatchArgs {
92
148
  baseUrl: string;
93
149
  modelId: string;