bloby-bot 0.70.13 → 0.71.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/bin/cli.js +223 -45
  2. package/dist-bloby/assets/{bloby-CU9KhQdP.js → bloby-es6cZJzs.js} +6 -6
  3. package/dist-bloby/assets/globals-DBqwNiJV.css +2 -0
  4. package/dist-bloby/assets/{highlighted-body-OFNGDK62-D0Tm_wgU.js → highlighted-body-OFNGDK62-8PiOHw9p.js} +1 -1
  5. package/dist-bloby/assets/mermaid-GHXKKRXX-BJWX8urU.js +1 -0
  6. package/dist-bloby/assets/{onboard-GfjHF9nm.js → onboard-BKgy17OU.js} +1 -1
  7. package/dist-bloby/bloby.html +3 -3
  8. package/dist-bloby/onboard.html +3 -3
  9. package/package.json +2 -3
  10. package/scripts/install +141 -34
  11. package/scripts/install.ps1 +111 -15
  12. package/scripts/install.sh +141 -34
  13. package/shared/config.ts +37 -2
  14. package/supervisor/channels/manager.ts +68 -33
  15. package/supervisor/channels/telegram.ts +57 -16
  16. package/supervisor/channels/types.ts +4 -1
  17. package/supervisor/channels/whatsapp.ts +57 -10
  18. package/supervisor/chat/src/components/Chat/AudioBubble.tsx +1 -1
  19. package/supervisor/chat/src/components/Chat/AuthedImage.tsx +16 -3
  20. package/supervisor/chat/src/components/Chat/BlobyImageCard.tsx +2 -2
  21. package/supervisor/chat/src/components/Chat/ImageLightbox.tsx +25 -8
  22. package/supervisor/chat/src/components/Chat/InputBar.tsx +62 -7
  23. package/supervisor/chat/src/components/Chat/MessageBubble.tsx +37 -18
  24. package/supervisor/chat/src/components/Chat/MessageList.tsx +3 -3
  25. package/supervisor/chat/src/hooks/useChat.ts +52 -0
  26. package/supervisor/chat/src/lib/authedFile.ts +24 -12
  27. package/supervisor/file-saver.ts +92 -19
  28. package/supervisor/harnesses/attachment-policy.ts +111 -0
  29. package/supervisor/harnesses/claude.ts +62 -15
  30. package/supervisor/harnesses/codex.ts +69 -43
  31. package/supervisor/harnesses/pi/index.ts +84 -49
  32. package/supervisor/harnesses/pi/providers/humanize-error.ts +25 -0
  33. package/supervisor/harnesses/pi/providers/stream-anthropic.ts +8 -0
  34. package/supervisor/harnesses/pi/providers/stream-google.ts +5 -0
  35. package/supervisor/harnesses/pi/providers/stream-openai-completions.ts +15 -6
  36. package/supervisor/harnesses/pi/providers/types.ts +18 -1
  37. package/supervisor/harnesses/pi/session.ts +28 -1
  38. package/supervisor/index.ts +57 -16
  39. package/supervisor/widget.js +19 -5
  40. package/worker/db.ts +2 -0
  41. package/dist-bloby/assets/globals-DlPtwiZL.css +0 -2
  42. package/dist-bloby/assets/mermaid-GHXKKRXX-B95J3s3s.js +0 -1
  43. package/supervisor/public/headphones_spritesheet.webp +0 -0
  44. package/supervisor/public/spritesheet.webp +0 -0
  45. /package/dist-bloby/assets/{globals-mGpojCOe.js → globals-DN3F0CQE.js} +0 -0
@@ -21,6 +21,15 @@ import { assembleSystemPrompt } from '../../worker/prompts/prompt-assembler.js';
21
21
  import { buildAgents } from '../agents/index.js';
22
22
  import { preWarm, claimWarmup, discardWarmup } from '../cli-warmup.js';
23
23
  import { mirrorSkillsInto } from './skills.js';
24
+ import {
25
+ routeAttachment,
26
+ normalizeImageMediaType,
27
+ approxBase64Bytes,
28
+ buildSavedFilesNote,
29
+ INLINE_TEXT_PER_FILE_CHARS,
30
+ INLINE_TEXT_TOTAL_CHARS,
31
+ MAX_INLINE_IMAGE_BYTES,
32
+ } from './attachment-policy.js';
24
33
 
25
34
  // ── Types ──────────────────────────────────────────────────────────────────
26
35
 
@@ -157,30 +166,68 @@ function loadMcpServers(): Record<string, any> | undefined {
157
166
  return undefined;
158
167
  }
159
168
 
160
- /** Build an SDKUserMessage from text + optional attachments */
169
+ /** Build an SDKUserMessage from text + optional attachments.
170
+ * Routing is delegated to the shared attachment-policy so all three harnesses
171
+ * ingest identically. The Anthropic Messages API base64 document source accepts
172
+ * ONLY application/pdf — handing it a docx/xlsx/csv/markdown/octet-stream 400s
173
+ * the whole turn — so non-PDF binaries are NOT emitted as provider blocks; they
174
+ * ride on the saved-files disk pointer instead. Blocks stay MEDIA-FIRST, TEXT-last. */
161
175
  function buildUserMessage(text: string, attachments?: AgentAttachment[], savedFiles?: SavedFile[]): SDKUserMessage {
162
176
  const content: any[] = [];
163
177
 
164
178
  if (attachments?.length) {
179
+ // Running budget so the cross-file inline-text total never exceeds the cap.
180
+ let inlineTextBudget = INLINE_TEXT_TOTAL_CHARS;
181
+
165
182
  for (const att of attachments) {
166
- if (att.type === 'image') {
167
- content.push({
168
- type: 'image',
169
- source: { type: 'base64', media_type: att.mediaType, data: att.data },
170
- });
171
- } else {
172
- content.push({
173
- type: 'document',
174
- source: { type: 'base64', media_type: att.mediaType, data: att.data },
175
- });
183
+ // Claude natively renders PDF document blocks (vision over the rendered pages).
184
+ const route = routeAttachment(att, { canNativeDocument: true });
185
+ switch (route) {
186
+ case 'image': {
187
+ // Drop the inline copy when it would bloat every stateless resend — the
188
+ // file is on disk and buildSavedFilesNote points the file tools at it.
189
+ if (approxBase64Bytes(att.data) > MAX_INLINE_IMAGE_BYTES) break;
190
+ content.push({
191
+ type: 'image',
192
+ source: { type: 'base64', media_type: normalizeImageMediaType(att.mediaType), data: att.data },
193
+ });
194
+ break;
195
+ }
196
+ case 'native-document': {
197
+ content.push({
198
+ type: 'document',
199
+ source: { type: 'base64', media_type: 'application/pdf', data: att.data },
200
+ });
201
+ break;
202
+ }
203
+ case 'inline-text': {
204
+ if (inlineTextBudget <= 0) break;
205
+ let decoded = '';
206
+ try {
207
+ decoded = Buffer.from(att.data, 'base64').toString('utf-8');
208
+ } catch {
209
+ break; // undecodable → rely on the saved-files note
210
+ }
211
+ const cap = Math.min(INLINE_TEXT_PER_FILE_CHARS, inlineTextBudget);
212
+ const slice = decoded.slice(0, cap);
213
+ inlineTextBudget -= slice.length;
214
+ // text/csv/markdown also 400 as document sources, so inline as a text note.
215
+ content.push({ type: 'text', text: `--- ${att.name} ---\n${slice}` });
216
+ break;
217
+ }
218
+ case 'reference-only':
219
+ default:
220
+ // Binary we can't inline (docx/xlsx/zip/…) or an unexpected route — no
221
+ // provider block; the saved-files note below carries the disk pointer.
222
+ break;
176
223
  }
177
224
  }
178
225
  }
179
226
 
180
227
  let promptText = text || '(attached files)';
181
228
  if (savedFiles?.length) {
182
- const lines = savedFiles.map((f) => `- ${f.name} -> ${f.relPath}`);
183
- promptText += `\n\n[Attached files saved to disk]\n${lines.join('\n')}\nYou can read or reference these files using the paths above (relative to your cwd).`;
229
+ const note = buildSavedFilesNote(savedFiles);
230
+ if (note) promptText += `\n\n${note}`;
184
231
  }
185
232
 
186
233
  content.push({ type: 'text', text: promptText });
@@ -663,8 +710,8 @@ export async function startBlobyAgentQuery(
663
710
 
664
711
  let plainPrompt = prompt;
665
712
  if (savedFiles?.length && !attachments?.length) {
666
- const lines = savedFiles.map((f) => `- ${f.name} -> ${f.relPath}`);
667
- plainPrompt += `\n\n[Attached files saved to disk]\n${lines.join('\n')}\nYou can read or reference these files using the paths above (relative to your cwd).`;
713
+ const note = buildSavedFilesNote(savedFiles);
714
+ if (note) plainPrompt += `\n\n${note}`;
668
715
  }
669
716
 
670
717
  const sdkPrompt: string | AsyncIterable<SDKUserMessage> =
@@ -44,6 +44,15 @@ import type { SavedFile } from '../file-saver.js';
44
44
  import { getCodexAccessToken } from '../../worker/codex-auth.js';
45
45
  import { assembleSystemPrompt } from '../../worker/prompts/prompt-assembler.js';
46
46
  import { mirrorSkillsInto } from './skills.js';
47
+ import {
48
+ routeAttachment,
49
+ normalizeImageMediaType,
50
+ approxBase64Bytes,
51
+ buildSavedFilesNote,
52
+ INLINE_TEXT_PER_FILE_CHARS,
53
+ INLINE_TEXT_TOTAL_CHARS,
54
+ MAX_INLINE_IMAGE_BYTES,
55
+ } from './attachment-policy.js';
47
56
  import type { OnAgentMessage, RecentMessage, AgentAttachment, AgentQueryRequest, AgentQueryResult } from './types.js';
48
57
  export type { RecentMessage, AgentAttachment };
49
58
 
@@ -555,42 +564,24 @@ function emitDone(conv: CodexConversation): void {
555
564
 
556
565
  /* ── Input building ────────────────────────────────────────────────────── */
557
566
 
558
- /** mediaTypes whose content we inline into the prompt — codex has no document
559
- * input type (verified against 0.138 UserInput), so this is the closest we get
560
- * to claude's native base64 document ingestion for text-like files. */
561
- const INLINE_TEXT_MEDIA = /^(text\/|application\/(json|xml|yaml|x-yaml|toml|csv|javascript|typescript))/;
562
- const INLINE_TEXT_MAX_BYTES = 48_000;
563
- const INLINE_TEXT_TOTAL_BUDGET = 96_000;
564
-
567
+ /**
568
+ * Build codex `UserInput` blocks from the user text + saved files + raw
569
+ * attachments. Routing is delegated to the shared attachment-policy so codex
570
+ * stays byte-for-byte consistent with the Claude/PI harnesses. Codex's UserInput
571
+ * has NO native document type (verified against 0.138), so canNativeDocument is
572
+ * FALSE: PDFs/binaries become a disk-pointer note and the agent opens them with
573
+ * its file tools.
574
+ *
575
+ * Block order is MEDIA-first then TEXT (matching Claude/PI): the inline-text
576
+ * notes and the saved-files pointer are folded into the trailing text block.
577
+ */
565
578
  function buildUserInput(text: string, savedFiles?: SavedFile[], attachments?: AgentAttachment[]): Array<Record<string, any>> {
566
579
  const input: Array<Record<string, any>> = [];
567
580
 
568
- let promptText = text || '(attached files)';
569
- if (savedFiles?.length) {
570
- const lines = savedFiles.map((f) => `- ${f.name} -> ${f.relPath}`);
571
- promptText += `\n\n[Attached files saved to disk]\n${lines.join('\n')}\nYou can read or reference these files using the paths above (relative to your cwd).`;
572
- }
573
-
574
- // Inline text-like attachments (size-capped) so the model can answer about
575
- // their contents immediately instead of shelling out to read them from disk.
576
- if (attachments?.length) {
577
- let budget = INLINE_TEXT_TOTAL_BUDGET;
578
- for (const att of attachments) {
579
- if (att.type !== 'file' || !INLINE_TEXT_MEDIA.test(att.mediaType || '')) continue;
580
- const approxBytes = Math.floor((att.data?.length || 0) * 0.75);
581
- if (approxBytes === 0 || approxBytes > INLINE_TEXT_MAX_BYTES || approxBytes > budget) continue;
582
- try {
583
- const decoded = Buffer.from(att.data, 'base64').toString('utf-8');
584
- budget -= approxBytes;
585
- promptText += `\n\n[Attached file content: ${att.name}]\n\`\`\`\n${decoded}\n\`\`\``;
586
- } catch {}
587
- }
588
- }
589
-
590
- input.push({ type: 'text', text: promptText });
591
-
592
581
  // Codex understands `localImage` (path on disk) — Bloby's file-saver already
593
- // wrote attachments to disk, so we just point at the absolute path.
582
+ // wrote image attachments to disk, so we point at the absolute path. Track a
583
+ // per-name COUNT (not presence): WhatsApp multi-image pushes share one
584
+ // attachment name and each saved file covers exactly one of them.
594
585
  const savedImageCounts = new Map<string, number>();
595
586
  if (savedFiles?.length) {
596
587
  for (const f of savedFiles) {
@@ -601,23 +592,58 @@ function buildUserInput(text: string, savedFiles?: SavedFile[], attachments?: Ag
601
592
  }
602
593
  }
603
594
 
604
- // Fallback for image attachments that never made it to disk (file-saver
605
- // failure): inline as a data URL so the agent still SEES the image — claude
606
- // always gets the base64 inline, codex shouldn't silently go blind.
607
- // Match by COUNT, not name presence: WhatsApp multi-image pushes share one
608
- // attachment name, and each saved file covers exactly one of them.
595
+ // Route every attachment through the shared policy. Inline-text notes are
596
+ // accumulated into `inlineNotes` (appended to the trailing text block);
597
+ // images become data-URL blocks (with the localImage path already covering
598
+ // disk-saved copies); everything else falls back to the saved-files pointer.
599
+ let promptText = text || '(attached files)';
600
+ const inlineNotes: string[] = [];
601
+ let inlineBudget = INLINE_TEXT_TOTAL_CHARS;
609
602
  if (attachments?.length) {
610
603
  for (const att of attachments) {
611
- if (att.type !== 'image' || !att.data) continue;
612
- const remaining = savedImageCounts.get(att.name) || 0;
613
- if (remaining > 0) {
614
- savedImageCounts.set(att.name, remaining - 1);
615
- continue;
604
+ switch (routeAttachment(att, { canNativeDocument: false })) {
605
+ case 'image': {
606
+ if (!att.data) break;
607
+ // Skip data-URL inlining when a disk copy exists (localImage already
608
+ // points codex at it) or when the payload is too big to resend on
609
+ // every stateless turn — the saved-files pointer covers it instead.
610
+ const remaining = savedImageCounts.get(att.name) || 0;
611
+ if (remaining > 0) {
612
+ savedImageCounts.set(att.name, remaining - 1);
613
+ break;
614
+ }
615
+ if (approxBase64Bytes(att.data) > MAX_INLINE_IMAGE_BYTES) break;
616
+ const mediaType = normalizeImageMediaType(att.mediaType);
617
+ input.push({ type: 'image', url: `data:${mediaType};base64,${att.data}` });
618
+ break;
619
+ }
620
+ case 'inline-text': {
621
+ if (!att.data) break;
622
+ try {
623
+ let decoded = Buffer.from(att.data, 'base64').toString('utf-8');
624
+ if (decoded.length > INLINE_TEXT_PER_FILE_CHARS) decoded = decoded.slice(0, INLINE_TEXT_PER_FILE_CHARS);
625
+ if (decoded.length > inlineBudget) decoded = decoded.slice(0, inlineBudget);
626
+ if (!decoded.length) break;
627
+ inlineBudget -= decoded.length;
628
+ inlineNotes.push(`\n\n[Attached file content: ${att.name}]\n\`\`\`\n${decoded}\n\`\`\``);
629
+ } catch {}
630
+ break;
631
+ }
632
+ // 'native-document' cannot occur (canNativeDocument:false); it and
633
+ // 'reference-only' both rely on the saved-files disk pointer below.
634
+ default:
635
+ break;
616
636
  }
617
- input.push({ type: 'image', url: `data:${att.mediaType};base64,${att.data}` });
618
637
  }
619
638
  }
620
639
 
640
+ for (const note of inlineNotes) promptText += note;
641
+
642
+ const savedNote = buildSavedFilesNote(savedFiles || []);
643
+ if (savedNote) promptText += `\n\n${savedNote}`;
644
+
645
+ input.push({ type: 'text', text: promptText });
646
+
621
647
  return input;
622
648
  }
623
649
 
@@ -29,11 +29,20 @@ import type {
29
29
  export type { RecentMessage, AgentAttachment };
30
30
 
31
31
  import { buildSkillsIndex } from '../skills.js';
32
+ import {
33
+ routeAttachment,
34
+ buildSavedFilesNote,
35
+ normalizeImageMediaType,
36
+ approxBase64Bytes,
37
+ MAX_INLINE_IMAGE_BYTES,
38
+ INLINE_TEXT_PER_FILE_CHARS,
39
+ INLINE_TEXT_TOTAL_CHARS,
40
+ } from '../attachment-policy.js';
32
41
  import { createAsyncQueue, type AsyncQueue } from './async-queue.js';
33
42
  import { createPiSession, type PiSessionEvent, type PiSessionAuth } from './session.js';
34
- import { getPiSubProvider, getCatalogModel } from './sub-providers.js';
43
+ import { getPiSubProvider, getCatalogModel, type PiApiFlavor } from './sub-providers.js';
35
44
  import { readPiAuth } from './auth-storage.js';
36
- import type { PiMessage } from './providers/types.js';
45
+ import type { PiMessage, PiContentBlock } from './providers/types.js';
37
46
  import { toolDefsForProvider } from './tools/registry.js';
38
47
  import type { PiTaskHost } from './tools/types.js';
39
48
 
@@ -495,62 +504,82 @@ function recentToPiMessages(messages: RecentMessage[] | undefined): PiMessage[]
495
504
  }));
496
505
  }
497
506
 
498
- const DOC_INLINE_CAP_CHARS = 48_000;
499
-
500
- /** Text-like documents can ride inline as decoded text on every flavor. */
501
- function isTextLikeMediaType(mediaType: string): boolean {
502
- const mt = (mediaType || '').toLowerCase();
503
- return (
504
- mt.startsWith('text/') ||
505
- mt.includes('json') ||
506
- mt.includes('xml') ||
507
- mt.includes('yaml') ||
508
- mt.includes('csv') ||
509
- mt.includes('javascript') ||
510
- mt.includes('typescript') ||
511
- mt === 'application/x-sh'
512
- );
507
+ /** Native PDF document blocks reach only the flavors that render them — the
508
+ * Anthropic Messages API and Gemini both ingest application/pdf inline
509
+ * (base64 document source / inlineData). openai-completions has no document
510
+ * type, so a PDF there falls back to the saved-files disk pointer. Matches the
511
+ * shared attachment-policy routing rule. */
512
+ function canNativeDocumentForFlavor(flavor: PiApiFlavor): boolean {
513
+ return flavor === 'anthropic-messages' || flavor === 'google-gemini';
513
514
  }
514
515
 
515
- /** Wrap a raw user input into a PiMessage with text + optional image blocks. */
516
- function buildUserMessage(text: string, attachments?: AgentAttachment[], savedFiles?: SavedFile[]): PiMessage {
517
- const content: PiMessage['content'] = [];
516
+ /** Build a PiContentBlock[] from raw text + attachments, MEDIA-FIRST then the
517
+ * prompt text last (parity with claude.ts and the other pi providers). Routing
518
+ * is delegated to the shared attachment-policy so all three harnesses ingest
519
+ * identically; canNativeDocument is the active provider's PDF capability. */
520
+ function buildAttachmentBlocks(
521
+ text: string,
522
+ canNativeDocument: boolean,
523
+ attachments?: AgentAttachment[],
524
+ savedFiles?: SavedFile[],
525
+ ): PiContentBlock[] {
526
+ const content: PiContentBlock[] = [];
518
527
  if (attachments?.length) {
528
+ // Running budget so the cross-file inline-text total never exceeds the cap.
529
+ let inlineTextBudget = INLINE_TEXT_TOTAL_CHARS;
519
530
  for (const att of attachments) {
520
- if (att.type === 'image') {
521
- content.push({ type: 'image', mediaType: att.mediaType, data: att.data });
522
- } else if (isTextLikeMediaType(att.mediaType)) {
523
- // Text-like documents are inlined (codex-parity posture, audit D5-7):
524
- // claude sends native document blocks; pi inlines the decoded text,
525
- // capped so a huge file can't blow the context.
526
- let docText = '';
527
- try { docText = Buffer.from(att.data, 'base64').toString('utf-8'); } catch {}
528
- const capped = docText.length > DOC_INLINE_CAP_CHARS
529
- ? `${docText.slice(0, DOC_INLINE_CAP_CHARS)}\n…[truncated at ${DOC_INLINE_CAP_CHARS} characters the full file is in the saved-files note below]`
530
- : docText;
531
- content.push({
532
- type: 'text',
533
- text: capped
534
- ? `[Attached document: ${att.name} (${att.mediaType})]\n${capped}\n[End of ${att.name}]`
535
- : `[Attached document: ${att.name} (${att.mediaType}) — could not decode]`,
536
- });
537
- } else {
538
- // Binary documents (PDF etc.) can't be inlined across all
539
- // sub-providers point the model at the saved copy instead.
540
- content.push({
541
- type: 'text',
542
- text: `[Attached document: ${att.name} (${att.mediaType}). It is saved to disk — see the saved-files note below and use your tools to inspect it if needed.]`,
543
- });
531
+ switch (routeAttachment(att, { canNativeDocument })) {
532
+ case 'image': {
533
+ // Drop the inline copy when it would bloat every stateless resend —
534
+ // the file is on disk and buildSavedFilesNote points the tools at it.
535
+ if (approxBase64Bytes(att.data) > MAX_INLINE_IMAGE_BYTES) break;
536
+ content.push({ type: 'image', mediaType: normalizeImageMediaType(att.mediaType), data: att.data });
537
+ break;
538
+ }
539
+ case 'native-document': {
540
+ // PDF on a flavor that renders it natively (anthropic / gemini).
541
+ content.push({ type: 'document', mediaType: 'application/pdf', data: att.data, name: att.name });
542
+ break;
543
+ }
544
+ case 'inline-text': {
545
+ if (inlineTextBudget <= 0) break;
546
+ let decoded = '';
547
+ try { decoded = Buffer.from(att.data, 'base64').toString('utf-8'); }
548
+ catch { break; } // undecodable → rely on the saved-files note
549
+ const cap = Math.min(INLINE_TEXT_PER_FILE_CHARS, inlineTextBudget);
550
+ const slice = decoded.slice(0, cap);
551
+ inlineTextBudget -= slice.length;
552
+ content.push({ type: 'text', text: `--- ${att.name} ---\n${slice}` });
553
+ break;
554
+ }
555
+ case 'reference-only':
556
+ default:
557
+ // Binary we can't inline (docx/xlsx/zip/…), a PDF on a flavor without
558
+ // native documents, or an unexpected route — no provider block; the
559
+ // saved-files note below carries the disk pointer. Never emit a
560
+ // malformed block (defensive default, review PI-E).
561
+ break;
544
562
  }
545
563
  }
546
564
  }
565
+
547
566
  let prompt = text || '(attached files)';
548
567
  if (savedFiles?.length) {
549
- const lines = savedFiles.map((f) => `- ${f.name} -> ${f.relPath}`);
550
- prompt += `\n\n[Attached files saved to disk]\n${lines.join('\n')}\nYou can read or reference these files using the paths above (relative to your cwd).`;
568
+ const note = buildSavedFilesNote(savedFiles);
569
+ if (note) prompt += `\n\n${note}`;
551
570
  }
552
571
  content.push({ type: 'text', text: prompt });
553
- return { role: 'user', content };
572
+ return content;
573
+ }
574
+
575
+ /** Wrap a raw user input into a PiMessage with text + optional media blocks. */
576
+ function buildUserMessage(
577
+ text: string,
578
+ canNativeDocument: boolean,
579
+ attachments?: AgentAttachment[],
580
+ savedFiles?: SavedFile[],
581
+ ): PiMessage {
582
+ return { role: 'user', content: buildAttachmentBlocks(text, canNativeDocument, attachments, savedFiles) };
554
583
  }
555
584
 
556
585
  // ── Live Conversation API ──────────────────────────────────────────────────
@@ -771,7 +800,13 @@ export function pushMessage(
771
800
  conv.busy = true;
772
801
  conv.pendingCount += 1;
773
802
  conv.turnOrigins.push('user');
774
- conv.inputQueue.push(buildUserMessage(content, attachments, savedFiles));
803
+ // Resolve the active flavor at push time (the session re-resolves auth every
804
+ // round, so a wizard provider switch mid-session is honored). Unreadable auth
805
+ // ⇒ no native documents — the conservative route sends a PDF to the disk
806
+ // pointer rather than emitting a block the provider can't render.
807
+ const resolved = resolveAuth();
808
+ const canNativeDocument = resolved.ok ? canNativeDocumentForFlavor(resolved.auth.flavor) : false;
809
+ conv.inputQueue.push(buildUserMessage(content, canNativeDocument, attachments, savedFiles));
775
810
  conv.onMessage('bot:typing', { conversationId });
776
811
  return true;
777
812
  }
@@ -981,7 +1016,7 @@ export async function startBlobyAgentQuery(
981
1016
  });
982
1017
 
983
1018
  const queue = createAsyncQueue<PiMessage>();
984
- queue.push(buildUserMessage(prompt, attachments, savedFiles));
1019
+ queue.push(buildUserMessage(prompt, canNativeDocumentForFlavor(resolved.auth.flavor), attachments, savedFiles));
985
1020
  queue.end();
986
1021
  await session.run(queue);
987
1022
 
@@ -48,6 +48,18 @@ const AUTH_RE =
48
48
  const BILLING_RE =
49
49
  /insufficient_quota|credit balance is too low|payment required|purchase more credits/i;
50
50
 
51
+ // A text-only model rejecting an attached image. Vendors phrase it many ways:
52
+ // OpenAI "Invalid content type. image_url is only supported by certain models",
53
+ // OpenRouter "No endpoints found that support image input", others mention
54
+ // "image input" / "does not support images" / "unsupported content type".
55
+ // Only EXPLICIT image-naming phrases — the bare tokens "vision"/"multimodal"/
56
+ // "modality" were removed because the provider body routinely echoes the model id
57
+ // (e.g. "gpt-4-vision-preview", "llama-3.2-90b-vision-instruct"), which would
58
+ // mis-classify an unrelated 400 from a vision-capable model and wrongly disable
59
+ // vision for the rest of the session. Paired with a 400/415/422 status below.
60
+ const IMAGE_UNSUPPORTED_RE =
61
+ /image[_ ]?url|image input|images?(?: are| is)? not supported|does not support images?|no endpoints? .*support image|unsupported content type/i;
62
+
51
63
  export function classifyPiError(
52
64
  providerLabel: string,
53
65
  status: number | undefined,
@@ -85,6 +97,19 @@ export function classifyPiError(
85
97
  message: `${providerLabel} rejected your API key. Update it from the dashboard (Bloby provider settings).${suffix}`,
86
98
  };
87
99
  }
100
+ // A text-only model that the catalog couldn't flag up front (dynamic/unknown
101
+ // sub-providers) 400/415/422s on the attached image. The session reacts by
102
+ // disabling vision for the rest of the session and re-running the round with
103
+ // images downgraded — self-healing so a single screenshot can't permanently
104
+ // 400-poison the conversation (it rides every stateless resend otherwise).
105
+ if ((status === 400 || status === 415 || status === 422) && IMAGE_UNSUPPORTED_RE.test(body)) {
106
+ return {
107
+ kind: 'image-unsupported',
108
+ retryable: false,
109
+ status,
110
+ message: `${providerLabel} rejected the attached image — this model appears to be text-only. Retrying without the image; switch to a vision-capable model to send images.${suffix}`,
111
+ };
112
+ }
88
113
  if (status === 429) {
89
114
  return {
90
115
  kind: 'rate-limit',
@@ -89,6 +89,14 @@ function toAnthropicContent(blocks: PiContentBlock[]): any[] {
89
89
  type: 'image',
90
90
  source: { type: 'base64', media_type: b.mediaType, data: b.data },
91
91
  });
92
+ } else if (b.type === 'document') {
93
+ // Native PDF document block — the Messages API renders the pages and the
94
+ // model reads them as vision. The base64 document source accepts ONLY
95
+ // application/pdf (buildUserMessage gates it on canNativeDocument).
96
+ out.push({
97
+ type: 'document',
98
+ source: { type: 'base64', media_type: b.mediaType, data: b.data },
99
+ });
92
100
  } else if (b.type === 'tool_use') {
93
101
  out.push({
94
102
  type: 'tool_use',
@@ -102,6 +102,11 @@ function toGeminiParts(content: PiContentBlock[]): any[] {
102
102
  parts.push({ text: b.text });
103
103
  } else if (b.type === 'image') {
104
104
  parts.push({ inlineData: { mimeType: b.mediaType, data: b.data } });
105
+ } else if (b.type === 'document') {
106
+ // Gemini ingests application/pdf inline via the same inlineData shape as
107
+ // images (it OCRs/renders the document). buildUserMessage only routes a
108
+ // document block here when the flavor supports it.
109
+ parts.push({ inlineData: { mimeType: b.mediaType, data: b.data } });
105
110
  } else if (b.type === 'tool_use') {
106
111
  // Assistant turn: the model asked to invoke a tool. Thinking-capable
107
112
  // Gemini 3.x rejects (HTTP 400) any echoed functionCall whose
@@ -119,24 +119,33 @@ function toOpenAIMessages(pi: PiMessage[]): any[] {
119
119
  out.push(msg);
120
120
  continue;
121
121
  }
122
- // role === 'user' with non-tool-result content (text + optional images)
122
+ // role === 'user' with non-tool-result content (text + optional images).
123
+ // Media parts go first; text is appended last (parity with the other
124
+ // providers and pi/index's media-first block ordering).
123
125
  const contentBlocks: any[] = [];
124
126
  let plainText = '';
125
- let hasImage = false;
127
+ let hasMedia = false;
126
128
  for (const b of m.content) {
127
129
  if (b.type === 'text') {
128
130
  plainText += (plainText ? '\n' : '') + b.text;
129
131
  } else if (b.type === 'image') {
130
- hasImage = true;
132
+ hasMedia = true;
131
133
  contentBlocks.push({
132
134
  type: 'image_url',
133
135
  image_url: { url: `data:${b.mediaType};base64,${b.data}` },
134
136
  });
137
+ } else if (b.type === 'document') {
138
+ // The Chat Completions schema has no document part — degrade to a text
139
+ // note rather than crashing. The file is also on disk (saved-files
140
+ // note), so the agent can open it with its tools. This shouldn't
141
+ // normally happen: buildUserMessage gates documents on canNativeDocument
142
+ // (false for this flavor), so a PDF here rides as the disk pointer.
143
+ plainText += (plainText ? '\n' : '') +
144
+ `[Attached document${b.name ? ` "${b.name}"` : ''} (${b.mediaType}) could not be inlined for this model — it is saved to disk; open it with your file tools.]`;
135
145
  }
136
146
  }
137
- if (hasImage) {
138
- // Mixed image+text: prepend text part to the content array.
139
- if (plainText) contentBlocks.unshift({ type: 'text', text: plainText });
147
+ if (hasMedia) {
148
+ if (plainText) contentBlocks.push({ type: 'text', text: plainText });
140
149
  out.push({ role: 'user', content: contentBlocks });
141
150
  } else {
142
151
  out.push({ role: 'user', content: plainText });
@@ -17,6 +17,11 @@ export type PiRole = 'user' | 'assistant' | 'tool';
17
17
  export type PiContentBlock =
18
18
  | { type: 'text'; text: string }
19
19
  | { type: 'image'; mediaType: string; data: string } // base64
20
+ // Native document block (PDF). Only the flavors with native document support
21
+ // (anthropic-messages, google-gemini) ever receive one — buildUserMessage
22
+ // gates it on canNativeDocument; openai-completions degrades it to a text
23
+ // note rather than crashing if one ever reaches it.
24
+ | { type: 'document'; mediaType: string; data: string; name?: string } // base64
20
25
  // `thoughtSignature` is a Gemini 3.x thinking-model field. Pi-flavored
21
26
  // providers that emit reasoning attach it to function-call parts; the API
22
27
  // rejects the next turn with HTTP 400 if we don't echo it back verbatim.
@@ -74,7 +79,19 @@ export type PiStopReason = 'end_turn' | 'tool_use' | 'max_tokens' | 'error' | 'a
74
79
  * string-matching: retry transient rounds, tear down on auth/overflow, and
75
80
  * show actionable messages instead of raw provider JSON.
76
81
  */
77
- export type PiErrorKind = 'auth' | 'context-overflow' | 'rate-limit' | 'billing' | 'transient' | 'other';
82
+ export type PiErrorKind =
83
+ | 'auth'
84
+ | 'context-overflow'
85
+ | 'rate-limit'
86
+ | 'billing'
87
+ | 'transient'
88
+ /** The model rejected an image/vision/modality block (a text-only model 400/
89
+ * 415/422s on the attached image). The session reacts by disabling vision
90
+ * for the rest of the session and re-running the round with images
91
+ * downgraded to placeholders — self-healing for dynamic/unknown models whose
92
+ * catalog can't tell us up front whether they see images. */
93
+ | 'image-unsupported'
94
+ | 'other';
78
95
 
79
96
  export type PiStreamEvent =
80
97
  | { type: 'text_delta'; delta: string }
@@ -181,6 +181,14 @@ export function createPiSession(init: PiSessionInit): PiSession {
181
181
  let lastUsage: PiUsage | undefined;
182
182
  let lastContextWindow: number | undefined;
183
183
 
184
+ // Self-healing vision (audit D rank 12): when a model the catalog couldn't
185
+ // classify (dynamic/unknown sub-providers ⇒ supportsImages undefined) rejects
186
+ // an image with an 'image-unsupported' error, latch this for the rest of the
187
+ // session and downgrade images on every subsequent send. The IMAGE stays in
188
+ // history (downgradeImages is transform-on-send only), so switching to a
189
+ // vision-capable model later restores it.
190
+ let visionDisabled = false;
191
+
184
192
  /** One stream round — collect the assistant blocks the model emits this pass. */
185
193
  interface RoundResult {
186
194
  text: string;
@@ -204,7 +212,11 @@ export function createPiSession(init: PiSessionInit): PiSession {
204
212
  baseUrl: auth.baseUrl,
205
213
  apiKey: auth.apiKey,
206
214
  systemPrompt: init.systemPrompt,
207
- messages: auth.supportsImages === false ? downgradeImages(messages) : messages,
215
+ // Downgrade images when the catalog says text-only (supportsImages
216
+ // false) OR a prior round in THIS session learned it the hard way via
217
+ // an 'image-unsupported' error (visionDisabled). The stored history
218
+ // keeps the image so a later vision-capable model still restores it.
219
+ messages: auth.supportsImages === false || visionDisabled ? downgradeImages(messages) : messages,
208
220
  tools: init.tools,
209
221
  toolChoice: opts?.wrapUp ? 'none' : undefined,
210
222
  maxOutputTokens: auth.maxOutputTokens,
@@ -330,6 +342,21 @@ export function createPiSession(init: PiSessionInit): PiSession {
330
342
  res = await runOneRound(needsSeparator);
331
343
  }
332
344
 
345
+ // Self-healing vision (audit D rank 12): a model the catalog couldn't
346
+ // classify just 400/415/422'd on an attached image. Latch visionDisabled
347
+ // and re-run the round ONCE — runOneRound now downgrades images on send,
348
+ // so the resend succeeds. Guarded by !visionDisabled so it fires at most
349
+ // once per session; an image rides every stateless resend, so without
350
+ // this the whole conversation would keep re-400ing.
351
+ if (
352
+ res.errored && res.errorKind === 'image-unsupported' && !visionDisabled &&
353
+ !init.abortController.signal.aborted
354
+ ) {
355
+ log.info('[pi/session] model rejected image — disabling vision for this session and retrying without it');
356
+ visionDisabled = true;
357
+ res = await runOneRound(needsSeparator);
358
+ }
359
+
333
360
  const { text, toolUses, errored } = res;
334
361
 
335
362
  // Append whatever the model produced this round to history so subsequent