bloby-bot 0.70.13 → 0.71.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/bin/cli.js +223 -45
  2. package/dist-bloby/assets/{bloby-CU9KhQdP.js → bloby-es6cZJzs.js} +6 -6
  3. package/dist-bloby/assets/globals-DBqwNiJV.css +2 -0
  4. package/dist-bloby/assets/{highlighted-body-OFNGDK62-D0Tm_wgU.js → highlighted-body-OFNGDK62-8PiOHw9p.js} +1 -1
  5. package/dist-bloby/assets/mermaid-GHXKKRXX-BJWX8urU.js +1 -0
  6. package/dist-bloby/assets/{onboard-GfjHF9nm.js → onboard-BKgy17OU.js} +1 -1
  7. package/dist-bloby/bloby.html +3 -3
  8. package/dist-bloby/onboard.html +3 -3
  9. package/package.json +2 -3
  10. package/scripts/install +141 -34
  11. package/scripts/install.ps1 +111 -15
  12. package/scripts/install.sh +141 -34
  13. package/shared/config.ts +37 -2
  14. package/supervisor/channels/manager.ts +68 -33
  15. package/supervisor/channels/telegram.ts +57 -16
  16. package/supervisor/channels/types.ts +4 -1
  17. package/supervisor/channels/whatsapp.ts +57 -10
  18. package/supervisor/chat/src/components/Chat/AudioBubble.tsx +1 -1
  19. package/supervisor/chat/src/components/Chat/AuthedImage.tsx +16 -3
  20. package/supervisor/chat/src/components/Chat/BlobyImageCard.tsx +2 -2
  21. package/supervisor/chat/src/components/Chat/ImageLightbox.tsx +25 -8
  22. package/supervisor/chat/src/components/Chat/InputBar.tsx +62 -7
  23. package/supervisor/chat/src/components/Chat/MessageBubble.tsx +37 -18
  24. package/supervisor/chat/src/components/Chat/MessageList.tsx +3 -3
  25. package/supervisor/chat/src/hooks/useChat.ts +52 -0
  26. package/supervisor/chat/src/lib/authedFile.ts +24 -12
  27. package/supervisor/file-saver.ts +92 -19
  28. package/supervisor/harnesses/attachment-policy.ts +111 -0
  29. package/supervisor/harnesses/claude.ts +62 -15
  30. package/supervisor/harnesses/codex.ts +69 -43
  31. package/supervisor/harnesses/pi/index.ts +84 -49
  32. package/supervisor/harnesses/pi/providers/humanize-error.ts +25 -0
  33. package/supervisor/harnesses/pi/providers/stream-anthropic.ts +8 -0
  34. package/supervisor/harnesses/pi/providers/stream-google.ts +5 -0
  35. package/supervisor/harnesses/pi/providers/stream-openai-completions.ts +15 -6
  36. package/supervisor/harnesses/pi/providers/types.ts +18 -1
  37. package/supervisor/harnesses/pi/session.ts +28 -1
  38. package/supervisor/index.ts +57 -16
  39. package/supervisor/widget.js +19 -5
  40. package/worker/db.ts +2 -0
  41. package/dist-bloby/assets/globals-DlPtwiZL.css +0 -2
  42. package/dist-bloby/assets/mermaid-GHXKKRXX-B95J3s3s.js +0 -1
  43. package/supervisor/public/headphones_spritesheet.webp +0 -0
  44. package/supervisor/public/spritesheet.webp +0 -0
  45. /package/dist-bloby/assets/{globals-mGpojCOe.js → globals-DN3F0CQE.js} +0 -0
@@ -29,7 +29,7 @@ import { AlexaChannel } from './alexa.js';
29
29
  import { TelegramChannel, type TelegramInbound } from './telegram.js';
30
30
  import type { ChannelConfig, ChannelProvider, ChannelStatus, ChannelType, InboundMessage, InboundMessageAttachment, RoutingTarget, SenderRole } from './types.js';
31
31
  import type { AgentAttachment } from '../bloby-agent.js';
32
- import { saveAttachment, type SavedFile } from '../file-saver.js';
32
+ import { saveAttachment, MAX_ATTACHMENTS_PER_MESSAGE, MAX_TOTAL_ATTACHMENT_BYTES, type SavedFile } from '../file-saver.js';
33
33
  import type { WAMessageKey } from '@whiskeysockets/baileys';
34
34
 
35
35
  const MAX_CONCURRENT_AGENTS = 5;
@@ -37,16 +37,34 @@ const MAX_BUFFER_MESSAGES = 30;
37
37
  const DEBOUNCE_MS = 4000; // 4s — wait for the user to finish typing
38
38
 
39
39
  /** Persist channel-inbound attachments to disk so harnesses that consume file
40
- * paths (Codex's `localImage`) can see them. Failures are logged and the
41
- * attachment is dropped — text-only delivery is still useful. */
42
- function saveInboundAttachments(attachments?: AgentAttachment[]): SavedFile[] {
43
- if (!attachments?.length) return [];
40
+ * paths (Codex's `localImage`) can see them. Per-file failures are logged and that
41
+ * attachment is dropped — one oversize/corrupt file can't abort the whole message,
42
+ * and text-only delivery still goes through. Bounded by MAX_ATTACHMENTS_PER_MESSAGE
43
+ * (count) and MAX_TOTAL_ATTACHMENT_BYTES (decoded bytes) so a single message can't
44
+ * flood the disk; saveAttachment itself caps each file's size. */
45
+ function saveInboundAttachments(attachments?: AgentAttachment[]): { saved: SavedFile[]; accepted: AgentAttachment[] } {
46
+ if (!attachments?.length) return { saved: [], accepted: [] };
47
+ const capped = attachments.slice(0, MAX_ATTACHMENTS_PER_MESSAGE);
48
+ if (attachments.length > capped.length) {
49
+ log.warn(`[channels] Dropping ${attachments.length - capped.length} inbound attachment(s) over the per-message cap (${MAX_ATTACHMENTS_PER_MESSAGE})`);
50
+ }
44
51
  const saved: SavedFile[] = [];
45
- for (const att of attachments) {
46
- try { saved.push(saveAttachment(att)); }
52
+ // The raw attachments that actually saved within budget — handed to the harness so the
53
+ // model inlines exactly what got persisted + shown in chat (no over-cap divergence).
54
+ const accepted: AgentAttachment[] = [];
55
+ let totalBytes = 0;
56
+ for (const att of capped) {
57
+ // Estimate decoded size from the base64 length (×3/4) before writing so a burst of
58
+ // mid-size files can't blow the per-message byte budget in aggregate.
59
+ totalBytes += Math.floor((att.data?.length || 0) * 0.75);
60
+ if (totalBytes > MAX_TOTAL_ATTACHMENT_BYTES) {
61
+ log.warn('[channels] Per-message attachment byte budget exceeded — dropping remaining inbound attachments');
62
+ break;
63
+ }
64
+ try { saved.push(saveAttachment(att)); accepted.push(att); }
47
65
  catch (err: any) { log.warn(`[channels] Failed to save inbound attachment: ${err.message}`); }
48
66
  }
49
- return saved;
67
+ return { saved, accepted };
50
68
  }
51
69
 
52
70
  interface ChannelManagerOpts {
@@ -144,8 +162,8 @@ export class ChannelManager {
144
162
  if (channelConfigs?.whatsapp?.enabled && !this.providers.has('whatsapp')) {
145
163
  log.info('[channels] Initializing WhatsApp channel...');
146
164
  const whatsapp = new WhatsAppChannel(
147
- (sender, senderName, text, fromMe, isSelfChat, chatJid, isGroup, images, inboundKey) => {
148
- const attachments = images?.map((img) => ({ type: 'image' as const, mediaType: img.mediaType, data: img.data }));
165
+ (sender, senderName, text, fromMe, isSelfChat, chatJid, isGroup, media, inboundKey) => {
166
+ const attachments = media?.map((att) => ({ type: att.type, mediaType: att.mediaType, data: att.data, name: att.name }));
149
167
  this.handleInboundMessage('whatsapp', sender, senderName, text, fromMe, isSelfChat, chatJid, isGroup, attachments, inboundKey);
150
168
  },
151
169
  (status) => this.handleStatusChange(status),
@@ -211,7 +229,7 @@ export class ChannelManager {
211
229
  }
212
230
  }
213
231
  const isOwner = !!ownerUserId && msg.fromUserId === String(ownerUserId);
214
- const attachments = msg.images?.map((img) => ({ type: 'image' as const, mediaType: img.mediaType, data: img.data }));
232
+ const attachments = msg.attachments?.map((att) => ({ type: att.type, mediaType: att.mediaType, data: att.data, name: att.name }));
215
233
  // Sanitize the attacker-controlled display name so it can't fake a `[Telegram | … | admin]`
216
234
  // context tag or inject newlines into the agent's context.
217
235
  const safeName = msg.senderName ? msg.senderName.replace(/[\[\]|\r\n]/g, ' ').slice(0, 64).trim() || undefined : undefined;
@@ -236,8 +254,8 @@ export class ChannelManager {
236
254
  if (provider?.getStatus().connected) return;
237
255
  if (!provider) {
238
256
  const whatsapp = new WhatsAppChannel(
239
- (sender, senderName, text, fromMe, isSelfChat, chatJid, isGroup, images, inboundKey) => {
240
- const attachments = images?.map((img) => ({ type: 'image' as const, mediaType: img.mediaType, data: img.data }));
257
+ (sender, senderName, text, fromMe, isSelfChat, chatJid, isGroup, media, inboundKey) => {
258
+ const attachments = media?.map((att) => ({ type: att.type, mediaType: att.mediaType, data: att.data, name: att.name }));
241
259
  this.handleInboundMessage('whatsapp', sender, senderName, text, fromMe, isSelfChat, chatJid, isGroup, attachments, inboundKey);
242
260
  },
243
261
  (status) => this.handleStatusChange(status),
@@ -992,12 +1010,34 @@ export class ChannelManager {
992
1010
  const channelTag = `[${this.channelLabel(msg.channel)} | ${msg.sender} | ${earlyRoleTag}]\n`;
993
1011
  const displayContent = channelTag + rawDisplay;
994
1012
 
1013
+ // Convert inbound attachments to agent format and persist them to disk BEFORE the
1014
+ // user-message persist/broadcast — so the StoredAttachment array (filePath-based, served
1015
+ // at /api/files/<relPath>) can ride along in meta.attachments + chat:sync. Without this
1016
+ // the agent sees the media but the chat shows nothing (live or after refresh). An image
1017
+ // keeps an auto-generated name; a file uses the channel-provided filename. (Mirrors the
1018
+ // PWA path in supervisor/index.ts.)
1019
+ const agentAttachments: AgentAttachment[] | undefined = msg.attachments?.map((att) => ({
1020
+ type: att.type,
1021
+ name: att.type === 'image'
1022
+ ? `${msg.channel}_image.${att.mediaType.split('/')[1] || 'jpg'}`
1023
+ : (att.name || `${msg.channel}_file`),
1024
+ mediaType: att.mediaType,
1025
+ data: att.data,
1026
+ }));
1027
+ // Save to disk so providers that consume file paths (Codex → localImage) can see the
1028
+ // attachment. Claude consumes raw base64 from `agentAttachments` directly, but the
1029
+ // on-disk copy is what the chat UI renders (filePath → /api/files/<relPath>).
1030
+ const { saved: savedFiles, accepted: acceptedAttachments } = saveInboundAttachments(agentAttachments);
1031
+ const storedAtts = savedFiles.map((f) => ({ type: f.type, name: f.name, mediaType: f.mediaType, filePath: f.relPath }));
1032
+
995
1033
  // Save user message to DB
996
1034
  try {
1035
+ const userMeta: any = { model, channel: msg.channel };
1036
+ if (storedAtts.length) userMeta.attachments = JSON.stringify(storedAtts);
997
1037
  await workerApi(`/api/conversations/${convId}/messages`, 'POST', {
998
1038
  role: 'user',
999
1039
  content: displayContent,
1000
- meta: { model, channel: msg.channel },
1040
+ meta: userMeta,
1001
1041
  });
1002
1042
  } catch (err: any) {
1003
1043
  log.warn(`[channels] DB persist error: ${err.message}`);
@@ -1006,7 +1046,12 @@ export class ChannelManager {
1006
1046
  // Broadcast to chat clients (mirroring)
1007
1047
  broadcastBloby('chat:sync', {
1008
1048
  conversationId: convId,
1009
- message: { role: 'user', content: displayContent, timestamp: new Date().toISOString() },
1049
+ message: {
1050
+ role: 'user',
1051
+ content: displayContent,
1052
+ timestamp: new Date().toISOString(),
1053
+ attachments: storedAtts.length ? storedAtts : undefined,
1054
+ },
1010
1055
  });
1011
1056
 
1012
1057
  // Fetch names and recent messages
@@ -1033,18 +1078,6 @@ export class ChannelManager {
1033
1078
  // Channel context — same tag we already prepended to the stored display content
1034
1079
  const channelContext = channelTag;
1035
1080
 
1036
- // Convert inbound attachments to agent format
1037
- const agentAttachments: AgentAttachment[] | undefined = msg.attachments?.map((att) => ({
1038
- type: 'image' as const,
1039
- name: `whatsapp_image.${att.mediaType.split('/')[1] || 'jpg'}`,
1040
- mediaType: att.mediaType,
1041
- data: att.data,
1042
- }));
1043
- // Save to disk so providers that consume file paths (Codex → localImage)
1044
- // can see the attachment. Claude consumes raw base64 from `agentAttachments`
1045
- // directly, but the on-disk copy is still useful for the path mention.
1046
- const savedFiles = saveInboundAttachments(agentAttachments);
1047
-
1048
1081
  // Show "typing..." in the correct chat
1049
1082
  this.startTyping(msg.channel, msg.rawSender);
1050
1083
 
@@ -1127,7 +1160,7 @@ export class ChannelManager {
1127
1160
  assistantBufferKey: msg.role === 'assistant' ? `${msg.channel}:${msg.sender}` : undefined,
1128
1161
  inboundKey: msg.inboundKey,
1129
1162
  };
1130
- this.pushWithRouting(convId, target, channelContent, agentAttachments, savedFiles);
1163
+ this.pushWithRouting(convId, target, channelContent, acceptedAttachments, savedFiles);
1131
1164
  }
1132
1165
 
1133
1166
  /** Synchronously handle an Alexa utterance: push into the shared conversation,
@@ -1338,14 +1371,16 @@ export class ChannelManager {
1338
1371
 
1339
1372
  const channelContext = `[${this.channelLabel(msg.channel)} | ${msg.sender} | customer${msg.senderName ? ` | ${msg.senderName}` : ''}]\n`;
1340
1373
 
1341
- // Convert inbound attachments to agent format
1374
+ // Convert inbound attachments to agent format (image → auto-name; file → channel filename)
1342
1375
  const agentAttachments: AgentAttachment[] | undefined = msg.attachments?.map((att) => ({
1343
- type: 'image' as const,
1344
- name: `whatsapp_image.${att.mediaType.split('/')[1] || 'jpg'}`,
1376
+ type: att.type,
1377
+ name: att.type === 'image'
1378
+ ? `${msg.channel}_image.${att.mediaType.split('/')[1] || 'jpg'}`
1379
+ : (att.name || `${msg.channel}_file`),
1345
1380
  mediaType: att.mediaType,
1346
1381
  data: att.data,
1347
1382
  }));
1348
- const savedFiles = saveInboundAttachments(agentAttachments);
1383
+ const { saved: savedFiles, accepted: acceptedAttachments } = saveInboundAttachments(agentAttachments);
1349
1384
 
1350
1385
  // Stable convId per customer (not per message)
1351
1386
  const convId = `channel-${agentKey}`;
@@ -1404,7 +1439,7 @@ export class ChannelManager {
1404
1439
  this.processQueue();
1405
1440
  }
1406
1441
  },
1407
- agentAttachments,
1442
+ acceptedAttachments,
1408
1443
  savedFiles,
1409
1444
  { botName, humanName },
1410
1445
  recentMessages,
@@ -20,10 +20,14 @@ const POLL_TIMEOUT_S = 25; // long-poll hold time
20
20
  const MAX_MESSAGE_CHARS = 4096; // Telegram hard limit per sendMessage
21
21
  const TYPING_REFRESH_MS = 5_000; // Telegram "typing" expires ~5s
22
22
 
23
- /** Image extracted from an inbound Telegram message. */
24
- export interface TelegramImageAttachment {
23
+ /** Media attachment extracted from an inbound Telegram message.
24
+ * `type: 'image'` → inline vision; `type: 'file'` → a document the agent reads from disk. */
25
+ export interface TelegramMediaAttachment {
26
+ type: 'image' | 'file';
25
27
  mediaType: string;
26
28
  data: string; // base64
29
+ /** Original filename — present for documents, absent for photos. */
30
+ name?: string;
27
31
  }
28
32
 
29
33
  /** Normalized inbound message handed to the ChannelManager. */
@@ -37,7 +41,7 @@ export interface TelegramInbound {
37
41
  text: string;
38
42
  isGroup: boolean;
39
43
  messageId?: number;
40
- images?: TelegramImageAttachment[];
44
+ attachments?: TelegramMediaAttachment[];
41
45
  }
42
46
 
43
47
  export type OnTelegramMessage = (msg: TelegramInbound) => void;
@@ -236,13 +240,27 @@ export class TelegramChannel implements ChannelProvider {
236
240
  : (from.username || undefined);
237
241
 
238
242
  let rawText: string = message.text || message.caption || '';
239
- const images: TelegramImageAttachment[] = [];
243
+ const attachments: TelegramMediaAttachment[] = [];
240
244
 
241
- // Photo: download the largest available size.
245
+ // Photo: download the largest available size. Derive the real mediaType from the CDN
246
+ // file extension (Telegram stores PNG/JPEG/WebP as-is) — default to image/jpeg only when unknown.
242
247
  if (Array.isArray(message.photo) && message.photo.length > 0) {
243
248
  const largest = message.photo[message.photo.length - 1];
244
249
  const img = await this.downloadFile(largest.file_id).catch(() => null);
245
- if (img) images.push({ mediaType: 'image/jpeg', data: img.toString('base64') });
250
+ if (img) attachments.push({ type: 'image', mediaType: mimeFromPath(img.filePath, 'image/jpeg'), data: img.buffer.toString('base64') });
251
+ }
252
+
253
+ // Document: download the binary and forward as a file the agent reads from disk.
254
+ if (message.document?.file_id) {
255
+ const doc = await this.downloadFile(message.document.file_id).catch(() => null);
256
+ if (doc) {
257
+ attachments.push({
258
+ type: 'file',
259
+ mediaType: message.document.mime_type || mimeFromPath(doc.filePath, 'application/octet-stream'),
260
+ data: doc.buffer.toString('base64'),
261
+ name: message.document.file_name || undefined,
262
+ });
263
+ }
246
264
  }
247
265
 
248
266
  // Voice note / audio: download + transcribe.
@@ -252,9 +270,9 @@ export class TelegramChannel implements ChannelProvider {
252
270
  await this.sendMessage(chatId, 'Voice transcription is off — add an OpenAI API key in your Bloby chat settings (the three-dots menu) to enable it.');
253
271
  return;
254
272
  }
255
- const buf = await this.downloadFile(voice.file_id).catch(() => null);
256
- if (buf) {
257
- const transcript = await this.transcribe(buf.toString('base64')).catch(() => null);
273
+ const got = await this.downloadFile(voice.file_id).catch(() => null);
274
+ if (got) {
275
+ const transcript = await this.transcribe(got.buffer.toString('base64')).catch(() => null);
258
276
  if (transcript) {
259
277
  rawText = transcript;
260
278
  log.info(`[telegram] Transcribed voice: "${rawText.slice(0, 80)}"`);
@@ -265,12 +283,23 @@ export class TelegramChannel implements ChannelProvider {
265
283
  }
266
284
  }
267
285
 
268
- if (!rawText && images.length === 0) return;
269
- if (!rawText && images.length > 0) rawText = '(image)';
286
+ // Nothing usable extracted. If the message DID carry media we couldn't handle
287
+ // (sticker, video, location, contact, etc.), tell the user instead of dropping it silently.
288
+ if (!rawText && attachments.length === 0) {
289
+ const hadUnsupportedMedia = !!(message.sticker || message.video || message.video_note ||
290
+ message.animation || message.location || message.contact || message.poll || message.dice);
291
+ if (hadUnsupportedMedia) {
292
+ await this.sendMessage(chatId, "Sorry, I can't read that type of message yet — try sending text, a photo, or a document.");
293
+ }
294
+ return;
295
+ }
296
+ if (!rawText && attachments.length > 0) {
297
+ rawText = attachments.some((a) => a.type === 'image') ? '(image)' : '(document)';
298
+ }
270
299
 
271
300
  const text = escapeMessageText(rawText);
272
301
 
273
- log.info(`[telegram] Message from ${fromUserId} (chat=${chatId}, group=${isGroup}, images=${images.length}): ${text.slice(0, 80)}`);
302
+ log.info(`[telegram] Message from ${fromUserId} (chat=${chatId}, group=${isGroup}, media=${attachments.length}): ${text.slice(0, 80)}`);
274
303
 
275
304
  this.onMessage({
276
305
  chatId,
@@ -279,12 +308,13 @@ export class TelegramChannel implements ChannelProvider {
279
308
  text,
280
309
  isGroup,
281
310
  messageId: message.message_id,
282
- images: images.length > 0 ? images : undefined,
311
+ attachments: attachments.length > 0 ? attachments : undefined,
283
312
  });
284
313
  }
285
314
 
286
- /** Resolve a Telegram file_id to its bytes (getFile → download from the file CDN). */
287
- private async downloadFile(fileId: string): Promise<Buffer | null> {
315
+ /** Resolve a Telegram file_id to its bytes (getFile → download from the file CDN).
316
+ * Also returns the CDN file_path so callers can derive an extension/mediaType. */
317
+ private async downloadFile(fileId: string): Promise<{ buffer: Buffer; filePath: string } | null> {
288
318
  const file = await this.call('getFile', { file_id: fileId });
289
319
  const filePath = file?.file_path;
290
320
  if (!filePath) return null;
@@ -292,7 +322,7 @@ export class TelegramChannel implements ChannelProvider {
292
322
  if (!r.ok) throw new Error(`file download HTTP ${r.status}`);
293
323
  const buf = Buffer.from(await r.arrayBuffer());
294
324
  log.info(`[telegram] Downloaded file (${Math.round(buf.length / 1024)}KB)`);
295
- return buf;
325
+ return { buffer: buf, filePath };
296
326
  }
297
327
 
298
328
  /** Call a Bot API method, returning `result` or throwing on `ok:false`. */
@@ -336,6 +366,17 @@ function sleep(ms: number): Promise<void> {
336
366
  return new Promise((resolve) => setTimeout(resolve, ms));
337
367
  }
338
368
 
369
+ /** Best-effort mime type from a file path's extension; returns `fallback` when unknown.
370
+ * Used for Telegram photos/documents where the API doesn't always supply a mime_type. */
371
+ function mimeFromPath(filePath: string | undefined, fallback: string): string {
372
+ const ext = (filePath?.split('.').pop() || '').toLowerCase();
373
+ const map: Record<string, string> = {
374
+ png: 'image/png', jpg: 'image/jpeg', jpeg: 'image/jpeg', gif: 'image/gif', webp: 'image/webp',
375
+ pdf: 'application/pdf', zip: 'application/zip', txt: 'text/plain', csv: 'text/csv', json: 'application/json',
376
+ };
377
+ return map[ext] || fallback;
378
+ }
379
+
339
380
  /** Split a long message into <=limit-char chunks, preferring newline boundaries. */
340
381
  function splitMessage(text: string, limit: number): string[] {
341
382
  if (text.length <= limit) return [text];
@@ -23,9 +23,12 @@ export interface ChannelConfig {
23
23
  }
24
24
 
25
25
  export interface InboundMessageAttachment {
26
- type: 'image';
26
+ /** 'image' → inline vision block; 'file' → any document the agent reads from disk. */
27
+ type: 'image' | 'file';
27
28
  mediaType: string;
28
29
  data: string; // base64
30
+ /** Original filename when the channel provides one (WhatsApp/Telegram documents). */
31
+ name?: string;
29
32
  }
30
33
 
31
34
  export interface InboundMessage {
@@ -24,16 +24,21 @@ import type { ChannelProvider, ChannelStatus, ChannelType } from './types.js';
24
24
 
25
25
  const AUTH_DIR = path.join(DATA_DIR, 'channels', 'whatsapp', 'auth');
26
26
 
27
- /** Image attachment extracted from a WhatsApp message */
28
- export interface WhatsAppImageAttachment {
27
+ /** Media attachment extracted from a WhatsApp message.
28
+ * `type: 'image'` → inline vision; `type: 'file'` → a document the agent reads from disk. */
29
+ export interface WhatsAppMediaAttachment {
30
+ type: 'image' | 'file';
29
31
  mediaType: string;
30
32
  data: string; // base64
33
+ /** Original filename — present for documents (WhatsApp supplies it), absent for images. */
34
+ name?: string;
31
35
  }
32
36
 
33
37
  /** Callback when a new message arrives.
34
38
  * - sender: who sent it (phone JID, translated from LID where possible)
35
39
  * - chatJid: the conversation identifier (group JID for groups, peer JID for 1:1) — reply to this
36
40
  * - isGroup: true when the chat is a WhatsApp group (@g.us)
41
+ * - media: image and/or document attachments extracted from the message
37
42
  * - inboundKey: original Baileys message key — used to react/quote/ack the user's message
38
43
  */
39
44
  export type OnWhatsAppMessage = (
@@ -44,7 +49,7 @@ export type OnWhatsAppMessage = (
44
49
  isSelfChat: boolean,
45
50
  chatJid: string,
46
51
  isGroup: boolean,
47
- images?: WhatsAppImageAttachment[],
52
+ media?: WhatsAppMediaAttachment[],
48
53
  inboundKey?: WAMessageKey,
49
54
  ) => void;
50
55
 
@@ -576,7 +581,7 @@ export class WhatsAppChannel implements ChannelProvider {
576
581
 
577
582
  // Extract text — or transcribe audio if it's a voice note
578
583
  let rawText = this.extractText(msg.message);
579
- const images: WhatsAppImageAttachment[] = [];
584
+ const media: WhatsAppMediaAttachment[] = [];
580
585
 
581
586
  // Download image if present
582
587
  if (this.isImageMessage(msg.message)) {
@@ -584,13 +589,32 @@ export class WhatsAppChannel implements ChannelProvider {
584
589
  const buffer = await downloadMediaMessage(msg, 'buffer', {}) as Buffer;
585
590
  const mimeType = this.getImageMimeType(msg.message) || 'image/jpeg';
586
591
  const base64 = buffer.toString('base64');
587
- images.push({ mediaType: mimeType, data: base64 });
592
+ media.push({ type: 'image', mediaType: mimeType, data: base64 });
588
593
  log.info(`[whatsapp] Downloaded image (${Math.round(buffer.length / 1024)}KB, ${mimeType})`);
589
594
  } catch (err: any) {
590
595
  log.warn(`[whatsapp] Image download failed: ${err.message}`);
591
596
  }
592
597
  }
593
598
 
599
+ // Download document if present (PDF, docx, zip, etc.) — the binary is downloaded
600
+ // here (the caption, if any, is already covered by extractText above).
601
+ const docInfo = this.getDocumentInfo(msg.message);
602
+ if (docInfo) {
603
+ try {
604
+ const buffer = await downloadMediaMessage(msg, 'buffer', {}) as Buffer;
605
+ const base64 = buffer.toString('base64');
606
+ media.push({
607
+ type: 'file',
608
+ mediaType: docInfo.mimetype || 'application/octet-stream',
609
+ data: base64,
610
+ name: docInfo.fileName,
611
+ });
612
+ log.info(`[whatsapp] Downloaded document (${Math.round(buffer.length / 1024)}KB, ${docInfo.mimetype || 'unknown'}, ${docInfo.fileName || 'unnamed'})`);
613
+ } catch (err: any) {
614
+ log.warn(`[whatsapp] Document download failed: ${err.message}`);
615
+ }
616
+ }
617
+
594
618
  if (!rawText && this.isAudioMessage(msg.message)) {
595
619
  // Voice note / audio — download and transcribe
596
620
  if (!this.transcribe) {
@@ -616,11 +640,11 @@ export class WhatsAppChannel implements ChannelProvider {
616
640
  }
617
641
  }
618
642
 
619
- // Skip if no text AND no images; otherwise default text for image-only
643
+ // Skip if no text AND no media; otherwise default text for media-only
620
644
  // messages. Collapsing both branches also narrows `rawText` to `string`.
621
645
  if (!rawText) {
622
- if (images.length === 0) continue;
623
- rawText = '(image)';
646
+ if (media.length === 0) continue;
647
+ rawText = media.some((m) => m.type === 'image') ? '(image)' : '(document)';
624
648
  }
625
649
 
626
650
  // Escape special characters to prevent prompt injection via message content
@@ -663,7 +687,7 @@ export class WhatsAppChannel implements ChannelProvider {
663
687
  const ownsParticipant = !participant || participantResolved === this.ownPhoneJid;
664
688
  const isSelfChat = !isGroup && ownsChat && ownsParticipant;
665
689
 
666
- log.info(`[whatsapp] Message from ${sender} (chat=${chatJid}, group=${isGroup}, fromMe=${fromMe}, selfChat=${isSelfChat}, images=${images.length}): ${text.slice(0, 80)}`);
690
+ log.info(`[whatsapp] Message from ${sender} (chat=${chatJid}, group=${isGroup}, fromMe=${fromMe}, selfChat=${isSelfChat}, media=${media.length}): ${text.slice(0, 80)}`);
667
691
 
668
692
  this.onMessage(
669
693
  sender,
@@ -673,7 +697,7 @@ export class WhatsAppChannel implements ChannelProvider {
673
697
  isSelfChat,
674
698
  chatJid,
675
699
  isGroup,
676
- images.length > 0 ? images : undefined,
700
+ media.length > 0 ? media : undefined,
677
701
  msg.key,
678
702
  );
679
703
  }
@@ -692,6 +716,10 @@ export class WhatsAppChannel implements ChannelProvider {
692
716
  if (message.imageMessage?.caption) return message.imageMessage.caption;
693
717
  if (message.videoMessage?.caption) return message.videoMessage.caption;
694
718
  if (message.documentMessage?.caption) return message.documentMessage.caption;
719
+ // Captioned documents arrive wrapped in documentWithCaptionMessage.
720
+ if (message.documentWithCaptionMessage?.message?.documentMessage?.caption) {
721
+ return message.documentWithCaptionMessage.message.documentMessage.caption;
722
+ }
695
723
 
696
724
  // View-once wrappers
697
725
  if (message.viewOnceMessage?.message) return this.extractText(message.viewOnceMessage.message);
@@ -729,6 +757,25 @@ export class WhatsAppChannel implements ChannelProvider {
729
757
  return null;
730
758
  }
731
759
 
760
+ /** Extract document metadata (mimetype + fileName) from a message, unwrapping the
761
+ * common containers. Returns null when there is no document.
762
+ *
763
+ * WhatsApp wraps a captioned document in `documentWithCaptionMessage.message.documentMessage`
764
+ * while a bare document is `documentMessage` directly — both must resolve. (The actual binary
765
+ * is fetched via downloadMediaMessage on the outer `msg`, which Baileys unwraps itself.) */
766
+ private getDocumentInfo(message: any): { mimetype?: string; fileName?: string } | null {
767
+ if (!message) return null;
768
+ const doc =
769
+ message.documentMessage ||
770
+ message.documentWithCaptionMessage?.message?.documentMessage ||
771
+ message.viewOnceMessage?.message?.documentMessage ||
772
+ message.viewOnceMessageV2?.message?.documentMessage ||
773
+ message.ephemeralMessage?.message?.documentMessage ||
774
+ message.ephemeralMessage?.message?.documentWithCaptionMessage?.message?.documentMessage;
775
+ if (!doc) return null;
776
+ return { mimetype: doc.mimetype || undefined, fileName: doc.fileName || undefined };
777
+ }
778
+
732
779
  /** Check if a message contains audio (voice note or audio file) */
733
780
  private isAudioMessage(message: any): boolean {
734
781
  if (!message) return false;
@@ -23,7 +23,7 @@ export default function AudioBubble({ audioData }: Props) {
23
23
  // Historical audio is stored as an /api/files/* path, which a native Audio element
24
24
  // can't fetch (the Bearer token can't ride on the request) — resolve it to a blob URL
25
25
  // fetched with auth. data: URLs (freshly-recorded clips) pass straight through.
26
- const resolvedAudioUrl = useAuthedFileUrl(audioData);
26
+ const { url: resolvedAudioUrl } = useAuthedFileUrl(audioData);
27
27
 
28
28
  // Create Audio element once the source URL is ready
29
29
  useEffect(() => {
@@ -1,3 +1,4 @@
1
+ import { ImageOff } from 'lucide-react';
1
2
  import { useAuthedFileUrl } from '../../lib/authedFile';
2
3
 
3
4
  interface Props {
@@ -11,11 +12,23 @@ interface Props {
11
12
  * An `<img>` for `/api/files/*` attachments. The file is fetched with the auth token
12
13
  * (see `useAuthedFileUrl`) and rendered from a blob URL, because a native `<img src>`
13
14
  * request can't carry the Bearer token that `/api/files` now requires. While the fetch
14
- * is in flight (or if it failed) a subtle pulsing placeholder is shown in its place so
15
- * the layout doesn't jump.
15
+ * is in flight a subtle pulsing placeholder is shown in its place so the layout doesn't
16
+ * jump; if it failed (deleted / 401 / 5xx) a "broken image" fallback is shown instead.
16
17
  */
17
18
  export default function AuthedImage({ src, alt, className, onClick }: Props) {
18
- const resolvedSrc = useAuthedFileUrl(src);
19
+ const { url: resolvedSrc, status } = useAuthedFileUrl(src);
20
+
21
+ if (status === 'error') {
22
+ return (
23
+ <div
24
+ className={`${className ?? ''} flex items-center justify-center bg-black/10 text-muted-foreground/50`}
25
+ onClick={onClick}
26
+ title={alt || 'Image not found'}
27
+ >
28
+ <ImageOff className="h-5 w-5" />
29
+ </div>
30
+ );
31
+ }
19
32
 
20
33
  if (!resolvedSrc) {
21
34
  return <div className={`${className ?? ''} bg-white/10 animate-pulse`} onClick={onClick} />;
@@ -18,7 +18,7 @@ export default function BlobyImageCard({ src, alt }: Props) {
18
18
  // `src` may be a same-origin /api/files/* path (needs the auth token, which a native
19
19
  // <img> can't send) or an external URL — useAuthedFileUrl only fetches+authes the
20
20
  // former and passes external URLs through untouched (so the token never leaves origin).
21
- const resolvedSrc = useAuthedFileUrl(src);
21
+ const { url: resolvedSrc, status } = useAuthedFileUrl(src);
22
22
 
23
23
  const handleDownload = async () => {
24
24
  try {
@@ -37,7 +37,7 @@ export default function BlobyImageCard({ src, alt }: Props) {
37
37
  }
38
38
  };
39
39
 
40
- if (failed) {
40
+ if (failed || status === 'error') {
41
41
  return (
42
42
  <div className="my-2 flex items-center gap-2.5 px-3.5 py-2.5 rounded-xl border border-border/30 bg-black/10 text-muted-foreground/50 text-xs">
43
43
  <ImageOff className="h-4 w-4 shrink-0" />
@@ -1,17 +1,26 @@
1
1
  import { useCallback, useEffect } from 'react';
2
2
  import { motion, AnimatePresence } from 'framer-motion';
3
- import { ChevronLeft, ChevronRight, X, Download } from 'lucide-react';
3
+ import { ChevronLeft, ChevronRight, X, Download, ImageOff } from 'lucide-react';
4
4
  import { authFetch } from '../../lib/auth';
5
5
  import { useAuthedFileUrl } from '../../lib/authedFile';
6
6
 
7
+ /** One lightbox entry: the (possibly data:/`/api/files`) URL plus the human filename,
8
+ * so downloads and alt text use the real name rather than a URL stamp. */
9
+ export interface LightboxImage {
10
+ url: string;
11
+ name?: string;
12
+ }
13
+
7
14
  interface Props {
8
- images: string[];
15
+ images: LightboxImage[];
9
16
  index: number;
10
17
  onClose: () => void;
11
18
  onNavigate: (index: number) => void;
12
19
  }
13
20
 
14
21
  export default function ImageLightbox({ images, index, onClose, onNavigate }: Props) {
22
+ const current = images[index];
23
+
15
24
  const goPrev = useCallback(() => {
16
25
  if (index > 0) onNavigate(index - 1);
17
26
  }, [index, onNavigate]);
@@ -22,7 +31,7 @@ export default function ImageLightbox({ images, index, onClose, onNavigate }: Pr
22
31
 
23
32
  // /api/files/* needs the auth token, which a native <img src> can't send — resolve
24
33
  // the currently-shown image to a blob URL fetched with the Authorization header.
25
- const resolvedSrc = useAuthedFileUrl(images[index]);
34
+ const { url: resolvedSrc, status } = useAuthedFileUrl(current?.url);
26
35
 
27
36
  useEffect(() => {
28
37
  const handleKey = (e: KeyboardEvent) => {
@@ -50,18 +59,18 @@ export default function ImageLightbox({ images, index, onClose, onNavigate }: Pr
50
59
  onClick={async (e) => {
51
60
  e.stopPropagation();
52
61
  try {
53
- const res = await authFetch(images[index]);
62
+ const res = await authFetch(current.url);
54
63
  const blob = await res.blob();
55
64
  const url = URL.createObjectURL(blob);
56
65
  const a = document.createElement('a');
57
66
  a.href = url;
58
- a.download = images[index].split('/').pop() || 'image';
67
+ a.download = current.name || current.url.split('/').pop() || 'image';
59
68
  document.body.appendChild(a);
60
69
  a.click();
61
70
  document.body.removeChild(a);
62
71
  URL.revokeObjectURL(url);
63
72
  } catch {
64
- window.open(images[index], '_blank');
73
+ window.open(current.url, '_blank');
65
74
  }
66
75
  }}
67
76
  className="p-2 rounded-full bg-white/10 hover:bg-white/20 transition-colors text-white"
@@ -105,10 +114,18 @@ export default function ImageLightbox({ images, index, onClose, onNavigate }: Pr
105
114
  )}
106
115
 
107
116
  {/* Image */}
108
- {resolvedSrc ? (
117
+ {status === 'error' ? (
118
+ <div
119
+ className="flex flex-col items-center gap-2 rounded-lg bg-white/5 px-8 py-10 text-white/50"
120
+ onClick={(e) => e.stopPropagation()}
121
+ >
122
+ <ImageOff className="h-10 w-10" />
123
+ <span className="text-sm">{current?.name || 'Image not found'}</span>
124
+ </div>
125
+ ) : resolvedSrc ? (
109
126
  <img
110
127
  src={resolvedSrc}
111
- alt=""
128
+ alt={current?.name || ''}
112
129
  className="max-h-[85vh] max-w-[90vw] object-contain rounded-lg"
113
130
  onClick={(e) => e.stopPropagation()}
114
131
  />