@friendlyrobot/discord-pi-agent 0.12.0 → 0.14.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,11 @@
1
1
  import type { Model } from "@earendil-works/pi-ai";
2
2
  import type { AgentService } from "./agent-service";
3
3
  /**
4
- * Use a vision-capable model to describe an image, returning a text
5
- * description that can be inlined into a prompt for a non-vision model.
4
+ * Use a vision-capable model to describe a media attachment (image or PDF),
5
+ * returning a text description that can be inlined into a prompt for a
6
+ * non-vision model.
6
7
  *
7
- * Creates a temporary in-memory session, sends the image, extracts the
8
+ * Creates a temporary in-memory session, sends the media, extracts the
8
9
  * assistant's text reply, then disposes the session.
9
10
  */
10
11
  export declare function describeImage(agentService: AgentService, imageData: string, mimeType: string, userText: string, visionModel: Model<any>): Promise<string>;
package/dist/index.js CHANGED
@@ -870,26 +870,32 @@ var logger5 = createModuleLogger("image-description");
870
870
  async function describeImage(agentService, imageData, mimeType, userText, visionModel) {
871
871
  const session = await agentService.createTemporarySession();
872
872
  await session.setModel(visionModel);
873
+ const mediaType = getMediaType(mimeType);
873
874
  const imageContent = {
874
875
  type: "image",
875
876
  data: imageData,
876
877
  mimeType
877
878
  };
878
- const promptText = userText.trim().length > 0 ? `The user sent this image with the following message: "${userText}". Please describe the image in detail and address any questions from the user's message.` : "Please describe this image in detail. What do you see?";
879
+ let promptText;
880
+ if (mediaType === "document") {
881
+ promptText = userText.trim().length > 0 ? `The user sent a document with the following message: "${userText}". Please extract and summarize the text content of this document. Be thorough — include all important details, sections, and data from the document.` : "Please extract and summarize the text content of this document. Be thorough — include all important details, sections, data, and key points.";
882
+ } else {
883
+ promptText = userText.trim().length > 0 ? `The user sent this image with the following message: "${userText}". Please describe the image in detail and address any questions from the user's message.` : "Please describe this image in detail. What do you see?";
884
+ }
879
885
  let text = "";
880
886
  try {
881
887
  await session.prompt(promptText, { images: [imageContent] });
882
888
  text = extractLastAssistantText(session);
883
889
  } catch (error) {
884
- logger5.error({ error }, "vision model prompt failed");
885
- text = "(Vision model failed to process the image.)";
890
+ logger5.error({ error, mimeType }, "vision model prompt failed");
891
+ text = "(Vision model failed to process the file.)";
886
892
  } finally {
887
893
  session.dispose();
888
894
  }
889
895
  if (!text) {
890
896
  return "(Vision model returned no description.)";
891
897
  }
892
- logger5.debug({ textLength: text.length }, "image described");
898
+ logger5.debug({ textLength: text.length, mimeType }, "media described");
893
899
  return text;
894
900
  }
895
901
  function extractLastAssistantText(session) {
@@ -914,6 +920,12 @@ function extractLastAssistantText(session) {
914
920
  }
915
921
  return "";
916
922
  }
923
+ function getMediaType(mimeType) {
924
+ if (mimeType.startsWith("image/")) {
925
+ return "image";
926
+ }
927
+ return "document";
928
+ }
917
929
  function isAssistantMessage(msg) {
918
930
  return typeof msg === "object" && msg !== null && "role" in msg && msg.role === "assistant";
919
931
  }
@@ -1211,28 +1223,57 @@ async function readTextAttachments(message) {
1211
1223
  }
1212
1224
  return results;
1213
1225
  }
1214
- var IMAGE_ATTACHMENT_EXTENSIONS = [".png", ".jpg", ".jpeg", ".gif", ".webp"];
1215
- var MAX_IMAGE_ATTACHMENT_SIZE = 10 * 1024 * 1024;
1216
- async function readImageAttachments(message) {
1226
+ var MEDIA_ATTACHMENT_EXTENSIONS = [
1227
+ ".png",
1228
+ ".jpg",
1229
+ ".jpeg",
1230
+ ".gif",
1231
+ ".webp",
1232
+ ".pdf",
1233
+ ".docx",
1234
+ ".doc",
1235
+ ".pptx",
1236
+ ".ppt",
1237
+ ".xlsx",
1238
+ ".xls"
1239
+ ];
1240
+ var MAX_MEDIA_ATTACHMENT_SIZE = 25 * 1024 * 1024;
1241
+ var OFFICE_MIME_TYPES = new Set([
1242
+ "application/pdf",
1243
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
1244
+ "application/msword",
1245
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation",
1246
+ "application/vnd.ms-powerpoint",
1247
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
1248
+ "application/vnd.ms-excel"
1249
+ ]);
1250
+ function isMediaAttachment(attachment) {
1251
+ const ext = attachment.name?.slice(attachment.name.lastIndexOf(".")).toLowerCase();
1252
+ if (!ext || !MEDIA_ATTACHMENT_EXTENSIONS.includes(ext)) {
1253
+ return false;
1254
+ }
1255
+ const ct = attachment.contentType;
1256
+ if (!ct) {
1257
+ return false;
1258
+ }
1259
+ return ct.startsWith("image/") || OFFICE_MIME_TYPES.has(ct);
1260
+ }
1261
+ async function readMediaAttachments(message) {
1217
1262
  const attachments = message.attachments;
1218
1263
  if (attachments.size === 0) {
1219
1264
  return [];
1220
1265
  }
1221
1266
  const results = [];
1222
1267
  for (const [, attachment] of attachments) {
1223
- const ext = attachment.name?.slice(attachment.name.lastIndexOf(".")).toLowerCase();
1224
- if (!ext || !IMAGE_ATTACHMENT_EXTENSIONS.includes(ext)) {
1268
+ if (!isMediaAttachment(attachment)) {
1225
1269
  continue;
1226
1270
  }
1227
- if (!attachment.contentType?.startsWith("image/")) {
1228
- continue;
1229
- }
1230
- if (attachment.size > MAX_IMAGE_ATTACHMENT_SIZE) {
1271
+ if (attachment.size > MAX_MEDIA_ATTACHMENT_SIZE) {
1231
1272
  logger6.warn({
1232
1273
  messageId: message.id,
1233
1274
  filename: attachment.name,
1234
1275
  size: attachment.size
1235
- }, "image attachment too large, skipping");
1276
+ }, "media attachment too large, skipping");
1236
1277
  continue;
1237
1278
  }
1238
1279
  try {
@@ -1240,14 +1281,14 @@ async function readImageAttachments(message) {
1240
1281
  messageId: message.id,
1241
1282
  filename: attachment.name,
1242
1283
  size: attachment.size
1243
- }, "fetching image attachment");
1284
+ }, "fetching media attachment");
1244
1285
  const response = await fetch(attachment.url);
1245
1286
  if (!response.ok) {
1246
1287
  logger6.warn({
1247
1288
  messageId: message.id,
1248
1289
  filename: attachment.name,
1249
1290
  status: response.status
1250
- }, "failed to fetch image attachment");
1291
+ }, "failed to fetch media attachment");
1251
1292
  continue;
1252
1293
  }
1253
1294
  const buffer = await response.arrayBuffer();
@@ -1255,10 +1296,10 @@ async function readImageAttachments(message) {
1255
1296
  results.push({
1256
1297
  filename: attachment.name,
1257
1298
  data: base64,
1258
- mimeType: attachment.contentType ?? "image/png"
1299
+ mimeType: attachment.contentType ?? "application/octet-stream"
1259
1300
  });
1260
1301
  } catch (error) {
1261
- logger6.error({ messageId: message.id, filename: attachment.name, error }, "error fetching image attachment");
1302
+ logger6.error({ messageId: message.id, filename: attachment.name, error }, "error fetching media attachment");
1262
1303
  }
1263
1304
  }
1264
1305
  return results;
@@ -1277,27 +1318,44 @@ function parseVisionModelId(visionModelId) {
1277
1318
  modelId: trimmed.substring(slashIndex + 1)
1278
1319
  };
1279
1320
  }
1280
- async function resolveImageAttachments(imageAttachments, content, currentModel, config, agentService) {
1321
+ function getMediaLabel(filename, mimeType) {
1322
+ if (mimeType === "application/pdf") {
1323
+ return `[PDF: ${filename}]`;
1324
+ }
1325
+ if (mimeType === "application/vnd.openxmlformats-officedocument.wordprocessingml.document" || mimeType === "application/msword") {
1326
+ return `[Word: ${filename}]`;
1327
+ }
1328
+ if (mimeType === "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" || mimeType === "application/vnd.ms-excel") {
1329
+ return `[Excel: ${filename}]`;
1330
+ }
1331
+ if (mimeType === "application/vnd.openxmlformats-officedocument.presentationml.presentation" || mimeType === "application/vnd.ms-powerpoint") {
1332
+ return `[PowerPoint: ${filename}]`;
1333
+ }
1334
+ return `[Image: ${filename}]`;
1335
+ }
1336
+ async function resolveMediaAttachments(media, content, currentModel, config, agentService) {
1281
1337
  const modelSupportsVision = currentModel?.input.includes("image") ?? false;
1282
1338
  if (modelSupportsVision) {
1339
+ const names = media.map((m) => m.filename).join(", ");
1283
1340
  logger6.info({
1284
- imageCount: imageAttachments.length,
1341
+ count: media.length,
1342
+ filenames: names,
1285
1343
  model: currentModel ? `${currentModel.provider}/${currentModel.id}` : "none"
1286
- }, "passing images natively to vision-capable model");
1287
- const images = imageAttachments.map((img) => ({
1344
+ }, "passing media natively to vision-capable model");
1345
+ const images = media.map((m) => ({
1288
1346
  type: "image",
1289
- data: img.data,
1290
- mimeType: img.mimeType
1347
+ data: m.data,
1348
+ mimeType: m.mimeType
1291
1349
  }));
1292
1350
  return { content, images };
1293
1351
  }
1294
1352
  if (!config.visionModelId) {
1295
- const imageNames = imageAttachments.map((i) => i.filename).join(", ");
1296
- logger6.info({ imageNames }, "image attachments received but vision model not configured");
1353
+ const names = media.map((m) => m.filename).join(", ");
1354
+ logger6.info({ filenames: names }, "media attachments received but vision model not configured");
1297
1355
  const note = `
1298
1356
 
1299
- [User sent image attachment(s): ${imageNames}]
1300
- ` + "(Image vision not configured. Set visionModelId to enable image understanding.)";
1357
+ [User sent media attachment(s): ${names}]
1358
+ ` + "(Media vision not configured. Set visionModelId to enable image/PDF/document understanding.)";
1301
1359
  return { content: content ? content + note : note, images: [] };
1302
1360
  }
1303
1361
  const parsed = parseVisionModelId(config.visionModelId);
@@ -1307,21 +1365,22 @@ async function resolveImageAttachments(imageAttachments, content, currentModel,
1307
1365
  const visionModel = agentService.findModel(parsed.provider, parsed.modelId);
1308
1366
  if (!visionModel) {
1309
1367
  logger6.warn({ visionModelId: config.visionModelId }, "vision model not found in registry");
1310
- const imageNames = imageAttachments.map((i) => i.filename).join(", ");
1368
+ const names = media.map((m) => m.filename).join(", ");
1311
1369
  const note = `
1312
1370
 
1313
- [User sent image attachment(s): ${imageNames}]
1371
+ [User sent media attachment(s): ${names}]
1314
1372
  (Vision model not found: ${config.visionModelId})`;
1315
1373
  return { content: content ? content + note : note, images: [] };
1316
1374
  }
1317
1375
  logger6.info({
1318
- imageCount: imageAttachments.length,
1376
+ count: media.length,
1319
1377
  visionModel: `${visionModel.provider}/${visionModel.id}`
1320
- }, "describing images with vision model");
1378
+ }, "describing media with vision model");
1321
1379
  const descriptions = [];
1322
- for (const img of imageAttachments) {
1323
- const description = await describeImage(agentService, img.data, img.mimeType, content, visionModel);
1324
- descriptions.push(`[Image: ${img.filename}]
1380
+ for (const m of media) {
1381
+ const description = await describeImage(agentService, m.data, m.mimeType, content, visionModel);
1382
+ const label = getMediaLabel(m.filename, m.mimeType);
1383
+ descriptions.push(`${label}
1325
1384
  ${description}`);
1326
1385
  }
1327
1386
  if (descriptions.length > 0) {
@@ -1414,8 +1473,8 @@ async function onMessage(message, config, agentService, sessionRegistry, authCon
1414
1473
  ${a.content}`).join("");
1415
1474
  content = content ? content + suffix : attachmentContents[0].content;
1416
1475
  }
1417
- const imageAttachments = await readImageAttachments(message);
1418
- if (!content && imageAttachments.length === 0) {
1476
+ const mediaAttachments = await readMediaAttachments(message);
1477
+ if (!content && mediaAttachments.length === 0) {
1419
1478
  logger6.debug({ messageId: message.id }, "ignored empty message (no text or images)");
1420
1479
  return;
1421
1480
  }
@@ -1488,8 +1547,8 @@ ${a.content}`).join("");
1488
1547
  response = await promptQueue.enqueue(async () => {
1489
1548
  let promptContent = content;
1490
1549
  let promptImages;
1491
- if (imageAttachments.length > 0) {
1492
- const resolved = await resolveImageAttachments(imageAttachments, promptContent, session.model, config, agentService);
1550
+ if (mediaAttachments.length > 0) {
1551
+ const resolved = await resolveMediaAttachments(mediaAttachments, promptContent, session.model, config, agentService);
1493
1552
  promptContent = resolved.content;
1494
1553
  if (resolved.images.length > 0) {
1495
1554
  promptImages = resolved.images;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@friendlyrobot/discord-pi-agent",
3
- "version": "0.12.0",
3
+ "version": "0.14.0",
4
4
  "description": "Reusable Discord gateway bridge for persistent pi agent sessions",
5
5
  "license": "MIT",
6
6
  "type": "module",