@friendlyrobot/discord-pi-agent 0.12.0 → 0.13.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,10 +1,11 @@
1
1
  import type { Model } from "@earendil-works/pi-ai";
2
2
  import type { AgentService } from "./agent-service";
3
3
  /**
4
- * Use a vision-capable model to describe an image, returning a text
5
- * description that can be inlined into a prompt for a non-vision model.
4
+ * Use a vision-capable model to describe a media attachment (image or PDF),
5
+ * returning a text description that can be inlined into a prompt for a
6
+ * non-vision model.
6
7
  *
7
- * Creates a temporary in-memory session, sends the image, extracts the
8
+ * Creates a temporary in-memory session, sends the media, extracts the
8
9
  * assistant's text reply, then disposes the session.
9
10
  */
10
11
  export declare function describeImage(agentService: AgentService, imageData: string, mimeType: string, userText: string, visionModel: Model<any>): Promise<string>;
package/dist/index.js CHANGED
@@ -870,26 +870,32 @@ var logger5 = createModuleLogger("image-description");
870
870
  async function describeImage(agentService, imageData, mimeType, userText, visionModel) {
871
871
  const session = await agentService.createTemporarySession();
872
872
  await session.setModel(visionModel);
873
+ const isPdf = mimeType === "application/pdf";
873
874
  const imageContent = {
874
875
  type: "image",
875
876
  data: imageData,
876
877
  mimeType
877
878
  };
878
- const promptText = userText.trim().length > 0 ? `The user sent this image with the following message: "${userText}". Please describe the image in detail and address any questions from the user's message.` : "Please describe this image in detail. What do you see?";
879
+ let promptText;
880
+ if (isPdf) {
881
+ promptText = userText.trim().length > 0 ? `The user sent a PDF document with the following message: "${userText}". Please extract and summarize the text content of this PDF. Be thorough — include all important details, sections, and data from the document.` : "Please extract and summarize the text content of this PDF document. Be thorough — include all important details, sections, data, and key points.";
882
+ } else {
883
+ promptText = userText.trim().length > 0 ? `The user sent this image with the following message: "${userText}". Please describe the image in detail and address any questions from the user's message.` : "Please describe this image in detail. What do you see?";
884
+ }
879
885
  let text = "";
880
886
  try {
881
887
  await session.prompt(promptText, { images: [imageContent] });
882
888
  text = extractLastAssistantText(session);
883
889
  } catch (error) {
884
- logger5.error({ error }, "vision model prompt failed");
885
- text = "(Vision model failed to process the image.)";
890
+ logger5.error({ error, mimeType }, "vision model prompt failed");
891
+ text = "(Vision model failed to process the file.)";
886
892
  } finally {
887
893
  session.dispose();
888
894
  }
889
895
  if (!text) {
890
896
  return "(Vision model returned no description.)";
891
897
  }
892
- logger5.debug({ textLength: text.length }, "image described");
898
+ logger5.debug({ textLength: text.length, mimeType }, "media described");
893
899
  return text;
894
900
  }
895
901
  function extractLastAssistantText(session) {
@@ -1211,28 +1217,42 @@ async function readTextAttachments(message) {
1211
1217
  }
1212
1218
  return results;
1213
1219
  }
1214
- var IMAGE_ATTACHMENT_EXTENSIONS = [".png", ".jpg", ".jpeg", ".gif", ".webp"];
1215
- var MAX_IMAGE_ATTACHMENT_SIZE = 10 * 1024 * 1024;
1216
- async function readImageAttachments(message) {
1220
+ var MEDIA_ATTACHMENT_EXTENSIONS = [
1221
+ ".png",
1222
+ ".jpg",
1223
+ ".jpeg",
1224
+ ".gif",
1225
+ ".webp",
1226
+ ".pdf"
1227
+ ];
1228
+ var MAX_MEDIA_ATTACHMENT_SIZE = 25 * 1024 * 1024;
1229
+ function isMediaAttachment(attachment) {
1230
+ const ext = attachment.name?.slice(attachment.name.lastIndexOf(".")).toLowerCase();
1231
+ if (!ext || !MEDIA_ATTACHMENT_EXTENSIONS.includes(ext)) {
1232
+ return false;
1233
+ }
1234
+ const ct = attachment.contentType;
1235
+ if (!ct) {
1236
+ return false;
1237
+ }
1238
+ return ct.startsWith("image/") || ct === "application/pdf";
1239
+ }
1240
+ async function readMediaAttachments(message) {
1217
1241
  const attachments = message.attachments;
1218
1242
  if (attachments.size === 0) {
1219
1243
  return [];
1220
1244
  }
1221
1245
  const results = [];
1222
1246
  for (const [, attachment] of attachments) {
1223
- const ext = attachment.name?.slice(attachment.name.lastIndexOf(".")).toLowerCase();
1224
- if (!ext || !IMAGE_ATTACHMENT_EXTENSIONS.includes(ext)) {
1225
- continue;
1226
- }
1227
- if (!attachment.contentType?.startsWith("image/")) {
1247
+ if (!isMediaAttachment(attachment)) {
1228
1248
  continue;
1229
1249
  }
1230
- if (attachment.size > MAX_IMAGE_ATTACHMENT_SIZE) {
1250
+ if (attachment.size > MAX_MEDIA_ATTACHMENT_SIZE) {
1231
1251
  logger6.warn({
1232
1252
  messageId: message.id,
1233
1253
  filename: attachment.name,
1234
1254
  size: attachment.size
1235
- }, "image attachment too large, skipping");
1255
+ }, "media attachment too large, skipping");
1236
1256
  continue;
1237
1257
  }
1238
1258
  try {
@@ -1240,14 +1260,14 @@ async function readImageAttachments(message) {
1240
1260
  messageId: message.id,
1241
1261
  filename: attachment.name,
1242
1262
  size: attachment.size
1243
- }, "fetching image attachment");
1263
+ }, "fetching media attachment");
1244
1264
  const response = await fetch(attachment.url);
1245
1265
  if (!response.ok) {
1246
1266
  logger6.warn({
1247
1267
  messageId: message.id,
1248
1268
  filename: attachment.name,
1249
1269
  status: response.status
1250
- }, "failed to fetch image attachment");
1270
+ }, "failed to fetch media attachment");
1251
1271
  continue;
1252
1272
  }
1253
1273
  const buffer = await response.arrayBuffer();
@@ -1255,10 +1275,10 @@ async function readImageAttachments(message) {
1255
1275
  results.push({
1256
1276
  filename: attachment.name,
1257
1277
  data: base64,
1258
- mimeType: attachment.contentType ?? "image/png"
1278
+ mimeType: attachment.contentType ?? "application/octet-stream"
1259
1279
  });
1260
1280
  } catch (error) {
1261
- logger6.error({ messageId: message.id, filename: attachment.name, error }, "error fetching image attachment");
1281
+ logger6.error({ messageId: message.id, filename: attachment.name, error }, "error fetching media attachment");
1262
1282
  }
1263
1283
  }
1264
1284
  return results;
@@ -1277,27 +1297,29 @@ function parseVisionModelId(visionModelId) {
1277
1297
  modelId: trimmed.substring(slashIndex + 1)
1278
1298
  };
1279
1299
  }
1280
- async function resolveImageAttachments(imageAttachments, content, currentModel, config, agentService) {
1300
+ async function resolveMediaAttachments(media, content, currentModel, config, agentService) {
1281
1301
  const modelSupportsVision = currentModel?.input.includes("image") ?? false;
1282
1302
  if (modelSupportsVision) {
1303
+ const names = media.map((m) => m.filename).join(", ");
1283
1304
  logger6.info({
1284
- imageCount: imageAttachments.length,
1305
+ count: media.length,
1306
+ filenames: names,
1285
1307
  model: currentModel ? `${currentModel.provider}/${currentModel.id}` : "none"
1286
- }, "passing images natively to vision-capable model");
1287
- const images = imageAttachments.map((img) => ({
1308
+ }, "passing media natively to vision-capable model");
1309
+ const images = media.map((m) => ({
1288
1310
  type: "image",
1289
- data: img.data,
1290
- mimeType: img.mimeType
1311
+ data: m.data,
1312
+ mimeType: m.mimeType
1291
1313
  }));
1292
1314
  return { content, images };
1293
1315
  }
1294
1316
  if (!config.visionModelId) {
1295
- const imageNames = imageAttachments.map((i) => i.filename).join(", ");
1296
- logger6.info({ imageNames }, "image attachments received but vision model not configured");
1317
+ const names = media.map((m) => m.filename).join(", ");
1318
+ logger6.info({ filenames: names }, "media attachments received but vision model not configured");
1297
1319
  const note = `
1298
1320
 
1299
- [User sent image attachment(s): ${imageNames}]
1300
- ` + "(Image vision not configured. Set visionModelId to enable image understanding.)";
1321
+ [User sent media attachment(s): ${names}]
1322
+ ` + "(Media vision not configured. Set visionModelId to enable image/PDF understanding.)";
1301
1323
  return { content: content ? content + note : note, images: [] };
1302
1324
  }
1303
1325
  const parsed = parseVisionModelId(config.visionModelId);
@@ -1307,21 +1329,23 @@ async function resolveImageAttachments(imageAttachments, content, currentModel,
1307
1329
  const visionModel = agentService.findModel(parsed.provider, parsed.modelId);
1308
1330
  if (!visionModel) {
1309
1331
  logger6.warn({ visionModelId: config.visionModelId }, "vision model not found in registry");
1310
- const imageNames = imageAttachments.map((i) => i.filename).join(", ");
1332
+ const names = media.map((m) => m.filename).join(", ");
1311
1333
  const note = `
1312
1334
 
1313
- [User sent image attachment(s): ${imageNames}]
1335
+ [User sent media attachment(s): ${names}]
1314
1336
  (Vision model not found: ${config.visionModelId})`;
1315
1337
  return { content: content ? content + note : note, images: [] };
1316
1338
  }
1317
1339
  logger6.info({
1318
- imageCount: imageAttachments.length,
1340
+ count: media.length,
1319
1341
  visionModel: `${visionModel.provider}/${visionModel.id}`
1320
- }, "describing images with vision model");
1342
+ }, "describing media with vision model");
1321
1343
  const descriptions = [];
1322
- for (const img of imageAttachments) {
1323
- const description = await describeImage(agentService, img.data, img.mimeType, content, visionModel);
1324
- descriptions.push(`[Image: ${img.filename}]
1344
+ for (const m of media) {
1345
+ const isPdf = m.mimeType === "application/pdf";
1346
+ const description = await describeImage(agentService, m.data, m.mimeType, content, visionModel);
1347
+ const label = isPdf ? `[PDF: ${m.filename}]` : `[Image: ${m.filename}]`;
1348
+ descriptions.push(`${label}
1325
1349
  ${description}`);
1326
1350
  }
1327
1351
  if (descriptions.length > 0) {
@@ -1414,8 +1438,8 @@ async function onMessage(message, config, agentService, sessionRegistry, authCon
1414
1438
  ${a.content}`).join("");
1415
1439
  content = content ? content + suffix : attachmentContents[0].content;
1416
1440
  }
1417
- const imageAttachments = await readImageAttachments(message);
1418
- if (!content && imageAttachments.length === 0) {
1441
+ const mediaAttachments = await readMediaAttachments(message);
1442
+ if (!content && mediaAttachments.length === 0) {
1419
1443
  logger6.debug({ messageId: message.id }, "ignored empty message (no text or images)");
1420
1444
  return;
1421
1445
  }
@@ -1488,8 +1512,8 @@ ${a.content}`).join("");
1488
1512
  response = await promptQueue.enqueue(async () => {
1489
1513
  let promptContent = content;
1490
1514
  let promptImages;
1491
- if (imageAttachments.length > 0) {
1492
- const resolved = await resolveImageAttachments(imageAttachments, promptContent, session.model, config, agentService);
1515
+ if (mediaAttachments.length > 0) {
1516
+ const resolved = await resolveMediaAttachments(mediaAttachments, promptContent, session.model, config, agentService);
1493
1517
  promptContent = resolved.content;
1494
1518
  if (resolved.images.length > 0) {
1495
1519
  promptImages = resolved.images;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@friendlyrobot/discord-pi-agent",
3
- "version": "0.12.0",
3
+ "version": "0.13.0",
4
4
  "description": "Reusable Discord gateway bridge for persistent pi agent sessions",
5
5
  "license": "MIT",
6
6
  "type": "module",