@friendlyrobot/discord-pi-agent 0.12.0 → 0.13.0
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/dist/image-description.d.ts +4 -3
- package/dist/index.js +63 -39
- package/package.json +1 -1
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
import type { Model } from "@earendil-works/pi-ai";
|
|
2
2
|
import type { AgentService } from "./agent-service";
|
|
3
3
|
/**
|
|
4
|
-
* Use a vision-capable model to describe
|
|
5
|
-
* description that can be inlined into a prompt for a
|
|
4
|
+
* Use a vision-capable model to describe a media attachment (image or PDF),
|
|
5
|
+
* returning a text description that can be inlined into a prompt for a
|
|
6
|
+
* non-vision model.
|
|
6
7
|
*
|
|
7
|
-
* Creates a temporary in-memory session, sends the
|
|
8
|
+
* Creates a temporary in-memory session, sends the media, extracts the
|
|
8
9
|
* assistant's text reply, then disposes the session.
|
|
9
10
|
*/
|
|
10
11
|
export declare function describeImage(agentService: AgentService, imageData: string, mimeType: string, userText: string, visionModel: Model<any>): Promise<string>;
|
package/dist/index.js
CHANGED
|
@@ -870,26 +870,32 @@ var logger5 = createModuleLogger("image-description");
|
|
|
870
870
|
async function describeImage(agentService, imageData, mimeType, userText, visionModel) {
|
|
871
871
|
const session = await agentService.createTemporarySession();
|
|
872
872
|
await session.setModel(visionModel);
|
|
873
|
+
const isPdf = mimeType === "application/pdf";
|
|
873
874
|
const imageContent = {
|
|
874
875
|
type: "image",
|
|
875
876
|
data: imageData,
|
|
876
877
|
mimeType
|
|
877
878
|
};
|
|
878
|
-
|
|
879
|
+
let promptText;
|
|
880
|
+
if (isPdf) {
|
|
881
|
+
promptText = userText.trim().length > 0 ? `The user sent a PDF document with the following message: "${userText}". Please extract and summarize the text content of this PDF. Be thorough — include all important details, sections, and data from the document.` : "Please extract and summarize the text content of this PDF document. Be thorough — include all important details, sections, data, and key points.";
|
|
882
|
+
} else {
|
|
883
|
+
promptText = userText.trim().length > 0 ? `The user sent this image with the following message: "${userText}". Please describe the image in detail and address any questions from the user's message.` : "Please describe this image in detail. What do you see?";
|
|
884
|
+
}
|
|
879
885
|
let text = "";
|
|
880
886
|
try {
|
|
881
887
|
await session.prompt(promptText, { images: [imageContent] });
|
|
882
888
|
text = extractLastAssistantText(session);
|
|
883
889
|
} catch (error) {
|
|
884
|
-
logger5.error({ error }, "vision model prompt failed");
|
|
885
|
-
text = "(Vision model failed to process the
|
|
890
|
+
logger5.error({ error, mimeType }, "vision model prompt failed");
|
|
891
|
+
text = "(Vision model failed to process the file.)";
|
|
886
892
|
} finally {
|
|
887
893
|
session.dispose();
|
|
888
894
|
}
|
|
889
895
|
if (!text) {
|
|
890
896
|
return "(Vision model returned no description.)";
|
|
891
897
|
}
|
|
892
|
-
logger5.debug({ textLength: text.length }, "
|
|
898
|
+
logger5.debug({ textLength: text.length, mimeType }, "media described");
|
|
893
899
|
return text;
|
|
894
900
|
}
|
|
895
901
|
function extractLastAssistantText(session) {
|
|
@@ -1211,28 +1217,42 @@ async function readTextAttachments(message) {
|
|
|
1211
1217
|
}
|
|
1212
1218
|
return results;
|
|
1213
1219
|
}
|
|
1214
|
-
var
|
|
1215
|
-
|
|
1216
|
-
|
|
1220
|
+
var MEDIA_ATTACHMENT_EXTENSIONS = [
|
|
1221
|
+
".png",
|
|
1222
|
+
".jpg",
|
|
1223
|
+
".jpeg",
|
|
1224
|
+
".gif",
|
|
1225
|
+
".webp",
|
|
1226
|
+
".pdf"
|
|
1227
|
+
];
|
|
1228
|
+
var MAX_MEDIA_ATTACHMENT_SIZE = 25 * 1024 * 1024;
|
|
1229
|
+
function isMediaAttachment(attachment) {
|
|
1230
|
+
const ext = attachment.name?.slice(attachment.name.lastIndexOf(".")).toLowerCase();
|
|
1231
|
+
if (!ext || !MEDIA_ATTACHMENT_EXTENSIONS.includes(ext)) {
|
|
1232
|
+
return false;
|
|
1233
|
+
}
|
|
1234
|
+
const ct = attachment.contentType;
|
|
1235
|
+
if (!ct) {
|
|
1236
|
+
return false;
|
|
1237
|
+
}
|
|
1238
|
+
return ct.startsWith("image/") || ct === "application/pdf";
|
|
1239
|
+
}
|
|
1240
|
+
async function readMediaAttachments(message) {
|
|
1217
1241
|
const attachments = message.attachments;
|
|
1218
1242
|
if (attachments.size === 0) {
|
|
1219
1243
|
return [];
|
|
1220
1244
|
}
|
|
1221
1245
|
const results = [];
|
|
1222
1246
|
for (const [, attachment] of attachments) {
|
|
1223
|
-
|
|
1224
|
-
if (!ext || !IMAGE_ATTACHMENT_EXTENSIONS.includes(ext)) {
|
|
1225
|
-
continue;
|
|
1226
|
-
}
|
|
1227
|
-
if (!attachment.contentType?.startsWith("image/")) {
|
|
1247
|
+
if (!isMediaAttachment(attachment)) {
|
|
1228
1248
|
continue;
|
|
1229
1249
|
}
|
|
1230
|
-
if (attachment.size >
|
|
1250
|
+
if (attachment.size > MAX_MEDIA_ATTACHMENT_SIZE) {
|
|
1231
1251
|
logger6.warn({
|
|
1232
1252
|
messageId: message.id,
|
|
1233
1253
|
filename: attachment.name,
|
|
1234
1254
|
size: attachment.size
|
|
1235
|
-
}, "
|
|
1255
|
+
}, "media attachment too large, skipping");
|
|
1236
1256
|
continue;
|
|
1237
1257
|
}
|
|
1238
1258
|
try {
|
|
@@ -1240,14 +1260,14 @@ async function readImageAttachments(message) {
|
|
|
1240
1260
|
messageId: message.id,
|
|
1241
1261
|
filename: attachment.name,
|
|
1242
1262
|
size: attachment.size
|
|
1243
|
-
}, "fetching
|
|
1263
|
+
}, "fetching media attachment");
|
|
1244
1264
|
const response = await fetch(attachment.url);
|
|
1245
1265
|
if (!response.ok) {
|
|
1246
1266
|
logger6.warn({
|
|
1247
1267
|
messageId: message.id,
|
|
1248
1268
|
filename: attachment.name,
|
|
1249
1269
|
status: response.status
|
|
1250
|
-
}, "failed to fetch
|
|
1270
|
+
}, "failed to fetch media attachment");
|
|
1251
1271
|
continue;
|
|
1252
1272
|
}
|
|
1253
1273
|
const buffer = await response.arrayBuffer();
|
|
@@ -1255,10 +1275,10 @@ async function readImageAttachments(message) {
|
|
|
1255
1275
|
results.push({
|
|
1256
1276
|
filename: attachment.name,
|
|
1257
1277
|
data: base64,
|
|
1258
|
-
mimeType: attachment.contentType ?? "
|
|
1278
|
+
mimeType: attachment.contentType ?? "application/octet-stream"
|
|
1259
1279
|
});
|
|
1260
1280
|
} catch (error) {
|
|
1261
|
-
logger6.error({ messageId: message.id, filename: attachment.name, error }, "error fetching
|
|
1281
|
+
logger6.error({ messageId: message.id, filename: attachment.name, error }, "error fetching media attachment");
|
|
1262
1282
|
}
|
|
1263
1283
|
}
|
|
1264
1284
|
return results;
|
|
@@ -1277,27 +1297,29 @@ function parseVisionModelId(visionModelId) {
|
|
|
1277
1297
|
modelId: trimmed.substring(slashIndex + 1)
|
|
1278
1298
|
};
|
|
1279
1299
|
}
|
|
1280
|
-
async function
|
|
1300
|
+
async function resolveMediaAttachments(media, content, currentModel, config, agentService) {
|
|
1281
1301
|
const modelSupportsVision = currentModel?.input.includes("image") ?? false;
|
|
1282
1302
|
if (modelSupportsVision) {
|
|
1303
|
+
const names = media.map((m) => m.filename).join(", ");
|
|
1283
1304
|
logger6.info({
|
|
1284
|
-
|
|
1305
|
+
count: media.length,
|
|
1306
|
+
filenames: names,
|
|
1285
1307
|
model: currentModel ? `${currentModel.provider}/${currentModel.id}` : "none"
|
|
1286
|
-
}, "passing
|
|
1287
|
-
const images =
|
|
1308
|
+
}, "passing media natively to vision-capable model");
|
|
1309
|
+
const images = media.map((m) => ({
|
|
1288
1310
|
type: "image",
|
|
1289
|
-
data:
|
|
1290
|
-
mimeType:
|
|
1311
|
+
data: m.data,
|
|
1312
|
+
mimeType: m.mimeType
|
|
1291
1313
|
}));
|
|
1292
1314
|
return { content, images };
|
|
1293
1315
|
}
|
|
1294
1316
|
if (!config.visionModelId) {
|
|
1295
|
-
const
|
|
1296
|
-
logger6.info({
|
|
1317
|
+
const names = media.map((m) => m.filename).join(", ");
|
|
1318
|
+
logger6.info({ filenames: names }, "media attachments received but vision model not configured");
|
|
1297
1319
|
const note = `
|
|
1298
1320
|
|
|
1299
|
-
[User sent
|
|
1300
|
-
` + "(
|
|
1321
|
+
[User sent media attachment(s): ${names}]
|
|
1322
|
+
` + "(Media vision not configured. Set visionModelId to enable image/PDF understanding.)";
|
|
1301
1323
|
return { content: content ? content + note : note, images: [] };
|
|
1302
1324
|
}
|
|
1303
1325
|
const parsed = parseVisionModelId(config.visionModelId);
|
|
@@ -1307,21 +1329,23 @@ async function resolveImageAttachments(imageAttachments, content, currentModel,
|
|
|
1307
1329
|
const visionModel = agentService.findModel(parsed.provider, parsed.modelId);
|
|
1308
1330
|
if (!visionModel) {
|
|
1309
1331
|
logger6.warn({ visionModelId: config.visionModelId }, "vision model not found in registry");
|
|
1310
|
-
const
|
|
1332
|
+
const names = media.map((m) => m.filename).join(", ");
|
|
1311
1333
|
const note = `
|
|
1312
1334
|
|
|
1313
|
-
[User sent
|
|
1335
|
+
[User sent media attachment(s): ${names}]
|
|
1314
1336
|
(Vision model not found: ${config.visionModelId})`;
|
|
1315
1337
|
return { content: content ? content + note : note, images: [] };
|
|
1316
1338
|
}
|
|
1317
1339
|
logger6.info({
|
|
1318
|
-
|
|
1340
|
+
count: media.length,
|
|
1319
1341
|
visionModel: `${visionModel.provider}/${visionModel.id}`
|
|
1320
|
-
}, "describing
|
|
1342
|
+
}, "describing media with vision model");
|
|
1321
1343
|
const descriptions = [];
|
|
1322
|
-
for (const
|
|
1323
|
-
const
|
|
1324
|
-
|
|
1344
|
+
for (const m of media) {
|
|
1345
|
+
const isPdf = m.mimeType === "application/pdf";
|
|
1346
|
+
const description = await describeImage(agentService, m.data, m.mimeType, content, visionModel);
|
|
1347
|
+
const label = isPdf ? `[PDF: ${m.filename}]` : `[Image: ${m.filename}]`;
|
|
1348
|
+
descriptions.push(`${label}
|
|
1325
1349
|
${description}`);
|
|
1326
1350
|
}
|
|
1327
1351
|
if (descriptions.length > 0) {
|
|
@@ -1414,8 +1438,8 @@ async function onMessage(message, config, agentService, sessionRegistry, authCon
|
|
|
1414
1438
|
${a.content}`).join("");
|
|
1415
1439
|
content = content ? content + suffix : attachmentContents[0].content;
|
|
1416
1440
|
}
|
|
1417
|
-
const
|
|
1418
|
-
if (!content &&
|
|
1441
|
+
const mediaAttachments = await readMediaAttachments(message);
|
|
1442
|
+
if (!content && mediaAttachments.length === 0) {
|
|
1419
1443
|
logger6.debug({ messageId: message.id }, "ignored empty message (no text or images)");
|
|
1420
1444
|
return;
|
|
1421
1445
|
}
|
|
@@ -1488,8 +1512,8 @@ ${a.content}`).join("");
|
|
|
1488
1512
|
response = await promptQueue.enqueue(async () => {
|
|
1489
1513
|
let promptContent = content;
|
|
1490
1514
|
let promptImages;
|
|
1491
|
-
if (
|
|
1492
|
-
const resolved = await
|
|
1515
|
+
if (mediaAttachments.length > 0) {
|
|
1516
|
+
const resolved = await resolveMediaAttachments(mediaAttachments, promptContent, session.model, config, agentService);
|
|
1493
1517
|
promptContent = resolved.content;
|
|
1494
1518
|
if (resolved.images.length > 0) {
|
|
1495
1519
|
promptImages = resolved.images;
|