@friendlyrobot/discord-pi-agent 0.12.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/image-description.d.ts +4 -3
- package/dist/index.js +98 -39
- package/package.json +1 -1
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
import type { Model } from "@earendil-works/pi-ai";
|
|
2
2
|
import type { AgentService } from "./agent-service";
|
|
3
3
|
/**
|
|
4
|
-
* Use a vision-capable model to describe
|
|
5
|
-
* description that can be inlined into a prompt for a
|
|
4
|
+
* Use a vision-capable model to describe a media attachment (image or PDF),
|
|
5
|
+
* returning a text description that can be inlined into a prompt for a
|
|
6
|
+
* non-vision model.
|
|
6
7
|
*
|
|
7
|
-
* Creates a temporary in-memory session, sends the
|
|
8
|
+
* Creates a temporary in-memory session, sends the media, extracts the
|
|
8
9
|
* assistant's text reply, then disposes the session.
|
|
9
10
|
*/
|
|
10
11
|
export declare function describeImage(agentService: AgentService, imageData: string, mimeType: string, userText: string, visionModel: Model<any>): Promise<string>;
|
package/dist/index.js
CHANGED
|
@@ -870,26 +870,32 @@ var logger5 = createModuleLogger("image-description");
|
|
|
870
870
|
async function describeImage(agentService, imageData, mimeType, userText, visionModel) {
|
|
871
871
|
const session = await agentService.createTemporarySession();
|
|
872
872
|
await session.setModel(visionModel);
|
|
873
|
+
const mediaType = getMediaType(mimeType);
|
|
873
874
|
const imageContent = {
|
|
874
875
|
type: "image",
|
|
875
876
|
data: imageData,
|
|
876
877
|
mimeType
|
|
877
878
|
};
|
|
878
|
-
|
|
879
|
+
let promptText;
|
|
880
|
+
if (mediaType === "document") {
|
|
881
|
+
promptText = userText.trim().length > 0 ? `The user sent a document with the following message: "${userText}". Please extract and summarize the text content of this document. Be thorough — include all important details, sections, and data from the document.` : "Please extract and summarize the text content of this document. Be thorough — include all important details, sections, data, and key points.";
|
|
882
|
+
} else {
|
|
883
|
+
promptText = userText.trim().length > 0 ? `The user sent this image with the following message: "${userText}". Please describe the image in detail and address any questions from the user's message.` : "Please describe this image in detail. What do you see?";
|
|
884
|
+
}
|
|
879
885
|
let text = "";
|
|
880
886
|
try {
|
|
881
887
|
await session.prompt(promptText, { images: [imageContent] });
|
|
882
888
|
text = extractLastAssistantText(session);
|
|
883
889
|
} catch (error) {
|
|
884
|
-
logger5.error({ error }, "vision model prompt failed");
|
|
885
|
-
text = "(Vision model failed to process the
|
|
890
|
+
logger5.error({ error, mimeType }, "vision model prompt failed");
|
|
891
|
+
text = "(Vision model failed to process the file.)";
|
|
886
892
|
} finally {
|
|
887
893
|
session.dispose();
|
|
888
894
|
}
|
|
889
895
|
if (!text) {
|
|
890
896
|
return "(Vision model returned no description.)";
|
|
891
897
|
}
|
|
892
|
-
logger5.debug({ textLength: text.length }, "
|
|
898
|
+
logger5.debug({ textLength: text.length, mimeType }, "media described");
|
|
893
899
|
return text;
|
|
894
900
|
}
|
|
895
901
|
function extractLastAssistantText(session) {
|
|
@@ -914,6 +920,12 @@ function extractLastAssistantText(session) {
|
|
|
914
920
|
}
|
|
915
921
|
return "";
|
|
916
922
|
}
|
|
923
|
+
function getMediaType(mimeType) {
|
|
924
|
+
if (mimeType.startsWith("image/")) {
|
|
925
|
+
return "image";
|
|
926
|
+
}
|
|
927
|
+
return "document";
|
|
928
|
+
}
|
|
917
929
|
function isAssistantMessage(msg) {
|
|
918
930
|
return typeof msg === "object" && msg !== null && "role" in msg && msg.role === "assistant";
|
|
919
931
|
}
|
|
@@ -1211,28 +1223,57 @@ async function readTextAttachments(message) {
|
|
|
1211
1223
|
}
|
|
1212
1224
|
return results;
|
|
1213
1225
|
}
|
|
1214
|
-
var
|
|
1215
|
-
|
|
1216
|
-
|
|
1226
|
+
var MEDIA_ATTACHMENT_EXTENSIONS = [
|
|
1227
|
+
".png",
|
|
1228
|
+
".jpg",
|
|
1229
|
+
".jpeg",
|
|
1230
|
+
".gif",
|
|
1231
|
+
".webp",
|
|
1232
|
+
".pdf",
|
|
1233
|
+
".docx",
|
|
1234
|
+
".doc",
|
|
1235
|
+
".pptx",
|
|
1236
|
+
".ppt",
|
|
1237
|
+
".xlsx",
|
|
1238
|
+
".xls"
|
|
1239
|
+
];
|
|
1240
|
+
var MAX_MEDIA_ATTACHMENT_SIZE = 25 * 1024 * 1024;
|
|
1241
|
+
var OFFICE_MIME_TYPES = new Set([
|
|
1242
|
+
"application/pdf",
|
|
1243
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
1244
|
+
"application/msword",
|
|
1245
|
+
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
|
1246
|
+
"application/vnd.ms-powerpoint",
|
|
1247
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
1248
|
+
"application/vnd.ms-excel"
|
|
1249
|
+
]);
|
|
1250
|
+
function isMediaAttachment(attachment) {
|
|
1251
|
+
const ext = attachment.name?.slice(attachment.name.lastIndexOf(".")).toLowerCase();
|
|
1252
|
+
if (!ext || !MEDIA_ATTACHMENT_EXTENSIONS.includes(ext)) {
|
|
1253
|
+
return false;
|
|
1254
|
+
}
|
|
1255
|
+
const ct = attachment.contentType;
|
|
1256
|
+
if (!ct) {
|
|
1257
|
+
return false;
|
|
1258
|
+
}
|
|
1259
|
+
return ct.startsWith("image/") || OFFICE_MIME_TYPES.has(ct);
|
|
1260
|
+
}
|
|
1261
|
+
async function readMediaAttachments(message) {
|
|
1217
1262
|
const attachments = message.attachments;
|
|
1218
1263
|
if (attachments.size === 0) {
|
|
1219
1264
|
return [];
|
|
1220
1265
|
}
|
|
1221
1266
|
const results = [];
|
|
1222
1267
|
for (const [, attachment] of attachments) {
|
|
1223
|
-
|
|
1224
|
-
if (!ext || !IMAGE_ATTACHMENT_EXTENSIONS.includes(ext)) {
|
|
1268
|
+
if (!isMediaAttachment(attachment)) {
|
|
1225
1269
|
continue;
|
|
1226
1270
|
}
|
|
1227
|
-
if (
|
|
1228
|
-
continue;
|
|
1229
|
-
}
|
|
1230
|
-
if (attachment.size > MAX_IMAGE_ATTACHMENT_SIZE) {
|
|
1271
|
+
if (attachment.size > MAX_MEDIA_ATTACHMENT_SIZE) {
|
|
1231
1272
|
logger6.warn({
|
|
1232
1273
|
messageId: message.id,
|
|
1233
1274
|
filename: attachment.name,
|
|
1234
1275
|
size: attachment.size
|
|
1235
|
-
}, "
|
|
1276
|
+
}, "media attachment too large, skipping");
|
|
1236
1277
|
continue;
|
|
1237
1278
|
}
|
|
1238
1279
|
try {
|
|
@@ -1240,14 +1281,14 @@ async function readImageAttachments(message) {
|
|
|
1240
1281
|
messageId: message.id,
|
|
1241
1282
|
filename: attachment.name,
|
|
1242
1283
|
size: attachment.size
|
|
1243
|
-
}, "fetching
|
|
1284
|
+
}, "fetching media attachment");
|
|
1244
1285
|
const response = await fetch(attachment.url);
|
|
1245
1286
|
if (!response.ok) {
|
|
1246
1287
|
logger6.warn({
|
|
1247
1288
|
messageId: message.id,
|
|
1248
1289
|
filename: attachment.name,
|
|
1249
1290
|
status: response.status
|
|
1250
|
-
}, "failed to fetch
|
|
1291
|
+
}, "failed to fetch media attachment");
|
|
1251
1292
|
continue;
|
|
1252
1293
|
}
|
|
1253
1294
|
const buffer = await response.arrayBuffer();
|
|
@@ -1255,10 +1296,10 @@ async function readImageAttachments(message) {
|
|
|
1255
1296
|
results.push({
|
|
1256
1297
|
filename: attachment.name,
|
|
1257
1298
|
data: base64,
|
|
1258
|
-
mimeType: attachment.contentType ?? "
|
|
1299
|
+
mimeType: attachment.contentType ?? "application/octet-stream"
|
|
1259
1300
|
});
|
|
1260
1301
|
} catch (error) {
|
|
1261
|
-
logger6.error({ messageId: message.id, filename: attachment.name, error }, "error fetching
|
|
1302
|
+
logger6.error({ messageId: message.id, filename: attachment.name, error }, "error fetching media attachment");
|
|
1262
1303
|
}
|
|
1263
1304
|
}
|
|
1264
1305
|
return results;
|
|
@@ -1277,27 +1318,44 @@ function parseVisionModelId(visionModelId) {
|
|
|
1277
1318
|
modelId: trimmed.substring(slashIndex + 1)
|
|
1278
1319
|
};
|
|
1279
1320
|
}
|
|
1280
|
-
|
|
1321
|
+
function getMediaLabel(filename, mimeType) {
|
|
1322
|
+
if (mimeType === "application/pdf") {
|
|
1323
|
+
return `[PDF: ${filename}]`;
|
|
1324
|
+
}
|
|
1325
|
+
if (mimeType === "application/vnd.openxmlformats-officedocument.wordprocessingml.document" || mimeType === "application/msword") {
|
|
1326
|
+
return `[Word: ${filename}]`;
|
|
1327
|
+
}
|
|
1328
|
+
if (mimeType === "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" || mimeType === "application/vnd.ms-excel") {
|
|
1329
|
+
return `[Excel: ${filename}]`;
|
|
1330
|
+
}
|
|
1331
|
+
if (mimeType === "application/vnd.openxmlformats-officedocument.presentationml.presentation" || mimeType === "application/vnd.ms-powerpoint") {
|
|
1332
|
+
return `[PowerPoint: ${filename}]`;
|
|
1333
|
+
}
|
|
1334
|
+
return `[Image: ${filename}]`;
|
|
1335
|
+
}
|
|
1336
|
+
async function resolveMediaAttachments(media, content, currentModel, config, agentService) {
|
|
1281
1337
|
const modelSupportsVision = currentModel?.input.includes("image") ?? false;
|
|
1282
1338
|
if (modelSupportsVision) {
|
|
1339
|
+
const names = media.map((m) => m.filename).join(", ");
|
|
1283
1340
|
logger6.info({
|
|
1284
|
-
|
|
1341
|
+
count: media.length,
|
|
1342
|
+
filenames: names,
|
|
1285
1343
|
model: currentModel ? `${currentModel.provider}/${currentModel.id}` : "none"
|
|
1286
|
-
}, "passing
|
|
1287
|
-
const images =
|
|
1344
|
+
}, "passing media natively to vision-capable model");
|
|
1345
|
+
const images = media.map((m) => ({
|
|
1288
1346
|
type: "image",
|
|
1289
|
-
data:
|
|
1290
|
-
mimeType:
|
|
1347
|
+
data: m.data,
|
|
1348
|
+
mimeType: m.mimeType
|
|
1291
1349
|
}));
|
|
1292
1350
|
return { content, images };
|
|
1293
1351
|
}
|
|
1294
1352
|
if (!config.visionModelId) {
|
|
1295
|
-
const
|
|
1296
|
-
logger6.info({
|
|
1353
|
+
const names = media.map((m) => m.filename).join(", ");
|
|
1354
|
+
logger6.info({ filenames: names }, "media attachments received but vision model not configured");
|
|
1297
1355
|
const note = `
|
|
1298
1356
|
|
|
1299
|
-
[User sent
|
|
1300
|
-
` + "(
|
|
1357
|
+
[User sent media attachment(s): ${names}]
|
|
1358
|
+
` + "(Media vision not configured. Set visionModelId to enable image/PDF/document understanding.)";
|
|
1301
1359
|
return { content: content ? content + note : note, images: [] };
|
|
1302
1360
|
}
|
|
1303
1361
|
const parsed = parseVisionModelId(config.visionModelId);
|
|
@@ -1307,21 +1365,22 @@ async function resolveImageAttachments(imageAttachments, content, currentModel,
|
|
|
1307
1365
|
const visionModel = agentService.findModel(parsed.provider, parsed.modelId);
|
|
1308
1366
|
if (!visionModel) {
|
|
1309
1367
|
logger6.warn({ visionModelId: config.visionModelId }, "vision model not found in registry");
|
|
1310
|
-
const
|
|
1368
|
+
const names = media.map((m) => m.filename).join(", ");
|
|
1311
1369
|
const note = `
|
|
1312
1370
|
|
|
1313
|
-
[User sent
|
|
1371
|
+
[User sent media attachment(s): ${names}]
|
|
1314
1372
|
(Vision model not found: ${config.visionModelId})`;
|
|
1315
1373
|
return { content: content ? content + note : note, images: [] };
|
|
1316
1374
|
}
|
|
1317
1375
|
logger6.info({
|
|
1318
|
-
|
|
1376
|
+
count: media.length,
|
|
1319
1377
|
visionModel: `${visionModel.provider}/${visionModel.id}`
|
|
1320
|
-
}, "describing
|
|
1378
|
+
}, "describing media with vision model");
|
|
1321
1379
|
const descriptions = [];
|
|
1322
|
-
for (const
|
|
1323
|
-
const description = await describeImage(agentService,
|
|
1324
|
-
|
|
1380
|
+
for (const m of media) {
|
|
1381
|
+
const description = await describeImage(agentService, m.data, m.mimeType, content, visionModel);
|
|
1382
|
+
const label = getMediaLabel(m.filename, m.mimeType);
|
|
1383
|
+
descriptions.push(`${label}
|
|
1325
1384
|
${description}`);
|
|
1326
1385
|
}
|
|
1327
1386
|
if (descriptions.length > 0) {
|
|
@@ -1414,8 +1473,8 @@ async function onMessage(message, config, agentService, sessionRegistry, authCon
|
|
|
1414
1473
|
${a.content}`).join("");
|
|
1415
1474
|
content = content ? content + suffix : attachmentContents[0].content;
|
|
1416
1475
|
}
|
|
1417
|
-
const
|
|
1418
|
-
if (!content &&
|
|
1476
|
+
const mediaAttachments = await readMediaAttachments(message);
|
|
1477
|
+
if (!content && mediaAttachments.length === 0) {
|
|
1419
1478
|
logger6.debug({ messageId: message.id }, "ignored empty message (no text or images)");
|
|
1420
1479
|
return;
|
|
1421
1480
|
}
|
|
@@ -1488,8 +1547,8 @@ ${a.content}`).join("");
|
|
|
1488
1547
|
response = await promptQueue.enqueue(async () => {
|
|
1489
1548
|
let promptContent = content;
|
|
1490
1549
|
let promptImages;
|
|
1491
|
-
if (
|
|
1492
|
-
const resolved = await
|
|
1550
|
+
if (mediaAttachments.length > 0) {
|
|
1551
|
+
const resolved = await resolveMediaAttachments(mediaAttachments, promptContent, session.model, config, agentService);
|
|
1493
1552
|
promptContent = resolved.content;
|
|
1494
1553
|
if (resolved.images.length > 0) {
|
|
1495
1554
|
promptImages = resolved.images;
|