@marshulll/openclaw-wecom 0.1.15 → 0.1.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/docs/wecom.config.full.example.json +11 -1
- package/package.json +1 -1
- package/wecom/src/config-schema.ts +20 -0
- package/wecom/src/media-vision.ts +98 -0
- package/wecom/src/types.ts +10 -0
- package/wecom/src/wecom-app.ts +170 -22
- package/wecom/src/wecom-bot.ts +177 -32
|
@@ -8,7 +8,17 @@
|
|
|
8
8
|
"tempDir": "/tmp/openclaw-wecom",
|
|
9
9
|
"retentionHours": 72,
|
|
10
10
|
"cleanupOnStart": true,
|
|
11
|
-
"maxBytes": 10485760
|
|
11
|
+
"maxBytes": 10485760,
|
|
12
|
+
"vision": {
|
|
13
|
+
"enabled": true,
|
|
14
|
+
"baseUrl": "https://newapi.looksunlight.com/v1",
|
|
15
|
+
"apiKey": "YOUR_API_KEY",
|
|
16
|
+
"model": "gpt-4o-mini",
|
|
17
|
+
"prompt": "请描述图片内容并尽量提取可见文字。",
|
|
18
|
+
"maxTokens": 400,
|
|
19
|
+
"timeoutMs": 15000,
|
|
20
|
+
"maxBytes": 5242880
|
|
21
|
+
}
|
|
12
22
|
},
|
|
13
23
|
"botMediaBridge": true,
|
|
14
24
|
|
package/package.json
CHANGED
|
@@ -47,6 +47,16 @@ const accountSchema = z.object({
|
|
|
47
47
|
retentionHours: z.number().optional(),
|
|
48
48
|
cleanupOnStart: z.boolean().optional(),
|
|
49
49
|
maxBytes: z.number().optional(),
|
|
50
|
+
vision: z.object({
|
|
51
|
+
enabled: z.boolean().optional(),
|
|
52
|
+
baseUrl: z.string().optional(),
|
|
53
|
+
apiKey: z.string().optional(),
|
|
54
|
+
model: z.string().optional(),
|
|
55
|
+
prompt: z.string().optional(),
|
|
56
|
+
maxTokens: z.number().optional(),
|
|
57
|
+
timeoutMs: z.number().optional(),
|
|
58
|
+
maxBytes: z.number().optional(),
|
|
59
|
+
}).optional(),
|
|
50
60
|
}).optional(),
|
|
51
61
|
|
|
52
62
|
network: z.object({
|
|
@@ -81,6 +91,16 @@ export const WecomConfigSchema = ensureJsonSchema(z.object({
|
|
|
81
91
|
retentionHours: z.number().optional(),
|
|
82
92
|
cleanupOnStart: z.boolean().optional(),
|
|
83
93
|
maxBytes: z.number().optional(),
|
|
94
|
+
vision: z.object({
|
|
95
|
+
enabled: z.boolean().optional(),
|
|
96
|
+
baseUrl: z.string().optional(),
|
|
97
|
+
apiKey: z.string().optional(),
|
|
98
|
+
model: z.string().optional(),
|
|
99
|
+
prompt: z.string().optional(),
|
|
100
|
+
maxTokens: z.number().optional(),
|
|
101
|
+
timeoutMs: z.number().optional(),
|
|
102
|
+
maxBytes: z.number().optional(),
|
|
103
|
+
}).optional(),
|
|
84
104
|
}).optional(),
|
|
85
105
|
|
|
86
106
|
network: z.object({
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import type { WecomAccountConfig } from "./types.js";
|
|
2
|
+
|
|
3
|
+
/**
 * Settings for the optional image-description step backed by an
 * OpenAI-compatible chat-completions endpoint.
 */
export type VisionConfig = {
  /** Image description is attempted only when this is truthy. */
  enabled?: boolean;
  /** Endpoint root; requests are POSTed to `${baseUrl}/chat/completions`. */
  baseUrl?: string;
  /** Credential sent as `Authorization: Bearer <apiKey>`. */
  apiKey?: string;
  /** Value of the `model` field in the request payload. */
  model?: string;
  /** Text part of the user message that accompanies the image. */
  prompt?: string;
  /** Value of `max_tokens` in the request payload. */
  maxTokens?: number;
  /** Milliseconds before the in-flight request is aborted. */
  timeoutMs?: number;
  /** Images larger than this many bytes are not sent; a falsy value disables the check. */
  maxBytes?: number;
};
|
|
13
|
+
|
|
14
|
+
/**
 * Normalizes an OpenAI-compatible base URL so it ends in exactly one `/v1` segment.
 *
 * @param raw - Configured base URL; may be undefined, blank, or carry trailing slashes.
 * @returns The normalized URL without a trailing slash, or `null` when nothing usable was given.
 */
function resolveBaseUrl(raw?: string): string | null {
  // Strip trailing slashes BEFORE testing the suffix; otherwise an input such as
  // "https://host/v1/" fails the suffix test and is rewritten to "https://host/v1/v1".
  const value = raw?.trim().replace(/\/+$/, "");
  if (!value) return null;
  return value.endsWith("/v1") ? value : `${value}/v1`;
}
|
|
20
|
+
|
|
21
|
+
/**
 * Derives the effective vision settings for an account.
 *
 * Account-level values win; the base URL, API key and model fall back to the
 * `OPENAI_*` environment variables. Returns `null` when vision is not enabled
 * or when no base URL / API key can be determined.
 *
 * @param accountConfig - Account configuration whose `media.vision` section is consulted.
 * @returns A fully populated config with `enabled: true`, or `null`.
 */
export function resolveVisionConfig(accountConfig: WecomAccountConfig): VisionConfig | null {
  const settings = accountConfig.media?.vision;
  if (!settings?.enabled) return null;

  const env = process.env;
  const endpoint = resolveBaseUrl(
    settings.baseUrl || env.OPENAI_BASE_URL || env.OPENAI_API_BASE || env.OPENAI_ENDPOINT,
  );
  const credential = settings.apiKey || env.OPENAI_API_KEY || env.OPENAI_KEY;
  if (!(endpoint && credential)) return null;

  const { maxTokens, timeoutMs, maxBytes } = settings;
  return {
    enabled: true,
    baseUrl: endpoint,
    apiKey: credential,
    model: settings.model || env.OPENAI_MODEL || "gpt-4o-mini",
    prompt: settings.prompt || "请描述图片内容并尽量提取可见文字。输出简洁中文要点。",
    // Numeric options are honoured only when they are real numbers.
    maxTokens: typeof maxTokens === "number" ? maxTokens : 400,
    timeoutMs: typeof timeoutMs === "number" ? timeoutMs : 15000,
    maxBytes: typeof maxBytes === "number" ? maxBytes : undefined,
  };
}
|
|
46
|
+
|
|
47
|
+
/**
 * Asks an OpenAI-compatible `/chat/completions` endpoint to describe an image.
 *
 * This is best-effort: it never throws. Every failure mode yields `null`:
 * disabled or incomplete config, image larger than `config.maxBytes`, timeout,
 * network error, non-2xx status, or an unexpected response shape.
 *
 * @param params.config - Vision settings; `enabled`, `baseUrl` and `apiKey` must all be set.
 * @param params.buffer - Raw image bytes, sent inline as a base64 data URL.
 * @param params.mimeType - MIME type used in the data URL.
 * @returns The trimmed text of the first choice, or `null`.
 */
export async function describeImageWithVision(params: {
  config: VisionConfig;
  buffer: Buffer;
  mimeType: string;
}): Promise<string | null> {
  const { config, buffer, mimeType } = params;
  if (!config.enabled || !config.baseUrl || !config.apiKey) return null;

  // A falsy maxBytes (undefined or 0) means "no size limit".
  if (config.maxBytes && buffer.length > config.maxBytes) {
    return null;
  }

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), config.timeoutMs ?? 15000);

  try {
    const imageBase64 = buffer.toString("base64");
    const payload = {
      model: config.model,
      messages: [
        {
          role: "user",
          content: [
            { type: "text", text: config.prompt },
            { type: "image_url", image_url: { url: `data:${mimeType};base64,${imageBase64}` } },
          ],
        },
      ],
      max_tokens: config.maxTokens ?? 400,
    };

    const res = await fetch(`${config.baseUrl}/chat/completions`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${config.apiKey}`,
      },
      body: JSON.stringify(payload),
      signal: controller.signal,
    });

    if (!res.ok) {
      // Release the unread error body so the connection is not held until garbage collection.
      await res.body?.cancel();
      return null;
    }

    // Only the fields actually read are described; everything else stays unknown.
    const data = (await res.json()) as
      | { choices?: Array<{ message?: { content?: unknown } }> }
      | null
      | undefined;
    const content = data?.choices?.[0]?.message?.content;
    if (typeof content !== "string") return null;
    return content.trim() || null;
  } catch {
    // Deliberate best-effort: abort, network and JSON errors all degrade to "no description".
    return null;
  } finally {
    clearTimeout(timeout);
  }
}
|
package/wecom/src/types.ts
CHANGED
|
@@ -47,6 +47,16 @@ export type WecomAccountConfig = {
|
|
|
47
47
|
retentionHours?: number;
|
|
48
48
|
cleanupOnStart?: boolean;
|
|
49
49
|
maxBytes?: number;
|
|
50
|
+
vision?: {
|
|
51
|
+
enabled?: boolean;
|
|
52
|
+
baseUrl?: string;
|
|
53
|
+
apiKey?: string;
|
|
54
|
+
model?: string;
|
|
55
|
+
prompt?: string;
|
|
56
|
+
maxTokens?: number;
|
|
57
|
+
timeoutMs?: number;
|
|
58
|
+
maxBytes?: number;
|
|
59
|
+
};
|
|
50
60
|
};
|
|
51
61
|
|
|
52
62
|
// Network behavior
|
package/wecom/src/wecom-app.ts
CHANGED
|
@@ -1,15 +1,16 @@
|
|
|
1
1
|
import type { IncomingMessage, ServerResponse } from "node:http";
|
|
2
2
|
import crypto from "node:crypto";
|
|
3
3
|
import { XMLParser } from "fast-xml-parser";
|
|
4
|
-
import { mkdir, readdir, rm, stat, writeFile } from "node:fs/promises";
|
|
4
|
+
import { mkdir, readFile, readdir, rm, stat, writeFile } from "node:fs/promises";
|
|
5
5
|
import { tmpdir } from "node:os";
|
|
6
|
-
import { join } from "node:path";
|
|
6
|
+
import { basename, extname, join } from "node:path";
|
|
7
7
|
|
|
8
8
|
import type { WecomWebhookTarget } from "./monitor.js";
|
|
9
9
|
import { decryptWecomEncrypted, verifyWecomSignature } from "./crypto.js";
|
|
10
10
|
import { getWecomRuntime } from "./runtime.js";
|
|
11
11
|
import { handleCommand } from "./commands.js";
|
|
12
12
|
import { markdownToWecomText } from "./format.js";
|
|
13
|
+
import { describeImageWithVision, resolveVisionConfig } from "./media-vision.js";
|
|
13
14
|
import { downloadWecomMedia, fetchMediaFromUrl, sendWecomFile, sendWecomImage, sendWecomText, sendWecomVideo, sendWecomVoice, uploadWecomMedia } from "./wecom-api.js";
|
|
14
15
|
|
|
15
16
|
const xmlParser = new XMLParser({
|
|
@@ -26,6 +27,7 @@ type MediaCacheEntry = {
|
|
|
26
27
|
type: "image" | "voice" | "video" | "file";
|
|
27
28
|
mimeType?: string;
|
|
28
29
|
url?: string;
|
|
30
|
+
summary?: string;
|
|
29
31
|
createdAt: number;
|
|
30
32
|
size: number;
|
|
31
33
|
};
|
|
@@ -171,6 +173,139 @@ function normalizeMediaType(raw?: string): "image" | "voice" | "video" | "file"
|
|
|
171
173
|
return null;
|
|
172
174
|
}
|
|
173
175
|
|
|
176
|
+
/**
 * Returns the first argument that is a non-blank string, trimmed.
 *
 * @param values - Candidates in priority order; non-strings and blank strings are skipped.
 * @returns The trimmed winner, or `""` when no candidate qualifies.
 */
function pickString(...values: unknown[]): string {
  const winner = values.find(
    (candidate): candidate is string => typeof candidate === "string" && candidate.trim() !== "",
  );
  return winner === undefined ? "" : winner.trim();
}
|
|
182
|
+
|
|
183
|
+
/**
 * Maps a file extension (no leading dot, any letter case) to a MIME type.
 *
 * @param ext - Extension such as `"png"` or `"DOCX"`.
 * @returns The matching MIME type, or `"application/octet-stream"` for anything unrecognised.
 */
function resolveContentTypeFromExt(ext: string): string {
  switch (ext.toLowerCase()) {
    // images
    case "png":
      return "image/png";
    case "gif":
      return "image/gif";
    case "jpg":
    case "jpeg":
      return "image/jpeg";
    case "webp":
      return "image/webp";
    case "bmp":
      return "image/bmp";
    // audio
    case "amr":
      return "audio/amr";
    case "wav":
      return "audio/wav";
    case "mp3":
      return "audio/mpeg";
    case "m4a":
      return "audio/mp4";
    // video
    case "mp4":
      return "video/mp4";
    case "mov":
      return "video/quicktime";
    case "avi":
      return "video/x-msvideo";
    // documents and archives
    case "pdf":
      return "application/pdf";
    case "txt":
      return "text/plain";
    case "csv":
      return "text/csv";
    case "json":
      return "application/json";
    case "doc":
      return "application/msword";
    case "docx":
      return "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
    case "xls":
      return "application/vnd.ms-excel";
    case "xlsx":
      return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
    case "ppt":
      return "application/vnd.ms-powerpoint";
    case "pptx":
      return "application/vnd.openxmlformats-officedocument.presentationml.presentation";
    case "zip":
      return "application/zip";
    default:
      return "application/octet-stream";
  }
}
|
|
210
|
+
|
|
211
|
+
/**
 * Classifies a MIME type into one of the four media kinds used when sending.
 *
 * @param contentType - MIME type such as `"image/png"`; matched case-insensitively.
 * @returns `"image"`, `"voice"` (for `audio/*`), `"video"`, or `"file"` for everything else.
 */
function resolveMediaTypeFromContentType(contentType: string): "image" | "voice" | "video" | "file" {
  const lowered = contentType.toLowerCase();
  const slashAt = lowered.indexOf("/");
  // Without a slash there is no top-level type to match, so it is a generic file.
  const family = slashAt === -1 ? "" : lowered.slice(0, slashAt);
  switch (family) {
    case "image":
      return "image";
    case "audio":
      return "voice";
    case "video":
      return "video";
    default:
      return "file";
  }
}
|
|
218
|
+
|
|
219
|
+
/**
 * Converts a `file://` URL into a plain filesystem path; other inputs pass through unchanged.
 *
 * Compared with a bare prefix strip this also:
 * - drops an explicit `localhost` authority (`file://localhost/tmp/a` → `/tmp/a`),
 * - decodes percent-escapes (`file:///tmp/my%20file.png` → `/tmp/my file.png`).
 *
 * @param rawPath - A filesystem path or a `file://` URL.
 * @returns The path to hand to the filesystem APIs.
 */
function stripFileProtocol(rawPath: string): string {
  const prefix = "file://";
  if (!rawPath.startsWith(prefix)) return rawPath;

  let rest = rawPath.slice(prefix.length);
  // "localhost" here is the URL authority, not a directory name.
  if (rest.startsWith("localhost/")) rest = rest.slice("localhost".length);

  try {
    return decodeURIComponent(rest);
  } catch {
    // A malformed escape such as a lone "%": keep the text as-is rather than fail.
    return rest;
  }
}
|
|
222
|
+
|
|
223
|
+
/**
 * Splits a base64 input into its payload and, for `data:` URLs, its MIME type.
 *
 * Accepts plain base64 as well as data URLs, including ones that carry extra
 * parameters (`data:image/png;charset=utf-8;base64,...`) or line-wrapped payloads.
 *
 * @param input - Raw base64 text or a `data:<mime>[;param...];base64,<payload>` URL.
 * @returns `data` is the base64 payload; `mimeType` is present only for data URLs.
 */
function parseBase64Input(input: string): { data: string; mimeType?: string } {
  // `[\s\S]*` rather than `.*` so payloads containing newlines still match.
  // The lazy parameter group skips things like `;charset=utf-8` before `;base64,`.
  const match = /^data:([^;,]+)(?:;[^;,]*)*?;base64,([\s\S]*)$/i.exec(input);
  if (match) {
    return { data: match[2] ?? "", mimeType: match[1] };
  }
  return { data: input };
}
|
|
230
|
+
|
|
231
|
+
/**
 * Extracts an outbound-media description from a loosely shaped reply payload.
 *
 * Looks at top-level fields first and then at the first entry of
 * `media` / `attachment` / `file` / `files`. Fields that are not found come
 * back as empty strings.
 *
 * @param payload - Reply payload of unknown shape.
 * @returns The collected fields, or `null` when the payload is not an object
 *          or carries no URL, path, or base64 data.
 */
function resolveOutboundMediaSpec(payload: any): {
  type?: string;
  url?: string;
  path?: string;
  base64?: string;
  filename?: string;
  mimeType?: string;
} | null {
  if (typeof payload !== "object" || payload === null) return null;

  // The nested block may be one object or a list; only the first list entry is used.
  const container = payload.media ?? payload.attachment ?? payload.file ?? payload.files;
  const block = Array.isArray(container) ? container[0] : container;

  const url = pickString(payload.mediaUrl, block?.url, block?.mediaUrl, block?.fileUrl, block?.file_url);
  const path = pickString(payload.mediaPath, payload.filePath, block?.path, block?.filePath, block?.localPath);
  const base64 = pickString(payload.mediaBase64, payload.base64, block?.base64, block?.data);
  const type = pickString(payload.mediaType, block?.type, block?.mediaType);
  const filename = pickString(payload.filename, payload.fileName, block?.filename, block?.fileName, block?.name);
  const mimeType = pickString(payload.mimeType, payload.mediaMimeType, block?.mimeType, block?.contentType);

  const hasSource = Boolean(url || path || base64);
  return hasSource ? { type, url, path, base64, filename, mimeType } : null;
}
|
|
268
|
+
|
|
269
|
+
/**
 * Loads the media referenced by a reply payload into memory and classifies it.
 *
 * Source priority is inline base64, then local path, then URL. Filesystem and
 * network errors propagate to the caller.
 *
 * @param params.payload - Reply payload inspected via `resolveOutboundMediaSpec`.
 * @param params.account - Account passed through to `fetchMediaFromUrl`.
 * @param params.maxBytes - Upper size bound; a falsy value disables the limit.
 * @returns The bytes plus content type, media kind and a sanitized filename, or
 *          `null` when the payload has no media or the media exceeds `maxBytes`.
 */
async function loadOutboundMedia(params: {
  payload: any;
  account: WecomWebhookTarget["account"];
  maxBytes: number | undefined;
}): Promise<{ buffer: Buffer; contentType: string; type: "image" | "voice" | "video" | "file"; filename: string } | null> {
  const spec = resolveOutboundMediaSpec(params.payload);
  if (!spec) return null;

  const { maxBytes } = params;
  let buffer: Buffer | null = null;
  let contentType = spec.mimeType ?? "";
  let filename = spec.filename ?? "";

  if (spec.base64) {
    const parsed = parseBase64Input(spec.base64);
    buffer = Buffer.from(parsed.data, "base64");
    if (!contentType && parsed.mimeType) contentType = parsed.mimeType;
  } else if (spec.path) {
    // NOTE(review): the path comes straight from the reply payload and is read with no
    // directory allow-list. Confirm that payloads are trusted, or restrict readable locations.
    const resolvedPath = stripFileProtocol(spec.path);
    if (maxBytes) {
      // Reject oversized files before reading them, instead of buffering the whole file first.
      const info = await stat(resolvedPath);
      if (info.size > maxBytes) return null;
    }
    buffer = await readFile(resolvedPath);
    if (!filename) filename = basename(resolvedPath);
    if (!contentType) {
      const ext = extname(resolvedPath).replace(".", "");
      contentType = resolveContentTypeFromExt(ext);
    }
  } else if (spec.url) {
    const media = await fetchMediaFromUrl(spec.url, params.account);
    buffer = media.buffer;
    if (!contentType) contentType = media.contentType;
  }

  if (!buffer) return null;
  // Final check covers every source, including files whose stat size was misleading.
  if (maxBytes && buffer.length > maxBytes) return null;

  // An explicit media type wins; otherwise infer the kind from the content type.
  const type = normalizeMediaType(spec.type) ?? resolveMediaTypeFromContentType(contentType || "application/octet-stream");
  const ext = resolveExtFromContentType(contentType || "application/octet-stream", type);
  const safeName = sanitizeFilename(filename, `${type}.${ext}`);

  return { buffer, contentType: contentType || resolveContentTypeFromExt(ext), type, filename: safeName };
}
|
|
308
|
+
|
|
174
309
|
function sanitizeFilename(name: string, fallback: string): string {
|
|
175
310
|
const base = name.split(/[/\\\\]/).pop() ?? "";
|
|
176
311
|
const trimmed = base.trim();
|
|
@@ -327,38 +462,35 @@ async function startAgentForApp(params: {
|
|
|
327
462
|
cfg: config,
|
|
328
463
|
dispatcherOptions: {
|
|
329
464
|
deliver: async (payload, info) => {
|
|
330
|
-
const
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
const media = await fetchMediaFromUrl(maybeMediaUrl, account);
|
|
335
|
-
const type = normalizeMediaType(maybeMediaType) ?? "file";
|
|
336
|
-
const ext = resolveExtFromContentType(media.contentType, type);
|
|
465
|
+
const maxBytes = resolveMediaMaxBytes(target);
|
|
466
|
+
try {
|
|
467
|
+
const outbound = await loadOutboundMedia({ payload, account, maxBytes });
|
|
468
|
+
if (outbound) {
|
|
337
469
|
const mediaId = await uploadWecomMedia({
|
|
338
470
|
account,
|
|
339
|
-
type: type
|
|
340
|
-
buffer:
|
|
341
|
-
filename:
|
|
471
|
+
type: outbound.type,
|
|
472
|
+
buffer: outbound.buffer,
|
|
473
|
+
filename: outbound.filename,
|
|
342
474
|
});
|
|
343
|
-
if (type === "image") {
|
|
475
|
+
if (outbound.type === "image") {
|
|
344
476
|
await sendWecomImage({ account, toUser: fromUser, chatId: isGroup ? chatId : undefined, mediaId });
|
|
345
477
|
logVerbose(target, `app image reply delivered (${info.kind}) to ${fromUser}`);
|
|
346
|
-
} else if (type === "voice") {
|
|
478
|
+
} else if (outbound.type === "voice") {
|
|
347
479
|
await sendWecomVoice({ account, toUser: fromUser, chatId: isGroup ? chatId : undefined, mediaId });
|
|
348
480
|
logVerbose(target, `app voice reply delivered (${info.kind}) to ${fromUser}`);
|
|
349
|
-
} else if (type === "video") {
|
|
481
|
+
} else if (outbound.type === "video") {
|
|
350
482
|
const title = (payload as any).title as string | undefined;
|
|
351
483
|
const description = (payload as any).description as string | undefined;
|
|
352
484
|
await sendWecomVideo({ account, toUser: fromUser, chatId: isGroup ? chatId : undefined, mediaId, title, description });
|
|
353
485
|
logVerbose(target, `app video reply delivered (${info.kind}) to ${fromUser}`);
|
|
354
|
-
} else if (type === "file") {
|
|
486
|
+
} else if (outbound.type === "file") {
|
|
355
487
|
await sendWecomFile({ account, toUser: fromUser, chatId: isGroup ? chatId : undefined, mediaId });
|
|
356
488
|
logVerbose(target, `app file reply delivered (${info.kind}) to ${fromUser}`);
|
|
357
489
|
}
|
|
358
490
|
target.statusSink?.({ lastOutboundAt: Date.now() });
|
|
359
|
-
} catch (err) {
|
|
360
|
-
target.runtime.error?.(`wecom app media reply failed: ${String(err)}`);
|
|
361
491
|
}
|
|
492
|
+
} catch (err) {
|
|
493
|
+
target.runtime.error?.(`wecom app media reply failed: ${String(err)}`);
|
|
362
494
|
}
|
|
363
495
|
|
|
364
496
|
const text = markdownToWecomText(core.channel.text.convertMarkdownTables(payload.text ?? "", tableMode));
|
|
@@ -467,7 +599,11 @@ async function processAppMessage(params: {
|
|
|
467
599
|
if (cached) {
|
|
468
600
|
mediaContext = { type: cached.type, path: cached.path, mimeType: cached.mimeType, url: cached.url };
|
|
469
601
|
logVerbose(target, `app image cache hit: ${cached.path}`);
|
|
470
|
-
|
|
602
|
+
if (cached.summary) {
|
|
603
|
+
messageText = `[用户发送了一张图片]\n\n[图片识别结果]\n${cached.summary}\n\n请根据识别结果回复用户。`;
|
|
604
|
+
} else {
|
|
605
|
+
messageText = "[用户发送了一张图片]\n\n请根据图片内容回复用户。";
|
|
606
|
+
}
|
|
471
607
|
} else {
|
|
472
608
|
let buffer: Buffer | null = null;
|
|
473
609
|
let contentType = "";
|
|
@@ -498,16 +634,27 @@ async function processAppMessage(params: {
|
|
|
498
634
|
await writeFile(tempImagePath, buffer);
|
|
499
635
|
const mimeType = contentType || "image/jpeg";
|
|
500
636
|
mediaContext = { type: "image", path: tempImagePath, mimeType, url: picUrl || undefined };
|
|
637
|
+
|
|
638
|
+
const visionConfig = resolveVisionConfig(target.account.config);
|
|
639
|
+
const summary = visionConfig
|
|
640
|
+
? await describeImageWithVision({ config: visionConfig, buffer, mimeType })
|
|
641
|
+
: null;
|
|
642
|
+
|
|
501
643
|
storeCachedMedia(cacheKey, {
|
|
502
644
|
path: tempImagePath,
|
|
503
645
|
type: "image",
|
|
504
646
|
mimeType,
|
|
505
647
|
url: picUrl || undefined,
|
|
648
|
+
summary: summary ?? undefined,
|
|
506
649
|
createdAt: Date.now(),
|
|
507
650
|
size: buffer.length,
|
|
508
651
|
});
|
|
509
652
|
logVerbose(target, `app image saved (${buffer.length} bytes): ${tempImagePath}`);
|
|
510
|
-
|
|
653
|
+
if (summary) {
|
|
654
|
+
messageText = `[用户发送了一张图片]\n\n[图片识别结果]\n${summary}\n\n请根据识别结果回复用户。`;
|
|
655
|
+
} else {
|
|
656
|
+
messageText = "[用户发送了一张图片]\n\n请根据图片内容回复用户。";
|
|
657
|
+
}
|
|
511
658
|
}
|
|
512
659
|
} else {
|
|
513
660
|
messageText = "[用户发送了一张图片,但下载失败]\n\n请告诉用户图片处理暂时不可用。";
|
|
@@ -582,7 +729,8 @@ async function processAppMessage(params: {
|
|
|
582
729
|
if (cached) {
|
|
583
730
|
mediaContext = { type: cached.type, path: cached.path, mimeType: cached.mimeType, url: cached.url };
|
|
584
731
|
logVerbose(target, `app file cache hit: ${cached.path}`);
|
|
585
|
-
|
|
732
|
+
const cachedName = fileName || basename(cached.path) || "未知文件";
|
|
733
|
+
messageText = `[用户发送了一个文件: ${cachedName},已保存到: ${cached.path}]\n\n请使用 Read 工具查看这个文件的内容并回复用户。`;
|
|
586
734
|
} else {
|
|
587
735
|
const media = await downloadWecomMedia({ account: target.account, mediaId });
|
|
588
736
|
const maxBytes = resolveMediaMaxBytes(target);
|
|
@@ -610,7 +758,7 @@ async function processAppMessage(params: {
|
|
|
610
758
|
size: media.buffer.length,
|
|
611
759
|
});
|
|
612
760
|
logVerbose(target, `app file saved (${media.buffer.length} bytes): ${tempFilePath}`);
|
|
613
|
-
messageText = `[用户发送了一个文件: ${safeName}]\n\n
|
|
761
|
+
messageText = `[用户发送了一个文件: ${safeName},已保存到: ${tempFilePath}]\n\n请使用 Read 工具查看这个文件的内容并回复用户。`;
|
|
614
762
|
}
|
|
615
763
|
}
|
|
616
764
|
} catch (err) {
|
package/wecom/src/wecom-bot.ts
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import type { IncomingMessage, ServerResponse } from "node:http";
|
|
2
2
|
import crypto from "node:crypto";
|
|
3
|
-
import { mkdir, readdir, rm, stat, writeFile } from "node:fs/promises";
|
|
3
|
+
import { mkdir, readFile, readdir, rm, stat, writeFile } from "node:fs/promises";
|
|
4
4
|
import { tmpdir } from "node:os";
|
|
5
|
-
import { join } from "node:path";
|
|
5
|
+
import { basename, extname, join } from "node:path";
|
|
6
6
|
|
|
7
7
|
import type { PluginRuntime } from "openclaw/plugin-sdk";
|
|
8
8
|
|
|
@@ -11,6 +11,7 @@ import type { ResolvedWecomAccount, WecomInboundMessage } from "./types.js";
|
|
|
11
11
|
import { computeWecomMsgSignature, decryptWecomEncrypted, encryptWecomPlaintext, verifyWecomSignature } from "./crypto.js";
|
|
12
12
|
import { fetchMediaFromUrl, sendWecomFile, sendWecomImage, sendWecomVideo, sendWecomVoice, uploadWecomMedia } from "./wecom-api.js";
|
|
13
13
|
import { getWecomRuntime } from "./runtime.js";
|
|
14
|
+
import { describeImageWithVision, resolveVisionConfig } from "./media-vision.js";
|
|
14
15
|
|
|
15
16
|
const STREAM_TTL_MS = 10 * 60 * 1000;
|
|
16
17
|
const STREAM_MAX_BYTES = 20_480;
|
|
@@ -20,7 +21,7 @@ const DEDUPE_MAX_ENTRIES = 2_000;
|
|
|
20
21
|
const MEDIA_CACHE_MAX_ENTRIES = 200;
|
|
21
22
|
|
|
22
23
|
const cleanupExecuted = new Set<string>();
|
|
23
|
-
const mediaCache = new Map<string, { entry: InboundMedia; createdAt: number; size: number }>();
|
|
24
|
+
const mediaCache = new Map<string, { entry: InboundMedia; createdAt: number; size: number; summary?: string }>();
|
|
24
25
|
|
|
25
26
|
type StreamState = {
|
|
26
27
|
streamId: string;
|
|
@@ -413,49 +414,44 @@ async function startAgentForStream(params: {
|
|
|
413
414
|
cfg: config,
|
|
414
415
|
dispatcherOptions: {
|
|
415
416
|
deliver: async (payload) => {
|
|
416
|
-
const maybeMediaUrl = (payload as any).mediaUrl as string | undefined;
|
|
417
|
-
const maybeMediaType = (payload as any).mediaType as string | undefined;
|
|
418
417
|
const canBridgeMedia = account.config.botMediaBridge !== false
|
|
419
418
|
&& Boolean(account.corpId && account.corpSecret && account.agentId);
|
|
420
419
|
const toChatId = chatType === "group" ? chatId : undefined;
|
|
421
420
|
|
|
422
|
-
if (
|
|
421
|
+
if (canBridgeMedia) {
|
|
423
422
|
try {
|
|
424
|
-
const
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
:
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
: media.contentType.includes("amr") ? "amr"
|
|
431
|
-
: media.contentType.includes("wav") ? "wav"
|
|
432
|
-
: media.contentType.includes("mp3") ? "mp3"
|
|
433
|
-
: "bin";
|
|
423
|
+
const outbound = await loadOutboundMedia({
|
|
424
|
+
payload,
|
|
425
|
+
account,
|
|
426
|
+
maxBytes: resolveMediaMaxBytes(target),
|
|
427
|
+
});
|
|
428
|
+
if (outbound) {
|
|
434
429
|
const mediaId = await uploadWecomMedia({
|
|
435
430
|
account,
|
|
436
|
-
type: type
|
|
437
|
-
buffer:
|
|
438
|
-
filename:
|
|
431
|
+
type: outbound.type,
|
|
432
|
+
buffer: outbound.buffer,
|
|
433
|
+
filename: outbound.filename,
|
|
439
434
|
});
|
|
440
|
-
if (type === "image") {
|
|
435
|
+
if (outbound.type === "image") {
|
|
441
436
|
await sendWecomImage({ account, toUser: userid, chatId: toChatId, mediaId });
|
|
442
|
-
} else if (type === "voice") {
|
|
437
|
+
} else if (outbound.type === "voice") {
|
|
443
438
|
await sendWecomVoice({ account, toUser: userid, chatId: toChatId, mediaId });
|
|
444
|
-
} else if (type === "video") {
|
|
439
|
+
} else if (outbound.type === "video") {
|
|
445
440
|
const title = (payload as any).title as string | undefined;
|
|
446
441
|
const description = (payload as any).description as string | undefined;
|
|
447
442
|
await sendWecomVideo({ account, toUser: userid, chatId: toChatId, mediaId, title, description });
|
|
448
|
-
} else if (type === "file") {
|
|
443
|
+
} else if (outbound.type === "file") {
|
|
449
444
|
await sendWecomFile({ account, toUser: userid, chatId: toChatId, mediaId });
|
|
450
445
|
}
|
|
451
446
|
const current = streams.get(streamId);
|
|
452
447
|
if (current) {
|
|
453
|
-
const note = mediaSentLabel(type);
|
|
448
|
+
const note = mediaSentLabel(outbound.type);
|
|
454
449
|
const nextText = current.content ? `${current.content}\n\n${note}` : note;
|
|
455
450
|
current.content = truncateUtf8Bytes(nextText.trim(), STREAM_MAX_BYTES);
|
|
456
451
|
current.updatedAt = Date.now();
|
|
457
452
|
}
|
|
458
453
|
target.statusSink?.({ lastOutboundAt: Date.now() });
|
|
454
|
+
}
|
|
459
455
|
} catch (err) {
|
|
460
456
|
target.runtime.error?.(`[${account.accountId}] wecom bot media bridge failed: ${String(err)}`);
|
|
461
457
|
}
|
|
@@ -593,9 +589,22 @@ async function buildBotMediaMessage(params: {
|
|
|
593
589
|
const cacheKey = buildMediaCacheKey({ url, base64 });
|
|
594
590
|
const cached = await getCachedMedia(cacheKey, resolveMediaRetentionMs(target));
|
|
595
591
|
if (cached) {
|
|
592
|
+
if (msgtype === "image" && cached.summary) {
|
|
593
|
+
return {
|
|
594
|
+
text: `[用户发送了一张图片]\n\n[图片识别结果]\n${cached.summary}\n\n请根据识别结果回复用户。`,
|
|
595
|
+
media: cached.media,
|
|
596
|
+
};
|
|
597
|
+
}
|
|
598
|
+
if (msgtype === "file") {
|
|
599
|
+
const safeName = sanitizeFilename(filename || basename(cached.media.path), "file");
|
|
600
|
+
return {
|
|
601
|
+
text: `[用户发送了一个文件: ${safeName},已保存到: ${cached.media.path}]\n\n请使用 Read 工具查看这个文件的内容并回复用户。`,
|
|
602
|
+
media: cached.media,
|
|
603
|
+
};
|
|
604
|
+
}
|
|
596
605
|
return {
|
|
597
606
|
text: buildInboundMediaPrompt(msgtype, filename),
|
|
598
|
-
media: cached,
|
|
607
|
+
media: cached.media,
|
|
599
608
|
};
|
|
600
609
|
}
|
|
601
610
|
|
|
@@ -652,7 +661,7 @@ async function buildBotMediaMessage(params: {
|
|
|
652
661
|
};
|
|
653
662
|
storeCachedMedia(cacheKey, media, buffer.length);
|
|
654
663
|
return {
|
|
655
|
-
text:
|
|
664
|
+
text: `[用户发送了一个文件: ${safeName},已保存到: ${tempFilePath}]\n\n请使用 Read 工具查看这个文件的内容并回复用户。`,
|
|
656
665
|
media,
|
|
657
666
|
};
|
|
658
667
|
}
|
|
@@ -670,9 +679,19 @@ async function buildBotMediaMessage(params: {
|
|
|
670
679
|
mimeType: contentType || "image/jpeg",
|
|
671
680
|
url,
|
|
672
681
|
};
|
|
673
|
-
|
|
682
|
+
const visionConfig = resolveVisionConfig(target.account.config);
|
|
683
|
+
const summary = visionConfig
|
|
684
|
+
? await describeImageWithVision({
|
|
685
|
+
config: visionConfig,
|
|
686
|
+
buffer,
|
|
687
|
+
mimeType: media.mimeType || "image/jpeg",
|
|
688
|
+
})
|
|
689
|
+
: null;
|
|
690
|
+
storeCachedMedia(cacheKey, media, buffer.length, summary ?? undefined);
|
|
674
691
|
return {
|
|
675
|
-
text:
|
|
692
|
+
text: summary
|
|
693
|
+
? `[用户发送了一张图片]\n\n[图片识别结果]\n${summary}\n\n请根据识别结果回复用户。`
|
|
694
|
+
: buildInboundMediaPrompt("image"),
|
|
676
695
|
media,
|
|
677
696
|
};
|
|
678
697
|
}
|
|
@@ -776,6 +795,132 @@ function normalizeMediaType(raw?: string): "image" | "voice" | "video" | "file"
|
|
|
776
795
|
return null;
|
|
777
796
|
}
|
|
778
797
|
|
|
798
|
+
/**
 * Maps a file extension (no leading dot, any letter case) to a MIME type.
 *
 * @param ext - Extension such as `"png"` or `"DOCX"`.
 * @returns The matching MIME type, or `"application/octet-stream"` for anything unrecognised.
 */
function resolveContentTypeFromExt(ext: string): string {
  switch (ext.toLowerCase()) {
    // images
    case "png":
      return "image/png";
    case "gif":
      return "image/gif";
    case "jpg":
    case "jpeg":
      return "image/jpeg";
    case "webp":
      return "image/webp";
    case "bmp":
      return "image/bmp";
    // audio
    case "amr":
      return "audio/amr";
    case "wav":
      return "audio/wav";
    case "mp3":
      return "audio/mpeg";
    case "m4a":
      return "audio/mp4";
    // video
    case "mp4":
      return "video/mp4";
    case "mov":
      return "video/quicktime";
    case "avi":
      return "video/x-msvideo";
    // documents and archives
    case "pdf":
      return "application/pdf";
    case "txt":
      return "text/plain";
    case "csv":
      return "text/csv";
    case "json":
      return "application/json";
    case "doc":
      return "application/msword";
    case "docx":
      return "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
    case "xls":
      return "application/vnd.ms-excel";
    case "xlsx":
      return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
    case "ppt":
      return "application/vnd.ms-powerpoint";
    case "pptx":
      return "application/vnd.openxmlformats-officedocument.presentationml.presentation";
    case "zip":
      return "application/zip";
    default:
      return "application/octet-stream";
  }
}
|
|
825
|
+
|
|
826
|
+
/**
 * Classifies a MIME type into one of the four media kinds used when sending.
 *
 * @param contentType - MIME type such as `"image/png"`; matched case-insensitively.
 * @returns `"image"`, `"voice"` (for `audio/*`), `"video"`, or `"file"` for everything else.
 */
function resolveMediaTypeFromContentType(contentType: string): "image" | "voice" | "video" | "file" {
  const lowered = contentType.toLowerCase();
  const slashAt = lowered.indexOf("/");
  // Without a slash there is no top-level type to match, so it is a generic file.
  const family = slashAt === -1 ? "" : lowered.slice(0, slashAt);
  switch (family) {
    case "image":
      return "image";
    case "audio":
      return "voice";
    case "video":
      return "video";
    default:
      return "file";
  }
}
|
|
833
|
+
|
|
834
|
+
/**
 * Converts a `file://` URL into a plain filesystem path; other inputs pass through unchanged.
 *
 * Compared with a bare prefix strip this also:
 * - drops an explicit `localhost` authority (`file://localhost/tmp/a` → `/tmp/a`),
 * - decodes percent-escapes (`file:///tmp/my%20file.png` → `/tmp/my file.png`).
 *
 * @param rawPath - A filesystem path or a `file://` URL.
 * @returns The path to hand to the filesystem APIs.
 */
function stripFileProtocol(rawPath: string): string {
  const prefix = "file://";
  if (!rawPath.startsWith(prefix)) return rawPath;

  let rest = rawPath.slice(prefix.length);
  // "localhost" here is the URL authority, not a directory name.
  if (rest.startsWith("localhost/")) rest = rest.slice("localhost".length);

  try {
    return decodeURIComponent(rest);
  } catch {
    // A malformed escape such as a lone "%": keep the text as-is rather than fail.
    return rest;
  }
}
|
|
837
|
+
|
|
838
|
+
/**
 * Splits a base64 input into its payload and, for `data:` URLs, its MIME type.
 *
 * Accepts plain base64 as well as data URLs, including ones that carry extra
 * parameters (`data:image/png;charset=utf-8;base64,...`) or line-wrapped payloads.
 *
 * @param input - Raw base64 text or a `data:<mime>[;param...];base64,<payload>` URL.
 * @returns `data` is the base64 payload; `mimeType` is present only for data URLs.
 */
function parseBase64Input(input: string): { data: string; mimeType?: string } {
  // `[\s\S]*` rather than `.*` so payloads containing newlines still match.
  // The lazy parameter group skips things like `;charset=utf-8` before `;base64,`.
  const match = /^data:([^;,]+)(?:;[^;,]*)*?;base64,([\s\S]*)$/i.exec(input);
  if (match) {
    return { data: match[2] ?? "", mimeType: match[1] };
  }
  return { data: input };
}
|
|
845
|
+
|
|
846
|
+
/**
 * Extracts an outbound-media description from a loosely shaped reply payload.
 *
 * Looks at top-level fields first and then at the first entry of
 * `media` / `attachment` / `file` / `files`. Fields that are not found come
 * back as empty strings.
 *
 * @param payload - Reply payload of unknown shape.
 * @returns The collected fields, or `null` when the payload is not an object
 *          or carries no URL, path, or base64 data.
 */
function resolveOutboundMediaSpec(payload: any): {
  type?: string;
  url?: string;
  path?: string;
  base64?: string;
  filename?: string;
  mimeType?: string;
} | null {
  if (typeof payload !== "object" || payload === null) return null;

  // The nested block may be one object or a list; only the first list entry is used.
  const container = payload.media ?? payload.attachment ?? payload.file ?? payload.files;
  const block = Array.isArray(container) ? container[0] : container;

  const url = pickString(payload.mediaUrl, block?.url, block?.mediaUrl, block?.fileUrl, block?.file_url);
  const path = pickString(payload.mediaPath, payload.filePath, block?.path, block?.filePath, block?.localPath);
  const base64 = pickString(payload.mediaBase64, payload.base64, block?.base64, block?.data);
  const type = pickString(payload.mediaType, block?.type, block?.mediaType);
  const filename = pickString(payload.filename, payload.fileName, block?.filename, block?.fileName, block?.name);
  const mimeType = pickString(payload.mimeType, payload.mediaMimeType, block?.mimeType, block?.contentType);

  const hasSource = Boolean(url || path || base64);
  return hasSource ? { type, url, path, base64, filename, mimeType } : null;
}
|
|
883
|
+
|
|
884
|
+
/**
 * Loads the media referenced by a reply payload into memory and classifies it.
 *
 * Source priority is inline base64, then local path, then URL. Filesystem and
 * network errors propagate to the caller.
 *
 * @param params.payload - Reply payload inspected via `resolveOutboundMediaSpec`.
 * @param params.account - Account passed through to `fetchMediaFromUrl`.
 * @param params.maxBytes - Upper size bound; a falsy value disables the limit.
 * @returns The bytes plus content type, media kind and a sanitized filename, or
 *          `null` when the payload has no media or the media exceeds `maxBytes`.
 */
async function loadOutboundMedia(params: {
  payload: any;
  account: ResolvedWecomAccount;
  maxBytes: number | undefined;
}): Promise<{ buffer: Buffer; contentType: string; type: "image" | "voice" | "video" | "file"; filename: string } | null> {
  const spec = resolveOutboundMediaSpec(params.payload);
  if (!spec) return null;

  const { maxBytes } = params;
  let buffer: Buffer | null = null;
  let contentType = spec.mimeType ?? "";
  let filename = spec.filename ?? "";

  if (spec.base64) {
    const parsed = parseBase64Input(spec.base64);
    buffer = Buffer.from(parsed.data, "base64");
    if (!contentType && parsed.mimeType) contentType = parsed.mimeType;
  } else if (spec.path) {
    // NOTE(review): the path comes straight from the reply payload and is read with no
    // directory allow-list. Confirm that payloads are trusted, or restrict readable locations.
    const resolvedPath = stripFileProtocol(spec.path);
    if (maxBytes) {
      // Reject oversized files before reading them, instead of buffering the whole file first.
      const info = await stat(resolvedPath);
      if (info.size > maxBytes) return null;
    }
    buffer = await readFile(resolvedPath);
    if (!filename) filename = basename(resolvedPath);
    if (!contentType) {
      const ext = extname(resolvedPath).replace(".", "");
      contentType = resolveContentTypeFromExt(ext);
    }
  } else if (spec.url) {
    const media = await fetchMediaFromUrl(spec.url, params.account);
    buffer = media.buffer;
    if (!contentType) contentType = media.contentType;
  }

  if (!buffer) return null;
  // Final check covers every source, including files whose stat size was misleading.
  if (maxBytes && buffer.length > maxBytes) return null;

  // An explicit media type wins; otherwise infer the kind from the content type.
  const type = normalizeMediaType(spec.type) ?? resolveMediaTypeFromContentType(contentType || "application/octet-stream");
  const ext = resolveExtFromContentType(contentType || "application/octet-stream", type);
  const safeName = sanitizeFilename(filename, `${type}.${ext}`);

  return { buffer, contentType: contentType || resolveContentTypeFromExt(ext), type, filename: safeName };
}
|
|
923
|
+
|
|
779
924
|
function mediaSentLabel(type: string): string {
|
|
780
925
|
if (type === "image") return "[已发送图片]";
|
|
781
926
|
if (type === "voice") return "[已发送语音]";
|
|
@@ -812,7 +957,7 @@ function pruneMediaCache(): void {
|
|
|
812
957
|
async function getCachedMedia(
|
|
813
958
|
key: string | null,
|
|
814
959
|
retentionMs?: number,
|
|
815
|
-
): Promise<InboundMedia | null> {
|
|
960
|
+
): Promise<{ media: InboundMedia; summary?: string } | null> {
|
|
816
961
|
if (!key) return null;
|
|
817
962
|
const cached = mediaCache.get(key);
|
|
818
963
|
if (!cached) return null;
|
|
@@ -826,12 +971,12 @@ async function getCachedMedia(
|
|
|
826
971
|
mediaCache.delete(key);
|
|
827
972
|
return null;
|
|
828
973
|
}
|
|
829
|
-
return cached.entry;
|
|
974
|
+
return { media: cached.entry, summary: cached.summary };
|
|
830
975
|
}
|
|
831
976
|
|
|
832
|
-
function storeCachedMedia(key: string | null, entry: InboundMedia, size: number): void {
|
|
977
|
+
function storeCachedMedia(key: string | null, entry: InboundMedia, size: number, summary?: string): void {
|
|
833
978
|
if (!key) return;
|
|
834
|
-
mediaCache.set(key, { entry, createdAt: Date.now(), size });
|
|
979
|
+
mediaCache.set(key, { entry, createdAt: Date.now(), size, summary });
|
|
835
980
|
pruneMediaCache();
|
|
836
981
|
}
|
|
837
982
|
|