@marshulll/openclaw-wecom 0.1.15 → 0.1.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,7 +8,17 @@
8
8
  "tempDir": "/tmp/openclaw-wecom",
9
9
  "retentionHours": 72,
10
10
  "cleanupOnStart": true,
11
- "maxBytes": 10485760
11
+ "maxBytes": 10485760,
12
+ "vision": {
13
+ "enabled": true,
14
+ "baseUrl": "https://newapi.looksunlight.com/v1",
15
+ "apiKey": "YOUR_API_KEY",
16
+ "model": "gpt-4o-mini",
17
+ "prompt": "请描述图片内容并尽量提取可见文字。",
18
+ "maxTokens": 400,
19
+ "timeoutMs": 15000,
20
+ "maxBytes": 5242880
21
+ }
12
22
  },
13
23
  "botMediaBridge": true,
14
24
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@marshulll/openclaw-wecom",
3
- "version": "0.1.15",
3
+ "version": "0.1.17",
4
4
  "type": "module",
5
5
  "description": "OpenClaw WeCom channel plugin (intelligent bot + internal app)",
6
6
  "author": "OpenClaw",
@@ -47,6 +47,16 @@ const accountSchema = z.object({
47
47
  retentionHours: z.number().optional(),
48
48
  cleanupOnStart: z.boolean().optional(),
49
49
  maxBytes: z.number().optional(),
50
+ vision: z.object({
51
+ enabled: z.boolean().optional(),
52
+ baseUrl: z.string().optional(),
53
+ apiKey: z.string().optional(),
54
+ model: z.string().optional(),
55
+ prompt: z.string().optional(),
56
+ maxTokens: z.number().optional(),
57
+ timeoutMs: z.number().optional(),
58
+ maxBytes: z.number().optional(),
59
+ }).optional(),
50
60
  }).optional(),
51
61
 
52
62
  network: z.object({
@@ -81,6 +91,16 @@ export const WecomConfigSchema = ensureJsonSchema(z.object({
81
91
  retentionHours: z.number().optional(),
82
92
  cleanupOnStart: z.boolean().optional(),
83
93
  maxBytes: z.number().optional(),
94
+ vision: z.object({
95
+ enabled: z.boolean().optional(),
96
+ baseUrl: z.string().optional(),
97
+ apiKey: z.string().optional(),
98
+ model: z.string().optional(),
99
+ prompt: z.string().optional(),
100
+ maxTokens: z.number().optional(),
101
+ timeoutMs: z.number().optional(),
102
+ maxBytes: z.number().optional(),
103
+ }).optional(),
84
104
  }).optional(),
85
105
 
86
106
  network: z.object({
@@ -0,0 +1,98 @@
1
+ import type { WecomAccountConfig } from "./types.js";
2
+
3
/**
 * Settings for the optional image-description ("vision") step.
 *
 * Every field is optional here; `resolveVisionConfig` produces a copy with
 * defaults applied, and `describeImageWithVision` consumes it.
 */
export type VisionConfig = {
  /** Master switch; `resolveVisionConfig` returns null unless this is truthy. */
  enabled?: boolean;
  /** Base URL of the API; `/chat/completions` is appended when the request is sent. */
  baseUrl?: string;
  /** Credential sent as `Authorization: Bearer <apiKey>`. */
  apiKey?: string;
  /** Model name placed in the request's `model` field. */
  model?: string;
  /** Text instruction sent alongside the image. */
  prompt?: string;
  /** Value for the request's `max_tokens` field. */
  maxTokens?: number;
  /** Milliseconds before the request is aborted. */
  timeoutMs?: number;
  /** Images larger than this many bytes are not sent for description. */
  maxBytes?: number;
};
13
+
14
/**
 * Normalizes a configured API base URL so that it ends in exactly one `/v1`
 * segment and has no trailing slash.
 *
 * @param raw - Base URL from config or environment; may be undefined or blank.
 * @returns The normalized URL, or null when nothing usable was supplied.
 */
function resolveBaseUrl(raw?: string): string | null {
  // Strip trailing slashes before looking for the version suffix, so that
  // "https://host/v1/" is recognized as already versioned instead of being
  // turned into "https://host/v1/v1".
  const value = raw?.trim().replace(/\/+$/, "");
  if (!value) return null;
  return value.endsWith("/v1") ? value : `${value}/v1`;
}
20
+
21
/**
 * Builds the effective vision settings for an account.
 *
 * Values come from `accountConfig.media.vision` first and fall back to the
 * `OPENAI_*` environment variables for the base URL, API key and model.
 *
 * @param accountConfig - Account configuration that may carry `media.vision`.
 * @returns A config with defaults applied, or null when vision is not enabled
 *          or no base URL / API key could be resolved.
 */
export function resolveVisionConfig(accountConfig: WecomAccountConfig): VisionConfig | null {
  const settings = accountConfig.media?.vision;
  if (!settings || !settings.enabled) return null;

  const env = process.env;
  const rawBaseUrl = settings.baseUrl
    || env.OPENAI_BASE_URL
    || env.OPENAI_API_BASE
    || env.OPENAI_ENDPOINT;
  const baseUrl = resolveBaseUrl(rawBaseUrl);
  const apiKey = settings.apiKey || env.OPENAI_API_KEY || env.OPENAI_KEY;
  if (!(baseUrl && apiKey)) return null;

  // Start from the defaults, then overlay any numeric overrides.
  const resolved: VisionConfig = {
    enabled: true,
    baseUrl,
    apiKey,
    model: settings.model || env.OPENAI_MODEL || "gpt-4o-mini",
    prompt: settings.prompt || "请描述图片内容并尽量提取可见文字。输出简洁中文要点。",
    maxTokens: 400,
    timeoutMs: 15000,
    maxBytes: undefined,
  };
  if (typeof settings.maxTokens === "number") resolved.maxTokens = settings.maxTokens;
  if (typeof settings.timeoutMs === "number") resolved.timeoutMs = settings.timeoutMs;
  if (typeof settings.maxBytes === "number") resolved.maxBytes = settings.maxBytes;
  return resolved;
}
46
+
47
/**
 * Asks a chat-completions endpoint to describe an image.
 *
 * The call is best-effort: a disabled or incomplete config, an image larger
 * than `config.maxBytes`, a non-OK response, a timeout, a network/JSON error
 * or a response without text content all yield null instead of throwing.
 *
 * @param params.config - Vision settings; `enabled`, `baseUrl` and `apiKey` are required to proceed.
 * @param params.buffer - Raw image bytes, sent inline as a base64 data URL.
 * @param params.mimeType - MIME type used in the data URL.
 * @returns The trimmed description text, or null when none could be obtained.
 */
export async function describeImageWithVision(params: {
  config: VisionConfig;
  buffer: Buffer;
  mimeType: string;
}): Promise<string | null> {
  const { config, buffer, mimeType } = params;
  if (!config.enabled || !config.baseUrl || !config.apiKey) return null;

  if (config.maxBytes && buffer.length > config.maxBytes) {
    return null;
  }

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), config.timeoutMs ?? 15000);

  try {
    const imageBase64 = buffer.toString("base64");
    const payload = {
      // Every VisionConfig field is optional, so fall back to the same
      // defaults resolveVisionConfig applies rather than sending undefined.
      model: config.model || "gpt-4o-mini",
      messages: [
        {
          role: "user",
          content: [
            { type: "text", text: config.prompt || "请描述图片内容并尽量提取可见文字。输出简洁中文要点。" },
            { type: "image_url", image_url: { url: `data:${mimeType};base64,${imageBase64}` } },
          ],
        },
      ],
      max_tokens: config.maxTokens ?? 400,
    };

    // Tolerate a trailing slash on the base URL so the path never contains "//".
    const endpoint = `${config.baseUrl.replace(/\/+$/, "")}/chat/completions`;
    const res = await fetch(endpoint, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${config.apiKey}`,
      },
      body: JSON.stringify(payload),
      signal: controller.signal,
    });

    if (!res.ok) return null;
    // Only the path to the first choice's text is read; `content` stays
    // unknown until the typeof check below.
    const data = (await res.json()) as
      | { choices?: Array<{ message?: { content?: unknown } }> }
      | null
      | undefined;
    const content = data?.choices?.[0]?.message?.content;
    if (typeof content !== "string") return null;
    return content.trim() || null;
  } catch {
    // Deliberate best-effort: callers fall back to a generic prompt on null.
    return null;
  } finally {
    clearTimeout(timeout);
  }
}
@@ -47,6 +47,16 @@ export type WecomAccountConfig = {
47
47
  retentionHours?: number;
48
48
  cleanupOnStart?: boolean;
49
49
  maxBytes?: number;
50
+ vision?: {
51
+ enabled?: boolean;
52
+ baseUrl?: string;
53
+ apiKey?: string;
54
+ model?: string;
55
+ prompt?: string;
56
+ maxTokens?: number;
57
+ timeoutMs?: number;
58
+ maxBytes?: number;
59
+ };
50
60
  };
51
61
 
52
62
  // Network behavior
@@ -1,15 +1,16 @@
1
1
  import type { IncomingMessage, ServerResponse } from "node:http";
2
2
  import crypto from "node:crypto";
3
3
  import { XMLParser } from "fast-xml-parser";
4
- import { mkdir, readdir, rm, stat, writeFile } from "node:fs/promises";
4
+ import { mkdir, readFile, readdir, rm, stat, writeFile } from "node:fs/promises";
5
5
  import { tmpdir } from "node:os";
6
- import { join } from "node:path";
6
+ import { basename, extname, join } from "node:path";
7
7
 
8
8
  import type { WecomWebhookTarget } from "./monitor.js";
9
9
  import { decryptWecomEncrypted, verifyWecomSignature } from "./crypto.js";
10
10
  import { getWecomRuntime } from "./runtime.js";
11
11
  import { handleCommand } from "./commands.js";
12
12
  import { markdownToWecomText } from "./format.js";
13
+ import { describeImageWithVision, resolveVisionConfig } from "./media-vision.js";
13
14
  import { downloadWecomMedia, fetchMediaFromUrl, sendWecomFile, sendWecomImage, sendWecomText, sendWecomVideo, sendWecomVoice, uploadWecomMedia } from "./wecom-api.js";
14
15
 
15
16
  const xmlParser = new XMLParser({
@@ -26,6 +27,7 @@ type MediaCacheEntry = {
26
27
  type: "image" | "voice" | "video" | "file";
27
28
  mimeType?: string;
28
29
  url?: string;
30
+ summary?: string;
29
31
  createdAt: number;
30
32
  size: number;
31
33
  };
@@ -171,6 +173,139 @@ function normalizeMediaType(raw?: string): "image" | "voice" | "video" | "file"
171
173
  return null;
172
174
  }
173
175
 
176
/**
 * Returns the first argument that is a string with non-whitespace content,
 * trimmed; returns "" when no argument qualifies.
 */
function pickString(...values: unknown[]): string {
  const firstUsable = values.find(
    (candidate): candidate is string => typeof candidate === "string" && candidate.trim().length > 0,
  );
  return firstUsable === undefined ? "" : firstUsable.trim();
}
182
+
183
/**
 * Maps a file extension (without the leading dot, any letter case) to a MIME
 * type. Unrecognized extensions map to "application/octet-stream".
 */
function resolveContentTypeFromExt(ext: string): string {
  switch (ext.toLowerCase()) {
    // Images
    case "png": return "image/png";
    case "gif": return "image/gif";
    case "jpg":
    case "jpeg": return "image/jpeg";
    case "webp": return "image/webp";
    case "bmp": return "image/bmp";
    // Audio
    case "amr": return "audio/amr";
    case "wav": return "audio/wav";
    case "mp3": return "audio/mpeg";
    case "m4a": return "audio/mp4";
    // Video
    case "mp4": return "video/mp4";
    case "mov": return "video/quicktime";
    case "avi": return "video/x-msvideo";
    // Documents and archives
    case "pdf": return "application/pdf";
    case "txt": return "text/plain";
    case "csv": return "text/csv";
    case "json": return "application/json";
    case "doc": return "application/msword";
    case "docx": return "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
    case "xls": return "application/vnd.ms-excel";
    case "xlsx": return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
    case "ppt": return "application/vnd.ms-powerpoint";
    case "pptx": return "application/vnd.openxmlformats-officedocument.presentationml.presentation";
    case "zip": return "application/zip";
    default: return "application/octet-stream";
  }
}
210
+
211
/**
 * Classifies a MIME type by its top-level type: `image/*` is "image",
 * `audio/*` is "voice", `video/*` is "video", and anything else is "file".
 * Matching is case-insensitive.
 */
function resolveMediaTypeFromContentType(contentType: string): "image" | "voice" | "video" | "file" {
  const lowered = contentType.toLowerCase();
  const slashAt = lowered.indexOf("/");
  // Without a "/" there is no top-level type to match on.
  const family = slashAt < 0 ? "" : lowered.slice(0, slashAt);
  switch (family) {
    case "image":
      return "image";
    case "audio":
      return "voice";
    case "video":
      return "video";
    default:
      return "file";
  }
}
218
+
219
/**
 * Converts a `file://` URL into a filesystem path; any other input is
 * returned unchanged.
 *
 * Beyond dropping the scheme this also:
 *  - drops an explicit `localhost` authority (`file://localhost/tmp/x`),
 *  - decodes percent-escapes (`%20` becomes a space), keeping the raw text if
 *    the escapes are malformed,
 *  - on Windows, drops the slash in front of a drive letter (`/C:/x`).
 *
 * @param rawPath - A plain path or a `file://` URL.
 * @returns The path to hand to filesystem APIs.
 */
function stripFileProtocol(rawPath: string): string {
  const scheme = "file://";
  if (!rawPath.startsWith(scheme)) return rawPath;

  let rest = rawPath.slice(scheme.length);
  if (rest.slice(0, "localhost/".length).toLowerCase() === "localhost/") {
    rest = rest.slice("localhost".length);
  }
  try {
    rest = decodeURIComponent(rest);
  } catch {
    // Malformed escape sequence: fall back to the undecoded path.
  }
  if (process.platform === "win32" && /^\/[A-Za-z]:[\\/]/.test(rest)) {
    rest = rest.slice(1);
  }
  return rest;
}
222
+
223
/**
 * Splits a base64 input into its payload and, for `data:` URLs, its MIME type.
 *
 * Accepts data URLs that carry extra parameters before `;base64`
 * (e.g. `data:text/plain;charset=utf-8;base64,...`) and payloads that contain
 * line breaks. Anything that is not a base64 data URL is returned as-is.
 *
 * @param input - A bare base64 string or a `data:<mime>[;param...];base64,<payload>` URL.
 * @returns The base64 payload and, when present, the declared MIME type.
 */
function parseBase64Input(input: string): { data: string; mimeType?: string } {
  // [\s\S]* rather than .* so a line-wrapped payload still matches; otherwise
  // the "data:...;base64," header itself would be decoded as base64.
  const match = input.trim().match(/^data:([^;,]+)(?:;[^;,]*)*;base64,([\s\S]*)$/i);
  if (match) {
    return { data: match[2], mimeType: match[1] };
  }
  return { data: input };
}
230
+
231
/**
 * Extracts a media description from an outbound reply payload.
 *
 * Looks at top-level payload fields and at a nested media block taken from
 * `media`, `attachment`, `file` or `files` (first element when it is an
 * array). Each result field is the first non-blank string among its candidate
 * properties, or "" when none is set.
 *
 * @param payload - Reply payload of unknown shape.
 * @returns The media description, or null when the payload is not an object
 *          or carries no url, path or base64 source.
 */
function resolveOutboundMediaSpec(payload: any): {
  type?: string;
  url?: string;
  path?: string;
  base64?: string;
  filename?: string;
  mimeType?: string;
} | null {
  if (typeof payload !== "object" || !payload) return null;

  const rawBlock = payload.media ?? payload.attachment ?? payload.file ?? payload.files;
  const block = Array.isArray(rawBlock) ? rawBlock[0] : rawBlock;

  // The three possible sources of the media bytes.
  const url = pickString(payload.mediaUrl, block?.url, block?.mediaUrl, block?.fileUrl, block?.file_url);
  const path = pickString(payload.mediaPath, payload.filePath, block?.path, block?.filePath, block?.localPath);
  const base64 = pickString(payload.mediaBase64, payload.base64, block?.base64, block?.data);

  const hasSource = Boolean(url || path || base64);
  if (!hasSource) return null;

  return {
    type: pickString(payload.mediaType, block?.type, block?.mediaType),
    url,
    path,
    base64,
    filename: pickString(payload.filename, payload.fileName, block?.filename, block?.fileName, block?.name),
    mimeType: pickString(payload.mimeType, payload.mediaMimeType, block?.mimeType, block?.contentType),
  };
}
268
+
269
/**
 * Loads the media attachment described by an outbound reply payload.
 *
 * The bytes come from the first available source, in this order: inline
 * base64, a local file path (optionally a `file://` URL), then a remote URL.
 * The media type is taken from the payload when it names one and is otherwise
 * derived from the content type.
 *
 * NOTE(review): the local path comes straight from the reply payload and is
 * read without any directory restriction — confirm payloads are trusted, or
 * confine reads to an allow-listed directory.
 *
 * @param params.payload - Reply payload; see `resolveOutboundMediaSpec` for the fields read.
 * @param params.account - Account passed through to `fetchMediaFromUrl`.
 * @param params.maxBytes - Size limit in bytes; a falsy value disables the limit.
 * @returns The loaded media, or null when the payload has no media or the
 *          media exceeds `maxBytes`. File and network errors propagate to the caller.
 */
async function loadOutboundMedia(params: {
  payload: any;
  account: WecomWebhookTarget["account"];
  maxBytes: number | undefined;
}): Promise<{ buffer: Buffer; contentType: string; type: "image" | "voice" | "video" | "file"; filename: string } | null> {
  const spec = resolveOutboundMediaSpec(params.payload);
  if (!spec) return null;
  const { maxBytes } = params;

  let buffer: Buffer | null = null;
  let contentType = spec.mimeType ?? "";
  let filename = spec.filename ?? "";

  if (spec.base64) {
    const parsed = parseBase64Input(spec.base64);
    buffer = Buffer.from(parsed.data, "base64");
    if (!contentType && parsed.mimeType) contentType = parsed.mimeType;
  } else if (spec.path) {
    const resolvedPath = stripFileProtocol(spec.path);
    // Check the on-disk size first so an oversized file is rejected without
    // being read into memory. The post-read check below still applies.
    if (maxBytes && (await stat(resolvedPath)).size > maxBytes) return null;
    buffer = await readFile(resolvedPath);
    if (!filename) filename = basename(resolvedPath);
    if (!contentType) {
      const ext = extname(resolvedPath).replace(".", "");
      contentType = resolveContentTypeFromExt(ext);
    }
  } else if (spec.url) {
    const media = await fetchMediaFromUrl(spec.url, params.account);
    buffer = media.buffer;
    if (!contentType) contentType = media.contentType;
  }

  if (!buffer) return null;
  if (maxBytes && buffer.length > maxBytes) return null;

  const type = normalizeMediaType(spec.type) ?? resolveMediaTypeFromContentType(contentType || "application/octet-stream");
  const ext = resolveExtFromContentType(contentType || "application/octet-stream", type);
  // The fallback name is used when the payload supplied no usable filename.
  const safeName = sanitizeFilename(filename, `${type}.${ext}`);

  return { buffer, contentType: contentType || resolveContentTypeFromExt(ext), type, filename: safeName };
}
308
+
174
309
  function sanitizeFilename(name: string, fallback: string): string {
175
310
  const base = name.split(/[/\\\\]/).pop() ?? "";
176
311
  const trimmed = base.trim();
@@ -327,38 +462,35 @@ async function startAgentForApp(params: {
327
462
  cfg: config,
328
463
  dispatcherOptions: {
329
464
  deliver: async (payload, info) => {
330
- const maybeMediaUrl = (payload as any).mediaUrl as string | undefined;
331
- const maybeMediaType = (payload as any).mediaType as string | undefined;
332
- if (maybeMediaUrl) {
333
- try {
334
- const media = await fetchMediaFromUrl(maybeMediaUrl, account);
335
- const type = normalizeMediaType(maybeMediaType) ?? "file";
336
- const ext = resolveExtFromContentType(media.contentType, type);
465
+ const maxBytes = resolveMediaMaxBytes(target);
466
+ try {
467
+ const outbound = await loadOutboundMedia({ payload, account, maxBytes });
468
+ if (outbound) {
337
469
  const mediaId = await uploadWecomMedia({
338
470
  account,
339
- type: type as "image" | "voice" | "video" | "file",
340
- buffer: media.buffer,
341
- filename: `${type}.${ext}`,
471
+ type: outbound.type,
472
+ buffer: outbound.buffer,
473
+ filename: outbound.filename,
342
474
  });
343
- if (type === "image") {
475
+ if (outbound.type === "image") {
344
476
  await sendWecomImage({ account, toUser: fromUser, chatId: isGroup ? chatId : undefined, mediaId });
345
477
  logVerbose(target, `app image reply delivered (${info.kind}) to ${fromUser}`);
346
- } else if (type === "voice") {
478
+ } else if (outbound.type === "voice") {
347
479
  await sendWecomVoice({ account, toUser: fromUser, chatId: isGroup ? chatId : undefined, mediaId });
348
480
  logVerbose(target, `app voice reply delivered (${info.kind}) to ${fromUser}`);
349
- } else if (type === "video") {
481
+ } else if (outbound.type === "video") {
350
482
  const title = (payload as any).title as string | undefined;
351
483
  const description = (payload as any).description as string | undefined;
352
484
  await sendWecomVideo({ account, toUser: fromUser, chatId: isGroup ? chatId : undefined, mediaId, title, description });
353
485
  logVerbose(target, `app video reply delivered (${info.kind}) to ${fromUser}`);
354
- } else if (type === "file") {
486
+ } else if (outbound.type === "file") {
355
487
  await sendWecomFile({ account, toUser: fromUser, chatId: isGroup ? chatId : undefined, mediaId });
356
488
  logVerbose(target, `app file reply delivered (${info.kind}) to ${fromUser}`);
357
489
  }
358
490
  target.statusSink?.({ lastOutboundAt: Date.now() });
359
- } catch (err) {
360
- target.runtime.error?.(`wecom app media reply failed: ${String(err)}`);
361
491
  }
492
+ } catch (err) {
493
+ target.runtime.error?.(`wecom app media reply failed: ${String(err)}`);
362
494
  }
363
495
 
364
496
  const text = markdownToWecomText(core.channel.text.convertMarkdownTables(payload.text ?? "", tableMode));
@@ -467,7 +599,11 @@ async function processAppMessage(params: {
467
599
  if (cached) {
468
600
  mediaContext = { type: cached.type, path: cached.path, mimeType: cached.mimeType, url: cached.url };
469
601
  logVerbose(target, `app image cache hit: ${cached.path}`);
470
- messageText = "[用户发送了一张图片]\n\n请根据图片内容回复用户。";
602
+ if (cached.summary) {
603
+ messageText = `[用户发送了一张图片]\n\n[图片识别结果]\n${cached.summary}\n\n请根据识别结果回复用户。`;
604
+ } else {
605
+ messageText = "[用户发送了一张图片]\n\n请根据图片内容回复用户。";
606
+ }
471
607
  } else {
472
608
  let buffer: Buffer | null = null;
473
609
  let contentType = "";
@@ -498,16 +634,27 @@ async function processAppMessage(params: {
498
634
  await writeFile(tempImagePath, buffer);
499
635
  const mimeType = contentType || "image/jpeg";
500
636
  mediaContext = { type: "image", path: tempImagePath, mimeType, url: picUrl || undefined };
637
+
638
+ const visionConfig = resolveVisionConfig(target.account.config);
639
+ const summary = visionConfig
640
+ ? await describeImageWithVision({ config: visionConfig, buffer, mimeType })
641
+ : null;
642
+
501
643
  storeCachedMedia(cacheKey, {
502
644
  path: tempImagePath,
503
645
  type: "image",
504
646
  mimeType,
505
647
  url: picUrl || undefined,
648
+ summary: summary ?? undefined,
506
649
  createdAt: Date.now(),
507
650
  size: buffer.length,
508
651
  });
509
652
  logVerbose(target, `app image saved (${buffer.length} bytes): ${tempImagePath}`);
510
- messageText = "[用户发送了一张图片]\n\n请根据图片内容回复用户。";
653
+ if (summary) {
654
+ messageText = `[用户发送了一张图片]\n\n[图片识别结果]\n${summary}\n\n请根据识别结果回复用户。`;
655
+ } else {
656
+ messageText = "[用户发送了一张图片]\n\n请根据图片内容回复用户。";
657
+ }
511
658
  }
512
659
  } else {
513
660
  messageText = "[用户发送了一张图片,但下载失败]\n\n请告诉用户图片处理暂时不可用。";
@@ -582,7 +729,8 @@ async function processAppMessage(params: {
582
729
  if (cached) {
583
730
  mediaContext = { type: cached.type, path: cached.path, mimeType: cached.mimeType, url: cached.url };
584
731
  logVerbose(target, `app file cache hit: ${cached.path}`);
585
- messageText = `[用户发送了一个文件: ${fileName || "未知文件"}]\n\n请根据文件内容回复用户。`;
732
+ const cachedName = fileName || basename(cached.path) || "未知文件";
733
+ messageText = `[用户发送了一个文件: ${cachedName},已保存到: ${cached.path}]\n\n请使用 Read 工具查看这个文件的内容并回复用户。`;
586
734
  } else {
587
735
  const media = await downloadWecomMedia({ account: target.account, mediaId });
588
736
  const maxBytes = resolveMediaMaxBytes(target);
@@ -610,7 +758,7 @@ async function processAppMessage(params: {
610
758
  size: media.buffer.length,
611
759
  });
612
760
  logVerbose(target, `app file saved (${media.buffer.length} bytes): ${tempFilePath}`);
613
- messageText = `[用户发送了一个文件: ${safeName}]\n\n请根据文件内容回复用户。`;
761
+ messageText = `[用户发送了一个文件: ${safeName},已保存到: ${tempFilePath}]\n\n请使用 Read 工具查看这个文件的内容并回复用户。`;
614
762
  }
615
763
  }
616
764
  } catch (err) {
@@ -1,8 +1,8 @@
1
1
  import type { IncomingMessage, ServerResponse } from "node:http";
2
2
  import crypto from "node:crypto";
3
- import { mkdir, readdir, rm, stat, writeFile } from "node:fs/promises";
3
+ import { mkdir, readFile, readdir, rm, stat, writeFile } from "node:fs/promises";
4
4
  import { tmpdir } from "node:os";
5
- import { join } from "node:path";
5
+ import { basename, extname, join } from "node:path";
6
6
 
7
7
  import type { PluginRuntime } from "openclaw/plugin-sdk";
8
8
 
@@ -11,6 +11,7 @@ import type { ResolvedWecomAccount, WecomInboundMessage } from "./types.js";
11
11
  import { computeWecomMsgSignature, decryptWecomEncrypted, encryptWecomPlaintext, verifyWecomSignature } from "./crypto.js";
12
12
  import { fetchMediaFromUrl, sendWecomFile, sendWecomImage, sendWecomVideo, sendWecomVoice, uploadWecomMedia } from "./wecom-api.js";
13
13
  import { getWecomRuntime } from "./runtime.js";
14
+ import { describeImageWithVision, resolveVisionConfig } from "./media-vision.js";
14
15
 
15
16
  const STREAM_TTL_MS = 10 * 60 * 1000;
16
17
  const STREAM_MAX_BYTES = 20_480;
@@ -20,7 +21,7 @@ const DEDUPE_MAX_ENTRIES = 2_000;
20
21
  const MEDIA_CACHE_MAX_ENTRIES = 200;
21
22
 
22
23
  const cleanupExecuted = new Set<string>();
23
- const mediaCache = new Map<string, { entry: InboundMedia; createdAt: number; size: number }>();
24
+ const mediaCache = new Map<string, { entry: InboundMedia; createdAt: number; size: number; summary?: string }>();
24
25
 
25
26
  type StreamState = {
26
27
  streamId: string;
@@ -413,49 +414,44 @@ async function startAgentForStream(params: {
413
414
  cfg: config,
414
415
  dispatcherOptions: {
415
416
  deliver: async (payload) => {
416
- const maybeMediaUrl = (payload as any).mediaUrl as string | undefined;
417
- const maybeMediaType = (payload as any).mediaType as string | undefined;
418
417
  const canBridgeMedia = account.config.botMediaBridge !== false
419
418
  && Boolean(account.corpId && account.corpSecret && account.agentId);
420
419
  const toChatId = chatType === "group" ? chatId : undefined;
421
420
 
422
- if (maybeMediaUrl && canBridgeMedia) {
421
+ if (canBridgeMedia) {
423
422
  try {
424
- const media = await fetchMediaFromUrl(maybeMediaUrl, account);
425
- const type = normalizeMediaType(maybeMediaType) ?? "file";
426
- const ext = media.contentType.includes("png") ? "png"
427
- : media.contentType.includes("gif") ? "gif"
428
- : media.contentType.includes("jpeg") || media.contentType.includes("jpg") ? "jpg"
429
- : media.contentType.includes("mp4") ? "mp4"
430
- : media.contentType.includes("amr") ? "amr"
431
- : media.contentType.includes("wav") ? "wav"
432
- : media.contentType.includes("mp3") ? "mp3"
433
- : "bin";
423
+ const outbound = await loadOutboundMedia({
424
+ payload,
425
+ account,
426
+ maxBytes: resolveMediaMaxBytes(target),
427
+ });
428
+ if (outbound) {
434
429
  const mediaId = await uploadWecomMedia({
435
430
  account,
436
- type: type as "image" | "voice" | "video" | "file",
437
- buffer: media.buffer,
438
- filename: `${type}.${ext}`,
431
+ type: outbound.type,
432
+ buffer: outbound.buffer,
433
+ filename: outbound.filename,
439
434
  });
440
- if (type === "image") {
435
+ if (outbound.type === "image") {
441
436
  await sendWecomImage({ account, toUser: userid, chatId: toChatId, mediaId });
442
- } else if (type === "voice") {
437
+ } else if (outbound.type === "voice") {
443
438
  await sendWecomVoice({ account, toUser: userid, chatId: toChatId, mediaId });
444
- } else if (type === "video") {
439
+ } else if (outbound.type === "video") {
445
440
  const title = (payload as any).title as string | undefined;
446
441
  const description = (payload as any).description as string | undefined;
447
442
  await sendWecomVideo({ account, toUser: userid, chatId: toChatId, mediaId, title, description });
448
- } else if (type === "file") {
443
+ } else if (outbound.type === "file") {
449
444
  await sendWecomFile({ account, toUser: userid, chatId: toChatId, mediaId });
450
445
  }
451
446
  const current = streams.get(streamId);
452
447
  if (current) {
453
- const note = mediaSentLabel(type);
448
+ const note = mediaSentLabel(outbound.type);
454
449
  const nextText = current.content ? `${current.content}\n\n${note}` : note;
455
450
  current.content = truncateUtf8Bytes(nextText.trim(), STREAM_MAX_BYTES);
456
451
  current.updatedAt = Date.now();
457
452
  }
458
453
  target.statusSink?.({ lastOutboundAt: Date.now() });
454
+ }
459
455
  } catch (err) {
460
456
  target.runtime.error?.(`[${account.accountId}] wecom bot media bridge failed: ${String(err)}`);
461
457
  }
@@ -593,9 +589,22 @@ async function buildBotMediaMessage(params: {
593
589
  const cacheKey = buildMediaCacheKey({ url, base64 });
594
590
  const cached = await getCachedMedia(cacheKey, resolveMediaRetentionMs(target));
595
591
  if (cached) {
592
+ if (msgtype === "image" && cached.summary) {
593
+ return {
594
+ text: `[用户发送了一张图片]\n\n[图片识别结果]\n${cached.summary}\n\n请根据识别结果回复用户。`,
595
+ media: cached.media,
596
+ };
597
+ }
598
+ if (msgtype === "file") {
599
+ const safeName = sanitizeFilename(filename || basename(cached.media.path), "file");
600
+ return {
601
+ text: `[用户发送了一个文件: ${safeName},已保存到: ${cached.media.path}]\n\n请使用 Read 工具查看这个文件的内容并回复用户。`,
602
+ media: cached.media,
603
+ };
604
+ }
596
605
  return {
597
606
  text: buildInboundMediaPrompt(msgtype, filename),
598
- media: cached,
607
+ media: cached.media,
599
608
  };
600
609
  }
601
610
 
@@ -652,7 +661,7 @@ async function buildBotMediaMessage(params: {
652
661
  };
653
662
  storeCachedMedia(cacheKey, media, buffer.length);
654
663
  return {
655
- text: buildInboundMediaPrompt("file", safeName),
664
+ text: `[用户发送了一个文件: ${safeName},已保存到: ${tempFilePath}]\n\n请使用 Read 工具查看这个文件的内容并回复用户。`,
656
665
  media,
657
666
  };
658
667
  }
@@ -670,9 +679,19 @@ async function buildBotMediaMessage(params: {
670
679
  mimeType: contentType || "image/jpeg",
671
680
  url,
672
681
  };
673
- storeCachedMedia(cacheKey, media, buffer.length);
682
+ const visionConfig = resolveVisionConfig(target.account.config);
683
+ const summary = visionConfig
684
+ ? await describeImageWithVision({
685
+ config: visionConfig,
686
+ buffer,
687
+ mimeType: media.mimeType || "image/jpeg",
688
+ })
689
+ : null;
690
+ storeCachedMedia(cacheKey, media, buffer.length, summary ?? undefined);
674
691
  return {
675
- text: buildInboundMediaPrompt("image"),
692
+ text: summary
693
+ ? `[用户发送了一张图片]\n\n[图片识别结果]\n${summary}\n\n请根据识别结果回复用户。`
694
+ : buildInboundMediaPrompt("image"),
676
695
  media,
677
696
  };
678
697
  }
@@ -776,6 +795,132 @@ function normalizeMediaType(raw?: string): "image" | "voice" | "video" | "file"
776
795
  return null;
777
796
  }
778
797
 
798
/**
 * Maps a file extension (without the leading dot, any letter case) to a MIME
 * type. Unrecognized extensions map to "application/octet-stream".
 */
function resolveContentTypeFromExt(ext: string): string {
  switch (ext.toLowerCase()) {
    // Images
    case "png": return "image/png";
    case "gif": return "image/gif";
    case "jpg":
    case "jpeg": return "image/jpeg";
    case "webp": return "image/webp";
    case "bmp": return "image/bmp";
    // Audio
    case "amr": return "audio/amr";
    case "wav": return "audio/wav";
    case "mp3": return "audio/mpeg";
    case "m4a": return "audio/mp4";
    // Video
    case "mp4": return "video/mp4";
    case "mov": return "video/quicktime";
    case "avi": return "video/x-msvideo";
    // Documents and archives
    case "pdf": return "application/pdf";
    case "txt": return "text/plain";
    case "csv": return "text/csv";
    case "json": return "application/json";
    case "doc": return "application/msword";
    case "docx": return "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
    case "xls": return "application/vnd.ms-excel";
    case "xlsx": return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
    case "ppt": return "application/vnd.ms-powerpoint";
    case "pptx": return "application/vnd.openxmlformats-officedocument.presentationml.presentation";
    case "zip": return "application/zip";
    default: return "application/octet-stream";
  }
}
825
+
826
/**
 * Classifies a MIME type by its top-level type: `image/*` is "image",
 * `audio/*` is "voice", `video/*` is "video", and anything else is "file".
 * Matching is case-insensitive.
 */
function resolveMediaTypeFromContentType(contentType: string): "image" | "voice" | "video" | "file" {
  const lowered = contentType.toLowerCase();
  const slashAt = lowered.indexOf("/");
  // Without a "/" there is no top-level type to match on.
  const family = slashAt < 0 ? "" : lowered.slice(0, slashAt);
  switch (family) {
    case "image":
      return "image";
    case "audio":
      return "voice";
    case "video":
      return "video";
    default:
      return "file";
  }
}
833
+
834
/**
 * Converts a `file://` URL into a filesystem path; any other input is
 * returned unchanged.
 *
 * Beyond dropping the scheme this also:
 *  - drops an explicit `localhost` authority (`file://localhost/tmp/x`),
 *  - decodes percent-escapes (`%20` becomes a space), keeping the raw text if
 *    the escapes are malformed,
 *  - on Windows, drops the slash in front of a drive letter (`/C:/x`).
 *
 * @param rawPath - A plain path or a `file://` URL.
 * @returns The path to hand to filesystem APIs.
 */
function stripFileProtocol(rawPath: string): string {
  const scheme = "file://";
  if (!rawPath.startsWith(scheme)) return rawPath;

  let rest = rawPath.slice(scheme.length);
  if (rest.slice(0, "localhost/".length).toLowerCase() === "localhost/") {
    rest = rest.slice("localhost".length);
  }
  try {
    rest = decodeURIComponent(rest);
  } catch {
    // Malformed escape sequence: fall back to the undecoded path.
  }
  if (process.platform === "win32" && /^\/[A-Za-z]:[\\/]/.test(rest)) {
    rest = rest.slice(1);
  }
  return rest;
}
837
+
838
/**
 * Splits a base64 input into its payload and, for `data:` URLs, its MIME type.
 *
 * Accepts data URLs that carry extra parameters before `;base64`
 * (e.g. `data:text/plain;charset=utf-8;base64,...`) and payloads that contain
 * line breaks. Anything that is not a base64 data URL is returned as-is.
 *
 * @param input - A bare base64 string or a `data:<mime>[;param...];base64,<payload>` URL.
 * @returns The base64 payload and, when present, the declared MIME type.
 */
function parseBase64Input(input: string): { data: string; mimeType?: string } {
  // [\s\S]* rather than .* so a line-wrapped payload still matches; otherwise
  // the "data:...;base64," header itself would be decoded as base64.
  const match = input.trim().match(/^data:([^;,]+)(?:;[^;,]*)*;base64,([\s\S]*)$/i);
  if (match) {
    return { data: match[2], mimeType: match[1] };
  }
  return { data: input };
}
845
+
846
/**
 * Extracts a media description from an outbound reply payload.
 *
 * Looks at top-level payload fields and at a nested media block taken from
 * `media`, `attachment`, `file` or `files` (first element when it is an
 * array). Each result field is the first non-blank string among its candidate
 * properties, or "" when none is set.
 *
 * @param payload - Reply payload of unknown shape.
 * @returns The media description, or null when the payload is not an object
 *          or carries no url, path or base64 source.
 */
function resolveOutboundMediaSpec(payload: any): {
  type?: string;
  url?: string;
  path?: string;
  base64?: string;
  filename?: string;
  mimeType?: string;
} | null {
  if (typeof payload !== "object" || !payload) return null;

  const rawBlock = payload.media ?? payload.attachment ?? payload.file ?? payload.files;
  const block = Array.isArray(rawBlock) ? rawBlock[0] : rawBlock;

  // The three possible sources of the media bytes.
  const url = pickString(payload.mediaUrl, block?.url, block?.mediaUrl, block?.fileUrl, block?.file_url);
  const path = pickString(payload.mediaPath, payload.filePath, block?.path, block?.filePath, block?.localPath);
  const base64 = pickString(payload.mediaBase64, payload.base64, block?.base64, block?.data);

  const hasSource = Boolean(url || path || base64);
  if (!hasSource) return null;

  return {
    type: pickString(payload.mediaType, block?.type, block?.mediaType),
    url,
    path,
    base64,
    filename: pickString(payload.filename, payload.fileName, block?.filename, block?.fileName, block?.name),
    mimeType: pickString(payload.mimeType, payload.mediaMimeType, block?.mimeType, block?.contentType),
  };
}
883
+
884
/**
 * Loads the media attachment described by an outbound reply payload.
 *
 * The bytes come from the first available source, in this order: inline
 * base64, a local file path (optionally a `file://` URL), then a remote URL.
 * The media type is taken from the payload when it names one and is otherwise
 * derived from the content type.
 *
 * NOTE(review): the local path comes straight from the reply payload and is
 * read without any directory restriction — confirm payloads are trusted, or
 * confine reads to an allow-listed directory.
 *
 * @param params.payload - Reply payload; see `resolveOutboundMediaSpec` for the fields read.
 * @param params.account - Account passed through to `fetchMediaFromUrl`.
 * @param params.maxBytes - Size limit in bytes; a falsy value disables the limit.
 * @returns The loaded media, or null when the payload has no media or the
 *          media exceeds `maxBytes`. File and network errors propagate to the caller.
 */
async function loadOutboundMedia(params: {
  payload: any;
  account: ResolvedWecomAccount;
  maxBytes: number | undefined;
}): Promise<{ buffer: Buffer; contentType: string; type: "image" | "voice" | "video" | "file"; filename: string } | null> {
  const spec = resolveOutboundMediaSpec(params.payload);
  if (!spec) return null;
  const { maxBytes } = params;

  let buffer: Buffer | null = null;
  let contentType = spec.mimeType ?? "";
  let filename = spec.filename ?? "";

  if (spec.base64) {
    const parsed = parseBase64Input(spec.base64);
    buffer = Buffer.from(parsed.data, "base64");
    if (!contentType && parsed.mimeType) contentType = parsed.mimeType;
  } else if (spec.path) {
    const resolvedPath = stripFileProtocol(spec.path);
    // Check the on-disk size first so an oversized file is rejected without
    // being read into memory. The post-read check below still applies.
    if (maxBytes && (await stat(resolvedPath)).size > maxBytes) return null;
    buffer = await readFile(resolvedPath);
    if (!filename) filename = basename(resolvedPath);
    if (!contentType) {
      const ext = extname(resolvedPath).replace(".", "");
      contentType = resolveContentTypeFromExt(ext);
    }
  } else if (spec.url) {
    const media = await fetchMediaFromUrl(spec.url, params.account);
    buffer = media.buffer;
    if (!contentType) contentType = media.contentType;
  }

  if (!buffer) return null;
  if (maxBytes && buffer.length > maxBytes) return null;

  const type = normalizeMediaType(spec.type) ?? resolveMediaTypeFromContentType(contentType || "application/octet-stream");
  const ext = resolveExtFromContentType(contentType || "application/octet-stream", type);
  // The fallback name is used when the payload supplied no usable filename.
  const safeName = sanitizeFilename(filename, `${type}.${ext}`);

  return { buffer, contentType: contentType || resolveContentTypeFromExt(ext), type, filename: safeName };
}
923
+
779
924
  function mediaSentLabel(type: string): string {
780
925
  if (type === "image") return "[已发送图片]";
781
926
  if (type === "voice") return "[已发送语音]";
@@ -812,7 +957,7 @@ function pruneMediaCache(): void {
812
957
  async function getCachedMedia(
813
958
  key: string | null,
814
959
  retentionMs?: number,
815
- ): Promise<InboundMedia | null> {
960
+ ): Promise<{ media: InboundMedia; summary?: string } | null> {
816
961
  if (!key) return null;
817
962
  const cached = mediaCache.get(key);
818
963
  if (!cached) return null;
@@ -826,12 +971,12 @@ async function getCachedMedia(
826
971
  mediaCache.delete(key);
827
972
  return null;
828
973
  }
829
- return cached.entry;
974
+ return { media: cached.entry, summary: cached.summary };
830
975
  }
831
976
 
832
- function storeCachedMedia(key: string | null, entry: InboundMedia, size: number): void {
977
+ function storeCachedMedia(key: string | null, entry: InboundMedia, size: number, summary?: string): void {
833
978
  if (!key) return;
834
- mediaCache.set(key, { entry, createdAt: Date.now(), size });
979
+ mediaCache.set(key, { entry, createdAt: Date.now(), size, summary });
835
980
  pruneMediaCache();
836
981
  }
837
982