@marshulll/openclaw-wecom 0.1.14 → 0.1.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,7 +8,17 @@
  "tempDir": "/tmp/openclaw-wecom",
  "retentionHours": 72,
  "cleanupOnStart": true,
- "maxBytes": 10485760
+ "maxBytes": 10485760,
+ "vision": {
+ "enabled": true,
+ "baseUrl": "https://newapi.looksunlight.com/v1",
+ "apiKey": "YOUR_API_KEY",
+ "model": "gpt-4o-mini",
+ "prompt": "请描述图片内容并尽量提取可见文字。",
+ "maxTokens": 400,
+ "timeoutMs": 15000,
+ "maxBytes": 5242880
+ }
  },
  "botMediaBridge": true,

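For context, the new vision block shown above sits inside the account's media settings (the same object that holds tempDir, retentionHours, cleanupOnStart, and maxBytes; the plugin code later in this diff reads it as accountConfig.media?.vision). A minimal nested sketch using only the keys visible in this hunk; the baseUrl and apiKey values are placeholders, and the default prompt (omitted here) asks the model to describe the image and extract any visible text in Chinese:

  "media": {
    "tempDir": "/tmp/openclaw-wecom",
    "retentionHours": 72,
    "maxBytes": 10485760,
    "vision": {
      "enabled": true,
      "baseUrl": "https://newapi.looksunlight.com/v1",
      "apiKey": "YOUR_API_KEY",
      "model": "gpt-4o-mini",
      "maxTokens": 400,
      "timeoutMs": 15000,
      "maxBytes": 5242880
    }
  }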
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@marshulll/openclaw-wecom",
- "version": "0.1.14",
+ "version": "0.1.16",
  "type": "module",
  "description": "OpenClaw WeCom channel plugin (intelligent bot + internal app)",
  "author": "OpenClaw",
@@ -47,6 +47,16 @@ const accountSchema = z.object({
  retentionHours: z.number().optional(),
  cleanupOnStart: z.boolean().optional(),
  maxBytes: z.number().optional(),
+ vision: z.object({
+ enabled: z.boolean().optional(),
+ baseUrl: z.string().optional(),
+ apiKey: z.string().optional(),
+ model: z.string().optional(),
+ prompt: z.string().optional(),
+ maxTokens: z.number().optional(),
+ timeoutMs: z.number().optional(),
+ maxBytes: z.number().optional(),
+ }).optional(),
  }).optional(),

  network: z.object({
@@ -81,6 +91,16 @@ export const WecomConfigSchema = ensureJsonSchema(z.object({
  retentionHours: z.number().optional(),
  cleanupOnStart: z.boolean().optional(),
  maxBytes: z.number().optional(),
+ vision: z.object({
+ enabled: z.boolean().optional(),
+ baseUrl: z.string().optional(),
+ apiKey: z.string().optional(),
+ model: z.string().optional(),
+ prompt: z.string().optional(),
+ maxTokens: z.number().optional(),
+ timeoutMs: z.number().optional(),
+ maxBytes: z.number().optional(),
+ }).optional(),
  }).optional(),

  network: z.object({
@@ -0,0 +1,98 @@
+ import type { WecomAccountConfig } from "./types.js";
+
+ export type VisionConfig = {
+ enabled?: boolean;
+ baseUrl?: string;
+ apiKey?: string;
+ model?: string;
+ prompt?: string;
+ maxTokens?: number;
+ timeoutMs?: number;
+ maxBytes?: number;
+ };
+
+ function resolveBaseUrl(raw?: string): string | null {
+ const value = raw?.trim();
+ if (!value) return null;
+ if (value.endsWith("/v1")) return value;
+ return `${value.replace(/\/+$/, "")}/v1`;
+ }
+
+ export function resolveVisionConfig(accountConfig: WecomAccountConfig): VisionConfig | null {
+ const vision = accountConfig.media?.vision;
+ if (!vision?.enabled) return null;
+
+ const baseUrl = resolveBaseUrl(
+ vision.baseUrl
+ || process.env.OPENAI_BASE_URL
+ || process.env.OPENAI_API_BASE
+ || process.env.OPENAI_ENDPOINT,
+ );
+ const apiKey = vision.apiKey || process.env.OPENAI_API_KEY || process.env.OPENAI_KEY;
+ if (!baseUrl || !apiKey) return null;
+
+ return {
+ enabled: true,
+ baseUrl,
+ apiKey,
+ model: vision.model || process.env.OPENAI_MODEL || "gpt-4o-mini",
+ prompt: vision.prompt
+ || "请描述图片内容并尽量提取可见文字。输出简洁中文要点。",
+ maxTokens: typeof vision.maxTokens === "number" ? vision.maxTokens : 400,
+ timeoutMs: typeof vision.timeoutMs === "number" ? vision.timeoutMs : 15000,
+ maxBytes: typeof vision.maxBytes === "number" ? vision.maxBytes : undefined,
+ };
+ }
+
+ export async function describeImageWithVision(params: {
+ config: VisionConfig;
+ buffer: Buffer;
+ mimeType: string;
+ }): Promise<string | null> {
+ const { config, buffer, mimeType } = params;
+ if (!config.enabled || !config.baseUrl || !config.apiKey) return null;
+
+ if (config.maxBytes && buffer.length > config.maxBytes) {
+ return null;
+ }
+
+ const controller = new AbortController();
+ const timeout = setTimeout(() => controller.abort(), config.timeoutMs ?? 15000);
+
+ try {
+ const imageBase64 = buffer.toString("base64");
+ const payload = {
+ model: config.model,
+ messages: [
+ {
+ role: "user",
+ content: [
+ { type: "text", text: config.prompt },
+ { type: "image_url", image_url: { url: `data:${mimeType};base64,${imageBase64}` } },
+ ],
+ },
+ ],
+ max_tokens: config.maxTokens ?? 400,
+ };
+
+ const res = await fetch(`${config.baseUrl}/chat/completions`, {
+ method: "POST",
+ headers: {
+ "Content-Type": "application/json",
+ Authorization: `Bearer ${config.apiKey}`,
+ },
+ body: JSON.stringify(payload),
+ signal: controller.signal,
+ });
+
+ if (!res.ok) return null;
+ const data = await res.json() as any;
+ const content = data?.choices?.[0]?.message?.content;
+ if (typeof content !== "string") return null;
+ return content.trim() || null;
+ } catch {
+ return null;
+ } finally {
+ clearTimeout(timeout);
+ }
+ }
@@ -47,6 +47,16 @@ export type WecomAccountConfig = {
  retentionHours?: number;
  cleanupOnStart?: boolean;
  maxBytes?: number;
+ vision?: {
+ enabled?: boolean;
+ baseUrl?: string;
+ apiKey?: string;
+ model?: string;
+ prompt?: string;
+ maxTokens?: number;
+ timeoutMs?: number;
+ maxBytes?: number;
+ };
  };

  // Network behavior
@@ -1,4 +1,5 @@
  import type { IncomingMessage, ServerResponse } from "node:http";
+ import crypto from "node:crypto";
  import { XMLParser } from "fast-xml-parser";
  import { mkdir, readdir, rm, stat, writeFile } from "node:fs/promises";
  import { tmpdir } from "node:os";
@@ -9,6 +10,7 @@ import { decryptWecomEncrypted, verifyWecomSignature } from "./crypto.js";
  import { getWecomRuntime } from "./runtime.js";
  import { handleCommand } from "./commands.js";
  import { markdownToWecomText } from "./format.js";
+ import { describeImageWithVision, resolveVisionConfig } from "./media-vision.js";
  import { downloadWecomMedia, fetchMediaFromUrl, sendWecomFile, sendWecomImage, sendWecomText, sendWecomVideo, sendWecomVoice, uploadWecomMedia } from "./wecom-api.js";

  const xmlParser = new XMLParser({
@@ -18,6 +20,19 @@ const xmlParser = new XMLParser({
  });

  const MAX_REQUEST_BODY_SIZE = 1024 * 1024;
+ const MEDIA_CACHE_MAX_ENTRIES = 200;
+
+ type MediaCacheEntry = {
+ path: string;
+ type: "image" | "voice" | "video" | "file";
+ mimeType?: string;
+ url?: string;
+ summary?: string;
+ createdAt: number;
+ size: number;
+ };
+
+ const mediaCache = new Map<string, MediaCacheEntry>();

  function parseIncomingXml(xml: string): Record<string, any> {
  const obj = xmlParser.parse(xml);
@@ -146,6 +161,11 @@ function resolveMediaMaxBytes(target: WecomWebhookTarget): number | undefined {
  return typeof maxBytes === "number" && maxBytes > 0 ? maxBytes : undefined;
  }

+ function resolveMediaRetentionMs(target: WecomWebhookTarget): number | undefined {
+ const hours = target.account.config.media?.retentionHours;
+ return typeof hours === "number" && hours > 0 ? hours * 3600 * 1000 : undefined;
+ }
+
  function normalizeMediaType(raw?: string): "image" | "voice" | "video" | "file" | null {
  if (!raw) return null;
  const value = raw.toLowerCase();
@@ -164,6 +184,51 @@ function sanitizeFilename(name: string, fallback: string): string {
  return finalName || fallback;
  }

+ function hashKey(input: string): string {
+ return crypto.createHash("sha1").update(input).digest("hex");
+ }
+
+ function buildMediaCacheKey(params: { mediaId?: string; url?: string }): string | null {
+ if (params.mediaId) return `media:${params.mediaId}`;
+ if (params.url) return `url:${hashKey(params.url)}`;
+ return null;
+ }
+
+ function pruneMediaCache(): void {
+ if (mediaCache.size <= MEDIA_CACHE_MAX_ENTRIES) return;
+ const entries = Array.from(mediaCache.entries())
+ .sort((a, b) => a[1].createdAt - b[1].createdAt);
+ const excess = entries.length - MEDIA_CACHE_MAX_ENTRIES;
+ for (let i = 0; i < excess; i += 1) {
+ mediaCache.delete(entries[i]![0]);
+ }
+ }
+
+ async function getCachedMedia(
+ key: string | null,
+ retentionMs?: number,
+ ): Promise<MediaCacheEntry | null> {
+ if (!key) return null;
+ const entry = mediaCache.get(key);
+ if (!entry) return null;
+ if (retentionMs && Date.now() - entry.createdAt > retentionMs) {
+ mediaCache.delete(key);
+ return null;
+ }
+ try {
+ await stat(entry.path);
+ } catch {
+ mediaCache.delete(key);
+ return null;
+ }
+ return entry;
+ }
+
+ function storeCachedMedia(key: string | null, entry: MediaCacheEntry): void {
+ if (!key) return;
+ mediaCache.set(key, entry);
+ pruneMediaCache();
+ }

  async function startAgentForApp(params: {
  target: WecomWebhookTarget;
@@ -174,6 +239,7 @@ async function startAgentForApp(params: {
  media?: {
  type: "image" | "voice" | "video" | "file";
  path: string;
+ mimeType?: string;
  url?: string;
  } | null;
  }): Promise<void> {
@@ -229,6 +295,9 @@ async function startAgentForApp(params: {
  if (media?.path) {
  ctxPayload.MediaPath = media.path;
  ctxPayload.MediaType = media.type;
+ if (media.mimeType) {
+ (ctxPayload as any).MediaMimeType = media.mimeType;
+ }
  if (media.url) {
  ctxPayload.MediaUrl = media.url;
  }
@@ -331,7 +400,8 @@ async function processAppMessage(params: {
  if (!fromUser) return;

  let messageText = "";
- let mediaContext: { type: "image" | "voice" | "video" | "file"; path: string; url?: string } | null = null;
+ const retentionMs = resolveMediaRetentionMs(target);
+ let mediaContext: { type: "image" | "voice" | "video" | "file"; path: string; mimeType?: string; url?: string } | null = null;

  if (msgType === "text") {
  messageText = String(msgObj?.Content ?? "");
@@ -345,24 +415,40 @@
  const mediaId = String(msgObj?.MediaId ?? "");
  if (mediaId) {
  try {
- const media = await downloadWecomMedia({ account: target.account, mediaId });
- const maxBytes = resolveMediaMaxBytes(target);
- if (maxBytes && media.buffer.length > maxBytes) {
- messageText = "[语音消息过大,未处理]\n\n请发送更短的语音消息。";
+ const cacheKey = buildMediaCacheKey({ mediaId });
+ const cached = await getCachedMedia(cacheKey, retentionMs);
+ if (cached) {
+ mediaContext = { type: cached.type, path: cached.path, mimeType: cached.mimeType, url: cached.url };
+ logVerbose(target, `app voice cache hit: ${cached.path}`);
+ messageText = "[用户发送了一条语音消息]\n\n请根据语音内容回复用户。";
  } else {
- const ext = resolveExtFromContentType(media.contentType, "amr");
- const tempDir = resolveMediaTempDir(target);
- await mkdir(tempDir, { recursive: true });
- await cleanupMediaDir(
- tempDir,
- target.account.config.media?.retentionHours,
- target.account.config.media?.cleanupOnStart,
- );
- const tempVoicePath = join(tempDir, `voice-${Date.now()}-${Math.random().toString(36).slice(2)}.${ext}`);
- await writeFile(tempVoicePath, media.buffer);
- mediaContext = { type: "voice", path: tempVoicePath };
- logVerbose(target, `app voice saved (${media.buffer.length} bytes): ${tempVoicePath}`);
- messageText = `[用户发送了一条语音消息,已保存到: ${tempVoicePath}]\n\n请根据语音内容回复用户。`;
+ const media = await downloadWecomMedia({ account: target.account, mediaId });
+ const maxBytes = resolveMediaMaxBytes(target);
+ if (maxBytes && media.buffer.length > maxBytes) {
+ messageText = "[语音消息过大,未处理]\n\n请发送更短的语音消息。";
+ } else {
+ const ext = resolveExtFromContentType(media.contentType, "amr");
+ const tempDir = resolveMediaTempDir(target);
+ await mkdir(tempDir, { recursive: true });
+ await cleanupMediaDir(
+ tempDir,
+ target.account.config.media?.retentionHours,
+ target.account.config.media?.cleanupOnStart,
+ );
+ const tempVoicePath = join(tempDir, `voice-${Date.now()}-${Math.random().toString(36).slice(2)}.${ext}`);
+ await writeFile(tempVoicePath, media.buffer);
+ const mimeType = media.contentType || "audio/amr";
+ mediaContext = { type: "voice", path: tempVoicePath, mimeType };
+ storeCachedMedia(cacheKey, {
+ path: tempVoicePath,
+ type: "voice",
+ mimeType,
+ createdAt: Date.now(),
+ size: media.buffer.length,
+ });
+ logVerbose(target, `app voice saved (${media.buffer.length} bytes): ${tempVoicePath}`);
+ messageText = "[用户发送了一条语音消息]\n\n请根据语音内容回复用户。";
+ }
  }
  } catch (err) {
  target.runtime.error?.(`wecom app voice download failed: ${String(err)}`);
@@ -378,39 +464,71 @@ async function processAppMessage(params: {
  const mediaId = String(msgObj?.MediaId ?? "");
  const picUrl = String(msgObj?.PicUrl ?? "");
  try {
- let buffer: Buffer | null = null;
- let contentType = "";
- if (mediaId) {
- const media = await downloadWecomMedia({ account: target.account, mediaId });
- buffer = media.buffer;
- contentType = media.contentType;
- } else if (picUrl) {
- const media = await fetchMediaFromUrl(picUrl, target.account);
- buffer = media.buffer;
- contentType = media.contentType;
- }
-
- if (buffer) {
- const maxBytes = resolveMediaMaxBytes(target);
- if (maxBytes && buffer.length > maxBytes) {
- messageText = "[图片过大,未处理]\n\n请发送更小的图片。";
+ const cacheKey = buildMediaCacheKey({ mediaId, url: picUrl });
+ const cached = await getCachedMedia(cacheKey, retentionMs);
+ if (cached) {
+ mediaContext = { type: cached.type, path: cached.path, mimeType: cached.mimeType, url: cached.url };
+ logVerbose(target, `app image cache hit: ${cached.path}`);
+ if (cached.summary) {
+ messageText = `[用户发送了一张图片]\n\n[图片识别结果]\n${cached.summary}\n\n请根据识别结果回复用户。`;
  } else {
- const ext = resolveExtFromContentType(contentType, "jpg");
- const tempDir = resolveMediaTempDir(target);
- await mkdir(tempDir, { recursive: true });
- await cleanupMediaDir(
- tempDir,
- target.account.config.media?.retentionHours,
- target.account.config.media?.cleanupOnStart,
- );
- const tempImagePath = join(tempDir, `image-${Date.now()}-${Math.random().toString(36).slice(2)}.${ext}`);
- await writeFile(tempImagePath, buffer);
- mediaContext = { type: "image", path: tempImagePath, url: picUrl || undefined };
- logVerbose(target, `app image saved (${buffer.length} bytes): ${tempImagePath}`);
- messageText = `[用户发送了一张图片,已保存到: ${tempImagePath}]\n\n请根据图片内容回复用户。`;
+ messageText = "[用户发送了一张图片]\n\n请根据图片内容回复用户。";
  }
  } else {
- messageText = "[用户发送了一张图片,但下载失败]\n\n请告诉用户图片处理暂时不可用。";
+ let buffer: Buffer | null = null;
+ let contentType = "";
+ if (mediaId) {
+ const media = await downloadWecomMedia({ account: target.account, mediaId });
+ buffer = media.buffer;
+ contentType = media.contentType;
+ } else if (picUrl) {
+ const media = await fetchMediaFromUrl(picUrl, target.account);
+ buffer = media.buffer;
+ contentType = media.contentType;
+ }
+
+ if (buffer) {
+ const maxBytes = resolveMediaMaxBytes(target);
+ if (maxBytes && buffer.length > maxBytes) {
+ messageText = "[图片过大,未处理]\n\n请发送更小的图片。";
+ } else {
+ const ext = resolveExtFromContentType(contentType, "jpg");
+ const tempDir = resolveMediaTempDir(target);
+ await mkdir(tempDir, { recursive: true });
+ await cleanupMediaDir(
+ tempDir,
+ target.account.config.media?.retentionHours,
+ target.account.config.media?.cleanupOnStart,
+ );
+ const tempImagePath = join(tempDir, `image-${Date.now()}-${Math.random().toString(36).slice(2)}.${ext}`);
+ await writeFile(tempImagePath, buffer);
+ const mimeType = contentType || "image/jpeg";
+ mediaContext = { type: "image", path: tempImagePath, mimeType, url: picUrl || undefined };
+
+ const visionConfig = resolveVisionConfig(target.account.config);
+ const summary = visionConfig
+ ? await describeImageWithVision({ config: visionConfig, buffer, mimeType })
+ : null;
+
+ storeCachedMedia(cacheKey, {
+ path: tempImagePath,
+ type: "image",
+ mimeType,
+ url: picUrl || undefined,
+ summary: summary ?? undefined,
+ createdAt: Date.now(),
+ size: buffer.length,
+ });
+ logVerbose(target, `app image saved (${buffer.length} bytes): ${tempImagePath}`);
+ if (summary) {
+ messageText = `[用户发送了一张图片]\n\n[图片识别结果]\n${summary}\n\n请根据识别结果回复用户。`;
+ } else {
+ messageText = "[用户发送了一张图片]\n\n请根据图片内容回复用户。";
+ }
+ }
+ } else {
+ messageText = "[用户发送了一张图片,但下载失败]\n\n请告诉用户图片处理暂时不可用。";
+ }
  }
  } catch (err) {
  target.runtime.error?.(`wecom app image download failed: ${String(err)}`);
@@ -429,7 +547,14 @@ async function processAppMessage(params: {
  const mediaId = String(msgObj?.MediaId ?? "");
  if (mediaId) {
  try {
- const media = await downloadWecomMedia({ account: target.account, mediaId });
+ const cacheKey = buildMediaCacheKey({ mediaId });
+ const cached = await getCachedMedia(cacheKey, retentionMs);
+ if (cached) {
+ mediaContext = { type: cached.type, path: cached.path, mimeType: cached.mimeType, url: cached.url };
+ logVerbose(target, `app video cache hit: ${cached.path}`);
+ messageText = "[用户发送了一个视频文件]\n\n请根据视频内容回复用户。";
+ } else {
+ const media = await downloadWecomMedia({ account: target.account, mediaId });
  const maxBytes = resolveMediaMaxBytes(target);
  if (maxBytes && media.buffer.length > maxBytes) {
  messageText = "[视频过大,未处理]\n\n请发送更小的视频。";
@@ -444,9 +569,18 @@ async function processAppMessage(params: {
  );
  const tempVideoPath = join(tempDir, `video-${Date.now()}-${Math.random().toString(36).slice(2)}.${ext}`);
  await writeFile(tempVideoPath, media.buffer);
- mediaContext = { type: "video", path: tempVideoPath };
+ const mimeType = media.contentType || "video/mp4";
+ mediaContext = { type: "video", path: tempVideoPath, mimeType };
+ storeCachedMedia(cacheKey, {
+ path: tempVideoPath,
+ type: "video",
+ mimeType,
+ createdAt: Date.now(),
+ size: media.buffer.length,
+ });
  logVerbose(target, `app video saved (${media.buffer.length} bytes): ${tempVideoPath}`);
- messageText = `[用户发送了一个视频文件,已保存到: ${tempVideoPath}]\n\n请根据视频内容回复用户。`;
+ messageText = "[用户发送了一个视频文件]\n\n请根据视频内容回复用户。";
+ }
  }
  } catch (err) {
  target.runtime.error?.(`wecom app video download failed: ${String(err)}`);
@@ -460,7 +594,14 @@ async function processAppMessage(params: {
  const fileName = String(msgObj?.FileName ?? "");
  if (mediaId) {
  try {
- const media = await downloadWecomMedia({ account: target.account, mediaId });
+ const cacheKey = buildMediaCacheKey({ mediaId });
+ const cached = await getCachedMedia(cacheKey, retentionMs);
+ if (cached) {
+ mediaContext = { type: cached.type, path: cached.path, mimeType: cached.mimeType, url: cached.url };
+ logVerbose(target, `app file cache hit: ${cached.path}`);
+ messageText = `[用户发送了一个文件: ${fileName || "未知文件"}]\n\n请根据文件内容回复用户。`;
+ } else {
+ const media = await downloadWecomMedia({ account: target.account, mediaId });
  const maxBytes = resolveMediaMaxBytes(target);
  if (maxBytes && media.buffer.length > maxBytes) {
  messageText = "[文件过大,未处理]\n\n请发送更小的文件。";
@@ -476,9 +617,18 @@ async function processAppMessage(params: {
  const safeName = sanitizeFilename(fileName, `file-${Date.now()}.${ext}`);
  const tempFilePath = join(tempDir, safeName);
  await writeFile(tempFilePath, media.buffer);
- mediaContext = { type: "file", path: tempFilePath };
+ const mimeType = media.contentType || "application/octet-stream";
+ mediaContext = { type: "file", path: tempFilePath, mimeType };
+ storeCachedMedia(cacheKey, {
+ path: tempFilePath,
+ type: "file",
+ mimeType,
+ createdAt: Date.now(),
+ size: media.buffer.length,
+ });
  logVerbose(target, `app file saved (${media.buffer.length} bytes): ${tempFilePath}`);
- messageText = `[用户发送了一个文件: ${safeName},已保存到: ${tempFilePath}]\n\n请根据文件内容回复用户。`;
+ messageText = `[用户发送了一个文件: ${safeName}]\n\n请根据文件内容回复用户。`;
+ }
  }
  } catch (err) {
  target.runtime.error?.(`wecom app file download failed: ${String(err)}`);
@@ -11,14 +11,17 @@ import type { ResolvedWecomAccount, WecomInboundMessage } from "./types.js";
  import { computeWecomMsgSignature, decryptWecomEncrypted, encryptWecomPlaintext, verifyWecomSignature } from "./crypto.js";
  import { fetchMediaFromUrl, sendWecomFile, sendWecomImage, sendWecomVideo, sendWecomVoice, uploadWecomMedia } from "./wecom-api.js";
  import { getWecomRuntime } from "./runtime.js";
+ import { describeImageWithVision, resolveVisionConfig } from "./media-vision.js";

  const STREAM_TTL_MS = 10 * 60 * 1000;
  const STREAM_MAX_BYTES = 20_480;
  const STREAM_MAX_ENTRIES = 500;
  const DEDUPE_TTL_MS = 2 * 60 * 1000;
  const DEDUPE_MAX_ENTRIES = 2_000;
+ const MEDIA_CACHE_MAX_ENTRIES = 200;

  const cleanupExecuted = new Set<string>();
+ const mediaCache = new Map<string, { entry: InboundMedia; createdAt: number; size: number; summary?: string }>();

  type StreamState = {
  streamId: string;
@@ -34,6 +37,7 @@ type StreamState = {
  type InboundMedia = {
  path: string;
  type: string;
+ mimeType?: string;
  url?: string;
  };

@@ -382,6 +386,9 @@ async function startAgentForStream(params: {
  if (inbound.media) {
  ctxPayload.MediaPath = inbound.media.path;
  ctxPayload.MediaType = inbound.media.type;
+ if (inbound.media.mimeType) {
+ (ctxPayload as any).MediaMimeType = inbound.media.mimeType;
+ }
  if (inbound.media.url) {
  ctxPayload.MediaUrl = inbound.media.url;
  }
@@ -584,6 +591,18 @@ async function buildBotMediaMessage(params: {
  if (!url && !base64) return { text: fallbackLabel };

  try {
+ const cacheKey = buildMediaCacheKey({ url, base64 });
+ const cached = await getCachedMedia(cacheKey, resolveMediaRetentionMs(target));
+ if (cached) {
+ const text = msgtype === "image" && cached.summary
+ ? `[用户发送了一张图片]\n\n[图片识别结果]\n${cached.summary}\n\n请根据识别结果回复用户。`
+ : buildInboundMediaPrompt(msgtype, filename);
+ return {
+ text,
+ media: cached.media,
+ };
+ }
+
  let buffer: Buffer | null = null;
  let contentType = "";
  if (base64) {
@@ -629,9 +648,16 @@ async function buildBotMediaMessage(params: {
  const safeName = sanitizeFilename(filename || "", `file-${Date.now()}.${ext}`);
  const tempFilePath = join(tempDir, safeName);
  await writeFile(tempFilePath, buffer);
+ const media: InboundMedia = {
+ path: tempFilePath,
+ type: "file",
+ mimeType: contentType || "application/octet-stream",
+ url,
+ };
+ storeCachedMedia(cacheKey, media, buffer.length);
  return {
- text: `[用户发送了一个文件: ${safeName},已保存到: ${tempFilePath}]\n\n请根据文件内容回复用户。`,
- media: { path: tempFilePath, type: contentType || "application/octet-stream", url },
+ text: buildInboundMediaPrompt("file", safeName),
+ media,
  };
  }

@@ -642,21 +668,52 @@ async function buildBotMediaMessage(params: {
  await writeFile(tempPath, buffer);

  if (msgtype === "image") {
+ const media: InboundMedia = {
+ path: tempPath,
+ type: "image",
+ mimeType: contentType || "image/jpeg",
+ url,
+ };
+ const visionConfig = resolveVisionConfig(target.account.config);
+ const summary = visionConfig
+ ? await describeImageWithVision({
+ config: visionConfig,
+ buffer,
+ mimeType: media.mimeType || "image/jpeg",
+ })
+ : null;
+ storeCachedMedia(cacheKey, media, buffer.length, summary ?? undefined);
  return {
- text: `[用户发送了一张图片,已保存到: ${tempPath}]\n\n请使用 Read 工具查看这张图片并描述内容。`,
- media: { path: tempPath, type: contentType || "image/jpeg", url },
+ text: summary
+ ? `[用户发送了一张图片]\n\n[图片识别结果]\n${summary}\n\n请根据识别结果回复用户。`
+ : buildInboundMediaPrompt("image"),
+ media,
  };
  }
  if (msgtype === "voice") {
+ const media: InboundMedia = {
+ path: tempPath,
+ type: "voice",
+ mimeType: contentType || "audio/amr",
+ url,
+ };
+ storeCachedMedia(cacheKey, media, buffer.length);
  return {
- text: `[用户发送了一条语音消息,已保存到: ${tempPath}]\n\n请根据语音内容回复用户。`,
- media: { path: tempPath, type: contentType || "audio/amr", url },
+ text: buildInboundMediaPrompt("voice"),
+ media,
  };
  }
  if (msgtype === "video") {
+ const media: InboundMedia = {
+ path: tempPath,
+ type: "video",
+ mimeType: contentType || "video/mp4",
+ url,
+ };
+ storeCachedMedia(cacheKey, media, buffer.length);
  return {
- text: `[用户发送了一个视频文件,已保存到: ${tempPath}]\n\n请根据视频内容回复用户。`,
- media: { path: tempPath, type: contentType || "video/mp4", url },
+ text: buildInboundMediaPrompt("video"),
+ media,
  };
  }
  return { text: fallbackLabel };
@@ -741,6 +798,65 @@ function mediaSentLabel(type: string): string {
  return "[已发送媒体]";
  }

+ function resolveMediaRetentionMs(target: WecomWebhookTarget): number | undefined {
+ const hours = target.account.config.media?.retentionHours;
+ return typeof hours === "number" && hours > 0 ? hours * 3600 * 1000 : undefined;
+ }
+
+ function hashCacheKey(input: string): string {
+ return crypto.createHash("sha1").update(input).digest("hex");
+ }
+
+ function buildMediaCacheKey(params: { url?: string; base64?: string }): string | null {
+ if (params.url) return `url:${hashCacheKey(params.url)}`;
+ if (params.base64) return `b64:${hashCacheKey(params.base64)}`;
+ return null;
+ }
+
+ function pruneMediaCache(): void {
+ if (mediaCache.size <= MEDIA_CACHE_MAX_ENTRIES) return;
+ const entries = Array.from(mediaCache.entries())
+ .sort((a, b) => a[1].createdAt - b[1].createdAt);
+ const excess = entries.length - MEDIA_CACHE_MAX_ENTRIES;
+ for (let i = 0; i < excess; i += 1) {
+ mediaCache.delete(entries[i]![0]);
+ }
+ }
+
+ async function getCachedMedia(
+ key: string | null,
+ retentionMs?: number,
+ ): Promise<{ media: InboundMedia; summary?: string } | null> {
+ if (!key) return null;
+ const cached = mediaCache.get(key);
+ if (!cached) return null;
+ if (retentionMs && Date.now() - cached.createdAt > retentionMs) {
+ mediaCache.delete(key);
+ return null;
+ }
+ try {
+ await stat(cached.entry.path);
+ } catch {
+ mediaCache.delete(key);
+ return null;
+ }
+ return { media: cached.entry, summary: cached.summary };
+ }
+
+ function storeCachedMedia(key: string | null, entry: InboundMedia, size: number, summary?: string): void {
+ if (!key) return;
+ mediaCache.set(key, { entry, createdAt: Date.now(), size, summary });
+ pruneMediaCache();
+ }
+
+ function buildInboundMediaPrompt(msgtype: "image" | "voice" | "video" | "file", filename?: string): string {
+ if (msgtype === "image") return "[用户发送了一张图片]\n\n请根据图片内容回复用户。";
+ if (msgtype === "voice") return "[用户发送了一条语音消息]\n\n请根据语音内容回复用户。";
+ if (msgtype === "video") return "[用户发送了一个视频文件]\n\n请根据视频内容回复用户。";
+ const label = filename ? `用户发送了一个文件: ${filename}` : "用户发送了一个文件";
+ return `[${label}]\n\n请根据文件内容回复用户。`;
+ }
+
  function shouldHandleBot(account: ResolvedWecomAccount): boolean {
  return account.mode === "bot" || account.mode === "both";
  }