skyloom 1.21.0 → 1.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -24,10 +24,79 @@ export interface InboundMessage {
24
24
  text: string;
25
25
  /** Where to send the reply (channel-specific opaque target). */
26
26
  replyTo: ReplyTarget;
27
+ /** Media attachments on this message (image / audio / file / …). */
28
+ media?: MediaAttachment[];
27
29
  /** Raw event for adapters that need more than the normalized fields. */
28
30
  raw?: unknown;
29
31
  }
30
32
 
33
/**
 * One non-text item attached to an inbound message, expressed in the
 * channel-neutral shape shared by every adapter.
 */
export interface MediaAttachment {
  /** Coarse media category. */
  kind:
    | 'image'
    | 'audio'
    | 'video'
    | 'file'
    | 'sticker'
    | 'other';
  /** Channel-specific id/key used to fetch the binary (image_key, media_id, url…). */
  ref?: string;
  /** Filename as reported by the platform, if it reported one. */
  filename?: string;
  /** MIME type, if known. */
  mimeType?: string;
  /** Direct URL, if the platform exposes one. */
  url?: string;
}
45
+
46
/** An outbound media item the agent wants to send (parsed from its reply). */
export interface OutboundMedia {
  kind: 'image' | 'file';
  /** Local filesystem path or http(s) URL to the binary. */
  src: string;
  /** Optional caption / alt text. */
  alt?: string;
}

/** The result of splitting an agent reply into plain text + outbound media. */
export interface ParsedReply {
  text: string;
  media: OutboundMedia[];
}

/**
 * Matches either media form in one pass so hits are reported in document order.
 *   groups 1-3: [[image|file:src|alt]]   (kind, src, optional alt)
 *   groups 4-5: ![alt](src "title")      (alt, src; the title is discarded)
 */
const MEDIA_DIRECTIVE_RE =
  /\[\[(image|file):([^\]|]+)(?:\|([^\]]*))?\]\]|!\[([^\]]*)\]\(([^)\s]+)(?:\s+"[^"]*")?\)/gi;

/**
 * Parse media directives out of an agent's reply so channels can upload+send
 * them. Recognized forms (stripped from the returned text):
 *   - Markdown image:  ![alt](src)  or  ![alt](src "title")
 *   - Explicit image:  [[image:src]] or [[image:src|alt]]
 *   - Explicit file:   [[file:src]]  or [[file:src|alt]]
 *
 * `media` lists the items in the order they appear in `reply`, regardless of
 * which form was used. A directive whose source is empty after trimming is not
 * media and is left in the text as written.
 *
 * This function does NOT check that `src` is an http(s) URL or an existing
 * file; every syntactic match is extracted. Validating the source before
 * sending is the caller's responsibility.
 *
 * @param reply Raw reply text produced by the agent.
 * @returns The reply with directives removed (runs of 3+ newlines collapsed to
 *          two, then trimmed) plus the extracted media items.
 */
export function parseReply(reply: string): ParsedReply {
  const media: OutboundMedia[] = [];

  const stripped = reply.replace(
    MEDIA_DIRECTIVE_RE,
    (
      match: string,
      kind?: string,
      directiveSrc?: string,
      directiveAlt?: string,
      markdownAlt?: string,
      markdownSrc?: string,
    ) => {
      const src = (directiveSrc ?? markdownSrc ?? '').trim();
      // e.g. "[[image: ]]" — nothing to send, so keep the text untouched.
      if (!src) return match;

      const alt = (directiveAlt ?? markdownAlt ?? '').trim() || undefined;
      // `kind` is only captured by the [[...]] form; markdown is always an image.
      const resolvedKind = kind ? (kind.toLowerCase() as OutboundMedia['kind']) : 'image';
      media.push({ kind: resolvedKind, src, alt });
      return '';
    },
  );

  return { text: stripped.replace(/\n{3,}/g, '\n\n').trim(), media };
}
88
+
89
/**
 * Render a media list into a compact, model-readable description line.
 *
 * Each attachment becomes `[kind: label]`, or `[kind]` when no label is
 * available; entries are joined with single spaces. The label is the first
 * non-blank value among `filename`, `ref` and `url`.
 *
 * Labels come from the chat platform (ultimately from users), so whitespace
 * runs — including newlines and tabs — are collapsed to one space. This keeps
 * the result on a single line and lets a blank filename fall through to `ref`
 * or `url` instead of producing an empty label.
 *
 * @param media Attachments of an inbound message; may be undefined or empty.
 * @returns The description line, or '' when there is nothing to describe.
 */
export function describeMedia(media: MediaAttachment[] | undefined): string {
  if (!media || media.length === 0) return '';

  return media
    .map((m) => {
      const label =
        [m.filename, m.ref, m.url]
          .map((candidate) => (candidate ?? '').replace(/\s+/g, ' ').trim())
          .find((candidate) => candidate !== '') ?? '';
      return label ? `[${m.kind}: ${label}]` : `[${m.kind}]`;
    })
    .join(' ');
}
99
+
31
100
  /** Opaque, channel-specific destination for an outbound reply. */
32
101
  export interface ReplyTarget {
33
102
  channel: string;
@@ -85,6 +154,29 @@ export interface ChannelAdapter {
85
154
 
86
155
  /** Send a text reply back to the channel. */
87
156
  send(target: ReplyTarget, text: string): Promise<void>;
157
+
158
+ /**
159
+ * Optional streaming reply: consume the agent's text chunks and render them
160
+ * progressively (e.g. a Feishu card patched as text accumulates). When an
161
+ * adapter implements this, the gateway prefers it over `send`. Implementations
162
+ * should throttle their own updates and tolerate an empty/aborted stream.
163
+ */
164
+ sendStreaming?(target: ReplyTarget, chunks: AsyncIterable<string>): Promise<void>;
165
+
166
+ /**
167
+ * Optional: upload and send an image or file. When an adapter implements this,
168
+ * the gateway extracts media directives from the agent's reply (parseReply)
169
+ * and delivers them after the text. Adapters without it simply keep the
170
+ * media reference in the text.
171
+ */
172
+ sendMedia?(target: ReplyTarget, item: OutboundMedia): Promise<void>;
173
+
174
+ /**
175
+ * Optional: download an inbound media attachment's bytes so the gateway can
176
+ * run vision over an image. `att` is one entry from InboundMessage.media.
177
+ * Returns the binary or null if it can't be fetched.
178
+ */
179
+ fetchMedia?(att: MediaAttachment, msg: InboundMessage): Promise<{ data: Buffer; contentType?: string } | null>;
88
180
  }
89
181
 
90
182
  /** Factory signature: build an adapter from its config block (or null if disabled/misconfigured). */
@@ -0,0 +1,78 @@
1
+ /**
2
+ * Vision describe — turn an inbound image into a text description so the agent
3
+ * can "see" what the user sent, without rewiring the core text-only LLM loop.
4
+ *
5
+ * Self-contained on purpose: a single OpenAI-compatible chat/completions call
6
+ * with an image_url (base64 data URL) content block. The model + key are
7
+ * resolved from config.channels.<id>.visionModel / config.llm.vision_model
8
+ * (default gpt-4o-mini), falling back to env keys the same way the rest of
9
+ * Skyloom does. If no key/model is available, vision is skipped silently and the
10
+ * gateway just uses the media description line.
11
+ */
12
+
13
+ import axios from 'axios';
14
+ import { getLogger } from '../core/logger';
15
+ import type { LoadedMedia } from './helpers';
16
+
17
+ const log = getLogger('gateway-vision');
18
+
19
/** How one vendor is recognised from a model id, and where/with what it is called. */
interface VisionProvider {
  /** Lower-case substrings; a model id containing any of them selects this provider. */
  needles: readonly string[];
  /** API base URL (no trailing slash). */
  baseUrl: string;
  /** Environment variable names to try, in order, for the API key. */
  envKeys: readonly string[];
}

/**
 * Single source of truth for provider detection, checked in order. Keeping the
 * URL and the key names in one row guarantees a key is only ever paired with
 * its own vendor's endpoint.
 */
const VISION_PROVIDERS: readonly VisionProvider[] = [
  // Anthropic's endpoint is not OpenAI-chat-shaped; it is listed so that a
  // claude model id never resolves to another vendor's key.
  { needles: ['claude'], baseUrl: 'https://api.anthropic.com/v1', envKeys: ['ANTHROPIC_API_KEY'] },
  {
    needles: ['gemini'],
    baseUrl: 'https://generativelanguage.googleapis.com/v1beta/openai',
    envKeys: ['GEMINI_API_KEY', 'GOOGLE_API_KEY'],
  },
  { needles: ['grok', 'xai'], baseUrl: 'https://api.x.ai/v1', envKeys: ['XAI_API_KEY'] },
  {
    needles: ['qwen', 'dashscope'],
    baseUrl: 'https://dashscope.aliyuncs.com/compatible-mode/v1',
    envKeys: ['DASHSCOPE_API_KEY', 'QWEN_API_KEY'],
  },
];

/** Used when no needle matches the model id. */
const DEFAULT_VISION_PROVIDER: VisionProvider = {
  needles: [],
  baseUrl: 'https://api.openai.com/v1',
  envKeys: ['OPENAI_API_KEY'],
};

/** Pick the provider row for a model id (case-insensitive substring match). */
function providerFor(model: string): VisionProvider {
  const id = model.toLowerCase();
  return (
    VISION_PROVIDERS.find((p) => p.needles.some((needle) => id.includes(needle))) ??
    DEFAULT_VISION_PROVIDER
  );
}

/**
 * Base URL for the provider inferred from the model id.
 *
 * @param model Model id, e.g. "gpt-4o-mini" or "gemini-1.5-flash".
 * @returns The provider's base URL; OpenAI's when the id matches no known vendor.
 */
function baseUrlFor(model: string): string {
  return providerFor(model).baseUrl;
}

/**
 * Resolve an API key for the vision model from env (best-effort).
 *
 * The key names tried are those of the same provider `baseUrlFor` selects, so a
 * claude model id resolves ANTHROPIC_API_KEY rather than OPENAI_API_KEY.
 *
 * @param model Model id used to infer the provider.
 * @param env   Environment map to read from.
 * @returns The first non-empty candidate value, or undefined if none is set.
 */
function keyFor(model: string, env: NodeJS.ProcessEnv): string | undefined {
  for (const name of providerFor(model).envKeys) {
    const value = env[name];
    if (value) return value;
  }
  return undefined;
}
39
+
40
/** Optional knobs for `describeImages`; every field may be omitted. */
export interface VisionOptions {
  /** Model id to call; `describeImages` uses 'gpt-4o-mini' when unset. */
  model?: string;
  /** Environment to read API keys from; `describeImages` uses `process.env` when unset. */
  env?: NodeJS.ProcessEnv;
  /** Instruction sent alongside the images; a built-in prompt is used when unset. */
  prompt?: string;
}
45
+
46
/**
 * Ask a vision-capable chat model to describe the given images.
 *
 * Sends one chat/completions request containing a text prompt followed by at
 * most the first four images, each inlined as a base64 data URL (content type
 * defaults to image/png).
 *
 * @param images Loaded image binaries; an empty list short-circuits to null.
 * @param opts   Model / env / prompt overrides.
 * @returns The trimmed description, or null when there are no images, the
 *          model id contains "claude", no API key is found, the response has
 *          no non-empty string content, or the request throws (logged as
 *          `vision_describe_failed`). Callers fall back to the media line.
 */
export async function describeImages(images: LoadedMedia[], opts: VisionOptions = {}): Promise<string | null> {
  if (!images.length) return null;

  const model = opts.model || 'gpt-4o-mini';
  // Anthropic isn't OpenAI-chat-shaped here; skip to keep this helper simple.
  if (model.toLowerCase().includes('claude')) return null;

  const key = keyFor(model, opts.env || process.env);
  if (!key) return null;

  const textPart = {
    type: 'text',
    text: opts.prompt || '请用中文简洁描述这些图片的内容(关键物体、文字、场景);如果含可读文字请转写出来。',
  };
  const imageParts = images.slice(0, 4).map((img) => {
    const mime = img.contentType || 'image/png';
    return { type: 'image_url', image_url: { url: `data:${mime};base64,${img.data.toString('base64')}` } };
  });
  const content = [textPart, ...imageParts];

  try {
    const payload = { model, messages: [{ role: 'user', content }], max_tokens: 500, temperature: 0.2 };
    const requestConfig = {
      headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${key}` },
      timeout: 30000,
      validateStatus: (s: number) => s >= 200 && s < 300,
    };
    const res = await axios.post(`${baseUrlFor(model)}/chat/completions`, payload, requestConfig);

    const text = res.data?.choices?.[0]?.message?.content;
    if (typeof text !== 'string') return null;
    const trimmed = text.trim();
    return trimmed ? trimmed : null;
  } catch (e) {
    log.warn('vision_describe_failed', { model, error: String(e).slice(0, 160) });
    return null;
  }
}
@@ -1,6 +1,9 @@
1
1
  import { describe, it, expect } from "vitest";
2
2
  import * as crypto from "crypto";
3
3
  import { resolveSecret, TokenCache } from "../src/gateway/helpers";
4
+ import { describeMedia, parseReply } from "../src/gateway/types";
5
+ import { isSendableSrc } from "../src/gateway/helpers";
6
+ import { describeImages } from "../src/gateway/vision";
4
7
  import { buildAdapters, SUPPORTED_CHANNELS } from "../src/gateway/registry";
5
8
  import { decryptFeishu, createFeishuAdapter } from "../src/gateway/channels/feishu";
6
9
  import { wecomSignature, decryptWecom, createWecomAdapter } from "../src/gateway/channels/wecom";
@@ -35,6 +38,132 @@ describe("gateway · helpers", () => {
35
38
  });
36
39
  });
37
40
 
41
describe("gateway · media", () => {
  it("describeMedia renders a compact readable line", () => {
    // A missing list and an empty list both render as the empty string.
    for (const nothing of [undefined, []]) {
      expect(describeMedia(nothing)).toBe("");
    }

    const single = describeMedia([{ kind: "image", ref: "img_1" }]);
    expect(single).toBe("[image: img_1]");

    const pair = describeMedia([
      { kind: "file", filename: "report.pdf" },
      { kind: "audio", ref: "a_2" },
    ]);
    expect(pair).toBe("[file: report.pdf] [audio: a_2]");
  });

  it("feishu normalizes an image message to a media attachment", async () => {
    const adapter = createFeishuAdapter({ appId: "a", appSecret: "s" }, {})!;

    const header = { event_id: "img1", event_type: "im.message.receive_v1" };
    const message = {
      chat_id: "c",
      message_type: "image",
      content: JSON.stringify({ image_key: "img_xxx" }),
    };
    const event = { sender: { sender_id: { open_id: "o" } }, message };

    const result = await adapter.handleWebhook(req({ body: JSON.stringify({ header, event }) }));
    expect(result.message?.media?.[0]).toMatchObject({ kind: "image", ref: "img_xxx" });
  });

  it("wecom normalizes a voice message to an audio attachment", async () => {
    // Build an encrypted callback body by hand: random(16) + len(4, BE) + xml + corpId,
    // padded to a 32-byte boundary, AES-256-CBC with IV = first 16 key bytes.
    const encodingAesKey = crypto.randomBytes(32).toString("base64").slice(0, 43);
    const keyBytes = Buffer.from(encodingAesKey + "=", "base64");
    const iv = keyBytes.subarray(0, 16);

    const xml = "<xml><MsgType><![CDATA[voice]]></MsgType><FromUserName><![CDATA[u9]]></FromUserName><MediaId><![CDATA[mid]]></MediaId><Format><![CDATA[amr]]></Format></xml>";
    const randomPrefix = crypto.randomBytes(16);
    const xmlBytes = Buffer.from(xml, "utf8");
    const lengthPrefix = Buffer.alloc(4);
    lengthPrefix.writeUInt32BE(xmlBytes.length, 0);

    const plaintext = Buffer.concat([randomPrefix, lengthPrefix, xmlBytes, Buffer.from("corp1", "utf8")]);
    const padLength = 32 - (plaintext.length % 32);
    const padded = Buffer.concat([plaintext, Buffer.alloc(padLength, padLength)]);

    const cipher = crypto.createCipheriv("aes-256-cbc", keyBytes, iv);
    cipher.setAutoPadding(false);
    const encrypted = Buffer.concat([cipher.update(padded), cipher.final()]).toString("base64");

    const adapter = createWecomAdapter(
      { corpId: "corp1", corpSecret: "s", token: "tok", encodingAesKey, agentId: 1 },
      {},
    )!;
    const query = new URLSearchParams({
      msg_signature: wecomSignature("tok", "1", "n", encrypted),
      timestamp: "1",
      nonce: "n",
    });
    const body = `<xml><Encrypt><![CDATA[${encrypted}]]></Encrypt></xml>`;

    const result = await adapter.handleWebhook(req({ method: "POST", query, body }));
    expect(result.message?.media?.[0]).toMatchObject({ kind: "audio", ref: "mid" });
  });
});
85
+
86
describe("gateway · parseReply (outbound media)", () => {
  it("extracts a markdown image and strips it from the text", () => {
    const { media, text } = parseReply("看这张图 ![猫](https://x.com/cat.png) 好看吧");

    expect(media).toEqual([{ kind: "image", src: "https://x.com/cat.png", alt: "猫" }]);
    expect(text).toContain("看这张图");
    expect(text).not.toContain("![");
  });

  it("extracts [[image:...]] and [[file:...|alt]] directives", () => {
    const reply = ["结果:", "[[image:/tmp/out.png]]", "[[file:/tmp/report.pdf|季度报告]]"].join("\n");
    const { media, text } = parseReply(reply);

    expect(media).toEqual([
      { kind: "image", src: "/tmp/out.png", alt: undefined },
      { kind: "file", src: "/tmp/report.pdf", alt: "季度报告" },
    ]);
    expect(text).toBe("结果:");
  });

  it("leaves text without media untouched", () => {
    const plain = "就是一段普通文字";
    const { media, text } = parseReply(plain);

    expect(media).toHaveLength(0);
    expect(text).toBe("就是一段普通文字");
  });

  it("handles multiple images in one reply", () => {
    const { media } = parseReply("![a](http://h/1.png) 和 ![b](http://h/2.png)");
    const sources = media.map((item) => item.src);

    expect(sources).toEqual(["http://h/1.png", "http://h/2.png"]);
  });
});
114
+
115
describe("gateway · isSendableSrc", () => {
  it("accepts http(s) URLs, rejects bare non-existent paths", () => {
    // Remote URLs are sendable for both schemes.
    for (const url of ["https://x.com/a.png", "http://x.com/a.png"]) {
      expect(isSendableSrc(url)).toBe(true);
    }
    // A path that does not exist, and free text, are not.
    for (const notSendable of ["/no/such/file/xyz.png", "not a path"]) {
      expect(isSendableSrc(notSendable)).toBe(false);
    }
  });
});
123
+
124
describe("gateway · sendMedia capability", () => {
  it("all three adapters expose sendMedia", () => {
    const feishu = createFeishuAdapter({ appId: "a", appSecret: "s" }, {})!;
    const wecom = createWecomAdapter(
      { corpId: "c", corpSecret: "s", token: "t", encodingAesKey: "k".repeat(43), agentId: 1 },
      {},
    )!;
    const qq = createQQAdapter({ appId: "1", secret: "supersecretseedvalue" }, {})!;

    // Checked in the same order the adapters were built: feishu, wecom, qq.
    for (const adapter of [feishu, wecom, qq]) {
      expect(typeof adapter.sendMedia).toBe("function");
    }
  });

  it("qq sendMedia rejects a non-URL source", async () => {
    const qq = createQQAdapter({ appId: "1", secret: "supersecretseedvalue" }, {})!;

    // A local path is passed where the error message says an http(s) URL is expected.
    const attempt = qq.sendMedia!(
      { channel: "qq", kind: "group", groupOpenid: "g" },
      { kind: "image", src: "/local/file.png" },
    );

    await expect(attempt).rejects.toThrow(/http\(s\) URL/);
  });
});
140
+
141
describe("gateway · vision (multimodal read)", () => {
  it("describeImages returns null with no images", async () => {
    const described = await describeImages([], { model: "gpt-4o-mini", env: {} });
    expect(described).toBeNull();
  });

  it("returns null when no API key is available (skips silently)", async () => {
    // Empty env: there is no key to resolve for the model.
    const png = { data: Buffer.from("x"), filename: "a.png", contentType: "image/png" };
    const described = await describeImages([png], { model: "gpt-4o-mini", env: {} });
    expect(described).toBeNull();
  });

  it("skips Anthropic models (not OpenAI-chat-shaped here)", async () => {
    const png = { data: Buffer.from("x"), filename: "a.png" };
    const described = await describeImages([png], {
      model: "claude-sonnet-4-6",
      env: { ANTHROPIC_API_KEY: "k" },
    });
    expect(described).toBeNull();
  });

  it("all three adapters expose fetchMedia", () => {
    const feishu = createFeishuAdapter({ appId: "a", appSecret: "s" }, {})!;
    const wecom = createWecomAdapter(
      { corpId: "c", corpSecret: "s", token: "t", encodingAesKey: "k".repeat(43), agentId: 1 },
      {},
    )!;
    const qq = createQQAdapter({ appId: "1", secret: "supersecretseedvalue" }, {})!;

    for (const adapter of [feishu, wecom, qq]) {
      expect(typeof adapter.fetchMedia).toBe("function");
    }
  });

  it("qq fetchMedia returns null without a url", async () => {
    const qq = createQQAdapter({ appId: "1", secret: "supersecretseedvalue" }, {})!;
    const fetched = await qq.fetchMedia!({ kind: "image" } as any, {} as any);
    expect(fetched).toBeNull();
  });
});
166
+
38
167
  describe("gateway · registry", () => {
39
168
  it("lists the three supported channels", () => {
40
169
  expect(SUPPORTED_CHANNELS.sort()).toEqual(["feishu", "qq", "wecom"]);
@@ -56,6 +185,32 @@ describe("gateway · registry", () => {
56
185
  });
57
186
  });
58
187
 
188
describe("gateway · streaming dispatch", () => {
  // Scope note: the first case only exercises consuming an async iterable of
  // text chunks (the input shape sendStreaming takes); it does not call the
  // gateway or an adapter. The other two check that the Feishu adapter exposes
  // sendStreaming under the default config and with renderMode "raw".
  it("collects streamed chunks via an async iterable", async () => {
    async function* chunks() {
      yield "你";
      yield "好";
      yield "世界";
    }

    // Stand-in for an adapter accumulating text as it arrives: plain concat.
    const received: string[] = [];
    for await (const piece of chunks()) {
      received.push(piece);
    }

    expect(received.join("")).toBe("你好世界");
    expect(received).toHaveLength(3);
  });

  it("feishu exposes sendStreaming when card rendering is on (default)", () => {
    const adapter = createFeishuAdapter({ appId: "a", appSecret: "s" }, {})!;
    expect(typeof adapter.sendStreaming).toBe("function");
  });

  it("feishu still works in raw text mode (no card)", () => {
    const adapter = createFeishuAdapter({ appId: "a", appSecret: "s", renderMode: "raw" }, {})!;
    // Only the presence of the method is asserted here, not how it behaves in raw mode.
    expect(typeof adapter.sendStreaming).toBe("function");
  });
});
213
+
59
214
  describe("gateway · feishu", () => {
60
215
  it("AES round-trips (encrypt with the same scheme, then decrypt)", () => {
61
216
  const key = "my-encrypt-key";