skyloom 1.22.0 → 1.24.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/README.md +10 -1
  2. package/dist/cli/main.js +69 -0
  3. package/dist/cli/main.js.map +1 -1
  4. package/dist/core/commands.d.ts.map +1 -1
  5. package/dist/core/commands.js +10 -0
  6. package/dist/core/commands.js.map +1 -1
  7. package/dist/gateway/channels/feishu.d.ts.map +1 -1
  8. package/dist/gateway/channels/feishu.js +53 -0
  9. package/dist/gateway/channels/feishu.js.map +1 -1
  10. package/dist/gateway/channels/qq.d.ts.map +1 -1
  11. package/dist/gateway/channels/qq.js +45 -0
  12. package/dist/gateway/channels/qq.js.map +1 -1
  13. package/dist/gateway/channels/wecom.d.ts.map +1 -1
  14. package/dist/gateway/channels/wecom.js +41 -0
  15. package/dist/gateway/channels/wecom.js.map +1 -1
  16. package/dist/gateway/gateway.d.ts.map +1 -1
  17. package/dist/gateway/gateway.js +79 -9
  18. package/dist/gateway/gateway.js.map +1 -1
  19. package/dist/gateway/helpers.d.ts +23 -0
  20. package/dist/gateway/helpers.d.ts.map +1 -1
  21. package/dist/gateway/helpers.js +90 -0
  22. package/dist/gateway/helpers.js.map +1 -1
  23. package/dist/gateway/qr.d.ts +8 -0
  24. package/dist/gateway/qr.d.ts.map +1 -0
  25. package/dist/gateway/qr.js +23 -0
  26. package/dist/gateway/qr.js.map +1 -0
  27. package/dist/gateway/setup.d.ts +57 -0
  28. package/dist/gateway/setup.d.ts.map +1 -0
  29. package/dist/gateway/setup.js +127 -0
  30. package/dist/gateway/setup.js.map +1 -0
  31. package/dist/gateway/types.d.ts +39 -0
  32. package/dist/gateway/types.d.ts.map +1 -1
  33. package/dist/gateway/types.js +25 -0
  34. package/dist/gateway/types.js.map +1 -1
  35. package/dist/gateway/vision.d.ts +23 -0
  36. package/dist/gateway/vision.d.ts.map +1 -0
  37. package/dist/gateway/vision.js +77 -0
  38. package/dist/gateway/vision.js.map +1 -0
  39. package/package.json +2 -1
  40. package/src/cli/main.ts +62 -0
  41. package/src/core/commands.ts +10 -0
  42. package/src/gateway/channels/feishu.ts +49 -2
  43. package/src/gateway/channels/qq.ts +43 -2
  44. package/src/gateway/channels/wecom.ts +47 -2
  45. package/src/gateway/gateway.ts +77 -8
  46. package/src/gateway/helpers.ts +60 -0
  47. package/src/gateway/qr.ts +21 -0
  48. package/src/gateway/setup.ts +145 -0
  49. package/src/gateway/types.ts +58 -0
  50. package/src/gateway/vision.ts +78 -0
  51. package/tests/channel_setup.test.ts +88 -0
  52. package/tests/gateway.test.ts +84 -1
@@ -17,8 +17,8 @@
17
17
 
18
18
  import * as crypto from 'crypto';
19
19
  import { getLogger } from '../../core/logger';
20
- import { resolveSecret, postJson, TokenCache } from '../helpers';
21
- import type { ChannelAdapter, MediaAttachment, RawRequest, ReplyTarget, WebhookOutcome } from '../types';
20
+ import { resolveSecret, postJson, loadMedia, TokenCache } from '../helpers';
21
+ import type { ChannelAdapter, MediaAttachment, OutboundMedia, RawRequest, ReplyTarget, WebhookOutcome } from '../types';
22
22
 
23
23
  const log = getLogger('channel-qq');
24
24
 
@@ -147,5 +147,46 @@ export function createQQAdapter(cfg: any, env: NodeJS.ProcessEnv): ChannelAdapte
147
147
  throw new Error(`qq send error: ${e?.response?.status || ''} ${String(e?.message || e).slice(0, 120)}`);
148
148
  }
149
149
  },
150
+
151
+ // QQ's v2 rich-media flow takes a URL (the platform fetches it): POST
152
+ // /files → file_info, then send msg_type:7 referencing that file_info.
153
+ // Group/C2C only; raw local bytes aren't supported, so the src must be a URL.
154
+ async sendMedia(target: ReplyTarget, item: OutboundMedia): Promise<void> {
155
+ if (!/^https?:\/\//i.test(item.src)) {
156
+ throw new Error('qq sendMedia requires an http(s) URL (platform fetches it)');
157
+ }
158
+ const base = target.kind === 'group'
159
+ ? `https://api.sgroup.qq.com/v2/groups/${target.groupOpenid}`
160
+ : target.kind === 'c2c'
161
+ ? `https://api.sgroup.qq.com/v2/users/${target.userOpenid}`
162
+ : null;
163
+ if (!base) throw new Error('qq sendMedia unsupported for channel target');
164
+ const headers = { ...(await authHeaders()), 'Content-Type': 'application/json' };
165
+ // file_type: 1=image 2=video 3=audio 4=file
166
+ const fileType = item.kind === 'image' ? 1 : 4;
167
+ let fileInfo: string;
168
+ try {
169
+ const up = await postJson(`${base}/files`, { file_type: fileType, url: item.src, srv_send_msg: false }, { headers });
170
+ fileInfo = up.file_info;
171
+ } catch (e: any) {
172
+ if (e?.response?.status === 401) tokenCache.invalidate();
173
+ throw new Error(`qq file upload error: ${e?.response?.status || ''} ${String(e?.message || e).slice(0, 120)}`);
174
+ }
175
+ const payload: any = { msg_type: 7, media: { file_info: fileInfo } };
176
+ if (target.msgId) payload.msg_id = target.msgId;
177
+ await postJson(`${base}/messages`, payload, { headers });
178
+ },
179
+
180
+ async fetchMedia(att: MediaAttachment): Promise<{ data: Buffer; contentType?: string } | null> {
181
+ // QQ delivers attachments with a direct URL — just download it.
182
+ if (!att.url) return null;
183
+ try {
184
+ const loaded = await loadMedia(att.url);
185
+ return { data: loaded.data, contentType: loaded.contentType || att.mimeType };
186
+ } catch (e) {
187
+ log.warn('qq_media_fetch_failed', { error: String(e) });
188
+ return null;
189
+ }
190
+ },
150
191
  };
151
192
  }
@@ -16,9 +16,10 @@
16
16
  */
17
17
 
18
18
  import * as crypto from 'crypto';
19
+ import axios from 'axios';
19
20
  import { getLogger } from '../../core/logger';
20
- import { resolveSecret, postJson, getJson, TokenCache } from '../helpers';
21
- import type { ChannelAdapter, MediaAttachment, RawRequest, ReplyTarget, WebhookOutcome } from '../types';
21
+ import { resolveSecret, postJson, getJson, postMultipart, loadMedia, TokenCache } from '../helpers';
22
+ import type { ChannelAdapter, MediaAttachment, OutboundMedia, RawRequest, ReplyTarget, WebhookOutcome } from '../types';
22
23
 
23
24
  const log = getLogger('channel-wecom');
24
25
 
@@ -147,5 +148,49 @@ export function createWecomAdapter(cfg: any, env: NodeJS.ProcessEnv): ChannelAda
147
148
  throw new Error(`wecom send error ${data.errcode}: ${data.errmsg}`);
148
149
  }
149
150
  },
151
+
152
+ async sendMedia(target: ReplyTarget, item: OutboundMedia): Promise<void> {
153
+ const toUser = target.toUser as string;
154
+ if (!toUser || !agentId) return;
155
+ const loaded = await loadMedia(item.src);
156
+ const accessToken = await tokenCache.get();
157
+ const type = item.kind === 'image' ? 'image' : 'file';
158
+ // Upload to the temporary-media store (valid 3 days), then push by media_id.
159
+ const up = await postMultipart(
160
+ `https://qyapi.weixin.qq.com/cgi-bin/media/upload?access_token=${encodeURIComponent(accessToken)}&type=${type}`,
161
+ { media: { data: loaded.data, filename: loaded.filename || (type === 'image' ? 'image.png' : 'file'), contentType: loaded.contentType } },
162
+ );
163
+ if (up.errcode && up.errcode !== 0) {
164
+ if (up.errcode === 42001 || up.errcode === 40014) tokenCache.invalidate();
165
+ throw new Error(`wecom media upload ${up.errcode}: ${up.errmsg}`);
166
+ }
167
+ const mediaId = up.media_id;
168
+ const body: any = { touser: toUser, msgtype: type, agentid: Number(agentId) };
169
+ body[type] = { media_id: mediaId };
170
+ const send = await postJson(
171
+ `https://qyapi.weixin.qq.com/cgi-bin/message/send?access_token=${encodeURIComponent(accessToken)}`,
172
+ body,
173
+ );
174
+ if (send.errcode !== 0) {
175
+ if (send.errcode === 42001 || send.errcode === 40014) tokenCache.invalidate();
176
+ throw new Error(`wecom media send ${send.errcode}: ${send.errmsg}`);
177
+ }
178
+ },
179
+
180
+ async fetchMedia(att: MediaAttachment): Promise<{ data: Buffer; contentType?: string } | null> {
181
+ if (!att.ref) return null;
182
+ const accessToken = await tokenCache.get();
183
+ const res = await axios.get(
184
+ `https://qyapi.weixin.qq.com/cgi-bin/media/get?access_token=${encodeURIComponent(accessToken)}&media_id=${encodeURIComponent(att.ref)}`,
185
+ { responseType: 'arraybuffer', timeout: 30000, validateStatus: (s) => s >= 200 && s < 300 },
186
+ );
187
+ // An error comes back as JSON, not the binary — detect and bail.
188
+ const ct = res.headers['content-type'];
189
+ if (typeof ct === 'string' && ct.includes('application/json')) {
190
+ log.warn('wecom_media_get_failed', { body: Buffer.from(res.data).toString('utf8').slice(0, 120) });
191
+ return null;
192
+ }
193
+ return { data: Buffer.from(res.data), contentType: typeof ct === 'string' ? ct : undefined };
194
+ },
150
195
  };
151
196
  }
@@ -16,8 +16,11 @@ import { createServer, IncomingMessage, ServerResponse } from 'http';
16
16
  import { getLogger } from '../core/logger';
17
17
  import { createSystemContext } from '../core/factory';
18
18
  import { buildAdapters } from './registry';
19
- import { describeMedia } from './types';
19
+ import { describeMedia, parseReply } from './types';
20
+ import { isSendableSrc } from './helpers';
21
+ import { describeImages } from './vision';
20
22
  import type { ChannelAdapter, InboundMessage, RawRequest } from './types';
23
+ import type { LoadedMedia } from './helpers';
21
24
 
22
25
  const log = getLogger('gateway');
23
26
 
@@ -29,11 +32,44 @@ async function readBody(req: IncomingMessage): Promise<Buffer> {
29
32
  }
30
33
 
31
34
  /** Run an agent turn for an inbound message and collect the final text reply. */
32
- /** Build the agent prompt from an inbound message (text + media description). */
33
- function buildPrompt(msg: InboundMessage): string {
35
+ /** Build the agent prompt: text + media description + any vision result. */
36
+ function buildPrompt(msg: InboundMessage, canSendMedia: boolean, visionText?: string | null): string {
37
+ const parts: string[] = [];
34
38
  const mediaDesc = describeMedia(msg.media);
35
- if (!mediaDesc) return msg.text;
36
- return msg.text ? `${msg.text}\n\n(用户还发送了媒体: ${mediaDesc})` : `用户发送了媒体: ${mediaDesc}`;
39
+ if (msg.text) parts.push(msg.text);
40
+ if (mediaDesc) parts.push(`(用户发送了媒体: ${mediaDesc})`);
41
+ if (visionText) parts.push(`(图片内容识别: ${visionText})`);
42
+ if (canSendMedia) {
43
+ parts.push('(若需回发图片或文件,在回复中用 Markdown 图片 ![说明](路径或URL) 或 [[file:路径或URL]] 表示,路径可为本地文件或 http(s) 链接。)');
44
+ }
45
+ return parts.join('\n\n') || msg.text;
46
+ }
47
+
48
+ /** Download inbound images and run vision over them. Returns null if disabled. */
49
+ async function visionForMessage(
50
+ ctx: ReturnType<typeof createSystemContext>,
51
+ adapter: ChannelAdapter,
52
+ msg: InboundMessage,
53
+ ): Promise<string | null> {
54
+ const chCfg = ((ctx.config as any).channels || {})[adapter.id] || {};
55
+ const llmCfg = (ctx.config as any).llm || {};
56
+ if (chCfg.vision === false) return null;
57
+ const model = chCfg.visionModel || llmCfg.vision_model || llmCfg.visionModel;
58
+ if (!model) return null; // vision is opt-in: requires a configured model
59
+ const images = (msg.media || []).filter((m) => m.kind === 'image');
60
+ if (!images.length || !adapter.fetchMedia) return null;
61
+
62
+ const loaded: LoadedMedia[] = [];
63
+ for (const att of images.slice(0, 4)) {
64
+ try {
65
+ const got = await adapter.fetchMedia(att, msg);
66
+ if (got) loaded.push({ data: got.data, filename: att.filename || 'image', contentType: got.contentType });
67
+ } catch (e) {
68
+ log.warn('vision_fetch_failed', { channel: adapter.id, error: String(e) });
69
+ }
70
+ }
71
+ if (!loaded.length) return null;
72
+ return describeImages(loaded, { model });
37
73
  }
38
74
 
39
75
  /** Resolve the agent for a channel message. */
@@ -52,16 +88,18 @@ async function dispatch(
52
88
  const agent = resolveAgent(ctx, adapter);
53
89
  if (!agent) throw new Error('no agent available');
54
90
  await agent.init();
55
- const prompt = buildPrompt(msg);
91
+ const visionText = await visionForMessage(ctx, adapter, msg);
92
+ const prompt = buildPrompt(msg, !!adapter.sendMedia, visionText);
56
93
 
57
94
  // Streaming path: stream content chunks straight to the adapter (e.g. a Feishu
58
95
  // card patched as text arrives). Falls back to collect-then-send otherwise.
59
96
  const cfgStreaming = ((ctx.config as any).channels || {})[adapter.id]?.streaming !== false;
60
97
  if (adapter.sendStreaming && cfgStreaming) {
98
+ let full = '';
61
99
  async function* contentChunks(): AsyncGenerator<string> {
62
100
  try {
63
101
  for await (const ev of agent.chatStream(prompt)) {
64
- if ((ev as any).type === 'content') yield (ev as any).text as string;
102
+ if ((ev as any).type === 'content') { const t = (ev as any).text as string; full += t; yield t; }
65
103
  }
66
104
  } catch (e) {
67
105
  log.warn('gateway_agent_failed', { channel: adapter.id, error: String(e) });
@@ -69,6 +107,8 @@ async function dispatch(
69
107
  }
70
108
  }
71
109
  await adapter.sendStreaming(msg.replyTo, contentChunks());
110
+ // After streaming the text, deliver any media the agent referenced.
111
+ await deliverMedia(adapter, msg, full);
72
112
  return;
73
113
  }
74
114
 
@@ -81,7 +121,36 @@ async function dispatch(
81
121
  log.warn('gateway_agent_failed', { channel: adapter.id, error: String(e) });
82
122
  text = `[出错了] ${String(e)}`;
83
123
  }
84
- await adapter.send(msg.replyTo, text.trim() || '(无回复)');
124
+ // Non-streaming: split out media so the text message is clean.
125
+ if (adapter.sendMedia) {
126
+ const parsed = parseReply(text);
127
+ await adapter.send(msg.replyTo, parsed.text || '(无回复)');
128
+ await deliverMedia(adapter, msg, text, parsed.media);
129
+ } else {
130
+ await adapter.send(msg.replyTo, text.trim() || '(无回复)');
131
+ }
132
+ }
133
+
134
+ /** Upload+send any media the agent referenced in its reply. Best-effort. */
135
+ async function deliverMedia(
136
+ adapter: ChannelAdapter,
137
+ msg: InboundMessage,
138
+ fullText: string,
139
+ pre?: ReturnType<typeof parseReply>['media'],
140
+ ): Promise<void> {
141
+ if (!adapter.sendMedia) return;
142
+ const media = pre ?? parseReply(fullText).media;
143
+ for (const item of media) {
144
+ if (!isSendableSrc(item.src)) {
145
+ log.warn('gateway_media_unsendable', { channel: adapter.id, src: item.src });
146
+ continue;
147
+ }
148
+ try {
149
+ await adapter.sendMedia(msg.replyTo, item);
150
+ } catch (e) {
151
+ log.warn('gateway_send_media_failed', { channel: adapter.id, src: item.src, error: String(e) });
152
+ }
153
+ }
85
154
  }
86
155
 
87
156
  export interface GatewayOptions {
@@ -5,6 +5,8 @@
5
5
  */
6
6
 
7
7
  import axios from 'axios';
8
+ import * as fs from 'fs';
9
+ import * as path from 'path';
8
10
 
9
11
  /**
10
12
  * Resolve a secret/config value. Accepts a literal string, or an env-ref object
@@ -58,6 +60,64 @@ export async function getJson(
58
60
  return res.data;
59
61
  }
60
62
 
63
+ /** A loaded binary plus its filename, ready to upload. */
64
+ export interface LoadedMedia {
65
+ data: Buffer;
66
+ filename: string;
67
+ contentType?: string;
68
+ }
69
+
70
+ /**
71
+ * Load media bytes from a local filesystem path or an http(s) URL. Local paths
72
+ * are read directly; remote URLs are fetched (capped at 30 MiB to avoid
73
+ * pulling something huge into memory). Throws if the source can't be loaded.
74
+ */
75
+ export async function loadMedia(src: string): Promise<LoadedMedia> {
76
+ if (/^https?:\/\//i.test(src)) {
77
+ const res = await axios.get(src, {
78
+ responseType: 'arraybuffer',
79
+ timeout: 30000,
80
+ maxContentLength: 30 * 1024 * 1024,
81
+ validateStatus: (s) => s >= 200 && s < 300,
82
+ });
83
+ const urlName = path.basename(new URL(src).pathname) || 'file';
84
+ const ct = res.headers['content-type'];
85
+ return {
86
+ data: Buffer.from(res.data),
87
+ filename: urlName,
88
+ contentType: typeof ct === 'string' ? ct : undefined,
89
+ };
90
+ }
91
+ const data = fs.readFileSync(src); // throws ENOENT if missing — caller handles
92
+ return { data, filename: path.basename(src) };
93
+ }
94
+
95
+ /** Is this a sendable media source (http(s) URL or an existing local file)? */
96
+ export function isSendableSrc(src: string): boolean {
97
+ if (/^https?:\/\//i.test(src)) return true;
98
+ try { return fs.existsSync(src) && fs.statSync(src).isFile(); } catch { return false; }
99
+ }
100
+
101
+ /** POST multipart/form-data (Node 18+ FormData/Blob), return parsed JSON. */
102
+ export async function postMultipart(
103
+ url: string,
104
+ fields: Record<string, string | { data: Buffer; filename: string; contentType?: string }>,
105
+ opts?: { headers?: Record<string, string>; timeoutMs?: number },
106
+ ): Promise<any> {
107
+ const form = new FormData();
108
+ for (const [k, v] of Object.entries(fields)) {
109
+ if (typeof v === 'string') form.append(k, v);
110
+ else form.append(k, new Blob([v.data], v.contentType ? { type: v.contentType } : undefined), v.filename);
111
+ }
112
+ const res = await axios.post(url, form, {
113
+ headers: { ...(opts?.headers || {}) },
114
+ timeout: opts?.timeoutMs ?? 30000,
115
+ maxBodyLength: Infinity,
116
+ validateStatus: (s) => s >= 200 && s < 300,
117
+ });
118
+ return res.data;
119
+ }
120
+
61
121
  /**
62
122
  * A small token cache: fetch an access token via `fetcher`, cache it until it
63
123
  * is near expiry, and refresh transparently. Channels (Feishu/WeCom) all need
@@ -0,0 +1,21 @@
1
+ /**
2
+ * Terminal QR rendering — a thin, dependency-isolated wrapper over
3
+ * qrcode-terminal (single file, zero transitive deps). Returns the QR as a
4
+ * string so callers control where it's written (and so it's testable).
5
+ */
6
+
7
+ // qrcode-terminal ships no types; declare the tiny surface we use.
8
+ // eslint-disable-next-line @typescript-eslint/no-require-imports
9
+ const qrcode: { generate: (text: string, opts: { small?: boolean }, cb: (s: string) => void) => void } =
10
+ require('qrcode-terminal');
11
+
12
+ /** Render `text` as a scannable QR code (compact) into a string. */
13
+ export function renderQR(text: string): string {
14
+ let out = '';
15
+ try {
16
+ qrcode.generate(text, { small: true }, (s: string) => { out = s; });
17
+ } catch {
18
+ return '';
19
+ }
20
+ return out;
21
+ }
@@ -0,0 +1,145 @@
1
+ /**
2
+ * Channel setup metadata + persistence — the data behind the `sky channels`
3
+ * wizard. Each channel declares the credential fields it needs, where to create
4
+ * the bot (a platform console URL we render as a QR for quick mobile access),
5
+ * and a short how-to. Kept pure/testable; the interactive prompts live in the
6
+ * CLI, and QR rendering is a thin wrapper over qrcode-terminal.
7
+ *
8
+ * Note: Feishu / WeCom / QQ are all official-bot APIs — credentials are created
9
+ * in each platform's developer console, there is no "scan to log in" the way
10
+ * personal WeChat works. So the QR here is a convenience link to the console
11
+ * (scan on your phone → open the console → create the bot → copy the keys), plus
12
+ * a QR of the gateway callback URL to paste back into the console.
13
+ */
14
+
15
+ export interface ChannelField {
16
+ /** Config key under channels.<id>. */
17
+ key: string;
18
+ /** Human label shown in the wizard. */
19
+ label: string;
20
+ /** Whether the wizard must collect it (some are optional). */
21
+ required: boolean;
22
+ /** Treat as a secret (mask input / store as env-ref suggestion). */
23
+ secret?: boolean;
24
+ /** Env var that also supplies this value. */
25
+ env?: string;
26
+ /** One-line hint on where to find it. */
27
+ hint?: string;
28
+ }
29
+
30
+ export interface ChannelSetupSpec {
31
+ id: string;
32
+ name: string;
33
+ /** Platform console where the bot/app is created (rendered as a QR). */
34
+ consoleUrl: string;
35
+ /** Docs link for the full setup walkthrough. */
36
+ docsUrl?: string;
37
+ /** Webhook path the platform must call back. */
38
+ webhookPath: string;
39
+ /** Ordered credential fields to collect. */
40
+ fields: ChannelField[];
41
+ /** Short, numbered how-to shown before collecting fields. */
42
+ steps: string[];
43
+ }
44
+
45
+ export const CHANNEL_SETUP: Record<string, ChannelSetupSpec> = {
46
+ feishu: {
47
+ id: 'feishu',
48
+ name: '飞书 / Lark',
49
+ consoleUrl: 'https://open.feishu.cn/app',
50
+ docsUrl: 'https://open.feishu.cn/document/home/index',
51
+ webhookPath: '/webhook/feishu',
52
+ fields: [
53
+ { key: 'appId', label: 'App ID', required: true, env: 'FEISHU_APP_ID', hint: '开发者后台 → 凭证与基础信息 → App ID' },
54
+ { key: 'appSecret', label: 'App Secret', required: true, secret: true, env: 'FEISHU_APP_SECRET', hint: '同页 App Secret' },
55
+ { key: 'verificationToken', label: 'Verification Token', required: false, secret: true, env: 'FEISHU_VERIFICATION_TOKEN', hint: '事件订阅 → Verification Token(可选)' },
56
+ { key: 'encryptKey', label: 'Encrypt Key', required: false, secret: true, env: 'FEISHU_ENCRYPT_KEY', hint: '事件订阅 → Encrypt Key(开启加密时填)' },
57
+ ],
58
+ steps: [
59
+ '扫码或打开 https://open.feishu.cn/app 创建「企业自建应用」',
60
+ '在「凭证与基础信息」复制 App ID / App Secret',
61
+ '开启「机器人」能力,在「权限管理」添加 im:message 等权限',
62
+ '「事件订阅」填入下方回调 URL,订阅 im.message.receive_v1',
63
+ ],
64
+ },
65
+ wecom: {
66
+ id: 'wecom',
67
+ name: '企业微信 WeCom',
68
+ consoleUrl: 'https://work.weixin.qq.com/wework_admin/frame',
69
+ docsUrl: 'https://developer.work.weixin.qq.com/document/path/90664',
70
+ webhookPath: '/webhook/wecom',
71
+ fields: [
72
+ { key: 'corpId', label: 'CorpID(企业ID)', required: true, env: 'WECOM_CORP_ID', hint: '管理后台 → 我的企业 → 企业ID' },
73
+ { key: 'corpSecret', label: 'Secret(应用Secret)', required: true, secret: true, env: 'WECOM_CORP_SECRET', hint: '应用管理 → 自建应用 → Secret' },
74
+ { key: 'agentId', label: 'AgentId', required: true, env: 'WECOM_AGENT_ID', hint: '同应用页 AgentId' },
75
+ { key: 'token', label: 'Token', required: true, secret: true, env: 'WECOM_TOKEN', hint: '应用 → 接收消息 → API 接收 → Token' },
76
+ { key: 'encodingAesKey', label: 'EncodingAESKey', required: true, secret: true, env: 'WECOM_AES_KEY', hint: '同页 EncodingAESKey(43 位)' },
77
+ ],
78
+ steps: [
79
+ '扫码或打开企业微信管理后台,进入「应用管理 → 自建 → 创建应用」',
80
+ '复制企业ID、应用 Secret、AgentId',
81
+ '「接收消息」选 API 接收,设置 Token 与 EncodingAESKey',
82
+ '把下方回调 URL 填入「URL」,保存时企业微信会回调验证',
83
+ ],
84
+ },
85
+ qq: {
86
+ id: 'qq',
87
+ name: 'QQ 机器人',
88
+ consoleUrl: 'https://q.qq.com/#/app/bot',
89
+ docsUrl: 'https://bot.q.qq.com/wiki/',
90
+ webhookPath: '/webhook/qq',
91
+ fields: [
92
+ { key: 'appId', label: 'AppID(机器人ID)', required: true, env: 'QQ_BOT_APPID', hint: 'QQ 开放平台 → 机器人 → 开发设置 → AppID' },
93
+ { key: 'secret', label: 'AppSecret', required: true, secret: true, env: 'QQ_BOT_SECRET', hint: '同页 AppSecret' },
94
+ ],
95
+ steps: [
96
+ '扫码或打开 https://q.qq.com 创建机器人,完成开发者认证',
97
+ '在「开发设置」复制 AppID 与 AppSecret',
98
+ '「回调配置」选择 Webhook,填入下方回调 URL',
99
+ '在沙箱里把机器人加为好友 / 拉进群进行测试',
100
+ ],
101
+ },
102
+ };
103
+
104
+ export const SETUP_CHANNEL_IDS = Object.keys(CHANNEL_SETUP);
105
+
106
+ /** Build the full webhook callback URL for a channel from a public base. */
107
+ export function callbackUrl(base: string, channelId: string): string {
108
+ const spec = CHANNEL_SETUP[channelId];
109
+ if (!spec) return '';
110
+ const trimmed = base.replace(/\/+$/, '');
111
+ return `${trimmed}${spec.webhookPath}`;
112
+ }
113
+
114
+ /**
115
+ * Persist a channel's collected values into ~/.skyloom/config.yaml under
116
+ * channels.<id>, merging with any existing block. Secret-looking values are
117
+ * stored as-is (the file is chmod 0600); callers may instead keep secrets in
118
+ * env and store an { source: env, id } ref. Returns the config path written.
119
+ */
120
+ export function saveChannelConfig(
121
+ channelId: string,
122
+ values: Record<string, string>,
123
+ opts?: { configPath?: string },
124
+ ): string {
125
+ const path = require('path');
126
+ const fs = require('fs');
127
+ const yaml = require('yaml');
128
+ const cfgPath = opts?.configPath || path.join(require('os').homedir(), '.skyloom', 'config.yaml');
129
+ const dir = path.dirname(cfgPath);
130
+ if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true, mode: 0o700 });
131
+ let cfg: any = {};
132
+ if (fs.existsSync(cfgPath)) { try { cfg = yaml.parse(fs.readFileSync(cfgPath, 'utf-8')) || {}; } catch { cfg = {}; } }
133
+ if (!cfg.channels) cfg.channels = {};
134
+ cfg.channels[channelId] = { ...(cfg.channels[channelId] || {}), ...values, enabled: true };
135
+ fs.writeFileSync(cfgPath, yaml.stringify(cfg), { encoding: 'utf-8', mode: 0o600 });
136
+ try { fs.chmodSync(cfgPath, 0o600); } catch { /* best-effort on Windows */ }
137
+ return cfgPath;
138
+ }
139
+
140
+ /** Which required fields are still missing from a values map. */
141
+ export function missingRequired(channelId: string, values: Record<string, string>): string[] {
142
+ const spec = CHANNEL_SETUP[channelId];
143
+ if (!spec) return [];
144
+ return spec.fields.filter((f) => f.required && !values[f.key]?.trim()).map((f) => f.key);
145
+ }
@@ -43,6 +43,49 @@ export interface MediaAttachment {
43
43
  url?: string;
44
44
  }
45
45
 
46
+ /** An outbound media item the agent wants to send (parsed from its reply). */
47
+ export interface OutboundMedia {
48
+ kind: 'image' | 'file';
49
+ /** Local filesystem path or http(s) URL to the binary. */
50
+ src: string;
51
+ /** Optional caption / alt text. */
52
+ alt?: string;
53
+ }
54
+
55
+ /** The result of splitting an agent reply into plain text + outbound media. */
56
+ export interface ParsedReply {
57
+ text: string;
58
+ media: OutboundMedia[];
59
+ }
60
+
61
+ /**
62
+ * Parse media directives out of an agent's reply so channels can upload+send
63
+ * them. Recognized forms (stripped from the returned text):
64
+ * - Markdown image: ![alt](src)
65
+ * - Explicit image: [[image:src]] or [[image:src|alt]]
66
+ * - Explicit file: [[file:src]] or [[file:src|alt]]
67
+ * `src` is a local path or http(s) URL. Only http(s) and existing local files
68
+ * are treated as media; anything else is left in the text untouched.
69
+ */
70
+ export function parseReply(reply: string): ParsedReply {
71
+ const media: OutboundMedia[] = [];
72
+ let text = reply;
73
+
74
+ // [[image:src|alt]] / [[file:src|alt]]
75
+ text = text.replace(/\[\[(image|file):([^\]|]+)(?:\|([^\]]*))?\]\]/gi, (_m, kind, src, alt) => {
76
+ media.push({ kind: kind.toLowerCase() as 'image' | 'file', src: String(src).trim(), alt: alt ? String(alt).trim() : undefined });
77
+ return '';
78
+ });
79
+
80
+ // Markdown images: ![alt](src)
81
+ text = text.replace(/!\[([^\]]*)\]\(([^)\s]+)(?:\s+"[^"]*")?\)/g, (_m, alt, src) => {
82
+ media.push({ kind: 'image', src: String(src).trim(), alt: alt ? String(alt).trim() : undefined });
83
+ return '';
84
+ });
85
+
86
+ return { text: text.replace(/\n{3,}/g, '\n\n').trim(), media };
87
+ }
88
+
46
89
  /** Render a media list into a compact, model-readable description line. */
47
90
  export function describeMedia(media: MediaAttachment[] | undefined): string {
48
91
  if (!media || media.length === 0) return '';
@@ -119,6 +162,21 @@ export interface ChannelAdapter {
119
162
  * should throttle their own updates and tolerate an empty/aborted stream.
120
163
  */
121
164
  sendStreaming?(target: ReplyTarget, chunks: AsyncIterable<string>): Promise<void>;
165
+
166
+ /**
167
+ * Optional: upload and send an image or file. When an adapter implements this,
168
+ * the gateway extracts media directives from the agent's reply (parseReply)
169
+ * and delivers them after the text. Adapters without it simply keep the
170
+ * media reference in the text.
171
+ */
172
+ sendMedia?(target: ReplyTarget, item: OutboundMedia): Promise<void>;
173
+
174
+ /**
175
+ * Optional: download an inbound media attachment's bytes so the gateway can
176
+ * run vision over an image. `att` is one entry from InboundMessage.media.
177
+ * Returns the binary or null if it can't be fetched.
178
+ */
179
+ fetchMedia?(att: MediaAttachment, msg: InboundMessage): Promise<{ data: Buffer; contentType?: string } | null>;
122
180
  }
123
181
 
124
182
  /** Factory signature: build an adapter from its config block (or null if disabled/misconfigured). */
@@ -0,0 +1,78 @@
1
+ /**
2
+ * Vision describe — turn an inbound image into a text description so the agent
3
+ * can "see" what the user sent, without rewiring the core text-only LLM loop.
4
+ *
5
+ * Self-contained on purpose: a single OpenAI-compatible chat/completions call
6
+ * with an image_url (base64 data URL) content block. The model + key are
7
+ * resolved from config.channels.<id>.visionModel / config.llm.vision_model
8
+ * (default gpt-4o-mini), falling back to env keys the same way the rest of
9
+ * Skyloom does. If no key/model is available, vision is skipped silently and the
10
+ * gateway just uses the media description line.
11
+ */
12
+
13
+ import axios from 'axios';
14
+ import { getLogger } from '../core/logger';
15
+ import type { LoadedMedia } from './helpers';
16
+
17
+ const log = getLogger('gateway-vision');
18
+
19
+ /** OpenAI-compatible base URL for a provider inferred from the model id. */
20
+ function baseUrlFor(model: string): string {
21
+ const l = model.toLowerCase();
22
+ if (l.includes('claude')) return 'https://api.anthropic.com/v1'; // not OpenAI-shaped; skipped below
23
+ if (l.includes('gemini')) return 'https://generativelanguage.googleapis.com/v1beta/openai';
24
+ if (l.includes('grok') || l.includes('xai')) return 'https://api.x.ai/v1';
25
+ if (l.includes('qwen') || l.includes('dashscope')) return 'https://dashscope.aliyuncs.com/compatible-mode/v1';
26
+ return 'https://api.openai.com/v1';
27
+ }
28
+
29
+ /** Resolve an API key for the vision model from env (best-effort). */
30
+ function keyFor(model: string, env: NodeJS.ProcessEnv): string | undefined {
31
+ const l = model.toLowerCase();
32
+ const candidates = l.includes('gemini') ? ['GEMINI_API_KEY', 'GOOGLE_API_KEY']
33
+ : l.includes('grok') || l.includes('xai') ? ['XAI_API_KEY']
34
+ : l.includes('qwen') || l.includes('dashscope') ? ['DASHSCOPE_API_KEY', 'QWEN_API_KEY']
35
+ : ['OPENAI_API_KEY'];
36
+ for (const c of candidates) if (env[c]) return env[c];
37
+ return undefined;
38
+ }
39
+
40
+ export interface VisionOptions {
41
+ model?: string;
42
+ env?: NodeJS.ProcessEnv;
43
+ prompt?: string;
44
+ }
45
+
46
+ /**
47
+ * Describe one or more images. Returns a description string, or null if vision
48
+ * is unavailable (no key/model) or fails — callers fall back to the media line.
49
+ */
50
+ export async function describeImages(images: LoadedMedia[], opts: VisionOptions = {}): Promise<string | null> {
51
+ if (!images.length) return null;
52
+ const env = opts.env || process.env;
53
+ const model = opts.model || 'gpt-4o-mini';
54
+ // Anthropic isn't OpenAI-chat-shaped here; skip to keep this helper simple.
55
+ if (model.toLowerCase().includes('claude')) return null;
56
+ const key = keyFor(model, env);
57
+ if (!key) return null;
58
+
59
+ const prompt = opts.prompt || '请用中文简洁描述这些图片的内容(关键物体、文字、场景);如果含可读文字请转写出来。';
60
+ const content: any[] = [{ type: 'text', text: prompt }];
61
+ for (const img of images.slice(0, 4)) {
62
+ const mime = img.contentType || 'image/png';
63
+ content.push({ type: 'image_url', image_url: { url: `data:${mime};base64,${img.data.toString('base64')}` } });
64
+ }
65
+
66
+ try {
67
+ const res = await axios.post(
68
+ `${baseUrlFor(model)}/chat/completions`,
69
+ { model, messages: [{ role: 'user', content }], max_tokens: 500, temperature: 0.2 },
70
+ { headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${key}` }, timeout: 30000, validateStatus: (s) => s >= 200 && s < 300 },
71
+ );
72
+ const text = res.data?.choices?.[0]?.message?.content;
73
+ return typeof text === 'string' && text.trim() ? text.trim() : null;
74
+ } catch (e) {
75
+ log.warn('vision_describe_failed', { model, error: String(e).slice(0, 160) });
76
+ return null;
77
+ }
78
+ }