@zhin.js/agent 0.0.13 → 0.0.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +18 -0
- package/lib/builtin-tools.d.ts +17 -2
- package/lib/builtin-tools.d.ts.map +1 -1
- package/lib/builtin-tools.js +57 -14
- package/lib/builtin-tools.js.map +1 -1
- package/lib/init/create-zhin-agent.d.ts.map +1 -1
- package/lib/init/create-zhin-agent.js +5 -2
- package/lib/init/create-zhin-agent.js.map +1 -1
- package/lib/init/register-ai-trigger.d.ts.map +1 -1
- package/lib/init/register-ai-trigger.js +85 -24
- package/lib/init/register-ai-trigger.js.map +1 -1
- package/lib/init/register-builtin-tools.d.ts.map +1 -1
- package/lib/init/register-builtin-tools.js +8 -5
- package/lib/init/register-builtin-tools.js.map +1 -1
- package/lib/zhin-agent/exec-policy.d.ts.map +1 -1
- package/lib/zhin-agent/exec-policy.js +5 -3
- package/lib/zhin-agent/exec-policy.js.map +1 -1
- package/lib/zhin-agent/index.d.ts.map +1 -1
- package/lib/zhin-agent/index.js +31 -6
- package/lib/zhin-agent/index.js.map +1 -1
- package/lib/zhin-agent/prompt.d.ts.map +1 -1
- package/lib/zhin-agent/prompt.js +12 -1
- package/lib/zhin-agent/prompt.js.map +1 -1
- package/lib/zhin-agent/tool-collector.d.ts.map +1 -1
- package/lib/zhin-agent/tool-collector.js +10 -3
- package/lib/zhin-agent/tool-collector.js.map +1 -1
- package/package.json +3 -3
- package/src/builtin-tools.ts +61 -15
- package/src/init/create-zhin-agent.ts +5 -2
- package/src/init/register-ai-trigger.ts +85 -23
- package/src/init/register-builtin-tools.ts +14 -5
- package/src/zhin-agent/exec-policy.ts +5 -3
- package/src/zhin-agent/index.ts +38 -6
- package/src/zhin-agent/prompt.ts +11 -1
- package/src/zhin-agent/tool-collector.ts +10 -3
- package/tests/ai/multimodal.test.ts +106 -0
- package/tests/ai/zhin-agent.test.ts +130 -1
- package/tests/file-policy.test.ts +1 -1
|
@@ -8,27 +8,80 @@ import type { ContentPart } from '@zhin.js/core';
|
|
|
8
8
|
import type { OutputElement } from '../output.js';
|
|
9
9
|
import type { AIServiceRefs } from './shared-refs.js';
|
|
10
10
|
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
11
|
+
/**
|
|
12
|
+
* Extract multimodal ContentPart[] from a Message's structured $content segments.
|
|
13
|
+
* Handles image, video, audio, and face/sticker types.
|
|
14
|
+
* Falls back to raw string parsing for image URLs when $content has no media segments.
|
|
15
|
+
*/
|
|
16
|
+
function extractMediaParts(message: Message<any>): ContentPart[] {
|
|
17
|
+
const parts: ContentPart[] = [];
|
|
18
|
+
|
|
19
|
+
// 1. Extract from structured $content segments
|
|
20
|
+
if (Array.isArray(message.$content)) {
|
|
21
|
+
for (const seg of message.$content) {
|
|
22
|
+
if (typeof seg === 'string' || !seg || !seg.type) continue;
|
|
23
|
+
const { type, data } = seg;
|
|
24
|
+
switch (type) {
|
|
25
|
+
case 'image': {
|
|
26
|
+
const url = data?.url || data?.file || data?.src;
|
|
27
|
+
if (url) parts.push({ type: 'image_url', image_url: { url } });
|
|
28
|
+
break;
|
|
29
|
+
}
|
|
30
|
+
case 'video': {
|
|
31
|
+
const url = data?.url || data?.file || data?.src;
|
|
32
|
+
if (url) parts.push({ type: 'video_url', video_url: { url } });
|
|
33
|
+
break;
|
|
34
|
+
}
|
|
35
|
+
case 'audio':
|
|
36
|
+
case 'record':
|
|
37
|
+
case 'voice': {
|
|
38
|
+
const dataStr = data?.data || data?.base64;
|
|
39
|
+
if (dataStr) {
|
|
40
|
+
const fmt = data?.format === 'wav' ? 'wav' : 'mp3';
|
|
41
|
+
parts.push({ type: 'audio', audio: { data: dataStr, format: fmt } });
|
|
42
|
+
} else {
|
|
43
|
+
const url = data?.url || data?.file || data?.src;
|
|
44
|
+
if (url) {
|
|
45
|
+
// Audio URL: describe as text since most LLMs can't play audio URLs directly
|
|
46
|
+
parts.push({ type: 'text', text: `[用户发送了一段语音: ${url}]` });
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
break;
|
|
50
|
+
}
|
|
51
|
+
case 'face':
|
|
52
|
+
case 'sticker':
|
|
53
|
+
case 'emoji': {
|
|
54
|
+
const id = String(data?.id ?? data?.face_id ?? '');
|
|
55
|
+
const text = data?.text || data?.name || data?.describe;
|
|
56
|
+
if (id) parts.push({ type: 'face', face: { id, text } });
|
|
57
|
+
break;
|
|
58
|
+
}
|
|
59
|
+
}
|
|
20
60
|
}
|
|
21
61
|
}
|
|
22
62
|
|
|
23
|
-
|
|
24
|
-
if (
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
63
|
+
// 2. Fallback: parse image URLs from $raw for adapters that don't use structured $content
|
|
64
|
+
if (parts.length === 0) {
|
|
65
|
+
const raw = typeof message.$raw === 'string' ? message.$raw : JSON.stringify(message.$raw || '');
|
|
66
|
+
|
|
67
|
+
const xmlMatches = raw.match(/<image[^>]+url="([^"]+)"/g);
|
|
68
|
+
if (xmlMatches) {
|
|
69
|
+
for (const m of xmlMatches) {
|
|
70
|
+
const urlMatch = m.match(/url="([^"]+)"/);
|
|
71
|
+
if (urlMatch) parts.push({ type: 'image_url', image_url: { url: urlMatch[1] } });
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
const cqMatches = raw.match(/\[CQ:image[^\]]*url=([^\],]+)/g);
|
|
76
|
+
if (cqMatches) {
|
|
77
|
+
for (const m of cqMatches) {
|
|
78
|
+
const urlMatch = m.match(/url=([^\],]+)/);
|
|
79
|
+
if (urlMatch) parts.push({ type: 'image_url', image_url: { url: urlMatch[1] } });
|
|
80
|
+
}
|
|
28
81
|
}
|
|
29
82
|
}
|
|
30
83
|
|
|
31
|
-
return
|
|
84
|
+
return parts;
|
|
32
85
|
}
|
|
33
86
|
|
|
34
87
|
function renderOutput(elements: OutputElement[]): string {
|
|
@@ -76,14 +129,25 @@ export function registerAITrigger(refs: AIServiceRefs): void {
|
|
|
76
129
|
return;
|
|
77
130
|
}
|
|
78
131
|
|
|
132
|
+
const dispatcherSvc = root.inject('dispatcher') as
|
|
133
|
+
| { replyWithPolish?: (m: Message<any>, s: 'ai' | 'command', c: unknown) => Promise<unknown> }
|
|
134
|
+
| undefined;
|
|
135
|
+
|
|
79
136
|
const handleAIMessage = async (
|
|
80
137
|
message: Message<any>,
|
|
81
138
|
content: string,
|
|
82
139
|
) => {
|
|
140
|
+
const replyOutbound = async (payload: unknown) => {
|
|
141
|
+
if (dispatcherSvc && typeof dispatcherSvc.replyWithPolish === 'function') {
|
|
142
|
+
return dispatcherSvc.replyWithPolish(message, 'ai', payload as any);
|
|
143
|
+
}
|
|
144
|
+
return message.$reply(payload as any);
|
|
145
|
+
};
|
|
146
|
+
|
|
83
147
|
const t0 = performance.now();
|
|
84
148
|
if (!ai.isReady()) return;
|
|
85
149
|
if (triggerConfig.thinkingMessage)
|
|
86
|
-
await
|
|
150
|
+
await replyOutbound(triggerConfig.thinkingMessage);
|
|
87
151
|
|
|
88
152
|
const permissions = inferSenderPermissions(message, triggerConfig);
|
|
89
153
|
const toolContext: ToolContext = {
|
|
@@ -116,14 +180,12 @@ export function registerAITrigger(refs: AIServiceRefs): void {
|
|
|
116
180
|
|
|
117
181
|
let responseText: string;
|
|
118
182
|
if (refs.zhinAgent) {
|
|
119
|
-
const
|
|
183
|
+
const mediaParts = extractMediaParts(message);
|
|
120
184
|
let elements: OutputElement[];
|
|
121
|
-
if (
|
|
185
|
+
if (mediaParts.length > 0) {
|
|
122
186
|
const parts: ContentPart[] = [];
|
|
123
187
|
if (content) parts.push({ type: 'text', text: content });
|
|
124
|
-
|
|
125
|
-
parts.push({ type: 'image_url', image_url: { url } });
|
|
126
|
-
}
|
|
188
|
+
parts.push(...mediaParts);
|
|
127
189
|
elements = await Promise.race([
|
|
128
190
|
refs.zhinAgent.processMultimodal(parts, toolContext),
|
|
129
191
|
timeout,
|
|
@@ -143,12 +205,12 @@ export function registerAITrigger(refs: AIServiceRefs): void {
|
|
|
143
205
|
responseText = typeof response === 'string' ? response : '';
|
|
144
206
|
}
|
|
145
207
|
|
|
146
|
-
if (responseText) await
|
|
208
|
+
if (responseText) await replyOutbound(parseRichMediaContent(responseText));
|
|
147
209
|
logger.info(`[AI Handler] 总耗时: ${(performance.now() - t0).toFixed(0)}ms`);
|
|
148
210
|
} catch (error) {
|
|
149
211
|
const msg = error instanceof Error ? error.message : String(error);
|
|
150
212
|
logger.warn(`[AI Handler] 失败 (${(performance.now() - t0).toFixed(0)}ms): ${msg}`);
|
|
151
|
-
await
|
|
213
|
+
await replyOutbound(triggerConfig.errorTemplate.replace('{error}', msg));
|
|
152
214
|
}
|
|
153
215
|
};
|
|
154
216
|
|
|
@@ -6,7 +6,13 @@ import * as fs from 'fs';
|
|
|
6
6
|
import * as os from 'os';
|
|
7
7
|
import * as path from 'path';
|
|
8
8
|
import { getPlugin, type Tool, type SkillFeature } from '@zhin.js/core';
|
|
9
|
-
import {
|
|
9
|
+
import {
|
|
10
|
+
collectPluginSkillSearchRoots,
|
|
11
|
+
createBuiltinTools,
|
|
12
|
+
discoverWorkspaceSkills,
|
|
13
|
+
loadAlwaysSkillsContent,
|
|
14
|
+
buildSkillsSummaryXML,
|
|
15
|
+
} from '../builtin-tools.js';
|
|
10
16
|
import { resolveSkillInstructionMaxChars, DEFAULT_CONFIG } from '../zhin-agent/config.js';
|
|
11
17
|
import { loadBootstrapFiles, buildContextFiles, buildBootstrapContextSection } from '../bootstrap.js';
|
|
12
18
|
import { triggerAIHook, createAIHookEvent } from '../hooks.js';
|
|
@@ -24,7 +30,10 @@ export function registerBuiltinTools(refs: AIServiceRefs): void {
|
|
|
24
30
|
const agentCfg = ai.getAgentConfig();
|
|
25
31
|
const fullCfg = { ...DEFAULT_CONFIG, ...agentCfg } as Required<import('../zhin-agent/config.js').ZhinAgentConfig>;
|
|
26
32
|
const modelName = provider.models[0] || '';
|
|
27
|
-
const builtinTools = createBuiltinTools({
|
|
33
|
+
const builtinTools = createBuiltinTools({
|
|
34
|
+
skillInstructionMaxChars: resolveSkillInstructionMaxChars(fullCfg, modelName),
|
|
35
|
+
pluginSkillRootsResolver: () => collectPluginSkillSearchRoots(root),
|
|
36
|
+
});
|
|
28
37
|
const disposers: (() => void)[] = [];
|
|
29
38
|
for (const tool of builtinTools) disposers.push(toolService.addTool(tool, root.name));
|
|
30
39
|
const cronTools = createCronTools();
|
|
@@ -39,7 +48,7 @@ export function registerBuiltinTools(refs: AIServiceRefs): void {
|
|
|
39
48
|
if (!skillFeature) return 0;
|
|
40
49
|
const existing = skillFeature.getByPlugin(root.name);
|
|
41
50
|
for (const s of existing) skillFeature.remove(s);
|
|
42
|
-
const skills = await discoverWorkspaceSkills();
|
|
51
|
+
const skills = await discoverWorkspaceSkills(root);
|
|
43
52
|
if (skills.length === 0) return 0;
|
|
44
53
|
const allRegisteredTools = toolService.getAll();
|
|
45
54
|
const toolNameIndex = new Map<string, Tool>();
|
|
@@ -111,7 +120,7 @@ export function registerBuiltinTools(refs: AIServiceRefs): void {
|
|
|
111
120
|
|
|
112
121
|
// Step 3: inject always-on skills content + XML summary
|
|
113
122
|
try {
|
|
114
|
-
const skillsForContext = await discoverWorkspaceSkills();
|
|
123
|
+
const skillsForContext = await discoverWorkspaceSkills(root);
|
|
115
124
|
const alwaysContent = await loadAlwaysSkillsContent(skillsForContext);
|
|
116
125
|
const skillsXml = buildSkillsSummaryXML(skillsForContext);
|
|
117
126
|
if (refs.zhinAgent) {
|
|
@@ -140,7 +149,7 @@ export function registerBuiltinTools(refs: AIServiceRefs): void {
|
|
|
140
149
|
skillReloadDebounce = null;
|
|
141
150
|
try {
|
|
142
151
|
const count = await syncWorkspaceSkills();
|
|
143
|
-
const skillsForContext = await discoverWorkspaceSkills();
|
|
152
|
+
const skillsForContext = await discoverWorkspaceSkills(root);
|
|
144
153
|
const alwaysContent = await loadAlwaysSkillsContent(skillsForContext);
|
|
145
154
|
const skillsXml = buildSkillsSummaryXML(skillsForContext);
|
|
146
155
|
if (refs.zhinAgent) {
|
|
@@ -41,12 +41,14 @@ export function checkExecPolicy(config: Required<ZhinAgentConfig>, command: stri
|
|
|
41
41
|
// allowlist
|
|
42
42
|
const list = resolveExecAllowlist(config);
|
|
43
43
|
const cmd = (command || '').trim();
|
|
44
|
+
// 提取命令的第一个 token(实际可执行程序名)进行白名单匹配
|
|
45
|
+
const cmdName = cmd.split(/[\s;|&]/)[0];
|
|
44
46
|
const allowed = list.some(pattern => {
|
|
45
47
|
try {
|
|
46
|
-
const re = new RegExp(pattern);
|
|
47
|
-
return re.test(
|
|
48
|
+
const re = new RegExp(`^${pattern}$`);
|
|
49
|
+
return re.test(cmdName);
|
|
48
50
|
} catch {
|
|
49
|
-
return
|
|
51
|
+
return cmdName === pattern;
|
|
50
52
|
}
|
|
51
53
|
});
|
|
52
54
|
if (!allowed) {
|
package/src/zhin-agent/index.ts
CHANGED
|
@@ -385,17 +385,49 @@ ${preData ? `\nPre-fetched data:\n${preData}\n` : ''}`;
|
|
|
385
385
|
const profileSummary = await this.userProfiles.buildProfileSummary(userId);
|
|
386
386
|
const personaEnhanced = buildEnhancedPersona(this.config, profileSummary, '');
|
|
387
387
|
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
388
|
+
// Build text summary describing the multimodal content
|
|
389
|
+
const textFragments: string[] = [];
|
|
390
|
+
const llmParts: ContentPart[] = [];
|
|
391
|
+
|
|
392
|
+
/** Full multimodal ContentPart union (core/ai may export a narrower type in some builds) */
|
|
393
|
+
type MultimodalPart =
|
|
394
|
+
| ContentPart
|
|
395
|
+
| { type: 'video_url'; video_url: { url: string } }
|
|
396
|
+
| { type: 'face'; face: { id: string; text?: string } };
|
|
397
|
+
|
|
398
|
+
for (const p of parts as MultimodalPart[]) {
|
|
399
|
+
switch (p.type) {
|
|
400
|
+
case 'text':
|
|
401
|
+
textFragments.push(p.text);
|
|
402
|
+
llmParts.push(p);
|
|
403
|
+
break;
|
|
404
|
+
case 'image_url':
|
|
405
|
+
textFragments.push('[图片]');
|
|
406
|
+
llmParts.push(p);
|
|
407
|
+
break;
|
|
408
|
+
case 'video_url':
|
|
409
|
+
textFragments.push('[视频]');
|
|
410
|
+
// Most LLMs don't support video natively; describe it as a URL for context
|
|
411
|
+
llmParts.push({ type: 'text', text: `[用户发送了一个视频: ${p.video_url.url}]` });
|
|
412
|
+
break;
|
|
413
|
+
case 'audio':
|
|
414
|
+
textFragments.push('[音频]');
|
|
415
|
+
llmParts.push(p);
|
|
416
|
+
break;
|
|
417
|
+
case 'face':
|
|
418
|
+
textFragments.push(p.face.text || `[表情:${p.face.id}]`);
|
|
419
|
+
llmParts.push({ type: 'text', text: p.face.text ? `[表情: ${p.face.text}]` : `[表情ID: ${p.face.id}]` });
|
|
420
|
+
break;
|
|
421
|
+
}
|
|
422
|
+
}
|
|
392
423
|
|
|
424
|
+
const textContent = textFragments.join(' ') || '[多模态消息]';
|
|
393
425
|
const visionModel = this.config.visionModel || this.provider.models[0];
|
|
394
426
|
|
|
395
427
|
const messages: ChatMessage[] = [
|
|
396
428
|
{ role: 'system', content: personaEnhanced },
|
|
397
429
|
...historyMessages,
|
|
398
|
-
{ role: 'user', content:
|
|
430
|
+
{ role: 'user', content: llmParts },
|
|
399
431
|
];
|
|
400
432
|
|
|
401
433
|
let reply = '';
|
|
@@ -413,7 +445,7 @@ ${preData ? `\nPre-fetched data:\n${preData}\n` : ''}`;
|
|
|
413
445
|
reply = typeof msg === 'string' ? msg : '';
|
|
414
446
|
}
|
|
415
447
|
|
|
416
|
-
if (!reply) reply = '
|
|
448
|
+
if (!reply) reply = '抱歉,我无法理解这条消息。';
|
|
417
449
|
await this.saveToSession(sessionId, textContent, reply, sceneId);
|
|
418
450
|
return parseOutput(reply);
|
|
419
451
|
}
|
package/src/zhin-agent/prompt.ts
CHANGED
|
@@ -15,7 +15,17 @@ export function contentToText(c: string | ContentPart[] | ContentPart | null | u
|
|
|
15
15
|
if (c == null) return '';
|
|
16
16
|
if (typeof c === 'string') return c;
|
|
17
17
|
const parts = Array.isArray(c) ? c : [c as ContentPart];
|
|
18
|
-
return parts.map(p =>
|
|
18
|
+
return parts.map(p => {
|
|
19
|
+
if (!p) return '';
|
|
20
|
+
switch (p.type) {
|
|
21
|
+
case 'text': return p.text;
|
|
22
|
+
case 'image_url': return '[图片]';
|
|
23
|
+
case 'audio': return '[音频]';
|
|
24
|
+
case 'video_url': return '[视频]';
|
|
25
|
+
case 'face': return (p as Extract<ContentPart, { type: 'face' }>).face.text || '[表情]';
|
|
26
|
+
default: return '';
|
|
27
|
+
}
|
|
28
|
+
}).join('');
|
|
19
29
|
}
|
|
20
30
|
|
|
21
31
|
export function buildUserMessageWithHistory(history: ChatMessage[], currentContent: string): string {
|
|
@@ -106,16 +106,23 @@ export function collectRelevantTools(
|
|
|
106
106
|
const collected: AgentTool[] = [];
|
|
107
107
|
const collectedNames = new Set<string>();
|
|
108
108
|
|
|
109
|
-
// 0. Detect if user mentions a known skill
|
|
109
|
+
// 0. Detect if user mentions a known skill(名称或关键词,与 SkillFeature / SKILL.md 注入一致)
|
|
110
110
|
let mentionedSkill: string | null = null;
|
|
111
111
|
if (skillRegistry && skillRegistry.size > 0) {
|
|
112
112
|
const msgLower = message.toLowerCase();
|
|
113
|
-
for (const skill of skillRegistry.getAll()) {
|
|
113
|
+
outer: for (const skill of skillRegistry.getAll()) {
|
|
114
114
|
if (msgLower.includes(skill.name.toLowerCase())) {
|
|
115
115
|
mentionedSkill = skill.name;
|
|
116
|
-
logger.debug(`[技能检测]
|
|
116
|
+
logger.debug(`[技能检测] 用户提到技能(名称): ${mentionedSkill}`);
|
|
117
117
|
break;
|
|
118
118
|
}
|
|
119
|
+
for (const kw of skill.keywords || []) {
|
|
120
|
+
if (kw && msgLower.includes(String(kw).toLowerCase())) {
|
|
121
|
+
mentionedSkill = skill.name;
|
|
122
|
+
logger.debug(`[技能检测] 用户提到技能(关键词→${skill.name}): ${kw}`);
|
|
123
|
+
break outer;
|
|
124
|
+
}
|
|
125
|
+
}
|
|
119
126
|
}
|
|
120
127
|
}
|
|
121
128
|
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* 多模态功能测试
|
|
3
|
+
*
|
|
4
|
+
* 测试 ContentPart 类型扩展、contentToText 辅助函数等多模态相关功能
|
|
5
|
+
*/
|
|
6
|
+
import { describe, it, expect } from 'vitest';
|
|
7
|
+
import type { ContentPart } from '@zhin.js/core';
|
|
8
|
+
import { contentToText } from '@zhin.js/agent';
|
|
9
|
+
|
|
10
|
+
describe('contentToText 多模态支持', () => {
|
|
11
|
+
it('应处理纯文本', () => {
|
|
12
|
+
expect(contentToText('hello')).toBe('hello');
|
|
13
|
+
});
|
|
14
|
+
|
|
15
|
+
it('应处理 null 和 undefined', () => {
|
|
16
|
+
expect(contentToText(null)).toBe('');
|
|
17
|
+
expect(contentToText(undefined)).toBe('');
|
|
18
|
+
});
|
|
19
|
+
|
|
20
|
+
it('应处理 text ContentPart', () => {
|
|
21
|
+
const parts: ContentPart[] = [{ type: 'text', text: '你好' }];
|
|
22
|
+
expect(contentToText(parts)).toBe('你好');
|
|
23
|
+
});
|
|
24
|
+
|
|
25
|
+
it('应将 image_url ContentPart 转为 [图片]', () => {
|
|
26
|
+
const parts: ContentPart[] = [
|
|
27
|
+
{ type: 'image_url', image_url: { url: 'https://example.com/img.jpg' } },
|
|
28
|
+
];
|
|
29
|
+
expect(contentToText(parts)).toBe('[图片]');
|
|
30
|
+
});
|
|
31
|
+
|
|
32
|
+
it('应将 video_url ContentPart 转为 [视频]', () => {
|
|
33
|
+
const parts: ContentPart[] = [
|
|
34
|
+
{ type: 'video_url', video_url: { url: 'https://example.com/video.mp4' } },
|
|
35
|
+
];
|
|
36
|
+
expect(contentToText(parts)).toBe('[视频]');
|
|
37
|
+
});
|
|
38
|
+
|
|
39
|
+
it('应将 audio ContentPart 转为 [音频]', () => {
|
|
40
|
+
const parts: ContentPart[] = [
|
|
41
|
+
{ type: 'audio', audio: { data: 'base64data', format: 'mp3' } },
|
|
42
|
+
];
|
|
43
|
+
expect(contentToText(parts)).toBe('[音频]');
|
|
44
|
+
});
|
|
45
|
+
|
|
46
|
+
it('应将 face ContentPart 转为表情文字', () => {
|
|
47
|
+
const parts: ContentPart[] = [
|
|
48
|
+
{ type: 'face', face: { id: '178', text: '笑哭' } },
|
|
49
|
+
];
|
|
50
|
+
expect(contentToText(parts)).toBe('笑哭');
|
|
51
|
+
});
|
|
52
|
+
|
|
53
|
+
it('应将无文字 face ContentPart 转为 [表情]', () => {
|
|
54
|
+
const parts: ContentPart[] = [
|
|
55
|
+
{ type: 'face', face: { id: '178' } },
|
|
56
|
+
];
|
|
57
|
+
expect(contentToText(parts)).toBe('[表情]');
|
|
58
|
+
});
|
|
59
|
+
|
|
60
|
+
it('应正确处理混合内容', () => {
|
|
61
|
+
const parts: ContentPart[] = [
|
|
62
|
+
{ type: 'text', text: '看看这个' },
|
|
63
|
+
{ type: 'image_url', image_url: { url: 'https://example.com/img.jpg' } },
|
|
64
|
+
{ type: 'face', face: { id: '1', text: '微笑' } },
|
|
65
|
+
];
|
|
66
|
+
expect(contentToText(parts)).toBe('看看这个[图片]微笑');
|
|
67
|
+
});
|
|
68
|
+
|
|
69
|
+
it('应处理单个 ContentPart(非数组)', () => {
|
|
70
|
+
const part: ContentPart = { type: 'text', text: '单个' };
|
|
71
|
+
expect(contentToText(part)).toBe('单个');
|
|
72
|
+
});
|
|
73
|
+
});
|
|
74
|
+
|
|
75
|
+
describe('ContentPart 类型完整性', () => {
|
|
76
|
+
it('应支持所有多模态类型', () => {
|
|
77
|
+
const textPart: ContentPart = { type: 'text', text: 'hello' };
|
|
78
|
+
const imagePart: ContentPart = { type: 'image_url', image_url: { url: 'https://img.png' } };
|
|
79
|
+
const audioPart: ContentPart = { type: 'audio', audio: { data: 'data', format: 'mp3' } };
|
|
80
|
+
const videoPart: ContentPart = { type: 'video_url', video_url: { url: 'https://vid.mp4' } };
|
|
81
|
+
const facePart: ContentPart = { type: 'face', face: { id: '1', text: '微笑' } };
|
|
82
|
+
|
|
83
|
+
expect(textPart.type).toBe('text');
|
|
84
|
+
expect(imagePart.type).toBe('image_url');
|
|
85
|
+
expect(audioPart.type).toBe('audio');
|
|
86
|
+
expect(videoPart.type).toBe('video_url');
|
|
87
|
+
expect(facePart.type).toBe('face');
|
|
88
|
+
});
|
|
89
|
+
|
|
90
|
+
it('image_url 应支持 detail 参数', () => {
|
|
91
|
+
const part: ContentPart = {
|
|
92
|
+
type: 'image_url',
|
|
93
|
+
image_url: { url: 'https://img.png', detail: 'high' },
|
|
94
|
+
};
|
|
95
|
+
if (part.type === 'image_url') {
|
|
96
|
+
expect(part.image_url.detail).toBe('high');
|
|
97
|
+
}
|
|
98
|
+
});
|
|
99
|
+
|
|
100
|
+
it('face 的 text 应为可选', () => {
|
|
101
|
+
const part: ContentPart = { type: 'face', face: { id: '100' } };
|
|
102
|
+
if (part.type === 'face') {
|
|
103
|
+
expect(part.face.text).toBeUndefined();
|
|
104
|
+
}
|
|
105
|
+
});
|
|
106
|
+
});
|
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
|
|
7
7
|
import { ZhinAgent } from '@zhin.js/agent';
|
|
8
8
|
import { SkillFeature } from '@zhin.js/core';
|
|
9
|
-
import type { AIProvider, AgentTool } from '@zhin.js/core';
|
|
9
|
+
import type { AIProvider, AgentTool, ContentPart } from '@zhin.js/core';
|
|
10
10
|
import type { Tool, ToolContext } from '@zhin.js/core';
|
|
11
11
|
|
|
12
12
|
// Mock AIProvider
|
|
@@ -21,6 +21,27 @@ function createMockProvider(response: string = '你好!'): AIProvider {
|
|
|
21
21
|
};
|
|
22
22
|
}
|
|
23
23
|
|
|
24
|
+
// Mock AIProvider with chatStream support (for multimodal tests)
|
|
25
|
+
function createStreamMockProvider(response: string = '你好!'): AIProvider {
|
|
26
|
+
return {
|
|
27
|
+
name: 'mock',
|
|
28
|
+
models: ['mock-model'],
|
|
29
|
+
chat: vi.fn(async () => ({
|
|
30
|
+
choices: [{ message: { role: 'assistant' as const, content: response }, finish_reason: 'stop' }],
|
|
31
|
+
} as ChatResponse)),
|
|
32
|
+
chatStream: vi.fn(async function* () {
|
|
33
|
+
yield {
|
|
34
|
+
id: 'chunk-1',
|
|
35
|
+
object: 'chat.completion.chunk' as const,
|
|
36
|
+
created: Date.now(),
|
|
37
|
+
model: 'mock-model',
|
|
38
|
+
choices: [{ index: 0, delta: { content: response }, finish_reason: null }],
|
|
39
|
+
};
|
|
40
|
+
}),
|
|
41
|
+
listModels: vi.fn(async () => ['mock-model']),
|
|
42
|
+
};
|
|
43
|
+
}
|
|
44
|
+
|
|
24
45
|
function makeToolContext(overrides: Partial<ToolContext> = {}): ToolContext {
|
|
25
46
|
return {
|
|
26
47
|
platform: 'test',
|
|
@@ -174,4 +195,112 @@ describe('ZhinAgent', () => {
|
|
|
174
195
|
expect(() => agent.dispose()).not.toThrow();
|
|
175
196
|
});
|
|
176
197
|
});
|
|
198
|
+
|
|
199
|
+
describe('processMultimodal', () => {
|
|
200
|
+
let streamAgent: ZhinAgent;
|
|
201
|
+
let streamProvider: AIProvider;
|
|
202
|
+
|
|
203
|
+
beforeEach(() => {
|
|
204
|
+
streamProvider = createStreamMockProvider();
|
|
205
|
+
streamAgent = new ZhinAgent(streamProvider, {
|
|
206
|
+
persona: '测试助手',
|
|
207
|
+
maxIterations: 3,
|
|
208
|
+
});
|
|
209
|
+
});
|
|
210
|
+
|
|
211
|
+
afterEach(() => {
|
|
212
|
+
streamAgent.dispose();
|
|
213
|
+
});
|
|
214
|
+
|
|
215
|
+
it('应处理图片+文本的多模态消息', async () => {
|
|
216
|
+
const context = makeToolContext();
|
|
217
|
+
const parts: ContentPart[] = [
|
|
218
|
+
{ type: 'text', text: '这是什么?' },
|
|
219
|
+
{ type: 'image_url', image_url: { url: 'https://example.com/cat.jpg' } },
|
|
220
|
+
];
|
|
221
|
+
|
|
222
|
+
const result = await streamAgent.processMultimodal(parts, context);
|
|
223
|
+
|
|
224
|
+
expect(result).toBeDefined();
|
|
225
|
+
expect(Array.isArray(result)).toBe(true);
|
|
226
|
+
expect(result.length).toBeGreaterThan(0);
|
|
227
|
+
});
|
|
228
|
+
|
|
229
|
+
it('应处理视频类型的多模态消息', async () => {
|
|
230
|
+
const context = makeToolContext();
|
|
231
|
+
const parts: ContentPart[] = [
|
|
232
|
+
{ type: 'text', text: '这个视频讲的是什么?' },
|
|
233
|
+
{ type: 'video_url', video_url: { url: 'https://example.com/video.mp4' } },
|
|
234
|
+
];
|
|
235
|
+
|
|
236
|
+
const result = await streamAgent.processMultimodal(parts, context);
|
|
237
|
+
|
|
238
|
+
expect(result).toBeDefined();
|
|
239
|
+
expect(Array.isArray(result)).toBe(true);
|
|
240
|
+
expect(result.length).toBeGreaterThan(0);
|
|
241
|
+
});
|
|
242
|
+
|
|
243
|
+
it('应处理表情类型的多模态消息', async () => {
|
|
244
|
+
const context = makeToolContext();
|
|
245
|
+
const parts: ContentPart[] = [
|
|
246
|
+
{ type: 'text', text: '你好' },
|
|
247
|
+
{ type: 'face', face: { id: '178', text: '笑哭' } },
|
|
248
|
+
];
|
|
249
|
+
|
|
250
|
+
const result = await streamAgent.processMultimodal(parts, context);
|
|
251
|
+
|
|
252
|
+
expect(result).toBeDefined();
|
|
253
|
+
expect(Array.isArray(result)).toBe(true);
|
|
254
|
+
expect(result.length).toBeGreaterThan(0);
|
|
255
|
+
});
|
|
256
|
+
|
|
257
|
+
it('应处理混合多种媒体类型的多模态消息', async () => {
|
|
258
|
+
const context = makeToolContext();
|
|
259
|
+
const parts: ContentPart[] = [
|
|
260
|
+
{ type: 'text', text: '看看这些' },
|
|
261
|
+
{ type: 'image_url', image_url: { url: 'https://example.com/pic.jpg' } },
|
|
262
|
+
{ type: 'video_url', video_url: { url: 'https://example.com/clip.mp4' } },
|
|
263
|
+
{ type: 'face', face: { id: '1', text: '微笑' } },
|
|
264
|
+
];
|
|
265
|
+
|
|
266
|
+
const result = await streamAgent.processMultimodal(parts, context);
|
|
267
|
+
|
|
268
|
+
expect(result).toBeDefined();
|
|
269
|
+
expect(Array.isArray(result)).toBe(true);
|
|
270
|
+
});
|
|
271
|
+
|
|
272
|
+
it('无文本时应使用默认描述', async () => {
|
|
273
|
+
const context = makeToolContext();
|
|
274
|
+
const parts: ContentPart[] = [
|
|
275
|
+
{ type: 'image_url', image_url: { url: 'https://example.com/img.jpg' } },
|
|
276
|
+
];
|
|
277
|
+
|
|
278
|
+
const result = await streamAgent.processMultimodal(parts, context);
|
|
279
|
+
|
|
280
|
+
expect(result).toBeDefined();
|
|
281
|
+
expect(Array.isArray(result)).toBe(true);
|
|
282
|
+
});
|
|
283
|
+
|
|
284
|
+
it('速率限制在多模态处理中应生效', async () => {
|
|
285
|
+
const strictAgent = new ZhinAgent(streamProvider, {
|
|
286
|
+
rateLimit: { maxRequestsPerMinute: 1, cooldownSeconds: 5 },
|
|
287
|
+
});
|
|
288
|
+
|
|
289
|
+
const context = makeToolContext();
|
|
290
|
+
const parts: ContentPart[] = [
|
|
291
|
+
{ type: 'text', text: '第一次' },
|
|
292
|
+
{ type: 'image_url', image_url: { url: 'https://example.com/1.jpg' } },
|
|
293
|
+
];
|
|
294
|
+
|
|
295
|
+
// 第一次
|
|
296
|
+
await strictAgent.processMultimodal(parts, context);
|
|
297
|
+
|
|
298
|
+
// 第二次应被限制
|
|
299
|
+
const result = await strictAgent.processMultimodal(parts, context);
|
|
300
|
+
expect(result).toBeDefined();
|
|
301
|
+
expect(Array.isArray(result)).toBe(true);
|
|
302
|
+
|
|
303
|
+
strictAgent.dispose();
|
|
304
|
+
});
|
|
305
|
+
});
|
|
177
306
|
});
|
|
@@ -54,6 +54,7 @@ describe('file-policy', () => {
|
|
|
54
54
|
'/home/user/.gcloud/properties',
|
|
55
55
|
'/home/user/.kube/config',
|
|
56
56
|
'/root/.ssh/authorized_keys',
|
|
57
|
+
'data/memory/notes.md', // data 目录为敏感目录,禁止访问
|
|
57
58
|
];
|
|
58
59
|
|
|
59
60
|
for (const p of blockedPaths) {
|
|
@@ -85,7 +86,6 @@ describe('file-policy', () => {
|
|
|
85
86
|
'/home/user/project/README.md',
|
|
86
87
|
'/home/user/project/tsconfig.json',
|
|
87
88
|
'./src/utils.ts',
|
|
88
|
-
'data/memory/notes.md',
|
|
89
89
|
'/tmp/test.txt',
|
|
90
90
|
];
|
|
91
91
|
|