@fmode/vision 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
1
+ {
2
+ "name": "fmode-vision",
3
+ "description": "Analyze images and videos via Fmode API vision models. Single-pass and multi-pass focused analysis with structured JSON output, plus a renovation room-measurement prompt pipeline.",
4
+ "version": "0.1.0",
5
+ "author": {
6
+ "name": "fmode"
7
+ }
8
+ }
package/README.md ADDED
@@ -0,0 +1,46 @@
1
+ # @fmode/vision
2
+
3
+ Claude Code skill for analyzing images and videos via Fmode API vision models (`api.fmode.cn`). Supports single-pass analysis, multi-pass focused analysis with intermediate caching, video-frame analysis, batch processing, and a renovation room-measurement (毛坯房量尺) 5-pass prompt pipeline.
4
+
5
+ ## Install
6
+
7
+ Project workspace (writes `./.claude/skills/fmode-vision/`):
8
+
9
+ ```bash
10
+ npx @fmode/vision@latest workspace
11
+ ```
12
+
13
+ User level for all workspaces (writes `~/.claude/skills/fmode-vision/`):
14
+
15
+ ```bash
16
+ npx @fmode/vision@latest install
17
+ ```
18
+
19
+ Restart the Claude Code session afterwards so it discovers the skill.
20
+
21
+ ## Token
22
+
23
+ The skill auto-detects the Fmode API token in this priority order:
24
+
25
+ 1. `FMODE_API_TOKEN` environment variable
26
+ 2. `~/.fmode/config.json` → `fmodeApiToken` (or `newapiToken`)
27
+ 3. `<project>/.fmode/config.json` → `fmodeApiToken` (or `newapiToken`)
28
+ 4. `ANTHROPIC_AUTH_TOKEN` environment variable (zero-config inside Claude Code, since it is the same Fmode gateway token)
29
+
30
+ If none are found, the skill returns a clear message listing these four options.
31
+
32
+ ## Usage
33
+
34
+ In Claude Code, just ask in natural language, e.g.:
35
+
36
+ ```
37
+ 帮我分析这张图片里的关键内容,输出结构化信息。
38
+ ```
39
+
40
+ The skill calls `POST https://api.fmode.cn/v1/chat/completions` with the default vision model `doubao-seed-2-0-pro-260215`. Usage is billed against the Fmode token.
41
+
42
+ ## Verify
43
+
44
+ ```bash
45
+ npx @fmode/vision@latest smoke   # or, from a checkout of this package: npm run smoke
46
+ ```
@@ -0,0 +1,155 @@
1
+ #!/usr/bin/env node
2
+ const fs = require('fs');
3
+ const os = require('os');
4
+ const path = require('path');
5
+ const { spawnSync } = require('child_process');
6
+
7
// Skill identifier; doubles as the directory name under every skills root.
const SKILL_NAME = 'fmode-vision';

// Package root (parent of this script's directory) and the skill payload bundled in it.
const SOURCE_ROOT = path.resolve(__dirname, '..');
const SKILL_SOURCE = path.join(SOURCE_ROOT, 'skills', SKILL_NAME);

// The workspace is the directory the CLI was started from.
const WORKSPACE_ROOT = process.cwd();

// Skills roots for the project-level and user-level installs.
const WORKSPACE_SKILLS_ROOT = path.join(WORKSPACE_ROOT, '.claude', 'skills');
const GLOBAL_SKILLS_ROOT = path.join(os.homedir(), '.claude', 'skills');

// Default install directories: <skills root>/<skill name>.
const WORKSPACE_TARGET = path.join(WORKSPACE_SKILLS_ROOT, SKILL_NAME);
const GLOBAL_TARGET = path.join(GLOBAL_SKILLS_ROOT, SKILL_NAME);
15
+
16
/**
 * Expand a leading `~` (alone, or followed by `/` or `\`) to the current
 * user's home directory. `~user` forms and tildes elsewhere are left as-is.
 *
 * @param {*} value - Path-like value; falsy values are treated as ''.
 * @returns {string} The value as a string with a leading `~` expanded.
 */
function expandHome(value) {
  // A replacer function is used instead of a replacement string so that `$`
  // sequences in the home directory path (e.g. `$&`, `$$`) are inserted
  // literally rather than interpreted as replacement patterns.
  return String(value || '').replace(/^~(?=$|[\\/])/, () => os.homedir());
}
19
+
20
/**
 * Parse the CLI arguments (without the node/script prefix).
 *
 * The first token is the command unless it looks like an option (starts with
 * `-`), in which case the command defaults to `install`. `workspace` and
 * `install-workspace` are aliases for `install` into the workspace target.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {{command: string, target: string, smoke: boolean, force: boolean, help: boolean}}
 *   Parsed options; `target` is an absolute path with a leading `~` expanded.
 * @throws {Error} When `--target` is given without a directory value.
 */
function parseArgs(argv) {
  // Treat every dash-prefixed token as an option so that `-h` on its own is
  // parsed as the help flag instead of being mistaken for a command name.
  const hasCommand = Boolean(argv[0]) && !argv[0].startsWith('-');
  const command = hasCommand ? argv[0] : 'install';
  const args = { command, target: GLOBAL_TARGET, smoke: false, force: false, help: false };

  if (command === 'workspace' || command === 'install-workspace') {
    args.command = 'install';
    args.target = WORKSPACE_TARGET;
  }

  for (let i = hasCommand ? 1 : 0; i < argv.length; i++) {
    const token = argv[i];
    if (token === '--target') {
      const value = argv[i + 1];
      // A missing value used to be ignored silently, which installed into the
      // default target instead of the directory the user meant to name.
      if (!value || value.startsWith('--')) {
        throw new Error('--target requires a directory argument');
      }
      args.target = value;
      i++;
    } else if (token.startsWith('--target=')) {
      const value = token.slice('--target='.length);
      // An empty value would resolve to the current working directory.
      if (!value) {
        throw new Error('--target requires a directory argument');
      }
      args.target = value;
    } else if (token === '--workspace') {
      args.target = WORKSPACE_TARGET;
    } else if (token === '--global') {
      args.target = GLOBAL_TARGET;
    } else if (token === '--smoke') {
      args.smoke = true;
    } else if (token === '--force') {
      args.force = true;
    } else if (token === '--help' || token === '-h') {
      args.help = true;
    }
  }

  args.target = path.resolve(expandHome(args.target));
  return args;
}
40
+
41
/**
 * Print the installer usage text to stdout in a single console.log call.
 */
function printHelp() {
  const helpText = `fmode-vision skill installer

Usage:
 npx @fmode/vision@latest workspace [--smoke] # install into ./.claude/skills/fmode-vision
 npx @fmode/vision@latest install [--smoke] # install into ~/.claude/skills/fmode-vision
 npx @fmode/vision@latest install --target <dir> [--force]
 npx @fmode/vision@latest check
 npx @fmode/vision@latest smoke
 npx @fmode/vision@latest path

Options:
 --workspace Install into ./.claude/skills/fmode-vision
 --global Install into ~/.claude/skills/fmode-vision (default)
 --target <dir> Install into a custom directory
 --force Allow overwriting a custom target
 --smoke Run smoke checks after install
 --help, -h Show help`;
  console.log(helpText);
}
62
+
63
/**
 * Create a directory and any missing parents; an existing directory is fine.
 *
 * @param {string} dirPath - Directory to create.
 */
function ensureDir(dirPath) {
  const options = { recursive: true };
  fs.mkdirSync(dirPath, options);
}
64
+
65
/**
 * Tell whether `childDir` is `parentDir` itself or located somewhere below it.
 * Both paths are resolved to absolute paths first; nothing is read from disk.
 *
 * @param {string} parentDir - Candidate ancestor directory.
 * @param {string} childDir - Path to test.
 * @returns {boolean} True when childDir equals parentDir or is a descendant.
 */
function isInside(parentDir, childDir) {
  const relative = path.relative(path.resolve(parentDir), path.resolve(childDir));
  if (relative === '') return true; // same directory
  // Only a leading `..` path *segment* means "outside". A plain prefix test
  // would wrongly reject children whose name merely begins with two dots
  // (for example `<parent>/..cache`).
  const escapesParent = relative === '..' || relative.startsWith(`..${path.sep}`);
  // path.relative returns an absolute path when no relative route exists
  // (e.g. different drives on Windows).
  return !escapesParent && !path.isAbsolute(relative);
}
69
+
70
/**
 * Decide whether an existing install target may be deleted and replaced.
 *
 * Overwriting is allowed when `force` is set, or when the target is a
 * directory strictly below the workspace or user-level skills root (this
 * includes the default global and workspace targets).
 *
 * @param {string} targetDir - Directory that would be overwritten.
 * @param {boolean} force - True when the user passed `--force`.
 * @returns {boolean} True when the target may be overwritten.
 */
function canOverwriteTarget(targetDir, force) {
  if (force) return true;
  const resolvedTarget = path.resolve(targetDir);
  // The skills root itself is excluded on purpose: replacing it would delete
  // every other skill installed next to this one, so that requires --force.
  return [WORKSPACE_SKILLS_ROOT, GLOBAL_SKILLS_ROOT].some(
    (root) => resolvedTarget !== path.resolve(root) && isInside(root, resolvedTarget)
  );
}
76
+
77
/**
 * Copy a file or a directory tree from `source` to `destination`.
 *
 * Entries named `node_modules`, `outputs` or `.git` are skipped at every
 * directory level. Missing parent directories of the destination are created.
 *
 * @param {string} source - Existing file or directory.
 * @param {string} destination - Path to create.
 */
function copyDirRecursive(source, destination) {
  const sourceIsDirectory = fs.statSync(source).isDirectory();

  if (!sourceIsDirectory) {
    // Plain file: make sure its parent exists, then copy it over.
    ensureDir(path.dirname(destination));
    fs.copyFileSync(source, destination);
    return;
  }

  ensureDir(destination);
  const excludedNames = ['node_modules', 'outputs', '.git'];
  fs.readdirSync(source)
    .filter((entryName) => !excludedNames.includes(entryName))
    .forEach((entryName) => {
      copyDirRecursive(path.join(source, entryName), path.join(destination, entryName));
    });
}
90
+
91
/**
 * Install the bundled skill into `target`, replacing an existing install.
 *
 * @param {string} target - Absolute install directory.
 * @param {boolean} force - Allow overwriting a target outside the known skills roots.
 * @throws {Error} When the bundled skill is missing, when the target overlaps
 *   the bundled source, or when an existing custom target would be
 *   overwritten without `force`.
 */
function installSkill(target, force) {
  if (!fs.existsSync(SKILL_SOURCE)) {
    throw new Error(`Skill source missing: ${SKILL_SOURCE}`);
  }
  // A target that equals or contains the source would delete the source in the
  // rmSync below; a target inside the source would make the recursive copy
  // descend into its own output. Neither can succeed, so refuse up front.
  if (isInside(target, SKILL_SOURCE) || isInside(SKILL_SOURCE, target)) {
    throw new Error(`Refusing to install: target overlaps the bundled skill source: ${target}`);
  }
  if (fs.existsSync(target)) {
    if (!canOverwriteTarget(target, force)) {
      throw new Error(`Refusing to overwrite custom target without --force: ${target}`);
    }
    // Start from a clean directory so files removed in newer versions do not linger.
    fs.rmSync(target, { recursive: true, force: true });
  }
  ensureDir(target);
  copyDirRecursive(SKILL_SOURCE, target);
}
104
+
105
/**
 * Verify that an install directory contains the files the skill needs.
 *
 * The list matches the files the package smoke script requires in the
 * bundled skill, including the room-measurement prompt module that SKILL.md
 * tells users to import.
 *
 * @param {string} target - Install directory to inspect.
 * @returns {{status: string, skill: string, target: string, required: string[]}}
 *   Report object describing the successful check.
 * @throws {Error} When one or more required files are missing.
 */
function checkSkill(target) {
  const required = [
    'SKILL.md',
    'scripts/vision-client.mjs',
    'scripts/prompts/room-measurement.mjs'
  ];
  const missing = required.filter((entry) => !fs.existsSync(path.join(target, entry)));
  if (missing.length) {
    throw new Error(`Install target is missing required files: ${missing.join(', ')}`);
  }
  return { status: 'ok', skill: SKILL_NAME, target, required };
}
113
+
114
/**
 * Run the package smoke script (`scripts/smoke.js`) with the current Node
 * binary, using the package root as working directory and inheriting stdio.
 *
 * @throws {Error} When the script cannot be started, is killed by a signal,
 *   or exits with a non-zero code. The message always starts with
 *   "smoke failed" and names the cause.
 */
function runSmoke() {
  const result = spawnSync(process.execPath, ['scripts/smoke.js'], {
    cwd: SOURCE_ROOT,
    stdio: 'inherit',
    shell: false
  });
  // spawnSync reports start-up failures via `error` and leaves `status` null,
  // so surface the underlying reason instead of a bare "smoke failed".
  if (result.error) {
    throw new Error(`smoke failed: could not run smoke script (${result.error.message})`);
  }
  if (result.status !== 0) {
    const reason = result.signal
      ? `terminated by signal ${result.signal}`
      : `exit code ${result.status}`;
    throw new Error(`smoke failed: ${reason}`);
  }
}
118
+
119
/**
 * Print the post-install summary: where the skill landed, whether it is a
 * project-level or user-level install, token hints and a sample prompt.
 *
 * @param {string} target - Directory the skill was installed into.
 */
function printNextSteps(target) {
  const readyMessage = isInside(WORKSPACE_SKILLS_ROOT, target)
    ? 'Project-level skill is ready. Restart the VSCode Claude Code session if it was open.'
    : 'User-level skill is ready for all Claude Code workspaces.';

  const outputLines = [
    '',
    'Install complete.',
    `Skill installed at: ${target}`,
    '',
    readyMessage,
    '',
    'Token: set FMODE_API_TOKEN, or ~/.fmode/config.json -> fmodeApiToken, or rely on ANTHROPIC_AUTH_TOKEN.',
    '',
    'Try this prompt in Claude Code:',
    ' 帮我分析这张图片里的关键内容,输出结构化信息。'
  ];

  // One console.log call per line, exactly like a sequence of separate prints.
  for (const line of outputLines) {
    console.log(line);
  }
}
136
+
137
/**
 * CLI entry point: parse process arguments and dispatch to the matching
 * command. Unknown commands print the help text and set a failing exit code.
 */
function main() {
  const args = parseArgs(process.argv.slice(2));
  const printJson = (value) => console.log(JSON.stringify(value, null, 2));

  // The help flag takes precedence over whatever command was given.
  const action = args.help ? 'help' : args.command;

  switch (action) {
    case 'help':
      printHelp();
      break;
    case 'path':
      console.log(args.target);
      break;
    case 'install':
      installSkill(args.target, args.force);
      printJson(checkSkill(args.target));
      if (args.smoke) {
        runSmoke();
      }
      printNextSteps(args.target);
      break;
    case 'check':
      printJson(checkSkill(args.target));
      break;
    case 'smoke':
      runSmoke();
      break;
    default:
      printHelp();
      process.exitCode = 1;
  }
}
153
+
154
// Run the CLI; any error is reported as a single line on stderr and the
// process exits with status 1.
try {
  main();
} catch (error) {
  const failureMessage = `fmode-vision failed: ${error.message}`;
  console.error(failureMessage);
  process.exit(1);
}
package/package.json ADDED
@@ -0,0 +1,33 @@
1
+ {
2
+ "name": "@fmode/vision",
3
+ "publishConfig": {
4
+ "access": "public"
5
+ },
6
+ "version": "0.1.0",
7
+ "description": "Claude Code skill: analyze images and videos via Fmode API vision models (api.fmode.cn). Single-pass and multi-pass focused analysis with structured JSON output. Auto-reads token from FMODE_API_TOKEN, ~/.fmode/config.json, project .fmode/config.json, or ANTHROPIC_AUTH_TOKEN.",
8
+ "type": "commonjs",
9
+ "bin": {
10
+ "fmode-vision": "bin/fmode-vision.js"
11
+ },
12
+ "scripts": {
13
+ "smoke": "node scripts/smoke.js"
14
+ },
15
+ "files": [
16
+ ".claude-plugin/",
17
+ "bin/",
18
+ "README.md",
19
+ "scripts/",
20
+ "skill-package-manifest.json",
21
+ "skills/"
22
+ ],
23
+ "keywords": [
24
+ "claude-code",
25
+ "claude-skill",
26
+ "fmode",
27
+ "vision",
28
+ "image-analysis",
29
+ "doubao"
30
+ ],
31
+ "license": "MIT",
32
+ "dependencies": {}
33
+ }
@@ -0,0 +1,28 @@
1
+ #!/usr/bin/env node
2
+ const fs = require('fs');
3
+ const path = require('path');
4
+ const { pathToFileURL } = require('url');
5
+
6
// Package root and the bundled skill directory that the smoke check inspects.
const ROOT = path.resolve(__dirname, '..');
const SKILL_DIR = path.join(ROOT, 'skills', 'fmode-vision');

/**
 * Report a smoke failure on stderr and stop the process with status 1.
 *
 * @param {string} msg - Human-readable failure reason.
 */
function fail(msg) {
  console.error(`SMOKE FAIL: ${msg}`);
  process.exit(1);
}

// Step 1: the files the skill cannot work without must exist in the package.
const required = [
  'SKILL.md',
  'scripts/vision-client.mjs',
  'scripts/prompts/room-measurement.mjs'
];
required.forEach((rel) => {
  const present = fs.existsSync(path.join(SKILL_DIR, rel));
  if (!present) fail(`missing ${rel}`);
});

/**
 * Step 2: load the ESM client through a file URL, check that the expected
 * functions are exported, and run one offline extractJSON sanity check.
 */
async function verifyClientModule() {
  const clientUrl = pathToFileURL(path.join(SKILL_DIR, 'scripts', 'vision-client.mjs')).href;
  const mod = await import(clientUrl);

  const expectedExports = ['resolveApiToken', 'callVisionAPI', 'callMultiPass', 'extractJSON'];
  for (const fn of expectedExports) {
    if (typeof mod[fn] !== 'function') fail(`export ${fn} is not a function`);
  }

  const { parsed } = mod.extractJSON('text {"a":1} tail');
  if (!parsed || parsed.a !== 1) fail('extractJSON did not parse JSON');

  console.log('SMOKE OK: fmode-vision package structure + module exports verified');
}

verifyClientModule().catch((e) => fail(e.message));
@@ -0,0 +1,17 @@
1
+ {
2
+ "name": "fmode-vision",
3
+ "version": "0.1.0",
4
+ "description": "Claude Code 独立技能包:通过 Fmode API 视觉模型对图片、视频进行分析。支持单轮分析、多轮聚焦分析、视频帧分析与批量处理,并内置毛坯房量尺 5-pass 分析提示词。",
5
+ "plugin": "fmode-vision",
6
+ "skills": [
7
+ "fmode-vision"
8
+ ],
9
+ "entrySkill": "fmode-vision",
10
+ "npmPackage": "@fmode/vision",
11
+ "smokeCommand": "npm run smoke",
12
+ "installCommand": "npx @fmode/vision@latest install",
13
+ "workspaceInstallCommand": "npx @fmode/vision@latest workspace",
14
+ "workspaceSkillPath": ".claude/skills/fmode-vision/SKILL.md",
15
+ "globalSkillPath": "%USERPROFILE%/.claude/skills/fmode-vision/SKILL.md",
16
+ "installHint": "工作区安装:npx @fmode/vision@latest workspace,会写入 ./.claude/skills/fmode-vision/。用户级安装:npx @fmode/vision@latest install,会写入 ~/.claude/skills/fmode-vision/。Token 优先级:FMODE_API_TOKEN > ~/.fmode/config.json > 项目 .fmode/config.json > ANTHROPIC_AUTH_TOKEN。"
17
+ }
@@ -0,0 +1,185 @@
1
+ # Fmode Vision Skill — 维护文档
2
+
3
+ ## 项目结构
4
+
5
+ ```
6
+ .claude/skills/fmode-vision/
7
+ ├── SKILL.md # 技能入口,Claude 读取后知道何时及如何使用本技能
8
+ ├── README.md # 本文件:开发者维护文档
9
+ ├── .skillfish.json # 技能元信息(版本、来源仓库)
10
+ └── scripts/
11
+ ├── vision-client.mjs # 核心:通用视觉 API 客户端
12
+ └── prompts/
13
+ └── room-measurement.mjs # 领域模块:毛坯房量尺 5-pass 提示词
14
+ ```
15
+
16
+ ## 核心逻辑
17
+
18
+ ### 1. Token 解析链 (`resolveApiToken`)
19
+
20
+ 四级优先级,短路返回:
21
+
22
+ ```
23
+ FMODE_API_TOKEN 环境变量
24
+ → ~/.fmode/config.json 的 fmodeApiToken / newapiToken 字段
25
+ → <cwd>/.fmode/config.json 的 fmodeApiToken / newapiToken 字段
26
+ → ANTHROPIC_AUTH_TOKEN 环境变量 → 仍未找到则抛出异常(提示用户配置)
27
+ ```
28
+
29
+ 设计原因:环境变量适合 CI/CD;用户级配置适合个人开发机;项目级配置适合团队共享(加入 .gitignore)。
30
+
31
+ ### 2. API 调用流程 (`callVisionAPI`)
32
+
33
+ ```
34
+ 输入: imagePath | imageBase64 | imageUrl | videoUrl
35
+ |
36
+ ├─ 解析 token
37
+ ├─ 构造 messages 数组
38
+ │ ├─ system prompt
39
+ │ └─ user content:
40
+ │ ├─ text part(用户提示词)
41
+ │ └─ image_url / video_url part(视觉内容)
42
+ ├─ POST https://api.fmode.cn/v1/chat/completions
43
+ │ body: { model, messages, temperature, max_tokens }
44
+ ├─ 响应的 content 字符串 → extractJSON()
45
+ └─ 返回 { raw, parsed, error, usage }
46
+ ```
47
+
48
+ ### 3. 多轮分析模式 (`callMultiPass`)
49
+
50
+ 核心理念:每轮独立调用 API,各自聚焦一个分析维度,最后一轮合并。这比单轮全量分析精度更高。
51
+
52
+ ```
53
+ 输入: imagePath + passes[{name, systemPrompt, userPrompt, maxTokens}]
54
+ |
55
+ for each pass:
56
+ ├─ 检查 cacheDir/pass<N>.json 是否存在
57
+ │ ├─ 存在 → 跳过,读取缓存
58
+ │ └─ 不存在 → callVisionAPI() → 写入缓存
59
+ ├─ sleep(delayMs) 避免限流
60
+ |
61
+ └─ 返回 results[]
62
+ ```
63
+
64
+ 缓存设计:
65
+ - 每轮结果独立缓存,支持断点续跑
66
+ - 缓存 key = pass 序号,与提示词内容无关
67
+ - 如需强制重新分析,删除对应缓存文件即可
68
+ - 提示词迭代时,建议手动清理缓存
69
+
70
+ ### 4. JSON 提取 (`extractJSON`)
71
+
72
+ LLM 响应可能被 markdown 代码块包裹(```json ... ```),也可能前后有解释文字。用正则 `/\{[\s\S]*\}/` 贪婪匹配从第一个 `{` 到最后一个 `}` 之间的全部内容,再解析为 JSON 对象(注意:若响应中包含多个独立的 JSON 对象,匹配结果会跨越它们,解析将失败)。
73
+
74
+ ### 5. 毛坯房 5-pass 专用流程 (`room-measurement.mjs`)
75
+
76
+ 继承自 `analyze-photos-v4.mjs`,5 轮各有独立职责:
77
+
78
+ | Pass | 名称 | 分析焦点 | tokens |
79
+ |------|------|---------|--------|
80
+ | 1 | spatial | 空间结构:透视类型、墙面多边形、阴阳角、天地面 | 2000 |
81
+ | 2 | ceiling | 吊顶特征:cornice/trayStep/beam/bulkhead | 1000 |
82
+ | 3 | openings | 门窗洞口:双层框架(outer+inner polygon) | 2500 |
83
+ | 4 | obstacles | 障碍物:插座/开关/电箱/踢脚线/风口等 | 1500 |
84
+ | 5 | merge | 文本合并:场景描述、房间类型、测量计划、质量评估 | 2000 |
85
+
86
+ 质量验证:
87
+ - 踢脚线 height > 10% → 警告(应为 2-5%)
88
+ - 吊顶特征 polygon 顶点 > 4 → 警告
89
+
90
+ ## 配置说明
91
+
92
+ ### API Token
93
+
94
+ 方式一:环境变量
95
+ ```bash
96
+ export FMODE_API_TOKEN="sk-xxxxxxxx"
97
+ ```
98
+
99
+ 方式二:用户级配置 `~/.fmode/config.json`
100
+ ```json
101
+ {
102
+ "fmodeApiToken": "sk-xxxxxxxx"
103
+ }
104
+ ```
105
+
106
+ 方式三:项目级配置 `<project>/.fmode/config.json`(需加入 .gitignore)
107
+ ```json
108
+ {
109
+ "fmodeApiToken": "sk-xxxxxxxx"
110
+ }
111
+ ```
112
+
113
+ ### 可用模型
114
+
115
+ | 模型 ID | 用途 | 备注 |
116
+ |---------|------|------|
117
+ | `doubao-seed-2-0-pro-260215` | 视觉理解(默认) | 豆包视觉模型,性价比高 |
118
+ | `gpt-4o` | 视觉理解 | 如账号有权限 |
119
+
120
+ 模型列表可能更新,以 Fmode API 返回为准。
121
+
122
+ ## 扩展指南
123
+
124
+ ### 添加新的提示词模板
125
+
126
+ 在 `scripts/prompts/` 下新建 `.mjs` 文件:
127
+
128
+ ```js
129
+ import { callVisionAPI, callMultiPass } from '../vision-client.mjs';
130
+
131
+ export const MY_SYSTEM_PROMPT = `...`;
132
+ export const MY_USER_PROMPT = `...`;
133
+
134
+ export async function analyzeSomething(imagePath) {
135
+ const result = await callVisionAPI({
136
+ imagePath,
137
+ systemPrompt: MY_SYSTEM_PROMPT,
138
+ userPrompt: MY_USER_PROMPT,
139
+ maxTokens: 1000,
140
+ });
141
+ return result.parsed;
142
+ }
143
+ ```
144
+
145
+ ### 添加新模型
146
+
147
+ 在 `vision-client.mjs` 的 `DEFAULT_CONFIG` 中调整默认模型,或调用时传入 `model` 参数:
148
+
149
+ ```js
150
+ const result = await callVisionAPI({
151
+ imagePath: '/path/to/img.jpg',
152
+ systemPrompt: '...',
153
+ userPrompt: '...',
154
+ model: 'gpt-4o', // 覆盖默认模型
155
+ });
156
+ ```
157
+
158
+ ### 多轮分析自定义
159
+
160
+ ```js
161
+ import { callMultiPass } from './vision-client.mjs';
162
+
163
+ const results = await callMultiPass({
164
+ imagePath: '/path/to/img.jpg',
165
+ cacheDir: '/tmp/my-analysis/img-001/',
166
+ passes: [
167
+ { name: 'overview', systemPrompt: '...', userPrompt: '描述整体场景', maxTokens: 500 },
168
+ { name: 'details', systemPrompt: '...', userPrompt: '标注细节元素', maxTokens: 1500 },
169
+ { name: 'verify', systemPrompt: '...', userPrompt: '验证前两轮一致性', maxTokens: 1000 },
170
+ ],
171
+ });
172
+ ```
173
+
174
+ ## 依赖
175
+
176
+ 仅使用 Node.js 内置模块:`fs`, `path`, `os`。无需 `npm install`。
177
+
178
+ 全局 `fetch` 需要 Node.js 18+(已内置)。
179
+
180
+ ## 源文件参考
181
+
182
+ 本技能从以下文件提取和通用化:
183
+ - `d:\workspace\hundun\lami-scale-canvas\scripts\analyze-photos-v4.mjs` — 原始 5-pass 量尺分析实现
184
+
185
+ 版本追踪:lami-scale-canvas 仓库 `analyze-photos-v4.mjs` 如有更新,需同步检查本技能是否需要升级。
@@ -0,0 +1,167 @@
1
+ ---
2
+ name: fmode-vision
3
+ description: "通过 Fmode API 调用视觉模型对图片、视频进行分析。适用场景:(1) 图片内容识别与结构化提取, (2) 多轮聚焦分析获取高精度结果, (3) 视频帧分析, (4) 视觉素材批量处理"
4
+ description_en: "Analyze images and videos via Fmode API vision models. Use for: (1) Image content recognition and structured extraction, (2) Multi-pass focused analysis for high-precision results, (3) Video frame analysis, (4) Batch visual material processing"
5
+ ---
6
+
7
+ # Fmode Vision — 视觉识别技能
8
+
9
+ ## Overview
10
+
11
+ 本技能封装 Fmode API (api.fmode.cn) 的视觉模型调用,支持单轮和多轮分析。用户可能要求你分析图片、处理视频帧、或对视觉素材进行结构化信息提取。
12
+
13
+ ## Token 获取
14
+
15
+ 首次使用需获取 API token,按以下优先级查找:
16
+
17
+ 1. **环境变量** `FMODE_API_TOKEN`(最高优先级)
18
+ ```bash
19
+ echo $FMODE_API_TOKEN
20
+ ```
21
+ 2. **用户级配置** `~/.fmode/config.json` → `fmodeApiToken` 字段
22
+ ```bash
23
+ cat ~/.fmode/config.json
24
+ ```
25
+ 3. **项目级配置** `<project>/.fmode/config.json` → `fmodeApiToken`
26
+
27
+ 如果以上三级都未找到,还会回退到 `ANTHROPIC_AUTH_TOKEN` 环境变量(在 Claude Code 内通常已存在);仍未找到时,提示用户提供 token。
28
+
29
+ ## API 调用规范
30
+
31
+ - **Base URL**: `https://api.fmode.cn`
32
+ - **Endpoint**: `POST /v1/chat/completions`
33
+ - **Auth**: `Authorization: Bearer <token>`
34
+ - **默认视觉模型**: `doubao-seed-2-0-pro-260215`
35
+ - **备选模型**: `gpt-4o`, `gpt-4-vision-preview`(如有权限)
36
+
37
+ ### 请求体结构
38
+
39
+ ```json
40
+ {
41
+ "model": "doubao-seed-2-0-pro-260215",
42
+ "messages": [
43
+ { "role": "system", "content": "系统提示词" },
44
+ {
45
+ "role": "user",
46
+ "content": [
47
+ { "type": "text", "text": "用户提示词" },
48
+ { "type": "image_url", "image_url": { "url": "data:image/jpeg;base64,<base64>" } }
49
+ ]
50
+ }
51
+ ],
52
+ "temperature": 0.12,
53
+ "max_tokens": 2000
54
+ }
55
+ ```
56
+
57
+ ### 视觉内容支持
58
+
59
+ | 类型 | 传递方式 | 适用场景 |
60
+ |------|---------|---------|
61
+ | 本地图片 | `data:image/<fmt>;base64,<data>` | jpg/png/webp |
62
+ | 远程图片 | 直接 URL | 需模型支持公网访问 |
63
+ | 视频 | `type: "video_url"` | 模型自动抽帧 |
64
+
65
+ ## 核心工作流
66
+
67
+ ### 决策树
68
+
69
+ ```
70
+ 需要分析视觉内容?
71
+ ├── 简单描述/单维度提取 → 单轮分析 (callVisionAPI)
72
+ ├── 多维度精确标注 → 多轮聚焦分析 (callMultiPass)
73
+ │ ├── 每轮独立调用 API,专注一个维度
74
+ │ ├── 中间结果写入缓存目录
75
+ │ └── 最后一轮合并所有结果
76
+ └── 批量处理 → 遍历 + 单轮/多轮
77
+ ```
78
+
79
+ ### 单轮分析
80
+
81
+ 使用 `scripts/vision-client.mjs` 的 `callVisionAPI()` 函数:
82
+
83
+ ```js
84
+ import { callVisionAPI, resolveApiToken } from './scripts/vision-client.mjs';
85
+
86
+ const result = await callVisionAPI({
87
+ imagePath: '/path/to/image.jpg',
88
+ systemPrompt: '你是一位影像分析专家...',
89
+ userPrompt: '请描述这张图片中的关键元素...',
90
+ maxTokens: 1000,
91
+ });
92
+ // result = { raw, parsed, error, usage }
93
+ ```
94
+
95
+ ### 多轮聚焦分析
96
+
97
+ 使用 `callMultiPass()` 封装,适用于需要从不同维度精确分析的场景:
98
+
99
+ ```js
100
+ import { callMultiPass } from './scripts/vision-client.mjs';
101
+
102
+ const passes = [
103
+ { name: 'structure', systemPrompt: '...', userPrompt: '...', maxTokens: 2000 },
104
+ { name: 'details', systemPrompt: '...', userPrompt: '...', maxTokens: 1000 },
105
+ ];
106
+
107
+ const results = await callMultiPass({
108
+ imagePath: '/path/to/image.jpg',
109
+ passes,
110
+ cacheDir: '/tmp/analysis/image-id/',
111
+ });
112
+ ```
113
+
114
+ ## 提示词工程
115
+
116
+ ### 结构化输出
117
+
118
+ 始终要求模型输出严格 JSON,在 system prompt 中给出完整 schema:
119
+
120
+ ```
121
+ ## 输出格式(严格JSON,无markdown代码块)
122
+ {
123
+ "field1": "value",
124
+ "field2": [{ "sub": "value" }]
125
+ }
126
+ ```
127
+
128
+ ### 聚焦原则
129
+
130
+ 多轮分析中每轮只关注一个维度,明确告知模型忽略其他内容:
131
+ ```
132
+ ## 规则
133
+ 1. 只标注 X 类元素,忽略 Y、Z 等其他所有元素
134
+ 2. 每个元素标注精确的 boundingBox
135
+ ```
136
+
137
+ ### JSON 提取
138
+
139
+ 模型可能包裹 markdown 代码块,使用 `extractJSON()` 提取:
140
+
141
+ ```js
142
+ import { extractJSON } from './scripts/vision-client.mjs';
143
+ const { parsed } = extractJSON(rawResponse);
144
+ ```
145
+
146
+ ## 结果缓存
147
+
148
+ 多轮分析支持中间结果缓存:
149
+ - 每轮结果写入 `cacheDir/pass<N>.json`
150
+ - 重新运行时自动跳过已有缓存
151
+ - 如需强制重新分析,删除对应缓存文件
152
+
153
+ ## 领域模块
154
+
155
+ ### 毛坯房量尺分析
156
+
157
+ `scripts/prompts/room-measurement.mjs` 提供 5-pass 量尺分析提示词:
158
+ - Pass 1: 空间结构(透视/墙面/阴阳角)
159
+ - Pass 2: 吊顶特征(cornice/trayStep/beam/bulkhead)
160
+ - Pass 3: 门窗洞口(双层框架)
161
+ - Pass 4: 障碍物(精确 boundingBox)
162
+ - Pass 5: 合并 + 测量计划
163
+
164
+ ```js
165
+ import { processPhoto } from './scripts/prompts/room-measurement.mjs';
166
+ const merged = await processPhoto('/path/to/photo.jpg', 'photo-001', 'photo-001.jpg');
167
+ ```
@@ -0,0 +1,408 @@
1
+ /**
2
+ * 毛坯房量尺 — 5轮聚焦提示词模块
3
+ *
4
+ * 从 analyze-photos-v4.mjs 移植,API 调用委托给 vision-client.mjs。
5
+ *
6
+ * 使用示例:
7
+ * import { processPhoto, PASS_CONFIGS } from './prompts/room-measurement.mjs';
8
+ * const result = await processPhoto('/path/to/photo.jpg', 'img-001', 'room-a.jpg');
9
+ */
10
+
11
+ import fs from 'fs';
12
+ import path from 'path';
13
+ import { callVisionAPI } from '../vision-client.mjs';
14
+
15
+ // ============================================================
16
+ // 5轮聚焦提示词
17
+ // ============================================================
18
+
19
+ export const PASS1_SYSTEM = `你是一位建筑空间分析专家。你的任务是精确分析毛坯房照片的**空间结构**。
20
+
21
+ ## 规则
22
+ 1. **透视类型**:判断一点透视/两点透视/三点透视。
23
+ - 一点透视:正面墙正对镜头,水平线汇聚到画面中心
24
+ - 两点透视:墙角在画面中心附近,两侧墙面分别向左右消失
25
+ - 三点透视:仰拍/俯拍导致垂直线也汇聚
26
+ - 特别注意:如果看到两个墙面以夹角呈现(墙角在画面中心附近),必须报告 twoPoint
27
+
28
+ 2. **墙面多边形**:每面可见墙标注**精确的4个角点**(四边形),沿建筑实际边缘。
29
+ - surfaceType: facing(正面)/leftWall(左墙)/rightWall(右墙)
30
+ - 每条边放3个等分测量点(measurePoints)
31
+
32
+ 3. **天花/地面区域**:各标注4个角点的多边形
33
+
34
+ 4. **阴阳角**:标注位置(x,y)
35
+
36
+ 5. **忽略**所有小物件、家具、装饰、门窗、吊顶细节——这些会在后续分析中处理
37
+
38
+ ## 输出格式(严格JSON,无markdown代码块)
39
+ {
40
+ "pass": 1,
41
+ "perspective": {"type": "onePoint|twoPoint|threePoint", "description": "透视说明", "vanishingPoints": [{"x": 50, "y": 40}]},
42
+ "surfaces": {
43
+ "walls": [
44
+ {"id": "w1", "label": "正面主墙", "surfaceType": "facing",
45
+ "polygon": [{"x":20,"y":25},{"x":75,"y":25},{"x":75,"y":82},{"x":20,"y":80}],
46
+ "measureLines": [
47
+ {"label":"顶边3点","type":"horizontal","edge":"top","startPoint":{"x":20,"y":25},"endPoint":{"x":75,"y":25},"measurePoints":[{"x":20,"y":25},{"x":47.5,"y":25},{"x":75,"y":25}]},
48
+ {"label":"底边3点","type":"horizontal","edge":"bottom","startPoint":{"x":20,"y":80},"endPoint":{"x":75,"y":82},"measurePoints":[{"x":20,"y":80},{"x":47.5,"y":81},{"x":75,"y":82}]},
49
+ {"label":"左边3点","type":"vertical","edge":"left","startPoint":{"x":20,"y":25},"endPoint":{"x":20,"y":80},"measurePoints":[{"x":20,"y":25},{"x":20,"y":52.5},{"x":20,"y":80}]},
50
+ {"label":"右边3点","type":"vertical","edge":"right","startPoint":{"x":75,"y":25},"endPoint":{"x":75,"y":82},"measurePoints":[{"x":75,"y":25},{"x":75,"y":53.5},{"x":75,"y":82}]}
51
+ ]}
52
+ ],
53
+ "floorRegion": {"polygon": [{"x":0,"y":80},{"x":100,"y":80},{"x":100,"y":100},{"x":0,"y":100}], "label": "可见地面"},
54
+ "ceilingRegion": {"polygon": [{"x":0,"y":0},{"x":100,"y":0},{"x":100,"y":20},{"x":0,"y":20}], "label": "可见天花"}
55
+ },
56
+ "corners": [
57
+ {"id":"c1","type":"internal","label":"左阴角","position":{"x":20,"y":55}},
58
+ {"id":"c2","type":"internal","label":"右阴角","position":{"x":75,"y":55}}
59
+ ]
60
+ }`;
61
+
62
+ export const PASS1_USER = `请分析这张照片的**空间结构**:
63
+ 1. 判断透视类型(一点/两点/三点),找消失点
64
+ 2. 标注每面可见墙的4角多边形,区分facing/leftWall/rightWall
65
+ 3. 标注天花/地面区域
66
+ 4. 标注阴阳角位置
67
+
68
+ 只输出JSON,不包含其他内容:`;
69
+
70
+ export const PASS2_SYSTEM = `你是一位吊顶与天花结构分析专家。你的任务是精确分析照片中的**天花板特征**。
71
+
72
+ ## 规则
73
+ 1. **只标注天花板上的结构特征**,忽略墙面、地面、门窗、障碍物
74
+ 2. **关键:每个特征必须用4个角点的简单四边形标注**。即使实际形状不规则,也只能用4点近似。禁止使用5点或更多点。
75
+ 3. 特征类型:
76
+ - cornice: 石膏线/阴角线(天花与墙面交界处的装饰线条)
77
+ - trayStep: 吊顶叠级/双眼皮(不同高度的吊顶分界线)
78
+ - beam: 梁/下返结构
79
+ - bulkhead: 窗帘盒/设备带(局部下返区域)
80
+ - soffit: 管道包封/检修口
81
+ 4. polygon的4个点按顺时针方向标注
82
+
83
+ ## 输出格式(严格JSON,无markdown代码块)
84
+ {
85
+ "pass": 2,
86
+ "ceilingFeatures": [
87
+ {"id":"cf1","type":"cornice","label":"石膏阴角线",
88
+ "polygon": [{"x":0,"y":8},{"x":100,"y":8},{"x":100,"y":12},{"x":0,"y":12}]},
89
+ {"id":"cf2","type":"trayStep","label":"第一层叠级线",
90
+ "polygon": [{"x":20,"y":22},{"x":80,"y":22},{"x":80,"y":26},{"x":20,"y":26}]}
91
+ ]
92
+ }
93
+
94
+ 如果没有可见的天花特征,返回空数组:{"pass":2,"ceilingFeatures":[]}`;
95
+
96
+ export const PASS2_USER = `请分析这张照片的**天花板特征**:
97
+ 1. 石膏线/阴角线(cornice)
98
+ 2. 吊顶叠级/双眼皮(trayStep)
99
+ 3. 梁/下返结构(beam)
100
+ 4. 窗帘盒/设备带(bulkhead)
101
+
102
+ 记住:每个特征只能用4个角点标注!简单四边形!
103
+
104
+ 只输出JSON:`;
105
+
106
+ export const PASS3_SYSTEM = `你是一位门窗洞口测量专家。你的任务是精确分析照片中的**所有门洞和窗洞**。
107
+
108
+ ## 规则
109
+ 1. **只标注门洞和窗洞**,忽略其他所有元素(墙壁、天花、障碍物等)
110
+ 2. 每个洞口标注**双层框架**:
111
+ - outerPolygon: 洞口在墙面上的外轮廓(4个角点,即墙面上的实际开口边缘)
112
+ - innerPolygon: 门扇/窗扇/玻璃区域的内轮廓(4个角点)
113
+ - frameThickness: 门套/窗套线宽度(百分比),如无套线则为0
114
+ 3. 测量线沿外框放置:上中下宽度3点 + 左中右高度3点
115
+ 4. 如果无可见洞口,返回空数组
116
+
117
+ ## 输出格式(严格JSON,无markdown代码块)
118
+ {
119
+ "pass": 3,
120
+ "openings": [
121
+ {"id":"d1","type":"door","label":"入户门",
122
+ "frame": {
123
+ "outerPolygon": [{"x":35,"y":20},{"x":55,"y":18},{"x":55,"y":80},{"x":35,"y":82}],
124
+ "innerPolygon": [{"x":37,"y":22},{"x":53,"y":20},{"x":53,"y":78},{"x":37,"y":80}],
125
+ "frameThickness": 2.0
126
+ },
127
+ "measureLines": [
128
+ {"label":"门洞上口宽","type":"horizontal","startPoint":{"x":35,"y":20},"endPoint":{"x":55,"y":18},"measurePoints":[{"x":35,"y":20},{"x":45,"y":19},{"x":55,"y":18}]},
129
+ {"label":"门洞左口高","type":"vertical","startPoint":{"x":35,"y":20},"endPoint":{"x":35,"y":82},"measurePoints":[{"x":35,"y":20},{"x":35,"y":51},{"x":35,"y":82}]}
130
+ ]}
131
+ ]
132
+ }`;
133
+
134
+ export const PASS3_USER = `请分析这张照片的**所有门洞和窗洞**:
135
+ 1. 标注外层框架(outerPolygon,墙上开口的精确边缘)
136
+ 2. 标注内层框架(innerPolygon,门扇/玻璃边缘)
137
+ 3. 标注门套/窗套厚度(frameThickness)
138
+ 4. 放置测量点
139
+
140
+ 只输出JSON:`;
141
+
142
+ export const PASS4_SYSTEM = `你是一位全屋定制障碍物检测专家。你的任务是精确标注照片中**所有可见障碍物**的包围盒。
143
+
144
+ ## 核心原则
145
+ 每个包围盒(boundingBox)告诉测量人员"需要测量这个矩形区域的实际尺寸"。你必须非常精确——贴合物体的真实可见边缘。
146
+
147
+ ## 障碍物类型
148
+ - outlet(插座): 86型约2%×2%, 118型约3%×2%
149
+ - switch(开关): 同插座
150
+ - electricBox(电箱): 箱体外框,通常5-15%
151
+ - vent(风口): 格栅外框在吊顶/墙上
152
+ - pipe(管道): 管道与墙/地接触范围
153
+ - baseboard(踢脚线): 墙底水平条带
154
+ - doorFrame(门套线): 门套在墙上的宽度条带
155
+ - windowFrame(窗套线): 窗套在墙上的范围
156
+ - gasMeter(燃气表): 表箱外框
157
+ - floorDrain(地漏): 地面位置
158
+ - downlight(筒灯): 天花位置
159
+
160
+ ## ⚠️ 踢脚线高度规则(非常重要!)
161
+ - 踢脚线(baseboard)的高度必须在 2%-5% 之间
162
+ - 这是踢脚线条带**本身**的高度,不是从踢脚线到墙顶的距离
163
+ - 正面墙(facing)踢脚线:沿着墙底的水平窄条,height = 2-4%
164
+ - 侧墙(leftWall/rightWall)踢脚线:height = 2-5%(不要被透视缩短误导!)
165
+ - **如果标注的height > 10%,一定是错误的——请重新检查!**那是整面墙的高度,不是踢脚线
166
+ - 侧墙的踢脚线:看墙底部那条水平的细线/条带,标注那条条带的高度
167
+
168
+ ## 包围盒格式
169
+ boundingBox: { x, y, width, height } — 全部百分比
170
+ - x, y: 包围盒左上角相对于图片的百分比位置
171
+ - width, height: 包围盒的宽高百分比
172
+
173
+ ## 输出格式(严格JSON,无markdown代码块)
174
+ {
175
+ "pass": 4,
176
+ "obstacles": [
177
+ {"id":"obs1","type":"outlet","label":"五孔插座(86型)","boundingBox":{"x":42,"y":56,"width":2.5,"height":3.2}},
178
+ {"id":"obs2","type":"baseboard","label":"木质踢脚线","boundingBox":{"x":20,"y":80,"width":55,"height":3}},
179
+ {"id":"obs3","type":"vent","label":"空调出风口","boundingBox":{"x":8,"y":10,"width":14,"height":4}}
180
+ ]
181
+ }`;
182
+
183
+ export const PASS4_USER = `请分析这张照片的**所有障碍物**:
184
+ 1. 插座、开关、电箱
185
+ 2. 风口(空调、新风、排风)
186
+ 3. 管道
187
+ 4. 踢脚线(⚠️ height必须2-5%,不能是整面墙高度!)
188
+ 5. 门套线、窗套线
189
+ 6. 燃气表、地漏
190
+ 7. 筒灯、射灯
191
+
192
+ 每个障碍物用精确的boundingBox{x,y,width,height}标注。
193
+ 只输出JSON:`;
194
+
195
+ export const PASS5_SYSTEM = `你是一位全屋定制测量专家。你有4份针对同一房间的分析数据,分别来自不同专家的独立观察。请将它们合并为一份完整的测量分析报告。
196
+
197
+ ## 你的任务
198
+ 1. 阅读4份数据,理解空间结构
199
+ 2. 写出 sceneDescription(完整的场景描述,2-3句话)
200
+ 3. 判断 roomType(卧室/客厅/厨房/卫生间/阳台/走廊/储物间/其他)
201
+ 4. 生成 measurementPlan(测量计划),将所有元素关联到测量步骤
202
+ 5. 评估 photoQuality(是否广角、畸变程度、是否需要补拍)
203
+ 6. 列出 issues(如有遮挡、光线不足等问题)
204
+
205
+ ## 测量计划规则
206
+ - 每面墙至少一个步骤(3点宽+3点高)
207
+ - 每个门洞/窗洞一个步骤
208
+ - 每组同类障碍物可以合并为一个步骤(如"测量所有插座位置")
209
+ - 步骤按重要性排序:required > recommended > optional
210
+ - elementIds必须引用实际存在的ID(来自输入数据)
211
+ - 工具:激光测距仪(长距离)、卷尺(小尺寸)、水平仪(垂直度)
212
+
213
+ ## 输出格式(严格JSON,无markdown代码块)
214
+ {
215
+ "pass": 5,
216
+ "sceneDescription": "完整的场景描述...",
217
+ "roomType": "卧室",
218
+ "measurementPlan": [
219
+ {"step":1,"action":"测量正面主墙顶中底3点宽度与左中右3点高度","target":"w1","tool":"激光测距仪","priority":"required","elementIds":["w1"]}
220
+ ],
221
+ "photoQuality": {"isWideAngle":true,"distortionLevel":"low","recommendReshoot":false,"reshootAdvice":""},
222
+ "issues": []
223
+ }`;
224
+
225
+ export const PASS5_USER_TEMPLATE = `以下是一个房间的4份独立分析数据。请将它们合并:
226
+
227
+ === 空间结构 ===
228
+ __PASS1__
229
+
230
+ === 吊顶特征 ===
231
+ __PASS2__
232
+
233
+ === 门窗洞口 ===
234
+ __PASS3__
235
+
236
+ === 障碍物 ===
237
+ __PASS4__
238
+
239
+ 请生成完整的测量分析报告。只输出JSON:`;
240
+
241
+ // ============================================================
242
+ // 轮次配置(供 callMultiPass 使用)
243
+ // ============================================================
244
+
245
// Per-pass settings for the four vision passes, in execution order. Each row
// is [name, system prompt, user prompt, max tokens] and is expanded into an
// object with the keys name / systemPrompt / userPrompt / maxTokens.
export const PASS_CONFIGS = [
  ['spatial', PASS1_SYSTEM, PASS1_USER, 2000],
  ['ceiling', PASS2_SYSTEM, PASS2_USER, 1000],
  ['openings', PASS3_SYSTEM, PASS3_USER, 2500],
  ['obstacles', PASS4_SYSTEM, PASS4_USER, 1500],
].map(([name, systemPrompt, userPrompt, maxTokens]) => ({
  name,
  systemPrompt,
  userPrompt,
  maxTokens,
}));
251
+
252
+ // ============================================================
253
+ // 合并函数
254
+ // ============================================================
255
+
256
/**
 * Merge the per-pass analysis results of one photo into a single report.
 *
 * Pass results are read by position: index 0 = spatial, 1 = ceiling,
 * 2 = openings, 3 = obstacles, 4 = merge. Model output is not trusted to have
 * the requested shape: missing passes, non-object `parsed` values and
 * non-array list fields fall back to empty defaults instead of throwing.
 *
 * Two quality checks print warnings to stdout (they never fail the merge):
 * baseboards with a bounding-box height above 10%, and ceiling features whose
 * polygon has more than 4 vertices.
 *
 * @param {string} photoId - Photo identifier stored in the report.
 * @param {string} fileName - Original file name stored in the report.
 * @param {Array<Object|null>} passResults - Per-pass entries ({ name, parsed, error, usage }).
 * @returns {Object} Report with `version`, `photoId`, `fileName`, `analyzedAt`,
 *   a `passes` status summary and the merged `parsed` data.
 */
export function mergeResults(photoId, fileName, passResults) {
  const results = Array.isArray(passResults) ? passResults : [];

  // `parsed` of pass N, or {} when the pass is missing or did not yield an object.
  const parsedOf = (index) => {
    const parsed = results[index]?.parsed;
    return parsed !== null && typeof parsed === 'object' ? parsed : {};
  };
  // List fields come straight from the model; anything that is not an array is dropped.
  const asArray = (value) => (Array.isArray(value) ? value : []);

  const [p1, p2, p3, p4, p5] = [0, 1, 2, 3, 4].map(parsedOf);

  const merged = {
    version: 'v4-multipass',
    photoId,
    fileName,
    analyzedAt: new Date().toISOString(),
    passes: results.map((p, i) => {
      // A null/undefined entry (e.g. an empty cache file) is reported as an
      // error instead of crashing on property access.
      if (p == null) {
        return { pass: i + 1, name: `pass${i + 1}`, status: 'error', error: 'missing pass result', usage: null };
      }
      return {
        pass: i + 1,
        name: p.name || `pass${i + 1}`,
        status: p.error ? 'error' : 'ok',
        error: p.error || null,
        usage: p.usage || null,
      };
    }),
    parsed: {
      sceneDescription: p5.sceneDescription || '',
      roomType: p5.roomType || '',
      perspective: p1.perspective || { type: 'onePoint', description: '', vanishingPoints: [] },
      surfaces: p1.surfaces || { walls: [], floorRegion: null, ceilingRegion: null },
      openings: asArray(p3.openings),
      ceilingFeatures: asArray(p2.ceilingFeatures),
      corners: asArray(p1.corners),
      obstacles: asArray(p4.obstacles),
      measurementPlan: asArray(p5.measurementPlan),
      issues: asArray(p5.issues),
      photoQuality: p5.photoQuality || { isWideAngle: false, distortionLevel: 'unknown', recommendReshoot: false, reshootAdvice: '' },
    },
  };

  // Quality check: a baseboard taller than 10% of the image is almost
  // certainly the whole wall mislabelled (expected range is 2-5%).
  const suspiciousBaseboards = merged.parsed.obstacles.filter(
    (o) => o?.type === 'baseboard' && o.boundingBox?.height > 10
  );
  if (suspiciousBaseboards.length > 0) {
    console.log(` ⚠ 发现 ${suspiciousBaseboards.length} 个异常踢脚线高度>10%:`);
    suspiciousBaseboards.forEach((o) => {
      console.log(` ${o.id}: height=${o.boundingBox.height}% (预计2-5%)`);
    });
  }

  // Quality check: the ceiling prompt asks for 4-point polygons only.
  const complexCeilings = merged.parsed.ceilingFeatures.filter(
    (cf) => Array.isArray(cf?.polygon) && cf.polygon.length > 4
  );
  if (complexCeilings.length > 0) {
    console.log(` ⚠ 发现 ${complexCeilings.length} 个吊顶特征顶点>4:`);
    complexCeilings.forEach((cf) => {
      console.log(` ${cf.id}: ${cf.polygon.length}点 (期望4点)`);
    });
  }

  return merged;
}
314
+
315
+ // ============================================================
316
+ // 主流程:处理单张照片
317
+ // ============================================================
318
+
319
/**
 * Run the 5-pass analysis on a single unfinished-room photo.
 *
 * Passes 1-4 iterate over PASS_CONFIGS and send the image to the vision API.
 * Pass 5 sends no image: it fills PASS5_USER_TEMPLATE with the parsed output
 * of passes 1-4 and asks the model to merge them. Each pass result is written
 * to `<cacheDir>/pass<N>.json`, and a pass whose cache file already exists
 * is not re-run. Failed passes are cached as error entries too, so delete the
 * file to retry one. A cache file that cannot be parsed is treated as missing
 * and the pass is executed again.
 *
 * @param {string} imagePath  Path of the image to analyze.
 * @param {string} photoId    Photo ID (used to name the default cache directory).
 * @param {string} fileName   Original file name.
 * @param {Object} [opts]
 * @param {string} [opts.cacheDir]  Cache directory, default './output/v4/<photoId>'.
 * @param {string} [opts.model]     Model name.
 * @param {number} [opts.delayMs=1500]  Pause after each executed pass 1-4.
 * @returns {Promise<Object>} The merged analysis result from mergeResults().
 */
export async function processPhoto(imagePath, photoId, fileName, opts = {}) {
  const cacheDir = opts.cacheDir || path.resolve('./output/v4', photoId);
  const delayMs = opts.delayMs ?? 1500;

  // Created once up front so every cache write below, pass 5 included, has a
  // directory to land in.
  fs.mkdirSync(cacheDir, { recursive: true });

  // Returns the cached entry, or null when the file is absent or unreadable.
  // An interrupted earlier run can leave a truncated file behind; that must
  // not make every later run crash.
  const readCachedPass = (cacheFile, label) => {
    if (!fs.existsSync(cacheFile)) return null;
    try {
      return JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
    } catch (e) {
      console.log(` ⚠ ${label} 缓存损坏,重新执行: ${e.message}`);
      return null;
    }
  };

  // Calls the API and converts a thrown error into an error entry. Cache
  // writing stays outside this helper so a disk failure is not misreported as
  // an API failure.
  const runPass = async (pass, name, request) => {
    try {
      const result = await callVisionAPI(request);
      console.log(` ${result.error ? '✗ ' + result.error : '✓ OK'} | tokens:${result.usage?.total_tokens || '?'}`);
      return { pass, name, ...result };
    } catch (e) {
      console.log(` ✗ ${e.message}`);
      return { pass, name, error: e.message, parsed: null, usage: null };
    }
  };

  // Passes 1-4: visual analysis.
  const passResults = [];

  for (const cfg of PASS_CONFIGS) {
    // Any name other than the three listed here is stored as pass 4.
    const passNum = cfg.name === 'spatial' ? 1 : cfg.name === 'ceiling' ? 2 : cfg.name === 'openings' ? 3 : 4;
    const cacheFile = path.join(cacheDir, `pass${passNum}.json`);

    const cached = readCachedPass(cacheFile, `Pass ${passNum}`);
    if (cached) {
      console.log(` Pass ${passNum} (${cfg.name}): 已有缓存,跳过`);
      passResults.push(cached);
      continue;
    }

    console.log(` Pass ${passNum} (${cfg.name}, ${cfg.maxTokens}t)...`);
    const entry = await runPass(passNum, cfg.name, {
      imagePath,
      systemPrompt: cfg.systemPrompt,
      userPrompt: cfg.userPrompt,
      maxTokens: cfg.maxTokens,
      model: opts.model,
    });
    fs.writeFileSync(cacheFile, JSON.stringify(entry, null, 2));
    passResults.push(entry);

    await new Promise((resolve) => setTimeout(resolve, delayMs));
  }

  // Pass 5: text-only merge of the previous passes.
  const pass5File = path.join(cacheDir, 'pass5.json');
  const cachedMerge = readCachedPass(pass5File, 'Pass 5');
  if (cachedMerge) {
    console.log(' Pass 5 (merge): 已有缓存,跳过');
    passResults.push(cachedMerge);
  } else {
    console.log(' Pass 5 (merge, 2000t)...');
    const passJson = (index) => JSON.stringify(passResults[index]?.parsed || {}, null, 2);
    const p1Json = passJson(0);
    const p2Json = passJson(1);
    const p3Json = passJson(2);
    const p4Json = passJson(3);

    // Function replacers insert the JSON verbatim. With a string replacement,
    // sequences such as `$&` or `$'` inside model output would be expanded by
    // String.prototype.replace and corrupt the prompt.
    const mergePrompt = PASS5_USER_TEMPLATE
      .replace('__PASS1__', () => p1Json)
      .replace('__PASS2__', () => p2Json)
      .replace('__PASS3__', () => p3Json)
      .replace('__PASS4__', () => p4Json);

    const entry = await runPass(5, 'merge', {
      systemPrompt: PASS5_SYSTEM,
      userPrompt: mergePrompt,
      maxTokens: 2000,
      model: opts.model,
    });
    fs.writeFileSync(pass5File, JSON.stringify(entry, null, 2));
    passResults.push(entry);
  }

  return mergeResults(photoId, fileName, passResults);
}
@@ -0,0 +1,254 @@
1
+ /**
2
+ * Fmode Vision API 通用客户端
3
+ *
4
+ * 功能:
5
+ * - resolveApiToken() 四级优先级获取 API token
6
+ * - callVisionAPI() 单轮视觉分析
7
+ * - callMultiPass() 多轮聚焦分析(支持缓存)
8
+ * - extractJSON() 从 LLM 响应提取 JSON
9
+ */
10
+
11
+ import fs from 'fs';
12
+ import path from 'path';
13
+ import os from 'os';
14
+
15
+ // ============================================================
16
+ // Token 解析
17
+ // ============================================================
18
+
19
+ /**
20
+ * 三级优先级获取 API token:
21
+ * 1. 环境变量 FMODE_API_TOKEN
22
+ * 2. ~/.fmode/config.json → fmodeApiToken 或 newapiToken
23
+ * 3. <cwd>/.fmode/config.json → fmodeApiToken 或 newapiToken
24
+ *
25
+ * @param {string} [projectRoot] 项目根目录,默认 process.cwd()
26
+ * @returns {{ token: string, source: string }}
27
+ */
28
+ export function resolveApiToken(projectRoot) {
29
+ // 1. 环境变量
30
+ if (process.env.FMODE_API_TOKEN) {
31
+ return { token: process.env.FMODE_API_TOKEN, source: 'env:FMODE_API_TOKEN' };
32
+ }
33
+
34
+ // 2. 用户级配置 ~/.fmode/config.json
35
+ const userConfigPath = path.join(os.homedir(), '.fmode', 'config.json');
36
+ const userToken = readTokenFromConfig(userConfigPath);
37
+ if (userToken) {
38
+ return { token: userToken, source: userConfigPath };
39
+ }
40
+
41
+ // 3. 项目级配置 <project>/.fmode/config.json
42
+ const root = projectRoot || process.cwd();
43
+ const projectConfigPath = path.join(root, '.fmode', 'config.json');
44
+ const projectToken = readTokenFromConfig(projectConfigPath);
45
+ if (projectToken) {
46
+ return { token: projectToken, source: projectConfigPath };
47
+ }
48
+
49
+ // 4. 兜底:Claude Code 注入的 ANTHROPIC_AUTH_TOKEN(与 fmode token 同源)
50
+ if (process.env.ANTHROPIC_AUTH_TOKEN) {
51
+ return { token: process.env.ANTHROPIC_AUTH_TOKEN, source: 'env:ANTHROPIC_AUTH_TOKEN' };
52
+ }
53
+
54
+ throw new Error(
55
+ '未找到 Fmode API token。请通过以下任一方式提供:\n' +
56
+ ' 1. 环境变量 FMODE_API_TOKEN\n' +
57
+ ' 2. ~/.fmode/config.json 中 fmodeApiToken 字段\n' +
58
+ ' 3. 项目 .fmode/config.json 中 fmodeApiToken 字段\n' +
59
+ ' 4. 环境变量 ANTHROPIC_AUTH_TOKEN'
60
+ );
61
+ }
62
+
63
/**
 * Best-effort read of a token from a JSON config file.
 *
 * Prefers the `fmodeApiToken` field and falls back to `newapiToken`.
 * Any failure (missing file, unreadable file, invalid JSON) yields null
 * rather than an exception, so callers can simply try the next source.
 *
 * @param {string} configPath Path of the config file.
 * @returns {string|null} The token, or null when none is available.
 */
function readTokenFromConfig(configPath) {
  try {
    if (fs.existsSync(configPath)) {
      const { fmodeApiToken, newapiToken } = JSON.parse(fs.readFileSync(configPath, 'utf-8'));
      return fmodeApiToken || newapiToken || null;
    }
  } catch {
    // Deliberately ignored: an unusable config file counts as "no token here".
  }
  return null;
}
73
+
74
+ // ============================================================
75
+ // API 调用
76
+ // ============================================================
77
+
78
// Defaults applied when the caller does not override them.
const DEFAULT_CONFIG = {
  apiBase: 'https://api.fmode.cn',
  model: 'doubao-seed-2-0-pro-260215',
  temperature: 0.12,
  maxTokens: 2000,
};

// MIME types for local image files, keyed by lower-case extension.
// Any other extension is sent as image/jpeg.
const IMAGE_MIME_BY_EXT = new Map([
  ['png', 'image/png'],
  ['webp', 'image/webp'],
  ['gif', 'image/gif'],
]);

// Base64 prefixes of well-known file signatures (PNG, GIF, RIFF/WebP),
// used to label raw base64 payloads. Anything else is labelled image/jpeg.
const IMAGE_MIME_BY_BASE64_PREFIX = [
  ['iVBORw0KGgo', 'image/png'],
  ['R0lGOD', 'image/gif'],
  ['UklGR', 'image/webp'],
];

// Turns the `imageBase64` option into something usable as an image URL.
// A value containing ':' already carries a scheme (`data:` or a URL), since
// ':' is not part of the base64 alphabet, and is passed through unchanged.
// A raw base64 payload is wrapped in a data URL.
function toImageDataUrl(imageBase64) {
  if (imageBase64.includes(':')) return imageBase64;
  const payload = imageBase64.trim();
  const match = IMAGE_MIME_BY_BASE64_PREFIX.find(([prefix]) => payload.startsWith(prefix));
  const mime = match ? match[1] : 'image/jpeg';
  return `data:${mime};base64,${payload}`;
}

/**
 * Call the Fmode Vision chat-completions API.
 *
 * At most one visual input is attached, chosen in this order:
 * imagePath, imageBase64, imageUrl, videoUrl. With none of them the request
 * is text-only.
 *
 * @param {Object} opts
 * @param {string} [opts.imagePath]    Local image path (read and sent as a data URL).
 * @param {string} [opts.imageBase64]  Image as a data URL or as a raw base64 payload.
 * @param {string} [opts.imageUrl]     Remote image URL.
 * @param {string} [opts.videoUrl]     Video URL.
 * @param {string} opts.systemPrompt   System prompt.
 * @param {string} opts.userPrompt     User prompt.
 * @param {string} [opts.model]        Model name, default doubao-seed-2-0-pro-260215.
 * @param {number} [opts.temperature]  Default 0.12.
 * @param {number} [opts.maxTokens]    Default 2000.
 * @param {string} [opts.apiToken]     Token to use; resolved automatically when omitted.
 * @returns {Promise<{ raw: string, parsed: object|null, error: string|null, usage: object|null }>}
 *   `raw` is the message content, `parsed`/`error` come from extractJSON().
 * @throws {Error} When the HTTP response status is not OK (`API <status>: <body>`),
 *   or when no token can be resolved.
 */
export async function callVisionAPI(opts) {
  const {
    imagePath, imageBase64, imageUrl, videoUrl,
    systemPrompt, userPrompt,
    model, temperature, maxTokens, apiToken,
  } = opts;

  const token = apiToken || resolveApiToken().token;
  const messages = [{ role: 'system', content: systemPrompt }];

  const userContent = [{ type: 'text', text: userPrompt }];

  // Visual content.
  if (imagePath) {
    const buffer = fs.readFileSync(imagePath);
    const ext = path.extname(imagePath).slice(1).toLowerCase();
    const mime = IMAGE_MIME_BY_EXT.get(ext) ?? 'image/jpeg';
    userContent.push({
      type: 'image_url',
      image_url: { url: `data:${mime};base64,${buffer.toString('base64')}` },
    });
  } else if (imageBase64) {
    userContent.push({
      type: 'image_url',
      image_url: { url: toImageDataUrl(imageBase64) },
    });
  } else if (imageUrl) {
    userContent.push({
      type: 'image_url',
      image_url: { url: imageUrl },
    });
  } else if (videoUrl) {
    userContent.push({
      type: 'video_url',
      video_url: { url: videoUrl },
    });
  }

  messages.push({ role: 'user', content: userContent });

  const body = {
    model: model || DEFAULT_CONFIG.model,
    messages,
    // `??` so that an explicit temperature of 0 is honoured.
    temperature: temperature ?? DEFAULT_CONFIG.temperature,
    max_tokens: maxTokens || DEFAULT_CONFIG.maxTokens,
  };

  const res = await fetch(`${DEFAULT_CONFIG.apiBase}/v1/chat/completions`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      Authorization: `Bearer ${token}`,
    },
    body: JSON.stringify(body),
  });

  if (!res.ok) {
    const errText = await res.text();
    throw new Error(`API ${res.status}: ${errText}`);
  }

  const data = await res.json();
  const content = data.choices?.[0]?.message?.content || '';

  const { parsed, error } = extractJSON(content);

  return { raw: content, parsed, error, usage: data.usage || null };
}
170
+
171
+ // ============================================================
172
+ // JSON 提取
173
+ // ============================================================
174
+
175
+ /**
176
+ * 从 LLM 响应中提取 JSON 对象
177
+ * 容忍 markdown 代码块包裹、前后文字
178
+ */
179
+ export function extractJSON(rawContent) {
180
+ const m = rawContent.match(/\{[\s\S]*\}/);
181
+ if (!m) return { parsed: null, error: 'No JSON object in response' };
182
+
183
+ try {
184
+ return { parsed: JSON.parse(m[0]), error: null };
185
+ } catch (e) {
186
+ return { parsed: null, error: e.message };
187
+ }
188
+ }
189
+
190
+ // ============================================================
191
+ // 多轮分析
192
+ // ============================================================
193
+
194
// Promise-based delay used between passes.
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Multi-pass focused analysis.
 *
 * Each pass is an independent API call on the same image. The result of pass
 * N is written to `<cacheDir>/pass<N>.json`, and a pass whose cache file
 * already exists is not re-run. Failed passes are cached as error entries
 * too, so delete the file to retry one. A cache file that cannot be parsed
 * is treated as missing and the pass is executed again.
 *
 * @param {Object} opts
 * @param {string} opts.imagePath  Image path.
 * @param {Array}  opts.passes     Pass configurations:
 *   [{ name: string, systemPrompt: string, userPrompt: string, maxTokens?: number }]
 * @param {string} opts.cacheDir   Cache directory (created when missing).
 * @param {string} [opts.model]    Model name.
 * @param {number} [opts.delayMs=1500]  Delay after each executed pass except the last.
 * @returns {Promise<Array<{ pass: number, name: string, raw: string, parsed: object|null, error: string|null, usage: object|null }>>}
 *   One entry per pass, in order. Entries built from a thrown error have no `raw`.
 */
export async function callMultiPass(opts) {
  const { imagePath, passes, cacheDir, model, delayMs = 1500 } = opts;

  fs.mkdirSync(cacheDir, { recursive: true });

  const results = [];

  for (let i = 0; i < passes.length; i++) {
    const p = passes[i];
    const passNum = i + 1;
    const cacheFile = path.join(cacheDir, `pass${passNum}.json`);

    // Cache lookup. An interrupted earlier run can leave a truncated file
    // behind; that must trigger a re-run rather than crash every later call.
    let cached = null;
    if (fs.existsSync(cacheFile)) {
      try {
        cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
      } catch (e) {
        console.log(` ⚠ Pass ${passNum} 缓存损坏,重新执行: ${e.message}`);
      }
    }
    if (cached) {
      console.log(` Pass ${passNum} (${p.name}): 已有缓存,跳过`);
      results.push(cached);
      continue;
    }

    console.log(` Pass ${passNum} (${p.name}, ${p.maxTokens || 2000}t)...`);
    let entry;
    try {
      const result = await callVisionAPI({
        imagePath,
        systemPrompt: p.systemPrompt,
        userPrompt: p.userPrompt,
        maxTokens: p.maxTokens,
        model,
      });
      entry = { pass: passNum, name: p.name, ...result };
      console.log(` ${result.error ? '✗ ' + result.error : '✓ OK'} | tokens:${result.usage?.total_tokens || '?'}`);
    } catch (e) {
      console.log(` ✗ ${e.message}`);
      entry = { pass: passNum, name: p.name, error: e.message, parsed: null, usage: null };
    }

    // Written outside the try block so a disk failure is not misreported as
    // an API failure.
    fs.writeFileSync(cacheFile, JSON.stringify(entry, null, 2));
    results.push(entry);

    if (i < passes.length - 1) await sleep(delayMs);
  }

  return results;
}