braintrust-lite 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,14 +1,16 @@
1
1
  {
2
2
  "name": "braintrust-lite",
3
- "version": "0.1.6",
3
+ "version": "0.1.7",
4
4
  "description": "Lightweight multi-model advisor for Claude Code — parallel Codex + Gemini consultation via MCP",
5
5
  "type": "module",
6
6
  "bin": {
7
7
  "consult": "bin/consult",
8
- "braintrust-lite": "src/server.js"
8
+ "braintrust-lite": "src/server.js",
9
+ "braintrust-setup": "scripts/setup.js"
9
10
  },
10
11
  "scripts": {
11
- "start": "node src/server.js"
12
+ "start": "node src/server.js",
13
+ "setup": "node scripts/setup.js"
12
14
  },
13
15
  "dependencies": {
14
16
  "@modelcontextprotocol/sdk": "^1.10.2"
@@ -28,5 +30,11 @@
28
30
  "repository": {
29
31
  "type": "git",
30
32
  "url": "git+https://github.com/HongjieRen/braintrust-lite.git"
31
- }
33
+ },
34
+ "files": [
35
+ "bin/",
36
+ "src/",
37
+ "scripts/",
38
+ "skills/"
39
+ ]
32
40
  }
@@ -0,0 +1,66 @@
1
+ #!/usr/bin/env node
2
+ import { mkdirSync, writeFileSync, existsSync, readFileSync, copyFileSync } from 'fs';
3
+ import { join, dirname } from 'path';
4
+ import { fileURLToPath } from 'url';
5
+ import { spawnSync } from 'child_process';
6
+ import https from 'https';
7
+
8
// Directory containing this script; used to locate the SKILL.md bundled in the package.
const __dirname = dirname(fileURLToPath(import.meta.url));

/**
 * Pick the directory under which the user's `.claude` folder lives.
 *
 * HOME is preferred. USERPROFILE is consulted next because Windows shells
 * normally leave HOME unset, and falling straight through to `/tmp` there
 * would install the skill somewhere Claude Code never looks.
 *
 * @param {Record<string, string | undefined>} [env] - Environment to read (defaults to process.env).
 * @returns {string} The first non-empty of HOME, USERPROFILE, or '/tmp' as a last resort.
 */
function resolveHomeDir(env = process.env) {
  return env.HOME || env.USERPROFILE || '/tmp';
}

// Install target for the consult skill and the remote copy used when no bundled file exists.
const SKILL_DIR = join(resolveHomeDir(), '.claude', 'skills', 'consult');
const SKILL_PATH = join(SKILL_DIR, 'SKILL.md');
const SKILL_URL = 'https://raw.githubusercontent.com/HongjieRen/braintrust-lite/main/skills/consult/SKILL.md';

// ANSI-coloured status glyphs: green check mark and red cross.
const G = '\x1b[32m✓\x1b[0m';
const R = '\x1b[31m✗\x1b[0m';
13
+
14
/**
 * Download a text resource over HTTPS and resolve with its body.
 *
 * Note: this intentionally shadows the global `fetch` inside this script.
 *
 * @param {string} url - Absolute https URL to request.
 * @param {number} [redirectsLeft=5] - How many more redirects may be followed
 *   before giving up; prevents an endless redirect loop.
 * @returns {Promise<string>} The response body decoded as UTF-8.
 *   Rejects with `HTTP <status>` for non-200 responses, `timeout` when the
 *   socket is idle for 10 s, `too many redirects`, or the underlying network error.
 */
function fetch(url, redirectsLeft = 5) {
  return new Promise((resolve, reject) => {
    const req = https.get(url, { timeout: 10000 }, res => {
      const { statusCode, headers } = res;

      if ([301, 302, 303, 307, 308].includes(statusCode) && headers.location) {
        // Drain the unused body so the socket is released.
        res.resume();
        if (redirectsLeft <= 0) return reject(new Error('too many redirects'));
        let next;
        try {
          // Location may be relative; resolve it against the URL we just requested.
          next = new URL(headers.location, url).href;
        } catch (err) {
          return reject(err);
        }
        return fetch(next, redirectsLeft - 1).then(resolve, reject);
      }

      if (statusCode !== 200) {
        res.resume();
        return reject(new Error(`HTTP ${statusCode}`));
      }

      // Decode as a stream: concatenating raw Buffer chunks decodes each chunk on
      // its own and corrupts multi-byte characters that straddle a chunk boundary.
      res.setEncoding('utf8');
      let body = '';
      res.on('data', chunk => { body += chunk; });
      res.on('end', () => resolve(body));
      res.on('error', reject);
    });

    req.on('error', reject);
    // 'timeout' only signals idleness; the request must be torn down explicitly.
    req.on('timeout', () => { req.destroy(); reject(new Error('timeout')); });
  });
}
24
+
25
/**
 * Read the `version:` field from a skill file's front matter.
 *
 * @param {string} p - Path of the file to inspect.
 * @returns {string|null} The trimmed version string, or null when the file
 *   cannot be read or has no non-empty `version:` line.
 */
function getVersion(p) {
  try {
    // Only horizontal whitespace may follow the colon, so an empty `version:`
    // line cannot swallow the content of the following line.
    const match = readFileSync(p, 'utf8').match(/^version:[^\S\r\n]*(.+)$/m);
    return match?.[1]?.trim() || null;
  } catch {
    // Best effort: the version is only used for display.
    return null;
  }
}
26
+
27
// ── Step 1: install the consult skill ────────────────────────────────────────
console.log('\nbraintrust-lite setup\n');
console.log('Installing consult skill:');

// SKILL.md shipped inside the package; preferred over the network copy.
const bundled = join(__dirname, '..', 'skills', 'consult', 'SKILL.md');

// Names the step in progress so a failure is reported accurately.
let stage = 'install';
try {
  mkdirSync(SKILL_DIR, { recursive: true });
  // Keep a single backup of the previously installed skill before overwriting it.
  if (existsSync(SKILL_PATH)) copyFileSync(SKILL_PATH, SKILL_PATH + '.bak');

  if (existsSync(bundled)) {
    copyFileSync(bundled, SKILL_PATH);
    console.log(` ${G} SKILL.md installed (v${getVersion(SKILL_PATH)})`);
  } else {
    stage = 'download';
    const content = await fetch(SKILL_URL);
    writeFileSync(SKILL_PATH, content, 'utf8');
    console.log(` ${G} SKILL.md downloaded from GitHub (v${getVersion(SKILL_PATH)})`);
  }
} catch (e) {
  // Report and carry on: MCP registration below is independent of the skill file,
  // so a filesystem or network failure here must not abort the whole setup.
  console.log(` ${R} SKILL.md ${stage} failed: ${e.message}`);
  process.exitCode = 1;
}

// ── Step 2: register the MCP server with the claude CLI ──────────────────────
console.log('\nRegistering MCP server:');
const claudeOk = spawnSync('claude', ['--version'], { timeout: 5000 }).status === 0;
if (!claudeOk) {
  console.log(` ${R} claude CLI not found — install Claude Code first`);
  console.log(` Then: claude mcp add braintrust-lite -- npx -y braintrust-lite@~0.1`);
  process.exitCode = 1;
} else {
  const list = spawnSync('claude', ['mcp', 'list'], { encoding: 'utf8', timeout: 5000 });
  // stdout is null if the command could not be spawned, hence the '' fallback.
  const alreadyRegistered = (list.stdout || '').includes('braintrust-lite');
  if (alreadyRegistered) {
    console.log(` ${G} MCP server already registered`);
  } else {
    const r = spawnSync(
      'claude',
      ['mcp', 'add', 'braintrust-lite', '--', 'npx', '-y', 'braintrust-lite@~0.1'],
      { encoding: 'utf8', timeout: 10000 },
    );
    if (r.status === 0) {
      console.log(` ${G} MCP server registered (npx braintrust-lite@~0.1)`);
    } else {
      console.log(` ${R} MCP registration failed`);
      console.log(` Manual: claude mcp add braintrust-lite -- npx -y braintrust-lite@~0.1`);
      process.exitCode = 1;
    }
  }
}

// ── Summary ──────────────────────────────────────────────────────────────────
// process.exitCode is only set above when a step failed.
console.log();
console.log(process.exitCode ? ' \x1b[33mSetup done with warnings — fix issues above.\x1b[0m' : ' \x1b[32mDone! Restart Claude Code, then use /consult.\x1b[0m');
console.log();
@@ -0,0 +1,213 @@
1
+ ---
2
+ name: consult
3
+ version: 1.4.0
4
+ description: 在处理规划/设计/架构/调研类任务时,并发调用 codex + gemini + claude 获取多视角,主 Claude 担任 Judge 盲评综合输出。支持渐进式多轮对话和自动更新。
5
+ ---
6
+
7
+ # consult — 多模型旁路军师
8
+
9
+ ## Preamble(每次调用前先执行)
10
+
11
+ ```bash
12
+ _REMOTE_VER=$(curl -sf --max-time 3 "https://raw.githubusercontent.com/HongjieRen/braintrust-lite/main/skills/consult/SKILL.md" | awk '/^version:/ {print $2; exit}')
13
+ _LOCAL_VER=$(awk '/^version:/ {print $2; exit}' ~/.claude/skills/consult/SKILL.md 2>/dev/null)
14
+ [ -n "$_REMOTE_VER" ] && [ "$_REMOTE_VER" != "$_LOCAL_VER" ] && echo "UPGRADE_AVAILABLE $_LOCAL_VER $_REMOTE_VER" || echo "UP_TO_DATE $_LOCAL_VER"
15
+ ```
16
+
17
+ **如果输出 `UPGRADE_AVAILABLE <old> <new>`**:静默自动更新,不打断用户:
18
+ ```bash
19
+ cp ~/.claude/skills/consult/SKILL.md ~/.claude/skills/consult/SKILL.md.bak 2>/dev/null || true
20
+ curl -fsSL "https://raw.githubusercontent.com/HongjieRen/braintrust-lite/main/skills/consult/SKILL.md" \
21
+ -o ~/.claude/skills/consult/SKILL.md && echo "Updated consult skill $_LOCAL_VER → $_REMOTE_VER"
22
+ ```
23
+ 更新完成后继续执行本次任务,在最终回复末尾附一行:`*(consult skill 已自动更新 v{old} → v{new})*`
24
+
25
+ **如果输出 `UP_TO_DATE`**:直接继续,无需提示。
26
+ **如果 curl 失败**:忽略,继续执行,不提示用户。
27
+
28
+ ---
29
+
30
+ ## 这是什么
31
+
32
+ `braintrust-lite` 提供的 MCP tool `mcp__braintrust_lite__consult` 会在后台并发调用 **Codex CLI**、**Gemini CLI** 和 **Claude CLI**,把三个顶尖模型的独立回答以匿名形式(Model A/B/C)交回给你。
33
+
34
+ 你(主 Claude)负责担任 Judge——盲评内容,提炼共识、标注独特洞见、裁决分歧,输出集大成方案。
35
+
36
+ ## 何时使用
37
+
38
+ 对以下类型的任务,**在启动子 agent 的同时并行调用 consult**:
39
+
40
+ - 架构选型、技术选型、框架比较
41
+ - 方案设计(新功能、重大重构、系统集成)
42
+ - 复杂 bug 根因分析(多种假设并存时)
43
+ - 非显而易见的技术决策(有明显 trade-off 的场景)
44
+ - 安全或性能评审
45
+
46
+ ## 何时跳过
47
+
48
+ - typo 修复、单行改动、简单 rename
49
+ - 只读信息查询(用 Grep / Read 就够)
50
+ - 用户已经明确指定方案,不需要二次意见
51
+ - 已知有标准答案的操作性任务
52
+
53
+ ---
54
+
55
+ ## 工作流:单轮
56
+
57
+ ```
58
+ 1. 发一条 message,同时 parallel call:
59
+ ├─ Task(subagent_type=Plan/Explore/..., prompt=X)
60
+ └─ mcp__braintrust_lite__consult(prompt=X, timeout_sec=<见下表>)
61
+
62
+ 2. 等两者都返回后,你亲自担任 Judge(盲评流程):
63
+
64
+ 步骤一:只看 Model A/B/C 内容,按结构完成评估(见下方 Judge 输出格式)
65
+
66
+ 步骤二:读 REVEAL 映射表
67
+
68
+ 步骤三:在回复末尾揭晓:
69
+ "揭晓:Model A = Gemini,Model B = Claude,Model C = Codex"
70
+ ```
71
+
72
+ ### Judge 输出格式(必须分节,供多轮渐进加载)
73
+
74
+ 每轮 Judge 输出**强制使用以下四节**,不可合并:
75
+
76
+ ```
77
+ ### VERDICT
78
+ <核心结论,1-3句,永远保留进历史>
79
+
80
+ ### REASONING
81
+ <关键推理和裁决依据,有追问才加载>
82
+
83
+ ### TRADEOFFS
84
+ <权衡分析、已排除方案及理由,用户问"有没有其他方案"时加载>
85
+
86
+ ### OPEN_QUESTIONS
87
+ <未解决的分歧或待确认的假设,用户问"还有什么不确定"时加载>
88
+ ```
89
+
90
+ ---
91
+
92
+ ## 工作流:多轮对话(会话模式)
93
+
94
+ ### 进入信号
95
+
96
+ `/consult` 触发后,第一轮回复顶部显示:
97
+
98
+ ```
99
+ ┌─ Consult 会话已启动 ──────────────────────────┐
100
+ │ 模型:Codex · Gemini · Claude CLI │
101
+ │ 记忆:Balanced(可用 !brief / !deep 切换) │
102
+ │ 输入问题继续追问,或输入 /done 退出 │
103
+ └────────────────────────────────────────────────┘
104
+ ```
105
+
106
+ ### 每轮状态栏(**每轮回复第一行**,始终显示)
107
+
108
+ 每轮 Judge 输出**最开始**,必须先输出这一行状态栏,再输出任何正文:
109
+
110
+ ```
111
+ [Consult·R{N} | 3 models | Consensus: {High/Split}]
112
+ ```
113
+
114
+ - `R{N}` = 第几轮,帮助用户感知多轮积累
115
+ - `Consensus: Split` 时额外显示一行分歧摘要:`Note: split on <主要分歧点>`
116
+ - 平时无分歧则只显示 `High`,不展开
117
+ - 若模型降级(实际跑了少于 3 个),显示 `⚠ 2/3 models` 代替 `3 models`
118
+
119
+ ### 多轮上下文:渐进式加载(核心设计)
120
+
121
+ **设计原则:不预先决定压缩多少,而是根据 follow-up 意图决定加载什么。**
122
+
123
+ 每轮结束后维护一个**会话状态对象**(始终随 prompt 携带,~100 token):
124
+
125
+ ```
126
+ [Session State]
127
+ Goal: <用户核心目标>
128
+ Constraints: <已确认约束>
129
+ Decisions: <已做决策及理由>
130
+ Rejected: <已排除选项>
131
+ Open: <未解决问题>
132
+ Current best: <当前推荐方案一句话>
133
+ ```
134
+
135
+ 历史内容**按意图懒加载**,不机械按轮次:
136
+
137
+ | follow-up 意图 | 加载的历史内容 |
138
+ |---------------|--------------|
139
+ | 普通追问、深入某方向 | Session State + 所有历史 VERDICT |
140
+ | "为什么这样判断" | + 最近1轮 REASONING |
141
+ | "有没有其他方案" | + 最近1轮 TRADEOFFS |
142
+ | "还有什么不确定" | + 最近1轮 OPEN_QUESTIONS |
143
+ | "刚才某模型说的那个点" | + 按需检索原文片段(Model A/B/C 原始回答存档备查) |
144
+
145
+ 历史 VERDICT 全部保留(每条 ~50 token),其余节只保留最近1-2轮,更老的丢弃。
146
+
147
+ ### 自动降级
148
+
149
+ 用户回复是简单确认时("好的"、"谢谢"、"明白了"等),**不触发三模型并发**,由主 Claude 直接响应,节省成本和延迟。
150
+
151
+ 判断标准:用户回复 < 20 字且不含实质性新问题。
152
+
153
+ ### 多轮终止条件
154
+
155
+ - 用户输入 `/done`(或 `!stop`)
156
+ - 用户明确表示满意("好了"、"没问题了")
157
+ - 用户切换到无关新话题
158
+ - 已进行 5 轮(自动退出,告知用户可重新 `/consult`)
159
+
160
+ 退出时显示:`── Consult 会话结束(共 {N} 轮)──`
161
+
162
+ ### 用户控制命令
163
+
164
+ ```
165
+ !brief 切换到精简记忆(只带 VERDICT,适合快速迭代)
166
+ !deep 切换到完整记忆(带最近1轮 REASONING + TRADEOFFS,适合复杂设计)
167
+ /done 退出 Consult 会话模式(!stop 同效)
168
+ !deltas 展开本轮三模型核心主张各一句(不显示原文全文)
169
+ !raw 展开本轮三模型完整原始回答
170
+ ```
171
+
172
+ ---
173
+
174
+ ## consult tool 参数
175
+
176
+ ```
177
+ prompt (必须) 问题,建议精炼、自包含,含必要上下文
178
+ only (可选) "codex" | "gemini" | "claude" — 只调用一个
179
+ skip (可选) ["codex"] | ["gemini"] | ["claude"] — 跳过某个
180
+ timeout_sec (可选) 每个模型超时秒数,默认 90;传 0 = 不限时等待完成
181
+ blind (可选) 默认 true;传 false 可直接看真实模型名称
182
+ cwd (可选) 子进程工作目录
183
+ ```
184
+
185
+ ## timeout 选择策略
186
+
187
+ **你(主 Claude)负责决定 timeout_sec:**
188
+
189
+ | 任务类型 | timeout_sec |
190
+ |---------|------------|
191
+ | 深度调研、市场分析、可行性研究 | **0**(不限时) |
192
+ | 架构设计、复杂方案对比 | **0**(不限时) |
193
+ | 代码审查、技术选型 | 180 |
194
+ | 简单问答、快速决策 | 90(默认) |
195
+
196
+ 调研类任务一律传 `timeout_sec: 0`。
197
+
198
+ ## 成本与延迟
199
+
200
+ - 每次 consult = 3 次 API 调用(codex + gemini + claude)
201
+ - 延迟 = `max(三者响应时间)`(并发)
202
+ - 简单问题 ~$0.05–0.20,中等 ~$0.20–0.50
203
+ - 自动降级(简单确认)= 0 次额外 API 调用
204
+
205
+ ## 终端 fallback
206
+
207
+ ```bash
208
+ consult "你的问题"
209
+ consult --only codex "快速问题"
210
+ consult --timeout 0 "深度调研问题"
211
+ consult --dir /your/project "review this project"
212
+ cat file.ts | consult "review this code"
213
+ ```
package/src/consult.js CHANGED
@@ -1,37 +1,18 @@
1
1
  import {
2
2
  runProcess,
3
- adaptCodex,
4
- adaptGemini,
5
- adaptClaude,
6
- CODEX_ARGS_PREFIX,
7
- GEMINI_ARGS_PREFIX,
8
- CLAUDE_ARGS_PREFIX,
3
+ adaptCodex, adaptGemini, adaptClaude,
4
+ CODEX_ARGS_PREFIX, GEMINI_ARGS_PREFIX, CLAUDE_ARGS_PREFIX,
9
5
  } from './providers.js';
10
6
 
11
7
  const SYSTEM_PROMPT = `你是一个独立思考的高级专家。请基于自己的判断给出高质量、可执行的回答。
12
8
  要求:独立思考,不假设其他专家会补充;区分结论、依据、假设、风险;简洁但完整。`;
13
9
 
14
10
  const PROVIDERS = {
15
- codex: {
16
- cmd: 'codex',
17
- buildArgs: prompt => [...CODEX_ARGS_PREFIX, `${SYSTEM_PROMPT}\n\n${prompt}`],
18
- adapt: adaptCodex,
19
- },
20
- gemini: {
21
- cmd: 'gemini',
22
- buildArgs: prompt => ['-p', `${SYSTEM_PROMPT}\n\n${prompt}`, ...GEMINI_ARGS_PREFIX],
23
- adapt: adaptGemini,
24
- },
25
- claude: {
26
- cmd: 'claude',
27
- buildArgs: prompt => [...CLAUDE_ARGS_PREFIX, `${SYSTEM_PROMPT}\n\n${prompt}`],
28
- adapt: adaptClaude,
29
- },
11
+ codex: { cmd: 'codex', buildArgs: p => [...CODEX_ARGS_PREFIX, `${SYSTEM_PROMPT}\n\n${p}`], adapt: adaptCodex },
12
+ gemini: { cmd: 'gemini', buildArgs: p => ['-p', `${SYSTEM_PROMPT}\n\n${p}`, ...GEMINI_ARGS_PREFIX], adapt: adaptGemini },
13
+ claude: { cmd: 'claude', buildArgs: p => [...CLAUDE_ARGS_PREFIX, `${SYSTEM_PROMPT}\n\n${p}`], adapt: adaptClaude },
30
14
  };
31
15
 
32
- /**
33
- * Shuffle an array in-place using Fisher-Yates and return it.
34
- */
35
16
  function shuffle(arr) {
36
17
  for (let i = arr.length - 1; i > 0; i--) {
37
18
  const j = Math.floor(Math.random() * (i + 1));
@@ -40,11 +21,6 @@ function shuffle(arr) {
40
21
  return arr;
41
22
  }
42
23
 
43
- /**
44
- * Replace provider names with anonymous labels (Model A, B, C…).
45
- * Order is randomised so the judge cannot infer identity from position.
46
- * Returns { results: anonymized array, mapping: { 'Model A': 'gemini', … } }
47
- */
48
24
  function anonymize(results) {
49
25
  const labels = ['Model A', 'Model B', 'Model C', 'Model D', 'Model E'];
50
26
  const shuffled = shuffle([...results]);
@@ -56,44 +32,33 @@ function anonymize(results) {
56
32
  return { results: anonymized, mapping };
57
33
  }
58
34
 
59
- /**
60
- * Run a single provider and return a normalized result object.
61
- * Never throws — errors are captured in the `error` field.
62
- */
63
35
  async function runOne(name, prompt, { cwd, timeoutMs }) {
64
36
  const p = PROVIDERS[name];
65
37
  const start = Date.now();
66
38
  const raw = await runProcess(p.cmd, p.buildArgs(prompt), { cwd, timeoutMs });
67
39
  const duration_ms = Date.now() - start;
68
40
 
69
- const error = raw.code === 'timeout' ? 'timeout'
70
- : raw.code !== 0 ? `exit ${raw.code}`
41
+ const error_type = raw.error_type || null;
42
+ const error = error_type === 'enoent' ? 'not installed'
43
+ : error_type === 'timeout' ? 'timeout'
44
+ : error_type === 'nonzero' ? `exit ${raw.code}`
45
+ : error_type ? error_type
71
46
  : null;
72
47
 
73
48
  const { content } = error ? { content: '' } : p.adapt(raw);
74
- return { provider: name, content, duration_ms, error };
49
+ return { provider: name, content, duration_ms, error, error_type };
75
50
  }
76
51
 
77
52
  /**
78
53
  * Consult Codex, Gemini, and/or Claude in parallel.
79
- *
80
- * @param {object} opts
81
- * @param {string} opts.prompt - The question to ask.
82
- * @param {string} [opts.only] - 'codex' | 'gemini' | 'claude' — only run this one.
83
- * @param {string[]} [opts.skip] - Providers to skip.
84
- * @param {number} [opts.timeoutMs] - Per-provider timeout in ms (default 90 000). 0 = no timeout.
85
- * @param {string} [opts.cwd] - Working directory for subprocesses.
86
- * @param {boolean} [opts.blind] - Anonymise provider names (default true).
87
- * @returns {Promise<{ results: Array, mapping: object|null }>}
54
+ * Returns { results, mapping, successCount, totalCount }
88
55
  */
89
56
  export async function consult({ prompt, only, skip = [], timeoutMs = 90_000, cwd, blind = true } = {}) {
90
57
  const targets = Object.keys(PROVIDERS)
91
58
  .filter(name => (only ? name === only : true))
92
59
  .filter(name => !skip.includes(name));
93
60
 
94
- if (targets.length === 0) {
95
- throw new Error('No providers selected — check --only / --skip flags.');
96
- }
61
+ if (targets.length === 0) throw new Error('No providers selected — check --only / --skip flags.');
97
62
 
98
63
  const settled = await Promise.allSettled(
99
64
  targets.map(name => runOne(name, prompt, { cwd, timeoutMs }))
@@ -102,11 +67,15 @@ export async function consult({ prompt, only, skip = [], timeoutMs = 90_000, cwd
102
67
  const results = targets.map((name, i) =>
103
68
  settled[i].status === 'fulfilled'
104
69
  ? settled[i].value
105
- : { provider: name, content: '', duration_ms: 0, error: settled[i].reason?.message ?? 'unknown' }
70
+ : { provider: name, content: '', duration_ms: 0, error: 'rejected', error_type: 'rejected' }
106
71
  );
107
72
 
73
+ const successCount = results.filter(r => !r.error).length;
74
+ const totalCount = results.length;
75
+
108
76
  if (blind) {
109
- return anonymize(results);
77
+ const { results: anon, mapping } = anonymize(results);
78
+ return { results: anon, mapping, successCount, totalCount };
110
79
  }
111
- return { results, mapping: null };
80
+ return { results, mapping: null, successCount, totalCount };
112
81
  }
package/src/format.js CHANGED
@@ -1,12 +1,15 @@
1
1
  /**
2
- * Format an array of provider results as human-readable Markdown.
3
- * Each provider gets a ## header with timing (or error), then its content.
4
- *
5
- * When a mapping is provided (blind mode), a REVEAL section is appended
6
- * at the end so the Judge can disclose model identity to the user AFTER
7
- * completing their evaluation.
2
+ * Format provider results as human-readable Markdown with run manifest.
8
3
  */
9
- export function formatAsMarkdown(results, mapping = null) {
4
+ export function formatAsMarkdown(results, mapping = null, { successCount, totalCount } = {}) {
5
+ const total = totalCount ?? results.length;
6
+ const succeeded = successCount ?? results.filter(r => !r.error).length;
7
+ const degraded = succeeded < total;
8
+
9
+ // Status line (mirrors SKILL.md status bar format)
10
+ const modelsLabel = degraded ? `⚠ ${succeeded}/${total} models` : `${total} models`;
11
+ const statusLine = `[Consult | ${modelsLabel} | responses below]\n`;
12
+
10
13
  const body = results.map(r => {
11
14
  const label = r.error
12
15
  ? `## ${r.provider} (${r.error})`
@@ -15,28 +18,36 @@ export function formatAsMarkdown(results, mapping = null) {
15
18
  return `${label}\n\n${content}`;
16
19
  }).join('\n\n---\n\n');
17
20
 
18
- if (!mapping) return body;
21
+ const revealSection = mapping ? buildReveal(mapping) : '';
22
+ const manifest = buildManifest(results, { successCount: succeeded, totalCount: total });
23
+
24
+ return `${statusLine}\n${body}${revealSection}\n\n---\n\n${manifest}`;
25
+ }
19
26
 
20
- const reveal = Object.entries(mapping)
27
+ function buildReveal(mapping) {
28
+ const rows = Object.entries(mapping)
21
29
  .map(([label, provider]) => `| ${label} | **${provider}** |`)
22
30
  .join('\n');
23
-
24
- return `${body}
25
-
26
- ---
27
-
28
- ## 🔒 REVEAL — 仅在完成评估后阅读
31
+ return `\n\n---\n\n## 🔒 REVEAL — 仅在完成评估后阅读
29
32
 
30
33
  > **Judge 指令**:请先完成你的完整评估和综合输出,再阅读以下映射表,并在回复末尾告知用户每个模型对应的真实身份。
31
34
 
32
35
  | 匿名标签 | 真实模型 |
33
36
  |---------|---------|
34
- ${reveal}`;
37
+ ${rows}`;
38
+ }
39
+
40
/**
 * Build the "Run manifest" footer: a timestamped summary line followed by one
 * bullet per provider showing either its duration or its failure reason.
 *
 * @param {Array<{provider: string, duration_ms?: number, error?: string|null, error_type?: string|null}>} results
 *   Per-provider results, in display order.
 * @param {{successCount?: number, totalCount?: number}} [counts]
 *   Pre-computed counts. When omitted they are derived from `results`
 *   (total = number of results, success = results without `error`).
 * @param {Date} [now] - Clock used for the timestamp; defaults to the current time.
 * @returns {string} Markdown for the manifest section.
 */
function buildManifest(results, { successCount, totalCount } = {}, now = new Date()) {
  const total = totalCount ?? results.length;
  const succeeded = successCount ?? results.filter(r => !r.error).length;

  // ISO timestamp truncated to whole seconds, still marked as UTC.
  const ts = `${now.toISOString().slice(0, 19)}Z`;
  const degraded = succeeded < total;

  const lines = results
    .map(r => {
      // Prefer the machine-readable error_type; fall back to the display message.
      const detail = r.error
        ? (r.error_type || r.error)
        : `${(r.duration_ms / 1000).toFixed(1)}s`;
      return ` - ${r.provider}: ${detail}`;
    })
    .join('\n');

  return `**Run manifest** · \`${ts}\` · ${succeeded}/${total} models${degraded ? ' ⚠ degraded' : ''}\n${lines}`;
}
36
50
 
37
- /**
38
- * Format results as a compact JSON string for programmatic consumption.
39
- */
40
51
  export function formatAsJson(prompt, results, mapping = null) {
41
52
  return JSON.stringify({ prompt, results, mapping }, null, 2);
42
53
  }
package/src/providers.js CHANGED
@@ -10,7 +10,8 @@ export const CLAUDE_ARGS_PREFIX = ['--output-format', 'json', '-p'];
10
10
 
11
11
  /**
12
12
  * Spawn a subprocess with an AbortController-based timeout.
13
- * Returns { stdout, stderr, code } — never rejects.
13
+ * Returns { stdout, stderr, code, error_type } — never rejects.
14
+ * error_type: 'enoent' | 'timeout' | 'nonzero' | 'spawn_error' | null
14
15
  */
15
16
  export function runProcess(cmd, args, { cwd, timeoutMs } = {}) {
16
17
  const ac = new AbortController();
@@ -28,23 +29,28 @@ export function runProcess(cmd, args, { cwd, timeoutMs } = {}) {
28
29
  const timer = timeoutMs ? setTimeout(() => ac.abort(), timeoutMs) : null;
29
30
 
30
31
  return new Promise(resolve => {
31
- const done = code => {
32
+ let resolved = false;
33
+ const done = (code, error_type = null) => {
34
+ if (resolved) return;
35
+ resolved = true;
32
36
  if (timer) clearTimeout(timer);
33
- resolve({ stdout, stderr, code });
37
+ resolve({ stdout, stderr, code, error_type });
34
38
  };
35
- proc.on('close', done);
36
- proc.on('error', err => done(err.name === 'AbortError' ? 'timeout' : -1));
39
+ proc.on('close', code => done(code, code !== 0 ? 'nonzero' : null));
40
+ proc.on('error', err => {
41
+ if (err.name === 'AbortError') done('timeout', 'timeout');
42
+ else if (err.code === 'ENOENT') done(-1, 'enoent');
43
+ else done(-1, 'spawn_error');
44
+ });
37
45
  });
38
46
  }
39
47
 
40
48
  // ─── Adapters ─────────────────────────────────────────────────────────────────
41
49
 
42
- /** Last-resort: take the tail of raw stdout. */
43
50
  export function fallback(rawStdout) {
44
51
  return { content: rawStdout.slice(-2000).trim() || '[no output]', parse_mode: 'fallback' };
45
52
  }
46
53
 
47
- /** Parse Codex JSONL stream → extract the last agent_message text. */
48
54
  export function adaptCodex(raw) {
49
55
  try {
50
56
  const events = raw.stdout.trim().split('\n').flatMap(l => {
@@ -57,7 +63,6 @@ export function adaptCodex(raw) {
57
63
  return fallback(raw.stdout);
58
64
  }
59
65
 
60
- /** Skip any MCP startup noise before the first '{', then extract .response */
61
66
  export function parseGeminiResponse(stdout) {
62
67
  const jsonStart = stdout.indexOf('{');
63
68
  if (jsonStart === -1) return null;
@@ -69,7 +74,6 @@ export function parseGeminiResponse(stdout) {
69
74
  return null;
70
75
  }
71
76
 
72
- /** Parse Gemini JSON output → content string. */
73
77
  export function adaptGemini(raw) {
74
78
  try {
75
79
  const response = parseGeminiResponse(raw.stdout);
@@ -78,7 +82,6 @@ export function adaptGemini(raw) {
78
82
  return fallback(raw.stdout);
79
83
  }
80
84
 
81
- /** Parse Claude CLI JSON output → extract .result text. */
82
85
  export function adaptClaude(raw) {
83
86
  try {
84
87
  const j = JSON.parse(raw.stdout);
package/src/server.js CHANGED
@@ -1,94 +1,60 @@
1
1
  #!/usr/bin/env node
2
2
  import { Server } from '@modelcontextprotocol/sdk/server/index.js';
3
3
  import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
4
- import {
5
- CallToolRequestSchema,
6
- ListToolsRequestSchema,
7
- } from '@modelcontextprotocol/sdk/types.js';
4
+ import { CallToolRequestSchema, ListToolsRequestSchema } from '@modelcontextprotocol/sdk/types.js';
8
5
  import { consult } from './consult.js';
9
6
  import { formatAsMarkdown } from './format.js';
10
7
 
11
- // ─── Tool definition ──────────────────────────────────────────────────────────
12
-
13
8
  const CONSULT_TOOL = {
14
9
  name: 'consult',
15
- description:
16
- '并发调用 Codex 和 Gemini CLI,获取两个顶尖模型对同一问题的独立视角。' +
17
- '适合架构选型、方案设计、技术决策、复杂调研。' +
18
- '调用方(通常是主 Claude)负责综合融合输出,自己担任 Judge。' +
19
- '不适合:typo 修复、单行改动、只读信息查询。',
10
+ description: '并发调用 Codex、Gemini、Claude CLI,获取三个模型的独立视角。适合架构选型、方案设计、技术决策、复杂调研。调用方担任 Judge 盲评综合输出。不适合:typo 修复、单行改动、只读查询。',
20
11
  inputSchema: {
21
12
  type: 'object',
22
13
  properties: {
23
- prompt: {
24
- type: 'string',
25
- description: '要问两个模型的问题。建议精炼、自包含(含必要上下文)。',
26
- },
27
- only: {
28
- type: 'string',
29
- enum: ['codex', 'gemini', 'claude'],
30
- description: '只调用指定一个模型(省成本或调试)。',
31
- },
32
- skip: {
33
- type: 'array',
34
- items: { type: 'string', enum: ['codex', 'gemini', 'claude'] },
35
- description: '跳过指定模型列表。',
36
- },
37
- timeout_sec: {
38
- type: 'number',
39
- description: '每个模型的超时秒数,默认 90。传 0 表示不限时,等待直到完成。',
40
- },
41
- blind: {
42
- type: 'boolean',
43
- description: '匿名化 provider 名称(默认 true)。结果以 Model A/B/C 呈现,顺序随机打乱,Judge 无法通过名字或位置判断来源。传 false 可查看真实模型名称。',
44
- },
45
- cwd: {
46
- type: 'string',
47
- description: '子进程工作目录,默认继承 MCP server 的 cwd。',
48
- },
14
+ prompt: { type: 'string', description: '要问各模型的问题,建议精炼自包含。' },
15
+ only: { type: 'string', enum: ['codex', 'gemini', 'claude'], description: '只调用指定一个模型。' },
16
+ skip: { type: 'array', items: { type: 'string', enum: ['codex', 'gemini', 'claude'] }, description: '跳过指定模型。' },
17
+ timeout_sec: { type: 'number', description: '每个模型超时秒数,默认 90。0 = 不限时。' },
18
+ blind: { type: 'boolean', description: '匿名化 provider 名称(默认 true)。' },
19
+ cwd: { type: 'string', description: '子进程工作目录。' },
49
20
  },
50
21
  required: ['prompt'],
51
22
  },
52
23
  };
53
24
 
54
- // ─── Server setup ─────────────────────────────────────────────────────────────
55
-
56
25
  const server = new Server(
57
- { name: 'braintrust-lite', version: '0.1.0' },
26
+ { name: 'braintrust-lite', version: '0.1.7' },
58
27
  { capabilities: { tools: {} } }
59
28
  );
60
29
 
61
- server.setRequestHandler(ListToolsRequestSchema, async () => ({
62
- tools: [CONSULT_TOOL],
63
- }));
30
+ server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools: [CONSULT_TOOL] }));
64
31
 
65
32
  server.setRequestHandler(CallToolRequestSchema, async req => {
66
- if (req.params.name !== 'consult') {
67
- throw new Error(`Unknown tool: ${req.params.name}`);
68
- }
33
+ if (req.params.name !== 'consult') throw new Error(`Unknown tool: ${req.params.name}`);
69
34
 
70
35
  const args = req.params.arguments ?? {};
71
- const { results, mapping } = await consult({
72
- prompt: String(args.prompt ?? ''),
73
- only: args.only,
74
- skip: Array.isArray(args.skip) ? args.skip : [],
75
- timeoutMs: args.timeout_sec != null ? Number(args.timeout_sec) * 1000 : 90_000,
76
- blind: args.blind !== false,
77
- cwd: args.cwd,
36
+ const timeoutMs = args.timeout_sec != null
37
+ ? (Number(args.timeout_sec) === 0 ? 0 : Number(args.timeout_sec) * 1000)
38
+ : 90_000;
39
+
40
+ const { results, mapping, successCount, totalCount } = await consult({
41
+ prompt: String(args.prompt ?? ''),
42
+ only: args.only,
43
+ skip: Array.isArray(args.skip) ? args.skip : [],
44
+ timeoutMs,
45
+ blind: args.blind !== false,
46
+ cwd: args.cwd,
78
47
  });
79
48
 
80
49
  if (results.every(r => r.error)) {
81
- throw new Error(
82
- `All providers failed: ${results.map(r => `${r.provider}=${r.error}`).join(', ')}`
83
- );
50
+ const detail = results.map(r => `${r.provider}=${r.error_type || r.error}`).join(', ');
51
+ throw new Error(`All providers failed: ${detail}`);
84
52
  }
85
53
 
86
54
  return {
87
- content: [{ type: 'text', text: formatAsMarkdown(results, mapping) }],
55
+ content: [{ type: 'text', text: formatAsMarkdown(results, mapping, { successCount, totalCount }) }],
88
56
  };
89
57
  });
90
58
 
91
- // ─── Start ────────────────────────────────────────────────────────────────────
92
-
93
59
  const transport = new StdioServerTransport();
94
60
  await server.connect(transport);