braintrust-lite 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/consult.js +46 -6
- package/src/format.js +30 -8
- package/src/providers.js +10 -0
- package/src/server.js +9 -4
package/package.json
CHANGED
package/src/consult.js
CHANGED
|
@@ -2,8 +2,10 @@ import {
|
|
|
2
2
|
runProcess,
|
|
3
3
|
adaptCodex,
|
|
4
4
|
adaptGemini,
|
|
5
|
+
adaptClaude,
|
|
5
6
|
CODEX_ARGS_PREFIX,
|
|
6
7
|
GEMINI_ARGS_PREFIX,
|
|
8
|
+
CLAUDE_ARGS_PREFIX,
|
|
7
9
|
} from './providers.js';
|
|
8
10
|
|
|
9
11
|
const SYSTEM_PROMPT = `你是一个独立思考的高级专家。请基于自己的判断给出高质量、可执行的回答。
|
|
@@ -20,8 +22,40 @@ const PROVIDERS = {
|
|
|
20
22
|
buildArgs: prompt => ['-p', `${SYSTEM_PROMPT}\n\n${prompt}`, ...GEMINI_ARGS_PREFIX],
|
|
21
23
|
adapt: adaptGemini,
|
|
22
24
|
},
|
|
25
|
+
claude: {
|
|
26
|
+
cmd: 'claude',
|
|
27
|
+
buildArgs: prompt => [...CLAUDE_ARGS_PREFIX, `${SYSTEM_PROMPT}\n\n${prompt}`],
|
|
28
|
+
adapt: adaptClaude,
|
|
29
|
+
},
|
|
23
30
|
};
|
|
24
31
|
|
|
32
|
+
/**
 * Shuffle an array in place with the Fisher-Yates algorithm and return it.
 *
 * @param {Array} arr - Array to shuffle; it is mutated.
 * @param {() => number} [random=Math.random] - Source of numbers in [0, 1).
 *   Pass a seeded or stubbed generator to get a reproducible order.
 * @returns {Array} The same array instance that was passed in.
 */
function shuffle(arr, random = Math.random) {
  // Walk from the tail: slot i receives an element drawn from positions 0..i.
  for (let i = arr.length - 1; i > 0; i--) {
    const j = Math.floor(random() * (i + 1));
    [arr[i], arr[j]] = [arr[j], arr[i]];
  }
  return arr;
}
|
|
42
|
+
|
|
43
|
+
/**
 * Build the anonymous label for the result at position `index`:
 * "Model A" … "Model Z", then "Model AA", "Model AB", …
 * Deriving the label from the index means labels never run out or collide,
 * however many providers are configured.
 *
 * @param {number} index - Zero-based position in the shuffled result list.
 * @returns {string} The label for that position.
 */
function anonymousLabel(index) {
  let suffix = '';
  // Bijective base-26: 0 → A, 25 → Z, 26 → AA, 27 → AB, …
  for (let n = index; n >= 0; n = Math.floor(n / 26) - 1) {
    suffix = String.fromCharCode(65 + (n % 26)) + suffix;
  }
  return `Model ${suffix}`;
}

/**
 * Replace provider names with anonymous labels (Model A, B, C…).
 * A copy of the input is shuffled first, so position does not reveal which
 * provider produced which answer. The input array and its objects are left
 * untouched.
 *
 * @param {Array<object>} results - Result objects, each with a `provider` field.
 * @returns {{ results: Array<object>, mapping: Object<string, string> }}
 *   `results` is the shuffled copy with `provider` replaced by its label;
 *   `mapping` maps each label back to the real provider name,
 *   e.g. { 'Model A': 'gemini', … }.
 */
function anonymize(results) {
  const shuffled = shuffle([...results]);
  const mapping = {};
  const anonymized = shuffled.map((result, index) => {
    const label = anonymousLabel(index);
    mapping[label] = result.provider;
    return { ...result, provider: label };
  });
  return { results: anonymized, mapping };
}
|
|
58
|
+
|
|
25
59
|
/**
|
|
26
60
|
* Run a single provider and return a normalized result object.
|
|
27
61
|
* Never throws — errors are captured in the `error` field.
|
|
@@ -41,17 +75,18 @@ async function runOne(name, prompt, { cwd, timeoutMs }) {
|
|
|
41
75
|
}
|
|
42
76
|
|
|
43
77
|
/**
|
|
44
|
-
* Consult Codex and/or
|
|
78
|
+
* Consult Codex, Gemini, and/or Claude in parallel.
|
|
45
79
|
*
|
|
46
80
|
* @param {object} opts
|
|
47
81
|
* @param {string} opts.prompt - The question to ask.
|
|
48
|
-
* @param {string} [opts.only] - 'codex' | 'gemini' — only run this one.
|
|
82
|
+
* @param {string} [opts.only] - 'codex' | 'gemini' | 'claude' — only run this one.
|
|
49
83
|
* @param {string[]} [opts.skip] - Providers to skip.
|
|
50
|
-
* @param {number} [opts.timeoutMs] - Per-provider timeout in ms (default 90 000).
|
|
84
|
+
* @param {number} [opts.timeoutMs] - Per-provider timeout in ms (default 90 000). 0 = no timeout.
|
|
51
85
|
* @param {string} [opts.cwd] - Working directory for subprocesses.
|
|
52
|
-
* @
|
|
86
|
+
* @param {boolean} [opts.blind] - Anonymise provider names (default true).
|
|
87
|
+
* @returns {Promise<{ results: Array, mapping: object|null }>}
|
|
53
88
|
*/
|
|
54
|
-
export async function consult({ prompt, only, skip = [], timeoutMs = 90_000, cwd } = {}) {
|
|
89
|
+
export async function consult({ prompt, only, skip = [], timeoutMs = 90_000, cwd, blind = true } = {}) {
|
|
55
90
|
const targets = Object.keys(PROVIDERS)
|
|
56
91
|
.filter(name => (only ? name === only : true))
|
|
57
92
|
.filter(name => !skip.includes(name));
|
|
@@ -64,9 +99,14 @@ export async function consult({ prompt, only, skip = [], timeoutMs = 90_000, cwd
|
|
|
64
99
|
targets.map(name => runOne(name, prompt, { cwd, timeoutMs }))
|
|
65
100
|
);
|
|
66
101
|
|
|
67
|
-
|
|
102
|
+
const results = targets.map((name, i) =>
|
|
68
103
|
settled[i].status === 'fulfilled'
|
|
69
104
|
? settled[i].value
|
|
70
105
|
: { provider: name, content: '', duration_ms: 0, error: settled[i].reason?.message ?? 'unknown' }
|
|
71
106
|
);
|
|
107
|
+
|
|
108
|
+
if (blind) {
|
|
109
|
+
return anonymize(results);
|
|
110
|
+
}
|
|
111
|
+
return { results, mapping: null };
|
|
72
112
|
}
|
package/src/format.js
CHANGED
|
@@ -1,20 +1,42 @@
|
|
|
1
1
|
/**
 * Format an array of provider results as human-readable Markdown.
 * Each provider gets a `##` header showing its timing (or its error),
 * followed by its content; sections are separated by horizontal rules.
 *
 * When a non-empty mapping is provided (blind mode), a REVEAL section is
 * appended at the end so the Judge can disclose model identity to the user
 * AFTER completing their evaluation. A missing or empty mapping yields the
 * body alone, because a REVEAL header above an empty table would only
 * instruct the Judge to read a table with no rows.
 *
 * @param {Array<{provider: string, content: string, duration_ms: number, error?: string}>} results
 *   Results to render, in display order.
 * @param {Object<string, string>|null} [mapping=null]
 *   Anonymous label → real provider name, or null when not in blind mode.
 * @returns {string} The Markdown document.
 */
export function formatAsMarkdown(results, mapping = null) {
  const body = results
    .map(r => {
      const label = r.error
        ? `## ${r.provider} (${r.error})`
        : `## ${r.provider} (${(r.duration_ms / 1000).toFixed(1)}s)`;
      const content = r.error ? `*调用失败: ${r.error}*` : r.content;
      return `${label}\n\n${content}`;
    })
    .join('\n\n---\n\n');

  const entries = mapping ? Object.entries(mapping) : [];
  if (entries.length === 0) return body;

  const reveal = entries
    .map(([label, provider]) => `| ${label} | **${provider}** |`)
    .join('\n');

  // Built line by line so source indentation cannot leak into the output.
  return [
    body,
    '',
    '---',
    '',
    '## 🔒 REVEAL — 仅在完成评估后阅读',
    '',
    '> **Judge 指令**:请先完成你的完整评估和综合输出,再阅读以下映射表,并在回复末尾告知用户每个模型对应的真实身份。',
    '',
    '| 匿名标签 | 真实模型 |',
    '|---------|---------|',
    reveal,
  ].join('\n');
}
|
|
14
36
|
|
|
15
37
|
/**
 * Serialize a consultation as pretty-printed JSON (2-space indent) for
 * programmatic consumers.
 *
 * @param {string} prompt - The question that was asked.
 * @param {Array<object>} results - Provider result objects.
 * @param {object|null} [mapping=null] - Anonymous label → provider name, or null.
 * @returns {string} JSON text with the keys `prompt`, `results`, `mapping`.
 */
export function formatAsJson(prompt, results, mapping = null) {
  const payload = { prompt, results, mapping };
  return JSON.stringify(payload, null, 2);
}
|
package/src/providers.js
CHANGED
|
@@ -4,6 +4,7 @@ import { spawn } from 'child_process';
|
|
|
4
4
|
|
|
5
5
|
export const CODEX_ARGS_PREFIX = ['exec', '--json', '--skip-git-repo-check', '--ephemeral'];
|
|
6
6
|
export const GEMINI_ARGS_PREFIX = ['-o', 'json'];
|
|
7
|
+
export const CLAUDE_ARGS_PREFIX = ['--output-format', 'json', '-p'];
|
|
7
8
|
|
|
8
9
|
// ─── Process runner ───────────────────────────────────────────────────────────
|
|
9
10
|
|
|
@@ -76,3 +77,12 @@ export function adaptGemini(raw) {
|
|
|
76
77
|
} catch { /* fall through */ }
|
|
77
78
|
return fallback(raw.stdout);
|
|
78
79
|
}
|
|
80
|
+
|
|
81
|
+
/**
 * Adapt raw Claude CLI output into a normalized content object.
 *
 * If `raw.stdout` parses as JSON and carries a string `result` field, that
 * string is returned with `parse_mode: 'json'`. In every other case
 * (unparseable output, non-object JSON, missing or non-string `result`) the
 * raw stdout is handed to `fallback`.
 *
 * @param {{ stdout: string }} raw - Captured process output.
 * @returns {{ content: string, parse_mode: string }|*} The parsed result, or
 *   whatever `fallback(raw.stdout)` returns.
 */
export function adaptClaude(raw) {
  let parsed;
  try {
    parsed = JSON.parse(raw.stdout);
  } catch {
    return fallback(raw.stdout);
  }

  // `?.` covers a literal JSON `null`, which the original handled via its catch.
  const text = parsed?.result;
  return typeof text === 'string'
    ? { content: text, parse_mode: 'json' }
    : fallback(raw.stdout);
}
|
package/src/server.js
CHANGED
|
@@ -26,18 +26,22 @@ const CONSULT_TOOL = {
|
|
|
26
26
|
},
|
|
27
27
|
only: {
|
|
28
28
|
type: 'string',
|
|
29
|
-
enum: ['codex', 'gemini'],
|
|
29
|
+
enum: ['codex', 'gemini', 'claude'],
|
|
30
30
|
description: '只调用指定一个模型(省成本或调试)。',
|
|
31
31
|
},
|
|
32
32
|
skip: {
|
|
33
33
|
type: 'array',
|
|
34
|
-
items: { type: 'string', enum: ['codex', 'gemini'] },
|
|
34
|
+
items: { type: 'string', enum: ['codex', 'gemini', 'claude'] },
|
|
35
35
|
description: '跳过指定模型列表。',
|
|
36
36
|
},
|
|
37
37
|
timeout_sec: {
|
|
38
38
|
type: 'number',
|
|
39
39
|
description: '每个模型的超时秒数,默认 90。传 0 表示不限时,等待直到完成。',
|
|
40
40
|
},
|
|
41
|
+
blind: {
|
|
42
|
+
type: 'boolean',
|
|
43
|
+
description: '匿名化 provider 名称(默认 true)。结果以 Model A/B/C 呈现,顺序随机打乱,Judge 无法通过名字或位置判断来源。传 false 可查看真实模型名称。',
|
|
44
|
+
},
|
|
41
45
|
cwd: {
|
|
42
46
|
type: 'string',
|
|
43
47
|
description: '子进程工作目录,默认继承 MCP server 的 cwd。',
|
|
@@ -64,11 +68,12 @@ server.setRequestHandler(CallToolRequestSchema, async req => {
|
|
|
64
68
|
}
|
|
65
69
|
|
|
66
70
|
const args = req.params.arguments ?? {};
|
|
67
|
-
const results = await consult({
|
|
71
|
+
const { results, mapping } = await consult({
|
|
68
72
|
prompt: String(args.prompt ?? ''),
|
|
69
73
|
only: args.only,
|
|
70
74
|
skip: Array.isArray(args.skip) ? args.skip : [],
|
|
71
75
|
timeoutMs: args.timeout_sec != null ? Number(args.timeout_sec) * 1000 : 90_000,
|
|
76
|
+
blind: args.blind !== false,
|
|
72
77
|
cwd: args.cwd,
|
|
73
78
|
});
|
|
74
79
|
|
|
@@ -79,7 +84,7 @@ server.setRequestHandler(CallToolRequestSchema, async req => {
|
|
|
79
84
|
}
|
|
80
85
|
|
|
81
86
|
return {
|
|
82
|
-
content: [{ type: 'text', text: formatAsMarkdown(results) }],
|
|
87
|
+
content: [{ type: 'text', text: formatAsMarkdown(results, mapping) }],
|
|
83
88
|
};
|
|
84
89
|
});
|
|
85
90
|
|